easy_ml 0.1.1

Files changed (65)
  1. checksums.yaml +7 -0
  2. data/README.md +270 -0
  3. data/Rakefile +12 -0
  4. data/app/models/easy_ml/model.rb +59 -0
  5. data/app/models/easy_ml/models/xgboost.rb +9 -0
  6. data/app/models/easy_ml/models.rb +5 -0
  7. data/lib/easy_ml/core/model.rb +29 -0
  8. data/lib/easy_ml/core/model_core.rb +181 -0
  9. data/lib/easy_ml/core/model_evaluator.rb +137 -0
  10. data/lib/easy_ml/core/models/hyperparameters/base.rb +34 -0
  11. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +19 -0
  12. data/lib/easy_ml/core/models/hyperparameters.rb +8 -0
  13. data/lib/easy_ml/core/models/xgboost.rb +10 -0
  14. data/lib/easy_ml/core/models/xgboost_core.rb +220 -0
  15. data/lib/easy_ml/core/models.rb +10 -0
  16. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +63 -0
  17. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +50 -0
  18. data/lib/easy_ml/core/tuner/adapters.rb +10 -0
  19. data/lib/easy_ml/core/tuner.rb +105 -0
  20. data/lib/easy_ml/core/uploaders/model_uploader.rb +24 -0
  21. data/lib/easy_ml/core/uploaders.rb +7 -0
  22. data/lib/easy_ml/core.rb +9 -0
  23. data/lib/easy_ml/core_ext/pathname.rb +9 -0
  24. data/lib/easy_ml/core_ext.rb +5 -0
  25. data/lib/easy_ml/data/dataloader.rb +6 -0
  26. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +31 -0
  27. data/lib/easy_ml/data/dataset/data/sample_info.json +1 -0
  28. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +1 -0
  29. data/lib/easy_ml/data/dataset/splits/file_split.rb +140 -0
  30. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +49 -0
  31. data/lib/easy_ml/data/dataset/splits/split.rb +98 -0
  32. data/lib/easy_ml/data/dataset/splits.rb +11 -0
  33. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +43 -0
  34. data/lib/easy_ml/data/dataset/splitters.rb +9 -0
  35. data/lib/easy_ml/data/dataset.rb +430 -0
  36. data/lib/easy_ml/data/datasource/datasource_factory.rb +60 -0
  37. data/lib/easy_ml/data/datasource/file_datasource.rb +40 -0
  38. data/lib/easy_ml/data/datasource/merged_datasource.rb +64 -0
  39. data/lib/easy_ml/data/datasource/polars_datasource.rb +41 -0
  40. data/lib/easy_ml/data/datasource/s3_datasource.rb +89 -0
  41. data/lib/easy_ml/data/datasource.rb +33 -0
  42. data/lib/easy_ml/data/preprocessor/preprocessor.rb +205 -0
  43. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +403 -0
  44. data/lib/easy_ml/data/preprocessor/utils.rb +17 -0
  45. data/lib/easy_ml/data/preprocessor.rb +238 -0
  46. data/lib/easy_ml/data/utils.rb +50 -0
  47. data/lib/easy_ml/data.rb +8 -0
  48. data/lib/easy_ml/deployment.rb +5 -0
  49. data/lib/easy_ml/engine.rb +26 -0
  50. data/lib/easy_ml/initializers/inflections.rb +4 -0
  51. data/lib/easy_ml/logging.rb +38 -0
  52. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +42 -0
  53. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +23 -0
  54. data/lib/easy_ml/support/age.rb +27 -0
  55. data/lib/easy_ml/support/est.rb +1 -0
  56. data/lib/easy_ml/support/file_rotate.rb +23 -0
  57. data/lib/easy_ml/support/git_ignorable.rb +66 -0
  58. data/lib/easy_ml/support/synced_directory.rb +134 -0
  59. data/lib/easy_ml/support/utc.rb +1 -0
  60. data/lib/easy_ml/support.rb +10 -0
  61. data/lib/easy_ml/trainer.rb +92 -0
  62. data/lib/easy_ml/transforms.rb +29 -0
  63. data/lib/easy_ml/version.rb +5 -0
  64. data/lib/easy_ml.rb +23 -0
  65. metadata +353 -0
@@ -0,0 +1,403 @@
+ require "active_support/core_ext/hash/deep_transform_values"
+ require "numo/narray"
+ require "json"
+
+ module EasyML
+   module Data
+     class Preprocessor
+       class SimpleImputer
+         attr_reader :statistics
+         attr_accessor :path, :attribute, :strategy, :options
+
+         def initialize(strategy: "mean", path: nil, attribute: nil, options: {}, &block)
+           @strategy = strategy.to_sym
+           @path = path
+           @attribute = attribute
+           @options = options || {}
+           apply_defaults
+           load
+           @statistics ||= {}
+           deep_symbolize_keys!
+           return unless block_given?
+
+           instance_eval(&block)
+         end
+
+         def deep_symbolize_keys!
+           @statistics = @statistics.deep_symbolize_keys
+         end
+
+         def apply_defaults
+           @options[:date_column] ||= "CREATED_DATE"
+
+           if strategy == :categorical
+             @options[:categorical_min] ||= 25
+           elsif strategy == :custom
+             itself = ->(col) { col }
+             @options[:fit] ||= itself
+             @options[:transform] ||= itself
+           end
+         end
+
+         def fit(x, df = nil)
+           x = validate_input(x)
+
+           fit_values = case @strategy
+                        when :mean
+                          fit_mean(x)
+                        when :median
+                          fit_median(x)
+                        when :ffill
+                          fit_ffill(x, df)
+                        when :most_frequent
+                          fit_most_frequent(x)
+                        when :categorical
+                          fit_categorical(x)
+                        when :constant
+                          fit_constant(x)
+                        when :clip
+                          fit_no_op(x)
+                        when :today
+                          fit_no_op(x)
+                        when :one_hot
+                          fit_no_op(x)
+                        when :custom
+                          fit_custom(x)
+                        else
+                          raise ArgumentError, "Invalid strategy: #{@strategy}"
+                        end || {}
+
+           @statistics[attribute] ||= {}
+           @statistics[attribute][@strategy] = fit_values.merge!(original_dtype: x.dtype)
+           save
+           self
+         end
+
+         def transform(x)
+           check_is_fitted
+
+           if x.is_a?(Polars::Series)
+             transform_polars(x)
+           else
+             transform_dense(x)
+           end
+         end
+
+         def transform_polars(x)
+           result = case @strategy
+                    when :mean, :median, :ffill, :most_frequent, :constant
+                      x.fill_null(@statistics[@strategy][:value])
+                    when :clip
+                      min = options["min"] || 0
+                      max = options["max"] || 1_000_000_000_000
+                      if x.null_count != x.len
+                        x.clip(min, max)
+                      else
+                        x
+                      end
+                    when :categorical
+                      allowed_values = @statistics.dig(:categorical, :value).select do |_k, v|
+                        v >= options[:categorical_min]
+                      end.keys.map(&:to_s)
+                      if x.null_count == x.len
+                        x.fill_null(transform_categorical(nil))
+                      else
+                        x.apply do |val|
+                          allowed_values.include?(val) ? val : transform_categorical(val)
+                        end
+                      end
+                    when :today
+                      x.fill_null(transform_today(nil))
+                    when :custom
+                      if x.null_count == x.len
+                        x.fill_null(transform_custom(nil))
+                      else
+                        x.apply do |val|
+                          should_transform_custom?(val) ? transform_custom(val) : val
+                        end
+                      end
+                    else
+                      raise ArgumentError, "Unsupported strategy for Polars::Series: #{@strategy}"
+                    end
+
+           # Cast the result back to the original dtype
+           original_dtype = @statistics.dig(@strategy, :original_dtype)
+           original_dtype ? result.cast(original_dtype) : result
+         end
+
+         def file_path
+           raise "Need both attribute and path to save/load statistics" unless attribute.present? && path.to_s.present?
+
+           File.join(path, "statistics.json")
+         end
+
+         def cleanup
+           @statistics = {}
+           FileUtils.rm(file_path) if File.exist?(file_path)
+         end
+
+         def save
+           FileUtils.mkdir_p(File.dirname(file_path))
+
+           all_statistics = (File.exist?(file_path) ? JSON.parse(File.read(file_path)) : {}).deep_symbolize_keys
+
+           deep_symbolize_keys!
+
+           serialized = serialize_statistics(@statistics)
+           all_statistics[attribute] = {} unless all_statistics.key?(attribute)
+           all_statistics[attribute][@strategy] = serialized[attribute.to_sym][@strategy.to_sym]
+
+           File.open(file_path, "w") do |file|
+             file.write(JSON.pretty_generate(all_statistics))
+           end
+         end
+
+         def load
+           return unless File.exist?(file_path)
+
+           all_statistics = JSON.parse(File.read(file_path))
+           attribute_stats = all_statistics[@attribute]
+
+           return unless attribute_stats
+
+           @statistics = deserialize_statistics(attribute_stats)
+           deep_symbolize_keys!
+         end
+
+         def should_transform_categorical?(val)
+           values = @statistics.dig(:categorical, :value) || {}
+           min_ct = options[:categorical_min] || 25
+           allowed_values = values.select { |_v, c| c >= min_ct }
+
+           allowed_values.keys.map(&:to_s).exclude?(val)
+         end
+
+         def transform_categorical(val)
+           return "other" if val.nil?
+
+           values = @statistics.dig(:categorical, :value) || {}
+           min_ct = options[:categorical_min] || 25
+           allowed_values = values.select { |_v, c| c >= min_ct }.keys.map(&:to_s)
+
+           allowed_values.include?(val.to_s) ? val.to_s : "other"
+         end
+
+         def transform_today(_val)
+           EST.now.beginning_of_day
+         end
+
+         def fit_custom(x)
+           x
+         end
+
+         def should_transform_custom?(x)
+           if options.key?(:should_transform)
+             options[:should_transform].call(x)
+           else
+             should_transform_default?(x)
+           end
+         end
+
+         def transform_custom(x)
+           raise "Transform required" unless options.key?(:transform)
+
+           options[:transform].call(x)
+         end
+
+         private
+
+         def validate_input(x)
+           raise ArgumentError, "Input must be a Polars::Series" unless x.is_a?(Polars::Series)
+
+           x
+         end
+
+         def fit_mean(x)
+           { value: x.mean }
+         end
+
+         def fit_median(x)
+           { value: x.median }
+         end
+
+         def fit_ffill(x, df)
+           values = { value: nil, max_date: nil }
+
+           date_col = df[options[:date_column]]
+           return if date_col.is_null.all
+
+           sorted_df = df.sort(options[:date_column])
+           new_max_date = sorted_df[options[:date_column]].max
+
+           current_max_date = values[:max_date]
+           return if current_max_date && current_max_date > new_max_date
+
+           values[:max_date] = [current_max_date, new_max_date].compact.max
+
+           # Get the last non-null value
+           last_non_null = sorted_df[x.name].filter(sorted_df[x.name].is_not_null).tail(1).to_a.first
+           values[:value] = last_non_null
+
+           values
+         end
+
+         def fit_most_frequent(x)
+           value_counts = x.filter(x.is_not_null).value_counts
+           column_names = value_counts.columns
+           count_column = column_names[1]
+
+           most_frequent_value = value_counts.sort(count_column, descending: true).row(0)[0]
+           { value: most_frequent_value }
+         end
+
+         def fit_categorical(x)
+           value_counts = x.value_counts
+           column_names = value_counts.columns
+           value_column = column_names[0]
+           count_column = column_names[1]
+
+           as_hash = value_counts.select([value_column, count_column]).rows.to_a.to_h.transform_keys(&:to_s)
+           label_encoder = as_hash.keys.sort.each.with_index.reduce({}) do |h, (k, i)|
+             h.tap do
+               h[k] = i
+             end
+           end
+           label_decoder = label_encoder.invert
+
+           {
+             value: as_hash,
+             label_encoder: label_encoder,
+             label_decoder: label_decoder
+           }
+         end
+
+         def fit_no_op(_x)
+           {}
+         end
+
+         def fit_constant(_x)
+           { value: @options[:fill_value] }
+         end
+
+         def transform_default(_val)
+           @statistics[strategy][:value]
+         end
+
+         def should_transform_default?(val)
+           checker_method = val.respond_to?(:nan?) ? :nan? : :nil?
+           val.send(checker_method)
+         end
+
+         def transform_dense(x)
+           result = x.map do |val|
+             strategy_method = respond_to?("transform_#{strategy}") ? "transform_#{strategy}" : "transform_default"
+             checker_method = respond_to?("should_transform_#{strategy}?") ? "should_transform_#{strategy}?" : "should_transform_default?"
+             send(checker_method, val) ? send(strategy_method, val) : val
+           end
+
+           # Cast the result back to the original dtype
+           original_dtype = @statistics[:original_dtype]
+           if original_dtype
+             result.map { |val| cast_to_dtype(val, original_dtype) }
+           else
+             result
+           end
+         end
+
+         def check_is_fitted
+           return if %i[clip today custom].include?(strategy)
+
+           raise "SimpleImputer has not been fitted yet for #{attribute}##{strategy}" unless @statistics[strategy]
+         end
+
+         def serialize_statistics(stats)
+           stats.deep_transform_values do |value|
+             case value
+             when Time, DateTime
+               { "__type__" => "datetime", "value" => value.iso8601 }
+             when Date
+               { "__type__" => "date", "value" => value.iso8601 }
+             when BigDecimal
+               { "__type__" => "bigdecimal", "value" => value.to_s }
+             when Polars::DataType
+               { "__type__" => "polars_dtype", "value" => value.to_s }
+             when Symbol
+               { "__type__" => "symbol", "value" => value.to_s }
+             else
+               value
+             end
+           end
+         end
+
+         def deserialize_statistics(stats)
+           stats.transform_values do |value|
+             recursive_deserialize(value)
+           end
+         end
+
+         def recursive_deserialize(value)
+           case value
+           when Hash
+             if value["__type__"]
+               deserialize_special_type(value)
+             else
+               value.transform_values { |v| recursive_deserialize(v) }
+             end
+           when Array
+             value.map { |v| recursive_deserialize(v) }
+           else
+             value
+           end
+         end
+
+         def deserialize_special_type(value)
+           case value["__type__"]
+           when "datetime"
+             DateTime.parse(value["value"])
+           when "date"
+             Date.parse(value["value"])
+           when "bigdecimal"
+             BigDecimal(value["value"])
+           when "polars_dtype"
+             parse_polars_dtype(value["value"])
+           when "symbol"
+             value["value"].to_sym
+           else
+             value["value"]
+           end
+         end
+
+         def parse_polars_dtype(dtype_string)
+           case dtype_string
+           when /^Polars::Datetime/
+             time_unit = dtype_string[/time_unit: "(.*?)"/, 1]
+             time_zone = dtype_string[/time_zone: (.*)?\)/, 1]
+             time_zone = time_zone == "nil" ? nil : time_zone&.delete('"')
+             Polars::Datetime.new(time_unit: time_unit, time_zone: time_zone).class
+           when /^Polars::/
+             Polars.const_get(dtype_string.split("::").last)
+           else
+             raise ArgumentError, "Unknown Polars data type: #{dtype_string}"
+           end
+         end
+
+         def cast_to_dtype(value, dtype)
+           case dtype
+           when Polars::Int64
+             value.to_i
+           when Polars::Float64
+             value.to_f
+           when Polars::Boolean
+             !!value
+           when Polars::Utf8
+             value.to_s
+           else
+             value
+           end
+         end
+       end
+     end
+   end
+ end
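
For orientation, here is a minimal usage sketch of the imputer above (not part of the diff; the column name, values, and path are invented). Fitting persists statistics.json under `path`; a fresh instance reloads those statistics and can then transform:

    require "polars-df"
    require "easy_ml"

    # Hypothetical example: fit a mean imputer, then fill nulls with it.
    imputer = EasyML::Data::Preprocessor::SimpleImputer.new(
      strategy: "mean",
      path: "tmp/preprocessor",        # statistics.json is written here by #save
      attribute: "annual_revenue"
    )
    imputer.fit(Polars::Series.new("annual_revenue", [100.0, nil, 300.0]))

    # A new instance loads the persisted statistics and can transform.
    reloaded = EasyML::Data::Preprocessor::SimpleImputer.new(
      strategy: "mean",
      path: "tmp/preprocessor",
      attribute: "annual_revenue"
    )
    filled = reloaded.transform(Polars::Series.new("annual_revenue", [nil, 250.0]))
    # fills the null with the fitted mean (200.0)
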
@@ -0,0 +1,17 @@
+ module EasyML::Data
+   class Preprocessor
+     module Utils
+       def standardize_config(config)
+         config.each do |column, strategies|
+           next unless strategies.is_a?(Array)
+
+           config[column] = strategies.reduce({}) do |hash, strategy|
+             hash.tap do
+               hash[strategy] = true
+             end
+           end
+         end
+       end
+     end
+   end
+ end
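
standardize_config normalizes the array form of a column's strategy list into a hash keyed by strategy. A quick illustrative call (invented column name):

    config = { "annual_revenue" => %w[median clip] }
    standardize_config(config)
    # => { "annual_revenue" => { "median" => true, "clip" => true } }
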
@@ -0,0 +1,238 @@
+ require "fileutils"
+ require "polars"
+ require "date"
+ require "json"
+ require_relative "preprocessor/utils"
+ require_relative "preprocessor/simple_imputer"
+
+ module EasyML::Data
+   class Preprocessor
+     include GlueGun::DSL
+     include EasyML::Data::Preprocessor::Utils
+
+     CATEGORICAL_COMMON_MIN = 50
+     PREPROCESSING_ORDER = %w[clip mean median constant categorical one_hot ffill custom fill_date add_datepart]
+
+     attribute :directory, :string
+     attribute :verbose, :boolean, default: false
+     attribute :preprocessing_steps, :hash, default: {}
+     def preprocessing_steps=(preprocessing_steps)
+       super(standardize_config(preprocessing_steps).with_indifferent_access)
+     end
+
+     def fit(df)
+       return if df.nil?
+       return if preprocessing_steps.keys.none?
+
+       puts "Preprocessing..." if verbose
+       imputers = initialize_imputers(
+         preprocessing_steps[:training].merge!(preprocessing_steps[:inference] || {})
+       )
+
+       did_cleanup = false
+       imputers.each do |col, imputers|
+         sorted_strategies(imputers).each do |strategy|
+           imputer = imputers[strategy]
+           unless did_cleanup
+             imputer.cleanup
+             did_cleanup = true
+           end
+           if df.columns.map(&:downcase).include?(col.downcase)
+             actual_col = df.columns.find { |c| c.downcase == imputer.attribute.downcase }
+             imputer.fit(df[actual_col], df)
+             if strategy == "clip" # This is the only one to transform during fit
+               df[actual_col] = imputer.transform(df[actual_col])
+             end
+           elsif @verbose
+             puts "Warning: Column '#{col}' not found in DataFrame during fit process."
+           end
+         end
+       end
+     end
+
+     def postprocess(df, inference: false)
+       puts "Postprocessing..." if verbose
+       return df if preprocessing_steps.keys.none?
+
+       steps = if inference
+                 preprocessing_steps[:training].merge(preprocessing_steps[:inference] || {})
+               else
+                 preprocessing_steps[:training]
+               end
+
+       df = apply_transformations(df, steps)
+
+       puts "Postprocessing complete." if @verbose
+       df
+     end
+
+     def statistics
+       initialize_imputers(preprocessing_steps[:training]).each_with_object({}) do |(col, strategies), result|
+         result[col] = strategies.each_with_object({}) do |(strategy, imputer), col_result|
+           col_result[strategy] = imputer.statistics
+         end
+       end
+     end
+
+     def is_fit?
+       statistics.any? { |_, col_stats| col_stats.any? { |_, strategy_stats| strategy_stats.present? } }
+     end
+
+     def delete
+       return unless File.directory?(@directory)
+
+       FileUtils.rm_rf(@directory)
+     end
+
+     def move(to)
+       old_dir = directory
+       current_env = directory.split("/")[-1]
+       new_dir = directory.gsub(Regexp.new(current_env), to)
+
+       puts "Moving #{old_dir} to #{new_dir}"
+       FileUtils.mv(old_dir, new_dir)
+       @directory = new_dir
+     end
+
+     def decode_labels(values, col: nil)
+       imputers = initialize_imputers(preprocessing_steps[:training], dumb: true)
+       imputer = imputers.dig(col, "categorical")
+       decoder = imputer.statistics.dig(:categorical, :label_decoder)
+
+       other_value = decoder.keys.map(&:to_s).map(&:to_i).max + 1
+       decoder[other_value] = "other"
+       decoder.stringify_keys!
+
+       values.map do |value|
+         decoder[value.to_s]
+       end
+     end
+
+     private
+
+     def initialize_imputers(config, dumb: false)
+       standardize_config(config).each_with_object({}) do |(col, strategies), hash|
+         hash[col] ||= {}
+         strategies.each do |strategy, options|
+           options = {} if options == true
+
+           hash[col][strategy] = EasyML::Data::Preprocessor::SimpleImputer.new(
+             strategy: strategy,
+             path: directory,
+             attribute: col,
+             options: options
+           )
+         end
+       end
+     end
+
+     def apply_transformations(df, config)
+       imputers = initialize_imputers(config)
+
+       standardize_config(config).each do |col, strategies|
+         if df.columns.map(&:downcase).include?(col.downcase)
+           actual_col = df.columns.find { |c| c.downcase == col.downcase }
+
+           sorted_strategies(strategies).each do |strategy|
+             if strategy.to_sym == :categorical
+               if imputers.dig(col, strategy).options.dig("one_hot")
+                 df = apply_one_hot(df, col, imputers)
+               elsif imputers.dig(col, strategy).options.dig("encode_labels")
+                 df = apply_encode_labels(df, col, imputers)
+               end
+             else
+               imputer = imputers.dig(col, strategy)
+               df[actual_col] = imputer.transform(df[actual_col]) if imputer
+             end
+           end
+         elsif @verbose
+           puts "Warning: Column '#{col}' not found in DataFrame during apply_transformations process."
+         end
+       end
+
+       df
+     end
+
+     def apply_one_hot(df, col, imputers)
+       cat_imputer = imputers.dig(col, "categorical")
+       approved_values = cat_imputer.statistics[:categorical][:value].select do |_k, v|
+         v >= cat_imputer.options["categorical_min"]
+       end.keys
+
+       # Create one-hot encoded columns
+       approved_values.each do |value|
+         new_col_name = "#{col}_#{value}".tr("-", "_")
+         df = df.with_column(
+           df[col].eq(value.to_s).cast(Polars::Int64).alias(new_col_name)
+         )
+       end
+
+       # Create 'other' column for unapproved values
+       other_col_name = "#{col}_other"
+       df[other_col_name] = df[col].map_elements do |value|
+         approved_values.map(&:to_s).exclude?(value)
+       end.cast(Polars::Int64)
+       df.drop([col])
+     end
+
+     def apply_encode_labels(df, col, imputers)
+       cat_imputer = imputers.dig(col, "categorical")
+       approved_values = cat_imputer.statistics[:categorical][:value].select do |_k, v|
+         v >= cat_imputer.options["categorical_min"]
+       end.keys
+
+       df.with_column(
+         df[col].map_elements do |value|
+           approved_values.map(&:to_s).exclude?(value) ? "other" : value
+         end.alias(col)
+       )
+
+       label_encoder = cat_imputer.statistics[:categorical][:label_encoder].stringify_keys
+       other_value = label_encoder.values.max + 1
+       label_encoder["other"] = other_value
+
+       df.with_column(
+         df[col].map { |v| label_encoder[v.to_s] }.alias(col)
+       )
+     end
+
+     def sorted_strategies(strategies)
+       strategies.keys.sort_by do |key|
+         PREPROCESSING_ORDER.index(key)
+       end
+     end
+
+     def prepare_for_imputation(df, col)
+       df = df.with_column(Polars.col(col).cast(Polars::Float64))
+       df.with_column(Polars.when(Polars.col(col).is_null).then(Float::NAN).otherwise(Polars.col(col)).alias(col))
+     end
+   end
+ end
+
+ # Where to put this???
+ #
+ # def self.stage_required_files
+ #   required_files.each do |file|
+ #     git_add(file)
+ #   end
+ # end
+
+ # def self.git_add(path)
+ #   command = "git add #{path}"
+ #   puts command if verbose
+ #   result = `#{command}`
+ #   puts result if verbose
+ # end
+
+ # def self.set_verbose(verbose)
+ #   @verbose = verbose
+ # end
+
+ # def required_files
+ #   files = Dir.entries(@directory) - %w[. ..]
+ #   required_file_types = %w[bin]
+
+ #   files.select { |file| required_file_types.any? { |ext| file.include?(ext) } }.map do |file|
+ #     File.join(@directory, file)
+ #   end
+ # end
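
As a rough illustration of how this class is driven (column names, strategy choices, and the directory are invented, and it assumes GlueGun's attribute DSL accepts these keyword arguments), preprocessing_steps takes a training/inference hash whose per-column strategies are fitted on a DataFrame and replayed later:

    preprocessor = EasyML::Data::Preprocessor.new(
      directory: "tmp/preprocessor",
      verbose: true,
      preprocessing_steps: {
        training: {
          annual_revenue: { median: true, clip: { min: 0, max: 1_000_000 } },
          loan_purpose: { categorical: { categorical_min: 2, one_hot: true } }
        }
      }
    )

    preprocessor.fit(df)                    # fits each imputer and persists statistics.json
    clean_df = preprocessor.postprocess(df) # replays the fitted strategies over the DataFrame
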
@@ -0,0 +1,50 @@
+ module EasyML
+   module Data
+     module Utils
+       def append_to_csv(df, path)
+         return if df.empty?
+
+         path = Pathname.new(path) if path.is_a?(String)
+         FileUtils.mkdir_p(path.dirname)
+         FileUtils.touch(path)
+
+         # Check if the file is empty (i.e., if this is the first write)
+         file_empty = File.zero?(path)
+
+         # Write the DataFrame to a temporary file
+         temp_file = "#{path}.tmp"
+         df.write_csv(temp_file)
+
+         # Append the content to the main file, skipping the header if not the first write
+         File.open(path, "a") do |f|
+           File.foreach(temp_file).with_index do |line, index|
+             # Skip the header line if the file is not empty
+             f.write(line) unless index == 0 && !file_empty
+           end
+         end
+
+         # Delete the temporary file
+         File.delete(temp_file)
+       end
+
+       def expand_dir(dir)
+         return dir if dir.to_s[0] == "/"
+
+         Rails.root.join(dir)
+       end
+
+       def null_check(df)
+         result = {}
+         null_counts = df.null_count
+         total_count = df.height
+         df.columns.each do |column|
+           null_count = null_counts[column][0]
+           next if null_count == 0
+
+           result[column] = { null_count: null_count, total_count: total_count }
+         end
+         result.empty? ? nil : result
+       end
+     end
+   end
+ end
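
A small usage sketch for these helpers (file path and data are invented): append_to_csv writes the CSV header only on the first append, and null_check reports per-column null counts or nil when the frame has none:

    include EasyML::Data::Utils

    df = Polars::DataFrame.new({ "id" => [1, 2], "name" => ["a", nil] })
    append_to_csv(df, "tmp/rows.csv")   # first call writes the header
    append_to_csv(df, "tmp/rows.csv")   # later calls append rows only

    null_check(df)
    # => { "name" => { null_count: 1, total_count: 2 } }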