easy_ml 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (65) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +270 -0
  3. data/Rakefile +12 -0
  4. data/app/models/easy_ml/model.rb +59 -0
  5. data/app/models/easy_ml/models/xgboost.rb +9 -0
  6. data/app/models/easy_ml/models.rb +5 -0
  7. data/lib/easy_ml/core/model.rb +29 -0
  8. data/lib/easy_ml/core/model_core.rb +181 -0
  9. data/lib/easy_ml/core/model_evaluator.rb +137 -0
  10. data/lib/easy_ml/core/models/hyperparameters/base.rb +34 -0
  11. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +19 -0
  12. data/lib/easy_ml/core/models/hyperparameters.rb +8 -0
  13. data/lib/easy_ml/core/models/xgboost.rb +10 -0
  14. data/lib/easy_ml/core/models/xgboost_core.rb +220 -0
  15. data/lib/easy_ml/core/models.rb +10 -0
  16. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +63 -0
  17. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +50 -0
  18. data/lib/easy_ml/core/tuner/adapters.rb +10 -0
  19. data/lib/easy_ml/core/tuner.rb +105 -0
  20. data/lib/easy_ml/core/uploaders/model_uploader.rb +24 -0
  21. data/lib/easy_ml/core/uploaders.rb +7 -0
  22. data/lib/easy_ml/core.rb +9 -0
  23. data/lib/easy_ml/core_ext/pathname.rb +9 -0
  24. data/lib/easy_ml/core_ext.rb +5 -0
  25. data/lib/easy_ml/data/dataloader.rb +6 -0
  26. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +31 -0
  27. data/lib/easy_ml/data/dataset/data/sample_info.json +1 -0
  28. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +1 -0
  29. data/lib/easy_ml/data/dataset/splits/file_split.rb +140 -0
  30. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +49 -0
  31. data/lib/easy_ml/data/dataset/splits/split.rb +98 -0
  32. data/lib/easy_ml/data/dataset/splits.rb +11 -0
  33. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +43 -0
  34. data/lib/easy_ml/data/dataset/splitters.rb +9 -0
  35. data/lib/easy_ml/data/dataset.rb +430 -0
  36. data/lib/easy_ml/data/datasource/datasource_factory.rb +60 -0
  37. data/lib/easy_ml/data/datasource/file_datasource.rb +40 -0
  38. data/lib/easy_ml/data/datasource/merged_datasource.rb +64 -0
  39. data/lib/easy_ml/data/datasource/polars_datasource.rb +41 -0
  40. data/lib/easy_ml/data/datasource/s3_datasource.rb +89 -0
  41. data/lib/easy_ml/data/datasource.rb +33 -0
  42. data/lib/easy_ml/data/preprocessor/preprocessor.rb +205 -0
  43. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +403 -0
  44. data/lib/easy_ml/data/preprocessor/utils.rb +17 -0
  45. data/lib/easy_ml/data/preprocessor.rb +238 -0
  46. data/lib/easy_ml/data/utils.rb +50 -0
  47. data/lib/easy_ml/data.rb +8 -0
  48. data/lib/easy_ml/deployment.rb +5 -0
  49. data/lib/easy_ml/engine.rb +26 -0
  50. data/lib/easy_ml/initializers/inflections.rb +4 -0
  51. data/lib/easy_ml/logging.rb +38 -0
  52. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +42 -0
  53. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +23 -0
  54. data/lib/easy_ml/support/age.rb +27 -0
  55. data/lib/easy_ml/support/est.rb +1 -0
  56. data/lib/easy_ml/support/file_rotate.rb +23 -0
  57. data/lib/easy_ml/support/git_ignorable.rb +66 -0
  58. data/lib/easy_ml/support/synced_directory.rb +134 -0
  59. data/lib/easy_ml/support/utc.rb +1 -0
  60. data/lib/easy_ml/support.rb +10 -0
  61. data/lib/easy_ml/trainer.rb +92 -0
  62. data/lib/easy_ml/transforms.rb +29 -0
  63. data/lib/easy_ml/version.rb +5 -0
  64. data/lib/easy_ml.rb +23 -0
  65. metadata +353 -0
@@ -0,0 +1,89 @@
1
+ require "polars"
2
+
3
+ module EasyML::Data
4
+ class Datasource
5
+ class S3Datasource
6
+ include GlueGun::DSL
7
+
8
# Local directory used for synced files; #data also writes its combined CSV here.
attribute :root_dir, :string
validates :root_dir, presence: true

# Keyword options splatted into every Polars.read_csv call made by this class.
# NOTE(review): if `presence: true` follows ActiveModel blank-checking, the
# `{}` default would fail validation — confirm against GlueGun::DSL.
attribute :polars_args, :hash, default: {}
validates :polars_args, presence: true

# Bucket name handed to the synced directory dependency.
attribute :s3_bucket, :string
validates :s3_bucket, presence: true

# Key prefix inside the bucket; normalized by the custom writer defined below.
attribute :s3_prefix, :string
validates :s3_prefix, presence: true
19
# Normalizes the S3 prefix before storing it: the value is stringified and a
# single leading and/or trailing "/" is removed.
#
# Anchors are \A and \z so only the real start and end of the string are
# trimmed; ^ and $ are per-line anchors and would also strip slashes that
# sit next to an embedded newline.
#
# @param arg [#to_s] raw prefix (nil becomes "")
def s3_prefix=(arg)
  super(arg.to_s.gsub(%r{\A/|/\z}, ""))
end
22
+
23
# Credentials forwarded to the synced directory dependency.
attribute :s3_access_key_id, :string
validates :s3_access_key_id, presence: true

attribute :s3_secret_access_key, :string
validates :s3_secret_access_key, presence: true

# Declares the synced_directory collaborator and binds this datasource's own
# attributes onto it; every binding except s3_prefix is marked required.
dependency :synced_directory do |dep|
  dep.set_class EasyML::Support::SyncedDirectory
  dep.bind_attribute :root_dir, required: true
  dep.bind_attribute :s3_bucket, required: true
  dep.bind_attribute :s3_prefix
  dep.bind_attribute :s3_access_key_id, required: true
  dep.bind_attribute :s3_secret_access_key, required: true
end

# File listing and freshness come straight from the synced directory.
delegate :files, :last_updated_at, to: :synced_directory
39
+
40
# Syncs the directory, then yields one value per file: the result of
# Polars.read_csv for that file, read with polars_args.
#
# When no block is given an Enumerator over the same values is returned
# (previously this raised LocalJumpError at the first yield).
#
# @param of [Integer] requested batch size; currently ignored
# @yield [csv] the value returned by Polars.read_csv for one file
# @return [Enumerator, Object] an Enumerator without a block, otherwise the
#   result of iterating files
def in_batches(of: 10_000)
  return enum_for(:in_batches, of: of) unless block_given?

  # Currently ignores batch size, TODO: implement
  pull
  files.each do |file|
    yield Polars.read_csv(file, **polars_args)
  end
end
48
+
49
# Asks the synced directory to sync and returns whatever that call returns.
def refresh!
  synced_directory.sync
end
52
+
53
# Returns all synced files combined into a single frame.
#
# After a sync (or when the cached combined_data.csv does not exist yet) the
# files are merged again and the result is written to
# <root_dir>/combined_data.csv; otherwise the cached CSV is read back.
#
# The result is captured in a local declared outside the block: a variable
# first assigned inside the block is not visible after it, which made the
# previous version raise NameError.
#
# @return [Object, nil] the combined frame, or nil when merge_data returns nil
def data
  output_path = File.join(root_dir, "combined_data.csv")
  combined_df = nil

  pull do |did_sync|
    if did_sync || !File.exist?(output_path)
      combined_df = merge_data
      # merge_data returns nil when there are no files; nothing to cache then.
      combined_df&.write_csv(output_path)
    else
      combined_df = Polars.read_csv(output_path, **polars_args)
    end
  end

  combined_df
end
66
+
67
+ private
68
+
69
# Runs a sync on the synced directory and, when a block is given, yields the
# sync result to it and returns the block's value. Returns nil without a block.
def pull
  # Synced directory will only sync if needs sync
  synced = synced_directory.sync
  return unless block_given?

  yield synced
end
74
+
75
# Reads every file with Polars.read_csv (using polars_args) and stacks the
# results in order with vstack. Returns nil when there are no files.
def merge_data
  files.reduce(nil) do |stacked, path|
    frame = Polars.read_csv(path, **polars_args)
    stacked.nil? ? frame : stacked.vstack(frame)
  end
end
87
+ end
88
+ end
89
+ end
@@ -0,0 +1,33 @@
1
# Abstract datasource: declares the methods concrete datasources provide and
# loads the concrete implementations.
#
# NOTE(review): this file opens `ML::Data`, while the S3 datasource it requires
# is declared under `EasyML::Data` — confirm which namespace is intended.
module ML
  module Data
    class Datasource
      attr_reader :root_dir, :polars_args

      # Written out explicitly so the `of:` keyword stays part of the signature.
      def in_batches(of: 10_000)
        raise NotImplementedError, "Subclasses must implement #in_batches"
      end

      # The remaining hooks take no arguments and differ only by name.
      %i[files last_updated_at refresh! data].each do |hook|
        define_method(hook) do
          raise NotImplementedError, "Subclasses must implement ##{hook}"
        end
      end

      require_relative "datasource/s3_datasource"
      require_relative "datasource/file_datasource"
      require_relative "datasource/polars_datasource"
      require_relative "datasource/merged_datasource"
      require_relative "datasource/datasource_factory"
    end
  end
end
@@ -0,0 +1,205 @@
1
+ require "fileutils"
2
+ require "polars"
3
+ require "date"
4
+ require "json"
5
+
6
+ module EasyML::Data
7
+ class PreprocessingSteps
8
+ class Preprocessor
9
+ include EasyML::Data::PreprocessingSteps::Utils
10
+
11
# NOTE(review): presumably the default minimum occurrence count for a
# categorical value to be kept — confirm where it is consumed.
CATEGORICAL_COMMON_MIN = 50

# Order in which strategies are applied to a column (see sorted_strategies).
# Frozen so the shared ordering cannot be mutated at runtime.
PREPROCESSING_ORDER = %w[clip mean median constant categorical one_hot ffill custom fill_date add_datepart].freeze
13
+
14
+ attr_accessor :directory, :preprocessing_steps, :verbose, :imputers, :environment
15
+
16
# Stores the preprocessor settings.
#
# @param directory [String, nil] passed to each imputer as its path
# @param preprocessing_steps [Hash] step config; run through
#   standardize_config and stored with indifferent access
# @param verbose [Boolean] when true, progress and warnings are printed
# @param environment [String] stored as-is
def initialize(directory: nil, preprocessing_steps: {}, verbose: false, environment: "development")
  normalized_steps = standardize_config(preprocessing_steps)

  @directory = directory
  @environment = environment
  @verbose = verbose
  @preprocessing_steps = normalized_steps.with_indifferent_access
end
22
+
23
# Fits one imputer per configured column/strategy against df.
#
# Training and inference steps are combined for fitting with a
# non-destructive merge. The previous merge! wrote the inference steps into
# the stored training config, so later postprocess(inference: false) and
# statistics calls saw inference-only steps as well.
#
# Columns are matched case-insensitively. "clip" is the only strategy that
# also transforms df during fit; that assignment mutates df in place.
#
# @param df [Object, nil] frame responding to columns, [] and []=
# @return [Hash, nil] the imputers built for this fit, or nil when there is
#   nothing to do
def fit(df)
  return if df.nil?
  return if preprocessing_steps.keys.none?

  puts "Preprocessing..." if verbose
  training_steps = preprocessing_steps[:training] || {}
  fit_config = training_steps.merge(preprocessing_steps[:inference] || {})
  imputers_by_column = initialize_imputers(fit_config)

  did_cleanup = false
  imputers_by_column.each do |col, column_imputers|
    # Resolve the frame's real column name once per configured column.
    actual_col = df.columns.find { |c| c.downcase == col.downcase }

    sorted_strategies(column_imputers).each do |strategy|
      imputer = column_imputers[strategy]
      # cleanup is invoked a single time, on the first imputer encountered.
      unless did_cleanup
        imputer.cleanup
        did_cleanup = true
      end

      if actual_col
        imputer.fit(df[actual_col], df)
        # This is the only one to transform during fit
        df[actual_col] = imputer.transform(df[actual_col]) if strategy.to_s == "clip"
      elsif @verbose
        puts "Warning: Column '#{col}' not found in DataFrame during fit process."
      end
    end
  end
end
52
+
53
# Applies the configured transformations to df and returns the result.
#
# With inference: true the inference steps are merged over the training steps
# (non-destructively); otherwise only the training steps are used. When no
# steps are configured df is returned untouched.
#
# @param df [Object] frame handed to apply_transformations
# @param inference [Boolean] whether to include inference-only steps
# @return [Object] the transformed frame
def postprocess(df, inference: false)
  puts "Postprocessing..." if verbose
  return df if preprocessing_steps.keys.none?

  steps = preprocessing_steps[:training]
  steps = steps.merge(preprocessing_steps[:inference] || {}) if inference

  transformed = apply_transformations(df, steps)
  puts "Postprocessing complete." if @verbose
  transformed
end
68
+
69
# Builds imputers for the training steps and collects their statistics.
#
# @return [Hash] column => { strategy => imputer.statistics }
def statistics
  initialize_imputers(preprocessing_steps[:training]).transform_values do |strategies|
    strategies.transform_values(&:statistics)
  end
end
76
+
77
# True when at least one strategy of at least one column reports
# non-blank (present?) statistics.
def is_fit?
  statistics.each_value.any? do |per_strategy|
    per_strategy.each_value.any?(&:present?)
  end
end
80
+
81
# Removes the preprocessor directory and everything in it.
#
# Does nothing when no directory is configured (directory defaults to nil in
# the constructor, and File.directory?(nil) raises TypeError) or when the
# path is not an existing directory.
#
# @return [Object, nil] nil when nothing was removed
def delete
  return unless @directory && File.directory?(@directory)

  FileUtils.rm_rf(@directory)
end
86
+
87
# Renames the last segment of the directory path to `to`, moves the directory
# on disk and remembers the new location.
#
# Only the final path segment is replaced. The earlier gsub-based version
# substituted every occurrence of the segment anywhere in the path (and
# interpreted it as a regular expression), so a parent folder containing the
# same text was rewritten as well.
#
# @param to [String] new name for the last path segment
# @return [String] the new directory path
def move(to)
  old_dir = directory
  new_dir = File.join(File.dirname(old_dir), to)

  puts "Moving #{old_dir} to #{new_dir}"
  FileUtils.mv(old_dir, new_dir)
  @directory = new_dir
end
96
+
97
+ private
98
+
99
# Builds a SimpleImputer for every column/strategy pair in config.
#
# one_hot is skipped (it gets no imputer); an options value of `true` is
# treated as "no options". Every column gets an entry, even when it ends up
# with no imputers.
#
# @param config [Hash] column => { strategy => options }
# @return [Hash] column => { strategy => SimpleImputer }
def initialize_imputers(config)
  standardize_config(config).each_with_object({}) do |(column, strategies), by_column|
    column_imputers = (by_column[column] ||= {})

    strategies.each do |name, settings|
      next if name.to_sym == :one_hot

      column_imputers[name] = EasyML::Data::PreprocessingSteps::SimpleImputer.new(
        strategy: name,
        path: directory,
        attribute: column,
        options: settings == true ? {} : settings
      )
    end
  end
end
116
+
117
# Applies every configured strategy, in sorted order, to the matching column.
#
# Columns are matched case-insensitively against df.columns. one_hot is
# delegated to apply_one_hot (which returns a new frame); every other strategy
# replaces the column with its imputer's transform output. Unknown columns are
# skipped, with a warning when @verbose is set.
#
# NOTE(review): apply_one_hot receives the configured name `col`, not the
# case-matched column name — confirm this is intended when they differ.
#
# @param df [Object] frame responding to columns, [] and []=
# @param config [Hash] column => { strategy => options }
# @return [Object] the transformed frame
def apply_transformations(df, config)
  imputers = initialize_imputers(config)

  standardize_config(config).each do |col, strategies|
    actual_col = df.columns.find { |name| name.downcase == col.downcase }

    if actual_col.nil?
      puts "Warning: Column '#{col}' not found in DataFrame during apply_transformations process." if @verbose
      next
    end

    sorted_strategies(strategies).each do |strategy|
      if strategy.to_sym == :one_hot
        df = apply_one_hot(df, col, imputers)
        next
      end

      imputer = imputers.dig(col, strategy)
      df[actual_col] = imputer.transform(df[actual_col]) if imputer
    end
  end

  df
end
139
+
140
# One-hot encodes column `col`.
#
# Approved values come from the column's "categorical" imputer statistics
# (values whose count reaches the imputer's "categorical_min" option, falling
# back to CATEGORICAL_COMMON_MIN when the option is not set) or, without such
# an imputer, from the column's unique values. One 0/1 column named
# "<col>_<value>" (dashes become underscores) is added per approved value,
# plus "<col>_other" for everything else; the source column is dropped.
#
# @param df [Object] frame responding to [], []=, with_column and drop
# @param col [String] column to encode
# @param imputers [Hash] column => { strategy => imputer }
# @return [Object] frame with the encoded columns and without `col`
def apply_one_hot(df, col, imputers)
  cat_imputer = imputers.dig(col, "categorical")
  approved_values = if cat_imputer.present?
                      # A missing option used to make `count >= nil` raise.
                      min_count = cat_imputer.options["categorical_min"] || CATEGORICAL_COMMON_MIN
                      cat_imputer.statistics[:categorical][:value].select do |_value, count|
                        count >= min_count
                      end.keys
                    else
                      df[col].uniq.to_a
                    end
  # Computed once instead of once per element of the column.
  approved_strings = approved_values.map(&:to_s)

  # Create one-hot encoded columns
  approved_values.each do |value|
    new_col_name = "#{col}_#{value}".tr("-", "_")
    df = df.with_column(
      df[col].eq(value.to_s).cast(Polars::Int64).alias(new_col_name)
    )
  end

  # Create 'other' column for unapproved values
  other_col_name = "#{col}_other"
  df[other_col_name] = df[col].map_elements do |value|
    !approved_strings.include?(value)
  end.cast(Polars::Int64)
  df.drop([col])
end
164
+
165
# Returns the strategy names of `strategies` ordered by PREPROCESSING_ORDER.
#
# Keys are compared by their string form, so Symbol keys sort like String
# keys. Strategies missing from PREPROCESSING_ORDER sort after all known ones
# instead of raising (a nil index cannot be compared by sort_by).
#
# @param strategies [Hash] strategy name => anything
# @return [Array] the hash's keys, unchanged, in application order
def sorted_strategies(strategies)
  strategies.keys.sort_by do |key|
    PREPROCESSING_ORDER.index(key.to_s) || PREPROCESSING_ORDER.size
  end
end
170
+
171
# Casts column `col` to Float64, then replaces its nulls with Float::NAN.
#
# @param df [Object] frame responding to with_column
# @param col [String] column name
# @return [Object] the frame returned by the second with_column call
def prepare_for_imputation(df, col)
  as_float = df.with_column(Polars.col(col).cast(Polars::Float64))

  null_to_nan = Polars.when(Polars.col(col).is_null)
                      .then(Float::NAN)
                      .otherwise(Polars.col(col))
                      .alias(col)
  as_float.with_column(null_to_nan)
end
175
+ end
176
+ end
177
+ end
178
+
179
+ # Where to put this???
180
+ #
181
+ # def self.stage_required_files
182
+ # required_files.each do |file|
183
+ # git_add(file)
184
+ # end
185
+ # end
186
+
187
+ # def self.git_add(path)
188
+ # command = "git add #{path}"
189
+ # puts command if verbose
190
+ # result = `#{command}`
191
+ # puts result if verbose
192
+ # end
193
+
194
+ # def self.set_verbose(verbose)
195
+ # @verbose = verbose
196
+ # end
197
+
198
+ # def required_files
199
+ # files = Dir.entries(@directory) - %w[. ..]
200
+ # required_file_types = %w[bin]
201
+
202
+ # files.select { |file| required_file_types.any? { |ext| file.include?(ext) } }.map do |file|
203
+ # File.join(@directory, file)
204
+ # end
205
+ # end