easy_ml 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. checksums.yaml +7 -0
  2. data/README.md +270 -0
  3. data/Rakefile +12 -0
  4. data/app/models/easy_ml/model.rb +59 -0
  5. data/app/models/easy_ml/models/xgboost.rb +9 -0
  6. data/app/models/easy_ml/models.rb +5 -0
  7. data/lib/easy_ml/core/model.rb +29 -0
  8. data/lib/easy_ml/core/model_core.rb +181 -0
  9. data/lib/easy_ml/core/model_evaluator.rb +137 -0
  10. data/lib/easy_ml/core/models/hyperparameters/base.rb +34 -0
  11. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +19 -0
  12. data/lib/easy_ml/core/models/hyperparameters.rb +8 -0
  13. data/lib/easy_ml/core/models/xgboost.rb +10 -0
  14. data/lib/easy_ml/core/models/xgboost_core.rb +220 -0
  15. data/lib/easy_ml/core/models.rb +10 -0
  16. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +63 -0
  17. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +50 -0
  18. data/lib/easy_ml/core/tuner/adapters.rb +10 -0
  19. data/lib/easy_ml/core/tuner.rb +105 -0
  20. data/lib/easy_ml/core/uploaders/model_uploader.rb +24 -0
  21. data/lib/easy_ml/core/uploaders.rb +7 -0
  22. data/lib/easy_ml/core.rb +9 -0
  23. data/lib/easy_ml/core_ext/pathname.rb +9 -0
  24. data/lib/easy_ml/core_ext.rb +5 -0
  25. data/lib/easy_ml/data/dataloader.rb +6 -0
  26. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +31 -0
  27. data/lib/easy_ml/data/dataset/data/sample_info.json +1 -0
  28. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +1 -0
  29. data/lib/easy_ml/data/dataset/splits/file_split.rb +140 -0
  30. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +49 -0
  31. data/lib/easy_ml/data/dataset/splits/split.rb +98 -0
  32. data/lib/easy_ml/data/dataset/splits.rb +11 -0
  33. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +43 -0
  34. data/lib/easy_ml/data/dataset/splitters.rb +9 -0
  35. data/lib/easy_ml/data/dataset.rb +430 -0
  36. data/lib/easy_ml/data/datasource/datasource_factory.rb +60 -0
  37. data/lib/easy_ml/data/datasource/file_datasource.rb +40 -0
  38. data/lib/easy_ml/data/datasource/merged_datasource.rb +64 -0
  39. data/lib/easy_ml/data/datasource/polars_datasource.rb +41 -0
  40. data/lib/easy_ml/data/datasource/s3_datasource.rb +89 -0
  41. data/lib/easy_ml/data/datasource.rb +33 -0
  42. data/lib/easy_ml/data/preprocessor/preprocessor.rb +205 -0
  43. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +403 -0
  44. data/lib/easy_ml/data/preprocessor/utils.rb +17 -0
  45. data/lib/easy_ml/data/preprocessor.rb +238 -0
  46. data/lib/easy_ml/data/utils.rb +50 -0
  47. data/lib/easy_ml/data.rb +8 -0
  48. data/lib/easy_ml/deployment.rb +5 -0
  49. data/lib/easy_ml/engine.rb +26 -0
  50. data/lib/easy_ml/initializers/inflections.rb +4 -0
  51. data/lib/easy_ml/logging.rb +38 -0
  52. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +42 -0
  53. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +23 -0
  54. data/lib/easy_ml/support/age.rb +27 -0
  55. data/lib/easy_ml/support/est.rb +1 -0
  56. data/lib/easy_ml/support/file_rotate.rb +23 -0
  57. data/lib/easy_ml/support/git_ignorable.rb +66 -0
  58. data/lib/easy_ml/support/synced_directory.rb +134 -0
  59. data/lib/easy_ml/support/utc.rb +1 -0
  60. data/lib/easy_ml/support.rb +10 -0
  61. data/lib/easy_ml/trainer.rb +92 -0
  62. data/lib/easy_ml/transforms.rb +29 -0
  63. data/lib/easy_ml/version.rb +5 -0
  64. data/lib/easy_ml.rb +23 -0
  65. metadata +353 -0
@@ -0,0 +1,89 @@
1
+ require "polars"
2
+
3
module EasyML::Data
  class Datasource
    # Datasource that reads CSV files from a local directory (+root_dir+)
    # which is kept in sync with an S3 bucket/prefix through
    # EasyML::Support::SyncedDirectory.
    class S3Datasource
      include GlueGun::DSL

      # Name of the file (inside +root_dir+) used to cache the merged frame.
      COMBINED_FILENAME = "combined_data.csv"

      attribute :root_dir, :string
      validates :root_dir, presence: true

      attribute :polars_args, :hash, default: {}
      # NOTE(review): if this is an ActiveModel-style presence check, the
      # default `{}` is blank and would fail validation -- confirm against
      # GlueGun before relying on the default.
      validates :polars_args, presence: true

      attribute :s3_bucket, :string
      validates :s3_bucket, presence: true

      attribute :s3_prefix, :string
      validates :s3_prefix, presence: true

      # Stores the prefix without a leading or trailing slash.
      # Anchored with \A / \z so only the ends of the whole string are touched
      # (^ and $ would also match at every embedded line break).
      def s3_prefix=(arg)
        super(arg.to_s.gsub(%r{\A/|/\z}, ""))
      end

      attribute :s3_access_key_id, :string
      validates :s3_access_key_id, presence: true

      attribute :s3_secret_access_key, :string
      validates :s3_secret_access_key, presence: true

      dependency :synced_directory do |dependency|
        dependency.set_class EasyML::Support::SyncedDirectory
        dependency.bind_attribute :root_dir, required: true
        dependency.bind_attribute :s3_bucket, required: true
        dependency.bind_attribute :s3_prefix
        dependency.bind_attribute :s3_access_key_id, required: true
        dependency.bind_attribute :s3_secret_access_key, required: true
      end

      delegate :files, :last_updated_at, to: :synced_directory

      # Syncs the directory, then yields one DataFrame per file.
      #
      # @param of [Integer] requested batch size. Currently ignored: each file
      #   is yielded as a single batch (TODO: implement real batching).
      # @yieldparam df [Polars::DataFrame] contents of one file
      # @return [Enumerator] when no block is given; otherwise the result of
      #   iterating +files+
      def in_batches(of: 10_000)
        return enum_for(:in_batches, of: of) unless block_given?

        pull
        files.each { |file| yield read_csv(file) }
      end

      # Asks the synced directory to sync.
      def refresh!
        synced_directory.sync
      end

      # Returns every file merged into one DataFrame.
      #
      # The merged frame is cached as COMBINED_FILENAME inside +root_dir+. It
      # is rebuilt when the sync reports a change or when the cache file does
      # not exist yet; otherwise the cached file is read back.
      #
      # NOTE(review): the cache file lives inside the synced directory. If
      # SyncedDirectory#files lists every file under +root_dir+, the cache
      # could be picked up as an input on a later merge -- confirm.
      #
      # @return [Polars::DataFrame, nil] nil when there are no files to merge
      def data
        did_sync = pull
        output_path = File.join(root_dir, COMBINED_FILENAME)

        if did_sync || !File.exist?(output_path)
          merge_data.tap { |merged| merged&.write_csv(output_path) }
        else
          read_csv(output_path)
        end
      end

      private

      # Runs a sync and returns whatever SyncedDirectory#sync returned
      # (treated by callers as "did a sync actually happen"). The value is
      # also yielded when a block is given.
      def pull
        did_sync = synced_directory.sync
        yield did_sync if block_given?
        did_sync
      end

      # Reads one CSV with the configured Polars options. Keys are symbolized
      # because they are splatted as keyword arguments.
      def read_csv(path)
        Polars.read_csv(path, **(polars_args || {}).to_h.transform_keys(&:to_sym))
      end

      # Stacks every file vertically, in the order returned by +files+.
      # Returns nil when +files+ is empty.
      def merge_data
        files.reduce(nil) do |stacked, file|
          df = read_csv(file)
          stacked.nil? ? df : stacked.vstack(df)
        end
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
module ML
  module Data
    # Abstract datasource interface. Every public method below raises
    # NotImplementedError and is meant to be overridden by a concrete
    # datasource.
    #
    # NOTE(review): this class is declared under `ML::Data`, while
    # S3Datasource is declared under `EasyML::Data::Datasource` -- confirm
    # which namespace is intended.
    #
    # NOTE: NotImplementedError descends from ScriptError, so a bare `rescue`
    # (StandardError) will not catch it.
    class Datasource
      attr_reader :root_dir, :polars_args

      # Iterate over the data in batches.
      #
      # @param of [Integer] requested batch size
      def in_batches(of: 10_000)
        raise(NotImplementedError, "Subclasses must implement #in_batches")
      end

      # List the files backing this datasource.
      def files
        raise(NotImplementedError, "Subclasses must implement #files")
      end

      # Time the underlying data was last updated.
      def last_updated_at
        raise(NotImplementedError, "Subclasses must implement #last_updated_at")
      end

      # Re-fetch the underlying data.
      def refresh!
        raise(NotImplementedError, "Subclasses must implement #refresh!")
      end

      # Return the full dataset.
      def data
        raise(NotImplementedError, "Subclasses must implement #data")
      end

      # Load the concrete datasources after the interface methods are defined.
      %w[
        s3_datasource
        file_datasource
        polars_datasource
        merged_datasource
        datasource_factory
      ].each do |name|
        require_relative "datasource/#{name}"
      end
    end
  end
end
@@ -0,0 +1,205 @@
1
+ require "fileutils"
2
+ require "polars"
3
+ require "date"
4
+ require "json"
5
+
6
module EasyML::Data
  class PreprocessingSteps
    # Fits and applies per-column preprocessing strategies to a Polars
    # DataFrame.
    #
    # The configuration is a hash with a :training section and an optional
    # :inference section, each mapping column name => { strategy => options }.
    # One SimpleImputer is built per (column, strategy) pair, except for
    # one_hot, which is handled inline by #apply_one_hot.
    class Preprocessor
      include EasyML::Data::PreprocessingSteps::Utils

      # Fallback minimum count for a categorical value to get its own one-hot
      # column when the categorical imputer's options do not provide
      # "categorical_min".
      CATEGORICAL_COMMON_MIN = 50

      # Order in which strategies are applied to a column.
      PREPROCESSING_ORDER = %w[clip mean median constant categorical one_hot ffill custom fill_date
                               add_datepart].freeze

      attr_accessor :directory, :preprocessing_steps, :verbose, :imputers, :environment

      # @param directory [String, nil] path handed to each imputer
      # @param preprocessing_steps [Hash] :training / :inference configuration
      # @param verbose [Boolean] print progress and warnings with puts
      # @param environment [String]
      def initialize(directory: nil, preprocessing_steps: {}, verbose: false, environment: "development")
        @directory = directory
        @preprocessing_steps = standardize_config(preprocessing_steps).with_indifferent_access
        @verbose = verbose
        @environment = environment
      end

      # Fits every training and inference imputer on +df+.
      #
      # The stored configuration is not modified: inference steps are merged
      # into a copy, so a later postprocess(inference: false) still applies
      # training steps only.
      #
      # "clip" is the only strategy that also transforms +df+ during fit, so
      # that later strategies are fitted on clipped values.
      #
      # @param df [Polars::DataFrame, nil]
      # @return [Hash, nil] the imputers that were fitted; nil when there is
      #   nothing to do
      def fit(df)
        return if df.nil?
        return if preprocessing_steps.keys.none?

        puts "Preprocessing..." if verbose
        column_imputers = initialize_imputers(steps_for(inference: true))

        did_cleanup = false
        column_imputers.each do |col, strategy_imputers|
          sorted_strategies(strategy_imputers).each do |strategy|
            imputer = strategy_imputers[strategy]
            # cleanup is invoked once, on the first imputer visited.
            unless did_cleanup
              imputer.cleanup
              did_cleanup = true
            end

            actual_col = find_column(df, col)
            if actual_col
              imputer.fit(df[actual_col], df)
              df[actual_col] = imputer.transform(df[actual_col]) if strategy.to_s == "clip"
            elsif verbose
              puts "Warning: Column '#{col}' not found in DataFrame during fit process."
            end
          end
        end
      end

      # Applies the configured transformations to +df+.
      #
      # @param df [Polars::DataFrame]
      # @param inference [Boolean] when true, inference steps are applied on
      #   top of the training steps
      # @return [Polars::DataFrame]
      def postprocess(df, inference: false)
        puts "Postprocessing..." if verbose
        return df if preprocessing_steps.keys.none?

        df = apply_transformations(df, steps_for(inference: inference))

        puts "Postprocessing complete." if verbose
        df
      end

      # Statistics of every imputer #fit would fit (training and inference),
      # as { column => { strategy => statistics } }.
      def statistics
        initialize_imputers(steps_for(inference: true)).transform_values do |strategy_imputers|
          strategy_imputers.transform_values(&:statistics)
        end
      end

      # True when at least one imputer reports non-blank statistics.
      def is_fit?
        statistics.each_value.any? do |col_stats|
          col_stats.each_value.any?(&:present?)
        end
      end

      # Removes the preprocessor directory, if it is set and exists.
      def delete
        return unless @directory && File.directory?(@directory)

        FileUtils.rm_rf(@directory)
      end

      # Renames the last path segment of +directory+ to +to+ and moves the
      # directory on disk. Only the final segment is replaced, so a parent
      # folder that happens to contain the same name is left untouched.
      #
      # @param to [String] new name for the last path segment
      # @return [String] the new directory
      def move(to)
        old_dir = directory
        new_dir = File.join(File.dirname(old_dir), to.to_s)

        puts "Moving #{old_dir} to #{new_dir}"
        FileUtils.mv(old_dir, new_dir)
        @directory = new_dir
      end

      private

      # Training configuration, or an empty hash when none was given.
      def training_steps
        preprocessing_steps[:training] || {}
      end

      # Training steps, with inference steps merged on top when +inference+
      # is true. The merge is shallow (an inference entry replaces the
      # training entry for the same column) and never mutates the stored
      # configuration.
      def steps_for(inference:)
        return training_steps unless inference

        training_steps.merge(preprocessing_steps[:inference] || {})
      end

      # Case-insensitive lookup of +col+ among the DataFrame's columns.
      # Returns the column name as it appears in +df+, or nil.
      def find_column(df, col)
        wanted = col.to_s.downcase
        df.columns.find { |name| name.downcase == wanted }
      end

      # Builds { column => { strategy => SimpleImputer } } for +config+.
      # one_hot gets no imputer, and an option value of `true` is treated as
      # "no options".
      def initialize_imputers(config)
        standardize_config(config).each_with_object({}) do |(col, strategies), hash|
          hash[col] ||= {}
          strategies.each do |strategy, options|
            next if strategy.to_sym == :one_hot

            options = {} if options == true

            hash[col][strategy] = EasyML::Data::PreprocessingSteps::SimpleImputer.new(
              strategy: strategy,
              path: directory,
              attribute: col,
              options: options
            )
          end
        end
      end

      # Applies each configured strategy, in PREPROCESSING_ORDER, to the
      # matching DataFrame column. Columns missing from +df+ are skipped, with
      # a warning when verbose.
      def apply_transformations(df, config)
        imputers = initialize_imputers(config)

        standardize_config(config).each do |col, strategies|
          # Looked up per column because one_hot adds and drops columns.
          actual_col = find_column(df, col)
          if actual_col.nil?
            puts "Warning: Column '#{col}' not found in DataFrame during apply_transformations process." if verbose
            next
          end

          sorted_strategies(strategies).each do |strategy|
            if strategy.to_sym == :one_hot
              df = apply_one_hot(df, actual_col, imputers, config_col: col)
            else
              imputer = imputers.dig(col, strategy)
              df[actual_col] = imputer.transform(df[actual_col]) if imputer
            end
          end
        end

        df
      end

      # Replaces +col+ with one Int64 indicator column per approved value plus
      # a "<col>_other" indicator for every other value.
      #
      # Approved values come from the column's categorical imputer statistics
      # (values seen at least "categorical_min" times) when such an imputer
      # exists; otherwise every distinct value in the column is approved.
      #
      # @param df [Polars::DataFrame]
      # @param col [String] column name as it appears in +df+
      # @param imputers [Hash] result of #initialize_imputers
      # @param config_col [String] column name as written in the config, used
      #   to look up the categorical imputer
      # @return [Polars::DataFrame] frame without the original column
      def apply_one_hot(df, col, imputers, config_col: col)
        cat_imputer = imputers.dig(config_col, "categorical") || imputers.dig(config_col, :categorical)

        approved_values =
          if cat_imputer.present?
            # NOTE(review): assumes CATEGORICAL_COMMON_MIN is the intended
            # default when the option is absent -- confirm with SimpleImputer.
            min_count = cat_imputer.options["categorical_min"] || CATEGORICAL_COMMON_MIN
            cat_imputer.statistics[:categorical][:value].select { |_value, count| count >= min_count }.keys
          else
            df[col].uniq.to_a
          end

        # Create one-hot encoded columns
        approved_values.each do |value|
          new_col_name = "#{col}_#{value}".gsub(/-/, "_")
          df = df.with_column(
            df[col].eq(value.to_s).cast(Polars::Int64).alias(new_col_name)
          )
        end

        # Create 'other' column for unapproved values
        approved_strings = approved_values.map(&:to_s)
        other_col_name = "#{col}_other"
        df[other_col_name] = df[col].map_elements do |value|
          approved_strings.exclude?(value)
        end.cast(Polars::Int64)
        df.drop([col])
      end

      # Strategy keys ordered by PREPROCESSING_ORDER. Keys may be strings or
      # symbols; strategies not listed in PREPROCESSING_ORDER sort last.
      def sorted_strategies(strategies)
        strategies.keys.sort_by do |key|
          PREPROCESSING_ORDER.index(key.to_s) || PREPROCESSING_ORDER.size
        end
      end

      # Casts +col+ to Float64 and replaces nulls with NaN.
      def prepare_for_imputation(df, col)
        df = df.with_column(Polars.col(col).cast(Polars::Float64))
        df.with_column(Polars.when(Polars.col(col).is_null).then(Float::NAN).otherwise(Polars.col(col)).alias(col))
      end
    end
  end
end
178
+
179
+ # TODO: decide where the commented-out git staging helpers below should live.
180
+ #
181
+ # def self.stage_required_files
182
+ # required_files.each do |file|
183
+ # git_add(file)
184
+ # end
185
+ # end
186
+
187
+ # def self.git_add(path)
188
+ # command = "git add #{path}"
189
+ # puts command if verbose
190
+ # result = `#{command}`
191
+ # puts result if verbose
192
+ # end
193
+
194
+ # def self.set_verbose(verbose)
195
+ # @verbose = verbose
196
+ # end
197
+
198
+ # def required_files
199
+ # files = Dir.entries(@directory) - %w[. ..]
200
+ # required_file_types = %w[bin]
201
+
202
+ # files.select { |file| required_file_types.any? { |ext| file.include?(ext) } }.map do |file|
203
+ # File.join(@directory, file)
204
+ # end
205
+ # end