easy_ml 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +270 -0
- data/Rakefile +12 -0
- data/app/models/easy_ml/model.rb +59 -0
- data/app/models/easy_ml/models/xgboost.rb +9 -0
- data/app/models/easy_ml/models.rb +5 -0
- data/lib/easy_ml/core/model.rb +29 -0
- data/lib/easy_ml/core/model_core.rb +181 -0
- data/lib/easy_ml/core/model_evaluator.rb +137 -0
- data/lib/easy_ml/core/models/hyperparameters/base.rb +34 -0
- data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +19 -0
- data/lib/easy_ml/core/models/hyperparameters.rb +8 -0
- data/lib/easy_ml/core/models/xgboost.rb +10 -0
- data/lib/easy_ml/core/models/xgboost_core.rb +220 -0
- data/lib/easy_ml/core/models.rb +10 -0
- data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +63 -0
- data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +50 -0
- data/lib/easy_ml/core/tuner/adapters.rb +10 -0
- data/lib/easy_ml/core/tuner.rb +105 -0
- data/lib/easy_ml/core/uploaders/model_uploader.rb +24 -0
- data/lib/easy_ml/core/uploaders.rb +7 -0
- data/lib/easy_ml/core.rb +9 -0
- data/lib/easy_ml/core_ext/pathname.rb +9 -0
- data/lib/easy_ml/core_ext.rb +5 -0
- data/lib/easy_ml/data/dataloader.rb +6 -0
- data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +31 -0
- data/lib/easy_ml/data/dataset/data/sample_info.json +1 -0
- data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +1 -0
- data/lib/easy_ml/data/dataset/splits/file_split.rb +140 -0
- data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +49 -0
- data/lib/easy_ml/data/dataset/splits/split.rb +98 -0
- data/lib/easy_ml/data/dataset/splits.rb +11 -0
- data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +43 -0
- data/lib/easy_ml/data/dataset/splitters.rb +9 -0
- data/lib/easy_ml/data/dataset.rb +430 -0
- data/lib/easy_ml/data/datasource/datasource_factory.rb +60 -0
- data/lib/easy_ml/data/datasource/file_datasource.rb +40 -0
- data/lib/easy_ml/data/datasource/merged_datasource.rb +64 -0
- data/lib/easy_ml/data/datasource/polars_datasource.rb +41 -0
- data/lib/easy_ml/data/datasource/s3_datasource.rb +89 -0
- data/lib/easy_ml/data/datasource.rb +33 -0
- data/lib/easy_ml/data/preprocessor/preprocessor.rb +205 -0
- data/lib/easy_ml/data/preprocessor/simple_imputer.rb +403 -0
- data/lib/easy_ml/data/preprocessor/utils.rb +17 -0
- data/lib/easy_ml/data/preprocessor.rb +238 -0
- data/lib/easy_ml/data/utils.rb +50 -0
- data/lib/easy_ml/data.rb +8 -0
- data/lib/easy_ml/deployment.rb +5 -0
- data/lib/easy_ml/engine.rb +26 -0
- data/lib/easy_ml/initializers/inflections.rb +4 -0
- data/lib/easy_ml/logging.rb +38 -0
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +42 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +23 -0
- data/lib/easy_ml/support/age.rb +27 -0
- data/lib/easy_ml/support/est.rb +1 -0
- data/lib/easy_ml/support/file_rotate.rb +23 -0
- data/lib/easy_ml/support/git_ignorable.rb +66 -0
- data/lib/easy_ml/support/synced_directory.rb +134 -0
- data/lib/easy_ml/support/utc.rb +1 -0
- data/lib/easy_ml/support.rb +10 -0
- data/lib/easy_ml/trainer.rb +92 -0
- data/lib/easy_ml/transforms.rb +29 -0
- data/lib/easy_ml/version.rb +5 -0
- data/lib/easy_ml.rb +23 -0
- metadata +353 -0
@@ -0,0 +1,89 @@
|
|
1
|
+
require "polars"
|
2
|
+
|
3
|
+
module EasyML::Data
|
4
|
+
class Datasource
|
5
|
+
class S3Datasource
|
6
|
+
include GlueGun::DSL
|
7
|
+
|
8
|
+
# Local directory: bound to the synced directory below and used by #data as
# the location of "combined_data.csv".
attribute :root_dir, :string
|
9
|
+
validates :root_dir, presence: true
|
10
|
+
|
11
|
+
# Keyword options splatted into every Polars.read_csv call in this class.
# NOTE(review): the default is an empty Hash while presence is validated; if
# these validations follow ActiveModel semantics an empty Hash counts as
# blank, so the default itself would be rejected -- confirm.
attribute :polars_args, :hash, default: {}
|
12
|
+
validates :polars_args, presence: true
|
13
|
+
|
14
|
+
# Bucket name; bound to the synced directory below.
attribute :s3_bucket, :string
|
15
|
+
validates :s3_bucket, presence: true
|
16
|
+
|
17
|
+
# Key prefix inside the bucket; the custom writer defined right after this
# declaration strips a leading and trailing "/" before storing it.
attribute :s3_prefix, :string
|
18
|
+
validates :s3_prefix, presence: true
|
19
|
+
# Stores the S3 key prefix with one leading and one trailing "/" removed.
#
# The value is coerced with #to_s first, so nil is stored as "". Only the
# very start and very end of the string are inspected; a "/" that merely
# sits next to an embedded newline is left untouched.
#
# @param arg [#to_s] raw prefix, e.g. "/models/v1/"
def s3_prefix=(arg)
  super(arg.to_s.delete_prefix("/").delete_suffix("/"))
end
|
22
|
+
|
23
|
+
# Access key id; required, and bound to the synced directory dependency.
attribute :s3_access_key_id, :string
|
24
|
+
validates :s3_access_key_id, presence: true
|
25
|
+
|
26
|
+
# Secret access key; required, and bound to the synced directory dependency.
attribute :s3_secret_access_key, :string
|
27
|
+
validates :s3_secret_access_key, presence: true
|
28
|
+
|
29
|
+
# Declares the `synced_directory` collaborator, an
# EasyML::Support::SyncedDirectory that receives this datasource's root_dir,
# bucket, prefix and credentials. Every binding except s3_prefix is marked
# required.
dependency :synced_directory do |dependency|
|
30
|
+
dependency.set_class EasyML::Support::SyncedDirectory
|
31
|
+
dependency.bind_attribute :root_dir, required: true
|
32
|
+
dependency.bind_attribute :s3_bucket, required: true
|
33
|
+
dependency.bind_attribute :s3_prefix
|
34
|
+
dependency.bind_attribute :s3_access_key_id, required: true
|
35
|
+
dependency.bind_attribute :s3_secret_access_key, required: true
|
36
|
+
end
|
37
|
+
|
38
|
+
# #files and #last_updated_at are answered by the synced directory.
delegate :files, :last_updated_at, to: :synced_directory
|
39
|
+
|
40
|
+
# Syncs the directory, then yields each file as the result of
# Polars.read_csv(file, **polars_args).
#
# Called without a block it returns an Enumerator instead of raising
# LocalJumpError; the sync then happens when the Enumerator is consumed.
#
# @param of [Integer] requested batch size. Currently ignored: one yield per
#   file regardless of its row count (TODO: implement).
# @yield [frame] the parsed contents of one file
# @return [Object, Enumerator] the value of `files.each` when a block is
#   given, otherwise an Enumerator
def in_batches(of: 10_000)
  return enum_for(:in_batches, of: of) unless block_given?

  pull
  files.each do |file|
    yield Polars.read_csv(file, **polars_args)
  end
end
|
48
|
+
|
49
|
+
# Runs `sync` on the synced directory and returns whatever it returns.
def refresh!
  directory = synced_directory
  directory.sync
end
|
52
|
+
|
53
|
+
# Returns all files of this datasource combined into a single frame.
#
# After a sync (or when no combined file exists yet) the files are merged via
# #merge_data and the result is written to "combined_data.csv" under
# root_dir. Otherwise that file is read back with polars_args.
#
# The result is captured in a local declared outside the block; previously it
# was assigned only inside the block, so the final reference raised NameError
# and the cached-read branch threw its frame away.
#
# @return [Object, nil] the combined frame, or nil when #merge_data produced
#   nothing
def data
  output_path = File.join(root_dir, "combined_data.csv")
  combined_df = nil

  pull do |did_sync|
    combined_df =
      if did_sync || !File.exist?(output_path)
        # Nothing to persist when there were no files to merge.
        merge_data.tap { |merged| merged&.write_csv(output_path) }
      else
        Polars.read_csv(output_path, **polars_args)
      end
  end

  combined_df
end
|
66
|
+
|
67
|
+
private
|
68
|
+
|
69
|
+
# Syncs the directory (per the original note, the synced directory only does
# work when it needs to) and hands the sync result to the optional block.
#
# @yield [synced] the value returned by `synced_directory.sync`
# @return [Object, nil] the block's value, or nil when no block is given
def pull
  synced = synced_directory.sync
  return unless block_given?

  yield synced
end
|
74
|
+
|
75
|
+
# Reads every file with Polars.read_csv and stacks the frames vertically in
# file order.
#
# @return [Object, nil] the stacked frame, or nil when there are no files
def merge_data
  files.reduce(nil) do |stacked, file|
    frame = Polars.read_csv(file, **polars_args)
    stacked.nil? ? frame : stacked.vstack(frame)
  end
end
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module ML
|
2
|
+
module Data
|
3
|
+
class Datasource
|
4
|
+
# Readers for @root_dir and @polars_args. Nothing in this class assigns them;
# presumably subclasses are expected to -- confirm.
attr_reader :root_dir, :polars_args
|
5
|
+
|
6
|
+
# Abstract hook: always raises.
#
# @param of [Integer] batch size (unused here)
# @raise [NotImplementedError] always
def in_batches(of: 10_000)
  message = "Subclasses must implement #in_batches"
  raise NotImplementedError, message
end
|
9
|
+
|
10
|
+
# Abstract hook: always raises.
#
# @raise [NotImplementedError] always
def files
  message = "Subclasses must implement #files"
  raise NotImplementedError, message
end
|
13
|
+
|
14
|
+
# Abstract hook: always raises.
#
# @raise [NotImplementedError] always
def last_updated_at
  message = "Subclasses must implement #last_updated_at"
  raise NotImplementedError, message
end
|
17
|
+
|
18
|
+
# Abstract hook: always raises.
#
# @raise [NotImplementedError] always
def refresh!
  message = "Subclasses must implement #refresh!"
  raise NotImplementedError, message
end
|
21
|
+
|
22
|
+
# Abstract hook: always raises.
#
# @raise [NotImplementedError] always
def data
  message = "Subclasses must implement #data"
  raise NotImplementedError, message
end
|
25
|
+
|
26
|
+
require_relative "datasource/s3_datasource"
|
27
|
+
require_relative "datasource/file_datasource"
|
28
|
+
require_relative "datasource/polars_datasource"
|
29
|
+
require_relative "datasource/merged_datasource"
|
30
|
+
require_relative "datasource/datasource_factory"
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
@@ -0,0 +1,205 @@
|
|
1
|
+
require "fileutils"
|
2
|
+
require "polars"
|
3
|
+
require "date"
|
4
|
+
require "json"
|
5
|
+
|
6
|
+
module EasyML::Data
|
7
|
+
class PreprocessingSteps
|
8
|
+
class Preprocessor
|
9
|
+
include EasyML::Data::PreprocessingSteps::Utils
|
10
|
+
|
11
|
+
# Not referenced elsewhere in this file; presumably the default minimum count
# for a categorical value to be kept -- confirm against SimpleImputer.
CATEGORICAL_COMMON_MIN = 50
# Order in which a column's strategies are applied (see sorted_strategies).
# Frozen so the shared ordering cannot be mutated at runtime.
PREPROCESSING_ORDER = %w[clip mean median constant categorical one_hot ffill custom fill_date add_datepart].freeze
|
13
|
+
|
14
|
+
# directory: passed to each SimpleImputer as `path:` and removed by #delete.
# preprocessing_steps: the standardized config set in #initialize.
# NOTE(review): no visible code assigns @imputers (#fit builds its imputers
# in a local variable), so this accessor looks unused -- confirm.
attr_accessor :directory, :preprocessing_steps, :verbose, :imputers, :environment
|
15
|
+
|
16
|
+
# Builds a preprocessor.
#
# @param directory [String, nil] path later handed to the imputers
# @param preprocessing_steps [Hash] raw config; it is passed through
#   standardize_config and stored with indifferent access
# @param verbose [Boolean] enables progress output via puts
# @param environment [String] stored as-is
def initialize(directory: nil, preprocessing_steps: {}, verbose: false, environment: "development")
  normalized_steps = standardize_config(preprocessing_steps)

  @directory = directory
  @preprocessing_steps = normalized_steps.with_indifferent_access
  @verbose = verbose
  @environment = environment
end
|
22
|
+
|
23
|
+
# Fits one imputer per (column, strategy) pair on df.
#
# Imputers are built from the training steps merged with the inference steps.
# The merge is non-destructive: the stored training config is no longer
# mutated (the previous `merge!` folded inference steps into it for good), and
# a missing :training entry is treated as empty.
#
# `cleanup` is called once, on the first imputer visited. Columns are matched
# case-insensitively. "clip" is the only strategy whose transform is written
# back into df during fit.
#
# @param df [Object, nil] frame responding to `columns`, `[]` and `[]=`
# @return [Hash, nil] the imputers keyed by column then strategy, or nil when
#   df is nil or no steps are configured
def fit(df)
  return if df.nil?
  return if preprocessing_steps.keys.none?

  puts "Preprocessing..." if verbose

  training_steps = preprocessing_steps[:training] || {}
  inference_steps = preprocessing_steps[:inference] || {}
  imputers_by_column = initialize_imputers(training_steps.merge(inference_steps))

  did_cleanup = false
  imputers_by_column.each do |col, column_imputers|
    sorted_strategies(column_imputers).each do |strategy|
      imputer = column_imputers[strategy]
      unless did_cleanup
        imputer.cleanup
        did_cleanup = true
      end

      if df.columns.map(&:downcase).include?(col.downcase)
        actual_col = df.columns.find { |c| c.downcase == imputer.attribute.downcase }
        imputer.fit(df[actual_col], df)
        # This is the only strategy that transforms during fit.
        df[actual_col] = imputer.transform(df[actual_col]) if strategy == "clip"
      elsif verbose
        puts "Warning: Column '#{col}' not found in DataFrame during fit process."
      end
    end
  end
end
|
52
|
+
|
53
|
+
# Applies the configured transformations to df.
#
# @param df [Object] frame to transform
# @param inference [Boolean] when true the inference steps are merged over
#   the training steps (non-destructively); otherwise only training steps run
# @return [Object] df unchanged when no steps are configured, else the result
#   of apply_transformations
def postprocess(df, inference: false)
  puts "Postprocessing..." if verbose
  return df if preprocessing_steps.keys.none?

  steps = preprocessing_steps[:training]
  steps = steps.merge(preprocessing_steps[:inference] || {}) if inference

  transformed = apply_transformations(df, steps)
  puts "Postprocessing complete." if @verbose
  transformed
end
|
68
|
+
|
69
|
+
# Collects `statistics` from a freshly built imputer for every training
# column and strategy.
#
# @return [Hash] column => { strategy => imputer.statistics }
def statistics
  imputers_by_column = initialize_imputers(preprocessing_steps[:training])

  imputers_by_column.to_h do |col, by_strategy|
    [col, by_strategy.to_h { |strategy, imputer| [strategy, imputer.statistics] }]
  end
end
|
76
|
+
|
77
|
+
# True when at least one strategy of at least one column reports statistics
# that are `present?`.
#
# @return [Boolean]
def is_fit?
  statistics.each_value.any? do |col_stats|
    col_stats.each_value.any?(&:present?)
  end
end
|
80
|
+
|
81
|
+
# Removes the preprocessor's directory recursively.
#
# Does nothing when no directory is configured (the constructor default is
# nil, which used to raise TypeError in File.directory?) or when the path is
# not an existing directory.
#
# @return [Object, nil] the result of FileUtils.rm_rf, or nil when nothing
#   was removed
def delete
  return if @directory.nil?
  return unless File.directory?(@directory)

  FileUtils.rm_rf(@directory)
end
|
86
|
+
|
87
|
+
# Renames the last path segment of `directory` to `to` and moves the
# directory on disk.
#
# Only the final segment is replaced, and it is matched literally. The former
# `gsub(Regexp.new(segment), to)` rewrote every occurrence of that text in the
# path and interpreted it as a regular expression.
#
# @param to [String] new name for the last path segment
# @return [String] the new directory path, also stored in @directory
def move(to)
  old_dir = directory
  current_env = old_dir.split("/").last
  # The lookahead pins the match to the end of the path (tolerating trailing
  # slashes); the block form keeps backslashes in `to` literal.
  new_dir = old_dir.sub(%r{#{Regexp.escape(current_env)}(?=/*\z)}) { to }

  puts "Moving #{old_dir} to #{new_dir}"
  FileUtils.mv(old_dir, new_dir)
  @directory = new_dir
end
|
96
|
+
|
97
|
+
private
|
98
|
+
|
99
|
+
# Builds a SimpleImputer for every (column, strategy) pair in config.
#
# "one_hot" entries are skipped here (apply_one_hot handles them), but the
# column still gets an empty entry. An options value of `true` is replaced by
# an empty Hash.
#
# @param config [Hash] column => { strategy => options }
# @return [Hash] column => { strategy => SimpleImputer }
def initialize_imputers(config)
  built = {}

  standardize_config(config).each do |col, strategies|
    per_column = (built[col] ||= {})

    strategies.each do |strategy, options|
      next if strategy.to_sym == :one_hot

      per_column[strategy] = EasyML::Data::PreprocessingSteps::SimpleImputer.new(
        strategy: strategy,
        path: directory,
        attribute: col,
        options: options == true ? {} : options
      )
    end
  end

  built
end
|
116
|
+
|
117
|
+
# Runs every configured strategy over its column, in sorted_strategies order.
#
# Columns are matched case-insensitively; a column missing from df is skipped
# (with a warning when @verbose). "one_hot" is delegated to apply_one_hot and
# replaces df; every other strategy writes its imputer's transform back into
# the matched column.
#
# @param df [Object] frame responding to `columns`, `[]` and `[]=`
# @param config [Hash] column => { strategy => options }
# @return [Object] the transformed frame
def apply_transformations(df, config)
  imputers = initialize_imputers(config)

  standardize_config(config).each do |col, strategies|
    actual_col = df.columns.find { |name| name.downcase == col.downcase }

    if actual_col.nil?
      puts "Warning: Column '#{col}' not found in DataFrame during apply_transformations process." if @verbose
      next
    end

    sorted_strategies(strategies).each do |strategy|
      if strategy.to_sym == :one_hot
        df = apply_one_hot(df, col, imputers)
        next
      end

      imputer = imputers.dig(col, strategy)
      df[actual_col] = imputer.transform(df[actual_col]) if imputer
    end
  end

  df
end
|
139
|
+
|
140
|
+
# One-hot encodes column `col` and drops the original column.
#
# Approved values come from the column's "categorical" imputer when there is
# one (values whose count is at least options["categorical_min"]); otherwise
# every distinct value of the column is approved. Each approved value gets a
# 0/1 column named "<col>_<value>" with "-" replaced by "_". A "<col>_other"
# column is 1 wherever the value is not approved.
#
# The stringified approved values and the threshold are computed once up
# front; they used to be rebuilt for every element of the column.
#
# @param df [Object] frame to encode
# @param col [String] column name, used for both df and the imputer lookup
# @param imputers [Hash] column => { strategy => imputer }
# @return [Object] the frame without `col` and with the indicator columns
def apply_one_hot(df, col, imputers)
  cat_imputer = imputers.dig(col, "categorical")

  approved_values =
    if cat_imputer.present?
      minimum = cat_imputer.options["categorical_min"]
      cat_imputer.statistics[:categorical][:value].select { |_value, count| count >= minimum }.keys
    else
      df[col].uniq.to_a
    end
  approved_labels = approved_values.map(&:to_s)

  # Create one-hot encoded columns
  approved_labels.each do |label|
    new_col_name = "#{col}_#{label}".tr("-", "_")
    df = df.with_column(df[col].eq(label).cast(Polars::Int64).alias(new_col_name))
  end

  # Create 'other' column for unapproved values
  other_col_name = "#{col}_other"
  df[other_col_name] = df[col].map_elements { |value| !approved_labels.include?(value) }.cast(Polars::Int64)
  df.drop([col])
end
|
164
|
+
|
165
|
+
# Returns the strategy keys ordered by their position in
# PREPROCESSING_ORDER.
#
# Keys are compared by their string form, so Symbol keys sort like their
# String counterparts. Keys that are not listed go last, in their original
# relative order, instead of making sort_by compare nil with an Integer.
# The keys themselves are returned unchanged.
#
# @param strategies [Hash] strategy => options (or imputer)
# @return [Array] the keys in application order
def sorted_strategies(strategies)
  strategies.keys.sort_by.with_index do |key, position|
    [PREPROCESSING_ORDER.index(key.to_s) || PREPROCESSING_ORDER.size, position]
  end
end
|
170
|
+
|
171
|
+
# Casts `col` to Float64, then replaces its nulls with Float::NAN.
#
# @param df [Object] frame responding to `with_column`
# @param col [String] column name
# @return [Object] the frame returned by the second `with_column`
def prepare_for_imputation(df, col)
  column = Polars.col(col)
  as_float = df.with_column(column.cast(Polars::Float64))

  nan_for_null = Polars.when(column.is_null).then(Float::NAN).otherwise(column).alias(col)
  as_float.with_column(nan_for_null)
end
|
175
|
+
end
|
176
|
+
end
|
177
|
+
end
|
178
|
+
|
179
|
+
# TODO: decide where the disabled helpers below should live.
|
180
|
+
#
|
181
|
+
# def self.stage_required_files
|
182
|
+
# required_files.each do |file|
|
183
|
+
# git_add(file)
|
184
|
+
# end
|
185
|
+
# end
|
186
|
+
|
187
|
+
# def self.git_add(path)
|
188
|
+
# command = "git add #{path}"
|
189
|
+
# puts command if verbose
|
190
|
+
# result = `#{command}`
|
191
|
+
# puts result if verbose
|
192
|
+
# end
|
193
|
+
|
194
|
+
# def self.set_verbose(verbose)
|
195
|
+
# @verbose = verbose
|
196
|
+
# end
|
197
|
+
|
198
|
+
# def required_files
|
199
|
+
# files = Dir.entries(@directory) - %w[. ..]
|
200
|
+
# required_file_types = %w[bin]
|
201
|
+
|
202
|
+
# files.select { |file| required_file_types.any? { |ext| file.include?(ext) } }.map do |file|
|
203
|
+
# File.join(@directory, file)
|
204
|
+
# end
|
205
|
+
# end
|