easy_ml 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +270 -0
- data/Rakefile +12 -0
- data/app/models/easy_ml/model.rb +59 -0
- data/app/models/easy_ml/models/xgboost.rb +9 -0
- data/app/models/easy_ml/models.rb +5 -0
- data/lib/easy_ml/core/model.rb +29 -0
- data/lib/easy_ml/core/model_core.rb +181 -0
- data/lib/easy_ml/core/model_evaluator.rb +137 -0
- data/lib/easy_ml/core/models/hyperparameters/base.rb +34 -0
- data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +19 -0
- data/lib/easy_ml/core/models/hyperparameters.rb +8 -0
- data/lib/easy_ml/core/models/xgboost.rb +10 -0
- data/lib/easy_ml/core/models/xgboost_core.rb +220 -0
- data/lib/easy_ml/core/models.rb +10 -0
- data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +63 -0
- data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +50 -0
- data/lib/easy_ml/core/tuner/adapters.rb +10 -0
- data/lib/easy_ml/core/tuner.rb +105 -0
- data/lib/easy_ml/core/uploaders/model_uploader.rb +24 -0
- data/lib/easy_ml/core/uploaders.rb +7 -0
- data/lib/easy_ml/core.rb +9 -0
- data/lib/easy_ml/core_ext/pathname.rb +9 -0
- data/lib/easy_ml/core_ext.rb +5 -0
- data/lib/easy_ml/data/dataloader.rb +6 -0
- data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +31 -0
- data/lib/easy_ml/data/dataset/data/sample_info.json +1 -0
- data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +1 -0
- data/lib/easy_ml/data/dataset/splits/file_split.rb +140 -0
- data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +49 -0
- data/lib/easy_ml/data/dataset/splits/split.rb +98 -0
- data/lib/easy_ml/data/dataset/splits.rb +11 -0
- data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +43 -0
- data/lib/easy_ml/data/dataset/splitters.rb +9 -0
- data/lib/easy_ml/data/dataset.rb +430 -0
- data/lib/easy_ml/data/datasource/datasource_factory.rb +60 -0
- data/lib/easy_ml/data/datasource/file_datasource.rb +40 -0
- data/lib/easy_ml/data/datasource/merged_datasource.rb +64 -0
- data/lib/easy_ml/data/datasource/polars_datasource.rb +41 -0
- data/lib/easy_ml/data/datasource/s3_datasource.rb +89 -0
- data/lib/easy_ml/data/datasource.rb +33 -0
- data/lib/easy_ml/data/preprocessor/preprocessor.rb +205 -0
- data/lib/easy_ml/data/preprocessor/simple_imputer.rb +403 -0
- data/lib/easy_ml/data/preprocessor/utils.rb +17 -0
- data/lib/easy_ml/data/preprocessor.rb +238 -0
- data/lib/easy_ml/data/utils.rb +50 -0
- data/lib/easy_ml/data.rb +8 -0
- data/lib/easy_ml/deployment.rb +5 -0
- data/lib/easy_ml/engine.rb +26 -0
- data/lib/easy_ml/initializers/inflections.rb +4 -0
- data/lib/easy_ml/logging.rb +38 -0
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +42 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +23 -0
- data/lib/easy_ml/support/age.rb +27 -0
- data/lib/easy_ml/support/est.rb +1 -0
- data/lib/easy_ml/support/file_rotate.rb +23 -0
- data/lib/easy_ml/support/git_ignorable.rb +66 -0
- data/lib/easy_ml/support/synced_directory.rb +134 -0
- data/lib/easy_ml/support/utc.rb +1 -0
- data/lib/easy_ml/support.rb +10 -0
- data/lib/easy_ml/trainer.rb +92 -0
- data/lib/easy_ml/transforms.rb +29 -0
- data/lib/easy_ml/version.rb +5 -0
- data/lib/easy_ml.rb +23 -0
- metadata +353 -0
data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
{
|
2
|
+
"annual_revenue": {
|
3
|
+
"median": {
|
4
|
+
"value": 3000.0,
|
5
|
+
"original_dtype": {
|
6
|
+
"__type__": "polars_dtype",
|
7
|
+
"value": "Polars::Int64"
|
8
|
+
}
|
9
|
+
}
|
10
|
+
},
|
11
|
+
"loan_purpose": {
|
12
|
+
"categorical": {
|
13
|
+
"value": {
|
14
|
+
"payroll": 4,
|
15
|
+
"expansion": 1
|
16
|
+
},
|
17
|
+
"label_encoder": {
|
18
|
+
"expansion": 0,
|
19
|
+
"payroll": 1
|
20
|
+
},
|
21
|
+
"label_decoder": {
|
22
|
+
"0": "expansion",
|
23
|
+
"1": "payroll"
|
24
|
+
},
|
25
|
+
"original_dtype": {
|
26
|
+
"__type__": "polars_dtype",
|
27
|
+
"value": "Polars::String"
|
28
|
+
}
|
29
|
+
}
|
30
|
+
}
|
31
|
+
}
|
@@ -0,0 +1 @@
|
|
1
|
+
{"previous_sample":1.0}
|
@@ -0,0 +1 @@
|
|
1
|
+
{"previous_sample":1.0}
|
@@ -0,0 +1,140 @@
|
|
1
|
+
require_relative "split"

module EasyML
  module Data
    class Dataset
      module Splits
        # Disk-backed split storage: each segment (train/test/valid) is a
        # directory of CSV files under +dir+, rotated to a fresh file once a
        # file reaches +max_rows_per_file+ rows.
        class FileSplit < Split
          include GlueGun::DSL
          include EasyML::Data::Utils

          attribute :dir, :string
          attribute :polars_args, :hash, default: {}
          attribute :max_rows_per_file, :integer, default: 1_000_000
          attribute :batch_size, :integer, default: 10_000
          attribute :sample, :float, default: 1.0
          attribute :verbose, :boolean, default: false

          def initialize(options)
            super
            FileUtils.mkdir_p(dir)
          end

          # Appends +df+ to the segment's current CSV file, rolling over to a
          # fresh file whenever the per-file row limit would be exceeded.
          def save(segment, df)
            segment_dir = File.join(dir, segment.to_s)
            FileUtils.mkdir_p(segment_dir)

            current_file = current_file_for_segment(segment)
            # NOTE: load_df re-reads the file just to count rows — O(file size)
            # per save call, acceptable for files capped at max_rows_per_file.
            current_row_count = current_file && File.exist?(current_file) ? load_df(current_file).shape[0] : 0
            remaining_rows = max_rows_per_file - current_row_count

            while df.shape[0] > 0
              if df.shape[0] <= remaining_rows
                append_to_csv(df, current_file)
                break
              else
                df_to_append = df.slice(0, remaining_rows)
                df = df.slice(remaining_rows, df.shape[0] - remaining_rows)
                append_to_csv(df_to_append, current_file)
                current_file = new_file_path_for_segment(segment)
                remaining_rows = max_rows_per_file
              end
            end
          end

          # Reads a segment back. With a block, streams file-by-file through
          # the block (reduce-style, via the Split helpers) and returns the
          # accumulated result; without a block, returns one combined
          # dataframe, or [xs, ys] when +split_ys+ is true. Returns nil
          # (or [nil, nil] when split_ys) for an empty segment.
          def read(segment, split_ys: false, target: nil, drop_cols: [], &block)
            files = files_for_segment(segment)

            if block_given?
              result = nil
              total_rows = files.sum { |file| load_df(file).shape[0] }
              progress_bar = create_progress_bar(segment, total_rows) if verbose

              files.each do |file|
                df = load_df(file)
                # Capture the row count BEFORE sampling/column drops: the bar's
                # total was computed from full file sizes, so incrementing by
                # the post-sample count would leave the bar short of its total
                # whenever sample < 1.0.
                file_rows = df.shape[0]
                df = sample_data(df) if sample < 1.0
                drop_cols &= df.columns
                df = df.drop(drop_cols) unless drop_cols.empty?

                if split_ys
                  xs, ys = split_features_targets(df, true, target)
                  result = process_block_with_split_ys(block, result, xs, ys)
                else
                  result = process_block_without_split_ys(block, result, df)
                end

                progress_bar.progress += file_rows if verbose
              end
              progress_bar.finish if verbose
              result
            elsif files.empty?
              return nil, nil if split_ys

              nil
            else
              combined_df = combine_dataframes(files)
              combined_df = sample_data(combined_df) if sample < 1.0
              drop_cols &= combined_df.columns
              combined_df = combined_df.drop(drop_cols) unless drop_cols.empty?
              split_features_targets(combined_df, split_ys, target)
            end
          end

          # Deletes every stored file and recreates the empty root directory.
          def cleanup
            FileUtils.rm_rf(dir)
            FileUtils.mkdir_p(dir)
          end

          # Timestamp of the most recent write across all segments, or nil if
          # nothing has been stored yet.
          def split_at
            return nil if output_files.empty?

            output_files.map { |file| File.mtime(file) }.max
          end

          private

          def read_csv_batched(path)
            Polars.read_csv_batched(path, batch_size: batch_size, **polars_args)
          end

          # Loads one CSV into a dataframe. Named load_df (not df) to avoid
          # being shadowed by the `df` locals and parameters used throughout
          # this class, which previously forced `self.df(...)` call sites.
          def load_df(path)
            Polars.read_csv(path, **polars_args)
          end

          def output_files
            Dir.glob("#{dir}/**/*.csv")
          end

          def files_for_segment(segment)
            segment_dir = File.join(dir, segment.to_s)
            Dir.glob(File.join(segment_dir, "**/*.csv")).sort
          end

          # Returns the newest file for the segment, or a fresh path when the
          # segment is empty or its newest file is already at the row limit.
          def current_file_for_segment(segment)
            current_file = files_for_segment(segment).last
            return new_file_path_for_segment(segment) if current_file.nil?

            row_count = load_df(current_file).shape[0]
            if row_count >= max_rows_per_file
              new_file_path_for_segment(segment)
            else
              current_file
            end
          end

          # Next file name is derived from the count of existing files, giving
          # zero-padded sequential names like train_0000.csv, train_0001.csv.
          def new_file_path_for_segment(segment)
            segment_dir = File.join(dir, segment.to_s)
            file_number = Dir.glob(File.join(segment_dir, "*.csv")).count
            File.join(segment_dir, "#{segment}_%04d.csv" % file_number)
          end

          def combine_dataframes(files)
            dfs = files.map { |file| load_df(file) }
            Polars.concat(dfs)
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module EasyML
  module Data
    class Dataset
      module Splits
        # Holds the train/test/valid splits entirely in memory, keyed by
        # segment name. Useful for small datasets and tests where writing
        # CSVs to disk (FileSplit) would be wasteful.
        class InMemorySplit < Split
          include GlueGun::DSL

          attribute :sample, :float, default: 1.0
          def initialize(options)
            super
            @data = {}
          end

          # Stores the dataframe for +segment+, replacing any previous one.
          def save(segment, df)
            @data[segment] = df
          end

          # Returns the stored dataframe for +segment+ (nil when absent),
          # optionally sampled, with +drop_cols+ removed, split into
          # [xs, ys] when +split_ys+, or fed through +block+ via the
          # Split helpers.
          def read(segment, split_ys: false, target: nil, drop_cols: [], &block)
            df = @data[segment]
            return nil if df.nil?

            df = sample_data(df) if sample < 1.0
            drop_cols &= df.columns
            df = df.drop(drop_cols) unless drop_cols.empty?

            return split_features_targets(df, split_ys, target) unless block_given?

            if split_ys
              xs, ys = split_features_targets(df, true, target)
              process_block_with_split_ys(block, nil, xs, ys)
            else
              process_block_without_split_ys(block, nil, df)
            end
          end

          # Discards all stored segments.
          def cleanup
            @data.clear
          end

          # In-memory data has no file mtime, so report the current time
          # once anything has been stored; nil when empty.
          def split_at
            @data.empty? ? nil : Time.now
          end
        end
      end
    end
  end
end
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
module EasyML
  module Data
    class Dataset
      module Splits
        # Abstract base for split storage backends (see FileSplit and
        # InMemorySplit). Subclasses must implement #save, #read, #cleanup
        # and #split_at; this class supplies the shared attributes plus the
        # sampling, feature/target splitting, and block-dispatch helpers.
        class Split
          include GlueGun::DSL
          include EasyML::Data::Utils

          attribute :polars_args, :hash, default: {}
          attribute :max_rows_per_file, :integer, default: 1_000_000
          attribute :batch_size, :integer, default: 10_000
          attribute :sample, :float, default: 1.0
          attribute :verbose, :boolean, default: false

          def save(segment, df)
            raise NotImplementedError, "Subclasses must implement #save"
          end

          def read(segment, split_ys: false, target: nil, drop_cols: [], &block)
            raise NotImplementedError, "Subclasses must implement #read"
          end

          # Convenience readers: split.train, split.test, split.valid each
          # delegate to #read with the matching segment name.
          %i[train test valid].each do |segment|
            define_method(segment) do |&block|
              read(segment, &block)
            end
          end

          def cleanup
            raise NotImplementedError, "Subclasses must implement #cleanup"
          end

          def split_at
            raise NotImplementedError, "Subclasses must implement #split_at"
          end

          protected

          # When +split_ys+ is true, returns [features, targets]; otherwise
          # passes +df+ through untouched. A target column is mandatory for
          # the split case.
          def split_features_targets(df, split_ys, target)
            return df unless split_ys
            raise ArgumentError, "Target column must be specified when split_ys is true" if target.nil?

            [df.drop(target), df.select(target)]
          end

          # Deterministically samples +sample+ fraction of rows (seeded, so
          # repeated reads of the same data agree).
          def sample_data(df)
            return df if sample >= 1.0

            df.sample(n: (df.shape[0] * sample).ceil, seed: 42)
          end

          def create_progress_bar(segment, total_rows)
            ProgressBar.create(
              title: "Reading #{segment}",
              total: total_rows,
              format: "%t: |%B| %p%% %e"
            )
          end

          # Dispatches on block arity for the [xs, ys] shape. A 3-arg block
          # is reduce-style: the first batch seeds the accumulator with
          # [xs, ys] WITHOUT invoking the block; later batches receive
          # (result, xs, ys). A 2-arg block is called for side effects and
          # the accumulator is passed through unchanged.
          def process_block_with_split_ys(block, result, xs, ys)
            arity = block.arity
            if arity == 3
              result.nil? ? [xs, ys] : block.call(result, xs, ys)
            elsif arity == 2
              block.call(xs, ys)
              result
            else
              raise ArgumentError, "Block must accept 2 or 3 arguments when split_ys is true"
            end
          end

          # Same dispatch for the single-dataframe shape: 2-arg blocks are
          # reduce-style (first batch seeds the accumulator with df), 1-arg
          # blocks are side-effect-only.
          def process_block_without_split_ys(block, result, df)
            arity = block.arity
            if arity == 2
              result.nil? ? df : block.call(result, df)
            elsif arity == 1
              block.call(df)
              result
            else
              raise ArgumentError, "Block must accept 1 or 2 arguments when split_ys is false"
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module EasyML::Data::Dataset::Splitters
  # Splits a dataframe chronologically on +date_col+: the most recent
  # +months_test+ months become the test set, the +months_valid+ months
  # before that the validation set, and everything earlier the training
  # set. Boundaries are computed relative to +today+ (normalized to UTC).
  class DateSplitter
    include GlueGun::DSL

    attribute :today, :datetime
    def today=(value)
      super(value.in_time_zone(UTC).to_datetime)
    end
    attribute :date_col, :string
    attribute :months_test, :integer, default: 2
    attribute :months_valid, :integer, default: 2

    def initialize(options)
      options[:today] ||= UTC.now
      super(options)
    end

    # Returns [train_df, valid_df, test_df]. Raises when +date_col+ is not
    # a Polars datetime column.
    def split(df)
      dtype = df[date_col].dtype
      unless dtype.is_a?(Polars::Datetime)
        raise "Date splitter cannot split on non-date col #{date_col}, dtype is #{dtype}"
      end

      valid_start, test_start = splits

      # Partition newest-first: carve off test, then split the remainder
      # at the validation boundary.
      before_test = df.filter(Polars.col(date_col) < test_start)
      [
        before_test.filter(Polars.col(date_col) < valid_start),
        before_test.filter(Polars.col(date_col) >= valid_start),
        df.filter(Polars.col(date_col) >= test_start)
      ]
    end

    def months(n)
      ActiveSupport::Duration.months(n)
    end

    # Returns [validation_date_start, test_date_start], both at the
    # beginning of their respective days.
    def splits
      test_start = today.advance(months: -months_test).beginning_of_day
      valid_start = today.advance(months: -(months_test + months_valid)).beginning_of_day
      [valid_start, test_start]
    end
  end
end
|