easy_ml 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. checksums.yaml +7 -0
  2. data/README.md +270 -0
  3. data/Rakefile +12 -0
  4. data/app/models/easy_ml/model.rb +59 -0
  5. data/app/models/easy_ml/models/xgboost.rb +9 -0
  6. data/app/models/easy_ml/models.rb +5 -0
  7. data/lib/easy_ml/core/model.rb +29 -0
  8. data/lib/easy_ml/core/model_core.rb +181 -0
  9. data/lib/easy_ml/core/model_evaluator.rb +137 -0
  10. data/lib/easy_ml/core/models/hyperparameters/base.rb +34 -0
  11. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +19 -0
  12. data/lib/easy_ml/core/models/hyperparameters.rb +8 -0
  13. data/lib/easy_ml/core/models/xgboost.rb +10 -0
  14. data/lib/easy_ml/core/models/xgboost_core.rb +220 -0
  15. data/lib/easy_ml/core/models.rb +10 -0
  16. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +63 -0
  17. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +50 -0
  18. data/lib/easy_ml/core/tuner/adapters.rb +10 -0
  19. data/lib/easy_ml/core/tuner.rb +105 -0
  20. data/lib/easy_ml/core/uploaders/model_uploader.rb +24 -0
  21. data/lib/easy_ml/core/uploaders.rb +7 -0
  22. data/lib/easy_ml/core.rb +9 -0
  23. data/lib/easy_ml/core_ext/pathname.rb +9 -0
  24. data/lib/easy_ml/core_ext.rb +5 -0
  25. data/lib/easy_ml/data/dataloader.rb +6 -0
  26. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +31 -0
  27. data/lib/easy_ml/data/dataset/data/sample_info.json +1 -0
  28. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +1 -0
  29. data/lib/easy_ml/data/dataset/splits/file_split.rb +140 -0
  30. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +49 -0
  31. data/lib/easy_ml/data/dataset/splits/split.rb +98 -0
  32. data/lib/easy_ml/data/dataset/splits.rb +11 -0
  33. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +43 -0
  34. data/lib/easy_ml/data/dataset/splitters.rb +9 -0
  35. data/lib/easy_ml/data/dataset.rb +430 -0
  36. data/lib/easy_ml/data/datasource/datasource_factory.rb +60 -0
  37. data/lib/easy_ml/data/datasource/file_datasource.rb +40 -0
  38. data/lib/easy_ml/data/datasource/merged_datasource.rb +64 -0
  39. data/lib/easy_ml/data/datasource/polars_datasource.rb +41 -0
  40. data/lib/easy_ml/data/datasource/s3_datasource.rb +89 -0
  41. data/lib/easy_ml/data/datasource.rb +33 -0
  42. data/lib/easy_ml/data/preprocessor/preprocessor.rb +205 -0
  43. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +403 -0
  44. data/lib/easy_ml/data/preprocessor/utils.rb +17 -0
  45. data/lib/easy_ml/data/preprocessor.rb +238 -0
  46. data/lib/easy_ml/data/utils.rb +50 -0
  47. data/lib/easy_ml/data.rb +8 -0
  48. data/lib/easy_ml/deployment.rb +5 -0
  49. data/lib/easy_ml/engine.rb +26 -0
  50. data/lib/easy_ml/initializers/inflections.rb +4 -0
  51. data/lib/easy_ml/logging.rb +38 -0
  52. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +42 -0
  53. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +23 -0
  54. data/lib/easy_ml/support/age.rb +27 -0
  55. data/lib/easy_ml/support/est.rb +1 -0
  56. data/lib/easy_ml/support/file_rotate.rb +23 -0
  57. data/lib/easy_ml/support/git_ignorable.rb +66 -0
  58. data/lib/easy_ml/support/synced_directory.rb +134 -0
  59. data/lib/easy_ml/support/utc.rb +1 -0
  60. data/lib/easy_ml/support.rb +10 -0
  61. data/lib/easy_ml/trainer.rb +92 -0
  62. data/lib/easy_ml/transforms.rb +29 -0
  63. data/lib/easy_ml/version.rb +5 -0
  64. data/lib/easy_ml.rb +23 -0
  65. metadata +353 -0
@@ -0,0 +1,7 @@
# Namespace for EasyML's file-upload adapters; loading this file pulls in
# every uploader implementation that lives under core/uploaders/.
module EasyML
  module Core
    module Uploaders
    end
  end
end

require_relative "uploaders/model_uploader"
@@ -0,0 +1,9 @@
# Entry point for the EasyML::Core namespace. Requiring this file loads every
# core component (uploaders, model classes, evaluator, tuner) in order.
module EasyML
  module Core
  end
end

%w[
  core/uploaders
  core/model
  core/models
  core/model_evaluator
  core/tuner
].each { |component| require_relative component }
@@ -0,0 +1,9 @@
require "pathname"

# Core extension: Pathname#append ensures the path ends in a given folder
# name, joining it on only when it is not already the final component.
class Pathname
  # @param folder [String] directory name to guarantee as the last component
  # @return [Pathname] the cleaned path, ending in +folder+
  def append(folder)
    cleaned = cleanpath
    return cleaned if basename.to_s == folder

    cleaned.join(folder)
  end
end
@@ -0,0 +1,5 @@
# Namespace marker for EasyML's extensions to Ruby core classes.
# Requiring this file installs the Pathname extension.
module EasyML
  module CoreExt
  end
end

require_relative "core_ext/pathname"
@@ -0,0 +1,6 @@
# Fix: this file (easy_ml/data/dataloader.rb) declared `module ML`, placing
# Dataloader in an unrelated `ML::Data` namespace. Every other file in this
# gem lives under `EasyML` (e.g. EasyML::Data::Dataset), so the class is
# moved to the conventional namespace.
module EasyML
  module Data
    # Placeholder for a batching data loader; no behavior is defined yet.
    class Dataloader
    end
  end
end
@@ -0,0 +1,31 @@
1
+ {
2
+ "annual_revenue": {
3
+ "median": {
4
+ "value": 3000.0,
5
+ "original_dtype": {
6
+ "__type__": "polars_dtype",
7
+ "value": "Polars::Int64"
8
+ }
9
+ }
10
+ },
11
+ "loan_purpose": {
12
+ "categorical": {
13
+ "value": {
14
+ "payroll": 4,
15
+ "expansion": 1
16
+ },
17
+ "label_encoder": {
18
+ "expansion": 0,
19
+ "payroll": 1
20
+ },
21
+ "label_decoder": {
22
+ "0": "expansion",
23
+ "1": "payroll"
24
+ },
25
+ "original_dtype": {
26
+ "__type__": "polars_dtype",
27
+ "value": "Polars::String"
28
+ }
29
+ }
30
+ }
31
+ }
@@ -0,0 +1 @@
1
+ {"previous_sample":1.0}
@@ -0,0 +1 @@
1
+ {"previous_sample":1.0}
@@ -0,0 +1,140 @@
require_relative "split"

module EasyML
  module Data
    class Dataset
      module Splits
        # Disk-backed split storage. Each segment (:train/:test/:valid) is a
        # directory of numbered CSV files under `dir`; writes roll over to a
        # new file once a file reaches `max_rows_per_file` rows.
        class FileSplit < Split
          include GlueGun::DSL
          include EasyML::Data::Utils

          attribute :dir, :string                           # root directory for all segments
          attribute :polars_args, :hash, default: {}        # extra kwargs forwarded to Polars.read_csv
          attribute :max_rows_per_file, :integer, default: 1_000_000
          attribute :batch_size, :integer, default: 10_000  # used only by read_csv_batched
          attribute :sample, :float, default: 1.0           # fraction of rows kept on read (see Split#sample_data)
          attribute :verbose, :boolean, default: false      # show a progress bar during block reads

          def initialize(options)
            super
            FileUtils.mkdir_p(dir)
          end

          # Appends `df` to the segment's current CSV file, splitting the
          # frame and rolling to a new numbered file whenever the
          # max_rows_per_file cap would be exceeded.
          # NOTE(review): counts existing rows by reading the whole current
          # file with Polars — O(file size) on every save call.
          def save(segment, df)
            segment_dir = File.join(dir, segment.to_s)
            FileUtils.mkdir_p(segment_dir)

            current_file = current_file_for_segment(segment)
            current_row_count = current_file && File.exist?(current_file) ? df(current_file).shape[0] : 0
            remaining_rows = max_rows_per_file - current_row_count

            while df.shape[0] > 0
              if df.shape[0] <= remaining_rows
                # Everything fits in the current file.
                append_to_csv(df, current_file)
                break
              else
                # Fill the current file, then continue with a fresh one.
                df_to_append = df.slice(0, remaining_rows)
                df = df.slice(remaining_rows, df.shape[0] - remaining_rows)
                append_to_csv(df_to_append, current_file)
                current_file = new_file_path_for_segment(segment)
                remaining_rows = max_rows_per_file
              end
            end
          end

          # Reads a segment back. With a block, streams one file at a time
          # (optionally split into xs/ys) and folds results via the Split
          # helpers; without a block, concatenates all files into one frame.
          # Returns nil (or [nil, nil] when split_ys) for an empty segment.
          def read(segment, split_ys: false, target: nil, drop_cols: [], &block)
            files = files_for_segment(segment)

            if block_given?
              result = nil
              # Pre-scan to size the progress bar; reads every file once
              # up-front just for row counts when verbose.
              total_rows = files.sum { |file| df(file).shape[0] }
              progress_bar = create_progress_bar(segment, total_rows) if verbose

              files.each do |file|
                df = self.df(file)
                df = sample_data(df) if sample < 1.0
                # Only drop columns actually present in this file.
                drop_cols &= df.columns
                df = df.drop(drop_cols) unless drop_cols.empty?

                if split_ys
                  xs, ys = split_features_targets(df, true, target)
                  result = process_block_with_split_ys(block, result, xs, ys)
                else
                  result = process_block_without_split_ys(block, result, df)
                end

                progress_bar.progress += df.shape[0] if verbose
              end
              progress_bar.finish if verbose
              result
            elsif files.empty?
              return nil, nil if split_ys

              nil

            else
              combined_df = combine_dataframes(files)
              combined_df = sample_data(combined_df) if sample < 1.0
              drop_cols &= combined_df.columns
              combined_df = combined_df.drop(drop_cols) unless drop_cols.empty?
              split_features_targets(combined_df, split_ys, target)
            end
          end

          # Deletes every stored segment and recreates the (empty) root dir.
          def cleanup
            FileUtils.rm_rf(dir)
            FileUtils.mkdir_p(dir)
          end

          # Most recent mtime across all stored CSVs, or nil when empty —
          # i.e. when this split was last written.
          def split_at
            return nil if output_files.empty?

            output_files.map { |file| File.mtime(file) }.max
          end

          private

          # NOTE(review): defined but not referenced anywhere in this class —
          # presumably for callers needing batched reads; confirm before removing.
          def read_csv_batched(path)
            Polars.read_csv_batched(path, batch_size: batch_size, **polars_args)
          end

          # Loads one CSV into a Polars dataframe (shadows block-local `df`
          # vars above — callers inside blocks use `self.df`).
          def df(path)
            Polars.read_csv(path, **polars_args)
          end

          def output_files
            Dir.glob("#{dir}/**/*.csv")
          end

          # Sorted so "#{segment}_%04d" numbering makes .last the newest file.
          def files_for_segment(segment)
            segment_dir = File.join(dir, segment.to_s)
            Dir.glob(File.join(segment_dir, "**/*.csv")).sort
          end

          # Latest file for the segment, or a fresh path when none exists or
          # the latest file is already at capacity.
          def current_file_for_segment(segment)
            current_file = files_for_segment(segment).last
            return new_file_path_for_segment(segment) if current_file.nil?

            row_count = df(current_file).shape[0]
            if row_count >= max_rows_per_file
              new_file_path_for_segment(segment)
            else
              current_file
            end
          end

          # Next numbered path, e.g. train_0003.csv.
          # NOTE(review): numbering is derived from the current file count, so
          # deleting files out-of-band could produce a colliding path.
          def new_file_path_for_segment(segment)
            segment_dir = File.join(dir, segment.to_s)
            file_number = Dir.glob(File.join(segment_dir, "*.csv")).count
            File.join(segment_dir, "#{segment}_%04d.csv" % file_number)
          end

          def combine_dataframes(files)
            dfs = files.map { |file| df(file) }
            Polars.concat(dfs)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,49 @@
module EasyML
  module Data
    class Dataset
      module Splits
        # Memory-backed split storage: keeps each segment's dataframe in a
        # plain Hash keyed by segment name. Suited to small datasets/tests.
        class InMemorySplit < Split
          include GlueGun::DSL

          attribute :sample, :float, default: 1.0

          def initialize(options)
            super
            @segments = {}
          end

          # Stores the dataframe for +segment+, replacing any prior value.
          def save(segment, df)
            @segments[segment] = df
          end

          # Returns the stored dataframe — optionally sampled, with columns
          # dropped, and/or split into features/targets — or yields it to the
          # supplied block. Returns nil for an unknown segment.
          def read(segment, split_ys: false, target: nil, drop_cols: [], &block)
            frame = @segments[segment]
            return nil unless frame

            frame = sample_data(frame) if sample < 1.0
            droppable = drop_cols & frame.columns
            frame = frame.drop(droppable) unless droppable.empty?

            return split_features_targets(frame, split_ys, target) unless block

            if split_ys
              xs, ys = split_features_targets(frame, true, target)
              process_block_with_split_ys(block, nil, xs, ys)
            else
              process_block_without_split_ys(block, nil, frame)
            end
          end

          # Discards every stored segment.
          def cleanup
            @segments.clear
          end

          # Time.now when any data is present, nil otherwise (no per-segment
          # timestamps are tracked in memory).
          def split_at
            @segments.empty? ? nil : Time.now
          end
        end
      end
    end
  end
end
@@ -0,0 +1,98 @@
module EasyML
  module Data
    class Dataset
      module Splits
        # Abstract base for split storage backends (FileSplit, InMemorySplit).
        # Defines the read/save/cleanup contract plus shared helpers for
        # sampling, feature/target splitting, and block dispatch.
        class Split
          include GlueGun::DSL
          include EasyML::Data::Utils

          attribute :polars_args, :hash, default: {}       # forwarded to Polars readers by subclasses
          attribute :max_rows_per_file, :integer, default: 1_000_000
          attribute :batch_size, :integer, default: 10_000
          attribute :sample, :float, default: 1.0          # fraction of rows kept by sample_data
          attribute :verbose, :boolean, default: false

          # Persist `df` as the given segment (:train/:test/:valid).
          def save(segment, df)
            raise NotImplementedError, "Subclasses must implement #save"
          end

          # Load a segment back, optionally sampling, dropping columns, and
          # splitting into [xs, ys]; may yield to a block (see subclasses).
          def read(segment, split_ys: false, target: nil, drop_cols: [], &block)
            raise NotImplementedError, "Subclasses must implement #read"
          end

          # Convenience wrappers over #read for each conventional segment.
          def train(&block)
            read(:train, &block)
          end

          def test(&block)
            read(:test, &block)
          end

          def valid(&block)
            read(:valid, &block)
          end

          # Remove all stored data for this split.
          def cleanup
            raise NotImplementedError, "Subclasses must implement #cleanup"
          end

          # Timestamp of the last write, or nil when empty.
          def split_at
            raise NotImplementedError, "Subclasses must implement #split_at"
          end

          protected

          # Returns [xs, ys] (features, target column) when split_ys, else the
          # frame unchanged. Requires `target` when splitting.
          def split_features_targets(df, split_ys, target)
            raise ArgumentError, "Target column must be specified when split_ys is true" if split_ys && target.nil?

            if split_ys
              xs = df.drop(target)
              ys = df.select(target)
              [xs, ys]
            else
              df
            end
          end

          # Deterministic row sample (fixed seed 42) of `sample` fraction;
          # no-op when sample >= 1.0.
          def sample_data(df)
            return df if sample >= 1.0

            df.sample(n: (df.shape[0] * sample).ceil, seed: 42)
          end

          def create_progress_bar(segment, total_rows)
            ProgressBar.create(
              title: "Reading #{segment}",
              total: total_rows,
              format: "%t: |%B| %p%% %e"
            )
          end

          # Dispatches an xs/ys pair to the caller's block.
          # arity 3 => reduce-style: block receives (accumulator, xs, ys).
          #   NOTE(review): when the accumulator is nil (first chunk) the
          #   block is NOT called — [xs, ys] seeds the accumulator instead.
          #   Looks intentional (fold seeding) but verify against callers.
          # arity 2 => side-effect style: block receives (xs, ys); the
          #   accumulator is passed through unchanged.
          def process_block_with_split_ys(block, result, xs, ys)
            case block.arity
            when 3
              result.nil? ? [xs, ys] : block.call(result, xs, ys)
            when 2
              block.call(xs, ys)
              result
            else
              raise ArgumentError, "Block must accept 2 or 3 arguments when split_ys is true"
            end
          end

          # Same dispatch as above for a whole dataframe: arity 2 is
          # reduce-style (accumulator, df) with nil seeded by df; arity 1 is
          # side-effect style.
          def process_block_without_split_ys(block, result, df)
            case block.arity
            when 2
              result.nil? ? df : block.call(result, df)
            when 1
              block.call(df)
              result
            else
              raise ArgumentError, "Block must accept 1 or 2 arguments when split_ys is false"
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,11 @@
# Namespace for split storage backends; loading this file pulls in the
# abstract base class followed by both concrete implementations.
module EasyML
  module Data
    class Dataset
      module Splits
      end
    end
  end
end

require_relative "splits/split"
require_relative "splits/file_split"
require_relative "splits/in_memory_split"
@@ -0,0 +1,43 @@
module EasyML::Data::Dataset::Splitters
  # Splits a dataframe into train/valid/test partitions by a date column:
  # the most recent `months_test` months become test, the `months_valid`
  # months before that become valid, everything earlier is train.
  class DateSplitter
    include GlueGun::DSL

    attribute :today, :datetime
    # Normalize any incoming time to UTC before storing.
    # NOTE(review): `UTC` is not defined in this file — presumably the
    # EasyML::Support::UTC time-zone constant; confirm it resolves here.
    def today=(value)
      super(value.in_time_zone(UTC).to_datetime)
    end
    attribute :date_col, :string
    attribute :months_test, :integer, default: 2
    attribute :months_valid, :integer, default: 2

    def initialize(options)
      # Default the reference date to the current UTC time.
      options[:today] ||= UTC.now
      super(options)
    end

    # @param df [Polars::DataFrame] must have a Datetime-typed `date_col`
    # @return [Array(DataFrame, DataFrame, DataFrame)] [train, valid, test]
    # @raise [RuntimeError] when `date_col` is not a Polars::Datetime column
    def split(df)
      unless df[date_col].dtype.is_a?(Polars::Datetime)
        raise "Date splitter cannot split on non-date col #{date_col}, dtype is #{df[date_col].dtype}"
      end

      validation_date_start, test_date_start = splits

      # Boundaries are half-open: rows exactly at a start date fall into the
      # later (more recent) partition.
      test_df = df.filter(Polars.col(date_col) >= test_date_start)
      remaining_df = df.filter(Polars.col(date_col) < test_date_start)
      valid_df = remaining_df.filter(Polars.col(date_col) >= validation_date_start)
      train_df = remaining_df.filter(Polars.col(date_col) < validation_date_start)

      [train_df, valid_df, test_df]
    end

    # NOTE(review): not used by #split/#splits (they use Time#advance) —
    # verify callers before removing.
    def months(n)
      ActiveSupport::Duration.months(n)
    end

    # Computes the two cutoff datetimes (each snapped to beginning_of_day).
    # @return [Array(DateTime, DateTime)] [validation_date_start, test_date_start]
    def splits
      test_date_start = today.advance(months: -months_test).beginning_of_day
      validation_date_start = today.advance(months: -(months_test + months_valid)).beginning_of_day
      [validation_date_start, test_date_start]
    end
  end
end
@@ -0,0 +1,9 @@
# Namespace for dataset splitters; loading this file pulls in every
# splitter implementation (currently just DateSplitter).
module EasyML
  module Data
    class Dataset
      module Splitters
      end
    end
  end
end

require_relative "splitters/date_splitter"