easy_ml 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (65) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +270 -0
  3. data/Rakefile +12 -0
  4. data/app/models/easy_ml/model.rb +59 -0
  5. data/app/models/easy_ml/models/xgboost.rb +9 -0
  6. data/app/models/easy_ml/models.rb +5 -0
  7. data/lib/easy_ml/core/model.rb +29 -0
  8. data/lib/easy_ml/core/model_core.rb +181 -0
  9. data/lib/easy_ml/core/model_evaluator.rb +137 -0
  10. data/lib/easy_ml/core/models/hyperparameters/base.rb +34 -0
  11. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +19 -0
  12. data/lib/easy_ml/core/models/hyperparameters.rb +8 -0
  13. data/lib/easy_ml/core/models/xgboost.rb +10 -0
  14. data/lib/easy_ml/core/models/xgboost_core.rb +220 -0
  15. data/lib/easy_ml/core/models.rb +10 -0
  16. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +63 -0
  17. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +50 -0
  18. data/lib/easy_ml/core/tuner/adapters.rb +10 -0
  19. data/lib/easy_ml/core/tuner.rb +105 -0
  20. data/lib/easy_ml/core/uploaders/model_uploader.rb +24 -0
  21. data/lib/easy_ml/core/uploaders.rb +7 -0
  22. data/lib/easy_ml/core.rb +9 -0
  23. data/lib/easy_ml/core_ext/pathname.rb +9 -0
  24. data/lib/easy_ml/core_ext.rb +5 -0
  25. data/lib/easy_ml/data/dataloader.rb +6 -0
  26. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +31 -0
  27. data/lib/easy_ml/data/dataset/data/sample_info.json +1 -0
  28. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +1 -0
  29. data/lib/easy_ml/data/dataset/splits/file_split.rb +140 -0
  30. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +49 -0
  31. data/lib/easy_ml/data/dataset/splits/split.rb +98 -0
  32. data/lib/easy_ml/data/dataset/splits.rb +11 -0
  33. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +43 -0
  34. data/lib/easy_ml/data/dataset/splitters.rb +9 -0
  35. data/lib/easy_ml/data/dataset.rb +430 -0
  36. data/lib/easy_ml/data/datasource/datasource_factory.rb +60 -0
  37. data/lib/easy_ml/data/datasource/file_datasource.rb +40 -0
  38. data/lib/easy_ml/data/datasource/merged_datasource.rb +64 -0
  39. data/lib/easy_ml/data/datasource/polars_datasource.rb +41 -0
  40. data/lib/easy_ml/data/datasource/s3_datasource.rb +89 -0
  41. data/lib/easy_ml/data/datasource.rb +33 -0
  42. data/lib/easy_ml/data/preprocessor/preprocessor.rb +205 -0
  43. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +403 -0
  44. data/lib/easy_ml/data/preprocessor/utils.rb +17 -0
  45. data/lib/easy_ml/data/preprocessor.rb +238 -0
  46. data/lib/easy_ml/data/utils.rb +50 -0
  47. data/lib/easy_ml/data.rb +8 -0
  48. data/lib/easy_ml/deployment.rb +5 -0
  49. data/lib/easy_ml/engine.rb +26 -0
  50. data/lib/easy_ml/initializers/inflections.rb +4 -0
  51. data/lib/easy_ml/logging.rb +38 -0
  52. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +42 -0
  53. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +23 -0
  54. data/lib/easy_ml/support/age.rb +27 -0
  55. data/lib/easy_ml/support/est.rb +1 -0
  56. data/lib/easy_ml/support/file_rotate.rb +23 -0
  57. data/lib/easy_ml/support/git_ignorable.rb +66 -0
  58. data/lib/easy_ml/support/synced_directory.rb +134 -0
  59. data/lib/easy_ml/support/utc.rb +1 -0
  60. data/lib/easy_ml/support.rb +10 -0
  61. data/lib/easy_ml/trainer.rb +92 -0
  62. data/lib/easy_ml/transforms.rb +29 -0
  63. data/lib/easy_ml/version.rb +5 -0
  64. data/lib/easy_ml.rb +23 -0
  65. metadata +353 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 7a959176791dac2307979438ad0f9a9319b4295fe25e489214df5f3b4c908466
4
+ data.tar.gz: c665ef3c19fda35197be653d9c14a34e9b2256d3c1b7387be4313086ba5d2c11
5
+ SHA512:
6
+ metadata.gz: bda834230add0f3de2b57d8df4abde48f79f64275946b372d5069b49c610ab09c8cc1117a89a2d9206100b19b30c8f30bbfbe2d8973ce5db551626bfb612949d
7
+ data.tar.gz: 3d80ffc4930323f3b8cff51ef2e3475f557589e96d54717c750a63507c3c3ab682650c7ccb4daf2bd1168cb35627c0101579e8bd26b310e86906e134c6626249
data/README.md ADDED
@@ -0,0 +1,270 @@
1
+ <img src="easy_ml.svg" alt="EasyML Logo" style="width: 310px; height: 300px;">
2
+
3
+ # EasyML
4
+
5
+ EasyML is a Ruby gem designed to simplify the process of building, deploying, and managing the lifecycle of machine learning models within a Ruby on Rails application. It is a plug-and-play, opinionated framework that currently supports XGBoost, with plans to expand support to a variety of models and infrastructures. EasyML aims to make deployment and lifecycle management straightforward and efficient.
6
+
7
+ ## Features
8
+
9
+ - **Plug-and-Play Architecture**: EasyML is designed to be easily extendable, allowing for the integration of various machine learning models and data sources.
10
+ - **Opinionated Framework**: Provides a structured approach to model management, ensuring best practices are followed.
11
+ - **Model Lifecycle On Rails**: Seamlessly integrates with Ruby on Rails, allowing simplified deployment of models to production.
12
+
13
+ ## Current and Planned Features
14
+
15
+ ### Models Available
16
+
17
+ | XGBoost | LightGBM | TensorFlow | PyTorch |
18
+ | ------- | -------- | ---------- | ------- |
19
+ | ✅ | ❌ | ❌ | ❌ |
20
+
21
+ ### Datasources Available
22
+
23
+ | S3 | File | Polars | SQL Databases | REST APIs |
24
+ | --- | ---- | ------ | ------------- | --------- |
25
+ | ✅ | ✅ | ✅ | ❌ | ❌ |
26
+
27
+ _Note: Features marked with ❌ are part of the roadmap and are not yet implemented._
28
+
29
+ ## Quick Start:
30
+
31
+ Building a Production pipeline is as easy as 1,2,3!
32
+
33
+ ### 1. Create Your Dataset
34
+
35
+ ```ruby
36
+ class MyDataset < EasyML::Data::Dataset
37
+ datasource :s3, s3_bucket: "my-bucket" # Every time the data changes, we'll pull new data
38
+ target "revenue" # What are we trying to predict?
39
+ splitter :date, date_column: "created_at" # How should we partition data into training, test, and validation datasets?
40
+ transforms DataPipeline # Class that manages data transformation, adding new columns, etc.
41
+ preprocessing_steps({
42
+ training: {
43
+ annual_revenue: { median: true, clip: { min: 0, max: 500_000 } }
44
+ }
45
+ }) # If annual revenue is missing, use the median value, after clipping the values into the approved list
46
+ end
47
+ ```
48
+
49
+ ### 2. Create a Model
50
+
51
+ ```ruby
52
+ class MyModel < EasyML::Models::XGBoost
53
+ dataset MyDataset
54
+ task :regression # Or classification
55
+ hyperparameters({
56
+ max_depth: 5,
57
+ learning_rate: 0.1,
58
+ objective: "reg:squarederror"
59
+ })
60
+ end
61
+ ```
62
+
63
+ ### 3. Create a Trainer
64
+
65
+ ```ruby
66
+ class MyTrainer < EasyML::Trainer
67
+ model MyModel
68
+ evaluator MyMetrics
69
+ end
70
+
71
+ class MyMetrics
72
+ def metric_we_make_money(y_pred, y_true)
73
+ return true if model_makes_money?
74
+ return false if model_lose_money?
75
+ end
76
+
77
+ def metric_sales_team_has_enough_leads(y_pred, y_true)
78
+ return false if sales_will_be_sitting_on_their_hands?
79
+ end
80
+ end
81
+ ```
82
+
83
+ Now you're ready to predict in production!
84
+
85
+ ```ruby
86
+ MyTrainer.train # Yay, we did it!
87
+ MyTrainer.deploy # Let the production hosts know it's live!
88
+ MyTrainer.predict(customer_data: "I am worth a lot of money")
89
+ # prediction: true!
90
+ ```
91
+
92
+ ## Data Management
93
+
94
+ EasyML provides a comprehensive data management system that handles all preprocessing tasks, including splitting data into train, test, and validation sets, and avoiding data leakage. The primary abstraction for data handling is the `Dataset` class, which ensures data is properly managed and prepared for machine learning tasks.
95
+
96
+ ### Preprocessing Features
97
+
98
+ EasyML offers a variety of preprocessing features to prepare your data for machine learning models. Here's a complete list of available preprocessing steps and examples of when to use them:
99
+
100
+ - **Mean Imputation**: Replace missing values with the mean of the feature. Use this when you want to maintain the average value of the data.
101
+
102
+ ```ruby
103
+ annual_revenue: {
104
+ mean: true
105
+ }
106
+ ```
107
+
108
+ - **Median Imputation**: Replace missing values with the median of the feature. This is useful when you want to maintain the central tendency of the data without being affected by outliers.
109
+
110
+ ```ruby
111
+ annual_revenue: {
112
+ median: true
113
+ }
114
+ ```
115
+
116
+ - **Forward Fill (ffill)**: Fill missing values with the last observed value. Use this for time series data where the last known value is a reasonable estimate for missing values.
117
+
118
+ ```ruby
119
+ created_date: {
120
+ ffill: true
121
+ }
122
+ ```
123
+
124
+ - **Most Frequent Imputation**: Replace missing values with the most frequently occurring value. This is useful for categorical data where the mode is a reasonable estimate for missing values.
125
+
126
+ ```ruby
127
+ loan_purpose: {
128
+ most_frequent: true
129
+ }
130
+ ```
131
+
132
+ - **Constant Imputation**: Replace missing values with a constant value. Use this when you have a specific value that should be used for missing data.
133
+
134
+ ```ruby
135
+ loan_purpose: {
136
+ constant: { fill_value: 'unknown' }
137
+ }
138
+ ```
139
+
140
+ - **Today Imputation**: Fill missing date values with the current date. Use this for features that should default to the current date.
141
+
142
+ ```ruby
143
+ created_date: {
144
+ today: true
145
+ }
146
+ ```
147
+
148
+ - **One-Hot Encoding**: Convert categorical variables into a set of binary variables. Use this when you have categorical data that needs to be converted into a numerical format for model training.
149
+
150
+ ```ruby
151
+ loan_purpose: {
152
+ one_hot: true
153
+ }
154
+ ```
155
+
156
+ - **Label Encoding**: Convert categorical variables into integer labels. Use this when you have categorical data that can be ordinally encoded.
157
+
158
+ ```ruby
159
+ loan_purpose: {
160
+ categorical: {
161
+ encode_labels: true
162
+ }
163
+ }
164
+ ```
165
+
166
+ ### Other Dataset Features
167
+
168
+ - **Data Splitting**: Automatically split data into train, test, and validation sets using various strategies, such as date-based splitting.
169
+ - **Data Synchronization**: Ensure data is synced from its source, such as S3 or local files.
170
+ - **Batch Processing**: Process data in batches to handle large datasets efficiently.
171
+ - **Null Handling**: Alert and handle null values in datasets to ensure data quality.
172
+
173
+ ## Installation
174
+
175
+ Install necessary Python dependencies
176
+
177
+ 1. **Install Python dependencies (don't worry, all code is in Ruby, we just call through to Python)**
178
+
179
+ ```bash
180
+ pip install wandb
181
+ pip install optuna
182
+ ```
183
+
184
+ 1. **Install the gem**:
185
+
186
+ ```bash
187
+ gem install easy_ml
188
+ ```
189
+
190
+ 2. **Run the generator to store model versions**:
191
+
192
+ ```bash
193
+ rails generate easy_ml:migration
194
+ rails db:migrate
195
+ ```
196
+
197
+ 3. **Configure CarrierWave for S3 storage**:
198
+
199
+ Ensure you have CarrierWave configured to use AWS S3. If not, add the following configuration:
200
+
201
+ ```ruby
202
+ # config/initializers/carrierwave.rb
203
+ CarrierWave.configure do |config|
204
+ config.fog_provider = 'fog/aws'
205
+ config.fog_credentials = {
206
+ provider: 'AWS',
207
+ aws_access_key_id: ENV['AWS_ACCESS_KEY_ID'],
208
+ aws_secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'],
209
+ region: ENV['AWS_REGION'],
210
+ }
211
+ config.fog_directory = ENV['AWS_S3_BUCKET']
212
+ config.fog_public = false
213
+ config.storage = :fog
214
+ end
215
+ ```
216
+
217
+ ## Usage
218
+
219
+ To use EasyML in your Rails application, follow these steps:
220
+
221
+ 1. **Define your preprocessing steps** in a configuration hash. For example:
222
+
223
+ ```ruby
224
+ preprocessing_steps = {
225
+ training: {
226
+ annual_revenue: {
227
+ median: true,
228
+ clip: { min: 0, max: 1_000_000 }
229
+ },
230
+ loan_purpose: {
231
+ categorical: {
232
+ categorical_min: 2,
233
+ one_hot: true
234
+ }
235
+ }
236
+ }
237
+ }
238
+ ```
239
+
240
+ 2. **Create a dataset** using the `EasyML::Data::Dataset` class, providing necessary configurations such as data source, target, and preprocessing steps.
241
+
242
+ 3. **Train a model** using the `EasyML::Models` module, specifying the model class and configuration.
243
+
244
+ 4. **Deploy the model** by marking it as live and storing it in the configured S3 bucket.
245
+
246
+ ## Development
247
+
248
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
249
+
250
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
251
+
252
+ ## Contributing
253
+
254
+ Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/easy_ml. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/[USERNAME]/easy_ml/blob/main/CODE_OF_CONDUCT.md).
255
+
256
+ ## License
257
+
258
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
259
+
260
+ ## Code of Conduct
261
+
262
+ Everyone interacting in the EasyML project's codebases, issue trackers, chat rooms, and mailing lists is expected to follow the [code of conduct](https://github.com/[USERNAME]/easy_ml/blob/main/CODE_OF_CONDUCT.md).
263
+
264
+ ## Expected Future Enhancements
265
+
266
+ - **Support for Additional Models**: Integration with LightGBM, TensorFlow, and PyTorch.
267
+ - **Expanded Data Source Support**: Ability to pull data from SQL databases and REST APIs.
268
+ - **Enhanced Deployment Options**: More flexible deployment strategies and integration with CI/CD pipelines.
269
+ - **Advanced Monitoring and Logging**: Improved tools for monitoring model performance and logging.
270
+ - **User Interface Improvements**: Enhanced UI components for managing models and datasets.
data/Rakefile ADDED
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rspec/core/rake_task"
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ require "rubocop/rake_task"
9
+
10
+ RuboCop::RakeTask.new
11
+
12
+ task default: %i[spec rubocop]
@@ -0,0 +1,59 @@
1
+ require_relative "../../../lib/easy_ml/core/model"
2
+ module EasyML
3
+ class Model < ActiveRecord::Base
4
+ include EasyML::Core::ModelCore
5
+
6
+ self.table_name = "easy_ml_models"
7
+
8
+ scope :live, -> { where(is_live: true) }
9
+ attribute :root_dir, :string
10
+ after_initialize :apply_defaults
11
+
12
+ validate :only_one_model_is_live?
13
+ def only_one_model_is_live?
14
+ return if @marking_live
15
+
16
+ if previous_versions.live.count > 1
17
+ raise "Multiple previous versions of #{name} are live! This should never happen. Update previous versions to is_live=false before proceeding"
18
+ end
19
+
20
+ return unless previous_versions.live.any? && is_live
21
+
22
+ errors.add(:is_live,
23
+ "cannot mark model live when previous version is live. Explicitly use the mark_live method to mark this as the live version")
24
+ end
25
+
26
+ def mark_live
27
+ transaction do
28
+ self.class.where(name: name).where.not(id: id).update_all(is_live: false)
29
+ self.class.where(id: id).update_all(is_live: true)
30
+ end
31
+ end
32
+
33
+ def previous_versions
34
+ EasyML::Model.where(name: name).order(id: :desc)
35
+ end
36
+
37
+ private
38
+
39
+ def files_to_keep
40
+ live_models = self.class.live
41
+
42
+ recent_copies = live_models.flat_map do |live|
43
+ # Fetch all models with the same name
44
+ self.class.where(name: live.name).where(is_live: false).order(created_at: :desc).limit(live.name == name ? 4 : 5)
45
+ end
46
+
47
+ recent_versions = self.class
48
+ .where.not(
49
+ "EXISTS (SELECT 1 FROM easy_ml_models e2 WHERE e2.name = easy_ml_models.name AND e2.is_live = true)"
50
+ )
51
+ .where("created_at >= ?", 2.days.ago)
52
+ .order(created_at: :desc)
53
+ .group_by(&:name)
54
+ .flat_map { |_, models| models.take(5) }
55
+
56
+ ([self] + recent_versions + recent_copies + live_models).compact.map(&:file).map(&:path).uniq
57
+ end
58
+ end
59
+ end
@@ -0,0 +1,9 @@
1
+ require_relative "../model"
2
+
3
+ module EasyML
4
+ module Models
5
+ class XGBoost < EasyML::Model
6
+ include EasyML::Core::Models::XGBoostCore
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,5 @@
1
+ module EasyML
2
+ module Models
3
+ require_relative "models/xgboost"
4
+ end
5
+ end
@@ -0,0 +1,29 @@
1
+ require "carrierwave"
2
+ require_relative "model_core"
3
+ require_relative "uploaders/model_uploader"
4
+
5
+ module EasyML
6
+ module Core
7
+ class Model
8
+ include GlueGun::DSL
9
+
10
+ attribute :name, :string
11
+ attribute :version, :string
12
+ attribute :task, :string, default: "regression"
13
+ attribute :metrics, :array
14
+ attribute :ml_model, :string
15
+ attribute :file, :string
16
+ attribute :root_dir, :string
17
+ attribute :objective
18
+ attribute :evaluator
19
+ attribute :evaluator_metric
20
+
21
+ include EasyML::Core::ModelCore
22
+
23
+ def initialize(options = {})
24
+ super
25
+ apply_defaults
26
+ end
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,181 @@
1
+ require "carrierwave"
2
+ require_relative "uploaders/model_uploader"
3
+
4
+ module EasyML
5
+ module Core
6
+ module ModelCore
7
+ attr_accessor :dataset
8
+
9
+ def self.included(base)
10
+ base.send(:include, GlueGun::DSL)
11
+ base.send(:extend, CarrierWave::Mount)
12
+ base.send(:mount_uploader, :file, EasyML::Core::Uploaders::ModelUploader)
13
+
14
+ base.class_eval do
15
+ validates :task, inclusion: { in: %w[regression classification] }
16
+ validates :task, presence: true
17
+ validate :dataset_is_a_dataset?
18
+ validate :validate_any_metrics?
19
+ validate :validate_metrics_for_task
20
+ before_validation :save_model_file, if: -> { fit? }
21
+ end
22
+ end
23
+
24
+ def dataset_is_a_dataset?
25
+ return if dataset.nil?
26
+ return if dataset.class.ancestors.include?(EasyML::Data::Dataset)
27
+
28
+ errors.add(:dataset, "Must be a subclass of EasyML::Dataset")
29
+ end
30
+
31
+ def validate_any_metrics?
32
+ return if metrics.any?
33
+
34
+ errors.add(:metrics, "Must include at least one metric. Allowed metrics are #{allowed_metrics.join(", ")}")
35
+ end
36
+
37
+ def validate_metrics_for_task
38
+ nonsensical_metrics = metrics.select do |metric|
39
+ allowed_metrics.exclude?(metric)
40
+ end
41
+
42
+ return unless nonsensical_metrics.any?
43
+
44
+ errors.add(:metrics,
45
+ "cannot use metrics: #{nonsensical_metrics.join(", ")} for task #{task}. Allowed metrics are: #{allowed_metrics.join(", ")}")
46
+ end
47
+
48
+ def fit(x_train: nil, y_train: nil, x_valid: nil, y_valid: nil)
49
+ if x_train.nil?
50
+ dataset.refresh!
51
+ train_in_batches
52
+ else
53
+ train(x_train, y_train, x_valid, y_valid)
54
+ end
55
+ @is_fit = true
56
+ end
57
+
58
+ def decode_labels(ys, col: nil)
59
+ dataset.decode_labels(ys, col: col)
60
+ end
61
+
62
+ def evaluate(y_pred: nil, y_true: nil, x_true: nil, evaluator: nil)
63
+ evaluator ||= self.evaluator
64
+ EasyML::Core::ModelEvaluator.evaluate(model: self, y_pred: y_pred, y_true: y_true, x_true: x_true,
65
+ evaluator: evaluator)
66
+ end
67
+
68
+ def predict(xs)
69
+ raise NotImplementedError, "Subclasses must implement predict method"
70
+ end
71
+
72
+ def load
73
+ raise NotImplementedError, "Subclasses must implement load method"
74
+ end
75
+
76
+ def _save_model_file
77
+ raise NotImplementedError, "Subclasses must implement _save_model_file method"
78
+ end
79
+
80
+ def save
81
+ super if defined?(super) && self.class.superclass.method_defined?(:save)
82
+ save_model_file
83
+ end
84
+
85
+ def save_model_file
86
+ raise "No trained model! Need to train model before saving (call model.fit)" unless fit?
87
+
88
+ path = File.join(model_dir, "#{version}.json")
89
+ ensure_directory_exists(File.dirname(path))
90
+
91
+ _save_model_file(path)
92
+
93
+ File.open(path) do |f|
94
+ self.file = f
95
+ end
96
+ file.store!
97
+
98
+ cleanup
99
+ end
100
+
101
+ def get_params
102
+ @hyperparameters.to_h
103
+ end
104
+
105
+ def allowed_metrics
106
+ return [] unless task.present?
107
+
108
+ case task.to_sym
109
+ when :regression
110
+ %w[mean_absolute_error mean_squared_error root_mean_squared_error r2_score]
111
+ when :classification
112
+ %w[accuracy_score precision_score recall_score f1_score auc roc_auc]
113
+ else
114
+ []
115
+ end
116
+ end
117
+
118
+ def cleanup!
119
+ [file_dir, model_dir].each do |dir|
120
+ EasyML::FileRotate.new(dir, []).cleanup(extension_allowlist)
121
+ end
122
+ end
123
+
124
+ def cleanup
125
+ [file_dir, model_dir].each do |dir|
126
+ EasyML::FileRotate.new(dir, files_to_keep).cleanup(extension_allowlist)
127
+ end
128
+ end
129
+
130
+ def fit?
131
+ @is_fit == true
132
+ end
133
+
134
+ private
135
+
136
+ def file_dir
137
+ return unless file.path.present?
138
+
139
+ File.dirname(file.path).split("/")[0..-2].join("/")
140
+ end
141
+
142
+ def extension_allowlist
143
+ EasyML::Core::Uploaders::ModelUploader.new.extension_allowlist
144
+ end
145
+
146
+ def _save_model_file(path = nil)
147
+ raise NotImplementedError, "Subclasses must implement _save_model_file method"
148
+ end
149
+
150
+ def ensure_directory_exists(dir)
151
+ FileUtils.mkdir_p(dir) unless File.directory?(dir)
152
+ end
153
+
154
+ def apply_defaults
155
+ self.version ||= generate_version_string
156
+ self.metrics ||= allowed_metrics
157
+ self.ml_model ||= get_ml_model
158
+ end
159
+
160
+ def get_ml_model
161
+ self.class.name.split("::").last.underscore
162
+ end
163
+
164
+ def generate_version_string
165
+ timestamp = Time.now.utc.strftime("%Y%m%d%H%M%S")
166
+ model_name = self.class.name.split("::").last.underscore
167
+ "#{model_name}_#{timestamp}"
168
+ end
169
+
170
+ def model_dir
171
+ File.join(root_dir, "easy_ml_models", name.present? ? name.split.join.underscore : "")
172
+ end
173
+
174
+ def files_to_keep
175
+ Dir.glob(File.join(file_dir, "*")).select { |f| File.file?(f) }.sort_by do |filename|
176
+ Time.parse(filename.split("/").last.gsub(/\D/, ""))
177
+ end.reverse.take(5)
178
+ end
179
+ end
180
+ end
181
+ end