easy_ml 0.2.0.pre.rc9 → 0.2.0.pre.rc11

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 27e223058d1e119b3f70e39fee2425e50e3d94780fbf9a54ea496e0d713f2492
- data.tar.gz: 42b8ffa08fe474412946628bfb7de71d83bccc4195a779db4fafd87002799c42
+ metadata.gz: 359318d80a26cadd767fdae12e3dd4ac790c2ff757994608de368873462af4c7
+ data.tar.gz: 000fdfd652cfcd11a99a80b095bd3ab9653e013d2627fbb0b747bae57addec2b
  SHA512:
- metadata.gz: e448d3ed98f5a3b96893115865d37a3890684de543e3a7746b11c43c184d568d775c3a38c0b06fae34b00aeaa1eb76c012f3becc91e7637aeacf85ec902ec64f
- data.tar.gz: 83f5d575fe8ff06d3d57ac287faacf820905313ac16d6e36bf8d54f672363a62043c078ca9121cd8e7d10b2eab08125c53e005474ca17f543e429a6feb96f79e
+ metadata.gz: 07b960ad72f84f90964b71818f7c14572a5c6620995ee6e8000427b86ddb8976d0f1a7753c67b7d7eec5a753d8d58aad50a225dca775db3f6481f3cc2399c1f2
+ data.tar.gz: fdf23f56dc00eb3deb8209f34b6e1891b265b2209c4af091ed05bb1835d63d7cd161e1038e0016466d3ab837af6a9758a1f57846281190622e55b4476806d02b
@@ -0,0 +1 @@
+ {}
@@ -0,0 +1,17 @@
+ {
+ "entrypoints/Application.tsx": {
+ "file": "assets/Application-GDgZ4vVt.js",
+ "name": "entrypoints/Application.tsx",
+ "src": "entrypoints/Application.tsx",
+ "isEntry": true,
+ "css": [
+ "assets/Application-tsa3Id3n.css"
+ ]
+ },
+ "entrypoints/application.js": {
+ "file": "assets/application-DBfCPIOZ.js",
+ "name": "entrypoints/application.js",
+ "src": "entrypoints/application.js",
+ "isEntry": true
+ }
+ }
@@ -0,0 +1,10 @@
+ module EasyML
+ class HealthController < ApplicationController
+ # No authentication or CSRF checks for this action
+ skip_before_action :verify_authenticity_token
+
+ def up
+ render json: { status: "OK" }, status: :ok
+ end
+ end
+ end
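
This new controller backs the `healthcheck` route added in `config/routes.rb` further down. As a minimal sketch of how a host application could probe it, assuming the engine is mounted at `/easy_ml` on a locally running server (the host, port, and mount prefix are assumptions, not something this diff specifies):

```ruby
# Minimal sketch: probe the EasyML health endpoint from plain Ruby.
# The host, port, and /easy_ml mount prefix are assumed for illustration.
require "net/http"
require "json"

response = Net::HTTP.get_response(URI("http://localhost:3000/easy_ml/healthcheck"))

puts response.code                        # expected: "200"
puts JSON.parse(response.body)["status"]  # expected: "OK"
```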
@@ -0,0 +1 @@
+ {}
@@ -0,0 +1,11 @@
+ {
+ "entrypoints/Application.tsx": {
+ "file": "assets/Application-GDgZ4vVt.js",
+ "name": "entrypoints/Application.tsx",
+ "src": "entrypoints/Application.tsx",
+ "isEntry": true,
+ "css": [
+ "assets/Application-tsa3Id3n.css"
+ ]
+ }
+ }
data/bin/build ADDED
@@ -0,0 +1,5 @@
+ #!/usr/bin/env bash
+ set -e
+
+ bin/build_vite
+ gem build easy_ml.gemspec
data/bin/build_vite ADDED
@@ -0,0 +1,7 @@
+ #!/usr/bin/env bash
+ set -e
+
+ echo "Building production assets for EasyML gem..."
+ # Run the Vite production build
+ bundle exec vite build
+ echo "Production assets built successfully."
data/bin/console ADDED
@@ -0,0 +1,11 @@
+ #!/usr/bin/env ruby
+ # frozen_string_literal: true
+
+ require "bundler/setup"
+ require "easy_ml"
+
+ # You can add fixtures and/or initialization code here to make experimenting
+ # with your gem easier. You can also use a different console, if you like.
+
+ require "irb"
+ IRB.start(__FILE__)
data/bin/rspec ADDED
@@ -0,0 +1,28 @@
+ #!/usr/bin/env ruby
+ # frozen_string_literal: true
+
+ #
+ # This file was generated by Bundler.
+ #
+ # The application 'rspec' is installed as part of a gem, and
+ # this file is here to facilitate running it.
+ #
+
+ ENV["BUNDLE_GEMFILE"] ||= File.expand_path("../Gemfile", __dir__)
+ ENV['SPRING_APPLICATION_ROOT'] = './spec/internal'
+
+ bundle_binstub = File.expand_path("bundle", __dir__)
+
+ if File.file?(bundle_binstub)
+ if File.read(bundle_binstub, 300).include?("This file was generated by Bundler")
+ load(bundle_binstub)
+ else
+ abort("Your `bin/bundle` was not generated by Bundler, so this binstub cannot run.
+ Replace `bin/bundle` by running `bundle binstubs bundler --force`, then run this command again.")
+ end
+ end
+
+ require "rubygems"
+ require "bundler/setup"
+
+ load Gem.bin_path("rspec-core", "rspec")
data/bin/setup ADDED
@@ -0,0 +1,8 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+ IFS=$'\n\t'
+ set -vx
+
+ bundle install
+
+ # Do any other automated setup that you need to do here
data/bin/vite ADDED
@@ -0,0 +1,27 @@
+ #!/usr/bin/env ruby
+ # frozen_string_literal: true
+
+ #
+ # This file was generated by Bundler.
+ #
+ # The application 'vite' is installed as part of a gem, and
+ # this file is here to facilitate running it.
+ #
+
+ ENV["BUNDLE_GEMFILE"] ||= File.expand_path("../Gemfile", __dir__)
+
+ bundle_binstub = File.expand_path("bundle", __dir__)
+
+ if File.file?(bundle_binstub)
+ if File.read(bundle_binstub, 300).include?("This file was generated by Bundler")
+ load(bundle_binstub)
+ else
+ abort("Your `bin/bundle` was not generated by Bundler, so this binstub cannot run.
+ Replace `bin/bundle` by running `bundle binstubs bundler --force`, then run this command again.")
+ end
+ end
+
+ require "rubygems"
+ require "bundler/setup"
+
+ load Gem.bin_path("vite_ruby", "vite")
@@ -1,3 +1,9 @@
- require 'resque'
- Resque.redis = ENV['REDIS_URL'] || 'redis://localhost:6379'
+ require "resque"
 
+ gem_path = Gem::Specification.find_by_name("easy_ml").gem_dir
+ Resque::Pool.configure do |config|
+ config.path = File.join(gem_path, "config", "resque-pool.yml")
+ puts "Resque pool config: #{config.path}"
+ end
+
+ Resque.redis = ENV["REDIS_URL"] || "redis://localhost:6379"
@@ -1,6 +1,6 @@
  development:
- '*': 2
+ 'easy_ml': 5
 
  production:
- '*': <%= ENV['WORKER_COUNT'] || 5 %>
+ 'easy_ml': <%= ENV['WORKER_COUNT'] || 5 %>
 
data/config/routes.rb CHANGED
@@ -1,5 +1,6 @@
  EasyML::Engine.routes.draw do
  root to: "models#index"
+ get "healthcheck", to: "health#up"
  resources :models, as: :easy_ml_models do
  member do
  post :train
@@ -65,6 +65,9 @@ module EasyML
  end
 
  initializer "easy_ml.active_job_config" do
+ resque_initializer = File.expand_path("config/initializers/resque.rb", root)
+ require resque_initializer if File.exist?(resque_initializer)
+
  ActiveSupport.on_load(:active_job) do
  self.queue_adapter = :resque
  end
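
Taken together with the `resque-pool.yml` change above (workers now watch an `easy_ml` queue instead of `'*'`), this initializer routes ActiveJob through Resque. A hedged sketch of what that means for a job in a host application; `ExampleTrainingJob` is illustrative and not part of the gem:

```ruby
# Illustrative only: a job enqueued on the "easy_ml" queue will be picked up
# by the resque-pool workers configured in config/resque-pool.yml above.
class ExampleTrainingJob < ActiveJob::Base
  queue_as :easy_ml

  def perform(model_id)
    # training or retraining work would go here
  end
end

# With the engine's initializer active, this lands on the "easy_ml" Resque queue.
ExampleTrainingJob.perform_later(42)
```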
@@ -1,7 +1,7 @@
  # frozen_string_literal: true
 
  module EasyML
- VERSION = "0.2.0-rc9"
+ VERSION = "0.2.0-rc11"
 
  module Version
  end
@@ -0,0 +1,13 @@
+ namespace :easy_ml do
+ desc "Start resque-pool with the gem's configuration"
+ task :resque_pool do
+ require "resque"
+ gem_path = Gem::Specification.find_by_name("easy_ml").gem_dir
+ config_path = File.join(gem_path, "config", "resque-pool.yml")
+
+ ENV["RESQUE_POOL_CONFIG"] = config_path
+ puts "Starting resque-pool with config: #{config_path}"
+
+ exec "bundle exec resque-pool --environment #{ENV["RAILS_ENV"] || "development"} --config #{config_path}"
+ end
+ end
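
For reference, a small sketch of the command string this task ends up exec-ing; the gem path shown is a made-up example (in practice it is resolved via `Gem::Specification.find_by_name`):

```ruby
# Illustration of how the task assembles its command (example path only).
config_path = "/usr/local/bundle/gems/easy_ml-0.2.0.pre.rc11/config/resque-pool.yml"
environment = ENV["RAILS_ENV"] || "development"

command = "bundle exec resque-pool --environment #{environment} --config #{config_path}"
puts command
# => bundle exec resque-pool --environment development --config /usr/local/bundle/gems/easy_ml-0.2.0.pre.rc11/config/resque-pool.yml
```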
@@ -0,0 +1,11 @@
+ {
+ "entrypoints/Application.tsx": {
+ "file": "assets/entrypoints/Application.tsx-GDgZ4vVt.js",
+ "name": "entrypoints/Application.tsx",
+ "src": "entrypoints/Application.tsx",
+ "isEntry": true,
+ "css": [
+ "assets/Application-tsa3Id3n.css"
+ ]
+ }
+ }
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: easy_ml
  version: !ruby/object:Gem::Version
- version: 0.2.0.pre.rc9
+ version: 0.2.0.pre.rc11
  platform: ruby
  authors:
  - Brett Shollenberger
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2025-01-07 00:00:00.000000000 Z
+ date: 2025-01-08 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: activerecord
@@ -437,16 +437,19 @@ executables: []
  extensions: []
  extra_rdoc_files: []
  files:
- - README.md
- - Rakefile
+ - app/.vite/manifest-assets.json
+ - app/.vite/manifest.json
  - app/controllers/easy_ml/application_controller.rb
  - app/controllers/easy_ml/columns_controller.rb
  - app/controllers/easy_ml/datasets_controller.rb
  - app/controllers/easy_ml/datasources_controller.rb
  - app/controllers/easy_ml/deploys_controller.rb
+ - app/controllers/easy_ml/health_controller.rb
  - app/controllers/easy_ml/models_controller.rb
  - app/controllers/easy_ml/retraining_runs_controller.rb
  - app/controllers/easy_ml/settings_controller.rb
+ - app/easy_ml/.vite/manifest-assets.json
+ - app/easy_ml/.vite/manifest.json
  - app/easy_ml/assets/Application-GDgZ4vVt.js
  - app/easy_ml/assets/Application-GDgZ4vVt.js.map
  - app/easy_ml/assets/Application-tsa3Id3n.css
@@ -578,6 +581,12 @@ files:
  - app/serializers/easy_ml/retraining_run_serializer.rb
  - app/serializers/easy_ml/settings_serializer.rb
  - app/views/layouts/easy_ml/application.html.erb
+ - bin/build
+ - bin/build_vite
+ - bin/console
+ - bin/rspec
+ - bin/setup
+ - bin/vite
  - config/initializers/resque.rb
  - config/resque-pool.yml
  - config/routes.rb
@@ -650,7 +659,10 @@ files:
  - lib/easy_ml/support/synced_file.rb
  - lib/easy_ml/support/utc.rb
  - lib/easy_ml/version.rb
+ - lib/tasks/resque.rake
  - lib/tasks/vite.rake
+ - public/easy_ml/assets/.vite/manifest-assets.json
+ - public/easy_ml/assets/.vite/manifest.json
  - public/easy_ml/assets/assets/Application-tsa3Id3n.css
  - public/easy_ml/assets/assets/entrypoints/Application.tsx-GDgZ4vVt.js
  homepage: https://github.com/brettshollenberger/easy_ml
data/README.md DELETED
@@ -1,497 +0,0 @@
- <img src="easy_ml.svg" alt="EasyML Logo" style="width: 310px; height: 300px;">
-
- # EasyML
-
- ~~You can't do machine learning in Ruby.~~
-
- Deploy models in minutes.
-
- ## What is EasyML?
-
- EasyML is a **low code/no code**, end-to-end machine learning framework for Ruby on Rails.
-
- **Get productionized models in minutes.** It takes the guesswork out of:
-
- - Preprocessing data
- - Storing and batch computing features
- - Training models
- - Metric visualization
- - Deployment and versioning
- - Evaluating model performance
-
- With a dead-simple point-and-click interface, EasyML makes it stupid easy to train and deploy.
-
- Oh yeah, and it's open source!
-
- ## Features
-
- - **No Code (if you want)**: EasyML ships as a Rails engine. Just mount it in your app and get started.
- - **Opinionated Framework**: Provides a structured approach to data and model management, ensuring best practices are followed.
- - **Model Lifecycle On Rails**: Want predictions directly from your Rails app? You can do that.
- - **Easily Extensible**: Want a model that's not supported? Send a pull request!
-
- ## Current and Planned Features
-
- ### Models Available
-
- | XGBoost | LightGBM | TensorFlow | PyTorch |
- | ------- | -------- | ---------- | ------- |
- | ✅ | ❌ | ❌ | ❌ |
-
- ### Datasources Available
-
- | S3 | File | Polars | SQL Databases | REST APIs |
- | --- | ---- | ------ | ------------- | --------- |
- | ✅ | ✅ | ✅ | ❌ | ❌ |
-
- _Note: Features marked with ❌ are part of the roadmap and are not yet implemented._
-
- ## Quick Start:
-
- Building a Production pipeline is as easy as 1,2,3!
-
- ### 1. Create Your Dataset
-
- ```ruby
- class MyDataset < EasyML::Data::Dataset
- datasource :s3, s3_bucket: "my-bucket" # Every time the data changes, we'll pull new data
- target "revenue" # What are we trying to predict?
- splitter :date, date_column: "created_at" # How should we partition data into training, test, and validation datasets?
- transforms DataPipeline # Class that manages data transformation, adding new columns, etc.
- preprocessing_steps({
- training: {
- annual_revenue: { median: true, clip: { min: 0, max: 500_000 } }
- }
- }) # If annual revenue is missing, use the median value, after clipping the values into the approved list
- end
- ```
-
- ### 2. Create a Model
-
- ```ruby
- class MyModel < EasyML::Models::XGBoost
- dataset MyDataset
- task :regression # Or classification
- hyperparameters({
- max_depth: 5,
- learning_rate: 0.1,
- objective: "reg:squarederror"
- })
- end
- ```
-
- ### 3. Create a Trainer
-
- ```ruby
- class MyTrainer < EasyML::Trainer
- model MyModel
- evaluator MyMetrics
- end
-
- class MyMetrics
- def metric_we_make_money(y_pred, y_true)
- return true if model_makes_money?
- return false if model_lose_money?
- end
-
- def metric_sales_team_has_enough_leads(y_pred, y_true)
- return false if sales_will_be_sitting_on_their_hands?
- end
- end
- ```
-
- Now you're ready to predict in production!
-
- ```ruby
- MyTrainer.train # Yay, we did it!
- MyTrainer.deploy # Let the production hosts know it's live!
- MyTrainer.predict(customer_data: "I am worth a lot of money")
- # prediction: true!
- ```
-
- ## Mount The Engine
-
- ```ruby
- Rails.application.routes.draw do
- mount EasyML::Engine, at: "easy_ml"
- end
- ```
-
- ## Data Management
-
- EasyML provides a comprehensive data management system that handles all preprocessing tasks, including splitting data into train, test, and validation sets, and avoiding data leakage. The primary abstraction for data handling is the `Dataset` class, which ensures data is properly managed and prepared for machine learning tasks.
-
- ### Preprocessing Features
-
- EasyML offers a variety of preprocessing features to prepare your data for machine learning models. Here's a complete list of available preprocessing steps and examples of when to use them:
-
- - **Mean Imputation**: Replace missing values with the mean of the feature. Use this when you want to maintain the average value of the data.
-
- ```ruby
- annual_revenue: {
- mean: true
- }
- ```
-
- - **Median Imputation**: Replace missing values with the median of the feature. This is useful when you want to maintain the central tendency of the data without being affected by outliers.
-
- ```ruby
- annual_revenue: {
- median: true
- }
- ```
-
- - **Forward Fill (ffill)**: Fill missing values with the last observed value. Use this for time series data where the last known value is a reasonable estimate for missing values.
-
- ```ruby
- created_date: {
- ffill: true
- }
- ```
-
- - **Most Frequent Imputation**: Replace missing values with the most frequently occurring value. This is useful for categorical data where the mode is a reasonable estimate for missing values.
-
- ```ruby
- loan_purpose: {
- most_frequent: true
- }
- ```
-
- - **Constant Imputation**: Replace missing values with a constant value. Use this when you have a specific value that should be used for missing data.
-
- ```ruby
- loan_purpose: {
- constant: { fill_value: 'unknown' }
- }
- ```
-
- - **Today Imputation**: Fill missing date values with the current date. Use this for features that should default to the current date.
-
- ```ruby
- created_date: {
- today: true
- }
- ```
-
- - **One-Hot Encoding**: Convert categorical variables into a set of binary variables. Use this when you have categorical data that needs to be converted into a numerical format for model training.
-
- ```ruby
- loan_purpose: {
- one_hot: true
- }
- ```
-
- - **Ordinal Encoding**: Convert categorical variables into integer labels. Use this when you have categorical data that can be ordinally encoded.
-
- ```ruby
- loan_purpose: {
- categorical: {
- ordinal_encoding: true
- }
- }
- ```
-
- ### Other Dataset Features
-
- - **Data Splitting**: Automatically split data into train, test, and validation sets using various strategies, such as date-based splitting.
- - **Data Synchronization**: Ensure data is synced from its source, such as S3 or local files.
- - **Batch Processing**: Process data in batches to handle large datasets efficiently.
- - **Null Handling**: Alert and handle null values in datasets to ensure data quality.
-
- ## Feature Store
-
- The Feature Store is a powerful component of EasyML that helps you manage, compute, and serve features for your machine learning models. Here's how to use it effectively:
-
- ### Setting Up Features
-
- 1. Create a `features` directory in your application:
-
- ```bash
- mkdir app/features
- ```
-
- 2. Create feature classes in this directory. Each feature should include the `EasyML::Features` module:
-
- ```ruby
- class MyFeature
- include EasyML::Features
-
- def transform(df, feature)
- # Your feature transformation logic here
- end
-
- feature name: "My Feature",
- description: "Description of what this feature does"
- end
- ```
-
- ### Feature Types and Configurations
-
- #### Simple Transform-Only Features
-
- For features that can be computed using only the input columns:
-
- ```ruby
- class DidConvert
- include EasyML::Features
-
- def transform(df, feature)
- df.with_column(
- (Polars.col("rev") > 0).alias("did_convert")
- )
- end
-
- feature name: "did_convert",
- description: "Boolean indicating if conversion occurred"
- end
- ```
-
- #### Batch Processing Features
-
- For features that require processing large datasets in chunks:
-
- ```ruby
- class LastConversionTimeFeature
- include EasyML::Features
-
- def batch(reader, feature)
- # Efficiently query only the company_id column for batching
- # This will create batches of batch_size records (default 1000)
- reader.query(select: ["company_id"], unique: true)["company_id"]
- end
-
- def fit(reader, feature, options = {})
- batch_start = options.dig(:batch_start)
- batch_end = options.dig(:batch_end)
-
- # More efficient than is_in for continuous ranges
- df = reader.query(
- filter: Polars.col("company_id").is_between(batch_start, batch_end),
- select: ["id", "company_id", "converted_at", "created_at"],
- sort: ["company_id", "created_at"]
- )
-
- # For each company, find the last time they converted before each application
- #
- # This value will be cached in the feature store for fast inference retrieval
- df.with_columns([
- Polars.col("converted_at")
- .shift(1)
- .filter(Polars.col("converted_at").is_not_null())
- .over("company_id")
- .alias("last_conversion_time"),
-
- # Also compute days since last conversion
- (Polars.col("created_at") - Polars.col("last_conversion_time"))
- .dt.days()
- .alias("days_since_last_conversion")
- ])[["id", "last_conversion_time", "days_since_last_conversion"]]
- end
-
- def transform(df, feature)
- # Pull the pre-computed values from the feature store
- stored_df = feature.query(filter: Polars.col("id").is_in(df["id"]))
- return df if stored_df.empty?
-
- df.join(stored_df, on: "id", how: "left")
- end
-
- feature name: "Last Conversion Time",
- description: "Computes the last time a company converted before each application",
- batch_size: 1000, # Process 1000 companies at a time
- primary_key: "id",
- cache_for: 24.hours # Cache feature values for 24 hours after running fit
- end
- ```
-
- This example demonstrates several key concepts:
-
- 1. **Efficient Batching**: The `batch` method uses the reader to lazily query only the necessary column for batching
- 1. **Batches Groups Together**: All records with the same `company_id` need to be in the same batch to properly compute the feature, so we create a custom batch (instead of using the primary key `id` column, which would split up companies into different batches)
- 1. **Column Selection**: Only selects required columns in the reader query
- 1. **Feature Computation**: Computes multiple related features (last conversion time and days since) in a single pass.
- 1. **Automatic Feature Store Caching**: The feature store automatically caches feature values returned from the `fit` method
-
- ### Performance Optimization
-
- #### Caching During Development
-
- Use `cache_for` to save processing time during development:
-
- ```ruby
- feature name: "My Feature",
- cache_for: 24.hours # After running fit, this feature will be cached for 24 hours (unless new data is read from datasource, like S3)
- ```
-
- #### Early Returns
-
- Always implement early returns in your transform method to avoid unnecessary reprocessing:
-
- ```ruby
- def transform(df, feature)
- return df if df["required_column"].nil?
- # Feature computation logic
- end
- ```
-
- #### Using Reader vs DataFrame
-
- - The Polars `reader` is a lazy reader that allows you to query data incrementally.
- - If your feature includes a `batch` method or uses the `batch_size` variable, you will receive a reader instead of a dataframe in the `fit` method
-
- ```ruby
- def fit(reader, feature)
- df = reader.query(select: ["column1", "column2"])
- # Process only needed columns
- end
- ```
-
- - If you don't have a `batch` method or don't use the `batch_size` variable, you will receive a dataframe in the `fit` method
-
- ````ruby
- def fit(df, feature)
- # process directly on dataframe
- end
-
- - To ensure you get a reader instead of a dataframe, include the `batch` method
-
- ```ruby
- def batch(reader, feature)
- reader.query(select: ["column1"])["column1"]
- end
-
- feature name: "My Feature", batch_size: 1_000
- ````
-
- ### Production Considerations
-
- #### Handling Missing Data
-
- When processing historical data:
-
- 1. Check for missing dates:
-
- ```ruby
- def transform(df, feature)
- missing_dates = feature.store.missing_dates(start_date, end_date)
- return df if missing_dates.empty?
-
- # Process only missing dates
- process_dates(df, missing_dates)
- end
- ```
-
- ### Best Practices
-
- 1. Always specify a `primary_key` to allow the feature store to partition your data
- 1. Use `batch/fit` to process large datasets in batches
- 1. Use `batch/fit` to allow faster inference feature computation
- 1. Use transform-only features when all required columns will be available on the inference dataset
- 1. Use `cache_for` to save processing time during development
- 1. Only query necessary columns using the reader
-
- ## Installation
-
- Install necessary Python dependencies
-
- 1. **Install Python dependencies (don't worry, all code is in Ruby, we just call through to Python)**
-
- ```bash
- pip install wandb optuna
- ```
-
- 1. **Install the gem**:
-
- ```bash
- gem install easy_ml
- ```
-
- 2. **Run the generator to store model versions**:
-
- ```bash
- rails generate easy_ml:migration
- rails db:create # If this is a new app
- rails db:migrate
- ```
-
- 3. Add the `easy_ml` dir to your `.gitignore` — This is where datasets and model files will be downloaded
-
- ```
- # .gitignore
- easy_ml/
- ```
-
- ## Usage
-
- To use EasyML in your Rails application, follow these steps:
-
- 1. **Define your preprocessing steps** in a configuration hash. For example:
-
- ```ruby
- preprocessing_steps = {
- training: {
- annual_revenue: {
- median: true,
- clip: { min: 0, max: 1_000_000 }
- },
- loan_purpose: {
- categorical: {
- categorical_min: 2,
- one_hot: true
- }
- }
- }
- }
- ```
-
- 2. **Create a dataset** using the `EasyML::Data::Dataset` class, providing necessary configurations such as data source, target, and preprocessing steps.
-
- 3. **Train a model** using the `EasyML::Models` module, specifying the model class and configuration.
-
- 4. **Deploy the model** by marking it as live and storing it in the configured S3 bucket.
-
- ## Development
-
- After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
-
- To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
-
- ## Contributing
-
- 1. Install Appraisals gemfiles:
-
- ```bash
- bundle exec appraisal install
- ```
-
- 1. Creating a test app:
-
- a. Follow the typical steps
- b. Declare an environment variable: `EASY_ML_DEV=true`, using Figaro, dotenv, or similar to load develoment assets
- c. Run `yarn vite dev` in both the `easy_ml` gem and test app directories
-
- 1. Building production assets
-
- ```bash
- bin/vite_build
- ```
-
- 1. Ensure you run tests against all supported Rails versions
-
- Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/easy_ml. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/[USERNAME]/easy_ml/blob/main/CODE_OF_CONDUCT.md).
-
- ## License
-
- The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
-
- ## Code of Conduct
-
- Everyone interacting in the EasyML project's codebases, issue trackers, chat rooms, and mailing lists is expected to follow the [code of conduct](https://github.com/[USERNAME]/easy_ml/blob/main/CODE_OF_CONDUCT.md).
-
- ## Expected Future Enhancements
-
- - **Support for Additional Models**: Integration with LightGBM, TensorFlow, and PyTorch.
- - **Expanded Data Source Support**: Ability to pull data from SQL databases and REST APIs.
- - **Enhanced Deployment Options**: More flexible deployment strategies and integration with CI/CD pipelines.
- - **Advanced Monitoring and Logging**: Improved tools for monitoring model performance and logging.
- - **User Interface Improvements**: Enhanced UI components for managing models and datasets.
data/Rakefile DELETED
@@ -1,57 +0,0 @@
- # frozen_string_literal: true
-
- require "sprockets/railtie"
- require "bundler/gem_tasks"
- require "rspec/core/rake_task"
-
- RSpec::Core::RakeTask.new(:spec)
-
- require "rubocop/rake_task"
-
- RuboCop::RakeTask.new
-
- task default: %i[spec rubocop]
-
- Bundler.require(:default)
-
- # Load your gem's code
- require_relative "lib/easy_ml"
-
- # Load the annotate tasks
- require "annotate/annotate_models"
-
- task :environment do
- require "combustion"
- require "sprockets"
- Combustion.path = "spec/internal"
- Combustion.initialize! :active_record do |config|
- config.assets = ActiveSupport::OrderedOptions.new # Stub to avoid errors
- config.assets.enabled = false # Set false since assets are handled by Vite
- end
- EasyML::Engine.eager_load!
- end
-
- namespace :easy_ml do
- task annotate_models: :environment do
- model_dir = File.expand_path("app/models", EasyML::Engine.root)
- $LOAD_PATH.unshift(model_dir) unless $LOAD_PATH.include?(model_dir)
-
- AnnotateModels.do_annotations(
- is_rake: true,
- model_dir: [EasyML::Engine.root.join("app/models/easy_ml").to_s],
- root_dir: [EasyML::Engine.root.join("app/models/easy_ml").to_s],
- include_modules: true, # Include modules/namespaces in the annotation
- )
- end
-
- task :create_test_migrations do
- require "combustion"
- require "rails/generators"
- require_relative "lib/easy_ml/railtie/generators/migration/migration_generator"
-
- db_files = Dir.glob(EasyML::Engine.root.join("spec/internal/db/migrate/**/*"))
-
- FileUtils.rm(db_files)
- Rails::Generators.invoke("easy_ml:migration", [], { destination_root: EasyML::Engine.root.join("spec/internal") })
- end
- end