easy_ml 0.1.4 → 0.2.0.pre.rc1
- checksums.yaml +4 -4
- data/README.md +234 -26
- data/Rakefile +45 -0
- data/app/controllers/easy_ml/application_controller.rb +67 -0
- data/app/controllers/easy_ml/columns_controller.rb +38 -0
- data/app/controllers/easy_ml/datasets_controller.rb +156 -0
- data/app/controllers/easy_ml/datasources_controller.rb +88 -0
- data/app/controllers/easy_ml/deploys_controller.rb +20 -0
- data/app/controllers/easy_ml/models_controller.rb +151 -0
- data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
- data/app/controllers/easy_ml/settings_controller.rb +59 -0
- data/app/frontend/components/AlertProvider.tsx +108 -0
- data/app/frontend/components/DatasetPreview.tsx +161 -0
- data/app/frontend/components/EmptyState.tsx +28 -0
- data/app/frontend/components/ModelCard.tsx +255 -0
- data/app/frontend/components/ModelDetails.tsx +334 -0
- data/app/frontend/components/ModelForm.tsx +384 -0
- data/app/frontend/components/Navigation.tsx +300 -0
- data/app/frontend/components/Pagination.tsx +72 -0
- data/app/frontend/components/Popover.tsx +55 -0
- data/app/frontend/components/PredictionStream.tsx +105 -0
- data/app/frontend/components/ScheduleModal.tsx +726 -0
- data/app/frontend/components/SearchInput.tsx +23 -0
- data/app/frontend/components/SearchableSelect.tsx +132 -0
- data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
- data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
- data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
- data/app/frontend/components/dataset/ColumnList.tsx +101 -0
- data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
- data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
- data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
- data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
- data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
- data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
- data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
- data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
- data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
- data/app/frontend/components/dataset/splitters/constants.ts +77 -0
- data/app/frontend/components/dataset/splitters/types.ts +168 -0
- data/app/frontend/components/dataset/splitters/utils.ts +53 -0
- data/app/frontend/components/features/CodeEditor.tsx +46 -0
- data/app/frontend/components/features/DataPreview.tsx +150 -0
- data/app/frontend/components/features/FeatureCard.tsx +88 -0
- data/app/frontend/components/features/FeatureForm.tsx +235 -0
- data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
- data/app/frontend/components/settings/PluginSettings.tsx +81 -0
- data/app/frontend/components/ui/badge.tsx +44 -0
- data/app/frontend/components/ui/collapsible.tsx +9 -0
- data/app/frontend/components/ui/scroll-area.tsx +46 -0
- data/app/frontend/components/ui/separator.tsx +29 -0
- data/app/frontend/entrypoints/App.tsx +40 -0
- data/app/frontend/entrypoints/Application.tsx +24 -0
- data/app/frontend/hooks/useAutosave.ts +61 -0
- data/app/frontend/layouts/Layout.tsx +38 -0
- data/app/frontend/lib/utils.ts +6 -0
- data/app/frontend/mockData.ts +272 -0
- data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
- data/app/frontend/pages/DatasetsPage.tsx +261 -0
- data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
- data/app/frontend/pages/DatasourcesPage.tsx +261 -0
- data/app/frontend/pages/EditModelPage.tsx +45 -0
- data/app/frontend/pages/EditTransformationPage.tsx +56 -0
- data/app/frontend/pages/ModelsPage.tsx +115 -0
- data/app/frontend/pages/NewDatasetPage.tsx +366 -0
- data/app/frontend/pages/NewModelPage.tsx +45 -0
- data/app/frontend/pages/NewTransformationPage.tsx +43 -0
- data/app/frontend/pages/SettingsPage.tsx +272 -0
- data/app/frontend/pages/ShowModelPage.tsx +30 -0
- data/app/frontend/pages/TransformationsPage.tsx +95 -0
- data/app/frontend/styles/application.css +100 -0
- data/app/frontend/types/dataset.ts +146 -0
- data/app/frontend/types/datasource.ts +33 -0
- data/app/frontend/types/preprocessing.ts +1 -0
- data/app/frontend/types.ts +113 -0
- data/app/helpers/easy_ml/application_helper.rb +10 -0
- data/app/jobs/easy_ml/application_job.rb +21 -0
- data/app/jobs/easy_ml/batch_job.rb +46 -0
- data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
- data/app/jobs/easy_ml/deploy_job.rb +13 -0
- data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
- data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
- data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
- data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
- data/app/jobs/easy_ml/training_job.rb +62 -0
- data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
- data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
- data/app/models/easy_ml/cleaner.rb +82 -0
- data/app/models/easy_ml/column.rb +124 -0
- data/app/models/easy_ml/column_history.rb +30 -0
- data/app/models/easy_ml/column_list.rb +122 -0
- data/app/models/easy_ml/concerns/configurable.rb +61 -0
- data/app/models/easy_ml/concerns/versionable.rb +19 -0
- data/app/models/easy_ml/dataset.rb +767 -0
- data/app/models/easy_ml/dataset_history.rb +56 -0
- data/app/models/easy_ml/datasource.rb +182 -0
- data/app/models/easy_ml/datasource_history.rb +24 -0
- data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
- data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
- data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
- data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
- data/app/models/easy_ml/deploy.rb +114 -0
- data/app/models/easy_ml/event.rb +79 -0
- data/app/models/easy_ml/feature.rb +437 -0
- data/app/models/easy_ml/feature_history.rb +38 -0
- data/app/models/easy_ml/model.rb +575 -41
- data/app/models/easy_ml/model_file.rb +133 -0
- data/app/models/easy_ml/model_file_history.rb +24 -0
- data/app/models/easy_ml/model_history.rb +51 -0
- data/app/models/easy_ml/models/base_model.rb +58 -0
- data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
- data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
- data/app/models/easy_ml/models/xgboost.rb +544 -5
- data/app/models/easy_ml/prediction.rb +44 -0
- data/app/models/easy_ml/retraining_job.rb +278 -0
- data/app/models/easy_ml/retraining_run.rb +184 -0
- data/app/models/easy_ml/settings.rb +37 -0
- data/app/models/easy_ml/splitter.rb +90 -0
- data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
- data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
- data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
- data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
- data/app/models/easy_ml/tuner_job.rb +56 -0
- data/app/models/easy_ml/tuner_run.rb +31 -0
- data/app/models/splitter_history.rb +6 -0
- data/app/serializers/easy_ml/column_serializer.rb +27 -0
- data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
- data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
- data/app/serializers/easy_ml/feature_serializer.rb +27 -0
- data/app/serializers/easy_ml/model_serializer.rb +90 -0
- data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
- data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
- data/app/serializers/easy_ml/settings_serializer.rb +9 -0
- data/app/views/layouts/easy_ml/application.html.erb +15 -0
- data/config/initializers/resque.rb +3 -0
- data/config/resque-pool.yml +6 -0
- data/config/routes.rb +39 -0
- data/config/spring.rb +1 -0
- data/config/vite.json +15 -0
- data/lib/easy_ml/configuration.rb +64 -0
- data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
- data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
- data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
- data/lib/easy_ml/core/model_evaluator.rb +161 -89
- data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
- data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
- data/lib/easy_ml/core/tuner.rb +123 -62
- data/lib/easy_ml/core.rb +0 -3
- data/lib/easy_ml/core_ext/hash.rb +24 -0
- data/lib/easy_ml/core_ext/pathname.rb +11 -5
- data/lib/easy_ml/data/date_converter.rb +90 -0
- data/lib/easy_ml/data/filter_extensions.rb +31 -0
- data/lib/easy_ml/data/polars_column.rb +126 -0
- data/lib/easy_ml/data/polars_reader.rb +297 -0
- data/lib/easy_ml/data/preprocessor.rb +280 -142
- data/lib/easy_ml/data/simple_imputer.rb +255 -0
- data/lib/easy_ml/data/splits/file_split.rb +252 -0
- data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
- data/lib/easy_ml/data/splits/split.rb +95 -0
- data/lib/easy_ml/data/splits.rb +9 -0
- data/lib/easy_ml/data/statistics_learner.rb +93 -0
- data/lib/easy_ml/data/synced_directory.rb +341 -0
- data/lib/easy_ml/data.rb +6 -2
- data/lib/easy_ml/engine.rb +105 -6
- data/lib/easy_ml/feature_store.rb +227 -0
- data/lib/easy_ml/features.rb +61 -0
- data/lib/easy_ml/initializers/inflections.rb +17 -3
- data/lib/easy_ml/logging.rb +2 -2
- data/lib/easy_ml/predict.rb +74 -0
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
- data/lib/easy_ml/support/est.rb +5 -1
- data/lib/easy_ml/support/file_rotate.rb +79 -15
- data/lib/easy_ml/support/file_support.rb +9 -0
- data/lib/easy_ml/support/local_file.rb +24 -0
- data/lib/easy_ml/support/lockable.rb +62 -0
- data/lib/easy_ml/support/synced_file.rb +103 -0
- data/lib/easy_ml/support/utc.rb +5 -1
- data/lib/easy_ml/support.rb +6 -3
- data/lib/easy_ml/version.rb +4 -1
- data/lib/easy_ml.rb +7 -2
- metadata +355 -72
- data/app/models/easy_ml/models.rb +0 -5
- data/lib/easy_ml/core/model.rb +0 -30
- data/lib/easy_ml/core/model_core.rb +0 -181
- data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
- data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
- data/lib/easy_ml/core/models/xgboost.rb +0 -10
- data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
- data/lib/easy_ml/core/models.rb +0 -10
- data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
- data/lib/easy_ml/core/uploaders.rb +0 -7
- data/lib/easy_ml/data/dataloader.rb +0 -6
- data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
- data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
- data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
- data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
- data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
- data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
- data/lib/easy_ml/data/dataset/splits.rb +0 -11
- data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
- data/lib/easy_ml/data/dataset/splitters.rb +0 -9
- data/lib/easy_ml/data/dataset.rb +0 -430
- data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
- data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
- data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
- data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
- data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
- data/lib/easy_ml/data/datasource.rb +0 -33
- data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
- data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
- data/lib/easy_ml/deployment.rb +0 -5
- data/lib/easy_ml/support/synced_directory.rb +0 -134
- data/lib/easy_ml/transforms.rb +0 -29
- /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
checksums.yaml
CHANGED
```diff
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 62ec6069cb9e47af4d2fd29668202132af589b1fb526b93c8bd4766aec5df3b1
+  data.tar.gz: 6e06d4e607d50b74f8d7ad5a881380d05bed58f351bc22357bfa3e5038850322
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 61eed2f9f210fd5ac38af1b0972d369d73e5853491a5889c0b04c9dd776e5509514ab98ac56ab8e3bd876223a72a5463c29baa238899220acd369ee5e58c3206
+  data.tar.gz: d266908a752c337c7817484af4235ba78b38e47495d25fe815feafad11b820cd63ca47300619c46f56971d810b2a475a3ebb2c436359026ff5f2f5f794cb0bd1
```
data/README.md
CHANGED
````diff
@@ -2,13 +2,33 @@
 
 # EasyML
 
-
+~~You can't do machine learning in Ruby.~~
+
+Deploy models in minutes.
+
+## What is EasyML?
+
+EasyML is a **low code/no code**, end-to-end machine learning framework for Ruby on Rails.
+
+**Get productionized models in minutes.** It takes the guesswork out of:
+
+- Preprocessing data
+- Storing and batch computing features
+- Training models
+- Metric visualization
+- Deployment and versioning
+- Evaluating model performance
+
+With a dead-simple point-and-click interface, EasyML makes it stupid easy to train and deploy.
+
+Oh yeah, and it's open source!
 
 ## Features
 
-- **
-- **Opinionated Framework**: Provides a structured approach to model management, ensuring best practices are followed.
-- **Model Lifecycle On Rails**:
+- **No Code (if you want)**: EasyML ships as a Rails engine. Just mount it in your app and get started.
+- **Opinionated Framework**: Provides a structured approach to data and model management, ensuring best practices are followed.
+- **Model Lifecycle On Rails**: Want predictions directly from your Rails app? You can do that.
+- **Easily Extensible**: Want a model that's not supported? Send a pull request!
 
 ## Current and Planned Features
 
@@ -89,6 +109,14 @@ MyTrainer.predict(customer_data: "I am worth a lot of money")
 # prediction: true!
 ```
 
+## Mount The Engine
+
+```ruby
+Rails.application.routes.draw do
+  mount EasyML::Engine, at: "easy_ml"
+end
+```
+
 ## Data Management
 
 EasyML provides a comprehensive data management system that handles all preprocessing tasks, including splitting data into train, test, and validation sets, and avoiding data leakage. The primary abstraction for data handling is the `Dataset` class, which ensures data is properly managed and prepared for machine learning tasks.
@@ -153,12 +181,12 @@ EasyML offers a variety of preprocessing features to prepare your data for machi
   }
 ```
 
-- **
+- **Ordinal Encoding**: Convert categorical variables into integer labels. Use this when you have categorical data that can be ordinally encoded.
 
 ```ruby
 loan_purpose: {
   categorical: {
-
+    ordinal_encoding: true
   }
 }
 ```
@@ -170,6 +198,198 @@
 - **Batch Processing**: Process data in batches to handle large datasets efficiently.
 - **Null Handling**: Alert and handle null values in datasets to ensure data quality.
 
+## Feature Store
+
+The Feature Store is a powerful component of EasyML that helps you manage, compute, and serve features for your machine learning models. Here's how to use it effectively:
+
+### Setting Up Features
+
+1. Create a `features` directory in your application:
+
+```bash
+mkdir app/features
+```
+
+2. Create feature classes in this directory. Each feature should include the `EasyML::Features` module:
+
+```ruby
+class MyFeature
+  include EasyML::Features
+
+  def transform(df, feature)
+    # Your feature transformation logic here
+  end
+
+  feature name: "My Feature",
+          description: "Description of what this feature does"
+end
+```
+
+### Feature Types and Configurations
+
+#### Simple Transform-Only Features
+
+For features that can be computed using only the input columns:
+
+```ruby
+class DidConvert
+  include EasyML::Features
+
+  def transform(df, feature)
+    df.with_column(
+      (Polars.col("rev") > 0).alias("did_convert")
+    )
+  end
+
+  feature name: "did_convert",
+          description: "Boolean indicating if conversion occurred"
+end
+```
+
+#### Batch Processing Features
+
+For features that require processing large datasets in chunks:
+
+```ruby
+class LastConversionTimeFeature
+  include EasyML::Features
+
+  def batch(reader, feature)
+    # Efficiently query only the company_id column for batching
+    # This will create batches of batch_size records (default 1000)
+    reader.query(select: ["company_id"], unique: true)["company_id"]
+  end
+
+  def fit(reader, feature, options = {})
+    batch_start = options.dig(:batch_start)
+    batch_end = options.dig(:batch_end)
+
+    # More efficient than is_in for continuous ranges
+    df = reader.query(
+      filter: Polars.col("company_id").is_between(batch_start, batch_end),
+      select: ["id", "company_id", "converted_at", "created_at"],
+      sort: ["company_id", "created_at"]
+    )
+
+    # For each company, find the last time they converted before each application
+    #
+    # This value will be cached in the feature store for fast inference retrieval
+    df.with_columns([
+      Polars.col("converted_at")
+        .shift(1)
+        .filter(Polars.col("converted_at").is_not_null())
+        .over("company_id")
+        .alias("last_conversion_time"),
+
+      # Also compute days since last conversion
+      (Polars.col("created_at") - Polars.col("last_conversion_time"))
+        .dt.days()
+        .alias("days_since_last_conversion")
+    ])[["id", "last_conversion_time", "days_since_last_conversion"]]
+  end
+
+  def transform(df, feature)
+    # Pull the pre-computed values from the feature store
+    stored_df = feature.query(filter: Polars.col("id").is_in(df["id"]))
+    return df if stored_df.empty?
+
+    df.join(stored_df, on: "id", how: "left")
+  end
+
+  feature name: "Last Conversion Time",
+          description: "Computes the last time a company converted before each application",
+          batch_size: 1000, # Process 1000 companies at a time
+          primary_key: "id",
+          cache_for: 24.hours # Cache feature values for 24 hours after running fit
+end
+```
+
+This example demonstrates several key concepts:
+
+1. **Efficient Batching**: The `batch` method uses the reader to lazily query only the necessary column for batching
+1. **Batches Groups Together**: All records with the same `company_id` need to be in the same batch to properly compute the feature, so we create a custom batch (instead of using the primary key `id` column, which would split up companies into different batches)
+1. **Column Selection**: Only selects required columns in the reader query
+1. **Feature Computation**: Computes multiple related features (last conversion time and days since) in a single pass.
+1. **Automatic Feature Store Caching**: The feature store automatically caches feature values returned from the `fit` method
+
+### Performance Optimization
+
+#### Caching During Development
+
+Use `cache_for` to save processing time during development:
+
+```ruby
+feature name: "My Feature",
+        cache_for: 24.hours # After running fit, this feature will be cached for 24 hours (unless new data is read from the datasource, like S3)
+```
+
+#### Early Returns
+
+Always implement early returns in your transform method to avoid unnecessary reprocessing:
+
+```ruby
+def transform(df, feature)
+  return df if df["required_column"].nil?
+  # Feature computation logic
+end
+```
+
+#### Using Reader vs DataFrame
+
+- The Polars `reader` is a lazy reader that allows you to query data incrementally.
+- If your feature includes a `batch` method or uses the `batch_size` variable, you will receive a reader instead of a dataframe in the `fit` method:
+
+```ruby
+def fit(reader, feature)
+  df = reader.query(select: ["column1", "column2"])
+  # Process only needed columns
+end
+```
+
+- If you don't have a `batch` method or don't use the `batch_size` variable, you will receive a dataframe in the `fit` method:
+
+```ruby
+def fit(df, feature)
+  # process directly on dataframe
+end
+```
+- To ensure you get a reader instead of a dataframe, include the `batch` method:
+
+```ruby
+def batch(reader, feature)
+  reader.query(select: ["column1"])["column1"]
+end
+
+feature name: "My Feature", batch_size: 1_000
+```
+
+### Production Considerations
+
+#### Handling Missing Data
+
+When processing historical data:
+
+1. Check for missing dates:
+
+```ruby
+def transform(df, feature)
+  missing_dates = feature.store.missing_dates(start_date, end_date)
+  return df if missing_dates.empty?
+
+  # Process only missing dates
+  process_dates(df, missing_dates)
+end
+```
+
+### Best Practices
+
+1. Always specify a `primary_key` to allow the feature store to partition your data
+1. Use `batch/fit` to process large datasets in batches
+1. Use `batch/fit` to allow faster inference feature computation
+1. Use transform-only features when all required columns will be available on the inference dataset
+1. Use `cache_for` to save processing time during development
+1. Only query necessary columns using the reader
+
 ## Installation
 
 Install necessary Python dependencies
@@ -194,26 +414,6 @@ pip install optuna
 rails db:migrate
 ```
 
-3. **Configure CarrierWave for S3 storage**:
-
-   Ensure you have CarrierWave configured to use AWS S3. If not, add the following configuration:
-
-   ```ruby
-   # config/initializers/carrierwave.rb
-   CarrierWave.configure do |config|
-     config.fog_provider = 'fog/aws'
-     config.fog_credentials = {
-       provider: 'AWS',
-       aws_access_key_id: ENV['AWS_ACCESS_KEY_ID'],
-       aws_secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'],
-       region: ENV['AWS_REGION'],
-     }
-     config.fog_directory = ENV['AWS_S3_BUCKET']
-     config.fog_public = false
-     config.storage = :fog
-   end
-   ```
-
 ## Usage
 
 To use EasyML in your Rails application, follow these steps:
@@ -251,6 +451,14 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
 
 ## Contributing
 
+1. Install Appraisals gemfiles:
+
+```bash
+bundle exec appraisal install
+```
+
+2. Ensure you run tests against all supported Rails versions
+
 Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/easy_ml. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/[USERNAME]/easy_ml/blob/main/CODE_OF_CONDUCT.md).
 
 ## License
````
data/Rakefile
CHANGED
```diff
@@ -1,5 +1,6 @@
 # frozen_string_literal: true
 
+require "sprockets/railtie"
 require "bundler/gem_tasks"
 require "rspec/core/rake_task"
 
@@ -10,3 +11,47 @@ require "rubocop/rake_task"
 RuboCop::RakeTask.new
 
 task default: %i[spec rubocop]
+
+Bundler.require(:default)
+
+# Load your gem's code
+require_relative "lib/easy_ml"
+
+# Load the annotate tasks
+require "annotate/annotate_models"
+
+task :environment do
+  require "combustion"
+  require "sprockets"
+  Combustion.path = "spec/internal"
+  Combustion.initialize! :active_record do |config|
+    config.assets = ActiveSupport::OrderedOptions.new # Stub to avoid errors
+    config.assets.enabled = false # Set false since assets are handled by Vite
+  end
+  EasyML::Engine.eager_load!
+end
+
+namespace :easy_ml do
+  task annotate_models: :environment do
+    model_dir = File.expand_path("app/models", EasyML::Engine.root)
+    $LOAD_PATH.unshift(model_dir) unless $LOAD_PATH.include?(model_dir)
+
+    AnnotateModels.do_annotations(
+      is_rake: true,
+      model_dir: [EasyML::Engine.root.join("app/models/easy_ml").to_s],
+      root_dir: [EasyML::Engine.root.join("app/models/easy_ml").to_s],
+      include_modules: true, # Include modules/namespaces in the annotation
+    )
+  end
+
+  task :create_test_migrations do
+    require "combustion"
+    require "rails/generators"
+    require_relative "lib/easy_ml/railtie/generators/migration/migration_generator"
+
+    db_files = Dir.glob(EasyML::Engine.root.join("spec/internal/db/migrate/**/*"))
+
+    FileUtils.rm(db_files)
+    Rails::Generators.invoke("easy_ml:migration", [], { destination_root: EasyML::Engine.root.join("spec/internal") })
+  end
+end
```
data/app/controllers/easy_ml/application_controller.rb
ADDED
```diff
@@ -0,0 +1,67 @@
+require "action_controller"
+
+module EasyML
+  class ApplicationController < ActionController::Base
+    helper EasyML::ApplicationHelper
+
+    include InertiaRails::Controller
+    layout "easy_ml/application"
+
+    protect_from_forgery with: :exception
+
+    before_action :hot_reload
+
+    def hot_reload
+      return unless Rails.env.development? && ENV["EASY_ML_DEMO_APP"]
+
+      Dir[EasyML::Engine.root.join("lib/**/*")].select { |f| Pathname.new(f).extname == ".rb" }.each do |file|
+        load file
+      end
+    end
+
+    def settings_to_json(settings)
+      SettingsSerializer.new(settings).serializable_hash.dig(:data, :attributes)
+    end
+
+    def dataset_to_json(dataset)
+      DatasetSerializer.new(dataset).serializable_hash.dig(:data, :attributes)
+    end
+
+    def datasource_to_json(datasource)
+      DatasourceSerializer.new(datasource).serializable_hash.dig(:data, :attributes)
+    end
+
+    def model_to_json(model)
+      ModelSerializer.new(model).serializable_hash.dig(:data, :attributes)
+    end
+
+    def retraining_job_to_json(job)
+      RetrainingJobSerializer.new(job).serializable_hash.dig(:data, :attributes)
+    end
+
+    def retraining_run_to_json(run)
+      RetrainingRunSerializer.new(run).serializable_hash.dig(:data, :attributes)
+    end
+
+    def easy_ml_root
+      Rails.application.routes.routes.find { |r| r.app.app == EasyML::Engine }&.path&.spec&.to_s
+    end
+
+    inertia_share do
+      flash_messages = []
+
+      flash_messages << { type: "success", message: flash[:notice] } if flash[:notice]
+
+      flash_messages << { type: "error", message: flash[:alert] } if flash[:alert]
+
+      flash_messages << { type: "info", message: flash[:info] } if flash[:info]
+
+      {
+        rootPath: easy_ml_root,
+        url: request.path.gsub(Regexp.new(easy_ml_root), ""),
+        errors: session.delete(:errors) || {},
+        flash: flash_messages,
+      }
+    end
+  end
+end
```
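For orientation, the `inertia_share` block above is what every Inertia page render receives as shared props. A minimal sketch of the resulting hash, assuming the engine is mounted at `/easy_ml` and a flash notice is set (the concrete values are hypothetical, not taken from the gem):

```ruby
# Hypothetical shape of the shared props built by the inertia_share block above.
{
  rootPath: "/easy_ml",            # easy_ml_root: the path the engine is mounted at
  url: "/datasets",                # request.path with the mount prefix stripped
  errors: {},                      # validation errors stashed in the session, if any
  flash: [
    { type: "success", message: "Dataset was successfully created." },
  ],
}
```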
data/app/controllers/easy_ml/columns_controller.rb
ADDED
```diff
@@ -0,0 +1,38 @@
+# == Schema Information
+#
+# Table name: easy_ml_columns
+#
+#  id                  :bigint           not null, primary key
+#  dataset_id          :bigint           not null
+#  name                :string           not null
+#  description         :string
+#  datatype            :string
+#  polars_datatype     :string
+#  is_target           :boolean
+#  hidden              :boolean          default(FALSE)
+#  drop_if_null        :boolean          default(FALSE)
+#  preprocessing_steps :json
+#  sample_values       :json
+#  statistics          :json
+#  created_at          :datetime         not null
+#  updated_at          :datetime         not null
+#
+module EasyML
+  class ColumnsController < ApplicationController
+    def update
+      @column = EasyML::Column.find(params[:id])
+
+      if @column.update(column_params)
+        head :ok
+      else
+        render json: { errors: @column.errors }, status: :unprocessable_entity
+      end
+    end
+
+    private
+
+    def column_params
+      params.require(:column).permit(:hidden)
+    end
+  end
+end
```
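Worth noting: the `column_params` whitelist above permits only `hidden`, so any other attributes submitted to this endpoint are discarded by strong parameters. A minimal sketch of that behavior (the request payload is hypothetical):

```ruby
require "action_controller"

# Hypothetical payload: only :hidden survives the permit list used in column_params.
raw = ActionController::Parameters.new(
  column: { hidden: true, name: "ignored", datatype: "ignored" }
)
raw.require(:column).permit(:hidden).to_h
# => { "hidden" => true }
```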
data/app/controllers/easy_ml/datasets_controller.rb
ADDED
```diff
@@ -0,0 +1,156 @@
+# == Schema Information
+#
+# Table name: easy_ml_datasets
+#
+#  id                      :bigint           not null, primary key
+#  name                    :string           not null
+#  description             :string
+#  dataset_type            :string
+#  status                  :string
+#  version                 :string
+#  datasource_id           :bigint
+#  root_dir                :string
+#  configuration           :json
+#  num_rows                :bigint
+#  workflow_status         :string
+#  statistics              :json
+#  preprocessor_statistics :json
+#  schema                  :json
+#  refreshed_at            :datetime
+#  created_at              :datetime         not null
+#  updated_at              :datetime         not null
+#
+module EasyML
+  class DatasetsController < ApplicationController
+    def index
+      datasets = Dataset.all
+
+      render inertia: "pages/DatasetsPage", props: {
+        datasets: datasets.map { |dataset| dataset_to_json(dataset) },
+        constants: Dataset.constants,
+      }
+    end
+
+    def new
+      render inertia: "pages/NewDatasetPage", props: {
+        constants: Dataset.constants,
+        datasources: Datasource.all.map { |datasource| datasource_to_json(datasource) },
+      }
+    end
+
+    def create
+      EasyML::Datasource.find_by(id: params.dig(:dataset, :datasource_id))
+      dataset = Dataset.new(dataset_params.to_h)
+
+      if dataset.save
+        redirect_to easy_ml_datasets_path, notice: "Dataset was successfully created."
+      else
+        redirect_to new_easy_ml_dataset_path, alert: dataset.errors.full_messages.join(", ")
+      end
+    end
+
+    def destroy
+      dataset = Dataset.find(params[:id])
+
+      if dataset.destroy
+        redirect_to easy_ml_datasets_path, notice: "Dataset was successfully deleted."
+      else
+        redirect_to easy_ml_datasets_path, alert: "Failed to delete dataset."
+      end
+    end
+
+    def show
+      dataset = Dataset.find(params[:id])
+
+      render inertia: "pages/DatasetDetailsPage", props: {
+        dataset: dataset_to_json(dataset),
+        constants: Dataset.constants,
+      }
+    end
+
+    def update
+      dataset = Dataset.find(params[:id])
+
+      # Iterate over columns to check and update preprocessing_steps
+      dataset_params[:columns_attributes]&.each do |_, column_attrs|
+        column_attrs[:preprocessing_steps] = nil if column_attrs.dig(:preprocessing_steps, :training, :method) == "none"
+      end
+
+      if dataset.update(dataset_params)
+        flash.now[:notice] = "Dataset configuration was successfully updated."
+        render inertia: "pages/DatasetDetailsPage", props: {
+          dataset: dataset_to_json(dataset),
+          constants: Dataset.constants,
+        }
+      else
+        flash.now[:error] = dataset.errors.full_messages.join(", ")
+        render inertia: "pages/DatasetDetailsPage", props: {
+          dataset: dataset_to_json(dataset),
+          constants: Dataset.constants,
+        }
+      end
+    end
+
+    def refresh
+      dataset = Dataset.find(params[:id])
+      dataset.refresh_async
+
+      redirect_to easy_ml_datasets_path, notice: "Dataset refresh has been initiated."
+    end
+
+    private
+
+    def preprocessing_params
+      [:method, { params: [:constant, :categorical_min, :one_hot, :ordinal_encoding, { clip: %i[min max] }] }]
+    end
+
+    def dataset_params
+      params.require(:dataset).permit(
+        :name,
+        :description,
+        :datasource_id,
+        :target,
+        drop_cols: [],
+        splitter_attributes: %i[
+          splitter_type
+          date_col
+          months_test
+          months_valid
+          train_ratio
+          test_ratio
+          valid_ratio
+          train_files
+          test_files
+          valid_files
+        ],
+        columns_attributes: [
+          :id,
+          :name,
+          :type,
+          :description,
+          :datatype,
+          :polars_datatype,
+          :is_target,
+          :hidden,
+          :drop_if_null,
+          :sample_values,
+          :_destroy,
+          {
+            preprocessing_steps: {
+              training: preprocessing_params,
+              inference: preprocessing_params,
+            },
+            statistics: %i[mean median min max null_count],
+          },
+        ],
+        features_attributes: %i[
+          id
+          name
+          feature_class
+          feature_position
+          _destroy
+        ],
+      )
+    end
+  end
+end
```