easy_ml 0.1.4 → 0.2.0.pre.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +234 -26
- data/Rakefile +45 -0
- data/app/controllers/easy_ml/application_controller.rb +67 -0
- data/app/controllers/easy_ml/columns_controller.rb +38 -0
- data/app/controllers/easy_ml/datasets_controller.rb +156 -0
- data/app/controllers/easy_ml/datasources_controller.rb +88 -0
- data/app/controllers/easy_ml/deploys_controller.rb +20 -0
- data/app/controllers/easy_ml/models_controller.rb +151 -0
- data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
- data/app/controllers/easy_ml/settings_controller.rb +59 -0
- data/app/frontend/components/AlertProvider.tsx +108 -0
- data/app/frontend/components/DatasetPreview.tsx +161 -0
- data/app/frontend/components/EmptyState.tsx +28 -0
- data/app/frontend/components/ModelCard.tsx +255 -0
- data/app/frontend/components/ModelDetails.tsx +334 -0
- data/app/frontend/components/ModelForm.tsx +384 -0
- data/app/frontend/components/Navigation.tsx +300 -0
- data/app/frontend/components/Pagination.tsx +72 -0
- data/app/frontend/components/Popover.tsx +55 -0
- data/app/frontend/components/PredictionStream.tsx +105 -0
- data/app/frontend/components/ScheduleModal.tsx +726 -0
- data/app/frontend/components/SearchInput.tsx +23 -0
- data/app/frontend/components/SearchableSelect.tsx +132 -0
- data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
- data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
- data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
- data/app/frontend/components/dataset/ColumnList.tsx +101 -0
- data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
- data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
- data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
- data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
- data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
- data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
- data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
- data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
- data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
- data/app/frontend/components/dataset/splitters/constants.ts +77 -0
- data/app/frontend/components/dataset/splitters/types.ts +168 -0
- data/app/frontend/components/dataset/splitters/utils.ts +53 -0
- data/app/frontend/components/features/CodeEditor.tsx +46 -0
- data/app/frontend/components/features/DataPreview.tsx +150 -0
- data/app/frontend/components/features/FeatureCard.tsx +88 -0
- data/app/frontend/components/features/FeatureForm.tsx +235 -0
- data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
- data/app/frontend/components/settings/PluginSettings.tsx +81 -0
- data/app/frontend/components/ui/badge.tsx +44 -0
- data/app/frontend/components/ui/collapsible.tsx +9 -0
- data/app/frontend/components/ui/scroll-area.tsx +46 -0
- data/app/frontend/components/ui/separator.tsx +29 -0
- data/app/frontend/entrypoints/App.tsx +40 -0
- data/app/frontend/entrypoints/Application.tsx +24 -0
- data/app/frontend/hooks/useAutosave.ts +61 -0
- data/app/frontend/layouts/Layout.tsx +38 -0
- data/app/frontend/lib/utils.ts +6 -0
- data/app/frontend/mockData.ts +272 -0
- data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
- data/app/frontend/pages/DatasetsPage.tsx +261 -0
- data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
- data/app/frontend/pages/DatasourcesPage.tsx +261 -0
- data/app/frontend/pages/EditModelPage.tsx +45 -0
- data/app/frontend/pages/EditTransformationPage.tsx +56 -0
- data/app/frontend/pages/ModelsPage.tsx +115 -0
- data/app/frontend/pages/NewDatasetPage.tsx +366 -0
- data/app/frontend/pages/NewModelPage.tsx +45 -0
- data/app/frontend/pages/NewTransformationPage.tsx +43 -0
- data/app/frontend/pages/SettingsPage.tsx +272 -0
- data/app/frontend/pages/ShowModelPage.tsx +30 -0
- data/app/frontend/pages/TransformationsPage.tsx +95 -0
- data/app/frontend/styles/application.css +100 -0
- data/app/frontend/types/dataset.ts +146 -0
- data/app/frontend/types/datasource.ts +33 -0
- data/app/frontend/types/preprocessing.ts +1 -0
- data/app/frontend/types.ts +113 -0
- data/app/helpers/easy_ml/application_helper.rb +10 -0
- data/app/jobs/easy_ml/application_job.rb +21 -0
- data/app/jobs/easy_ml/batch_job.rb +46 -0
- data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
- data/app/jobs/easy_ml/deploy_job.rb +13 -0
- data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
- data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
- data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
- data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
- data/app/jobs/easy_ml/training_job.rb +62 -0
- data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
- data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
- data/app/models/easy_ml/cleaner.rb +82 -0
- data/app/models/easy_ml/column.rb +124 -0
- data/app/models/easy_ml/column_history.rb +30 -0
- data/app/models/easy_ml/column_list.rb +122 -0
- data/app/models/easy_ml/concerns/configurable.rb +61 -0
- data/app/models/easy_ml/concerns/versionable.rb +19 -0
- data/app/models/easy_ml/dataset.rb +767 -0
- data/app/models/easy_ml/dataset_history.rb +56 -0
- data/app/models/easy_ml/datasource.rb +182 -0
- data/app/models/easy_ml/datasource_history.rb +24 -0
- data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
- data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
- data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
- data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
- data/app/models/easy_ml/deploy.rb +114 -0
- data/app/models/easy_ml/event.rb +79 -0
- data/app/models/easy_ml/feature.rb +437 -0
- data/app/models/easy_ml/feature_history.rb +38 -0
- data/app/models/easy_ml/model.rb +575 -41
- data/app/models/easy_ml/model_file.rb +133 -0
- data/app/models/easy_ml/model_file_history.rb +24 -0
- data/app/models/easy_ml/model_history.rb +51 -0
- data/app/models/easy_ml/models/base_model.rb +58 -0
- data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
- data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
- data/app/models/easy_ml/models/xgboost.rb +544 -5
- data/app/models/easy_ml/prediction.rb +44 -0
- data/app/models/easy_ml/retraining_job.rb +278 -0
- data/app/models/easy_ml/retraining_run.rb +184 -0
- data/app/models/easy_ml/settings.rb +37 -0
- data/app/models/easy_ml/splitter.rb +90 -0
- data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
- data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
- data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
- data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
- data/app/models/easy_ml/tuner_job.rb +56 -0
- data/app/models/easy_ml/tuner_run.rb +31 -0
- data/app/models/splitter_history.rb +6 -0
- data/app/serializers/easy_ml/column_serializer.rb +27 -0
- data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
- data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
- data/app/serializers/easy_ml/feature_serializer.rb +27 -0
- data/app/serializers/easy_ml/model_serializer.rb +90 -0
- data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
- data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
- data/app/serializers/easy_ml/settings_serializer.rb +9 -0
- data/app/views/layouts/easy_ml/application.html.erb +15 -0
- data/config/initializers/resque.rb +3 -0
- data/config/resque-pool.yml +6 -0
- data/config/routes.rb +39 -0
- data/config/spring.rb +1 -0
- data/config/vite.json +15 -0
- data/lib/easy_ml/configuration.rb +64 -0
- data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
- data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
- data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
- data/lib/easy_ml/core/model_evaluator.rb +161 -89
- data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
- data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
- data/lib/easy_ml/core/tuner.rb +123 -62
- data/lib/easy_ml/core.rb +0 -3
- data/lib/easy_ml/core_ext/hash.rb +24 -0
- data/lib/easy_ml/core_ext/pathname.rb +11 -5
- data/lib/easy_ml/data/date_converter.rb +90 -0
- data/lib/easy_ml/data/filter_extensions.rb +31 -0
- data/lib/easy_ml/data/polars_column.rb +126 -0
- data/lib/easy_ml/data/polars_reader.rb +297 -0
- data/lib/easy_ml/data/preprocessor.rb +280 -142
- data/lib/easy_ml/data/simple_imputer.rb +255 -0
- data/lib/easy_ml/data/splits/file_split.rb +252 -0
- data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
- data/lib/easy_ml/data/splits/split.rb +95 -0
- data/lib/easy_ml/data/splits.rb +9 -0
- data/lib/easy_ml/data/statistics_learner.rb +93 -0
- data/lib/easy_ml/data/synced_directory.rb +341 -0
- data/lib/easy_ml/data.rb +6 -2
- data/lib/easy_ml/engine.rb +105 -6
- data/lib/easy_ml/feature_store.rb +227 -0
- data/lib/easy_ml/features.rb +61 -0
- data/lib/easy_ml/initializers/inflections.rb +17 -3
- data/lib/easy_ml/logging.rb +2 -2
- data/lib/easy_ml/predict.rb +74 -0
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
- data/lib/easy_ml/support/est.rb +5 -1
- data/lib/easy_ml/support/file_rotate.rb +79 -15
- data/lib/easy_ml/support/file_support.rb +9 -0
- data/lib/easy_ml/support/local_file.rb +24 -0
- data/lib/easy_ml/support/lockable.rb +62 -0
- data/lib/easy_ml/support/synced_file.rb +103 -0
- data/lib/easy_ml/support/utc.rb +5 -1
- data/lib/easy_ml/support.rb +6 -3
- data/lib/easy_ml/version.rb +4 -1
- data/lib/easy_ml.rb +7 -2
- metadata +355 -72
- data/app/models/easy_ml/models.rb +0 -5
- data/lib/easy_ml/core/model.rb +0 -30
- data/lib/easy_ml/core/model_core.rb +0 -181
- data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
- data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
- data/lib/easy_ml/core/models/xgboost.rb +0 -10
- data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
- data/lib/easy_ml/core/models.rb +0 -10
- data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
- data/lib/easy_ml/core/uploaders.rb +0 -7
- data/lib/easy_ml/data/dataloader.rb +0 -6
- data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
- data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
- data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
- data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
- data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
- data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
- data/lib/easy_ml/data/dataset/splits.rb +0 -11
- data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
- data/lib/easy_ml/data/dataset/splitters.rb +0 -9
- data/lib/easy_ml/data/dataset.rb +0 -430
- data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
- data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
- data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
- data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
- data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
- data/lib/easy_ml/data/datasource.rb +0 -33
- data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
- data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
- data/lib/easy_ml/deployment.rb +0 -5
- data/lib/easy_ml/support/synced_directory.rb +0 -134
- data/lib/easy_ml/transforms.rb +0 -29
- /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 62ec6069cb9e47af4d2fd29668202132af589b1fb526b93c8bd4766aec5df3b1
|
4
|
+
data.tar.gz: 6e06d4e607d50b74f8d7ad5a881380d05bed58f351bc22357bfa3e5038850322
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 61eed2f9f210fd5ac38af1b0972d369d73e5853491a5889c0b04c9dd776e5509514ab98ac56ab8e3bd876223a72a5463c29baa238899220acd369ee5e58c3206
|
7
|
+
data.tar.gz: d266908a752c337c7817484af4235ba78b38e47495d25fe815feafad11b820cd63ca47300619c46f56971d810b2a475a3ebb2c436359026ff5f2f5f794cb0bd1
|
data/README.md
CHANGED
@@ -2,13 +2,33 @@
|
|
2
2
|
|
3
3
|
# EasyML
|
4
4
|
|
5
|
-
|
5
|
+
~~You can't do machine learning in Ruby.~~
|
6
|
+
|
7
|
+
Deploy models in minutes.
|
8
|
+
|
9
|
+
## What is EasyML?
|
10
|
+
|
11
|
+
EasyML is a **low-code/no-code**, end-to-end machine learning framework for Ruby on Rails.
|
12
|
+
|
13
|
+
**Get productionized models in minutes.** It takes the guesswork out of:
|
14
|
+
|
15
|
+
- Preprocessing data
|
16
|
+
- Storing and batch computing features
|
17
|
+
- Training models
|
18
|
+
- Metric visualization
|
19
|
+
- Deployment and versioning
|
20
|
+
- Evaluating model performance
|
21
|
+
|
22
|
+
With a dead-simple point-and-click interface, EasyML makes it stupid easy to train and deploy.
|
23
|
+
|
24
|
+
Oh yeah, and it's open source!
|
6
25
|
|
7
26
|
## Features
|
8
27
|
|
9
|
-
- **
|
10
|
-
- **Opinionated Framework**: Provides a structured approach to model management, ensuring best practices are followed.
|
11
|
-
- **Model Lifecycle On Rails**:
|
28
|
+
- **No Code (if you want)**: EasyML ships as a Rails engine. Just mount it in your app and get started.
|
29
|
+
- **Opinionated Framework**: Provides a structured approach to data and model management, ensuring best practices are followed.
|
30
|
+
- **Model Lifecycle On Rails**: Want predictions directly from your Rails app? You can do that.
|
31
|
+
- **Easily Extensible**: Want a model that's not supported? Send a pull request!
|
12
32
|
|
13
33
|
## Current and Planned Features
|
14
34
|
|
@@ -89,6 +109,14 @@ MyTrainer.predict(customer_data: "I am worth a lot of money")
|
|
89
109
|
# prediction: true!
|
90
110
|
```
|
91
111
|
|
112
|
+
## Mount The Engine
|
113
|
+
|
114
|
+
```ruby
|
115
|
+
Rails.application.routes.draw do
|
116
|
+
mount EasyML::Engine, at: "easy_ml"
|
117
|
+
end
|
118
|
+
```
|
119
|
+
|
92
120
|
## Data Management
|
93
121
|
|
94
122
|
EasyML provides a comprehensive data management system that handles all preprocessing tasks, including splitting data into train, test, and validation sets, and avoiding data leakage. The primary abstraction for data handling is the `Dataset` class, which ensures data is properly managed and prepared for machine learning tasks.
|
@@ -153,12 +181,12 @@ EasyML offers a variety of preprocessing features to prepare your data for machi
|
|
153
181
|
}
|
154
182
|
```
|
155
183
|
|
156
|
-
- **
|
184
|
+
- **Ordinal Encoding**: Convert categorical variables into integer labels. Use this when you have categorical data that can be ordinally encoded.
|
157
185
|
|
158
186
|
```ruby
|
159
187
|
loan_purpose: {
|
160
188
|
categorical: {
|
161
|
-
|
189
|
+
ordinal_encoding: true
|
162
190
|
}
|
163
191
|
}
|
164
192
|
```
|
@@ -170,6 +198,198 @@ EasyML offers a variety of preprocessing features to prepare your data for machi
|
|
170
198
|
- **Batch Processing**: Process data in batches to handle large datasets efficiently.
|
171
199
|
- **Null Handling**: Alert and handle null values in datasets to ensure data quality.
|
172
200
|
|
201
|
+
## Feature Store
|
202
|
+
|
203
|
+
The Feature Store is a powerful component of EasyML that helps you manage, compute, and serve features for your machine learning models. Here's how to use it effectively:
|
204
|
+
|
205
|
+
### Setting Up Features
|
206
|
+
|
207
|
+
1. Create a `features` directory in your application:
|
208
|
+
|
209
|
+
```bash
|
210
|
+
mkdir app/features
|
211
|
+
```
|
212
|
+
|
213
|
+
2. Create feature classes in this directory. Each feature should include the `EasyML::Features` module:
|
214
|
+
|
215
|
+
```ruby
|
216
|
+
class MyFeature
|
217
|
+
include EasyML::Features
|
218
|
+
|
219
|
+
def transform(df, feature)
|
220
|
+
# Your feature transformation logic here
|
221
|
+
end
|
222
|
+
|
223
|
+
feature name: "My Feature",
|
224
|
+
description: "Description of what this feature does"
|
225
|
+
end
|
226
|
+
```
|
227
|
+
|
228
|
+
### Feature Types and Configurations
|
229
|
+
|
230
|
+
#### Simple Transform-Only Features
|
231
|
+
|
232
|
+
For features that can be computed using only the input columns:
|
233
|
+
|
234
|
+
```ruby
|
235
|
+
class DidConvert
|
236
|
+
include EasyML::Features
|
237
|
+
|
238
|
+
def transform(df, feature)
|
239
|
+
df.with_column(
|
240
|
+
(Polars.col("rev") > 0).alias("did_convert")
|
241
|
+
)
|
242
|
+
end
|
243
|
+
|
244
|
+
feature name: "did_convert",
|
245
|
+
description: "Boolean indicating if conversion occurred"
|
246
|
+
end
|
247
|
+
```
|
248
|
+
|
249
|
+
#### Batch Processing Features
|
250
|
+
|
251
|
+
For features that require processing large datasets in chunks:
|
252
|
+
|
253
|
+
```ruby
|
254
|
+
class LastConversionTimeFeature
|
255
|
+
include EasyML::Features
|
256
|
+
|
257
|
+
def batch(reader, feature)
|
258
|
+
# Efficiently query only the company_id column for batching
|
259
|
+
# This will create batches of batch_size records (default 1000)
|
260
|
+
reader.query(select: ["company_id"], unique: true)["company_id"]
|
261
|
+
end
|
262
|
+
|
263
|
+
def fit(reader, feature, options = {})
|
264
|
+
batch_start = options.dig(:batch_start)
|
265
|
+
batch_end = options.dig(:batch_end)
|
266
|
+
|
267
|
+
# More efficient than is_in for continuous ranges
|
268
|
+
df = reader.query(
|
269
|
+
filter: Polars.col("company_id").is_between(batch_start, batch_end),
|
270
|
+
select: ["id", "company_id", "converted_at", "created_at"],
|
271
|
+
sort: ["company_id", "created_at"]
|
272
|
+
)
|
273
|
+
|
274
|
+
# For each company, find the last time they converted before each application
|
275
|
+
#
|
276
|
+
# This value will be cached in the feature store for fast inference retrieval
|
277
|
+
df.with_columns([
|
278
|
+
Polars.col("converted_at")
|
279
|
+
.shift(1)
|
280
|
+
.filter(Polars.col("converted_at").is_not_null())
|
281
|
+
.over("company_id")
|
282
|
+
.alias("last_conversion_time"),
|
283
|
+
|
284
|
+
# Also compute days since last conversion
|
285
|
+
(Polars.col("created_at") - Polars.col("last_conversion_time"))
|
286
|
+
.dt.days()
|
287
|
+
.alias("days_since_last_conversion")
|
288
|
+
])[["id", "last_conversion_time", "days_since_last_conversion"]]
|
289
|
+
end
|
290
|
+
|
291
|
+
def transform(df, feature)
|
292
|
+
# Pull the pre-computed values from the feature store
|
293
|
+
stored_df = feature.query(filter: Polars.col("id").is_in(df["id"]))
|
294
|
+
return df if stored_df.empty?
|
295
|
+
|
296
|
+
df.join(stored_df, on: "id", how: "left")
|
297
|
+
end
|
298
|
+
|
299
|
+
feature name: "Last Conversion Time",
|
300
|
+
description: "Computes the last time a company converted before each application",
|
301
|
+
batch_size: 1000, # Process 1000 companies at a time
|
302
|
+
primary_key: "id",
|
303
|
+
cache_for: 24.hours # Cache feature values for 24 hours after running fit
|
304
|
+
end
|
305
|
+
```
|
306
|
+
|
307
|
+
This example demonstrates several key concepts:
|
308
|
+
|
309
|
+
1. **Efficient Batching**: The `batch` method uses the reader to lazily query only the necessary column for batching
|
310
|
+
1. **Keeps Groups Together**: All records with the same `company_id` must land in the same batch for the feature to be computed correctly, so the `batch` method returns `company_id` values to batch on (instead of the primary key `id` column, which would split a company's records across different batches)
|
311
|
+
1. **Column Selection**: Only selects required columns in the reader query
|
312
|
+
1. **Feature Computation**: Computes multiple related features (last conversion time and days since) in a single pass.
|
313
|
+
1. **Automatic Feature Store Caching**: The feature store automatically caches feature values returned from the `fit` method
|
314
|
+
|
315
|
+
### Performance Optimization
|
316
|
+
|
317
|
+
#### Caching During Development
|
318
|
+
|
319
|
+
Use `cache_for` to save processing time during development:
|
320
|
+
|
321
|
+
```ruby
|
322
|
+
feature name: "My Feature",
|
323
|
+
cache_for: 24.hours # After running fit, this feature will be cached for 24 hours (unless new data is read from datasource, like S3)
|
324
|
+
```
|
325
|
+
|
326
|
+
#### Early Returns
|
327
|
+
|
328
|
+
Always implement early returns in your transform method to avoid unnecessary reprocessing:
|
329
|
+
|
330
|
+
```ruby
|
331
|
+
def transform(df, feature)
|
332
|
+
return df if df["required_column"].nil?
|
333
|
+
# Feature computation logic
|
334
|
+
end
|
335
|
+
```
|
336
|
+
|
337
|
+
#### Using Reader vs DataFrame
|
338
|
+
|
339
|
+
- The Polars `reader` is a lazy reader that allows you to query data incrementally.
|
340
|
+
- If your feature includes a `batch` method or uses the `batch_size` variable, you will receive a reader instead of a dataframe in the `fit` method
|
341
|
+
|
342
|
+
```ruby
|
343
|
+
def fit(reader, feature)
|
344
|
+
df = reader.query(select: ["column1", "column2"])
|
345
|
+
# Process only needed columns
|
346
|
+
end
|
347
|
+
```
|
348
|
+
|
349
|
+
- If you don't have a `batch` method or don't use the `batch_size` variable, you will receive a dataframe in the `fit` method
|
350
|
+
|
351
|
+
```ruby
|
352
|
+
def fit(df, feature)
|
353
|
+
# process directly on dataframe
|
354
|
+
end
|
355
|
+
```
|
356
|
+
- To ensure you get a reader instead of a dataframe, include the `batch` method
|
357
|
+
|
358
|
+
```ruby
|
359
|
+
def batch(reader, feature)
|
360
|
+
reader.query(select: ["column1"])["column1"]
|
361
|
+
end
|
362
|
+
|
363
|
+
feature name: "My Feature", batch_size: 1_000
|
364
|
+
```
|
365
|
+
|
366
|
+
### Production Considerations
|
367
|
+
|
368
|
+
#### Handling Missing Data
|
369
|
+
|
370
|
+
When processing historical data:
|
371
|
+
|
372
|
+
1. Check for missing dates:
|
373
|
+
|
374
|
+
```ruby
|
375
|
+
def transform(df, feature)
|
376
|
+
missing_dates = feature.store.missing_dates(start_date, end_date)
|
377
|
+
return df if missing_dates.empty?
|
378
|
+
|
379
|
+
# Process only missing dates
|
380
|
+
process_dates(df, missing_dates)
|
381
|
+
end
|
382
|
+
```
|
383
|
+
|
384
|
+
### Best Practices
|
385
|
+
|
386
|
+
1. Always specify a `primary_key` to allow the feature store to partition your data
|
387
|
+
1. Use `batch/fit` to process large datasets in batches
|
388
|
+
1. Use `batch/fit` to allow faster inference feature computation
|
389
|
+
1. Use transform-only features when all required columns will be available on the inference dataset
|
390
|
+
1. Use `cache_for` to save processing time during development
|
391
|
+
1. Only query necessary columns using the reader
|
392
|
+
|
173
393
|
## Installation
|
174
394
|
|
175
395
|
Install necessary Python dependencies
|
@@ -194,26 +414,6 @@ pip install optuna
|
|
194
414
|
rails db:migrate
|
195
415
|
```
|
196
416
|
|
197
|
-
3. **Configure CarrierWave for S3 storage**:
|
198
|
-
|
199
|
-
Ensure you have CarrierWave configured to use AWS S3. If not, add the following configuration:
|
200
|
-
|
201
|
-
```ruby
|
202
|
-
# config/initializers/carrierwave.rb
|
203
|
-
CarrierWave.configure do |config|
|
204
|
-
config.fog_provider = 'fog/aws'
|
205
|
-
config.fog_credentials = {
|
206
|
-
provider: 'AWS',
|
207
|
-
aws_access_key_id: ENV['AWS_ACCESS_KEY_ID'],
|
208
|
-
aws_secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'],
|
209
|
-
region: ENV['AWS_REGION'],
|
210
|
-
}
|
211
|
-
config.fog_directory = ENV['AWS_S3_BUCKET']
|
212
|
-
config.fog_public = false
|
213
|
-
config.storage = :fog
|
214
|
-
end
|
215
|
-
```
|
216
|
-
|
217
417
|
## Usage
|
218
418
|
|
219
419
|
To use EasyML in your Rails application, follow these steps:
|
@@ -251,6 +451,14 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
251
451
|
|
252
452
|
## Contributing
|
253
453
|
|
454
|
+
1. Install Appraisals gemfiles:
|
455
|
+
|
456
|
+
```bash
|
457
|
+
bundle exec appraisal install
|
458
|
+
```
|
459
|
+
|
460
|
+
2. Ensure you run tests against all supported Rails versions
|
461
|
+
|
254
462
|
Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/easy_ml. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/[USERNAME]/easy_ml/blob/main/CODE_OF_CONDUCT.md).
|
255
463
|
|
256
464
|
## License
|
data/Rakefile
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require "sprockets/railtie"
|
3
4
|
require "bundler/gem_tasks"
|
4
5
|
require "rspec/core/rake_task"
|
5
6
|
|
@@ -10,3 +11,47 @@ require "rubocop/rake_task"
|
|
10
11
|
RuboCop::RakeTask.new
|
11
12
|
|
12
13
|
task default: %i[spec rubocop]
|
14
|
+
|
15
|
+
Bundler.require(:default)
|
16
|
+
|
17
|
+
# Load your gem's code
|
18
|
+
require_relative "lib/easy_ml"
|
19
|
+
|
20
|
+
# Load the annotate tasks
|
21
|
+
require "annotate/annotate_models"
|
22
|
+
|
23
|
+
task :environment do
|
24
|
+
require "combustion"
|
25
|
+
require "sprockets"
|
26
|
+
Combustion.path = "spec/internal"
|
27
|
+
Combustion.initialize! :active_record do |config|
|
28
|
+
config.assets = ActiveSupport::OrderedOptions.new # Stub to avoid errors
|
29
|
+
config.assets.enabled = false # Set false since assets are handled by Vite
|
30
|
+
end
|
31
|
+
EasyML::Engine.eager_load!
|
32
|
+
end
|
33
|
+
|
34
|
+
namespace :easy_ml do
|
35
|
+
task annotate_models: :environment do
|
36
|
+
model_dir = File.expand_path("app/models", EasyML::Engine.root)
|
37
|
+
$LOAD_PATH.unshift(model_dir) unless $LOAD_PATH.include?(model_dir)
|
38
|
+
|
39
|
+
AnnotateModels.do_annotations(
|
40
|
+
is_rake: true,
|
41
|
+
model_dir: [EasyML::Engine.root.join("app/models/easy_ml").to_s],
|
42
|
+
root_dir: [EasyML::Engine.root.join("app/models/easy_ml").to_s],
|
43
|
+
include_modules: true, # Include modules/namespaces in the annotation
|
44
|
+
)
|
45
|
+
end
|
46
|
+
|
47
|
+
task :create_test_migrations do
|
48
|
+
require "combustion"
|
49
|
+
require "rails/generators"
|
50
|
+
require_relative "lib/easy_ml/railtie/generators/migration/migration_generator"
|
51
|
+
|
52
|
+
db_files = Dir.glob(EasyML::Engine.root.join("spec/internal/db/migrate/**/*"))
|
53
|
+
|
54
|
+
FileUtils.rm(db_files)
|
55
|
+
Rails::Generators.invoke("easy_ml:migration", [], { destination_root: EasyML::Engine.root.join("spec/internal") })
|
56
|
+
end
|
57
|
+
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require "action_controller"
|
2
|
+
|
3
|
+
module EasyML
|
4
|
+
class ApplicationController < ActionController::Base
|
5
|
+
helper EasyML::ApplicationHelper
|
6
|
+
|
7
|
+
include InertiaRails::Controller
|
8
|
+
layout "easy_ml/application"
|
9
|
+
|
10
|
+
protect_from_forgery with: :exception
|
11
|
+
|
12
|
+
before_action :hot_reload
|
13
|
+
|
14
|
+
def hot_reload
|
15
|
+
return unless Rails.env.development? && ENV["EASY_ML_DEMO_APP"]
|
16
|
+
|
17
|
+
Dir[EasyML::Engine.root.join("lib/**/*")].select { |f| Pathname.new(f).extname == ".rb" }.each do |file|
|
18
|
+
load file
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
def settings_to_json(settings)
|
23
|
+
SettingsSerializer.new(settings).serializable_hash.dig(:data, :attributes)
|
24
|
+
end
|
25
|
+
|
26
|
+
def dataset_to_json(dataset)
|
27
|
+
DatasetSerializer.new(dataset).serializable_hash.dig(:data, :attributes)
|
28
|
+
end
|
29
|
+
|
30
|
+
def datasource_to_json(datasource)
|
31
|
+
DatasourceSerializer.new(datasource).serializable_hash.dig(:data, :attributes)
|
32
|
+
end
|
33
|
+
|
34
|
+
def model_to_json(model)
|
35
|
+
ModelSerializer.new(model).serializable_hash.dig(:data, :attributes)
|
36
|
+
end
|
37
|
+
|
38
|
+
def retraining_job_to_json(job)
|
39
|
+
RetrainingJobSerializer.new(job).serializable_hash.dig(:data, :attributes)
|
40
|
+
end
|
41
|
+
|
42
|
+
def retraining_run_to_json(run)
|
43
|
+
RetrainingRunSerializer.new(run).serializable_hash.dig(:data, :attributes)
|
44
|
+
end
|
45
|
+
|
46
|
+
def easy_ml_root
|
47
|
+
Rails.application.routes.routes.find { |r| r.app.app == EasyML::Engine }&.path&.spec&.to_s
|
48
|
+
end
|
49
|
+
|
50
|
+
inertia_share do
|
51
|
+
flash_messages = []
|
52
|
+
|
53
|
+
flash_messages << { type: "success", message: flash[:notice] } if flash[:notice]
|
54
|
+
|
55
|
+
flash_messages << { type: "error", message: flash[:alert] } if flash[:alert]
|
56
|
+
|
57
|
+
flash_messages << { type: "info", message: flash[:info] } if flash[:info]
|
58
|
+
|
59
|
+
{
|
60
|
+
rootPath: easy_ml_root,
|
61
|
+
url: request.path.gsub(Regexp.new(easy_ml_root), ""),
|
62
|
+
errors: session.delete(:errors) || {},
|
63
|
+
flash: flash_messages,
|
64
|
+
}
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# == Schema Information
|
2
|
+
#
|
3
|
+
# Table name: easy_ml_columns
|
4
|
+
#
|
5
|
+
# id :bigint not null, primary key
|
6
|
+
# dataset_id :bigint not null
|
7
|
+
# name :string not null
|
8
|
+
# description :string
|
9
|
+
# datatype :string
|
10
|
+
# polars_datatype :string
|
11
|
+
# is_target :boolean
|
12
|
+
# hidden :boolean default(FALSE)
|
13
|
+
# drop_if_null :boolean default(FALSE)
|
14
|
+
# preprocessing_steps :json
|
15
|
+
# sample_values :json
|
16
|
+
# statistics :json
|
17
|
+
# created_at :datetime not null
|
18
|
+
# updated_at :datetime not null
|
19
|
+
#
|
20
|
+
module EasyML
  # JSON endpoint for changing a single dataset column.
  class ColumnsController < ApplicationController
    # Applies the permitted attributes to the column identified by
    # params[:id]. Responds with an empty 200 on success, otherwise with a
    # 422 carrying the record's errors as JSON.
    def update
      @column = EasyML::Column.find(params[:id])
      return head(:ok) if @column.update(column_params)

      render json: { errors: @column.errors }, status: :unprocessable_entity
    end

    private

    # Only the :hidden flag may be changed through this endpoint.
    def column_params
      column_payload = params.require(:column)
      column_payload.permit(:hidden)
    end
  end
end
|
@@ -0,0 +1,156 @@
|
|
1
|
+
# == Schema Information
|
2
|
+
#
|
3
|
+
# Table name: easy_ml_datasets
|
4
|
+
#
|
5
|
+
# id :bigint not null, primary key
|
6
|
+
# name :string not null
|
7
|
+
# description :string
|
8
|
+
# dataset_type :string
|
9
|
+
# status :string
|
10
|
+
# version :string
|
11
|
+
# datasource_id :bigint
|
12
|
+
# root_dir :string
|
13
|
+
# configuration :json
|
14
|
+
# num_rows :bigint
|
15
|
+
# workflow_status :string
|
16
|
+
# statistics :json
|
17
|
+
# preprocessor_statistics :json
|
18
|
+
# schema :json
|
19
|
+
# refreshed_at :datetime
|
20
|
+
# created_at :datetime not null
|
21
|
+
# updated_at :datetime not null
|
22
|
+
#
|
23
|
+
module EasyML
  # Inertia-backed endpoints for listing, creating, configuring, refreshing
  # and deleting datasets.
  class DatasetsController < ApplicationController
    # Renders the datasets index page with every dataset serialized.
    def index
      datasets = Dataset.all

      render inertia: "pages/DatasetsPage", props: {
        datasets: datasets.map { |dataset| dataset_to_json(dataset) },
        constants: Dataset.constants,
      }
    end

    # Renders the new-dataset form along with all available datasources.
    def new
      render inertia: "pages/NewDatasetPage", props: {
        constants: Dataset.constants,
        datasources: Datasource.all.map { |datasource| datasource_to_json(datasource) },
      }
    end

    # Creates a dataset from the permitted params and redirects to the index
    # on success, or back to the form with the validation messages on failure.
    def create
      dataset = Dataset.new(dataset_params.to_h)

      if dataset.save
        redirect_to easy_ml_datasets_path, notice: "Dataset was successfully created."
      else
        redirect_to new_easy_ml_dataset_path, alert: dataset.errors.full_messages.join(", ")
      end
    end

    # Deletes the dataset identified by params[:id] and redirects to the index.
    def destroy
      dataset = Dataset.find(params[:id])

      if dataset.destroy
        redirect_to easy_ml_datasets_path, notice: "Dataset was successfully deleted."
      else
        redirect_to easy_ml_datasets_path, alert: "Failed to delete dataset."
      end
    end

    # Renders the details page for the dataset identified by params[:id].
    def show
      render_details(Dataset.find(params[:id]))
    end

    # Updates the dataset configuration and re-renders the details page.
    #
    # Failures are reported under flash.now[:alert]; of the flash keys, the
    # shared Inertia props only expose :notice, :alert and :info.
    def update
      dataset = Dataset.find(params[:id])

      # dataset_params builds a new permitted-params object on every call, so
      # it is built once here and the same normalized object is handed to
      # #update. Normalizing one copy and saving another would be a no-op.
      attrs = normalize_preprocessing_steps(dataset_params)

      if dataset.update(attrs)
        flash.now[:notice] = "Dataset configuration was successfully updated."
      else
        flash.now[:alert] = dataset.errors.full_messages.join(", ")
      end

      render_details(dataset)
    end

    # Kicks off an asynchronous refresh and redirects to the index.
    def refresh
      dataset = Dataset.find(params[:id])
      dataset.refresh_async

      redirect_to easy_ml_datasets_path, notice: "Dataset refresh has been initiated."
    end

    private

    # Renders the dataset details page for +dataset+.
    def render_details(dataset)
      render inertia: "pages/DatasetDetailsPage", props: {
        dataset: dataset_to_json(dataset),
        constants: Dataset.constants,
      }
    end

    # Clears preprocessing_steps on every column whose training method is
    # "none". Mutates and returns +attrs+. Accepts columns_attributes both as
    # an array of column hashes and as an index-keyed hash.
    def normalize_preprocessing_steps(attrs)
      columns = attrs[:columns_attributes]
      return attrs unless columns

      column_sets = columns.respond_to?(:values) ? columns.values : Array(columns)
      column_sets.each do |column_attrs|
        next unless column_attrs.respond_to?(:dig)
        next unless column_attrs.dig(:preprocessing_steps, :training, :method) == "none"

        column_attrs[:preprocessing_steps] = nil
      end

      attrs
    end

    # Permit filter shared by the training and inference preprocessing steps.
    def preprocessing_params
      [:method, { params: [:constant, :categorical_min, :one_hot, :ordinal_encoding, { clip: %i[min max] }] }]
    end

    # Strong-parameters whitelist for dataset create/update, including the
    # nested splitter, column and feature attributes.
    def dataset_params
      params.require(:dataset).permit(
        :name,
        :description,
        :datasource_id,
        :target,
        drop_cols: [],
        splitter_attributes: %i[
          splitter_type
          date_col
          months_test
          months_valid
          train_ratio
          test_ratio
          valid_ratio
          train_files
          test_files
          valid_files
        ],
        columns_attributes: [
          :id,
          :name,
          :type,
          :description,
          :datatype,
          :polars_datatype,
          :is_target,
          :hidden,
          :drop_if_null,
          :sample_values,
          :_destroy,
          {
            preprocessing_steps: {
              training: preprocessing_params,
              inference: preprocessing_params,
            },
            statistics: %i[mean median min max null_count],
          },
        ],
        features_attributes: %i[
          id
          name
          feature_class
          feature_position
          _destroy
        ],
      )
    end
  end
end
|