easy_ml 0.1.4 → 0.2.0.pre.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (239) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +234 -26
  3. data/Rakefile +45 -0
  4. data/app/controllers/easy_ml/application_controller.rb +67 -0
  5. data/app/controllers/easy_ml/columns_controller.rb +38 -0
  6. data/app/controllers/easy_ml/datasets_controller.rb +156 -0
  7. data/app/controllers/easy_ml/datasources_controller.rb +88 -0
  8. data/app/controllers/easy_ml/deploys_controller.rb +20 -0
  9. data/app/controllers/easy_ml/models_controller.rb +151 -0
  10. data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
  11. data/app/controllers/easy_ml/settings_controller.rb +59 -0
  12. data/app/frontend/components/AlertProvider.tsx +108 -0
  13. data/app/frontend/components/DatasetPreview.tsx +161 -0
  14. data/app/frontend/components/EmptyState.tsx +28 -0
  15. data/app/frontend/components/ModelCard.tsx +255 -0
  16. data/app/frontend/components/ModelDetails.tsx +334 -0
  17. data/app/frontend/components/ModelForm.tsx +384 -0
  18. data/app/frontend/components/Navigation.tsx +300 -0
  19. data/app/frontend/components/Pagination.tsx +72 -0
  20. data/app/frontend/components/Popover.tsx +55 -0
  21. data/app/frontend/components/PredictionStream.tsx +105 -0
  22. data/app/frontend/components/ScheduleModal.tsx +726 -0
  23. data/app/frontend/components/SearchInput.tsx +23 -0
  24. data/app/frontend/components/SearchableSelect.tsx +132 -0
  25. data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
  26. data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
  27. data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
  28. data/app/frontend/components/dataset/ColumnList.tsx +101 -0
  29. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
  30. data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
  31. data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
  32. data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
  33. data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
  34. data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
  35. data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
  36. data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
  37. data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
  38. data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
  39. data/app/frontend/components/dataset/splitters/constants.ts +77 -0
  40. data/app/frontend/components/dataset/splitters/types.ts +168 -0
  41. data/app/frontend/components/dataset/splitters/utils.ts +53 -0
  42. data/app/frontend/components/features/CodeEditor.tsx +46 -0
  43. data/app/frontend/components/features/DataPreview.tsx +150 -0
  44. data/app/frontend/components/features/FeatureCard.tsx +88 -0
  45. data/app/frontend/components/features/FeatureForm.tsx +235 -0
  46. data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
  47. data/app/frontend/components/settings/PluginSettings.tsx +81 -0
  48. data/app/frontend/components/ui/badge.tsx +44 -0
  49. data/app/frontend/components/ui/collapsible.tsx +9 -0
  50. data/app/frontend/components/ui/scroll-area.tsx +46 -0
  51. data/app/frontend/components/ui/separator.tsx +29 -0
  52. data/app/frontend/entrypoints/App.tsx +40 -0
  53. data/app/frontend/entrypoints/Application.tsx +24 -0
  54. data/app/frontend/hooks/useAutosave.ts +61 -0
  55. data/app/frontend/layouts/Layout.tsx +38 -0
  56. data/app/frontend/lib/utils.ts +6 -0
  57. data/app/frontend/mockData.ts +272 -0
  58. data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
  59. data/app/frontend/pages/DatasetsPage.tsx +261 -0
  60. data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
  61. data/app/frontend/pages/DatasourcesPage.tsx +261 -0
  62. data/app/frontend/pages/EditModelPage.tsx +45 -0
  63. data/app/frontend/pages/EditTransformationPage.tsx +56 -0
  64. data/app/frontend/pages/ModelsPage.tsx +115 -0
  65. data/app/frontend/pages/NewDatasetPage.tsx +366 -0
  66. data/app/frontend/pages/NewModelPage.tsx +45 -0
  67. data/app/frontend/pages/NewTransformationPage.tsx +43 -0
  68. data/app/frontend/pages/SettingsPage.tsx +272 -0
  69. data/app/frontend/pages/ShowModelPage.tsx +30 -0
  70. data/app/frontend/pages/TransformationsPage.tsx +95 -0
  71. data/app/frontend/styles/application.css +100 -0
  72. data/app/frontend/types/dataset.ts +146 -0
  73. data/app/frontend/types/datasource.ts +33 -0
  74. data/app/frontend/types/preprocessing.ts +1 -0
  75. data/app/frontend/types.ts +113 -0
  76. data/app/helpers/easy_ml/application_helper.rb +10 -0
  77. data/app/jobs/easy_ml/application_job.rb +21 -0
  78. data/app/jobs/easy_ml/batch_job.rb +46 -0
  79. data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
  80. data/app/jobs/easy_ml/deploy_job.rb +13 -0
  81. data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
  82. data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
  83. data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
  84. data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
  85. data/app/jobs/easy_ml/training_job.rb +62 -0
  86. data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
  87. data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
  88. data/app/models/easy_ml/cleaner.rb +82 -0
  89. data/app/models/easy_ml/column.rb +124 -0
  90. data/app/models/easy_ml/column_history.rb +30 -0
  91. data/app/models/easy_ml/column_list.rb +122 -0
  92. data/app/models/easy_ml/concerns/configurable.rb +61 -0
  93. data/app/models/easy_ml/concerns/versionable.rb +19 -0
  94. data/app/models/easy_ml/dataset.rb +767 -0
  95. data/app/models/easy_ml/dataset_history.rb +56 -0
  96. data/app/models/easy_ml/datasource.rb +182 -0
  97. data/app/models/easy_ml/datasource_history.rb +24 -0
  98. data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
  99. data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
  100. data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
  101. data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
  102. data/app/models/easy_ml/deploy.rb +114 -0
  103. data/app/models/easy_ml/event.rb +79 -0
  104. data/app/models/easy_ml/feature.rb +437 -0
  105. data/app/models/easy_ml/feature_history.rb +38 -0
  106. data/app/models/easy_ml/model.rb +575 -41
  107. data/app/models/easy_ml/model_file.rb +133 -0
  108. data/app/models/easy_ml/model_file_history.rb +24 -0
  109. data/app/models/easy_ml/model_history.rb +51 -0
  110. data/app/models/easy_ml/models/base_model.rb +58 -0
  111. data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
  112. data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
  113. data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
  114. data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
  115. data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
  116. data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
  117. data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
  118. data/app/models/easy_ml/models/xgboost.rb +544 -5
  119. data/app/models/easy_ml/prediction.rb +44 -0
  120. data/app/models/easy_ml/retraining_job.rb +278 -0
  121. data/app/models/easy_ml/retraining_run.rb +184 -0
  122. data/app/models/easy_ml/settings.rb +37 -0
  123. data/app/models/easy_ml/splitter.rb +90 -0
  124. data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
  125. data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
  126. data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
  127. data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
  128. data/app/models/easy_ml/tuner_job.rb +56 -0
  129. data/app/models/easy_ml/tuner_run.rb +31 -0
  130. data/app/models/splitter_history.rb +6 -0
  131. data/app/serializers/easy_ml/column_serializer.rb +27 -0
  132. data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
  133. data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
  134. data/app/serializers/easy_ml/feature_serializer.rb +27 -0
  135. data/app/serializers/easy_ml/model_serializer.rb +90 -0
  136. data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
  137. data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
  138. data/app/serializers/easy_ml/settings_serializer.rb +9 -0
  139. data/app/views/layouts/easy_ml/application.html.erb +15 -0
  140. data/config/initializers/resque.rb +3 -0
  141. data/config/resque-pool.yml +6 -0
  142. data/config/routes.rb +39 -0
  143. data/config/spring.rb +1 -0
  144. data/config/vite.json +15 -0
  145. data/lib/easy_ml/configuration.rb +64 -0
  146. data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
  147. data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
  148. data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
  149. data/lib/easy_ml/core/model_evaluator.rb +161 -89
  150. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
  151. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
  152. data/lib/easy_ml/core/tuner.rb +123 -62
  153. data/lib/easy_ml/core.rb +0 -3
  154. data/lib/easy_ml/core_ext/hash.rb +24 -0
  155. data/lib/easy_ml/core_ext/pathname.rb +11 -5
  156. data/lib/easy_ml/data/date_converter.rb +90 -0
  157. data/lib/easy_ml/data/filter_extensions.rb +31 -0
  158. data/lib/easy_ml/data/polars_column.rb +126 -0
  159. data/lib/easy_ml/data/polars_reader.rb +297 -0
  160. data/lib/easy_ml/data/preprocessor.rb +280 -142
  161. data/lib/easy_ml/data/simple_imputer.rb +255 -0
  162. data/lib/easy_ml/data/splits/file_split.rb +252 -0
  163. data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
  164. data/lib/easy_ml/data/splits/split.rb +95 -0
  165. data/lib/easy_ml/data/splits.rb +9 -0
  166. data/lib/easy_ml/data/statistics_learner.rb +93 -0
  167. data/lib/easy_ml/data/synced_directory.rb +341 -0
  168. data/lib/easy_ml/data.rb +6 -2
  169. data/lib/easy_ml/engine.rb +105 -6
  170. data/lib/easy_ml/feature_store.rb +227 -0
  171. data/lib/easy_ml/features.rb +61 -0
  172. data/lib/easy_ml/initializers/inflections.rb +17 -3
  173. data/lib/easy_ml/logging.rb +2 -2
  174. data/lib/easy_ml/predict.rb +74 -0
  175. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
  176. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
  177. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
  178. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
  179. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
  180. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
  181. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
  182. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
  183. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
  184. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
  185. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
  186. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
  187. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
  188. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
  189. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
  190. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
  191. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
  192. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
  193. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
  194. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
  195. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
  196. data/lib/easy_ml/support/est.rb +5 -1
  197. data/lib/easy_ml/support/file_rotate.rb +79 -15
  198. data/lib/easy_ml/support/file_support.rb +9 -0
  199. data/lib/easy_ml/support/local_file.rb +24 -0
  200. data/lib/easy_ml/support/lockable.rb +62 -0
  201. data/lib/easy_ml/support/synced_file.rb +103 -0
  202. data/lib/easy_ml/support/utc.rb +5 -1
  203. data/lib/easy_ml/support.rb +6 -3
  204. data/lib/easy_ml/version.rb +4 -1
  205. data/lib/easy_ml.rb +7 -2
  206. metadata +355 -72
  207. data/app/models/easy_ml/models.rb +0 -5
  208. data/lib/easy_ml/core/model.rb +0 -30
  209. data/lib/easy_ml/core/model_core.rb +0 -181
  210. data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
  211. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
  212. data/lib/easy_ml/core/models/xgboost.rb +0 -10
  213. data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
  214. data/lib/easy_ml/core/models.rb +0 -10
  215. data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
  216. data/lib/easy_ml/core/uploaders.rb +0 -7
  217. data/lib/easy_ml/data/dataloader.rb +0 -6
  218. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
  219. data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
  220. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
  221. data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
  222. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
  223. data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
  224. data/lib/easy_ml/data/dataset/splits.rb +0 -11
  225. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
  226. data/lib/easy_ml/data/dataset/splitters.rb +0 -9
  227. data/lib/easy_ml/data/dataset.rb +0 -430
  228. data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
  229. data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
  230. data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
  231. data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
  232. data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
  233. data/lib/easy_ml/data/datasource.rb +0 -33
  234. data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
  235. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
  236. data/lib/easy_ml/deployment.rb +0 -5
  237. data/lib/easy_ml/support/synced_directory.rb +0 -134
  238. data/lib/easy_ml/transforms.rb +0 -29
  239. /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 194dfdb77406b2509c29056d821804bbf388552d675d2fb08ee5bad296119c58
4
- data.tar.gz: e456e94893d3f51f2432355f3b6ce94feda95066f7258f57499122549bebfbe5
3
+ metadata.gz: 62ec6069cb9e47af4d2fd29668202132af589b1fb526b93c8bd4766aec5df3b1
4
+ data.tar.gz: 6e06d4e607d50b74f8d7ad5a881380d05bed58f351bc22357bfa3e5038850322
5
5
  SHA512:
6
- metadata.gz: 279ca003173f2acdd2dcc802cc96fb479658edfd86e6c773dca7615fe550aa3e80999c2c2c8e665ee5affe5fa5b10c888e78c0e426688dc2ce1978d420773722
7
- data.tar.gz: 6443bb0f2e3560a0b9f28fe5d657b58bae75a56756db3193dd86bfa0772c19a66350d46d595b4ce133ab328dfdc3893ab1b06ae78c54330ae1a3174e9fc390a7
6
+ metadata.gz: 61eed2f9f210fd5ac38af1b0972d369d73e5853491a5889c0b04c9dd776e5509514ab98ac56ab8e3bd876223a72a5463c29baa238899220acd369ee5e58c3206
7
+ data.tar.gz: d266908a752c337c7817484af4235ba78b38e47495d25fe815feafad11b820cd63ca47300619c46f56971d810b2a475a3ebb2c436359026ff5f2f5f794cb0bd1
data/README.md CHANGED
@@ -2,13 +2,33 @@
2
2
 
3
3
  # EasyML
4
4
 
5
- EasyML is a Ruby gem designed to simplify the process of building, deploying, and managing the lifecycle of machine learning models within a Ruby on Rails application. It is a plug-and-play, opinionated framework that currently supports XGBoost, with plans to expand support to a variety of models and infrastructures. EasyML aims to make deployment and lifecycle management straightforward and efficient.
5
+ ~~You can't do machine learning in Ruby.~~
6
+
7
+ Deploy models in minutes.
8
+
9
+ ## What is EasyML?
10
+
11
+ EasyML is a **low code/no code**, end-to-end machine learning framework for Ruby on Rails.
12
+
13
+ **Get productionized models in minutes.** It takes the guesswork out of:
14
+
15
+ - Preprocessing data
16
+ - Storing and batch computing features
17
+ - Training models
18
+ - Metric visualization
19
+ - Deployment and versioning
20
+ - Evaluating model performance
21
+
22
+ With a dead-simple point-and-click interface, EasyML makes it stupid easy to train and deploy.
23
+
24
+ Oh yeah, and it's open source!
6
25
 
7
26
  ## Features
8
27
 
9
- - **Plug-and-Play Architecture**: EasyML is designed to be easily extendable, allowing for the integration of various machine learning models and data sources.
10
- - **Opinionated Framework**: Provides a structured approach to model management, ensuring best practices are followed.
11
- - **Model Lifecycle On Rails**: Seamlessly integrates with Ruby on Rails, allowing simplified deployment of models to production.
28
+ - **No Code (if you want)**: EasyML ships as a Rails engine. Just mount it in your app and get started.
29
+ - **Opinionated Framework**: Provides a structured approach to data and model management, ensuring best practices are followed.
30
+ - **Model Lifecycle On Rails**: Want predictions directly from your Rails app? You can do that.
31
+ - **Easily Extensible**: Want a model that's not supported? Send a pull request!
12
32
 
13
33
  ## Current and Planned Features
14
34
 
@@ -89,6 +109,14 @@ MyTrainer.predict(customer_data: "I am worth a lot of money")
89
109
  # prediction: true!
90
110
  ```
91
111
 
112
+ ## Mount The Engine
113
+
114
+ ```ruby
115
+ Rails.application.routes.draw do
116
+ mount EasyML::Engine, at: "easy_ml"
117
+ end
118
+ ```
119
+
92
120
  ## Data Management
93
121
 
94
122
  EasyML provides a comprehensive data management system that handles all preprocessing tasks, including splitting data into train, test, and validation sets, and avoiding data leakage. The primary abstraction for data handling is the `Dataset` class, which ensures data is properly managed and prepared for machine learning tasks.
@@ -153,12 +181,12 @@ EasyML offers a variety of preprocessing features to prepare your data for machi
153
181
  }
154
182
  ```
155
183
 
156
- - **Label Encoding**: Convert categorical variables into integer labels. Use this when you have categorical data that can be ordinally encoded.
184
+ - **Ordinal Encoding**: Convert categorical variables into integer labels. Use this when you have categorical data that can be ordinally encoded.
157
185
 
158
186
  ```ruby
159
187
  loan_purpose: {
160
188
  categorical: {
161
- encode_labels: true
189
+ ordinal_encoding: true
162
190
  }
163
191
  }
164
192
  ```
@@ -170,6 +198,198 @@ EasyML offers a variety of preprocessing features to prepare your data for machi
170
198
  - **Batch Processing**: Process data in batches to handle large datasets efficiently.
171
199
  - **Null Handling**: Alert and handle null values in datasets to ensure data quality.
172
200
 
201
+ ## Feature Store
202
+
203
+ The Feature Store is a powerful component of EasyML that helps you manage, compute, and serve features for your machine learning models. Here's how to use it effectively:
204
+
205
+ ### Setting Up Features
206
+
207
+ 1. Create a `features` directory in your application:
208
+
209
+ ```bash
210
+ mkdir app/features
211
+ ```
212
+
213
+ 2. Create feature classes in this directory. Each feature should include the `EasyML::Features` module:
214
+
215
+ ```ruby
216
+ class MyFeature
217
+ include EasyML::Features
218
+
219
+ def transform(df, feature)
220
+ # Your feature transformation logic here
221
+ end
222
+
223
+ feature name: "My Feature",
224
+ description: "Description of what this feature does"
225
+ end
226
+ ```
227
+
228
+ ### Feature Types and Configurations
229
+
230
+ #### Simple Transform-Only Features
231
+
232
+ For features that can be computed using only the input columns:
233
+
234
+ ```ruby
235
+ class DidConvert
236
+ include EasyML::Features
237
+
238
+ def transform(df, feature)
239
+ df.with_column(
240
+ (Polars.col("rev") > 0).alias("did_convert")
241
+ )
242
+ end
243
+
244
+ feature name: "did_convert",
245
+ description: "Boolean indicating if conversion occurred"
246
+ end
247
+ ```
248
+
249
+ #### Batch Processing Features
250
+
251
+ For features that require processing large datasets in chunks:
252
+
253
+ ```ruby
254
+ class LastConversionTimeFeature
255
+ include EasyML::Features
256
+
257
+ def batch(reader, feature)
258
+ # Efficiently query only the company_id column for batching
259
+ # This will create batches of batch_size records (default 1000)
260
+ reader.query(select: ["company_id"], unique: true)["company_id"]
261
+ end
262
+
263
+ def fit(reader, feature, options = {})
264
+ batch_start = options.dig(:batch_start)
265
+ batch_end = options.dig(:batch_end)
266
+
267
+ # More efficient than is_in for continuous ranges
268
+ df = reader.query(
269
+ filter: Polars.col("company_id").is_between(batch_start, batch_end),
270
+ select: ["id", "company_id", "converted_at", "created_at"],
271
+ sort: ["company_id", "created_at"]
272
+ )
273
+
274
+ # For each company, find the last time they converted before each application
275
+ #
276
+ # This value will be cached in the feature store for fast inference retrieval
277
+ df.with_columns([
278
+ Polars.col("converted_at")
279
+ .shift(1)
280
+ .filter(Polars.col("converted_at").is_not_null())
281
+ .over("company_id")
282
+ .alias("last_conversion_time"),
283
+
284
+ # Also compute days since last conversion
285
+ (Polars.col("created_at") - Polars.col("last_conversion_time"))
286
+ .dt.days()
287
+ .alias("days_since_last_conversion")
288
+ ])[["id", "last_conversion_time", "days_since_last_conversion"]]
289
+ end
290
+
291
+ def transform(df, feature)
292
+ # Pull the pre-computed values from the feature store
293
+ stored_df = feature.query(filter: Polars.col("id").is_in(df["id"]))
294
+ return df if stored_df.empty?
295
+
296
+ df.join(stored_df, on: "id", how: "left")
297
+ end
298
+
299
+ feature name: "Last Conversion Time",
300
+ description: "Computes the last time a company converted before each application",
301
+ batch_size: 1000, # Process 1000 companies at a time
302
+ primary_key: "id",
303
+ cache_for: 24.hours # Cache feature values for 24 hours after running fit
304
+ end
305
+ ```
306
+
307
+ This example demonstrates several key concepts:
308
+
309
+ 1. **Efficient Batching**: The `batch` method uses the reader to lazily query only the necessary column for batching
310
+ 1. **Batching Keeps Groups Together**: All records with the same `company_id` need to be in the same batch to properly compute the feature, so we create custom batches (instead of using the primary key `id` column, which would split up companies into different batches)
311
+ 1. **Column Selection**: Only selects required columns in the reader query
312
+ 1. **Feature Computation**: Computes multiple related features (last conversion time and days since) in a single pass.
313
+ 1. **Automatic Feature Store Caching**: The feature store automatically caches feature values returned from the `fit` method
314
+
315
+ ### Performance Optimization
316
+
317
+ #### Caching During Development
318
+
319
+ Use `cache_for` to save processing time during development:
320
+
321
+ ```ruby
322
+ feature name: "My Feature",
323
+ cache_for: 24.hours # After running fit, this feature will be cached for 24 hours (unless new data is read from datasource, like S3)
324
+ ```
325
+
326
+ #### Early Returns
327
+
328
+ Always implement early returns in your transform method to avoid unnecessary reprocessing:
329
+
330
+ ```ruby
331
+ def transform(df, feature)
332
+ return df if df["required_column"].nil?
333
+ # Feature computation logic
334
+ end
335
+ ```
336
+
337
+ #### Using Reader vs DataFrame
338
+
339
+ - The Polars `reader` is a lazy reader that allows you to query data incrementally.
340
+ - If your feature includes a `batch` method or uses the `batch_size` variable, you will receive a reader instead of a dataframe in the `fit` method
341
+
342
+ ```ruby
343
+ def fit(reader, feature)
344
+ df = reader.query(select: ["column1", "column2"])
345
+ # Process only needed columns
346
+ end
347
+ ```
348
+
349
+ - If you don't have a `batch` method or don't use the `batch_size` variable, you will receive a dataframe in the `fit` method
350
+
351
+ ````ruby
352
+ def fit(df, feature)
353
+ # process directly on dataframe
354
+ end
355
+
356
+ - To ensure you get a reader instead of a dataframe, include the `batch` method
357
+
358
+ ```ruby
359
+ def batch(reader, feature)
360
+ reader.query(select: ["column1"])["column1"]
361
+ end
362
+
363
+ feature name: "My Feature", batch_size: 1_000
364
+ ````
365
+
366
+ ### Production Considerations
367
+
368
+ #### Handling Missing Data
369
+
370
+ When processing historical data:
371
+
372
+ 1. Check for missing dates:
373
+
374
+ ```ruby
375
+ def transform(df, feature)
376
+ missing_dates = feature.store.missing_dates(start_date, end_date)
377
+ return df if missing_dates.empty?
378
+
379
+ # Process only missing dates
380
+ process_dates(df, missing_dates)
381
+ end
382
+ ```
383
+
384
+ ### Best Practices
385
+
386
+ 1. Always specify a `primary_key` to allow the feature store to partition your data
387
+ 1. Use `batch/fit` to process large datasets in batches
388
+ 1. Use `batch/fit` to allow faster inference feature computation
389
+ 1. Use transform-only features when all required columns will be available on the inference dataset
390
+ 1. Use `cache_for` to save processing time during development
391
+ 1. Only query necessary columns using the reader
392
+
173
393
  ## Installation
174
394
 
175
395
  Install necessary Python dependencies
@@ -194,26 +414,6 @@ pip install optuna
194
414
  rails db:migrate
195
415
  ```
196
416
 
197
- 3. **Configure CarrierWave for S3 storage**:
198
-
199
- Ensure you have CarrierWave configured to use AWS S3. If not, add the following configuration:
200
-
201
- ```ruby
202
- # config/initializers/carrierwave.rb
203
- CarrierWave.configure do |config|
204
- config.fog_provider = 'fog/aws'
205
- config.fog_credentials = {
206
- provider: 'AWS',
207
- aws_access_key_id: ENV['AWS_ACCESS_KEY_ID'],
208
- aws_secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'],
209
- region: ENV['AWS_REGION'],
210
- }
211
- config.fog_directory = ENV['AWS_S3_BUCKET']
212
- config.fog_public = false
213
- config.storage = :fog
214
- end
215
- ```
216
-
217
417
  ## Usage
218
418
 
219
419
  To use EasyML in your Rails application, follow these steps:
@@ -251,6 +451,14 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
251
451
 
252
452
  ## Contributing
253
453
 
454
+ 1. Install the Appraisal gemfiles:
455
+
456
+ ```bash
457
+ bundle exec appraisal install
458
+ ```
459
+
460
+ 2. Ensure you run tests against all supported Rails versions
461
+
254
462
  Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/easy_ml. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/[USERNAME]/easy_ml/blob/main/CODE_OF_CONDUCT.md).
255
463
 
256
464
  ## License
data/Rakefile CHANGED
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "sprockets/railtie"
3
4
  require "bundler/gem_tasks"
4
5
  require "rspec/core/rake_task"
5
6
 
@@ -10,3 +11,47 @@ require "rubocop/rake_task"
10
11
  RuboCop::RakeTask.new
11
12
 
12
13
  task default: %i[spec rubocop]
14
+
15
+ Bundler.require(:default)
16
+
17
+ # Load your gem's code
18
+ require_relative "lib/easy_ml"
19
+
20
+ # Load the annotate tasks
21
+ require "annotate/annotate_models"
22
+
23
+ task :environment do
24
+ require "combustion"
25
+ require "sprockets"
26
+ Combustion.path = "spec/internal"
27
+ Combustion.initialize! :active_record do |config|
28
+ config.assets = ActiveSupport::OrderedOptions.new # Stub to avoid errors
29
+ config.assets.enabled = false # Set false since assets are handled by Vite
30
+ end
31
+ EasyML::Engine.eager_load!
32
+ end
33
+
34
+ namespace :easy_ml do
35
+ task annotate_models: :environment do
36
+ model_dir = File.expand_path("app/models", EasyML::Engine.root)
37
+ $LOAD_PATH.unshift(model_dir) unless $LOAD_PATH.include?(model_dir)
38
+
39
+ AnnotateModels.do_annotations(
40
+ is_rake: true,
41
+ model_dir: [EasyML::Engine.root.join("app/models/easy_ml").to_s],
42
+ root_dir: [EasyML::Engine.root.join("app/models/easy_ml").to_s],
43
+ include_modules: true, # Include modules/namespaces in the annotation
44
+ )
45
+ end
46
+
47
+ task :create_test_migrations do
48
+ require "combustion"
49
+ require "rails/generators"
50
+ require_relative "lib/easy_ml/railtie/generators/migration/migration_generator"
51
+
52
+ db_files = Dir.glob(EasyML::Engine.root.join("spec/internal/db/migrate/**/*"))
53
+
54
+ FileUtils.rm(db_files)
55
+ Rails::Generators.invoke("easy_ml:migration", [], { destination_root: EasyML::Engine.root.join("spec/internal") })
56
+ end
57
+ end
@@ -0,0 +1,67 @@
1
+ require "action_controller"
2
+
3
+ module EasyML
4
+ class ApplicationController < ActionController::Base
5
+ helper EasyML::ApplicationHelper
6
+
7
+ include InertiaRails::Controller
8
+ layout "easy_ml/application"
9
+
10
+ protect_from_forgery with: :exception
11
+
12
+ before_action :hot_reload
13
+
14
+ def hot_reload
15
+ return unless Rails.env.development? && ENV["EASY_ML_DEMO_APP"]
16
+
17
+ Dir[EasyML::Engine.root.join("lib/**/*")].select { |f| Pathname.new(f).extname == ".rb" }.each do |file|
18
+ load file
19
+ end
20
+ end
21
+
22
+ def settings_to_json(settings)
23
+ SettingsSerializer.new(settings).serializable_hash.dig(:data, :attributes)
24
+ end
25
+
26
+ def dataset_to_json(dataset)
27
+ DatasetSerializer.new(dataset).serializable_hash.dig(:data, :attributes)
28
+ end
29
+
30
+ def datasource_to_json(datasource)
31
+ DatasourceSerializer.new(datasource).serializable_hash.dig(:data, :attributes)
32
+ end
33
+
34
+ def model_to_json(model)
35
+ ModelSerializer.new(model).serializable_hash.dig(:data, :attributes)
36
+ end
37
+
38
+ def retraining_job_to_json(job)
39
+ RetrainingJobSerializer.new(job).serializable_hash.dig(:data, :attributes)
40
+ end
41
+
42
+ def retraining_run_to_json(run)
43
+ RetrainingRunSerializer.new(run).serializable_hash.dig(:data, :attributes)
44
+ end
45
+
46
+ def easy_ml_root
47
+ Rails.application.routes.routes.find { |r| r.app.app == EasyML::Engine }&.path&.spec&.to_s
48
+ end
49
+
50
+ inertia_share do
51
+ flash_messages = []
52
+
53
+ flash_messages << { type: "success", message: flash[:notice] } if flash[:notice]
54
+
55
+ flash_messages << { type: "error", message: flash[:alert] } if flash[:alert]
56
+
57
+ flash_messages << { type: "info", message: flash[:info] } if flash[:info]
58
+
59
+ {
60
+ rootPath: easy_ml_root,
61
+ url: request.path.gsub(Regexp.new(easy_ml_root), ""),
62
+ errors: session.delete(:errors) || {},
63
+ flash: flash_messages,
64
+ }
65
+ end
66
+ end
67
+ end
@@ -0,0 +1,38 @@
1
+ # == Schema Information
2
+ #
3
+ # Table name: easy_ml_columns
4
+ #
5
+ # id :bigint not null, primary key
6
+ # dataset_id :bigint not null
7
+ # name :string not null
8
+ # description :string
9
+ # datatype :string
10
+ # polars_datatype :string
11
+ # is_target :boolean
12
+ # hidden :boolean default(FALSE)
13
+ # drop_if_null :boolean default(FALSE)
14
+ # preprocessing_steps :json
15
+ # sample_values :json
16
+ # statistics :json
17
+ # created_at :datetime not null
18
+ # updated_at :datetime not null
19
+ #
20
+ module EasyML
21
+ class ColumnsController < ApplicationController
22
+ def update
23
+ @column = EasyML::Column.find(params[:id])
24
+
25
+ if @column.update(column_params)
26
+ head :ok
27
+ else
28
+ render json: { errors: @column.errors }, status: :unprocessable_entity
29
+ end
30
+ end
31
+
32
+ private
33
+
34
+ def column_params
35
+ params.require(:column).permit(:hidden)
36
+ end
37
+ end
38
+ end
@@ -0,0 +1,156 @@
1
+ # == Schema Information
2
+ #
3
+ # Table name: easy_ml_datasets
4
+ #
5
+ # id :bigint not null, primary key
6
+ # name :string not null
7
+ # description :string
8
+ # dataset_type :string
9
+ # status :string
10
+ # version :string
11
+ # datasource_id :bigint
12
+ # root_dir :string
13
+ # configuration :json
14
+ # num_rows :bigint
15
+ # workflow_status :string
16
+ # statistics :json
17
+ # preprocessor_statistics :json
18
+ # schema :json
19
+ # refreshed_at :datetime
20
+ # created_at :datetime not null
21
+ # updated_at :datetime not null
22
+ #
23
module EasyML
  # Inertia-rendered list/new/create/show/update/destroy actions for datasets,
  # plus an action that kicks off an asynchronous refresh.
  class DatasetsController < ApplicationController
    # Lists every dataset.
    def index
      datasets = Dataset.all

      render inertia: "pages/DatasetsPage", props: {
        datasets: datasets.map { |dataset| dataset_to_json(dataset) },
        constants: Dataset.constants,
      }
    end

    # Renders the new-dataset form together with all datasources.
    def new
      render inertia: "pages/NewDatasetPage", props: {
        constants: Dataset.constants,
        datasources: Datasource.all.map { |datasource| datasource_to_json(datasource) },
      }
    end

    # Creates a dataset from the permitted params and redirects with a
    # notice on success or an alert listing the validation errors on failure.
    def create
      dataset = Dataset.new(dataset_params.to_h)

      if dataset.save
        redirect_to easy_ml_datasets_path, notice: "Dataset was successfully created."
      else
        redirect_to new_easy_ml_dataset_path, alert: dataset.errors.full_messages.join(", ")
      end
    end

    # Deletes the dataset and redirects back to the list.
    def destroy
      dataset = Dataset.find(params[:id])

      if dataset.destroy
        redirect_to easy_ml_datasets_path, notice: "Dataset was successfully deleted."
      else
        redirect_to easy_ml_datasets_path, alert: "Failed to delete dataset."
      end
    end

    # Renders the details page for one dataset.
    def show
      render_details(Dataset.find(params[:id]))
    end

    # Updates the dataset configuration and re-renders the details page.
    def update
      dataset = Dataset.find(params[:id])

      # Build the permitted params once: every call to dataset_params returns
      # a fresh object, so edits made to one result are invisible to the next.
      attributes = dataset_params

      # A training method of "none" means "no preprocessing": clear the steps.
      attributes[:columns_attributes]&.each do |_, column_attrs|
        column_attrs[:preprocessing_steps] = nil if column_attrs.dig(:preprocessing_steps, :training, :method) == "none"
      end

      if dataset.update(attributes)
        flash.now[:notice] = "Dataset configuration was successfully updated."
      else
        # :alert (not :error) so the message uses the same flash key as the
        # failure paths in create and destroy.
        flash.now[:alert] = dataset.errors.full_messages.join(", ")
      end

      render_details(dataset)
    end

    # Starts an asynchronous refresh of the dataset and redirects to the list.
    def refresh
      dataset = Dataset.find(params[:id])
      dataset.refresh_async

      redirect_to easy_ml_datasets_path, notice: "Dataset refresh has been initiated."
    end

    private

    # Renders the dataset details page for +dataset+.
    def render_details(dataset)
      render inertia: "pages/DatasetDetailsPage", props: {
        dataset: dataset_to_json(dataset),
        constants: Dataset.constants,
      }
    end

    # Permit-list for one preprocessing step (used for both training and inference).
    def preprocessing_params
      [:method, { params: [:constant, :categorical_min, :one_hot, :ordinal_encoding, { clip: %i[min max] }] }]
    end

    # Strong parameters for a dataset, including nested splitter, column and
    # feature attributes. Returns a new parameters object on every call.
    def dataset_params
      params.require(:dataset).permit(
        :name,
        :description,
        :datasource_id,
        :target,
        drop_cols: [],
        splitter_attributes: %i[
          splitter_type
          date_col
          months_test
          months_valid
          train_ratio
          test_ratio
          valid_ratio
          train_files
          test_files
          valid_files
        ],
        columns_attributes: [
          :id,
          :name,
          :type,
          :description,
          :datatype,
          :polars_datatype,
          :is_target,
          :hidden,
          :drop_if_null,
          :sample_values,
          :_destroy,
          {
            preprocessing_steps: {
              training: preprocessing_params,
              inference: preprocessing_params,
            },
            statistics: %i[mean median min max null_count],
          },
        ],
        features_attributes: %i[
          id
          name
          feature_class
          feature_position
          _destroy
        ],
      )
    end
  end
end