easy_ml 0.1.3 → 0.2.0.pre.rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (239) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +234 -26
  3. data/Rakefile +45 -0
  4. data/app/controllers/easy_ml/application_controller.rb +67 -0
  5. data/app/controllers/easy_ml/columns_controller.rb +38 -0
  6. data/app/controllers/easy_ml/datasets_controller.rb +156 -0
  7. data/app/controllers/easy_ml/datasources_controller.rb +88 -0
  8. data/app/controllers/easy_ml/deploys_controller.rb +20 -0
  9. data/app/controllers/easy_ml/models_controller.rb +151 -0
  10. data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
  11. data/app/controllers/easy_ml/settings_controller.rb +59 -0
  12. data/app/frontend/components/AlertProvider.tsx +108 -0
  13. data/app/frontend/components/DatasetPreview.tsx +161 -0
  14. data/app/frontend/components/EmptyState.tsx +28 -0
  15. data/app/frontend/components/ModelCard.tsx +255 -0
  16. data/app/frontend/components/ModelDetails.tsx +334 -0
  17. data/app/frontend/components/ModelForm.tsx +384 -0
  18. data/app/frontend/components/Navigation.tsx +300 -0
  19. data/app/frontend/components/Pagination.tsx +72 -0
  20. data/app/frontend/components/Popover.tsx +55 -0
  21. data/app/frontend/components/PredictionStream.tsx +105 -0
  22. data/app/frontend/components/ScheduleModal.tsx +726 -0
  23. data/app/frontend/components/SearchInput.tsx +23 -0
  24. data/app/frontend/components/SearchableSelect.tsx +132 -0
  25. data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
  26. data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
  27. data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
  28. data/app/frontend/components/dataset/ColumnList.tsx +101 -0
  29. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
  30. data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
  31. data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
  32. data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
  33. data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
  34. data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
  35. data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
  36. data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
  37. data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
  38. data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
  39. data/app/frontend/components/dataset/splitters/constants.ts +77 -0
  40. data/app/frontend/components/dataset/splitters/types.ts +168 -0
  41. data/app/frontend/components/dataset/splitters/utils.ts +53 -0
  42. data/app/frontend/components/features/CodeEditor.tsx +46 -0
  43. data/app/frontend/components/features/DataPreview.tsx +150 -0
  44. data/app/frontend/components/features/FeatureCard.tsx +88 -0
  45. data/app/frontend/components/features/FeatureForm.tsx +235 -0
  46. data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
  47. data/app/frontend/components/settings/PluginSettings.tsx +81 -0
  48. data/app/frontend/components/ui/badge.tsx +44 -0
  49. data/app/frontend/components/ui/collapsible.tsx +9 -0
  50. data/app/frontend/components/ui/scroll-area.tsx +46 -0
  51. data/app/frontend/components/ui/separator.tsx +29 -0
  52. data/app/frontend/entrypoints/App.tsx +40 -0
  53. data/app/frontend/entrypoints/Application.tsx +24 -0
  54. data/app/frontend/hooks/useAutosave.ts +61 -0
  55. data/app/frontend/layouts/Layout.tsx +38 -0
  56. data/app/frontend/lib/utils.ts +6 -0
  57. data/app/frontend/mockData.ts +272 -0
  58. data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
  59. data/app/frontend/pages/DatasetsPage.tsx +261 -0
  60. data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
  61. data/app/frontend/pages/DatasourcesPage.tsx +261 -0
  62. data/app/frontend/pages/EditModelPage.tsx +45 -0
  63. data/app/frontend/pages/EditTransformationPage.tsx +56 -0
  64. data/app/frontend/pages/ModelsPage.tsx +115 -0
  65. data/app/frontend/pages/NewDatasetPage.tsx +366 -0
  66. data/app/frontend/pages/NewModelPage.tsx +45 -0
  67. data/app/frontend/pages/NewTransformationPage.tsx +43 -0
  68. data/app/frontend/pages/SettingsPage.tsx +272 -0
  69. data/app/frontend/pages/ShowModelPage.tsx +30 -0
  70. data/app/frontend/pages/TransformationsPage.tsx +95 -0
  71. data/app/frontend/styles/application.css +100 -0
  72. data/app/frontend/types/dataset.ts +146 -0
  73. data/app/frontend/types/datasource.ts +33 -0
  74. data/app/frontend/types/preprocessing.ts +1 -0
  75. data/app/frontend/types.ts +113 -0
  76. data/app/helpers/easy_ml/application_helper.rb +10 -0
  77. data/app/jobs/easy_ml/application_job.rb +21 -0
  78. data/app/jobs/easy_ml/batch_job.rb +46 -0
  79. data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
  80. data/app/jobs/easy_ml/deploy_job.rb +13 -0
  81. data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
  82. data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
  83. data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
  84. data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
  85. data/app/jobs/easy_ml/training_job.rb +62 -0
  86. data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
  87. data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
  88. data/app/models/easy_ml/cleaner.rb +82 -0
  89. data/app/models/easy_ml/column.rb +124 -0
  90. data/app/models/easy_ml/column_history.rb +30 -0
  91. data/app/models/easy_ml/column_list.rb +122 -0
  92. data/app/models/easy_ml/concerns/configurable.rb +61 -0
  93. data/app/models/easy_ml/concerns/versionable.rb +19 -0
  94. data/app/models/easy_ml/dataset.rb +767 -0
  95. data/app/models/easy_ml/dataset_history.rb +56 -0
  96. data/app/models/easy_ml/datasource.rb +182 -0
  97. data/app/models/easy_ml/datasource_history.rb +24 -0
  98. data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
  99. data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
  100. data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
  101. data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
  102. data/app/models/easy_ml/deploy.rb +114 -0
  103. data/app/models/easy_ml/event.rb +79 -0
  104. data/app/models/easy_ml/feature.rb +437 -0
  105. data/app/models/easy_ml/feature_history.rb +38 -0
  106. data/app/models/easy_ml/model.rb +575 -41
  107. data/app/models/easy_ml/model_file.rb +133 -0
  108. data/app/models/easy_ml/model_file_history.rb +24 -0
  109. data/app/models/easy_ml/model_history.rb +51 -0
  110. data/app/models/easy_ml/models/base_model.rb +58 -0
  111. data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
  112. data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
  113. data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
  114. data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
  115. data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
  116. data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
  117. data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
  118. data/app/models/easy_ml/models/xgboost.rb +544 -4
  119. data/app/models/easy_ml/prediction.rb +44 -0
  120. data/app/models/easy_ml/retraining_job.rb +278 -0
  121. data/app/models/easy_ml/retraining_run.rb +184 -0
  122. data/app/models/easy_ml/settings.rb +37 -0
  123. data/app/models/easy_ml/splitter.rb +90 -0
  124. data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
  125. data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
  126. data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
  127. data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
  128. data/app/models/easy_ml/tuner_job.rb +56 -0
  129. data/app/models/easy_ml/tuner_run.rb +31 -0
  130. data/app/models/splitter_history.rb +6 -0
  131. data/app/serializers/easy_ml/column_serializer.rb +27 -0
  132. data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
  133. data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
  134. data/app/serializers/easy_ml/feature_serializer.rb +27 -0
  135. data/app/serializers/easy_ml/model_serializer.rb +90 -0
  136. data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
  137. data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
  138. data/app/serializers/easy_ml/settings_serializer.rb +9 -0
  139. data/app/views/layouts/easy_ml/application.html.erb +15 -0
  140. data/config/initializers/resque.rb +3 -0
  141. data/config/resque-pool.yml +6 -0
  142. data/config/routes.rb +39 -0
  143. data/config/spring.rb +1 -0
  144. data/config/vite.json +15 -0
  145. data/lib/easy_ml/configuration.rb +64 -0
  146. data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
  147. data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
  148. data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
  149. data/lib/easy_ml/core/model_evaluator.rb +161 -89
  150. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
  151. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
  152. data/lib/easy_ml/core/tuner.rb +123 -62
  153. data/lib/easy_ml/core.rb +0 -3
  154. data/lib/easy_ml/core_ext/hash.rb +24 -0
  155. data/lib/easy_ml/core_ext/pathname.rb +11 -5
  156. data/lib/easy_ml/data/date_converter.rb +90 -0
  157. data/lib/easy_ml/data/filter_extensions.rb +31 -0
  158. data/lib/easy_ml/data/polars_column.rb +126 -0
  159. data/lib/easy_ml/data/polars_reader.rb +297 -0
  160. data/lib/easy_ml/data/preprocessor.rb +280 -142
  161. data/lib/easy_ml/data/simple_imputer.rb +255 -0
  162. data/lib/easy_ml/data/splits/file_split.rb +252 -0
  163. data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
  164. data/lib/easy_ml/data/splits/split.rb +95 -0
  165. data/lib/easy_ml/data/splits.rb +9 -0
  166. data/lib/easy_ml/data/statistics_learner.rb +93 -0
  167. data/lib/easy_ml/data/synced_directory.rb +341 -0
  168. data/lib/easy_ml/data.rb +6 -2
  169. data/lib/easy_ml/engine.rb +105 -6
  170. data/lib/easy_ml/feature_store.rb +227 -0
  171. data/lib/easy_ml/features.rb +61 -0
  172. data/lib/easy_ml/initializers/inflections.rb +17 -3
  173. data/lib/easy_ml/logging.rb +2 -2
  174. data/lib/easy_ml/predict.rb +74 -0
  175. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
  176. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
  177. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
  178. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
  179. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
  180. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
  181. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
  182. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
  183. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
  184. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
  185. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
  186. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
  187. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
  188. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
  189. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
  190. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
  191. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
  192. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
  193. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
  194. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
  195. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
  196. data/lib/easy_ml/support/est.rb +5 -1
  197. data/lib/easy_ml/support/file_rotate.rb +79 -15
  198. data/lib/easy_ml/support/file_support.rb +9 -0
  199. data/lib/easy_ml/support/local_file.rb +24 -0
  200. data/lib/easy_ml/support/lockable.rb +62 -0
  201. data/lib/easy_ml/support/synced_file.rb +103 -0
  202. data/lib/easy_ml/support/utc.rb +5 -1
  203. data/lib/easy_ml/support.rb +6 -3
  204. data/lib/easy_ml/version.rb +4 -1
  205. data/lib/easy_ml.rb +7 -2
  206. metadata +355 -72
  207. data/app/models/easy_ml/models.rb +0 -5
  208. data/lib/easy_ml/core/model.rb +0 -30
  209. data/lib/easy_ml/core/model_core.rb +0 -181
  210. data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
  211. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
  212. data/lib/easy_ml/core/models/xgboost.rb +0 -10
  213. data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
  214. data/lib/easy_ml/core/models.rb +0 -10
  215. data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
  216. data/lib/easy_ml/core/uploaders.rb +0 -7
  217. data/lib/easy_ml/data/dataloader.rb +0 -6
  218. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
  219. data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
  220. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
  221. data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
  222. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
  223. data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
  224. data/lib/easy_ml/data/dataset/splits.rb +0 -11
  225. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
  226. data/lib/easy_ml/data/dataset/splitters.rb +0 -9
  227. data/lib/easy_ml/data/dataset.rb +0 -430
  228. data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
  229. data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
  230. data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
  231. data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
  232. data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
  233. data/lib/easy_ml/data/datasource.rb +0 -33
  234. data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
  235. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
  236. data/lib/easy_ml/deployment.rb +0 -5
  237. data/lib/easy_ml/support/synced_directory.rb +0 -134
  238. data/lib/easy_ml/transforms.rb +0 -29
  239. /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -0,0 +1,113 @@
1
+ import { Dataset } from './dataset';
2
+
3
/** Allowed values for a model's `status` field. */
export type ModelStatus =
  | 'success'
  | 'failed';

/** Allowed values for a model's `deployment_status` field. */
export type DeploymentStatus =
  | 'training'
  | 'inference'
  | 'retired';

/** Allowed values for the `status` of retraining jobs and retraining runs. */
export type JobStatus =
  | 'running'
  | 'success'
  | 'failed'
  | 'deployed';

/** Allowed scheduling intervals (used for `frequency` and `tuning_frequency`). */
export type Frequency =
  | 'hourly'
  | 'daily'
  | 'weekly'
  | 'monthly';

/** Allowed values for a retraining run's `threshold_direction`. */
export type ThresholdDirection =
  | 'minimize'
  | 'maximize';
8
/**
 * A feature record: identity, descriptive text, the group and test dataset it
 * references, its input/output column names, its source code and timestamps.
 */
export interface Feature {
  id: number;
  name: string;
  description: string;
  groupId: number;
  testDatasetId: number;
  inputColumns: string[];
  outputColumns: string[];
  code: string;
  createdAt: string;
  updatedAt: string;
}
20
+
21
/** A named, described collection of `Feature` records with timestamps. */
export interface FeatureGroup {
  id: number;
  name: string;
  description: string;
  features: Feature[];
  createdAt: string;
  updatedAt: string;
}
29
+
30
/**
 * One version of a model: version label, training status, deployment status
 * and an open-ended configuration map. Not exported from this module.
 */
interface ModelVersion {
  id: number;
  version: string;
  status: ModelStatus;
  deployment_status: DeploymentStatus;
  configuration: Record<string, unknown>;
  createdAt: string;
  updatedAt: string;
}
39
+
40
/**
 * A model record, including its dataset, its retraining job/runs and
 * training state. Nullable fields are typed `| null` rather than optional.
 */
export interface Model {
  id: number;
  name: string;
  model_type: string;
  formatted_model_type: string;
  task: string;
  objective: string;
  metrics: Record<string, unknown>;
  status: ModelStatus;
  deployment_status: DeploymentStatus;
  dataset_id: number;
  dataset: Dataset;
  version: string;
  configuration: Record<string, unknown>;
  created_at: string;
  updated_at: string;
  retraining_runs: RetrainingRun[];
  /** `null` when there is no last run timestamp. */
  last_run_at: string | null;
  /** `null` when the model has no last run. */
  last_run: RetrainingRun | null;
  /** `null` when no retraining job is attached. */
  retraining_job: RetrainingJob | null;
  formatted_frequency: string | null;
  is_training: boolean;
  metrics_url: string | null;
}
64
+
65
/**
 * A single prediction: the model it belongs to, a timestamp, the input map,
 * the output, an optional ground truth and a latency figure.
 */
export interface Prediction {
  id: number;
  modelId: number;
  timestamp: string;
  input: Record<string, any>;
  output: any;
  groundTruth?: any;
  latencyMs: number;
}
74
+
75
/**
 * A retraining job: schedule (`frequency`, `at`), evaluator and tuner
 * configuration maps, tuning schedule, activity flag, status and timestamps.
 */
export interface RetrainingJob {
  id: number;
  model: string;
  frequency: Frequency;
  formatted_frequency: string;
  at: number;
  evaluator: Record<string, unknown>;
  tuner_config: Record<string, unknown>;
  tuning_frequency: Frequency;
  last_tuning_at: string | null;
  active: boolean;
  status: JobStatus;
  last_run_at: string | null;
  locked_at: string | null;
  created_at: string;
  updated_at: string;
}
92
+
93
/**
 * One retraining run: owning model and job ids, optional tuner job id,
 * status, metric/threshold values, deployability flags, timing, error
 * details and metrics.
 */
export interface RetrainingRun {
  id: number;
  model_id: number;
  retraining_job_id: number;
  tuner_job_id: number | null;
  status: JobStatus;
  metric_value: number | null;
  threshold: number | null;
  threshold_direction: ThresholdDirection;
  deployable: boolean;
  started_at: string | null;
  is_deploying: boolean;
  completed_at: string | null;
  error_message: string | null;
  metadata: Record<string, unknown>;
  created_at: string;
  updated_at: string;
  stacktrace: string | null;
  metrics: Record<string, number>;
  metrics_url: string | null;
}
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
module EasyML
  # View helper mixin for the EasyML engine.
  module ApplicationHelper
    # Override: returns the asset manifest of the engine's own vite_ruby
    # instance (EasyML::Engine.vite_ruby).
    #
    # @return [Object] whatever `EasyML::Engine.vite_ruby.manifest` returns
    def vite_manifest
      engine_vite = EasyML::Engine.vite_ruby
      engine_vite.manifest
    end
  end
end
@@ -0,0 +1,21 @@
1
module EasyML
  # Base class for EasyML Active Job classes. Jobs run on the :easy_ml queue
  # by default and get instance-level shortcuts to the class-level helpers on
  # EasyML::Event.
  class ApplicationJob < ActiveJob::Base
    queue_as :easy_ml

    # Forwards to EasyML::Event.create_event.
    #
    # @param model [Object] record the event is about
    # @param status [Object] status passed through to EasyML::Event
    # @param error [Object, nil] optional error passed through unchanged
    def create_event(model, status, error = nil)
      EasyML::Event.create_event(model, status, error)
    end

    # Forwards to EasyML::Event.handle_error.
    #
    # @param model [Object] record the error relates to
    # @param error [Exception] the error being reported
    def handle_error(model, error)
      EasyML::Event.handle_error(model, error)
    end

    # Forwards to EasyML::Event.format_stacktrace.
    #
    # @param error [Exception]
    def format_stacktrace(error)
      EasyML::Event.format_stacktrace(error)
    end

    # Forwards to EasyML::Event.wrap_text.
    #
    # @param text [String]
    # @param max_length [Integer]
    def wrap_text(text, max_length)
      EasyML::Event.wrap_text(text, max_length)
    end
  end
end
@@ -0,0 +1,46 @@
1
module EasyML
  # Base class for Resque jobs that are enqueued as a batch through
  # Resque::Plugins::BatchedJob. The original argument list of each batch is
  # also stored in Redis so it can be read back later.
  class BatchJob
    extend Resque::Plugins::BatchedJob
    @queue = :easy_ml

    class << self
      # Default, dynamically generated batch ID: "batch_<class name>_<uuid>".
      #
      # @return [String]
      def default_batch_id
        ["batch", name, SecureRandom.uuid].join("_")
      end

      # Enqueues one batched Resque job per entry of +args_list+.
      #
      # E.g. EasyML::ComputeFeatureJob.enqueue_batch(features.map(&:id))
      #
      # @param args_list [Array] one entry per job; entries that are not
      #   already arrays are wrapped in a one-element array
      # @param batch_id [String] batch identifier (generated when omitted)
      # @return [String] the batch ID that was used
      def enqueue_batch(args_list, batch_id = default_batch_id)
        normalized = args_list.map { |entry| entry.is_a?(Array) ? entry : [entry] }
        store_batch_arguments(batch_id, normalized)

        normalized.each { |job_args| Resque.enqueue_batched_job(self, batch_id, *job_args) }

        batch_id
      end

      private

      # Store batch arguments in Redis (encoded with Resque.encode).
      def store_batch_arguments(batch_id, args_list)
        redis.set(original_args_key(batch_id), Resque.encode(args_list))
      end

      # Fetch batch arguments from Redis; returns [] when nothing is stored.
      def fetch_batch_arguments(batch_id)
        encoded = redis.get(original_args_key(batch_id))
        return [] unless encoded

        Resque.decode(encoded)
      end

      # Redis key holding the original arguments of the given batch.
      def original_args_key(batch_id)
        "#{batch(batch_id)}:original_args"
      end

      # Redis instance for storing batch arguments
      def redis
        Resque.redis
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module EasyML
  # Batched Resque job that fits one feature batch per enqueued job and
  # notifies the owning dataset once the whole batch has finished.
  class ComputeFeatureJob < BatchJob
    @queue = :easy_ml

    # Runs a single job of the batch.
    #
    # @param batch_id [String] batch identifier (unused here)
    # @param options [Hash] job options; must contain a feature_id entry.
    #   Keys are symbolized in place before use.
    def self.perform(batch_id, options = {})
      options.symbolize_keys!
      EasyML::Feature.find(options[:feature_id]).fit_batch(options)
    end

    # Called after the batch completes: looks up the dataset of the first
    # feature referenced by the stored batch arguments and calls
    # after_fit_features on it.
    #
    # @param batch_id [String]
    def self.after_batch_hook(batch_id, *args)
      puts "After batch!"
      stored_args = fetch_batch_arguments(batch_id).flatten
      feature_ids = stored_args.map { |job_options| job_options.symbolize_keys[:feature_id] }.uniq
      first_feature = EasyML::Feature.find_by(id: feature_ids.first)
      first_feature.dataset.after_fit_features
    end
  end
end
@@ -0,0 +1,13 @@
1
module EasyML
  # Background job that performs the deploy for an EasyML::Deploy record.
  class DeployJob < ApplicationJob
    # @param id [Integer] id of the EasyML::Deploy to run
    # @raise [ActiveRecord::RecordNotFound] when no deploy has that id (the
    #   lookup sits outside the rescue on purpose)
    def perform(id)
      deploy = EasyML::Deploy.find(id)

      # Only failures of the deploy itself are routed to handle_error.
      begin
        deploy.actually_deploy
      rescue StandardError => error
        handle_error(deploy, error)
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
module EasyML
  # NOTE(review): this job is nested under EasyML::Jobs, whereas the other job
  # classes in app/jobs/easy_ml are defined directly under EasyML — confirm
  # the autoloader resolves this constant for finalize_feature_job.rb.
  module Jobs
    # Marks a feature as applied: stamps applied_at with the current time and
    # clears its needs_fit flag.
    class FinalizeFeatureJob < ApplicationJob
      queue_as :features

      # @param feature_id [Integer] id of the EasyML::Feature to finalize
      # @raise [ActiveRecord::RecordNotFound] when the feature does not exist
      def perform(feature_id)
        EasyML::Feature.find(feature_id).update!(applied_at: Time.current, needs_fit: false)
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
module EasyML
  # Background job that prepares a dataset and then either kicks off
  # asynchronous feature fitting or refreshes the dataset directly.
  class RefreshDatasetJob < ApplicationJob
    # @param id [Integer] id of the EasyML::Dataset to refresh
    def perform(id)
      dataset = EasyML::Dataset.find(id)
      puts "Refreshing dataset #{dataset.name}"
      puts "Needs refresh? #{dataset.needs_refresh?}"

      # NOTE(review): when no refresh is needed the dataset is flagged :ready,
      # but execution still continues into the refresh below — confirm whether
      # an early return was intended here.
      dataset.update(workflow_status: :ready) unless dataset.needs_refresh?

      create_event(dataset, "started")
      prepare_and_refresh(dataset)
    end

    private

    # Prepares the dataset, then fits features asynchronously when any feature
    # needs fitting, otherwise refreshes immediately. Errors are re-raised in
    # the test environment and handed to handle_error everywhere else.
    def prepare_and_refresh(dataset)
      puts "Prepare! #{dataset.name}"
      dataset.prepare
      if dataset.features.needs_fit.any?
        dataset.fit_features(async: true)
        puts "Computing features!"
      else
        dataset.actually_refresh
        puts "Done!"
      end
    rescue StandardError => error
      puts "Error #{error.message}"
      raise error if Rails.env.test?

      handle_error(dataset, error)
    end
  end
end
@@ -0,0 +1,11 @@
1
module EasyML
  # Background job that triggers training for the model of every active
  # retraining job.
  class ScheduleRetrainingJob < ApplicationJob
    queue_as :easy_ml

    # Iterates RetrainingJob.active and calls #train on each job's model.
    def perform
      RetrainingJob.active.each { |retraining_job| retraining_job.model.train }
    end
  end
end
@@ -0,0 +1,17 @@
1
module EasyML
  # Background job that refreshes a datasource and records the outcome.
  class SyncDatasourceJob < ApplicationJob
    queue_as :easy_ml

    # @param id [Integer] id of the EasyML::Datasource to refresh
    # @raise [ActiveRecord::RecordNotFound] when the datasource does not exist
    def perform(id)
      datasource = EasyML::Datasource.find(id)
      create_event(datasource, "started")

      begin
        datasource.refresh
      rescue StandardError => error
        # Clear the syncing flag before reporting the failure.
        datasource.update!(is_syncing: false)
        handle_error(datasource, error)
      end
    end
  end
end
@@ -0,0 +1,62 @@
1
module EasyML
  # Background job that trains a single EasyML::Model and marks the run as
  # failed when the worker process receives SIGTERM or SIGINT.
  class TrainingJob < ApplicationJob
    # Raised to abort training after a signal (or an inactivity timeout when
    # the monitor thread is enabled).
    class TrainingTimeoutError < StandardError; end

    INACTIVITY_TIMEOUT = 15 # seconds

    # Trains the model with the given id; does nothing when it does not exist.
    #
    # @param model_id [Integer]
    def perform(model_id)
      @model = EasyML::Model.find_by(id: model_id)
      return if @model.nil?

      @last_activity = Time.current
      setup_signal_traps
      # @monitor_thread = start_monitor_thread

      # Every callback from training counts as activity.
      @model.actually_train do |_iteration_info|
        @last_activity = Time.current
      end
    ensure
      # @monitor_thread&.exit
    end

    private

    # Installs handlers that record the failure and abort training.
    #
    # NOTE(review): the handlers write to the database from inside a signal
    # handler; Ruby restricts some operations (e.g. Mutex locking) in trap
    # context — confirm this works with the connection handling in use.
    def setup_signal_traps
      # Handle graceful shutdown on SIGTERM
      trap_and_abort("TERM", "Received SIGTERM, cleaning up...", "Training process terminated")

      # Handle Ctrl+C
      trap_and_abort("INT", "Received SIGINT, cleaning up...", "Training process interrupted")
    end

    # Traps +signal+: logs, cleans up, then raises TrainingTimeoutError.
    def trap_and_abort(signal, log_line, reason)
      Signal.trap(signal) do
        puts log_line
        cleanup(reason)
        raise TrainingTimeoutError, reason
      end
    end

    # Marks the model's last run (if any) as failed and clears the training
    # flag. Runs at most once per job instance.
    #
    # @param error_message [String] reason stored on the failed run
    def cleanup(error_message)
      return if @cleaned_up

      @cleaned_up = true
      # A model may have no last run yet; skip the run update in that case so
      # the is_training flag is still reset.
      @model.last_run&.update(status: "failed", error_message: error_message, completed_at: Time.current)
      @model.update(is_training: false)
    end

    # Watches @last_activity once per second and aborts the main thread when
    # training has been idle for INACTIVITY_TIMEOUT seconds. Currently unused
    # (the call in #perform is commented out).
    #
    # @return [Thread]
    def start_monitor_thread
      Thread.new do
        loop do
          puts "Monitoring activity... #{Time.current - @last_activity}"
          if Time.current - @last_activity >= INACTIVITY_TIMEOUT
            puts "Training process inactive for #{INACTIVITY_TIMEOUT} seconds, terminating..."
            cleanup("Training process timed out")
            Thread.main.raise(TrainingTimeoutError)
            break
          end
          sleep 1
        end
      end
    end
  end
end
@@ -0,0 +1,45 @@
1
module EasyML
  module Adapters
    # Abstract adapter wrapping a datasource. Subclasses implement the data
    # access methods; the refresh hooks have do-nothing defaults.
    class BaseAdapter
      attr_reader :datasource

      # @param datasource [Object] the datasource this adapter serves
      def initialize(datasource)
        @datasource = datasource
      end

      # Abstract: query the underlying data. Accepts and ignores any
      # positional arguments.
      # @raise [NotImplementedError] always, in this base class
      def query(*)
        raise NotImplementedError
      end

      # Abstract: iterate the underlying data in batches.
      # @raise [NotImplementedError] always, in this base class
      def in_batches(*)
        raise NotImplementedError
      end

      # Abstract: list the files backing the datasource.
      # @raise [NotImplementedError] always, in this base class
      def files
        raise NotImplementedError
      end

      # Abstract: time the data was last updated.
      # @raise [NotImplementedError] always, in this base class
      def last_updated_at
        raise NotImplementedError
      end

      # Abstract: the full underlying data.
      # @raise [NotImplementedError] always, in this base class
      def data
        raise NotImplementedError
      end

      # @return [Boolean] false by default
      def needs_refresh?
        false
      end

      # Runs an empty block inside datasource.syncing and returns whatever
      # syncing returns.
      def refresh
        datasource.syncing do
          # Default implementation does nothing
        end
      end

      # Same as #refresh in this base class.
      def refresh!
        refresh
      end
    end
  end
end
@@ -0,0 +1,77 @@
1
module EasyML
  module Adapters
    # Adapter whose data is an in-memory Polars DataFrame rebuilt from the
    # "df" entry of the datasource's configuration.
    class PolarsAdapter < BaseAdapter
      # @param datasource [Object] datasource whose configuration may hold "df"
      def initialize(datasource)
        super
        read_df_from_configuration
      end

      # Returns a transformed copy of the DataFrame, or nil when none is
      # loaded. Steps run in this order: filter, select, unique, drop, sort,
      # limit.
      #
      # @param drop_cols [Array<String>] columns to drop (unknown names ignored)
      # @param filter [Object, nil] expression passed to DataFrame#filter
      # @param limit [Integer, nil] maximum number of rows
      # @param select [Object, nil] column selection (skipped when blank)
      # @param unique [Boolean, nil] de-duplicate rows when truthy
      # @param sort [Object, nil] sort key(s)
      # @param descending [Boolean] passed as DataFrame#sort's reverse: option
      def query(drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false)
        return if df.nil?

        result = df.clone
        result = result.filter(filter) if filter
        result = result.select(select) if select.present?
        result = result.unique if unique
        droppable = drop_cols & result.columns
        result = result.drop(droppable) unless droppable.empty?
        result = result.sort(sort, reverse: descending) if sort
        result = result.limit(limit) if limit
        result
      end

      # Yields consecutive slices of the DataFrame with at most +of+ rows each.
      #
      # @param of [Integer] batch size
      def in_batches(of: 10_000)
        total_rows = df.shape[0]
        (0...total_rows).step(of) do |offset|
          rows_in_batch = [of, total_rows - offset].min
          yield df.slice(offset, rows_in_batch)
        end
      end

      # @return [Array] always empty for this adapter
      def files
        []
      end

      # @return [Object] the datasource's updated_at
      def last_updated_at
        datasource.updated_at
      end

      # @return [Polars::DataFrame, nil] the loaded DataFrame
      def data
        df
      end

      private

      attr_accessor :df

      # Writes the DataFrame (as parsed JSON) into the "df" key of the
      # datasource configuration. No-op when no DataFrame is loaded.
      def store_df_in_configuration
        return unless df

        existing = datasource.configuration || {}
        datasource.configuration = existing.merge(
          "df" => JSON.parse(df.write_json)
        )
      end

      # Rebuilds @df from configuration["df"]["columns"], each entry providing
      # "name", "values" and "datatype". No-op when there is no "df" key.
      def read_df_from_configuration
        return unless datasource.configuration&.key?("df")

        column_specs = datasource.configuration["df"]["columns"]
        series = column_specs.map do |spec|
          Polars::Series.new(spec["name"], spec["values"], dtype: dtype_for(spec["datatype"]))
        end

        @df = Polars::DataFrame.new(series)
      end

      # Resolves a serialized datatype. Hash datatypes map to the class of a
      # Polars::Datetime built from the "Datetime" entry, or to Polars::Utf8
      # when there is no such entry; anything else is looked up as a constant
      # under Polars.
      def dtype_for(raw)
        return Polars.const_get(raw) unless raw.is_a?(Hash)

        if raw["Datetime"]
          Polars::Datetime.new(raw["Datetime"][0].downcase.to_sym).class
        else
          Polars::Utf8
        end
      end
    end
  end
end
@@ -0,0 +1,82 @@
1
module EasyML
  # Removes files under the engine's models/datasets/datasources directories
  # that are not on the keep list.
  class Cleaner
    attr_accessor :files_to_keep, :dirs_to_clean

    # @param force [Boolean] when true nothing is kept
    # @param verbose [Boolean] passed through to FileRotate
    def initialize(force: false, verbose: false)
      @verbose = verbose
      @files_to_keep = force ? [] : model_files_to_keep + dataset_files_to_keep + datasource_files_to_keep
    end

    # Cleans while keeping files that are still referenced.
    def self.clean(verbose: false)
      new(verbose: verbose).clean
    end

    # Clean everything, including active models
    def self.clean!(verbose: false)
      new(force: true, verbose: verbose).clean
    end

    # Runs FileRotate cleanup (json, parquet, csv) over each directory,
    # sparing the kept files located under that directory.
    def clean
      dirs_to_clean.each do |dir|
        keep_here = files_to_keep_for_dir(dir)
        rotator = EasyML::Support::FileRotate.new(dir, keep_here, verbose: @verbose)
        rotator.cleanup(%w[json parquet csv])
      end
    end

    private

    # Kept files (as strings) whose path starts with +dir+.
    def files_to_keep_for_dir(dir)
      files_to_keep.map(&:to_s).select { |path| path.start_with?(dir.to_s) }
    end

    # The models, datasets and datasources directories under the engine root.
    # Replaces the public reader generated by attr_accessor above.
    def dirs_to_clean
      %w[models datasets datasources].map { |subdir| EasyML::Engine.root_dir.join(subdir) }
    end

    # Parent directory of every model's root_dir.
    def model_dirs
      EasyML::Model.all.includes(dataset: :datasource).map do |model|
        File.expand_path("..", model.root_dir)
      end
    end

    # Deployed models plus all models (so every model currently counts as
    # active). Memoized.
    def active_models
      @active_models ||= begin
        inference_models = EasyML::Model.deployed
        training_models = EasyML::Model.all
        (training_models + inference_models).compact
      end
    end

    # Full paths of the model files of active models; nothing in test.
    def model_files_to_keep
      return [] if Rails.env.test?

      active_models.map(&:model_file).compact.map(&:full_path).uniq
    end

    # Files of every dataset; nothing in test.
    def dataset_files_to_keep
      return [] if Rails.env.test?

      EasyML::Dataset.all.flat_map(&:files).uniq
    end

    # Files of every datasource; in test, every csv/parquet file found under
    # the datasources directory instead.
    def datasource_files_to_keep
      if Rails.env.test?
        Dir.glob(EasyML::Engine.root_dir.glob("datasources/**/*.{csv,parquet}")).uniq
      else
        EasyML::Datasource.all.flat_map(&:files).uniq
      end
    end
  end
end
@@ -0,0 +1,124 @@
1
+ # == Schema Information
2
+ #
3
+ # Table name: easy_ml_columns
4
+ #
5
+ # id :bigint not null, primary key
6
+ # dataset_id :bigint not null
7
+ # name :string not null
8
+ # description :string
9
+ # datatype :string
10
+ # polars_datatype :string
11
+ # is_target :boolean
12
+ # hidden :boolean default(FALSE)
13
+ # drop_if_null :boolean default(FALSE)
14
+ # preprocessing_steps :json
15
+ # sample_values :json
16
+ # statistics :json
17
+ # created_at :datetime not null
18
+ # updated_at :datetime not null
19
+ #
20
module EasyML
  # A column of an EasyML::Dataset: its datatype, preprocessing configuration
  # and related metadata (see the schema comment at the top of the file).
  class Column < ActiveRecord::Base
    self.table_name = "easy_ml_columns"
    include Historiographer::Silent
    historiographer_mode :snapshot_only

    belongs_to :dataset, class_name: "EasyML::Dataset"

    validates :name, presence: true
    validates :name, uniqueness: { scope: :dataset_id }

    before_save :ensure_valid_datatype

    # Scopes
    scope :visible, -> { where(hidden: false) }
    scope :numeric, -> { where(datatype: %w[float integer]) }
    scope :categorical, -> { where(datatype: %w[categorical string boolean]) }
    scope :datetime, -> { where(datatype: "datetime") }

    # Sets the datatype and mirrors the same value into polars_datatype.
    #
    # @param dtype [String, Symbol]
    def datatype=(dtype)
      write_attribute(:datatype, dtype)
      write_attribute(:polars_datatype, dtype)
    end

    # Looks up the Polars type registered for +dtype+ in
    # EasyML::Data::PolarsColumn::TYPE_MAP.
    #
    # @param dtype [String, Symbol]
    # @return [Object, nil] nil when the key is not in the map
    def get_polars_type(dtype)
      EasyML::Data::PolarsColumn::TYPE_MAP[dtype.to_sym]
    end

    # @return [Object, nil] Polars type for the stored polars_datatype, or nil
    #   when polars_datatype is blank
    def polars_type
      return nil if polars_datatype.blank?

      get_polars_type(polars_datatype)
    end

    # Stores +dtype+ (stringified) as polars_datatype and derives datatype
    # from EasyML::Data::PolarsColumn::POLARS_MAP, keyed by the name of the
    # argument's class. datatype becomes nil when the map has no entry.
    #
    # The lookup previously referenced an undefined `type` instead of the
    # +dtype+ parameter, so the setter raised before writing datatype.
    #
    # @param dtype [Object] presumably a Polars dtype instance — TODO confirm
    #   against the POLARS_MAP keys
    def polars_type=(dtype)
      write_attribute(:polars_datatype, dtype.to_s)
      write_attribute(:datatype, EasyML::Data::PolarsColumn::POLARS_MAP[dtype.class.to_s]&.to_s)
    end

    # Assigns preprocessing steps, casting any params[:constant] value to this
    # column's datatype. Blank input is stored as an empty hash.
    #
    # @param steps [Hash, nil] step configurations keyed by step name
    def preprocessing_steps=(steps)
      return super({}) if steps.blank?

      typed_steps = steps.transform_values do |config|
        next config unless config[:params]&.key?(:constant)

        # Duplicate first so the caller's hash is not mutated.
        config.deep_dup.tap do |c|
          c[:params][:constant] = convert_to_type(c[:params][:constant])
        end
      end

      super(typed_steps)
    end

    # @return [Hash] stored steps with symbolized top-level keys ({} when unset)
    def preprocessing_steps
      (read_attribute(:preprocessing_steps) || {}).symbolize_keys
    end

    # @return [Boolean] true when the training step sets params.one_hot to true
    def one_hot?
      preprocessing_steps.deep_symbolize_keys.dig(:training, :params, :one_hot) == true
    end

    # @return [Boolean] true when the training step sets
    #   params.ordinal_encoding to true
    def ordinal_encoding?
      preprocessing_steps.deep_symbolize_keys.dig(:training, :params, :ordinal_encoding) == true
    end

    # Sorted allowed categories from the dataset preprocessor's statistics for
    # this column, with "other" appended. Returns nil unless one-hot encoded.
    #
    # @return [Array, nil]
    def allowed_categories
      return nil unless one_hot?

      dataset.preprocessor.statistics.dup.to_h.dig(name.to_sym, :allowed_categories).sort.concat(["other"])
    end

    private

    # before_save callback: aborts the save when datatype is present but is
    # not a key of EasyML::Data::PolarsColumn::TYPE_MAP.
    def ensure_valid_datatype
      return if datatype.blank?

      return if EasyML::Data::PolarsColumn::TYPE_MAP.key?(datatype.to_sym)

      errors.add(:datatype, "must be one of: #{EasyML::Data::PolarsColumn::TYPE_MAP.keys.join(", ")}")
      throw :abort
    end

    # Casts +value+ according to this column's datatype. nil passes through,
    # and a value that fails conversion is returned unchanged.
    def convert_to_type(value)
      return value if value.nil?

      case datatype&.to_sym
      when :float
        Float(value)
      when :integer
        Integer(value)
      when :boolean
        ActiveModel::Type::Boolean.new.cast(value)
      when :datetime
        value.is_a?(String) ? Time.parse(value) : value
      else
        value.to_s
      end
    rescue ArgumentError, TypeError
      # If conversion fails, return original value
      value
    end

    NUMERIC_METHODS = %i[mean median].freeze
  end
end