easy_ml 0.1.4 → 0.2.0.pre.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (239) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +234 -26
  3. data/Rakefile +45 -0
  4. data/app/controllers/easy_ml/application_controller.rb +67 -0
  5. data/app/controllers/easy_ml/columns_controller.rb +38 -0
  6. data/app/controllers/easy_ml/datasets_controller.rb +156 -0
  7. data/app/controllers/easy_ml/datasources_controller.rb +88 -0
  8. data/app/controllers/easy_ml/deploys_controller.rb +20 -0
  9. data/app/controllers/easy_ml/models_controller.rb +151 -0
  10. data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
  11. data/app/controllers/easy_ml/settings_controller.rb +59 -0
  12. data/app/frontend/components/AlertProvider.tsx +108 -0
  13. data/app/frontend/components/DatasetPreview.tsx +161 -0
  14. data/app/frontend/components/EmptyState.tsx +28 -0
  15. data/app/frontend/components/ModelCard.tsx +255 -0
  16. data/app/frontend/components/ModelDetails.tsx +334 -0
  17. data/app/frontend/components/ModelForm.tsx +384 -0
  18. data/app/frontend/components/Navigation.tsx +300 -0
  19. data/app/frontend/components/Pagination.tsx +72 -0
  20. data/app/frontend/components/Popover.tsx +55 -0
  21. data/app/frontend/components/PredictionStream.tsx +105 -0
  22. data/app/frontend/components/ScheduleModal.tsx +726 -0
  23. data/app/frontend/components/SearchInput.tsx +23 -0
  24. data/app/frontend/components/SearchableSelect.tsx +132 -0
  25. data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
  26. data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
  27. data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
  28. data/app/frontend/components/dataset/ColumnList.tsx +101 -0
  29. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
  30. data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
  31. data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
  32. data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
  33. data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
  34. data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
  35. data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
  36. data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
  37. data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
  38. data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
  39. data/app/frontend/components/dataset/splitters/constants.ts +77 -0
  40. data/app/frontend/components/dataset/splitters/types.ts +168 -0
  41. data/app/frontend/components/dataset/splitters/utils.ts +53 -0
  42. data/app/frontend/components/features/CodeEditor.tsx +46 -0
  43. data/app/frontend/components/features/DataPreview.tsx +150 -0
  44. data/app/frontend/components/features/FeatureCard.tsx +88 -0
  45. data/app/frontend/components/features/FeatureForm.tsx +235 -0
  46. data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
  47. data/app/frontend/components/settings/PluginSettings.tsx +81 -0
  48. data/app/frontend/components/ui/badge.tsx +44 -0
  49. data/app/frontend/components/ui/collapsible.tsx +9 -0
  50. data/app/frontend/components/ui/scroll-area.tsx +46 -0
  51. data/app/frontend/components/ui/separator.tsx +29 -0
  52. data/app/frontend/entrypoints/App.tsx +40 -0
  53. data/app/frontend/entrypoints/Application.tsx +24 -0
  54. data/app/frontend/hooks/useAutosave.ts +61 -0
  55. data/app/frontend/layouts/Layout.tsx +38 -0
  56. data/app/frontend/lib/utils.ts +6 -0
  57. data/app/frontend/mockData.ts +272 -0
  58. data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
  59. data/app/frontend/pages/DatasetsPage.tsx +261 -0
  60. data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
  61. data/app/frontend/pages/DatasourcesPage.tsx +261 -0
  62. data/app/frontend/pages/EditModelPage.tsx +45 -0
  63. data/app/frontend/pages/EditTransformationPage.tsx +56 -0
  64. data/app/frontend/pages/ModelsPage.tsx +115 -0
  65. data/app/frontend/pages/NewDatasetPage.tsx +366 -0
  66. data/app/frontend/pages/NewModelPage.tsx +45 -0
  67. data/app/frontend/pages/NewTransformationPage.tsx +43 -0
  68. data/app/frontend/pages/SettingsPage.tsx +272 -0
  69. data/app/frontend/pages/ShowModelPage.tsx +30 -0
  70. data/app/frontend/pages/TransformationsPage.tsx +95 -0
  71. data/app/frontend/styles/application.css +100 -0
  72. data/app/frontend/types/dataset.ts +146 -0
  73. data/app/frontend/types/datasource.ts +33 -0
  74. data/app/frontend/types/preprocessing.ts +1 -0
  75. data/app/frontend/types.ts +113 -0
  76. data/app/helpers/easy_ml/application_helper.rb +10 -0
  77. data/app/jobs/easy_ml/application_job.rb +21 -0
  78. data/app/jobs/easy_ml/batch_job.rb +46 -0
  79. data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
  80. data/app/jobs/easy_ml/deploy_job.rb +13 -0
  81. data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
  82. data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
  83. data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
  84. data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
  85. data/app/jobs/easy_ml/training_job.rb +62 -0
  86. data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
  87. data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
  88. data/app/models/easy_ml/cleaner.rb +82 -0
  89. data/app/models/easy_ml/column.rb +124 -0
  90. data/app/models/easy_ml/column_history.rb +30 -0
  91. data/app/models/easy_ml/column_list.rb +122 -0
  92. data/app/models/easy_ml/concerns/configurable.rb +61 -0
  93. data/app/models/easy_ml/concerns/versionable.rb +19 -0
  94. data/app/models/easy_ml/dataset.rb +767 -0
  95. data/app/models/easy_ml/dataset_history.rb +56 -0
  96. data/app/models/easy_ml/datasource.rb +182 -0
  97. data/app/models/easy_ml/datasource_history.rb +24 -0
  98. data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
  99. data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
  100. data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
  101. data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
  102. data/app/models/easy_ml/deploy.rb +114 -0
  103. data/app/models/easy_ml/event.rb +79 -0
  104. data/app/models/easy_ml/feature.rb +437 -0
  105. data/app/models/easy_ml/feature_history.rb +38 -0
  106. data/app/models/easy_ml/model.rb +575 -41
  107. data/app/models/easy_ml/model_file.rb +133 -0
  108. data/app/models/easy_ml/model_file_history.rb +24 -0
  109. data/app/models/easy_ml/model_history.rb +51 -0
  110. data/app/models/easy_ml/models/base_model.rb +58 -0
  111. data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
  112. data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
  113. data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
  114. data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
  115. data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
  116. data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
  117. data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
  118. data/app/models/easy_ml/models/xgboost.rb +544 -5
  119. data/app/models/easy_ml/prediction.rb +44 -0
  120. data/app/models/easy_ml/retraining_job.rb +278 -0
  121. data/app/models/easy_ml/retraining_run.rb +184 -0
  122. data/app/models/easy_ml/settings.rb +37 -0
  123. data/app/models/easy_ml/splitter.rb +90 -0
  124. data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
  125. data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
  126. data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
  127. data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
  128. data/app/models/easy_ml/tuner_job.rb +56 -0
  129. data/app/models/easy_ml/tuner_run.rb +31 -0
  130. data/app/models/splitter_history.rb +6 -0
  131. data/app/serializers/easy_ml/column_serializer.rb +27 -0
  132. data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
  133. data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
  134. data/app/serializers/easy_ml/feature_serializer.rb +27 -0
  135. data/app/serializers/easy_ml/model_serializer.rb +90 -0
  136. data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
  137. data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
  138. data/app/serializers/easy_ml/settings_serializer.rb +9 -0
  139. data/app/views/layouts/easy_ml/application.html.erb +15 -0
  140. data/config/initializers/resque.rb +3 -0
  141. data/config/resque-pool.yml +6 -0
  142. data/config/routes.rb +39 -0
  143. data/config/spring.rb +1 -0
  144. data/config/vite.json +15 -0
  145. data/lib/easy_ml/configuration.rb +64 -0
  146. data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
  147. data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
  148. data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
  149. data/lib/easy_ml/core/model_evaluator.rb +161 -89
  150. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
  151. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
  152. data/lib/easy_ml/core/tuner.rb +123 -62
  153. data/lib/easy_ml/core.rb +0 -3
  154. data/lib/easy_ml/core_ext/hash.rb +24 -0
  155. data/lib/easy_ml/core_ext/pathname.rb +11 -5
  156. data/lib/easy_ml/data/date_converter.rb +90 -0
  157. data/lib/easy_ml/data/filter_extensions.rb +31 -0
  158. data/lib/easy_ml/data/polars_column.rb +126 -0
  159. data/lib/easy_ml/data/polars_reader.rb +297 -0
  160. data/lib/easy_ml/data/preprocessor.rb +280 -142
  161. data/lib/easy_ml/data/simple_imputer.rb +255 -0
  162. data/lib/easy_ml/data/splits/file_split.rb +252 -0
  163. data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
  164. data/lib/easy_ml/data/splits/split.rb +95 -0
  165. data/lib/easy_ml/data/splits.rb +9 -0
  166. data/lib/easy_ml/data/statistics_learner.rb +93 -0
  167. data/lib/easy_ml/data/synced_directory.rb +341 -0
  168. data/lib/easy_ml/data.rb +6 -2
  169. data/lib/easy_ml/engine.rb +105 -6
  170. data/lib/easy_ml/feature_store.rb +227 -0
  171. data/lib/easy_ml/features.rb +61 -0
  172. data/lib/easy_ml/initializers/inflections.rb +17 -3
  173. data/lib/easy_ml/logging.rb +2 -2
  174. data/lib/easy_ml/predict.rb +74 -0
  175. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
  176. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
  177. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
  178. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
  179. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
  180. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
  181. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
  182. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
  183. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
  184. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
  185. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
  186. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
  187. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
  188. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
  189. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
  190. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
  191. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
  192. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
  193. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
  194. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
  195. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
  196. data/lib/easy_ml/support/est.rb +5 -1
  197. data/lib/easy_ml/support/file_rotate.rb +79 -15
  198. data/lib/easy_ml/support/file_support.rb +9 -0
  199. data/lib/easy_ml/support/local_file.rb +24 -0
  200. data/lib/easy_ml/support/lockable.rb +62 -0
  201. data/lib/easy_ml/support/synced_file.rb +103 -0
  202. data/lib/easy_ml/support/utc.rb +5 -1
  203. data/lib/easy_ml/support.rb +6 -3
  204. data/lib/easy_ml/version.rb +4 -1
  205. data/lib/easy_ml.rb +7 -2
  206. metadata +355 -72
  207. data/app/models/easy_ml/models.rb +0 -5
  208. data/lib/easy_ml/core/model.rb +0 -30
  209. data/lib/easy_ml/core/model_core.rb +0 -181
  210. data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
  211. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
  212. data/lib/easy_ml/core/models/xgboost.rb +0 -10
  213. data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
  214. data/lib/easy_ml/core/models.rb +0 -10
  215. data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
  216. data/lib/easy_ml/core/uploaders.rb +0 -7
  217. data/lib/easy_ml/data/dataloader.rb +0 -6
  218. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
  219. data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
  220. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
  221. data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
  222. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
  223. data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
  224. data/lib/easy_ml/data/dataset/splits.rb +0 -11
  225. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
  226. data/lib/easy_ml/data/dataset/splitters.rb +0 -9
  227. data/lib/easy_ml/data/dataset.rb +0 -430
  228. data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
  229. data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
  230. data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
  231. data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
  232. data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
  233. data/lib/easy_ml/data/datasource.rb +0 -33
  234. data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
  235. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
  236. data/lib/easy_ml/deployment.rb +0 -5
  237. data/lib/easy_ml/support/synced_directory.rb +0 -134
  238. data/lib/easy_ml/transforms.rb +0 -29
  239. /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -0,0 +1,113 @@
1
import { Dataset } from './dataset';

/** Values accepted by the `status` field of `Model` and `ModelVersion`. */
export type ModelStatus =
  | 'success'
  | 'failed';

/** Values accepted by the `deployment_status` field of `Model` and `ModelVersion`. */
export type DeploymentStatus =
  | 'training'
  | 'inference'
  | 'retired';

/** Values accepted by the `status` field of `RetrainingJob` and `RetrainingRun`. */
export type JobStatus =
  | 'running'
  | 'success'
  | 'failed'
  | 'deployed';

/** Values accepted by `RetrainingJob.frequency` and `RetrainingJob.tuning_frequency`. */
export type Frequency =
  | 'hourly'
  | 'daily'
  | 'weekly'
  | 'monthly';

/** Values accepted by `RetrainingRun.threshold_direction`. */
export type ThresholdDirection =
  | 'minimize'
  | 'maximize';

/** A single feature: its code plus the columns it reads and produces. */
export interface Feature {
  id: number;
  name: string;
  description: string;
  groupId: number;
  testDatasetId: number;
  inputColumns: Array<string>;
  outputColumns: Array<string>;
  code: string;
  createdAt: string;
  updatedAt: string;
}

/** A named collection of `Feature` records. */
export interface FeatureGroup {
  id: number;
  name: string;
  description: string;
  features: Array<Feature>;
  createdAt: string;
  updatedAt: string;
}

/** Module-private shape describing one version of a model (not exported). */
interface ModelVersion {
  id: number;
  version: string;
  status: ModelStatus;
  deployment_status: DeploymentStatus;
  configuration: Record<string, unknown>;
  createdAt: string;
  updatedAt: string;
}

/** A model record together with its dataset and retraining information. */
export interface Model {
  id: number;
  name: string;
  model_type: string;
  formatted_model_type: string;
  task: string;
  objective: string;
  metrics: Record<string, unknown>;
  status: ModelStatus;
  deployment_status: DeploymentStatus;
  dataset_id: number;
  dataset: Dataset;
  version: string;
  configuration: Record<string, unknown>;
  created_at: string;
  updated_at: string;
  retraining_runs: Array<RetrainingRun>;
  last_run_at: string | null;
  last_run: RetrainingRun | null;
  retraining_job: RetrainingJob | null;
  formatted_frequency: string | null;
  is_training: boolean;
  metrics_url: string | null;
}

/** One prediction: its input, output, optional ground truth and latency. */
export interface Prediction {
  id: number;
  modelId: number;
  timestamp: string;
  input: Record<string, any>;
  output: any;
  groundTruth?: any;
  latencyMs: number;
}

/** Retraining schedule and tuning settings attached to a model. */
export interface RetrainingJob {
  id: number;
  model: string;
  frequency: Frequency;
  formatted_frequency: string;
  at: number;
  evaluator: Record<string, unknown>;
  tuner_config: Record<string, unknown>;
  tuning_frequency: Frequency;
  last_tuning_at: string | null;
  active: boolean;
  status: JobStatus;
  last_run_at: string | null;
  locked_at: string | null;
  created_at: string;
  updated_at: string;
}

/** One retraining run, including its status, metrics and error details. */
export interface RetrainingRun {
  id: number;
  model_id: number;
  retraining_job_id: number;
  tuner_job_id: number | null;
  status: JobStatus;
  metric_value: number | null;
  threshold: number | null;
  threshold_direction: ThresholdDirection;
  deployable: boolean;
  started_at: string | null;
  is_deploying: boolean;
  completed_at: string | null;
  error_message: string | null;
  metadata: Record<string, unknown>;
  created_at: string;
  updated_at: string;
  stacktrace: string | null;
  metrics: Record<string, number>;
  metrics_url: string | null;
}
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
module EasyML
  # View helper module for the EasyML engine.
  module ApplicationHelper
    # Override: hands back the assets manifest of the engine's own vite_ruby
    # instance.
    #
    # @return [Object] whatever `EasyML::Engine.vite_ruby.manifest` returns
    def vite_manifest
      engine_vite = EasyML::Engine.vite_ruby
      engine_vite.manifest
    end
  end
end
@@ -0,0 +1,21 @@
1
module EasyML
  # Base class for EasyML jobs. Runs on the :easy_ml queue and exposes the
  # event/error helpers of EasyML::Event as instance methods.
  class ApplicationJob < ActiveJob::Base
    queue_as :easy_ml

    # Forwards to EasyML::Event.create_event.
    #
    # @param model [Object] record the event is about
    # @param status [Object] status passed through unchanged
    # @param error [Object, nil] optional error passed through unchanged
    def create_event(model, status, error = nil)
      event_log.create_event(model, status, error)
    end

    # Forwards to EasyML::Event.handle_error.
    def handle_error(model, error)
      event_log.handle_error(model, error)
    end

    # Forwards to EasyML::Event.format_stacktrace.
    def format_stacktrace(error)
      event_log.format_stacktrace(error)
    end

    # Forwards to EasyML::Event.wrap_text.
    def wrap_text(text, max_length)
      event_log.wrap_text(text, max_length)
    end

    private

    # Single place naming the class every helper above delegates to.
    def event_log
      EasyML::Event
    end
  end
end
@@ -0,0 +1,46 @@
1
module EasyML
  # Base class for Resque jobs enqueued as a batch (via
  # Resque::Plugins::BatchedJob). Alongside enqueueing, the full argument
  # list of the batch is stored in Redis so it can be read back later.
  class BatchJob
    extend Resque::Plugins::BatchedJob
    @queue = :easy_ml

    # Builds a fresh batch id from the class name and a random UUID.
    #
    # @return [String]
    def self.default_batch_id
      "batch_#{name}_#{SecureRandom.uuid}"
    end

    # Enqueues one batched job per entry of +args_list+.
    #
    # Entries that are not already arrays are wrapped in a one-element array
    # before being splatted into the job arguments.
    #
    # @param args_list [Array] one entry per job
    # @param batch_id [String] id shared by every job of the batch
    # @return [String] the batch id that was used
    def self.enqueue_batch(args_list, batch_id = default_batch_id)
      normalized = args_list.map do |entry|
        next entry if entry.is_a?(Array)

        [entry]
      end
      store_batch_arguments(batch_id, normalized)

      normalized.each { |job_args| Resque.enqueue_batched_job(self, batch_id, *job_args) }

      batch_id
    end

    # Saves the encoded argument list of the batch in Redis.
    def self.store_batch_arguments(batch_id, args_list)
      redis.set(original_args_key(batch_id), Resque.encode(args_list))
    end

    # Reads the argument list of the batch back from Redis.
    #
    # @return [Array] the decoded list, or [] when nothing is stored
    def self.fetch_batch_arguments(batch_id)
      payload = redis.get(original_args_key(batch_id))
      return [] unless payload

      Resque.decode(payload)
    end

    # Redis key under which the original arguments of a batch are kept.
    def self.original_args_key(batch_id)
      "#{batch(batch_id)}:original_args"
    end

    # Redis connection used for the stored arguments.
    def self.redis
      Resque.redis
    end

    private_class_method :store_batch_arguments, :fetch_batch_arguments,
                         :original_args_key, :redis
  end
end
@@ -0,0 +1,19 @@
1
module EasyML
  # Batched job that fits one feature batch per enqueued job and, once the
  # whole batch has finished, notifies the feature's dataset.
  class ComputeFeatureJob < BatchJob
    # Class-level instance variables are not inherited, so the queue is set
    # again here.
    @queue = :easy_ml

    # Fits a single batch of a feature.
    #
    # @param batch_id [String] id of the batch this job belongs to (unused here)
    # @param options [Hash] job options; keys are symbolized in place and
    #   :feature_id selects the feature
    # @raise [ActiveRecord::RecordNotFound] when the feature does not exist
    def self.perform(batch_id, options = {})
      options.symbolize_keys!
      feature = EasyML::Feature.find(options[:feature_id])
      feature.fit_batch(options)
    end

    # Invoked after the last job of the batch; calls +after_fit_features+ on
    # the dataset of the first feature found in the stored batch arguments.
    #
    # Does nothing when no feature (or no dataset) can be resolved, e.g. when
    # no arguments were stored for the batch or the feature was deleted
    # meanwhile; previously this raised NoMethodError on nil.
    #
    # @param batch_id [String] id of the finished batch
    def self.after_batch_hook(batch_id, *args)
      puts "After batch!"
      feature_ids = fetch_batch_arguments(batch_id).flatten.map(&:symbolize_keys).pluck(:feature_id).uniq
      dataset = EasyML::Feature.find_by(id: feature_ids.first)&.dataset
      return if dataset.nil?

      dataset.after_fit_features
    end
  end
end
@@ -0,0 +1,13 @@
1
module EasyML
  # Job that loads an EasyML::Deploy by id and runs it.
  class DeployJob < ApplicationJob
    # @param id [Integer] id passed to EasyML::Deploy.find (a missing record
    #   raises before any error handling is set up)
    def perform(id)
      deploy_safely(EasyML::Deploy.find(id))
    end

    private

    # Runs the deploy; a StandardError is handed to #handle_error together
    # with the deploy instead of propagating.
    def deploy_safely(deploy)
      deploy.actually_deploy
    rescue StandardError => e
      handle_error(deploy, e)
    end
  end
end
@@ -0,0 +1,15 @@
1
module EasyML
  module Jobs
    # Marks a feature as applied: stamps +applied_at+ with the current time
    # and clears +needs_fit+.
    #
    # NOTE(review): this job is nested under `Jobs` and uses the :features
    # queue, unlike the other jobs visible in this package, which live
    # directly under EasyML on :easy_ml -- confirm this is intentional.
    class FinalizeFeatureJob < ApplicationJob
      queue_as :features

      # @param feature_id [Integer] id passed to EasyML::Feature.find
      def perform(feature_id)
        feature = EasyML::Feature.find(feature_id)
        feature.update!(applied_at: Time.current, needs_fit: false)
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
module EasyML
  # Job that prepares a dataset and then either fits its pending features
  # asynchronously or refreshes the dataset directly.
  class RefreshDatasetJob < ApplicationJob
    # @param id [Integer] id passed to EasyML::Dataset.find
    def perform(id)
      dataset = EasyML::Dataset.find(id)
      puts "Refreshing dataset #{dataset.name}"
      puts "Needs refresh? #{dataset.needs_refresh?}"
      # NOTE(review): when no refresh is needed the status is set to :ready
      # but execution continues with the full refresh below -- this looks
      # like a missing early return; confirm the intended behavior.
      dataset.update(workflow_status: :ready) unless dataset.needs_refresh?

      create_event(dataset, "started")

      prepare_and_refresh(dataset)
    end

    private

    # Prepares the dataset, then fits features (async) when any need fitting,
    # otherwise refreshes. Errors are re-raised in the test environment and
    # handed to #handle_error everywhere else.
    def prepare_and_refresh(dataset)
      puts "Prepare! #{dataset.name}"
      dataset.prepare
      features_pending = dataset.features.needs_fit.any?
      if features_pending
        dataset.fit_features(async: true)
        puts "Computing features!"
      else
        dataset.actually_refresh
        puts "Done!"
      end
    rescue StandardError => e
      puts "Error #{e.message}"
      raise e if Rails.env.test?

      handle_error(dataset, e)
    end
  end
end
@@ -0,0 +1,11 @@
1
module EasyML
  # Job that calls +train+ on the model of every active retraining job.
  class ScheduleRetrainingJob < ApplicationJob
    queue_as :easy_ml

    # Walks `RetrainingJob.active` in order and trains each job's model.
    def perform
      RetrainingJob.active.each { |retraining_job| retraining_job.model.train }
    end
  end
end
@@ -0,0 +1,17 @@
1
module EasyML
  # Job that records a "started" event for a datasource and refreshes it.
  class SyncDatasourceJob < ApplicationJob
    queue_as :easy_ml

    # @param id [Integer] id passed to EasyML::Datasource.find
    def perform(id)
      datasource = EasyML::Datasource.find(id)
      create_event(datasource, "started")

      sync_safely(datasource)
    end

    private

    # Refreshes the datasource. On a StandardError the +is_syncing+ flag is
    # cleared first, then the error is handed to #handle_error.
    def sync_safely(datasource)
      datasource.refresh
    rescue StandardError => e
      datasource.update!(is_syncing: false)
      handle_error(datasource, e)
    end
  end
end
@@ -0,0 +1,62 @@
1
module EasyML
  # Job that trains a model and marks the run as failed when the process
  # receives SIGTERM or SIGINT while training.
  class TrainingJob < ApplicationJob
    # Raised from the signal handlers (and the optional monitor thread) to
    # abort a training run.
    class TrainingTimeoutError < StandardError; end

    INACTIVITY_TIMEOUT = 15 # seconds

    # Trains the model with the given id; does nothing when it does not exist.
    #
    # Signal handlers installed for the run are restored afterwards so they do
    # not outlive the job and replace the host process's own TERM/INT handling.
    #
    # @param model_id [Integer] id looked up with EasyML::Model.find_by
    def perform(model_id)
      @model = EasyML::Model.find_by(id: model_id)
      return if @model.nil?

      @last_activity = Time.current
      setup_signal_traps
      # @monitor_thread = start_monitor_thread

      @model.actually_train do |_iteration_info|
        # Every yielded iteration counts as activity.
        @last_activity = Time.current
      end
    ensure
      # @monitor_thread&.exit
      restore_signal_traps
    end

    private

    # Installs TERM/INT handlers that mark the run as failed and abort the
    # training, remembering the handlers they replace.
    #
    # NOTE(review): the handlers write to the database from trap context;
    # confirm this is safe with the connection handling in use.
    def setup_signal_traps
      @previous_traps = {}

      # Handle graceful shutdown on SIGTERM
      @previous_traps["TERM"] = Signal.trap("TERM") do
        puts "Received SIGTERM, cleaning up..."
        cleanup("Training process terminated")
        raise TrainingTimeoutError, "Training process terminated"
      end

      # Handle Ctrl+C
      @previous_traps["INT"] = Signal.trap("INT") do
        puts "Received SIGINT, cleaning up..."
        cleanup("Training process interrupted")
        raise TrainingTimeoutError, "Training process interrupted"
      end
    end

    # Puts back the handlers replaced by #setup_signal_traps. Safe to call
    # when no traps were installed (e.g. the model was not found).
    def restore_signal_traps
      (@previous_traps || {}).each do |signal, handler|
        # Signal.trap may report nil for the previous handler; passing nil
        # back would ignore the signal, so fall back to the default action.
        Signal.trap(signal, handler || "DEFAULT")
      end
      @previous_traps = nil
    end

    # Marks the last run as failed and clears the model's training flag.
    # Runs at most once per job instance.
    #
    # @param error_message [String] stored on the failed run
    def cleanup(error_message)
      return if @cleaned_up

      @cleaned_up = true
      # The model may have no run yet; still clear the training flag then.
      @model.last_run&.update(status: "failed", error_message: error_message, completed_at: Time.current)
      @model.update(is_training: false)
    end

    # Starts a watchdog thread that aborts training after INACTIVITY_TIMEOUT
    # seconds without activity. Currently not started by #perform.
    #
    # @return [Thread]
    def start_monitor_thread
      Thread.new do
        loop do
          puts "Monitoring activity... #{Time.current - @last_activity}"
          if Time.current - @last_activity >= INACTIVITY_TIMEOUT
            puts "Training process inactive for #{INACTIVITY_TIMEOUT} seconds, terminating..."
            cleanup("Training process timed out")
            Thread.main.raise(TrainingTimeoutError)
            break
          end
          sleep 1
        end
      end
    end
  end
end
@@ -0,0 +1,45 @@
1
module EasyML
  module Adapters
    # Abstract adapter wrapped around a datasource. Subclasses supply the
    # data-access methods; the refresh-related methods have defaults here.
    class BaseAdapter
      attr_reader :datasource

      # @param datasource [Object] the datasource this adapter serves
      def initialize(datasource)
        @datasource = datasource
      end

      # Abstract methods accepting any arguments: #query and #in_batches.
      # Each raises NotImplementedError until a subclass overrides it.
      %i[query in_batches].each do |abstract_method|
        define_method(abstract_method) { |*| raise NotImplementedError }
      end

      # Abstract methods taking no arguments: #files, #last_updated_at and
      # #data. Each raises NotImplementedError until a subclass overrides it.
      %i[files last_updated_at data].each do |abstract_method|
        define_method(abstract_method) { raise NotImplementedError }
      end

      # @return [Boolean] false by default
      def needs_refresh?
        false
      end

      # Runs an empty block inside `datasource.syncing`; the default
      # implementation does no work of its own.
      #
      # @return [Object] whatever `datasource.syncing` returns
      def refresh
        datasource.syncing {}
      end

      # Same as #refresh (dispatches dynamically, so subclass overrides of
      # #refresh are honored).
      def refresh!
        refresh
      end
    end
  end
end
@@ -0,0 +1,77 @@
1
module EasyML
  module Adapters
    # Adapter serving an in-memory Polars DataFrame that is rebuilt from the
    # "df" entry of the datasource configuration.
    class PolarsAdapter < BaseAdapter
      # @param datasource [Object] datasource whose configuration may hold "df"
      def initialize(datasource)
        super
        read_df_from_configuration
      end

      # Returns a transformed copy of the DataFrame, or nil when none is loaded.
      #
      # Steps are applied in this order: filter, select, unique, drop, sort,
      # limit.
      #
      # @param drop_cols [Array] columns to drop; names not present are ignored
      # @param filter [Object, nil] passed to `DataFrame#filter`
      # @param limit [Integer, nil] passed to `DataFrame#limit`
      # @param select [Object, nil] passed to `DataFrame#select` when present
      # @param unique [Boolean, nil] deduplicate rows when truthy
      # @param sort [Object, nil] passed to `DataFrame#sort`
      # @param descending [Boolean] forwarded as `reverse:` to `DataFrame#sort`
      def query(drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false)
        return if df.nil?

        # A distinct local name avoids shadowing the private reader.
        frame = df.clone
        frame = frame.filter(filter) if filter
        frame = frame.select(select) if select.present?
        frame = frame.unique if unique
        drop_cols &= frame.columns
        frame = frame.drop(drop_cols) unless drop_cols.empty?
        frame = frame.sort(sort, reverse: descending) if sort
        frame = frame.limit(limit) if limit
        frame
      end

      # Yields consecutive slices of at most +of+ rows.
      #
      # Yields nothing when no DataFrame is loaded (consistent with #query
      # returning nil in that case). Returns an Enumerator when called
      # without a block.
      #
      # @param of [Integer] maximum number of rows per slice
      # @yieldparam batch [Polars::DataFrame]
      def in_batches(of: 10_000)
        return enum_for(:in_batches, of: of) unless block_given?
        return if df.nil?

        total_rows = df.shape[0]
        (0...total_rows).step(of) do |start|
          yield df.slice(start, [of, total_rows - start].min)
        end
      end

      # @return [Array] always empty for this adapter
      def files
        []
      end

      # @return [Object] the datasource's +updated_at+
      def last_updated_at
        datasource.updated_at
      end

      # @return [Polars::DataFrame, nil] the loaded DataFrame, if any
      def data
        df
      end

      private

      attr_accessor :df

      # Writes the DataFrame (as parsed JSON) into the "df" entry of the
      # datasource configuration. No-op without a DataFrame.
      def store_df_in_configuration
        return unless df

        datasource.configuration = (datasource.configuration || {}).merge(
          "df" => JSON.parse(df.write_json)
        )
      end

      # Rebuilds the DataFrame from the "df" entry of the datasource
      # configuration, one Series per stored column. No-op without that entry.
      def read_df_from_configuration
        return unless datasource.configuration&.key?("df")

        df_data = datasource.configuration["df"]
        columns = df_data["columns"].map do |col|
          dtype = case col["datatype"]
                  when Hash
                    if col["datatype"]["Datetime"]
                      # Only the class is kept; the instance is built from the
                      # stored time unit and then discarded.
                      Polars::Datetime.new(col["datatype"]["Datetime"][0].downcase.to_sym).class
                    else
                      Polars::Utf8
                    end
                  else
                    # NOTE(review): resolves a constant named by stored
                    # configuration data; confirm that data is trusted.
                    Polars.const_get(col["datatype"])
                  end
          Polars::Series.new(col["name"], col["values"], dtype: dtype)
        end

        @df = Polars::DataFrame.new(columns)
      end
    end
  end
end
@@ -0,0 +1,82 @@
1
module EasyML
  # Removes files under the engine's models/datasets/datasources directories
  # that are not in the keep-list, via EasyML::Support::FileRotate.
  class Cleaner
    attr_accessor :files_to_keep
    # Only the writer is generated: the reader is the private method below,
    # which always computes the directories from the engine root.
    attr_writer :dirs_to_clean

    # @param force [Boolean] when true nothing is kept
    # @param verbose [Boolean] forwarded to FileRotate
    def initialize(force: false, verbose: false)
      @verbose = verbose
      @files_to_keep = if force
                         []
                       else
                         model_files_to_keep +
                           dataset_files_to_keep +
                           datasource_files_to_keep
                       end
    end

    # Cleans while keeping files that are still referenced.
    def self.clean(verbose: false)
      new(verbose: verbose).clean
    end

    # Clean everything, including active models
    def self.clean!(verbose: false)
      new(force: true, verbose: verbose).clean
    end

    # Runs the cleanup for json/parquet/csv files in every directory.
    def clean
      dirs_to_clean.each do |dir|
        keep = files_to_keep_for_dir(dir)
        EasyML::Support::FileRotate.new(dir, keep, verbose: @verbose).cleanup(%w[json parquet csv])
      end
    end

    private

    # @return [Array<String>] kept files whose path starts with +dir+
    def files_to_keep_for_dir(dir)
      files_to_keep.map(&:to_s).select { |f| f.start_with?(dir.to_s) }
    end

    # @return [Array] models, datasets and datasources under the engine root
    def dirs_to_clean
      %w[models datasets datasources].map do |dir|
        EasyML::Engine.root_dir.join(dir)
      end
    end

    # @return [Array<String>] parent directory of every model's root_dir
    def model_dirs
      EasyML::Model.all.includes(dataset: :datasource).map do |model|
        File.expand_path("..", model.root_dir)
      end
    end

    # @return [Array] all models plus the deployed ones (memoized)
    def active_models
      @active_models ||= begin
        inference_models = EasyML::Model.deployed
        training_models = EasyML::Model.all
        (training_models + inference_models).compact
      end
    end

    # @return [Array] full paths of the active models' files; empty in test
    def model_files_to_keep
      if Rails.env.test?
        []
      else
        active_models.map(&:model_file).compact.map(&:full_path).uniq
      end
    end

    # @return [Array] files of every dataset; empty in test
    def dataset_files_to_keep
      if Rails.env.test?
        []
      else
        EasyML::Dataset.all.flat_map(&:files).uniq
      end
    end

    # @return [Array] in test, every csv/parquet file under datasources/;
    #   otherwise the files of every datasource
    def datasource_files_to_keep
      if Rails.env.test?
        # Glob once. Feeding the matched paths back into Dir.glob would treat
        # real file names as patterns and drop names with glob characters.
        EasyML::Engine.root_dir.glob("datasources/**/*.{csv,parquet}").map(&:to_s).uniq
      else
        EasyML::Datasource.all.flat_map(&:files).uniq
      end
    end
  end
end
@@ -0,0 +1,124 @@
1
+ # == Schema Information
2
+ #
3
+ # Table name: easy_ml_columns
4
+ #
5
+ # id :bigint not null, primary key
6
+ # dataset_id :bigint not null
7
+ # name :string not null
8
+ # description :string
9
+ # datatype :string
10
+ # polars_datatype :string
11
+ # is_target :boolean
12
+ # hidden :boolean default(FALSE)
13
+ # drop_if_null :boolean default(FALSE)
14
+ # preprocessing_steps :json
15
+ # sample_values :json
16
+ # statistics :json
17
+ # created_at :datetime not null
18
+ # updated_at :datetime not null
19
+ #
20
module EasyML
  # A column of an EasyML::Dataset, stored in `easy_ml_columns`. Holds the
  # column's datatype and its preprocessing steps.
  class Column < ActiveRecord::Base
    self.table_name = "easy_ml_columns"
    include Historiographer::Silent
    historiographer_mode :snapshot_only

    NUMERIC_METHODS = %i[mean median].freeze

    belongs_to :dataset, class_name: "EasyML::Dataset"

    validates :name, presence: true
    validates :name, uniqueness: { scope: :dataset_id }

    before_save :ensure_valid_datatype

    # Scopes
    scope :visible, -> { where(hidden: false) }
    scope :numeric, -> { where(datatype: %w[float integer]) }
    scope :categorical, -> { where(datatype: %w[categorical string boolean]) }
    scope :datetime, -> { where(datatype: "datetime") }

    # Stores +dtype+ in both the +datatype+ and +polars_datatype+ attributes.
    def datatype=(dtype)
      write_attribute(:datatype, dtype)
      write_attribute(:polars_datatype, dtype)
    end

    # @param dtype [#to_sym] key into EasyML::Data::PolarsColumn::TYPE_MAP
    # @return [Object, nil] the mapped entry, or nil when the key is unknown
    def get_polars_type(dtype)
      EasyML::Data::PolarsColumn::TYPE_MAP[dtype.to_sym]
    end

    # @return [Object, nil] TYPE_MAP entry for +polars_datatype+, nil when blank
    def polars_type
      return nil if polars_datatype.blank?

      get_polars_type(polars_datatype)
    end

    # Stores the string form of +dtype+ as +polars_datatype+ and the matching
    # POLARS_MAP entry (if any) as +datatype+.
    #
    # The lookup previously referenced an undefined name (`type`) instead of
    # the +dtype+ parameter, so the setter raised as soon as it was called.
    #
    # NOTE(review): POLARS_MAP is looked up by the class name of +dtype+;
    # confirm callers pass type instances rather than classes.
    def polars_type=(dtype)
      write_attribute(:polars_datatype, dtype.to_s)
      write_attribute(:datatype, EasyML::Data::PolarsColumn::POLARS_MAP[dtype.class.to_s]&.to_s)
    end

    # Stores the preprocessing steps; blank input is stored as {}.
    #
    # For every step whose config has a :constant under :params, the constant
    # is converted to this column's datatype on a deep copy of the config.
    #
    # @param steps [Hash, nil] step name => config
    def preprocessing_steps=(steps)
      return super({}) if steps.blank?

      typed_steps = steps.transform_values do |config|
        next config unless config[:params]&.key?(:constant)

        config.deep_dup.tap do |c|
          c[:params][:constant] = convert_to_type(c[:params][:constant])
        end
      end

      super(typed_steps)
    end

    # @return [Hash] the stored steps with symbolized top-level keys ({} when unset)
    def preprocessing_steps
      (read_attribute(:preprocessing_steps) || {}).symbolize_keys
    end

    # @return [Boolean] true when the training step's params set one_hot to true
    def one_hot?
      preprocessing_steps.deep_symbolize_keys.dig(:training, :params, :one_hot) == true
    end

    # @return [Boolean] true when the training step's params set ordinal_encoding to true
    def ordinal_encoding?
      preprocessing_steps.deep_symbolize_keys.dig(:training, :params, :ordinal_encoding) == true
    end

    # Sorted allowed categories from the dataset preprocessor's statistics
    # for this column, with "other" appended; nil unless the column is
    # one-hot encoded.
    #
    # NOTE(review): raises NoMethodError when the statistics hold no
    # :allowed_categories for this column -- confirm that cannot happen.
    def allowed_categories
      return nil unless one_hot?

      dataset.preprocessor.statistics.dup.to_h.dig(name.to_sym, :allowed_categories).sort.concat(["other"])
    end

    private

    # before_save callback: aborts the save (adding an error on :datatype)
    # when +datatype+ is present but not a TYPE_MAP key.
    def ensure_valid_datatype
      return if datatype.blank?

      return if EasyML::Data::PolarsColumn::TYPE_MAP.key?(datatype.to_sym)

      errors.add(:datatype, "must be one of: #{EasyML::Data::PolarsColumn::TYPE_MAP.keys.join(", ")}")
      throw :abort
    end

    # Casts +value+ according to this column's +datatype+.
    #
    # @param value [Object, nil]
    # @return [Object] the converted value; nil stays nil, and the original
    #   value is returned when the conversion raises
    def convert_to_type(value)
      return value if value.nil?

      case datatype&.to_sym
      when :float
        Float(value)
      when :integer
        Integer(value)
      when :boolean
        ActiveModel::Type::Boolean.new.cast(value)
      when :datetime
        value.is_a?(String) ? Time.parse(value) : value
      else
        value.to_s
      end
    rescue ArgumentError, TypeError
      # If conversion fails, return original value
      value
    end
  end
end