easy_ml 0.1.3 → 0.2.0.pre.rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (239) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +234 -26
  3. data/Rakefile +45 -0
  4. data/app/controllers/easy_ml/application_controller.rb +67 -0
  5. data/app/controllers/easy_ml/columns_controller.rb +38 -0
  6. data/app/controllers/easy_ml/datasets_controller.rb +156 -0
  7. data/app/controllers/easy_ml/datasources_controller.rb +88 -0
  8. data/app/controllers/easy_ml/deploys_controller.rb +20 -0
  9. data/app/controllers/easy_ml/models_controller.rb +151 -0
  10. data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
  11. data/app/controllers/easy_ml/settings_controller.rb +59 -0
  12. data/app/frontend/components/AlertProvider.tsx +108 -0
  13. data/app/frontend/components/DatasetPreview.tsx +161 -0
  14. data/app/frontend/components/EmptyState.tsx +28 -0
  15. data/app/frontend/components/ModelCard.tsx +255 -0
  16. data/app/frontend/components/ModelDetails.tsx +334 -0
  17. data/app/frontend/components/ModelForm.tsx +384 -0
  18. data/app/frontend/components/Navigation.tsx +300 -0
  19. data/app/frontend/components/Pagination.tsx +72 -0
  20. data/app/frontend/components/Popover.tsx +55 -0
  21. data/app/frontend/components/PredictionStream.tsx +105 -0
  22. data/app/frontend/components/ScheduleModal.tsx +726 -0
  23. data/app/frontend/components/SearchInput.tsx +23 -0
  24. data/app/frontend/components/SearchableSelect.tsx +132 -0
  25. data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
  26. data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
  27. data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
  28. data/app/frontend/components/dataset/ColumnList.tsx +101 -0
  29. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
  30. data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
  31. data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
  32. data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
  33. data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
  34. data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
  35. data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
  36. data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
  37. data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
  38. data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
  39. data/app/frontend/components/dataset/splitters/constants.ts +77 -0
  40. data/app/frontend/components/dataset/splitters/types.ts +168 -0
  41. data/app/frontend/components/dataset/splitters/utils.ts +53 -0
  42. data/app/frontend/components/features/CodeEditor.tsx +46 -0
  43. data/app/frontend/components/features/DataPreview.tsx +150 -0
  44. data/app/frontend/components/features/FeatureCard.tsx +88 -0
  45. data/app/frontend/components/features/FeatureForm.tsx +235 -0
  46. data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
  47. data/app/frontend/components/settings/PluginSettings.tsx +81 -0
  48. data/app/frontend/components/ui/badge.tsx +44 -0
  49. data/app/frontend/components/ui/collapsible.tsx +9 -0
  50. data/app/frontend/components/ui/scroll-area.tsx +46 -0
  51. data/app/frontend/components/ui/separator.tsx +29 -0
  52. data/app/frontend/entrypoints/App.tsx +40 -0
  53. data/app/frontend/entrypoints/Application.tsx +24 -0
  54. data/app/frontend/hooks/useAutosave.ts +61 -0
  55. data/app/frontend/layouts/Layout.tsx +38 -0
  56. data/app/frontend/lib/utils.ts +6 -0
  57. data/app/frontend/mockData.ts +272 -0
  58. data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
  59. data/app/frontend/pages/DatasetsPage.tsx +261 -0
  60. data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
  61. data/app/frontend/pages/DatasourcesPage.tsx +261 -0
  62. data/app/frontend/pages/EditModelPage.tsx +45 -0
  63. data/app/frontend/pages/EditTransformationPage.tsx +56 -0
  64. data/app/frontend/pages/ModelsPage.tsx +115 -0
  65. data/app/frontend/pages/NewDatasetPage.tsx +366 -0
  66. data/app/frontend/pages/NewModelPage.tsx +45 -0
  67. data/app/frontend/pages/NewTransformationPage.tsx +43 -0
  68. data/app/frontend/pages/SettingsPage.tsx +272 -0
  69. data/app/frontend/pages/ShowModelPage.tsx +30 -0
  70. data/app/frontend/pages/TransformationsPage.tsx +95 -0
  71. data/app/frontend/styles/application.css +100 -0
  72. data/app/frontend/types/dataset.ts +146 -0
  73. data/app/frontend/types/datasource.ts +33 -0
  74. data/app/frontend/types/preprocessing.ts +1 -0
  75. data/app/frontend/types.ts +113 -0
  76. data/app/helpers/easy_ml/application_helper.rb +10 -0
  77. data/app/jobs/easy_ml/application_job.rb +21 -0
  78. data/app/jobs/easy_ml/batch_job.rb +46 -0
  79. data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
  80. data/app/jobs/easy_ml/deploy_job.rb +13 -0
  81. data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
  82. data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
  83. data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
  84. data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
  85. data/app/jobs/easy_ml/training_job.rb +62 -0
  86. data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
  87. data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
  88. data/app/models/easy_ml/cleaner.rb +82 -0
  89. data/app/models/easy_ml/column.rb +124 -0
  90. data/app/models/easy_ml/column_history.rb +30 -0
  91. data/app/models/easy_ml/column_list.rb +122 -0
  92. data/app/models/easy_ml/concerns/configurable.rb +61 -0
  93. data/app/models/easy_ml/concerns/versionable.rb +19 -0
  94. data/app/models/easy_ml/dataset.rb +767 -0
  95. data/app/models/easy_ml/dataset_history.rb +56 -0
  96. data/app/models/easy_ml/datasource.rb +182 -0
  97. data/app/models/easy_ml/datasource_history.rb +24 -0
  98. data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
  99. data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
  100. data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
  101. data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
  102. data/app/models/easy_ml/deploy.rb +114 -0
  103. data/app/models/easy_ml/event.rb +79 -0
  104. data/app/models/easy_ml/feature.rb +437 -0
  105. data/app/models/easy_ml/feature_history.rb +38 -0
  106. data/app/models/easy_ml/model.rb +575 -41
  107. data/app/models/easy_ml/model_file.rb +133 -0
  108. data/app/models/easy_ml/model_file_history.rb +24 -0
  109. data/app/models/easy_ml/model_history.rb +51 -0
  110. data/app/models/easy_ml/models/base_model.rb +58 -0
  111. data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
  112. data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
  113. data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
  114. data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
  115. data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
  116. data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
  117. data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
  118. data/app/models/easy_ml/models/xgboost.rb +544 -4
  119. data/app/models/easy_ml/prediction.rb +44 -0
  120. data/app/models/easy_ml/retraining_job.rb +278 -0
  121. data/app/models/easy_ml/retraining_run.rb +184 -0
  122. data/app/models/easy_ml/settings.rb +37 -0
  123. data/app/models/easy_ml/splitter.rb +90 -0
  124. data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
  125. data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
  126. data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
  127. data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
  128. data/app/models/easy_ml/tuner_job.rb +56 -0
  129. data/app/models/easy_ml/tuner_run.rb +31 -0
  130. data/app/models/splitter_history.rb +6 -0
  131. data/app/serializers/easy_ml/column_serializer.rb +27 -0
  132. data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
  133. data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
  134. data/app/serializers/easy_ml/feature_serializer.rb +27 -0
  135. data/app/serializers/easy_ml/model_serializer.rb +90 -0
  136. data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
  137. data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
  138. data/app/serializers/easy_ml/settings_serializer.rb +9 -0
  139. data/app/views/layouts/easy_ml/application.html.erb +15 -0
  140. data/config/initializers/resque.rb +3 -0
  141. data/config/resque-pool.yml +6 -0
  142. data/config/routes.rb +39 -0
  143. data/config/spring.rb +1 -0
  144. data/config/vite.json +15 -0
  145. data/lib/easy_ml/configuration.rb +64 -0
  146. data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
  147. data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
  148. data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
  149. data/lib/easy_ml/core/model_evaluator.rb +161 -89
  150. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
  151. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
  152. data/lib/easy_ml/core/tuner.rb +123 -62
  153. data/lib/easy_ml/core.rb +0 -3
  154. data/lib/easy_ml/core_ext/hash.rb +24 -0
  155. data/lib/easy_ml/core_ext/pathname.rb +11 -5
  156. data/lib/easy_ml/data/date_converter.rb +90 -0
  157. data/lib/easy_ml/data/filter_extensions.rb +31 -0
  158. data/lib/easy_ml/data/polars_column.rb +126 -0
  159. data/lib/easy_ml/data/polars_reader.rb +297 -0
  160. data/lib/easy_ml/data/preprocessor.rb +280 -142
  161. data/lib/easy_ml/data/simple_imputer.rb +255 -0
  162. data/lib/easy_ml/data/splits/file_split.rb +252 -0
  163. data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
  164. data/lib/easy_ml/data/splits/split.rb +95 -0
  165. data/lib/easy_ml/data/splits.rb +9 -0
  166. data/lib/easy_ml/data/statistics_learner.rb +93 -0
  167. data/lib/easy_ml/data/synced_directory.rb +341 -0
  168. data/lib/easy_ml/data.rb +6 -2
  169. data/lib/easy_ml/engine.rb +105 -6
  170. data/lib/easy_ml/feature_store.rb +227 -0
  171. data/lib/easy_ml/features.rb +61 -0
  172. data/lib/easy_ml/initializers/inflections.rb +17 -3
  173. data/lib/easy_ml/logging.rb +2 -2
  174. data/lib/easy_ml/predict.rb +74 -0
  175. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
  176. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
  177. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
  178. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
  179. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
  180. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
  181. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
  182. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
  183. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
  184. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
  185. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
  186. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
  187. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
  188. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
  189. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
  190. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
  191. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
  192. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
  193. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
  194. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
  195. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
  196. data/lib/easy_ml/support/est.rb +5 -1
  197. data/lib/easy_ml/support/file_rotate.rb +79 -15
  198. data/lib/easy_ml/support/file_support.rb +9 -0
  199. data/lib/easy_ml/support/local_file.rb +24 -0
  200. data/lib/easy_ml/support/lockable.rb +62 -0
  201. data/lib/easy_ml/support/synced_file.rb +103 -0
  202. data/lib/easy_ml/support/utc.rb +5 -1
  203. data/lib/easy_ml/support.rb +6 -3
  204. data/lib/easy_ml/version.rb +4 -1
  205. data/lib/easy_ml.rb +7 -2
  206. metadata +355 -72
  207. data/app/models/easy_ml/models.rb +0 -5
  208. data/lib/easy_ml/core/model.rb +0 -30
  209. data/lib/easy_ml/core/model_core.rb +0 -181
  210. data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
  211. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
  212. data/lib/easy_ml/core/models/xgboost.rb +0 -10
  213. data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
  214. data/lib/easy_ml/core/models.rb +0 -10
  215. data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
  216. data/lib/easy_ml/core/uploaders.rb +0 -7
  217. data/lib/easy_ml/data/dataloader.rb +0 -6
  218. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
  219. data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
  220. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
  221. data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
  222. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
  223. data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
  224. data/lib/easy_ml/data/dataset/splits.rb +0 -11
  225. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
  226. data/lib/easy_ml/data/dataset/splitters.rb +0 -9
  227. data/lib/easy_ml/data/dataset.rb +0 -430
  228. data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
  229. data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
  230. data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
  231. data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
  232. data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
  233. data/lib/easy_ml/data/datasource.rb +0 -33
  234. data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
  235. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
  236. data/lib/easy_ml/deployment.rb +0 -5
  237. data/lib/easy_ml/support/synced_directory.rb +0 -134
  238. data/lib/easy_ml/transforms.rb +0 -29
  239. /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -0,0 +1,278 @@
1
+ # == Schema Information
2
+ #
3
+ # Table name: easy_ml_retraining_jobs
4
+ #
5
+ # id :bigint not null, primary key
6
+ # model_id :bigint
7
+ # frequency :string not null
8
+ # at :json not null
9
+ # evaluator :json
10
+ # tuning_enabled :boolean default(FALSE)
11
+ # tuner_config :json
12
+ # tuning_frequency :string
13
+ # last_tuning_at :datetime
14
+ # active :boolean default(TRUE)
15
+ # status :string default("pending")
16
+ # last_run_at :datetime
17
+ # metric :string not null
18
+ # direction :string not null
19
+ # threshold :float not null
20
+ # auto_deploy :boolean default(FALSE)
21
+ # batch_mode :boolean
22
+ # batch_size :integer
23
+ # batch_overlap :integer
24
+ # batch_key :string
25
+ # created_at :datetime not null
26
+ # updated_at :datetime not null
27
+ #
28
module EasyML
  # Retraining schedule and acceptance threshold for a single EasyML::Model.
  #
  # `frequency` together with the `at` hash ("hour", plus "day_of_week" or
  # "day_of_month" depending on the frequency) decides when #should_run? is
  # true. `metric`, `direction` and `threshold` are combined by #evaluator
  # into the hash that describes when a retrained model is good enough.
  class RetrainingJob < ActiveRecord::Base
    self.table_name = "easy_ml_retraining_jobs"

    has_many :retraining_runs, class_name: "EasyML::RetrainingRun", dependent: :destroy
    has_many :tuner_jobs, through: :retraining_runs

    belongs_to :model, class_name: "EasyML::Model", inverse_of: :retraining_job
    validates :model, presence: true,
                      uniqueness: { message: "already has a retraining job" }

    VALID_FREQUENCIES = %w[day week month always].freeze
    FREQUENCY_TYPES = [
      {
        value: "day",
        label: "Daily",
        description: "Run once every day",
      },
      {
        value: "week",
        label: "Weekly",
        description: "Run once every week",
      },
      {
        value: "month",
        label: "Monthly",
        description: "Run once every month",
      },
    ].freeze

    # Keys of `at` that apply to each scheduled frequency. A frequency that is
    # absent here ("always") has no schedule keys, so `at` is not rewritten.
    AT_KEYS_BY_FREQUENCY = {
      "day" => %w[hour],
      "week" => %w[hour day_of_week],
      "month" => %w[hour day_of_month],
    }.freeze

    # Values filled in for schedule keys the caller omitted.
    AT_DEFAULTS = {
      "hour" => 0,
      "day_of_week" => 0, # Sunday
      "day_of_month" => 1,
    }.freeze

    # Inclusive range accepted for each schedule key.
    AT_RANGES = {
      "hour" => 0..23,
      "day_of_week" => 0..6,
      "day_of_month" => 1..31,
    }.freeze

    validates :frequency, presence: true, inclusion: { in: VALID_FREQUENCIES }
    validates :metric, presence: true
    validate :validate_metrics_allowed
    validates :status, presence: true
    validates :at, presence: true
    validates :threshold, presence: true
    validates :tuning_frequency, inclusion: {
                                   in: VALID_FREQUENCIES,
                                   allow_nil: true,
                                 }
    validate :evaluator_must_be_valid
    validate :validate_at_format
    after_initialize :set_direction, unless: :persisted?

    scope :active, -> { joins(:model).where(active: true) }

    # Active jobs whose schedule says they are due right now.
    #
    # @return [Array<EasyML::RetrainingJob>]
    def self.current
      active.select(&:should_run?)
    end

    # Option lists exposed under the :frequency key.
    #
    # NOTE(review): this shadows Ruby's built-in Module#constants for this
    # class; kept because callers presumably rely on the Hash shape.
    #
    # @return [Hash]
    def self.constants
      {
        frequency: FREQUENCY_TYPES,
      }
    end

    # Stored tuner configuration with the job's metric injected as the
    # objective, returned with string keys.
    #
    # Uses a non-mutating merge so that merely reading the configuration does
    # not modify the stored attribute value.
    #
    # @return [Hash{String => Object}]
    def tuner_config
      (read_attribute(:tuner_config) || {}).merge(objective: metric).stringify_keys
    end

    # Human-readable label for the job frequency ("Daily", "Weekly",
    # "Monthly").
    #
    # This class previously defined the method twice; the later definition was
    # the one in effect, and this single definition keeps its results while
    # returning nil (instead of raising) when frequency is nil.
    #
    # @return [String, nil] nil when the frequency has no entry in FREQUENCY_TYPES
    def formatted_frequency
      FREQUENCY_TYPES.find { |type| type[:value] == frequency.to_s }&.fetch(:label)
    end

    # Whether the job is due, based on `frequency`, `at` and `last_run_at`.
    #
    # A job that has never run is always due. Otherwise a job runs at most
    # once per day/week/month and only during the configured hour (and weekday
    # or day of month). Any other frequency is never due again after its first
    # run.
    #
    # @return [Boolean]
    def should_run?
      return true if last_run_at.nil?

      now = Time.current
      case frequency
      when "day"
        return false if last_run_at.to_date == now.to_date

        now.hour == schedule_value("hour")
      when "week"
        return false if last_run_at.to_date >= now.beginning_of_week

        now.wday == schedule_value("day_of_week") && now.hour == schedule_value("hour")
      when "month"
        return false if last_run_at.to_date >= now.beginning_of_month

        now.day == schedule_value("day_of_month") && now.hour == schedule_value("hour")
      else
        false
      end
    end

    # Whether hyperparameter tuning should accompany the next run.
    #
    # Requires tuning to be enabled and a tuning frequency to be set; tuning
    # that has never happened is always due. "week" only matches on weekday 0
    # and "month" only on day 1, both during the configured `at` hour.
    #
    # @return [Boolean]
    def should_tune?
      return false unless tuning_enabled
      return false unless tuning_frequency.present?
      return true if last_tuning_at.nil?

      now = Time.current
      case tuning_frequency
      when "always"
        true
      when "hour"
        last_tuning_at < now.beginning_of_hour
      when "day"
        now.hour == schedule_value("hour") && last_tuning_at < now.beginning_of_day
      when "week"
        now.hour == schedule_value("hour") && now.wday == 0 && last_tuning_at < now.beginning_of_week
      when "month"
        now.hour == schedule_value("hour") && now.day == 1 && last_tuning_at < now.beginning_of_month
      else
        false
      end
    end

    # Assigns the metric and re-derives `direction` from its evaluator.
    #
    # @param metric [String, Symbol, nil]
    def metric=(metric)
      write_attribute(:metric, metric)
      set_direction
    end

    # Evaluator description built from metric, direction and threshold.
    #
    # The threshold is placed under :max when direction is "maximize" and
    # under :min when it is "minimize"; nil entries are dropped.
    #
    # @return [Hash{Symbol => Object}]
    def evaluator
      {
        metric: metric,
        max: direction == "maximize" ? threshold : nil,
        min: direction == "minimize" ? threshold : nil,
        direction: direction,
      }.compact
    end

    private

    # Instance of the evaluator registered for `metric`.
    #
    # @return [Object, nil] nil when no metric is set or the metric is not
    #   registered, so unknown metrics surface as validation errors instead of
    #   a NoMethodError during initialization or assignment
    def metric_class
      return nil unless metric

      EasyML::Core::ModelEvaluator.get(metric)&.new
    end

    # Copies the evaluator's direction onto the record when one is available.
    def set_direction
      evaluator_instance = metric_class
      return unless evaluator_instance.present?

      write_attribute(:direction, evaluator_instance.direction)
    end

    # Reads one schedule entry from `at` as a number.
    #
    # Numeric values are returned unchanged and digit-only strings are
    # converted, because validation accepts both forms. Anything else yields
    # nil, which never equals a clock field.
    #
    # @param key [String] "hour", "day_of_week" or "day_of_month"
    # @return [Numeric, nil]
    def schedule_value(key)
      return nil unless at.is_a?(Hash)

      raw = at.key?(key) ? at[key] : at[key.to_sym]
      return raw if raw.is_a?(Numeric)

      raw.to_s.match?(/\A\d+\z/) ? raw.to_i : nil
    end

    # Normalizes and validates the `at` schedule.
    #
    # For day/week/month frequencies: keys are stringified, missing keys
    # receive AT_DEFAULTS, keys that do not belong to the frequency are
    # dropped, and each present value is range-checked. Frequencies without
    # schedule keys (such as "always") and invalid frequencies leave `at` as
    # is; the former used to raise NoMethodError here.
    def validate_at_format
      return errors.add(:at, "must be a hash") unless at.is_a?(Hash)

      schedule_keys = AT_KEYS_BY_FREQUENCY[frequency.to_s]
      return if schedule_keys.nil?

      schedule = at.transform_keys(&:to_s)
      schedule_keys.each do |key|
        schedule[key] = AT_DEFAULTS[key] unless schedule.key?(key)
      end
      self.at = schedule.select { |key, _value| schedule_keys.include?(key) }

      AT_RANGES.each do |key, range|
        next unless at[key].present?
        next if range.include?(at[key].to_i)

        errors.add(:at, "#{key} must be between #{range.first} and #{range.last}")
      end
    end

    # Start of the current period for the job frequency.
    #
    # @return [Object, nil] nil for frequencies without a period
    def current_period_start
      current_time = Time.current
      case frequency
      when "hour"
        current_time.beginning_of_hour
      when "day"
        current_time.beginning_of_day
      when "week"
        current_time.beginning_of_week
      when "month"
        current_time.beginning_of_month
      end
    end

    # Validates the hash produced by #evaluator: it needs a metric plus a
    # numeric min or max, the metric must resolve to a registered evaluator,
    # and that evaluator must respond to #evaluate.
    def evaluator_must_be_valid
      return if evaluator.blank?

      config = evaluator.symbolize_keys

      unless config[:metric].present? && (config[:min].present? || config[:max].present?)
        errors.add(:evaluator, "must specify metric and either min or max value")
        return
      end

      errors.add(:evaluator, "min value must be numeric") if config[:min].present? && !config[:min].is_a?(Numeric)

      errors.add(:evaluator, "max value must be numeric") if config[:max].present? && !config[:max].is_a?(Numeric)

      evaluator_class = EasyML::Core::ModelEvaluator.get(config[:metric].to_sym)
      unless evaluator_class.present?
        allowed_metrics = EasyML::Core::ModelEvaluator.metrics
        errors.add(:evaluator, "contains invalid metric. Allowed metrics are #{allowed_metrics}")
        return
      end

      return if evaluator_class.new.respond_to?(:evaluate)

      errors.add(:evaluator, "evaluator must implement evaluate method")
    end

    # Rejects metrics that are not registered with the model evaluator.
    #
    # NOTE(review): the error is attached to :metrics although the attribute
    # is named :metric; left unchanged in case callers read errors[:metrics].
    def validate_metrics_allowed
      return unless metric

      metric_unknown = EasyML::Core::ModelEvaluator.metrics.exclude?(metric.to_sym)
      return unless metric_unknown

      errors.add(:metrics,
                 "don't know how to handle metric #{metric}, use EasyML::Core::ModelEvaluator.register(:name, Evaluator, :regression|:classification)")
    end
  end
end
@@ -0,0 +1,184 @@
1
+ # == Schema Information
2
+ #
3
+ # Table name: easy_ml_retraining_runs
4
+ #
5
+ # id :bigint not null, primary key
6
+ # model_id :bigint
7
+ # model_history_id :bigint
8
+ # model_file_id :bigint
9
+ # retraining_job_id :bigint not null
10
+ # tuner_job_id :bigint
11
+ # status :string default("pending")
12
+ # metric_value :float
13
+ # threshold :float
14
+ # trigger :string default("manual")
15
+ # threshold_direction :string
16
+ # started_at :datetime
17
+ # completed_at :datetime
18
+ # error_message :text
19
+ # metadata :jsonb
20
+ # metrics :jsonb
21
+ # best_params :jsonb
22
+ # wandb_url :string
23
+ # snapshot_id :string
24
+ # deployable :boolean
25
+ # is_deploying :boolean
26
+ # deployed :boolean
27
+ # deploy_id :bigint
28
+ # created_at :datetime not null
29
+ # updated_at :datetime not null
30
+ #
31
module EasyML
  # One execution of a retraining job: tracks status, timing, the evaluated
  # metric against the job threshold, tuner details and deployability.
  class RetrainingRun < ActiveRecord::Base
    self.table_name = "easy_ml_retraining_runs"

    STATUSES = %w[pending running success failed deployed].freeze

    belongs_to :retraining_job
    belongs_to :model, class_name: "EasyML::Model"
    belongs_to :model_file, class_name: "EasyML::ModelFile", optional: true
    has_many :events, as: :eventable, class_name: "EasyML::Event", dependent: :destroy

    validates :status, presence: true, inclusion: { in: STATUSES }

    scope :running, -> { where(status: "running") }

    # Creates an EasyML::Deploy for this run's model and starts it.
    #
    # @param async [Boolean] forwarded to EasyML::Deploy#deploy
    # @return [Object] whatever EasyML::Deploy#deploy returns
    def deploy(async: true)
      deployment = EasyML::Deploy.create!(
        model: model,
        retraining_run: self,
      )

      deployment.deploy(async: async)
    end

    # Runs the given training block and records its outcome on this run.
    #
    # The block must return `[training_model, best_params]`. Only pending runs
    # are executed. Any exception raised along the way is recorded as a failed
    # run instead of propagating.
    #
    # @yieldreturn [Array] the trained model and the best tuner params (may be blank)
    # @return [Boolean] true when the run was processed, false when it was not
    #   pending or an error was rescued
    def wrap_training(&block)
      return false unless pending?

      begin
        EasyML::Event.create_event(self, "started")
        update!(status: "running", started_at: Time.current)

        training_model, best_params = yield

        # The most recent tuner job is only looked up when tuning produced params.
        tuner = nil
        if best_params.present?
          tuner = EasyML::TunerJob.where(model: training_model)
                                  .order(id: :desc)
                                  .first
        end

        results = metric_results(training_model)
        failed_reasons = training_model.cannot_deploy_reasons - ["Model has not changed"]

        # A run that explicitly misses the threshold (deployable == false) is
        # still recorded as "success"; otherwise remaining deploy blockers
        # mark the run as failed.
        outcome = if results[:deployable] == false
            "success"
          else
            failed_reasons.any? ? "failed" : "success"
          end

        training_model.save_model_file if outcome == "success"

        update!(
          results.merge!(
            status: outcome,
            completed_at: failed_reasons.none? ? Time.current : nil,
            error_message: failed_reasons.any? ? failed_reasons.first : nil,
            model: training_model,
            metrics: training_model.evaluate,
            best_params: best_params,
            tuner_job_id: tuner&.id,
            metadata: tuner&.metadata,
            wandb_url: tuner&.wandb_url,
            model_file_id: outcome == "success" ? training_model.model_file_id : nil,
          )
        )

        if failed_reasons.any?
          EasyML::Event.handle_error(self, failed_reasons.first)
        else
          EasyML::Event.create_event(self, outcome)
        end

        # last_tuning_at is only touched when tuning actually produced params.
        job_updates = { last_run_at: Time.current, last_tuning_at: best_params.present? ? Time.current : nil }.compact
        retraining_job.update!(job_updates)

        reload
        if deployable? && retraining_job.auto_deploy
          training_model.save_model_file
          training_model.reload
          deployment = EasyML::Deploy.create!(retraining_run: self, model: training_model, model_file: training_model.model_file, trigger: trigger)
          deployment.deploy
        end
        true
      rescue => e
        EasyML::Event.handle_error(self, e)
        update!(
          status: "failed",
          completed_at: Time.current,
          error_message: e.message,
        )
        false
      end
    end

    # @return [Boolean] true when status is "pending"
    def pending?
      status == "pending"
    end

    # @return [Boolean] true when status is "deployed"
    def deployed?
      status == "deployed"
    end

    # @return [Boolean] true when status is "success"
    def success?
      status == "success"
    end

    # @return [Boolean] true when status is "failed"
    def failed?
      status == "failed"
    end

    # @return [Boolean] true when status is "running"
    def running?
      status == "running"
    end

    # Whether this run should tune hyperparameters before training.
    #
    # @return [Boolean]
    def should_tune?
      retraining_job.tuner_config.present? && retraining_job.should_tune?
    end

    private

    # Evaluates the trained model on the dataset's test split and compares the
    # job metric against the job threshold.
    #
    # Always returns a Hash because #wrap_training indexes and merges the
    # result. Without a configured evaluator the Hash contains only
    # :deployable (previously a bare boolean was returned, which made
    # #wrap_training fail).
    #
    # @param training_model [EasyML::Model] the freshly trained model
    # @return [Hash{Symbol => Object}] :deployable, plus :metric_value,
    #   :threshold and :threshold_direction when an evaluator is configured
    def metric_results(training_model)
      return { deployable: training_model.deployable? } unless retraining_job.evaluator.present?

      training_model.dataset.refresh
      evaluator = retraining_job.evaluator.symbolize_keys
      x_true, y_true = training_model.dataset.test(split_ys: true)
      y_pred = training_model.predict(x_true)

      metric = evaluator[:metric].to_sym
      metrics = EasyML::Core::ModelEvaluator.evaluate(
        model: training_model,
        y_pred: y_pred,
        y_true: y_true,
        evaluator: evaluator,
      )
      metric_value = metrics[metric]

      # A :min entry means the metric must fall below the threshold; otherwise
      # the :max entry is used and the metric must exceed it.
      if evaluator[:min].present?
        limit = evaluator[:min]
        limit_direction = "minimize"
        meets_threshold = metric_value < limit
      else
        limit = evaluator[:max]
        limit_direction = "maximize"
        meets_threshold = metric_value > limit
      end

      {
        metric_value: metric_value,
        threshold: limit,
        threshold_direction: limit_direction,
        deployable: meets_threshold,
      }
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # == Schema Information
2
+ #
3
+ # Table name: easy_ml_settings
4
+ #
5
+ # id :bigint not null, primary key
6
+ # configuration :json
7
+ # created_at :datetime not null
8
+ # updated_at :datetime not null
9
+ #
10
+ require_relative "concerns/configurable"
11
+
12
module EasyML
  # Settings record backed by the easy_ml_settings table. Storage, S3,
  # timezone and wandb values are declared as configuration attributes through
  # EasyML::Concerns::Configurable.
  class Settings < ActiveRecord::Base
    self.table_name = "easy_ml_settings"
    include EasyML::Concerns::Configurable

    add_configuration_attributes :storage,
                                 :s3_access_key_id, :s3_secret_access_key,
                                 :s3_bucket, :s3_region, :s3_prefix, :timezone,
                                 :wandb_api_key

    # Storage may be left blank; when given it must be "file" or "s3".
    validates :storage, inclusion: { in: %w[file s3] }, if: -> { storage.present? }

    # Timezone choices offered to callers. Frozen so the shared list cannot be
    # modified at runtime.
    TIMEZONES = [
      { value: "America/New_York", label: "Eastern Time" },
      { value: "America/Chicago", label: "Central Time" },
      { value: "America/Denver", label: "Mountain Time" },
      { value: "America/Los_Angeles", label: "Pacific Time" },
    ].freeze

    # Option lists exposed under the :TIMEZONES key.
    #
    # NOTE(review): this shadows Ruby's built-in Module#constants for this
    # class; kept because callers presumably rely on the Hash shape.
    #
    # @return [Hash]
    def self.constants
      {
        TIMEZONES: TIMEZONES,
      }
    end
  end
end
@@ -0,0 +1,90 @@
1
+ # == Schema Information
2
+ #
3
+ # Table name: easy_ml_splitters
4
+ #
5
+ # id :bigint not null, primary key
6
+ # splitter_type :string not null
7
+ # configuration :json
8
+ # dataset_id :bigint not null
9
+ # created_at :datetime not null
10
+ # updated_at :datetime not null
11
+ #
12
module EasyML
  # Persisted splitter configuration for a dataset. The actual splitting is
  # delegated to an adapter class chosen by `splitter_type`.
  class Splitter < ActiveRecord::Base
    self.table_name = "easy_ml_splitters"
    include Historiographer::Silent
    historiographer_mode :snapshot_only

    include EasyML::Concerns::Configurable

    # Maps each splitter_type to the name of its adapter class.
    SPLITTER_OPTIONS = {
      "date" => "EasyML::Splitters::DateSplitter",
      "random" => "EasyML::Splitters::RandomSplitter",
      "predefined" => "EasyML::Splitters::PredefinedSplitter",
    }.freeze
    SPLITTER_TYPES = [
      {
        value: "date",
        label: "Date Splitter",
        description: "Split dataset based on date ranges for training, validation, and testing",
      },
      {
        value: "random",
        label: "Random Splitter",
        description: "Randomly split dataset into training, validation, and testing sets with configurable ratios",
      },
      {
        value: "predefined",
        label: "Predefined Splitter",
        description: "Split dataset using predefined file assignments for training, validation, and testing sets",
      },
    ].freeze

    belongs_to :dataset, class_name: "EasyML::Dataset"
    has_many :events, as: :eventable, class_name: "EasyML::Event", dependent: :destroy

    validates :splitter_type, presence: true, inclusion: { in: SPLITTER_OPTIONS.keys }

    SPLITTER_NAMES = SPLITTER_OPTIONS.keys.freeze
    SPLITTER_CONSTANTS = SPLITTER_OPTIONS.values.map(&:constantize).freeze

    # Every configuration attribute declared by any adapter is also declared
    # on this record, so the adapter can later be populated from it.
    SPLITTER_CONSTANTS.flat_map(&:configuration_attributes).each do |attribute|
      add_configuration_attributes attribute
    end

    # Splitter option list plus each adapter's default configuration keyed by
    # splitter type.
    #
    # NOTE(review): this shadows Ruby's built-in Module#constants for this
    # class; kept because callers presumably rely on the Hash shape.
    #
    # @return [Hash]
    def self.constants
      {
        SPLITTER_TYPES: SPLITTER_TYPES,
        DEFAULT_CONFIGS: SPLITTER_OPTIONS.transform_values do |class_name|
          class_name.constantize.default_config
        end,
      }
    end

    # Delegates to the adapter's #split.
    #
    # @param df [Object] passed through to the adapter unchanged
    # @yield forwarded to the adapter
    def split(df, &block)
      adapter.split(df, &block)
    end

    # Delegates to the adapter's #splits.
    def splits
      adapter.splits
    end

    private

    # Memoized adapter instance for `splitter_type`, populated with this
    # record's value for each of the adapter's configuration attributes.
    #
    # @raise [RuntimeError] when splitter_type has no registered adapter
    def adapter
      @adapter ||= begin
          class_name = SPLITTER_OPTIONS[splitter_type]
          raise "Don't know how to use splitter #{splitter_type}!" unless class_name.present?

          adapter_class = class_name.constantize
          attribute_names = adapter_class.configuration_attributes
          adapter_class.new(self).tap do |instance|
            attribute_names.each do |attr|
              instance.send("#{attr}=", send(attr))
            end
          end
        end
    end
  end
end
@@ -0,0 +1,28 @@
1
module EasyML
  module Splitters
    # Common behaviour for splitter adapters: wraps the owning splitter record
    # and applies #split_df to every batch of a datasource.
    class BaseSplitter
      include ActiveModel::Validations
      include EasyML::Concerns::Configurable

      attr_reader :splitter

      delegate :dataset, to: :splitter

      # @param splitter [Object] the record this adapter works for; must
      #   respond to #dataset
      def initialize(splitter)
        @splitter = splitter
      end

      # Iterates the datasource in batches, splitting each batch and handing
      # the result to the block when one is given.
      #
      # @param datasource [Object] must respond to #in_batches
      # @yieldparam parts the value returned by #split_df for one batch
      def split(datasource, &block)
        datasource.in_batches do |batch|
          parts = split_df(batch)
          yield parts if block_given?
          parts
        end
      end

      # Default split: returns the dataframe unchanged. Subclasses override it.
      #
      # @param df [Object]
      # @return [Object] the same df
      def split_df(df)
        df
      end
    end
  end
end
@@ -0,0 +1,91 @@
1
+ # == Schema Information
2
+ #
3
+ # Table name: easy_ml_splitters
4
+ #
5
+ # id :bigint not null, primary key
6
+ # splitter_type :string not null
7
+ # configuration :json
8
+ # dataset_id :bigint not null
9
+ # created_at :datetime not null
10
+ # updated_at :datetime not null
11
+ #
12
+ require_relative "base_splitter"
13
+
14
module EasyML
  module Splitters
    # Splits a dataframe into train / validation / test parts by a date
    # column: the most recent `months_test` months form the test set, the
    # `months_valid` months before that form the validation set, and
    # everything earlier is training data.
    class DateSplitter < BaseSplitter
      validates :date_col, presence: true
      validates :months_test, presence: true, numericality: { greater_than: 0 }
      validates :months_valid, presence: true, numericality: { greater_than: 0 }

      attr_accessor :date_col, :months_test, :months_valid
      # The reader for `today` is defined explicitly further down.
      attr_writer :today

      add_configuration_attributes :today, :date_col, :months_test, :months_valid

      # @return [Hash] configuration used when nothing has been chosen yet
      def self.default_config
        {
          date_col: "",
          months_test: 2,
          months_valid: 2,
        }
      end

      # Partitions one dataframe by the date column.
      #
      # Rows whose date is null are appended to the test part.
      #
      # @param df [Object] dataframe containing `date_col`
      # @return [Array] `[train_df, valid_df, test_df]`
      # @raise [RuntimeError] when date_col is blank or is not a datetime
      #   column after conversion
      def split_df(df)
        raise "Split by date requires argument: date_col" unless date_col.present?

        df = EasyML::Data::DateConverter.maybe_convert_date(df, date_col)

        column_dtype = df[date_col].dtype
        unless column_dtype.is_a?(Polars::Datetime)
          raise "Date splitter cannot split on non-date col #{date_col}, dtype is #{column_dtype}"
        end

        valid_start, test_start = splits

        recent_rows = df.filter(Polars.col(date_col) >= test_start)
        undated_rows = df.filter(Polars.col(date_col).is_null)
        older_rows = df.filter(Polars.col(date_col) < test_start)

        [
          older_rows.filter(Polars.col(date_col) < valid_start),
          older_rows.filter(Polars.col(date_col) >= valid_start),
          Polars.concat([recent_rows, undated_rows]),
        ]
      end

      # @param n [Numeric] number of months
      # @return [ActiveSupport::Duration]
      def months(n)
        ActiveSupport::Duration.months(n)
      end

      # Start dates of the validation and test windows, counted back from the
      # newest date in the datasource (or from #today when that is missing).
      #
      # @return [Array] `[validation_date_start, test_date_start]`
      def splits
        anchor = to_datetime(datasource_end) || today
        test_start = anchor.advance(months: -months_test).beginning_of_day
        valid_start = test_start.advance(months: -months_valid).beginning_of_day
        [valid_start, test_start]
      end

      # Newest value of the date column, fetched from the dataset's datasource
      # and memoized once a truthy value has been found.
      def datasource_end
        @datasource_end ||= dataset.datasource.query(
          sort: date_col,
          descending: true,
          limit: 1,
          select: date_col,
        )[date_col]&.to_a&.first
      end

      # Parses strings with UTC.parse, substitutes `default` for nil, and
      # passes any other value through untouched.
      #
      # @param field [String, nil, Object]
      # @param default [Object] returned when field is nil
      def to_datetime(field, default: nil)
        return default if field.nil?
        return UTC.parse(field) if field.is_a?(String)

        field
      end

      # Configured reference date, falling back to UTC.today when unset.
      def today
        to_datetime(@today, default: UTC.today)
      end
    end
  end
end