easy_ml 0.1.4 → 0.2.0.pre.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (239) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +234 -26
  3. data/Rakefile +45 -0
  4. data/app/controllers/easy_ml/application_controller.rb +67 -0
  5. data/app/controllers/easy_ml/columns_controller.rb +38 -0
  6. data/app/controllers/easy_ml/datasets_controller.rb +156 -0
  7. data/app/controllers/easy_ml/datasources_controller.rb +88 -0
  8. data/app/controllers/easy_ml/deploys_controller.rb +20 -0
  9. data/app/controllers/easy_ml/models_controller.rb +151 -0
  10. data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
  11. data/app/controllers/easy_ml/settings_controller.rb +59 -0
  12. data/app/frontend/components/AlertProvider.tsx +108 -0
  13. data/app/frontend/components/DatasetPreview.tsx +161 -0
  14. data/app/frontend/components/EmptyState.tsx +28 -0
  15. data/app/frontend/components/ModelCard.tsx +255 -0
  16. data/app/frontend/components/ModelDetails.tsx +334 -0
  17. data/app/frontend/components/ModelForm.tsx +384 -0
  18. data/app/frontend/components/Navigation.tsx +300 -0
  19. data/app/frontend/components/Pagination.tsx +72 -0
  20. data/app/frontend/components/Popover.tsx +55 -0
  21. data/app/frontend/components/PredictionStream.tsx +105 -0
  22. data/app/frontend/components/ScheduleModal.tsx +726 -0
  23. data/app/frontend/components/SearchInput.tsx +23 -0
  24. data/app/frontend/components/SearchableSelect.tsx +132 -0
  25. data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
  26. data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
  27. data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
  28. data/app/frontend/components/dataset/ColumnList.tsx +101 -0
  29. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
  30. data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
  31. data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
  32. data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
  33. data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
  34. data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
  35. data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
  36. data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
  37. data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
  38. data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
  39. data/app/frontend/components/dataset/splitters/constants.ts +77 -0
  40. data/app/frontend/components/dataset/splitters/types.ts +168 -0
  41. data/app/frontend/components/dataset/splitters/utils.ts +53 -0
  42. data/app/frontend/components/features/CodeEditor.tsx +46 -0
  43. data/app/frontend/components/features/DataPreview.tsx +150 -0
  44. data/app/frontend/components/features/FeatureCard.tsx +88 -0
  45. data/app/frontend/components/features/FeatureForm.tsx +235 -0
  46. data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
  47. data/app/frontend/components/settings/PluginSettings.tsx +81 -0
  48. data/app/frontend/components/ui/badge.tsx +44 -0
  49. data/app/frontend/components/ui/collapsible.tsx +9 -0
  50. data/app/frontend/components/ui/scroll-area.tsx +46 -0
  51. data/app/frontend/components/ui/separator.tsx +29 -0
  52. data/app/frontend/entrypoints/App.tsx +40 -0
  53. data/app/frontend/entrypoints/Application.tsx +24 -0
  54. data/app/frontend/hooks/useAutosave.ts +61 -0
  55. data/app/frontend/layouts/Layout.tsx +38 -0
  56. data/app/frontend/lib/utils.ts +6 -0
  57. data/app/frontend/mockData.ts +272 -0
  58. data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
  59. data/app/frontend/pages/DatasetsPage.tsx +261 -0
  60. data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
  61. data/app/frontend/pages/DatasourcesPage.tsx +261 -0
  62. data/app/frontend/pages/EditModelPage.tsx +45 -0
  63. data/app/frontend/pages/EditTransformationPage.tsx +56 -0
  64. data/app/frontend/pages/ModelsPage.tsx +115 -0
  65. data/app/frontend/pages/NewDatasetPage.tsx +366 -0
  66. data/app/frontend/pages/NewModelPage.tsx +45 -0
  67. data/app/frontend/pages/NewTransformationPage.tsx +43 -0
  68. data/app/frontend/pages/SettingsPage.tsx +272 -0
  69. data/app/frontend/pages/ShowModelPage.tsx +30 -0
  70. data/app/frontend/pages/TransformationsPage.tsx +95 -0
  71. data/app/frontend/styles/application.css +100 -0
  72. data/app/frontend/types/dataset.ts +146 -0
  73. data/app/frontend/types/datasource.ts +33 -0
  74. data/app/frontend/types/preprocessing.ts +1 -0
  75. data/app/frontend/types.ts +113 -0
  76. data/app/helpers/easy_ml/application_helper.rb +10 -0
  77. data/app/jobs/easy_ml/application_job.rb +21 -0
  78. data/app/jobs/easy_ml/batch_job.rb +46 -0
  79. data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
  80. data/app/jobs/easy_ml/deploy_job.rb +13 -0
  81. data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
  82. data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
  83. data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
  84. data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
  85. data/app/jobs/easy_ml/training_job.rb +62 -0
  86. data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
  87. data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
  88. data/app/models/easy_ml/cleaner.rb +82 -0
  89. data/app/models/easy_ml/column.rb +124 -0
  90. data/app/models/easy_ml/column_history.rb +30 -0
  91. data/app/models/easy_ml/column_list.rb +122 -0
  92. data/app/models/easy_ml/concerns/configurable.rb +61 -0
  93. data/app/models/easy_ml/concerns/versionable.rb +19 -0
  94. data/app/models/easy_ml/dataset.rb +767 -0
  95. data/app/models/easy_ml/dataset_history.rb +56 -0
  96. data/app/models/easy_ml/datasource.rb +182 -0
  97. data/app/models/easy_ml/datasource_history.rb +24 -0
  98. data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
  99. data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
  100. data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
  101. data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
  102. data/app/models/easy_ml/deploy.rb +114 -0
  103. data/app/models/easy_ml/event.rb +79 -0
  104. data/app/models/easy_ml/feature.rb +437 -0
  105. data/app/models/easy_ml/feature_history.rb +38 -0
  106. data/app/models/easy_ml/model.rb +575 -41
  107. data/app/models/easy_ml/model_file.rb +133 -0
  108. data/app/models/easy_ml/model_file_history.rb +24 -0
  109. data/app/models/easy_ml/model_history.rb +51 -0
  110. data/app/models/easy_ml/models/base_model.rb +58 -0
  111. data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
  112. data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
  113. data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
  114. data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
  115. data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
  116. data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
  117. data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
  118. data/app/models/easy_ml/models/xgboost.rb +544 -5
  119. data/app/models/easy_ml/prediction.rb +44 -0
  120. data/app/models/easy_ml/retraining_job.rb +278 -0
  121. data/app/models/easy_ml/retraining_run.rb +184 -0
  122. data/app/models/easy_ml/settings.rb +37 -0
  123. data/app/models/easy_ml/splitter.rb +90 -0
  124. data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
  125. data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
  126. data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
  127. data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
  128. data/app/models/easy_ml/tuner_job.rb +56 -0
  129. data/app/models/easy_ml/tuner_run.rb +31 -0
  130. data/app/models/splitter_history.rb +6 -0
  131. data/app/serializers/easy_ml/column_serializer.rb +27 -0
  132. data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
  133. data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
  134. data/app/serializers/easy_ml/feature_serializer.rb +27 -0
  135. data/app/serializers/easy_ml/model_serializer.rb +90 -0
  136. data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
  137. data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
  138. data/app/serializers/easy_ml/settings_serializer.rb +9 -0
  139. data/app/views/layouts/easy_ml/application.html.erb +15 -0
  140. data/config/initializers/resque.rb +3 -0
  141. data/config/resque-pool.yml +6 -0
  142. data/config/routes.rb +39 -0
  143. data/config/spring.rb +1 -0
  144. data/config/vite.json +15 -0
  145. data/lib/easy_ml/configuration.rb +64 -0
  146. data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
  147. data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
  148. data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
  149. data/lib/easy_ml/core/model_evaluator.rb +161 -89
  150. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
  151. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
  152. data/lib/easy_ml/core/tuner.rb +123 -62
  153. data/lib/easy_ml/core.rb +0 -3
  154. data/lib/easy_ml/core_ext/hash.rb +24 -0
  155. data/lib/easy_ml/core_ext/pathname.rb +11 -5
  156. data/lib/easy_ml/data/date_converter.rb +90 -0
  157. data/lib/easy_ml/data/filter_extensions.rb +31 -0
  158. data/lib/easy_ml/data/polars_column.rb +126 -0
  159. data/lib/easy_ml/data/polars_reader.rb +297 -0
  160. data/lib/easy_ml/data/preprocessor.rb +280 -142
  161. data/lib/easy_ml/data/simple_imputer.rb +255 -0
  162. data/lib/easy_ml/data/splits/file_split.rb +252 -0
  163. data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
  164. data/lib/easy_ml/data/splits/split.rb +95 -0
  165. data/lib/easy_ml/data/splits.rb +9 -0
  166. data/lib/easy_ml/data/statistics_learner.rb +93 -0
  167. data/lib/easy_ml/data/synced_directory.rb +341 -0
  168. data/lib/easy_ml/data.rb +6 -2
  169. data/lib/easy_ml/engine.rb +105 -6
  170. data/lib/easy_ml/feature_store.rb +227 -0
  171. data/lib/easy_ml/features.rb +61 -0
  172. data/lib/easy_ml/initializers/inflections.rb +17 -3
  173. data/lib/easy_ml/logging.rb +2 -2
  174. data/lib/easy_ml/predict.rb +74 -0
  175. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
  176. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
  177. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
  178. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
  179. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
  180. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
  181. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
  182. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
  183. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
  184. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
  185. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
  186. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
  187. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
  188. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
  189. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
  190. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
  191. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
  192. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
  193. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
  194. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
  195. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
  196. data/lib/easy_ml/support/est.rb +5 -1
  197. data/lib/easy_ml/support/file_rotate.rb +79 -15
  198. data/lib/easy_ml/support/file_support.rb +9 -0
  199. data/lib/easy_ml/support/local_file.rb +24 -0
  200. data/lib/easy_ml/support/lockable.rb +62 -0
  201. data/lib/easy_ml/support/synced_file.rb +103 -0
  202. data/lib/easy_ml/support/utc.rb +5 -1
  203. data/lib/easy_ml/support.rb +6 -3
  204. data/lib/easy_ml/version.rb +4 -1
  205. data/lib/easy_ml.rb +7 -2
  206. metadata +355 -72
  207. data/app/models/easy_ml/models.rb +0 -5
  208. data/lib/easy_ml/core/model.rb +0 -30
  209. data/lib/easy_ml/core/model_core.rb +0 -181
  210. data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
  211. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
  212. data/lib/easy_ml/core/models/xgboost.rb +0 -10
  213. data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
  214. data/lib/easy_ml/core/models.rb +0 -10
  215. data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
  216. data/lib/easy_ml/core/uploaders.rb +0 -7
  217. data/lib/easy_ml/data/dataloader.rb +0 -6
  218. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
  219. data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
  220. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
  221. data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
  222. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
  223. data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
  224. data/lib/easy_ml/data/dataset/splits.rb +0 -11
  225. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
  226. data/lib/easy_ml/data/dataset/splitters.rb +0 -9
  227. data/lib/easy_ml/data/dataset.rb +0 -430
  228. data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
  229. data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
  230. data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
  231. data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
  232. data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
  233. data/lib/easy_ml/data/datasource.rb +0 -33
  234. data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
  235. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
  236. data/lib/easy_ml/deployment.rb +0 -5
  237. data/lib/easy_ml/support/synced_directory.rb +0 -134
  238. data/lib/easy_ml/transforms.rb +0 -29
  239. /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -0,0 +1,278 @@
1
+ # == Schema Information
2
+ #
3
+ # Table name: easy_ml_retraining_jobs
4
+ #
5
+ # id :bigint not null, primary key
6
+ # model_id :bigint
7
+ # frequency :string not null
8
+ # at :json not null
9
+ # evaluator :json
10
+ # tuning_enabled :boolean default(FALSE)
11
+ # tuner_config :json
12
+ # tuning_frequency :string
13
+ # last_tuning_at :datetime
14
+ # active :boolean default(TRUE)
15
+ # status :string default("pending")
16
+ # last_run_at :datetime
17
+ # metric :string not null
18
+ # direction :string not null
19
+ # threshold :float not null
20
+ # auto_deploy :boolean default(FALSE)
21
+ # batch_mode :boolean
22
+ # batch_size :integer
23
+ # batch_overlap :integer
24
+ # batch_key :string
25
+ # created_at :datetime not null
26
+ # updated_at :datetime not null
27
+ #
28
+ module EasyML
29
+ class RetrainingJob < ActiveRecord::Base
30
+ self.table_name = "easy_ml_retraining_jobs"
31
+
32
+ has_many :retraining_runs, class_name: "EasyML::RetrainingRun", dependent: :destroy
33
+ has_many :tuner_jobs, through: :retraining_runs
34
+
35
+ belongs_to :model, class_name: "EasyML::Model", inverse_of: :retraining_job
36
+ validates :model, presence: true,
37
+ uniqueness: { message: "already has a retraining job" }
38
+
39
+ VALID_FREQUENCIES = %w[day week month always].freeze
40
+ FREQUENCY_TYPES = [
41
+ {
42
+ value: "day",
43
+ label: "Daily",
44
+ description: "Run once every day",
45
+ },
46
+ {
47
+ value: "week",
48
+ label: "Weekly",
49
+ description: "Run once every week",
50
+ },
51
+ {
52
+ value: "month",
53
+ label: "Monthly",
54
+ description: "Run once every month",
55
+ },
56
+ ].freeze
57
+ validates :frequency, presence: true, inclusion: { in: VALID_FREQUENCIES }
58
+ validates :metric, presence: true
59
+ validate :validate_metrics_allowed
60
+ validates :status, presence: true
61
+ validates :at, presence: true
62
+ validates :threshold, presence: true
63
+ validates :tuning_frequency, inclusion: {
64
+ in: VALID_FREQUENCIES,
65
+ allow_nil: true,
66
+ }
67
+ validate :evaluator_must_be_valid
68
+ validate :validate_at_format
69
+ after_initialize :set_direction, unless: :persisted?
70
+
71
+ scope :active, -> { joins(:model).where(active: true) }
72
+
73
# Active retraining jobs that are due to run right now.
#
# The schedule check is done in Ruby via #should_run?, so the +active+
# scope is filtered in memory rather than in SQL.
def self.current
  active.select(&:should_run?)
end
78
+
79
+ def self.constants
80
+ {
81
+ frequency: FREQUENCY_TYPES,
82
+ }
83
+ end
84
+
85
# Tuner configuration with the job's metric injected as the tuning objective.
#
# Returns a new Hash with string keys in which "objective" is always the
# current +metric+. The stored attribute is left untouched: the previous
# implementation used +merge!+, which added a symbol +:objective+ key to the
# hash returned by +read_attribute+ every time this reader was called.
def tuner_config
  stored = read_attribute(:tuner_config) || {}
  stored.transform_keys(&:to_s).merge("objective" => metric)
end
88
+
89
# Human-readable schedule label: "Manually" for inactive jobs, otherwise the
# label of the FREQUENCY_TYPES entry matching +frequency+.
#
# Returns nil when the frequency has no FREQUENCY_TYPES entry (e.g. "always",
# which is in VALID_FREQUENCIES but not in FREQUENCY_TYPES) instead of raising
# NoMethodError on nil.
#
# NOTE(review): a second #formatted_frequency is defined later in this class;
# being defined last, that one is the definition that takes effect. Decide
# which behavior is intended and delete the other.
def formatted_frequency
  return "Manually" unless active

  FREQUENCY_TYPES.find { |type| type[:value] == frequency }&.fetch(:label)
end
96
+
97
# Whether this job is due to run at the current time.
#
# A job that has never run is always due. Otherwise the job is due when it
# has not already run in the current period (day / week / month) and the
# current hour (plus weekday or day of month) matches the +at+ schedule.
# Any other frequency is never due once it has run.
#
# Schedule values are compared as integers: validate_at_format accepts
# numeric strings (it checks them with +to_i+), and comparing an Integer with
# a String is always false, so a job stored with "hour" => "3" would
# otherwise never run.
def should_run?
  return true if last_run_at.nil?

  now = Time.current
  last_run_on = last_run_at.to_date
  # A missing schedule key never matches (same as the previous behavior).
  at_matches = ->(key, actual) { !at[key].nil? && at[key].to_i == actual }

  case frequency
  when "day"
    return false if last_run_on == now.to_date

    at_matches.call("hour", now.hour)
  when "week"
    return false if last_run_on >= now.beginning_of_week

    at_matches.call("day_of_week", now.wday) && at_matches.call("hour", now.hour)
  when "month"
    return false if last_run_on >= now.beginning_of_month

    at_matches.call("day_of_month", now.day) && at_matches.call("hour", now.hour)
  else
    false
  end
end
117
+
118
# Whether hyperparameter tuning is due for this job.
#
# Tuning is never due when it is disabled or has no frequency, and always due
# when it has never happened. Otherwise it depends on +tuning_frequency+:
# "always" is always due; "hour" is due once per clock hour; "day", "week"
# and "month" are due at the scheduled hour, only on wday 0 (week) or day 1
# (month), and only if the last tuning predates the start of the period.
#
# Always returns true or false (the previous version returned nil for an
# unrecognized frequency).
def should_tune?
  return false unless tuning_enabled
  return false unless tuning_frequency.present?
  return true if last_tuning_at.nil?

  now = Time.current
  # Compare as integers: validate_at_format accepts numeric strings for
  # "hour", and Integer == String is always false. Evaluated lazily so the
  # "always"/"hour" branches never read +at+.
  scheduled_hour = -> { !at["hour"].nil? && at["hour"].to_i == now.hour }

  case tuning_frequency
  when "always"
    true
  when "hour"
    last_tuning_at < now.beginning_of_hour
  when "day"
    scheduled_hour.call && last_tuning_at < now.beginning_of_day
  when "week"
    scheduled_hour.call && now.wday == 0 && last_tuning_at < now.beginning_of_week
  when "month"
    scheduled_hour.call && now.day == 1 && last_tuning_at < now.beginning_of_month
  else
    false
  end
end
139
+
140
+ def metric=(metric)
141
+ write_attribute(:metric, metric)
142
+ set_direction
143
+ end
144
+
145
# Evaluator configuration derived from metric, direction and threshold.
#
# The threshold is exposed as :max when the direction is "maximize" and as
# :min when it is "minimize"; nil entries are dropped from the result.
def evaluator
  bound_key = { "maximize" => :max, "minimize" => :min }[direction]

  config = { metric: metric }
  config[bound_key] = threshold if bound_key
  config[:direction] = direction
  config.compact
end
153
+
154
# Human-readable label for +frequency+ ("Monthly", "Weekly" or "Daily").
#
# Returns nil for any other frequency, including a nil frequency (which
# previously raised NoMethodError from +nil.to_sym+).
#
# NOTE(review): this redefines an earlier #formatted_frequency in the same
# class (the one that returns "Manually" for inactive jobs); being defined
# last, this is the definition that takes effect.
def formatted_frequency
  labels = { month: "Monthly", week: "Weekly", day: "Daily" }
  labels[frequency&.to_sym]
end
161
+
162
+ private
163
+
164
# Instance of the evaluator registered for +metric+, or nil.
#
# Returns nil when no metric is set, and also when the lookup yields nothing.
# This method is reached from the +metric=+ writer and the after_initialize
# hook (via set_direction), i.e. before validations run, so an unknown metric
# must not raise here; validate_metrics_allowed reports it instead.
# NOTE(review): assumes ModelEvaluator.get returns nil for an unregistered
# metric, as the present? check in evaluator_must_be_valid suggests - confirm.
def metric_class
  return nil unless metric

  EasyML::Core::ModelEvaluator.get(metric)&.new
end
169
+
170
# Copies the evaluator's direction into the +direction+ attribute.
#
# Does nothing when no evaluator is available for the current metric. The
# evaluator is looked up once; the previous version called metric_class
# twice and so built two evaluator instances per call.
def set_direction
  metric_evaluator = metric_class
  return unless metric_evaluator.present?

  write_attribute(:direction, metric_evaluator.direction)
end
175
+
176
# Validates and normalizes the +at+ schedule hash for the current frequency.
#
# - Adds an error when +at+ is not a Hash.
# - Skips frequencies outside VALID_FREQUENCIES (the inclusion validation
#   reports those) and valid frequencies that define no schedule keys.
# - Fills missing schedule keys with defaults (hour 0, day_of_week 0 = Sunday,
#   day_of_month 1), drops keys that do not belong to the frequency, and
#   range-checks the remaining values after +to_i+.
#
# Fix: "always" is in VALID_FREQUENCIES but had no branch in the key tables,
# so the key list was nil and the validation raised NoMethodError instead of
# completing.
def validate_at_format
  return errors.add(:at, "must be a hash") unless at.is_a?(Hash)
  return unless VALID_FREQUENCIES.include?(frequency.to_s)

  schedule_keys = {
    "day" => %w[hour],
    "week" => %w[hour day_of_week],
    "month" => %w[hour day_of_month],
  }[frequency.to_s]
  # Nothing to normalize for a frequency without schedule keys ("always").
  return if schedule_keys.nil?

  defaults = {
    "hour" => 0,
    "day_of_week" => 0, # Sunday
    "day_of_month" => 1,
  }
  (schedule_keys - at.keys.map(&:to_s)).each do |key|
    at[key] = defaults[key]
  end

  # Keep only the keys that are meaningful for this frequency.
  self.at = at.select { |key, _value| schedule_keys.include?(key.to_s) }.to_h

  valid_ranges = {
    "hour" => 0..23,
    "day_of_week" => 0..6,
    "day_of_month" => 1..31,
  }
  valid_ranges.each do |key, range|
    next unless at[key].present?
    next if range.include?(at[key].to_i)

    errors.add(:at, "#{key} must be between #{range.first} and #{range.last}")
  end
end
225
+
226
# Start of the current period for this job's frequency: the beginning of the
# current hour, day, week or month. Returns nil for any other frequency.
def current_period_start
  now = Time.current
  boundary = {
    "hour" => :beginning_of_hour,
    "day" => :beginning_of_day,
    "week" => :beginning_of_week,
    "month" => :beginning_of_month,
  }[frequency]

  now.public_send(boundary) if boundary
end
239
+
240
# Validates the evaluator configuration returned by #evaluator.
#
# A blank configuration is accepted. Otherwise it must name a metric and
# carry a min or max bound, the bounds must be Numeric, the metric must
# resolve through EasyML::Core::ModelEvaluator.get, and the resolved class
# must produce instances that respond to #evaluate.
def evaluator_must_be_valid
  config = evaluator
  return if config.blank?

  config = config.symbolize_keys
  has_bound = config[:min].present? || config[:max].present?
  unless config[:metric].present? && has_bound
    errors.add(:evaluator, "must specify metric and either min or max value")
    return
  end

  %i[min max].each do |bound|
    value = config[bound]
    errors.add(:evaluator, "#{bound} value must be numeric") if value.present? && !value.is_a?(Numeric)
  end

  evaluator_class = EasyML::Core::ModelEvaluator.get(config[:metric].to_sym)
  if evaluator_class.present?
    return if evaluator_class.new.respond_to?(:evaluate)

    errors.add(:evaluator, "evaluator must implement evaluate method")
  else
    allowed_metrics = EasyML::Core::ModelEvaluator.metrics
    errors.add(:evaluator, "contains invalid metric. Allowed metrics are #{allowed_metrics}")
  end
end
268
+
269
# Adds an error when +metric+ is set but is not one of
# EasyML::Core::ModelEvaluator.metrics. The error is recorded under :metrics.
def validate_metrics_allowed
  return unless metric
  return if EasyML::Core::ModelEvaluator.metrics.include?(metric.to_sym)

  errors.add(
    :metrics,
    "don't know how to handle metric #{metric}, use EasyML::Core::ModelEvaluator.register(:name, Evaluator, :regression|:classification)"
  )
end
277
+ end
278
+ end
@@ -0,0 +1,184 @@
1
+ # == Schema Information
2
+ #
3
+ # Table name: easy_ml_retraining_runs
4
+ #
5
+ # id :bigint not null, primary key
6
+ # model_id :bigint
7
+ # model_history_id :bigint
8
+ # model_file_id :bigint
9
+ # retraining_job_id :bigint not null
10
+ # tuner_job_id :bigint
11
+ # status :string default("pending")
12
+ # metric_value :float
13
+ # threshold :float
14
+ # trigger :string default("manual")
15
+ # threshold_direction :string
16
+ # started_at :datetime
17
+ # completed_at :datetime
18
+ # error_message :text
19
+ # metadata :jsonb
20
+ # metrics :jsonb
21
+ # best_params :jsonb
22
+ # wandb_url :string
23
+ # snapshot_id :string
24
+ # deployable :boolean
25
+ # is_deploying :boolean
26
+ # deployed :boolean
27
+ # deploy_id :bigint
28
+ # created_at :datetime not null
29
+ # updated_at :datetime not null
30
+ #
31
+ module EasyML
32
+ class RetrainingRun < ActiveRecord::Base
33
+ self.table_name = "easy_ml_retraining_runs"
34
+
35
+ belongs_to :retraining_job
36
+ belongs_to :model, class_name: "EasyML::Model"
37
+ belongs_to :model_file, class_name: "EasyML::ModelFile", optional: true
38
+ has_many :events, as: :eventable, class_name: "EasyML::Event", dependent: :destroy
39
+
40
+ validates :status, presence: true, inclusion: { in: %w[pending running success failed deployed] }
41
+
42
+ scope :running, -> { where(status: "running") }
43
+
44
+ def deploy(async: true)
45
+ deploy = EasyML::Deploy.create!(
46
+ model: model,
47
+ retraining_run: self,
48
+ )
49
+
50
+ deploy.deploy(async: async)
51
+ end
52
+
53
# Runs one training attempt for this run and records its outcome.
#
# The block does the actual training and must return
# [training_model, best_params]. Only a pending run is executed.
#
# Returns false when the run is not pending or when an exception was rescued,
# true otherwise (including when the run was marked "failed" because the
# model reported reasons it cannot be deployed).
def wrap_training(&block)
  return false unless pending?

  begin
    EasyML::Event.create_event(self, "started")
    update!(status: "running", started_at: Time.current)

    training_model, best_params = yield

    # best_params being present is taken to mean tuning ran; the most recent
    # tuner job for the model is then attached below. Otherwise +tuner+ stays
    # nil and the tuner_* columns are written as nil.
    if best_params.present?
      tuner = EasyML::TunerJob.where(model: training_model)
                              .order(id: :desc)
                              .first
    end

    results = metric_results(training_model)
    # "Model has not changed" is not treated as a failure reason here.
    failed_reasons = training_model.cannot_deploy_reasons - ["Model has not changed"]
    # A model that merely missed the metric threshold is a successful run;
    # any other outcome fails only if the model lists a remaining reason.
    if results[:deployable] == false
      status = "success"
    else
      status = failed_reasons.any? ? "failed" : "success"
    end

    if status == "success"
      training_model.save_model_file
    end

    # NOTE(review): completed_at and error_message follow failed_reasons, not
    # +status+, so a "success" run that missed the threshold but still has
    # failure reasons is stored with an error message and no completed_at.
    update!(
      results.merge!(
        status: status,
        completed_at: failed_reasons.none? ? Time.current : nil,
        error_message: failed_reasons.any? ? failed_reasons&.first : nil,
        model: training_model,
        metrics: training_model.evaluate,
        best_params: best_params,
        tuner_job_id: tuner&.id,
        metadata: tuner&.metadata,
        wandb_url: tuner&.wandb_url,
        model_file_id: status == "success" ? training_model.model_file_id : nil,
      )
    )

    if failed_reasons.any?
      EasyML::Event.handle_error(self, failed_reasons.first)
    else
      EasyML::Event.create_event(self, status)
    end
    # last_tuning_at is only touched when tuning produced parameters
    # (nil entries are removed by compact).
    params = { last_run_at: Time.current, last_tuning_at: best_params.present? ? Time.current : nil }.compact
    retraining_job.update!(params)

    # Re-read the persisted row before deciding on auto-deploy.
    reload
    if deployable? && retraining_job.auto_deploy
      training_model.save_model_file
      training_model.reload
      deploy = EasyML::Deploy.create!(retraining_run: self, model: training_model, model_file: training_model.model_file, trigger: trigger)
      deploy.deploy
    end
    true
  rescue => e
    # Any StandardError raised above marks the run as failed.
    # NOTE(review): update! can itself raise here, which would propagate.
    EasyML::Event.handle_error(self, e)
    update!(
      status: "failed",
      completed_at: Time.current,
      error_message: e.message,
    )
    false
  end
end
+
122
+ def pending?
123
+ status == "pending"
124
+ end
125
+
126
+ def deployed?
127
+ status == "deployed"
128
+ end
129
+
130
+ def success?
131
+ status == "success"
132
+ end
133
+
134
+ def failed?
135
+ status == "failed"
136
+ end
137
+
138
+ def running?
139
+ status == "running"
140
+ end
141
+
142
+ def should_tune?
143
+ retraining_job.tuner_config.present? && retraining_job.should_tune?
144
+ end
145
+
146
+ private
147
+
148
# Evaluates the trained model against the retraining job's evaluator.
#
# Always returns a Hash, because wrap_training reads +results[:deployable]+
# and merges the result into +update!+:
# - without an evaluator: { deployable: training_model.deployable? }
#   (this branch previously returned a bare boolean, which would raise on
#   both of those calls);
# - with an evaluator: metric_value, threshold, threshold_direction and
#   deployable, where deployable means the metric is below a :min threshold
#   or above a :max threshold.
def metric_results(training_model)
  return { deployable: training_model.deployable? } unless retraining_job.evaluator.present?

  training_model.dataset.refresh
  evaluator = retraining_job.evaluator.symbolize_keys
  x_true, y_true = training_model.dataset.test(split_ys: true)
  y_pred = training_model.predict(x_true)

  metric = evaluator[:metric].to_sym
  metrics = EasyML::Core::ModelEvaluator.evaluate(
    model: training_model,
    y_pred: y_pred,
    y_true: y_true,
    evaluator: evaluator,
  )
  metric_value = metrics[metric]

  # A :min bound takes precedence; otherwise the :max bound is used.
  if evaluator[:min].present?
    threshold = evaluator[:min]
    threshold_direction = "minimize"
    deployable = metric_value < threshold
  else
    threshold = evaluator[:max]
    threshold_direction = "maximize"
    deployable = metric_value > threshold
  end

  {
    metric_value: metric_value,
    threshold: threshold,
    threshold_direction: threshold_direction,
    deployable: deployable,
  }
end
183
+ end
184
+ end
@@ -0,0 +1,37 @@
1
+ # == Schema Information
2
+ #
3
+ # Table name: easy_ml_settings
4
+ #
5
+ # id :bigint not null, primary key
6
+ # configuration :json
7
+ # created_at :datetime not null
8
+ # updated_at :datetime not null
9
+ #
10
+ require_relative "concerns/configurable"
11
+
12
module EasyML
  # Settings record for EasyML, backed by the easy_ml_settings table.
  #
  # The individual settings (storage backend, S3 credentials and location,
  # timezone, Weights & Biases API key) are declared through
  # add_configuration_attributes from the Configurable concern.
  class Settings < ActiveRecord::Base
    self.table_name = "easy_ml_settings"
    include EasyML::Concerns::Configurable

    add_configuration_attributes :storage,
                                 :s3_access_key_id, :s3_secret_access_key,
                                 :s3_bucket, :s3_region, :s3_prefix, :timezone,
                                 :wandb_api_key

    # storage is optional, but when given it must be one of the known backends.
    validates :storage, inclusion: { in: %w[file s3] }, if: -> { storage.present? }

    # Timezone choices as value/label pairs. Frozen so the shared constant
    # cannot be modified at runtime.
    TIMEZONES = [
      { value: "America/New_York", label: "Eastern Time" },
      { value: "America/Chicago", label: "Central Time" },
      { value: "America/Denver", label: "Mountain Time" },
      { value: "America/Los_Angeles", label: "Pacific Time" },
    ].freeze

    # Constants exposed by this model, keyed by name.
    # NOTE: this overrides Module#constants for this class.
    def self.constants
      {
        TIMEZONES: TIMEZONES,
      }
    end
  end
end
@@ -0,0 +1,90 @@
1
+ # == Schema Information
2
+ #
3
+ # Table name: easy_ml_splitters
4
+ #
5
+ # id :bigint not null, primary key
6
+ # splitter_type :string not null
7
+ # configuration :json
8
+ # dataset_id :bigint not null
9
+ # created_at :datetime not null
10
+ # updated_at :datetime not null
11
+ #
12
+ module EasyML
13
+ class Splitter < ActiveRecord::Base
14
+ self.table_name = "easy_ml_splitters"
15
+ include Historiographer::Silent
16
+ historiographer_mode :snapshot_only
17
+
18
+ include EasyML::Concerns::Configurable
19
+
20
+ SPLITTER_OPTIONS = {
21
+ "date" => "EasyML::Splitters::DateSplitter",
22
+ "random" => "EasyML::Splitters::RandomSplitter",
23
+ "predefined" => "EasyML::Splitters::PredefinedSplitter",
24
+ }
25
+ SPLITTER_TYPES = [
26
+ {
27
+ value: "date",
28
+ label: "Date Splitter",
29
+ description: "Split dataset based on date ranges for training, validation, and testing",
30
+ },
31
+ {
32
+ value: "random",
33
+ label: "Random Splitter",
34
+ description: "Randomly split dataset into training, validation, and testing sets with configurable ratios",
35
+ },
36
+ {
37
+ value: "predefined",
38
+ label: "Predefined Splitter",
39
+ description: "Split dataset using predefined file assignments for training, validation, and testing sets",
40
+ },
41
+ ].freeze
42
+
43
+ belongs_to :dataset, class_name: "EasyML::Dataset"
44
+ has_many :events, as: :eventable, class_name: "EasyML::Event", dependent: :destroy
45
+
46
+ validates :splitter_type, presence: true
47
+ validates :splitter_type, inclusion: { in: SPLITTER_OPTIONS.keys }
48
+
49
+ SPLITTER_NAMES = SPLITTER_OPTIONS.keys.freeze
50
+ SPLITTER_CONSTANTS = SPLITTER_OPTIONS.values.map(&:constantize)
51
+ SPLITTER_CONSTANTS.flat_map(&:configuration_attributes).each do |attribute|
52
+ add_configuration_attributes attribute
53
+ end
54
+
55
# Constants exposed by this model: the splitter type descriptors and, for
# each splitter type, the default configuration of its implementing class.
def self.constants
  default_configs = SPLITTER_OPTIONS.transform_values do |class_name|
    class_name.constantize.default_config
  end

  {
    SPLITTER_TYPES: SPLITTER_TYPES,
    DEFAULT_CONFIGS: default_configs,
  }
end
65
+
66
+ def split(df, &block)
67
+ adapter.split(df, &block)
68
+ end
69
+
70
+ def splits
71
+ adapter.splits
72
+ end
73
+
74
+ private
75
+
76
# Memoized splitter implementation for +splitter_type+.
#
# Looks the class name up in SPLITTER_OPTIONS, instantiates it with this
# record, and copies each of the class's configuration attributes from this
# record onto the instance. Raises when the type has no registered class.
def adapter
  return @adapter if @adapter

  class_name = SPLITTER_OPTIONS[splitter_type]
  raise "Don't know how to use splitter #{splitter_type}!" unless class_name.present?

  klass = class_name.constantize
  config_attributes = klass.configuration_attributes
  instance = klass.new(self)
  config_attributes.each do |attr|
    instance.send("#{attr}=", send(attr))
  end

  @adapter = instance
end
89
+ end
90
+ end
@@ -0,0 +1,28 @@
1
+ module EasyML
2
+ module Splitters
3
+ class BaseSplitter
4
+ include ActiveModel::Validations
5
+ include EasyML::Concerns::Configurable
6
+
7
+ attr_reader :splitter
8
+
9
# Splits a datasource batch by batch.
#
# Each batch yielded by +datasource.in_batches+ is passed through #split_df;
# the result is handed to the caller's block (when one is given) and is also
# the value of the batch block.
def split(datasource, &block)
  datasource.in_batches do |df|
    splits = split_df(df)
    yield splits if block_given?
    splits
  end
end
16
+
17
+ def split_df(df)
18
+ df
19
+ end
20
+
21
+ def initialize(splitter)
22
+ @splitter = splitter
23
+ end
24
+
25
+ delegate :dataset, to: :splitter
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,91 @@
1
+ # == Schema Information
2
+ #
3
+ # Table name: easy_ml_splitters
4
+ #
5
+ # id :bigint not null, primary key
6
+ # splitter_type :string not null
7
+ # configuration :json
8
+ # dataset_id :bigint not null
9
+ # created_at :datetime not null
10
+ # updated_at :datetime not null
11
+ #
12
+ require_relative "base_splitter"
13
+
14
+ module EasyML
15
+ module Splitters
16
+ class DateSplitter < BaseSplitter
17
+ validates :date_col, presence: true
18
+ validates :months_test, presence: true, numericality: { greater_than: 0 }
19
+ validates :months_valid, presence: true, numericality: { greater_than: 0 }
20
+
21
+ attr_accessor :today, :date_col, :months_test, :months_valid
22
+
23
+ add_configuration_attributes :today, :date_col, :months_test, :months_valid
24
+
25
+ def self.default_config
26
+ {
27
+ date_col: "",
28
+ months_test: 2,
29
+ months_valid: 2,
30
+ }
31
+ end
32
+
33
# Splits a dataframe into [train_df, valid_df, test_df] by +date_col+.
#
# The cut-off dates come from #splits, which returns
# [validation_date_start, test_date_start]. Raises when +date_col+ is not set
# or when the column is not a Polars::Datetime after the conversion attempt.
def split_df(df)
  raise "Split by date requires argument: date_col" unless date_col.present?

  df = EasyML::Data::DateConverter.maybe_convert_date(df, date_col)

  unless df[date_col].dtype.is_a?(Polars::Datetime)
    raise "Date splitter cannot split on non-date col #{date_col}, dtype is #{df[date_col].dtype}"
  end

  validation_date_start, test_date_start = splits

  # Test set: rows on or after the test cut-off, plus rows whose date is null.
  test_df = Polars.concat(
    [
      df.filter(Polars.col(date_col) >= test_date_start),
      df.filter(Polars.col(date_col).is_null),
    ]
  )
  # Rows before the test cut-off are divided at the validation cut-off.
  remaining_df = df.filter(Polars.col(date_col) < test_date_start)
  valid_df = remaining_df.filter(Polars.col(date_col) >= validation_date_start)
  train_df = remaining_df.filter(Polars.col(date_col) < validation_date_start)

  [train_df, valid_df, test_df]
end
56
+
57
+ def months(n)
58
+ ActiveSupport::Duration.months(n)
59
+ end
60
+
61
+ def splits
62
+ reference_date = to_datetime(datasource_end) || today
63
+ test_date_start = reference_date.advance(months: -months_test).beginning_of_day
64
+ validation_date_start = test_date_start.advance(months: -months_valid).beginning_of_day
65
+ [validation_date_start, test_date_start]
66
+ end
67
+
68
# Latest value of +date_col+ in the dataset's datasource, memoized.
#
# Queries a single row sorted by +date_col+ descending and takes the first
# value of that column; nil when the column is missing or has no values.
def datasource_end
  @datasource_end ||= begin
    newest = dataset.datasource.query(sort: date_col, descending: true, limit: 1, select: date_col)
    newest[date_col]&.to_a&.first
  end
end
74
+
75
# Normalizes a date-like value: strings are parsed with UTC.parse, nil
# becomes +default+, and anything else is returned unchanged.
def to_datetime(field, default: nil)
  return default if field.nil?
  return UTC.parse(field) if field.is_a?(String)

  field
end
85
+
86
+ def today
87
+ to_datetime(@today, default: UTC.today)
88
+ end
89
+ end
90
+ end
91
+ end