easy_ml 0.1.4 → 0.2.0.pre.rc1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +234 -26
- data/Rakefile +45 -0
- data/app/controllers/easy_ml/application_controller.rb +67 -0
- data/app/controllers/easy_ml/columns_controller.rb +38 -0
- data/app/controllers/easy_ml/datasets_controller.rb +156 -0
- data/app/controllers/easy_ml/datasources_controller.rb +88 -0
- data/app/controllers/easy_ml/deploys_controller.rb +20 -0
- data/app/controllers/easy_ml/models_controller.rb +151 -0
- data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
- data/app/controllers/easy_ml/settings_controller.rb +59 -0
- data/app/frontend/components/AlertProvider.tsx +108 -0
- data/app/frontend/components/DatasetPreview.tsx +161 -0
- data/app/frontend/components/EmptyState.tsx +28 -0
- data/app/frontend/components/ModelCard.tsx +255 -0
- data/app/frontend/components/ModelDetails.tsx +334 -0
- data/app/frontend/components/ModelForm.tsx +384 -0
- data/app/frontend/components/Navigation.tsx +300 -0
- data/app/frontend/components/Pagination.tsx +72 -0
- data/app/frontend/components/Popover.tsx +55 -0
- data/app/frontend/components/PredictionStream.tsx +105 -0
- data/app/frontend/components/ScheduleModal.tsx +726 -0
- data/app/frontend/components/SearchInput.tsx +23 -0
- data/app/frontend/components/SearchableSelect.tsx +132 -0
- data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
- data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
- data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
- data/app/frontend/components/dataset/ColumnList.tsx +101 -0
- data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
- data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
- data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
- data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
- data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
- data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
- data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
- data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
- data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
- data/app/frontend/components/dataset/splitters/constants.ts +77 -0
- data/app/frontend/components/dataset/splitters/types.ts +168 -0
- data/app/frontend/components/dataset/splitters/utils.ts +53 -0
- data/app/frontend/components/features/CodeEditor.tsx +46 -0
- data/app/frontend/components/features/DataPreview.tsx +150 -0
- data/app/frontend/components/features/FeatureCard.tsx +88 -0
- data/app/frontend/components/features/FeatureForm.tsx +235 -0
- data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
- data/app/frontend/components/settings/PluginSettings.tsx +81 -0
- data/app/frontend/components/ui/badge.tsx +44 -0
- data/app/frontend/components/ui/collapsible.tsx +9 -0
- data/app/frontend/components/ui/scroll-area.tsx +46 -0
- data/app/frontend/components/ui/separator.tsx +29 -0
- data/app/frontend/entrypoints/App.tsx +40 -0
- data/app/frontend/entrypoints/Application.tsx +24 -0
- data/app/frontend/hooks/useAutosave.ts +61 -0
- data/app/frontend/layouts/Layout.tsx +38 -0
- data/app/frontend/lib/utils.ts +6 -0
- data/app/frontend/mockData.ts +272 -0
- data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
- data/app/frontend/pages/DatasetsPage.tsx +261 -0
- data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
- data/app/frontend/pages/DatasourcesPage.tsx +261 -0
- data/app/frontend/pages/EditModelPage.tsx +45 -0
- data/app/frontend/pages/EditTransformationPage.tsx +56 -0
- data/app/frontend/pages/ModelsPage.tsx +115 -0
- data/app/frontend/pages/NewDatasetPage.tsx +366 -0
- data/app/frontend/pages/NewModelPage.tsx +45 -0
- data/app/frontend/pages/NewTransformationPage.tsx +43 -0
- data/app/frontend/pages/SettingsPage.tsx +272 -0
- data/app/frontend/pages/ShowModelPage.tsx +30 -0
- data/app/frontend/pages/TransformationsPage.tsx +95 -0
- data/app/frontend/styles/application.css +100 -0
- data/app/frontend/types/dataset.ts +146 -0
- data/app/frontend/types/datasource.ts +33 -0
- data/app/frontend/types/preprocessing.ts +1 -0
- data/app/frontend/types.ts +113 -0
- data/app/helpers/easy_ml/application_helper.rb +10 -0
- data/app/jobs/easy_ml/application_job.rb +21 -0
- data/app/jobs/easy_ml/batch_job.rb +46 -0
- data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
- data/app/jobs/easy_ml/deploy_job.rb +13 -0
- data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
- data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
- data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
- data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
- data/app/jobs/easy_ml/training_job.rb +62 -0
- data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
- data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
- data/app/models/easy_ml/cleaner.rb +82 -0
- data/app/models/easy_ml/column.rb +124 -0
- data/app/models/easy_ml/column_history.rb +30 -0
- data/app/models/easy_ml/column_list.rb +122 -0
- data/app/models/easy_ml/concerns/configurable.rb +61 -0
- data/app/models/easy_ml/concerns/versionable.rb +19 -0
- data/app/models/easy_ml/dataset.rb +767 -0
- data/app/models/easy_ml/dataset_history.rb +56 -0
- data/app/models/easy_ml/datasource.rb +182 -0
- data/app/models/easy_ml/datasource_history.rb +24 -0
- data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
- data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
- data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
- data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
- data/app/models/easy_ml/deploy.rb +114 -0
- data/app/models/easy_ml/event.rb +79 -0
- data/app/models/easy_ml/feature.rb +437 -0
- data/app/models/easy_ml/feature_history.rb +38 -0
- data/app/models/easy_ml/model.rb +575 -41
- data/app/models/easy_ml/model_file.rb +133 -0
- data/app/models/easy_ml/model_file_history.rb +24 -0
- data/app/models/easy_ml/model_history.rb +51 -0
- data/app/models/easy_ml/models/base_model.rb +58 -0
- data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
- data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
- data/app/models/easy_ml/models/xgboost.rb +544 -5
- data/app/models/easy_ml/prediction.rb +44 -0
- data/app/models/easy_ml/retraining_job.rb +278 -0
- data/app/models/easy_ml/retraining_run.rb +184 -0
- data/app/models/easy_ml/settings.rb +37 -0
- data/app/models/easy_ml/splitter.rb +90 -0
- data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
- data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
- data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
- data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
- data/app/models/easy_ml/tuner_job.rb +56 -0
- data/app/models/easy_ml/tuner_run.rb +31 -0
- data/app/models/splitter_history.rb +6 -0
- data/app/serializers/easy_ml/column_serializer.rb +27 -0
- data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
- data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
- data/app/serializers/easy_ml/feature_serializer.rb +27 -0
- data/app/serializers/easy_ml/model_serializer.rb +90 -0
- data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
- data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
- data/app/serializers/easy_ml/settings_serializer.rb +9 -0
- data/app/views/layouts/easy_ml/application.html.erb +15 -0
- data/config/initializers/resque.rb +3 -0
- data/config/resque-pool.yml +6 -0
- data/config/routes.rb +39 -0
- data/config/spring.rb +1 -0
- data/config/vite.json +15 -0
- data/lib/easy_ml/configuration.rb +64 -0
- data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
- data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
- data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
- data/lib/easy_ml/core/model_evaluator.rb +161 -89
- data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
- data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
- data/lib/easy_ml/core/tuner.rb +123 -62
- data/lib/easy_ml/core.rb +0 -3
- data/lib/easy_ml/core_ext/hash.rb +24 -0
- data/lib/easy_ml/core_ext/pathname.rb +11 -5
- data/lib/easy_ml/data/date_converter.rb +90 -0
- data/lib/easy_ml/data/filter_extensions.rb +31 -0
- data/lib/easy_ml/data/polars_column.rb +126 -0
- data/lib/easy_ml/data/polars_reader.rb +297 -0
- data/lib/easy_ml/data/preprocessor.rb +280 -142
- data/lib/easy_ml/data/simple_imputer.rb +255 -0
- data/lib/easy_ml/data/splits/file_split.rb +252 -0
- data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
- data/lib/easy_ml/data/splits/split.rb +95 -0
- data/lib/easy_ml/data/splits.rb +9 -0
- data/lib/easy_ml/data/statistics_learner.rb +93 -0
- data/lib/easy_ml/data/synced_directory.rb +341 -0
- data/lib/easy_ml/data.rb +6 -2
- data/lib/easy_ml/engine.rb +105 -6
- data/lib/easy_ml/feature_store.rb +227 -0
- data/lib/easy_ml/features.rb +61 -0
- data/lib/easy_ml/initializers/inflections.rb +17 -3
- data/lib/easy_ml/logging.rb +2 -2
- data/lib/easy_ml/predict.rb +74 -0
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
- data/lib/easy_ml/support/est.rb +5 -1
- data/lib/easy_ml/support/file_rotate.rb +79 -15
- data/lib/easy_ml/support/file_support.rb +9 -0
- data/lib/easy_ml/support/local_file.rb +24 -0
- data/lib/easy_ml/support/lockable.rb +62 -0
- data/lib/easy_ml/support/synced_file.rb +103 -0
- data/lib/easy_ml/support/utc.rb +5 -1
- data/lib/easy_ml/support.rb +6 -3
- data/lib/easy_ml/version.rb +4 -1
- data/lib/easy_ml.rb +7 -2
- metadata +355 -72
- data/app/models/easy_ml/models.rb +0 -5
- data/lib/easy_ml/core/model.rb +0 -30
- data/lib/easy_ml/core/model_core.rb +0 -181
- data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
- data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
- data/lib/easy_ml/core/models/xgboost.rb +0 -10
- data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
- data/lib/easy_ml/core/models.rb +0 -10
- data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
- data/lib/easy_ml/core/uploaders.rb +0 -7
- data/lib/easy_ml/data/dataloader.rb +0 -6
- data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
- data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
- data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
- data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
- data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
- data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
- data/lib/easy_ml/data/dataset/splits.rb +0 -11
- data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
- data/lib/easy_ml/data/dataset/splitters.rb +0 -9
- data/lib/easy_ml/data/dataset.rb +0 -430
- data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
- data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
- data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
- data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
- data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
- data/lib/easy_ml/data/datasource.rb +0 -33
- data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
- data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
- data/lib/easy_ml/deployment.rb +0 -5
- data/lib/easy_ml/support/synced_directory.rb +0 -134
- data/lib/easy_ml/transforms.rb +0 -29
- /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -0,0 +1,278 @@
|
|
1
|
+
# == Schema Information
|
2
|
+
#
|
3
|
+
# Table name: easy_ml_retraining_jobs
|
4
|
+
#
|
5
|
+
# id :bigint not null, primary key
|
6
|
+
# model_id :bigint
|
7
|
+
# frequency :string not null
|
8
|
+
# at :json not null
|
9
|
+
# evaluator :json
|
10
|
+
# tuning_enabled :boolean default(FALSE)
|
11
|
+
# tuner_config :json
|
12
|
+
# tuning_frequency :string
|
13
|
+
# last_tuning_at :datetime
|
14
|
+
# active :boolean default(TRUE)
|
15
|
+
# status :string default("pending")
|
16
|
+
# last_run_at :datetime
|
17
|
+
# metric :string not null
|
18
|
+
# direction :string not null
|
19
|
+
# threshold :float not null
|
20
|
+
# auto_deploy :boolean default(FALSE)
|
21
|
+
# batch_mode :boolean
|
22
|
+
# batch_size :integer
|
23
|
+
# batch_overlap :integer
|
24
|
+
# batch_key :string
|
25
|
+
# created_at :datetime not null
|
26
|
+
# updated_at :datetime not null
|
27
|
+
#
|
28
|
+
module EasyML
  # Recurring retraining schedule for a single EasyML::Model.
  #
  # A job stores how often the model is retrained (+frequency+ plus the +at+
  # hash), whether and how often hyperparameters are re-tuned
  # (+tuning_enabled+ / +tuning_frequency+), and the metric threshold exposed
  # through #evaluator (+metric+, +direction+, +threshold+).
  class RetrainingJob < ActiveRecord::Base
    self.table_name = "easy_ml_retraining_jobs"

    has_many :retraining_runs, class_name: "EasyML::RetrainingRun", dependent: :destroy
    has_many :tuner_jobs, through: :retraining_runs

    belongs_to :model, class_name: "EasyML::Model", inverse_of: :retraining_job
    validates :model, presence: true,
                      uniqueness: { message: "already has a retraining job" }

    VALID_FREQUENCIES = %w[day week month always].freeze
    FREQUENCY_TYPES = [
      {
        value: "day",
        label: "Daily",
        description: "Run once every day",
      },
      {
        value: "week",
        label: "Weekly",
        description: "Run once every week",
      },
      {
        value: "month",
        label: "Monthly",
        description: "Run once every month",
      },
    ].freeze

    # Keys of +at+ that are meaningful for each clock-scheduled frequency.
    # "always" intentionally has no entry: it carries no schedule keys.
    AT_KEYS_BY_FREQUENCY = {
      "day" => %w[hour],
      "week" => %w[hour day_of_week],
      "month" => %w[hour day_of_month],
    }.freeze

    # Values filled in when a required +at+ key is missing.
    AT_DEFAULTS = {
      "hour" => 0,
      "day_of_week" => 0, # Sunday
      "day_of_month" => 1,
    }.freeze

    # Inclusive bounds enforced on each +at+ key.
    AT_RANGES = {
      "hour" => 0..23,
      "day_of_week" => 0..6,
      "day_of_month" => 1..31,
    }.freeze

    private_constant :AT_KEYS_BY_FREQUENCY, :AT_DEFAULTS, :AT_RANGES

    validates :frequency, presence: true, inclusion: { in: VALID_FREQUENCIES }
    validates :metric, presence: true
    validate :validate_metrics_allowed
    validates :status, presence: true
    validates :at, presence: true
    validates :threshold, presence: true
    validates :tuning_frequency, inclusion: {
                                   in: VALID_FREQUENCIES,
                                   allow_nil: true,
                                 }
    validate :evaluator_must_be_valid
    validate :validate_at_format
    after_initialize :set_direction, unless: :persisted?

    scope :active, -> { joins(:model).where(active: true) }

    # @return [Array<EasyML::RetrainingJob>] active jobs whose #should_run? is true right now
    def self.current
      active.select(&:should_run?)
    end

    # NOTE(review): this shadows Module#constants for this class and returns a
    # Hash instead of an Array of names; kept as-is because callers outside
    # this file may rely on it.
    #
    # @return [Hash] the frequency options, keyed by +:frequency+
    def self.constants
      {
        frequency: FREQUENCY_TYPES,
      }
    end

    # Stored tuner configuration with the job's metric injected as "objective".
    #
    # Builds a new Hash on every call so that reading the config never mutates
    # the underlying attribute (the previous +merge!+ did).
    #
    # @return [Hash{String => Object}]
    def tuner_config
      (read_attribute(:tuner_config) || {}).stringify_keys.merge("objective" => metric)
    end

    # Human-readable label for +frequency+.
    #
    # This used to be defined twice in the class; the later definition (a plain
    # frequency -> label lookup) was the one in effect, so that behavior is
    # kept and the dead first definition is removed.
    #
    # @return [String, nil] "Daily", "Weekly" or "Monthly"; nil for any other frequency
    def formatted_frequency
      FREQUENCY_TYPES.find { |type| type[:value] == frequency.to_s }&.fetch(:label)
    end

    # Whether a retraining run is due at the current time.
    #
    # A job that has never run is always due. Otherwise the job is due when it
    # has not yet run in the current day/week/month and the current clock
    # matches the configured +at+ slot.
    #
    # NOTE(review): a job with frequency "always" that has already run falls
    # through to +false+ here — confirm that is intended.
    #
    # @return [Boolean]
    def should_run?
      return true if last_run_at.nil?

      now = Time.current
      case frequency
      when "day"
        return false if last_run_at.to_date == now.to_date

        now.hour == at_value("hour")
      when "week"
        return false if last_run_at.to_date >= now.beginning_of_week

        now.wday == at_value("day_of_week") && now.hour == at_value("hour")
      when "month"
        return false if last_run_at.to_date >= now.beginning_of_month

        now.day == at_value("day_of_month") && now.hour == at_value("hour")
      else
        false
      end
    end

    # Whether hyperparameter tuning is due at the current time.
    #
    # Requires tuning to be enabled and a tuning frequency to be set; a job
    # that has never been tuned is always due.
    #
    # @return [Boolean]
    def should_tune?
      return false unless tuning_enabled
      return false unless tuning_frequency.present?
      return true if last_tuning_at.nil?

      now = Time.current
      case tuning_frequency
      when "always"
        true
      when "hour"
        last_tuning_at < now.beginning_of_hour
      when "day"
        now.hour == at_value("hour") && last_tuning_at < now.beginning_of_day
      when "week"
        now.hour == at_value("hour") && now.wday == 0 && last_tuning_at < now.beginning_of_week
      when "month"
        now.hour == at_value("hour") && now.day == 1 && last_tuning_at < now.beginning_of_month
      else
        false
      end
    end

    # Assigns the metric and refreshes +direction+ from the metric's evaluator.
    def metric=(metric)
      write_attribute(:metric, metric)
      set_direction
    end

    # Evaluator configuration derived from metric, direction and threshold.
    # The threshold is exposed as +:max+ when maximizing and +:min+ when
    # minimizing; nil entries are dropped.
    #
    # @return [Hash{Symbol => Object}]
    def evaluator
      {
        metric: metric,
        max: direction == "maximize" ? threshold : nil,
        min: direction == "minimize" ? threshold : nil,
        direction: direction,
      }.compact
    end

    private

    # Integer value stored under +key+ in +at+, or nil when the key is absent.
    # Coerces with +to_i+ so string values (which validate_at_format accepts)
    # compare correctly against integer clock fields.
    def at_value(key)
      at[key]&.to_i
    end

    # Evaluator instance for the current metric, or nil when the metric is
    # unset or not known to the ModelEvaluator registry. Returning nil for an
    # unknown metric lets validation report it instead of raising here.
    def metric_class
      return nil unless metric

      EasyML::Core::ModelEvaluator.get(metric)&.new
    end

    # Copies the evaluator's direction onto the record when one can be resolved.
    def set_direction
      resolved = metric_class
      return unless resolved.present?

      write_attribute(:direction, resolved.direction)
    end

    # Normalizes and validates the +at+ schedule hash.
    #
    # For day/week/month frequencies the hash is reduced to the keys relevant
    # to that frequency (string keys), with missing keys filled from defaults.
    # For "always" no key set is defined, so +at+ is left untouched; the
    # previous implementation raised NoMethodError in that case. Range checks
    # apply to whichever keys are present.
    def validate_at_format
      return errors.add(:at, "must be a hash") unless at.is_a?(Hash)
      return if VALID_FREQUENCIES.exclude?(frequency.to_s)

      schedule_keys = AT_KEYS_BY_FREQUENCY[frequency.to_s]
      if schedule_keys
        normalized = at.stringify_keys
        schedule_keys.each do |key|
          normalized[key] = AT_DEFAULTS[key] unless normalized.key?(key)
        end
        self.at = normalized.slice(*schedule_keys)
      end

      AT_RANGES.each do |key, range|
        value = at[key]
        next unless value.present?
        next if range.include?(value.to_i)

        errors.add(:at, "#{key} must be between #{range.min} and #{range.max}")
      end
    end

    # Start of the current scheduling period for +frequency+ (nil for
    # frequencies without a period, e.g. "always").
    def current_period_start
      now = Time.current
      case frequency
      when "hour"
        now.beginning_of_hour
      when "day"
        now.beginning_of_day
      when "week"
        now.beginning_of_week
      when "month"
        now.beginning_of_month
      end
    end

    # Validates the derived #evaluator hash: it needs a metric plus a numeric
    # min or max, and the metric must resolve to an evaluator class whose
    # instances respond to +evaluate+.
    def evaluator_must_be_valid
      config = evaluator
      return if config.blank?

      config = config.symbolize_keys

      unless config[:metric].present? && (config[:min].present? || config[:max].present?)
        errors.add(:evaluator, "must specify metric and either min or max value")
        return
      end

      errors.add(:evaluator, "min value must be numeric") if config[:min].present? && !config[:min].is_a?(Numeric)

      errors.add(:evaluator, "max value must be numeric") if config[:max].present? && !config[:max].is_a?(Numeric)

      evaluator_class = EasyML::Core::ModelEvaluator.get(config[:metric].to_sym)
      unless evaluator_class.present?
        allowed_metrics = EasyML::Core::ModelEvaluator.metrics
        errors.add(:evaluator, "contains invalid metric. Allowed metrics are #{allowed_metrics}")
        return
      end

      return if evaluator_class.new.respond_to?(:evaluate)

      errors.add(:evaluator, "evaluator must implement evaluate method")
    end

    # Rejects metrics that are not registered with the ModelEvaluator.
    def validate_metrics_allowed
      return unless metric
      return unless EasyML::Core::ModelEvaluator.metrics.exclude?(metric.to_sym)

      errors.add(:metrics,
                 "don't know how to handle metric #{metric}, use EasyML::Core::ModelEvaluator.register(:name, Evaluator, :regression|:classification)")
    end
  end
end
|
@@ -0,0 +1,184 @@
|
|
1
|
+
# == Schema Information
|
2
|
+
#
|
3
|
+
# Table name: easy_ml_retraining_runs
|
4
|
+
#
|
5
|
+
# id :bigint not null, primary key
|
6
|
+
# model_id :bigint
|
7
|
+
# model_history_id :bigint
|
8
|
+
# model_file_id :bigint
|
9
|
+
# retraining_job_id :bigint not null
|
10
|
+
# tuner_job_id :bigint
|
11
|
+
# status :string default("pending")
|
12
|
+
# metric_value :float
|
13
|
+
# threshold :float
|
14
|
+
# trigger :string default("manual")
|
15
|
+
# threshold_direction :string
|
16
|
+
# started_at :datetime
|
17
|
+
# completed_at :datetime
|
18
|
+
# error_message :text
|
19
|
+
# metadata :jsonb
|
20
|
+
# metrics :jsonb
|
21
|
+
# best_params :jsonb
|
22
|
+
# wandb_url :string
|
23
|
+
# snapshot_id :string
|
24
|
+
# deployable :boolean
|
25
|
+
# is_deploying :boolean
|
26
|
+
# deployed :boolean
|
27
|
+
# deploy_id :bigint
|
28
|
+
# created_at :datetime not null
|
29
|
+
# updated_at :datetime not null
|
30
|
+
#
|
31
|
+
module EasyML
  # One execution of a RetrainingJob: tracks status, the measured metric
  # against the job's threshold, tuner output, and whether the resulting
  # model is deployable.
  class RetrainingRun < ActiveRecord::Base
    self.table_name = "easy_ml_retraining_runs"

    belongs_to :retraining_job
    belongs_to :model, class_name: "EasyML::Model"
    belongs_to :model_file, class_name: "EasyML::ModelFile", optional: true
    has_many :events, as: :eventable, class_name: "EasyML::Event", dependent: :destroy

    validates :status, presence: true, inclusion: { in: %w[pending running success failed deployed] }

    scope :running, -> { where(status: "running") }

    # Creates an EasyML::Deploy for this run's model and starts it.
    #
    # @param async [Boolean] forwarded to EasyML::Deploy#deploy
    # @return whatever EasyML::Deploy#deploy returns
    def deploy(async: true)
      new_deploy = EasyML::Deploy.create!(
        model: model,
        retraining_run: self,
      )

      new_deploy.deploy(async: async)
    end

    # Runs the given training block and records its outcome on this run.
    #
    # The block must return +[training_model, best_params]+. Only a pending
    # run may be started. Any exception raised while training or recording is
    # captured on the run (status "failed") rather than propagated.
    #
    # @yieldreturn [Array] the trained model and the tuner's best params (may be blank)
    # @return [Boolean] true when the run was recorded, false when the run was
    #   not pending or an error was rescued
    def wrap_training(&block)
      return false unless pending?

      begin
        EasyML::Event.create_event(self, "started")
        update!(status: "running", started_at: Time.current)

        training_model, best_params = yield

        # Tuner details are only looked up when tuning actually produced params;
        # otherwise +tuner+ stays nil and the safe navigations below yield nil.
        if best_params.present?
          tuner = EasyML::TunerJob.where(model: training_model)
                                  .order(id: :desc)
                                  .first
        end

        results = metric_results(training_model)
        failed_reasons = training_model.cannot_deploy_reasons - ["Model has not changed"]

        # A run that simply missed its threshold still counts as a success;
        # otherwise any remaining deploy blocker marks the run as failed.
        final_status = if results[:deployable] == false
            "success"
          else
            failed_reasons.any? ? "failed" : "success"
          end

        training_model.save_model_file if final_status == "success"

        update!(
          results.merge(
            status: final_status,
            completed_at: failed_reasons.none? ? Time.current : nil,
            error_message: failed_reasons.first,
            model: training_model,
            metrics: training_model.evaluate,
            best_params: best_params,
            tuner_job_id: tuner&.id,
            metadata: tuner&.metadata,
            wandb_url: tuner&.wandb_url,
            model_file_id: final_status == "success" ? training_model.model_file_id : nil,
          )
        )

        if failed_reasons.any?
          EasyML::Event.handle_error(self, failed_reasons.first)
        else
          EasyML::Event.create_event(self, final_status)
        end

        # last_tuning_at is only advanced when tuning ran (compact drops the nil).
        job_updates = { last_run_at: Time.current, last_tuning_at: best_params.present? ? Time.current : nil }.compact
        retraining_job.update!(job_updates)

        reload
        if deployable? && retraining_job.auto_deploy
          training_model.save_model_file
          training_model.reload
          auto_deploy = EasyML::Deploy.create!(retraining_run: self, model: training_model, model_file: training_model.model_file, trigger: trigger)
          auto_deploy.deploy
        end
        true
      rescue => e
        EasyML::Event.handle_error(self, e)
        update!(
          status: "failed",
          completed_at: Time.current,
          error_message: e.message,
        )
        false
      end
    end

    # @return [Boolean] whether status is "pending"
    def pending?
      status == "pending"
    end

    # @return [Boolean] whether status is "deployed"
    def deployed?
      status == "deployed"
    end

    # @return [Boolean] whether status is "success"
    def success?
      status == "success"
    end

    # @return [Boolean] whether status is "failed"
    def failed?
      status == "failed"
    end

    # @return [Boolean] whether status is "running"
    def running?
      status == "running"
    end

    # @return [Boolean] whether the parent job has a tuner config and is due for tuning
    def should_tune?
      retraining_job.tuner_config.present? && retraining_job.should_tune?
    end

    private

    # Evaluates the trained model on the dataset's test split and compares the
    # job's metric against its threshold.
    #
    # Always returns a Hash, because #wrap_training indexes and merges the
    # result. Previously the no-evaluator path returned a bare boolean, which
    # made #wrap_training fail with NoMethodError.
    #
    # @param training_model the freshly trained model
    # @return [Hash] +:deployable+, plus +:metric_value+, +:threshold+ and
    #   +:threshold_direction+ when an evaluator is configured
    def metric_results(training_model)
      return { deployable: training_model.deployable? } unless retraining_job.evaluator.present?

      training_model.dataset.refresh
      evaluator = retraining_job.evaluator.symbolize_keys
      x_true, y_true = training_model.dataset.test(split_ys: true)
      y_pred = training_model.predict(x_true)

      metric = evaluator[:metric].to_sym
      metrics = EasyML::Core::ModelEvaluator.evaluate(
        model: training_model,
        y_pred: y_pred,
        y_true: y_true,
        evaluator: evaluator,
      )
      metric_value = metrics[metric]

      # A :min threshold means lower is better; otherwise :max is used and
      # higher is better.
      if evaluator[:min].present?
        threshold = evaluator[:min]
        threshold_direction = "minimize"
        deployable = metric_value < threshold
      else
        threshold = evaluator[:max]
        threshold_direction = "maximize"
        deployable = metric_value > threshold
      end

      {
        metric_value: metric_value,
        threshold: threshold,
        threshold_direction: threshold_direction,
        deployable: deployable,
      }
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# == Schema Information
|
2
|
+
#
|
3
|
+
# Table name: easy_ml_settings
|
4
|
+
#
|
5
|
+
# id :bigint not null, primary key
|
6
|
+
# configuration :json
|
7
|
+
# created_at :datetime not null
|
8
|
+
# updated_at :datetime not null
|
9
|
+
#
|
10
|
+
require_relative "concerns/configurable"
|
11
|
+
|
12
|
+
module EasyML
  # Persisted engine-wide settings. The individual values (storage backend,
  # S3 credentials/location, timezone, W&B API key) are declared through the
  # Configurable concern's +add_configuration_attributes+.
  class Settings < ActiveRecord::Base
    self.table_name = "easy_ml_settings"
    include EasyML::Concerns::Configurable

    add_configuration_attributes :storage,
                                 :s3_access_key_id, :s3_secret_access_key,
                                 :s3_bucket, :s3_region, :s3_prefix, :timezone,
                                 :wandb_api_key

    # Storage is optional, but when given it must be one of the known backends.
    validates :storage, inclusion: { in: %w[file s3] }, if: -> { storage.present? }

    # Timezone choices as value/label pairs. Frozen so the shared constant
    # cannot be mutated at runtime.
    TIMEZONES = [
      { value: "America/New_York", label: "Eastern Time" },
      { value: "America/Chicago", label: "Central Time" },
      { value: "America/Denver", label: "Mountain Time" },
      { value: "America/Los_Angeles", label: "Pacific Time" },
    ].freeze

    # NOTE(review): this shadows Module#constants for this class and returns a
    # Hash rather than an Array of names; kept because callers outside this
    # file may rely on it.
    #
    # @return [Hash] the timezone options, keyed by +:TIMEZONES+
    def self.constants
      {
        TIMEZONES: TIMEZONES,
      }
    end
  end
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
# == Schema Information
|
2
|
+
#
|
3
|
+
# Table name: easy_ml_splitters
|
4
|
+
#
|
5
|
+
# id :bigint not null, primary key
|
6
|
+
# splitter_type :string not null
|
7
|
+
# configuration :json
|
8
|
+
# dataset_id :bigint not null
|
9
|
+
# created_at :datetime not null
|
10
|
+
# updated_at :datetime not null
|
11
|
+
#
|
12
|
+
module EasyML
  # Persisted splitter configuration for a dataset. The actual splitting is
  # delegated to an adapter class chosen by +splitter_type+.
  class Splitter < ActiveRecord::Base
    self.table_name = "easy_ml_splitters"
    include Historiographer::Silent
    historiographer_mode :snapshot_only

    include EasyML::Concerns::Configurable

    # splitter_type value => name of the adapter class implementing it.
    SPLITTER_OPTIONS = {
      "date" => "EasyML::Splitters::DateSplitter",
      "random" => "EasyML::Splitters::RandomSplitter",
      "predefined" => "EasyML::Splitters::PredefinedSplitter",
    }
    # Value/label/description entries describing each splitter type.
    SPLITTER_TYPES = [
      {
        value: "date",
        label: "Date Splitter",
        description: "Split dataset based on date ranges for training, validation, and testing",
      },
      {
        value: "random",
        label: "Random Splitter",
        description: "Randomly split dataset into training, validation, and testing sets with configurable ratios",
      },
      {
        value: "predefined",
        label: "Predefined Splitter",
        description: "Split dataset using predefined file assignments for training, validation, and testing sets",
      },
    ].freeze

    belongs_to :dataset, class_name: "EasyML::Dataset"
    has_many :events, as: :eventable, class_name: "EasyML::Event", dependent: :destroy

    validates :splitter_type, presence: true
    validates :splitter_type, inclusion: { in: SPLITTER_OPTIONS.keys }

    SPLITTER_NAMES = SPLITTER_OPTIONS.keys.freeze
    SPLITTER_CONSTANTS = SPLITTER_OPTIONS.values.map(&:constantize)

    # Expose every adapter's configuration attributes on the record itself so
    # they can be copied onto the adapter when it is built.
    SPLITTER_CONSTANTS.flat_map(&:configuration_attributes).each do |config_attribute|
      add_configuration_attributes config_attribute
    end

    # @return [Hash] the splitter type descriptions and each type's default config
    def self.constants
      default_configs = SPLITTER_OPTIONS.transform_values do |class_name|
        class_name.constantize.default_config
      end

      {
        SPLITTER_TYPES: SPLITTER_TYPES,
        DEFAULT_CONFIGS: default_configs,
      }
    end

    # Splits +df+ through the configured adapter, forwarding any block.
    def split(df, &block)
      adapter.split(df, &block)
    end

    # @return the adapter's +splits+
    def splits
      adapter.splits
    end

    private

    # Memoized adapter instance for this record's +splitter_type+.
    def adapter
      @adapter ||= build_adapter
    end

    # Instantiates the adapter class and copies this record's configuration
    # values onto it.
    #
    # @raise [RuntimeError] when +splitter_type+ has no registered adapter
    def build_adapter
      class_name = SPLITTER_OPTIONS[splitter_type]
      raise "Don't know how to use splitter #{splitter_type}!" unless class_name.present?

      config_attributes = class_name.constantize.configuration_attributes
      instance = class_name.constantize.new(self)
      config_attributes.each do |name|
        instance.send("#{name}=", send(name))
      end
      instance
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module EasyML
  module Splitters
    # Common behaviour shared by splitter adapters.
    #
    # An adapter wraps the owning splitter object (exposed via #splitter) and
    # delegates +dataset+ to it. Subclasses are expected to override #split_df;
    # the implementation here hands the batch back unchanged.
    class BaseSplitter
      include ActiveModel::Validations
      include EasyML::Concerns::Configurable

      attr_reader :splitter

      delegate :dataset, to: :splitter

      # @param splitter the object this adapter belongs to; must respond to +dataset+
      def initialize(splitter)
        @splitter = splitter
      end

      # Walks the datasource batch by batch, runs #split_df on each batch and,
      # when a block was supplied, yields the result of each batch to it.
      #
      # @param datasource an object responding to +in_batches+
      # @yield the value returned by #split_df for each batch
      def split(datasource, &block)
        datasource.in_batches do |batch|
          batch_splits = split_df(batch)
          yield batch_splits if block_given?
          batch_splits
        end
      end

      # Default split: returns the given batch as-is.
      def split_df(df)
        df
      end
    end
  end
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
# == Schema Information
|
2
|
+
#
|
3
|
+
# Table name: easy_ml_splitters
|
4
|
+
#
|
5
|
+
# id :bigint not null, primary key
|
6
|
+
# splitter_type :string not null
|
7
|
+
# configuration :json
|
8
|
+
# dataset_id :bigint not null
|
9
|
+
# created_at :datetime not null
|
10
|
+
# updated_at :datetime not null
|
11
|
+
#
|
12
|
+
require_relative "base_splitter"
|
13
|
+
|
14
|
+
module EasyML
  module Splitters
    # Splits a dataframe into train / validation / test frames by comparing a
    # date column against two cut-off dates (see #splits).
    class DateSplitter < BaseSplitter
      validates :date_col, presence: true
      %i[months_test months_valid].each do |months_attr|
        validates months_attr, presence: true, numericality: { greater_than: 0 }
      end

      attr_accessor :today, :date_col, :months_test, :months_valid

      add_configuration_attributes :today, :date_col, :months_test, :months_valid

      # Default configuration values for a new date splitter.
      def self.default_config
        { date_col: "", months_test: 2, months_valid: 2 }
      end

      # Partitions +df+ on +date_col+.
      #
      # Rows dated on or after the test cut-off, plus rows whose date is null,
      # form the test frame. Of the rest, rows on or after the validation
      # cut-off form the validation frame and earlier rows the training frame.
      #
      # @return [Array] +[train_df, valid_df, test_df]+
      # @raise [RuntimeError] when +date_col+ is blank, or when the column is
      #   not a Polars::Datetime after the conversion attempt
      def split_df(df)
        raise "Split by date requires argument: date_col" unless date_col.present?

        df = EasyML::Data::DateConverter.maybe_convert_date(df, date_col)

        unless df[date_col].dtype.is_a?(Polars::Datetime)
          raise "Date splitter cannot split on non-date col #{date_col}, dtype is #{df[date_col].dtype}"
        end

        validation_start, test_start = splits
        date_expr = Polars.col(date_col)

        on_or_after_test = df.filter(date_expr >= test_start)
        undated = df.filter(date_expr.is_null)
        test_df = Polars.concat([on_or_after_test, undated])

        before_test = df.filter(date_expr < test_start)
        valid_df = before_test.filter(date_expr >= validation_start)
        train_df = before_test.filter(date_expr < validation_start)

        [train_df, valid_df, test_df]
      end

      # Wraps +n+ in an ActiveSupport month duration.
      def months(n)
        ActiveSupport::Duration.months(n)
      end

      # Computes the two cut-off dates.
      #
      # The reference date is the latest value of +date_col+ in the datasource,
      # falling back to #today. The test window starts +months_test+ months
      # before the reference date; the validation window starts +months_valid+
      # months before that. Both are moved to the beginning of their day.
      #
      # @return [Array] +[validation_date_start, test_date_start]+
      def splits
        anchor = to_datetime(datasource_end) || today
        test_start = anchor.advance(months: -months_test).beginning_of_day
        validation_start = test_start.advance(months: -months_valid).beginning_of_day
        [validation_start, test_start]
      end

      # Latest +date_col+ value in the dataset's datasource: a single-row query
      # sorted descending on +date_col+. A truthy result is memoized.
      def datasource_end
        @datasource_end ||= begin
          latest = dataset.datasource.query(sort: date_col, descending: true, limit: 1, select: date_col)
          latest[date_col]&.to_a&.first
        end
      end

      # Normalizes +field+: strings are parsed with +UTC.parse+, nil becomes
      # +default+, and anything else is returned untouched.
      def to_datetime(field, default: nil)
        return UTC.parse(field) if field.is_a?(String)
        return default if field.nil?

        field
      end

      # The configured +today+ value run through #to_datetime, or +UTC.today+
      # when none was set.
      def today
        to_datetime(@today, default: UTC.today)
      end
    end
  end
end
|