easy_ml 0.1.4 → 0.2.0.pre.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +234 -26
- data/Rakefile +45 -0
- data/app/controllers/easy_ml/application_controller.rb +67 -0
- data/app/controllers/easy_ml/columns_controller.rb +38 -0
- data/app/controllers/easy_ml/datasets_controller.rb +156 -0
- data/app/controllers/easy_ml/datasources_controller.rb +88 -0
- data/app/controllers/easy_ml/deploys_controller.rb +20 -0
- data/app/controllers/easy_ml/models_controller.rb +151 -0
- data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
- data/app/controllers/easy_ml/settings_controller.rb +59 -0
- data/app/frontend/components/AlertProvider.tsx +108 -0
- data/app/frontend/components/DatasetPreview.tsx +161 -0
- data/app/frontend/components/EmptyState.tsx +28 -0
- data/app/frontend/components/ModelCard.tsx +255 -0
- data/app/frontend/components/ModelDetails.tsx +334 -0
- data/app/frontend/components/ModelForm.tsx +384 -0
- data/app/frontend/components/Navigation.tsx +300 -0
- data/app/frontend/components/Pagination.tsx +72 -0
- data/app/frontend/components/Popover.tsx +55 -0
- data/app/frontend/components/PredictionStream.tsx +105 -0
- data/app/frontend/components/ScheduleModal.tsx +726 -0
- data/app/frontend/components/SearchInput.tsx +23 -0
- data/app/frontend/components/SearchableSelect.tsx +132 -0
- data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
- data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
- data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
- data/app/frontend/components/dataset/ColumnList.tsx +101 -0
- data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
- data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
- data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
- data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
- data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
- data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
- data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
- data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
- data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
- data/app/frontend/components/dataset/splitters/constants.ts +77 -0
- data/app/frontend/components/dataset/splitters/types.ts +168 -0
- data/app/frontend/components/dataset/splitters/utils.ts +53 -0
- data/app/frontend/components/features/CodeEditor.tsx +46 -0
- data/app/frontend/components/features/DataPreview.tsx +150 -0
- data/app/frontend/components/features/FeatureCard.tsx +88 -0
- data/app/frontend/components/features/FeatureForm.tsx +235 -0
- data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
- data/app/frontend/components/settings/PluginSettings.tsx +81 -0
- data/app/frontend/components/ui/badge.tsx +44 -0
- data/app/frontend/components/ui/collapsible.tsx +9 -0
- data/app/frontend/components/ui/scroll-area.tsx +46 -0
- data/app/frontend/components/ui/separator.tsx +29 -0
- data/app/frontend/entrypoints/App.tsx +40 -0
- data/app/frontend/entrypoints/Application.tsx +24 -0
- data/app/frontend/hooks/useAutosave.ts +61 -0
- data/app/frontend/layouts/Layout.tsx +38 -0
- data/app/frontend/lib/utils.ts +6 -0
- data/app/frontend/mockData.ts +272 -0
- data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
- data/app/frontend/pages/DatasetsPage.tsx +261 -0
- data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
- data/app/frontend/pages/DatasourcesPage.tsx +261 -0
- data/app/frontend/pages/EditModelPage.tsx +45 -0
- data/app/frontend/pages/EditTransformationPage.tsx +56 -0
- data/app/frontend/pages/ModelsPage.tsx +115 -0
- data/app/frontend/pages/NewDatasetPage.tsx +366 -0
- data/app/frontend/pages/NewModelPage.tsx +45 -0
- data/app/frontend/pages/NewTransformationPage.tsx +43 -0
- data/app/frontend/pages/SettingsPage.tsx +272 -0
- data/app/frontend/pages/ShowModelPage.tsx +30 -0
- data/app/frontend/pages/TransformationsPage.tsx +95 -0
- data/app/frontend/styles/application.css +100 -0
- data/app/frontend/types/dataset.ts +146 -0
- data/app/frontend/types/datasource.ts +33 -0
- data/app/frontend/types/preprocessing.ts +1 -0
- data/app/frontend/types.ts +113 -0
- data/app/helpers/easy_ml/application_helper.rb +10 -0
- data/app/jobs/easy_ml/application_job.rb +21 -0
- data/app/jobs/easy_ml/batch_job.rb +46 -0
- data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
- data/app/jobs/easy_ml/deploy_job.rb +13 -0
- data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
- data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
- data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
- data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
- data/app/jobs/easy_ml/training_job.rb +62 -0
- data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
- data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
- data/app/models/easy_ml/cleaner.rb +82 -0
- data/app/models/easy_ml/column.rb +124 -0
- data/app/models/easy_ml/column_history.rb +30 -0
- data/app/models/easy_ml/column_list.rb +122 -0
- data/app/models/easy_ml/concerns/configurable.rb +61 -0
- data/app/models/easy_ml/concerns/versionable.rb +19 -0
- data/app/models/easy_ml/dataset.rb +767 -0
- data/app/models/easy_ml/dataset_history.rb +56 -0
- data/app/models/easy_ml/datasource.rb +182 -0
- data/app/models/easy_ml/datasource_history.rb +24 -0
- data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
- data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
- data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
- data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
- data/app/models/easy_ml/deploy.rb +114 -0
- data/app/models/easy_ml/event.rb +79 -0
- data/app/models/easy_ml/feature.rb +437 -0
- data/app/models/easy_ml/feature_history.rb +38 -0
- data/app/models/easy_ml/model.rb +575 -41
- data/app/models/easy_ml/model_file.rb +133 -0
- data/app/models/easy_ml/model_file_history.rb +24 -0
- data/app/models/easy_ml/model_history.rb +51 -0
- data/app/models/easy_ml/models/base_model.rb +58 -0
- data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
- data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
- data/app/models/easy_ml/models/xgboost.rb +544 -5
- data/app/models/easy_ml/prediction.rb +44 -0
- data/app/models/easy_ml/retraining_job.rb +278 -0
- data/app/models/easy_ml/retraining_run.rb +184 -0
- data/app/models/easy_ml/settings.rb +37 -0
- data/app/models/easy_ml/splitter.rb +90 -0
- data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
- data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
- data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
- data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
- data/app/models/easy_ml/tuner_job.rb +56 -0
- data/app/models/easy_ml/tuner_run.rb +31 -0
- data/app/models/splitter_history.rb +6 -0
- data/app/serializers/easy_ml/column_serializer.rb +27 -0
- data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
- data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
- data/app/serializers/easy_ml/feature_serializer.rb +27 -0
- data/app/serializers/easy_ml/model_serializer.rb +90 -0
- data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
- data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
- data/app/serializers/easy_ml/settings_serializer.rb +9 -0
- data/app/views/layouts/easy_ml/application.html.erb +15 -0
- data/config/initializers/resque.rb +3 -0
- data/config/resque-pool.yml +6 -0
- data/config/routes.rb +39 -0
- data/config/spring.rb +1 -0
- data/config/vite.json +15 -0
- data/lib/easy_ml/configuration.rb +64 -0
- data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
- data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
- data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
- data/lib/easy_ml/core/model_evaluator.rb +161 -89
- data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
- data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
- data/lib/easy_ml/core/tuner.rb +123 -62
- data/lib/easy_ml/core.rb +0 -3
- data/lib/easy_ml/core_ext/hash.rb +24 -0
- data/lib/easy_ml/core_ext/pathname.rb +11 -5
- data/lib/easy_ml/data/date_converter.rb +90 -0
- data/lib/easy_ml/data/filter_extensions.rb +31 -0
- data/lib/easy_ml/data/polars_column.rb +126 -0
- data/lib/easy_ml/data/polars_reader.rb +297 -0
- data/lib/easy_ml/data/preprocessor.rb +280 -142
- data/lib/easy_ml/data/simple_imputer.rb +255 -0
- data/lib/easy_ml/data/splits/file_split.rb +252 -0
- data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
- data/lib/easy_ml/data/splits/split.rb +95 -0
- data/lib/easy_ml/data/splits.rb +9 -0
- data/lib/easy_ml/data/statistics_learner.rb +93 -0
- data/lib/easy_ml/data/synced_directory.rb +341 -0
- data/lib/easy_ml/data.rb +6 -2
- data/lib/easy_ml/engine.rb +105 -6
- data/lib/easy_ml/feature_store.rb +227 -0
- data/lib/easy_ml/features.rb +61 -0
- data/lib/easy_ml/initializers/inflections.rb +17 -3
- data/lib/easy_ml/logging.rb +2 -2
- data/lib/easy_ml/predict.rb +74 -0
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
- data/lib/easy_ml/support/est.rb +5 -1
- data/lib/easy_ml/support/file_rotate.rb +79 -15
- data/lib/easy_ml/support/file_support.rb +9 -0
- data/lib/easy_ml/support/local_file.rb +24 -0
- data/lib/easy_ml/support/lockable.rb +62 -0
- data/lib/easy_ml/support/synced_file.rb +103 -0
- data/lib/easy_ml/support/utc.rb +5 -1
- data/lib/easy_ml/support.rb +6 -3
- data/lib/easy_ml/version.rb +4 -1
- data/lib/easy_ml.rb +7 -2
- metadata +355 -72
- data/app/models/easy_ml/models.rb +0 -5
- data/lib/easy_ml/core/model.rb +0 -30
- data/lib/easy_ml/core/model_core.rb +0 -181
- data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
- data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
- data/lib/easy_ml/core/models/xgboost.rb +0 -10
- data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
- data/lib/easy_ml/core/models.rb +0 -10
- data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
- data/lib/easy_ml/core/uploaders.rb +0 -7
- data/lib/easy_ml/data/dataloader.rb +0 -6
- data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
- data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
- data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
- data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
- data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
- data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
- data/lib/easy_ml/data/dataset/splits.rb +0 -11
- data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
- data/lib/easy_ml/data/dataset/splitters.rb +0 -9
- data/lib/easy_ml/data/dataset.rb +0 -430
- data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
- data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
- data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
- data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
- data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
- data/lib/easy_ml/data/datasource.rb +0 -33
- data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
- data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
- data/lib/easy_ml/deployment.rb +0 -5
- data/lib/easy_ml/support/synced_directory.rb +0 -134
- data/lib/easy_ml/transforms.rb +0 -29
- /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -0,0 +1,113 @@
|
|
1
|
+
import { Dataset } from './dataset';
|
2
|
+
|
3
|
+
/** Status values used by `Model.status` and `ModelVersion.status`. */
export type ModelStatus =
  | 'success'
  | 'failed';

/** Values used by the `deployment_status` field of models and model versions. */
export type DeploymentStatus =
  | 'training'
  | 'inference'
  | 'retired';

/** Status values shared by retraining jobs and retraining runs. */
export type JobStatus =
  | 'running'
  | 'success'
  | 'failed'
  | 'deployed';

/** Schedule granularity used for retraining and tuning frequencies. */
export type Frequency =
  | 'hourly'
  | 'daily'
  | 'weekly'
  | 'monthly';

/** Direction in which a run's metric is compared against its threshold. */
export type ThresholdDirection =
  | 'minimize'
  | 'maximize';
|
8
|
+
/** Shape of a single feature record (camelCase keys). */
export interface Feature {
  id: number;
  name: string;
  description: string;
  // Presumably references a FeatureGroup id; confirm against the producer of this data.
  groupId: number;
  testDatasetId: number;
  /** Column names the feature reads. */
  inputColumns: Array<string>;
  /** Column names the feature produces. */
  outputColumns: Array<string>;
  /** Source code of the feature, as text. */
  code: string;
  createdAt: string;
  updatedAt: string;
}
|
20
|
+
|
21
|
+
/** A named collection of `Feature` records. */
export interface FeatureGroup {
  id: number;
  name: string;
  description: string;
  /** Features belonging to this group. */
  features: Array<Feature>;
  createdAt: string;
  updatedAt: string;
}
|
29
|
+
|
30
|
+
/**
 * One version of a model. Module-private: it is not exported from this file.
 * Note the mix of snake_case (`deployment_status`) and camelCase timestamps.
 */
interface ModelVersion {
  id: number;
  version: string;
  status: ModelStatus;
  deployment_status: DeploymentStatus;
  /** Free-form configuration; values must be narrowed before use. */
  configuration: Record<string, unknown>;
  createdAt: string;
  updatedAt: string;
}
|
39
|
+
|
40
|
+
/** Shape of a model record (snake_case keys), including its retraining state. */
export interface Model {
  // Identity and type
  id: number;
  name: string;
  model_type: string;
  /** Display form of `model_type`. */
  formatted_model_type: string;
  task: string;
  objective: string;

  // Results and lifecycle
  /** Free-form metrics; values must be narrowed before use. */
  metrics: Record<string, unknown>;
  status: ModelStatus;
  deployment_status: DeploymentStatus;

  // Associated dataset
  dataset_id: number;
  dataset: Dataset;

  // Versioning and configuration
  version: string;
  configuration: Record<string, unknown>;
  created_at: string;
  updated_at: string;

  // Retraining
  retraining_runs: Array<RetrainingRun>;
  last_run_at: string | null;
  /** Most recent retraining run, or null when there is none. */
  last_run: RetrainingRun | null;
  retraining_job: RetrainingJob | null;
  formatted_frequency: string | null;
  is_training: boolean;
  metrics_url: string | null;
}
|
64
|
+
|
65
|
+
/** Shape of a single prediction record (camelCase keys). */
export interface Prediction {
  id: number;
  modelId: number;
  timestamp: string;
  /** Input payload keyed by name; values are intentionally untyped. */
  input: Record<string, any>;
  /** Model output; intentionally untyped. */
  output: any;
  /** Optional reference value for the prediction; intentionally untyped. */
  groundTruth?: any;
  /** Latency in milliseconds, per the field name. */
  latencyMs: number;
}
|
74
|
+
|
75
|
+
/** Shape of a retraining job record: its schedule, evaluator and tuner settings. */
export interface RetrainingJob {
  id: number;
  model: string;

  // Schedule
  frequency: Frequency;
  /** Display form of `frequency`. */
  formatted_frequency: string;
  // Numeric schedule offset; its unit is not visible in this file (confirm with the backend).
  at: number;

  // Evaluation and tuning
  evaluator: Record<string, unknown>;
  tuner_config: Record<string, unknown>;
  tuning_frequency: Frequency;
  last_tuning_at: string | null;

  // State
  active: boolean;
  status: JobStatus;
  last_run_at: string | null;
  locked_at: string | null;
  created_at: string;
  updated_at: string;
}
|
92
|
+
|
93
|
+
/** Shape of a single retraining run record. */
export interface RetrainingRun {
  // Identity and associations
  id: number;
  model_id: number;
  retraining_job_id: number;
  tuner_job_id: number | null;

  // Outcome
  status: JobStatus;
  metric_value: number | null;
  threshold: number | null;
  threshold_direction: ThresholdDirection;
  deployable: boolean;

  // Timing and deployment
  started_at: string | null;
  is_deploying: boolean;
  completed_at: string | null;

  // Diagnostics
  error_message: string | null;
  metadata: Record<string, unknown>;
  created_at: string;
  updated_at: string;
  stacktrace: string | null;

  // Metrics
  /** Metric values keyed by metric name. */
  metrics: Record<string, number>;
  metrics_url: string | null;
}
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module EasyML
  # Base class for EasyML ActiveJob jobs.
  #
  # Runs on the +:easy_ml+ queue and exposes instance-level shortcuts that
  # forward to the class-level helpers on EasyML::Event.
  class ApplicationJob < ActiveJob::Base
    queue_as :easy_ml

    # Forwards to EasyML::Event.create_event.
    #
    # @param model the record the event is about
    # @param status the status to record (jobs in this codebase pass e.g. "started")
    # @param error optional error to attach; +nil+ when omitted
    def create_event(model, status, error = nil)
      EasyML::Event.create_event(model, status, error)
    end

    # Forwards to EasyML::Event.handle_error.
    #
    # @param model the record the error relates to
    # @param error the rescued exception
    def handle_error(model, error)
      EasyML::Event.handle_error(model, error)
    end

    # Forwards to EasyML::Event.format_stacktrace.
    #
    # @param error the exception whose stacktrace should be formatted
    def format_stacktrace(error)
      EasyML::Event.format_stacktrace(error)
    end

    # Forwards to EasyML::Event.wrap_text.
    #
    # @param text the text to wrap
    # @param max_length the length limit handed to the wrapper
    def wrap_text(text, max_length)
      EasyML::Event.wrap_text(text, max_length)
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
module EasyML
  # Base class for Resque jobs that are enqueued as a batch.
  #
  # Extends Resque::Plugins::BatchedJob and additionally stores the full
  # argument list of each batch in Redis so it can be read back later
  # (see fetch_batch_arguments).
  class BatchJob
    extend Resque::Plugins::BatchedJob
    @queue = :easy_ml

    class << self
      # Builds a unique batch id from the job class name and a random UUID.
      #
      # @return [String]
      def default_batch_id
        ["batch", name, SecureRandom.uuid].join("_")
      end

      # Enqueues one batched job per entry of +args_list+.
      #
      # E.g. EasyML::ComputeFeatureBatchJob.enqueue_batch(features.map(&:id))
      #
      # @param args_list [Array] one entry per job; non-Array entries are
      #   wrapped so every job receives an argument list
      # @param batch_id [String] id shared by all jobs of the batch
      # @return [String] the batch id
      def enqueue_batch(args_list, batch_id = default_batch_id)
        normalized = args_list.map { |entry| entry.is_a?(Array) ? entry : [entry] }
        store_batch_arguments(batch_id, normalized)

        normalized.each do |job_args|
          Resque.enqueue_batched_job(self, batch_id, *job_args)
        end

        batch_id
      end

      private

      # Redis key under which the original arguments of a batch are kept.
      def original_args_key(batch_id)
        "#{batch(batch_id)}:original_args"
      end

      # Persists the Resque-encoded argument list of the batch in Redis.
      def store_batch_arguments(batch_id, args_list)
        redis.set(original_args_key(batch_id), Resque.encode(args_list))
      end

      # Reads the stored argument list back.
      #
      # @return [Array] decoded arguments, or [] when nothing is stored
      def fetch_batch_arguments(batch_id)
        payload = redis.get(original_args_key(batch_id))
        return [] unless payload

        Resque.decode(payload)
      end

      # Redis connection used for batch bookkeeping (Resque's own).
      def redis
        Resque.redis
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module EasyML
  # Batched Resque job that fits a single feature batch, then notifies the
  # dataset once every job of the batch has finished.
  class ComputeFeatureJob < BatchJob
    # Class-level instance variables are not inherited, so the queue is
    # declared again here.
    @queue = :easy_ml

    # Fits one batch of a feature.
    #
    # @param batch_id [String] id of the batch this job belongs to (unused here)
    # @param options [Hash] job options; must contain +feature_id+ (string or
    #   symbol key). The symbolized options are passed on to Feature#fit_batch.
    # @raise [ActiveRecord::RecordNotFound] when the feature does not exist
    def self.perform(batch_id, options = {})
      # Non-destructive symbolize: do not mutate the caller's hash.
      options = options.symbolize_keys
      feature = EasyML::Feature.find(options[:feature_id])
      feature.fit_batch(options)
    end

    # Called once after all jobs of the batch completed. Looks up the dataset
    # through the first feature id stored with the batch and triggers
    # Dataset#after_fit_features on it.
    #
    # Does nothing when no stored feature id resolves to a feature/dataset
    # (e.g. the batch arguments are missing or the feature was deleted).
    def self.after_batch_hook(batch_id, *args)
      puts "After batch!"
      feature_ids = fetch_batch_arguments(batch_id)
                    .flatten
                    .map(&:symbolize_keys)
                    .pluck(:feature_id)
                    .compact
                    .uniq

      dataset = EasyML::Feature.find_by(id: feature_ids.first)&.dataset
      return if dataset.nil?

      dataset.after_fit_features
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module EasyML
  module Jobs
    # Marks a feature as applied: stamps +applied_at+ with the current time
    # and clears its +needs_fit+ flag. Runs on the +:features+ queue.
    class FinalizeFeatureJob < ApplicationJob
      queue_as :features

      # @param feature_id id of the EasyML::Feature to finalize
      # @raise [ActiveRecord::RecordNotFound] when the feature does not exist
      # @raise [ActiveRecord::RecordInvalid] when the update fails validation
      def perform(feature_id)
        EasyML::Feature.find(feature_id).update!(applied_at: Time.current, needs_fit: false)
      end
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module EasyML
  # Background job that prepares a dataset and either kicks off asynchronous
  # feature fitting or refreshes the dataset directly.
  class RefreshDatasetJob < ApplicationJob
    # @param id id of the EasyML::Dataset to refresh
    # @raise [ActiveRecord::RecordNotFound] when the dataset does not exist
    def perform(id)
      dataset = EasyML::Dataset.find(id)
      puts "Refreshing dataset #{dataset.name}"
      puts "Needs refresh? #{dataset.needs_refresh?}"
      # NOTE(review): the job keeps going after marking the dataset ready;
      # confirm whether an early return was intended here.
      dataset.update(workflow_status: :ready) unless dataset.needs_refresh?

      create_event(dataset, "started")

      prepare_and_refresh(dataset)
    end

    private

    # Prepares the dataset, then fits features asynchronously when any need
    # fitting, otherwise refreshes immediately. Errors are re-raised in the
    # test environment and handed to handle_error everywhere else.
    def prepare_and_refresh(dataset)
      puts "Prepare! #{dataset.name}"
      dataset.prepare
      if dataset.features.needs_fit.any?
        dataset.fit_features(async: true)
        puts "Computing features!"
      else
        dataset.actually_refresh
        puts "Done!"
      end
    rescue StandardError => e
      puts "Error #{e.message}"
      raise e if Rails.env.test?

      handle_error(dataset, e)
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module EasyML
  # Background job that refreshes a datasource and records the outcome.
  class SyncDatasourceJob < ApplicationJob
    queue_as :easy_ml

    # @param id id of the EasyML::Datasource to sync
    # @raise [ActiveRecord::RecordNotFound] when the datasource does not exist
    def perform(id)
      datasource = EasyML::Datasource.find(id)
      create_event(datasource, "started")

      refresh_datasource(datasource)
    end

    private

    # Refreshes the datasource; on failure clears its +is_syncing+ flag and
    # hands the error to handle_error instead of re-raising.
    def refresh_datasource(datasource)
      datasource.refresh
    rescue StandardError => e
      datasource.update!(is_syncing: false)
      handle_error(datasource, e)
    end
  end
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
module EasyML
  # Background job that trains a model and marks the current run as failed
  # when the process is terminated (SIGTERM) or interrupted (SIGINT).
  class TrainingJob < ApplicationJob
    # Raised into the main thread when training is aborted by a signal or
    # by the (currently disabled) inactivity monitor.
    class TrainingTimeoutError < StandardError; end

    INACTIVITY_TIMEOUT = 15 # seconds

    # Trains the model with the given id. Does nothing when it does not exist.
    #
    # Signal handlers are installed for the duration of the training and
    # restored afterwards, so they do not leak into later work executed by the
    # same process with a stale +@model+.
    #
    # @param model_id id of the EasyML::Model to train
    def perform(model_id)
      @model = EasyML::Model.find_by(id: model_id)
      return if @model.nil?

      @last_activity = Time.current
      setup_signal_traps
      # @monitor_thread = start_monitor_thread

      # Every training iteration counts as activity for the inactivity monitor.
      @model.actually_train do |_iteration_info|
        @last_activity = Time.current
      end
    ensure
      # @monitor_thread&.exit
      restore_signal_traps
    end

    private

    # Installs TERM/INT handlers that mark the run as failed and abort the
    # training, remembering the handlers they replace.
    #
    # NOTE(review): cleanup performs record updates from inside the trap
    # handler; confirm this is safe for the database adapter in use.
    def setup_signal_traps
      @previous_traps = {}

      # Handle graceful shutdown on SIGTERM
      @previous_traps["TERM"] = Signal.trap("TERM") do
        puts "Received SIGTERM, cleaning up..."
        cleanup("Training process terminated")
        raise TrainingTimeoutError, "Training process terminated"
      end

      # Handle Ctrl+C
      @previous_traps["INT"] = Signal.trap("INT") do
        puts "Received SIGINT, cleaning up..."
        cleanup("Training process interrupted")
        raise TrainingTimeoutError, "Training process interrupted"
      end
    end

    # Puts back the handlers that were active before setup_signal_traps ran.
    # No-op when no traps were installed (e.g. the model was not found).
    def restore_signal_traps
      return if @previous_traps.nil?

      @previous_traps.each do |signal, handler|
        # Signal.trap may report nil for a previous handler; passing nil back
        # would ignore the signal, so fall back to the default action.
        Signal.trap(signal, handler || "DEFAULT")
      end
      @previous_traps = nil
    end

    # Marks the latest run (if any) as failed and clears the model's
    # +is_training+ flag. Runs at most once per job instance.
    #
    # @param error_message [String] message stored on the failed run
    def cleanup(error_message)
      return if @cleaned_up

      @cleaned_up = true
      # last_run can be nil; the training flag must still be cleared then.
      @model.last_run&.update(status: "failed", error_message: error_message, completed_at: Time.current)
      @model.update(is_training: false)
    end

    # Starts a thread that aborts the training when no activity was recorded
    # for INACTIVITY_TIMEOUT seconds. Currently not started by perform.
    #
    # @return [Thread]
    def start_monitor_thread
      Thread.new do
        loop do
          puts "Monitoring activity... #{Time.current - @last_activity}"
          if Time.current - @last_activity >= INACTIVITY_TIMEOUT
            puts "Training process inactive for #{INACTIVITY_TIMEOUT} seconds, terminating..."
            cleanup("Training process timed out")
            Thread.main.raise(TrainingTimeoutError)
            break
          end
          sleep 1
        end
      end
    end
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module EasyML
  module Adapters
    # Abstract adapter between a datasource record and its underlying data.
    #
    # Subclasses are expected to implement #query, #in_batches, #files,
    # #last_updated_at and #data; the versions here raise NotImplementedError.
    class BaseAdapter
      # @return the datasource this adapter was built for
      attr_reader :datasource

      # @param datasource the datasource this adapter serves
      def initialize(datasource)
        @datasource = datasource
      end

      # Abstract: query the underlying data.
      # @raise [NotImplementedError] always
      def query(*)
        raise NotImplementedError
      end

      # Abstract: iterate over the data in batches.
      # @raise [NotImplementedError] always
      def in_batches(*)
        raise NotImplementedError
      end

      # Abstract: list the files backing the datasource.
      # @raise [NotImplementedError] always
      def files
        raise NotImplementedError
      end

      # Abstract: time the underlying data was last updated.
      # @raise [NotImplementedError] always
      def last_updated_at
        raise NotImplementedError
      end

      # Abstract: the full data.
      # @raise [NotImplementedError] always
      def data
        raise NotImplementedError
      end

      # @return [Boolean] +false+ unless a subclass overrides it
      def needs_refresh?
        false
      end

      # Runs an empty block through +datasource.syncing+; the default
      # implementation has nothing of its own to refresh.
      def refresh
        datasource.syncing do
          # Default implementation does nothing
        end
      end

      # Same as #refresh in the base adapter.
      def refresh!
        refresh
      end
    end
  end
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
module EasyML
  module Adapters
    # Adapter for datasources whose data is a Polars DataFrame serialized in
    # the datasource's +configuration["df"]+.
    class PolarsAdapter < BaseAdapter
      # Builds the adapter and loads the DataFrame from the datasource
      # configuration, when one is stored there.
      def initialize(datasource)
        super
        read_df_from_configuration
      end

      # Returns a transformed copy of the DataFrame.
      #
      # Steps are applied in this order: filter, select, unique, drop, sort,
      # limit.
      #
      # @param drop_cols [Array<String>] columns to drop; names that are not
      #   present after the earlier steps are ignored
      # @param filter optional filter expression passed to DataFrame#filter
      # @param limit [Integer, nil] maximum number of rows
      # @param select optional column selection passed to DataFrame#select
      # @param unique truthy to de-duplicate rows
      # @param sort optional sort key(s) passed to DataFrame#sort
      # @param descending [Boolean] sort direction (only used with +sort+)
      # @return the resulting DataFrame, or +nil+ when no DataFrame is loaded
      def query(drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false)
        return if df.nil?

        # Work on a copy under a distinct name so the private accessor is not shadowed.
        frame = df.clone
        frame = frame.filter(filter) if filter
        frame = frame.select(select) if select.present?
        frame = frame.unique if unique
        drop_cols &= frame.columns
        frame = frame.drop(drop_cols) unless drop_cols.empty?
        frame = frame.sort(sort, reverse: descending) if sort
        frame = frame.limit(limit) if limit
        frame
      end

      # Yields consecutive slices of at most +of+ rows.
      #
      # Yields nothing when no DataFrame is loaded (consistent with #query),
      # and returns an Enumerator when called without a block.
      #
      # @param of [Integer] batch size
      def in_batches(of: 10_000)
        return enum_for(:in_batches, of: of) unless block_given?
        return if df.nil?

        total_rows = df.shape[0]
        (0...total_rows).step(of) do |start|
          # The last batch may be shorter than +of+.
          yield df.slice(start, [of, total_rows - start].min)
        end
      end

      # @return [Array] always empty; this adapter is not backed by files
      def files
        []
      end

      # @return the datasource record's +updated_at+
      def last_updated_at
        datasource.updated_at
      end

      # @return the loaded DataFrame, or +nil+ when none is loaded
      def data
        df
      end

      private

      attr_accessor :df

      # Serializes the DataFrame to JSON and stores it under the "df" key of
      # the datasource configuration. No-op when no DataFrame is loaded.
      def store_df_in_configuration
        return unless df

        datasource.configuration = (datasource.configuration || {}).merge(
          "df" => JSON.parse(df.write_json)
        )
      end

      # Rebuilds the DataFrame from +configuration["df"]+, one Series per
      # stored column ("name", "datatype", "values"). No-op when the
      # configuration has no "df" entry.
      def read_df_from_configuration
        return unless datasource.configuration&.key?("df")

        df_data = datasource.configuration["df"]
        columns = df_data["columns"].map do |col|
          dtype = case col["datatype"]
                  when Hash
                    if col["datatype"]["Datetime"]
                      # NOTE(review): builds a Datetime with the stored unit but
                      # then keeps only its class; confirm the unit may be dropped.
                      Polars::Datetime.new(col["datatype"]["Datetime"][0].downcase.to_sym).class
                    else
                      # Any other structured datatype is read back as a string column.
                      Polars::Utf8
                    end
                  else
                    Polars.const_get(col["datatype"])
                  end
          Polars::Series.new(col["name"], col["values"], dtype: dtype)
        end

        @df = Polars::DataFrame.new(columns)
      end
    end
  end
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
module EasyML
  # Removes stale json/parquet/csv files from the models, datasets and
  # datasources directories under EasyML::Engine.root_dir, sparing the files
  # still referenced by models, datasets and datasources.
  class Cleaner
    attr_accessor :files_to_keep, :dirs_to_clean

    # @param force [Boolean] when true nothing is kept (everything is cleaned)
    # @param verbose [Boolean] passed through to the file rotator
    def initialize(force: false, verbose: false)
      @verbose = verbose
      @files_to_keep = if force
                         []
                       else
                         model_files_to_keep +
                           dataset_files_to_keep +
                           datasource_files_to_keep
                       end
    end

    # Cleans everything that is not referenced any more.
    def self.clean(verbose: false)
      new(verbose: verbose).clean
    end

    # Clean everything, including active models
    def self.clean!(verbose: false)
      new(force: true, verbose: verbose).clean
    end

    # Runs the cleanup for every managed directory, keeping only the
    # protected files located under that directory.
    def clean
      dirs_to_clean.each do |dir|
        # Distinct local name: do not shadow the files_to_keep accessor.
        keep = files_to_keep_for_dir(dir)
        EasyML::Support::FileRotate.new(dir, keep, verbose: @verbose).cleanup(%w[json parquet csv])
      end
    end

    private

    # Protected files (as strings) whose path starts with +dir+.
    def files_to_keep_for_dir(dir)
      files_to_keep.map(&:to_s).select { |f| f.start_with?(dir.to_s) }
    end

    # Directories managed by the cleaner.
    def dirs_to_clean
      %w[models datasets datasources].map do |dir|
        EasyML::Engine.root_dir.join(dir)
      end
    end

    # Parent directory of every model's root_dir.
    def model_dirs
      EasyML::Model.all.includes(dataset: :datasource).map do |model|
        File.expand_path("..", model.root_dir)
      end
    end

    # All models plus the deployed ones (memoized).
    def active_models
      @active_models ||= begin
        inference_models = EasyML::Model.deployed
        training_models = EasyML::Model.all
        (training_models + inference_models).compact
      end
    end

    # Paths of the model files of all active models (none in the test env).
    def model_files_to_keep
      if Rails.env.test?
        []
      else
        active_models.map(&:model_file).compact.map(&:full_path).uniq
      end
    end

    # Files of all datasets (none in the test env).
    def dataset_files_to_keep
      if Rails.env.test?
        []
      else
        EasyML::Dataset.all.flat_map(&:files).uniq
      end
    end

    # Files of all datasources. In the test env every csv/parquet file found
    # under the datasources directory is kept instead.
    def datasource_files_to_keep
      if Rails.env.test?
        # Pathname#glob already returns the matching paths; feeding them to
        # Dir.glob again would re-interpret each path as a glob pattern.
        EasyML::Engine.root_dir.glob("datasources/**/*.{csv,parquet}").map(&:to_s).uniq
      else
        EasyML::Datasource.all.flat_map(&:files).uniq
      end
    end
  end
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
# == Schema Information
|
2
|
+
#
|
3
|
+
# Table name: easy_ml_columns
|
4
|
+
#
|
5
|
+
# id :bigint not null, primary key
|
6
|
+
# dataset_id :bigint not null
|
7
|
+
# name :string not null
|
8
|
+
# description :string
|
9
|
+
# datatype :string
|
10
|
+
# polars_datatype :string
|
11
|
+
# is_target :boolean
|
12
|
+
# hidden :boolean default(FALSE)
|
13
|
+
# drop_if_null :boolean default(FALSE)
|
14
|
+
# preprocessing_steps :json
|
15
|
+
# sample_values :json
|
16
|
+
# statistics :json
|
17
|
+
# created_at :datetime not null
|
18
|
+
# updated_at :datetime not null
|
19
|
+
#
|
20
|
+
module EasyML
  # A single column of an EasyML::Dataset, stored in +easy_ml_columns+.
  #
  # Tracks the column's datatype (both the EasyML name and the Polars one),
  # its preprocessing configuration and helper predicates derived from it.
  class Column < ActiveRecord::Base
    self.table_name = "easy_ml_columns"
    include Historiographer::Silent
    historiographer_mode :snapshot_only

    belongs_to :dataset, class_name: "EasyML::Dataset"

    validates :name, presence: true
    validates :name, uniqueness: { scope: :dataset_id }

    before_save :ensure_valid_datatype

    # Scopes
    scope :visible, -> { where(hidden: false) }
    scope :numeric, -> { where(datatype: %w[float integer]) }
    scope :categorical, -> { where(datatype: %w[categorical string boolean]) }
    scope :datetime, -> { where(datatype: "datetime") }

    # Sets the datatype and mirrors the same value into +polars_datatype+.
    #
    # @param dtype the EasyML datatype name (e.g. "float")
    def datatype=(dtype)
      write_attribute(:datatype, dtype)
      write_attribute(:polars_datatype, dtype)
    end

    # Looks up the Polars type registered for an EasyML datatype name.
    #
    # @param dtype [String, Symbol] a key of PolarsColumn::TYPE_MAP
    # @return the mapped Polars type, or +nil+ when the name is unknown
    def get_polars_type(dtype)
      EasyML::Data::PolarsColumn::TYPE_MAP[dtype.to_sym]
    end

    # @return the Polars type for the stored +polars_datatype+, or +nil+ when
    #   none is stored
    def polars_type
      return nil if polars_datatype.blank?

      get_polars_type(polars_datatype)
    end

    # Stores a Polars type: its string form goes into +polars_datatype+ and
    # the EasyML name registered for its class (if any) into +datatype+.
    #
    # @param dtype a Polars datatype instance
    def polars_type=(dtype)
      write_attribute(:polars_datatype, dtype.to_s)
      # Look the type up by the class of the *argument*; the previous code
      # referenced an undefined +type+ here and raised NameError on every call.
      write_attribute(:datatype, EasyML::Data::PolarsColumn::POLARS_MAP[dtype.class.to_s]&.to_s)
    end

    # Assigns the preprocessing steps, casting every +:constant+ param to the
    # column's datatype. Blank input is stored as an empty hash.
    #
    # @param steps [Hash, nil] step name => config; params are read with
    #   symbol keys (+config[:params][:constant]+)
    def preprocessing_steps=(steps)
      return super({}) if steps.blank?

      typed_steps = steps.transform_values do |config|
        next config unless config[:params]&.key?(:constant)

        # Copy before casting so the caller's hash is left untouched.
        config.deep_dup.tap do |c|
          c[:params][:constant] = convert_to_type(c[:params][:constant])
        end
      end

      super(typed_steps)
    end

    # @return [Hash] the stored steps with symbolized top-level keys
    #   (empty hash when nothing is stored)
    def preprocessing_steps
      (read_attribute(:preprocessing_steps) || {}).symbolize_keys
    end

    # @return [Boolean] whether the training step enables one-hot encoding
    def one_hot?
      preprocessing_steps.deep_symbolize_keys.dig(:training, :params, :one_hot) == true
    end

    # @return [Boolean] whether the training step enables ordinal encoding
    def ordinal_encoding?
      preprocessing_steps.deep_symbolize_keys.dig(:training, :params, :ordinal_encoding) == true
    end

    # Sorted categories recorded in the dataset's preprocessor statistics for
    # this column, followed by "other".
    #
    # @return [Array, nil] +nil+ unless the column is one-hot encoded
    def allowed_categories
      return nil unless one_hot?

      dataset.preprocessor.statistics.dup.to_h.dig(name.to_sym, :allowed_categories).sort.concat(["other"])
    end

    private

    # before_save callback: aborts the save when +datatype+ is set to a value
    # that is not a key of PolarsColumn::TYPE_MAP.
    def ensure_valid_datatype
      return if datatype.blank?

      return if EasyML::Data::PolarsColumn::TYPE_MAP.key?(datatype.to_sym)

      errors.add(:datatype, "must be one of: #{EasyML::Data::PolarsColumn::TYPE_MAP.keys.join(", ")}")
      throw :abort
    end

    # Casts +value+ according to the column's datatype: float, integer,
    # boolean and datetime are converted, anything else becomes a string.
    #
    # @return the converted value; +nil+ stays +nil+, and the original value
    #   is returned when the conversion raises ArgumentError or TypeError
    def convert_to_type(value)
      return value if value.nil?

      case datatype&.to_sym
      when :float
        Float(value)
      when :integer
        Integer(value)
      when :boolean
        ActiveModel::Type::Boolean.new.cast(value)
      when :datetime
        value.is_a?(String) ? Time.parse(value) : value
      else
        value.to_s
      end
    rescue ArgumentError, TypeError
      # If conversion fails, return original value
      value
    end

    NUMERIC_METHODS = %i[mean median].freeze
  end
end
|