easy_ml 0.1.4 → 0.2.0.pre.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +234 -26
- data/Rakefile +45 -0
- data/app/controllers/easy_ml/application_controller.rb +67 -0
- data/app/controllers/easy_ml/columns_controller.rb +38 -0
- data/app/controllers/easy_ml/datasets_controller.rb +156 -0
- data/app/controllers/easy_ml/datasources_controller.rb +88 -0
- data/app/controllers/easy_ml/deploys_controller.rb +20 -0
- data/app/controllers/easy_ml/models_controller.rb +151 -0
- data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
- data/app/controllers/easy_ml/settings_controller.rb +59 -0
- data/app/frontend/components/AlertProvider.tsx +108 -0
- data/app/frontend/components/DatasetPreview.tsx +161 -0
- data/app/frontend/components/EmptyState.tsx +28 -0
- data/app/frontend/components/ModelCard.tsx +255 -0
- data/app/frontend/components/ModelDetails.tsx +334 -0
- data/app/frontend/components/ModelForm.tsx +384 -0
- data/app/frontend/components/Navigation.tsx +300 -0
- data/app/frontend/components/Pagination.tsx +72 -0
- data/app/frontend/components/Popover.tsx +55 -0
- data/app/frontend/components/PredictionStream.tsx +105 -0
- data/app/frontend/components/ScheduleModal.tsx +726 -0
- data/app/frontend/components/SearchInput.tsx +23 -0
- data/app/frontend/components/SearchableSelect.tsx +132 -0
- data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
- data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
- data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
- data/app/frontend/components/dataset/ColumnList.tsx +101 -0
- data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
- data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
- data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
- data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
- data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
- data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
- data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
- data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
- data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
- data/app/frontend/components/dataset/splitters/constants.ts +77 -0
- data/app/frontend/components/dataset/splitters/types.ts +168 -0
- data/app/frontend/components/dataset/splitters/utils.ts +53 -0
- data/app/frontend/components/features/CodeEditor.tsx +46 -0
- data/app/frontend/components/features/DataPreview.tsx +150 -0
- data/app/frontend/components/features/FeatureCard.tsx +88 -0
- data/app/frontend/components/features/FeatureForm.tsx +235 -0
- data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
- data/app/frontend/components/settings/PluginSettings.tsx +81 -0
- data/app/frontend/components/ui/badge.tsx +44 -0
- data/app/frontend/components/ui/collapsible.tsx +9 -0
- data/app/frontend/components/ui/scroll-area.tsx +46 -0
- data/app/frontend/components/ui/separator.tsx +29 -0
- data/app/frontend/entrypoints/App.tsx +40 -0
- data/app/frontend/entrypoints/Application.tsx +24 -0
- data/app/frontend/hooks/useAutosave.ts +61 -0
- data/app/frontend/layouts/Layout.tsx +38 -0
- data/app/frontend/lib/utils.ts +6 -0
- data/app/frontend/mockData.ts +272 -0
- data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
- data/app/frontend/pages/DatasetsPage.tsx +261 -0
- data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
- data/app/frontend/pages/DatasourcesPage.tsx +261 -0
- data/app/frontend/pages/EditModelPage.tsx +45 -0
- data/app/frontend/pages/EditTransformationPage.tsx +56 -0
- data/app/frontend/pages/ModelsPage.tsx +115 -0
- data/app/frontend/pages/NewDatasetPage.tsx +366 -0
- data/app/frontend/pages/NewModelPage.tsx +45 -0
- data/app/frontend/pages/NewTransformationPage.tsx +43 -0
- data/app/frontend/pages/SettingsPage.tsx +272 -0
- data/app/frontend/pages/ShowModelPage.tsx +30 -0
- data/app/frontend/pages/TransformationsPage.tsx +95 -0
- data/app/frontend/styles/application.css +100 -0
- data/app/frontend/types/dataset.ts +146 -0
- data/app/frontend/types/datasource.ts +33 -0
- data/app/frontend/types/preprocessing.ts +1 -0
- data/app/frontend/types.ts +113 -0
- data/app/helpers/easy_ml/application_helper.rb +10 -0
- data/app/jobs/easy_ml/application_job.rb +21 -0
- data/app/jobs/easy_ml/batch_job.rb +46 -0
- data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
- data/app/jobs/easy_ml/deploy_job.rb +13 -0
- data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
- data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
- data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
- data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
- data/app/jobs/easy_ml/training_job.rb +62 -0
- data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
- data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
- data/app/models/easy_ml/cleaner.rb +82 -0
- data/app/models/easy_ml/column.rb +124 -0
- data/app/models/easy_ml/column_history.rb +30 -0
- data/app/models/easy_ml/column_list.rb +122 -0
- data/app/models/easy_ml/concerns/configurable.rb +61 -0
- data/app/models/easy_ml/concerns/versionable.rb +19 -0
- data/app/models/easy_ml/dataset.rb +767 -0
- data/app/models/easy_ml/dataset_history.rb +56 -0
- data/app/models/easy_ml/datasource.rb +182 -0
- data/app/models/easy_ml/datasource_history.rb +24 -0
- data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
- data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
- data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
- data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
- data/app/models/easy_ml/deploy.rb +114 -0
- data/app/models/easy_ml/event.rb +79 -0
- data/app/models/easy_ml/feature.rb +437 -0
- data/app/models/easy_ml/feature_history.rb +38 -0
- data/app/models/easy_ml/model.rb +575 -41
- data/app/models/easy_ml/model_file.rb +133 -0
- data/app/models/easy_ml/model_file_history.rb +24 -0
- data/app/models/easy_ml/model_history.rb +51 -0
- data/app/models/easy_ml/models/base_model.rb +58 -0
- data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
- data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
- data/app/models/easy_ml/models/xgboost.rb +544 -5
- data/app/models/easy_ml/prediction.rb +44 -0
- data/app/models/easy_ml/retraining_job.rb +278 -0
- data/app/models/easy_ml/retraining_run.rb +184 -0
- data/app/models/easy_ml/settings.rb +37 -0
- data/app/models/easy_ml/splitter.rb +90 -0
- data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
- data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
- data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
- data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
- data/app/models/easy_ml/tuner_job.rb +56 -0
- data/app/models/easy_ml/tuner_run.rb +31 -0
- data/app/models/splitter_history.rb +6 -0
- data/app/serializers/easy_ml/column_serializer.rb +27 -0
- data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
- data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
- data/app/serializers/easy_ml/feature_serializer.rb +27 -0
- data/app/serializers/easy_ml/model_serializer.rb +90 -0
- data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
- data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
- data/app/serializers/easy_ml/settings_serializer.rb +9 -0
- data/app/views/layouts/easy_ml/application.html.erb +15 -0
- data/config/initializers/resque.rb +3 -0
- data/config/resque-pool.yml +6 -0
- data/config/routes.rb +39 -0
- data/config/spring.rb +1 -0
- data/config/vite.json +15 -0
- data/lib/easy_ml/configuration.rb +64 -0
- data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
- data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
- data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
- data/lib/easy_ml/core/model_evaluator.rb +161 -89
- data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
- data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
- data/lib/easy_ml/core/tuner.rb +123 -62
- data/lib/easy_ml/core.rb +0 -3
- data/lib/easy_ml/core_ext/hash.rb +24 -0
- data/lib/easy_ml/core_ext/pathname.rb +11 -5
- data/lib/easy_ml/data/date_converter.rb +90 -0
- data/lib/easy_ml/data/filter_extensions.rb +31 -0
- data/lib/easy_ml/data/polars_column.rb +126 -0
- data/lib/easy_ml/data/polars_reader.rb +297 -0
- data/lib/easy_ml/data/preprocessor.rb +280 -142
- data/lib/easy_ml/data/simple_imputer.rb +255 -0
- data/lib/easy_ml/data/splits/file_split.rb +252 -0
- data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
- data/lib/easy_ml/data/splits/split.rb +95 -0
- data/lib/easy_ml/data/splits.rb +9 -0
- data/lib/easy_ml/data/statistics_learner.rb +93 -0
- data/lib/easy_ml/data/synced_directory.rb +341 -0
- data/lib/easy_ml/data.rb +6 -2
- data/lib/easy_ml/engine.rb +105 -6
- data/lib/easy_ml/feature_store.rb +227 -0
- data/lib/easy_ml/features.rb +61 -0
- data/lib/easy_ml/initializers/inflections.rb +17 -3
- data/lib/easy_ml/logging.rb +2 -2
- data/lib/easy_ml/predict.rb +74 -0
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
- data/lib/easy_ml/support/est.rb +5 -1
- data/lib/easy_ml/support/file_rotate.rb +79 -15
- data/lib/easy_ml/support/file_support.rb +9 -0
- data/lib/easy_ml/support/local_file.rb +24 -0
- data/lib/easy_ml/support/lockable.rb +62 -0
- data/lib/easy_ml/support/synced_file.rb +103 -0
- data/lib/easy_ml/support/utc.rb +5 -1
- data/lib/easy_ml/support.rb +6 -3
- data/lib/easy_ml/version.rb +4 -1
- data/lib/easy_ml.rb +7 -2
- metadata +355 -72
- data/app/models/easy_ml/models.rb +0 -5
- data/lib/easy_ml/core/model.rb +0 -30
- data/lib/easy_ml/core/model_core.rb +0 -181
- data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
- data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
- data/lib/easy_ml/core/models/xgboost.rb +0 -10
- data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
- data/lib/easy_ml/core/models.rb +0 -10
- data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
- data/lib/easy_ml/core/uploaders.rb +0 -7
- data/lib/easy_ml/data/dataloader.rb +0 -6
- data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
- data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
- data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
- data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
- data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
- data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
- data/lib/easy_ml/data/dataset/splits.rb +0 -11
- data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
- data/lib/easy_ml/data/dataset/splitters.rb +0 -9
- data/lib/easy_ml/data/dataset.rb +0 -430
- data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
- data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
- data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
- data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
- data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
- data/lib/easy_ml/data/datasource.rb +0 -33
- data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
- data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
- data/lib/easy_ml/deployment.rb +0 -5
- data/lib/easy_ml/support/synced_directory.rb +0 -134
- data/lib/easy_ml/transforms.rb +0 -29
- /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -0,0 +1,56 @@
|
|
1
|
+
# == Schema Information
|
2
|
+
#
|
3
|
+
# Table name: easy_ml_dataset_histories
|
4
|
+
#
|
5
|
+
# id :bigint not null, primary key
|
6
|
+
# dataset_id :integer not null
|
7
|
+
# name :string not null
|
8
|
+
# description :string
|
9
|
+
# dataset_type :string
|
10
|
+
# status :string
|
11
|
+
# version :string
|
12
|
+
# datasource_id :integer
|
13
|
+
# root_dir :string
|
14
|
+
# configuration :json
|
15
|
+
# num_rows :integer
|
16
|
+
# workflow_status :string
|
17
|
+
# statistics :json
|
18
|
+
# preprocessor_statistics :json
|
19
|
+
# schema :json
|
20
|
+
# refreshed_at :datetime
|
21
|
+
# created_at :datetime not null
|
22
|
+
# updated_at :datetime not null
|
23
|
+
# history_started_at :datetime not null
|
24
|
+
# history_ended_at :datetime
|
25
|
+
# history_user_id :integer
|
26
|
+
# snapshot_id :string
|
27
|
+
#
|
28
|
+
module EasyML
  # History model for datasets, backed by the easy_ml_dataset_histories table
  # and wired up through Historiographer::History.
  class DatasetHistory < ActiveRecord::Base
    self.table_name = "easy_ml_dataset_histories"
    include Historiographer::History

    # Column history rows joined on dataset_id and narrowed to the rows that
    # carry the same snapshot_id as this history record.
    has_many :columns,
             lambda { |dataset_history| where(snapshot_id: dataset_history.snapshot_id) },
             class_name: "EasyML::ColumnHistory",
             foreign_key: "dataset_id",
             primary_key: "dataset_id",
             extend: EasyML::ColumnList

    # Returns the root_dir column exactly as stored on this row.
    def root_dir
      read_attribute(:root_dir)
    end

    # The three methods below return fixed answers for every history row.
    # NOTE(review): presumably this keeps a historical snapshot from being
    # re-fit, re-processed or re-split — confirm against the callers.

    # @return [false] always
    def fit
      false
    end

    # @return [true] always
    def processed?
      true
    end

    # @return [false] always
    def should_split?
      false
    end
  end
end
|
@@ -0,0 +1,182 @@
|
|
1
|
+
# == Schema Information
|
2
|
+
#
|
3
|
+
# Table name: easy_ml_datasources
|
4
|
+
#
|
5
|
+
# id :bigint not null, primary key
|
6
|
+
# name :string not null
|
7
|
+
# datasource_type :string
|
8
|
+
# root_dir :string
|
9
|
+
# configuration :json
|
10
|
+
# refreshed_at :datetime
|
11
|
+
# created_at :datetime not null
|
12
|
+
# updated_at :datetime not null
|
13
|
+
#
|
14
|
+
module EasyML
  # ActiveRecord model for a source of raw data (easy_ml_datasources table).
  #
  # The record stores a datasource_type ("s3", "file" or "polars") and hands
  # all data access to the adapter class that DATASOURCE_OPTIONS maps that
  # type to. The adapter is built lazily and receives this record.
  class Datasource < ActiveRecord::Base
    self.table_name = "easy_ml_datasources"
    include Historiographer::Silent
    historiographer_mode :snapshot_only
    include EasyML::Concerns::Configurable

    # datasource_type value => adapter class name.
    DATASOURCE_OPTIONS = {
      "s3" => "EasyML::Datasources::S3Datasource",
      "file" => "EasyML::Datasources::FileDatasource",
      "polars" => "EasyML::Datasources::PolarsDatasource",
    }.freeze

    # Display metadata (value / label / description) for each datasource type.
    DATASOURCE_TYPES = [
      {
        value: "s3",
        label: "Amazon S3",
        description: "Connect to data stored in Amazon Simple Storage Service (S3) buckets",
      },
      {
        value: "file",
        label: "Local Files",
        description: "Connect to data stored in local files",
      },
      {
        value: "polars",
        label: "Polars DataFrame",
        description: "In-memory dataframe storage using Polars",
      },
    ].freeze
    DATASOURCE_NAMES = DATASOURCE_OPTIONS.keys.freeze
    DATASOURCE_CONSTANTS = DATASOURCE_OPTIONS.values.map(&:constantize).freeze

    validates :name, presence: true
    validates :datasource_type, presence: true
    validates :datasource_type, inclusion: { in: DATASOURCE_NAMES }
    # validate :validate_datasource_exists

    # Callback order is significant: root_dir is filled in before the adapter
    # writes its state into the configuration.
    before_save :set_root_dir
    after_initialize :read_adapter_from_configuration, if: :persisted?
    after_find :read_adapter_from_configuration
    before_save :store_adapter_in_configuration
    after_create :refresh_async

    has_many :events, as: :eventable, class_name: "EasyML::Event", dependent: :destroy
    attr_accessor :schema, :columns, :num_rows, :is_syncing

    add_configuration_attributes :schema, :columns, :num_rows, :polars_args, :verbose, :is_syncing
    # Every adapter contributes its own configuration attributes to the record.
    DATASOURCE_CONSTANTS.flat_map(&:configuration_attributes).each do |attribute|
      add_configuration_attributes attribute
    end

    delegate :query, :in_batches, :files, :all_files, :last_updated_at, :data, :needs_refresh?,
             :should_sync?, :files_to_sync, :s3_access_key_id, :s3_secret_access_key,
             :download_file, :clean, to: :adapter

    # Constants exposed as a hash: the datasource types plus the S3 adapter's
    # own constants hash.
    # NOTE(review): defining `self.constants` shadows Module#constants for
    # this class.
    def self.constants
      {
        DATASOURCE_TYPES: DATASOURCE_TYPES,
        s3: EasyML::Datasources::S3Datasource.constants,
      }
    end

    # Asks the adapter to convert its files to parquet.
    #
    # @param columns [Object, nil] forwarded unchanged to the adapter
    # @return [false] when the adapter has no convert_to_parquet; otherwise
    #   whatever the adapter returns
    def reread(columns = nil)
      return false unless adapter.respond_to?(:convert_to_parquet)

      adapter.convert_to_parquet(columns)
    end

    # Lists the adapter's files that exist on disk and end in ".csv", with
    # the leading Rails.root path removed.
    #
    # The root is stripped as a literal prefix. Building a Regexp from the
    # path would treat characters such as "." or "+" as metacharacters and
    # would also remove later repetitions of the root inside the path.
    #
    # @return [Array<String>]
    def available_files
      root_prefix = Rails.root.to_s
      all_files
        .select { |f| File.exist?(f) && Pathname.new(f).extname == ".csv" }
        .map { |f| f.delete_prefix(root_prefix) }
    end

    # @return [Boolean] true when datasource_type is "polars"
    def in_memory?
      datasource_type == "polars"
    end

    # The stored root_dir when one is present, otherwise the default
    # directory derived from the datasource name.
    def root_dir
      persisted = read_attribute(:root_dir)
      return persisted if persisted.present?

      default_root_dir
    end

    # Marks the record as syncing and enqueues SyncDatasourceJob with this
    # record's id. Also runs as the after_create callback.
    def refresh_async
      update(is_syncing: true)
      EasyML::SyncDatasourceJob.perform_later(id)
    end

    # Runs before a sync: sets the syncing flag (raising if the update
    # fails), then calls the adapter hook and logs the start.
    def before_sync
      update!(is_syncing: true)
      adapter.before_sync
      Rails.logger.info("Starting sync for datasource #{id}")
    end

    # Runs after a sync: calls the adapter hook, then records schema,
    # columns, row count and refresh time from the adapter's data, clears
    # the syncing flag and saves.
    def after_sync
      adapter.after_sync

      # `data` is delegated to the adapter; read it once and reuse it.
      df = data
      self.schema = df.schema.each_with_object({}) do |(k, v), h|
        h[k] = EasyML::Data::PolarsColumn.polars_to_sym(v)
      end
      self.columns = df.columns
      self.num_rows = df.shape[0]
      self.is_syncing = false
      self.refreshed_at = Time.now
      save
    end

    # Refreshes through the adapter only when the adapter reports that a
    # refresh is needed; otherwise just clears the syncing flag.
    def refresh
      unless adapter.needs_refresh?
        update!(is_syncing: false)
        return
      end

      syncing do
        adapter.refresh
      end
    end

    # Refreshes through the adapter unconditionally.
    # NOTE(review): BaseDatasource#refresh! ends up calling
    # datasource.syncing again, so adapters that inherit it appear to run
    # before_sync/after_sync twice — confirm this is intended.
    def refresh!
      syncing do
        adapter.refresh!
      end
    end

    # Wraps the block in before_sync / after_sync and returns the block's
    # value. after_sync is not reached when the block raises, so the syncing
    # flag stays set in that case.
    def syncing
      before_sync
      yield.tap do
        after_sync
      end
    end

    private

    # Lazily builds the adapter for datasource_type.
    #
    # @raise [RuntimeError] when the type has no entry in DATASOURCE_OPTIONS
    def adapter
      @adapter ||= begin
          adapter_class = DATASOURCE_OPTIONS[datasource_type]
          raise "Don't know how to use datasource adapter #{datasource_type}!" unless adapter_class.present?

          adapter_class.constantize.new(self)
        end
    end

    # <engine root_dir>/datasources/<name, lower-cased, whitespace runs
    # replaced by "_">. split(" ") already collapses whitespace runs.
    def default_root_dir
      folder = name.split(" ").join("_").downcase
      EasyML::Engine.root_dir.join("datasources").join(folder)
    end

    # before_save: persists the default root_dir when none has been stored.
    def set_root_dir
      write_attribute(:root_dir, default_root_dir) unless read_attribute(:root_dir).present?
    end

    # after_find / after_initialize: lets the adapter restore its state from
    # the stored configuration, for persisted records only.
    def read_adapter_from_configuration
      return unless persisted?

      adapter.read_from_configuration if adapter.respond_to?(:read_from_configuration)
    end

    # before_save: lets the adapter write its state into the configuration.
    def store_adapter_in_configuration
      adapter.store_in_configuration if adapter.respond_to?(:store_in_configuration)
    end

    # Validation helper (its `validate` line above is currently commented
    # out): adds the adapter's error message when the adapter reports that
    # the datasource does not exist.
    def validate_datasource_exists
      return if adapter.exists?

      errors.add(:root_dir, adapter.error_not_exists)
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# == Schema Information
|
2
|
+
#
|
3
|
+
# Table name: easy_ml_datasource_histories
|
4
|
+
#
|
5
|
+
# id :bigint not null, primary key
|
6
|
+
# datasource_id :integer not null
|
7
|
+
# name :string not null
|
8
|
+
# datasource_type :string
|
9
|
+
# root_dir :string
|
10
|
+
# configuration :json
|
11
|
+
# refreshed_at :datetime
|
12
|
+
# created_at :datetime not null
|
13
|
+
# updated_at :datetime not null
|
14
|
+
# history_started_at :datetime not null
|
15
|
+
# history_ended_at :datetime
|
16
|
+
# history_user_id :integer
|
17
|
+
# snapshot_id :string
|
18
|
+
#
|
19
|
+
module EasyML
  # History model for datasources, backed by the
  # easy_ml_datasource_histories table via Historiographer::History.
  class DatasourceHistory < ActiveRecord::Base
    self.table_name = "easy_ml_datasource_histories"
    include Historiographer::History
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module EasyML
  module Datasources
    # Common base for datasource adapters. An adapter wraps one datasource
    # record; subclasses supply the actual data access.
    class BaseDatasource
      include ActiveModel::Validations
      include EasyML::Concerns::Configurable

      # The datasource record this adapter was built for.
      attr_reader :datasource

      # @param datasource [Object] the record that owns this adapter
      def initialize(datasource)
        @datasource = datasource
      end

      # --- Hooks: no-ops unless a subclass overrides them -----------------

      def clean; end

      def before_sync; end

      def after_sync; end

      # --- Abstract interface: subclasses must implement these ------------

      # @raise [NotImplementedError] always, in the base class
      def query(*)
        raise NotImplementedError
      end

      # @raise [NotImplementedError] always, in the base class
      def in_batches(*)
        raise NotImplementedError
      end

      # @raise [NotImplementedError] always, in the base class
      def files
        raise NotImplementedError
      end

      # @raise [NotImplementedError] always, in the base class
      def last_updated_at
        raise NotImplementedError
      end

      # @raise [NotImplementedError] always, in the base class
      def data
        raise NotImplementedError
      end

      # --- Refresh defaults -------------------------------------------------

      # @return [false] the base adapter never asks for a refresh
      def needs_refresh?
        false
      end

      # Runs the datasource's syncing wrapper around an empty block, so only
      # the wrapper's own before/after steps execute.
      def refresh
        datasource.syncing do
          # Default implementation does nothing
        end
      end

      # Same as #refresh in the base class.
      def refresh!
        refresh
      end
    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
module EasyML
  module Datasources
    # Adapter for data stored in files under the datasource's root_dir. All
    # reading goes through an EasyML::Data::PolarsReader.
    class FileDatasource < BaseDatasource
      delegate :query, :convert_to_parquet, to: :reader

      # Sync hook: asks the reader to normalize.
      def after_sync
        reader.normalize
      end

      # Yields data batch by batch, as produced by the reader.
      def in_batches(&block)
        reader.in_batches(&block)
      end

      # Every file the reader knows about.
      def all_files
        reader.all_files
      end

      # The reader's `files` list.
      def files
        reader.files
      end

      # Most recent modification time among #files (nil when there are none).
      def last_updated_at
        mtimes = files.map { |path| File.mtime(path) }
        mtimes.max
      end

      # @return [false] file datasources never ask for a refresh
      def needs_refresh?
        false
      end

      # All reader batches stacked vertically into one dataframe. The result
      # is cached in @combined_df and reused while that cache is present.
      #
      # @return [Object, nil] nil when the reader yields no batches
      def data
        return @combined_df if @combined_df.present?

        batches = []
        reader.in_batches { |df| batches << df }
        # reduce without an initial value returns nil for an empty list.
        @combined_df = batches.reduce { |stacked, df| stacked.vstack(df) }
      end

      # True when at least one csv or parquet file exists anywhere under the
      # datasource's root_dir.
      def exists?
        pattern = File.join(datasource.root_dir, "**/*.{csv,parquet}")
        Dir.glob(pattern).any?
      end

      # Message used when #exists? is false.
      def error_not_exists
        "Expected to find datasource files at #{datasource.root_dir}"
      end

      private

      # Lazily built reader, configured with the datasource's root_dir and
      # its stored "polars_args" (nil when no configuration is set).
      def reader
        @reader ||= begin
            config = datasource.configuration || {}
            EasyML::Data::PolarsReader.new(
              root_dir: datasource.root_dir,
              polars_args: config.dig("polars_args"),
            )
          end
      end
    end
  end
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
module EasyML
  module Datasources
    # Adapter for a dataframe held on the datasource record itself
    # (datasource.df). The dataframe is serialized into the record's
    # configuration under "df" on save and rebuilt from it on load.
    class PolarsDatasource < BaseDatasource
      validates :df, presence: true
      add_configuration_attributes :df

      # Returns a transformed copy of the dataframe. Steps run in this order:
      # filter, select, unique, drop, sort, limit.
      #
      # @param drop_cols [Array] columns to drop; names not present at that
      #   point (after select) are ignored
      # @param filter [Object, nil] passed to DataFrame#filter when given
      # @param limit [Integer, nil] maximum number of rows
      # @param select [Object, nil] passed to DataFrame#select when present
      # @param unique [Boolean, nil] de-duplicate rows when truthy
      # @param sort [Object, nil] passed to DataFrame#sort when given
      # @param descending [Boolean] passed to sort as `reverse:`
      # @return [Object, nil] the resulting dataframe, or nil when there is
      #   no dataframe
      def query(drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false)
        return if df.nil?

        frame = df.clone
        frame = frame.filter(filter) if filter
        frame = frame.select(select) if select.present?
        frame = frame.unique if unique
        droppable = drop_cols & frame.columns
        frame = frame.drop(droppable) unless droppable.empty?
        frame = frame.sort(sort, reverse: descending) if sort
        frame = frame.limit(limit) if limit
        frame
      end

      # Yields consecutive slices of the dataframe of at most `of` rows.
      # Without a block it returns an Enumerator over the same slices
      # instead of raising LocalJumpError.
      #
      # @param of [Integer] batch size in rows
      # @yieldparam batch [Object] a slice of the dataframe
      def in_batches(of: 10_000)
        return enum_for(:in_batches, of: of) unless block_given?

        total_rows = df.shape[0]
        (0...total_rows).step(of) do |start|
          # The last batch may be shorter than `of`.
          yield df.slice(start, [of, total_rows - start].min)
        end
      end

      # An in-memory datasource has no backing files.
      def all_files
        []
      end

      # An in-memory datasource has no backing files.
      def files
        []
      end

      # Uses the record's updated_at as the last-updated time.
      def last_updated_at
        datasource.updated_at
      end

      # The full dataframe.
      def data
        df
      end

      # The dataframe stored on the datasource record.
      def df
        datasource.df
      end

      # True when a dataframe is present.
      def exists?
        df.present?
      end

      # Message used when #exists? is false.
      def error_not_exists
        "Must have a dataframe"
      end

      # Writes the dataframe, parsed from its JSON form, into the record's
      # configuration under "df". Does nothing when there is no dataframe.
      def store_in_configuration
        return unless df

        datasource.configuration = (datasource.configuration || {}).merge(
          "df" => JSON.parse(df.write_json),
        )
      end

      # Rebuilds datasource.df from configuration["df"]["columns"]. Each
      # entry supplies "name", "datatype" and "values". Does nothing when the
      # configuration has no usable "df" entry.
      def read_from_configuration
        return unless datasource.configuration&.key?("df")

        df_data = datasource.configuration["df"]
        return unless df_data.present? && df_data.key?("columns")

        columns = df_data["columns"].map do |col|
          dtype = case col["datatype"]
            when Hash
              if col["datatype"]["Datetime"]
                # NOTE(review): `.class` yields Polars::Datetime itself, so
                # the stored time unit is dropped here — confirm intended.
                Polars::Datetime.new(col["datatype"]["Datetime"][0].downcase.to_sym).class
              else
                # Any other structured datatype is read back as Utf8.
                Polars::Utf8
              end
            else
              # Plain datatype names are looked up as constants on Polars.
              Polars.const_get(col["datatype"])
            end
          Polars::Series.new(col["name"], col["values"], dtype: dtype)
        end

        datasource.df = Polars::DataFrame.new(columns)
      end
    end
  end
end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
module EasyML
  module Datasources
    # Adapter for data kept in an S3 bucket. All work is done by an
    # EasyML::Data::SyncedDirectory built from the datasource's root_dir and
    # its S3 settings.
    class S3Datasource < BaseDatasource
      # Region choices (value / label) offered for S3 datasources.
      REGIONS = [
        { value: "us-east-1", label: "US East (N. Virginia)" },
        { value: "us-east-2", label: "US East (Ohio)" },
        { value: "us-west-1", label: "US West (N. California)" },
        { value: "us-west-2", label: "US West (Oregon)" },
      ].freeze

      # Constants exposed as a hash.
      # NOTE(review): defining `self.constants` shadows Module#constants for
      # this class.
      def self.constants
        { S3_REGIONS: REGIONS }
      end

      validates :s3_bucket, :s3_access_key_id, :s3_secret_access_key, presence: true

      # NOTE(review): s3_region is declared here but is not passed to
      # SyncedDirectory below — confirm whether the region is used elsewhere.
      add_configuration_attributes :s3_bucket, :s3_prefix, :s3_region, :cache_for

      delegate :query, :data, :s3_access_key_id, :s3_secret_access_key, :before_sync, :after_sync, :clean,
               to: :synced_directory

      # Yields data batch by batch, as produced by the synced directory.
      def in_batches(&block)
        synced_directory.in_batches(&block)
      end

      def all_files
        synced_directory.all_files
      end

      def files
        synced_directory.files
      end

      def last_updated_at
        synced_directory.last_updated_at
      end

      # A refresh is needed whenever the synced directory says it should sync.
      def needs_refresh?
        synced_directory.should_sync?
      end

      # Syncs through SyncedDirectory#sync.
      def refresh
        synced_directory.sync
      end

      # Syncs through SyncedDirectory#sync!.
      def refresh!
        synced_directory.sync!
      end

      def files_to_sync
        synced_directory.files_to_sync
      end

      # @param file [Object] forwarded unchanged to the synced directory
      def download_file(file)
        synced_directory.download_file(file)
      end

      # True when the synced directory reports at least one file to sync.
      def exists?
        synced_directory.files_to_sync.any?
      end

      # Message used when #exists? is false.
      #
      # Nil parts are skipped before joining. s3_prefix has no default, and
      # File.join raises TypeError when handed nil.
      #
      # @return [String]
      def error_not_exists
        location = [s3_bucket, s3_prefix].compact.map(&:to_s)
        "No files found at s3://#{File.join(*location)}"
      end

      # Bucket from the datasource configuration, falling back to the global
      # EasyML::Configuration value.
      def s3_bucket
        datasource_config.dig("s3_bucket") || EasyML::Configuration.s3_bucket
      end

      # Prefix from the datasource configuration (nil when unset).
      def s3_prefix
        datasource_config.dig("s3_prefix")
      end

      # "cache_for" from the datasource configuration, defaulting to 0.
      def cache_for
        datasource_config.dig("cache_for") || 0
      end

      private

      # The datasource's configuration hash (empty hash when nil), memoized.
      def datasource_config
        @datasource_config ||= datasource.configuration || {}
      end

      # Lazily built SyncedDirectory. Credentials always come from
      # EasyML::Configuration, never from the datasource's own configuration.
      def synced_directory
        @synced_directory ||= EasyML::Data::SyncedDirectory.new(
          root_dir: datasource.root_dir,
          s3_bucket: s3_bucket,
          s3_prefix: s3_prefix,
          s3_access_key_id: EasyML::Configuration.s3_access_key_id,
          s3_secret_access_key: EasyML::Configuration.s3_secret_access_key,
          polars_args: datasource_config.dig("polars_args") || {},
          cache_for: cache_for,
        )
      end
    end
  end
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
# == Schema Information
|
2
|
+
#
|
3
|
+
# Table name: easy_ml_deploys
|
4
|
+
#
|
5
|
+
# id :bigint not null, primary key
|
6
|
+
# model_id :bigint
|
7
|
+
# model_history_id :bigint
|
8
|
+
# retraining_run_id :bigint
|
9
|
+
# model_file_id :bigint
|
10
|
+
# status :string not null
|
11
|
+
# trigger :string default("manual")
|
12
|
+
# stacktrace :text
|
13
|
+
# snapshot_id :string
|
14
|
+
# created_at :datetime not null
|
15
|
+
# updated_at :datetime not null
|
16
|
+
#
|
17
|
+
module EasyML
  # One deployment of a model (easy_ml_deploys table). A deploy belongs to a
  # model and a retraining run, and records which model file and model
  # history version it produced.
  class Deploy < ActiveRecord::Base
    self.table_name = "easy_ml_deploys"

    belongs_to :model, class_name: "EasyML::Model"
    belongs_to :model_file, class_name: "EasyML::ModelFile", optional: true
    belongs_to :retraining_run, class_name: "EasyML::RetrainingRun"
    belongs_to :model_version, class_name: "EasyML::ModelHistory", optional: true, foreign_key: :model_history_id

    after_initialize :set_defaults, if: :new_record?
    before_save :set_model_file, if: :new_record?
    # Declared once: a second presence validation on status would add the
    # same error message twice.
    validates :status, presence: true, inclusion: { in: %w[pending running success failed] }

    # Newest deploy (highest id) per model_id, using DISTINCT ON.
    scope :latest, -> { select("DISTINCT ON (model_id) *").order("model_id, id DESC") }

    # True when no pending or running deploy of the same model has a
    # locked_at value.
    # NOTE(review): locked_at is not listed in the schema annotation for
    # easy_ml_deploys — confirm the column exists.
    def unlocked?
      EasyML::Deploy.where(model_id: model_id).where.not(locked_at: nil).where(status: ["pending", "running"]).empty?
    end

    # Inverse of #unlocked?.
    def locked?
      !unlocked?
    end

    # Starts the deploy.
    #
    # @param async [Boolean] when true, enqueue EasyML::DeployJob with this
    #   record's id; when false, deploy inline via #actually_deploy
    def deploy(async: true)
      if async
        EasyML::DeployJob.perform_later(id)
      else
        actually_deploy
      end
    end

    # Performs the deploy while holding the deploy lock:
    #   1. marks the deploy "running" and records a "started" event;
    #   2. reuses the model file and version of an earlier successful deploy
    #      of the same retraining run when there is one, otherwise loads the
    #      model and deploys it;
    #   3. in one transaction, marks this deploy as success, moves the
    #      model's previously deployed retraining runs to success, and marks
    #      this deploy's retraining run as deployed;
    #   4. records a "success" event.
    #
    # @return [Object] the deployed model version
    def actually_deploy
      lock_deploy do
        update(status: "running")
        EasyML::Event.create_event(self, "started")

        # Look the earlier deploy up once; each call runs a query.
        previous = identical_deploy
        if previous.present?
          self.model_file = previous.model_file
          self.model_version = previous.model_version
        else
          model.model_file = model_file if model_file.present?
          model.load_model
          self.model_version = model.actually_deploy
        end

        EasyML::Deploy.transaction do
          update(model_history_id: model_version.id, snapshot_id: model_version.snapshot_id, status: :success)
          model.retraining_runs.where(status: :deployed).update_all(status: :success)
          retraining_run.update(model_history_id: model_version.id, snapshot_id: model_version.snapshot_id, deploy_id: id, status: :deployed, is_deploying: false)
        end

        model_version.tap do
          EasyML::Event.create_event(self, "success")
        end
      end
    end

    # A rollback is run exactly like a deploy.
    alias_method :rollback, :deploy

    # Releases the deploy lock for this deploy's model.
    def unlock!
      EasyML::Support::Lockable.unlock!(lock_key)
    end

    # Runs the block while holding the deploy lock for this deploy's model.
    def lock_deploy
      with_lock { yield }
    end

    # An earlier successful deploy of the same retraining run, if any.
    #
    # @return [EasyML::Deploy, nil]
    def identical_deploy
      EasyML::Deploy.where(retraining_run_id: retraining_run_id).
        where.not(id: id).where(status: :success).first
    end

    private

    # Takes the Lockable lock for #lock_key and yields the lock client.
    # NOTE(review): this private method replaces ActiveRecord's own
    # #with_lock (row locking) on this model — confirm nothing relies on it.
    def with_lock
      EasyML::Support::Lockable.with_lock(lock_key, stale_timeout: 60, resources: 1) do |client|
        yield client
      end
    end

    # Lock name, built from the model's name and id.
    def lock_key
      "deploy:#{self.model.name}:#{self.model.id}"
    end

    # after_initialize (new records): status defaults to pending.
    def set_defaults
      self.status ||= :pending
    end

    # before_save (new records): model_file defaults to the retraining
    # run's model file.
    def set_model_file
      self.model_file ||= retraining_run.model_file
    end
  end
end
|