easy_ml 0.1.4 → 0.2.0.pre.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +234 -26
- data/Rakefile +45 -0
- data/app/controllers/easy_ml/application_controller.rb +67 -0
- data/app/controllers/easy_ml/columns_controller.rb +38 -0
- data/app/controllers/easy_ml/datasets_controller.rb +156 -0
- data/app/controllers/easy_ml/datasources_controller.rb +88 -0
- data/app/controllers/easy_ml/deploys_controller.rb +20 -0
- data/app/controllers/easy_ml/models_controller.rb +151 -0
- data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
- data/app/controllers/easy_ml/settings_controller.rb +59 -0
- data/app/frontend/components/AlertProvider.tsx +108 -0
- data/app/frontend/components/DatasetPreview.tsx +161 -0
- data/app/frontend/components/EmptyState.tsx +28 -0
- data/app/frontend/components/ModelCard.tsx +255 -0
- data/app/frontend/components/ModelDetails.tsx +334 -0
- data/app/frontend/components/ModelForm.tsx +384 -0
- data/app/frontend/components/Navigation.tsx +300 -0
- data/app/frontend/components/Pagination.tsx +72 -0
- data/app/frontend/components/Popover.tsx +55 -0
- data/app/frontend/components/PredictionStream.tsx +105 -0
- data/app/frontend/components/ScheduleModal.tsx +726 -0
- data/app/frontend/components/SearchInput.tsx +23 -0
- data/app/frontend/components/SearchableSelect.tsx +132 -0
- data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
- data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
- data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
- data/app/frontend/components/dataset/ColumnList.tsx +101 -0
- data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
- data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
- data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
- data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
- data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
- data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
- data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
- data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
- data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
- data/app/frontend/components/dataset/splitters/constants.ts +77 -0
- data/app/frontend/components/dataset/splitters/types.ts +168 -0
- data/app/frontend/components/dataset/splitters/utils.ts +53 -0
- data/app/frontend/components/features/CodeEditor.tsx +46 -0
- data/app/frontend/components/features/DataPreview.tsx +150 -0
- data/app/frontend/components/features/FeatureCard.tsx +88 -0
- data/app/frontend/components/features/FeatureForm.tsx +235 -0
- data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
- data/app/frontend/components/settings/PluginSettings.tsx +81 -0
- data/app/frontend/components/ui/badge.tsx +44 -0
- data/app/frontend/components/ui/collapsible.tsx +9 -0
- data/app/frontend/components/ui/scroll-area.tsx +46 -0
- data/app/frontend/components/ui/separator.tsx +29 -0
- data/app/frontend/entrypoints/App.tsx +40 -0
- data/app/frontend/entrypoints/Application.tsx +24 -0
- data/app/frontend/hooks/useAutosave.ts +61 -0
- data/app/frontend/layouts/Layout.tsx +38 -0
- data/app/frontend/lib/utils.ts +6 -0
- data/app/frontend/mockData.ts +272 -0
- data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
- data/app/frontend/pages/DatasetsPage.tsx +261 -0
- data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
- data/app/frontend/pages/DatasourcesPage.tsx +261 -0
- data/app/frontend/pages/EditModelPage.tsx +45 -0
- data/app/frontend/pages/EditTransformationPage.tsx +56 -0
- data/app/frontend/pages/ModelsPage.tsx +115 -0
- data/app/frontend/pages/NewDatasetPage.tsx +366 -0
- data/app/frontend/pages/NewModelPage.tsx +45 -0
- data/app/frontend/pages/NewTransformationPage.tsx +43 -0
- data/app/frontend/pages/SettingsPage.tsx +272 -0
- data/app/frontend/pages/ShowModelPage.tsx +30 -0
- data/app/frontend/pages/TransformationsPage.tsx +95 -0
- data/app/frontend/styles/application.css +100 -0
- data/app/frontend/types/dataset.ts +146 -0
- data/app/frontend/types/datasource.ts +33 -0
- data/app/frontend/types/preprocessing.ts +1 -0
- data/app/frontend/types.ts +113 -0
- data/app/helpers/easy_ml/application_helper.rb +10 -0
- data/app/jobs/easy_ml/application_job.rb +21 -0
- data/app/jobs/easy_ml/batch_job.rb +46 -0
- data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
- data/app/jobs/easy_ml/deploy_job.rb +13 -0
- data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
- data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
- data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
- data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
- data/app/jobs/easy_ml/training_job.rb +62 -0
- data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
- data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
- data/app/models/easy_ml/cleaner.rb +82 -0
- data/app/models/easy_ml/column.rb +124 -0
- data/app/models/easy_ml/column_history.rb +30 -0
- data/app/models/easy_ml/column_list.rb +122 -0
- data/app/models/easy_ml/concerns/configurable.rb +61 -0
- data/app/models/easy_ml/concerns/versionable.rb +19 -0
- data/app/models/easy_ml/dataset.rb +767 -0
- data/app/models/easy_ml/dataset_history.rb +56 -0
- data/app/models/easy_ml/datasource.rb +182 -0
- data/app/models/easy_ml/datasource_history.rb +24 -0
- data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
- data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
- data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
- data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
- data/app/models/easy_ml/deploy.rb +114 -0
- data/app/models/easy_ml/event.rb +79 -0
- data/app/models/easy_ml/feature.rb +437 -0
- data/app/models/easy_ml/feature_history.rb +38 -0
- data/app/models/easy_ml/model.rb +575 -41
- data/app/models/easy_ml/model_file.rb +133 -0
- data/app/models/easy_ml/model_file_history.rb +24 -0
- data/app/models/easy_ml/model_history.rb +51 -0
- data/app/models/easy_ml/models/base_model.rb +58 -0
- data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
- data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
- data/app/models/easy_ml/models/xgboost.rb +544 -5
- data/app/models/easy_ml/prediction.rb +44 -0
- data/app/models/easy_ml/retraining_job.rb +278 -0
- data/app/models/easy_ml/retraining_run.rb +184 -0
- data/app/models/easy_ml/settings.rb +37 -0
- data/app/models/easy_ml/splitter.rb +90 -0
- data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
- data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
- data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
- data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
- data/app/models/easy_ml/tuner_job.rb +56 -0
- data/app/models/easy_ml/tuner_run.rb +31 -0
- data/app/models/splitter_history.rb +6 -0
- data/app/serializers/easy_ml/column_serializer.rb +27 -0
- data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
- data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
- data/app/serializers/easy_ml/feature_serializer.rb +27 -0
- data/app/serializers/easy_ml/model_serializer.rb +90 -0
- data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
- data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
- data/app/serializers/easy_ml/settings_serializer.rb +9 -0
- data/app/views/layouts/easy_ml/application.html.erb +15 -0
- data/config/initializers/resque.rb +3 -0
- data/config/resque-pool.yml +6 -0
- data/config/routes.rb +39 -0
- data/config/spring.rb +1 -0
- data/config/vite.json +15 -0
- data/lib/easy_ml/configuration.rb +64 -0
- data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
- data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
- data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
- data/lib/easy_ml/core/model_evaluator.rb +161 -89
- data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
- data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
- data/lib/easy_ml/core/tuner.rb +123 -62
- data/lib/easy_ml/core.rb +0 -3
- data/lib/easy_ml/core_ext/hash.rb +24 -0
- data/lib/easy_ml/core_ext/pathname.rb +11 -5
- data/lib/easy_ml/data/date_converter.rb +90 -0
- data/lib/easy_ml/data/filter_extensions.rb +31 -0
- data/lib/easy_ml/data/polars_column.rb +126 -0
- data/lib/easy_ml/data/polars_reader.rb +297 -0
- data/lib/easy_ml/data/preprocessor.rb +280 -142
- data/lib/easy_ml/data/simple_imputer.rb +255 -0
- data/lib/easy_ml/data/splits/file_split.rb +252 -0
- data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
- data/lib/easy_ml/data/splits/split.rb +95 -0
- data/lib/easy_ml/data/splits.rb +9 -0
- data/lib/easy_ml/data/statistics_learner.rb +93 -0
- data/lib/easy_ml/data/synced_directory.rb +341 -0
- data/lib/easy_ml/data.rb +6 -2
- data/lib/easy_ml/engine.rb +105 -6
- data/lib/easy_ml/feature_store.rb +227 -0
- data/lib/easy_ml/features.rb +61 -0
- data/lib/easy_ml/initializers/inflections.rb +17 -3
- data/lib/easy_ml/logging.rb +2 -2
- data/lib/easy_ml/predict.rb +74 -0
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
- data/lib/easy_ml/support/est.rb +5 -1
- data/lib/easy_ml/support/file_rotate.rb +79 -15
- data/lib/easy_ml/support/file_support.rb +9 -0
- data/lib/easy_ml/support/local_file.rb +24 -0
- data/lib/easy_ml/support/lockable.rb +62 -0
- data/lib/easy_ml/support/synced_file.rb +103 -0
- data/lib/easy_ml/support/utc.rb +5 -1
- data/lib/easy_ml/support.rb +6 -3
- data/lib/easy_ml/version.rb +4 -1
- data/lib/easy_ml.rb +7 -2
- metadata +355 -72
- data/app/models/easy_ml/models.rb +0 -5
- data/lib/easy_ml/core/model.rb +0 -30
- data/lib/easy_ml/core/model_core.rb +0 -181
- data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
- data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
- data/lib/easy_ml/core/models/xgboost.rb +0 -10
- data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
- data/lib/easy_ml/core/models.rb +0 -10
- data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
- data/lib/easy_ml/core/uploaders.rb +0 -7
- data/lib/easy_ml/data/dataloader.rb +0 -6
- data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
- data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
- data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
- data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
- data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
- data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
- data/lib/easy_ml/data/dataset/splits.rb +0 -11
- data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
- data/lib/easy_ml/data/dataset/splitters.rb +0 -9
- data/lib/easy_ml/data/dataset.rb +0 -430
- data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
- data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
- data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
- data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
- data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
- data/lib/easy_ml/data/datasource.rb +0 -33
- data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
- data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
- data/lib/easy_ml/deployment.rb +0 -5
- data/lib/easy_ml/support/synced_directory.rb +0 -134
- data/lib/easy_ml/transforms.rb +0 -29
- /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -0,0 +1,227 @@
|
|
1
|
+
module EasyML
  # File-backed storage for the computed values of one feature.
  #
  # Data is written as Parquet files under
  # <Rails.root>/easy_ml/datasets/<dataset name>/features/<feature name>/<version>/,
  # either as a single "feature.parquet" file or as one
  # "feature<partition_start>.parquet" file per range of integer primary keys.
  class FeatureStore
    attr_reader :feature

    # @param feature [Object] the feature whose values are stored; this class
    #   reads its primary_key, batch_size, version, name, id and dataset.
    def initialize(feature)
      @feature = feature
    end

    # Writes a data frame to disk.
    #
    # The frame is split into partition files when the feature's first primary
    # key is a column of +df+ and the column's min/max are Integers (or Strings
    # that parse as Integers). In every other case the whole frame is written
    # to the single unpartitioned file.
    #
    # @param df [Polars::DataFrame] rows to persist
    def store(df)
      primary_key = feature.primary_key&.first
      # Check for a missing key first so we never ask the frame about nil.
      return store_without_partitioning(df) unless primary_key && df.columns.include?(primary_key)

      min_key = df[primary_key].min
      max_key = df[primary_key].max
      batch_size = feature.batch_size || 10_000

      # Try to parse as integers if they're strings
      begin
        min_key = Integer(min_key) if min_key.is_a?(String)
        max_key = Integer(max_key) if max_key.is_a?(String)
      rescue ArgumentError
        return store_without_partitioning(df)
      end

      # Only partition if we have integer keys where we can predict boundaries
      return store_without_partitioning(df) unless min_key.is_a?(Integer) && max_key.is_a?(Integer)

      compute_partition_boundaries(min_key, max_key, batch_size).each do |partition_start|
        partition_end = partition_start + batch_size - 1
        partition_df = df.filter(
          (Polars.col(primary_key) >= partition_start) &
          (Polars.col(primary_key) <= partition_end)
        )

        next if partition_df.height == 0

        store_partition(partition_df, primary_key, partition_start)
      end
    end

    # Reads every stored file for the current version, optionally filtered.
    #
    # @param filter [Object, nil] passed through to EasyML::Data::PolarsReader#query
    # @return [Polars::DataFrame] empty frame when nothing is stored
    def query(filter: nil)
      query_all_partitions(filter)
    end

    # @return [Boolean] true when no feature*.parquet file exists for this version
    def empty?
      list_partitions.empty?
    end

    # @return [Array<String>] sorted paths of all feature*.parquet files
    #   (this includes the unpartitioned feature.parquet)
    def list_partitions
      Dir.glob(File.join(feature_dir, "feature*.parquet")).sort
    end

    # Deletes the directory of the current version and everything in it.
    def wipe
      FileUtils.rm_rf(feature_dir)
    end

    # Uploads the local files through the synced directory. A no-op when the
    # feature's dataset has no datasource (mirrors #download).
    def upload_remote_files
      synced_directory&.upload
    end

    # Downloads remote files through the synced directory, if one is configured.
    def download
      synced_directory&.download
    end

    # Copies every file stored for +old_version+ into the directory of
    # +new_version+, preserving the layout below the version directory.
    #
    # Target paths are derived from each file's path relative to the old
    # version directory. Substituting the version string inside the full path
    # would also rewrite unrelated parts (e.g. version 1 -> 2 would turn
    # "feature1000.parquet" into "feature2000.parquet").
    #
    # @param old_version [#to_s]
    # @param new_version [#to_s]
    def cp(old_version, new_version)
      old_dir = feature_dir_for_version(old_version)
      new_dir = feature_dir_for_version(new_version)

      return if old_dir.nil? || !Dir.exist?(old_dir)

      FileUtils.mkdir_p(new_dir)
      old_root = Pathname.new(old_dir)

      Dir.glob(old_root.join("**/*").to_s).each do |file|
        next unless File.file?(file)

        relative_path = Pathname.new(file).relative_path_from(old_root).to_s
        target_file = File.join(new_dir, relative_path)
        FileUtils.mkdir_p(File.dirname(target_file))
        FileUtils.cp(file, target_file)
      end
    end

    private

    # Writes the whole frame to the single feature.parquet file while holding
    # the file lock.
    def store_without_partitioning(df)
      lock_file do
        path = feature_path
        FileUtils.mkdir_p(File.dirname(path))
        df.write_parquet(path)
      end
    end

    # Writes one partition while holding that partition's lock. When the
    # partition file already exists, rows whose primary key is not present in
    # +partition_df+ are kept and the incoming rows are appended after them.
    def store_partition(partition_df, primary_key, partition_start)
      lock_partition(partition_start) do
        path = partition_path(partition_start)
        FileUtils.mkdir_p(File.dirname(path))

        if File.exist?(path)
          reader = EasyML::Data::PolarsReader.new
          existing_df = reader.query([path])
          preserved_records = existing_df.filter(
            Polars.col(primary_key).is_in(partition_df[primary_key]).is_not
          )
          partition_df = Polars.concat([preserved_records, partition_df], how: "vertical")
        end

        partition_df.write_parquet(path)
      end
    end

    # Reads only the partition files that can contain the primary key values
    # named by +filter+.
    # NOTE(review): relies on filter#extract_primary_key_values, defined elsewhere.
    def query_partitions(filter)
      primary_key_values = filter.extract_primary_key_values
      batch_size = feature.batch_size || 10_000

      partition_files = primary_key_values.map do |key|
        partition_start = (key / batch_size.to_f).floor * batch_size
        partition_path(partition_start)
      end.uniq.select { |path| File.exist?(path) }

      return Polars::DataFrame.new if partition_files.empty?

      reader = EasyML::Data::PolarsReader.new
      reader.query(partition_files, filter: filter)
    end

    # Reads every feature*.parquet file of the current version.
    def query_all_partitions(filter)
      reader = EasyML::Data::PolarsReader.new
      pattern = File.join(feature_dir, "feature*.parquet")
      files = Dir.glob(pattern)

      return Polars::DataFrame.new if files.empty?

      reader.query(files, filter: filter)
    end

    # @return [Array<Integer>] start key of every partition between the
    #   partition holding +min_key+ and the one holding +max_key+, inclusive
    def compute_partition_boundaries(min_key, max_key, batch_size)
      start_partition = (min_key / batch_size.to_f).floor * batch_size
      end_partition = (max_key / batch_size.to_f).floor * batch_size
      (start_partition..end_partition).step(batch_size).to_a
    end

    # @return [String] directory holding the files of the given version
    def feature_dir_for_version(version)
      File.join(
        Rails.root,
        "easy_ml/datasets",
        feature.dataset.name.parameterize.gsub("-", "_"),
        "features",
        feature.name.parameterize.gsub("-", "_"),
        version.to_s
      )
    end

    def feature_dir
      feature_dir_for_version(feature.version)
    end

    def feature_path
      File.join(feature_dir, "feature.parquet")
    end

    def partition_path(partition_start)
      File.join(feature_dir, "feature#{partition_start}.parquet")
    end

    # Remote prefix: "datasets" plus whatever follows the last occurrence of
    # "datasets" in the local directory path.
    def s3_prefix
      File.join("datasets", feature_dir.split("datasets").last)
    end

    # Memoized synced directory for the current version, or nil when the
    # feature's dataset has no datasource.
    def synced_directory
      return unless feature.dataset&.datasource.present?

      datasource_config = feature.dataset.datasource.configuration || {}
      @synced_dir ||= EasyML::Data::SyncedDirectory.new(
        root_dir: feature_dir,
        s3_bucket: datasource_config.dig("s3_bucket") || EasyML::Configuration.s3_bucket,
        s3_prefix: s3_prefix,
        s3_access_key_id: EasyML::Configuration.s3_access_key_id,
        s3_secret_access_key: EasyML::Configuration.s3_secret_access_key,
        polars_args: datasource_config.dig("polars_args"),
        cache_for: 0,
      )
    end

    # Runs the block under the lock of one partition and always releases that
    # lock afterwards.
    def lock_partition(partition_start)
      Support::Lockable.with_lock(partition_lock_key(partition_start), wait_timeout: 2, stale_timeout: 60) do |client|
        begin
          yield client if block_given?
        ensure
          unlock_partition(partition_start)
        end
      end
    end

    # Runs the block under the lock of the unpartitioned file and always
    # releases that lock afterwards.
    def lock_file
      Support::Lockable.with_lock(file_lock_key, wait_timeout: 2, stale_timeout: 60) do |client|
        begin
          yield client if block_given?
        ensure
          unlock_file
        end
      end
    end

    def unlock_partition(partition_start)
      Support::Lockable.unlock!(partition_lock_key(partition_start))
    end

    def unlock_file
      Support::Lockable.unlock!(file_lock_key)
    end

    # Releases the lock of every partition file found on disk. Files without a
    # numeric partition start in their name (the unpartitioned feature.parquet)
    # are skipped; partition starts may be negative.
    def unlock_all_partitions
      list_partitions.each do |path|
        match = File.basename(path).match(/\Afeature(-?\d+)\.parquet\z/)
        next unless match

        unlock_partition(match[1].to_i)
      end
    end

    def partition_lock_key(partition_start)
      "feature_store:#{feature.id}.partition.#{partition_start}"
    end

    def file_lock_key
      "feature_store:#{feature.id}.file"
    end
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# Mixin for feature classes. Including it extends the class with the
# `feature` declaration DSL and registers the class in the Registry.
module EasyML::Features
  # Must be overridden by the including class.
  def transform(df, feature)
    raise NotImplementedError
  end

  def self.included(base)
    base.extend(ClassMethods)
    Registry.register(base)
  end

  module ClassMethods
    # Feature declarations collected on this class.
    def features
      @features ||= []
    end

    # Records one declaration; the declaring class name is stored under
    # :feature_class alongside the given options.
    def feature(**kwargs)
      kwargs.merge!(feature_class: self.to_s)
      features << kwargs
    end
  end

  # True when list1 holds at least one element absent from list2.
  def missing_any?(list1, list2)
    (list1 - list2).any?
  end

  # Class-level registry of feature classes, grouped by namespace
  # (nil when none is given).
  class Registry
    class << self
      def register(feature_class, namespace: nil)
        key = namespace&.to_sym
        registry[key] ||= {}
        registry[key][feature_class] = feature_class
      end

      # Loads the app's feature files, then returns either the whole registry
      # or only the entries of one namespace.
      def list(namespace: nil)
        require_files
        return registry unless namespace

        registry[namespace.to_sym]
      end

      def require_files
        feature_files = Dir.glob(Rails.root.join("app/features/**/*.rb"))
        feature_files.each { |path| require_dependency path }
      end

      # Every feature declaration of every registered class, in one array.
      def list_flat
        per_namespace = list.try(:values) || []
        registered_classes = per_namespace.flat_map(&:values)
        registered_classes.flat_map(&:features)
      end

      # First declaration whose :name or :feature_class equals +name+.
      def find(name)
        list_flat.find do |declaration|
          declaration[:name] == name || declaration[:feature_class] == name
        end
      end

      def clear
        @registry = {}
      end

      private

      def registry
        @registry ||= {}
      end
    end
  end
end
|
@@ -1,4 +1,18 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
1
|
+
module EasyML
  module Initializers
    module Inflections
      # Registers the acronyms used by this gem with the English inflector.
      def self.inflect
        ActiveSupport::Inflector.inflections(:en) do |inflections|
          %w[EasyML ML STI XGBoost GBLinear GBTree EST UTC].each do |acronym|
            inflections.acronym acronym
          end
        end
      end
    end
  end
end
|
data/lib/easy_ml/logging.rb
CHANGED
@@ -7,9 +7,9 @@ module EasyML
|
|
7
7
|
module ClassMethods
|
8
8
|
# Redefines +method_name+ so that each call first logs +message+ through
# log_message and then runs the original implementation, forwarding
# positional arguments, keyword arguments and the block unchanged.
def log_method(method_name, message, verbose: false)
  unwrapped = instance_method(method_name)

  define_method(method_name) do |*args, **kwargs, &block|
    log_message(message, verbose: verbose)
    unwrapped.bind(self).call(*args, **kwargs, &block)
  end
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
require "singleton"
|
2
|
+
|
3
|
+
module EasyML
  # Process-wide entry point for inference. Keeps the current inference
  # version of each model, keyed by model name.
  class Predict
    include Singleton

    attr_reader :models

    def initialize
      @models = {}
    end

    # Normalizes the input, runs the model and records an EasyML::Prediction
    # built from the first row.
    #
    # The model version is resolved once and reused for normalization,
    # prediction and the stored record, so the recorded model_history is always
    # the version that produced the prediction.
    #
    # @param model_name [String] name of the EasyML::Model
    # @param df [Hash, Polars::DataFrame] input rows; a Hash is converted to a frame
    # @return the value returned by the model version's #predict
    # @raise [ActiveRecord::RecordNotFound] when no model has that name
    def self.predict(model_name, df)
      df = Polars::DataFrame.new(df) if df.is_a?(Hash)
      raw_input = df.to_hashes&.first

      current_version = instance.get_model(model_name)
      df = current_version.dataset.normalize(df, inference: true)
      preds = current_version.predict(df)

      EasyML::Prediction.create!(
        model: current_version.model,
        model_history: current_version,
        prediction_type: current_version.model.task,
        prediction_value: {
          value: preds.first,
        }.compact,
        raw_input: raw_input,
        normalized_input: df.to_hashes&.first,
      )

      preds
    end

    # NOTE(review): delegates to an instance-level #train, which this class
    # does not define in the visible source — confirm where it comes from.
    def self.train(model_name, tuner: nil, evaluator: nil)
      instance.train(model_name, tuner: tuner, evaluator: evaluator)
    end

    # Runs the current inference version of the named model on +df+.
    def predict(model_name, df)
      get_model(model_name).predict(df)
    end

    # Normalizes +df+ with the dataset of the current inference version.
    def normalize(model_name, df)
      get_model(model_name).dataset.normalize(df, inference: true)
    end

    # Refreshes the cached entry if needed and returns it.
    def get_model(model_name)
      load_model(model_name)
      models[model_name]
    end

    # Drops every cached model version.
    def reset
      @models = {}
    end

    def self.reset
      instance.reset
    end

    private

    # Looks up the model's inference version and caches it when nothing is
    # cached yet or the cached entry has a different id.
    def load_model(model_name)
      current_model = EasyML::Model.find_by!(name: model_name).inference_version

      # Load new model if not loaded or different version
      model_not_loaded = models[model_name].nil?
      model_is_new_version = models[model_name]&.id != current_model&.id
      return unless model_not_loaded || model_is_new_version

      models[model_name] = current_model
    end
  end
end
|
@@ -2,47 +2,203 @@ require "rails/generators"
|
|
2
2
|
require "rails/generators/active_record/migration"
|
3
3
|
|
4
4
|
module EasyML
|
5
|
-
module
|
6
|
-
module
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
5
|
+
module Railtie
|
6
|
+
module Generators
|
7
|
+
module Migration
|
8
|
+
class MigrationGenerator < Rails::Generators::Base
  include Rails::Generators::Migration
  namespace "easy_ml:migration"

  # Templates live in railtie/templates/migration.
  source_root File.expand_path("../../templates/migration", __dir__)

  desc "Generates migrations for EasyMLModel"

  # Number used for the next generated migration file: a UTC timestamp, or a
  # sequential three-digit number when timestamped migrations are disabled
  # (ActiveRecord 7+ only).
  def self.next_migration_number(dirname)
    # presumably keeps second-resolution timestamps of consecutive migrations distinct — confirm
    sleep(1)

    if ActiveRecord.version < Gem::Version.new("7") || ActiveRecord.timestamped_migrations
      Time.now.utc.strftime("%Y%m%d%H%M%S")
    else
      format("%.3d", (current_migration_number(dirname) + 1))
    end
  end

  # Generates every migration, base tables first and history tables after.
  def create_migration_files
    %i[
      create_easy_ml_datasource_migration
      create_easy_ml_datasets_migration
      create_easy_ml_columns_migration
      create_easy_ml_models_migration
      create_easy_ml_model_files_migration
      create_easy_ml_tuner_jobs_migration
      create_easy_ml_retraining_jobs_migration
      create_easy_ml_settings_migration
      create_easy_ml_events_migration
      create_easy_ml_features_migration
      create_easy_ml_splitters_migration
      create_easy_ml_splitter_histories_migration
      create_easy_ml_deploys
      create_easy_ml_datasource_histories_migration
      create_easy_ml_dataset_histories_migration
      create_easy_ml_column_histories_migration
      create_easy_ml_model_histories_migration
      create_easy_ml_model_file_histories_migration
      create_easy_ml_feature_histories_migration
      create_easy_ml_predictions_migration
    ].each { |step| send(step) }
  end

  private

  # Each method below renders one template into db/migrate.

  def create_easy_ml_models_migration
    migration_template("create_easy_ml_models.rb.tt", "db/migrate/create_easy_ml_models.rb")
  end

  def create_easy_ml_model_files_migration
    migration_template("create_easy_ml_model_files.rb.tt", "db/migrate/create_easy_ml_model_files.rb")
  end

  def create_easy_ml_datasource_migration
    migration_template("create_easy_ml_datasources.rb.tt", "db/migrate/create_easy_ml_datasources.rb")
  end

  def create_easy_ml_datasets_migration
    migration_template("create_easy_ml_datasets.rb.tt", "db/migrate/create_easy_ml_datasets.rb")
  end

  def create_easy_ml_tuner_jobs_migration
    migration_template("create_easy_ml_tuner_jobs.rb.tt", "db/migrate/create_easy_ml_tuner_jobs.rb")
  end

  def create_easy_ml_retraining_jobs_migration
    migration_template("create_easy_ml_retraining_jobs.rb.tt", "db/migrate/create_easy_ml_retraining_jobs.rb")
  end

  def create_easy_ml_settings_migration
    migration_template("create_easy_ml_settings.rb.tt", "db/migrate/create_easy_ml_settings.rb")
  end

  def create_easy_ml_events_migration
    migration_template("create_easy_ml_events.rb.tt", "db/migrate/create_easy_ml_events.rb")
  end

  def create_easy_ml_columns_migration
    migration_template("create_easy_ml_columns.rb.tt", "db/migrate/create_easy_ml_columns.rb")
  end

  def create_easy_ml_features_migration
    migration_template("create_easy_ml_features.rb.tt", "db/migrate/create_easy_ml_features.rb")
  end

  def create_easy_ml_splitters_migration
    migration_template("create_easy_ml_splitters.rb.tt", "db/migrate/create_easy_ml_splitters.rb")
  end

  def create_easy_ml_splitter_histories_migration
    migration_template("create_easy_ml_splitter_histories.rb.tt", "db/migrate/create_easy_ml_splitter_histories.rb")
  end

  def create_easy_ml_datasource_histories_migration
    migration_template("create_easy_ml_datasource_histories.rb.tt", "db/migrate/create_easy_ml_datasource_histories.rb")
  end

  def create_easy_ml_dataset_histories_migration
    migration_template("create_easy_ml_dataset_histories.rb.tt", "db/migrate/create_easy_ml_dataset_histories.rb")
  end

  def create_easy_ml_column_histories_migration
    migration_template("create_easy_ml_column_histories.rb.tt", "db/migrate/create_easy_ml_column_histories.rb")
  end

  def create_easy_ml_model_histories_migration
    migration_template("create_easy_ml_model_histories.rb.tt", "db/migrate/create_easy_ml_model_histories.rb")
  end

  def create_easy_ml_feature_histories_migration
    migration_template("create_easy_ml_feature_histories.rb.tt", "db/migrate/create_easy_ml_feature_histories.rb")
  end

  def create_easy_ml_model_file_histories_migration
    migration_template("create_easy_ml_model_file_histories.rb.tt", "db/migrate/create_easy_ml_model_file_histories.rb")
  end

  def create_easy_ml_deploys
    migration_template("create_easy_ml_deploys.rb.tt", "db/migrate/create_easy_ml_deploys.rb")
  end

  def create_easy_ml_predictions_migration
    migration_template("create_easy_ml_predictions.rb.tt", "db/migrate/create_easy_ml_predictions.rb")
  end

  # Instance-level shortcut to the class-level numbering for db/migrate.
  def next_migration_number
    self.class.next_migration_number(Rails.root.join("db/migrate"))
  end
end
|
47
203
|
end
|
48
204
|
end
|
@@ -0,0 +1,9 @@
|
|
1
|
+
require "historiographer/postgres_migration"
|
2
|
+
|
3
|
+
# Migration template: creates the easy_ml_column_histories table.
# The ERB tag is replaced with the host app's current migration version
# when the generator renders this template.
class CreateEasyMLColumnHistories < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
  def change
    create_table :easy_ml_column_histories do |t|
      # `histories` is not a stock ActiveRecord helper; presumably it comes from
      # historiographer/postgres_migration and builds the history columns keyed
      # by column_id — TODO confirm against the historiographer docs.
      t.histories(foreign_key: :column_id)
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# Migration template: creates the easy_ml_columns table, one row per column of
# a dataset. The ERB tag is replaced with the host app's current migration
# version when the generator renders this template.
class CreateEasyMLColumns < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
  def change
    create_table :easy_ml_columns do |t|
      # Owning dataset; stored as a plain bigint (no foreign-key constraint is declared here).
      t.bigint :dataset_id, null: false
      t.string :name, null: false
      t.string :description
      t.string :datatype # The symbol representation (e.g., 'float', 'integer')
      t.string :polars_datatype # The full Polars class name (e.g., 'Polars::Float64')
      t.boolean :is_target
      t.boolean :hidden, default: false
      t.boolean :drop_if_null, default: false
      t.json :preprocessing_steps
      t.json :sample_values # Store up to 3 sample values
      t.json :statistics

      t.timestamps

      # A column name may appear only once per dataset.
      t.index [:dataset_id, :name], unique: true
      t.index :datatype
      t.index :hidden
      t.index :drop_if_null
      t.index :is_target
    end
  end
end
|
@@ -0,0 +1,9 @@
|
|
1
|
+
require "historiographer/postgres_migration"
|
2
|
+
|
3
|
+
# Migration template: creates the easy_ml_dataset_histories table.
# The ERB tag is replaced with the host app's current migration version
# when the generator renders this template.
class CreateEasyMLDatasetHistories < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
  def change
    create_table :easy_ml_dataset_histories do |t|
      # `histories` is not a stock ActiveRecord helper; presumably it comes from
      # historiographer/postgres_migration and builds the history columns keyed
      # by dataset_id — TODO confirm against the historiographer docs.
      t.histories(foreign_key: :dataset_id)
    end
  end
end
|