easy_ml 0.1.4 → 0.2.0.pre.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +234 -26
- data/Rakefile +45 -0
- data/app/controllers/easy_ml/application_controller.rb +67 -0
- data/app/controllers/easy_ml/columns_controller.rb +38 -0
- data/app/controllers/easy_ml/datasets_controller.rb +156 -0
- data/app/controllers/easy_ml/datasources_controller.rb +88 -0
- data/app/controllers/easy_ml/deploys_controller.rb +20 -0
- data/app/controllers/easy_ml/models_controller.rb +151 -0
- data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
- data/app/controllers/easy_ml/settings_controller.rb +59 -0
- data/app/frontend/components/AlertProvider.tsx +108 -0
- data/app/frontend/components/DatasetPreview.tsx +161 -0
- data/app/frontend/components/EmptyState.tsx +28 -0
- data/app/frontend/components/ModelCard.tsx +255 -0
- data/app/frontend/components/ModelDetails.tsx +334 -0
- data/app/frontend/components/ModelForm.tsx +384 -0
- data/app/frontend/components/Navigation.tsx +300 -0
- data/app/frontend/components/Pagination.tsx +72 -0
- data/app/frontend/components/Popover.tsx +55 -0
- data/app/frontend/components/PredictionStream.tsx +105 -0
- data/app/frontend/components/ScheduleModal.tsx +726 -0
- data/app/frontend/components/SearchInput.tsx +23 -0
- data/app/frontend/components/SearchableSelect.tsx +132 -0
- data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
- data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
- data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
- data/app/frontend/components/dataset/ColumnList.tsx +101 -0
- data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
- data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
- data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
- data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
- data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
- data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
- data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
- data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
- data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
- data/app/frontend/components/dataset/splitters/constants.ts +77 -0
- data/app/frontend/components/dataset/splitters/types.ts +168 -0
- data/app/frontend/components/dataset/splitters/utils.ts +53 -0
- data/app/frontend/components/features/CodeEditor.tsx +46 -0
- data/app/frontend/components/features/DataPreview.tsx +150 -0
- data/app/frontend/components/features/FeatureCard.tsx +88 -0
- data/app/frontend/components/features/FeatureForm.tsx +235 -0
- data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
- data/app/frontend/components/settings/PluginSettings.tsx +81 -0
- data/app/frontend/components/ui/badge.tsx +44 -0
- data/app/frontend/components/ui/collapsible.tsx +9 -0
- data/app/frontend/components/ui/scroll-area.tsx +46 -0
- data/app/frontend/components/ui/separator.tsx +29 -0
- data/app/frontend/entrypoints/App.tsx +40 -0
- data/app/frontend/entrypoints/Application.tsx +24 -0
- data/app/frontend/hooks/useAutosave.ts +61 -0
- data/app/frontend/layouts/Layout.tsx +38 -0
- data/app/frontend/lib/utils.ts +6 -0
- data/app/frontend/mockData.ts +272 -0
- data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
- data/app/frontend/pages/DatasetsPage.tsx +261 -0
- data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
- data/app/frontend/pages/DatasourcesPage.tsx +261 -0
- data/app/frontend/pages/EditModelPage.tsx +45 -0
- data/app/frontend/pages/EditTransformationPage.tsx +56 -0
- data/app/frontend/pages/ModelsPage.tsx +115 -0
- data/app/frontend/pages/NewDatasetPage.tsx +366 -0
- data/app/frontend/pages/NewModelPage.tsx +45 -0
- data/app/frontend/pages/NewTransformationPage.tsx +43 -0
- data/app/frontend/pages/SettingsPage.tsx +272 -0
- data/app/frontend/pages/ShowModelPage.tsx +30 -0
- data/app/frontend/pages/TransformationsPage.tsx +95 -0
- data/app/frontend/styles/application.css +100 -0
- data/app/frontend/types/dataset.ts +146 -0
- data/app/frontend/types/datasource.ts +33 -0
- data/app/frontend/types/preprocessing.ts +1 -0
- data/app/frontend/types.ts +113 -0
- data/app/helpers/easy_ml/application_helper.rb +10 -0
- data/app/jobs/easy_ml/application_job.rb +21 -0
- data/app/jobs/easy_ml/batch_job.rb +46 -0
- data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
- data/app/jobs/easy_ml/deploy_job.rb +13 -0
- data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
- data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
- data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
- data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
- data/app/jobs/easy_ml/training_job.rb +62 -0
- data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
- data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
- data/app/models/easy_ml/cleaner.rb +82 -0
- data/app/models/easy_ml/column.rb +124 -0
- data/app/models/easy_ml/column_history.rb +30 -0
- data/app/models/easy_ml/column_list.rb +122 -0
- data/app/models/easy_ml/concerns/configurable.rb +61 -0
- data/app/models/easy_ml/concerns/versionable.rb +19 -0
- data/app/models/easy_ml/dataset.rb +767 -0
- data/app/models/easy_ml/dataset_history.rb +56 -0
- data/app/models/easy_ml/datasource.rb +182 -0
- data/app/models/easy_ml/datasource_history.rb +24 -0
- data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
- data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
- data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
- data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
- data/app/models/easy_ml/deploy.rb +114 -0
- data/app/models/easy_ml/event.rb +79 -0
- data/app/models/easy_ml/feature.rb +437 -0
- data/app/models/easy_ml/feature_history.rb +38 -0
- data/app/models/easy_ml/model.rb +575 -41
- data/app/models/easy_ml/model_file.rb +133 -0
- data/app/models/easy_ml/model_file_history.rb +24 -0
- data/app/models/easy_ml/model_history.rb +51 -0
- data/app/models/easy_ml/models/base_model.rb +58 -0
- data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
- data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
- data/app/models/easy_ml/models/xgboost.rb +544 -5
- data/app/models/easy_ml/prediction.rb +44 -0
- data/app/models/easy_ml/retraining_job.rb +278 -0
- data/app/models/easy_ml/retraining_run.rb +184 -0
- data/app/models/easy_ml/settings.rb +37 -0
- data/app/models/easy_ml/splitter.rb +90 -0
- data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
- data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
- data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
- data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
- data/app/models/easy_ml/tuner_job.rb +56 -0
- data/app/models/easy_ml/tuner_run.rb +31 -0
- data/app/models/splitter_history.rb +6 -0
- data/app/serializers/easy_ml/column_serializer.rb +27 -0
- data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
- data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
- data/app/serializers/easy_ml/feature_serializer.rb +27 -0
- data/app/serializers/easy_ml/model_serializer.rb +90 -0
- data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
- data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
- data/app/serializers/easy_ml/settings_serializer.rb +9 -0
- data/app/views/layouts/easy_ml/application.html.erb +15 -0
- data/config/initializers/resque.rb +3 -0
- data/config/resque-pool.yml +6 -0
- data/config/routes.rb +39 -0
- data/config/spring.rb +1 -0
- data/config/vite.json +15 -0
- data/lib/easy_ml/configuration.rb +64 -0
- data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
- data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
- data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
- data/lib/easy_ml/core/model_evaluator.rb +161 -89
- data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
- data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
- data/lib/easy_ml/core/tuner.rb +123 -62
- data/lib/easy_ml/core.rb +0 -3
- data/lib/easy_ml/core_ext/hash.rb +24 -0
- data/lib/easy_ml/core_ext/pathname.rb +11 -5
- data/lib/easy_ml/data/date_converter.rb +90 -0
- data/lib/easy_ml/data/filter_extensions.rb +31 -0
- data/lib/easy_ml/data/polars_column.rb +126 -0
- data/lib/easy_ml/data/polars_reader.rb +297 -0
- data/lib/easy_ml/data/preprocessor.rb +280 -142
- data/lib/easy_ml/data/simple_imputer.rb +255 -0
- data/lib/easy_ml/data/splits/file_split.rb +252 -0
- data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
- data/lib/easy_ml/data/splits/split.rb +95 -0
- data/lib/easy_ml/data/splits.rb +9 -0
- data/lib/easy_ml/data/statistics_learner.rb +93 -0
- data/lib/easy_ml/data/synced_directory.rb +341 -0
- data/lib/easy_ml/data.rb +6 -2
- data/lib/easy_ml/engine.rb +105 -6
- data/lib/easy_ml/feature_store.rb +227 -0
- data/lib/easy_ml/features.rb +61 -0
- data/lib/easy_ml/initializers/inflections.rb +17 -3
- data/lib/easy_ml/logging.rb +2 -2
- data/lib/easy_ml/predict.rb +74 -0
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
- data/lib/easy_ml/support/est.rb +5 -1
- data/lib/easy_ml/support/file_rotate.rb +79 -15
- data/lib/easy_ml/support/file_support.rb +9 -0
- data/lib/easy_ml/support/local_file.rb +24 -0
- data/lib/easy_ml/support/lockable.rb +62 -0
- data/lib/easy_ml/support/synced_file.rb +103 -0
- data/lib/easy_ml/support/utc.rb +5 -1
- data/lib/easy_ml/support.rb +6 -3
- data/lib/easy_ml/version.rb +4 -1
- data/lib/easy_ml.rb +7 -2
- metadata +355 -72
- data/app/models/easy_ml/models.rb +0 -5
- data/lib/easy_ml/core/model.rb +0 -30
- data/lib/easy_ml/core/model_core.rb +0 -181
- data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
- data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
- data/lib/easy_ml/core/models/xgboost.rb +0 -10
- data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
- data/lib/easy_ml/core/models.rb +0 -10
- data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
- data/lib/easy_ml/core/uploaders.rb +0 -7
- data/lib/easy_ml/data/dataloader.rb +0 -6
- data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
- data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
- data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
- data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
- data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
- data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
- data/lib/easy_ml/data/dataset/splits.rb +0 -11
- data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
- data/lib/easy_ml/data/dataset/splitters.rb +0 -9
- data/lib/easy_ml/data/dataset.rb +0 -430
- data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
- data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
- data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
- data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
- data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
- data/lib/easy_ml/data/datasource.rb +0 -33
- data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
- data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
- data/lib/easy_ml/deployment.rb +0 -5
- data/lib/easy_ml/support/synced_directory.rb +0 -134
- data/lib/easy_ml/transforms.rb +0 -29
- /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -1,49 +0,0 @@
|
|
1
|
-
module EasyML
|
2
|
-
module Data
|
3
|
-
class Dataset
|
4
|
-
module Splits
|
5
|
-
class InMemorySplit < Split
|
6
|
-
include GlueGun::DSL
|
7
|
-
|
8
|
-
attribute :sample, :float, default: 1.0
|
9
|
-
def initialize(options)
|
10
|
-
super
|
11
|
-
@data = {}
|
12
|
-
end
|
13
|
-
|
14
|
-
def save(segment, df)
|
15
|
-
@data[segment] = df
|
16
|
-
end
|
17
|
-
|
18
|
-
def read(segment, split_ys: false, target: nil, drop_cols: [], &block)
|
19
|
-
df = @data[segment]
|
20
|
-
return nil if df.nil?
|
21
|
-
|
22
|
-
df = sample_data(df) if sample < 1.0
|
23
|
-
drop_cols &= df.columns
|
24
|
-
df = df.drop(drop_cols) unless drop_cols.empty?
|
25
|
-
|
26
|
-
if block_given?
|
27
|
-
if split_ys
|
28
|
-
xs, ys = split_features_targets(df, true, target)
|
29
|
-
process_block_with_split_ys(block, nil, xs, ys)
|
30
|
-
else
|
31
|
-
process_block_without_split_ys(block, nil, df)
|
32
|
-
end
|
33
|
-
else
|
34
|
-
split_features_targets(df, split_ys, target)
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
def cleanup
|
39
|
-
@data.clear
|
40
|
-
end
|
41
|
-
|
42
|
-
def split_at
|
43
|
-
@data.keys.empty? ? nil : Time.now
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
47
|
-
end
|
48
|
-
end
|
49
|
-
end
|
@@ -1,98 +0,0 @@
|
|
1
|
-
module EasyML
|
2
|
-
module Data
|
3
|
-
class Dataset
|
4
|
-
module Splits
|
5
|
-
class Split
|
6
|
-
include GlueGun::DSL
|
7
|
-
include EasyML::Data::Utils
|
8
|
-
|
9
|
-
attribute :polars_args, :hash, default: {}
|
10
|
-
attribute :max_rows_per_file, :integer, default: 1_000_000
|
11
|
-
attribute :batch_size, :integer, default: 10_000
|
12
|
-
attribute :sample, :float, default: 1.0
|
13
|
-
attribute :verbose, :boolean, default: false
|
14
|
-
|
15
|
-
def save(segment, df)
|
16
|
-
raise NotImplementedError, "Subclasses must implement #save"
|
17
|
-
end
|
18
|
-
|
19
|
-
def read(segment, split_ys: false, target: nil, drop_cols: [], &block)
|
20
|
-
raise NotImplementedError, "Subclasses must implement #read"
|
21
|
-
end
|
22
|
-
|
23
|
-
def train(&block)
|
24
|
-
read(:train, &block)
|
25
|
-
end
|
26
|
-
|
27
|
-
def test(&block)
|
28
|
-
read(:test, &block)
|
29
|
-
end
|
30
|
-
|
31
|
-
def valid(&block)
|
32
|
-
read(:valid, &block)
|
33
|
-
end
|
34
|
-
|
35
|
-
def cleanup
|
36
|
-
raise NotImplementedError, "Subclasses must implement #cleanup"
|
37
|
-
end
|
38
|
-
|
39
|
-
def split_at
|
40
|
-
raise NotImplementedError, "Subclasses must implement #split_at"
|
41
|
-
end
|
42
|
-
|
43
|
-
protected
|
44
|
-
|
45
|
-
def split_features_targets(df, split_ys, target)
|
46
|
-
raise ArgumentError, "Target column must be specified when split_ys is true" if split_ys && target.nil?
|
47
|
-
|
48
|
-
if split_ys
|
49
|
-
xs = df.drop(target)
|
50
|
-
ys = df.select(target)
|
51
|
-
[xs, ys]
|
52
|
-
else
|
53
|
-
df
|
54
|
-
end
|
55
|
-
end
|
56
|
-
|
57
|
-
def sample_data(df)
|
58
|
-
return df if sample >= 1.0
|
59
|
-
|
60
|
-
df.sample(n: (df.shape[0] * sample).ceil, seed: 42)
|
61
|
-
end
|
62
|
-
|
63
|
-
def create_progress_bar(segment, total_rows)
|
64
|
-
ProgressBar.create(
|
65
|
-
title: "Reading #{segment}",
|
66
|
-
total: total_rows,
|
67
|
-
format: "%t: |%B| %p%% %e"
|
68
|
-
)
|
69
|
-
end
|
70
|
-
|
71
|
-
def process_block_with_split_ys(block, result, xs, ys)
|
72
|
-
case block.arity
|
73
|
-
when 3
|
74
|
-
result.nil? ? [xs, ys] : block.call(result, xs, ys)
|
75
|
-
when 2
|
76
|
-
block.call(xs, ys)
|
77
|
-
result
|
78
|
-
else
|
79
|
-
raise ArgumentError, "Block must accept 2 or 3 arguments when split_ys is true"
|
80
|
-
end
|
81
|
-
end
|
82
|
-
|
83
|
-
def process_block_without_split_ys(block, result, df)
|
84
|
-
case block.arity
|
85
|
-
when 2
|
86
|
-
result.nil? ? df : block.call(result, df)
|
87
|
-
when 1
|
88
|
-
block.call(df)
|
89
|
-
result
|
90
|
-
else
|
91
|
-
raise ArgumentError, "Block must accept 1 or 2 arguments when split_ys is false"
|
92
|
-
end
|
93
|
-
end
|
94
|
-
end
|
95
|
-
end
|
96
|
-
end
|
97
|
-
end
|
98
|
-
end
|
@@ -1,43 +0,0 @@
|
|
1
|
-
module EasyML::Data::Dataset::Splitters
|
2
|
-
class DateSplitter
|
3
|
-
include GlueGun::DSL
|
4
|
-
|
5
|
-
attribute :today, :datetime
|
6
|
-
def today=(value)
|
7
|
-
super(value.in_time_zone(UTC).to_datetime)
|
8
|
-
end
|
9
|
-
attribute :date_col, :string
|
10
|
-
attribute :months_test, :integer, default: 2
|
11
|
-
attribute :months_valid, :integer, default: 2
|
12
|
-
|
13
|
-
def initialize(options)
|
14
|
-
options[:today] ||= UTC.now
|
15
|
-
super(options)
|
16
|
-
end
|
17
|
-
|
18
|
-
def split(df)
|
19
|
-
unless df[date_col].dtype.is_a?(Polars::Datetime)
|
20
|
-
raise "Date splitter cannot split on non-date col #{date_col}, dtype is #{df[date_col].dtype}"
|
21
|
-
end
|
22
|
-
|
23
|
-
validation_date_start, test_date_start = splits
|
24
|
-
|
25
|
-
test_df = df.filter(Polars.col(date_col) >= test_date_start)
|
26
|
-
remaining_df = df.filter(Polars.col(date_col) < test_date_start)
|
27
|
-
valid_df = remaining_df.filter(Polars.col(date_col) >= validation_date_start)
|
28
|
-
train_df = remaining_df.filter(Polars.col(date_col) < validation_date_start)
|
29
|
-
|
30
|
-
[train_df, valid_df, test_df]
|
31
|
-
end
|
32
|
-
|
33
|
-
def months(n)
|
34
|
-
ActiveSupport::Duration.months(n)
|
35
|
-
end
|
36
|
-
|
37
|
-
def splits
|
38
|
-
test_date_start = today.advance(months: -months_test).beginning_of_day
|
39
|
-
validation_date_start = today.advance(months: -(months_test + months_valid)).beginning_of_day
|
40
|
-
[validation_date_start, test_date_start]
|
41
|
-
end
|
42
|
-
end
|
43
|
-
end
|
data/lib/easy_ml/data/dataset.rb
DELETED
@@ -1,430 +0,0 @@
|
|
1
|
-
require "polars"
|
2
|
-
require_relative "datasource"
|
3
|
-
require_relative "dataset/splitters"
|
4
|
-
require_relative "dataset/splits"
|
5
|
-
|
6
|
-
# Dataset is responsible for:
|
7
|
-
#
|
8
|
-
# 1) Ensuring data is synced from its source (e.g. S3 — delegates to datasource)
|
9
|
-
# 2) Ensuring the data is properly split into train, test, and validation data (delegates to splitter)
|
10
|
-
# 3) Knowing where data is stored on disk, and pulling batches of data into memory
|
11
|
-
# 4) Knowing where to save updated data (after preprocessing steps)
|
12
|
-
#
|
13
|
-
module EasyML
|
14
|
-
module Data
|
15
|
-
class Dataset
|
16
|
-
include GlueGun::DSL
|
17
|
-
include EasyML::Logging
|
18
|
-
include EasyML::Data::Utils
|
19
|
-
|
20
|
-
# include GitIgnorable
|
21
|
-
# gitignore :root_dir do |dir|
|
22
|
-
# if Rails.env.test? # Don't gitignore our test files
|
23
|
-
# nil
|
24
|
-
# else
|
25
|
-
# File.join(dir, "files/**/*")
|
26
|
-
# end
|
27
|
-
# end
|
28
|
-
|
29
|
-
# These helpers are defined in GlueGun::DSL.
|
30
|
-
#
|
31
|
-
# define_attr defines configurable attributes for subclasses,
|
32
|
-
# for example, a class sub-classing Dataset will want to define its
|
33
|
-
# target (e.g. the column we are trying to predict)
|
34
|
-
#
|
35
|
-
# These can either be defined on a class-level like this:
|
36
|
-
#
|
37
|
-
# class Dataset < EasyML::Data::Dataset
|
38
|
-
# target "REVENUE"
|
39
|
-
# end
|
40
|
-
#
|
41
|
-
# Or passed in during initialization:
|
42
|
-
#
|
43
|
-
# Dataset.new(target: "REV")
|
44
|
-
#
|
45
|
-
attribute :verbose, :boolean, default: false
|
46
|
-
attribute :today, :date, default: -> { UTC.now }
|
47
|
-
def today=(value)
|
48
|
-
super(value.in_time_zone(UTC).to_date)
|
49
|
-
end
|
50
|
-
attribute :target, :string
|
51
|
-
validates :target, presence: true
|
52
|
-
|
53
|
-
attribute :batch_size, :integer, default: 50_000
|
54
|
-
|
55
|
-
attribute :root_dir, :string
|
56
|
-
validates :root_dir, presence: true
|
57
|
-
def root_dir=(value)
|
58
|
-
super(Pathname.new(value).append("data").to_s)
|
59
|
-
end
|
60
|
-
|
61
|
-
attribute :sample, :float, default: 1.0
|
62
|
-
attribute :drop_if_null, :array, default: []
|
63
|
-
|
64
|
-
# define_attr can also define default values, as well as argument helpers
|
65
|
-
attribute :polars_args, :hash, default: {}
|
66
|
-
def polars_args=(args)
|
67
|
-
super(args.deep_symbolize_keys.inject({}) do |hash, (k, v)|
|
68
|
-
hash.tap do
|
69
|
-
hash[k] = v
|
70
|
-
hash[k] = v.stringify_keys if k == :dtypes
|
71
|
-
end
|
72
|
-
end)
|
73
|
-
end
|
74
|
-
|
75
|
-
attribute :transforms, default: nil
|
76
|
-
validate :transforms_are_transforms
|
77
|
-
def transforms_are_transforms
|
78
|
-
return if transforms.nil? || transforms.respond_to?(:transform)
|
79
|
-
|
80
|
-
errors.add(:transforms, "Must respond to transform, try including EasyML::Data::Transforms")
|
81
|
-
end
|
82
|
-
|
83
|
-
attribute :drop_cols, :array, default: []
|
84
|
-
|
85
|
-
dependency :datasource, EasyML::Data::Datasource::DatasourceFactory
|
86
|
-
|
87
|
-
# dependency defines a configurable dependency, with optional args,
|
88
|
-
# for example, here we define a datasource:
|
89
|
-
#
|
90
|
-
# class YourDataset
|
91
|
-
# datasource :s3, s3_bucket: "fundera-bart", s3_prefix: "xyz"
|
92
|
-
# # This automatically uses the S3Datasource class to pull data
|
93
|
-
# end
|
94
|
-
#
|
95
|
-
# If we define any models based on other data sources (e.g. postgres),
|
96
|
-
# you would just define a new PostgresDatasource
|
97
|
-
#
|
98
|
-
|
99
|
-
# Here we define splitter options, inspired by common Python data splitting techniques:
|
100
|
-
#
|
101
|
-
# 1. Date-based splitter (similar to TimeSeriesSplit from sklearn)
|
102
|
-
#
|
103
|
-
# NOT IMPLEMENTED (but you could implement as necessary):
|
104
|
-
# 2. Random splitter (similar to train_test_split from sklearn)
|
105
|
-
# 3. Stratified splitter (similar to StratifiedKFold from sklearn)
|
106
|
-
# 4. Group-based splitter (similar to GroupKFold from sklearn)
|
107
|
-
# 5. Sliding window splitter (similar to TimeSeriesSplit with a sliding window)
|
108
|
-
#
|
109
|
-
dependency :splitter do |dependency|
|
110
|
-
dependency.option :date do |option|
|
111
|
-
option.default
|
112
|
-
option.set_class EasyML::Data::Dataset::Splitters::DateSplitter
|
113
|
-
option.bind_attribute :today, required: true
|
114
|
-
option.bind_attribute :date_col, required: true
|
115
|
-
option.bind_attribute :months_test, required: true
|
116
|
-
option.bind_attribute :months_valid, required: true
|
117
|
-
end
|
118
|
-
end
|
119
|
-
|
120
|
-
# Here we define the preprocessing logic.
|
121
|
-
# Aka what to do with null values. For instance:
|
122
|
-
#
|
123
|
-
# class YourDataset
|
124
|
-
# preprocessing_steps: {
|
125
|
-
# training: {
|
126
|
-
# annual_revenue: {
|
127
|
-
# clip: {min: 0, max: 1_000_000} # Clip values between these
|
128
|
-
# median: true, # Then learn the median based on clipped values
|
129
|
-
# },
|
130
|
-
# created_date: { ffill: true } # During training, use the latest value in the dataset
|
131
|
-
# },
|
132
|
-
# inference: {
|
133
|
-
# created_date: { today: true } # During inference, use the current date
|
134
|
-
# }
|
135
|
-
# }
|
136
|
-
# end
|
137
|
-
#
|
138
|
-
attribute :preprocessing_steps, :hash, default: {}
|
139
|
-
dependency :preprocessor do |dependency|
|
140
|
-
dependency.set_class EasyML::Data::Preprocessor
|
141
|
-
dependency.bind_attribute :directory, source: :root_dir do |value|
|
142
|
-
Pathname.new(value).append("preprocessor")
|
143
|
-
end
|
144
|
-
dependency.bind_attribute :preprocessing_steps
|
145
|
-
end
|
146
|
-
|
147
|
-
# Here we define the raw dataset (uses the Split class)
|
148
|
-
# We use this to learn dataset statistics (e.g. median annual revenue)
|
149
|
-
# But we NEVER overwrite it
|
150
|
-
#
|
151
|
-
dependency :raw do |dependency|
|
152
|
-
dependency.option :file do |option|
|
153
|
-
option.default
|
154
|
-
option.set_class EasyML::Data::Dataset::Splits::FileSplit
|
155
|
-
option.bind_attribute :dir, source: :root_dir do |value|
|
156
|
-
Pathname.new(value).append("files/splits/raw")
|
157
|
-
end
|
158
|
-
option.bind_attribute :polars_args
|
159
|
-
option.bind_attribute :max_rows_per_file, source: :batch_size
|
160
|
-
option.bind_attribute :batch_size
|
161
|
-
option.bind_attribute :sample
|
162
|
-
option.bind_attribute :verbose
|
163
|
-
end
|
164
|
-
|
165
|
-
dependency.option :memory do |option|
|
166
|
-
option.set_class EasyML::Data::Dataset::Splits::InMemorySplit
|
167
|
-
option.bind_attribute :sample
|
168
|
-
end
|
169
|
-
|
170
|
-
dependency.when do |_dep|
|
171
|
-
{ option: :memory } if datasource.is_a?(EasyML::Data::Datasource::PolarsDatasource)
|
172
|
-
end
|
173
|
-
end
|
174
|
-
|
175
|
-
# Here we define the processed dataset (uses the Split class)
|
176
|
-
# After we learn the dataset statistics, we fill null values
|
177
|
-
# using the learned statistics (e.g. fill annual_revenue with median annual_revenue)
|
178
|
-
#
|
179
|
-
dependency :processed do |dependency|
|
180
|
-
dependency.option :file do |option|
|
181
|
-
option.default
|
182
|
-
option.set_class EasyML::Data::Dataset::Splits::FileSplit
|
183
|
-
option.bind_attribute :dir, source: :root_dir do |value|
|
184
|
-
Pathname.new(value).append("files/splits/processed")
|
185
|
-
end
|
186
|
-
option.bind_attribute :polars_args
|
187
|
-
option.bind_attribute :max_rows_per_file, source: :batch_size
|
188
|
-
option.bind_attribute :batch_size
|
189
|
-
option.bind_attribute :sample
|
190
|
-
option.bind_attribute :verbose
|
191
|
-
end
|
192
|
-
|
193
|
-
dependency.option :memory do |option|
|
194
|
-
option.set_class EasyML::Data::Dataset::Splits::InMemorySplit
|
195
|
-
option.bind_attribute :sample
|
196
|
-
end
|
197
|
-
|
198
|
-
dependency.when do |_dep|
|
199
|
-
{ option: :memory } if datasource.is_a?(EasyML::Data::Datasource::PolarsDatasource)
|
200
|
-
end
|
201
|
-
end
|
202
|
-
|
203
|
-
delegate :new_data_available?, :synced?, :stale?, to: :datasource
|
204
|
-
delegate :train, :test, :valid, to: :split
|
205
|
-
delegate :splits, to: :splitter
|
206
|
-
|
207
|
-
def refresh!
|
208
|
-
refresh_datasource
|
209
|
-
split_data
|
210
|
-
fit
|
211
|
-
normalize_all
|
212
|
-
alert_nulls
|
213
|
-
end
|
214
|
-
|
215
|
-
def normalize(df = nil)
|
216
|
-
df = drop_nulls(df)
|
217
|
-
df = apply_transforms(df)
|
218
|
-
preprocessor.postprocess(df)
|
219
|
-
end
|
220
|
-
|
221
|
-
# A "production" preprocessor is predicting live values (e.g. used on live webservers)
|
222
|
-
# A "development" preprocessor is used during training (e.g. we're learning new values for the dataset)
|
223
|
-
#
|
224
|
-
delegate :statistics, to: :preprocessor
|
225
|
-
|
226
|
-
def train(split_ys: false, all_columns: false, &block)
|
227
|
-
load_data(:train, split_ys: split_ys, all_columns: all_columns, &block)
|
228
|
-
end
|
229
|
-
|
230
|
-
def valid(split_ys: false, all_columns: false, &block)
|
231
|
-
load_data(:valid, split_ys: split_ys, all_columns: all_columns, &block)
|
232
|
-
end
|
233
|
-
|
234
|
-
def test(split_ys: false, all_columns: false, &block)
|
235
|
-
load_data(:test, split_ys: split_ys, all_columns: all_columns, &block)
|
236
|
-
end
|
237
|
-
|
238
|
-
def data(split_ys: false, all_columns: false)
|
239
|
-
if split_ys
|
240
|
-
x_train, y_train = train(split_ys: true, all_columns: all_columns)
|
241
|
-
x_valid, y_valid = valid(split_ys: true, all_columns: all_columns)
|
242
|
-
x_test, y_test = test(split_ys: true, all_columns: all_columns)
|
243
|
-
|
244
|
-
xs = Polars.concat([x_train, x_valid, x_test])
|
245
|
-
ys = Polars.concat([y_train, y_valid, y_test])
|
246
|
-
[xs, ys]
|
247
|
-
else
|
248
|
-
train_df = train(split_ys: false, all_columns: all_columns)
|
249
|
-
valid_df = valid(split_ys: false, all_columns: all_columns)
|
250
|
-
test_df = test(split_ys: false, all_columns: all_columns)
|
251
|
-
|
252
|
-
Polars.concat([train_df, valid_df, test_df])
|
253
|
-
end
|
254
|
-
end
|
255
|
-
|
256
|
-
def cleanup
|
257
|
-
raw.cleanup
|
258
|
-
processed.cleanup
|
259
|
-
end
|
260
|
-
|
261
|
-
def check_nulls(data_type = :processed)
|
262
|
-
result = %i[train test valid].each_with_object({}) do |segment, acc|
|
263
|
-
segment_result = { nulls: {}, total: 0 }
|
264
|
-
|
265
|
-
data_source = data_type == :raw ? raw : processed
|
266
|
-
data_source.read(segment) do |df|
|
267
|
-
df_nulls = null_check(df)
|
268
|
-
df.columns.each do |column|
|
269
|
-
segment_result[:nulls][column] ||= { null_count: 0, total_count: 0 }
|
270
|
-
if df_nulls && df_nulls[column]
|
271
|
-
segment_result[:nulls][column][:null_count] += df_nulls[column][:null_count]
|
272
|
-
end
|
273
|
-
segment_result[:nulls][column][:total_count] += df.height
|
274
|
-
end
|
275
|
-
end
|
276
|
-
|
277
|
-
segment_result[:nulls].each do |column, counts|
|
278
|
-
percentage = (counts[:null_count].to_f / counts[:total_count] * 100).round(1)
|
279
|
-
acc[column] ||= {}
|
280
|
-
acc[column][segment] = percentage
|
281
|
-
end
|
282
|
-
end
|
283
|
-
|
284
|
-
# Remove columns that have no nulls across all segments
|
285
|
-
result.reject! { |_, v| v.values.all?(&:zero?) }
|
286
|
-
|
287
|
-
result.empty? ? nil : result
|
288
|
-
end
|
289
|
-
|
290
|
-
def processed?
|
291
|
-
!should_split?
|
292
|
-
end
|
293
|
-
|
294
|
-
def decode_labels(ys, col: nil)
|
295
|
-
preprocessor.decode_labels(ys, col: col.nil? ? target : col)
|
296
|
-
end
|
297
|
-
|
298
|
-
private
|
299
|
-
|
300
|
-
def refresh_datasource
|
301
|
-
datasource.refresh!
|
302
|
-
end
|
303
|
-
log_method :refresh!, "Refreshing datasource", verbose: true
|
304
|
-
|
305
|
-
def normalize_all
|
306
|
-
processed.cleanup
|
307
|
-
|
308
|
-
%i[train test valid].each do |segment|
|
309
|
-
raw.read(segment) do |df|
|
310
|
-
processed_df = normalize(df)
|
311
|
-
processed.save(segment, processed_df)
|
312
|
-
end
|
313
|
-
end
|
314
|
-
end
|
315
|
-
log_method :normalize_all, "Normalizing dataset", verbose: true
|
316
|
-
|
317
|
-
def drop_nulls(df)
|
318
|
-
return df if drop_if_null.nil? || drop_if_null.empty?
|
319
|
-
|
320
|
-
df.drop_nulls(subset: drop_if_null)
|
321
|
-
end
|
322
|
-
|
323
|
-
# Lists the columns to leave out when reading data.
#
# @param all_columns [Boolean] when truthy nothing is dropped
# @return an empty array when +all_columns+ is truthy, otherwise +drop_cols+
def drop_columns(all_columns: false)
  all_columns ? [] : drop_cols
end
|
330
|
-
|
331
|
-
# Reads one segment from the processed store when #processed? is true and from
# the raw store otherwise.
#
# @param segment the segment name handed to the store's #read
# @param split_ys forwarded to #read as +split_ys+
# @param all_columns when truthy, no columns are dropped
# @param block optional block forwarded to #read
# @return whatever the chosen store's #read returns
def load_data(segment, split_ys: false, all_columns: false, &block)
  excluded = drop_columns(all_columns: all_columns)
  store = processed? ? processed : raw

  store.read(segment, split_ys: split_ys, target: target, drop_cols: excluded, &block)
end
|
339
|
-
|
340
|
-
# Fits the preprocessor.
#
# @param xs the data to fit on; raw.train is used when this is nil
# @return whatever preprocessor.fit returns
def fit(xs = nil)
  training_data = xs.nil? ? raw.train : xs
  preprocessor.fit(training_data)
end
log_method :fit, "Learning statistics", verbose: true
|
346
|
-
|
347
|
-
# Reads one segment from either the processed or the raw store, forwarding the
# block to the store's #read.
#
# @param segment the segment name handed to the store's #read
# @param processed [Boolean] read from the processed store when truthy
#   (the default), from the raw store otherwise
# @param block block forwarded to #read
# @return whatever the chosen store's #read returns
def in_batches(segment, processed: true, &block)
  # The keyword argument shadows the #processed reader inside this method, so a
  # bare `processed.read` would call #read on the boolean itself. The explicit
  # receiver forces the method call.
  store = processed ? self.processed : raw
  store.read(segment, &block)
end
|
354
|
-
|
355
|
-
# Re-splits the datasource into raw train/valid/test segments when
# #should_split? says so; otherwise does nothing and returns nil.
#
# Existing data is cleaned up first. Each batch from the datasource is handed
# to the splitter and the three resulting frames are saved to the raw store.
# The current sample size is persisted afterwards.
def split_data
  if should_split?
    cleanup

    datasource.in_batches do |batch|
      train_part, valid_part, test_part = splitter.split(batch)

      # Saved in the order train, valid, test.
      { train: train_part, valid: valid_part, test: test_part }.each do |name, frame|
        raw.save(name, frame)
      end
    end

    # Record the sample size this split was made with.
    save_previous_sample(sample)
  end
end
log_method :split_data, "Splitting data", verbose: true
|
370
|
-
|
371
|
-
# Decides whether the raw data has to be split again.
#
# Truthy when any of these holds:
# * no previous sample size has been recorded,
# * the raw store has no split timestamp,
# * the split timestamp is earlier than datasource.last_updated_at,
# * the current +sample+ is larger than the recorded one.
def should_split?
  last_split_at = raw.split_at
  recorded_sample = load_previous_sample
  return true if recorded_sample.nil?

  # Evaluated before the timestamp checks, matching the original order.
  grew = recorded_sample && sample > recorded_sample
  return true if last_split_at.nil?

  (last_split_at < datasource.last_updated_at) || grew
end
|
377
|
-
|
378
|
-
# Path of the JSON file that stores the previously used sample size.
#
# @return [String] "sample_info.json" joined onto +root_dir+
def sample_info_file
  file_name = "sample_info.json"
  File.join(root_dir, file_name)
end
|
381
|
-
|
382
|
-
# Persists the sample size to #sample_info_file as JSON under the
# "previous_sample" key, overwriting any existing content.
#
# @param sample_size the value to record
# @return [Integer] the number of bytes written (the result of File.write)
def save_previous_sample(sample_size)
  path = sample_info_file
  payload = JSON.generate({ previous_sample: sample_size })

  File.write(path, payload)
end
|
385
|
-
|
386
|
-
# Reads the sample size recorded by #save_previous_sample.
#
# @return the value stored under "previous_sample", or nil when the file is
#   missing, is not valid JSON (e.g. empty or truncated), or does not hold a
#   JSON object
def load_previous_sample
  path = sample_info_file
  return nil unless File.exist?(path)

  stored = JSON.parse(File.read(path))
  # Anything other than an object cannot carry the key; treat it as "nothing recorded".
  stored.is_a?(Hash) ? stored["previous_sample"] : nil
rescue JSON::ParserError, Errno::ENOENT
  # An unreadable record is treated like a missing one instead of raising on
  # every later call. ENOENT covers the file vanishing between the existence
  # check and the read.
  nil
end
|
391
|
-
|
392
|
-
# Runs the configured transforms over a frame.
#
# @param df the frame to transform
# @return the frame unchanged when +transforms+ is nil, otherwise the result
#   of transforms.apply_transforms(df)
def apply_transforms(df)
  return df if transforms.nil?

  transforms.apply_transforms(df)
end
|
399
|
-
|
400
|
-
# Logs the null reports produced by #check_nulls for the processed and raw data.
#
# * Processed data: every reported column/segment percentage is logged as a
#   warning; when nothing is reported an info line is logged instead.
# * Raw data: only column/segment pairs whose percentage exceeds 50 are logged.
#
# @return [nil] always
def alert_nulls
  processed_report = check_nulls(:processed)
  raw_report = check_nulls(:raw)

  if processed_report
    log_warning("Nulls found in the processed dataset:")
    processed_report.each do |column, by_segment|
      by_segment.each do |segment, percentage|
        log_warning(" #{column} - #{segment}: #{percentage}% nulls")
      end
    end
  else
    log_info("No nulls found in the processed dataset.")
  end

  # A falsy raw report means there is nothing to iterate.
  (raw_report || {}).each do |column, by_segment|
    by_segment.each do |segment, percentage|
      next unless percentage > 50

      log_warning("Data processing issue detected: #{column} - #{segment} has #{percentage}% nulls in the raw dataset")
    end
  end

  nil
end
log_method :alert_nulls, "Checking for nulls", verbose: true
|
428
|
-
end
|
429
|
-
end
|
430
|
-
end
|
@@ -1,60 +0,0 @@
|
|
1
|
-
require_relative "merged_datasource"
|
2
|
-
|
3
|
-
module EasyML
  module Data
    class Datasource
      # Declares, through the GlueGun DSL, the datasource implementations that
      # can be selected for a :datasource dependency and the attributes bound
      # to each of them.
      class DatasourceFactory
        include GlueGun::DSL

        dependency :datasource do |builder|
          # S3-backed datasource; this is the option marked as default.
          builder.option :s3 do |opt|
            opt.default
            opt.set_class EasyML::Data::Datasource::S3Datasource
            opt.bind_attribute(:root_dir) { |path| Pathname.new(path).append("files") }
            opt.bind_attribute :polars_args, default: {}
            opt.bind_attribute :s3_bucket, required: true
            opt.bind_attribute :s3_prefix
            opt.bind_attribute :s3_access_key_id, required: true
            opt.bind_attribute :s3_secret_access_key, required: true
          end

          # Datasource backed by FileDatasource.
          builder.option :file do |opt|
            opt.set_class EasyML::Data::Datasource::FileDatasource
            opt.bind_attribute(:root_dir) { |path| Pathname.new(path).append("files/raw") }
            opt.bind_attribute :polars_args
          end

          # Datasource wrapping a value bound as :df.
          builder.option :polars do |opt|
            opt.set_class EasyML::Data::Datasource::PolarsDatasource
            opt.bind_attribute :df
          end

          # Datasource backed by MergedDatasource.
          builder.option :merged do |opt|
            opt.set_class EasyML::Data::Datasource::MergedDatasource
            opt.bind_attribute :root_dir
          end

          # Shorthand inputs: a Polars::DataFrame selects the :polars option
          # (bound as :df); a String or Pathname selects the :file option
          # (bound as :root_dir). Any other value yields nil from this block.
          builder.when do |candidate|
            case candidate
            when Polars::DataFrame then { option: :polars, as: :df }
            when String, Pathname then { option: :file, as: :root_dir }
            end
          end
        end
      end
    end
  end
end
|
56
|
-
|
57
|
-
# Declared here, after DatasourceFactory has been defined, to avoid a circular
# dependency between the two.
# NOTE(review): with this compact `class A::B::C` form the enclosing namespaces
# are not part of the lexical constant scope, so the bare DatasourceFactory
# presumably resolves through MergedDatasource's ancestors — confirm.
class EasyML::Data::Datasource::MergedDatasource
  dependency(:datasources, DatasourceFactory)
end
|