easy_ml 0.1.3 → 0.2.0.pre.rc1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +234 -26
- data/Rakefile +45 -0
- data/app/controllers/easy_ml/application_controller.rb +67 -0
- data/app/controllers/easy_ml/columns_controller.rb +38 -0
- data/app/controllers/easy_ml/datasets_controller.rb +156 -0
- data/app/controllers/easy_ml/datasources_controller.rb +88 -0
- data/app/controllers/easy_ml/deploys_controller.rb +20 -0
- data/app/controllers/easy_ml/models_controller.rb +151 -0
- data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
- data/app/controllers/easy_ml/settings_controller.rb +59 -0
- data/app/frontend/components/AlertProvider.tsx +108 -0
- data/app/frontend/components/DatasetPreview.tsx +161 -0
- data/app/frontend/components/EmptyState.tsx +28 -0
- data/app/frontend/components/ModelCard.tsx +255 -0
- data/app/frontend/components/ModelDetails.tsx +334 -0
- data/app/frontend/components/ModelForm.tsx +384 -0
- data/app/frontend/components/Navigation.tsx +300 -0
- data/app/frontend/components/Pagination.tsx +72 -0
- data/app/frontend/components/Popover.tsx +55 -0
- data/app/frontend/components/PredictionStream.tsx +105 -0
- data/app/frontend/components/ScheduleModal.tsx +726 -0
- data/app/frontend/components/SearchInput.tsx +23 -0
- data/app/frontend/components/SearchableSelect.tsx +132 -0
- data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
- data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
- data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
- data/app/frontend/components/dataset/ColumnList.tsx +101 -0
- data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
- data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
- data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
- data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
- data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
- data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
- data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
- data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
- data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
- data/app/frontend/components/dataset/splitters/constants.ts +77 -0
- data/app/frontend/components/dataset/splitters/types.ts +168 -0
- data/app/frontend/components/dataset/splitters/utils.ts +53 -0
- data/app/frontend/components/features/CodeEditor.tsx +46 -0
- data/app/frontend/components/features/DataPreview.tsx +150 -0
- data/app/frontend/components/features/FeatureCard.tsx +88 -0
- data/app/frontend/components/features/FeatureForm.tsx +235 -0
- data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
- data/app/frontend/components/settings/PluginSettings.tsx +81 -0
- data/app/frontend/components/ui/badge.tsx +44 -0
- data/app/frontend/components/ui/collapsible.tsx +9 -0
- data/app/frontend/components/ui/scroll-area.tsx +46 -0
- data/app/frontend/components/ui/separator.tsx +29 -0
- data/app/frontend/entrypoints/App.tsx +40 -0
- data/app/frontend/entrypoints/Application.tsx +24 -0
- data/app/frontend/hooks/useAutosave.ts +61 -0
- data/app/frontend/layouts/Layout.tsx +38 -0
- data/app/frontend/lib/utils.ts +6 -0
- data/app/frontend/mockData.ts +272 -0
- data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
- data/app/frontend/pages/DatasetsPage.tsx +261 -0
- data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
- data/app/frontend/pages/DatasourcesPage.tsx +261 -0
- data/app/frontend/pages/EditModelPage.tsx +45 -0
- data/app/frontend/pages/EditTransformationPage.tsx +56 -0
- data/app/frontend/pages/ModelsPage.tsx +115 -0
- data/app/frontend/pages/NewDatasetPage.tsx +366 -0
- data/app/frontend/pages/NewModelPage.tsx +45 -0
- data/app/frontend/pages/NewTransformationPage.tsx +43 -0
- data/app/frontend/pages/SettingsPage.tsx +272 -0
- data/app/frontend/pages/ShowModelPage.tsx +30 -0
- data/app/frontend/pages/TransformationsPage.tsx +95 -0
- data/app/frontend/styles/application.css +100 -0
- data/app/frontend/types/dataset.ts +146 -0
- data/app/frontend/types/datasource.ts +33 -0
- data/app/frontend/types/preprocessing.ts +1 -0
- data/app/frontend/types.ts +113 -0
- data/app/helpers/easy_ml/application_helper.rb +10 -0
- data/app/jobs/easy_ml/application_job.rb +21 -0
- data/app/jobs/easy_ml/batch_job.rb +46 -0
- data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
- data/app/jobs/easy_ml/deploy_job.rb +13 -0
- data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
- data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
- data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
- data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
- data/app/jobs/easy_ml/training_job.rb +62 -0
- data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
- data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
- data/app/models/easy_ml/cleaner.rb +82 -0
- data/app/models/easy_ml/column.rb +124 -0
- data/app/models/easy_ml/column_history.rb +30 -0
- data/app/models/easy_ml/column_list.rb +122 -0
- data/app/models/easy_ml/concerns/configurable.rb +61 -0
- data/app/models/easy_ml/concerns/versionable.rb +19 -0
- data/app/models/easy_ml/dataset.rb +767 -0
- data/app/models/easy_ml/dataset_history.rb +56 -0
- data/app/models/easy_ml/datasource.rb +182 -0
- data/app/models/easy_ml/datasource_history.rb +24 -0
- data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
- data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
- data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
- data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
- data/app/models/easy_ml/deploy.rb +114 -0
- data/app/models/easy_ml/event.rb +79 -0
- data/app/models/easy_ml/feature.rb +437 -0
- data/app/models/easy_ml/feature_history.rb +38 -0
- data/app/models/easy_ml/model.rb +575 -41
- data/app/models/easy_ml/model_file.rb +133 -0
- data/app/models/easy_ml/model_file_history.rb +24 -0
- data/app/models/easy_ml/model_history.rb +51 -0
- data/app/models/easy_ml/models/base_model.rb +58 -0
- data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
- data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
- data/app/models/easy_ml/models/xgboost.rb +544 -4
- data/app/models/easy_ml/prediction.rb +44 -0
- data/app/models/easy_ml/retraining_job.rb +278 -0
- data/app/models/easy_ml/retraining_run.rb +184 -0
- data/app/models/easy_ml/settings.rb +37 -0
- data/app/models/easy_ml/splitter.rb +90 -0
- data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
- data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
- data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
- data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
- data/app/models/easy_ml/tuner_job.rb +56 -0
- data/app/models/easy_ml/tuner_run.rb +31 -0
- data/app/models/splitter_history.rb +6 -0
- data/app/serializers/easy_ml/column_serializer.rb +27 -0
- data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
- data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
- data/app/serializers/easy_ml/feature_serializer.rb +27 -0
- data/app/serializers/easy_ml/model_serializer.rb +90 -0
- data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
- data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
- data/app/serializers/easy_ml/settings_serializer.rb +9 -0
- data/app/views/layouts/easy_ml/application.html.erb +15 -0
- data/config/initializers/resque.rb +3 -0
- data/config/resque-pool.yml +6 -0
- data/config/routes.rb +39 -0
- data/config/spring.rb +1 -0
- data/config/vite.json +15 -0
- data/lib/easy_ml/configuration.rb +64 -0
- data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
- data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
- data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
- data/lib/easy_ml/core/model_evaluator.rb +161 -89
- data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
- data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
- data/lib/easy_ml/core/tuner.rb +123 -62
- data/lib/easy_ml/core.rb +0 -3
- data/lib/easy_ml/core_ext/hash.rb +24 -0
- data/lib/easy_ml/core_ext/pathname.rb +11 -5
- data/lib/easy_ml/data/date_converter.rb +90 -0
- data/lib/easy_ml/data/filter_extensions.rb +31 -0
- data/lib/easy_ml/data/polars_column.rb +126 -0
- data/lib/easy_ml/data/polars_reader.rb +297 -0
- data/lib/easy_ml/data/preprocessor.rb +280 -142
- data/lib/easy_ml/data/simple_imputer.rb +255 -0
- data/lib/easy_ml/data/splits/file_split.rb +252 -0
- data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
- data/lib/easy_ml/data/splits/split.rb +95 -0
- data/lib/easy_ml/data/splits.rb +9 -0
- data/lib/easy_ml/data/statistics_learner.rb +93 -0
- data/lib/easy_ml/data/synced_directory.rb +341 -0
- data/lib/easy_ml/data.rb +6 -2
- data/lib/easy_ml/engine.rb +105 -6
- data/lib/easy_ml/feature_store.rb +227 -0
- data/lib/easy_ml/features.rb +61 -0
- data/lib/easy_ml/initializers/inflections.rb +17 -3
- data/lib/easy_ml/logging.rb +2 -2
- data/lib/easy_ml/predict.rb +74 -0
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
- data/lib/easy_ml/support/est.rb +5 -1
- data/lib/easy_ml/support/file_rotate.rb +79 -15
- data/lib/easy_ml/support/file_support.rb +9 -0
- data/lib/easy_ml/support/local_file.rb +24 -0
- data/lib/easy_ml/support/lockable.rb +62 -0
- data/lib/easy_ml/support/synced_file.rb +103 -0
- data/lib/easy_ml/support/utc.rb +5 -1
- data/lib/easy_ml/support.rb +6 -3
- data/lib/easy_ml/version.rb +4 -1
- data/lib/easy_ml.rb +7 -2
- metadata +355 -72
- data/app/models/easy_ml/models.rb +0 -5
- data/lib/easy_ml/core/model.rb +0 -30
- data/lib/easy_ml/core/model_core.rb +0 -181
- data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
- data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
- data/lib/easy_ml/core/models/xgboost.rb +0 -10
- data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
- data/lib/easy_ml/core/models.rb +0 -10
- data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
- data/lib/easy_ml/core/uploaders.rb +0 -7
- data/lib/easy_ml/data/dataloader.rb +0 -6
- data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
- data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
- data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
- data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
- data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
- data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
- data/lib/easy_ml/data/dataset/splits.rb +0 -11
- data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
- data/lib/easy_ml/data/dataset/splitters.rb +0 -9
- data/lib/easy_ml/data/dataset.rb +0 -430
- data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
- data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
- data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
- data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
- data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
- data/lib/easy_ml/data/datasource.rb +0 -33
- data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
- data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
- data/lib/easy_ml/deployment.rb +0 -5
- data/lib/easy_ml/support/synced_directory.rb +0 -134
- data/lib/easy_ml/transforms.rb +0 -29
- /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -1,49 +0,0 @@
|
|
1
|
-
module EasyML
  module Data
    class Dataset
      module Splits
        # Split that keeps each segment's DataFrame in a Hash held in memory
        # instead of writing it anywhere.
        class InMemorySplit < Split
          include GlueGun::DSL

          attribute :sample, :float, default: 1.0

          def initialize(options)
            super
            @data = {}
          end

          # Stores +df+ under +segment+, replacing whatever was stored before.
          def save(segment, df)
            @data[segment] = df
          end

          # Returns the stored frame for +segment+ (nil when nothing was saved),
          # sampled when +sample+ is below 1.0 and with any of +drop_cols+ that
          # exist in the frame removed. With a block, the frame (or the xs/ys
          # pair when +split_ys+ is set) is handed to the block helpers
          # inherited from Split, always with a nil running result.
          def read(segment, split_ys: false, target: nil, drop_cols: [], &block)
            frame = @data[segment]
            return nil if frame.nil?

            frame = sample_data(frame) if sample < 1.0
            removable = drop_cols & frame.columns
            frame = frame.drop(removable) unless removable.empty?

            return split_features_targets(frame, split_ys, target) unless block_given?
            return process_block_without_split_ys(block, nil, frame) unless split_ys

            xs, ys = split_features_targets(frame, true, target)
            process_block_with_split_ys(block, nil, xs, ys)
          end

          # Forgets every stored segment.
          def cleanup
            @data.clear
          end

          # nil while nothing is stored; otherwise the current time.
          def split_at
            @data.empty? ? nil : Time.now
          end
        end
      end
    end
  end
end
|
@@ -1,98 +0,0 @@
|
|
1
|
-
module EasyML
  module Data
    class Dataset
      module Splits
        # Abstract base for split storage. Subclasses provide #save, #read,
        # #cleanup and #split_at; this class supplies the segment shortcuts
        # and the helpers shared by the concrete implementations.
        class Split
          include GlueGun::DSL
          include EasyML::Data::Utils

          attribute :polars_args, :hash, default: {}
          attribute :max_rows_per_file, :integer, default: 1_000_000
          attribute :batch_size, :integer, default: 10_000
          attribute :sample, :float, default: 1.0
          attribute :verbose, :boolean, default: false

          def save(segment, df)
            raise NotImplementedError, "Subclasses must implement #save"
          end

          def read(segment, split_ys: false, target: nil, drop_cols: [], &block)
            raise NotImplementedError, "Subclasses must implement #read"
          end

          # Shortcut for read(:train).
          def train(&block)
            read(:train, &block)
          end

          # Shortcut for read(:test).
          def test(&block)
            read(:test, &block)
          end

          # Shortcut for read(:valid).
          def valid(&block)
            read(:valid, &block)
          end

          def cleanup
            raise NotImplementedError, "Subclasses must implement #cleanup"
          end

          def split_at
            raise NotImplementedError, "Subclasses must implement #split_at"
          end

          protected

          # Returns +df+ untouched, or [features, targets] when +split_ys+ is
          # set (features = df without +target+, targets = only +target+).
          def split_features_targets(df, split_ys, target)
            raise ArgumentError, "Target column must be specified when split_ys is true" if split_ys && target.nil?
            return df unless split_ys

            [df.drop(target), df.select(target)]
          end

          # Down-samples +df+ to ceil(rows * sample) rows with a fixed seed;
          # a sample of 1.0 or more returns the frame as-is.
          def sample_data(df)
            if sample >= 1.0
              df
            else
              row_count = (df.shape[0] * sample).ceil
              df.sample(n: row_count, seed: 42)
            end
          end

          def create_progress_bar(segment, total_rows)
            bar_options = {
              title: "Reading #{segment}",
              total: total_rows,
              format: "%t: |%B| %p%% %e"
            }
            ProgressBar.create(**bar_options)
          end

          # Dispatches on the block's arity:
          #   3 -> reducer style; a nil running result is seeded with [xs, ys]
          #        without calling the block
          #   2 -> block is called for its side effects, +result+ is returned
          def process_block_with_split_ys(block, result, xs, ys)
            arity = block.arity
            if arity == 3
              return [xs, ys] if result.nil?

              block.call(result, xs, ys)
            elsif arity == 2
              block.call(xs, ys)
              result
            else
              raise ArgumentError, "Block must accept 2 or 3 arguments when split_ys is true"
            end
          end

          # Same dispatch as above for an unsplit frame (arity 2 reduces,
          # arity 1 is called for its side effects).
          def process_block_without_split_ys(block, result, df)
            arity = block.arity
            if arity == 2
              return df if result.nil?

              block.call(result, df)
            elsif arity == 1
              block.call(df)
              result
            else
              raise ArgumentError, "Block must accept 1 or 2 arguments when split_ys is false"
            end
          end
        end
      end
    end
  end
end
|
@@ -1,43 +0,0 @@
|
|
1
|
-
module EasyML::Data::Dataset::Splitters
  # Splits a DataFrame into train / valid / test frames by comparing a
  # datetime column against cut-off dates counted back from +today+.
  class DateSplitter
    include GlueGun::DSL

    attribute :today, :datetime
    # Converts the assigned value to a DateTime in the UTC zone before
    # passing it to the attribute writer.
    def today=(value)
      super(value.in_time_zone(UTC).to_datetime)
    end
    attribute :date_col, :string
    attribute :months_test, :integer, default: 2
    attribute :months_valid, :integer, default: 2

    # @param options [Hash] attribute values; :today falls back to UTC.now
    #   when it is missing or falsy
    def initialize(options)
      # Build a copy instead of writing the default back into the caller's
      # hash, so a shared/reused options hash is never modified.
      super(options.merge(today: options[:today] || UTC.now))
    end

    # @param df [Polars::DataFrame] frame containing +date_col+
    # @return [Array] [train_df, valid_df, test_df] where
    #   test  = rows with date >= test start,
    #   valid = rows with validation start <= date < test start,
    #   train = rows with date < validation start
    # @raise [RuntimeError] when +date_col+ is not a Polars::Datetime column
    def split(df)
      dtype = df[date_col].dtype
      unless dtype.is_a?(Polars::Datetime)
        raise "Date splitter cannot split on non-date col #{date_col}, dtype is #{dtype}"
      end

      validation_date_start, test_date_start = splits

      test_df = df.filter(Polars.col(date_col) >= test_date_start)
      remaining_df = df.filter(Polars.col(date_col) < test_date_start)
      valid_df = remaining_df.filter(Polars.col(date_col) >= validation_date_start)
      train_df = remaining_df.filter(Polars.col(date_col) < validation_date_start)

      [train_df, valid_df, test_df]
    end

    # @return [ActiveSupport::Duration] a duration of +n+ months
    def months(n)
      ActiveSupport::Duration.months(n)
    end

    # @return [Array] [validation_date_start, test_date_start]; each is +today+
    #   moved back by the configured number of months, at the beginning of that day
    def splits
      test_date_start = today.advance(months: -months_test).beginning_of_day
      validation_date_start = today.advance(months: -(months_test + months_valid)).beginning_of_day
      [validation_date_start, test_date_start]
    end
  end
end
|
data/lib/easy_ml/data/dataset.rb
DELETED
@@ -1,430 +0,0 @@
|
|
1
|
-
require "polars"
|
2
|
-
require_relative "datasource"
|
3
|
-
require_relative "dataset/splitters"
|
4
|
-
require_relative "dataset/splits"
|
5
|
-
|
6
|
-
# Dataset is responsible for:
|
7
|
-
#
|
8
|
-
# 1) Ensuring data is synced from its source (e.g. S3 — delegates to datasource)
|
9
|
-
# 2) Ensuring the data is properly split into train, test, and validation data (delegates to splitter)
|
10
|
-
# 3) Knowing where data is stored on disk, and pulling batches of data into memory
|
11
|
-
# 4) Knowing where to save updated data (after preprocessing steps)
|
12
|
-
#
|
13
|
-
module EasyML
|
14
|
-
module Data
|
15
|
-
class Dataset
|
16
|
-
include GlueGun::DSL
|
17
|
-
include EasyML::Logging
|
18
|
-
include EasyML::Data::Utils
|
19
|
-
|
20
|
-
# include GitIgnorable
|
21
|
-
# gitignore :root_dir do |dir|
|
22
|
-
# if Rails.env.test? # Don't gitignore our test files
|
23
|
-
# nil
|
24
|
-
# else
|
25
|
-
# File.join(dir, "files/**/*")
|
26
|
-
# end
|
27
|
-
# end
|
28
|
-
|
29
|
-
# These helpers are defined in GlueGun::DSL.
|
30
|
-
#
|
31
|
-
# define_attr defines configurable attributes for subclasses,
|
32
|
-
# for example, a class sub-classing Dataset will want to define its
|
33
|
-
# target (e.g. the column we are trying to predict)
|
34
|
-
#
|
35
|
-
# These can either be defined on a class-level like this:
|
36
|
-
#
|
37
|
-
# class Dataset < EasyML::Data::Dataset
|
38
|
-
# target "REVENUE"
|
39
|
-
# end
|
40
|
-
#
|
41
|
-
# Or passed in during initialization:
|
42
|
-
#
|
43
|
-
# Dataset.new(target: "REV")
|
44
|
-
#
|
45
|
-
attribute :verbose, :boolean, default: false
|
46
|
-
attribute :today, :date, default: -> { UTC.now }
|
47
|
-
# Converts the assigned value to a Date in the UTC zone before handing it to
# the attribute writer (super).
def today=(value)
  utc_date = value.in_time_zone(UTC).to_date
  super(utc_date)
end
|
50
|
-
attribute :target, :string
|
51
|
-
validates :target, presence: true
|
52
|
-
|
53
|
-
attribute :batch_size, :integer, default: 50_000
|
54
|
-
|
55
|
-
attribute :root_dir, :string
|
56
|
-
validates :root_dir, presence: true
|
57
|
-
# Stores the given directory with a "data" component added, as a String.
# NOTE(review): Pathname#append is not stdlib — presumably a project core
# extension that joins path segments; confirm.
def root_dir=(value)
  data_dir = Pathname.new(value).append("data")
  super(data_dir.to_s)
end
|
60
|
-
|
61
|
-
attribute :sample, :float, default: 1.0
|
62
|
-
attribute :drop_if_null, :array, default: []
|
63
|
-
|
64
|
-
# define_attr can also define default values, as well as argument helpers
|
65
|
-
attribute :polars_args, :hash, default: {}
|
66
|
-
# Deep-symbolizes the keys of +args+, then turns the keys of the :dtypes
# entry back into strings (presumably because they are column names — confirm).
def polars_args=(args)
  normalized = args.deep_symbolize_keys.each_with_object({}) do |(key, value), acc|
    acc[key] = key == :dtypes ? value.stringify_keys : value
  end
  super(normalized)
end
|
74
|
-
|
75
|
-
attribute :transforms, default: nil
|
76
|
-
validate :transforms_are_transforms
|
77
|
-
# Validation: +transforms+ must be nil or respond to #transform.
def transforms_are_transforms
  acceptable = transforms.nil? || transforms.respond_to?(:transform)
  errors.add(:transforms, "Must respond to transform, try including EasyML::Data::Transforms") unless acceptable
end
|
82
|
-
|
83
|
-
attribute :drop_cols, :array, default: []
|
84
|
-
|
85
|
-
dependency :datasource, EasyML::Data::Datasource::DatasourceFactory
|
86
|
-
|
87
|
-
# dependency defines a configurable dependency, with optional args,
|
88
|
-
# for example, here we define a datasource:
|
89
|
-
#
|
90
|
-
# class YourDataset
|
91
|
-
# datasource :s3, s3_bucket: "fundera-bart", s3_prefix: "xyz"
|
92
|
-
# # This automatically uses the S3Datasource class to pull data
|
93
|
-
# end
|
94
|
-
#
|
95
|
-
# If we define any models based on other data sources (e.g. postgres),
|
96
|
-
# you would just define a new PostgresDatasource
|
97
|
-
#
|
98
|
-
|
99
|
-
# Here we define splitter options, inspired by common Python data splitting techniques:
|
100
|
-
#
|
101
|
-
# 1. Date-based splitter (similar to TimeSeriesSplit from sklearn)
|
102
|
-
#
|
103
|
-
# NOT IMPLEMENTED (but you could implement as necessary):
|
104
|
-
# 2. Random splitter (similar to train_test_split from sklearn)
|
105
|
-
# 3. Stratified splitter (similar to StratifiedKFold from sklearn)
|
106
|
-
# 4. Group-based splitter (similar to GroupKFold from sklearn)
|
107
|
-
# 5. Sliding window splitter (similar to TimeSeriesSplit with a sliding window)
|
108
|
-
#
|
109
|
-
dependency :splitter do |dependency|
|
110
|
-
dependency.option :date do |option|
|
111
|
-
option.default
|
112
|
-
option.set_class EasyML::Data::Dataset::Splitters::DateSplitter
|
113
|
-
option.bind_attribute :today, required: true
|
114
|
-
option.bind_attribute :date_col, required: true
|
115
|
-
option.bind_attribute :months_test, required: true
|
116
|
-
option.bind_attribute :months_valid, required: true
|
117
|
-
end
|
118
|
-
end
|
119
|
-
|
120
|
-
# Here we define the preprocessing logic.
|
121
|
-
# Aka what to do with null values. For instance:
|
122
|
-
#
|
123
|
-
# class YourDataset
|
124
|
-
# preprocessing_steps: {
|
125
|
-
# training: {
|
126
|
-
# annual_revenue: {
|
127
|
-
# clip: {min: 0, max: 1_000_000} # Clip values between these
|
128
|
-
# median: true, # Then learn the median based on clipped values
|
129
|
-
# },
|
130
|
-
# created_date: { ffill: true } # During training, use the latest value in the dataset
|
131
|
-
# },
|
132
|
-
# inference: {
|
133
|
-
# created_date: { today: true } # During inference, use the current date
|
134
|
-
# }
|
135
|
-
# }
|
136
|
-
# end
|
137
|
-
#
|
138
|
-
attribute :preprocessing_steps, :hash, default: {}
|
139
|
-
dependency :preprocessor do |dependency|
|
140
|
-
dependency.set_class EasyML::Data::Preprocessor
|
141
|
-
dependency.bind_attribute :directory, source: :root_dir do |value|
|
142
|
-
Pathname.new(value).append("preprocessor")
|
143
|
-
end
|
144
|
-
dependency.bind_attribute :preprocessing_steps
|
145
|
-
end
|
146
|
-
|
147
|
-
# Here we define the raw dataset (uses the Split class)
|
148
|
-
# We use this to learn dataset statistics (e.g. median annual revenue)
|
149
|
-
# But we NEVER overwrite it
|
150
|
-
#
|
151
|
-
dependency :raw do |dependency|
|
152
|
-
dependency.option :file do |option|
|
153
|
-
option.default
|
154
|
-
option.set_class EasyML::Data::Dataset::Splits::FileSplit
|
155
|
-
option.bind_attribute :dir, source: :root_dir do |value|
|
156
|
-
Pathname.new(value).append("files/splits/raw")
|
157
|
-
end
|
158
|
-
option.bind_attribute :polars_args
|
159
|
-
option.bind_attribute :max_rows_per_file, source: :batch_size
|
160
|
-
option.bind_attribute :batch_size
|
161
|
-
option.bind_attribute :sample
|
162
|
-
option.bind_attribute :verbose
|
163
|
-
end
|
164
|
-
|
165
|
-
dependency.option :memory do |option|
|
166
|
-
option.set_class EasyML::Data::Dataset::Splits::InMemorySplit
|
167
|
-
option.bind_attribute :sample
|
168
|
-
end
|
169
|
-
|
170
|
-
dependency.when do |_dep|
|
171
|
-
{ option: :memory } if datasource.is_a?(EasyML::Data::Datasource::PolarsDatasource)
|
172
|
-
end
|
173
|
-
end
|
174
|
-
|
175
|
-
# Here we define the processed dataset (uses the Split class)
|
176
|
-
# After we learn the dataset statistics, we fill null values
|
177
|
-
# using the learned statistics (e.g. fill annual_revenue with median annual_revenue)
|
178
|
-
#
|
179
|
-
dependency :processed do |dependency|
|
180
|
-
dependency.option :file do |option|
|
181
|
-
option.default
|
182
|
-
option.set_class EasyML::Data::Dataset::Splits::FileSplit
|
183
|
-
option.bind_attribute :dir, source: :root_dir do |value|
|
184
|
-
Pathname.new(value).append("files/splits/processed")
|
185
|
-
end
|
186
|
-
option.bind_attribute :polars_args
|
187
|
-
option.bind_attribute :max_rows_per_file, source: :batch_size
|
188
|
-
option.bind_attribute :batch_size
|
189
|
-
option.bind_attribute :sample
|
190
|
-
option.bind_attribute :verbose
|
191
|
-
end
|
192
|
-
|
193
|
-
dependency.option :memory do |option|
|
194
|
-
option.set_class EasyML::Data::Dataset::Splits::InMemorySplit
|
195
|
-
option.bind_attribute :sample
|
196
|
-
end
|
197
|
-
|
198
|
-
dependency.when do |_dep|
|
199
|
-
{ option: :memory } if datasource.is_a?(EasyML::Data::Datasource::PolarsDatasource)
|
200
|
-
end
|
201
|
-
end
|
202
|
-
|
203
|
-
delegate :new_data_available?, :synced?, :stale?, to: :datasource
|
204
|
-
delegate :train, :test, :valid, to: :split
|
205
|
-
delegate :splits, to: :splitter
|
206
|
-
|
207
|
-
# Runs the refresh pipeline in order: refresh the datasource, split the data,
# fit the preprocessor, normalize every segment, then alert on nulls.
# Returns whatever alert_nulls returns.
def refresh!
  %i[refresh_datasource split_data fit normalize_all].each { |step| send(step) }
  alert_nulls
end
|
214
|
-
|
215
|
-
# Drops rows with nulls in the drop_if_null columns, applies the transforms,
# then passes the result to the preprocessor's postprocess step.
def normalize(df = nil)
  transformed = apply_transforms(drop_nulls(df))
  preprocessor.postprocess(transformed)
end
|
220
|
-
|
221
|
-
# A "production" preprocessor is predicting live values (e.g. used on live webservers)
|
222
|
-
# A "development" preprocessor is used during training (e.g. we're learning new values for the dataset)
|
223
|
-
#
|
224
|
-
delegate :statistics, to: :preprocessor
|
225
|
-
|
226
|
-
# Loads the :train segment through load_data, forwarding the options and block.
def train(split_ys: false, all_columns: false, &block)
  load_options = { split_ys: split_ys, all_columns: all_columns }
  load_data(:train, **load_options, &block)
end
|
229
|
-
|
230
|
-
# Loads the :valid segment through load_data, forwarding the options and block.
def valid(split_ys: false, all_columns: false, &block)
  load_options = { split_ys: split_ys, all_columns: all_columns }
  load_data(:valid, **load_options, &block)
end
|
233
|
-
|
234
|
-
# Loads the :test segment through load_data, forwarding the options and block.
def test(split_ys: false, all_columns: false, &block)
  load_options = { split_ys: split_ys, all_columns: all_columns }
  load_data(:test, **load_options, &block)
end
|
237
|
-
|
238
|
-
# Concatenates the train, valid and test segments (in that order).
# Returns one frame, or [xs, ys] when +split_ys+ is set.
def data(split_ys: false, all_columns: false)
  unless split_ys
    frames = %i[train valid test].map do |segment|
      send(segment, split_ys: false, all_columns: all_columns)
    end
    return Polars.concat(frames)
  end

  pairs = %i[train valid test].map do |segment|
    send(segment, split_ys: true, all_columns: all_columns)
  end
  xs = Polars.concat(pairs.map { |features, _targets| features })
  ys = Polars.concat(pairs.map { |_features, targets| targets })
  [xs, ys]
end
|
255
|
-
|
256
|
-
# Clears the raw split first, then the processed split.
def cleanup
  raw.cleanup
  processed.cleanup
end
|
260
|
-
|
261
|
-
# Reports, per column and per segment, the percentage of null values.
#
# @param data_type [Symbol] :raw reads the raw split; any other value reads
#   the processed split
# @return [Hash, nil] { column => { segment => percentage } } limited to
#   columns whose percentage is non-zero in at least one segment, or nil when
#   no column qualifies
def check_nulls(data_type = :processed)
  split = data_type == :raw ? raw : processed

  result = %i[train test valid].each_with_object({}) do |segment, acc|
    counts_by_column = Hash.new { |hash, column| hash[column] = { null_count: 0, total_count: 0 } }

    split.read(segment) do |df|
      # null_check is defined outside this block; it is used here as returning
      # nil or a hash of column => { null_count: Integer }.
      df_nulls = null_check(df)
      df.columns.each do |column|
        counts = counts_by_column[column]
        counts[:null_count] += df_nulls[column][:null_count] if df_nulls && df_nulls[column]
        counts[:total_count] += df.height
      end
    end

    counts_by_column.each do |column, counts|
      total = counts[:total_count]
      # A segment with zero rows would give 0.0 / 0 = NaN; NaN is not zero?,
      # so the column would wrongly survive the filter below. Report 0% instead.
      percentage = total.zero? ? 0.0 : (counts[:null_count].to_f / total * 100).round(1)
      (acc[column] ||= {})[segment] = percentage
    end
  end

  # Remove columns that have no nulls across all segments
  result.reject! { |_, v| v.values.all?(&:zero?) }

  result.empty? ? nil : result
end
|
289
|
-
|
290
|
-
# True exactly when no (re)split is currently needed.
def processed?
  needs_split = should_split?
  !needs_split
end
|
293
|
-
|
294
|
-
# Delegates label decoding to the preprocessor, using the target column when
# no +col+ is given.
def decode_labels(ys, col: nil)
  decoder = preprocessor
  label_col = col.nil? ? target : col
  decoder.decode_labels(ys, col: label_col)
end
|
297
|
-
|
298
|
-
private
|
299
|
-
|
300
|
-
# Asks the configured datasource to refresh itself.
def refresh_datasource
  source = datasource
  source.refresh!
end
|
303
|
-
log_method :refresh!, "Refreshing datasource", verbose: true
|
304
|
-
|
305
|
-
# Rebuilds the processed split: clears it, then normalizes every frame read
# from each raw segment and saves it under the same segment.
def normalize_all
  processed.cleanup

  %i[train test valid].each do |segment|
    raw.read(segment) { |batch| processed.save(segment, normalize(batch)) }
  end
end
|
315
|
-
log_method :normalize_all, "Normalizing dataset", verbose: true
|
316
|
-
|
317
|
-
# Drops rows that are null in any drop_if_null column; returns +df+ unchanged
# when no such columns are configured.
def drop_nulls(df)
  if drop_if_null.nil? || drop_if_null.empty?
    df
  else
    df.drop_nulls(subset: drop_if_null)
  end
end
|
322
|
-
|
323
|
-
# Columns to exclude when loading: none when +all_columns+ is set, otherwise
# the configured drop_cols.
def drop_columns(all_columns: false)
  all_columns ? [] : drop_cols
end
|
330
|
-
|
331
|
-
# Reads one segment from the processed store when `processed?` is true,
# otherwise from the raw store.
#
# @param segment     the segment name forwarded to the store's `read`
# @param split_ys    forwarded to `read` as `split_ys:`
# @param all_columns when truthy, no columns are dropped (see `drop_columns`)
# @param block       optional block forwarded to `read`
# @return whatever the selected store's `read` returns
def load_data(segment, split_ys: false, all_columns: false, &block)
  excluded = drop_columns(all_columns: all_columns)
  store = processed? ? processed : raw
  store.read(segment, split_ys: split_ys, target: target, drop_cols: excluded, &block)
end
|
339
|
-
|
340
|
-
# Fits the preprocessor.
#
# @param xs the data to fit on; when nil, `raw.train` is used instead
# @return whatever `preprocessor.fit` returns
def fit(xs = nil)
  training_data = xs.nil? ? raw.train : xs
  preprocessor.fit(training_data)
end
|
345
|
-
log_method :fit, "Learning statistics", verbose: true
|
346
|
-
|
347
|
-
# Iterates over the batches of one segment.
#
# @param segment   the segment name forwarded to the store's `read`
# @param processed [Boolean] read from the processed store when truthy
#   (the default), from the raw store otherwise
# @param block     block forwarded to the store's `read`
# @return whatever the selected store's `read` returns
def in_batches(segment, processed: true, &block)
  # The `processed:` keyword shadows the `processed` store accessor inside
  # this method, so a bare `processed.read` would be sent to the boolean
  # (`true.read`) and raise NoMethodError. Calling with explicit
  # parentheses forces Ruby to dispatch to the method instead of the local.
  store = processed ? processed() : raw
  store.read(segment, &block)
end
|
354
|
-
|
355
|
-
# Re-splits the datasource into train/valid/test segments when needed.
#
# Does nothing (returns nil) unless `should_split?` is truthy. Otherwise it
# runs `cleanup`, splits every datasource batch with the splitter, saves the
# three parts to the raw store, and finally records the current sample.
#
# @return nil when no split was needed, otherwise the result of
#   `save_previous_sample`
def split_data
  if should_split?
    cleanup

    datasource.in_batches do |batch|
      train_part, valid_part, test_part = splitter.split(batch)
      { train: train_part, valid: valid_part, test: test_part }.each do |segment, frame|
        raw.save(segment, frame)
      end
    end

    # Persist the sample size that this split was produced with
    save_previous_sample(sample)
  end
end
|
369
|
-
log_method :split_data, "Splitting data", verbose: true
|
370
|
-
|
371
|
-
# Decides whether the raw split has to be regenerated.
#
# A split is required when any of the following holds:
# * no previous sample has been recorded,
# * the raw store reports no split timestamp,
# * the split is older than the datasource's `last_updated_at`,
# * the current `sample` is larger than the recorded one.
#
# @return [Boolean, nil] truthy when a split is required
def should_split?
  last_split_at = raw.split_at
  recorded_sample = load_previous_sample
  grew = recorded_sample && sample > recorded_sample

  return true if recorded_sample.nil?
  return true if last_split_at.nil?
  return true if last_split_at < datasource.last_updated_at

  grew
end
|
377
|
-
|
378
|
-
# Location of the JSON file that records the last sample size.
#
# @return [String] `sample_info.json` joined onto `root_dir`
def sample_info_file
  file_name = "sample_info.json"
  File.join(root_dir, file_name)
end
|
381
|
-
|
382
|
-
# Writes the given sample size to `sample_info_file` as JSON, under the
# `previous_sample` key, replacing any existing content.
#
# @param sample_size the value to record
# @return [Integer] the number of bytes written (from `File.write`)
def save_previous_sample(sample_size)
  record = { previous_sample: sample_size }
  serialized = JSON.generate(record)
  File.write(sample_info_file, serialized)
end
|
385
|
-
|
386
|
-
# Reads the sample size recorded by the last split.
#
# @return the stored `previous_sample` value, or nil when the file is
#   missing, is not valid JSON, or does not contain a JSON object
def load_previous_sample
  return nil unless File.exist?(sample_info_file)

  payload = JSON.parse(File.read(sample_info_file))
  # Anything other than an object cannot hold the key; indexing an Array
  # with a String would raise TypeError.
  payload.is_a?(Hash) ? payload["previous_sample"] : nil
rescue JSON::ParserError
  # A truncated or corrupt file (e.g. from an interrupted write) is treated
  # as "no previous sample" so callers fall back to nil instead of failing
  # on every call until the file is removed by hand.
  nil
end
|
391
|
-
|
392
|
-
# Runs the configured transforms over a frame.
#
# @param df the frame to transform
# @return the frame unchanged when no transforms are configured (nil),
#   otherwise the result of `transforms.apply_transforms(df)`
def apply_transforms(df)
  return df if transforms.nil?

  transforms.apply_transforms(df)
end
|
399
|
-
|
400
|
-
# Logs a summary of null values found in the processed and raw datasets.
#
# Every column/segment pair reported for the processed dataset is logged as
# a warning. For the raw dataset only pairs whose null percentage exceeds 50
# are logged.
#
# @return [nil]
def alert_nulls
  processed_report = check_nulls(:processed)
  raw_report = check_nulls(:raw)

  if processed_report
    log_warning("Nulls found in the processed dataset:")
    processed_report.each do |column, by_segment|
      by_segment.each { |segment, pct| log_warning(" #{column} - #{segment}: #{pct}% nulls") }
    end
  else
    log_info("No nulls found in the processed dataset.")
  end

  # A missing raw report means there is nothing to flag.
  (raw_report || {}).each do |column, by_segment|
    by_segment.each do |segment, pct|
      next unless pct > 50

      log_warning("Data processing issue detected: #{column} - #{segment} has #{pct}% nulls in the raw dataset")
    end
  end

  nil
end
|
427
|
-
log_method :alert_nulls, "Checking for nulls", verbose: true
|
428
|
-
end
|
429
|
-
end
|
430
|
-
end
|
@@ -1,60 +0,0 @@
|
|
1
|
-
require_relative "merged_datasource"
|
2
|
-
|
3
|
-
module EasyML
  module Data
    class Datasource
      # Declares, through the GlueGun DSL, which concrete datasource classes
      # may satisfy a `:datasource` dependency and which attributes each
      # option binds.
      class DatasourceFactory
        include GlueGun::DSL

        dependency :datasource do |dep|
          # S3-backed datasource; this is the option on which `default` is called.
          dep.option :s3 do |opt|
            opt.default
            opt.set_class EasyML::Data::Datasource::S3Datasource
            opt.bind_attribute(:root_dir) { |dir| Pathname.new(dir).append("files") }
            opt.bind_attribute :polars_args, default: {}
            opt.bind_attribute :s3_bucket, required: true
            opt.bind_attribute :s3_prefix
            opt.bind_attribute :s3_access_key_id, required: true
            opt.bind_attribute :s3_secret_access_key, required: true
          end

          # Datasource backed by files under `<root_dir>/files/raw`.
          dep.option :file do |opt|
            opt.set_class EasyML::Data::Datasource::FileDatasource
            opt.bind_attribute(:root_dir) { |dir| Pathname.new(dir).append("files/raw") }
            opt.bind_attribute :polars_args
          end

          # Datasource wrapping an in-memory data frame.
          dep.option :polars do |opt|
            opt.set_class EasyML::Data::Datasource::PolarsDatasource
            opt.bind_attribute :df
          end

          # Datasource combining several other datasources.
          dep.option :merged do |opt|
            opt.set_class EasyML::Data::Datasource::MergedDatasource
            opt.bind_attribute :root_dir
          end

          # Shorthand inputs: a Polars::DataFrame selects the :polars option
          # (bound as :df); a String or Pathname selects the :file option
          # (bound as :root_dir). Any other value yields nil.
          dep.when do |given|
            case given
            when Polars::DataFrame then { option: :polars, as: :df }
            when String, Pathname then { option: :file, as: :root_dir }
            end
          end
        end
      end
    end
  end
end
|
56
|
-
|
57
|
-
# Declared here, after DatasourceFactory is defined, to avoid a circular
# dependency between MergedDatasource and the factory.
class EasyML::Data::Datasource::MergedDatasource
  dependency :datasources, EasyML::Data::Datasource::DatasourceFactory
end
|