easy_ml 0.1.4 → 0.2.0.pre.rc1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +234 -26
- data/Rakefile +45 -0
- data/app/controllers/easy_ml/application_controller.rb +67 -0
- data/app/controllers/easy_ml/columns_controller.rb +38 -0
- data/app/controllers/easy_ml/datasets_controller.rb +156 -0
- data/app/controllers/easy_ml/datasources_controller.rb +88 -0
- data/app/controllers/easy_ml/deploys_controller.rb +20 -0
- data/app/controllers/easy_ml/models_controller.rb +151 -0
- data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
- data/app/controllers/easy_ml/settings_controller.rb +59 -0
- data/app/frontend/components/AlertProvider.tsx +108 -0
- data/app/frontend/components/DatasetPreview.tsx +161 -0
- data/app/frontend/components/EmptyState.tsx +28 -0
- data/app/frontend/components/ModelCard.tsx +255 -0
- data/app/frontend/components/ModelDetails.tsx +334 -0
- data/app/frontend/components/ModelForm.tsx +384 -0
- data/app/frontend/components/Navigation.tsx +300 -0
- data/app/frontend/components/Pagination.tsx +72 -0
- data/app/frontend/components/Popover.tsx +55 -0
- data/app/frontend/components/PredictionStream.tsx +105 -0
- data/app/frontend/components/ScheduleModal.tsx +726 -0
- data/app/frontend/components/SearchInput.tsx +23 -0
- data/app/frontend/components/SearchableSelect.tsx +132 -0
- data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
- data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
- data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
- data/app/frontend/components/dataset/ColumnList.tsx +101 -0
- data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
- data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
- data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
- data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
- data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
- data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
- data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
- data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
- data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
- data/app/frontend/components/dataset/splitters/constants.ts +77 -0
- data/app/frontend/components/dataset/splitters/types.ts +168 -0
- data/app/frontend/components/dataset/splitters/utils.ts +53 -0
- data/app/frontend/components/features/CodeEditor.tsx +46 -0
- data/app/frontend/components/features/DataPreview.tsx +150 -0
- data/app/frontend/components/features/FeatureCard.tsx +88 -0
- data/app/frontend/components/features/FeatureForm.tsx +235 -0
- data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
- data/app/frontend/components/settings/PluginSettings.tsx +81 -0
- data/app/frontend/components/ui/badge.tsx +44 -0
- data/app/frontend/components/ui/collapsible.tsx +9 -0
- data/app/frontend/components/ui/scroll-area.tsx +46 -0
- data/app/frontend/components/ui/separator.tsx +29 -0
- data/app/frontend/entrypoints/App.tsx +40 -0
- data/app/frontend/entrypoints/Application.tsx +24 -0
- data/app/frontend/hooks/useAutosave.ts +61 -0
- data/app/frontend/layouts/Layout.tsx +38 -0
- data/app/frontend/lib/utils.ts +6 -0
- data/app/frontend/mockData.ts +272 -0
- data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
- data/app/frontend/pages/DatasetsPage.tsx +261 -0
- data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
- data/app/frontend/pages/DatasourcesPage.tsx +261 -0
- data/app/frontend/pages/EditModelPage.tsx +45 -0
- data/app/frontend/pages/EditTransformationPage.tsx +56 -0
- data/app/frontend/pages/ModelsPage.tsx +115 -0
- data/app/frontend/pages/NewDatasetPage.tsx +366 -0
- data/app/frontend/pages/NewModelPage.tsx +45 -0
- data/app/frontend/pages/NewTransformationPage.tsx +43 -0
- data/app/frontend/pages/SettingsPage.tsx +272 -0
- data/app/frontend/pages/ShowModelPage.tsx +30 -0
- data/app/frontend/pages/TransformationsPage.tsx +95 -0
- data/app/frontend/styles/application.css +100 -0
- data/app/frontend/types/dataset.ts +146 -0
- data/app/frontend/types/datasource.ts +33 -0
- data/app/frontend/types/preprocessing.ts +1 -0
- data/app/frontend/types.ts +113 -0
- data/app/helpers/easy_ml/application_helper.rb +10 -0
- data/app/jobs/easy_ml/application_job.rb +21 -0
- data/app/jobs/easy_ml/batch_job.rb +46 -0
- data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
- data/app/jobs/easy_ml/deploy_job.rb +13 -0
- data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
- data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
- data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
- data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
- data/app/jobs/easy_ml/training_job.rb +62 -0
- data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
- data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
- data/app/models/easy_ml/cleaner.rb +82 -0
- data/app/models/easy_ml/column.rb +124 -0
- data/app/models/easy_ml/column_history.rb +30 -0
- data/app/models/easy_ml/column_list.rb +122 -0
- data/app/models/easy_ml/concerns/configurable.rb +61 -0
- data/app/models/easy_ml/concerns/versionable.rb +19 -0
- data/app/models/easy_ml/dataset.rb +767 -0
- data/app/models/easy_ml/dataset_history.rb +56 -0
- data/app/models/easy_ml/datasource.rb +182 -0
- data/app/models/easy_ml/datasource_history.rb +24 -0
- data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
- data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
- data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
- data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
- data/app/models/easy_ml/deploy.rb +114 -0
- data/app/models/easy_ml/event.rb +79 -0
- data/app/models/easy_ml/feature.rb +437 -0
- data/app/models/easy_ml/feature_history.rb +38 -0
- data/app/models/easy_ml/model.rb +575 -41
- data/app/models/easy_ml/model_file.rb +133 -0
- data/app/models/easy_ml/model_file_history.rb +24 -0
- data/app/models/easy_ml/model_history.rb +51 -0
- data/app/models/easy_ml/models/base_model.rb +58 -0
- data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
- data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
- data/app/models/easy_ml/models/xgboost.rb +544 -5
- data/app/models/easy_ml/prediction.rb +44 -0
- data/app/models/easy_ml/retraining_job.rb +278 -0
- data/app/models/easy_ml/retraining_run.rb +184 -0
- data/app/models/easy_ml/settings.rb +37 -0
- data/app/models/easy_ml/splitter.rb +90 -0
- data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
- data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
- data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
- data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
- data/app/models/easy_ml/tuner_job.rb +56 -0
- data/app/models/easy_ml/tuner_run.rb +31 -0
- data/app/models/splitter_history.rb +6 -0
- data/app/serializers/easy_ml/column_serializer.rb +27 -0
- data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
- data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
- data/app/serializers/easy_ml/feature_serializer.rb +27 -0
- data/app/serializers/easy_ml/model_serializer.rb +90 -0
- data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
- data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
- data/app/serializers/easy_ml/settings_serializer.rb +9 -0
- data/app/views/layouts/easy_ml/application.html.erb +15 -0
- data/config/initializers/resque.rb +3 -0
- data/config/resque-pool.yml +6 -0
- data/config/routes.rb +39 -0
- data/config/spring.rb +1 -0
- data/config/vite.json +15 -0
- data/lib/easy_ml/configuration.rb +64 -0
- data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
- data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
- data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
- data/lib/easy_ml/core/model_evaluator.rb +161 -89
- data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
- data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
- data/lib/easy_ml/core/tuner.rb +123 -62
- data/lib/easy_ml/core.rb +0 -3
- data/lib/easy_ml/core_ext/hash.rb +24 -0
- data/lib/easy_ml/core_ext/pathname.rb +11 -5
- data/lib/easy_ml/data/date_converter.rb +90 -0
- data/lib/easy_ml/data/filter_extensions.rb +31 -0
- data/lib/easy_ml/data/polars_column.rb +126 -0
- data/lib/easy_ml/data/polars_reader.rb +297 -0
- data/lib/easy_ml/data/preprocessor.rb +280 -142
- data/lib/easy_ml/data/simple_imputer.rb +255 -0
- data/lib/easy_ml/data/splits/file_split.rb +252 -0
- data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
- data/lib/easy_ml/data/splits/split.rb +95 -0
- data/lib/easy_ml/data/splits.rb +9 -0
- data/lib/easy_ml/data/statistics_learner.rb +93 -0
- data/lib/easy_ml/data/synced_directory.rb +341 -0
- data/lib/easy_ml/data.rb +6 -2
- data/lib/easy_ml/engine.rb +105 -6
- data/lib/easy_ml/feature_store.rb +227 -0
- data/lib/easy_ml/features.rb +61 -0
- data/lib/easy_ml/initializers/inflections.rb +17 -3
- data/lib/easy_ml/logging.rb +2 -2
- data/lib/easy_ml/predict.rb +74 -0
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
- data/lib/easy_ml/support/est.rb +5 -1
- data/lib/easy_ml/support/file_rotate.rb +79 -15
- data/lib/easy_ml/support/file_support.rb +9 -0
- data/lib/easy_ml/support/local_file.rb +24 -0
- data/lib/easy_ml/support/lockable.rb +62 -0
- data/lib/easy_ml/support/synced_file.rb +103 -0
- data/lib/easy_ml/support/utc.rb +5 -1
- data/lib/easy_ml/support.rb +6 -3
- data/lib/easy_ml/version.rb +4 -1
- data/lib/easy_ml.rb +7 -2
- metadata +355 -72
- data/app/models/easy_ml/models.rb +0 -5
- data/lib/easy_ml/core/model.rb +0 -30
- data/lib/easy_ml/core/model_core.rb +0 -181
- data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
- data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
- data/lib/easy_ml/core/models/xgboost.rb +0 -10
- data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
- data/lib/easy_ml/core/models.rb +0 -10
- data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
- data/lib/easy_ml/core/uploaders.rb +0 -7
- data/lib/easy_ml/data/dataloader.rb +0 -6
- data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
- data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
- data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
- data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
- data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
- data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
- data/lib/easy_ml/data/dataset/splits.rb +0 -11
- data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
- data/lib/easy_ml/data/dataset/splitters.rb +0 -9
- data/lib/easy_ml/data/dataset.rb +0 -430
- data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
- data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
- data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
- data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
- data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
- data/lib/easy_ml/data/datasource.rb +0 -33
- data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
- data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
- data/lib/easy_ml/deployment.rb +0 -5
- data/lib/easy_ml/support/synced_directory.rb +0 -134
- data/lib/easy_ml/transforms.rb +0 -29
- /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -0,0 +1,93 @@
|
|
1
|
+
require_relative "date_converter"
|
2
|
+
require_relative "polars_column"
|
3
|
+
|
4
|
+
module EasyML::Data
|
5
|
+
class StatisticsLearner
|
6
|
+
attr_accessor :verbose
|
7
|
+
|
8
|
+
# Creates a learner instance.
#
# @param options [Hash] optional settings; only the :verbose entry is read
def initialize(options = {})
  @verbose = options[:verbose]
end
|
11
|
+
|
12
|
+
# Learns statistics for the raw splits and, when the processed splits report
# data present, for the processed splits as well.
#
# @param raw [Object] split set handed to learn_split
# @param processed [Object] split set; only used when processed.data.present?
# @return [Hash] always has :raw, and :processed when processed data exists
def self.learn(raw, processed)
  { raw: learn_split(raw) }.tap do |stats|
    stats[:processed] = learn_split(processed) if processed.data.present?
  end
end
|
17
|
+
|
18
|
+
# Learns per-column statistics for one split set.
#
# Count-style statistics (num_rows, null_count, unique_count, counts) are taken
# from the full data (split.read(:all)); distribution statistics (mean, median,
# min, max, std, last_value, most_frequent_value) are taken from the training
# portion only (split.read(:train)).
#
# @param split [#read] object answering read(:all) and read(:train)
# @return [Hash] column name => merged statistics; empty when nothing could be learned
def self.learn_split(split)
  df = split.read(:all)
  train_df = split.read(:train)

  # learn_df returns nil for a nil frame; treat that as "no statistics"
  # instead of failing on nil further down.
  all_stats = learn_df(df) || {}
  train_stats = learn_df(train_df) || {}

  all_stats.each_with_object({}) do |(column, column_stats), output|
    # A column may have no entry in the training statistics.
    train_column_stats = train_stats[column] || {}

    output[column] = column_stats
      .slice(:num_rows, :null_count, :unique_count, :counts)
      .merge(train_column_stats.slice(:mean, :median, :min, :max, :std, :last_value, :most_frequent_value))
  end
end
|
32
|
+
|
33
|
+
# Builds a statistics hash for every column of a data frame.
#
# @param df [Object, nil] data frame (indexed by column name, answering #columns)
# @return [Hash, nil] column name => statistics hash; nil when df is nil
def self.learn_df(df)
  return if df.nil?

  base_stats = describe_to_h(df).deep_symbolize_keys

  df.columns.each_with_object({}) do |col, stats|
    series = df[col]
    # Skip columns of Null dtype. This used to be `return {}`, which left the
    # whole method and threw away the statistics of every other column.
    next if series.dtype == Polars::Null

    field_type = PolarsColumn.determine_type(series)

    # Statistics common to every field type.
    # NOTE(review): series.shape may be an array rather than a plain row count — confirm.
    stats[col] = {
      num_rows: series.shape,
      null_count: base_stats[col.to_sym][:null_count].to_i,
    }

    # Type-specific statistics.
    case field_type
    when :integer, :float
      # Id-like columns only keep a restricted set of the describe output.
      allowed_attrs = if id_column?(col)
          %i[field_type null_count min max]
        else
          base_stats[col.to_sym].keys
        end
      stats[col].merge!(base_stats[col.to_sym].slice(*allowed_attrs))
    when :categorical, :string, :text, :boolean
      stats[col].merge!(most_frequent_value: series.mode.sort.to_a&.first)
      if field_type == :categorical
        stats[col].merge!(
          unique_count: series.n_unique,
          counts: Hash[series.value_counts.to_hashes.map(&:values)],
        )
      end
    when :datetime
      stats[col].merge!(
        unique_count: series.n_unique,
        last_value: series.sort[-1],
      )
    end
  end
end
|
74
|
+
|
75
|
+
# Tells whether a column name looks like an identifier column.
#
# The comparison is case-insensitive: the name is either exactly "id" or
# contains "_id". Whole-string comparison is used instead of the line anchors
# ^/$ so that a multi-line name such as "x\nid" no longer counts as "id".
#
# NOTE(review): "_id" is matched anywhere in the name (e.g. "user_id_hash"),
# exactly as before — confirm whether only a "_id" suffix was intended.
#
# @param column [#to_s] column name
# @return [Boolean]
def self.id_column?(column)
  col = column.to_s.downcase
  col == "id" || col.include?("_id")
end
|
79
|
+
|
80
|
+
# Pivots the output of df.describe into a hash keyed by column name.
#
# The first entry of describe's hash form holds the statistic labels; every
# following entry holds one column's values in the same order.
#
# @param df [#describe] data frame
# @return [Hash] column name => { statistic label => value }; {} when
#   describe yields no entries (this case used to raise NoMethodError)
def self.describe_to_h(df)
  described = df.describe.to_h
  label_key, *column_names = described.keys
  return {} if label_key.nil?

  labels = described[label_key].to_a
  column_names.each_with_object({}) do |col_name, hash|
    hash[col_name] = labels.zip(described[col_name].to_a).to_h
  end
end
|
92
|
+
end
|
93
|
+
end
|
@@ -0,0 +1,341 @@
|
|
1
|
+
require_relative "polars_reader"
|
2
|
+
|
3
|
+
module EasyML
|
4
|
+
module Data
|
5
|
+
class SyncedDirectory
|
6
|
+
attr_accessor :root_dir, :s3_bucket, :s3_prefix, :s3_access_key_id, :s3_secret_access_key, :cache_for, :polars_args
|
7
|
+
|
8
|
+
# Stores the directory and S3 settings found in options.
#
# @param options [Hash] may contain :root_dir, :s3_bucket, :s3_prefix,
#   :s3_access_key_id, :s3_secret_access_key, :cache_for and :polars_args;
#   missing entries are stored as nil
def initialize(options = {})
  %i[
    root_dir s3_bucket s3_prefix s3_access_key_id
    s3_secret_access_key cache_for polars_args
  ].each do |setting|
    instance_variable_set("@#{setting}", options.dig(setting))
  end
end
|
17
|
+
|
18
|
+
delegate :query, :data, :all_files, :files, to: :reader
|
19
|
+
|
20
|
+
# Hook run before a sync: cleans the local directory, but only when
# should_sync? says a sync is needed.
def before_sync
  clean if should_sync?
end
|
25
|
+
|
26
|
+
# Hook run after a sync: asks the reader to normalize.
def after_sync
  reader.normalize
end
|
29
|
+
|
30
|
+
# Ensures root_dir exists, wipes it via clean_dir!, then lets the reader
# run its own clean step — in that order.
def clean
  mk_dir
  clean_dir!
  reader.clean
end
|
35
|
+
|
36
|
+
# Lists the objects under s3_prefix in s3_bucket.
#
# @return [Object] the list_objects_v2 response as returned by the S3 client
def remote_files
  s3.list_objects_v2(bucket: s3_bucket, prefix: s3_prefix)
end
|
39
|
+
|
40
|
+
# A sync is due when it is forced or when the directory is not synced.
#
# @param force [Boolean] skip the synced? check when truthy
def should_sync?(force = false)
  force || !synced?
end
|
43
|
+
|
44
|
+
# Forced variant of sync: always downloads, regardless of synced?.
#
# @param parallel [Boolean] passed through to sync
def sync!(parallel: true)
  sync(force: true, parallel: parallel)
end
|
47
|
+
|
48
|
+
# Downloads every object returned by files_to_sync into the local directory.
#
# @param force [Boolean] download even when should_sync? would say no
# @param parallel [Boolean] download through Parallel.each (4 processes,
#   timeout: 10) instead of one object at a time
# @return [Boolean] false when no sync was needed, true after downloading
def sync(force: false, parallel: false)
  return false unless should_sync?(force)

  objects = files_to_sync

  if parallel
    Parallel.each(objects, in_processes: 4, timeout: 10) do |object|
      download_file(object)
    end
  else
    objects.each do |object|
      download_file(object)
    end
  end

  true
end
|
60
|
+
|
61
|
+
# Lists the S3 objects to download: everything under s3_prefix except
# "directory" placeholder keys (keys ending in "/").
#
# The listing response is enumerated page by page, so objects beyond the
# first list_objects_v2 page are included; reading only `.contents` would
# silently drop them.
#
# @return [Array] S3 object summaries
def files_to_sync
  s3.list_objects_v2(bucket: s3_bucket, prefix: s3_prefix)
    .flat_map(&:contents)
    .reject { |object| object.key.end_with?("/") }
end
|
65
|
+
|
66
|
+
# Forwards the given block to the reader's in_batches.
def in_batches(&block)
  reader.in_batches(&block)
end
|
69
|
+
|
70
|
+
# Files as reported by the reader.
def files
  reader.files
end
|
73
|
+
|
74
|
+
# Age of the local data: EasyML::Support::Age.age applied to last_updated_at
# and the current EasyML::Support::EST time.
#
# @param format [String] passed through to EasyML::Support::Age.age
def age(format: "human")
  EasyML::Support::Age.age(last_updated_at, EasyML::Support::EST.now, format: format)
end
|
77
|
+
|
78
|
+
# Opposite of synced?.
def stale?
  !synced?
end
|
81
|
+
|
82
|
+
# Whether the local directory is considered in sync.
#
# A previously computed answer in @synced is reused. Otherwise a usable cache
# (use_cached?) counts as synced without storing anything; failing that, the
# result of calculate_synced is memoized in @synced.
def synced?
  if @synced.nil?
    use_cached? || (@synced = calculate_synced)
  else
    @synced
  end
end
|
89
|
+
|
90
|
+
# Whether the local files are fresh enough to skip checking the remote.
#
# False when cache_for is not set or there is no last_updated_at; otherwise
# true while the age in seconds is below cache_for.to_i.
def use_cached?
  return false if cache_for.blank?
  return false if last_updated_at.nil?

  now = EasyML::Support::EST.now
  EasyML::Support::Age.age(last_updated_at, now, format: "integer") < cache_for.to_i
end
|
97
|
+
|
98
|
+
# Most recent modification time among the local files, converted with
# in_time_zone(EasyML::Support::EST).
#
# @return [Object, nil] nil when there are no local files
def last_updated_at
  newest_mtime = files.map { |path| File.mtime(path) }.max
  return nil if newest_mtime.nil?

  newest_mtime.in_time_zone(EasyML::Support::EST)
end
|
103
|
+
|
104
|
+
# Schema as reported by the reader.
def schema
  reader.schema
end
|
107
|
+
|
108
|
+
# Row count as reported by the reader.
def num_rows
  reader.num_rows
end
|
111
|
+
|
112
|
+
# Downloads one S3 object into root_dir and un-gzips it.
#
# The local name is the basename of the key (after removing a leading
# "<s3_prefix>/" when s3_prefix is present), so the file lands directly in
# root_dir. Any failure is logged and re-raised.
#
# @param object [#key] S3 object summary
def download_file(object)
  key_without_prefix = object.key
  if s3_prefix.present?
    key_without_prefix = key_without_prefix.sub(/^#{Regexp.escape(s3_prefix)}\//, "")
  end

  local_file_path = File.join(root_dir, File.basename(key_without_prefix))
  FileUtils.mkdir_p(File.dirname(local_file_path))

  Rails.logger.info("Downloading object #{object.key} to #{local_file_path}")
  s3.get_object(response_target: local_file_path, bucket: s3_bucket, key: object.key)
  Rails.logger.info("Downloaded #{object.key} to #{local_file_path}")

  ungzipped_file_path = ungzip_file(local_file_path)
  Rails.logger.info("Ungzipped to #{ungzipped_file_path}")
rescue StandardError => e
  # The AWS service errors and Net timeouts are StandardErrors as well.
  Rails.logger.error("Failed to process #{object.key}: #{e.message}")
  raise e
end
|
133
|
+
|
134
|
+
# Forced variant of upload: skips the per-file should_upload? check.
#
# @param parallel [Boolean] passed through to upload
def upload!(parallel: true)
  upload(force: true, parallel: parallel)
end
|
137
|
+
|
138
|
+
# Uploads local files to S3.
#
# @param force [Boolean] when false, the candidates from files_to_upload are
#   additionally filtered through should_upload?
# @param parallel [Boolean] upload through Parallel.each (4 processes,
#   timeout: 10) instead of one file at a time
# @return [true]
def upload(force: false, parallel: true)
  pending = files_to_upload
  pending = pending.select { |path| should_upload?(path) } unless force
  return true if pending.empty?

  if parallel
    Parallel.each(pending, in_processes: 4, timeout: 10) do |path|
      upload_file(path)
    end
  else
    pending.each do |path|
      upload_file(path)
    end
  end

  true
end
|
149
|
+
|
150
|
+
# Local files that are missing remotely or newer than their remote copy.
#
# Remote keys are made comparable to local relative paths by dropping a
# trailing ".gz" and a leading "<s3_prefix>/".
#
# @return [Array<String>] local file paths; [] when root_dir does not exist
def files_to_upload
  return [] unless Dir.exist?(root_dir)

  local_files = Dir.glob(File.join(root_dir, "**", "*")).select { |path| File.file?(path) }

  # Remote last-modified time, keyed by the matching local relative path.
  remote_mtimes = self.remote_files.contents.each_with_object({}) do |object, mtimes|
    next if object.key.end_with?("/")

    local_key = object.key.sub(/\.gz$/, "")
    local_key = local_key.sub(%r{^#{Regexp.escape(s3_prefix)}/}, "") if s3_prefix.present?
    mtimes[local_key] = object.last_modified.in_time_zone(EasyML::Support::EST)
  end

  base = Pathname.new(root_dir)
  local_files.select do |file_path|
    relative_path = Pathname.new(file_path).relative_path_from(base).to_s
    local_mtime = File.mtime(file_path).in_time_zone(EasyML::Support::EST)

    if remote_mtimes.key?(relative_path)
      local_mtime > remote_mtimes[relative_path]
    else
      true
    end
  end
end
|
175
|
+
|
176
|
+
# Add aliases for sync methods
|
177
|
+
alias download! sync!
|
178
|
+
alias download sync
|
179
|
+
|
180
|
+
private
|
181
|
+
|
182
|
+
# Directory handed to the reader; same value as root_dir.
def dir
  root_dir
end
|
185
|
+
|
186
|
+
# Removes the first occurrence of s3_prefix and a trailing "/" from path.
#
# String#sub treats a String pattern literally, so the prefix is passed as-is.
# Running it through Regexp.escape first (as before) inserted backslashes and
# meant that prefixes containing characters such as "-" or "." never matched.
#
# @param path [String]
# @return [String] path unchanged when no s3_prefix is set
def relative_path(path)
  return path unless s3_prefix.present?

  path.sub(s3_prefix, "").gsub(%r{/$}, "")
end
|
193
|
+
|
194
|
+
# Memoized EasyML::Data::PolarsReader for this directory, built with
# refresh: false and the configured polars_args.
def reader
  @reader ||= EasyML::Data::PolarsReader.new(
    root_dir: dir,
    polars_args: polars_args,
    refresh: false,
  )
end
|
203
|
+
|
204
|
+
# Creates root_dir, including missing parent directories.
def mk_dir
  FileUtils.mkdir_p(root_dir)
end
|
207
|
+
|
208
|
+
# Recursively deletes root_dir, but only when it lies strictly inside Rails.root.
#
# Both paths are expanded first, and the comparison includes the trailing path
# separator. A plain start_with? on the raw strings accepted sibling
# directories such as "<root>_other" and paths escaping through "..", and it
# would also have allowed wiping Rails.root itself.
#
# @raise [RuntimeError] when root_dir is not inside Rails.root
def clean_dir!
  app_root = File.expand_path(Rails.root.to_s)
  target = File.expand_path(root_dir.to_s)

  unless target.start_with?(File.join(app_root, ""))
    raise "Refusing to wipe directory #{root_dir}, as it is not in the scope of #{Rails.root}"
  end

  FileUtils.rm_rf(target)
end
|
215
|
+
|
216
|
+
# Builds a new S3 client from the configured access key pair.
# A fresh client is created on every call; nothing is memoized here.
def s3
  Aws::S3::Client.new(
    credentials: Aws::Credentials.new(s3_access_key_id, s3_secret_access_key),
    http_open_timeout: 5, # Timeout for establishing the connection (in seconds)
    http_read_timeout: 30, # Timeout for reading the response (in seconds)
    http_wire_trace: false, # Verbose HTTP wire logging stays off
    http_idle_timeout: 0,
    logger: Logger.new(STDOUT), # Client log output goes to STDOUT
  )
end
|
230
|
+
|
231
|
+
# Decompresses a ".gz" file next to itself and deletes the compressed copy.
#
# A path that does not end in ".gz" is returned untouched. Previously such a
# path was opened as gzip and written onto itself, which either raised or
# corrupted the file. The content is streamed rather than read into memory
# in one piece.
#
# @param gzipped_file_path [String] path of the downloaded file
# @return [String] path of the decompressed file (the input path when it has
#   no ".gz" suffix)
def ungzip_file(gzipped_file_path)
  source_path = gzipped_file_path.to_s
  return gzipped_file_path unless source_path.end_with?(".gz")

  ungzipped_file_path = source_path.delete_suffix(".gz")

  Zlib::GzipReader.open(source_path) do |gz|
    File.open(ungzipped_file_path, "wb") do |file|
      IO.copy_stream(gz, file)
    end
  end

  File.delete(source_path) # the compressed copy is no longer needed
  ungzipped_file_path
end
|
243
|
+
|
244
|
+
# Returns dir unchanged when it starts with "/"; otherwise joins it onto Rails.root.
#
# @param dir [String, Pathname]
def expand_dir(dir)
  dir.to_s.start_with?("/") ? dir : Rails.root.join(dir)
end
|
249
|
+
|
250
|
+
# Whether S3 holds data newer than the local directory.
#
# Nothing to sync remotely => false; remote objects but no local files =>
# true; otherwise the newest remote timestamp is compared with the newest
# local one (a missing remote timestamp counts as "nothing new").
def new_data_available?
  return false if files_to_sync.empty?
  return true if files.empty?

  local_latest = last_updated_at
  remote_latest = s3_last_updated_at

  !remote_latest.nil? && remote_latest > local_latest
end
|
261
|
+
|
262
|
+
# The directory counts as synced when no new remote data is available.
def calculate_synced
  !new_data_available?
end
|
265
|
+
|
266
|
+
# Newest last_modified among the listed S3 objects (keys ending in "/" are
# ignored), converted with in_time_zone(EasyML::Support::EST).
#
# @return [Object, nil] nil when no object carries a timestamp. This used to
#   raise NoMethodError on nil, although new_data_available? checks for nil.
def s3_last_updated_at
  s3_latest = s3.list_objects_v2(bucket: s3_bucket, prefix: s3_prefix).contents
    .reject { |object| object.key.end_with?("/") }
    .map(&:last_modified)
    .compact
    .max

  s3_latest&.in_time_zone(EasyML::Support::EST)
end
|
277
|
+
|
278
|
+
# Gzips one local file and uploads it to S3 as "<key>.gz".
#
# With an s3_prefix the key is the prefix joined with the file's basename;
# without one it is the path relative to root_dir. The temporary ".gz" file
# written next to the source is removed afterwards, whether or not the upload
# succeeded. Failures are logged and re-raised.
#
# @param file_path [String] local file to upload
def upload_file(file_path)
  relative_path = Pathname.new(file_path).relative_path_from(Pathname.new(root_dir)).to_s
  s3_key = if s3_prefix.present?
      File.join(s3_prefix, File.basename(relative_path))
    else
      relative_path
    end

  # Temporary compressed copy, written next to the source file.
  gzipped_file_path = "#{file_path}.gz"

  begin
    Rails.logger.info("Compressing and uploading #{file_path} to s3://#{s3_bucket}/#{s3_key}")

    Zlib::GzipWriter.open(gzipped_file_path) do |gz|
      gz.write(File.binread(file_path))
    end

    File.open(gzipped_file_path, "rb") do |file|
      s3.put_object(bucket: s3_bucket, key: "#{s3_key}.gz", body: file, content_encoding: "gzip")
    end

    Rails.logger.info("Successfully uploaded #{file_path} to s3://#{s3_bucket}/#{s3_key}.gz")
  rescue StandardError => e
    # The AWS service errors are StandardErrors as well.
    Rails.logger.error("Failed to upload #{file_path}: #{e.message}")
    raise e
  ensure
    File.delete(gzipped_file_path) if File.exist?(gzipped_file_path)
  end
end
|
314
|
+
|
315
|
+
# Whether a local file needs uploading: true when its S3 object is missing or
# older than the local file.
#
# The key is derived exactly as upload_file derives it: with an s3_prefix, the
# prefix joined with the file's basename; otherwise the path relative to
# root_dir. Joining the prefix with the full relative path (as before) looked
# up a key that upload_file never writes for files in sub-directories, so
# those files were always re-uploaded.
#
# @param file_path [String] local file
# @return [Boolean]
# @raise [Aws::S3::Errors::ServiceError] on S3 errors other than NotFound
def should_upload?(file_path)
  relative_path = Pathname.new(file_path).relative_path_from(Pathname.new(root_dir)).to_s
  s3_key = s3_prefix.present? ? File.join(s3_prefix, File.basename(relative_path)) : relative_path

  begin
    response = s3.head_object(
      bucket: s3_bucket,
      key: "#{s3_key}.gz",
    )

    local_mtime = File.mtime(file_path).in_time_zone(EasyML::Support::EST)
    remote_mtime = response.last_modified.in_time_zone(EasyML::Support::EST)

    local_mtime > remote_mtime
  rescue Aws::S3::Errors::NotFound
    # No remote copy yet.
    true
  rescue Aws::S3::Errors::ServiceError => e
    Rails.logger.error("Error checking S3 object: #{e.message}")
    raise e
  end
end
|
339
|
+
end
|
340
|
+
end
|
341
|
+
end
|
data/lib/easy_ml/data.rb
CHANGED
@@ -1,8 +1,12 @@
|
|
1
1
|
module EasyML
|
2
2
|
module Data
|
3
3
|
require_relative "data/utils"
|
4
|
+
require_relative "data/polars_reader"
|
5
|
+
require_relative "data/synced_directory"
|
4
6
|
require_relative "data/preprocessor"
|
5
|
-
require_relative "data/
|
6
|
-
require_relative "data/
|
7
|
+
require_relative "data/splits"
|
8
|
+
require_relative "data/polars_column"
|
9
|
+
require_relative "data/statistics_learner"
|
10
|
+
require_relative "data/date_converter"
|
7
11
|
end
|
8
12
|
end
|
data/lib/easy_ml/engine.rb
CHANGED
@@ -1,26 +1,125 @@
|
|
1
|
+
require "aws-sdk"
|
2
|
+
require "awesome_print"
|
3
|
+
require "action_controller"
|
4
|
+
require "inertia_rails"
|
5
|
+
require "jsonapi/serializer"
|
6
|
+
require "numo/narray"
|
7
|
+
require "numpy"
|
8
|
+
require "parallel"
|
9
|
+
require "polars-df"
|
10
|
+
require "pycall"
|
11
|
+
require "optuna"
|
12
|
+
require "tailwindcss-rails"
|
13
|
+
require "wandb"
|
14
|
+
require "xgb"
|
15
|
+
require "sidekiq"
|
16
|
+
require "vite_ruby"
|
1
17
|
require "rails/engine"
|
18
|
+
require "activerecord-import"
|
19
|
+
require "historiographer"
|
2
20
|
|
3
21
|
module EasyML
|
4
22
|
class Engine < Rails::Engine
|
5
23
|
isolate_namespace EasyML
|
6
24
|
|
25
|
+
# The "easy_ml" directory under the host application's Rails.root.
def root_dir
  Rails.root.join("easy_ml")
end
|
28
|
+
|
29
|
+
config.autoload_paths += [
|
30
|
+
root.join("app/models"),
|
31
|
+
root.join("app/models/datasources"),
|
32
|
+
root.join("app/models/models"),
|
33
|
+
root.join("lib/easy_ml"),
|
34
|
+
]
|
35
|
+
|
36
|
+
config.eager_load_paths += [
|
37
|
+
root.join("app/models"),
|
38
|
+
root.join("app/models/datasources"),
|
39
|
+
root.join("app/models/models"),
|
40
|
+
root.join("lib/easy_ml"),
|
41
|
+
]
|
42
|
+
|
7
43
|
initializer "easy_ml.inflections" do
|
8
44
|
require_relative "initializers/inflections"
|
45
|
+
EasyML::Initializers::Inflections.inflect
|
46
|
+
end
|
47
|
+
|
48
|
+
initializer "easy_ml.enable_string_cache" do
|
49
|
+
Polars.enable_string_cache
|
50
|
+
end
|
51
|
+
|
52
|
+
unless %w[rake rails].include?(File.basename($0)) && %w[generate db:migrate db:drop easy_ml:migration].include?(ARGV.first)
|
53
|
+
config.after_initialize do
|
54
|
+
Dir.glob(File.expand_path("app/models/easy_ml/datasources/*.rb", EasyML::Engine.root)).each do |file|
|
55
|
+
require file
|
56
|
+
end
|
57
|
+
Dir.glob(File.expand_path("app/models/easy_ml/models/*.rb", EasyML::Engine.root)).each do |file|
|
58
|
+
require file
|
59
|
+
end
|
60
|
+
Dir.glob(File.expand_path("app/models/easy_ml/splitters/*.rb", EasyML::Engine.root)).each do |file|
|
61
|
+
require file
|
62
|
+
end
|
63
|
+
Dir.glob(File.expand_path("app/models/easy_ml/**/*.rb", EasyML::Engine.root)).each do |file|
|
64
|
+
require file
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
initializer "easy_ml.active_job_config" do
|
70
|
+
ActiveSupport.on_load(:active_job) do
|
71
|
+
self.queue_adapter = :resque
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
# This tells our demo app where to look for assets like css, js
|
76
|
+
initializer "easy_ml.assets" do |app|
|
77
|
+
if app.config.respond_to?(:assets)
|
78
|
+
app.config.assets.precompile += %w[
|
79
|
+
easy_ml/application.js
|
80
|
+
easy_ml/application.css
|
81
|
+
]
|
82
|
+
app.config.assets.paths << root.join("app", "frontend")
|
83
|
+
end
|
9
84
|
end
|
10
85
|
|
11
86
|
initializer "easy_ml.setup_generators" do |app|
|
87
|
+
generators_path = EasyML::Engine.root.join("lib/easy_ml/railtie/generators")
|
88
|
+
generators_dirs = Dir[File.join(generators_path, "**", "*.rb")]
|
89
|
+
generators_dirs.each { |file| require file }
|
90
|
+
|
12
91
|
app.config.generators do |g|
|
13
92
|
g.templates.unshift File.expand_path("../templates", __dir__)
|
14
93
|
end
|
15
94
|
end
|
16
95
|
|
17
|
-
|
18
|
-
|
19
|
-
|
96
|
+
delegate :vite_ruby, to: :class
|
97
|
+
|
98
|
+
# Memoized ViteRuby instance rooted at the engine's root.
def self.vite_ruby
  return @vite_ruby if @vite_ruby

  @vite_ruby = ViteRuby.new(root: root)
end
|
101
|
+
|
102
|
+
unless Rails.env.development?
|
103
|
+
config.app_middleware.use(Rack::Static,
|
104
|
+
urls: ["/#{vite_ruby.config.public_output_dir}"],
|
105
|
+
root: root.join(vite_ruby.config.public_dir))
|
106
|
+
end
|
107
|
+
|
108
|
+
initializer "vite_rails_engine.proxy" do |app|
|
109
|
+
if vite_ruby.run_proxy?
|
110
|
+
# Use Vite proxy in development for live assets
|
111
|
+
app.middleware.insert_before 0, ViteRuby::DevServerProxy, ssl_verify_none: true, vite_ruby: vite_ruby
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
115
|
+
initializer "vite_rails_engine.logger" do
|
116
|
+
config.after_initialize do
|
117
|
+
vite_ruby.logger = Rails.logger
|
118
|
+
end
|
119
|
+
end
|
20
120
|
|
21
|
-
|
22
|
-
|
23
|
-
require_relative "../../app/models/easy_ml/models"
|
121
|
+
# One "<name> <path spec>" string per route of the engine.
#
# @return [Array<String>]
def list_routes
  EasyML::Engine.routes.routes.map do |route|
    "#{route.name} #{route.path.spec}"
  end
end
|
25
124
|
end
|
26
125
|
end
|