easy_ml 0.1.4 → 0.2.0.pre.rc1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +234 -26
- data/Rakefile +45 -0
- data/app/controllers/easy_ml/application_controller.rb +67 -0
- data/app/controllers/easy_ml/columns_controller.rb +38 -0
- data/app/controllers/easy_ml/datasets_controller.rb +156 -0
- data/app/controllers/easy_ml/datasources_controller.rb +88 -0
- data/app/controllers/easy_ml/deploys_controller.rb +20 -0
- data/app/controllers/easy_ml/models_controller.rb +151 -0
- data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
- data/app/controllers/easy_ml/settings_controller.rb +59 -0
- data/app/frontend/components/AlertProvider.tsx +108 -0
- data/app/frontend/components/DatasetPreview.tsx +161 -0
- data/app/frontend/components/EmptyState.tsx +28 -0
- data/app/frontend/components/ModelCard.tsx +255 -0
- data/app/frontend/components/ModelDetails.tsx +334 -0
- data/app/frontend/components/ModelForm.tsx +384 -0
- data/app/frontend/components/Navigation.tsx +300 -0
- data/app/frontend/components/Pagination.tsx +72 -0
- data/app/frontend/components/Popover.tsx +55 -0
- data/app/frontend/components/PredictionStream.tsx +105 -0
- data/app/frontend/components/ScheduleModal.tsx +726 -0
- data/app/frontend/components/SearchInput.tsx +23 -0
- data/app/frontend/components/SearchableSelect.tsx +132 -0
- data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
- data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
- data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
- data/app/frontend/components/dataset/ColumnList.tsx +101 -0
- data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
- data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
- data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
- data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
- data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
- data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
- data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
- data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
- data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
- data/app/frontend/components/dataset/splitters/constants.ts +77 -0
- data/app/frontend/components/dataset/splitters/types.ts +168 -0
- data/app/frontend/components/dataset/splitters/utils.ts +53 -0
- data/app/frontend/components/features/CodeEditor.tsx +46 -0
- data/app/frontend/components/features/DataPreview.tsx +150 -0
- data/app/frontend/components/features/FeatureCard.tsx +88 -0
- data/app/frontend/components/features/FeatureForm.tsx +235 -0
- data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
- data/app/frontend/components/settings/PluginSettings.tsx +81 -0
- data/app/frontend/components/ui/badge.tsx +44 -0
- data/app/frontend/components/ui/collapsible.tsx +9 -0
- data/app/frontend/components/ui/scroll-area.tsx +46 -0
- data/app/frontend/components/ui/separator.tsx +29 -0
- data/app/frontend/entrypoints/App.tsx +40 -0
- data/app/frontend/entrypoints/Application.tsx +24 -0
- data/app/frontend/hooks/useAutosave.ts +61 -0
- data/app/frontend/layouts/Layout.tsx +38 -0
- data/app/frontend/lib/utils.ts +6 -0
- data/app/frontend/mockData.ts +272 -0
- data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
- data/app/frontend/pages/DatasetsPage.tsx +261 -0
- data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
- data/app/frontend/pages/DatasourcesPage.tsx +261 -0
- data/app/frontend/pages/EditModelPage.tsx +45 -0
- data/app/frontend/pages/EditTransformationPage.tsx +56 -0
- data/app/frontend/pages/ModelsPage.tsx +115 -0
- data/app/frontend/pages/NewDatasetPage.tsx +366 -0
- data/app/frontend/pages/NewModelPage.tsx +45 -0
- data/app/frontend/pages/NewTransformationPage.tsx +43 -0
- data/app/frontend/pages/SettingsPage.tsx +272 -0
- data/app/frontend/pages/ShowModelPage.tsx +30 -0
- data/app/frontend/pages/TransformationsPage.tsx +95 -0
- data/app/frontend/styles/application.css +100 -0
- data/app/frontend/types/dataset.ts +146 -0
- data/app/frontend/types/datasource.ts +33 -0
- data/app/frontend/types/preprocessing.ts +1 -0
- data/app/frontend/types.ts +113 -0
- data/app/helpers/easy_ml/application_helper.rb +10 -0
- data/app/jobs/easy_ml/application_job.rb +21 -0
- data/app/jobs/easy_ml/batch_job.rb +46 -0
- data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
- data/app/jobs/easy_ml/deploy_job.rb +13 -0
- data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
- data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
- data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
- data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
- data/app/jobs/easy_ml/training_job.rb +62 -0
- data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
- data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
- data/app/models/easy_ml/cleaner.rb +82 -0
- data/app/models/easy_ml/column.rb +124 -0
- data/app/models/easy_ml/column_history.rb +30 -0
- data/app/models/easy_ml/column_list.rb +122 -0
- data/app/models/easy_ml/concerns/configurable.rb +61 -0
- data/app/models/easy_ml/concerns/versionable.rb +19 -0
- data/app/models/easy_ml/dataset.rb +767 -0
- data/app/models/easy_ml/dataset_history.rb +56 -0
- data/app/models/easy_ml/datasource.rb +182 -0
- data/app/models/easy_ml/datasource_history.rb +24 -0
- data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
- data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
- data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
- data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
- data/app/models/easy_ml/deploy.rb +114 -0
- data/app/models/easy_ml/event.rb +79 -0
- data/app/models/easy_ml/feature.rb +437 -0
- data/app/models/easy_ml/feature_history.rb +38 -0
- data/app/models/easy_ml/model.rb +575 -41
- data/app/models/easy_ml/model_file.rb +133 -0
- data/app/models/easy_ml/model_file_history.rb +24 -0
- data/app/models/easy_ml/model_history.rb +51 -0
- data/app/models/easy_ml/models/base_model.rb +58 -0
- data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
- data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
- data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
- data/app/models/easy_ml/models/xgboost.rb +544 -5
- data/app/models/easy_ml/prediction.rb +44 -0
- data/app/models/easy_ml/retraining_job.rb +278 -0
- data/app/models/easy_ml/retraining_run.rb +184 -0
- data/app/models/easy_ml/settings.rb +37 -0
- data/app/models/easy_ml/splitter.rb +90 -0
- data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
- data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
- data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
- data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
- data/app/models/easy_ml/tuner_job.rb +56 -0
- data/app/models/easy_ml/tuner_run.rb +31 -0
- data/app/models/splitter_history.rb +6 -0
- data/app/serializers/easy_ml/column_serializer.rb +27 -0
- data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
- data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
- data/app/serializers/easy_ml/feature_serializer.rb +27 -0
- data/app/serializers/easy_ml/model_serializer.rb +90 -0
- data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
- data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
- data/app/serializers/easy_ml/settings_serializer.rb +9 -0
- data/app/views/layouts/easy_ml/application.html.erb +15 -0
- data/config/initializers/resque.rb +3 -0
- data/config/resque-pool.yml +6 -0
- data/config/routes.rb +39 -0
- data/config/spring.rb +1 -0
- data/config/vite.json +15 -0
- data/lib/easy_ml/configuration.rb +64 -0
- data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
- data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
- data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
- data/lib/easy_ml/core/model_evaluator.rb +161 -89
- data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
- data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
- data/lib/easy_ml/core/tuner.rb +123 -62
- data/lib/easy_ml/core.rb +0 -3
- data/lib/easy_ml/core_ext/hash.rb +24 -0
- data/lib/easy_ml/core_ext/pathname.rb +11 -5
- data/lib/easy_ml/data/date_converter.rb +90 -0
- data/lib/easy_ml/data/filter_extensions.rb +31 -0
- data/lib/easy_ml/data/polars_column.rb +126 -0
- data/lib/easy_ml/data/polars_reader.rb +297 -0
- data/lib/easy_ml/data/preprocessor.rb +280 -142
- data/lib/easy_ml/data/simple_imputer.rb +255 -0
- data/lib/easy_ml/data/splits/file_split.rb +252 -0
- data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
- data/lib/easy_ml/data/splits/split.rb +95 -0
- data/lib/easy_ml/data/splits.rb +9 -0
- data/lib/easy_ml/data/statistics_learner.rb +93 -0
- data/lib/easy_ml/data/synced_directory.rb +341 -0
- data/lib/easy_ml/data.rb +6 -2
- data/lib/easy_ml/engine.rb +105 -6
- data/lib/easy_ml/feature_store.rb +227 -0
- data/lib/easy_ml/features.rb +61 -0
- data/lib/easy_ml/initializers/inflections.rb +17 -3
- data/lib/easy_ml/logging.rb +2 -2
- data/lib/easy_ml/predict.rb +74 -0
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
- data/lib/easy_ml/support/est.rb +5 -1
- data/lib/easy_ml/support/file_rotate.rb +79 -15
- data/lib/easy_ml/support/file_support.rb +9 -0
- data/lib/easy_ml/support/local_file.rb +24 -0
- data/lib/easy_ml/support/lockable.rb +62 -0
- data/lib/easy_ml/support/synced_file.rb +103 -0
- data/lib/easy_ml/support/utc.rb +5 -1
- data/lib/easy_ml/support.rb +6 -3
- data/lib/easy_ml/version.rb +4 -1
- data/lib/easy_ml.rb +7 -2
- metadata +355 -72
- data/app/models/easy_ml/models.rb +0 -5
- data/lib/easy_ml/core/model.rb +0 -30
- data/lib/easy_ml/core/model_core.rb +0 -181
- data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
- data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
- data/lib/easy_ml/core/models/xgboost.rb +0 -10
- data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
- data/lib/easy_ml/core/models.rb +0 -10
- data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
- data/lib/easy_ml/core/uploaders.rb +0 -7
- data/lib/easy_ml/data/dataloader.rb +0 -6
- data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
- data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
- data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
- data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
- data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
- data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
- data/lib/easy_ml/data/dataset/splits.rb +0 -11
- data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
- data/lib/easy_ml/data/dataset/splitters.rb +0 -9
- data/lib/easy_ml/data/dataset.rb +0 -430
- data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
- data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
- data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
- data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
- data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
- data/lib/easy_ml/data/datasource.rb +0 -33
- data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
- data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
- data/lib/easy_ml/deployment.rb +0 -5
- data/lib/easy_ml/support/synced_directory.rb +0 -134
- data/lib/easy_ml/transforms.rb +0 -29
- /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -0,0 +1,79 @@
|
|
1
|
+
# == Schema Information
|
2
|
+
#
|
3
|
+
# Table name: easy_ml_events
|
4
|
+
#
|
5
|
+
# id :bigint not null, primary key
|
6
|
+
# name :string not null
|
7
|
+
# status :string not null
|
8
|
+
# eventable_type :string
|
9
|
+
# eventable_id :bigint
|
10
|
+
# stacktrace :text
|
11
|
+
# created_at :datetime not null
|
12
|
+
# updated_at :datetime not null
|
13
|
+
#
|
14
|
+
module EasyML
  # Persisted record of something that happened to another record (the
  # polymorphic `eventable`), stored in the `easy_ml_events` table.
  #
  # Each event carries a `name` (the demodulized class name of the record or
  # worker it describes), a `status` restricted to STATUSES, and an optional
  # formatted `stacktrace`.
  class Event < ActiveRecord::Base
    # Maximum width, in characters, of each stored stack-trace line.
    MAX_LINE_LENGTH = 65
    self.table_name = "easy_ml_events"

    # The only values accepted by the `status` validation.
    STATUSES = %w[started success failed].freeze

    belongs_to :eventable, polymorphic: true, optional: true

    validates :name, presence: true
    validates :status, presence: true, inclusion: { in: STATUSES }

    # Returns the class name of +worker_class+ without its namespace,
    # e.g. "EasyML::TrainingJob" -> "TrainingJob".
    def self.worker_name(worker_class)
      worker_class.to_s.demodulize
    end

    # Scopes to help query events
    scope :for_worker, ->(worker_class) { where(name: worker_name(worker_class)) }
    scope :started, -> { where(status: "started") }
    scope :succeeded, -> { where(status: "success") }
    scope :failed, -> { where(status: "failed") }

    # Creates (and raises on validation failure) an event for +model+.
    #
    # @param model [Object] record the event is attached to; its demodulized
    #   class name becomes the event name
    # @param status [String] one of STATUSES
    # @param error [Exception, nil] optional error whose formatted stack trace
    #   is stored on the event
    # @return [EasyML::Event]
    def self.create_event(model, status, error = nil)
      EasyML::Event.create!(
        name: worker_name(model.class.name),
        status: status,
        eventable: model,
        stacktrace: format_stacktrace(error),
      )
    end

    # Records a "failed" event for +model+ and logs the failure.
    #
    # @param model [Object] record that failed
    # @param error [Exception, String] the failure; a String is converted
    #   into a RuntimeError
    def self.handle_error(model, error)
      if error.is_a?(String)
        # Raise and immediately rescue so the resulting exception carries a
        # backtrace pointing at this call site.
        begin
          raise error
        rescue StandardError => e
          error = e
        end
      end
      create_event(model, "failed", error)
      # Inside a class method `self.class.name` is always "Class"; name the
      # class of the record that actually failed instead.
      Rails.logger.error("#{model.class.name} failed: #{error.message}")
    end

    # Builds the text stored in the `stacktrace` column: the inspected error
    # followed by the backtrace entries that mention "easy_ml", with runs of
    # whitespace collapsed and every line wrapped to MAX_LINE_LENGTH.
    #
    # @param error [Exception, nil]
    # @return [String, nil] nil when +error+ is nil
    def self.format_stacktrace(error)
      return nil if error.nil?

      topline = error.inspect

      # Exception#backtrace is nil for an exception that was never raised.
      stacktrace = Array(error.backtrace).select { |loc| loc.match?(/easy_ml/) }

      [topline, *stacktrace]
        .flat_map { |entry| entry.split("\n") }
        .map { |line| line.gsub(/\s{2,}/, " ").strip }
        .flat_map { |line| wrap_text(line, MAX_LINE_LENGTH) }
        .join("\n")
    end

    # Splits +text+ into chunks of at most +max_length+ characters.
    # Returns an empty array for blank text, so blank lines are dropped.
    #
    # @return [Array<String>]
    def self.wrap_text(text, max_length)
      text.strip.scan(/.{1,#{max_length}}/)
    end
  end
end
|
@@ -0,0 +1,437 @@
|
|
1
|
+
# == Schema Information
|
2
|
+
#
|
3
|
+
# Table name: easy_ml_features
|
4
|
+
#
|
5
|
+
# id :bigint not null, primary key
|
6
|
+
# dataset_id :bigint not null
|
7
|
+
# name :string
|
8
|
+
# version :bigint
|
9
|
+
# feature_class :string not null
|
10
|
+
# feature_position :integer
|
11
|
+
# batch_size :integer
|
12
|
+
# needs_fit :boolean
|
13
|
+
# sha :string
|
14
|
+
# primary_key :string is an Array
|
15
|
+
# applied_at :datetime
|
16
|
+
# fit_at :datetime
|
17
|
+
# refresh_every :bigint
|
18
|
+
# created_at :datetime not null
|
19
|
+
# updated_at :datetime not null
|
20
|
+
#
|
21
|
+
module EasyML
  # One feature attached to a dataset, stored in `easy_ml_features`.
  #
  # The record points (via `feature_class`) at a user-defined feature class
  # looked up through EasyML::Features::Registry. An instance of that class
  # (the `adapter`) supplies `transform`, and optionally `fit` and `batch`.
  # Computed values are read and written through an EasyML::FeatureStore.
  class Feature < ActiveRecord::Base
    self.table_name = "easy_ml_features"
    include Historiographer::Silent
    historiographer_mode :snapshot_only

    class << self
      # Returns the SHA256 of the source file that defines
      # +feature_class+#transform. The digest is cached in Rails.cache keyed
      # by file path and is recomputed whenever the file's mtime changes.
      #
      # @param feature_class [String] constant name of the feature class
      # @return [String] hex digest
      def compute_sha(feature_class)
        require "digest" # loaded lazily; only needed when hashing
        path = feature_class.constantize.instance_method(:transform).source_location.first
        current_mtime = File.mtime(path)
        cache_key = "feature_sha/#{path}"

        cached = Rails.cache.read(cache_key)
        return cached[:sha] if cached && cached[:mtime] == current_mtime

        # Compute new SHA and cache it with the current mtime
        sha = Digest::SHA256.hexdigest(File.read(path))
        Rails.cache.write(cache_key, { sha: sha, mtime: current_mtime })
        sha
      end

      # Removes every cached feature SHA.
      def clear_sha_cache!
        Rails.cache.delete_matched("feature_sha/*")
      end
    end

    belongs_to :dataset, class_name: "EasyML::Dataset"

    validates :feature_class, presence: true
    validates :feature_position, presence: true, numericality: { only_integer: true, greater_than_or_equal_to: 0 }
    before_validation :set_feature_position, on: :create

    scope :ordered, -> { order(feature_position: :asc) }

    # Fittable features flagged `needs_fit`, or whose stored sha is missing or
    # differs from the current sha of their feature class source file.
    scope :has_changes, lambda {
      # Get all unique feature classes
      feature_classes = pluck(:feature_class).uniq

      # Build conditions for each feature class
      conditions = feature_classes.map do |klass|
        current_sha = compute_sha(klass)
        sanitize_sql_array(["(feature_class = ? AND (sha IS NULL OR sha != ?))", klass, current_sha])
      end

      # Combine all conditions with OR, then keep only features whose adapter
      # implements #fit (this loads the candidate records).
      candidates = where(needs_fit: true).or(where(conditions.join(" OR ")))
      where(id: candidates.select { |f| f.adapter.respond_to?(:fit) }.map(&:id))
    }
    scope :never_applied, -> { where(applied_at: nil) }

    # Features with no `fit_at` whose adapter implements #fit.
    scope :never_fit, -> do
      fittable = where(fit_at: nil)
      fittable = fittable.select { |f| f.adapter.respond_to?(:fit) }
      where(id: fittable.map(&:id))
    end
    scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }

    before_save :apply_defaults, if: :new_record?
    before_save :update_sha
    after_find :update_from_feature_class
    before_save :update_from_feature_class

    # Memoized instance of the registered feature class.
    #
    # @raise [InvalidFeatureError] when the feature class cannot be resolved
    def adapter
      @adapter ||= begin
        klass = feature_klass
        raise InvalidFeatureError, "Invalid feature class: #{feature_class}" if klass.nil?

        klass.new
      end
    end

    # Human-readable reasons why this feature must be (re)fit. Always empty
    # when the adapter does not implement #fit.
    #
    # @return [Array<String>]
    def fit_reasons
      return [] unless adapter.respond_to?(:fit)

      {
        "Needs fit manually set" => read_attribute(:needs_fit),
        "Datasource was refreshed" => datasource_was_refreshed?,
        "Code changed" => code_changed?,
        "Cache expired" => cache_expired?,
      }.select { |_reason, active| active }.keys
    end

    alias_method :refresh_reasons, :fit_reasons

    # True when at least one fit reason applies.
    def needs_fit?
      fit_reasons.any?
    end

    # True when `refresh_every` (seconds) has elapsed since `fit_at`.
    # False when either value is missing.
    def cache_expired?
      return false if refresh_every.nil? || fit_at.nil?

      fit_at < refresh_every.seconds.ago
    end

    # True when the stored sha differs from the current sha of the feature
    # class source file.
    def code_changed?
      current_sha = self.class.compute_sha(feature_class)
      sha != current_sha
    end

    # True when the feature was never fit, or the dataset's datasource was
    # refreshed after the last fit.
    def datasource_was_refreshed?
      return true if fit_at.nil?
      return false if dataset.datasource.refreshed_at.nil?

      dataset.datasource.refreshed_at > fit_at
    end

    # True when work can be split into batches: the adapter implements
    # #batch, or a batch size is known and the primary key is numeric.
    def batchable?
      adapter.respond_to?(:batch) || (batch_size.present? &&
                                      numeric_primary_key?)
    end

    # True when the feature class asks for batching (adapter implements
    # #batch, or its config declares a batch size).
    def should_be_batchable?
      adapter.respond_to?(:batch) || config.dig(:batch_size).present?
    end

    # Checks, on a one-row sample of the raw dataset, whether every primary
    # key value is numeric (a Numeric, or a String that looks like a number).
    #
    # @raise [RuntimeError] when no primary key is set but batching is expected
    def numeric_primary_key?
      if primary_key.nil?
        return false unless should_be_batchable?
        raise "Couldn't find primary key for feature #{feature_class}, check your feature class"
      end

      dataset.raw.data(limit: 1, select: primary_key)[primary_key].to_a.flat_map(&:values).all? do |value|
        case value
        when String then value.match?(/\A[-+]?\d+(\.\d+)?\z/)
        else
          value.is_a?(Numeric)
        end
      end
    end

    # Returns the job argument hashes for fitting this feature: one per batch
    # when batchable, otherwise a single hash containing only the feature id.
    #
    # @return [Array<Hash>]
    def build_batches
      if batchable?
        batch
      else
        [{ feature_id: id }]
      end
    end

    # Splits the primary-key range of the raw dataset into windows of
    # `batch_size` and returns one hash per window
    # (`feature_id`, `batch_start`, inclusive `batch_end`).
    #
    # @return [Array<Hash>] empty when there is no data to batch
    # @raise [RuntimeError] when the primary key is missing or cannot be read
    def batch
      reader = dataset.raw

      if adapter.respond_to?(:batch)
        array = adapter.batch(reader, self)
        min_id = array.min
        max_id = array.max
      else
        # Checked outside the begin/rescue below: raising inside it would be
        # caught by the handler, which dereferences primary_key.first and
        # would fail on a nil primary key.
        unless primary_key.present?
          raise "Couldn't find primary key for feature #{feature_class}, check your feature class"
        end

        # Get all primary keys
        begin
          df = reader.query(select: [primary_key.first])
        rescue => e
          raise "Couldn't find primary key #{primary_key.first} for feature #{feature_class}: #{e.message}"
        end
        return [] if df.nil?

        min_id = df[primary_key.first].min
        max_id = df[primary_key.first].max
      end

      # No ids at all (empty source): nothing to batch.
      return [] if min_id.nil? || max_id.nil?

      (min_id..max_id).step(batch_size).map do |batch_start|
        batch_end = [batch_start + batch_size, max_id + 1].min - 1
        {
          feature_id: id,
          batch_start: batch_start,
          batch_end: batch_end,
        }
      end
    end

    # Wipes this feature's feature store.
    def wipe
      feature_store.wipe
    end

    # Fits the given features, either by enqueueing one ComputeFeatureJob
    # batch or by running each job inline.
    #
    # @param features [Array<EasyML::Feature>] defaults to this feature only
    # @param async [Boolean] enqueue instead of running inline
    def fit(features: [self], async: false)
      jobs = features.flat_map(&:build_batches)
      if async
        EasyML::ComputeFeatureJob.enqueue_batch(jobs)
      else
        jobs.each do |job|
          EasyML::ComputeFeatureJob.perform(nil, job)
        end
      end
    end

    # Fit a single batch, used for testing the user's feature implementation.
    # When +batch_args+ has no :batch_start, a batch is picked via
    # #get_batch_args.
    def fit_batch(batch_args = {})
      batch_args.symbolize_keys!
      if batch_args.key?(:batch_start)
        actually_fit_batch(batch_args)
      else
        actually_fit_batch(get_batch_args(batch_args))
      end
    end

    # Transform a single batch, used for testing the user's feature
    # implementation. Uses +df+ when given, otherwise loads a batch chosen
    # via #get_batch_args.
    def transform_batch(df = nil, batch_args = {})
      if df.present?
        actually_transform_batch(df)
      else
        actually_transform_batch(build_batch(get_batch_args(batch_args)))
      end
    end

    # Picks one of #build_batches: a random one unless +batch_args+ has
    # `random: false`, in which case the first. Does not modify +batch_args+.
    #
    # @return [Hash, nil] nil when there are no batches
    def get_batch_args(batch_args = {})
      batches = build_batches
      batch_args.fetch(:random, true) ? batches.sample : batches.first
    end

    # Queries the raw dataset for one batch. With both :batch_start and
    # :batch_end present, rows are filtered to that primary-key range and,
    # when the feature declares `needs_columns`, limited to those columns;
    # otherwise the query is unrestricted.
    def build_batch(batch_args = {})
      batch_start = batch_args.dig(:batch_start)
      batch_end = batch_args.dig(:batch_end)

      if batch_start && batch_end
        select = needs_columns.present? ? needs_columns : nil
        filter = Polars.col(primary_key.first).is_between(batch_start, batch_end)
        params = {
          select: select,
          filter: filter,
        }.compact
      else
        params = {}
      end
      dataset.raw.query(**params)
    end

    # Runs the adapter's #fit for one batch, stores the result in the feature
    # store and marks the feature as applied.
    #
    # Adapters that implement #batch receive the raw dataset reader; all
    # others receive the dataframe for the batch.
    #
    # @return [Object, false] the dataframe returned by #fit, or false when
    #   the adapter does not implement #fit
    # @raise [RuntimeError] when #fit returns nothing
    def actually_fit_batch(batch_args = {})
      return false unless adapter.respond_to?(:fit)

      batch_args.symbolize_keys!

      input = adapter.respond_to?(:batch) ? dataset.raw : build_batch(batch_args)
      batch_df = adapter.fit(input, self, batch_args)
      raise "Feature #{feature_class}#fit must return a dataframe" unless batch_df.present?

      store(batch_df)
      update!(applied_at: Time.current, needs_fit: false)
      batch_df
    end

    # Applies the adapter's #transform to +df+ and stamps `applied_at`.
    # Returns +df+ untouched when the feature is fittable but nothing has
    # been stored yet, and nil when +df+ is blank.
    def actually_transform_batch(df)
      return nil unless df.present?
      return df if adapter.respond_to?(:fit) && feature_store.empty?

      result = adapter.transform(df, self)
      update!(applied_at: Time.current)
      result
    end

    # SHA of this feature's class source file (see .compute_sha).
    def compute_sha
      self.class.compute_sha(feature_class)
    end

    # Position manipulation methods

    # Saves the feature at its current position.
    def insert
      save!
      self
    end

    # Shared implementation of the positional inserts. Yields the position of
    # the dataset feature whose class is +feature_class+ (nil when absent) so
    # the caller can set this feature's position, shifts every feature after
    # that position by one, then persists all positions.
    def insert_where(feature_class)
      features = dataset.features.reload
      target = features.detect { |t| t.feature_class == feature_class.to_s }
      target_position = target&.feature_position
      yield target_position
      features.select { |t| target_position.nil? || t.feature_position > target_position }.each { |t| t.feature_position += 1 }
      features += [self]

      bulk_update_positions(features)
      self
    end

    # Inserts this feature before all others.
    def prepend
      insert_where(nil) do |_position|
        self.feature_position = 0
      end
    end

    # Inserts this feature immediately before the feature of +feature_class+.
    #
    # @raise [ArgumentError] when the dataset has no such feature
    def insert_before(feature_class)
      insert_where(feature_class) do |position|
        ensure_anchor_found!(position, feature_class)
        self.feature_position = position - 1
      end
    end

    # Inserts this feature immediately after the feature of +feature_class+.
    #
    # @raise [ArgumentError] when the dataset has no such feature
    def insert_after(feature_class)
      insert_where(feature_class) do |position|
        ensure_anchor_found!(position, feature_class)
        self.feature_position = position + 1
      end
    end

    # Increments `version` (in memory) and copies the feature store contents
    # from the old version to the new one.
    def bump_version
      old_version = version
      write_attribute(:version, version + 1)
      feature_store.cp(old_version, version)
      self
    end

    # Fills in `name` (titleized class name) and `version` (1) when unset.
    def apply_defaults
      self.name ||= self.feature_class.demodulize.titleize
      self.version ||= 1
    end

    # Columns the feature class declares it needs; empty when undeclared.
    def needs_columns
      config.dig(:needs_columns) || []
    end

    # Memoized feature store for this feature.
    def feature_store
      @feature_store ||= EasyML::FeatureStore.new(self)
    end

    # Delegates to the feature store's #upload_remote_files.
    def upload_remote_files
      feature_store.upload_remote_files
    end

    # Partitions currently held by the feature store.
    def files
      feature_store.list_partitions
    end

    # Queries the feature store, optionally filtered.
    def query(filter: nil)
      feature_store.query(filter: filter)
    end

    # Writes +df+ to the feature store.
    def store(df)
      feature_store.store(df)
    end

    # Batch size: the stored column, else the feature class config, else
    # 10_000 when the feature should be batchable, else nil.
    def batch_size
      read_attribute(:batch_size) ||
        config.dig(:batch_size) ||
        (should_be_batchable? ? 10_000 : nil)
    end

    private

    # Guards the relative inserts against a missing anchor feature, which
    # would otherwise surface as arithmetic on nil.
    def ensure_anchor_found!(position, feature_class)
      return unless position.nil?

      raise ArgumentError, "Cannot position relative to #{feature_class}: no such feature on this dataset"
    end

    # Renumbers and persists feature positions: existing rows are upserted on
    # `feature_position` only, new rows are imported in full.
    def bulk_update_positions(features)
      # Use activerecord-import for bulk updates
      features = order_features(features)
      features.each(&:apply_defaults)
      new_features = features.reject(&:persisted?)
      existing_features = features.select(&:persisted?)
      Feature.import(
        existing_features,
        on_duplicate_key_update: [:feature_position],
        validate: false,
      )
      Feature.import(new_features)
    end

    # Sorts by position and renumbers 0..n-1. Ties keep their incoming order
    # (sort_by alone gives no such guarantee), so a feature appended last
    # lands after an existing feature that shares its position.
    def order_features(features)
      ordered = features.each_with_index
                        .sort_by { |feature, index| [feature.feature_position, index] }
                        .map(&:first)
      ordered.each_with_index do |feature, index|
        feature.feature_position = index
      end
    end

    # Defaults a new feature to the position after the dataset's last one.
    def set_feature_position
      return if feature_position.present?

      max_feature_position = dataset&.features&.maximum(:feature_position) || -1
      self.feature_position = max_feature_position + 1
    end

    # Stores the current source sha and flags the feature for refit when the
    # sha has changed.
    def update_sha
      new_sha = compute_sha
      if new_sha != self.sha
        self.sha = new_sha
        self.needs_fit = true
      end
    end

    # Syncs batch_size, primary_key and refresh_every from the feature class
    # config. A changed batch size flags the feature for refit.
    def update_from_feature_class
      if read_attribute(:batch_size) != config.dig(:batch_size)
        write_attribute(:batch_size, config.dig(:batch_size))
        self.needs_fit = true
      end

      if self.primary_key != config.dig(:primary_key)
        self.primary_key = [config.dig(:primary_key)].flatten
      end

      new_refresh_every = config.dig(:refresh_every)
      self.refresh_every = new_refresh_every.to_i if new_refresh_every
    end

    # Class registered for `feature_class` in EasyML::Features::Registry, or
    # nil when no entry (or no :feature_class in the entry) is found.
    # NOTE(review): assumes Registry.find returns a Hash-like entry or nil —
    # confirm against the registry implementation.
    def feature_klass
      @feature_klass ||= EasyML::Features::Registry.find(feature_class.to_s)&.dig(:feature_class)&.constantize
    end

    # First entry of the feature class's `features` declaration.
    #
    # @raise [RuntimeError] when the feature class is not registered
    def config
      raise "Feature not found: #{feature_class}" unless feature_klass
      feature_klass.features&.first
    end
  end

  # Raised when a feature's `feature_class` cannot be resolved.
  class InvalidFeatureError < StandardError; end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# == Schema Information
|
2
|
+
#
|
3
|
+
# Table name: easy_ml_feature_histories
|
4
|
+
#
|
5
|
+
# id :bigint not null, primary key
|
6
|
+
# feature_id :integer not null
|
7
|
+
# dataset_id :integer not null
|
8
|
+
# name :string
|
9
|
+
# version :integer
|
10
|
+
# feature_class :string not null
|
11
|
+
# feature_position :integer
|
12
|
+
# batch_size :integer
|
13
|
+
# needs_fit :boolean
|
14
|
+
# sha :string
|
15
|
+
# primary_key :string
|
16
|
+
# applied_at :datetime
|
17
|
+
# fit_at :datetime
|
18
|
+
# refresh_every :integer
|
19
|
+
# created_at :datetime not null
|
20
|
+
# updated_at :datetime not null
|
21
|
+
# history_started_at :datetime not null
|
22
|
+
# history_ended_at :datetime
|
23
|
+
# history_user_id :integer
|
24
|
+
# snapshot_id :string
|
25
|
+
#
|
26
|
+
module EasyML
  # History rows for features, stored in `easy_ml_feature_histories` and
  # wired up through Historiographer::History.
  class FeatureHistory < ActiveRecord::Base
    self.table_name = "easy_ml_feature_histories"
    include Historiographer::History

    # Every loaded history row triggers a download of its remote files.
    after_find :download_remote_files
    scope :ordered, lambda { order(feature_position: :asc) }

    # Downloads via the feature store, doing nothing when there is none.
    # NOTE(review): `feature_store` is not defined in this class; presumably
    # it is supplied through Historiographer::History — confirm.
    def download_remote_files
      store = feature_store
      store.download unless store.nil?
    end
  end
end
|