easy_ml 0.2.0.pre.rc72 → 0.2.0.pre.rc75
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/datasets_controller.rb +33 -0
- data/app/controllers/easy_ml/datasources_controller.rb +7 -0
- data/app/controllers/easy_ml/models_controller.rb +38 -0
- data/app/frontend/components/DatasetCard.tsx +212 -0
- data/app/frontend/components/ModelCard.tsx +69 -29
- data/app/frontend/components/StackTrace.tsx +13 -0
- data/app/frontend/components/dataset/FeatureConfigPopover.tsx +10 -7
- data/app/frontend/components/datasets/UploadDatasetButton.tsx +51 -0
- data/app/frontend/components/models/DownloadModelModal.tsx +90 -0
- data/app/frontend/components/models/UploadModelModal.tsx +212 -0
- data/app/frontend/components/models/index.ts +2 -0
- data/app/frontend/pages/DatasetsPage.tsx +36 -130
- data/app/frontend/pages/DatasourcesPage.tsx +22 -2
- data/app/frontend/pages/ModelsPage.tsx +37 -11
- data/app/frontend/types/dataset.ts +1 -2
- data/app/frontend/types.ts +1 -1
- data/app/jobs/easy_ml/training_job.rb +2 -2
- data/app/models/easy_ml/column/imputers/base.rb +4 -0
- data/app/models/easy_ml/column/imputers/clip.rb +5 -3
- data/app/models/easy_ml/column/imputers/imputer.rb +11 -13
- data/app/models/easy_ml/column/imputers/mean.rb +7 -3
- data/app/models/easy_ml/column/imputers/null_imputer.rb +3 -0
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +5 -1
- data/app/models/easy_ml/column/imputers.rb +3 -1
- data/app/models/easy_ml/column/lineage/base.rb +5 -1
- data/app/models/easy_ml/column/lineage/computed_by_feature.rb +1 -1
- data/app/models/easy_ml/column/lineage/preprocessed.rb +1 -1
- data/app/models/easy_ml/column/lineage/raw_dataset.rb +1 -1
- data/app/models/easy_ml/column/selector.rb +4 -0
- data/app/models/easy_ml/column.rb +79 -63
- data/app/models/easy_ml/column_history.rb +28 -28
- data/app/models/easy_ml/column_list/imputer.rb +23 -0
- data/app/models/easy_ml/column_list.rb +39 -26
- data/app/models/easy_ml/dataset/learner/base.rb +34 -0
- data/app/models/easy_ml/dataset/learner/eager/boolean.rb +10 -0
- data/app/models/easy_ml/dataset/learner/eager/categorical.rb +51 -0
- data/app/models/easy_ml/dataset/learner/eager/query.rb +37 -0
- data/app/models/easy_ml/dataset/learner/eager.rb +43 -0
- data/app/models/easy_ml/dataset/learner/lazy/boolean.rb +13 -0
- data/app/models/easy_ml/dataset/learner/lazy/categorical.rb +10 -0
- data/app/models/easy_ml/dataset/learner/lazy/datetime.rb +19 -0
- data/app/models/easy_ml/dataset/learner/lazy/null.rb +17 -0
- data/app/models/easy_ml/dataset/learner/lazy/numeric.rb +19 -0
- data/app/models/easy_ml/dataset/learner/lazy/query.rb +69 -0
- data/app/models/easy_ml/dataset/learner/lazy/string.rb +19 -0
- data/app/models/easy_ml/dataset/learner/lazy.rb +51 -0
- data/app/models/easy_ml/dataset/learner/query.rb +25 -0
- data/app/models/easy_ml/dataset/learner.rb +100 -0
- data/app/models/easy_ml/dataset.rb +150 -36
- data/app/models/easy_ml/dataset_history.rb +1 -0
- data/app/models/easy_ml/datasource.rb +9 -0
- data/app/models/easy_ml/event.rb +4 -0
- data/app/models/easy_ml/export/column.rb +27 -0
- data/app/models/easy_ml/export/dataset.rb +37 -0
- data/app/models/easy_ml/export/datasource.rb +12 -0
- data/app/models/easy_ml/export/feature.rb +24 -0
- data/app/models/easy_ml/export/model.rb +40 -0
- data/app/models/easy_ml/export/retraining_job.rb +20 -0
- data/app/models/easy_ml/export/splitter.rb +14 -0
- data/app/models/easy_ml/feature.rb +21 -0
- data/app/models/easy_ml/import/column.rb +35 -0
- data/app/models/easy_ml/import/dataset.rb +148 -0
- data/app/models/easy_ml/import/feature.rb +36 -0
- data/app/models/easy_ml/import/model.rb +136 -0
- data/app/models/easy_ml/import/retraining_job.rb +29 -0
- data/app/models/easy_ml/import/splitter.rb +34 -0
- data/app/models/easy_ml/lineage.rb +44 -0
- data/app/models/easy_ml/model.rb +93 -36
- data/app/models/easy_ml/model_file.rb +6 -0
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +7 -7
- data/app/models/easy_ml/models/xgboost.rb +33 -9
- data/app/models/easy_ml/retraining_job.rb +8 -1
- data/app/models/easy_ml/retraining_run.rb +6 -4
- data/app/models/easy_ml/splitter.rb +8 -0
- data/app/models/lineage_history.rb +6 -0
- data/app/serializers/easy_ml/column_serializer.rb +7 -1
- data/app/serializers/easy_ml/dataset_serializer.rb +2 -1
- data/app/serializers/easy_ml/lineage_serializer.rb +9 -0
- data/config/routes.rb +13 -1
- data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +3 -3
- data/lib/easy_ml/core/tuner.rb +12 -11
- data/lib/easy_ml/data/polars_column.rb +149 -100
- data/lib/easy_ml/data/polars_reader.rb +8 -5
- data/lib/easy_ml/data/polars_schema.rb +56 -0
- data/lib/easy_ml/data/splits/file_split.rb +20 -2
- data/lib/easy_ml/data/splits/split.rb +10 -1
- data/lib/easy_ml/data.rb +1 -0
- data/lib/easy_ml/deep_compact.rb +19 -0
- data/lib/easy_ml/feature_store.rb +2 -6
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +6 -0
- data/lib/easy_ml/railtie/templates/migration/add_extra_metadata_to_columns.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_raw_schema_to_datasets.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_unique_constraint_to_easy_ml_model_names.rb.tt +8 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_lineages.rb.tt +24 -0
- data/lib/easy_ml/railtie/templates/migration/remove_evaluator_from_retraining_jobs.rb.tt +7 -0
- data/lib/easy_ml/railtie/templates/migration/update_preprocessing_steps_to_jsonb.rb.tt +18 -0
- data/lib/easy_ml/timing.rb +34 -0
- data/lib/easy_ml/version.rb +1 -1
- data/lib/easy_ml.rb +2 -0
- data/public/easy_ml/assets/.vite/manifest.json +2 -2
- data/public/easy_ml/assets/assets/Application-Q7L6ioxr.css +1 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Rrzo4ecT.js +522 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Rrzo4ecT.js.map +1 -0
- metadata +52 -12
- data/app/models/easy_ml/column/learners/base.rb +0 -103
- data/app/models/easy_ml/column/learners/boolean.rb +0 -11
- data/app/models/easy_ml/column/learners/categorical.rb +0 -51
- data/app/models/easy_ml/column/learners/datetime.rb +0 -19
- data/app/models/easy_ml/column/learners/null.rb +0 -22
- data/app/models/easy_ml/column/learners/numeric.rb +0 -33
- data/app/models/easy_ml/column/learners/string.rb +0 -15
- data/public/easy_ml/assets/assets/Application-B3sRjyMT.css +0 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dfg-nTrB.js +0 -489
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dfg-nTrB.js.map +0 -1
@@ -0,0 +1,148 @@
|
|
1
|
+
module EasyML
  module Import
    # Builds or updates an EasyML::Dataset, together with its datasource,
    # splitter, columns and features, from an exported configuration.
    #
    # NOTE: every method here is a public class method. A bare `private`
    # keyword does not apply to `def self.` definitions, and `validate` is
    # called from EasyML::Import::Model, so only the new internal helper is
    # hidden (via `private_class_method`).
    class Dataset
      # Keys in a dataset config that name related records rather than
      # columns of the datasets table.
      ASSOCIATION_KEYS = %i[columns features splitter datasource].freeze

      # Keys a dataset config may contain: the dataset table's columns, minus
      # the export layer's unconfigurable columns, plus the association keys.
      #
      # @return [Array<Symbol>]
      def self.permitted_keys
        @permitted_keys ||= EasyML::Dataset.columns.map(&:name).map(&:to_sym) -
                            EasyML::Export::Dataset::UNCONFIGURABLE_COLUMNS.map(&:to_sym) +
                            ASSOCIATION_KEYS
      end

      # Creates or updates a dataset from configuration.
      #
      # With `action: :create`, a dataset that already exists under the
      # configured name is updated instead of duplicated.
      #
      # @param json_config [String, Hash] JSON text or an already-parsed hash
      #   with a "dataset" entry
      # @param action [Symbol, nil] :create or :update
      # @param dataset [EasyML::Dataset, nil] target record, required for :update
      # @return [EasyML::Dataset]
      # @raise [ArgumentError] when the action, target or "dataset" entry is
      #   missing or the action is unknown
      def self.from_config(json_config, action: nil, dataset: nil)
        raise ArgumentError, "Target dataset must be specified" if action == :update && dataset.nil?

        config = json_config.is_a?(String) ? JSON.parse(json_config) : json_config
        dataset_config = config && config["dataset"]
        raise ArgumentError, 'Configuration must contain a "dataset" entry' if dataset_config.nil?

        # Work on a copy: the deletes below must not strip keys from the
        # hash the caller handed in.
        dataset_config = dataset_config.deep_dup

        # Extract configs for related models
        datasource_config = dataset_config.delete("datasource")
        splitter_config = dataset_config.delete("splitter")
        columns_config = dataset_config.delete("columns") || []
        features_config = dataset_config.delete("features") || []

        if action == :create
          # An existing dataset with the same name turns the create into an update.
          dataset = EasyML::Dataset.find_by(name: dataset_config["name"])
          action = :update if dataset.present?
        end
        raise ArgumentError, "Action must be specified" unless action.present?

        case action
        when :create
          create_dataset(dataset_config, datasource_config, splitter_config, columns_config, features_config)
        when :update
          update_dataset(dataset, dataset_config, columns_config, features_config)
        else
          raise ArgumentError, "Invalid action: #{action}. Must be :create or :update"
        end
      end

      # Creates the datasource (found by name or created), the dataset, its
      # optional splitter, and then every configured column and feature.
      #
      # @return [EasyML::Dataset] the newly created dataset
      # @raise [ArgumentError] when no datasource configuration is given
      def self.create_dataset(dataset_config, datasource_config, splitter_config, columns_config, features_config)
        if datasource_config.nil?
          raise ArgumentError, "Datasource configuration is required to create a dataset"
        end

        datasource = EasyML::Datasource.find_or_create_by(name: datasource_config["name"]) do |ds|
          ds.assign_attributes(datasource_config)
        end
        # Also applies the config when the datasource already existed.
        datasource.update!(datasource_config)

        dataset = EasyML::Dataset.create!(dataset_config.merge(datasource: datasource))

        EasyML::Splitter.from_config(splitter_config, dataset) if splitter_config.present?

        columns_config.each do |column_config|
          EasyML::Column.from_config(column_config, dataset, action: :create)
        end

        features_config.each do |feature_config|
          EasyML::Feature.from_config(feature_config, dataset, action: :create)
        end

        dataset
      end

      # Applies configuration to an existing dataset. The dataset keeps its
      # original name, and a refresh is queued when any existing column's
      # `drop_if_null` setting changes.
      #
      # @return [EasyML::Dataset] the updated dataset
      def self.update_dataset(dataset, dataset_config, columns_config, features_config)
        dataset.update!(dataset_config.except("name", "datasource"))

        needs_refresh = false

        columns_config.each do |column_config|
          existing_column = dataset.columns.find_by(name: column_config["name"])

          if existing_column
            new_drop_if_null = column_config["drop_if_null"]
            # Only an explicit, different value counts as a change.
            needs_refresh ||= !new_drop_if_null.nil? && existing_column.drop_if_null != new_drop_if_null
          end

          EasyML::Column.from_config(column_config, dataset, action: :update)
        end

        features_config.each do |feature_config|
          EasyML::Feature.from_config(feature_config, dataset, action: :update)
        end

        dataset.refresh_async if needs_refresh

        dataset
      end

      # Checks a dataset config for unknown keys and delegates nested
      # splitter, column and feature checks to their importers.
      #
      # Nested entries are read with symbol keys, as in the original code.
      # NOTE(review): this presumably relies on the caller passing an
      # indifferent-access hash, as EasyML::Import::Model.validate does.
      #
      # @param dataset_config [Hash] the value of the "dataset" entry
      # @return [Hash] the same hash, with its splitter entry replaced by the
      #   splitter importer's validated value
      # @raise [ArgumentError] on unknown keys or malformed nested entries
      def self.validate(dataset_config)
        extra_keys = dataset_config.keys.map(&:to_sym) - permitted_keys
        raise ArgumentError, "Invalid dataset keys: #{extra_keys.join(", ")}" unless extra_keys.empty?

        if dataset_config[:splitter].present?
          dataset_config[:splitter] = EasyML::Import::Splitter.validate(dataset_config[:splitter])
        end

        validate_collection(dataset_config[:columns], "column", EasyML::Import::Column)
        validate_collection(dataset_config[:features], "feature", EasyML::Import::Feature)

        dataset_config
      end

      # Validates an optional array of per-record configs with `importer`.
      def self.validate_collection(entries, kind, importer)
        return unless entries.present?
        raise ArgumentError, "#{kind.capitalize}s configuration must be an array" unless entries.is_a?(Array)

        entries.each_with_index do |entry, idx|
          unless entry.is_a?(Hash)
            raise ArgumentError, "Each #{kind} configuration must be a hash, at index #{idx}"
          end
          importer.validate(entry, idx)
        end
      end
      private_class_method :validate_collection
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module EasyML
  module Import
    # Creates or updates a dataset's features from exported configuration.
    class Feature
      # Keys a feature config may contain: the feature table's columns minus
      # the export layer's unconfigurable columns.
      #
      # @return [Array<Symbol>]
      def self.permitted_keys
        @permitted_keys ||= EasyML::Feature.columns.map(&:name).map(&:to_sym) -
                            EasyML::Export::Feature::UNCONFIGURABLE_COLUMNS.map(&:to_sym)
      end

      # Applies one feature config to `dataset`.
      #
      # The lookup by name runs only for :update; :create and invalid
      # actions no longer issue a query whose result is never used.
      #
      # @param config [Hash] feature attributes, including "name"
      # @param dataset [#features] owner whose `features` collection is used
      # @param action [Symbol] :create or :update
      # @return [Object] the created or updated feature record
      # @raise [ArgumentError] when the action is neither :create nor :update
      def self.from_config(config, dataset, action: :create)
        case action
        when :create
          dataset.features.create!(config)
        when :update
          existing_feature = dataset.features.find_by(name: config["name"])
          # Features can be added during update, unlike columns
          return dataset.features.create!(config) unless existing_feature

          existing_feature.update!(config)
          existing_feature
        else
          raise ArgumentError, "Invalid action: #{action}. Must be :create or :update"
        end
      end

      # Rejects feature configs that carry keys outside `permitted_keys`.
      #
      # @param config [Hash] a single feature config
      # @param idx [Integer] position in the features array, used in the error message
      # @return [Hash] the config, unchanged
      # @raise [ArgumentError] when unknown keys are present
      def self.validate(config, idx)
        extra_keys = config.keys.map(&:to_sym) - permitted_keys
        unless extra_keys.empty?
          raise ArgumentError, "Invalid keys in feature config at index #{idx}: #{extra_keys.join(", ")}"
        end

        config
      end
    end
  end
end
|
@@ -0,0 +1,136 @@
|
|
1
|
+
module EasyML
  module Import
    # Builds or updates an EasyML::Model, optionally with its dataset,
    # retraining job and weights, from an exported configuration.
    #
    # NOTE: the entry points below are public class methods. A bare `private`
    # keyword does not apply to `def self.` definitions, so only the new
    # internal helpers are hidden (via `private_class_method`).
    class Model
      # Keys a model config may contain: model table columns minus the export
      # layer's unconfigurable columns, plus weights, the model's
      # configuration attributes and the nested association keys.
      #
      # @return [Array<Symbol>]
      def self.permitted_keys
        @permitted_keys ||= EasyML::Model.columns.map(&:name).map(&:to_sym) -
                            EasyML::Export::Model::UNCONFIGURABLE_COLUMNS.map(&:to_sym) +
                            [:weights] +
                            EasyML::Model.configuration_attributes.map(&:to_sym) +
                            [:dataset, :splitter, :retraining_job]
      end

      # Creates or updates a model from configuration.
      #
      # @param json_config [String, Hash] JSON text or parsed hash with a single "model" entry
      # @param action [Symbol] :create or :update
      # @param model [EasyML::Model, nil] target record, required for :update
      # @param include_dataset [Boolean] whether a nested "dataset" entry is imported too
      # @param dataset [EasyML::Dataset, nil] dataset for a created model when
      #   the nested one is not imported
      # @return [EasyML::Model]
      # @raise [ArgumentError] on a missing or invalid action, target, dataset or configuration
      def self.from_config(json_config, action: nil, model: nil, include_dataset: true, dataset: nil)
        raise ArgumentError, "Action must be specified" unless action.present?
        raise ArgumentError, "Target model must be specified" if action == :update && model.nil?
        raise ArgumentError, "Dataset must be specified when creating a model" if action == :create && !include_dataset && dataset.nil?

        # `validate` parses the input and returns its own deep copy, so the
        # mutations below never reach the caller's hash.
        config = validate(json_config)
        model_config = config["model"]

        # Config variables would skip custom setters, so better to manually merge
        configuration = model_config.delete("configuration")
        model_config.merge!(configuration) if configuration.present?

        case action
        when :create
          create_model(model_config, include_dataset: include_dataset, dataset: dataset)
        when :update
          update_model(model, model_config, include_dataset: include_dataset)
        else
          raise ArgumentError, "Invalid action: #{action}. Must be :create or :update"
        end
      end

      # Creates a model, importing its nested dataset first when requested.
      # A name that is already taken is replaced by "<name> (Revision N)".
      #
      # @return [EasyML::Model] the saved model
      def self.create_model(model_config, include_dataset:, dataset:)
        model_dataset =
          if include_dataset && model_config["dataset"].present?
            dataset_config = { "dataset" => model_config.delete("dataset") }
            EasyML::Import::Dataset.from_config(dataset_config, action: :create)
          else
            dataset
          end

        model = EasyML::Model.new(model_config.except("weights", "dataset", "retraining_job"))
        model.dataset = model_dataset

        model_name = model_config["name"]
        model.name = generate_unique_name(model_name) if EasyML::Model.exists?(name: model_name)
        model.save!

        apply_retraining_job(model, model_config["retraining_job"])
        apply_weights(model, model_config["weights"])

        model
      end

      # Updates an existing model in place. The model keeps its original name.
      #
      # @return [EasyML::Model] the updated model
      def self.update_model(model, model_config, include_dataset:)
        if include_dataset && model_config["dataset"].present?
          dataset_config = { "dataset" => model_config.delete("dataset") }
          EasyML::Import::Dataset.from_config(dataset_config, action: :update, dataset: model.dataset)
        end

        model.update!(model_config.except("name", "weights", "dataset", "retraining_job"))

        apply_retraining_job(model, model_config["retraining_job"])
        apply_weights(model, model_config["weights"])

        model
      end

      # Parses and validates a model configuration.
      #
      # @param json_config [String, Hash]
      # @return [Hash] a deep, indifferent-access copy of the configuration
      # @raise [ArgumentError] on a non-hash config, unexpected root keys, a
      #   missing "model" entry, or unknown model keys
      def self.validate(json_config)
        config = json_config.is_a?(String) ? JSON.parse(json_config) : json_config
        raise ArgumentError, "Model configuration must be a hash" unless config.is_a?(Hash)

        config = config.deep_dup.with_indifferent_access

        # Validate root keys: must have only "model"
        extra_keys = config.keys.map(&:to_sym) - [:model]
        raise ArgumentError, "Invalid root keys: #{extra_keys.join(", ")}" unless extra_keys.empty?

        model_config = config[:model]
        raise ArgumentError, 'Configuration must contain a "model" entry' unless model_config.is_a?(Hash)

        # Validate that model_config does not contain keys that are unconfigurable
        extra_keys = model_config.keys.map(&:to_sym) - permitted_keys
        raise ArgumentError, "Invalid model keys: #{extra_keys.join(", ")}" unless extra_keys.empty?

        # Delegate nested validations to individual importers
        if model_config["dataset"].present?
          model_config["dataset"] = EasyML::Import::Dataset.validate(model_config["dataset"])
        end

        if model_config["retraining_job"].present?
          model_config["retraining_job"] = EasyML::Import::RetrainingJob.validate(model_config["retraining_job"])
        end

        config
      end

      # Returns "<base_name> (Revision N)", where N is one above the highest
      # revision already stored for exactly this base name.
      #
      # LIKE wildcards in the base name are escaped, and the revision number
      # is read with a pattern anchored to the full name, so models that
      # merely resemble the base name do not affect the count.
      #
      # @param base_name [String]
      # @return [String]
      def self.generate_unique_name(base_name)
        base = base_name.to_s
        like_pattern = "#{EasyML::Model.sanitize_sql_like(base)} (Revision %)"
        revision_format = /\A#{Regexp.escape(base)} \(Revision (\d+)\)\z/

        latest = EasyML::Model.where("name LIKE ?", like_pattern)
                              .pluck(:name)
                              .filter_map { |name| name[revision_format, 1]&.to_i }
                              .max || 0

        "#{base_name} (Revision #{latest + 1})"
      end

      # Applies a nested retraining-job config to `model`, if one is given.
      def self.apply_retraining_job(model, job_config)
        return unless job_config.present?

        model.retraining_job = EasyML::RetrainingJob.from_config(job_config, model)
        model.save!
        model.reload
      end
      private_class_method :apply_retraining_job

      # Stores imported weights on `model` and runs its import step, if weights are given.
      def self.apply_weights(model, weights)
        return unless weights.present?

        model.update!(weights: weights)
        model.import
      end
      private_class_method :apply_weights
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module EasyML
  module Import
    # Applies exported retraining-job settings to a model's retraining job.
    class RetrainingJob
      class << self
        # Retraining-job table columns, minus the export layer's
        # unconfigurable columns. Memoized on the class.
        def permitted_keys
          @permitted_keys ||= begin
            column_keys = EasyML::RetrainingJob.columns.map(&:name).map(&:to_sym)
            locked_keys = EasyML::Export::RetrainingJob::UNCONFIGURABLE_COLUMNS.map(&:to_sym)
            column_keys - locked_keys
          end
        end

        # Updates the job returned by `model.get_retraining_job` with
        # `config` and returns that job.
        def from_config(config, model)
          model.get_retraining_job.tap { |job| job.update!(config) }
        end

        # Returns nil for a blank config; otherwise returns the config
        # unchanged, raising ArgumentError when it is not a hash or carries
        # keys outside `permitted_keys`.
        def validate(config)
          return nil if config.blank?

          raise ArgumentError, "Retraining job configuration must be a hash" unless config.is_a?(Hash)

          unknown_keys = config.keys.map(&:to_sym) - permitted_keys
          return config if unknown_keys.empty?

          raise ArgumentError, "Invalid retraining job keys: #{unknown_keys.join(", ")}"
        end
      end
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module EasyML
  module Import
    # Applies exported splitter settings to a dataset.
    class Splitter
      class << self
        # Splitter table columns, minus the export layer's unconfigurable
        # columns. Memoized on the class.
        def permitted_keys
          @permitted_keys ||= begin
            column_keys = EasyML::Splitter.columns.map(&:name).map(&:to_sym)
            locked_keys = EasyML::Export::Splitter::UNCONFIGURABLE_COLUMNS.map(&:to_sym)
            column_keys - locked_keys
          end
        end

        # Updates the dataset's splitter when it has one, otherwise creates
        # it. Returns the splitter, or nil when the config is blank.
        def from_config(config, dataset)
          return nil if config.blank?

          return dataset.create_splitter!(config) unless dataset.splitter.present?

          dataset.splitter.update!(config)
          dataset.splitter
        end

        # Returns nil for a blank config; otherwise returns the config
        # unchanged, raising ArgumentError when it is not a hash or carries
        # keys outside `permitted_keys`.
        def validate(config)
          return nil if config.blank?

          raise ArgumentError, "Splitter configuration must be a hash" unless config.is_a?(Hash)

          unknown_keys = config.keys.map(&:to_sym) - permitted_keys
          return config if unknown_keys.empty?

          raise ArgumentError, "Invalid splitter keys: #{unknown_keys.join(", ")}"
        end
      end
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
# == Schema Information
|
2
|
+
#
|
3
|
+
# Table name: easy_ml_lineages
|
4
|
+
#
|
5
|
+
# id :bigint not null, primary key
|
6
|
+
# column_id :bigint not null
|
7
|
+
# key :string not null
|
8
|
+
# description :string
|
9
|
+
# occurred_at :datetime
|
10
|
+
# created_at :datetime not null
|
11
|
+
# updated_at :datetime not null
|
12
|
+
#
|
13
|
+
module EasyML
  # One lineage entry (key, description, occurred_at) recorded for a column.
  class Lineage < ActiveRecord::Base
    belongs_to :column

    class << self
      # Reconciles a column's computed lineage with its stored lineage rows.
      #
      # Entries whose key is not stored yet become new, unsaved records.
      # Stored rows whose key is still in the computed lineage get their
      # `occurred_at` and `description` refreshed in memory. Nothing is
      # saved here.
      #
      # Changes from the previous implementation:
      # - the computed lineage is held in a local, not a class-level ivar
      #   shared by every call;
      # - stored rows are loaded once instead of one `exists?` query per entry;
      # - refreshed rows are returned as records, where the old code returned
      #   the result of `assign_attributes`;
      # - stored rows whose key is no longer computed are skipped, where the
      #   old code raised NoMethodError on nil.
      #
      # NOTE(review): the caller presumably bulk-upserts the returned
      # records; confirm it handles already-persisted rows in the result.
      #
      # @param column [EasyML::Column] column whose lineage is computed
      # @return [Array<EasyML::Lineage>] new records followed by refreshed existing ones
      def learn(column)
        computed = EasyML::Column::Lineage.new(column).lineage

        # First entry per key wins, matching the previous `detect` lookup.
        computed_by_key = computed.each_with_object({}) do |entry, by_key|
          by_key[entry[:key].to_s] ||= entry
        end

        existing_records = where(column_id: column.id).to_a
        existing_keys = existing_records.map { |record| record.key.to_s }

        missing_lineage = computed
          .reject { |entry| existing_keys.include?(entry[:key].to_s) }
          .map do |entry|
            new(
              column_id: column.id,
              key: entry[:key],
              occurred_at: entry[:occurred_at],
              description: entry[:description],
            )
          end

        refreshed_lineage = existing_records.filter_map do |record|
          entry = computed_by_key[record.key.to_s]
          next if entry.nil?

          record.assign_attributes(
            occurred_at: entry[:occurred_at],
            description: entry[:description],
          )
          record
        end

        missing_lineage.concat(refreshed_lineage)
      end
    end
  end
end
|
data/app/models/easy_ml/model.rb
CHANGED
@@ -45,7 +45,7 @@ module EasyML
|
|
45
45
|
MODEL_NAMES = MODEL_OPTIONS.keys.freeze
|
46
46
|
MODEL_CONSTANTS = MODEL_OPTIONS.values.map(&:constantize)
|
47
47
|
|
48
|
-
add_configuration_attributes :task, :objective, :hyperparameters, :
|
48
|
+
add_configuration_attributes :task, :objective, :hyperparameters, :callbacks, :metrics
|
49
49
|
MODEL_CONSTANTS.flat_map(&:configuration_attributes).each do |attribute|
|
50
50
|
add_configuration_attributes attribute
|
51
51
|
end
|
@@ -53,10 +53,10 @@ module EasyML
|
|
53
53
|
belongs_to :dataset
|
54
54
|
belongs_to :model_file, class_name: "EasyML::ModelFile", foreign_key: "model_file_id", optional: true
|
55
55
|
|
56
|
-
has_one :retraining_job, class_name: "EasyML::RetrainingJob"
|
56
|
+
has_one :retraining_job, class_name: "EasyML::RetrainingJob", dependent: :destroy
|
57
57
|
accepts_nested_attributes_for :retraining_job
|
58
|
-
has_many :retraining_runs, class_name: "EasyML::RetrainingRun"
|
59
|
-
has_many :deploys, class_name: "EasyML::Deploy"
|
58
|
+
has_many :retraining_runs, class_name: "EasyML::RetrainingRun", dependent: :destroy
|
59
|
+
has_many :deploys, class_name: "EasyML::Deploy", dependent: :destroy
|
60
60
|
|
61
61
|
scope :deployed, -> { EasyML::ModelHistory.deployed }
|
62
62
|
|
@@ -127,26 +127,41 @@ module EasyML
|
|
127
127
|
end
|
128
128
|
end
|
129
129
|
|
130
|
+
def trained?
|
131
|
+
retraining_runs.where(status: :success).exists?
|
132
|
+
end
|
133
|
+
|
134
|
+
def deployed?
|
135
|
+
inference_version.present?
|
136
|
+
end
|
137
|
+
|
138
|
+
def weights=(weights)
|
139
|
+
raise ArgumentError, "Cannot set weights on model without type" unless model_type.present?
|
140
|
+
|
141
|
+
model_file = get_model_file
|
142
|
+
adapter.set_weights(model_file, weights)
|
143
|
+
save_model_file
|
144
|
+
end
|
145
|
+
|
146
|
+
def weights
|
147
|
+
adapter.weights(get_model_file)
|
148
|
+
end
|
149
|
+
|
130
150
|
def get_retraining_job
|
131
|
-
if retraining_job
|
132
|
-
self.evaluator = retraining_job.evaluator
|
133
|
-
evaluator = self.evaluator.symbolize_keys
|
134
|
-
else
|
135
|
-
default_eval = Core::ModelEvaluator.default_evaluator(task)
|
136
|
-
self.evaluator = default_eval
|
137
|
-
evaluator = default_eval
|
138
|
-
end
|
151
|
+
return retraining_job if retraining_job.present?
|
139
152
|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
153
|
+
evaluator = Core::ModelEvaluator.default_evaluator(task).symbolize_keys
|
154
|
+
|
155
|
+
method = persisted? ? :create_retraining_job : :build_retraining_job
|
156
|
+
|
157
|
+
send(method,
|
158
|
+
model: self,
|
159
|
+
active: false,
|
160
|
+
metric: evaluator[:metric],
|
161
|
+
direction: evaluator[:direction],
|
162
|
+
threshold: evaluator[:threshold],
|
163
|
+
frequency: "month",
|
164
|
+
at: { hour: 0, day_of_month: 1 })
|
150
165
|
end
|
151
166
|
|
152
167
|
def pending_run
|
@@ -154,6 +169,15 @@ module EasyML
|
|
154
169
|
job.retraining_runs.find_or_create_by(status: "pending", model: self)
|
155
170
|
end
|
156
171
|
|
172
|
+
def import
|
173
|
+
lock_model do
|
174
|
+
run = pending_run
|
175
|
+
run.wrap_training do
|
176
|
+
[self, hyperparameters.to_h]
|
177
|
+
end
|
178
|
+
end
|
179
|
+
end
|
180
|
+
|
157
181
|
def actually_train(&progress_block)
|
158
182
|
lock_model do
|
159
183
|
run = pending_run
|
@@ -193,6 +217,20 @@ module EasyML
|
|
193
217
|
"training:#{self.name}:#{self.id}"
|
194
218
|
end
|
195
219
|
|
220
|
+
def hyperparameters=(hyperparameters)
|
221
|
+
return unless model_type.present?
|
222
|
+
|
223
|
+
@hypers = adapter.build_hyperparameters(hyperparameters)
|
224
|
+
end
|
225
|
+
|
226
|
+
def hyperparameters
|
227
|
+
@hypers ||= adapter.build_hyperparameters(@hyperparameters)
|
228
|
+
end
|
229
|
+
|
230
|
+
def callbacks
|
231
|
+
@cbs ||= adapter.build_callbacks(@callbacks)
|
232
|
+
end
|
233
|
+
|
196
234
|
def hyperparameter_search(&progress_block)
|
197
235
|
tuner = retraining_job.tuner_config.symbolize_keys
|
198
236
|
extra_params = {
|
@@ -239,16 +277,11 @@ module EasyML
|
|
239
277
|
alias_method :latest_version, :inference_version
|
240
278
|
alias_method :deployed, :inference_version
|
241
279
|
|
242
|
-
def hyperparameters
|
243
|
-
@hypers ||= adapter.build_hyperparameters(@hyperparameters)
|
244
|
-
end
|
245
|
-
|
246
|
-
def callbacks
|
247
|
-
@cbs ||= adapter.build_callbacks(@callbacks)
|
248
|
-
end
|
249
|
-
|
250
280
|
def predict(xs)
|
251
281
|
load_model!
|
282
|
+
unless xs.is_a?(XGBoost::DMatrix)
|
283
|
+
xs = dataset.normalize(xs, inference: true)
|
284
|
+
end
|
252
285
|
adapter.predict(xs)
|
253
286
|
end
|
254
287
|
|
@@ -361,6 +394,10 @@ module EasyML
|
|
361
394
|
dataset.decode_labels(ys, col: col)
|
362
395
|
end
|
363
396
|
|
397
|
+
def evaluator
|
398
|
+
get_retraining_job&.evaluator || default_evaluator
|
399
|
+
end
|
400
|
+
|
364
401
|
def evaluate(y_pred: nil, y_true: nil, x_true: nil, evaluator: nil, dataset: nil)
|
365
402
|
evaluator ||= self.evaluator
|
366
403
|
if y_pred.nil?
|
@@ -373,10 +410,6 @@ module EasyML
|
|
373
410
|
EasyML::Core::ModelEvaluator.evaluate(model: self, y_pred: y_pred, y_true: y_true, x_true: x_true, dataset: dataset, evaluator: evaluator)
|
374
411
|
end
|
375
412
|
|
376
|
-
def evaluator
|
377
|
-
instance_variable_get(:@evaluator) || default_evaluator
|
378
|
-
end
|
379
|
-
|
380
413
|
def default_evaluator
|
381
414
|
return nil unless task.present?
|
382
415
|
|
@@ -388,7 +421,7 @@ module EasyML
|
|
388
421
|
end
|
389
422
|
|
390
423
|
def evals
|
391
|
-
last_run&.metrics || {}
|
424
|
+
(last_run&.metrics || {}).with_indifferent_access
|
392
425
|
end
|
393
426
|
|
394
427
|
def metric_accessor(metric)
|
@@ -543,6 +576,28 @@ module EasyML
|
|
543
576
|
end
|
544
577
|
end
|
545
578
|
|
579
|
+
UNCONFIGURABLE_COLUMNS = %w(
|
580
|
+
id
|
581
|
+
dataset_id
|
582
|
+
model_file_id
|
583
|
+
root_dir
|
584
|
+
file
|
585
|
+
sha
|
586
|
+
last_trained_at
|
587
|
+
is_training
|
588
|
+
created_at
|
589
|
+
updated_at
|
590
|
+
slug
|
591
|
+
)
|
592
|
+
|
593
|
+
def to_config(include_dataset: false)
|
594
|
+
EasyML::Export::Model.to_config(self, include_dataset: include_dataset)
|
595
|
+
end
|
596
|
+
|
597
|
+
def self.from_config(json_config, action: nil, model: nil, include_dataset: true, dataset: nil)
|
598
|
+
EasyML::Import::Model.from_config(json_config, action: action, model: model, include_dataset: include_dataset, dataset: dataset)
|
599
|
+
end
|
600
|
+
|
546
601
|
private
|
547
602
|
|
548
603
|
def default_evaluation_inputs
|
@@ -622,6 +677,8 @@ module EasyML
|
|
622
677
|
end
|
623
678
|
|
624
679
|
def validate_metrics_allowed
|
680
|
+
set_defaults if metrics.nil? || metrics.empty?
|
681
|
+
|
625
682
|
unknown_metrics = metrics.select { |metric| allowed_metrics.exclude?(metric) }
|
626
683
|
return unless unknown_metrics.any?
|
627
684
|
|
@@ -631,7 +688,7 @@ module EasyML
|
|
631
688
|
|
632
689
|
def set_slug
|
633
690
|
if slug.nil? && name.present?
|
634
|
-
self.slug = name.gsub(/\s/, "_").downcase
|
691
|
+
self.slug = name.gsub(/\s/, "_").gsub(/[^a-zA-Z0-9_]/, "").downcase
|
635
692
|
end
|
636
693
|
end
|
637
694
|
end
|
@@ -32,9 +32,9 @@ module EasyML
|
|
32
32
|
false
|
33
33
|
end
|
34
34
|
|
35
|
-
def
|
35
|
+
def valid_dataset
|
36
36
|
if tuner.present?
|
37
|
-
[tuner.
|
37
|
+
[tuner.x_valid, tuner.y_valid]
|
38
38
|
else
|
39
39
|
model.dataset.valid(split_ys: true)
|
40
40
|
end
|
@@ -46,12 +46,12 @@ module EasyML
|
|
46
46
|
log_frequency = 10
|
47
47
|
if epoch % log_frequency == 0
|
48
48
|
model.adapter.external_model = booster
|
49
|
-
|
50
|
-
@preprocessed ||= model.preprocess(
|
49
|
+
x_valid, y_valid = valid_dataset
|
50
|
+
@preprocessed ||= model.preprocess(x_valid)
|
51
51
|
y_pred = model.predict(@preprocessed)
|
52
|
-
dataset = model.dataset.
|
52
|
+
dataset = model.dataset.valid(all_columns: true)
|
53
53
|
|
54
|
-
metrics = model.evaluate(y_pred: y_pred, y_true:
|
54
|
+
metrics = model.evaluate(y_pred: y_pred, y_true: y_valid, x_true: x_valid, dataset: dataset)
|
55
55
|
Wandb.log(metrics)
|
56
56
|
end
|
57
57
|
|
@@ -67,7 +67,7 @@ module EasyML
|
|
67
67
|
def after_training(booster)
|
68
68
|
return booster unless wandb_enabled?
|
69
69
|
|
70
|
-
if model.last_run&.wandb_url.nil?
|
70
|
+
if model.last_run.present? && model.last_run&.wandb_url.nil?
|
71
71
|
if tuner.present? && !tuner.current_run.wandb_url.present?
|
72
72
|
tuner.current_run.wandb_url = Wandb.current_run.url
|
73
73
|
end
|