easy_ml 0.2.0.pre.rc72 → 0.2.0.pre.rc76

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/datasets_controller.rb +33 -0
  3. data/app/controllers/easy_ml/datasources_controller.rb +7 -0
  4. data/app/controllers/easy_ml/models_controller.rb +38 -0
  5. data/app/frontend/components/DatasetCard.tsx +212 -0
  6. data/app/frontend/components/ModelCard.tsx +69 -29
  7. data/app/frontend/components/StackTrace.tsx +13 -0
  8. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +10 -7
  9. data/app/frontend/components/dataset/PreprocessingConfig.tsx +2 -2
  10. data/app/frontend/components/datasets/UploadDatasetButton.tsx +51 -0
  11. data/app/frontend/components/models/DownloadModelModal.tsx +90 -0
  12. data/app/frontend/components/models/UploadModelModal.tsx +212 -0
  13. data/app/frontend/components/models/index.ts +2 -0
  14. data/app/frontend/pages/DatasetsPage.tsx +36 -130
  15. data/app/frontend/pages/DatasourcesPage.tsx +22 -2
  16. data/app/frontend/pages/ModelsPage.tsx +37 -11
  17. data/app/frontend/types/dataset.ts +1 -2
  18. data/app/frontend/types.ts +1 -1
  19. data/app/jobs/easy_ml/training_job.rb +2 -2
  20. data/app/models/easy_ml/column/imputers/base.rb +4 -0
  21. data/app/models/easy_ml/column/imputers/clip.rb +5 -3
  22. data/app/models/easy_ml/column/imputers/imputer.rb +11 -13
  23. data/app/models/easy_ml/column/imputers/mean.rb +7 -3
  24. data/app/models/easy_ml/column/imputers/null_imputer.rb +3 -0
  25. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +5 -1
  26. data/app/models/easy_ml/column/imputers.rb +3 -1
  27. data/app/models/easy_ml/column/lineage/base.rb +5 -1
  28. data/app/models/easy_ml/column/lineage/computed_by_feature.rb +1 -1
  29. data/app/models/easy_ml/column/lineage/preprocessed.rb +1 -1
  30. data/app/models/easy_ml/column/lineage/raw_dataset.rb +1 -1
  31. data/app/models/easy_ml/column/selector.rb +4 -0
  32. data/app/models/easy_ml/column.rb +79 -63
  33. data/app/models/easy_ml/column_history.rb +28 -28
  34. data/app/models/easy_ml/column_list/imputer.rb +23 -0
  35. data/app/models/easy_ml/column_list.rb +39 -26
  36. data/app/models/easy_ml/dataset/learner/base.rb +34 -0
  37. data/app/models/easy_ml/dataset/learner/eager/boolean.rb +10 -0
  38. data/app/models/easy_ml/dataset/learner/eager/categorical.rb +51 -0
  39. data/app/models/easy_ml/dataset/learner/eager/query.rb +37 -0
  40. data/app/models/easy_ml/dataset/learner/eager.rb +43 -0
  41. data/app/models/easy_ml/dataset/learner/lazy/boolean.rb +13 -0
  42. data/app/models/easy_ml/dataset/learner/lazy/categorical.rb +10 -0
  43. data/app/models/easy_ml/dataset/learner/lazy/datetime.rb +19 -0
  44. data/app/models/easy_ml/dataset/learner/lazy/null.rb +17 -0
  45. data/app/models/easy_ml/dataset/learner/lazy/numeric.rb +19 -0
  46. data/app/models/easy_ml/dataset/learner/lazy/query.rb +69 -0
  47. data/app/models/easy_ml/dataset/learner/lazy/string.rb +19 -0
  48. data/app/models/easy_ml/dataset/learner/lazy.rb +51 -0
  49. data/app/models/easy_ml/dataset/learner/query.rb +25 -0
  50. data/app/models/easy_ml/dataset/learner.rb +100 -0
  51. data/app/models/easy_ml/dataset.rb +150 -36
  52. data/app/models/easy_ml/dataset_history.rb +1 -0
  53. data/app/models/easy_ml/datasource.rb +13 -5
  54. data/app/models/easy_ml/event.rb +4 -0
  55. data/app/models/easy_ml/export/column.rb +27 -0
  56. data/app/models/easy_ml/export/dataset.rb +37 -0
  57. data/app/models/easy_ml/export/datasource.rb +12 -0
  58. data/app/models/easy_ml/export/feature.rb +24 -0
  59. data/app/models/easy_ml/export/model.rb +40 -0
  60. data/app/models/easy_ml/export/retraining_job.rb +20 -0
  61. data/app/models/easy_ml/export/splitter.rb +14 -0
  62. data/app/models/easy_ml/feature.rb +21 -0
  63. data/app/models/easy_ml/import/column.rb +35 -0
  64. data/app/models/easy_ml/import/dataset.rb +148 -0
  65. data/app/models/easy_ml/import/feature.rb +36 -0
  66. data/app/models/easy_ml/import/model.rb +136 -0
  67. data/app/models/easy_ml/import/retraining_job.rb +29 -0
  68. data/app/models/easy_ml/import/splitter.rb +34 -0
  69. data/app/models/easy_ml/lineage.rb +44 -0
  70. data/app/models/easy_ml/model.rb +93 -36
  71. data/app/models/easy_ml/model_file.rb +6 -0
  72. data/app/models/easy_ml/models/xgboost/evals_callback.rb +7 -7
  73. data/app/models/easy_ml/models/xgboost.rb +33 -9
  74. data/app/models/easy_ml/retraining_job.rb +8 -1
  75. data/app/models/easy_ml/retraining_run.rb +6 -4
  76. data/app/models/easy_ml/splitter.rb +8 -0
  77. data/app/models/lineage_history.rb +6 -0
  78. data/app/serializers/easy_ml/column_serializer.rb +7 -1
  79. data/app/serializers/easy_ml/dataset_serializer.rb +2 -1
  80. data/app/serializers/easy_ml/lineage_serializer.rb +9 -0
  81. data/config/routes.rb +13 -1
  82. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +3 -3
  83. data/lib/easy_ml/core/tuner.rb +12 -11
  84. data/lib/easy_ml/data/polars_column.rb +149 -100
  85. data/lib/easy_ml/data/polars_reader.rb +8 -5
  86. data/lib/easy_ml/data/polars_schema.rb +56 -0
  87. data/lib/easy_ml/data/splits/file_split.rb +20 -2
  88. data/lib/easy_ml/data/splits/split.rb +10 -1
  89. data/lib/easy_ml/data.rb +1 -0
  90. data/lib/easy_ml/deep_compact.rb +19 -0
  91. data/lib/easy_ml/feature_store.rb +2 -6
  92. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +6 -0
  93. data/lib/easy_ml/railtie/templates/migration/add_extra_metadata_to_columns.rb.tt +9 -0
  94. data/lib/easy_ml/railtie/templates/migration/add_raw_schema_to_datasets.rb.tt +9 -0
  95. data/lib/easy_ml/railtie/templates/migration/add_unique_constraint_to_easy_ml_model_names.rb.tt +8 -0
  96. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_lineages.rb.tt +24 -0
  97. data/lib/easy_ml/railtie/templates/migration/remove_evaluator_from_retraining_jobs.rb.tt +7 -0
  98. data/lib/easy_ml/railtie/templates/migration/update_preprocessing_steps_to_jsonb.rb.tt +18 -0
  99. data/lib/easy_ml/timing.rb +34 -0
  100. data/lib/easy_ml/version.rb +1 -1
  101. data/lib/easy_ml.rb +2 -0
  102. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  103. data/public/easy_ml/assets/assets/Application-nnn_XLuL.css +1 -0
  104. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-B1qLZuyu.js +522 -0
  105. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-B1qLZuyu.js.map +1 -0
  106. metadata +52 -12
  107. data/app/models/easy_ml/column/learners/base.rb +0 -103
  108. data/app/models/easy_ml/column/learners/boolean.rb +0 -11
  109. data/app/models/easy_ml/column/learners/categorical.rb +0 -51
  110. data/app/models/easy_ml/column/learners/datetime.rb +0 -19
  111. data/app/models/easy_ml/column/learners/null.rb +0 -22
  112. data/app/models/easy_ml/column/learners/numeric.rb +0 -33
  113. data/app/models/easy_ml/column/learners/string.rb +0 -15
  114. data/public/easy_ml/assets/assets/Application-B3sRjyMT.css +0 -1
  115. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dfg-nTrB.js +0 -489
  116. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dfg-nTrB.js.map +0 -1
@@ -0,0 +1,148 @@
1
module EasyML
  module Import
    # Imports an EasyML::Dataset -- together with its datasource, splitter,
    # columns and features -- from an exported configuration.
    #
    # The entry points stay public class methods: other importers call
    # `validate` directly, and a bare `private` keyword has no effect on
    # `def self.` methods anyway.
    class Dataset
      # Keys allowed in a dataset configuration: every EasyML::Dataset column
      # except the unconfigurable ones, plus the nested association keys.
      #
      # @return [Array<Symbol>]
      def self.permitted_keys
        @permitted_keys ||= EasyML::Dataset.columns.map(&:name).map(&:to_sym) -
                            EasyML::Export::Dataset::UNCONFIGURABLE_COLUMNS.map(&:to_sym) +
                            [:columns, :features, :splitter, :datasource]
      end

      # Creates or updates a dataset from a configuration.
      #
      # @param json_config [String, Hash] JSON string or hash with a top-level "dataset" key
      # @param action [Symbol, nil] :create or :update; :create turns into :update
      #   when a dataset with the configured name already exists
      # @param dataset [EasyML::Dataset, nil] target dataset, required for :update
      # @return [EasyML::Dataset] the created or updated dataset
      # @raise [ArgumentError] when the action, target dataset or dataset config is missing/invalid
      def self.from_config(json_config, action: nil, dataset: nil)
        raise ArgumentError, "Target dataset must be specified" if action == :update && dataset.nil?

        config = json_config.is_a?(String) ? JSON.parse(json_config) : json_config
        # Work on a private, indifferent-access copy: the deletes below must not
        # strip keys from the caller's hash, and symbol-keyed input should work.
        config = config.deep_dup.with_indifferent_access
        dataset_config = config["dataset"]
        raise ArgumentError, "Dataset configuration is missing" unless dataset_config.is_a?(Hash)

        # Extract configs for related models
        datasource_config = dataset_config.delete("datasource")
        splitter_config = dataset_config.delete("splitter")
        columns_config = dataset_config.delete("columns") || []
        features_config = dataset_config.delete("features") || []

        if action == :create
          # Importing over an existing name updates that dataset instead of duplicating it.
          dataset = EasyML::Dataset.find_by(name: dataset_config["name"])
          action = :update if dataset.present?
        end
        raise ArgumentError, "Action must be specified" unless action.present?

        case action
        when :create
          create_dataset(dataset_config, datasource_config, splitter_config, columns_config, features_config)
        when :update
          update_dataset(dataset, dataset_config, columns_config, features_config)
        else
          raise ArgumentError, "Invalid action: #{action}. Must be :create or :update"
        end
      end

      # Creates the datasource (or reuses the one with the same name), then the
      # dataset, its splitter, columns and features.
      #
      # @return [EasyML::Dataset]
      # @raise [ArgumentError] when no datasource configuration is given
      def self.create_dataset(dataset_config, datasource_config, splitter_config, columns_config, features_config)
        unless datasource_config.present?
          raise ArgumentError, "Datasource configuration is required to create a dataset"
        end

        # Find-or-build by name, then persist the full configuration in one save.
        datasource = EasyML::Datasource.find_or_initialize_by(name: datasource_config["name"])
        datasource.update!(datasource_config)

        dataset = EasyML::Dataset.create!(dataset_config.merge(datasource: datasource))

        EasyML::Splitter.from_config(splitter_config, dataset) if splitter_config.present?

        columns_config.each do |column_config|
          EasyML::Column.from_config(column_config, dataset, action: :create)
        end

        features_config.each do |feature_config|
          EasyML::Feature.from_config(feature_config, dataset, action: :create)
        end

        dataset
      end

      # Updates an existing dataset in place, preserving its name and datasource,
      # and schedules a refresh when any column's drop_if_null setting changed.
      #
      # @return [EasyML::Dataset]
      def self.update_dataset(dataset, dataset_config, columns_config, features_config)
        # Update dataset attributes except name (preserve original name)
        dataset.update!(dataset_config.except("name", "datasource"))

        needs_refresh = false

        columns_config.each do |column_config|
          existing_column = dataset.columns.find_by(name: column_config["name"])

          if existing_column
            new_drop_if_null = column_config["drop_if_null"]
            # A nil value means "not configured", which never forces a refresh.
            needs_refresh ||= !new_drop_if_null.nil? && existing_column.drop_if_null != new_drop_if_null
          end

          EasyML::Column.from_config(column_config, dataset, action: :update)
        end

        # Update or create features
        features_config.each do |feature_config|
          EasyML::Feature.from_config(feature_config, dataset, action: :update)
        end

        dataset.refresh_async if needs_refresh

        dataset
      end

      # Validates a dataset configuration and its nested splitter, column and
      # feature configurations.
      #
      # @param dataset_config [Hash] string- or symbol-keyed dataset configuration
      # @return [Hash] an indifferent-access copy of the validated configuration
      # @raise [ArgumentError] on unknown keys or malformed nested configurations
      def self.validate(dataset_config)
        raise ArgumentError, "Dataset configuration must be a hash" unless dataset_config.is_a?(Hash)

        # Symbol lookups below would silently miss string keys on a plain Hash.
        dataset_config = dataset_config.with_indifferent_access

        extra_keys = dataset_config.keys.map(&:to_sym) - permitted_keys
        raise ArgumentError, "Invalid dataset keys: #{extra_keys.join(", ")}" unless extra_keys.empty?

        if dataset_config[:splitter].present?
          dataset_config[:splitter] = EasyML::Import::Splitter.validate(dataset_config[:splitter])
        end

        validate_collection(dataset_config[:columns], "column", EasyML::Import::Column)
        validate_collection(dataset_config[:features], "feature", EasyML::Import::Feature)

        dataset_config
      end

      # Checks that +entries+ is an array of hashes and validates each entry with +importer+.
      def self.validate_collection(entries, label, importer)
        return unless entries.present?

        raise ArgumentError, "#{label.capitalize}s configuration must be an array" unless entries.is_a?(Array)

        entries.each_with_index do |entry, idx|
          unless entry.is_a?(Hash)
            raise ArgumentError, "Each #{label} configuration must be a hash, at index #{idx}"
          end

          importer.validate(entry, idx)
        end
      end
      private_class_method :validate_collection
    end
  end
end
@@ -0,0 +1,36 @@
1
module EasyML
  module Import
    # Imports a single feature configuration into a dataset.
    class Feature
      # Keys allowed in a feature configuration: every EasyML::Feature column
      # except the unconfigurable ones.
      #
      # @return [Array<Symbol>]
      def self.permitted_keys
        @permitted_keys ||= EasyML::Feature.columns.map(&:name).map(&:to_sym) -
                            EasyML::Export::Feature::UNCONFIGURABLE_COLUMNS.map(&:to_sym)
      end

      # Creates or updates a feature on +dataset+.
      #
      # @param config [Hash] feature attributes, including "name"
      # @param dataset [EasyML::Dataset] dataset owning the feature
      # @param action [Symbol] :create or :update
      # @return the created or updated feature
      # @raise [ArgumentError] when +action+ is neither :create nor :update
      def self.from_config(config, dataset, action: :create)
        case action
        when :create
          dataset.features.create!(config)
        when :update
          # Only the update path needs the lookup; :create and invalid actions
          # no longer pay for a query whose result they never use.
          existing_feature = dataset.features.find_by(name: config["name"])

          # Features can be added during update, unlike columns
          return dataset.features.create!(config) unless existing_feature

          existing_feature.update!(config)
          existing_feature
        else
          raise ArgumentError, "Invalid action: #{action}. Must be :create or :update"
        end
      end

      # Rejects feature configurations containing keys outside +permitted_keys+.
      #
      # @param config [Hash] feature configuration
      # @param idx [Integer] position of the feature in the parent list (used in the error message)
      # @return [Hash] the unchanged configuration
      # @raise [ArgumentError] when unknown keys are present
      def self.validate(config, idx)
        extra_keys = config.keys.map(&:to_sym) - permitted_keys
        raise ArgumentError, "Invalid keys in feature config at index #{idx}: #{extra_keys.join(", ")}" unless extra_keys.empty?

        config
      end
    end
  end
end
@@ -0,0 +1,136 @@
1
module EasyML
  module Import
    # Imports an EasyML::Model (optionally with its dataset, retraining job and
    # weights) from an exported configuration.
    #
    # The entry points stay public class methods; a bare `private` keyword has
    # no effect on `def self.` methods.
    class Model
      # Keys allowed under "model": configurable EasyML::Model columns, the
      # model's configuration attributes, weights and the nested associations.
      #
      # @return [Array<Symbol>]
      def self.permitted_keys
        @permitted_keys ||= EasyML::Model.columns.map(&:name).map(&:to_sym) -
                            EasyML::Export::Model::UNCONFIGURABLE_COLUMNS.map(&:to_sym) +
                            [:weights] +
                            EasyML::Model.configuration_attributes.map(&:to_sym) +
                            [:dataset, :splitter, :retraining_job]
      end

      # Creates or updates a model from a configuration.
      #
      # @param json_config [String, Hash] JSON string or hash with a single top-level "model" key
      # @param action [Symbol] :create or :update
      # @param model [EasyML::Model, nil] target model, required for :update
      # @param include_dataset [Boolean] whether the nested dataset config is imported too
      # @param dataset [EasyML::Dataset, nil] dataset to attach when creating without an included dataset
      # @return [EasyML::Model]
      # @raise [ArgumentError] on missing/invalid arguments or invalid configuration keys
      def self.from_config(json_config, action: nil, model: nil, include_dataset: true, dataset: nil)
        raise ArgumentError, "Action must be specified" unless action.present?
        raise ArgumentError, "Target model must be specified" if action == :update && model.nil?
        raise ArgumentError, "Dataset must be specified when creating a model" if action == :create && !include_dataset && dataset.nil?

        config = json_config.is_a?(String) ? JSON.parse(json_config) : json_config
        config = config.deep_dup.with_indifferent_access

        # Validation only raises; the configuration used below is the local copy.
        validate(config)
        model_config = config["model"]

        # Config variables would skip custom setters, so better to manually merge
        configuration = model_config.delete("configuration")
        model_config.merge!(configuration) if configuration.present?

        case action
        when :create
          create_model(model_config, include_dataset: include_dataset, dataset: dataset)
        when :update
          update_model(model, model_config, include_dataset: include_dataset)
        else
          raise ArgumentError, "Invalid action: #{action}. Must be :create or :update"
        end
      end

      # Builds and saves a new model, importing the nested dataset when requested.
      # When the configured name is already taken, a "(Revision N)" suffix is added.
      #
      # @return [EasyML::Model]
      def self.create_model(model_config, include_dataset:, dataset:)
        model_dataset = if include_dataset && model_config["dataset"].present?
            dataset_config = { "dataset" => model_config.delete("dataset") }
            EasyML::Import::Dataset.from_config(dataset_config, action: :create)
          else
            dataset
          end

        model = EasyML::Model.new(model_config.except("weights", "dataset", "retraining_job"))
        model.dataset = model_dataset

        model_name = model_config["name"]
        model.name = generate_unique_name(model_name) if EasyML::Model.exists?(name: model_name)
        model.save!

        apply_retraining_job_and_weights(model, model_config)
      end

      # Updates an existing model (and its dataset when included), keeping its name.
      #
      # @return [EasyML::Model]
      def self.update_model(model, model_config, include_dataset:)
        if include_dataset && model_config["dataset"].present?
          dataset_config = { "dataset" => model_config.delete("dataset") }
          EasyML::Import::Dataset.from_config(dataset_config, action: :update, dataset: model.dataset)
        end

        # Update model attributes except name (preserve original name)
        model.update!(model_config.except("name", "weights", "dataset", "retraining_job"))

        apply_retraining_job_and_weights(model, model_config)
      end

      # Validates the root keys, the model keys and the nested dataset and
      # retraining job configurations.
      #
      # @param json_config [String, Hash]
      # @return [Hash] an indifferent-access copy of the validated configuration
      # @raise [ArgumentError] on unknown keys or a missing "model" entry
      def self.validate(json_config)
        config = json_config.is_a?(String) ? JSON.parse(json_config) : json_config
        config = config.deep_dup.with_indifferent_access

        # Validate root keys: must have only "model"
        extra_keys = config.keys.map(&:to_sym) - [:model]
        raise ArgumentError, "Invalid root keys: #{extra_keys.join(", ")}" unless extra_keys.empty?

        model_config = config[:model]
        raise ArgumentError, "Model configuration is missing" unless model_config.is_a?(Hash)

        # Validate that model_config does not contain keys that are unconfigurable
        extra_keys = model_config.keys.map(&:to_sym) - permitted_keys
        raise ArgumentError, "Invalid model keys: #{extra_keys.join(", ")}" unless extra_keys.empty?

        # Delegate nested validations to individual importers
        if model_config["dataset"].present?
          model_config["dataset"] = EasyML::Import::Dataset.validate(model_config["dataset"])
        end

        if model_config["retraining_job"].present?
          model_config["retraining_job"] = EasyML::Import::RetrainingJob.validate(model_config["retraining_job"])
        end

        config
      end

      # Returns "<base_name> (Revision N)" where N is one more than the highest
      # revision already used for exactly this base name.
      #
      # The SQL LIKE is only a coarse prefilter; the anchored regex decides which
      # names count, so a base name that itself contains "(Revision 2)" or LIKE
      # wildcards cannot inflate the revision number.
      #
      # @param base_name [String]
      # @return [String]
      def self.generate_unique_name(base_name)
        revision_pattern = /\A#{Regexp.escape(base_name)} \(Revision (\d+)\)\z/

        latest_revision = EasyML::Model.where("name LIKE ?", "#{base_name} (Revision %)")
                                       .map(&:name)
                                       .map { |name| name[revision_pattern, 1]&.to_i }
                                       .compact
                                       .max || 0

        "#{base_name} (Revision #{latest_revision + 1})"
      end

      # Shared tail of create/update: attaches the configured retraining job and
      # imports the configured weights, when present.
      def self.apply_retraining_job_and_weights(model, model_config)
        if model_config["retraining_job"].present?
          model.retraining_job = EasyML::RetrainingJob.from_config(model_config["retraining_job"], model)
          model.save!
          model.reload
        end

        if model_config["weights"].present?
          model.update!(weights: model_config["weights"])
          model.import
        end

        model
      end
      private_class_method :apply_retraining_job_and_weights
    end
  end
end
@@ -0,0 +1,29 @@
1
module EasyML
  module Import
    # Applies an exported retraining job configuration to a model.
    class RetrainingJob
      class << self
        # Keys allowed in a retraining job configuration: every
        # EasyML::RetrainingJob column except the unconfigurable ones.
        #
        # @return [Array<Symbol>]
        def permitted_keys
          @permitted_keys ||= begin
            column_names = EasyML::RetrainingJob.columns.map { |column| column.name.to_sym }
            column_names - EasyML::Export::RetrainingJob::UNCONFIGURABLE_COLUMNS.map(&:to_sym)
          end
        end

        # Updates the model's retraining job (as returned by
        # `model.get_retraining_job`) with +config+ and returns that job.
        def from_config(config, model)
          model.get_retraining_job.tap { |job| job.update!(config) }
        end

        # Returns nil for a blank config, otherwise checks that it is a hash
        # containing only permitted keys and returns it unchanged.
        #
        # @raise [ArgumentError] when the config is not a hash or has unknown keys
        def validate(config)
          return nil unless config.present?

          raise ArgumentError, "Retraining job configuration must be a hash" unless config.is_a?(Hash)

          unknown_keys = config.keys.map(&:to_sym) - permitted_keys
          unless unknown_keys.empty?
            raise ArgumentError, "Invalid retraining job keys: #{unknown_keys.join(", ")}"
          end

          config
        end
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
module EasyML
  module Import
    # Applies an exported splitter configuration to a dataset.
    class Splitter
      class << self
        # Keys allowed in a splitter configuration: every EasyML::Splitter
        # column except the unconfigurable ones.
        #
        # @return [Array<Symbol>]
        def permitted_keys
          @permitted_keys ||= begin
            column_names = EasyML::Splitter.columns.map { |column| column.name.to_sym }
            column_names - EasyML::Export::Splitter::UNCONFIGURABLE_COLUMNS.map(&:to_sym)
          end
        end

        # Updates the dataset's splitter when it has one, otherwise creates it.
        # Returns nil without touching the dataset when +config+ is blank.
        def from_config(config, dataset)
          return nil unless config.present?

          current_splitter = dataset.splitter
          return dataset.create_splitter!(config) unless current_splitter.present?

          current_splitter.update!(config)
          current_splitter
        end

        # Returns nil for a blank config, otherwise checks that it is a hash
        # containing only permitted keys and returns it unchanged.
        #
        # @raise [ArgumentError] when the config is not a hash or has unknown keys
        def validate(config)
          return nil unless config.present?

          raise ArgumentError, "Splitter configuration must be a hash" unless config.is_a?(Hash)

          unknown_keys = config.keys.map(&:to_sym) - permitted_keys
          unless unknown_keys.empty?
            raise ArgumentError, "Invalid splitter keys: #{unknown_keys.join(", ")}"
          end

          config
        end
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
+ # == Schema Information
2
+ #
3
+ # Table name: easy_ml_lineages
4
+ #
5
+ # id :bigint not null, primary key
6
+ # column_id :bigint not null
7
+ # key :string not null
8
+ # description :string
9
+ # occurred_at :datetime
10
+ # created_at :datetime not null
11
+ # updated_at :datetime not null
12
+ #
13
module EasyML
  # One lineage entry (key, description, occurred_at) recorded for a column.
  class Lineage < ActiveRecord::Base
    belongs_to :column

    class << self
      # Builds the lineage records for +column+ without saving them.
      #
      # Entries computed by EasyML::Column::Lineage that have no stored record
      # become new (unsaved) Lineage instances; stored records get their
      # occurred_at/description refreshed in memory from the matching entry.
      #
      # @param column [EasyML::Column]
      # @return [Array<EasyML::Lineage>] new records followed by existing records
      def learn(column)
        # Kept in a local: an instance variable here would live on the class
        # itself and be shared between calls.
        learned = EasyML::Column::Lineage.new(column).lineage

        # Load the stored records once instead of issuing one EXISTS query per entry.
        existing_records = where(column_id: column.id).to_a
        existing_keys = existing_records.map { |record| record.key.to_s }

        new_records = learned
          .reject { |entry| existing_keys.include?(entry[:key].to_s) }
          .map do |entry|
            EasyML::Lineage.new(
              column_id: column.id,
              key: entry[:key],
              occurred_at: entry[:occurred_at],
              description: entry[:description],
            )
          end

        refreshed_records = existing_records.map do |record|
          entry = learned.detect { |candidate| candidate[:key].to_s == record.key.to_s }

          # A stored record whose key is no longer produced is returned untouched
          # rather than raising on a nil entry.
          if entry
            record.assign_attributes(
              occurred_at: entry[:occurred_at],
              description: entry[:description],
            )
          end

          # Return the record itself; the value of assign_attributes is not the record.
          record
        end

        new_records.concat(refreshed_records)
      end
    end
  end
end
@@ -45,7 +45,7 @@ module EasyML
45
45
  MODEL_NAMES = MODEL_OPTIONS.keys.freeze
46
46
  MODEL_CONSTANTS = MODEL_OPTIONS.values.map(&:constantize)
47
47
 
48
- add_configuration_attributes :task, :objective, :hyperparameters, :evaluator, :callbacks, :metrics
48
+ add_configuration_attributes :task, :objective, :hyperparameters, :callbacks, :metrics
49
49
  MODEL_CONSTANTS.flat_map(&:configuration_attributes).each do |attribute|
50
50
  add_configuration_attributes attribute
51
51
  end
@@ -53,10 +53,10 @@ module EasyML
53
53
  belongs_to :dataset
54
54
  belongs_to :model_file, class_name: "EasyML::ModelFile", foreign_key: "model_file_id", optional: true
55
55
 
56
- has_one :retraining_job, class_name: "EasyML::RetrainingJob"
56
+ has_one :retraining_job, class_name: "EasyML::RetrainingJob", dependent: :destroy
57
57
  accepts_nested_attributes_for :retraining_job
58
- has_many :retraining_runs, class_name: "EasyML::RetrainingRun"
59
- has_many :deploys, class_name: "EasyML::Deploy"
58
+ has_many :retraining_runs, class_name: "EasyML::RetrainingRun", dependent: :destroy
59
+ has_many :deploys, class_name: "EasyML::Deploy", dependent: :destroy
60
60
 
61
61
  scope :deployed, -> { EasyML::ModelHistory.deployed }
62
62
 
@@ -127,26 +127,41 @@ module EasyML
127
127
  end
128
128
  end
129
129
 
130
# Whether any of this model's retraining runs has status :success.
def trained?
  retraining_runs.exists?(status: :success)
end
133
+
134
# Whether this model has an inference version.
def deployed?
  !inference_version.blank?
end
137
+
138
# Hands +weights+ to the adapter together with the model file, then saves
# the model file.
#
# @raise [ArgumentError] when the model has no model_type
def weights=(weights)
  if model_type.blank?
    raise ArgumentError, "Cannot set weights on model without type"
  end

  target_file = get_model_file
  adapter.set_weights(target_file, weights)
  save_model_file
end
145
+
146
# Returns whatever the adapter reports as the weights for the model file.
def weights
  model_adapter = adapter
  model_adapter.weights(get_model_file)
end
149
+
130
150
  def get_retraining_job
131
- if retraining_job
132
- self.evaluator = retraining_job.evaluator
133
- evaluator = self.evaluator.symbolize_keys
134
- else
135
- default_eval = Core::ModelEvaluator.default_evaluator(task)
136
- self.evaluator = default_eval
137
- evaluator = default_eval
138
- end
151
+ return retraining_job if retraining_job.present?
139
152
 
140
- retraining_job || create_retraining_job(
141
- model: self,
142
- active: false,
143
- evaluator: evaluator,
144
- metric: evaluator[:metric],
145
- direction: evaluator[:direction],
146
- threshold: evaluator[:threshold],
147
- frequency: "month",
148
- at: { hour: 0, day_of_month: 1 },
149
- )
153
+ evaluator = Core::ModelEvaluator.default_evaluator(task).symbolize_keys
154
+
155
+ method = persisted? ? :create_retraining_job : :build_retraining_job
156
+
157
+ send(method,
158
+ model: self,
159
+ active: false,
160
+ metric: evaluator[:metric],
161
+ direction: evaluator[:direction],
162
+ threshold: evaluator[:threshold],
163
+ frequency: "month",
164
+ at: { hour: 0, day_of_month: 1 })
150
165
  end
151
166
 
152
167
  def pending_run
@@ -154,6 +169,15 @@ module EasyML
154
169
  job.retraining_runs.find_or_create_by(status: "pending", model: self)
155
170
  end
156
171
 
172
# Under the model lock, wraps the pending run's training step around a block
# that simply yields this model and its hyperparameters as a hash.
def import
  lock_model do
    pending_run.wrap_training { [self, hyperparameters.to_h] }
  end
end
180
+
157
181
  def actually_train(&progress_block)
158
182
  lock_model do
159
183
  run = pending_run
@@ -193,6 +217,20 @@ module EasyML
193
217
  "training:#{self.name}:#{self.id}"
194
218
  end
195
219
 
220
# Builds the hyperparameters through the adapter and memoizes them in @hypers.
# Does nothing when the model has no model_type.
def hyperparameters=(hyperparameters)
  @hypers = adapter.build_hyperparameters(hyperparameters) if model_type.present?
end
225
+
226
# Memoized hyperparameters, built by the adapter from @hyperparameters on first use.
def hyperparameters
  return @hypers if @hypers

  @hypers = adapter.build_hyperparameters(@hyperparameters)
end
229
+
230
# Memoized callbacks, built by the adapter from @callbacks on first use.
def callbacks
  return @cbs if @cbs

  @cbs = adapter.build_callbacks(@callbacks)
end
233
+
196
234
  def hyperparameter_search(&progress_block)
197
235
  tuner = retraining_job.tuner_config.symbolize_keys
198
236
  extra_params = {
@@ -239,16 +277,11 @@ module EasyML
239
277
  alias_method :latest_version, :inference_version
240
278
  alias_method :deployed, :inference_version
241
279
 
242
- def hyperparameters
243
- @hypers ||= adapter.build_hyperparameters(@hyperparameters)
244
- end
245
-
246
- def callbacks
247
- @cbs ||= adapter.build_callbacks(@callbacks)
248
- end
249
-
250
280
  def predict(xs)
251
281
  load_model!
282
+ unless xs.is_a?(XGBoost::DMatrix)
283
+ xs = dataset.normalize(xs, inference: true)
284
+ end
252
285
  adapter.predict(xs)
253
286
  end
254
287
 
@@ -361,6 +394,10 @@ module EasyML
361
394
  dataset.decode_labels(ys, col: col)
362
395
  end
363
396
 
397
# Evaluator of the retraining job returned by get_retraining_job, falling
# back to default_evaluator when there is no job or the job has none.
def evaluator
  job_evaluator = get_retraining_job&.evaluator
  job_evaluator || default_evaluator
end
400
+
364
401
  def evaluate(y_pred: nil, y_true: nil, x_true: nil, evaluator: nil, dataset: nil)
365
402
  evaluator ||= self.evaluator
366
403
  if y_pred.nil?
@@ -373,10 +410,6 @@ module EasyML
373
410
  EasyML::Core::ModelEvaluator.evaluate(model: self, y_pred: y_pred, y_true: y_true, x_true: x_true, dataset: dataset, evaluator: evaluator)
374
411
  end
375
412
 
376
- def evaluator
377
- instance_variable_get(:@evaluator) || default_evaluator
378
- end
379
-
380
413
  def default_evaluator
381
414
  return nil unless task.present?
382
415
 
@@ -388,7 +421,7 @@ module EasyML
388
421
  end
389
422
 
390
423
  def evals
391
- last_run&.metrics || {}
424
+ (last_run&.metrics || {}).with_indifferent_access
392
425
  end
393
426
 
394
427
  def metric_accessor(metric)
@@ -543,6 +576,28 @@ module EasyML
543
576
  end
544
577
  end
545
578
 
579
# Column names listed as unconfigurable. Frozen so the shared list cannot be
# mutated at runtime.
UNCONFIGURABLE_COLUMNS = %w[
  id
  dataset_id
  model_file_id
  root_dir
  file
  sha
  last_trained_at
  is_training
  created_at
  updated_at
  slug
].freeze
592
+
593
# Exports this model through EasyML::Export::Model.
#
# @param include_dataset [Boolean] forwarded to the exporter unchanged
def to_config(include_dataset: false)
  export_options = { include_dataset: include_dataset }
  EasyML::Export::Model.to_config(self, **export_options)
end
596
+
597
# Imports a model through EasyML::Import::Model, forwarding every option unchanged.
def self.from_config(json_config, action: nil, model: nil, include_dataset: true, dataset: nil)
  import_options = {
    action: action,
    model: model,
    include_dataset: include_dataset,
    dataset: dataset,
  }
  EasyML::Import::Model.from_config(json_config, **import_options)
end
600
+
546
601
  private
547
602
 
548
603
  def default_evaluation_inputs
@@ -622,6 +677,8 @@ module EasyML
622
677
  end
623
678
 
624
679
  def validate_metrics_allowed
680
+ set_defaults if metrics.nil? || metrics.empty?
681
+
625
682
  unknown_metrics = metrics.select { |metric| allowed_metrics.exclude?(metric) }
626
683
  return unless unknown_metrics.any?
627
684
 
@@ -631,7 +688,7 @@ module EasyML
631
688
 
632
689
  def set_slug
633
690
  if slug.nil? && name.present?
634
- self.slug = name.gsub(/\s/, "_").downcase
691
+ self.slug = name.gsub(/\s/, "_").gsub(/[^a-zA-Z0-9_]/, "").downcase
635
692
  end
636
693
  end
637
694
  end
@@ -97,5 +97,11 @@ module EasyML
97
97
  def extension_allowlist
98
98
  %w[bin model json]
99
99
  end
100
+
101
# Writes +content+ to full_path (creating the parent directory first) and
# then passes that path to upload.
def write(content)
  parent_dir = File.dirname(full_path)
  FileUtils.mkdir_p(parent_dir)
  File.write(full_path, content)
  upload(full_path)
end
100
106
  end
101
107
  end
@@ -32,9 +32,9 @@ module EasyML
32
32
  false
33
33
  end
34
34
 
35
- def test_dataset
35
+ def valid_dataset
36
36
  if tuner.present?
37
- [tuner.x_true, tuner.y_true]
37
+ [tuner.x_valid, tuner.y_valid]
38
38
  else
39
39
  model.dataset.valid(split_ys: true)
40
40
  end
@@ -46,12 +46,12 @@ module EasyML
46
46
  log_frequency = 10
47
47
  if epoch % log_frequency == 0
48
48
  model.adapter.external_model = booster
49
- x_true, y_true = test_dataset
50
- @preprocessed ||= model.preprocess(x_true)
49
+ x_valid, y_valid = valid_dataset
50
+ @preprocessed ||= model.preprocess(x_valid)
51
51
  y_pred = model.predict(@preprocessed)
52
- dataset = model.dataset.test(all_columns: true)
52
+ dataset = model.dataset.valid(all_columns: true)
53
53
 
54
- metrics = model.evaluate(y_pred: y_pred, y_true: y_true, x_true: x_true, dataset: dataset)
54
+ metrics = model.evaluate(y_pred: y_pred, y_true: y_valid, x_true: x_valid, dataset: dataset)
55
55
  Wandb.log(metrics)
56
56
  end
57
57
 
@@ -67,7 +67,7 @@ module EasyML
67
67
  def after_training(booster)
68
68
  return booster unless wandb_enabled?
69
69
 
70
- if model.last_run&.wandb_url.nil?
70
+ if model.last_run.present? && model.last_run&.wandb_url.nil?
71
71
  if tuner.present? && !tuner.current_run.wandb_url.present?
72
72
  tuner.current_run.wandb_url = Wandb.current_run.url
73
73
  end