easy_ml 0.2.0.pre.rc72 → 0.2.0.pre.rc75

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/datasets_controller.rb +33 -0
  3. data/app/controllers/easy_ml/datasources_controller.rb +7 -0
  4. data/app/controllers/easy_ml/models_controller.rb +38 -0
  5. data/app/frontend/components/DatasetCard.tsx +212 -0
  6. data/app/frontend/components/ModelCard.tsx +69 -29
  7. data/app/frontend/components/StackTrace.tsx +13 -0
  8. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +10 -7
  9. data/app/frontend/components/datasets/UploadDatasetButton.tsx +51 -0
  10. data/app/frontend/components/models/DownloadModelModal.tsx +90 -0
  11. data/app/frontend/components/models/UploadModelModal.tsx +212 -0
  12. data/app/frontend/components/models/index.ts +2 -0
  13. data/app/frontend/pages/DatasetsPage.tsx +36 -130
  14. data/app/frontend/pages/DatasourcesPage.tsx +22 -2
  15. data/app/frontend/pages/ModelsPage.tsx +37 -11
  16. data/app/frontend/types/dataset.ts +1 -2
  17. data/app/frontend/types.ts +1 -1
  18. data/app/jobs/easy_ml/training_job.rb +2 -2
  19. data/app/models/easy_ml/column/imputers/base.rb +4 -0
  20. data/app/models/easy_ml/column/imputers/clip.rb +5 -3
  21. data/app/models/easy_ml/column/imputers/imputer.rb +11 -13
  22. data/app/models/easy_ml/column/imputers/mean.rb +7 -3
  23. data/app/models/easy_ml/column/imputers/null_imputer.rb +3 -0
  24. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +5 -1
  25. data/app/models/easy_ml/column/imputers.rb +3 -1
  26. data/app/models/easy_ml/column/lineage/base.rb +5 -1
  27. data/app/models/easy_ml/column/lineage/computed_by_feature.rb +1 -1
  28. data/app/models/easy_ml/column/lineage/preprocessed.rb +1 -1
  29. data/app/models/easy_ml/column/lineage/raw_dataset.rb +1 -1
  30. data/app/models/easy_ml/column/selector.rb +4 -0
  31. data/app/models/easy_ml/column.rb +79 -63
  32. data/app/models/easy_ml/column_history.rb +28 -28
  33. data/app/models/easy_ml/column_list/imputer.rb +23 -0
  34. data/app/models/easy_ml/column_list.rb +39 -26
  35. data/app/models/easy_ml/dataset/learner/base.rb +34 -0
  36. data/app/models/easy_ml/dataset/learner/eager/boolean.rb +10 -0
  37. data/app/models/easy_ml/dataset/learner/eager/categorical.rb +51 -0
  38. data/app/models/easy_ml/dataset/learner/eager/query.rb +37 -0
  39. data/app/models/easy_ml/dataset/learner/eager.rb +43 -0
  40. data/app/models/easy_ml/dataset/learner/lazy/boolean.rb +13 -0
  41. data/app/models/easy_ml/dataset/learner/lazy/categorical.rb +10 -0
  42. data/app/models/easy_ml/dataset/learner/lazy/datetime.rb +19 -0
  43. data/app/models/easy_ml/dataset/learner/lazy/null.rb +17 -0
  44. data/app/models/easy_ml/dataset/learner/lazy/numeric.rb +19 -0
  45. data/app/models/easy_ml/dataset/learner/lazy/query.rb +69 -0
  46. data/app/models/easy_ml/dataset/learner/lazy/string.rb +19 -0
  47. data/app/models/easy_ml/dataset/learner/lazy.rb +51 -0
  48. data/app/models/easy_ml/dataset/learner/query.rb +25 -0
  49. data/app/models/easy_ml/dataset/learner.rb +100 -0
  50. data/app/models/easy_ml/dataset.rb +150 -36
  51. data/app/models/easy_ml/dataset_history.rb +1 -0
  52. data/app/models/easy_ml/datasource.rb +9 -0
  53. data/app/models/easy_ml/event.rb +4 -0
  54. data/app/models/easy_ml/export/column.rb +27 -0
  55. data/app/models/easy_ml/export/dataset.rb +37 -0
  56. data/app/models/easy_ml/export/datasource.rb +12 -0
  57. data/app/models/easy_ml/export/feature.rb +24 -0
  58. data/app/models/easy_ml/export/model.rb +40 -0
  59. data/app/models/easy_ml/export/retraining_job.rb +20 -0
  60. data/app/models/easy_ml/export/splitter.rb +14 -0
  61. data/app/models/easy_ml/feature.rb +21 -0
  62. data/app/models/easy_ml/import/column.rb +35 -0
  63. data/app/models/easy_ml/import/dataset.rb +148 -0
  64. data/app/models/easy_ml/import/feature.rb +36 -0
  65. data/app/models/easy_ml/import/model.rb +136 -0
  66. data/app/models/easy_ml/import/retraining_job.rb +29 -0
  67. data/app/models/easy_ml/import/splitter.rb +34 -0
  68. data/app/models/easy_ml/lineage.rb +44 -0
  69. data/app/models/easy_ml/model.rb +93 -36
  70. data/app/models/easy_ml/model_file.rb +6 -0
  71. data/app/models/easy_ml/models/xgboost/evals_callback.rb +7 -7
  72. data/app/models/easy_ml/models/xgboost.rb +33 -9
  73. data/app/models/easy_ml/retraining_job.rb +8 -1
  74. data/app/models/easy_ml/retraining_run.rb +6 -4
  75. data/app/models/easy_ml/splitter.rb +8 -0
  76. data/app/models/lineage_history.rb +6 -0
  77. data/app/serializers/easy_ml/column_serializer.rb +7 -1
  78. data/app/serializers/easy_ml/dataset_serializer.rb +2 -1
  79. data/app/serializers/easy_ml/lineage_serializer.rb +9 -0
  80. data/config/routes.rb +13 -1
  81. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +3 -3
  82. data/lib/easy_ml/core/tuner.rb +12 -11
  83. data/lib/easy_ml/data/polars_column.rb +149 -100
  84. data/lib/easy_ml/data/polars_reader.rb +8 -5
  85. data/lib/easy_ml/data/polars_schema.rb +56 -0
  86. data/lib/easy_ml/data/splits/file_split.rb +20 -2
  87. data/lib/easy_ml/data/splits/split.rb +10 -1
  88. data/lib/easy_ml/data.rb +1 -0
  89. data/lib/easy_ml/deep_compact.rb +19 -0
  90. data/lib/easy_ml/feature_store.rb +2 -6
  91. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +6 -0
  92. data/lib/easy_ml/railtie/templates/migration/add_extra_metadata_to_columns.rb.tt +9 -0
  93. data/lib/easy_ml/railtie/templates/migration/add_raw_schema_to_datasets.rb.tt +9 -0
  94. data/lib/easy_ml/railtie/templates/migration/add_unique_constraint_to_easy_ml_model_names.rb.tt +8 -0
  95. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_lineages.rb.tt +24 -0
  96. data/lib/easy_ml/railtie/templates/migration/remove_evaluator_from_retraining_jobs.rb.tt +7 -0
  97. data/lib/easy_ml/railtie/templates/migration/update_preprocessing_steps_to_jsonb.rb.tt +18 -0
  98. data/lib/easy_ml/timing.rb +34 -0
  99. data/lib/easy_ml/version.rb +1 -1
  100. data/lib/easy_ml.rb +2 -0
  101. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  102. data/public/easy_ml/assets/assets/Application-Q7L6ioxr.css +1 -0
  103. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Rrzo4ecT.js +522 -0
  104. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Rrzo4ecT.js.map +1 -0
  105. metadata +52 -12
  106. data/app/models/easy_ml/column/learners/base.rb +0 -103
  107. data/app/models/easy_ml/column/learners/boolean.rb +0 -11
  108. data/app/models/easy_ml/column/learners/categorical.rb +0 -51
  109. data/app/models/easy_ml/column/learners/datetime.rb +0 -19
  110. data/app/models/easy_ml/column/learners/null.rb +0 -22
  111. data/app/models/easy_ml/column/learners/numeric.rb +0 -33
  112. data/app/models/easy_ml/column/learners/string.rb +0 -15
  113. data/public/easy_ml/assets/assets/Application-B3sRjyMT.css +0 -1
  114. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dfg-nTrB.js +0 -489
  115. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dfg-nTrB.js.map +0 -1
@@ -8,13 +8,17 @@ module EasyML
8
8
  "Mean imputation"
9
9
  end
10
10
 
11
+ def expr
12
+ return super unless mean.present?
13
+
14
+ Polars.col(column.name).fill_null(mean).alias(column.name)
15
+ end
16
+
11
17
  def transform(df)
12
18
  return df unless mean.present?
13
19
 
14
20
  mean = statistics(:mean)
15
- df = df.with_column(
16
- Polars.col(column.name).fill_null(mean).alias(column.name)
17
- )
21
+ df = df.with_column(expr)
18
22
  df
19
23
  end
20
24
 
@@ -6,6 +6,9 @@ module EasyML
6
6
  false
7
7
  end
8
8
 
9
+ def exprs
10
+ end
11
+
9
12
  def method_missing(_name, df)
10
13
  df
11
14
  end
@@ -50,7 +50,11 @@ module EasyML
50
50
  end
51
51
 
52
52
  def cast_encoder(encoder)
53
- encoder.transform_keys { |k| column.cast(k) }
53
+ begin
54
+ encoder.transform_keys { |k| column.cast(k) }
55
+ rescue => e
56
+ binding.pry
57
+ end
54
58
  end
55
59
 
56
60
  def cast_decoder(decoder)
@@ -74,9 +74,10 @@ module EasyML
74
74
  @supported_methods ||= []
75
75
  end
76
76
 
77
- def initialize(column)
77
+ def initialize(column, imputers: [])
78
78
  @column = column
79
79
  @dataset = column.dataset
80
+ @_imputers = imputers
80
81
  end
81
82
 
82
83
  class << self
@@ -97,6 +98,7 @@ module EasyML
97
98
  hash[key.to_sym] = Imputer.new(
98
99
  column,
99
100
  column.preprocessing_steps[key],
101
+ @_imputers
100
102
  )
101
103
  end
102
104
  end
@@ -9,11 +9,15 @@ module EasyML
9
9
  @dataset = column.dataset
10
10
  end
11
11
 
12
+ def expr
13
+ Polars.col(column.name)
14
+ end
15
+
12
16
  def as_json
13
17
  {
14
18
  key: key,
15
19
  description: description,
16
- timestamp: timestamp,
20
+ occurred_at: occurred_at,
17
21
  }.with_indifferent_access
18
22
  end
19
23
  end
@@ -10,7 +10,7 @@ module EasyML
10
10
  "Computed by #{column.computed_by}"
11
11
  end
12
12
 
13
- def timestamp
13
+ def occurred_at
14
14
  column.feature.fit_at || column.feature.applied_at
15
15
  end
16
16
 
@@ -10,7 +10,7 @@ module EasyML
10
10
  "Preprocessed using #{column.imputers.preprocessing_descriptions.join(", ")}"
11
11
  end
12
12
 
13
- def timestamp
13
+ def occurred_at
14
14
  column.dataset.refreshed_at
15
15
  end
16
16
 
@@ -10,7 +10,7 @@ module EasyML
10
10
  "Present in raw dataset"
11
11
  end
12
12
 
13
- def timestamp
13
+ def occurred_at
14
14
  column.dataset.datasource.refreshed_at
15
15
  end
16
16
 
@@ -1,6 +1,8 @@
1
1
  module EasyML
2
2
  class Column
3
3
  class Selector
4
+ include EasyML::Timing
5
+
4
6
  attr_accessor :selected, :dataset, :column, :transform
5
7
 
6
8
  def initialize(column, selected = nil, &block)
@@ -28,6 +30,8 @@ module EasyML
28
30
  end
29
31
  end
30
32
 
33
+ measure_method_timing :clipped
34
+
31
35
  def processed
32
36
  Selector.new(column, :processed)
33
37
  end
@@ -2,29 +2,29 @@
2
2
  #
3
3
  # Table name: easy_ml_columns
4
4
  #
5
- # id :bigint not null, primary key
6
- # dataset_id :bigint not null
7
- # name :string not null
8
- # description :string
9
- # datatype :string
10
- # polars_datatype :string
11
- # is_target :boolean default(FALSE)
12
- # hidden :boolean default(FALSE)
13
- # drop_if_null :boolean default(FALSE)
14
- # preprocessing_steps :json
15
- # sample_values :json
16
- # statistics :json
17
- # created_at :datetime not null
18
- # updated_at :datetime not null
19
- # is_date_column :boolean default(FALSE)
20
- # computed_by :string
21
- # is_computed :boolean default(FALSE)
22
- # feature_id :bigint
23
- # learned_at :datetime
24
- # is_learning :boolean default(FALSE)
25
- # last_datasource_sha :string
26
- # last_feature_sha :string
27
- # configuration_changed_at :datetime
5
+ # id :bigint not null, primary key
6
+ # dataset_id :bigint not null
7
+ # name :string not null
8
+ # description :string
9
+ # datatype :string
10
+ # polars_datatype :string
11
+ # is_target :boolean default(FALSE)
12
+ # hidden :boolean default(FALSE)
13
+ # drop_if_null :boolean default(FALSE)
14
+ # preprocessing_steps :jsonb
15
+ # sample_values :json
16
+ # statistics :json
17
+ # created_at :datetime not null
18
+ # updated_at :datetime not null
19
+ # is_date_column :boolean default(FALSE)
20
+ # computed_by :string
21
+ # is_computed :boolean default(FALSE)
22
+ # feature_id :bigint
23
+ # learned_at :datetime
24
+ # is_learning :boolean default(FALSE)
25
+ # last_datasource_sha :string
26
+ # last_feature_sha :string
27
+ # in_raw_dataset :boolean
28
28
  #
29
29
  module EasyML
30
30
  class Column < ActiveRecord::Base
@@ -32,8 +32,11 @@ module EasyML
32
32
  include Historiographer::Silent
33
33
  historiographer_mode :snapshot_only
34
34
 
35
+ include EasyML::Timing
36
+
35
37
  belongs_to :dataset, class_name: "EasyML::Dataset"
36
38
  belongs_to :feature, class_name: "EasyML::Feature", optional: true
39
+ has_many :lineages, class_name: "EasyML::Lineage"
37
40
 
38
41
  validates :name, presence: true
39
42
  validates :name, uniqueness: { scope: :dataset_id }
@@ -43,7 +46,7 @@ module EasyML
43
46
  before_save :set_defaults
44
47
  before_save :set_feature_lineage
45
48
  before_save :set_polars_datatype
46
- after_find :ensure_feature_exists
49
+ # after_find :ensure_feature_exists
47
50
 
48
51
  # Scopes
49
52
  scope :visible, -> { where(hidden: false) }
@@ -60,6 +63,7 @@ module EasyML
60
63
  scope :api_inputs, -> { where(is_computed: false, hidden: false, is_target: false) }
61
64
  scope :computed, -> { where(is_computed: true) }
62
65
  scope :raw, -> { where(is_computed: false) }
66
+ scope :has_clip, -> { where("preprocessing_steps->'training'->>'params' IS NOT NULL AND preprocessing_steps->'training'->'params' @> jsonb_build_object('clip', jsonb_build_object())") }
63
67
  scope :needs_learn, -> {
64
68
  datasource_changed
65
69
  .or(feature_applied)
@@ -142,26 +146,10 @@ module EasyML
142
146
  data.blank?
143
147
  end
144
148
 
145
- def learn(type: :all)
146
- return if (!in_raw_dataset? && type != :processed)
147
-
148
- if !in_raw_dataset? && read_attribute(:datatype).nil?
149
- assign_attributes(datatype: processed.data.to_series.dtype)
150
- end
151
- set_sample_values
152
- new_stats = learner.learn(type: type).symbolize_keys
153
-
154
- if !in_raw_dataset?
155
- new_stats[:raw] = new_stats[:processed]
156
- end
149
+ def merge_statistics(new_stats)
150
+ return unless new_stats.present?
157
151
 
158
- assign_attributes(statistics: (read_attribute(:statistics) || {}).symbolize_keys.merge!(new_stats))
159
- assign_attributes(
160
- learned_at: UTC.now,
161
- last_datasource_sha: dataset.last_datasource_sha,
162
- last_feature_sha: feature&.sha,
163
- is_learning: type == :raw,
164
- )
152
+ assign_attributes(statistics: (statistics || {}).symbolize_keys.deep_merge!(new_stats))
165
153
  end
166
154
 
167
155
  def set_configuration_changed_at
@@ -174,7 +162,7 @@ module EasyML
174
162
  use_processed = !one_hot? && processed.data(limit: 1).present? && in_raw_dataset?
175
163
 
176
164
  base = use_processed ? processed : raw
177
- sample_values = base.data(limit: 5, unique: true)
165
+ sample_values = base.data(limit: 5, unique: true, select: [name])
178
166
  if sample_values.columns.include?(name)
179
167
  sample_values = sample_values[name].to_a.uniq[0...5]
180
168
  assign_attributes(sample_values: sample_values)
@@ -188,8 +176,8 @@ module EasyML
188
176
  df
189
177
  end
190
178
 
191
- def imputers
192
- @imputers ||= Column::Imputers.new(self)
179
+ def imputers(imputers = [])
180
+ @imputers ||= Column::Imputers.new(self, imputers: imputers)
193
181
  end
194
182
 
195
183
  def decode_labels(df)
@@ -202,29 +190,29 @@ module EasyML
202
190
 
203
191
  def datatype=(dtype)
204
192
  if dtype.is_a?(Polars::DataType)
205
- dtype = EasyML::Data::PolarsColumn.polars_to_sym(dtype)
193
+ dtype = polars_to_sym(dtype)
206
194
  end
207
195
  write_attribute(:datatype, dtype)
208
196
  set_polars_datatype
209
197
  end
210
198
 
199
+ def polars_to_sym(dtype)
200
+ EasyML::Data::PolarsColumn.polars_to_sym(dtype)
201
+ end
202
+
211
203
  def datatype
212
- read_attribute(:datatype) || write_attribute(:datatype, assumed_datatype)
204
+ read_attribute(:datatype) || write_attribute(:datatype, polars_to_sym(assumed_datatype))
213
205
  end
214
206
 
215
207
  def raw_dtype
216
- return @raw_dtype if @raw_dtype
217
- set_feature_lineage
208
+ dtype = dataset.raw_schema[name]
209
+ return nil if dtype.nil?
218
210
 
219
- if in_raw_dataset?
220
- @raw_dtype = raw&.data&.to_series.try(:dtype)
221
- elsif already_computed?
222
- @raw_dtype = processed&.data&.to_series&.dtype
223
- end
211
+ polars_to_sym(dtype)
224
212
  end
225
213
 
226
214
  def set_polars_datatype
227
- raw_type = raw_dtype
215
+ raw_type = datatype
228
216
  user_type = get_polars_type(datatype)
229
217
 
230
218
  if raw_type == user_type
@@ -267,8 +255,11 @@ module EasyML
267
255
  return @assumed_datatype if @assumed_datatype
268
256
 
269
257
  if in_raw_dataset?
270
- series = (raw.data || datasource_raw).to_series
271
- @assumed_datatype = EasyML::Data::PolarsColumn.determine_type(series)
258
+ @assumed_datatype = dataset.raw_schema[name]
259
+ # series = (raw.data || datasource_raw).to_series
260
+ # @assumed_datatype = EasyML::Data::PolarsColumn.determine_type(series)
261
+ elsif dataset.processed_schema.present?
262
+ @assumed_datatype = dataset.processed_schema[name]
272
263
  elsif already_computed?
273
264
  return nil if processed.data.nil?
274
265
 
@@ -277,9 +268,16 @@ module EasyML
277
268
  end
278
269
 
279
270
  def in_raw_dataset?
271
+ value = read_attribute(:in_raw_dataset)
272
+ return value unless value.nil?
273
+
274
+ write_attribute(:in_raw_dataset, check_in_raw_dataset?)
275
+ end
276
+
277
+ def check_in_raw_dataset?
280
278
  return false if dataset&.raw&.data.nil?
281
279
 
282
- dataset.raw.data(all_columns: true)&.columns&.include?(name) || false
280
+ dataset.raw.data(all_columns: true, lazy: true).schema.key?(name) || false
283
281
  end
284
282
 
285
283
  def computing_feature
@@ -398,10 +396,6 @@ module EasyML
398
396
  is_date_column
399
397
  end
400
398
 
401
- def lineage
402
- @lineage ||= EasyML::Column::Lineage.new(self).lineage
403
- end
404
-
405
399
  def required?
406
400
  !is_computed && (preprocessing_steps.nil? || preprocessing_steps == {}) && !hidden && !is_target
407
401
  end
@@ -420,6 +414,28 @@ module EasyML
420
414
  }.compact
421
415
  end
422
416
 
417
+ UNCONFIGURABLE_COLUMNS = %w(
418
+ id
419
+ feature_id
420
+ dataset_id
421
+ last_datasource_sha
422
+ last_feature_sha
423
+ learned_at
424
+ is_learning
425
+ configuration_changed_at
426
+ statistics
427
+ created_at
428
+ updated_at
429
+ )
430
+
431
+ def to_config
432
+ EasyML::Export::Column.to_config(self)
433
+ end
434
+
435
+ def self.from_config(config, dataset, action: :create)
436
+ EasyML::Import::Column.from_config(config, dataset, action: action)
437
+ end
438
+
423
439
  def cast(value)
424
440
  return value if value.nil?
425
441
 
@@ -2,34 +2,34 @@
2
2
  #
3
3
  # Table name: easy_ml_column_histories
4
4
  #
5
- # id :bigint not null, primary key
6
- # column_id :integer not null
7
- # dataset_id :integer not null
8
- # name :string not null
9
- # description :string
10
- # datatype :string
11
- # polars_datatype :string
12
- # is_target :boolean default(FALSE)
13
- # hidden :boolean default(FALSE)
14
- # drop_if_null :boolean default(FALSE)
15
- # preprocessing_steps :json
16
- # sample_values :json
17
- # statistics :json
18
- # created_at :datetime not null
19
- # updated_at :datetime not null
20
- # history_started_at :datetime not null
21
- # history_ended_at :datetime
22
- # history_user_id :integer
23
- # snapshot_id :string
24
- # is_date_column :boolean default(FALSE)
25
- # computed_by :string
26
- # is_computed :boolean default(FALSE)
27
- # feature_id :bigint
28
- # learned_at :datetime
29
- # is_learning :boolean default(FALSE)
30
- # last_datasource_sha :string
31
- # last_feature_sha :string
32
- # configuration_changed_at :datetime
5
+ # id :bigint not null, primary key
6
+ # column_id :integer not null
7
+ # dataset_id :integer not null
8
+ # name :string not null
9
+ # description :string
10
+ # datatype :string
11
+ # polars_datatype :string
12
+ # is_target :boolean default(FALSE)
13
+ # hidden :boolean default(FALSE)
14
+ # drop_if_null :boolean default(FALSE)
15
+ # preprocessing_steps :jsonb
16
+ # sample_values :json
17
+ # statistics :json
18
+ # created_at :datetime not null
19
+ # updated_at :datetime not null
20
+ # history_started_at :datetime not null
21
+ # history_ended_at :datetime
22
+ # history_user_id :integer
23
+ # snapshot_id :string
24
+ # is_date_column :boolean default(FALSE)
25
+ # computed_by :string
26
+ # is_computed :boolean default(FALSE)
27
+ # feature_id :bigint
28
+ # learned_at :datetime
29
+ # is_learning :boolean default(FALSE)
30
+ # last_datasource_sha :string
31
+ # last_feature_sha :string
32
+ # in_raw_dataset :boolean
33
33
  #
34
34
  module EasyML
35
35
  class ColumnHistory < ActiveRecord::Base
@@ -0,0 +1,23 @@
1
+ module EasyML
2
+ module ColumnList
3
+ class Imputer
4
+ attr_accessor :dataset, :df, :inference, :columns
5
+
6
+ def initialize(dataset, df, columns: nil, imputers: [], inference: false)
7
+ @dataset = dataset
8
+ @df = df
9
+ @columns = (columns.nil? || columns.empty?) ? dataset.columns : columns
10
+ @inference = inference
11
+ @_imputers = imputers
12
+ end
13
+
14
+ def imputers
15
+ @imputers ||= columns.map { |column| inference ? column.imputers(@_imputers).inference : column.imputers(@_imputers).training }
16
+ end
17
+
18
+ def exprs
19
+ imputers.flat_map(&:exprs).compact
20
+ end
21
+ end
22
+ end
23
+ end
@@ -1,6 +1,7 @@
1
1
  module EasyML
2
2
  module ColumnList
3
3
  include Historiographer::Relation
4
+ include EasyML::Timing
4
5
 
5
6
  def sync(delete: true)
6
7
  return unless dataset.schema.present?
@@ -39,35 +40,28 @@ module EasyML
39
40
  df
40
41
  end
41
42
 
43
+ measure_method_timing :transform
44
+
45
+ def apply_clip(df)
46
+ clip_cols = has_clip.raw
47
+ return df unless clip_cols.any?
48
+
49
+ clipped_exprs = EasyML::ColumnList::Imputer.new(
50
+ dataset,
51
+ df,
52
+ columns: clip_cols,
53
+ imputers: [:clip],
54
+ ).exprs
55
+
56
+ df.with_columns(clipped_exprs)
57
+ end
58
+
42
59
  def learn(type: :raw, computed: false)
43
- cols_to_learn = column_list.reload.needs_learn
44
- cols_to_learn = cols_to_learn.computed if computed
45
- cols_to_learn = cols_to_learn.select(&:persisted?).reject(&:empty?)
46
- cols_to_learn.each { |col| col.learn(type: type) }
47
- EasyML::Column.import(cols_to_learn, on_duplicate_key_update: { columns: %i[
48
- statistics
49
- learned_at
50
- sample_values
51
- last_datasource_sha
52
- is_learning
53
- datatype
54
- polars_datatype
55
- ] })
56
- set_feature_lineage
60
+ EasyML::Dataset::Learner.new(dataset, type: type).learn
57
61
  reload
58
62
  end
59
63
 
60
- def set_feature_lineage
61
- names = dataset.features.computed_column_names
62
- columns = where(name: names, computed_by: nil).map do |col|
63
- col.assign_attributes(
64
- is_computed: true,
65
- computed_by: col.computing_feature&.name,
66
- )
67
- col
68
- end
69
- EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[ is_computed computed_by ] })
70
- end
64
+ measure_method_timing :learn
71
65
 
72
66
  def statistics
73
67
  stats = { raw: {}, processed: {} }
@@ -115,6 +109,25 @@ module EasyML
115
109
  column_list.sort_by { |col| [col.sort_required, col.name] }
116
110
  end
117
111
 
112
+ def set_feature_lineage(cols_to_learn)
113
+ names = dataset.features.computed_column_names
114
+ columns = where(name: names, computed_by: nil).map do |col|
115
+ col.assign_attributes(
116
+ is_computed: true,
117
+ computed_by: col.computing_feature&.name,
118
+ )
119
+ col
120
+ end
121
+ EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[ is_computed computed_by ] })
122
+
123
+ lineage = cols_to_learn.flat_map do |col|
124
+ EasyML::Lineage.learn(col)
125
+ end.compact
126
+ EasyML::Lineage.import(lineage, on_duplicate_key_update: { columns: %i[ column_id key occurred_at description ] })
127
+ end
128
+
129
+ measure_method_timing :set_feature_lineage
130
+
118
131
  private
119
132
 
120
133
  def import_new(new_columns, existing_columns)
@@ -127,7 +140,7 @@ module EasyML
127
140
  col
128
141
  end
129
142
  EasyML::Column.import(cols_to_insert)
130
- set_feature_lineage
143
+ set_feature_lineage(cols_to_insert)
131
144
  column_list.reload
132
145
  end
133
146
 
@@ -0,0 +1,34 @@
1
+ module EasyML
2
+ class Dataset
3
+ class Learner
4
+ class Base
5
+ attr_reader :dataset, :columns, :type
6
+
7
+ def initialize(dataset, columns, type: :raw)
8
+ @dataset = dataset
9
+ @columns = columns
10
+ @type = type
11
+ end
12
+
13
+ def skip_processing?(column, type)
14
+ (!column.in_raw_dataset? && type.to_sym != :processed) ||
15
+ (column.one_hot? && type.to_sym == :processed)
16
+ end
17
+
18
+ TYPES_ALL = %i(raw clipped processed)
19
+ TYPES_RAW = %i(raw clipped)
20
+ TYPES_PROCESSED = %i(processed)
21
+
22
+ def types(type = :all)
23
+ case type
24
+ when :all then TYPES_ALL
25
+ when :raw then TYPES_RAW
26
+ when :processed then TYPES_PROCESSED
27
+ else
28
+ TYPES_ALL
29
+ end
30
+ end
31
+ end
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,10 @@
1
+ module EasyML
2
+ class Dataset
3
+ class Learner
4
+ class Eager
5
+ class Boolean < Categorical
6
+ end
7
+ end
8
+ end
9
+ end
10
+ end
@@ -0,0 +1,51 @@
1
+ module EasyML
2
+ class Dataset
3
+ class Learner
4
+ class Eager
5
+ class Categorical < Query
6
+ def train_query(df)
7
+ {
8
+ counts: counts(df).to_hash,
9
+ allowed_categories: allowed_categories(df).to_series.to_a,
10
+ }.merge!(
11
+ learn_encoder_decoder(df)
12
+ )
13
+ end
14
+
15
+ def learn_encoder_decoder(df)
16
+ unsorted = allowed_categories(df).lazy.with_row_count.collect.to_hash.invert
17
+
18
+ label_encoder = unsorted.transform_keys(&column.method(:cast)).keys.compact.sort_by(&column.method(:sort_by)).each.with_index.reduce({}) do |h, (k, i)|
19
+ h.tap do
20
+ h[k] = i
21
+ end
22
+ end
23
+ label_decoder = label_encoder.invert
24
+
25
+ {
26
+ label_encoder: label_encoder,
27
+ label_decoder: label_decoder,
28
+ }
29
+ end
30
+
31
+ def counts(df)
32
+ return @counts if @counts
33
+
34
+ @counts = df.group_by(column.name)
35
+ .agg(Polars.col(column.name).count.alias("count"))
36
+ end
37
+
38
+ def allowed_categories(df)
39
+ return @allowed_categories if @allowed_categories
40
+
41
+ @allowed_categories = df.join(counts(df), on: column.name)
42
+ .filter(Polars.col("count").ge(column.categorical_min))
43
+ .select(column.name)
44
+ .unique
45
+ .sort(column.name, reverse: true)
46
+ end
47
+ end
48
+ end
49
+ end
50
+ end
51
+ end