easy_ml 0.2.0.pre.rc71 → 0.2.0.pre.rc75

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117) hide show
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/datasets_controller.rb +33 -0
  3. data/app/controllers/easy_ml/datasources_controller.rb +7 -0
  4. data/app/controllers/easy_ml/models_controller.rb +46 -0
  5. data/app/frontend/components/DatasetCard.tsx +212 -0
  6. data/app/frontend/components/ModelCard.tsx +114 -29
  7. data/app/frontend/components/StackTrace.tsx +13 -0
  8. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +10 -7
  9. data/app/frontend/components/datasets/UploadDatasetButton.tsx +51 -0
  10. data/app/frontend/components/models/DownloadModelModal.tsx +90 -0
  11. data/app/frontend/components/models/UploadModelModal.tsx +212 -0
  12. data/app/frontend/components/models/index.ts +2 -0
  13. data/app/frontend/pages/DatasetsPage.tsx +36 -130
  14. data/app/frontend/pages/DatasourcesPage.tsx +22 -2
  15. data/app/frontend/pages/ModelsPage.tsx +37 -11
  16. data/app/frontend/types/dataset.ts +1 -2
  17. data/app/frontend/types.ts +1 -1
  18. data/app/jobs/easy_ml/reaper.rb +55 -0
  19. data/app/jobs/easy_ml/training_job.rb +1 -1
  20. data/app/models/easy_ml/column/imputers/base.rb +4 -0
  21. data/app/models/easy_ml/column/imputers/clip.rb +5 -3
  22. data/app/models/easy_ml/column/imputers/imputer.rb +11 -13
  23. data/app/models/easy_ml/column/imputers/mean.rb +7 -3
  24. data/app/models/easy_ml/column/imputers/null_imputer.rb +3 -0
  25. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +5 -1
  26. data/app/models/easy_ml/column/imputers.rb +3 -1
  27. data/app/models/easy_ml/column/lineage/base.rb +5 -1
  28. data/app/models/easy_ml/column/lineage/computed_by_feature.rb +1 -1
  29. data/app/models/easy_ml/column/lineage/preprocessed.rb +1 -1
  30. data/app/models/easy_ml/column/lineage/raw_dataset.rb +1 -1
  31. data/app/models/easy_ml/column/selector.rb +4 -0
  32. data/app/models/easy_ml/column.rb +79 -63
  33. data/app/models/easy_ml/column_history.rb +28 -28
  34. data/app/models/easy_ml/column_list/imputer.rb +23 -0
  35. data/app/models/easy_ml/column_list.rb +39 -26
  36. data/app/models/easy_ml/dataset/learner/base.rb +34 -0
  37. data/app/models/easy_ml/dataset/learner/eager/boolean.rb +10 -0
  38. data/app/models/easy_ml/dataset/learner/eager/categorical.rb +51 -0
  39. data/app/models/easy_ml/dataset/learner/eager/query.rb +37 -0
  40. data/app/models/easy_ml/dataset/learner/eager.rb +43 -0
  41. data/app/models/easy_ml/dataset/learner/lazy/boolean.rb +13 -0
  42. data/app/models/easy_ml/dataset/learner/lazy/categorical.rb +10 -0
  43. data/app/models/easy_ml/dataset/learner/lazy/datetime.rb +19 -0
  44. data/app/models/easy_ml/dataset/learner/lazy/null.rb +17 -0
  45. data/app/models/easy_ml/dataset/learner/lazy/numeric.rb +19 -0
  46. data/app/models/easy_ml/dataset/learner/lazy/query.rb +69 -0
  47. data/app/models/easy_ml/dataset/learner/lazy/string.rb +19 -0
  48. data/app/models/easy_ml/dataset/learner/lazy.rb +51 -0
  49. data/app/models/easy_ml/dataset/learner/query.rb +25 -0
  50. data/app/models/easy_ml/dataset/learner.rb +100 -0
  51. data/app/models/easy_ml/dataset.rb +150 -36
  52. data/app/models/easy_ml/dataset_history.rb +1 -0
  53. data/app/models/easy_ml/datasource.rb +9 -0
  54. data/app/models/easy_ml/event.rb +5 -7
  55. data/app/models/easy_ml/export/column.rb +27 -0
  56. data/app/models/easy_ml/export/dataset.rb +37 -0
  57. data/app/models/easy_ml/export/datasource.rb +12 -0
  58. data/app/models/easy_ml/export/feature.rb +24 -0
  59. data/app/models/easy_ml/export/model.rb +40 -0
  60. data/app/models/easy_ml/export/retraining_job.rb +20 -0
  61. data/app/models/easy_ml/export/splitter.rb +14 -0
  62. data/app/models/easy_ml/feature.rb +21 -0
  63. data/app/models/easy_ml/import/column.rb +35 -0
  64. data/app/models/easy_ml/import/dataset.rb +148 -0
  65. data/app/models/easy_ml/import/feature.rb +36 -0
  66. data/app/models/easy_ml/import/model.rb +136 -0
  67. data/app/models/easy_ml/import/retraining_job.rb +29 -0
  68. data/app/models/easy_ml/import/splitter.rb +34 -0
  69. data/app/models/easy_ml/lineage.rb +44 -0
  70. data/app/models/easy_ml/model.rb +101 -37
  71. data/app/models/easy_ml/model_file.rb +6 -0
  72. data/app/models/easy_ml/models/xgboost/evals_callback.rb +7 -7
  73. data/app/models/easy_ml/models/xgboost.rb +33 -9
  74. data/app/models/easy_ml/retraining_job.rb +8 -1
  75. data/app/models/easy_ml/retraining_run.rb +7 -5
  76. data/app/models/easy_ml/splitter.rb +8 -0
  77. data/app/models/lineage_history.rb +6 -0
  78. data/app/serializers/easy_ml/column_serializer.rb +7 -1
  79. data/app/serializers/easy_ml/dataset_serializer.rb +2 -1
  80. data/app/serializers/easy_ml/lineage_serializer.rb +9 -0
  81. data/config/routes.rb +14 -1
  82. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +3 -3
  83. data/lib/easy_ml/core/tuner.rb +13 -12
  84. data/lib/easy_ml/data/polars_column.rb +149 -100
  85. data/lib/easy_ml/data/polars_reader.rb +8 -5
  86. data/lib/easy_ml/data/polars_schema.rb +56 -0
  87. data/lib/easy_ml/data/splits/file_split.rb +20 -2
  88. data/lib/easy_ml/data/splits/split.rb +10 -1
  89. data/lib/easy_ml/data.rb +1 -0
  90. data/lib/easy_ml/deep_compact.rb +19 -0
  91. data/lib/easy_ml/engine.rb +1 -0
  92. data/lib/easy_ml/feature_store.rb +2 -6
  93. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +6 -0
  94. data/lib/easy_ml/railtie/templates/migration/add_extra_metadata_to_columns.rb.tt +9 -0
  95. data/lib/easy_ml/railtie/templates/migration/add_raw_schema_to_datasets.rb.tt +9 -0
  96. data/lib/easy_ml/railtie/templates/migration/add_unique_constraint_to_easy_ml_model_names.rb.tt +8 -0
  97. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_lineages.rb.tt +24 -0
  98. data/lib/easy_ml/railtie/templates/migration/remove_evaluator_from_retraining_jobs.rb.tt +7 -0
  99. data/lib/easy_ml/railtie/templates/migration/update_preprocessing_steps_to_jsonb.rb.tt +18 -0
  100. data/lib/easy_ml/timing.rb +34 -0
  101. data/lib/easy_ml/version.rb +1 -1
  102. data/lib/easy_ml.rb +2 -0
  103. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  104. data/public/easy_ml/assets/assets/Application-Q7L6ioxr.css +1 -0
  105. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Rrzo4ecT.js +522 -0
  106. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Rrzo4ecT.js.map +1 -0
  107. metadata +53 -12
  108. data/app/models/easy_ml/column/learners/base.rb +0 -103
  109. data/app/models/easy_ml/column/learners/boolean.rb +0 -11
  110. data/app/models/easy_ml/column/learners/categorical.rb +0 -51
  111. data/app/models/easy_ml/column/learners/datetime.rb +0 -19
  112. data/app/models/easy_ml/column/learners/null.rb +0 -22
  113. data/app/models/easy_ml/column/learners/numeric.rb +0 -33
  114. data/app/models/easy_ml/column/learners/string.rb +0 -15
  115. data/public/easy_ml/assets/assets/Application-BbFobaXt.css +0 -1
  116. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-CibZcrBc.js +0 -489
  117. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-CibZcrBc.js.map +0 -1
@@ -0,0 +1,55 @@
1
module EasyML
  # Locates Resque workers that are running a particular job and terminates
  # them, removing their registration afterwards.
  class Reaper
    class << self
      # Describes every worker returned by Resque::Worker.all.
      #
      # Each entry is a Hash with the same keys:
      #   :worker  - the Resque worker object
      #   :working - result of worker.working?
      #   :class   - "job_class" entry of the first payload argument, or nil
      #   :args    - "arguments" entry of the first payload argument, or nil
      #   :pid     - worker.pid for busy workers, nil for idle ones
      #
      # NOTE(review): the "job_class"/"arguments" keys look like ActiveJob's
      # serialized payload -- confirm against the queue adapter in use.
      #
      # @return [Array<Hash>]
      def list_workers
        Resque::Worker.all.map do |worker|
          if worker.working?
            payload = job_payload(worker)
            {
              worker: worker,
              working: true,
              class: payload["job_class"],
              args: payload["arguments"],
              pid: worker.pid,
            }
          else
            { worker: worker, working: false, class: nil, args: nil, pid: nil }
          end
        end
      end

      # Selects the worker descriptors whose job class and arguments match.
      #
      # @param worker_class [Class, String] compared via #to_s with :class
      # @param args [Array] compared with == against :args
      # @return [Array<Hash>] matching entries from list_workers
      def find_job(worker_class, *args)
        job_class = worker_class.to_s
        list_workers.select do |config|
          config[:class] == job_class && config[:args] == args
        end
      end

      # Sends TERM to every worker running the given job and removes the
      # worker's registration.
      #
      # If the process no longer exists (Errno::ESRCH) the registration is
      # still cleaned up on a best-effort basis; a failure of that cleanup is
      # printed rather than raised.
      #
      # @param worker_class [Class, String]
      # @param args [Array]
      # @return [Array<Hash>] the worker descriptors that were targeted
      def kill(worker_class, *args)
        find_job(worker_class, *args).each do |job|
          begin
            # Send TERM signal to the process
            Process.kill("TERM", job[:pid])

            # Remove the worker from Redis so it doesn't show up as a zombie
            # in the Resque web interface.
            unregister(job[:worker])
          rescue Errno::ESRCH
            # Process already gone, but still try to clean up Redis
            begin
              unregister(job[:worker])
            rescue => e
              # Redis cleanup failed, worker might already be unregistered
              puts "Failed to unregister worker: #{e.message}"
            end
          end
        end
      end

      private

      # Extracts the first payload argument of the worker's current job.
      # Returns an empty Hash when the job record is missing or empty (the
      # worker finished between working? and job) or when the first argument
      # is not a Hash, so callers can index it safely.
      def job_payload(worker)
        payload = worker.job&.dig("payload", "args")&.first
        payload.is_a?(Hash) ? payload : {}
      end

      # Marks the worker as no longer working and unregisters it.
      def unregister(worker)
        worker.done_working
        worker.unregister_worker
      end
    end
  end
end
@@ -2,7 +2,7 @@ module EasyML
2
2
  class TrainingJob < ApplicationJob
3
3
  class TrainingTimeoutError < StandardError; end
4
4
 
5
- INACTIVITY_TIMEOUT = 15 # seconds
5
+ INACTIVITY_TIMEOUT = 300 # seconds
6
6
 
7
7
  def perform(model_id)
8
8
  @model = EasyML::Model.find_by(id: model_id)
@@ -27,6 +27,10 @@ module EasyML
27
27
  @preprocessing_step = preprocessing_step.with_indifferent_access
28
28
  end
29
29
 
30
+ def expr
31
+ Polars.col(column.name)
32
+ end
33
+
30
34
  def applies?
31
35
  method_applies? || param_applies?
32
36
  end
@@ -10,10 +10,12 @@ module EasyML
10
10
  "Clip"
11
11
  end
12
12
 
13
+ def expr
14
+ Polars.col(column.name).clip(min, max).alias(column.name)
15
+ end
16
+
13
17
  def transform(df)
14
- df = df.with_column(
15
- Polars.col(column.name).clip(min, max).alias(column.name)
16
- )
18
+ df = df.with_column(expr)
17
19
  df
18
20
  end
19
21
 
@@ -2,18 +2,23 @@ module EasyML
2
2
  class Column
3
3
  class Imputers
4
4
  class Imputer
5
- attr_accessor :dataset, :column, :preprocessing_step
5
+ attr_accessor :dataset, :column, :preprocessing_step, :allowed_adapters
6
6
 
7
- def initialize(column, preprocessing_step)
7
+ def initialize(column, preprocessing_step, allowed_adapters = [])
8
8
  @column = column
9
9
  @dataset = column.dataset
10
10
  @preprocessing_step = preprocessing_step.with_indifferent_access
11
+ @allowed_adapters = allowed_adapters.map(&:to_sym)
11
12
  end
12
13
 
13
14
  def inspect
14
15
  "#<#{self.class.name} adapters=#{adapters.map(&:inspect).join(", ")}>"
15
16
  end
16
17
 
18
+ def exprs
19
+ adapters.map(&:expr)
20
+ end
21
+
17
22
  def ordered_adapters
18
23
  [
19
24
  Clip,
@@ -29,19 +34,12 @@ module EasyML
29
34
  ]
30
35
  end
31
36
 
32
- def adapters
33
- @adapters ||= ordered_adapters.map { |klass| klass.new(column, preprocessing_step) }.select(&:applies?)
37
+ def allowed?(adapter)
38
+ allowed_adapters.empty? || allowed_adapters.include?(adapter.class.name.split("::").last.underscore.to_sym)
34
39
  end
35
40
 
36
- def imputers
37
- return nil if column.preprocessing_steps.blank?
38
-
39
- @imputers ||= column.preprocessing_steps.keys.reduce({}) do |hash, key|
40
- hash[key.to_sym] = Imputer.new(
41
- column: column,
42
- preprocessing_step: column.preprocessing_steps[key],
43
- )
44
- end
41
+ def adapters
42
+ @adapters ||= ordered_adapters.map { |klass| klass.new(column, preprocessing_step) }.select { |adapter| allowed?(adapter) && adapter.applies? }
45
43
  end
46
44
 
47
45
  def description
@@ -8,13 +8,17 @@ module EasyML
8
8
  "Mean imputation"
9
9
  end
10
10
 
11
+ def expr
12
+ return super unless mean.present?
13
+
14
+ Polars.col(column.name).fill_null(mean).alias(column.name)
15
+ end
16
+
11
17
  def transform(df)
12
18
  return df unless mean.present?
13
19
 
14
20
  mean = statistics(:mean)
15
- df = df.with_column(
16
- Polars.col(column.name).fill_null(mean).alias(column.name)
17
- )
21
+ df = df.with_column(expr)
18
22
  df
19
23
  end
20
24
 
@@ -6,6 +6,9 @@ module EasyML
6
6
  false
7
7
  end
8
8
 
9
+ def exprs
10
+ end
11
+
9
12
  def method_missing(_name, df)
10
13
  df
11
14
  end
@@ -50,7 +50,11 @@ module EasyML
50
50
  end
51
51
 
52
52
  def cast_encoder(encoder)
53
- encoder.transform_keys { |k| column.cast(k) }
53
+ begin
54
+ encoder.transform_keys { |k| column.cast(k) }
55
+ rescue => e
56
+ binding.pry
57
+ end
54
58
  end
55
59
 
56
60
  def cast_decoder(decoder)
@@ -74,9 +74,10 @@ module EasyML
74
74
  @supported_methods ||= []
75
75
  end
76
76
 
77
- def initialize(column)
77
+ def initialize(column, imputers: [])
78
78
  @column = column
79
79
  @dataset = column.dataset
80
+ @_imputers = imputers
80
81
  end
81
82
 
82
83
  class << self
@@ -97,6 +98,7 @@ module EasyML
97
98
  hash[key.to_sym] = Imputer.new(
98
99
  column,
99
100
  column.preprocessing_steps[key],
101
+ @_imputers
100
102
  )
101
103
  end
102
104
  end
@@ -9,11 +9,15 @@ module EasyML
9
9
  @dataset = column.dataset
10
10
  end
11
11
 
12
+ def expr
13
+ Polars.col(column.name)
14
+ end
15
+
12
16
  def as_json
13
17
  {
14
18
  key: key,
15
19
  description: description,
16
- timestamp: timestamp,
20
+ occurred_at: occurred_at,
17
21
  }.with_indifferent_access
18
22
  end
19
23
  end
@@ -10,7 +10,7 @@ module EasyML
10
10
  "Computed by #{column.computed_by}"
11
11
  end
12
12
 
13
- def timestamp
13
+ def occurred_at
14
14
  column.feature.fit_at || column.feature.applied_at
15
15
  end
16
16
 
@@ -10,7 +10,7 @@ module EasyML
10
10
  "Preprocessed using #{column.imputers.preprocessing_descriptions.join(", ")}"
11
11
  end
12
12
 
13
- def timestamp
13
+ def occurred_at
14
14
  column.dataset.refreshed_at
15
15
  end
16
16
 
@@ -10,7 +10,7 @@ module EasyML
10
10
  "Present in raw dataset"
11
11
  end
12
12
 
13
- def timestamp
13
+ def occurred_at
14
14
  column.dataset.datasource.refreshed_at
15
15
  end
16
16
 
@@ -1,6 +1,8 @@
1
1
  module EasyML
2
2
  class Column
3
3
  class Selector
4
+ include EasyML::Timing
5
+
4
6
  attr_accessor :selected, :dataset, :column, :transform
5
7
 
6
8
  def initialize(column, selected = nil, &block)
@@ -28,6 +30,8 @@ module EasyML
28
30
  end
29
31
  end
30
32
 
33
+ measure_method_timing :clipped
34
+
31
35
  def processed
32
36
  Selector.new(column, :processed)
33
37
  end
@@ -2,29 +2,29 @@
2
2
  #
3
3
  # Table name: easy_ml_columns
4
4
  #
5
- # id :bigint not null, primary key
6
- # dataset_id :bigint not null
7
- # name :string not null
8
- # description :string
9
- # datatype :string
10
- # polars_datatype :string
11
- # is_target :boolean default(FALSE)
12
- # hidden :boolean default(FALSE)
13
- # drop_if_null :boolean default(FALSE)
14
- # preprocessing_steps :json
15
- # sample_values :json
16
- # statistics :json
17
- # created_at :datetime not null
18
- # updated_at :datetime not null
19
- # is_date_column :boolean default(FALSE)
20
- # computed_by :string
21
- # is_computed :boolean default(FALSE)
22
- # feature_id :bigint
23
- # learned_at :datetime
24
- # is_learning :boolean default(FALSE)
25
- # last_datasource_sha :string
26
- # last_feature_sha :string
27
- # configuration_changed_at :datetime
5
+ # id :bigint not null, primary key
6
+ # dataset_id :bigint not null
7
+ # name :string not null
8
+ # description :string
9
+ # datatype :string
10
+ # polars_datatype :string
11
+ # is_target :boolean default(FALSE)
12
+ # hidden :boolean default(FALSE)
13
+ # drop_if_null :boolean default(FALSE)
14
+ # preprocessing_steps :jsonb
15
+ # sample_values :json
16
+ # statistics :json
17
+ # created_at :datetime not null
18
+ # updated_at :datetime not null
19
+ # is_date_column :boolean default(FALSE)
20
+ # computed_by :string
21
+ # is_computed :boolean default(FALSE)
22
+ # feature_id :bigint
23
+ # learned_at :datetime
24
+ # is_learning :boolean default(FALSE)
25
+ # last_datasource_sha :string
26
+ # last_feature_sha :string
27
+ # in_raw_dataset :boolean
28
28
  #
29
29
  module EasyML
30
30
  class Column < ActiveRecord::Base
@@ -32,8 +32,11 @@ module EasyML
32
32
  include Historiographer::Silent
33
33
  historiographer_mode :snapshot_only
34
34
 
35
+ include EasyML::Timing
36
+
35
37
  belongs_to :dataset, class_name: "EasyML::Dataset"
36
38
  belongs_to :feature, class_name: "EasyML::Feature", optional: true
39
+ has_many :lineages, class_name: "EasyML::Lineage"
37
40
 
38
41
  validates :name, presence: true
39
42
  validates :name, uniqueness: { scope: :dataset_id }
@@ -43,7 +46,7 @@ module EasyML
43
46
  before_save :set_defaults
44
47
  before_save :set_feature_lineage
45
48
  before_save :set_polars_datatype
46
- after_find :ensure_feature_exists
49
+ # after_find :ensure_feature_exists
47
50
 
48
51
  # Scopes
49
52
  scope :visible, -> { where(hidden: false) }
@@ -60,6 +63,7 @@ module EasyML
60
63
  scope :api_inputs, -> { where(is_computed: false, hidden: false, is_target: false) }
61
64
  scope :computed, -> { where(is_computed: true) }
62
65
  scope :raw, -> { where(is_computed: false) }
66
+ scope :has_clip, -> { where("preprocessing_steps->'training'->>'params' IS NOT NULL AND preprocessing_steps->'training'->'params' @> jsonb_build_object('clip', jsonb_build_object())") }
63
67
  scope :needs_learn, -> {
64
68
  datasource_changed
65
69
  .or(feature_applied)
@@ -142,26 +146,10 @@ module EasyML
142
146
  data.blank?
143
147
  end
144
148
 
145
- def learn(type: :all)
146
- return if (!in_raw_dataset? && type != :processed)
147
-
148
- if !in_raw_dataset? && read_attribute(:datatype).nil?
149
- assign_attributes(datatype: processed.data.to_series.dtype)
150
- end
151
- set_sample_values
152
- new_stats = learner.learn(type: type).symbolize_keys
153
-
154
- if !in_raw_dataset?
155
- new_stats[:raw] = new_stats[:processed]
156
- end
149
+ def merge_statistics(new_stats)
150
+ return unless new_stats.present?
157
151
 
158
- assign_attributes(statistics: (read_attribute(:statistics) || {}).symbolize_keys.merge!(new_stats))
159
- assign_attributes(
160
- learned_at: UTC.now,
161
- last_datasource_sha: dataset.last_datasource_sha,
162
- last_feature_sha: feature&.sha,
163
- is_learning: type == :raw,
164
- )
152
+ assign_attributes(statistics: (statistics || {}).symbolize_keys.deep_merge!(new_stats))
165
153
  end
166
154
 
167
155
  def set_configuration_changed_at
@@ -174,7 +162,7 @@ module EasyML
174
162
  use_processed = !one_hot? && processed.data(limit: 1).present? && in_raw_dataset?
175
163
 
176
164
  base = use_processed ? processed : raw
177
- sample_values = base.data(limit: 5, unique: true)
165
+ sample_values = base.data(limit: 5, unique: true, select: [name])
178
166
  if sample_values.columns.include?(name)
179
167
  sample_values = sample_values[name].to_a.uniq[0...5]
180
168
  assign_attributes(sample_values: sample_values)
@@ -188,8 +176,8 @@ module EasyML
188
176
  df
189
177
  end
190
178
 
191
# Returns the Column::Imputers collection for this column.
#
# @param imputers [Array<Symbol, String>] optional allow-list forwarded to
#   Column::Imputers.new as +imputers:+; the default empty list is passed
#   through unchanged.
# @return [Column::Imputers] memoized per distinct allow-list, so a filtered
#   request and an unfiltered request on the same column never share an
#   instance (a single `@imputers ||=` would silently reuse whichever list
#   was requested first).
def imputers(imputers = [])
  cache_key = Array(imputers).map(&:to_sym).sort
  # Tolerate @imputers having been reset to nil elsewhere.
  @imputers = {} unless @imputers.is_a?(Hash)
  @imputers[cache_key] ||= Column::Imputers.new(self, imputers: imputers)
end
194
182
 
195
183
  def decode_labels(df)
@@ -202,29 +190,29 @@ module EasyML
202
190
 
203
191
  def datatype=(dtype)
204
192
  if dtype.is_a?(Polars::DataType)
205
- dtype = EasyML::Data::PolarsColumn.polars_to_sym(dtype)
193
+ dtype = polars_to_sym(dtype)
206
194
  end
207
195
  write_attribute(:datatype, dtype)
208
196
  set_polars_datatype
209
197
  end
210
198
 
199
+ def polars_to_sym(dtype)
200
+ EasyML::Data::PolarsColumn.polars_to_sym(dtype)
201
+ end
202
+
211
203
  def datatype
212
- read_attribute(:datatype) || write_attribute(:datatype, assumed_datatype)
204
+ read_attribute(:datatype) || write_attribute(:datatype, polars_to_sym(assumed_datatype))
213
205
  end
214
206
 
215
207
  def raw_dtype
216
- return @raw_dtype if @raw_dtype
217
- set_feature_lineage
208
+ dtype = dataset.raw_schema[name]
209
+ return nil if dtype.nil?
218
210
 
219
- if in_raw_dataset?
220
- @raw_dtype = raw&.data&.to_series.try(:dtype)
221
- elsif already_computed?
222
- @raw_dtype = processed&.data&.to_series&.dtype
223
- end
211
+ polars_to_sym(dtype)
224
212
  end
225
213
 
226
214
  def set_polars_datatype
227
- raw_type = raw_dtype
215
+ raw_type = datatype
228
216
  user_type = get_polars_type(datatype)
229
217
 
230
218
  if raw_type == user_type
@@ -267,8 +255,11 @@ module EasyML
267
255
  return @assumed_datatype if @assumed_datatype
268
256
 
269
257
  if in_raw_dataset?
270
- series = (raw.data || datasource_raw).to_series
271
- @assumed_datatype = EasyML::Data::PolarsColumn.determine_type(series)
258
+ @assumed_datatype = dataset.raw_schema[name]
259
+ # series = (raw.data || datasource_raw).to_series
260
+ # @assumed_datatype = EasyML::Data::PolarsColumn.determine_type(series)
261
+ elsif dataset.processed_schema.present?
262
+ @assumed_datatype = dataset.processed_schema[name]
272
263
  elsif already_computed?
273
264
  return nil if processed.data.nil?
274
265
 
@@ -277,9 +268,16 @@ module EasyML
277
268
  end
278
269
 
279
270
  def in_raw_dataset?
271
+ value = read_attribute(:in_raw_dataset)
272
+ return value unless value.nil?
273
+
274
+ write_attribute(:in_raw_dataset, check_in_raw_dataset?)
275
+ end
276
+
277
+ def check_in_raw_dataset?
280
278
  return false if dataset&.raw&.data.nil?
281
279
 
282
- dataset.raw.data(all_columns: true)&.columns&.include?(name) || false
280
+ dataset.raw.data(all_columns: true, lazy: true).schema.key?(name) || false
283
281
  end
284
282
 
285
283
  def computing_feature
@@ -398,10 +396,6 @@ module EasyML
398
396
  is_date_column
399
397
  end
400
398
 
401
- def lineage
402
- @lineage ||= EasyML::Column::Lineage.new(self).lineage
403
- end
404
-
405
399
  def required?
406
400
  !is_computed && (preprocessing_steps.nil? || preprocessing_steps == {}) && !hidden && !is_target
407
401
  end
@@ -420,6 +414,28 @@ module EasyML
420
414
  }.compact
421
415
  end
422
416
 
417
+ UNCONFIGURABLE_COLUMNS = %w(
418
+ id
419
+ feature_id
420
+ dataset_id
421
+ last_datasource_sha
422
+ last_feature_sha
423
+ learned_at
424
+ is_learning
425
+ configuration_changed_at
426
+ statistics
427
+ created_at
428
+ updated_at
429
+ )
430
+
431
+ def to_config
432
+ EasyML::Export::Column.to_config(self)
433
+ end
434
+
435
+ def self.from_config(config, dataset, action: :create)
436
+ EasyML::Import::Column.from_config(config, dataset, action: action)
437
+ end
438
+
423
439
  def cast(value)
424
440
  return value if value.nil?
425
441
 
@@ -2,34 +2,34 @@
2
2
  #
3
3
  # Table name: easy_ml_column_histories
4
4
  #
5
- # id :bigint not null, primary key
6
- # column_id :integer not null
7
- # dataset_id :integer not null
8
- # name :string not null
9
- # description :string
10
- # datatype :string
11
- # polars_datatype :string
12
- # is_target :boolean default(FALSE)
13
- # hidden :boolean default(FALSE)
14
- # drop_if_null :boolean default(FALSE)
15
- # preprocessing_steps :json
16
- # sample_values :json
17
- # statistics :json
18
- # created_at :datetime not null
19
- # updated_at :datetime not null
20
- # history_started_at :datetime not null
21
- # history_ended_at :datetime
22
- # history_user_id :integer
23
- # snapshot_id :string
24
- # is_date_column :boolean default(FALSE)
25
- # computed_by :string
26
- # is_computed :boolean default(FALSE)
27
- # feature_id :bigint
28
- # learned_at :datetime
29
- # is_learning :boolean default(FALSE)
30
- # last_datasource_sha :string
31
- # last_feature_sha :string
32
- # configuration_changed_at :datetime
5
+ # id :bigint not null, primary key
6
+ # column_id :integer not null
7
+ # dataset_id :integer not null
8
+ # name :string not null
9
+ # description :string
10
+ # datatype :string
11
+ # polars_datatype :string
12
+ # is_target :boolean default(FALSE)
13
+ # hidden :boolean default(FALSE)
14
+ # drop_if_null :boolean default(FALSE)
15
+ # preprocessing_steps :jsonb
16
+ # sample_values :json
17
+ # statistics :json
18
+ # created_at :datetime not null
19
+ # updated_at :datetime not null
20
+ # history_started_at :datetime not null
21
+ # history_ended_at :datetime
22
+ # history_user_id :integer
23
+ # snapshot_id :string
24
+ # is_date_column :boolean default(FALSE)
25
+ # computed_by :string
26
+ # is_computed :boolean default(FALSE)
27
+ # feature_id :bigint
28
+ # learned_at :datetime
29
+ # is_learning :boolean default(FALSE)
30
+ # last_datasource_sha :string
31
+ # last_feature_sha :string
32
+ # in_raw_dataset :boolean
33
33
  #
34
34
  module EasyML
35
35
  class ColumnHistory < ActiveRecord::Base
@@ -0,0 +1,23 @@
1
module EasyML
  module ColumnList
    # Gathers the expressions produced by the imputers of a set of columns.
    #
    # When no columns are given (nil or empty) every column of the dataset is
    # used. The +inference+ flag selects each column's inference imputer
    # instead of its training imputer.
    class Imputer
      attr_accessor :dataset, :df, :inference, :columns

      # @param dataset [#columns] supplies the fallback column list
      # @param df [Object] stored as-is and exposed through #df
      # @param columns [Array, nil] columns to impute; blank means all
      # @param imputers [Array] list handed to each column's #imputers
      # @param inference [Boolean] use inference imputers when true
      def initialize(dataset, df, columns: nil, imputers: [], inference: false)
        @dataset = dataset
        @df = df
        @inference = inference
        @_imputers = imputers
        use_all = columns.nil? || columns.empty?
        @columns = use_all ? dataset.columns : columns
      end

      # One imputer per column (memoized), chosen according to #inference.
      def imputers
        @imputers ||= columns.map do |col|
          column_imputers = col.imputers(@_imputers)
          if inference
            column_imputers.inference
          else
            column_imputers.training
          end
        end
      end

      # Flattened list of every imputer's expressions, with nils removed.
      def exprs
        collected = imputers.flat_map { |imputer| imputer.exprs }
        collected.compact
      end
    end
  end
end