easy_ml 0.2.0.pre.rc72 → 0.2.0.pre.rc75

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115):
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/datasets_controller.rb +33 -0
  3. data/app/controllers/easy_ml/datasources_controller.rb +7 -0
  4. data/app/controllers/easy_ml/models_controller.rb +38 -0
  5. data/app/frontend/components/DatasetCard.tsx +212 -0
  6. data/app/frontend/components/ModelCard.tsx +69 -29
  7. data/app/frontend/components/StackTrace.tsx +13 -0
  8. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +10 -7
  9. data/app/frontend/components/datasets/UploadDatasetButton.tsx +51 -0
  10. data/app/frontend/components/models/DownloadModelModal.tsx +90 -0
  11. data/app/frontend/components/models/UploadModelModal.tsx +212 -0
  12. data/app/frontend/components/models/index.ts +2 -0
  13. data/app/frontend/pages/DatasetsPage.tsx +36 -130
  14. data/app/frontend/pages/DatasourcesPage.tsx +22 -2
  15. data/app/frontend/pages/ModelsPage.tsx +37 -11
  16. data/app/frontend/types/dataset.ts +1 -2
  17. data/app/frontend/types.ts +1 -1
  18. data/app/jobs/easy_ml/training_job.rb +2 -2
  19. data/app/models/easy_ml/column/imputers/base.rb +4 -0
  20. data/app/models/easy_ml/column/imputers/clip.rb +5 -3
  21. data/app/models/easy_ml/column/imputers/imputer.rb +11 -13
  22. data/app/models/easy_ml/column/imputers/mean.rb +7 -3
  23. data/app/models/easy_ml/column/imputers/null_imputer.rb +3 -0
  24. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +5 -1
  25. data/app/models/easy_ml/column/imputers.rb +3 -1
  26. data/app/models/easy_ml/column/lineage/base.rb +5 -1
  27. data/app/models/easy_ml/column/lineage/computed_by_feature.rb +1 -1
  28. data/app/models/easy_ml/column/lineage/preprocessed.rb +1 -1
  29. data/app/models/easy_ml/column/lineage/raw_dataset.rb +1 -1
  30. data/app/models/easy_ml/column/selector.rb +4 -0
  31. data/app/models/easy_ml/column.rb +79 -63
  32. data/app/models/easy_ml/column_history.rb +28 -28
  33. data/app/models/easy_ml/column_list/imputer.rb +23 -0
  34. data/app/models/easy_ml/column_list.rb +39 -26
  35. data/app/models/easy_ml/dataset/learner/base.rb +34 -0
  36. data/app/models/easy_ml/dataset/learner/eager/boolean.rb +10 -0
  37. data/app/models/easy_ml/dataset/learner/eager/categorical.rb +51 -0
  38. data/app/models/easy_ml/dataset/learner/eager/query.rb +37 -0
  39. data/app/models/easy_ml/dataset/learner/eager.rb +43 -0
  40. data/app/models/easy_ml/dataset/learner/lazy/boolean.rb +13 -0
  41. data/app/models/easy_ml/dataset/learner/lazy/categorical.rb +10 -0
  42. data/app/models/easy_ml/dataset/learner/lazy/datetime.rb +19 -0
  43. data/app/models/easy_ml/dataset/learner/lazy/null.rb +17 -0
  44. data/app/models/easy_ml/dataset/learner/lazy/numeric.rb +19 -0
  45. data/app/models/easy_ml/dataset/learner/lazy/query.rb +69 -0
  46. data/app/models/easy_ml/dataset/learner/lazy/string.rb +19 -0
  47. data/app/models/easy_ml/dataset/learner/lazy.rb +51 -0
  48. data/app/models/easy_ml/dataset/learner/query.rb +25 -0
  49. data/app/models/easy_ml/dataset/learner.rb +100 -0
  50. data/app/models/easy_ml/dataset.rb +150 -36
  51. data/app/models/easy_ml/dataset_history.rb +1 -0
  52. data/app/models/easy_ml/datasource.rb +9 -0
  53. data/app/models/easy_ml/event.rb +4 -0
  54. data/app/models/easy_ml/export/column.rb +27 -0
  55. data/app/models/easy_ml/export/dataset.rb +37 -0
  56. data/app/models/easy_ml/export/datasource.rb +12 -0
  57. data/app/models/easy_ml/export/feature.rb +24 -0
  58. data/app/models/easy_ml/export/model.rb +40 -0
  59. data/app/models/easy_ml/export/retraining_job.rb +20 -0
  60. data/app/models/easy_ml/export/splitter.rb +14 -0
  61. data/app/models/easy_ml/feature.rb +21 -0
  62. data/app/models/easy_ml/import/column.rb +35 -0
  63. data/app/models/easy_ml/import/dataset.rb +148 -0
  64. data/app/models/easy_ml/import/feature.rb +36 -0
  65. data/app/models/easy_ml/import/model.rb +136 -0
  66. data/app/models/easy_ml/import/retraining_job.rb +29 -0
  67. data/app/models/easy_ml/import/splitter.rb +34 -0
  68. data/app/models/easy_ml/lineage.rb +44 -0
  69. data/app/models/easy_ml/model.rb +93 -36
  70. data/app/models/easy_ml/model_file.rb +6 -0
  71. data/app/models/easy_ml/models/xgboost/evals_callback.rb +7 -7
  72. data/app/models/easy_ml/models/xgboost.rb +33 -9
  73. data/app/models/easy_ml/retraining_job.rb +8 -1
  74. data/app/models/easy_ml/retraining_run.rb +6 -4
  75. data/app/models/easy_ml/splitter.rb +8 -0
  76. data/app/models/lineage_history.rb +6 -0
  77. data/app/serializers/easy_ml/column_serializer.rb +7 -1
  78. data/app/serializers/easy_ml/dataset_serializer.rb +2 -1
  79. data/app/serializers/easy_ml/lineage_serializer.rb +9 -0
  80. data/config/routes.rb +13 -1
  81. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +3 -3
  82. data/lib/easy_ml/core/tuner.rb +12 -11
  83. data/lib/easy_ml/data/polars_column.rb +149 -100
  84. data/lib/easy_ml/data/polars_reader.rb +8 -5
  85. data/lib/easy_ml/data/polars_schema.rb +56 -0
  86. data/lib/easy_ml/data/splits/file_split.rb +20 -2
  87. data/lib/easy_ml/data/splits/split.rb +10 -1
  88. data/lib/easy_ml/data.rb +1 -0
  89. data/lib/easy_ml/deep_compact.rb +19 -0
  90. data/lib/easy_ml/feature_store.rb +2 -6
  91. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +6 -0
  92. data/lib/easy_ml/railtie/templates/migration/add_extra_metadata_to_columns.rb.tt +9 -0
  93. data/lib/easy_ml/railtie/templates/migration/add_raw_schema_to_datasets.rb.tt +9 -0
  94. data/lib/easy_ml/railtie/templates/migration/add_unique_constraint_to_easy_ml_model_names.rb.tt +8 -0
  95. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_lineages.rb.tt +24 -0
  96. data/lib/easy_ml/railtie/templates/migration/remove_evaluator_from_retraining_jobs.rb.tt +7 -0
  97. data/lib/easy_ml/railtie/templates/migration/update_preprocessing_steps_to_jsonb.rb.tt +18 -0
  98. data/lib/easy_ml/timing.rb +34 -0
  99. data/lib/easy_ml/version.rb +1 -1
  100. data/lib/easy_ml.rb +2 -0
  101. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  102. data/public/easy_ml/assets/assets/Application-Q7L6ioxr.css +1 -0
  103. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Rrzo4ecT.js +522 -0
  104. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Rrzo4ecT.js.map +1 -0
  105. metadata +52 -12
  106. data/app/models/easy_ml/column/learners/base.rb +0 -103
  107. data/app/models/easy_ml/column/learners/boolean.rb +0 -11
  108. data/app/models/easy_ml/column/learners/categorical.rb +0 -51
  109. data/app/models/easy_ml/column/learners/datetime.rb +0 -19
  110. data/app/models/easy_ml/column/learners/null.rb +0 -22
  111. data/app/models/easy_ml/column/learners/numeric.rb +0 -33
  112. data/app/models/easy_ml/column/learners/string.rb +0 -15
  113. data/public/easy_ml/assets/assets/Application-B3sRjyMT.css +0 -1
  114. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dfg-nTrB.js +0 -489
  115. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dfg-nTrB.js.map +0 -1
@@ -199,7 +199,7 @@ module EasyML
199
199
  set_default_wandb_project_name unless tuning
200
200
 
201
201
  # Prepare validation data
202
- x_valid, y_valid = dataset.valid(split_ys: true)
202
+ x_valid, y_valid = dataset.valid(split_ys: true, select: dataset.col_order)
203
203
  d_valid = preprocess(x_valid, y_valid)
204
204
 
205
205
  num_iterations = hyperparameters.to_h[:n_estimators]
@@ -217,7 +217,7 @@ module EasyML
217
217
  callbacks << ::XGBoost::EvaluationMonitor.new(period: 1)
218
218
 
219
219
  # Generate batches without loading full dataset
220
- batches = dataset.train(split_ys: true, batch_size: batch_size, batch_start: batch_start, batch_key: batch_key)
220
+ batches = dataset.train(split_ys: true, batch_size: batch_size, batch_start: batch_start, batch_key: batch_key, select: dataset.col_order)
221
221
  prev_xs = []
222
222
  prev_ys = []
223
223
 
@@ -281,9 +281,32 @@ module EasyML
281
281
  return @booster
282
282
  end
283
283
 
284
- def weights
285
- @booster.save_model("tmp/xgboost_model.json")
286
- @booster.get_dump
284
+ def weights(model_file)
285
+ return nil unless model_file.present? && model_file.fit?
286
+
287
+ JSON.parse(model_file.read)
288
+ end
289
+
290
+ def set_weights(model_file, weights)
291
+ raise ArgumentError, "Weights must be provided" unless weights.present?
292
+
293
+ # Create a temp file with the weights
294
+ temp_file = Tempfile.new(["xgboost_weights", ".json"])
295
+ begin
296
+ temp_file.write(weights.to_json)
297
+ temp_file.close
298
+
299
+ # Load the weights into a new booster
300
+ initialize_model do
301
+ attrs = {
302
+ params: hyperparameters.to_h.symbolize_keys.compact,
303
+ model_file: temp_file.path,
304
+ }.compact
305
+ booster_class.new(**attrs)
306
+ end
307
+ ensure
308
+ temp_file.unlink
309
+ end
287
310
  end
288
311
 
289
312
  def predict(xs)
@@ -397,11 +420,12 @@ module EasyML
397
420
 
398
421
  def prepare_data
399
422
  if @d_train.nil?
400
- x_sample, y_sample = dataset.train(split_ys: true, limit: 5)
423
+ col_order = dataset.col_order
424
+ x_sample, y_sample = dataset.train(split_ys: true, limit: 5, select: col_order)
401
425
  preprocess(x_sample, y_sample) # Ensure we fail fast if the dataset is misconfigured
402
- x_train, y_train = dataset.train(split_ys: true)
403
- x_valid, y_valid = dataset.valid(split_ys: true)
404
- x_test, y_test = dataset.test(split_ys: true)
426
+ x_train, y_train = dataset.train(split_ys: true, select: col_order)
427
+ x_valid, y_valid = dataset.valid(split_ys: true, select: col_order)
428
+ x_test, y_test = dataset.test(split_ys: true, select: col_order)
405
429
  @d_train = preprocess(x_train, y_train)
406
430
  @d_valid = preprocess(x_valid, y_valid)
407
431
  @d_test = preprocess(x_test, y_test)
@@ -6,7 +6,6 @@
6
6
  # model_id :bigint
7
7
  # frequency :string not null
8
8
  # at :json not null
9
- # evaluator :json
10
9
  # tuning_enabled :boolean default(FALSE)
11
10
  # tuner_config :json
12
11
  # tuning_frequency :string
@@ -160,6 +159,14 @@ module EasyML
160
159
  }[frequency.to_sym]
161
160
  end
162
161
 
162
+ def to_config
163
+ EasyML::Export::RetrainingJob.to_config(self)
164
+ end
165
+
166
+ def self.from_config(config, model)
167
+ EasyML::Import::RetrainingJob.from_config(config, model)
168
+ end
169
+
163
170
  private
164
171
 
165
172
  def metric_class
@@ -83,7 +83,6 @@ module EasyML
83
83
  completed_at: failed_reasons.none? ? Time.current : nil,
84
84
  error_message: failed_reasons.any? ? failed_reasons&.first : nil,
85
85
  model: training_model,
86
- metrics: training_model.evaluate,
87
86
  best_params: best_params,
88
87
  tuner_job_id: tuner&.id,
89
88
  metadata: tuner&.metadata,
@@ -109,6 +108,7 @@ module EasyML
109
108
  end
110
109
  true
111
110
  rescue => e
111
+ puts EasyML::Event.easy_ml_context(e.backtrace)
112
112
  EasyML::Event.handle_error(self, e)
113
113
  update!(
114
114
  status: "failed",
@@ -150,14 +150,15 @@ module EasyML
150
150
 
151
151
  training_model.dataset.refresh
152
152
  evaluator = retraining_job.evaluator.symbolize_keys
153
- x_true, y_true = training_model.dataset.test(split_ys: true)
154
- y_pred = training_model.predict(x_true)
153
+ x_test, y_test = training_model.dataset.test(split_ys: true)
154
+ y_pred = training_model.predict(x_test)
155
155
 
156
156
  metric = evaluator[:metric].to_sym
157
157
  metrics = EasyML::Core::ModelEvaluator.evaluate(
158
158
  model: training_model,
159
159
  y_pred: y_pred,
160
- y_true: y_true,
160
+ y_true: y_test,
161
+ x_true: x_test,
161
162
  dataset: training_model.dataset.test(all_columns: true),
162
163
  evaluator: evaluator,
163
164
  )
@@ -176,6 +177,7 @@ module EasyML
176
177
 
177
178
  {
178
179
  metric_value: metric_value,
180
+ metrics: metrics,
179
181
  threshold: threshold,
180
182
  threshold_direction: threshold_direction,
181
183
  deployable: deployable,
@@ -75,6 +75,14 @@ module EasyML
75
75
  }
76
76
  end
77
77
 
78
+ def to_config
79
+ EasyML::Export::Splitter.to_config(self)
80
+ end
81
+
82
+ def self.from_config(config, dataset)
83
+ EasyML::Import::Splitter.from_config(config, dataset)
84
+ end
85
+
78
86
  def split(df, &block)
79
87
  adapter.split(df, &block)
80
88
  end
@@ -0,0 +1,6 @@
1
+ module EasyML
2
+ class LineageHistory < ActiveRecord::Base
3
+ self.table_name = "easy_ml_lineage_histories"
4
+ include Historiographer::History
5
+ end
6
+ end
@@ -28,10 +28,16 @@ module EasyML
28
28
 
29
29
  attributes :id, :name, :description, :dataset_id, :datatype, :polars_datatype, :preprocessing_steps,
30
30
  :hidden, :drop_if_null, :sample_values, :statistics, :is_target,
31
- :is_computed, :computed_by, :lineage
31
+ :is_computed, :computed_by
32
32
 
33
33
  attribute :required do |object|
34
34
  object.required?
35
35
  end
36
+
37
+ attribute :lineage do |column|
38
+ column.lineages.map do |lineage|
39
+ LineageSerializer.new(lineage).serializable_hash.dig(:data, :attributes)
40
+ end
41
+ end
36
42
  end
37
43
  end
@@ -59,7 +59,8 @@ module EasyML
59
59
  end
60
60
 
61
61
  attribute :columns do |dataset|
62
- dataset.columns.order(:id).map do |column|
62
+ col_order = dataset.col_order
63
+ dataset.columns.sort_by { |c| col_order.index(c.name) || Float::INFINITY }.map do |column|
63
64
  ColumnSerializer.new(column).serializable_hash.dig(:data, :attributes)
64
65
  end
65
66
  end
@@ -0,0 +1,9 @@
1
+ require "jsonapi/serializer"
2
+
3
+ module EasyML
4
+ class LineageSerializer
5
+ include JSONAPI::Serializer
6
+
7
+ attributes :id, :key, :description, :occurred_at
8
+ end
9
+ end
data/config/routes.rb CHANGED
@@ -18,10 +18,15 @@ EasyML::Engine.routes.draw do
18
18
  member do
19
19
  post :train
20
20
  post :abort
21
+ get :download
22
+ post :upload
21
23
  get :retraining_runs, to: "retraining_runs#index"
22
24
  end
25
+ collection do
26
+ get "new", as: "new"
27
+ post :upload
28
+ end
23
29
  resources :deploys, only: [:create]
24
- get "new", on: :collection, as: "new"
25
30
  end
26
31
 
27
32
  resources :retraining_runs, only: [:show]
@@ -30,6 +35,7 @@ EasyML::Engine.routes.draw do
30
35
  resources :datasources, as: :easy_ml_datasources do
31
36
  member do
32
37
  post :sync
38
+ post :abort
33
39
  end
34
40
  end
35
41
 
@@ -37,6 +43,12 @@ EasyML::Engine.routes.draw do
37
43
  resources :datasets, as: :easy_ml_datasets do
38
44
  member do
39
45
  post :refresh
46
+ post :abort
47
+ get :download
48
+ post :upload
49
+ end
50
+ collection do
51
+ post :upload
40
52
  end
41
53
  end
42
54
 
@@ -4,7 +4,7 @@ module EasyML
4
4
  module Adapters
5
5
  class BaseAdapter
6
6
  attr_accessor :config, :project_name, :tune_started_at, :model,
7
- :x_true, :y_true, :metadata, :model
7
+ :x_valid, :y_valid, :metadata, :model
8
8
 
9
9
  def initialize(options = {})
10
10
  @model = options[:model]
@@ -12,8 +12,8 @@ module EasyML
12
12
  @project_name = options[:project_name]
13
13
  @tune_started_at = options[:tune_started_at]
14
14
  @model = options[:model]
15
- @x_true = options[:x_true]
16
- @y_true = options[:y_true]
15
+ @x_valid = options[:x_valid]
16
+ @y_valid = options[:y_valid]
17
17
  @metadata = options[:metadata] || {}
18
18
  end
19
19
 
@@ -6,7 +6,7 @@ module EasyML
6
6
  class Tuner
7
7
  attr_accessor :model, :dataset, :project_name, :task, :config,
8
8
  :metrics, :objective, :n_trials, :direction, :evaluator,
9
- :study, :results, :adapter, :tune_started_at, :x_true, :y_true,
9
+ :study, :results, :adapter, :tune_started_at, :x_valid, :y_valid,
10
10
  :project_name, :job, :current_run, :trial_enumerator, :progress_block,
11
11
  :tuner_job, :dataset
12
12
 
@@ -34,7 +34,7 @@ module EasyML
34
34
  config: config,
35
35
  project_name: project_name,
36
36
  tune_started_at: nil, # This will be set during tune
37
- y_true: nil, # This will be set during tune
37
+ y_valid: nil, # This will be set during tune
38
38
  )
39
39
  end
40
40
  end
@@ -70,17 +70,16 @@ module EasyML
70
70
  @job = tuner_job
71
71
  @study = Optuna::Study.new(direction: direction)
72
72
  @results = []
73
- model.evaluator = evaluator if evaluator.present?
74
73
  model.task = task
75
74
 
76
75
  model.dataset.refresh if model.dataset.needs_refresh?
77
- x_true, y_true = model.dataset.test(split_ys: true)
78
- self.x_true = x_true
79
- self.y_true = y_true
80
- self.dataset = model.dataset.test(all_columns: true)
76
+ x_valid, y_valid = model.dataset.valid(split_ys: true, select: model.dataset.col_order)
77
+ self.x_valid = x_valid
78
+ self.y_valid = y_valid
79
+ self.dataset = model.dataset.valid(all_columns: true)
81
80
  adapter.tune_started_at = tune_started_at
82
- adapter.y_true = y_true
83
- adapter.x_true = x_true
81
+ adapter.x_valid = x_valid
82
+ adapter.y_valid = y_valid
84
83
 
85
84
  model.prepare_data unless model.batch_mode
86
85
  model.prepare_callbacks(self)
@@ -99,6 +98,7 @@ module EasyML
99
98
  @results.push(result)
100
99
  @study.tell(@current_trial, result)
101
100
  rescue StandardError => e
101
+ puts EasyML::Event.easy_ml_context(e.backtrace)
102
102
  @tuner_run.update!(status: :failed, hyperparameters: {})
103
103
  puts "Optuna failed with: #{e.message}"
104
104
  raise e
@@ -118,6 +118,7 @@ module EasyML
118
118
 
119
119
  best_run&.hyperparameters
120
120
  rescue StandardError => e
121
+ puts EasyML::Event.easy_ml_context(e.backtrace)
121
122
  tuner_job&.update!(status: :failed, completed_at: Time.current)
122
123
  raise e
123
124
  end
@@ -137,9 +138,9 @@ module EasyML
137
138
  end
138
139
  end
139
140
 
140
- y_pred = model.predict(x_true)
141
+ y_pred = model.predict(x_valid)
141
142
  model.metrics = metrics
142
- metrics = model.evaluate(y_pred: y_pred, y_true: y_true, x_true: x_true, dataset: dataset)
143
+ metrics = model.evaluate(y_pred: y_pred, y_true: y_valid, x_true: x_valid, dataset: dataset)
143
144
  metric = metrics.symbolize_keys.dig(model.evaluator[:metric].to_sym)
144
145
 
145
146
  puts metrics
@@ -2,7 +2,7 @@ require_relative "date_converter"
2
2
 
3
3
  module EasyML
4
4
  module Data
5
- module PolarsColumn
5
+ class PolarsColumn
6
6
  TYPE_MAP = {
7
7
  float: Polars::Float64,
8
8
  integer: Polars::Int64,
@@ -14,132 +14,181 @@ module EasyML
14
14
  categorical: Polars::Categorical,
15
15
  null: Polars::Null,
16
16
  }
17
- POLARS_MAP = TYPE_MAP.invert.stringify_keys
17
+ POLARS_MAP = {
18
+ Polars::Float64 => :float,
19
+ Polars::Int64 => :integer,
20
+ Polars::Float32 => :float,
21
+ Polars::Int32 => :integer,
22
+ Polars::Boolean => :boolean,
23
+ Polars::Datetime => :datetime,
24
+ Polars::Date => :date,
25
+ Polars::String => :string,
26
+ Polars::Categorical => :categorical,
27
+ Polars::Null => :null,
28
+ }.stringify_keys
29
+ include EasyML::Timing
30
+
18
31
  class << self
19
32
  def polars_to_sym(polars_type)
20
- POLARS_MAP.dig(polars_type.class.to_s)
33
+ new.polars_to_sym(polars_type)
34
+ end
35
+
36
+ def determine_type(series, polars_type = false)
37
+ new.determine_type(series, polars_type)
21
38
  end
22
39
 
23
40
  def parse_polars_dtype(dtype_string)
24
- case dtype_string
25
- when /^Polars::Datetime/
26
- time_unit = dtype_string[/time_unit: "(.*?)"/, 1]
27
- time_zone = dtype_string[/time_zone: (.*)?\)/, 1]
28
- time_zone = time_zone == "nil" ? nil : time_zone&.delete('"')
29
- Polars::Datetime.new(time_unit, time_zone)
30
- when /^Polars::/
31
- Polars.const_get(dtype_string.split("::").last)
32
- else
33
- raise ArgumentError, "Unknown Polars data type: #{dtype_string}"
34
- end
41
+ new.parse_polars_dtype(dtype_string)
42
+ end
43
+
44
+ def get_polars_type(dtype)
45
+ new.get_polars_type(dtype)
46
+ end
47
+
48
+ def polars_dtype_to_sym(dtype_string)
49
+ new.polars_dtype_to_sym(dtype_string)
35
50
  end
36
51
 
37
52
  def sym_to_polars(symbol)
38
- TYPE_MAP.dig(symbol)
53
+ new.sym_to_polars(symbol)
39
54
  end
55
+ end
40
56
 
41
- # Determines the semantic type of a field based on its data
42
- # @param series [Polars::Series] The series to analyze
43
- # @return [Symbol] One of :numeric, :datetime, :categorical, or :text
44
- def determine_type(series, polars_type = false)
45
- dtype = series.dtype
46
-
47
- if dtype.is_a?(Polars::Utf8)
48
- string_type = determine_string_type(series)
49
- if string_type == :datetime
50
- date = EasyML::Data::DateConverter.maybe_convert_date(series)
51
- return polars_type ? date[date.columns.first].dtype : :datetime
52
- end
53
- end
57
+ def polars_to_sym(polars_type)
58
+ return nil if polars_type.nil?
59
+
60
+ if polars_type.is_a?(Polars::DataType)
61
+ POLARS_MAP.dig(polars_type.class.to_s)
62
+ else
63
+ polars_type.to_sym if TYPE_MAP.keys.include?(polars_type.to_sym)
64
+ end
65
+ end
54
66
 
55
- type_name = case dtype
56
- when Polars::Float64
57
- :float
58
- when Polars::Int64
59
- :integer
60
- when Polars::Datetime
61
- :datetime
62
- when Polars::Date
63
- :date
64
- when Polars::Boolean
65
- :boolean
66
- when Polars::Utf8
67
- determine_string_type(series)
68
- when Polars::Null
69
- :null
70
- else
71
- :categorical
72
- end
73
-
74
- polars_type ? sym_to_polars(type_name) : type_name
67
+ def parse_polars_dtype(dtype_string)
68
+ case dtype_string
69
+ when /^Polars::Datetime/
70
+ time_unit = dtype_string[/time_unit: "(.*?)"/, 1]
71
+ time_zone = dtype_string[/time_zone: (.*)?\)/, 1]
72
+ time_zone = time_zone == "nil" ? nil : time_zone&.delete('"')
73
+ Polars::Datetime.new(time_unit, time_zone)
74
+ when /^Polars::/
75
+ Polars.const_get(dtype_string.split("::").last)
76
+ else
77
+ raise ArgumentError, "Unknown Polars data type: #{dtype_string}"
75
78
  end
79
+ end
76
80
 
77
- # Determines if a string field is a date, text, or categorical
78
- # @param series [Polars::Series] The string series to analyze
79
- # @return [Symbol] One of :datetime, :text, or :categorical
80
- def determine_string_type(series)
81
- if EasyML::Data::DateConverter.maybe_convert_date(Polars::DataFrame.new({ temp: series }),
82
- :temp)[:temp].dtype.is_a?(Polars::Datetime)
81
+ def sym_to_polars(symbol)
82
+ TYPE_MAP.dig(symbol.to_sym)
83
+ end
84
+
85
+ # Determines the semantic type of a field based on its data
86
+ # @param series [Polars::Series] The series to analyze
87
+ # @return [Symbol] One of :numeric, :datetime, :categorical, or :text
88
+ def determine_type(series, polars_type = false)
89
+ dtype = series.dtype
90
+
91
+ if dtype.is_a?(Polars::Utf8)
92
+ string_type = determine_string_type(series)
93
+ if string_type == :datetime
94
+ date = EasyML::Data::DateConverter.maybe_convert_date(series)
95
+ return polars_type ? date[date.columns.first].dtype : :datetime
96
+ end
97
+ end
98
+
99
+ type_name = case dtype
100
+ when Polars::Float64
101
+ :float
102
+ when Polars::Int64
103
+ :integer
104
+ when Polars::Datetime
83
105
  :datetime
106
+ when Polars::Date
107
+ :date
108
+ when Polars::Boolean
109
+ :boolean
110
+ when Polars::Utf8
111
+ determine_string_type(series)
112
+ when Polars::Null
113
+ :null
84
114
  else
85
- categorical_or_text?(series)
115
+ :categorical
86
116
  end
117
+
118
+ polars_type ? sym_to_polars(type_name) : type_name
119
+ end
120
+
121
+ measure_method_timing :determine_type
122
+
123
+ # Determines if a string field is a date, text, or categorical
124
+ # @param series [Polars::Series] The string series to analyze
125
+ # @return [Symbol] One of :datetime, :text, or :categorical
126
+ def determine_string_type(series)
127
+ if EasyML::Data::DateConverter.maybe_convert_date(Polars::DataFrame.new({ temp: series }),
128
+ :temp)[:temp].dtype.is_a?(Polars::Datetime)
129
+ :datetime
130
+ else
131
+ categorical_or_text?(series)
87
132
  end
133
+ end
88
134
 
89
- # Determines if a string field is categorical or free text
90
- # @param series [Polars::Series] The string series to analyze
91
- # @return [Symbol] Either :categorical or :text
92
- def categorical_or_text?(series)
93
- return :categorical if series.null_count == series.len
135
+ measure_method_timing :determine_string_type
94
136
 
95
- # Get non-null count for percentage calculations
96
- non_null_count = series.len - series.null_count
97
- return :categorical if non_null_count == 0
137
+ # Determines if a string field is categorical or free text
138
+ # @param series [Polars::Series] The string series to analyze
139
+ # @return [Symbol] Either :categorical or :text
140
+ def categorical_or_text?(series)
141
+ return :categorical if series.null_count == series.len
98
142
 
99
- # Get value counts as percentages
100
- value_counts = series.value_counts(parallel: true)
101
- percentages = value_counts.with_column(
102
- (value_counts["count"] / non_null_count.to_f * 100).alias("percentage")
103
- )
143
+ # Get non-null count for percentage calculations
144
+ non_null_count = series.len - series.null_count
145
+ return :categorical if non_null_count == 0
104
146
 
105
- # Check if any category represents more than 10% of the data
106
- max_percentage = percentages["percentage"].max
107
- return :text if max_percentage < 10.0
147
+ # Get value counts as percentages
148
+ value_counts = series.value_counts(parallel: true)
149
+ percentages = value_counts.with_column(
150
+ (value_counts["count"] / non_null_count.to_f * 100).alias("percentage")
151
+ )
108
152
 
109
- # Calculate average percentage per category
110
- avg_percentage = 100.0 / series.n_unique
153
+ # Check if any category represents more than 10% of the data
154
+ max_percentage = percentages["percentage"].max
155
+ return :text if max_percentage < 10.0
111
156
 
112
- # If average category represents less than 1% of data, it's likely text
113
- avg_percentage < 1.0 ? :text : :categorical
114
- end
157
+ # Calculate average percentage per category
158
+ avg_percentage = 100.0 / series.n_unique
115
159
 
116
- # Returns whether the field type is numeric
117
- # @param field_type [Symbol] The field type to check
118
- # @return [Boolean]
119
- def numeric?(field_type)
120
- field_type == :numeric
121
- end
160
+ # If average category represents less than 1% of data, it's likely text
161
+ avg_percentage < 1.0 ? :text : :categorical
162
+ end
122
163
 
123
- # Returns whether the field type is categorical
124
- # @param field_type [Symbol] The field type to check
125
- # @return [Boolean]
126
- def categorical?(field_type)
127
- field_type == :categorical
128
- end
164
+ measure_method_timing :categorical_or_text?
129
165
 
130
- # Returns whether the field type is datetime
131
- # @param field_type [Symbol] The field type to check
132
- # @return [Boolean]
133
- def datetime?(field_type)
134
- field_type == :datetime
135
- end
166
+ # Returns whether the field type is numeric
167
+ # @param field_type [Symbol] The field type to check
168
+ # @return [Boolean]
169
+ def numeric?(field_type)
170
+ field_type == :numeric
171
+ end
136
172
 
137
- # Returns whether the field type is text
138
- # @param field_type [Symbol] The field type to check
139
- # @return [Boolean]
140
- def text?(field_type)
141
- field_type == :text
142
- end
173
+ # Returns whether the field type is categorical
174
+ # @param field_type [Symbol] The field type to check
175
+ # @return [Boolean]
176
+ def categorical?(field_type)
177
+ field_type == :categorical
178
+ end
179
+
180
+ # Returns whether the field type is datetime
181
+ # @param field_type [Symbol] The field type to check
182
+ # @return [Boolean]
183
+ def datetime?(field_type)
184
+ field_type == :datetime
185
+ end
186
+
187
+ # Returns whether the field type is text
188
+ # @param field_type [Symbol] The field type to check
189
+ # @return [Boolean]
190
+ def text?(field_type)
191
+ field_type == :text
143
192
  end
144
193
  end
145
194
  end
@@ -88,17 +88,20 @@ module EasyML
88
88
  end
89
89
 
90
90
  def query(files = nil, drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false,
91
- batch_size: nil, batch_start: nil, batch_key: nil, &block)
91
+ batch_size: nil, batch_start: nil, batch_key: nil, lazy: false, &block)
92
92
  files ||= self.files
93
93
  PolarsReader.query(files, drop_cols: drop_cols, filter: filter, limit: limit,
94
94
  select: select, unique: unique, sort: sort, descending: descending,
95
- batch_size: batch_size, batch_start: batch_start, batch_key: batch_key, &block)
95
+ batch_size: batch_size, batch_start: batch_start, batch_key: batch_key, lazy: lazy, &block)
96
96
  end
97
97
 
98
98
  def self.query(files, drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false,
99
- batch_size: nil, batch_start: nil, batch_key: nil, &block)
100
- return query_files(files, drop_cols: drop_cols, filter: filter, limit: limit, select: select,
101
- unique: unique, sort: sort, descending: descending).collect unless batch_size.present?
99
+ batch_size: nil, batch_start: nil, batch_key: nil, lazy: false, &block)
100
+ unless batch_size.present?
101
+ result = query_files(files, drop_cols: drop_cols, filter: filter, limit: limit, select: select,
102
+ unique: unique, sort: sort, descending: descending)
103
+ return lazy ? result : result.collect
104
+ end
102
105
 
103
106
  return batch_enumerator(files, drop_cols: drop_cols, filter: filter, limit: limit, select: select, unique: unique, sort: sort, descending: descending,
104
107
  batch_size: batch_size, batch_start: batch_start, batch_key: batch_key) unless block_given?