easy_ml 0.2.0.pre.rc57 → 0.2.0.pre.rc60

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/apis_controller.rb +8 -0
  3. data/app/controllers/easy_ml/application_controller.rb +4 -0
  4. data/app/controllers/easy_ml/datasets_controller.rb +32 -1
  5. data/app/controllers/easy_ml/models_controller.rb +3 -0
  6. data/app/controllers/easy_ml/predictions_controller.rb +10 -5
  7. data/app/frontend/components/DatasetPreview.tsx +50 -19
  8. data/app/frontend/components/ModelForm.tsx +1 -1
  9. data/app/frontend/components/SearchableSelect.tsx +0 -1
  10. data/app/frontend/components/dataset/ColumnConfigModal.tsx +7 -1
  11. data/app/frontend/components/dataset/ColumnFilters.tsx +37 -3
  12. data/app/frontend/components/dataset/ColumnList.tsx +14 -2
  13. data/app/frontend/components/dataset/PreprocessingConfig.tsx +82 -21
  14. data/app/frontend/pages/DatasourcesPage.tsx +0 -2
  15. data/app/frontend/types/dataset.ts +3 -0
  16. data/app/jobs/easy_ml/compute_feature_job.rb +0 -2
  17. data/app/jobs/easy_ml/refresh_dataset_job.rb +0 -6
  18. data/app/models/easy_ml/column/imputers/base.rb +89 -0
  19. data/app/models/easy_ml/column/imputers/categorical.rb +35 -0
  20. data/app/models/easy_ml/column/imputers/clip.rb +30 -0
  21. data/app/models/easy_ml/column/imputers/constant.rb +27 -0
  22. data/app/models/easy_ml/column/imputers/ffill.rb +29 -0
  23. data/app/models/easy_ml/column/imputers/imputer.rb +103 -0
  24. data/app/models/easy_ml/column/imputers/mean.rb +27 -0
  25. data/app/models/easy_ml/column/imputers/median.rb +27 -0
  26. data/app/models/easy_ml/column/imputers/most_frequent.rb +27 -0
  27. data/app/models/easy_ml/column/imputers/null_imputer.rb +15 -0
  28. data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +30 -0
  29. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +78 -0
  30. data/app/models/easy_ml/column/imputers/today.rb +20 -0
  31. data/app/models/easy_ml/column/imputers.rb +126 -0
  32. data/app/models/easy_ml/column/learner.rb +18 -0
  33. data/app/models/easy_ml/column/learners/base.rb +103 -0
  34. data/app/models/easy_ml/column/learners/boolean.rb +11 -0
  35. data/app/models/easy_ml/column/learners/categorical.rb +51 -0
  36. data/app/models/easy_ml/column/learners/datetime.rb +19 -0
  37. data/app/models/easy_ml/column/learners/null.rb +22 -0
  38. data/app/models/easy_ml/column/learners/numeric.rb +33 -0
  39. data/app/models/easy_ml/column/learners/string.rb +15 -0
  40. data/app/models/easy_ml/column/lineage/base.rb +22 -0
  41. data/app/models/easy_ml/column/lineage/computed_by_feature.rb +23 -0
  42. data/app/models/easy_ml/column/lineage/preprocessed.rb +23 -0
  43. data/app/models/easy_ml/column/lineage/raw_dataset.rb +23 -0
  44. data/app/models/easy_ml/column/lineage.rb +28 -0
  45. data/app/models/easy_ml/column/selector.rb +96 -0
  46. data/app/models/easy_ml/column.rb +344 -39
  47. data/app/models/easy_ml/column_history.rb +31 -20
  48. data/app/models/easy_ml/column_list.rb +79 -62
  49. data/app/models/easy_ml/dataset.rb +156 -104
  50. data/app/models/easy_ml/dataset_history.rb +23 -23
  51. data/app/models/easy_ml/datasource.rb +4 -0
  52. data/app/models/easy_ml/datasource_history.rb +1 -0
  53. data/app/models/easy_ml/datasources/file_datasource.rb +1 -1
  54. data/app/models/easy_ml/datasources/polars_datasource.rb +6 -12
  55. data/app/models/easy_ml/datasources/s3_datasource.rb +1 -1
  56. data/app/models/easy_ml/feature.rb +29 -10
  57. data/app/models/easy_ml/feature_history.rb +12 -0
  58. data/app/models/easy_ml/feature_list.rb +15 -0
  59. data/app/models/easy_ml/model.rb +25 -4
  60. data/app/models/easy_ml/model_history.rb +1 -0
  61. data/app/models/easy_ml/retraining_run.rb +1 -0
  62. data/app/serializers/easy_ml/column_serializer.rb +11 -1
  63. data/app/serializers/easy_ml/dataset_serializer.rb +23 -2
  64. data/config/initializers/enumerable.rb +17 -0
  65. data/config/initializers/inflections.rb +2 -0
  66. data/config/routes.rb +3 -0
  67. data/lib/easy_ml/core/tuner.rb +1 -1
  68. data/lib/easy_ml/data/date_converter.rb +137 -30
  69. data/lib/easy_ml/data/polars_column.rb +17 -0
  70. data/lib/easy_ml/data/polars_in_memory.rb +30 -0
  71. data/lib/easy_ml/data/polars_reader.rb +20 -1
  72. data/lib/easy_ml/data/splits/in_memory_split.rb +7 -5
  73. data/lib/easy_ml/data/splits/split.rb +2 -1
  74. data/lib/easy_ml/data/synced_directory.rb +5 -3
  75. data/lib/easy_ml/data.rb +1 -2
  76. data/lib/easy_ml/feature_store.rb +33 -22
  77. data/lib/easy_ml/predict.rb +13 -2
  78. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +7 -0
  79. data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +18 -0
  80. data/lib/easy_ml/railtie/templates/migration/add_default_to_is_target.rb.tt +6 -0
  81. data/lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt +9 -0
  82. data/lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt +13 -0
  83. data/lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt +21 -0
  84. data/lib/easy_ml/railtie/templates/migration/add_slug_to_easy_ml_models.rb.tt +20 -0
  85. data/lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt +11 -0
  86. data/lib/easy_ml/version.rb +1 -1
  87. data/lib/tasks/profile.rake +40 -0
  88. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  89. data/public/easy_ml/assets/assets/Application-BbFobaXt.css +1 -0
  90. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js +489 -0
  91. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map +1 -0
  92. metadata +45 -10
  93. data/app/models/easy_ml/adapters/base_adapter.rb +0 -45
  94. data/app/models/easy_ml/adapters/polars_adapter.rb +0 -77
  95. data/lib/easy_ml/data/preprocessor.rb +0 -383
  96. data/lib/easy_ml/data/simple_imputer.rb +0 -255
  97. data/lib/easy_ml/data/statistics_learner.rb +0 -128
  98. data/public/easy_ml/assets/assets/Application-BUsRR6b6.css +0 -1
  99. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DTZ2348z.js +0 -474
  100. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DTZ2348z.js.map +0 -1
@@ -55,6 +55,7 @@ module EasyML
55
55
  end
56
56
 
57
57
  belongs_to :dataset, class_name: "EasyML::Dataset"
58
+ has_many :columns, class_name: "EasyML::Column", dependent: :destroy
58
59
 
59
60
  validates :feature_class, presence: true
60
61
  validates :feature_position, presence: true, numericality: { only_integer: true, greater_than_or_equal_to: 0 }
@@ -72,7 +73,7 @@ module EasyML
72
73
  end
73
74
 
74
75
  # Combine all conditions with OR
75
- where(id: where(needs_fit: true).or(where(conditions.join(" OR "))).select { |f| f.adapter.respond_to?(:fit) }.map(&:id))
76
+ where(id: where(needs_fit: true).or(where(conditions.join(" OR "))).map(&:id))
76
77
  }
77
78
  scope :never_applied, -> { where(applied_at: nil) }
78
79
  scope :never_fit, -> do
@@ -81,6 +82,7 @@ module EasyML
81
82
  where(id: fittable.map(&:id))
82
83
  end
83
84
  scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }
85
+ scope :ready_to_apply, -> { where.not(id: needs_fit.map(&:id)) }
84
86
 
85
87
  before_save :apply_defaults, if: :new_record?
86
88
  before_save :update_sha
@@ -165,6 +167,13 @@ module EasyML
165
167
  end
166
168
  end
167
169
 
170
+ def computes_columns
171
+ unless adapter.respond_to?(:computes_columns)
172
+ raise "Feature #{feature_class} must declare which columns it computes using the :computes_columns method"
173
+ end
174
+ adapter.computes_columns
175
+ end
176
+
168
177
  def build_batches
169
178
  if batchable?
170
179
  batch
@@ -216,8 +225,11 @@ module EasyML
216
225
  def fit(features: [self], async: false)
217
226
  ordered_features = features.sort_by(&:feature_position)
218
227
  jobs = ordered_features.map(&:build_batches)
228
+ job_count = jobs.dup.flatten.size
219
229
 
220
- if async
230
+ # This is very important! For whatever reason, Resque BatchJob does not properly
231
+ # handle batch finished callbacks for batch size = 1
232
+ if async && job_count > 1
221
233
  EasyML::ComputeFeatureJob.enqueue_ordered_batches(jobs)
222
234
  else
223
235
  jobs.flatten.each do |job|
@@ -233,13 +245,14 @@ module EasyML
233
245
  if batch_args.key?(:batch_start)
234
246
  actually_fit_batch(batch_args)
235
247
  else
236
- actually_fit_batch(get_batch_args(**batch_args))
248
+ batch_args = get_batch_args(**batch_args)
249
+ actually_fit_batch(batch_args)
237
250
  end
238
251
  end
239
252
 
240
253
  # Transform a single batch, used for testing the user's feature implementation
241
254
  def transform_batch(df = nil, batch_args = {})
242
- if df.present?
255
+ if df.is_a?(Polars::DataFrame)
243
256
  actually_transform_batch(df)
244
257
  else
245
258
  actually_transform_batch(build_batch(get_batch_args(**batch_args)))
@@ -281,12 +294,14 @@ module EasyML
281
294
  batch_args.symbolize_keys!
282
295
 
283
296
  if adapter.respond_to?(:batch)
284
- batch_df = adapter.fit(dataset.raw, self, batch_args)
297
+ df = dataset.raw
285
298
  else
286
299
  df = build_batch(batch_args)
287
- batch_df = adapter.fit(df, self, batch_args)
288
300
  end
289
301
  end
302
+ return if df.blank?
303
+
304
+ batch_df = adapter.fit(df, self, batch_args)
290
305
  if batch_df.present?
291
306
  store(batch_df)
292
307
  else
@@ -296,10 +311,14 @@ module EasyML
296
311
  end
297
312
 
298
313
  def actually_transform_batch(df)
299
- return nil unless df.present?
300
- return df if adapter.respond_to?(:fit) && feature_store.empty?
314
+ return nil unless df.is_a?(Polars::DataFrame)
315
+ return df if !adapter.respond_to?(:transform) && feature_store.empty?
301
316
 
317
+ df_len_was = df.shape[0]
302
318
  result = adapter.transform(df, self)
319
+ raise "Feature '#{name}' must return a Polars::DataFrame, got #{result.class}" unless result.is_a?(Polars::DataFrame)
320
+ df_len_now = result.shape[0]
321
+ raise "Feature #{feature_class}#transform: output size must match input size! Input size: #{df_len_now}, output size: #{df_len_was}." if df_len_now != df_len_was
303
322
  update!(applied_at: Time.current)
304
323
  result
305
324
  end
@@ -377,8 +396,8 @@ module EasyML
377
396
  feature_store.list_partitions
378
397
  end
379
398
 
380
- def query(filter: nil)
381
- feature_store.query(filter: filter)
399
+ def query(**kwargs)
400
+ feature_store.query(**kwargs)
382
401
  end
383
402
 
384
403
  def store(df)
@@ -31,6 +31,18 @@ module EasyML
31
31
 
32
32
  after_find :download_remote_files
33
33
  scope :ordered, -> { order(feature_position: :asc) }
34
+ scope :ready_to_apply, -> { where.not(id: needs_fit.map(&:id)) }
35
+ scope :has_changes, lambda {
36
+ none
37
+ }
38
+ scope :never_applied, -> { where(applied_at: nil) }
39
+ scope :never_fit, -> do
40
+ fittable = where(fit_at: nil)
41
+ fittable = fittable.select { |f| f.adapter.respond_to?(:fit) }
42
+ where(id: fittable.map(&:id))
43
+ end
44
+ scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }
45
+ scope :ready_to_apply, -> { where.not(id: needs_fit.map(&:id)) }
34
46
 
35
47
  def download_remote_files
36
48
  feature_store&.download
@@ -0,0 +1,15 @@
1
+ module EasyML
2
+ module FeatureList
3
+ def feature_list
4
+ self
5
+ end
6
+
7
+ def dataset
8
+ proxy_association.owner
9
+ end
10
+
11
+ def computed_column_names
12
+ flat_map(&:computes_columns).uniq
13
+ end
14
+ end
15
+ end
@@ -17,6 +17,7 @@
17
17
  # is_training :boolean
18
18
  # created_at :datetime not null
19
19
  # updated_at :datetime not null
20
+ # slug :string not null
20
21
  #
21
22
  require_relative "models/hyperparameters"
22
23
 
@@ -66,6 +67,7 @@ module EasyML
66
67
  after_initialize :bump_version, if: -> { new_record? }
67
68
  after_initialize :set_defaults, if: -> { new_record? }
68
69
  before_save :save_model_file, if: -> { is_fit? && !is_history_class? && model_changed? && !@skip_save_model_file }
70
+ before_validation :set_slug, if: :name_changed?
69
71
 
70
72
  VALID_TASKS = %i[regression classification].freeze
71
73
 
@@ -91,6 +93,7 @@ module EasyML
91
93
  }
92
94
  validates :model_type, inclusion: { in: MODEL_NAMES }
93
95
  validates :dataset_id, presence: true
96
+ validates :slug, presence: true, uniqueness: true
94
97
  validate :validate_metrics_allowed
95
98
  before_save :set_root_dir
96
99
 
@@ -189,6 +192,7 @@ module EasyML
189
192
  evaluator: evaluator,
190
193
  model: self,
191
194
  dataset: dataset,
195
+ metrics: metrics,
192
196
  }.compact
193
197
  tuner.merge!(extra_params)
194
198
  tuner_instance = EasyML::Core::Tuner.new(tuner)
@@ -307,7 +311,6 @@ module EasyML
307
311
 
308
312
  dataset.refresh
309
313
  adapter.fit(tuning: tuning, x_train: x_train, y_train: y_train, x_valid: x_valid, y_valid: y_valid, &progress_block)
310
- @is_fit = true
311
314
  end
312
315
 
313
316
  def batch_args
@@ -334,11 +337,8 @@ module EasyML
334
337
 
335
338
  def fit_in_batches(tuning: false, batch_size: nil, batch_overlap: nil, batch_key: nil, checkpoint_dir: Rails.root.join("tmp", "xgboost_checkpoints"), &progress_block)
336
339
  adapter.fit_in_batches(tuning: tuning, batch_size: batch_size, batch_overlap: batch_overlap, batch_key: batch_key, checkpoint_dir: checkpoint_dir, &progress_block)
337
- @is_fit = true
338
340
  end
339
341
 
340
- attr_accessor :is_fit
341
-
342
342
  def is_fit?
343
343
  model_file = get_model_file
344
344
  return true if model_file.present? && model_file.fit?
@@ -447,6 +447,21 @@ module EasyML
447
447
  )
448
448
  end
449
449
 
450
+ include Rails.application.routes.mounted_helpers
451
+
452
+ def api_fields
453
+ {
454
+ url: EasyML::Engine.routes.url_helpers.predictions_path,
455
+ method: "POST",
456
+ data: {
457
+ model: slug,
458
+ input: dataset.columns.api_inputs.sort_by_required.map(&:to_api).each_with_object({}) do |field, hash|
459
+ hash[field[:name]] = field.except(:name)
460
+ end,
461
+ },
462
+ }
463
+ end
464
+
450
465
  class CannotdeployError < StandardError
451
466
  end
452
467
 
@@ -606,6 +621,12 @@ module EasyML
606
621
  errors.add(:metrics,
607
622
  "don't know how to handle #{"metrics".pluralize(unknown_metrics)} #{unknown_metrics.join(", ")}, use EasyML::Core::ModelEvaluator.register(:name, Evaluator, :regression|:classification)")
608
623
  end
624
+
625
+ def set_slug
626
+ if slug.nil? && name.present?
627
+ self.slug = name.gsub(/\s/, "_").downcase
628
+ end
629
+ end
609
630
  end
610
631
  end
611
632
 
@@ -22,6 +22,7 @@
22
22
  # history_ended_at :datetime
23
23
  # history_user_id :integer
24
24
  # snapshot_id :string
25
+ # slug :string
25
26
  #
26
27
  module EasyML
27
28
  class ModelHistory < ActiveRecord::Base
@@ -158,6 +158,7 @@ module EasyML
158
158
  model: training_model,
159
159
  y_pred: y_pred,
160
160
  y_true: y_true,
161
+ dataset: training_model.dataset.test(all_columns: true),
161
162
  evaluator: evaluator,
162
163
  )
163
164
  metric_value = metrics[metric]
@@ -19,9 +19,19 @@
19
19
  #
20
20
  module EasyML
21
21
  class ColumnSerializer
22
+ class SmallSerializer
23
+ include JSONAPI::Serializer
24
+ attributes :id, :name
25
+ end
26
+
22
27
  include JSONAPI::Serializer
23
28
 
24
29
  attributes :id, :name, :description, :dataset_id, :datatype, :polars_datatype, :preprocessing_steps,
25
- :hidden, :drop_if_null, :sample_values, :statistics, :is_target
30
+ :hidden, :drop_if_null, :sample_values, :statistics, :is_target,
31
+ :is_computed, :computed_by, :lineage
32
+
33
+ attribute :required do |object|
34
+ object.required?
35
+ end
26
36
  end
27
37
  end
@@ -24,6 +24,27 @@ require_relative "./column_serializer"
24
24
  #
25
25
  module EasyML
26
26
  class DatasetSerializer
27
+ class SmallSerializer
28
+ include JSONAPI::Serializer
29
+
30
+ attributes :id, :name, :description, :target, :num_rows, :status,
31
+ :datasource_id, :preprocessing_steps, :workflow_status, :statistics
32
+
33
+ attribute :columns do |dataset|
34
+ dataset.columns.order(:id).map do |column|
35
+ ColumnSerializer::SmallSerializer.new(column).serializable_hash.dig(:data, :attributes)
36
+ end
37
+ end
38
+ attribute :stacktrace do |object|
39
+ if !object.failed? || object.events.empty?
40
+ nil
41
+ else
42
+ last_event = object.events.where(status: :failed).order(id: :desc).limit(1).last
43
+ last_event&.stacktrace
44
+ end
45
+ end
46
+ end
47
+
27
48
  include JSONAPI::Serializer
28
49
 
29
50
  attributes :id, :name, :description, :target, :num_rows, :status,
@@ -47,7 +68,7 @@ module EasyML
47
68
  if dataset.workflow_status.to_sym == :analyzing
48
69
  nil
49
70
  else
50
- dataset.data(limit: 10, all_columns: true)&.to_hashes
71
+ dataset.data(limit: 10, all_columns: true, refresh: false)&.to_hashes || dataset.raw.data(limit: 10, all_columns: true).to_hashes
51
72
  end
52
73
  end
53
74
 
@@ -62,7 +83,7 @@ module EasyML
62
83
  end
63
84
 
64
85
  attribute :needs_refresh do |dataset|
65
- dataset.needs_refresh?
86
+ dataset.needs_refresh?(exclude: [:datasource_needs_refresh])
66
87
  end
67
88
 
68
89
  attribute :stacktrace do |object|
@@ -0,0 +1,17 @@
1
+ module Enumerable
2
+ def count_by(&block)
3
+ self.group_by(&block).inject({}) do |h, (k, v)|
4
+ h.tap do
5
+ h[k] = v.count
6
+ end
7
+ end
8
+ end
9
+
10
+ def key_by(&block)
11
+ self.group_by(&block).inject({}) do |h, (k, v)|
12
+ h.tap do
13
+ h[k] = v.first
14
+ end
15
+ end
16
+ end
17
+ end
@@ -12,6 +12,8 @@ module EasyML
12
12
  inflect.acronym "EST"
13
13
  inflect.acronym "UTC"
14
14
  inflect.acronym "HTML"
15
+ inflect.acronym "API"
16
+ inflect.acronym "APIs"
15
17
  end
16
18
  end
17
19
  end
data/config/routes.rb CHANGED
@@ -11,6 +11,9 @@ EasyML::Engine.routes.draw do
11
11
  # Predictions API
12
12
  resources :predictions, only: [:create]
13
13
 
14
+ # API Documentation
15
+ get "api", to: "apis#show"
16
+
14
17
  resources :models, as: :easy_ml_models do
15
18
  member do
16
19
  post :train
@@ -173,7 +173,7 @@ module EasyML
173
173
  end
174
174
  raise ArgumentError, "Objectives required for EasyML::Core::Tuner" unless objective.present?
175
175
 
176
- self.metrics = EasyML::Model.new(task: task).allowed_metrics if metrics.nil? || metrics.empty?
176
+ self.metrics = EasyML::Model.new(task: task).default_metrics if metrics.nil? || metrics.empty?
177
177
  end
178
178
  end
179
179
  end
@@ -3,35 +3,104 @@ module EasyML
3
3
  module DateConverter
4
4
  COMMON_DATE_FORMATS = [
5
5
  "%Y-%m-%dT%H:%M:%S.%6N", # e.g., "2021-01-01T00:00:00.000000"
6
- "%Y-%m-%d %H:%M:%S.%L Z", # e.g., "2025-01-03 23:04:49.492 Z"
7
- "%Y-%m-%d %H:%M:%S.%L", # e.g., "2021-01-01 00:01:36.000"
8
- "%Y-%m-%d %H:%M:%S.%L", # e.g., "2021-01-01 00:01:36.000"
9
- "%Y-%m-%d %H:%M:%S", # e.g., "2021-01-01 00:01:36"
10
- "%Y-%m-%d %H:%M", # e.g., "2021-01-01 00:01"
11
- "%Y-%m-%d", # e.g., "2021-01-01"
12
- "%m/%d/%Y %H:%M:%S", # e.g., "01/01/2021 00:01:36"
13
- "%m/%d/%Y", # e.g., "01/01/2021"
14
- "%d-%m-%Y", # e.g., "01-01-2021"
15
- "%d-%b-%Y %H:%M:%S", # e.g., "01-Jan-2021 00:01:36"
16
- "%d-%b-%Y", # e.g., "01-Jan-2021"
17
- "%b %d, %Y", # e.g., "Jan 01, 2021"
18
- "%Y/%m/%d %H:%M:%S", # e.g., "2021/01/01 00:01:36"
6
+ "%Y-%m-%d %H:%M:%S.%L Z", # e.g., "2025-01-03 23:04:49.492 Z"
7
+ "%Y-%m-%d %H:%M:%S.%L", # e.g., "2021-01-01 00:01:36.000"
8
+ "%Y-%m-%d %H:%M:%S.%L", # duplicate format intentionally
9
+ "%Y-%m-%d %H:%M:%S", # e.g., "2021-01-01 00:01:36"
10
+ "%Y-%m-%d %H:%M", # e.g., "2021-01-01 00:01"
11
+ "%Y-%m-%d", # e.g., "2021-01-01"
12
+ "%m/%d/%Y %H:%M:%S", # e.g., "01/01/2021 00:01:36"
13
+ "%m/%d/%Y", # e.g., "01/01/2021"
14
+ "%d-%m-%Y", # e.g., "01-01-2021"
15
+ "%d-%b-%Y %H:%M:%S", # e.g., "01-Jan-2021 00:01:36"
16
+ "%d-%b-%Y", # e.g., "01-Jan-2021"
17
+ "%b %d, %Y", # e.g., "Jan 01, 2021"
18
+ "%Y/%m/%d %H:%M:%S", # e.g., "2021/01/01 00:01:36"
19
19
  "%Y/%m/%d", # e.g., "2021/01/01"
20
20
  ].freeze
21
21
 
22
22
  FORMAT_MAPPINGS = {
23
23
  ruby_to_polars: {
24
24
  "%L" => "%3f", # milliseconds
25
- "%6N" => "%6f", # microseconds
26
- "%N" => "%9f", # nanoseconds
25
+ "%6N" => "%6f", # microseconds
26
+ "%N" => "%9f", # nanoseconds
27
27
  },
28
28
  }.freeze
29
29
 
30
30
  class << self
31
- # Attempts to convert a string column to datetime if it appears to be a date
32
- # @param df [Polars::DataFrame] The dataframe containing the series
33
- # @param column [String] The name of the column to convert
34
- # @return [Polars::DataFrame] The dataframe with converted column (if successful)
31
+ # Infers a strftime format string from the given date string.
32
+ #
33
+ # @param date_str [String] The date string to analyze.
34
+ # @return [String, nil] The corresponding strftime format if recognized, or nil if not.
35
+ def infer_strftime_format(date_str)
36
+ return nil if date_str.blank?
37
+
38
+ # YYYY-MM-DD (e.g., "2021-01-01")
39
+ return "%Y-%m-%d" if date_str =~ /^\d{4}-\d{2}-\d{2}$/
40
+
41
+ # YYYY/MM/DD (e.g., "2021/01/01")
42
+ return "%Y/%m/%d" if date_str =~ /^\d{4}\/\d{2}\/\d{2}$/
43
+
44
+ # Date & time with T separator (ISO 8601-like)
45
+ if date_str.include?("T")
46
+ # Without fractional seconds, e.g., "2021-01-01T12:34:56"
47
+ return "%Y-%m-%dT%H:%M:%S" if date_str =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$/
48
+
49
+ # With fractional seconds, e.g., "2021-01-01T12:34:56.789" or "2021-01-01T12:34:56.123456"
50
+ if date_str =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.(\d+)$/
51
+ fraction = Regexp.last_match(1)
52
+ case fraction.length
53
+ when 3 then return "%Y-%m-%dT%H:%M:%S.%L" # milliseconds
54
+ when 6 then return "%Y-%m-%dT%H:%M:%S.%6N" # microseconds
55
+ when 9 then return "%Y-%m-%dT%H:%M:%S.%N" # nanoseconds
56
+ else
57
+ # Fallback if fractional part has unexpected length:
58
+ return "%Y-%m-%dT%H:%M:%S.%N"
59
+ end
60
+ end
61
+ end
62
+
63
+ # Date & time with space separator
64
+ if date_str.include?(" ")
65
+ # Without fractional seconds, e.g., "2021-01-01 12:34:56"
66
+ return "%Y-%m-%d %H:%M:%S" if date_str =~ /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$/
67
+
68
+ # With fractional seconds, e.g., "2021-01-01 12:34:56.789"
69
+ if date_str =~ /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.(\d+)$/
70
+ fraction = Regexp.last_match(1)
71
+ case fraction.length
72
+ when 3 then return "%Y-%m-%d %H:%M:%S.%L"
73
+ when 6 then return "%Y-%m-%d %H:%M:%S.%6N"
74
+ when 9 then return "%Y-%m-%d %H:%M:%S.%N"
75
+ else
76
+ return "%Y-%m-%d %H:%M:%S.%N"
77
+ end
78
+ end
79
+ end
80
+
81
+ # Common US-style formats
82
+
83
+ # MM/DD/YYYY (e.g., "01/31/2021")
84
+ return "%m/%d/%Y" if date_str =~ /^\d{2}\/\d{2}\/\d{4}$/
85
+
86
+ # DD-MM-YYYY (e.g., "31-01-2021")
87
+ return "%d-%m-%Y" if date_str =~ /^\d{2}-\d{2}-\d{4}$/
88
+
89
+ # DD-Mon-YYYY (e.g., "31-Jan-2021")
90
+ return "%d-%b-%Y" if date_str =~ /^\d{2}-[A-Za-z]{3}-\d{4}$/
91
+
92
+ # Mon DD, YYYY (e.g., "Jan 31, 2021")
93
+ return "%b %d, %Y" if date_str =~ /^[A-Za-z]{3} \d{2}, \d{4}$/
94
+
95
+ # Could add additional heuristics as needed...
96
+
97
+ nil # Return nil if no known format matches.
98
+ end
99
+
100
+ # Attempts to convert a string column to datetime if it appears to be a date.
101
+ # @param df [Polars::DataFrame] The dataframe containing the series.
102
+ # @param column [String] The name of the column to convert.
103
+ # @return [Polars::DataFrame] The dataframe with the converted column (if successful).
35
104
  def maybe_convert_date(df, column = nil)
36
105
  if column.nil?
37
106
  series = df
@@ -40,19 +109,42 @@ module EasyML
40
109
  else
41
110
  series = df[column]
42
111
  end
112
+
43
113
  return df if series.dtype.is_a?(Polars::Datetime)
44
114
  return df unless series.dtype == Polars::Utf8
45
115
 
46
- format = detect_polars_format(series)
47
- return df unless format
116
+ sample = series.filter(series.is_not_null).head(100).to_a
117
+ ruby_format = detect_date_format(sample)
48
118
 
49
- df.with_column(
50
- Polars.col(column.to_s).str.strptime(Polars::Datetime, format).alias(column.to_s)
51
- )
119
+ if ruby_format
120
+ format = convert_format(:ruby_to_polars, ruby_format)
121
+ df = try_format(df, column, format)
122
+
123
+ if df.filter(Polars.col("TRY").is_null).count > df.filter(Polars.col(column.to_s).is_null).count
124
+ df = df.drop("TRY")
125
+ best_format = df[column.to_s][0..100].to_a.count_by do |date_str|
126
+ infer_strftime_format(date_str)
127
+ end.max_by { |_format, count| count }[0]
128
+ df = try_format(df, column, best_format)
129
+ end
130
+
131
+ df = df.with_column(df["TRY"].alias(column.to_s)).drop("TRY")
132
+ end
133
+
134
+ df
52
135
  end
53
136
 
54
137
  private
55
138
 
139
+ def try_format(df, column, format)
140
+ df = df.with_column(
141
+ Polars.col(column.to_s)
142
+ .str
143
+ .strptime(Polars::Datetime, format, strict: false)
144
+ .alias("TRY")
145
+ )
146
+ end
147
+
56
148
  def detect_polars_format(series)
57
149
  return nil unless series.is_a?(Polars::Series)
58
150
 
@@ -66,14 +158,29 @@ module EasyML
66
158
 
67
159
  sample = date_strings.compact.sample([100, date_strings.length].min)
68
160
 
69
- COMMON_DATE_FORMATS.detect do |format|
70
- sample.all? do |date_str|
71
- DateTime.strptime(date_str, format)
72
- true
73
- rescue StandardError
74
- false
161
+ best_format = nil
162
+ best_success_rate = 0.0
163
+ sample_count = sample.length
164
+
165
+ COMMON_DATE_FORMATS.each do |fmt|
166
+ success_count = sample.count do |date_str|
167
+ begin
168
+ DateTime.strptime(date_str, fmt)
169
+ true
170
+ rescue StandardError
171
+ false
172
+ end
75
173
  end
174
+ success_rate = success_count.to_f / sample_count
175
+ if success_rate > best_success_rate
176
+ best_success_rate = success_rate
177
+ best_format = fmt
178
+ end
179
+ # If every sample string matches this format, return it immediately.
180
+ return fmt if success_rate == 1.0
76
181
  end
182
+
183
+ best_success_rate >= 0.8 ? best_format : nil
77
184
  end
78
185
 
79
186
  def convert_format(conversion, format)
@@ -12,6 +12,7 @@ module EasyML
12
12
  string: Polars::String,
13
13
  text: Polars::String,
14
14
  categorical: Polars::Categorical,
15
+ null: Polars::Null,
15
16
  }
16
17
  POLARS_MAP = TYPE_MAP.invert.stringify_keys
17
18
  class << self
@@ -19,6 +20,20 @@ module EasyML
19
20
  POLARS_MAP.dig(polars_type.class.to_s)
20
21
  end
21
22
 
23
+ def parse_polars_dtype(dtype_string)
24
+ case dtype_string
25
+ when /^Polars::Datetime/
26
+ time_unit = dtype_string[/time_unit: "(.*?)"/, 1]
27
+ time_zone = dtype_string[/time_zone: (.*)?\)/, 1]
28
+ time_zone = time_zone == "nil" ? nil : time_zone&.delete('"')
29
+ Polars::Datetime.new(time_unit, time_zone)
30
+ when /^Polars::/
31
+ Polars.const_get(dtype_string.split("::").last)
32
+ else
33
+ raise ArgumentError, "Unknown Polars data type: #{dtype_string}"
34
+ end
35
+ end
36
+
22
37
  def sym_to_polars(symbol)
23
38
  TYPE_MAP.dig(symbol)
24
39
  end
@@ -50,6 +65,8 @@ module EasyML
50
65
  :boolean
51
66
  when Polars::Utf8
52
67
  determine_string_type(series)
68
+ when Polars::Null
69
+ :null
53
70
  else
54
71
  :categorical
55
72
  end
@@ -0,0 +1,30 @@
1
+ module EasyML
2
+ module Data
3
+ class PolarsInMemory
4
+ attr_reader :df
5
+
6
+ def initialize(df)
7
+ @df = df
8
+ end
9
+
10
+ def self.query(df, **kwargs)
11
+ new(df).query(**kwargs)
12
+ end
13
+
14
+ def query(drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false)
15
+ return if df.nil?
16
+
17
+ df = self.df.clone
18
+ df = df.filter(filter) if filter
19
+ select = df.columns & ([select] || []).flatten
20
+ df = df.select(select) if select.present?
21
+ df = df.unique if unique
22
+ drop_cols &= df.columns
23
+ df = df.drop(drop_cols) unless drop_cols.empty?
24
+ df = df.sort(sort, reverse: descending) if sort
25
+ df = df.limit(limit) if limit
26
+ df
27
+ end
28
+ end
29
+ end
30
+ end
@@ -12,6 +12,22 @@ module EasyML
12
12
  @schema = options[:schema]
13
13
  end
14
14
 
15
+ def sha
16
+ files = parquet_files.sort
17
+
18
+ file_hashes = files.map do |file|
19
+ meta = Polars.read_parquet_schema(file)
20
+ row_count = Polars.scan_parquet(file).select(Polars.col("*").count).collect[0, 0]
21
+
22
+ Digest::SHA256.hexdigest([
23
+ meta.to_json,
24
+ row_count.to_s,
25
+ ].join("|"))
26
+ end
27
+
28
+ Digest::SHA256.hexdigest(file_hashes.join)
29
+ end
30
+
15
31
  def schema=(value)
16
32
  @schema = value
17
33
  polars_args[:dtypes] = value
@@ -55,7 +71,10 @@ module EasyML
55
71
  return files if any_parquet? && columns.nil?
56
72
 
57
73
  puts "Converting to Parquet..."
58
-
74
+ if columns.nil? || columns.all? { |c| c.datatype.nil? }
75
+ learn_dataset
76
+ columns = nil
77
+ end
59
78
  csv_files.each do |path|
60
79
  df = read_file(path, columns)
61
80
  df = cast(df, columns)