easy_ml 0.2.0.pre.rc57 → 0.2.0.pre.rc60
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/apis_controller.rb +8 -0
- data/app/controllers/easy_ml/application_controller.rb +4 -0
- data/app/controllers/easy_ml/datasets_controller.rb +32 -1
- data/app/controllers/easy_ml/models_controller.rb +3 -0
- data/app/controllers/easy_ml/predictions_controller.rb +10 -5
- data/app/frontend/components/DatasetPreview.tsx +50 -19
- data/app/frontend/components/ModelForm.tsx +1 -1
- data/app/frontend/components/SearchableSelect.tsx +0 -1
- data/app/frontend/components/dataset/ColumnConfigModal.tsx +7 -1
- data/app/frontend/components/dataset/ColumnFilters.tsx +37 -3
- data/app/frontend/components/dataset/ColumnList.tsx +14 -2
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +82 -21
- data/app/frontend/pages/DatasourcesPage.tsx +0 -2
- data/app/frontend/types/dataset.ts +3 -0
- data/app/jobs/easy_ml/compute_feature_job.rb +0 -2
- data/app/jobs/easy_ml/refresh_dataset_job.rb +0 -6
- data/app/models/easy_ml/column/imputers/base.rb +89 -0
- data/app/models/easy_ml/column/imputers/categorical.rb +35 -0
- data/app/models/easy_ml/column/imputers/clip.rb +30 -0
- data/app/models/easy_ml/column/imputers/constant.rb +27 -0
- data/app/models/easy_ml/column/imputers/ffill.rb +29 -0
- data/app/models/easy_ml/column/imputers/imputer.rb +103 -0
- data/app/models/easy_ml/column/imputers/mean.rb +27 -0
- data/app/models/easy_ml/column/imputers/median.rb +27 -0
- data/app/models/easy_ml/column/imputers/most_frequent.rb +27 -0
- data/app/models/easy_ml/column/imputers/null_imputer.rb +15 -0
- data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +30 -0
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +78 -0
- data/app/models/easy_ml/column/imputers/today.rb +20 -0
- data/app/models/easy_ml/column/imputers.rb +126 -0
- data/app/models/easy_ml/column/learner.rb +18 -0
- data/app/models/easy_ml/column/learners/base.rb +103 -0
- data/app/models/easy_ml/column/learners/boolean.rb +11 -0
- data/app/models/easy_ml/column/learners/categorical.rb +51 -0
- data/app/models/easy_ml/column/learners/datetime.rb +19 -0
- data/app/models/easy_ml/column/learners/null.rb +22 -0
- data/app/models/easy_ml/column/learners/numeric.rb +33 -0
- data/app/models/easy_ml/column/learners/string.rb +15 -0
- data/app/models/easy_ml/column/lineage/base.rb +22 -0
- data/app/models/easy_ml/column/lineage/computed_by_feature.rb +23 -0
- data/app/models/easy_ml/column/lineage/preprocessed.rb +23 -0
- data/app/models/easy_ml/column/lineage/raw_dataset.rb +23 -0
- data/app/models/easy_ml/column/lineage.rb +28 -0
- data/app/models/easy_ml/column/selector.rb +96 -0
- data/app/models/easy_ml/column.rb +344 -39
- data/app/models/easy_ml/column_history.rb +31 -20
- data/app/models/easy_ml/column_list.rb +79 -62
- data/app/models/easy_ml/dataset.rb +156 -104
- data/app/models/easy_ml/dataset_history.rb +23 -23
- data/app/models/easy_ml/datasource.rb +4 -0
- data/app/models/easy_ml/datasource_history.rb +1 -0
- data/app/models/easy_ml/datasources/file_datasource.rb +1 -1
- data/app/models/easy_ml/datasources/polars_datasource.rb +6 -12
- data/app/models/easy_ml/datasources/s3_datasource.rb +1 -1
- data/app/models/easy_ml/feature.rb +29 -10
- data/app/models/easy_ml/feature_history.rb +12 -0
- data/app/models/easy_ml/feature_list.rb +15 -0
- data/app/models/easy_ml/model.rb +25 -4
- data/app/models/easy_ml/model_history.rb +1 -0
- data/app/models/easy_ml/retraining_run.rb +1 -0
- data/app/serializers/easy_ml/column_serializer.rb +11 -1
- data/app/serializers/easy_ml/dataset_serializer.rb +23 -2
- data/config/initializers/enumerable.rb +17 -0
- data/config/initializers/inflections.rb +2 -0
- data/config/routes.rb +3 -0
- data/lib/easy_ml/core/tuner.rb +1 -1
- data/lib/easy_ml/data/date_converter.rb +137 -30
- data/lib/easy_ml/data/polars_column.rb +17 -0
- data/lib/easy_ml/data/polars_in_memory.rb +30 -0
- data/lib/easy_ml/data/polars_reader.rb +20 -1
- data/lib/easy_ml/data/splits/in_memory_split.rb +7 -5
- data/lib/easy_ml/data/splits/split.rb +2 -1
- data/lib/easy_ml/data/synced_directory.rb +5 -3
- data/lib/easy_ml/data.rb +1 -2
- data/lib/easy_ml/feature_store.rb +33 -22
- data/lib/easy_ml/predict.rb +13 -2
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +7 -0
- data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +18 -0
- data/lib/easy_ml/railtie/templates/migration/add_default_to_is_target.rb.tt +6 -0
- data/lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt +13 -0
- data/lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt +21 -0
- data/lib/easy_ml/railtie/templates/migration/add_slug_to_easy_ml_models.rb.tt +20 -0
- data/lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt +11 -0
- data/lib/easy_ml/version.rb +1 -1
- data/lib/tasks/profile.rake +40 -0
- data/public/easy_ml/assets/.vite/manifest.json +2 -2
- data/public/easy_ml/assets/assets/Application-BbFobaXt.css +1 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js +489 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map +1 -0
- metadata +45 -10
- data/app/models/easy_ml/adapters/base_adapter.rb +0 -45
- data/app/models/easy_ml/adapters/polars_adapter.rb +0 -77
- data/lib/easy_ml/data/preprocessor.rb +0 -383
- data/lib/easy_ml/data/simple_imputer.rb +0 -255
- data/lib/easy_ml/data/statistics_learner.rb +0 -128
- data/public/easy_ml/assets/assets/Application-BUsRR6b6.css +0 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DTZ2348z.js +0 -474
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DTZ2348z.js.map +0 -1
@@ -55,6 +55,7 @@ module EasyML
     end
 
     belongs_to :dataset, class_name: "EasyML::Dataset"
+    has_many :columns, class_name: "EasyML::Column", dependent: :destroy
 
     validates :feature_class, presence: true
     validates :feature_position, presence: true, numericality: { only_integer: true, greater_than_or_equal_to: 0 }
@@ -72,7 +73,7 @@ module EasyML
       end
 
       # Combine all conditions with OR
-      where(id: where(needs_fit: true).or(where(conditions.join(" OR "))).
+      where(id: where(needs_fit: true).or(where(conditions.join(" OR "))).map(&:id))
     }
     scope :never_applied, -> { where(applied_at: nil) }
     scope :never_fit, -> do
@@ -81,6 +82,7 @@ module EasyML
       where(id: fittable.map(&:id))
     end
     scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }
+    scope :ready_to_apply, -> { where.not(id: needs_fit.map(&:id)) }
 
     before_save :apply_defaults, if: :new_record?
     before_save :update_sha
@@ -165,6 +167,13 @@ module EasyML
       end
     end
 
+    def computes_columns
+      unless adapter.respond_to?(:computes_columns)
+        raise "Feature #{feature_class} must declare which columns it computes using the :computes_columns method"
+      end
+      adapter.computes_columns
+    end
+
     def build_batches
       if batchable?
         batch
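
Note: with the new computes_columns hook, every feature adapter must expose a computes_columns method naming the columns it produces. A minimal sketch of a conforming adapter, assuming the adapter.fit(df, self, batch_args) and adapter.transform(df, self) call signatures shown in the hunks that follow (class name, column names, and method bodies are hypothetical, not from the gem):

    # Hypothetical adapter; only the method names/signatures follow this diff.
    class DoubledAmount
      def computes_columns
        ["amount_doubled"] # consumed by EasyML::Feature#computes_columns
      end

      def fit(df, feature, batch_args = {})
        df # whatever is returned here is stored via feature.store
      end

      def transform(df, feature)
        # Must return a Polars::DataFrame with the same row count as the input,
        # or the new size check in actually_transform_batch raises.
        df.with_column((Polars.col("amount") * 2).alias("amount_doubled"))
      end
    end
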
@@ -216,8 +225,11 @@ module EasyML
     def fit(features: [self], async: false)
       ordered_features = features.sort_by(&:feature_position)
       jobs = ordered_features.map(&:build_batches)
+      job_count = jobs.dup.flatten.size
 
-
+      # This is very important! For whatever reason, Resque BatchJob does not properly
+      # handle batch finished callbacks for batch size = 1
+      if async && job_count > 1
         EasyML::ComputeFeatureJob.enqueue_ordered_batches(jobs)
       else
         jobs.flatten.each do |job|
@@ -233,13 +245,14 @@ module EasyML
       if batch_args.key?(:batch_start)
         actually_fit_batch(batch_args)
       else
-
+        batch_args = get_batch_args(**batch_args)
+        actually_fit_batch(batch_args)
       end
     end
 
     # Transform a single batch, used for testing the user's feature implementation
     def transform_batch(df = nil, batch_args = {})
-      if df.
+      if df.is_a?(Polars::DataFrame)
         actually_transform_batch(df)
       else
         actually_transform_batch(build_batch(get_batch_args(**batch_args)))
@@ -281,12 +294,14 @@ module EasyML
       batch_args.symbolize_keys!
 
       if adapter.respond_to?(:batch)
-
+        df = dataset.raw
       else
         df = build_batch(batch_args)
-        batch_df = adapter.fit(df, self, batch_args)
       end
       end
+      return if df.blank?
+
+      batch_df = adapter.fit(df, self, batch_args)
       if batch_df.present?
         store(batch_df)
       else
@@ -296,10 +311,14 @@ module EasyML
       end
     end
 
     def actually_transform_batch(df)
-      return nil unless df.
-      return df if adapter.respond_to?(:
+      return nil unless df.is_a?(Polars::DataFrame)
+      return df if !adapter.respond_to?(:transform) && feature_store.empty?
 
+      df_len_was = df.shape[0]
       result = adapter.transform(df, self)
+      raise "Feature '#{name}' must return a Polars::DataFrame, got #{result.class}" unless result.is_a?(Polars::DataFrame)
+      df_len_now = result.shape[0]
+      raise "Feature #{feature_class}#transform: output size must match input size! Input size: #{df_len_now}, output size: #{df_len_was}." if df_len_now != df_len_was
       update!(applied_at: Time.current)
       result
     end
@@ -377,8 +396,8 @@ module EasyML
       feature_store.list_partitions
     end
 
-    def query(
-      feature_store.query(
+    def query(**kwargs)
+      feature_store.query(**kwargs)
     end
 
     def store(df)
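
query now forwards arbitrary keyword arguments to the feature store, so callers can pass query options straight through, e.g. (option names here are illustrative; the options EasyML::FeatureStore#query actually accepts are not part of this diff):

    feature.query(limit: 10, sort: "id", descending: true)
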
@@ -31,6 +31,18 @@ module EasyML
 
     after_find :download_remote_files
     scope :ordered, -> { order(feature_position: :asc) }
+    scope :ready_to_apply, -> { where.not(id: needs_fit.map(&:id)) }
+    scope :has_changes, lambda {
+      none
+    }
+    scope :never_applied, -> { where(applied_at: nil) }
+    scope :never_fit, -> do
+      fittable = where(fit_at: nil)
+      fittable = fittable.select { |f| f.adapter.respond_to?(:fit) }
+      where(id: fittable.map(&:id))
+    end
+    scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }
+    scope :ready_to_apply, -> { where.not(id: needs_fit.map(&:id)) }
 
     def download_remote_files
       feature_store&.download
data/app/models/easy_ml/model.rb CHANGED
@@ -17,6 +17,7 @@
 # is_training :boolean
 # created_at :datetime not null
 # updated_at :datetime not null
+# slug :string not null
 #
 require_relative "models/hyperparameters"
 
@@ -66,6 +67,7 @@ module EasyML
     after_initialize :bump_version, if: -> { new_record? }
     after_initialize :set_defaults, if: -> { new_record? }
     before_save :save_model_file, if: -> { is_fit? && !is_history_class? && model_changed? && !@skip_save_model_file }
+    before_validation :set_slug, if: :name_changed?
 
     VALID_TASKS = %i[regression classification].freeze
 
@@ -91,6 +93,7 @@ module EasyML
     }
     validates :model_type, inclusion: { in: MODEL_NAMES }
     validates :dataset_id, presence: true
+    validates :slug, presence: true, uniqueness: true
     validate :validate_metrics_allowed
     before_save :set_root_dir
 
@@ -189,6 +192,7 @@ module EasyML
         evaluator: evaluator,
         model: self,
         dataset: dataset,
+        metrics: metrics,
       }.compact
       tuner.merge!(extra_params)
       tuner_instance = EasyML::Core::Tuner.new(tuner)
@@ -307,7 +311,6 @@ module EasyML
 
       dataset.refresh
       adapter.fit(tuning: tuning, x_train: x_train, y_train: y_train, x_valid: x_valid, y_valid: y_valid, &progress_block)
-      @is_fit = true
     end
 
     def batch_args
@@ -334,11 +337,8 @@ module EasyML
 
     def fit_in_batches(tuning: false, batch_size: nil, batch_overlap: nil, batch_key: nil, checkpoint_dir: Rails.root.join("tmp", "xgboost_checkpoints"), &progress_block)
       adapter.fit_in_batches(tuning: tuning, batch_size: batch_size, batch_overlap: batch_overlap, batch_key: batch_key, checkpoint_dir: checkpoint_dir, &progress_block)
-      @is_fit = true
     end
 
-    attr_accessor :is_fit
-
     def is_fit?
       model_file = get_model_file
       return true if model_file.present? && model_file.fit?
@@ -447,6 +447,21 @@ module EasyML
       )
     end
 
+    include Rails.application.routes.mounted_helpers
+
+    def api_fields
+      {
+        url: EasyML::Engine.routes.url_helpers.predictions_path,
+        method: "POST",
+        data: {
+          model: slug,
+          input: dataset.columns.api_inputs.sort_by_required.map(&:to_api).each_with_object({}) do |field, hash|
+            hash[field[:name]] = field.except(:name)
+          end,
+        },
+      }
+    end
+
     class CannotdeployError < StandardError
     end
 
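
api_fields builds a description of the prediction API for this model from its slug and the dataset's API input columns. Roughly the shape of the returned hash, with illustrative values (the per-field keys come from Column#to_api, which is not shown in this diff):

    model.api_fields
    # => {
    #      url: "/easy_ml/predictions",
    #      method: "POST",
    #      data: {
    #        model: "housing_price_predictor",
    #        input: {
    #          "square_feet" => { required: true,  datatype: "float" },
    #          "state"       => { required: false, datatype: "categorical" },
    #        },
    #      },
    #    }
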
@@ -606,6 +621,12 @@ module EasyML
       errors.add(:metrics,
                  "don't know how to handle #{"metrics".pluralize(unknown_metrics)} #{unknown_metrics.join(", ")}, use EasyML::Core::ModelEvaluator.register(:name, Evaluator, :regression|:classification)")
     end
+
+    def set_slug
+      if slug.nil? && name.present?
+        self.slug = name.gsub(/\s/, "_").downcase
+      end
+    end
   end
 end
 
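
set_slug fills the new slug column from the name, replacing whitespace with underscores and downcasing:

    model = EasyML::Model.new(name: "Housing Price Predictor")
    model.valid? # triggers before_validation :set_slug
    model.slug   # => "housing_price_predictor"
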
@@ -19,9 +19,19 @@
 #
 module EasyML
   class ColumnSerializer
+    class SmallSerializer
+      include JSONAPI::Serializer
+      attributes :id, :name
+    end
+
     include JSONAPI::Serializer
 
     attributes :id, :name, :description, :dataset_id, :datatype, :polars_datatype, :preprocessing_steps,
-               :hidden, :drop_if_null, :sample_values, :statistics, :is_target
+               :hidden, :drop_if_null, :sample_values, :statistics, :is_target,
+               :is_computed, :computed_by, :lineage
+
+    attribute :required do |object|
+      object.required?
+    end
   end
 end
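
Columns now also serialize is_computed, computed_by, lineage, and a required flag derived from Column#required?; the nested SmallSerializer is a lightweight variant used where only id and name are needed, for example:

    EasyML::ColumnSerializer::SmallSerializer.new(column)
      .serializable_hash.dig(:data, :attributes)
    # => { id: 42, name: "square_feet" }   (illustrative values)
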
@@ -24,6 +24,27 @@ require_relative "./column_serializer"
 #
 module EasyML
   class DatasetSerializer
+    class SmallSerializer
+      include JSONAPI::Serializer
+
+      attributes :id, :name, :description, :target, :num_rows, :status,
+                 :datasource_id, :preprocessing_steps, :workflow_status, :statistics
+
+      attribute :columns do |dataset|
+        dataset.columns.order(:id).map do |column|
+          ColumnSerializer::SmallSerializer.new(column).serializable_hash.dig(:data, :attributes)
+        end
+      end
+      attribute :stacktrace do |object|
+        if !object.failed? || object.events.empty?
+          nil
+        else
+          last_event = object.events.where(status: :failed).order(id: :desc).limit(1).last
+          last_event&.stacktrace
+        end
+      end
+    end
+
     include JSONAPI::Serializer
 
     attributes :id, :name, :description, :target, :num_rows, :status,
@@ -47,7 +68,7 @@ module EasyML
       if dataset.workflow_status.to_sym == :analyzing
         nil
       else
-        dataset.data(limit: 10, all_columns: true)&.to_hashes
+        dataset.data(limit: 10, all_columns: true, refresh: false)&.to_hashes || dataset.raw.data(limit: 10, all_columns: true).to_hashes
      end
    end
 
@@ -62,7 +83,7 @@ module EasyML
     end
 
     attribute :needs_refresh do |dataset|
-      dataset.needs_refresh?
+      dataset.needs_refresh?(exclude: [:datasource_needs_refresh])
     end
 
     attribute :stacktrace do |object|
@@ -0,0 +1,17 @@
+module Enumerable
+  def count_by(&block)
+    self.group_by(&block).inject({}) do |h, (k, v)|
+      h.tap do
+        h[k] = v.count
+      end
+    end
+  end
+
+  def key_by(&block)
+    self.group_by(&block).inject({}) do |h, (k, v)|
+      h.tap do
+        h[k] = v.first
+      end
+    end
+  end
+end
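
count_by, used by the DateConverter changes below, tallies elements by the block's result; key_by keeps the first element per group. For example:

    ["a", "bb", "cc", "d"].count_by(&:length)
    # => { 1 => 2, 2 => 2 }

    ["a", "bb", "cc", "d"].key_by(&:length)
    # => { 1 => "a", 2 => "bb" }
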
data/config/routes.rb CHANGED
data/lib/easy_ml/core/tuner.rb CHANGED
@@ -173,7 +173,7 @@ module EasyML
       end
       raise ArgumentError, "Objectives required for EasyML::Core::Tuner" unless objective.present?
 
-      self.metrics = EasyML::Model.new(task: task).
+      self.metrics = EasyML::Model.new(task: task).default_metrics if metrics.nil? || metrics.empty?
     end
   end
 end
@@ -3,35 +3,104 @@ module EasyML
     module DateConverter
       COMMON_DATE_FORMATS = [
         "%Y-%m-%dT%H:%M:%S.%6N", # e.g., "2021-01-01T00:00:00.000000"
-        "%Y-%m-%d %H:%M:%S.%L Z",
-        "%Y-%m-%d %H:%M:%S.%L",
-        "%Y-%m-%d %H:%M:%S.%L",
-        "%Y-%m-%d %H:%M:%S",
-        "%Y-%m-%d %H:%M",
-        "%Y-%m-%d",
-        "%m/%d/%Y %H:%M:%S",
-        "%m/%d/%Y",
-        "%d-%m-%Y",
-        "%d-%b-%Y %H:%M:%S",
-        "%d-%b-%Y",
-        "%b %d, %Y",
-        "%Y/%m/%d %H:%M:%S",
+        "%Y-%m-%d %H:%M:%S.%L Z", # e.g., "2025-01-03 23:04:49.492 Z"
+        "%Y-%m-%d %H:%M:%S.%L", # e.g., "2021-01-01 00:01:36.000"
+        "%Y-%m-%d %H:%M:%S.%L", # duplicate format intentionally
+        "%Y-%m-%d %H:%M:%S", # e.g., "2021-01-01 00:01:36"
+        "%Y-%m-%d %H:%M", # e.g., "2021-01-01 00:01"
+        "%Y-%m-%d", # e.g., "2021-01-01"
+        "%m/%d/%Y %H:%M:%S", # e.g., "01/01/2021 00:01:36"
+        "%m/%d/%Y", # e.g., "01/01/2021"
+        "%d-%m-%Y", # e.g., "01-01-2021"
+        "%d-%b-%Y %H:%M:%S", # e.g., "01-Jan-2021 00:01:36"
+        "%d-%b-%Y", # e.g., "01-Jan-2021"
+        "%b %d, %Y", # e.g., "Jan 01, 2021"
+        "%Y/%m/%d %H:%M:%S", # e.g., "2021/01/01 00:01:36"
         "%Y/%m/%d", # e.g., "2021/01/01"
       ].freeze
 
       FORMAT_MAPPINGS = {
         ruby_to_polars: {
           "%L" => "%3f", # milliseconds
-          "%6N" => "%6f",
-          "%N" => "%9f",
+          "%6N" => "%6f", # microseconds
+          "%N" => "%9f", # nanoseconds
         },
       }.freeze
 
       class << self
-        #
-        #
-        # @param
-        # @return [
+        # Infers a strftime format string from the given date string.
+        #
+        # @param date_str [String] The date string to analyze.
+        # @return [String, nil] The corresponding strftime format if recognized, or nil if not.
+        def infer_strftime_format(date_str)
+          return nil if date_str.blank?
+
+          # YYYY-MM-DD (e.g., "2021-01-01")
+          return "%Y-%m-%d" if date_str =~ /^\d{4}-\d{2}-\d{2}$/
+
+          # YYYY/MM/DD (e.g., "2021/01/01")
+          return "%Y/%m/%d" if date_str =~ /^\d{4}\/\d{2}\/\d{2}$/
+
+          # Date & time with T separator (ISO 8601-like)
+          if date_str.include?("T")
+            # Without fractional seconds, e.g., "2021-01-01T12:34:56"
+            return "%Y-%m-%dT%H:%M:%S" if date_str =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$/
+
+            # With fractional seconds, e.g., "2021-01-01T12:34:56.789" or "2021-01-01T12:34:56.123456"
+            if date_str =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.(\d+)$/
+              fraction = Regexp.last_match(1)
+              case fraction.length
+              when 3 then return "%Y-%m-%dT%H:%M:%S.%L" # milliseconds
+              when 6 then return "%Y-%m-%dT%H:%M:%S.%6N" # microseconds
+              when 9 then return "%Y-%m-%dT%H:%M:%S.%N" # nanoseconds
+              else
+                # Fallback if fractional part has unexpected length:
+                return "%Y-%m-%dT%H:%M:%S.%N"
+              end
+            end
+          end
+
+          # Date & time with space separator
+          if date_str.include?(" ")
+            # Without fractional seconds, e.g., "2021-01-01 12:34:56"
+            return "%Y-%m-%d %H:%M:%S" if date_str =~ /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$/
+
+            # With fractional seconds, e.g., "2021-01-01 12:34:56.789"
+            if date_str =~ /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.(\d+)$/
+              fraction = Regexp.last_match(1)
+              case fraction.length
+              when 3 then return "%Y-%m-%d %H:%M:%S.%L"
+              when 6 then return "%Y-%m-%d %H:%M:%S.%6N"
+              when 9 then return "%Y-%m-%d %H:%M:%S.%N"
+              else
+                return "%Y-%m-%d %H:%M:%S.%N"
+              end
+            end
+          end
+
+          # Common US-style formats
+
+          # MM/DD/YYYY (e.g., "01/31/2021")
+          return "%m/%d/%Y" if date_str =~ /^\d{2}\/\d{2}\/\d{4}$/
+
+          # DD-MM-YYYY (e.g., "31-01-2021")
+          return "%d-%m-%Y" if date_str =~ /^\d{2}-\d{2}-\d{4}$/
+
+          # DD-Mon-YYYY (e.g., "31-Jan-2021")
+          return "%d-%b-%Y" if date_str =~ /^\d{2}-[A-Za-z]{3}-\d{4}$/
+
+          # Mon DD, YYYY (e.g., "Jan 31, 2021")
+          return "%b %d, %Y" if date_str =~ /^[A-Za-z]{3} \d{2}, \d{4}$/
+
+          # Could add additional heuristics as needed...
+
+          nil # Return nil if no known format matches.
+        end
+
+        # Attempts to convert a string column to datetime if it appears to be a date.
+        # @param df [Polars::DataFrame] The dataframe containing the series.
+        # @param column [String] The name of the column to convert.
+        # @return [Polars::DataFrame] The dataframe with the converted column (if successful).
         def maybe_convert_date(df, column = nil)
           if column.nil?
             series = df
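
infer_strftime_format maps a single sample string to a strftime pattern via the regex heuristics above (module path assumed from lib/easy_ml/data/date_converter.rb):

    EasyML::Data::DateConverter.infer_strftime_format("2021-01-01")
    # => "%Y-%m-%d"
    EasyML::Data::DateConverter.infer_strftime_format("2021-01-01T12:34:56.789")
    # => "%Y-%m-%dT%H:%M:%S.%L"
    EasyML::Data::DateConverter.infer_strftime_format("Jan 31, 2021")
    # => "%b %d, %Y"
    EasyML::Data::DateConverter.infer_strftime_format("not a date")
    # => nil
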
@@ -40,19 +109,42 @@ module EasyML
           else
             series = df[column]
           end
+
           return df if series.dtype.is_a?(Polars::Datetime)
           return df unless series.dtype == Polars::Utf8
 
-
-
+          sample = series.filter(series.is_not_null).head(100).to_a
+          ruby_format = detect_date_format(sample)
 
-
-
-
+          if ruby_format
+            format = convert_format(:ruby_to_polars, ruby_format)
+            df = try_format(df, column, format)
+
+            if df.filter(Polars.col("TRY").is_null).count > df.filter(Polars.col(column.to_s).is_null).count
+              df = df.drop("TRY")
+              best_format = df[column.to_s][0..100].to_a.count_by do |date_str|
+                infer_strftime_format(date_str)
+              end.max_by { |_format, count| count }[0]
+              df = try_format(df, column, best_format)
+            end
+
+            df = df.with_column(df["TRY"].alias(column.to_s)).drop("TRY")
+          end
+
+          df
         end
 
         private
 
+        def try_format(df, column, format)
+          df = df.with_column(
+            Polars.col(column.to_s)
+              .str
+              .strptime(Polars::Datetime, format, strict: false)
+              .alias("TRY")
+          )
+        end
+
         def detect_polars_format(series)
           return nil unless series.is_a?(Polars::Series)
 
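
The rewritten maybe_convert_date parses into a temporary TRY column, and if that produces more nulls than the source column it re-detects the most common per-row format via count_by before writing TRY back over the original column. A typical call, with illustrative data:

    df = Polars::DataFrame.new({ "created_at" => ["2021-01-01", "2021-02-03"] })
    df = EasyML::Data::DateConverter.maybe_convert_date(df, "created_at")
    df["created_at"].dtype
    # => a Polars::Datetime dtype (the column stays a string when no format is detected)
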
@@ -66,14 +158,29 @@ module EasyML
 
           sample = date_strings.compact.sample([100, date_strings.length].min)
 
-
-
-
-
-
-
+          best_format = nil
+          best_success_rate = 0.0
+          sample_count = sample.length
+
+          COMMON_DATE_FORMATS.each do |fmt|
+            success_count = sample.count do |date_str|
+              begin
+                DateTime.strptime(date_str, fmt)
+                true
+              rescue StandardError
+                false
+              end
            end
+            success_rate = success_count.to_f / sample_count
+            if success_rate > best_success_rate
+              best_success_rate = success_rate
+              best_format = fmt
+            end
+            # If every sample string matches this format, return it immediately.
+            return fmt if success_rate == 1.0
          end
+
+          best_success_rate >= 0.8 ? best_format : nil
         end
 
         def convert_format(conversion, format)
@@ -12,6 +12,7 @@ module EasyML
       string: Polars::String,
       text: Polars::String,
       categorical: Polars::Categorical,
+      null: Polars::Null,
     }
     POLARS_MAP = TYPE_MAP.invert.stringify_keys
     class << self
@@ -19,6 +20,20 @@ module EasyML
         POLARS_MAP.dig(polars_type.class.to_s)
       end
 
+      def parse_polars_dtype(dtype_string)
+        case dtype_string
+        when /^Polars::Datetime/
+          time_unit = dtype_string[/time_unit: "(.*?)"/, 1]
+          time_zone = dtype_string[/time_zone: (.*)?\)/, 1]
+          time_zone = time_zone == "nil" ? nil : time_zone&.delete('"')
+          Polars::Datetime.new(time_unit, time_zone)
+        when /^Polars::/
+          Polars.const_get(dtype_string.split("::").last)
+        else
+          raise ArgumentError, "Unknown Polars data type: #{dtype_string}"
+        end
+      end
+
       def sym_to_polars(symbol)
         TYPE_MAP.dig(symbol)
       end
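
parse_polars_dtype converts a serialized dtype string back into a Polars dtype, reconstructing Datetime's time unit and zone (module name assumed from lib/easy_ml/data/polars_column.rb; the serialized string shape is whatever the gem stores):

    EasyML::Data::PolarsColumn.parse_polars_dtype('Polars::Datetime(time_unit: "ns", time_zone: nil)')
    # => Polars::Datetime.new("ns", nil)
    EasyML::Data::PolarsColumn.parse_polars_dtype("Polars::Int64")
    # => Polars::Int64
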
@@ -50,6 +65,8 @@ module EasyML
           :boolean
         when Polars::Utf8
           determine_string_type(series)
+        when Polars::Null
+          :null
         else
           :categorical
         end
@@ -0,0 +1,30 @@
+module EasyML
+  module Data
+    class PolarsInMemory
+      attr_reader :df
+
+      def initialize(df)
+        @df = df
+      end
+
+      def self.query(df, **kwargs)
+        new(df).query(**kwargs)
+      end
+
+      def query(drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false)
+        return if df.nil?
+
+        df = self.df.clone
+        df = df.filter(filter) if filter
+        select = df.columns & ([select] || []).flatten
+        df = df.select(select) if select.present?
+        df = df.unique if unique
+        drop_cols &= df.columns
+        df = df.drop(drop_cols) unless drop_cols.empty?
+        df = df.sort(sort, reverse: descending) if sort
+        df = df.limit(limit) if limit
+        df
+      end
+    end
+  end
+end
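
PolarsInMemory applies the same query options used elsewhere in the gem (filter, select, drop_cols, unique, sort, limit, descending) to an in-memory dataframe. For example:

    df = Polars::DataFrame.new({ "id" => [3, 1, 2], "name" => ["c", "a", "b"] })
    EasyML::Data::PolarsInMemory.query(df, sort: "id", limit: 2)
    # => the rows with id 1 and 2, sorted ascending by id
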
@@ -12,6 +12,22 @@ module EasyML
        @schema = options[:schema]
      end
 
+     def sha
+       files = parquet_files.sort
+
+       file_hashes = files.map do |file|
+         meta = Polars.read_parquet_schema(file)
+         row_count = Polars.scan_parquet(file).select(Polars.col("*").count).collect[0, 0]
+
+         Digest::SHA256.hexdigest([
+           meta.to_json,
+           row_count.to_s,
+         ].join("|"))
+       end
+
+       Digest::SHA256.hexdigest(file_hashes.join)
+     end
+
      def schema=(value)
        @schema = value
        polars_args[:dtypes] = value
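
The new PolarsReader#sha fingerprints a parquet dataset from each file's schema plus row count, so callers can cheaply detect when the underlying files change without hashing their full contents:

    reader.sha
    # => a 64-character hex digest; it changes whenever any parquet file's schema or row count
    #    changes, and stays stable across reads that leave the files untouched
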
@@ -55,7 +71,10 @@ module EasyML
       return files if any_parquet? && columns.nil?
 
       puts "Converting to Parquet..."
-
+      if columns.nil? || columns.all? { |c| c.datatype.nil? }
+        learn_dataset
+        columns = nil
+      end
       csv_files.each do |path|
         df = read_file(path, columns)
         df = cast(df, columns)