easy_ml 0.2.0.pre.rc58 → 0.2.0.pre.rc61
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/application_controller.rb +4 -0
- data/app/controllers/easy_ml/datasets_controller.rb +32 -1
- data/app/frontend/components/DatasetPreview.tsx +50 -19
- data/app/frontend/components/dataset/ColumnConfigModal.tsx +7 -1
- data/app/frontend/components/dataset/ColumnFilters.tsx +37 -3
- data/app/frontend/components/dataset/ColumnList.tsx +14 -2
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +81 -20
- data/app/frontend/types/dataset.ts +3 -0
- data/app/jobs/easy_ml/compute_feature_job.rb +0 -3
- data/app/jobs/easy_ml/refresh_dataset_job.rb +0 -6
- data/app/models/easy_ml/column/imputers/base.rb +89 -0
- data/app/models/easy_ml/column/imputers/categorical.rb +35 -0
- data/app/models/easy_ml/column/imputers/clip.rb +30 -0
- data/app/models/easy_ml/column/imputers/constant.rb +27 -0
- data/app/models/easy_ml/column/imputers/ffill.rb +29 -0
- data/app/models/easy_ml/column/imputers/imputer.rb +103 -0
- data/app/models/easy_ml/column/imputers/mean.rb +27 -0
- data/app/models/easy_ml/column/imputers/median.rb +27 -0
- data/app/models/easy_ml/column/imputers/most_frequent.rb +27 -0
- data/app/models/easy_ml/column/imputers/null_imputer.rb +15 -0
- data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +30 -0
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +78 -0
- data/app/models/easy_ml/column/imputers/today.rb +20 -0
- data/app/models/easy_ml/column/imputers.rb +126 -0
- data/app/models/easy_ml/column/learner.rb +18 -0
- data/app/models/easy_ml/column/learners/base.rb +103 -0
- data/app/models/easy_ml/column/learners/boolean.rb +11 -0
- data/app/models/easy_ml/column/learners/categorical.rb +51 -0
- data/app/models/easy_ml/column/learners/datetime.rb +19 -0
- data/app/models/easy_ml/column/learners/null.rb +22 -0
- data/app/models/easy_ml/column/learners/numeric.rb +33 -0
- data/app/models/easy_ml/column/learners/string.rb +15 -0
- data/app/models/easy_ml/column/lineage/base.rb +22 -0
- data/app/models/easy_ml/column/lineage/computed_by_feature.rb +23 -0
- data/app/models/easy_ml/column/lineage/preprocessed.rb +23 -0
- data/app/models/easy_ml/column/lineage/raw_dataset.rb +23 -0
- data/app/models/easy_ml/column/lineage.rb +28 -0
- data/app/models/easy_ml/column/selector.rb +96 -0
- data/app/models/easy_ml/column.rb +319 -52
- data/app/models/easy_ml/column_history.rb +29 -22
- data/app/models/easy_ml/column_list.rb +63 -78
- data/app/models/easy_ml/dataset.rb +128 -96
- data/app/models/easy_ml/dataset_history.rb +23 -23
- data/app/models/easy_ml/datasource.rb +3 -0
- data/app/models/easy_ml/datasource_history.rb +1 -0
- data/app/models/easy_ml/datasources/file_datasource.rb +1 -1
- data/app/models/easy_ml/datasources/polars_datasource.rb +6 -12
- data/app/models/easy_ml/datasources/s3_datasource.rb +1 -1
- data/app/models/easy_ml/feature.rb +19 -7
- data/app/models/easy_ml/feature_history.rb +12 -0
- data/app/models/easy_ml/feature_list.rb +15 -0
- data/app/serializers/easy_ml/column_serializer.rb +11 -1
- data/app/serializers/easy_ml/dataset_serializer.rb +23 -2
- data/config/initializers/enumerable.rb +17 -0
- data/lib/easy_ml/data/date_converter.rb +137 -30
- data/lib/easy_ml/data/polars_column.rb +17 -0
- data/lib/easy_ml/data/polars_in_memory.rb +30 -0
- data/lib/easy_ml/data/polars_reader.rb +20 -1
- data/lib/easy_ml/data/splits/in_memory_split.rb +3 -5
- data/lib/easy_ml/data/splits/split.rb +2 -1
- data/lib/easy_ml/data/synced_directory.rb +1 -1
- data/lib/easy_ml/data.rb +1 -2
- data/lib/easy_ml/engine.rb +1 -0
- data/lib/easy_ml/feature_store.rb +33 -22
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +4 -0
- data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +4 -0
- data/lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt +13 -0
- data/lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt +21 -0
- data/lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt +11 -0
- data/lib/easy_ml/version.rb +1 -1
- data/lib/tasks/profile.rake +40 -0
- data/public/easy_ml/assets/.vite/manifest.json +2 -2
- data/public/easy_ml/assets/assets/Application-BbFobaXt.css +1 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js +489 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map +1 -0
- metadata +41 -10
- data/app/models/easy_ml/adapters/base_adapter.rb +0 -45
- data/app/models/easy_ml/adapters/polars_adapter.rb +0 -77
- data/lib/easy_ml/data/preprocessor.rb +0 -340
- data/lib/easy_ml/data/simple_imputer.rb +0 -255
- data/lib/easy_ml/data/statistics_learner.rb +0 -193
- data/public/easy_ml/assets/assets/Application-BUsRR6b6.css +0 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js +0 -474
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js.map +0 -1
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: easy_ml
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0.pre.rc58
|
4
|
+
version: 0.2.0.pre.rc61
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Brett Shollenberger
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-02-
|
11
|
+
date: 2025-02-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activerecord
|
@@ -559,10 +559,36 @@ files:
|
|
559
559
|
- app/jobs/easy_ml/sync_datasource_job.rb
|
560
560
|
- app/jobs/easy_ml/training_job.rb
|
561
561
|
- app/models/concerns/easy_ml/dataframe_serialization.rb
|
562
|
-
- app/models/easy_ml/adapters/base_adapter.rb
|
563
|
-
- app/models/easy_ml/adapters/polars_adapter.rb
|
564
562
|
- app/models/easy_ml/cleaner.rb
|
565
563
|
- app/models/easy_ml/column.rb
|
564
|
+
- app/models/easy_ml/column/imputers.rb
|
565
|
+
- app/models/easy_ml/column/imputers/base.rb
|
566
|
+
- app/models/easy_ml/column/imputers/categorical.rb
|
567
|
+
- app/models/easy_ml/column/imputers/clip.rb
|
568
|
+
- app/models/easy_ml/column/imputers/constant.rb
|
569
|
+
- app/models/easy_ml/column/imputers/ffill.rb
|
570
|
+
- app/models/easy_ml/column/imputers/imputer.rb
|
571
|
+
- app/models/easy_ml/column/imputers/mean.rb
|
572
|
+
- app/models/easy_ml/column/imputers/median.rb
|
573
|
+
- app/models/easy_ml/column/imputers/most_frequent.rb
|
574
|
+
- app/models/easy_ml/column/imputers/null_imputer.rb
|
575
|
+
- app/models/easy_ml/column/imputers/one_hot_encoder.rb
|
576
|
+
- app/models/easy_ml/column/imputers/ordinal_encoder.rb
|
577
|
+
- app/models/easy_ml/column/imputers/today.rb
|
578
|
+
- app/models/easy_ml/column/learner.rb
|
579
|
+
- app/models/easy_ml/column/learners/base.rb
|
580
|
+
- app/models/easy_ml/column/learners/boolean.rb
|
581
|
+
- app/models/easy_ml/column/learners/categorical.rb
|
582
|
+
- app/models/easy_ml/column/learners/datetime.rb
|
583
|
+
- app/models/easy_ml/column/learners/null.rb
|
584
|
+
- app/models/easy_ml/column/learners/numeric.rb
|
585
|
+
- app/models/easy_ml/column/learners/string.rb
|
586
|
+
- app/models/easy_ml/column/lineage.rb
|
587
|
+
- app/models/easy_ml/column/lineage/base.rb
|
588
|
+
- app/models/easy_ml/column/lineage/computed_by_feature.rb
|
589
|
+
- app/models/easy_ml/column/lineage/preprocessed.rb
|
590
|
+
- app/models/easy_ml/column/lineage/raw_dataset.rb
|
591
|
+
- app/models/easy_ml/column/selector.rb
|
566
592
|
- app/models/easy_ml/column_history.rb
|
567
593
|
- app/models/easy_ml/column_list.rb
|
568
594
|
- app/models/easy_ml/concerns/configurable.rb
|
@@ -580,6 +606,7 @@ files:
|
|
580
606
|
- app/models/easy_ml/event_context.rb
|
581
607
|
- app/models/easy_ml/feature.rb
|
582
608
|
- app/models/easy_ml/feature_history.rb
|
609
|
+
- app/models/easy_ml/feature_list.rb
|
583
610
|
- app/models/easy_ml/model.rb
|
584
611
|
- app/models/easy_ml/model_file.rb
|
585
612
|
- app/models/easy_ml/model_file_history.rb
|
@@ -628,6 +655,7 @@ files:
|
|
628
655
|
- bin/setup
|
629
656
|
- bin/vite
|
630
657
|
- config/initializers/dataframe.rb
|
658
|
+
- config/initializers/enumerable.rb
|
631
659
|
- config/initializers/evaluators.rb
|
632
660
|
- config/initializers/inflections.rb
|
633
661
|
- config/initializers/resque.rb
|
@@ -654,15 +682,13 @@ files:
|
|
654
682
|
- lib/easy_ml/data/date_converter.rb
|
655
683
|
- lib/easy_ml/data/filter_extensions.rb
|
656
684
|
- lib/easy_ml/data/polars_column.rb
|
685
|
+
- lib/easy_ml/data/polars_in_memory.rb
|
657
686
|
- lib/easy_ml/data/polars_reader.rb
|
658
|
-
- lib/easy_ml/data/preprocessor.rb
|
659
687
|
- lib/easy_ml/data/preprocessor/utils.rb
|
660
|
-
- lib/easy_ml/data/simple_imputer.rb
|
661
688
|
- lib/easy_ml/data/splits.rb
|
662
689
|
- lib/easy_ml/data/splits/file_split.rb
|
663
690
|
- lib/easy_ml/data/splits/in_memory_split.rb
|
664
691
|
- lib/easy_ml/data/splits/split.rb
|
665
|
-
- lib/easy_ml/data/statistics_learner.rb
|
666
692
|
- lib/easy_ml/data/synced_directory.rb
|
667
693
|
- lib/easy_ml/data/utils.rb
|
668
694
|
- lib/easy_ml/engine.rb
|
@@ -677,6 +703,9 @@ files:
|
|
677
703
|
- lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt
|
678
704
|
- lib/easy_ml/railtie/templates/migration/add_default_to_is_target.rb.tt
|
679
705
|
- lib/easy_ml/railtie/templates/migration/add_is_date_column_to_easy_ml_columns.rb.tt
|
706
|
+
- lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt
|
707
|
+
- lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt
|
708
|
+
- lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt
|
680
709
|
- lib/easy_ml/railtie/templates/migration/add_slug_to_easy_ml_models.rb.tt
|
681
710
|
- lib/easy_ml/railtie/templates/migration/add_workflow_status_to_easy_ml_features.rb.tt
|
682
711
|
- lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt
|
@@ -701,6 +730,7 @@ files:
|
|
701
730
|
- lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt
|
702
731
|
- lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt
|
703
732
|
- lib/easy_ml/railtie/templates/migration/drop_path_from_easy_ml_model_files.rb.tt
|
733
|
+
- lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt
|
704
734
|
- lib/easy_ml/support.rb
|
705
735
|
- lib/easy_ml/support/age.rb
|
706
736
|
- lib/easy_ml/support/est.rb
|
@@ -712,14 +742,15 @@ files:
|
|
712
742
|
- lib/easy_ml/support/synced_file.rb
|
713
743
|
- lib/easy_ml/support/utc.rb
|
714
744
|
- lib/easy_ml/version.rb
|
745
|
+
- lib/tasks/profile.rake
|
715
746
|
- lib/tasks/resque.rake
|
716
747
|
- lib/tasks/vite.rake
|
717
748
|
- lib/tasks/zhong.rake
|
718
749
|
- public/easy_ml/assets/.vite/manifest-assets.json
|
719
750
|
- public/easy_ml/assets/.vite/manifest.json
|
720
|
-
- public/easy_ml/assets/assets/Application-BUsRR6b6.css
|
721
|
-
- public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js
|
722
|
-
- public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js.map
|
751
|
+
- public/easy_ml/assets/assets/Application-BbFobaXt.css
|
752
|
+
- public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js
|
753
|
+
- public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map
|
723
754
|
homepage: https://github.com/brettshollenberger/easy_ml
|
724
755
|
licenses:
|
725
756
|
- MIT
|
@@ -1,45 +0,0 @@
|
|
1
|
-
module EasyML
|
2
|
-
module Adapters
|
3
|
-
class BaseAdapter
|
4
|
-
attr_reader :datasource
|
5
|
-
|
6
|
-
def initialize(datasource)
|
7
|
-
@datasource = datasource
|
8
|
-
end
|
9
|
-
|
10
|
-
def query(*)
|
11
|
-
raise NotImplementedError
|
12
|
-
end
|
13
|
-
|
14
|
-
def in_batches(*)
|
15
|
-
raise NotImplementedError
|
16
|
-
end
|
17
|
-
|
18
|
-
def files
|
19
|
-
raise NotImplementedError
|
20
|
-
end
|
21
|
-
|
22
|
-
def last_updated_at
|
23
|
-
raise NotImplementedError
|
24
|
-
end
|
25
|
-
|
26
|
-
def data
|
27
|
-
raise NotImplementedError
|
28
|
-
end
|
29
|
-
|
30
|
-
def needs_refresh?
|
31
|
-
false
|
32
|
-
end
|
33
|
-
|
34
|
-
def refresh
|
35
|
-
datasource.syncing do
|
36
|
-
# Default implementation does nothing
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
|
-
def refresh!
|
41
|
-
refresh
|
42
|
-
end
|
43
|
-
end
|
44
|
-
end
|
45
|
-
end
|
@@ -1,77 +0,0 @@
|
|
1
|
-
module EasyML
|
2
|
-
module Adapters
|
3
|
-
class PolarsAdapter < BaseAdapter
|
4
|
-
def initialize(datasource)
|
5
|
-
super
|
6
|
-
read_df_from_configuration
|
7
|
-
end
|
8
|
-
|
9
|
-
def query(drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false)
|
10
|
-
return if df.nil?
|
11
|
-
|
12
|
-
df = self.df.clone
|
13
|
-
df = df.filter(filter) if filter
|
14
|
-
df = df.select(select) if select.present?
|
15
|
-
df = df.unique if unique
|
16
|
-
drop_cols &= df.columns
|
17
|
-
df = df.drop(drop_cols) unless drop_cols.empty?
|
18
|
-
df = df.sort(sort, reverse: descending) if sort
|
19
|
-
df = df.limit(limit) if limit
|
20
|
-
df
|
21
|
-
end
|
22
|
-
|
23
|
-
def in_batches(of: 10_000)
|
24
|
-
total_rows = df.shape[0]
|
25
|
-
(0...total_rows).step(of) do |start|
|
26
|
-
end_index = [start + of, total_rows].min
|
27
|
-
yield df.slice(start, end_index - start)
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
|
-
def files
|
32
|
-
[]
|
33
|
-
end
|
34
|
-
|
35
|
-
def last_updated_at
|
36
|
-
datasource.updated_at
|
37
|
-
end
|
38
|
-
|
39
|
-
def data
|
40
|
-
df
|
41
|
-
end
|
42
|
-
|
43
|
-
private
|
44
|
-
|
45
|
-
attr_accessor :df
|
46
|
-
|
47
|
-
def store_df_in_configuration
|
48
|
-
return unless df
|
49
|
-
|
50
|
-
datasource.configuration = (datasource.configuration || {}).merge(
|
51
|
-
"df" => JSON.parse(df.write_json)
|
52
|
-
)
|
53
|
-
end
|
54
|
-
|
55
|
-
def read_df_from_configuration
|
56
|
-
return unless datasource.configuration&.key?("df")
|
57
|
-
|
58
|
-
df_data = datasource.configuration["df"]
|
59
|
-
columns = df_data["columns"].map do |col|
|
60
|
-
dtype = case col["datatype"]
|
61
|
-
when Hash
|
62
|
-
if col["datatype"]["Datetime"]
|
63
|
-
Polars::Datetime.new(col["datatype"]["Datetime"][0].downcase.to_sym).class
|
64
|
-
else
|
65
|
-
Polars::Utf8
|
66
|
-
end
|
67
|
-
else
|
68
|
-
Polars.const_get(col["datatype"])
|
69
|
-
end
|
70
|
-
Polars::Series.new(col["name"], col["values"], dtype: dtype)
|
71
|
-
end
|
72
|
-
|
73
|
-
@df = Polars::DataFrame.new(columns)
|
74
|
-
end
|
75
|
-
end
|
76
|
-
end
|
77
|
-
end
|
@@ -1,340 +0,0 @@
|
|
1
|
-
require "fileutils"
|
2
|
-
require "polars"
|
3
|
-
require "date"
|
4
|
-
require "json"
|
5
|
-
require_relative "simple_imputer"
|
6
|
-
|
7
|
-
module EasyML::Data
|
8
|
-
class Preprocessor
|
9
|
-
CATEGORICAL_COMMON_MIN = 50
|
10
|
-
|
11
|
-
ALLOWED_PARAMS = {
|
12
|
-
constant: [:constant],
|
13
|
-
categorical: %i[categorical_min one_hot ordinal_encoding],
|
14
|
-
most_frequent: %i[one_hot ordinal_encoding],
|
15
|
-
mean: [:clip],
|
16
|
-
median: [:clip],
|
17
|
-
}
|
18
|
-
|
19
|
-
PREPROCESSING_STRATEGIES = {
|
20
|
-
float: [
|
21
|
-
{ value: "ffill", label: "Forward Fill" },
|
22
|
-
{ value: "mean", label: "Mean" },
|
23
|
-
{ value: "median", label: "Median" },
|
24
|
-
{ value: "constant", label: "Constant Value" },
|
25
|
-
],
|
26
|
-
integer: [
|
27
|
-
{ value: "ffill", label: "Forward Fill" },
|
28
|
-
{ value: "mean", label: "Mean" },
|
29
|
-
{ value: "median", label: "Median" },
|
30
|
-
{ value: "constant", label: "Constant Value" },
|
31
|
-
],
|
32
|
-
boolean: [
|
33
|
-
{ value: "ffill", label: "Forward Fill" },
|
34
|
-
{ value: "most_frequent", label: "Most Frequent" },
|
35
|
-
{ value: "constant", label: "Constant Value" },
|
36
|
-
],
|
37
|
-
datetime: [
|
38
|
-
{ value: "ffill", label: "Forward Fill" },
|
39
|
-
{ value: "constant", label: "Constant Value" },
|
40
|
-
{ value: "today", label: "Current Date" },
|
41
|
-
],
|
42
|
-
string: [
|
43
|
-
{ value: "ffill", label: "Forward Fill" },
|
44
|
-
{ value: "most_frequent", label: "Most Frequent" },
|
45
|
-
{ value: "constant", label: "Constant Value" },
|
46
|
-
],
|
47
|
-
text: [
|
48
|
-
{ value: "ffill", label: "Forward Fill" },
|
49
|
-
{ value: "most_frequent", label: "Most Frequent" },
|
50
|
-
{ value: "constant", label: "Constant Value" },
|
51
|
-
],
|
52
|
-
categorical: [
|
53
|
-
{ value: "ffill", label: "Forward Fill" },
|
54
|
-
{ value: "categorical", label: "Categorical" },
|
55
|
-
{ value: "most_frequent", label: "Most Frequent" },
|
56
|
-
{ value: "constant", label: "Constant Value" },
|
57
|
-
],
|
58
|
-
}.freeze
|
59
|
-
|
60
|
-
attr_accessor :directory, :verbose, :imputers, :preprocessing_steps, :dataset
|
61
|
-
attr_reader :statistics
|
62
|
-
|
63
|
-
def initialize(options = {})
|
64
|
-
@directory = options[:directory]
|
65
|
-
@verbose = options[:verbose]
|
66
|
-
@imputers = options[:imputers]
|
67
|
-
@preprocessing_steps = options[:preprocessing_steps]
|
68
|
-
@dataset = options[:dataset]
|
69
|
-
@statistics = {}
|
70
|
-
end
|
71
|
-
|
72
|
-
def statistics=(stats)
|
73
|
-
@statistics = (stats || {}).deep_symbolize_keys
|
74
|
-
end
|
75
|
-
|
76
|
-
def apply_clip(df, preprocessing_steps)
|
77
|
-
df = df.clone
|
78
|
-
preprocessing_steps ||= {}
|
79
|
-
preprocessing_steps.deep_symbolize_keys!
|
80
|
-
|
81
|
-
(preprocessing_steps[:training] || {}).each_key do |col|
|
82
|
-
clip_params = preprocessing_steps.dig(:training, col, :params, :clip)
|
83
|
-
next unless clip_params
|
84
|
-
|
85
|
-
min = clip_params[:min]
|
86
|
-
max = clip_params[:max]
|
87
|
-
df[col.to_s] = df[col.to_s].clip(min, max)
|
88
|
-
end
|
89
|
-
|
90
|
-
df
|
91
|
-
end
|
92
|
-
|
93
|
-
def fit(df, precomputed_stats = {})
|
94
|
-
return if df.nil?
|
95
|
-
return if preprocessing_steps.nil? || preprocessing_steps.keys.none?
|
96
|
-
|
97
|
-
preprocessing_steps.deep_symbolize_keys!
|
98
|
-
df = apply_clip(df, preprocessing_steps)
|
99
|
-
|
100
|
-
self.statistics = StatisticsLearner.learn_df(df, dataset: dataset, type: :raw).deep_symbolize_keys.merge!(
|
101
|
-
precomputed_stats
|
102
|
-
).deep_symbolize_keys
|
103
|
-
end
|
104
|
-
|
105
|
-
def postprocess(df, inference: false, computed: false)
|
106
|
-
puts "Postprocessing..." if verbose
|
107
|
-
return df if preprocessing_steps.nil? || preprocessing_steps.keys.none?
|
108
|
-
|
109
|
-
steps = if inference
|
110
|
-
preprocessing_steps[:training].merge(preprocessing_steps[:inference] || {})
|
111
|
-
else
|
112
|
-
preprocessing_steps[:training]
|
113
|
-
end
|
114
|
-
|
115
|
-
if computed
|
116
|
-
computed_cols = dataset.columns.computed.map(&:name).map(&:to_sym)
|
117
|
-
steps = steps.deep_dup.slice(*computed_cols)
|
118
|
-
end
|
119
|
-
|
120
|
-
df = apply_transformations(df, steps)
|
121
|
-
|
122
|
-
puts "Postprocessing complete." if @verbose
|
123
|
-
df
|
124
|
-
end
|
125
|
-
|
126
|
-
def decode_labels(values, col: nil)
|
127
|
-
decoder = statistics.dig(col.to_sym, :label_decoder)
|
128
|
-
other_value = decoder.keys.map(&:to_s).map(&:to_i).max + 1
|
129
|
-
decoder[other_value] = "other"
|
130
|
-
decoder.stringify_keys!
|
131
|
-
|
132
|
-
values.map do |value|
|
133
|
-
decoder[value.to_s]
|
134
|
-
end
|
135
|
-
end
|
136
|
-
|
137
|
-
def is_fit?
|
138
|
-
statistics.any? { |_, col_stats| col_stats.any? { |_, strategy_stats| strategy_stats.present? } }
|
139
|
-
end
|
140
|
-
|
141
|
-
def delete
|
142
|
-
return unless File.directory?(@directory)
|
143
|
-
|
144
|
-
FileUtils.rm_rf(@directory)
|
145
|
-
end
|
146
|
-
|
147
|
-
def serialize
|
148
|
-
{
|
149
|
-
directory: directory,
|
150
|
-
verbose: verbose,
|
151
|
-
imputers: imputers,
|
152
|
-
preprocessing_steps: preprocessing_steps,
|
153
|
-
statistics: serialize_statistics(statistics || {}),
|
154
|
-
}
|
155
|
-
end
|
156
|
-
|
157
|
-
private
|
158
|
-
|
159
|
-
def initialize_imputers(config)
|
160
|
-
config.each_with_object({}) do |(col, conf), hash|
|
161
|
-
hash[col] ||= {}
|
162
|
-
conf.symbolize_keys!
|
163
|
-
method = conf[:method]
|
164
|
-
params = conf[:params] || {}
|
165
|
-
|
166
|
-
hash[col][method] = EasyML::Data::SimpleImputer.new(
|
167
|
-
strategy: method,
|
168
|
-
options: params,
|
169
|
-
path: directory,
|
170
|
-
attribute: col,
|
171
|
-
statistics: statistics.dig(col),
|
172
|
-
)
|
173
|
-
end
|
174
|
-
end
|
175
|
-
|
176
|
-
def apply_transformations(df, config)
|
177
|
-
imputers = initialize_imputers(config)
|
178
|
-
|
179
|
-
df = apply_clip(df, { training: config })
|
180
|
-
|
181
|
-
config.each do |col, conf|
|
182
|
-
conf.symbolize_keys!
|
183
|
-
if df.columns.map(&:downcase).map(&:to_s).include?(col.downcase.to_s)
|
184
|
-
actual_col = df.columns.map(&:to_s).find { |c| c.to_s.downcase == col.to_s.downcase }
|
185
|
-
|
186
|
-
strategy = conf[:method]
|
187
|
-
params = conf[:params]
|
188
|
-
imputer = imputers.dig(col, strategy)
|
189
|
-
|
190
|
-
df[actual_col] = imputer.transform(df[actual_col]) if imputer
|
191
|
-
|
192
|
-
if params.is_a?(Hash) && params.key?(:one_hot) && params[:one_hot] == true
|
193
|
-
df = apply_one_hot(df, col)
|
194
|
-
elsif params.is_a?(Hash) && params.key?(:ordinal_encoding) && params[:ordinal_encoding] == true
|
195
|
-
df = apply_ordinal_encoding(df, col)
|
196
|
-
end
|
197
|
-
elsif @verbose
|
198
|
-
puts "Warning: Column '#{col}' not found in DataFrame during apply_transformations process."
|
199
|
-
end
|
200
|
-
end
|
201
|
-
|
202
|
-
df
|
203
|
-
end
|
204
|
-
|
205
|
-
def apply_one_hot(df, col)
|
206
|
-
approved_values = statistics.dig(col, :allowed_categories).sort
|
207
|
-
|
208
|
-
# Create one-hot encoded columns
|
209
|
-
approved_values.each do |value|
|
210
|
-
new_col_name = "#{col}_#{value}".gsub(/-/, "_")
|
211
|
-
df = df.with_column(
|
212
|
-
df[col].cast(Polars::String).eq(value.to_s).cast(Polars::Boolean).alias(new_col_name)
|
213
|
-
)
|
214
|
-
end
|
215
|
-
|
216
|
-
# Create 'other' column for unapproved values
|
217
|
-
other_col_name = "#{col}_other"
|
218
|
-
df[other_col_name] = df[col].map_elements do |value|
|
219
|
-
approved_values.map(&:to_s).exclude?(value)
|
220
|
-
end.cast(Polars::Boolean)
|
221
|
-
df.drop([col.to_s])
|
222
|
-
end
|
223
|
-
|
224
|
-
def apply_ordinal_encoding(df, col)
|
225
|
-
approved_values = statistics.dig(col, :allowed_categories)
|
226
|
-
|
227
|
-
df.with_column(
|
228
|
-
df[col].map_elements do |value|
|
229
|
-
approved_values.map(&:to_s).exclude?(value) ? "other" : value
|
230
|
-
end.alias(col.to_s)
|
231
|
-
)
|
232
|
-
|
233
|
-
label_encoder = statistics.dig(col, :label_encoder).stringify_keys
|
234
|
-
other_value = label_encoder.values.max + 1
|
235
|
-
label_encoder["other"] = other_value
|
236
|
-
df.with_column(
|
237
|
-
df[col].map { |v| label_encoder[v.to_s] }.alias(col.to_s)
|
238
|
-
)
|
239
|
-
end
|
240
|
-
|
241
|
-
def prepare_for_imputation(df, col)
|
242
|
-
df = df.with_column(Polars.col(col).cast(Polars::Float64))
|
243
|
-
df.with_column(Polars.when(Polars.col(col).is_null).then(Float::NAN).otherwise(Polars.col(col)).alias(col))
|
244
|
-
end
|
245
|
-
|
246
|
-
def serialize_statistics(stats)
|
247
|
-
stats.deep_transform_values do |value|
|
248
|
-
case value
|
249
|
-
when Time, DateTime
|
250
|
-
{ "__type__" => "datetime", "value" => value.iso8601 }
|
251
|
-
when Date
|
252
|
-
{ "__type__" => "date", "value" => value.iso8601 }
|
253
|
-
when BigDecimal
|
254
|
-
{ "__type__" => "bigdecimal", "value" => value.to_s }
|
255
|
-
when Polars::DataType
|
256
|
-
{ "__type__" => "polars_dtype", "value" => value.to_s }
|
257
|
-
when Symbol
|
258
|
-
{ "__type__" => "symbol", "value" => value.to_s }
|
259
|
-
else
|
260
|
-
value
|
261
|
-
end
|
262
|
-
end
|
263
|
-
end
|
264
|
-
|
265
|
-
def deserialize_statistics(stats)
|
266
|
-
return nil if stats.nil?
|
267
|
-
|
268
|
-
stats.transform_values do |value|
|
269
|
-
recursive_deserialize(value)
|
270
|
-
end
|
271
|
-
end
|
272
|
-
|
273
|
-
def recursive_deserialize(value)
|
274
|
-
case value
|
275
|
-
when Hash
|
276
|
-
if value["__type__"]
|
277
|
-
deserialize_special_type(value)
|
278
|
-
else
|
279
|
-
value.transform_values { |v| recursive_deserialize(v) }
|
280
|
-
end
|
281
|
-
when Array
|
282
|
-
value.map { |v| recursive_deserialize(v) }
|
283
|
-
else
|
284
|
-
value
|
285
|
-
end
|
286
|
-
end
|
287
|
-
|
288
|
-
def deserialize_special_type(value)
|
289
|
-
case value["__type__"]
|
290
|
-
when "datetime"
|
291
|
-
DateTime.parse(value["value"])
|
292
|
-
when "date"
|
293
|
-
Date.parse(value["value"])
|
294
|
-
when "bigdecimal"
|
295
|
-
BigDecimal(value["value"])
|
296
|
-
when "polars_dtype"
|
297
|
-
parse_polars_dtype(value["value"])
|
298
|
-
when "symbol"
|
299
|
-
value["value"].to_sym
|
300
|
-
else
|
301
|
-
value["value"]
|
302
|
-
end
|
303
|
-
end
|
304
|
-
|
305
|
-
def parse_polars_dtype(dtype_string)
|
306
|
-
case dtype_string
|
307
|
-
when /^Polars::Datetime/
|
308
|
-
time_unit = dtype_string[/time_unit: "(.*?)"/, 1]
|
309
|
-
time_zone = dtype_string[/time_zone: (.*)?\)/, 1]
|
310
|
-
time_zone = time_zone == "nil" ? nil : time_zone&.delete('"')
|
311
|
-
Polars::Datetime.new(time_unit: time_unit, time_zone: time_zone).class
|
312
|
-
when /^Polars::/
|
313
|
-
Polars.const_get(dtype_string.split("::").last)
|
314
|
-
else
|
315
|
-
raise ArgumentError, "Unknown Polars data type: #{dtype_string}"
|
316
|
-
end
|
317
|
-
end
|
318
|
-
|
319
|
-
def cast_to_dtype(value, dtype)
|
320
|
-
case dtype
|
321
|
-
when Polars::Int64
|
322
|
-
value.to_i
|
323
|
-
when Polars::Float64
|
324
|
-
value.to_f
|
325
|
-
when Polars::Boolean
|
326
|
-
!!value
|
327
|
-
when Polars::Utf8
|
328
|
-
value.to_s
|
329
|
-
else
|
330
|
-
value
|
331
|
-
end
|
332
|
-
end
|
333
|
-
|
334
|
-
def self.constants
|
335
|
-
{
|
336
|
-
preprocessing_strategies: PREPROCESSING_STRATEGIES,
|
337
|
-
}
|
338
|
-
end
|
339
|
-
end
|
340
|
-
end
|