easy_ml 0.2.0.pre.rc58 → 0.2.0.pre.rc61

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/application_controller.rb +4 -0
  3. data/app/controllers/easy_ml/datasets_controller.rb +32 -1
  4. data/app/frontend/components/DatasetPreview.tsx +50 -19
  5. data/app/frontend/components/dataset/ColumnConfigModal.tsx +7 -1
  6. data/app/frontend/components/dataset/ColumnFilters.tsx +37 -3
  7. data/app/frontend/components/dataset/ColumnList.tsx +14 -2
  8. data/app/frontend/components/dataset/PreprocessingConfig.tsx +81 -20
  9. data/app/frontend/types/dataset.ts +3 -0
  10. data/app/jobs/easy_ml/compute_feature_job.rb +0 -3
  11. data/app/jobs/easy_ml/refresh_dataset_job.rb +0 -6
  12. data/app/models/easy_ml/column/imputers/base.rb +89 -0
  13. data/app/models/easy_ml/column/imputers/categorical.rb +35 -0
  14. data/app/models/easy_ml/column/imputers/clip.rb +30 -0
  15. data/app/models/easy_ml/column/imputers/constant.rb +27 -0
  16. data/app/models/easy_ml/column/imputers/ffill.rb +29 -0
  17. data/app/models/easy_ml/column/imputers/imputer.rb +103 -0
  18. data/app/models/easy_ml/column/imputers/mean.rb +27 -0
  19. data/app/models/easy_ml/column/imputers/median.rb +27 -0
  20. data/app/models/easy_ml/column/imputers/most_frequent.rb +27 -0
  21. data/app/models/easy_ml/column/imputers/null_imputer.rb +15 -0
  22. data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +30 -0
  23. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +78 -0
  24. data/app/models/easy_ml/column/imputers/today.rb +20 -0
  25. data/app/models/easy_ml/column/imputers.rb +126 -0
  26. data/app/models/easy_ml/column/learner.rb +18 -0
  27. data/app/models/easy_ml/column/learners/base.rb +103 -0
  28. data/app/models/easy_ml/column/learners/boolean.rb +11 -0
  29. data/app/models/easy_ml/column/learners/categorical.rb +51 -0
  30. data/app/models/easy_ml/column/learners/datetime.rb +19 -0
  31. data/app/models/easy_ml/column/learners/null.rb +22 -0
  32. data/app/models/easy_ml/column/learners/numeric.rb +33 -0
  33. data/app/models/easy_ml/column/learners/string.rb +15 -0
  34. data/app/models/easy_ml/column/lineage/base.rb +22 -0
  35. data/app/models/easy_ml/column/lineage/computed_by_feature.rb +23 -0
  36. data/app/models/easy_ml/column/lineage/preprocessed.rb +23 -0
  37. data/app/models/easy_ml/column/lineage/raw_dataset.rb +23 -0
  38. data/app/models/easy_ml/column/lineage.rb +28 -0
  39. data/app/models/easy_ml/column/selector.rb +96 -0
  40. data/app/models/easy_ml/column.rb +319 -52
  41. data/app/models/easy_ml/column_history.rb +29 -22
  42. data/app/models/easy_ml/column_list.rb +63 -78
  43. data/app/models/easy_ml/dataset.rb +128 -96
  44. data/app/models/easy_ml/dataset_history.rb +23 -23
  45. data/app/models/easy_ml/datasource.rb +3 -0
  46. data/app/models/easy_ml/datasource_history.rb +1 -0
  47. data/app/models/easy_ml/datasources/file_datasource.rb +1 -1
  48. data/app/models/easy_ml/datasources/polars_datasource.rb +6 -12
  49. data/app/models/easy_ml/datasources/s3_datasource.rb +1 -1
  50. data/app/models/easy_ml/feature.rb +19 -7
  51. data/app/models/easy_ml/feature_history.rb +12 -0
  52. data/app/models/easy_ml/feature_list.rb +15 -0
  53. data/app/serializers/easy_ml/column_serializer.rb +11 -1
  54. data/app/serializers/easy_ml/dataset_serializer.rb +23 -2
  55. data/config/initializers/enumerable.rb +17 -0
  56. data/lib/easy_ml/data/date_converter.rb +137 -30
  57. data/lib/easy_ml/data/polars_column.rb +17 -0
  58. data/lib/easy_ml/data/polars_in_memory.rb +30 -0
  59. data/lib/easy_ml/data/polars_reader.rb +20 -1
  60. data/lib/easy_ml/data/splits/in_memory_split.rb +3 -5
  61. data/lib/easy_ml/data/splits/split.rb +2 -1
  62. data/lib/easy_ml/data/synced_directory.rb +1 -1
  63. data/lib/easy_ml/data.rb +1 -2
  64. data/lib/easy_ml/engine.rb +1 -0
  65. data/lib/easy_ml/feature_store.rb +33 -22
  66. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +4 -0
  67. data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +4 -0
  68. data/lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt +9 -0
  69. data/lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt +13 -0
  70. data/lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt +21 -0
  71. data/lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt +11 -0
  72. data/lib/easy_ml/version.rb +1 -1
  73. data/lib/tasks/profile.rake +40 -0
  74. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  75. data/public/easy_ml/assets/assets/Application-BbFobaXt.css +1 -0
  76. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js +489 -0
  77. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map +1 -0
  78. metadata +41 -10
  79. data/app/models/easy_ml/adapters/base_adapter.rb +0 -45
  80. data/app/models/easy_ml/adapters/polars_adapter.rb +0 -77
  81. data/lib/easy_ml/data/preprocessor.rb +0 -340
  82. data/lib/easy_ml/data/simple_imputer.rb +0 -255
  83. data/lib/easy_ml/data/statistics_learner.rb +0 -193
  84. data/public/easy_ml/assets/assets/Application-BUsRR6b6.css +0 -1
  85. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js +0 -474
  86. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js.map +0 -1
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: easy_ml
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0.pre.rc58
4
+ version: 0.2.0.pre.rc61
5
5
  platform: ruby
6
6
  authors:
7
7
  - Brett Shollenberger
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-02-02 00:00:00.000000000 Z
11
+ date: 2025-02-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activerecord
@@ -559,10 +559,36 @@ files:
559
559
  - app/jobs/easy_ml/sync_datasource_job.rb
560
560
  - app/jobs/easy_ml/training_job.rb
561
561
  - app/models/concerns/easy_ml/dataframe_serialization.rb
562
- - app/models/easy_ml/adapters/base_adapter.rb
563
- - app/models/easy_ml/adapters/polars_adapter.rb
564
562
  - app/models/easy_ml/cleaner.rb
565
563
  - app/models/easy_ml/column.rb
564
+ - app/models/easy_ml/column/imputers.rb
565
+ - app/models/easy_ml/column/imputers/base.rb
566
+ - app/models/easy_ml/column/imputers/categorical.rb
567
+ - app/models/easy_ml/column/imputers/clip.rb
568
+ - app/models/easy_ml/column/imputers/constant.rb
569
+ - app/models/easy_ml/column/imputers/ffill.rb
570
+ - app/models/easy_ml/column/imputers/imputer.rb
571
+ - app/models/easy_ml/column/imputers/mean.rb
572
+ - app/models/easy_ml/column/imputers/median.rb
573
+ - app/models/easy_ml/column/imputers/most_frequent.rb
574
+ - app/models/easy_ml/column/imputers/null_imputer.rb
575
+ - app/models/easy_ml/column/imputers/one_hot_encoder.rb
576
+ - app/models/easy_ml/column/imputers/ordinal_encoder.rb
577
+ - app/models/easy_ml/column/imputers/today.rb
578
+ - app/models/easy_ml/column/learner.rb
579
+ - app/models/easy_ml/column/learners/base.rb
580
+ - app/models/easy_ml/column/learners/boolean.rb
581
+ - app/models/easy_ml/column/learners/categorical.rb
582
+ - app/models/easy_ml/column/learners/datetime.rb
583
+ - app/models/easy_ml/column/learners/null.rb
584
+ - app/models/easy_ml/column/learners/numeric.rb
585
+ - app/models/easy_ml/column/learners/string.rb
586
+ - app/models/easy_ml/column/lineage.rb
587
+ - app/models/easy_ml/column/lineage/base.rb
588
+ - app/models/easy_ml/column/lineage/computed_by_feature.rb
589
+ - app/models/easy_ml/column/lineage/preprocessed.rb
590
+ - app/models/easy_ml/column/lineage/raw_dataset.rb
591
+ - app/models/easy_ml/column/selector.rb
566
592
  - app/models/easy_ml/column_history.rb
567
593
  - app/models/easy_ml/column_list.rb
568
594
  - app/models/easy_ml/concerns/configurable.rb
@@ -580,6 +606,7 @@ files:
580
606
  - app/models/easy_ml/event_context.rb
581
607
  - app/models/easy_ml/feature.rb
582
608
  - app/models/easy_ml/feature_history.rb
609
+ - app/models/easy_ml/feature_list.rb
583
610
  - app/models/easy_ml/model.rb
584
611
  - app/models/easy_ml/model_file.rb
585
612
  - app/models/easy_ml/model_file_history.rb
@@ -628,6 +655,7 @@ files:
628
655
  - bin/setup
629
656
  - bin/vite
630
657
  - config/initializers/dataframe.rb
658
+ - config/initializers/enumerable.rb
631
659
  - config/initializers/evaluators.rb
632
660
  - config/initializers/inflections.rb
633
661
  - config/initializers/resque.rb
@@ -654,15 +682,13 @@ files:
654
682
  - lib/easy_ml/data/date_converter.rb
655
683
  - lib/easy_ml/data/filter_extensions.rb
656
684
  - lib/easy_ml/data/polars_column.rb
685
+ - lib/easy_ml/data/polars_in_memory.rb
657
686
  - lib/easy_ml/data/polars_reader.rb
658
- - lib/easy_ml/data/preprocessor.rb
659
687
  - lib/easy_ml/data/preprocessor/utils.rb
660
- - lib/easy_ml/data/simple_imputer.rb
661
688
  - lib/easy_ml/data/splits.rb
662
689
  - lib/easy_ml/data/splits/file_split.rb
663
690
  - lib/easy_ml/data/splits/in_memory_split.rb
664
691
  - lib/easy_ml/data/splits/split.rb
665
- - lib/easy_ml/data/statistics_learner.rb
666
692
  - lib/easy_ml/data/synced_directory.rb
667
693
  - lib/easy_ml/data/utils.rb
668
694
  - lib/easy_ml/engine.rb
@@ -677,6 +703,9 @@ files:
677
703
  - lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt
678
704
  - lib/easy_ml/railtie/templates/migration/add_default_to_is_target.rb.tt
679
705
  - lib/easy_ml/railtie/templates/migration/add_is_date_column_to_easy_ml_columns.rb.tt
706
+ - lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt
707
+ - lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt
708
+ - lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt
680
709
  - lib/easy_ml/railtie/templates/migration/add_slug_to_easy_ml_models.rb.tt
681
710
  - lib/easy_ml/railtie/templates/migration/add_workflow_status_to_easy_ml_features.rb.tt
682
711
  - lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt
@@ -701,6 +730,7 @@ files:
701
730
  - lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt
702
731
  - lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt
703
732
  - lib/easy_ml/railtie/templates/migration/drop_path_from_easy_ml_model_files.rb.tt
733
+ - lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt
704
734
  - lib/easy_ml/support.rb
705
735
  - lib/easy_ml/support/age.rb
706
736
  - lib/easy_ml/support/est.rb
@@ -712,14 +742,15 @@ files:
712
742
  - lib/easy_ml/support/synced_file.rb
713
743
  - lib/easy_ml/support/utc.rb
714
744
  - lib/easy_ml/version.rb
745
+ - lib/tasks/profile.rake
715
746
  - lib/tasks/resque.rake
716
747
  - lib/tasks/vite.rake
717
748
  - lib/tasks/zhong.rake
718
749
  - public/easy_ml/assets/.vite/manifest-assets.json
719
750
  - public/easy_ml/assets/.vite/manifest.json
720
- - public/easy_ml/assets/assets/Application-BUsRR6b6.css
721
- - public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js
722
- - public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js.map
751
+ - public/easy_ml/assets/assets/Application-BbFobaXt.css
752
+ - public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js
753
+ - public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map
723
754
  homepage: https://github.com/brettshollenberger/easy_ml
724
755
  licenses:
725
756
  - MIT
@@ -1,45 +0,0 @@
1
- module EasyML
2
- module Adapters
3
- class BaseAdapter
4
- attr_reader :datasource
5
-
6
- def initialize(datasource)
7
- @datasource = datasource
8
- end
9
-
10
- def query(*)
11
- raise NotImplementedError
12
- end
13
-
14
- def in_batches(*)
15
- raise NotImplementedError
16
- end
17
-
18
- def files
19
- raise NotImplementedError
20
- end
21
-
22
- def last_updated_at
23
- raise NotImplementedError
24
- end
25
-
26
- def data
27
- raise NotImplementedError
28
- end
29
-
30
- def needs_refresh?
31
- false
32
- end
33
-
34
- def refresh
35
- datasource.syncing do
36
- # Default implementation does nothing
37
- end
38
- end
39
-
40
- def refresh!
41
- refresh
42
- end
43
- end
44
- end
45
- end
@@ -1,77 +0,0 @@
1
- module EasyML
2
- module Adapters
3
- class PolarsAdapter < BaseAdapter
4
- def initialize(datasource)
5
- super
6
- read_df_from_configuration
7
- end
8
-
9
- def query(drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false)
10
- return if df.nil?
11
-
12
- df = self.df.clone
13
- df = df.filter(filter) if filter
14
- df = df.select(select) if select.present?
15
- df = df.unique if unique
16
- drop_cols &= df.columns
17
- df = df.drop(drop_cols) unless drop_cols.empty?
18
- df = df.sort(sort, reverse: descending) if sort
19
- df = df.limit(limit) if limit
20
- df
21
- end
22
-
23
- def in_batches(of: 10_000)
24
- total_rows = df.shape[0]
25
- (0...total_rows).step(of) do |start|
26
- end_index = [start + of, total_rows].min
27
- yield df.slice(start, end_index - start)
28
- end
29
- end
30
-
31
- def files
32
- []
33
- end
34
-
35
- def last_updated_at
36
- datasource.updated_at
37
- end
38
-
39
- def data
40
- df
41
- end
42
-
43
- private
44
-
45
- attr_accessor :df
46
-
47
- def store_df_in_configuration
48
- return unless df
49
-
50
- datasource.configuration = (datasource.configuration || {}).merge(
51
- "df" => JSON.parse(df.write_json)
52
- )
53
- end
54
-
55
- def read_df_from_configuration
56
- return unless datasource.configuration&.key?("df")
57
-
58
- df_data = datasource.configuration["df"]
59
- columns = df_data["columns"].map do |col|
60
- dtype = case col["datatype"]
61
- when Hash
62
- if col["datatype"]["Datetime"]
63
- Polars::Datetime.new(col["datatype"]["Datetime"][0].downcase.to_sym).class
64
- else
65
- Polars::Utf8
66
- end
67
- else
68
- Polars.const_get(col["datatype"])
69
- end
70
- Polars::Series.new(col["name"], col["values"], dtype: dtype)
71
- end
72
-
73
- @df = Polars::DataFrame.new(columns)
74
- end
75
- end
76
- end
77
- end
@@ -1,340 +0,0 @@
1
- require "fileutils"
2
- require "polars"
3
- require "date"
4
- require "json"
5
- require_relative "simple_imputer"
6
-
7
- module EasyML::Data
8
- class Preprocessor
9
- CATEGORICAL_COMMON_MIN = 50
10
-
11
- ALLOWED_PARAMS = {
12
- constant: [:constant],
13
- categorical: %i[categorical_min one_hot ordinal_encoding],
14
- most_frequent: %i[one_hot ordinal_encoding],
15
- mean: [:clip],
16
- median: [:clip],
17
- }
18
-
19
- PREPROCESSING_STRATEGIES = {
20
- float: [
21
- { value: "ffill", label: "Forward Fill" },
22
- { value: "mean", label: "Mean" },
23
- { value: "median", label: "Median" },
24
- { value: "constant", label: "Constant Value" },
25
- ],
26
- integer: [
27
- { value: "ffill", label: "Forward Fill" },
28
- { value: "mean", label: "Mean" },
29
- { value: "median", label: "Median" },
30
- { value: "constant", label: "Constant Value" },
31
- ],
32
- boolean: [
33
- { value: "ffill", label: "Forward Fill" },
34
- { value: "most_frequent", label: "Most Frequent" },
35
- { value: "constant", label: "Constant Value" },
36
- ],
37
- datetime: [
38
- { value: "ffill", label: "Forward Fill" },
39
- { value: "constant", label: "Constant Value" },
40
- { value: "today", label: "Current Date" },
41
- ],
42
- string: [
43
- { value: "ffill", label: "Forward Fill" },
44
- { value: "most_frequent", label: "Most Frequent" },
45
- { value: "constant", label: "Constant Value" },
46
- ],
47
- text: [
48
- { value: "ffill", label: "Forward Fill" },
49
- { value: "most_frequent", label: "Most Frequent" },
50
- { value: "constant", label: "Constant Value" },
51
- ],
52
- categorical: [
53
- { value: "ffill", label: "Forward Fill" },
54
- { value: "categorical", label: "Categorical" },
55
- { value: "most_frequent", label: "Most Frequent" },
56
- { value: "constant", label: "Constant Value" },
57
- ],
58
- }.freeze
59
-
60
- attr_accessor :directory, :verbose, :imputers, :preprocessing_steps, :dataset
61
- attr_reader :statistics
62
-
63
- def initialize(options = {})
64
- @directory = options[:directory]
65
- @verbose = options[:verbose]
66
- @imputers = options[:imputers]
67
- @preprocessing_steps = options[:preprocessing_steps]
68
- @dataset = options[:dataset]
69
- @statistics = {}
70
- end
71
-
72
- def statistics=(stats)
73
- @statistics = (stats || {}).deep_symbolize_keys
74
- end
75
-
76
- def apply_clip(df, preprocessing_steps)
77
- df = df.clone
78
- preprocessing_steps ||= {}
79
- preprocessing_steps.deep_symbolize_keys!
80
-
81
- (preprocessing_steps[:training] || {}).each_key do |col|
82
- clip_params = preprocessing_steps.dig(:training, col, :params, :clip)
83
- next unless clip_params
84
-
85
- min = clip_params[:min]
86
- max = clip_params[:max]
87
- df[col.to_s] = df[col.to_s].clip(min, max)
88
- end
89
-
90
- df
91
- end
92
-
93
- def fit(df, precomputed_stats = {})
94
- return if df.nil?
95
- return if preprocessing_steps.nil? || preprocessing_steps.keys.none?
96
-
97
- preprocessing_steps.deep_symbolize_keys!
98
- df = apply_clip(df, preprocessing_steps)
99
-
100
- self.statistics = StatisticsLearner.learn_df(df, dataset: dataset, type: :raw).deep_symbolize_keys.merge!(
101
- precomputed_stats
102
- ).deep_symbolize_keys
103
- end
104
-
105
- def postprocess(df, inference: false, computed: false)
106
- puts "Postprocessing..." if verbose
107
- return df if preprocessing_steps.nil? || preprocessing_steps.keys.none?
108
-
109
- steps = if inference
110
- preprocessing_steps[:training].merge(preprocessing_steps[:inference] || {})
111
- else
112
- preprocessing_steps[:training]
113
- end
114
-
115
- if computed
116
- computed_cols = dataset.columns.computed.map(&:name).map(&:to_sym)
117
- steps = steps.deep_dup.slice(*computed_cols)
118
- end
119
-
120
- df = apply_transformations(df, steps)
121
-
122
- puts "Postprocessing complete." if @verbose
123
- df
124
- end
125
-
126
- def decode_labels(values, col: nil)
127
- decoder = statistics.dig(col.to_sym, :label_decoder)
128
- other_value = decoder.keys.map(&:to_s).map(&:to_i).max + 1
129
- decoder[other_value] = "other"
130
- decoder.stringify_keys!
131
-
132
- values.map do |value|
133
- decoder[value.to_s]
134
- end
135
- end
136
-
137
- def is_fit?
138
- statistics.any? { |_, col_stats| col_stats.any? { |_, strategy_stats| strategy_stats.present? } }
139
- end
140
-
141
- def delete
142
- return unless File.directory?(@directory)
143
-
144
- FileUtils.rm_rf(@directory)
145
- end
146
-
147
- def serialize
148
- {
149
- directory: directory,
150
- verbose: verbose,
151
- imputers: imputers,
152
- preprocessing_steps: preprocessing_steps,
153
- statistics: serialize_statistics(statistics || {}),
154
- }
155
- end
156
-
157
- private
158
-
159
- def initialize_imputers(config)
160
- config.each_with_object({}) do |(col, conf), hash|
161
- hash[col] ||= {}
162
- conf.symbolize_keys!
163
- method = conf[:method]
164
- params = conf[:params] || {}
165
-
166
- hash[col][method] = EasyML::Data::SimpleImputer.new(
167
- strategy: method,
168
- options: params,
169
- path: directory,
170
- attribute: col,
171
- statistics: statistics.dig(col),
172
- )
173
- end
174
- end
175
-
176
- def apply_transformations(df, config)
177
- imputers = initialize_imputers(config)
178
-
179
- df = apply_clip(df, { training: config })
180
-
181
- config.each do |col, conf|
182
- conf.symbolize_keys!
183
- if df.columns.map(&:downcase).map(&:to_s).include?(col.downcase.to_s)
184
- actual_col = df.columns.map(&:to_s).find { |c| c.to_s.downcase == col.to_s.downcase }
185
-
186
- strategy = conf[:method]
187
- params = conf[:params]
188
- imputer = imputers.dig(col, strategy)
189
-
190
- df[actual_col] = imputer.transform(df[actual_col]) if imputer
191
-
192
- if params.is_a?(Hash) && params.key?(:one_hot) && params[:one_hot] == true
193
- df = apply_one_hot(df, col)
194
- elsif params.is_a?(Hash) && params.key?(:ordinal_encoding) && params[:ordinal_encoding] == true
195
- df = apply_ordinal_encoding(df, col)
196
- end
197
- elsif @verbose
198
- puts "Warning: Column '#{col}' not found in DataFrame during apply_transformations process."
199
- end
200
- end
201
-
202
- df
203
- end
204
-
205
- def apply_one_hot(df, col)
206
- approved_values = statistics.dig(col, :allowed_categories).sort
207
-
208
- # Create one-hot encoded columns
209
- approved_values.each do |value|
210
- new_col_name = "#{col}_#{value}".gsub(/-/, "_")
211
- df = df.with_column(
212
- df[col].cast(Polars::String).eq(value.to_s).cast(Polars::Boolean).alias(new_col_name)
213
- )
214
- end
215
-
216
- # Create 'other' column for unapproved values
217
- other_col_name = "#{col}_other"
218
- df[other_col_name] = df[col].map_elements do |value|
219
- approved_values.map(&:to_s).exclude?(value)
220
- end.cast(Polars::Boolean)
221
- df.drop([col.to_s])
222
- end
223
-
224
- def apply_ordinal_encoding(df, col)
225
- approved_values = statistics.dig(col, :allowed_categories)
226
-
227
- df.with_column(
228
- df[col].map_elements do |value|
229
- approved_values.map(&:to_s).exclude?(value) ? "other" : value
230
- end.alias(col.to_s)
231
- )
232
-
233
- label_encoder = statistics.dig(col, :label_encoder).stringify_keys
234
- other_value = label_encoder.values.max + 1
235
- label_encoder["other"] = other_value
236
- df.with_column(
237
- df[col].map { |v| label_encoder[v.to_s] }.alias(col.to_s)
238
- )
239
- end
240
-
241
- def prepare_for_imputation(df, col)
242
- df = df.with_column(Polars.col(col).cast(Polars::Float64))
243
- df.with_column(Polars.when(Polars.col(col).is_null).then(Float::NAN).otherwise(Polars.col(col)).alias(col))
244
- end
245
-
246
- def serialize_statistics(stats)
247
- stats.deep_transform_values do |value|
248
- case value
249
- when Time, DateTime
250
- { "__type__" => "datetime", "value" => value.iso8601 }
251
- when Date
252
- { "__type__" => "date", "value" => value.iso8601 }
253
- when BigDecimal
254
- { "__type__" => "bigdecimal", "value" => value.to_s }
255
- when Polars::DataType
256
- { "__type__" => "polars_dtype", "value" => value.to_s }
257
- when Symbol
258
- { "__type__" => "symbol", "value" => value.to_s }
259
- else
260
- value
261
- end
262
- end
263
- end
264
-
265
- def deserialize_statistics(stats)
266
- return nil if stats.nil?
267
-
268
- stats.transform_values do |value|
269
- recursive_deserialize(value)
270
- end
271
- end
272
-
273
- def recursive_deserialize(value)
274
- case value
275
- when Hash
276
- if value["__type__"]
277
- deserialize_special_type(value)
278
- else
279
- value.transform_values { |v| recursive_deserialize(v) }
280
- end
281
- when Array
282
- value.map { |v| recursive_deserialize(v) }
283
- else
284
- value
285
- end
286
- end
287
-
288
- def deserialize_special_type(value)
289
- case value["__type__"]
290
- when "datetime"
291
- DateTime.parse(value["value"])
292
- when "date"
293
- Date.parse(value["value"])
294
- when "bigdecimal"
295
- BigDecimal(value["value"])
296
- when "polars_dtype"
297
- parse_polars_dtype(value["value"])
298
- when "symbol"
299
- value["value"].to_sym
300
- else
301
- value["value"]
302
- end
303
- end
304
-
305
- def parse_polars_dtype(dtype_string)
306
- case dtype_string
307
- when /^Polars::Datetime/
308
- time_unit = dtype_string[/time_unit: "(.*?)"/, 1]
309
- time_zone = dtype_string[/time_zone: (.*)?\)/, 1]
310
- time_zone = time_zone == "nil" ? nil : time_zone&.delete('"')
311
- Polars::Datetime.new(time_unit: time_unit, time_zone: time_zone).class
312
- when /^Polars::/
313
- Polars.const_get(dtype_string.split("::").last)
314
- else
315
- raise ArgumentError, "Unknown Polars data type: #{dtype_string}"
316
- end
317
- end
318
-
319
- def cast_to_dtype(value, dtype)
320
- case dtype
321
- when Polars::Int64
322
- value.to_i
323
- when Polars::Float64
324
- value.to_f
325
- when Polars::Boolean
326
- !!value
327
- when Polars::Utf8
328
- value.to_s
329
- else
330
- value
331
- end
332
- end
333
-
334
- def self.constants
335
- {
336
- preprocessing_strategies: PREPROCESSING_STRATEGIES,
337
- }
338
- end
339
- end
340
- end