easy_ml 0.1.4 → 0.2.0.pre.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (239)
  1. checksums.yaml +4 -4
  2. data/README.md +234 -26
  3. data/Rakefile +45 -0
  4. data/app/controllers/easy_ml/application_controller.rb +67 -0
  5. data/app/controllers/easy_ml/columns_controller.rb +38 -0
  6. data/app/controllers/easy_ml/datasets_controller.rb +156 -0
  7. data/app/controllers/easy_ml/datasources_controller.rb +88 -0
  8. data/app/controllers/easy_ml/deploys_controller.rb +20 -0
  9. data/app/controllers/easy_ml/models_controller.rb +151 -0
  10. data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
  11. data/app/controllers/easy_ml/settings_controller.rb +59 -0
  12. data/app/frontend/components/AlertProvider.tsx +108 -0
  13. data/app/frontend/components/DatasetPreview.tsx +161 -0
  14. data/app/frontend/components/EmptyState.tsx +28 -0
  15. data/app/frontend/components/ModelCard.tsx +255 -0
  16. data/app/frontend/components/ModelDetails.tsx +334 -0
  17. data/app/frontend/components/ModelForm.tsx +384 -0
  18. data/app/frontend/components/Navigation.tsx +300 -0
  19. data/app/frontend/components/Pagination.tsx +72 -0
  20. data/app/frontend/components/Popover.tsx +55 -0
  21. data/app/frontend/components/PredictionStream.tsx +105 -0
  22. data/app/frontend/components/ScheduleModal.tsx +726 -0
  23. data/app/frontend/components/SearchInput.tsx +23 -0
  24. data/app/frontend/components/SearchableSelect.tsx +132 -0
  25. data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
  26. data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
  27. data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
  28. data/app/frontend/components/dataset/ColumnList.tsx +101 -0
  29. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
  30. data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
  31. data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
  32. data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
  33. data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
  34. data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
  35. data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
  36. data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
  37. data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
  38. data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
  39. data/app/frontend/components/dataset/splitters/constants.ts +77 -0
  40. data/app/frontend/components/dataset/splitters/types.ts +168 -0
  41. data/app/frontend/components/dataset/splitters/utils.ts +53 -0
  42. data/app/frontend/components/features/CodeEditor.tsx +46 -0
  43. data/app/frontend/components/features/DataPreview.tsx +150 -0
  44. data/app/frontend/components/features/FeatureCard.tsx +88 -0
  45. data/app/frontend/components/features/FeatureForm.tsx +235 -0
  46. data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
  47. data/app/frontend/components/settings/PluginSettings.tsx +81 -0
  48. data/app/frontend/components/ui/badge.tsx +44 -0
  49. data/app/frontend/components/ui/collapsible.tsx +9 -0
  50. data/app/frontend/components/ui/scroll-area.tsx +46 -0
  51. data/app/frontend/components/ui/separator.tsx +29 -0
  52. data/app/frontend/entrypoints/App.tsx +40 -0
  53. data/app/frontend/entrypoints/Application.tsx +24 -0
  54. data/app/frontend/hooks/useAutosave.ts +61 -0
  55. data/app/frontend/layouts/Layout.tsx +38 -0
  56. data/app/frontend/lib/utils.ts +6 -0
  57. data/app/frontend/mockData.ts +272 -0
  58. data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
  59. data/app/frontend/pages/DatasetsPage.tsx +261 -0
  60. data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
  61. data/app/frontend/pages/DatasourcesPage.tsx +261 -0
  62. data/app/frontend/pages/EditModelPage.tsx +45 -0
  63. data/app/frontend/pages/EditTransformationPage.tsx +56 -0
  64. data/app/frontend/pages/ModelsPage.tsx +115 -0
  65. data/app/frontend/pages/NewDatasetPage.tsx +366 -0
  66. data/app/frontend/pages/NewModelPage.tsx +45 -0
  67. data/app/frontend/pages/NewTransformationPage.tsx +43 -0
  68. data/app/frontend/pages/SettingsPage.tsx +272 -0
  69. data/app/frontend/pages/ShowModelPage.tsx +30 -0
  70. data/app/frontend/pages/TransformationsPage.tsx +95 -0
  71. data/app/frontend/styles/application.css +100 -0
  72. data/app/frontend/types/dataset.ts +146 -0
  73. data/app/frontend/types/datasource.ts +33 -0
  74. data/app/frontend/types/preprocessing.ts +1 -0
  75. data/app/frontend/types.ts +113 -0
  76. data/app/helpers/easy_ml/application_helper.rb +10 -0
  77. data/app/jobs/easy_ml/application_job.rb +21 -0
  78. data/app/jobs/easy_ml/batch_job.rb +46 -0
  79. data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
  80. data/app/jobs/easy_ml/deploy_job.rb +13 -0
  81. data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
  82. data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
  83. data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
  84. data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
  85. data/app/jobs/easy_ml/training_job.rb +62 -0
  86. data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
  87. data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
  88. data/app/models/easy_ml/cleaner.rb +82 -0
  89. data/app/models/easy_ml/column.rb +124 -0
  90. data/app/models/easy_ml/column_history.rb +30 -0
  91. data/app/models/easy_ml/column_list.rb +122 -0
  92. data/app/models/easy_ml/concerns/configurable.rb +61 -0
  93. data/app/models/easy_ml/concerns/versionable.rb +19 -0
  94. data/app/models/easy_ml/dataset.rb +767 -0
  95. data/app/models/easy_ml/dataset_history.rb +56 -0
  96. data/app/models/easy_ml/datasource.rb +182 -0
  97. data/app/models/easy_ml/datasource_history.rb +24 -0
  98. data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
  99. data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
  100. data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
  101. data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
  102. data/app/models/easy_ml/deploy.rb +114 -0
  103. data/app/models/easy_ml/event.rb +79 -0
  104. data/app/models/easy_ml/feature.rb +437 -0
  105. data/app/models/easy_ml/feature_history.rb +38 -0
  106. data/app/models/easy_ml/model.rb +575 -41
  107. data/app/models/easy_ml/model_file.rb +133 -0
  108. data/app/models/easy_ml/model_file_history.rb +24 -0
  109. data/app/models/easy_ml/model_history.rb +51 -0
  110. data/app/models/easy_ml/models/base_model.rb +58 -0
  111. data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
  112. data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
  113. data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
  114. data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
  115. data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
  116. data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
  117. data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
  118. data/app/models/easy_ml/models/xgboost.rb +544 -5
  119. data/app/models/easy_ml/prediction.rb +44 -0
  120. data/app/models/easy_ml/retraining_job.rb +278 -0
  121. data/app/models/easy_ml/retraining_run.rb +184 -0
  122. data/app/models/easy_ml/settings.rb +37 -0
  123. data/app/models/easy_ml/splitter.rb +90 -0
  124. data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
  125. data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
  126. data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
  127. data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
  128. data/app/models/easy_ml/tuner_job.rb +56 -0
  129. data/app/models/easy_ml/tuner_run.rb +31 -0
  130. data/app/models/splitter_history.rb +6 -0
  131. data/app/serializers/easy_ml/column_serializer.rb +27 -0
  132. data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
  133. data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
  134. data/app/serializers/easy_ml/feature_serializer.rb +27 -0
  135. data/app/serializers/easy_ml/model_serializer.rb +90 -0
  136. data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
  137. data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
  138. data/app/serializers/easy_ml/settings_serializer.rb +9 -0
  139. data/app/views/layouts/easy_ml/application.html.erb +15 -0
  140. data/config/initializers/resque.rb +3 -0
  141. data/config/resque-pool.yml +6 -0
  142. data/config/routes.rb +39 -0
  143. data/config/spring.rb +1 -0
  144. data/config/vite.json +15 -0
  145. data/lib/easy_ml/configuration.rb +64 -0
  146. data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
  147. data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
  148. data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
  149. data/lib/easy_ml/core/model_evaluator.rb +161 -89
  150. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
  151. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
  152. data/lib/easy_ml/core/tuner.rb +123 -62
  153. data/lib/easy_ml/core.rb +0 -3
  154. data/lib/easy_ml/core_ext/hash.rb +24 -0
  155. data/lib/easy_ml/core_ext/pathname.rb +11 -5
  156. data/lib/easy_ml/data/date_converter.rb +90 -0
  157. data/lib/easy_ml/data/filter_extensions.rb +31 -0
  158. data/lib/easy_ml/data/polars_column.rb +126 -0
  159. data/lib/easy_ml/data/polars_reader.rb +297 -0
  160. data/lib/easy_ml/data/preprocessor.rb +280 -142
  161. data/lib/easy_ml/data/simple_imputer.rb +255 -0
  162. data/lib/easy_ml/data/splits/file_split.rb +252 -0
  163. data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
  164. data/lib/easy_ml/data/splits/split.rb +95 -0
  165. data/lib/easy_ml/data/splits.rb +9 -0
  166. data/lib/easy_ml/data/statistics_learner.rb +93 -0
  167. data/lib/easy_ml/data/synced_directory.rb +341 -0
  168. data/lib/easy_ml/data.rb +6 -2
  169. data/lib/easy_ml/engine.rb +105 -6
  170. data/lib/easy_ml/feature_store.rb +227 -0
  171. data/lib/easy_ml/features.rb +61 -0
  172. data/lib/easy_ml/initializers/inflections.rb +17 -3
  173. data/lib/easy_ml/logging.rb +2 -2
  174. data/lib/easy_ml/predict.rb +74 -0
  175. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
  176. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
  177. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
  178. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
  179. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
  180. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
  181. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
  182. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
  183. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
  184. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
  185. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
  186. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
  187. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
  188. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
  189. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
  190. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
  191. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
  192. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
  193. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
  194. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
  195. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
  196. data/lib/easy_ml/support/est.rb +5 -1
  197. data/lib/easy_ml/support/file_rotate.rb +79 -15
  198. data/lib/easy_ml/support/file_support.rb +9 -0
  199. data/lib/easy_ml/support/local_file.rb +24 -0
  200. data/lib/easy_ml/support/lockable.rb +62 -0
  201. data/lib/easy_ml/support/synced_file.rb +103 -0
  202. data/lib/easy_ml/support/utc.rb +5 -1
  203. data/lib/easy_ml/support.rb +6 -3
  204. data/lib/easy_ml/version.rb +4 -1
  205. data/lib/easy_ml.rb +7 -2
  206. metadata +355 -72
  207. data/app/models/easy_ml/models.rb +0 -5
  208. data/lib/easy_ml/core/model.rb +0 -30
  209. data/lib/easy_ml/core/model_core.rb +0 -181
  210. data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
  211. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
  212. data/lib/easy_ml/core/models/xgboost.rb +0 -10
  213. data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
  214. data/lib/easy_ml/core/models.rb +0 -10
  215. data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
  216. data/lib/easy_ml/core/uploaders.rb +0 -7
  217. data/lib/easy_ml/data/dataloader.rb +0 -6
  218. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
  219. data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
  220. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
  221. data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
  222. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
  223. data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
  224. data/lib/easy_ml/data/dataset/splits.rb +0 -11
  225. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
  226. data/lib/easy_ml/data/dataset/splitters.rb +0 -9
  227. data/lib/easy_ml/data/dataset.rb +0 -430
  228. data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
  229. data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
  230. data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
  231. data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
  232. data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
  233. data/lib/easy_ml/data/datasource.rb +0 -33
  234. data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
  235. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
  236. data/lib/easy_ml/deployment.rb +0 -5
  237. data/lib/easy_ml/support/synced_directory.rb +0 -134
  238. data/lib/easy_ml/transforms.rb +0 -29
  239. /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -2,63 +2,135 @@ require "fileutils"
2
2
  require "polars"
3
3
  require "date"
4
4
  require "json"
5
- require_relative "preprocessor/utils"
6
- require_relative "preprocessor/simple_imputer"
5
+ require_relative "simple_imputer"
7
6
 
8
7
  module EasyML::Data
9
8
  class Preprocessor
10
- include GlueGun::DSL
11
- include EasyML::Data::Preprocessor::Utils
12
-
13
9
  CATEGORICAL_COMMON_MIN = 50
14
- PREPROCESSING_ORDER = %w[clip mean median constant categorical one_hot ffill custom fill_date add_datepart]
15
10
 
16
- attribute :directory, :string
17
- attribute :verbose, :boolean, default: false
18
- attribute :preprocessing_steps, :hash, default: {}
19
- def preprocessing_steps=(preprocessing_steps)
20
- super(standardize_config(preprocessing_steps).with_indifferent_access)
11
+ ALLOWED_PARAMS = {
12
+ constant: [:constant],
13
+ categorical: %i[categorical_min one_hot ordinal_encoding],
14
+ most_frequent: %i[one_hot ordinal_encoding],
15
+ mean: [:clip],
16
+ median: [:clip],
17
+ }
18
+
19
+ PREPROCESSING_STRATEGIES = {
20
+ float: [
21
+ { value: "mean", label: "Mean" },
22
+ { value: "median", label: "Median" },
23
+ { value: "constant", label: "Constant Value" },
24
+ ],
25
+ integer: [
26
+ { value: "mean", label: "Mean" },
27
+ { value: "median", label: "Median" },
28
+ { value: "constant", label: "Constant Value" },
29
+ ],
30
+ boolean: [
31
+ { value: "most_frequent", label: "Most Frequent" },
32
+ { value: "constant", label: "Constant Value" },
33
+ ],
34
+ datetime: [
35
+ { value: "ffill", label: "Forward Fill" },
36
+ { value: "constant", label: "Constant Value" },
37
+ { value: "today", label: "Current Date" },
38
+ ],
39
+ string: [
40
+ { value: "most_frequent", label: "Most Frequent" },
41
+ { value: "constant", label: "Constant Value" },
42
+ ],
43
+ text: [
44
+ { value: "most_frequent", label: "Most Frequent" },
45
+ { value: "constant", label: "Constant Value" },
46
+ ],
47
+ categorical: [
48
+ { value: "categorical", label: "Categorical" },
49
+ { value: "most_frequent", label: "Most Frequent" },
50
+ { value: "constant", label: "Constant Value" },
51
+ ],
52
+ }.freeze
53
+
54
+ attr_accessor :directory, :verbose, :imputers, :preprocessing_steps
55
+ attr_reader :statistics
56
+
57
+ def initialize(options = {})
58
+ @directory = options[:directory]
59
+ @verbose = options[:verbose]
60
+ @imputers = options[:imputers]
61
+ @preprocessing_steps = options[:preprocessing_steps]
62
+ @statistics = {}
63
+ end
64
+
65
+ def statistics=(stats)
66
+ @statistics = (stats || {}).deep_symbolize_keys
67
+ end
68
+
69
+ def apply_clip(df, preprocessing_steps)
70
+ df = df.clone
71
+ preprocessing_steps ||= {}
72
+ preprocessing_steps.deep_symbolize_keys!
73
+
74
+ (preprocessing_steps[:training] || {}).each_key do |col|
75
+ clip_params = preprocessing_steps.dig(:training, col, :params, :clip)
76
+ next unless clip_params
77
+
78
+ min = clip_params[:min]
79
+ max = clip_params[:max]
80
+ df[col.to_s] = df[col.to_s].clip(min, max)
81
+ end
82
+
83
+ df
84
+ end
85
+
86
+ def learn_categorical_min(df, preprocessing_steps)
87
+ preprocessing_steps ||= {}
88
+ preprocessing_steps.deep_symbolize_keys!
89
+
90
+ allowed_categories = {}
91
+ (preprocessing_steps[:training] || {}).each_key do |col|
92
+ next unless [
93
+ preprocessing_steps.dig(:training, col, :params, :ordinal_encoding),
94
+ preprocessing_steps.dig(:training, col, :params, :one_hot),
95
+ preprocessing_steps.dig(:training, col, :method).to_sym == :categorical,
96
+ ].any?
97
+
98
+ cat_min = preprocessing_steps.dig(:training, col, :params, :categorical_min) || 1
99
+ val_counts = df[col].value_counts
100
+ allowed_categories[col] = val_counts[val_counts["count"] >= cat_min][col].to_a.compact
101
+ end
102
+ allowed_categories
21
103
  end
22
104
 
23
105
  def fit(df)
24
106
  return if df.nil?
25
- return if preprocessing_steps.keys.none?
107
+ return if preprocessing_steps.nil? || preprocessing_steps.keys.none?
26
108
 
27
- puts "Preprocessing..." if verbose
28
- imputers = initialize_imputers(
29
- preprocessing_steps[:training].merge!(preprocessing_steps[:inference] || {})
30
- )
109
+ preprocessing_steps.deep_symbolize_keys!
110
+ df = apply_clip(df, preprocessing_steps)
111
+ allowed_categories = learn_categorical_min(df, preprocessing_steps)
31
112
 
32
- did_cleanup = false
33
- imputers.each do |col, imputers|
34
- sorted_strategies(imputers).each do |strategy|
35
- imputer = imputers[strategy]
36
- unless did_cleanup
37
- imputer.cleanup
38
- did_cleanup = true
39
- end
40
- if df.columns.map(&:downcase).include?(col.downcase)
41
- actual_col = df.columns.find { |c| c.downcase == imputer.attribute.downcase }
42
- imputer.fit(df[actual_col], df)
43
- if strategy == "clip" # This is the only one to transform during fit
44
- df[actual_col] = imputer.transform(df[actual_col])
45
- end
46
- elsif @verbose
47
- puts "Warning: Column '#{col}' not found in DataFrame during fit process."
48
- end
49
- end
113
+ self.statistics = StatisticsLearner.learn_df(df).deep_symbolize_keys
114
+
115
+ # Merge allowed categories into statistics
116
+ allowed_categories.each do |col, categories|
117
+ statistics[col] ||= {}
118
+ statistics[col][:allowed_categories] = categories
119
+ statistics[col].merge!(
120
+ fit_categorical(df[col], preprocessing_steps)
121
+ )
50
122
  end
51
123
  end
52
124
 
53
125
  def postprocess(df, inference: false)
54
126
  puts "Postprocessing..." if verbose
55
- return df if preprocessing_steps.keys.none?
127
+ return df if preprocessing_steps.nil? || preprocessing_steps.keys.none?
56
128
 
57
129
  steps = if inference
58
- preprocessing_steps[:training].merge(preprocessing_steps[:inference] || {})
59
- else
60
- preprocessing_steps[:training]
61
- end
130
+ preprocessing_steps[:training].merge(preprocessing_steps[:inference] || {})
131
+ else
132
+ preprocessing_steps[:training]
133
+ end
62
134
 
63
135
  df = apply_transformations(df, steps)
64
136
 
@@ -66,11 +138,14 @@ module EasyML::Data
66
138
  df
67
139
  end
68
140
 
69
- def statistics
70
- initialize_imputers(preprocessing_steps[:training]).each_with_object({}) do |(col, strategies), result|
71
- result[col] = strategies.each_with_object({}) do |(strategy, imputer), col_result|
72
- col_result[strategy] = imputer.statistics
73
- end
141
+ def decode_labels(values, col: nil)
142
+ decoder = statistics.dig(col.to_sym, :label_decoder)
143
+ other_value = decoder.keys.map(&:to_s).map(&:to_i).max + 1
144
+ decoder[other_value] = "other"
145
+ decoder.stringify_keys!
146
+
147
+ values.map do |value|
148
+ decoder[value.to_s]
74
149
  end
75
150
  end
76
151
 
@@ -84,66 +159,55 @@ module EasyML::Data
84
159
  FileUtils.rm_rf(@directory)
85
160
  end
86
161
 
87
- def move(to)
88
- old_dir = directory
89
- current_env = directory.split("/")[-1]
90
- new_dir = directory.gsub(Regexp.new(current_env), to)
91
-
92
- puts "Moving #{old_dir} to #{new_dir}"
93
- FileUtils.mv(old_dir, new_dir)
94
- @directory = new_dir
95
- end
96
-
97
- def decode_labels(values, col: nil)
98
- imputers = initialize_imputers(preprocessing_steps[:training], dumb: true)
99
- imputer = imputers.dig(col, "categorical")
100
- decoder = imputer.statistics.dig(:categorical, :label_decoder)
101
-
102
- other_value = decoder.keys.map(&:to_s).map(&:to_i).max + 1
103
- decoder[other_value] = "other"
104
- decoder.stringify_keys!
105
-
106
- values.map do |value|
107
- decoder[value.to_s]
108
- end
162
+ def serialize
163
+ {
164
+ directory: directory,
165
+ verbose: verbose,
166
+ imputers: imputers,
167
+ preprocessing_steps: preprocessing_steps,
168
+ statistics: serialize_statistics(statistics || {}),
169
+ }
109
170
  end
110
171
 
111
172
  private
112
173
 
113
- def initialize_imputers(config, dumb: false)
114
- standardize_config(config).each_with_object({}) do |(col, strategies), hash|
174
+ def initialize_imputers(config)
175
+ config.each_with_object({}) do |(col, conf), hash|
115
176
  hash[col] ||= {}
116
- strategies.each do |strategy, options|
117
- options = {} if options == true
118
-
119
- hash[col][strategy] = EasyML::Data::Preprocessor::SimpleImputer.new(
120
- strategy: strategy,
121
- path: directory,
122
- attribute: col,
123
- options: options
124
- )
125
- end
177
+ conf.symbolize_keys!
178
+ method = conf[:method]
179
+ params = conf[:params] || {}
180
+
181
+ hash[col][method] = EasyML::Data::SimpleImputer.new(
182
+ strategy: method,
183
+ options: params,
184
+ path: directory,
185
+ attribute: col,
186
+ statistics: statistics.dig(col),
187
+ )
126
188
  end
127
189
  end
128
190
 
129
191
  def apply_transformations(df, config)
130
192
  imputers = initialize_imputers(config)
131
193
 
132
- standardize_config(config).each do |col, strategies|
133
- if df.columns.map(&:downcase).include?(col.downcase)
134
- actual_col = df.columns.find { |c| c.downcase == col.downcase }
135
-
136
- sorted_strategies(strategies).each do |strategy|
137
- if strategy.to_sym == :categorical
138
- if imputers.dig(col, strategy).options.dig("one_hot")
139
- df = apply_one_hot(df, col, imputers)
140
- elsif imputers.dig(col, strategy).options.dig("encode_labels")
141
- df = apply_encode_labels(df, col, imputers)
142
- end
143
- else
144
- imputer = imputers.dig(col, strategy)
145
- df[actual_col] = imputer.transform(df[actual_col]) if imputer
146
- end
194
+ df = apply_clip(df, { training: config })
195
+
196
+ config.each do |col, conf|
197
+ conf.symbolize_keys!
198
+ if df.columns.map(&:downcase).map(&:to_s).include?(col.downcase.to_s)
199
+ actual_col = df.columns.map(&:to_s).find { |c| c.to_s.downcase == col.to_s.downcase }
200
+
201
+ strategy = conf[:method]
202
+ params = conf[:params]
203
+ imputer = imputers.dig(col, strategy)
204
+
205
+ df[actual_col] = imputer.transform(df[actual_col]) if imputer
206
+
207
+ if params.is_a?(Hash) && params.key?(:one_hot) && params[:one_hot] == true
208
+ df = apply_one_hot(df, col)
209
+ elsif params.is_a?(Hash) && params.key?(:ordinal_encoding) && params[:ordinal_encoding] == true
210
+ df = apply_ordinal_encoding(df, col)
147
211
  end
148
212
  elsif @verbose
149
213
  puts "Warning: Column '#{col}' not found in DataFrame during apply_transformations process."
@@ -153,17 +217,14 @@ module EasyML::Data
153
217
  df
154
218
  end
155
219
 
156
- def apply_one_hot(df, col, imputers)
157
- cat_imputer = imputers.dig(col, "categorical")
158
- approved_values = cat_imputer.statistics[:categorical][:value].select do |_k, v|
159
- v >= cat_imputer.options["categorical_min"]
160
- end.keys
220
+ def apply_one_hot(df, col)
221
+ approved_values = statistics.dig(col, :allowed_categories).sort
161
222
 
162
223
  # Create one-hot encoded columns
163
224
  approved_values.each do |value|
164
- new_col_name = "#{col}_#{value}".tr("-", "_")
225
+ new_col_name = "#{col}_#{value}".gsub(/-/, "_")
165
226
  df = df.with_column(
166
- df[col].eq(value.to_s).cast(Polars::Int64).alias(new_col_name)
227
+ df[col].cast(Polars::String).eq(value.to_s).cast(Polars::Boolean).alias(new_col_name)
167
228
  )
168
229
  end
169
230
 
@@ -171,68 +232,145 @@ module EasyML::Data
171
232
  other_col_name = "#{col}_other"
172
233
  df[other_col_name] = df[col].map_elements do |value|
173
234
  approved_values.map(&:to_s).exclude?(value)
174
- end.cast(Polars::Int64)
175
- df.drop([col])
235
+ end.cast(Polars::Boolean)
236
+ df.drop([col.to_s])
176
237
  end
177
238
 
178
- def apply_encode_labels(df, col, imputers)
179
- cat_imputer = imputers.dig(col, "categorical")
180
- approved_values = cat_imputer.statistics[:categorical][:value].select do |_k, v|
181
- v >= cat_imputer.options["categorical_min"]
182
- end.keys
239
+ def apply_ordinal_encoding(df, col)
240
+ approved_values = statistics.dig(col, :allowed_categories)
183
241
 
184
242
  df.with_column(
185
243
  df[col].map_elements do |value|
186
244
  approved_values.map(&:to_s).exclude?(value) ? "other" : value
187
- end.alias(col)
245
+ end.alias(col.to_s)
188
246
  )
189
247
 
190
- label_encoder = cat_imputer.statistics[:categorical][:label_encoder].stringify_keys
248
+ label_encoder = statistics.dig(col, :label_encoder).stringify_keys
191
249
  other_value = label_encoder.values.max + 1
192
250
  label_encoder["other"] = other_value
193
-
194
251
  df.with_column(
195
- df[col].map { |v| label_encoder[v.to_s] }.alias(col)
252
+ df[col].map { |v| label_encoder[v.to_s] }.alias(col.to_s)
196
253
  )
197
254
  end
198
255
 
199
- def sorted_strategies(strategies)
200
- strategies.keys.sort_by do |key|
201
- PREPROCESSING_ORDER.index(key)
256
+ def fit_categorical(series, _preprocessing_steps)
257
+ value_counts = series.value_counts
258
+ column_names = value_counts.columns
259
+ value_column = column_names[0]
260
+ count_column = column_names[1]
261
+
262
+ as_hash = value_counts.select([value_column, count_column]).rows.to_a.to_h.transform_keys(&:to_s)
263
+ label_encoder = as_hash.keys.sort.each.with_index.reduce({}) do |h, (k, i)|
264
+ h.tap do
265
+ h[k] = i
266
+ end
202
267
  end
268
+ label_decoder = label_encoder.invert
269
+
270
+ {
271
+ value: as_hash,
272
+ label_encoder: label_encoder,
273
+ label_decoder: label_decoder,
274
+ }
203
275
  end
204
276
 
205
277
  def prepare_for_imputation(df, col)
206
278
  df = df.with_column(Polars.col(col).cast(Polars::Float64))
207
279
  df.with_column(Polars.when(Polars.col(col).is_null).then(Float::NAN).otherwise(Polars.col(col)).alias(col))
208
280
  end
281
+
282
+ def serialize_statistics(stats)
283
+ stats.deep_transform_values do |value|
284
+ case value
285
+ when Time, DateTime
286
+ { "__type__" => "datetime", "value" => value.iso8601 }
287
+ when Date
288
+ { "__type__" => "date", "value" => value.iso8601 }
289
+ when BigDecimal
290
+ { "__type__" => "bigdecimal", "value" => value.to_s }
291
+ when Polars::DataType
292
+ { "__type__" => "polars_dtype", "value" => value.to_s }
293
+ when Symbol
294
+ { "__type__" => "symbol", "value" => value.to_s }
295
+ else
296
+ value
297
+ end
298
+ end
299
+ end
300
+
301
+ def deserialize_statistics(stats)
302
+ return nil if stats.nil?
303
+
304
+ stats.transform_values do |value|
305
+ recursive_deserialize(value)
306
+ end
307
+ end
308
+
309
+ def recursive_deserialize(value)
310
+ case value
311
+ when Hash
312
+ if value["__type__"]
313
+ deserialize_special_type(value)
314
+ else
315
+ value.transform_values { |v| recursive_deserialize(v) }
316
+ end
317
+ when Array
318
+ value.map { |v| recursive_deserialize(v) }
319
+ else
320
+ value
321
+ end
322
+ end
323
+
324
+ def deserialize_special_type(value)
325
+ case value["__type__"]
326
+ when "datetime"
327
+ DateTime.parse(value["value"])
328
+ when "date"
329
+ Date.parse(value["value"])
330
+ when "bigdecimal"
331
+ BigDecimal(value["value"])
332
+ when "polars_dtype"
333
+ parse_polars_dtype(value["value"])
334
+ when "symbol"
335
+ value["value"].to_sym
336
+ else
337
+ value["value"]
338
+ end
339
+ end
340
+
341
+ def parse_polars_dtype(dtype_string)
342
+ case dtype_string
343
+ when /^Polars::Datetime/
344
+ time_unit = dtype_string[/time_unit: "(.*?)"/, 1]
345
+ time_zone = dtype_string[/time_zone: (.*)?\)/, 1]
346
+ time_zone = time_zone == "nil" ? nil : time_zone&.delete('"')
347
+ Polars::Datetime.new(time_unit: time_unit, time_zone: time_zone).class
348
+ when /^Polars::/
349
+ Polars.const_get(dtype_string.split("::").last)
350
+ else
351
+ raise ArgumentError, "Unknown Polars data type: #{dtype_string}"
352
+ end
353
+ end
354
+
355
+ def cast_to_dtype(value, dtype)
356
+ case dtype
357
+ when Polars::Int64
358
+ value.to_i
359
+ when Polars::Float64
360
+ value.to_f
361
+ when Polars::Boolean
362
+ !!value
363
+ when Polars::Utf8
364
+ value.to_s
365
+ else
366
+ value
367
+ end
368
+ end
369
+
370
+ def self.constants
371
+ {
372
+ preprocessing_strategies: PREPROCESSING_STRATEGIES,
373
+ }
374
+ end
209
375
  end
210
376
  end
211
-
212
- # Where to put this???
213
- #
214
- # def self.stage_required_files
215
- # required_files.each do |file|
216
- # git_add(file)
217
- # end
218
- # end
219
-
220
- # def self.git_add(path)
221
- # command = "git add #{path}"
222
- # puts command if verbose
223
- # result = `#{command}`
224
- # puts result if verbose
225
- # end
226
-
227
- # def self.set_verbose(verbose)
228
- # @verbose = verbose
229
- # end
230
-
231
- # def required_files
232
- # files = Dir.entries(@directory) - %w[. ..]
233
- # required_file_types = %w[bin]
234
-
235
- # files.select { |file| required_file_types.any? { |ext| file.include?(ext) } }.map do |file|
236
- # File.join(@directory, file)
237
- # end
238
- # end