easy_ml 0.1.4 → 0.2.0.pre.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (239) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +234 -26
  3. data/Rakefile +45 -0
  4. data/app/controllers/easy_ml/application_controller.rb +67 -0
  5. data/app/controllers/easy_ml/columns_controller.rb +38 -0
  6. data/app/controllers/easy_ml/datasets_controller.rb +156 -0
  7. data/app/controllers/easy_ml/datasources_controller.rb +88 -0
  8. data/app/controllers/easy_ml/deploys_controller.rb +20 -0
  9. data/app/controllers/easy_ml/models_controller.rb +151 -0
  10. data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
  11. data/app/controllers/easy_ml/settings_controller.rb +59 -0
  12. data/app/frontend/components/AlertProvider.tsx +108 -0
  13. data/app/frontend/components/DatasetPreview.tsx +161 -0
  14. data/app/frontend/components/EmptyState.tsx +28 -0
  15. data/app/frontend/components/ModelCard.tsx +255 -0
  16. data/app/frontend/components/ModelDetails.tsx +334 -0
  17. data/app/frontend/components/ModelForm.tsx +384 -0
  18. data/app/frontend/components/Navigation.tsx +300 -0
  19. data/app/frontend/components/Pagination.tsx +72 -0
  20. data/app/frontend/components/Popover.tsx +55 -0
  21. data/app/frontend/components/PredictionStream.tsx +105 -0
  22. data/app/frontend/components/ScheduleModal.tsx +726 -0
  23. data/app/frontend/components/SearchInput.tsx +23 -0
  24. data/app/frontend/components/SearchableSelect.tsx +132 -0
  25. data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
  26. data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
  27. data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
  28. data/app/frontend/components/dataset/ColumnList.tsx +101 -0
  29. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
  30. data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
  31. data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
  32. data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
  33. data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
  34. data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
  35. data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
  36. data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
  37. data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
  38. data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
  39. data/app/frontend/components/dataset/splitters/constants.ts +77 -0
  40. data/app/frontend/components/dataset/splitters/types.ts +168 -0
  41. data/app/frontend/components/dataset/splitters/utils.ts +53 -0
  42. data/app/frontend/components/features/CodeEditor.tsx +46 -0
  43. data/app/frontend/components/features/DataPreview.tsx +150 -0
  44. data/app/frontend/components/features/FeatureCard.tsx +88 -0
  45. data/app/frontend/components/features/FeatureForm.tsx +235 -0
  46. data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
  47. data/app/frontend/components/settings/PluginSettings.tsx +81 -0
  48. data/app/frontend/components/ui/badge.tsx +44 -0
  49. data/app/frontend/components/ui/collapsible.tsx +9 -0
  50. data/app/frontend/components/ui/scroll-area.tsx +46 -0
  51. data/app/frontend/components/ui/separator.tsx +29 -0
  52. data/app/frontend/entrypoints/App.tsx +40 -0
  53. data/app/frontend/entrypoints/Application.tsx +24 -0
  54. data/app/frontend/hooks/useAutosave.ts +61 -0
  55. data/app/frontend/layouts/Layout.tsx +38 -0
  56. data/app/frontend/lib/utils.ts +6 -0
  57. data/app/frontend/mockData.ts +272 -0
  58. data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
  59. data/app/frontend/pages/DatasetsPage.tsx +261 -0
  60. data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
  61. data/app/frontend/pages/DatasourcesPage.tsx +261 -0
  62. data/app/frontend/pages/EditModelPage.tsx +45 -0
  63. data/app/frontend/pages/EditTransformationPage.tsx +56 -0
  64. data/app/frontend/pages/ModelsPage.tsx +115 -0
  65. data/app/frontend/pages/NewDatasetPage.tsx +366 -0
  66. data/app/frontend/pages/NewModelPage.tsx +45 -0
  67. data/app/frontend/pages/NewTransformationPage.tsx +43 -0
  68. data/app/frontend/pages/SettingsPage.tsx +272 -0
  69. data/app/frontend/pages/ShowModelPage.tsx +30 -0
  70. data/app/frontend/pages/TransformationsPage.tsx +95 -0
  71. data/app/frontend/styles/application.css +100 -0
  72. data/app/frontend/types/dataset.ts +146 -0
  73. data/app/frontend/types/datasource.ts +33 -0
  74. data/app/frontend/types/preprocessing.ts +1 -0
  75. data/app/frontend/types.ts +113 -0
  76. data/app/helpers/easy_ml/application_helper.rb +10 -0
  77. data/app/jobs/easy_ml/application_job.rb +21 -0
  78. data/app/jobs/easy_ml/batch_job.rb +46 -0
  79. data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
  80. data/app/jobs/easy_ml/deploy_job.rb +13 -0
  81. data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
  82. data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
  83. data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
  84. data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
  85. data/app/jobs/easy_ml/training_job.rb +62 -0
  86. data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
  87. data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
  88. data/app/models/easy_ml/cleaner.rb +82 -0
  89. data/app/models/easy_ml/column.rb +124 -0
  90. data/app/models/easy_ml/column_history.rb +30 -0
  91. data/app/models/easy_ml/column_list.rb +122 -0
  92. data/app/models/easy_ml/concerns/configurable.rb +61 -0
  93. data/app/models/easy_ml/concerns/versionable.rb +19 -0
  94. data/app/models/easy_ml/dataset.rb +767 -0
  95. data/app/models/easy_ml/dataset_history.rb +56 -0
  96. data/app/models/easy_ml/datasource.rb +182 -0
  97. data/app/models/easy_ml/datasource_history.rb +24 -0
  98. data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
  99. data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
  100. data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
  101. data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
  102. data/app/models/easy_ml/deploy.rb +114 -0
  103. data/app/models/easy_ml/event.rb +79 -0
  104. data/app/models/easy_ml/feature.rb +437 -0
  105. data/app/models/easy_ml/feature_history.rb +38 -0
  106. data/app/models/easy_ml/model.rb +575 -41
  107. data/app/models/easy_ml/model_file.rb +133 -0
  108. data/app/models/easy_ml/model_file_history.rb +24 -0
  109. data/app/models/easy_ml/model_history.rb +51 -0
  110. data/app/models/easy_ml/models/base_model.rb +58 -0
  111. data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
  112. data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
  113. data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
  114. data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
  115. data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
  116. data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
  117. data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
  118. data/app/models/easy_ml/models/xgboost.rb +544 -5
  119. data/app/models/easy_ml/prediction.rb +44 -0
  120. data/app/models/easy_ml/retraining_job.rb +278 -0
  121. data/app/models/easy_ml/retraining_run.rb +184 -0
  122. data/app/models/easy_ml/settings.rb +37 -0
  123. data/app/models/easy_ml/splitter.rb +90 -0
  124. data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
  125. data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
  126. data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
  127. data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
  128. data/app/models/easy_ml/tuner_job.rb +56 -0
  129. data/app/models/easy_ml/tuner_run.rb +31 -0
  130. data/app/models/splitter_history.rb +6 -0
  131. data/app/serializers/easy_ml/column_serializer.rb +27 -0
  132. data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
  133. data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
  134. data/app/serializers/easy_ml/feature_serializer.rb +27 -0
  135. data/app/serializers/easy_ml/model_serializer.rb +90 -0
  136. data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
  137. data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
  138. data/app/serializers/easy_ml/settings_serializer.rb +9 -0
  139. data/app/views/layouts/easy_ml/application.html.erb +15 -0
  140. data/config/initializers/resque.rb +3 -0
  141. data/config/resque-pool.yml +6 -0
  142. data/config/routes.rb +39 -0
  143. data/config/spring.rb +1 -0
  144. data/config/vite.json +15 -0
  145. data/lib/easy_ml/configuration.rb +64 -0
  146. data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
  147. data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
  148. data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
  149. data/lib/easy_ml/core/model_evaluator.rb +161 -89
  150. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
  151. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
  152. data/lib/easy_ml/core/tuner.rb +123 -62
  153. data/lib/easy_ml/core.rb +0 -3
  154. data/lib/easy_ml/core_ext/hash.rb +24 -0
  155. data/lib/easy_ml/core_ext/pathname.rb +11 -5
  156. data/lib/easy_ml/data/date_converter.rb +90 -0
  157. data/lib/easy_ml/data/filter_extensions.rb +31 -0
  158. data/lib/easy_ml/data/polars_column.rb +126 -0
  159. data/lib/easy_ml/data/polars_reader.rb +297 -0
  160. data/lib/easy_ml/data/preprocessor.rb +280 -142
  161. data/lib/easy_ml/data/simple_imputer.rb +255 -0
  162. data/lib/easy_ml/data/splits/file_split.rb +252 -0
  163. data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
  164. data/lib/easy_ml/data/splits/split.rb +95 -0
  165. data/lib/easy_ml/data/splits.rb +9 -0
  166. data/lib/easy_ml/data/statistics_learner.rb +93 -0
  167. data/lib/easy_ml/data/synced_directory.rb +341 -0
  168. data/lib/easy_ml/data.rb +6 -2
  169. data/lib/easy_ml/engine.rb +105 -6
  170. data/lib/easy_ml/feature_store.rb +227 -0
  171. data/lib/easy_ml/features.rb +61 -0
  172. data/lib/easy_ml/initializers/inflections.rb +17 -3
  173. data/lib/easy_ml/logging.rb +2 -2
  174. data/lib/easy_ml/predict.rb +74 -0
  175. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
  176. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
  177. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
  178. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
  179. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
  180. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
  181. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
  182. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
  183. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
  184. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
  185. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
  186. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
  187. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
  188. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
  189. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
  190. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
  191. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
  192. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
  193. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
  194. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
  195. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
  196. data/lib/easy_ml/support/est.rb +5 -1
  197. data/lib/easy_ml/support/file_rotate.rb +79 -15
  198. data/lib/easy_ml/support/file_support.rb +9 -0
  199. data/lib/easy_ml/support/local_file.rb +24 -0
  200. data/lib/easy_ml/support/lockable.rb +62 -0
  201. data/lib/easy_ml/support/synced_file.rb +103 -0
  202. data/lib/easy_ml/support/utc.rb +5 -1
  203. data/lib/easy_ml/support.rb +6 -3
  204. data/lib/easy_ml/version.rb +4 -1
  205. data/lib/easy_ml.rb +7 -2
  206. metadata +355 -72
  207. data/app/models/easy_ml/models.rb +0 -5
  208. data/lib/easy_ml/core/model.rb +0 -30
  209. data/lib/easy_ml/core/model_core.rb +0 -181
  210. data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
  211. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
  212. data/lib/easy_ml/core/models/xgboost.rb +0 -10
  213. data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
  214. data/lib/easy_ml/core/models.rb +0 -10
  215. data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
  216. data/lib/easy_ml/core/uploaders.rb +0 -7
  217. data/lib/easy_ml/data/dataloader.rb +0 -6
  218. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
  219. data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
  220. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
  221. data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
  222. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
  223. data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
  224. data/lib/easy_ml/data/dataset/splits.rb +0 -11
  225. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
  226. data/lib/easy_ml/data/dataset/splitters.rb +0 -9
  227. data/lib/easy_ml/data/dataset.rb +0 -430
  228. data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
  229. data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
  230. data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
  231. data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
  232. data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
  233. data/lib/easy_ml/data/datasource.rb +0 -33
  234. data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
  235. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
  236. data/lib/easy_ml/deployment.rb +0 -5
  237. data/lib/easy_ml/support/synced_directory.rb +0 -134
  238. data/lib/easy_ml/transforms.rb +0 -29
  239. /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -0,0 +1,255 @@
1
+ require "active_support/core_ext/hash/deep_transform_values"
2
+ require "numo/narray"
3
+ require "json"
4
+
5
module EasyML
  module Data
    # Column-level imputer.
    #
    # `#fit` learns per-strategy values from a Polars::Series and stores them in
    # `statistics[attribute][strategy]`; `#transform` fills or rewrites values
    # according to `strategy`.
    #
    # Strategies dispatched by #fit: :mean, :median, :ffill, :most_frequent,
    # :categorical, :constant, :clip, :today, :one_hot, :custom.
    #
    # NOTE(review): `fit_categorical` and `cast_to_dtype` are called below but are
    # not defined in this class as shown; presumably they live elsewhere — verify.
    # NOTE(review): #fit writes `statistics[attribute][strategy][:value]`, while
    # #transform_polars / #check_is_fitted read flat keys such as
    # `statistics[:mean]` or `statistics[:last_value]`. The flat shape is assumed
    # to be supplied through the constructor's `statistics:` argument — confirm.
    class SimpleImputer
      attr_reader :statistics
      attr_accessor :path, :attribute, :strategy, :options

      # @param strategy [String, Symbol] imputation strategy (converted with #to_sym)
      # @param path [String, nil] directory used by #file_path
      # @param attribute [String, Symbol, nil] column this imputer is responsible for
      # @param options [Hash, nil] strategy options; defaults are filled in by #apply_defaults
      # @param statistics [Hash, nil] previously learned statistics (keys are symbolized)
      # @yield optional block evaluated in the context of the new instance
      def initialize(strategy: "mean", path: nil, attribute: nil, options: {}, statistics: {}, &block)
        @strategy = strategy.to_sym
        @path = path
        @attribute = attribute
        @options = options || {}
        apply_defaults
        @statistics = statistics || {}
        deep_symbolize_keys!
        return unless block_given?

        instance_eval(&block)
      end

      # Replaces @statistics with a deeply symbol-keyed copy.
      def deep_symbolize_keys!
        @statistics = @statistics.deep_symbolize_keys
      end

      # Fills in option defaults: a date column for every strategy, a minimum
      # category count for :categorical, identity callables for :custom.
      def apply_defaults
        @options[:date_column] ||= "CREATED_DATE"

        if strategy == :categorical
          @options[:categorical_min] ||= 25
        elsif strategy == :custom
          # Named `identity` so the local does not shadow Kernel#itself.
          identity = ->(col) { col }
          @options[:fit] ||= identity
          @options[:transform] ||= identity
        end
      end

      # Learns the statistics for the configured strategy.
      #
      # @param x [Polars::Series] column to learn from
      # @param df [Polars::DataFrame, nil] full frame; required by :ffill
      # @return [Hash] deeply symbol-keyed copy of the accumulated statistics
      # @raise [ArgumentError] if x is not a Polars::Series or the strategy is unknown
      def fit(x, df = nil)
        x = validate_input(x)

        fit_values = case @strategy
                     when :mean then fit_mean(x)
                     when :median then fit_median(x)
                     when :ffill then fit_ffill(x, df)
                     when :most_frequent then fit_most_frequent(x)
                     when :categorical then fit_categorical(x)
                     when :constant then fit_constant(x)
                     when :clip, :today, :one_hot then fit_no_op(x)
                     when :custom then fit_custom(x)
                     else
                       raise ArgumentError, "Invalid strategy: #{@strategy}"
                     end

        # Fitters may return nil (early exit) and fit_custom returns the Series
        # itself; only a Hash can carry statistics, so anything else becomes {}.
        fit_values = {} unless fit_values.is_a?(Hash)

        @statistics[attribute] ||= {}
        @statistics[attribute][@strategy] = fit_values.merge(original_dtype: x.dtype)
        @statistics.deep_symbolize_keys
      end

      # Applies the strategy to x after verifying the imputer is fitted.
      #
      # @param x [Polars::Series, Enumerable] values to transform
      # @return [Polars::Series, Array] transformed values
      def transform(x)
        check_is_fitted

        if x.is_a?(Polars::Series)
          transform_polars(x)
        else
          transform_dense(x)
        end
      end

      # Strategy implementation for Polars::Series input.
      #
      # @param x [Polars::Series]
      # @raise [ArgumentError] for strategies without a Series implementation
      def transform_polars(x)
        case @strategy
        when :mean, :median
          x.fill_null(@statistics[@strategy])
        when :ffill
          x.fill_null(@statistics[:last_value])
        when :most_frequent
          x.fill_null(@statistics[:most_frequent_value])
        when :constant
          x.fill_null(@options[:constant])
        when :categorical
          allowed_cats = statistics[:allowed_categories]
          df = Polars::DataFrame.new({ x: x })
          # Any value outside the allowed categories collapses to "other".
          df.with_column(
            Polars.when(Polars.col("x").is_in(allowed_cats))
              .then(Polars.col("x"))
              .otherwise(Polars.lit("other"))
              .alias("x")
          )["x"]
        when :clip
          # Accept symbol keys (used by every other option) as well as the
          # string keys that were the only form read previously.
          min = options[:min] || options["min"] || 0
          max = options[:max] || options["max"] || 1_000_000_000_000
          # An all-null series is returned untouched.
          if x.null_count != x.len
            x.clip(min, max)
          else
            x
          end
        when :today
          x.fill_null(transform_today(nil))
        when :custom
          if x.null_count == x.len
            x.fill_null(transform_custom(nil))
          else
            x.apply do |val|
              should_transform_custom?(val) ? transform_custom(val) : val
            end
          end
        else
          raise ArgumentError, "Unsupported strategy for Polars::Series: #{@strategy}"
        end
      end

      # @return [String] location of statistics.json under `path`
      # @raise [RuntimeError] unless both attribute and path are set
      def file_path
        raise "Need both attribute and path to save/load statistics" unless attribute.present? && path.to_s.present?

        File.join(path, "statistics.json")
      end

      # Fill value for :today — the start of the current day per `UTC.now`.
      def transform_today(_val)
        UTC.now.beginning_of_day
      end

      # :custom learns nothing; the input is returned unchanged.
      def fit_custom(x)
        x
      end

      # Decides whether a value is rewritten under :custom, preferring the
      # user-supplied `options[:should_transform]` callable when present.
      def should_transform_custom?(x)
        if options.key?(:should_transform)
          options[:should_transform].call(x)
        else
          should_transform_default?(x)
        end
      end

      # @raise [RuntimeError] when no :transform callable is configured
      def transform_custom(x)
        raise "transform required" unless options.key?(:transform)

        options[:transform].call(x)
      end

      private

      def validate_input(x)
        raise ArgumentError, "Input must be a Polars::Series" unless x.is_a?(Polars::Series)

        x
      end

      def fit_mean(x)
        { value: x.mean }
      end

      def fit_median(x)
        { value: x.median }
      end

      # Learns the last non-null value of x when rows are ordered by
      # `options[:date_column]`, together with the largest date seen.
      #
      # @return [Hash, nil] nil when the date column is entirely null
      # @raise [ArgumentError] when df is missing
      def fit_ffill(x, df)
        raise ArgumentError, "ffill strategy requires the source DataFrame (df)" if df.nil?

        date_column = options[:date_column]
        return if df[date_column].is_null.all

        sorted_df = df.sort(date_column)
        column = sorted_df[x.name]
        last_non_null = column.filter(column.is_not_null).tail(1).to_a.first

        { value: last_non_null, max_date: sorted_df[date_column].max }
      end

      # Learns the most common non-null value; an all-null column yields nil
      # instead of failing on an empty value_counts frame.
      def fit_most_frequent(x)
        value_counts = x.filter(x.is_not_null).value_counts
        return { value: nil } if value_counts.height.zero?

        # value_counts has two columns: the values, then their counts.
        count_column = value_counts.columns[1]
        { value: value_counts.sort(count_column, descending: true).row(0)[0] }
      end

      def fit_no_op(_x)
        {}
      end

      def fit_constant(_x)
        { value: @options[:fill_value] }
      end

      def transform_default(_val)
        @statistics[strategy][:value]
      end

      # A value needs imputing when it is NaN (for objects that can be NaN) or nil.
      def should_transform_default?(val)
        val.respond_to?(:nan?) ? val.nan? : val.nil?
      end

      # Strategy implementation for plain enumerables. Uses the public
      # `transform_<strategy>` / `should_transform_<strategy>?` hooks when they
      # exist and the defaults otherwise.
      def transform_dense(x)
        transform_hook = "transform_#{strategy}"
        checker_hook = "should_transform_#{strategy}?"
        # respond_to? only sees public methods, so private helpers never match.
        strategy_method = respond_to?(transform_hook) ? transform_hook : "transform_default"
        checker_method = respond_to?(checker_hook) ? checker_hook : "should_transform_default?"

        result = x.map do |val|
          send(checker_method, val) ? send(strategy_method, val) : val
        end

        # Cast the result back to the original dtype
        original_dtype = @statistics[:original_dtype]
        return result unless original_dtype

        result.map { |val| cast_to_dtype(val, original_dtype) }
      end

      # @raise [RuntimeError] when the statistics needed by the strategy are missing
      def check_is_fitted
        return if %i[clip today custom].include?(strategy)

        pass_check = case strategy
                     when :mean
                       @statistics.dig(:mean).present?
                     when :median
                       @statistics.dig(:median).present?
                     when :ffill
                       @statistics.dig(:last_value).present?
                     when :most_frequent
                       @statistics.key?(:most_frequent_value)
                     when :constant
                       options.dig(:constant).present?
                     when :categorical
                       true
                     end

        raise "SimpleImputer has not been fitted yet for #{attribute}##{strategy}" unless pass_check
      end
    end
  end
end
@@ -0,0 +1,252 @@
1
+ require_relative "split"
2
+
3
module EasyML
  module Data
    module Splits
      # Split whose segments are stored as parquet files under `dir/<segment>/`
      # and mirrored to S3 through EasyML::Data::SyncedDirectory.
      class FileSplit < Split
        include EasyML::Data::Utils

        attr_accessor :dir, :polars_args, :max_rows_per_file, :batch_size, :verbose,
                      :dataset, :datasource

        # @param options [Hash] symbol-keyed settings: :dir, :polars_args,
        #   :max_rows_per_file, :batch_size, :verbose, :dataset, :datasource.
        #   The directory is created if it does not exist.
        def initialize(options = {})
          super
          @dir = options[:dir]
          @polars_args = options[:polars_args] || {}
          @max_rows_per_file = options[:max_rows_per_file] || 1_000_000
          @batch_size = options[:batch_size] || 10_000
          @verbose = options[:verbose] || false
          @dataset = options[:dataset]
          @datasource = options[:datasource]
          FileUtils.mkdir_p(dir)
        end

        # @return [ActiveSupport::HashWithIndifferentAccess] current settings
        def attributes
          {
            dir: dir,
            polars_args: polars_args,
            max_rows_per_file: max_rows_per_file,
            batch_size: batch_size,
            verbose: verbose,
            dataset: dataset,
            datasource: datasource,
          }.with_indifferent_access
        end

        # S3 key prefix: "datasets" joined with whatever follows the last
        # "datasets" occurrence in `dir`.
        def s3_prefix
          File.join("datasets", dir.split("datasets").last)
        end

        # Memoized SyncedDirectory for `dir`; the datasource configuration is
        # only consulted the first time the directory is built.
        def synced_directory
          @synced_dir ||= begin
            datasource_config = datasource.configuration || {}
            EasyML::Data::SyncedDirectory.new(
              root_dir: dir,
              s3_bucket: datasource_config.dig("s3_bucket") || EasyML::Configuration.s3_bucket,
              s3_prefix: s3_prefix,
              s3_access_key_id: EasyML::Configuration.s3_access_key_id,
              s3_secret_access_key: EasyML::Configuration.s3_secret_access_key,
              polars_args: datasource_config.dig("polars_args"),
              cache_for: 0,
            )
          end
        end

        def should_sync?
          synced_directory.should_sync?
        end

        def download
          synced_directory.download
        end

        def upload
          synced_directory.upload
        end

        # Copies every file under `dir` into a target directory.
        #
        # @param target_dir [String, Pathname] a directory, or a bare 14-digit
        #   version string that is substituted into the current path
        # @return [FileSplit] a split rooted at the target carrying the same
        #   settings, or self when a version resolves to no new directory
        def cp(target_dir)
          target_dir = version_to_dir(target_dir) if is_version?(target_dir)
          return self if target_dir.nil?

          target_dir = target_dir.to_s

          puts "copying #{dir} to #{target_dir}"
          FileUtils.mkdir_p(target_dir)

          files_to_cp = Dir.glob(Pathname.new(dir).join("**/*")).select { |f| File.file?(f) }
          target_parts = target_dir.split("/")

          files_to_cp.each do |file|
            # Keep the path components that differ, position by position, from
            # the target path, then drop the first of them.
            _, not_shared = file.split("/").partition.with_index { |part, index| target_parts[index] == part }
            not_shared = not_shared[1..-1].join("/")

            target_file = File.join(target_dir, not_shared)
            FileUtils.mkdir_p(File.dirname(target_file))
            FileUtils.cp(file, target_file)
          end

          # Pass symbol keys explicitly: #initialize reads options[:key], so
          # splatting the string-keyed #attributes hash would drop every setting.
          self.class.new(
            dir: target_dir,
            polars_args: polars_args,
            max_rows_per_file: max_rows_per_file,
            batch_size: batch_size,
            verbose: verbose,
            dataset: dataset,
            datasource: datasource,
          )
        end

        # Writes df as a new parquet file in the segment directory.
        #
        # @return [String, nil] path written, or nil when df is not present
        def save(segment, df)
          return unless df.present?

          segment_dir = File.join(dir, segment.to_s)
          FileUtils.mkdir_p(segment_dir)

          file_path = new_file_path_for_segment(segment)
          df.write_parquet(file_path)
          file_path
        end

        # Reads across all segments; see #read for the accepted options.
        def query(**kwargs, &block)
          read(:all, **kwargs, &block)
        end

        # Reads a segment through EasyML::Data::PolarsReader.
        #
        # @return [Object, Array, Enumerator, nil] nil (or [nil, nil] with
        #   split_ys) when the segment has no files; an Enumerator when
        #   batch_size is given; otherwise the query result, split into
        #   [features, targets] when split_ys is true
        def read(segment, split_ys: false, target: nil, drop_cols: [], filter: nil, limit: nil, select: nil,
                 unique: nil, sort: nil, descending: false, batch_size: nil, batch_start: nil, batch_key: nil, &block)
          files = files_for_segment(segment)
          return split_ys ? [nil, nil] : nil if files.empty?

          query_params = {
            filter: filter,
            limit: limit,
            select: select,
            unique: unique,
            drop_cols: drop_cols,
            sort: sort,
            descending: descending,
            batch_size: batch_size,
            batch_start: batch_start,
            batch_key: batch_key,
          }.compact

          if batch_size.present?
            base_enumerator = EasyML::Data::PolarsReader.query(files, **query_params)

            if block_given?
              wrap_with_block(base_enumerator, split_ys, target, &block)
            else
              # NOTE(review): this path always splits features/targets, whatever
              # split_ys says — confirm that is intended.
              wrap_with_split(base_enumerator, target)
            end
          else
            df = EasyML::Data::PolarsReader.query(files, **query_params, &block)
            split_features_targets(df, split_ys, target)
          end
        end

        # Removes `dir` with all its contents and recreates it empty.
        def cleanup
          FileUtils.rm_rf(dir)
          FileUtils.mkdir_p(dir)
        end

        # @return [Time, nil] newest mtime among the parquet files, nil if none
        def split_at
          return nil if output_files.empty?

          output_files.map { |file| File.mtime(file) }.max
        end

        # @return [Integer] number of parquet files in the segment
        def num_batches(segment)
          files_for_segment(segment).count
        end

        def files
          files_for_segment("all")
        end

        private

        # Enumerator yielding each batch after the caller's block and the
        # optional feature/target split have been applied.
        def wrap_with_block(base_enumerator, split_ys, target, &block)
          Enumerator.new do |yielder|
            base_enumerator.each do |df|
              df = block.call(df)
              result = process_dataframe(df, split_ys, target)
              yielder << result
            end
          end
        end

        # Enumerator yielding each batch as [features, targets].
        def wrap_with_split(base_enumerator, target)
          Enumerator.new do |yielder|
            base_enumerator.each do |df|
              result = process_dataframe(df, true, target)
              yielder << result
            end
          end
        end

        def process_dataframe(df, split_ys, target)
          return df unless split_ys

          df = df.collect if df.is_a?(Polars::LazyFrame)
          split_features_targets(df, split_ys, target)
        end

        # Swaps the 14-digit version component of `dir` for `version`.
        #
        # @return [Pathname, nil] nil when `dir` has no version component or
        #   already points at `version`
        def version_to_dir(version)
          version = version.to_s
          # String pattern => literal match; Rails.root is never read as a regexp.
          relative_path = dir.gsub(Rails.root.to_s, "")
          current_path = Pathname.new(relative_path)

          # Find the version component in the path
          path_parts = current_path.each_filename.to_a
          version_index = path_parts.find_index { |part| part.match?(version_pattern) }

          return unless version_index

          old_version = path_parts[version_index]
          return if old_version == version

          # Replace the version number with the new version
          path_parts[version_index] = version
          Rails.root.join(File.join(*path_parts))
        end

        # Exactly 14 digits, anchored to the whole string (not to a line).
        def version_pattern
          /\A\d{14}\z/
        end

        def is_version?(string)
          string.to_s.match?(version_pattern)
        end

        def df(path)
          filtered_args = filter_polars_args(Polars.method(:read_parquet))
          Polars.read_parquet(path, **filtered_args)
        end

        # Keeps only the polars_args whose keys name parameters of `method`.
        def filter_polars_args(method)
          supported_params = method.parameters.map { |_, name| name }
          polars_args.select { |k, _| supported_params.include?(k) }
        end

        def output_files
          Dir.glob("#{dir}/**/*.parquet")
        end

        # Sorted parquet paths of a segment; "all" concatenates train, test
        # and valid in that order.
        def files_for_segment(segment)
          if segment.to_s == "all"
            files_for_segment("train") + files_for_segment("test") + files_for_segment("valid")
          else
            segment_dir = File.join(dir, segment.to_s)
            Dir.glob(File.join(segment_dir, "**/*.parquet")).sort
          end
        end

        def current_file_for_segment(segment)
          segment_dir = File.join(dir, segment.to_s)
          File.join(segment_dir, "#{segment}.parquet")
        end

        # Next file name is numbered by how many parquet files already exist.
        def new_file_path_for_segment(segment)
          segment_dir = File.join(dir, segment.to_s)
          file_number = Dir.glob(File.join(segment_dir, "*.parquet")).count
          File.join(segment_dir, "#{segment}_%04d.parquet" % file_number)
        end

        def combine_dataframes(files)
          dfs = files.map { |file| df(file) }
          Polars.concat(dfs)
        end
      end
    end
  end
end
@@ -0,0 +1,54 @@
1
module EasyML
  module Data
    module Splits
      # Split that keeps each segment's DataFrame in a Hash instead of on disk.
      class InMemorySplit < Split
        attr_accessor :dataset

        # @param options [Hash] only :dataset is read
        def initialize(options = {})
          super
          @data = {}
          @dataset = options[:dataset]
        end

        # We don't backup in-memory splits to s3
        def download; end

        def upload; end

        # @return [Array] always empty; nothing is written to disk
        def files
          []
        end

        # Stores df under the given segment key, replacing any previous frame.
        def save(segment, df)
          @data[segment] = df
        end

        # Reads a stored segment, or every segment in
        # EasyML::Dataset::SPLIT_ORDER when segment is "all".
        #
        # filter, select, drop_cols, unique and limit are applied in that order;
        # select and limit were previously accepted but silently ignored.
        #
        # @return [Object, Array, nil] nil when nothing is stored for the
        #   segment; [features, targets] when split_ys is true
        def read(segment, split_ys: false, target: nil, drop_cols: [], filter: nil, limit: nil, select: nil,
                 unique: nil)
          return nil if @data.keys.none?

          df = if segment.to_s == "all"
                 Polars.concat(EasyML::Dataset::SPLIT_ORDER.filter_map { |name| @data[name] })
               else
                 @data[segment]
               end
          return nil if df.nil?

          df = df.filter(filter) if filter.present?
          df = df.select(select) if select.present?
          # Only drop columns that actually exist in the frame.
          drop_cols &= df.columns
          df = df.drop(drop_cols) unless drop_cols.empty?
          df = df.unique if unique
          df = df.head(limit) if limit

          split_features_targets(df, split_ys, target)
        end

        # Forgets every stored segment.
        def cleanup
          @data.clear
        end

        # @return [Time, nil] the current time when any segment is stored, else nil
        def split_at
          @data.keys.empty? ? nil : Time.now
        end
      end
    end
  end
end
@@ -0,0 +1,95 @@
1
module EasyML
  module Data
    module Splits
      # Abstract base for dataset splits. Subclasses implement #read, #save,
      # #cleanup and #split_at and expose a `dataset`.
      class Split
        include EasyML::Data::Utils

        VALID_SEGMENTS = %w[train test valid all].freeze

        def initialize(options = {})
        end

        # Reads a segment with the dataset's drop columns and target applied.
        #
        # @param segment [Symbol, String] segment to read
        # @param kwargs [Hash] options passed on to #read; :all_columns is
        #   consumed here and handed to `dataset.drop_columns`
        # @yield forwarded to #read (it used to be dropped silently)
        def load_data(segment, **kwargs, &block)
          all_columns = kwargs.delete(:all_columns) || false
          drop_cols = dataset.drop_columns(all_columns: all_columns)
          read_options = kwargs.merge(drop_cols: drop_cols, target: dataset.target)
          read(segment, **read_options, &block)
        end

        # @raise [ArgumentError] for an unknown segment
        # @raise [NotImplementedError] always, for a valid segment
        def save(segment, _df)
          validate_segment!(segment)
          raise NotImplementedError, "Subclasses must implement #save"
        end

        def data(**kwargs, &block)
          load_data(:all, **kwargs, &block)
        end

        def train(**kwargs, &block)
          load_data(:train, **kwargs, &block)
        end

        def test(**kwargs, &block)
          load_data(:test, **kwargs, &block)
        end

        def valid(**kwargs, &block)
          load_data(:valid, **kwargs, &block)
        end

        def cleanup
          raise NotImplementedError, "Subclasses must implement #cleanup"
        end

        def split_at
          raise NotImplementedError, "Subclasses must implement #split_at"
        end

        protected

        # @return [Object, Array] df itself, or [df without target, target
        #   column only] when split_ys is true
        # @raise [ArgumentError] when split_ys is true and target is nil
        def split_features_targets(df, split_ys, target)
          return df unless split_ys
          raise ArgumentError, "Target column must be specified when split_ys is true" if target.nil?

          xs = df.drop(target)
          ys = df.select(target)
          [xs, ys]
        end

        def validate_segment!(segment)
          segment = segment.to_s
          return if VALID_SEGMENTS.include?(segment)

          raise ArgumentError, "Invalid segment: #{segment}. Must be one of: #{VALID_SEGMENTS.join(", ")}"
        end

        def validate_read_options!(options)
          valid_options = %i[filter limit select unique sort descending batch_size batch_start batch_key]
          invalid_options = options.keys - valid_options
          return if invalid_options.empty?

          raise ArgumentError,
                "Invalid options: #{invalid_options.join(", ")}. Valid options are: #{valid_options.join(", ")}"
        end

        private

        # Arity 3: block is called with (result, xs, ys), or [xs, ys] is
        # returned when result is nil. Arity 2: block is called with (xs, ys)
        # and `result` is returned if the block's value is truthy.
        def process_block_with_split_ys(block, result, xs, ys)
          case block.arity
          when 3 then result.nil? ? [xs, ys] : block.call(result, xs, ys)
          when 2 then block.call(xs, ys) && result
          else raise ArgumentError, "Block must accept 2 or 3 arguments when split_ys is true"
          end
        end

        # Same contract as above for the unsplit case: arity 2 receives
        # (result, df), arity 1 receives (df).
        def process_block_without_split_ys(block, result, df)
          case block.arity
          when 2 then result.nil? ? df : block.call(result, df)
          when 1 then block.call(df) && result
          else raise ArgumentError, "Block must accept 1 or 2 arguments when split_ys is false"
          end
        end
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
+ module EasyML
2
+ module Data
3
+ module Splits
4
+ require_relative "splits/split"
5
+ require_relative "splits/file_split"
6
+ require_relative "splits/in_memory_split"
7
+ end
8
+ end
9
+ end