easy_ml 0.1.4 → 0.2.0.pre.rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (239) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +234 -26
  3. data/Rakefile +45 -0
  4. data/app/controllers/easy_ml/application_controller.rb +67 -0
  5. data/app/controllers/easy_ml/columns_controller.rb +38 -0
  6. data/app/controllers/easy_ml/datasets_controller.rb +156 -0
  7. data/app/controllers/easy_ml/datasources_controller.rb +88 -0
  8. data/app/controllers/easy_ml/deploys_controller.rb +20 -0
  9. data/app/controllers/easy_ml/models_controller.rb +151 -0
  10. data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
  11. data/app/controllers/easy_ml/settings_controller.rb +59 -0
  12. data/app/frontend/components/AlertProvider.tsx +108 -0
  13. data/app/frontend/components/DatasetPreview.tsx +161 -0
  14. data/app/frontend/components/EmptyState.tsx +28 -0
  15. data/app/frontend/components/ModelCard.tsx +255 -0
  16. data/app/frontend/components/ModelDetails.tsx +334 -0
  17. data/app/frontend/components/ModelForm.tsx +384 -0
  18. data/app/frontend/components/Navigation.tsx +300 -0
  19. data/app/frontend/components/Pagination.tsx +72 -0
  20. data/app/frontend/components/Popover.tsx +55 -0
  21. data/app/frontend/components/PredictionStream.tsx +105 -0
  22. data/app/frontend/components/ScheduleModal.tsx +726 -0
  23. data/app/frontend/components/SearchInput.tsx +23 -0
  24. data/app/frontend/components/SearchableSelect.tsx +132 -0
  25. data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
  26. data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
  27. data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
  28. data/app/frontend/components/dataset/ColumnList.tsx +101 -0
  29. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
  30. data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
  31. data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
  32. data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
  33. data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
  34. data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
  35. data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
  36. data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
  37. data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
  38. data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
  39. data/app/frontend/components/dataset/splitters/constants.ts +77 -0
  40. data/app/frontend/components/dataset/splitters/types.ts +168 -0
  41. data/app/frontend/components/dataset/splitters/utils.ts +53 -0
  42. data/app/frontend/components/features/CodeEditor.tsx +46 -0
  43. data/app/frontend/components/features/DataPreview.tsx +150 -0
  44. data/app/frontend/components/features/FeatureCard.tsx +88 -0
  45. data/app/frontend/components/features/FeatureForm.tsx +235 -0
  46. data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
  47. data/app/frontend/components/settings/PluginSettings.tsx +81 -0
  48. data/app/frontend/components/ui/badge.tsx +44 -0
  49. data/app/frontend/components/ui/collapsible.tsx +9 -0
  50. data/app/frontend/components/ui/scroll-area.tsx +46 -0
  51. data/app/frontend/components/ui/separator.tsx +29 -0
  52. data/app/frontend/entrypoints/App.tsx +40 -0
  53. data/app/frontend/entrypoints/Application.tsx +24 -0
  54. data/app/frontend/hooks/useAutosave.ts +61 -0
  55. data/app/frontend/layouts/Layout.tsx +38 -0
  56. data/app/frontend/lib/utils.ts +6 -0
  57. data/app/frontend/mockData.ts +272 -0
  58. data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
  59. data/app/frontend/pages/DatasetsPage.tsx +261 -0
  60. data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
  61. data/app/frontend/pages/DatasourcesPage.tsx +261 -0
  62. data/app/frontend/pages/EditModelPage.tsx +45 -0
  63. data/app/frontend/pages/EditTransformationPage.tsx +56 -0
  64. data/app/frontend/pages/ModelsPage.tsx +115 -0
  65. data/app/frontend/pages/NewDatasetPage.tsx +366 -0
  66. data/app/frontend/pages/NewModelPage.tsx +45 -0
  67. data/app/frontend/pages/NewTransformationPage.tsx +43 -0
  68. data/app/frontend/pages/SettingsPage.tsx +272 -0
  69. data/app/frontend/pages/ShowModelPage.tsx +30 -0
  70. data/app/frontend/pages/TransformationsPage.tsx +95 -0
  71. data/app/frontend/styles/application.css +100 -0
  72. data/app/frontend/types/dataset.ts +146 -0
  73. data/app/frontend/types/datasource.ts +33 -0
  74. data/app/frontend/types/preprocessing.ts +1 -0
  75. data/app/frontend/types.ts +113 -0
  76. data/app/helpers/easy_ml/application_helper.rb +10 -0
  77. data/app/jobs/easy_ml/application_job.rb +21 -0
  78. data/app/jobs/easy_ml/batch_job.rb +46 -0
  79. data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
  80. data/app/jobs/easy_ml/deploy_job.rb +13 -0
  81. data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
  82. data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
  83. data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
  84. data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
  85. data/app/jobs/easy_ml/training_job.rb +62 -0
  86. data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
  87. data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
  88. data/app/models/easy_ml/cleaner.rb +82 -0
  89. data/app/models/easy_ml/column.rb +124 -0
  90. data/app/models/easy_ml/column_history.rb +30 -0
  91. data/app/models/easy_ml/column_list.rb +122 -0
  92. data/app/models/easy_ml/concerns/configurable.rb +61 -0
  93. data/app/models/easy_ml/concerns/versionable.rb +19 -0
  94. data/app/models/easy_ml/dataset.rb +767 -0
  95. data/app/models/easy_ml/dataset_history.rb +56 -0
  96. data/app/models/easy_ml/datasource.rb +182 -0
  97. data/app/models/easy_ml/datasource_history.rb +24 -0
  98. data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
  99. data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
  100. data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
  101. data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
  102. data/app/models/easy_ml/deploy.rb +114 -0
  103. data/app/models/easy_ml/event.rb +79 -0
  104. data/app/models/easy_ml/feature.rb +437 -0
  105. data/app/models/easy_ml/feature_history.rb +38 -0
  106. data/app/models/easy_ml/model.rb +575 -41
  107. data/app/models/easy_ml/model_file.rb +133 -0
  108. data/app/models/easy_ml/model_file_history.rb +24 -0
  109. data/app/models/easy_ml/model_history.rb +51 -0
  110. data/app/models/easy_ml/models/base_model.rb +58 -0
  111. data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
  112. data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
  113. data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
  114. data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
  115. data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
  116. data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
  117. data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
  118. data/app/models/easy_ml/models/xgboost.rb +544 -5
  119. data/app/models/easy_ml/prediction.rb +44 -0
  120. data/app/models/easy_ml/retraining_job.rb +278 -0
  121. data/app/models/easy_ml/retraining_run.rb +184 -0
  122. data/app/models/easy_ml/settings.rb +37 -0
  123. data/app/models/easy_ml/splitter.rb +90 -0
  124. data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
  125. data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
  126. data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
  127. data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
  128. data/app/models/easy_ml/tuner_job.rb +56 -0
  129. data/app/models/easy_ml/tuner_run.rb +31 -0
  130. data/app/models/splitter_history.rb +6 -0
  131. data/app/serializers/easy_ml/column_serializer.rb +27 -0
  132. data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
  133. data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
  134. data/app/serializers/easy_ml/feature_serializer.rb +27 -0
  135. data/app/serializers/easy_ml/model_serializer.rb +90 -0
  136. data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
  137. data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
  138. data/app/serializers/easy_ml/settings_serializer.rb +9 -0
  139. data/app/views/layouts/easy_ml/application.html.erb +15 -0
  140. data/config/initializers/resque.rb +3 -0
  141. data/config/resque-pool.yml +6 -0
  142. data/config/routes.rb +39 -0
  143. data/config/spring.rb +1 -0
  144. data/config/vite.json +15 -0
  145. data/lib/easy_ml/configuration.rb +64 -0
  146. data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
  147. data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
  148. data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
  149. data/lib/easy_ml/core/model_evaluator.rb +161 -89
  150. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
  151. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
  152. data/lib/easy_ml/core/tuner.rb +123 -62
  153. data/lib/easy_ml/core.rb +0 -3
  154. data/lib/easy_ml/core_ext/hash.rb +24 -0
  155. data/lib/easy_ml/core_ext/pathname.rb +11 -5
  156. data/lib/easy_ml/data/date_converter.rb +90 -0
  157. data/lib/easy_ml/data/filter_extensions.rb +31 -0
  158. data/lib/easy_ml/data/polars_column.rb +126 -0
  159. data/lib/easy_ml/data/polars_reader.rb +297 -0
  160. data/lib/easy_ml/data/preprocessor.rb +280 -142
  161. data/lib/easy_ml/data/simple_imputer.rb +255 -0
  162. data/lib/easy_ml/data/splits/file_split.rb +252 -0
  163. data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
  164. data/lib/easy_ml/data/splits/split.rb +95 -0
  165. data/lib/easy_ml/data/splits.rb +9 -0
  166. data/lib/easy_ml/data/statistics_learner.rb +93 -0
  167. data/lib/easy_ml/data/synced_directory.rb +341 -0
  168. data/lib/easy_ml/data.rb +6 -2
  169. data/lib/easy_ml/engine.rb +105 -6
  170. data/lib/easy_ml/feature_store.rb +227 -0
  171. data/lib/easy_ml/features.rb +61 -0
  172. data/lib/easy_ml/initializers/inflections.rb +17 -3
  173. data/lib/easy_ml/logging.rb +2 -2
  174. data/lib/easy_ml/predict.rb +74 -0
  175. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
  176. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
  177. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
  178. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
  179. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
  180. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
  181. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
  182. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
  183. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
  184. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
  185. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
  186. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
  187. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
  188. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
  189. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
  190. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
  191. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
  192. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
  193. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
  194. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
  195. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
  196. data/lib/easy_ml/support/est.rb +5 -1
  197. data/lib/easy_ml/support/file_rotate.rb +79 -15
  198. data/lib/easy_ml/support/file_support.rb +9 -0
  199. data/lib/easy_ml/support/local_file.rb +24 -0
  200. data/lib/easy_ml/support/lockable.rb +62 -0
  201. data/lib/easy_ml/support/synced_file.rb +103 -0
  202. data/lib/easy_ml/support/utc.rb +5 -1
  203. data/lib/easy_ml/support.rb +6 -3
  204. data/lib/easy_ml/version.rb +4 -1
  205. data/lib/easy_ml.rb +7 -2
  206. metadata +355 -72
  207. data/app/models/easy_ml/models.rb +0 -5
  208. data/lib/easy_ml/core/model.rb +0 -30
  209. data/lib/easy_ml/core/model_core.rb +0 -181
  210. data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
  211. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
  212. data/lib/easy_ml/core/models/xgboost.rb +0 -10
  213. data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
  214. data/lib/easy_ml/core/models.rb +0 -10
  215. data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
  216. data/lib/easy_ml/core/uploaders.rb +0 -7
  217. data/lib/easy_ml/data/dataloader.rb +0 -6
  218. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
  219. data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
  220. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
  221. data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
  222. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
  223. data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
  224. data/lib/easy_ml/data/dataset/splits.rb +0 -11
  225. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
  226. data/lib/easy_ml/data/dataset/splitters.rb +0 -9
  227. data/lib/easy_ml/data/dataset.rb +0 -430
  228. data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
  229. data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
  230. data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
  231. data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
  232. data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
  233. data/lib/easy_ml/data/datasource.rb +0 -33
  234. data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
  235. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
  236. data/lib/easy_ml/deployment.rb +0 -5
  237. data/lib/easy_ml/support/synced_directory.rb +0 -134
  238. data/lib/easy_ml/transforms.rb +0 -29
  239. /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -1,402 +0,0 @@
1
- require "active_support/core_ext/hash/deep_transform_values"
2
- require "numo/narray"
3
- require "json"
4
-
5
module EasyML
  module Data
    class Preprocessor
      # Imputes (fills in or normalises) the values of one column using a
      # single named strategy, and persists what it learned to
      # "<path>/statistics.json" so that a later instance can reuse it.
      #
      # In-memory statistics come in two shapes: +load+ stores them keyed by
      # strategy (+{ mean: { value: 1.0 } }+) while +fit+ nests them under the
      # attribute first (+{ age: { mean: { value: 1.0 } } }+). Every reader in
      # this class goes through +statistics_for+, which understands both, so a
      # freshly fitted instance can transform without being re-created.
      class SimpleImputer
        # Strategies whose transform needs nothing learned by #fit.
        STATELESS_STRATEGIES = %i[clip today custom].freeze
        # Minimum occurrence count for a category to be kept as-is.
        DEFAULT_CATEGORICAL_MIN = 25
        # Bounds used by :clip when the options provide none.
        DEFAULT_CLIP_MIN = 0
        DEFAULT_CLIP_MAX = 1_000_000_000_000
        # Replacement label for nil or infrequent categories.
        OTHER_CATEGORY = "other"
        # Matches Polars dtype *classes* (as opposed to dtype instances).
        # +parse_polars_dtype+ produces classes, so the serializer has to
        # recognise them too for a save/load round trip to be symmetric.
        POLARS_DTYPE_CLASS = ->(value) { value.is_a?(Class) && value <= Polars::DataType }
        private_constant :STATELESS_STRATEGIES, :DEFAULT_CATEGORICAL_MIN, :DEFAULT_CLIP_MIN,
                         :DEFAULT_CLIP_MAX, :OTHER_CATEGORY, :POLARS_DTYPE_CLASS

        attr_reader :statistics
        attr_accessor :path, :attribute, :strategy, :options

        # @param strategy [String, Symbol] imputation strategy (see #fit)
        # @param path [String, nil] directory holding statistics.json
        # @param attribute [String, Symbol, nil] name of the column
        # @param options [Hash, nil] strategy options; copied, never mutated
        # @raise [RuntimeError] when path or attribute is blank (via #file_path)
        def initialize(strategy: "mean", path: nil, attribute: nil, options: {}, &block)
          @strategy = strategy.to_sym
          @path = path
          @attribute = attribute
          # Copy first: apply_defaults writes into the hash and must not leak
          # defaults into a configuration hash owned by the caller.
          @options = (options || {}).dup
          apply_defaults
          load
          @statistics ||= {}
          deep_symbolize_keys!
          instance_eval(&block) if block
        end

        # Normalises every key of the in-memory statistics to a Symbol.
        def deep_symbolize_keys!
          @statistics = @statistics.deep_symbolize_keys
        end

        # Fills in the option defaults that depend on the chosen strategy.
        def apply_defaults
          @options[:date_column] ||= "CREATED_DATE"

          case strategy
          when :categorical
            @options[:categorical_min] ||= DEFAULT_CATEGORICAL_MIN
          when :custom
            identity = ->(col) { col }
            @options[:fit] ||= identity
            @options[:transform] ||= identity
          end
        end

        # Learns the statistics for the configured strategy and saves them.
        #
        # @param x [Polars::Series] the column to learn from
        # @param df [Polars::DataFrame, nil] full frame; required by :ffill
        # @return [self]
        # @raise [ArgumentError] for a non-Series input or unknown strategy
        def fit(x, df = nil)
          x = validate_input(x)

          learned = learn(x, df)
          # fit_custom returns the series itself and fit_ffill may return
          # nil; only a Hash can carry statistics.
          learned = {} unless learned.is_a?(Hash)

          key = attribute_key
          @statistics[key] ||= {}
          @statistics[key][@strategy] = learned.merge(original_dtype: x.dtype)
          save
          self
        end

        # Applies the strategy to +x+.
        #
        # @param x [Polars::Series, #map] a Polars series or any mappable
        # @raise [RuntimeError] when the strategy needs a prior #fit
        def transform(x)
          check_is_fitted

          if x.is_a?(Polars::Series)
            transform_polars(x)
          else
            transform_dense(x)
          end
        end

        # Applies the strategy to a Polars series and casts the result back to
        # the dtype recorded at fit time (when one was recorded).
        #
        # @raise [ArgumentError] for strategies without a Polars implementation
        def transform_polars(x)
          result = impute_series(x)

          original_dtype = (statistics_for(@strategy) || {})[:original_dtype]
          original_dtype ? result.cast(original_dtype) : result
        end

        # @return [String] location of the shared statistics file
        # @raise [RuntimeError] when attribute or path is blank
        def file_path
          raise "Need both attribute and path to save/load statistics" unless attribute.present? && path.to_s.present?

          File.join(path, "statistics.json")
        end

        # Forgets the in-memory statistics and deletes the statistics file.
        # The file is the one #save merges every attribute into, so this
        # removes the persisted statistics of all attributes stored there.
        def cleanup
          @statistics = {}
          FileUtils.rm(file_path) if File.exist?(file_path)
        end

        # Merges this attribute/strategy entry into the statistics file,
        # leaving the entries of other attributes and strategies untouched.
        def save
          target = file_path
          FileUtils.mkdir_p(File.dirname(target))

          all_statistics = File.exist?(target) ? JSON.parse(File.read(target)).deep_symbolize_keys : {}

          deep_symbolize_keys!

          # The parsed file is symbol-keyed, so the attribute must be looked
          # up as a Symbol as well; a String key would sit next to the
          # existing Symbol key and clobber it when the JSON is read back.
          key = attribute.to_sym
          all_statistics[key] ||= {}
          all_statistics[key][@strategy.to_sym] = serialize_statistics(statistics_for(@strategy) || {})

          File.write(target, JSON.pretty_generate(all_statistics))
        end

        # Loads this attribute's statistics (keyed by strategy) from the
        # statistics file, if the file and the attribute entry exist.
        def load
          return unless File.exist?(file_path)

          # JSON keys are always Strings, so convert before the lookup.
          attribute_stats = JSON.parse(File.read(file_path))[@attribute.to_s]
          return unless attribute_stats

          @statistics = deserialize_statistics(attribute_stats)
          deep_symbolize_keys!
        end

        # @return [Boolean] true when +val+ is not one of the kept categories
        def should_transform_categorical?(val)
          !allowed_categories.include?(val)
        end

        # @return [String] the category as a String, or "other" when it is
        #   nil or occurs fewer than the configured minimum number of times
        def transform_categorical(val)
          return OTHER_CATEGORY if val.nil?

          category = val.to_s
          allowed_categories.include?(category) ? category : OTHER_CATEGORY
        end

        def transform_today(_val)
          EST.now.beginning_of_day
        end

        # Returns its input unchanged; #fit records only the dtype for :custom.
        def fit_custom(x)
          x
        end

        # Uses the :should_transform callable when given, otherwise falls back
        # to the nil/NaN check.
        def should_transform_custom?(x)
          if options.key?(:should_transform)
            options[:should_transform].call(x)
          else
            should_transform_default?(x)
          end
        end

        # @raise [RuntimeError] when no :transform callable is configured
        def transform_custom(x)
          raise "Transform required" unless options.key?(:transform)

          options[:transform].call(x)
        end

        private

        def validate_input(x)
          raise ArgumentError, "Input must be a Polars::Series" unless x.is_a?(Polars::Series)

          x
        end

        # Dispatches to the fitter of the configured strategy.
        def learn(x, df)
          case @strategy
          when :mean then fit_mean(x)
          when :median then fit_median(x)
          when :ffill then fit_ffill(x, df)
          when :most_frequent then fit_most_frequent(x)
          when :categorical then fit_categorical(x)
          when :constant then fit_constant(x)
          when :clip, :today, :one_hot then fit_no_op(x)
          when :custom then fit_custom(x)
          else raise ArgumentError, "Invalid strategy: #{@strategy}"
          end
        end

        # Strategy-specific part of #transform_polars (no dtype cast).
        def impute_series(x)
          case @strategy
          when :mean, :median, :ffill, :most_frequent, :constant
            fill_value = fitted_statistics[:value]
            # Nothing was learned (e.g. an all-null column): leave as-is.
            fill_value.nil? ? x : x.fill_null(fill_value)
          when :clip
            clip_series(x)
          when :categorical
            categorize_series(x)
          when :today
            x.fill_null(transform_today(nil))
          when :custom
            customize_series(x)
          else
            raise ArgumentError, "Unsupported strategy for Polars::Series: #{@strategy}"
          end
        end

        # Clips to the configured bounds; string keys keep precedence for
        # backward compatibility, symbol keys are accepted as well.
        def clip_series(x)
          return x if x.null_count == x.len

          min = options["min"] || options[:min] || DEFAULT_CLIP_MIN
          max = options["max"] || options[:max] || DEFAULT_CLIP_MAX
          x.clip(min, max)
        end

        def categorize_series(x)
          return x.fill_null(transform_categorical(nil)) if x.null_count == x.len

          # Computed once per series rather than once per value.
          allowed = allowed_categories
          x.apply do |val|
            if allowed.include?(val)
              val
            elsif val.nil?
              OTHER_CATEGORY
            else
              category = val.to_s
              allowed.include?(category) ? category : OTHER_CATEGORY
            end
          end
        end

        def customize_series(x)
          return x.fill_null(transform_custom(nil)) if x.null_count == x.len

          x.apply do |val|
            should_transform_custom?(val) ? transform_custom(val) : val
          end
        end

        # Attribute name as used for keys of the in-memory statistics.
        def attribute_key
          attribute.respond_to?(:to_sym) ? attribute.to_sym : attribute
        end

        # Statistics of one strategy, whichever shape @statistics is in: the
        # attribute-nested shape written by #fit is checked first, then the
        # strategy-keyed shape produced by #load.
        def statistics_for(name)
          @statistics.dig(attribute_key, name) || @statistics[name]
        end

        # Like statistics_for(strategy) but raises when nothing was fitted.
        def fitted_statistics
          statistics_for(strategy) ||
            raise("SimpleImputer has not been fitted yet for #{attribute}##{strategy}")
        end

        # Category names (as Strings) seen at least :categorical_min times.
        def allowed_categories
          counts = (statistics_for(:categorical) || {})[:value] || {}
          minimum = options[:categorical_min] || DEFAULT_CATEGORICAL_MIN

          counts.select { |_category, count| count >= minimum }.keys.map(&:to_s)
        end

        def fit_mean(x)
          { value: x.mean }
        end

        def fit_median(x)
          { value: x.median }
        end

        # Learns the last non-null value of +x+ when +df+ is ordered by the
        # configured date column, together with the latest date seen.
        # Returns nil when the date column holds no values at all.
        def fit_ffill(x, df)
          raise ArgumentError, "The ffill strategy requires the full DataFrame" if df.nil?

          date_column = options[:date_column]
          return if df[date_column].is_null.all

          sorted_df = df.sort(date_column)
          column = sorted_df[x.name]

          {
            value: column.filter(column.is_not_null).tail(1).to_a.first,
            max_date: sorted_df[date_column].max
          }
        end

        def fit_most_frequent(x)
          # An all-null column has no value counts to pick a winner from.
          return { value: nil } if x.null_count == x.len

          value_counts = x.filter(x.is_not_null).value_counts
          count_column = value_counts.columns[1]

          { value: value_counts.sort(count_column, descending: true).row(0)[0] }
        end

        # Learns the occurrence count of every category plus a label
        # encoder/decoder pair numbering the categories in sorted order.
        def fit_categorical(x)
          value_counts = x.value_counts
          value_column, count_column = value_counts.columns

          as_hash = value_counts.select([value_column, count_column]).rows.to_a.to_h.transform_keys(&:to_s)
          label_encoder = as_hash.keys.sort.each_with_index.to_h

          {
            value: as_hash,
            label_encoder: label_encoder,
            label_decoder: label_encoder.invert
          }
        end

        def fit_no_op(_x)
          {}
        end

        def fit_constant(_x)
          { value: @options[:fill_value] }
        end

        def transform_default(_val)
          fitted_statistics[:value]
        end

        # True for NaN (when the value can be NaN) and for nil.
        def should_transform_default?(val)
          val.respond_to?(:nan?) ? val.nan? : val.nil?
        end

        # Element-wise variant of the transform for non-Polars collections.
        # Only public transform_<strategy> / should_transform_<strategy>?
        # methods are picked up (respond_to? ignores private ones); every
        # other strategy uses the default pair.
        def transform_dense(x)
          transformer = respond_to?("transform_#{strategy}") ? "transform_#{strategy}" : "transform_default"
          checker = respond_to?("should_transform_#{strategy}?") ? "should_transform_#{strategy}?" : "should_transform_default?"

          result = x.map do |val|
            send(checker, val) ? send(transformer, val) : val
          end

          # The dtype is recorded per strategy by #fit, not at the top level.
          original_dtype = (statistics_for(strategy) || {})[:original_dtype]
          return result unless original_dtype

          result.map { |val| cast_to_dtype(val, original_dtype) }
        end

        def check_is_fitted
          return if STATELESS_STRATEGIES.include?(strategy)

          fitted_statistics
          nil
        end

        # Replaces values JSON cannot represent with tagged hashes
        # ({"__type__" => ..., "value" => ...}) that #load can restore.
        def serialize_statistics(stats)
          stats.deep_transform_values do |value|
            case value
            when Time, DateTime
              { "__type__" => "datetime", "value" => value.iso8601 }
            when Date
              { "__type__" => "date", "value" => value.iso8601 }
            when BigDecimal
              { "__type__" => "bigdecimal", "value" => value.to_s }
            when Polars::DataType, POLARS_DTYPE_CLASS
              { "__type__" => "polars_dtype", "value" => value.to_s }
            when Symbol
              { "__type__" => "symbol", "value" => value.to_s }
            else
              value
            end
          end
        end

        def deserialize_statistics(stats)
          stats.transform_values { |value| recursive_deserialize(value) }
        end

        # Walks hashes and arrays, restoring every tagged hash it finds.
        def recursive_deserialize(value)
          case value
          when Hash
            if value["__type__"]
              deserialize_special_type(value)
            else
              value.transform_values { |nested| recursive_deserialize(nested) }
            end
          when Array
            value.map { |nested| recursive_deserialize(nested) }
          else
            value
          end
        end

        def deserialize_special_type(value)
          case value["__type__"]
          when "datetime"
            DateTime.parse(value["value"])
          when "date"
            Date.parse(value["value"])
          when "bigdecimal"
            BigDecimal(value["value"])
          when "polars_dtype"
            parse_polars_dtype(value["value"])
          when "symbol"
            value["value"].to_sym
          else
            value["value"]
          end
        end

        # Maps a serialized dtype name back to the Polars dtype class.
        #
        # @raise [ArgumentError] when the name is not a known Polars dtype
        def parse_polars_dtype(dtype_string)
          case dtype_string
          when /\APolars::Datetime/
            # Only the class is restored; time unit and time zone encoded in
            # the string were never carried over by this method.
            Polars::Datetime
          when /\APolars::/
            begin
              Polars.const_get(dtype_string.split("::").last)
            rescue NameError
              raise ArgumentError, "Unknown Polars data type: #{dtype_string}"
            end
          else
            raise ArgumentError, "Unknown Polars data type: #{dtype_string}"
          end
        end

        # Coerces a single Ruby value to the Ruby type matching +dtype+.
        # nil and NaN are passed through: they have no integer/string form
        # worth producing (NaN#to_i would raise).
        def cast_to_dtype(value, dtype)
          return value if value.nil? || (value.respond_to?(:nan?) && value.nan?)

          if dtype_is?(dtype, Polars::Int64)
            value.to_i
          elsif dtype_is?(dtype, Polars::Float64)
            value.to_f
          elsif dtype_is?(dtype, Polars::Boolean)
            !!value
          elsif dtype_is?(dtype, Polars::Utf8)
            value.to_s
          else
            value
          end
        end

        # A recorded dtype may be the dtype class itself or an instance of it.
        def dtype_is?(dtype, klass)
          dtype == klass || dtype.is_a?(klass)
        end
      end
    end
  end
end
@@ -1,5 +0,0 @@
1
module EasyML
  # Namespace for deployment code. Opening the namespace before the require
  # means the constant EasyML::Deployment already exists while
  # deployment/model_uploader is being loaded.
  module Deployment
    require_relative "deployment/model_uploader"
  end
end
@@ -1,134 +0,0 @@
1
- require "glue_gun"
2
-
3
module EasyML
  module Support
    # Mirrors the objects stored under an S3 prefix into a local directory
    # and reports whether the local copy is still up to date.
    #
    # Downloaded objects are written to root_dir/<object key>; objects whose
    # key ends in ".gz" are decompressed in place and the archive is removed.
    class SyncedDirectory
      include GlueGun::DSL

      attribute :root_dir, :string
      attribute :s3_bucket, :string
      attribute :s3_prefix, :string
      attribute :s3_access_key_id, :string
      attribute :s3_secret_access_key, :string

      # Replaces the local directory with a fresh download when it is stale.
      #
      # @return [Boolean] false when already in sync, true after a download
      def sync
        return false if synced?

        # Wipe first, then recreate, so root_dir exists even when the
        # prefix turns out to be empty.
        clean_dir!
        mk_dir
        download
        # Drop the memoized "stale" answer; the next synced? call re-checks.
        @synced = nil
        true
      end

      # @return [Array<String>] local CSV files directly under the prefix
      def files
        Dir.glob(File.join(root_dir, s3_prefix, "*.csv"))
      end

      def age(format: "human")
        Age.age(last_updated_at, EST.now, format: format)
      end

      def stale?
        !synced?
      end

      # Memoized; the answer is computed once per instance until #sync
      # performs a download.
      def synced?
        return @synced unless @synced.nil?

        @synced = calculate_synced
      end

      # @return [Time, nil] newest mtime among #files, nil without files
      def last_updated_at
        local_files = files
        return nil if local_files.empty?

        local_files.map { |file| File.mtime(file) }.max.in_time_zone(EST)
      end

      private

      def mk_dir
        FileUtils.mkdir_p(root_dir)
      end

      def clean_dir!
        FileUtils.rm_rf(root_dir)
      end

      def s3
        @s3 ||= begin
          credentials = Aws::Credentials.new(s3_access_key_id, s3_secret_access_key)
          Aws::S3::Client.new(credentials: credentials)
        end
      end

      # Yields every object under the prefix, following pagination so that
      # listings longer than a single response page are not truncated.
      # Keys ending in "/" (folder placeholders) are skipped.
      def each_s3_object
        s3.list_objects_v2(bucket: s3_bucket, prefix: s3_prefix).each_page do |page|
          page.contents.each do |object|
            next if object.key.end_with?("/")

            yield object
          end
        end
      end

      def download
        each_s3_object do |object|
          local_path = File.join(root_dir, object.key)
          FileUtils.mkdir_p(File.dirname(local_path))

          s3.get_object(
            response_target: local_path,
            bucket: s3_bucket,
            key: object.key
          )

          puts "Downloaded #{object.key} to #{local_path}"

          final_path = ungzip_file(local_path)
          puts "Ungzipped to #{final_path}" unless final_path == local_path
        end
      end

      # Decompresses a ".gz" file next to itself and deletes the archive.
      # Paths without a ".gz" suffix are returned untouched: source and
      # target would be the same file, which would truncate and then delete
      # the download.
      #
      # @return [String] path of the usable (decompressed) file
      def ungzip_file(gzipped_file_path)
        ungzipped_file_path = gzipped_file_path.sub(/\.gz\z/, "")
        return gzipped_file_path if ungzipped_file_path == gzipped_file_path

        Zlib::GzipReader.open(gzipped_file_path) do |gz|
          # Stream instead of reading the whole archive into memory.
          File.open(ungzipped_file_path, "wb") { |file| IO.copy_stream(gz, file) }
        end

        File.delete(gzipped_file_path)
        ungzipped_file_path
      end

      def expand_dir(dir)
        return dir if dir.to_s.start_with?("/")

        Rails.root.join(dir)
      end

      def new_data_available?
        return true if files.empty?

        local_latest = last_updated_at
        s3_latest = s3_last_updated_at

        return false if s3_latest.nil?

        s3_latest > local_latest
      end

      def calculate_synced
        return false if age.nil?

        !new_data_available?
      end

      # @return [Time, nil] newest last_modified under the prefix, or nil
      #   when the prefix holds no objects
      def s3_last_updated_at
        s3_latest = nil

        each_s3_object do |object|
          s3_latest = [s3_latest, object.last_modified].compact.max
        end

        s3_latest&.in_time_zone(EST)
      end
    end
  end
end
@@ -1,29 +0,0 @@
1
module EasyML
  # Mixin that lets a class declare an ordered list of transform methods and
  # run a value through all of them.
  #
  #   class Cleaner
  #     include EasyML::Transforms
  #     transform :drop_blanks
  #     transform :rename_columns
  #   end
  #   Cleaner.apply_transforms(df)
  #
  # The list is stored on the including class itself, so each class keeps
  # its own list.
  module Transforms
    def self.included(base)
      base.extend(ClassMethods)
    end

    module ClassMethods
      # @return [Array<Symbol>] transform method names in declaration order
      def transforms
        @transforms ||= []
      end

      # Registers +method_name+ to run after the transforms declared so far.
      def transform(method_name)
        transforms << method_name
      end

      # Runs +df+ through every transform on a fresh instance.
      def apply_transforms(df)
        new.apply_transforms(df)
      end
    end

    # @return [Boolean] true when +list1+ has an element absent from +list2+
    def missing_any?(list1, list2)
      # empty? rather than any?: any? ignores nil/false elements.
      !(list1 - list2).empty?
    end

    # Feeds +df+ through each registered transform in order, handing the
    # result of one to the next. Transforms are invoked with send, so they
    # may be private methods.
    #
    # @return [Object] the result of the last transform (+df+ if none)
    def apply_transforms(df)
      self.class.transforms.reduce(df) do |current, transform_method|
        send(transform_method, current)
      end
    end
  end
end