easy_ml 0.1.4 → 0.2.0.pre.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (239) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +234 -26
  3. data/Rakefile +45 -0
  4. data/app/controllers/easy_ml/application_controller.rb +67 -0
  5. data/app/controllers/easy_ml/columns_controller.rb +38 -0
  6. data/app/controllers/easy_ml/datasets_controller.rb +156 -0
  7. data/app/controllers/easy_ml/datasources_controller.rb +88 -0
  8. data/app/controllers/easy_ml/deploys_controller.rb +20 -0
  9. data/app/controllers/easy_ml/models_controller.rb +151 -0
  10. data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
  11. data/app/controllers/easy_ml/settings_controller.rb +59 -0
  12. data/app/frontend/components/AlertProvider.tsx +108 -0
  13. data/app/frontend/components/DatasetPreview.tsx +161 -0
  14. data/app/frontend/components/EmptyState.tsx +28 -0
  15. data/app/frontend/components/ModelCard.tsx +255 -0
  16. data/app/frontend/components/ModelDetails.tsx +334 -0
  17. data/app/frontend/components/ModelForm.tsx +384 -0
  18. data/app/frontend/components/Navigation.tsx +300 -0
  19. data/app/frontend/components/Pagination.tsx +72 -0
  20. data/app/frontend/components/Popover.tsx +55 -0
  21. data/app/frontend/components/PredictionStream.tsx +105 -0
  22. data/app/frontend/components/ScheduleModal.tsx +726 -0
  23. data/app/frontend/components/SearchInput.tsx +23 -0
  24. data/app/frontend/components/SearchableSelect.tsx +132 -0
  25. data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
  26. data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
  27. data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
  28. data/app/frontend/components/dataset/ColumnList.tsx +101 -0
  29. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
  30. data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
  31. data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
  32. data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
  33. data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
  34. data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
  35. data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
  36. data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
  37. data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
  38. data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
  39. data/app/frontend/components/dataset/splitters/constants.ts +77 -0
  40. data/app/frontend/components/dataset/splitters/types.ts +168 -0
  41. data/app/frontend/components/dataset/splitters/utils.ts +53 -0
  42. data/app/frontend/components/features/CodeEditor.tsx +46 -0
  43. data/app/frontend/components/features/DataPreview.tsx +150 -0
  44. data/app/frontend/components/features/FeatureCard.tsx +88 -0
  45. data/app/frontend/components/features/FeatureForm.tsx +235 -0
  46. data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
  47. data/app/frontend/components/settings/PluginSettings.tsx +81 -0
  48. data/app/frontend/components/ui/badge.tsx +44 -0
  49. data/app/frontend/components/ui/collapsible.tsx +9 -0
  50. data/app/frontend/components/ui/scroll-area.tsx +46 -0
  51. data/app/frontend/components/ui/separator.tsx +29 -0
  52. data/app/frontend/entrypoints/App.tsx +40 -0
  53. data/app/frontend/entrypoints/Application.tsx +24 -0
  54. data/app/frontend/hooks/useAutosave.ts +61 -0
  55. data/app/frontend/layouts/Layout.tsx +38 -0
  56. data/app/frontend/lib/utils.ts +6 -0
  57. data/app/frontend/mockData.ts +272 -0
  58. data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
  59. data/app/frontend/pages/DatasetsPage.tsx +261 -0
  60. data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
  61. data/app/frontend/pages/DatasourcesPage.tsx +261 -0
  62. data/app/frontend/pages/EditModelPage.tsx +45 -0
  63. data/app/frontend/pages/EditTransformationPage.tsx +56 -0
  64. data/app/frontend/pages/ModelsPage.tsx +115 -0
  65. data/app/frontend/pages/NewDatasetPage.tsx +366 -0
  66. data/app/frontend/pages/NewModelPage.tsx +45 -0
  67. data/app/frontend/pages/NewTransformationPage.tsx +43 -0
  68. data/app/frontend/pages/SettingsPage.tsx +272 -0
  69. data/app/frontend/pages/ShowModelPage.tsx +30 -0
  70. data/app/frontend/pages/TransformationsPage.tsx +95 -0
  71. data/app/frontend/styles/application.css +100 -0
  72. data/app/frontend/types/dataset.ts +146 -0
  73. data/app/frontend/types/datasource.ts +33 -0
  74. data/app/frontend/types/preprocessing.ts +1 -0
  75. data/app/frontend/types.ts +113 -0
  76. data/app/helpers/easy_ml/application_helper.rb +10 -0
  77. data/app/jobs/easy_ml/application_job.rb +21 -0
  78. data/app/jobs/easy_ml/batch_job.rb +46 -0
  79. data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
  80. data/app/jobs/easy_ml/deploy_job.rb +13 -0
  81. data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
  82. data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
  83. data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
  84. data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
  85. data/app/jobs/easy_ml/training_job.rb +62 -0
  86. data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
  87. data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
  88. data/app/models/easy_ml/cleaner.rb +82 -0
  89. data/app/models/easy_ml/column.rb +124 -0
  90. data/app/models/easy_ml/column_history.rb +30 -0
  91. data/app/models/easy_ml/column_list.rb +122 -0
  92. data/app/models/easy_ml/concerns/configurable.rb +61 -0
  93. data/app/models/easy_ml/concerns/versionable.rb +19 -0
  94. data/app/models/easy_ml/dataset.rb +767 -0
  95. data/app/models/easy_ml/dataset_history.rb +56 -0
  96. data/app/models/easy_ml/datasource.rb +182 -0
  97. data/app/models/easy_ml/datasource_history.rb +24 -0
  98. data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
  99. data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
  100. data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
  101. data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
  102. data/app/models/easy_ml/deploy.rb +114 -0
  103. data/app/models/easy_ml/event.rb +79 -0
  104. data/app/models/easy_ml/feature.rb +437 -0
  105. data/app/models/easy_ml/feature_history.rb +38 -0
  106. data/app/models/easy_ml/model.rb +575 -41
  107. data/app/models/easy_ml/model_file.rb +133 -0
  108. data/app/models/easy_ml/model_file_history.rb +24 -0
  109. data/app/models/easy_ml/model_history.rb +51 -0
  110. data/app/models/easy_ml/models/base_model.rb +58 -0
  111. data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
  112. data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
  113. data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
  114. data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
  115. data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
  116. data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
  117. data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
  118. data/app/models/easy_ml/models/xgboost.rb +544 -5
  119. data/app/models/easy_ml/prediction.rb +44 -0
  120. data/app/models/easy_ml/retraining_job.rb +278 -0
  121. data/app/models/easy_ml/retraining_run.rb +184 -0
  122. data/app/models/easy_ml/settings.rb +37 -0
  123. data/app/models/easy_ml/splitter.rb +90 -0
  124. data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
  125. data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
  126. data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
  127. data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
  128. data/app/models/easy_ml/tuner_job.rb +56 -0
  129. data/app/models/easy_ml/tuner_run.rb +31 -0
  130. data/app/models/splitter_history.rb +6 -0
  131. data/app/serializers/easy_ml/column_serializer.rb +27 -0
  132. data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
  133. data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
  134. data/app/serializers/easy_ml/feature_serializer.rb +27 -0
  135. data/app/serializers/easy_ml/model_serializer.rb +90 -0
  136. data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
  137. data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
  138. data/app/serializers/easy_ml/settings_serializer.rb +9 -0
  139. data/app/views/layouts/easy_ml/application.html.erb +15 -0
  140. data/config/initializers/resque.rb +3 -0
  141. data/config/resque-pool.yml +6 -0
  142. data/config/routes.rb +39 -0
  143. data/config/spring.rb +1 -0
  144. data/config/vite.json +15 -0
  145. data/lib/easy_ml/configuration.rb +64 -0
  146. data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
  147. data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
  148. data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
  149. data/lib/easy_ml/core/model_evaluator.rb +161 -89
  150. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
  151. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
  152. data/lib/easy_ml/core/tuner.rb +123 -62
  153. data/lib/easy_ml/core.rb +0 -3
  154. data/lib/easy_ml/core_ext/hash.rb +24 -0
  155. data/lib/easy_ml/core_ext/pathname.rb +11 -5
  156. data/lib/easy_ml/data/date_converter.rb +90 -0
  157. data/lib/easy_ml/data/filter_extensions.rb +31 -0
  158. data/lib/easy_ml/data/polars_column.rb +126 -0
  159. data/lib/easy_ml/data/polars_reader.rb +297 -0
  160. data/lib/easy_ml/data/preprocessor.rb +280 -142
  161. data/lib/easy_ml/data/simple_imputer.rb +255 -0
  162. data/lib/easy_ml/data/splits/file_split.rb +252 -0
  163. data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
  164. data/lib/easy_ml/data/splits/split.rb +95 -0
  165. data/lib/easy_ml/data/splits.rb +9 -0
  166. data/lib/easy_ml/data/statistics_learner.rb +93 -0
  167. data/lib/easy_ml/data/synced_directory.rb +341 -0
  168. data/lib/easy_ml/data.rb +6 -2
  169. data/lib/easy_ml/engine.rb +105 -6
  170. data/lib/easy_ml/feature_store.rb +227 -0
  171. data/lib/easy_ml/features.rb +61 -0
  172. data/lib/easy_ml/initializers/inflections.rb +17 -3
  173. data/lib/easy_ml/logging.rb +2 -2
  174. data/lib/easy_ml/predict.rb +74 -0
  175. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
  176. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
  177. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
  178. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
  179. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
  180. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
  181. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
  182. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
  183. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
  184. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
  185. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
  186. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
  187. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
  188. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
  189. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
  190. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
  191. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
  192. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
  193. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
  194. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
  195. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
  196. data/lib/easy_ml/support/est.rb +5 -1
  197. data/lib/easy_ml/support/file_rotate.rb +79 -15
  198. data/lib/easy_ml/support/file_support.rb +9 -0
  199. data/lib/easy_ml/support/local_file.rb +24 -0
  200. data/lib/easy_ml/support/lockable.rb +62 -0
  201. data/lib/easy_ml/support/synced_file.rb +103 -0
  202. data/lib/easy_ml/support/utc.rb +5 -1
  203. data/lib/easy_ml/support.rb +6 -3
  204. data/lib/easy_ml/version.rb +4 -1
  205. data/lib/easy_ml.rb +7 -2
  206. metadata +355 -72
  207. data/app/models/easy_ml/models.rb +0 -5
  208. data/lib/easy_ml/core/model.rb +0 -30
  209. data/lib/easy_ml/core/model_core.rb +0 -181
  210. data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
  211. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
  212. data/lib/easy_ml/core/models/xgboost.rb +0 -10
  213. data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
  214. data/lib/easy_ml/core/models.rb +0 -10
  215. data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
  216. data/lib/easy_ml/core/uploaders.rb +0 -7
  217. data/lib/easy_ml/data/dataloader.rb +0 -6
  218. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
  219. data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
  220. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
  221. data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
  222. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
  223. data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
  224. data/lib/easy_ml/data/dataset/splits.rb +0 -11
  225. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
  226. data/lib/easy_ml/data/dataset/splitters.rb +0 -9
  227. data/lib/easy_ml/data/dataset.rb +0 -430
  228. data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
  229. data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
  230. data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
  231. data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
  232. data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
  233. data/lib/easy_ml/data/datasource.rb +0 -33
  234. data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
  235. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
  236. data/lib/easy_ml/deployment.rb +0 -5
  237. data/lib/easy_ml/support/synced_directory.rb +0 -134
  238. data/lib/easy_ml/transforms.rb +0 -29
  239. /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -1,402 +0,0 @@
1
- require "active_support/core_ext/hash/deep_transform_values"
2
- require "numo/narray"
3
- require "json"
4
-
5
- module EasyML
6
- module Data
7
- class Preprocessor
8
- class SimpleImputer
9
- attr_reader :statistics
10
- attr_accessor :path, :attribute, :strategy, :options
11
-
12
- def initialize(strategy: "mean", path: nil, attribute: nil, options: {}, &block)
13
- @strategy = strategy.to_sym
14
- @path = path
15
- @attribute = attribute
16
- @options = options || {}
17
- apply_defaults
18
- load
19
- @statistics ||= {}
20
- deep_symbolize_keys!
21
- return unless block_given?
22
-
23
- instance_eval(&block)
24
- end
25
-
26
- def deep_symbolize_keys!
27
- @statistics = @statistics.deep_symbolize_keys
28
- end
29
-
30
- def apply_defaults
31
- @options[:date_column] ||= "CREATED_DATE"
32
-
33
- if strategy == :categorical
34
- @options[:categorical_min] ||= 25
35
- elsif strategy == :custom
36
- itself = ->(col) { col }
37
- @options[:fit] ||= itself
38
- @options[:transform] ||= itself
39
- end
40
- end
41
-
42
- def fit(x, df = nil)
43
- x = validate_input(x)
44
-
45
- fit_values = case @strategy
46
- when :mean
47
- fit_mean(x)
48
- when :median
49
- fit_median(x)
50
- when :ffill
51
- fit_ffill(x, df)
52
- when :most_frequent
53
- fit_most_frequent(x)
54
- when :categorical
55
- fit_categorical(x)
56
- when :constant
57
- fit_constant(x)
58
- when :clip
59
- fit_no_op(x)
60
- when :today
61
- fit_no_op(x)
62
- when :one_hot
63
- fit_no_op(x)
64
- when :custom
65
- fit_custom(x)
66
- else
67
- raise ArgumentError, "Invalid strategy: #{@strategy}"
68
- end || {}
69
-
70
- @statistics[attribute] ||= {}
71
- @statistics[attribute][@strategy] = fit_values.merge!(original_dtype: x.dtype)
72
- save
73
- self
74
- end
75
-
76
- def transform(x)
77
- check_is_fitted
78
-
79
- if x.is_a?(Polars::Series)
80
- transform_polars(x)
81
- else
82
- transform_dense(x)
83
- end
84
- end
85
-
86
- def transform_polars(x)
87
- result = case @strategy
88
- when :mean, :median, :ffill, :most_frequent, :constant
89
- x.fill_null(@statistics[@strategy][:value])
90
- when :clip
91
- min = options["min"] || 0
92
- max = options["max"] || 1_000_000_000_000
93
- if x.null_count != x.len
94
- x.clip(min, max)
95
- else
96
- x
97
- end
98
- when :categorical
99
- allowed_values = @statistics.dig(:categorical, :value).select do |_k, v|
100
- v >= options[:categorical_min]
101
- end.keys.map(&:to_s)
102
- if x.null_count == x.len
103
- x.fill_null(transform_categorical(nil))
104
- else
105
- x.apply do |val|
106
- allowed_values.include?(val) ? val : transform_categorical(val)
107
- end
108
- end
109
- when :today
110
- x.fill_null(transform_today(nil))
111
- when :custom
112
- if x.null_count == x.len
113
- x.fill_null(transform_custom(nil))
114
- else
115
- x.apply do |val|
116
- should_transform_custom?(val) ? transform_custom(val) : val
117
- end
118
- end
119
- else
120
- raise ArgumentError, "Unsupported strategy for Polars::Series: #{@strategy}"
121
- end
122
-
123
- # Cast the result back to the original dtype
124
- original_dtype = @statistics.dig(@strategy, :original_dtype)
125
- original_dtype ? result.cast(original_dtype) : result
126
- end
127
-
128
- def file_path
129
- raise "Need both attribute and path to save/load statistics" unless attribute.present? && path.to_s.present?
130
-
131
- File.join(path, "statistics.json")
132
- end
133
-
134
- def cleanup
135
- @statistics = {}
136
- FileUtils.rm(file_path) if File.exist?(file_path)
137
- end
138
-
139
- def save
140
- FileUtils.mkdir_p(File.dirname(file_path))
141
-
142
- all_statistics = (File.exist?(file_path) ? JSON.parse(File.read(file_path)) : {}).deep_symbolize_keys
143
-
144
- deep_symbolize_keys!
145
-
146
- serialized = serialize_statistics(@statistics)
147
- all_statistics[attribute] = {} unless all_statistics.key?(attribute)
148
- all_statistics[attribute][@strategy] = serialized[attribute.to_sym][@strategy.to_sym]
149
-
150
- File.open(file_path, "w") do |file|
151
- file.write(JSON.pretty_generate(all_statistics))
152
- end
153
- end
154
-
155
- def load
156
- return unless File.exist?(file_path)
157
-
158
- all_statistics = JSON.parse(File.read(file_path))
159
- attribute_stats = all_statistics[@attribute]
160
-
161
- return unless attribute_stats
162
-
163
- @statistics = deserialize_statistics(attribute_stats)
164
- deep_symbolize_keys!
165
- end
166
-
167
- def should_transform_categorical?(val)
168
- values = @statistics.dig(:categorical, :value) || {}
169
- min_ct = options[:categorical_min] || 25
170
- allowed_values = values.select { |_v, c| c >= min_ct }
171
-
172
- allowed_values.keys.map(&:to_s).exclude?(val)
173
- end
174
-
175
- def transform_categorical(val)
176
- return "other" if val.nil?
177
-
178
- values = @statistics.dig(:categorical, :value) || {}
179
- min_ct = options[:categorical_min] || 25
180
- allowed_values = values.select { |_v, c| c >= min_ct }.keys.map(&:to_s)
181
-
182
- allowed_values.include?(val.to_s) ? val.to_s : "other"
183
- end
184
-
185
- def transform_today(_val)
186
- EST.now.beginning_of_day
187
- end
188
-
189
- def fit_custom(x)
190
- x
191
- end
192
-
193
- def should_transform_custom?(x)
194
- if options.key?(:should_transform)
195
- options[:should_transform].call(x)
196
- else
197
- should_transform_default?(x)
198
- end
199
- end
200
-
201
- def transform_custom(x)
202
- raise "Transform required" unless options.key?(:transform)
203
-
204
- options[:transform].call(x)
205
- end
206
-
207
- private
208
-
209
- def validate_input(x)
210
- raise ArgumentError, "Input must be a Polars::Series" unless x.is_a?(Polars::Series)
211
-
212
- x
213
- end
214
-
215
- def fit_mean(x)
216
- { value: x.mean }
217
- end
218
-
219
- def fit_median(x)
220
- { value: x.median }
221
- end
222
-
223
- def fit_ffill(x, df)
224
- values = { value: nil, max_date: nil }
225
-
226
- date_col = df[options[:date_column]]
227
- return if date_col.is_null.all
228
-
229
- sorted_df = df.sort(options[:date_column])
230
- new_max_date = sorted_df[options[:date_column]].max
231
-
232
- current_max_date = values[:max_date]
233
- return if current_max_date && current_max_date > new_max_date
234
-
235
- values[:max_date] = [current_max_date, new_max_date].compact.max
236
-
237
- # Get the last non-null value
238
- last_non_null = sorted_df[x.name].filter(sorted_df[x.name].is_not_null).tail(1).to_a.first
239
- values[:value] = last_non_null
240
-
241
- values
242
- end
243
-
244
- def fit_most_frequent(x)
245
- value_counts = x.filter(x.is_not_null).value_counts
246
- column_names = value_counts.columns
247
- column_names[0]
248
- count_column = column_names[1]
249
-
250
- most_frequent_value = value_counts.sort(count_column, descending: true).row(0)[0]
251
- { value: most_frequent_value }
252
- end
253
-
254
- def fit_categorical(x)
255
- value_counts = x.value_counts
256
- column_names = value_counts.columns
257
- value_column = column_names[0]
258
- count_column = column_names[1]
259
-
260
- as_hash = value_counts.select([value_column, count_column]).rows.to_a.to_h.transform_keys(&:to_s)
261
- label_encoder = as_hash.keys.sort.each.with_index.reduce({}) do |h, (k, i)|
262
- h.tap do
263
- h[k] = i
264
- end
265
- end
266
- label_decoder = label_encoder.invert
267
-
268
- {
269
- value: as_hash,
270
- label_encoder: label_encoder,
271
- label_decoder: label_decoder
272
- }
273
- end
274
-
275
- def fit_no_op(_x)
276
- {}
277
- end
278
-
279
- def fit_constant(_x)
280
- { value: @options[:fill_value] }
281
- end
282
-
283
- def transform_default(_val)
284
- @statistics[strategy][:value]
285
- end
286
-
287
- def should_transform_default?(val)
288
- checker_method = val.respond_to?(:nan?) ? :nan? : :nil?
289
- val.send(checker_method)
290
- end
291
-
292
- def transform_dense(x)
293
- result = x.map do |val|
294
- strategy_method = respond_to?("transform_#{strategy}") ? "transform_#{strategy}" : "transform_default"
295
- checker_method = respond_to?("should_transform_#{strategy}?") ? "should_transform_#{strategy}?" : "should_transform_default?"
296
- send(checker_method, val) ? send(strategy_method, val) : val
297
- end
298
-
299
- # Cast the result back to the original dtype
300
- original_dtype = @statistics[:original_dtype]
301
- if original_dtype
302
- result.map { |val| cast_to_dtype(val, original_dtype) }
303
- else
304
- result
305
- end
306
- end
307
-
308
- def check_is_fitted
309
- return if %i[clip today custom].include?(strategy)
310
-
311
- raise "SimpleImputer has not been fitted yet for #{attribute}##{strategy}" unless @statistics[strategy]
312
- end
313
-
314
- def serialize_statistics(stats)
315
- stats.deep_transform_values do |value|
316
- case value
317
- when Time, DateTime
318
- { "__type__" => "datetime", "value" => value.iso8601 }
319
- when Date
320
- { "__type__" => "date", "value" => value.iso8601 }
321
- when BigDecimal
322
- { "__type__" => "bigdecimal", "value" => value.to_s }
323
- when Polars::DataType
324
- { "__type__" => "polars_dtype", "value" => value.to_s }
325
- when Symbol
326
- { "__type__" => "symbol", "value" => value.to_s }
327
- else
328
- value
329
- end
330
- end
331
- end
332
-
333
- def deserialize_statistics(stats)
334
- stats.transform_values do |value|
335
- recursive_deserialize(value)
336
- end
337
- end
338
-
339
- def recursive_deserialize(value)
340
- case value
341
- when Hash
342
- if value["__type__"]
343
- deserialize_special_type(value)
344
- else
345
- value.transform_values { |v| recursive_deserialize(v) }
346
- end
347
- when Array
348
- value.map { |v| recursive_deserialize(v) }
349
- else
350
- value
351
- end
352
- end
353
-
354
- def deserialize_special_type(value)
355
- case value["__type__"]
356
- when "datetime"
357
- DateTime.parse(value["value"])
358
- when "date"
359
- Date.parse(value["value"])
360
- when "bigdecimal"
361
- BigDecimal(value["value"])
362
- when "polars_dtype"
363
- parse_polars_dtype(value["value"])
364
- when "symbol"
365
- value["value"].to_sym
366
- else
367
- value["value"]
368
- end
369
- end
370
-
371
- def parse_polars_dtype(dtype_string)
372
- case dtype_string
373
- when /^Polars::Datetime/
374
- time_unit = dtype_string[/time_unit: "(.*?)"/, 1]
375
- time_zone = dtype_string[/time_zone: (.*)?\)/, 1]
376
- time_zone = time_zone == "nil" ? nil : time_zone&.delete('"')
377
- Polars::Datetime.new(time_unit: time_unit, time_zone: time_zone).class
378
- when /^Polars::/
379
- Polars.const_get(dtype_string.split("::").last)
380
- else
381
- raise ArgumentError, "Unknown Polars data type: #{dtype_string}"
382
- end
383
- end
384
-
385
- def cast_to_dtype(value, dtype)
386
- case dtype
387
- when Polars::Int64
388
- value.to_i
389
- when Polars::Float64
390
- value.to_f
391
- when Polars::Boolean
392
- !!value
393
- when Polars::Utf8
394
- value.to_s
395
- else
396
- value
397
- end
398
- end
399
- end
400
- end
401
- end
402
- end
@@ -1,5 +0,0 @@
1
- module EasyML
2
- module Deployment
3
- require_relative "deployment/model_uploader"
4
- end
5
- end
@@ -1,134 +0,0 @@
1
- require "glue_gun"
2
-
3
- module EasyML
4
- module Support
5
- class SyncedDirectory
6
- include GlueGun::DSL
7
-
8
- attribute :root_dir, :string
9
- attribute :s3_bucket, :string
10
- attribute :s3_prefix, :string
11
- attribute :s3_access_key_id, :string
12
- attribute :s3_secret_access_key, :string
13
-
14
- def sync
15
- return false if synced?
16
-
17
- mk_dir
18
- clean_dir!
19
- download
20
- true
21
- end
22
-
23
- def files
24
- Dir.glob(File.join(root_dir, File.join(s3_prefix, "*.csv")))
25
- end
26
-
27
- def age(format: "human")
28
- Age.age(last_updated_at, EST.now, format: format)
29
- end
30
-
31
- def stale?
32
- !synced?
33
- end
34
-
35
- def synced?
36
- return @synced unless @synced.nil?
37
-
38
- @synced = calculate_synced
39
- end
40
-
41
- def last_updated_at
42
- return nil if files.empty?
43
-
44
- files.map { |file| File.mtime(file) }.max.in_time_zone(EST)
45
- end
46
-
47
- private
48
-
49
- def mk_dir
50
- FileUtils.mkdir_p(root_dir)
51
- end
52
-
53
- def clean_dir!
54
- FileUtils.rm_rf(root_dir)
55
- end
56
-
57
- def s3
58
- @s3 ||= begin
59
- credentials = Aws::Credentials.new(s3_access_key_id, s3_secret_access_key)
60
- Aws::S3::Client.new(credentials: credentials)
61
- end
62
- end
63
-
64
- def download
65
- s3.list_objects_v2(bucket: s3_bucket, prefix: s3_prefix).contents.each do |object|
66
- next if object.key.end_with?("/") # skip folders
67
-
68
- gzipped_file_path = File.join(root_dir, object.key)
69
- FileUtils.mkdir_p(File.dirname(gzipped_file_path))
70
-
71
- s3.get_object(
72
- response_target: gzipped_file_path,
73
- bucket: s3_bucket,
74
- key: object.key
75
- )
76
-
77
- puts "Downloaded #{object.key} to #{gzipped_file_path}"
78
-
79
- # Ungzip the file
80
- ungzipped_file_path = ungzip_file(gzipped_file_path)
81
- puts "Ungzipped to #{ungzipped_file_path}"
82
- end
83
- end
84
-
85
- def ungzip_file(gzipped_file_path)
86
- ungzipped_file_path = gzipped_file_path.sub(/\.gz$/, "")
87
-
88
- Zlib::GzipReader.open(gzipped_file_path) do |gz|
89
- File.open(ungzipped_file_path, "wb") do |file|
90
- file.write(gz.read)
91
- end
92
- end
93
-
94
- File.delete(gzipped_file_path) # Optionally delete the gzipped file after extraction
95
- ungzipped_file_path
96
- end
97
-
98
- def expand_dir(dir)
99
- return dir if dir.to_s[0] == "/"
100
-
101
- Rails.root.join(dir)
102
- end
103
-
104
- def new_data_available?
105
- return true if files.empty?
106
-
107
- local_latest = last_updated_at
108
- s3_latest = s3_last_updated_at
109
-
110
- return false if s3_latest.nil?
111
-
112
- s3_latest > local_latest
113
- end
114
-
115
- def calculate_synced
116
- return false if age.nil?
117
-
118
- !new_data_available?
119
- end
120
-
121
- def s3_last_updated_at
122
- s3_latest = nil
123
-
124
- s3.list_objects_v2(bucket: s3_bucket, prefix: s3_prefix).contents.each do |object|
125
- next if object.key.end_with?("/")
126
-
127
- s3_latest = [s3_latest, object.last_modified].compact.max
128
- end
129
-
130
- s3_latest.in_time_zone(EST)
131
- end
132
- end
133
- end
134
- end
@@ -1,29 +0,0 @@
1
- module EasyML::Transforms
2
- def self.included(base)
3
- base.extend(ClassMethods)
4
- end
5
-
6
- module ClassMethods
7
- def transforms
8
- @transforms ||= []
9
- end
10
-
11
- def transform(method_name)
12
- transforms << method_name
13
- end
14
-
15
- def apply_transforms(df)
16
- new.apply_transforms(df)
17
- end
18
- end
19
-
20
- def missing_any?(list1, list2)
21
- (list1 - list2).any?
22
- end
23
-
24
- def apply_transforms(df)
25
- self.class.transforms.reduce(df) do |df, transform_method|
26
- send(transform_method, df)
27
- end
28
- end
29
- end