easy_ml 0.1.3 → 0.2.0.pre.rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (239) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +234 -26
  3. data/Rakefile +45 -0
  4. data/app/controllers/easy_ml/application_controller.rb +67 -0
  5. data/app/controllers/easy_ml/columns_controller.rb +38 -0
  6. data/app/controllers/easy_ml/datasets_controller.rb +156 -0
  7. data/app/controllers/easy_ml/datasources_controller.rb +88 -0
  8. data/app/controllers/easy_ml/deploys_controller.rb +20 -0
  9. data/app/controllers/easy_ml/models_controller.rb +151 -0
  10. data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
  11. data/app/controllers/easy_ml/settings_controller.rb +59 -0
  12. data/app/frontend/components/AlertProvider.tsx +108 -0
  13. data/app/frontend/components/DatasetPreview.tsx +161 -0
  14. data/app/frontend/components/EmptyState.tsx +28 -0
  15. data/app/frontend/components/ModelCard.tsx +255 -0
  16. data/app/frontend/components/ModelDetails.tsx +334 -0
  17. data/app/frontend/components/ModelForm.tsx +384 -0
  18. data/app/frontend/components/Navigation.tsx +300 -0
  19. data/app/frontend/components/Pagination.tsx +72 -0
  20. data/app/frontend/components/Popover.tsx +55 -0
  21. data/app/frontend/components/PredictionStream.tsx +105 -0
  22. data/app/frontend/components/ScheduleModal.tsx +726 -0
  23. data/app/frontend/components/SearchInput.tsx +23 -0
  24. data/app/frontend/components/SearchableSelect.tsx +132 -0
  25. data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
  26. data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
  27. data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
  28. data/app/frontend/components/dataset/ColumnList.tsx +101 -0
  29. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
  30. data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
  31. data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
  32. data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
  33. data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
  34. data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
  35. data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
  36. data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
  37. data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
  38. data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
  39. data/app/frontend/components/dataset/splitters/constants.ts +77 -0
  40. data/app/frontend/components/dataset/splitters/types.ts +168 -0
  41. data/app/frontend/components/dataset/splitters/utils.ts +53 -0
  42. data/app/frontend/components/features/CodeEditor.tsx +46 -0
  43. data/app/frontend/components/features/DataPreview.tsx +150 -0
  44. data/app/frontend/components/features/FeatureCard.tsx +88 -0
  45. data/app/frontend/components/features/FeatureForm.tsx +235 -0
  46. data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
  47. data/app/frontend/components/settings/PluginSettings.tsx +81 -0
  48. data/app/frontend/components/ui/badge.tsx +44 -0
  49. data/app/frontend/components/ui/collapsible.tsx +9 -0
  50. data/app/frontend/components/ui/scroll-area.tsx +46 -0
  51. data/app/frontend/components/ui/separator.tsx +29 -0
  52. data/app/frontend/entrypoints/App.tsx +40 -0
  53. data/app/frontend/entrypoints/Application.tsx +24 -0
  54. data/app/frontend/hooks/useAutosave.ts +61 -0
  55. data/app/frontend/layouts/Layout.tsx +38 -0
  56. data/app/frontend/lib/utils.ts +6 -0
  57. data/app/frontend/mockData.ts +272 -0
  58. data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
  59. data/app/frontend/pages/DatasetsPage.tsx +261 -0
  60. data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
  61. data/app/frontend/pages/DatasourcesPage.tsx +261 -0
  62. data/app/frontend/pages/EditModelPage.tsx +45 -0
  63. data/app/frontend/pages/EditTransformationPage.tsx +56 -0
  64. data/app/frontend/pages/ModelsPage.tsx +115 -0
  65. data/app/frontend/pages/NewDatasetPage.tsx +366 -0
  66. data/app/frontend/pages/NewModelPage.tsx +45 -0
  67. data/app/frontend/pages/NewTransformationPage.tsx +43 -0
  68. data/app/frontend/pages/SettingsPage.tsx +272 -0
  69. data/app/frontend/pages/ShowModelPage.tsx +30 -0
  70. data/app/frontend/pages/TransformationsPage.tsx +95 -0
  71. data/app/frontend/styles/application.css +100 -0
  72. data/app/frontend/types/dataset.ts +146 -0
  73. data/app/frontend/types/datasource.ts +33 -0
  74. data/app/frontend/types/preprocessing.ts +1 -0
  75. data/app/frontend/types.ts +113 -0
  76. data/app/helpers/easy_ml/application_helper.rb +10 -0
  77. data/app/jobs/easy_ml/application_job.rb +21 -0
  78. data/app/jobs/easy_ml/batch_job.rb +46 -0
  79. data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
  80. data/app/jobs/easy_ml/deploy_job.rb +13 -0
  81. data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
  82. data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
  83. data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
  84. data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
  85. data/app/jobs/easy_ml/training_job.rb +62 -0
  86. data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
  87. data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
  88. data/app/models/easy_ml/cleaner.rb +82 -0
  89. data/app/models/easy_ml/column.rb +124 -0
  90. data/app/models/easy_ml/column_history.rb +30 -0
  91. data/app/models/easy_ml/column_list.rb +122 -0
  92. data/app/models/easy_ml/concerns/configurable.rb +61 -0
  93. data/app/models/easy_ml/concerns/versionable.rb +19 -0
  94. data/app/models/easy_ml/dataset.rb +767 -0
  95. data/app/models/easy_ml/dataset_history.rb +56 -0
  96. data/app/models/easy_ml/datasource.rb +182 -0
  97. data/app/models/easy_ml/datasource_history.rb +24 -0
  98. data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
  99. data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
  100. data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
  101. data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
  102. data/app/models/easy_ml/deploy.rb +114 -0
  103. data/app/models/easy_ml/event.rb +79 -0
  104. data/app/models/easy_ml/feature.rb +437 -0
  105. data/app/models/easy_ml/feature_history.rb +38 -0
  106. data/app/models/easy_ml/model.rb +575 -41
  107. data/app/models/easy_ml/model_file.rb +133 -0
  108. data/app/models/easy_ml/model_file_history.rb +24 -0
  109. data/app/models/easy_ml/model_history.rb +51 -0
  110. data/app/models/easy_ml/models/base_model.rb +58 -0
  111. data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
  112. data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
  113. data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
  114. data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
  115. data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
  116. data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
  117. data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
  118. data/app/models/easy_ml/models/xgboost.rb +544 -4
  119. data/app/models/easy_ml/prediction.rb +44 -0
  120. data/app/models/easy_ml/retraining_job.rb +278 -0
  121. data/app/models/easy_ml/retraining_run.rb +184 -0
  122. data/app/models/easy_ml/settings.rb +37 -0
  123. data/app/models/easy_ml/splitter.rb +90 -0
  124. data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
  125. data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
  126. data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
  127. data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
  128. data/app/models/easy_ml/tuner_job.rb +56 -0
  129. data/app/models/easy_ml/tuner_run.rb +31 -0
  130. data/app/models/splitter_history.rb +6 -0
  131. data/app/serializers/easy_ml/column_serializer.rb +27 -0
  132. data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
  133. data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
  134. data/app/serializers/easy_ml/feature_serializer.rb +27 -0
  135. data/app/serializers/easy_ml/model_serializer.rb +90 -0
  136. data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
  137. data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
  138. data/app/serializers/easy_ml/settings_serializer.rb +9 -0
  139. data/app/views/layouts/easy_ml/application.html.erb +15 -0
  140. data/config/initializers/resque.rb +3 -0
  141. data/config/resque-pool.yml +6 -0
  142. data/config/routes.rb +39 -0
  143. data/config/spring.rb +1 -0
  144. data/config/vite.json +15 -0
  145. data/lib/easy_ml/configuration.rb +64 -0
  146. data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
  147. data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
  148. data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
  149. data/lib/easy_ml/core/model_evaluator.rb +161 -89
  150. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
  151. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
  152. data/lib/easy_ml/core/tuner.rb +123 -62
  153. data/lib/easy_ml/core.rb +0 -3
  154. data/lib/easy_ml/core_ext/hash.rb +24 -0
  155. data/lib/easy_ml/core_ext/pathname.rb +11 -5
  156. data/lib/easy_ml/data/date_converter.rb +90 -0
  157. data/lib/easy_ml/data/filter_extensions.rb +31 -0
  158. data/lib/easy_ml/data/polars_column.rb +126 -0
  159. data/lib/easy_ml/data/polars_reader.rb +297 -0
  160. data/lib/easy_ml/data/preprocessor.rb +280 -142
  161. data/lib/easy_ml/data/simple_imputer.rb +255 -0
  162. data/lib/easy_ml/data/splits/file_split.rb +252 -0
  163. data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
  164. data/lib/easy_ml/data/splits/split.rb +95 -0
  165. data/lib/easy_ml/data/splits.rb +9 -0
  166. data/lib/easy_ml/data/statistics_learner.rb +93 -0
  167. data/lib/easy_ml/data/synced_directory.rb +341 -0
  168. data/lib/easy_ml/data.rb +6 -2
  169. data/lib/easy_ml/engine.rb +105 -6
  170. data/lib/easy_ml/feature_store.rb +227 -0
  171. data/lib/easy_ml/features.rb +61 -0
  172. data/lib/easy_ml/initializers/inflections.rb +17 -3
  173. data/lib/easy_ml/logging.rb +2 -2
  174. data/lib/easy_ml/predict.rb +74 -0
  175. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
  176. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
  177. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
  178. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
  179. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
  180. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
  181. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
  182. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
  183. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
  184. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
  185. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
  186. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
  187. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
  188. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
  189. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
  190. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
  191. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
  192. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
  193. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
  194. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
  195. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
  196. data/lib/easy_ml/support/est.rb +5 -1
  197. data/lib/easy_ml/support/file_rotate.rb +79 -15
  198. data/lib/easy_ml/support/file_support.rb +9 -0
  199. data/lib/easy_ml/support/local_file.rb +24 -0
  200. data/lib/easy_ml/support/lockable.rb +62 -0
  201. data/lib/easy_ml/support/synced_file.rb +103 -0
  202. data/lib/easy_ml/support/utc.rb +5 -1
  203. data/lib/easy_ml/support.rb +6 -3
  204. data/lib/easy_ml/version.rb +4 -1
  205. data/lib/easy_ml.rb +7 -2
  206. metadata +355 -72
  207. data/app/models/easy_ml/models.rb +0 -5
  208. data/lib/easy_ml/core/model.rb +0 -30
  209. data/lib/easy_ml/core/model_core.rb +0 -181
  210. data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
  211. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
  212. data/lib/easy_ml/core/models/xgboost.rb +0 -10
  213. data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
  214. data/lib/easy_ml/core/models.rb +0 -10
  215. data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
  216. data/lib/easy_ml/core/uploaders.rb +0 -7
  217. data/lib/easy_ml/data/dataloader.rb +0 -6
  218. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
  219. data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
  220. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
  221. data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
  222. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
  223. data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
  224. data/lib/easy_ml/data/dataset/splits.rb +0 -11
  225. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
  226. data/lib/easy_ml/data/dataset/splitters.rb +0 -9
  227. data/lib/easy_ml/data/dataset.rb +0 -430
  228. data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
  229. data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
  230. data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
  231. data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
  232. data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
  233. data/lib/easy_ml/data/datasource.rb +0 -33
  234. data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
  235. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
  236. data/lib/easy_ml/deployment.rb +0 -5
  237. data/lib/easy_ml/support/synced_directory.rb +0 -134
  238. data/lib/easy_ml/transforms.rb +0 -29
  239. /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -0,0 +1,767 @@
1
+ # == Schema Information
2
+ #
3
+ # Table name: easy_ml_datasets
4
+ #
5
+ # id :bigint not null, primary key
6
+ # name :string not null
7
+ # description :string
8
+ # dataset_type :string
9
+ # status :string
10
+ # version :string
11
+ # datasource_id :bigint
12
+ # root_dir :string
13
+ # configuration :json
14
+ # num_rows :bigint
15
+ # workflow_status :string
16
+ # statistics :json
17
+ # preprocessor_statistics :json
18
+ # schema :json
19
+ # refreshed_at :datetime
20
+ # created_at :datetime not null
21
+ # updated_at :datetime not null
22
+ #
23
+ module EasyML
24
+ class Dataset < ActiveRecord::Base
25
+ self.table_name = "easy_ml_datasets"
26
+ include EasyML::Concerns::Configurable
27
+ include EasyML::Concerns::Versionable
28
+ include Historiographer::Silent
29
+ historiographer_mode :snapshot_only
30
+
31
+ enum workflow_status: {
32
+ analyzing: "analyzing",
33
+ ready: "ready",
34
+ failed: "failed",
35
+ }
36
+
37
+ SPLIT_ORDER = %i[train valid test]
38
+
39
+ self.filter_attributes += %i[configuration statistics schema]
40
+
41
+ validates :name, presence: true
42
+ belongs_to :datasource, class_name: "EasyML::Datasource"
43
+
44
+ has_many :models, class_name: "EasyML::Model"
45
+ has_many :columns, class_name: "EasyML::Column", dependent: :destroy, extend: EasyML::ColumnList
46
+ accepts_nested_attributes_for :columns, allow_destroy: true, update_only: true
47
+
48
+ has_many :features, dependent: :destroy, class_name: "EasyML::Feature"
49
+ accepts_nested_attributes_for :features, allow_destroy: true
50
+
51
+ has_many :events, as: :eventable, class_name: "EasyML::Event", dependent: :destroy
52
+
53
+ before_destroy :destructively_cleanup!
54
+
55
+ delegate :new_data_available?, :synced?, :stale?, to: :datasource
56
+ delegate :train, :test, :valid, to: :split
57
+ delegate :splits, to: :splitter
58
+
59
+ has_one :splitter, class_name: "EasyML::Splitter", dependent: :destroy, inverse_of: :dataset
60
+
61
+ accepts_nested_attributes_for :splitter,
62
+ allow_destroy: true,
63
+ reject_if: :all_blank
64
+
65
+ validates :datasource, presence: true
66
+
67
+ add_configuration_attributes :remote_files
68
+
69
+ after_find :download_remote_files
70
+ after_create :refresh_async
71
+ after_initialize do
72
+ bump_version unless version.present?
73
+ write_attribute(:workflow_status, :ready) if workflow_status.nil?
74
+ end
75
+ before_save :set_root_dir
76
+ before_validation :filter_duplicate_features
77
+
78
    # Static configuration exposed to the frontend: column type options,
    # preprocessing strategies, feature registry entries, and splitter
    # constants, gathered from the relevant EasyML modules.
    def self.constants
      {
        column_types: EasyML::Data::PolarsColumn::TYPE_MAP.keys.map do |type|
          { value: type.to_s, label: type.to_s.titleize }
        end,
        preprocessing_strategies: EasyML::Data::Preprocessor.constants[:preprocessing_strategies],
        feature_options: EasyML::Features::Registry.list_flat,
        splitter_constants: EasyML::Splitter.constants,
      }
    end
88
+
89
    # Guarded writer: root_dir is derived (see #root_dir) and may only be
    # "assigned" the value it already computes — anything else is a bug.
    def root_dir=(value)
      raise "Cannot override value of root_dir!" unless value.to_s == root_dir.to_s

      write_attribute(:root_dir, value)
    end
94
+
95
    # before_save hook: persist the derived root_dir onto the record.
    def set_root_dir
      write_attribute(:root_dir, root_dir)
    end
98
+
99
    # Derived storage directory for this dataset's files:
    # <engine root>/datasets/<underscored name>/<version>.
    # Bumps the version first so the path always has one.
    def root_dir
      bump_version
      EasyML::Engine.root_dir.join("datasets").join(underscored_name).join(version).to_s
    end
103
+
104
    # before_destroy hook: remove all files under the dataset's directory.
    def destructively_cleanup!
      FileUtils.rm_rf(root_dir) if root_dir.present?
    end
107
+
108
+ def schema
109
+ read_attribute(:schema) || datasource.schema
110
+ end
111
+
112
    # Schema of the processed data, falling back to the raw data's schema
    # when nothing has been processed yet. Reads only one row.
    def processed_schema
      processed.data(limit: 1)&.schema || raw.data(limit: 1)&.schema
    end
115
+
116
    # When any column's configuration changed after the last refresh,
    # drop the stored splits and re-read the datasource with the current
    # column definitions.
    def refresh_datatypes
      return unless columns_need_refresh?

      cleanup
      datasource.reread(columns)
    end
122
+
123
+ def num_rows
124
+ if datasource&.num_rows.nil?
125
+ datasource.after_sync
126
+ end
127
+ datasource&.num_rows
128
+ end
129
+
130
    # Kick off a background refresh unless one is already running; marks
    # the dataset as analyzing before enqueuing the job.
    def refresh_async
      return if analyzing?

      update(workflow_status: "analyzing")
      EasyML::RefreshDatasetJob.perform_later(id)
    end
136
+
137
+ def raw
138
+ return @raw if @raw && @raw.dataset
139
+
140
+ @raw = initialize_split("raw")
141
+ end
142
+
143
+ def processed
144
+ return @processed if @processed && @processed.dataset
145
+
146
+ @processed = initialize_split("processed")
147
+ end
148
+
149
    # Copy the raw and processed splits (and every feature) over to a new
    # version, then persist the new version string on the record.
    def bump_versions(version)
      self.version = version

      @raw = raw.cp(version)
      @processed = processed.cp(version)
      features.each(&:bump_version)

      save
    end
158
+
159
    # Destructive prepare: wipe stored splits, force-refresh the
    # datasource, then re-split.
    def prepare!
      cleanup
      refresh_datasource!
      split_data
    end
164
+
165
    # Non-destructive prepare: refresh the datasource (no-op if fresh) and
    # split if needed.
    def prepare
      refresh_datasource
      split_data
    end
169
+
170
    # Core refresh body: split, process, learn schema/statistics, then
    # mark the dataset ready. Runs inside refreshing's lock and workflow
    # status bookkeeping.
    def actually_refresh
      refreshing do
        split_data
        process_data
        fully_reload
        learn
        now = UTC.now
        # refreshed_at and updated_at are pinned to the same instant so
        # staleness comparisons against column/feature updated_at behave.
        update(workflow_status: "ready", refreshed_at: now, updated_at: now)
        fully_reload
      end
    end
181
+
182
    # Force-refresh: clean up existing splits, re-pull the datasource, and
    # re-fit every feature regardless of whether it needs it.
    #
    # @param async [Boolean] fit features via background jobs when true
    def refresh!(async: false)
      refreshing do
        prepare!
        fit_features!(async: async)
      end
      # In async mode the feature-fit job is responsible for finalization.
      after_fit_features unless async
    end
189
+
190
    # Refresh the dataset: prepare splits, fit stale features, finalize.
    #
    # @param async [Boolean] when true, delegate the whole refresh to a
    #   background job and return immediately.
    def refresh(async: false)
      return refresh_async if async

      refreshing do
        prepare
        fit_features(async: async)
      end
      # In async mode the feature-fit job is responsible for finalization.
      after_fit_features unless async
    end
199
+
200
    # Force-fit every given feature (ignores the needs_fit flag).
    def fit_features!(async: false, features: self.features)
      fit_features(async: async, features: features, force: true)
    end
203
+
204
    # Fit (compute) features that need it.
    #
    # @param async [Boolean] run the computation via background jobs
    # @param features [Enumerable] candidate features (defaults to all)
    # @param force [Boolean] fit every candidate, not just .needs_fit
    def fit_features(async: false, features: self.features, force: false)
      features_to_compute = force ? features : features.needs_fit
      return if features_to_compute.empty?

      # Fitting is orchestrated from a single feature record, which
      # receives the full batch to compute.
      features.first.fit(features: features_to_compute, async: async)
    end
210
+
211
    # Finalization hook after all features are fit: clear needs_fit flags,
    # release the dataset lock, then run the core refresh.
    def after_fit_features
      features.update_all(needs_fit: false, fit_at: Time.current)
      unlock!
      actually_refresh
    end
216
+
217
+ def columns_need_refresh
218
+ preloaded_columns.select do |col|
219
+ col.updated_at.present? &&
220
+ refreshed_at.present? &&
221
+ col.updated_at > refreshed_at
222
+ end
223
+ end
224
+
225
    # True when any column changed since the last refresh.
    def columns_need_refresh?
      columns_need_refresh.any?
    end
228
+
229
+ def features_need_fit
230
+ preloaded_features.select do |f|
231
+ (f.updated_at.present? && refreshed_at.present? && f.updated_at > refreshed_at) ||
232
+ f.needs_fit?
233
+ end
234
+ end
235
+
236
    # True when any feature needs (re)fitting.
    def features_need_fit?
      features_need_fit.any?
    end
239
+
240
+ def refresh_reasons
241
+ {
242
+ "Not split" => not_split?,
243
+ "Refreshed at is nil" => refreshed_at.nil?,
244
+ "Columns need refresh" => columns_need_refresh?,
245
+ "Features need refresh" => features_need_fit?,
246
+ "Datasource needs refresh" => datasource_needs_refresh?,
247
+ "Datasource was refreshed" => datasource_was_refreshed?,
248
+ }.select { |k, v| v }.map { |k, v| k }
249
+ end
250
+
251
+ def needs_refresh?
252
+ refresh_reasons.any?
253
+ end
254
+
255
+ def not_split?
256
+ processed.split_at.nil? || raw.split_at.nil?
257
+ end
258
+
259
+ def datasource_needs_refresh?
260
+ datasource&.needs_refresh?
261
+ end
262
+
263
+ def datasource_was_refreshed?
264
+ raw.split_at.present? && raw.split_at < datasource.last_updated_at
265
+ end
266
+
267
    # Learn schema and statistics from current data, then sync column
    # metadata records. Order matters: columns.sync reads what the two
    # learn steps persist.
    def learn
      learn_schema
      learn_statistics
      columns.sync
    end
272
+
273
    # Wrap a unit of refresh work in workflow-status bookkeeping and a
    # dataset-level lock: status goes to "analyzing" for the duration and
    # to "failed" (re-raising) on any error. Returns false for history
    # (snapshot) records, which are read-only.
    def refreshing
      return false if is_history_class?
      # Clear any stale lock left behind by a crashed run — but not while
      # another refresh is actively analyzing.
      unlock! unless analyzing?

      lock_dataset do
        update(workflow_status: "analyzing")
        fully_reload
        yield
      ensure
        # Always release the lock, even when the block raises.
        unlock!
      end
    rescue => e
      update(workflow_status: "failed")
      # Surface only the easy_ml frames to keep the failure log readable.
      e.backtrace.grep(/easy_ml/).each do |line|
        puts line
      end
      raise e
    end
291
+
292
    # Release this dataset's lock unconditionally.
    def unlock!
      Support::Lockable.unlock!(lock_key)
    end
295
+
296
    # True when another process currently holds this dataset's lock.
    def locked?
      Support::Lockable.locked?(lock_key)
    end
299
+
300
+ def lock_dataset
301
+ data = processed.data(limit: 1).to_a.any? ? processed.data : raw.data
302
+ with_lock do |client|
303
+ yield
304
+ end
305
+ end
306
+
307
    # Acquire the dataset lock (stale after 60s, single holder) and yield
    # the lock client to the block.
    def with_lock
      EasyML::Support::Lockable.with_lock(lock_key, stale_timeout: 60, resources: 1) do |client|
        yield client
      end
    end
312
+
313
+ def lock_key
314
+ "dataset:#{id}"
315
+ end
316
+
317
+ def learn_schema
318
+ data = processed.data(limit: 1).to_a.any? ? processed.data : raw.data
319
+ schema = data.schema.reduce({}) do |h, (k, v)|
320
+ h.tap do
321
+ h[k] = EasyML::Data::PolarsColumn.polars_to_sym(v)
322
+ end
323
+ end
324
+ write_attribute(:schema, schema)
325
+ end
326
+
327
    # Compute raw/processed statistics and persist them on the record.
    def learn_statistics
      update(
        statistics: EasyML::Data::StatisticsLearner.learn(raw, processed),
      )
    end
332
+
333
    # Processing pipeline: ensure splits exist, fit the preprocessor on
    # train data, then normalize every segment into the processed store.
    def process_data
      split_data
      fit
      normalize_all
      # alert_nulls
    end
339
+
340
+ def needs_learn?(df)
341
+ return true if columns_need_refresh?
342
+
343
+ never_learned = columns.none?
344
+ return true if never_learned
345
+
346
+ new_features = features.any? { |f| f.updated_at > columns.maximum(:updated_at) }
347
+ return true if new_features
348
+
349
+ new_cols = df.present? ? (df.columns - columns.map(&:name)) : []
350
+ new_cols = columns.syncable
351
+
352
+ return true if new_cols.any?
353
+ end
354
+
355
+ def compare_datasets(df, df_was)
356
+ # Step 1: Check if the entire dataset is identical
357
+ if df == df_was
358
+ return "The datasets are identical."
359
+ end
360
+
361
+ # Step 2: Identify columns with differences
362
+ differing_columns = df.columns.select do |column|
363
+ df[column] != df_was[column]
364
+ end
365
+
366
+ # Step 3: Find row-level differences for each differing column
367
+ differences = {}
368
+ differing_columns.each do |column|
369
+ mask = df[column] != df_was[column]
370
+ differing_rows = df[mask][column].zip(df_was[mask][column]).map.with_index do |(df_value, df_was_value), index|
371
+ { row_index: index, df_value: df_value, df_was_value: df_was_value }
372
+ end
373
+
374
+ differences[column] = differing_rows
375
+ end
376
+
377
+ { differing_columns: differing_columns, differences: differences }
378
+ end
379
+
380
    # Full normalization pipeline for a dataframe: feature engineering,
    # null-dropping, preprocessing, then column masking.
    #
    # @param df [Polars::DataFrame, nil] frame to normalize
    # @param split_ys [Boolean] strip the target and return features only
    # @param inference [Boolean] inference mode: adds missing feature
    #   columns as nulls and raises if any nulls remain
    # @param all_columns [Boolean] skip the training column mask
    # @param features [Array] features to apply (defaults to all)
    # @param idx [Integer, nil] segment index; schema re-learn only runs
    #   for the first segment (idx == 0)
    def normalize(df = nil, split_ys: false, inference: false, all_columns: false, features: self.features, idx: nil)
      df = apply_features(df, features)
      df = drop_nulls(df)
      df = apply_missing_features(df, inference: inference)
      df = preprocessor.postprocess(df, inference: inference)

      # Learn will update columns, so if any features have been added
      # since the last time columns were learned, we should re-learn the schema
      learn if idx == 0 && needs_learn?(df)
      df = apply_column_mask(df, inference: inference) unless all_columns
      raise_on_nulls(df) if inference
      df, = processed.split_features_targets(df, true, target) if split_ys
      df
    end
394
+
395
    # Inference guard: raise if the dataframe contains any null values.
    #
    # @param df [Polars::DataFrame]
    # @raise [RuntimeError] listing the columns that contain nulls
    def raise_on_nulls(df)
      desc_df = df.describe

      # Get the 'null_count' row
      null_count_row = desc_df.filter(desc_df[:describe] == "null_count")

      # Select columns with non-zero null counts
      columns_with_nulls = null_count_row.columns.select do |col|
        null_count_row[col][0].to_i > 0
      end

      if columns_with_nulls.any?
        raise "Null values found in columns: #{columns_with_nulls.join(", ")}"
      end
    end
410
+
411
+ # Filter data using Polars predicates:
412
+ # dataset.data(filter: Polars.col("CREATED_DATE") > EST.now - 2.days)
413
+ # dataset.data(limit: 10)
414
+ # dataset.data(select: ["column1", "column2", "column3"], limit: 10)
415
+ # dataset.data(split_ys: true)
416
+ # dataset.data(all_columns: true) # Include all columns, even ones we SHOULDN'T train on (e.g. drop_cols). Be very careful! This is for data analysis purposes ONLY!
417
+ #
418
    # Load the train segment; accepts the same options as #data.
    def train(**kwargs, &block)
      load_data(:train, **kwargs, &block)
    end
421
+
422
    # Load the validation segment; accepts the same options as #data.
    def valid(**kwargs, &block)
      load_data(:valid, **kwargs, &block)
    end
425
+
426
    # Load the test segment; accepts the same options as #data.
    def test(**kwargs, &block)
      load_data(:test, **kwargs, &block)
    end
429
+
430
    # Load all segments combined (see the filter examples above).
    def data(**kwargs, &block)
      load_data(:all, **kwargs, &block)
    end
433
+
434
+ alias_method :query, :data
435
+
436
+ def cleanup
437
+ raw.cleanup
438
+ processed.cleanup
439
+ end
440
+
441
    # Per-column null percentages for each split segment.
    #
    # @param data_type [Symbol] :processed (default) or :raw
    # @return [Hash{String => Hash{Symbol => Float}}, nil] e.g.
    #   { "col" => { train: 1.2, valid: 0.0, test: 0.4 } }; columns that are
    #   null-free in every segment are removed, and nil is returned when no
    #   column has any nulls.
    def check_nulls(data_type = :processed)
      result = SPLIT_ORDER.each_with_object({}) do |segment, acc|
        segment_result = { nulls: {}, total: 0 }

        data_source = data_type == :raw ? raw : processed
        # read streams the segment in (possibly several) dataframes, so
        # counts are accumulated per column across chunks.
        data_source.read(segment) do |df|
          df_nulls = null_check(df)
          df.columns.each do |column|
            segment_result[:nulls][column] ||= { null_count: 0, total_count: 0 }
            segment_result[:nulls][column][:null_count] += df_nulls[column][:null_count] if df_nulls && df_nulls[column]
            segment_result[:nulls][column][:total_count] += df.height
          end
        end

        segment_result[:nulls].each do |column, counts|
          percentage = (counts[:null_count].to_f / counts[:total_count] * 100).round(1)
          acc[column] ||= {}
          acc[column][segment] = percentage
        end
      end

      # Remove columns that have no nulls across all segments
      result.reject! { |_, v| v.values.all?(&:zero?) }

      result.empty? ? nil : result
    end
467
+
468
    # A dataset counts as processed once no further split is required.
    def processed?
      !should_split?
    end
471
+
472
+ def decode_labels(ys, col: nil)
473
+ preprocessor.decode_labels(ys, col: col.nil? ? target : col)
474
+ end
475
+
476
    # Memoized preprocessing configuration keyed by phase.
    #
    # @return [Hash{Symbol => Hash}, nil] { training: ..., inference: ... }
    #   with deep-symbolized keys; nil when no columns are configured yet.
    def preprocessing_steps
      return if columns.nil? || (columns.respond_to?(:empty?) && columns.empty?)
      return @preprocessing_steps if @preprocessing_steps.present?

      training = standardize_preprocessing_steps(:training)
      inference = standardize_preprocessing_steps(:inference)

      @preprocessing_steps = {
        training: training,
        inference: inference,
      }.compact.deep_symbolize_keys
    end
488
+
489
    # Memoized preprocessor, rebuilt whenever the configured preprocessing
    # steps have changed since it was constructed.
    def preprocessor
      @preprocessor ||= initialize_preprocessor
      return @preprocessor if @preprocessor.preprocessing_steps == preprocessing_steps

      @preprocessor = initialize_preprocessor
    end
495
+
496
+ def target
497
+ @target ||= preloaded_columns.find(&:is_target)&.name
498
+ end
499
+
500
+ def drop_cols
501
+ @drop_cols ||= preloaded_columns.select(&:hidden).map(&:name)
502
+ end
503
+
504
+ def drop_if_null
505
+ @drop_if_null ||= preloaded_columns.select(&:drop_if_null).map(&:name)
506
+ end
507
+
508
    # Canonical training column order: visible columns sorted by id, with
    # one-hot columns expanded into "<name>_<category>" entries.
    #
    # @param inference [Boolean] when true, exclude the target column
    # @return [Array<String>]
    def col_order(inference: false)
      # Filter preloaded columns in memory
      scope = preloaded_columns.reject(&:hidden)
      scope = scope.reject(&:is_target) if inference

      # Get one_hot columns for category mapping
      one_hots = scope.select(&:one_hot?)
      one_hot_cats = columns.allowed_categories.symbolize_keys

      # Map columns to names, handling one_hot expansion
      scope.sort_by(&:id).flat_map do |col|
        if col.one_hot?
          one_hot_cats[col.name.to_sym].map do |cat|
            "#{col.name}_#{cat}"
          end
        else
          col.name
        end
      end
    end
528
+
529
+ def column_mask(df, inference: false)
530
+ cols = df.columns & col_order(inference: inference)
531
+ cols.sort_by { |col| col_order.index(col) }
532
+ end
533
+
534
+ def apply_column_mask(df, inference: false)
535
+ df[column_mask(df, inference: inference)]
536
+ end
537
+
538
+ def apply_missing_features(df, inference: false)
539
+ return df unless inference
540
+
541
+ missing_features = (col_order(inference: inference) - df.columns).compact
542
+ df.with_columns(missing_features.map { |f| Polars.lit(nil).alias(f) })
543
+ end
544
+
545
+ def drop_columns(all_columns: false)
546
+ if all_columns
547
+ []
548
+ else
549
+ drop_cols
550
+ end
551
+ end
552
+
553
+ def files
554
+ [raw, processed].flat_map(&:files)
555
+ end
556
+
557
    # Ensure local data is present by pulling remote files if needed.
    def load_dataset
      download_remote_files
    end
560
+
561
    # Upload processed split files and feature artifacts to remote
    # storage, then persist bookkeeping. No-op until processing is done.
    def upload_remote_files
      return unless processed?

      processed.upload.tap do
        features.each(&:upload_remote_files)
        features.each(&:save)
        save
      end
    end
570
+
571
    # Reload the record and invalidate memoized column/feature caches so
    # they are refetched lazily on next access.
    def reload(*args)
      # Call the original ActiveRecord reload
      super(*args)
      # Reset preloaded instance variables
      @preloaded_columns = nil
      @preloaded_features = nil
      self
    end
579
+
580
+ private
581
+
582
    # Memoized, eagerly-loaded features (with dataset preloaded).
    def preloaded_features
      @preloaded_features ||= features.includes(:dataset).load
    end
585
+
586
    # Memoized, eagerly-loaded columns relation.
    def preloaded_columns
      @preloaded_columns ||= columns.load
    end
589
+
590
    # after_find hook: pull processed split files from remote storage.
    # Only historical (snapshot) datasets need this, and only when no
    # local processed data exists yet.
    def download_remote_files
      return unless is_history_class? # Only historical datasets need this
      return if processed.present? && processed.data

      processed.download
    end
596
+
597
    # Drop cached split handles and rebuild them (raw/processed memoize
    # on first access).
    def initialize_splits
      @raw = nil
      @processed = nil
      raw
      processed
    end
603
+
604
    # Build the split backend for "raw" or "processed": in-memory
    # datasources get InMemorySplit, file-backed ones get FileSplit rooted
    # under this dataset's directory.
    #
    # @param type [String] "raw" or "processed" — used for the on-disk path
    # @return [Object, nil] split instance, or nil when no datasource
    def initialize_split(type)
      return unless datasource.present?

      args = { dataset: self, datasource: datasource }
      case split_type.to_s
      when EasyML::Data::Splits::InMemorySplit.to_s
        split_type.new(**args)
      when EasyML::Data::Splits::FileSplit.to_s
        split_type.new(**args.merge(
          dir: Pathname.new(root_dir).append("files/splits/#{type}").to_s,
        ))
      end
    end
617
+
618
# Split implementation matching the datasource: in-memory datasources use
# InMemorySplit, everything else persists to files.
def split_type
  if datasource.in_memory?
    EasyML::Data::Splits::InMemorySplit
  else
    EasyML::Data::Splits::FileSplit
  end
end
621
+
622
# Refresh the backing datasource (non-forced variant), then re-derive
# column datatypes and rebuild the raw/processed split objects.
def refresh_datasource
  datasource.reload.refresh
  refresh_datatypes
  initialize_splits
end
627
+
628
# Forced variant of #refresh_datasource: calls the datasource's refresh!
# before re-deriving datatypes and rebuilding the splits.
def refresh_datasource!
  datasource.reload.refresh!
  refresh_datatypes
  initialize_splits
end
633
+
634
# Rebuild the processed split from scratch: clear it, then normalize each
# raw segment (in SPLIT_ORDER) and persist the result. Marks the dataset
# as normalized when done.
def normalize_all
  processed.cleanup

  SPLIT_ORDER.each.with_index do |segment, position|
    raw_df = raw.read(segment)
    normalized_df = normalize(raw_df, all_columns: true, idx: position)
    processed.save(segment, normalized_df)
  end
  @normalized = true
end
644
+
645
# Drop rows with nulls in any configured drop_if_null column that actually
# exists in +df+. Returns +df+ unchanged when nothing applies.
def drop_nulls(df)
  return df if drop_if_null.nil? || drop_if_null.empty?

  applicable_cols = df.columns & drop_if_null
  applicable_cols.empty? ? df : df.drop_nulls(subset: applicable_cols)
end
653
+
654
# Load a segment's data from the processed split when processing has
# completed, otherwise from the raw split. Extra kwargs and any block are
# forwarded untouched.
def load_data(segment, **kwargs, &block)
  source = processed? ? processed : raw
  source.load_data(segment, **kwargs, &block)
end
661
+
662
# Fit the preprocessor. When no frame is supplied, learn from the raw
# training split (all columns), then record the learned statistics on the
# model.
def fit(xs = nil)
  training_df = xs.nil? ? raw.train(all_columns: true) : xs

  preprocessor.fit(training_df)
  self.preprocessor_statistics = preprocessor.statistics
end
668
+
669
+ # log_method :fit, "Learning statistics", verbose: true
670
+
671
# Force a re-split regardless of whether the dataset needs a refresh.
def split_data!
  split_data(force: true)
end
674
+
675
# Split the datasource into train/valid/test segments and persist each to
# the raw split. Skipped unless forced or #should_split? says a refresh is
# due.
#
# @param force [Boolean] split even when not needed
def split_data(force: false)
  return unless force || should_split?

  cleanup
  # NOTE(review): the original assigned `features = self.features.ordered.load`
  # here, but the local was never read, and loading a scoped relation does not
  # cache onto the association — the dead query has been removed.
  splitter.split(datasource) do |train_df, valid_df, test_df|
    [:train, :valid, :test].zip([train_df, valid_df, test_df]).each do |segment, df|
      raw.save(segment, df)
    end
  end
end
686
+
687
# A (re)split is warranted whenever the dataset needs a refresh.
def should_split?
  needs_refresh?
end
690
+
691
# Scan incoming nested features_attributes and mark any brand-new feature
# whose name collides with an already-persisted feature for destruction, so
# accepts_nested_attributes_for will not create a duplicate.
def filter_duplicate_features
  feature_attrs = attributes["features_attributes"]
  return if feature_attrs.blank?

  existing_names = features.pluck(:name)
  feature_attrs.each_value do |attrs|
    # Leave persisted records and rows already marked for destruction alone
    next if attrs["id"].present? || attrs["_destroy"] == "1"

    # Destroy-flag the row when it would duplicate an existing feature
    attrs["_destroy"] = "1" if existing_names.include?(attrs["name"])
  end
end
703
+
704
# Run every feature transform over +df+ in order and return the transformed
# frame. Accepts either an ActiveRecord relation (normal path) or a plain
# Array of features (test path). Raises if any transform returns something
# other than a Polars::DataFrame.
def apply_features(df, features = self.features)
  if features.nil? || features.empty?
    df
  else
    # Eager load all features with their necessary associations in one query
    if features.is_a?(Array) # Used for testing (feature.transform_batch)
      features_to_apply = features
    else
      features_to_apply = features.ordered.includes(dataset: :datasource).to_a
    end

    # Preload all feature SHAs in one batch so transforms don't each query
    feature_classes = features_to_apply.map(&:feature_class).uniq
    shas = feature_classes.map { |klass| [klass, Feature.compute_sha(klass)] }.to_h

    # Apply features in sequence with preloaded data
    features_to_apply.reduce(df) do |acc_df, feature|
      # Inject the precomputed SHA so the feature doesn't recompute/query it
      feature.instance_variable_set(:@current_sha, shas[feature.feature_class])

      result = feature.transform_batch(acc_df)

      # Each transform must hand a DataFrame to the next feature in the chain
      unless result.is_a?(Polars::DataFrame)
        raise "Feature '#{feature.name}' must return a Polars::DataFrame, got #{result.class}"
      end

      result
    end
  end
end
734
+
735
# Map column name => preprocessing step of the given +type+, dropping
# columns with no step and steps whose method is "none".
def standardize_preprocessing_steps(type)
  steps_by_column = columns.each_with_object({}) do |col, acc|
    acc[col.name] = col.preprocessing_steps&.dig(type)
  end
  steps_by_column.compact.reject { |_name, step| step["method"] == "none" }
end
740
+
741
# Construct a Preprocessor rooted under this dataset's directory, seeded
# with the configured steps and any previously-learned statistics.
def initialize_preprocessor
  preprocessor = EasyML::Data::Preprocessor.new(
    directory: Pathname.new(root_dir).append("preprocessor"),
    preprocessing_steps: preprocessing_steps,
  )
  preprocessor.statistics = preprocessor_statistics
  preprocessor
end
749
+
750
# Hard reload: strip every instance variable set since instantiation —
# except in-memory split objects, which cannot be reconstructed from disk —
# then reload the record from the database. No-op for unpersisted records.
def fully_reload
  return unless persisted?

  # Ivars present on a fresh instance are the baseline; anything beyond
  # that is memoized/derived state to discard.
  base_vars = self.class.new.instance_variables
  dirty_vars = (instance_variables - base_vars)
  in_memory_classes = [EasyML::Data::Splits::InMemorySplit]
  dirty_vars.each do |ivar|
    value = instance_variable_get(ivar)
    remove_instance_variable(ivar) unless in_memory_classes.any? { |in_memory_class| value.is_a?(in_memory_class) }
  end
  reload
end
762
+
763
# Normalize +name+ into a snake_case identifier: every run of whitespace
# becomes a single underscore, then the result is downcased. A single-pass
# gsub(/\s+/, "_") is equivalent to the previous two-pass version
# (collapse runs to one space, then replace each space).
def underscored_name
  name.gsub(/\s+/, "_").downcase
end
766
+ end
767
+ end