easy_ml 0.1.4 → 0.2.0.pre.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (239) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +234 -26
  3. data/Rakefile +45 -0
  4. data/app/controllers/easy_ml/application_controller.rb +67 -0
  5. data/app/controllers/easy_ml/columns_controller.rb +38 -0
  6. data/app/controllers/easy_ml/datasets_controller.rb +156 -0
  7. data/app/controllers/easy_ml/datasources_controller.rb +88 -0
  8. data/app/controllers/easy_ml/deploys_controller.rb +20 -0
  9. data/app/controllers/easy_ml/models_controller.rb +151 -0
  10. data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
  11. data/app/controllers/easy_ml/settings_controller.rb +59 -0
  12. data/app/frontend/components/AlertProvider.tsx +108 -0
  13. data/app/frontend/components/DatasetPreview.tsx +161 -0
  14. data/app/frontend/components/EmptyState.tsx +28 -0
  15. data/app/frontend/components/ModelCard.tsx +255 -0
  16. data/app/frontend/components/ModelDetails.tsx +334 -0
  17. data/app/frontend/components/ModelForm.tsx +384 -0
  18. data/app/frontend/components/Navigation.tsx +300 -0
  19. data/app/frontend/components/Pagination.tsx +72 -0
  20. data/app/frontend/components/Popover.tsx +55 -0
  21. data/app/frontend/components/PredictionStream.tsx +105 -0
  22. data/app/frontend/components/ScheduleModal.tsx +726 -0
  23. data/app/frontend/components/SearchInput.tsx +23 -0
  24. data/app/frontend/components/SearchableSelect.tsx +132 -0
  25. data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
  26. data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
  27. data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
  28. data/app/frontend/components/dataset/ColumnList.tsx +101 -0
  29. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
  30. data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
  31. data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
  32. data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
  33. data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
  34. data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
  35. data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
  36. data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
  37. data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
  38. data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
  39. data/app/frontend/components/dataset/splitters/constants.ts +77 -0
  40. data/app/frontend/components/dataset/splitters/types.ts +168 -0
  41. data/app/frontend/components/dataset/splitters/utils.ts +53 -0
  42. data/app/frontend/components/features/CodeEditor.tsx +46 -0
  43. data/app/frontend/components/features/DataPreview.tsx +150 -0
  44. data/app/frontend/components/features/FeatureCard.tsx +88 -0
  45. data/app/frontend/components/features/FeatureForm.tsx +235 -0
  46. data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
  47. data/app/frontend/components/settings/PluginSettings.tsx +81 -0
  48. data/app/frontend/components/ui/badge.tsx +44 -0
  49. data/app/frontend/components/ui/collapsible.tsx +9 -0
  50. data/app/frontend/components/ui/scroll-area.tsx +46 -0
  51. data/app/frontend/components/ui/separator.tsx +29 -0
  52. data/app/frontend/entrypoints/App.tsx +40 -0
  53. data/app/frontend/entrypoints/Application.tsx +24 -0
  54. data/app/frontend/hooks/useAutosave.ts +61 -0
  55. data/app/frontend/layouts/Layout.tsx +38 -0
  56. data/app/frontend/lib/utils.ts +6 -0
  57. data/app/frontend/mockData.ts +272 -0
  58. data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
  59. data/app/frontend/pages/DatasetsPage.tsx +261 -0
  60. data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
  61. data/app/frontend/pages/DatasourcesPage.tsx +261 -0
  62. data/app/frontend/pages/EditModelPage.tsx +45 -0
  63. data/app/frontend/pages/EditTransformationPage.tsx +56 -0
  64. data/app/frontend/pages/ModelsPage.tsx +115 -0
  65. data/app/frontend/pages/NewDatasetPage.tsx +366 -0
  66. data/app/frontend/pages/NewModelPage.tsx +45 -0
  67. data/app/frontend/pages/NewTransformationPage.tsx +43 -0
  68. data/app/frontend/pages/SettingsPage.tsx +272 -0
  69. data/app/frontend/pages/ShowModelPage.tsx +30 -0
  70. data/app/frontend/pages/TransformationsPage.tsx +95 -0
  71. data/app/frontend/styles/application.css +100 -0
  72. data/app/frontend/types/dataset.ts +146 -0
  73. data/app/frontend/types/datasource.ts +33 -0
  74. data/app/frontend/types/preprocessing.ts +1 -0
  75. data/app/frontend/types.ts +113 -0
  76. data/app/helpers/easy_ml/application_helper.rb +10 -0
  77. data/app/jobs/easy_ml/application_job.rb +21 -0
  78. data/app/jobs/easy_ml/batch_job.rb +46 -0
  79. data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
  80. data/app/jobs/easy_ml/deploy_job.rb +13 -0
  81. data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
  82. data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
  83. data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
  84. data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
  85. data/app/jobs/easy_ml/training_job.rb +62 -0
  86. data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
  87. data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
  88. data/app/models/easy_ml/cleaner.rb +82 -0
  89. data/app/models/easy_ml/column.rb +124 -0
  90. data/app/models/easy_ml/column_history.rb +30 -0
  91. data/app/models/easy_ml/column_list.rb +122 -0
  92. data/app/models/easy_ml/concerns/configurable.rb +61 -0
  93. data/app/models/easy_ml/concerns/versionable.rb +19 -0
  94. data/app/models/easy_ml/dataset.rb +767 -0
  95. data/app/models/easy_ml/dataset_history.rb +56 -0
  96. data/app/models/easy_ml/datasource.rb +182 -0
  97. data/app/models/easy_ml/datasource_history.rb +24 -0
  98. data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
  99. data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
  100. data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
  101. data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
  102. data/app/models/easy_ml/deploy.rb +114 -0
  103. data/app/models/easy_ml/event.rb +79 -0
  104. data/app/models/easy_ml/feature.rb +437 -0
  105. data/app/models/easy_ml/feature_history.rb +38 -0
  106. data/app/models/easy_ml/model.rb +575 -41
  107. data/app/models/easy_ml/model_file.rb +133 -0
  108. data/app/models/easy_ml/model_file_history.rb +24 -0
  109. data/app/models/easy_ml/model_history.rb +51 -0
  110. data/app/models/easy_ml/models/base_model.rb +58 -0
  111. data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
  112. data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
  113. data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
  114. data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
  115. data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
  116. data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
  117. data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
  118. data/app/models/easy_ml/models/xgboost.rb +544 -5
  119. data/app/models/easy_ml/prediction.rb +44 -0
  120. data/app/models/easy_ml/retraining_job.rb +278 -0
  121. data/app/models/easy_ml/retraining_run.rb +184 -0
  122. data/app/models/easy_ml/settings.rb +37 -0
  123. data/app/models/easy_ml/splitter.rb +90 -0
  124. data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
  125. data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
  126. data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
  127. data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
  128. data/app/models/easy_ml/tuner_job.rb +56 -0
  129. data/app/models/easy_ml/tuner_run.rb +31 -0
  130. data/app/models/splitter_history.rb +6 -0
  131. data/app/serializers/easy_ml/column_serializer.rb +27 -0
  132. data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
  133. data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
  134. data/app/serializers/easy_ml/feature_serializer.rb +27 -0
  135. data/app/serializers/easy_ml/model_serializer.rb +90 -0
  136. data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
  137. data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
  138. data/app/serializers/easy_ml/settings_serializer.rb +9 -0
  139. data/app/views/layouts/easy_ml/application.html.erb +15 -0
  140. data/config/initializers/resque.rb +3 -0
  141. data/config/resque-pool.yml +6 -0
  142. data/config/routes.rb +39 -0
  143. data/config/spring.rb +1 -0
  144. data/config/vite.json +15 -0
  145. data/lib/easy_ml/configuration.rb +64 -0
  146. data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
  147. data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
  148. data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
  149. data/lib/easy_ml/core/model_evaluator.rb +161 -89
  150. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
  151. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
  152. data/lib/easy_ml/core/tuner.rb +123 -62
  153. data/lib/easy_ml/core.rb +0 -3
  154. data/lib/easy_ml/core_ext/hash.rb +24 -0
  155. data/lib/easy_ml/core_ext/pathname.rb +11 -5
  156. data/lib/easy_ml/data/date_converter.rb +90 -0
  157. data/lib/easy_ml/data/filter_extensions.rb +31 -0
  158. data/lib/easy_ml/data/polars_column.rb +126 -0
  159. data/lib/easy_ml/data/polars_reader.rb +297 -0
  160. data/lib/easy_ml/data/preprocessor.rb +280 -142
  161. data/lib/easy_ml/data/simple_imputer.rb +255 -0
  162. data/lib/easy_ml/data/splits/file_split.rb +252 -0
  163. data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
  164. data/lib/easy_ml/data/splits/split.rb +95 -0
  165. data/lib/easy_ml/data/splits.rb +9 -0
  166. data/lib/easy_ml/data/statistics_learner.rb +93 -0
  167. data/lib/easy_ml/data/synced_directory.rb +341 -0
  168. data/lib/easy_ml/data.rb +6 -2
  169. data/lib/easy_ml/engine.rb +105 -6
  170. data/lib/easy_ml/feature_store.rb +227 -0
  171. data/lib/easy_ml/features.rb +61 -0
  172. data/lib/easy_ml/initializers/inflections.rb +17 -3
  173. data/lib/easy_ml/logging.rb +2 -2
  174. data/lib/easy_ml/predict.rb +74 -0
  175. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
  176. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
  177. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
  178. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
  179. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
  180. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
  181. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
  182. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
  183. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
  184. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
  185. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
  186. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
  187. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
  188. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
  189. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
  190. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
  191. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
  192. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
  193. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
  194. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
  195. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
  196. data/lib/easy_ml/support/est.rb +5 -1
  197. data/lib/easy_ml/support/file_rotate.rb +79 -15
  198. data/lib/easy_ml/support/file_support.rb +9 -0
  199. data/lib/easy_ml/support/local_file.rb +24 -0
  200. data/lib/easy_ml/support/lockable.rb +62 -0
  201. data/lib/easy_ml/support/synced_file.rb +103 -0
  202. data/lib/easy_ml/support/utc.rb +5 -1
  203. data/lib/easy_ml/support.rb +6 -3
  204. data/lib/easy_ml/version.rb +4 -1
  205. data/lib/easy_ml.rb +7 -2
  206. metadata +355 -72
  207. data/app/models/easy_ml/models.rb +0 -5
  208. data/lib/easy_ml/core/model.rb +0 -30
  209. data/lib/easy_ml/core/model_core.rb +0 -181
  210. data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
  211. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
  212. data/lib/easy_ml/core/models/xgboost.rb +0 -10
  213. data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
  214. data/lib/easy_ml/core/models.rb +0 -10
  215. data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
  216. data/lib/easy_ml/core/uploaders.rb +0 -7
  217. data/lib/easy_ml/data/dataloader.rb +0 -6
  218. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
  219. data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
  220. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
  221. data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
  222. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
  223. data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
  224. data/lib/easy_ml/data/dataset/splits.rb +0 -11
  225. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
  226. data/lib/easy_ml/data/dataset/splitters.rb +0 -9
  227. data/lib/easy_ml/data/dataset.rb +0 -430
  228. data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
  229. data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
  230. data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
  231. data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
  232. data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
  233. data/lib/easy_ml/data/datasource.rb +0 -33
  234. data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
  235. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
  236. data/lib/easy_ml/deployment.rb +0 -5
  237. data/lib/easy_ml/support/synced_directory.rb +0 -134
  238. data/lib/easy_ml/transforms.rb +0 -29
  239. /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -0,0 +1,767 @@
1
+ # == Schema Information
2
+ #
3
+ # Table name: easy_ml_datasets
4
+ #
5
+ # id :bigint not null, primary key
6
+ # name :string not null
7
+ # description :string
8
+ # dataset_type :string
9
+ # status :string
10
+ # version :string
11
+ # datasource_id :bigint
12
+ # root_dir :string
13
+ # configuration :json
14
+ # num_rows :bigint
15
+ # workflow_status :string
16
+ # statistics :json
17
+ # preprocessor_statistics :json
18
+ # schema :json
19
+ # refreshed_at :datetime
20
+ # created_at :datetime not null
21
+ # updated_at :datetime not null
22
+ #
23
+ module EasyML
24
+ class Dataset < ActiveRecord::Base
25
+ self.table_name = "easy_ml_datasets"
26
+ include EasyML::Concerns::Configurable
27
+ include EasyML::Concerns::Versionable
28
+ include Historiographer::Silent
29
+ historiographer_mode :snapshot_only
30
+
31
+ enum workflow_status: {
32
+ analyzing: "analyzing",
33
+ ready: "ready",
34
+ failed: "failed",
35
+ }
36
+
37
+ SPLIT_ORDER = %i[train valid test]
38
+
39
+ self.filter_attributes += %i[configuration statistics schema]
40
+
41
+ validates :name, presence: true
42
+ belongs_to :datasource, class_name: "EasyML::Datasource"
43
+
44
+ has_many :models, class_name: "EasyML::Model"
45
+ has_many :columns, class_name: "EasyML::Column", dependent: :destroy, extend: EasyML::ColumnList
46
+ accepts_nested_attributes_for :columns, allow_destroy: true, update_only: true
47
+
48
+ has_many :features, dependent: :destroy, class_name: "EasyML::Feature"
49
+ accepts_nested_attributes_for :features, allow_destroy: true
50
+
51
+ has_many :events, as: :eventable, class_name: "EasyML::Event", dependent: :destroy
52
+
53
+ before_destroy :destructively_cleanup!
54
+
55
+ delegate :new_data_available?, :synced?, :stale?, to: :datasource
56
+ delegate :train, :test, :valid, to: :split
57
+ delegate :splits, to: :splitter
58
+
59
+ has_one :splitter, class_name: "EasyML::Splitter", dependent: :destroy, inverse_of: :dataset
60
+
61
+ accepts_nested_attributes_for :splitter,
62
+ allow_destroy: true,
63
+ reject_if: :all_blank
64
+
65
+ validates :datasource, presence: true
66
+
67
+ add_configuration_attributes :remote_files
68
+
69
+ after_find :download_remote_files
70
+ after_create :refresh_async
71
+ after_initialize do
72
+ bump_version unless version.present?
73
+ write_attribute(:workflow_status, :ready) if workflow_status.nil?
74
+ end
75
+ before_save :set_root_dir
76
+ before_validation :filter_duplicate_features
77
+
78
+ def self.constants
79
+ {
80
+ column_types: EasyML::Data::PolarsColumn::TYPE_MAP.keys.map do |type|
81
+ { value: type.to_s, label: type.to_s.titleize }
82
+ end,
83
+ preprocessing_strategies: EasyML::Data::Preprocessor.constants[:preprocessing_strategies],
84
+ feature_options: EasyML::Features::Registry.list_flat,
85
+ splitter_constants: EasyML::Splitter.constants,
86
+ }
87
+ end
88
+
89
+ def root_dir=(value)
90
+ raise "Cannot override value of root_dir!" unless value.to_s == root_dir.to_s
91
+
92
+ write_attribute(:root_dir, value)
93
+ end
94
+
95
+ def set_root_dir
96
+ write_attribute(:root_dir, root_dir)
97
+ end
98
+
99
+ def root_dir
100
+ bump_version
101
+ EasyML::Engine.root_dir.join("datasets").join(underscored_name).join(version).to_s
102
+ end
103
+
104
+ def destructively_cleanup!
105
+ FileUtils.rm_rf(root_dir) if root_dir.present?
106
+ end
107
+
108
+ def schema
109
+ read_attribute(:schema) || datasource.schema
110
+ end
111
+
112
+ def processed_schema
113
+ processed.data(limit: 1)&.schema || raw.data(limit: 1)&.schema
114
+ end
115
+
116
+ def refresh_datatypes
117
+ return unless columns_need_refresh?
118
+
119
+ cleanup
120
+ datasource.reread(columns)
121
+ end
122
+
123
+ def num_rows
124
+ if datasource&.num_rows.nil?
125
+ datasource.after_sync
126
+ end
127
+ datasource&.num_rows
128
+ end
129
+
130
+ def refresh_async
131
+ return if analyzing?
132
+
133
+ update(workflow_status: "analyzing")
134
+ EasyML::RefreshDatasetJob.perform_later(id)
135
+ end
136
+
137
+ def raw
138
+ return @raw if @raw && @raw.dataset
139
+
140
+ @raw = initialize_split("raw")
141
+ end
142
+
143
+ def processed
144
+ return @processed if @processed && @processed.dataset
145
+
146
+ @processed = initialize_split("processed")
147
+ end
148
+
149
+ def bump_versions(version)
150
+ self.version = version
151
+
152
+ @raw = raw.cp(version)
153
+ @processed = processed.cp(version)
154
+ features.each(&:bump_version)
155
+
156
+ save
157
+ end
158
+
159
+ def prepare!
160
+ cleanup
161
+ refresh_datasource!
162
+ split_data
163
+ end
164
+
165
+ def prepare
166
+ refresh_datasource
167
+ split_data
168
+ end
169
+
170
+ def actually_refresh
171
+ refreshing do
172
+ split_data
173
+ process_data
174
+ fully_reload
175
+ learn
176
+ now = UTC.now
177
+ update(workflow_status: "ready", refreshed_at: now, updated_at: now)
178
+ fully_reload
179
+ end
180
+ end
181
+
182
+ def refresh!(async: false)
183
+ refreshing do
184
+ prepare!
185
+ fit_features!(async: async)
186
+ end
187
+ after_fit_features unless async
188
+ end
189
+
190
+ def refresh(async: false)
191
+ return refresh_async if async
192
+
193
+ refreshing do
194
+ prepare
195
+ fit_features(async: async)
196
+ end
197
+ after_fit_features unless async
198
+ end
199
+
200
+ def fit_features!(async: false, features: self.features)
201
+ fit_features(async: async, features: features, force: true)
202
+ end
203
+
204
+ def fit_features(async: false, features: self.features, force: false)
205
+ features_to_compute = force ? features : features.needs_fit
206
+ return if features_to_compute.empty?
207
+
208
+ features.first.fit(features: features_to_compute, async: async)
209
+ end
210
+
211
+ def after_fit_features
212
+ features.update_all(needs_fit: false, fit_at: Time.current)
213
+ unlock!
214
+ actually_refresh
215
+ end
216
+
217
+ def columns_need_refresh
218
+ preloaded_columns.select do |col|
219
+ col.updated_at.present? &&
220
+ refreshed_at.present? &&
221
+ col.updated_at > refreshed_at
222
+ end
223
+ end
224
+
225
+ def columns_need_refresh?
226
+ columns_need_refresh.any?
227
+ end
228
+
229
+ def features_need_fit
230
+ preloaded_features.select do |f|
231
+ (f.updated_at.present? && refreshed_at.present? && f.updated_at > refreshed_at) ||
232
+ f.needs_fit?
233
+ end
234
+ end
235
+
236
+ def features_need_fit?
237
+ features_need_fit.any?
238
+ end
239
+
240
+ def refresh_reasons
241
+ {
242
+ "Not split" => not_split?,
243
+ "Refreshed at is nil" => refreshed_at.nil?,
244
+ "Columns need refresh" => columns_need_refresh?,
245
+ "Features need refresh" => features_need_fit?,
246
+ "Datasource needs refresh" => datasource_needs_refresh?,
247
+ "Datasource was refreshed" => datasource_was_refreshed?,
248
+ }.select { |k, v| v }.map { |k, v| k }
249
+ end
250
+
251
+ def needs_refresh?
252
+ refresh_reasons.any?
253
+ end
254
+
255
+ def not_split?
256
+ processed.split_at.nil? || raw.split_at.nil?
257
+ end
258
+
259
+ def datasource_needs_refresh?
260
+ datasource&.needs_refresh?
261
+ end
262
+
263
+ def datasource_was_refreshed?
264
+ raw.split_at.present? && raw.split_at < datasource.last_updated_at
265
+ end
266
+
267
+ def learn
268
+ learn_schema
269
+ learn_statistics
270
+ columns.sync
271
+ end
272
+
273
+ def refreshing
274
+ return false if is_history_class?
275
+ unlock! unless analyzing?
276
+
277
+ lock_dataset do
278
+ update(workflow_status: "analyzing")
279
+ fully_reload
280
+ yield
281
+ ensure
282
+ unlock!
283
+ end
284
+ rescue => e
285
+ update(workflow_status: "failed")
286
+ e.backtrace.grep(/easy_ml/).each do |line|
287
+ puts line
288
+ end
289
+ raise e
290
+ end
291
+
292
+ def unlock!
293
+ Support::Lockable.unlock!(lock_key)
294
+ end
295
+
296
+ def locked?
297
+ Support::Lockable.locked?(lock_key)
298
+ end
299
+
300
+ def lock_dataset
301
+ data = processed.data(limit: 1).to_a.any? ? processed.data : raw.data
302
+ with_lock do |client|
303
+ yield
304
+ end
305
+ end
306
+
307
+ def with_lock
308
+ EasyML::Support::Lockable.with_lock(lock_key, stale_timeout: 60, resources: 1) do |client|
309
+ yield client
310
+ end
311
+ end
312
+
313
+ def lock_key
314
+ "dataset:#{id}"
315
+ end
316
+
317
+ def learn_schema
318
+ data = processed.data(limit: 1).to_a.any? ? processed.data : raw.data
319
+ schema = data.schema.reduce({}) do |h, (k, v)|
320
+ h.tap do
321
+ h[k] = EasyML::Data::PolarsColumn.polars_to_sym(v)
322
+ end
323
+ end
324
+ write_attribute(:schema, schema)
325
+ end
326
+
327
+ def learn_statistics
328
+ update(
329
+ statistics: EasyML::Data::StatisticsLearner.learn(raw, processed),
330
+ )
331
+ end
332
+
333
+ def process_data
334
+ split_data
335
+ fit
336
+ normalize_all
337
+ # alert_nulls
338
+ end
339
+
340
+ def needs_learn?(df)
341
+ return true if columns_need_refresh?
342
+
343
+ never_learned = columns.none?
344
+ return true if never_learned
345
+
346
+ new_features = features.any? { |f| f.updated_at > columns.maximum(:updated_at) }
347
+ return true if new_features
348
+
349
+ new_cols = df.present? ? (df.columns - columns.map(&:name)) : []
350
+ new_cols = columns.syncable
351
+
352
+ return true if new_cols.any?
353
+ end
354
+
355
+ def compare_datasets(df, df_was)
356
+ # Step 1: Check if the entire dataset is identical
357
+ if df == df_was
358
+ return "The datasets are identical."
359
+ end
360
+
361
+ # Step 2: Identify columns with differences
362
+ differing_columns = df.columns.select do |column|
363
+ df[column] != df_was[column]
364
+ end
365
+
366
+ # Step 3: Find row-level differences for each differing column
367
+ differences = {}
368
+ differing_columns.each do |column|
369
+ mask = df[column] != df_was[column]
370
+ differing_rows = df[mask][column].zip(df_was[mask][column]).map.with_index do |(df_value, df_was_value), index|
371
+ { row_index: index, df_value: df_value, df_was_value: df_was_value }
372
+ end
373
+
374
+ differences[column] = differing_rows
375
+ end
376
+
377
+ { differing_columns: differing_columns, differences: differences }
378
+ end
379
+
380
+ def normalize(df = nil, split_ys: false, inference: false, all_columns: false, features: self.features, idx: nil)
381
+ df = apply_features(df, features)
382
+ df = drop_nulls(df)
383
+ df = apply_missing_features(df, inference: inference)
384
+ df = preprocessor.postprocess(df, inference: inference)
385
+
386
+ # Learn will update columns, so if any features have been added
387
+ # since the last time columns were learned, we should re-learn the schema
388
+ learn if idx == 0 && needs_learn?(df)
389
+ df = apply_column_mask(df, inference: inference) unless all_columns
390
+ raise_on_nulls(df) if inference
391
+ df, = processed.split_features_targets(df, true, target) if split_ys
392
+ df
393
+ end
394
+
395
+ def raise_on_nulls(df)
396
+ desc_df = df.describe
397
+
398
+ # Get the 'null_count' row
399
+ null_count_row = desc_df.filter(desc_df[:describe] == "null_count")
400
+
401
+ # Select columns with non-zero null counts
402
+ columns_with_nulls = null_count_row.columns.select do |col|
403
+ null_count_row[col][0].to_i > 0
404
+ end
405
+
406
+ if columns_with_nulls.any?
407
+ raise "Null values found in columns: #{columns_with_nulls.join(", ")}"
408
+ end
409
+ end
410
+
411
+ # Filter data using Polars predicates:
412
+ # dataset.data(filter: Polars.col("CREATED_DATE") > EST.now - 2.days)
413
+ # dataset.data(limit: 10)
414
+ # dataset.data(select: ["column1", "column2", "column3"], limit: 10)
415
+ # dataset.data(split_ys: true)
416
+ # dataset.data(all_columns: true) # Include all columns, even ones we SHOULDN'T train on (e.g. drop_cols). Be very careful! This is for data analysis purposes ONLY!
417
+ #
418
+ def train(**kwargs, &block)
419
+ load_data(:train, **kwargs, &block)
420
+ end
421
+
422
+ def valid(**kwargs, &block)
423
+ load_data(:valid, **kwargs, &block)
424
+ end
425
+
426
+ def test(**kwargs, &block)
427
+ load_data(:test, **kwargs, &block)
428
+ end
429
+
430
+ def data(**kwargs, &block)
431
+ load_data(:all, **kwargs, &block)
432
+ end
433
+
434
+ alias_method :query, :data
435
+
436
+ def cleanup
437
+ raw.cleanup
438
+ processed.cleanup
439
+ end
440
+
441
+ def check_nulls(data_type = :processed)
442
+ result = SPLIT_ORDER.each_with_object({}) do |segment, acc|
443
+ segment_result = { nulls: {}, total: 0 }
444
+
445
+ data_source = data_type == :raw ? raw : processed
446
+ data_source.read(segment) do |df|
447
+ df_nulls = null_check(df)
448
+ df.columns.each do |column|
449
+ segment_result[:nulls][column] ||= { null_count: 0, total_count: 0 }
450
+ segment_result[:nulls][column][:null_count] += df_nulls[column][:null_count] if df_nulls && df_nulls[column]
451
+ segment_result[:nulls][column][:total_count] += df.height
452
+ end
453
+ end
454
+
455
+ segment_result[:nulls].each do |column, counts|
456
+ percentage = (counts[:null_count].to_f / counts[:total_count] * 100).round(1)
457
+ acc[column] ||= {}
458
+ acc[column][segment] = percentage
459
+ end
460
+ end
461
+
462
+ # Remove columns that have no nulls across all segments
463
+ result.reject! { |_, v| v.values.all?(&:zero?) }
464
+
465
+ result.empty? ? nil : result
466
+ end
467
+
468
+ def processed?
469
+ !should_split?
470
+ end
471
+
472
+ def decode_labels(ys, col: nil)
473
+ preprocessor.decode_labels(ys, col: col.nil? ? target : col)
474
+ end
475
+
476
+ def preprocessing_steps
477
+ return if columns.nil? || (columns.respond_to?(:empty?) && columns.empty?)
478
+ return @preprocessing_steps if @preprocessing_steps.present?
479
+
480
+ training = standardize_preprocessing_steps(:training)
481
+ inference = standardize_preprocessing_steps(:inference)
482
+
483
+ @preprocessing_steps = {
484
+ training: training,
485
+ inference: inference,
486
+ }.compact.deep_symbolize_keys
487
+ end
488
+
489
+ def preprocessor
490
+ @preprocessor ||= initialize_preprocessor
491
+ return @preprocessor if @preprocessor.preprocessing_steps == preprocessing_steps
492
+
493
+ @preprocessor = initialize_preprocessor
494
+ end
495
+
496
+ def target
497
+ @target ||= preloaded_columns.find(&:is_target)&.name
498
+ end
499
+
500
+ def drop_cols
501
+ @drop_cols ||= preloaded_columns.select(&:hidden).map(&:name)
502
+ end
503
+
504
+ def drop_if_null
505
+ @drop_if_null ||= preloaded_columns.select(&:drop_if_null).map(&:name)
506
+ end
507
+
508
+ def col_order(inference: false)
509
+ # Filter preloaded columns in memory
510
+ scope = preloaded_columns.reject(&:hidden)
511
+ scope = scope.reject(&:is_target) if inference
512
+
513
+ # Get one_hot columns for category mapping
514
+ one_hots = scope.select(&:one_hot?)
515
+ one_hot_cats = columns.allowed_categories.symbolize_keys
516
+
517
+ # Map columns to names, handling one_hot expansion
518
+ scope.sort_by(&:id).flat_map do |col|
519
+ if col.one_hot?
520
+ one_hot_cats[col.name.to_sym].map do |cat|
521
+ "#{col.name}_#{cat}"
522
+ end
523
+ else
524
+ col.name
525
+ end
526
+ end
527
+ end
528
+
529
+ def column_mask(df, inference: false)
530
+ cols = df.columns & col_order(inference: inference)
531
+ cols.sort_by { |col| col_order.index(col) }
532
+ end
533
+
534
+ def apply_column_mask(df, inference: false)
535
+ df[column_mask(df, inference: inference)]
536
+ end
537
+
538
+ def apply_missing_features(df, inference: false)
539
+ return df unless inference
540
+
541
+ missing_features = (col_order(inference: inference) - df.columns).compact
542
+ df.with_columns(missing_features.map { |f| Polars.lit(nil).alias(f) })
543
+ end
544
+
545
+ def drop_columns(all_columns: false)
546
+ if all_columns
547
+ []
548
+ else
549
+ drop_cols
550
+ end
551
+ end
552
+
553
+ def files
554
+ [raw, processed].flat_map(&:files)
555
+ end
556
+
557
+ def load_dataset
558
+ download_remote_files
559
+ end
560
+
561
+ def upload_remote_files
562
+ return unless processed?
563
+
564
+ processed.upload.tap do
565
+ features.each(&:upload_remote_files)
566
+ features.each(&:save)
567
+ save
568
+ end
569
+ end
570
+
571
+ def reload(*args)
572
+ # Call the original reload method
573
+ super(*args)
574
+ # Reset preloaded instance variables
575
+ @preloaded_columns = nil
576
+ @preloaded_features = nil
577
+ self
578
+ end
579
+
580
+ private
581
+
582
+ def preloaded_features
583
+ @preloaded_features ||= features.includes(:dataset).load
584
+ end
585
+
586
+ def preloaded_columns
587
+ @preloaded_columns ||= columns.load
588
+ end
589
+
590
+ def download_remote_files
591
+ return unless is_history_class? # Only historical datasets need this
592
+ return if processed.present? && processed.data
593
+
594
+ processed.download
595
+ end
596
+
597
+ def initialize_splits
598
+ @raw = nil
599
+ @processed = nil
600
+ raw
601
+ processed
602
+ end
603
+
604
# Builds a split object of the appropriate class for this dataset's
# datasource. File-backed splits additionally get a directory scoped by
# the split type (e.g. "raw" or "processed"). Returns nil when there is
# no datasource or the split class is unrecognized.
def initialize_split(type)
  return unless datasource.present?

  split_args = { dataset: self, datasource: datasource }
  klass = split_type
  if klass.to_s == EasyML::Data::Splits::InMemorySplit.to_s
    klass.new(**split_args)
  elsif klass.to_s == EasyML::Data::Splits::FileSplit.to_s
    split_dir = Pathname.new(root_dir).append("files/splits/#{type}").to_s
    klass.new(**split_args.merge(dir: split_dir))
  end
end
617
+
618
# Chooses the split implementation matching the datasource: in-memory
# datasources use InMemorySplit, everything else is file-backed.
def split_type
  if datasource.in_memory?
    EasyML::Data::Splits::InMemorySplit
  else
    EasyML::Data::Splits::FileSplit
  end
end
621
+
622
# Refreshes the datasource (non-forced), then re-syncs dependent state:
# refresh_datatypes and initialize_splits keep cached schema information
# and split objects consistent with the new data (both defined elsewhere
# in this class).
def refresh_datasource
  datasource.reload.refresh
  refresh_datatypes
  initialize_splits
end
627
+
628
# Force-refreshes the datasource (bang variant of #refresh_datasource),
# then re-syncs dependent state exactly as the non-bang version does.
def refresh_datasource!
  datasource.reload.refresh!
  refresh_datatypes
  initialize_splits
end
633
+
634
# Rebuilds every processed split: clears existing processed files, then
# reads each raw segment in SPLIT_ORDER, normalizes it (all columns kept),
# and writes the result to the processed split. Marks the dataset as
# normalized when done.
def normalize_all
  processed.cleanup

  SPLIT_ORDER.each.with_index do |segment, idx|
    raw_df = raw.read(segment)
    processed.save(segment, normalize(raw_df, all_columns: true, idx: idx))
  end
  @normalized = true
end
644
+
645
# Drops rows that have nulls in any configured drop_if_null column.
# Returns df unchanged when no drop_if_null columns are configured or none
# of them exist in the frame.
def drop_nulls(df)
  return df if drop_if_null.nil? || drop_if_null.empty?

  subset = df.columns & drop_if_null
  subset.empty? ? df : df.drop_nulls(subset: subset)
end
653
+
654
# Reads a segment (:train/:valid/:test) from the processed split when
# processing has completed, otherwise from the raw split. Extra keyword
# arguments and an optional block are forwarded to the split's loader.
def load_data(segment, **kwargs, &block)
  source = processed? ? processed : raw
  source.load_data(segment, **kwargs, &block)
end
661
+
662
# Learns preprocessing statistics. Fits the preprocessor on the given
# frame, defaulting to the raw training split (all columns) when xs is
# nil, then stores the learned statistics on the record.
def fit(xs = nil)
  training_df = xs.nil? ? raw.train(all_columns: true) : xs
  preprocessor.fit(training_df)
  self.preprocessor_statistics = preprocessor.statistics
end
668
+
669
+ # log_method :fit, "Learning statistics", verbose: true
670
+
671
# Re-splits the datasource unconditionally, bypassing the freshness check
# in #split_data.
def split_data!
  split_data(force: true)
end
674
+
675
# Splits the datasource into train/valid/test segments and writes each to
# the raw split. Skipped unless forced or the dataset is stale
# (#should_split?).
#
# Fix: removed `features = self.features.ordered.load`, a local that was
# assigned and never used — the query had no observable effect here.
def split_data(force: false)
  return unless force || should_split?

  cleanup
  splitter.split(datasource) do |train_df, valid_df, test_df|
    [:train, :valid, :test].zip([train_df, valid_df, test_df]).each do |segment, df|
      raw.save(segment, df)
    end
  end
end
686
+
687
# A new split is warranted whenever the dataset is stale; delegates to
# #needs_refresh? (defined elsewhere in this class).
def should_split?
  needs_refresh?
end
690
+
691
# Prevents nested-attribute submissions from creating features whose name
# already exists on this dataset: any brand-new entry (no id, not already
# marked for destruction) that duplicates an existing feature name is
# flagged with _destroy so it is rejected instead of created.
def filter_duplicate_features
  feature_attrs = attributes["features_attributes"]
  return unless feature_attrs.present?

  existing_names = features.pluck(:name)
  feature_attrs.each do |_key, attrs|
    # Persisted records and records already being removed are left alone.
    next if attrs["_destroy"] == "1" || attrs["id"].present?

    attrs["_destroy"] = "1" if existing_names.include?(attrs["name"])
  end
end
703
+
704
# Runs every feature's transform over df in order, returning the final
# frame. Accepts either a plain Array of features (test path) or a
# relation, which is eager-loaded with dataset/datasource to avoid N+1
# queries. Feature SHAs are computed once per feature class up front and
# injected into each feature so transform_batch doesn't re-query.
# Raises if any feature returns something other than a Polars::DataFrame.
def apply_features(df, features = self.features)
  return df if features.nil? || features.empty?

  feature_list =
    if features.is_a?(Array) # Used for testing (feature.transform_batch)
      features
    else
      features.ordered.includes(dataset: :datasource).to_a
    end

  # One SHA computation per distinct feature class.
  sha_by_class = feature_list.map(&:feature_class).uniq
                             .each_with_object({}) { |klass, memo| memo[klass] = Feature.compute_sha(klass) }

  feature_list.reduce(df) do |current_df, feature|
    # Inject the precomputed SHA so the feature doesn't query for it.
    feature.instance_variable_set(:@current_sha, sha_by_class[feature.feature_class])

    transformed = feature.transform_batch(current_df)

    unless transformed.is_a?(Polars::DataFrame)
      raise "Feature '#{feature.name}' must return a Polars::DataFrame, got #{transformed.class}"
    end

    transformed
  end
end
734
+
735
# Builds a { column name => preprocessing step } hash for the given stage
# (e.g. "training"), omitting columns without a configured step and those
# whose method is "none".
def standardize_preprocessing_steps(type)
  steps_by_column = columns.map { |col| [col.name, col.preprocessing_steps&.dig(type)] }.to_h
  steps_by_column.compact.reject { |_name, step| step["method"] == "none" }
end
740
+
741
# Builds a preprocessor rooted under this dataset's directory, configured
# with the dataset's preprocessing steps and seeded with any previously
# learned statistics.
def initialize_preprocessor
  instance = EasyML::Data::Preprocessor.new(
    directory: Pathname.new(root_dir).append("preprocessor"),
    preprocessing_steps: preprocessing_steps,
  )
  instance.statistics = preprocessor_statistics
  instance
end
749
+
750
# Aggressive reload: strips every instance variable that a pristine
# instance wouldn't have (i.e. runtime memoization), then reloads the
# record. In-memory split objects are kept, since their state lives only
# in this process and a reload could not restore it. No-op for unsaved
# records.
def fully_reload
  return unless persisted?

  keep_classes = [EasyML::Data::Splits::InMemorySplit]
  extra_ivars = instance_variables - self.class.new.instance_variables
  extra_ivars.each do |ivar|
    value = instance_variable_get(ivar)
    next if keep_classes.any? { |klass| value.is_a?(klass) }

    remove_instance_variable(ivar)
  end
  reload
end
762
+
763
# Snake_cased form of the dataset name: runs of whitespace collapse to a
# single separator, remaining whitespace becomes underscores, lowercased.
def underscored_name
  collapsed = name.gsub(/\s{2,}/, " ")
  collapsed.gsub(/\s/, "_").downcase
end
766
+ end
767
+ end