easy_ml 0.1.3 → 0.2.0.pre.rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (239) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +234 -26
  3. data/Rakefile +45 -0
  4. data/app/controllers/easy_ml/application_controller.rb +67 -0
  5. data/app/controllers/easy_ml/columns_controller.rb +38 -0
  6. data/app/controllers/easy_ml/datasets_controller.rb +156 -0
  7. data/app/controllers/easy_ml/datasources_controller.rb +88 -0
  8. data/app/controllers/easy_ml/deploys_controller.rb +20 -0
  9. data/app/controllers/easy_ml/models_controller.rb +151 -0
  10. data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
  11. data/app/controllers/easy_ml/settings_controller.rb +59 -0
  12. data/app/frontend/components/AlertProvider.tsx +108 -0
  13. data/app/frontend/components/DatasetPreview.tsx +161 -0
  14. data/app/frontend/components/EmptyState.tsx +28 -0
  15. data/app/frontend/components/ModelCard.tsx +255 -0
  16. data/app/frontend/components/ModelDetails.tsx +334 -0
  17. data/app/frontend/components/ModelForm.tsx +384 -0
  18. data/app/frontend/components/Navigation.tsx +300 -0
  19. data/app/frontend/components/Pagination.tsx +72 -0
  20. data/app/frontend/components/Popover.tsx +55 -0
  21. data/app/frontend/components/PredictionStream.tsx +105 -0
  22. data/app/frontend/components/ScheduleModal.tsx +726 -0
  23. data/app/frontend/components/SearchInput.tsx +23 -0
  24. data/app/frontend/components/SearchableSelect.tsx +132 -0
  25. data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
  26. data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
  27. data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
  28. data/app/frontend/components/dataset/ColumnList.tsx +101 -0
  29. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
  30. data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
  31. data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
  32. data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
  33. data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
  34. data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
  35. data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
  36. data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
  37. data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
  38. data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
  39. data/app/frontend/components/dataset/splitters/constants.ts +77 -0
  40. data/app/frontend/components/dataset/splitters/types.ts +168 -0
  41. data/app/frontend/components/dataset/splitters/utils.ts +53 -0
  42. data/app/frontend/components/features/CodeEditor.tsx +46 -0
  43. data/app/frontend/components/features/DataPreview.tsx +150 -0
  44. data/app/frontend/components/features/FeatureCard.tsx +88 -0
  45. data/app/frontend/components/features/FeatureForm.tsx +235 -0
  46. data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
  47. data/app/frontend/components/settings/PluginSettings.tsx +81 -0
  48. data/app/frontend/components/ui/badge.tsx +44 -0
  49. data/app/frontend/components/ui/collapsible.tsx +9 -0
  50. data/app/frontend/components/ui/scroll-area.tsx +46 -0
  51. data/app/frontend/components/ui/separator.tsx +29 -0
  52. data/app/frontend/entrypoints/App.tsx +40 -0
  53. data/app/frontend/entrypoints/Application.tsx +24 -0
  54. data/app/frontend/hooks/useAutosave.ts +61 -0
  55. data/app/frontend/layouts/Layout.tsx +38 -0
  56. data/app/frontend/lib/utils.ts +6 -0
  57. data/app/frontend/mockData.ts +272 -0
  58. data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
  59. data/app/frontend/pages/DatasetsPage.tsx +261 -0
  60. data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
  61. data/app/frontend/pages/DatasourcesPage.tsx +261 -0
  62. data/app/frontend/pages/EditModelPage.tsx +45 -0
  63. data/app/frontend/pages/EditTransformationPage.tsx +56 -0
  64. data/app/frontend/pages/ModelsPage.tsx +115 -0
  65. data/app/frontend/pages/NewDatasetPage.tsx +366 -0
  66. data/app/frontend/pages/NewModelPage.tsx +45 -0
  67. data/app/frontend/pages/NewTransformationPage.tsx +43 -0
  68. data/app/frontend/pages/SettingsPage.tsx +272 -0
  69. data/app/frontend/pages/ShowModelPage.tsx +30 -0
  70. data/app/frontend/pages/TransformationsPage.tsx +95 -0
  71. data/app/frontend/styles/application.css +100 -0
  72. data/app/frontend/types/dataset.ts +146 -0
  73. data/app/frontend/types/datasource.ts +33 -0
  74. data/app/frontend/types/preprocessing.ts +1 -0
  75. data/app/frontend/types.ts +113 -0
  76. data/app/helpers/easy_ml/application_helper.rb +10 -0
  77. data/app/jobs/easy_ml/application_job.rb +21 -0
  78. data/app/jobs/easy_ml/batch_job.rb +46 -0
  79. data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
  80. data/app/jobs/easy_ml/deploy_job.rb +13 -0
  81. data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
  82. data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
  83. data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
  84. data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
  85. data/app/jobs/easy_ml/training_job.rb +62 -0
  86. data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
  87. data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
  88. data/app/models/easy_ml/cleaner.rb +82 -0
  89. data/app/models/easy_ml/column.rb +124 -0
  90. data/app/models/easy_ml/column_history.rb +30 -0
  91. data/app/models/easy_ml/column_list.rb +122 -0
  92. data/app/models/easy_ml/concerns/configurable.rb +61 -0
  93. data/app/models/easy_ml/concerns/versionable.rb +19 -0
  94. data/app/models/easy_ml/dataset.rb +767 -0
  95. data/app/models/easy_ml/dataset_history.rb +56 -0
  96. data/app/models/easy_ml/datasource.rb +182 -0
  97. data/app/models/easy_ml/datasource_history.rb +24 -0
  98. data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
  99. data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
  100. data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
  101. data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
  102. data/app/models/easy_ml/deploy.rb +114 -0
  103. data/app/models/easy_ml/event.rb +79 -0
  104. data/app/models/easy_ml/feature.rb +437 -0
  105. data/app/models/easy_ml/feature_history.rb +38 -0
  106. data/app/models/easy_ml/model.rb +575 -41
  107. data/app/models/easy_ml/model_file.rb +133 -0
  108. data/app/models/easy_ml/model_file_history.rb +24 -0
  109. data/app/models/easy_ml/model_history.rb +51 -0
  110. data/app/models/easy_ml/models/base_model.rb +58 -0
  111. data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
  112. data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
  113. data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
  114. data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
  115. data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
  116. data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
  117. data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
  118. data/app/models/easy_ml/models/xgboost.rb +544 -4
  119. data/app/models/easy_ml/prediction.rb +44 -0
  120. data/app/models/easy_ml/retraining_job.rb +278 -0
  121. data/app/models/easy_ml/retraining_run.rb +184 -0
  122. data/app/models/easy_ml/settings.rb +37 -0
  123. data/app/models/easy_ml/splitter.rb +90 -0
  124. data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
  125. data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
  126. data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
  127. data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
  128. data/app/models/easy_ml/tuner_job.rb +56 -0
  129. data/app/models/easy_ml/tuner_run.rb +31 -0
  130. data/app/models/splitter_history.rb +6 -0
  131. data/app/serializers/easy_ml/column_serializer.rb +27 -0
  132. data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
  133. data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
  134. data/app/serializers/easy_ml/feature_serializer.rb +27 -0
  135. data/app/serializers/easy_ml/model_serializer.rb +90 -0
  136. data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
  137. data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
  138. data/app/serializers/easy_ml/settings_serializer.rb +9 -0
  139. data/app/views/layouts/easy_ml/application.html.erb +15 -0
  140. data/config/initializers/resque.rb +3 -0
  141. data/config/resque-pool.yml +6 -0
  142. data/config/routes.rb +39 -0
  143. data/config/spring.rb +1 -0
  144. data/config/vite.json +15 -0
  145. data/lib/easy_ml/configuration.rb +64 -0
  146. data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
  147. data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
  148. data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
  149. data/lib/easy_ml/core/model_evaluator.rb +161 -89
  150. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
  151. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
  152. data/lib/easy_ml/core/tuner.rb +123 -62
  153. data/lib/easy_ml/core.rb +0 -3
  154. data/lib/easy_ml/core_ext/hash.rb +24 -0
  155. data/lib/easy_ml/core_ext/pathname.rb +11 -5
  156. data/lib/easy_ml/data/date_converter.rb +90 -0
  157. data/lib/easy_ml/data/filter_extensions.rb +31 -0
  158. data/lib/easy_ml/data/polars_column.rb +126 -0
  159. data/lib/easy_ml/data/polars_reader.rb +297 -0
  160. data/lib/easy_ml/data/preprocessor.rb +280 -142
  161. data/lib/easy_ml/data/simple_imputer.rb +255 -0
  162. data/lib/easy_ml/data/splits/file_split.rb +252 -0
  163. data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
  164. data/lib/easy_ml/data/splits/split.rb +95 -0
  165. data/lib/easy_ml/data/splits.rb +9 -0
  166. data/lib/easy_ml/data/statistics_learner.rb +93 -0
  167. data/lib/easy_ml/data/synced_directory.rb +341 -0
  168. data/lib/easy_ml/data.rb +6 -2
  169. data/lib/easy_ml/engine.rb +105 -6
  170. data/lib/easy_ml/feature_store.rb +227 -0
  171. data/lib/easy_ml/features.rb +61 -0
  172. data/lib/easy_ml/initializers/inflections.rb +17 -3
  173. data/lib/easy_ml/logging.rb +2 -2
  174. data/lib/easy_ml/predict.rb +74 -0
  175. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
  176. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
  177. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
  178. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
  179. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
  180. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
  181. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
  182. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
  183. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
  184. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
  185. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
  186. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
  187. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
  188. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
  189. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
  190. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
  191. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
  192. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
  193. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
  194. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
  195. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
  196. data/lib/easy_ml/support/est.rb +5 -1
  197. data/lib/easy_ml/support/file_rotate.rb +79 -15
  198. data/lib/easy_ml/support/file_support.rb +9 -0
  199. data/lib/easy_ml/support/local_file.rb +24 -0
  200. data/lib/easy_ml/support/lockable.rb +62 -0
  201. data/lib/easy_ml/support/synced_file.rb +103 -0
  202. data/lib/easy_ml/support/utc.rb +5 -1
  203. data/lib/easy_ml/support.rb +6 -3
  204. data/lib/easy_ml/version.rb +4 -1
  205. data/lib/easy_ml.rb +7 -2
  206. metadata +355 -72
  207. data/app/models/easy_ml/models.rb +0 -5
  208. data/lib/easy_ml/core/model.rb +0 -30
  209. data/lib/easy_ml/core/model_core.rb +0 -181
  210. data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
  211. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
  212. data/lib/easy_ml/core/models/xgboost.rb +0 -10
  213. data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
  214. data/lib/easy_ml/core/models.rb +0 -10
  215. data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
  216. data/lib/easy_ml/core/uploaders.rb +0 -7
  217. data/lib/easy_ml/data/dataloader.rb +0 -6
  218. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
  219. data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
  220. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
  221. data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
  222. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
  223. data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
  224. data/lib/easy_ml/data/dataset/splits.rb +0 -11
  225. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
  226. data/lib/easy_ml/data/dataset/splitters.rb +0 -9
  227. data/lib/easy_ml/data/dataset.rb +0 -430
  228. data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
  229. data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
  230. data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
  231. data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
  232. data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
  233. data/lib/easy_ml/data/datasource.rb +0 -33
  234. data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
  235. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
  236. data/lib/easy_ml/deployment.rb +0 -5
  237. data/lib/easy_ml/support/synced_directory.rb +0 -134
  238. data/lib/easy_ml/transforms.rb +0 -29
  239. /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -0,0 +1,227 @@
1
module EasyML
  # Persists the computed values of one feature as parquet files on local disk,
  # under <Rails.root>/easy_ml/datasets/<dataset>/features/<feature>/<version>/.
  #
  # When the feature declares a primary key whose min/max are integers (or strings
  # that parse as integers), rows are bucketed into one file per `batch_size`-wide
  # key range, named "feature<partition_start>.parquet". Otherwise the whole frame
  # is written to a single "feature.parquet".
  class FeatureStore
    # Partition width used when the feature does not define its own batch_size.
    DEFAULT_BATCH_SIZE = 10_000

    attr_reader :feature

    # @param feature the feature record; this class reads its primary_key,
    #   batch_size, version, name, id and dataset.
    def initialize(feature)
      @feature = feature
    end

    # Writes +df+ to disk, partitioned by primary key when possible.
    #
    # @param df the data frame to persist (a Polars::DataFrame, judging by the
    #   Polars expressions used below)
    def store(df)
      primary_key = feature.primary_key&.first
      # Check for a missing key first so we never ask the frame for a nil column.
      return store_without_partitioning(df) unless primary_key && df.columns.include?(primary_key)

      # NOTE(review): for string columns min/max are presumably lexicographic, and the
      # integer comparisons in the filter below run against the raw column — confirm
      # string keys are cast upstream.
      min_key = integer_key(df[primary_key].min)
      max_key = integer_key(df[primary_key].max)

      # Only partition if we have integer keys where we can predict boundaries
      return store_without_partitioning(df) unless min_key && max_key

      width = batch_size
      compute_partition_boundaries(min_key, max_key, width).each do |partition_start|
        partition_end = partition_start + width - 1
        partition_df = df.filter(
          (Polars.col(primary_key) >= partition_start) &
          (Polars.col(primary_key) <= partition_end)
        )

        next if partition_df.height == 0

        store_partition(partition_df, primary_key, partition_start)
      end
    end

    # Reads every stored file for the current version, optionally filtered.
    def query(filter: nil)
      query_all_partitions(filter)
    end

    # @return [Boolean] true when no parquet file exists for the current version
    def empty?
      list_partitions.empty?
    end

    # @return [Array<String>] sorted paths of all "feature*.parquet" files
    #   (this includes the unpartitioned "feature.parquet" when present)
    def list_partitions
      Dir.glob(File.join(feature_dir, "feature*.parquet")).sort
    end

    # Deletes the current version's directory and everything in it.
    def wipe
      FileUtils.rm_rf(feature_dir)
    end

    # Uploads the local directory through the synced directory, if one is configured.
    # No-op when the feature's dataset has no datasource (mirrors #download).
    def upload_remote_files
      synced_directory&.upload
    end

    # Downloads remote files through the synced directory, if one is configured.
    def download
      synced_directory&.download
    end

    # Copies every file stored for +old_version+ into the directory of +new_version+,
    # preserving the relative layout. Does nothing when the old directory is absent.
    def cp(old_version, new_version)
      old_dir = feature_dir_for_version(old_version)
      new_dir = feature_dir_for_version(new_version)

      return unless Dir.exist?(old_dir)

      FileUtils.mkdir_p(new_dir)
      source_root = Pathname.new(old_dir)

      Dir.glob(source_root.join("**/*").to_s).each do |file|
        next unless File.file?(file)

        # Re-root the file under new_dir. Substituting the version string inside the
        # full path would also rewrite any other path segment that happens to contain
        # it (dataset name, feature name, Rails root).
        relative = Pathname.new(file).relative_path_from(source_root).to_s
        target_file = File.join(new_dir, relative)
        FileUtils.mkdir_p(File.dirname(target_file))
        FileUtils.cp(file, target_file)
      end
    end

    private

    # Effective partition width.
    def batch_size
      feature.batch_size || DEFAULT_BATCH_SIZE
    end

    # Returns +value+ as an Integer when it is one or is a String that parses as
    # one; nil for anything else (which disables partitioning).
    def integer_key(value)
      case value
      when Integer then value
      when String then Integer(value, exception: false)
      end
    end

    # Writes the whole frame to "feature.parquet" while holding the file lock.
    def store_without_partitioning(df)
      lock_file do
        path = feature_path
        FileUtils.mkdir_p(File.dirname(path))
        df.write_parquet(path)
      end
    end

    # Writes one partition while holding its lock. If the partition file already
    # exists, rows whose primary key is not in +partition_df+ are kept and the
    # incoming rows replace the rest.
    def store_partition(partition_df, primary_key, partition_start)
      lock_partition(partition_start) do
        path = partition_path(partition_start)
        FileUtils.mkdir_p(File.dirname(path))

        if File.exist?(path)
          reader = EasyML::Data::PolarsReader.new
          existing_df = reader.query([path])
          preserved_records = existing_df.filter(
            Polars.col(primary_key).is_in(partition_df[primary_key]).is_not
          )
          partition_df = Polars.concat([preserved_records, partition_df], how: "vertical")
        end

        partition_df.write_parquet(path)
      end
    end

    # Reads only the partition files that can contain the primary keys named by
    # +filter+ (which must respond to #extract_primary_key_values).
    def query_partitions(filter)
      primary_key_values = filter.extract_primary_key_values
      width = batch_size

      partition_files = primary_key_values.map do |key|
        partition_start = (key / width.to_f).floor * width
        partition_path(partition_start)
      end.uniq.select { |path| File.exist?(path) }

      return Polars::DataFrame.new if partition_files.empty?

      reader = EasyML::Data::PolarsReader.new
      reader.query(partition_files, filter: filter)
    end

    # Reads every "feature*.parquet" file; returns an empty frame when none exist.
    def query_all_partitions(filter)
      reader = EasyML::Data::PolarsReader.new
      pattern = File.join(feature_dir, "feature*.parquet")
      files = Dir.glob(pattern)

      return Polars::DataFrame.new if files.empty?

      reader.query(files, filter: filter)
    end

    # @return [Array<Integer>] the start key of every batch_size-wide partition
    #   between the one containing min_key and the one containing max_key
    def compute_partition_boundaries(min_key, max_key, batch_size)
      start_partition = (min_key / batch_size.to_f).floor * batch_size
      end_partition = (max_key / batch_size.to_f).floor * batch_size
      (start_partition..end_partition).step(batch_size).to_a
    end

    # Directory holding the files of the given feature version.
    def feature_dir_for_version(version)
      File.join(
        Rails.root,
        "easy_ml/datasets",
        feature.dataset.name.parameterize.gsub("-", "_"),
        "features",
        feature.name.parameterize.gsub("-", "_"),
        version.to_s
      )
    end

    def feature_dir
      feature_dir_for_version(feature.version)
    end

    # Path of the single file used when the data is not partitioned.
    def feature_path
      File.join(feature_dir, "feature.parquet")
    end

    def partition_path(partition_start)
      File.join(feature_dir, "feature#{partition_start}.parquet")
    end

    # Remote key prefix: everything after the last "datasets" in the local path.
    # NOTE(review): a dataset or feature name containing "datasets" would shorten
    # this prefix — confirm names cannot contain it.
    def s3_prefix
      File.join("datasets", feature_dir.split("datasets").last)
    end

    # Memoized SyncedDirectory for this feature's directory, or nil when the
    # feature's dataset has no datasource.
    def synced_directory
      return unless feature.dataset&.datasource.present?

      datasource_config = feature.dataset.datasource.configuration || {}
      @synced_dir ||= EasyML::Data::SyncedDirectory.new(
        root_dir: feature_dir,
        s3_bucket: datasource_config.dig("s3_bucket") || EasyML::Configuration.s3_bucket,
        s3_prefix: s3_prefix,
        s3_access_key_id: EasyML::Configuration.s3_access_key_id,
        s3_secret_access_key: EasyML::Configuration.s3_secret_access_key,
        polars_args: datasource_config.dig("polars_args"),
        cache_for: 0,
      )
    end

    # Runs the block under the partition's lock and always unlocks afterwards.
    def lock_partition(partition_start)
      Support::Lockable.with_lock(partition_lock_key(partition_start), wait_timeout: 2, stale_timeout: 60) do |client|
        begin
          yield client if block_given?
        ensure
          unlock_partition(partition_start)
        end
      end
    end

    # Runs the block under the unpartitioned file's lock and always unlocks afterwards.
    def lock_file
      Support::Lockable.with_lock(file_lock_key, wait_timeout: 2, stale_timeout: 60) do |client|
        begin
          yield client if block_given?
        ensure
          unlock_file
        end
      end
    end

    def unlock_partition(partition_start)
      Support::Lockable.unlock!(partition_lock_key(partition_start))
    end

    def unlock_file
      Support::Lockable.unlock!(file_lock_key)
    end

    # Releases the lock of every partition file currently on disk.
    def unlock_all_partitions
      list_partitions.each do |path|
        # The unpartitioned "feature.parquet" matches the glob too but carries no
        # partition number, so skip anything that is not "feature<int>.parquet".
        match = File.basename(path).match(/\Afeature(-?\d+)\.parquet\z/)
        next unless match

        unlock_partition(match[1].to_i)
      end
    end

    def partition_lock_key(partition_start)
      "feature_store:#{feature.id}.partition.#{partition_start}"
    end

    def file_lock_key
      "feature_store:#{feature.id}.file"
    end
  end
end
@@ -0,0 +1,61 @@
1
# Mixin for feature classes. Including it adds the `feature` / `features` class-level
# DSL to the including class and records that class in EasyML::Features::Registry.
module EasyML::Features
  # Include hook: extends the including class with ClassMethods and registers it.
  def self.included(base)
    base.extend(ClassMethods)
    Registry.register(base)
  end

  # Placeholder; raises NotImplementedError unless the including class overrides it.
  def transform(df, feature)
    raise NotImplementedError
  end

  # True when list1 holds at least one (truthy) element that list2 does not.
  def missing_any?(list1, list2)
    only_in_first = list1 - list2
    only_in_first.any?
  end

  module ClassMethods
    # Feature definitions declared on this class, in declaration order.
    def features
      @features ||= []
    end

    # Declares a feature. The keyword arguments are stored as a hash with the
    # declaring class name added under :feature_class.
    def feature(**kwargs)
      definition = kwargs.merge(feature_class: to_s)
      features.push(definition)
    end
  end

  # Registry of feature classes, grouped by namespace (nil when none is given).
  class Registry
    # Records feature_class under the given namespace.
    def self.register(feature_class, namespace: nil)
      bucket = (registry[namespace&.to_sym] ||= {})
      bucket[feature_class] = feature_class
    end

    # Loads app/features first, then returns either one namespace's classes or
    # the whole registry.
    def self.list(namespace: nil)
      require_files
      return registry unless namespace

      registry[namespace.to_sym]
    end

    # Requires every Ruby file under <Rails.root>/app/features.
    def self.require_files
      feature_files = Dir.glob(Rails.root.join("app/features/**/*.rb"))
      feature_files.each { |path| require_dependency path }
    end

    # Every feature definition of every registered class, as one flat array.
    def self.list_flat
      namespaces = list.try(:values) || []
      feature_classes = namespaces.flat_map(&:values)
      feature_classes.flat_map(&:features)
    end

    # First definition whose :name or :feature_class equals name, or nil.
    def self.find(name)
      list_flat.find do |definition|
        definition[:name] == name || definition[:feature_class] == name
      end
    end

    # Empties the registry.
    def self.clear
      @registry = {}
    end

    def self.registry
      @registry ||= {}
    end
    private_class_method :registry
  end
end
@@ -1,4 +1,18 @@
1
- ActiveSupport::Inflector.inflections(:en) do |inflect|
2
- inflect.acronym "EasyML"
3
- inflect.acronym "ML"
1
+ module EasyML
2
+ module Initializers
3
+ module Inflections
4
+ def self.inflect
5
+ ActiveSupport::Inflector.inflections(:en) do |inflect|
6
+ inflect.acronym "EasyML"
7
+ inflect.acronym "ML"
8
+ inflect.acronym "STI"
9
+ inflect.acronym "XGBoost"
10
+ inflect.acronym "GBLinear"
11
+ inflect.acronym "GBTree"
12
+ inflect.acronym "EST"
13
+ inflect.acronym "UTC"
14
+ end
15
+ end
16
+ end
17
+ end
4
18
  end
@@ -7,9 +7,9 @@ module EasyML
7
7
  module ClassMethods
8
8
  def log_method(method_name, message, verbose: false)
9
9
  original_method = instance_method(method_name)
10
- define_method(method_name) do |*args, &block|
10
+ define_method(method_name) do |*args, **kwargs, &block|
11
11
  log_message(message, verbose: verbose)
12
- result = original_method.bind(self).call(*args, &block)
12
+ result = original_method.bind(self).call(*args, **kwargs, &block)
13
13
  result
14
14
  end
15
15
  end
@@ -0,0 +1,74 @@
1
+ require "singleton"
2
+
3
module EasyML
  # Process-wide (Singleton) entry point for running predictions. Keeps the
  # inference version of each model, keyed by model name, in an in-memory hash.
  class Predict
    include Singleton

    attr_reader :models

    class << self
      # Normalizes the input, predicts with the named model, stores an
      # EasyML::Prediction row for the first input row / first prediction, and
      # returns all predictions.
      #
      # @param model_name name used to look the model up
      # @param df a data frame, or a Hash that is wrapped in a Polars::DataFrame
      def predict(model_name, df)
        df = Polars::DataFrame.new(df) if df.is_a?(Hash)
        raw_input = df.to_hashes&.first

        df = instance.normalize(model_name, df)
        preds = instance.predict(model_name, df)
        current_version = instance.get_model(model_name)

        prediction_value = { value: preds.first }.compact
        EasyML::Prediction.create!(
          model: current_version.model,
          model_history: current_version,
          prediction_type: current_version.model.task,
          prediction_value: prediction_value,
          raw_input: raw_input,
          normalized_input: df.to_hashes&.first,
        )

        preds
      end

      # Delegates to the singleton instance.
      # NOTE(review): no instance-level `train` is defined in this file — confirm it
      # is provided elsewhere, otherwise this raises NoMethodError.
      def train(model_name, tuner: nil, evaluator: nil)
        instance.train(model_name, tuner: tuner, evaluator: evaluator)
      end

      # Drops every cached model on the singleton instance.
      def reset
        instance.reset
      end
    end

    def initialize
      @models = {}
    end

    # Predicts with the cached inference version of the named model.
    def predict(model_name, df)
      model = get_model(model_name)
      model.predict(df)
    end

    # Runs the model's dataset normalization in inference mode.
    def normalize(model_name, df)
      dataset = get_model(model_name).dataset
      dataset.normalize(df, inference: true)
    end

    # Refreshes the cache entry for model_name if needed, then returns it.
    def get_model(model_name)
      load_model(model_name)
      models[model_name]
    end

    # Drops every cached model.
    def reset
      @models = {}
    end

    private

    # Looks up the model's current inference version and caches it when nothing
    # is cached yet or the cached entry has a different id.
    def load_model(model_name)
      current_model = EasyML::Model.find_by!(name: model_name).inference_version
      cached = models[model_name]

      up_to_date = !cached.nil? && cached.id == current_model&.id
      return if up_to_date

      models[model_name] = current_model
    end
  end
end
@@ -2,47 +2,203 @@ require "rails/generators"
2
2
  require "rails/generators/active_record/migration"
3
3
 
4
4
  module EasyML
5
- module Generators
6
- module Migration
7
- class MigrationGenerator < Rails::Generators::Base
8
- include Rails::Generators::Migration
9
- namespace "easy_ml:migration"
10
-
11
- # Set the source directory for templates
12
- source_root File.expand_path("../../templates/migration", __dir__)
13
-
14
- # Define the migration name
15
- desc "Generates migrations for EasyMLModel, Dataset, and TunerRun"
16
-
17
- # Specify the next migration number
18
- def self.next_migration_number(dirname)
19
- if ActiveRecord.version < Gem::Version.new("7")
20
- Time.now.utc.strftime("%Y%m%d%H%M%S")
21
- elsif ActiveRecord.timestamped_migrations
22
- Time.now.utc.strftime("%Y%m%d%H%M%S")
23
- else
24
- format("%.3d", (current_migration_number(dirname) + 1))
5
+ module Railtie
6
+ module Generators
7
+ module Migration
8
+ class MigrationGenerator < Rails::Generators::Base
9
+ include Rails::Generators::Migration
10
+ namespace "easy_ml:migration"
11
+
12
+ # Set the source directory for templates
13
+ source_root File.expand_path("../../templates/migration", __dir__)
14
+
15
+ # Define the migration name
16
+ desc "Generates migrations for EasyMLModel"
17
+
18
+ # Specify the next migration number
19
+ def self.next_migration_number(dirname)
20
+ sleep(1)
21
+ if ActiveRecord.version < Gem::Version.new("7")
22
+ Time.now.utc.strftime("%Y%m%d%H%M%S")
23
+ elsif ActiveRecord.timestamped_migrations
24
+ Time.now.utc.strftime("%Y%m%d%H%M%S")
25
+ else
26
+ format("%.3d", (current_migration_number(dirname) + 1))
27
+ end
25
28
  end
26
- end
27
29
 
28
- # Generate the migration files using the templates
29
- def create_migration_files
30
- create_easy_ml_models_migration
31
- end
30
+ # Generate the migration files using the templates
31
+ def create_migration_files
32
+ create_easy_ml_datasource_migration
33
+ create_easy_ml_datasets_migration
34
+ create_easy_ml_columns_migration
35
+ create_easy_ml_models_migration
36
+ create_easy_ml_model_files_migration
37
+ create_easy_ml_tuner_jobs_migration
38
+ create_easy_ml_retraining_jobs_migration
39
+ create_easy_ml_settings_migration
40
+ create_easy_ml_events_migration
41
+ create_easy_ml_features_migration
42
+ create_easy_ml_splitters_migration
43
+ create_easy_ml_splitter_histories_migration
44
+ create_easy_ml_deploys
45
+
46
+ create_easy_ml_datasource_histories_migration
47
+ create_easy_ml_dataset_histories_migration
48
+ create_easy_ml_column_histories_migration
49
+ create_easy_ml_model_histories_migration
50
+ create_easy_ml_model_file_histories_migration
51
+ create_easy_ml_feature_histories_migration
52
+ create_easy_ml_predictions_migration
53
+ end
32
54
 
33
- private
55
+ private
34
56
 
35
- # Generate the migration file for EasyMLModel using the template
36
- def create_easy_ml_models_migration
37
- migration_template(
38
- "create_easy_ml_models.rb.tt",
39
- "db/migrate/create_easy_ml_models.rb"
40
- )
41
- end
57
+ # Generate the migration file for EasyMLModel using the template
58
+ def create_easy_ml_models_migration
59
+ migration_template(
60
+ "create_easy_ml_models.rb.tt",
61
+ "db/migrate/create_easy_ml_models.rb"
62
+ )
63
+ end
64
+
65
+ def create_easy_ml_model_files_migration
66
+ migration_template(
67
+ "create_easy_ml_model_files.rb.tt",
68
+ "db/migrate/create_easy_ml_model_files.rb"
69
+ )
70
+ end
71
+
72
+ def create_easy_ml_datasource_migration
73
+ migration_template(
74
+ "create_easy_ml_datasources.rb.tt",
75
+ "db/migrate/create_easy_ml_datasources.rb"
76
+ )
77
+ end
78
+
79
+ def create_easy_ml_datasets_migration
80
+ migration_template(
81
+ "create_easy_ml_datasets.rb.tt",
82
+ "db/migrate/create_easy_ml_datasets.rb"
83
+ )
84
+ end
85
+
86
+ def create_easy_ml_tuner_jobs_migration
87
+ migration_template(
88
+ "create_easy_ml_tuner_jobs.rb.tt",
89
+ "db/migrate/create_easy_ml_tuner_jobs.rb"
90
+ )
91
+ end
92
+
93
+ def create_easy_ml_retraining_jobs_migration
94
+ migration_template(
95
+ "create_easy_ml_retraining_jobs.rb.tt",
96
+ "db/migrate/create_easy_ml_retraining_jobs.rb"
97
+ )
98
+ end
99
+
100
+ def create_easy_ml_settings_migration
101
+ migration_template(
102
+ "create_easy_ml_settings.rb.tt",
103
+ "db/migrate/create_easy_ml_settings.rb"
104
+ )
105
+ end
106
+
107
+ def create_easy_ml_events_migration
108
+ migration_template(
109
+ "create_easy_ml_events.rb.tt",
110
+ "db/migrate/create_easy_ml_events.rb"
111
+ )
112
+ end
113
+
114
+ def create_easy_ml_columns_migration
115
+ migration_template(
116
+ "create_easy_ml_columns.rb.tt",
117
+ "db/migrate/create_easy_ml_columns.rb"
118
+ )
119
+ end
120
+
121
+ def create_easy_ml_features_migration
122
+ migration_template(
123
+ "create_easy_ml_features.rb.tt",
124
+ "db/migrate/create_easy_ml_features.rb"
125
+ )
126
+ end
127
+
128
+ def create_easy_ml_splitters_migration
129
+ migration_template(
130
+ "create_easy_ml_splitters.rb.tt",
131
+ "db/migrate/create_easy_ml_splitters.rb"
132
+ )
133
+ end
134
+
135
+ def create_easy_ml_splitter_histories_migration
136
+ migration_template(
137
+ "create_easy_ml_splitter_histories.rb.tt",
138
+ "db/migrate/create_easy_ml_splitter_histories.rb"
139
+ )
140
+ end
141
+
142
+ def create_easy_ml_datasource_histories_migration
143
+ migration_template(
144
+ "create_easy_ml_datasource_histories.rb.tt",
145
+ "db/migrate/create_easy_ml_datasource_histories.rb"
146
+ )
147
+ end
148
+
149
+ def create_easy_ml_dataset_histories_migration
150
+ migration_template(
151
+ "create_easy_ml_dataset_histories.rb.tt",
152
+ "db/migrate/create_easy_ml_dataset_histories.rb"
153
+ )
154
+ end
155
+
156
+ def create_easy_ml_column_histories_migration
157
+ migration_template(
158
+ "create_easy_ml_column_histories.rb.tt",
159
+ "db/migrate/create_easy_ml_column_histories.rb"
160
+ )
161
+ end
162
+
163
+ def create_easy_ml_model_histories_migration
164
+ migration_template(
165
+ "create_easy_ml_model_histories.rb.tt",
166
+ "db/migrate/create_easy_ml_model_histories.rb"
167
+ )
168
+ end
169
+
170
+ def create_easy_ml_feature_histories_migration
171
+ migration_template(
172
+ "create_easy_ml_feature_histories.rb.tt",
173
+ "db/migrate/create_easy_ml_feature_histories.rb"
174
+ )
175
+ end
176
+
177
+ def create_easy_ml_model_file_histories_migration
178
+ migration_template(
179
+ "create_easy_ml_model_file_histories.rb.tt",
180
+ "db/migrate/create_easy_ml_model_file_histories.rb"
181
+ )
182
+ end
183
+
184
+ def create_easy_ml_deploys
185
+ migration_template(
186
+ "create_easy_ml_deploys.rb.tt",
187
+ "db/migrate/create_easy_ml_deploys.rb"
188
+ )
189
+ end
42
190
 
43
- # Get the next migration number
44
- def next_migration_number
45
- self.class.next_migration_number(Rails.root.join("db/migrate"))
191
+ def create_easy_ml_predictions_migration
192
+ migration_template(
193
+ "create_easy_ml_predictions.rb.tt",
194
+ "db/migrate/create_easy_ml_predictions.rb"
195
+ )
196
+ end
197
+
198
+ # Get the next migration number
199
+ def next_migration_number
200
+ self.class.next_migration_number(Rails.root.join("db/migrate"))
201
+ end
46
202
  end
47
203
  end
48
204
  end
@@ -0,0 +1,9 @@
1
+ require "historiographer/postgres_migration"
2
+
3
+ class CreateEasyMLColumnHistories < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
4
+ def change
5
+ create_table :easy_ml_column_histories do |t|
6
+ t.histories(foreign_key: :column_id)
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,25 @@
1
+ class CreateEasyMLColumns < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
2
+ def change
3
+ create_table :easy_ml_columns do |t|
4
+ t.bigint :dataset_id, null: false
5
+ t.string :name, null: false
6
+ t.string :description
7
+ t.string :datatype # The symbol representation (e.g., 'float', 'integer')
8
+ t.string :polars_datatype # The full Polars class name (e.g., 'Polars::Float64')
9
+ t.boolean :is_target
10
+ t.boolean :hidden, default: false
11
+ t.boolean :drop_if_null, default: false
12
+ t.json :preprocessing_steps
13
+ t.json :sample_values # Store up to 3 sample values
14
+ t.json :statistics
15
+
16
+ t.timestamps
17
+
18
+ t.index [:dataset_id, :name], unique: true
19
+ t.index :datatype
20
+ t.index :hidden
21
+ t.index :drop_if_null
22
+ t.index :is_target
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,9 @@
1
+ require "historiographer/postgres_migration"
2
+
3
+ class CreateEasyMLDatasetHistories < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
4
+ def change
5
+ create_table :easy_ml_dataset_histories do |t|
6
+ t.histories(foreign_key: :dataset_id)
7
+ end
8
+ end
9
+ end