easy_ml 0.1.4 → 0.2.0.pre.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (239) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +234 -26
  3. data/Rakefile +45 -0
  4. data/app/controllers/easy_ml/application_controller.rb +67 -0
  5. data/app/controllers/easy_ml/columns_controller.rb +38 -0
  6. data/app/controllers/easy_ml/datasets_controller.rb +156 -0
  7. data/app/controllers/easy_ml/datasources_controller.rb +88 -0
  8. data/app/controllers/easy_ml/deploys_controller.rb +20 -0
  9. data/app/controllers/easy_ml/models_controller.rb +151 -0
  10. data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
  11. data/app/controllers/easy_ml/settings_controller.rb +59 -0
  12. data/app/frontend/components/AlertProvider.tsx +108 -0
  13. data/app/frontend/components/DatasetPreview.tsx +161 -0
  14. data/app/frontend/components/EmptyState.tsx +28 -0
  15. data/app/frontend/components/ModelCard.tsx +255 -0
  16. data/app/frontend/components/ModelDetails.tsx +334 -0
  17. data/app/frontend/components/ModelForm.tsx +384 -0
  18. data/app/frontend/components/Navigation.tsx +300 -0
  19. data/app/frontend/components/Pagination.tsx +72 -0
  20. data/app/frontend/components/Popover.tsx +55 -0
  21. data/app/frontend/components/PredictionStream.tsx +105 -0
  22. data/app/frontend/components/ScheduleModal.tsx +726 -0
  23. data/app/frontend/components/SearchInput.tsx +23 -0
  24. data/app/frontend/components/SearchableSelect.tsx +132 -0
  25. data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
  26. data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
  27. data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
  28. data/app/frontend/components/dataset/ColumnList.tsx +101 -0
  29. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
  30. data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
  31. data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
  32. data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
  33. data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
  34. data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
  35. data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
  36. data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
  37. data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
  38. data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
  39. data/app/frontend/components/dataset/splitters/constants.ts +77 -0
  40. data/app/frontend/components/dataset/splitters/types.ts +168 -0
  41. data/app/frontend/components/dataset/splitters/utils.ts +53 -0
  42. data/app/frontend/components/features/CodeEditor.tsx +46 -0
  43. data/app/frontend/components/features/DataPreview.tsx +150 -0
  44. data/app/frontend/components/features/FeatureCard.tsx +88 -0
  45. data/app/frontend/components/features/FeatureForm.tsx +235 -0
  46. data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
  47. data/app/frontend/components/settings/PluginSettings.tsx +81 -0
  48. data/app/frontend/components/ui/badge.tsx +44 -0
  49. data/app/frontend/components/ui/collapsible.tsx +9 -0
  50. data/app/frontend/components/ui/scroll-area.tsx +46 -0
  51. data/app/frontend/components/ui/separator.tsx +29 -0
  52. data/app/frontend/entrypoints/App.tsx +40 -0
  53. data/app/frontend/entrypoints/Application.tsx +24 -0
  54. data/app/frontend/hooks/useAutosave.ts +61 -0
  55. data/app/frontend/layouts/Layout.tsx +38 -0
  56. data/app/frontend/lib/utils.ts +6 -0
  57. data/app/frontend/mockData.ts +272 -0
  58. data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
  59. data/app/frontend/pages/DatasetsPage.tsx +261 -0
  60. data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
  61. data/app/frontend/pages/DatasourcesPage.tsx +261 -0
  62. data/app/frontend/pages/EditModelPage.tsx +45 -0
  63. data/app/frontend/pages/EditTransformationPage.tsx +56 -0
  64. data/app/frontend/pages/ModelsPage.tsx +115 -0
  65. data/app/frontend/pages/NewDatasetPage.tsx +366 -0
  66. data/app/frontend/pages/NewModelPage.tsx +45 -0
  67. data/app/frontend/pages/NewTransformationPage.tsx +43 -0
  68. data/app/frontend/pages/SettingsPage.tsx +272 -0
  69. data/app/frontend/pages/ShowModelPage.tsx +30 -0
  70. data/app/frontend/pages/TransformationsPage.tsx +95 -0
  71. data/app/frontend/styles/application.css +100 -0
  72. data/app/frontend/types/dataset.ts +146 -0
  73. data/app/frontend/types/datasource.ts +33 -0
  74. data/app/frontend/types/preprocessing.ts +1 -0
  75. data/app/frontend/types.ts +113 -0
  76. data/app/helpers/easy_ml/application_helper.rb +10 -0
  77. data/app/jobs/easy_ml/application_job.rb +21 -0
  78. data/app/jobs/easy_ml/batch_job.rb +46 -0
  79. data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
  80. data/app/jobs/easy_ml/deploy_job.rb +13 -0
  81. data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
  82. data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
  83. data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
  84. data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
  85. data/app/jobs/easy_ml/training_job.rb +62 -0
  86. data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
  87. data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
  88. data/app/models/easy_ml/cleaner.rb +82 -0
  89. data/app/models/easy_ml/column.rb +124 -0
  90. data/app/models/easy_ml/column_history.rb +30 -0
  91. data/app/models/easy_ml/column_list.rb +122 -0
  92. data/app/models/easy_ml/concerns/configurable.rb +61 -0
  93. data/app/models/easy_ml/concerns/versionable.rb +19 -0
  94. data/app/models/easy_ml/dataset.rb +767 -0
  95. data/app/models/easy_ml/dataset_history.rb +56 -0
  96. data/app/models/easy_ml/datasource.rb +182 -0
  97. data/app/models/easy_ml/datasource_history.rb +24 -0
  98. data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
  99. data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
  100. data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
  101. data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
  102. data/app/models/easy_ml/deploy.rb +114 -0
  103. data/app/models/easy_ml/event.rb +79 -0
  104. data/app/models/easy_ml/feature.rb +437 -0
  105. data/app/models/easy_ml/feature_history.rb +38 -0
  106. data/app/models/easy_ml/model.rb +575 -41
  107. data/app/models/easy_ml/model_file.rb +133 -0
  108. data/app/models/easy_ml/model_file_history.rb +24 -0
  109. data/app/models/easy_ml/model_history.rb +51 -0
  110. data/app/models/easy_ml/models/base_model.rb +58 -0
  111. data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
  112. data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
  113. data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
  114. data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
  115. data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
  116. data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
  117. data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
  118. data/app/models/easy_ml/models/xgboost.rb +544 -5
  119. data/app/models/easy_ml/prediction.rb +44 -0
  120. data/app/models/easy_ml/retraining_job.rb +278 -0
  121. data/app/models/easy_ml/retraining_run.rb +184 -0
  122. data/app/models/easy_ml/settings.rb +37 -0
  123. data/app/models/easy_ml/splitter.rb +90 -0
  124. data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
  125. data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
  126. data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
  127. data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
  128. data/app/models/easy_ml/tuner_job.rb +56 -0
  129. data/app/models/easy_ml/tuner_run.rb +31 -0
  130. data/app/models/splitter_history.rb +6 -0
  131. data/app/serializers/easy_ml/column_serializer.rb +27 -0
  132. data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
  133. data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
  134. data/app/serializers/easy_ml/feature_serializer.rb +27 -0
  135. data/app/serializers/easy_ml/model_serializer.rb +90 -0
  136. data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
  137. data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
  138. data/app/serializers/easy_ml/settings_serializer.rb +9 -0
  139. data/app/views/layouts/easy_ml/application.html.erb +15 -0
  140. data/config/initializers/resque.rb +3 -0
  141. data/config/resque-pool.yml +6 -0
  142. data/config/routes.rb +39 -0
  143. data/config/spring.rb +1 -0
  144. data/config/vite.json +15 -0
  145. data/lib/easy_ml/configuration.rb +64 -0
  146. data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
  147. data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
  148. data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
  149. data/lib/easy_ml/core/model_evaluator.rb +161 -89
  150. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
  151. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
  152. data/lib/easy_ml/core/tuner.rb +123 -62
  153. data/lib/easy_ml/core.rb +0 -3
  154. data/lib/easy_ml/core_ext/hash.rb +24 -0
  155. data/lib/easy_ml/core_ext/pathname.rb +11 -5
  156. data/lib/easy_ml/data/date_converter.rb +90 -0
  157. data/lib/easy_ml/data/filter_extensions.rb +31 -0
  158. data/lib/easy_ml/data/polars_column.rb +126 -0
  159. data/lib/easy_ml/data/polars_reader.rb +297 -0
  160. data/lib/easy_ml/data/preprocessor.rb +280 -142
  161. data/lib/easy_ml/data/simple_imputer.rb +255 -0
  162. data/lib/easy_ml/data/splits/file_split.rb +252 -0
  163. data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
  164. data/lib/easy_ml/data/splits/split.rb +95 -0
  165. data/lib/easy_ml/data/splits.rb +9 -0
  166. data/lib/easy_ml/data/statistics_learner.rb +93 -0
  167. data/lib/easy_ml/data/synced_directory.rb +341 -0
  168. data/lib/easy_ml/data.rb +6 -2
  169. data/lib/easy_ml/engine.rb +105 -6
  170. data/lib/easy_ml/feature_store.rb +227 -0
  171. data/lib/easy_ml/features.rb +61 -0
  172. data/lib/easy_ml/initializers/inflections.rb +17 -3
  173. data/lib/easy_ml/logging.rb +2 -2
  174. data/lib/easy_ml/predict.rb +74 -0
  175. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
  176. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
  177. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
  178. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
  179. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
  180. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
  181. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
  182. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
  183. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
  184. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
  185. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
  186. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
  187. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
  188. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
  189. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
  190. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
  191. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
  192. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
  193. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
  194. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
  195. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
  196. data/lib/easy_ml/support/est.rb +5 -1
  197. data/lib/easy_ml/support/file_rotate.rb +79 -15
  198. data/lib/easy_ml/support/file_support.rb +9 -0
  199. data/lib/easy_ml/support/local_file.rb +24 -0
  200. data/lib/easy_ml/support/lockable.rb +62 -0
  201. data/lib/easy_ml/support/synced_file.rb +103 -0
  202. data/lib/easy_ml/support/utc.rb +5 -1
  203. data/lib/easy_ml/support.rb +6 -3
  204. data/lib/easy_ml/version.rb +4 -1
  205. data/lib/easy_ml.rb +7 -2
  206. metadata +355 -72
  207. data/app/models/easy_ml/models.rb +0 -5
  208. data/lib/easy_ml/core/model.rb +0 -30
  209. data/lib/easy_ml/core/model_core.rb +0 -181
  210. data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
  211. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
  212. data/lib/easy_ml/core/models/xgboost.rb +0 -10
  213. data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
  214. data/lib/easy_ml/core/models.rb +0 -10
  215. data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
  216. data/lib/easy_ml/core/uploaders.rb +0 -7
  217. data/lib/easy_ml/data/dataloader.rb +0 -6
  218. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
  219. data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
  220. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
  221. data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
  222. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
  223. data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
  224. data/lib/easy_ml/data/dataset/splits.rb +0 -11
  225. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
  226. data/lib/easy_ml/data/dataset/splitters.rb +0 -9
  227. data/lib/easy_ml/data/dataset.rb +0 -430
  228. data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
  229. data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
  230. data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
  231. data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
  232. data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
  233. data/lib/easy_ml/data/datasource.rb +0 -33
  234. data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
  235. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
  236. data/lib/easy_ml/deployment.rb +0 -5
  237. data/lib/easy_ml/support/synced_directory.rb +0 -134
  238. data/lib/easy_ml/transforms.rb +0 -29
  239. /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -0,0 +1,227 @@
module EasyML
  # Reads and writes the computed values of a single feature as parquet files
  # under `Rails.root/easy_ml/datasets/<dataset>/features/<feature>/<version>`.
  #
  # When the feature has an integer-like primary key present in the frame, the
  # data is split into fixed-size partitions (`feature<start>.parquet`);
  # otherwise everything is written to a single `feature.parquet`.
  class FeatureStore
    # Partition width used when the feature does not define `batch_size`.
    DEFAULT_BATCH_SIZE = 10_000

    # Matches partition file names and captures the (possibly negative) start key.
    PARTITION_FILE_PATTERN = /\Afeature(-?\d+)\.parquet\z/

    attr_reader :feature

    # @param feature [Object] must respond to `primary_key`, `batch_size`,
    #   `version`, `name`, `id` and `dataset` (all read by this class)
    def initialize(feature)
      @feature = feature
    end

    # Writes `df` to disk, partitioned by primary key when possible.
    #
    # Falls back to a single unpartitioned file when there is no primary key,
    # the key column is absent from `df`, or the key bounds are not integers.
    #
    # @param df [Polars::DataFrame]
    def store(df)
      primary_key = feature.primary_key&.first
      return store_without_partitioning(df) unless primary_key && df.columns.include?(primary_key)

      min_key = df[primary_key].min
      max_key = df[primary_key].max
      batch_size = feature.batch_size || DEFAULT_BATCH_SIZE

      # Try to parse as integers if they're strings.
      # NOTE(review): for a string column min/max are presumably lexicographic
      # ("10" < "9"), so the derived range may not cover every row — confirm.
      begin
        min_key = Integer(min_key) if min_key.is_a?(String)
        max_key = Integer(max_key) if max_key.is_a?(String)
      rescue ArgumentError
        return store_without_partitioning(df)
      end

      # Only partition if we have integer keys where we can predict boundaries
      return store_without_partitioning(df) unless min_key.is_a?(Integer) && max_key.is_a?(Integer)

      partitions = compute_partition_boundaries(min_key, max_key, batch_size)
      partitions.each do |partition_start|
        partition_end = partition_start + batch_size - 1
        partition_df = df.filter(
          (Polars.col(primary_key) >= partition_start) &
          (Polars.col(primary_key) <= partition_end)
        )

        next if partition_df.height == 0

        store_partition(partition_df, primary_key, partition_start)
      end
    end

    # Reads every stored file for the current version, optionally filtered.
    #
    # @param filter [Object, nil] passed through to `PolarsReader#query`
    def query(filter: nil)
      query_all_partitions(filter)
    end

    # @return [Boolean] true when no parquet file exists for the current version
    def empty?
      list_partitions.empty?
    end

    # @return [Array<String>] sorted paths of all `feature*.parquet` files
    #   (partitioned and unpartitioned) for the current version
    def list_partitions
      Dir.glob(File.join(feature_dir, "feature*.parquet")).sort
    end

    # Deletes the current version's directory and everything in it.
    def wipe
      FileUtils.rm_rf(feature_dir)
    end

    # Uploads the feature directory; a no-op when no synced directory is
    # available (mirrors `download`, which already tolerated nil).
    def upload_remote_files
      synced_directory&.upload
    end

    # Downloads the feature directory; a no-op when no synced directory exists.
    def download
      synced_directory&.download
    end

    # Copies every file of `old_version` into the directory of `new_version`,
    # preserving the relative layout.
    #
    # Target paths are built relative to the version directories rather than by
    # substituting the version string in the full path, so a version that also
    # occurs in a file name (e.g. version 1 and `feature10.parquet`) or in the
    # root path can no longer corrupt the destination.
    #
    # @return [Array<String>, nil] copied source files, or nil when the old
    #   directory does not exist
    def cp(old_version, new_version)
      old_dir = feature_dir_for_version(old_version)
      new_dir = feature_dir_for_version(new_version)

      return if old_dir.nil? || !Dir.exist?(old_dir)

      FileUtils.mkdir_p(new_dir)
      old_root = Pathname.new(old_dir)
      files_to_cp = Dir.glob(File.join(old_dir, "**", "*")).select { |f| File.file?(f) }

      files_to_cp.each do |file|
        relative_path = Pathname.new(file).relative_path_from(old_root).to_s
        target_file = File.join(new_dir, relative_path)
        FileUtils.mkdir_p(File.dirname(target_file))
        FileUtils.cp(file, target_file)
      end
    end

    private

    # Writes the whole frame to `feature.parquet` while holding the file lock.
    def store_without_partitioning(df)
      lock_file do
        path = feature_path
        FileUtils.mkdir_p(File.dirname(path))
        df.write_parquet(path)
      end
    end

    # Writes one partition while holding its lock. Rows already on disk whose
    # primary key is not in `partition_df` are kept; rows with matching keys
    # are replaced by the new data.
    def store_partition(partition_df, primary_key, partition_start)
      lock_partition(partition_start) do
        path = partition_path(partition_start)
        FileUtils.mkdir_p(File.dirname(path))

        if File.exist?(path)
          reader = EasyML::Data::PolarsReader.new
          existing_df = reader.query([path])
          preserved_records = existing_df.filter(
            Polars.col(primary_key).is_in(partition_df[primary_key]).is_not
          )
          partition_df = Polars.concat([preserved_records, partition_df], how: "vertical")
        end

        partition_df.write_parquet(path)
      end
    end

    # Reads only the partitions that can contain the primary keys named by
    # `filter` (which must respond to `extract_primary_key_values`).
    def query_partitions(filter)
      primary_key_values = filter.extract_primary_key_values
      batch_size = feature.batch_size || DEFAULT_BATCH_SIZE

      partition_files = primary_key_values.map do |key|
        partition_path(partition_start_for(key, batch_size))
      end.uniq.select { |path| File.exist?(path) }

      return Polars::DataFrame.new if partition_files.empty?

      reader = EasyML::Data::PolarsReader.new
      reader.query(partition_files, filter: filter)
    end

    # Reads every `feature*.parquet` file; returns an empty frame when none exist.
    def query_all_partitions(filter)
      reader = EasyML::Data::PolarsReader.new
      pattern = File.join(feature_dir, "feature*.parquet")
      files = Dir.glob(pattern)

      return Polars::DataFrame.new if files.empty?

      reader.query(files, filter: filter)
    end

    # @return [Array<Integer>] start key of every partition overlapping
    #   `min_key..max_key`, in ascending order
    def compute_partition_boundaries(min_key, max_key, batch_size)
      start_partition = partition_start_for(min_key, batch_size)
      end_partition = partition_start_for(max_key, batch_size)
      (start_partition..end_partition).step(batch_size).to_a
    end

    # Start key of the partition containing `key`. `div` floors toward negative
    # infinity like the former float-based `floor`, but stays exact for
    # integers too large to be represented as a Float.
    def partition_start_for(key, batch_size)
      key.div(batch_size) * batch_size
    end

    # Path segments below the datasets root: <dataset>/features/<feature>/<version>.
    def feature_subpath(version)
      [
        feature.dataset.name.parameterize.gsub("-", "_"),
        "features",
        feature.name.parameterize.gsub("-", "_"),
        version.to_s,
      ]
    end

    def feature_dir_for_version(version)
      File.join(Rails.root, "easy_ml/datasets", *feature_subpath(version))
    end

    def feature_dir
      feature_dir_for_version(feature.version)
    end

    def feature_path
      File.join(feature_dir, "feature.parquet")
    end

    def partition_path(partition_start)
      File.join(feature_dir, "feature#{partition_start}.parquet")
    end

    # Remote prefix mirroring the local layout below `easy_ml/`. Built from the
    # path segments directly so names containing "datasets" cannot truncate it.
    def s3_prefix
      File.join("datasets", *feature_subpath(feature.version))
    end

    # Memoized synced directory for the current feature directory, or nil when
    # the feature's dataset has no datasource. The bucket comes from the
    # datasource configuration, falling back to the global configuration.
    def synced_directory
      return unless feature.dataset&.datasource.present?

      datasource_config = feature.dataset.datasource.configuration || {}
      @synced_dir ||= EasyML::Data::SyncedDirectory.new(
        root_dir: feature_dir,
        s3_bucket: datasource_config.dig("s3_bucket") || EasyML::Configuration.s3_bucket,
        s3_prefix: s3_prefix,
        s3_access_key_id: EasyML::Configuration.s3_access_key_id,
        s3_secret_access_key: EasyML::Configuration.s3_secret_access_key,
        polars_args: datasource_config.dig("polars_args"),
        cache_for: 0,
      )
    end

    # Runs the block under the partition's lock and always releases the key
    # afterwards, even when the block raises.
    def lock_partition(partition_start)
      Support::Lockable.with_lock(partition_lock_key(partition_start), wait_timeout: 2, stale_timeout: 60) do |client|
        begin
          yield client if block_given?
        ensure
          unlock_partition(partition_start)
        end
      end
    end

    # Same as `lock_partition`, for the single unpartitioned file.
    def lock_file
      Support::Lockable.with_lock(file_lock_key, wait_timeout: 2, stale_timeout: 60) do |client|
        begin
          yield client if block_given?
        ensure
          unlock_file
        end
      end
    end

    def unlock_partition(partition_start)
      Support::Lockable.unlock!(partition_lock_key(partition_start))
    end

    def unlock_file
      Support::Lockable.unlock!(file_lock_key)
    end

    # Releases the lock of every partition file on disk. Files that are not
    # partitions (the unpartitioned `feature.parquet`) are skipped instead of
    # raising on a failed match; negative partition starts are recognised.
    def unlock_all_partitions
      list_partitions.each do |path|
        match = PARTITION_FILE_PATTERN.match(File.basename(path))
        next unless match

        unlock_partition(match[1].to_i)
      end
    end

    def partition_lock_key(partition_start)
      "feature_store:#{feature.id}.partition.#{partition_start}"
    end

    def file_lock_key
      "feature_store:#{feature.id}.file"
    end
  end
end
@@ -0,0 +1,61 @@
# Mixin for feature classes. Including it extends the host with the
# `feature` / `features` class-level DSL and records the host class in
# `EasyML::Features::Registry`.
module EasyML::Features
  # Hook for the including class to override; the default always raises.
  #
  # @raise [NotImplementedError]
  def transform(df, feature)
    raise NotImplementedError
  end

  # Wires up the class-level DSL and registers the including class.
  def self.included(base)
    base.extend(ClassMethods)
    Registry.register(base)
  end

  module ClassMethods
    # @return [Array<Hash>] feature declarations made on this class so far
    def features
      @features ||= []
    end

    # Declares a feature; the given keywords are stored together with the
    # declaring class name under `:feature_class`.
    def feature(**kwargs)
      kwargs.merge!(feature_class: self.to_s)
      features << kwargs
    end
  end

  # @return [Boolean] whether `list1` has a truthy element absent from `list2`
  def missing_any?(list1, list2)
    leftover = list1 - list2
    leftover.any?
  end

  # Class-level catalogue of feature classes, grouped by namespace
  # (classes registered without a namespace live under the `nil` key).
  class Registry
    class << self
      # Stores `feature_class` under the (symbolized) namespace.
      def register(feature_class, namespace: nil)
        bucket = (registry[namespace&.to_sym] ||= {})
        bucket[feature_class] = feature_class
      end

      # Loads every file under app/features first, then returns either the
      # whole registry or the entries of one namespace.
      def list(namespace: nil)
        require_files
        return registry unless namespace

        registry[namespace.to_sym]
      end

      # Requires all Ruby files below `Rails.root/app/features`.
      def require_files
        Dir.glob(Rails.root.join("app/features/**/*.rb")).each do |file|
          require_dependency file
        end
      end

      # @return [Array<Hash>] every feature declaration of every registered class
      def list_flat
        namespaces = Array(list.try(:values))
        namespaces.flat_map(&:values).flat_map(&:features)
      end

      # Looks a declaration up by its `:name` or by its declaring class name.
      def find(name)
        list_flat.find do |feature|
          [feature[:name], feature[:feature_class]].include?(name)
        end
      end

      # Forgets every registered class.
      def clear
        @registry = {}
      end

      private

      def registry
        @registry ||= {}
      end
    end
  end
end
@@ -1,4 +1,18 @@
1
- ActiveSupport::Inflector.inflections(:en) do |inflect|
2
- inflect.acronym "EasyML"
3
- inflect.acronym "ML"
module EasyML
  module Initializers
    # Registers the acronyms EasyML relies on with ActiveSupport's English
    # inflector. Nothing happens until `inflect` is called.
    module Inflections
      # Declares each acronym, in order, on the `:en` inflections.
      #
      # @return [Object] whatever the final `acronym` call returns
      def self.inflect
        ActiveSupport::Inflector.inflections(:en) do |inflect|
          acronyms = %w[EasyML ML STI XGBoost GBLinear GBTree EST UTC]
          # `map ... last` keeps the block's value equal to the last
          # `acronym` call, as a plain sequence of calls would.
          acronyms.map { |acronym| inflect.acronym acronym }.last
        end
      end
    end
  end
end
@@ -7,9 +7,9 @@ module EasyML
7
7
  module ClassMethods
# Redefines `method_name` so that every call first emits `message` through
# the instance's `log_message` and then runs the original implementation.
# Positional arguments, keyword arguments and the block are all forwarded.
#
# @param method_name [Symbol] an already-defined instance method
# @param message [String] text handed to `log_message`
# @param verbose [Boolean] forwarded to `log_message`
def log_method(method_name, message, verbose: false)
  unwrapped = instance_method(method_name)

  define_method(method_name) do |*args, **kwargs, &block|
    log_message(message, verbose: verbose)
    unwrapped.bind(self).call(*args, **kwargs, &block)
  end
end
@@ -0,0 +1,74 @@
1
+ require "singleton"
2
+
module EasyML
  # Process-wide entry point for inference. Keeps the inference version of
  # each model it has served in `models`, keyed by model name, and swaps the
  # entry when the model's inference version id changes.
  class Predict
    include Singleton

    # @return [Hash] model name => cached inference version
    attr_reader :models

    def initialize
      @models = {}
    end

    # Normalizes `df`, predicts with the named model's inference version and
    # records the first row's input and prediction as an `EasyML::Prediction`.
    #
    # The inference version is resolved once and reused for normalization,
    # prediction and the stored record, so all three always refer to the same
    # version even if a new one becomes current while the call is running.
    #
    # @param model_name [String]
    # @param df [Polars::DataFrame, Hash] a Hash is wrapped in a DataFrame
    # @return [Object] the predictions returned by the model version
    def self.predict(model_name, df)
      df = Polars::DataFrame.new(df) if df.is_a?(Hash)
      raw_input = df.to_hashes&.first

      current_version = instance.get_model(model_name)
      df = current_version.dataset.normalize(df, inference: true)
      preds = current_version.predict(df)

      EasyML::Prediction.create!(
        model: current_version.model,
        model_history: current_version,
        prediction_type: current_version.model.task,
        prediction_value: {
          value: preds.first,
        }.compact,
        raw_input: raw_input,
        normalized_input: df.to_hashes&.first,
      )

      preds
    end

    # NOTE(review): no instance-level `train` is defined in the part of the
    # class visible here; unless it is added elsewhere this call raises
    # NoMethodError — confirm.
    def self.train(model_name, tuner: nil, evaluator: nil)
      instance.train(model_name, tuner: tuner, evaluator: evaluator)
    end

    # Predicts with the current inference version of `model_name`.
    def predict(model_name, df)
      get_model(model_name).predict(df)
    end

    # Runs the inference-time normalization of the model's dataset on `df`.
    def normalize(model_name, df)
      get_model(model_name).dataset.normalize(df, inference: true)
    end

    # Refreshes the cache entry for `model_name` if needed and returns it.
    #
    # @return [Object, nil] nil when the model has no inference version
    def get_model(model_name)
      load_model(model_name)
      models[model_name]
    end

    # Drops every cached model version.
    def reset
      @models = {}
    end

    def self.reset
      instance.reset
    end

    private

    # Looks up the model's current inference version and caches it when
    # nothing is cached yet or the cached entry has a different id.
    #
    # @raise whatever `EasyML::Model.find_by!` raises for an unknown name
    def load_model(model_name)
      current_model = EasyML::Model.find_by!(name: model_name).inference_version

      # Load new model if not loaded or different version
      model_not_loaded = models[model_name].nil?
      model_is_new_version = models[model_name]&.id != current_model&.id
      return unless model_not_loaded || model_is_new_version

      models[model_name] = current_model
    end
  end
end
@@ -2,47 +2,203 @@ require "rails/generators"
2
2
  require "rails/generators/active_record/migration"
3
3
 
4
4
  module EasyML
5
- module Generators
6
- module Migration
7
- class MigrationGenerator < Rails::Generators::Base
8
- include Rails::Generators::Migration
9
- namespace "easy_ml:migration"
10
-
11
- # Set the source directory for templates
12
- source_root File.expand_path("../../templates/migration", __dir__)
13
-
14
- # Define the migration name
15
- desc "Generates migrations for EasyMLModel, Dataset, and TunerRun"
16
-
17
- # Specify the next migration number
18
- def self.next_migration_number(dirname)
19
- if ActiveRecord.version < Gem::Version.new("7")
20
- Time.now.utc.strftime("%Y%m%d%H%M%S")
21
- elsif ActiveRecord.timestamped_migrations
22
- Time.now.utc.strftime("%Y%m%d%H%M%S")
23
- else
24
- format("%.3d", (current_migration_number(dirname) + 1))
5
+ module Railtie
6
+ module Generators
7
+ module Migration
# Rails generator (`easy_ml:migration`) that renders every EasyML migration
# template from the gem's templates/migration directory into the host
# application's db/migrate.
class MigrationGenerator < Rails::Generators::Base
  include Rails::Generators::Migration
  namespace "easy_ml:migration"

  # Set the source directory for templates
  source_root File.expand_path("../../templates/migration", __dir__)

  desc "Generates migrations for EasyMLModel"

  # Migration base names in the order they are generated. Each entry maps to
  # the template "<name>.rb.tt" and the target "db/migrate/<name>.rb".
  # The order is significant: it determines the generated migration numbers.
  MIGRATION_NAMES = %w[
    create_easy_ml_datasources
    create_easy_ml_datasets
    create_easy_ml_columns
    create_easy_ml_models
    create_easy_ml_model_files
    create_easy_ml_tuner_jobs
    create_easy_ml_retraining_jobs
    create_easy_ml_settings
    create_easy_ml_events
    create_easy_ml_features
    create_easy_ml_splitters
    create_easy_ml_splitter_histories
    create_easy_ml_deploys
    create_easy_ml_datasource_histories
    create_easy_ml_dataset_histories
    create_easy_ml_column_histories
    create_easy_ml_model_histories
    create_easy_ml_model_file_histories
    create_easy_ml_feature_histories
    create_easy_ml_predictions
  ].freeze

  # Returns the number for the next migration file.
  #
  # Uses a UTC timestamp on ActiveRecord < 7, or when timestamped migrations
  # are enabled; otherwise a zero-padded sequence number. The one-second
  # sleep runs before every number is produced, so consecutive timestamps
  # differ.
  #
  # @param dirname [String, Pathname] migrations directory
  # @return [String]
  def self.next_migration_number(dirname)
    sleep(1)
    # The version check must stay first: it short-circuits before
    # `ActiveRecord.timestamped_migrations` is called on older versions.
    if ActiveRecord.version < Gem::Version.new("7") || ActiveRecord.timestamped_migrations
      Time.now.utc.strftime("%Y%m%d%H%M%S")
    else
      format("%.3d", (current_migration_number(dirname) + 1))
    end
  end

  # Generate the migration files using the templates
  def create_migration_files
    MIGRATION_NAMES.each { |name| copy_migration_template(name) }
  end

  private

  # Renders the template for one migration into db/migrate.
  def copy_migration_template(name)
    migration_template("#{name}.rb.tt", "db/migrate/#{name}.rb")
  end

  # Get the next migration number
  def next_migration_number
    self.class.next_migration_number(Rails.root.join("db/migrate"))
  end
end
47
203
  end
48
204
  end
@@ -0,0 +1,9 @@
1
+ require "historiographer/postgres_migration"
2
+
# Migration template for the `easy_ml_column_histories` table.
# The bracketed migration version is filled in by ERB when the template is rendered.
# The table's columns come entirely from `t.histories`, called with `column_id`
# as the foreign key (presumably provided by the historiographer require above — confirm).
3
+ class CreateEasyMLColumnHistories < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
4
+ def change
5
+ create_table :easy_ml_column_histories do |t|
6
+ t.histories(foreign_key: :column_id)
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,25 @@
# Migration template for the `easy_ml_columns` table: one row per dataset column,
# holding its name, datatype, flags and JSON metadata.
# The bracketed migration version is filled in by ERB when the template is rendered.
# A column name is unique within a dataset (unique index on dataset_id + name);
# the datatype and the three boolean flags are each indexed on their own.
1
+ class CreateEasyMLColumns < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
2
+ def change
3
+ create_table :easy_ml_columns do |t|
4
+ t.bigint :dataset_id, null: false
5
+ t.string :name, null: false
6
+ t.string :description
7
+ t.string :datatype # The symbol representation (e.g., 'float', 'integer')
8
+ t.string :polars_datatype # The full Polars class name (e.g., 'Polars::Float64')
# No default is declared for is_target, unlike the two flags below.
9
+ t.boolean :is_target
10
+ t.boolean :hidden, default: false
11
+ t.boolean :drop_if_null, default: false
12
+ t.json :preprocessing_steps
13
+ t.json :sample_values # Store up to 3 sample values
14
+ t.json :statistics
15
+
16
+ t.timestamps
17
+
18
+ t.index [:dataset_id, :name], unique: true
19
+ t.index :datatype
20
+ t.index :hidden
21
+ t.index :drop_if_null
22
+ t.index :is_target
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,9 @@
1
+ require "historiographer/postgres_migration"
2
+
# Migration template for the `easy_ml_dataset_histories` table.
# The bracketed migration version is filled in by ERB when the template is rendered.
# The table's columns come entirely from `t.histories`, called with `dataset_id`
# as the foreign key (presumably provided by the historiographer require above — confirm).
3
+ class CreateEasyMLDatasetHistories < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
4
+ def change
5
+ create_table :easy_ml_dataset_histories do |t|
6
+ t.histories(foreign_key: :dataset_id)
7
+ end
8
+ end
9
+ end