easy_ml 0.1.4 → 0.2.0.pre.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (239)
  1. checksums.yaml +4 -4
  2. data/README.md +234 -26
  3. data/Rakefile +45 -0
  4. data/app/controllers/easy_ml/application_controller.rb +67 -0
  5. data/app/controllers/easy_ml/columns_controller.rb +38 -0
  6. data/app/controllers/easy_ml/datasets_controller.rb +156 -0
  7. data/app/controllers/easy_ml/datasources_controller.rb +88 -0
  8. data/app/controllers/easy_ml/deploys_controller.rb +20 -0
  9. data/app/controllers/easy_ml/models_controller.rb +151 -0
  10. data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
  11. data/app/controllers/easy_ml/settings_controller.rb +59 -0
  12. data/app/frontend/components/AlertProvider.tsx +108 -0
  13. data/app/frontend/components/DatasetPreview.tsx +161 -0
  14. data/app/frontend/components/EmptyState.tsx +28 -0
  15. data/app/frontend/components/ModelCard.tsx +255 -0
  16. data/app/frontend/components/ModelDetails.tsx +334 -0
  17. data/app/frontend/components/ModelForm.tsx +384 -0
  18. data/app/frontend/components/Navigation.tsx +300 -0
  19. data/app/frontend/components/Pagination.tsx +72 -0
  20. data/app/frontend/components/Popover.tsx +55 -0
  21. data/app/frontend/components/PredictionStream.tsx +105 -0
  22. data/app/frontend/components/ScheduleModal.tsx +726 -0
  23. data/app/frontend/components/SearchInput.tsx +23 -0
  24. data/app/frontend/components/SearchableSelect.tsx +132 -0
  25. data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
  26. data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
  27. data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
  28. data/app/frontend/components/dataset/ColumnList.tsx +101 -0
  29. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
  30. data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
  31. data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
  32. data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
  33. data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
  34. data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
  35. data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
  36. data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
  37. data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
  38. data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
  39. data/app/frontend/components/dataset/splitters/constants.ts +77 -0
  40. data/app/frontend/components/dataset/splitters/types.ts +168 -0
  41. data/app/frontend/components/dataset/splitters/utils.ts +53 -0
  42. data/app/frontend/components/features/CodeEditor.tsx +46 -0
  43. data/app/frontend/components/features/DataPreview.tsx +150 -0
  44. data/app/frontend/components/features/FeatureCard.tsx +88 -0
  45. data/app/frontend/components/features/FeatureForm.tsx +235 -0
  46. data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
  47. data/app/frontend/components/settings/PluginSettings.tsx +81 -0
  48. data/app/frontend/components/ui/badge.tsx +44 -0
  49. data/app/frontend/components/ui/collapsible.tsx +9 -0
  50. data/app/frontend/components/ui/scroll-area.tsx +46 -0
  51. data/app/frontend/components/ui/separator.tsx +29 -0
  52. data/app/frontend/entrypoints/App.tsx +40 -0
  53. data/app/frontend/entrypoints/Application.tsx +24 -0
  54. data/app/frontend/hooks/useAutosave.ts +61 -0
  55. data/app/frontend/layouts/Layout.tsx +38 -0
  56. data/app/frontend/lib/utils.ts +6 -0
  57. data/app/frontend/mockData.ts +272 -0
  58. data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
  59. data/app/frontend/pages/DatasetsPage.tsx +261 -0
  60. data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
  61. data/app/frontend/pages/DatasourcesPage.tsx +261 -0
  62. data/app/frontend/pages/EditModelPage.tsx +45 -0
  63. data/app/frontend/pages/EditTransformationPage.tsx +56 -0
  64. data/app/frontend/pages/ModelsPage.tsx +115 -0
  65. data/app/frontend/pages/NewDatasetPage.tsx +366 -0
  66. data/app/frontend/pages/NewModelPage.tsx +45 -0
  67. data/app/frontend/pages/NewTransformationPage.tsx +43 -0
  68. data/app/frontend/pages/SettingsPage.tsx +272 -0
  69. data/app/frontend/pages/ShowModelPage.tsx +30 -0
  70. data/app/frontend/pages/TransformationsPage.tsx +95 -0
  71. data/app/frontend/styles/application.css +100 -0
  72. data/app/frontend/types/dataset.ts +146 -0
  73. data/app/frontend/types/datasource.ts +33 -0
  74. data/app/frontend/types/preprocessing.ts +1 -0
  75. data/app/frontend/types.ts +113 -0
  76. data/app/helpers/easy_ml/application_helper.rb +10 -0
  77. data/app/jobs/easy_ml/application_job.rb +21 -0
  78. data/app/jobs/easy_ml/batch_job.rb +46 -0
  79. data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
  80. data/app/jobs/easy_ml/deploy_job.rb +13 -0
  81. data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
  82. data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
  83. data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
  84. data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
  85. data/app/jobs/easy_ml/training_job.rb +62 -0
  86. data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
  87. data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
  88. data/app/models/easy_ml/cleaner.rb +82 -0
  89. data/app/models/easy_ml/column.rb +124 -0
  90. data/app/models/easy_ml/column_history.rb +30 -0
  91. data/app/models/easy_ml/column_list.rb +122 -0
  92. data/app/models/easy_ml/concerns/configurable.rb +61 -0
  93. data/app/models/easy_ml/concerns/versionable.rb +19 -0
  94. data/app/models/easy_ml/dataset.rb +767 -0
  95. data/app/models/easy_ml/dataset_history.rb +56 -0
  96. data/app/models/easy_ml/datasource.rb +182 -0
  97. data/app/models/easy_ml/datasource_history.rb +24 -0
  98. data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
  99. data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
  100. data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
  101. data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
  102. data/app/models/easy_ml/deploy.rb +114 -0
  103. data/app/models/easy_ml/event.rb +79 -0
  104. data/app/models/easy_ml/feature.rb +437 -0
  105. data/app/models/easy_ml/feature_history.rb +38 -0
  106. data/app/models/easy_ml/model.rb +575 -41
  107. data/app/models/easy_ml/model_file.rb +133 -0
  108. data/app/models/easy_ml/model_file_history.rb +24 -0
  109. data/app/models/easy_ml/model_history.rb +51 -0
  110. data/app/models/easy_ml/models/base_model.rb +58 -0
  111. data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
  112. data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
  113. data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
  114. data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
  115. data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
  116. data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
  117. data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
  118. data/app/models/easy_ml/models/xgboost.rb +544 -5
  119. data/app/models/easy_ml/prediction.rb +44 -0
  120. data/app/models/easy_ml/retraining_job.rb +278 -0
  121. data/app/models/easy_ml/retraining_run.rb +184 -0
  122. data/app/models/easy_ml/settings.rb +37 -0
  123. data/app/models/easy_ml/splitter.rb +90 -0
  124. data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
  125. data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
  126. data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
  127. data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
  128. data/app/models/easy_ml/tuner_job.rb +56 -0
  129. data/app/models/easy_ml/tuner_run.rb +31 -0
  130. data/app/models/splitter_history.rb +6 -0
  131. data/app/serializers/easy_ml/column_serializer.rb +27 -0
  132. data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
  133. data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
  134. data/app/serializers/easy_ml/feature_serializer.rb +27 -0
  135. data/app/serializers/easy_ml/model_serializer.rb +90 -0
  136. data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
  137. data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
  138. data/app/serializers/easy_ml/settings_serializer.rb +9 -0
  139. data/app/views/layouts/easy_ml/application.html.erb +15 -0
  140. data/config/initializers/resque.rb +3 -0
  141. data/config/resque-pool.yml +6 -0
  142. data/config/routes.rb +39 -0
  143. data/config/spring.rb +1 -0
  144. data/config/vite.json +15 -0
  145. data/lib/easy_ml/configuration.rb +64 -0
  146. data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
  147. data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
  148. data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
  149. data/lib/easy_ml/core/model_evaluator.rb +161 -89
  150. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
  151. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
  152. data/lib/easy_ml/core/tuner.rb +123 -62
  153. data/lib/easy_ml/core.rb +0 -3
  154. data/lib/easy_ml/core_ext/hash.rb +24 -0
  155. data/lib/easy_ml/core_ext/pathname.rb +11 -5
  156. data/lib/easy_ml/data/date_converter.rb +90 -0
  157. data/lib/easy_ml/data/filter_extensions.rb +31 -0
  158. data/lib/easy_ml/data/polars_column.rb +126 -0
  159. data/lib/easy_ml/data/polars_reader.rb +297 -0
  160. data/lib/easy_ml/data/preprocessor.rb +280 -142
  161. data/lib/easy_ml/data/simple_imputer.rb +255 -0
  162. data/lib/easy_ml/data/splits/file_split.rb +252 -0
  163. data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
  164. data/lib/easy_ml/data/splits/split.rb +95 -0
  165. data/lib/easy_ml/data/splits.rb +9 -0
  166. data/lib/easy_ml/data/statistics_learner.rb +93 -0
  167. data/lib/easy_ml/data/synced_directory.rb +341 -0
  168. data/lib/easy_ml/data.rb +6 -2
  169. data/lib/easy_ml/engine.rb +105 -6
  170. data/lib/easy_ml/feature_store.rb +227 -0
  171. data/lib/easy_ml/features.rb +61 -0
  172. data/lib/easy_ml/initializers/inflections.rb +17 -3
  173. data/lib/easy_ml/logging.rb +2 -2
  174. data/lib/easy_ml/predict.rb +74 -0
  175. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
  176. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
  177. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
  178. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
  179. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
  180. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
  181. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
  182. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
  183. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
  184. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
  185. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
  186. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
  187. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
  188. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
  189. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
  190. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
  191. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
  192. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
  193. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
  194. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
  195. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
  196. data/lib/easy_ml/support/est.rb +5 -1
  197. data/lib/easy_ml/support/file_rotate.rb +79 -15
  198. data/lib/easy_ml/support/file_support.rb +9 -0
  199. data/lib/easy_ml/support/local_file.rb +24 -0
  200. data/lib/easy_ml/support/lockable.rb +62 -0
  201. data/lib/easy_ml/support/synced_file.rb +103 -0
  202. data/lib/easy_ml/support/utc.rb +5 -1
  203. data/lib/easy_ml/support.rb +6 -3
  204. data/lib/easy_ml/version.rb +4 -1
  205. data/lib/easy_ml.rb +7 -2
  206. metadata +355 -72
  207. data/app/models/easy_ml/models.rb +0 -5
  208. data/lib/easy_ml/core/model.rb +0 -30
  209. data/lib/easy_ml/core/model_core.rb +0 -181
  210. data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
  211. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
  212. data/lib/easy_ml/core/models/xgboost.rb +0 -10
  213. data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
  214. data/lib/easy_ml/core/models.rb +0 -10
  215. data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
  216. data/lib/easy_ml/core/uploaders.rb +0 -7
  217. data/lib/easy_ml/data/dataloader.rb +0 -6
  218. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
  219. data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
  220. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
  221. data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
  222. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
  223. data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
  224. data/lib/easy_ml/data/dataset/splits.rb +0 -11
  225. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
  226. data/lib/easy_ml/data/dataset/splitters.rb +0 -9
  227. data/lib/easy_ml/data/dataset.rb +0 -430
  228. data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
  229. data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
  230. data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
  231. data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
  232. data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
  233. data/lib/easy_ml/data/datasource.rb +0 -33
  234. data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
  235. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
  236. data/lib/easy_ml/deployment.rb +0 -5
  237. data/lib/easy_ml/support/synced_directory.rb +0 -134
  238. data/lib/easy_ml/transforms.rb +0 -29
  239. /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -1,49 +0,0 @@
1
- module EasyML
2
- module Data
3
- class Dataset
4
- module Splits
5
- class InMemorySplit < Split
6
- include GlueGun::DSL
7
-
8
- attribute :sample, :float, default: 1.0
9
- def initialize(options)
10
- super
11
- @data = {}
12
- end
13
-
14
- def save(segment, df)
15
- @data[segment] = df
16
- end
17
-
18
- def read(segment, split_ys: false, target: nil, drop_cols: [], &block)
19
- df = @data[segment]
20
- return nil if df.nil?
21
-
22
- df = sample_data(df) if sample < 1.0
23
- drop_cols &= df.columns
24
- df = df.drop(drop_cols) unless drop_cols.empty?
25
-
26
- if block_given?
27
- if split_ys
28
- xs, ys = split_features_targets(df, true, target)
29
- process_block_with_split_ys(block, nil, xs, ys)
30
- else
31
- process_block_without_split_ys(block, nil, df)
32
- end
33
- else
34
- split_features_targets(df, split_ys, target)
35
- end
36
- end
37
-
38
- def cleanup
39
- @data.clear
40
- end
41
-
42
- def split_at
43
- @data.keys.empty? ? nil : Time.now
44
- end
45
- end
46
- end
47
- end
48
- end
49
- end
@@ -1,98 +0,0 @@
1
- module EasyML
2
- module Data
3
- class Dataset
4
- module Splits
5
- class Split
6
- include GlueGun::DSL
7
- include EasyML::Data::Utils
8
-
9
- attribute :polars_args, :hash, default: {}
10
- attribute :max_rows_per_file, :integer, default: 1_000_000
11
- attribute :batch_size, :integer, default: 10_000
12
- attribute :sample, :float, default: 1.0
13
- attribute :verbose, :boolean, default: false
14
-
15
- def save(segment, df)
16
- raise NotImplementedError, "Subclasses must implement #save"
17
- end
18
-
19
- def read(segment, split_ys: false, target: nil, drop_cols: [], &block)
20
- raise NotImplementedError, "Subclasses must implement #read"
21
- end
22
-
23
- def train(&block)
24
- read(:train, &block)
25
- end
26
-
27
- def test(&block)
28
- read(:test, &block)
29
- end
30
-
31
- def valid(&block)
32
- read(:valid, &block)
33
- end
34
-
35
- def cleanup
36
- raise NotImplementedError, "Subclasses must implement #cleanup"
37
- end
38
-
39
- def split_at
40
- raise NotImplementedError, "Subclasses must implement #split_at"
41
- end
42
-
43
- protected
44
-
45
- def split_features_targets(df, split_ys, target)
46
- raise ArgumentError, "Target column must be specified when split_ys is true" if split_ys && target.nil?
47
-
48
- if split_ys
49
- xs = df.drop(target)
50
- ys = df.select(target)
51
- [xs, ys]
52
- else
53
- df
54
- end
55
- end
56
-
57
- def sample_data(df)
58
- return df if sample >= 1.0
59
-
60
- df.sample(n: (df.shape[0] * sample).ceil, seed: 42)
61
- end
62
-
63
- def create_progress_bar(segment, total_rows)
64
- ProgressBar.create(
65
- title: "Reading #{segment}",
66
- total: total_rows,
67
- format: "%t: |%B| %p%% %e"
68
- )
69
- end
70
-
71
- def process_block_with_split_ys(block, result, xs, ys)
72
- case block.arity
73
- when 3
74
- result.nil? ? [xs, ys] : block.call(result, xs, ys)
75
- when 2
76
- block.call(xs, ys)
77
- result
78
- else
79
- raise ArgumentError, "Block must accept 2 or 3 arguments when split_ys is true"
80
- end
81
- end
82
-
83
- def process_block_without_split_ys(block, result, df)
84
- case block.arity
85
- when 2
86
- result.nil? ? df : block.call(result, df)
87
- when 1
88
- block.call(df)
89
- result
90
- else
91
- raise ArgumentError, "Block must accept 1 or 2 arguments when split_ys is false"
92
- end
93
- end
94
- end
95
- end
96
- end
97
- end
98
- end
@@ -1,11 +0,0 @@
1
- module EasyML
2
- module Data
3
- class Dataset
4
- module Splits
5
- require_relative "splits/split"
6
- require_relative "splits/file_split"
7
- require_relative "splits/in_memory_split"
8
- end
9
- end
10
- end
11
- end
@@ -1,43 +0,0 @@
1
- module EasyML::Data::Dataset::Splitters
2
- class DateSplitter
3
- include GlueGun::DSL
4
-
5
- attribute :today, :datetime
6
- def today=(value)
7
- super(value.in_time_zone(UTC).to_datetime)
8
- end
9
- attribute :date_col, :string
10
- attribute :months_test, :integer, default: 2
11
- attribute :months_valid, :integer, default: 2
12
-
13
- def initialize(options)
14
- options[:today] ||= UTC.now
15
- super(options)
16
- end
17
-
18
- def split(df)
19
- unless df[date_col].dtype.is_a?(Polars::Datetime)
20
- raise "Date splitter cannot split on non-date col #{date_col}, dtype is #{df[date_col].dtype}"
21
- end
22
-
23
- validation_date_start, test_date_start = splits
24
-
25
- test_df = df.filter(Polars.col(date_col) >= test_date_start)
26
- remaining_df = df.filter(Polars.col(date_col) < test_date_start)
27
- valid_df = remaining_df.filter(Polars.col(date_col) >= validation_date_start)
28
- train_df = remaining_df.filter(Polars.col(date_col) < validation_date_start)
29
-
30
- [train_df, valid_df, test_df]
31
- end
32
-
33
- def months(n)
34
- ActiveSupport::Duration.months(n)
35
- end
36
-
37
- def splits
38
- test_date_start = today.advance(months: -months_test).beginning_of_day
39
- validation_date_start = today.advance(months: -(months_test + months_valid)).beginning_of_day
40
- [validation_date_start, test_date_start]
41
- end
42
- end
43
- end
@@ -1,9 +0,0 @@
1
- module EasyML
2
- module Data
3
- class Dataset
4
- module Splitters
5
- require_relative "splitters/date_splitter"
6
- end
7
- end
8
- end
9
- end
@@ -1,430 +0,0 @@
1
- require "polars"
2
- require_relative "datasource"
3
- require_relative "dataset/splitters"
4
- require_relative "dataset/splits"
5
-
6
- # Dataset is responsible for:
7
- #
8
- # 1) Ensuring data is synced from its source (e.g. S3 — delegates to datasource)
9
- # 2) Ensuring the data is properly split into train, test, and validation data (delegates to splitter)
10
- # 3) Knowing where data is stored on disk, and pulling batches of data into memory
11
- # 4) Knowing where to save updated data (after preprocessing steps)
12
- #
13
- module EasyML
14
- module Data
15
- class Dataset
16
- include GlueGun::DSL
17
- include EasyML::Logging
18
- include EasyML::Data::Utils
19
-
20
- # include GitIgnorable
21
- # gitignore :root_dir do |dir|
22
- # if Rails.env.test? # Don't gitignore our test files
23
- # nil
24
- # else
25
- # File.join(dir, "files/**/*")
26
- # end
27
- # end
28
-
29
- # These helpers are defined in GlueGun::DSL.
30
- #
31
- # define_attr defines configurable attributes for subclasses,
32
- # for example, a class sub-classing Dataset will want to define its
33
- # target (e.g. the column we are trying to predict)
34
- #
35
- # These can either be defined on a class-level like this:
36
- #
37
- # class Dataset < EasyML::Data::Dataset
38
- # target "REVENUE"
39
- # end
40
- #
41
- # Or passed in during initialization:
42
- #
43
- # Dataset.new(target: "REV")
44
- #
45
- attribute :verbose, :boolean, default: false
46
- attribute :today, :date, default: -> { UTC.now }
47
- def today=(value)
48
- super(value.in_time_zone(UTC).to_date)
49
- end
50
- attribute :target, :string
51
- validates :target, presence: true
52
-
53
- attribute :batch_size, :integer, default: 50_000
54
-
55
- attribute :root_dir, :string
56
- validates :root_dir, presence: true
57
- def root_dir=(value)
58
- super(Pathname.new(value).append("data").to_s)
59
- end
60
-
61
- attribute :sample, :float, default: 1.0
62
- attribute :drop_if_null, :array, default: []
63
-
64
- # define_attr can also define default values, as well as argument helpers
65
- attribute :polars_args, :hash, default: {}
66
- def polars_args=(args)
67
- super(args.deep_symbolize_keys.inject({}) do |hash, (k, v)|
68
- hash.tap do
69
- hash[k] = v
70
- hash[k] = v.stringify_keys if k == :dtypes
71
- end
72
- end)
73
- end
74
-
75
- attribute :transforms, default: nil
76
- validate :transforms_are_transforms
77
- def transforms_are_transforms
78
- return if transforms.nil? || transforms.respond_to?(:transform)
79
-
80
- errors.add(:transforms, "Must respond to transform, try including EasyML::Data::Transforms")
81
- end
82
-
83
- attribute :drop_cols, :array, default: []
84
-
85
- dependency :datasource, EasyML::Data::Datasource::DatasourceFactory
86
-
87
- # dependency defines a configurable dependency, with optional args,
88
- # for example, here we define a datasource:
89
- #
90
- # class YourDataset
91
- # datasource :s3, s3_bucket: "fundera-bart", s3_prefix: "xyz"
92
- # # This automatically uses the S3Datasource class to pull data
93
- # end
94
- #
95
- # If we define any models based on other data sources (e.g. postgres),
96
- # you would just define a new PostgresDatasource
97
- #
98
-
99
- # Here we define splitter options, inspired by common Python data splitting techniques:
100
- #
101
- # 1. Date-based splitter (similar to TimeSeriesSplit from sklearn)
102
- #
103
- # NOT IMPLEMENTED (but you could implement as necessary):
104
- # 2. Random splitter (similar to train_test_split from sklearn)
105
- # 3. Stratified splitter (similar to StratifiedKFold from sklearn)
106
- # 4. Group-based splitter (similar to GroupKFold from sklearn)
107
- # 5. Sliding window splitter (similar to TimeSeriesSplit with a sliding window)
108
- #
109
- dependency :splitter do |dependency|
110
- dependency.option :date do |option|
111
- option.default
112
- option.set_class EasyML::Data::Dataset::Splitters::DateSplitter
113
- option.bind_attribute :today, required: true
114
- option.bind_attribute :date_col, required: true
115
- option.bind_attribute :months_test, required: true
116
- option.bind_attribute :months_valid, required: true
117
- end
118
- end
119
-
120
- # Here we define the preprocessing logic.
121
- # Aka what to do with null values. For instance:
122
- #
123
- # class YourDataset
124
- # preprocessing_steps: {
125
- # training: {
126
- # annual_revenue: {
127
- # clip: {min: 0, max: 1_000_000} # Clip values between these
128
- # median: true, # Then learn the median based on clipped values
129
- # },
130
- # created_date: { ffill: true } # During training, use the latest value in the dataset
131
- # },
132
- # inference: {
133
- # created_date: { today: true } # During inference, use the current date
134
- # }
135
- # }
136
- # end
137
- #
138
- attribute :preprocessing_steps, :hash, default: {}
139
- dependency :preprocessor do |dependency|
140
- dependency.set_class EasyML::Data::Preprocessor
141
- dependency.bind_attribute :directory, source: :root_dir do |value|
142
- Pathname.new(value).append("preprocessor")
143
- end
144
- dependency.bind_attribute :preprocessing_steps
145
- end
146
-
147
- # Here we define the raw dataset (uses the Split class)
148
- # We use this to learn dataset statistics (e.g. median annual revenue)
149
- # But we NEVER overwrite it
150
- #
151
- dependency :raw do |dependency|
152
- dependency.option :file do |option|
153
- option.default
154
- option.set_class EasyML::Data::Dataset::Splits::FileSplit
155
- option.bind_attribute :dir, source: :root_dir do |value|
156
- Pathname.new(value).append("files/splits/raw")
157
- end
158
- option.bind_attribute :polars_args
159
- option.bind_attribute :max_rows_per_file, source: :batch_size
160
- option.bind_attribute :batch_size
161
- option.bind_attribute :sample
162
- option.bind_attribute :verbose
163
- end
164
-
165
- dependency.option :memory do |option|
166
- option.set_class EasyML::Data::Dataset::Splits::InMemorySplit
167
- option.bind_attribute :sample
168
- end
169
-
170
- dependency.when do |_dep|
171
- { option: :memory } if datasource.is_a?(EasyML::Data::Datasource::PolarsDatasource)
172
- end
173
- end
174
-
175
- # Here we define the processed dataset (uses the Split class)
176
- # After we learn the dataset statistics, we fill null values
177
- # using the learned statistics (e.g. fill annual_revenue with median annual_revenue)
178
- #
179
- dependency :processed do |dependency|
180
- dependency.option :file do |option|
181
- option.default
182
- option.set_class EasyML::Data::Dataset::Splits::FileSplit
183
- option.bind_attribute :dir, source: :root_dir do |value|
184
- Pathname.new(value).append("files/splits/processed")
185
- end
186
- option.bind_attribute :polars_args
187
- option.bind_attribute :max_rows_per_file, source: :batch_size
188
- option.bind_attribute :batch_size
189
- option.bind_attribute :sample
190
- option.bind_attribute :verbose
191
- end
192
-
193
- dependency.option :memory do |option|
194
- option.set_class EasyML::Data::Dataset::Splits::InMemorySplit
195
- option.bind_attribute :sample
196
- end
197
-
198
- dependency.when do |_dep|
199
- { option: :memory } if datasource.is_a?(EasyML::Data::Datasource::PolarsDatasource)
200
- end
201
- end
202
-
203
- delegate :new_data_available?, :synced?, :stale?, to: :datasource
204
- delegate :train, :test, :valid, to: :split
205
- delegate :splits, to: :splitter
206
-
207
- def refresh!
208
- refresh_datasource
209
- split_data
210
- fit
211
- normalize_all
212
- alert_nulls
213
- end
214
-
215
- def normalize(df = nil)
216
- df = drop_nulls(df)
217
- df = apply_transforms(df)
218
- preprocessor.postprocess(df)
219
- end
220
-
221
- # A "production" preprocessor is predicting live values (e.g. used on live webservers)
222
- # A "development" preprocessor is used during training (e.g. we're learning new values for the dataset)
223
- #
224
- delegate :statistics, to: :preprocessor
225
-
226
- def train(split_ys: false, all_columns: false, &block)
227
- load_data(:train, split_ys: split_ys, all_columns: all_columns, &block)
228
- end
229
-
230
- def valid(split_ys: false, all_columns: false, &block)
231
- load_data(:valid, split_ys: split_ys, all_columns: all_columns, &block)
232
- end
233
-
234
- def test(split_ys: false, all_columns: false, &block)
235
- load_data(:test, split_ys: split_ys, all_columns: all_columns, &block)
236
- end
237
-
238
- def data(split_ys: false, all_columns: false)
239
- if split_ys
240
- x_train, y_train = train(split_ys: true, all_columns: all_columns)
241
- x_valid, y_valid = valid(split_ys: true, all_columns: all_columns)
242
- x_test, y_test = test(split_ys: true, all_columns: all_columns)
243
-
244
- xs = Polars.concat([x_train, x_valid, x_test])
245
- ys = Polars.concat([y_train, y_valid, y_test])
246
- [xs, ys]
247
- else
248
- train_df = train(split_ys: false, all_columns: all_columns)
249
- valid_df = valid(split_ys: false, all_columns: all_columns)
250
- test_df = test(split_ys: false, all_columns: all_columns)
251
-
252
- Polars.concat([train_df, valid_df, test_df])
253
- end
254
- end
255
-
256
- def cleanup
257
- raw.cleanup
258
- processed.cleanup
259
- end
260
-
261
- def check_nulls(data_type = :processed)
262
- result = %i[train test valid].each_with_object({}) do |segment, acc|
263
- segment_result = { nulls: {}, total: 0 }
264
-
265
- data_source = data_type == :raw ? raw : processed
266
- data_source.read(segment) do |df|
267
- df_nulls = null_check(df)
268
- df.columns.each do |column|
269
- segment_result[:nulls][column] ||= { null_count: 0, total_count: 0 }
270
- if df_nulls && df_nulls[column]
271
- segment_result[:nulls][column][:null_count] += df_nulls[column][:null_count]
272
- end
273
- segment_result[:nulls][column][:total_count] += df.height
274
- end
275
- end
276
-
277
- segment_result[:nulls].each do |column, counts|
278
- percentage = (counts[:null_count].to_f / counts[:total_count] * 100).round(1)
279
- acc[column] ||= {}
280
- acc[column][segment] = percentage
281
- end
282
- end
283
-
284
- # Remove columns that have no nulls across all segments
285
- result.reject! { |_, v| v.values.all?(&:zero?) }
286
-
287
- result.empty? ? nil : result
288
- end
289
-
290
- def processed?
291
- !should_split?
292
- end
293
-
294
- def decode_labels(ys, col: nil)
295
- preprocessor.decode_labels(ys, col: col.nil? ? target : col)
296
- end
297
-
298
- private
299
-
300
- def refresh_datasource
301
- datasource.refresh!
302
- end
303
- log_method :refresh!, "Refreshing datasource", verbose: true
304
-
305
- def normalize_all
306
- processed.cleanup
307
-
308
- %i[train test valid].each do |segment|
309
- raw.read(segment) do |df|
310
- processed_df = normalize(df)
311
- processed.save(segment, processed_df)
312
- end
313
- end
314
- end
315
- log_method :normalize_all, "Normalizing dataset", verbose: true
316
-
317
- def drop_nulls(df)
318
- return df if drop_if_null.nil? || drop_if_null.empty?
319
-
320
- df.drop_nulls(subset: drop_if_null)
321
- end
322
-
323
- def drop_columns(all_columns: false)
324
- if all_columns
325
- []
326
- else
327
- drop_cols
328
- end
329
- end
330
-
331
- def load_data(segment, split_ys: false, all_columns: false, &block)
332
- drop_cols = drop_columns(all_columns: all_columns)
333
- if processed?
334
- processed.read(segment, split_ys: split_ys, target: target, drop_cols: drop_cols, &block)
335
- else
336
- raw.read(segment, split_ys: split_ys, target: target, drop_cols: drop_cols, &block)
337
- end
338
- end
339
-
340
- def fit(xs = nil)
341
- xs = raw.train if xs.nil?
342
-
343
- preprocessor.fit(xs)
344
- end
345
- log_method :fit, "Learning statistics", verbose: true
346
-
347
- def in_batches(segment, processed: true, &block)
348
- if processed
349
- processed.read(segment, &block)
350
- else
351
- raw.read(segment, &block)
352
- end
353
- end
354
-
355
- def split_data
356
- return unless should_split?
357
-
358
- cleanup
359
- datasource.in_batches do |df|
360
- train_df, valid_df, test_df = splitter.split(df)
361
- raw.save(:train, train_df)
362
- raw.save(:valid, valid_df)
363
- raw.save(:test, test_df)
364
- end
365
-
366
- # Update the persisted sample size after splitting
367
- save_previous_sample(sample)
368
- end
369
- log_method :split_data, "Splitting data", verbose: true
370
-
371
- def should_split?
372
- split_timestamp = raw.split_at
373
- previous_sample = load_previous_sample
374
- sample_increased = previous_sample && sample > previous_sample
375
- previous_sample.nil? || split_timestamp.nil? || split_timestamp < datasource.last_updated_at || sample_increased
376
- end
377
-
378
- def sample_info_file
379
- File.join(root_dir, "sample_info.json")
380
- end
381
-
382
- def save_previous_sample(sample_size)
383
- File.write(sample_info_file, JSON.generate({ previous_sample: sample_size }))
384
- end
385
-
386
- def load_previous_sample
387
- return nil unless File.exist?(sample_info_file)
388
-
389
- JSON.parse(File.read(sample_info_file))["previous_sample"]
390
- end
391
-
392
- def apply_transforms(df)
393
- if transforms.nil?
394
- df
395
- else
396
- transforms.apply_transforms(df)
397
- end
398
- end
399
-
400
- def alert_nulls
401
- processed_nulls = check_nulls(:processed)
402
- raw_nulls = check_nulls(:raw)
403
-
404
- if processed_nulls
405
- log_warning("Nulls found in the processed dataset:")
406
- processed_nulls.each do |column, segments|
407
- segments.each do |segment, percentage|
408
- log_warning(" #{column} - #{segment}: #{percentage}% nulls")
409
- end
410
- end
411
- else
412
- log_info("No nulls found in the processed dataset.")
413
- end
414
-
415
- if raw_nulls
416
- raw_nulls.each do |column, segments|
417
- segments.each do |segment, percentage|
418
- if percentage > 50
419
- log_warning("Data processing issue detected: #{column} - #{segment} has #{percentage}% nulls in the raw dataset")
420
- end
421
- end
422
- end
423
- end
424
-
425
- nil
426
- end
427
- log_method :alert_nulls, "Checking for nulls", verbose: true
428
- end
429
- end
430
- end
@@ -1,60 +0,0 @@
1
- require_relative "merged_datasource"
2
-
3
- module EasyML
4
- module Data
5
- class Datasource
6
- class DatasourceFactory
7
- include GlueGun::DSL
8
-
9
- dependency :datasource do |dependency|
10
- dependency.option :s3 do |option|
11
- option.default
12
- option.set_class EasyML::Data::Datasource::S3Datasource
13
- option.bind_attribute :root_dir do |value|
14
- Pathname.new(value).append("files")
15
- end
16
- option.bind_attribute :polars_args, default: {}
17
- option.bind_attribute :s3_bucket, required: true
18
- option.bind_attribute :s3_prefix
19
- option.bind_attribute :s3_access_key_id, required: true
20
- option.bind_attribute :s3_secret_access_key, required: true
21
- end
22
-
23
- dependency.option :file do |option|
24
- option.set_class EasyML::Data::Datasource::FileDatasource
25
- option.bind_attribute :root_dir do |value|
26
- Pathname.new(value).append("files/raw")
27
- end
28
- option.bind_attribute :polars_args
29
- end
30
-
31
- dependency.option :polars do |option|
32
- option.set_class EasyML::Data::Datasource::PolarsDatasource
33
- option.bind_attribute :df
34
- end
35
-
36
- dependency.option :merged do |option|
37
- option.set_class EasyML::Data::Datasource::MergedDatasource
38
- option.bind_attribute :root_dir
39
- end
40
-
41
- # Passing in datasource: Polars::DataFrame will wrap properly
42
- # So will passing in datasource /path/to/dir
43
- dependency.when do |dep|
44
- case dep
45
- when Polars::DataFrame
46
- { option: :polars, as: :df }
47
- when String, Pathname
48
- { option: :file, as: :root_dir }
49
- end
50
- end
51
- end
52
- end
53
- end
54
- end
55
- end
56
-
57
- # Do this here otherwise we'll end up with a circular dependency
58
- class EasyML::Data::Datasource::MergedDatasource
59
- dependency :datasources, DatasourceFactory
60
- end