easy_ml 0.1.4 → 0.2.0.pre.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (239) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +234 -26
  3. data/Rakefile +45 -0
  4. data/app/controllers/easy_ml/application_controller.rb +67 -0
  5. data/app/controllers/easy_ml/columns_controller.rb +38 -0
  6. data/app/controllers/easy_ml/datasets_controller.rb +156 -0
  7. data/app/controllers/easy_ml/datasources_controller.rb +88 -0
  8. data/app/controllers/easy_ml/deploys_controller.rb +20 -0
  9. data/app/controllers/easy_ml/models_controller.rb +151 -0
  10. data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
  11. data/app/controllers/easy_ml/settings_controller.rb +59 -0
  12. data/app/frontend/components/AlertProvider.tsx +108 -0
  13. data/app/frontend/components/DatasetPreview.tsx +161 -0
  14. data/app/frontend/components/EmptyState.tsx +28 -0
  15. data/app/frontend/components/ModelCard.tsx +255 -0
  16. data/app/frontend/components/ModelDetails.tsx +334 -0
  17. data/app/frontend/components/ModelForm.tsx +384 -0
  18. data/app/frontend/components/Navigation.tsx +300 -0
  19. data/app/frontend/components/Pagination.tsx +72 -0
  20. data/app/frontend/components/Popover.tsx +55 -0
  21. data/app/frontend/components/PredictionStream.tsx +105 -0
  22. data/app/frontend/components/ScheduleModal.tsx +726 -0
  23. data/app/frontend/components/SearchInput.tsx +23 -0
  24. data/app/frontend/components/SearchableSelect.tsx +132 -0
  25. data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
  26. data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
  27. data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
  28. data/app/frontend/components/dataset/ColumnList.tsx +101 -0
  29. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
  30. data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
  31. data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
  32. data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
  33. data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
  34. data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
  35. data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
  36. data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
  37. data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
  38. data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
  39. data/app/frontend/components/dataset/splitters/constants.ts +77 -0
  40. data/app/frontend/components/dataset/splitters/types.ts +168 -0
  41. data/app/frontend/components/dataset/splitters/utils.ts +53 -0
  42. data/app/frontend/components/features/CodeEditor.tsx +46 -0
  43. data/app/frontend/components/features/DataPreview.tsx +150 -0
  44. data/app/frontend/components/features/FeatureCard.tsx +88 -0
  45. data/app/frontend/components/features/FeatureForm.tsx +235 -0
  46. data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
  47. data/app/frontend/components/settings/PluginSettings.tsx +81 -0
  48. data/app/frontend/components/ui/badge.tsx +44 -0
  49. data/app/frontend/components/ui/collapsible.tsx +9 -0
  50. data/app/frontend/components/ui/scroll-area.tsx +46 -0
  51. data/app/frontend/components/ui/separator.tsx +29 -0
  52. data/app/frontend/entrypoints/App.tsx +40 -0
  53. data/app/frontend/entrypoints/Application.tsx +24 -0
  54. data/app/frontend/hooks/useAutosave.ts +61 -0
  55. data/app/frontend/layouts/Layout.tsx +38 -0
  56. data/app/frontend/lib/utils.ts +6 -0
  57. data/app/frontend/mockData.ts +272 -0
  58. data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
  59. data/app/frontend/pages/DatasetsPage.tsx +261 -0
  60. data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
  61. data/app/frontend/pages/DatasourcesPage.tsx +261 -0
  62. data/app/frontend/pages/EditModelPage.tsx +45 -0
  63. data/app/frontend/pages/EditTransformationPage.tsx +56 -0
  64. data/app/frontend/pages/ModelsPage.tsx +115 -0
  65. data/app/frontend/pages/NewDatasetPage.tsx +366 -0
  66. data/app/frontend/pages/NewModelPage.tsx +45 -0
  67. data/app/frontend/pages/NewTransformationPage.tsx +43 -0
  68. data/app/frontend/pages/SettingsPage.tsx +272 -0
  69. data/app/frontend/pages/ShowModelPage.tsx +30 -0
  70. data/app/frontend/pages/TransformationsPage.tsx +95 -0
  71. data/app/frontend/styles/application.css +100 -0
  72. data/app/frontend/types/dataset.ts +146 -0
  73. data/app/frontend/types/datasource.ts +33 -0
  74. data/app/frontend/types/preprocessing.ts +1 -0
  75. data/app/frontend/types.ts +113 -0
  76. data/app/helpers/easy_ml/application_helper.rb +10 -0
  77. data/app/jobs/easy_ml/application_job.rb +21 -0
  78. data/app/jobs/easy_ml/batch_job.rb +46 -0
  79. data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
  80. data/app/jobs/easy_ml/deploy_job.rb +13 -0
  81. data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
  82. data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
  83. data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
  84. data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
  85. data/app/jobs/easy_ml/training_job.rb +62 -0
  86. data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
  87. data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
  88. data/app/models/easy_ml/cleaner.rb +82 -0
  89. data/app/models/easy_ml/column.rb +124 -0
  90. data/app/models/easy_ml/column_history.rb +30 -0
  91. data/app/models/easy_ml/column_list.rb +122 -0
  92. data/app/models/easy_ml/concerns/configurable.rb +61 -0
  93. data/app/models/easy_ml/concerns/versionable.rb +19 -0
  94. data/app/models/easy_ml/dataset.rb +767 -0
  95. data/app/models/easy_ml/dataset_history.rb +56 -0
  96. data/app/models/easy_ml/datasource.rb +182 -0
  97. data/app/models/easy_ml/datasource_history.rb +24 -0
  98. data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
  99. data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
  100. data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
  101. data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
  102. data/app/models/easy_ml/deploy.rb +114 -0
  103. data/app/models/easy_ml/event.rb +79 -0
  104. data/app/models/easy_ml/feature.rb +437 -0
  105. data/app/models/easy_ml/feature_history.rb +38 -0
  106. data/app/models/easy_ml/model.rb +575 -41
  107. data/app/models/easy_ml/model_file.rb +133 -0
  108. data/app/models/easy_ml/model_file_history.rb +24 -0
  109. data/app/models/easy_ml/model_history.rb +51 -0
  110. data/app/models/easy_ml/models/base_model.rb +58 -0
  111. data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
  112. data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
  113. data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
  114. data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
  115. data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
  116. data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
  117. data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
  118. data/app/models/easy_ml/models/xgboost.rb +544 -5
  119. data/app/models/easy_ml/prediction.rb +44 -0
  120. data/app/models/easy_ml/retraining_job.rb +278 -0
  121. data/app/models/easy_ml/retraining_run.rb +184 -0
  122. data/app/models/easy_ml/settings.rb +37 -0
  123. data/app/models/easy_ml/splitter.rb +90 -0
  124. data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
  125. data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
  126. data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
  127. data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
  128. data/app/models/easy_ml/tuner_job.rb +56 -0
  129. data/app/models/easy_ml/tuner_run.rb +31 -0
  130. data/app/models/splitter_history.rb +6 -0
  131. data/app/serializers/easy_ml/column_serializer.rb +27 -0
  132. data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
  133. data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
  134. data/app/serializers/easy_ml/feature_serializer.rb +27 -0
  135. data/app/serializers/easy_ml/model_serializer.rb +90 -0
  136. data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
  137. data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
  138. data/app/serializers/easy_ml/settings_serializer.rb +9 -0
  139. data/app/views/layouts/easy_ml/application.html.erb +15 -0
  140. data/config/initializers/resque.rb +3 -0
  141. data/config/resque-pool.yml +6 -0
  142. data/config/routes.rb +39 -0
  143. data/config/spring.rb +1 -0
  144. data/config/vite.json +15 -0
  145. data/lib/easy_ml/configuration.rb +64 -0
  146. data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
  147. data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
  148. data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
  149. data/lib/easy_ml/core/model_evaluator.rb +161 -89
  150. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
  151. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
  152. data/lib/easy_ml/core/tuner.rb +123 -62
  153. data/lib/easy_ml/core.rb +0 -3
  154. data/lib/easy_ml/core_ext/hash.rb +24 -0
  155. data/lib/easy_ml/core_ext/pathname.rb +11 -5
  156. data/lib/easy_ml/data/date_converter.rb +90 -0
  157. data/lib/easy_ml/data/filter_extensions.rb +31 -0
  158. data/lib/easy_ml/data/polars_column.rb +126 -0
  159. data/lib/easy_ml/data/polars_reader.rb +297 -0
  160. data/lib/easy_ml/data/preprocessor.rb +280 -142
  161. data/lib/easy_ml/data/simple_imputer.rb +255 -0
  162. data/lib/easy_ml/data/splits/file_split.rb +252 -0
  163. data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
  164. data/lib/easy_ml/data/splits/split.rb +95 -0
  165. data/lib/easy_ml/data/splits.rb +9 -0
  166. data/lib/easy_ml/data/statistics_learner.rb +93 -0
  167. data/lib/easy_ml/data/synced_directory.rb +341 -0
  168. data/lib/easy_ml/data.rb +6 -2
  169. data/lib/easy_ml/engine.rb +105 -6
  170. data/lib/easy_ml/feature_store.rb +227 -0
  171. data/lib/easy_ml/features.rb +61 -0
  172. data/lib/easy_ml/initializers/inflections.rb +17 -3
  173. data/lib/easy_ml/logging.rb +2 -2
  174. data/lib/easy_ml/predict.rb +74 -0
  175. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
  176. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
  177. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
  178. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
  179. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
  180. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
  181. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
  182. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
  183. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
  184. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
  185. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
  186. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
  187. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
  188. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
  189. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
  190. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
  191. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
  192. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
  193. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
  194. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
  195. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
  196. data/lib/easy_ml/support/est.rb +5 -1
  197. data/lib/easy_ml/support/file_rotate.rb +79 -15
  198. data/lib/easy_ml/support/file_support.rb +9 -0
  199. data/lib/easy_ml/support/local_file.rb +24 -0
  200. data/lib/easy_ml/support/lockable.rb +62 -0
  201. data/lib/easy_ml/support/synced_file.rb +103 -0
  202. data/lib/easy_ml/support/utc.rb +5 -1
  203. data/lib/easy_ml/support.rb +6 -3
  204. data/lib/easy_ml/version.rb +4 -1
  205. data/lib/easy_ml.rb +7 -2
  206. metadata +355 -72
  207. data/app/models/easy_ml/models.rb +0 -5
  208. data/lib/easy_ml/core/model.rb +0 -30
  209. data/lib/easy_ml/core/model_core.rb +0 -181
  210. data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
  211. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
  212. data/lib/easy_ml/core/models/xgboost.rb +0 -10
  213. data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
  214. data/lib/easy_ml/core/models.rb +0 -10
  215. data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
  216. data/lib/easy_ml/core/uploaders.rb +0 -7
  217. data/lib/easy_ml/data/dataloader.rb +0 -6
  218. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
  219. data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
  220. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
  221. data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
  222. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
  223. data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
  224. data/lib/easy_ml/data/dataset/splits.rb +0 -11
  225. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
  226. data/lib/easy_ml/data/dataset/splitters.rb +0 -9
  227. data/lib/easy_ml/data/dataset.rb +0 -430
  228. data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
  229. data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
  230. data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
  231. data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
  232. data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
  233. data/lib/easy_ml/data/datasource.rb +0 -33
  234. data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
  235. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
  236. data/lib/easy_ml/deployment.rb +0 -5
  237. data/lib/easy_ml/support/synced_directory.rb +0 -134
  238. data/lib/easy_ml/transforms.rb +0 -29
  239. /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -0,0 +1,297 @@
1
module EasyML
  module Data
    # Reads the CSV / Parquet files found under +root_dir+ through Polars.
    #
    # Instances learn a schema for the files, convert CSVs to Parquet and
    # expose +query+ / +in_batches+ for reading. The class-level +query+ works
    # on an explicit list of files and supports keyset batching over a
    # primary-key column.
    class PolarsReader
      attr_accessor :root_dir, :polars_args, :refresh, :num_rows
      attr_reader :schema

      # @param options [Hash] recognised keys: +:root_dir+, +:polars_args+
      #   (defaults to +{}+), +:refresh+ (defaults to +false+), +:num_rows+
      #   and +:schema+.
      def initialize(options = {})
        @root_dir = options[:root_dir]
        @polars_args = options[:polars_args] || {}
        @refresh = options[:refresh] || false
        @num_rows = options[:num_rows]
        @schema = options[:schema]
      end

      # Stores the schema and mirrors it into +polars_args[:dtypes]+ so that
      # later reads pick it up.
      def schema=(value)
        @schema = value
        polars_args[:dtypes] = value
      end

      # Learns the schema, converts CSVs to Parquet and returns the files that
      # should be read afterwards.
      def normalize
        learn_dataset
        convert_to_parquet
        files
      end

      # Deletes the Parquet files under +root_dir+ (skipped in the Rails test
      # environment).
      def clean
        FileUtils.rm(parquet_files) unless Rails.env.test?
      end

      # Normalizes the dataset, then yields one eagerly-read frame per file.
      def in_batches
        normalize

        files.each do |file|
          yield read_file(file)
        end
      end

      # @return [Array<String>] every Parquet file followed by every CSV file.
      def all_files
        parquet_files + csv_files
      end

      # @return [Array<String>] the Parquet files when any exist and +refresh+
      #   is off, otherwise the CSV files.
      def files
        return csv_files if refresh

        parquet = parquet_files
        parquet.any? ? parquet : csv_files
      end

      # Shorthand for +query+ with no arguments.
      def data
        query
      end

      # Writes a Parquet copy of every CSV file into +root_dir+.
      #
      # Returns early with +files+ when Parquet files are already in use and
      # no explicit +columns+ were given.
      #
      # @param columns [Array, nil] objects responding to +name+ and
      #   +polars_type+; when given they drive both reading and casting.
      # @return [Array<String>] +files+ on the early return, otherwise the
      #   list of CSV paths that were converted.
      def convert_to_parquet(columns = nil)
        return files if any_parquet? && columns.nil?

        puts "Converting to Parquet..."

        csv_files.each do |path|
          df = read_file(path, columns)
          df = cast(df, columns)
          df.write_parquet(parquet_path_for(path))
        end
      end

      # Instance-level query; defaults +files+ to this reader's +files+ and
      # delegates to +PolarsReader.query+.
      def query(files = nil, drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false,
                batch_size: nil, batch_start: nil, batch_key: nil, &block)
        files ||= self.files
        PolarsReader.query(files, drop_cols: drop_cols, filter: filter, limit: limit,
                                  select: select, unique: unique, sort: sort, descending: descending,
                                  batch_size: batch_size, batch_start: batch_start, batch_key: batch_key, &block)
      end

      # Queries +files+.
      #
      # * Without +batch_size+ the combined lazy query is collected and
      #   returned.
      # * With +batch_size+ and no block an Enumerator over the batches is
      #   returned.
      # * With +batch_size+ and a block each batch (a lazy frame) is yielded.
      def self.query(files, drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false,
                     batch_size: nil, batch_start: nil, batch_key: nil, &block)
        unless batch_size.present?
          return query_files(files, drop_cols: drop_cols, filter: filter, limit: limit, select: select,
                                    unique: unique, sort: sort, descending: descending).collect
        end

        unless block_given?
          return batch_enumerator(files, drop_cols: drop_cols, filter: filter, limit: limit, select: select,
                                         unique: unique, sort: sort, descending: descending,
                                         batch_size: batch_size, batch_start: batch_start, batch_key: batch_key)
        end

        process_batches(files, drop_cols: drop_cols, filter: filter, limit: limit, select: select,
                               unique: unique, sort: sort, descending: descending,
                               batch_size: batch_size, batch_start: batch_start, batch_key: batch_key, &block)
      end

      # NOTE: the class-level helpers below were written underneath a bare
      # `private`, which has no effect on `def self.` methods. They have
      # therefore always been publicly callable and are kept public here.

      # Wraps +process_batches+ in an Enumerator so batches can be pulled
      # on demand.
      def self.batch_enumerator(files, drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false,
                                batch_size: nil, batch_start: nil, batch_key: nil, &block)
        Enumerator.new do |yielder|
          process_batches(files, drop_cols: drop_cols, filter: filter, limit: limit, select: select,
                                 unique: unique, sort: sort, descending: descending,
                                 batch_size: batch_size, batch_start: batch_start, batch_key: batch_key) do |batch|
            yielder << batch
          end
        end
      end

      # Yields successive lazy frames of at most +batch_size+ rows, ordered by
      # +batch_key+ (ascending, or descending when +descending+ is true).
      #
      # The first batch starts at +batch_start+ inclusively (or at the first
      # row when it is nil); every later batch starts strictly after the last
      # key of the previous one. A caller-supplied +filter+ is combined with
      # the key window rather than replaced by it. The +limit+ argument is
      # accepted for signature parity but is not applied; each batch is
      # limited to +batch_size+.
      #
      # @raise [RuntimeError] when +sort+ is given and differs from the batch
      #   key.
      def self.process_batches(files, drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false,
                               batch_size: nil, batch_start: nil, batch_key: nil, &block)
        batch_key ||= identify_primary_key(files, select: select)
        raise "When using batch_size, sort must match primary key (#{batch_key})" if sort.present? && batch_key != sort

        # Last key in iteration order among the rows matching the caller's
        # filter; nil means there is nothing to iterate.
        final_value = query_files(files, filter: filter, sort: batch_key, descending: !descending,
                                         select: batch_key, limit: 1).collect[batch_key].to_a.last
        return if final_value.nil?

        key_column = Polars.col(batch_key)
        cursor = batch_start
        inclusive = true
        finished = false

        until finished
          window = batch_window(key_column, cursor, descending: descending, inclusive: inclusive)
          batch_filter = (filter && window) ? (filter & window) : (filter || window)

          # Key of the last row of this batch; nil when the window is empty.
          last_key = query_files(files, filter: batch_filter, sort: batch_key, descending: descending, limit: batch_size)
                     .sort(batch_key, reverse: !descending).limit(1).select(batch_key).collect[batch_key].to_a.last
          break if last_key.nil?

          yield query_files(files, drop_cols: drop_cols, filter: batch_filter, limit: batch_size, select: select,
                                   unique: unique, sort: batch_key, descending: descending)

          finished = (last_key == final_value)
          cursor = last_key
          inclusive = false
        end
      end

      # Builds one lazy query over all +files+, applying (in this order)
      # filter, select, unique, sort, drop_cols and limit. Nothing is
      # collected here; the lazy frame is returned.
      def self.query_files(files, drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false)
        lazy_df = Polars.concat(to_lazy_frames(files))

        lazy_df = lazy_df.filter(filter) if filter
        lazy_df = lazy_df.select(select) if select.present?
        lazy_df = lazy_df.unique if unique
        lazy_df = lazy_df.sort(sort, reverse: descending) if sort

        # Only drop columns that are still present at this point.
        droppable = drop_cols & lazy_df.columns
        lazy_df = lazy_df.drop(droppable) unless droppable.empty?

        lazy_df = lazy_df.limit(limit) if limit
        lazy_df
      end

      # Picks the column of the first file whose values are all distinct.
      # When several columns qualify, the first one with an "id"-like segment
      # in its underscored name wins.
      #
      # @param select [Array, String, nil] restricts the candidate columns.
      # @raise [RuntimeError] when exactly one key cannot be determined.
      def self.identify_primary_key(files, select: nil)
        lazy_df = to_lazy_frames([files.first]).first

        if select.present?
          candidates = Array(select)
          lazy_df = lazy_df.select(candidates)
        else
          candidates = lazy_df.columns
        end

        # Count the rows once instead of once per candidate column.
        row_count = lazy_df.collect.height
        primary_keys = candidates.select do |col|
          lazy_df.select(col).unique.collect.height == row_count
        end

        if primary_keys.count > 1
          id_like = primary_keys.detect do |name|
            name.to_s.underscore.split("_").any? { |part| part.match?(/id/) }
          end
          primary_keys = [id_like] if id_like
        end

        raise "Unable to determine primary key for dataset" if primary_keys.count != 1

        primary_keys.first
      end

      # @return the schema of the first file, read lazily.
      def self.lazy_schema(files)
        to_lazy_frames([files.first]).first.schema
      end

      # Maps each path to a lazy scan chosen by file extension. A path with
      # any other extension maps to nil.
      def self.to_lazy_frames(files)
        files.map do |file|
          case Pathname.new(file).extname.delete(".")
          when "csv"
            Polars.scan_csv(file)
          when "parquet"
            Polars.scan_parquet(file)
          end
        end
      end

      # Key-range predicate for one batch, or nil when there is no cursor yet.
      def self.batch_window(key_column, cursor, descending:, inclusive:)
        return nil if cursor.nil?

        if descending
          inclusive ? key_column <= cursor : key_column < cursor
        else
          inclusive ? key_column >= cursor : key_column > cursor
        end
      end
      private_class_method :batch_window

      private

      # Eagerly reads one CSV or Parquet file. When +columns+ are given their
      # dtypes are merged into +polars_args[:dtypes]+ first (this persists on
      # the reader). Returns nil for any other extension.
      def read_file(file, columns = nil)
        if columns
          polars_args[:dtypes] ||= {}
          polars_args[:dtypes].merge!(columns_to_dtypes(columns))
        end

        case Pathname.new(file).extname.delete(".")
        when "csv"
          csv_args = filter_polars_args(Polars.method(:read_csv))
          csv_args.merge!(infer_schema_length: 1_000_000, null_values: ["\\N", "\\\\N", "NULL"])
          Polars.read_csv(file, **csv_args)
        when "parquet"
          Polars.read_parquet(file, **filter_polars_args(Polars.method(:read_parquet)))
        end
      end

      # Whether the files currently selected for reading include Parquet.
      def any_parquet?
        files.any? { |f| f.end_with?(".parquet") }
      end

      # Keeps only the +polars_args+ entries whose key is a parameter name of
      # +method+.
      def filter_polars_args(method)
        supported = method.parameters.map { |_, name| name }
        polars_args.select { |key, _| supported.include?(key) }
      end

      def csv_files
        Dir.glob(File.join(root_dir, "**/*.{csv}"))
      end

      def parquet_files
        Dir.glob(File.join(root_dir, "**/*.{parquet}"))
      end

      # Destination for the Parquet copy of +csv_path+: same base name with
      # only the final extension swapped, placed directly in +root_dir+.
      def parquet_path_for(csv_path)
        File.join(root_dir, "#{File.basename(csv_path, ".*")}.parquet")
      end

      # @return [Hash] column name => polars type for the given columns.
      def columns_to_dtypes(columns)
        columns.each_with_object({}) { |column, dtypes| dtypes[column.name] = column.polars_type }
      end

      # Casts the columns of +df+ that appear in +columns+ (or, when none are
      # given, in the learned schema) to their target dtype. Returns +df+
      # untouched when nothing needs casting.
      def cast(df, columns = [])
        lookup = columns && columns.any? ? columns_to_dtypes(columns) : (schema || {})
        cast_cols = lookup.keys & df.columns
        return df if cast_cols.empty?

        df.with_columns(
          cast_cols.map { |column| df[column].cast(lookup[column]).alias(column) }
        )
      end

      # Schema of the first Parquet file, or nil when there is none.
      def existing_parquet_schema
        return nil if parquet_files.empty?

        Polars.scan_parquet(parquet_files.first).limit(1).collect.schema
      end

      # Returns the known schema, or learns one (from an existing Parquet
      # file if possible, otherwise by scanning the files) and stores it via
      # the +schema=+ writer, which also updates +polars_args[:dtypes]+.
      def learn_dataset
        return schema if schema.present?

        self.schema = existing_parquet_schema || normalize_dataset
      end

      # Reads every file once, accumulates +num_rows+ and resolves one dtype
      # per column: Float64 if any file saw a float type, else Int64 if any
      # saw an integer type, else whatever PolarsColumn.determine_type infers
      # from the first file's column.
      #
      # @raise [RuntimeError] when there are no files, or a type cannot be
      #   determined.
      def normalize_dataset
        puts "Normalizing schema..."
        self.num_rows = 0

        paths = files
        raise "No CSV or Parquet files found in #{root_dir}" if paths.empty?

        first_file = nil
        observed = paths.each_with_object({}) do |path, types|
          df = read_file(path)
          first_file ||= df
          self.num_rows += df.shape[0]

          df.schema.each do |name, dtype|
            seen = (types[name] ||= [])
            seen << dtype unless seen.include?(dtype)
          end
        end

        observed.each_with_object({}) do |(name, dtypes), resolved|
          labels = dtypes.map { |klass| klass.to_s.gsub(/Polars::/, "") }
          resolved[name] = if labels.any? { |label| label.match?(/Float/) }
                             Polars::Float64
                           elsif labels.any? { |label| label.match?(/Int/) }
                             Polars::Int64
                           else
                             type = EasyML::Data::PolarsColumn.determine_type(first_file[name], true)
                             raise "Cannot determine polars type for field #{name}" if type.nil?

                             type
                           end
        end
      end
    end
  end
end