easy_ml 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. checksums.yaml +7 -0
  2. data/README.md +270 -0
  3. data/Rakefile +12 -0
  4. data/app/models/easy_ml/model.rb +59 -0
  5. data/app/models/easy_ml/models/xgboost.rb +9 -0
  6. data/app/models/easy_ml/models.rb +5 -0
  7. data/lib/easy_ml/core/model.rb +29 -0
  8. data/lib/easy_ml/core/model_core.rb +181 -0
  9. data/lib/easy_ml/core/model_evaluator.rb +137 -0
  10. data/lib/easy_ml/core/models/hyperparameters/base.rb +34 -0
  11. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +19 -0
  12. data/lib/easy_ml/core/models/hyperparameters.rb +8 -0
  13. data/lib/easy_ml/core/models/xgboost.rb +10 -0
  14. data/lib/easy_ml/core/models/xgboost_core.rb +220 -0
  15. data/lib/easy_ml/core/models.rb +10 -0
  16. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +63 -0
  17. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +50 -0
  18. data/lib/easy_ml/core/tuner/adapters.rb +10 -0
  19. data/lib/easy_ml/core/tuner.rb +105 -0
  20. data/lib/easy_ml/core/uploaders/model_uploader.rb +24 -0
  21. data/lib/easy_ml/core/uploaders.rb +7 -0
  22. data/lib/easy_ml/core.rb +9 -0
  23. data/lib/easy_ml/core_ext/pathname.rb +9 -0
  24. data/lib/easy_ml/core_ext.rb +5 -0
  25. data/lib/easy_ml/data/dataloader.rb +6 -0
  26. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +31 -0
  27. data/lib/easy_ml/data/dataset/data/sample_info.json +1 -0
  28. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +1 -0
  29. data/lib/easy_ml/data/dataset/splits/file_split.rb +140 -0
  30. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +49 -0
  31. data/lib/easy_ml/data/dataset/splits/split.rb +98 -0
  32. data/lib/easy_ml/data/dataset/splits.rb +11 -0
  33. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +43 -0
  34. data/lib/easy_ml/data/dataset/splitters.rb +9 -0
  35. data/lib/easy_ml/data/dataset.rb +430 -0
  36. data/lib/easy_ml/data/datasource/datasource_factory.rb +60 -0
  37. data/lib/easy_ml/data/datasource/file_datasource.rb +40 -0
  38. data/lib/easy_ml/data/datasource/merged_datasource.rb +64 -0
  39. data/lib/easy_ml/data/datasource/polars_datasource.rb +41 -0
  40. data/lib/easy_ml/data/datasource/s3_datasource.rb +89 -0
  41. data/lib/easy_ml/data/datasource.rb +33 -0
  42. data/lib/easy_ml/data/preprocessor/preprocessor.rb +205 -0
  43. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +403 -0
  44. data/lib/easy_ml/data/preprocessor/utils.rb +17 -0
  45. data/lib/easy_ml/data/preprocessor.rb +238 -0
  46. data/lib/easy_ml/data/utils.rb +50 -0
  47. data/lib/easy_ml/data.rb +8 -0
  48. data/lib/easy_ml/deployment.rb +5 -0
  49. data/lib/easy_ml/engine.rb +26 -0
  50. data/lib/easy_ml/initializers/inflections.rb +4 -0
  51. data/lib/easy_ml/logging.rb +38 -0
  52. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +42 -0
  53. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +23 -0
  54. data/lib/easy_ml/support/age.rb +27 -0
  55. data/lib/easy_ml/support/est.rb +1 -0
  56. data/lib/easy_ml/support/file_rotate.rb +23 -0
  57. data/lib/easy_ml/support/git_ignorable.rb +66 -0
  58. data/lib/easy_ml/support/synced_directory.rb +134 -0
  59. data/lib/easy_ml/support/utc.rb +1 -0
  60. data/lib/easy_ml/support.rb +10 -0
  61. data/lib/easy_ml/trainer.rb +92 -0
  62. data/lib/easy_ml/transforms.rb +29 -0
  63. data/lib/easy_ml/version.rb +5 -0
  64. data/lib/easy_ml.rb +23 -0
  65. metadata +353 -0
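
For orientation before the per-file contents below: the inline comments in data/lib/easy_ml/data/dataset.rb describe how an application is expected to subclass EasyML::Data::Dataset and configure it through the GlueGun::DSL macros. The following is a minimal sketch assembled only from those comments; the class name, column names, bucket, and splitter arguments are illustrative placeholders, and the splitter macro form is assumed to mirror the documented datasource macro rather than taken from the gem.

    # Hypothetical subclass; every concrete value here is a placeholder.
    class RevenueDataset < EasyML::Data::Dataset
      target "REVENUE"                                                  # column to predict
      datasource :s3, s3_bucket: "your-bucket", s3_prefix: "reports/"   # S3-backed datasource option
      splitter :date, date_col: "CREATED_DATE", months_test: 2, months_valid: 2
    end

The same configuration can also be passed at initialization, e.g. Dataset.new(target: "REV"), per the comments in the source.
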
data/lib/easy_ml/data/dataset.rb
@@ -0,0 +1,430 @@
+require "polars"
+require_relative "datasource"
+require_relative "dataset/splitters"
+require_relative "dataset/splits"
+
+# Dataset is responsible for:
+#
+# 1) Ensuring data is synced from its source (e.g. S3 — delegates to datasource)
+# 2) Ensuring the data is properly split into train, test, and validation data (delegates to splitter)
+# 3) Knowing where data is stored on disk, and pulling batches of data into memory
+# 4) Knowing where to save updated data (after preprocessing steps)
+#
+module EasyML
+  module Data
+    class Dataset
+      include GlueGun::DSL
+      include EasyML::Logging
+      include EasyML::Data::Utils
+
+      # include GitIgnorable
+      # gitignore :root_dir do |dir|
+      #   if Rails.env.test? # Don't gitignore our test files
+      #     nil
+      #   else
+      #     File.join(dir, "files/**/*")
+      #   end
+      # end
+
+      # These helpers are defined in GlueGun::DSL.
+      #
+      # define_attr defines configurable attributes for subclasses,
+      # for example, a class sub-classing Dataset will want to define its
+      # target (e.g. the column we are trying to predict)
+      #
+      # These can either be defined on a class-level like this:
+      #
+      #   class Dataset < EasyML::Data::Dataset
+      #     target "REVENUE"
+      #   end
+      #
+      # Or passed in during initialization:
+      #
+      #   Dataset.new(target: "REV")
+      #
+      attribute :verbose, :boolean, default: false
+      attribute :today, :date, default: -> { UTC.now }
+      def today=(value)
+        super(value.in_time_zone(UTC).to_date)
+      end
+      attribute :target, :string
+      validates :target, presence: true
+
+      attribute :batch_size, :integer, default: 50_000
+
+      attribute :root_dir, :string
+      validates :root_dir, presence: true
+      def root_dir=(value)
+        super(Pathname.new(value).append("data").to_s)
+      end
+
+      attribute :sample, :float, default: 1.0
+      attribute :drop_if_null, :array, default: []
+
+      # define_attr can also define default values, as well as argument helpers
+      attribute :polars_args, :hash, default: {}
+      def polars_args=(args)
+        super(args.deep_symbolize_keys.inject({}) do |hash, (k, v)|
+          hash.tap do
+            hash[k] = v
+            hash[k] = v.stringify_keys if k == :dtypes
+          end
+        end)
+      end
+
+      attribute :transforms, default: nil
+      validate :transforms_are_transforms
+      def transforms_are_transforms
+        return if transforms.nil? || transforms.respond_to?(:transform)
+
+        errors.add(:transforms, "Must respond to transform, try including EasyML::Data::Transforms")
+      end
+
+      attribute :drop_cols, :array, default: []
+
+      dependency :datasource, EasyML::Data::Datasource::DatasourceFactory
+
+      # dependency defines a configurable dependency, with optional args,
+      # for example, here we define a datasource:
+      #
+      #   class YourDataset
+      #     datasource :s3, s3_bucket: "fundera-bart", s3_prefix: "xyz"
+      #     # This automatically uses the S3Datasource class to pull data
+      #   end
+      #
+      # If we define any models based on other data sources (e.g. postgres),
+      # you would just define a new PostgresDatasource
+      #
+
+      # Here we define splitter options, inspired by common Python data splitting techniques:
+      #
+      # 1. Date-based splitter (similar to TimeSeriesSplit from sklearn)
+      #
+      # NOT IMPLEMENTED (but you could implement as necessary):
+      # 2. Random splitter (similar to train_test_split from sklearn)
+      # 3. Stratified splitter (similar to StratifiedKFold from sklearn)
+      # 4. Group-based splitter (similar to GroupKFold from sklearn)
+      # 5. Sliding window splitter (similar to TimeSeriesSplit with a sliding window)
+      #
+      dependency :splitter do |dependency|
+        dependency.option :date do |option|
+          option.default
+          option.set_class EasyML::Data::Dataset::Splitters::DateSplitter
+          option.bind_attribute :today, required: true
+          option.bind_attribute :date_col, required: true
+          option.bind_attribute :months_test, required: true
+          option.bind_attribute :months_valid, required: true
+        end
+      end
+
+      # Here we define the preprocessing logic.
+      # Aka what to do with null values. For instance:
+      #
+      #   class YourDataset
+      #     preprocessing_steps: {
+      #       training: {
+      #         annual_revenue: {
+      #           clip: {min: 0, max: 1_000_000} # Clip values between these
+      #           median: true, # Then learn the median based on clipped values
+      #         },
+      #         created_date: { ffill: true } # During training, use the latest value in the dataset
+      #       },
+      #       inference: {
+      #         created_date: { today: true } # During inference, use the current date
+      #       }
+      #     }
+      #   end
+      #
+      attribute :preprocessing_steps, :hash, default: {}
+      dependency :preprocessor do |dependency|
+        dependency.set_class EasyML::Data::Preprocessor
+        dependency.bind_attribute :directory, source: :root_dir do |value|
+          Pathname.new(value).append("preprocessor")
+        end
+        dependency.bind_attribute :preprocessing_steps
+      end
+
+      # Here we define the raw dataset (uses the Split class)
+      # We use this to learn dataset statistics (e.g. median annual revenue)
+      # But we NEVER overwrite it
+      #
+      dependency :raw do |dependency|
+        dependency.option :file do |option|
+          option.default
+          option.set_class EasyML::Data::Dataset::Splits::FileSplit
+          option.bind_attribute :dir, source: :root_dir do |value|
+            Pathname.new(value).append("files/splits/raw")
+          end
+          option.bind_attribute :polars_args
+          option.bind_attribute :max_rows_per_file, source: :batch_size
+          option.bind_attribute :batch_size
+          option.bind_attribute :sample
+          option.bind_attribute :verbose
+        end
+
+        dependency.option :memory do |option|
+          option.set_class EasyML::Data::Dataset::Splits::InMemorySplit
+          option.bind_attribute :sample
+        end
+
+        dependency.when do |_dep|
+          { option: :memory } if datasource.is_a?(EasyML::Data::Datasource::PolarsDatasource)
+        end
+      end
+
+      # Here we define the processed dataset (uses the Split class)
+      # After we learn the dataset statistics, we fill null values
+      # using the learned statistics (e.g. fill annual_revenue with median annual_revenue)
+      #
+      dependency :processed do |dependency|
+        dependency.option :file do |option|
+          option.default
+          option.set_class EasyML::Data::Dataset::Splits::FileSplit
+          option.bind_attribute :dir, source: :root_dir do |value|
+            Pathname.new(value).append("files/splits/processed")
+          end
+          option.bind_attribute :polars_args
+          option.bind_attribute :max_rows_per_file, source: :batch_size
+          option.bind_attribute :batch_size
+          option.bind_attribute :sample
+          option.bind_attribute :verbose
+        end
+
+        dependency.option :memory do |option|
+          option.set_class EasyML::Data::Dataset::Splits::InMemorySplit
+          option.bind_attribute :sample
+        end
+
+        dependency.when do |_dep|
+          { option: :memory } if datasource.is_a?(EasyML::Data::Datasource::PolarsDatasource)
+        end
+      end
+
+      delegate :new_data_available?, :synced?, :stale?, to: :datasource
+      delegate :train, :test, :valid, to: :split
+      delegate :splits, to: :splitter
+
+      def refresh!
+        refresh_datasource
+        split_data
+        fit
+        normalize_all
+        alert_nulls
+      end
+
+      def normalize(df = nil)
+        df = drop_nulls(df)
+        df = apply_transforms(df)
+        preprocessor.postprocess(df)
+      end
+
+      # A "production" preprocessor is predicting live values (e.g. used on live webservers)
+      # A "development" preprocessor is used during training (e.g. we're learning new values for the dataset)
+      #
+      delegate :statistics, to: :preprocessor
+
+      def train(split_ys: false, all_columns: false, &block)
+        load_data(:train, split_ys: split_ys, all_columns: all_columns, &block)
+      end
+
+      def valid(split_ys: false, all_columns: false, &block)
+        load_data(:valid, split_ys: split_ys, all_columns: all_columns, &block)
+      end
+
+      def test(split_ys: false, all_columns: false, &block)
+        load_data(:test, split_ys: split_ys, all_columns: all_columns, &block)
+      end
+
+      def data(split_ys: false, all_columns: false)
+        if split_ys
+          x_train, y_train = train(split_ys: true, all_columns: all_columns)
+          x_valid, y_valid = valid(split_ys: true, all_columns: all_columns)
+          x_test, y_test = test(split_ys: true, all_columns: all_columns)
+
+          xs = Polars.concat([x_train, x_valid, x_test])
+          ys = Polars.concat([y_train, y_valid, y_test])
+          [xs, ys]
+        else
+          train_df = train(split_ys: false, all_columns: all_columns)
+          valid_df = valid(split_ys: false, all_columns: all_columns)
+          test_df = test(split_ys: false, all_columns: all_columns)
+
+          Polars.concat([train_df, valid_df, test_df])
+        end
+      end
+
+      def cleanup
+        raw.cleanup
+        processed.cleanup
+      end
+
+      def check_nulls(data_type = :processed)
+        result = %i[train test valid].each_with_object({}) do |segment, acc|
+          segment_result = { nulls: {}, total: 0 }
+
+          data_source = data_type == :raw ? raw : processed
+          data_source.read(segment) do |df|
+            df_nulls = null_check(df)
+            df.columns.each do |column|
+              segment_result[:nulls][column] ||= { null_count: 0, total_count: 0 }
+              if df_nulls && df_nulls[column]
+                segment_result[:nulls][column][:null_count] += df_nulls[column][:null_count]
+              end
+              segment_result[:nulls][column][:total_count] += df.height
+            end
+          end
+
+          segment_result[:nulls].each do |column, counts|
+            percentage = (counts[:null_count].to_f / counts[:total_count] * 100).round(1)
+            acc[column] ||= {}
+            acc[column][segment] = percentage
+          end
+        end
+
+        # Remove columns that have no nulls across all segments
+        result.reject! { |_, v| v.values.all?(&:zero?) }
+
+        result.empty? ? nil : result
+      end
+
+      def processed?
+        !should_split?
+      end
+
+      def decode_labels(ys, col: nil)
+        preprocessor.decode_labels(ys, col: col.nil? ? target : col)
+      end
+
+      private
+
+      def refresh_datasource
+        datasource.refresh!
+      end
+      log_method :refresh!, "Refreshing datasource", verbose: true
+
+      def normalize_all
+        processed.cleanup
+
+        %i[train test valid].each do |segment|
+          raw.read(segment) do |df|
+            processed_df = normalize(df)
+            processed.save(segment, processed_df)
+          end
+        end
+      end
+      log_method :normalize_all, "Normalizing dataset", verbose: true
+
+      def drop_nulls(df)
+        return df if drop_if_null.nil? || drop_if_null.empty?
+
+        df.drop_nulls(subset: drop_if_null)
+      end
+
+      def drop_columns(all_columns: false)
+        if all_columns
+          []
+        else
+          drop_cols
+        end
+      end
+
+      def load_data(segment, split_ys: false, all_columns: false, &block)
+        drop_cols = drop_columns(all_columns: all_columns)
+        if processed?
+          processed.read(segment, split_ys: split_ys, target: target, drop_cols: drop_cols, &block)
+        else
+          raw.read(segment, split_ys: split_ys, target: target, drop_cols: drop_cols, &block)
+        end
+      end
+
+      def fit(xs = nil)
+        xs = raw.train if xs.nil?
+
+        preprocessor.fit(xs)
+      end
+      log_method :fit, "Learning statistics", verbose: true
+
+      def in_batches(segment, processed: true, &block)
+        if processed
+          processed.read(segment, &block)
+        else
+          raw.read(segment, &block)
+        end
+      end
+
+      def split_data
+        return unless should_split?
+
+        cleanup
+        datasource.in_batches do |df|
+          train_df, valid_df, test_df = splitter.split(df)
+          raw.save(:train, train_df)
+          raw.save(:valid, valid_df)
+          raw.save(:test, test_df)
+        end
+
+        # Update the persisted sample size after splitting
+        save_previous_sample(sample)
+      end
+      log_method :split_data, "Splitting data", verbose: true
+
+      def should_split?
+        split_timestamp = raw.split_at
+        previous_sample = load_previous_sample
+        sample_increased = previous_sample && sample > previous_sample
+        previous_sample.nil? || split_timestamp.nil? || split_timestamp < datasource.last_updated_at || sample_increased
+      end
+
+      def sample_info_file
+        File.join(root_dir, "sample_info.json")
+      end
+
+      def save_previous_sample(sample_size)
+        File.write(sample_info_file, JSON.generate({ previous_sample: sample_size }))
+      end
+
+      def load_previous_sample
+        return nil unless File.exist?(sample_info_file)
+
+        JSON.parse(File.read(sample_info_file))["previous_sample"]
+      end
+
+      def apply_transforms(df)
+        if transforms.nil?
+          df
+        else
+          transforms.apply_transforms(df)
+        end
+      end
+
+      def alert_nulls
+        processed_nulls = check_nulls(:processed)
+        raw_nulls = check_nulls(:raw)
+
+        if processed_nulls
+          log_warning("Nulls found in the processed dataset:")
+          processed_nulls.each do |column, segments|
+            segments.each do |segment, percentage|
+              log_warning(" #{column} - #{segment}: #{percentage}% nulls")
+            end
+          end
+        else
+          log_info("No nulls found in the processed dataset.")
+        end
+
+        if raw_nulls
+          raw_nulls.each do |column, segments|
+            segments.each do |segment, percentage|
+              if percentage > 50
+                log_warning("Data processing issue detected: #{column} - #{segment} has #{percentage}% nulls in the raw dataset")
+              end
+            end
+          end
+        end

+        nil
+      end
+      log_method :alert_nulls, "Checking for nulls", verbose: true
+    end
+  end
+end
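
Dataset#refresh! chains refresh_datasource, split_data, fit, normalize_all, and alert_nulls, and the train/valid/test readers go through load_data, which serves processed splits once they exist. A hedged usage sketch follows; it assumes a dataset instance configured as in the DSL comments above, and the variable names are illustrative.

    dataset.refresh!                                   # sync source, split, learn stats, normalize

    x_train, y_train = dataset.train(split_ys: true)   # features and labels for the train split

    dataset.test do |df|                               # with a block, batches are yielded instead
      puts df.shape
    end
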
data/lib/easy_ml/data/datasource/datasource_factory.rb
@@ -0,0 +1,60 @@
+require_relative "merged_datasource"
+
+module EasyML
+  module Data
+    class Datasource
+      class DatasourceFactory
+        include GlueGun::DSL
+
+        dependency :datasource do |dependency|
+          dependency.option :s3 do |option|
+            option.default
+            option.set_class EasyML::Data::Datasource::S3Datasource
+            option.bind_attribute :root_dir do |value|
+              Pathname.new(value).append("files")
+            end
+            option.bind_attribute :polars_args, default: {}
+            option.bind_attribute :s3_bucket, required: true
+            option.bind_attribute :s3_prefix
+            option.bind_attribute :s3_access_key_id, required: true
+            option.bind_attribute :s3_secret_access_key, required: true
+          end
+
+          dependency.option :file do |option|
+            option.set_class EasyML::Data::Datasource::FileDatasource
+            option.bind_attribute :root_dir do |value|
+              Pathname.new(value).append("files/raw")
+            end
+            option.bind_attribute :polars_args
+          end
+
+          dependency.option :polars do |option|
+            option.set_class EasyML::Data::Datasource::PolarsDatasource
+            option.bind_attribute :df
+          end
+
+          dependency.option :merged do |option|
+            option.set_class EasyML::Data::Datasource::MergedDatasource
+            option.bind_attribute :root_dir
+          end
+
+          # Passing in datasource: Polars::DataFrame will wrap properly
+          # So will passing in datasource /path/to/dir
+          dependency.when do |dep|
+            case dep
+            when Polars::DataFrame
+              { option: :polars, as: :df }
+            when String, Pathname
+              { option: :file, as: :root_dir }
+            end
+          end
+        end
+      end
+    end
+  end
+end
+
+# Do this here otherwise we'll end up with a circular dependency
+class EasyML::Data::Datasource::MergedDatasource
+  dependency :datasources, DatasourceFactory
+end
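
The factory's dependency.when block, per the comments above, coerces plain values into the right datasource option. A short sketch of that behavior, assuming a Dataset class built on this factory; the data values and path are illustrative.

    df = Polars::DataFrame.new({ "REVENUE" => [100, 200] })

    Dataset.new(datasource: df)               # wrapped via the :polars option (as: :df)
    Dataset.new(datasource: "/path/to/dir")   # wrapped via the :file option (as: :root_dir)
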
data/lib/easy_ml/data/datasource/file_datasource.rb
@@ -0,0 +1,40 @@
+module EasyML::Data
+  class Datasource
+    class FileDatasource < Datasource
+      include GlueGun::DSL
+
+      attribute :root_dir, :string
+      attribute :polars_args, :hash, default: {}
+
+      validates :root_dir, presence: true
+
+      def in_batches(of: 10_000)
+        files.each do |file|
+          df = Polars.read_csv(file, **polars_args)
+          yield df
+        end
+      end
+
+      def files
+        Dir.glob(File.join(root_dir, "**/*.csv")).sort
+      end
+
+      def last_updated_at
+        files.map { |file| File.mtime(file) }.max
+      end
+
+      def refresh!
+        # No need to refresh for directory-based datasource
+      end
+
+      def data
+        combined_df = nil
+        files.each do |file|
+          df = Polars.read_csv(file, **polars_args)
+          combined_df = combined_df.nil? ? df : combined_df.vstack(df)
+        end
+        combined_df
+      end
+    end
+  end
+end
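
FileDatasource#in_batches reads each CSV under root_dir and yields one DataFrame per file; note that in this version the of: keyword is accepted but not used to re-chunk rows. A sketch, assuming keyword initialization via the GlueGun attributes and an illustrative directory path:

    source = EasyML::Data::Datasource::FileDatasource.new(root_dir: "/data/reports")
    source.in_batches do |df|
      puts "#{df.height} rows from one CSV"
    end
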
data/lib/easy_ml/data/datasource/merged_datasource.rb
@@ -0,0 +1,64 @@
+module EasyML::Data
+  class Datasource
+    class MergedDatasource < Datasource
+      include GlueGun::DSL
+
+      attribute :root_dir, :string
+      attribute :polars_args, :hash, default: {}
+      attribute :merge
+      validates :root_dir, presence: true
+      validates :merge, presence: true
+
+      def in_batches(of: 10_000, &block)
+        Polars.read_csv(file_path, **polars_args).iter_batches(batch_size: of, &block)
+      end
+
+      def file_path
+        @file_path ||= File.join(root_dir, "merged_data.csv")
+      end
+
+      def last_updated_at
+        datasources.map(&:last_updated_at).min
+      end
+
+      def refresh!
+        cleanup
+        if datasources.is_a?(Array)
+          datasources.each(&:refresh!)
+        elsif datasources.is_a?(Hash)
+          datasources.values.each(&:refresh!)
+        end
+      end
+
+      def data
+        @data ||= if file_exists?
+                    Polars.read_csv(file_path, **polars_args)
+                  else
+                    merge_and_save
+                  end
+      end
+
+      def cleanup
+        FileUtils.rm_f(file_path)
+      end
+
+      private
+
+      def file_exists?
+        File.exist?(file_path)
+      end
+
+      def merge_and_save
+        refresh!
+        merge.call(datasources).tap do |merged_data|
+          save_to_file(merged_data)
+        end
+      end
+
+      def save_to_file(df)
+        FileUtils.mkdir_p(root_dir)
+        df.write_csv(file_path)
+      end
+    end
+  end
+end
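
MergedDatasource requires a callable merge: merge.call(datasources) must return a single Polars::DataFrame, which data then writes to root_dir/merged_data.csv and memoizes. A hedged sketch, assuming keyword initialization and two already-configured datasources; the keys and join columns are illustrative.

    merged = EasyML::Data::Datasource::MergedDatasource.new(
      root_dir: "/data/merged",
      datasources: { orders: orders_source, customers: customers_source },
      merge: lambda do |sources|
        sources[:orders].data.join(sources[:customers].data, on: "customer_id", how: "left")
      end
    )
    merged.data   # merges, saves merged_data.csv, and caches the result
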
data/lib/easy_ml/data/datasource/polars_datasource.rb
@@ -0,0 +1,41 @@
+module EasyML::Data
+  class Datasource
+    class PolarsDatasource < Datasource
+      include GlueGun::DSL
+
+      attribute :df
+      validate :df_is_dataframe
+      def df_is_dataframe
+        return if df.nil? || df.is_a?(Polars::DataFrame)
+
+        errors.add(:df, "Must be an instance of Polars::DataFrame")
+      end
+      attr_accessor :last_updated_at
+
+      def initialize(options)
+        super
+        @last_updated_at = Time.now
+      end
+
+      def in_batches(of: 10_000)
+        total_rows = df.shape[0]
+        (0...total_rows).step(of) do |start|
+          end_index = [start + of, total_rows].min
+          yield df.slice(start, end_index - start)
+        end
+      end
+
+      def files
+        [] # No files, as this is in-memory
+      end
+
+      def refresh!
+        # No need to refresh for in-memory datasource
+      end
+
+      def data
+        df
+      end
+    end
+  end
+end
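
PolarsDatasource wraps an in-memory DataFrame and slices it into fixed-size batches. A small sketch with illustrative data:

    df = Polars::DataFrame.new({ "x" => (1..25_000).to_a })
    source = EasyML::Data::Datasource::PolarsDatasource.new(df: df)
    source.in_batches(of: 10_000) do |batch|
      puts batch.height   # => 10000, 10000, 5000
    end
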