easy_ml 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +270 -0
- data/Rakefile +12 -0
- data/app/models/easy_ml/model.rb +59 -0
- data/app/models/easy_ml/models/xgboost.rb +9 -0
- data/app/models/easy_ml/models.rb +5 -0
- data/lib/easy_ml/core/model.rb +29 -0
- data/lib/easy_ml/core/model_core.rb +181 -0
- data/lib/easy_ml/core/model_evaluator.rb +137 -0
- data/lib/easy_ml/core/models/hyperparameters/base.rb +34 -0
- data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +19 -0
- data/lib/easy_ml/core/models/hyperparameters.rb +8 -0
- data/lib/easy_ml/core/models/xgboost.rb +10 -0
- data/lib/easy_ml/core/models/xgboost_core.rb +220 -0
- data/lib/easy_ml/core/models.rb +10 -0
- data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +63 -0
- data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +50 -0
- data/lib/easy_ml/core/tuner/adapters.rb +10 -0
- data/lib/easy_ml/core/tuner.rb +105 -0
- data/lib/easy_ml/core/uploaders/model_uploader.rb +24 -0
- data/lib/easy_ml/core/uploaders.rb +7 -0
- data/lib/easy_ml/core.rb +9 -0
- data/lib/easy_ml/core_ext/pathname.rb +9 -0
- data/lib/easy_ml/core_ext.rb +5 -0
- data/lib/easy_ml/data/dataloader.rb +6 -0
- data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +31 -0
- data/lib/easy_ml/data/dataset/data/sample_info.json +1 -0
- data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +1 -0
- data/lib/easy_ml/data/dataset/splits/file_split.rb +140 -0
- data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +49 -0
- data/lib/easy_ml/data/dataset/splits/split.rb +98 -0
- data/lib/easy_ml/data/dataset/splits.rb +11 -0
- data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +43 -0
- data/lib/easy_ml/data/dataset/splitters.rb +9 -0
- data/lib/easy_ml/data/dataset.rb +430 -0
- data/lib/easy_ml/data/datasource/datasource_factory.rb +60 -0
- data/lib/easy_ml/data/datasource/file_datasource.rb +40 -0
- data/lib/easy_ml/data/datasource/merged_datasource.rb +64 -0
- data/lib/easy_ml/data/datasource/polars_datasource.rb +41 -0
- data/lib/easy_ml/data/datasource/s3_datasource.rb +89 -0
- data/lib/easy_ml/data/datasource.rb +33 -0
- data/lib/easy_ml/data/preprocessor/preprocessor.rb +205 -0
- data/lib/easy_ml/data/preprocessor/simple_imputer.rb +403 -0
- data/lib/easy_ml/data/preprocessor/utils.rb +17 -0
- data/lib/easy_ml/data/preprocessor.rb +238 -0
- data/lib/easy_ml/data/utils.rb +50 -0
- data/lib/easy_ml/data.rb +8 -0
- data/lib/easy_ml/deployment.rb +5 -0
- data/lib/easy_ml/engine.rb +26 -0
- data/lib/easy_ml/initializers/inflections.rb +4 -0
- data/lib/easy_ml/logging.rb +38 -0
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +42 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +23 -0
- data/lib/easy_ml/support/age.rb +27 -0
- data/lib/easy_ml/support/est.rb +1 -0
- data/lib/easy_ml/support/file_rotate.rb +23 -0
- data/lib/easy_ml/support/git_ignorable.rb +66 -0
- data/lib/easy_ml/support/synced_directory.rb +134 -0
- data/lib/easy_ml/support/utc.rb +1 -0
- data/lib/easy_ml/support.rb +10 -0
- data/lib/easy_ml/trainer.rb +92 -0
- data/lib/easy_ml/transforms.rb +29 -0
- data/lib/easy_ml/version.rb +5 -0
- data/lib/easy_ml.rb +23 -0
- metadata +353 -0
data/lib/easy_ml/data/dataset.rb
@@ -0,0 +1,430 @@
require "polars"
require_relative "datasource"
require_relative "dataset/splitters"
require_relative "dataset/splits"

# Dataset is responsible for:
#
# 1) Ensuring data is synced from its source (e.g. S3 — delegates to datasource)
# 2) Ensuring the data is properly split into train, test, and validation data (delegates to splitter)
# 3) Knowing where data is stored on disk, and pulling batches of data into memory
# 4) Knowing where to save updated data (after preprocessing steps)
#
module EasyML
  module Data
    class Dataset
      include GlueGun::DSL
      include EasyML::Logging
      include EasyML::Data::Utils

      # include GitIgnorable
      # gitignore :root_dir do |dir|
      #   if Rails.env.test? # Don't gitignore our test files
      #     nil
      #   else
      #     File.join(dir, "files/**/*")
      #   end
      # end

      # These helpers are defined in GlueGun::DSL.
      #
      # define_attr defines configurable attributes for subclasses,
      # for example, a class sub-classing Dataset will want to define its
      # target (e.g. the column we are trying to predict)
      #
      # These can either be defined on a class-level like this:
      #
      # class Dataset < EasyML::Data::Dataset
      #   target "REVENUE"
      # end
      #
      # Or passed in during initialization:
      #
      # Dataset.new(target: "REV")
      #
      attribute :verbose, :boolean, default: false
      attribute :today, :date, default: -> { UTC.now }
      def today=(value)
        super(value.in_time_zone(UTC).to_date)
      end
      attribute :target, :string
      validates :target, presence: true

      attribute :batch_size, :integer, default: 50_000

      attribute :root_dir, :string
      validates :root_dir, presence: true
      def root_dir=(value)
        super(Pathname.new(value).append("data").to_s)
      end

      attribute :sample, :float, default: 1.0
      attribute :drop_if_null, :array, default: []

      # define_attr can also define default values, as well as argument helpers
      attribute :polars_args, :hash, default: {}
      def polars_args=(args)
        super(args.deep_symbolize_keys.inject({}) do |hash, (k, v)|
          hash.tap do
            hash[k] = v
            hash[k] = v.stringify_keys if k == :dtypes
          end
        end)
      end

      attribute :transforms, default: nil
      validate :transforms_are_transforms
      def transforms_are_transforms
        return if transforms.nil? || transforms.respond_to?(:transform)

        errors.add(:transforms, "Must respond to transform, try including EasyML::Data::Transforms")
      end

      attribute :drop_cols, :array, default: []

      dependency :datasource, EasyML::Data::Datasource::DatasourceFactory

      # dependency defines a configurable dependency, with optional args,
      # for example, here we define a datasource:
      #
      # class YourDataset
      #   datasource :s3, s3_bucket: "fundera-bart", s3_prefix: "xyz"
      #   # This automatically uses the S3Datasource class to pull data
      # end
      #
      # If we define any models based on other data sources (e.g. postgres),
      # you would just define a new PostgresDatasource
      #

      # Here we define splitter options, inspired by common Python data splitting techniques:
      #
      # 1. Date-based splitter (similar to TimeSeriesSplit from sklearn)
      #
      # NOT IMPLEMENTED (but you could implement as necessary):
      # 2. Random splitter (similar to train_test_split from sklearn)
      # 3. Stratified splitter (similar to StratifiedKFold from sklearn)
      # 4. Group-based splitter (similar to GroupKFold from sklearn)
      # 5. Sliding window splitter (similar to TimeSeriesSplit with a sliding window)
      #
      dependency :splitter do |dependency|
        dependency.option :date do |option|
          option.default
          option.set_class EasyML::Data::Dataset::Splitters::DateSplitter
          option.bind_attribute :today, required: true
          option.bind_attribute :date_col, required: true
          option.bind_attribute :months_test, required: true
          option.bind_attribute :months_valid, required: true
        end
      end

      # Here we define the preprocessing logic.
      # Aka what to do with null values. For instance:
      #
      # class YourDataset
      #   preprocessing_steps: {
      #     training: {
      #       annual_revenue: {
      #         clip: {min: 0, max: 1_000_000} # Clip values between these
      #         median: true, # Then learn the median based on clipped values
      #       },
      #       created_date: { ffill: true } # During training, use the latest value in the dataset
      #     },
      #     inference: {
      #       created_date: { today: true } # During inference, use the current date
      #     }
      #   }
      # end
      #
      attribute :preprocessing_steps, :hash, default: {}
      dependency :preprocessor do |dependency|
        dependency.set_class EasyML::Data::Preprocessor
        dependency.bind_attribute :directory, source: :root_dir do |value|
          Pathname.new(value).append("preprocessor")
        end
        dependency.bind_attribute :preprocessing_steps
      end

      # Here we define the raw dataset (uses the Split class)
      # We use this to learn dataset statistics (e.g. median annual revenue)
      # But we NEVER overwrite it
      #
      dependency :raw do |dependency|
        dependency.option :file do |option|
          option.default
          option.set_class EasyML::Data::Dataset::Splits::FileSplit
          option.bind_attribute :dir, source: :root_dir do |value|
            Pathname.new(value).append("files/splits/raw")
          end
          option.bind_attribute :polars_args
          option.bind_attribute :max_rows_per_file, source: :batch_size
          option.bind_attribute :batch_size
          option.bind_attribute :sample
          option.bind_attribute :verbose
        end

        dependency.option :memory do |option|
          option.set_class EasyML::Data::Dataset::Splits::InMemorySplit
          option.bind_attribute :sample
        end

        dependency.when do |_dep|
          { option: :memory } if datasource.is_a?(EasyML::Data::Datasource::PolarsDatasource)
        end
      end

      # Here we define the processed dataset (uses the Split class)
      # After we learn the dataset statistics, we fill null values
      # using the learned statistics (e.g. fill annual_revenue with median annual_revenue)
      #
      dependency :processed do |dependency|
        dependency.option :file do |option|
          option.default
          option.set_class EasyML::Data::Dataset::Splits::FileSplit
          option.bind_attribute :dir, source: :root_dir do |value|
            Pathname.new(value).append("files/splits/processed")
          end
          option.bind_attribute :polars_args
          option.bind_attribute :max_rows_per_file, source: :batch_size
          option.bind_attribute :batch_size
          option.bind_attribute :sample
          option.bind_attribute :verbose
        end

        dependency.option :memory do |option|
          option.set_class EasyML::Data::Dataset::Splits::InMemorySplit
          option.bind_attribute :sample
        end

        dependency.when do |_dep|
          { option: :memory } if datasource.is_a?(EasyML::Data::Datasource::PolarsDatasource)
        end
      end

      delegate :new_data_available?, :synced?, :stale?, to: :datasource
      delegate :train, :test, :valid, to: :split
      delegate :splits, to: :splitter

      def refresh!
        refresh_datasource
        split_data
        fit
        normalize_all
        alert_nulls
      end

      def normalize(df = nil)
        df = drop_nulls(df)
        df = apply_transforms(df)
        preprocessor.postprocess(df)
      end

      # A "production" preprocessor is predicting live values (e.g. used on live webservers)
      # A "development" preprocessor is used during training (e.g. we're learning new values for the dataset)
      #
      delegate :statistics, to: :preprocessor

      def train(split_ys: false, all_columns: false, &block)
        load_data(:train, split_ys: split_ys, all_columns: all_columns, &block)
      end

      def valid(split_ys: false, all_columns: false, &block)
        load_data(:valid, split_ys: split_ys, all_columns: all_columns, &block)
      end

      def test(split_ys: false, all_columns: false, &block)
        load_data(:test, split_ys: split_ys, all_columns: all_columns, &block)
      end

      def data(split_ys: false, all_columns: false)
        if split_ys
          x_train, y_train = train(split_ys: true, all_columns: all_columns)
          x_valid, y_valid = valid(split_ys: true, all_columns: all_columns)
          x_test, y_test = test(split_ys: true, all_columns: all_columns)

          xs = Polars.concat([x_train, x_valid, x_test])
          ys = Polars.concat([y_train, y_valid, y_test])
          [xs, ys]
        else
          train_df = train(split_ys: false, all_columns: all_columns)
          valid_df = valid(split_ys: false, all_columns: all_columns)
          test_df = test(split_ys: false, all_columns: all_columns)

          Polars.concat([train_df, valid_df, test_df])
        end
      end

      def cleanup
        raw.cleanup
        processed.cleanup
      end

      def check_nulls(data_type = :processed)
        result = %i[train test valid].each_with_object({}) do |segment, acc|
          segment_result = { nulls: {}, total: 0 }

          data_source = data_type == :raw ? raw : processed
          data_source.read(segment) do |df|
            df_nulls = null_check(df)
            df.columns.each do |column|
              segment_result[:nulls][column] ||= { null_count: 0, total_count: 0 }
              if df_nulls && df_nulls[column]
                segment_result[:nulls][column][:null_count] += df_nulls[column][:null_count]
              end
              segment_result[:nulls][column][:total_count] += df.height
            end
          end

          segment_result[:nulls].each do |column, counts|
            percentage = (counts[:null_count].to_f / counts[:total_count] * 100).round(1)
            acc[column] ||= {}
            acc[column][segment] = percentage
          end
        end

        # Remove columns that have no nulls across all segments
        result.reject! { |_, v| v.values.all?(&:zero?) }

        result.empty? ? nil : result
      end

      def processed?
        !should_split?
      end

      def decode_labels(ys, col: nil)
        preprocessor.decode_labels(ys, col: col.nil? ? target : col)
      end

      private

      def refresh_datasource
        datasource.refresh!
      end
      log_method :refresh!, "Refreshing datasource", verbose: true

      def normalize_all
        processed.cleanup

        %i[train test valid].each do |segment|
          raw.read(segment) do |df|
            processed_df = normalize(df)
            processed.save(segment, processed_df)
          end
        end
      end
      log_method :normalize_all, "Normalizing dataset", verbose: true

      def drop_nulls(df)
        return df if drop_if_null.nil? || drop_if_null.empty?

        df.drop_nulls(subset: drop_if_null)
      end

      def drop_columns(all_columns: false)
        if all_columns
          []
        else
          drop_cols
        end
      end

      def load_data(segment, split_ys: false, all_columns: false, &block)
        drop_cols = drop_columns(all_columns: all_columns)
        if processed?
          processed.read(segment, split_ys: split_ys, target: target, drop_cols: drop_cols, &block)
        else
          raw.read(segment, split_ys: split_ys, target: target, drop_cols: drop_cols, &block)
        end
      end

      def fit(xs = nil)
        xs = raw.train if xs.nil?

        preprocessor.fit(xs)
      end
      log_method :fit, "Learning statistics", verbose: true

      def in_batches(segment, processed: true, &block)
        if processed
          processed.read(segment, &block)
        else
          raw.read(segment, &block)
        end
      end

      def split_data
        return unless should_split?

        cleanup
        datasource.in_batches do |df|
          train_df, valid_df, test_df = splitter.split(df)
          raw.save(:train, train_df)
          raw.save(:valid, valid_df)
          raw.save(:test, test_df)
        end

        # Update the persisted sample size after splitting
        save_previous_sample(sample)
      end
      log_method :split_data, "Splitting data", verbose: true

      def should_split?
        split_timestamp = raw.split_at
        previous_sample = load_previous_sample
        sample_increased = previous_sample && sample > previous_sample
        previous_sample.nil? || split_timestamp.nil? || split_timestamp < datasource.last_updated_at || sample_increased
      end

      def sample_info_file
        File.join(root_dir, "sample_info.json")
      end

      def save_previous_sample(sample_size)
        File.write(sample_info_file, JSON.generate({ previous_sample: sample_size }))
      end

      def load_previous_sample
        return nil unless File.exist?(sample_info_file)

        JSON.parse(File.read(sample_info_file))["previous_sample"]
      end

      def apply_transforms(df)
        if transforms.nil?
          df
        else
          transforms.apply_transforms(df)
        end
      end

      def alert_nulls
        processed_nulls = check_nulls(:processed)
        raw_nulls = check_nulls(:raw)

        if processed_nulls
          log_warning("Nulls found in the processed dataset:")
          processed_nulls.each do |column, segments|
            segments.each do |segment, percentage|
              log_warning(" #{column} - #{segment}: #{percentage}% nulls")
            end
          end
        else
          log_info("No nulls found in the processed dataset.")
        end

        if raw_nulls
          raw_nulls.each do |column, segments|
            segments.each do |segment, percentage|
              if percentage > 50
                log_warning("Data processing issue detected: #{column} - #{segment} has #{percentage}% nulls in the raw dataset")
              end
            end
          end
        end

        nil
      end
      log_method :alert_nulls, "Checking for nulls", verbose: true
    end
  end
end
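For orientation, here is a minimal usage sketch pieced together from the comments and attributes above. It is not taken from the gem's documentation: the nested hash shapes for `datasource:` and `splitter:` are assumptions about how GlueGun::DSL resolves dependency options, and the column names, values, and paths are placeholders.

```ruby
require "easy_ml"
require "polars"

# Hypothetical wiring based on the DSL comments above.
df = Polars::DataFrame.new({
  "created_date"   => ["2024-01-01", "2024-06-01"],
  "annual_revenue" => [100_000, nil],
  "REVENUE"        => [1, 0]
})

dataset = EasyML::Data::Dataset.new(
  target: "REVENUE",
  root_dir: "/tmp/easy_ml",
  datasource: df, # a Polars::DataFrame is wrapped by DatasourceFactory
  splitter: {
    date: { date_col: "created_date", months_test: 2, months_valid: 2 }
  },
  preprocessing_steps: {
    training: { annual_revenue: { clip: { min: 0, max: 1_000_000 }, median: true } }
  }
)

dataset.refresh!                                 # sync -> split -> fit statistics -> normalize
x_train, y_train = dataset.train(split_ys: true) # processed training split
```

Per the `dependency.when` blocks above, passing a `Polars::DataFrame` keeps both the raw and processed splits in memory (`InMemorySplit`); any other datasource writes file-backed splits under `root_dir`.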
data/lib/easy_ml/data/datasource/datasource_factory.rb
@@ -0,0 +1,60 @@
require_relative "merged_datasource"

module EasyML
  module Data
    class Datasource
      class DatasourceFactory
        include GlueGun::DSL

        dependency :datasource do |dependency|
          dependency.option :s3 do |option|
            option.default
            option.set_class EasyML::Data::Datasource::S3Datasource
            option.bind_attribute :root_dir do |value|
              Pathname.new(value).append("files")
            end
            option.bind_attribute :polars_args, default: {}
            option.bind_attribute :s3_bucket, required: true
            option.bind_attribute :s3_prefix
            option.bind_attribute :s3_access_key_id, required: true
            option.bind_attribute :s3_secret_access_key, required: true
          end

          dependency.option :file do |option|
            option.set_class EasyML::Data::Datasource::FileDatasource
            option.bind_attribute :root_dir do |value|
              Pathname.new(value).append("files/raw")
            end
            option.bind_attribute :polars_args
          end

          dependency.option :polars do |option|
            option.set_class EasyML::Data::Datasource::PolarsDatasource
            option.bind_attribute :df
          end

          dependency.option :merged do |option|
            option.set_class EasyML::Data::Datasource::MergedDatasource
            option.bind_attribute :root_dir
          end

          # Passing in datasource: Polars::DataFrame will wrap properly
          # So will passing in datasource /path/to/dir
          dependency.when do |dep|
            case dep
            when Polars::DataFrame
              { option: :polars, as: :df }
            when String, Pathname
              { option: :file, as: :root_dir }
            end
          end
        end
      end
    end
  end
end

# Do this here otherwise we'll end up with a circular dependency
class EasyML::Data::Datasource::MergedDatasource
  dependency :datasources, DatasourceFactory
end
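To illustrate the dispatch rules in `dependency.when` above, a short sketch follows. The constructor hash shapes are assumptions about GlueGun::DSL, and the bucket name, prefix, and credentials are placeholders.

```ruby
require "easy_ml"
require "polars"

# In-memory frame -> :polars option (PolarsDatasource), per the `when Polars::DataFrame` branch
EasyML::Data::Datasource::DatasourceFactory.new(datasource: Polars::DataFrame.new({ "a" => [1, 2] }))

# Directory path -> :file option (FileDatasource), per the `when String, Pathname` branch
EasyML::Data::Datasource::DatasourceFactory.new(datasource: "data/csvs")

# Explicit S3 configuration, supplying the attributes the :s3 option marks as required
EasyML::Data::Datasource::DatasourceFactory.new(
  datasource: {
    s3: {
      s3_bucket: "example-bucket",
      s3_prefix: "datasets/",
      s3_access_key_id: ENV["AWS_ACCESS_KEY_ID"],
      s3_secret_access_key: ENV["AWS_SECRET_ACCESS_KEY"]
    }
  }
)
```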
data/lib/easy_ml/data/datasource/file_datasource.rb
@@ -0,0 +1,40 @@
module EasyML::Data
  class Datasource
    class FileDatasource < Datasource
      include GlueGun::DSL

      attribute :root_dir, :string
      attribute :polars_args, :hash, default: {}

      validates :root_dir, presence: true

      def in_batches(of: 10_000)
        files.each do |file|
          df = Polars.read_csv(file, **polars_args)
          yield df
        end
      end

      def files
        Dir.glob(File.join(root_dir, "**/*.csv")).sort
      end

      def last_updated_at
        files.map { |file| File.mtime(file) }.max
      end

      def refresh!
        # No need to refresh for directory-based datasource
      end

      def data
        combined_df = nil
        files.each do |file|
          df = Polars.read_csv(file, **polars_args)
          combined_df = combined_df.nil? ? df : combined_df.vstack(df)
        end
        combined_df
      end
    end
  end
end
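A small usage sketch of the directory-backed datasource above; note that `in_batches` yields one DataFrame per CSV file and does not use the `of:` argument. The directory path is a placeholder, and the keyword-argument constructor is assumed from the GlueGun::DSL attributes.

```ruby
require "easy_ml"

source = EasyML::Data::Datasource::FileDatasource.new(
  root_dir: "data/csvs", # any directory containing **/*.csv
  polars_args: {}        # keyword args forwarded to Polars.read_csv
)

source.in_batches { |df| puts df.shape } # one batch per CSV file
source.last_updated_at                   # newest File.mtime across the CSVs
source.data                              # all CSVs vstacked into a single DataFrame
```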
data/lib/easy_ml/data/datasource/merged_datasource.rb
@@ -0,0 +1,64 @@
module EasyML::Data
  class Datasource
    class MergedDatasource < Datasource
      include GlueGun::DSL

      attribute :root_dir, :string
      attribute :polars_args, :hash, default: {}
      attribute :merge
      validates :root_dir, presence: true
      validates :merge, presence: true

      def in_batches(of: 10_000, &block)
        Polars.read_csv(file_path, **polars_args).iter_batches(batch_size: of, &block)
      end

      def file_path
        @file_path ||= File.join(root_dir, "merged_data.csv")
      end

      def last_updated_at
        datasources.map(&:last_updated_at).min
      end

      def refresh!
        cleanup
        if datasources.is_a?(Array)
          datasources.each(&:refresh!)
        elsif datasources.is_a?(Hash)
          datasources.values.each(&:refresh!)
        end
      end

      def data
        @data ||= if file_exists?
                    Polars.read_csv(file_path, **polars_args)
                  else
                    merge_and_save
                  end
      end

      def cleanup
        FileUtils.rm_f(file_path)
      end

      private

      def file_exists?
        File.exist?(file_path)
      end

      def merge_and_save
        refresh!
        merge.call(datasources).tap do |merged_data|
          save_to_file(merged_data)
        end
      end

      def save_to_file(df)
        FileUtils.mkdir_p(root_dir)
        df.write_csv(file_path)
      end
    end
  end
end
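A sketch of the merge flow defined above: `merge` is any callable that receives the configured datasources and returns a single `Polars::DataFrame`, which is then written to `merged_data.csv` and served from that cache on later `data` calls. How the nested `datasources` dependency is passed at construction time is an assumption here, and the sources and join column are placeholders.

```ruby
require "easy_ml"

merged = EasyML::Data::Datasource::MergedDatasource.new(
  root_dir: "data/merged",
  datasources: { orders: orders_source, customers: customers_source }, # shape assumed
  merge: lambda do |sources|
    # Combine the underlying frames however the caller needs; must return one Polars::DataFrame
    sources[:orders].data.join(sources[:customers].data, on: "customer_id")
  end
)

merged.data    # reads merged_data.csv if it exists, otherwise merges and saves it
merged.cleanup # removes the cached merged_data.csv
```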
data/lib/easy_ml/data/datasource/polars_datasource.rb
@@ -0,0 +1,41 @@
module EasyML::Data
  class Datasource
    class PolarsDatasource < Datasource
      include GlueGun::DSL

      attribute :df
      validate :df_is_dataframe
      def df_is_dataframe
        return if df.nil? || df.is_a?(Polars::DataFrame)

        errors.add(:df, "Must be an instance of Polars::DataFrame")
      end
      attr_accessor :last_updated_at

      def initialize(options)
        super
        @last_updated_at = Time.now
      end

      def in_batches(of: 10_000)
        total_rows = df.shape[0]
        (0...total_rows).step(of) do |start|
          end_index = [start + of, total_rows].min
          yield df.slice(start, end_index - start)
        end
      end

      def files
        [] # No files, as this is in-memory
      end

      def refresh!
        # No need to refresh for in-memory datasource
      end

      def data
        df
      end
    end
  end
end
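A minimal sketch of the in-memory datasource above, showing that batching is plain DataFrame slicing and that `last_updated_at` is captured at construction. The data values are placeholders.

```ruby
require "easy_ml"
require "polars"

df = Polars::DataFrame.new({ "a" => (1..25_000).to_a })
source = EasyML::Data::Datasource::PolarsDatasource.new(df: df)

source.in_batches(of: 10_000) { |batch| puts batch.height } # 10000, 10000, 5000
source.data            # the original DataFrame, unchanged
source.last_updated_at # Time.now captured at initialization
```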