easy_ml 0.2.0.pre.rc76 → 0.2.0.pre.rc77
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +1 -5
- data/app/models/easy_ml/dataset/refresh_reasons.rb +12 -0
- data/app/models/easy_ml/dataset.rb +4 -49
- data/app/models/easy_ml/feature.rb +16 -36
- data/app/serializers/easy_ml/dataset_serializer.rb +1 -1
- data/lib/easy_ml/data/dataset_manager/normalizer.rb +0 -0
- data/lib/easy_ml/data/dataset_manager/reader/base.rb +80 -0
- data/lib/easy_ml/data/dataset_manager/reader/batch.rb +106 -0
- data/lib/easy_ml/data/dataset_manager/reader/data_frame.rb +23 -0
- data/lib/easy_ml/data/dataset_manager/reader/file.rb +75 -0
- data/lib/easy_ml/data/dataset_manager/reader.rb +58 -0
- data/lib/easy_ml/data/dataset_manager/writer/append_only.rb +67 -0
- data/lib/easy_ml/data/dataset_manager/writer/base.rb +122 -0
- data/lib/easy_ml/data/dataset_manager/writer/named.rb +14 -0
- data/lib/easy_ml/data/dataset_manager/writer/partitioned/partition_reasons.rb +15 -0
- data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb +150 -0
- data/lib/easy_ml/data/dataset_manager/writer.rb +76 -0
- data/lib/easy_ml/data/dataset_manager.rb +134 -0
- data/lib/easy_ml/data/partition/boundaries.rb +60 -0
- data/lib/easy_ml/data/partition.rb +7 -0
- data/lib/easy_ml/data/synced_directory.rb +1 -2
- data/lib/easy_ml/data.rb +2 -0
- data/lib/easy_ml/feature_store.rb +15 -185
- data/lib/easy_ml/reasons.rb +41 -0
- data/lib/easy_ml/version.rb +1 -1
- data/lib/easy_ml.rb +1 -1
- metadata +20 -4
- data/lib/easy_ml/data/filter_extensions.rb +0 -31
- /data/app/models/{lineage_history.rb → easy_ml/lineage_history.rb} +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ee980703e3a768458e43d54a878bfa712d4f026967f3ccd8fa5bb2d1df50304c
+  data.tar.gz: eb5eb31b580e9112886527f416d4f360ffe9b0ee73d9e2e7dd70d9a48528ea09
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ddc8a0005b22caf186c13790e9209d7b843181d62be8e70710e33bd6e244e3f31e6cc02efc4effa82d9f38674ff8ba2d8abdc1e43db8ba339d712f0e12d10ec4
+  data.tar.gz: 52193a2c0da5c0aca86bb627afff7efd9cbdfcf80ba2db8ae4e9baec061b716244e14c49f30451f52a221700b2b1a160a71935cb76cd77732a4e8fdcf25bd3a1
data/app/models/easy_ml/dataset/refresh_reasons.rb
ADDED
@@ -0,0 +1,12 @@
+module EasyML
+  class Dataset
+    class RefreshReasons < EasyML::Reasons
+      add_reason "Not split", -> { not_split? }
+      add_reason "Refreshed at is nil", -> { refreshed_at.nil? }
+      add_reason "Columns need refresh", -> { columns_need_refresh? }
+      add_reason "Features need refresh", -> { features_need_fit? }
+      add_reason "Datasource needs refresh", -> { datasource_needs_refresh? }
+      add_reason "Datasource was refreshed", -> { datasource_was_refreshed? }
+    end
+  end
+end
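The Reasons base class this builds on (data/lib/easy_ml/reasons.rb, +41 in this release) is not shown in this section, so the DSL's exact behavior is inferred from the dataset.rb changes that follow: each add_reason registers a label plus a check lambda evaluated against the wrapped record, and check(except:) appears to return the labels whose checks pass. A rough usage sketch (the dataset variable is illustrative):

    # Sketch only -- mirrors how Dataset#refresh_reasons calls it in the next file
    reasons = EasyML::Dataset::RefreshReasons.new(dataset).check(except: [])
    dataset.refresh! if reasons.any?   # same shape as Dataset#needs_refresh?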
data/app/models/easy_ml/dataset.rb
CHANGED
@@ -265,9 +265,7 @@ module EasyML
 
     def refresh!(async: false)
       refreshing do
-        puts "Prepare..."
         prepare!
-        puts "Fit features..."
         fit_features!(async: async)
       end
     end
@@ -276,9 +274,7 @@ module EasyML
       return refresh_async if async
 
       refreshing do
-        puts "prepare.."
         prepare
-        puts "fit features..."
         fit_features(async: async)
       end
     end
@@ -299,7 +295,6 @@ module EasyML
     measure_method_timing :fit_features
 
     def after_fit_features
-      puts "after fit features..."
       unlock!
       reload
       return if failed?
@@ -338,45 +333,12 @@ module EasyML
     #
     # So yes this is an annoying way to structure a method, but it's helpful for performance
     #
-    def refresh_reasons(
-
-        not_split: {
-          name: "Not split",
-          check: -> { not_split? },
-        },
-        refreshed_at_is_nil: {
-          name: "Refreshed at is nil",
-          check: -> { refreshed_at.nil? },
-        },
-        columns_need_refresh: {
-          name: "Columns need refresh",
-          check: -> { columns_need_refresh? },
-        },
-        features_need_fit: {
-          name: "Features need refresh",
-          check: -> { features_need_fit? },
-        },
-        datasource_needs_refresh: {
-          name: "Datasource needs refresh",
-          check: -> { datasource_needs_refresh? },
-        },
-        refreshed_datasource: {
-          name: "Refreshed datasource",
-          check: -> { refreshed_datasource? },
-        },
-        datasource_was_refreshed: {
-          name: "Datasource was refreshed",
-          check: -> { datasource_was_refreshed? },
-        },
-      }.except(*exclude).select do |k, config|
-        config[:check].call
-      end.map do |k, config|
-        config[:name]
-      end
+    def refresh_reasons(except: [])
+      RefreshReasons.new(self).check(except: except)
     end
 
-    def needs_refresh?(
-      refresh_reasons(
+    def needs_refresh?(except: [])
+      refresh_reasons(except: except).any?
     end
 
     def processed?
@@ -518,19 +480,12 @@ module EasyML
     end
 
     def normalize(df = nil, split_ys: false, inference: false, all_columns: false, features: self.features)
-      puts "Apply missing features..."
       df = apply_missing_columns(df, inference: inference)
-      puts "Transform columns..."
       df = columns.transform(df, inference: inference)
-      puts "Apply features..."
       df = apply_features(df, features)
-      puts "Transform columns..."
       df = columns.transform(df, inference: inference, computed: true)
-      puts "Apply column mask..."
       df = apply_column_mask(df, inference: inference) unless all_columns
-      puts "Drop nulls..."
       df = drop_nulls(df) unless inference
-      puts "Split features and targets..."
       df, = processed.split_features_targets(df, true, target) if split_ys
       df
     end
data/app/models/easy_ml/feature.rb
CHANGED
@@ -190,31 +190,21 @@ module EasyML
       reader = dataset.raw
 
       if adapter.respond_to?(:batch)
-
-
-        max_id = array.max
+        series = adapter.batch(reader, self)
+        primary_key = series.name
       else
-
-        begin
-          unless primary_key.present?
-            raise "Couldn't find primary key for feature #{feature_class}, check your feature class"
-          end
-          df = reader.query(select: primary_key)
-        rescue => e
-          raise "Couldn't find primary key #{primary_key.first} for feature #{feature_class}: #{e.message}"
-        end
-        return [] if df.nil?
-
-        min_id = df[primary_key.first].min
-        max_id = df[primary_key.last].max
+        primary_key = self.primary_key
       end
 
-
-
+      EasyML::Data::Partition::Boundaries.new(
+        reader.data(lazy: true),
+        primary_key,
+        batch_size
+      ).to_a.map.with_index do |partition, idx|
         {
           feature_id: id,
-          batch_start:
-          batch_end:
+          batch_start: partition[:partition_start],
+          batch_end: partition[:partition_end],
           batch_number: feature_position,
           subbatch_number: idx,
           parent_batch_id: Random.uuid,
@@ -231,6 +221,8 @@ module EasyML
       jobs = ordered_features.map(&:build_batches)
       job_count = jobs.dup.flatten.size
 
+      ordered_features.each(&:wipe)
+
       # This is very important! For whatever reason, Resque BatchJob does not properly
       # handle batch finished callbacks for batch size = 1
       if async && job_count > 1
@@ -325,6 +317,7 @@ module EasyML
         params = {
           select: select,
           filter: filter,
+          sort: primary_key,
         }.compact
       else
         params = {}
@@ -438,24 +431,10 @@ module EasyML
     end
 
     def feature_store
-
-    end
-
-    def upload_remote_files
-      feature_store.upload_remote_files
-    end
-
-    def files
-      feature_store.list_partitions
+      EasyML::FeatureStore.new(self)
     end
 
-
-      feature_store.query(**kwargs)
-    end
-
-    def store(df)
-      feature_store.store(df)
-    end
+    delegate :files, :query, :store, :compact, to: :feature_store
 
     def batch_size
       read_attribute(:batch_size) ||
@@ -466,6 +445,7 @@ module EasyML
     def after_fit
       update_sha
 
+      feature_store.compact
       updates = {
         fit_at: Time.current,
         needs_fit: false,
data/lib/easy_ml/data/dataset_manager/normalizer.rb
File without changes
data/lib/easy_ml/data/dataset_manager/reader/base.rb
ADDED
@@ -0,0 +1,80 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Reader
+        class Base
+          DEFAULTS = {
+            drop_cols: [],
+            filter: nil,
+            limit: nil,
+            select: nil,
+            unique: nil,
+            sort: nil,
+            descending: false,
+            batch_size: nil,
+            batch_start: nil,
+            batch_key: nil,
+            lazy: false,
+          }
+
+          DEFAULTS.each do |k, _|
+            attr_accessor k
+          end
+          attr_accessor :block, :options, :input
+          attr_accessor :options
+
+          def initialize(options, &block)
+            options = apply_defaults(options)
+            @block = block
+            @options = options
+          end
+
+          def query
+            raise "Not implemented"
+          end
+
+          private
+
+          def apply_defaults(kwargs)
+            options = kwargs.dup
+
+            DEFAULTS.each do |k, default|
+              unless options.key?(k)
+                options[k] = default
+              end
+            end
+
+            options.each do |k, v|
+              send("#{k}=", v)
+            end
+
+            options
+          end
+
+          def query_dataframes(df, schema)
+            num_rows = df.is_a?(Polars::LazyFrame) ? df.select(Polars.length).collect[0, 0] : df.shape[0]
+            return df if num_rows == 0
+
+            # Apply the predicate filter if given
+            df = df.filter(filter) if filter
+            # Apply select columns if provided
+            df = df.select(select) if select.present?
+            df = df.unique if unique
+
+            # Apply sorting if provided
+            df = df.sort(sort, reverse: descending) if sort
+
+            # Apply drop columns
+            drop_cols = self.drop_cols
+            drop_cols &= schema.keys
+            df = df.drop(drop_cols) unless drop_cols.empty?
+
+            # Collect the DataFrame (execute the lazy operations)
+            df = df.limit(limit) if limit
+            lazy ? df : df.collect
+          end
+        end
+      end
+    end
+  end
+end
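Reader::Base holds the shared option plumbing: apply_defaults copies DEFAULTS into the options hash and assigns each value through the generated accessors, and query_dataframes applies filter, select, unique, sort, drop_cols, and limit to an eager or lazy Polars frame, collecting only when lazy is false. Subclasses only have to supply query; a hypothetical adapter (not part of the gem) might look like:

    # Hypothetical subclass, shown only to illustrate the Base contract
    class InMemory < EasyML::Data::DatasetManager::Reader::Base
      def query
        # input is assumed to be a Polars::DataFrame supplied via options
        query_dataframes(input.lazy, input.schema)
      end
    end

    InMemory.new(input: df, select: ["id"], sort: "id", limit: 5).query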
data/lib/easy_ml/data/dataset_manager/reader/batch.rb
ADDED
@@ -0,0 +1,106 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Reader
+        class Batch < File
+          def query
+            return batch_enumerator unless block.present?
+            return process_batches
+          end
+
+          private
+
+          def batch_enumerator
+            Enumerator.new do |yielder|
+              process_batches do |batch|
+                yielder << batch
+              end
+            end
+          end
+
+          def process_batches(&b)
+            raise "When using batch_size, sort must match primary key (#{batch_key})" if sort.present? && batch_key != sort
+            block = b || self.block
+
+            sort = batch_key
+
+            current_start = get_batch_start
+            final_value = get_final_value
+
+            while current_start < final_value
+              filter = Polars.col(sort) >= current_start
+              batch = query_files(filter: filter, limit: batch_size, lazy: true, sort: sort, descending: descending)
+              block.yield(batch)
+              current_start = File.new(input: input, lazy: true)
+                                  .query
+                                  .filter(filter)
+                                  .sort(sort, reverse: descending)
+                                  .limit(batch_size + 1)
+                                  .sort(sort, reverse: !descending)
+                                  .limit(1)
+                                  .select(sort)
+                                  .collect
+                                  .to_a.first&.dig(sort) || final_value
+            end
+          end
+
+          def query_files(overrides = {})
+            query = options.deep_dup.merge!(overrides).except(:batch_size, :batch_start, :batch_key)
+            File.new(query).query
+          end
+
+          def get_batch_start
+            if batch_start.present?
+              batch_start
+            else
+              get_sorted_batch_keys(descending)
+            end
+          end
+
+          def get_final_value
+            get_sorted_batch_keys(!descending)
+          end
+
+          def get_sorted_batch_keys(descending, filter: nil)
+            query = query_files(lazy: true)
+            query = query.filter(filter) if filter
+            query.sort(batch_key, reverse: descending).limit(1).select(batch_key).collect.to_a.last.dig(batch_key)
+          end
+
+          def batch_key
+            return @batch_key if @batch_key
+
+            lazy_df = lazy_frames([files.first]).first
+            if select
+              # Lazily filter only the selected columns
+              lazy_df = lazy_df.select(select)
+
+              # Lazily compute the unique count for each column and compare with total row count
+              primary_keys = select.select do |col|
+                lazy_df.select(col).unique.collect.height == lazy_df.collect.height
+              end
+            else
+              primary_keys = lazy_df.collect.columns.select do |col|
+                # Lazily count unique values and compare with the total row count
+                lazy_df.select(col).unique.collect.height == lazy_df.collect.height
+              end
+            end
+
+            if primary_keys.count > 1
+              key = primary_keys.detect { |key| key.underscore.split("_").any? { |k| k.match?(/id/) } }
+              if key
+                primary_keys = [key]
+              end
+            end
+
+            if primary_keys.count != 1
+              raise "Unable to determine primary key for dataset"
+            end
+
+            @batch_key = primary_keys.first
+          end
+        end
+      end
+    end
+  end
+end
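Batch pages through the parquet files by an inferred batch key: batch_key picks a column whose values are all unique (preferring one whose name contains "id"), then process_batches repeatedly pulls batch_size rows and advances the cursor past the last key of each page. With no block, query returns an Enumerator of lazy frames; with a block, each lazy batch is yielded as it is produced. A hedged sketch via the File adapter (the directory, column, and batch size are illustrative):

    # Sketch -- assumes a directory of parquet files with a unique id-like column
    file_reader = EasyML::Data::DatasetManager::Reader::File
    file_reader.new(input: "datasets/transactions", batch_size: 10_000) do |batch|
      batch.collect   # each batch arrives lazy; collect to materialize
    end.query

    batches = file_reader.new(input: "datasets/transactions", batch_size: 10_000).query  # Enumerator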
data/lib/easy_ml/data/dataset_manager/reader/data_frame.rb
ADDED
@@ -0,0 +1,23 @@
+
+module EasyML
+  module Data
+    class DatasetManager
+      class Reader
+        class DataFrame < File
+          def query
+            return query_dataframes(lazy_frames, schema)
+          end
+
+          def schema
+            input.schema
+          end
+
+          private
+          def lazy_frames
+            input.lazy
+          end
+        end
+      end
+    end
+  end
+end
data/lib/easy_ml/data/dataset_manager/reader/file.rb
ADDED
@@ -0,0 +1,75 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Reader
+        class File < Base
+          attr_accessor :file_filter
+
+          def initialize(options = {})
+            super
+            @file_filter = options.dig(:file_filter) || ->(file) { true }
+          end
+
+          def query
+            return query_dataframes(dataframe, schema) unless batch_size.present?
+            return Batch.new(options, &block).query
+          end
+
+          def schema
+            @schema ||= files.any? ? Polars.read_parquet_schema(files.first) : nil
+          end
+
+          def files
+            filter_files do
+              if is_file?
+                @files ||= [input]
+              elsif is_dir?
+                @files ||= Dir.glob(::File.join(root_dir, "**/*.{parquet}"))
+              else
+                @files ||= []
+              end
+            end
+          end
+
+          private
+
+          def filter_files(&block)
+            yield
+            @files = @files.select(&file_filter)
+          end
+
+          def is_dir?
+            path.directory?
+          end
+
+          def is_file?
+            path.file?
+          end
+
+          def root_dir
+            path if is_dir?
+          end
+
+          def path
+            @path ||= input.is_a?(Pathname) ? input : Pathname.new(input)
+          end
+
+          def dataframe
+            @dataframe = lazy_frames.any? ? Polars.concat(lazy_frames) : Polars::LazyFrame.new
+          end
+
+          def lazy_frames(files = nil)
+            return @lazy_frames if @lazy_frames
+
+            files ||= self.files
+            @lazy_frames = files.map do |file|
+              Polars.scan_parquet(file)
+            end
+          end
+        end
+      end
+    end
+  end
+end
+
+require_relative "batch"
data/lib/easy_ml/data/dataset_manager/reader.rb
ADDED
@@ -0,0 +1,58 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Reader
+        require_relative "reader/base"
+        require_relative "reader/file"
+        require_relative "reader/data_frame"
+
+        ADAPTERS = [
+          File,
+          DataFrame,
+        ]
+
+        def self.query(input, **kwargs, &block)
+          adapter(input).new(
+            kwargs.merge!(input: input), &block
+          ).query
+        end
+
+        def self.schema(input, **kwargs, &block)
+          adapter(input).new(
+            kwargs.merge!(input: input), &block
+          ).schema
+        end
+
+        def self.files(dir)
+          Dir.glob(::File.join(dir, "**/*.{parquet}"))
+        end
+
+        def self.sha
+          files = sha.sort
+
+          file_hashes = files.map do |file|
+            meta = Polars.read_parquet_schema(file)
+            row_count = Polars.scan_parquet(file).select(Polars.col("*").count).collect[0, 0]
+
+            Digest::SHA256.hexdigest([
+              meta.to_json,
+              row_count.to_s,
+            ].join("|"))
+          end
+
+          Digest::SHA256.hexdigest(file_hashes.join)
+        end
+
+        private
+
+        def self.adapter(input)
+          if input.is_a?(Polars::DataFrame) || input.is_a?(Polars::LazyFrame)
+            DataFrame
+          else
+            File
+          end
+        end
+      end
+    end
+  end
+end
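Reader is the public entry point: adapter(input) routes Polars frames to the DataFrame adapter and everything else (file or directory paths) to the File adapter, and the keyword options map onto Reader::Base's DEFAULTS. A rough usage sketch (paths and column names are placeholders):

    # Sketch -- paths and columns are placeholders
    reader = EasyML::Data::DatasetManager::Reader
    users  = reader.query("datasets/users", select: ["id", "email"], filter: Polars.col("id") > 100, sort: "id")
    schema = reader.schema("datasets/users")
    files  = reader.files("datasets/users")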
data/lib/easy_ml/data/dataset_manager/writer/append_only.rb
ADDED
@@ -0,0 +1,67 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Writer
+        class AppendOnly < Base
+          attr_accessor :primary_key
+
+          def initialize(options)
+            super
+            @primary_key = options.dig(:primary_key)
+            raise "primary_key required for append_only writer" if primary_key.nil?
+            raise "filenames required: specify the prefix to uuse for unique new files" unless filenames.present?
+          end
+
+          def store
+            # If there are no existing files, just store as normal
+            return super if files.empty?
+
+            # Get existing data lazily
+            existing_keys = query(lazy: true)
+              .select(primary_key)
+              .collect[primary_key]
+              .to_a
+
+            # Convert input to lazy if it isn't already
+            input_data = df.is_a?(Polars::LazyFrame) ? df : df.lazy
+
+            # Filter out records that already exist
+            new_records = input_data.filter(
+              Polars.col(primary_key).is_in(existing_keys).not_
+            )
+
+            # If we have new records, store them
+            if new_records.clone.select(Polars.length).collect[0, 0] > 0
+              @df = new_records
+              store_to_unique_file
+            end
+          end
+
+          def compact
+            files = self.files
+            return if files.empty?
+
+            clear_unique_id
+
+            # Mv existing compacted parquet to a temp file, so it doesn't conflict with write,
+            # but can still be queried
+            compacted_file = File.join(root_dir, "compacted.parquet")
+            if File.exist?(compacted_file)
+              tmp_file = File.join(root_dir, "compacted.orig.parquet")
+              FileUtils.mv(compacted_file, tmp_file)
+            end
+            files = self.files
+
+            compacted_file.tap do |target_file|
+              compacted_data = query(lazy: true).sort(primary_key)
+
+              safe_write(compacted_data, target_file)
+              FileUtils.rm(files)
+              clear_unique_id
+            end
+          end
+        end
+      end
+    end
+  end
+end
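AppendOnly#store keeps writes idempotent: it loads the primary keys already on disk, filters them out of the incoming frame, and writes only genuinely new rows to a fresh uniquely named file; compact then folds the partial files into a single compacted.parquet sorted by the primary key. The Writer base class and the store_to_unique_file/safe_write helpers it relies on are added elsewhere in this release (writer/base.rb, writer.rb), so the following only illustrates the key-filtering semantics with plain Polars:

    # Semantics sketch only -- mirrors the primary-key filter in AppendOnly#store
    existing_keys = [1, 2, 3]
    incoming = Polars::DataFrame.new({ "id" => [2, 3, 4], "value" => ["b", "c", "d"] }).lazy
    new_records = incoming.filter(Polars.col("id").is_in(existing_keys).not_)
    new_records.collect   # only the row with id 4 remains to be appended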