easy_ml 0.2.0.pre.rc76 → 0.2.0.pre.rc77

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. checksums.yaml +4 -4
  2. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +1 -5
  3. data/app/models/easy_ml/dataset/refresh_reasons.rb +12 -0
  4. data/app/models/easy_ml/dataset.rb +4 -49
  5. data/app/models/easy_ml/feature.rb +16 -36
  6. data/app/serializers/easy_ml/dataset_serializer.rb +1 -1
  7. data/lib/easy_ml/data/dataset_manager/normalizer.rb +0 -0
  8. data/lib/easy_ml/data/dataset_manager/reader/base.rb +80 -0
  9. data/lib/easy_ml/data/dataset_manager/reader/batch.rb +106 -0
  10. data/lib/easy_ml/data/dataset_manager/reader/data_frame.rb +23 -0
  11. data/lib/easy_ml/data/dataset_manager/reader/file.rb +75 -0
  12. data/lib/easy_ml/data/dataset_manager/reader.rb +58 -0
  13. data/lib/easy_ml/data/dataset_manager/writer/append_only.rb +67 -0
  14. data/lib/easy_ml/data/dataset_manager/writer/base.rb +122 -0
  15. data/lib/easy_ml/data/dataset_manager/writer/named.rb +14 -0
  16. data/lib/easy_ml/data/dataset_manager/writer/partitioned/partition_reasons.rb +15 -0
  17. data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb +150 -0
  18. data/lib/easy_ml/data/dataset_manager/writer.rb +76 -0
  19. data/lib/easy_ml/data/dataset_manager.rb +134 -0
  20. data/lib/easy_ml/data/partition/boundaries.rb +60 -0
  21. data/lib/easy_ml/data/partition.rb +7 -0
  22. data/lib/easy_ml/data/synced_directory.rb +1 -2
  23. data/lib/easy_ml/data.rb +2 -0
  24. data/lib/easy_ml/feature_store.rb +15 -185
  25. data/lib/easy_ml/reasons.rb +41 -0
  26. data/lib/easy_ml/version.rb +1 -1
  27. data/lib/easy_ml.rb +1 -1
  28. metadata +20 -4
  29. data/lib/easy_ml/data/filter_extensions.rb +0 -31
  30. /data/app/models/{lineage_history.rb → easy_ml/lineage_history.rb} +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 1eebc157e0f33c3da40ef2b1bdb7cc0ed1c2b6f73615cdf26a6898cb60e60d2d
-  data.tar.gz: a12b441fe0736f251de773574858316346ba19c5b3784d73f3db200af0e619e4
+  metadata.gz: ee980703e3a768458e43d54a878bfa712d4f026967f3ccd8fa5bb2d1df50304c
+  data.tar.gz: eb5eb31b580e9112886527f416d4f360ffe9b0ee73d9e2e7dd70d9a48528ea09
 SHA512:
-  metadata.gz: 4aabb816a9d02a6f2bd870cde3db3eaaf00a314cf5e0d50a11bf707534b9d93eddee648d62304f48976916ea9d5942269dbeded81d49df23199ffcc13d6ae0eb
-  data.tar.gz: 284973f49424ac622ceb3e44071e88336ea316154dee788b0e7c865441eeb01939192289deea84283b691bf8f5a3b79f708d3d62ab9fcec3d596f67ff4c093a9
+  metadata.gz: ddc8a0005b22caf186c13790e9209d7b843181d62be8e70710e33bd6e244e3f31e6cc02efc4effa82d9f38674ff8ba2d8abdc1e43db8ba339d712f0e12d10ec4
+  data.tar.gz: 52193a2c0da5c0aca86bb627afff7efd9cbdfcf80ba2db8ae4e9baec061b716244e14c49f30451f52a221700b2b1a160a71935cb76cd77732a4e8fdcf25bd3a1
data/app/models/easy_ml/column/imputers/ordinal_encoder.rb CHANGED
@@ -50,11 +50,7 @@ module EasyML
       end

       def cast_encoder(encoder)
-        begin
-          encoder.transform_keys { |k| column.cast(k) }
-        rescue => e
-          binding.pry
-        end
+        encoder.transform_keys { |k| column.cast(k) }
       end

       def cast_decoder(decoder)
data/app/models/easy_ml/dataset/refresh_reasons.rb ADDED
@@ -0,0 +1,12 @@
+module EasyML
+  class Dataset
+    class RefreshReasons < EasyML::Reasons
+      add_reason "Not split", -> { not_split? }
+      add_reason "Refreshed at is nil", -> { refreshed_at.nil? }
+      add_reason "Columns need refresh", -> { columns_need_refresh? }
+      add_reason "Features need refresh", -> { features_need_fit? }
+      add_reason "Datasource needs refresh", -> { datasource_needs_refresh? }
+      add_reason "Datasource was refreshed", -> { datasource_was_refreshed? }
+    end
+  end
+end
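The EasyML::Reasons base class this subclass relies on is also new in this release (data/lib/easy_ml/reasons.rb, +41 lines) but its source is not shown in this section. Below is a minimal sketch of the class-level DSL it would need to support the subclass above, assuming a name-keyed registry and an `except:` filter that mirrors the old hash keys; the shipped implementation may differ.

module EasyML
  class Reasons
    class << self
      # Register a human-readable reason name with a predicate lambda
      def add_reason(name, check)
        reasons[name] = check
      end

      def reasons
        @reasons ||= {}
      end
    end

    def initialize(subject)
      @subject = subject
    end

    # Return the names of every reason whose check passes, skipping any
    # keys listed in `except:` (e.g. :datasource_needs_refresh).
    def check(except: [])
      excluded = except.map(&:to_s)
      self.class.reasons
          .reject { |name, _check| excluded.include?(name.downcase.tr(" ", "_")) }
          .select { |_name, check| @subject.instance_exec(&check) }
          .keys
    end
  end
end

With something like that in place, EasyML::Dataset::RefreshReasons.new(dataset).check(except: [:datasource_needs_refresh]) returns the matching reason names, which is exactly how dataset.rb and the serializer call it below.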
data/app/models/easy_ml/dataset.rb CHANGED
@@ -265,9 +265,7 @@ module EasyML

     def refresh!(async: false)
       refreshing do
-        puts "Prepare..."
         prepare!
-        puts "Fit features..."
         fit_features!(async: async)
       end
     end
@@ -276,9 +274,7 @@ module EasyML
       return refresh_async if async

       refreshing do
-        puts "prepare.."
         prepare
-        puts "fit features..."
         fit_features(async: async)
       end
     end
@@ -299,7 +295,6 @@ module EasyML
     measure_method_timing :fit_features

     def after_fit_features
-      puts "after fit features..."
       unlock!
       reload
       return if failed?
@@ -338,45 +333,12 @@ module EasyML
     #
     # So yes this is an annoying way to structure a method, but it's helpful for performance
     #
-    def refresh_reasons(exclude: [])
-      {
-        not_split: {
-          name: "Not split",
-          check: -> { not_split? },
-        },
-        refreshed_at_is_nil: {
-          name: "Refreshed at is nil",
-          check: -> { refreshed_at.nil? },
-        },
-        columns_need_refresh: {
-          name: "Columns need refresh",
-          check: -> { columns_need_refresh? },
-        },
-        features_need_fit: {
-          name: "Features need refresh",
-          check: -> { features_need_fit? },
-        },
-        datasource_needs_refresh: {
-          name: "Datasource needs refresh",
-          check: -> { datasource_needs_refresh? },
-        },
-        refreshed_datasource: {
-          name: "Refreshed datasource",
-          check: -> { refreshed_datasource? },
-        },
-        datasource_was_refreshed: {
-          name: "Datasource was refreshed",
-          check: -> { datasource_was_refreshed? },
-        },
-      }.except(*exclude).select do |k, config|
-        config[:check].call
-      end.map do |k, config|
-        config[:name]
-      end
+    def refresh_reasons(except: [])
+      RefreshReasons.new(self).check(except: except)
     end

-    def needs_refresh?(exclude: [])
-      refresh_reasons(exclude: exclude).any?
+    def needs_refresh?(except: [])
+      refresh_reasons(except: except).any?
     end

     def processed?
@@ -518,19 +480,12 @@ module EasyML
     end

     def normalize(df = nil, split_ys: false, inference: false, all_columns: false, features: self.features)
-      puts "Apply missing features..."
       df = apply_missing_columns(df, inference: inference)
-      puts "Transform columns..."
       df = columns.transform(df, inference: inference)
-      puts "Apply features..."
       df = apply_features(df, features)
-      puts "Transform columns..."
       df = columns.transform(df, inference: inference, computed: true)
-      puts "Apply column mask..."
       df = apply_column_mask(df, inference: inference) unless all_columns
-      puts "Drop nulls..."
       df = drop_nulls(df) unless inference
-      puts "Split features and targets..."
       df, = processed.split_features_targets(df, true, target) if split_ys
       df
     end
data/app/models/easy_ml/feature.rb CHANGED
@@ -190,31 +190,21 @@ module EasyML
       reader = dataset.raw

       if adapter.respond_to?(:batch)
-        array = adapter.batch(reader, self)
-        min_id = array.min
-        max_id = array.max
+        series = adapter.batch(reader, self)
+        primary_key = series.name
       else
-        # Get all primary keys
-        begin
-          unless primary_key.present?
-            raise "Couldn't find primary key for feature #{feature_class}, check your feature class"
-          end
-          df = reader.query(select: primary_key)
-        rescue => e
-          raise "Couldn't find primary key #{primary_key.first} for feature #{feature_class}: #{e.message}"
-        end
-        return [] if df.nil?
-
-        min_id = df[primary_key.first].min
-        max_id = df[primary_key.last].max
+        primary_key = self.primary_key
       end

-      (min_id..max_id).step(batch_size).map.with_index do |batch_start, idx|
-        batch_end = [batch_start + batch_size, max_id + 1].min - 1
+      EasyML::Data::Partition::Boundaries.new(
+        reader.data(lazy: true),
+        primary_key,
+        batch_size
+      ).to_a.map.with_index do |partition, idx|
         {
           feature_id: id,
-          batch_start: batch_start,
-          batch_end: batch_end,
+          batch_start: partition[:partition_start],
+          batch_end: partition[:partition_end],
           batch_number: feature_position,
           subbatch_number: idx,
           parent_batch_id: Random.uuid,
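EasyML::Data::Partition::Boundaries is also new in this release (data/lib/easy_ml/data/partition/boundaries.rb, +60 lines) but its source is not shown in this section. Based on how build_batches consumes it above (to_a yielding hashes with :partition_start / :partition_end keys), here is a rough sketch of the boundary computation that replaces the old min/max stepping; treat the class body itself as an assumption, not the shipped code.

require "polars-df"

# Hypothetical stand-in for EasyML::Data::Partition::Boundaries.
class BoundariesSketch
  def initialize(lazy_df, primary_key, batch_size)
    @lazy_df = lazy_df                       # a Polars::LazyFrame
    @primary_key = Array(primary_key).first  # tolerate a single key or an array of keys
    @batch_size = batch_size
  end

  # One lazy pass to find the key range, then step through it in batch_size chunks.
  def to_a
    stats = @lazy_df.select([
      Polars.col(@primary_key).min.alias("min"),
      Polars.col(@primary_key).max.alias("max"),
    ]).collect
    min_id = stats["min"][0]
    max_id = stats["max"][0]

    (min_id..max_id).step(@batch_size).map do |start|
      { partition_start: start, partition_end: [start + @batch_size - 1, max_id].min }
    end
  end
end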
@@ -231,6 +221,8 @@ module EasyML
       jobs = ordered_features.map(&:build_batches)
       job_count = jobs.dup.flatten.size

+      ordered_features.each(&:wipe)
+
       # This is very important! For whatever reason, Resque BatchJob does not properly
       # handle batch finished callbacks for batch size = 1
       if async && job_count > 1
@@ -325,6 +317,7 @@ module EasyML
         params = {
           select: select,
           filter: filter,
+          sort: primary_key,
         }.compact
       else
         params = {}
@@ -438,24 +431,10 @@ module EasyML
     end

     def feature_store
-      @feature_store ||= EasyML::FeatureStore.new(self)
-    end
-
-    def upload_remote_files
-      feature_store.upload_remote_files
-    end
-
-    def files
-      feature_store.list_partitions
+      EasyML::FeatureStore.new(self)
     end

-    def query(**kwargs)
-      feature_store.query(**kwargs)
-    end
-
-    def store(df)
-      feature_store.store(df)
-    end
+    delegate :files, :query, :store, :compact, to: :feature_store

     def batch_size
       read_attribute(:batch_size) ||
@@ -466,6 +445,7 @@ module EasyML
     def after_fit
       update_sha

+      feature_store.compact
       updates = {
         fit_at: Time.current,
         needs_fit: false,
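Taken together, the two hunks above change Feature so that a fresh FeatureStore is built per call, the file-level operations become plain delegations, and compaction happens once after fitting. In terms of the public calls this diff keeps (the feature_id and batch_df names are illustrative):

feature = EasyML::Feature.find(feature_id)

feature.store(batch_df)   # delegated to EasyML::FeatureStore#store
feature.query             # delegated to EasyML::FeatureStore#query
feature.files             # delegated to EasyML::FeatureStore#files
feature.compact           # new delegation; also invoked automatically in after_fit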
data/app/serializers/easy_ml/dataset_serializer.rb CHANGED
@@ -84,7 +84,7 @@ module EasyML
    end

    attribute :needs_refresh do |dataset|
-     dataset.needs_refresh?(exclude: [:datasource_needs_refresh])
+     dataset.needs_refresh?(except: [:datasource_needs_refresh])
    end

    attribute :stacktrace do |object|
data/lib/easy_ml/data/dataset_manager/normalizer.rb ADDED
File without changes
data/lib/easy_ml/data/dataset_manager/reader/base.rb ADDED
@@ -0,0 +1,80 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Reader
+        class Base
+          DEFAULTS = {
+            drop_cols: [],
+            filter: nil,
+            limit: nil,
+            select: nil,
+            unique: nil,
+            sort: nil,
+            descending: false,
+            batch_size: nil,
+            batch_start: nil,
+            batch_key: nil,
+            lazy: false,
+          }
+
+          DEFAULTS.each do |k, _|
+            attr_accessor k
+          end
+          attr_accessor :block, :options, :input
+          attr_accessor :options
+
+          def initialize(options, &block)
+            options = apply_defaults(options)
+            @block = block
+            @options = options
+          end
+
+          def query
+            raise "Not implemented"
+          end
+
+          private
+
+          def apply_defaults(kwargs)
+            options = kwargs.dup
+
+            DEFAULTS.each do |k, default|
+              unless options.key?(k)
+                options[k] = default
+              end
+            end
+
+            options.each do |k, v|
+              send("#{k}=", v)
+            end
+
+            options
+          end
+
+          def query_dataframes(df, schema)
+            num_rows = df.is_a?(Polars::LazyFrame) ? df.select(Polars.length).collect[0, 0] : df.shape[0]
+            return df if num_rows == 0
+
+            # Apply the predicate filter if given
+            df = df.filter(filter) if filter
+            # Apply select columns if provided
+            df = df.select(select) if select.present?
+            df = df.unique if unique
+
+            # Apply sorting if provided
+            df = df.sort(sort, reverse: descending) if sort
+
+            # Apply drop columns
+            drop_cols = self.drop_cols
+            drop_cols &= schema.keys
+            df = df.drop(drop_cols) unless drop_cols.empty?
+
+            # Collect the DataFrame (execute the lazy operations)
+            df = df.limit(limit) if limit
+            lazy ? df : df.collect
+          end
+        end
+      end
+    end
+  end
+end
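query_dataframes is the single funnel every reader goes through: it layers the filter, select, unique, sort, drop_cols, and limit options as Polars lazy operations and only collects when lazy is false. A self-contained illustration of that pipeline on an in-memory frame (the data and option values here are illustrative, not part of easy_ml):

require "polars-df"

df = Polars::DataFrame.new({
  "id" => [3, 1, 2, 4],
  "label" => ["c", "a", "b", "d"],
}).lazy

result = df
  .filter(Polars.col("id") > 1)   # filter:
  .select(["id", "label"])        # select:
  .sort("id", reverse: true)      # sort: with descending: true
  .limit(2)                       # limit:
  .collect                        # lazy: false collects at the end

# result is a Polars::DataFrame containing ids 4 and 3, highest first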
data/lib/easy_ml/data/dataset_manager/reader/batch.rb ADDED
@@ -0,0 +1,106 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Reader
+        class Batch < File
+          def query
+            return batch_enumerator unless block.present?
+            return process_batches
+          end
+
+          private
+
+          def batch_enumerator
+            Enumerator.new do |yielder|
+              process_batches do |batch|
+                yielder << batch
+              end
+            end
+          end
+
+          def process_batches(&b)
+            raise "When using batch_size, sort must match primary key (#{batch_key})" if sort.present? && batch_key != sort
+            block = b || self.block
+
+            sort = batch_key
+
+            current_start = get_batch_start
+            final_value = get_final_value
+
+            while current_start < final_value
+              filter = Polars.col(sort) >= current_start
+              batch = query_files(filter: filter, limit: batch_size, lazy: true, sort: sort, descending: descending)
+              block.yield(batch)
+              current_start = File.new(input: input, lazy: true)
+                                  .query
+                                  .filter(filter)
+                                  .sort(sort, reverse: descending)
+                                  .limit(batch_size + 1)
+                                  .sort(sort, reverse: !descending)
+                                  .limit(1)
+                                  .select(sort)
+                                  .collect
+                                  .to_a.first&.dig(sort) || final_value
+            end
+          end
+
+          def query_files(overrides = {})
+            query = options.deep_dup.merge!(overrides).except(:batch_size, :batch_start, :batch_key)
+            File.new(query).query
+          end
+
+          def get_batch_start
+            if batch_start.present?
+              batch_start
+            else
+              get_sorted_batch_keys(descending)
+            end
+          end
+
+          def get_final_value
+            get_sorted_batch_keys(!descending)
+          end
+
+          def get_sorted_batch_keys(descending, filter: nil)
+            query = query_files(lazy: true)
+            query = query.filter(filter) if filter
+            query.sort(batch_key, reverse: descending).limit(1).select(batch_key).collect.to_a.last.dig(batch_key)
+          end
+
+          def batch_key
+            return @batch_key if @batch_key
+
+            lazy_df = lazy_frames([files.first]).first
+            if select
+              # Lazily filter only the selected columns
+              lazy_df = lazy_df.select(select)
+
+              # Lazily compute the unique count for each column and compare with total row count
+              primary_keys = select.select do |col|
+                lazy_df.select(col).unique.collect.height == lazy_df.collect.height
+              end
+            else
+              primary_keys = lazy_df.collect.columns.select do |col|
+                # Lazily count unique values and compare with the total row count
+                lazy_df.select(col).unique.collect.height == lazy_df.collect.height
+              end
+            end
+
+            if primary_keys.count > 1
+              key = primary_keys.detect { |key| key.underscore.split("_").any? { |k| k.match?(/id/) } }
+              if key
+                primary_keys = [key]
+              end
+            end
+
+            if primary_keys.count != 1
+              raise "Unable to determine primary key for dataset"
+            end
+
+            @batch_key = primary_keys.first
+          end
+        end
+      end
+    end
+  end
+end
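In practice the Batch reader is reached through the batch_size option on a File query: each iteration filters on the inferred primary key, takes up to batch_size rows lazily, then looks up the next key to resume from. A hedged usage sketch (the directory path and the handle_batch method are illustrative):

reader = EasyML::Data::DatasetManager::Reader

# With a block, each batch arrives as a lazy frame of up to 10_000 rows,
# sorted by the inferred primary key.
reader.query("path/to/parquet_dir", batch_size: 10_000) do |batch|
  handle_batch(batch.collect)
end

# Without a block, query returns an Enumerator of batches instead.
reader.query("path/to/parquet_dir", batch_size: 10_000, batch_start: 500).each do |batch|
  puts batch.collect.shape.inspect
end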
data/lib/easy_ml/data/dataset_manager/reader/data_frame.rb ADDED
@@ -0,0 +1,23 @@
+
+module EasyML
+  module Data
+    class DatasetManager
+      class Reader
+        class DataFrame < File
+          def query
+            return query_dataframes(lazy_frames, schema)
+          end
+
+          def schema
+            input.schema
+          end
+
+          private
+          def lazy_frames
+            input.lazy
+          end
+        end
+      end
+    end
+  end
+end
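The DataFrame adapter means the same query options work against an in-memory Polars::DataFrame or LazyFrame, not just parquet files on disk. For example (the data here is illustrative):

df = Polars::DataFrame.new({ "id" => [1, 2, 3], "value" => [10.0, 20.0, 30.0] })

EasyML::Data::DatasetManager::Reader.query(
  df,
  filter: Polars.col("value") > 10,
  select: ["id"],
  sort: "id",
  descending: true,
)
# => a Polars::DataFrame with ids [3, 2]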
data/lib/easy_ml/data/dataset_manager/reader/file.rb ADDED
@@ -0,0 +1,75 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Reader
+        class File < Base
+          attr_accessor :file_filter
+
+          def initialize(options = {})
+            super
+            @file_filter = options.dig(:file_filter) || ->(file) { true }
+          end
+
+          def query
+            return query_dataframes(dataframe, schema) unless batch_size.present?
+            return Batch.new(options, &block).query
+          end
+
+          def schema
+            @schema ||= files.any? ? Polars.read_parquet_schema(files.first) : nil
+          end
+
+          def files
+            filter_files do
+              if is_file?
+                @files ||= [input]
+              elsif is_dir?
+                @files ||= Dir.glob(::File.join(root_dir, "**/*.{parquet}"))
+              else
+                @files ||= []
+              end
+            end
+          end
+
+          private
+
+          def filter_files(&block)
+            yield
+            @files = @files.select(&file_filter)
+          end
+
+          def is_dir?
+            path.directory?
+          end
+
+          def is_file?
+            path.file?
+          end
+
+          def root_dir
+            path if is_dir?
+          end
+
+          def path
+            @path ||= input.is_a?(Pathname) ? input : Pathname.new(input)
+          end
+
+          def dataframe
+            @dataframe = lazy_frames.any? ? Polars.concat(lazy_frames) : Polars::LazyFrame.new
+          end
+
+          def lazy_frames(files = nil)
+            return @lazy_frames if @lazy_frames
+
+            files ||= self.files
+            @lazy_frames = files.map do |file|
+              Polars.scan_parquet(file)
+            end
+          end
+        end
+      end
+    end
+  end
+end
+
+require_relative "batch"
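The File adapter accepts either a single parquet path or a directory, and exposes the file list, the schema of the first file, and the concatenated (lazy) frame. A brief sketch, with an illustrative path and a file_filter that keeps only one split of the data:

reader = EasyML::Data::DatasetManager::Reader

# Schema of the first parquet file found under the directory
schema = reader.schema("datasets/my_dataset/files")

# Query only the files whose name matches, keeping the result lazy
train = reader.query(
  "datasets/my_dataset/files",
  file_filter: ->(file) { file.include?("train") },
  lazy: true,
)
train.collect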
data/lib/easy_ml/data/dataset_manager/reader.rb ADDED
@@ -0,0 +1,58 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Reader
+        require_relative "reader/base"
+        require_relative "reader/file"
+        require_relative "reader/data_frame"
+
+        ADAPTERS = [
+          File,
+          DataFrame,
+        ]
+
+        def self.query(input, **kwargs, &block)
+          adapter(input).new(
+            kwargs.merge!(input: input), &block
+          ).query
+        end
+
+        def self.schema(input, **kwargs, &block)
+          adapter(input).new(
+            kwargs.merge!(input: input), &block
+          ).schema
+        end
+
+        def self.files(dir)
+          Dir.glob(::File.join(dir, "**/*.{parquet}"))
+        end
+
+        def self.sha
+          files = sha.sort
+
+          file_hashes = files.map do |file|
+            meta = Polars.read_parquet_schema(file)
+            row_count = Polars.scan_parquet(file).select(Polars.col("*").count).collect[0, 0]
+
+            Digest::SHA256.hexdigest([
+              meta.to_json,
+              row_count.to_s,
+            ].join("|"))
+          end
+
+          Digest::SHA256.hexdigest(file_hashes.join)
+        end
+
+        private
+
+        def self.adapter(input)
+          if input.is_a?(Polars::DataFrame) || input.is_a?(Polars::LazyFrame)
+            DataFrame
+          else
+            File
+          end
+        end
+      end
+    end
+  end
+end
data/lib/easy_ml/data/dataset_manager/writer/append_only.rb ADDED
@@ -0,0 +1,67 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Writer
+        class AppendOnly < Base
+          attr_accessor :primary_key
+
+          def initialize(options)
+            super
+            @primary_key = options.dig(:primary_key)
+            raise "primary_key required for append_only writer" if primary_key.nil?
+            raise "filenames required: specify the prefix to uuse for unique new files" unless filenames.present?
+          end
+
+          def store
+            # If there are no existing files, just store as normal
+            return super if files.empty?
+
+            # Get existing data lazily
+            existing_keys = query(lazy: true)
+              .select(primary_key)
+              .collect[primary_key]
+              .to_a
+
+            # Convert input to lazy if it isn't already
+            input_data = df.is_a?(Polars::LazyFrame) ? df : df.lazy
+
+            # Filter out records that already exist
+            new_records = input_data.filter(
+              Polars.col(primary_key).is_in(existing_keys).not_
+            )
+
+            # If we have new records, store them
+            if new_records.clone.select(Polars.length).collect[0, 0] > 0
+              @df = new_records
+              store_to_unique_file
+            end
+          end
+
+          def compact
+            files = self.files
+            return if files.empty?
+
+            clear_unique_id
+
+            # Mv existing compacted parquet to a temp file, so it doesn't conflict with write,
+            # but can still be queried
+            compacted_file = File.join(root_dir, "compacted.parquet")
+            if File.exist?(compacted_file)
+              tmp_file = File.join(root_dir, "compacted.orig.parquet")
+              FileUtils.mv(compacted_file, tmp_file)
+            end
+            files = self.files
+
+            compacted_file.tap do |target_file|
+              compacted_data = query(lazy: true).sort(primary_key)
+
+              safe_write(compacted_data, target_file)
+              FileUtils.rm(files)
+              clear_unique_id
+            end
+          end
+        end
+      end
+    end
+  end
+end
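The append-only writer skips any incoming rows whose primary key is already on disk and writes the remainder to a uniquely named file; compact then merges the per-batch files into a single compacted.parquet sorted by the primary key. Assuming FeatureStore wires its DatasetManager to this writer (feature_store.rb is rewritten in this release but not shown above), the lifecycle driven from feature.rb looks roughly like this; the *_df variables are illustrative:

store = EasyML::FeatureStore.new(feature)

store.store(first_batch_df)    # no files yet -> plain write
store.store(second_batch_df)   # rows whose primary key already exists are skipped
store.compact                  # called from Feature#after_fit: merge batch files into compacted.parquet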