easy_ml 0.2.0.pre.rc76 → 0.2.0.pre.rc77

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. checksums.yaml +4 -4
  2. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +1 -5
  3. data/app/models/easy_ml/dataset/refresh_reasons.rb +12 -0
  4. data/app/models/easy_ml/dataset.rb +4 -49
  5. data/app/models/easy_ml/feature.rb +16 -36
  6. data/app/serializers/easy_ml/dataset_serializer.rb +1 -1
  7. data/lib/easy_ml/data/dataset_manager/normalizer.rb +0 -0
  8. data/lib/easy_ml/data/dataset_manager/reader/base.rb +80 -0
  9. data/lib/easy_ml/data/dataset_manager/reader/batch.rb +106 -0
  10. data/lib/easy_ml/data/dataset_manager/reader/data_frame.rb +23 -0
  11. data/lib/easy_ml/data/dataset_manager/reader/file.rb +75 -0
  12. data/lib/easy_ml/data/dataset_manager/reader.rb +58 -0
  13. data/lib/easy_ml/data/dataset_manager/writer/append_only.rb +67 -0
  14. data/lib/easy_ml/data/dataset_manager/writer/base.rb +122 -0
  15. data/lib/easy_ml/data/dataset_manager/writer/named.rb +14 -0
  16. data/lib/easy_ml/data/dataset_manager/writer/partitioned/partition_reasons.rb +15 -0
  17. data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb +150 -0
  18. data/lib/easy_ml/data/dataset_manager/writer.rb +76 -0
  19. data/lib/easy_ml/data/dataset_manager.rb +134 -0
  20. data/lib/easy_ml/data/partition/boundaries.rb +60 -0
  21. data/lib/easy_ml/data/partition.rb +7 -0
  22. data/lib/easy_ml/data/synced_directory.rb +1 -2
  23. data/lib/easy_ml/data.rb +2 -0
  24. data/lib/easy_ml/feature_store.rb +15 -185
  25. data/lib/easy_ml/reasons.rb +41 -0
  26. data/lib/easy_ml/version.rb +1 -1
  27. data/lib/easy_ml.rb +1 -1
  28. metadata +20 -4
  29. data/lib/easy_ml/data/filter_extensions.rb +0 -31
  30. /data/app/models/{lineage_history.rb → easy_ml/lineage_history.rb} +0 -0
data/lib/easy_ml/feature_store.rb CHANGED
@@ -1,67 +1,23 @@
1
1
  module EasyML
2
- class FeatureStore
2
+ class FeatureStore < EasyML::Data::DatasetManager
3
3
  attr_reader :feature
4
4
 
5
5
  def initialize(feature)
6
6
  @feature = feature
7
- end
8
-
9
- def store(df)
10
- primary_key = feature.primary_key&.first
11
- return store_without_partitioning(df) unless df.columns.include?(primary_key)
12
- return store_without_partitioning(df) unless primary_key
13
-
14
- min_key = df[primary_key].min
15
- max_key = df[primary_key].max
16
- batch_size = feature.batch_size || 10_000
17
-
18
- begin
19
- # We are intentionally not using to_i, so it will raise an error for keys like "A1"
20
- min_key = Integer(min_key) if min_key.is_a?(String)
21
- max_key = Integer(max_key) if max_key.is_a?(String)
22
- rescue ArgumentError
23
- return store_without_partitioning(df)
24
- end
25
-
26
- # Only partition if we have integer keys where we can predict boundaries
27
- return store_without_partitioning(df) unless min_key.is_a?(Integer) && max_key.is_a?(Integer)
28
-
29
- partitions = compute_partition_boundaries(min_key, max_key, batch_size)
30
- partitions.each do |partition_start|
31
- partition_end = partition_start + batch_size - 1
32
- partition_df = df.filter(
33
- (Polars.col(primary_key) >= partition_start) &
34
- (Polars.col(primary_key) <= partition_end)
35
- )
36
-
37
- next if partition_df.height == 0
38
-
39
- store_partition(partition_df, primary_key, partition_start)
40
- end
41
- end
42
-
43
- def query(**kwargs)
44
- query_all_partitions(**kwargs)
45
- end
46
-
47
- def empty?
48
- list_partitions.empty?
49
- end
50
-
51
- def list_partitions
52
- Dir.glob(File.join(feature_dir, "feature*.parquet")).sort
53
- end
54
-
55
- def wipe
56
- FileUtils.rm_rf(feature_dir)
57
- end
58
7
 
59
- def upload_remote_files
60
- synced_directory.upload
61
- end
8
+ datasource_config = feature.dataset.datasource.configuration || {}
62
9
 
63
- def download
64
- synced_directory&.download
10
+ options = {
11
+ root_dir: feature_dir,
12
+ filenames: "feature",
13
+ append_only: false,
14
+ primary_key: feature.primary_key&.first,
15
+ partition_size: batch_size,
16
+ s3_bucket: datasource_config.dig("s3_bucket") || EasyML::Configuration.s3_bucket,
17
+ s3_prefix: s3_prefix,
18
+ polars_args: datasource_config.dig("polars_args"),
19
+ }.compact
20
+ super(options)
65
21
  end
66
22
 
67
23
  def cp(old_version, new_version)
@@ -82,68 +38,8 @@ module EasyML
82
38
 
83
39
  private
84
40
 
85
- def cleanup(type: :partitions)
86
- case type
87
- when :partitions
88
- list_partitions.each do |partition|
89
- FileUtils.rm(partition)
90
- end
91
- when :no_partitions
92
- FileUtils.rm_rf(feature_path)
93
- when :all
94
- wipe
95
- end
96
- end
97
-
98
- def store_without_partitioning(df)
99
- lock_file do
100
- cleanup(type: :partitions)
101
- path = feature_path
102
- safe_write(df, path)
103
- end
104
- end
105
-
106
- def safe_write(df, path)
107
- FileUtils.mkdir_p(File.dirname(path))
108
- df.write_parquet(path)
109
- end
110
-
111
- def store_partition(partition_df, primary_key, partition_start)
112
- lock_partition(partition_start) do
113
- cleanup(type: :no_partitions)
114
- path = partition_path(partition_start)
115
-
116
- if File.exist?(path)
117
- reader = EasyML::Data::PolarsReader.new
118
- existing_df = reader.query([path])
119
- preserved_records = existing_df.filter(
120
- Polars.col(primary_key).is_in(partition_df[primary_key]).is_not
121
- )
122
- if preserved_records.shape[1] != partition_df.shape[1]
123
- wipe
124
- else
125
- partition_df = Polars.concat([preserved_records, partition_df], how: "vertical")
126
- end
127
- end
128
-
129
- safe_write(partition_df, path)
130
- end
131
- end
132
-
133
- def query_all_partitions(**kwargs)
134
- reader = EasyML::Data::PolarsReader.new
135
- pattern = File.join(feature_dir, "feature*.parquet")
136
- files = Dir.glob(pattern)
137
-
138
- return Polars::DataFrame.new if files.empty?
139
-
140
- reader.query(files, **kwargs)
141
- end
142
-
143
- def compute_partition_boundaries(min_key, max_key, batch_size)
144
- start_partition = (min_key / batch_size.to_f).floor * batch_size
145
- end_partition = (max_key / batch_size.to_f).floor * batch_size
146
- (start_partition..end_partition).step(batch_size).to_a
41
+ def batch_size
42
+ @batch_size ||= feature.batch_size || 10_000
147
43
  end
148
44
 
149
45
  def feature_dir_for_version(version)
@@ -161,74 +57,8 @@ module EasyML
161
57
  feature_dir_for_version(feature.version)
162
58
  end
163
59
 
164
- def feature_path
165
- File.join(feature_dir, "feature.parquet")
166
- end
167
-
168
- def partition_path(partition_start)
169
- File.join(feature_dir, "feature#{partition_start}.parquet")
170
- end
171
-
172
60
  def s3_prefix
173
61
  File.join("datasets", feature_dir.split("datasets").last)
174
62
  end
175
-
176
- def synced_directory
177
- return unless feature.dataset&.datasource.present?
178
-
179
- datasource_config = feature.dataset.datasource.configuration || {}
180
- @synced_dir ||= EasyML::Data::SyncedDirectory.new(
181
- root_dir: feature_dir,
182
- s3_bucket: datasource_config.dig("s3_bucket") || EasyML::Configuration.s3_bucket,
183
- s3_prefix: s3_prefix,
184
- s3_access_key_id: EasyML::Configuration.s3_access_key_id,
185
- s3_secret_access_key: EasyML::Configuration.s3_secret_access_key,
186
- polars_args: datasource_config.dig("polars_args"),
187
- cache_for: 0,
188
- )
189
- end
190
-
191
- def lock_partition(partition_start)
192
- Support::Lockable.with_lock(partition_lock_key(partition_start), wait_timeout: 2, stale_timeout: 60) do |client|
193
- begin
194
- yield client if block_given?
195
- ensure
196
- unlock_partition(partition_start)
197
- end
198
- end
199
- end
200
-
201
- def lock_file
202
- Support::Lockable.with_lock(file_lock_key, wait_timeout: 2, stale_timeout: 60) do |client|
203
- begin
204
- yield client if block_given?
205
- ensure
206
- unlock_file
207
- end
208
- end
209
- end
210
-
211
- def unlock_partition(partition_start)
212
- Support::Lockable.unlock!(partition_lock_key(partition_start))
213
- end
214
-
215
- def unlock_file
216
- Support::Lockable.unlock!(file_lock_key)
217
- end
218
-
219
- def unlock_all_partitions
220
- list_partitions.each do |partition_path|
221
- partition_start = partition_path.match(/feature(\d+)\.parquet/)[1].to_i
222
- unlock_partition(partition_start)
223
- end
224
- end
225
-
226
- def partition_lock_key(partition_start)
227
- "feature_store:#{feature.id}.partition.#{partition_start}"
228
- end
229
-
230
- def file_lock_key
231
- "feature_store:#{feature.id}.file"
232
- end
233
63
  end
234
64
  end
data/lib/easy_ml/reasons.rb ADDED
@@ -0,0 +1,41 @@
1
+ module EasyML
2
+ class Reasons
3
+ def initialize(context)
4
+ @context = context
5
+ @reasons = {}
6
+ end
7
+
8
+ class << self
9
+ def add_reason(name, check)
10
+ @reasons ||= {}
11
+ key = name.to_s.downcase.gsub(/\s/, "_").to_sym
12
+ @reasons[key] = { name: name, check: check }
13
+ end
14
+
15
+ def reasons
16
+ @reasons ||= {}
17
+ end
18
+ end
19
+
20
+ def inspect
21
+ "#<#{self.class.name.split("::").last} checks=[#{self.class.reasons.map { |k, v| "#{v[:name]}" }.join(", ")}]>"
22
+ end
23
+
24
+ def none?(except: [])
25
+ check(except: except).none?
26
+ end
27
+
28
+ def check(except: [])
29
+ self.class.reasons.except(*except).select do |_, config|
30
+ @context.instance_exec(&config[:check])
31
+ end.map do |_, config|
32
+ config[:name]
33
+ end
34
+ end
35
+
36
+ def explain
37
+ reasons = check
38
+ reasons.any? ? reasons.join(", ") : :none
39
+ end
40
+ end
41
+ end
data/lib/easy_ml/version.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module EasyML
4
- VERSION = "0.2.0-rc76"
4
+ VERSION = "0.2.0-rc77"
5
5
 
6
6
  module Version
7
7
  end
data/lib/easy_ml.rb CHANGED
@@ -15,13 +15,13 @@ module EasyML
15
15
  class Error < StandardError; end
16
16
 
17
17
  require_relative "easy_ml/configuration"
18
+ require_relative "easy_ml/reasons"
18
19
  require_relative "easy_ml/deep_compact"
19
20
  require_relative "easy_ml/timing"
20
21
  require_relative "easy_ml/support"
21
22
  require_relative "easy_ml/core_ext"
22
23
  require_relative "easy_ml/logging"
23
24
  require_relative "easy_ml/data"
24
- require_relative "easy_ml/data/filter_extensions"
25
25
  require_relative "easy_ml/evaluators"
26
26
  require_relative "easy_ml/features"
27
27
  require_relative "easy_ml/feature_store"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: easy_ml
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0.pre.rc76
4
+ version: 0.2.0.pre.rc77
5
5
  platform: ruby
6
6
  authors:
7
7
  - Brett Shollenberger
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-02-13 00:00:00.000000000 Z
11
+ date: 2025-02-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activerecord
@@ -604,6 +604,7 @@ files:
604
604
  - app/models/easy_ml/dataset/learner/lazy/query.rb
605
605
  - app/models/easy_ml/dataset/learner/lazy/string.rb
606
606
  - app/models/easy_ml/dataset/learner/query.rb
607
+ - app/models/easy_ml/dataset/refresh_reasons.rb
607
608
  - app/models/easy_ml/dataset_history.rb
608
609
  - app/models/easy_ml/datasource.rb
609
610
  - app/models/easy_ml/datasource_history.rb
@@ -631,6 +632,7 @@ files:
631
632
  - app/models/easy_ml/import/retraining_job.rb
632
633
  - app/models/easy_ml/import/splitter.rb
633
634
  - app/models/easy_ml/lineage.rb
635
+ - app/models/easy_ml/lineage_history.rb
634
636
  - app/models/easy_ml/model.rb
635
637
  - app/models/easy_ml/model_file.rb
636
638
  - app/models/easy_ml/model_file_history.rb
@@ -657,7 +659,6 @@ files:
657
659
  - app/models/easy_ml/splitters/random_splitter.rb
658
660
  - app/models/easy_ml/tuner_job.rb
659
661
  - app/models/easy_ml/tuner_run.rb
660
- - app/models/lineage_history.rb
661
662
  - app/serializers/easy_ml/column_serializer.rb
662
663
  - app/serializers/easy_ml/dataset_serializer.rb
663
664
  - app/serializers/easy_ml/datasource_serializer.rb
@@ -705,8 +706,22 @@ files:
705
706
  - lib/easy_ml/core_ext/hash.rb
706
707
  - lib/easy_ml/core_ext/pathname.rb
707
708
  - lib/easy_ml/data.rb
709
+ - lib/easy_ml/data/dataset_manager.rb
710
+ - lib/easy_ml/data/dataset_manager/normalizer.rb
711
+ - lib/easy_ml/data/dataset_manager/reader.rb
712
+ - lib/easy_ml/data/dataset_manager/reader/base.rb
713
+ - lib/easy_ml/data/dataset_manager/reader/batch.rb
714
+ - lib/easy_ml/data/dataset_manager/reader/data_frame.rb
715
+ - lib/easy_ml/data/dataset_manager/reader/file.rb
716
+ - lib/easy_ml/data/dataset_manager/writer.rb
717
+ - lib/easy_ml/data/dataset_manager/writer/append_only.rb
718
+ - lib/easy_ml/data/dataset_manager/writer/base.rb
719
+ - lib/easy_ml/data/dataset_manager/writer/named.rb
720
+ - lib/easy_ml/data/dataset_manager/writer/partitioned.rb
721
+ - lib/easy_ml/data/dataset_manager/writer/partitioned/partition_reasons.rb
708
722
  - lib/easy_ml/data/date_converter.rb
709
- - lib/easy_ml/data/filter_extensions.rb
723
+ - lib/easy_ml/data/partition.rb
724
+ - lib/easy_ml/data/partition/boundaries.rb
710
725
  - lib/easy_ml/data/polars_column.rb
711
726
  - lib/easy_ml/data/polars_in_memory.rb
712
727
  - lib/easy_ml/data/polars_reader.rb
@@ -765,6 +780,7 @@ files:
765
780
  - lib/easy_ml/railtie/templates/migration/remove_evaluator_from_retraining_jobs.rb.tt
766
781
  - lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt
767
782
  - lib/easy_ml/railtie/templates/migration/update_preprocessing_steps_to_jsonb.rb.tt
783
+ - lib/easy_ml/reasons.rb
768
784
  - lib/easy_ml/support.rb
769
785
  - lib/easy_ml/support/age.rb
770
786
  - lib/easy_ml/support/est.rb
data/lib/easy_ml/data/filter_extensions.rb REMOVED
@@ -1,31 +0,0 @@
1
- module EasyML
2
- module Data
3
- module FilterExtensions
4
- def is_primary_key_filter?(primary_key)
5
- return false unless primary_key
6
- primary_key = [primary_key] unless primary_key.is_a?(Array)
7
- # Filter expressions in Polars are represented as strings like:
8
- # [([(col("LOAN_APP_ID")) > (dyn int: 4)]) & ([(col("LOAN_APP_ID")) < (dyn int: 16)])]
9
- expr_str = to_s
10
- return false unless expr_str.include?(primary_key.first)
11
-
12
- # Check for common primary key operations
13
- primary_key_ops = [">", "<", ">=", "<=", "=", "eq", "gt", "lt", "ge", "le"]
14
- primary_key_ops.any? { |op| expr_str.include?(op) }
15
- end
16
-
17
- def extract_primary_key_values
18
- expr_str = to_s
19
- # Extract numeric values from the expression
20
- # This will match both integers and floats
21
- values = expr_str.scan(/(?:dyn int|float): (-?\d+(?:\.\d+)?)/).flatten.map(&:to_f)
22
- values.uniq
23
- end
24
- end
25
- end
26
- end
27
-
28
- # Extend Polars classes with our filter functionality
29
- [Polars::Expr].each do |klass|
30
- klass.include(EasyML::Data::FilterExtensions)
31
- end