easy_ml 0.2.0.pre.rc76 → 0.2.0.pre.rc77
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +1 -5
- data/app/models/easy_ml/dataset/refresh_reasons.rb +12 -0
- data/app/models/easy_ml/dataset.rb +4 -49
- data/app/models/easy_ml/feature.rb +16 -36
- data/app/serializers/easy_ml/dataset_serializer.rb +1 -1
- data/lib/easy_ml/data/dataset_manager/normalizer.rb +0 -0
- data/lib/easy_ml/data/dataset_manager/reader/base.rb +80 -0
- data/lib/easy_ml/data/dataset_manager/reader/batch.rb +106 -0
- data/lib/easy_ml/data/dataset_manager/reader/data_frame.rb +23 -0
- data/lib/easy_ml/data/dataset_manager/reader/file.rb +75 -0
- data/lib/easy_ml/data/dataset_manager/reader.rb +58 -0
- data/lib/easy_ml/data/dataset_manager/writer/append_only.rb +67 -0
- data/lib/easy_ml/data/dataset_manager/writer/base.rb +122 -0
- data/lib/easy_ml/data/dataset_manager/writer/named.rb +14 -0
- data/lib/easy_ml/data/dataset_manager/writer/partitioned/partition_reasons.rb +15 -0
- data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb +150 -0
- data/lib/easy_ml/data/dataset_manager/writer.rb +76 -0
- data/lib/easy_ml/data/dataset_manager.rb +134 -0
- data/lib/easy_ml/data/partition/boundaries.rb +60 -0
- data/lib/easy_ml/data/partition.rb +7 -0
- data/lib/easy_ml/data/synced_directory.rb +1 -2
- data/lib/easy_ml/data.rb +2 -0
- data/lib/easy_ml/feature_store.rb +15 -185
- data/lib/easy_ml/reasons.rb +41 -0
- data/lib/easy_ml/version.rb +1 -1
- data/lib/easy_ml.rb +1 -1
- metadata +20 -4
- data/lib/easy_ml/data/filter_extensions.rb +0 -31
- /data/app/models/{lineage_history.rb → easy_ml/lineage_history.rb} +0 -0
@@ -1,67 +1,23 @@
|
|
1
1
|
module EasyML
|
2
|
-
class FeatureStore
|
2
|
+
class FeatureStore < EasyML::Data::DatasetManager
|
3
3
|
attr_reader :feature
|
4
4
|
|
5
5
|
def initialize(feature)
|
6
6
|
@feature = feature
|
7
|
-
end
|
8
|
-
|
9
|
-
def store(df)
|
10
|
-
primary_key = feature.primary_key&.first
|
11
|
-
return store_without_partitioning(df) unless df.columns.include?(primary_key)
|
12
|
-
return store_without_partitioning(df) unless primary_key
|
13
|
-
|
14
|
-
min_key = df[primary_key].min
|
15
|
-
max_key = df[primary_key].max
|
16
|
-
batch_size = feature.batch_size || 10_000
|
17
|
-
|
18
|
-
begin
|
19
|
-
# We are intentionally not using to_i, so it will raise an error for keys like "A1"
|
20
|
-
min_key = Integer(min_key) if min_key.is_a?(String)
|
21
|
-
max_key = Integer(max_key) if max_key.is_a?(String)
|
22
|
-
rescue ArgumentError
|
23
|
-
return store_without_partitioning(df)
|
24
|
-
end
|
25
|
-
|
26
|
-
# Only partition if we have integer keys where we can predict boundaries
|
27
|
-
return store_without_partitioning(df) unless min_key.is_a?(Integer) && max_key.is_a?(Integer)
|
28
|
-
|
29
|
-
partitions = compute_partition_boundaries(min_key, max_key, batch_size)
|
30
|
-
partitions.each do |partition_start|
|
31
|
-
partition_end = partition_start + batch_size - 1
|
32
|
-
partition_df = df.filter(
|
33
|
-
(Polars.col(primary_key) >= partition_start) &
|
34
|
-
(Polars.col(primary_key) <= partition_end)
|
35
|
-
)
|
36
|
-
|
37
|
-
next if partition_df.height == 0
|
38
|
-
|
39
|
-
store_partition(partition_df, primary_key, partition_start)
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
|
-
def query(**kwargs)
|
44
|
-
query_all_partitions(**kwargs)
|
45
|
-
end
|
46
|
-
|
47
|
-
def empty?
|
48
|
-
list_partitions.empty?
|
49
|
-
end
|
50
|
-
|
51
|
-
def list_partitions
|
52
|
-
Dir.glob(File.join(feature_dir, "feature*.parquet")).sort
|
53
|
-
end
|
54
|
-
|
55
|
-
def wipe
|
56
|
-
FileUtils.rm_rf(feature_dir)
|
57
|
-
end
|
58
7
|
|
59
|
-
|
60
|
-
synced_directory.upload
|
61
|
-
end
|
8
|
+
datasource_config = feature.dataset.datasource.configuration || {}
|
62
9
|
|
63
|
-
|
64
|
-
|
10
|
+
options = {
|
11
|
+
root_dir: feature_dir,
|
12
|
+
filenames: "feature",
|
13
|
+
append_only: false,
|
14
|
+
primary_key: feature.primary_key&.first,
|
15
|
+
partition_size: batch_size,
|
16
|
+
s3_bucket: datasource_config.dig("s3_bucket") || EasyML::Configuration.s3_bucket,
|
17
|
+
s3_prefix: s3_prefix,
|
18
|
+
polars_args: datasource_config.dig("polars_args"),
|
19
|
+
}.compact
|
20
|
+
super(options)
|
65
21
|
end
|
66
22
|
|
67
23
|
def cp(old_version, new_version)
|
@@ -82,68 +38,8 @@ module EasyML
|
|
82
38
|
|
83
39
|
private
|
84
40
|
|
85
|
-
def
|
86
|
-
|
87
|
-
when :partitions
|
88
|
-
list_partitions.each do |partition|
|
89
|
-
FileUtils.rm(partition)
|
90
|
-
end
|
91
|
-
when :no_partitions
|
92
|
-
FileUtils.rm_rf(feature_path)
|
93
|
-
when :all
|
94
|
-
wipe
|
95
|
-
end
|
96
|
-
end
|
97
|
-
|
98
|
-
def store_without_partitioning(df)
|
99
|
-
lock_file do
|
100
|
-
cleanup(type: :partitions)
|
101
|
-
path = feature_path
|
102
|
-
safe_write(df, path)
|
103
|
-
end
|
104
|
-
end
|
105
|
-
|
106
|
-
def safe_write(df, path)
|
107
|
-
FileUtils.mkdir_p(File.dirname(path))
|
108
|
-
df.write_parquet(path)
|
109
|
-
end
|
110
|
-
|
111
|
-
def store_partition(partition_df, primary_key, partition_start)
|
112
|
-
lock_partition(partition_start) do
|
113
|
-
cleanup(type: :no_partitions)
|
114
|
-
path = partition_path(partition_start)
|
115
|
-
|
116
|
-
if File.exist?(path)
|
117
|
-
reader = EasyML::Data::PolarsReader.new
|
118
|
-
existing_df = reader.query([path])
|
119
|
-
preserved_records = existing_df.filter(
|
120
|
-
Polars.col(primary_key).is_in(partition_df[primary_key]).is_not
|
121
|
-
)
|
122
|
-
if preserved_records.shape[1] != partition_df.shape[1]
|
123
|
-
wipe
|
124
|
-
else
|
125
|
-
partition_df = Polars.concat([preserved_records, partition_df], how: "vertical")
|
126
|
-
end
|
127
|
-
end
|
128
|
-
|
129
|
-
safe_write(partition_df, path)
|
130
|
-
end
|
131
|
-
end
|
132
|
-
|
133
|
-
def query_all_partitions(**kwargs)
|
134
|
-
reader = EasyML::Data::PolarsReader.new
|
135
|
-
pattern = File.join(feature_dir, "feature*.parquet")
|
136
|
-
files = Dir.glob(pattern)
|
137
|
-
|
138
|
-
return Polars::DataFrame.new if files.empty?
|
139
|
-
|
140
|
-
reader.query(files, **kwargs)
|
141
|
-
end
|
142
|
-
|
143
|
-
def compute_partition_boundaries(min_key, max_key, batch_size)
|
144
|
-
start_partition = (min_key / batch_size.to_f).floor * batch_size
|
145
|
-
end_partition = (max_key / batch_size.to_f).floor * batch_size
|
146
|
-
(start_partition..end_partition).step(batch_size).to_a
|
41
|
+
def batch_size
|
42
|
+
@batch_size ||= feature.batch_size || 10_000
|
147
43
|
end
|
148
44
|
|
149
45
|
def feature_dir_for_version(version)
|
@@ -161,74 +57,8 @@ module EasyML
|
|
161
57
|
feature_dir_for_version(feature.version)
|
162
58
|
end
|
163
59
|
|
164
|
-
def feature_path
|
165
|
-
File.join(feature_dir, "feature.parquet")
|
166
|
-
end
|
167
|
-
|
168
|
-
def partition_path(partition_start)
|
169
|
-
File.join(feature_dir, "feature#{partition_start}.parquet")
|
170
|
-
end
|
171
|
-
|
172
60
|
def s3_prefix
|
173
61
|
File.join("datasets", feature_dir.split("datasets").last)
|
174
62
|
end
|
175
|
-
|
176
|
-
def synced_directory
|
177
|
-
return unless feature.dataset&.datasource.present?
|
178
|
-
|
179
|
-
datasource_config = feature.dataset.datasource.configuration || {}
|
180
|
-
@synced_dir ||= EasyML::Data::SyncedDirectory.new(
|
181
|
-
root_dir: feature_dir,
|
182
|
-
s3_bucket: datasource_config.dig("s3_bucket") || EasyML::Configuration.s3_bucket,
|
183
|
-
s3_prefix: s3_prefix,
|
184
|
-
s3_access_key_id: EasyML::Configuration.s3_access_key_id,
|
185
|
-
s3_secret_access_key: EasyML::Configuration.s3_secret_access_key,
|
186
|
-
polars_args: datasource_config.dig("polars_args"),
|
187
|
-
cache_for: 0,
|
188
|
-
)
|
189
|
-
end
|
190
|
-
|
191
|
-
def lock_partition(partition_start)
|
192
|
-
Support::Lockable.with_lock(partition_lock_key(partition_start), wait_timeout: 2, stale_timeout: 60) do |client|
|
193
|
-
begin
|
194
|
-
yield client if block_given?
|
195
|
-
ensure
|
196
|
-
unlock_partition(partition_start)
|
197
|
-
end
|
198
|
-
end
|
199
|
-
end
|
200
|
-
|
201
|
-
def lock_file
|
202
|
-
Support::Lockable.with_lock(file_lock_key, wait_timeout: 2, stale_timeout: 60) do |client|
|
203
|
-
begin
|
204
|
-
yield client if block_given?
|
205
|
-
ensure
|
206
|
-
unlock_file
|
207
|
-
end
|
208
|
-
end
|
209
|
-
end
|
210
|
-
|
211
|
-
def unlock_partition(partition_start)
|
212
|
-
Support::Lockable.unlock!(partition_lock_key(partition_start))
|
213
|
-
end
|
214
|
-
|
215
|
-
def unlock_file
|
216
|
-
Support::Lockable.unlock!(file_lock_key)
|
217
|
-
end
|
218
|
-
|
219
|
-
def unlock_all_partitions
|
220
|
-
list_partitions.each do |partition_path|
|
221
|
-
partition_start = partition_path.match(/feature(\d+)\.parquet/)[1].to_i
|
222
|
-
unlock_partition(partition_start)
|
223
|
-
end
|
224
|
-
end
|
225
|
-
|
226
|
-
def partition_lock_key(partition_start)
|
227
|
-
"feature_store:#{feature.id}.partition.#{partition_start}"
|
228
|
-
end
|
229
|
-
|
230
|
-
def file_lock_key
|
231
|
-
"feature_store:#{feature.id}.file"
|
232
|
-
end
|
233
63
|
end
|
234
64
|
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
module EasyML
|
2
|
+
class Reasons
|
3
|
+
def initialize(context)
|
4
|
+
@context = context
|
5
|
+
@reasons = {}
|
6
|
+
end
|
7
|
+
|
8
|
+
class << self
|
9
|
+
def add_reason(name, check)
|
10
|
+
@reasons ||= {}
|
11
|
+
key = name.to_s.downcase.gsub(/\s/, "_").to_sym
|
12
|
+
@reasons[key] = { name: name, check: check }
|
13
|
+
end
|
14
|
+
|
15
|
+
def reasons
|
16
|
+
@reasons ||= {}
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
def inspect
|
21
|
+
"#<#{self.class.name.split("::").last} checks=[#{self.class.reasons.map { |k, v| "#{v[:name]}" }.join(", ")}]>"
|
22
|
+
end
|
23
|
+
|
24
|
+
def none?(except: [])
|
25
|
+
check(except: except).none?
|
26
|
+
end
|
27
|
+
|
28
|
+
def check(except: [])
|
29
|
+
self.class.reasons.except(*except).select do |_, config|
|
30
|
+
@context.instance_exec(&config[:check])
|
31
|
+
end.map do |_, config|
|
32
|
+
config[:name]
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
def explain
|
37
|
+
reasons = check
|
38
|
+
reasons.any? ? reasons.join(", ") : :none
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
data/lib/easy_ml/version.rb
CHANGED
data/lib/easy_ml.rb
CHANGED
@@ -15,13 +15,13 @@ module EasyML
|
|
15
15
|
class Error < StandardError; end
|
16
16
|
|
17
17
|
require_relative "easy_ml/configuration"
|
18
|
+
require_relative "easy_ml/reasons"
|
18
19
|
require_relative "easy_ml/deep_compact"
|
19
20
|
require_relative "easy_ml/timing"
|
20
21
|
require_relative "easy_ml/support"
|
21
22
|
require_relative "easy_ml/core_ext"
|
22
23
|
require_relative "easy_ml/logging"
|
23
24
|
require_relative "easy_ml/data"
|
24
|
-
require_relative "easy_ml/data/filter_extensions"
|
25
25
|
require_relative "easy_ml/evaluators"
|
26
26
|
require_relative "easy_ml/features"
|
27
27
|
require_relative "easy_ml/feature_store"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: easy_ml
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0.pre.rc76
|
4
|
+
version: 0.2.0.pre.rc77
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Brett Shollenberger
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-02-
|
11
|
+
date: 2025-02-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activerecord
|
@@ -604,6 +604,7 @@ files:
|
|
604
604
|
- app/models/easy_ml/dataset/learner/lazy/query.rb
|
605
605
|
- app/models/easy_ml/dataset/learner/lazy/string.rb
|
606
606
|
- app/models/easy_ml/dataset/learner/query.rb
|
607
|
+
- app/models/easy_ml/dataset/refresh_reasons.rb
|
607
608
|
- app/models/easy_ml/dataset_history.rb
|
608
609
|
- app/models/easy_ml/datasource.rb
|
609
610
|
- app/models/easy_ml/datasource_history.rb
|
@@ -631,6 +632,7 @@ files:
|
|
631
632
|
- app/models/easy_ml/import/retraining_job.rb
|
632
633
|
- app/models/easy_ml/import/splitter.rb
|
633
634
|
- app/models/easy_ml/lineage.rb
|
635
|
+
- app/models/easy_ml/lineage_history.rb
|
634
636
|
- app/models/easy_ml/model.rb
|
635
637
|
- app/models/easy_ml/model_file.rb
|
636
638
|
- app/models/easy_ml/model_file_history.rb
|
@@ -657,7 +659,6 @@ files:
|
|
657
659
|
- app/models/easy_ml/splitters/random_splitter.rb
|
658
660
|
- app/models/easy_ml/tuner_job.rb
|
659
661
|
- app/models/easy_ml/tuner_run.rb
|
660
|
-
- app/models/lineage_history.rb
|
661
662
|
- app/serializers/easy_ml/column_serializer.rb
|
662
663
|
- app/serializers/easy_ml/dataset_serializer.rb
|
663
664
|
- app/serializers/easy_ml/datasource_serializer.rb
|
@@ -705,8 +706,22 @@ files:
|
|
705
706
|
- lib/easy_ml/core_ext/hash.rb
|
706
707
|
- lib/easy_ml/core_ext/pathname.rb
|
707
708
|
- lib/easy_ml/data.rb
|
709
|
+
- lib/easy_ml/data/dataset_manager.rb
|
710
|
+
- lib/easy_ml/data/dataset_manager/normalizer.rb
|
711
|
+
- lib/easy_ml/data/dataset_manager/reader.rb
|
712
|
+
- lib/easy_ml/data/dataset_manager/reader/base.rb
|
713
|
+
- lib/easy_ml/data/dataset_manager/reader/batch.rb
|
714
|
+
- lib/easy_ml/data/dataset_manager/reader/data_frame.rb
|
715
|
+
- lib/easy_ml/data/dataset_manager/reader/file.rb
|
716
|
+
- lib/easy_ml/data/dataset_manager/writer.rb
|
717
|
+
- lib/easy_ml/data/dataset_manager/writer/append_only.rb
|
718
|
+
- lib/easy_ml/data/dataset_manager/writer/base.rb
|
719
|
+
- lib/easy_ml/data/dataset_manager/writer/named.rb
|
720
|
+
- lib/easy_ml/data/dataset_manager/writer/partitioned.rb
|
721
|
+
- lib/easy_ml/data/dataset_manager/writer/partitioned/partition_reasons.rb
|
708
722
|
- lib/easy_ml/data/date_converter.rb
|
709
|
-
- lib/easy_ml/data/filter_extensions.rb
|
723
|
+
- lib/easy_ml/data/partition.rb
|
724
|
+
- lib/easy_ml/data/partition/boundaries.rb
|
710
725
|
- lib/easy_ml/data/polars_column.rb
|
711
726
|
- lib/easy_ml/data/polars_in_memory.rb
|
712
727
|
- lib/easy_ml/data/polars_reader.rb
|
@@ -765,6 +780,7 @@ files:
|
|
765
780
|
- lib/easy_ml/railtie/templates/migration/remove_evaluator_from_retraining_jobs.rb.tt
|
766
781
|
- lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt
|
767
782
|
- lib/easy_ml/railtie/templates/migration/update_preprocessing_steps_to_jsonb.rb.tt
|
783
|
+
- lib/easy_ml/reasons.rb
|
768
784
|
- lib/easy_ml/support.rb
|
769
785
|
- lib/easy_ml/support/age.rb
|
770
786
|
- lib/easy_ml/support/est.rb
|
@@ -1,31 +0,0 @@
|
|
1
|
-
module EasyML
|
2
|
-
module Data
|
3
|
-
module FilterExtensions
|
4
|
-
def is_primary_key_filter?(primary_key)
|
5
|
-
return false unless primary_key
|
6
|
-
primary_key = [primary_key] unless primary_key.is_a?(Array)
|
7
|
-
# Filter expressions in Polars are represented as strings like:
|
8
|
-
# [([(col("LOAN_APP_ID")) > (dyn int: 4)]) & ([(col("LOAN_APP_ID")) < (dyn int: 16)])]
|
9
|
-
expr_str = to_s
|
10
|
-
return false unless expr_str.include?(primary_key.first)
|
11
|
-
|
12
|
-
# Check for common primary key operations
|
13
|
-
primary_key_ops = [">", "<", ">=", "<=", "=", "eq", "gt", "lt", "ge", "le"]
|
14
|
-
primary_key_ops.any? { |op| expr_str.include?(op) }
|
15
|
-
end
|
16
|
-
|
17
|
-
def extract_primary_key_values
|
18
|
-
expr_str = to_s
|
19
|
-
# Extract numeric values from the expression
|
20
|
-
# This will match both integers and floats
|
21
|
-
values = expr_str.scan(/(?:dyn int|float): (-?\d+(?:\.\d+)?)/).flatten.map(&:to_f)
|
22
|
-
values.uniq
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|
27
|
-
|
28
|
-
# Extend Polars classes with our filter functionality
|
29
|
-
[Polars::Expr].each do |klass|
|
30
|
-
klass.include(EasyML::Data::FilterExtensions)
|
31
|
-
end
|
File without changes
|