easy_ml 0.2.0.pre.rc75 → 0.2.0.pre.rc77

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. checksums.yaml +4 -4
  2. data/app/frontend/components/dataset/PreprocessingConfig.tsx +2 -2
  3. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +1 -5
  4. data/app/models/easy_ml/dataset/refresh_reasons.rb +12 -0
  5. data/app/models/easy_ml/dataset.rb +4 -49
  6. data/app/models/easy_ml/datasource.rb +4 -5
  7. data/app/models/easy_ml/feature.rb +16 -36
  8. data/app/serializers/easy_ml/dataset_serializer.rb +1 -1
  9. data/lib/easy_ml/data/dataset_manager/normalizer.rb +0 -0
  10. data/lib/easy_ml/data/dataset_manager/reader/base.rb +80 -0
  11. data/lib/easy_ml/data/dataset_manager/reader/batch.rb +106 -0
  12. data/lib/easy_ml/data/dataset_manager/reader/data_frame.rb +23 -0
  13. data/lib/easy_ml/data/dataset_manager/reader/file.rb +75 -0
  14. data/lib/easy_ml/data/dataset_manager/reader.rb +58 -0
  15. data/lib/easy_ml/data/dataset_manager/writer/append_only.rb +67 -0
  16. data/lib/easy_ml/data/dataset_manager/writer/base.rb +122 -0
  17. data/lib/easy_ml/data/dataset_manager/writer/named.rb +14 -0
  18. data/lib/easy_ml/data/dataset_manager/writer/partitioned/partition_reasons.rb +15 -0
  19. data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb +150 -0
  20. data/lib/easy_ml/data/dataset_manager/writer.rb +76 -0
  21. data/lib/easy_ml/data/dataset_manager.rb +134 -0
  22. data/lib/easy_ml/data/partition/boundaries.rb +60 -0
  23. data/lib/easy_ml/data/partition.rb +7 -0
  24. data/lib/easy_ml/data/synced_directory.rb +1 -2
  25. data/lib/easy_ml/data.rb +2 -0
  26. data/lib/easy_ml/feature_store.rb +15 -185
  27. data/lib/easy_ml/reasons.rb +41 -0
  28. data/lib/easy_ml/version.rb +1 -1
  29. data/lib/easy_ml.rb +1 -1
  30. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  31. data/public/easy_ml/assets/assets/Application-nnn_XLuL.css +1 -0
  32. data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-Rrzo4ecT.js → Application.tsx-B1qLZuyu.js} +2 -2
  33. data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-Rrzo4ecT.js.map → Application.tsx-B1qLZuyu.js.map} +1 -1
  34. metadata +23 -7
  35. data/lib/easy_ml/data/filter_extensions.rb +0 -31
  36. data/public/easy_ml/assets/assets/Application-Q7L6ioxr.css +0 -1
  37. /data/app/models/{lineage_history.rb → easy_ml/lineage_history.rb} +0 -0

data/lib/easy_ml/data/dataset_manager/writer/base.rb ADDED
@@ -0,0 +1,122 @@
+ module EasyML
+   module Data
+     class DatasetManager
+       class Writer
+         class Base
+           attr_accessor :filenames, :root_dir, :options, :append_only, :df
+
+           def initialize(options)
+             @root_dir = options.dig(:root_dir)
+             @filenames = options.dig(:filenames)
+             @append_only = options.dig(:append_only)
+             @options = options
+             @df = options.dig(:df)
+           end
+
+           def wipe
+             clear_unique_id
+             FileUtils.rm_rf(root_dir)
+           end
+
+           def store
+             store_to_unique_file
+           end
+
+           def compact
+             files = self.files
+
+             clear_unique_id
+             File.join(root_dir, "compacted.parquet").tap do |target_file|
+               safe_write(
+                 query(lazy: true),
+                 target_file
+               )
+               FileUtils.rm(files)
+             end
+             clear_unique_id
+           end
+
+           private
+
+           def files
+             DatasetManager.new(options).files
+           end
+
+           def query(**kwargs, &block)
+             DatasetManager.new(options).query(root_dir, **kwargs, &block)
+           end
+
+           def store_to_unique_file(subdir: nil)
+             safe_write(df, unique_path(subdir: subdir))
+           end
+
+           def unique_path(subdir: nil)
+             filename = [filenames, unique_id(subdir: subdir), "parquet"].compact.join(".")
+
+             File.join(root_dir, subdir.to_s, filename)
+           end
+
+           def safe_write(df, path)
+             FileUtils.mkdir_p(File.dirname(path))
+             df.is_a?(Polars::LazyFrame) ? df.sink_parquet(path) : df.write_parquet(path)
+             path
+           end
+
+           def clear_all_keys
+             keys = list_keys
+             Support::Lockable.with_lock(keys, wait_timeout: 2) do |suo|
+               suo.client.del(keys)
+             end
+           end
+
+           def clear_unique_id(subdir: nil)
+             key = unique_id_key(subdir: subdir)
+             Support::Lockable.with_lock(key, wait_timeout: 2) do |suo|
+               suo.client.del(key)
+             end
+           end
+
+           def unique_id_key(subdir: nil)
+             File.join("dataset_managers", root_dir, subdir.to_s, "sequence")
+           end
+
+           def add_key(key)
+             keylist = unique_id_key(subdir: "keylist")
+
+             Support::Lockable.with_lock(keylist, wait_timeout: 2) do |suo|
+               suo.client.sadd(keylist, key)
+             end
+           end
+
+           def list_keys
+             keylist = unique_id_key(subdir: "keylist")
+
+             Support::Lockable.with_lock(keylist, wait_timeout: 2) do |suo|
+               suo.client.smembers(keylist)
+             end
+           end
+
+           def key_exists?(key)
+             keylist = unique_id_key(subdir: "keylist")
+             Support::Lockable.with_lock(keylist, wait_timeout: 2) do |suo|
+               suo.client.sismember(keylist, key)
+             end
+           end
+
+           def unique_id(subdir: nil)
+             key = unique_id_key(subdir: subdir)
+             add_key(key)
+
+             Support::Lockable.with_lock(key, wait_timeout: 2) do |suo|
+               redis = suo.client
+
+               seq = (redis.get(key) || "0").to_i
+               redis.set(key, (seq + 1).to_s)
+               seq + 1
+             end
+           end
+         end
+       end
+     end
+   end
+ end
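
Writer::Base is the unpartitioned write path: each store call writes the frame to a fresh root_dir/<filenames>.<n>.parquet, where n is a Redis-backed sequence guarded by Support::Lockable, and compact later folds every file into a single compacted.parquet. A minimal usage sketch, assuming the gem is installed, a Redis server is reachable, and an illustrative /tmp path:

    require "easy_ml"

    df = Polars::DataFrame.new({ "id" => [1, 2, 3] })
    writer = EasyML::Data::DatasetManager::Writer::Base.new(
      root_dir: "/tmp/easy_ml_demo",  # illustrative path
      filenames: "file",              # prefix for the unique files
      df: df,
    )
    writer.store    # => "/tmp/easy_ml_demo/file.1.parquet"
    writer.store    # => "/tmp/easy_ml_demo/file.2.parquet" (sequence comes from Redis)
    writer.compact  # rewrites everything as "/tmp/easy_ml_demo/compacted.parquet"
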
data/lib/easy_ml/data/dataset_manager/writer/named.rb ADDED
@@ -0,0 +1,14 @@
+ module EasyML
+   module Data
+     class DatasetManager
+       class Writer
+         class Named < Base
+           def store(name)
+             clear_unique_id(subdir: name)
+             store_to_unique_file(subdir: name)
+           end
+         end
+       end
+     end
+   end
+ end
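
Named differs from Base only in that store takes a name and clears that subdirectory's sequence first, so repeated stores under the same name replace the earlier file instead of accumulating. A hedged sketch continuing the setup above (the "predictions" name is illustrative):

    writer = EasyML::Data::DatasetManager::Writer::Named.new(
      root_dir: "/tmp/easy_ml_demo", filenames: "file", df: df,
    )
    writer.store("predictions")  # writes predictions/file.1.parquet
    writer.store("predictions")  # sequence was cleared, so file.1.parquet is overwritten
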
data/lib/easy_ml/data/dataset_manager/writer/partitioned/partition_reasons.rb ADDED
@@ -0,0 +1,15 @@
+ module EasyML
+   module Data
+     class DatasetManager
+       class Writer
+         class Partitioned < Base
+           class PartitionReasons < EasyML::Reasons
+             add_reason "Missing primary key", -> { primary_key.nil? }
+             add_reason "Df does not contain primary key", -> { df.columns.exclude?(primary_key) }
+             add_reason "Primary key is not numeric", -> { !numeric_primary_key? }
+           end
+         end
+       end
+     end
+   end
+ end
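
PartitionReasons uses the new EasyML::Reasons DSL (data/lib/easy_ml/reasons.rb in this release) to pair each partitioning precondition with the lambda that checks it. Partitioned#store consults these reasons and falls back to a plain Base write when any of them applies, as in this hedged sketch of a frame missing the configured primary key:

    writer = EasyML::Data::DatasetManager::Writer::Partitioned.new(
      root_dir: "/tmp/easy_ml_demo",
      filenames: "file",
      primary_key: "id",
      partition_size: 100,
      df: Polars::DataFrame.new({ "name" => ["a", "b"] }),  # no "id" column
    )
    writer.store  # prints the failing reason ("Df does not contain primary key")
                  # via explain, then delegates to Base#store unpartitioned
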
data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb ADDED
@@ -0,0 +1,150 @@
+ module EasyML
+   module Data
+     class DatasetManager
+       class Writer
+         class Partitioned < Base
+           require_relative "partitioned/partition_reasons"
+
+           attr_accessor :partition_size, :partition, :primary_key, :df
+
+           def initialize(options)
+             super
+             @partition_size = options.dig(:partition_size)
+             @partition = options.dig(:partition)
+             @primary_key = options.dig(:primary_key)
+
+             raise "filenames required: specify the prefix to use for unique new files" unless filenames.present?
+           end
+
+           def wipe
+             partitions.each do |partition|
+               FileUtils.rm_rf(File.join(root_dir, partition))
+             end
+             clear_all_keys
+           end
+
+           def store
+             unless can_partition?
+               puts cannot_partition_reasons.explain
+               return Base.new(options).store
+             end
+
+             store_each_partition
+           end
+
+           def compact
+             files = self.files
+             @df = query(lazy: true)
+
+             clear_unique_id(subdir: "compacted")
+             compact_each_partition.tap do
+               FileUtils.rm(files)
+               clear_unique_id
+             end
+           end
+
+           private
+
+           def partitions
+             Dir.glob(File.join(root_dir, "**/*")).map { |f| f.split("/").last }
+           end
+
+           def compact_each_partition
+             with_each_partition do |partition_df, _|
+               safe_write(
+                 partition_df.sort(Polars.col(primary_key)),
+                 unique_path(subdir: "compacted")
+               )
+             end
+           end
+
+           def with_each_partition(&block)
+             partition_boundaries.map do |partition|
+               partition_start = partition[:partition_start]
+               partition_end = partition[:partition_end]
+               partition_df = df.filter(Polars.col(primary_key).is_between(partition_start, partition_end))
+               num_rows = lazy? ? partition_df.select(Polars.length).collect[0, 0] : partition_df.shape[0]
+
+               binding.pry if num_rows == 0
+               next if num_rows == 0
+               yield partition_df, partition
+             end
+           end
+
+           def store_each_partition
+             with_each_partition do |partition_df, partition|
+               safe_write(
+                 partition_df,
+                 unique_path(subdir: partition[:partition])
+               )
+             end
+           end
+
+           def partition_boundaries
+             EasyML::Data::Partition::Boundaries.new(df, primary_key, partition_size).to_a
+           end
+
+           def cannot_partition_reasons
+             @cannot_partition_reasons ||= PartitionReasons.new(self)
+           end
+
+           def can_partition?
+             @partitioned ||= cannot_partition_reasons.none?
+           end
+
+           def lazy?
+             df.is_a?(Polars::LazyFrame)
+           end
+
+           def cast_primary_key
+             case dtype_primary_key
+             when Polars::Categorical
+               Polars.col(primary_key).cast(Polars::String)
+             else
+               Polars.col(primary_key)
+             end
+           end
+
+           def dtype_primary_key
+             @dtype_primary_key ||= schema[primary_key]
+           end
+
+           def schema
+             @schema ||= df.schema
+           end
+
+           def min_key
+             return @min_key if @min_key
+
+             if lazy?
+               @min_key = df.select(cast_primary_key).min.collect.to_a[0].dig(primary_key)
+             else
+               @min_key = df[primary_key].min
+             end
+           end
+
+           def max_key
+             return @max_key if @max_key
+
+             if lazy?
+               @max_key = df.select(cast_primary_key).max.collect.to_a[0].dig(primary_key)
+             else
+               @max_key = df[primary_key].max
+             end
+           end
+
+           def numeric_primary_key?
+             begin
+               # We are intentionally not using to_i, so it will raise an error for keys like "A1"
+               min = min_key.is_a?(String) ? Integer(min_key) : min_key
+               max = max_key.is_a?(String) ? Integer(max_key) : max_key
+               min.is_a?(Integer) && max.is_a?(Integer)
+             rescue ArgumentError
+               false
+             end
+           end
+         end
+       end
+     end
+   end
+ end
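
When the preconditions hold, store filters the frame once per Boundaries range with is_between on the primary key and writes each non-empty slice under a subdirectory named for its partition index. A hedged end-to-end sketch with an integer key (values illustrative):

    df = Polars::DataFrame.new({ "id" => (1..250).to_a })
    writer = EasyML::Data::DatasetManager::Writer::Partitioned.new(
      root_dir: "/tmp/easy_ml_demo", filenames: "file",
      primary_key: "id", partition_size: 100, df: df,
    )
    writer.store
    # Expected layout (each subdirectory keeps its own Redis sequence):
    #   /tmp/easy_ml_demo/1/file.1.parquet   ids 1..99
    #   /tmp/easy_ml_demo/2/file.1.parquet   ids 100..199
    #   /tmp/easy_ml_demo/3/file.1.parquet   ids 200..250
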
data/lib/easy_ml/data/dataset_manager/writer.rb ADDED
@@ -0,0 +1,76 @@
+ module EasyML
+   module Data
+     class DatasetManager
+       class Writer
+         require_relative "writer/base"
+         require_relative "writer/partitioned"
+         require_relative "writer/append_only"
+         require_relative "writer/named"
+
+         ADAPTERS = [
+           Base,
+           Partitioned,
+           AppendOnly,
+           Named,
+         ]
+
+         attr_accessor :filenames, :root_dir, :partition,
+                       :append_only, :primary_key, :options
+
+         def initialize(options)
+           @root_dir = options.dig(:root_dir)
+           @filenames = options.dig(:filenames)
+           @partition = options.dig(:partition) || (options.dig(:partition_size).present? && options.dig(:primary_key).present?)
+           @append_only = options.dig(:append_only)
+           @primary_key = options.dig(:primary_key)
+           @named = options.dig(:named) || false
+           @options = options
+         end
+
+         def store(df, *args)
+           adapter_class.new(options.merge!(df: df)).store(*args)
+         end
+
+         def wipe
+           adapter_class.new(options).wipe
+         end
+
+         def compact
+           adapter_class.new(options).compact
+         end
+
+         def inspect
+           keys = %w(root_dir append_only partition primary_key)
+           attrs = keys.map { |k| "#{k}=#{send(k)}" unless send(k).nil? }.compact
+           "#<#{self.class.name} #{attrs.join(" ")}>"
+         end
+
+         private
+
+         def adapter_class
+           if partition?
+             Partitioned
+           elsif append_only?
+             AppendOnly
+           elsif named?
+             Named
+           else
+             Base
+           end
+         end
+
+         def named?
+           @named
+         end
+
+         def partition?
+           @partition
+         end
+
+         def append_only?
+           @append_only
+         end
+       end
+     end
+   end
+ end
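
Writer itself is a router: it reads the options once and hands the actual work to one of the ADAPTERS, with partitioning taking precedence over append_only, which takes precedence over named. Note that partition is inferred whenever both partition_size and primary_key are present, no explicit flag needed. A hedged sketch of the routing:

    w = EasyML::Data::DatasetManager::Writer.new(
      root_dir: "/tmp/easy_ml_demo", filenames: "file",
      primary_key: "id", partition_size: 100,
    )
    w.store(df)  # routes to Writer::Partitioned (inferred from the two options)

    w = EasyML::Data::DatasetManager::Writer.new(
      root_dir: "/tmp/easy_ml_demo", filenames: "file", named: true,
    )
    w.store(df, "predictions")  # extra args pass through, so this calls
                                # Writer::Named#store("predictions")
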
data/lib/easy_ml/data/dataset_manager.rb ADDED
@@ -0,0 +1,134 @@
+ module EasyML
+   module Data
+     class DatasetManager
+       require_relative "dataset_manager/writer"
+       require_relative "dataset_manager/reader"
+
+       attr_accessor :root_dir, :partition, :append_only, :filenames, :primary_key,
+                     :partition_size, :s3_bucket, :s3_prefix, :s3_access_key_id,
+                     :s3_secret_access_key, :polars_args, :source_of_truth,
+                     :options
+
+       def initialize(options = {})
+         @root_dir = options.dig(:root_dir)
+         @partition = options.dig(:partition) || (options.dig(:partition_size).present? && options.dig(:primary_key).present?)
+         @append_only = options.dig(:append_only) || false
+         @filenames = options.dig(:filenames) || "file"
+         @primary_key = options.dig(:primary_key)
+         @partition_size = options.dig(:partition_size) || nil
+         @s3_bucket = options.dig(:s3_bucket) || EasyML::Configuration.s3_bucket
+         @s3_prefix = options.dig(:s3_prefix) || nil
+         @s3_access_key_id = options.dig(:s3_access_key_id) || EasyML::Configuration.s3_access_key_id
+         @s3_secret_access_key = options.dig(:s3_secret_access_key) || EasyML::Configuration.s3_secret_access_key
+         @polars_args = options.dig(:polars_args) || {}
+         @source_of_truth = options.dig(:source_of_truth) || :local
+         @options = options
+
+         raise "primary_key required: how should we divide partitions?" if partition && primary_key.nil?
+         raise "partition_size required: specify number of rows in each partition" if partition && partition_size.nil?
+         raise "root_dir required: specify the root_dir of the dataset" unless root_dir.present?
+       end
+
+       def inspect
+         keys = %w(root append_only partition primary_key)
+         attrs = keys.map { |k| "#{k}=#{send(k)}" unless send(k).nil? }.compact
+         "#<#{self.class.name} #{attrs.join("\n\t")}>"
+       end
+
+       class << self
+         def query(input = nil, **kwargs, &block)
+           Reader.query(input, **kwargs, &block)
+         end
+
+         def schema(input = nil, **kwargs, &block)
+           Reader.schema(input, **kwargs, &block)
+         end
+
+         def num_rows
+           Reader.num_rows
+         end
+       end
+
+       def num_rows
+         Reader.num_rows(root_dir)
+       end
+
+       def query(input = nil, **kwargs, &block)
+         input = root_dir if input.nil?
+         DatasetManager.query(input, **kwargs, &block)
+       end
+
+       def schema(input = nil, **kwargs, &block)
+         input = root_dir if input.nil?
+         DatasetManager.schema(input, **kwargs, &block)
+       end
+
+       def sha
+         Reader.sha(root_dir)
+       end
+
+       def normalize
+         Normalizer.normalize(root_dir)
+       end
+
+       def data
+         query
+       end
+
+       def store(df, *args)
+         writer.store(df, *args)
+       end
+
+       def compact
+         writer.compact
+       end
+
+       def cp(from, to)
+         writer.cp(from, to)
+       end
+
+       def empty?
+         files.empty? || query(limit: 1).empty?
+       end
+
+       def files
+         Reader.files(root_dir)
+       end
+
+       def wipe
+         writer.wipe
+       end
+
+       def upload
+         synced_directory.upload
+       end
+
+       def download
+         synced_directory.download
+       end
+
+       private
+
+       def root
+         root_dir.gsub(/^#{Rails.root.to_s}/, "")
+       end
+
+       def writer
+         Writer.new(options)
+       end
+
+       def synced_directory
+         @synced_dir ||= EasyML::Data::SyncedDirectory.new(
+           root_dir: root_dir,
+           source_of_truth: source_of_truth,
+           s3_bucket: s3_bucket,
+           s3_prefix: s3_prefix,
+           s3_access_key_id: s3_access_key_id,
+           s3_secret_access_key: s3_secret_access_key,
+           polars_args: polars_args,
+           cache_for: 0,
+         )
+       end
+     end
+   end
+ end
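
DatasetManager is the facade that the rest of the gem talks to: reads go through Reader (query, schema, num_rows, sha, files), writes through Writer, and S3 sync through SyncedDirectory, with S3 credentials defaulting from EasyML::Configuration. A hedged round-trip sketch (paths and options illustrative):

    manager = EasyML::Data::DatasetManager.new(
      root_dir: "/tmp/easy_ml_demo",  # required
      primary_key: "id",
      partition_size: 100,            # together these switch on partitioned writes
    )
    manager.store(df)        # delegates to Writer, here Writer::Partitioned
    manager.query(limit: 5)  # reads the parquet files back through Reader
    manager.compact          # folds the per-partition files
    manager.upload           # pushes root_dir to S3 via SyncedDirectory
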
data/lib/easy_ml/data/partition/boundaries.rb ADDED
@@ -0,0 +1,60 @@
+ module EasyML
+   module Data
+     module Partition
+       class Boundaries
+         attr_reader :df, :primary_key, :partition_size
+
+         def initialize(df, primary_key, partition_size)
+           @df = df
+           @primary_key = primary_key.is_a?(Array) ? primary_key.first : primary_key
+           @partition_size = partition_size
+         end
+
+         def inspect
+           "#<#{self.class.name.split("::").last} partition_size=#{partition_size} primary_key=#{primary_key}>"
+         end
+
+         def boundaries
+           return @boundaries if @boundaries
+
+           @boundaries = df.with_columns(
+             Polars.col(primary_key)
+               .truediv(partition_size)
+               .floor
+               .add(1)
+               .cast(Polars::Int64)
+               .alias("partition")
+           )
+           @boundaries = @boundaries.with_columns(
+             Polars.col("partition")
+               .sub(1)
+               .mul(partition_size)
+               .cast(Polars::Int64)
+               .alias("partition_start"),
+             Polars.col("partition")
+               .mul(partition_size)
+               .sub(1)
+               .cast(Polars::Int64)
+               .alias("partition_end")
+           )
+           # @boundaries = @boundaries.with_columns(
+           #   Polars.col(primary_key).is_between(Polars.col("partition_start"), Polars.col("partition_end")).select("partition")
+           # )
+         end
+
+         def to_a
+           is_lazy = df.is_a?(Polars::LazyFrame)
+           empty = is_lazy ? df.limit(1).collect.empty? : df.shape[0] == 0
+           return [] if empty
+
+           sorted = boundaries.select(["partition", "partition_start", "partition_end"]).unique.sort("partition")
+           array = (is_lazy ? sorted.collect.to_a : sorted.to_a).map(&:with_indifferent_access)
+           # For the last partition, set the end to the total number of rows (so we read the last row with is_between queries)
+           last_idx = array.size - 1
+           array[last_idx]["partition_end"] = is_lazy ? df.select(Polars.col(primary_key)).max.collect.to_a.first.dig(primary_key) : df[primary_key].max
+           array
+         end
+       end
+     end
+   end
+ end
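
The arithmetic assigns each key the 1-based partition floor(key / partition_size) + 1, with partition_start = (partition - 1) * partition_size and partition_end = partition * partition_size - 1; to_a then overwrites the last partition_end with the actual max key so is_between filters still cover the final row. A worked sketch (values assumed, output shown roughly):

    df = Polars::DataFrame.new({ "id" => (1..250).to_a })
    EasyML::Data::Partition::Boundaries.new(df, "id", 100).to_a
    # => [{ "partition" => 1, "partition_start" => 0,   "partition_end" => 99  },
    #     { "partition" => 2, "partition_start" => 100, "partition_end" => 199 },
    #     { "partition" => 3, "partition_start" => 200, "partition_end" => 250 }]
    #    (the last end is replaced: 299 becomes the max id, 250)
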
data/lib/easy_ml/data/partition.rb ADDED
@@ -0,0 +1,7 @@
+ module EasyML
+   module Data
+     module Partition
+       require_relative "partition/boundaries"
+     end
+   end
+ end

data/lib/easy_ml/data/synced_directory.rb CHANGED
@@ -1,5 +1,3 @@
- require_relative "polars_reader"
-
  module EasyML
    module Data
      class SyncedDirectory
@@ -16,6 +14,7 @@ module EasyML
          @s3_region = options.dig(:s3_region) || EasyML::Configuration.s3_region
          @cache_for = options.dig(:cache_for)
          @polars_args = options.dig(:polars_args)
+         @source_of_truth = options.dig(:source_of_truth) || :remote
        end

        delegate :query, :data, :all_files, :files, :sha, to: :reader

data/lib/easy_ml/data.rb CHANGED
@@ -8,5 +8,7 @@ module EasyML
      require_relative "data/polars_column"
      require_relative "data/polars_schema"
      require_relative "data/date_converter"
+     require_relative "data/dataset_manager"
+     require_relative "data/partition"
    end
  end