easy_ml 0.2.0.pre.rc58 → 0.2.0.pre.rc60
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/application_controller.rb +4 -0
- data/app/controllers/easy_ml/datasets_controller.rb +32 -1
- data/app/frontend/components/DatasetPreview.tsx +50 -19
- data/app/frontend/components/dataset/ColumnConfigModal.tsx +7 -1
- data/app/frontend/components/dataset/ColumnFilters.tsx +37 -3
- data/app/frontend/components/dataset/ColumnList.tsx +14 -2
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +81 -20
- data/app/frontend/types/dataset.ts +3 -0
- data/app/jobs/easy_ml/compute_feature_job.rb +0 -3
- data/app/jobs/easy_ml/refresh_dataset_job.rb +0 -6
- data/app/models/easy_ml/column/imputers/base.rb +89 -0
- data/app/models/easy_ml/column/imputers/categorical.rb +35 -0
- data/app/models/easy_ml/column/imputers/clip.rb +30 -0
- data/app/models/easy_ml/column/imputers/constant.rb +27 -0
- data/app/models/easy_ml/column/imputers/ffill.rb +29 -0
- data/app/models/easy_ml/column/imputers/imputer.rb +103 -0
- data/app/models/easy_ml/column/imputers/mean.rb +27 -0
- data/app/models/easy_ml/column/imputers/median.rb +27 -0
- data/app/models/easy_ml/column/imputers/most_frequent.rb +27 -0
- data/app/models/easy_ml/column/imputers/null_imputer.rb +15 -0
- data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +30 -0
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +78 -0
- data/app/models/easy_ml/column/imputers/today.rb +20 -0
- data/app/models/easy_ml/column/imputers.rb +126 -0
- data/app/models/easy_ml/column/learner.rb +18 -0
- data/app/models/easy_ml/column/learners/base.rb +103 -0
- data/app/models/easy_ml/column/learners/boolean.rb +11 -0
- data/app/models/easy_ml/column/learners/categorical.rb +51 -0
- data/app/models/easy_ml/column/learners/datetime.rb +19 -0
- data/app/models/easy_ml/column/learners/null.rb +22 -0
- data/app/models/easy_ml/column/learners/numeric.rb +33 -0
- data/app/models/easy_ml/column/learners/string.rb +15 -0
- data/app/models/easy_ml/column/lineage/base.rb +22 -0
- data/app/models/easy_ml/column/lineage/computed_by_feature.rb +23 -0
- data/app/models/easy_ml/column/lineage/preprocessed.rb +23 -0
- data/app/models/easy_ml/column/lineage/raw_dataset.rb +23 -0
- data/app/models/easy_ml/column/lineage.rb +28 -0
- data/app/models/easy_ml/column/selector.rb +96 -0
- data/app/models/easy_ml/column.rb +319 -52
- data/app/models/easy_ml/column_history.rb +29 -22
- data/app/models/easy_ml/column_list.rb +63 -78
- data/app/models/easy_ml/dataset.rb +128 -96
- data/app/models/easy_ml/dataset_history.rb +23 -23
- data/app/models/easy_ml/datasource.rb +3 -0
- data/app/models/easy_ml/datasource_history.rb +1 -0
- data/app/models/easy_ml/datasources/file_datasource.rb +1 -1
- data/app/models/easy_ml/datasources/polars_datasource.rb +6 -12
- data/app/models/easy_ml/datasources/s3_datasource.rb +1 -1
- data/app/models/easy_ml/feature.rb +19 -7
- data/app/models/easy_ml/feature_history.rb +12 -0
- data/app/models/easy_ml/feature_list.rb +15 -0
- data/app/serializers/easy_ml/column_serializer.rb +11 -1
- data/app/serializers/easy_ml/dataset_serializer.rb +23 -2
- data/config/initializers/enumerable.rb +17 -0
- data/lib/easy_ml/data/date_converter.rb +137 -30
- data/lib/easy_ml/data/polars_column.rb +17 -0
- data/lib/easy_ml/data/polars_in_memory.rb +30 -0
- data/lib/easy_ml/data/polars_reader.rb +20 -1
- data/lib/easy_ml/data/splits/in_memory_split.rb +3 -5
- data/lib/easy_ml/data/splits/split.rb +2 -1
- data/lib/easy_ml/data/synced_directory.rb +1 -1
- data/lib/easy_ml/data.rb +1 -2
- data/lib/easy_ml/feature_store.rb +33 -22
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +4 -0
- data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +4 -0
- data/lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt +13 -0
- data/lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt +21 -0
- data/lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt +11 -0
- data/lib/easy_ml/version.rb +1 -1
- data/lib/tasks/profile.rake +40 -0
- data/public/easy_ml/assets/.vite/manifest.json +2 -2
- data/public/easy_ml/assets/assets/Application-BbFobaXt.css +1 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js +489 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map +1 -0
- metadata +41 -10
- data/app/models/easy_ml/adapters/base_adapter.rb +0 -45
- data/app/models/easy_ml/adapters/polars_adapter.rb +0 -77
- data/lib/easy_ml/data/preprocessor.rb +0 -340
- data/lib/easy_ml/data/simple_imputer.rb +0 -255
- data/lib/easy_ml/data/statistics_learner.rb +0 -193
- data/public/easy_ml/assets/assets/Application-BUsRR6b6.css +0 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js +0 -474
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js.map +0 -1
@@ -19,9 +19,19 @@
|
|
19
19
|
#
|
20
20
|
module EasyML
|
21
21
|
class ColumnSerializer
|
22
|
+
# Reduced column payload: serializes only the id and the name.
class SmallSerializer
  include JSONAPI::Serializer

  attributes(*%i[id name])
end
|
26
|
+
|
22
27
|
include JSONAPI::Serializer
|
23
28
|
|
24
29
|
attributes :id, :name, :description, :dataset_id, :datatype, :polars_datatype, :preprocessing_steps,
|
25
|
-
:hidden, :drop_if_null, :sample_values, :statistics, :is_target
|
30
|
+
:hidden, :drop_if_null, :sample_values, :statistics, :is_target,
|
31
|
+
:is_computed, :computed_by, :lineage
|
32
|
+
|
33
|
+
attribute :required do |object|
|
34
|
+
object.required?
|
35
|
+
end
|
26
36
|
end
|
27
37
|
end
|
@@ -24,6 +24,27 @@ require_relative "./column_serializer"
|
|
24
24
|
#
|
25
25
|
module EasyML
|
26
26
|
class DatasetSerializer
|
27
|
+
# Reduced dataset payload: scalar attributes, a slim per-column list and the
# stacktrace of the most recent failed event.
class SmallSerializer
  include JSONAPI::Serializer

  attributes(
    *%i[id name description target num_rows status],
    *%i[datasource_id preprocessing_steps workflow_status statistics]
  )

  # Columns ordered by id, each reduced to the attributes hash produced by
  # ColumnSerializer::SmallSerializer.
  attribute :columns do |dataset|
    dataset.columns.order(:id).map do |column|
      serialized = ColumnSerializer::SmallSerializer.new(column).serializable_hash
      serialized.dig(:data, :attributes)
    end
  end

  # nil unless the dataset reports failed? and has at least one event.
  attribute :stacktrace do |object|
    if object.failed? && !object.events.empty?
      object.events.where(status: :failed).order(id: :desc).limit(1).last&.stacktrace
    end
  end
end
|
47
|
+
|
27
48
|
include JSONAPI::Serializer
|
28
49
|
|
29
50
|
attributes :id, :name, :description, :target, :num_rows, :status,
|
@@ -47,7 +68,7 @@ module EasyML
|
|
47
68
|
if dataset.workflow_status.to_sym == :analyzing
|
48
69
|
nil
|
49
70
|
else
|
50
|
-
dataset.data(limit: 10, all_columns: true)&.to_hashes
|
71
|
+
dataset.data(limit: 10, all_columns: true, refresh: false)&.to_hashes || dataset.raw.data(limit: 10, all_columns: true).to_hashes
|
51
72
|
end
|
52
73
|
end
|
53
74
|
|
@@ -62,7 +83,7 @@ module EasyML
|
|
62
83
|
end
|
63
84
|
|
64
85
|
attribute :needs_refresh do |dataset|
|
65
|
-
dataset.needs_refresh?
|
86
|
+
dataset.needs_refresh?(exclude: [:datasource_needs_refresh])
|
66
87
|
end
|
67
88
|
|
68
89
|
attribute :stacktrace do |object|
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# Core extension: adds two grouping helpers to every Enumerable.
module Enumerable
  # Counts how many elements map to each block result.
  #
  # @yield [element] computes the grouping key for an element
  # @return [Hash{Object => Integer}] key => number of elements that produced it,
  #   keys in order of first appearance; an Enumerator when no block is given
  def count_by(&block)
    return to_enum(:count_by) unless block

    group_by(&block).transform_values(&:size)
  end

  # Indexes elements by block result. When several elements share a key the
  # first one encountered is kept.
  #
  # @yield [element] computes the key for an element
  # @return [Hash{Object => Object}] key => first element that produced it;
  #   an Enumerator when no block is given
  def key_by(&block)
    return to_enum(:key_by) unless block

    each_with_object({}) do |element, index|
      key = block.call(element)
      # First writer wins, so later duplicates never replace the stored element.
      index[key] = element unless index.key?(key)
    end
  end
end
|
@@ -3,35 +3,104 @@ module EasyML
|
|
3
3
|
module DateConverter
|
4
4
|
COMMON_DATE_FORMATS = [
|
5
5
|
"%Y-%m-%dT%H:%M:%S.%6N", # e.g., "2021-01-01T00:00:00.000000"
|
6
|
-
"%Y-%m-%d %H:%M:%S.%L Z",
|
7
|
-
"%Y-%m-%d %H:%M:%S.%L",
|
8
|
-
"%Y-%m-%d %H:%M:%S.%L",
|
9
|
-
"%Y-%m-%d %H:%M:%S",
|
10
|
-
"%Y-%m-%d %H:%M",
|
11
|
-
"%Y-%m-%d",
|
12
|
-
"%m/%d/%Y %H:%M:%S",
|
13
|
-
"%m/%d/%Y",
|
14
|
-
"%d-%m-%Y",
|
15
|
-
"%d-%b-%Y %H:%M:%S",
|
16
|
-
"%d-%b-%Y",
|
17
|
-
"%b %d, %Y",
|
18
|
-
"%Y/%m/%d %H:%M:%S",
|
6
|
+
"%Y-%m-%d %H:%M:%S.%L Z", # e.g., "2025-01-03 23:04:49.492 Z"
|
7
|
+
"%Y-%m-%d %H:%M:%S.%L", # e.g., "2021-01-01 00:01:36.000"
|
8
|
+
"%Y-%m-%d %H:%M:%S.%L", # duplicate format intentionally
|
9
|
+
"%Y-%m-%d %H:%M:%S", # e.g., "2021-01-01 00:01:36"
|
10
|
+
"%Y-%m-%d %H:%M", # e.g., "2021-01-01 00:01"
|
11
|
+
"%Y-%m-%d", # e.g., "2021-01-01"
|
12
|
+
"%m/%d/%Y %H:%M:%S", # e.g., "01/01/2021 00:01:36"
|
13
|
+
"%m/%d/%Y", # e.g., "01/01/2021"
|
14
|
+
"%d-%m-%Y", # e.g., "01-01-2021"
|
15
|
+
"%d-%b-%Y %H:%M:%S", # e.g., "01-Jan-2021 00:01:36"
|
16
|
+
"%d-%b-%Y", # e.g., "01-Jan-2021"
|
17
|
+
"%b %d, %Y", # e.g., "Jan 01, 2021"
|
18
|
+
"%Y/%m/%d %H:%M:%S", # e.g., "2021/01/01 00:01:36"
|
19
19
|
"%Y/%m/%d", # e.g., "2021/01/01"
|
20
20
|
].freeze
|
21
21
|
|
22
22
|
FORMAT_MAPPINGS = {
|
23
23
|
ruby_to_polars: {
|
24
24
|
"%L" => "%3f", # milliseconds
|
25
|
-
"%6N" => "%6f",
|
26
|
-
"%N" => "%9f",
|
25
|
+
"%6N" => "%6f", # microseconds
|
26
|
+
"%N" => "%9f", # nanoseconds
|
27
27
|
},
|
28
28
|
}.freeze
|
29
29
|
|
30
30
|
class << self
|
31
|
-
#
|
32
|
-
#
|
33
|
-
# @param
|
34
|
-
# @return [
|
31
|
+
# Infers a strftime format string from the shape of a single date string.
#
# Only the layout (digit counts and separators) is inspected; the value is not
# checked for being a real calendar date. Patterns are anchored with \A and \z
# so the whole string must match — embedded newlines never match.
#
# @param date_str [String, nil] the date string to analyze
# @return [String, nil] the matching strftime format, or nil when the input is
#   not a String, is blank, or matches no known layout
def infer_strftime_format(date_str)
  return nil unless date_str.is_a?(String)
  return nil if date_str.match?(/\A[[:space:]]*\z/)

  # YYYY-MM-DD (e.g., "2021-01-01")
  return "%Y-%m-%d" if date_str.match?(/\A\d{4}-\d{2}-\d{2}\z/)

  # YYYY/MM/DD (e.g., "2021/01/01")
  return "%Y/%m/%d" if date_str.match?(%r{\A\d{4}/\d{2}/\d{2}\z})

  # Date and time joined by "T" or a space, with optional fractional seconds,
  # e.g. "2021-01-01T12:34:56" or "2021-01-01 12:34:56.789".
  stamp = date_str.match(/\A\d{4}-\d{2}-\d{2}([T ])\d{2}:\d{2}:\d{2}(?:\.(\d+))?\z/)
  if stamp
    separator = stamp[1]
    fraction = stamp[2]
    base = "%Y-%m-%d#{separator}%H:%M:%S"
    return base if fraction.nil?

    directive = case fraction.length
                when 3 then "%L"  # milliseconds
                when 6 then "%6N" # microseconds
                else "%N"         # nanoseconds; also the fallback for any other length
                end
    return "#{base}.#{directive}"
  end

  # MM/DD/YYYY (e.g., "01/31/2021")
  return "%m/%d/%Y" if date_str.match?(%r{\A\d{2}/\d{2}/\d{4}\z})

  # DD-MM-YYYY (e.g., "31-01-2021")
  return "%d-%m-%Y" if date_str.match?(/\A\d{2}-\d{2}-\d{4}\z/)

  # DD-Mon-YYYY (e.g., "31-Jan-2021")
  return "%d-%b-%Y" if date_str.match?(/\A\d{2}-[A-Za-z]{3}-\d{4}\z/)

  # Mon DD, YYYY (e.g., "Jan 31, 2021")
  return "%b %d, %Y" if date_str.match?(/\A[A-Za-z]{3} \d{2}, \d{4}\z/)

  nil
end
|
99
|
+
|
100
|
+
# Attempts to convert a string column to datetime if it appears to be a date.
|
101
|
+
# @param df [Polars::DataFrame] The dataframe containing the series.
|
102
|
+
# @param column [String] The name of the column to convert.
|
103
|
+
# @return [Polars::DataFrame] The dataframe with the converted column (if successful).
|
35
104
|
def maybe_convert_date(df, column = nil)
|
36
105
|
if column.nil?
|
37
106
|
series = df
|
@@ -40,19 +109,42 @@ module EasyML
|
|
40
109
|
else
|
41
110
|
series = df[column]
|
42
111
|
end
|
112
|
+
|
43
113
|
return df if series.dtype.is_a?(Polars::Datetime)
|
44
114
|
return df unless series.dtype == Polars::Utf8
|
45
115
|
|
46
|
-
|
47
|
-
|
116
|
+
sample = series.filter(series.is_not_null).head(100).to_a
|
117
|
+
ruby_format = detect_date_format(sample)
|
48
118
|
|
49
|
-
|
50
|
-
|
51
|
-
|
119
|
+
if ruby_format
|
120
|
+
format = convert_format(:ruby_to_polars, ruby_format)
|
121
|
+
df = try_format(df, column, format)
|
122
|
+
|
123
|
+
if df.filter(Polars.col("TRY").is_null).count > df.filter(Polars.col(column.to_s).is_null).count
|
124
|
+
df = df.drop("TRY")
|
125
|
+
best_format = df[column.to_s][0..100].to_a.count_by do |date_str|
|
126
|
+
infer_strftime_format(date_str)
|
127
|
+
end.max_by { |_format, count| count }[0]
|
128
|
+
df = try_format(df, column, best_format)
|
129
|
+
end
|
130
|
+
|
131
|
+
df = df.with_column(df["TRY"].alias(column.to_s)).drop("TRY")
|
132
|
+
end
|
133
|
+
|
134
|
+
df
|
52
135
|
end
|
53
136
|
|
54
137
|
private
|
55
138
|
|
139
|
+
# Adds a scratch column named "TRY" holding `column` parsed with `format`.
#
# @param df [Polars::DataFrame] frame that contains `column`
# @param column [String, Symbol] column to parse (the name is stringified)
# @param format [String] format string handed to Polars' strptime
# @return [Polars::DataFrame] the result of with_column, i.e. `df` plus "TRY"
def try_format(df, column, format)
  # NOTE(review): strict: false presumably turns unparseable values into nulls
  # instead of raising — confirm against the Polars strptime documentation.
  parsed = Polars.col(column.to_s).str.strptime(Polars::Datetime, format, strict: false)
  df.with_column(parsed.alias("TRY"))
end
|
147
|
+
|
56
148
|
def detect_polars_format(series)
|
57
149
|
return nil unless series.is_a?(Polars::Series)
|
58
150
|
|
@@ -66,14 +158,29 @@ module EasyML
|
|
66
158
|
|
67
159
|
sample = date_strings.compact.sample([100, date_strings.length].min)
|
68
160
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
161
|
+
best_format = nil
|
162
|
+
best_success_rate = 0.0
|
163
|
+
sample_count = sample.length
|
164
|
+
|
165
|
+
COMMON_DATE_FORMATS.each do |fmt|
|
166
|
+
success_count = sample.count do |date_str|
|
167
|
+
begin
|
168
|
+
DateTime.strptime(date_str, fmt)
|
169
|
+
true
|
170
|
+
rescue StandardError
|
171
|
+
false
|
172
|
+
end
|
75
173
|
end
|
174
|
+
success_rate = success_count.to_f / sample_count
|
175
|
+
if success_rate > best_success_rate
|
176
|
+
best_success_rate = success_rate
|
177
|
+
best_format = fmt
|
178
|
+
end
|
179
|
+
# If every sample string matches this format, return it immediately.
|
180
|
+
return fmt if success_rate == 1.0
|
76
181
|
end
|
182
|
+
|
183
|
+
best_success_rate >= 0.8 ? best_format : nil
|
77
184
|
end
|
78
185
|
|
79
186
|
def convert_format(conversion, format)
|
@@ -12,6 +12,7 @@ module EasyML
|
|
12
12
|
string: Polars::String,
|
13
13
|
text: Polars::String,
|
14
14
|
categorical: Polars::Categorical,
|
15
|
+
null: Polars::Null,
|
15
16
|
}
|
16
17
|
POLARS_MAP = TYPE_MAP.invert.stringify_keys
|
17
18
|
class << self
|
@@ -19,6 +20,20 @@ module EasyML
|
|
19
20
|
POLARS_MAP.dig(polars_type.class.to_s)
|
20
21
|
end
|
21
22
|
|
23
|
+
# Rebuilds a Polars dtype from its string form.
#
# Strings starting with "Polars::Datetime" are parsed for their time_unit and
# time_zone arguments; any other "Polars::" string is resolved by looking up
# its last "::" segment as a constant on Polars.
#
# @param dtype_string [String] e.g. 'Polars::Datetime(time_unit: "us", time_zone: nil)'
#   or "Polars::Int64"
# @return [Object] a Polars::Datetime instance, or the constant found on Polars
# @raise [ArgumentError] when the string does not start with "Polars::"
def parse_polars_dtype(dtype_string)
  case dtype_string
  # \A (not ^) so only the very start of the string can select a branch.
  when /\APolars::Datetime/
    time_unit = dtype_string[/time_unit: "(.*?)"/, 1]
    time_zone = dtype_string[/time_zone: (.*)?\)/, 1]
    # A literal nil in the string means "no zone"; otherwise strip the quotes.
    time_zone = time_zone == "nil" ? nil : time_zone&.delete('"')
    Polars::Datetime.new(time_unit, time_zone)
  when /\APolars::/
    Polars.const_get(dtype_string.split("::").last)
  else
    raise ArgumentError, "Unknown Polars data type: #{dtype_string}"
  end
end
|
36
|
+
|
22
37
|
def sym_to_polars(symbol)
|
23
38
|
TYPE_MAP.dig(symbol)
|
24
39
|
end
|
@@ -50,6 +65,8 @@ module EasyML
|
|
50
65
|
:boolean
|
51
66
|
when Polars::Utf8
|
52
67
|
determine_string_type(series)
|
68
|
+
when Polars::Null
|
69
|
+
:null
|
53
70
|
else
|
54
71
|
:categorical
|
55
72
|
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module EasyML
  module Data
    # Applies a fixed sequence of query options to a dataframe held in memory.
    class PolarsInMemory
      attr_reader :df

      # @param df [Polars::DataFrame, nil] the frame to query
      def initialize(df)
        @df = df
      end

      # Convenience wrapper: builds an instance around `df` and runs #query.
      def self.query(df, **kwargs)
        new(df).query(**kwargs)
      end

      # Runs the options in this order: filter, select, unique, drop, sort, limit.
      # The wrapped frame is cloned first and is never reassigned.
      #
      # @param drop_cols [Array] columns to drop; names absent from the frame are ignored
      # @param filter [Object, nil] passed to the frame's #filter when given
      # @param limit [Integer, nil] passed to #limit when given
      # @param select [Object, Array, nil] column name(s) to keep; names absent from the
      #   frame are ignored, and if none remain no selection is applied at all
      # @param unique [Boolean, nil] calls #unique when truthy
      # @param sort [Object, nil] passed to #sort when given
      # @param descending [Boolean] forwarded to #sort as reverse:
      # @return [Polars::DataFrame, nil] the resulting frame, or nil when df is nil
      def query(drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false)
        return if df.nil?

        frame = df.clone
        frame = frame.filter(filter) if filter

        wanted = frame.columns & [select].flatten
        frame = frame.select(wanted) unless wanted.empty?

        frame = frame.unique if unique

        # Only drop what survived the selection above.
        droppable = drop_cols & frame.columns
        frame = frame.drop(droppable) unless droppable.empty?

        frame = frame.sort(sort, reverse: descending) if sort
        frame = frame.limit(limit) if limit
        frame
      end
    end
  end
end
|
@@ -12,6 +12,22 @@ module EasyML
|
|
12
12
|
@schema = options[:schema]
|
13
13
|
end
|
14
14
|
|
15
|
+
# Fingerprints the reader's parquet files.
#
# Each file (in sorted path order) contributes a SHA256 of its parquet schema
# as JSON joined to its row count with "|". The per-file digests are then
# concatenated and hashed again. Only the schema and row count feed the digest.
#
# @return [String] hex SHA256 digest
def sha
  per_file_digests = parquet_files.sort.map do |file|
    schema_json = Polars.read_parquet_schema(file).to_json
    rows = Polars.scan_parquet(file).select(Polars.col("*").count).collect[0, 0]

    Digest::SHA256.hexdigest("#{schema_json}|#{rows}")
  end

  Digest::SHA256.hexdigest(per_file_digests.join)
end
|
30
|
+
|
15
31
|
def schema=(value)
|
16
32
|
@schema = value
|
17
33
|
polars_args[:dtypes] = value
|
@@ -55,7 +71,10 @@ module EasyML
|
|
55
71
|
return files if any_parquet? && columns.nil?
|
56
72
|
|
57
73
|
puts "Converting to Parquet..."
|
58
|
-
|
74
|
+
if columns.nil? || columns.all? { |c| c.datatype.nil? }
|
75
|
+
learn_dataset
|
76
|
+
columns = nil
|
77
|
+
end
|
59
78
|
csv_files.each do |path|
|
60
79
|
df = read_file(path, columns)
|
61
80
|
df = cast(df, columns)
|
@@ -23,7 +23,7 @@ module EasyML
|
|
23
23
|
end
|
24
24
|
|
25
25
|
def read(segment, split_ys: false, target: nil, drop_cols: [], filter: nil, limit: nil, select: nil,
|
26
|
-
unique: nil)
|
26
|
+
unique: nil, sort: nil, descending: false)
|
27
27
|
return nil if @data.keys.none?
|
28
28
|
|
29
29
|
df = if segment.to_s == "all"
|
@@ -33,10 +33,8 @@ module EasyML
|
|
33
33
|
end
|
34
34
|
return nil if df.nil?
|
35
35
|
|
36
|
-
df =
|
37
|
-
|
38
|
-
df = df.drop(drop_cols) unless drop_cols.empty?
|
39
|
-
df = df.unique if unique
|
36
|
+
df = EasyML::Data::PolarsInMemory.query(df, drop_cols: drop_cols, filter: filter, limit: limit, select: select,
|
37
|
+
unique: unique, sort: sort, descending: descending)
|
40
38
|
|
41
39
|
split_features_targets(df, split_ys, target)
|
42
40
|
end
|
@@ -7,10 +7,11 @@ module EasyML
|
|
7
7
|
VALID_SEGMENTS = %w[train test valid all].freeze
|
8
8
|
|
9
9
|
def initialize(options = {})
|
10
|
+
# Method kept for compatibility with subclasses
|
10
11
|
end
|
11
12
|
|
12
13
|
def load_data(segment, **kwargs)
|
13
|
-
drop_cols = dataset.drop_columns(all_columns: kwargs[:all_columns]
|
14
|
+
drop_cols = dataset.drop_columns(all_columns: kwargs.key?(:all_columns) && kwargs[:all_columns])
|
14
15
|
kwargs.delete(:all_columns)
|
15
16
|
kwargs = kwargs.merge!(drop_cols: drop_cols, target: dataset.target)
|
16
17
|
read(segment, **kwargs)
|
data/lib/easy_ml/data.rb
CHANGED
@@ -2,11 +2,10 @@ module EasyML
|
|
2
2
|
module Data
|
3
3
|
require_relative "data/utils"
|
4
4
|
require_relative "data/polars_reader"
|
5
|
+
require_relative "data/polars_in_memory"
|
5
6
|
require_relative "data/synced_directory"
|
6
|
-
require_relative "data/preprocessor"
|
7
7
|
require_relative "data/splits"
|
8
8
|
require_relative "data/polars_column"
|
9
|
-
require_relative "data/statistics_learner"
|
10
9
|
require_relative "data/date_converter"
|
11
10
|
end
|
12
11
|
end
|
@@ -40,8 +40,8 @@ module EasyML
|
|
40
40
|
end
|
41
41
|
end
|
42
42
|
|
43
|
-
def query(
|
44
|
-
query_all_partitions(
|
43
|
+
# Public query entry point: forwards every keyword option unchanged to
# query_all_partitions and returns its result.
def query(**options)
  query_all_partitions(**options)
end
|
46
46
|
|
47
47
|
def empty?
|
@@ -82,18 +82,40 @@ module EasyML
|
|
82
82
|
|
83
83
|
private
|
84
84
|
|
85
|
+
# Removes stored feature files before a write.
#
# @param type [Symbol]
#   :partitions    - removes every entry returned by list_partitions
#   :no_partitions - removes feature_path (rm_rf, so a missing path is fine)
#   :all           - delegates to wipe
#   Any other value does nothing.
def cleanup(type: :partitions)
  if type == :partitions
    list_partitions.each { |partition| FileUtils.rm(partition) }
  elsif type == :no_partitions
    FileUtils.rm_rf(feature_path)
  elsif type == :all
    wipe
  end
end
|
97
|
+
|
85
98
|
# Writes the whole frame to feature_path while holding the file lock.
# Partition files are cleaned up first, inside the same lock.
#
# @param df [Polars::DataFrame] frame to persist
def store_without_partitioning(df)
  lock_file do
    cleanup(type: :partitions)
    safe_write(df, feature_path)
  end
end
|
105
|
+
|
106
|
+
# Writes `df` as parquet to `path`, creating the parent directory if needed.
#
# Failures are not rescued here: the previous rescue branch only opened a
# debugger (binding.pry), which hid the error from callers and raised
# NoMethodError wherever pry is not loaded.
#
# @param df [#write_parquet] frame to persist
# @param path [String] destination file path
# @raise whatever FileUtils.mkdir_p or df.write_parquet raise
def safe_write(df, path)
  FileUtils.mkdir_p(File.dirname(path))
  df.write_parquet(path)
end
|
92
114
|
|
93
115
|
def store_partition(partition_df, primary_key, partition_start)
|
94
116
|
lock_partition(partition_start) do
|
117
|
+
cleanup(type: :no_partitions)
|
95
118
|
path = partition_path(partition_start)
|
96
|
-
FileUtils.mkdir_p(File.dirname(path))
|
97
119
|
|
98
120
|
if File.exist?(path)
|
99
121
|
reader = EasyML::Data::PolarsReader.new
|
@@ -101,36 +123,25 @@ module EasyML
|
|
101
123
|
preserved_records = existing_df.filter(
|
102
124
|
Polars.col(primary_key).is_in(partition_df[primary_key]).is_not
|
103
125
|
)
|
104
|
-
|
126
|
+
if preserved_records.shape[1] != partition_df.shape[1]
|
127
|
+
wipe
|
128
|
+
else
|
129
|
+
partition_df = Polars.concat([preserved_records, partition_df], how: "vertical")
|
130
|
+
end
|
105
131
|
end
|
106
132
|
|
107
|
-
partition_df
|
133
|
+
safe_write(partition_df, path)
|
108
134
|
end
|
109
135
|
end
|
110
136
|
|
111
|
-
def
|
112
|
-
primary_key_values = filter.extract_primary_key_values
|
113
|
-
batch_size = feature.batch_size || 10_000
|
114
|
-
|
115
|
-
partition_files = primary_key_values.map do |key|
|
116
|
-
partition_start = (key / batch_size.to_f).floor * batch_size
|
117
|
-
partition_path(partition_start)
|
118
|
-
end.uniq.select { |path| File.exist?(path) }
|
119
|
-
|
120
|
-
return Polars::DataFrame.new if partition_files.empty?
|
121
|
-
|
122
|
-
reader = EasyML::Data::PolarsReader.new
|
123
|
-
reader.query(partition_files, filter: filter)
|
124
|
-
end
|
125
|
-
|
126
|
-
def query_all_partitions(filter)
|
137
|
+
def query_all_partitions(**kwargs)
|
127
138
|
reader = EasyML::Data::PolarsReader.new
|
128
139
|
pattern = File.join(feature_dir, "feature*.parquet")
|
129
140
|
files = Dir.glob(pattern)
|
130
141
|
|
131
142
|
return Polars::DataFrame.new if files.empty?
|
132
143
|
|
133
|
-
reader.query(files,
|
144
|
+
reader.query(files, **kwargs)
|
134
145
|
end
|
135
146
|
|
136
147
|
def compute_partition_boundaries(min_key, max_key, batch_size)
|
@@ -44,6 +44,10 @@ module EasyML
|
|
44
44
|
add_computed_columns_to_easy_ml_columns
|
45
45
|
add_slug_to_easy_ml_models
|
46
46
|
add_default_to_is_target
|
47
|
+
remove_preprocessor_statistics_from_easy_ml_datasets
|
48
|
+
add_learned_at_to_easy_ml_columns
|
49
|
+
add_sha_to_datasources_datasets_and_columns
|
50
|
+
add_last_feature_sha_to_columns
|
47
51
|
].freeze
|
48
52
|
|
49
53
|
# Specify the next migration number
|
@@ -2,13 +2,17 @@ class AddComputedColumnsToEasyMLColumns < ActiveRecord::Migration[<%= ActiveReco
|
|
2
2
|
# Adds computed_by, is_computed and feature_id, each with an index, to both
# the columns table and its history table.
# NOTE(review): feature_id is being added to an already-published migration
# template; apps that ran the earlier version will not get it — confirm.
def change
  %i[easy_ml_columns easy_ml_column_histories].each do |table|
    add_column table, :computed_by, :string
    add_column table, :is_computed, :boolean, default: false
    add_column table, :feature_id, :bigint

    %i[computed_by is_computed feature_id].each { |column| add_index table, column }
  end
end
|
14
18
|
end
|
@@ -0,0 +1,9 @@
|
|
1
|
+
class AddLastFeatureShaToColumns < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
|
2
|
+
def change
|
3
|
+
add_column :easy_ml_columns, :last_feature_sha, :string
|
4
|
+
add_index :easy_ml_columns, :last_feature_sha
|
5
|
+
|
6
|
+
add_column :easy_ml_column_histories, :last_feature_sha, :string
|
7
|
+
add_index :easy_ml_column_histories, :last_feature_sha
|
8
|
+
end
|
9
|
+
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
class AddLearnedAtToEasyMLColumns < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
|
2
|
+
def change
|
3
|
+
add_column :easy_ml_columns, :learned_at, :timestamp
|
4
|
+
add_column :easy_ml_columns, :is_learning, :boolean, default: false
|
5
|
+
add_index :easy_ml_columns, :learned_at
|
6
|
+
add_index :easy_ml_columns, :is_learning
|
7
|
+
|
8
|
+
add_column :easy_ml_column_histories, :learned_at, :timestamp
|
9
|
+
add_column :easy_ml_column_histories, :is_learning, :boolean, default: false
|
10
|
+
add_index :easy_ml_column_histories, :learned_at
|
11
|
+
add_index :easy_ml_column_histories, :is_learning
|
12
|
+
end
|
13
|
+
end
|
data/lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
class AddShaToDatasourcesDatasetsAndColumns < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
|
2
|
+
def change
|
3
|
+
add_column :easy_ml_datasources, :sha, :string
|
4
|
+
add_column :easy_ml_datasets, :last_datasource_sha, :string
|
5
|
+
|
6
|
+
add_index :easy_ml_datasources, :sha
|
7
|
+
add_index :easy_ml_datasets, :last_datasource_sha
|
8
|
+
|
9
|
+
add_column :easy_ml_datasource_histories, :sha, :string
|
10
|
+
add_index :easy_ml_datasource_histories, :sha
|
11
|
+
|
12
|
+
add_column :easy_ml_dataset_histories, :last_datasource_sha, :string
|
13
|
+
add_index :easy_ml_dataset_histories, :last_datasource_sha
|
14
|
+
|
15
|
+
add_column :easy_ml_columns, :last_datasource_sha, :string
|
16
|
+
add_index :easy_ml_columns, :last_datasource_sha
|
17
|
+
|
18
|
+
add_column :easy_ml_column_histories, :last_datasource_sha, :string
|
19
|
+
add_index :easy_ml_column_histories, :last_datasource_sha
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
class RemovePreprocessorStatisticsFromEasyMLDatasets < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
|
2
|
+
def change
|
3
|
+
if column_exists?(:easy_ml_datasets, :preprocessor_statistics)
|
4
|
+
remove_column :easy_ml_datasets, :preprocessor_statistics
|
5
|
+
end
|
6
|
+
|
7
|
+
if column_exists?(:easy_ml_dataset_histories, :preprocessor_statistics)
|
8
|
+
remove_column :easy_ml_dataset_histories, :preprocessor_statistics
|
9
|
+
end
|
10
|
+
end
|
11
|
+
end
|