easy_ml 0.2.0.pre.rc100 → 0.2.0.pre.rc102
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/datasets_controller.rb +1 -0
- data/app/frontend/components/dataset/splitters/types.ts +3 -4
- data/app/frontend/pages/NewDatasetPage.tsx +17 -0
- data/app/frontend/types/datasource.ts +14 -6
- data/app/models/easy_ml/column/imputers/base.rb +3 -1
- data/app/models/easy_ml/column.rb +8 -0
- data/app/models/easy_ml/dataset/learner/lazy.rb +16 -3
- data/app/models/easy_ml/dataset.rb +47 -9
- data/app/models/easy_ml/dataset_history.rb +1 -0
- data/app/models/easy_ml/feature.rb +5 -13
- data/app/models/easy_ml/lineage.rb +2 -1
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +1 -0
- data/app/models/easy_ml/models/xgboost.rb +7 -2
- data/app/models/easy_ml/prediction.rb +1 -1
- data/app/models/easy_ml/splitters/base_splitter.rb +4 -8
- data/app/models/easy_ml/splitters/date_splitter.rb +2 -1
- data/app/models/easy_ml/splitters/predefined_splitter.rb +8 -3
- data/config/initializers/zhong.rb +6 -0
- data/lib/easy_ml/data/dataset_manager/schema/normalizer.rb +201 -0
- data/lib/easy_ml/data/dataset_manager/schema.rb +9 -0
- data/lib/easy_ml/data/dataset_manager.rb +5 -0
- data/lib/easy_ml/data/date_converter.rb +24 -165
- data/lib/easy_ml/data/polars_column.rb +5 -2
- data/lib/easy_ml/data/polars_reader.rb +4 -1
- data/lib/easy_ml/data/synced_directory.rb +36 -23
- data/lib/easy_ml/engine.rb +4 -0
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +1 -0
- data/lib/easy_ml/railtie/templates/migration/add_view_class_to_easy_ml_datasets.rb.tt +9 -0
- data/lib/easy_ml/version.rb +1 -1
- data/public/easy_ml/assets/.vite/manifest.json +1 -1
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-BXwsBCuQ.js → Application.tsx-CRS5bRgw.js} +8 -8
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-BXwsBCuQ.js.map → Application.tsx-CRS5bRgw.js.map} +1 -1
- metadata +7 -5
- data/lib/easy_ml/data/dataset_manager/normalizer.rb +0 -0
@@ -3,6 +3,7 @@ module EasyML
|
|
3
3
|
class DatasetManager
|
4
4
|
require_relative "dataset_manager/writer"
|
5
5
|
require_relative "dataset_manager/reader"
|
6
|
+
require_relative "dataset_manager/schema"
|
6
7
|
|
7
8
|
attr_accessor :root_dir, :partition, :append_only, :filenames, :primary_key,
|
8
9
|
:partition_size, :s3_bucket, :s3_prefix, :s3_access_key_id,
|
@@ -55,6 +56,10 @@ module EasyML
|
|
55
56
|
def cp(from, to)
|
56
57
|
Writer.cp(from, to)
|
57
58
|
end
|
59
|
+
|
60
|
+
def normalize_schema(files)
|
61
|
+
Schema::Normalizer.new(files).normalize
|
62
|
+
end
|
58
63
|
end
|
59
64
|
|
60
65
|
def list_nulls(input = nil, **kwargs, &block)
|
@@ -2,10 +2,10 @@ module EasyML
|
|
2
2
|
module Data
|
3
3
|
module DateConverter
|
4
4
|
COMMON_DATE_FORMATS = [
|
5
|
+
"%Y-%m-%d %H:%M:%S.%f %Z",
|
5
6
|
"%Y-%m-%dT%H:%M:%S.%6N", # e.g., "2021-01-01T00:00:00.000000"
|
6
7
|
"%Y-%m-%d %H:%M:%S.%L Z", # e.g., "2025-01-03 23:04:49.492 Z"
|
7
8
|
"%Y-%m-%d %H:%M:%S.%L", # e.g., "2021-01-01 00:01:36.000"
|
8
|
-
"%Y-%m-%d %H:%M:%S.%L", # duplicate format intentionally
|
9
9
|
"%Y-%m-%d %H:%M:%S", # e.g., "2021-01-01 00:01:36"
|
10
10
|
"%Y-%m-%d %H:%M", # e.g., "2021-01-01 00:01"
|
11
11
|
"%Y-%m-%d", # e.g., "2021-01-01"
|
@@ -19,179 +19,38 @@ module EasyML
|
|
19
19
|
"%Y/%m/%d", # e.g., "2021/01/01"
|
20
20
|
].freeze
|
21
21
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
},
|
28
|
-
}.freeze
|
29
|
-
|
30
|
-
class << self
|
31
|
-
# Infers a strftime format string from the given date string.
|
32
|
-
#
|
33
|
-
# @param date_str [String] The date string to analyze.
|
34
|
-
# @return [String, nil] The corresponding strftime format if recognized, or nil if not.
|
35
|
-
def infer_strftime_format(date_str)
|
36
|
-
return nil if date_str.blank?
|
37
|
-
|
38
|
-
# YYYY-MM-DD (e.g., "2021-01-01")
|
39
|
-
return "%Y-%m-%d" if date_str =~ /^\d{4}-\d{2}-\d{2}$/
|
40
|
-
|
41
|
-
# YYYY/MM/DD (e.g., "2021/01/01")
|
42
|
-
return "%Y/%m/%d" if date_str =~ /^\d{4}\/\d{2}\/\d{2}$/
|
43
|
-
|
44
|
-
# Date & time with T separator (ISO 8601-like)
|
45
|
-
if date_str.include?("T")
|
46
|
-
# Without fractional seconds, e.g., "2021-01-01T12:34:56"
|
47
|
-
return "%Y-%m-%dT%H:%M:%S" if date_str =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$/
|
48
|
-
|
49
|
-
# With fractional seconds, e.g., "2021-01-01T12:34:56.789" or "2021-01-01T12:34:56.123456"
|
50
|
-
if date_str =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.(\d+)$/
|
51
|
-
fraction = Regexp.last_match(1)
|
52
|
-
case fraction.length
|
53
|
-
when 3 then return "%Y-%m-%dT%H:%M:%S.%L" # milliseconds
|
54
|
-
when 6 then return "%Y-%m-%dT%H:%M:%S.%6N" # microseconds
|
55
|
-
when 9 then return "%Y-%m-%dT%H:%M:%S.%N" # nanoseconds
|
56
|
-
else
|
57
|
-
# Fallback if fractional part has unexpected length:
|
58
|
-
return "%Y-%m-%dT%H:%M:%S.%N"
|
59
|
-
end
|
60
|
-
end
|
61
|
-
end
|
62
|
-
|
63
|
-
# Date & time with space separator
|
64
|
-
if date_str.include?(" ")
|
65
|
-
# Without fractional seconds, e.g., "2021-01-01 12:34:56"
|
66
|
-
return "%Y-%m-%d %H:%M:%S" if date_str =~ /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$/
|
67
|
-
|
68
|
-
# With fractional seconds, e.g., "2021-01-01 12:34:56.789"
|
69
|
-
if date_str =~ /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.(\d+)$/
|
70
|
-
fraction = Regexp.last_match(1)
|
71
|
-
case fraction.length
|
72
|
-
when 3 then return "%Y-%m-%d %H:%M:%S.%L"
|
73
|
-
when 6 then return "%Y-%m-%d %H:%M:%S.%6N"
|
74
|
-
when 9 then return "%Y-%m-%d %H:%M:%S.%N"
|
75
|
-
else
|
76
|
-
return "%Y-%m-%d %H:%M:%S.%N"
|
77
|
-
end
|
78
|
-
end
|
79
|
-
end
|
80
|
-
|
81
|
-
# Common US-style formats
|
82
|
-
|
83
|
-
# MM/DD/YYYY (e.g., "01/31/2021")
|
84
|
-
return "%m/%d/%Y" if date_str =~ /^\d{2}\/\d{2}\/\d{4}$/
|
85
|
-
|
86
|
-
# DD-MM-YYYY (e.g., "31-01-2021")
|
87
|
-
return "%d-%m-%Y" if date_str =~ /^\d{2}-\d{2}-\d{4}$/
|
88
|
-
|
89
|
-
# DD-Mon-YYYY (e.g., "31-Jan-2021")
|
90
|
-
return "%d-%b-%Y" if date_str =~ /^\d{2}-[A-Za-z]{3}-\d{4}$/
|
91
|
-
|
92
|
-
# Mon DD, YYYY (e.g., "Jan 31, 2021")
|
93
|
-
return "%b %d, %Y" if date_str =~ /^[A-Za-z]{3} \d{2}, \d{4}$/
|
94
|
-
|
95
|
-
# Could add additional heuristics as needed...
|
96
|
-
|
97
|
-
nil # Return nil if no known format matches.
|
22
|
+
def self.maybe_convert_date(df, column = nil)
|
23
|
+
column = column.to_s if column.present?
|
24
|
+
if df.is_a?(Polars::Series)
|
25
|
+
column = "temp" if column.nil?
|
26
|
+
df = Polars::DataFrame.new({ column.to_s => df })
|
98
27
|
end
|
28
|
+
return df unless df.columns.include?(column)
|
29
|
+
return df if df[column].dtype.is_a?(Polars::Datetime)
|
99
30
|
|
100
|
-
|
101
|
-
|
102
|
-
# @param column [String] The name of the column to convert.
|
103
|
-
# @return [Polars::DataFrame] The dataframe with the converted column (if successful).
|
104
|
-
def maybe_convert_date(df, column = nil)
|
105
|
-
if column.nil?
|
106
|
-
series = df
|
107
|
-
column = series.name
|
108
|
-
df = Polars::DataFrame.new(series)
|
109
|
-
else
|
110
|
-
series = df[column]
|
111
|
-
end
|
112
|
-
|
113
|
-
return df if series.dtype.is_a?(Polars::Datetime)
|
114
|
-
return df unless series.dtype == Polars::Utf8
|
115
|
-
|
116
|
-
sample = series.filter(series.is_not_null).head(100).to_a
|
117
|
-
ruby_format = detect_date_format(sample)
|
31
|
+
conversions = df.select(queries(column)).to_hashes&.first || []
|
32
|
+
return df unless conversions.any?
|
118
33
|
|
119
|
-
|
120
|
-
|
121
|
-
df = try_format(df, column, format)
|
34
|
+
conversions = conversions.select { |k, v| v }
|
35
|
+
return df unless conversions.any?
|
122
36
|
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
infer_strftime_format(date_str)
|
127
|
-
end.max_by { |_format, count| count }[0]
|
128
|
-
df = try_format(df, column, best_format)
|
129
|
-
end
|
130
|
-
|
131
|
-
df = df.with_column(df["TRY"].alias(column.to_s)).drop("TRY")
|
132
|
-
end
|
133
|
-
|
134
|
-
df
|
135
|
-
end
|
136
|
-
|
137
|
-
private
|
138
|
-
|
139
|
-
def try_format(df, column, format)
|
140
|
-
df = df.with_column(
|
141
|
-
Polars.col(column.to_s)
|
142
|
-
.str
|
143
|
-
.strptime(Polars::Datetime, format, strict: false)
|
144
|
-
.alias("TRY")
|
145
|
-
)
|
146
|
-
end
|
147
|
-
|
148
|
-
def detect_polars_format(series)
|
149
|
-
return nil unless series.is_a?(Polars::Series)
|
150
|
-
|
151
|
-
sample = series.filter(series.is_not_null).head(100).to_a
|
152
|
-
ruby_format = detect_date_format(sample)
|
153
|
-
convert_format(:ruby_to_polars, ruby_format)
|
37
|
+
conversions.map do |k, _|
|
38
|
+
conversion = conversion(k)
|
39
|
+
df = df.with_columns(conversion)
|
154
40
|
end
|
155
41
|
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
sample = date_strings.compact.sample([100, date_strings.length].min)
|
160
|
-
|
161
|
-
best_format = nil
|
162
|
-
best_success_rate = 0.0
|
163
|
-
sample_count = sample.length
|
164
|
-
|
165
|
-
COMMON_DATE_FORMATS.each do |fmt|
|
166
|
-
success_count = sample.count do |date_str|
|
167
|
-
begin
|
168
|
-
DateTime.strptime(date_str, fmt)
|
169
|
-
true
|
170
|
-
rescue StandardError
|
171
|
-
false
|
172
|
-
end
|
173
|
-
end
|
174
|
-
success_rate = success_count.to_f / sample_count
|
175
|
-
if success_rate > best_success_rate
|
176
|
-
best_success_rate = success_rate
|
177
|
-
best_format = fmt
|
178
|
-
end
|
179
|
-
# If every sample string matches this format, return it immediately.
|
180
|
-
return fmt if success_rate == 1.0
|
181
|
-
end
|
42
|
+
df
|
43
|
+
end
|
182
44
|
|
183
|
-
|
45
|
+
def self.queries(column)
|
46
|
+
COMMON_DATE_FORMATS.map do |format|
|
47
|
+
Polars.col(column).cast(Polars::String).str.strptime(Polars::Datetime, format, strict: false).is_not_null().all().alias("convert_#{column}_to_#{format}")
|
184
48
|
end
|
49
|
+
end
|
185
50
|
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
result = format.dup
|
190
|
-
FORMAT_MAPPINGS[conversion].each do |from, to|
|
191
|
-
result = result.gsub(from, to)
|
192
|
-
end
|
193
|
-
result
|
194
|
-
end
|
51
|
+
def self.conversion(key)
|
52
|
+
key, ruby_type = key.split("convert_").last.split("_to_")
|
53
|
+
Polars.col(key).cast(Polars::String).str.strptime(Polars::Datetime, ruby_type, strict: false).cast(Polars::Datetime).alias(key)
|
195
54
|
end
|
196
55
|
end
|
197
56
|
end
|
@@ -16,10 +16,12 @@ module EasyML
|
|
16
16
|
array: Polars::List,
|
17
17
|
}
|
18
18
|
POLARS_MAP = {
|
19
|
+
Polars::Decimal => :float,
|
19
20
|
Polars::Float64 => :float,
|
20
21
|
Polars::Int64 => :integer,
|
21
|
-
Polars::Float32 => :float,
|
22
22
|
Polars::Int32 => :integer,
|
23
|
+
Polars::Int8 => :integer,
|
24
|
+
Polars::Float32 => :float,
|
23
25
|
Polars::Boolean => :boolean,
|
24
26
|
Polars::Datetime => :datetime,
|
25
27
|
Polars::Date => :date,
|
@@ -137,8 +139,9 @@ module EasyML
|
|
137
139
|
return :numeric
|
138
140
|
rescue StandardError
|
139
141
|
# If not numeric, check for datetime or categorical
|
140
|
-
|
142
|
+
is_datetime = EasyML::Data::DateConverter.maybe_convert_date(Polars::DataFrame.new({ temp: series }),
|
141
143
|
:temp)[:temp].dtype.is_a?(Polars::Datetime)
|
144
|
+
if is_datetime
|
142
145
|
:datetime
|
143
146
|
else
|
144
147
|
categorical_or_text?(series)
|
@@ -319,7 +319,10 @@ module EasyML
|
|
319
319
|
def learn_dataset
|
320
320
|
return schema if schema.present?
|
321
321
|
|
322
|
-
|
322
|
+
if parquet_files.present?
|
323
|
+
EasyML::Data::DatasetManager.normalize_schema(parquet_files)
|
324
|
+
existing_schema = existing_parquet_schema
|
325
|
+
end
|
323
326
|
schema = existing_schema || normalize_dataset
|
324
327
|
|
325
328
|
self.schema = schema
|
@@ -126,7 +126,7 @@ module EasyML
|
|
126
126
|
)
|
127
127
|
|
128
128
|
Rails.logger.info("Downloaded #{object.key} to #{local_file_path}")
|
129
|
-
if object.key.end_with?(".gz")
|
129
|
+
if object.key.end_with?(".gz") && !object.key.end_with?(".parquet.gz")
|
130
130
|
ungzipped_file_path = ungzip_file(local_file_path)
|
131
131
|
Rails.logger.info("Ungzipped to #{ungzipped_file_path}")
|
132
132
|
end
|
@@ -284,48 +284,61 @@ module EasyML
|
|
284
284
|
relative_path = Pathname.new(file_path).relative_path_from(Pathname.new(root_dir)).to_s
|
285
285
|
s3_key = s3_prefix.present? ? File.join(s3_prefix, File.basename(relative_path)) : relative_path
|
286
286
|
|
287
|
-
# Create a temporary gzipped version of the file
|
288
|
-
gzipped_file_path = "#{file_path}.gz"
|
289
|
-
|
290
287
|
begin
|
291
|
-
Rails.logger.info("
|
288
|
+
Rails.logger.info("Uploading #{file_path} to s3://#{s3_bucket}/#{s3_key}")
|
292
289
|
|
293
|
-
|
294
|
-
|
290
|
+
if file_path.end_with?(".parquet")
|
291
|
+
# Upload parquet files directly without compression
|
295
292
|
File.open(file_path, "rb") do |file|
|
296
|
-
|
293
|
+
s3.put_object(
|
294
|
+
bucket: s3_bucket,
|
295
|
+
key: s3_key,
|
296
|
+
body: file
|
297
|
+
)
|
298
|
+
end
|
299
|
+
Rails.logger.info("Successfully uploaded #{file_path} to s3://#{s3_bucket}/#{s3_key}")
|
300
|
+
else
|
301
|
+
# Create a temporary gzipped version of the file
|
302
|
+
gzipped_file_path = "#{file_path}.gz"
|
303
|
+
|
304
|
+
# Compress the file
|
305
|
+
Zlib::GzipWriter.open(gzipped_file_path) do |gz|
|
306
|
+
File.open(file_path, "rb") do |file|
|
307
|
+
gz.write(file.read)
|
308
|
+
end
|
297
309
|
end
|
298
|
-
end
|
299
310
|
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
311
|
+
# Upload the gzipped file
|
312
|
+
File.open(gzipped_file_path, "rb") do |file|
|
313
|
+
s3.put_object(
|
314
|
+
bucket: s3_bucket,
|
315
|
+
key: "#{s3_key}.gz",
|
316
|
+
body: file,
|
317
|
+
content_encoding: "gzip",
|
318
|
+
)
|
319
|
+
end
|
320
|
+
|
321
|
+
Rails.logger.info("Successfully uploaded #{file_path} to s3://#{s3_bucket}/#{s3_key}.gz")
|
309
322
|
|
310
|
-
|
323
|
+
# Clean up temporary gzipped file
|
324
|
+
File.delete(gzipped_file_path) if File.exist?(gzipped_file_path)
|
325
|
+
end
|
311
326
|
rescue Aws::S3::Errors::ServiceError, StandardError => e
|
312
327
|
Rails.logger.error("Failed to upload #{file_path}: #{e.message}")
|
313
328
|
raise e
|
314
|
-
ensure
|
315
|
-
# Clean up temporary gzipped file
|
316
|
-
File.delete(gzipped_file_path) if File.exist?(gzipped_file_path)
|
317
329
|
end
|
318
330
|
end
|
319
331
|
|
320
332
|
def should_upload?(file_path)
|
321
333
|
relative_path = Pathname.new(file_path).relative_path_from(Pathname.new(root_dir)).to_s
|
322
334
|
s3_key = s3_prefix.present? ? File.join(s3_prefix, relative_path) : relative_path
|
335
|
+
s3_key = "#{s3_key}.gz" unless file_path.end_with?(".parquet")
|
323
336
|
|
324
337
|
begin
|
325
338
|
# Check if file exists in S3
|
326
339
|
response = s3.head_object(
|
327
340
|
bucket: s3_bucket,
|
328
|
-
key:
|
341
|
+
key: s3_key,
|
329
342
|
)
|
330
343
|
|
331
344
|
# Compare modification times
|
data/lib/easy_ml/engine.rb
CHANGED
@@ -0,0 +1,9 @@
|
|
1
|
+
class AddViewClassToEasyMLDatasets < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
|
2
|
+
def change
|
3
|
+
add_column :easy_ml_datasets, :view_class, :string
|
4
|
+
add_index :easy_ml_datasets, :view_class
|
5
|
+
|
6
|
+
add_column :easy_ml_dataset_histories, :view_class, :string
|
7
|
+
add_index :easy_ml_dataset_histories, :view_class
|
8
|
+
end
|
9
|
+
end
|
data/lib/easy_ml/version.rb
CHANGED