easy_ml 0.2.0.pre.rc101 → 0.2.0.pre.rc102
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/datasets_controller.rb +1 -0
- data/app/frontend/components/dataset/splitters/types.ts +3 -4
- data/app/frontend/pages/NewDatasetPage.tsx +17 -0
- data/app/frontend/types/datasource.ts +14 -6
- data/app/models/easy_ml/column/imputers/base.rb +3 -1
- data/app/models/easy_ml/column.rb +8 -0
- data/app/models/easy_ml/dataset/learner/lazy.rb +16 -3
- data/app/models/easy_ml/dataset.rb +47 -9
- data/app/models/easy_ml/dataset_history.rb +1 -0
- data/app/models/easy_ml/feature.rb +5 -13
- data/app/models/easy_ml/lineage.rb +2 -1
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +1 -0
- data/app/models/easy_ml/models/xgboost.rb +7 -2
- data/app/models/easy_ml/prediction.rb +1 -1
- data/app/models/easy_ml/splitters/base_splitter.rb +4 -8
- data/app/models/easy_ml/splitters/date_splitter.rb +2 -1
- data/app/models/easy_ml/splitters/predefined_splitter.rb +8 -3
- data/lib/easy_ml/data/dataset_manager/schema/normalizer.rb +201 -0
- data/lib/easy_ml/data/dataset_manager/schema.rb +9 -0
- data/lib/easy_ml/data/dataset_manager.rb +5 -0
- data/lib/easy_ml/data/date_converter.rb +24 -165
- data/lib/easy_ml/data/polars_column.rb +4 -2
- data/lib/easy_ml/data/polars_reader.rb +4 -1
- data/lib/easy_ml/engine.rb +4 -0
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +1 -0
- data/lib/easy_ml/railtie/templates/migration/add_view_class_to_easy_ml_datasets.rb.tt +9 -0
- data/lib/easy_ml/version.rb +1 -1
- data/public/easy_ml/assets/.vite/manifest.json +1 -1
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-BXwsBCuQ.js → Application.tsx-CRS5bRgw.js} +8 -8
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-BXwsBCuQ.js.map → Application.tsx-CRS5bRgw.js.map} +1 -1
- metadata +7 -5
- data/lib/easy_ml/data/dataset_manager/normalizer.rb +0 -0
@@ -3,6 +3,7 @@ module EasyML
|
|
3
3
|
class DatasetManager
|
4
4
|
require_relative "dataset_manager/writer"
|
5
5
|
require_relative "dataset_manager/reader"
|
6
|
+
require_relative "dataset_manager/schema"
|
6
7
|
|
7
8
|
attr_accessor :root_dir, :partition, :append_only, :filenames, :primary_key,
|
8
9
|
:partition_size, :s3_bucket, :s3_prefix, :s3_access_key_id,
|
@@ -55,6 +56,10 @@ module EasyML
|
|
55
56
|
def cp(from, to)
|
56
57
|
Writer.cp(from, to)
|
57
58
|
end
|
59
|
+
|
60
|
+
def normalize_schema(files)
|
61
|
+
Schema::Normalizer.new(files).normalize
|
62
|
+
end
|
58
63
|
end
|
59
64
|
|
60
65
|
def list_nulls(input = nil, **kwargs, &block)
|
@@ -2,10 +2,10 @@ module EasyML
|
|
2
2
|
module Data
|
3
3
|
module DateConverter
|
4
4
|
COMMON_DATE_FORMATS = [
|
5
|
+
"%Y-%m-%d %H:%M:%S.%f %Z",
|
5
6
|
"%Y-%m-%dT%H:%M:%S.%6N", # e.g., "2021-01-01T00:00:00.000000"
|
6
7
|
"%Y-%m-%d %H:%M:%S.%L Z", # e.g., "2025-01-03 23:04:49.492 Z"
|
7
8
|
"%Y-%m-%d %H:%M:%S.%L", # e.g., "2021-01-01 00:01:36.000"
|
8
|
-
"%Y-%m-%d %H:%M:%S.%L", # duplicate format intentionally
|
9
9
|
"%Y-%m-%d %H:%M:%S", # e.g., "2021-01-01 00:01:36"
|
10
10
|
"%Y-%m-%d %H:%M", # e.g., "2021-01-01 00:01"
|
11
11
|
"%Y-%m-%d", # e.g., "2021-01-01"
|
@@ -19,179 +19,38 @@ module EasyML
|
|
19
19
|
"%Y/%m/%d", # e.g., "2021/01/01"
|
20
20
|
].freeze
|
21
21
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
},
|
28
|
-
}.freeze
|
29
|
-
|
30
|
-
class << self
|
31
|
-
# Infers a strftime format string from the given date string.
|
32
|
-
#
|
33
|
-
# @param date_str [String] The date string to analyze.
|
34
|
-
# @return [String, nil] The corresponding strftime format if recognized, or nil if not.
|
35
|
-
def infer_strftime_format(date_str)
|
36
|
-
return nil if date_str.blank?
|
37
|
-
|
38
|
-
# YYYY-MM-DD (e.g., "2021-01-01")
|
39
|
-
return "%Y-%m-%d" if date_str =~ /^\d{4}-\d{2}-\d{2}$/
|
40
|
-
|
41
|
-
# YYYY/MM/DD (e.g., "2021/01/01")
|
42
|
-
return "%Y/%m/%d" if date_str =~ /^\d{4}\/\d{2}\/\d{2}$/
|
43
|
-
|
44
|
-
# Date & time with T separator (ISO 8601-like)
|
45
|
-
if date_str.include?("T")
|
46
|
-
# Without fractional seconds, e.g., "2021-01-01T12:34:56"
|
47
|
-
return "%Y-%m-%dT%H:%M:%S" if date_str =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$/
|
48
|
-
|
49
|
-
# With fractional seconds, e.g., "2021-01-01T12:34:56.789" or "2021-01-01T12:34:56.123456"
|
50
|
-
if date_str =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.(\d+)$/
|
51
|
-
fraction = Regexp.last_match(1)
|
52
|
-
case fraction.length
|
53
|
-
when 3 then return "%Y-%m-%dT%H:%M:%S.%L" # milliseconds
|
54
|
-
when 6 then return "%Y-%m-%dT%H:%M:%S.%6N" # microseconds
|
55
|
-
when 9 then return "%Y-%m-%dT%H:%M:%S.%N" # nanoseconds
|
56
|
-
else
|
57
|
-
# Fallback if fractional part has unexpected length:
|
58
|
-
return "%Y-%m-%dT%H:%M:%S.%N"
|
59
|
-
end
|
60
|
-
end
|
61
|
-
end
|
62
|
-
|
63
|
-
# Date & time with space separator
|
64
|
-
if date_str.include?(" ")
|
65
|
-
# Without fractional seconds, e.g., "2021-01-01 12:34:56"
|
66
|
-
return "%Y-%m-%d %H:%M:%S" if date_str =~ /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$/
|
67
|
-
|
68
|
-
# With fractional seconds, e.g., "2021-01-01 12:34:56.789"
|
69
|
-
if date_str =~ /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.(\d+)$/
|
70
|
-
fraction = Regexp.last_match(1)
|
71
|
-
case fraction.length
|
72
|
-
when 3 then return "%Y-%m-%d %H:%M:%S.%L"
|
73
|
-
when 6 then return "%Y-%m-%d %H:%M:%S.%6N"
|
74
|
-
when 9 then return "%Y-%m-%d %H:%M:%S.%N"
|
75
|
-
else
|
76
|
-
return "%Y-%m-%d %H:%M:%S.%N"
|
77
|
-
end
|
78
|
-
end
|
79
|
-
end
|
80
|
-
|
81
|
-
# Common US-style formats
|
82
|
-
|
83
|
-
# MM/DD/YYYY (e.g., "01/31/2021")
|
84
|
-
return "%m/%d/%Y" if date_str =~ /^\d{2}\/\d{2}\/\d{4}$/
|
85
|
-
|
86
|
-
# DD-MM-YYYY (e.g., "31-01-2021")
|
87
|
-
return "%d-%m-%Y" if date_str =~ /^\d{2}-\d{2}-\d{4}$/
|
88
|
-
|
89
|
-
# DD-Mon-YYYY (e.g., "31-Jan-2021")
|
90
|
-
return "%d-%b-%Y" if date_str =~ /^\d{2}-[A-Za-z]{3}-\d{4}$/
|
91
|
-
|
92
|
-
# Mon DD, YYYY (e.g., "Jan 31, 2021")
|
93
|
-
return "%b %d, %Y" if date_str =~ /^[A-Za-z]{3} \d{2}, \d{4}$/
|
94
|
-
|
95
|
-
# Could add additional heuristics as needed...
|
96
|
-
|
97
|
-
nil # Return nil if no known format matches.
|
22
|
+
def self.maybe_convert_date(df, column = nil)
|
23
|
+
column = column.to_s if column.present?
|
24
|
+
if df.is_a?(Polars::Series)
|
25
|
+
column = "temp" if column.nil?
|
26
|
+
df = Polars::DataFrame.new({ column.to_s => df })
|
98
27
|
end
|
28
|
+
return df unless df.columns.include?(column)
|
29
|
+
return df if df[column].dtype.is_a?(Polars::Datetime)
|
99
30
|
|
100
|
-
|
101
|
-
|
102
|
-
# @param column [String] The name of the column to convert.
|
103
|
-
# @return [Polars::DataFrame] The dataframe with the converted column (if successful).
|
104
|
-
def maybe_convert_date(df, column = nil)
|
105
|
-
if column.nil?
|
106
|
-
series = df
|
107
|
-
column = series.name
|
108
|
-
df = Polars::DataFrame.new(series)
|
109
|
-
else
|
110
|
-
series = df[column]
|
111
|
-
end
|
112
|
-
|
113
|
-
return df if series.dtype.is_a?(Polars::Datetime)
|
114
|
-
return df unless series.dtype == Polars::Utf8
|
115
|
-
|
116
|
-
sample = series.filter(series.is_not_null).head(100).to_a
|
117
|
-
ruby_format = detect_date_format(sample)
|
31
|
+
conversions = df.select(queries(column)).to_hashes&.first || []
|
32
|
+
return df unless conversions.any?
|
118
33
|
|
119
|
-
|
120
|
-
|
121
|
-
df = try_format(df, column, format)
|
34
|
+
conversions = conversions.select { |k, v| v }
|
35
|
+
return df unless conversions.any?
|
122
36
|
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
infer_strftime_format(date_str)
|
127
|
-
end.max_by { |_format, count| count }[0]
|
128
|
-
df = try_format(df, column, best_format)
|
129
|
-
end
|
130
|
-
|
131
|
-
df = df.with_column(df["TRY"].alias(column.to_s)).drop("TRY")
|
132
|
-
end
|
133
|
-
|
134
|
-
df
|
135
|
-
end
|
136
|
-
|
137
|
-
private
|
138
|
-
|
139
|
-
def try_format(df, column, format)
|
140
|
-
df = df.with_column(
|
141
|
-
Polars.col(column.to_s)
|
142
|
-
.str
|
143
|
-
.strptime(Polars::Datetime, format, strict: false)
|
144
|
-
.alias("TRY")
|
145
|
-
)
|
146
|
-
end
|
147
|
-
|
148
|
-
def detect_polars_format(series)
|
149
|
-
return nil unless series.is_a?(Polars::Series)
|
150
|
-
|
151
|
-
sample = series.filter(series.is_not_null).head(100).to_a
|
152
|
-
ruby_format = detect_date_format(sample)
|
153
|
-
convert_format(:ruby_to_polars, ruby_format)
|
37
|
+
conversions.map do |k, _|
|
38
|
+
conversion = conversion(k)
|
39
|
+
df = df.with_columns(conversion)
|
154
40
|
end
|
155
41
|
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
sample = date_strings.compact.sample([100, date_strings.length].min)
|
160
|
-
|
161
|
-
best_format = nil
|
162
|
-
best_success_rate = 0.0
|
163
|
-
sample_count = sample.length
|
164
|
-
|
165
|
-
COMMON_DATE_FORMATS.each do |fmt|
|
166
|
-
success_count = sample.count do |date_str|
|
167
|
-
begin
|
168
|
-
DateTime.strptime(date_str, fmt)
|
169
|
-
true
|
170
|
-
rescue StandardError
|
171
|
-
false
|
172
|
-
end
|
173
|
-
end
|
174
|
-
success_rate = success_count.to_f / sample_count
|
175
|
-
if success_rate > best_success_rate
|
176
|
-
best_success_rate = success_rate
|
177
|
-
best_format = fmt
|
178
|
-
end
|
179
|
-
# If every sample string matches this format, return it immediately.
|
180
|
-
return fmt if success_rate == 1.0
|
181
|
-
end
|
42
|
+
df
|
43
|
+
end
|
182
44
|
|
183
|
-
|
45
|
+
def self.queries(column)
|
46
|
+
COMMON_DATE_FORMATS.map do |format|
|
47
|
+
Polars.col(column).cast(Polars::String).str.strptime(Polars::Datetime, format, strict: false).is_not_null().all().alias("convert_#{column}_to_#{format}")
|
184
48
|
end
|
49
|
+
end
|
185
50
|
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
result = format.dup
|
190
|
-
FORMAT_MAPPINGS[conversion].each do |from, to|
|
191
|
-
result = result.gsub(from, to)
|
192
|
-
end
|
193
|
-
result
|
194
|
-
end
|
51
|
+
def self.conversion(key)
|
52
|
+
key, ruby_type = key.split("convert_").last.split("_to_")
|
53
|
+
Polars.col(key).cast(Polars::String).str.strptime(Polars::Datetime, ruby_type, strict: false).cast(Polars::Datetime).alias(key)
|
195
54
|
end
|
196
55
|
end
|
197
56
|
end
|
@@ -19,8 +19,9 @@ module EasyML
|
|
19
19
|
Polars::Decimal => :float,
|
20
20
|
Polars::Float64 => :float,
|
21
21
|
Polars::Int64 => :integer,
|
22
|
-
Polars::Float32 => :float,
|
23
22
|
Polars::Int32 => :integer,
|
23
|
+
Polars::Int8 => :integer,
|
24
|
+
Polars::Float32 => :float,
|
24
25
|
Polars::Boolean => :boolean,
|
25
26
|
Polars::Datetime => :datetime,
|
26
27
|
Polars::Date => :date,
|
@@ -138,8 +139,9 @@ module EasyML
|
|
138
139
|
return :numeric
|
139
140
|
rescue StandardError
|
140
141
|
# If not numeric, check for datetime or categorical
|
141
|
-
|
142
|
+
is_datetime = EasyML::Data::DateConverter.maybe_convert_date(Polars::DataFrame.new({ temp: series }),
|
142
143
|
:temp)[:temp].dtype.is_a?(Polars::Datetime)
|
144
|
+
if is_datetime
|
143
145
|
:datetime
|
144
146
|
else
|
145
147
|
categorical_or_text?(series)
|
@@ -319,7 +319,10 @@ module EasyML
|
|
319
319
|
def learn_dataset
|
320
320
|
return schema if schema.present?
|
321
321
|
|
322
|
-
|
322
|
+
if parquet_files.present?
|
323
|
+
EasyML::Data::DatasetManager.normalize_schema(parquet_files)
|
324
|
+
existing_schema = existing_parquet_schema
|
325
|
+
end
|
323
326
|
schema = existing_schema || normalize_dataset
|
324
327
|
|
325
328
|
self.schema = schema
|
data/lib/easy_ml/engine.rb
CHANGED
@@ -0,0 +1,9 @@
|
|
1
|
+
class AddViewClassToEasyMLDatasets < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
|
2
|
+
def change
|
3
|
+
add_column :easy_ml_datasets, :view_class, :string
|
4
|
+
add_index :easy_ml_datasets, :view_class
|
5
|
+
|
6
|
+
add_column :easy_ml_dataset_histories, :view_class, :string
|
7
|
+
add_index :easy_ml_dataset_histories, :view_class
|
8
|
+
end
|
9
|
+
end
|
data/lib/easy_ml/version.rb
CHANGED