easy_ml 0.2.0.pre.rc101 → 0.2.0.pre.rc103
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/datasets_controller.rb +1 -0
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +0 -1
- data/app/frontend/components/dataset/splitters/types.ts +3 -4
- data/app/frontend/pages/NewDatasetPage.tsx +17 -0
- data/app/frontend/types/datasource.ts +14 -6
- data/app/models/easy_ml/column/imputers/base.rb +3 -1
- data/app/models/easy_ml/column.rb +26 -13
- data/app/models/easy_ml/column_list.rb +2 -2
- data/app/models/easy_ml/dataset/learner/lazy/datetime.rb +3 -1
- data/app/models/easy_ml/dataset/learner/lazy/numeric.rb +24 -5
- data/app/models/easy_ml/dataset/learner/lazy/query.rb +19 -7
- data/app/models/easy_ml/dataset/learner/lazy/string.rb +4 -1
- data/app/models/easy_ml/dataset/learner/lazy.rb +17 -4
- data/app/models/easy_ml/dataset.rb +47 -9
- data/app/models/easy_ml/dataset_history.rb +1 -0
- data/app/models/easy_ml/feature.rb +5 -13
- data/app/models/easy_ml/lineage.rb +2 -1
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +1 -0
- data/app/models/easy_ml/models/xgboost.rb +8 -3
- data/app/models/easy_ml/prediction.rb +1 -1
- data/app/models/easy_ml/splitters/base_splitter.rb +4 -8
- data/app/models/easy_ml/splitters/date_splitter.rb +2 -1
- data/app/models/easy_ml/splitters/predefined_splitter.rb +8 -3
- data/lib/easy_ml/data/dataset_manager/schema/normalizer.rb +201 -0
- data/lib/easy_ml/data/dataset_manager/schema.rb +9 -0
- data/lib/easy_ml/data/dataset_manager.rb +5 -0
- data/lib/easy_ml/data/date_converter.rb +24 -165
- data/lib/easy_ml/data/polars_column.rb +4 -2
- data/lib/easy_ml/data/polars_reader.rb +5 -2
- data/lib/easy_ml/engine.rb +4 -0
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +1 -0
- data/lib/easy_ml/railtie/templates/migration/add_view_class_to_easy_ml_datasets.rb.tt +9 -0
- data/lib/easy_ml/version.rb +1 -1
- data/public/easy_ml/assets/.vite/manifest.json +1 -1
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-BXwsBCuQ.js → Application.tsx-gkZ77wo8.js} +8 -8
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-BXwsBCuQ.js.map → Application.tsx-gkZ77wo8.js.map} +1 -1
- metadata +7 -5
- data/lib/easy_ml/data/dataset_manager/normalizer.rb +0 -0
@@ -0,0 +1,201 @@
|
|
1
|
+
module EasyML
|
2
|
+
module Data
|
3
|
+
class DatasetManager
|
4
|
+
class Schema
|
5
|
+
class Normalizer
|
6
|
+
|
7
|
+
attr_accessor :files
|
8
|
+
|
9
|
+
# Creates a normalizer for a set of parquet files.
#
# @param files [Array<String>] paths of the parquet files to normalize
def initialize(files)
  # Goes through the writer generated by `attr_accessor :files`.
  self.files = files
end
|
12
|
+
|
13
|
+
# Brings the parquet files in +files+ onto one shared schema and then applies
# any tighter column types the data supports.
#
# Two passes, each rewriting the files in place only when it has work to do:
# 1. Cast every column to the schema from find_common_schema, when
#    schema_changed? reports a difference.
# 2. Apply the column expressions returned by improve_schema, if any.
#
# Nothing is rewritten when there are no files, or when no shared schema could
# be determined: an empty schema would turn into an empty column selection and
# the rewrite would discard every column.
#
# @return [Array<String>, nil] the +files+ collection this normalizer was built with
def normalize
  return files if files.nil? || files.empty?

  shared_schema = find_common_schema(files)
  return files if shared_schema.empty?

  if schema_changed?(files, shared_schema)
    rewrite_dataset(files, schema_to_queries(shared_schema))
  end

  queries = improve_schema(files, shared_schema)
  rewrite_dataset(files, queries) if queries.any?

  files
end
|
26
|
+
|
27
|
+
private
|
28
|
+
|
29
|
+
# Reports whether any file's on-disk schema differs from +schema+.
#
# Every file is inspected rather than only the first one: a shared schema can
# be identical to the first file's schema while another file still differs
# from it (different column type, or extra columns).
#
# @param files [Array<String>] parquet file paths
# @param schema [Hash] column name => Polars dtype to compare against
# @return [Boolean] true when at least one file needs to be rewritten
def schema_changed?(files, schema)
  files.any? { |file| Polars.scan_parquet(file).schema != schema }
end
|
32
|
+
|
33
|
+
# Rewrites each parquet file in place with +queries+ applied as its column
# selection.
#
# The result is written to "<file>_normalized.parquet" first and then moved
# over the original, so the original stays on disk until its replacement has
# been written completely. The temporary file is cleaned up if a step fails.
#
# @param files [Array<String>] paths of the parquet files to rewrite
# @param queries [Array] Polars expressions handed to +select+
# @return [Array<String>] +files+
def rewrite_dataset(files, queries)
  files.each do |file|
    normalized = "#{file}_normalized.parquet"
    begin
      puts "Rewriting #{file}..."
      Polars.scan_parquet(file).select(queries).collect.write_parquet(normalized)
      # mv replaces the destination itself, so the original is never deleted
      # ahead of time and cannot be lost if the move fails.
      FileUtils.mv(normalized, file)
    ensure
      # No-op after a successful move; removes a half-written file otherwise.
      FileUtils.rm_f(normalized)
    end
  end
end
|
41
|
+
|
42
|
+
# Builds the column expressions that move columns onto a tighter type when
# the data in +files+ allows it.
#
# The boolean probes from schema_checks are evaluated over all files in one
# scan. Each probe column is named "convert_<column>_to_<target>", where
# <target> is either "int" or a strptime format string; a truthy result
# selects that conversion for the column.
#
# @param files [Array<String>] parquet file paths scanned together
# @param schema [Hash] column name => Polars dtype
# @return [Array] one expression per schema column (the conversion where a
#   probe passed, a plain cast to the schema dtype otherwise), or [] when no
#   probe passed and nothing needs rewriting
def improve_schema(files, schema)
  checks = schema_checks(schema)
  return [] unless checks.any?

  results = Polars.scan_parquet(files).select(checks).collect.to_hashes&.first || {}
  passed = results.select { |_probe, ok| ok }.keys
  return [] if passed.empty?

  conversions = passed.each_with_object({}) do |probe, by_column|
    # Split on the LAST "_to_" and strip only the leading "convert_", so
    # column names that themselves contain "_to_" or "convert_" (for example
    # "time_to_close") are recovered intact.
    column, _separator, target = probe.delete_prefix("convert_").rpartition("_to_")

    by_column[column] =
      if target == "int"
        Polars.col(column).cast(Polars::Int64).alias(column)
      else
        # Any other target is a strptime format. This is the same expression
        # EasyML::Data::DateConverter.conversion builds, spelled out here so
        # it uses the column name parsed above.
        Polars.col(column)
              .cast(Polars::String)
              .str
              .strptime(Polars::Datetime, target, strict: false)
              .cast(Polars::Datetime)
              .alias(column)
      end
  end

  schema.map do |column, dtype|
    conversions[column] || Polars.col(column).cast(dtype).alias(column)
  end
end
|
68
|
+
|
69
|
+
# Turns a schema into one expression per column that casts the column to its
# schema dtype and keeps the column's name.
#
# @param schema [Hash] column name => Polars dtype
# @return [Array] cast expressions, in schema order
def schema_to_queries(schema)
  schema.map { |column, dtype| Polars.col(column).cast(dtype).alias(column) }
end
|
74
|
+
|
75
|
+
# Builds the boolean probe expressions used to decide whether a column can be
# stored as a tighter type.
#
# - Float and decimal columns get one probe, aliased
#   "convert_<column>_to_int", that casts to Int64 and back and checks that
#   every value compares equal to the original.
# - String columns get the probes from EasyML::Data::DateConverter.queries.
# - Columns of any other dtype get no probe.
#
# @param schema [Hash] column name => Polars dtype
# @return [Array] probe expressions (possibly empty)
def schema_checks(schema)
  probes = schema.map do |column, dtype|
    case dtype
    when Polars::FloatType, Polars::Decimal
      round_trip = Polars.col(column).cast(Polars::Int64).cast(dtype)
      round_trip.eq(Polars.col(column)).all.alias("convert_#{column}_to_int")
    when Polars::String
      EasyML::Data::DateConverter.queries(column)
    end
  end

  # Date probes arrive as arrays and unprobed columns as nil.
  probes.flatten(1).compact
end
|
85
|
+
|
86
|
+
# Derives one schema shared by all of the given parquet files.
#
# Only schemas are read (via a lazy scan). A file whose schema cannot be read
# is reported with a warning on stdout and left out of the comparison.
# Starting from the first readable file's schema, a column whose dtype differs
# between files is reconciled through choose_compatible_type, and columns that
# are not present in every readable file are dropped.
#
# @param parquet_files [Array<String>] parquet file paths
# @return [Hash] column name => Polars dtype; {} when no schema could be read
def find_common_schema(parquet_files)
  schemas = parquet_files.each_with_object([]) do |path, readable|
    readable << Polars.scan_parquet(path).schema
  rescue => e
    puts "Warning: Error reading schema from #{path}: #{e.message}"
  end
  return {} if schemas.empty?

  merged = schemas.first
  occurrences = Hash.new(0)

  schemas.each do |file_schema|
    file_schema.each do |column, dtype|
      occurrences[column] += 1
      next unless merged.key?(column)

      # Conflicting dtypes: settle on a type that can hold both.
      merged[column] = choose_compatible_type(merged[column], dtype) if merged[column] != dtype
    end
  end

  merged.select { |column, _| occurrences[column] == schemas.length }
end
|
125
|
+
|
126
|
+
# Chooses a dtype able to represent values of both +type1+ and +type2+.
#
# Rules, in order:
# 1. Two signed integers: the wider of the two.
# 2. A signed integer and a Decimal with scale 0: the Decimal.
# 3. A Float64 and a Decimal with scale > 0: the Decimal.
# 4. Either side String or Categorical: String.
# 5. Any other pair of signed integer / float / decimal types: Float64.
# 6. Anything else: +type1+.
#
# All signed integer widths take part in rules 2 and 5, not only Int64, so a
# narrow integer paired with a float or decimal is never chosen as the result
# just because it happened to be +type1+. A Decimal without a scale is treated
# as scale 0.
#
# @param type1 [Object] Polars dtype instance from the schema built so far
# @param type2 [Object] Polars dtype instance from the file being merged
# @return [Object] the dtype both columns should be cast to
def choose_compatible_type(type1, type2)
  int_types = [Polars::Int8, Polars::Int16, Polars::Int32, Polars::Int64]
  float_types = [Polars::Float32, Polars::Float64]
  text_types = [Polars::String, Polars::Categorical]
  pair = [type1, type2]

  integer = ->(type) { int_types.include?(type.class) }
  decimal = ->(type) { type.is_a?(Polars::Decimal) }
  numeric = lambda do |type|
    integer.call(type) || decimal.call(type) || float_types.any? { |klass| type.is_a?(klass) }
  end

  # int_types is ordered narrowest to widest, so the larger index wins.
  return pair.max_by { |type| int_types.index(type.class) } if pair.all?(&integer)

  decimal_type = pair.find(&decimal)
  other = pair.find { |type| !decimal.call(type) }
  if decimal_type && other
    scale = decimal_type.scale.to_i
    return decimal_type if scale.zero? && integer.call(other)
    return decimal_type if scale.positive? && other.is_a?(Polars::Float64)
  end

  return Polars::String.new if pair.any? { |type| text_types.include?(type.class) }
  return Polars::Float64.new if pair.all?(&numeric)

  type1
end
|
163
|
+
|
164
|
+
# Reads all of the given parquet files into one frame using the schema from
# find_common_schema.
#
# NOTE(review): this depends on the object returned by Polars.scan_parquet
# responding to `with_schema` -- confirm that the installed Polars version
# provides it.
#
# @param parquet_files [Array<String>] parquet file paths
# @return [Object] the result of collecting the scan
def read_with_common_schema(parquet_files)
  shared_schema = find_common_schema(parquet_files)
  scan = Polars.scan_parquet(parquet_files)
  scan.with_schema(shared_schema).collect
end
|
169
|
+
|
170
|
+
# Reads the given parquet files by scanning each one separately with the
# shared schema and concatenating the scans (union all) before collecting.
#
# NOTE(review): this depends on each scan responding to `with_schema` --
# confirm that the installed Polars version provides it.
#
# @param parquet_files [Array<String>] parquet file paths
# @return [Object] an empty Polars::DataFrame when no files are given,
#   otherwise the result of collecting the combined scan
def union_scan_parquet(parquet_files)
  # `Polars::DataFrame` is a constant; `Polars.DataFrame` would be dispatched
  # as a method call on the module.
  return Polars::DataFrame.new if parquet_files.empty?

  schema = find_common_schema(parquet_files)
  scans = parquet_files.map { |file| Polars.scan_parquet(file).with_schema(schema) }
  return scans.first.collect if scans.length == 1

  # Concatenate all scans in one call through the module-level concat.
  Polars.concat(scans).collect
end
|
197
|
+
end
|
198
|
+
end
|
199
|
+
end
|
200
|
+
end
|
201
|
+
end
|
@@ -3,6 +3,7 @@ module EasyML
|
|
3
3
|
class DatasetManager
|
4
4
|
require_relative "dataset_manager/writer"
|
5
5
|
require_relative "dataset_manager/reader"
|
6
|
+
require_relative "dataset_manager/schema"
|
6
7
|
|
7
8
|
attr_accessor :root_dir, :partition, :append_only, :filenames, :primary_key,
|
8
9
|
:partition_size, :s3_bucket, :s3_prefix, :s3_access_key_id,
|
@@ -55,6 +56,10 @@ module EasyML
|
|
55
56
|
def cp(from, to)
|
56
57
|
Writer.cp(from, to)
|
57
58
|
end
|
59
|
+
|
60
|
+
# Normalizes the schema of the given parquet files by delegating to
# Schema::Normalizer.
#
# @param files [Array<String>] parquet file paths
# @return [Object] whatever Schema::Normalizer#normalize returns
def normalize_schema(files)
  normalizer = Schema::Normalizer.new(files)
  normalizer.normalize
end
|
58
63
|
end
|
59
64
|
|
60
65
|
def list_nulls(input = nil, **kwargs, &block)
|
@@ -2,10 +2,10 @@ module EasyML
|
|
2
2
|
module Data
|
3
3
|
module DateConverter
|
4
4
|
COMMON_DATE_FORMATS = [
|
5
|
+
"%Y-%m-%d %H:%M:%S.%f %Z",
|
5
6
|
"%Y-%m-%dT%H:%M:%S.%6N", # e.g., "2021-01-01T00:00:00.000000"
|
6
7
|
"%Y-%m-%d %H:%M:%S.%L Z", # e.g., "2025-01-03 23:04:49.492 Z"
|
7
8
|
"%Y-%m-%d %H:%M:%S.%L", # e.g., "2021-01-01 00:01:36.000"
|
8
|
-
"%Y-%m-%d %H:%M:%S.%L", # duplicate format intentionally
|
9
9
|
"%Y-%m-%d %H:%M:%S", # e.g., "2021-01-01 00:01:36"
|
10
10
|
"%Y-%m-%d %H:%M", # e.g., "2021-01-01 00:01"
|
11
11
|
"%Y-%m-%d", # e.g., "2021-01-01"
|
@@ -19,179 +19,38 @@ module EasyML
|
|
19
19
|
"%Y/%m/%d", # e.g., "2021/01/01"
|
20
20
|
].freeze
|
21
21
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
},
|
28
|
-
}.freeze
|
29
|
-
|
30
|
-
class << self
|
31
|
-
# Infers a strftime format string from the given date string.
|
32
|
-
#
|
33
|
-
# @param date_str [String] The date string to analyze.
|
34
|
-
# @return [String, nil] The corresponding strftime format if recognized, or nil if not.
|
35
|
-
def infer_strftime_format(date_str)
|
36
|
-
return nil if date_str.blank?
|
37
|
-
|
38
|
-
# YYYY-MM-DD (e.g., "2021-01-01")
|
39
|
-
return "%Y-%m-%d" if date_str =~ /^\d{4}-\d{2}-\d{2}$/
|
40
|
-
|
41
|
-
# YYYY/MM/DD (e.g., "2021/01/01")
|
42
|
-
return "%Y/%m/%d" if date_str =~ /^\d{4}\/\d{2}\/\d{2}$/
|
43
|
-
|
44
|
-
# Date & time with T separator (ISO 8601-like)
|
45
|
-
if date_str.include?("T")
|
46
|
-
# Without fractional seconds, e.g., "2021-01-01T12:34:56"
|
47
|
-
return "%Y-%m-%dT%H:%M:%S" if date_str =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$/
|
48
|
-
|
49
|
-
# With fractional seconds, e.g., "2021-01-01T12:34:56.789" or "2021-01-01T12:34:56.123456"
|
50
|
-
if date_str =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.(\d+)$/
|
51
|
-
fraction = Regexp.last_match(1)
|
52
|
-
case fraction.length
|
53
|
-
when 3 then return "%Y-%m-%dT%H:%M:%S.%L" # milliseconds
|
54
|
-
when 6 then return "%Y-%m-%dT%H:%M:%S.%6N" # microseconds
|
55
|
-
when 9 then return "%Y-%m-%dT%H:%M:%S.%N" # nanoseconds
|
56
|
-
else
|
57
|
-
# Fallback if fractional part has unexpected length:
|
58
|
-
return "%Y-%m-%dT%H:%M:%S.%N"
|
59
|
-
end
|
60
|
-
end
|
61
|
-
end
|
62
|
-
|
63
|
-
# Date & time with space separator
|
64
|
-
if date_str.include?(" ")
|
65
|
-
# Without fractional seconds, e.g., "2021-01-01 12:34:56"
|
66
|
-
return "%Y-%m-%d %H:%M:%S" if date_str =~ /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$/
|
67
|
-
|
68
|
-
# With fractional seconds, e.g., "2021-01-01 12:34:56.789"
|
69
|
-
if date_str =~ /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.(\d+)$/
|
70
|
-
fraction = Regexp.last_match(1)
|
71
|
-
case fraction.length
|
72
|
-
when 3 then return "%Y-%m-%d %H:%M:%S.%L"
|
73
|
-
when 6 then return "%Y-%m-%d %H:%M:%S.%6N"
|
74
|
-
when 9 then return "%Y-%m-%d %H:%M:%S.%N"
|
75
|
-
else
|
76
|
-
return "%Y-%m-%d %H:%M:%S.%N"
|
77
|
-
end
|
78
|
-
end
|
79
|
-
end
|
80
|
-
|
81
|
-
# Common US-style formats
|
82
|
-
|
83
|
-
# MM/DD/YYYY (e.g., "01/31/2021")
|
84
|
-
return "%m/%d/%Y" if date_str =~ /^\d{2}\/\d{2}\/\d{4}$/
|
85
|
-
|
86
|
-
# DD-MM-YYYY (e.g., "31-01-2021")
|
87
|
-
return "%d-%m-%Y" if date_str =~ /^\d{2}-\d{2}-\d{4}$/
|
88
|
-
|
89
|
-
# DD-Mon-YYYY (e.g., "31-Jan-2021")
|
90
|
-
return "%d-%b-%Y" if date_str =~ /^\d{2}-[A-Za-z]{3}-\d{4}$/
|
91
|
-
|
92
|
-
# Mon DD, YYYY (e.g., "Jan 31, 2021")
|
93
|
-
return "%b %d, %Y" if date_str =~ /^[A-Za-z]{3} \d{2}, \d{4}$/
|
94
|
-
|
95
|
-
# Could add additional heuristics as needed...
|
96
|
-
|
97
|
-
nil # Return nil if no known format matches.
|
22
|
+
def self.maybe_convert_date(df, column = nil)
|
23
|
+
column = column.to_s if column.present?
|
24
|
+
if df.is_a?(Polars::Series)
|
25
|
+
column = "temp" if column.nil?
|
26
|
+
df = Polars::DataFrame.new({ column.to_s => df })
|
98
27
|
end
|
28
|
+
return df unless df.columns.include?(column)
|
29
|
+
return df if df[column].dtype.is_a?(Polars::Datetime)
|
99
30
|
|
100
|
-
|
101
|
-
|
102
|
-
# @param column [String] The name of the column to convert.
|
103
|
-
# @return [Polars::DataFrame] The dataframe with the converted column (if successful).
|
104
|
-
def maybe_convert_date(df, column = nil)
|
105
|
-
if column.nil?
|
106
|
-
series = df
|
107
|
-
column = series.name
|
108
|
-
df = Polars::DataFrame.new(series)
|
109
|
-
else
|
110
|
-
series = df[column]
|
111
|
-
end
|
112
|
-
|
113
|
-
return df if series.dtype.is_a?(Polars::Datetime)
|
114
|
-
return df unless series.dtype == Polars::Utf8
|
115
|
-
|
116
|
-
sample = series.filter(series.is_not_null).head(100).to_a
|
117
|
-
ruby_format = detect_date_format(sample)
|
31
|
+
conversions = df.select(queries(column)).to_hashes&.first || []
|
32
|
+
return df unless conversions.any?
|
118
33
|
|
119
|
-
|
120
|
-
|
121
|
-
df = try_format(df, column, format)
|
34
|
+
conversions = conversions.select { |k, v| v }
|
35
|
+
return df unless conversions.any?
|
122
36
|
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
infer_strftime_format(date_str)
|
127
|
-
end.max_by { |_format, count| count }[0]
|
128
|
-
df = try_format(df, column, best_format)
|
129
|
-
end
|
130
|
-
|
131
|
-
df = df.with_column(df["TRY"].alias(column.to_s)).drop("TRY")
|
132
|
-
end
|
133
|
-
|
134
|
-
df
|
135
|
-
end
|
136
|
-
|
137
|
-
private
|
138
|
-
|
139
|
-
def try_format(df, column, format)
|
140
|
-
df = df.with_column(
|
141
|
-
Polars.col(column.to_s)
|
142
|
-
.str
|
143
|
-
.strptime(Polars::Datetime, format, strict: false)
|
144
|
-
.alias("TRY")
|
145
|
-
)
|
146
|
-
end
|
147
|
-
|
148
|
-
def detect_polars_format(series)
|
149
|
-
return nil unless series.is_a?(Polars::Series)
|
150
|
-
|
151
|
-
sample = series.filter(series.is_not_null).head(100).to_a
|
152
|
-
ruby_format = detect_date_format(sample)
|
153
|
-
convert_format(:ruby_to_polars, ruby_format)
|
37
|
+
conversions.map do |k, _|
|
38
|
+
conversion = conversion(k)
|
39
|
+
df = df.with_columns(conversion)
|
154
40
|
end
|
155
41
|
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
sample = date_strings.compact.sample([100, date_strings.length].min)
|
160
|
-
|
161
|
-
best_format = nil
|
162
|
-
best_success_rate = 0.0
|
163
|
-
sample_count = sample.length
|
164
|
-
|
165
|
-
COMMON_DATE_FORMATS.each do |fmt|
|
166
|
-
success_count = sample.count do |date_str|
|
167
|
-
begin
|
168
|
-
DateTime.strptime(date_str, fmt)
|
169
|
-
true
|
170
|
-
rescue StandardError
|
171
|
-
false
|
172
|
-
end
|
173
|
-
end
|
174
|
-
success_rate = success_count.to_f / sample_count
|
175
|
-
if success_rate > best_success_rate
|
176
|
-
best_success_rate = success_rate
|
177
|
-
best_format = fmt
|
178
|
-
end
|
179
|
-
# If every sample string matches this format, return it immediately.
|
180
|
-
return fmt if success_rate == 1.0
|
181
|
-
end
|
42
|
+
df
|
43
|
+
end
|
182
44
|
|
183
|
-
|
45
|
+
# Builds one boolean probe expression per entry in COMMON_DATE_FORMATS for
# the given column.
#
# Each probe casts the column to a string, parses it non-strictly with that
# format, and checks that every parsed value is non-null. The probe is
# aliased "convert_<column>_to_<format>".
#
# @param column [String] name of the column to probe
# @return [Array] one expression per known date format
def self.queries(column)
  COMMON_DATE_FORMATS.map do |format|
    as_text = Polars.col(column).cast(Polars::String)
    parsed = as_text.str.strptime(Polars::Datetime, format, strict: false)
    parsed.is_not_null.all.alias("convert_#{column}_to_#{format}")
  end
end
|
185
50
|
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
result = format.dup
|
190
|
-
FORMAT_MAPPINGS[conversion].each do |from, to|
|
191
|
-
result = result.gsub(from, to)
|
192
|
-
end
|
193
|
-
result
|
194
|
-
end
|
51
|
+
# Builds the expression that converts a column to Datetime, given a probe
# name of the form "convert_<column>_to_<format>".
#
# The name is split on its LAST "_to_" after removing only the leading
# "convert_", so column names that themselves contain "_to_" or "convert_"
# (for example "time_to_close") are recovered intact. A name without any
# "_to_" is used as the column name with no explicit format.
#
# @param key [String] probe name
# @return [Object] expression parsing the column (cast to string) with the
#   format non-strictly, cast to Datetime and aliased to the column name
def self.conversion(key)
  name = key.delete_prefix("convert_")
  column, separator, format = name.rpartition("_to_")
  column, format = name, nil if separator.empty?

  Polars.col(column)
        .cast(Polars::String)
        .str
        .strptime(Polars::Datetime, format, strict: false)
        .cast(Polars::Datetime)
        .alias(column)
end
|
196
55
|
end
|
197
56
|
end
|
@@ -19,8 +19,9 @@ module EasyML
|
|
19
19
|
Polars::Decimal => :float,
|
20
20
|
Polars::Float64 => :float,
|
21
21
|
Polars::Int64 => :integer,
|
22
|
-
Polars::Float32 => :float,
|
23
22
|
Polars::Int32 => :integer,
|
23
|
+
Polars::Int8 => :integer,
|
24
|
+
Polars::Float32 => :float,
|
24
25
|
Polars::Boolean => :boolean,
|
25
26
|
Polars::Datetime => :datetime,
|
26
27
|
Polars::Date => :date,
|
@@ -138,8 +139,9 @@ module EasyML
|
|
138
139
|
return :numeric
|
139
140
|
rescue StandardError
|
140
141
|
# If not numeric, check for datetime or categorical
|
141
|
-
|
142
|
+
is_datetime = EasyML::Data::DateConverter.maybe_convert_date(Polars::DataFrame.new({ temp: series }),
|
142
143
|
:temp)[:temp].dtype.is_a?(Polars::Datetime)
|
144
|
+
if is_datetime
|
143
145
|
:datetime
|
144
146
|
else
|
145
147
|
categorical_or_text?(series)
|
@@ -175,7 +175,7 @@ module EasyML
|
|
175
175
|
end
|
176
176
|
combined_lazy_df = combined_lazy_df.with_columns(
|
177
177
|
cast.map do |col, dtype|
|
178
|
-
Polars.col(col).cast(dtype).alias(col)
|
178
|
+
Polars.col(col).cast(dtype, strict: false).alias(col)
|
179
179
|
end
|
180
180
|
)
|
181
181
|
end
|
@@ -319,7 +319,10 @@ module EasyML
|
|
319
319
|
def learn_dataset
|
320
320
|
return schema if schema.present?
|
321
321
|
|
322
|
-
|
322
|
+
if parquet_files.present?
|
323
|
+
EasyML::Data::DatasetManager.normalize_schema(parquet_files)
|
324
|
+
existing_schema = existing_parquet_schema
|
325
|
+
end
|
323
326
|
schema = existing_schema || normalize_dataset
|
324
327
|
|
325
328
|
self.schema = schema
|
data/lib/easy_ml/engine.rb
CHANGED
@@ -0,0 +1,9 @@
|
|
1
|
+
class AddViewClassToEasyMLDatasets < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
|
2
|
+
def change
|
3
|
+
add_column :easy_ml_datasets, :view_class, :string
|
4
|
+
add_index :easy_ml_datasets, :view_class
|
5
|
+
|
6
|
+
add_column :easy_ml_dataset_histories, :view_class, :string
|
7
|
+
add_index :easy_ml_dataset_histories, :view_class
|
8
|
+
end
|
9
|
+
end
|
data/lib/easy_ml/version.rb
CHANGED