easy_ml 0.2.0.pre.rc101 → 0.2.0.pre.rc102
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/datasets_controller.rb +1 -0
- data/app/frontend/components/dataset/splitters/types.ts +3 -4
- data/app/frontend/pages/NewDatasetPage.tsx +17 -0
- data/app/frontend/types/datasource.ts +14 -6
- data/app/models/easy_ml/column/imputers/base.rb +3 -1
- data/app/models/easy_ml/column.rb +8 -0
- data/app/models/easy_ml/dataset/learner/lazy.rb +16 -3
- data/app/models/easy_ml/dataset.rb +47 -9
- data/app/models/easy_ml/dataset_history.rb +1 -0
- data/app/models/easy_ml/feature.rb +5 -13
- data/app/models/easy_ml/lineage.rb +2 -1
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +1 -0
- data/app/models/easy_ml/models/xgboost.rb +7 -2
- data/app/models/easy_ml/prediction.rb +1 -1
- data/app/models/easy_ml/splitters/base_splitter.rb +4 -8
- data/app/models/easy_ml/splitters/date_splitter.rb +2 -1
- data/app/models/easy_ml/splitters/predefined_splitter.rb +8 -3
- data/lib/easy_ml/data/dataset_manager/schema/normalizer.rb +201 -0
- data/lib/easy_ml/data/dataset_manager/schema.rb +9 -0
- data/lib/easy_ml/data/dataset_manager.rb +5 -0
- data/lib/easy_ml/data/date_converter.rb +24 -165
- data/lib/easy_ml/data/polars_column.rb +4 -2
- data/lib/easy_ml/data/polars_reader.rb +4 -1
- data/lib/easy_ml/engine.rb +4 -0
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +1 -0
- data/lib/easy_ml/railtie/templates/migration/add_view_class_to_easy_ml_datasets.rb.tt +9 -0
- data/lib/easy_ml/version.rb +1 -1
- data/public/easy_ml/assets/.vite/manifest.json +1 -1
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-BXwsBCuQ.js → Application.tsx-CRS5bRgw.js} +8 -8
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-BXwsBCuQ.js.map → Application.tsx-CRS5bRgw.js.map} +1 -1
- metadata +7 -5
- data/lib/easy_ml/data/dataset_manager/normalizer.rb +0 -0
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4c4736c4959fd2d08faac5dbe0c4525014edb8faa7e5b914875a0a84f58e53f2
+  data.tar.gz: bbab12ed80cf8c3bd608388648cd8362d7f4b46408b135aaf79ef494dca7deed
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8dd7645d2b4da2d03a0c3fc1eaf9bcfdfd05ae31e9871782154ade2149ca4269ee5d78e6cb959d6f10e498cdbd427dfb958dc37c3b1208b0fe8885abac61dcad
+  data.tar.gz: 6690b85ba40db78063ffe8fbf3b9302b82a52c3137f5a59d8ba8bb70838ebd931965968185b6031b90fa0b9d8f3192bcc1d2150e3cce43a5ad1959af738d180e

data/app/frontend/components/dataset/splitters/types.ts
CHANGED

@@ -1,12 +1,11 @@
-import type {
+import type { Constants } from '../../../types/datasource';
 import type { Datasource } from '../types/datasource';

 export type NewDatasetFormProps = {
   datasources: Datasource[];
-  constants:
-    columns: ColumnType[];
-  };
+  constants: Constants;
 }
+
 export type SplitterType =
   | 'date'
   | 'random'

data/app/frontend/pages/NewDatasetPage.tsx
CHANGED

@@ -78,6 +78,7 @@ export default function NewDatasetPage({ constants, datasources }: NewDatasetFormProps) {
     dataset: {
       name: '',
       datasource_id: '',
+      view_class: '',
       splitter_attributes: {
         splitter_type: selectedSplitterType,
         ...getDefaultConfig(selectedSplitterType)

@@ -249,6 +250,22 @@ export default function NewDatasetPage({ constants, datasources }: NewDatasetFormProps) {
         />
       </div>

+      <div>
+        <label
+          htmlFor="view_class"
+          className="block text-sm font-medium text-gray-700"
+        >
+          View Class
+        </label>
+        <SearchableSelect
+          value={formData.dataset.view_class}
+          onChange={(value) => setData('dataset.view_class', value)}
+          options={constants.available_views}
+          className="mt-1"
+          placeholder="Select a view class (optional)..."
+        />
+      </div>
+
       {selectedDatasource && (
         <div className={`rounded-lg p-4 ${
           selectedDatasource.sync_error

data/app/frontend/types/datasource.ts
CHANGED

@@ -10,6 +10,19 @@ export interface Schema {
   [key: string]: ColumnType;
 }

+export interface Constants {
+  column_types: Array<{ value: string; label: string }>;
+  preprocessing_strategies: any;
+  feature_options: any;
+  splitter_constants: any;
+  embedding_constants: any;
+  available_views: Array<{ value: string; label: string }>;
+  DATASOURCE_TYPES: Array<{ value: string; label: string; description: string }>;
+  s3: {
+    S3_REGIONS: Array<{ value: string; label: string }>;
+  };
+}
+
 export interface Datasource {
   id: number;
   name: string;

@@ -23,10 +36,5 @@ export interface Datasource {

 export interface DatasourceFormProps {
   datasource?: Datasource;
-  constants:
-    DATASOURCE_TYPES: Array<{ value: string; label: string; description: string }>;
-    s3: {
-      S3_REGIONS: Array<{ value: string; label: string }>;
-    };
-  };
+  constants: Constants;
 }

data/app/models/easy_ml/column/imputers/base.rb
CHANGED

@@ -46,6 +46,8 @@ module EasyML
      end

      def param_applies?
+        return false unless params.present?
+
        params.keys.any? { |p| imputers_own_params.include?(p.to_sym) && params[p] != false }
      end

@@ -60,7 +62,7 @@ module EasyML
      end

      def imputers_own_params
-        Imputers.params_by_class[self.class] ||
+        Imputers.params_by_class[self.class] || {}
      end

      def imputers_own_encodings

data/app/models/easy_ml/column.rb
CHANGED

@@ -71,6 +71,7 @@ module EasyML
    scope :has_clip, -> { where("preprocessing_steps->'training'->>'params' IS NOT NULL AND preprocessing_steps->'training'->'params' @> jsonb_build_object('clip', jsonb_build_object())") }
    scope :needs_learn, -> {
      datasource_changed
+       .or(is_view)
        .or(feature_applied)
        .or(feature_changed)
        .or(column_changed)

@@ -88,6 +89,13 @@ module EasyML
      )
    }

+   scope :is_view, -> {
+     left_joins(dataset: :datasource)
+       .left_joins(:feature)
+       .where(
+         Dataset.arel_table[:view_class].not_eq(nil)
+       )
+   }
    scope :feature_changed, -> {
      where(feature_id: Feature.has_changes.map(&:id))
    }

data/app/models/easy_ml/dataset/learner/lazy.rb
CHANGED

@@ -22,9 +22,22 @@ module EasyML
      def run_queries(split, type)
        queries = build_queries(split, type)

-
-
-
+        begin
+          dataset.columns.apply_clip(
+            @dataset.send(type).send(split, all_columns: true, lazy: true)
+          )
+            .select(queries).collect
+        rescue => e
+          problematic_query = queries.detect {
+            begin
+              dataset.send(type).send(split, all_columns: true, lazy: true).select(queries).collect
+              false
+            rescue => e
+              true
+            end
+          }
+          raise "Query failed for column #{problematic_query}, likely wrong datatype"
+        end
      end

      def get_column_statistics(query_results)

data/app/models/easy_ml/dataset.rb
CHANGED

@@ -20,6 +20,7 @@
 # updated_at          :datetime  not null
 # last_datasource_sha :string
 # raw_schema          :jsonb
+# view_class          :string
 #
 module EasyML
   class Dataset < ActiveRecord::Base

@@ -64,6 +65,7 @@ module EasyML
      reject_if: :all_blank

    validates :datasource, presence: true
+   validate :view_class_exists, if: -> { view_class.present? }

    add_configuration_attributes :remote_files

@@ -85,6 +87,10 @@ module EasyML
        feature_options: EasyML::Features::Registry.list_flat,
        splitter_constants: EasyML::Splitter.constants,
        embedding_constants: EasyML::Data::Embeddings::Embedder.constants,
+       available_views: Rails.root.join("app/datasets").glob("*.rb").map { |f|
+         name = f.basename(".rb").to_s.camelize
+         { value: name, label: name.titleize }
+       }
      }
    end

@@ -148,7 +154,7 @@ module EasyML
      return @schema if @schema
      return read_attribute(:schema) if @serializing

-     schema = read_attribute(:schema) || datasource.schema || datasource.after_sync.schema
+     schema = read_attribute(:schema) || materialized_view&.schema || datasource.schema || datasource.after_sync.schema
      schema = set_schema(schema)
      @schema = EasyML::Data::PolarsSchema.deserialize(schema)
    end

@@ -157,7 +163,7 @@ module EasyML
      return @raw_schema if @raw_schema
      return read_attribute(:raw_schema) if @serializing

-     raw_schema = read_attribute(:raw_schema) || datasource.schema || datasource.after_sync.schema
+     raw_schema = read_attribute(:raw_schema) || materialized_view&.schema || datasource.schema || datasource.after_sync.schema
      raw_schema = set_raw_schema(raw_schema)
      @raw_schema = EasyML::Data::PolarsSchema.deserialize(raw_schema)
    end

@@ -178,7 +184,12 @@ module EasyML
      if datasource&.num_rows.nil?
        datasource.after_sync
      end
-
+
+     if materialized_view.present?
+       materialized_view.shape[0]
+     else
+       datasource&.num_rows
+     end
    end

    def abort!
@@ -234,6 +245,29 @@ module EasyML
      features.update_all(workflow_status: "ready")
    end

+   def view_class_exists
+     begin
+       view_class.constantize
+     rescue NameError
+       errors.add(:view_class, "must be a valid class name")
+     end
+   end
+
+   def materialize_view(df)
+     df
+   end
+
+   def materialized_view
+     return @materialized_view if @materialized_view
+
+     original_df = datasource.data
+     if view_class.present?
+       @materialized_view = view_class.constantize.new.materialize_view(original_df)
+     else
+       @materialized_view = materialize_view(original_df)
+     end
+   end
+
    def prepare!
      prepare_features
      cleanup
@@ -423,6 +457,7 @@ module EasyML
    end

    def needs_learn?
+     return true if view_class.present?
      return true if columns_need_refresh?

      never_learned = columns.none?

@@ -471,6 +506,7 @@ module EasyML
    def normalize(df = nil, split_ys: false, inference: false, all_columns: false, features: self.features)
      df = apply_missing_columns(df, inference: inference)
      df = transform_columns(df, inference: inference, encode: false)
+     df = apply_cast(df)
      df = apply_features(df, features, inference: inference)
      df = apply_cast(df) if inference
      df = transform_columns(df, inference: inference)

@@ -798,7 +834,8 @@ module EasyML
      df = df.clone
      df = apply_features(df)
      processed.save(:train, df)
-
+     learn(delete: false)
+     learn_statistics(type: :processed, computed: true)
      processed.cleanup
    end

@@ -836,11 +873,12 @@ module EasyML
      return unless force || needs_refresh?

      cleanup
-
-
-
-
-
+
+     train_df, valid_df, test_df = splitter.split(self)
+     raw.save(:train, train_df)
+     raw.save(:valid, valid_df)
+     raw.save(:test, test_df)
+
      raw_schema # Set if not already set
    end

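Taken together, the dataset.rb hunks define the contract for the new view-class feature: available_views lists the classes found in app/datasets/*.rb, and materialized_view instantiates the configured class and hands it the datasource's dataframe. A minimal sketch of such a class follows; the file name and "status" column are hypothetical, since the code above only requires that the class respond to materialize_view(df):

# app/datasets/orders_view.rb (hypothetical example)
class OrdersView
  # Receives the datasource's Polars dataframe; returns the frame
  # the dataset will split, learn statistics from, and train on.
  def materialize_view(df)
    df.filter(Polars.col("status").eq("complete")) # "status" is illustrative
  end
end

With view_class set to "OrdersView", materialized_view memoizes the transformed frame, num_rows reports its shape[0], and needs_learn? short-circuits to true because the view can change independently of the datasource.
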
data/app/models/easy_ml/feature.rb
CHANGED

@@ -277,24 +277,16 @@ module EasyML
        feature.fit_batch(batch_args.merge!(batch_id: batch_id))
      rescue => e
        EasyML::Feature.transaction do
-
-
-
-
-
+         if dataset.reload.workflow_status != :failed
+           feature.update(workflow_status: :failed)
+           dataset.update(workflow_status: :failed)
+           EasyML::Event.handle_error(dataset, e)
+         end
        end
        raise e
      end
    end

-   def self.build_error_with_context(dataset, error, batch_id, feature)
-     error = EasyML::Event.handle_error(dataset, error)
-     batch = feature.build_batch(batch_id: batch_id)
-
-     # Convert any dataframes in the context to serialized form
-     error.create_context(context: batch)
-   end
-
    def self.fit_feature_failed(dataset, e)
      dataset.update(workflow_status: :failed)
      EasyML::Event.handle_error(dataset, e)

data/app/models/easy_ml/lineage.rb
CHANGED

@@ -31,12 +31,13 @@ module EasyML
      }
      existing_lineage = existing_lineage.map do |key, lineage|
        matching_lineage = @lineage.detect { |ll| ll[:key].to_sym == lineage.key.to_sym }
+       next unless matching_lineage.present?

        lineage&.assign_attributes(
          occurred_at: matching_lineage[:occurred_at],
          description: matching_lineage[:description],
        )
-     end
+     end.compact
      missing_lineage.concat(existing_lineage)
    end
  end

data/app/models/easy_ml/models/xgboost.rb
CHANGED

@@ -320,7 +320,10 @@ module EasyML
      raise "Cannot predict on nil — XGBoost" if xs.nil?

      begin
+       @predicting = true
        y_pred = yield(preprocess(xs))
+       @predicting = false
+       y_pred
      rescue StandardError => e
        raise e unless e.message.match?(/Number of columns does not match/)

@@ -499,8 +502,10 @@ module EasyML
      feature_cols = exploded.columns
      features = lazy ? exploded.collect.to_numo : exploded.to_numo

-
-
+     unless @predicting
+       weights = weights_col ? (lazy ? xs.select(weights_col).collect.to_numo : xs.select(weights_col).to_numo) : nil
+       weights = weights.flatten if weights
+     end
      if ys.present?
        ys = ys.is_a?(Array) ? Polars::Series.new(ys) : ys
        labels = lazy ? ys.collect.to_numo.flatten : ys.to_numo.flatten

data/app/models/easy_ml/splitters/base_splitter.rb
CHANGED

@@ -6,18 +6,14 @@ module EasyML

    attr_reader :splitter

-   def split(datasource, &block)
-     datasource.in_batches do |df|
-       split_df(df).tap do |splits|
-         yield splits if block_given?
-       end
-     end
-   end
-
    def split_df(df)
      df
    end

+   def split(dataset)
+     split_df(dataset.materialized_view)
+   end
+
    def initialize(splitter)
      @splitter = splitter
    end
data/app/models/easy_ml/splitters/date_splitter.rb
CHANGED

@@ -41,9 +41,10 @@ module EasyML

      validation_date_start, test_date_start = splits

+     dtype = df[date_col].dtype
      test_df = Polars.concat(
        [
-         df.filter(Polars.col(date_col)
+         df.filter(Polars.col(date_col).ge(Polars.lit(test_date_start).cast(dtype))),
          df.filter(Polars.col(date_col).is_null),
        ]
      )
data/app/models/easy_ml/splitters/predefined_splitter.rb
CHANGED

@@ -15,13 +15,18 @@ module EasyML
      }
    end

-   def split(
+   def split(dataset, &block)
      validate!

-     files = datasource.all_files
+     files = dataset.datasource.all_files
      train, valid, test = match_files(files)

-
+     values = [reader.query(train), reader.query(valid), reader.query(test)]
+     if block_given?
+       yield values
+     else
+       values
+     end
    end

    def match_files(files)

data/lib/easy_ml/data/dataset_manager/schema/normalizer.rb
ADDED

@@ -0,0 +1,201 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Schema
+        class Normalizer
+
+          attr_accessor :files
+
+          def initialize(files)
+            @files = files
+          end
+
+          def normalize
+            shared_schema = find_common_schema(files)
+            if schema_changed?(files, shared_schema)
+              queries = schema_to_queries(shared_schema)
+              rewrite_dataset(files, queries)
+            end
+
+            queries = improve_schema(files, shared_schema)
+            if queries.any?
+              rewrite_dataset(files, queries)
+            end
+            files
+          end
+
+          private
+
+          def schema_changed?(files, schema)
+            Polars.scan_parquet(files.first).schema != schema
+          end
+
+          def rewrite_dataset(files, queries)
+            files.each do |file|
+              Polars.scan_parquet(file).select(queries).collect.write_parquet("#{file}_normalized.parquet")
+              puts "Rewriting #{file}..."
+              File.delete(file)
+              FileUtils.mv("#{file}_normalized.parquet", file)
+            end
+          end
+
+          def improve_schema(files, schema)
+            checks = schema_checks(schema)
+            return [] unless checks.any?
+
+            improvements = Polars.scan_parquet(files).select(checks).collect
+            conversions = improvements.to_hashes&.first || []
+            return [] unless conversions.any?
+            conversions = conversions&.select { |k,v| v }
+            return [] unless conversions.any?
+
+            conversions = conversions.reduce({}) do |hash, (k, _)|
+              hash.tap do
+                key, ruby_type = k.split("convert_").last.split("_to_")
+                conversion = case ruby_type
+                  when "int"
+                    Polars.col(key).cast(Polars::Int64).alias(key)
+                  else
+                    EasyML::Data::DateConverter.conversion(k)
+                  end
+                hash[key] = conversion
+              end
+            end
+            schema.map do |k, v|
+              conversions[k] || Polars.col(k).cast(v).alias(k)
+            end
+          end
+
+          def schema_to_queries(schema)
+            schema.map do |k, v|
+              Polars.col(k).cast(v).alias(k)
+            end
+          end
+
+          def schema_checks(schema)
+            schema.flat_map do |key, value|
+              case value
+              when Polars::FloatType, Polars::Decimal
+                Polars.col(key).cast(Polars::Int64).cast(value).eq(Polars.col(key)).all().alias("convert_#{key}_to_int")
+              when Polars::String
+                EasyML::Data::DateConverter.queries(key)
+              end
+            end.compact
+          end
+
+          # Function to find a common schema across multiple parquet files
+          def find_common_schema(parquet_files)
+            # Get schema from each file
+            schemas = []
+
+            parquet_files.each do |file|
+              begin
+                # Read just the schema without loading data
+                schema = Polars.scan_parquet(file).schema
+                schemas << schema
+              rescue => e
+                puts "Warning: Error reading schema from #{file}: #{e.message}"
+              end
+            end
+
+            # Find common schema - start with first file's schema
+            return {} if schemas.empty?
+
+            key_count = Hash.new(0)
+            common_schema = schemas.first
+
+            # Reconcile types across all schemas
+            schemas.each do |schema|
+              schema.each do |name, dtype|
+                key_count[name] += 1
+                if common_schema.key?(name)
+                  # If types don't match, choose the more general type
+                  if common_schema[name] != dtype
+                    common_schema[name] = choose_compatible_type(common_schema[name], dtype)
+                  end
+                end
+              end
+            end
+
+            # Filter out columns that aren't present in all files
+            common_schema = common_schema.select { |name, _| key_count[name] == schemas.length }
+
+            return common_schema
+          end
+
+          # Choose a type that's compatible with both input types
+          def choose_compatible_type(type1, type2)
+            # Integer types - use the larger of the two
+            int_types = [Polars::Int8, Polars::Int16, Polars::Int32, Polars::Int64]
+
+            # If both are integers, choose the larger one
+            if int_types.include?(type1.class) && int_types.include?(type2.class)
+              return [type1, type2].max_by { |t| int_types.index(t.class) }
+            end
+
+            # If one is Int64 and one is Decimal with scale 0, use Decimal
+            if (type1.is_a?(Polars::Int64) && type2.is_a?(Polars::Decimal) && type2.scale == 0) ||
+               (type2.is_a?(Polars::Int64) && type1.is_a?(Polars::Decimal) && type1.scale == 0)
+              return type1.is_a?(Polars::Decimal) ? type1 : type2
+            end
+
+            # If types are drastically different, convert to string as a safe fallback
+            if [Polars::String, Polars::Categorical].include?(type1.class) ||
+               [Polars::String, Polars::Categorical].include?(type2.class)
+              return Polars::String.new
+            end
+
+            # For float vs decimal, choose decimal if it has scale > 0
+            if (type1.is_a?(Polars::Float64) && type2.is_a?(Polars::Decimal) && type2.scale > 0) ||
+               (type2.is_a?(Polars::Float64) && type1.is_a?(Polars::Decimal) && type1.scale > 0)
+              return type1.is_a?(Polars::Decimal) ? type1 : type2
+            end
+
+            # Default to Float64 for numeric type conflicts
+            if [Polars::Float32, Polars::Float64, Polars::Decimal, Polars::Int64].any? { |t| type1.is_a?(t) } &&
+               [Polars::Float32, Polars::Float64, Polars::Decimal, Polars::Int64].any? { |t| type2.is_a?(t) }
+              return Polars::Float64.new
+            end
+
+            # Fallback - use first type
+            return type1
+          end
+
+          # Apply a common schema to read all parquet files
+          def read_with_common_schema(parquet_files)
+            schema = find_common_schema(parquet_files)
+            return Polars.scan_parquet(parquet_files).with_schema(schema).collect
+          end
+
+          # Alternative approach using a union scan
+          def union_scan_parquet(parquet_files)
+            if parquet_files.empty?
+              return Polars.DataFrame.new
+            end
+
+            # Create separate scans with explicit schemas
+            scans = []
+            schema = find_common_schema(parquet_files)
+
+            parquet_files.each do |file|
+              scans << Polars.scan_parquet(file).with_schema(schema)
+            end
+
+            # Union all scans
+            if scans.length == 1
+              return scans.first.collect
+            else
+              # Combine using concat (union all)
+              union = scans.first
+              scans[1..-1].each do |scan|
+                union = union.concat(scan)
+              end
+
+              return union.collect
+            end
+          end
+        end
+      end
+    end
+  end
+end