easy_ml 0.2.0.pre.rc102 → 0.2.0.pre.rc103
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +0 -1
- data/app/models/easy_ml/column.rb +18 -13
- data/app/models/easy_ml/column_list.rb +2 -2
- data/app/models/easy_ml/dataset/learner/lazy/datetime.rb +3 -1
- data/app/models/easy_ml/dataset/learner/lazy/numeric.rb +24 -5
- data/app/models/easy_ml/dataset/learner/lazy/query.rb +19 -7
- data/app/models/easy_ml/dataset/learner/lazy/string.rb +4 -1
- data/app/models/easy_ml/dataset/learner/lazy.rb +4 -4
- data/app/models/easy_ml/models/xgboost.rb +1 -1
- data/lib/easy_ml/data/polars_reader.rb +1 -1
- data/lib/easy_ml/version.rb +1 -1
- data/public/easy_ml/assets/.vite/manifest.json +1 -1
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-CRS5bRgw.js → Application.tsx-gkZ77wo8.js} +2 -2
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-CRS5bRgw.js.map → Application.tsx-gkZ77wo8.js.map} +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ef3f840cce99d7205957fbb39a6b319a45035624dce2e4e10f681383cb088abf
|
4
|
+
data.tar.gz: e25100f792ad48cfa4feab7eb652a2d6c49bfc6e28f3bcb97c8150f9bdd1bfc5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5f58395d392158d149db34ad5019a0e011164ca8d331846553e44e6564a291d88323ad0090c1c5ded60f696940b30949cba4e1a614fa9cd502e94372ef949707
|
7
|
+
data.tar.gz: 9497391351ad054308a985cc6b9e608f8dfef61be7417d66502cb11c26ca4f7825456b31aab010c016ac10feff791a8f7a01893743ecf11a26fabb9de7405b82
|
@@ -522,27 +522,32 @@ module EasyML
|
|
522
522
|
EasyML::Import::Column.from_config(config, dataset, action: action)
|
523
523
|
end
|
524
524
|
|
525
|
-
def cast_statement(
|
526
|
-
expected_dtype =
|
527
|
-
actual_type =
|
525
|
+
def cast_statement(series = nil)
|
526
|
+
expected_dtype = polars_datatype
|
527
|
+
actual_type = series&.dtype || expected_dtype
|
528
|
+
|
529
|
+
return Polars.col(name).cast(expected_dtype).alias(name) if expected_dtype == actual_type
|
528
530
|
|
529
531
|
cast_statement = case expected_dtype.to_s
|
530
|
-
|
532
|
+
when /Polars::List/
|
533
|
+
# we should start tracking polars args so we can know what type of list it is
|
534
|
+
Polars.col(name)
|
535
|
+
when /Polars::Boolean/
|
531
536
|
case actual_type.to_s
|
532
|
-
when
|
533
|
-
Polars.col(
|
534
|
-
when
|
535
|
-
Polars.col(
|
536
|
-
when
|
537
|
-
Polars.col(
|
537
|
+
when /Polars::Boolean/, /Polars::Int/
|
538
|
+
Polars.col(name).cast(expected_dtype)
|
539
|
+
when /Polars::Utf/, /Polars::Categorical/, /Polars::String/
|
540
|
+
Polars.col(name).eq("true").cast(expected_dtype)
|
541
|
+
when /Polars::Null/
|
542
|
+
Polars.col(name)
|
538
543
|
else
|
539
|
-
raise "Unexpected dtype: #{actual_type} for column: #{
|
544
|
+
raise "Unexpected dtype: #{actual_type} for column: #{name}"
|
540
545
|
end
|
541
546
|
else
|
542
|
-
Polars.col(
|
547
|
+
Polars.col(name).cast(expected_dtype, strict: false)
|
543
548
|
end
|
544
549
|
|
545
|
-
cast_statement.alias(
|
550
|
+
cast_statement.alias(name)
|
546
551
|
end
|
547
552
|
|
548
553
|
def cast(value)
|
@@ -101,10 +101,10 @@ module EasyML
|
|
101
101
|
end
|
102
102
|
cast_statements = (df.columns & schema.keys.map(&:to_s)).map do |df_col|
|
103
103
|
db_col = column_index[df_col]
|
104
|
-
|
105
|
-
db_col.cast_statement(df, df_col, expected_dtype)
|
104
|
+
db_col.cast_statement(df[df_col])
|
106
105
|
end
|
107
106
|
df = df.with_columns(cast_statements)
|
107
|
+
df
|
108
108
|
end
|
109
109
|
|
110
110
|
def cast(processed_or_raw)
|
@@ -5,11 +5,30 @@ module EasyML
|
|
5
5
|
class Numeric < Query
|
6
6
|
def train_query
|
7
7
|
super.concat([
|
8
|
-
Polars.col(column.name)
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
8
|
+
Polars.col(column.name)
|
9
|
+
.cast(column.polars_datatype)
|
10
|
+
.mean
|
11
|
+
.alias("#{column.name}__mean"),
|
12
|
+
|
13
|
+
Polars.col(column.name)
|
14
|
+
.cast(column.polars_datatype)
|
15
|
+
.median
|
16
|
+
.alias("#{column.name}__median"),
|
17
|
+
|
18
|
+
Polars.col(column.name)
|
19
|
+
.cast(column.polars_datatype)
|
20
|
+
.min
|
21
|
+
.alias("#{column.name}__min"),
|
22
|
+
|
23
|
+
Polars.col(column.name)
|
24
|
+
.cast(column.polars_datatype)
|
25
|
+
.max
|
26
|
+
.alias("#{column.name}__max"),
|
27
|
+
|
28
|
+
Polars.col(column.name)
|
29
|
+
.cast(column.polars_datatype)
|
30
|
+
.std
|
31
|
+
.alias("#{column.name}__std"),
|
13
32
|
])
|
14
33
|
end
|
15
34
|
end
|
@@ -44,25 +44,37 @@ module EasyML
|
|
44
44
|
end
|
45
45
|
|
46
46
|
def null_count
|
47
|
-
Polars.col(column.name)
|
47
|
+
Polars.col(column.name)
|
48
|
+
.cast(column.polars_datatype)
|
49
|
+
.null_count
|
50
|
+
.alias("#{column.name}__null_count")
|
48
51
|
end
|
49
52
|
|
50
53
|
def num_rows
|
51
|
-
Polars.col(column.name)
|
54
|
+
Polars.col(column.name)
|
55
|
+
.cast(column.polars_datatype)
|
56
|
+
.len
|
57
|
+
.alias("#{column.name}__num_rows")
|
52
58
|
end
|
53
59
|
|
54
60
|
def most_frequent_value
|
55
|
-
Polars.col(column.name)
|
61
|
+
Polars.col(column.name)
|
62
|
+
.cast(column.polars_datatype)
|
63
|
+
.filter(Polars.col(column.name).is_not_null)
|
64
|
+
.mode
|
65
|
+
.first
|
66
|
+
.alias("#{column.name}__most_frequent_value")
|
56
67
|
end
|
57
68
|
|
58
69
|
def last_value
|
59
70
|
return unless dataset.date_column.present?
|
60
71
|
|
61
72
|
Polars.col(column.name)
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
73
|
+
.cast(column.polars_datatype)
|
74
|
+
.sort_by(dataset.date_column.name, reverse: true, nulls_last: true)
|
75
|
+
.filter(Polars.col(column.name).is_not_null)
|
76
|
+
.first
|
77
|
+
.alias("#{column.name}__last_value")
|
66
78
|
end
|
67
79
|
end
|
68
80
|
end
|
@@ -10,7 +10,10 @@ module EasyML
|
|
10
10
|
end
|
11
11
|
|
12
12
|
def unique_count
|
13
|
-
Polars.col(column.name)
|
13
|
+
Polars.col(column.name)
|
14
|
+
.cast(Polars::String)
|
15
|
+
.n_unique
|
16
|
+
.alias("#{column.name}__unique_count")
|
14
17
|
end
|
15
18
|
end
|
16
19
|
end
|
@@ -28,15 +28,15 @@ module EasyML
|
|
28
28
|
)
|
29
29
|
.select(queries).collect
|
30
30
|
rescue => e
|
31
|
-
|
31
|
+
problematic_queries = queries.select { |query|
|
32
32
|
begin
|
33
|
-
dataset.send(type).send(split, all_columns: true, lazy: true).select(
|
33
|
+
dataset.send(type).send(split, all_columns: true, lazy: true).select([query]).collect
|
34
34
|
false
|
35
35
|
rescue => e
|
36
36
|
true
|
37
37
|
end
|
38
38
|
}
|
39
|
-
raise "Query failed for
|
39
|
+
raise "Query failed for queries... likely due to wrong column datatype: #{problematic_queries.join("\n")}"
|
40
40
|
end
|
41
41
|
end
|
42
42
|
|
@@ -64,4 +64,4 @@ module EasyML
|
|
64
64
|
end
|
65
65
|
end
|
66
66
|
end
|
67
|
-
end
|
67
|
+
end
|
@@ -498,7 +498,7 @@ module EasyML
|
|
498
498
|
feature_cols -= [weights_col] if weights_col
|
499
499
|
|
500
500
|
# Get features, labels and weights
|
501
|
-
exploded = explode_embeddings(xs
|
501
|
+
exploded = explode_embeddings(xs)
|
502
502
|
feature_cols = exploded.columns
|
503
503
|
features = lazy ? exploded.collect.to_numo : exploded.to_numo
|
504
504
|
|
data/lib/easy_ml/version.rb
CHANGED