easy_ml 0.2.0.pre.rc102 → 0.2.0.pre.rc104
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +0 -1
- data/app/models/easy_ml/column.rb +30 -13
- data/app/models/easy_ml/column_list.rb +2 -2
- data/app/models/easy_ml/dataset/learner/lazy/datetime.rb +3 -1
- data/app/models/easy_ml/dataset/learner/lazy/numeric.rb +24 -5
- data/app/models/easy_ml/dataset/learner/lazy/query.rb +19 -7
- data/app/models/easy_ml/dataset/learner/lazy/string.rb +4 -1
- data/app/models/easy_ml/dataset/learner/lazy.rb +4 -4
- data/lib/easy_ml/data/polars_reader.rb +1 -1
- data/lib/easy_ml/version.rb +1 -1
- data/public/easy_ml/assets/.vite/manifest.json +1 -1
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-CRS5bRgw.js → Application.tsx-gkZ77wo8.js} +2 -2
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-CRS5bRgw.js.map → Application.tsx-gkZ77wo8.js.map} +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 617edee53d32c1340b1a996a48e6a8a60d8cccff4345e27b2e5cf2ffc926c4ac
|
4
|
+
data.tar.gz: 225c133b9365d62e579e39e862ef76efaf7759d1c40a59b89968441381c8c5ee
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8a31abca5a4086eab323b1dfc112c05f89682bbe1fab0211ed111a0978cdc43f45dfd3ece03c6920001cfb4286d22b41e52421d280f02476e2feb63f0214eefd
|
7
|
+
data.tar.gz: 1f109b18e911eee6f81fe797f80320cf37a06e5ae25e4eb76894ffa98dbc931a66b63264462d36ce58e29ab0a7b95e90d522457e53b4fc15a71ece5473d73389
|
@@ -522,27 +522,44 @@ module EasyML
|
|
522
522
|
EasyML::Import::Column.from_config(config, dataset, action: action)
|
523
523
|
end
|
524
524
|
|
525
|
-
def cast_statement(
|
526
|
-
expected_dtype =
|
527
|
-
actual_type =
|
525
|
+
def cast_statement(series = nil)
|
526
|
+
expected_dtype = polars_datatype
|
527
|
+
actual_type = series&.dtype || expected_dtype
|
528
|
+
|
529
|
+
return Polars.col(name).cast(expected_dtype).alias(name) if expected_dtype == actual_type
|
530
|
+
|
531
|
+
if encoding.present?
|
532
|
+
encoding_cast = case encoding.to_sym
|
533
|
+
when :one_hot
|
534
|
+
Polars.col(series.name).cast(Polars::Boolean).alias(series.name)
|
535
|
+
when :ordinal
|
536
|
+
Polars.col(series.name).cast(Polars::Int64).alias(series.name)
|
537
|
+
when :embedding
|
538
|
+
Polars.col(series.name).alias(series.name)
|
539
|
+
end
|
540
|
+
return encoding_cast
|
541
|
+
end
|
528
542
|
|
529
543
|
cast_statement = case expected_dtype.to_s
|
530
|
-
|
544
|
+
when /Polars::List/
|
545
|
+
# we should start tracking polars args so we can know what type of list it is
|
546
|
+
Polars.col(name)
|
547
|
+
when /Polars::Boolean/
|
531
548
|
case actual_type.to_s
|
532
|
-
when
|
533
|
-
Polars.col(
|
534
|
-
when
|
535
|
-
Polars.col(
|
536
|
-
when
|
537
|
-
Polars.col(
|
549
|
+
when /Polars::Boolean/, /Polars::Int/
|
550
|
+
Polars.col(name).cast(expected_dtype)
|
551
|
+
when /Polars::Utf/, /Polars::Categorical/, /Polars::String/
|
552
|
+
Polars.col(name).eq("true").cast(expected_dtype)
|
553
|
+
when /Polars::Null/
|
554
|
+
Polars.col(name)
|
538
555
|
else
|
539
|
-
raise "Unexpected dtype: #{actual_type} for column: #{
|
556
|
+
raise "Unexpected dtype: #{actual_type} for column: #{name}"
|
540
557
|
end
|
541
558
|
else
|
542
|
-
Polars.col(
|
559
|
+
Polars.col(name).cast(expected_dtype, strict: false)
|
543
560
|
end
|
544
561
|
|
545
|
-
cast_statement.alias(
|
562
|
+
cast_statement.alias(name)
|
546
563
|
end
|
547
564
|
|
548
565
|
def cast(value)
|
@@ -101,10 +101,10 @@ module EasyML
|
|
101
101
|
end
|
102
102
|
cast_statements = (df.columns & schema.keys.map(&:to_s)).map do |df_col|
|
103
103
|
db_col = column_index[df_col]
|
104
|
-
|
105
|
-
db_col.cast_statement(df, df_col, expected_dtype)
|
104
|
+
db_col.cast_statement(df[df_col])
|
106
105
|
end
|
107
106
|
df = df.with_columns(cast_statements)
|
107
|
+
df
|
108
108
|
end
|
109
109
|
|
110
110
|
def cast(processed_or_raw)
|
@@ -5,11 +5,30 @@ module EasyML
|
|
5
5
|
class Numeric < Query
|
6
6
|
def train_query
|
7
7
|
super.concat([
|
8
|
-
Polars.col(column.name)
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
8
|
+
Polars.col(column.name)
|
9
|
+
.cast(column.polars_datatype)
|
10
|
+
.mean
|
11
|
+
.alias("#{column.name}__mean"),
|
12
|
+
|
13
|
+
Polars.col(column.name)
|
14
|
+
.cast(column.polars_datatype)
|
15
|
+
.median
|
16
|
+
.alias("#{column.name}__median"),
|
17
|
+
|
18
|
+
Polars.col(column.name)
|
19
|
+
.cast(column.polars_datatype)
|
20
|
+
.min
|
21
|
+
.alias("#{column.name}__min"),
|
22
|
+
|
23
|
+
Polars.col(column.name)
|
24
|
+
.cast(column.polars_datatype)
|
25
|
+
.max
|
26
|
+
.alias("#{column.name}__max"),
|
27
|
+
|
28
|
+
Polars.col(column.name)
|
29
|
+
.cast(column.polars_datatype)
|
30
|
+
.std
|
31
|
+
.alias("#{column.name}__std"),
|
13
32
|
])
|
14
33
|
end
|
15
34
|
end
|
@@ -44,25 +44,37 @@ module EasyML
|
|
44
44
|
end
|
45
45
|
|
46
46
|
def null_count
|
47
|
-
Polars.col(column.name)
|
47
|
+
Polars.col(column.name)
|
48
|
+
.cast(column.polars_datatype)
|
49
|
+
.null_count
|
50
|
+
.alias("#{column.name}__null_count")
|
48
51
|
end
|
49
52
|
|
50
53
|
def num_rows
|
51
|
-
Polars.col(column.name)
|
54
|
+
Polars.col(column.name)
|
55
|
+
.cast(column.polars_datatype)
|
56
|
+
.len
|
57
|
+
.alias("#{column.name}__num_rows")
|
52
58
|
end
|
53
59
|
|
54
60
|
def most_frequent_value
|
55
|
-
Polars.col(column.name)
|
61
|
+
Polars.col(column.name)
|
62
|
+
.cast(column.polars_datatype)
|
63
|
+
.filter(Polars.col(column.name).is_not_null)
|
64
|
+
.mode
|
65
|
+
.first
|
66
|
+
.alias("#{column.name}__most_frequent_value")
|
56
67
|
end
|
57
68
|
|
58
69
|
def last_value
|
59
70
|
return unless dataset.date_column.present?
|
60
71
|
|
61
72
|
Polars.col(column.name)
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
73
|
+
.cast(column.polars_datatype)
|
74
|
+
.sort_by(dataset.date_column.name, reverse: true, nulls_last: true)
|
75
|
+
.filter(Polars.col(column.name).is_not_null)
|
76
|
+
.first
|
77
|
+
.alias("#{column.name}__last_value")
|
66
78
|
end
|
67
79
|
end
|
68
80
|
end
|
@@ -10,7 +10,10 @@ module EasyML
|
|
10
10
|
end
|
11
11
|
|
12
12
|
def unique_count
|
13
|
-
Polars.col(column.name)
|
13
|
+
Polars.col(column.name)
|
14
|
+
.cast(Polars::String)
|
15
|
+
.n_unique
|
16
|
+
.alias("#{column.name}__unique_count")
|
14
17
|
end
|
15
18
|
end
|
16
19
|
end
|
@@ -28,15 +28,15 @@ module EasyML
|
|
28
28
|
)
|
29
29
|
.select(queries).collect
|
30
30
|
rescue => e
|
31
|
-
|
31
|
+
problematic_queries = queries.select { |query|
|
32
32
|
begin
|
33
|
-
dataset.send(type).send(split, all_columns: true, lazy: true).select(
|
33
|
+
dataset.send(type).send(split, all_columns: true, lazy: true).select([query]).collect
|
34
34
|
false
|
35
35
|
rescue => e
|
36
36
|
true
|
37
37
|
end
|
38
38
|
}
|
39
|
-
raise "Query failed for
|
39
|
+
raise "Query failed for queries... likely due to wrong column datatype: #{problematic_queries.join("\n")}"
|
40
40
|
end
|
41
41
|
end
|
42
42
|
|
@@ -64,4 +64,4 @@ module EasyML
|
|
64
64
|
end
|
65
65
|
end
|
66
66
|
end
|
67
|
-
end
|
67
|
+
end
|
data/lib/easy_ml/version.rb
CHANGED