polars-df 0.19.0 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -0
- data/Cargo.lock +211 -320
- data/LICENSE.txt +1 -1
- data/ext/polars/Cargo.toml +13 -9
- data/ext/polars/src/batched_csv.rs +2 -2
- data/ext/polars/src/catalog/mod.rs +1 -0
- data/ext/polars/src/catalog/unity.rs +450 -0
- data/ext/polars/src/conversion/any_value.rs +9 -19
- data/ext/polars/src/conversion/categorical.rs +30 -0
- data/ext/polars/src/conversion/chunked_array.rs +8 -8
- data/ext/polars/src/conversion/mod.rs +187 -109
- data/ext/polars/src/dataframe/construction.rs +2 -2
- data/ext/polars/src/dataframe/export.rs +2 -2
- data/ext/polars/src/dataframe/general.rs +4 -2
- data/ext/polars/src/dataframe/io.rs +2 -2
- data/ext/polars/src/exceptions.rs +1 -1
- data/ext/polars/src/expr/datatype.rs +14 -0
- data/ext/polars/src/expr/general.rs +36 -44
- data/ext/polars/src/expr/list.rs +27 -17
- data/ext/polars/src/expr/meta.rs +18 -41
- data/ext/polars/src/expr/mod.rs +3 -1
- data/ext/polars/src/expr/name.rs +2 -2
- data/ext/polars/src/expr/rolling.rs +1 -1
- data/ext/polars/src/expr/selector.rs +219 -0
- data/ext/polars/src/expr/string.rs +14 -7
- data/ext/polars/src/file.rs +12 -6
- data/ext/polars/src/functions/io.rs +2 -11
- data/ext/polars/src/functions/lazy.rs +22 -54
- data/ext/polars/src/functions/meta.rs +2 -2
- data/ext/polars/src/functions/misc.rs +1 -1
- data/ext/polars/src/functions/range.rs +14 -10
- data/ext/polars/src/functions/string_cache.rs +4 -5
- data/ext/polars/src/interop/numo/numo_rs.rs +1 -1
- data/ext/polars/src/interop/numo/to_numo_series.rs +1 -1
- data/ext/polars/src/io/mod.rs +102 -0
- data/ext/polars/src/lazyframe/general.rs +75 -113
- data/ext/polars/src/lazyframe/serde.rs +1 -1
- data/ext/polars/src/lazyframe/sink.rs +6 -6
- data/ext/polars/src/lib.rs +104 -26
- data/ext/polars/src/map/dataframe.rs +7 -7
- data/ext/polars/src/map/lazy.rs +1 -1
- data/ext/polars/src/map/mod.rs +31 -19
- data/ext/polars/src/map/series.rs +8 -8
- data/ext/polars/src/on_startup.rs +5 -2
- data/ext/polars/src/rb_modules.rs +1 -1
- data/ext/polars/src/series/construction.rs +11 -7
- data/ext/polars/src/series/export.rs +6 -4
- data/ext/polars/src/series/general.rs +12 -207
- data/ext/polars/src/series/import.rs +2 -2
- data/ext/polars/src/series/map.rs +227 -0
- data/ext/polars/src/series/mod.rs +2 -1
- data/ext/polars/src/series/scatter.rs +1 -1
- data/ext/polars/src/utils.rs +10 -2
- data/lib/polars/cat_name_space.rb +3 -43
- data/lib/polars/catalog/unity/catalog_info.rb +20 -0
- data/lib/polars/catalog/unity/column_info.rb +31 -0
- data/lib/polars/catalog/unity/namespace_info.rb +21 -0
- data/lib/polars/catalog/unity/table_info.rb +50 -0
- data/lib/polars/catalog.rb +448 -0
- data/lib/polars/convert.rb +10 -0
- data/lib/polars/data_frame.rb +151 -30
- data/lib/polars/data_types.rb +47 -3
- data/lib/polars/exceptions.rb +7 -2
- data/lib/polars/expr.rb +48 -39
- data/lib/polars/functions/col.rb +6 -5
- data/lib/polars/functions/eager.rb +1 -1
- data/lib/polars/functions/lazy.rb +114 -15
- data/lib/polars/functions/repeat.rb +4 -0
- data/lib/polars/io/csv.rb +18 -0
- data/lib/polars/io/json.rb +16 -0
- data/lib/polars/io/ndjson.rb +13 -0
- data/lib/polars/io/parquet.rb +45 -63
- data/lib/polars/io/scan_options.rb +47 -0
- data/lib/polars/lazy_frame.rb +163 -75
- data/lib/polars/list_expr.rb +213 -17
- data/lib/polars/list_name_space.rb +121 -8
- data/lib/polars/meta_expr.rb +14 -29
- data/lib/polars/scan_cast_options.rb +64 -0
- data/lib/polars/schema.rb +6 -1
- data/lib/polars/selector.rb +138 -0
- data/lib/polars/selectors.rb +931 -202
- data/lib/polars/series.rb +46 -19
- data/lib/polars/string_expr.rb +24 -3
- data/lib/polars/string_name_space.rb +12 -1
- data/lib/polars/utils/parse.rb +40 -0
- data/lib/polars/utils.rb +5 -1
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +8 -0
- metadata +17 -2
@@ -0,0 +1,227 @@
|
|
1
|
+
use magnus::Value;
|
2
|
+
|
3
|
+
use super::RbSeries;
|
4
|
+
use crate::map::check_nested_object;
|
5
|
+
use crate::map::series::{ApplyLambda, call_lambda_and_extract};
|
6
|
+
use crate::prelude::*;
|
7
|
+
use crate::{RbPolarsErr, RbResult};
|
8
|
+
use crate::{apply_method_all_arrow_series2, raise_err};
|
9
|
+
|
10
|
+
impl RbSeries {
|
11
|
+
pub fn map_elements(
|
12
|
+
&self,
|
13
|
+
function: Value,
|
14
|
+
return_dtype: Option<Wrap<DataType>>,
|
15
|
+
skip_nulls: bool,
|
16
|
+
) -> RbResult<Self> {
|
17
|
+
let series = &self.series.borrow();
|
18
|
+
|
19
|
+
if return_dtype.is_none() {
|
20
|
+
polars_warn!(
|
21
|
+
MapWithoutReturnDtypeWarning,
|
22
|
+
"Calling `map_elements` without specifying `return_dtype` can lead to unpredictable results. \
|
23
|
+
Specify `return_dtype` to silence this warning."
|
24
|
+
)
|
25
|
+
}
|
26
|
+
|
27
|
+
if skip_nulls && (series.null_count() == series.len()) {
|
28
|
+
if let Some(return_dtype) = return_dtype {
|
29
|
+
return Ok(
|
30
|
+
Series::full_null(series.name().clone(), series.len(), &return_dtype.0).into(),
|
31
|
+
);
|
32
|
+
}
|
33
|
+
let msg = "The output type of the 'map_elements' function cannot be determined.\n\
|
34
|
+
The function was never called because 'skip_nulls: true' and all values are null.\n\
|
35
|
+
Consider setting 'skip_nulls: false' or setting the 'return_dtype'.";
|
36
|
+
raise_err!(msg, ComputeError)
|
37
|
+
}
|
38
|
+
|
39
|
+
let return_dtype = return_dtype.map(|dt| dt.0);
|
40
|
+
|
41
|
+
macro_rules! dispatch_apply {
|
42
|
+
($self:expr, $method:ident, $($args:expr),*) => {
|
43
|
+
match $self.dtype() {
|
44
|
+
DataType::Object(_) => {
|
45
|
+
// let ca = $self.0.unpack::<ObjectType<ObjectValue>>().unwrap();
|
46
|
+
// ca.$method($($args),*)
|
47
|
+
todo!()
|
48
|
+
}
|
49
|
+
_ => {
|
50
|
+
apply_method_all_arrow_series2!(
|
51
|
+
$self,
|
52
|
+
$method,
|
53
|
+
$($args),*
|
54
|
+
)
|
55
|
+
}
|
56
|
+
}
|
57
|
+
}
|
58
|
+
|
59
|
+
}
|
60
|
+
|
61
|
+
if matches!(
|
62
|
+
series.dtype(),
|
63
|
+
DataType::Datetime(_, _)
|
64
|
+
| DataType::Date
|
65
|
+
| DataType::Duration(_)
|
66
|
+
| DataType::Categorical(_, _)
|
67
|
+
| DataType::Enum(_, _)
|
68
|
+
| DataType::Binary
|
69
|
+
| DataType::Array(_, _)
|
70
|
+
| DataType::Time
|
71
|
+
| DataType::Decimal(_, _)
|
72
|
+
) || !skip_nulls
|
73
|
+
{
|
74
|
+
let mut avs = Vec::with_capacity(series.len());
|
75
|
+
let s = series.rechunk();
|
76
|
+
|
77
|
+
for av in s.iter() {
|
78
|
+
let out = match (skip_nulls, av) {
|
79
|
+
(true, AnyValue::Null) => AnyValue::Null,
|
80
|
+
(_, av) => {
|
81
|
+
let av: Option<Wrap<AnyValue>> =
|
82
|
+
call_lambda_and_extract(function, Wrap(av))?;
|
83
|
+
match av {
|
84
|
+
None => AnyValue::Null,
|
85
|
+
Some(av) => av.0,
|
86
|
+
}
|
87
|
+
}
|
88
|
+
};
|
89
|
+
avs.push(out)
|
90
|
+
}
|
91
|
+
let out = Series::new(series.name().clone(), &avs);
|
92
|
+
let dtype = out.dtype();
|
93
|
+
if dtype.is_nested() {
|
94
|
+
check_nested_object(dtype)?;
|
95
|
+
}
|
96
|
+
|
97
|
+
return Ok(out.into());
|
98
|
+
}
|
99
|
+
|
100
|
+
let out = match return_dtype {
|
101
|
+
Some(DataType::Int8) => {
|
102
|
+
let ca: Int8Chunked = dispatch_apply!(
|
103
|
+
series,
|
104
|
+
apply_lambda_with_primitive_out_type,
|
105
|
+
function,
|
106
|
+
0,
|
107
|
+
None
|
108
|
+
)?;
|
109
|
+
ca.into_series()
|
110
|
+
}
|
111
|
+
Some(DataType::Int16) => {
|
112
|
+
let ca: Int16Chunked = dispatch_apply!(
|
113
|
+
series,
|
114
|
+
apply_lambda_with_primitive_out_type,
|
115
|
+
function,
|
116
|
+
0,
|
117
|
+
None
|
118
|
+
)?;
|
119
|
+
ca.into_series()
|
120
|
+
}
|
121
|
+
Some(DataType::Int32) => {
|
122
|
+
let ca: Int32Chunked = dispatch_apply!(
|
123
|
+
series,
|
124
|
+
apply_lambda_with_primitive_out_type,
|
125
|
+
function,
|
126
|
+
0,
|
127
|
+
None
|
128
|
+
)?;
|
129
|
+
ca.into_series()
|
130
|
+
}
|
131
|
+
Some(DataType::Int64) => {
|
132
|
+
let ca: Int64Chunked = dispatch_apply!(
|
133
|
+
series,
|
134
|
+
apply_lambda_with_primitive_out_type,
|
135
|
+
function,
|
136
|
+
0,
|
137
|
+
None
|
138
|
+
)?;
|
139
|
+
ca.into_series()
|
140
|
+
}
|
141
|
+
Some(DataType::UInt8) => {
|
142
|
+
let ca: UInt8Chunked = dispatch_apply!(
|
143
|
+
series,
|
144
|
+
apply_lambda_with_primitive_out_type,
|
145
|
+
function,
|
146
|
+
0,
|
147
|
+
None
|
148
|
+
)?;
|
149
|
+
ca.into_series()
|
150
|
+
}
|
151
|
+
Some(DataType::UInt16) => {
|
152
|
+
let ca: UInt16Chunked = dispatch_apply!(
|
153
|
+
series,
|
154
|
+
apply_lambda_with_primitive_out_type,
|
155
|
+
function,
|
156
|
+
0,
|
157
|
+
None
|
158
|
+
)?;
|
159
|
+
ca.into_series()
|
160
|
+
}
|
161
|
+
Some(DataType::UInt32) => {
|
162
|
+
let ca: UInt32Chunked = dispatch_apply!(
|
163
|
+
series,
|
164
|
+
apply_lambda_with_primitive_out_type,
|
165
|
+
function,
|
166
|
+
0,
|
167
|
+
None
|
168
|
+
)?;
|
169
|
+
ca.into_series()
|
170
|
+
}
|
171
|
+
Some(DataType::UInt64) => {
|
172
|
+
let ca: UInt64Chunked = dispatch_apply!(
|
173
|
+
series,
|
174
|
+
apply_lambda_with_primitive_out_type,
|
175
|
+
function,
|
176
|
+
0,
|
177
|
+
None
|
178
|
+
)?;
|
179
|
+
ca.into_series()
|
180
|
+
}
|
181
|
+
Some(DataType::Float32) => {
|
182
|
+
let ca: Float32Chunked = dispatch_apply!(
|
183
|
+
series,
|
184
|
+
apply_lambda_with_primitive_out_type,
|
185
|
+
function,
|
186
|
+
0,
|
187
|
+
None
|
188
|
+
)?;
|
189
|
+
ca.into_series()
|
190
|
+
}
|
191
|
+
Some(DataType::Float64) => {
|
192
|
+
let ca: Float64Chunked = dispatch_apply!(
|
193
|
+
series,
|
194
|
+
apply_lambda_with_primitive_out_type,
|
195
|
+
function,
|
196
|
+
0,
|
197
|
+
None
|
198
|
+
)?;
|
199
|
+
ca.into_series()
|
200
|
+
}
|
201
|
+
Some(DataType::Boolean) => {
|
202
|
+
let ca: BooleanChunked =
|
203
|
+
dispatch_apply!(series, apply_lambda_with_bool_out_type, function, 0, None)?;
|
204
|
+
ca.into_series()
|
205
|
+
}
|
206
|
+
Some(DataType::String) => {
|
207
|
+
let ca =
|
208
|
+
dispatch_apply!(series, apply_lambda_with_utf8_out_type, function, 0, None)?;
|
209
|
+
|
210
|
+
ca.into_series()
|
211
|
+
}
|
212
|
+
Some(DataType::List(_inner)) => {
|
213
|
+
todo!()
|
214
|
+
}
|
215
|
+
Some(DataType::Object(_)) => {
|
216
|
+
let ca =
|
217
|
+
dispatch_apply!(series, apply_lambda_with_object_out_type, function, 0, None)?;
|
218
|
+
ca.into_series()
|
219
|
+
}
|
220
|
+
None => return dispatch_apply!(series, apply_lambda_unknown, function),
|
221
|
+
|
222
|
+
_ => return dispatch_apply!(series, apply_lambda_unknown, function),
|
223
|
+
};
|
224
|
+
|
225
|
+
Ok(RbSeries::new(out))
|
226
|
+
}
|
227
|
+
}
|
@@ -100,7 +100,7 @@ fn scatter(mut s: Series, idx: &Series, values: &Series) -> PolarsResult<Series>
|
|
100
100
|
let values = values.str()?;
|
101
101
|
ca.scatter(idx, values)
|
102
102
|
}
|
103
|
-
_ => panic!("not yet implemented for dtype: {}"
|
103
|
+
_ => panic!("not yet implemented for dtype: {logical_dtype}"),
|
104
104
|
};
|
105
105
|
|
106
106
|
s.and_then(|s| s.cast(&logical_dtype))
|
data/ext/polars/src/utils.rs
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
use crate::{RbErr, RbPolarsErr};
|
2
|
+
|
1
3
|
#[macro_export]
|
2
4
|
macro_rules! apply_method_all_arrow_series2 {
|
3
5
|
($self:expr, $method:ident, $($args:expr),*) => {
|
@@ -14,11 +16,17 @@ macro_rules! apply_method_all_arrow_series2 {
|
|
14
16
|
DataType::Int64 => $self.i64().unwrap().$method($($args),*),
|
15
17
|
DataType::Float32 => $self.f32().unwrap().$method($($args),*),
|
16
18
|
DataType::Float64 => $self.f64().unwrap().$method($($args),*),
|
17
|
-
DataType::Date => $self.date().unwrap().$method($($args),*),
|
18
|
-
DataType::Datetime(_, _) => $self.datetime().unwrap().$method($($args),*),
|
19
|
+
DataType::Date => $self.date().unwrap().physical().$method($($args),*),
|
20
|
+
DataType::Datetime(_, _) => $self.datetime().unwrap().physical().$method($($args),*),
|
19
21
|
// DataType::List(_) => $self.list().unwrap().$method($($args),*),
|
20
22
|
DataType::Struct(_) => $self.struct_().unwrap().$method($($args),*),
|
21
23
|
dt => panic!("dtype {:?} not supported", dt)
|
22
24
|
}
|
23
25
|
}
|
24
26
|
}
|
27
|
+
|
28
|
+
/// Boilerplate for `|e| RbPolarsErr::from(e).into()`
|
29
|
+
#[allow(unused)]
|
30
|
+
pub(crate) fn to_rb_err<E: Into<RbPolarsErr>>(e: E) -> RbErr {
|
31
|
+
e.into().into()
|
32
|
+
}
|
@@ -31,56 +31,16 @@ module Polars
|
|
31
31
|
|
32
32
|
# Return whether or not the column is a local categorical.
|
33
33
|
#
|
34
|
-
#
|
35
|
-
#
|
36
|
-
# @example Categoricals constructed without a string cache are considered local.
|
37
|
-
# s = Polars::Series.new(["a", "b", "a"], dtype: Polars::Categorical)
|
38
|
-
# s.cat.is_local
|
39
|
-
# # => true
|
34
|
+
# Always returns false.
|
40
35
|
#
|
41
|
-
# @
|
42
|
-
# s = nil
|
43
|
-
# Polars::StringCache.new do
|
44
|
-
# s = Polars::Series.new(["a", "b", "a"], dtype: Polars::Categorical)
|
45
|
-
# end
|
46
|
-
# s.cat.is_local
|
47
|
-
# # => false
|
36
|
+
# @return [Boolean]
|
48
37
|
def is_local
|
49
38
|
_s.cat_is_local
|
50
39
|
end
|
51
40
|
|
52
|
-
#
|
53
|
-
#
|
54
|
-
# This may change the underlying physical representation of the column.
|
41
|
+
# Simply returns the column as-is, local representations are deprecated.
|
55
42
|
#
|
56
43
|
# @return [Series]
|
57
|
-
#
|
58
|
-
# @example Compare the global and local representations of a categorical.
|
59
|
-
# s = nil
|
60
|
-
# Polars::StringCache.new do
|
61
|
-
# _ = Polars::Series.new("x", ["a", "b", "a"], dtype: Polars::Categorical)
|
62
|
-
# s = Polars::Series.new("y", ["c", "b", "d"], dtype: Polars::Categorical)
|
63
|
-
# end
|
64
|
-
# s.to_physical
|
65
|
-
# # =>
|
66
|
-
# # shape: (3,)
|
67
|
-
# # Series: 'y' [u32]
|
68
|
-
# # [
|
69
|
-
# # 2
|
70
|
-
# # 1
|
71
|
-
# # 3
|
72
|
-
# # ]
|
73
|
-
#
|
74
|
-
# @example
|
75
|
-
# s.cat.to_local.to_physical
|
76
|
-
# # =>
|
77
|
-
# # shape: (3,)
|
78
|
-
# # Series: 'y' [u32]
|
79
|
-
# # [
|
80
|
-
# # 0
|
81
|
-
# # 1
|
82
|
-
# # 2
|
83
|
-
# # ]
|
84
44
|
def to_local
|
85
45
|
Utils.wrap_s(_s.cat_to_local)
|
86
46
|
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Polars
  class Catalog
    module Unity
      # Metadata describing a catalog inside a metastore.
      #
      # Built with keyword arguments; any member that is not supplied is nil.
      CatalogInfo =
        ::Struct.new(
          *%i[
            name
            comment
            properties
            options
            storage_location
            created_at
            created_by
            updated_at
            updated_by
          ],
          keyword_init: true
        )
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Polars
  class Catalog
    module Unity
      # Metadata describing a single column of a catalog table.
      #
      # Built with keyword arguments; any member that is not supplied is nil.
      ColumnInfo =
        ::Struct.new(
          *%i[
            name
            type_name
            type_text
            type_json
            position
            comment
            partition_index
          ],
          keyword_init: true
        )

      class ColumnInfo
        # Resolve this column's native polars datatype from its `type_json`.
        #
        # @note
        #   This functionality is considered **unstable**. It may be changed
        #   at any point without it being considered a breaking change.
        #
        # @return [Object]
        def get_polars_dtype
          json = type_json
          RbCatalogClient.type_json_to_polars_type(json)
        end
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Polars
  class Catalog
    module Unity
      # Metadata describing a namespace inside a catalog.
      #
      # Unity catalog terminology calls this a "schema".
      #
      # Built with keyword arguments; any member that is not supplied is nil.
      NamespaceInfo =
        ::Struct.new(
          *%i[
            name
            comment
            properties
            storage_location
            created_at
            created_by
            updated_at
            updated_by
          ],
          keyword_init: true
        )
    end
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module Polars
  class Catalog
    module Unity
      # Metadata describing a table registered in a catalog.
      #
      # Built with keyword arguments; any member that is not supplied is nil.
      TableInfo =
        ::Struct.new(
          *%i[
            name
            comment
            table_id
            table_type
            storage_location
            data_source_format
            columns
            properties
            created_at
            created_by
            updated_at
            updated_by
          ],
          keyword_init: true
        )

      class TableInfo
        # Build the native polars schema of this table from its columns.
        #
        # @note
        #   This functionality is considered **unstable**. It may be changed
        #   at any point without it being considered a breaking change.
        #
        # @return [Schema, nil]
        #   The schema, or nil when `columns` is nil.
        #
        # @raise [DuplicateError]
        #   If a column name already has an entry in the schema being built.
        def get_polars_schema
          return nil if columns.nil?

          # The schema is threaded through as the memo object and is the
          # value returned by `each_with_object`.
          columns.each_with_object(Schema.new(check_dtypes: false)) do |column_info, schema|
            if schema[column_info.name]
              raise DuplicateError, "duplicate column name: #{column_info.name}"
            end

            schema[column_info.name] = column_info.get_polars_dtype
          end
        end
      end
    end
  end
end
|