polars-df 0.13.0 → 0.15.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +30 -0
- data/Cargo.lock +1368 -319
- data/LICENSE.txt +1 -0
- data/README.md +1 -2
- data/ext/polars/Cargo.toml +15 -6
- data/ext/polars/src/batched_csv.rs +10 -13
- data/ext/polars/src/conversion/any_value.rs +37 -21
- data/ext/polars/src/conversion/chunked_array.rs +3 -3
- data/ext/polars/src/conversion/mod.rs +159 -46
- data/ext/polars/src/dataframe/construction.rs +4 -7
- data/ext/polars/src/dataframe/export.rs +9 -2
- data/ext/polars/src/dataframe/general.rs +22 -16
- data/ext/polars/src/dataframe/io.rs +78 -174
- data/ext/polars/src/dataframe/mod.rs +1 -0
- data/ext/polars/src/dataframe/serde.rs +15 -0
- data/ext/polars/src/error.rs +31 -48
- data/ext/polars/src/exceptions.rs +24 -0
- data/ext/polars/src/expr/binary.rs +4 -42
- data/ext/polars/src/expr/datetime.rs +16 -7
- data/ext/polars/src/expr/general.rs +14 -23
- data/ext/polars/src/expr/list.rs +18 -11
- data/ext/polars/src/expr/name.rs +3 -2
- data/ext/polars/src/expr/rolling.rs +6 -7
- data/ext/polars/src/expr/string.rs +17 -37
- data/ext/polars/src/file.rs +59 -22
- data/ext/polars/src/functions/business.rs +15 -0
- data/ext/polars/src/functions/io.rs +6 -6
- data/ext/polars/src/functions/lazy.rs +17 -8
- data/ext/polars/src/functions/mod.rs +1 -0
- data/ext/polars/src/functions/range.rs +4 -2
- data/ext/polars/src/interop/arrow/mod.rs +1 -0
- data/ext/polars/src/interop/arrow/to_ruby.rs +83 -0
- data/ext/polars/src/interop/mod.rs +1 -0
- data/ext/polars/src/lazyframe/general.rs +877 -0
- data/ext/polars/src/lazyframe/mod.rs +3 -825
- data/ext/polars/src/lazyframe/serde.rs +31 -0
- data/ext/polars/src/lib.rs +44 -13
- data/ext/polars/src/map/dataframe.rs +46 -14
- data/ext/polars/src/map/lazy.rs +65 -4
- data/ext/polars/src/map/mod.rs +17 -16
- data/ext/polars/src/map/series.rs +106 -64
- data/ext/polars/src/on_startup.rs +2 -2
- data/ext/polars/src/series/aggregation.rs +1 -5
- data/ext/polars/src/series/arithmetic.rs +10 -10
- data/ext/polars/src/series/construction.rs +52 -25
- data/ext/polars/src/series/export.rs +1 -1
- data/ext/polars/src/series/general.rs +643 -0
- data/ext/polars/src/series/import.rs +55 -0
- data/ext/polars/src/series/mod.rs +11 -638
- data/ext/polars/src/series/scatter.rs +2 -2
- data/ext/polars/src/utils.rs +0 -20
- data/lib/polars/batched_csv_reader.rb +0 -2
- data/lib/polars/binary_expr.rb +133 -9
- data/lib/polars/binary_name_space.rb +101 -6
- data/lib/polars/config.rb +4 -0
- data/lib/polars/data_frame.rb +285 -62
- data/lib/polars/data_type_group.rb +28 -0
- data/lib/polars/data_types.rb +2 -0
- data/lib/polars/date_time_expr.rb +244 -0
- data/lib/polars/date_time_name_space.rb +87 -0
- data/lib/polars/expr.rb +109 -8
- data/lib/polars/functions/as_datatype.rb +51 -2
- data/lib/polars/functions/col.rb +1 -1
- data/lib/polars/functions/eager.rb +1 -3
- data/lib/polars/functions/lazy.rb +88 -10
- data/lib/polars/functions/range/time_range.rb +21 -21
- data/lib/polars/io/csv.rb +14 -16
- data/lib/polars/io/database.rb +2 -2
- data/lib/polars/io/ipc.rb +14 -12
- data/lib/polars/io/ndjson.rb +10 -0
- data/lib/polars/io/parquet.rb +168 -111
- data/lib/polars/lazy_frame.rb +649 -15
- data/lib/polars/list_name_space.rb +169 -0
- data/lib/polars/selectors.rb +1144 -0
- data/lib/polars/series.rb +470 -40
- data/lib/polars/string_cache.rb +27 -1
- data/lib/polars/string_expr.rb +0 -1
- data/lib/polars/string_name_space.rb +73 -3
- data/lib/polars/struct_name_space.rb +31 -7
- data/lib/polars/utils/various.rb +5 -1
- data/lib/polars/utils.rb +45 -10
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +2 -1
- metadata +14 -4
- data/lib/polars/functions.rb +0 -57
data/LICENSE.txt
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
Copyright (c) 2020 Ritchie Vink
|
2
2
|
Copyright (c) 2022-2024 Andrew Kane
|
3
|
+
Some portions Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
3
4
|
|
4
5
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
5
6
|
of this software and associated documentation files (the "Software"), to deal
|
data/README.md
CHANGED
@@ -17,8 +17,7 @@ gem "polars-df"
|
|
17
17
|
This library follows the [Polars Python API](https://pola-rs.github.io/polars/py-polars/html/reference/index.html).
|
18
18
|
|
19
19
|
```ruby
|
20
|
-
Polars.
|
21
|
-
.lazy
|
20
|
+
Polars.scan_csv("iris.csv")
|
22
21
|
.filter(Polars.col("sepal_length") > 5)
|
23
22
|
.group_by("species")
|
24
23
|
.agg(Polars.all.sum)
|
data/ext/polars/Cargo.toml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
[package]
|
2
2
|
name = "polars"
|
3
|
-
version = "0.
|
3
|
+
version = "0.15.0"
|
4
4
|
license = "MIT"
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
6
6
|
edition = "2021"
|
@@ -12,17 +12,20 @@ crate-type = ["cdylib"]
|
|
12
12
|
|
13
13
|
[dependencies]
|
14
14
|
ahash = "0.8"
|
15
|
+
arrow = { package = "polars-arrow", version = "=0.44.2" }
|
16
|
+
bytes = "1"
|
15
17
|
chrono = "0.4"
|
16
18
|
either = "1.8"
|
17
19
|
magnus = "0.7"
|
18
|
-
polars-core = "=0.
|
19
|
-
polars-
|
20
|
-
polars-
|
20
|
+
polars-core = "=0.44.2"
|
21
|
+
polars-plan = "=0.44.2"
|
22
|
+
polars-parquet = "=0.44.2"
|
23
|
+
polars-utils = "=0.44.2"
|
24
|
+
regex = "1"
|
21
25
|
serde_json = "1"
|
22
|
-
smartstring = "1"
|
23
26
|
|
24
27
|
[dependencies.polars]
|
25
|
-
version = "=0.
|
28
|
+
version = "=0.44.2"
|
26
29
|
features = [
|
27
30
|
"abs",
|
28
31
|
"approx_unique",
|
@@ -31,7 +34,11 @@ features = [
|
|
31
34
|
"array_count",
|
32
35
|
"asof_join",
|
33
36
|
"avro",
|
37
|
+
"aws",
|
38
|
+
"azure",
|
34
39
|
"binary_encoding",
|
40
|
+
"business",
|
41
|
+
"cloud",
|
35
42
|
"concat_str",
|
36
43
|
"cov",
|
37
44
|
"cross_join",
|
@@ -52,6 +59,8 @@ features = [
|
|
52
59
|
"extract_jsonpath",
|
53
60
|
"find_many",
|
54
61
|
"fmt",
|
62
|
+
"gcp",
|
63
|
+
"http",
|
55
64
|
"interpolate",
|
56
65
|
"ipc",
|
57
66
|
"ipc_streaming",
|
@@ -34,8 +34,7 @@ impl RbBatchedCsv {
|
|
34
34
|
let n_threads = Option::<usize>::try_convert(arguments[11])?;
|
35
35
|
let path = PathBuf::try_convert(arguments[12])?;
|
36
36
|
let overwrite_dtype = Option::<Vec<(String, Wrap<DataType>)>>::try_convert(arguments[13])?;
|
37
|
-
|
38
|
-
let overwrite_dtype_slice = Option::<Vec<Wrap<DataType>>>::None; // Option::<Vec<Wrap<DataType>>>::try_convert(arguments[14])?;
|
37
|
+
let overwrite_dtype_slice = Option::<Vec<Wrap<DataType>>>::try_convert(arguments[14])?;
|
39
38
|
let low_memory = bool::try_convert(arguments[15])?;
|
40
39
|
let comment_prefix = Option::<String>::try_convert(arguments[16])?;
|
41
40
|
let quote_char = Option::<String>::try_convert(arguments[17])?;
|
@@ -44,17 +43,16 @@ impl RbBatchedCsv {
|
|
44
43
|
let try_parse_dates = bool::try_convert(arguments[20])?;
|
45
44
|
let skip_rows_after_header = usize::try_convert(arguments[21])?;
|
46
45
|
let row_index = Option::<(String, IdxSize)>::try_convert(arguments[22])?;
|
47
|
-
let
|
48
|
-
let
|
49
|
-
let
|
50
|
-
let
|
51
|
-
let decimal_comma = bool::try_convert(arguments[27])?;
|
46
|
+
let eol_char = String::try_convert(arguments[23])?;
|
47
|
+
let raise_if_empty = bool::try_convert(arguments[24])?;
|
48
|
+
let truncate_ragged_lines = bool::try_convert(arguments[25])?;
|
49
|
+
let decimal_comma = bool::try_convert(arguments[26])?;
|
52
50
|
// end arguments
|
53
51
|
|
54
52
|
let null_values = null_values.map(|w| w.0);
|
55
53
|
let eol_char = eol_char.as_bytes()[0];
|
56
54
|
let row_index = row_index.map(|(name, offset)| RowIndex {
|
57
|
-
name:
|
55
|
+
name: name.into(),
|
58
56
|
offset,
|
59
57
|
});
|
60
58
|
let quote_char = if let Some(s) = quote_char {
|
@@ -72,7 +70,7 @@ impl RbBatchedCsv {
|
|
72
70
|
.iter()
|
73
71
|
.map(|(name, dtype)| {
|
74
72
|
let dtype = dtype.0.clone();
|
75
|
-
Field::new(name, dtype)
|
73
|
+
Field::new((&**name).into(), dtype)
|
76
74
|
})
|
77
75
|
.collect::<Schema>()
|
78
76
|
});
|
@@ -84,7 +82,7 @@ impl RbBatchedCsv {
|
|
84
82
|
.collect::<Vec<_>>()
|
85
83
|
});
|
86
84
|
|
87
|
-
let file = std::fs::File::open(path).map_err(RbPolarsErr::
|
85
|
+
let file = std::fs::File::open(path).map_err(RbPolarsErr::from)?;
|
88
86
|
let reader = Box::new(file) as Box<dyn MmapBytesReader>;
|
89
87
|
let reader = CsvReadOptions::default()
|
90
88
|
.with_infer_schema_length(infer_schema_length)
|
@@ -95,13 +93,12 @@ impl RbBatchedCsv {
|
|
95
93
|
.with_projection(projection.map(Arc::new))
|
96
94
|
.with_rechunk(rechunk)
|
97
95
|
.with_chunk_size(chunk_size)
|
98
|
-
.with_columns(columns.map(
|
96
|
+
.with_columns(columns.map(|x| x.into_iter().map(PlSmallStr::from_string).collect()))
|
99
97
|
.with_n_threads(n_threads)
|
100
98
|
.with_dtype_overwrite(overwrite_dtype_slice.map(Arc::new))
|
101
99
|
.with_low_memory(low_memory)
|
102
100
|
.with_skip_rows_after_header(skip_rows_after_header)
|
103
101
|
.with_row_index(row_index)
|
104
|
-
.with_sample_size(sample_size)
|
105
102
|
.with_raise_if_empty(raise_if_empty)
|
106
103
|
.with_parse_options(
|
107
104
|
CsvParseOptions::default()
|
@@ -132,7 +129,7 @@ impl RbBatchedCsv {
|
|
132
129
|
let batches = reader
|
133
130
|
.borrow()
|
134
131
|
.lock()
|
135
|
-
.map_err(|e| RbPolarsErr::
|
132
|
+
.map_err(|e| RbPolarsErr::Other(e.to_string()))?
|
136
133
|
.next_batches(n)
|
137
134
|
.map_err(RbPolarsErr::from)?;
|
138
135
|
|
@@ -7,9 +7,9 @@ use polars_core::utils::any_values_to_supertype_and_n_dtypes;
|
|
7
7
|
|
8
8
|
use super::{struct_dict, ObjectValue, Wrap};
|
9
9
|
|
10
|
-
use crate::
|
10
|
+
use crate::exceptions::RbOverflowError;
|
11
11
|
use crate::rb_modules::utils;
|
12
|
-
use crate::{RbPolarsErr, RbResult, RbSeries};
|
12
|
+
use crate::{RbErr, RbPolarsErr, RbResult, RbSeries};
|
13
13
|
|
14
14
|
impl IntoValue for Wrap<AnyValue<'_>> {
|
15
15
|
fn into_value_with(self, ruby: &Ruby) -> Value {
|
@@ -47,12 +47,20 @@ pub(crate) fn any_value_into_rb_object(av: AnyValue, ruby: &Ruby) -> Value {
|
|
47
47
|
};
|
48
48
|
s.into_value()
|
49
49
|
}
|
50
|
+
AnyValue::CategoricalOwned(idx, rev, arr) | AnyValue::EnumOwned(idx, rev, arr) => {
|
51
|
+
let s = if arr.is_null() {
|
52
|
+
rev.get(idx)
|
53
|
+
} else {
|
54
|
+
unsafe { arr.deref_unchecked().value(idx as usize) }
|
55
|
+
};
|
56
|
+
s.into_value()
|
57
|
+
}
|
50
58
|
AnyValue::Date(v) => utils().funcall("_to_ruby_date", (v,)).unwrap(),
|
51
59
|
AnyValue::Datetime(v, time_unit, time_zone) => {
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
60
|
+
datetime_to_rb_object(v, time_unit, time_zone)
|
61
|
+
}
|
62
|
+
AnyValue::DatetimeOwned(v, time_unit, time_zone) => {
|
63
|
+
datetime_to_rb_object(v, time_unit, time_zone.as_ref().map(AsRef::as_ref))
|
56
64
|
}
|
57
65
|
AnyValue::Duration(v, time_unit) => {
|
58
66
|
let time_unit = time_unit.to_ascii();
|
@@ -66,11 +74,11 @@ pub(crate) fn any_value_into_rb_object(av: AnyValue, ruby: &Ruby) -> Value {
|
|
66
74
|
AnyValue::StructOwned(payload) => struct_dict(payload.0.into_iter(), &payload.1),
|
67
75
|
AnyValue::Object(v) => {
|
68
76
|
let object = v.as_any().downcast_ref::<ObjectValue>().unwrap();
|
69
|
-
object.
|
77
|
+
object.to_value()
|
70
78
|
}
|
71
79
|
AnyValue::ObjectOwned(v) => {
|
72
80
|
let object = v.0.as_any().downcast_ref::<ObjectValue>().unwrap();
|
73
|
-
object.
|
81
|
+
object.to_value()
|
74
82
|
}
|
75
83
|
AnyValue::Binary(v) => RString::from_slice(v).into_value(),
|
76
84
|
AnyValue::BinaryOwned(v) => RString::from_slice(&v).into_value(),
|
@@ -80,6 +88,13 @@ pub(crate) fn any_value_into_rb_object(av: AnyValue, ruby: &Ruby) -> Value {
|
|
80
88
|
}
|
81
89
|
}
|
82
90
|
|
91
|
+
fn datetime_to_rb_object(v: i64, tu: TimeUnit, tz: Option<&TimeZone>) -> Value {
|
92
|
+
let tu = tu.to_ascii();
|
93
|
+
utils()
|
94
|
+
.funcall("_to_ruby_datetime", (v, tu, tz.map(|v| v.to_string())))
|
95
|
+
.unwrap()
|
96
|
+
}
|
97
|
+
|
83
98
|
pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<AnyValue<'s>> {
|
84
99
|
// Conversion functions.
|
85
100
|
fn get_null(_ob: Value, _strict: bool) -> RbResult<AnyValue<'static>> {
|
@@ -122,7 +137,10 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
|
|
122
137
|
fn get_list(ob: Value, _strict: bool) -> RbResult<AnyValue<'static>> {
|
123
138
|
let v = RArray::from_value(ob).unwrap();
|
124
139
|
if v.is_empty() {
|
125
|
-
Ok(AnyValue::List(Series::new_empty(
|
140
|
+
Ok(AnyValue::List(Series::new_empty(
|
141
|
+
PlSmallStr::EMPTY,
|
142
|
+
&DataType::Null,
|
143
|
+
)))
|
126
144
|
} else {
|
127
145
|
let list = v;
|
128
146
|
|
@@ -142,7 +160,7 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
|
|
142
160
|
avs.push(Wrap::<AnyValue>::try_convert(item)?.0)
|
143
161
|
}
|
144
162
|
|
145
|
-
let s = Series::from_any_values_and_dtype(
|
163
|
+
let s = Series::from_any_values_and_dtype(PlSmallStr::EMPTY, &avs, &dtype, true)
|
146
164
|
.map_err(RbPolarsErr::from)?;
|
147
165
|
Ok(AnyValue::List(s))
|
148
166
|
}
|
@@ -158,11 +176,10 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
|
|
158
176
|
let len = dict.len();
|
159
177
|
let mut keys = Vec::with_capacity(len);
|
160
178
|
let mut vals = Vec::with_capacity(len);
|
161
|
-
dict.foreach(|
|
162
|
-
let
|
163
|
-
let val = Wrap::<AnyValue>::try_convert(v)?.0;
|
179
|
+
dict.foreach(|key: String, val: Wrap<AnyValue>| {
|
180
|
+
let val = val.0;
|
164
181
|
let dtype = DataType::from(&val);
|
165
|
-
keys.push(Field::new(
|
182
|
+
keys.push(Field::new(key.into(), dtype));
|
166
183
|
vals.push(val);
|
167
184
|
Ok(ForEach::Continue)
|
168
185
|
})?;
|
@@ -184,7 +201,7 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
|
|
184
201
|
let v = sec * 1_000_000_000 + nsec;
|
185
202
|
// TODO support time zone when possible
|
186
203
|
// https://github.com/pola-rs/polars/issues/9103
|
187
|
-
Ok(AnyValue::Datetime(v, TimeUnit::Nanoseconds,
|
204
|
+
Ok(AnyValue::Datetime(v, TimeUnit::Nanoseconds, None))
|
188
205
|
}
|
189
206
|
|
190
207
|
fn get_datetime(ob: Value, _strict: bool) -> RbResult<AnyValue<'static>> {
|
@@ -193,7 +210,7 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
|
|
193
210
|
Ok(AnyValue::Datetime(
|
194
211
|
sec * 1_000_000_000 + nsec,
|
195
212
|
TimeUnit::Nanoseconds,
|
196
|
-
|
213
|
+
None,
|
197
214
|
))
|
198
215
|
}
|
199
216
|
|
@@ -218,7 +235,9 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
|
|
218
235
|
|
219
236
|
let (sign, digits, _, exp): (i8, String, i32, i32) = ob.funcall("split", ()).unwrap();
|
220
237
|
let (mut v, scale) = abs_decimal_from_digits(digits, exp).ok_or_else(|| {
|
221
|
-
RbPolarsErr::
|
238
|
+
RbErr::from(RbPolarsErr::Other(
|
239
|
+
"BigDecimal is too large to fit in Decimal128".into(),
|
240
|
+
))
|
222
241
|
})?;
|
223
242
|
if sign < 0 {
|
224
243
|
// TODO better error
|
@@ -253,9 +272,6 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
|
|
253
272
|
} else if ob.is_kind_of(crate::rb_modules::bigdecimal()) {
|
254
273
|
get_decimal(ob, strict)
|
255
274
|
} else {
|
256
|
-
Err(RbPolarsErr::
|
257
|
-
"object type not supported {:?}",
|
258
|
-
ob
|
259
|
-
)))
|
275
|
+
Err(RbPolarsErr::Other(format!("object type not supported {:?}", ob)).into())
|
260
276
|
}
|
261
277
|
}
|
@@ -9,7 +9,7 @@ use crate::RbResult;
|
|
9
9
|
impl TryConvert for Wrap<StringChunked> {
|
10
10
|
fn try_convert(obj: Value) -> RbResult<Self> {
|
11
11
|
let (seq, len) = get_rbseq(obj)?;
|
12
|
-
let mut builder = StringChunkedBuilder::new(
|
12
|
+
let mut builder = StringChunkedBuilder::new(PlSmallStr::EMPTY, len);
|
13
13
|
|
14
14
|
for res in seq.into_iter() {
|
15
15
|
let item = res;
|
@@ -25,7 +25,7 @@ impl TryConvert for Wrap<StringChunked> {
|
|
25
25
|
impl TryConvert for Wrap<BinaryChunked> {
|
26
26
|
fn try_convert(obj: Value) -> RbResult<Self> {
|
27
27
|
let (seq, len) = get_rbseq(obj)?;
|
28
|
-
let mut builder = BinaryChunkedBuilder::new(
|
28
|
+
let mut builder = BinaryChunkedBuilder::new(PlSmallStr::EMPTY, len);
|
29
29
|
|
30
30
|
for res in seq.into_iter() {
|
31
31
|
let item = res;
|
@@ -90,7 +90,7 @@ impl IntoValue for Wrap<&DatetimeChunked> {
|
|
90
90
|
fn into_value_with(self, _: &Ruby) -> Value {
|
91
91
|
let utils = utils();
|
92
92
|
let time_unit = Wrap(self.0.time_unit()).into_value();
|
93
|
-
let time_zone = self.0.time_zone().
|
93
|
+
let time_zone = self.0.time_zone().as_deref().map(|v| v.into_value());
|
94
94
|
let iter = self.0.into_iter().map(|opt_v| {
|
95
95
|
opt_v.map(|v| {
|
96
96
|
utils
|