polars-df 0.13.0 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +30 -0
- data/Cargo.lock +1368 -319
- data/LICENSE.txt +1 -0
- data/README.md +1 -2
- data/ext/polars/Cargo.toml +15 -6
- data/ext/polars/src/batched_csv.rs +10 -13
- data/ext/polars/src/conversion/any_value.rs +37 -21
- data/ext/polars/src/conversion/chunked_array.rs +3 -3
- data/ext/polars/src/conversion/mod.rs +159 -46
- data/ext/polars/src/dataframe/construction.rs +4 -7
- data/ext/polars/src/dataframe/export.rs +9 -2
- data/ext/polars/src/dataframe/general.rs +22 -16
- data/ext/polars/src/dataframe/io.rs +78 -174
- data/ext/polars/src/dataframe/mod.rs +1 -0
- data/ext/polars/src/dataframe/serde.rs +15 -0
- data/ext/polars/src/error.rs +31 -48
- data/ext/polars/src/exceptions.rs +24 -0
- data/ext/polars/src/expr/binary.rs +4 -42
- data/ext/polars/src/expr/datetime.rs +16 -7
- data/ext/polars/src/expr/general.rs +14 -23
- data/ext/polars/src/expr/list.rs +18 -11
- data/ext/polars/src/expr/name.rs +3 -2
- data/ext/polars/src/expr/rolling.rs +6 -7
- data/ext/polars/src/expr/string.rs +17 -37
- data/ext/polars/src/file.rs +59 -22
- data/ext/polars/src/functions/business.rs +15 -0
- data/ext/polars/src/functions/io.rs +6 -6
- data/ext/polars/src/functions/lazy.rs +17 -8
- data/ext/polars/src/functions/mod.rs +1 -0
- data/ext/polars/src/functions/range.rs +4 -2
- data/ext/polars/src/interop/arrow/mod.rs +1 -0
- data/ext/polars/src/interop/arrow/to_ruby.rs +83 -0
- data/ext/polars/src/interop/mod.rs +1 -0
- data/ext/polars/src/lazyframe/general.rs +877 -0
- data/ext/polars/src/lazyframe/mod.rs +3 -825
- data/ext/polars/src/lazyframe/serde.rs +31 -0
- data/ext/polars/src/lib.rs +44 -13
- data/ext/polars/src/map/dataframe.rs +46 -14
- data/ext/polars/src/map/lazy.rs +65 -4
- data/ext/polars/src/map/mod.rs +17 -16
- data/ext/polars/src/map/series.rs +106 -64
- data/ext/polars/src/on_startup.rs +2 -2
- data/ext/polars/src/series/aggregation.rs +1 -5
- data/ext/polars/src/series/arithmetic.rs +10 -10
- data/ext/polars/src/series/construction.rs +52 -25
- data/ext/polars/src/series/export.rs +1 -1
- data/ext/polars/src/series/general.rs +643 -0
- data/ext/polars/src/series/import.rs +55 -0
- data/ext/polars/src/series/mod.rs +11 -638
- data/ext/polars/src/series/scatter.rs +2 -2
- data/ext/polars/src/utils.rs +0 -20
- data/lib/polars/batched_csv_reader.rb +0 -2
- data/lib/polars/binary_expr.rb +133 -9
- data/lib/polars/binary_name_space.rb +101 -6
- data/lib/polars/config.rb +4 -0
- data/lib/polars/data_frame.rb +285 -62
- data/lib/polars/data_type_group.rb +28 -0
- data/lib/polars/data_types.rb +2 -0
- data/lib/polars/date_time_expr.rb +244 -0
- data/lib/polars/date_time_name_space.rb +87 -0
- data/lib/polars/expr.rb +109 -8
- data/lib/polars/functions/as_datatype.rb +51 -2
- data/lib/polars/functions/col.rb +1 -1
- data/lib/polars/functions/eager.rb +1 -3
- data/lib/polars/functions/lazy.rb +88 -10
- data/lib/polars/functions/range/time_range.rb +21 -21
- data/lib/polars/io/csv.rb +14 -16
- data/lib/polars/io/database.rb +2 -2
- data/lib/polars/io/ipc.rb +14 -12
- data/lib/polars/io/ndjson.rb +10 -0
- data/lib/polars/io/parquet.rb +168 -111
- data/lib/polars/lazy_frame.rb +649 -15
- data/lib/polars/list_name_space.rb +169 -0
- data/lib/polars/selectors.rb +1144 -0
- data/lib/polars/series.rb +470 -40
- data/lib/polars/string_cache.rb +27 -1
- data/lib/polars/string_expr.rb +0 -1
- data/lib/polars/string_name_space.rb +73 -3
- data/lib/polars/struct_name_space.rb +31 -7
- data/lib/polars/utils/various.rb +5 -1
- data/lib/polars/utils.rb +45 -10
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +2 -1
- metadata +14 -4
- data/lib/polars/functions.rb +0 -57
data/LICENSE.txt
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
Copyright (c) 2020 Ritchie Vink
|
2
2
|
Copyright (c) 2022-2024 Andrew Kane
|
3
|
+
Some portions Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
3
4
|
|
4
5
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
5
6
|
of this software and associated documentation files (the "Software"), to deal
|
data/README.md
CHANGED
@@ -17,8 +17,7 @@ gem "polars-df"
|
|
17
17
|
This library follows the [Polars Python API](https://pola-rs.github.io/polars/py-polars/html/reference/index.html).
|
18
18
|
|
19
19
|
```ruby
|
20
|
-
Polars.read_csv("iris.csv")
|
21
|
-
.lazy
|
20
|
+
Polars.scan_csv("iris.csv")
|
22
21
|
.filter(Polars.col("sepal_length") > 5)
|
23
22
|
.group_by("species")
|
24
23
|
.agg(Polars.all.sum)
|
data/ext/polars/Cargo.toml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
[package]
|
2
2
|
name = "polars"
|
3
|
-
version = "0.13.0"
|
3
|
+
version = "0.15.0"
|
4
4
|
license = "MIT"
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
6
6
|
edition = "2021"
|
@@ -12,17 +12,20 @@ crate-type = ["cdylib"]
|
|
12
12
|
|
13
13
|
[dependencies]
|
14
14
|
ahash = "0.8"
|
15
|
+
arrow = { package = "polars-arrow", version = "=0.44.2" }
|
16
|
+
bytes = "1"
|
15
17
|
chrono = "0.4"
|
16
18
|
either = "1.8"
|
17
19
|
magnus = "0.7"
|
18
|
-
polars-core = "=0.
|
19
|
-
polars-
|
20
|
-
polars-
|
20
|
+
polars-core = "=0.44.2"
|
21
|
+
polars-plan = "=0.44.2"
|
22
|
+
polars-parquet = "=0.44.2"
|
23
|
+
polars-utils = "=0.44.2"
|
24
|
+
regex = "1"
|
21
25
|
serde_json = "1"
|
22
|
-
smartstring = "1"
|
23
26
|
|
24
27
|
[dependencies.polars]
|
25
|
-
version = "=0.
|
28
|
+
version = "=0.44.2"
|
26
29
|
features = [
|
27
30
|
"abs",
|
28
31
|
"approx_unique",
|
@@ -31,7 +34,11 @@ features = [
|
|
31
34
|
"array_count",
|
32
35
|
"asof_join",
|
33
36
|
"avro",
|
37
|
+
"aws",
|
38
|
+
"azure",
|
34
39
|
"binary_encoding",
|
40
|
+
"business",
|
41
|
+
"cloud",
|
35
42
|
"concat_str",
|
36
43
|
"cov",
|
37
44
|
"cross_join",
|
@@ -52,6 +59,8 @@ features = [
|
|
52
59
|
"extract_jsonpath",
|
53
60
|
"find_many",
|
54
61
|
"fmt",
|
62
|
+
"gcp",
|
63
|
+
"http",
|
55
64
|
"interpolate",
|
56
65
|
"ipc",
|
57
66
|
"ipc_streaming",
|
@@ -34,8 +34,7 @@ impl RbBatchedCsv {
|
|
34
34
|
let n_threads = Option::<usize>::try_convert(arguments[11])?;
|
35
35
|
let path = PathBuf::try_convert(arguments[12])?;
|
36
36
|
let overwrite_dtype = Option::<Vec<(String, Wrap<DataType>)>>::try_convert(arguments[13])?;
|
37
|
-
|
38
|
-
let overwrite_dtype_slice = Option::<Vec<Wrap<DataType>>>::None; // Option::<Vec<Wrap<DataType>>>::try_convert(arguments[14])?;
|
37
|
+
let overwrite_dtype_slice = Option::<Vec<Wrap<DataType>>>::try_convert(arguments[14])?;
|
39
38
|
let low_memory = bool::try_convert(arguments[15])?;
|
40
39
|
let comment_prefix = Option::<String>::try_convert(arguments[16])?;
|
41
40
|
let quote_char = Option::<String>::try_convert(arguments[17])?;
|
@@ -44,17 +43,16 @@ impl RbBatchedCsv {
|
|
44
43
|
let try_parse_dates = bool::try_convert(arguments[20])?;
|
45
44
|
let skip_rows_after_header = usize::try_convert(arguments[21])?;
|
46
45
|
let row_index = Option::<(String, IdxSize)>::try_convert(arguments[22])?;
|
47
|
-
let
|
48
|
-
let
|
49
|
-
let
|
50
|
-
let
|
51
|
-
let decimal_comma = bool::try_convert(arguments[27])?;
|
46
|
+
let eol_char = String::try_convert(arguments[23])?;
|
47
|
+
let raise_if_empty = bool::try_convert(arguments[24])?;
|
48
|
+
let truncate_ragged_lines = bool::try_convert(arguments[25])?;
|
49
|
+
let decimal_comma = bool::try_convert(arguments[26])?;
|
52
50
|
// end arguments
|
53
51
|
|
54
52
|
let null_values = null_values.map(|w| w.0);
|
55
53
|
let eol_char = eol_char.as_bytes()[0];
|
56
54
|
let row_index = row_index.map(|(name, offset)| RowIndex {
|
57
|
-
name:
|
55
|
+
name: name.into(),
|
58
56
|
offset,
|
59
57
|
});
|
60
58
|
let quote_char = if let Some(s) = quote_char {
|
@@ -72,7 +70,7 @@ impl RbBatchedCsv {
|
|
72
70
|
.iter()
|
73
71
|
.map(|(name, dtype)| {
|
74
72
|
let dtype = dtype.0.clone();
|
75
|
-
Field::new(name, dtype)
|
73
|
+
Field::new((&**name).into(), dtype)
|
76
74
|
})
|
77
75
|
.collect::<Schema>()
|
78
76
|
});
|
@@ -84,7 +82,7 @@ impl RbBatchedCsv {
|
|
84
82
|
.collect::<Vec<_>>()
|
85
83
|
});
|
86
84
|
|
87
|
-
let file = std::fs::File::open(path).map_err(RbPolarsErr::
|
85
|
+
let file = std::fs::File::open(path).map_err(RbPolarsErr::from)?;
|
88
86
|
let reader = Box::new(file) as Box<dyn MmapBytesReader>;
|
89
87
|
let reader = CsvReadOptions::default()
|
90
88
|
.with_infer_schema_length(infer_schema_length)
|
@@ -95,13 +93,12 @@ impl RbBatchedCsv {
|
|
95
93
|
.with_projection(projection.map(Arc::new))
|
96
94
|
.with_rechunk(rechunk)
|
97
95
|
.with_chunk_size(chunk_size)
|
98
|
-
.with_columns(columns.map(
|
96
|
+
.with_columns(columns.map(|x| x.into_iter().map(PlSmallStr::from_string).collect()))
|
99
97
|
.with_n_threads(n_threads)
|
100
98
|
.with_dtype_overwrite(overwrite_dtype_slice.map(Arc::new))
|
101
99
|
.with_low_memory(low_memory)
|
102
100
|
.with_skip_rows_after_header(skip_rows_after_header)
|
103
101
|
.with_row_index(row_index)
|
104
|
-
.with_sample_size(sample_size)
|
105
102
|
.with_raise_if_empty(raise_if_empty)
|
106
103
|
.with_parse_options(
|
107
104
|
CsvParseOptions::default()
|
@@ -132,7 +129,7 @@ impl RbBatchedCsv {
|
|
132
129
|
let batches = reader
|
133
130
|
.borrow()
|
134
131
|
.lock()
|
135
|
-
.map_err(|e| RbPolarsErr::
|
132
|
+
.map_err(|e| RbPolarsErr::Other(e.to_string()))?
|
136
133
|
.next_batches(n)
|
137
134
|
.map_err(RbPolarsErr::from)?;
|
138
135
|
|
@@ -7,9 +7,9 @@ use polars_core::utils::any_values_to_supertype_and_n_dtypes;
|
|
7
7
|
|
8
8
|
use super::{struct_dict, ObjectValue, Wrap};
|
9
9
|
|
10
|
-
use crate::
|
10
|
+
use crate::exceptions::RbOverflowError;
|
11
11
|
use crate::rb_modules::utils;
|
12
|
-
use crate::{RbPolarsErr, RbResult, RbSeries};
|
12
|
+
use crate::{RbErr, RbPolarsErr, RbResult, RbSeries};
|
13
13
|
|
14
14
|
impl IntoValue for Wrap<AnyValue<'_>> {
|
15
15
|
fn into_value_with(self, ruby: &Ruby) -> Value {
|
@@ -47,12 +47,20 @@ pub(crate) fn any_value_into_rb_object(av: AnyValue, ruby: &Ruby) -> Value {
|
|
47
47
|
};
|
48
48
|
s.into_value()
|
49
49
|
}
|
50
|
+
AnyValue::CategoricalOwned(idx, rev, arr) | AnyValue::EnumOwned(idx, rev, arr) => {
|
51
|
+
let s = if arr.is_null() {
|
52
|
+
rev.get(idx)
|
53
|
+
} else {
|
54
|
+
unsafe { arr.deref_unchecked().value(idx as usize) }
|
55
|
+
};
|
56
|
+
s.into_value()
|
57
|
+
}
|
50
58
|
AnyValue::Date(v) => utils().funcall("_to_ruby_date", (v,)).unwrap(),
|
51
59
|
AnyValue::Datetime(v, time_unit, time_zone) => {
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
60
|
+
datetime_to_rb_object(v, time_unit, time_zone)
|
61
|
+
}
|
62
|
+
AnyValue::DatetimeOwned(v, time_unit, time_zone) => {
|
63
|
+
datetime_to_rb_object(v, time_unit, time_zone.as_ref().map(AsRef::as_ref))
|
56
64
|
}
|
57
65
|
AnyValue::Duration(v, time_unit) => {
|
58
66
|
let time_unit = time_unit.to_ascii();
|
@@ -66,11 +74,11 @@ pub(crate) fn any_value_into_rb_object(av: AnyValue, ruby: &Ruby) -> Value {
|
|
66
74
|
AnyValue::StructOwned(payload) => struct_dict(payload.0.into_iter(), &payload.1),
|
67
75
|
AnyValue::Object(v) => {
|
68
76
|
let object = v.as_any().downcast_ref::<ObjectValue>().unwrap();
|
69
|
-
object.
|
77
|
+
object.to_value()
|
70
78
|
}
|
71
79
|
AnyValue::ObjectOwned(v) => {
|
72
80
|
let object = v.0.as_any().downcast_ref::<ObjectValue>().unwrap();
|
73
|
-
object.
|
81
|
+
object.to_value()
|
74
82
|
}
|
75
83
|
AnyValue::Binary(v) => RString::from_slice(v).into_value(),
|
76
84
|
AnyValue::BinaryOwned(v) => RString::from_slice(&v).into_value(),
|
@@ -80,6 +88,13 @@ pub(crate) fn any_value_into_rb_object(av: AnyValue, ruby: &Ruby) -> Value {
|
|
80
88
|
}
|
81
89
|
}
|
82
90
|
|
91
|
+
fn datetime_to_rb_object(v: i64, tu: TimeUnit, tz: Option<&TimeZone>) -> Value {
|
92
|
+
let tu = tu.to_ascii();
|
93
|
+
utils()
|
94
|
+
.funcall("_to_ruby_datetime", (v, tu, tz.map(|v| v.to_string())))
|
95
|
+
.unwrap()
|
96
|
+
}
|
97
|
+
|
83
98
|
pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<AnyValue<'s>> {
|
84
99
|
// Conversion functions.
|
85
100
|
fn get_null(_ob: Value, _strict: bool) -> RbResult<AnyValue<'static>> {
|
@@ -122,7 +137,10 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
|
|
122
137
|
fn get_list(ob: Value, _strict: bool) -> RbResult<AnyValue<'static>> {
|
123
138
|
let v = RArray::from_value(ob).unwrap();
|
124
139
|
if v.is_empty() {
|
125
|
-
Ok(AnyValue::List(Series::new_empty(
|
140
|
+
Ok(AnyValue::List(Series::new_empty(
|
141
|
+
PlSmallStr::EMPTY,
|
142
|
+
&DataType::Null,
|
143
|
+
)))
|
126
144
|
} else {
|
127
145
|
let list = v;
|
128
146
|
|
@@ -142,7 +160,7 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
|
|
142
160
|
avs.push(Wrap::<AnyValue>::try_convert(item)?.0)
|
143
161
|
}
|
144
162
|
|
145
|
-
let s = Series::from_any_values_and_dtype(
|
163
|
+
let s = Series::from_any_values_and_dtype(PlSmallStr::EMPTY, &avs, &dtype, true)
|
146
164
|
.map_err(RbPolarsErr::from)?;
|
147
165
|
Ok(AnyValue::List(s))
|
148
166
|
}
|
@@ -158,11 +176,10 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
|
|
158
176
|
let len = dict.len();
|
159
177
|
let mut keys = Vec::with_capacity(len);
|
160
178
|
let mut vals = Vec::with_capacity(len);
|
161
|
-
dict.foreach(|
|
162
|
-
let
|
163
|
-
let val = Wrap::<AnyValue>::try_convert(v)?.0;
|
179
|
+
dict.foreach(|key: String, val: Wrap<AnyValue>| {
|
180
|
+
let val = val.0;
|
164
181
|
let dtype = DataType::from(&val);
|
165
|
-
keys.push(Field::new(
|
182
|
+
keys.push(Field::new(key.into(), dtype));
|
166
183
|
vals.push(val);
|
167
184
|
Ok(ForEach::Continue)
|
168
185
|
})?;
|
@@ -184,7 +201,7 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
|
|
184
201
|
let v = sec * 1_000_000_000 + nsec;
|
185
202
|
// TODO support time zone when possible
|
186
203
|
// https://github.com/pola-rs/polars/issues/9103
|
187
|
-
Ok(AnyValue::Datetime(v, TimeUnit::Nanoseconds,
|
204
|
+
Ok(AnyValue::Datetime(v, TimeUnit::Nanoseconds, None))
|
188
205
|
}
|
189
206
|
|
190
207
|
fn get_datetime(ob: Value, _strict: bool) -> RbResult<AnyValue<'static>> {
|
@@ -193,7 +210,7 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
|
|
193
210
|
Ok(AnyValue::Datetime(
|
194
211
|
sec * 1_000_000_000 + nsec,
|
195
212
|
TimeUnit::Nanoseconds,
|
196
|
-
|
213
|
+
None,
|
197
214
|
))
|
198
215
|
}
|
199
216
|
|
@@ -218,7 +235,9 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
|
|
218
235
|
|
219
236
|
let (sign, digits, _, exp): (i8, String, i32, i32) = ob.funcall("split", ()).unwrap();
|
220
237
|
let (mut v, scale) = abs_decimal_from_digits(digits, exp).ok_or_else(|| {
|
221
|
-
RbPolarsErr::
|
238
|
+
RbErr::from(RbPolarsErr::Other(
|
239
|
+
"BigDecimal is too large to fit in Decimal128".into(),
|
240
|
+
))
|
222
241
|
})?;
|
223
242
|
if sign < 0 {
|
224
243
|
// TODO better error
|
@@ -253,9 +272,6 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
|
|
253
272
|
} else if ob.is_kind_of(crate::rb_modules::bigdecimal()) {
|
254
273
|
get_decimal(ob, strict)
|
255
274
|
} else {
|
256
|
-
Err(RbPolarsErr::
|
257
|
-
"object type not supported {:?}",
|
258
|
-
ob
|
259
|
-
)))
|
275
|
+
Err(RbPolarsErr::Other(format!("object type not supported {:?}", ob)).into())
|
260
276
|
}
|
261
277
|
}
|
@@ -9,7 +9,7 @@ use crate::RbResult;
|
|
9
9
|
impl TryConvert for Wrap<StringChunked> {
|
10
10
|
fn try_convert(obj: Value) -> RbResult<Self> {
|
11
11
|
let (seq, len) = get_rbseq(obj)?;
|
12
|
-
let mut builder = StringChunkedBuilder::new(
|
12
|
+
let mut builder = StringChunkedBuilder::new(PlSmallStr::EMPTY, len);
|
13
13
|
|
14
14
|
for res in seq.into_iter() {
|
15
15
|
let item = res;
|
@@ -25,7 +25,7 @@ impl TryConvert for Wrap<StringChunked> {
|
|
25
25
|
impl TryConvert for Wrap<BinaryChunked> {
|
26
26
|
fn try_convert(obj: Value) -> RbResult<Self> {
|
27
27
|
let (seq, len) = get_rbseq(obj)?;
|
28
|
-
let mut builder = BinaryChunkedBuilder::new(
|
28
|
+
let mut builder = BinaryChunkedBuilder::new(PlSmallStr::EMPTY, len);
|
29
29
|
|
30
30
|
for res in seq.into_iter() {
|
31
31
|
let item = res;
|
@@ -90,7 +90,7 @@ impl IntoValue for Wrap<&DatetimeChunked> {
|
|
90
90
|
fn into_value_with(self, _: &Ruby) -> Value {
|
91
91
|
let utils = utils();
|
92
92
|
let time_unit = Wrap(self.0.time_unit()).into_value();
|
93
|
-
let time_zone = self.0.time_zone().
|
93
|
+
let time_zone = self.0.time_zone().as_deref().map(|v| v.into_value());
|
94
94
|
let iter = self.0.into_iter().map(|opt_v| {
|
95
95
|
opt_v.map(|v| {
|
96
96
|
utils
|