polars-df 0.9.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +23 -0
- data/Cargo.lock +144 -57
- data/README.md +7 -6
- data/ext/polars/Cargo.toml +10 -6
- data/ext/polars/src/batched_csv.rs +53 -50
- data/ext/polars/src/conversion/anyvalue.rs +3 -2
- data/ext/polars/src/conversion/mod.rs +31 -67
- data/ext/polars/src/dataframe/construction.rs +186 -0
- data/ext/polars/src/dataframe/export.rs +48 -0
- data/ext/polars/src/dataframe/general.rs +607 -0
- data/ext/polars/src/dataframe/io.rs +463 -0
- data/ext/polars/src/dataframe/mod.rs +26 -0
- data/ext/polars/src/expr/array.rs +6 -2
- data/ext/polars/src/expr/datetime.rs +13 -4
- data/ext/polars/src/expr/general.rs +50 -9
- data/ext/polars/src/expr/list.rs +6 -2
- data/ext/polars/src/expr/rolling.rs +185 -69
- data/ext/polars/src/expr/string.rs +12 -33
- data/ext/polars/src/file.rs +158 -11
- data/ext/polars/src/functions/lazy.rs +20 -3
- data/ext/polars/src/functions/range.rs +74 -0
- data/ext/polars/src/functions/whenthen.rs +47 -17
- data/ext/polars/src/interop/mod.rs +1 -0
- data/ext/polars/src/interop/numo/mod.rs +2 -0
- data/ext/polars/src/interop/numo/to_numo_df.rs +23 -0
- data/ext/polars/src/interop/numo/to_numo_series.rs +60 -0
- data/ext/polars/src/lazyframe/mod.rs +111 -56
- data/ext/polars/src/lib.rs +68 -34
- data/ext/polars/src/map/dataframe.rs +17 -9
- data/ext/polars/src/map/lazy.rs +5 -25
- data/ext/polars/src/map/series.rs +7 -1
- data/ext/polars/src/series/aggregation.rs +47 -30
- data/ext/polars/src/series/export.rs +131 -49
- data/ext/polars/src/series/mod.rs +13 -133
- data/lib/polars/array_expr.rb +6 -2
- data/lib/polars/batched_csv_reader.rb +11 -3
- data/lib/polars/convert.rb +6 -1
- data/lib/polars/data_frame.rb +225 -370
- data/lib/polars/date_time_expr.rb +11 -4
- data/lib/polars/date_time_name_space.rb +14 -4
- data/lib/polars/dynamic_group_by.rb +2 -2
- data/lib/polars/exceptions.rb +4 -0
- data/lib/polars/expr.rb +1171 -54
- data/lib/polars/functions/lazy.rb +3 -3
- data/lib/polars/functions/range/date_range.rb +92 -0
- data/lib/polars/functions/range/datetime_range.rb +149 -0
- data/lib/polars/functions/range/time_range.rb +141 -0
- data/lib/polars/functions/whenthen.rb +74 -5
- data/lib/polars/group_by.rb +88 -23
- data/lib/polars/io/avro.rb +24 -0
- data/lib/polars/{io.rb → io/csv.rb} +307 -489
- data/lib/polars/io/database.rb +73 -0
- data/lib/polars/io/ipc.rb +247 -0
- data/lib/polars/io/json.rb +18 -0
- data/lib/polars/io/ndjson.rb +69 -0
- data/lib/polars/io/parquet.rb +226 -0
- data/lib/polars/lazy_frame.rb +55 -195
- data/lib/polars/lazy_group_by.rb +100 -3
- data/lib/polars/list_expr.rb +6 -2
- data/lib/polars/rolling_group_by.rb +2 -2
- data/lib/polars/series.rb +14 -12
- data/lib/polars/string_expr.rb +38 -36
- data/lib/polars/utils.rb +89 -1
- data/lib/polars/version.rb +1 -1
- data/lib/polars/whenthen.rb +83 -0
- data/lib/polars.rb +10 -3
- metadata +23 -8
- data/ext/polars/src/dataframe.rs +0 -1182
- data/lib/polars/when.rb +0 -16
- data/lib/polars/when_then.rb +0 -19
@@ -1,57 +1,139 @@
|
|
1
|
-
use magnus::{
|
1
|
+
use magnus::{value::qnil, IntoValue, RArray, Value};
|
2
2
|
use polars_core::prelude::*;
|
3
3
|
|
4
|
-
use crate::
|
4
|
+
use crate::prelude::*;
|
5
|
+
use crate::RbSeries;
|
5
6
|
|
6
7
|
impl RbSeries {
|
7
|
-
///
|
8
|
-
/// This
|
9
|
-
pub fn
|
10
|
-
let
|
11
|
-
match s.dtype() {
|
12
|
-
DataType::String => {
|
13
|
-
let ca = s.str().unwrap();
|
8
|
+
/// Convert this Series to a Ruby array.
|
9
|
+
/// This operation copies data.
|
10
|
+
pub fn to_a(&self) -> Value {
|
11
|
+
let series = &self.series.borrow();
|
14
12
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
.
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
13
|
+
fn to_a_recursive(series: &Series) -> Value {
|
14
|
+
let rblist = match series.dtype() {
|
15
|
+
DataType::Boolean => RArray::from_iter(series.bool().unwrap()).into_value(),
|
16
|
+
DataType::UInt8 => RArray::from_iter(series.u8().unwrap()).into_value(),
|
17
|
+
DataType::UInt16 => RArray::from_iter(series.u16().unwrap()).into_value(),
|
18
|
+
DataType::UInt32 => RArray::from_iter(series.u32().unwrap()).into_value(),
|
19
|
+
DataType::UInt64 => RArray::from_iter(series.u64().unwrap()).into_value(),
|
20
|
+
DataType::Int8 => RArray::from_iter(series.i8().unwrap()).into_value(),
|
21
|
+
DataType::Int16 => RArray::from_iter(series.i16().unwrap()).into_value(),
|
22
|
+
DataType::Int32 => RArray::from_iter(series.i32().unwrap()).into_value(),
|
23
|
+
DataType::Int64 => RArray::from_iter(series.i64().unwrap()).into_value(),
|
24
|
+
DataType::Float32 => RArray::from_iter(series.f32().unwrap()).into_value(),
|
25
|
+
DataType::Float64 => RArray::from_iter(series.f64().unwrap()).into_value(),
|
26
|
+
DataType::Categorical(_, _) | DataType::Enum(_, _) => {
|
27
|
+
RArray::from_iter(series.categorical().unwrap().iter_str()).into_value()
|
28
|
+
}
|
29
|
+
DataType::Object(_, _) => {
|
30
|
+
let v = RArray::with_capacity(series.len());
|
31
|
+
for i in 0..series.len() {
|
32
|
+
let obj: Option<&ObjectValue> = series.get_object(i).map(|any| any.into());
|
33
|
+
match obj {
|
34
|
+
Some(val) => v.push(val.to_object()).unwrap(),
|
35
|
+
None => v.push(qnil()).unwrap(),
|
36
|
+
};
|
37
|
+
}
|
38
|
+
v.into_value()
|
39
|
+
}
|
40
|
+
DataType::List(_) => {
|
41
|
+
let v = RArray::new();
|
42
|
+
let ca = series.list().unwrap();
|
43
|
+
for opt_s in unsafe { ca.amortized_iter() } {
|
44
|
+
match opt_s {
|
45
|
+
None => {
|
46
|
+
v.push(qnil()).unwrap();
|
47
|
+
}
|
48
|
+
Some(s) => {
|
49
|
+
let rblst = to_a_recursive(s.as_ref());
|
50
|
+
v.push(rblst).unwrap();
|
51
|
+
}
|
52
|
+
}
|
53
|
+
}
|
54
|
+
v.into_value()
|
55
|
+
}
|
56
|
+
DataType::Array(_, _) => {
|
57
|
+
let v = RArray::new();
|
58
|
+
let ca = series.array().unwrap();
|
59
|
+
for opt_s in ca.amortized_iter() {
|
60
|
+
match opt_s {
|
61
|
+
None => {
|
62
|
+
v.push(qnil()).unwrap();
|
63
|
+
}
|
64
|
+
Some(s) => {
|
65
|
+
let rblst = to_a_recursive(s.as_ref());
|
66
|
+
v.push(rblst).unwrap();
|
67
|
+
}
|
68
|
+
}
|
69
|
+
}
|
70
|
+
v.into_value()
|
71
|
+
}
|
72
|
+
DataType::Date => {
|
73
|
+
let ca = series.date().unwrap();
|
74
|
+
return Wrap(ca).into_value();
|
75
|
+
}
|
76
|
+
DataType::Time => {
|
77
|
+
let ca = series.time().unwrap();
|
78
|
+
return Wrap(ca).into_value();
|
79
|
+
}
|
80
|
+
DataType::Datetime(_, _) => {
|
81
|
+
let ca = series.datetime().unwrap();
|
82
|
+
return Wrap(ca).into_value();
|
83
|
+
}
|
84
|
+
DataType::Decimal(_, _) => {
|
85
|
+
let ca = series.decimal().unwrap();
|
86
|
+
return Wrap(ca).into_value();
|
87
|
+
}
|
88
|
+
DataType::String => {
|
89
|
+
let ca = series.str().unwrap();
|
90
|
+
return Wrap(ca).into_value();
|
91
|
+
}
|
92
|
+
DataType::Struct(_) => {
|
93
|
+
let ca = series.struct_().unwrap();
|
94
|
+
return Wrap(ca).into_value();
|
95
|
+
}
|
96
|
+
DataType::Duration(_) => {
|
97
|
+
let ca = series.duration().unwrap();
|
98
|
+
return Wrap(ca).into_value();
|
99
|
+
}
|
100
|
+
DataType::Binary => {
|
101
|
+
let ca = series.binary().unwrap();
|
102
|
+
return Wrap(ca).into_value();
|
103
|
+
}
|
104
|
+
DataType::Null => {
|
105
|
+
let null: Option<u8> = None;
|
106
|
+
let n = series.len();
|
107
|
+
let iter = std::iter::repeat(null).take(n);
|
108
|
+
use std::iter::{Repeat, Take};
|
109
|
+
struct NullIter {
|
110
|
+
iter: Take<Repeat<Option<u8>>>,
|
111
|
+
n: usize,
|
112
|
+
}
|
113
|
+
impl Iterator for NullIter {
|
114
|
+
type Item = Option<u8>;
|
115
|
+
|
116
|
+
fn next(&mut self) -> Option<Self::Item> {
|
117
|
+
self.iter.next()
|
118
|
+
}
|
119
|
+
fn size_hint(&self) -> (usize, Option<usize>) {
|
120
|
+
(self.n, Some(self.n))
|
121
|
+
}
|
122
|
+
}
|
123
|
+
impl ExactSizeIterator for NullIter {}
|
124
|
+
|
125
|
+
RArray::from_iter(NullIter { iter, n }).into_value()
|
126
|
+
}
|
127
|
+
DataType::Unknown(_) => {
|
128
|
+
panic!("to_a not implemented for unknown")
|
129
|
+
}
|
130
|
+
DataType::BinaryOffset => {
|
131
|
+
unreachable!()
|
132
|
+
}
|
133
|
+
};
|
134
|
+
rblist
|
55
135
|
}
|
136
|
+
|
137
|
+
to_a_recursive(series)
|
56
138
|
}
|
57
139
|
}
|
@@ -5,7 +5,7 @@ mod construction;
|
|
5
5
|
mod export;
|
6
6
|
mod scatter;
|
7
7
|
|
8
|
-
use magnus::{exception, prelude::*,
|
8
|
+
use magnus::{exception, prelude::*, Error, IntoValue, RArray, Value};
|
9
9
|
use polars::prelude::*;
|
10
10
|
use polars::series::IsSorted;
|
11
11
|
use std::cell::RefCell;
|
@@ -233,8 +233,18 @@ impl RbSeries {
|
|
233
233
|
}
|
234
234
|
}
|
235
235
|
|
236
|
-
pub fn sort(&self, descending: bool, nulls_last: bool) -> Self {
|
237
|
-
(self
|
236
|
+
pub fn sort(&self, descending: bool, nulls_last: bool, multithreaded: bool) -> RbResult<Self> {
|
237
|
+
Ok(self
|
238
|
+
.series
|
239
|
+
.borrow_mut()
|
240
|
+
.sort(
|
241
|
+
SortOptions::default()
|
242
|
+
.with_order_descending(descending)
|
243
|
+
.with_nulls_last(nulls_last)
|
244
|
+
.with_multithreaded(multithreaded),
|
245
|
+
)
|
246
|
+
.map_err(RbPolarsErr::from)?
|
247
|
+
.into())
|
238
248
|
}
|
239
249
|
|
240
250
|
pub fn value_counts(&self, sorted: bool) -> RbResult<RbDataFrame> {
|
@@ -315,136 +325,6 @@ impl RbSeries {
|
|
315
325
|
self.series.borrow().len()
|
316
326
|
}
|
317
327
|
|
318
|
-
pub fn to_a(&self) -> Value {
|
319
|
-
let series = &self.series.borrow();
|
320
|
-
|
321
|
-
fn to_a_recursive(series: &Series) -> Value {
|
322
|
-
let rblist = match series.dtype() {
|
323
|
-
DataType::Boolean => RArray::from_iter(series.bool().unwrap()).into_value(),
|
324
|
-
DataType::UInt8 => RArray::from_iter(series.u8().unwrap()).into_value(),
|
325
|
-
DataType::UInt16 => RArray::from_iter(series.u16().unwrap()).into_value(),
|
326
|
-
DataType::UInt32 => RArray::from_iter(series.u32().unwrap()).into_value(),
|
327
|
-
DataType::UInt64 => RArray::from_iter(series.u64().unwrap()).into_value(),
|
328
|
-
DataType::Int8 => RArray::from_iter(series.i8().unwrap()).into_value(),
|
329
|
-
DataType::Int16 => RArray::from_iter(series.i16().unwrap()).into_value(),
|
330
|
-
DataType::Int32 => RArray::from_iter(series.i32().unwrap()).into_value(),
|
331
|
-
DataType::Int64 => RArray::from_iter(series.i64().unwrap()).into_value(),
|
332
|
-
DataType::Float32 => RArray::from_iter(series.f32().unwrap()).into_value(),
|
333
|
-
DataType::Float64 => RArray::from_iter(series.f64().unwrap()).into_value(),
|
334
|
-
DataType::Categorical(_, _) | DataType::Enum(_, _) => {
|
335
|
-
RArray::from_iter(series.categorical().unwrap().iter_str()).into_value()
|
336
|
-
}
|
337
|
-
DataType::Object(_, _) => {
|
338
|
-
let v = RArray::with_capacity(series.len());
|
339
|
-
for i in 0..series.len() {
|
340
|
-
let obj: Option<&ObjectValue> = series.get_object(i).map(|any| any.into());
|
341
|
-
match obj {
|
342
|
-
Some(val) => v.push(val.to_object()).unwrap(),
|
343
|
-
None => v.push(qnil()).unwrap(),
|
344
|
-
};
|
345
|
-
}
|
346
|
-
v.into_value()
|
347
|
-
}
|
348
|
-
DataType::List(_) => {
|
349
|
-
let v = RArray::new();
|
350
|
-
let ca = series.list().unwrap();
|
351
|
-
for opt_s in unsafe { ca.amortized_iter() } {
|
352
|
-
match opt_s {
|
353
|
-
None => {
|
354
|
-
v.push(qnil()).unwrap();
|
355
|
-
}
|
356
|
-
Some(s) => {
|
357
|
-
let rblst = to_a_recursive(s.as_ref());
|
358
|
-
v.push(rblst).unwrap();
|
359
|
-
}
|
360
|
-
}
|
361
|
-
}
|
362
|
-
v.into_value()
|
363
|
-
}
|
364
|
-
DataType::Array(_, _) => {
|
365
|
-
let v = RArray::new();
|
366
|
-
let ca = series.array().unwrap();
|
367
|
-
for opt_s in ca.amortized_iter() {
|
368
|
-
match opt_s {
|
369
|
-
None => {
|
370
|
-
v.push(qnil()).unwrap();
|
371
|
-
}
|
372
|
-
Some(s) => {
|
373
|
-
let rblst = to_a_recursive(s.as_ref());
|
374
|
-
v.push(rblst).unwrap();
|
375
|
-
}
|
376
|
-
}
|
377
|
-
}
|
378
|
-
v.into_value()
|
379
|
-
}
|
380
|
-
DataType::Date => {
|
381
|
-
let ca = series.date().unwrap();
|
382
|
-
return Wrap(ca).into_value();
|
383
|
-
}
|
384
|
-
DataType::Time => {
|
385
|
-
let ca = series.time().unwrap();
|
386
|
-
return Wrap(ca).into_value();
|
387
|
-
}
|
388
|
-
DataType::Datetime(_, _) => {
|
389
|
-
let ca = series.datetime().unwrap();
|
390
|
-
return Wrap(ca).into_value();
|
391
|
-
}
|
392
|
-
DataType::Decimal(_, _) => {
|
393
|
-
let ca = series.decimal().unwrap();
|
394
|
-
return Wrap(ca).into_value();
|
395
|
-
}
|
396
|
-
DataType::String => {
|
397
|
-
let ca = series.str().unwrap();
|
398
|
-
return Wrap(ca).into_value();
|
399
|
-
}
|
400
|
-
DataType::Struct(_) => {
|
401
|
-
let ca = series.struct_().unwrap();
|
402
|
-
return Wrap(ca).into_value();
|
403
|
-
}
|
404
|
-
DataType::Duration(_) => {
|
405
|
-
let ca = series.duration().unwrap();
|
406
|
-
return Wrap(ca).into_value();
|
407
|
-
}
|
408
|
-
DataType::Binary => {
|
409
|
-
let ca = series.binary().unwrap();
|
410
|
-
return Wrap(ca).into_value();
|
411
|
-
}
|
412
|
-
DataType::Null => {
|
413
|
-
let null: Option<u8> = None;
|
414
|
-
let n = series.len();
|
415
|
-
let iter = std::iter::repeat(null).take(n);
|
416
|
-
use std::iter::{Repeat, Take};
|
417
|
-
struct NullIter {
|
418
|
-
iter: Take<Repeat<Option<u8>>>,
|
419
|
-
n: usize,
|
420
|
-
}
|
421
|
-
impl Iterator for NullIter {
|
422
|
-
type Item = Option<u8>;
|
423
|
-
|
424
|
-
fn next(&mut self) -> Option<Self::Item> {
|
425
|
-
self.iter.next()
|
426
|
-
}
|
427
|
-
fn size_hint(&self) -> (usize, Option<usize>) {
|
428
|
-
(self.n, Some(self.n))
|
429
|
-
}
|
430
|
-
}
|
431
|
-
impl ExactSizeIterator for NullIter {}
|
432
|
-
|
433
|
-
RArray::from_iter(NullIter { iter, n }).into_value()
|
434
|
-
}
|
435
|
-
DataType::Unknown => {
|
436
|
-
panic!("to_a not implemented for unknown")
|
437
|
-
}
|
438
|
-
DataType::BinaryOffset => {
|
439
|
-
unreachable!()
|
440
|
-
}
|
441
|
-
};
|
442
|
-
rblist
|
443
|
-
}
|
444
|
-
|
445
|
-
to_a_recursive(series)
|
446
|
-
}
|
447
|
-
|
448
328
|
pub fn clone(&self) -> Self {
|
449
329
|
RbSeries::new(self.series.borrow().clone())
|
450
330
|
}
|
data/lib/polars/array_expr.rb
CHANGED
@@ -333,6 +333,10 @@ module Polars
|
|
333
333
|
#
|
334
334
|
# @param index [Integer]
|
335
335
|
# Index to return per sub-array
|
336
|
+
# @param null_on_oob [Boolean]
|
337
|
+
# Behavior if an index is out of bounds:
|
338
|
+
# true -> set as null
|
339
|
+
# false -> raise an error
|
336
340
|
#
|
337
341
|
# @return [Expr]
|
338
342
|
#
|
@@ -353,9 +357,9 @@ module Polars
|
|
353
357
|
# # │ [4, 5, 6] ┆ -2 ┆ 5 │
|
354
358
|
# # │ [7, 8, 9] ┆ 4 ┆ null │
|
355
359
|
# # └───────────────┴─────┴──────┘
|
356
|
-
def get(index)
|
360
|
+
def get(index, null_on_oob: true)
|
357
361
|
index = Utils.parse_as_expression(index)
|
358
|
-
Utils.wrap_expr(_rbexpr.arr_get(index))
|
362
|
+
Utils.wrap_expr(_rbexpr.arr_get(index, null_on_oob))
|
359
363
|
end
|
360
364
|
|
361
365
|
# Get the first value of the sub-arrays.
|
@@ -13,6 +13,7 @@ module Polars
|
|
13
13
|
skip_rows: 0,
|
14
14
|
dtypes: nil,
|
15
15
|
null_values: nil,
|
16
|
+
missing_utf8_is_empty_string: false,
|
16
17
|
ignore_errors: false,
|
17
18
|
parse_dates: false,
|
18
19
|
n_threads: nil,
|
@@ -27,10 +28,13 @@ module Polars
|
|
27
28
|
row_count_offset: 0,
|
28
29
|
sample_size: 1024,
|
29
30
|
eol_char: "\n",
|
30
|
-
new_columns: nil
|
31
|
+
new_columns: nil,
|
32
|
+
raise_if_empty: true,
|
33
|
+
truncate_ragged_lines: false,
|
34
|
+
decimal_comma: false
|
31
35
|
)
|
32
36
|
if Utils.pathlike?(file)
|
33
|
-
path = Utils.
|
37
|
+
path = Utils.normalize_filepath(file)
|
34
38
|
end
|
35
39
|
|
36
40
|
dtype_list = nil
|
@@ -71,11 +75,15 @@ module Polars
|
|
71
75
|
comment_char,
|
72
76
|
quote_char,
|
73
77
|
processed_null_values,
|
78
|
+
missing_utf8_is_empty_string,
|
74
79
|
parse_dates,
|
75
80
|
skip_rows_after_header,
|
76
81
|
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
77
82
|
sample_size,
|
78
|
-
eol_char
|
83
|
+
eol_char,
|
84
|
+
raise_if_empty,
|
85
|
+
truncate_ragged_lines,
|
86
|
+
decimal_comma
|
79
87
|
)
|
80
88
|
self.new_columns = new_columns
|
81
89
|
end
|
data/lib/polars/convert.rb
CHANGED
@@ -27,7 +27,12 @@ module Polars
|
|
27
27
|
# # │ 2 ┆ 4 │
|
28
28
|
# # └─────┴─────┘
|
29
29
|
def from_hash(data, schema: nil, columns: nil)
|
30
|
-
|
30
|
+
Utils.wrap_df(
|
31
|
+
DataFrame.hash_to_rbdf(
|
32
|
+
data,
|
33
|
+
schema: schema || columns
|
34
|
+
)
|
35
|
+
)
|
31
36
|
end
|
32
37
|
|
33
38
|
# Construct a DataFrame from a sequence of dictionaries. This operation clones data.
|