polars-df 0.10.0 → 0.11.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/Cargo.lock +90 -48
- data/README.md +6 -6
- data/ext/polars/Cargo.toml +7 -5
- data/ext/polars/src/batched_csv.rs +53 -52
- data/ext/polars/src/conversion/mod.rs +13 -60
- data/ext/polars/src/dataframe/construction.rs +186 -0
- data/ext/polars/src/dataframe/export.rs +48 -0
- data/ext/polars/src/dataframe/general.rs +607 -0
- data/ext/polars/src/dataframe/io.rs +463 -0
- data/ext/polars/src/dataframe/mod.rs +26 -0
- data/ext/polars/src/expr/datetime.rs +6 -2
- data/ext/polars/src/expr/general.rs +28 -6
- data/ext/polars/src/expr/rolling.rs +185 -69
- data/ext/polars/src/expr/string.rs +9 -30
- data/ext/polars/src/functions/lazy.rs +2 -0
- data/ext/polars/src/functions/range.rs +74 -0
- data/ext/polars/src/interop/mod.rs +1 -0
- data/ext/polars/src/interop/numo/mod.rs +2 -0
- data/ext/polars/src/interop/numo/to_numo_df.rs +23 -0
- data/ext/polars/src/interop/numo/to_numo_series.rs +60 -0
- data/ext/polars/src/lazyframe/mod.rs +54 -38
- data/ext/polars/src/lib.rs +46 -21
- data/ext/polars/src/map/lazy.rs +5 -25
- data/ext/polars/src/map/series.rs +7 -1
- data/ext/polars/src/series/aggregation.rs +47 -30
- data/ext/polars/src/series/export.rs +131 -49
- data/ext/polars/src/series/mod.rs +1 -131
- data/lib/polars/batched_csv_reader.rb +9 -3
- data/lib/polars/convert.rb +6 -1
- data/lib/polars/data_frame.rb +83 -302
- data/lib/polars/date_time_expr.rb +1 -0
- data/lib/polars/date_time_name_space.rb +5 -1
- data/lib/polars/dynamic_group_by.rb +2 -2
- data/lib/polars/exceptions.rb +4 -0
- data/lib/polars/expr.rb +1134 -20
- data/lib/polars/functions/range/date_range.rb +92 -0
- data/lib/polars/functions/range/datetime_range.rb +149 -0
- data/lib/polars/functions/range/time_range.rb +141 -0
- data/lib/polars/group_by.rb +88 -23
- data/lib/polars/io/avro.rb +24 -0
- data/lib/polars/{io.rb → io/csv.rb} +296 -490
- data/lib/polars/io/database.rb +73 -0
- data/lib/polars/io/ipc.rb +247 -0
- data/lib/polars/io/json.rb +18 -0
- data/lib/polars/io/ndjson.rb +69 -0
- data/lib/polars/io/parquet.rb +226 -0
- data/lib/polars/lazy_frame.rb +23 -166
- data/lib/polars/lazy_group_by.rb +100 -3
- data/lib/polars/rolling_group_by.rb +2 -2
- data/lib/polars/series.rb +2 -2
- data/lib/polars/string_expr.rb +37 -36
- data/lib/polars/utils.rb +35 -1
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +9 -1
- metadata +21 -5
- data/ext/polars/src/dataframe.rs +0 -1208
@@ -7,7 +7,7 @@ use std::num::NonZeroUsize;
|
|
7
7
|
|
8
8
|
use magnus::{
|
9
9
|
class, exception, prelude::*, r_hash::ForEach, value::Opaque, IntoValue, Module, RArray, RHash,
|
10
|
-
Ruby,
|
10
|
+
Ruby, TryConvert, Value,
|
11
11
|
};
|
12
12
|
use polars::chunked_array::object::PolarsObjectSafe;
|
13
13
|
use polars::chunked_array::ops::{FillNullLimit, FillNullStrategy};
|
@@ -18,6 +18,7 @@ use polars::io::avro::AvroCompression;
|
|
18
18
|
use polars::prelude::*;
|
19
19
|
use polars::series::ops::NullBehavior;
|
20
20
|
use polars_core::utils::arrow::array::Array;
|
21
|
+
use polars_core::utils::materialize_dyn_int;
|
21
22
|
use polars_utils::total_ord::{TotalEq, TotalHash};
|
22
23
|
use smartstring::alias::String as SmartString;
|
23
24
|
|
@@ -154,7 +155,7 @@ impl IntoValue for Wrap<DataType> {
|
|
154
155
|
let class = pl.const_get::<_, Value>("Float32").unwrap();
|
155
156
|
class.funcall("new", ()).unwrap()
|
156
157
|
}
|
157
|
-
DataType::Float64 => {
|
158
|
+
DataType::Float64 | DataType::Unknown(UnknownKind::Float) => {
|
158
159
|
let class = pl.const_get::<_, Value>("Float64").unwrap();
|
159
160
|
class.funcall("new", ()).unwrap()
|
160
161
|
}
|
@@ -168,7 +169,7 @@ impl IntoValue for Wrap<DataType> {
|
|
168
169
|
let class = pl.const_get::<_, Value>("Boolean").unwrap();
|
169
170
|
class.funcall("new", ()).unwrap()
|
170
171
|
}
|
171
|
-
DataType::String => {
|
172
|
+
DataType::String | DataType::Unknown(UnknownKind::Str) => {
|
172
173
|
let class = pl.const_get::<_, Value>("String").unwrap();
|
173
174
|
class.funcall("new", ()).unwrap()
|
174
175
|
}
|
@@ -242,7 +243,10 @@ impl IntoValue for Wrap<DataType> {
|
|
242
243
|
let class = pl.const_get::<_, Value>("Null").unwrap();
|
243
244
|
class.funcall("new", ()).unwrap()
|
244
245
|
}
|
245
|
-
DataType::Unknown => {
|
246
|
+
DataType::Unknown(UnknownKind::Int(v)) => {
|
247
|
+
Wrap(materialize_dyn_int(v).dtype()).into_value()
|
248
|
+
}
|
249
|
+
DataType::Unknown(_) => {
|
246
250
|
let class = pl.const_get::<_, Value>("Unknown").unwrap();
|
247
251
|
class.funcall("new", ()).unwrap()
|
248
252
|
}
|
@@ -310,7 +314,7 @@ impl TryConvert for Wrap<DataType> {
|
|
310
314
|
"Polars::Object" => DataType::Object(OBJECT_NAME, None),
|
311
315
|
"Polars::List" => DataType::List(Box::new(DataType::Null)),
|
312
316
|
"Polars::Null" => DataType::Null,
|
313
|
-
"Polars::Unknown" => DataType::Unknown,
|
317
|
+
"Polars::Unknown" => DataType::Unknown(Default::default()),
|
314
318
|
dt => {
|
315
319
|
return Err(RbValueError::new_err(format!(
|
316
320
|
"{dt} is not a correct polars DataType.",
|
@@ -350,7 +354,7 @@ impl TryConvert for Wrap<DataType> {
|
|
350
354
|
"Polars::Float32" => DataType::Float32,
|
351
355
|
"Polars::Float64" => DataType::Float64,
|
352
356
|
"Polars::Null" => DataType::Null,
|
353
|
-
"Polars::Unknown" => DataType::Unknown,
|
357
|
+
"Polars::Unknown" => DataType::Unknown(Default::default()),
|
354
358
|
"Polars::Duration" => {
|
355
359
|
let time_unit: Value = ob.funcall("time_unit", ()).unwrap();
|
356
360
|
let time_unit = Wrap::<TimeUnit>::try_convert(time_unit)?.0;
|
@@ -410,7 +414,7 @@ impl TryConvert for Wrap<DataType> {
|
|
410
414
|
"obj" => DataType::Object(OBJECT_NAME, None),
|
411
415
|
"list" => DataType::List(Box::new(DataType::Boolean)),
|
412
416
|
"null" => DataType::Null,
|
413
|
-
"unk" => DataType::Unknown,
|
417
|
+
"unk" => DataType::Unknown(Default::default()),
|
414
418
|
_ => {
|
415
419
|
return Err(RbValueError::new_err(format!(
|
416
420
|
"{} is not a supported DataType.",
|
@@ -546,57 +550,6 @@ impl Default for ObjectValue {
|
|
546
550
|
}
|
547
551
|
}
|
548
552
|
|
549
|
-
pub(crate) fn dicts_to_rows(
|
550
|
-
records: &Value,
|
551
|
-
infer_schema_len: Option<usize>,
|
552
|
-
schema_columns: PlIndexSet<String>,
|
553
|
-
) -> RbResult<(Vec<Row>, Vec<String>)> {
|
554
|
-
let infer_schema_len = infer_schema_len.map(|n| std::cmp::max(1, n));
|
555
|
-
let (dicts, len) = get_rbseq(*records)?;
|
556
|
-
|
557
|
-
let key_names = {
|
558
|
-
if !schema_columns.is_empty() {
|
559
|
-
schema_columns
|
560
|
-
} else {
|
561
|
-
let mut inferred_keys = PlIndexSet::new();
|
562
|
-
for d in dicts.each().take(infer_schema_len.unwrap_or(usize::MAX)) {
|
563
|
-
let d = d?;
|
564
|
-
let d = RHash::try_convert(d)?;
|
565
|
-
|
566
|
-
d.foreach(|name: Value, _value: Value| {
|
567
|
-
if let Some(v) = Symbol::from_value(name) {
|
568
|
-
inferred_keys.insert(v.name()?.into());
|
569
|
-
} else {
|
570
|
-
inferred_keys.insert(String::try_convert(name)?);
|
571
|
-
};
|
572
|
-
Ok(ForEach::Continue)
|
573
|
-
})?;
|
574
|
-
}
|
575
|
-
inferred_keys
|
576
|
-
}
|
577
|
-
};
|
578
|
-
|
579
|
-
let mut rows = Vec::with_capacity(len);
|
580
|
-
|
581
|
-
for d in dicts.each() {
|
582
|
-
let d = d?;
|
583
|
-
let d = RHash::try_convert(d)?;
|
584
|
-
|
585
|
-
let mut row = Vec::with_capacity(key_names.len());
|
586
|
-
|
587
|
-
for k in key_names.iter() {
|
588
|
-
// TODO improve performance
|
589
|
-
let val = match d.get(k.clone()).or_else(|| d.get(Symbol::new(k))) {
|
590
|
-
None => AnyValue::Null,
|
591
|
-
Some(val) => Wrap::<AnyValue>::try_convert(val)?.0,
|
592
|
-
};
|
593
|
-
row.push(val)
|
594
|
-
}
|
595
|
-
rows.push(Row(row))
|
596
|
-
}
|
597
|
-
Ok((rows, key_names.into_iter().collect()))
|
598
|
-
}
|
599
|
-
|
600
553
|
impl TryConvert for Wrap<AsofStrategy> {
|
601
554
|
fn try_convert(ob: Value) -> RbResult<Self> {
|
602
555
|
let parsed = match String::try_convert(ob)?.as_str() {
|
@@ -733,8 +686,8 @@ impl TryConvert for Wrap<JoinType> {
|
|
733
686
|
let parsed = match String::try_convert(ob)?.as_str() {
|
734
687
|
"inner" => JoinType::Inner,
|
735
688
|
"left" => JoinType::Left,
|
736
|
-
"outer" => JoinType::Outer
|
737
|
-
"outer_coalesce" => JoinType::Outer
|
689
|
+
"outer" => JoinType::Outer,
|
690
|
+
"outer_coalesce" => JoinType::Outer,
|
738
691
|
"semi" => JoinType::Semi,
|
739
692
|
"anti" => JoinType::Anti,
|
740
693
|
"cross" => JoinType::Cross,
|
@@ -0,0 +1,186 @@
|
|
1
|
+
use magnus::{prelude::*, r_hash::ForEach, RArray, RHash, Symbol, Value};
|
2
|
+
use polars::frame::row::{rows_to_schema_supertypes, rows_to_supertypes, Row};
|
3
|
+
use polars::prelude::*;
|
4
|
+
|
5
|
+
use super::*;
|
6
|
+
use crate::conversion::*;
|
7
|
+
use crate::{RbPolarsErr, RbResult};
|
8
|
+
|
9
|
+
impl RbDataFrame {
|
10
|
+
pub fn from_rows(
|
11
|
+
rb_rows: RArray,
|
12
|
+
infer_schema_length: Option<usize>,
|
13
|
+
schema: Option<Wrap<Schema>>,
|
14
|
+
) -> RbResult<Self> {
|
15
|
+
let mut data = Vec::with_capacity(rb_rows.len());
|
16
|
+
for v in rb_rows.each() {
|
17
|
+
let rb_row = RArray::try_convert(v?)?;
|
18
|
+
let mut row = Vec::with_capacity(rb_row.len());
|
19
|
+
for val in rb_row.each() {
|
20
|
+
row.push(Wrap::<AnyValue>::try_convert(val?)?.0);
|
21
|
+
}
|
22
|
+
data.push(Row(row));
|
23
|
+
}
|
24
|
+
let schema = schema.map(|wrap| wrap.0);
|
25
|
+
finish_from_rows(data, schema, None, infer_schema_length)
|
26
|
+
}
|
27
|
+
|
28
|
+
pub fn from_hashes(
|
29
|
+
data: Value,
|
30
|
+
schema: Option<Wrap<Schema>>,
|
31
|
+
schema_overrides: Option<Wrap<Schema>>,
|
32
|
+
strict: bool,
|
33
|
+
infer_schema_length: Option<usize>,
|
34
|
+
) -> RbResult<Self> {
|
35
|
+
let schema = schema.map(|wrap| wrap.0);
|
36
|
+
let schema_overrides = schema_overrides.map(|wrap| wrap.0);
|
37
|
+
|
38
|
+
let names = get_schema_names(&data, schema.as_ref(), infer_schema_length)?;
|
39
|
+
let rows = dicts_to_rows(&data, &names, strict)?;
|
40
|
+
|
41
|
+
let schema = schema.or_else(|| {
|
42
|
+
Some(columns_names_to_empty_schema(
|
43
|
+
names.iter().map(String::as_str),
|
44
|
+
))
|
45
|
+
});
|
46
|
+
|
47
|
+
finish_from_rows(rows, schema, schema_overrides, infer_schema_length)
|
48
|
+
}
|
49
|
+
}
|
50
|
+
|
51
|
+
fn finish_from_rows(
|
52
|
+
rows: Vec<Row>,
|
53
|
+
schema: Option<Schema>,
|
54
|
+
schema_overrides: Option<Schema>,
|
55
|
+
infer_schema_length: Option<usize>,
|
56
|
+
) -> RbResult<RbDataFrame> {
|
57
|
+
// Object builder must be registered
|
58
|
+
crate::on_startup::register_object_builder();
|
59
|
+
|
60
|
+
let mut schema = if let Some(mut schema) = schema {
|
61
|
+
resolve_schema_overrides(&mut schema, schema_overrides);
|
62
|
+
update_schema_from_rows(&mut schema, &rows, infer_schema_length)?;
|
63
|
+
schema
|
64
|
+
} else {
|
65
|
+
rows_to_schema_supertypes(&rows, infer_schema_length).map_err(RbPolarsErr::from)?
|
66
|
+
};
|
67
|
+
|
68
|
+
// TODO: Remove this step when Decimals are supported properly.
|
69
|
+
// Erasing the decimal precision/scale here will just require us to infer it again later.
|
70
|
+
// https://github.com/pola-rs/polars/issues/14427
|
71
|
+
erase_decimal_precision_scale(&mut schema);
|
72
|
+
|
73
|
+
let df = DataFrame::from_rows_and_schema(&rows, &schema).map_err(RbPolarsErr::from)?;
|
74
|
+
Ok(df.into())
|
75
|
+
}
|
76
|
+
|
77
|
+
fn update_schema_from_rows(
|
78
|
+
schema: &mut Schema,
|
79
|
+
rows: &[Row],
|
80
|
+
infer_schema_length: Option<usize>,
|
81
|
+
) -> RbResult<()> {
|
82
|
+
let schema_is_complete = schema.iter_dtypes().all(|dtype| dtype.is_known());
|
83
|
+
if schema_is_complete {
|
84
|
+
return Ok(());
|
85
|
+
}
|
86
|
+
|
87
|
+
// TODO: Only infer dtypes for columns with an unknown dtype
|
88
|
+
let inferred_dtypes =
|
89
|
+
rows_to_supertypes(rows, infer_schema_length).map_err(RbPolarsErr::from)?;
|
90
|
+
let inferred_dtypes_slice = inferred_dtypes.as_slice();
|
91
|
+
|
92
|
+
for (i, dtype) in schema.iter_dtypes_mut().enumerate() {
|
93
|
+
if !dtype.is_known() {
|
94
|
+
*dtype = inferred_dtypes_slice.get(i).ok_or_else(|| {
|
95
|
+
polars_err!(SchemaMismatch: "the number of columns in the schema does not match the data")
|
96
|
+
})
|
97
|
+
.map_err(RbPolarsErr::from)?
|
98
|
+
.clone();
|
99
|
+
}
|
100
|
+
}
|
101
|
+
Ok(())
|
102
|
+
}
|
103
|
+
|
104
|
+
/// Applies each `(name, dtype)` pair from `schema_overrides` to `schema` via `set_dtype`.
///
/// A `None` override set leaves `schema` untouched. The value returned by `set_dtype`
/// is ignored.
fn resolve_schema_overrides(schema: &mut Schema, schema_overrides: Option<Schema>) {
    let Some(overrides) = schema_overrides else {
        return;
    };
    overrides.into_iter().for_each(|(name, dtype)| {
        schema.set_dtype(name.as_str(), dtype);
    });
}
|
111
|
+
|
112
|
+
fn erase_decimal_precision_scale(schema: &mut Schema) {
|
113
|
+
for dtype in schema.iter_dtypes_mut() {
|
114
|
+
if let DataType::Decimal(_, _) = dtype {
|
115
|
+
*dtype = DataType::Decimal(None, None)
|
116
|
+
}
|
117
|
+
}
|
118
|
+
}
|
119
|
+
|
120
|
+
fn columns_names_to_empty_schema<'a, I>(column_names: I) -> Schema
|
121
|
+
where
|
122
|
+
I: IntoIterator<Item = &'a str>,
|
123
|
+
{
|
124
|
+
let fields = column_names
|
125
|
+
.into_iter()
|
126
|
+
.map(|c| Field::new(c, DataType::Unknown(Default::default())));
|
127
|
+
Schema::from_iter(fields)
|
128
|
+
}
|
129
|
+
|
130
|
+
fn dicts_to_rows<'a>(data: &Value, names: &'a [String], _strict: bool) -> RbResult<Vec<Row<'a>>> {
|
131
|
+
let (data, len) = get_rbseq(*data)?;
|
132
|
+
let mut rows = Vec::with_capacity(len);
|
133
|
+
for d in data.each() {
|
134
|
+
let d = d?;
|
135
|
+
let d = RHash::try_convert(d)?;
|
136
|
+
|
137
|
+
let mut row = Vec::with_capacity(names.len());
|
138
|
+
for k in names.iter() {
|
139
|
+
// TODO improve performance
|
140
|
+
let val = match d.get(k.clone()).or_else(|| d.get(Symbol::new(k))) {
|
141
|
+
None => AnyValue::Null,
|
142
|
+
Some(val) => Wrap::<AnyValue>::try_convert(val)?.0,
|
143
|
+
};
|
144
|
+
row.push(val)
|
145
|
+
}
|
146
|
+
rows.push(Row(row))
|
147
|
+
}
|
148
|
+
Ok(rows)
|
149
|
+
}
|
150
|
+
|
151
|
+
fn get_schema_names(
|
152
|
+
data: &Value,
|
153
|
+
schema: Option<&Schema>,
|
154
|
+
infer_schema_length: Option<usize>,
|
155
|
+
) -> RbResult<Vec<String>> {
|
156
|
+
if let Some(schema) = schema {
|
157
|
+
Ok(schema.iter_names().map(|n| n.to_string()).collect())
|
158
|
+
} else {
|
159
|
+
infer_schema_names_from_data(data, infer_schema_length)
|
160
|
+
}
|
161
|
+
}
|
162
|
+
|
163
|
+
fn infer_schema_names_from_data(
|
164
|
+
data: &Value,
|
165
|
+
infer_schema_length: Option<usize>,
|
166
|
+
) -> RbResult<Vec<String>> {
|
167
|
+
let (data, data_len) = get_rbseq(*data)?;
|
168
|
+
let infer_schema_length = infer_schema_length
|
169
|
+
.map(|n| std::cmp::max(1, n))
|
170
|
+
.unwrap_or(data_len);
|
171
|
+
|
172
|
+
let mut names = PlIndexSet::new();
|
173
|
+
for d in data.each().take(infer_schema_length) {
|
174
|
+
let d = d?;
|
175
|
+
let d = RHash::try_convert(d)?;
|
176
|
+
d.foreach(|name: Value, _value: Value| {
|
177
|
+
if let Some(v) = Symbol::from_value(name) {
|
178
|
+
names.insert(v.name()?.into());
|
179
|
+
} else {
|
180
|
+
names.insert(String::try_convert(name)?);
|
181
|
+
};
|
182
|
+
Ok(ForEach::Continue)
|
183
|
+
})?;
|
184
|
+
}
|
185
|
+
Ok(names.into_iter().collect())
|
186
|
+
}
|
@@ -0,0 +1,48 @@
|
|
1
|
+
use magnus::{prelude::*, IntoValue, RArray, Value};
|
2
|
+
|
3
|
+
use super::*;
|
4
|
+
use crate::conversion::{ObjectValue, Wrap};
|
5
|
+
|
6
|
+
impl RbDataFrame {
|
7
|
+
pub fn row_tuple(&self, idx: i64) -> Value {
|
8
|
+
let idx = if idx < 0 {
|
9
|
+
(self.df.borrow().height() as i64 + idx) as usize
|
10
|
+
} else {
|
11
|
+
idx as usize
|
12
|
+
};
|
13
|
+
RArray::from_iter(
|
14
|
+
self.df
|
15
|
+
.borrow()
|
16
|
+
.get_columns()
|
17
|
+
.iter()
|
18
|
+
.map(|s| match s.dtype() {
|
19
|
+
DataType::Object(_, _) => {
|
20
|
+
let obj: Option<&ObjectValue> = s.get_object(idx).map(|any| any.into());
|
21
|
+
obj.unwrap().to_object()
|
22
|
+
}
|
23
|
+
_ => Wrap(s.get(idx).unwrap()).into_value(),
|
24
|
+
}),
|
25
|
+
)
|
26
|
+
.as_value()
|
27
|
+
}
|
28
|
+
|
29
|
+
pub fn row_tuples(&self) -> Value {
|
30
|
+
let df = &self.df;
|
31
|
+
RArray::from_iter((0..df.borrow().height()).map(|idx| {
|
32
|
+
RArray::from_iter(
|
33
|
+
self.df
|
34
|
+
.borrow()
|
35
|
+
.get_columns()
|
36
|
+
.iter()
|
37
|
+
.map(|s| match s.dtype() {
|
38
|
+
DataType::Object(_, _) => {
|
39
|
+
let obj: Option<&ObjectValue> = s.get_object(idx).map(|any| any.into());
|
40
|
+
obj.unwrap().to_object()
|
41
|
+
}
|
42
|
+
_ => Wrap(s.get(idx).unwrap()).into_value(),
|
43
|
+
}),
|
44
|
+
)
|
45
|
+
}))
|
46
|
+
.as_value()
|
47
|
+
}
|
48
|
+
}
|