polars-df 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -1
- data/Cargo.lock +335 -310
- data/Cargo.toml +0 -1
- data/README.md +29 -0
- data/ext/polars/Cargo.toml +5 -3
- data/ext/polars/src/batched_csv.rs +29 -14
- data/ext/polars/src/conversion.rs +53 -12
- data/ext/polars/src/dataframe.rs +36 -39
- data/ext/polars/src/lazy/dataframe.rs +48 -14
- data/ext/polars/src/lazy/dsl.rs +69 -4
- data/ext/polars/src/lib.rs +19 -5
- data/ext/polars/src/series.rs +13 -1
- data/lib/polars/batched_csv_reader.rb +1 -1
- data/lib/polars/binary_expr.rb +77 -0
- data/lib/polars/binary_name_space.rb +66 -0
- data/lib/polars/data_frame.rb +63 -38
- data/lib/polars/date_time_expr.rb +6 -6
- data/lib/polars/expr.rb +9 -2
- data/lib/polars/io.rb +73 -62
- data/lib/polars/lazy_frame.rb +103 -7
- data/lib/polars/lazy_functions.rb +3 -2
- data/lib/polars/list_expr.rb +2 -2
- data/lib/polars/list_name_space.rb +2 -2
- data/lib/polars/series.rb +9 -1
- data/lib/polars/string_expr.rb +1 -1
- data/lib/polars/utils.rb +10 -2
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +2 -0
- metadata +5 -3
data/Cargo.toml
CHANGED
@@ -4,7 +4,6 @@ members = ["ext/polars"]
|
|
4
4
|
[patch.crates-io]
|
5
5
|
jsonpath_lib = { git = "https://github.com/ritchie46/jsonpath", rev = "24eaf0b4416edff38a4d1b6b17bc4b9f3f047b4b" }
|
6
6
|
halfbrown = { git = "https://github.com/Licenser/halfbrown", rev = "952023c5dd6461b009bb5ba66b9aa979bd75949f" }
|
7
|
-
arrow2 = { git = "https://github.com/ankane/arrow2", rev = "ef0270922a217070ba9942567c0ff3263ae8c531" }
|
8
7
|
|
9
8
|
[profile.release]
|
10
9
|
strip = true
|
data/README.md
CHANGED
@@ -50,6 +50,9 @@ From Parquet
|
|
50
50
|
|
51
51
|
```ruby
|
52
52
|
Polars.read_parquet("file.parquet")
|
53
|
+
|
54
|
+
# or lazily with
|
55
|
+
Polars.scan_parquet("file.parquet")
|
53
56
|
```
|
54
57
|
|
55
58
|
From Active Record
|
@@ -60,6 +63,32 @@ Polars.read_sql(User.all)
|
|
60
63
|
Polars.read_sql("SELECT * FROM users")
|
61
64
|
```
|
62
65
|
|
66
|
+
From JSON
|
67
|
+
|
68
|
+
```ruby
|
69
|
+
Polars.read_json("file.json")
|
70
|
+
# or
|
71
|
+
Polars.read_ndjson("file.ndjson")
|
72
|
+
|
73
|
+
# or lazily with
|
74
|
+
Polars.scan_ndjson("file.ndjson")
|
75
|
+
```
|
76
|
+
|
77
|
+
From Feather / Arrow IPC
|
78
|
+
|
79
|
+
```ruby
|
80
|
+
Polars.read_ipc("file.arrow")
|
81
|
+
|
82
|
+
# or lazily with
|
83
|
+
Polars.scan_ipc("file.arrow")
|
84
|
+
```
|
85
|
+
|
86
|
+
From Avro
|
87
|
+
|
88
|
+
```ruby
|
89
|
+
Polars.read_avro("file.avro")
|
90
|
+
```
|
91
|
+
|
63
92
|
From a hash
|
64
93
|
|
65
94
|
```ruby
|
data/ext/polars/Cargo.toml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
[package]
|
2
2
|
name = "polars"
|
3
|
-
version = "0.3.1"
|
3
|
+
version = "0.4.0"
|
4
4
|
license = "MIT"
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
6
6
|
edition = "2021"
|
@@ -12,11 +12,12 @@ crate-type = ["cdylib"]
|
|
12
12
|
[dependencies]
|
13
13
|
ahash = "0.8"
|
14
14
|
magnus = "0.5"
|
15
|
-
polars-core = "0.
|
15
|
+
polars-core = "0.28.0"
|
16
16
|
serde_json = "1"
|
17
|
+
smartstring = "1"
|
17
18
|
|
18
19
|
[dependencies.polars]
|
19
|
-
version = "0.
|
20
|
+
version = "0.28.0"
|
20
21
|
features = [
|
21
22
|
"abs",
|
22
23
|
"arange",
|
@@ -44,6 +45,7 @@ features = [
|
|
44
45
|
"ipc",
|
45
46
|
"is_first",
|
46
47
|
"is_in",
|
48
|
+
"is_unique",
|
47
49
|
"json",
|
48
50
|
"lazy",
|
49
51
|
"lazy_regex",
|
data/ext/polars/src/batched_csv.rs
CHANGED
@@ -7,11 +7,17 @@ use std::cell::RefCell;
|
|
7
7
|
use std::path::PathBuf;
|
8
8
|
|
9
9
|
use crate::conversion::*;
|
10
|
+
use crate::prelude::read_impl::OwnedBatchedCsvReaderMmap;
|
10
11
|
use crate::{RbDataFrame, RbPolarsErr, RbResult};
|
11
12
|
|
13
|
+
pub enum BatchedReader {
|
14
|
+
MMap(OwnedBatchedCsvReaderMmap),
|
15
|
+
Read(OwnedBatchedCsvReader),
|
16
|
+
}
|
17
|
+
|
12
18
|
#[magnus::wrap(class = "Polars::RbBatchedCsv")]
|
13
19
|
pub struct RbBatchedCsv {
|
14
|
-
pub reader: RefCell<
|
20
|
+
pub reader: RefCell<BatchedReader>,
|
15
21
|
}
|
16
22
|
|
17
23
|
impl RbBatchedCsv {
|
@@ -38,7 +44,7 @@ impl RbBatchedCsv {
|
|
38
44
|
let comment_char: Option<String> = arguments[16].try_convert()?;
|
39
45
|
let quote_char: Option<String> = arguments[17].try_convert()?;
|
40
46
|
let null_values: Option<Wrap<NullValues>> = arguments[18].try_convert()?;
|
41
|
-
let
|
47
|
+
let try_parse_dates: bool = arguments[19].try_convert()?;
|
42
48
|
let skip_rows_after_header: usize = arguments[20].try_convert()?;
|
43
49
|
let row_count: Option<(String, IdxSize)> = arguments[21].try_convert()?;
|
44
50
|
let sample_size: usize = arguments[22].try_convert()?;
|
@@ -95,14 +101,24 @@ impl RbBatchedCsv {
|
|
95
101
|
.low_memory(low_memory)
|
96
102
|
.with_comment_char(comment_char)
|
97
103
|
.with_null_values(null_values)
|
98
|
-
.
|
104
|
+
.with_try_parse_dates(try_parse_dates)
|
99
105
|
.with_quote_char(quote_char)
|
100
106
|
.with_end_of_line_char(eol_char)
|
101
107
|
.with_skip_rows_after_header(skip_rows_after_header)
|
102
108
|
.with_row_count(row_count)
|
103
|
-
.sample_size(sample_size)
|
104
|
-
|
105
|
-
|
109
|
+
.sample_size(sample_size);
|
110
|
+
|
111
|
+
let reader = if low_memory {
|
112
|
+
let reader = reader
|
113
|
+
.batched_read(overwrite_dtype.map(Arc::new))
|
114
|
+
.map_err(RbPolarsErr::from)?;
|
115
|
+
BatchedReader::Read(reader)
|
116
|
+
} else {
|
117
|
+
let reader = reader
|
118
|
+
.batched_mmap(overwrite_dtype.map(Arc::new))
|
119
|
+
.map_err(RbPolarsErr::from)?;
|
120
|
+
BatchedReader::MMap(reader)
|
121
|
+
};
|
106
122
|
|
107
123
|
Ok(RbBatchedCsv {
|
108
124
|
reader: RefCell::new(reader),
|
@@ -110,13 +126,12 @@ impl RbBatchedCsv {
|
|
110
126
|
}
|
111
127
|
|
112
128
|
pub fn next_batches(&self, n: usize) -> RbResult<Option<RArray>> {
|
113
|
-
let batches = self
|
114
|
-
.
|
115
|
-
.
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
}))
|
129
|
+
let batches = match &mut *self.reader.borrow_mut() {
|
130
|
+
BatchedReader::MMap(reader) => reader.next_batches(n),
|
131
|
+
BatchedReader::Read(reader) => reader.next_batches(n),
|
132
|
+
}
|
133
|
+
.map_err(RbPolarsErr::from)?;
|
134
|
+
|
135
|
+
Ok(batches.map(|batches| RArray::from_iter(batches.into_iter().map(RbDataFrame::from))))
|
121
136
|
}
|
122
137
|
}
|
data/ext/polars/src/conversion.rs
CHANGED
@@ -1,3 +1,6 @@
|
|
1
|
+
use std::fmt::{Display, Formatter};
|
2
|
+
use std::hash::{Hash, Hasher};
|
3
|
+
|
1
4
|
use magnus::{
|
2
5
|
class, exception, r_hash::ForEach, ruby_handle::RubyHandle, Integer, IntoValue, Module, RArray,
|
3
6
|
RFloat, RHash, RString, Symbol, TryConvert, Value, QNIL,
|
@@ -10,8 +13,7 @@ use polars::frame::NullStrategy;
|
|
10
13
|
use polars::io::avro::AvroCompression;
|
11
14
|
use polars::prelude::*;
|
12
15
|
use polars::series::ops::NullBehavior;
|
13
|
-
use
|
14
|
-
use std::hash::{Hash, Hasher};
|
16
|
+
use smartstring::alias::String as SmartString;
|
15
17
|
|
16
18
|
use crate::{RbDataFrame, RbLazyFrame, RbPolarsErr, RbResult, RbSeries, RbValueError};
|
17
19
|
|
@@ -82,6 +84,22 @@ impl TryConvert for Wrap<Utf8Chunked> {
|
|
82
84
|
}
|
83
85
|
}
|
84
86
|
|
87
|
+
impl TryConvert for Wrap<BinaryChunked> {
|
88
|
+
fn try_convert(obj: Value) -> RbResult<Self> {
|
89
|
+
let (seq, len) = get_rbseq(obj)?;
|
90
|
+
let mut builder = BinaryChunkedBuilder::new("", len, len * 25);
|
91
|
+
|
92
|
+
for res in seq.each() {
|
93
|
+
let item = res?;
|
94
|
+
match item.try_convert::<RString>() {
|
95
|
+
Ok(val) => builder.append_value(unsafe { val.as_slice() }),
|
96
|
+
Err(_) => builder.append_null(),
|
97
|
+
}
|
98
|
+
}
|
99
|
+
Ok(Wrap(builder.finish()))
|
100
|
+
}
|
101
|
+
}
|
102
|
+
|
85
103
|
impl TryConvert for Wrap<NullValues> {
|
86
104
|
fn try_convert(ob: Value) -> RbResult<Self> {
|
87
105
|
if let Ok(s) = ob.try_convert::<String>() {
|
@@ -98,6 +116,14 @@ impl TryConvert for Wrap<NullValues> {
|
|
98
116
|
}
|
99
117
|
}
|
100
118
|
|
119
|
+
fn struct_dict<'a>(vals: impl Iterator<Item = AnyValue<'a>>, flds: &[Field]) -> Value {
|
120
|
+
let dict = RHash::new();
|
121
|
+
for (fld, val) in flds.iter().zip(vals) {
|
122
|
+
dict.aset(fld.name().as_str(), Wrap(val)).unwrap()
|
123
|
+
}
|
124
|
+
dict.into_value()
|
125
|
+
}
|
126
|
+
|
101
127
|
impl IntoValue for Wrap<AnyValue<'_>> {
|
102
128
|
fn into_value_with(self, _: &RubyHandle) -> Value {
|
103
129
|
match self.0 {
|
@@ -114,7 +140,7 @@ impl IntoValue for Wrap<AnyValue<'_>> {
|
|
114
140
|
AnyValue::Null => *QNIL,
|
115
141
|
AnyValue::Boolean(v) => Value::from(v),
|
116
142
|
AnyValue::Utf8(v) => Value::from(v),
|
117
|
-
AnyValue::Utf8Owned(
|
143
|
+
AnyValue::Utf8Owned(v) => Value::from(v.as_str()),
|
118
144
|
AnyValue::Categorical(_idx, _rev, _arr) => todo!(),
|
119
145
|
AnyValue::Date(v) => class::time()
|
120
146
|
.funcall::<_, _, Value>("at", (v * 86400,))
|
@@ -157,12 +183,19 @@ impl IntoValue for Wrap<AnyValue<'_>> {
|
|
157
183
|
AnyValue::Duration(_v, _tu) => todo!(),
|
158
184
|
AnyValue::Time(_v) => todo!(),
|
159
185
|
AnyValue::List(v) => RbSeries::new(v).to_a().into_value(),
|
160
|
-
ref
|
161
|
-
AnyValue::StructOwned(
|
162
|
-
AnyValue::Object(
|
163
|
-
|
164
|
-
|
165
|
-
|
186
|
+
ref av @ AnyValue::Struct(_, _, flds) => struct_dict(av._iter_struct_av(), flds),
|
187
|
+
AnyValue::StructOwned(payload) => struct_dict(payload.0.into_iter(), &payload.1),
|
188
|
+
AnyValue::Object(v) => {
|
189
|
+
let object = v.as_any().downcast_ref::<ObjectValue>().unwrap();
|
190
|
+
object.inner
|
191
|
+
}
|
192
|
+
AnyValue::ObjectOwned(v) => {
|
193
|
+
let object = v.0.as_any().downcast_ref::<ObjectValue>().unwrap();
|
194
|
+
object.inner
|
195
|
+
}
|
196
|
+
AnyValue::Binary(v) => RString::from_slice(v).into_value(),
|
197
|
+
AnyValue::BinaryOwned(v) => RString::from_slice(&v).into_value(),
|
198
|
+
AnyValue::Decimal(_v, _scale) => todo!(),
|
166
199
|
}
|
167
200
|
}
|
168
201
|
}
|
@@ -182,7 +215,7 @@ impl IntoValue for Wrap<DataType> {
|
|
182
215
|
DataType::UInt64 => pl.const_get::<_, Value>("UInt64").unwrap(),
|
183
216
|
DataType::Float32 => pl.const_get::<_, Value>("Float32").unwrap(),
|
184
217
|
DataType::Float64 => pl.const_get::<_, Value>("Float64").unwrap(),
|
185
|
-
DataType::
|
218
|
+
DataType::Decimal(_precision, _scale) => todo!(),
|
186
219
|
DataType::Boolean => pl.const_get::<_, Value>("Boolean").unwrap(),
|
187
220
|
DataType::Utf8 => pl.const_get::<_, Value>("Utf8").unwrap(),
|
188
221
|
DataType::Binary => pl.const_get::<_, Value>("Binary").unwrap(),
|
@@ -210,7 +243,7 @@ impl IntoValue for Wrap<DataType> {
|
|
210
243
|
DataType::Struct(fields) => {
|
211
244
|
let field_class = pl.const_get::<_, Value>("Field").unwrap();
|
212
245
|
let iter = fields.iter().map(|fld| {
|
213
|
-
let name = fld.name().
|
246
|
+
let name = fld.name().as_str();
|
214
247
|
let dtype = Wrap(fld.data_type().clone());
|
215
248
|
field_class
|
216
249
|
.funcall::<_, _, Value>("new", (name, dtype))
|
@@ -340,7 +373,7 @@ impl<'s> TryConvert for Wrap<AnyValue<'s>> {
|
|
340
373
|
let n = 25;
|
341
374
|
let dtype = any_values_to_dtype(&avs[..std::cmp::min(avs.len(), n)])
|
342
375
|
.map_err(RbPolarsErr::from)?;
|
343
|
-
let s = Series::from_any_values_and_dtype("", &avs, &dtype)
|
376
|
+
let s = Series::from_any_values_and_dtype("", &avs, &dtype, true)
|
344
377
|
.map_err(RbPolarsErr::from)?;
|
345
378
|
Ok(Wrap(AnyValue::List(s)))
|
346
379
|
}
|
@@ -870,3 +903,11 @@ pub fn parse_parquet_compression(
|
|
870
903
|
};
|
871
904
|
Ok(parsed)
|
872
905
|
}
|
906
|
+
|
907
|
+
pub(crate) fn strings_to_smartstrings<I, S>(container: I) -> Vec<SmartString>
|
908
|
+
where
|
909
|
+
I: IntoIterator<Item = S>,
|
910
|
+
S: AsRef<str>,
|
911
|
+
{
|
912
|
+
container.into_iter().map(|s| s.as_ref().into()).collect()
|
913
|
+
}
|
data/ext/polars/src/dataframe.rs
CHANGED
@@ -115,7 +115,7 @@ impl RbDataFrame {
|
|
115
115
|
let comment_char: Option<String> = arguments[17].try_convert()?;
|
116
116
|
let quote_char: Option<String> = arguments[18].try_convert()?;
|
117
117
|
let null_values: Option<Wrap<NullValues>> = arguments[19].try_convert()?;
|
118
|
-
let
|
118
|
+
let try_parse_dates: bool = arguments[20].try_convert()?;
|
119
119
|
let skip_rows_after_header: usize = arguments[21].try_convert()?;
|
120
120
|
let row_count: Option<(String, IdxSize)> = arguments[22].try_convert()?;
|
121
121
|
let sample_size: usize = arguments[23].try_convert()?;
|
@@ -168,12 +168,12 @@ impl RbDataFrame {
|
|
168
168
|
.with_columns(columns)
|
169
169
|
.with_n_threads(n_threads)
|
170
170
|
.with_path(path)
|
171
|
-
.with_dtypes(overwrite_dtype.
|
171
|
+
.with_dtypes(overwrite_dtype.map(Arc::new))
|
172
172
|
.with_dtypes_slice(overwrite_dtype_slice.as_deref())
|
173
173
|
.low_memory(low_memory)
|
174
174
|
.with_comment_char(comment_char)
|
175
175
|
.with_null_values(null_values)
|
176
|
-
.
|
176
|
+
.with_try_parse_dates(try_parse_dates)
|
177
177
|
.with_quote_char(quote_char)
|
178
178
|
.with_end_of_line_char(eol_char)
|
179
179
|
.with_skip_rows_after_header(skip_rows_after_header)
|
@@ -184,6 +184,7 @@ impl RbDataFrame {
|
|
184
184
|
Ok(df.into())
|
185
185
|
}
|
186
186
|
|
187
|
+
#[allow(clippy::too_many_arguments)]
|
187
188
|
pub fn read_parquet(
|
188
189
|
rb_f: Value,
|
189
190
|
columns: Option<Vec<String>>,
|
@@ -192,6 +193,8 @@ impl RbDataFrame {
|
|
192
193
|
parallel: Wrap<ParallelStrategy>,
|
193
194
|
row_count: Option<(String, IdxSize)>,
|
194
195
|
low_memory: bool,
|
196
|
+
use_statistics: bool,
|
197
|
+
rechunk: bool,
|
195
198
|
) -> RbResult<Self> {
|
196
199
|
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
197
200
|
let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
|
@@ -202,6 +205,8 @@ impl RbDataFrame {
|
|
202
205
|
.with_n_rows(n_rows)
|
203
206
|
.with_row_count(row_count)
|
204
207
|
.set_low_memory(low_memory)
|
208
|
+
.use_statistics(use_statistics)
|
209
|
+
.set_rechunk(rechunk)
|
205
210
|
.finish()
|
206
211
|
.map_err(RbPolarsErr::from)?;
|
207
212
|
Ok(RbDataFrame::new(df))
|
@@ -254,7 +259,7 @@ impl RbDataFrame {
|
|
254
259
|
use polars::io::avro::AvroWriter;
|
255
260
|
|
256
261
|
if let Ok(s) = rb_f.try_convert::<String>() {
|
257
|
-
let f = std::fs::File::create(
|
262
|
+
let f = std::fs::File::create(s).unwrap();
|
258
263
|
AvroWriter::new(f)
|
259
264
|
.with_compression(compression.0)
|
260
265
|
.finish(&mut self.df.borrow_mut())
|
@@ -339,7 +344,7 @@ impl RbDataFrame {
|
|
339
344
|
// ensure the new names are used
|
340
345
|
if let Some(schema) = &schema_overwrite {
|
341
346
|
for (new_name, name) in schema.0.iter_names().zip(names.iter_mut()) {
|
342
|
-
*name = new_name.
|
347
|
+
*name = new_name.to_string();
|
343
348
|
}
|
344
349
|
}
|
345
350
|
let rbdf = Self::finish_from_rows(
|
@@ -348,17 +353,19 @@ impl RbDataFrame {
|
|
348
353
|
schema_overwrite.map(|wrap| wrap.0),
|
349
354
|
)?;
|
350
355
|
|
351
|
-
|
352
|
-
.
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
356
|
+
unsafe {
|
357
|
+
rbdf.df
|
358
|
+
.borrow_mut()
|
359
|
+
.get_columns_mut()
|
360
|
+
.iter_mut()
|
361
|
+
.zip(&names)
|
362
|
+
.for_each(|(s, name)| {
|
363
|
+
s.rename(name);
|
364
|
+
});
|
365
|
+
}
|
359
366
|
let length = names.len();
|
360
367
|
if names.into_iter().collect::<PlHashSet<_>>().len() != length {
|
361
|
-
let err = PolarsError::
|
368
|
+
let err = PolarsError::SchemaMismatch("duplicate column names found".into());
|
362
369
|
Err(RbPolarsErr::from(err))?;
|
363
370
|
}
|
364
371
|
|
@@ -394,7 +401,7 @@ impl RbDataFrame {
|
|
394
401
|
let null = null_value.unwrap_or_default();
|
395
402
|
|
396
403
|
if let Ok(s) = rb_f.try_convert::<String>() {
|
397
|
-
let f = std::fs::File::create(
|
404
|
+
let f = std::fs::File::create(s).unwrap();
|
398
405
|
// no need for a buffered writer, because the csv writer does internal buffering
|
399
406
|
CsvWriter::new(f)
|
400
407
|
.has_header(has_header)
|
@@ -436,7 +443,7 @@ impl RbDataFrame {
|
|
436
443
|
compression: Wrap<Option<IpcCompression>>,
|
437
444
|
) -> RbResult<()> {
|
438
445
|
if let Ok(s) = rb_f.try_convert::<String>() {
|
439
|
-
let f = std::fs::File::create(
|
446
|
+
let f = std::fs::File::create(s).unwrap();
|
440
447
|
IpcWriter::new(f)
|
441
448
|
.with_compression(compression.0)
|
442
449
|
.finish(&mut self.df.borrow_mut())
|
@@ -524,7 +531,7 @@ impl RbDataFrame {
|
|
524
531
|
let compression = parse_parquet_compression(&compression, compression_level)?;
|
525
532
|
|
526
533
|
if let Ok(s) = rb_f.try_convert::<String>() {
|
527
|
-
let f = std::fs::File::create(
|
534
|
+
let f = std::fs::File::create(s).unwrap();
|
528
535
|
ParquetWriter::new(f)
|
529
536
|
.with_compression(compression)
|
530
537
|
.with_statistics(statistics)
|
@@ -627,7 +634,7 @@ impl RbDataFrame {
|
|
627
634
|
}
|
628
635
|
|
629
636
|
pub fn get_columns(&self) -> RArray {
|
630
|
-
let cols = self.df.borrow().get_columns().
|
637
|
+
let cols = self.df.borrow().get_columns().to_vec();
|
631
638
|
to_rbseries_collection(cols)
|
632
639
|
}
|
633
640
|
|
@@ -881,10 +888,11 @@ impl RbDataFrame {
|
|
881
888
|
variable_name: Option<String>,
|
882
889
|
) -> RbResult<Self> {
|
883
890
|
let args = MeltArgs {
|
884
|
-
id_vars,
|
885
|
-
value_vars,
|
886
|
-
value_name,
|
887
|
-
variable_name,
|
891
|
+
id_vars: strings_to_smartstrings(id_vars),
|
892
|
+
value_vars: strings_to_smartstrings(value_vars),
|
893
|
+
value_name: value_name.map(|s| s.into()),
|
894
|
+
variable_name: variable_name.map(|s| s.into()),
|
895
|
+
streamable: false,
|
888
896
|
};
|
889
897
|
|
890
898
|
let df = self.df.borrow().melt2(args).map_err(RbPolarsErr::from)?;
|
@@ -897,22 +905,26 @@ impl RbDataFrame {
|
|
897
905
|
values: Vec<String>,
|
898
906
|
index: Vec<String>,
|
899
907
|
columns: Vec<String>,
|
900
|
-
aggregate_expr: &RbExpr,
|
901
908
|
maintain_order: bool,
|
902
909
|
sort_columns: bool,
|
910
|
+
aggregate_expr: Option<&RbExpr>,
|
903
911
|
separator: Option<String>,
|
904
912
|
) -> RbResult<Self> {
|
905
913
|
let fun = match maintain_order {
|
906
914
|
true => pivot_stable,
|
907
915
|
false => pivot,
|
908
916
|
};
|
917
|
+
let agg_expr = match aggregate_expr {
|
918
|
+
Some(aggregate_expr) => Some(aggregate_expr.inner.clone()),
|
919
|
+
None => None,
|
920
|
+
};
|
909
921
|
let df = fun(
|
910
922
|
&self.df.borrow(),
|
911
923
|
values,
|
912
924
|
index,
|
913
925
|
columns,
|
914
|
-
aggregate_expr.inner.clone(),
|
915
926
|
sort_columns,
|
927
|
+
agg_expr,
|
916
928
|
separator.as_deref(),
|
917
929
|
)
|
918
930
|
.map_err(RbPolarsErr::from)?;
|
@@ -933,21 +945,6 @@ impl RbDataFrame {
|
|
933
945
|
self.df.borrow().shift(periods).into()
|
934
946
|
}
|
935
947
|
|
936
|
-
pub fn unique(
|
937
|
-
&self,
|
938
|
-
maintain_order: bool,
|
939
|
-
subset: Option<Vec<String>>,
|
940
|
-
keep: Wrap<UniqueKeepStrategy>,
|
941
|
-
) -> RbResult<Self> {
|
942
|
-
let subset = subset.as_ref().map(|v| v.as_ref());
|
943
|
-
let df = match maintain_order {
|
944
|
-
true => self.df.borrow().unique_stable(subset, keep.0),
|
945
|
-
false => self.df.borrow().unique(subset, keep.0),
|
946
|
-
}
|
947
|
-
.map_err(RbPolarsErr::from)?;
|
948
|
-
Ok(df.into())
|
949
|
-
}
|
950
|
-
|
951
948
|
pub fn lazy(&self) -> RbLazyFrame {
|
952
949
|
self.df.borrow().clone().lazy().into()
|
953
950
|
}
|
data/ext/polars/src/lazy/dataframe.rs
CHANGED
@@ -4,6 +4,7 @@ use polars::lazy::frame::{LazyFrame, LazyGroupBy};
|
|
4
4
|
use polars::prelude::*;
|
5
5
|
use std::cell::RefCell;
|
6
6
|
use std::io::{BufWriter, Read};
|
7
|
+
use std::path::PathBuf;
|
7
8
|
|
8
9
|
use crate::conversion::*;
|
9
10
|
use crate::file::get_file_like;
|
@@ -118,7 +119,7 @@ impl RbLazyFrame {
|
|
118
119
|
let skip_rows_after_header: usize = arguments[15].try_convert()?;
|
119
120
|
let encoding: Wrap<CsvEncoding> = arguments[16].try_convert()?;
|
120
121
|
let row_count: Option<(String, IdxSize)> = arguments[17].try_convert()?;
|
121
|
-
let
|
122
|
+
let try_parse_dates: bool = arguments[18].try_convert()?;
|
122
123
|
let eol_char: String = arguments[19].try_convert()?;
|
123
124
|
// end arguments
|
124
125
|
|
@@ -153,7 +154,7 @@ impl RbLazyFrame {
|
|
153
154
|
.with_skip_rows_after_header(skip_rows_after_header)
|
154
155
|
.with_encoding(encoding.0)
|
155
156
|
.with_row_count(row_count)
|
156
|
-
.
|
157
|
+
.with_try_parse_dates(try_parse_dates)
|
157
158
|
.with_null_values(null_values);
|
158
159
|
|
159
160
|
if let Some(_lambda) = with_schema_modify {
|
@@ -163,6 +164,7 @@ impl RbLazyFrame {
|
|
163
164
|
Ok(r.finish().map_err(RbPolarsErr::from)?.into())
|
164
165
|
}
|
165
166
|
|
167
|
+
#[allow(clippy::too_many_arguments)]
|
166
168
|
pub fn new_from_parquet(
|
167
169
|
path: String,
|
168
170
|
n_rows: Option<usize>,
|
@@ -171,6 +173,7 @@ impl RbLazyFrame {
|
|
171
173
|
rechunk: bool,
|
172
174
|
row_count: Option<(String, IdxSize)>,
|
173
175
|
low_memory: bool,
|
176
|
+
use_statistics: bool,
|
174
177
|
) -> RbResult<Self> {
|
175
178
|
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
176
179
|
let args = ScanArgsParquet {
|
@@ -182,6 +185,7 @@ impl RbLazyFrame {
|
|
182
185
|
low_memory,
|
183
186
|
// TODO support cloud options
|
184
187
|
cloud_options: None,
|
188
|
+
use_statistics,
|
185
189
|
};
|
186
190
|
let lf = LazyFrame::scan_parquet(path, args).map_err(RbPolarsErr::from)?;
|
187
191
|
Ok(lf.into())
|
@@ -284,6 +288,32 @@ impl RbLazyFrame {
|
|
284
288
|
Ok(df.into())
|
285
289
|
}
|
286
290
|
|
291
|
+
#[allow(clippy::too_many_arguments)]
|
292
|
+
pub fn sink_parquet(
|
293
|
+
&self,
|
294
|
+
path: PathBuf,
|
295
|
+
compression: String,
|
296
|
+
compression_level: Option<i32>,
|
297
|
+
statistics: bool,
|
298
|
+
row_group_size: Option<usize>,
|
299
|
+
data_pagesize_limit: Option<usize>,
|
300
|
+
maintain_order: bool,
|
301
|
+
) -> RbResult<()> {
|
302
|
+
let compression = parse_parquet_compression(&compression, compression_level)?;
|
303
|
+
|
304
|
+
let options = ParquetWriteOptions {
|
305
|
+
compression,
|
306
|
+
statistics,
|
307
|
+
row_group_size,
|
308
|
+
data_pagesize_limit,
|
309
|
+
maintain_order,
|
310
|
+
};
|
311
|
+
|
312
|
+
let ldf = self.ldf.clone();
|
313
|
+
ldf.sink_parquet(path, options).map_err(RbPolarsErr::from)?;
|
314
|
+
Ok(())
|
315
|
+
}
|
316
|
+
|
287
317
|
pub fn fetch(&self, n_rows: usize) -> RbResult<RbDataFrame> {
|
288
318
|
let ldf = self.ldf.clone();
|
289
319
|
let df = ldf.fetch(n_rows).map_err(RbPolarsErr::from)?;
|
@@ -328,7 +358,7 @@ impl RbLazyFrame {
|
|
328
358
|
let lazy_gb = ldf.groupby_rolling(
|
329
359
|
by,
|
330
360
|
RollingGroupOptions {
|
331
|
-
index_column,
|
361
|
+
index_column: index_column.into(),
|
332
362
|
period: Duration::parse(&period),
|
333
363
|
offset: Duration::parse(&offset),
|
334
364
|
closed_window,
|
@@ -359,7 +389,7 @@ impl RbLazyFrame {
|
|
359
389
|
let lazy_gb = ldf.groupby_dynamic(
|
360
390
|
by,
|
361
391
|
DynamicGroupOptions {
|
362
|
-
index_column,
|
392
|
+
index_column: index_column.into(),
|
363
393
|
every: Duration::parse(&every),
|
364
394
|
period: Duration::parse(&period),
|
365
395
|
offset: Duration::parse(&offset),
|
@@ -415,10 +445,10 @@ impl RbLazyFrame {
|
|
415
445
|
.force_parallel(force_parallel)
|
416
446
|
.how(JoinType::AsOf(AsOfOptions {
|
417
447
|
strategy: strategy.0,
|
418
|
-
left_by,
|
419
|
-
right_by,
|
448
|
+
left_by: left_by.map(strings_to_smartstrings),
|
449
|
+
right_by: right_by.map(strings_to_smartstrings),
|
420
450
|
tolerance: tolerance.map(|t| t.0.into_static().unwrap()),
|
421
|
-
tolerance_str,
|
451
|
+
tolerance_str: tolerance_str.map(|s| s.into()),
|
422
452
|
}))
|
423
453
|
.suffix(suffix)
|
424
454
|
.finish()
|
@@ -570,12 +600,14 @@ impl RbLazyFrame {
|
|
570
600
|
value_vars: Vec<String>,
|
571
601
|
value_name: Option<String>,
|
572
602
|
variable_name: Option<String>,
|
603
|
+
streamable: bool,
|
573
604
|
) -> Self {
|
574
605
|
let args = MeltArgs {
|
575
|
-
id_vars,
|
576
|
-
value_vars,
|
577
|
-
value_name,
|
578
|
-
variable_name,
|
606
|
+
id_vars: strings_to_smartstrings(id_vars),
|
607
|
+
value_vars: strings_to_smartstrings(value_vars),
|
608
|
+
value_name: value_name.map(|s| s.into()),
|
609
|
+
variable_name: variable_name.map(|s| s.into()),
|
610
|
+
streamable,
|
579
611
|
};
|
580
612
|
|
581
613
|
let ldf = self.ldf.clone();
|
@@ -596,8 +628,10 @@ impl RbLazyFrame {
|
|
596
628
|
self.ldf.clone().into()
|
597
629
|
}
|
598
630
|
|
599
|
-
pub fn columns(&self) -> RbResult<
|
600
|
-
|
631
|
+
pub fn columns(&self) -> RbResult<RArray> {
|
632
|
+
let schema = self.get_schema()?;
|
633
|
+
let iter = schema.iter_names().map(|s| s.as_str());
|
634
|
+
Ok(RArray::from_iter(iter))
|
601
635
|
}
|
602
636
|
|
603
637
|
pub fn dtypes(&self) -> RbResult<RArray> {
|
@@ -614,7 +648,7 @@ impl RbLazyFrame {
|
|
614
648
|
// TODO remove unwrap
|
615
649
|
schema_dict
|
616
650
|
.aset::<String, Value>(
|
617
|
-
fld.name().
|
651
|
+
fld.name().to_string(),
|
618
652
|
Wrap(fld.data_type().clone()).into_value(),
|
619
653
|
)
|
620
654
|
.unwrap();
|