polars-df 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -1
- data/Cargo.lock +335 -310
- data/Cargo.toml +0 -1
- data/README.md +29 -0
- data/ext/polars/Cargo.toml +5 -3
- data/ext/polars/src/batched_csv.rs +29 -14
- data/ext/polars/src/conversion.rs +53 -12
- data/ext/polars/src/dataframe.rs +36 -39
- data/ext/polars/src/lazy/dataframe.rs +48 -14
- data/ext/polars/src/lazy/dsl.rs +69 -4
- data/ext/polars/src/lib.rs +19 -5
- data/ext/polars/src/series.rs +13 -1
- data/lib/polars/batched_csv_reader.rb +1 -1
- data/lib/polars/binary_expr.rb +77 -0
- data/lib/polars/binary_name_space.rb +66 -0
- data/lib/polars/data_frame.rb +63 -38
- data/lib/polars/date_time_expr.rb +6 -6
- data/lib/polars/expr.rb +9 -2
- data/lib/polars/io.rb +73 -62
- data/lib/polars/lazy_frame.rb +103 -7
- data/lib/polars/lazy_functions.rb +3 -2
- data/lib/polars/list_expr.rb +2 -2
- data/lib/polars/list_name_space.rb +2 -2
- data/lib/polars/series.rb +9 -1
- data/lib/polars/string_expr.rb +1 -1
- data/lib/polars/utils.rb +10 -2
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +2 -0
- metadata +5 -3
data/Cargo.toml
CHANGED
@@ -4,7 +4,6 @@ members = ["ext/polars"]
|
|
4
4
|
[patch.crates-io]
|
5
5
|
jsonpath_lib = { git = "https://github.com/ritchie46/jsonpath", rev = "24eaf0b4416edff38a4d1b6b17bc4b9f3f047b4b" }
|
6
6
|
halfbrown = { git = "https://github.com/Licenser/halfbrown", rev = "952023c5dd6461b009bb5ba66b9aa979bd75949f" }
|
7
|
-
arrow2 = { git = "https://github.com/ankane/arrow2", rev = "ef0270922a217070ba9942567c0ff3263ae8c531" }
|
8
7
|
|
9
8
|
[profile.release]
|
10
9
|
strip = true
|
data/README.md
CHANGED
@@ -50,6 +50,9 @@ From Parquet
|
|
50
50
|
|
51
51
|
```ruby
|
52
52
|
Polars.read_parquet("file.parquet")
|
53
|
+
|
54
|
+
# or lazily with
|
55
|
+
Polars.scan_parquet("file.parquet")
|
53
56
|
```
|
54
57
|
|
55
58
|
From Active Record
|
@@ -60,6 +63,32 @@ Polars.read_sql(User.all)
|
|
60
63
|
Polars.read_sql("SELECT * FROM users")
|
61
64
|
```
|
62
65
|
|
66
|
+
From JSON
|
67
|
+
|
68
|
+
```ruby
|
69
|
+
Polars.read_json("file.json")
|
70
|
+
# or
|
71
|
+
Polars.read_ndjson("file.ndjson")
|
72
|
+
|
73
|
+
# or lazily with
|
74
|
+
Polars.scan_ndjson("file.ndjson")
|
75
|
+
```
|
76
|
+
|
77
|
+
From Feather / Arrow IPC
|
78
|
+
|
79
|
+
```ruby
|
80
|
+
Polars.read_ipc("file.arrow")
|
81
|
+
|
82
|
+
# or lazily with
|
83
|
+
Polars.scan_ipc("file.arrow")
|
84
|
+
```
|
85
|
+
|
86
|
+
From Avro
|
87
|
+
|
88
|
+
```ruby
|
89
|
+
Polars.read_avro("file.avro")
|
90
|
+
```
|
91
|
+
|
63
92
|
From a hash
|
64
93
|
|
65
94
|
```ruby
|
data/ext/polars/Cargo.toml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
[package]
|
2
2
|
name = "polars"
|
3
|
-
version = "0.
|
3
|
+
version = "0.4.0"
|
4
4
|
license = "MIT"
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
6
6
|
edition = "2021"
|
@@ -12,11 +12,12 @@ crate-type = ["cdylib"]
|
|
12
12
|
[dependencies]
|
13
13
|
ahash = "0.8"
|
14
14
|
magnus = "0.5"
|
15
|
-
polars-core = "0.
|
15
|
+
polars-core = "0.28.0"
|
16
16
|
serde_json = "1"
|
17
|
+
smartstring = "1"
|
17
18
|
|
18
19
|
[dependencies.polars]
|
19
|
-
version = "0.
|
20
|
+
version = "0.28.0"
|
20
21
|
features = [
|
21
22
|
"abs",
|
22
23
|
"arange",
|
@@ -44,6 +45,7 @@ features = [
|
|
44
45
|
"ipc",
|
45
46
|
"is_first",
|
46
47
|
"is_in",
|
48
|
+
"is_unique",
|
47
49
|
"json",
|
48
50
|
"lazy",
|
49
51
|
"lazy_regex",
|
@@ -7,11 +7,17 @@ use std::cell::RefCell;
|
|
7
7
|
use std::path::PathBuf;
|
8
8
|
|
9
9
|
use crate::conversion::*;
|
10
|
+
use crate::prelude::read_impl::OwnedBatchedCsvReaderMmap;
|
10
11
|
use crate::{RbDataFrame, RbPolarsErr, RbResult};
|
11
12
|
|
13
|
+
pub enum BatchedReader {
|
14
|
+
MMap(OwnedBatchedCsvReaderMmap),
|
15
|
+
Read(OwnedBatchedCsvReader),
|
16
|
+
}
|
17
|
+
|
12
18
|
#[magnus::wrap(class = "Polars::RbBatchedCsv")]
|
13
19
|
pub struct RbBatchedCsv {
|
14
|
-
pub reader: RefCell<
|
20
|
+
pub reader: RefCell<BatchedReader>,
|
15
21
|
}
|
16
22
|
|
17
23
|
impl RbBatchedCsv {
|
@@ -38,7 +44,7 @@ impl RbBatchedCsv {
|
|
38
44
|
let comment_char: Option<String> = arguments[16].try_convert()?;
|
39
45
|
let quote_char: Option<String> = arguments[17].try_convert()?;
|
40
46
|
let null_values: Option<Wrap<NullValues>> = arguments[18].try_convert()?;
|
41
|
-
let
|
47
|
+
let try_parse_dates: bool = arguments[19].try_convert()?;
|
42
48
|
let skip_rows_after_header: usize = arguments[20].try_convert()?;
|
43
49
|
let row_count: Option<(String, IdxSize)> = arguments[21].try_convert()?;
|
44
50
|
let sample_size: usize = arguments[22].try_convert()?;
|
@@ -95,14 +101,24 @@ impl RbBatchedCsv {
|
|
95
101
|
.low_memory(low_memory)
|
96
102
|
.with_comment_char(comment_char)
|
97
103
|
.with_null_values(null_values)
|
98
|
-
.
|
104
|
+
.with_try_parse_dates(try_parse_dates)
|
99
105
|
.with_quote_char(quote_char)
|
100
106
|
.with_end_of_line_char(eol_char)
|
101
107
|
.with_skip_rows_after_header(skip_rows_after_header)
|
102
108
|
.with_row_count(row_count)
|
103
|
-
.sample_size(sample_size)
|
104
|
-
|
105
|
-
|
109
|
+
.sample_size(sample_size);
|
110
|
+
|
111
|
+
let reader = if low_memory {
|
112
|
+
let reader = reader
|
113
|
+
.batched_read(overwrite_dtype.map(Arc::new))
|
114
|
+
.map_err(RbPolarsErr::from)?;
|
115
|
+
BatchedReader::Read(reader)
|
116
|
+
} else {
|
117
|
+
let reader = reader
|
118
|
+
.batched_mmap(overwrite_dtype.map(Arc::new))
|
119
|
+
.map_err(RbPolarsErr::from)?;
|
120
|
+
BatchedReader::MMap(reader)
|
121
|
+
};
|
106
122
|
|
107
123
|
Ok(RbBatchedCsv {
|
108
124
|
reader: RefCell::new(reader),
|
@@ -110,13 +126,12 @@ impl RbBatchedCsv {
|
|
110
126
|
}
|
111
127
|
|
112
128
|
pub fn next_batches(&self, n: usize) -> RbResult<Option<RArray>> {
|
113
|
-
let batches = self
|
114
|
-
.
|
115
|
-
.
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
}))
|
129
|
+
let batches = match &mut *self.reader.borrow_mut() {
|
130
|
+
BatchedReader::MMap(reader) => reader.next_batches(n),
|
131
|
+
BatchedReader::Read(reader) => reader.next_batches(n),
|
132
|
+
}
|
133
|
+
.map_err(RbPolarsErr::from)?;
|
134
|
+
|
135
|
+
Ok(batches.map(|batches| RArray::from_iter(batches.into_iter().map(RbDataFrame::from))))
|
121
136
|
}
|
122
137
|
}
|
@@ -1,3 +1,6 @@
|
|
1
|
+
use std::fmt::{Display, Formatter};
|
2
|
+
use std::hash::{Hash, Hasher};
|
3
|
+
|
1
4
|
use magnus::{
|
2
5
|
class, exception, r_hash::ForEach, ruby_handle::RubyHandle, Integer, IntoValue, Module, RArray,
|
3
6
|
RFloat, RHash, RString, Symbol, TryConvert, Value, QNIL,
|
@@ -10,8 +13,7 @@ use polars::frame::NullStrategy;
|
|
10
13
|
use polars::io::avro::AvroCompression;
|
11
14
|
use polars::prelude::*;
|
12
15
|
use polars::series::ops::NullBehavior;
|
13
|
-
use
|
14
|
-
use std::hash::{Hash, Hasher};
|
16
|
+
use smartstring::alias::String as SmartString;
|
15
17
|
|
16
18
|
use crate::{RbDataFrame, RbLazyFrame, RbPolarsErr, RbResult, RbSeries, RbValueError};
|
17
19
|
|
@@ -82,6 +84,22 @@ impl TryConvert for Wrap<Utf8Chunked> {
|
|
82
84
|
}
|
83
85
|
}
|
84
86
|
|
87
|
+
impl TryConvert for Wrap<BinaryChunked> {
|
88
|
+
fn try_convert(obj: Value) -> RbResult<Self> {
|
89
|
+
let (seq, len) = get_rbseq(obj)?;
|
90
|
+
let mut builder = BinaryChunkedBuilder::new("", len, len * 25);
|
91
|
+
|
92
|
+
for res in seq.each() {
|
93
|
+
let item = res?;
|
94
|
+
match item.try_convert::<RString>() {
|
95
|
+
Ok(val) => builder.append_value(unsafe { val.as_slice() }),
|
96
|
+
Err(_) => builder.append_null(),
|
97
|
+
}
|
98
|
+
}
|
99
|
+
Ok(Wrap(builder.finish()))
|
100
|
+
}
|
101
|
+
}
|
102
|
+
|
85
103
|
impl TryConvert for Wrap<NullValues> {
|
86
104
|
fn try_convert(ob: Value) -> RbResult<Self> {
|
87
105
|
if let Ok(s) = ob.try_convert::<String>() {
|
@@ -98,6 +116,14 @@ impl TryConvert for Wrap<NullValues> {
|
|
98
116
|
}
|
99
117
|
}
|
100
118
|
|
119
|
+
fn struct_dict<'a>(vals: impl Iterator<Item = AnyValue<'a>>, flds: &[Field]) -> Value {
|
120
|
+
let dict = RHash::new();
|
121
|
+
for (fld, val) in flds.iter().zip(vals) {
|
122
|
+
dict.aset(fld.name().as_str(), Wrap(val)).unwrap()
|
123
|
+
}
|
124
|
+
dict.into_value()
|
125
|
+
}
|
126
|
+
|
101
127
|
impl IntoValue for Wrap<AnyValue<'_>> {
|
102
128
|
fn into_value_with(self, _: &RubyHandle) -> Value {
|
103
129
|
match self.0 {
|
@@ -114,7 +140,7 @@ impl IntoValue for Wrap<AnyValue<'_>> {
|
|
114
140
|
AnyValue::Null => *QNIL,
|
115
141
|
AnyValue::Boolean(v) => Value::from(v),
|
116
142
|
AnyValue::Utf8(v) => Value::from(v),
|
117
|
-
AnyValue::Utf8Owned(
|
143
|
+
AnyValue::Utf8Owned(v) => Value::from(v.as_str()),
|
118
144
|
AnyValue::Categorical(_idx, _rev, _arr) => todo!(),
|
119
145
|
AnyValue::Date(v) => class::time()
|
120
146
|
.funcall::<_, _, Value>("at", (v * 86400,))
|
@@ -157,12 +183,19 @@ impl IntoValue for Wrap<AnyValue<'_>> {
|
|
157
183
|
AnyValue::Duration(_v, _tu) => todo!(),
|
158
184
|
AnyValue::Time(_v) => todo!(),
|
159
185
|
AnyValue::List(v) => RbSeries::new(v).to_a().into_value(),
|
160
|
-
ref
|
161
|
-
AnyValue::StructOwned(
|
162
|
-
AnyValue::Object(
|
163
|
-
|
164
|
-
|
165
|
-
|
186
|
+
ref av @ AnyValue::Struct(_, _, flds) => struct_dict(av._iter_struct_av(), flds),
|
187
|
+
AnyValue::StructOwned(payload) => struct_dict(payload.0.into_iter(), &payload.1),
|
188
|
+
AnyValue::Object(v) => {
|
189
|
+
let object = v.as_any().downcast_ref::<ObjectValue>().unwrap();
|
190
|
+
object.inner
|
191
|
+
}
|
192
|
+
AnyValue::ObjectOwned(v) => {
|
193
|
+
let object = v.0.as_any().downcast_ref::<ObjectValue>().unwrap();
|
194
|
+
object.inner
|
195
|
+
}
|
196
|
+
AnyValue::Binary(v) => RString::from_slice(v).into_value(),
|
197
|
+
AnyValue::BinaryOwned(v) => RString::from_slice(&v).into_value(),
|
198
|
+
AnyValue::Decimal(_v, _scale) => todo!(),
|
166
199
|
}
|
167
200
|
}
|
168
201
|
}
|
@@ -182,7 +215,7 @@ impl IntoValue for Wrap<DataType> {
|
|
182
215
|
DataType::UInt64 => pl.const_get::<_, Value>("UInt64").unwrap(),
|
183
216
|
DataType::Float32 => pl.const_get::<_, Value>("Float32").unwrap(),
|
184
217
|
DataType::Float64 => pl.const_get::<_, Value>("Float64").unwrap(),
|
185
|
-
DataType::
|
218
|
+
DataType::Decimal(_precision, _scale) => todo!(),
|
186
219
|
DataType::Boolean => pl.const_get::<_, Value>("Boolean").unwrap(),
|
187
220
|
DataType::Utf8 => pl.const_get::<_, Value>("Utf8").unwrap(),
|
188
221
|
DataType::Binary => pl.const_get::<_, Value>("Binary").unwrap(),
|
@@ -210,7 +243,7 @@ impl IntoValue for Wrap<DataType> {
|
|
210
243
|
DataType::Struct(fields) => {
|
211
244
|
let field_class = pl.const_get::<_, Value>("Field").unwrap();
|
212
245
|
let iter = fields.iter().map(|fld| {
|
213
|
-
let name = fld.name().
|
246
|
+
let name = fld.name().as_str();
|
214
247
|
let dtype = Wrap(fld.data_type().clone());
|
215
248
|
field_class
|
216
249
|
.funcall::<_, _, Value>("new", (name, dtype))
|
@@ -340,7 +373,7 @@ impl<'s> TryConvert for Wrap<AnyValue<'s>> {
|
|
340
373
|
let n = 25;
|
341
374
|
let dtype = any_values_to_dtype(&avs[..std::cmp::min(avs.len(), n)])
|
342
375
|
.map_err(RbPolarsErr::from)?;
|
343
|
-
let s = Series::from_any_values_and_dtype("", &avs, &dtype)
|
376
|
+
let s = Series::from_any_values_and_dtype("", &avs, &dtype, true)
|
344
377
|
.map_err(RbPolarsErr::from)?;
|
345
378
|
Ok(Wrap(AnyValue::List(s)))
|
346
379
|
}
|
@@ -870,3 +903,11 @@ pub fn parse_parquet_compression(
|
|
870
903
|
};
|
871
904
|
Ok(parsed)
|
872
905
|
}
|
906
|
+
|
907
|
+
pub(crate) fn strings_to_smartstrings<I, S>(container: I) -> Vec<SmartString>
|
908
|
+
where
|
909
|
+
I: IntoIterator<Item = S>,
|
910
|
+
S: AsRef<str>,
|
911
|
+
{
|
912
|
+
container.into_iter().map(|s| s.as_ref().into()).collect()
|
913
|
+
}
|
data/ext/polars/src/dataframe.rs
CHANGED
@@ -115,7 +115,7 @@ impl RbDataFrame {
|
|
115
115
|
let comment_char: Option<String> = arguments[17].try_convert()?;
|
116
116
|
let quote_char: Option<String> = arguments[18].try_convert()?;
|
117
117
|
let null_values: Option<Wrap<NullValues>> = arguments[19].try_convert()?;
|
118
|
-
let
|
118
|
+
let try_parse_dates: bool = arguments[20].try_convert()?;
|
119
119
|
let skip_rows_after_header: usize = arguments[21].try_convert()?;
|
120
120
|
let row_count: Option<(String, IdxSize)> = arguments[22].try_convert()?;
|
121
121
|
let sample_size: usize = arguments[23].try_convert()?;
|
@@ -168,12 +168,12 @@ impl RbDataFrame {
|
|
168
168
|
.with_columns(columns)
|
169
169
|
.with_n_threads(n_threads)
|
170
170
|
.with_path(path)
|
171
|
-
.with_dtypes(overwrite_dtype.
|
171
|
+
.with_dtypes(overwrite_dtype.map(Arc::new))
|
172
172
|
.with_dtypes_slice(overwrite_dtype_slice.as_deref())
|
173
173
|
.low_memory(low_memory)
|
174
174
|
.with_comment_char(comment_char)
|
175
175
|
.with_null_values(null_values)
|
176
|
-
.
|
176
|
+
.with_try_parse_dates(try_parse_dates)
|
177
177
|
.with_quote_char(quote_char)
|
178
178
|
.with_end_of_line_char(eol_char)
|
179
179
|
.with_skip_rows_after_header(skip_rows_after_header)
|
@@ -184,6 +184,7 @@ impl RbDataFrame {
|
|
184
184
|
Ok(df.into())
|
185
185
|
}
|
186
186
|
|
187
|
+
#[allow(clippy::too_many_arguments)]
|
187
188
|
pub fn read_parquet(
|
188
189
|
rb_f: Value,
|
189
190
|
columns: Option<Vec<String>>,
|
@@ -192,6 +193,8 @@ impl RbDataFrame {
|
|
192
193
|
parallel: Wrap<ParallelStrategy>,
|
193
194
|
row_count: Option<(String, IdxSize)>,
|
194
195
|
low_memory: bool,
|
196
|
+
use_statistics: bool,
|
197
|
+
rechunk: bool,
|
195
198
|
) -> RbResult<Self> {
|
196
199
|
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
197
200
|
let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
|
@@ -202,6 +205,8 @@ impl RbDataFrame {
|
|
202
205
|
.with_n_rows(n_rows)
|
203
206
|
.with_row_count(row_count)
|
204
207
|
.set_low_memory(low_memory)
|
208
|
+
.use_statistics(use_statistics)
|
209
|
+
.set_rechunk(rechunk)
|
205
210
|
.finish()
|
206
211
|
.map_err(RbPolarsErr::from)?;
|
207
212
|
Ok(RbDataFrame::new(df))
|
@@ -254,7 +259,7 @@ impl RbDataFrame {
|
|
254
259
|
use polars::io::avro::AvroWriter;
|
255
260
|
|
256
261
|
if let Ok(s) = rb_f.try_convert::<String>() {
|
257
|
-
let f = std::fs::File::create(
|
262
|
+
let f = std::fs::File::create(s).unwrap();
|
258
263
|
AvroWriter::new(f)
|
259
264
|
.with_compression(compression.0)
|
260
265
|
.finish(&mut self.df.borrow_mut())
|
@@ -339,7 +344,7 @@ impl RbDataFrame {
|
|
339
344
|
// ensure the new names are used
|
340
345
|
if let Some(schema) = &schema_overwrite {
|
341
346
|
for (new_name, name) in schema.0.iter_names().zip(names.iter_mut()) {
|
342
|
-
*name = new_name.
|
347
|
+
*name = new_name.to_string();
|
343
348
|
}
|
344
349
|
}
|
345
350
|
let rbdf = Self::finish_from_rows(
|
@@ -348,17 +353,19 @@ impl RbDataFrame {
|
|
348
353
|
schema_overwrite.map(|wrap| wrap.0),
|
349
354
|
)?;
|
350
355
|
|
351
|
-
|
352
|
-
.
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
356
|
+
unsafe {
|
357
|
+
rbdf.df
|
358
|
+
.borrow_mut()
|
359
|
+
.get_columns_mut()
|
360
|
+
.iter_mut()
|
361
|
+
.zip(&names)
|
362
|
+
.for_each(|(s, name)| {
|
363
|
+
s.rename(name);
|
364
|
+
});
|
365
|
+
}
|
359
366
|
let length = names.len();
|
360
367
|
if names.into_iter().collect::<PlHashSet<_>>().len() != length {
|
361
|
-
let err = PolarsError::
|
368
|
+
let err = PolarsError::SchemaMismatch("duplicate column names found".into());
|
362
369
|
Err(RbPolarsErr::from(err))?;
|
363
370
|
}
|
364
371
|
|
@@ -394,7 +401,7 @@ impl RbDataFrame {
|
|
394
401
|
let null = null_value.unwrap_or_default();
|
395
402
|
|
396
403
|
if let Ok(s) = rb_f.try_convert::<String>() {
|
397
|
-
let f = std::fs::File::create(
|
404
|
+
let f = std::fs::File::create(s).unwrap();
|
398
405
|
// no need for a buffered writer, because the csv writer does internal buffering
|
399
406
|
CsvWriter::new(f)
|
400
407
|
.has_header(has_header)
|
@@ -436,7 +443,7 @@ impl RbDataFrame {
|
|
436
443
|
compression: Wrap<Option<IpcCompression>>,
|
437
444
|
) -> RbResult<()> {
|
438
445
|
if let Ok(s) = rb_f.try_convert::<String>() {
|
439
|
-
let f = std::fs::File::create(
|
446
|
+
let f = std::fs::File::create(s).unwrap();
|
440
447
|
IpcWriter::new(f)
|
441
448
|
.with_compression(compression.0)
|
442
449
|
.finish(&mut self.df.borrow_mut())
|
@@ -524,7 +531,7 @@ impl RbDataFrame {
|
|
524
531
|
let compression = parse_parquet_compression(&compression, compression_level)?;
|
525
532
|
|
526
533
|
if let Ok(s) = rb_f.try_convert::<String>() {
|
527
|
-
let f = std::fs::File::create(
|
534
|
+
let f = std::fs::File::create(s).unwrap();
|
528
535
|
ParquetWriter::new(f)
|
529
536
|
.with_compression(compression)
|
530
537
|
.with_statistics(statistics)
|
@@ -627,7 +634,7 @@ impl RbDataFrame {
|
|
627
634
|
}
|
628
635
|
|
629
636
|
pub fn get_columns(&self) -> RArray {
|
630
|
-
let cols = self.df.borrow().get_columns().
|
637
|
+
let cols = self.df.borrow().get_columns().to_vec();
|
631
638
|
to_rbseries_collection(cols)
|
632
639
|
}
|
633
640
|
|
@@ -881,10 +888,11 @@ impl RbDataFrame {
|
|
881
888
|
variable_name: Option<String>,
|
882
889
|
) -> RbResult<Self> {
|
883
890
|
let args = MeltArgs {
|
884
|
-
id_vars,
|
885
|
-
value_vars,
|
886
|
-
value_name,
|
887
|
-
variable_name,
|
891
|
+
id_vars: strings_to_smartstrings(id_vars),
|
892
|
+
value_vars: strings_to_smartstrings(value_vars),
|
893
|
+
value_name: value_name.map(|s| s.into()),
|
894
|
+
variable_name: variable_name.map(|s| s.into()),
|
895
|
+
streamable: false,
|
888
896
|
};
|
889
897
|
|
890
898
|
let df = self.df.borrow().melt2(args).map_err(RbPolarsErr::from)?;
|
@@ -897,22 +905,26 @@ impl RbDataFrame {
|
|
897
905
|
values: Vec<String>,
|
898
906
|
index: Vec<String>,
|
899
907
|
columns: Vec<String>,
|
900
|
-
aggregate_expr: &RbExpr,
|
901
908
|
maintain_order: bool,
|
902
909
|
sort_columns: bool,
|
910
|
+
aggregate_expr: Option<&RbExpr>,
|
903
911
|
separator: Option<String>,
|
904
912
|
) -> RbResult<Self> {
|
905
913
|
let fun = match maintain_order {
|
906
914
|
true => pivot_stable,
|
907
915
|
false => pivot,
|
908
916
|
};
|
917
|
+
let agg_expr = match aggregate_expr {
|
918
|
+
Some(aggregate_expr) => Some(aggregate_expr.inner.clone()),
|
919
|
+
None => None,
|
920
|
+
};
|
909
921
|
let df = fun(
|
910
922
|
&self.df.borrow(),
|
911
923
|
values,
|
912
924
|
index,
|
913
925
|
columns,
|
914
|
-
aggregate_expr.inner.clone(),
|
915
926
|
sort_columns,
|
927
|
+
agg_expr,
|
916
928
|
separator.as_deref(),
|
917
929
|
)
|
918
930
|
.map_err(RbPolarsErr::from)?;
|
@@ -933,21 +945,6 @@ impl RbDataFrame {
|
|
933
945
|
self.df.borrow().shift(periods).into()
|
934
946
|
}
|
935
947
|
|
936
|
-
pub fn unique(
|
937
|
-
&self,
|
938
|
-
maintain_order: bool,
|
939
|
-
subset: Option<Vec<String>>,
|
940
|
-
keep: Wrap<UniqueKeepStrategy>,
|
941
|
-
) -> RbResult<Self> {
|
942
|
-
let subset = subset.as_ref().map(|v| v.as_ref());
|
943
|
-
let df = match maintain_order {
|
944
|
-
true => self.df.borrow().unique_stable(subset, keep.0),
|
945
|
-
false => self.df.borrow().unique(subset, keep.0),
|
946
|
-
}
|
947
|
-
.map_err(RbPolarsErr::from)?;
|
948
|
-
Ok(df.into())
|
949
|
-
}
|
950
|
-
|
951
948
|
pub fn lazy(&self) -> RbLazyFrame {
|
952
949
|
self.df.borrow().clone().lazy().into()
|
953
950
|
}
|
@@ -4,6 +4,7 @@ use polars::lazy::frame::{LazyFrame, LazyGroupBy};
|
|
4
4
|
use polars::prelude::*;
|
5
5
|
use std::cell::RefCell;
|
6
6
|
use std::io::{BufWriter, Read};
|
7
|
+
use std::path::PathBuf;
|
7
8
|
|
8
9
|
use crate::conversion::*;
|
9
10
|
use crate::file::get_file_like;
|
@@ -118,7 +119,7 @@ impl RbLazyFrame {
|
|
118
119
|
let skip_rows_after_header: usize = arguments[15].try_convert()?;
|
119
120
|
let encoding: Wrap<CsvEncoding> = arguments[16].try_convert()?;
|
120
121
|
let row_count: Option<(String, IdxSize)> = arguments[17].try_convert()?;
|
121
|
-
let
|
122
|
+
let try_parse_dates: bool = arguments[18].try_convert()?;
|
122
123
|
let eol_char: String = arguments[19].try_convert()?;
|
123
124
|
// end arguments
|
124
125
|
|
@@ -153,7 +154,7 @@ impl RbLazyFrame {
|
|
153
154
|
.with_skip_rows_after_header(skip_rows_after_header)
|
154
155
|
.with_encoding(encoding.0)
|
155
156
|
.with_row_count(row_count)
|
156
|
-
.
|
157
|
+
.with_try_parse_dates(try_parse_dates)
|
157
158
|
.with_null_values(null_values);
|
158
159
|
|
159
160
|
if let Some(_lambda) = with_schema_modify {
|
@@ -163,6 +164,7 @@ impl RbLazyFrame {
|
|
163
164
|
Ok(r.finish().map_err(RbPolarsErr::from)?.into())
|
164
165
|
}
|
165
166
|
|
167
|
+
#[allow(clippy::too_many_arguments)]
|
166
168
|
pub fn new_from_parquet(
|
167
169
|
path: String,
|
168
170
|
n_rows: Option<usize>,
|
@@ -171,6 +173,7 @@ impl RbLazyFrame {
|
|
171
173
|
rechunk: bool,
|
172
174
|
row_count: Option<(String, IdxSize)>,
|
173
175
|
low_memory: bool,
|
176
|
+
use_statistics: bool,
|
174
177
|
) -> RbResult<Self> {
|
175
178
|
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
176
179
|
let args = ScanArgsParquet {
|
@@ -182,6 +185,7 @@ impl RbLazyFrame {
|
|
182
185
|
low_memory,
|
183
186
|
// TODO support cloud options
|
184
187
|
cloud_options: None,
|
188
|
+
use_statistics,
|
185
189
|
};
|
186
190
|
let lf = LazyFrame::scan_parquet(path, args).map_err(RbPolarsErr::from)?;
|
187
191
|
Ok(lf.into())
|
@@ -284,6 +288,32 @@ impl RbLazyFrame {
|
|
284
288
|
Ok(df.into())
|
285
289
|
}
|
286
290
|
|
291
|
+
#[allow(clippy::too_many_arguments)]
|
292
|
+
pub fn sink_parquet(
|
293
|
+
&self,
|
294
|
+
path: PathBuf,
|
295
|
+
compression: String,
|
296
|
+
compression_level: Option<i32>,
|
297
|
+
statistics: bool,
|
298
|
+
row_group_size: Option<usize>,
|
299
|
+
data_pagesize_limit: Option<usize>,
|
300
|
+
maintain_order: bool,
|
301
|
+
) -> RbResult<()> {
|
302
|
+
let compression = parse_parquet_compression(&compression, compression_level)?;
|
303
|
+
|
304
|
+
let options = ParquetWriteOptions {
|
305
|
+
compression,
|
306
|
+
statistics,
|
307
|
+
row_group_size,
|
308
|
+
data_pagesize_limit,
|
309
|
+
maintain_order,
|
310
|
+
};
|
311
|
+
|
312
|
+
let ldf = self.ldf.clone();
|
313
|
+
ldf.sink_parquet(path, options).map_err(RbPolarsErr::from)?;
|
314
|
+
Ok(())
|
315
|
+
}
|
316
|
+
|
287
317
|
pub fn fetch(&self, n_rows: usize) -> RbResult<RbDataFrame> {
|
288
318
|
let ldf = self.ldf.clone();
|
289
319
|
let df = ldf.fetch(n_rows).map_err(RbPolarsErr::from)?;
|
@@ -328,7 +358,7 @@ impl RbLazyFrame {
|
|
328
358
|
let lazy_gb = ldf.groupby_rolling(
|
329
359
|
by,
|
330
360
|
RollingGroupOptions {
|
331
|
-
index_column,
|
361
|
+
index_column: index_column.into(),
|
332
362
|
period: Duration::parse(&period),
|
333
363
|
offset: Duration::parse(&offset),
|
334
364
|
closed_window,
|
@@ -359,7 +389,7 @@ impl RbLazyFrame {
|
|
359
389
|
let lazy_gb = ldf.groupby_dynamic(
|
360
390
|
by,
|
361
391
|
DynamicGroupOptions {
|
362
|
-
index_column,
|
392
|
+
index_column: index_column.into(),
|
363
393
|
every: Duration::parse(&every),
|
364
394
|
period: Duration::parse(&period),
|
365
395
|
offset: Duration::parse(&offset),
|
@@ -415,10 +445,10 @@ impl RbLazyFrame {
|
|
415
445
|
.force_parallel(force_parallel)
|
416
446
|
.how(JoinType::AsOf(AsOfOptions {
|
417
447
|
strategy: strategy.0,
|
418
|
-
left_by,
|
419
|
-
right_by,
|
448
|
+
left_by: left_by.map(strings_to_smartstrings),
|
449
|
+
right_by: right_by.map(strings_to_smartstrings),
|
420
450
|
tolerance: tolerance.map(|t| t.0.into_static().unwrap()),
|
421
|
-
tolerance_str,
|
451
|
+
tolerance_str: tolerance_str.map(|s| s.into()),
|
422
452
|
}))
|
423
453
|
.suffix(suffix)
|
424
454
|
.finish()
|
@@ -570,12 +600,14 @@ impl RbLazyFrame {
|
|
570
600
|
value_vars: Vec<String>,
|
571
601
|
value_name: Option<String>,
|
572
602
|
variable_name: Option<String>,
|
603
|
+
streamable: bool,
|
573
604
|
) -> Self {
|
574
605
|
let args = MeltArgs {
|
575
|
-
id_vars,
|
576
|
-
value_vars,
|
577
|
-
value_name,
|
578
|
-
variable_name,
|
606
|
+
id_vars: strings_to_smartstrings(id_vars),
|
607
|
+
value_vars: strings_to_smartstrings(value_vars),
|
608
|
+
value_name: value_name.map(|s| s.into()),
|
609
|
+
variable_name: variable_name.map(|s| s.into()),
|
610
|
+
streamable,
|
579
611
|
};
|
580
612
|
|
581
613
|
let ldf = self.ldf.clone();
|
@@ -596,8 +628,10 @@ impl RbLazyFrame {
|
|
596
628
|
self.ldf.clone().into()
|
597
629
|
}
|
598
630
|
|
599
|
-
pub fn columns(&self) -> RbResult<
|
600
|
-
|
631
|
+
pub fn columns(&self) -> RbResult<RArray> {
|
632
|
+
let schema = self.get_schema()?;
|
633
|
+
let iter = schema.iter_names().map(|s| s.as_str());
|
634
|
+
Ok(RArray::from_iter(iter))
|
601
635
|
}
|
602
636
|
|
603
637
|
pub fn dtypes(&self) -> RbResult<RArray> {
|
@@ -614,7 +648,7 @@ impl RbLazyFrame {
|
|
614
648
|
// TODO remove unwrap
|
615
649
|
schema_dict
|
616
650
|
.aset::<String, Value>(
|
617
|
-
fld.name().
|
651
|
+
fld.name().to_string(),
|
618
652
|
Wrap(fld.data_type().clone()).into_value(),
|
619
653
|
)
|
620
654
|
.unwrap();
|