polars-df 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/Cargo.lock +335 -310
- data/Cargo.toml +0 -1
- data/README.md +69 -2
- data/ext/polars/Cargo.toml +5 -3
- data/ext/polars/src/batched_csv.rs +29 -14
- data/ext/polars/src/conversion.rs +69 -16
- data/ext/polars/src/dataframe.rs +56 -39
- data/ext/polars/src/error.rs +8 -0
- data/ext/polars/src/lazy/dataframe.rs +48 -14
- data/ext/polars/src/lazy/dsl.rs +69 -4
- data/ext/polars/src/lib.rs +24 -5
- data/ext/polars/src/numo.rs +57 -0
- data/ext/polars/src/series.rs +57 -33
- data/lib/polars/batched_csv_reader.rb +1 -1
- data/lib/polars/binary_expr.rb +77 -0
- data/lib/polars/binary_name_space.rb +66 -0
- data/lib/polars/data_frame.rb +89 -43
- data/lib/polars/data_types.rb +4 -0
- data/lib/polars/date_time_expr.rb +6 -6
- data/lib/polars/expr.rb +9 -2
- data/lib/polars/group_by.rb +11 -0
- data/lib/polars/io.rb +73 -62
- data/lib/polars/lazy_frame.rb +103 -7
- data/lib/polars/lazy_functions.rb +3 -2
- data/lib/polars/list_expr.rb +2 -2
- data/lib/polars/list_name_space.rb +2 -2
- data/lib/polars/plot.rb +109 -0
- data/lib/polars/series.rb +50 -4
- data/lib/polars/string_expr.rb +1 -1
- data/lib/polars/utils.rb +10 -2
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +3 -0
- metadata +7 -3
data/Cargo.toml
CHANGED
@@ -4,7 +4,6 @@ members = ["ext/polars"]
|
|
4
4
|
[patch.crates-io]
|
5
5
|
jsonpath_lib = { git = "https://github.com/ritchie46/jsonpath", rev = "24eaf0b4416edff38a4d1b6b17bc4b9f3f047b4b" }
|
6
6
|
halfbrown = { git = "https://github.com/Licenser/halfbrown", rev = "952023c5dd6461b009bb5ba66b9aa979bd75949f" }
|
7
|
-
arrow2 = { git = "https://github.com/ankane/arrow2", rev = "ef0270922a217070ba9942567c0ff3263ae8c531" }
|
8
7
|
|
9
8
|
[profile.release]
|
10
9
|
strip = true
|
data/README.md
CHANGED
@@ -50,6 +50,9 @@ From Parquet
|
|
50
50
|
|
51
51
|
```ruby
|
52
52
|
Polars.read_parquet("file.parquet")
|
53
|
+
|
54
|
+
# or lazily with
|
55
|
+
Polars.scan_parquet("file.parquet")
|
53
56
|
```
|
54
57
|
|
55
58
|
From Active Record
|
@@ -60,6 +63,32 @@ Polars.read_sql(User.all)
|
|
60
63
|
Polars.read_sql("SELECT * FROM users")
|
61
64
|
```
|
62
65
|
|
66
|
+
From JSON
|
67
|
+
|
68
|
+
```ruby
|
69
|
+
Polars.read_json("file.json")
|
70
|
+
# or
|
71
|
+
Polars.read_ndjson("file.ndjson")
|
72
|
+
|
73
|
+
# or lazily with
|
74
|
+
Polars.scan_ndjson("file.ndjson")
|
75
|
+
```
|
76
|
+
|
77
|
+
From Feather / Arrow IPC
|
78
|
+
|
79
|
+
```ruby
|
80
|
+
Polars.read_ipc("file.arrow")
|
81
|
+
|
82
|
+
# or lazily with
|
83
|
+
Polars.scan_ipc("file.arrow")
|
84
|
+
```
|
85
|
+
|
86
|
+
From Avro
|
87
|
+
|
88
|
+
```ruby
|
89
|
+
Polars.read_avro("file.avro")
|
90
|
+
```
|
91
|
+
|
63
92
|
From a hash
|
64
93
|
|
65
94
|
```ruby
|
@@ -282,10 +311,10 @@ df.to_dummies
|
|
282
311
|
|
283
312
|
## Conversion
|
284
313
|
|
285
|
-
Array of rows
|
314
|
+
Array of hashes
|
286
315
|
|
287
316
|
```ruby
|
288
|
-
df.rows
|
317
|
+
df.rows(named: true)
|
289
318
|
```
|
290
319
|
|
291
320
|
Hash of series
|
@@ -308,6 +337,12 @@ Parquet
|
|
308
337
|
df.write_parquet("file.parquet")
|
309
338
|
```
|
310
339
|
|
340
|
+
Numo array
|
341
|
+
|
342
|
+
```ruby
|
343
|
+
df.to_numo
|
344
|
+
```
|
345
|
+
|
311
346
|
## Types
|
312
347
|
|
313
348
|
You can specify column types when creating a data frame
|
@@ -343,6 +378,38 @@ Cast a column
|
|
343
378
|
df["a"].cast(Polars::Int32)
|
344
379
|
```
|
345
380
|
|
381
|
+
## Visualization
|
382
|
+
|
383
|
+
Add [Vega](https://github.com/ankane/vega-ruby) to your application’s Gemfile:
|
384
|
+
|
385
|
+
```ruby
|
386
|
+
gem "vega"
|
387
|
+
```
|
388
|
+
|
389
|
+
And use:
|
390
|
+
|
391
|
+
```ruby
|
392
|
+
df.plot("a", "b")
|
393
|
+
```
|
394
|
+
|
395
|
+
Specify the chart type (`line`, `pie`, `column`, `bar`, `area`, or `scatter`)
|
396
|
+
|
397
|
+
```ruby
|
398
|
+
df.plot("a", "b", type: "pie")
|
399
|
+
```
|
400
|
+
|
401
|
+
Group data
|
402
|
+
|
403
|
+
```ruby
|
404
|
+
df.groupby("c").plot("a", "b")
|
405
|
+
```
|
406
|
+
|
407
|
+
Stacked columns or bars
|
408
|
+
|
409
|
+
```ruby
|
410
|
+
df.groupby("c").plot("a", "b", stacked: true)
|
411
|
+
```
|
412
|
+
|
346
413
|
## History
|
347
414
|
|
348
415
|
View the [changelog](CHANGELOG.md)
|
data/ext/polars/Cargo.toml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
[package]
|
2
2
|
name = "polars"
|
3
|
-
version = "0.3.0"
|
3
|
+
version = "0.4.0"
|
4
4
|
license = "MIT"
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
6
6
|
edition = "2021"
|
@@ -12,11 +12,12 @@ crate-type = ["cdylib"]
|
|
12
12
|
[dependencies]
|
13
13
|
ahash = "0.8"
|
14
14
|
magnus = "0.5"
|
15
|
-
polars-core = "0.
|
15
|
+
polars-core = "0.28.0"
|
16
16
|
serde_json = "1"
|
17
|
+
smartstring = "1"
|
17
18
|
|
18
19
|
[dependencies.polars]
|
19
|
-
version = "0.
|
20
|
+
version = "0.28.0"
|
20
21
|
features = [
|
21
22
|
"abs",
|
22
23
|
"arange",
|
@@ -44,6 +45,7 @@ features = [
|
|
44
45
|
"ipc",
|
45
46
|
"is_first",
|
46
47
|
"is_in",
|
48
|
+
"is_unique",
|
47
49
|
"json",
|
48
50
|
"lazy",
|
49
51
|
"lazy_regex",
|
@@ -7,11 +7,17 @@ use std::cell::RefCell;
|
|
7
7
|
use std::path::PathBuf;
|
8
8
|
|
9
9
|
use crate::conversion::*;
|
10
|
+
use crate::prelude::read_impl::OwnedBatchedCsvReaderMmap;
|
10
11
|
use crate::{RbDataFrame, RbPolarsErr, RbResult};
|
11
12
|
|
13
|
+
pub enum BatchedReader {
|
14
|
+
MMap(OwnedBatchedCsvReaderMmap),
|
15
|
+
Read(OwnedBatchedCsvReader),
|
16
|
+
}
|
17
|
+
|
12
18
|
#[magnus::wrap(class = "Polars::RbBatchedCsv")]
|
13
19
|
pub struct RbBatchedCsv {
|
14
|
-
pub reader: RefCell<
|
20
|
+
pub reader: RefCell<BatchedReader>,
|
15
21
|
}
|
16
22
|
|
17
23
|
impl RbBatchedCsv {
|
@@ -38,7 +44,7 @@ impl RbBatchedCsv {
|
|
38
44
|
let comment_char: Option<String> = arguments[16].try_convert()?;
|
39
45
|
let quote_char: Option<String> = arguments[17].try_convert()?;
|
40
46
|
let null_values: Option<Wrap<NullValues>> = arguments[18].try_convert()?;
|
41
|
-
let
|
47
|
+
let try_parse_dates: bool = arguments[19].try_convert()?;
|
42
48
|
let skip_rows_after_header: usize = arguments[20].try_convert()?;
|
43
49
|
let row_count: Option<(String, IdxSize)> = arguments[21].try_convert()?;
|
44
50
|
let sample_size: usize = arguments[22].try_convert()?;
|
@@ -95,14 +101,24 @@ impl RbBatchedCsv {
|
|
95
101
|
.low_memory(low_memory)
|
96
102
|
.with_comment_char(comment_char)
|
97
103
|
.with_null_values(null_values)
|
98
|
-
.
|
104
|
+
.with_try_parse_dates(try_parse_dates)
|
99
105
|
.with_quote_char(quote_char)
|
100
106
|
.with_end_of_line_char(eol_char)
|
101
107
|
.with_skip_rows_after_header(skip_rows_after_header)
|
102
108
|
.with_row_count(row_count)
|
103
|
-
.sample_size(sample_size)
|
104
|
-
|
105
|
-
|
109
|
+
.sample_size(sample_size);
|
110
|
+
|
111
|
+
let reader = if low_memory {
|
112
|
+
let reader = reader
|
113
|
+
.batched_read(overwrite_dtype.map(Arc::new))
|
114
|
+
.map_err(RbPolarsErr::from)?;
|
115
|
+
BatchedReader::Read(reader)
|
116
|
+
} else {
|
117
|
+
let reader = reader
|
118
|
+
.batched_mmap(overwrite_dtype.map(Arc::new))
|
119
|
+
.map_err(RbPolarsErr::from)?;
|
120
|
+
BatchedReader::MMap(reader)
|
121
|
+
};
|
106
122
|
|
107
123
|
Ok(RbBatchedCsv {
|
108
124
|
reader: RefCell::new(reader),
|
@@ -110,13 +126,12 @@ impl RbBatchedCsv {
|
|
110
126
|
}
|
111
127
|
|
112
128
|
pub fn next_batches(&self, n: usize) -> RbResult<Option<RArray>> {
|
113
|
-
let batches = self
|
114
|
-
.
|
115
|
-
.
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
}))
|
129
|
+
let batches = match &mut *self.reader.borrow_mut() {
|
130
|
+
BatchedReader::MMap(reader) => reader.next_batches(n),
|
131
|
+
BatchedReader::Read(reader) => reader.next_batches(n),
|
132
|
+
}
|
133
|
+
.map_err(RbPolarsErr::from)?;
|
134
|
+
|
135
|
+
Ok(batches.map(|batches| RArray::from_iter(batches.into_iter().map(RbDataFrame::from))))
|
121
136
|
}
|
122
137
|
}
|
@@ -1,3 +1,6 @@
|
|
1
|
+
use std::fmt::{Display, Formatter};
|
2
|
+
use std::hash::{Hash, Hasher};
|
3
|
+
|
1
4
|
use magnus::{
|
2
5
|
class, exception, r_hash::ForEach, ruby_handle::RubyHandle, Integer, IntoValue, Module, RArray,
|
3
6
|
RFloat, RHash, RString, Symbol, TryConvert, Value, QNIL,
|
@@ -10,8 +13,7 @@ use polars::frame::NullStrategy;
|
|
10
13
|
use polars::io::avro::AvroCompression;
|
11
14
|
use polars::prelude::*;
|
12
15
|
use polars::series::ops::NullBehavior;
|
13
|
-
use std::fmt::{Display, Formatter};
|
14
|
-
use std::hash::{Hash, Hasher};
|
16
|
+
use smartstring::alias::String as SmartString;
|
15
17
|
|
16
18
|
use crate::{RbDataFrame, RbLazyFrame, RbPolarsErr, RbResult, RbSeries, RbValueError};
|
17
19
|
|
@@ -82,6 +84,22 @@ impl TryConvert for Wrap<Utf8Chunked> {
|
|
82
84
|
}
|
83
85
|
}
|
84
86
|
|
87
|
+
impl TryConvert for Wrap<BinaryChunked> {
|
88
|
+
fn try_convert(obj: Value) -> RbResult<Self> {
|
89
|
+
let (seq, len) = get_rbseq(obj)?;
|
90
|
+
let mut builder = BinaryChunkedBuilder::new("", len, len * 25);
|
91
|
+
|
92
|
+
for res in seq.each() {
|
93
|
+
let item = res?;
|
94
|
+
match item.try_convert::<RString>() {
|
95
|
+
Ok(val) => builder.append_value(unsafe { val.as_slice() }),
|
96
|
+
Err(_) => builder.append_null(),
|
97
|
+
}
|
98
|
+
}
|
99
|
+
Ok(Wrap(builder.finish()))
|
100
|
+
}
|
101
|
+
}
|
102
|
+
|
85
103
|
impl TryConvert for Wrap<NullValues> {
|
86
104
|
fn try_convert(ob: Value) -> RbResult<Self> {
|
87
105
|
if let Ok(s) = ob.try_convert::<String>() {
|
@@ -98,6 +116,14 @@ impl TryConvert for Wrap<NullValues> {
|
|
98
116
|
}
|
99
117
|
}
|
100
118
|
|
119
|
+
fn struct_dict<'a>(vals: impl Iterator<Item = AnyValue<'a>>, flds: &[Field]) -> Value {
|
120
|
+
let dict = RHash::new();
|
121
|
+
for (fld, val) in flds.iter().zip(vals) {
|
122
|
+
dict.aset(fld.name().as_str(), Wrap(val)).unwrap()
|
123
|
+
}
|
124
|
+
dict.into_value()
|
125
|
+
}
|
126
|
+
|
101
127
|
impl IntoValue for Wrap<AnyValue<'_>> {
|
102
128
|
fn into_value_with(self, _: &RubyHandle) -> Value {
|
103
129
|
match self.0 {
|
@@ -114,7 +140,7 @@ impl IntoValue for Wrap<AnyValue<'_>> {
|
|
114
140
|
AnyValue::Null => *QNIL,
|
115
141
|
AnyValue::Boolean(v) => Value::from(v),
|
116
142
|
AnyValue::Utf8(v) => Value::from(v),
|
117
|
-
AnyValue::Utf8Owned(
|
143
|
+
AnyValue::Utf8Owned(v) => Value::from(v.as_str()),
|
118
144
|
AnyValue::Categorical(_idx, _rev, _arr) => todo!(),
|
119
145
|
AnyValue::Date(v) => class::time()
|
120
146
|
.funcall::<_, _, Value>("at", (v * 86400,))
|
@@ -125,7 +151,13 @@ impl IntoValue for Wrap<AnyValue<'_>> {
|
|
125
151
|
.unwrap(),
|
126
152
|
AnyValue::Datetime(v, tu, tz) => {
|
127
153
|
let t = match tu {
|
128
|
-
TimeUnit::Nanoseconds =>
|
154
|
+
TimeUnit::Nanoseconds => {
|
155
|
+
let sec = v / 1000000000;
|
156
|
+
let subsec = v % 1000000000;
|
157
|
+
class::time()
|
158
|
+
.funcall::<_, _, Value>("at", (sec, subsec, Symbol::new("nsec")))
|
159
|
+
.unwrap()
|
160
|
+
}
|
129
161
|
TimeUnit::Microseconds => {
|
130
162
|
let sec = v / 1000000;
|
131
163
|
let subsec = v % 1000000;
|
@@ -133,7 +165,13 @@ impl IntoValue for Wrap<AnyValue<'_>> {
|
|
133
165
|
.funcall::<_, _, Value>("at", (sec, subsec, Symbol::new("usec")))
|
134
166
|
.unwrap()
|
135
167
|
}
|
136
|
-
TimeUnit::Milliseconds =>
|
168
|
+
TimeUnit::Milliseconds => {
|
169
|
+
let sec = v / 1000;
|
170
|
+
let subsec = v % 1000;
|
171
|
+
class::time()
|
172
|
+
.funcall::<_, _, Value>("at", (sec, subsec, Symbol::new("millisecond")))
|
173
|
+
.unwrap()
|
174
|
+
}
|
137
175
|
};
|
138
176
|
|
139
177
|
if tz.is_some() {
|
@@ -145,12 +183,19 @@ impl IntoValue for Wrap<AnyValue<'_>> {
|
|
145
183
|
AnyValue::Duration(_v, _tu) => todo!(),
|
146
184
|
AnyValue::Time(_v) => todo!(),
|
147
185
|
AnyValue::List(v) => RbSeries::new(v).to_a().into_value(),
|
148
|
-
ref
|
149
|
-
AnyValue::StructOwned(
|
150
|
-
AnyValue::Object(
|
151
|
-
|
152
|
-
|
153
|
-
|
186
|
+
ref av @ AnyValue::Struct(_, _, flds) => struct_dict(av._iter_struct_av(), flds),
|
187
|
+
AnyValue::StructOwned(payload) => struct_dict(payload.0.into_iter(), &payload.1),
|
188
|
+
AnyValue::Object(v) => {
|
189
|
+
let object = v.as_any().downcast_ref::<ObjectValue>().unwrap();
|
190
|
+
object.inner
|
191
|
+
}
|
192
|
+
AnyValue::ObjectOwned(v) => {
|
193
|
+
let object = v.0.as_any().downcast_ref::<ObjectValue>().unwrap();
|
194
|
+
object.inner
|
195
|
+
}
|
196
|
+
AnyValue::Binary(v) => RString::from_slice(v).into_value(),
|
197
|
+
AnyValue::BinaryOwned(v) => RString::from_slice(&v).into_value(),
|
198
|
+
AnyValue::Decimal(_v, _scale) => todo!(),
|
154
199
|
}
|
155
200
|
}
|
156
201
|
}
|
@@ -170,12 +215,12 @@ impl IntoValue for Wrap<DataType> {
|
|
170
215
|
DataType::UInt64 => pl.const_get::<_, Value>("UInt64").unwrap(),
|
171
216
|
DataType::Float32 => pl.const_get::<_, Value>("Float32").unwrap(),
|
172
217
|
DataType::Float64 => pl.const_get::<_, Value>("Float64").unwrap(),
|
173
|
-
DataType::
|
218
|
+
DataType::Decimal(_precision, _scale) => todo!(),
|
174
219
|
DataType::Boolean => pl.const_get::<_, Value>("Boolean").unwrap(),
|
175
220
|
DataType::Utf8 => pl.const_get::<_, Value>("Utf8").unwrap(),
|
176
221
|
DataType::Binary => pl.const_get::<_, Value>("Binary").unwrap(),
|
177
222
|
DataType::List(inner) => {
|
178
|
-
let inner = Wrap(*inner
|
223
|
+
let inner = Wrap(*inner);
|
179
224
|
let list_class = pl.const_get::<_, Value>("List").unwrap();
|
180
225
|
list_class.funcall::<_, _, Value>("new", (inner,)).unwrap()
|
181
226
|
}
|
@@ -183,7 +228,7 @@ impl IntoValue for Wrap<DataType> {
|
|
183
228
|
DataType::Datetime(tu, tz) => {
|
184
229
|
let datetime_class = pl.const_get::<_, Value>("Datetime").unwrap();
|
185
230
|
datetime_class
|
186
|
-
.funcall::<_, _, Value>("new", (tu.to_ascii(), tz
|
231
|
+
.funcall::<_, _, Value>("new", (tu.to_ascii(), tz))
|
187
232
|
.unwrap()
|
188
233
|
}
|
189
234
|
DataType::Duration(tu) => {
|
@@ -198,7 +243,7 @@ impl IntoValue for Wrap<DataType> {
|
|
198
243
|
DataType::Struct(fields) => {
|
199
244
|
let field_class = pl.const_get::<_, Value>("Field").unwrap();
|
200
245
|
let iter = fields.iter().map(|fld| {
|
201
|
-
let name = fld.name().
|
246
|
+
let name = fld.name().as_str();
|
202
247
|
let dtype = Wrap(fld.data_type().clone());
|
203
248
|
field_class
|
204
249
|
.funcall::<_, _, Value>("new", (name, dtype))
|
@@ -328,7 +373,7 @@ impl<'s> TryConvert for Wrap<AnyValue<'s>> {
|
|
328
373
|
let n = 25;
|
329
374
|
let dtype = any_values_to_dtype(&avs[..std::cmp::min(avs.len(), n)])
|
330
375
|
.map_err(RbPolarsErr::from)?;
|
331
|
-
let s = Series::from_any_values_and_dtype("", &avs, &dtype)
|
376
|
+
let s = Series::from_any_values_and_dtype("", &avs, &dtype, true)
|
332
377
|
.map_err(RbPolarsErr::from)?;
|
333
378
|
Ok(Wrap(AnyValue::List(s)))
|
334
379
|
}
|
@@ -858,3 +903,11 @@ pub fn parse_parquet_compression(
|
|
858
903
|
};
|
859
904
|
Ok(parsed)
|
860
905
|
}
|
906
|
+
|
907
|
+
pub(crate) fn strings_to_smartstrings<I, S>(container: I) -> Vec<SmartString>
|
908
|
+
where
|
909
|
+
I: IntoIterator<Item = S>,
|
910
|
+
S: AsRef<str>,
|
911
|
+
{
|
912
|
+
container.into_iter().map(|s| s.as_ref().into()).collect()
|
913
|
+
}
|
data/ext/polars/src/dataframe.rs
CHANGED
@@ -6,6 +6,7 @@ use polars::io::mmap::ReaderBytes;
|
|
6
6
|
use polars::io::RowCount;
|
7
7
|
use polars::prelude::pivot::{pivot, pivot_stable};
|
8
8
|
use polars::prelude::*;
|
9
|
+
use polars_core::utils::try_get_supertype;
|
9
10
|
use std::cell::RefCell;
|
10
11
|
use std::io::{BufWriter, Cursor};
|
11
12
|
use std::ops::Deref;
|
@@ -114,7 +115,7 @@ impl RbDataFrame {
|
|
114
115
|
let comment_char: Option<String> = arguments[17].try_convert()?;
|
115
116
|
let quote_char: Option<String> = arguments[18].try_convert()?;
|
116
117
|
let null_values: Option<Wrap<NullValues>> = arguments[19].try_convert()?;
|
117
|
-
let
|
118
|
+
let try_parse_dates: bool = arguments[20].try_convert()?;
|
118
119
|
let skip_rows_after_header: usize = arguments[21].try_convert()?;
|
119
120
|
let row_count: Option<(String, IdxSize)> = arguments[22].try_convert()?;
|
120
121
|
let sample_size: usize = arguments[23].try_convert()?;
|
@@ -167,12 +168,12 @@ impl RbDataFrame {
|
|
167
168
|
.with_columns(columns)
|
168
169
|
.with_n_threads(n_threads)
|
169
170
|
.with_path(path)
|
170
|
-
.with_dtypes(overwrite_dtype.
|
171
|
+
.with_dtypes(overwrite_dtype.map(Arc::new))
|
171
172
|
.with_dtypes_slice(overwrite_dtype_slice.as_deref())
|
172
173
|
.low_memory(low_memory)
|
173
174
|
.with_comment_char(comment_char)
|
174
175
|
.with_null_values(null_values)
|
175
|
-
.
|
176
|
+
.with_try_parse_dates(try_parse_dates)
|
176
177
|
.with_quote_char(quote_char)
|
177
178
|
.with_end_of_line_char(eol_char)
|
178
179
|
.with_skip_rows_after_header(skip_rows_after_header)
|
@@ -183,6 +184,7 @@ impl RbDataFrame {
|
|
183
184
|
Ok(df.into())
|
184
185
|
}
|
185
186
|
|
187
|
+
#[allow(clippy::too_many_arguments)]
|
186
188
|
pub fn read_parquet(
|
187
189
|
rb_f: Value,
|
188
190
|
columns: Option<Vec<String>>,
|
@@ -191,6 +193,8 @@ impl RbDataFrame {
|
|
191
193
|
parallel: Wrap<ParallelStrategy>,
|
192
194
|
row_count: Option<(String, IdxSize)>,
|
193
195
|
low_memory: bool,
|
196
|
+
use_statistics: bool,
|
197
|
+
rechunk: bool,
|
194
198
|
) -> RbResult<Self> {
|
195
199
|
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
196
200
|
let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
|
@@ -201,6 +205,8 @@ impl RbDataFrame {
|
|
201
205
|
.with_n_rows(n_rows)
|
202
206
|
.with_row_count(row_count)
|
203
207
|
.set_low_memory(low_memory)
|
208
|
+
.use_statistics(use_statistics)
|
209
|
+
.set_rechunk(rechunk)
|
204
210
|
.finish()
|
205
211
|
.map_err(RbPolarsErr::from)?;
|
206
212
|
Ok(RbDataFrame::new(df))
|
@@ -253,7 +259,7 @@ impl RbDataFrame {
|
|
253
259
|
use polars::io::avro::AvroWriter;
|
254
260
|
|
255
261
|
if let Ok(s) = rb_f.try_convert::<String>() {
|
256
|
-
let f = std::fs::File::create(
|
262
|
+
let f = std::fs::File::create(s).unwrap();
|
257
263
|
AvroWriter::new(f)
|
258
264
|
.with_compression(compression.0)
|
259
265
|
.finish(&mut self.df.borrow_mut())
|
@@ -338,7 +344,7 @@ impl RbDataFrame {
|
|
338
344
|
// ensure the new names are used
|
339
345
|
if let Some(schema) = &schema_overwrite {
|
340
346
|
for (new_name, name) in schema.0.iter_names().zip(names.iter_mut()) {
|
341
|
-
*name = new_name.
|
347
|
+
*name = new_name.to_string();
|
342
348
|
}
|
343
349
|
}
|
344
350
|
let rbdf = Self::finish_from_rows(
|
@@ -347,17 +353,19 @@ impl RbDataFrame {
|
|
347
353
|
schema_overwrite.map(|wrap| wrap.0),
|
348
354
|
)?;
|
349
355
|
|
350
|
-
|
351
|
-
.
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
356
|
+
unsafe {
|
357
|
+
rbdf.df
|
358
|
+
.borrow_mut()
|
359
|
+
.get_columns_mut()
|
360
|
+
.iter_mut()
|
361
|
+
.zip(&names)
|
362
|
+
.for_each(|(s, name)| {
|
363
|
+
s.rename(name);
|
364
|
+
});
|
365
|
+
}
|
358
366
|
let length = names.len();
|
359
367
|
if names.into_iter().collect::<PlHashSet<_>>().len() != length {
|
360
|
-
let err = PolarsError::
|
368
|
+
let err = PolarsError::SchemaMismatch("duplicate column names found".into());
|
361
369
|
Err(RbPolarsErr::from(err))?;
|
362
370
|
}
|
363
371
|
|
@@ -393,7 +401,7 @@ impl RbDataFrame {
|
|
393
401
|
let null = null_value.unwrap_or_default();
|
394
402
|
|
395
403
|
if let Ok(s) = rb_f.try_convert::<String>() {
|
396
|
-
let f = std::fs::File::create(
|
404
|
+
let f = std::fs::File::create(s).unwrap();
|
397
405
|
// no need for a buffered writer, because the csv writer does internal buffering
|
398
406
|
CsvWriter::new(f)
|
399
407
|
.has_header(has_header)
|
@@ -435,7 +443,7 @@ impl RbDataFrame {
|
|
435
443
|
compression: Wrap<Option<IpcCompression>>,
|
436
444
|
) -> RbResult<()> {
|
437
445
|
if let Ok(s) = rb_f.try_convert::<String>() {
|
438
|
-
let f = std::fs::File::create(
|
446
|
+
let f = std::fs::File::create(s).unwrap();
|
439
447
|
IpcWriter::new(f)
|
440
448
|
.with_compression(compression.0)
|
441
449
|
.finish(&mut self.df.borrow_mut())
|
@@ -493,6 +501,25 @@ impl RbDataFrame {
|
|
493
501
|
.into()
|
494
502
|
}
|
495
503
|
|
504
|
+
pub fn to_numo(&self) -> Option<Value> {
|
505
|
+
let mut st = None;
|
506
|
+
for s in self.df.borrow().iter() {
|
507
|
+
let dt_i = s.dtype();
|
508
|
+
match st {
|
509
|
+
None => st = Some(dt_i.clone()),
|
510
|
+
Some(ref mut st) => {
|
511
|
+
*st = try_get_supertype(st, dt_i).ok()?;
|
512
|
+
}
|
513
|
+
}
|
514
|
+
}
|
515
|
+
let st = st?;
|
516
|
+
|
517
|
+
match st {
|
518
|
+
// TODO
|
519
|
+
_ => None,
|
520
|
+
}
|
521
|
+
}
|
522
|
+
|
496
523
|
pub fn write_parquet(
|
497
524
|
&self,
|
498
525
|
rb_f: Value,
|
@@ -504,7 +531,7 @@ impl RbDataFrame {
|
|
504
531
|
let compression = parse_parquet_compression(&compression, compression_level)?;
|
505
532
|
|
506
533
|
if let Ok(s) = rb_f.try_convert::<String>() {
|
507
|
-
let f = std::fs::File::create(
|
534
|
+
let f = std::fs::File::create(s).unwrap();
|
508
535
|
ParquetWriter::new(f)
|
509
536
|
.with_compression(compression)
|
510
537
|
.with_statistics(statistics)
|
@@ -607,7 +634,7 @@ impl RbDataFrame {
|
|
607
634
|
}
|
608
635
|
|
609
636
|
pub fn get_columns(&self) -> RArray {
|
610
|
-
let cols = self.df.borrow().get_columns().
|
637
|
+
let cols = self.df.borrow().get_columns().to_vec();
|
611
638
|
to_rbseries_collection(cols)
|
612
639
|
}
|
613
640
|
|
@@ -861,10 +888,11 @@ impl RbDataFrame {
|
|
861
888
|
variable_name: Option<String>,
|
862
889
|
) -> RbResult<Self> {
|
863
890
|
let args = MeltArgs {
|
864
|
-
id_vars,
|
865
|
-
value_vars,
|
866
|
-
value_name,
|
867
|
-
variable_name,
|
891
|
+
id_vars: strings_to_smartstrings(id_vars),
|
892
|
+
value_vars: strings_to_smartstrings(value_vars),
|
893
|
+
value_name: value_name.map(|s| s.into()),
|
894
|
+
variable_name: variable_name.map(|s| s.into()),
|
895
|
+
streamable: false,
|
868
896
|
};
|
869
897
|
|
870
898
|
let df = self.df.borrow().melt2(args).map_err(RbPolarsErr::from)?;
|
@@ -877,22 +905,26 @@ impl RbDataFrame {
|
|
877
905
|
values: Vec<String>,
|
878
906
|
index: Vec<String>,
|
879
907
|
columns: Vec<String>,
|
880
|
-
aggregate_expr: &RbExpr,
|
881
908
|
maintain_order: bool,
|
882
909
|
sort_columns: bool,
|
910
|
+
aggregate_expr: Option<&RbExpr>,
|
883
911
|
separator: Option<String>,
|
884
912
|
) -> RbResult<Self> {
|
885
913
|
let fun = match maintain_order {
|
886
914
|
true => pivot_stable,
|
887
915
|
false => pivot,
|
888
916
|
};
|
917
|
+
let agg_expr = match aggregate_expr {
|
918
|
+
Some(aggregate_expr) => Some(aggregate_expr.inner.clone()),
|
919
|
+
None => None,
|
920
|
+
};
|
889
921
|
let df = fun(
|
890
922
|
&self.df.borrow(),
|
891
923
|
values,
|
892
924
|
index,
|
893
925
|
columns,
|
894
|
-
aggregate_expr.inner.clone(),
|
895
926
|
sort_columns,
|
927
|
+
agg_expr,
|
896
928
|
separator.as_deref(),
|
897
929
|
)
|
898
930
|
.map_err(RbPolarsErr::from)?;
|
@@ -913,21 +945,6 @@ impl RbDataFrame {
|
|
913
945
|
self.df.borrow().shift(periods).into()
|
914
946
|
}
|
915
947
|
|
916
|
-
pub fn unique(
|
917
|
-
&self,
|
918
|
-
maintain_order: bool,
|
919
|
-
subset: Option<Vec<String>>,
|
920
|
-
keep: Wrap<UniqueKeepStrategy>,
|
921
|
-
) -> RbResult<Self> {
|
922
|
-
let subset = subset.as_ref().map(|v| v.as_ref());
|
923
|
-
let df = match maintain_order {
|
924
|
-
true => self.df.borrow().unique_stable(subset, keep.0),
|
925
|
-
false => self.df.borrow().unique(subset, keep.0),
|
926
|
-
}
|
927
|
-
.map_err(RbPolarsErr::from)?;
|
928
|
-
Ok(df.into())
|
929
|
-
}
|
930
|
-
|
931
948
|
pub fn lazy(&self) -> RbLazyFrame {
|
932
949
|
self.df.borrow().clone().lazy().into()
|
933
950
|
}
|
data/ext/polars/src/error.rs
CHANGED
@@ -43,3 +43,11 @@ impl ComputeError {
|
|
43
43
|
Error::new(exception::runtime_error(), message)
|
44
44
|
}
|
45
45
|
}
|
46
|
+
|
47
|
+
#[macro_export]
|
48
|
+
macro_rules! raise_err(
|
49
|
+
($msg:expr, $err:ident) => {{
|
50
|
+
Err(PolarsError::$err($msg.into())).map_err(RbPolarsErr::from)?;
|
51
|
+
unreachable!()
|
52
|
+
}}
|
53
|
+
);
|