polars-df 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/Cargo.lock +335 -310
- data/Cargo.toml +0 -1
- data/README.md +69 -2
- data/ext/polars/Cargo.toml +5 -3
- data/ext/polars/src/batched_csv.rs +29 -14
- data/ext/polars/src/conversion.rs +69 -16
- data/ext/polars/src/dataframe.rs +56 -39
- data/ext/polars/src/error.rs +8 -0
- data/ext/polars/src/lazy/dataframe.rs +48 -14
- data/ext/polars/src/lazy/dsl.rs +69 -4
- data/ext/polars/src/lib.rs +24 -5
- data/ext/polars/src/numo.rs +57 -0
- data/ext/polars/src/series.rs +57 -33
- data/lib/polars/batched_csv_reader.rb +1 -1
- data/lib/polars/binary_expr.rb +77 -0
- data/lib/polars/binary_name_space.rb +66 -0
- data/lib/polars/data_frame.rb +89 -43
- data/lib/polars/data_types.rb +4 -0
- data/lib/polars/date_time_expr.rb +6 -6
- data/lib/polars/expr.rb +9 -2
- data/lib/polars/group_by.rb +11 -0
- data/lib/polars/io.rb +73 -62
- data/lib/polars/lazy_frame.rb +103 -7
- data/lib/polars/lazy_functions.rb +3 -2
- data/lib/polars/list_expr.rb +2 -2
- data/lib/polars/list_name_space.rb +2 -2
- data/lib/polars/plot.rb +109 -0
- data/lib/polars/series.rb +50 -4
- data/lib/polars/string_expr.rb +1 -1
- data/lib/polars/utils.rb +10 -2
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +3 -0
- metadata +7 -3
data/Cargo.toml CHANGED
@@ -4,7 +4,6 @@ members = ["ext/polars"]
 [patch.crates-io]
 jsonpath_lib = { git = "https://github.com/ritchie46/jsonpath", rev = "24eaf0b4416edff38a4d1b6b17bc4b9f3f047b4b" }
 halfbrown = { git = "https://github.com/Licenser/halfbrown", rev = "952023c5dd6461b009bb5ba66b9aa979bd75949f" }
-arrow2 = { git = "https://github.com/ankane/arrow2", rev = "ef0270922a217070ba9942567c0ff3263ae8c531" }
 
 [profile.release]
 strip = true
data/README.md CHANGED
@@ -50,6 +50,9 @@ From Parquet
 
 ```ruby
 Polars.read_parquet("file.parquet")
+
+# or lazily with
+Polars.scan_parquet("file.parquet")
 ```
 
 From Active Record
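For context, `scan_parquet` is the lazy counterpart of `read_parquet`: it returns a `LazyFrame` and nothing is read until the query is collected. A minimal sketch of that pattern (file and column names are placeholders):

```ruby
lf = Polars.scan_parquet("file.parquet")   # returns a LazyFrame; nothing is read yet
df = lf
  .filter(Polars.col("a") > 1)             # placeholder predicate, pushed down to the scan
  .select(["a", "b"])                      # placeholder projection
  .collect                                 # collect runs the query and returns a DataFrame
```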
@@ -60,6 +63,32 @@ Polars.read_sql(User.all)
 Polars.read_sql("SELECT * FROM users")
 ```
 
+From JSON
+
+```ruby
+Polars.read_json("file.json")
+# or
+Polars.read_ndjson("file.ndjson")
+
+# or lazily with
+Polars.scan_ndjson("file.ndjson")
+```
+
+From Feather / Arrow IPC
+
+```ruby
+Polars.read_ipc("file.arrow")
+
+# or lazily with
+Polars.scan_ipc("file.arrow")
+```
+
+From Avro
+
+```ruby
+Polars.read_avro("file.avro")
+```
+
 From a hash
 
 ```ruby
@@ -282,10 +311,10 @@ df.to_dummies
 
 ## Conversion
 
-Array of …
+Array of hashes
 
 ```ruby
-df.rows
+df.rows(named: true)
 ```
 
 Hash of series
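For context, `rows(named: true)` returns hashes keyed by column name instead of positional arrays; a small sketch with made-up data:

```ruby
df = Polars::DataFrame.new({"a" => [1, 2], "b" => ["one", "two"]})
df.rows               # => [[1, "one"], [2, "two"]]
df.rows(named: true)  # => [{"a" => 1, "b" => "one"}, {"a" => 2, "b" => "two"}]
```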
@@ -308,6 +337,12 @@ Parquet
 df.write_parquet("file.parquet")
 ```
 
+Numo array
+
+```ruby
+df.to_numo
+```
+
 ## Types
 
 You can specify column types when creating a data frame
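`to_numo` relies on the numo-narray gem being available. A rough sketch with invented data; the exact return classes are an assumption (columns should be upcast to a common supertype, Float64 here):

```ruby
require "numo/narray"

df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [4.0, 5.0, 6.0]})
df.to_numo        # expected: a 3x2 Numo array (likely Numo::DFloat, the common supertype)
df["a"].to_numo   # expected: a 1-D integer Numo array for the single column
```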
@@ -343,6 +378,38 @@ Cast a column
 df["a"].cast(Polars::Int32)
 ```
 
+## Visualization
+
+Add [Vega](https://github.com/ankane/vega-ruby) to your application’s Gemfile:
+
+```ruby
+gem "vega"
+```
+
+And use:
+
+```ruby
+df.plot("a", "b")
+```
+
+Specify the chart type (`line`, `pie`, `column`, `bar`, `area`, or `scatter`)
+
+```ruby
+df.plot("a", "b", type: "pie")
+```
+
+Group data
+
+```ruby
+df.groupby("c").plot("a", "b")
+```
+
+Stacked columns or bars
+
+```ruby
+df.groupby("c").plot("a", "b", stacked: true)
+```
+
 ## History
 
 View the [changelog](CHANGELOG.md)
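Putting the plotting pieces together, a small end-to-end sketch (the data, column names, and chart options are invented for illustration):

```ruby
require "vega"

df = Polars::DataFrame.new({
  "month"    => ["Jan", "Jan", "Feb", "Feb"],
  "category" => ["a", "b", "a", "b"],
  "total"    => [10, 20, 15, 25]
})

# one column per month, split by category and stacked; returns a Vega chart object
df.groupby("category").plot("month", "total", type: "column", stacked: true)
```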
data/ext/polars/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
 [package]
 name = "polars"
-version = "0.…
+version = "0.4.0"
 license = "MIT"
 authors = ["Andrew Kane <andrew@ankane.org>"]
 edition = "2021"
@@ -12,11 +12,12 @@ crate-type = ["cdylib"]
 [dependencies]
 ahash = "0.8"
 magnus = "0.5"
-polars-core = "0.…
+polars-core = "0.28.0"
 serde_json = "1"
+smartstring = "1"
 
 [dependencies.polars]
-version = "0.…
+version = "0.28.0"
 features = [
     "abs",
     "arange",
@@ -44,6 +45,7 @@ features = [
     "ipc",
     "is_first",
     "is_in",
+    "is_unique",
     "json",
     "lazy",
     "lazy_regex",
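The new `is_unique` crate feature suggests the matching Series helper is now compiled in; a hypothetical sketch, assuming it is exposed as `Series#is_unique` as in the Python API:

```ruby
s = Polars::Series.new("a", [1, 2, 2, 3])
s.is_unique  # hypothetical: a boolean Series, [true, false, false, true]
```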
data/ext/polars/src/batched_csv.rs CHANGED
@@ -7,11 +7,17 @@ use std::cell::RefCell;
 use std::path::PathBuf;
 
 use crate::conversion::*;
+use crate::prelude::read_impl::OwnedBatchedCsvReaderMmap;
 use crate::{RbDataFrame, RbPolarsErr, RbResult};
 
+pub enum BatchedReader {
+    MMap(OwnedBatchedCsvReaderMmap),
+    Read(OwnedBatchedCsvReader),
+}
+
 #[magnus::wrap(class = "Polars::RbBatchedCsv")]
 pub struct RbBatchedCsv {
-    pub reader: RefCell<…
+    pub reader: RefCell<BatchedReader>,
 }
 
 impl RbBatchedCsv {
@@ -38,7 +44,7 @@ impl RbBatchedCsv {
         let comment_char: Option<String> = arguments[16].try_convert()?;
         let quote_char: Option<String> = arguments[17].try_convert()?;
         let null_values: Option<Wrap<NullValues>> = arguments[18].try_convert()?;
-        let …
+        let try_parse_dates: bool = arguments[19].try_convert()?;
         let skip_rows_after_header: usize = arguments[20].try_convert()?;
         let row_count: Option<(String, IdxSize)> = arguments[21].try_convert()?;
         let sample_size: usize = arguments[22].try_convert()?;
@@ -95,14 +101,24 @@ impl RbBatchedCsv {
             .low_memory(low_memory)
             .with_comment_char(comment_char)
             .with_null_values(null_values)
-            .…
+            .with_try_parse_dates(try_parse_dates)
             .with_quote_char(quote_char)
             .with_end_of_line_char(eol_char)
             .with_skip_rows_after_header(skip_rows_after_header)
             .with_row_count(row_count)
-            .sample_size(sample_size)
-…
-…
+            .sample_size(sample_size);
+
+        let reader = if low_memory {
+            let reader = reader
+                .batched_read(overwrite_dtype.map(Arc::new))
+                .map_err(RbPolarsErr::from)?;
+            BatchedReader::Read(reader)
+        } else {
+            let reader = reader
+                .batched_mmap(overwrite_dtype.map(Arc::new))
+                .map_err(RbPolarsErr::from)?;
+            BatchedReader::MMap(reader)
+        };
 
         Ok(RbBatchedCsv {
             reader: RefCell::new(reader),
@@ -110,13 +126,12 @@ impl RbBatchedCsv {
     }
 
     pub fn next_batches(&self, n: usize) -> RbResult<Option<RArray>> {
-        let batches = self
-            .…
-            .…
-…
-…
-…
-…
-        }))
+        let batches = match &mut *self.reader.borrow_mut() {
+            BatchedReader::MMap(reader) => reader.next_batches(n),
+            BatchedReader::Read(reader) => reader.next_batches(n),
+        }
+        .map_err(RbPolarsErr::from)?;
+
+        Ok(batches.map(|batches| RArray::from_iter(batches.into_iter().map(RbDataFrame::from))))
     }
 }
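From Ruby, this enum stays hidden behind the batched CSV reader. A rough usage sketch (the file name and `process` are placeholders, and the nil-at-end behavior is assumed to mirror the Python API):

```ruby
reader = Polars.read_csv_batched("data.csv")
while (batches = reader.next_batches(5))   # assumed to return nil once the file is exhausted
  batches.each do |df|
    process(df)                            # each batch arrives as a Polars::DataFrame
  end
end
```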
data/ext/polars/src/conversion.rs CHANGED
@@ -1,3 +1,6 @@
+use std::fmt::{Display, Formatter};
+use std::hash::{Hash, Hasher};
+
 use magnus::{
     class, exception, r_hash::ForEach, ruby_handle::RubyHandle, Integer, IntoValue, Module, RArray,
     RFloat, RHash, RString, Symbol, TryConvert, Value, QNIL,
@@ -10,8 +13,7 @@ use polars::frame::NullStrategy;
 use polars::io::avro::AvroCompression;
 use polars::prelude::*;
 use polars::series::ops::NullBehavior;
-use …
-use std::hash::{Hash, Hasher};
+use smartstring::alias::String as SmartString;
 
 use crate::{RbDataFrame, RbLazyFrame, RbPolarsErr, RbResult, RbSeries, RbValueError};
 
@@ -82,6 +84,22 @@ impl TryConvert for Wrap<Utf8Chunked> {
     }
 }
 
+impl TryConvert for Wrap<BinaryChunked> {
+    fn try_convert(obj: Value) -> RbResult<Self> {
+        let (seq, len) = get_rbseq(obj)?;
+        let mut builder = BinaryChunkedBuilder::new("", len, len * 25);
+
+        for res in seq.each() {
+            let item = res?;
+            match item.try_convert::<RString>() {
+                Ok(val) => builder.append_value(unsafe { val.as_slice() }),
+                Err(_) => builder.append_null(),
+            }
+        }
+        Ok(Wrap(builder.finish()))
+    }
+}
+
 impl TryConvert for Wrap<NullValues> {
     fn try_convert(ob: Value) -> RbResult<Self> {
         if let Ok(s) = ob.try_convert::<String>() {
@@ -98,6 +116,14 @@ impl TryConvert for Wrap<NullValues> {
     }
 }
 
+fn struct_dict<'a>(vals: impl Iterator<Item = AnyValue<'a>>, flds: &[Field]) -> Value {
+    let dict = RHash::new();
+    for (fld, val) in flds.iter().zip(vals) {
+        dict.aset(fld.name().as_str(), Wrap(val)).unwrap()
+    }
+    dict.into_value()
+}
+
 impl IntoValue for Wrap<AnyValue<'_>> {
     fn into_value_with(self, _: &RubyHandle) -> Value {
         match self.0 {
@@ -114,7 +140,7 @@ impl IntoValue for Wrap<AnyValue<'_>> {
             AnyValue::Null => *QNIL,
             AnyValue::Boolean(v) => Value::from(v),
             AnyValue::Utf8(v) => Value::from(v),
-            AnyValue::Utf8Owned(…
+            AnyValue::Utf8Owned(v) => Value::from(v.as_str()),
             AnyValue::Categorical(_idx, _rev, _arr) => todo!(),
             AnyValue::Date(v) => class::time()
                 .funcall::<_, _, Value>("at", (v * 86400,))
@@ -125,7 +151,13 @@ impl IntoValue for Wrap<AnyValue<'_>> {
                 .unwrap(),
             AnyValue::Datetime(v, tu, tz) => {
                 let t = match tu {
-                    TimeUnit::Nanoseconds => …
+                    TimeUnit::Nanoseconds => {
+                        let sec = v / 1000000000;
+                        let subsec = v % 1000000000;
+                        class::time()
+                            .funcall::<_, _, Value>("at", (sec, subsec, Symbol::new("nsec")))
+                            .unwrap()
+                    }
                     TimeUnit::Microseconds => {
                         let sec = v / 1000000;
                         let subsec = v % 1000000;
@@ -133,7 +165,13 @@ impl IntoValue for Wrap<AnyValue<'_>> {
                             .funcall::<_, _, Value>("at", (sec, subsec, Symbol::new("usec")))
                            .unwrap()
                     }
-                    TimeUnit::Milliseconds => …
+                    TimeUnit::Milliseconds => {
+                        let sec = v / 1000;
+                        let subsec = v % 1000;
+                        class::time()
+                            .funcall::<_, _, Value>("at", (sec, subsec, Symbol::new("millisecond")))
+                            .unwrap()
+                    }
                 };
 
                 if tz.is_some() {
@@ -145,12 +183,19 @@ impl IntoValue for Wrap<AnyValue<'_>> {
             AnyValue::Duration(_v, _tu) => todo!(),
             AnyValue::Time(_v) => todo!(),
             AnyValue::List(v) => RbSeries::new(v).to_a().into_value(),
-            ref …
-            AnyValue::StructOwned(…
-            AnyValue::Object(…
-…
-…
-…
+            ref av @ AnyValue::Struct(_, _, flds) => struct_dict(av._iter_struct_av(), flds),
+            AnyValue::StructOwned(payload) => struct_dict(payload.0.into_iter(), &payload.1),
+            AnyValue::Object(v) => {
+                let object = v.as_any().downcast_ref::<ObjectValue>().unwrap();
+                object.inner
+            }
+            AnyValue::ObjectOwned(v) => {
+                let object = v.0.as_any().downcast_ref::<ObjectValue>().unwrap();
+                object.inner
+            }
+            AnyValue::Binary(v) => RString::from_slice(v).into_value(),
+            AnyValue::BinaryOwned(v) => RString::from_slice(&v).into_value(),
+            AnyValue::Decimal(_v, _scale) => todo!(),
         }
     }
 }
@@ -170,12 +215,12 @@ impl IntoValue for Wrap<DataType> {
             DataType::UInt64 => pl.const_get::<_, Value>("UInt64").unwrap(),
             DataType::Float32 => pl.const_get::<_, Value>("Float32").unwrap(),
             DataType::Float64 => pl.const_get::<_, Value>("Float64").unwrap(),
-            DataType::…
+            DataType::Decimal(_precision, _scale) => todo!(),
             DataType::Boolean => pl.const_get::<_, Value>("Boolean").unwrap(),
             DataType::Utf8 => pl.const_get::<_, Value>("Utf8").unwrap(),
             DataType::Binary => pl.const_get::<_, Value>("Binary").unwrap(),
             DataType::List(inner) => {
-                let inner = Wrap(*inner…
+                let inner = Wrap(*inner);
                 let list_class = pl.const_get::<_, Value>("List").unwrap();
                 list_class.funcall::<_, _, Value>("new", (inner,)).unwrap()
             }
@@ -183,7 +228,7 @@ impl IntoValue for Wrap<DataType> {
             DataType::Datetime(tu, tz) => {
                 let datetime_class = pl.const_get::<_, Value>("Datetime").unwrap();
                 datetime_class
-                    .funcall::<_, _, Value>("new", (tu.to_ascii(), tz…
+                    .funcall::<_, _, Value>("new", (tu.to_ascii(), tz))
                     .unwrap()
             }
             DataType::Duration(tu) => {
@@ -198,7 +243,7 @@ impl IntoValue for Wrap<DataType> {
             DataType::Struct(fields) => {
                 let field_class = pl.const_get::<_, Value>("Field").unwrap();
                 let iter = fields.iter().map(|fld| {
-                    let name = fld.name().…
+                    let name = fld.name().as_str();
                     let dtype = Wrap(fld.data_type().clone());
                     field_class
                         .funcall::<_, _, Value>("new", (name, dtype))
@@ -328,7 +373,7 @@ impl<'s> TryConvert for Wrap<AnyValue<'s>> {
             let n = 25;
             let dtype = any_values_to_dtype(&avs[..std::cmp::min(avs.len(), n)])
                 .map_err(RbPolarsErr::from)?;
-            let s = Series::from_any_values_and_dtype("", &avs, &dtype)
+            let s = Series::from_any_values_and_dtype("", &avs, &dtype, true)
                 .map_err(RbPolarsErr::from)?;
             Ok(Wrap(AnyValue::List(s)))
         }
@@ -858,3 +903,11 @@ pub fn parse_parquet_compression(
     };
     Ok(parsed)
 }
+
+pub(crate) fn strings_to_smartstrings<I, S>(container: I) -> Vec<SmartString>
+where
+    I: IntoIterator<Item = S>,
+    S: AsRef<str>,
+{
+    container.into_iter().map(|s| s.as_ref().into()).collect()
+}
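A quick Ruby check of the timestamp arithmetic in the new `Datetime` branches above: the raw integer is split into whole seconds and a sub-second remainder, then handed to `Time.at` with the matching unit (the value below is arbitrary):

```ruby
v = 1_500_000_123                      # an arbitrary nanosecond timestamp
sec, subsec = v.divmod(1_000_000_000)  # same split as v / 1_000_000_000 and v % 1_000_000_000
Time.at(sec, subsec, :nsec).utc        # => 1970-01-01 00:00:01.500000123 UTC
```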
data/ext/polars/src/dataframe.rs CHANGED
@@ -6,6 +6,7 @@ use polars::io::mmap::ReaderBytes;
 use polars::io::RowCount;
 use polars::prelude::pivot::{pivot, pivot_stable};
 use polars::prelude::*;
+use polars_core::utils::try_get_supertype;
 use std::cell::RefCell;
 use std::io::{BufWriter, Cursor};
 use std::ops::Deref;
@@ -114,7 +115,7 @@ impl RbDataFrame {
         let comment_char: Option<String> = arguments[17].try_convert()?;
         let quote_char: Option<String> = arguments[18].try_convert()?;
         let null_values: Option<Wrap<NullValues>> = arguments[19].try_convert()?;
-        let …
+        let try_parse_dates: bool = arguments[20].try_convert()?;
         let skip_rows_after_header: usize = arguments[21].try_convert()?;
         let row_count: Option<(String, IdxSize)> = arguments[22].try_convert()?;
         let sample_size: usize = arguments[23].try_convert()?;
@@ -167,12 +168,12 @@ impl RbDataFrame {
             .with_columns(columns)
             .with_n_threads(n_threads)
             .with_path(path)
-            .with_dtypes(overwrite_dtype.…
+            .with_dtypes(overwrite_dtype.map(Arc::new))
             .with_dtypes_slice(overwrite_dtype_slice.as_deref())
             .low_memory(low_memory)
             .with_comment_char(comment_char)
             .with_null_values(null_values)
-            .…
+            .with_try_parse_dates(try_parse_dates)
             .with_quote_char(quote_char)
             .with_end_of_line_char(eol_char)
             .with_skip_rows_after_header(skip_rows_after_header)
@@ -183,6 +184,7 @@ impl RbDataFrame {
         Ok(df.into())
     }
 
+    #[allow(clippy::too_many_arguments)]
     pub fn read_parquet(
         rb_f: Value,
         columns: Option<Vec<String>>,
@@ -191,6 +193,8 @@ impl RbDataFrame {
         parallel: Wrap<ParallelStrategy>,
         row_count: Option<(String, IdxSize)>,
         low_memory: bool,
+        use_statistics: bool,
+        rechunk: bool,
     ) -> RbResult<Self> {
         let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
         let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
@@ -201,6 +205,8 @@ impl RbDataFrame {
             .with_n_rows(n_rows)
             .with_row_count(row_count)
             .set_low_memory(low_memory)
+            .use_statistics(use_statistics)
+            .set_rechunk(rechunk)
             .finish()
             .map_err(RbPolarsErr::from)?;
         Ok(RbDataFrame::new(df))
@@ -253,7 +259,7 @@ impl RbDataFrame {
         use polars::io::avro::AvroWriter;
 
         if let Ok(s) = rb_f.try_convert::<String>() {
-            let f = std::fs::File::create(…
+            let f = std::fs::File::create(s).unwrap();
             AvroWriter::new(f)
                 .with_compression(compression.0)
                 .finish(&mut self.df.borrow_mut())
@@ -338,7 +344,7 @@ impl RbDataFrame {
         // ensure the new names are used
         if let Some(schema) = &schema_overwrite {
             for (new_name, name) in schema.0.iter_names().zip(names.iter_mut()) {
-                *name = new_name.…
+                *name = new_name.to_string();
             }
         }
         let rbdf = Self::finish_from_rows(
@@ -347,17 +353,19 @@ impl RbDataFrame {
             schema_overwrite.map(|wrap| wrap.0),
         )?;
 
-…
-            .…
-…
-…
-…
-…
-…
-…
+        unsafe {
+            rbdf.df
+                .borrow_mut()
+                .get_columns_mut()
+                .iter_mut()
+                .zip(&names)
+                .for_each(|(s, name)| {
+                    s.rename(name);
+                });
+        }
         let length = names.len();
         if names.into_iter().collect::<PlHashSet<_>>().len() != length {
-            let err = PolarsError::…
+            let err = PolarsError::SchemaMismatch("duplicate column names found".into());
             Err(RbPolarsErr::from(err))?;
         }
 
@@ -393,7 +401,7 @@ impl RbDataFrame {
         let null = null_value.unwrap_or_default();
 
         if let Ok(s) = rb_f.try_convert::<String>() {
-            let f = std::fs::File::create(…
+            let f = std::fs::File::create(s).unwrap();
             // no need for a buffered writer, because the csv writer does internal buffering
             CsvWriter::new(f)
                 .has_header(has_header)
@@ -435,7 +443,7 @@ impl RbDataFrame {
         compression: Wrap<Option<IpcCompression>>,
     ) -> RbResult<()> {
         if let Ok(s) = rb_f.try_convert::<String>() {
-            let f = std::fs::File::create(…
+            let f = std::fs::File::create(s).unwrap();
             IpcWriter::new(f)
                 .with_compression(compression.0)
                 .finish(&mut self.df.borrow_mut())
@@ -493,6 +501,25 @@ impl RbDataFrame {
             .into()
     }
 
+    pub fn to_numo(&self) -> Option<Value> {
+        let mut st = None;
+        for s in self.df.borrow().iter() {
+            let dt_i = s.dtype();
+            match st {
+                None => st = Some(dt_i.clone()),
+                Some(ref mut st) => {
+                    *st = try_get_supertype(st, dt_i).ok()?;
+                }
+            }
+        }
+        let st = st?;
+
+        match st {
+            // TODO
+            _ => None,
+        }
+    }
+
     pub fn write_parquet(
         &self,
         rb_f: Value,
@@ -504,7 +531,7 @@ impl RbDataFrame {
         let compression = parse_parquet_compression(&compression, compression_level)?;
 
         if let Ok(s) = rb_f.try_convert::<String>() {
-            let f = std::fs::File::create(…
+            let f = std::fs::File::create(s).unwrap();
             ParquetWriter::new(f)
                 .with_compression(compression)
                 .with_statistics(statistics)
@@ -607,7 +634,7 @@ impl RbDataFrame {
     }
 
     pub fn get_columns(&self) -> RArray {
-        let cols = self.df.borrow().get_columns().…
+        let cols = self.df.borrow().get_columns().to_vec();
         to_rbseries_collection(cols)
     }
 
@@ -861,10 +888,11 @@ impl RbDataFrame {
         variable_name: Option<String>,
     ) -> RbResult<Self> {
         let args = MeltArgs {
-            id_vars,
-            value_vars,
-            value_name,
-            variable_name,
+            id_vars: strings_to_smartstrings(id_vars),
+            value_vars: strings_to_smartstrings(value_vars),
+            value_name: value_name.map(|s| s.into()),
+            variable_name: variable_name.map(|s| s.into()),
+            streamable: false,
         };
 
         let df = self.df.borrow().melt2(args).map_err(RbPolarsErr::from)?;
@@ -877,22 +905,26 @@ impl RbDataFrame {
         values: Vec<String>,
         index: Vec<String>,
         columns: Vec<String>,
-        aggregate_expr: &RbExpr,
         maintain_order: bool,
         sort_columns: bool,
+        aggregate_expr: Option<&RbExpr>,
         separator: Option<String>,
     ) -> RbResult<Self> {
         let fun = match maintain_order {
             true => pivot_stable,
             false => pivot,
         };
+        let agg_expr = match aggregate_expr {
+            Some(aggregate_expr) => Some(aggregate_expr.inner.clone()),
+            None => None,
+        };
         let df = fun(
             &self.df.borrow(),
             values,
             index,
             columns,
-            aggregate_expr.inner.clone(),
             sort_columns,
+            agg_expr,
             separator.as_deref(),
         )
         .map_err(RbPolarsErr::from)?;
@@ -913,21 +945,6 @@ impl RbDataFrame {
         self.df.borrow().shift(periods).into()
     }
 
-    pub fn unique(
-        &self,
-        maintain_order: bool,
-        subset: Option<Vec<String>>,
-        keep: Wrap<UniqueKeepStrategy>,
-    ) -> RbResult<Self> {
-        let subset = subset.as_ref().map(|v| v.as_ref());
-        let df = match maintain_order {
-            true => self.df.borrow().unique_stable(subset, keep.0),
-            false => self.df.borrow().unique(subset, keep.0),
-        }
-        .map_err(RbPolarsErr::from)?;
-        Ok(df.into())
-    }
-
     pub fn lazy(&self) -> RbLazyFrame {
         self.df.borrow().clone().lazy().into()
     }
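The melt change above is internal (column names now travel as SmartStrings), so the Ruby call should be unchanged; a small sketch, assuming the keyword arguments mirror the Python API:

```ruby
df = Polars::DataFrame.new({
  "id" => [1, 2],
  "a"  => [3, 4],
  "b"  => [5, 6]
})
df.melt(id_vars: "id", value_vars: ["a", "b"])
# expected: a long frame with columns id, variable, value (one row per id/column pair)
```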
data/ext/polars/src/error.rs CHANGED
@@ -43,3 +43,11 @@ impl ComputeError {
         Error::new(exception::runtime_error(), message)
     }
 }
+
+#[macro_export]
+macro_rules! raise_err(
+    ($msg:expr, $err:ident) => {{
+        Err(PolarsError::$err($msg.into())).map_err(RbPolarsErr::from)?;
+        unreachable!()
+    }}
+);