polars-df 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +3 -0
- data/CHANGELOG.md +8 -0
- data/Cargo.lock +2 -1
- data/README.md +1 -1
- data/ext/polars/Cargo.toml +7 -1
- data/ext/polars/src/batched_csv.rs +120 -0
- data/ext/polars/src/conversion.rs +139 -6
- data/ext/polars/src/dataframe.rs +360 -15
- data/ext/polars/src/error.rs +9 -0
- data/ext/polars/src/file.rs +8 -7
- data/ext/polars/src/lazy/apply.rs +7 -0
- data/ext/polars/src/lazy/dataframe.rs +135 -3
- data/ext/polars/src/lazy/dsl.rs +97 -2
- data/ext/polars/src/lazy/meta.rs +1 -1
- data/ext/polars/src/lazy/mod.rs +1 -0
- data/ext/polars/src/lib.rs +227 -12
- data/ext/polars/src/series.rs +190 -38
- data/ext/polars/src/set.rs +91 -0
- data/ext/polars/src/utils.rs +19 -0
- data/lib/polars/batched_csv_reader.rb +96 -0
- data/lib/polars/cat_expr.rb +39 -0
- data/lib/polars/data_frame.rb +2813 -100
- data/lib/polars/date_time_expr.rb +1282 -7
- data/lib/polars/exceptions.rb +20 -0
- data/lib/polars/expr.rb +631 -11
- data/lib/polars/expr_dispatch.rb +14 -0
- data/lib/polars/functions.rb +219 -0
- data/lib/polars/group_by.rb +517 -0
- data/lib/polars/io.rb +763 -4
- data/lib/polars/lazy_frame.rb +1415 -67
- data/lib/polars/lazy_functions.rb +430 -9
- data/lib/polars/lazy_group_by.rb +79 -0
- data/lib/polars/list_expr.rb +5 -0
- data/lib/polars/meta_expr.rb +21 -0
- data/lib/polars/series.rb +2244 -192
- data/lib/polars/slice.rb +104 -0
- data/lib/polars/string_expr.rb +663 -2
- data/lib/polars/struct_expr.rb +73 -0
- data/lib/polars/utils.rb +76 -3
- data/lib/polars/version.rb +2 -1
- data/lib/polars/when.rb +1 -0
- data/lib/polars/when_then.rb +1 -0
- data/lib/polars.rb +8 -2
- metadata +12 -2
data/ext/polars/src/dataframe.rs
CHANGED
@@ -1,15 +1,14 @@
|
|
1
|
-
use magnus::{r_hash::ForEach,
|
1
|
+
use magnus::{r_hash::ForEach, RArray, RHash, RString, Value};
|
2
2
|
use polars::io::mmap::ReaderBytes;
|
3
|
+
use polars::io::RowCount;
|
3
4
|
use polars::prelude::*;
|
4
5
|
use std::cell::RefCell;
|
5
|
-
use std::
|
6
|
-
use std::io::{BufReader, BufWriter, Cursor};
|
6
|
+
use std::io::{BufWriter, Cursor};
|
7
7
|
use std::ops::Deref;
|
8
|
-
use std::path::PathBuf;
|
9
8
|
|
10
9
|
use crate::conversion::*;
|
11
10
|
use crate::file::{get_file_like, get_mmap_bytes_reader};
|
12
|
-
use crate::series::to_rbseries_collection;
|
11
|
+
use crate::series::{to_rbseries_collection, to_series_collection};
|
13
12
|
use crate::{series, RbLazyFrame, RbPolarsErr, RbResult, RbSeries};
|
14
13
|
|
15
14
|
#[magnus::wrap(class = "Polars::RbDataFrame")]
|
@@ -43,22 +42,141 @@ impl RbDataFrame {
|
|
43
42
|
self.df.borrow().estimated_size()
|
44
43
|
}
|
45
44
|
|
46
|
-
pub fn read_csv(
|
45
|
+
pub fn read_csv(arguments: &[Value]) -> RbResult<Self> {
|
46
|
+
// start arguments
|
47
|
+
// this pattern is needed for more than 16
|
48
|
+
let rb_f: Value = arguments[0].try_convert()?;
|
49
|
+
let infer_schema_length: Option<usize> = arguments[1].try_convert()?;
|
50
|
+
let chunk_size: usize = arguments[2].try_convert()?;
|
51
|
+
let has_header: bool = arguments[3].try_convert()?;
|
52
|
+
let ignore_errors: bool = arguments[4].try_convert()?;
|
53
|
+
let n_rows: Option<usize> = arguments[5].try_convert()?;
|
54
|
+
let skip_rows: usize = arguments[6].try_convert()?;
|
55
|
+
let projection: Option<Vec<usize>> = arguments[7].try_convert()?;
|
56
|
+
let sep: String = arguments[8].try_convert()?;
|
57
|
+
let rechunk: bool = arguments[9].try_convert()?;
|
58
|
+
let columns: Option<Vec<String>> = arguments[10].try_convert()?;
|
59
|
+
let encoding: Wrap<CsvEncoding> = arguments[11].try_convert()?;
|
60
|
+
let n_threads: Option<usize> = arguments[12].try_convert()?;
|
61
|
+
let path: Option<String> = arguments[13].try_convert()?;
|
62
|
+
let overwrite_dtype: Option<Vec<(String, Wrap<DataType>)>> = arguments[14].try_convert()?;
|
63
|
+
// TODO fix
|
64
|
+
let overwrite_dtype_slice: Option<Vec<Wrap<DataType>>> = None; // arguments[15].try_convert()?;
|
65
|
+
let low_memory: bool = arguments[16].try_convert()?;
|
66
|
+
let comment_char: Option<String> = arguments[17].try_convert()?;
|
67
|
+
let quote_char: Option<String> = arguments[18].try_convert()?;
|
68
|
+
let null_values: Option<Wrap<NullValues>> = arguments[19].try_convert()?;
|
69
|
+
let parse_dates: bool = arguments[20].try_convert()?;
|
70
|
+
let skip_rows_after_header: usize = arguments[21].try_convert()?;
|
71
|
+
let row_count: Option<(String, IdxSize)> = arguments[22].try_convert()?;
|
72
|
+
let sample_size: usize = arguments[23].try_convert()?;
|
73
|
+
let eol_char: String = arguments[24].try_convert()?;
|
74
|
+
// end arguments
|
75
|
+
|
76
|
+
let null_values = null_values.map(|w| w.0);
|
77
|
+
let comment_char = comment_char.map(|s| s.as_bytes()[0]);
|
78
|
+
let eol_char = eol_char.as_bytes()[0];
|
79
|
+
|
80
|
+
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
81
|
+
|
82
|
+
let quote_char = if let Some(s) = quote_char {
|
83
|
+
if s.is_empty() {
|
84
|
+
None
|
85
|
+
} else {
|
86
|
+
Some(s.as_bytes()[0])
|
87
|
+
}
|
88
|
+
} else {
|
89
|
+
None
|
90
|
+
};
|
91
|
+
|
92
|
+
let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
|
93
|
+
let fields = overwrite_dtype.iter().map(|(name, dtype)| {
|
94
|
+
let dtype = dtype.0.clone();
|
95
|
+
Field::new(name, dtype)
|
96
|
+
});
|
97
|
+
Schema::from(fields)
|
98
|
+
});
|
99
|
+
|
100
|
+
let overwrite_dtype_slice = overwrite_dtype_slice.map(|overwrite_dtype| {
|
101
|
+
overwrite_dtype
|
102
|
+
.iter()
|
103
|
+
.map(|dt| dt.0.clone())
|
104
|
+
.collect::<Vec<_>>()
|
105
|
+
});
|
106
|
+
|
47
107
|
let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
|
48
108
|
let df = CsvReader::new(mmap_bytes_r)
|
109
|
+
.infer_schema(infer_schema_length)
|
49
110
|
.has_header(has_header)
|
111
|
+
.with_n_rows(n_rows)
|
112
|
+
.with_delimiter(sep.as_bytes()[0])
|
113
|
+
.with_skip_rows(skip_rows)
|
114
|
+
.with_ignore_parser_errors(ignore_errors)
|
115
|
+
.with_projection(projection)
|
116
|
+
.with_rechunk(rechunk)
|
117
|
+
.with_chunk_size(chunk_size)
|
118
|
+
.with_encoding(encoding.0)
|
119
|
+
.with_columns(columns)
|
120
|
+
.with_n_threads(n_threads)
|
121
|
+
.with_path(path)
|
122
|
+
.with_dtypes(overwrite_dtype.as_ref())
|
123
|
+
.with_dtypes_slice(overwrite_dtype_slice.as_deref())
|
124
|
+
.low_memory(low_memory)
|
125
|
+
.with_comment_char(comment_char)
|
126
|
+
.with_null_values(null_values)
|
127
|
+
.with_parse_dates(parse_dates)
|
128
|
+
.with_quote_char(quote_char)
|
129
|
+
.with_end_of_line_char(eol_char)
|
130
|
+
.with_skip_rows_after_header(skip_rows_after_header)
|
131
|
+
.with_row_count(row_count)
|
132
|
+
.sample_size(sample_size)
|
50
133
|
.finish()
|
51
134
|
.map_err(RbPolarsErr::from)?;
|
52
135
|
Ok(df.into())
|
53
136
|
}
|
54
137
|
|
55
|
-
pub fn read_parquet(
|
56
|
-
|
57
|
-
|
58
|
-
|
138
|
+
pub fn read_parquet(
|
139
|
+
rb_f: Value,
|
140
|
+
columns: Option<Vec<String>>,
|
141
|
+
projection: Option<Vec<usize>>,
|
142
|
+
n_rows: Option<usize>,
|
143
|
+
parallel: Wrap<ParallelStrategy>,
|
144
|
+
row_count: Option<(String, IdxSize)>,
|
145
|
+
low_memory: bool,
|
146
|
+
) -> RbResult<Self> {
|
147
|
+
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
148
|
+
let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
|
149
|
+
let df = ParquetReader::new(mmap_bytes_r)
|
150
|
+
.with_projection(projection)
|
151
|
+
.with_columns(columns)
|
152
|
+
.read_parallel(parallel.0)
|
153
|
+
.with_n_rows(n_rows)
|
154
|
+
.with_row_count(row_count)
|
155
|
+
.set_low_memory(low_memory)
|
59
156
|
.finish()
|
60
|
-
.map_err(RbPolarsErr::from)
|
61
|
-
|
157
|
+
.map_err(RbPolarsErr::from)?;
|
158
|
+
Ok(RbDataFrame::new(df))
|
159
|
+
}
|
160
|
+
|
161
|
+
pub fn read_ipc(
|
162
|
+
rb_f: Value,
|
163
|
+
columns: Option<Vec<String>>,
|
164
|
+
projection: Option<Vec<usize>>,
|
165
|
+
n_rows: Option<usize>,
|
166
|
+
row_count: Option<(String, IdxSize)>,
|
167
|
+
memory_map: bool,
|
168
|
+
) -> RbResult<Self> {
|
169
|
+
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
170
|
+
let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
|
171
|
+
let df = IpcReader::new(mmap_bytes_r)
|
172
|
+
.with_projection(projection)
|
173
|
+
.with_columns(columns)
|
174
|
+
.with_n_rows(n_rows)
|
175
|
+
.with_row_count(row_count)
|
176
|
+
.memory_mapped(memory_map)
|
177
|
+
.finish()
|
178
|
+
.map_err(RbPolarsErr::from)?;
|
179
|
+
Ok(RbDataFrame::new(df))
|
62
180
|
}
|
63
181
|
|
64
182
|
pub fn read_json(rb_f: Value) -> RbResult<Self> {
|
@@ -185,6 +303,77 @@ impl RbDataFrame {
|
|
185
303
|
Ok(())
|
186
304
|
}
|
187
305
|
|
306
|
+
pub fn write_ipc(
|
307
|
+
&self,
|
308
|
+
rb_f: Value,
|
309
|
+
compression: Wrap<Option<IpcCompression>>,
|
310
|
+
) -> RbResult<()> {
|
311
|
+
if let Ok(s) = rb_f.try_convert::<String>() {
|
312
|
+
let f = std::fs::File::create(&s).unwrap();
|
313
|
+
IpcWriter::new(f)
|
314
|
+
.with_compression(compression.0)
|
315
|
+
.finish(&mut self.df.borrow_mut())
|
316
|
+
.map_err(RbPolarsErr::from)?;
|
317
|
+
} else {
|
318
|
+
let mut buf = get_file_like(rb_f, true)?;
|
319
|
+
|
320
|
+
IpcWriter::new(&mut buf)
|
321
|
+
.with_compression(compression.0)
|
322
|
+
.finish(&mut self.df.borrow_mut())
|
323
|
+
.map_err(RbPolarsErr::from)?;
|
324
|
+
}
|
325
|
+
Ok(())
|
326
|
+
}
|
327
|
+
|
328
|
+
pub fn row_tuple(&self, idx: i64) -> Value {
|
329
|
+
let idx = if idx < 0 {
|
330
|
+
(self.df.borrow().height() as i64 + idx) as usize
|
331
|
+
} else {
|
332
|
+
idx as usize
|
333
|
+
};
|
334
|
+
RArray::from_vec(
|
335
|
+
self.df
|
336
|
+
.borrow()
|
337
|
+
.get_columns()
|
338
|
+
.iter()
|
339
|
+
.map(|s| match s.dtype() {
|
340
|
+
DataType::Object(_) => {
|
341
|
+
let obj: Option<&ObjectValue> = s.get_object(idx).map(|any| any.into());
|
342
|
+
obj.unwrap().to_object()
|
343
|
+
}
|
344
|
+
_ => Wrap(s.get(idx)).into(),
|
345
|
+
})
|
346
|
+
.collect(),
|
347
|
+
)
|
348
|
+
.into()
|
349
|
+
}
|
350
|
+
|
351
|
+
pub fn row_tuples(&self) -> Value {
|
352
|
+
let df = &self.df;
|
353
|
+
RArray::from_vec(
|
354
|
+
(0..df.borrow().height())
|
355
|
+
.map(|idx| {
|
356
|
+
RArray::from_vec(
|
357
|
+
self.df
|
358
|
+
.borrow()
|
359
|
+
.get_columns()
|
360
|
+
.iter()
|
361
|
+
.map(|s| match s.dtype() {
|
362
|
+
DataType::Object(_) => {
|
363
|
+
let obj: Option<&ObjectValue> =
|
364
|
+
s.get_object(idx).map(|any| any.into());
|
365
|
+
obj.unwrap().to_object()
|
366
|
+
}
|
367
|
+
_ => Wrap(s.get(idx)).into(),
|
368
|
+
})
|
369
|
+
.collect(),
|
370
|
+
)
|
371
|
+
})
|
372
|
+
.collect(),
|
373
|
+
)
|
374
|
+
.into()
|
375
|
+
}
|
376
|
+
|
188
377
|
pub fn write_parquet(
|
189
378
|
&self,
|
190
379
|
rb_f: Value,
|
@@ -210,6 +399,86 @@ impl RbDataFrame {
|
|
210
399
|
Ok(())
|
211
400
|
}
|
212
401
|
|
402
|
+
pub fn add(&self, s: &RbSeries) -> RbResult<Self> {
|
403
|
+
let df = (&*self.df.borrow() + &*s.series.borrow()).map_err(RbPolarsErr::from)?;
|
404
|
+
Ok(df.into())
|
405
|
+
}
|
406
|
+
|
407
|
+
pub fn sub(&self, s: &RbSeries) -> RbResult<Self> {
|
408
|
+
let df = (&*self.df.borrow() - &*s.series.borrow()).map_err(RbPolarsErr::from)?;
|
409
|
+
Ok(df.into())
|
410
|
+
}
|
411
|
+
|
412
|
+
pub fn div(&self, s: &RbSeries) -> RbResult<Self> {
|
413
|
+
let df = (&*self.df.borrow() / &*s.series.borrow()).map_err(RbPolarsErr::from)?;
|
414
|
+
Ok(df.into())
|
415
|
+
}
|
416
|
+
|
417
|
+
pub fn mul(&self, s: &RbSeries) -> RbResult<Self> {
|
418
|
+
let df = (&*self.df.borrow() * &*s.series.borrow()).map_err(RbPolarsErr::from)?;
|
419
|
+
Ok(df.into())
|
420
|
+
}
|
421
|
+
|
422
|
+
pub fn rem(&self, s: &RbSeries) -> RbResult<Self> {
|
423
|
+
let df = (&*self.df.borrow() % &*s.series.borrow()).map_err(RbPolarsErr::from)?;
|
424
|
+
Ok(df.into())
|
425
|
+
}
|
426
|
+
|
427
|
+
pub fn add_df(&self, s: &Self) -> RbResult<Self> {
|
428
|
+
let df = (&*self.df.borrow() + &*s.df.borrow()).map_err(RbPolarsErr::from)?;
|
429
|
+
Ok(df.into())
|
430
|
+
}
|
431
|
+
|
432
|
+
pub fn sub_df(&self, s: &Self) -> RbResult<Self> {
|
433
|
+
let df = (&*self.df.borrow() - &*s.df.borrow()).map_err(RbPolarsErr::from)?;
|
434
|
+
Ok(df.into())
|
435
|
+
}
|
436
|
+
|
437
|
+
pub fn div_df(&self, s: &Self) -> RbResult<Self> {
|
438
|
+
let df = (&*self.df.borrow() / &*s.df.borrow()).map_err(RbPolarsErr::from)?;
|
439
|
+
Ok(df.into())
|
440
|
+
}
|
441
|
+
|
442
|
+
pub fn mul_df(&self, s: &Self) -> RbResult<Self> {
|
443
|
+
let df = (&*self.df.borrow() * &*s.df.borrow()).map_err(RbPolarsErr::from)?;
|
444
|
+
Ok(df.into())
|
445
|
+
}
|
446
|
+
|
447
|
+
pub fn rem_df(&self, s: &Self) -> RbResult<Self> {
|
448
|
+
let df = (&*self.df.borrow() % &*s.df.borrow()).map_err(RbPolarsErr::from)?;
|
449
|
+
Ok(df.into())
|
450
|
+
}
|
451
|
+
|
452
|
+
pub fn sample_n(
|
453
|
+
&self,
|
454
|
+
n: usize,
|
455
|
+
with_replacement: bool,
|
456
|
+
shuffle: bool,
|
457
|
+
seed: Option<u64>,
|
458
|
+
) -> RbResult<Self> {
|
459
|
+
let df = self
|
460
|
+
.df
|
461
|
+
.borrow()
|
462
|
+
.sample_n(n, with_replacement, shuffle, seed)
|
463
|
+
.map_err(RbPolarsErr::from)?;
|
464
|
+
Ok(df.into())
|
465
|
+
}
|
466
|
+
|
467
|
+
pub fn sample_frac(
|
468
|
+
&self,
|
469
|
+
frac: f64,
|
470
|
+
with_replacement: bool,
|
471
|
+
shuffle: bool,
|
472
|
+
seed: Option<u64>,
|
473
|
+
) -> RbResult<Self> {
|
474
|
+
let df = self
|
475
|
+
.df
|
476
|
+
.borrow()
|
477
|
+
.sample_frac(frac, with_replacement, shuffle, seed)
|
478
|
+
.map_err(RbPolarsErr::from)?;
|
479
|
+
Ok(df.into())
|
480
|
+
}
|
481
|
+
|
213
482
|
pub fn rechunk(&self) -> Self {
|
214
483
|
self.df.borrow().agg_chunks().into()
|
215
484
|
}
|
@@ -240,11 +509,11 @@ impl RbDataFrame {
|
|
240
509
|
Ok(())
|
241
510
|
}
|
242
511
|
|
243
|
-
pub fn dtypes(&self) -> Vec<
|
512
|
+
pub fn dtypes(&self) -> Vec<Value> {
|
244
513
|
self.df
|
245
514
|
.borrow()
|
246
515
|
.iter()
|
247
|
-
.map(|s| s.dtype().
|
516
|
+
.map(|s| Wrap(s.dtype().clone()).into())
|
248
517
|
.collect()
|
249
518
|
}
|
250
519
|
|
@@ -265,6 +534,73 @@ impl RbDataFrame {
|
|
265
534
|
self.df.borrow().width()
|
266
535
|
}
|
267
536
|
|
537
|
+
pub fn hstack_mut(&self, columns: RArray) -> RbResult<()> {
|
538
|
+
let columns = to_series_collection(columns)?;
|
539
|
+
self.df
|
540
|
+
.borrow_mut()
|
541
|
+
.hstack_mut(&columns)
|
542
|
+
.map_err(RbPolarsErr::from)?;
|
543
|
+
Ok(())
|
544
|
+
}
|
545
|
+
|
546
|
+
pub fn hstack(&self, columns: RArray) -> RbResult<Self> {
|
547
|
+
let columns = to_series_collection(columns)?;
|
548
|
+
let df = self
|
549
|
+
.df
|
550
|
+
.borrow()
|
551
|
+
.hstack(&columns)
|
552
|
+
.map_err(RbPolarsErr::from)?;
|
553
|
+
Ok(df.into())
|
554
|
+
}
|
555
|
+
|
556
|
+
pub fn extend(&self, df: &RbDataFrame) -> RbResult<()> {
|
557
|
+
self.df
|
558
|
+
.borrow_mut()
|
559
|
+
.extend(&df.df.borrow())
|
560
|
+
.map_err(RbPolarsErr::from)?;
|
561
|
+
Ok(())
|
562
|
+
}
|
563
|
+
|
564
|
+
pub fn vstack_mut(&self, df: &RbDataFrame) -> RbResult<()> {
|
565
|
+
self.df
|
566
|
+
.borrow_mut()
|
567
|
+
.vstack_mut(&df.df.borrow())
|
568
|
+
.map_err(RbPolarsErr::from)?;
|
569
|
+
Ok(())
|
570
|
+
}
|
571
|
+
|
572
|
+
pub fn vstack(&self, df: &RbDataFrame) -> RbResult<Self> {
|
573
|
+
let df = self
|
574
|
+
.df
|
575
|
+
.borrow()
|
576
|
+
.vstack(&df.df.borrow())
|
577
|
+
.map_err(RbPolarsErr::from)?;
|
578
|
+
Ok(df.into())
|
579
|
+
}
|
580
|
+
|
581
|
+
pub fn drop_in_place(&self, name: String) -> RbResult<RbSeries> {
|
582
|
+
let s = self
|
583
|
+
.df
|
584
|
+
.borrow_mut()
|
585
|
+
.drop_in_place(&name)
|
586
|
+
.map_err(RbPolarsErr::from)?;
|
587
|
+
Ok(RbSeries::new(s))
|
588
|
+
}
|
589
|
+
|
590
|
+
pub fn drop_nulls(&self, subset: Option<Vec<String>>) -> RbResult<Self> {
|
591
|
+
let df = self
|
592
|
+
.df
|
593
|
+
.borrow()
|
594
|
+
.drop_nulls(subset.as_ref().map(|s| s.as_ref()))
|
595
|
+
.map_err(RbPolarsErr::from)?;
|
596
|
+
Ok(df.into())
|
597
|
+
}
|
598
|
+
|
599
|
+
pub fn drop(&self, name: String) -> RbResult<Self> {
|
600
|
+
let df = self.df.borrow().drop(&name).map_err(RbPolarsErr::from)?;
|
601
|
+
Ok(RbDataFrame::new(df))
|
602
|
+
}
|
603
|
+
|
268
604
|
pub fn select_at_idx(&self, idx: usize) -> Option<RbSeries> {
|
269
605
|
self.df
|
270
606
|
.borrow()
|
@@ -272,6 +608,10 @@ impl RbDataFrame {
|
|
272
608
|
.map(|s| RbSeries::new(s.clone()))
|
273
609
|
}
|
274
610
|
|
611
|
+
pub fn find_idx_by_name(&self, name: String) -> Option<usize> {
|
612
|
+
self.df.borrow().find_idx_by_name(&name)
|
613
|
+
}
|
614
|
+
|
275
615
|
// TODO remove clone
|
276
616
|
pub fn column(&self, name: String) -> RbResult<RbSeries> {
|
277
617
|
self.df
|
@@ -418,7 +758,7 @@ impl RbDataFrame {
|
|
418
758
|
self.df.borrow().partition_by(groups)
|
419
759
|
}
|
420
760
|
.map_err(RbPolarsErr::from)?;
|
421
|
-
Ok(out.into_iter().map(
|
761
|
+
Ok(out.into_iter().map(RbDataFrame::new).collect())
|
422
762
|
}
|
423
763
|
|
424
764
|
pub fn shift(&self, periods: i64) -> Self {
|
@@ -574,6 +914,11 @@ impl RbDataFrame {
|
|
574
914
|
Ok(out.into())
|
575
915
|
}
|
576
916
|
|
917
|
+
pub fn to_struct(&self, name: String) -> RbSeries {
|
918
|
+
let s = self.df.borrow().clone().into_struct(&name);
|
919
|
+
s.into_series().into()
|
920
|
+
}
|
921
|
+
|
577
922
|
pub fn unnest(&self, names: Vec<String>) -> RbResult<Self> {
|
578
923
|
let df = self.df.borrow().unnest(names).map_err(RbPolarsErr::from)?;
|
579
924
|
Ok(df.into())
|
data/ext/polars/src/error.rs
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
use magnus::exception::arg_error;
|
2
2
|
use magnus::Error;
|
3
|
+
use polars::error::ArrowError;
|
3
4
|
use polars::prelude::PolarsError;
|
4
5
|
|
5
6
|
pub struct RbPolarsErr {}
|
@@ -10,6 +11,14 @@ impl RbPolarsErr {
|
|
10
11
|
Error::runtime_error(e.to_string())
|
11
12
|
}
|
12
13
|
|
14
|
+
pub fn arrow(e: ArrowError) -> Error {
|
15
|
+
Error::runtime_error(e.to_string())
|
16
|
+
}
|
17
|
+
|
18
|
+
pub fn io(e: std::io::Error) -> Error {
|
19
|
+
Error::runtime_error(e.to_string())
|
20
|
+
}
|
21
|
+
|
13
22
|
pub fn other(message: String) -> Error {
|
14
23
|
Error::runtime_error(message)
|
15
24
|
}
|
data/ext/polars/src/file.rs
CHANGED
@@ -1,18 +1,19 @@
|
|
1
1
|
use magnus::{Error, RString, Value};
|
2
2
|
use polars::io::mmap::MmapBytesReader;
|
3
|
-
use std::fs::
|
3
|
+
use std::fs::File;
|
4
4
|
use std::io::Cursor;
|
5
5
|
use std::path::PathBuf;
|
6
6
|
|
7
7
|
use crate::RbResult;
|
8
8
|
|
9
9
|
pub fn get_file_like(f: Value, truncate: bool) -> RbResult<File> {
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
10
|
+
let str_slice = f.try_convert::<PathBuf>()?;
|
11
|
+
let f = if truncate {
|
12
|
+
File::create(str_slice).map_err(|e| Error::runtime_error(e.to_string()))?
|
13
|
+
} else {
|
14
|
+
File::open(str_slice).map_err(|e| Error::runtime_error(e.to_string()))?
|
15
|
+
};
|
16
|
+
Ok(f)
|
16
17
|
}
|
17
18
|
|
18
19
|
pub fn get_mmap_bytes_reader(rb_f: Value) -> RbResult<Box<dyn MmapBytesReader>> {
|
@@ -1,4 +1,5 @@
|
|
1
1
|
use magnus::{RArray, RHash, Value};
|
2
|
+
use polars::io::RowCount;
|
2
3
|
use polars::lazy::frame::{LazyFrame, LazyGroupBy};
|
3
4
|
use polars::prelude::*;
|
4
5
|
use std::cell::RefCell;
|
@@ -52,6 +53,137 @@ impl From<LazyFrame> for RbLazyFrame {
|
|
52
53
|
}
|
53
54
|
|
54
55
|
impl RbLazyFrame {
|
56
|
+
pub fn new_from_ndjson(
|
57
|
+
path: String,
|
58
|
+
infer_schema_length: Option<usize>,
|
59
|
+
batch_size: Option<usize>,
|
60
|
+
n_rows: Option<usize>,
|
61
|
+
low_memory: bool,
|
62
|
+
rechunk: bool,
|
63
|
+
row_count: Option<(String, IdxSize)>,
|
64
|
+
) -> RbResult<Self> {
|
65
|
+
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
66
|
+
|
67
|
+
let lf = LazyJsonLineReader::new(path)
|
68
|
+
.with_infer_schema_length(infer_schema_length)
|
69
|
+
.with_batch_size(batch_size)
|
70
|
+
.with_n_rows(n_rows)
|
71
|
+
.low_memory(low_memory)
|
72
|
+
.with_rechunk(rechunk)
|
73
|
+
.with_row_count(row_count)
|
74
|
+
.finish()
|
75
|
+
.map_err(RbPolarsErr::from)?;
|
76
|
+
Ok(lf.into())
|
77
|
+
}
|
78
|
+
|
79
|
+
pub fn new_from_csv(arguments: &[Value]) -> RbResult<Self> {
|
80
|
+
// start arguments
|
81
|
+
// this pattern is needed for more than 16
|
82
|
+
let path: String = arguments[0].try_convert()?;
|
83
|
+
let sep: String = arguments[1].try_convert()?;
|
84
|
+
let has_header: bool = arguments[2].try_convert()?;
|
85
|
+
let ignore_errors: bool = arguments[3].try_convert()?;
|
86
|
+
let skip_rows: usize = arguments[4].try_convert()?;
|
87
|
+
let n_rows: Option<usize> = arguments[5].try_convert()?;
|
88
|
+
let cache: bool = arguments[6].try_convert()?;
|
89
|
+
let overwrite_dtype: Option<Vec<(String, Wrap<DataType>)>> = arguments[7].try_convert()?;
|
90
|
+
let low_memory: bool = arguments[8].try_convert()?;
|
91
|
+
let comment_char: Option<String> = arguments[9].try_convert()?;
|
92
|
+
let quote_char: Option<String> = arguments[10].try_convert()?;
|
93
|
+
let null_values: Option<Wrap<NullValues>> = arguments[11].try_convert()?;
|
94
|
+
let infer_schema_length: Option<usize> = arguments[12].try_convert()?;
|
95
|
+
let with_schema_modify: Option<Value> = arguments[13].try_convert()?;
|
96
|
+
let rechunk: bool = arguments[14].try_convert()?;
|
97
|
+
let skip_rows_after_header: usize = arguments[15].try_convert()?;
|
98
|
+
let encoding: Wrap<CsvEncoding> = arguments[16].try_convert()?;
|
99
|
+
let row_count: Option<(String, IdxSize)> = arguments[17].try_convert()?;
|
100
|
+
let parse_dates: bool = arguments[18].try_convert()?;
|
101
|
+
let eol_char: String = arguments[19].try_convert()?;
|
102
|
+
// end arguments
|
103
|
+
|
104
|
+
let null_values = null_values.map(|w| w.0);
|
105
|
+
let comment_char = comment_char.map(|s| s.as_bytes()[0]);
|
106
|
+
let quote_char = quote_char.map(|s| s.as_bytes()[0]);
|
107
|
+
let delimiter = sep.as_bytes()[0];
|
108
|
+
let eol_char = eol_char.as_bytes()[0];
|
109
|
+
|
110
|
+
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
111
|
+
|
112
|
+
let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
|
113
|
+
let fields = overwrite_dtype
|
114
|
+
.into_iter()
|
115
|
+
.map(|(name, dtype)| Field::new(&name, dtype.0));
|
116
|
+
Schema::from(fields)
|
117
|
+
});
|
118
|
+
let r = LazyCsvReader::new(path)
|
119
|
+
.with_infer_schema_length(infer_schema_length)
|
120
|
+
.with_delimiter(delimiter)
|
121
|
+
.has_header(has_header)
|
122
|
+
.with_ignore_parser_errors(ignore_errors)
|
123
|
+
.with_skip_rows(skip_rows)
|
124
|
+
.with_n_rows(n_rows)
|
125
|
+
.with_cache(cache)
|
126
|
+
.with_dtype_overwrite(overwrite_dtype.as_ref())
|
127
|
+
.low_memory(low_memory)
|
128
|
+
.with_comment_char(comment_char)
|
129
|
+
.with_quote_char(quote_char)
|
130
|
+
.with_end_of_line_char(eol_char)
|
131
|
+
.with_rechunk(rechunk)
|
132
|
+
.with_skip_rows_after_header(skip_rows_after_header)
|
133
|
+
.with_encoding(encoding.0)
|
134
|
+
.with_row_count(row_count)
|
135
|
+
.with_parse_dates(parse_dates)
|
136
|
+
.with_null_values(null_values);
|
137
|
+
|
138
|
+
if let Some(_lambda) = with_schema_modify {
|
139
|
+
todo!();
|
140
|
+
}
|
141
|
+
|
142
|
+
Ok(r.finish().map_err(RbPolarsErr::from)?.into())
|
143
|
+
}
|
144
|
+
|
145
|
+
pub fn new_from_parquet(
|
146
|
+
path: String,
|
147
|
+
n_rows: Option<usize>,
|
148
|
+
cache: bool,
|
149
|
+
parallel: Wrap<ParallelStrategy>,
|
150
|
+
rechunk: bool,
|
151
|
+
row_count: Option<(String, IdxSize)>,
|
152
|
+
low_memory: bool,
|
153
|
+
) -> RbResult<Self> {
|
154
|
+
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
155
|
+
let args = ScanArgsParquet {
|
156
|
+
n_rows,
|
157
|
+
cache,
|
158
|
+
parallel: parallel.0,
|
159
|
+
rechunk,
|
160
|
+
row_count,
|
161
|
+
low_memory,
|
162
|
+
};
|
163
|
+
let lf = LazyFrame::scan_parquet(path, args).map_err(RbPolarsErr::from)?;
|
164
|
+
Ok(lf.into())
|
165
|
+
}
|
166
|
+
|
167
|
+
pub fn new_from_ipc(
|
168
|
+
path: String,
|
169
|
+
n_rows: Option<usize>,
|
170
|
+
cache: bool,
|
171
|
+
rechunk: bool,
|
172
|
+
row_count: Option<(String, IdxSize)>,
|
173
|
+
memory_map: bool,
|
174
|
+
) -> RbResult<Self> {
|
175
|
+
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
176
|
+
let args = ScanArgsIpc {
|
177
|
+
n_rows,
|
178
|
+
cache,
|
179
|
+
rechunk,
|
180
|
+
row_count,
|
181
|
+
memmap: memory_map,
|
182
|
+
};
|
183
|
+
let lf = LazyFrame::scan_ipc(path, args).map_err(RbPolarsErr::from)?;
|
184
|
+
Ok(lf.into())
|
185
|
+
}
|
186
|
+
|
55
187
|
pub fn write_json(&self, rb_f: Value) -> RbResult<()> {
|
56
188
|
let file = BufWriter::new(get_file_like(rb_f, true)?);
|
57
189
|
serde_json::to_writer(file, &self.ldf.logical_plan)
|
@@ -388,9 +520,9 @@ impl RbLazyFrame {
|
|
388
520
|
Ok(self.get_schema()?.iter_names().cloned().collect())
|
389
521
|
}
|
390
522
|
|
391
|
-
pub fn dtypes(&self) -> RbResult<Vec<
|
523
|
+
pub fn dtypes(&self) -> RbResult<Vec<Value>> {
|
392
524
|
let schema = self.get_schema()?;
|
393
|
-
let iter = schema.iter_dtypes().map(|dt| dt.
|
525
|
+
let iter = schema.iter_dtypes().map(|dt| Wrap(dt.clone()).into());
|
394
526
|
Ok(iter.collect())
|
395
527
|
}
|
396
528
|
|
@@ -401,7 +533,7 @@ impl RbLazyFrame {
|
|
401
533
|
schema.iter_fields().for_each(|fld| {
|
402
534
|
// TODO remove unwrap
|
403
535
|
schema_dict
|
404
|
-
.aset(fld.name().clone(), fld.data_type().
|
536
|
+
.aset::<String, Value>(fld.name().clone(), Wrap(fld.data_type().clone()).into())
|
405
537
|
.unwrap();
|
406
538
|
});
|
407
539
|
Ok(schema_dict)
|