polars-df 0.1.1 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +3 -0
- data/CHANGELOG.md +8 -0
- data/Cargo.lock +2 -1
- data/README.md +1 -1
- data/ext/polars/Cargo.toml +7 -1
- data/ext/polars/src/batched_csv.rs +120 -0
- data/ext/polars/src/conversion.rs +139 -6
- data/ext/polars/src/dataframe.rs +360 -15
- data/ext/polars/src/error.rs +9 -0
- data/ext/polars/src/file.rs +8 -7
- data/ext/polars/src/lazy/apply.rs +7 -0
- data/ext/polars/src/lazy/dataframe.rs +135 -3
- data/ext/polars/src/lazy/dsl.rs +97 -2
- data/ext/polars/src/lazy/meta.rs +1 -1
- data/ext/polars/src/lazy/mod.rs +1 -0
- data/ext/polars/src/lib.rs +227 -12
- data/ext/polars/src/series.rs +190 -38
- data/ext/polars/src/set.rs +91 -0
- data/ext/polars/src/utils.rs +19 -0
- data/lib/polars/batched_csv_reader.rb +96 -0
- data/lib/polars/cat_expr.rb +39 -0
- data/lib/polars/data_frame.rb +2813 -100
- data/lib/polars/date_time_expr.rb +1282 -7
- data/lib/polars/exceptions.rb +20 -0
- data/lib/polars/expr.rb +631 -11
- data/lib/polars/expr_dispatch.rb +14 -0
- data/lib/polars/functions.rb +219 -0
- data/lib/polars/group_by.rb +517 -0
- data/lib/polars/io.rb +763 -4
- data/lib/polars/lazy_frame.rb +1415 -67
- data/lib/polars/lazy_functions.rb +430 -9
- data/lib/polars/lazy_group_by.rb +79 -0
- data/lib/polars/list_expr.rb +5 -0
- data/lib/polars/meta_expr.rb +21 -0
- data/lib/polars/series.rb +2244 -192
- data/lib/polars/slice.rb +104 -0
- data/lib/polars/string_expr.rb +663 -2
- data/lib/polars/struct_expr.rb +73 -0
- data/lib/polars/utils.rb +76 -3
- data/lib/polars/version.rb +2 -1
- data/lib/polars/when.rb +1 -0
- data/lib/polars/when_then.rb +1 -0
- data/lib/polars.rb +8 -2
- metadata +12 -2
data/ext/polars/src/dataframe.rs
CHANGED
@@ -1,15 +1,14 @@
|
|
1
|
-
use magnus::{r_hash::ForEach,
|
1
|
+
use magnus::{r_hash::ForEach, RArray, RHash, RString, Value};
|
2
2
|
use polars::io::mmap::ReaderBytes;
|
3
|
+
use polars::io::RowCount;
|
3
4
|
use polars::prelude::*;
|
4
5
|
use std::cell::RefCell;
|
5
|
-
use std::
|
6
|
-
use std::io::{BufReader, BufWriter, Cursor};
|
6
|
+
use std::io::{BufWriter, Cursor};
|
7
7
|
use std::ops::Deref;
|
8
|
-
use std::path::PathBuf;
|
9
8
|
|
10
9
|
use crate::conversion::*;
|
11
10
|
use crate::file::{get_file_like, get_mmap_bytes_reader};
|
12
|
-
use crate::series::to_rbseries_collection;
|
11
|
+
use crate::series::{to_rbseries_collection, to_series_collection};
|
13
12
|
use crate::{series, RbLazyFrame, RbPolarsErr, RbResult, RbSeries};
|
14
13
|
|
15
14
|
#[magnus::wrap(class = "Polars::RbDataFrame")]
|
@@ -43,22 +42,141 @@ impl RbDataFrame {
|
|
43
42
|
self.df.borrow().estimated_size()
|
44
43
|
}
|
45
44
|
|
46
|
-
pub fn read_csv(
|
45
|
+
pub fn read_csv(arguments: &[Value]) -> RbResult<Self> {
|
46
|
+
// start arguments
|
47
|
+
// this pattern is needed for more than 16
|
48
|
+
let rb_f: Value = arguments[0].try_convert()?;
|
49
|
+
let infer_schema_length: Option<usize> = arguments[1].try_convert()?;
|
50
|
+
let chunk_size: usize = arguments[2].try_convert()?;
|
51
|
+
let has_header: bool = arguments[3].try_convert()?;
|
52
|
+
let ignore_errors: bool = arguments[4].try_convert()?;
|
53
|
+
let n_rows: Option<usize> = arguments[5].try_convert()?;
|
54
|
+
let skip_rows: usize = arguments[6].try_convert()?;
|
55
|
+
let projection: Option<Vec<usize>> = arguments[7].try_convert()?;
|
56
|
+
let sep: String = arguments[8].try_convert()?;
|
57
|
+
let rechunk: bool = arguments[9].try_convert()?;
|
58
|
+
let columns: Option<Vec<String>> = arguments[10].try_convert()?;
|
59
|
+
let encoding: Wrap<CsvEncoding> = arguments[11].try_convert()?;
|
60
|
+
let n_threads: Option<usize> = arguments[12].try_convert()?;
|
61
|
+
let path: Option<String> = arguments[13].try_convert()?;
|
62
|
+
let overwrite_dtype: Option<Vec<(String, Wrap<DataType>)>> = arguments[14].try_convert()?;
|
63
|
+
// TODO fix
|
64
|
+
let overwrite_dtype_slice: Option<Vec<Wrap<DataType>>> = None; // arguments[15].try_convert()?;
|
65
|
+
let low_memory: bool = arguments[16].try_convert()?;
|
66
|
+
let comment_char: Option<String> = arguments[17].try_convert()?;
|
67
|
+
let quote_char: Option<String> = arguments[18].try_convert()?;
|
68
|
+
let null_values: Option<Wrap<NullValues>> = arguments[19].try_convert()?;
|
69
|
+
let parse_dates: bool = arguments[20].try_convert()?;
|
70
|
+
let skip_rows_after_header: usize = arguments[21].try_convert()?;
|
71
|
+
let row_count: Option<(String, IdxSize)> = arguments[22].try_convert()?;
|
72
|
+
let sample_size: usize = arguments[23].try_convert()?;
|
73
|
+
let eol_char: String = arguments[24].try_convert()?;
|
74
|
+
// end arguments
|
75
|
+
|
76
|
+
let null_values = null_values.map(|w| w.0);
|
77
|
+
let comment_char = comment_char.map(|s| s.as_bytes()[0]);
|
78
|
+
let eol_char = eol_char.as_bytes()[0];
|
79
|
+
|
80
|
+
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
81
|
+
|
82
|
+
let quote_char = if let Some(s) = quote_char {
|
83
|
+
if s.is_empty() {
|
84
|
+
None
|
85
|
+
} else {
|
86
|
+
Some(s.as_bytes()[0])
|
87
|
+
}
|
88
|
+
} else {
|
89
|
+
None
|
90
|
+
};
|
91
|
+
|
92
|
+
let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
|
93
|
+
let fields = overwrite_dtype.iter().map(|(name, dtype)| {
|
94
|
+
let dtype = dtype.0.clone();
|
95
|
+
Field::new(name, dtype)
|
96
|
+
});
|
97
|
+
Schema::from(fields)
|
98
|
+
});
|
99
|
+
|
100
|
+
let overwrite_dtype_slice = overwrite_dtype_slice.map(|overwrite_dtype| {
|
101
|
+
overwrite_dtype
|
102
|
+
.iter()
|
103
|
+
.map(|dt| dt.0.clone())
|
104
|
+
.collect::<Vec<_>>()
|
105
|
+
});
|
106
|
+
|
47
107
|
let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
|
48
108
|
let df = CsvReader::new(mmap_bytes_r)
|
109
|
+
.infer_schema(infer_schema_length)
|
49
110
|
.has_header(has_header)
|
111
|
+
.with_n_rows(n_rows)
|
112
|
+
.with_delimiter(sep.as_bytes()[0])
|
113
|
+
.with_skip_rows(skip_rows)
|
114
|
+
.with_ignore_parser_errors(ignore_errors)
|
115
|
+
.with_projection(projection)
|
116
|
+
.with_rechunk(rechunk)
|
117
|
+
.with_chunk_size(chunk_size)
|
118
|
+
.with_encoding(encoding.0)
|
119
|
+
.with_columns(columns)
|
120
|
+
.with_n_threads(n_threads)
|
121
|
+
.with_path(path)
|
122
|
+
.with_dtypes(overwrite_dtype.as_ref())
|
123
|
+
.with_dtypes_slice(overwrite_dtype_slice.as_deref())
|
124
|
+
.low_memory(low_memory)
|
125
|
+
.with_comment_char(comment_char)
|
126
|
+
.with_null_values(null_values)
|
127
|
+
.with_parse_dates(parse_dates)
|
128
|
+
.with_quote_char(quote_char)
|
129
|
+
.with_end_of_line_char(eol_char)
|
130
|
+
.with_skip_rows_after_header(skip_rows_after_header)
|
131
|
+
.with_row_count(row_count)
|
132
|
+
.sample_size(sample_size)
|
50
133
|
.finish()
|
51
134
|
.map_err(RbPolarsErr::from)?;
|
52
135
|
Ok(df.into())
|
53
136
|
}
|
54
137
|
|
55
|
-
pub fn read_parquet(
|
56
|
-
|
57
|
-
|
58
|
-
|
138
|
+
pub fn read_parquet(
|
139
|
+
rb_f: Value,
|
140
|
+
columns: Option<Vec<String>>,
|
141
|
+
projection: Option<Vec<usize>>,
|
142
|
+
n_rows: Option<usize>,
|
143
|
+
parallel: Wrap<ParallelStrategy>,
|
144
|
+
row_count: Option<(String, IdxSize)>,
|
145
|
+
low_memory: bool,
|
146
|
+
) -> RbResult<Self> {
|
147
|
+
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
148
|
+
let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
|
149
|
+
let df = ParquetReader::new(mmap_bytes_r)
|
150
|
+
.with_projection(projection)
|
151
|
+
.with_columns(columns)
|
152
|
+
.read_parallel(parallel.0)
|
153
|
+
.with_n_rows(n_rows)
|
154
|
+
.with_row_count(row_count)
|
155
|
+
.set_low_memory(low_memory)
|
59
156
|
.finish()
|
60
|
-
.map_err(RbPolarsErr::from)
|
61
|
-
|
157
|
+
.map_err(RbPolarsErr::from)?;
|
158
|
+
Ok(RbDataFrame::new(df))
|
159
|
+
}
|
160
|
+
|
161
|
+
pub fn read_ipc(
|
162
|
+
rb_f: Value,
|
163
|
+
columns: Option<Vec<String>>,
|
164
|
+
projection: Option<Vec<usize>>,
|
165
|
+
n_rows: Option<usize>,
|
166
|
+
row_count: Option<(String, IdxSize)>,
|
167
|
+
memory_map: bool,
|
168
|
+
) -> RbResult<Self> {
|
169
|
+
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
170
|
+
let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
|
171
|
+
let df = IpcReader::new(mmap_bytes_r)
|
172
|
+
.with_projection(projection)
|
173
|
+
.with_columns(columns)
|
174
|
+
.with_n_rows(n_rows)
|
175
|
+
.with_row_count(row_count)
|
176
|
+
.memory_mapped(memory_map)
|
177
|
+
.finish()
|
178
|
+
.map_err(RbPolarsErr::from)?;
|
179
|
+
Ok(RbDataFrame::new(df))
|
62
180
|
}
|
63
181
|
|
64
182
|
pub fn read_json(rb_f: Value) -> RbResult<Self> {
|
@@ -185,6 +303,77 @@ impl RbDataFrame {
|
|
185
303
|
Ok(())
|
186
304
|
}
|
187
305
|
|
306
|
+
pub fn write_ipc(
|
307
|
+
&self,
|
308
|
+
rb_f: Value,
|
309
|
+
compression: Wrap<Option<IpcCompression>>,
|
310
|
+
) -> RbResult<()> {
|
311
|
+
if let Ok(s) = rb_f.try_convert::<String>() {
|
312
|
+
let f = std::fs::File::create(&s).unwrap();
|
313
|
+
IpcWriter::new(f)
|
314
|
+
.with_compression(compression.0)
|
315
|
+
.finish(&mut self.df.borrow_mut())
|
316
|
+
.map_err(RbPolarsErr::from)?;
|
317
|
+
} else {
|
318
|
+
let mut buf = get_file_like(rb_f, true)?;
|
319
|
+
|
320
|
+
IpcWriter::new(&mut buf)
|
321
|
+
.with_compression(compression.0)
|
322
|
+
.finish(&mut self.df.borrow_mut())
|
323
|
+
.map_err(RbPolarsErr::from)?;
|
324
|
+
}
|
325
|
+
Ok(())
|
326
|
+
}
|
327
|
+
|
328
|
+
pub fn row_tuple(&self, idx: i64) -> Value {
|
329
|
+
let idx = if idx < 0 {
|
330
|
+
(self.df.borrow().height() as i64 + idx) as usize
|
331
|
+
} else {
|
332
|
+
idx as usize
|
333
|
+
};
|
334
|
+
RArray::from_vec(
|
335
|
+
self.df
|
336
|
+
.borrow()
|
337
|
+
.get_columns()
|
338
|
+
.iter()
|
339
|
+
.map(|s| match s.dtype() {
|
340
|
+
DataType::Object(_) => {
|
341
|
+
let obj: Option<&ObjectValue> = s.get_object(idx).map(|any| any.into());
|
342
|
+
obj.unwrap().to_object()
|
343
|
+
}
|
344
|
+
_ => Wrap(s.get(idx)).into(),
|
345
|
+
})
|
346
|
+
.collect(),
|
347
|
+
)
|
348
|
+
.into()
|
349
|
+
}
|
350
|
+
|
351
|
+
pub fn row_tuples(&self) -> Value {
|
352
|
+
let df = &self.df;
|
353
|
+
RArray::from_vec(
|
354
|
+
(0..df.borrow().height())
|
355
|
+
.map(|idx| {
|
356
|
+
RArray::from_vec(
|
357
|
+
self.df
|
358
|
+
.borrow()
|
359
|
+
.get_columns()
|
360
|
+
.iter()
|
361
|
+
.map(|s| match s.dtype() {
|
362
|
+
DataType::Object(_) => {
|
363
|
+
let obj: Option<&ObjectValue> =
|
364
|
+
s.get_object(idx).map(|any| any.into());
|
365
|
+
obj.unwrap().to_object()
|
366
|
+
}
|
367
|
+
_ => Wrap(s.get(idx)).into(),
|
368
|
+
})
|
369
|
+
.collect(),
|
370
|
+
)
|
371
|
+
})
|
372
|
+
.collect(),
|
373
|
+
)
|
374
|
+
.into()
|
375
|
+
}
|
376
|
+
|
188
377
|
pub fn write_parquet(
|
189
378
|
&self,
|
190
379
|
rb_f: Value,
|
@@ -210,6 +399,86 @@ impl RbDataFrame {
|
|
210
399
|
Ok(())
|
211
400
|
}
|
212
401
|
|
402
|
+
pub fn add(&self, s: &RbSeries) -> RbResult<Self> {
|
403
|
+
let df = (&*self.df.borrow() + &*s.series.borrow()).map_err(RbPolarsErr::from)?;
|
404
|
+
Ok(df.into())
|
405
|
+
}
|
406
|
+
|
407
|
+
pub fn sub(&self, s: &RbSeries) -> RbResult<Self> {
|
408
|
+
let df = (&*self.df.borrow() - &*s.series.borrow()).map_err(RbPolarsErr::from)?;
|
409
|
+
Ok(df.into())
|
410
|
+
}
|
411
|
+
|
412
|
+
pub fn div(&self, s: &RbSeries) -> RbResult<Self> {
|
413
|
+
let df = (&*self.df.borrow() / &*s.series.borrow()).map_err(RbPolarsErr::from)?;
|
414
|
+
Ok(df.into())
|
415
|
+
}
|
416
|
+
|
417
|
+
pub fn mul(&self, s: &RbSeries) -> RbResult<Self> {
|
418
|
+
let df = (&*self.df.borrow() * &*s.series.borrow()).map_err(RbPolarsErr::from)?;
|
419
|
+
Ok(df.into())
|
420
|
+
}
|
421
|
+
|
422
|
+
pub fn rem(&self, s: &RbSeries) -> RbResult<Self> {
|
423
|
+
let df = (&*self.df.borrow() % &*s.series.borrow()).map_err(RbPolarsErr::from)?;
|
424
|
+
Ok(df.into())
|
425
|
+
}
|
426
|
+
|
427
|
+
pub fn add_df(&self, s: &Self) -> RbResult<Self> {
|
428
|
+
let df = (&*self.df.borrow() + &*s.df.borrow()).map_err(RbPolarsErr::from)?;
|
429
|
+
Ok(df.into())
|
430
|
+
}
|
431
|
+
|
432
|
+
pub fn sub_df(&self, s: &Self) -> RbResult<Self> {
|
433
|
+
let df = (&*self.df.borrow() - &*s.df.borrow()).map_err(RbPolarsErr::from)?;
|
434
|
+
Ok(df.into())
|
435
|
+
}
|
436
|
+
|
437
|
+
pub fn div_df(&self, s: &Self) -> RbResult<Self> {
|
438
|
+
let df = (&*self.df.borrow() / &*s.df.borrow()).map_err(RbPolarsErr::from)?;
|
439
|
+
Ok(df.into())
|
440
|
+
}
|
441
|
+
|
442
|
+
pub fn mul_df(&self, s: &Self) -> RbResult<Self> {
|
443
|
+
let df = (&*self.df.borrow() * &*s.df.borrow()).map_err(RbPolarsErr::from)?;
|
444
|
+
Ok(df.into())
|
445
|
+
}
|
446
|
+
|
447
|
+
pub fn rem_df(&self, s: &Self) -> RbResult<Self> {
|
448
|
+
let df = (&*self.df.borrow() % &*s.df.borrow()).map_err(RbPolarsErr::from)?;
|
449
|
+
Ok(df.into())
|
450
|
+
}
|
451
|
+
|
452
|
+
pub fn sample_n(
|
453
|
+
&self,
|
454
|
+
n: usize,
|
455
|
+
with_replacement: bool,
|
456
|
+
shuffle: bool,
|
457
|
+
seed: Option<u64>,
|
458
|
+
) -> RbResult<Self> {
|
459
|
+
let df = self
|
460
|
+
.df
|
461
|
+
.borrow()
|
462
|
+
.sample_n(n, with_replacement, shuffle, seed)
|
463
|
+
.map_err(RbPolarsErr::from)?;
|
464
|
+
Ok(df.into())
|
465
|
+
}
|
466
|
+
|
467
|
+
pub fn sample_frac(
|
468
|
+
&self,
|
469
|
+
frac: f64,
|
470
|
+
with_replacement: bool,
|
471
|
+
shuffle: bool,
|
472
|
+
seed: Option<u64>,
|
473
|
+
) -> RbResult<Self> {
|
474
|
+
let df = self
|
475
|
+
.df
|
476
|
+
.borrow()
|
477
|
+
.sample_frac(frac, with_replacement, shuffle, seed)
|
478
|
+
.map_err(RbPolarsErr::from)?;
|
479
|
+
Ok(df.into())
|
480
|
+
}
|
481
|
+
|
213
482
|
/// Returns a new frame built from the result of polars' `agg_chunks` on this frame.
pub fn rechunk(&self) -> Self {
    let frame = self.df.borrow();
    frame.agg_chunks().into()
}
|
@@ -240,11 +509,11 @@ impl RbDataFrame {
|
|
240
509
|
Ok(())
|
241
510
|
}
|
242
511
|
|
243
|
-
pub fn dtypes(&self) -> Vec<
|
512
|
+
pub fn dtypes(&self) -> Vec<Value> {
|
244
513
|
self.df
|
245
514
|
.borrow()
|
246
515
|
.iter()
|
247
|
-
.map(|s| s.dtype().
|
516
|
+
.map(|s| Wrap(s.dtype().clone()).into())
|
248
517
|
.collect()
|
249
518
|
}
|
250
519
|
|
@@ -265,6 +534,73 @@ impl RbDataFrame {
|
|
265
534
|
self.df.borrow().width()
|
266
535
|
}
|
267
536
|
|
537
|
+
pub fn hstack_mut(&self, columns: RArray) -> RbResult<()> {
|
538
|
+
let columns = to_series_collection(columns)?;
|
539
|
+
self.df
|
540
|
+
.borrow_mut()
|
541
|
+
.hstack_mut(&columns)
|
542
|
+
.map_err(RbPolarsErr::from)?;
|
543
|
+
Ok(())
|
544
|
+
}
|
545
|
+
|
546
|
+
pub fn hstack(&self, columns: RArray) -> RbResult<Self> {
|
547
|
+
let columns = to_series_collection(columns)?;
|
548
|
+
let df = self
|
549
|
+
.df
|
550
|
+
.borrow()
|
551
|
+
.hstack(&columns)
|
552
|
+
.map_err(RbPolarsErr::from)?;
|
553
|
+
Ok(df.into())
|
554
|
+
}
|
555
|
+
|
556
|
+
pub fn extend(&self, df: &RbDataFrame) -> RbResult<()> {
|
557
|
+
self.df
|
558
|
+
.borrow_mut()
|
559
|
+
.extend(&df.df.borrow())
|
560
|
+
.map_err(RbPolarsErr::from)?;
|
561
|
+
Ok(())
|
562
|
+
}
|
563
|
+
|
564
|
+
pub fn vstack_mut(&self, df: &RbDataFrame) -> RbResult<()> {
|
565
|
+
self.df
|
566
|
+
.borrow_mut()
|
567
|
+
.vstack_mut(&df.df.borrow())
|
568
|
+
.map_err(RbPolarsErr::from)?;
|
569
|
+
Ok(())
|
570
|
+
}
|
571
|
+
|
572
|
+
pub fn vstack(&self, df: &RbDataFrame) -> RbResult<Self> {
|
573
|
+
let df = self
|
574
|
+
.df
|
575
|
+
.borrow()
|
576
|
+
.vstack(&df.df.borrow())
|
577
|
+
.map_err(RbPolarsErr::from)?;
|
578
|
+
Ok(df.into())
|
579
|
+
}
|
580
|
+
|
581
|
+
pub fn drop_in_place(&self, name: String) -> RbResult<RbSeries> {
|
582
|
+
let s = self
|
583
|
+
.df
|
584
|
+
.borrow_mut()
|
585
|
+
.drop_in_place(&name)
|
586
|
+
.map_err(RbPolarsErr::from)?;
|
587
|
+
Ok(RbSeries::new(s))
|
588
|
+
}
|
589
|
+
|
590
|
+
pub fn drop_nulls(&self, subset: Option<Vec<String>>) -> RbResult<Self> {
|
591
|
+
let df = self
|
592
|
+
.df
|
593
|
+
.borrow()
|
594
|
+
.drop_nulls(subset.as_ref().map(|s| s.as_ref()))
|
595
|
+
.map_err(RbPolarsErr::from)?;
|
596
|
+
Ok(df.into())
|
597
|
+
}
|
598
|
+
|
599
|
+
pub fn drop(&self, name: String) -> RbResult<Self> {
|
600
|
+
let df = self.df.borrow().drop(&name).map_err(RbPolarsErr::from)?;
|
601
|
+
Ok(RbDataFrame::new(df))
|
602
|
+
}
|
603
|
+
|
268
604
|
pub fn select_at_idx(&self, idx: usize) -> Option<RbSeries> {
|
269
605
|
self.df
|
270
606
|
.borrow()
|
@@ -272,6 +608,10 @@ impl RbDataFrame {
|
|
272
608
|
.map(|s| RbSeries::new(s.clone()))
|
273
609
|
}
|
274
610
|
|
611
|
+
pub fn find_idx_by_name(&self, name: String) -> Option<usize> {
|
612
|
+
self.df.borrow().find_idx_by_name(&name)
|
613
|
+
}
|
614
|
+
|
275
615
|
// TODO remove clone
|
276
616
|
pub fn column(&self, name: String) -> RbResult<RbSeries> {
|
277
617
|
self.df
|
@@ -418,7 +758,7 @@ impl RbDataFrame {
|
|
418
758
|
self.df.borrow().partition_by(groups)
|
419
759
|
}
|
420
760
|
.map_err(RbPolarsErr::from)?;
|
421
|
-
Ok(out.into_iter().map(
|
761
|
+
Ok(out.into_iter().map(RbDataFrame::new).collect())
|
422
762
|
}
|
423
763
|
|
424
764
|
pub fn shift(&self, periods: i64) -> Self {
|
@@ -574,6 +914,11 @@ impl RbDataFrame {
|
|
574
914
|
Ok(out.into())
|
575
915
|
}
|
576
916
|
|
917
|
+
/// Converts a clone of this frame into a struct series called `name`.
pub fn to_struct(&self, name: String) -> RbSeries {
    let snapshot = self.df.borrow().clone();
    let structured = snapshot.into_struct(&name);
    structured.into_series().into()
}
|
921
|
+
|
577
922
|
pub fn unnest(&self, names: Vec<String>) -> RbResult<Self> {
|
578
923
|
let df = self.df.borrow().unnest(names).map_err(RbPolarsErr::from)?;
|
579
924
|
Ok(df.into())
|
data/ext/polars/src/error.rs
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
use magnus::exception::arg_error;
|
2
2
|
use magnus::Error;
|
3
|
+
use polars::error::ArrowError;
|
3
4
|
use polars::prelude::PolarsError;
|
4
5
|
|
5
6
|
pub struct RbPolarsErr {}
|
@@ -10,6 +11,14 @@ impl RbPolarsErr {
|
|
10
11
|
Error::runtime_error(e.to_string())
|
11
12
|
}
|
12
13
|
|
14
|
+
pub fn arrow(e: ArrowError) -> Error {
|
15
|
+
Error::runtime_error(e.to_string())
|
16
|
+
}
|
17
|
+
|
18
|
+
pub fn io(e: std::io::Error) -> Error {
|
19
|
+
Error::runtime_error(e.to_string())
|
20
|
+
}
|
21
|
+
|
13
22
|
pub fn other(message: String) -> Error {
|
14
23
|
Error::runtime_error(message)
|
15
24
|
}
|
data/ext/polars/src/file.rs
CHANGED
@@ -1,18 +1,19 @@
|
|
1
1
|
use magnus::{Error, RString, Value};
|
2
2
|
use polars::io::mmap::MmapBytesReader;
|
3
|
-
use std::fs::
|
3
|
+
use std::fs::File;
|
4
4
|
use std::io::Cursor;
|
5
5
|
use std::path::PathBuf;
|
6
6
|
|
7
7
|
use crate::RbResult;
|
8
8
|
|
9
9
|
pub fn get_file_like(f: Value, truncate: bool) -> RbResult<File> {
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
10
|
+
let str_slice = f.try_convert::<PathBuf>()?;
|
11
|
+
let f = if truncate {
|
12
|
+
File::create(str_slice).map_err(|e| Error::runtime_error(e.to_string()))?
|
13
|
+
} else {
|
14
|
+
File::open(str_slice).map_err(|e| Error::runtime_error(e.to_string()))?
|
15
|
+
};
|
16
|
+
Ok(f)
|
16
17
|
}
|
17
18
|
|
18
19
|
pub fn get_mmap_bytes_reader(rb_f: Value) -> RbResult<Box<dyn MmapBytesReader>> {
|
@@ -1,4 +1,5 @@
|
|
1
1
|
use magnus::{RArray, RHash, Value};
|
2
|
+
use polars::io::RowCount;
|
2
3
|
use polars::lazy::frame::{LazyFrame, LazyGroupBy};
|
3
4
|
use polars::prelude::*;
|
4
5
|
use std::cell::RefCell;
|
@@ -52,6 +53,137 @@ impl From<LazyFrame> for RbLazyFrame {
|
|
52
53
|
}
|
53
54
|
|
54
55
|
impl RbLazyFrame {
|
56
|
+
pub fn new_from_ndjson(
|
57
|
+
path: String,
|
58
|
+
infer_schema_length: Option<usize>,
|
59
|
+
batch_size: Option<usize>,
|
60
|
+
n_rows: Option<usize>,
|
61
|
+
low_memory: bool,
|
62
|
+
rechunk: bool,
|
63
|
+
row_count: Option<(String, IdxSize)>,
|
64
|
+
) -> RbResult<Self> {
|
65
|
+
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
66
|
+
|
67
|
+
let lf = LazyJsonLineReader::new(path)
|
68
|
+
.with_infer_schema_length(infer_schema_length)
|
69
|
+
.with_batch_size(batch_size)
|
70
|
+
.with_n_rows(n_rows)
|
71
|
+
.low_memory(low_memory)
|
72
|
+
.with_rechunk(rechunk)
|
73
|
+
.with_row_count(row_count)
|
74
|
+
.finish()
|
75
|
+
.map_err(RbPolarsErr::from)?;
|
76
|
+
Ok(lf.into())
|
77
|
+
}
|
78
|
+
|
79
|
+
pub fn new_from_csv(arguments: &[Value]) -> RbResult<Self> {
|
80
|
+
// start arguments
|
81
|
+
// this pattern is needed for more than 16
|
82
|
+
let path: String = arguments[0].try_convert()?;
|
83
|
+
let sep: String = arguments[1].try_convert()?;
|
84
|
+
let has_header: bool = arguments[2].try_convert()?;
|
85
|
+
let ignore_errors: bool = arguments[3].try_convert()?;
|
86
|
+
let skip_rows: usize = arguments[4].try_convert()?;
|
87
|
+
let n_rows: Option<usize> = arguments[5].try_convert()?;
|
88
|
+
let cache: bool = arguments[6].try_convert()?;
|
89
|
+
let overwrite_dtype: Option<Vec<(String, Wrap<DataType>)>> = arguments[7].try_convert()?;
|
90
|
+
let low_memory: bool = arguments[8].try_convert()?;
|
91
|
+
let comment_char: Option<String> = arguments[9].try_convert()?;
|
92
|
+
let quote_char: Option<String> = arguments[10].try_convert()?;
|
93
|
+
let null_values: Option<Wrap<NullValues>> = arguments[11].try_convert()?;
|
94
|
+
let infer_schema_length: Option<usize> = arguments[12].try_convert()?;
|
95
|
+
let with_schema_modify: Option<Value> = arguments[13].try_convert()?;
|
96
|
+
let rechunk: bool = arguments[14].try_convert()?;
|
97
|
+
let skip_rows_after_header: usize = arguments[15].try_convert()?;
|
98
|
+
let encoding: Wrap<CsvEncoding> = arguments[16].try_convert()?;
|
99
|
+
let row_count: Option<(String, IdxSize)> = arguments[17].try_convert()?;
|
100
|
+
let parse_dates: bool = arguments[18].try_convert()?;
|
101
|
+
let eol_char: String = arguments[19].try_convert()?;
|
102
|
+
// end arguments
|
103
|
+
|
104
|
+
let null_values = null_values.map(|w| w.0);
|
105
|
+
let comment_char = comment_char.map(|s| s.as_bytes()[0]);
|
106
|
+
let quote_char = quote_char.map(|s| s.as_bytes()[0]);
|
107
|
+
let delimiter = sep.as_bytes()[0];
|
108
|
+
let eol_char = eol_char.as_bytes()[0];
|
109
|
+
|
110
|
+
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
111
|
+
|
112
|
+
let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
|
113
|
+
let fields = overwrite_dtype
|
114
|
+
.into_iter()
|
115
|
+
.map(|(name, dtype)| Field::new(&name, dtype.0));
|
116
|
+
Schema::from(fields)
|
117
|
+
});
|
118
|
+
let r = LazyCsvReader::new(path)
|
119
|
+
.with_infer_schema_length(infer_schema_length)
|
120
|
+
.with_delimiter(delimiter)
|
121
|
+
.has_header(has_header)
|
122
|
+
.with_ignore_parser_errors(ignore_errors)
|
123
|
+
.with_skip_rows(skip_rows)
|
124
|
+
.with_n_rows(n_rows)
|
125
|
+
.with_cache(cache)
|
126
|
+
.with_dtype_overwrite(overwrite_dtype.as_ref())
|
127
|
+
.low_memory(low_memory)
|
128
|
+
.with_comment_char(comment_char)
|
129
|
+
.with_quote_char(quote_char)
|
130
|
+
.with_end_of_line_char(eol_char)
|
131
|
+
.with_rechunk(rechunk)
|
132
|
+
.with_skip_rows_after_header(skip_rows_after_header)
|
133
|
+
.with_encoding(encoding.0)
|
134
|
+
.with_row_count(row_count)
|
135
|
+
.with_parse_dates(parse_dates)
|
136
|
+
.with_null_values(null_values);
|
137
|
+
|
138
|
+
if let Some(_lambda) = with_schema_modify {
|
139
|
+
todo!();
|
140
|
+
}
|
141
|
+
|
142
|
+
Ok(r.finish().map_err(RbPolarsErr::from)?.into())
|
143
|
+
}
|
144
|
+
|
145
|
+
pub fn new_from_parquet(
|
146
|
+
path: String,
|
147
|
+
n_rows: Option<usize>,
|
148
|
+
cache: bool,
|
149
|
+
parallel: Wrap<ParallelStrategy>,
|
150
|
+
rechunk: bool,
|
151
|
+
row_count: Option<(String, IdxSize)>,
|
152
|
+
low_memory: bool,
|
153
|
+
) -> RbResult<Self> {
|
154
|
+
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
155
|
+
let args = ScanArgsParquet {
|
156
|
+
n_rows,
|
157
|
+
cache,
|
158
|
+
parallel: parallel.0,
|
159
|
+
rechunk,
|
160
|
+
row_count,
|
161
|
+
low_memory,
|
162
|
+
};
|
163
|
+
let lf = LazyFrame::scan_parquet(path, args).map_err(RbPolarsErr::from)?;
|
164
|
+
Ok(lf.into())
|
165
|
+
}
|
166
|
+
|
167
|
+
pub fn new_from_ipc(
|
168
|
+
path: String,
|
169
|
+
n_rows: Option<usize>,
|
170
|
+
cache: bool,
|
171
|
+
rechunk: bool,
|
172
|
+
row_count: Option<(String, IdxSize)>,
|
173
|
+
memory_map: bool,
|
174
|
+
) -> RbResult<Self> {
|
175
|
+
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
176
|
+
let args = ScanArgsIpc {
|
177
|
+
n_rows,
|
178
|
+
cache,
|
179
|
+
rechunk,
|
180
|
+
row_count,
|
181
|
+
memmap: memory_map,
|
182
|
+
};
|
183
|
+
let lf = LazyFrame::scan_ipc(path, args).map_err(RbPolarsErr::from)?;
|
184
|
+
Ok(lf.into())
|
185
|
+
}
|
186
|
+
|
55
187
|
pub fn write_json(&self, rb_f: Value) -> RbResult<()> {
|
56
188
|
let file = BufWriter::new(get_file_like(rb_f, true)?);
|
57
189
|
serde_json::to_writer(file, &self.ldf.logical_plan)
|
@@ -388,9 +520,9 @@ impl RbLazyFrame {
|
|
388
520
|
Ok(self.get_schema()?.iter_names().cloned().collect())
|
389
521
|
}
|
390
522
|
|
391
|
-
pub fn dtypes(&self) -> RbResult<Vec<
|
523
|
+
pub fn dtypes(&self) -> RbResult<Vec<Value>> {
|
392
524
|
let schema = self.get_schema()?;
|
393
|
-
let iter = schema.iter_dtypes().map(|dt| dt.
|
525
|
+
let iter = schema.iter_dtypes().map(|dt| Wrap(dt.clone()).into());
|
394
526
|
Ok(iter.collect())
|
395
527
|
}
|
396
528
|
|
@@ -401,7 +533,7 @@ impl RbLazyFrame {
|
|
401
533
|
schema.iter_fields().for_each(|fld| {
|
402
534
|
// TODO remove unwrap
|
403
535
|
schema_dict
|
404
|
-
.aset(fld.name().clone(), fld.data_type().
|
536
|
+
.aset::<String, Value>(fld.name().clone(), Wrap(fld.data_type().clone()).into())
|
405
537
|
.unwrap();
|
406
538
|
});
|
407
539
|
Ok(schema_dict)
|