polars-df 0.1.0 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,10 +1,14 @@
1
- use magnus::RArray;
1
+ use magnus::{RArray, RHash, Value};
2
+ use polars::io::RowCount;
2
3
  use polars::lazy::frame::{LazyFrame, LazyGroupBy};
4
+ use polars::prelude::*;
3
5
  use std::cell::RefCell;
6
+ use std::io::BufWriter;
4
7
 
5
- use crate::conversion::wrap_join_type;
8
+ use crate::conversion::*;
9
+ use crate::file::get_file_like;
6
10
  use crate::lazy::utils::rb_exprs_to_exprs;
7
- use crate::{RbDataFrame, RbExpr, RbPolarsErr, RbResult};
11
+ use crate::{RbDataFrame, RbExpr, RbPolarsErr, RbResult, RbValueError};
8
12
 
9
13
  #[magnus::wrap(class = "Polars::RbLazyGroupBy")]
10
14
  pub struct RbLazyGroupBy {
@@ -17,6 +21,16 @@ impl RbLazyGroupBy {
17
21
  let aggs = rb_exprs_to_exprs(aggs)?;
18
22
  Ok(lgb.agg(aggs).into())
19
23
  }
24
+
25
+ pub fn head(&self, n: usize) -> RbLazyFrame {
26
+ let lgb = self.lgb.take().unwrap();
27
+ lgb.head(Some(n)).into()
28
+ }
29
+
30
+ pub fn tail(&self, n: usize) -> RbLazyFrame {
31
+ let lgb = self.lgb.take().unwrap();
32
+ lgb.tail(Some(n)).into()
33
+ }
20
34
  }
21
35
 
22
36
  #[magnus::wrap(class = "Polars::RbLazyFrame")]
@@ -25,6 +39,13 @@ pub struct RbLazyFrame {
25
39
  pub ldf: LazyFrame,
26
40
  }
27
41
 
42
+ impl RbLazyFrame {
43
+ fn get_schema(&self) -> RbResult<SchemaRef> {
44
+ let schema = self.ldf.schema().map_err(RbPolarsErr::from)?;
45
+ Ok(schema)
46
+ }
47
+ }
48
+
28
49
  impl From<LazyFrame> for RbLazyFrame {
29
50
  fn from(ldf: LazyFrame) -> Self {
30
51
  RbLazyFrame { ldf }
@@ -32,6 +53,156 @@ impl From<LazyFrame> for RbLazyFrame {
32
53
  }
33
54
 
34
55
  impl RbLazyFrame {
56
+ pub fn new_from_ndjson(
57
+ path: String,
58
+ infer_schema_length: Option<usize>,
59
+ batch_size: Option<usize>,
60
+ n_rows: Option<usize>,
61
+ low_memory: bool,
62
+ rechunk: bool,
63
+ row_count: Option<(String, IdxSize)>,
64
+ ) -> RbResult<Self> {
65
+ let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
66
+
67
+ let lf = LazyJsonLineReader::new(path)
68
+ .with_infer_schema_length(infer_schema_length)
69
+ .with_batch_size(batch_size)
70
+ .with_n_rows(n_rows)
71
+ .low_memory(low_memory)
72
+ .with_rechunk(rechunk)
73
+ .with_row_count(row_count)
74
+ .finish()
75
+ .map_err(RbPolarsErr::from)?;
76
+ Ok(lf.into())
77
+ }
78
+
79
+ pub fn new_from_csv(arguments: &[Value]) -> RbResult<Self> {
80
+ // start arguments
81
+ // this pattern is needed for more than 16
82
+ let path: String = arguments[0].try_convert()?;
83
+ let sep: String = arguments[1].try_convert()?;
84
+ let has_header: bool = arguments[2].try_convert()?;
85
+ let ignore_errors: bool = arguments[3].try_convert()?;
86
+ let skip_rows: usize = arguments[4].try_convert()?;
87
+ let n_rows: Option<usize> = arguments[5].try_convert()?;
88
+ let cache: bool = arguments[6].try_convert()?;
89
+ let overwrite_dtype: Option<Vec<(String, Wrap<DataType>)>> = arguments[7].try_convert()?;
90
+ let low_memory: bool = arguments[8].try_convert()?;
91
+ let comment_char: Option<String> = arguments[9].try_convert()?;
92
+ let quote_char: Option<String> = arguments[10].try_convert()?;
93
+ let null_values: Option<Wrap<NullValues>> = arguments[11].try_convert()?;
94
+ let infer_schema_length: Option<usize> = arguments[12].try_convert()?;
95
+ let with_schema_modify: Option<Value> = arguments[13].try_convert()?;
96
+ let rechunk: bool = arguments[14].try_convert()?;
97
+ let skip_rows_after_header: usize = arguments[15].try_convert()?;
98
+ let encoding: Wrap<CsvEncoding> = arguments[16].try_convert()?;
99
+ let row_count: Option<(String, IdxSize)> = arguments[17].try_convert()?;
100
+ let parse_dates: bool = arguments[18].try_convert()?;
101
+ let eol_char: String = arguments[19].try_convert()?;
102
+ // end arguments
103
+
104
+ let null_values = null_values.map(|w| w.0);
105
+ let comment_char = comment_char.map(|s| s.as_bytes()[0]);
106
+ let quote_char = quote_char.map(|s| s.as_bytes()[0]);
107
+ let delimiter = sep.as_bytes()[0];
108
+ let eol_char = eol_char.as_bytes()[0];
109
+
110
+ let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
111
+
112
+ let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
113
+ let fields = overwrite_dtype
114
+ .into_iter()
115
+ .map(|(name, dtype)| Field::new(&name, dtype.0));
116
+ Schema::from(fields)
117
+ });
118
+ let r = LazyCsvReader::new(path)
119
+ .with_infer_schema_length(infer_schema_length)
120
+ .with_delimiter(delimiter)
121
+ .has_header(has_header)
122
+ .with_ignore_parser_errors(ignore_errors)
123
+ .with_skip_rows(skip_rows)
124
+ .with_n_rows(n_rows)
125
+ .with_cache(cache)
126
+ .with_dtype_overwrite(overwrite_dtype.as_ref())
127
+ .low_memory(low_memory)
128
+ .with_comment_char(comment_char)
129
+ .with_quote_char(quote_char)
130
+ .with_end_of_line_char(eol_char)
131
+ .with_rechunk(rechunk)
132
+ .with_skip_rows_after_header(skip_rows_after_header)
133
+ .with_encoding(encoding.0)
134
+ .with_row_count(row_count)
135
+ .with_parse_dates(parse_dates)
136
+ .with_null_values(null_values);
137
+
138
+ if let Some(_lambda) = with_schema_modify {
139
+ todo!();
140
+ }
141
+
142
+ Ok(r.finish().map_err(RbPolarsErr::from)?.into())
143
+ }
144
+
145
+ pub fn new_from_parquet(
146
+ path: String,
147
+ n_rows: Option<usize>,
148
+ cache: bool,
149
+ parallel: Wrap<ParallelStrategy>,
150
+ rechunk: bool,
151
+ row_count: Option<(String, IdxSize)>,
152
+ low_memory: bool,
153
+ ) -> RbResult<Self> {
154
+ let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
155
+ let args = ScanArgsParquet {
156
+ n_rows,
157
+ cache,
158
+ parallel: parallel.0,
159
+ rechunk,
160
+ row_count,
161
+ low_memory,
162
+ };
163
+ let lf = LazyFrame::scan_parquet(path, args).map_err(RbPolarsErr::from)?;
164
+ Ok(lf.into())
165
+ }
166
+
167
+ pub fn new_from_ipc(
168
+ path: String,
169
+ n_rows: Option<usize>,
170
+ cache: bool,
171
+ rechunk: bool,
172
+ row_count: Option<(String, IdxSize)>,
173
+ memory_map: bool,
174
+ ) -> RbResult<Self> {
175
+ let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
176
+ let args = ScanArgsIpc {
177
+ n_rows,
178
+ cache,
179
+ rechunk,
180
+ row_count,
181
+ memmap: memory_map,
182
+ };
183
+ let lf = LazyFrame::scan_ipc(path, args).map_err(RbPolarsErr::from)?;
184
+ Ok(lf.into())
185
+ }
186
+
187
+ pub fn write_json(&self, rb_f: Value) -> RbResult<()> {
188
+ let file = BufWriter::new(get_file_like(rb_f, true)?);
189
+ serde_json::to_writer(file, &self.ldf.logical_plan)
190
+ .map_err(|err| RbValueError::new_err(format!("{:?}", err)))?;
191
+ Ok(())
192
+ }
193
+
194
+ pub fn describe_plan(&self) -> String {
195
+ self.ldf.describe_plan()
196
+ }
197
+
198
+ pub fn describe_optimized_plan(&self) -> RbResult<String> {
199
+ let result = self
200
+ .ldf
201
+ .describe_optimized_plan()
202
+ .map_err(RbPolarsErr::from)?;
203
+ Ok(result)
204
+ }
205
+
35
206
  #[allow(clippy::too_many_arguments)]
36
207
  pub fn optimization_toggle(
37
208
  &self,
@@ -55,18 +226,52 @@ impl RbLazyFrame {
55
226
  ldf.into()
56
227
  }
57
228
 
229
+ pub fn sort(&self, by_column: String, reverse: bool, nulls_last: bool) -> Self {
230
+ let ldf = self.ldf.clone();
231
+ ldf.sort(
232
+ &by_column,
233
+ SortOptions {
234
+ descending: reverse,
235
+ nulls_last,
236
+ },
237
+ )
238
+ .into()
239
+ }
240
+
241
+ pub fn sort_by_exprs(
242
+ &self,
243
+ by_column: RArray,
244
+ reverse: Vec<bool>,
245
+ nulls_last: bool,
246
+ ) -> RbResult<Self> {
247
+ let ldf = self.ldf.clone();
248
+ let exprs = rb_exprs_to_exprs(by_column)?;
249
+ Ok(ldf.sort_by_exprs(exprs, reverse, nulls_last).into())
250
+ }
251
+
252
+ pub fn cache(&self) -> Self {
253
+ let ldf = self.ldf.clone();
254
+ ldf.cache().into()
255
+ }
256
+
58
257
  pub fn collect(&self) -> RbResult<RbDataFrame> {
59
258
  let ldf = self.ldf.clone();
60
259
  let df = ldf.collect().map_err(RbPolarsErr::from)?;
61
260
  Ok(df.into())
62
261
  }
63
262
 
64
- pub fn filter(&self, predicate: &RbExpr) -> RbLazyFrame {
263
+ pub fn fetch(&self, n_rows: usize) -> RbResult<RbDataFrame> {
264
+ let ldf = self.ldf.clone();
265
+ let df = ldf.fetch(n_rows).map_err(RbPolarsErr::from)?;
266
+ Ok(df.into())
267
+ }
268
+
269
+ pub fn filter(&self, predicate: &RbExpr) -> Self {
65
270
  let ldf = self.ldf.clone();
66
271
  ldf.filter(predicate.inner.clone()).into()
67
272
  }
68
273
 
69
- pub fn select(&self, exprs: RArray) -> RbResult<RbLazyFrame> {
274
+ pub fn select(&self, exprs: RArray) -> RbResult<Self> {
70
275
  let ldf = self.ldf.clone();
71
276
  let exprs = rb_exprs_to_exprs(exprs)?;
72
277
  Ok(ldf.select(exprs).into())
@@ -85,6 +290,65 @@ impl RbLazyFrame {
85
290
  })
86
291
  }
87
292
 
293
+ pub fn groupby_rolling(
294
+ &self,
295
+ index_column: String,
296
+ period: String,
297
+ offset: String,
298
+ closed: Wrap<ClosedWindow>,
299
+ by: RArray,
300
+ ) -> RbResult<RbLazyGroupBy> {
301
+ let closed_window = closed.0;
302
+ let ldf = self.ldf.clone();
303
+ let by = rb_exprs_to_exprs(by)?;
304
+ let lazy_gb = ldf.groupby_rolling(
305
+ by,
306
+ RollingGroupOptions {
307
+ index_column,
308
+ period: Duration::parse(&period),
309
+ offset: Duration::parse(&offset),
310
+ closed_window,
311
+ },
312
+ );
313
+
314
+ Ok(RbLazyGroupBy {
315
+ lgb: RefCell::new(Some(lazy_gb)),
316
+ })
317
+ }
318
+
319
+ #[allow(clippy::too_many_arguments)]
320
+ pub fn groupby_dynamic(
321
+ &self,
322
+ index_column: String,
323
+ every: String,
324
+ period: String,
325
+ offset: String,
326
+ truncate: bool,
327
+ include_boundaries: bool,
328
+ closed: Wrap<ClosedWindow>,
329
+ by: RArray,
330
+ ) -> RbResult<RbLazyGroupBy> {
331
+ let closed_window = closed.0;
332
+ let by = rb_exprs_to_exprs(by)?;
333
+ let ldf = self.ldf.clone();
334
+ let lazy_gb = ldf.groupby_dynamic(
335
+ by,
336
+ DynamicGroupOptions {
337
+ index_column,
338
+ every: Duration::parse(&every),
339
+ period: Duration::parse(&period),
340
+ offset: Duration::parse(&offset),
341
+ truncate,
342
+ include_boundaries,
343
+ closed_window,
344
+ },
345
+ );
346
+
347
+ Ok(RbLazyGroupBy {
348
+ lgb: RefCell::new(Some(lazy_gb)),
349
+ })
350
+ }
351
+
88
352
  #[allow(clippy::too_many_arguments)]
89
353
  pub fn join(
90
354
  &self,
@@ -93,11 +357,9 @@ impl RbLazyFrame {
93
357
  right_on: RArray,
94
358
  allow_parallel: bool,
95
359
  force_parallel: bool,
96
- how: String,
360
+ how: Wrap<JoinType>,
97
361
  suffix: String,
98
362
  ) -> RbResult<Self> {
99
- let how = wrap_join_type(&how)?;
100
-
101
363
  let ldf = self.ldf.clone();
102
364
  let other = other.ldf.clone();
103
365
  let left_on = rb_exprs_to_exprs(left_on)?;
@@ -110,14 +372,178 @@ impl RbLazyFrame {
110
372
  .right_on(right_on)
111
373
  .allow_parallel(allow_parallel)
112
374
  .force_parallel(force_parallel)
113
- .how(how)
375
+ .how(how.0)
114
376
  .suffix(suffix)
115
377
  .finish()
116
378
  .into())
117
379
  }
118
380
 
119
- pub fn with_columns(&self, exprs: RArray) -> RbResult<RbLazyFrame> {
381
+ pub fn with_columns(&self, exprs: RArray) -> RbResult<Self> {
120
382
  let ldf = self.ldf.clone();
121
383
  Ok(ldf.with_columns(rb_exprs_to_exprs(exprs)?).into())
122
384
  }
385
+
386
+ pub fn rename(&self, existing: Vec<String>, new: Vec<String>) -> Self {
387
+ let ldf = self.ldf.clone();
388
+ ldf.rename(existing, new).into()
389
+ }
390
+
391
+ pub fn reverse(&self) -> Self {
392
+ let ldf = self.ldf.clone();
393
+ ldf.reverse().into()
394
+ }
395
+
396
+ pub fn shift(&self, periods: i64) -> Self {
397
+ let ldf = self.ldf.clone();
398
+ ldf.shift(periods).into()
399
+ }
400
+
401
+ pub fn shift_and_fill(&self, periods: i64, fill_value: &RbExpr) -> Self {
402
+ let ldf = self.ldf.clone();
403
+ ldf.shift_and_fill(periods, fill_value.inner.clone()).into()
404
+ }
405
+
406
+ pub fn fill_nan(&self, fill_value: &RbExpr) -> Self {
407
+ let ldf = self.ldf.clone();
408
+ ldf.fill_nan(fill_value.inner.clone()).into()
409
+ }
410
+
411
+ pub fn min(&self) -> Self {
412
+ let ldf = self.ldf.clone();
413
+ ldf.min().into()
414
+ }
415
+
416
+ pub fn max(&self) -> Self {
417
+ let ldf = self.ldf.clone();
418
+ ldf.max().into()
419
+ }
420
+
421
+ pub fn sum(&self) -> Self {
422
+ let ldf = self.ldf.clone();
423
+ ldf.sum().into()
424
+ }
425
+
426
+ pub fn mean(&self) -> Self {
427
+ let ldf = self.ldf.clone();
428
+ ldf.mean().into()
429
+ }
430
+
431
+ pub fn std(&self, ddof: u8) -> Self {
432
+ let ldf = self.ldf.clone();
433
+ ldf.std(ddof).into()
434
+ }
435
+
436
+ pub fn var(&self, ddof: u8) -> Self {
437
+ let ldf = self.ldf.clone();
438
+ ldf.var(ddof).into()
439
+ }
440
+
441
+ pub fn median(&self) -> Self {
442
+ let ldf = self.ldf.clone();
443
+ ldf.median().into()
444
+ }
445
+
446
+ pub fn quantile(&self, quantile: f64, interpolation: Wrap<QuantileInterpolOptions>) -> Self {
447
+ let ldf = self.ldf.clone();
448
+ ldf.quantile(quantile, interpolation.0).into()
449
+ }
450
+
451
+ pub fn explode(&self, column: RArray) -> RbResult<Self> {
452
+ let ldf = self.ldf.clone();
453
+ let column = rb_exprs_to_exprs(column)?;
454
+ Ok(ldf.explode(column).into())
455
+ }
456
+
457
+ pub fn unique(
458
+ &self,
459
+ maintain_order: bool,
460
+ subset: Option<Vec<String>>,
461
+ keep: Wrap<UniqueKeepStrategy>,
462
+ ) -> RbResult<Self> {
463
+ let ldf = self.ldf.clone();
464
+ Ok(match maintain_order {
465
+ true => ldf.unique_stable(subset, keep.0),
466
+ false => ldf.unique(subset, keep.0),
467
+ }
468
+ .into())
469
+ }
470
+
471
+ pub fn drop_nulls(&self, subset: Option<Vec<String>>) -> Self {
472
+ let ldf = self.ldf.clone();
473
+ ldf.drop_nulls(subset.map(|v| v.into_iter().map(|s| col(&s)).collect()))
474
+ .into()
475
+ }
476
+
477
+ pub fn slice(&self, offset: i64, len: Option<IdxSize>) -> Self {
478
+ let ldf = self.ldf.clone();
479
+ ldf.slice(offset, len.unwrap_or(IdxSize::MAX)).into()
480
+ }
481
+
482
+ pub fn tail(&self, n: IdxSize) -> Self {
483
+ let ldf = self.ldf.clone();
484
+ ldf.tail(n).into()
485
+ }
486
+
487
+ pub fn melt(
488
+ &self,
489
+ id_vars: Vec<String>,
490
+ value_vars: Vec<String>,
491
+ value_name: Option<String>,
492
+ variable_name: Option<String>,
493
+ ) -> Self {
494
+ let args = MeltArgs {
495
+ id_vars,
496
+ value_vars,
497
+ value_name,
498
+ variable_name,
499
+ };
500
+
501
+ let ldf = self.ldf.clone();
502
+ ldf.melt(args).into()
503
+ }
504
+
505
+ pub fn with_row_count(&self, name: String, offset: Option<IdxSize>) -> Self {
506
+ let ldf = self.ldf.clone();
507
+ ldf.with_row_count(&name, offset).into()
508
+ }
509
+
510
+ pub fn drop_columns(&self, cols: Vec<String>) -> Self {
511
+ let ldf = self.ldf.clone();
512
+ ldf.drop_columns(cols).into()
513
+ }
514
+
515
+ pub fn clone(&self) -> Self {
516
+ self.ldf.clone().into()
517
+ }
518
+
519
+ pub fn columns(&self) -> RbResult<Vec<String>> {
520
+ Ok(self.get_schema()?.iter_names().cloned().collect())
521
+ }
522
+
523
+ pub fn dtypes(&self) -> RbResult<Vec<String>> {
524
+ let schema = self.get_schema()?;
525
+ let iter = schema.iter_dtypes().map(|dt| dt.to_string());
526
+ Ok(iter.collect())
527
+ }
528
+
529
+ pub fn schema(&self) -> RbResult<RHash> {
530
+ let schema = self.get_schema()?;
531
+ let schema_dict = RHash::new();
532
+
533
+ schema.iter_fields().for_each(|fld| {
534
+ // TODO remove unwrap
535
+ schema_dict
536
+ .aset(fld.name().clone(), fld.data_type().to_string())
537
+ .unwrap();
538
+ });
539
+ Ok(schema_dict)
540
+ }
541
+
542
+ pub fn unnest(&self, cols: Vec<String>) -> Self {
543
+ self.ldf.clone().unnest(cols).into()
544
+ }
545
+
546
+ pub fn width(&self) -> RbResult<usize> {
547
+ Ok(self.get_schema()?.len())
548
+ }
123
549
  }