polars-df 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,14 @@
1
- use magnus::RArray;
1
+ use magnus::{RArray, RHash, Value};
2
+ use polars::io::RowCount;
2
3
  use polars::lazy::frame::{LazyFrame, LazyGroupBy};
4
+ use polars::prelude::*;
3
5
  use std::cell::RefCell;
6
+ use std::io::BufWriter;
4
7
 
5
- use crate::conversion::wrap_join_type;
8
+ use crate::conversion::*;
9
+ use crate::file::get_file_like;
6
10
  use crate::lazy::utils::rb_exprs_to_exprs;
7
- use crate::{RbDataFrame, RbExpr, RbPolarsErr, RbResult};
11
+ use crate::{RbDataFrame, RbExpr, RbPolarsErr, RbResult, RbValueError};
8
12
 
9
13
  #[magnus::wrap(class = "Polars::RbLazyGroupBy")]
10
14
  pub struct RbLazyGroupBy {
@@ -17,6 +21,16 @@ impl RbLazyGroupBy {
17
21
  let aggs = rb_exprs_to_exprs(aggs)?;
18
22
  Ok(lgb.agg(aggs).into())
19
23
  }
24
+
25
+ pub fn head(&self, n: usize) -> RbLazyFrame {
26
+ let lgb = self.lgb.take().unwrap();
27
+ lgb.head(Some(n)).into()
28
+ }
29
+
30
+ pub fn tail(&self, n: usize) -> RbLazyFrame {
31
+ let lgb = self.lgb.take().unwrap();
32
+ lgb.tail(Some(n)).into()
33
+ }
20
34
  }
21
35
 
22
36
  #[magnus::wrap(class = "Polars::RbLazyFrame")]
@@ -25,6 +39,13 @@ pub struct RbLazyFrame {
25
39
  pub ldf: LazyFrame,
26
40
  }
27
41
 
42
+ impl RbLazyFrame {
43
+ fn get_schema(&self) -> RbResult<SchemaRef> {
44
+ let schema = self.ldf.schema().map_err(RbPolarsErr::from)?;
45
+ Ok(schema)
46
+ }
47
+ }
48
+
28
49
  impl From<LazyFrame> for RbLazyFrame {
29
50
  fn from(ldf: LazyFrame) -> Self {
30
51
  RbLazyFrame { ldf }
@@ -32,6 +53,156 @@ impl From<LazyFrame> for RbLazyFrame {
32
53
  }
33
54
 
34
55
  impl RbLazyFrame {
56
+ pub fn new_from_ndjson(
57
+ path: String,
58
+ infer_schema_length: Option<usize>,
59
+ batch_size: Option<usize>,
60
+ n_rows: Option<usize>,
61
+ low_memory: bool,
62
+ rechunk: bool,
63
+ row_count: Option<(String, IdxSize)>,
64
+ ) -> RbResult<Self> {
65
+ let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
66
+
67
+ let lf = LazyJsonLineReader::new(path)
68
+ .with_infer_schema_length(infer_schema_length)
69
+ .with_batch_size(batch_size)
70
+ .with_n_rows(n_rows)
71
+ .low_memory(low_memory)
72
+ .with_rechunk(rechunk)
73
+ .with_row_count(row_count)
74
+ .finish()
75
+ .map_err(RbPolarsErr::from)?;
76
+ Ok(lf.into())
77
+ }
78
+
79
+ pub fn new_from_csv(arguments: &[Value]) -> RbResult<Self> {
80
+ // start arguments
81
+ // this pattern is needed for more than 16
82
+ let path: String = arguments[0].try_convert()?;
83
+ let sep: String = arguments[1].try_convert()?;
84
+ let has_header: bool = arguments[2].try_convert()?;
85
+ let ignore_errors: bool = arguments[3].try_convert()?;
86
+ let skip_rows: usize = arguments[4].try_convert()?;
87
+ let n_rows: Option<usize> = arguments[5].try_convert()?;
88
+ let cache: bool = arguments[6].try_convert()?;
89
+ let overwrite_dtype: Option<Vec<(String, Wrap<DataType>)>> = arguments[7].try_convert()?;
90
+ let low_memory: bool = arguments[8].try_convert()?;
91
+ let comment_char: Option<String> = arguments[9].try_convert()?;
92
+ let quote_char: Option<String> = arguments[10].try_convert()?;
93
+ let null_values: Option<Wrap<NullValues>> = arguments[11].try_convert()?;
94
+ let infer_schema_length: Option<usize> = arguments[12].try_convert()?;
95
+ let with_schema_modify: Option<Value> = arguments[13].try_convert()?;
96
+ let rechunk: bool = arguments[14].try_convert()?;
97
+ let skip_rows_after_header: usize = arguments[15].try_convert()?;
98
+ let encoding: Wrap<CsvEncoding> = arguments[16].try_convert()?;
99
+ let row_count: Option<(String, IdxSize)> = arguments[17].try_convert()?;
100
+ let parse_dates: bool = arguments[18].try_convert()?;
101
+ let eol_char: String = arguments[19].try_convert()?;
102
+ // end arguments
103
+
104
+ let null_values = null_values.map(|w| w.0);
105
+ let comment_char = comment_char.map(|s| s.as_bytes()[0]);
106
+ let quote_char = quote_char.map(|s| s.as_bytes()[0]);
107
+ let delimiter = sep.as_bytes()[0];
108
+ let eol_char = eol_char.as_bytes()[0];
109
+
110
+ let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
111
+
112
+ let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
113
+ let fields = overwrite_dtype
114
+ .into_iter()
115
+ .map(|(name, dtype)| Field::new(&name, dtype.0));
116
+ Schema::from(fields)
117
+ });
118
+ let r = LazyCsvReader::new(path)
119
+ .with_infer_schema_length(infer_schema_length)
120
+ .with_delimiter(delimiter)
121
+ .has_header(has_header)
122
+ .with_ignore_parser_errors(ignore_errors)
123
+ .with_skip_rows(skip_rows)
124
+ .with_n_rows(n_rows)
125
+ .with_cache(cache)
126
+ .with_dtype_overwrite(overwrite_dtype.as_ref())
127
+ .low_memory(low_memory)
128
+ .with_comment_char(comment_char)
129
+ .with_quote_char(quote_char)
130
+ .with_end_of_line_char(eol_char)
131
+ .with_rechunk(rechunk)
132
+ .with_skip_rows_after_header(skip_rows_after_header)
133
+ .with_encoding(encoding.0)
134
+ .with_row_count(row_count)
135
+ .with_parse_dates(parse_dates)
136
+ .with_null_values(null_values);
137
+
138
+ if let Some(_lambda) = with_schema_modify {
139
+ todo!();
140
+ }
141
+
142
+ Ok(r.finish().map_err(RbPolarsErr::from)?.into())
143
+ }
144
+
145
+ pub fn new_from_parquet(
146
+ path: String,
147
+ n_rows: Option<usize>,
148
+ cache: bool,
149
+ parallel: Wrap<ParallelStrategy>,
150
+ rechunk: bool,
151
+ row_count: Option<(String, IdxSize)>,
152
+ low_memory: bool,
153
+ ) -> RbResult<Self> {
154
+ let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
155
+ let args = ScanArgsParquet {
156
+ n_rows,
157
+ cache,
158
+ parallel: parallel.0,
159
+ rechunk,
160
+ row_count,
161
+ low_memory,
162
+ };
163
+ let lf = LazyFrame::scan_parquet(path, args).map_err(RbPolarsErr::from)?;
164
+ Ok(lf.into())
165
+ }
166
+
167
+ pub fn new_from_ipc(
168
+ path: String,
169
+ n_rows: Option<usize>,
170
+ cache: bool,
171
+ rechunk: bool,
172
+ row_count: Option<(String, IdxSize)>,
173
+ memory_map: bool,
174
+ ) -> RbResult<Self> {
175
+ let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
176
+ let args = ScanArgsIpc {
177
+ n_rows,
178
+ cache,
179
+ rechunk,
180
+ row_count,
181
+ memmap: memory_map,
182
+ };
183
+ let lf = LazyFrame::scan_ipc(path, args).map_err(RbPolarsErr::from)?;
184
+ Ok(lf.into())
185
+ }
186
+
187
+ pub fn write_json(&self, rb_f: Value) -> RbResult<()> {
188
+ let file = BufWriter::new(get_file_like(rb_f, true)?);
189
+ serde_json::to_writer(file, &self.ldf.logical_plan)
190
+ .map_err(|err| RbValueError::new_err(format!("{:?}", err)))?;
191
+ Ok(())
192
+ }
193
+
194
+ pub fn describe_plan(&self) -> String {
195
+ self.ldf.describe_plan()
196
+ }
197
+
198
+ pub fn describe_optimized_plan(&self) -> RbResult<String> {
199
+ let result = self
200
+ .ldf
201
+ .describe_optimized_plan()
202
+ .map_err(RbPolarsErr::from)?;
203
+ Ok(result)
204
+ }
205
+
35
206
  #[allow(clippy::too_many_arguments)]
36
207
  pub fn optimization_toggle(
37
208
  &self,
@@ -55,18 +226,52 @@ impl RbLazyFrame {
55
226
  ldf.into()
56
227
  }
57
228
 
229
+ pub fn sort(&self, by_column: String, reverse: bool, nulls_last: bool) -> Self {
230
+ let ldf = self.ldf.clone();
231
+ ldf.sort(
232
+ &by_column,
233
+ SortOptions {
234
+ descending: reverse,
235
+ nulls_last,
236
+ },
237
+ )
238
+ .into()
239
+ }
240
+
241
+ pub fn sort_by_exprs(
242
+ &self,
243
+ by_column: RArray,
244
+ reverse: Vec<bool>,
245
+ nulls_last: bool,
246
+ ) -> RbResult<Self> {
247
+ let ldf = self.ldf.clone();
248
+ let exprs = rb_exprs_to_exprs(by_column)?;
249
+ Ok(ldf.sort_by_exprs(exprs, reverse, nulls_last).into())
250
+ }
251
+
252
+ pub fn cache(&self) -> Self {
253
+ let ldf = self.ldf.clone();
254
+ ldf.cache().into()
255
+ }
256
+
58
257
  pub fn collect(&self) -> RbResult<RbDataFrame> {
59
258
  let ldf = self.ldf.clone();
60
259
  let df = ldf.collect().map_err(RbPolarsErr::from)?;
61
260
  Ok(df.into())
62
261
  }
63
262
 
64
- pub fn filter(&self, predicate: &RbExpr) -> RbLazyFrame {
263
+ pub fn fetch(&self, n_rows: usize) -> RbResult<RbDataFrame> {
264
+ let ldf = self.ldf.clone();
265
+ let df = ldf.fetch(n_rows).map_err(RbPolarsErr::from)?;
266
+ Ok(df.into())
267
+ }
268
+
269
+ pub fn filter(&self, predicate: &RbExpr) -> Self {
65
270
  let ldf = self.ldf.clone();
66
271
  ldf.filter(predicate.inner.clone()).into()
67
272
  }
68
273
 
69
- pub fn select(&self, exprs: RArray) -> RbResult<RbLazyFrame> {
274
+ pub fn select(&self, exprs: RArray) -> RbResult<Self> {
70
275
  let ldf = self.ldf.clone();
71
276
  let exprs = rb_exprs_to_exprs(exprs)?;
72
277
  Ok(ldf.select(exprs).into())
@@ -85,6 +290,65 @@ impl RbLazyFrame {
85
290
  })
86
291
  }
87
292
 
293
+ pub fn groupby_rolling(
294
+ &self,
295
+ index_column: String,
296
+ period: String,
297
+ offset: String,
298
+ closed: Wrap<ClosedWindow>,
299
+ by: RArray,
300
+ ) -> RbResult<RbLazyGroupBy> {
301
+ let closed_window = closed.0;
302
+ let ldf = self.ldf.clone();
303
+ let by = rb_exprs_to_exprs(by)?;
304
+ let lazy_gb = ldf.groupby_rolling(
305
+ by,
306
+ RollingGroupOptions {
307
+ index_column,
308
+ period: Duration::parse(&period),
309
+ offset: Duration::parse(&offset),
310
+ closed_window,
311
+ },
312
+ );
313
+
314
+ Ok(RbLazyGroupBy {
315
+ lgb: RefCell::new(Some(lazy_gb)),
316
+ })
317
+ }
318
+
319
+ #[allow(clippy::too_many_arguments)]
320
+ pub fn groupby_dynamic(
321
+ &self,
322
+ index_column: String,
323
+ every: String,
324
+ period: String,
325
+ offset: String,
326
+ truncate: bool,
327
+ include_boundaries: bool,
328
+ closed: Wrap<ClosedWindow>,
329
+ by: RArray,
330
+ ) -> RbResult<RbLazyGroupBy> {
331
+ let closed_window = closed.0;
332
+ let by = rb_exprs_to_exprs(by)?;
333
+ let ldf = self.ldf.clone();
334
+ let lazy_gb = ldf.groupby_dynamic(
335
+ by,
336
+ DynamicGroupOptions {
337
+ index_column,
338
+ every: Duration::parse(&every),
339
+ period: Duration::parse(&period),
340
+ offset: Duration::parse(&offset),
341
+ truncate,
342
+ include_boundaries,
343
+ closed_window,
344
+ },
345
+ );
346
+
347
+ Ok(RbLazyGroupBy {
348
+ lgb: RefCell::new(Some(lazy_gb)),
349
+ })
350
+ }
351
+
88
352
  #[allow(clippy::too_many_arguments)]
89
353
  pub fn join(
90
354
  &self,
@@ -93,11 +357,9 @@ impl RbLazyFrame {
93
357
  right_on: RArray,
94
358
  allow_parallel: bool,
95
359
  force_parallel: bool,
96
- how: String,
360
+ how: Wrap<JoinType>,
97
361
  suffix: String,
98
362
  ) -> RbResult<Self> {
99
- let how = wrap_join_type(&how)?;
100
-
101
363
  let ldf = self.ldf.clone();
102
364
  let other = other.ldf.clone();
103
365
  let left_on = rb_exprs_to_exprs(left_on)?;
@@ -110,14 +372,178 @@ impl RbLazyFrame {
110
372
  .right_on(right_on)
111
373
  .allow_parallel(allow_parallel)
112
374
  .force_parallel(force_parallel)
113
- .how(how)
375
+ .how(how.0)
114
376
  .suffix(suffix)
115
377
  .finish()
116
378
  .into())
117
379
  }
118
380
 
119
- pub fn with_columns(&self, exprs: RArray) -> RbResult<RbLazyFrame> {
381
+ pub fn with_columns(&self, exprs: RArray) -> RbResult<Self> {
120
382
  let ldf = self.ldf.clone();
121
383
  Ok(ldf.with_columns(rb_exprs_to_exprs(exprs)?).into())
122
384
  }
385
+
386
+ pub fn rename(&self, existing: Vec<String>, new: Vec<String>) -> Self {
387
+ let ldf = self.ldf.clone();
388
+ ldf.rename(existing, new).into()
389
+ }
390
+
391
+ pub fn reverse(&self) -> Self {
392
+ let ldf = self.ldf.clone();
393
+ ldf.reverse().into()
394
+ }
395
+
396
+ pub fn shift(&self, periods: i64) -> Self {
397
+ let ldf = self.ldf.clone();
398
+ ldf.shift(periods).into()
399
+ }
400
+
401
+ pub fn shift_and_fill(&self, periods: i64, fill_value: &RbExpr) -> Self {
402
+ let ldf = self.ldf.clone();
403
+ ldf.shift_and_fill(periods, fill_value.inner.clone()).into()
404
+ }
405
+
406
+ pub fn fill_nan(&self, fill_value: &RbExpr) -> Self {
407
+ let ldf = self.ldf.clone();
408
+ ldf.fill_nan(fill_value.inner.clone()).into()
409
+ }
410
+
411
+ pub fn min(&self) -> Self {
412
+ let ldf = self.ldf.clone();
413
+ ldf.min().into()
414
+ }
415
+
416
+ pub fn max(&self) -> Self {
417
+ let ldf = self.ldf.clone();
418
+ ldf.max().into()
419
+ }
420
+
421
+ pub fn sum(&self) -> Self {
422
+ let ldf = self.ldf.clone();
423
+ ldf.sum().into()
424
+ }
425
+
426
+ pub fn mean(&self) -> Self {
427
+ let ldf = self.ldf.clone();
428
+ ldf.mean().into()
429
+ }
430
+
431
+ pub fn std(&self, ddof: u8) -> Self {
432
+ let ldf = self.ldf.clone();
433
+ ldf.std(ddof).into()
434
+ }
435
+
436
+ pub fn var(&self, ddof: u8) -> Self {
437
+ let ldf = self.ldf.clone();
438
+ ldf.var(ddof).into()
439
+ }
440
+
441
+ pub fn median(&self) -> Self {
442
+ let ldf = self.ldf.clone();
443
+ ldf.median().into()
444
+ }
445
+
446
+ pub fn quantile(&self, quantile: f64, interpolation: Wrap<QuantileInterpolOptions>) -> Self {
447
+ let ldf = self.ldf.clone();
448
+ ldf.quantile(quantile, interpolation.0).into()
449
+ }
450
+
451
+ pub fn explode(&self, column: RArray) -> RbResult<Self> {
452
+ let ldf = self.ldf.clone();
453
+ let column = rb_exprs_to_exprs(column)?;
454
+ Ok(ldf.explode(column).into())
455
+ }
456
+
457
+ pub fn unique(
458
+ &self,
459
+ maintain_order: bool,
460
+ subset: Option<Vec<String>>,
461
+ keep: Wrap<UniqueKeepStrategy>,
462
+ ) -> RbResult<Self> {
463
+ let ldf = self.ldf.clone();
464
+ Ok(match maintain_order {
465
+ true => ldf.unique_stable(subset, keep.0),
466
+ false => ldf.unique(subset, keep.0),
467
+ }
468
+ .into())
469
+ }
470
+
471
+ pub fn drop_nulls(&self, subset: Option<Vec<String>>) -> Self {
472
+ let ldf = self.ldf.clone();
473
+ ldf.drop_nulls(subset.map(|v| v.into_iter().map(|s| col(&s)).collect()))
474
+ .into()
475
+ }
476
+
477
+ pub fn slice(&self, offset: i64, len: Option<IdxSize>) -> Self {
478
+ let ldf = self.ldf.clone();
479
+ ldf.slice(offset, len.unwrap_or(IdxSize::MAX)).into()
480
+ }
481
+
482
+ pub fn tail(&self, n: IdxSize) -> Self {
483
+ let ldf = self.ldf.clone();
484
+ ldf.tail(n).into()
485
+ }
486
+
487
+ pub fn melt(
488
+ &self,
489
+ id_vars: Vec<String>,
490
+ value_vars: Vec<String>,
491
+ value_name: Option<String>,
492
+ variable_name: Option<String>,
493
+ ) -> Self {
494
+ let args = MeltArgs {
495
+ id_vars,
496
+ value_vars,
497
+ value_name,
498
+ variable_name,
499
+ };
500
+
501
+ let ldf = self.ldf.clone();
502
+ ldf.melt(args).into()
503
+ }
504
+
505
+ pub fn with_row_count(&self, name: String, offset: Option<IdxSize>) -> Self {
506
+ let ldf = self.ldf.clone();
507
+ ldf.with_row_count(&name, offset).into()
508
+ }
509
+
510
+ pub fn drop_columns(&self, cols: Vec<String>) -> Self {
511
+ let ldf = self.ldf.clone();
512
+ ldf.drop_columns(cols).into()
513
+ }
514
+
515
+ pub fn clone(&self) -> Self {
516
+ self.ldf.clone().into()
517
+ }
518
+
519
+ pub fn columns(&self) -> RbResult<Vec<String>> {
520
+ Ok(self.get_schema()?.iter_names().cloned().collect())
521
+ }
522
+
523
+ pub fn dtypes(&self) -> RbResult<Vec<String>> {
524
+ let schema = self.get_schema()?;
525
+ let iter = schema.iter_dtypes().map(|dt| dt.to_string());
526
+ Ok(iter.collect())
527
+ }
528
+
529
+ pub fn schema(&self) -> RbResult<RHash> {
530
+ let schema = self.get_schema()?;
531
+ let schema_dict = RHash::new();
532
+
533
+ schema.iter_fields().for_each(|fld| {
534
+ // TODO remove unwrap
535
+ schema_dict
536
+ .aset(fld.name().clone(), fld.data_type().to_string())
537
+ .unwrap();
538
+ });
539
+ Ok(schema_dict)
540
+ }
541
+
542
+ pub fn unnest(&self, cols: Vec<String>) -> Self {
543
+ self.ldf.clone().unnest(cols).into()
544
+ }
545
+
546
+ pub fn width(&self) -> RbResult<usize> {
547
+ Ok(self.get_schema()?.len())
548
+ }
123
549
  }