polars-df 0.1.0 → 0.1.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
@@ -1,10 +1,13 @@
1
- use magnus::RArray;
1
+ use magnus::{RArray, RHash, Value};
2
2
  use polars::lazy::frame::{LazyFrame, LazyGroupBy};
3
+ use polars::prelude::*;
3
4
  use std::cell::RefCell;
5
+ use std::io::BufWriter;
4
6
 
5
- use crate::conversion::wrap_join_type;
7
+ use crate::conversion::*;
8
+ use crate::file::get_file_like;
6
9
  use crate::lazy::utils::rb_exprs_to_exprs;
7
- use crate::{RbDataFrame, RbExpr, RbPolarsErr, RbResult};
10
+ use crate::{RbDataFrame, RbExpr, RbPolarsErr, RbResult, RbValueError};
8
11
 
9
12
  #[magnus::wrap(class = "Polars::RbLazyGroupBy")]
10
13
  pub struct RbLazyGroupBy {
@@ -17,6 +20,16 @@ impl RbLazyGroupBy {
17
20
  let aggs = rb_exprs_to_exprs(aggs)?;
18
21
  Ok(lgb.agg(aggs).into())
19
22
  }
23
+
24
+ pub fn head(&self, n: usize) -> RbLazyFrame {
25
+ let lgb = self.lgb.take().unwrap();
26
+ lgb.head(Some(n)).into()
27
+ }
28
+
29
+ pub fn tail(&self, n: usize) -> RbLazyFrame {
30
+ let lgb = self.lgb.take().unwrap();
31
+ lgb.tail(Some(n)).into()
32
+ }
20
33
  }
21
34
 
22
35
  #[magnus::wrap(class = "Polars::RbLazyFrame")]
@@ -25,6 +38,13 @@ pub struct RbLazyFrame {
25
38
  pub ldf: LazyFrame,
26
39
  }
27
40
 
41
+ impl RbLazyFrame {
42
+ fn get_schema(&self) -> RbResult<SchemaRef> {
43
+ let schema = self.ldf.schema().map_err(RbPolarsErr::from)?;
44
+ Ok(schema)
45
+ }
46
+ }
47
+
28
48
  impl From<LazyFrame> for RbLazyFrame {
29
49
  fn from(ldf: LazyFrame) -> Self {
30
50
  RbLazyFrame { ldf }
@@ -32,6 +52,25 @@ impl From<LazyFrame> for RbLazyFrame {
32
52
  }
33
53
 
34
54
  impl RbLazyFrame {
55
+ pub fn write_json(&self, rb_f: Value) -> RbResult<()> {
56
+ let file = BufWriter::new(get_file_like(rb_f, true)?);
57
+ serde_json::to_writer(file, &self.ldf.logical_plan)
58
+ .map_err(|err| RbValueError::new_err(format!("{:?}", err)))?;
59
+ Ok(())
60
+ }
61
+
62
+ pub fn describe_plan(&self) -> String {
63
+ self.ldf.describe_plan()
64
+ }
65
+
66
+ pub fn describe_optimized_plan(&self) -> RbResult<String> {
67
+ let result = self
68
+ .ldf
69
+ .describe_optimized_plan()
70
+ .map_err(RbPolarsErr::from)?;
71
+ Ok(result)
72
+ }
73
+
35
74
  #[allow(clippy::too_many_arguments)]
36
75
  pub fn optimization_toggle(
37
76
  &self,
@@ -55,18 +94,52 @@ impl RbLazyFrame {
55
94
  ldf.into()
56
95
  }
57
96
 
97
+ pub fn sort(&self, by_column: String, reverse: bool, nulls_last: bool) -> Self {
98
+ let ldf = self.ldf.clone();
99
+ ldf.sort(
100
+ &by_column,
101
+ SortOptions {
102
+ descending: reverse,
103
+ nulls_last,
104
+ },
105
+ )
106
+ .into()
107
+ }
108
+
109
+ pub fn sort_by_exprs(
110
+ &self,
111
+ by_column: RArray,
112
+ reverse: Vec<bool>,
113
+ nulls_last: bool,
114
+ ) -> RbResult<Self> {
115
+ let ldf = self.ldf.clone();
116
+ let exprs = rb_exprs_to_exprs(by_column)?;
117
+ Ok(ldf.sort_by_exprs(exprs, reverse, nulls_last).into())
118
+ }
119
+
120
+ pub fn cache(&self) -> Self {
121
+ let ldf = self.ldf.clone();
122
+ ldf.cache().into()
123
+ }
124
+
58
125
  pub fn collect(&self) -> RbResult<RbDataFrame> {
59
126
  let ldf = self.ldf.clone();
60
127
  let df = ldf.collect().map_err(RbPolarsErr::from)?;
61
128
  Ok(df.into())
62
129
  }
63
130
 
64
- pub fn filter(&self, predicate: &RbExpr) -> RbLazyFrame {
131
+ pub fn fetch(&self, n_rows: usize) -> RbResult<RbDataFrame> {
132
+ let ldf = self.ldf.clone();
133
+ let df = ldf.fetch(n_rows).map_err(RbPolarsErr::from)?;
134
+ Ok(df.into())
135
+ }
136
+
137
+ pub fn filter(&self, predicate: &RbExpr) -> Self {
65
138
  let ldf = self.ldf.clone();
66
139
  ldf.filter(predicate.inner.clone()).into()
67
140
  }
68
141
 
69
- pub fn select(&self, exprs: RArray) -> RbResult<RbLazyFrame> {
142
+ pub fn select(&self, exprs: RArray) -> RbResult<Self> {
70
143
  let ldf = self.ldf.clone();
71
144
  let exprs = rb_exprs_to_exprs(exprs)?;
72
145
  Ok(ldf.select(exprs).into())
@@ -85,6 +158,65 @@ impl RbLazyFrame {
85
158
  })
86
159
  }
87
160
 
161
+ pub fn groupby_rolling(
162
+ &self,
163
+ index_column: String,
164
+ period: String,
165
+ offset: String,
166
+ closed: Wrap<ClosedWindow>,
167
+ by: RArray,
168
+ ) -> RbResult<RbLazyGroupBy> {
169
+ let closed_window = closed.0;
170
+ let ldf = self.ldf.clone();
171
+ let by = rb_exprs_to_exprs(by)?;
172
+ let lazy_gb = ldf.groupby_rolling(
173
+ by,
174
+ RollingGroupOptions {
175
+ index_column,
176
+ period: Duration::parse(&period),
177
+ offset: Duration::parse(&offset),
178
+ closed_window,
179
+ },
180
+ );
181
+
182
+ Ok(RbLazyGroupBy {
183
+ lgb: RefCell::new(Some(lazy_gb)),
184
+ })
185
+ }
186
+
187
+ #[allow(clippy::too_many_arguments)]
188
+ pub fn groupby_dynamic(
189
+ &self,
190
+ index_column: String,
191
+ every: String,
192
+ period: String,
193
+ offset: String,
194
+ truncate: bool,
195
+ include_boundaries: bool,
196
+ closed: Wrap<ClosedWindow>,
197
+ by: RArray,
198
+ ) -> RbResult<RbLazyGroupBy> {
199
+ let closed_window = closed.0;
200
+ let by = rb_exprs_to_exprs(by)?;
201
+ let ldf = self.ldf.clone();
202
+ let lazy_gb = ldf.groupby_dynamic(
203
+ by,
204
+ DynamicGroupOptions {
205
+ index_column,
206
+ every: Duration::parse(&every),
207
+ period: Duration::parse(&period),
208
+ offset: Duration::parse(&offset),
209
+ truncate,
210
+ include_boundaries,
211
+ closed_window,
212
+ },
213
+ );
214
+
215
+ Ok(RbLazyGroupBy {
216
+ lgb: RefCell::new(Some(lazy_gb)),
217
+ })
218
+ }
219
+
88
220
  #[allow(clippy::too_many_arguments)]
89
221
  pub fn join(
90
222
  &self,
@@ -93,11 +225,9 @@ impl RbLazyFrame {
93
225
  right_on: RArray,
94
226
  allow_parallel: bool,
95
227
  force_parallel: bool,
96
- how: String,
228
+ how: Wrap<JoinType>,
97
229
  suffix: String,
98
230
  ) -> RbResult<Self> {
99
- let how = wrap_join_type(&how)?;
100
-
101
231
  let ldf = self.ldf.clone();
102
232
  let other = other.ldf.clone();
103
233
  let left_on = rb_exprs_to_exprs(left_on)?;
@@ -110,14 +240,178 @@ impl RbLazyFrame {
110
240
  .right_on(right_on)
111
241
  .allow_parallel(allow_parallel)
112
242
  .force_parallel(force_parallel)
113
- .how(how)
243
+ .how(how.0)
114
244
  .suffix(suffix)
115
245
  .finish()
116
246
  .into())
117
247
  }
118
248
 
119
- pub fn with_columns(&self, exprs: RArray) -> RbResult<RbLazyFrame> {
249
+ pub fn with_columns(&self, exprs: RArray) -> RbResult<Self> {
120
250
  let ldf = self.ldf.clone();
121
251
  Ok(ldf.with_columns(rb_exprs_to_exprs(exprs)?).into())
122
252
  }
253
+
254
+ pub fn rename(&self, existing: Vec<String>, new: Vec<String>) -> Self {
255
+ let ldf = self.ldf.clone();
256
+ ldf.rename(existing, new).into()
257
+ }
258
+
259
+ pub fn reverse(&self) -> Self {
260
+ let ldf = self.ldf.clone();
261
+ ldf.reverse().into()
262
+ }
263
+
264
+ pub fn shift(&self, periods: i64) -> Self {
265
+ let ldf = self.ldf.clone();
266
+ ldf.shift(periods).into()
267
+ }
268
+
269
+ pub fn shift_and_fill(&self, periods: i64, fill_value: &RbExpr) -> Self {
270
+ let ldf = self.ldf.clone();
271
+ ldf.shift_and_fill(periods, fill_value.inner.clone()).into()
272
+ }
273
+
274
+ pub fn fill_nan(&self, fill_value: &RbExpr) -> Self {
275
+ let ldf = self.ldf.clone();
276
+ ldf.fill_nan(fill_value.inner.clone()).into()
277
+ }
278
+
279
+ pub fn min(&self) -> Self {
280
+ let ldf = self.ldf.clone();
281
+ ldf.min().into()
282
+ }
283
+
284
+ pub fn max(&self) -> Self {
285
+ let ldf = self.ldf.clone();
286
+ ldf.max().into()
287
+ }
288
+
289
+ pub fn sum(&self) -> Self {
290
+ let ldf = self.ldf.clone();
291
+ ldf.sum().into()
292
+ }
293
+
294
+ pub fn mean(&self) -> Self {
295
+ let ldf = self.ldf.clone();
296
+ ldf.mean().into()
297
+ }
298
+
299
+ pub fn std(&self, ddof: u8) -> Self {
300
+ let ldf = self.ldf.clone();
301
+ ldf.std(ddof).into()
302
+ }
303
+
304
+ pub fn var(&self, ddof: u8) -> Self {
305
+ let ldf = self.ldf.clone();
306
+ ldf.var(ddof).into()
307
+ }
308
+
309
+ pub fn median(&self) -> Self {
310
+ let ldf = self.ldf.clone();
311
+ ldf.median().into()
312
+ }
313
+
314
+ pub fn quantile(&self, quantile: f64, interpolation: Wrap<QuantileInterpolOptions>) -> Self {
315
+ let ldf = self.ldf.clone();
316
+ ldf.quantile(quantile, interpolation.0).into()
317
+ }
318
+
319
+ pub fn explode(&self, column: RArray) -> RbResult<Self> {
320
+ let ldf = self.ldf.clone();
321
+ let column = rb_exprs_to_exprs(column)?;
322
+ Ok(ldf.explode(column).into())
323
+ }
324
+
325
+ pub fn unique(
326
+ &self,
327
+ maintain_order: bool,
328
+ subset: Option<Vec<String>>,
329
+ keep: Wrap<UniqueKeepStrategy>,
330
+ ) -> RbResult<Self> {
331
+ let ldf = self.ldf.clone();
332
+ Ok(match maintain_order {
333
+ true => ldf.unique_stable(subset, keep.0),
334
+ false => ldf.unique(subset, keep.0),
335
+ }
336
+ .into())
337
+ }
338
+
339
+ pub fn drop_nulls(&self, subset: Option<Vec<String>>) -> Self {
340
+ let ldf = self.ldf.clone();
341
+ ldf.drop_nulls(subset.map(|v| v.into_iter().map(|s| col(&s)).collect()))
342
+ .into()
343
+ }
344
+
345
+ pub fn slice(&self, offset: i64, len: Option<IdxSize>) -> Self {
346
+ let ldf = self.ldf.clone();
347
+ ldf.slice(offset, len.unwrap_or(IdxSize::MAX)).into()
348
+ }
349
+
350
+ pub fn tail(&self, n: IdxSize) -> Self {
351
+ let ldf = self.ldf.clone();
352
+ ldf.tail(n).into()
353
+ }
354
+
355
+ pub fn melt(
356
+ &self,
357
+ id_vars: Vec<String>,
358
+ value_vars: Vec<String>,
359
+ value_name: Option<String>,
360
+ variable_name: Option<String>,
361
+ ) -> Self {
362
+ let args = MeltArgs {
363
+ id_vars,
364
+ value_vars,
365
+ value_name,
366
+ variable_name,
367
+ };
368
+
369
+ let ldf = self.ldf.clone();
370
+ ldf.melt(args).into()
371
+ }
372
+
373
+ pub fn with_row_count(&self, name: String, offset: Option<IdxSize>) -> Self {
374
+ let ldf = self.ldf.clone();
375
+ ldf.with_row_count(&name, offset).into()
376
+ }
377
+
378
+ pub fn drop_columns(&self, cols: Vec<String>) -> Self {
379
+ let ldf = self.ldf.clone();
380
+ ldf.drop_columns(cols).into()
381
+ }
382
+
383
+ pub fn clone(&self) -> Self {
384
+ self.ldf.clone().into()
385
+ }
386
+
387
+ pub fn columns(&self) -> RbResult<Vec<String>> {
388
+ Ok(self.get_schema()?.iter_names().cloned().collect())
389
+ }
390
+
391
+ pub fn dtypes(&self) -> RbResult<Vec<String>> {
392
+ let schema = self.get_schema()?;
393
+ let iter = schema.iter_dtypes().map(|dt| dt.to_string());
394
+ Ok(iter.collect())
395
+ }
396
+
397
+ pub fn schema(&self) -> RbResult<RHash> {
398
+ let schema = self.get_schema()?;
399
+ let schema_dict = RHash::new();
400
+
401
+ schema.iter_fields().for_each(|fld| {
402
+ // TODO remove unwrap
403
+ schema_dict
404
+ .aset(fld.name().clone(), fld.data_type().to_string())
405
+ .unwrap();
406
+ });
407
+ Ok(schema_dict)
408
+ }
409
+
410
+ pub fn unnest(&self, cols: Vec<String>) -> Self {
411
+ self.ldf.clone().unnest(cols).into()
412
+ }
413
+
414
+ pub fn width(&self) -> RbResult<usize> {
415
+ Ok(self.get_schema()?.len())
416
+ }
123
417
  }