polars-df 0.5.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +26 -0
  3. data/Cargo.lock +595 -709
  4. data/Cargo.toml +1 -0
  5. data/README.md +11 -9
  6. data/ext/polars/Cargo.toml +18 -10
  7. data/ext/polars/src/batched_csv.rs +26 -26
  8. data/ext/polars/src/conversion.rs +272 -136
  9. data/ext/polars/src/dataframe.rs +135 -94
  10. data/ext/polars/src/error.rs +8 -5
  11. data/ext/polars/src/expr/array.rs +15 -0
  12. data/ext/polars/src/expr/binary.rs +18 -6
  13. data/ext/polars/src/expr/datetime.rs +10 -12
  14. data/ext/polars/src/expr/general.rs +78 -264
  15. data/ext/polars/src/expr/list.rs +41 -28
  16. data/ext/polars/src/{expr.rs → expr/mod.rs} +5 -2
  17. data/ext/polars/src/expr/name.rs +44 -0
  18. data/ext/polars/src/expr/rolling.rs +196 -0
  19. data/ext/polars/src/expr/string.rs +94 -66
  20. data/ext/polars/src/file.rs +3 -3
  21. data/ext/polars/src/functions/aggregation.rs +35 -0
  22. data/ext/polars/src/functions/eager.rs +7 -31
  23. data/ext/polars/src/functions/io.rs +10 -10
  24. data/ext/polars/src/functions/lazy.rs +119 -54
  25. data/ext/polars/src/functions/meta.rs +30 -0
  26. data/ext/polars/src/functions/misc.rs +8 -0
  27. data/ext/polars/src/functions/mod.rs +5 -0
  28. data/ext/polars/src/functions/random.rs +6 -0
  29. data/ext/polars/src/functions/range.rs +46 -0
  30. data/ext/polars/src/functions/string_cache.rs +11 -0
  31. data/ext/polars/src/functions/whenthen.rs +7 -7
  32. data/ext/polars/src/lazyframe.rs +61 -44
  33. data/ext/polars/src/lib.rs +173 -84
  34. data/ext/polars/src/{apply → map}/dataframe.rs +28 -33
  35. data/ext/polars/src/{apply → map}/mod.rs +10 -6
  36. data/ext/polars/src/{apply → map}/series.rs +12 -16
  37. data/ext/polars/src/object.rs +2 -2
  38. data/ext/polars/src/rb_modules.rs +25 -6
  39. data/ext/polars/src/series/construction.rs +32 -6
  40. data/ext/polars/src/series/export.rs +2 -2
  41. data/ext/polars/src/series/set_at_idx.rs +33 -17
  42. data/ext/polars/src/series.rs +62 -42
  43. data/ext/polars/src/sql.rs +46 -0
  44. data/lib/polars/array_expr.rb +84 -0
  45. data/lib/polars/array_name_space.rb +77 -0
  46. data/lib/polars/batched_csv_reader.rb +1 -1
  47. data/lib/polars/config.rb +530 -0
  48. data/lib/polars/data_frame.rb +206 -131
  49. data/lib/polars/data_types.rb +163 -29
  50. data/lib/polars/date_time_expr.rb +13 -18
  51. data/lib/polars/date_time_name_space.rb +22 -28
  52. data/lib/polars/dynamic_group_by.rb +2 -2
  53. data/lib/polars/expr.rb +241 -151
  54. data/lib/polars/functions.rb +29 -38
  55. data/lib/polars/group_by.rb +38 -76
  56. data/lib/polars/io.rb +37 -2
  57. data/lib/polars/lazy_frame.rb +174 -95
  58. data/lib/polars/lazy_functions.rb +87 -63
  59. data/lib/polars/lazy_group_by.rb +7 -8
  60. data/lib/polars/list_expr.rb +40 -36
  61. data/lib/polars/list_name_space.rb +15 -15
  62. data/lib/polars/name_expr.rb +198 -0
  63. data/lib/polars/rolling_group_by.rb +6 -4
  64. data/lib/polars/series.rb +95 -28
  65. data/lib/polars/sql_context.rb +194 -0
  66. data/lib/polars/string_expr.rb +249 -69
  67. data/lib/polars/string_name_space.rb +155 -25
  68. data/lib/polars/utils.rb +119 -57
  69. data/lib/polars/version.rb +1 -1
  70. data/lib/polars.rb +6 -0
  71. metadata +21 -7
  72. data/ext/polars/src/{apply → map}/lazy.rs +0 -0
@@ -1,4 +1,4 @@
1
- use magnus::{IntoValue, RArray, RHash, Value};
1
+ use magnus::{IntoValue, RArray, RHash, TryConvert, Value};
2
2
  use polars::io::RowCount;
3
3
  use polars::lazy::frame::LazyFrame;
4
4
  use polars::prelude::*;
@@ -78,32 +78,32 @@ impl RbLazyFrame {
78
78
  pub fn new_from_csv(arguments: &[Value]) -> RbResult<Self> {
79
79
  // start arguments
80
80
  // this pattern is needed for more than 16
81
- let path: String = arguments[0].try_convert()?;
82
- let sep: String = arguments[1].try_convert()?;
83
- let has_header: bool = arguments[2].try_convert()?;
84
- let ignore_errors: bool = arguments[3].try_convert()?;
85
- let skip_rows: usize = arguments[4].try_convert()?;
86
- let n_rows: Option<usize> = arguments[5].try_convert()?;
87
- let cache: bool = arguments[6].try_convert()?;
88
- let overwrite_dtype: Option<Vec<(String, Wrap<DataType>)>> = arguments[7].try_convert()?;
89
- let low_memory: bool = arguments[8].try_convert()?;
90
- let comment_char: Option<String> = arguments[9].try_convert()?;
91
- let quote_char: Option<String> = arguments[10].try_convert()?;
92
- let null_values: Option<Wrap<NullValues>> = arguments[11].try_convert()?;
93
- let infer_schema_length: Option<usize> = arguments[12].try_convert()?;
94
- let with_schema_modify: Option<Value> = arguments[13].try_convert()?;
95
- let rechunk: bool = arguments[14].try_convert()?;
96
- let skip_rows_after_header: usize = arguments[15].try_convert()?;
97
- let encoding: Wrap<CsvEncoding> = arguments[16].try_convert()?;
98
- let row_count: Option<(String, IdxSize)> = arguments[17].try_convert()?;
99
- let try_parse_dates: bool = arguments[18].try_convert()?;
100
- let eol_char: String = arguments[19].try_convert()?;
81
+ let path = String::try_convert(arguments[0])?;
82
+ let separator = String::try_convert(arguments[1])?;
83
+ let has_header = bool::try_convert(arguments[2])?;
84
+ let ignore_errors = bool::try_convert(arguments[3])?;
85
+ let skip_rows = usize::try_convert(arguments[4])?;
86
+ let n_rows = Option::<usize>::try_convert(arguments[5])?;
87
+ let cache = bool::try_convert(arguments[6])?;
88
+ let overwrite_dtype = Option::<Vec<(String, Wrap<DataType>)>>::try_convert(arguments[7])?;
89
+ let low_memory = bool::try_convert(arguments[8])?;
90
+ let comment_char = Option::<String>::try_convert(arguments[9])?;
91
+ let quote_char = Option::<String>::try_convert(arguments[10])?;
92
+ let null_values = Option::<Wrap<NullValues>>::try_convert(arguments[11])?;
93
+ let infer_schema_length = Option::<usize>::try_convert(arguments[12])?;
94
+ let with_schema_modify = Option::<Value>::try_convert(arguments[13])?;
95
+ let rechunk = bool::try_convert(arguments[14])?;
96
+ let skip_rows_after_header = usize::try_convert(arguments[15])?;
97
+ let encoding = Wrap::<CsvEncoding>::try_convert(arguments[16])?;
98
+ let row_count = Option::<(String, IdxSize)>::try_convert(arguments[17])?;
99
+ let try_parse_dates = bool::try_convert(arguments[18])?;
100
+ let eol_char = String::try_convert(arguments[19])?;
101
101
  // end arguments
102
102
 
103
103
  let null_values = null_values.map(|w| w.0);
104
104
  let comment_char = comment_char.map(|s| s.as_bytes()[0]);
105
105
  let quote_char = quote_char.map(|s| s.as_bytes()[0]);
106
- let delimiter = sep.as_bytes()[0];
106
+ let separator = separator.as_bytes()[0];
107
107
  let eol_char = eol_char.as_bytes()[0];
108
108
 
109
109
  let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
@@ -116,7 +116,7 @@ impl RbLazyFrame {
116
116
  });
117
117
  let r = LazyCsvReader::new(path)
118
118
  .with_infer_schema_length(infer_schema_length)
119
- .with_delimiter(delimiter)
119
+ .with_separator(separator)
120
120
  .has_header(has_header)
121
121
  .with_ignore_errors(ignore_errors)
122
122
  .with_skip_rows(skip_rows)
@@ -151,6 +151,7 @@ impl RbLazyFrame {
151
151
  row_count: Option<(String, IdxSize)>,
152
152
  low_memory: bool,
153
153
  use_statistics: bool,
154
+ hive_partitioning: bool,
154
155
  ) -> RbResult<Self> {
155
156
  let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
156
157
  let args = ScanArgsParquet {
@@ -163,6 +164,7 @@ impl RbLazyFrame {
163
164
  // TODO support cloud options
164
165
  cloud_options: None,
165
166
  use_statistics,
167
+ hive_partitioning,
166
168
  };
167
169
  let lf = LazyFrame::scan_parquet(path, args).map_err(RbPolarsErr::from)?;
168
170
  Ok(lf.into())
@@ -217,6 +219,7 @@ impl RbLazyFrame {
217
219
  slice_pushdown: bool,
218
220
  cse: bool,
219
221
  allow_streaming: bool,
222
+ _eager: bool,
220
223
  ) -> RbLazyFrame {
221
224
  let ldf = self.ldf.clone();
222
225
  let ldf = ldf
@@ -224,13 +227,20 @@ impl RbLazyFrame {
224
227
  .with_predicate_pushdown(predicate_pushdown)
225
228
  .with_simplify_expr(simplify_expr)
226
229
  .with_slice_pushdown(slice_pushdown)
227
- .with_common_subplan_elimination(cse)
230
+ .with_comm_subplan_elim(cse)
228
231
  .with_streaming(allow_streaming)
232
+ ._with_eager(_eager)
229
233
  .with_projection_pushdown(projection_pushdown);
230
234
  ldf.into()
231
235
  }
232
236
 
233
- pub fn sort(&self, by_column: String, reverse: bool, nulls_last: bool) -> Self {
237
+ pub fn sort(
238
+ &self,
239
+ by_column: String,
240
+ reverse: bool,
241
+ nulls_last: bool,
242
+ maintain_order: bool,
243
+ ) -> Self {
234
244
  let ldf = self.ldf.clone();
235
245
  ldf.sort(
236
246
  &by_column,
@@ -238,6 +248,7 @@ impl RbLazyFrame {
238
248
  descending: reverse,
239
249
  nulls_last,
240
250
  multithreaded: true,
251
+ maintain_order,
241
252
  },
242
253
  )
243
254
  .into()
@@ -248,10 +259,13 @@ impl RbLazyFrame {
248
259
  by_column: RArray,
249
260
  reverse: Vec<bool>,
250
261
  nulls_last: bool,
262
+ maintain_order: bool,
251
263
  ) -> RbResult<Self> {
252
264
  let ldf = self.ldf.clone();
253
265
  let exprs = rb_exprs_to_exprs(by_column)?;
254
- Ok(ldf.sort_by_exprs(exprs, reverse, nulls_last).into())
266
+ Ok(ldf
267
+ .sort_by_exprs(exprs, reverse, nulls_last, maintain_order)
268
+ .into())
255
269
  }
256
270
 
257
271
  pub fn cache(&self) -> Self {
@@ -308,31 +322,32 @@ impl RbLazyFrame {
308
322
  Ok(ldf.select(exprs).into())
309
323
  }
310
324
 
311
- pub fn groupby(&self, by: RArray, maintain_order: bool) -> RbResult<RbLazyGroupBy> {
325
+ pub fn group_by(&self, by: RArray, maintain_order: bool) -> RbResult<RbLazyGroupBy> {
312
326
  let ldf = self.ldf.clone();
313
327
  let by = rb_exprs_to_exprs(by)?;
314
328
  let lazy_gb = if maintain_order {
315
- ldf.groupby_stable(by)
329
+ ldf.group_by_stable(by)
316
330
  } else {
317
- ldf.groupby(by)
331
+ ldf.group_by(by)
318
332
  };
319
333
  Ok(RbLazyGroupBy {
320
334
  lgb: RefCell::new(Some(lazy_gb)),
321
335
  })
322
336
  }
323
337
 
324
- pub fn groupby_rolling(
338
+ pub fn group_by_rolling(
325
339
  &self,
326
340
  index_column: &RbExpr,
327
341
  period: String,
328
342
  offset: String,
329
343
  closed: Wrap<ClosedWindow>,
330
344
  by: RArray,
345
+ check_sorted: bool,
331
346
  ) -> RbResult<RbLazyGroupBy> {
332
347
  let closed_window = closed.0;
333
348
  let ldf = self.ldf.clone();
334
349
  let by = rb_exprs_to_exprs(by)?;
335
- let lazy_gb = ldf.groupby_rolling(
350
+ let lazy_gb = ldf.group_by_rolling(
336
351
  index_column.inner.clone(),
337
352
  by,
338
353
  RollingGroupOptions {
@@ -340,6 +355,7 @@ impl RbLazyFrame {
340
355
  period: Duration::parse(&period),
341
356
  offset: Duration::parse(&offset),
342
357
  closed_window,
358
+ check_sorted,
343
359
  },
344
360
  );
345
361
 
@@ -349,32 +365,34 @@ impl RbLazyFrame {
349
365
  }
350
366
 
351
367
  #[allow(clippy::too_many_arguments)]
352
- pub fn groupby_dynamic(
368
+ pub fn group_by_dynamic(
353
369
  &self,
354
370
  index_column: &RbExpr,
355
371
  every: String,
356
372
  period: String,
357
373
  offset: String,
358
- truncate: bool,
374
+ label: Wrap<Label>,
359
375
  include_boundaries: bool,
360
376
  closed: Wrap<ClosedWindow>,
361
377
  by: RArray,
362
378
  start_by: Wrap<StartBy>,
379
+ check_sorted: bool,
363
380
  ) -> RbResult<RbLazyGroupBy> {
364
381
  let closed_window = closed.0;
365
382
  let by = rb_exprs_to_exprs(by)?;
366
383
  let ldf = self.ldf.clone();
367
- let lazy_gb = ldf.groupby_dynamic(
384
+ let lazy_gb = ldf.group_by_dynamic(
368
385
  index_column.inner.clone(),
369
386
  by,
370
387
  DynamicGroupOptions {
371
388
  every: Duration::parse(&every),
372
389
  period: Duration::parse(&period),
373
390
  offset: Duration::parse(&offset),
374
- truncate,
391
+ label: label.0,
375
392
  include_boundaries,
376
393
  closed_window,
377
394
  start_by: start_by.0,
395
+ check_sorted,
378
396
  ..Default::default()
379
397
  },
380
398
  );
@@ -387,7 +405,7 @@ impl RbLazyFrame {
387
405
  pub fn with_context(&self, contexts: RArray) -> RbResult<Self> {
388
406
  let contexts = contexts
389
407
  .each()
390
- .map(|v| v.unwrap().try_convert())
408
+ .map(|v| TryConvert::try_convert(v.unwrap()))
391
409
  .collect::<RbResult<Vec<&RbLazyFrame>>>()?;
392
410
  let contexts = contexts
393
411
  .into_iter()
@@ -478,14 +496,13 @@ impl RbLazyFrame {
478
496
  ldf.reverse().into()
479
497
  }
480
498
 
481
- pub fn shift(&self, periods: i64) -> Self {
482
- let ldf = self.ldf.clone();
483
- ldf.shift(periods).into()
484
- }
485
-
486
- pub fn shift_and_fill(&self, periods: i64, fill_value: &RbExpr) -> Self {
487
- let ldf = self.ldf.clone();
488
- ldf.shift_and_fill(periods, fill_value.inner.clone()).into()
499
+ pub fn shift(&self, n: &RbExpr, fill_value: Option<&RbExpr>) -> Self {
500
+ let lf = self.ldf.clone();
501
+ let out = match fill_value {
502
+ Some(v) => lf.shift_and_fill(n.inner.clone(), v.inner.clone()),
503
+ None => lf.shift(n.inner.clone()),
504
+ };
505
+ out.into()
489
506
  }
490
507
 
491
508
  pub fn fill_nan(&self, fill_value: &RbExpr) -> Self {