polars-df 0.7.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83):
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +41 -0
  3. data/Cargo.lock +353 -237
  4. data/Cargo.toml +0 -3
  5. data/LICENSE.txt +1 -1
  6. data/README.md +2 -2
  7. data/ext/polars/Cargo.toml +17 -6
  8. data/ext/polars/src/batched_csv.rs +6 -7
  9. data/ext/polars/src/conversion/anyvalue.rs +185 -0
  10. data/ext/polars/src/conversion/chunked_array.rs +140 -0
  11. data/ext/polars/src/{conversion.rs → conversion/mod.rs} +268 -347
  12. data/ext/polars/src/dataframe.rs +96 -116
  13. data/ext/polars/src/expr/array.rs +74 -0
  14. data/ext/polars/src/expr/categorical.rs +8 -1
  15. data/ext/polars/src/expr/datetime.rs +22 -56
  16. data/ext/polars/src/expr/general.rs +124 -37
  17. data/ext/polars/src/expr/list.rs +52 -4
  18. data/ext/polars/src/expr/meta.rs +48 -0
  19. data/ext/polars/src/expr/rolling.rs +16 -10
  20. data/ext/polars/src/expr/string.rs +68 -17
  21. data/ext/polars/src/expr/struct.rs +8 -4
  22. data/ext/polars/src/functions/aggregation.rs +6 -0
  23. data/ext/polars/src/functions/lazy.rs +103 -48
  24. data/ext/polars/src/functions/meta.rs +45 -1
  25. data/ext/polars/src/functions/range.rs +5 -10
  26. data/ext/polars/src/functions/string_cache.rs +14 -0
  27. data/ext/polars/src/{lazyframe.rs → lazyframe/mod.rs} +166 -41
  28. data/ext/polars/src/lib.rs +245 -187
  29. data/ext/polars/src/map/dataframe.rs +1 -1
  30. data/ext/polars/src/map/mod.rs +2 -2
  31. data/ext/polars/src/map/series.rs +6 -6
  32. data/ext/polars/src/object.rs +0 -30
  33. data/ext/polars/src/on_startup.rs +32 -0
  34. data/ext/polars/src/series/aggregation.rs +23 -0
  35. data/ext/polars/src/series/construction.rs +1 -1
  36. data/ext/polars/src/series/export.rs +2 -2
  37. data/ext/polars/src/{series.rs → series/mod.rs} +45 -21
  38. data/ext/polars/src/series/{set_at_idx.rs → scatter.rs} +18 -18
  39. data/ext/polars/src/utils.rs +1 -1
  40. data/lib/polars/array_expr.rb +449 -0
  41. data/lib/polars/array_name_space.rb +346 -0
  42. data/lib/polars/cat_expr.rb +24 -0
  43. data/lib/polars/cat_name_space.rb +75 -0
  44. data/lib/polars/config.rb +2 -2
  45. data/lib/polars/data_frame.rb +248 -108
  46. data/lib/polars/data_types.rb +195 -29
  47. data/lib/polars/date_time_expr.rb +41 -24
  48. data/lib/polars/date_time_name_space.rb +12 -12
  49. data/lib/polars/exceptions.rb +12 -1
  50. data/lib/polars/expr.rb +1080 -195
  51. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  52. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  53. data/lib/polars/functions/as_datatype.rb +248 -0
  54. data/lib/polars/functions/col.rb +47 -0
  55. data/lib/polars/functions/eager.rb +182 -0
  56. data/lib/polars/functions/lazy.rb +1280 -0
  57. data/lib/polars/functions/len.rb +49 -0
  58. data/lib/polars/functions/lit.rb +35 -0
  59. data/lib/polars/functions/random.rb +16 -0
  60. data/lib/polars/functions/range/date_range.rb +103 -0
  61. data/lib/polars/functions/range/int_range.rb +51 -0
  62. data/lib/polars/functions/repeat.rb +144 -0
  63. data/lib/polars/functions/whenthen.rb +27 -0
  64. data/lib/polars/functions.rb +29 -416
  65. data/lib/polars/group_by.rb +3 -3
  66. data/lib/polars/io.rb +21 -28
  67. data/lib/polars/lazy_frame.rb +390 -76
  68. data/lib/polars/list_expr.rb +152 -6
  69. data/lib/polars/list_name_space.rb +102 -0
  70. data/lib/polars/meta_expr.rb +175 -7
  71. data/lib/polars/series.rb +557 -59
  72. data/lib/polars/sql_context.rb +1 -1
  73. data/lib/polars/string_cache.rb +75 -0
  74. data/lib/polars/string_expr.rb +412 -96
  75. data/lib/polars/string_name_space.rb +4 -4
  76. data/lib/polars/struct_expr.rb +1 -1
  77. data/lib/polars/struct_name_space.rb +1 -1
  78. data/lib/polars/testing.rb +507 -0
  79. data/lib/polars/utils.rb +64 -20
  80. data/lib/polars/version.rb +1 -1
  81. data/lib/polars.rb +15 -2
  82. metadata +40 -9
  83. data/lib/polars/lazy_functions.rb +0 -1197
@@ -1,9 +1,10 @@
1
1
  use magnus::{IntoValue, RArray, RHash, TryConvert, Value};
2
- use polars::io::RowCount;
2
+ use polars::io::RowIndex;
3
3
  use polars::lazy::frame::LazyFrame;
4
4
  use polars::prelude::*;
5
5
  use std::cell::RefCell;
6
6
  use std::io::{BufWriter, Read};
7
+ use std::num::NonZeroUsize;
7
8
  use std::path::PathBuf;
8
9
 
9
10
  use crate::conversion::*;
@@ -55,13 +56,14 @@ impl RbLazyFrame {
55
56
  pub fn new_from_ndjson(
56
57
  path: String,
57
58
  infer_schema_length: Option<usize>,
58
- batch_size: Option<usize>,
59
+ batch_size: Option<Wrap<NonZeroUsize>>,
59
60
  n_rows: Option<usize>,
60
61
  low_memory: bool,
61
62
  rechunk: bool,
62
- row_count: Option<(String, IdxSize)>,
63
+ row_index: Option<(String, IdxSize)>,
63
64
  ) -> RbResult<Self> {
64
- let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
65
+ let batch_size = batch_size.map(|v| v.0);
66
+ let row_index = row_index.map(|(name, offset)| RowIndex { name, offset });
65
67
 
66
68
  let lf = LazyJsonLineReader::new(path)
67
69
  .with_infer_schema_length(infer_schema_length)
@@ -69,7 +71,7 @@ impl RbLazyFrame {
69
71
  .with_n_rows(n_rows)
70
72
  .low_memory(low_memory)
71
73
  .with_rechunk(rechunk)
72
- .with_row_count(row_count)
74
+ .with_row_index(row_index)
73
75
  .finish()
74
76
  .map_err(RbPolarsErr::from)?;
75
77
  Ok(lf.into())
@@ -87,7 +89,7 @@ impl RbLazyFrame {
87
89
  let cache = bool::try_convert(arguments[6])?;
88
90
  let overwrite_dtype = Option::<Vec<(String, Wrap<DataType>)>>::try_convert(arguments[7])?;
89
91
  let low_memory = bool::try_convert(arguments[8])?;
90
- let comment_char = Option::<String>::try_convert(arguments[9])?;
92
+ let comment_prefix = Option::<String>::try_convert(arguments[9])?;
91
93
  let quote_char = Option::<String>::try_convert(arguments[10])?;
92
94
  let null_values = Option::<Wrap<NullValues>>::try_convert(arguments[11])?;
93
95
  let infer_schema_length = Option::<usize>::try_convert(arguments[12])?;
@@ -95,18 +97,17 @@ impl RbLazyFrame {
95
97
  let rechunk = bool::try_convert(arguments[14])?;
96
98
  let skip_rows_after_header = usize::try_convert(arguments[15])?;
97
99
  let encoding = Wrap::<CsvEncoding>::try_convert(arguments[16])?;
98
- let row_count = Option::<(String, IdxSize)>::try_convert(arguments[17])?;
100
+ let row_index = Option::<(String, IdxSize)>::try_convert(arguments[17])?;
99
101
  let try_parse_dates = bool::try_convert(arguments[18])?;
100
102
  let eol_char = String::try_convert(arguments[19])?;
101
103
  // end arguments
102
104
 
103
105
  let null_values = null_values.map(|w| w.0);
104
- let comment_char = comment_char.map(|s| s.as_bytes()[0]);
105
106
  let quote_char = quote_char.map(|s| s.as_bytes()[0]);
106
107
  let separator = separator.as_bytes()[0];
107
108
  let eol_char = eol_char.as_bytes()[0];
108
109
 
109
- let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
110
+ let row_index = row_index.map(|(name, offset)| RowIndex { name, offset });
110
111
 
111
112
  let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
112
113
  overwrite_dtype
@@ -124,13 +125,13 @@ impl RbLazyFrame {
124
125
  .with_cache(cache)
125
126
  .with_dtype_overwrite(overwrite_dtype.as_ref())
126
127
  .low_memory(low_memory)
127
- .with_comment_char(comment_char)
128
+ .with_comment_prefix(comment_prefix.as_deref())
128
129
  .with_quote_char(quote_char)
129
130
  .with_end_of_line_char(eol_char)
130
131
  .with_rechunk(rechunk)
131
132
  .with_skip_rows_after_header(skip_rows_after_header)
132
133
  .with_encoding(encoding.0)
133
- .with_row_count(row_count)
134
+ .with_row_index(row_index)
134
135
  .with_try_parse_dates(try_parse_dates)
135
136
  .with_null_values(null_values);
136
137
 
@@ -148,18 +149,18 @@ impl RbLazyFrame {
148
149
  cache: bool,
149
150
  parallel: Wrap<ParallelStrategy>,
150
151
  rechunk: bool,
151
- row_count: Option<(String, IdxSize)>,
152
+ row_index: Option<(String, IdxSize)>,
152
153
  low_memory: bool,
153
154
  use_statistics: bool,
154
155
  hive_partitioning: bool,
155
156
  ) -> RbResult<Self> {
156
- let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
157
+ let row_index = row_index.map(|(name, offset)| RowIndex { name, offset });
157
158
  let args = ScanArgsParquet {
158
159
  n_rows,
159
160
  cache,
160
161
  parallel: parallel.0,
161
162
  rechunk,
162
- row_count,
163
+ row_index,
163
164
  low_memory,
164
165
  // TODO support cloud options
165
166
  cloud_options: None,
@@ -175,15 +176,15 @@ impl RbLazyFrame {
175
176
  n_rows: Option<usize>,
176
177
  cache: bool,
177
178
  rechunk: bool,
178
- row_count: Option<(String, IdxSize)>,
179
+ row_index: Option<(String, IdxSize)>,
179
180
  memory_map: bool,
180
181
  ) -> RbResult<Self> {
181
- let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
182
+ let row_index = row_index.map(|(name, offset)| RowIndex { name, offset });
182
183
  let args = ScanArgsIpc {
183
184
  n_rows,
184
185
  cache,
185
186
  rechunk,
186
- row_count,
187
+ row_index,
187
188
  memmap: memory_map,
188
189
  };
189
190
  let lf = LazyFrame::scan_ipc(path, args).map_err(RbPolarsErr::from)?;
@@ -217,20 +218,24 @@ impl RbLazyFrame {
217
218
  projection_pushdown: bool,
218
219
  simplify_expr: bool,
219
220
  slice_pushdown: bool,
220
- cse: bool,
221
+ comm_subplan_elim: bool,
222
+ comm_subexpr_elim: bool,
221
223
  allow_streaming: bool,
222
224
  _eager: bool,
223
225
  ) -> RbLazyFrame {
224
226
  let ldf = self.ldf.clone();
225
- let ldf = ldf
227
+ let mut ldf = ldf
226
228
  .with_type_coercion(type_coercion)
227
229
  .with_predicate_pushdown(predicate_pushdown)
228
230
  .with_simplify_expr(simplify_expr)
229
231
  .with_slice_pushdown(slice_pushdown)
230
- .with_comm_subplan_elim(cse)
231
232
  .with_streaming(allow_streaming)
232
233
  ._with_eager(_eager)
233
234
  .with_projection_pushdown(projection_pushdown);
235
+
236
+ ldf = ldf.with_comm_subplan_elim(comm_subplan_elim);
237
+ ldf = ldf.with_comm_subexpr_elim(comm_subexpr_elim);
238
+
234
239
  ldf.into()
235
240
  }
236
241
 
@@ -305,6 +310,75 @@ impl RbLazyFrame {
305
310
  Ok(())
306
311
  }
307
312
 
313
+ pub fn sink_ipc(
314
+ &self,
315
+ path: PathBuf,
316
+ compression: Option<Wrap<IpcCompression>>,
317
+ maintain_order: bool,
318
+ ) -> RbResult<()> {
319
+ let options = IpcWriterOptions {
320
+ compression: compression.map(|c| c.0),
321
+ maintain_order,
322
+ };
323
+
324
+ let ldf = self.ldf.clone();
325
+ ldf.sink_ipc(path, options).map_err(RbPolarsErr::from)?;
326
+ Ok(())
327
+ }
328
+
329
+ pub fn sink_csv(
330
+ &self,
331
+ path: PathBuf,
332
+ include_bom: bool,
333
+ include_header: bool,
334
+ separator: u8,
335
+ line_terminator: String,
336
+ quote_char: u8,
337
+ batch_size: Wrap<NonZeroUsize>,
338
+ datetime_format: Option<String>,
339
+ date_format: Option<String>,
340
+ time_format: Option<String>,
341
+ float_precision: Option<usize>,
342
+ null_value: Option<String>,
343
+ quote_style: Option<Wrap<QuoteStyle>>,
344
+ maintain_order: bool,
345
+ ) -> RbResult<()> {
346
+ let quote_style = quote_style.map_or(QuoteStyle::default(), |wrap| wrap.0);
347
+ let null_value = null_value.unwrap_or(SerializeOptions::default().null);
348
+
349
+ let serialize_options = SerializeOptions {
350
+ date_format,
351
+ time_format,
352
+ datetime_format,
353
+ float_precision,
354
+ separator,
355
+ quote_char,
356
+ null: null_value,
357
+ line_terminator,
358
+ quote_style,
359
+ };
360
+
361
+ let options = CsvWriterOptions {
362
+ include_bom,
363
+ include_header,
364
+ maintain_order,
365
+ batch_size: batch_size.0,
366
+ serialize_options,
367
+ };
368
+
369
+ let ldf = self.ldf.clone();
370
+ ldf.sink_csv(path, options).map_err(RbPolarsErr::from)?;
371
+ Ok(())
372
+ }
373
+
374
+ pub fn sink_json(&self, path: PathBuf, maintain_order: bool) -> RbResult<()> {
375
+ let options = JsonWriterOptions { maintain_order };
376
+
377
+ let ldf = self.ldf.clone();
378
+ ldf.sink_json(path, options).map_err(RbPolarsErr::from)?;
379
+ Ok(())
380
+ }
381
+
308
382
  pub fn fetch(&self, n_rows: usize) -> RbResult<RbDataFrame> {
309
383
  let ldf = self.ldf.clone();
310
384
  let df = ldf.fetch(n_rows).map_err(RbPolarsErr::from)?;
@@ -322,6 +396,12 @@ impl RbLazyFrame {
322
396
  Ok(ldf.select(exprs).into())
323
397
  }
324
398
 
399
+ pub fn select_seq(&self, exprs: RArray) -> RbResult<Self> {
400
+ let ldf = self.ldf.clone();
401
+ let exprs = rb_exprs_to_exprs(exprs)?;
402
+ Ok(ldf.select_seq(exprs).into())
403
+ }
404
+
325
405
  pub fn group_by(&self, by: RArray, maintain_order: bool) -> RbResult<RbLazyGroupBy> {
326
406
  let ldf = self.ldf.clone();
327
407
  let by = rb_exprs_to_exprs(by)?;
@@ -335,7 +415,7 @@ impl RbLazyFrame {
335
415
  })
336
416
  }
337
417
 
338
- pub fn group_by_rolling(
418
+ pub fn rolling(
339
419
  &self,
340
420
  index_column: &RbExpr,
341
421
  period: String,
@@ -460,6 +540,7 @@ impl RbLazyFrame {
460
540
  right_on: RArray,
461
541
  allow_parallel: bool,
462
542
  force_parallel: bool,
543
+ join_nulls: bool,
463
544
  how: Wrap<JoinType>,
464
545
  suffix: String,
465
546
  ) -> RbResult<Self> {
@@ -475,17 +556,28 @@ impl RbLazyFrame {
475
556
  .right_on(right_on)
476
557
  .allow_parallel(allow_parallel)
477
558
  .force_parallel(force_parallel)
559
+ .join_nulls(join_nulls)
478
560
  .how(how.0)
479
561
  .suffix(suffix)
480
562
  .finish()
481
563
  .into())
482
564
  }
483
565
 
566
+ pub fn with_column(&self, expr: &RbExpr) -> Self {
567
+ let ldf = self.ldf.clone();
568
+ ldf.with_column(expr.inner.clone()).into()
569
+ }
570
+
484
571
  pub fn with_columns(&self, exprs: RArray) -> RbResult<Self> {
485
572
  let ldf = self.ldf.clone();
486
573
  Ok(ldf.with_columns(rb_exprs_to_exprs(exprs)?).into())
487
574
  }
488
575
 
576
+ pub fn with_columns_seq(&self, exprs: RArray) -> RbResult<Self> {
577
+ let ldf = self.ldf.clone();
578
+ Ok(ldf.with_columns_seq(rb_exprs_to_exprs(exprs)?).into())
579
+ }
580
+
489
581
  pub fn rename(&self, existing: Vec<String>, new: Vec<String>) -> Self {
490
582
  let ldf = self.ldf.clone();
491
583
  ldf.rename(existing, new).into()
@@ -510,48 +602,58 @@ impl RbLazyFrame {
510
602
  ldf.fill_nan(fill_value.inner.clone()).into()
511
603
  }
512
604
 
513
- pub fn min(&self) -> Self {
605
+ pub fn min(&self) -> RbResult<Self> {
514
606
  let ldf = self.ldf.clone();
515
- ldf.min().into()
607
+ let out = ldf.min().map_err(RbPolarsErr::from)?;
608
+ Ok(out.into())
516
609
  }
517
610
 
518
- pub fn max(&self) -> Self {
611
+ pub fn max(&self) -> RbResult<Self> {
519
612
  let ldf = self.ldf.clone();
520
- ldf.max().into()
613
+ let out = ldf.max().map_err(RbPolarsErr::from)?;
614
+ Ok(out.into())
521
615
  }
522
616
 
523
- pub fn sum(&self) -> Self {
617
+ pub fn sum(&self) -> RbResult<Self> {
524
618
  let ldf = self.ldf.clone();
525
- ldf.sum().into()
619
+ let out = ldf.sum().map_err(RbPolarsErr::from)?;
620
+ Ok(out.into())
526
621
  }
527
622
 
528
- pub fn mean(&self) -> Self {
623
+ pub fn mean(&self) -> RbResult<Self> {
529
624
  let ldf = self.ldf.clone();
530
- ldf.mean().into()
625
+ let out = ldf.mean().map_err(RbPolarsErr::from)?;
626
+ Ok(out.into())
531
627
  }
532
628
 
533
- pub fn std(&self, ddof: u8) -> Self {
629
+ pub fn std(&self, ddof: u8) -> RbResult<Self> {
534
630
  let ldf = self.ldf.clone();
535
- ldf.std(ddof).into()
631
+ let out = ldf.std(ddof).map_err(RbPolarsErr::from)?;
632
+ Ok(out.into())
536
633
  }
537
634
 
538
- pub fn var(&self, ddof: u8) -> Self {
635
+ pub fn var(&self, ddof: u8) -> RbResult<Self> {
539
636
  let ldf = self.ldf.clone();
540
- ldf.var(ddof).into()
637
+ let out = ldf.var(ddof).map_err(RbPolarsErr::from)?;
638
+ Ok(out.into())
541
639
  }
542
640
 
543
- pub fn median(&self) -> Self {
641
+ pub fn median(&self) -> RbResult<Self> {
544
642
  let ldf = self.ldf.clone();
545
- ldf.median().into()
643
+ let out = ldf.median().map_err(RbPolarsErr::from)?;
644
+ Ok(out.into())
546
645
  }
547
646
 
548
647
  pub fn quantile(
549
648
  &self,
550
649
  quantile: &RbExpr,
551
650
  interpolation: Wrap<QuantileInterpolOptions>,
552
- ) -> Self {
651
+ ) -> RbResult<Self> {
553
652
  let ldf = self.ldf.clone();
554
- ldf.quantile(quantile.inner.clone(), interpolation.0).into()
653
+ let out = ldf
654
+ .quantile(quantile.inner.clone(), interpolation.0)
655
+ .map_err(RbPolarsErr::from)?;
656
+ Ok(out.into())
555
657
  }
556
658
 
557
659
  pub fn explode(&self, column: RArray) -> RbResult<Self> {
@@ -560,6 +662,11 @@ impl RbLazyFrame {
560
662
  Ok(ldf.explode(column).into())
561
663
  }
562
664
 
665
+ pub fn null_count(&self) -> Self {
666
+ let ldf = self.ldf.clone();
667
+ ldf.null_count().into()
668
+ }
669
+
563
670
  pub fn unique(
564
671
  &self,
565
672
  maintain_order: bool,
@@ -610,14 +717,18 @@ impl RbLazyFrame {
610
717
  ldf.melt(args).into()
611
718
  }
612
719
 
613
- pub fn with_row_count(&self, name: String, offset: Option<IdxSize>) -> Self {
720
+ pub fn with_row_index(&self, name: String, offset: Option<IdxSize>) -> Self {
614
721
  let ldf = self.ldf.clone();
615
- ldf.with_row_count(&name, offset).into()
722
+ ldf.with_row_index(&name, offset).into()
616
723
  }
617
724
 
618
- pub fn drop_columns(&self, cols: Vec<String>) -> Self {
725
+ pub fn drop(&self, cols: Vec<String>) -> Self {
619
726
  let ldf = self.ldf.clone();
620
- ldf.drop_columns(cols).into()
727
+ ldf.drop(cols).into()
728
+ }
729
+
730
+ pub fn cast_all(&self, dtype: Wrap<DataType>, strict: bool) -> Self {
731
+ self.ldf.clone().cast_all(dtype.0, strict).into()
621
732
  }
622
733
 
623
734
  pub fn clone(&self) -> Self {
@@ -659,4 +770,18 @@ impl RbLazyFrame {
659
770
  pub fn width(&self) -> RbResult<usize> {
660
771
  Ok(self.get_schema()?.len())
661
772
  }
773
+
774
+ pub fn count(&self) -> Self {
775
+ let ldf = self.ldf.clone();
776
+ ldf.count().into()
777
+ }
778
+
779
+ pub fn merge_sorted(&self, other: &Self, key: String) -> RbResult<Self> {
780
+ let out = self
781
+ .ldf
782
+ .clone()
783
+ .merge_sorted(other.ldf.clone(), &key)
784
+ .map_err(RbPolarsErr::from)?;
785
+ Ok(out.into())
786
+ }
662
787
  }