polars-df 0.7.0 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (83) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +41 -0
  3. data/Cargo.lock +353 -237
  4. data/Cargo.toml +0 -3
  5. data/LICENSE.txt +1 -1
  6. data/README.md +2 -2
  7. data/ext/polars/Cargo.toml +17 -6
  8. data/ext/polars/src/batched_csv.rs +6 -7
  9. data/ext/polars/src/conversion/anyvalue.rs +185 -0
  10. data/ext/polars/src/conversion/chunked_array.rs +140 -0
  11. data/ext/polars/src/{conversion.rs → conversion/mod.rs} +268 -347
  12. data/ext/polars/src/dataframe.rs +96 -116
  13. data/ext/polars/src/expr/array.rs +74 -0
  14. data/ext/polars/src/expr/categorical.rs +8 -1
  15. data/ext/polars/src/expr/datetime.rs +22 -56
  16. data/ext/polars/src/expr/general.rs +124 -37
  17. data/ext/polars/src/expr/list.rs +52 -4
  18. data/ext/polars/src/expr/meta.rs +48 -0
  19. data/ext/polars/src/expr/rolling.rs +16 -10
  20. data/ext/polars/src/expr/string.rs +68 -17
  21. data/ext/polars/src/expr/struct.rs +8 -4
  22. data/ext/polars/src/functions/aggregation.rs +6 -0
  23. data/ext/polars/src/functions/lazy.rs +103 -48
  24. data/ext/polars/src/functions/meta.rs +45 -1
  25. data/ext/polars/src/functions/range.rs +5 -10
  26. data/ext/polars/src/functions/string_cache.rs +14 -0
  27. data/ext/polars/src/{lazyframe.rs → lazyframe/mod.rs} +166 -41
  28. data/ext/polars/src/lib.rs +245 -187
  29. data/ext/polars/src/map/dataframe.rs +1 -1
  30. data/ext/polars/src/map/mod.rs +2 -2
  31. data/ext/polars/src/map/series.rs +6 -6
  32. data/ext/polars/src/object.rs +0 -30
  33. data/ext/polars/src/on_startup.rs +32 -0
  34. data/ext/polars/src/series/aggregation.rs +23 -0
  35. data/ext/polars/src/series/construction.rs +1 -1
  36. data/ext/polars/src/series/export.rs +2 -2
  37. data/ext/polars/src/{series.rs → series/mod.rs} +45 -21
  38. data/ext/polars/src/series/{set_at_idx.rs → scatter.rs} +18 -18
  39. data/ext/polars/src/utils.rs +1 -1
  40. data/lib/polars/array_expr.rb +449 -0
  41. data/lib/polars/array_name_space.rb +346 -0
  42. data/lib/polars/cat_expr.rb +24 -0
  43. data/lib/polars/cat_name_space.rb +75 -0
  44. data/lib/polars/config.rb +2 -2
  45. data/lib/polars/data_frame.rb +248 -108
  46. data/lib/polars/data_types.rb +195 -29
  47. data/lib/polars/date_time_expr.rb +41 -24
  48. data/lib/polars/date_time_name_space.rb +12 -12
  49. data/lib/polars/exceptions.rb +12 -1
  50. data/lib/polars/expr.rb +1080 -195
  51. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  52. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  53. data/lib/polars/functions/as_datatype.rb +248 -0
  54. data/lib/polars/functions/col.rb +47 -0
  55. data/lib/polars/functions/eager.rb +182 -0
  56. data/lib/polars/functions/lazy.rb +1280 -0
  57. data/lib/polars/functions/len.rb +49 -0
  58. data/lib/polars/functions/lit.rb +35 -0
  59. data/lib/polars/functions/random.rb +16 -0
  60. data/lib/polars/functions/range/date_range.rb +103 -0
  61. data/lib/polars/functions/range/int_range.rb +51 -0
  62. data/lib/polars/functions/repeat.rb +144 -0
  63. data/lib/polars/functions/whenthen.rb +27 -0
  64. data/lib/polars/functions.rb +29 -416
  65. data/lib/polars/group_by.rb +3 -3
  66. data/lib/polars/io.rb +21 -28
  67. data/lib/polars/lazy_frame.rb +390 -76
  68. data/lib/polars/list_expr.rb +152 -6
  69. data/lib/polars/list_name_space.rb +102 -0
  70. data/lib/polars/meta_expr.rb +175 -7
  71. data/lib/polars/series.rb +557 -59
  72. data/lib/polars/sql_context.rb +1 -1
  73. data/lib/polars/string_cache.rb +75 -0
  74. data/lib/polars/string_expr.rb +412 -96
  75. data/lib/polars/string_name_space.rb +4 -4
  76. data/lib/polars/struct_expr.rb +1 -1
  77. data/lib/polars/struct_name_space.rb +1 -1
  78. data/lib/polars/testing.rb +507 -0
  79. data/lib/polars/utils.rb +64 -20
  80. data/lib/polars/version.rb +1 -1
  81. data/lib/polars.rb +15 -2
  82. metadata +40 -9
  83. data/lib/polars/lazy_functions.rb +0 -1197
@@ -1,9 +1,10 @@
1
1
  use magnus::{IntoValue, RArray, RHash, TryConvert, Value};
2
- use polars::io::RowCount;
2
+ use polars::io::RowIndex;
3
3
  use polars::lazy::frame::LazyFrame;
4
4
  use polars::prelude::*;
5
5
  use std::cell::RefCell;
6
6
  use std::io::{BufWriter, Read};
7
+ use std::num::NonZeroUsize;
7
8
  use std::path::PathBuf;
8
9
 
9
10
  use crate::conversion::*;
@@ -55,13 +56,14 @@ impl RbLazyFrame {
55
56
  pub fn new_from_ndjson(
56
57
  path: String,
57
58
  infer_schema_length: Option<usize>,
58
- batch_size: Option<usize>,
59
+ batch_size: Option<Wrap<NonZeroUsize>>,
59
60
  n_rows: Option<usize>,
60
61
  low_memory: bool,
61
62
  rechunk: bool,
62
- row_count: Option<(String, IdxSize)>,
63
+ row_index: Option<(String, IdxSize)>,
63
64
  ) -> RbResult<Self> {
64
- let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
65
+ let batch_size = batch_size.map(|v| v.0);
66
+ let row_index = row_index.map(|(name, offset)| RowIndex { name, offset });
65
67
 
66
68
  let lf = LazyJsonLineReader::new(path)
67
69
  .with_infer_schema_length(infer_schema_length)
@@ -69,7 +71,7 @@ impl RbLazyFrame {
69
71
  .with_n_rows(n_rows)
70
72
  .low_memory(low_memory)
71
73
  .with_rechunk(rechunk)
72
- .with_row_count(row_count)
74
+ .with_row_index(row_index)
73
75
  .finish()
74
76
  .map_err(RbPolarsErr::from)?;
75
77
  Ok(lf.into())
@@ -87,7 +89,7 @@ impl RbLazyFrame {
87
89
  let cache = bool::try_convert(arguments[6])?;
88
90
  let overwrite_dtype = Option::<Vec<(String, Wrap<DataType>)>>::try_convert(arguments[7])?;
89
91
  let low_memory = bool::try_convert(arguments[8])?;
90
- let comment_char = Option::<String>::try_convert(arguments[9])?;
92
+ let comment_prefix = Option::<String>::try_convert(arguments[9])?;
91
93
  let quote_char = Option::<String>::try_convert(arguments[10])?;
92
94
  let null_values = Option::<Wrap<NullValues>>::try_convert(arguments[11])?;
93
95
  let infer_schema_length = Option::<usize>::try_convert(arguments[12])?;
@@ -95,18 +97,17 @@ impl RbLazyFrame {
95
97
  let rechunk = bool::try_convert(arguments[14])?;
96
98
  let skip_rows_after_header = usize::try_convert(arguments[15])?;
97
99
  let encoding = Wrap::<CsvEncoding>::try_convert(arguments[16])?;
98
- let row_count = Option::<(String, IdxSize)>::try_convert(arguments[17])?;
100
+ let row_index = Option::<(String, IdxSize)>::try_convert(arguments[17])?;
99
101
  let try_parse_dates = bool::try_convert(arguments[18])?;
100
102
  let eol_char = String::try_convert(arguments[19])?;
101
103
  // end arguments
102
104
 
103
105
  let null_values = null_values.map(|w| w.0);
104
- let comment_char = comment_char.map(|s| s.as_bytes()[0]);
105
106
  let quote_char = quote_char.map(|s| s.as_bytes()[0]);
106
107
  let separator = separator.as_bytes()[0];
107
108
  let eol_char = eol_char.as_bytes()[0];
108
109
 
109
- let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
110
+ let row_index = row_index.map(|(name, offset)| RowIndex { name, offset });
110
111
 
111
112
  let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
112
113
  overwrite_dtype
@@ -124,13 +125,13 @@ impl RbLazyFrame {
124
125
  .with_cache(cache)
125
126
  .with_dtype_overwrite(overwrite_dtype.as_ref())
126
127
  .low_memory(low_memory)
127
- .with_comment_char(comment_char)
128
+ .with_comment_prefix(comment_prefix.as_deref())
128
129
  .with_quote_char(quote_char)
129
130
  .with_end_of_line_char(eol_char)
130
131
  .with_rechunk(rechunk)
131
132
  .with_skip_rows_after_header(skip_rows_after_header)
132
133
  .with_encoding(encoding.0)
133
- .with_row_count(row_count)
134
+ .with_row_index(row_index)
134
135
  .with_try_parse_dates(try_parse_dates)
135
136
  .with_null_values(null_values);
136
137
 
@@ -148,18 +149,18 @@ impl RbLazyFrame {
148
149
  cache: bool,
149
150
  parallel: Wrap<ParallelStrategy>,
150
151
  rechunk: bool,
151
- row_count: Option<(String, IdxSize)>,
152
+ row_index: Option<(String, IdxSize)>,
152
153
  low_memory: bool,
153
154
  use_statistics: bool,
154
155
  hive_partitioning: bool,
155
156
  ) -> RbResult<Self> {
156
- let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
157
+ let row_index = row_index.map(|(name, offset)| RowIndex { name, offset });
157
158
  let args = ScanArgsParquet {
158
159
  n_rows,
159
160
  cache,
160
161
  parallel: parallel.0,
161
162
  rechunk,
162
- row_count,
163
+ row_index,
163
164
  low_memory,
164
165
  // TODO support cloud options
165
166
  cloud_options: None,
@@ -175,15 +176,15 @@ impl RbLazyFrame {
175
176
  n_rows: Option<usize>,
176
177
  cache: bool,
177
178
  rechunk: bool,
178
- row_count: Option<(String, IdxSize)>,
179
+ row_index: Option<(String, IdxSize)>,
179
180
  memory_map: bool,
180
181
  ) -> RbResult<Self> {
181
- let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
182
+ let row_index = row_index.map(|(name, offset)| RowIndex { name, offset });
182
183
  let args = ScanArgsIpc {
183
184
  n_rows,
184
185
  cache,
185
186
  rechunk,
186
- row_count,
187
+ row_index,
187
188
  memmap: memory_map,
188
189
  };
189
190
  let lf = LazyFrame::scan_ipc(path, args).map_err(RbPolarsErr::from)?;
@@ -217,20 +218,24 @@ impl RbLazyFrame {
217
218
  projection_pushdown: bool,
218
219
  simplify_expr: bool,
219
220
  slice_pushdown: bool,
220
- cse: bool,
221
+ comm_subplan_elim: bool,
222
+ comm_subexpr_elim: bool,
221
223
  allow_streaming: bool,
222
224
  _eager: bool,
223
225
  ) -> RbLazyFrame {
224
226
  let ldf = self.ldf.clone();
225
- let ldf = ldf
227
+ let mut ldf = ldf
226
228
  .with_type_coercion(type_coercion)
227
229
  .with_predicate_pushdown(predicate_pushdown)
228
230
  .with_simplify_expr(simplify_expr)
229
231
  .with_slice_pushdown(slice_pushdown)
230
- .with_comm_subplan_elim(cse)
231
232
  .with_streaming(allow_streaming)
232
233
  ._with_eager(_eager)
233
234
  .with_projection_pushdown(projection_pushdown);
235
+
236
+ ldf = ldf.with_comm_subplan_elim(comm_subplan_elim);
237
+ ldf = ldf.with_comm_subexpr_elim(comm_subexpr_elim);
238
+
234
239
  ldf.into()
235
240
  }
236
241
 
@@ -305,6 +310,75 @@ impl RbLazyFrame {
305
310
  Ok(())
306
311
  }
307
312
 
313
+ pub fn sink_ipc(
314
+ &self,
315
+ path: PathBuf,
316
+ compression: Option<Wrap<IpcCompression>>,
317
+ maintain_order: bool,
318
+ ) -> RbResult<()> {
319
+ let options = IpcWriterOptions {
320
+ compression: compression.map(|c| c.0),
321
+ maintain_order,
322
+ };
323
+
324
+ let ldf = self.ldf.clone();
325
+ ldf.sink_ipc(path, options).map_err(RbPolarsErr::from)?;
326
+ Ok(())
327
+ }
328
+
329
+ pub fn sink_csv(
330
+ &self,
331
+ path: PathBuf,
332
+ include_bom: bool,
333
+ include_header: bool,
334
+ separator: u8,
335
+ line_terminator: String,
336
+ quote_char: u8,
337
+ batch_size: Wrap<NonZeroUsize>,
338
+ datetime_format: Option<String>,
339
+ date_format: Option<String>,
340
+ time_format: Option<String>,
341
+ float_precision: Option<usize>,
342
+ null_value: Option<String>,
343
+ quote_style: Option<Wrap<QuoteStyle>>,
344
+ maintain_order: bool,
345
+ ) -> RbResult<()> {
346
+ let quote_style = quote_style.map_or(QuoteStyle::default(), |wrap| wrap.0);
347
+ let null_value = null_value.unwrap_or(SerializeOptions::default().null);
348
+
349
+ let serialize_options = SerializeOptions {
350
+ date_format,
351
+ time_format,
352
+ datetime_format,
353
+ float_precision,
354
+ separator,
355
+ quote_char,
356
+ null: null_value,
357
+ line_terminator,
358
+ quote_style,
359
+ };
360
+
361
+ let options = CsvWriterOptions {
362
+ include_bom,
363
+ include_header,
364
+ maintain_order,
365
+ batch_size: batch_size.0,
366
+ serialize_options,
367
+ };
368
+
369
+ let ldf = self.ldf.clone();
370
+ ldf.sink_csv(path, options).map_err(RbPolarsErr::from)?;
371
+ Ok(())
372
+ }
373
+
374
+ pub fn sink_json(&self, path: PathBuf, maintain_order: bool) -> RbResult<()> {
375
+ let options = JsonWriterOptions { maintain_order };
376
+
377
+ let ldf = self.ldf.clone();
378
+ ldf.sink_json(path, options).map_err(RbPolarsErr::from)?;
379
+ Ok(())
380
+ }
381
+
308
382
  pub fn fetch(&self, n_rows: usize) -> RbResult<RbDataFrame> {
309
383
  let ldf = self.ldf.clone();
310
384
  let df = ldf.fetch(n_rows).map_err(RbPolarsErr::from)?;
@@ -322,6 +396,12 @@ impl RbLazyFrame {
322
396
  Ok(ldf.select(exprs).into())
323
397
  }
324
398
 
399
+ pub fn select_seq(&self, exprs: RArray) -> RbResult<Self> {
400
+ let ldf = self.ldf.clone();
401
+ let exprs = rb_exprs_to_exprs(exprs)?;
402
+ Ok(ldf.select_seq(exprs).into())
403
+ }
404
+
325
405
  pub fn group_by(&self, by: RArray, maintain_order: bool) -> RbResult<RbLazyGroupBy> {
326
406
  let ldf = self.ldf.clone();
327
407
  let by = rb_exprs_to_exprs(by)?;
@@ -335,7 +415,7 @@ impl RbLazyFrame {
335
415
  })
336
416
  }
337
417
 
338
- pub fn group_by_rolling(
418
+ pub fn rolling(
339
419
  &self,
340
420
  index_column: &RbExpr,
341
421
  period: String,
@@ -460,6 +540,7 @@ impl RbLazyFrame {
460
540
  right_on: RArray,
461
541
  allow_parallel: bool,
462
542
  force_parallel: bool,
543
+ join_nulls: bool,
463
544
  how: Wrap<JoinType>,
464
545
  suffix: String,
465
546
  ) -> RbResult<Self> {
@@ -475,17 +556,28 @@ impl RbLazyFrame {
475
556
  .right_on(right_on)
476
557
  .allow_parallel(allow_parallel)
477
558
  .force_parallel(force_parallel)
559
+ .join_nulls(join_nulls)
478
560
  .how(how.0)
479
561
  .suffix(suffix)
480
562
  .finish()
481
563
  .into())
482
564
  }
483
565
 
566
+ pub fn with_column(&self, expr: &RbExpr) -> Self {
567
+ let ldf = self.ldf.clone();
568
+ ldf.with_column(expr.inner.clone()).into()
569
+ }
570
+
484
571
  pub fn with_columns(&self, exprs: RArray) -> RbResult<Self> {
485
572
  let ldf = self.ldf.clone();
486
573
  Ok(ldf.with_columns(rb_exprs_to_exprs(exprs)?).into())
487
574
  }
488
575
 
576
+ pub fn with_columns_seq(&self, exprs: RArray) -> RbResult<Self> {
577
+ let ldf = self.ldf.clone();
578
+ Ok(ldf.with_columns_seq(rb_exprs_to_exprs(exprs)?).into())
579
+ }
580
+
489
581
  pub fn rename(&self, existing: Vec<String>, new: Vec<String>) -> Self {
490
582
  let ldf = self.ldf.clone();
491
583
  ldf.rename(existing, new).into()
@@ -510,48 +602,58 @@ impl RbLazyFrame {
510
602
  ldf.fill_nan(fill_value.inner.clone()).into()
511
603
  }
512
604
 
513
- pub fn min(&self) -> Self {
605
+ pub fn min(&self) -> RbResult<Self> {
514
606
  let ldf = self.ldf.clone();
515
- ldf.min().into()
607
+ let out = ldf.min().map_err(RbPolarsErr::from)?;
608
+ Ok(out.into())
516
609
  }
517
610
 
518
- pub fn max(&self) -> Self {
611
+ pub fn max(&self) -> RbResult<Self> {
519
612
  let ldf = self.ldf.clone();
520
- ldf.max().into()
613
+ let out = ldf.max().map_err(RbPolarsErr::from)?;
614
+ Ok(out.into())
521
615
  }
522
616
 
523
- pub fn sum(&self) -> Self {
617
+ pub fn sum(&self) -> RbResult<Self> {
524
618
  let ldf = self.ldf.clone();
525
- ldf.sum().into()
619
+ let out = ldf.sum().map_err(RbPolarsErr::from)?;
620
+ Ok(out.into())
526
621
  }
527
622
 
528
- pub fn mean(&self) -> Self {
623
+ pub fn mean(&self) -> RbResult<Self> {
529
624
  let ldf = self.ldf.clone();
530
- ldf.mean().into()
625
+ let out = ldf.mean().map_err(RbPolarsErr::from)?;
626
+ Ok(out.into())
531
627
  }
532
628
 
533
- pub fn std(&self, ddof: u8) -> Self {
629
+ pub fn std(&self, ddof: u8) -> RbResult<Self> {
534
630
  let ldf = self.ldf.clone();
535
- ldf.std(ddof).into()
631
+ let out = ldf.std(ddof).map_err(RbPolarsErr::from)?;
632
+ Ok(out.into())
536
633
  }
537
634
 
538
- pub fn var(&self, ddof: u8) -> Self {
635
+ pub fn var(&self, ddof: u8) -> RbResult<Self> {
539
636
  let ldf = self.ldf.clone();
540
- ldf.var(ddof).into()
637
+ let out = ldf.var(ddof).map_err(RbPolarsErr::from)?;
638
+ Ok(out.into())
541
639
  }
542
640
 
543
- pub fn median(&self) -> Self {
641
+ pub fn median(&self) -> RbResult<Self> {
544
642
  let ldf = self.ldf.clone();
545
- ldf.median().into()
643
+ let out = ldf.median().map_err(RbPolarsErr::from)?;
644
+ Ok(out.into())
546
645
  }
547
646
 
548
647
  pub fn quantile(
549
648
  &self,
550
649
  quantile: &RbExpr,
551
650
  interpolation: Wrap<QuantileInterpolOptions>,
552
- ) -> Self {
651
+ ) -> RbResult<Self> {
553
652
  let ldf = self.ldf.clone();
554
- ldf.quantile(quantile.inner.clone(), interpolation.0).into()
653
+ let out = ldf
654
+ .quantile(quantile.inner.clone(), interpolation.0)
655
+ .map_err(RbPolarsErr::from)?;
656
+ Ok(out.into())
555
657
  }
556
658
 
557
659
  pub fn explode(&self, column: RArray) -> RbResult<Self> {
@@ -560,6 +662,11 @@ impl RbLazyFrame {
560
662
  Ok(ldf.explode(column).into())
561
663
  }
562
664
 
665
+ pub fn null_count(&self) -> Self {
666
+ let ldf = self.ldf.clone();
667
+ ldf.null_count().into()
668
+ }
669
+
563
670
  pub fn unique(
564
671
  &self,
565
672
  maintain_order: bool,
@@ -610,14 +717,18 @@ impl RbLazyFrame {
610
717
  ldf.melt(args).into()
611
718
  }
612
719
 
613
- pub fn with_row_count(&self, name: String, offset: Option<IdxSize>) -> Self {
720
+ pub fn with_row_index(&self, name: String, offset: Option<IdxSize>) -> Self {
614
721
  let ldf = self.ldf.clone();
615
- ldf.with_row_count(&name, offset).into()
722
+ ldf.with_row_index(&name, offset).into()
616
723
  }
617
724
 
618
- pub fn drop_columns(&self, cols: Vec<String>) -> Self {
725
+ pub fn drop(&self, cols: Vec<String>) -> Self {
619
726
  let ldf = self.ldf.clone();
620
- ldf.drop_columns(cols).into()
727
+ ldf.drop(cols).into()
728
+ }
729
+
730
+ pub fn cast_all(&self, dtype: Wrap<DataType>, strict: bool) -> Self {
731
+ self.ldf.clone().cast_all(dtype.0, strict).into()
621
732
  }
622
733
 
623
734
  pub fn clone(&self) -> Self {
@@ -659,4 +770,18 @@ impl RbLazyFrame {
659
770
  pub fn width(&self) -> RbResult<usize> {
660
771
  Ok(self.get_schema()?.len())
661
772
  }
773
+
774
+ pub fn count(&self) -> Self {
775
+ let ldf = self.ldf.clone();
776
+ ldf.count().into()
777
+ }
778
+
779
+ pub fn merge_sorted(&self, other: &Self, key: String) -> RbResult<Self> {
780
+ let out = self
781
+ .ldf
782
+ .clone()
783
+ .merge_sorted(other.ldf.clone(), &key)
784
+ .map_err(RbPolarsErr::from)?;
785
+ Ok(out.into())
786
+ }
662
787
  }