polars-df 0.21.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +27 -0
  3. data/Cargo.lock +55 -48
  4. data/Cargo.toml +3 -0
  5. data/README.md +12 -0
  6. data/ext/polars/Cargo.toml +22 -11
  7. data/ext/polars/src/batched_csv.rs +4 -4
  8. data/ext/polars/src/catalog/unity.rs +96 -94
  9. data/ext/polars/src/conversion/any_value.rs +26 -30
  10. data/ext/polars/src/conversion/chunked_array.rs +32 -28
  11. data/ext/polars/src/conversion/datetime.rs +11 -0
  12. data/ext/polars/src/conversion/mod.rs +307 -34
  13. data/ext/polars/src/dataframe/construction.rs +4 -3
  14. data/ext/polars/src/dataframe/export.rs +17 -15
  15. data/ext/polars/src/dataframe/general.rs +15 -12
  16. data/ext/polars/src/dataframe/io.rs +1 -2
  17. data/ext/polars/src/dataframe/mod.rs +25 -1
  18. data/ext/polars/src/dataframe/serde.rs +23 -8
  19. data/ext/polars/src/exceptions.rs +8 -4
  20. data/ext/polars/src/expr/array.rs +73 -4
  21. data/ext/polars/src/expr/binary.rs +26 -1
  22. data/ext/polars/src/expr/bitwise.rs +39 -0
  23. data/ext/polars/src/expr/categorical.rs +20 -0
  24. data/ext/polars/src/expr/datatype.rs +24 -1
  25. data/ext/polars/src/expr/datetime.rs +58 -14
  26. data/ext/polars/src/expr/general.rs +87 -15
  27. data/ext/polars/src/expr/list.rs +32 -24
  28. data/ext/polars/src/expr/meta.rs +15 -6
  29. data/ext/polars/src/expr/mod.rs +3 -0
  30. data/ext/polars/src/expr/name.rs +19 -14
  31. data/ext/polars/src/expr/rolling.rs +20 -0
  32. data/ext/polars/src/expr/serde.rs +28 -0
  33. data/ext/polars/src/expr/string.rs +64 -10
  34. data/ext/polars/src/expr/struct.rs +9 -1
  35. data/ext/polars/src/file.rs +15 -9
  36. data/ext/polars/src/functions/business.rs +0 -1
  37. data/ext/polars/src/functions/io.rs +25 -3
  38. data/ext/polars/src/functions/lazy.rs +11 -6
  39. data/ext/polars/src/functions/meta.rs +3 -3
  40. data/ext/polars/src/functions/string_cache.rs +3 -3
  41. data/ext/polars/src/interop/arrow/to_ruby.rs +3 -3
  42. data/ext/polars/src/interop/numo/numo_rs.rs +4 -3
  43. data/ext/polars/src/io/mod.rs +6 -0
  44. data/ext/polars/src/lazyframe/general.rs +59 -9
  45. data/ext/polars/src/lazyframe/mod.rs +16 -1
  46. data/ext/polars/src/lazyframe/optflags.rs +58 -0
  47. data/ext/polars/src/lazyframe/serde.rs +27 -3
  48. data/ext/polars/src/lib.rs +261 -19
  49. data/ext/polars/src/map/dataframe.rs +20 -17
  50. data/ext/polars/src/map/lazy.rs +6 -5
  51. data/ext/polars/src/map/series.rs +8 -7
  52. data/ext/polars/src/on_startup.rs +12 -5
  53. data/ext/polars/src/rb_modules.rs +2 -2
  54. data/ext/polars/src/series/aggregation.rs +85 -28
  55. data/ext/polars/src/series/construction.rs +1 -0
  56. data/ext/polars/src/series/export.rs +37 -33
  57. data/ext/polars/src/series/general.rs +120 -21
  58. data/ext/polars/src/series/mod.rs +29 -4
  59. data/lib/polars/array_expr.rb +382 -3
  60. data/lib/polars/array_name_space.rb +281 -0
  61. data/lib/polars/binary_expr.rb +67 -0
  62. data/lib/polars/binary_name_space.rb +43 -0
  63. data/lib/polars/cat_expr.rb +224 -0
  64. data/lib/polars/cat_name_space.rb +138 -0
  65. data/lib/polars/config.rb +2 -2
  66. data/lib/polars/convert.rb +6 -6
  67. data/lib/polars/data_frame.rb +794 -27
  68. data/lib/polars/data_type_expr.rb +52 -0
  69. data/lib/polars/data_types.rb +26 -5
  70. data/lib/polars/date_time_expr.rb +252 -1
  71. data/lib/polars/date_time_name_space.rb +299 -0
  72. data/lib/polars/expr.rb +1248 -206
  73. data/lib/polars/functions/business.rb +95 -0
  74. data/lib/polars/functions/datatype.rb +21 -0
  75. data/lib/polars/functions/lazy.rb +14 -1
  76. data/lib/polars/io/csv.rb +1 -1
  77. data/lib/polars/io/iceberg.rb +27 -0
  78. data/lib/polars/io/json.rb +4 -4
  79. data/lib/polars/io/ndjson.rb +4 -4
  80. data/lib/polars/io/parquet.rb +32 -7
  81. data/lib/polars/io/scan_options.rb +4 -1
  82. data/lib/polars/lazy_frame.rb +1028 -28
  83. data/lib/polars/list_expr.rb +217 -17
  84. data/lib/polars/list_name_space.rb +231 -22
  85. data/lib/polars/meta_expr.rb +89 -0
  86. data/lib/polars/name_expr.rb +36 -0
  87. data/lib/polars/query_opt_flags.rb +50 -0
  88. data/lib/polars/scan_cast_options.rb +20 -1
  89. data/lib/polars/schema.rb +79 -3
  90. data/lib/polars/selector.rb +72 -0
  91. data/lib/polars/selectors.rb +3 -3
  92. data/lib/polars/series.rb +1053 -54
  93. data/lib/polars/string_expr.rb +436 -32
  94. data/lib/polars/string_name_space.rb +736 -50
  95. data/lib/polars/struct_expr.rb +103 -0
  96. data/lib/polars/struct_name_space.rb +19 -1
  97. data/lib/polars/utils/serde.rb +17 -0
  98. data/lib/polars/utils/various.rb +22 -1
  99. data/lib/polars/utils.rb +5 -1
  100. data/lib/polars/version.rb +1 -1
  101. data/lib/polars.rb +6 -0
  102. metadata +11 -1
@@ -1,4 +1,4 @@
1
- use magnus::{IntoValue, RArray, RHash, TryConvert, Value, r_hash::ForEach, typed_data::Obj};
1
+ use magnus::{IntoValue, RArray, RHash, Ruby, TryConvert, Value, r_hash::ForEach, typed_data::Obj};
2
2
  use polars::io::{HiveOptions, RowIndex};
3
3
  use polars::lazy::frame::LazyFrame;
4
4
  use polars::prelude::*;
@@ -30,13 +30,12 @@ impl RbLazyFrame {
30
30
  source: Option<Value>,
31
31
  sources: Wrap<ScanSources>,
32
32
  infer_schema_length: Option<usize>,
33
- batch_size: Option<Wrap<NonZeroUsize>>,
33
+ batch_size: Option<NonZeroUsize>,
34
34
  n_rows: Option<usize>,
35
35
  low_memory: bool,
36
36
  rechunk: bool,
37
37
  row_index: Option<(String, IdxSize)>,
38
38
  ) -> RbResult<Self> {
39
- let batch_size = batch_size.map(|v| v.0);
40
39
  let row_index = row_index.map(|(name, offset)| RowIndex {
41
40
  name: name.into(),
42
41
  offset,
@@ -325,6 +324,30 @@ impl RbLazyFrame {
325
324
  .into())
326
325
  }
327
326
 
327
+ pub fn top_k(&self, k: IdxSize, by: RArray, reverse: Vec<bool>) -> RbResult<Self> {
328
+ let ldf = self.ldf.borrow().clone();
329
+ let exprs = rb_exprs_to_exprs(by)?;
330
+ Ok(ldf
331
+ .top_k(
332
+ k,
333
+ exprs,
334
+ SortMultipleOptions::new().with_order_descending_multi(reverse),
335
+ )
336
+ .into())
337
+ }
338
+
339
+ pub fn bottom_k(&self, k: IdxSize, by: RArray, reverse: Vec<bool>) -> RbResult<Self> {
340
+ let ldf = self.ldf.borrow().clone();
341
+ let exprs = rb_exprs_to_exprs(by)?;
342
+ Ok(ldf
343
+ .bottom_k(
344
+ k,
345
+ exprs,
346
+ SortMultipleOptions::new().with_order_descending_multi(reverse),
347
+ )
348
+ .into())
349
+ }
350
+
328
351
  pub fn cache(&self) -> Self {
329
352
  let ldf = self.ldf.borrow().clone();
330
353
  ldf.cache().into()
@@ -420,7 +443,7 @@ impl RbLazyFrame {
420
443
  let separator = u8::try_convert(arguments[3])?;
421
444
  let line_terminator = String::try_convert(arguments[4])?;
422
445
  let quote_char = u8::try_convert(arguments[5])?;
423
- let batch_size = Wrap::<NonZeroUsize>::try_convert(arguments[6])?;
446
+ let batch_size = NonZeroUsize::try_convert(arguments[6])?;
424
447
  let datetime_format = Option::<String>::try_convert(arguments[7])?;
425
448
  let date_format = Option::<String>::try_convert(arguments[8])?;
426
449
  let time_format = Option::<String>::try_convert(arguments[9])?;
@@ -453,7 +476,7 @@ impl RbLazyFrame {
453
476
  let options = CsvWriterOptions {
454
477
  include_bom,
455
478
  include_header,
456
- batch_size: batch_size.0,
479
+ batch_size,
457
480
  serialize_options,
458
481
  };
459
482
 
@@ -509,6 +532,11 @@ impl RbLazyFrame {
509
532
  ldf.filter(predicate.inner.clone()).into()
510
533
  }
511
534
 
535
+ pub fn remove(&self, predicate: &RbExpr) -> Self {
536
+ let ldf = self.ldf.borrow().clone();
537
+ ldf.remove(predicate.inner.clone()).into()
538
+ }
539
+
512
540
  pub fn select(&self, exprs: RArray) -> RbResult<Self> {
513
541
  let ldf = self.ldf.borrow().clone();
514
542
  let exprs = rb_exprs_to_exprs(exprs)?;
@@ -701,6 +729,20 @@ impl RbLazyFrame {
701
729
  .into())
702
730
  }
703
731
 
732
+ pub fn join_where(&self, other: &Self, predicates: RArray, suffix: String) -> RbResult<Self> {
733
+ let ldf = self.ldf.borrow().clone();
734
+ let other = other.ldf.borrow().clone();
735
+
736
+ let predicates = rb_exprs_to_exprs(predicates)?;
737
+
738
+ Ok(ldf
739
+ .join_builder()
740
+ .with(other)
741
+ .suffix(suffix)
742
+ .join_where(predicates)
743
+ .into())
744
+ }
745
+
704
746
  pub fn with_column(&self, expr: &RbExpr) -> Self {
705
747
  let ldf = self.ldf.borrow().clone();
706
748
  ldf.with_column(expr.inner.clone()).into()
@@ -816,6 +858,14 @@ impl RbLazyFrame {
816
858
  .into())
817
859
  }
818
860
 
861
+ pub fn drop_nans(&self, subset: Option<&RbSelector>) -> Self {
862
+ self.ldf
863
+ .borrow()
864
+ .clone()
865
+ .drop_nans(subset.map(|e| e.inner.clone()))
866
+ .into()
867
+ }
868
+
819
869
  pub fn drop_nulls(&self, subset: Option<&RbSelector>) -> Self {
820
870
  self.ldf
821
871
  .borrow()
@@ -880,19 +930,19 @@ impl RbLazyFrame {
880
930
  self.ldf.borrow().clone().into()
881
931
  }
882
932
 
883
- pub fn collect_schema(&self) -> RbResult<RHash> {
884
- let schema = self
933
+ pub fn collect_schema(ruby: &Ruby, rb_self: &Self) -> RbResult<RHash> {
934
+ let schema = rb_self
885
935
  .ldf
886
936
  .borrow_mut()
887
937
  .collect_schema()
888
938
  .map_err(RbPolarsErr::from)?;
889
939
 
890
- let schema_dict = RHash::new();
940
+ let schema_dict = ruby.hash_new();
891
941
  schema.iter_fields().for_each(|fld| {
892
942
  schema_dict
893
943
  .aset::<String, Value>(
894
944
  fld.name().to_string(),
895
- Wrap(fld.dtype().clone()).into_value(),
945
+ Wrap(fld.dtype().clone()).into_value_with(ruby),
896
946
  )
897
947
  .unwrap();
898
948
  });
@@ -1,8 +1,9 @@
1
1
  mod general;
2
+ mod optflags;
2
3
  mod serde;
3
4
  mod sink;
4
5
 
5
- use polars::lazy::frame::LazyFrame;
6
+ use polars::prelude::{LazyFrame, OptFlags};
6
7
  pub use sink::SinkTarget;
7
8
  use std::cell::RefCell;
8
9
 
@@ -12,6 +13,12 @@ pub struct RbLazyFrame {
12
13
  pub ldf: RefCell<LazyFrame>,
13
14
  }
14
15
 
16
+ #[magnus::wrap(class = "Polars::RbOptFlags")]
17
+ #[derive(Clone)]
18
+ pub struct RbOptFlags {
19
+ pub inner: RefCell<OptFlags>,
20
+ }
21
+
15
22
  impl From<LazyFrame> for RbLazyFrame {
16
23
  fn from(ldf: LazyFrame) -> Self {
17
24
  RbLazyFrame {
@@ -19,3 +26,11 @@ impl From<LazyFrame> for RbLazyFrame {
19
26
  }
20
27
  }
21
28
  }
29
+
30
+ impl From<OptFlags> for RbOptFlags {
31
+ fn from(inner: OptFlags) -> Self {
32
+ RbOptFlags {
33
+ inner: RefCell::new(inner),
34
+ }
35
+ }
36
+ }
@@ -0,0 +1,58 @@
1
+ use polars::prelude::OptFlags;
2
+
3
+ use super::RbOptFlags;
4
+
5
+ macro_rules! flag_getter_setters {
6
+ ($(($flag:ident, $getter:ident, $setter:ident, clear=$clear:literal))+) => {
7
+ impl RbOptFlags {
8
+ pub fn empty() -> Self {
9
+ Self {
10
+ inner: OptFlags::empty().into()
11
+ }
12
+ }
13
+
14
+ #[allow(clippy::should_implement_trait)]
15
+ pub fn default() -> Self {
16
+ Self { inner: OptFlags::default().into() }
17
+ }
18
+
19
+ pub fn no_optimizations(&self) {
20
+ $(if $clear {
21
+ self.inner.borrow_mut().remove(OptFlags::$flag);
22
+ })+
23
+ }
24
+
25
+ pub fn copy(&self) -> Self {
26
+ Self { inner: self.inner.clone() }
27
+ }
28
+
29
+ $(
30
+ pub fn $getter(&self) -> bool {
31
+ self.inner.borrow().contains(OptFlags::$flag)
32
+ }
33
+ pub fn $setter(&self, value: bool) {
34
+ self.inner.borrow_mut().set(OptFlags::$flag, value)
35
+ }
36
+ )+
37
+ }
38
+ };
39
+ }
40
+
41
+ flag_getter_setters! {
42
+ (TYPE_COERCION, get_type_coercion, set_type_coercion, clear=false)
43
+ (TYPE_CHECK, get_type_check, set_type_check, clear=false)
44
+
45
+ (PROJECTION_PUSHDOWN, get_projection_pushdown, set_projection_pushdown, clear=true)
46
+ (PREDICATE_PUSHDOWN, get_predicate_pushdown, set_predicate_pushdown, clear=true)
47
+ (CLUSTER_WITH_COLUMNS, get_cluster_with_columns, set_cluster_with_columns, clear=true)
48
+ (SIMPLIFY_EXPR, get_simplify_expression, set_simplify_expression, clear=true)
49
+ (SLICE_PUSHDOWN, get_slice_pushdown, set_slice_pushdown, clear=true)
50
+ (COMM_SUBPLAN_ELIM, get_comm_subplan_elim, set_comm_subplan_elim, clear=true)
51
+ (COMM_SUBEXPR_ELIM, get_comm_subexpr_elim, set_comm_subexpr_elim, clear=true)
52
+ (COLLAPSE_JOINS, get_collapse_joins, set_collapse_joins, clear=true)
53
+ (CHECK_ORDER_OBSERVE, get_check_order_observe, set_check_order_observe, clear=true)
54
+ (FAST_PROJECTION, get_fast_projection, set_fast_projection, clear=true)
55
+
56
+ (EAGER, get_eager, set_eager, clear=true)
57
+ (NEW_STREAMING, get_streaming, set_streaming, clear=true)
58
+ }
@@ -1,14 +1,38 @@
1
+ use std::io::Read;
2
+ #[cfg(feature = "serialize_binary")]
3
+ use std::io::{BufReader, BufWriter};
4
+
1
5
  use magnus::Value;
2
6
  use polars::lazy::frame::LazyFrame;
3
7
  use polars::prelude::*;
4
- use std::io::Read;
5
8
 
6
9
  use crate::file::get_file_like;
10
+ #[cfg(feature = "serialize_binary")]
11
+ use crate::utils::to_rb_err;
7
12
  use crate::{RbLazyFrame, RbResult, RbValueError};
8
13
 
9
14
  impl RbLazyFrame {
10
- // TODO change to serialize_json
11
- pub fn read_json(rb_f: Value) -> RbResult<Self> {
15
+ #[cfg(feature = "serialize_binary")]
16
+ pub fn serialize_binary(&self, rb_f: Value) -> RbResult<()> {
17
+ let file = get_file_like(rb_f, true)?;
18
+ let writer = BufWriter::new(file);
19
+ self.ldf
20
+ .borrow()
21
+ .logical_plan
22
+ .serialize_versioned(writer, Default::default())
23
+ .map_err(to_rb_err)
24
+ }
25
+
26
+ #[cfg(feature = "serialize_binary")]
27
+ pub fn deserialize_binary(rb_f: Value) -> RbResult<Self> {
28
+ let file = get_file_like(rb_f, false)?;
29
+ let reader = BufReader::new(file);
30
+
31
+ let lp: DslPlan = DslPlan::deserialize_versioned(reader).map_err(to_rb_err)?;
32
+ Ok(LazyFrame::from(lp).into())
33
+ }
34
+
35
+ pub fn deserialize_json(rb_f: Value) -> RbResult<Self> {
12
36
  // it is faster to first read to memory and then parse: https://github.com/serde-rs/json/issues/160
13
37
  // so don't bother with files.
14
38
  let mut json = String::new();