polars-df 0.25.0 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +33 -0
  3. data/Cargo.lock +270 -97
  4. data/LICENSE.txt +1 -1
  5. data/README.md +1 -3
  6. data/ext/polars/Cargo.toml +19 -18
  7. data/ext/polars/src/catalog/unity.rs +15 -20
  8. data/ext/polars/src/conversion/any_value.rs +53 -29
  9. data/ext/polars/src/conversion/chunked_array.rs +58 -56
  10. data/ext/polars/src/conversion/datetime.rs +58 -7
  11. data/ext/polars/src/conversion/mod.rs +200 -150
  12. data/ext/polars/src/dataframe/export.rs +15 -12
  13. data/ext/polars/src/dataframe/general.rs +25 -7
  14. data/ext/polars/src/dataframe/map.rs +6 -4
  15. data/ext/polars/src/error.rs +1 -1
  16. data/ext/polars/src/expr/array.rs +0 -24
  17. data/ext/polars/src/expr/datatype.rs +13 -3
  18. data/ext/polars/src/expr/datetime.rs +4 -4
  19. data/ext/polars/src/expr/general.rs +35 -15
  20. data/ext/polars/src/expr/list.rs +0 -26
  21. data/ext/polars/src/expr/rolling.rs +24 -0
  22. data/ext/polars/src/functions/business.rs +2 -2
  23. data/ext/polars/src/functions/io.rs +4 -3
  24. data/ext/polars/src/functions/lazy.rs +65 -46
  25. data/ext/polars/src/functions/meta.rs +6 -5
  26. data/ext/polars/src/functions/mod.rs +0 -1
  27. data/ext/polars/src/functions/range.rs +13 -0
  28. data/ext/polars/src/functions/utils.rs +4 -2
  29. data/ext/polars/src/interop/arrow/mod.rs +4 -2
  30. data/ext/polars/src/interop/arrow/to_rb.rs +1 -1
  31. data/ext/polars/src/interop/numo/to_numo_series.rs +26 -25
  32. data/ext/polars/src/io/scan_options.rs +6 -3
  33. data/ext/polars/src/io/sink_options.rs +2 -0
  34. data/ext/polars/src/lazyframe/general.rs +243 -17
  35. data/ext/polars/src/lazyframe/optflags.rs +2 -1
  36. data/ext/polars/src/lib.rs +39 -35
  37. data/ext/polars/src/map/lazy.rs +5 -2
  38. data/ext/polars/src/map/series.rs +19 -18
  39. data/ext/polars/src/on_startup.rs +25 -6
  40. data/ext/polars/src/ruby/numo.rs +3 -4
  41. data/ext/polars/src/ruby/plan_callback.rs +1 -4
  42. data/ext/polars/src/ruby/rb_modules.rs +2 -4
  43. data/ext/polars/src/ruby/ruby_udf.rs +7 -9
  44. data/ext/polars/src/ruby/utils.rs +12 -1
  45. data/ext/polars/src/series/aggregation.rs +13 -1
  46. data/ext/polars/src/series/construction.rs +31 -50
  47. data/ext/polars/src/series/export.rs +33 -38
  48. data/ext/polars/src/series/general.rs +6 -6
  49. data/ext/polars/src/series/map.rs +3 -2
  50. data/ext/polars/src/series/scatter.rs +4 -4
  51. data/ext/polars/src/utils.rs +31 -7
  52. data/lib/polars/array_expr.rb +23 -7
  53. data/lib/polars/array_name_space.rb +16 -2
  54. data/lib/polars/binary_name_space.rb +32 -0
  55. data/lib/polars/collect_batches.rb +4 -0
  56. data/lib/polars/data_frame.rb +144 -11
  57. data/lib/polars/data_type_group.rb +5 -0
  58. data/lib/polars/date_time_expr.rb +91 -3
  59. data/lib/polars/date_time_name_space.rb +7 -1
  60. data/lib/polars/expr.rb +247 -44
  61. data/lib/polars/functions/business.rb +2 -2
  62. data/lib/polars/functions/datatype.rb +30 -0
  63. data/lib/polars/functions/eager.rb +80 -7
  64. data/lib/polars/functions/lazy.rb +97 -2
  65. data/lib/polars/functions/range/linear_space.rb +118 -0
  66. data/lib/polars/io/csv.rb +27 -5
  67. data/lib/polars/io/database.rb +2 -3
  68. data/lib/polars/io/ipc.rb +2 -2
  69. data/lib/polars/io/lines.rb +172 -0
  70. data/lib/polars/io/parquet.rb +1 -1
  71. data/lib/polars/io/sink_options.rb +5 -2
  72. data/lib/polars/lazy_frame.rb +517 -14
  73. data/lib/polars/list_expr.rb +21 -7
  74. data/lib/polars/list_name_space.rb +16 -2
  75. data/lib/polars/query_opt_flags.rb +23 -5
  76. data/lib/polars/selectors.rb +2 -2
  77. data/lib/polars/series.rb +176 -19
  78. data/lib/polars/sql_context.rb +2 -2
  79. data/lib/polars/string_cache.rb +19 -72
  80. data/lib/polars/string_expr.rb +1 -7
  81. data/lib/polars/string_name_space.rb +1 -7
  82. data/lib/polars/utils/construction/series.rb +24 -39
  83. data/lib/polars/utils/convert.rb +16 -6
  84. data/lib/polars/utils/parse.rb +7 -0
  85. data/lib/polars/utils/reduce_balanced.rb +43 -0
  86. data/lib/polars/utils/various.rb +5 -0
  87. data/lib/polars/version.rb +1 -1
  88. data/lib/polars.rb +2 -1
  89. metadata +4 -17
  90. data/ext/polars/src/functions/string_cache.rs +0 -24
@@ -1,10 +1,11 @@
1
1
  use magnus::prelude::*;
2
- use magnus::{RHash, Value};
2
+ use magnus::{RHash, Ruby, Value};
3
3
  use polars::prelude::{ArrowDataType, DataType};
4
4
  use polars_error::polars_err;
5
5
 
6
6
  use crate::interop::arrow::to_rust::normalize_arrow_fields;
7
7
  use crate::prelude::Wrap;
8
+ use crate::ruby::utils::TryIntoValue;
8
9
  use crate::series::import_schema_rbcapsule;
9
10
  use crate::utils::to_rb_err;
10
11
  use crate::{RbResult, RbValueError};
@@ -16,6 +17,7 @@ pub fn init_polars_schema_from_arrow_c_schema(
16
17
  polars_schema: RHash,
17
18
  schema_object: Value,
18
19
  ) -> RbResult<()> {
20
+ let ruby = &Ruby::get_with(polars_schema);
19
21
  let schema_capsule = schema_object.funcall("arrow_c_schema", ())?;
20
22
 
21
23
  let field = import_schema_rbcapsule(schema_capsule)?;
@@ -33,7 +35,7 @@ pub fn init_polars_schema_from_arrow_c_schema(
33
35
  let dtype = DataType::from_arrow_field(&field);
34
36
 
35
37
  let name = field.name.as_str();
36
- let dtype = Wrap(dtype);
38
+ let dtype = Wrap(dtype).try_into_value_with(ruby)?;
37
39
 
38
40
  if polars_schema.get(name).is_some() {
39
41
  return Err(to_rb_err(polars_err!(
@@ -11,7 +11,7 @@ use crate::RbResult;
11
11
 
12
12
  #[magnus::wrap(class = "Polars::ArrowArrayStream")]
13
13
  pub struct RbArrowArrayStream {
14
- stream: ffi::ArrowArrayStream,
14
+ pub(crate) stream: ffi::ArrowArrayStream,
15
15
  }
16
16
 
17
17
  impl RbArrowArrayStream {
@@ -1,4 +1,4 @@
1
- use magnus::Value;
1
+ use magnus::{Ruby, Value};
2
2
  use num_traits::{Float, NumCast};
3
3
  use polars_core::prelude::*;
4
4
 
@@ -10,34 +10,35 @@ use crate::series::RbSeries;
10
10
 
11
11
  impl RbSeries {
12
12
  /// Convert this Series to a Numo array.
13
- pub fn to_numo(&self) -> RbResult<Value> {
14
- series_to_numo(&self.series.read())
13
+ pub fn to_numo(rb: &Ruby, self_: &Self) -> RbResult<Value> {
14
+ series_to_numo(rb, &self_.series.read())
15
15
  }
16
16
  }
17
17
 
18
18
  /// Convert a Series to a Numo array.
19
- fn series_to_numo(s: &Series) -> RbResult<Value> {
20
- series_to_numo_with_copy(s)
19
+ fn series_to_numo(rb: &Ruby, s: &Series) -> RbResult<Value> {
20
+ series_to_numo_with_copy(rb, s)
21
21
  }
22
22
 
23
23
  /// Convert a Series to a Numo array, copying data in the process.
24
- fn series_to_numo_with_copy(s: &Series) -> RbResult<Value> {
24
+ fn series_to_numo_with_copy(rb: &Ruby, s: &Series) -> RbResult<Value> {
25
25
  use DataType::*;
26
26
  match s.dtype() {
27
- Int8 => numeric_series_to_numpy::<Int8Type, f32>(s),
28
- Int16 => numeric_series_to_numpy::<Int16Type, f32>(s),
29
- Int32 => numeric_series_to_numpy::<Int32Type, f64>(s),
30
- Int64 => numeric_series_to_numpy::<Int64Type, f64>(s),
31
- UInt8 => numeric_series_to_numpy::<UInt8Type, f32>(s),
32
- UInt16 => numeric_series_to_numpy::<UInt16Type, f32>(s),
33
- UInt32 => numeric_series_to_numpy::<UInt32Type, f64>(s),
34
- UInt64 => numeric_series_to_numpy::<UInt64Type, f64>(s),
35
- Float32 => numeric_series_to_numpy::<Float32Type, f32>(s),
36
- Float64 => numeric_series_to_numpy::<Float64Type, f64>(s),
37
- Boolean => boolean_series_to_numo(s),
27
+ Int8 => numeric_series_to_numo::<Int8Type, f32>(rb, s),
28
+ Int16 => numeric_series_to_numo::<Int16Type, f32>(rb, s),
29
+ Int32 => numeric_series_to_numo::<Int32Type, f64>(rb, s),
30
+ Int64 => numeric_series_to_numo::<Int64Type, f64>(rb, s),
31
+ UInt8 => numeric_series_to_numo::<UInt8Type, f32>(rb, s),
32
+ UInt16 => numeric_series_to_numo::<UInt16Type, f32>(rb, s),
33
+ UInt32 => numeric_series_to_numo::<UInt32Type, f64>(rb, s),
34
+ UInt64 => numeric_series_to_numo::<UInt64Type, f64>(rb, s),
35
+ Float32 => numeric_series_to_numo::<Float32Type, f32>(rb, s),
36
+ Float64 => numeric_series_to_numo::<Float64Type, f64>(rb, s),
37
+ Boolean => boolean_series_to_numo(rb, s),
38
38
  String => {
39
39
  let ca = s.str().unwrap();
40
- RbArray1::from_iter(ca)
40
+ let values = ca.iter();
41
+ RbArray1::from_iter(rb, values)
41
42
  }
42
43
  dt => {
43
44
  raise_err!(
@@ -49,7 +50,7 @@ fn series_to_numo_with_copy(s: &Series) -> RbResult<Value> {
49
50
  }
50
51
 
51
52
  /// Convert numeric types to f32 or f64 with NaN representing a null value.
52
- fn numeric_series_to_numpy<T, U>(s: &Series) -> RbResult<Value>
53
+ fn numeric_series_to_numo<T, U>(rb: &Ruby, s: &Series) -> RbResult<Value>
53
54
  where
54
55
  T: PolarsNumericType,
55
56
  T::Native: Element,
@@ -58,25 +59,25 @@ where
58
59
  let ca: &ChunkedArray<T> = s.as_ref().as_ref();
59
60
  if s.null_count() == 0 {
60
61
  let values = ca.into_no_null_iter();
61
- RbArray1::<T::Native>::from_iter(values)
62
+ RbArray1::<T::Native>::from_iter(rb, values)
62
63
  } else {
63
64
  let mapper = |opt_v: Option<T::Native>| match opt_v {
64
65
  Some(v) => NumCast::from(v).unwrap(),
65
66
  None => U::nan(),
66
67
  };
67
68
  let values = ca.iter().map(mapper);
68
- RbArray1::from_iter(values)
69
+ RbArray1::from_iter(rb, values)
69
70
  }
70
71
  }
71
72
 
72
73
  /// Convert booleans to bit if no nulls are present, otherwise convert to objects.
73
- fn boolean_series_to_numo(s: &Series) -> RbResult<Value> {
74
+ fn boolean_series_to_numo(rb: &Ruby, s: &Series) -> RbResult<Value> {
74
75
  let ca = s.bool().unwrap();
75
76
  if s.null_count() == 0 {
76
- let values = ca.into_no_null_iter();
77
- RbArray1::<bool>::from_iter(values)
77
+ let values = ca.no_null_iter();
78
+ RbArray1::<bool>::from_iter(rb, values)
78
79
  } else {
79
80
  let values = ca.iter();
80
- RbArray1::from_iter(values)
81
+ RbArray1::from_iter(rb, values)
81
82
  }
82
83
  }
@@ -11,9 +11,9 @@ use polars_io::{HiveOptions, RowIndex};
11
11
  use polars_utils::IdxSize;
12
12
  use polars_utils::slice_enum::Slice;
13
13
 
14
- use crate::RbResult;
15
14
  use crate::io::cloud_options::OptRbCloudOptions;
16
15
  use crate::prelude::Wrap;
16
+ use crate::{RbDataFrame, RbResult};
17
17
 
18
18
  /// Interface to `class ScanOptions` on the Ruby side
19
19
  pub struct RbScanOptions(Value);
@@ -25,8 +25,11 @@ impl TryConvert for RbScanOptions {
25
25
  }
26
26
 
27
27
  impl TryConvert for Wrap<TableStatistics> {
28
- fn try_convert(_ob: Value) -> RbResult<Self> {
29
- todo!();
28
+ fn try_convert(ob: Value) -> RbResult<Self> {
29
+ let attr: Value = ob.funcall("_df", ())?;
30
+ Ok(Wrap(TableStatistics(Arc::new(
31
+ <&RbDataFrame>::try_convert(attr)?.clone().df.into_inner(),
32
+ ))))
30
33
  }
31
34
  }
32
35
 
@@ -26,6 +26,7 @@ impl RbSinkOptions {
26
26
  let sync_on_close: Option<Wrap<SyncOnCloseType>> = self.0.funcall("sync_on_close", ())?;
27
27
  let storage_options: OptRbCloudOptions = self.0.funcall("storage_options", ())?;
28
28
  let credential_provider: Option<Value> = self.0.funcall("credential_provider", ())?;
29
+ let sinked_paths_callback: Option<Value> = self.0.funcall("sinked_paths_callback", ())?;
29
30
 
30
31
  let cloud_options =
31
32
  storage_options.extract_opt_cloud_options(cloud_scheme, credential_provider)?;
@@ -37,6 +38,7 @@ impl RbSinkOptions {
37
38
  maintain_order,
38
39
  sync_on_close,
39
40
  cloud_options: cloud_options.map(Arc::new),
41
+ sinked_paths_callback: sinked_paths_callback.map(|_| todo!()),
40
42
  };
41
43
 
42
44
  Ok(unified_sink_args)
@@ -1,10 +1,13 @@
1
+ use arrow::ffi::export_iterator;
1
2
  use magnus::{
2
3
  IntoValue, RArray, RHash, Ruby, TryConvert, Value, r_hash::ForEach, value::ReprValue,
3
4
  };
4
5
  use parking_lot::Mutex;
6
+ use polars::frame::PivotColumnNaming;
5
7
  use polars::io::RowIndex;
6
8
  use polars::lazy::frame::LazyFrame;
7
9
  use polars::prelude::*;
10
+ use polars_core::query_result::QueryResult;
8
11
  use polars_plan::dsl::ScanSources;
9
12
  use polars_plan::plans::{HintIR, Sorted};
10
13
  use std::num::NonZeroUsize;
@@ -19,8 +22,14 @@ use crate::io::sink_options::RbSinkOptions;
19
22
  use crate::io::sink_output::RbFileSinkDestination;
20
23
  use crate::ruby::gvl::GvlExt;
21
24
  use crate::ruby::lazy::RubyUdfLazyFrameExt;
22
- use crate::utils::EnterPolarsExt;
23
- use crate::{RbDataFrame, RbExpr, RbLazyGroupBy, RbPolarsErr, RbResult, RbValueError};
25
+ use crate::ruby::plan_callback::PlanCallbackExt;
26
+ use crate::ruby::ruby_function::RubyObject;
27
+ use crate::ruby::utils::TryIntoValue;
28
+ use crate::utils::{EnterPolarsExt, to_rb_err};
29
+ use crate::{
30
+ RbArrowArrayStream, RbDataFrame, RbExpr, RbLazyGroupBy, RbPolarsErr, RbResult, RbTypeError,
31
+ RbValueError,
32
+ };
24
33
 
25
34
  fn rbobject_to_first_path_and_scan_sources(
26
35
  obj: Value,
@@ -128,6 +137,7 @@ impl RbLazyFrame {
128
137
  let cloud_options = OptRbCloudOptions::try_convert(arguments[28])?;
129
138
  let credential_provider = Option::<Value>::try_convert(arguments[29])?;
130
139
  let include_file_paths = Option::<String>::try_convert(arguments[30])?;
140
+ let missing_columns = Option::<Wrap<MissingColumnsPolicy>>::try_convert(arguments[31])?;
131
141
  // end arguments
132
142
 
133
143
  let null_values = null_values.map(|w| w.0);
@@ -189,7 +199,8 @@ impl RbLazyFrame {
189
199
  .with_decimal_comma(decimal_comma)
190
200
  .with_glob(glob)
191
201
  .with_raise_if_empty(raise_if_empty)
192
- .with_include_file_paths(include_file_paths.map(|x| x.into()));
202
+ .with_include_file_paths(include_file_paths.map(|x| x.into()))
203
+ .with_missing_columns_policy(missing_columns.map(|x| x.0));
193
204
 
194
205
  if let Some(lambda) = with_schema_modify {
195
206
  let f = |schema: Schema| {
@@ -272,6 +283,25 @@ impl RbLazyFrame {
272
283
  Ok(lf.into())
273
284
  }
274
285
 
286
+ pub fn new_from_scan_lines(
287
+ sources: Wrap<ScanSources>,
288
+ scan_options: RbScanOptions,
289
+ name: String,
290
+ ) -> RbResult<Self> {
291
+ let sources = sources.0;
292
+ let first_path = sources.first_path();
293
+
294
+ let unified_scan_args =
295
+ scan_options.extract_unified_scan_args(first_path.and_then(|x| x.scheme()))?;
296
+
297
+ let dsl: DslPlan = DslBuilder::scan_lines(sources, unified_scan_args, (&*name).into())
298
+ .map_err(to_rb_err)?
299
+ .build();
300
+ let lf: LazyFrame = dsl.into();
301
+
302
+ Ok(lf.into())
303
+ }
304
+
275
305
  pub fn describe_plan(rb: &Ruby, self_: &Self) -> RbResult<String> {
276
306
  rb.enter_polars(|| self_.ldf.read().describe_plan())
277
307
  }
@@ -388,7 +418,11 @@ impl RbLazyFrame {
388
418
  pub fn collect(rb: &Ruby, self_: &Self, engine: Wrap<Engine>) -> RbResult<RbDataFrame> {
389
419
  rb.enter_polars_df(|| {
390
420
  let ldf = self_.ldf.read().clone();
391
- ldf.collect_with_engine(engine.0)
421
+ ldf.collect_with_engine(engine.0).map(|r| match r {
422
+ QueryResult::Single(df) => df,
423
+ // TODO: Should return query results
424
+ QueryResult::Multiple(_) => DataFrame::empty(),
425
+ })
392
426
  })
393
427
  }
394
428
 
@@ -409,7 +443,7 @@ impl RbLazyFrame {
409
443
 
410
444
  PolarsResult::Ok(RbCollectBatches {
411
445
  inner: Arc::new(Mutex::new(collect_batches)),
412
- _ldf: ldf,
446
+ ldf,
413
447
  })
414
448
  })
415
449
  }
@@ -573,6 +607,20 @@ impl RbLazyFrame {
573
607
  .map(Into::into)
574
608
  }
575
609
 
610
+ pub fn sink_batches(
611
+ rb: &Ruby,
612
+ self_: &Self,
613
+ function: Value,
614
+ maintain_order: bool,
615
+ chunk_size: Option<NonZeroUsize>,
616
+ ) -> RbResult<RbLazyFrame> {
617
+ let ldf = self_.ldf.read().clone();
618
+ // ensure new_ruby is called with GVL
619
+ let callback = PlanCallback::new_ruby(RubyObject::from(function));
620
+ rb.enter_polars(|| ldf.sink_batches(callback, maintain_order, chunk_size))
621
+ .map(Into::into)
622
+ }
623
+
576
624
  pub fn filter(&self, predicate: &RbExpr) -> Self {
577
625
  let ldf = self.ldf.read().clone();
578
626
  ldf.filter(predicate.inner.clone()).into()
@@ -771,6 +819,12 @@ impl RbLazyFrame {
771
819
  .into())
772
820
  }
773
821
 
822
+ pub fn gather(&self, idxs: &Self, null_on_oob: bool) -> Self {
823
+ let ldf = self.ldf.read().clone();
824
+ let idxs = idxs.clone().ldf.into_inner();
825
+ ldf.gather(idxs, null_on_oob).into()
826
+ }
827
+
774
828
  pub fn with_column(&self, expr: &RbExpr) -> Self {
775
829
  let ldf = self.ldf.read().clone();
776
830
  ldf.with_column(expr.inner.clone()).into()
@@ -786,6 +840,127 @@ impl RbLazyFrame {
786
840
  Ok(ldf.with_columns_seq(exprs.to_exprs()?).into())
787
841
  }
788
842
 
843
+ pub fn match_to_schema(
844
+ &self,
845
+ schema: Wrap<Schema>,
846
+ missing_columns: Value,
847
+ missing_struct_fields: Value,
848
+ extra_columns: Wrap<ExtraColumnsPolicy>,
849
+ extra_struct_fields: Value,
850
+ integer_cast: Value,
851
+ float_cast: Value,
852
+ ) -> RbResult<Self> {
853
+ fn parse_missing_columns(
854
+ schema: &Schema,
855
+ missing_columns: Value,
856
+ ) -> RbResult<Vec<MissingColumnsPolicyOrExpr>> {
857
+ let mut out = Vec::with_capacity(schema.len());
858
+ if let Ok(policy) = Wrap::<MissingColumnsPolicyOrExpr>::try_convert(missing_columns) {
859
+ out.extend(std::iter::repeat_n(policy.0, schema.len()));
860
+ } else if let Ok(dict) = RHash::try_convert(missing_columns) {
861
+ out.extend(std::iter::repeat_n(
862
+ MissingColumnsPolicyOrExpr::Raise,
863
+ schema.len(),
864
+ ));
865
+ dict.foreach(|key: String, value: Wrap<MissingColumnsPolicyOrExpr>| {
866
+ out[schema.try_index_of(&key).map_err(to_rb_err)?] = value.0;
867
+ Ok(ForEach::Continue)
868
+ })?;
869
+ } else {
870
+ return Err(RbTypeError::new_err("Invalid value for `missing_columns`"));
871
+ }
872
+ Ok(out)
873
+ }
874
+ fn parse_missing_struct_fields(
875
+ schema: &Schema,
876
+ missing_struct_fields: Value,
877
+ ) -> RbResult<Vec<MissingColumnsPolicy>> {
878
+ let mut out = Vec::with_capacity(schema.len());
879
+ if let Ok(policy) = Wrap::<MissingColumnsPolicy>::try_convert(missing_struct_fields) {
880
+ out.extend(std::iter::repeat_n(policy.0, schema.len()));
881
+ } else if let Ok(dict) = RHash::try_convert(missing_struct_fields) {
882
+ out.extend(std::iter::repeat_n(
883
+ MissingColumnsPolicy::Raise,
884
+ schema.len(),
885
+ ));
886
+ dict.foreach(|key: String, value: Wrap<MissingColumnsPolicy>| {
887
+ out[schema.try_index_of(&key).map_err(to_rb_err)?] = value.0;
888
+ Ok(ForEach::Continue)
889
+ })?;
890
+ } else {
891
+ return Err(RbTypeError::new_err(
892
+ "Invalid value for `missing_struct_fields`",
893
+ ));
894
+ }
895
+ Ok(out)
896
+ }
897
+ fn parse_extra_struct_fields(
898
+ schema: &Schema,
899
+ extra_struct_fields: Value,
900
+ ) -> RbResult<Vec<ExtraColumnsPolicy>> {
901
+ let mut out = Vec::with_capacity(schema.len());
902
+ if let Ok(policy) = Wrap::<ExtraColumnsPolicy>::try_convert(extra_struct_fields) {
903
+ out.extend(std::iter::repeat_n(policy.0, schema.len()));
904
+ } else if let Ok(dict) = RHash::try_convert(extra_struct_fields) {
905
+ out.extend(std::iter::repeat_n(ExtraColumnsPolicy::Raise, schema.len()));
906
+ dict.foreach(|key: String, value: Wrap<ExtraColumnsPolicy>| {
907
+ out[schema.try_index_of(&key).map_err(to_rb_err)?] = value.0;
908
+ Ok(ForEach::Continue)
909
+ })?;
910
+ } else {
911
+ return Err(RbTypeError::new_err(
912
+ "Invalid value for `extra_struct_fields`",
913
+ ));
914
+ }
915
+ Ok(out)
916
+ }
917
+ fn parse_cast(schema: &Schema, cast: Value) -> RbResult<Vec<UpcastOrForbid>> {
918
+ let mut out = Vec::with_capacity(schema.len());
919
+ if let Ok(policy) = Wrap::<UpcastOrForbid>::try_convert(cast) {
920
+ out.extend(std::iter::repeat_n(policy.0, schema.len()));
921
+ } else if let Ok(dict) = RHash::try_convert(cast) {
922
+ out.extend(std::iter::repeat_n(UpcastOrForbid::Forbid, schema.len()));
923
+ dict.foreach(|key: String, value: Wrap<UpcastOrForbid>| {
924
+ out[schema.try_index_of(&key).map_err(to_rb_err)?] = value.0;
925
+ Ok(ForEach::Continue)
926
+ })?;
927
+ } else {
928
+ return Err(RbTypeError::new_err(
929
+ "Invalid value for `integer_cast` / `float_cast`",
930
+ ));
931
+ }
932
+ Ok(out)
933
+ }
934
+
935
+ let missing_columns = parse_missing_columns(&schema.0, missing_columns)?;
936
+ let missing_struct_fields = parse_missing_struct_fields(&schema.0, missing_struct_fields)?;
937
+ let extra_struct_fields = parse_extra_struct_fields(&schema.0, extra_struct_fields)?;
938
+ let integer_cast = parse_cast(&schema.0, integer_cast)?;
939
+ let float_cast = parse_cast(&schema.0, float_cast)?;
940
+
941
+ let per_column = (0..schema.0.len())
942
+ .map(|i| MatchToSchemaPerColumn {
943
+ missing_columns: missing_columns[i].clone(),
944
+ missing_struct_fields: missing_struct_fields[i],
945
+ extra_struct_fields: extra_struct_fields[i],
946
+ integer_cast: integer_cast[i],
947
+ float_cast: float_cast[i],
948
+ })
949
+ .collect();
950
+
951
+ let ldf = self.ldf.read().clone();
952
+ Ok(ldf
953
+ .match_to_schema(Arc::new(schema.0), per_column, extra_columns.0)
954
+ .into())
955
+ }
956
+
957
+ pub fn pipe_with_schema(&self, callback: Value) -> Self {
958
+ let ldf = self.ldf.read().clone();
959
+ let function = RubyObject::from(callback);
960
+ ldf.pipe_with_schema(PlanCallback::new_ruby(function))
961
+ .into()
962
+ }
963
+
789
964
  pub fn rename(&self, existing: Vec<String>, new: Vec<String>, strict: bool) -> Self {
790
965
  let ldf = self.ldf.read().clone();
791
966
  ldf.rename(existing, new, strict).into()
@@ -930,6 +1105,7 @@ impl RbLazyFrame {
930
1105
  agg: &RbExpr,
931
1106
  maintain_order: bool,
932
1107
  separator: String,
1108
+ column_naming: Wrap<PivotColumnNaming>,
933
1109
  ) -> Self {
934
1110
  let ldf = self.ldf.read().clone();
935
1111
  ldf.pivot(
@@ -940,6 +1116,7 @@ impl RbLazyFrame {
940
1116
  agg.inner.clone(),
941
1117
  maintain_order,
942
1118
  separator.into(),
1119
+ column_naming.0,
943
1120
  )
944
1121
  .into()
945
1122
  }
@@ -981,7 +1158,7 @@ impl RbLazyFrame {
981
1158
  opt.set(OptFlags::PREDICATE_PUSHDOWN, predicate_pushdown);
982
1159
  opt.set(OptFlags::PROJECTION_PUSHDOWN, projection_pushdown);
983
1160
  opt.set(OptFlags::SLICE_PUSHDOWN, slice_pushdown);
984
- opt.set(OptFlags::NEW_STREAMING, streamable);
1161
+ opt.set(OptFlags::STREAMING, streamable);
985
1162
 
986
1163
  self.ldf
987
1164
  .read()
@@ -1022,14 +1199,12 @@ impl RbLazyFrame {
1022
1199
  let schema = rb.enter_polars(|| self_.ldf.write().collect_schema())?;
1023
1200
 
1024
1201
  let schema_dict = rb.hash_new();
1025
- schema.iter_fields().for_each(|fld| {
1026
- schema_dict
1027
- .aset::<String, Value>(
1028
- fld.name().to_string(),
1029
- Wrap(fld.dtype().clone()).into_value_with(rb),
1030
- )
1031
- .unwrap();
1032
- });
1202
+ for fld in schema.iter_fields() {
1203
+ schema_dict.aset(
1204
+ fld.name().to_string(),
1205
+ Wrap(fld.dtype().clone()).try_into_value_with(rb)?,
1206
+ )?;
1207
+ }
1033
1208
  Ok(schema_dict)
1034
1209
  }
1035
1210
 
@@ -1049,12 +1224,12 @@ impl RbLazyFrame {
1049
1224
  ldf.count().into()
1050
1225
  }
1051
1226
 
1052
- pub fn merge_sorted(&self, other: &Self, key: String) -> RbResult<Self> {
1227
+ pub fn merge_sorted(&self, other: &Self, key: String, maintain_order: bool) -> RbResult<Self> {
1053
1228
  let out = self
1054
1229
  .ldf
1055
1230
  .read()
1056
1231
  .clone()
1057
- .merge_sorted(other.ldf.read().clone(), &key)
1232
+ .merge_sorted(other.ldf.read().clone(), &key, maintain_order)
1058
1233
  .map_err(RbPolarsErr::from)?;
1059
1234
  Ok(out.into())
1060
1235
  }
@@ -1118,7 +1293,7 @@ impl RbLazyFrame {
1118
1293
  #[magnus::wrap(class = "Polars::RbCollectBatches")]
1119
1294
  pub struct RbCollectBatches {
1120
1295
  inner: Arc<Mutex<CollectBatches>>,
1121
- _ldf: LazyFrame,
1296
+ ldf: LazyFrame,
1122
1297
  }
1123
1298
 
1124
1299
  impl RbCollectBatches {
@@ -1126,4 +1301,55 @@ impl RbCollectBatches {
1126
1301
  let inner = Arc::clone(&slf.inner);
1127
1302
  rb.enter_polars(|| PolarsResult::Ok(inner.lock().next().transpose()?.map(RbDataFrame::new)))
1128
1303
  }
1304
+
1305
+ pub fn __arrow_c_stream__(rb: &Ruby, self_: &Self) -> RbResult<Value> {
1306
+ let mut ldf = self_.ldf.clone();
1307
+ let schema = ldf
1308
+ .collect_schema()
1309
+ .map_err(RbPolarsErr::from)?
1310
+ .to_arrow(CompatLevel::newest());
1311
+
1312
+ let dtype = ArrowDataType::Struct(schema.into_iter_values().collect());
1313
+
1314
+ let iter = Box::new(ArrowStreamIterator::new(self_.inner.clone(), dtype.clone()));
1315
+ let field = ArrowField::new(PlSmallStr::EMPTY, dtype, false);
1316
+ let stream = export_iterator(iter, field);
1317
+ Ok(RbArrowArrayStream { stream }.into_value_with(rb))
1318
+ }
1319
+ }
1320
+
1321
+ pub struct ArrowStreamIterator {
1322
+ inner: Arc<Mutex<CollectBatches>>,
1323
+ dtype: ArrowDataType,
1324
+ }
1325
+
1326
+ impl ArrowStreamIterator {
1327
+ fn new(inner: Arc<Mutex<CollectBatches>>, schema: ArrowDataType) -> Self {
1328
+ Self {
1329
+ inner,
1330
+ dtype: schema,
1331
+ }
1332
+ }
1333
+ }
1334
+
1335
+ impl Iterator for ArrowStreamIterator {
1336
+ type Item = PolarsResult<ArrayRef>;
1337
+
1338
+ fn next(&mut self) -> Option<Self::Item> {
1339
+ let next = self.inner.lock().next();
1340
+ match next {
1341
+ None => None,
1342
+ Some(Err(err)) => Some(Err(err)),
1343
+ Some(Ok(df)) => {
1344
+ let height = df.height();
1345
+ let arrays = df.rechunk_into_arrow(CompatLevel::newest());
1346
+ Some(Ok(Box::new(arrow::array::StructArray::new(
1347
+ self.dtype.clone(),
1348
+ height,
1349
+ arrays,
1350
+ None,
1351
+ ))))
1352
+ }
1353
+ }
1354
+ }
1129
1355
  }
@@ -52,7 +52,8 @@ flag_getter_setters! {
52
52
  (COMM_SUBEXPR_ELIM, get_comm_subexpr_elim, set_comm_subexpr_elim, clear=true)
53
53
  (CHECK_ORDER_OBSERVE, get_check_order_observe, set_check_order_observe, clear=true)
54
54
  (FAST_PROJECTION, get_fast_projection, set_fast_projection, clear=true)
55
+ (SORT_COLLAPSE, get_sort_collapse, set_sort_collapse, clear=true)
55
56
 
56
57
  (EAGER, get_eager, set_eager, clear=true)
57
- (NEW_STREAMING, get_streaming, set_streaming, clear=true)
58
+ (STREAMING, get_streaming, set_streaming, clear=true)
58
59
  }