polars-df 0.25.0 → 0.25.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +7 -0
  3. data/Cargo.lock +5 -5
  4. data/ext/polars/Cargo.toml +2 -1
  5. data/ext/polars/src/conversion/any_value.rs +28 -5
  6. data/ext/polars/src/conversion/mod.rs +49 -13
  7. data/ext/polars/src/dataframe/general.rs +20 -3
  8. data/ext/polars/src/expr/datatype.rs +11 -2
  9. data/ext/polars/src/expr/general.rs +8 -0
  10. data/ext/polars/src/expr/rolling.rs +24 -0
  11. data/ext/polars/src/functions/lazy.rs +7 -0
  12. data/ext/polars/src/functions/range.rs +13 -0
  13. data/ext/polars/src/interop/arrow/to_rb.rs +1 -1
  14. data/ext/polars/src/lazyframe/general.rs +215 -4
  15. data/ext/polars/src/lib.rs +25 -2
  16. data/ext/polars/src/on_startup.rs +11 -1
  17. data/ext/polars/src/ruby/plan_callback.rs +1 -4
  18. data/ext/polars/src/series/construction.rs +31 -50
  19. data/ext/polars/src/series/general.rs +2 -3
  20. data/lib/polars/collect_batches.rb +4 -0
  21. data/lib/polars/data_frame.rb +71 -1
  22. data/lib/polars/data_type_group.rb +5 -0
  23. data/lib/polars/expr.rb +125 -0
  24. data/lib/polars/functions/datatype.rb +30 -0
  25. data/lib/polars/functions/lazy.rb +92 -0
  26. data/lib/polars/functions/range/linear_space.rb +118 -0
  27. data/lib/polars/io/database.rb +2 -3
  28. data/lib/polars/io/ipc.rb +1 -1
  29. data/lib/polars/io/lines.rb +172 -0
  30. data/lib/polars/io/parquet.rb +1 -1
  31. data/lib/polars/io/sink_options.rb +1 -0
  32. data/lib/polars/lazy_frame.rb +420 -0
  33. data/lib/polars/query_opt_flags.rb +1 -0
  34. data/lib/polars/selectors.rb +1 -1
  35. data/lib/polars/series.rb +88 -0
  36. data/lib/polars/utils/construction/series.rb +19 -39
  37. data/lib/polars/version.rb +1 -1
  38. data/lib/polars.rb +1 -0
  39. metadata +3 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f29eb603b53abf84e780389ff4c04ff58ae16ccb4336747150caf662166c8ff7
4
- data.tar.gz: fea822884078c5f8ea7cc060d24f92304465b11efdb42eef36a63ee0f41e02a7
3
+ metadata.gz: 3f8d65cfb4e8577c4f9e92f2f2271f97f44b0bc66bd16e9b517a5ab5dae9a877
4
+ data.tar.gz: 7b9254cc522bfa6eae53986384d33c96933ffa788b47f4f6e82bc46d2cbf6395
5
5
  SHA512:
6
- metadata.gz: ed7f845b117e6dcd7adb90b77cd7dbe0e53607d2c2551627c5ca2bbfbbc42fe94970bd98c31dd752b7a61ff97570f96441a403d347e2e45aae0062f78c9e3676
7
- data.tar.gz: 40c417f1de724293f8800e02da735a6fd5631607e1da8c505f3a42e6056b98a1611e720f9d9522e56838161015e6ad8be4722391eeda57cce3d4614e2bad5dab
6
+ metadata.gz: 97754dc2c36dbc5e2d7d99bf71973d05107d9124d1dc7bea1bd760ee77d9ac303898a72ac301c798fe4ae77cd24a086a28b20db2e8254c8ef300b3df205e33a5
7
+ data.tar.gz: 61cbe25d941e7451bcce590a263912967430aa31638db0cfe699d05754ebae71da3495bcc6ac7305afcfd9cfb3bb39670785501e196eae5f6fd6ba631bc92d27
data/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
1
+ ## 0.25.1 (2026-03-23)
2
+
3
+ - Added `explain_all` method to `Polars`
4
+ - Fixed `storage_options` option for `scan_parquet` and `scan_ipc` methods
5
+ - Fixed `Struct` construction with `nil` values
6
+ - Fixed `read_database` for JSON columns
7
+
1
8
  ## 0.25.0 (2026-02-18)
2
9
 
3
10
  - Updated Polars to 0.53.0
data/Cargo.lock CHANGED
@@ -2160,7 +2160,7 @@ dependencies = [
2160
2160
 
2161
2161
  [[package]]
2162
2162
  name = "polars-ruby"
2163
- version = "0.25.0"
2163
+ version = "0.25.1"
2164
2164
  dependencies = [
2165
2165
  "ahash",
2166
2166
  "bytes",
@@ -2408,9 +2408,9 @@ dependencies = [
2408
2408
 
2409
2409
  [[package]]
2410
2410
  name = "quinn-proto"
2411
- version = "0.11.12"
2411
+ version = "0.11.14"
2412
2412
  source = "registry+https://github.com/rust-lang/crates.io-index"
2413
- checksum = "49df843a9161c85bb8aae55f101bc0bac8bcafd637a620d9122fd7e0b2f7422e"
2413
+ checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098"
2414
2414
  dependencies = [
2415
2415
  "bytes",
2416
2416
  "getrandom 0.3.3",
@@ -2811,9 +2811,9 @@ dependencies = [
2811
2811
 
2812
2812
  [[package]]
2813
2813
  name = "rustls-webpki"
2814
- version = "0.103.3"
2814
+ version = "0.103.10"
2815
2815
  source = "registry+https://github.com/rust-lang/crates.io-index"
2816
- checksum = "e4a72fe2bcf7a6ac6fd7d0b9e5cb68aeb7d4c0a0271730218b3e92d43b4eb435"
2816
+ checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef"
2817
2817
  dependencies = [
2818
2818
  "ring",
2819
2819
  "rustls-pki-types",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "polars-ruby"
3
- version = "0.25.0"
3
+ version = "0.25.1"
4
4
  license = "MIT"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2024"
@@ -130,6 +130,7 @@ features = [
130
130
  "rolling_window_by",
131
131
  "round_series",
132
132
  "row_hash",
133
+ "scan_lines",
133
134
  "search_sorted",
134
135
  "semi_anti_join",
135
136
  "serde-lazy",
@@ -1,8 +1,10 @@
1
1
  use magnus::encoding::EncodingCapable;
2
2
  use magnus::{
3
- IntoValue, RArray, RHash, RString, Ruby, TryConvert, Value, prelude::*, r_hash::ForEach,
3
+ IntoValue, RArray, RHash, RString, Ruby, Symbol, TryConvert, Value, prelude::*,
4
+ r_hash::ForEach, value::Opaque,
4
5
  };
5
6
  use num_traits::ToPrimitive;
7
+ use polars::chunked_array::object::PolarsObjectSafe;
6
8
  use polars::prelude::*;
7
9
  use polars_compute::decimal::{DEC128_MAX_PREC, DecimalFmtBuffer, dec128_fits};
8
10
  use polars_core::utils::any_values_to_supertype_and_n_dtypes;
@@ -22,7 +24,7 @@ impl IntoValue for Wrap<AnyValue<'_>> {
22
24
 
23
25
  impl TryConvert for Wrap<AnyValue<'_>> {
24
26
  fn try_convert(ob: Value) -> RbResult<Self> {
25
- rb_object_to_any_value(ob, true).map(Wrap)
27
+ rb_object_to_any_value(ob, true, true).map(Wrap)
26
28
  }
27
29
  }
28
30
 
@@ -88,7 +90,11 @@ pub(crate) fn any_value_into_rb_object(av: AnyValue, ruby: &Ruby) -> Value {
88
90
  }
89
91
  }
90
92
 
91
- pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<AnyValue<'s>> {
93
+ pub(crate) fn rb_object_to_any_value<'s>(
94
+ ob: Value,
95
+ strict: bool,
96
+ allow_object: bool,
97
+ ) -> RbResult<AnyValue<'s>> {
92
98
  // Conversion functions.
93
99
  fn get_null(_ob: Value, _strict: bool) -> RbResult<AnyValue<'static>> {
94
100
  Ok(AnyValue::Null)
@@ -174,7 +180,12 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
174
180
  let len = dict.len();
175
181
  let mut keys = Vec::with_capacity(len);
176
182
  let mut vals = Vec::with_capacity(len);
177
- dict.foreach(|key: String, val: Wrap<AnyValue>| {
183
+ dict.foreach(|key: Value, val: Wrap<AnyValue>| {
184
+ let key = if let Some(s) = Symbol::from_value(key) {
185
+ s.name()?.to_string()
186
+ } else {
187
+ String::try_convert(key)?
188
+ };
178
189
  let val = val.0;
179
190
  let dtype = DataType::from(&val);
180
191
  keys.push(Field::new(key.into(), dtype));
@@ -184,6 +195,14 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
184
195
  Ok(AnyValue::StructOwned(Box::new((vals, keys))))
185
196
  }
186
197
 
198
+ fn get_object(ob: Value, _strict: bool) -> RbResult<AnyValue<'static>> {
199
+ // This is slow, but hey don't use objects.
200
+ let v = &ObjectValue {
201
+ inner: Opaque::from(ob),
202
+ };
203
+ Ok(AnyValue::ObjectOwned(OwnedObject(v.to_boxed())))
204
+ }
205
+
187
206
  fn get_date(ob: Value, _strict: bool) -> RbResult<AnyValue<'static>> {
188
207
  // convert to DateTime for UTC
189
208
  let v = ob
@@ -263,6 +282,10 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
263
282
  } else if ob.is_kind_of(crate::ruby::rb_modules::bigdecimal(&ruby)) {
264
283
  get_decimal(ob, strict)
265
284
  } else {
266
- Err(RbValueError::new_err(format!("Cannot convert {ob}")))
285
+ if allow_object {
286
+ get_object(ob, strict)
287
+ } else {
288
+ Err(RbValueError::new_err(format!("Cannot convert {ob}")))
289
+ }
267
290
  }
268
291
  }
@@ -37,19 +37,9 @@ use crate::file::{RubyScanSourceInput, get_ruby_scan_source_input};
37
37
  use crate::object::OBJECT_NAME;
38
38
  use crate::rb_modules::pl_series;
39
39
  use crate::utils::to_rb_err;
40
- use crate::{RbDataFrame, RbLazyFrame, RbPolarsErr, RbResult, RbSeries, RbTypeError, RbValueError};
41
-
42
- pub(crate) fn slice_extract_wrapped<T>(slice: &[Wrap<T>]) -> &[T] {
43
- // Safety:
44
- // Wrap is transparent.
45
- unsafe { std::mem::transmute(slice) }
46
- }
47
-
48
- pub(crate) fn vec_extract_wrapped<T>(buf: Vec<Wrap<T>>) -> Vec<T> {
49
- // Safety:
50
- // Wrap is transparent.
51
- unsafe { std::mem::transmute(buf) }
52
- }
40
+ use crate::{
41
+ RbDataFrame, RbExpr, RbLazyFrame, RbPolarsErr, RbResult, RbSeries, RbTypeError, RbValueError,
42
+ };
53
43
 
54
44
  #[repr(transparent)]
55
45
  pub struct Wrap<T>(pub T);
@@ -648,6 +638,16 @@ impl TryConvert for Wrap<ScanSources> {
648
638
  }
649
639
  }
650
640
 
641
+ impl IntoValue for Wrap<Schema> {
642
+ fn into_value_with(self, ruby: &Ruby) -> Value {
643
+ let dict = ruby.hash_new();
644
+ for (k, v) in self.0.iter() {
645
+ dict.aset(k.as_str(), Wrap(v.clone())).unwrap();
646
+ }
647
+ dict.as_value()
648
+ }
649
+ }
650
+
651
651
  #[derive(Clone)]
652
652
  pub struct ObjectValue {
653
653
  pub inner: Opaque<Value>,
@@ -1491,6 +1491,21 @@ impl TryConvert for Wrap<Option<TimeZone>> {
1491
1491
 
1492
1492
  unsafe impl TryConvertOwned for Wrap<Option<TimeZone>> {}
1493
1493
 
1494
+ impl TryConvert for Wrap<UpcastOrForbid> {
1495
+ fn try_convert(ob: Value) -> RbResult<Self> {
1496
+ let parsed = match String::try_convert(ob)?.as_str() {
1497
+ "upcast" => UpcastOrForbid::Upcast,
1498
+ "forbid" => UpcastOrForbid::Forbid,
1499
+ v => {
1500
+ return Err(RbValueError::new_err(format!(
1501
+ "cast parameter must be one of {{'upcast', 'forbid'}}, got {v}",
1502
+ )));
1503
+ }
1504
+ };
1505
+ Ok(Wrap(parsed))
1506
+ }
1507
+ }
1508
+
1494
1509
  impl TryConvert for Wrap<ExtraColumnsPolicy> {
1495
1510
  fn try_convert(ob: Value) -> RbResult<Self> {
1496
1511
  let parsed = match String::try_convert(ob)?.as_str() {
@@ -1521,6 +1536,27 @@ impl TryConvert for Wrap<MissingColumnsPolicy> {
1521
1536
  }
1522
1537
  }
1523
1538
 
1539
+ impl TryConvert for Wrap<MissingColumnsPolicyOrExpr> {
1540
+ fn try_convert(ob: Value) -> RbResult<Self> {
1541
+ if let Ok(rbexpr) = <&RbExpr>::try_convert(ob) {
1542
+ return Ok(Wrap(MissingColumnsPolicyOrExpr::InsertWith(
1543
+ rbexpr.inner.clone(),
1544
+ )));
1545
+ }
1546
+
1547
+ let parsed = match String::try_convert(ob)?.as_str() {
1548
+ "insert" => MissingColumnsPolicyOrExpr::Insert,
1549
+ "raise" => MissingColumnsPolicyOrExpr::Raise,
1550
+ v => {
1551
+ return Err(RbValueError::new_err(format!(
1552
+ "missing column/field parameter must be one of {{'insert', 'raise', expression}}, got {v}",
1553
+ )));
1554
+ }
1555
+ };
1556
+ Ok(Wrap(parsed))
1557
+ }
1558
+ }
1559
+
1524
1560
  impl TryConvert for Wrap<ColumnMapping> {
1525
1561
  fn try_convert(ob: Value) -> RbResult<Self> {
1526
1562
  let (column_mapping_type, ob) = <(String, Value)>::try_convert(ob)?;
@@ -1,5 +1,6 @@
1
1
  use std::hash::BuildHasher;
2
2
 
3
+ use arrow::bitmap::MutableBitmap;
3
4
  use either::Either;
4
5
  use magnus::{IntoValue, RArray, Ruby, Value, prelude::*, value::Opaque};
5
6
  use polars::prelude::*;
@@ -503,10 +504,26 @@ impl RbDataFrame {
503
504
  })
504
505
  }
505
506
 
506
- pub fn to_struct(rb: &Ruby, self_: &Self, name: String) -> RbResult<RbSeries> {
507
+ pub fn to_struct(
508
+ rb: &Ruby,
509
+ self_: &Self,
510
+ name: String,
511
+ invalid_indices: Vec<usize>,
512
+ ) -> RbResult<RbSeries> {
507
513
  rb.enter_polars_series(|| {
508
- let ca = self_.df.read().clone().into_struct(name.into());
509
- Ok(ca)
514
+ let mut ca = self_.df.read().clone().into_struct(name.into());
515
+
516
+ if !invalid_indices.is_empty() {
517
+ let mut validity = MutableBitmap::with_capacity(ca.len());
518
+ validity.extend_constant(ca.len(), true);
519
+ for i in invalid_indices {
520
+ validity.set(i, false);
521
+ }
522
+ ca.rechunk_mut();
523
+ Ok(ca.with_outer_validity(Some(validity.freeze())))
524
+ } else {
525
+ Ok(ca)
526
+ }
510
527
  })
511
528
  }
512
529
 
@@ -1,5 +1,5 @@
1
- use magnus::{IntoValue, Ruby, Value};
2
- use polars::prelude::{DataType, DataTypeExpr, Schema};
1
+ use magnus::{IntoValue, RArray, Ruby, TryConvert, Value};
2
+ use polars::prelude::{DataType, DataTypeExpr, PlSmallStr, Schema};
3
3
 
4
4
  use crate::prelude::Wrap;
5
5
  use crate::{RbExpr, RbPolarsErr, RbResult};
@@ -38,4 +38,13 @@ impl RbDataTypeExpr {
38
38
  .map_err(RbPolarsErr::from)?;
39
39
  Ok(Wrap(dtype).into_value_with(ruby))
40
40
  }
41
+
42
+ pub fn struct_with_fields(rb_fields: RArray) -> RbResult<Self> {
43
+ let mut fields = Vec::new();
44
+ for v in rb_fields.into_iter() {
45
+ let (name, dt_expr) = <(String, &RbDataTypeExpr)>::try_convert(v)?;
46
+ fields.push((PlSmallStr::from_string(name), dt_expr.inner.clone()));
47
+ }
48
+ Ok(DataTypeExpr::StructWithFields(fields).into())
49
+ }
41
50
  }
@@ -118,6 +118,14 @@ impl RbExpr {
118
118
  self.inner.clone().max().into()
119
119
  }
120
120
 
121
+ pub fn min_by(&self, by: &Self) -> Self {
122
+ self.inner.clone().min_by(by.inner.clone()).into()
123
+ }
124
+
125
+ pub fn max_by(&self, by: &Self) -> Self {
126
+ self.inner.clone().max_by(by.inner.clone()).into()
127
+ }
128
+
121
129
  pub fn nan_max(&self) -> Self {
122
130
  self.inner.clone().nan_max().into()
123
131
  }
@@ -1,6 +1,9 @@
1
+ use magnus::Value;
1
2
  use polars::prelude::*;
2
3
 
3
4
  use crate::conversion::Wrap;
5
+ use crate::ruby::plan_callback::PlanCallbackExt;
6
+ use crate::ruby::ruby_function::RubyObject;
4
7
  use crate::{RbExpr, RbPolarsErr, RbResult};
5
8
 
6
9
  impl RbExpr {
@@ -406,4 +409,25 @@ impl RbExpr {
406
409
 
407
410
  self.inner.clone().rolling_kurtosis(options).into()
408
411
  }
412
+
413
+ pub fn rolling_map(
414
+ &self,
415
+ lambda: Value,
416
+ window_size: usize,
417
+ weights: Option<Vec<f64>>,
418
+ min_periods: Option<usize>,
419
+ center: bool,
420
+ ) -> Self {
421
+ let min_periods = min_periods.unwrap_or(window_size);
422
+ let options = RollingOptionsFixedWindow {
423
+ window_size,
424
+ weights,
425
+ min_periods,
426
+ center,
427
+ ..Default::default()
428
+ };
429
+ let function = PlanCallback::new_ruby(RubyObject::from(lambda));
430
+
431
+ self.inner.clone().rolling_map(function, options).into()
432
+ }
409
433
  }
@@ -130,6 +130,13 @@ pub fn collect_all(
130
130
  Ok(ruby.ary_from_iter(dfs.into_iter().map(Into::<RbDataFrame>::into)))
131
131
  }
132
132
 
133
+ pub fn explain_all(rb: &Ruby, lfs: RArray, optflags: &RbOptFlags) -> RbResult<String> {
134
+ let plans = lfs_to_plans(lfs)?;
135
+ let explained =
136
+ rb.enter_polars(|| LazyFrame::explain_all(plans, optflags.clone().inner.into_inner()))?;
137
+ Ok(explained)
138
+ }
139
+
133
140
  pub fn collect_all_lazy(lfs: RArray, optflags: &RbOptFlags) -> RbResult<RbLazyFrame> {
134
141
  let plans = lfs_to_plans(lfs)?;
135
142
 
@@ -138,6 +138,19 @@ pub fn time_ranges(
138
138
  Ok(dsl::time_ranges(start, end, every, closed).into())
139
139
  }
140
140
 
141
+ pub fn linear_space(
142
+ start: &RbExpr,
143
+ end: &RbExpr,
144
+ num_samples: &RbExpr,
145
+ closed: Wrap<ClosedInterval>,
146
+ ) -> RbResult<RbExpr> {
147
+ let start = start.inner.clone();
148
+ let end = end.inner.clone();
149
+ let num_samples = num_samples.inner.clone();
150
+ let closed = closed.0;
151
+ Ok(dsl::linear_space(start, end, num_samples, closed).into())
152
+ }
153
+
141
154
  pub fn linear_spaces(
142
155
  start: &RbExpr,
143
156
  end: &RbExpr,
@@ -11,7 +11,7 @@ use crate::RbResult;
11
11
 
12
12
  #[magnus::wrap(class = "Polars::ArrowArrayStream")]
13
13
  pub struct RbArrowArrayStream {
14
- stream: ffi::ArrowArrayStream,
14
+ pub(crate) stream: ffi::ArrowArrayStream,
15
15
  }
16
16
 
17
17
  impl RbArrowArrayStream {
@@ -1,3 +1,4 @@
1
+ use arrow::ffi::export_iterator;
1
2
  use magnus::{
2
3
  IntoValue, RArray, RHash, Ruby, TryConvert, Value, r_hash::ForEach, value::ReprValue,
3
4
  };
@@ -19,8 +20,13 @@ use crate::io::sink_options::RbSinkOptions;
19
20
  use crate::io::sink_output::RbFileSinkDestination;
20
21
  use crate::ruby::gvl::GvlExt;
21
22
  use crate::ruby::lazy::RubyUdfLazyFrameExt;
22
- use crate::utils::EnterPolarsExt;
23
- use crate::{RbDataFrame, RbExpr, RbLazyGroupBy, RbPolarsErr, RbResult, RbValueError};
23
+ use crate::ruby::plan_callback::PlanCallbackExt;
24
+ use crate::ruby::ruby_function::RubyObject;
25
+ use crate::utils::{EnterPolarsExt, to_rb_err};
26
+ use crate::{
27
+ RbArrowArrayStream, RbDataFrame, RbExpr, RbLazyGroupBy, RbPolarsErr, RbResult, RbTypeError,
28
+ RbValueError,
29
+ };
24
30
 
25
31
  fn rbobject_to_first_path_and_scan_sources(
26
32
  obj: Value,
@@ -272,6 +278,25 @@ impl RbLazyFrame {
272
278
  Ok(lf.into())
273
279
  }
274
280
 
281
+ pub fn new_from_scan_lines(
282
+ sources: Wrap<ScanSources>,
283
+ scan_options: RbScanOptions,
284
+ name: String,
285
+ ) -> RbResult<Self> {
286
+ let sources = sources.0;
287
+ let first_path = sources.first_path();
288
+
289
+ let unified_scan_args =
290
+ scan_options.extract_unified_scan_args(first_path.and_then(|x| x.scheme()))?;
291
+
292
+ let dsl: DslPlan = DslBuilder::scan_lines(sources, unified_scan_args, (&*name).into())
293
+ .map_err(to_rb_err)?
294
+ .build();
295
+ let lf: LazyFrame = dsl.into();
296
+
297
+ Ok(lf.into())
298
+ }
299
+
275
300
  pub fn describe_plan(rb: &Ruby, self_: &Self) -> RbResult<String> {
276
301
  rb.enter_polars(|| self_.ldf.read().describe_plan())
277
302
  }
@@ -409,7 +434,7 @@ impl RbLazyFrame {
409
434
 
410
435
  PolarsResult::Ok(RbCollectBatches {
411
436
  inner: Arc::new(Mutex::new(collect_batches)),
412
- _ldf: ldf,
437
+ ldf,
413
438
  })
414
439
  })
415
440
  }
@@ -573,6 +598,20 @@ impl RbLazyFrame {
573
598
  .map(Into::into)
574
599
  }
575
600
 
601
+ pub fn sink_batches(
602
+ rb: &Ruby,
603
+ self_: &Self,
604
+ function: Value,
605
+ maintain_order: bool,
606
+ chunk_size: Option<NonZeroUsize>,
607
+ ) -> RbResult<RbLazyFrame> {
608
+ let ldf = self_.ldf.read().clone();
609
+ // ensure new_ruby is called with GVL
610
+ let callback = PlanCallback::new_ruby(RubyObject::from(function));
611
+ rb.enter_polars(|| ldf.sink_batches(callback, maintain_order, chunk_size))
612
+ .map(Into::into)
613
+ }
614
+
576
615
  pub fn filter(&self, predicate: &RbExpr) -> Self {
577
616
  let ldf = self.ldf.read().clone();
578
617
  ldf.filter(predicate.inner.clone()).into()
@@ -786,6 +825,127 @@ impl RbLazyFrame {
786
825
  Ok(ldf.with_columns_seq(exprs.to_exprs()?).into())
787
826
  }
788
827
 
828
+ pub fn match_to_schema(
829
+ &self,
830
+ schema: Wrap<Schema>,
831
+ missing_columns: Value,
832
+ missing_struct_fields: Value,
833
+ extra_columns: Wrap<ExtraColumnsPolicy>,
834
+ extra_struct_fields: Value,
835
+ integer_cast: Value,
836
+ float_cast: Value,
837
+ ) -> RbResult<Self> {
838
+ fn parse_missing_columns(
839
+ schema: &Schema,
840
+ missing_columns: Value,
841
+ ) -> RbResult<Vec<MissingColumnsPolicyOrExpr>> {
842
+ let mut out = Vec::with_capacity(schema.len());
843
+ if let Ok(policy) = Wrap::<MissingColumnsPolicyOrExpr>::try_convert(missing_columns) {
844
+ out.extend(std::iter::repeat_n(policy.0, schema.len()));
845
+ } else if let Ok(dict) = RHash::try_convert(missing_columns) {
846
+ out.extend(std::iter::repeat_n(
847
+ MissingColumnsPolicyOrExpr::Raise,
848
+ schema.len(),
849
+ ));
850
+ dict.foreach(|key: String, value: Wrap<MissingColumnsPolicyOrExpr>| {
851
+ out[schema.try_index_of(&key).map_err(to_rb_err)?] = value.0;
852
+ Ok(ForEach::Continue)
853
+ })?;
854
+ } else {
855
+ return Err(RbTypeError::new_err("Invalid value for `missing_columns`"));
856
+ }
857
+ Ok(out)
858
+ }
859
+ fn parse_missing_struct_fields(
860
+ schema: &Schema,
861
+ missing_struct_fields: Value,
862
+ ) -> RbResult<Vec<MissingColumnsPolicy>> {
863
+ let mut out = Vec::with_capacity(schema.len());
864
+ if let Ok(policy) = Wrap::<MissingColumnsPolicy>::try_convert(missing_struct_fields) {
865
+ out.extend(std::iter::repeat_n(policy.0, schema.len()));
866
+ } else if let Ok(dict) = RHash::try_convert(missing_struct_fields) {
867
+ out.extend(std::iter::repeat_n(
868
+ MissingColumnsPolicy::Raise,
869
+ schema.len(),
870
+ ));
871
+ dict.foreach(|key: String, value: Wrap<MissingColumnsPolicy>| {
872
+ out[schema.try_index_of(&key).map_err(to_rb_err)?] = value.0;
873
+ Ok(ForEach::Continue)
874
+ })?;
875
+ } else {
876
+ return Err(RbTypeError::new_err(
877
+ "Invalid value for `missing_struct_fields`",
878
+ ));
879
+ }
880
+ Ok(out)
881
+ }
882
+ fn parse_extra_struct_fields(
883
+ schema: &Schema,
884
+ extra_struct_fields: Value,
885
+ ) -> RbResult<Vec<ExtraColumnsPolicy>> {
886
+ let mut out = Vec::with_capacity(schema.len());
887
+ if let Ok(policy) = Wrap::<ExtraColumnsPolicy>::try_convert(extra_struct_fields) {
888
+ out.extend(std::iter::repeat_n(policy.0, schema.len()));
889
+ } else if let Ok(dict) = RHash::try_convert(extra_struct_fields) {
890
+ out.extend(std::iter::repeat_n(ExtraColumnsPolicy::Raise, schema.len()));
891
+ dict.foreach(|key: String, value: Wrap<ExtraColumnsPolicy>| {
892
+ out[schema.try_index_of(&key).map_err(to_rb_err)?] = value.0;
893
+ Ok(ForEach::Continue)
894
+ })?;
895
+ } else {
896
+ return Err(RbTypeError::new_err(
897
+ "Invalid value for `extra_struct_fields`",
898
+ ));
899
+ }
900
+ Ok(out)
901
+ }
902
+ fn parse_cast(schema: &Schema, cast: Value) -> RbResult<Vec<UpcastOrForbid>> {
903
+ let mut out = Vec::with_capacity(schema.len());
904
+ if let Ok(policy) = Wrap::<UpcastOrForbid>::try_convert(cast) {
905
+ out.extend(std::iter::repeat_n(policy.0, schema.len()));
906
+ } else if let Ok(dict) = RHash::try_convert(cast) {
907
+ out.extend(std::iter::repeat_n(UpcastOrForbid::Forbid, schema.len()));
908
+ dict.foreach(|key: String, value: Wrap<UpcastOrForbid>| {
909
+ out[schema.try_index_of(&key).map_err(to_rb_err)?] = value.0;
910
+ Ok(ForEach::Continue)
911
+ })?;
912
+ } else {
913
+ return Err(RbTypeError::new_err(
914
+ "Invalid value for `integer_cast` / `float_cast`",
915
+ ));
916
+ }
917
+ Ok(out)
918
+ }
919
+
920
+ let missing_columns = parse_missing_columns(&schema.0, missing_columns)?;
921
+ let missing_struct_fields = parse_missing_struct_fields(&schema.0, missing_struct_fields)?;
922
+ let extra_struct_fields = parse_extra_struct_fields(&schema.0, extra_struct_fields)?;
923
+ let integer_cast = parse_cast(&schema.0, integer_cast)?;
924
+ let float_cast = parse_cast(&schema.0, float_cast)?;
925
+
926
+ let per_column = (0..schema.0.len())
927
+ .map(|i| MatchToSchemaPerColumn {
928
+ missing_columns: missing_columns[i].clone(),
929
+ missing_struct_fields: missing_struct_fields[i],
930
+ extra_struct_fields: extra_struct_fields[i],
931
+ integer_cast: integer_cast[i],
932
+ float_cast: float_cast[i],
933
+ })
934
+ .collect();
935
+
936
+ let ldf = self.ldf.read().clone();
937
+ Ok(ldf
938
+ .match_to_schema(Arc::new(schema.0), per_column, extra_columns.0)
939
+ .into())
940
+ }
941
+
942
+ pub fn pipe_with_schema(&self, callback: Value) -> Self {
943
+ let ldf = self.ldf.read().clone();
944
+ let function = RubyObject::from(callback);
945
+ ldf.pipe_with_schema(PlanCallback::new_ruby(function))
946
+ .into()
947
+ }
948
+
789
949
  pub fn rename(&self, existing: Vec<String>, new: Vec<String>, strict: bool) -> Self {
790
950
  let ldf = self.ldf.read().clone();
791
951
  ldf.rename(existing, new, strict).into()
@@ -1118,7 +1278,7 @@ impl RbLazyFrame {
1118
1278
  #[magnus::wrap(class = "Polars::RbCollectBatches")]
1119
1279
  pub struct RbCollectBatches {
1120
1280
  inner: Arc<Mutex<CollectBatches>>,
1121
- _ldf: LazyFrame,
1281
+ ldf: LazyFrame,
1122
1282
  }
1123
1283
 
1124
1284
  impl RbCollectBatches {
@@ -1126,4 +1286,55 @@ impl RbCollectBatches {
1126
1286
  let inner = Arc::clone(&slf.inner);
1127
1287
  rb.enter_polars(|| PolarsResult::Ok(inner.lock().next().transpose()?.map(RbDataFrame::new)))
1128
1288
  }
1289
+
1290
+ pub fn __arrow_c_stream__(rb: &Ruby, self_: &Self) -> RbResult<Value> {
1291
+ let mut ldf = self_.ldf.clone();
1292
+ let schema = ldf
1293
+ .collect_schema()
1294
+ .map_err(RbPolarsErr::from)?
1295
+ .to_arrow(CompatLevel::newest());
1296
+
1297
+ let dtype = ArrowDataType::Struct(schema.into_iter_values().collect());
1298
+
1299
+ let iter = Box::new(ArrowStreamIterator::new(self_.inner.clone(), dtype.clone()));
1300
+ let field = ArrowField::new(PlSmallStr::EMPTY, dtype, false);
1301
+ let stream = export_iterator(iter, field);
1302
+ Ok(RbArrowArrayStream { stream }.into_value_with(rb))
1303
+ }
1304
+ }
1305
+
1306
+ pub struct ArrowStreamIterator {
1307
+ inner: Arc<Mutex<CollectBatches>>,
1308
+ dtype: ArrowDataType,
1309
+ }
1310
+
1311
+ impl ArrowStreamIterator {
1312
+ fn new(inner: Arc<Mutex<CollectBatches>>, schema: ArrowDataType) -> Self {
1313
+ Self {
1314
+ inner,
1315
+ dtype: schema,
1316
+ }
1317
+ }
1318
+ }
1319
+
1320
+ impl Iterator for ArrowStreamIterator {
1321
+ type Item = PolarsResult<ArrayRef>;
1322
+
1323
+ fn next(&mut self) -> Option<Self::Item> {
1324
+ let next = self.inner.lock().next();
1325
+ match next {
1326
+ None => None,
1327
+ Some(Err(err)) => Some(Err(err)),
1328
+ Some(Ok(df)) => {
1329
+ let height = df.height();
1330
+ let arrays = df.rechunk_into_arrow(CompatLevel::newest());
1331
+ Some(Ok(Box::new(arrow::array::StructArray::new(
1332
+ self.dtype.clone(),
1333
+ height,
1334
+ arrays,
1335
+ None,
1336
+ ))))
1337
+ }
1338
+ }
1339
+ }
1129
1340
  }