polars-df 0.25.0 → 0.25.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/Cargo.lock +5 -5
- data/ext/polars/Cargo.toml +2 -1
- data/ext/polars/src/conversion/any_value.rs +28 -5
- data/ext/polars/src/conversion/mod.rs +49 -13
- data/ext/polars/src/dataframe/general.rs +20 -3
- data/ext/polars/src/expr/datatype.rs +11 -2
- data/ext/polars/src/expr/general.rs +8 -0
- data/ext/polars/src/expr/rolling.rs +24 -0
- data/ext/polars/src/functions/lazy.rs +7 -0
- data/ext/polars/src/functions/range.rs +13 -0
- data/ext/polars/src/interop/arrow/to_rb.rs +1 -1
- data/ext/polars/src/lazyframe/general.rs +215 -4
- data/ext/polars/src/lib.rs +25 -2
- data/ext/polars/src/on_startup.rs +11 -1
- data/ext/polars/src/ruby/plan_callback.rs +1 -4
- data/ext/polars/src/series/construction.rs +31 -50
- data/ext/polars/src/series/general.rs +2 -3
- data/lib/polars/collect_batches.rb +4 -0
- data/lib/polars/data_frame.rb +71 -1
- data/lib/polars/data_type_group.rb +5 -0
- data/lib/polars/expr.rb +125 -0
- data/lib/polars/functions/datatype.rb +30 -0
- data/lib/polars/functions/lazy.rb +92 -0
- data/lib/polars/functions/range/linear_space.rb +118 -0
- data/lib/polars/io/database.rb +2 -3
- data/lib/polars/io/ipc.rb +1 -1
- data/lib/polars/io/lines.rb +172 -0
- data/lib/polars/io/parquet.rb +1 -1
- data/lib/polars/io/sink_options.rb +1 -0
- data/lib/polars/lazy_frame.rb +420 -0
- data/lib/polars/query_opt_flags.rb +1 -0
- data/lib/polars/selectors.rb +1 -1
- data/lib/polars/series.rb +88 -0
- data/lib/polars/utils/construction/series.rb +19 -39
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 3f8d65cfb4e8577c4f9e92f2f2271f97f44b0bc66bd16e9b517a5ab5dae9a877
|
|
4
|
+
data.tar.gz: 7b9254cc522bfa6eae53986384d33c96933ffa788b47f4f6e82bc46d2cbf6395
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 97754dc2c36dbc5e2d7d99bf71973d05107d9124d1dc7bea1bd760ee77d9ac303898a72ac301c798fe4ae77cd24a086a28b20db2e8254c8ef300b3df205e33a5
|
|
7
|
+
data.tar.gz: 61cbe25d941e7451bcce590a263912967430aa31638db0cfe699d05754ebae71da3495bcc6ac7305afcfd9cfb3bb39670785501e196eae5f6fd6ba631bc92d27
|
data/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,10 @@
|
|
|
1
|
+
## 0.25.1 (2026-03-23)
|
|
2
|
+
|
|
3
|
+
- Added `explain_all` method to `Polars`
|
|
4
|
+
- Fixed `storage_options` option for `scan_parquet` and `scan_ipc` methods
|
|
5
|
+
- Fixed `Struct` construction with `nil` values
|
|
6
|
+
- Fixed `read_database` for JSON columns
|
|
7
|
+
|
|
1
8
|
## 0.25.0 (2026-02-18)
|
|
2
9
|
|
|
3
10
|
- Updated Polars to 0.53.0
|
data/Cargo.lock
CHANGED
|
@@ -2160,7 +2160,7 @@ dependencies = [
|
|
|
2160
2160
|
|
|
2161
2161
|
[[package]]
|
|
2162
2162
|
name = "polars-ruby"
|
|
2163
|
-
version = "0.25.0"
|
|
2163
|
+
version = "0.25.1"
|
|
2164
2164
|
dependencies = [
|
|
2165
2165
|
"ahash",
|
|
2166
2166
|
"bytes",
|
|
@@ -2408,9 +2408,9 @@ dependencies = [
|
|
|
2408
2408
|
|
|
2409
2409
|
[[package]]
|
|
2410
2410
|
name = "quinn-proto"
|
|
2411
|
-
version = "0.11.
|
|
2411
|
+
version = "0.11.14"
|
|
2412
2412
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2413
|
-
checksum = "
|
|
2413
|
+
checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098"
|
|
2414
2414
|
dependencies = [
|
|
2415
2415
|
"bytes",
|
|
2416
2416
|
"getrandom 0.3.3",
|
|
@@ -2811,9 +2811,9 @@ dependencies = [
|
|
|
2811
2811
|
|
|
2812
2812
|
[[package]]
|
|
2813
2813
|
name = "rustls-webpki"
|
|
2814
|
-
version = "0.103.
|
|
2814
|
+
version = "0.103.10"
|
|
2815
2815
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2816
|
-
checksum = "
|
|
2816
|
+
checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef"
|
|
2817
2817
|
dependencies = [
|
|
2818
2818
|
"ring",
|
|
2819
2819
|
"rustls-pki-types",
|
data/ext/polars/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "polars-ruby"
|
|
3
|
-
version = "0.25.0"
|
|
3
|
+
version = "0.25.1"
|
|
4
4
|
license = "MIT"
|
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
|
6
6
|
edition = "2024"
|
|
@@ -130,6 +130,7 @@ features = [
|
|
|
130
130
|
"rolling_window_by",
|
|
131
131
|
"round_series",
|
|
132
132
|
"row_hash",
|
|
133
|
+
"scan_lines",
|
|
133
134
|
"search_sorted",
|
|
134
135
|
"semi_anti_join",
|
|
135
136
|
"serde-lazy",
|
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
use magnus::encoding::EncodingCapable;
|
|
2
2
|
use magnus::{
|
|
3
|
-
IntoValue, RArray, RHash, RString, Ruby, TryConvert, Value, prelude::*,
|
|
3
|
+
IntoValue, RArray, RHash, RString, Ruby, Symbol, TryConvert, Value, prelude::*,
|
|
4
|
+
r_hash::ForEach, value::Opaque,
|
|
4
5
|
};
|
|
5
6
|
use num_traits::ToPrimitive;
|
|
7
|
+
use polars::chunked_array::object::PolarsObjectSafe;
|
|
6
8
|
use polars::prelude::*;
|
|
7
9
|
use polars_compute::decimal::{DEC128_MAX_PREC, DecimalFmtBuffer, dec128_fits};
|
|
8
10
|
use polars_core::utils::any_values_to_supertype_and_n_dtypes;
|
|
@@ -22,7 +24,7 @@ impl IntoValue for Wrap<AnyValue<'_>> {
|
|
|
22
24
|
|
|
23
25
|
impl TryConvert for Wrap<AnyValue<'_>> {
|
|
24
26
|
fn try_convert(ob: Value) -> RbResult<Self> {
|
|
25
|
-
rb_object_to_any_value(ob, true).map(Wrap)
|
|
27
|
+
rb_object_to_any_value(ob, true, true).map(Wrap)
|
|
26
28
|
}
|
|
27
29
|
}
|
|
28
30
|
|
|
@@ -88,7 +90,11 @@ pub(crate) fn any_value_into_rb_object(av: AnyValue, ruby: &Ruby) -> Value {
|
|
|
88
90
|
}
|
|
89
91
|
}
|
|
90
92
|
|
|
91
|
-
pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<AnyValue<'s>> {
|
|
93
|
+
pub(crate) fn rb_object_to_any_value<'s>(
|
|
94
|
+
ob: Value,
|
|
95
|
+
strict: bool,
|
|
96
|
+
allow_object: bool,
|
|
97
|
+
) -> RbResult<AnyValue<'s>> {
|
|
92
98
|
// Conversion functions.
|
|
93
99
|
fn get_null(_ob: Value, _strict: bool) -> RbResult<AnyValue<'static>> {
|
|
94
100
|
Ok(AnyValue::Null)
|
|
@@ -174,7 +180,12 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
|
|
|
174
180
|
let len = dict.len();
|
|
175
181
|
let mut keys = Vec::with_capacity(len);
|
|
176
182
|
let mut vals = Vec::with_capacity(len);
|
|
177
|
-
dict.foreach(|key:
|
|
183
|
+
dict.foreach(|key: Value, val: Wrap<AnyValue>| {
|
|
184
|
+
let key = if let Some(s) = Symbol::from_value(key) {
|
|
185
|
+
s.name()?.to_string()
|
|
186
|
+
} else {
|
|
187
|
+
String::try_convert(key)?
|
|
188
|
+
};
|
|
178
189
|
let val = val.0;
|
|
179
190
|
let dtype = DataType::from(&val);
|
|
180
191
|
keys.push(Field::new(key.into(), dtype));
|
|
@@ -184,6 +195,14 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
|
|
|
184
195
|
Ok(AnyValue::StructOwned(Box::new((vals, keys))))
|
|
185
196
|
}
|
|
186
197
|
|
|
198
|
+
fn get_object(ob: Value, _strict: bool) -> RbResult<AnyValue<'static>> {
|
|
199
|
+
// This is slow, but hey don't use objects.
|
|
200
|
+
let v = &ObjectValue {
|
|
201
|
+
inner: Opaque::from(ob),
|
|
202
|
+
};
|
|
203
|
+
Ok(AnyValue::ObjectOwned(OwnedObject(v.to_boxed())))
|
|
204
|
+
}
|
|
205
|
+
|
|
187
206
|
fn get_date(ob: Value, _strict: bool) -> RbResult<AnyValue<'static>> {
|
|
188
207
|
// convert to DateTime for UTC
|
|
189
208
|
let v = ob
|
|
@@ -263,6 +282,10 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
|
|
|
263
282
|
} else if ob.is_kind_of(crate::ruby::rb_modules::bigdecimal(&ruby)) {
|
|
264
283
|
get_decimal(ob, strict)
|
|
265
284
|
} else {
|
|
266
|
-
|
|
285
|
+
if allow_object {
|
|
286
|
+
get_object(ob, strict)
|
|
287
|
+
} else {
|
|
288
|
+
Err(RbValueError::new_err(format!("Cannot convert {ob}")))
|
|
289
|
+
}
|
|
267
290
|
}
|
|
268
291
|
}
|
|
@@ -37,19 +37,9 @@ use crate::file::{RubyScanSourceInput, get_ruby_scan_source_input};
|
|
|
37
37
|
use crate::object::OBJECT_NAME;
|
|
38
38
|
use crate::rb_modules::pl_series;
|
|
39
39
|
use crate::utils::to_rb_err;
|
|
40
|
-
use crate::{
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
// Safety:
|
|
44
|
-
// Wrap is transparent.
|
|
45
|
-
unsafe { std::mem::transmute(slice) }
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
pub(crate) fn vec_extract_wrapped<T>(buf: Vec<Wrap<T>>) -> Vec<T> {
|
|
49
|
-
// Safety:
|
|
50
|
-
// Wrap is transparent.
|
|
51
|
-
unsafe { std::mem::transmute(buf) }
|
|
52
|
-
}
|
|
40
|
+
use crate::{
|
|
41
|
+
RbDataFrame, RbExpr, RbLazyFrame, RbPolarsErr, RbResult, RbSeries, RbTypeError, RbValueError,
|
|
42
|
+
};
|
|
53
43
|
|
|
54
44
|
#[repr(transparent)]
|
|
55
45
|
pub struct Wrap<T>(pub T);
|
|
@@ -648,6 +638,16 @@ impl TryConvert for Wrap<ScanSources> {
|
|
|
648
638
|
}
|
|
649
639
|
}
|
|
650
640
|
|
|
641
|
+
impl IntoValue for Wrap<Schema> {
|
|
642
|
+
fn into_value_with(self, ruby: &Ruby) -> Value {
|
|
643
|
+
let dict = ruby.hash_new();
|
|
644
|
+
for (k, v) in self.0.iter() {
|
|
645
|
+
dict.aset(k.as_str(), Wrap(v.clone())).unwrap();
|
|
646
|
+
}
|
|
647
|
+
dict.as_value()
|
|
648
|
+
}
|
|
649
|
+
}
|
|
650
|
+
|
|
651
651
|
#[derive(Clone)]
|
|
652
652
|
pub struct ObjectValue {
|
|
653
653
|
pub inner: Opaque<Value>,
|
|
@@ -1491,6 +1491,21 @@ impl TryConvert for Wrap<Option<TimeZone>> {
|
|
|
1491
1491
|
|
|
1492
1492
|
unsafe impl TryConvertOwned for Wrap<Option<TimeZone>> {}
|
|
1493
1493
|
|
|
1494
|
+
impl TryConvert for Wrap<UpcastOrForbid> {
|
|
1495
|
+
fn try_convert(ob: Value) -> RbResult<Self> {
|
|
1496
|
+
let parsed = match String::try_convert(ob)?.as_str() {
|
|
1497
|
+
"upcast" => UpcastOrForbid::Upcast,
|
|
1498
|
+
"forbid" => UpcastOrForbid::Forbid,
|
|
1499
|
+
v => {
|
|
1500
|
+
return Err(RbValueError::new_err(format!(
|
|
1501
|
+
"cast parameter must be one of {{'upcast', 'forbid'}}, got {v}",
|
|
1502
|
+
)));
|
|
1503
|
+
}
|
|
1504
|
+
};
|
|
1505
|
+
Ok(Wrap(parsed))
|
|
1506
|
+
}
|
|
1507
|
+
}
|
|
1508
|
+
|
|
1494
1509
|
impl TryConvert for Wrap<ExtraColumnsPolicy> {
|
|
1495
1510
|
fn try_convert(ob: Value) -> RbResult<Self> {
|
|
1496
1511
|
let parsed = match String::try_convert(ob)?.as_str() {
|
|
@@ -1521,6 +1536,27 @@ impl TryConvert for Wrap<MissingColumnsPolicy> {
|
|
|
1521
1536
|
}
|
|
1522
1537
|
}
|
|
1523
1538
|
|
|
1539
|
+
impl TryConvert for Wrap<MissingColumnsPolicyOrExpr> {
|
|
1540
|
+
fn try_convert(ob: Value) -> RbResult<Self> {
|
|
1541
|
+
if let Ok(rbexpr) = <&RbExpr>::try_convert(ob) {
|
|
1542
|
+
return Ok(Wrap(MissingColumnsPolicyOrExpr::InsertWith(
|
|
1543
|
+
rbexpr.inner.clone(),
|
|
1544
|
+
)));
|
|
1545
|
+
}
|
|
1546
|
+
|
|
1547
|
+
let parsed = match String::try_convert(ob)?.as_str() {
|
|
1548
|
+
"insert" => MissingColumnsPolicyOrExpr::Insert,
|
|
1549
|
+
"raise" => MissingColumnsPolicyOrExpr::Raise,
|
|
1550
|
+
v => {
|
|
1551
|
+
return Err(RbValueError::new_err(format!(
|
|
1552
|
+
"missing column/field parameter must be one of {{'insert', 'raise', expression}}, got {v}",
|
|
1553
|
+
)));
|
|
1554
|
+
}
|
|
1555
|
+
};
|
|
1556
|
+
Ok(Wrap(parsed))
|
|
1557
|
+
}
|
|
1558
|
+
}
|
|
1559
|
+
|
|
1524
1560
|
impl TryConvert for Wrap<ColumnMapping> {
|
|
1525
1561
|
fn try_convert(ob: Value) -> RbResult<Self> {
|
|
1526
1562
|
let (column_mapping_type, ob) = <(String, Value)>::try_convert(ob)?;
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
use std::hash::BuildHasher;
|
|
2
2
|
|
|
3
|
+
use arrow::bitmap::MutableBitmap;
|
|
3
4
|
use either::Either;
|
|
4
5
|
use magnus::{IntoValue, RArray, Ruby, Value, prelude::*, value::Opaque};
|
|
5
6
|
use polars::prelude::*;
|
|
@@ -503,10 +504,26 @@ impl RbDataFrame {
|
|
|
503
504
|
})
|
|
504
505
|
}
|
|
505
506
|
|
|
506
|
-
pub fn to_struct(
|
|
507
|
+
pub fn to_struct(
|
|
508
|
+
rb: &Ruby,
|
|
509
|
+
self_: &Self,
|
|
510
|
+
name: String,
|
|
511
|
+
invalid_indices: Vec<usize>,
|
|
512
|
+
) -> RbResult<RbSeries> {
|
|
507
513
|
rb.enter_polars_series(|| {
|
|
508
|
-
let ca = self_.df.read().clone().into_struct(name.into());
|
|
509
|
-
|
|
514
|
+
let mut ca = self_.df.read().clone().into_struct(name.into());
|
|
515
|
+
|
|
516
|
+
if !invalid_indices.is_empty() {
|
|
517
|
+
let mut validity = MutableBitmap::with_capacity(ca.len());
|
|
518
|
+
validity.extend_constant(ca.len(), true);
|
|
519
|
+
for i in invalid_indices {
|
|
520
|
+
validity.set(i, false);
|
|
521
|
+
}
|
|
522
|
+
ca.rechunk_mut();
|
|
523
|
+
Ok(ca.with_outer_validity(Some(validity.freeze())))
|
|
524
|
+
} else {
|
|
525
|
+
Ok(ca)
|
|
526
|
+
}
|
|
510
527
|
})
|
|
511
528
|
}
|
|
512
529
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
use magnus::{IntoValue, Ruby, Value};
|
|
2
|
-
use polars::prelude::{DataType, DataTypeExpr, Schema};
|
|
1
|
+
use magnus::{IntoValue, RArray, Ruby, TryConvert, Value};
|
|
2
|
+
use polars::prelude::{DataType, DataTypeExpr, PlSmallStr, Schema};
|
|
3
3
|
|
|
4
4
|
use crate::prelude::Wrap;
|
|
5
5
|
use crate::{RbExpr, RbPolarsErr, RbResult};
|
|
@@ -38,4 +38,13 @@ impl RbDataTypeExpr {
|
|
|
38
38
|
.map_err(RbPolarsErr::from)?;
|
|
39
39
|
Ok(Wrap(dtype).into_value_with(ruby))
|
|
40
40
|
}
|
|
41
|
+
|
|
42
|
+
pub fn struct_with_fields(rb_fields: RArray) -> RbResult<Self> {
|
|
43
|
+
let mut fields = Vec::new();
|
|
44
|
+
for v in rb_fields.into_iter() {
|
|
45
|
+
let (name, dt_expr) = <(String, &RbDataTypeExpr)>::try_convert(v)?;
|
|
46
|
+
fields.push((PlSmallStr::from_string(name), dt_expr.inner.clone()));
|
|
47
|
+
}
|
|
48
|
+
Ok(DataTypeExpr::StructWithFields(fields).into())
|
|
49
|
+
}
|
|
41
50
|
}
|
|
@@ -118,6 +118,14 @@ impl RbExpr {
|
|
|
118
118
|
self.inner.clone().max().into()
|
|
119
119
|
}
|
|
120
120
|
|
|
121
|
+
/// Return a new expression that applies Polars' `min_by` to this expression,
/// using `by` as the argument. Neither input expression is modified.
pub fn min_by(&self, by: &Self) -> Self {
    let expr = self.inner.clone();
    let key = by.inner.clone();
    expr.min_by(key).into()
}
|
|
124
|
+
|
|
125
|
+
/// Return a new expression that applies Polars' `max_by` to this expression,
/// using `by` as the argument. Neither input expression is modified.
pub fn max_by(&self, by: &Self) -> Self {
    let expr = self.inner.clone();
    let key = by.inner.clone();
    expr.max_by(key).into()
}
|
|
128
|
+
|
|
121
129
|
pub fn nan_max(&self) -> Self {
|
|
122
130
|
self.inner.clone().nan_max().into()
|
|
123
131
|
}
|
|
@@ -1,6 +1,9 @@
|
|
|
1
|
+
use magnus::Value;
|
|
1
2
|
use polars::prelude::*;
|
|
2
3
|
|
|
3
4
|
use crate::conversion::Wrap;
|
|
5
|
+
use crate::ruby::plan_callback::PlanCallbackExt;
|
|
6
|
+
use crate::ruby::ruby_function::RubyObject;
|
|
4
7
|
use crate::{RbExpr, RbPolarsErr, RbResult};
|
|
5
8
|
|
|
6
9
|
impl RbExpr {
|
|
@@ -406,4 +409,25 @@ impl RbExpr {
|
|
|
406
409
|
|
|
407
410
|
self.inner.clone().rolling_kurtosis(options).into()
|
|
408
411
|
}
|
|
412
|
+
|
|
413
|
+
pub fn rolling_map(
|
|
414
|
+
&self,
|
|
415
|
+
lambda: Value,
|
|
416
|
+
window_size: usize,
|
|
417
|
+
weights: Option<Vec<f64>>,
|
|
418
|
+
min_periods: Option<usize>,
|
|
419
|
+
center: bool,
|
|
420
|
+
) -> Self {
|
|
421
|
+
let min_periods = min_periods.unwrap_or(window_size);
|
|
422
|
+
let options = RollingOptionsFixedWindow {
|
|
423
|
+
window_size,
|
|
424
|
+
weights,
|
|
425
|
+
min_periods,
|
|
426
|
+
center,
|
|
427
|
+
..Default::default()
|
|
428
|
+
};
|
|
429
|
+
let function = PlanCallback::new_ruby(RubyObject::from(lambda));
|
|
430
|
+
|
|
431
|
+
self.inner.clone().rolling_map(function, options).into()
|
|
432
|
+
}
|
|
409
433
|
}
|
|
@@ -130,6 +130,13 @@ pub fn collect_all(
|
|
|
130
130
|
Ok(ruby.ary_from_iter(dfs.into_iter().map(Into::<RbDataFrame>::into)))
|
|
131
131
|
}
|
|
132
132
|
|
|
133
|
+
pub fn explain_all(rb: &Ruby, lfs: RArray, optflags: &RbOptFlags) -> RbResult<String> {
|
|
134
|
+
let plans = lfs_to_plans(lfs)?;
|
|
135
|
+
let explained =
|
|
136
|
+
rb.enter_polars(|| LazyFrame::explain_all(plans, optflags.clone().inner.into_inner()))?;
|
|
137
|
+
Ok(explained)
|
|
138
|
+
}
|
|
139
|
+
|
|
133
140
|
pub fn collect_all_lazy(lfs: RArray, optflags: &RbOptFlags) -> RbResult<RbLazyFrame> {
|
|
134
141
|
let plans = lfs_to_plans(lfs)?;
|
|
135
142
|
|
|
@@ -138,6 +138,19 @@ pub fn time_ranges(
|
|
|
138
138
|
Ok(dsl::time_ranges(start, end, every, closed).into())
|
|
139
139
|
}
|
|
140
140
|
|
|
141
|
+
pub fn linear_space(
|
|
142
|
+
start: &RbExpr,
|
|
143
|
+
end: &RbExpr,
|
|
144
|
+
num_samples: &RbExpr,
|
|
145
|
+
closed: Wrap<ClosedInterval>,
|
|
146
|
+
) -> RbResult<RbExpr> {
|
|
147
|
+
let start = start.inner.clone();
|
|
148
|
+
let end = end.inner.clone();
|
|
149
|
+
let num_samples = num_samples.inner.clone();
|
|
150
|
+
let closed = closed.0;
|
|
151
|
+
Ok(dsl::linear_space(start, end, num_samples, closed).into())
|
|
152
|
+
}
|
|
153
|
+
|
|
141
154
|
pub fn linear_spaces(
|
|
142
155
|
start: &RbExpr,
|
|
143
156
|
end: &RbExpr,
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
use arrow::ffi::export_iterator;
|
|
1
2
|
use magnus::{
|
|
2
3
|
IntoValue, RArray, RHash, Ruby, TryConvert, Value, r_hash::ForEach, value::ReprValue,
|
|
3
4
|
};
|
|
@@ -19,8 +20,13 @@ use crate::io::sink_options::RbSinkOptions;
|
|
|
19
20
|
use crate::io::sink_output::RbFileSinkDestination;
|
|
20
21
|
use crate::ruby::gvl::GvlExt;
|
|
21
22
|
use crate::ruby::lazy::RubyUdfLazyFrameExt;
|
|
22
|
-
use crate::
|
|
23
|
-
use crate::
|
|
23
|
+
use crate::ruby::plan_callback::PlanCallbackExt;
|
|
24
|
+
use crate::ruby::ruby_function::RubyObject;
|
|
25
|
+
use crate::utils::{EnterPolarsExt, to_rb_err};
|
|
26
|
+
use crate::{
|
|
27
|
+
RbArrowArrayStream, RbDataFrame, RbExpr, RbLazyGroupBy, RbPolarsErr, RbResult, RbTypeError,
|
|
28
|
+
RbValueError,
|
|
29
|
+
};
|
|
24
30
|
|
|
25
31
|
fn rbobject_to_first_path_and_scan_sources(
|
|
26
32
|
obj: Value,
|
|
@@ -272,6 +278,25 @@ impl RbLazyFrame {
|
|
|
272
278
|
Ok(lf.into())
|
|
273
279
|
}
|
|
274
280
|
|
|
281
|
+
pub fn new_from_scan_lines(
|
|
282
|
+
sources: Wrap<ScanSources>,
|
|
283
|
+
scan_options: RbScanOptions,
|
|
284
|
+
name: String,
|
|
285
|
+
) -> RbResult<Self> {
|
|
286
|
+
let sources = sources.0;
|
|
287
|
+
let first_path = sources.first_path();
|
|
288
|
+
|
|
289
|
+
let unified_scan_args =
|
|
290
|
+
scan_options.extract_unified_scan_args(first_path.and_then(|x| x.scheme()))?;
|
|
291
|
+
|
|
292
|
+
let dsl: DslPlan = DslBuilder::scan_lines(sources, unified_scan_args, (&*name).into())
|
|
293
|
+
.map_err(to_rb_err)?
|
|
294
|
+
.build();
|
|
295
|
+
let lf: LazyFrame = dsl.into();
|
|
296
|
+
|
|
297
|
+
Ok(lf.into())
|
|
298
|
+
}
|
|
299
|
+
|
|
275
300
|
pub fn describe_plan(rb: &Ruby, self_: &Self) -> RbResult<String> {
|
|
276
301
|
rb.enter_polars(|| self_.ldf.read().describe_plan())
|
|
277
302
|
}
|
|
@@ -409,7 +434,7 @@ impl RbLazyFrame {
|
|
|
409
434
|
|
|
410
435
|
PolarsResult::Ok(RbCollectBatches {
|
|
411
436
|
inner: Arc::new(Mutex::new(collect_batches)),
|
|
412
|
-
|
|
437
|
+
ldf,
|
|
413
438
|
})
|
|
414
439
|
})
|
|
415
440
|
}
|
|
@@ -573,6 +598,20 @@ impl RbLazyFrame {
|
|
|
573
598
|
.map(Into::into)
|
|
574
599
|
}
|
|
575
600
|
|
|
601
|
+
pub fn sink_batches(
|
|
602
|
+
rb: &Ruby,
|
|
603
|
+
self_: &Self,
|
|
604
|
+
function: Value,
|
|
605
|
+
maintain_order: bool,
|
|
606
|
+
chunk_size: Option<NonZeroUsize>,
|
|
607
|
+
) -> RbResult<RbLazyFrame> {
|
|
608
|
+
let ldf = self_.ldf.read().clone();
|
|
609
|
+
// ensure new_ruby is called with GVL
|
|
610
|
+
let callback = PlanCallback::new_ruby(RubyObject::from(function));
|
|
611
|
+
rb.enter_polars(|| ldf.sink_batches(callback, maintain_order, chunk_size))
|
|
612
|
+
.map(Into::into)
|
|
613
|
+
}
|
|
614
|
+
|
|
576
615
|
pub fn filter(&self, predicate: &RbExpr) -> Self {
|
|
577
616
|
let ldf = self.ldf.read().clone();
|
|
578
617
|
ldf.filter(predicate.inner.clone()).into()
|
|
@@ -786,6 +825,127 @@ impl RbLazyFrame {
|
|
|
786
825
|
Ok(ldf.with_columns_seq(exprs.to_exprs()?).into())
|
|
787
826
|
}
|
|
788
827
|
|
|
828
|
+
pub fn match_to_schema(
|
|
829
|
+
&self,
|
|
830
|
+
schema: Wrap<Schema>,
|
|
831
|
+
missing_columns: Value,
|
|
832
|
+
missing_struct_fields: Value,
|
|
833
|
+
extra_columns: Wrap<ExtraColumnsPolicy>,
|
|
834
|
+
extra_struct_fields: Value,
|
|
835
|
+
integer_cast: Value,
|
|
836
|
+
float_cast: Value,
|
|
837
|
+
) -> RbResult<Self> {
|
|
838
|
+
fn parse_missing_columns(
|
|
839
|
+
schema: &Schema,
|
|
840
|
+
missing_columns: Value,
|
|
841
|
+
) -> RbResult<Vec<MissingColumnsPolicyOrExpr>> {
|
|
842
|
+
let mut out = Vec::with_capacity(schema.len());
|
|
843
|
+
if let Ok(policy) = Wrap::<MissingColumnsPolicyOrExpr>::try_convert(missing_columns) {
|
|
844
|
+
out.extend(std::iter::repeat_n(policy.0, schema.len()));
|
|
845
|
+
} else if let Ok(dict) = RHash::try_convert(missing_columns) {
|
|
846
|
+
out.extend(std::iter::repeat_n(
|
|
847
|
+
MissingColumnsPolicyOrExpr::Raise,
|
|
848
|
+
schema.len(),
|
|
849
|
+
));
|
|
850
|
+
dict.foreach(|key: String, value: Wrap<MissingColumnsPolicyOrExpr>| {
|
|
851
|
+
out[schema.try_index_of(&key).map_err(to_rb_err)?] = value.0;
|
|
852
|
+
Ok(ForEach::Continue)
|
|
853
|
+
})?;
|
|
854
|
+
} else {
|
|
855
|
+
return Err(RbTypeError::new_err("Invalid value for `missing_columns`"));
|
|
856
|
+
}
|
|
857
|
+
Ok(out)
|
|
858
|
+
}
|
|
859
|
+
fn parse_missing_struct_fields(
|
|
860
|
+
schema: &Schema,
|
|
861
|
+
missing_struct_fields: Value,
|
|
862
|
+
) -> RbResult<Vec<MissingColumnsPolicy>> {
|
|
863
|
+
let mut out = Vec::with_capacity(schema.len());
|
|
864
|
+
if let Ok(policy) = Wrap::<MissingColumnsPolicy>::try_convert(missing_struct_fields) {
|
|
865
|
+
out.extend(std::iter::repeat_n(policy.0, schema.len()));
|
|
866
|
+
} else if let Ok(dict) = RHash::try_convert(missing_struct_fields) {
|
|
867
|
+
out.extend(std::iter::repeat_n(
|
|
868
|
+
MissingColumnsPolicy::Raise,
|
|
869
|
+
schema.len(),
|
|
870
|
+
));
|
|
871
|
+
dict.foreach(|key: String, value: Wrap<MissingColumnsPolicy>| {
|
|
872
|
+
out[schema.try_index_of(&key).map_err(to_rb_err)?] = value.0;
|
|
873
|
+
Ok(ForEach::Continue)
|
|
874
|
+
})?;
|
|
875
|
+
} else {
|
|
876
|
+
return Err(RbTypeError::new_err(
|
|
877
|
+
"Invalid value for `missing_struct_fields`",
|
|
878
|
+
));
|
|
879
|
+
}
|
|
880
|
+
Ok(out)
|
|
881
|
+
}
|
|
882
|
+
fn parse_extra_struct_fields(
|
|
883
|
+
schema: &Schema,
|
|
884
|
+
extra_struct_fields: Value,
|
|
885
|
+
) -> RbResult<Vec<ExtraColumnsPolicy>> {
|
|
886
|
+
let mut out = Vec::with_capacity(schema.len());
|
|
887
|
+
if let Ok(policy) = Wrap::<ExtraColumnsPolicy>::try_convert(extra_struct_fields) {
|
|
888
|
+
out.extend(std::iter::repeat_n(policy.0, schema.len()));
|
|
889
|
+
} else if let Ok(dict) = RHash::try_convert(extra_struct_fields) {
|
|
890
|
+
out.extend(std::iter::repeat_n(ExtraColumnsPolicy::Raise, schema.len()));
|
|
891
|
+
dict.foreach(|key: String, value: Wrap<ExtraColumnsPolicy>| {
|
|
892
|
+
out[schema.try_index_of(&key).map_err(to_rb_err)?] = value.0;
|
|
893
|
+
Ok(ForEach::Continue)
|
|
894
|
+
})?;
|
|
895
|
+
} else {
|
|
896
|
+
return Err(RbTypeError::new_err(
|
|
897
|
+
"Invalid value for `extra_struct_fields`",
|
|
898
|
+
));
|
|
899
|
+
}
|
|
900
|
+
Ok(out)
|
|
901
|
+
}
|
|
902
|
+
fn parse_cast(schema: &Schema, cast: Value) -> RbResult<Vec<UpcastOrForbid>> {
|
|
903
|
+
let mut out = Vec::with_capacity(schema.len());
|
|
904
|
+
if let Ok(policy) = Wrap::<UpcastOrForbid>::try_convert(cast) {
|
|
905
|
+
out.extend(std::iter::repeat_n(policy.0, schema.len()));
|
|
906
|
+
} else if let Ok(dict) = RHash::try_convert(cast) {
|
|
907
|
+
out.extend(std::iter::repeat_n(UpcastOrForbid::Forbid, schema.len()));
|
|
908
|
+
dict.foreach(|key: String, value: Wrap<UpcastOrForbid>| {
|
|
909
|
+
out[schema.try_index_of(&key).map_err(to_rb_err)?] = value.0;
|
|
910
|
+
Ok(ForEach::Continue)
|
|
911
|
+
})?;
|
|
912
|
+
} else {
|
|
913
|
+
return Err(RbTypeError::new_err(
|
|
914
|
+
"Invalid value for `integer_cast` / `float_cast`",
|
|
915
|
+
));
|
|
916
|
+
}
|
|
917
|
+
Ok(out)
|
|
918
|
+
}
|
|
919
|
+
|
|
920
|
+
let missing_columns = parse_missing_columns(&schema.0, missing_columns)?;
|
|
921
|
+
let missing_struct_fields = parse_missing_struct_fields(&schema.0, missing_struct_fields)?;
|
|
922
|
+
let extra_struct_fields = parse_extra_struct_fields(&schema.0, extra_struct_fields)?;
|
|
923
|
+
let integer_cast = parse_cast(&schema.0, integer_cast)?;
|
|
924
|
+
let float_cast = parse_cast(&schema.0, float_cast)?;
|
|
925
|
+
|
|
926
|
+
let per_column = (0..schema.0.len())
|
|
927
|
+
.map(|i| MatchToSchemaPerColumn {
|
|
928
|
+
missing_columns: missing_columns[i].clone(),
|
|
929
|
+
missing_struct_fields: missing_struct_fields[i],
|
|
930
|
+
extra_struct_fields: extra_struct_fields[i],
|
|
931
|
+
integer_cast: integer_cast[i],
|
|
932
|
+
float_cast: float_cast[i],
|
|
933
|
+
})
|
|
934
|
+
.collect();
|
|
935
|
+
|
|
936
|
+
let ldf = self.ldf.read().clone();
|
|
937
|
+
Ok(ldf
|
|
938
|
+
.match_to_schema(Arc::new(schema.0), per_column, extra_columns.0)
|
|
939
|
+
.into())
|
|
940
|
+
}
|
|
941
|
+
|
|
942
|
+
pub fn pipe_with_schema(&self, callback: Value) -> Self {
|
|
943
|
+
let ldf = self.ldf.read().clone();
|
|
944
|
+
let function = RubyObject::from(callback);
|
|
945
|
+
ldf.pipe_with_schema(PlanCallback::new_ruby(function))
|
|
946
|
+
.into()
|
|
947
|
+
}
|
|
948
|
+
|
|
789
949
|
pub fn rename(&self, existing: Vec<String>, new: Vec<String>, strict: bool) -> Self {
|
|
790
950
|
let ldf = self.ldf.read().clone();
|
|
791
951
|
ldf.rename(existing, new, strict).into()
|
|
@@ -1118,7 +1278,7 @@ impl RbLazyFrame {
|
|
|
1118
1278
|
#[magnus::wrap(class = "Polars::RbCollectBatches")]
|
|
1119
1279
|
pub struct RbCollectBatches {
|
|
1120
1280
|
inner: Arc<Mutex<CollectBatches>>,
|
|
1121
|
-
|
|
1281
|
+
ldf: LazyFrame,
|
|
1122
1282
|
}
|
|
1123
1283
|
|
|
1124
1284
|
impl RbCollectBatches {
|
|
@@ -1126,4 +1286,55 @@ impl RbCollectBatches {
|
|
|
1126
1286
|
let inner = Arc::clone(&slf.inner);
|
|
1127
1287
|
rb.enter_polars(|| PolarsResult::Ok(inner.lock().next().transpose()?.map(RbDataFrame::new)))
|
|
1128
1288
|
}
|
|
1289
|
+
|
|
1290
|
+
pub fn __arrow_c_stream__(rb: &Ruby, self_: &Self) -> RbResult<Value> {
|
|
1291
|
+
let mut ldf = self_.ldf.clone();
|
|
1292
|
+
let schema = ldf
|
|
1293
|
+
.collect_schema()
|
|
1294
|
+
.map_err(RbPolarsErr::from)?
|
|
1295
|
+
.to_arrow(CompatLevel::newest());
|
|
1296
|
+
|
|
1297
|
+
let dtype = ArrowDataType::Struct(schema.into_iter_values().collect());
|
|
1298
|
+
|
|
1299
|
+
let iter = Box::new(ArrowStreamIterator::new(self_.inner.clone(), dtype.clone()));
|
|
1300
|
+
let field = ArrowField::new(PlSmallStr::EMPTY, dtype, false);
|
|
1301
|
+
let stream = export_iterator(iter, field);
|
|
1302
|
+
Ok(RbArrowArrayStream { stream }.into_value_with(rb))
|
|
1303
|
+
}
|
|
1304
|
+
}
|
|
1305
|
+
|
|
1306
|
+
pub struct ArrowStreamIterator {
|
|
1307
|
+
inner: Arc<Mutex<CollectBatches>>,
|
|
1308
|
+
dtype: ArrowDataType,
|
|
1309
|
+
}
|
|
1310
|
+
|
|
1311
|
+
impl ArrowStreamIterator {
|
|
1312
|
+
fn new(inner: Arc<Mutex<CollectBatches>>, schema: ArrowDataType) -> Self {
|
|
1313
|
+
Self {
|
|
1314
|
+
inner,
|
|
1315
|
+
dtype: schema,
|
|
1316
|
+
}
|
|
1317
|
+
}
|
|
1318
|
+
}
|
|
1319
|
+
|
|
1320
|
+
impl Iterator for ArrowStreamIterator {
|
|
1321
|
+
type Item = PolarsResult<ArrayRef>;
|
|
1322
|
+
|
|
1323
|
+
fn next(&mut self) -> Option<Self::Item> {
|
|
1324
|
+
let next = self.inner.lock().next();
|
|
1325
|
+
match next {
|
|
1326
|
+
None => None,
|
|
1327
|
+
Some(Err(err)) => Some(Err(err)),
|
|
1328
|
+
Some(Ok(df)) => {
|
|
1329
|
+
let height = df.height();
|
|
1330
|
+
let arrays = df.rechunk_into_arrow(CompatLevel::newest());
|
|
1331
|
+
Some(Ok(Box::new(arrow::array::StructArray::new(
|
|
1332
|
+
self.dtype.clone(),
|
|
1333
|
+
height,
|
|
1334
|
+
arrays,
|
|
1335
|
+
None,
|
|
1336
|
+
))))
|
|
1337
|
+
}
|
|
1338
|
+
}
|
|
1339
|
+
}
|
|
1129
1340
|
}
|