polars-df 0.25.0 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +33 -0
- data/Cargo.lock +270 -97
- data/LICENSE.txt +1 -1
- data/README.md +1 -3
- data/ext/polars/Cargo.toml +19 -18
- data/ext/polars/src/catalog/unity.rs +15 -20
- data/ext/polars/src/conversion/any_value.rs +53 -29
- data/ext/polars/src/conversion/chunked_array.rs +58 -56
- data/ext/polars/src/conversion/datetime.rs +58 -7
- data/ext/polars/src/conversion/mod.rs +200 -150
- data/ext/polars/src/dataframe/export.rs +15 -12
- data/ext/polars/src/dataframe/general.rs +25 -7
- data/ext/polars/src/dataframe/map.rs +6 -4
- data/ext/polars/src/error.rs +1 -1
- data/ext/polars/src/expr/array.rs +0 -24
- data/ext/polars/src/expr/datatype.rs +13 -3
- data/ext/polars/src/expr/datetime.rs +4 -4
- data/ext/polars/src/expr/general.rs +35 -15
- data/ext/polars/src/expr/list.rs +0 -26
- data/ext/polars/src/expr/rolling.rs +24 -0
- data/ext/polars/src/functions/business.rs +2 -2
- data/ext/polars/src/functions/io.rs +4 -3
- data/ext/polars/src/functions/lazy.rs +65 -46
- data/ext/polars/src/functions/meta.rs +6 -5
- data/ext/polars/src/functions/mod.rs +0 -1
- data/ext/polars/src/functions/range.rs +13 -0
- data/ext/polars/src/functions/utils.rs +4 -2
- data/ext/polars/src/interop/arrow/mod.rs +4 -2
- data/ext/polars/src/interop/arrow/to_rb.rs +1 -1
- data/ext/polars/src/interop/numo/to_numo_series.rs +26 -25
- data/ext/polars/src/io/scan_options.rs +6 -3
- data/ext/polars/src/io/sink_options.rs +2 -0
- data/ext/polars/src/lazyframe/general.rs +243 -17
- data/ext/polars/src/lazyframe/optflags.rs +2 -1
- data/ext/polars/src/lib.rs +39 -35
- data/ext/polars/src/map/lazy.rs +5 -2
- data/ext/polars/src/map/series.rs +19 -18
- data/ext/polars/src/on_startup.rs +25 -6
- data/ext/polars/src/ruby/numo.rs +3 -4
- data/ext/polars/src/ruby/plan_callback.rs +1 -4
- data/ext/polars/src/ruby/rb_modules.rs +2 -4
- data/ext/polars/src/ruby/ruby_udf.rs +7 -9
- data/ext/polars/src/ruby/utils.rs +12 -1
- data/ext/polars/src/series/aggregation.rs +13 -1
- data/ext/polars/src/series/construction.rs +31 -50
- data/ext/polars/src/series/export.rs +33 -38
- data/ext/polars/src/series/general.rs +6 -6
- data/ext/polars/src/series/map.rs +3 -2
- data/ext/polars/src/series/scatter.rs +4 -4
- data/ext/polars/src/utils.rs +31 -7
- data/lib/polars/array_expr.rb +23 -7
- data/lib/polars/array_name_space.rb +16 -2
- data/lib/polars/binary_name_space.rb +32 -0
- data/lib/polars/collect_batches.rb +4 -0
- data/lib/polars/data_frame.rb +144 -11
- data/lib/polars/data_type_group.rb +5 -0
- data/lib/polars/date_time_expr.rb +91 -3
- data/lib/polars/date_time_name_space.rb +7 -1
- data/lib/polars/expr.rb +247 -44
- data/lib/polars/functions/business.rb +2 -2
- data/lib/polars/functions/datatype.rb +30 -0
- data/lib/polars/functions/eager.rb +80 -7
- data/lib/polars/functions/lazy.rb +97 -2
- data/lib/polars/functions/range/linear_space.rb +118 -0
- data/lib/polars/io/csv.rb +27 -5
- data/lib/polars/io/database.rb +2 -3
- data/lib/polars/io/ipc.rb +2 -2
- data/lib/polars/io/lines.rb +172 -0
- data/lib/polars/io/parquet.rb +1 -1
- data/lib/polars/io/sink_options.rb +5 -2
- data/lib/polars/lazy_frame.rb +517 -14
- data/lib/polars/list_expr.rb +21 -7
- data/lib/polars/list_name_space.rb +16 -2
- data/lib/polars/query_opt_flags.rb +23 -5
- data/lib/polars/selectors.rb +2 -2
- data/lib/polars/series.rb +176 -19
- data/lib/polars/sql_context.rb +2 -2
- data/lib/polars/string_cache.rb +19 -72
- data/lib/polars/string_expr.rb +1 -7
- data/lib/polars/string_name_space.rb +1 -7
- data/lib/polars/utils/construction/series.rb +24 -39
- data/lib/polars/utils/convert.rb +16 -6
- data/lib/polars/utils/parse.rb +7 -0
- data/lib/polars/utils/reduce_balanced.rb +43 -0
- data/lib/polars/utils/various.rb +5 -0
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +2 -1
- metadata +4 -17
- data/ext/polars/src/functions/string_cache.rs +0 -24
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
use magnus::prelude::*;
|
|
2
|
-
use magnus::{RHash, Value};
|
|
2
|
+
use magnus::{RHash, Ruby, Value};
|
|
3
3
|
use polars::prelude::{ArrowDataType, DataType};
|
|
4
4
|
use polars_error::polars_err;
|
|
5
5
|
|
|
6
6
|
use crate::interop::arrow::to_rust::normalize_arrow_fields;
|
|
7
7
|
use crate::prelude::Wrap;
|
|
8
|
+
use crate::ruby::utils::TryIntoValue;
|
|
8
9
|
use crate::series::import_schema_rbcapsule;
|
|
9
10
|
use crate::utils::to_rb_err;
|
|
10
11
|
use crate::{RbResult, RbValueError};
|
|
@@ -16,6 +17,7 @@ pub fn init_polars_schema_from_arrow_c_schema(
|
|
|
16
17
|
polars_schema: RHash,
|
|
17
18
|
schema_object: Value,
|
|
18
19
|
) -> RbResult<()> {
|
|
20
|
+
let ruby = &Ruby::get_with(polars_schema);
|
|
19
21
|
let schema_capsule = schema_object.funcall("arrow_c_schema", ())?;
|
|
20
22
|
|
|
21
23
|
let field = import_schema_rbcapsule(schema_capsule)?;
|
|
@@ -33,7 +35,7 @@ pub fn init_polars_schema_from_arrow_c_schema(
|
|
|
33
35
|
let dtype = DataType::from_arrow_field(&field);
|
|
34
36
|
|
|
35
37
|
let name = field.name.as_str();
|
|
36
|
-
let dtype = Wrap(dtype)
|
|
38
|
+
let dtype = Wrap(dtype).try_into_value_with(ruby)?;
|
|
37
39
|
|
|
38
40
|
if polars_schema.get(name).is_some() {
|
|
39
41
|
return Err(to_rb_err(polars_err!(
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
use magnus::Value;
|
|
1
|
+
use magnus::{Ruby, Value};
|
|
2
2
|
use num_traits::{Float, NumCast};
|
|
3
3
|
use polars_core::prelude::*;
|
|
4
4
|
|
|
@@ -10,34 +10,35 @@ use crate::series::RbSeries;
|
|
|
10
10
|
|
|
11
11
|
impl RbSeries {
|
|
12
12
|
/// Convert this Series to a Numo array.
|
|
13
|
-
pub fn to_numo(&
|
|
14
|
-
series_to_numo(&
|
|
13
|
+
pub fn to_numo(rb: &Ruby, self_: &Self) -> RbResult<Value> {
|
|
14
|
+
series_to_numo(rb, &self_.series.read())
|
|
15
15
|
}
|
|
16
16
|
}
|
|
17
17
|
|
|
18
18
|
/// Convert a Series to a Numo array.
|
|
19
|
-
fn series_to_numo(s: &Series) -> RbResult<Value> {
|
|
20
|
-
series_to_numo_with_copy(s)
|
|
19
|
+
fn series_to_numo(rb: &Ruby, s: &Series) -> RbResult<Value> {
|
|
20
|
+
series_to_numo_with_copy(rb, s)
|
|
21
21
|
}
|
|
22
22
|
|
|
23
23
|
/// Convert a Series to a Numo array, copying data in the process.
|
|
24
|
-
fn series_to_numo_with_copy(s: &Series) -> RbResult<Value> {
|
|
24
|
+
fn series_to_numo_with_copy(rb: &Ruby, s: &Series) -> RbResult<Value> {
|
|
25
25
|
use DataType::*;
|
|
26
26
|
match s.dtype() {
|
|
27
|
-
Int8 =>
|
|
28
|
-
Int16 =>
|
|
29
|
-
Int32 =>
|
|
30
|
-
Int64 =>
|
|
31
|
-
UInt8 =>
|
|
32
|
-
UInt16 =>
|
|
33
|
-
UInt32 =>
|
|
34
|
-
UInt64 =>
|
|
35
|
-
Float32 =>
|
|
36
|
-
Float64 =>
|
|
37
|
-
Boolean => boolean_series_to_numo(s),
|
|
27
|
+
Int8 => numeric_series_to_numo::<Int8Type, f32>(rb, s),
|
|
28
|
+
Int16 => numeric_series_to_numo::<Int16Type, f32>(rb, s),
|
|
29
|
+
Int32 => numeric_series_to_numo::<Int32Type, f64>(rb, s),
|
|
30
|
+
Int64 => numeric_series_to_numo::<Int64Type, f64>(rb, s),
|
|
31
|
+
UInt8 => numeric_series_to_numo::<UInt8Type, f32>(rb, s),
|
|
32
|
+
UInt16 => numeric_series_to_numo::<UInt16Type, f32>(rb, s),
|
|
33
|
+
UInt32 => numeric_series_to_numo::<UInt32Type, f64>(rb, s),
|
|
34
|
+
UInt64 => numeric_series_to_numo::<UInt64Type, f64>(rb, s),
|
|
35
|
+
Float32 => numeric_series_to_numo::<Float32Type, f32>(rb, s),
|
|
36
|
+
Float64 => numeric_series_to_numo::<Float64Type, f64>(rb, s),
|
|
37
|
+
Boolean => boolean_series_to_numo(rb, s),
|
|
38
38
|
String => {
|
|
39
39
|
let ca = s.str().unwrap();
|
|
40
|
-
|
|
40
|
+
let values = ca.iter();
|
|
41
|
+
RbArray1::from_iter(rb, values)
|
|
41
42
|
}
|
|
42
43
|
dt => {
|
|
43
44
|
raise_err!(
|
|
@@ -49,7 +50,7 @@ fn series_to_numo_with_copy(s: &Series) -> RbResult<Value> {
|
|
|
49
50
|
}
|
|
50
51
|
|
|
51
52
|
/// Convert numeric types to f32 or f64 with NaN representing a null value.
|
|
52
|
-
fn
|
|
53
|
+
fn numeric_series_to_numo<T, U>(rb: &Ruby, s: &Series) -> RbResult<Value>
|
|
53
54
|
where
|
|
54
55
|
T: PolarsNumericType,
|
|
55
56
|
T::Native: Element,
|
|
@@ -58,25 +59,25 @@ where
|
|
|
58
59
|
let ca: &ChunkedArray<T> = s.as_ref().as_ref();
|
|
59
60
|
if s.null_count() == 0 {
|
|
60
61
|
let values = ca.into_no_null_iter();
|
|
61
|
-
RbArray1::<T::Native>::from_iter(values)
|
|
62
|
+
RbArray1::<T::Native>::from_iter(rb, values)
|
|
62
63
|
} else {
|
|
63
64
|
let mapper = |opt_v: Option<T::Native>| match opt_v {
|
|
64
65
|
Some(v) => NumCast::from(v).unwrap(),
|
|
65
66
|
None => U::nan(),
|
|
66
67
|
};
|
|
67
68
|
let values = ca.iter().map(mapper);
|
|
68
|
-
RbArray1::from_iter(values)
|
|
69
|
+
RbArray1::from_iter(rb, values)
|
|
69
70
|
}
|
|
70
71
|
}
|
|
71
72
|
|
|
72
73
|
/// Convert booleans to bit if no nulls are present, otherwise convert to objects.
|
|
73
|
-
fn boolean_series_to_numo(s: &Series) -> RbResult<Value> {
|
|
74
|
+
fn boolean_series_to_numo(rb: &Ruby, s: &Series) -> RbResult<Value> {
|
|
74
75
|
let ca = s.bool().unwrap();
|
|
75
76
|
if s.null_count() == 0 {
|
|
76
|
-
let values = ca.
|
|
77
|
-
RbArray1::<bool>::from_iter(values)
|
|
77
|
+
let values = ca.no_null_iter();
|
|
78
|
+
RbArray1::<bool>::from_iter(rb, values)
|
|
78
79
|
} else {
|
|
79
80
|
let values = ca.iter();
|
|
80
|
-
RbArray1::from_iter(values)
|
|
81
|
+
RbArray1::from_iter(rb, values)
|
|
81
82
|
}
|
|
82
83
|
}
|
|
@@ -11,9 +11,9 @@ use polars_io::{HiveOptions, RowIndex};
|
|
|
11
11
|
use polars_utils::IdxSize;
|
|
12
12
|
use polars_utils::slice_enum::Slice;
|
|
13
13
|
|
|
14
|
-
use crate::RbResult;
|
|
15
14
|
use crate::io::cloud_options::OptRbCloudOptions;
|
|
16
15
|
use crate::prelude::Wrap;
|
|
16
|
+
use crate::{RbDataFrame, RbResult};
|
|
17
17
|
|
|
18
18
|
/// Interface to `class ScanOptions` on the Ruby side
|
|
19
19
|
pub struct RbScanOptions(Value);
|
|
@@ -25,8 +25,11 @@ impl TryConvert for RbScanOptions {
|
|
|
25
25
|
}
|
|
26
26
|
|
|
27
27
|
impl TryConvert for Wrap<TableStatistics> {
|
|
28
|
-
fn try_convert(
|
|
29
|
-
|
|
28
|
+
fn try_convert(ob: Value) -> RbResult<Self> {
|
|
29
|
+
let attr: Value = ob.funcall("_df", ())?;
|
|
30
|
+
Ok(Wrap(TableStatistics(Arc::new(
|
|
31
|
+
<&RbDataFrame>::try_convert(attr)?.clone().df.into_inner(),
|
|
32
|
+
))))
|
|
30
33
|
}
|
|
31
34
|
}
|
|
32
35
|
|
|
@@ -26,6 +26,7 @@ impl RbSinkOptions {
|
|
|
26
26
|
let sync_on_close: Option<Wrap<SyncOnCloseType>> = self.0.funcall("sync_on_close", ())?;
|
|
27
27
|
let storage_options: OptRbCloudOptions = self.0.funcall("storage_options", ())?;
|
|
28
28
|
let credential_provider: Option<Value> = self.0.funcall("credential_provider", ())?;
|
|
29
|
+
let sinked_paths_callback: Option<Value> = self.0.funcall("sinked_paths_callback", ())?;
|
|
29
30
|
|
|
30
31
|
let cloud_options =
|
|
31
32
|
storage_options.extract_opt_cloud_options(cloud_scheme, credential_provider)?;
|
|
@@ -37,6 +38,7 @@ impl RbSinkOptions {
|
|
|
37
38
|
maintain_order,
|
|
38
39
|
sync_on_close,
|
|
39
40
|
cloud_options: cloud_options.map(Arc::new),
|
|
41
|
+
sinked_paths_callback: sinked_paths_callback.map(|_| todo!()),
|
|
40
42
|
};
|
|
41
43
|
|
|
42
44
|
Ok(unified_sink_args)
|
|
@@ -1,10 +1,13 @@
|
|
|
1
|
+
use arrow::ffi::export_iterator;
|
|
1
2
|
use magnus::{
|
|
2
3
|
IntoValue, RArray, RHash, Ruby, TryConvert, Value, r_hash::ForEach, value::ReprValue,
|
|
3
4
|
};
|
|
4
5
|
use parking_lot::Mutex;
|
|
6
|
+
use polars::frame::PivotColumnNaming;
|
|
5
7
|
use polars::io::RowIndex;
|
|
6
8
|
use polars::lazy::frame::LazyFrame;
|
|
7
9
|
use polars::prelude::*;
|
|
10
|
+
use polars_core::query_result::QueryResult;
|
|
8
11
|
use polars_plan::dsl::ScanSources;
|
|
9
12
|
use polars_plan::plans::{HintIR, Sorted};
|
|
10
13
|
use std::num::NonZeroUsize;
|
|
@@ -19,8 +22,14 @@ use crate::io::sink_options::RbSinkOptions;
|
|
|
19
22
|
use crate::io::sink_output::RbFileSinkDestination;
|
|
20
23
|
use crate::ruby::gvl::GvlExt;
|
|
21
24
|
use crate::ruby::lazy::RubyUdfLazyFrameExt;
|
|
22
|
-
use crate::
|
|
23
|
-
use crate::
|
|
25
|
+
use crate::ruby::plan_callback::PlanCallbackExt;
|
|
26
|
+
use crate::ruby::ruby_function::RubyObject;
|
|
27
|
+
use crate::ruby::utils::TryIntoValue;
|
|
28
|
+
use crate::utils::{EnterPolarsExt, to_rb_err};
|
|
29
|
+
use crate::{
|
|
30
|
+
RbArrowArrayStream, RbDataFrame, RbExpr, RbLazyGroupBy, RbPolarsErr, RbResult, RbTypeError,
|
|
31
|
+
RbValueError,
|
|
32
|
+
};
|
|
24
33
|
|
|
25
34
|
fn rbobject_to_first_path_and_scan_sources(
|
|
26
35
|
obj: Value,
|
|
@@ -128,6 +137,7 @@ impl RbLazyFrame {
|
|
|
128
137
|
let cloud_options = OptRbCloudOptions::try_convert(arguments[28])?;
|
|
129
138
|
let credential_provider = Option::<Value>::try_convert(arguments[29])?;
|
|
130
139
|
let include_file_paths = Option::<String>::try_convert(arguments[30])?;
|
|
140
|
+
let missing_columns = Option::<Wrap<MissingColumnsPolicy>>::try_convert(arguments[31])?;
|
|
131
141
|
// end arguments
|
|
132
142
|
|
|
133
143
|
let null_values = null_values.map(|w| w.0);
|
|
@@ -189,7 +199,8 @@ impl RbLazyFrame {
|
|
|
189
199
|
.with_decimal_comma(decimal_comma)
|
|
190
200
|
.with_glob(glob)
|
|
191
201
|
.with_raise_if_empty(raise_if_empty)
|
|
192
|
-
.with_include_file_paths(include_file_paths.map(|x| x.into()))
|
|
202
|
+
.with_include_file_paths(include_file_paths.map(|x| x.into()))
|
|
203
|
+
.with_missing_columns_policy(missing_columns.map(|x| x.0));
|
|
193
204
|
|
|
194
205
|
if let Some(lambda) = with_schema_modify {
|
|
195
206
|
let f = |schema: Schema| {
|
|
@@ -272,6 +283,25 @@ impl RbLazyFrame {
|
|
|
272
283
|
Ok(lf.into())
|
|
273
284
|
}
|
|
274
285
|
|
|
286
|
+
pub fn new_from_scan_lines(
|
|
287
|
+
sources: Wrap<ScanSources>,
|
|
288
|
+
scan_options: RbScanOptions,
|
|
289
|
+
name: String,
|
|
290
|
+
) -> RbResult<Self> {
|
|
291
|
+
let sources = sources.0;
|
|
292
|
+
let first_path = sources.first_path();
|
|
293
|
+
|
|
294
|
+
let unified_scan_args =
|
|
295
|
+
scan_options.extract_unified_scan_args(first_path.and_then(|x| x.scheme()))?;
|
|
296
|
+
|
|
297
|
+
let dsl: DslPlan = DslBuilder::scan_lines(sources, unified_scan_args, (&*name).into())
|
|
298
|
+
.map_err(to_rb_err)?
|
|
299
|
+
.build();
|
|
300
|
+
let lf: LazyFrame = dsl.into();
|
|
301
|
+
|
|
302
|
+
Ok(lf.into())
|
|
303
|
+
}
|
|
304
|
+
|
|
275
305
|
pub fn describe_plan(rb: &Ruby, self_: &Self) -> RbResult<String> {
|
|
276
306
|
rb.enter_polars(|| self_.ldf.read().describe_plan())
|
|
277
307
|
}
|
|
@@ -388,7 +418,11 @@ impl RbLazyFrame {
|
|
|
388
418
|
pub fn collect(rb: &Ruby, self_: &Self, engine: Wrap<Engine>) -> RbResult<RbDataFrame> {
|
|
389
419
|
rb.enter_polars_df(|| {
|
|
390
420
|
let ldf = self_.ldf.read().clone();
|
|
391
|
-
ldf.collect_with_engine(engine.0)
|
|
421
|
+
ldf.collect_with_engine(engine.0).map(|r| match r {
|
|
422
|
+
QueryResult::Single(df) => df,
|
|
423
|
+
// TODO: Should return query results
|
|
424
|
+
QueryResult::Multiple(_) => DataFrame::empty(),
|
|
425
|
+
})
|
|
392
426
|
})
|
|
393
427
|
}
|
|
394
428
|
|
|
@@ -409,7 +443,7 @@ impl RbLazyFrame {
|
|
|
409
443
|
|
|
410
444
|
PolarsResult::Ok(RbCollectBatches {
|
|
411
445
|
inner: Arc::new(Mutex::new(collect_batches)),
|
|
412
|
-
|
|
446
|
+
ldf,
|
|
413
447
|
})
|
|
414
448
|
})
|
|
415
449
|
}
|
|
@@ -573,6 +607,20 @@ impl RbLazyFrame {
|
|
|
573
607
|
.map(Into::into)
|
|
574
608
|
}
|
|
575
609
|
|
|
610
|
+
pub fn sink_batches(
|
|
611
|
+
rb: &Ruby,
|
|
612
|
+
self_: &Self,
|
|
613
|
+
function: Value,
|
|
614
|
+
maintain_order: bool,
|
|
615
|
+
chunk_size: Option<NonZeroUsize>,
|
|
616
|
+
) -> RbResult<RbLazyFrame> {
|
|
617
|
+
let ldf = self_.ldf.read().clone();
|
|
618
|
+
// ensure new_ruby is called with GVL
|
|
619
|
+
let callback = PlanCallback::new_ruby(RubyObject::from(function));
|
|
620
|
+
rb.enter_polars(|| ldf.sink_batches(callback, maintain_order, chunk_size))
|
|
621
|
+
.map(Into::into)
|
|
622
|
+
}
|
|
623
|
+
|
|
576
624
|
pub fn filter(&self, predicate: &RbExpr) -> Self {
|
|
577
625
|
let ldf = self.ldf.read().clone();
|
|
578
626
|
ldf.filter(predicate.inner.clone()).into()
|
|
@@ -771,6 +819,12 @@ impl RbLazyFrame {
|
|
|
771
819
|
.into())
|
|
772
820
|
}
|
|
773
821
|
|
|
822
|
+
pub fn gather(&self, idxs: &Self, null_on_oob: bool) -> Self {
|
|
823
|
+
let ldf = self.ldf.read().clone();
|
|
824
|
+
let idxs = idxs.clone().ldf.into_inner();
|
|
825
|
+
ldf.gather(idxs, null_on_oob).into()
|
|
826
|
+
}
|
|
827
|
+
|
|
774
828
|
pub fn with_column(&self, expr: &RbExpr) -> Self {
|
|
775
829
|
let ldf = self.ldf.read().clone();
|
|
776
830
|
ldf.with_column(expr.inner.clone()).into()
|
|
@@ -786,6 +840,127 @@ impl RbLazyFrame {
|
|
|
786
840
|
Ok(ldf.with_columns_seq(exprs.to_exprs()?).into())
|
|
787
841
|
}
|
|
788
842
|
|
|
843
|
+
pub fn match_to_schema(
|
|
844
|
+
&self,
|
|
845
|
+
schema: Wrap<Schema>,
|
|
846
|
+
missing_columns: Value,
|
|
847
|
+
missing_struct_fields: Value,
|
|
848
|
+
extra_columns: Wrap<ExtraColumnsPolicy>,
|
|
849
|
+
extra_struct_fields: Value,
|
|
850
|
+
integer_cast: Value,
|
|
851
|
+
float_cast: Value,
|
|
852
|
+
) -> RbResult<Self> {
|
|
853
|
+
fn parse_missing_columns(
|
|
854
|
+
schema: &Schema,
|
|
855
|
+
missing_columns: Value,
|
|
856
|
+
) -> RbResult<Vec<MissingColumnsPolicyOrExpr>> {
|
|
857
|
+
let mut out = Vec::with_capacity(schema.len());
|
|
858
|
+
if let Ok(policy) = Wrap::<MissingColumnsPolicyOrExpr>::try_convert(missing_columns) {
|
|
859
|
+
out.extend(std::iter::repeat_n(policy.0, schema.len()));
|
|
860
|
+
} else if let Ok(dict) = RHash::try_convert(missing_columns) {
|
|
861
|
+
out.extend(std::iter::repeat_n(
|
|
862
|
+
MissingColumnsPolicyOrExpr::Raise,
|
|
863
|
+
schema.len(),
|
|
864
|
+
));
|
|
865
|
+
dict.foreach(|key: String, value: Wrap<MissingColumnsPolicyOrExpr>| {
|
|
866
|
+
out[schema.try_index_of(&key).map_err(to_rb_err)?] = value.0;
|
|
867
|
+
Ok(ForEach::Continue)
|
|
868
|
+
})?;
|
|
869
|
+
} else {
|
|
870
|
+
return Err(RbTypeError::new_err("Invalid value for `missing_columns`"));
|
|
871
|
+
}
|
|
872
|
+
Ok(out)
|
|
873
|
+
}
|
|
874
|
+
fn parse_missing_struct_fields(
|
|
875
|
+
schema: &Schema,
|
|
876
|
+
missing_struct_fields: Value,
|
|
877
|
+
) -> RbResult<Vec<MissingColumnsPolicy>> {
|
|
878
|
+
let mut out = Vec::with_capacity(schema.len());
|
|
879
|
+
if let Ok(policy) = Wrap::<MissingColumnsPolicy>::try_convert(missing_struct_fields) {
|
|
880
|
+
out.extend(std::iter::repeat_n(policy.0, schema.len()));
|
|
881
|
+
} else if let Ok(dict) = RHash::try_convert(missing_struct_fields) {
|
|
882
|
+
out.extend(std::iter::repeat_n(
|
|
883
|
+
MissingColumnsPolicy::Raise,
|
|
884
|
+
schema.len(),
|
|
885
|
+
));
|
|
886
|
+
dict.foreach(|key: String, value: Wrap<MissingColumnsPolicy>| {
|
|
887
|
+
out[schema.try_index_of(&key).map_err(to_rb_err)?] = value.0;
|
|
888
|
+
Ok(ForEach::Continue)
|
|
889
|
+
})?;
|
|
890
|
+
} else {
|
|
891
|
+
return Err(RbTypeError::new_err(
|
|
892
|
+
"Invalid value for `missing_struct_fields`",
|
|
893
|
+
));
|
|
894
|
+
}
|
|
895
|
+
Ok(out)
|
|
896
|
+
}
|
|
897
|
+
fn parse_extra_struct_fields(
|
|
898
|
+
schema: &Schema,
|
|
899
|
+
extra_struct_fields: Value,
|
|
900
|
+
) -> RbResult<Vec<ExtraColumnsPolicy>> {
|
|
901
|
+
let mut out = Vec::with_capacity(schema.len());
|
|
902
|
+
if let Ok(policy) = Wrap::<ExtraColumnsPolicy>::try_convert(extra_struct_fields) {
|
|
903
|
+
out.extend(std::iter::repeat_n(policy.0, schema.len()));
|
|
904
|
+
} else if let Ok(dict) = RHash::try_convert(extra_struct_fields) {
|
|
905
|
+
out.extend(std::iter::repeat_n(ExtraColumnsPolicy::Raise, schema.len()));
|
|
906
|
+
dict.foreach(|key: String, value: Wrap<ExtraColumnsPolicy>| {
|
|
907
|
+
out[schema.try_index_of(&key).map_err(to_rb_err)?] = value.0;
|
|
908
|
+
Ok(ForEach::Continue)
|
|
909
|
+
})?;
|
|
910
|
+
} else {
|
|
911
|
+
return Err(RbTypeError::new_err(
|
|
912
|
+
"Invalid value for `extra_struct_fields`",
|
|
913
|
+
));
|
|
914
|
+
}
|
|
915
|
+
Ok(out)
|
|
916
|
+
}
|
|
917
|
+
fn parse_cast(schema: &Schema, cast: Value) -> RbResult<Vec<UpcastOrForbid>> {
|
|
918
|
+
let mut out = Vec::with_capacity(schema.len());
|
|
919
|
+
if let Ok(policy) = Wrap::<UpcastOrForbid>::try_convert(cast) {
|
|
920
|
+
out.extend(std::iter::repeat_n(policy.0, schema.len()));
|
|
921
|
+
} else if let Ok(dict) = RHash::try_convert(cast) {
|
|
922
|
+
out.extend(std::iter::repeat_n(UpcastOrForbid::Forbid, schema.len()));
|
|
923
|
+
dict.foreach(|key: String, value: Wrap<UpcastOrForbid>| {
|
|
924
|
+
out[schema.try_index_of(&key).map_err(to_rb_err)?] = value.0;
|
|
925
|
+
Ok(ForEach::Continue)
|
|
926
|
+
})?;
|
|
927
|
+
} else {
|
|
928
|
+
return Err(RbTypeError::new_err(
|
|
929
|
+
"Invalid value for `integer_cast` / `float_cast`",
|
|
930
|
+
));
|
|
931
|
+
}
|
|
932
|
+
Ok(out)
|
|
933
|
+
}
|
|
934
|
+
|
|
935
|
+
let missing_columns = parse_missing_columns(&schema.0, missing_columns)?;
|
|
936
|
+
let missing_struct_fields = parse_missing_struct_fields(&schema.0, missing_struct_fields)?;
|
|
937
|
+
let extra_struct_fields = parse_extra_struct_fields(&schema.0, extra_struct_fields)?;
|
|
938
|
+
let integer_cast = parse_cast(&schema.0, integer_cast)?;
|
|
939
|
+
let float_cast = parse_cast(&schema.0, float_cast)?;
|
|
940
|
+
|
|
941
|
+
let per_column = (0..schema.0.len())
|
|
942
|
+
.map(|i| MatchToSchemaPerColumn {
|
|
943
|
+
missing_columns: missing_columns[i].clone(),
|
|
944
|
+
missing_struct_fields: missing_struct_fields[i],
|
|
945
|
+
extra_struct_fields: extra_struct_fields[i],
|
|
946
|
+
integer_cast: integer_cast[i],
|
|
947
|
+
float_cast: float_cast[i],
|
|
948
|
+
})
|
|
949
|
+
.collect();
|
|
950
|
+
|
|
951
|
+
let ldf = self.ldf.read().clone();
|
|
952
|
+
Ok(ldf
|
|
953
|
+
.match_to_schema(Arc::new(schema.0), per_column, extra_columns.0)
|
|
954
|
+
.into())
|
|
955
|
+
}
|
|
956
|
+
|
|
957
|
+
pub fn pipe_with_schema(&self, callback: Value) -> Self {
|
|
958
|
+
let ldf = self.ldf.read().clone();
|
|
959
|
+
let function = RubyObject::from(callback);
|
|
960
|
+
ldf.pipe_with_schema(PlanCallback::new_ruby(function))
|
|
961
|
+
.into()
|
|
962
|
+
}
|
|
963
|
+
|
|
789
964
|
pub fn rename(&self, existing: Vec<String>, new: Vec<String>, strict: bool) -> Self {
|
|
790
965
|
let ldf = self.ldf.read().clone();
|
|
791
966
|
ldf.rename(existing, new, strict).into()
|
|
@@ -930,6 +1105,7 @@ impl RbLazyFrame {
|
|
|
930
1105
|
agg: &RbExpr,
|
|
931
1106
|
maintain_order: bool,
|
|
932
1107
|
separator: String,
|
|
1108
|
+
column_naming: Wrap<PivotColumnNaming>,
|
|
933
1109
|
) -> Self {
|
|
934
1110
|
let ldf = self.ldf.read().clone();
|
|
935
1111
|
ldf.pivot(
|
|
@@ -940,6 +1116,7 @@ impl RbLazyFrame {
|
|
|
940
1116
|
agg.inner.clone(),
|
|
941
1117
|
maintain_order,
|
|
942
1118
|
separator.into(),
|
|
1119
|
+
column_naming.0,
|
|
943
1120
|
)
|
|
944
1121
|
.into()
|
|
945
1122
|
}
|
|
@@ -981,7 +1158,7 @@ impl RbLazyFrame {
|
|
|
981
1158
|
opt.set(OptFlags::PREDICATE_PUSHDOWN, predicate_pushdown);
|
|
982
1159
|
opt.set(OptFlags::PROJECTION_PUSHDOWN, projection_pushdown);
|
|
983
1160
|
opt.set(OptFlags::SLICE_PUSHDOWN, slice_pushdown);
|
|
984
|
-
opt.set(OptFlags::
|
|
1161
|
+
opt.set(OptFlags::STREAMING, streamable);
|
|
985
1162
|
|
|
986
1163
|
self.ldf
|
|
987
1164
|
.read()
|
|
@@ -1022,14 +1199,12 @@ impl RbLazyFrame {
|
|
|
1022
1199
|
let schema = rb.enter_polars(|| self_.ldf.write().collect_schema())?;
|
|
1023
1200
|
|
|
1024
1201
|
let schema_dict = rb.hash_new();
|
|
1025
|
-
schema.iter_fields()
|
|
1026
|
-
schema_dict
|
|
1027
|
-
.
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
.unwrap();
|
|
1032
|
-
});
|
|
1202
|
+
for fld in schema.iter_fields() {
|
|
1203
|
+
schema_dict.aset(
|
|
1204
|
+
fld.name().to_string(),
|
|
1205
|
+
Wrap(fld.dtype().clone()).try_into_value_with(rb)?,
|
|
1206
|
+
)?;
|
|
1207
|
+
}
|
|
1033
1208
|
Ok(schema_dict)
|
|
1034
1209
|
}
|
|
1035
1210
|
|
|
@@ -1049,12 +1224,12 @@ impl RbLazyFrame {
|
|
|
1049
1224
|
ldf.count().into()
|
|
1050
1225
|
}
|
|
1051
1226
|
|
|
1052
|
-
pub fn merge_sorted(&self, other: &Self, key: String) -> RbResult<Self> {
|
|
1227
|
+
pub fn merge_sorted(&self, other: &Self, key: String, maintain_order: bool) -> RbResult<Self> {
|
|
1053
1228
|
let out = self
|
|
1054
1229
|
.ldf
|
|
1055
1230
|
.read()
|
|
1056
1231
|
.clone()
|
|
1057
|
-
.merge_sorted(other.ldf.read().clone(), &key)
|
|
1232
|
+
.merge_sorted(other.ldf.read().clone(), &key, maintain_order)
|
|
1058
1233
|
.map_err(RbPolarsErr::from)?;
|
|
1059
1234
|
Ok(out.into())
|
|
1060
1235
|
}
|
|
@@ -1118,7 +1293,7 @@ impl RbLazyFrame {
|
|
|
1118
1293
|
#[magnus::wrap(class = "Polars::RbCollectBatches")]
|
|
1119
1294
|
pub struct RbCollectBatches {
|
|
1120
1295
|
inner: Arc<Mutex<CollectBatches>>,
|
|
1121
|
-
|
|
1296
|
+
ldf: LazyFrame,
|
|
1122
1297
|
}
|
|
1123
1298
|
|
|
1124
1299
|
impl RbCollectBatches {
|
|
@@ -1126,4 +1301,55 @@ impl RbCollectBatches {
|
|
|
1126
1301
|
let inner = Arc::clone(&slf.inner);
|
|
1127
1302
|
rb.enter_polars(|| PolarsResult::Ok(inner.lock().next().transpose()?.map(RbDataFrame::new)))
|
|
1128
1303
|
}
|
|
1304
|
+
|
|
1305
|
+
pub fn __arrow_c_stream__(rb: &Ruby, self_: &Self) -> RbResult<Value> {
|
|
1306
|
+
let mut ldf = self_.ldf.clone();
|
|
1307
|
+
let schema = ldf
|
|
1308
|
+
.collect_schema()
|
|
1309
|
+
.map_err(RbPolarsErr::from)?
|
|
1310
|
+
.to_arrow(CompatLevel::newest());
|
|
1311
|
+
|
|
1312
|
+
let dtype = ArrowDataType::Struct(schema.into_iter_values().collect());
|
|
1313
|
+
|
|
1314
|
+
let iter = Box::new(ArrowStreamIterator::new(self_.inner.clone(), dtype.clone()));
|
|
1315
|
+
let field = ArrowField::new(PlSmallStr::EMPTY, dtype, false);
|
|
1316
|
+
let stream = export_iterator(iter, field);
|
|
1317
|
+
Ok(RbArrowArrayStream { stream }.into_value_with(rb))
|
|
1318
|
+
}
|
|
1319
|
+
}
|
|
1320
|
+
|
|
1321
|
+
pub struct ArrowStreamIterator {
|
|
1322
|
+
inner: Arc<Mutex<CollectBatches>>,
|
|
1323
|
+
dtype: ArrowDataType,
|
|
1324
|
+
}
|
|
1325
|
+
|
|
1326
|
+
impl ArrowStreamIterator {
|
|
1327
|
+
fn new(inner: Arc<Mutex<CollectBatches>>, schema: ArrowDataType) -> Self {
|
|
1328
|
+
Self {
|
|
1329
|
+
inner,
|
|
1330
|
+
dtype: schema,
|
|
1331
|
+
}
|
|
1332
|
+
}
|
|
1333
|
+
}
|
|
1334
|
+
|
|
1335
|
+
impl Iterator for ArrowStreamIterator {
|
|
1336
|
+
type Item = PolarsResult<ArrayRef>;
|
|
1337
|
+
|
|
1338
|
+
fn next(&mut self) -> Option<Self::Item> {
|
|
1339
|
+
let next = self.inner.lock().next();
|
|
1340
|
+
match next {
|
|
1341
|
+
None => None,
|
|
1342
|
+
Some(Err(err)) => Some(Err(err)),
|
|
1343
|
+
Some(Ok(df)) => {
|
|
1344
|
+
let height = df.height();
|
|
1345
|
+
let arrays = df.rechunk_into_arrow(CompatLevel::newest());
|
|
1346
|
+
Some(Ok(Box::new(arrow::array::StructArray::new(
|
|
1347
|
+
self.dtype.clone(),
|
|
1348
|
+
height,
|
|
1349
|
+
arrays,
|
|
1350
|
+
None,
|
|
1351
|
+
))))
|
|
1352
|
+
}
|
|
1353
|
+
}
|
|
1354
|
+
}
|
|
1129
1355
|
}
|
|
@@ -52,7 +52,8 @@ flag_getter_setters! {
|
|
|
52
52
|
(COMM_SUBEXPR_ELIM, get_comm_subexpr_elim, set_comm_subexpr_elim, clear=true)
|
|
53
53
|
(CHECK_ORDER_OBSERVE, get_check_order_observe, set_check_order_observe, clear=true)
|
|
54
54
|
(FAST_PROJECTION, get_fast_projection, set_fast_projection, clear=true)
|
|
55
|
+
(SORT_COLLAPSE, get_sort_collapse, set_sort_collapse, clear=true)
|
|
55
56
|
|
|
56
57
|
(EAGER, get_eager, set_eager, clear=true)
|
|
57
|
-
(
|
|
58
|
+
(STREAMING, get_streaming, set_streaming, clear=true)
|
|
58
59
|
}
|