polars-df 0.21.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -0
- data/Cargo.lock +55 -48
- data/Cargo.toml +3 -0
- data/README.md +12 -0
- data/ext/polars/Cargo.toml +22 -11
- data/ext/polars/src/batched_csv.rs +4 -4
- data/ext/polars/src/catalog/unity.rs +96 -94
- data/ext/polars/src/conversion/any_value.rs +26 -30
- data/ext/polars/src/conversion/chunked_array.rs +32 -28
- data/ext/polars/src/conversion/datetime.rs +11 -0
- data/ext/polars/src/conversion/mod.rs +307 -34
- data/ext/polars/src/dataframe/construction.rs +4 -3
- data/ext/polars/src/dataframe/export.rs +17 -15
- data/ext/polars/src/dataframe/general.rs +15 -12
- data/ext/polars/src/dataframe/io.rs +1 -2
- data/ext/polars/src/dataframe/mod.rs +25 -1
- data/ext/polars/src/dataframe/serde.rs +23 -8
- data/ext/polars/src/exceptions.rs +8 -4
- data/ext/polars/src/expr/array.rs +73 -4
- data/ext/polars/src/expr/binary.rs +26 -1
- data/ext/polars/src/expr/bitwise.rs +39 -0
- data/ext/polars/src/expr/categorical.rs +20 -0
- data/ext/polars/src/expr/datatype.rs +24 -1
- data/ext/polars/src/expr/datetime.rs +58 -14
- data/ext/polars/src/expr/general.rs +87 -15
- data/ext/polars/src/expr/list.rs +32 -24
- data/ext/polars/src/expr/meta.rs +15 -6
- data/ext/polars/src/expr/mod.rs +3 -0
- data/ext/polars/src/expr/name.rs +19 -14
- data/ext/polars/src/expr/rolling.rs +20 -0
- data/ext/polars/src/expr/serde.rs +28 -0
- data/ext/polars/src/expr/string.rs +64 -10
- data/ext/polars/src/expr/struct.rs +9 -1
- data/ext/polars/src/file.rs +15 -9
- data/ext/polars/src/functions/business.rs +0 -1
- data/ext/polars/src/functions/io.rs +25 -3
- data/ext/polars/src/functions/lazy.rs +11 -6
- data/ext/polars/src/functions/meta.rs +3 -3
- data/ext/polars/src/functions/string_cache.rs +3 -3
- data/ext/polars/src/interop/arrow/to_ruby.rs +3 -3
- data/ext/polars/src/interop/numo/numo_rs.rs +4 -3
- data/ext/polars/src/io/mod.rs +6 -0
- data/ext/polars/src/lazyframe/general.rs +59 -9
- data/ext/polars/src/lazyframe/mod.rs +16 -1
- data/ext/polars/src/lazyframe/optflags.rs +58 -0
- data/ext/polars/src/lazyframe/serde.rs +27 -3
- data/ext/polars/src/lib.rs +261 -19
- data/ext/polars/src/map/dataframe.rs +20 -17
- data/ext/polars/src/map/lazy.rs +6 -5
- data/ext/polars/src/map/series.rs +8 -7
- data/ext/polars/src/on_startup.rs +12 -5
- data/ext/polars/src/rb_modules.rs +2 -2
- data/ext/polars/src/series/aggregation.rs +85 -28
- data/ext/polars/src/series/construction.rs +1 -0
- data/ext/polars/src/series/export.rs +37 -33
- data/ext/polars/src/series/general.rs +120 -21
- data/ext/polars/src/series/mod.rs +29 -4
- data/lib/polars/array_expr.rb +382 -3
- data/lib/polars/array_name_space.rb +281 -0
- data/lib/polars/binary_expr.rb +67 -0
- data/lib/polars/binary_name_space.rb +43 -0
- data/lib/polars/cat_expr.rb +224 -0
- data/lib/polars/cat_name_space.rb +138 -0
- data/lib/polars/config.rb +2 -2
- data/lib/polars/convert.rb +6 -6
- data/lib/polars/data_frame.rb +794 -27
- data/lib/polars/data_type_expr.rb +52 -0
- data/lib/polars/data_types.rb +26 -5
- data/lib/polars/date_time_expr.rb +252 -1
- data/lib/polars/date_time_name_space.rb +299 -0
- data/lib/polars/expr.rb +1248 -206
- data/lib/polars/functions/business.rb +95 -0
- data/lib/polars/functions/datatype.rb +21 -0
- data/lib/polars/functions/lazy.rb +14 -1
- data/lib/polars/io/csv.rb +1 -1
- data/lib/polars/io/iceberg.rb +27 -0
- data/lib/polars/io/json.rb +4 -4
- data/lib/polars/io/ndjson.rb +4 -4
- data/lib/polars/io/parquet.rb +32 -7
- data/lib/polars/io/scan_options.rb +4 -1
- data/lib/polars/lazy_frame.rb +1028 -28
- data/lib/polars/list_expr.rb +217 -17
- data/lib/polars/list_name_space.rb +231 -22
- data/lib/polars/meta_expr.rb +89 -0
- data/lib/polars/name_expr.rb +36 -0
- data/lib/polars/query_opt_flags.rb +50 -0
- data/lib/polars/scan_cast_options.rb +20 -1
- data/lib/polars/schema.rb +79 -3
- data/lib/polars/selector.rb +72 -0
- data/lib/polars/selectors.rb +3 -3
- data/lib/polars/series.rb +1053 -54
- data/lib/polars/string_expr.rb +436 -32
- data/lib/polars/string_name_space.rb +736 -50
- data/lib/polars/struct_expr.rb +103 -0
- data/lib/polars/struct_name_space.rb +19 -1
- data/lib/polars/utils/serde.rb +17 -0
- data/lib/polars/utils/various.rb +22 -1
- data/lib/polars/utils.rb +5 -1
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +6 -0
- metadata +11 -1
@@ -1,4 +1,4 @@
|
|
1
|
-
use magnus::{IntoValue, RArray, RHash, TryConvert, Value, r_hash::ForEach, typed_data::Obj};
|
1
|
+
use magnus::{IntoValue, RArray, RHash, Ruby, TryConvert, Value, r_hash::ForEach, typed_data::Obj};
|
2
2
|
use polars::io::{HiveOptions, RowIndex};
|
3
3
|
use polars::lazy::frame::LazyFrame;
|
4
4
|
use polars::prelude::*;
|
@@ -30,13 +30,12 @@ impl RbLazyFrame {
|
|
30
30
|
source: Option<Value>,
|
31
31
|
sources: Wrap<ScanSources>,
|
32
32
|
infer_schema_length: Option<usize>,
|
33
|
-
batch_size: Option<
|
33
|
+
batch_size: Option<NonZeroUsize>,
|
34
34
|
n_rows: Option<usize>,
|
35
35
|
low_memory: bool,
|
36
36
|
rechunk: bool,
|
37
37
|
row_index: Option<(String, IdxSize)>,
|
38
38
|
) -> RbResult<Self> {
|
39
|
-
let batch_size = batch_size.map(|v| v.0);
|
40
39
|
let row_index = row_index.map(|(name, offset)| RowIndex {
|
41
40
|
name: name.into(),
|
42
41
|
offset,
|
@@ -325,6 +324,30 @@ impl RbLazyFrame {
|
|
325
324
|
.into())
|
326
325
|
}
|
327
326
|
|
327
|
+
pub fn top_k(&self, k: IdxSize, by: RArray, reverse: Vec<bool>) -> RbResult<Self> {
|
328
|
+
let ldf = self.ldf.borrow().clone();
|
329
|
+
let exprs = rb_exprs_to_exprs(by)?;
|
330
|
+
Ok(ldf
|
331
|
+
.top_k(
|
332
|
+
k,
|
333
|
+
exprs,
|
334
|
+
SortMultipleOptions::new().with_order_descending_multi(reverse),
|
335
|
+
)
|
336
|
+
.into())
|
337
|
+
}
|
338
|
+
|
339
|
+
pub fn bottom_k(&self, k: IdxSize, by: RArray, reverse: Vec<bool>) -> RbResult<Self> {
|
340
|
+
let ldf = self.ldf.borrow().clone();
|
341
|
+
let exprs = rb_exprs_to_exprs(by)?;
|
342
|
+
Ok(ldf
|
343
|
+
.bottom_k(
|
344
|
+
k,
|
345
|
+
exprs,
|
346
|
+
SortMultipleOptions::new().with_order_descending_multi(reverse),
|
347
|
+
)
|
348
|
+
.into())
|
349
|
+
}
|
350
|
+
|
328
351
|
pub fn cache(&self) -> Self {
|
329
352
|
let ldf = self.ldf.borrow().clone();
|
330
353
|
ldf.cache().into()
|
@@ -420,7 +443,7 @@ impl RbLazyFrame {
|
|
420
443
|
let separator = u8::try_convert(arguments[3])?;
|
421
444
|
let line_terminator = String::try_convert(arguments[4])?;
|
422
445
|
let quote_char = u8::try_convert(arguments[5])?;
|
423
|
-
let batch_size =
|
446
|
+
let batch_size = NonZeroUsize::try_convert(arguments[6])?;
|
424
447
|
let datetime_format = Option::<String>::try_convert(arguments[7])?;
|
425
448
|
let date_format = Option::<String>::try_convert(arguments[8])?;
|
426
449
|
let time_format = Option::<String>::try_convert(arguments[9])?;
|
@@ -453,7 +476,7 @@ impl RbLazyFrame {
|
|
453
476
|
let options = CsvWriterOptions {
|
454
477
|
include_bom,
|
455
478
|
include_header,
|
456
|
-
batch_size
|
479
|
+
batch_size,
|
457
480
|
serialize_options,
|
458
481
|
};
|
459
482
|
|
@@ -509,6 +532,11 @@ impl RbLazyFrame {
|
|
509
532
|
ldf.filter(predicate.inner.clone()).into()
|
510
533
|
}
|
511
534
|
|
535
|
+
pub fn remove(&self, predicate: &RbExpr) -> Self {
|
536
|
+
let ldf = self.ldf.borrow().clone();
|
537
|
+
ldf.remove(predicate.inner.clone()).into()
|
538
|
+
}
|
539
|
+
|
512
540
|
pub fn select(&self, exprs: RArray) -> RbResult<Self> {
|
513
541
|
let ldf = self.ldf.borrow().clone();
|
514
542
|
let exprs = rb_exprs_to_exprs(exprs)?;
|
@@ -701,6 +729,20 @@ impl RbLazyFrame {
|
|
701
729
|
.into())
|
702
730
|
}
|
703
731
|
|
732
|
+
pub fn join_where(&self, other: &Self, predicates: RArray, suffix: String) -> RbResult<Self> {
|
733
|
+
let ldf = self.ldf.borrow().clone();
|
734
|
+
let other = other.ldf.borrow().clone();
|
735
|
+
|
736
|
+
let predicates = rb_exprs_to_exprs(predicates)?;
|
737
|
+
|
738
|
+
Ok(ldf
|
739
|
+
.join_builder()
|
740
|
+
.with(other)
|
741
|
+
.suffix(suffix)
|
742
|
+
.join_where(predicates)
|
743
|
+
.into())
|
744
|
+
}
|
745
|
+
|
704
746
|
pub fn with_column(&self, expr: &RbExpr) -> Self {
|
705
747
|
let ldf = self.ldf.borrow().clone();
|
706
748
|
ldf.with_column(expr.inner.clone()).into()
|
@@ -816,6 +858,14 @@ impl RbLazyFrame {
|
|
816
858
|
.into())
|
817
859
|
}
|
818
860
|
|
861
|
+
pub fn drop_nans(&self, subset: Option<&RbSelector>) -> Self {
|
862
|
+
self.ldf
|
863
|
+
.borrow()
|
864
|
+
.clone()
|
865
|
+
.drop_nans(subset.map(|e| e.inner.clone()))
|
866
|
+
.into()
|
867
|
+
}
|
868
|
+
|
819
869
|
pub fn drop_nulls(&self, subset: Option<&RbSelector>) -> Self {
|
820
870
|
self.ldf
|
821
871
|
.borrow()
|
@@ -880,19 +930,19 @@ impl RbLazyFrame {
|
|
880
930
|
self.ldf.borrow().clone().into()
|
881
931
|
}
|
882
932
|
|
883
|
-
pub fn collect_schema(&
|
884
|
-
let schema =
|
933
|
+
pub fn collect_schema(ruby: &Ruby, rb_self: &Self) -> RbResult<RHash> {
|
934
|
+
let schema = rb_self
|
885
935
|
.ldf
|
886
936
|
.borrow_mut()
|
887
937
|
.collect_schema()
|
888
938
|
.map_err(RbPolarsErr::from)?;
|
889
939
|
|
890
|
-
let schema_dict =
|
940
|
+
let schema_dict = ruby.hash_new();
|
891
941
|
schema.iter_fields().for_each(|fld| {
|
892
942
|
schema_dict
|
893
943
|
.aset::<String, Value>(
|
894
944
|
fld.name().to_string(),
|
895
|
-
Wrap(fld.dtype().clone()).
|
945
|
+
Wrap(fld.dtype().clone()).into_value_with(ruby),
|
896
946
|
)
|
897
947
|
.unwrap();
|
898
948
|
});
|
@@ -1,8 +1,9 @@
|
|
1
1
|
mod general;
|
2
|
+
mod optflags;
|
2
3
|
mod serde;
|
3
4
|
mod sink;
|
4
5
|
|
5
|
-
use polars::
|
6
|
+
use polars::prelude::{LazyFrame, OptFlags};
|
6
7
|
pub use sink::SinkTarget;
|
7
8
|
use std::cell::RefCell;
|
8
9
|
|
@@ -12,6 +13,12 @@ pub struct RbLazyFrame {
|
|
12
13
|
pub ldf: RefCell<LazyFrame>,
|
13
14
|
}
|
14
15
|
|
16
|
+
#[magnus::wrap(class = "Polars::RbOptFlags")]
|
17
|
+
#[derive(Clone)]
|
18
|
+
pub struct RbOptFlags {
|
19
|
+
pub inner: RefCell<OptFlags>,
|
20
|
+
}
|
21
|
+
|
15
22
|
impl From<LazyFrame> for RbLazyFrame {
|
16
23
|
fn from(ldf: LazyFrame) -> Self {
|
17
24
|
RbLazyFrame {
|
@@ -19,3 +26,11 @@ impl From<LazyFrame> for RbLazyFrame {
|
|
19
26
|
}
|
20
27
|
}
|
21
28
|
}
|
29
|
+
|
30
|
+
impl From<OptFlags> for RbOptFlags {
|
31
|
+
fn from(inner: OptFlags) -> Self {
|
32
|
+
RbOptFlags {
|
33
|
+
inner: RefCell::new(inner),
|
34
|
+
}
|
35
|
+
}
|
36
|
+
}
|
@@ -0,0 +1,58 @@
|
|
1
|
+
use polars::prelude::OptFlags;
|
2
|
+
|
3
|
+
use super::RbOptFlags;
|
4
|
+
|
5
|
+
macro_rules! flag_getter_setters {
|
6
|
+
($(($flag:ident, $getter:ident, $setter:ident, clear=$clear:literal))+) => {
|
7
|
+
impl RbOptFlags {
|
8
|
+
pub fn empty() -> Self {
|
9
|
+
Self {
|
10
|
+
inner: OptFlags::empty().into()
|
11
|
+
}
|
12
|
+
}
|
13
|
+
|
14
|
+
#[allow(clippy::should_implement_trait)]
|
15
|
+
pub fn default() -> Self {
|
16
|
+
Self { inner: OptFlags::default().into() }
|
17
|
+
}
|
18
|
+
|
19
|
+
pub fn no_optimizations(&self) {
|
20
|
+
$(if $clear {
|
21
|
+
self.inner.borrow_mut().remove(OptFlags::$flag);
|
22
|
+
})+
|
23
|
+
}
|
24
|
+
|
25
|
+
pub fn copy(&self) -> Self {
|
26
|
+
Self { inner: self.inner.clone() }
|
27
|
+
}
|
28
|
+
|
29
|
+
$(
|
30
|
+
pub fn $getter(&self) -> bool {
|
31
|
+
self.inner.borrow().contains(OptFlags::$flag)
|
32
|
+
}
|
33
|
+
pub fn $setter(&self, value: bool) {
|
34
|
+
self.inner.borrow_mut().set(OptFlags::$flag, value)
|
35
|
+
}
|
36
|
+
)+
|
37
|
+
}
|
38
|
+
};
|
39
|
+
}
|
40
|
+
|
41
|
+
flag_getter_setters! {
|
42
|
+
(TYPE_COERCION, get_type_coercion, set_type_coercion, clear=false)
|
43
|
+
(TYPE_CHECK, get_type_check, set_type_check, clear=false)
|
44
|
+
|
45
|
+
(PROJECTION_PUSHDOWN, get_projection_pushdown, set_projection_pushdown, clear=true)
|
46
|
+
(PREDICATE_PUSHDOWN, get_predicate_pushdown, set_predicate_pushdown, clear=true)
|
47
|
+
(CLUSTER_WITH_COLUMNS, get_cluster_with_columns, set_cluster_with_columns, clear=true)
|
48
|
+
(SIMPLIFY_EXPR, get_simplify_expression, set_simplify_expression, clear=true)
|
49
|
+
(SLICE_PUSHDOWN, get_slice_pushdown, set_slice_pushdown, clear=true)
|
50
|
+
(COMM_SUBPLAN_ELIM, get_comm_subplan_elim, set_comm_subplan_elim, clear=true)
|
51
|
+
(COMM_SUBEXPR_ELIM, get_comm_subexpr_elim, set_comm_subexpr_elim, clear=true)
|
52
|
+
(COLLAPSE_JOINS, get_collapse_joins, set_collapse_joins, clear=true)
|
53
|
+
(CHECK_ORDER_OBSERVE, get_check_order_observe, set_check_order_observe, clear=true)
|
54
|
+
(FAST_PROJECTION, get_fast_projection, set_fast_projection, clear=true)
|
55
|
+
|
56
|
+
(EAGER, get_eager, set_eager, clear=true)
|
57
|
+
(NEW_STREAMING, get_streaming, set_streaming, clear=true)
|
58
|
+
}
|
@@ -1,14 +1,38 @@
|
|
1
|
+
use std::io::Read;
|
2
|
+
#[cfg(feature = "serialize_binary")]
|
3
|
+
use std::io::{BufReader, BufWriter};
|
4
|
+
|
1
5
|
use magnus::Value;
|
2
6
|
use polars::lazy::frame::LazyFrame;
|
3
7
|
use polars::prelude::*;
|
4
|
-
use std::io::Read;
|
5
8
|
|
6
9
|
use crate::file::get_file_like;
|
10
|
+
#[cfg(feature = "serialize_binary")]
|
11
|
+
use crate::utils::to_rb_err;
|
7
12
|
use crate::{RbLazyFrame, RbResult, RbValueError};
|
8
13
|
|
9
14
|
impl RbLazyFrame {
|
10
|
-
|
11
|
-
pub fn
|
15
|
+
#[cfg(feature = "serialize_binary")]
|
16
|
+
pub fn serialize_binary(&self, rb_f: Value) -> RbResult<()> {
|
17
|
+
let file = get_file_like(rb_f, true)?;
|
18
|
+
let writer = BufWriter::new(file);
|
19
|
+
self.ldf
|
20
|
+
.borrow()
|
21
|
+
.logical_plan
|
22
|
+
.serialize_versioned(writer, Default::default())
|
23
|
+
.map_err(to_rb_err)
|
24
|
+
}
|
25
|
+
|
26
|
+
#[cfg(feature = "serialize_binary")]
|
27
|
+
pub fn deserialize_binary(rb_f: Value) -> RbResult<Self> {
|
28
|
+
let file = get_file_like(rb_f, false)?;
|
29
|
+
let reader = BufReader::new(file);
|
30
|
+
|
31
|
+
let lp: DslPlan = DslPlan::deserialize_versioned(reader).map_err(to_rb_err)?;
|
32
|
+
Ok(LazyFrame::from(lp).into())
|
33
|
+
}
|
34
|
+
|
35
|
+
pub fn deserialize_json(rb_f: Value) -> RbResult<Self> {
|
12
36
|
// it is faster to first read to memory and then parse: https://github.com/serde-rs/json/issues/160
|
13
37
|
// so don't bother with files.
|
14
38
|
let mut json = String::new();
|