polars-df 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -1
- data/Cargo.lock +335 -310
- data/Cargo.toml +0 -1
- data/README.md +29 -0
- data/ext/polars/Cargo.toml +5 -3
- data/ext/polars/src/batched_csv.rs +29 -14
- data/ext/polars/src/conversion.rs +53 -12
- data/ext/polars/src/dataframe.rs +36 -39
- data/ext/polars/src/lazy/dataframe.rs +48 -14
- data/ext/polars/src/lazy/dsl.rs +69 -4
- data/ext/polars/src/lib.rs +19 -5
- data/ext/polars/src/series.rs +13 -1
- data/lib/polars/batched_csv_reader.rb +1 -1
- data/lib/polars/binary_expr.rb +77 -0
- data/lib/polars/binary_name_space.rb +66 -0
- data/lib/polars/data_frame.rb +63 -38
- data/lib/polars/date_time_expr.rb +6 -6
- data/lib/polars/expr.rb +9 -2
- data/lib/polars/io.rb +73 -62
- data/lib/polars/lazy_frame.rb +103 -7
- data/lib/polars/lazy_functions.rb +3 -2
- data/lib/polars/list_expr.rb +2 -2
- data/lib/polars/list_name_space.rb +2 -2
- data/lib/polars/series.rb +9 -1
- data/lib/polars/string_expr.rb +1 -1
- data/lib/polars/utils.rb +10 -2
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +2 -0
- metadata +5 -3
data/ext/polars/src/lazy/dsl.rs
CHANGED
@@ -10,7 +10,7 @@ use crate::conversion::*;
|
|
10
10
|
use crate::lazy::apply::*;
|
11
11
|
use crate::lazy::utils::rb_exprs_to_exprs;
|
12
12
|
use crate::utils::reinterpret;
|
13
|
-
use crate::{RbResult, RbSeries};
|
13
|
+
use crate::{RbPolarsErr, RbResult, RbSeries};
|
14
14
|
|
15
15
|
#[magnus::wrap(class = "Polars::RbExpr")]
|
16
16
|
#[derive(Clone)]
|
@@ -715,6 +715,18 @@ impl RbExpr {
|
|
715
715
|
.into()
|
716
716
|
}
|
717
717
|
|
718
|
+
/// Builds a new expression by calling `contains_literal(lit)` on the binary
/// namespace of a clone of the wrapped expression.
///
/// `lit` is the byte sequence forwarded to `contains_literal`.
pub fn binary_contains(&self, lit: Vec<u8>) -> Self {
    let expr = self.inner.clone();
    expr.binary().contains_literal(lit).into()
}
|
721
|
+
|
722
|
+
/// Builds a new expression by calling `ends_with(sub)` on the binary
/// namespace of a clone of the wrapped expression.
///
/// `sub` is the byte sequence forwarded as the suffix argument.
pub fn binary_ends_with(&self, sub: Vec<u8>) -> Self {
    let expr = self.inner.clone();
    expr.binary().ends_with(sub).into()
}
|
725
|
+
|
726
|
+
/// Builds a new expression by calling `starts_with(sub)` on the binary
/// namespace of a clone of the wrapped expression.
///
/// `sub` is the byte sequence forwarded as the prefix argument.
pub fn binary_starts_with(&self, sub: Vec<u8>) -> Self {
    let expr = self.inner.clone();
    expr.binary().starts_with(sub).into()
}
|
729
|
+
|
718
730
|
pub fn str_hex_encode(&self) -> Self {
|
719
731
|
self.clone()
|
720
732
|
.inner
|
@@ -763,6 +775,58 @@ impl RbExpr {
|
|
763
775
|
.into()
|
764
776
|
}
|
765
777
|
|
778
|
+
pub fn binary_hex_encode(&self) -> Self {
|
779
|
+
self.clone()
|
780
|
+
.inner
|
781
|
+
.map(
|
782
|
+
move |s| s.binary().map(|s| Some(s.hex_encode().into_series())),
|
783
|
+
GetOutput::same_type(),
|
784
|
+
)
|
785
|
+
.with_fmt("binary.hex_encode")
|
786
|
+
.into()
|
787
|
+
}
|
788
|
+
|
789
|
+
pub fn binary_hex_decode(&self, strict: bool) -> Self {
|
790
|
+
self.clone()
|
791
|
+
.inner
|
792
|
+
.map(
|
793
|
+
move |s| {
|
794
|
+
s.binary()?
|
795
|
+
.hex_decode(strict)
|
796
|
+
.map(|s| Some(s.into_series()))
|
797
|
+
},
|
798
|
+
GetOutput::same_type(),
|
799
|
+
)
|
800
|
+
.with_fmt("binary.hex_decode")
|
801
|
+
.into()
|
802
|
+
}
|
803
|
+
|
804
|
+
pub fn binary_base64_encode(&self) -> Self {
|
805
|
+
self.clone()
|
806
|
+
.inner
|
807
|
+
.map(
|
808
|
+
move |s| s.binary().map(|s| Some(s.base64_encode().into_series())),
|
809
|
+
GetOutput::same_type(),
|
810
|
+
)
|
811
|
+
.with_fmt("binary.base64_encode")
|
812
|
+
.into()
|
813
|
+
}
|
814
|
+
|
815
|
+
pub fn binary_base64_decode(&self, strict: bool) -> Self {
|
816
|
+
self.clone()
|
817
|
+
.inner
|
818
|
+
.map(
|
819
|
+
move |s| {
|
820
|
+
s.binary()?
|
821
|
+
.base64_decode(strict)
|
822
|
+
.map(|s| Some(s.into_series()))
|
823
|
+
},
|
824
|
+
GetOutput::same_type(),
|
825
|
+
)
|
826
|
+
.with_fmt("binary.base64_decode")
|
827
|
+
.into()
|
828
|
+
}
|
829
|
+
|
766
830
|
pub fn str_json_path_match(&self, pat: String) -> Self {
|
767
831
|
let function = move |s: Series| {
|
768
832
|
let ca = s.utf8()?;
|
@@ -1654,9 +1718,9 @@ pub fn cov(a: &RbExpr, b: &RbExpr) -> RbExpr {
|
|
1654
1718
|
polars::lazy::dsl::cov(a.inner.clone(), b.inner.clone()).into()
|
1655
1719
|
}
|
1656
1720
|
|
1657
|
-
pub fn
|
1721
|
+
pub fn arg_sort_by(by: RArray, reverse: Vec<bool>) -> RbResult<RbExpr> {
|
1658
1722
|
let by = rb_exprs_to_exprs(by)?;
|
1659
|
-
Ok(polars::lazy::dsl::
|
1723
|
+
Ok(polars::lazy::dsl::arg_sort_by(by, &reverse).into())
|
1660
1724
|
}
|
1661
1725
|
|
1662
1726
|
#[magnus::wrap(class = "Polars::RbWhen")]
|
@@ -1706,5 +1770,6 @@ pub fn concat_str(s: RArray, sep: String) -> RbResult<RbExpr> {
|
|
1706
1770
|
|
1707
1771
|
pub fn concat_lst(s: RArray) -> RbResult<RbExpr> {
|
1708
1772
|
let s = rb_exprs_to_exprs(s)?;
|
1709
|
-
|
1773
|
+
let expr = dsl::concat_lst(s).map_err(RbPolarsErr::from)?;
|
1774
|
+
Ok(expr.into())
|
1710
1775
|
}
|
data/ext/polars/src/lib.rs
CHANGED
@@ -73,7 +73,7 @@ fn init() -> RbResult<()> {
|
|
73
73
|
let class = module.define_class("RbDataFrame", Default::default())?;
|
74
74
|
class.define_singleton_method("new", function!(RbDataFrame::init, 1))?;
|
75
75
|
class.define_singleton_method("read_csv", function!(RbDataFrame::read_csv, -1))?;
|
76
|
-
class.define_singleton_method("read_parquet", function!(RbDataFrame::read_parquet,
|
76
|
+
class.define_singleton_method("read_parquet", function!(RbDataFrame::read_parquet, 9))?;
|
77
77
|
class.define_singleton_method("read_ipc", function!(RbDataFrame::read_ipc, 6))?;
|
78
78
|
class.define_singleton_method("read_avro", function!(RbDataFrame::read_avro, 4))?;
|
79
79
|
class.define_singleton_method("read_hashes", function!(RbDataFrame::read_hashes, 3))?;
|
@@ -151,7 +151,6 @@ fn init() -> RbResult<()> {
|
|
151
151
|
class.define_method("pivot_expr", method!(RbDataFrame::pivot_expr, 7))?;
|
152
152
|
class.define_method("partition_by", method!(RbDataFrame::partition_by, 2))?;
|
153
153
|
class.define_method("shift", method!(RbDataFrame::shift, 1))?;
|
154
|
-
class.define_method("unique", method!(RbDataFrame::unique, 3))?;
|
155
154
|
class.define_method("lazy", method!(RbDataFrame::lazy, 0))?;
|
156
155
|
class.define_method("max", method!(RbDataFrame::max, 0))?;
|
157
156
|
class.define_method("min", method!(RbDataFrame::min, 0))?;
|
@@ -304,10 +303,23 @@ fn init() -> RbResult<()> {
|
|
304
303
|
class.define_method("str_contains", method!(RbExpr::str_contains, 3))?;
|
305
304
|
class.define_method("str_ends_with", method!(RbExpr::str_ends_with, 1))?;
|
306
305
|
class.define_method("str_starts_with", method!(RbExpr::str_starts_with, 1))?;
|
306
|
+
class.define_method("binary_contains", method!(RbExpr::binary_contains, 1))?;
|
307
|
+
class.define_method("binary_ends_with", method!(RbExpr::binary_ends_with, 1))?;
|
308
|
+
class.define_method("binary_starts_with", method!(RbExpr::binary_starts_with, 1))?;
|
307
309
|
class.define_method("str_hex_encode", method!(RbExpr::str_hex_encode, 0))?;
|
308
310
|
class.define_method("str_hex_decode", method!(RbExpr::str_hex_decode, 1))?;
|
309
311
|
class.define_method("str_base64_encode", method!(RbExpr::str_base64_encode, 0))?;
|
310
312
|
class.define_method("str_base64_decode", method!(RbExpr::str_base64_decode, 1))?;
|
313
|
+
class.define_method("binary_hex_encode", method!(RbExpr::binary_hex_encode, 0))?;
|
314
|
+
class.define_method("binary_hex_decode", method!(RbExpr::binary_hex_decode, 1))?;
|
315
|
+
class.define_method(
|
316
|
+
"binary_base64_encode",
|
317
|
+
method!(RbExpr::binary_base64_encode, 0),
|
318
|
+
)?;
|
319
|
+
class.define_method(
|
320
|
+
"binary_base64_decode",
|
321
|
+
method!(RbExpr::binary_base64_decode, 1),
|
322
|
+
)?;
|
311
323
|
class.define_method(
|
312
324
|
"str_json_path_match",
|
313
325
|
method!(RbExpr::str_json_path_match, 1),
|
@@ -473,7 +485,7 @@ fn init() -> RbResult<()> {
|
|
473
485
|
function!(crate::lazy::dsl::spearman_rank_corr, 4),
|
474
486
|
)?;
|
475
487
|
class.define_singleton_method("cov", function!(crate::lazy::dsl::cov, 2))?;
|
476
|
-
class.define_singleton_method("
|
488
|
+
class.define_singleton_method("arg_sort_by", function!(crate::lazy::dsl::arg_sort_by, 2))?;
|
477
489
|
class.define_singleton_method("when", function!(crate::lazy::dsl::when, 1))?;
|
478
490
|
class.define_singleton_method("concat_str", function!(crate::lazy::dsl::concat_str, 2))?;
|
479
491
|
class.define_singleton_method("concat_lst", function!(crate::lazy::dsl::concat_lst, 1))?;
|
@@ -487,7 +499,7 @@ fn init() -> RbResult<()> {
|
|
487
499
|
class.define_singleton_method("new_from_csv", function!(RbLazyFrame::new_from_csv, -1))?;
|
488
500
|
class.define_singleton_method(
|
489
501
|
"new_from_parquet",
|
490
|
-
function!(RbLazyFrame::new_from_parquet,
|
502
|
+
function!(RbLazyFrame::new_from_parquet, 8),
|
491
503
|
)?;
|
492
504
|
class.define_singleton_method("new_from_ipc", function!(RbLazyFrame::new_from_ipc, 6))?;
|
493
505
|
class.define_method("write_json", method!(RbLazyFrame::write_json, 1))?;
|
@@ -504,6 +516,7 @@ fn init() -> RbResult<()> {
|
|
504
516
|
class.define_method("sort_by_exprs", method!(RbLazyFrame::sort_by_exprs, 3))?;
|
505
517
|
class.define_method("cache", method!(RbLazyFrame::cache, 0))?;
|
506
518
|
class.define_method("collect", method!(RbLazyFrame::collect, 0))?;
|
519
|
+
class.define_method("sink_parquet", method!(RbLazyFrame::sink_parquet, 7))?;
|
507
520
|
class.define_method("fetch", method!(RbLazyFrame::fetch, 1))?;
|
508
521
|
class.define_method("filter", method!(RbLazyFrame::filter, 1))?;
|
509
522
|
class.define_method("select", method!(RbLazyFrame::select, 1))?;
|
@@ -532,7 +545,7 @@ fn init() -> RbResult<()> {
|
|
532
545
|
class.define_method("drop_nulls", method!(RbLazyFrame::drop_nulls, 1))?;
|
533
546
|
class.define_method("slice", method!(RbLazyFrame::slice, 2))?;
|
534
547
|
class.define_method("tail", method!(RbLazyFrame::tail, 1))?;
|
535
|
-
class.define_method("melt", method!(RbLazyFrame::melt,
|
548
|
+
class.define_method("melt", method!(RbLazyFrame::melt, 5))?;
|
536
549
|
class.define_method("with_row_count", method!(RbLazyFrame::with_row_count, 2))?;
|
537
550
|
class.define_method("drop_columns", method!(RbLazyFrame::drop_columns, 1))?;
|
538
551
|
class.define_method("_clone", method!(RbLazyFrame::clone, 0))?;
|
@@ -560,6 +573,7 @@ fn init() -> RbResult<()> {
|
|
560
573
|
class.define_singleton_method("new_opt_f32", function!(RbSeries::new_opt_f32, 3))?;
|
561
574
|
class.define_singleton_method("new_opt_f64", function!(RbSeries::new_opt_f64, 3))?;
|
562
575
|
class.define_singleton_method("new_str", function!(RbSeries::new_str, 3))?;
|
576
|
+
class.define_singleton_method("new_binary", function!(RbSeries::new_binary, 3))?;
|
563
577
|
class.define_singleton_method("new_object", function!(RbSeries::new_object, 3))?;
|
564
578
|
class.define_singleton_method("new_list", function!(RbSeries::new_list, 3))?;
|
565
579
|
class.define_singleton_method("new_opt_date", function!(RbSeries::new_opt_date, 3))?;
|
data/ext/polars/src/series.rs
CHANGED
@@ -125,6 +125,12 @@ impl RbSeries {
|
|
125
125
|
RbSeries::new(s)
|
126
126
|
}
|
127
127
|
|
128
|
+
pub fn new_binary(name: String, val: Wrap<BinaryChunked>, _strict: bool) -> Self {
|
129
|
+
let mut s = val.0.into_series();
|
130
|
+
s.rename(&name);
|
131
|
+
RbSeries::new(s)
|
132
|
+
}
|
133
|
+
|
128
134
|
pub fn new_object(name: String, val: RArray, _strict: bool) -> RbResult<Self> {
|
129
135
|
let val = val
|
130
136
|
.each()
|
@@ -504,7 +510,6 @@ impl RbSeries {
|
|
504
510
|
DataType::Int64 => RArray::from_iter(series.i64().unwrap()),
|
505
511
|
DataType::Float32 => RArray::from_iter(series.f32().unwrap()),
|
506
512
|
DataType::Float64 => RArray::from_iter(series.f64().unwrap()),
|
507
|
-
DataType::Decimal128(_) => todo!(),
|
508
513
|
DataType::Categorical(_) => {
|
509
514
|
RArray::from_iter(series.categorical().unwrap().iter_str())
|
510
515
|
}
|
@@ -526,6 +531,13 @@ impl RbSeries {
|
|
526
531
|
let ca = series.utf8().unwrap();
|
527
532
|
return RArray::from_iter(ca);
|
528
533
|
}
|
534
|
+
DataType::Binary => {
|
535
|
+
let a = RArray::with_capacity(series.len());
|
536
|
+
for v in series.iter() {
|
537
|
+
a.push::<Value>(Wrap(v).into_value()).unwrap();
|
538
|
+
}
|
539
|
+
return a;
|
540
|
+
}
|
529
541
|
DataType::Null | DataType::Unknown => {
|
530
542
|
panic!("to_a not implemented for null/unknown")
|
531
543
|
}
|
@@ -0,0 +1,77 @@
|
|
1
|
+
module Polars
  # Namespace for binary related expressions.
  class BinaryExpr
    # @private
    attr_accessor :_rbexpr

    # @private
    def initialize(expr)
      self._rbexpr = expr._rbexpr
    end

    # Check if binary values contain a binary substring.
    #
    # @param lit [String]
    #   The binary substring to look for
    #
    # @return [Expr]
    def contains(lit)
      Utils.wrap_expr(_rbexpr.binary_contains(lit))
    end

    # Check if binary values end with a binary substring.
    #
    # @param sub [String]
    #   Suffix substring.
    #
    # @return [Expr]
    def ends_with(sub)
      Utils.wrap_expr(_rbexpr.binary_ends_with(sub))
    end

    # Check if binary values start with a binary substring.
    #
    # @param sub [String]
    #   Prefix substring.
    #
    # @return [Expr]
    def starts_with(sub)
      Utils.wrap_expr(_rbexpr.binary_starts_with(sub))
    end

    # Decode a value using the provided encoding.
    #
    # @param encoding ["hex", "base64"]
    #   The encoding to use.
    # @param strict [Boolean]
    #   Raise an error if the underlying value cannot be decoded,
    #   otherwise mask out with a null value.
    #
    # @return [Expr]
    #
    # @raise [ArgumentError]
    #   If `encoding` is neither "hex" nor "base64".
    def decode(encoding, strict: true)
      case encoding
      when "hex"
        Utils.wrap_expr(_rbexpr.binary_hex_decode(strict))
      when "base64"
        Utils.wrap_expr(_rbexpr.binary_base64_decode(strict))
      else
        # Single braces: Ruby does not collapse "{{"/"}}" the way a Python
        # format string does, so the doubled form was printed verbatim.
        raise ArgumentError, "encoding must be one of {'hex', 'base64'}, got #{encoding}"
      end
    end

    # Encode a value using the provided encoding.
    #
    # @param encoding ["hex", "base64"]
    #   The encoding to use.
    #
    # @return [Expr]
    #
    # @raise [ArgumentError]
    #   If `encoding` is neither "hex" nor "base64".
    def encode(encoding)
      case encoding
      when "hex"
        Utils.wrap_expr(_rbexpr.binary_hex_encode)
      when "base64"
        Utils.wrap_expr(_rbexpr.binary_base64_encode)
      else
        # Single braces: see the note in #decode about Python-style escaping.
        raise ArgumentError, "encoding must be one of {'hex', 'base64'}, got #{encoding}"
      end
    end
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
module Polars
  # Series.bin namespace.
  #
  # Every public method below only calls `super`.
  # NOTE(review): the actual implementation presumably comes from
  # `ExprDispatch` together with the "bin" accessor name — confirm there.
  class BinaryNameSpace
    include ExprDispatch

    # Accessor name registered for this namespace.
    self._accessor = "bin"

    # @private
    def initialize(series)
      self._s = series._s
    end

    # Check if binary values in the Series contain a binary substring.
    #
    # @param lit [String]
    #   The binary substring to look for
    #
    # @return [Series]
    def contains(lit)
      super
    end

    # Check if binary values end with a binary substring.
    #
    # @param sub [String]
    #   Suffix substring.
    #
    # @return [Series]
    def ends_with(sub)
      super
    end

    # Check if binary values start with a binary substring.
    #
    # @param sub [String]
    #   Prefix substring.
    #
    # @return [Series]
    def starts_with(sub)
      super
    end

    # Decode a value using the provided encoding.
    #
    # @param encoding ["hex", "base64"]
    #   The encoding to use.
    # @param strict [Boolean]
    #   Raise an error if the underlying value cannot be decoded,
    #   otherwise mask out with a null value.
    #
    # @return [Series]
    def decode(encoding, strict: true)
      super
    end

    # Encode a value using the provided encoding.
    #
    # @param encoding ["hex", "base64"]
    #   The encoding to use.
    #
    # @return [Series]
    def encode(encoding)
      super
    end
  end
end
|
data/lib/polars/data_frame.rb
CHANGED
@@ -97,7 +97,7 @@ module Polars
|
|
97
97
|
eol_char: "\n"
|
98
98
|
)
|
99
99
|
if Utils.pathlike?(file)
|
100
|
-
path = Utils.
|
100
|
+
path = Utils.normalise_filepath(file)
|
101
101
|
else
|
102
102
|
path = nil
|
103
103
|
# if defined?(StringIO) && file.is_a?(StringIO)
|
@@ -196,32 +196,56 @@ module Polars
|
|
196
196
|
|
197
197
|
# @private
|
198
198
|
# Read a Parquet source into a DataFrame.
#
# A String `source` containing "*" that `Utils.local_file?` accepts is read
# through `Polars.scan_parquet` and collected; every other source goes
# straight to `RbDataFrame.read_parquet`.
#
# @param source [Object]
#   Path-like object or other source accepted by `RbDataFrame.read_parquet`.
# @param columns [Object]
#   Column selection; a single String is wrapped into an Array. With a glob
#   source it must be nil or a sequence of Strings.
# @param n_rows [Integer, nil]
# @param parallel [String]
# @param row_count_name [String, nil]
# @param row_count_offset [Integer]
# @param low_memory [Boolean]
# @param use_statistics [Boolean]
#   Only forwarded on the non-glob path.
# @param rechunk [Boolean]
#   Forwarded on both paths.
#
# @return [DataFrame]
#
# @raise [ArgumentError]
#   If a glob source is combined with a non-String `columns` selection.
def self._read_parquet(
  source,
  columns: nil,
  n_rows: nil,
  parallel: "auto",
  row_count_name: nil,
  row_count_offset: 0,
  low_memory: false,
  use_statistics: true,
  rechunk: true
)
  if Utils.pathlike?(source)
    source = Utils.normalise_filepath(source)
  end
  if columns.is_a?(String)
    columns = [columns]
  end

  if source.is_a?(String) && source.include?("*") && Utils.local_file?(source)
    scan =
      Polars.scan_parquet(
        source,
        n_rows: n_rows,
        # Honour the caller's choice instead of always forcing a rechunk,
        # matching what the non-glob path below already does.
        rechunk: rechunk,
        parallel: parallel,
        row_count_name: row_count_name,
        row_count_offset: row_count_offset,
        low_memory: low_memory
      )

    if columns.nil?
      return self._from_rbdf(scan.collect._df)
    elsif Utils.is_str_sequence(columns, allow_str: false)
      return self._from_rbdf(scan.select(columns).collect._df)
    else
      raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: Array[String]"
    end
  end

  projection, columns = Utils.handle_projection_columns(columns)
  _from_rbdf(
    RbDataFrame.read_parquet(
      source,
      columns,
      projection,
      n_rows,
      parallel,
      Utils._prepare_row_count_args(row_count_name, row_count_offset),
      low_memory,
      use_statistics,
      rechunk
    )
  )
end
|
@@ -229,7 +253,7 @@ module Polars
|
|
229
253
|
# @private
|
230
254
|
def self._read_avro(file, columns: nil, n_rows: nil)
|
231
255
|
if Utils.pathlike?(file)
|
232
|
-
file = Utils.
|
256
|
+
file = Utils.normalise_filepath(file)
|
233
257
|
end
|
234
258
|
projection, columns = Utils.handle_projection_columns(columns)
|
235
259
|
_from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
|
@@ -246,7 +270,7 @@ module Polars
|
|
246
270
|
memory_map: true
|
247
271
|
)
|
248
272
|
if Utils.pathlike?(file)
|
249
|
-
file = Utils.
|
273
|
+
file = Utils.normalise_filepath(file)
|
250
274
|
end
|
251
275
|
if columns.is_a?(String)
|
252
276
|
columns = [columns]
|
@@ -272,7 +296,7 @@ module Polars
|
|
272
296
|
# @private
|
273
297
|
def self._read_json(file)
|
274
298
|
if Utils.pathlike?(file)
|
275
|
-
file = Utils.
|
299
|
+
file = Utils.normalise_filepath(file)
|
276
300
|
end
|
277
301
|
|
278
302
|
_from_rbdf(RbDataFrame.read_json(file))
|
@@ -281,7 +305,7 @@ module Polars
|
|
281
305
|
# @private
|
282
306
|
def self._read_ndjson(file)
|
283
307
|
if Utils.pathlike?(file)
|
284
|
-
file = Utils.
|
308
|
+
file = Utils.normalise_filepath(file)
|
285
309
|
end
|
286
310
|
|
287
311
|
_from_rbdf(RbDataFrame.read_ndjson(file))
|
@@ -774,7 +798,7 @@ module Polars
|
|
774
798
|
row_oriented: false
|
775
799
|
)
|
776
800
|
if Utils.pathlike?(file)
|
777
|
-
file = Utils.
|
801
|
+
file = Utils.normalise_filepath(file)
|
778
802
|
end
|
779
803
|
|
780
804
|
_df.write_json(file, pretty, row_oriented)
|
@@ -789,7 +813,7 @@ module Polars
|
|
789
813
|
# @return [nil]
|
790
814
|
def write_ndjson(file)
|
791
815
|
if Utils.pathlike?(file)
|
792
|
-
file = Utils.
|
816
|
+
file = Utils.normalise_filepath(file)
|
793
817
|
end
|
794
818
|
|
795
819
|
_df.write_ndjson(file)
|
@@ -879,7 +903,7 @@ module Polars
|
|
879
903
|
end
|
880
904
|
|
881
905
|
if Utils.pathlike?(file)
|
882
|
-
file = Utils.
|
906
|
+
file = Utils.normalise_filepath(file)
|
883
907
|
end
|
884
908
|
|
885
909
|
_df.write_csv(
|
@@ -917,7 +941,7 @@ module Polars
|
|
917
941
|
compression = "uncompressed"
|
918
942
|
end
|
919
943
|
if Utils.pathlike?(file)
|
920
|
-
file = Utils.
|
944
|
+
file = Utils.normalise_filepath(file)
|
921
945
|
end
|
922
946
|
|
923
947
|
_df.write_avro(file, compression)
|
@@ -936,7 +960,7 @@ module Polars
|
|
936
960
|
compression = "uncompressed"
|
937
961
|
end
|
938
962
|
if Utils.pathlike?(file)
|
939
|
-
file = Utils.
|
963
|
+
file = Utils.normalise_filepath(file)
|
940
964
|
end
|
941
965
|
|
942
966
|
_df.write_ipc(file, compression)
|
@@ -978,7 +1002,7 @@ module Polars
|
|
978
1002
|
compression = "uncompressed"
|
979
1003
|
end
|
980
1004
|
if Utils.pathlike?(file)
|
981
|
-
file = Utils.
|
1005
|
+
file = Utils.normalise_filepath(file)
|
982
1006
|
end
|
983
1007
|
|
984
1008
|
_df.write_parquet(
|
@@ -3042,24 +3066,28 @@ module Polars
|
|
3042
3066
|
# Resolve `aggregate_fn` into `aggregate_expr`: a known aggregation name maps
# to a predefined expression, nil stays nil, and any other object is expected
# to expose `_rbexpr`.
if aggregate_fn.is_a?(String)
  case aggregate_fn
  when "first"
    aggregate_expr = Polars.element.first._rbexpr
  when "sum"
    aggregate_expr = Polars.element.sum._rbexpr
  when "max"
    aggregate_expr = Polars.element.max._rbexpr
  when "min"
    aggregate_expr = Polars.element.min._rbexpr
  when "mean"
    aggregate_expr = Polars.element.mean._rbexpr
  when "median"
    aggregate_expr = Polars.element.median._rbexpr
  when "last"
    aggregate_expr = Polars.element.last._rbexpr
  when "count"
    aggregate_expr = Polars.count._rbexpr
  else
    raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
  end
elsif aggregate_fn.nil?
  aggregate_expr = nil
else
  # The variable in scope is `aggregate_fn`; `aggregate_function` is not
  # defined here and raised NameError for every custom expression.
  aggregate_expr = aggregate_fn._rbexpr
end
|
3064
3092
|
|
3065
3093
|
_from_rbdf(
|
@@ -3067,9 +3095,9 @@ module Polars
|
|
3067
3095
|
values,
|
3068
3096
|
index,
|
3069
3097
|
columns,
|
3070
|
-
aggregate_fn._rbexpr,
|
3071
3098
|
maintain_order,
|
3072
3099
|
sort_columns,
|
3100
|
+
aggregate_expr,
|
3073
3101
|
separator
|
3074
3102
|
)
|
3075
3103
|
)
|
@@ -3174,7 +3202,7 @@ module Polars
|
|
3174
3202
|
# # │ B ┆ 1 │
|
3175
3203
|
# # │ C ┆ 2 │
|
3176
3204
|
# # │ D ┆ 3 │
|
3177
|
-
# # │
|
3205
|
+
# # │ … ┆ … │
|
3178
3206
|
# # │ F ┆ 5 │
|
3179
3207
|
# # │ G ┆ 6 │
|
3180
3208
|
# # │ H ┆ 7 │
|
@@ -4053,15 +4081,12 @@ module Polars
|
|
4053
4081
|
# # │ 5 ┆ 3.0 ┆ true │
|
4054
4082
|
# # └─────┴─────┴───────┘
|
4055
4083
|
# Runs `unique` on the lazy version of this frame with the same keyword
# arguments, collects it with `no_optimization: true`, and wraps the
# resulting native frame.
def unique(maintain_order: true, subset: nil, keep: "first")
  deduplicated = lazy.unique(maintain_order: maintain_order, subset: subset, keep: keep)
  collected = deduplicated.collect(no_optimization: true)
  self._from_rbdf(collected._df)
end
|
4066
4091
|
|
4067
4092
|
# Return the number of unique rows, or the number of unique row-subsets.
|
@@ -1130,7 +1130,7 @@ module Polars
|
|
1130
1130
|
# ]
|
1131
1131
|
# )
|
1132
1132
|
# # =>
|
1133
|
-
# # shape: (
|
1133
|
+
# # shape: (1_001, 2)
|
1134
1134
|
# # ┌─────────────────────────┬───────────────────┐
|
1135
1135
|
# # │ date ┆ milliseconds_diff │
|
1136
1136
|
# # │ --- ┆ --- │
|
@@ -1140,7 +1140,7 @@ module Polars
|
|
1140
1140
|
# # │ 2020-01-01 00:00:00.001 ┆ 1 │
|
1141
1141
|
# # │ 2020-01-01 00:00:00.002 ┆ 1 │
|
1142
1142
|
# # │ 2020-01-01 00:00:00.003 ┆ 1 │
|
1143
|
-
# # │
|
1143
|
+
# # │ … ┆ … │
|
1144
1144
|
# # │ 2020-01-01 00:00:00.997 ┆ 1 │
|
1145
1145
|
# # │ 2020-01-01 00:00:00.998 ┆ 1 │
|
1146
1146
|
# # │ 2020-01-01 00:00:00.999 ┆ 1 │
|
@@ -1169,7 +1169,7 @@ module Polars
|
|
1169
1169
|
# ]
|
1170
1170
|
# )
|
1171
1171
|
# # =>
|
1172
|
-
# # shape: (
|
1172
|
+
# # shape: (1_001, 2)
|
1173
1173
|
# # ┌─────────────────────────┬───────────────────┐
|
1174
1174
|
# # │ date ┆ microseconds_diff │
|
1175
1175
|
# # │ --- ┆ --- │
|
@@ -1179,7 +1179,7 @@ module Polars
|
|
1179
1179
|
# # │ 2020-01-01 00:00:00.001 ┆ 1000 │
|
1180
1180
|
# # │ 2020-01-01 00:00:00.002 ┆ 1000 │
|
1181
1181
|
# # │ 2020-01-01 00:00:00.003 ┆ 1000 │
|
1182
|
-
# # │
|
1182
|
+
# # │ … ┆ … │
|
1183
1183
|
# # │ 2020-01-01 00:00:00.997 ┆ 1000 │
|
1184
1184
|
# # │ 2020-01-01 00:00:00.998 ┆ 1000 │
|
1185
1185
|
# # │ 2020-01-01 00:00:00.999 ┆ 1000 │
|
@@ -1208,7 +1208,7 @@ module Polars
|
|
1208
1208
|
# ]
|
1209
1209
|
# )
|
1210
1210
|
# # =>
|
1211
|
-
# # shape: (
|
1211
|
+
# # shape: (1_001, 2)
|
1212
1212
|
# # ┌─────────────────────────┬──────────────────┐
|
1213
1213
|
# # │ date ┆ nanoseconds_diff │
|
1214
1214
|
# # │ --- ┆ --- │
|
@@ -1218,7 +1218,7 @@ module Polars
|
|
1218
1218
|
# # │ 2020-01-01 00:00:00.001 ┆ 1000000 │
|
1219
1219
|
# # │ 2020-01-01 00:00:00.002 ┆ 1000000 │
|
1220
1220
|
# # │ 2020-01-01 00:00:00.003 ┆ 1000000 │
|
1221
|
-
# # │
|
1221
|
+
# # │ … ┆ … │
|
1222
1222
|
# # │ 2020-01-01 00:00:00.997 ┆ 1000000 │
|
1223
1223
|
# # │ 2020-01-01 00:00:00.998 ┆ 1000000 │
|
1224
1224
|
# # │ 2020-01-01 00:00:00.999 ┆ 1000000 │
|