polars-df 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -1
- data/Cargo.lock +335 -310
- data/Cargo.toml +0 -1
- data/README.md +29 -0
- data/ext/polars/Cargo.toml +5 -3
- data/ext/polars/src/batched_csv.rs +29 -14
- data/ext/polars/src/conversion.rs +53 -12
- data/ext/polars/src/dataframe.rs +36 -39
- data/ext/polars/src/lazy/dataframe.rs +48 -14
- data/ext/polars/src/lazy/dsl.rs +69 -4
- data/ext/polars/src/lib.rs +19 -5
- data/ext/polars/src/series.rs +13 -1
- data/lib/polars/batched_csv_reader.rb +1 -1
- data/lib/polars/binary_expr.rb +77 -0
- data/lib/polars/binary_name_space.rb +66 -0
- data/lib/polars/data_frame.rb +63 -38
- data/lib/polars/date_time_expr.rb +6 -6
- data/lib/polars/expr.rb +9 -2
- data/lib/polars/io.rb +73 -62
- data/lib/polars/lazy_frame.rb +103 -7
- data/lib/polars/lazy_functions.rb +3 -2
- data/lib/polars/list_expr.rb +2 -2
- data/lib/polars/list_name_space.rb +2 -2
- data/lib/polars/series.rb +9 -1
- data/lib/polars/string_expr.rb +1 -1
- data/lib/polars/utils.rb +10 -2
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +2 -0
- metadata +5 -3
data/ext/polars/src/lazy/dsl.rs
CHANGED
@@ -10,7 +10,7 @@ use crate::conversion::*;
|
|
10
10
|
use crate::lazy::apply::*;
|
11
11
|
use crate::lazy::utils::rb_exprs_to_exprs;
|
12
12
|
use crate::utils::reinterpret;
|
13
|
-
use crate::{RbResult, RbSeries};
|
13
|
+
use crate::{RbPolarsErr, RbResult, RbSeries};
|
14
14
|
|
15
15
|
#[magnus::wrap(class = "Polars::RbExpr")]
|
16
16
|
#[derive(Clone)]
|
@@ -715,6 +715,18 @@ impl RbExpr {
|
|
715
715
|
.into()
|
716
716
|
}
|
717
717
|
|
718
|
+
/// Builds an expression that checks each binary value for the literal byte
/// sequence `lit`, delegating to the binary namespace's `contains_literal`.
pub fn binary_contains(&self, lit: Vec<u8>) -> Self {
    let expr = self.inner.clone();
    expr.binary().contains_literal(lit).into()
}
|
721
|
+
|
722
|
+
/// Builds an expression that checks whether each binary value ends with the
/// byte sequence `sub`.
pub fn binary_ends_with(&self, sub: Vec<u8>) -> Self {
    let expr = self.inner.clone();
    expr.binary().ends_with(sub).into()
}
|
725
|
+
|
726
|
+
/// Builds an expression that checks whether each binary value starts with
/// the byte sequence `sub`.
pub fn binary_starts_with(&self, sub: Vec<u8>) -> Self {
    let expr = self.inner.clone();
    expr.binary().starts_with(sub).into()
}
|
729
|
+
|
718
730
|
pub fn str_hex_encode(&self) -> Self {
|
719
731
|
self.clone()
|
720
732
|
.inner
|
@@ -763,6 +775,58 @@ impl RbExpr {
|
|
763
775
|
.into()
|
764
776
|
}
|
765
777
|
|
778
|
+
pub fn binary_hex_encode(&self) -> Self {
|
779
|
+
self.clone()
|
780
|
+
.inner
|
781
|
+
.map(
|
782
|
+
move |s| s.binary().map(|s| Some(s.hex_encode().into_series())),
|
783
|
+
GetOutput::same_type(),
|
784
|
+
)
|
785
|
+
.with_fmt("binary.hex_encode")
|
786
|
+
.into()
|
787
|
+
}
|
788
|
+
|
789
|
+
pub fn binary_hex_decode(&self, strict: bool) -> Self {
|
790
|
+
self.clone()
|
791
|
+
.inner
|
792
|
+
.map(
|
793
|
+
move |s| {
|
794
|
+
s.binary()?
|
795
|
+
.hex_decode(strict)
|
796
|
+
.map(|s| Some(s.into_series()))
|
797
|
+
},
|
798
|
+
GetOutput::same_type(),
|
799
|
+
)
|
800
|
+
.with_fmt("binary.hex_decode")
|
801
|
+
.into()
|
802
|
+
}
|
803
|
+
|
804
|
+
pub fn binary_base64_encode(&self) -> Self {
|
805
|
+
self.clone()
|
806
|
+
.inner
|
807
|
+
.map(
|
808
|
+
move |s| s.binary().map(|s| Some(s.base64_encode().into_series())),
|
809
|
+
GetOutput::same_type(),
|
810
|
+
)
|
811
|
+
.with_fmt("binary.base64_encode")
|
812
|
+
.into()
|
813
|
+
}
|
814
|
+
|
815
|
+
pub fn binary_base64_decode(&self, strict: bool) -> Self {
|
816
|
+
self.clone()
|
817
|
+
.inner
|
818
|
+
.map(
|
819
|
+
move |s| {
|
820
|
+
s.binary()?
|
821
|
+
.base64_decode(strict)
|
822
|
+
.map(|s| Some(s.into_series()))
|
823
|
+
},
|
824
|
+
GetOutput::same_type(),
|
825
|
+
)
|
826
|
+
.with_fmt("binary.base64_decode")
|
827
|
+
.into()
|
828
|
+
}
|
829
|
+
|
766
830
|
pub fn str_json_path_match(&self, pat: String) -> Self {
|
767
831
|
let function = move |s: Series| {
|
768
832
|
let ca = s.utf8()?;
|
@@ -1654,9 +1718,9 @@ pub fn cov(a: &RbExpr, b: &RbExpr) -> RbExpr {
|
|
1654
1718
|
polars::lazy::dsl::cov(a.inner.clone(), b.inner.clone()).into()
|
1655
1719
|
}
|
1656
1720
|
|
1657
|
-
pub fn
|
1721
|
+
pub fn arg_sort_by(by: RArray, reverse: Vec<bool>) -> RbResult<RbExpr> {
|
1658
1722
|
let by = rb_exprs_to_exprs(by)?;
|
1659
|
-
Ok(polars::lazy::dsl::
|
1723
|
+
Ok(polars::lazy::dsl::arg_sort_by(by, &reverse).into())
|
1660
1724
|
}
|
1661
1725
|
|
1662
1726
|
#[magnus::wrap(class = "Polars::RbWhen")]
|
@@ -1706,5 +1770,6 @@ pub fn concat_str(s: RArray, sep: String) -> RbResult<RbExpr> {
|
|
1706
1770
|
|
1707
1771
|
pub fn concat_lst(s: RArray) -> RbResult<RbExpr> {
|
1708
1772
|
let s = rb_exprs_to_exprs(s)?;
|
1709
|
-
|
1773
|
+
let expr = dsl::concat_lst(s).map_err(RbPolarsErr::from)?;
|
1774
|
+
Ok(expr.into())
|
1710
1775
|
}
|
data/ext/polars/src/lib.rs
CHANGED
@@ -73,7 +73,7 @@ fn init() -> RbResult<()> {
|
|
73
73
|
let class = module.define_class("RbDataFrame", Default::default())?;
|
74
74
|
class.define_singleton_method("new", function!(RbDataFrame::init, 1))?;
|
75
75
|
class.define_singleton_method("read_csv", function!(RbDataFrame::read_csv, -1))?;
|
76
|
-
class.define_singleton_method("read_parquet", function!(RbDataFrame::read_parquet,
|
76
|
+
class.define_singleton_method("read_parquet", function!(RbDataFrame::read_parquet, 9))?;
|
77
77
|
class.define_singleton_method("read_ipc", function!(RbDataFrame::read_ipc, 6))?;
|
78
78
|
class.define_singleton_method("read_avro", function!(RbDataFrame::read_avro, 4))?;
|
79
79
|
class.define_singleton_method("read_hashes", function!(RbDataFrame::read_hashes, 3))?;
|
@@ -151,7 +151,6 @@ fn init() -> RbResult<()> {
|
|
151
151
|
class.define_method("pivot_expr", method!(RbDataFrame::pivot_expr, 7))?;
|
152
152
|
class.define_method("partition_by", method!(RbDataFrame::partition_by, 2))?;
|
153
153
|
class.define_method("shift", method!(RbDataFrame::shift, 1))?;
|
154
|
-
class.define_method("unique", method!(RbDataFrame::unique, 3))?;
|
155
154
|
class.define_method("lazy", method!(RbDataFrame::lazy, 0))?;
|
156
155
|
class.define_method("max", method!(RbDataFrame::max, 0))?;
|
157
156
|
class.define_method("min", method!(RbDataFrame::min, 0))?;
|
@@ -304,10 +303,23 @@ fn init() -> RbResult<()> {
|
|
304
303
|
class.define_method("str_contains", method!(RbExpr::str_contains, 3))?;
|
305
304
|
class.define_method("str_ends_with", method!(RbExpr::str_ends_with, 1))?;
|
306
305
|
class.define_method("str_starts_with", method!(RbExpr::str_starts_with, 1))?;
|
306
|
+
class.define_method("binary_contains", method!(RbExpr::binary_contains, 1))?;
|
307
|
+
class.define_method("binary_ends_with", method!(RbExpr::binary_ends_with, 1))?;
|
308
|
+
class.define_method("binary_starts_with", method!(RbExpr::binary_starts_with, 1))?;
|
307
309
|
class.define_method("str_hex_encode", method!(RbExpr::str_hex_encode, 0))?;
|
308
310
|
class.define_method("str_hex_decode", method!(RbExpr::str_hex_decode, 1))?;
|
309
311
|
class.define_method("str_base64_encode", method!(RbExpr::str_base64_encode, 0))?;
|
310
312
|
class.define_method("str_base64_decode", method!(RbExpr::str_base64_decode, 1))?;
|
313
|
+
class.define_method("binary_hex_encode", method!(RbExpr::binary_hex_encode, 0))?;
|
314
|
+
class.define_method("binary_hex_decode", method!(RbExpr::binary_hex_decode, 1))?;
|
315
|
+
class.define_method(
|
316
|
+
"binary_base64_encode",
|
317
|
+
method!(RbExpr::binary_base64_encode, 0),
|
318
|
+
)?;
|
319
|
+
class.define_method(
|
320
|
+
"binary_base64_decode",
|
321
|
+
method!(RbExpr::binary_base64_decode, 1),
|
322
|
+
)?;
|
311
323
|
class.define_method(
|
312
324
|
"str_json_path_match",
|
313
325
|
method!(RbExpr::str_json_path_match, 1),
|
@@ -473,7 +485,7 @@ fn init() -> RbResult<()> {
|
|
473
485
|
function!(crate::lazy::dsl::spearman_rank_corr, 4),
|
474
486
|
)?;
|
475
487
|
class.define_singleton_method("cov", function!(crate::lazy::dsl::cov, 2))?;
|
476
|
-
class.define_singleton_method("
|
488
|
+
class.define_singleton_method("arg_sort_by", function!(crate::lazy::dsl::arg_sort_by, 2))?;
|
477
489
|
class.define_singleton_method("when", function!(crate::lazy::dsl::when, 1))?;
|
478
490
|
class.define_singleton_method("concat_str", function!(crate::lazy::dsl::concat_str, 2))?;
|
479
491
|
class.define_singleton_method("concat_lst", function!(crate::lazy::dsl::concat_lst, 1))?;
|
@@ -487,7 +499,7 @@ fn init() -> RbResult<()> {
|
|
487
499
|
class.define_singleton_method("new_from_csv", function!(RbLazyFrame::new_from_csv, -1))?;
|
488
500
|
class.define_singleton_method(
|
489
501
|
"new_from_parquet",
|
490
|
-
function!(RbLazyFrame::new_from_parquet,
|
502
|
+
function!(RbLazyFrame::new_from_parquet, 8),
|
491
503
|
)?;
|
492
504
|
class.define_singleton_method("new_from_ipc", function!(RbLazyFrame::new_from_ipc, 6))?;
|
493
505
|
class.define_method("write_json", method!(RbLazyFrame::write_json, 1))?;
|
@@ -504,6 +516,7 @@ fn init() -> RbResult<()> {
|
|
504
516
|
class.define_method("sort_by_exprs", method!(RbLazyFrame::sort_by_exprs, 3))?;
|
505
517
|
class.define_method("cache", method!(RbLazyFrame::cache, 0))?;
|
506
518
|
class.define_method("collect", method!(RbLazyFrame::collect, 0))?;
|
519
|
+
class.define_method("sink_parquet", method!(RbLazyFrame::sink_parquet, 7))?;
|
507
520
|
class.define_method("fetch", method!(RbLazyFrame::fetch, 1))?;
|
508
521
|
class.define_method("filter", method!(RbLazyFrame::filter, 1))?;
|
509
522
|
class.define_method("select", method!(RbLazyFrame::select, 1))?;
|
@@ -532,7 +545,7 @@ fn init() -> RbResult<()> {
|
|
532
545
|
class.define_method("drop_nulls", method!(RbLazyFrame::drop_nulls, 1))?;
|
533
546
|
class.define_method("slice", method!(RbLazyFrame::slice, 2))?;
|
534
547
|
class.define_method("tail", method!(RbLazyFrame::tail, 1))?;
|
535
|
-
class.define_method("melt", method!(RbLazyFrame::melt,
|
548
|
+
class.define_method("melt", method!(RbLazyFrame::melt, 5))?;
|
536
549
|
class.define_method("with_row_count", method!(RbLazyFrame::with_row_count, 2))?;
|
537
550
|
class.define_method("drop_columns", method!(RbLazyFrame::drop_columns, 1))?;
|
538
551
|
class.define_method("_clone", method!(RbLazyFrame::clone, 0))?;
|
@@ -560,6 +573,7 @@ fn init() -> RbResult<()> {
|
|
560
573
|
class.define_singleton_method("new_opt_f32", function!(RbSeries::new_opt_f32, 3))?;
|
561
574
|
class.define_singleton_method("new_opt_f64", function!(RbSeries::new_opt_f64, 3))?;
|
562
575
|
class.define_singleton_method("new_str", function!(RbSeries::new_str, 3))?;
|
576
|
+
class.define_singleton_method("new_binary", function!(RbSeries::new_binary, 3))?;
|
563
577
|
class.define_singleton_method("new_object", function!(RbSeries::new_object, 3))?;
|
564
578
|
class.define_singleton_method("new_list", function!(RbSeries::new_list, 3))?;
|
565
579
|
class.define_singleton_method("new_opt_date", function!(RbSeries::new_opt_date, 3))?;
|
data/ext/polars/src/series.rs
CHANGED
@@ -125,6 +125,12 @@ impl RbSeries {
|
|
125
125
|
RbSeries::new(s)
|
126
126
|
}
|
127
127
|
|
128
|
+
pub fn new_binary(name: String, val: Wrap<BinaryChunked>, _strict: bool) -> Self {
|
129
|
+
let mut s = val.0.into_series();
|
130
|
+
s.rename(&name);
|
131
|
+
RbSeries::new(s)
|
132
|
+
}
|
133
|
+
|
128
134
|
pub fn new_object(name: String, val: RArray, _strict: bool) -> RbResult<Self> {
|
129
135
|
let val = val
|
130
136
|
.each()
|
@@ -504,7 +510,6 @@ impl RbSeries {
|
|
504
510
|
DataType::Int64 => RArray::from_iter(series.i64().unwrap()),
|
505
511
|
DataType::Float32 => RArray::from_iter(series.f32().unwrap()),
|
506
512
|
DataType::Float64 => RArray::from_iter(series.f64().unwrap()),
|
507
|
-
DataType::Decimal128(_) => todo!(),
|
508
513
|
DataType::Categorical(_) => {
|
509
514
|
RArray::from_iter(series.categorical().unwrap().iter_str())
|
510
515
|
}
|
@@ -526,6 +531,13 @@ impl RbSeries {
|
|
526
531
|
let ca = series.utf8().unwrap();
|
527
532
|
return RArray::from_iter(ca);
|
528
533
|
}
|
534
|
+
DataType::Binary => {
|
535
|
+
let a = RArray::with_capacity(series.len());
|
536
|
+
for v in series.iter() {
|
537
|
+
a.push::<Value>(Wrap(v).into_value()).unwrap();
|
538
|
+
}
|
539
|
+
return a;
|
540
|
+
}
|
529
541
|
DataType::Null | DataType::Unknown => {
|
530
542
|
panic!("to_a not implemented for null/unknown")
|
531
543
|
}
|
@@ -0,0 +1,77 @@
|
|
1
|
+
module Polars
  # Namespace for binary related expressions.
  class BinaryExpr
    # @private
    attr_accessor :_rbexpr

    # @private
    def initialize(expr)
      self._rbexpr = expr._rbexpr
    end

    # Check if binaries in Series contain a binary substring.
    #
    # @param lit [String]
    #   The binary substring to look for
    #
    # @return [Expr]
    def contains(lit)
      Utils.wrap_expr(_rbexpr.binary_contains(lit))
    end

    # Check if values end with a binary substring.
    #
    # @param sub [String]
    #   Suffix substring.
    #
    # @return [Expr]
    def ends_with(sub)
      Utils.wrap_expr(_rbexpr.binary_ends_with(sub))
    end

    # Check if values start with a binary substring.
    #
    # @param sub [String]
    #   Prefix substring.
    #
    # @return [Expr]
    def starts_with(sub)
      Utils.wrap_expr(_rbexpr.binary_starts_with(sub))
    end

    # Decode a value using the provided encoding.
    #
    # @param encoding ["hex", "base64"]
    #   The encoding to use.
    # @param strict [Boolean]
    #   Raise an error if the underlying value cannot be decoded,
    #   otherwise mask out with a null value.
    #
    # @return [Expr]
    #
    # @raise [ArgumentError]
    #   If `encoding` is neither "hex" nor "base64".
    def decode(encoding, strict: true)
      case encoding
      when "hex"
        Utils.wrap_expr(_rbexpr.binary_hex_decode(strict))
      when "base64"
        Utils.wrap_expr(_rbexpr.binary_base64_decode(strict))
      else
        # Single braces: Ruby only interpolates `#{...}`, so no escaping is needed.
        raise ArgumentError, "encoding must be one of {'hex', 'base64'}, got #{encoding}"
      end
    end

    # Encode a value using the provided encoding.
    #
    # @param encoding ["hex", "base64"]
    #   The encoding to use.
    #
    # @return [Expr]
    #
    # @raise [ArgumentError]
    #   If `encoding` is neither "hex" nor "base64".
    def encode(encoding)
      case encoding
      when "hex"
        Utils.wrap_expr(_rbexpr.binary_hex_encode)
      when "base64"
        Utils.wrap_expr(_rbexpr.binary_base64_encode)
      else
        # Single braces: Ruby only interpolates `#{...}`, so no escaping is needed.
        raise ArgumentError, "encoding must be one of {'hex', 'base64'}, got #{encoding}"
      end
    end
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
module Polars
  # Series.bin namespace.
  #
  # Every public method below simply calls `super`; the actual work is done
  # by whatever `ExprDispatch` provides for the "bin" accessor.
  # NOTE(review): presumably ExprDispatch forwards each call to the
  # expression method of the same name - confirm against ExprDispatch.
  class BinaryNameSpace
    include ExprDispatch

    self._accessor = "bin"

    # @private
    def initialize(series)
      self._s = series._s
    end

    # Check if binaries in Series contain a binary substring.
    #
    # @param lit [String]
    #   The binary substring to look for
    #
    # @return [Series]
    def contains(lit)
      super
    end

    # Check if values end with a binary substring.
    #
    # @param sub [String]
    #   Suffix substring.
    #
    # @return [Series]
    def ends_with(sub)
      super
    end

    # Check if values start with a binary substring.
    #
    # @param sub [String]
    #   Prefix substring.
    #
    # @return [Series]
    def starts_with(sub)
      super
    end

    # Decode a value using the provided encoding.
    #
    # @param encoding ["hex", "base64"]
    #   The encoding to use.
    # @param strict [Boolean]
    #   Raise an error if the underlying value cannot be decoded,
    #   otherwise mask out with a null value.
    #
    # @return [Series]
    def decode(encoding, strict: true)
      super
    end

    # Encode a value using the provided encoding.
    #
    # @param encoding ["hex", "base64"]
    #   The encoding to use.
    #
    # @return [Series]
    def encode(encoding)
      super
    end
  end
end
|
data/lib/polars/data_frame.rb
CHANGED
@@ -97,7 +97,7 @@ module Polars
|
|
97
97
|
eol_char: "\n"
|
98
98
|
)
|
99
99
|
if Utils.pathlike?(file)
|
100
|
-
path = Utils.
|
100
|
+
path = Utils.normalise_filepath(file)
|
101
101
|
else
|
102
102
|
path = nil
|
103
103
|
# if defined?(StringIO) && file.is_a?(StringIO)
|
@@ -196,32 +196,56 @@ module Polars
|
|
196
196
|
|
197
197
|
# @private
|
198
198
|
# Read a Parquet source into a DataFrame.
#
# A String source containing "*" that refers to a local file is treated as
# a glob: it is scanned lazily with `Polars.scan_parquet` and collected.
# Any other source is read eagerly through `RbDataFrame.read_parquet`.
#
# @param source [Object]
#   Path-like object or other source; path-likes are normalised first.
# @param columns [Object]
#   Columns to select. A single String is wrapped in an Array. With a glob
#   source only an Array of Strings (or nil) is accepted.
# @param n_rows [Integer]
#   Forwarded to the reader.
# @param parallel [String]
#   Forwarded to the reader.
# @param row_count_name [String]
#   Forwarded to the reader.
# @param row_count_offset [Integer]
#   Forwarded to the reader.
# @param low_memory [Boolean]
#   Forwarded to the reader.
# @param use_statistics [Boolean]
#   Forwarded to the eager reader only.
# @param rechunk [Boolean]
#   Forwarded to the reader on both the glob and the eager path.
#
# @return [DataFrame]
#
# @raise [ArgumentError]
#   If a glob source is combined with a non-String-array `columns` value.
def self._read_parquet(
  source,
  columns: nil,
  n_rows: nil,
  parallel: "auto",
  row_count_name: nil,
  row_count_offset: 0,
  low_memory: false,
  use_statistics: true,
  rechunk: true
)
  if Utils.pathlike?(source)
    source = Utils.normalise_filepath(source)
  end
  if columns.is_a?(String)
    columns = [columns]
  end

  if source.is_a?(String) && source.include?("*") && Utils.local_file?(source)
    scan =
      Polars.scan_parquet(
        source,
        n_rows: n_rows,
        # Honour the caller's choice instead of forcing true, so the glob
        # path agrees with the eager path below.
        rechunk: rechunk,
        parallel: parallel,
        row_count_name: row_count_name,
        row_count_offset: row_count_offset,
        low_memory: low_memory
      )

    if columns.nil?
      return self._from_rbdf(scan.collect._df)
    elsif Utils.is_str_sequence(columns, allow_str: false)
      return self._from_rbdf(scan.select(columns).collect._df)
    else
      raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: Array[String]"
    end
  end

  projection, columns = Utils.handle_projection_columns(columns)
  _from_rbdf(
    RbDataFrame.read_parquet(
      source,
      columns,
      projection,
      n_rows,
      parallel,
      Utils._prepare_row_count_args(row_count_name, row_count_offset),
      low_memory,
      use_statistics,
      rechunk
    )
  )
end
|
@@ -229,7 +253,7 @@ module Polars
|
|
229
253
|
# @private
|
230
254
|
def self._read_avro(file, columns: nil, n_rows: nil)
|
231
255
|
if Utils.pathlike?(file)
|
232
|
-
file = Utils.
|
256
|
+
file = Utils.normalise_filepath(file)
|
233
257
|
end
|
234
258
|
projection, columns = Utils.handle_projection_columns(columns)
|
235
259
|
_from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
|
@@ -246,7 +270,7 @@ module Polars
|
|
246
270
|
memory_map: true
|
247
271
|
)
|
248
272
|
if Utils.pathlike?(file)
|
249
|
-
file = Utils.
|
273
|
+
file = Utils.normalise_filepath(file)
|
250
274
|
end
|
251
275
|
if columns.is_a?(String)
|
252
276
|
columns = [columns]
|
@@ -272,7 +296,7 @@ module Polars
|
|
272
296
|
# @private
|
273
297
|
def self._read_json(file)
|
274
298
|
if Utils.pathlike?(file)
|
275
|
-
file = Utils.
|
299
|
+
file = Utils.normalise_filepath(file)
|
276
300
|
end
|
277
301
|
|
278
302
|
_from_rbdf(RbDataFrame.read_json(file))
|
@@ -281,7 +305,7 @@ module Polars
|
|
281
305
|
# @private
|
282
306
|
def self._read_ndjson(file)
|
283
307
|
if Utils.pathlike?(file)
|
284
|
-
file = Utils.
|
308
|
+
file = Utils.normalise_filepath(file)
|
285
309
|
end
|
286
310
|
|
287
311
|
_from_rbdf(RbDataFrame.read_ndjson(file))
|
@@ -774,7 +798,7 @@ module Polars
|
|
774
798
|
row_oriented: false
|
775
799
|
)
|
776
800
|
if Utils.pathlike?(file)
|
777
|
-
file = Utils.
|
801
|
+
file = Utils.normalise_filepath(file)
|
778
802
|
end
|
779
803
|
|
780
804
|
_df.write_json(file, pretty, row_oriented)
|
@@ -789,7 +813,7 @@ module Polars
|
|
789
813
|
# @return [nil]
|
790
814
|
def write_ndjson(file)
|
791
815
|
if Utils.pathlike?(file)
|
792
|
-
file = Utils.
|
816
|
+
file = Utils.normalise_filepath(file)
|
793
817
|
end
|
794
818
|
|
795
819
|
_df.write_ndjson(file)
|
@@ -879,7 +903,7 @@ module Polars
|
|
879
903
|
end
|
880
904
|
|
881
905
|
if Utils.pathlike?(file)
|
882
|
-
file = Utils.
|
906
|
+
file = Utils.normalise_filepath(file)
|
883
907
|
end
|
884
908
|
|
885
909
|
_df.write_csv(
|
@@ -917,7 +941,7 @@ module Polars
|
|
917
941
|
compression = "uncompressed"
|
918
942
|
end
|
919
943
|
if Utils.pathlike?(file)
|
920
|
-
file = Utils.
|
944
|
+
file = Utils.normalise_filepath(file)
|
921
945
|
end
|
922
946
|
|
923
947
|
_df.write_avro(file, compression)
|
@@ -936,7 +960,7 @@ module Polars
|
|
936
960
|
compression = "uncompressed"
|
937
961
|
end
|
938
962
|
if Utils.pathlike?(file)
|
939
|
-
file = Utils.
|
963
|
+
file = Utils.normalise_filepath(file)
|
940
964
|
end
|
941
965
|
|
942
966
|
_df.write_ipc(file, compression)
|
@@ -978,7 +1002,7 @@ module Polars
|
|
978
1002
|
compression = "uncompressed"
|
979
1003
|
end
|
980
1004
|
if Utils.pathlike?(file)
|
981
|
-
file = Utils.
|
1005
|
+
file = Utils.normalise_filepath(file)
|
982
1006
|
end
|
983
1007
|
|
984
1008
|
_df.write_parquet(
|
@@ -3042,24 +3066,28 @@ module Polars
|
|
3042
3066
|
if aggregate_fn.is_a?(String)
|
3043
3067
|
case aggregate_fn
|
3044
3068
|
when "first"
|
3045
|
-
|
3069
|
+
aggregate_expr = Polars.element.first._rbexpr
|
3046
3070
|
when "sum"
|
3047
|
-
|
3071
|
+
aggregate_expr = Polars.element.sum._rbexpr
|
3048
3072
|
when "max"
|
3049
|
-
|
3073
|
+
aggregate_expr = Polars.element.max._rbexpr
|
3050
3074
|
when "min"
|
3051
|
-
|
3075
|
+
aggregate_expr = Polars.element.min._rbexpr
|
3052
3076
|
when "mean"
|
3053
|
-
|
3077
|
+
aggregate_expr = Polars.element.mean._rbexpr
|
3054
3078
|
when "median"
|
3055
|
-
|
3079
|
+
aggregate_expr = Polars.element.median._rbexpr
|
3056
3080
|
when "last"
|
3057
|
-
|
3081
|
+
aggregate_expr = Polars.element.last._rbexpr
|
3058
3082
|
when "count"
|
3059
|
-
|
3083
|
+
aggregate_expr = Polars.count._rbexpr
|
3060
3084
|
else
|
3061
3085
|
raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
|
3062
3086
|
end
|
3087
|
+
elsif aggregate_fn.nil?
|
3088
|
+
aggregate_expr = nil
|
3089
|
+
else
|
3090
|
+
# Non-String, non-nil aggregation: unwrap the caller-supplied expression.
aggregate_expr = aggregate_fn._rbexpr
|
3063
3091
|
end
|
3064
3092
|
|
3065
3093
|
_from_rbdf(
|
@@ -3067,9 +3095,9 @@ module Polars
|
|
3067
3095
|
values,
|
3068
3096
|
index,
|
3069
3097
|
columns,
|
3070
|
-
aggregate_fn._rbexpr,
|
3071
3098
|
maintain_order,
|
3072
3099
|
sort_columns,
|
3100
|
+
aggregate_expr,
|
3073
3101
|
separator
|
3074
3102
|
)
|
3075
3103
|
)
|
@@ -3174,7 +3202,7 @@ module Polars
|
|
3174
3202
|
# # │ B ┆ 1 │
|
3175
3203
|
# # │ C ┆ 2 │
|
3176
3204
|
# # │ D ┆ 3 │
|
3177
|
-
# # │
|
3205
|
+
# # │ … ┆ … │
|
3178
3206
|
# # │ F ┆ 5 │
|
3179
3207
|
# # │ G ┆ 6 │
|
3180
3208
|
# # │ H ┆ 7 │
|
@@ -4053,15 +4081,12 @@ module Polars
|
|
4053
4081
|
# # │ 5 ┆ 3.0 ┆ true │
|
4054
4082
|
# # └─────┴─────┴───────┘
|
4055
4083
|
def unique(maintain_order: true, subset: nil, keep: "first")
  # Delegate to the lazy implementation, then collect without optimization
  # and re-wrap the resulting frame.
  deduped = lazy.unique(maintain_order: maintain_order, subset: subset, keep: keep)
  _from_rbdf(deduped.collect(no_optimization: true)._df)
end
|
4066
4091
|
|
4067
4092
|
# Return the number of unique rows, or the number of unique row-subsets.
|
@@ -1130,7 +1130,7 @@ module Polars
|
|
1130
1130
|
# ]
|
1131
1131
|
# )
|
1132
1132
|
# # =>
|
1133
|
-
# # shape: (
|
1133
|
+
# # shape: (1_001, 2)
|
1134
1134
|
# # ┌─────────────────────────┬───────────────────┐
|
1135
1135
|
# # │ date ┆ milliseconds_diff │
|
1136
1136
|
# # │ --- ┆ --- │
|
@@ -1140,7 +1140,7 @@ module Polars
|
|
1140
1140
|
# # │ 2020-01-01 00:00:00.001 ┆ 1 │
|
1141
1141
|
# # │ 2020-01-01 00:00:00.002 ┆ 1 │
|
1142
1142
|
# # │ 2020-01-01 00:00:00.003 ┆ 1 │
|
1143
|
-
# # │
|
1143
|
+
# # │ … ┆ … │
|
1144
1144
|
# # │ 2020-01-01 00:00:00.997 ┆ 1 │
|
1145
1145
|
# # │ 2020-01-01 00:00:00.998 ┆ 1 │
|
1146
1146
|
# # │ 2020-01-01 00:00:00.999 ┆ 1 │
|
@@ -1169,7 +1169,7 @@ module Polars
|
|
1169
1169
|
# ]
|
1170
1170
|
# )
|
1171
1171
|
# # =>
|
1172
|
-
# # shape: (
|
1172
|
+
# # shape: (1_001, 2)
|
1173
1173
|
# # ┌─────────────────────────┬───────────────────┐
|
1174
1174
|
# # │ date ┆ microseconds_diff │
|
1175
1175
|
# # │ --- ┆ --- │
|
@@ -1179,7 +1179,7 @@ module Polars
|
|
1179
1179
|
# # │ 2020-01-01 00:00:00.001 ┆ 1000 │
|
1180
1180
|
# # │ 2020-01-01 00:00:00.002 ┆ 1000 │
|
1181
1181
|
# # │ 2020-01-01 00:00:00.003 ┆ 1000 │
|
1182
|
-
# # │
|
1182
|
+
# # │ … ┆ … │
|
1183
1183
|
# # │ 2020-01-01 00:00:00.997 ┆ 1000 │
|
1184
1184
|
# # │ 2020-01-01 00:00:00.998 ┆ 1000 │
|
1185
1185
|
# # │ 2020-01-01 00:00:00.999 ┆ 1000 │
|
@@ -1208,7 +1208,7 @@ module Polars
|
|
1208
1208
|
# ]
|
1209
1209
|
# )
|
1210
1210
|
# # =>
|
1211
|
-
# # shape: (
|
1211
|
+
# # shape: (1_001, 2)
|
1212
1212
|
# # ┌─────────────────────────┬──────────────────┐
|
1213
1213
|
# # │ date ┆ nanoseconds_diff │
|
1214
1214
|
# # │ --- ┆ --- │
|
@@ -1218,7 +1218,7 @@ module Polars
|
|
1218
1218
|
# # │ 2020-01-01 00:00:00.001 ┆ 1000000 │
|
1219
1219
|
# # │ 2020-01-01 00:00:00.002 ┆ 1000000 │
|
1220
1220
|
# # │ 2020-01-01 00:00:00.003 ┆ 1000000 │
|
1221
|
-
# # │
|
1221
|
+
# # │ … ┆ … │
|
1222
1222
|
# # │ 2020-01-01 00:00:00.997 ┆ 1000000 │
|
1223
1223
|
# # │ 2020-01-01 00:00:00.998 ┆ 1000000 │
|
1224
1224
|
# # │ 2020-01-01 00:00:00.999 ┆ 1000000 │
|