polars-df 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +3 -0
- data/CHANGELOG.md +8 -0
- data/Cargo.lock +2 -1
- data/README.md +1 -1
- data/ext/polars/Cargo.toml +7 -1
- data/ext/polars/src/batched_csv.rs +120 -0
- data/ext/polars/src/conversion.rs +139 -6
- data/ext/polars/src/dataframe.rs +360 -15
- data/ext/polars/src/error.rs +9 -0
- data/ext/polars/src/file.rs +8 -7
- data/ext/polars/src/lazy/apply.rs +7 -0
- data/ext/polars/src/lazy/dataframe.rs +135 -3
- data/ext/polars/src/lazy/dsl.rs +97 -2
- data/ext/polars/src/lazy/meta.rs +1 -1
- data/ext/polars/src/lazy/mod.rs +1 -0
- data/ext/polars/src/lib.rs +227 -12
- data/ext/polars/src/series.rs +190 -38
- data/ext/polars/src/set.rs +91 -0
- data/ext/polars/src/utils.rs +19 -0
- data/lib/polars/batched_csv_reader.rb +96 -0
- data/lib/polars/cat_expr.rb +39 -0
- data/lib/polars/data_frame.rb +2813 -100
- data/lib/polars/date_time_expr.rb +1282 -7
- data/lib/polars/exceptions.rb +20 -0
- data/lib/polars/expr.rb +631 -11
- data/lib/polars/expr_dispatch.rb +14 -0
- data/lib/polars/functions.rb +219 -0
- data/lib/polars/group_by.rb +517 -0
- data/lib/polars/io.rb +763 -4
- data/lib/polars/lazy_frame.rb +1415 -67
- data/lib/polars/lazy_functions.rb +430 -9
- data/lib/polars/lazy_group_by.rb +79 -0
- data/lib/polars/list_expr.rb +5 -0
- data/lib/polars/meta_expr.rb +21 -0
- data/lib/polars/series.rb +2244 -192
- data/lib/polars/slice.rb +104 -0
- data/lib/polars/string_expr.rb +663 -2
- data/lib/polars/struct_expr.rb +73 -0
- data/lib/polars/utils.rb +76 -3
- data/lib/polars/version.rb +2 -1
- data/lib/polars/when.rb +1 -0
- data/lib/polars/when_then.rb +1 -0
- data/lib/polars.rb +8 -2
- metadata +12 -2
data/ext/polars/src/series.rs
CHANGED
@@ -5,6 +5,7 @@ use polars::series::IsSorted;
|
|
5
5
|
use std::cell::RefCell;
|
6
6
|
|
7
7
|
use crate::conversion::*;
|
8
|
+
use crate::set::set_at_idx;
|
8
9
|
use crate::{RbDataFrame, RbPolarsErr, RbResult, RbValueError};
|
9
10
|
|
10
11
|
#[magnus::wrap(class = "Polars::RbSeries")]
|
@@ -116,11 +117,10 @@ init_method_opt!(new_opt_f32, Float32Type, f32);
|
|
116
117
|
init_method_opt!(new_opt_f64, Float64Type, f64);
|
117
118
|
|
118
119
|
impl RbSeries {
|
119
|
-
pub fn new_str(name: String, val:
|
120
|
-
let
|
121
|
-
let mut s = Utf8Chunked::new(&name, v).into_series();
|
120
|
+
pub fn new_str(name: String, val: Wrap<Utf8Chunked>, _strict: bool) -> Self {
|
121
|
+
let mut s = val.0.into_series();
|
122
122
|
s.rename(&name);
|
123
|
-
|
123
|
+
RbSeries::new(s)
|
124
124
|
}
|
125
125
|
|
126
126
|
pub fn estimated_size(&self) -> usize {
|
@@ -199,16 +199,16 @@ impl RbSeries {
|
|
199
199
|
self.series.borrow_mut().rename(&name);
|
200
200
|
}
|
201
201
|
|
202
|
-
pub fn dtype(&self) ->
|
203
|
-
self.series.borrow().dtype().
|
202
|
+
pub fn dtype(&self) -> Value {
|
203
|
+
Wrap(self.series.borrow().dtype().clone()).into()
|
204
204
|
}
|
205
205
|
|
206
|
-
pub fn inner_dtype(&self) -> Option<
|
206
|
+
pub fn inner_dtype(&self) -> Option<Value> {
|
207
207
|
self.series
|
208
208
|
.borrow()
|
209
209
|
.dtype()
|
210
210
|
.inner_dtype()
|
211
|
-
.map(|dt| dt.
|
211
|
+
.map(|dt| Wrap(dt.clone()).into())
|
212
212
|
}
|
213
213
|
|
214
214
|
pub fn set_sorted(&self, reverse: bool) -> Self {
|
@@ -475,6 +475,12 @@ impl RbSeries {
|
|
475
475
|
s.into_iter().collect()
|
476
476
|
} else if let Ok(s) = series.utf8() {
|
477
477
|
s.into_iter().collect()
|
478
|
+
} else if let Ok(_s) = series.date() {
|
479
|
+
let a = RArray::with_capacity(series.len());
|
480
|
+
for v in series.iter() {
|
481
|
+
a.push::<Value>(Wrap(v).into()).unwrap();
|
482
|
+
}
|
483
|
+
a
|
478
484
|
} else {
|
479
485
|
unimplemented!();
|
480
486
|
}
|
@@ -595,44 +601,190 @@ impl RbSeries {
|
|
595
601
|
}
|
596
602
|
}
|
597
603
|
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
604
|
+
pub fn set_at_idx(&self, idx: &RbSeries, values: &RbSeries) -> RbResult<()> {
|
605
|
+
let mut s = self.series.borrow_mut();
|
606
|
+
match set_at_idx(s.clone(), &idx.series.borrow(), &values.series.borrow()) {
|
607
|
+
Ok(out) => {
|
608
|
+
*s = out;
|
609
|
+
Ok(())
|
610
|
+
}
|
611
|
+
Err(e) => Err(RbPolarsErr::from(e)),
|
612
|
+
}
|
602
613
|
}
|
614
|
+
}
|
603
615
|
|
604
|
-
|
605
|
-
|
606
|
-
|
616
|
+
macro_rules! impl_eq_num {
|
617
|
+
($name:ident, $type:ty) => {
|
618
|
+
impl RbSeries {
|
619
|
+
pub fn $name(&self, rhs: $type) -> RbResult<Self> {
|
620
|
+
let s = self.series.borrow().equal(rhs).map_err(RbPolarsErr::from)?;
|
621
|
+
Ok(RbSeries::new(s.into_series()))
|
622
|
+
}
|
623
|
+
}
|
624
|
+
};
|
625
|
+
}
|
607
626
|
|
608
|
-
|
609
|
-
|
610
|
-
|
627
|
+
impl_eq_num!(eq_u8, u8);
|
628
|
+
impl_eq_num!(eq_u16, u16);
|
629
|
+
impl_eq_num!(eq_u32, u32);
|
630
|
+
impl_eq_num!(eq_u64, u64);
|
631
|
+
impl_eq_num!(eq_i8, i8);
|
632
|
+
impl_eq_num!(eq_i16, i16);
|
633
|
+
impl_eq_num!(eq_i32, i32);
|
634
|
+
impl_eq_num!(eq_i64, i64);
|
635
|
+
impl_eq_num!(eq_f32, f32);
|
636
|
+
impl_eq_num!(eq_f64, f64);
|
637
|
+
// impl_eq_num!(eq_str, &str);
|
638
|
+
|
639
|
+
macro_rules! impl_neq_num {
|
640
|
+
($name:ident, $type:ty) => {
|
641
|
+
impl RbSeries {
|
642
|
+
pub fn $name(&self, rhs: $type) -> RbResult<Self> {
|
643
|
+
let s = self
|
644
|
+
.series
|
645
|
+
.borrow()
|
646
|
+
.not_equal(rhs)
|
647
|
+
.map_err(RbPolarsErr::from)?;
|
648
|
+
Ok(RbSeries::new(s.into_series()))
|
649
|
+
}
|
650
|
+
}
|
651
|
+
};
|
652
|
+
}
|
611
653
|
|
612
|
-
|
613
|
-
|
614
|
-
|
654
|
+
impl_neq_num!(neq_u8, u8);
|
655
|
+
impl_neq_num!(neq_u16, u16);
|
656
|
+
impl_neq_num!(neq_u32, u32);
|
657
|
+
impl_neq_num!(neq_u64, u64);
|
658
|
+
impl_neq_num!(neq_i8, i8);
|
659
|
+
impl_neq_num!(neq_i16, i16);
|
660
|
+
impl_neq_num!(neq_i32, i32);
|
661
|
+
impl_neq_num!(neq_i64, i64);
|
662
|
+
impl_neq_num!(neq_f32, f32);
|
663
|
+
impl_neq_num!(neq_f64, f64);
|
664
|
+
// impl_neq_num!(neq_str, &str);
|
665
|
+
|
666
|
+
macro_rules! impl_gt_num {
|
667
|
+
($name:ident, $type:ty) => {
|
668
|
+
impl RbSeries {
|
669
|
+
pub fn $name(&self, rhs: $type) -> RbResult<Self> {
|
670
|
+
let s = self.series.borrow().gt(rhs).map_err(RbPolarsErr::from)?;
|
671
|
+
Ok(RbSeries::new(s.into_series()))
|
672
|
+
}
|
673
|
+
}
|
674
|
+
};
|
675
|
+
}
|
615
676
|
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
|
677
|
+
impl_gt_num!(gt_u8, u8);
|
678
|
+
impl_gt_num!(gt_u16, u16);
|
679
|
+
impl_gt_num!(gt_u32, u32);
|
680
|
+
impl_gt_num!(gt_u64, u64);
|
681
|
+
impl_gt_num!(gt_i8, i8);
|
682
|
+
impl_gt_num!(gt_i16, i16);
|
683
|
+
impl_gt_num!(gt_i32, i32);
|
684
|
+
impl_gt_num!(gt_i64, i64);
|
685
|
+
impl_gt_num!(gt_f32, f32);
|
686
|
+
impl_gt_num!(gt_f64, f64);
|
687
|
+
// impl_gt_num!(gt_str, &str);
|
688
|
+
|
689
|
+
macro_rules! impl_gt_eq_num {
|
690
|
+
($name:ident, $type:ty) => {
|
691
|
+
impl RbSeries {
|
692
|
+
pub fn $name(&self, rhs: $type) -> RbResult<Self> {
|
693
|
+
let s = self.series.borrow().gt_eq(rhs).map_err(RbPolarsErr::from)?;
|
694
|
+
Ok(RbSeries::new(s.into_series()))
|
695
|
+
}
|
696
|
+
}
|
697
|
+
};
|
698
|
+
}
|
620
699
|
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
700
|
+
impl_gt_eq_num!(gt_eq_u8, u8);
|
701
|
+
impl_gt_eq_num!(gt_eq_u16, u16);
|
702
|
+
impl_gt_eq_num!(gt_eq_u32, u32);
|
703
|
+
impl_gt_eq_num!(gt_eq_u64, u64);
|
704
|
+
impl_gt_eq_num!(gt_eq_i8, i8);
|
705
|
+
impl_gt_eq_num!(gt_eq_i16, i16);
|
706
|
+
impl_gt_eq_num!(gt_eq_i32, i32);
|
707
|
+
impl_gt_eq_num!(gt_eq_i64, i64);
|
708
|
+
impl_gt_eq_num!(gt_eq_f32, f32);
|
709
|
+
impl_gt_eq_num!(gt_eq_f64, f64);
|
710
|
+
// impl_gt_eq_num!(gt_eq_str, &str);
|
711
|
+
|
712
|
+
macro_rules! impl_lt_num {
|
713
|
+
($name:ident, $type:ty) => {
|
714
|
+
impl RbSeries {
|
715
|
+
pub fn $name(&self, rhs: $type) -> RbResult<RbSeries> {
|
716
|
+
let s = self.series.borrow().lt(rhs).map_err(RbPolarsErr::from)?;
|
717
|
+
Ok(RbSeries::new(s.into_series()))
|
718
|
+
}
|
719
|
+
}
|
720
|
+
};
|
721
|
+
}
|
625
722
|
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
723
|
+
impl_lt_num!(lt_u8, u8);
|
724
|
+
impl_lt_num!(lt_u16, u16);
|
725
|
+
impl_lt_num!(lt_u32, u32);
|
726
|
+
impl_lt_num!(lt_u64, u64);
|
727
|
+
impl_lt_num!(lt_i8, i8);
|
728
|
+
impl_lt_num!(lt_i16, i16);
|
729
|
+
impl_lt_num!(lt_i32, i32);
|
730
|
+
impl_lt_num!(lt_i64, i64);
|
731
|
+
impl_lt_num!(lt_f32, f32);
|
732
|
+
impl_lt_num!(lt_f64, f64);
|
733
|
+
// impl_lt_num!(lt_str, &str);
|
734
|
+
|
735
|
+
macro_rules! impl_lt_eq_num {
|
736
|
+
($name:ident, $type:ty) => {
|
737
|
+
impl RbSeries {
|
738
|
+
pub fn $name(&self, rhs: $type) -> RbResult<Self> {
|
739
|
+
let s = self.series.borrow().lt_eq(rhs).map_err(RbPolarsErr::from)?;
|
740
|
+
Ok(RbSeries::new(s.into_series()))
|
741
|
+
}
|
742
|
+
}
|
743
|
+
};
|
744
|
+
}
|
745
|
+
|
746
|
+
impl_lt_eq_num!(lt_eq_u8, u8);
|
747
|
+
impl_lt_eq_num!(lt_eq_u16, u16);
|
748
|
+
impl_lt_eq_num!(lt_eq_u32, u32);
|
749
|
+
impl_lt_eq_num!(lt_eq_u64, u64);
|
750
|
+
impl_lt_eq_num!(lt_eq_i8, i8);
|
751
|
+
impl_lt_eq_num!(lt_eq_i16, i16);
|
752
|
+
impl_lt_eq_num!(lt_eq_i32, i32);
|
753
|
+
impl_lt_eq_num!(lt_eq_i64, i64);
|
754
|
+
impl_lt_eq_num!(lt_eq_f32, f32);
|
755
|
+
impl_lt_eq_num!(lt_eq_f64, f64);
|
756
|
+
// impl_lt_eq_num!(lt_eq_str, &str);
|
757
|
+
|
758
|
+
pub fn to_series_collection(rs: RArray) -> RbResult<Vec<Series>> {
|
759
|
+
let mut series = Vec::new();
|
760
|
+
for item in rs.each() {
|
761
|
+
series.push(item?.try_convert::<&RbSeries>()?.series.borrow().clone());
|
762
|
+
}
|
763
|
+
Ok(series)
|
634
764
|
}
|
635
765
|
|
636
766
|
pub fn to_rbseries_collection(s: Vec<Series>) -> Vec<RbSeries> {
|
637
|
-
s.into_iter().map(
|
767
|
+
s.into_iter().map(RbSeries::new).collect()
|
768
|
+
}
|
769
|
+
|
770
|
+
impl RbSeries {
|
771
|
+
pub fn new_opt_date(name: String, values: RArray, _strict: Option<bool>) -> RbResult<Self> {
|
772
|
+
let len = values.len();
|
773
|
+
let mut builder = PrimitiveChunkedBuilder::<Int32Type>::new(&name, len);
|
774
|
+
for item in values.each() {
|
775
|
+
let v = item?;
|
776
|
+
if v.is_nil() {
|
777
|
+
builder.append_null();
|
778
|
+
} else {
|
779
|
+
// convert to DateTime for UTC
|
780
|
+
let v: Value = v.funcall("to_datetime", ())?;
|
781
|
+
let v: Value = v.funcall("to_time", ())?;
|
782
|
+
let v: Value = v.funcall("to_i", ())?;
|
783
|
+
// TODO use strict
|
784
|
+
builder.append_value(v.try_convert::<i32>()? / 86400);
|
785
|
+
}
|
786
|
+
}
|
787
|
+
let ca: ChunkedArray<Int32Type> = builder.finish();
|
788
|
+
Ok(ca.into_date().into_series().into())
|
789
|
+
}
|
638
790
|
}
|
@@ -0,0 +1,91 @@
|
|
1
|
+
// use polars::export::arrow2::array::Array;
|
2
|
+
use polars::prelude::*;
|
3
|
+
|
4
|
+
pub fn set_at_idx(mut s: Series, idx: &Series, values: &Series) -> PolarsResult<Series> {
|
5
|
+
let logical_dtype = s.dtype().clone();
|
6
|
+
let idx = idx.cast(&IDX_DTYPE)?;
|
7
|
+
let idx = idx.rechunk();
|
8
|
+
let idx = idx.idx().unwrap();
|
9
|
+
let idx = idx.downcast_iter().next().unwrap();
|
10
|
+
|
11
|
+
// if idx.null_count() > 0 {
|
12
|
+
// return Err(PolarsError::ComputeError(
|
13
|
+
// "index values should not be null".into(),
|
14
|
+
// ));
|
15
|
+
// }
|
16
|
+
|
17
|
+
let idx = idx.values().as_slice();
|
18
|
+
|
19
|
+
let values = values.to_physical_repr().cast(&s.dtype().to_physical())?;
|
20
|
+
|
21
|
+
// do not shadow, otherwise s is not dropped immediately
|
22
|
+
// and we want to have mutable access
|
23
|
+
s = s.to_physical_repr().into_owned();
|
24
|
+
let mutable_s = s._get_inner_mut();
|
25
|
+
|
26
|
+
let s = match logical_dtype.to_physical() {
|
27
|
+
DataType::Int8 => {
|
28
|
+
let ca: &mut ChunkedArray<Int8Type> = mutable_s.as_mut();
|
29
|
+
let values = values.i8()?;
|
30
|
+
std::mem::take(ca).set_at_idx2(idx, values.into_iter())
|
31
|
+
}
|
32
|
+
DataType::Int16 => {
|
33
|
+
let ca: &mut ChunkedArray<Int16Type> = mutable_s.as_mut();
|
34
|
+
let values = values.i16()?;
|
35
|
+
std::mem::take(ca).set_at_idx2(idx, values.into_iter())
|
36
|
+
}
|
37
|
+
DataType::Int32 => {
|
38
|
+
let ca: &mut ChunkedArray<Int32Type> = mutable_s.as_mut();
|
39
|
+
let values = values.i32()?;
|
40
|
+
std::mem::take(ca).set_at_idx2(idx, values.into_iter())
|
41
|
+
}
|
42
|
+
DataType::Int64 => {
|
43
|
+
let ca: &mut ChunkedArray<Int64Type> = mutable_s.as_mut();
|
44
|
+
let values = values.i64()?;
|
45
|
+
std::mem::take(ca).set_at_idx2(idx, values.into_iter())
|
46
|
+
}
|
47
|
+
DataType::UInt8 => {
|
48
|
+
let ca: &mut ChunkedArray<UInt8Type> = mutable_s.as_mut();
|
49
|
+
let values = values.u8()?;
|
50
|
+
std::mem::take(ca).set_at_idx2(idx, values.into_iter())
|
51
|
+
}
|
52
|
+
DataType::UInt16 => {
|
53
|
+
let ca: &mut ChunkedArray<UInt16Type> = mutable_s.as_mut();
|
54
|
+
let values = values.u16()?;
|
55
|
+
std::mem::take(ca).set_at_idx2(idx, values.into_iter())
|
56
|
+
}
|
57
|
+
DataType::UInt32 => {
|
58
|
+
let ca: &mut ChunkedArray<UInt32Type> = mutable_s.as_mut();
|
59
|
+
let values = values.u32()?;
|
60
|
+
std::mem::take(ca).set_at_idx2(idx, values.into_iter())
|
61
|
+
}
|
62
|
+
DataType::UInt64 => {
|
63
|
+
let ca: &mut ChunkedArray<UInt64Type> = mutable_s.as_mut();
|
64
|
+
let values = values.u64()?;
|
65
|
+
std::mem::take(ca).set_at_idx2(idx, values.into_iter())
|
66
|
+
}
|
67
|
+
DataType::Float32 => {
|
68
|
+
let ca: &mut ChunkedArray<Float32Type> = mutable_s.as_mut();
|
69
|
+
let values = values.f32()?;
|
70
|
+
std::mem::take(ca).set_at_idx2(idx, values.into_iter())
|
71
|
+
}
|
72
|
+
DataType::Float64 => {
|
73
|
+
let ca: &mut ChunkedArray<Float64Type> = mutable_s.as_mut();
|
74
|
+
let values = values.f64()?;
|
75
|
+
std::mem::take(ca).set_at_idx2(idx, values.into_iter())
|
76
|
+
}
|
77
|
+
DataType::Boolean => {
|
78
|
+
let ca = s.bool()?;
|
79
|
+
let values = values.bool()?;
|
80
|
+
ca.set_at_idx2(idx, values)
|
81
|
+
}
|
82
|
+
DataType::Utf8 => {
|
83
|
+
let ca = s.utf8()?;
|
84
|
+
let values = values.utf8()?;
|
85
|
+
ca.set_at_idx2(idx, values)
|
86
|
+
}
|
87
|
+
_ => panic!("not yet implemented for dtype: {}", logical_dtype),
|
88
|
+
};
|
89
|
+
|
90
|
+
s.and_then(|s| s.cast(&logical_dtype))
|
91
|
+
}
|
@@ -0,0 +1,19 @@
|
|
1
|
+
use polars::prelude::*;
|
2
|
+
|
3
|
+
pub fn reinterpret(s: &Series, signed: bool) -> polars::prelude::PolarsResult<Series> {
|
4
|
+
match (s.dtype(), signed) {
|
5
|
+
(DataType::UInt64, true) => {
|
6
|
+
let ca = s.u64().unwrap();
|
7
|
+
Ok(ca.reinterpret_signed().into_series())
|
8
|
+
}
|
9
|
+
(DataType::UInt64, false) => Ok(s.clone()),
|
10
|
+
(DataType::Int64, false) => {
|
11
|
+
let ca = s.i64().unwrap();
|
12
|
+
Ok(ca.reinterpret_unsigned().into_series())
|
13
|
+
}
|
14
|
+
(DataType::Int64, true) => Ok(s.clone()),
|
15
|
+
_ => Err(PolarsError::ComputeError(
|
16
|
+
"reinterpret is only allowed for 64bit integers dtype, use cast otherwise".into(),
|
17
|
+
)),
|
18
|
+
}
|
19
|
+
}
|
@@ -0,0 +1,96 @@
|
|
1
|
+
module Polars
  # Wraps the native `RbBatchedCsv` reader and hands back its batches as
  # wrapped data frames.
  #
  # @private
  class BatchedCsvReader
    attr_accessor :_reader, :new_columns

    # Normalises the arguments and builds the native reader.
    #
    # `file` is run through `Utils.format_path` only when it is a String or a
    # Pathname; for anything else the native reader receives `nil` as path.
    # `dtypes` may be a Hash (name => type, each type mapped with
    # `Utils.rb_type_to_dtype`), an Array (passed through unchanged) or nil;
    # any other value raises ArgumentError. `new_columns` is stored and, when
    # set, applied to every batch returned by {#next_batches}.
    def initialize(
      file,
      has_header: true,
      columns: nil,
      sep: ",",
      comment_char: nil,
      quote_char: '"',
      skip_rows: 0,
      dtypes: nil,
      null_values: nil,
      ignore_errors: false,
      parse_dates: false,
      n_threads: nil,
      infer_schema_length: 100,
      batch_size: 50_000,
      n_rows: nil,
      encoding: "utf8",
      low_memory: false,
      rechunk: true,
      skip_rows_after_header: 0,
      row_count_name: nil,
      row_count_offset: 0,
      sample_size: 1024,
      eol_char: "\n",
      new_columns: nil
    )
      path_like = file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
      path = path_like ? Utils.format_path(file) : nil

      dtype_list = nil
      dtype_slice = nil
      case dtypes
      when nil
        # no dtype overrides requested
      when Hash
        dtype_list = dtypes.map { |name, type| [name, Utils.rb_type_to_dtype(type)] }
      when Array
        dtype_slice = dtypes
      else
        raise ArgumentError, "dtype arg should be list or dict"
      end

      processed_null_values = Utils._process_null_values(null_values)
      projection, columns = Utils.handle_projection_columns(columns)
      row_count = Utils._prepare_row_count_args(row_count_name, row_count_offset)

      # Argument order is positional and must match the native constructor.
      self._reader = RbBatchedCsv.new(
        infer_schema_length,
        batch_size,
        has_header,
        ignore_errors,
        n_rows,
        skip_rows,
        projection,
        sep,
        rechunk,
        columns,
        encoding,
        n_threads,
        path,
        dtype_list,
        dtype_slice,
        low_memory,
        comment_char,
        quote_char,
        processed_null_values,
        parse_dates,
        skip_rows_after_header,
        row_count,
        sample_size,
        eol_char
      )
      self.new_columns = new_columns
    end

    # Asks the native reader for `n` batches.
    #
    # Returns nil when the reader returns nil; otherwise an array with every
    # batch passed through `Utils.wrap_df` and, if `new_columns` is set,
    # through `Utils._update_columns` as well.
    def next_batches(n)
      batches = _reader.next_batches(n)
      return nil if batches.nil?

      renamed = new_columns
      batches.map do |df|
        wrapped = Utils.wrap_df(df)
        renamed ? Utils._update_columns(wrapped, renamed) : wrapped
      end
    end
  end
end
|
data/lib/polars/cat_expr.rb
CHANGED
@@ -1,11 +1,50 @@
|
|
1
1
|
module Polars
|
2
|
+
# Namespace for categorical related expressions.
|
2
3
|
class CatExpr
|
4
|
+
# @private
|
3
5
|
attr_accessor :_rbexpr
|
4
6
|
|
7
|
+
# @private
|
5
8
|
def initialize(expr)
|
6
9
|
self._rbexpr = expr._rbexpr
|
7
10
|
end
|
8
11
|
|
12
|
+
# Determine how this categorical series should be sorted.
|
13
|
+
#
|
14
|
+
# @param ordering ["physical", "lexical"]
|
15
|
+
# Ordering type:
|
16
|
+
#
|
17
|
+
# - 'physical' -> Use the physical representation of the categories to determine the order (default).
|
18
|
+
# - 'lexical' -> Use the string values to determine the ordering.
|
19
|
+
#
|
20
|
+
# @return [Expr]
|
21
|
+
#
|
22
|
+
# @example
|
23
|
+
# df = Polars::DataFrame.new(
|
24
|
+
# {"cats" => ["z", "z", "k", "a", "b"], "vals" => [3, 1, 2, 2, 3]}
|
25
|
+
# ).with_columns(
|
26
|
+
# [
|
27
|
+
# Polars.col("cats").cast(:cat).cat.set_ordering("lexical")
|
28
|
+
# ]
|
29
|
+
# )
|
30
|
+
# df.sort(["cats", "vals"])
|
31
|
+
# # =>
|
32
|
+
# # shape: (5, 2)
|
33
|
+
# # ┌──────┬──────┐
|
34
|
+
# # │ cats ┆ vals │
|
35
|
+
# # │ --- ┆ --- │
|
36
|
+
# # │ cat ┆ i64 │
|
37
|
+
# # ╞══════╪══════╡
|
38
|
+
# # │ a ┆ 2 │
|
39
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
40
|
+
# # │ b ┆ 3 │
|
41
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
42
|
+
# # │ k ┆ 2 │
|
43
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
44
|
+
# # │ z ┆ 1 │
|
45
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
46
|
+
# # │ z ┆ 3 │
|
47
|
+
# # └──────┴──────┘
|
9
48
|
def set_ordering(ordering)
|
10
49
|
Utils.wrap_expr(_rbexpr.cat_set_ordering(ordering))
|
11
50
|
end
|