polars-df 0.1.1 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +3 -0
- data/CHANGELOG.md +8 -0
- data/Cargo.lock +2 -1
- data/README.md +1 -1
- data/ext/polars/Cargo.toml +7 -1
- data/ext/polars/src/batched_csv.rs +120 -0
- data/ext/polars/src/conversion.rs +139 -6
- data/ext/polars/src/dataframe.rs +360 -15
- data/ext/polars/src/error.rs +9 -0
- data/ext/polars/src/file.rs +8 -7
- data/ext/polars/src/lazy/apply.rs +7 -0
- data/ext/polars/src/lazy/dataframe.rs +135 -3
- data/ext/polars/src/lazy/dsl.rs +97 -2
- data/ext/polars/src/lazy/meta.rs +1 -1
- data/ext/polars/src/lazy/mod.rs +1 -0
- data/ext/polars/src/lib.rs +227 -12
- data/ext/polars/src/series.rs +190 -38
- data/ext/polars/src/set.rs +91 -0
- data/ext/polars/src/utils.rs +19 -0
- data/lib/polars/batched_csv_reader.rb +96 -0
- data/lib/polars/cat_expr.rb +39 -0
- data/lib/polars/data_frame.rb +2813 -100
- data/lib/polars/date_time_expr.rb +1282 -7
- data/lib/polars/exceptions.rb +20 -0
- data/lib/polars/expr.rb +631 -11
- data/lib/polars/expr_dispatch.rb +14 -0
- data/lib/polars/functions.rb +219 -0
- data/lib/polars/group_by.rb +517 -0
- data/lib/polars/io.rb +763 -4
- data/lib/polars/lazy_frame.rb +1415 -67
- data/lib/polars/lazy_functions.rb +430 -9
- data/lib/polars/lazy_group_by.rb +79 -0
- data/lib/polars/list_expr.rb +5 -0
- data/lib/polars/meta_expr.rb +21 -0
- data/lib/polars/series.rb +2244 -192
- data/lib/polars/slice.rb +104 -0
- data/lib/polars/string_expr.rb +663 -2
- data/lib/polars/struct_expr.rb +73 -0
- data/lib/polars/utils.rb +76 -3
- data/lib/polars/version.rb +2 -1
- data/lib/polars/when.rb +1 -0
- data/lib/polars/when_then.rb +1 -0
- data/lib/polars.rb +8 -2
- metadata +12 -2
data/ext/polars/src/series.rs
CHANGED
@@ -5,6 +5,7 @@ use polars::series::IsSorted;
|
|
5
5
|
use std::cell::RefCell;
|
6
6
|
|
7
7
|
use crate::conversion::*;
|
8
|
+
use crate::set::set_at_idx;
|
8
9
|
use crate::{RbDataFrame, RbPolarsErr, RbResult, RbValueError};
|
9
10
|
|
10
11
|
#[magnus::wrap(class = "Polars::RbSeries")]
|
@@ -116,11 +117,10 @@ init_method_opt!(new_opt_f32, Float32Type, f32);
|
|
116
117
|
init_method_opt!(new_opt_f64, Float64Type, f64);
|
117
118
|
|
118
119
|
impl RbSeries {
|
119
|
-
pub fn new_str(name: String, val:
|
120
|
-
let
|
121
|
-
let mut s = Utf8Chunked::new(&name, v).into_series();
|
120
|
+
pub fn new_str(name: String, val: Wrap<Utf8Chunked>, _strict: bool) -> Self {
|
121
|
+
let mut s = val.0.into_series();
|
122
122
|
s.rename(&name);
|
123
|
-
|
123
|
+
RbSeries::new(s)
|
124
124
|
}
|
125
125
|
|
126
126
|
pub fn estimated_size(&self) -> usize {
|
@@ -199,16 +199,16 @@ impl RbSeries {
|
|
199
199
|
self.series.borrow_mut().rename(&name);
|
200
200
|
}
|
201
201
|
|
202
|
-
pub fn dtype(&self) ->
|
203
|
-
self.series.borrow().dtype().
|
202
|
+
pub fn dtype(&self) -> Value {
|
203
|
+
Wrap(self.series.borrow().dtype().clone()).into()
|
204
204
|
}
|
205
205
|
|
206
|
-
pub fn inner_dtype(&self) -> Option<
|
206
|
+
pub fn inner_dtype(&self) -> Option<Value> {
|
207
207
|
self.series
|
208
208
|
.borrow()
|
209
209
|
.dtype()
|
210
210
|
.inner_dtype()
|
211
|
-
.map(|dt| dt.
|
211
|
+
.map(|dt| Wrap(dt.clone()).into())
|
212
212
|
}
|
213
213
|
|
214
214
|
pub fn set_sorted(&self, reverse: bool) -> Self {
|
@@ -475,6 +475,12 @@ impl RbSeries {
|
|
475
475
|
s.into_iter().collect()
|
476
476
|
} else if let Ok(s) = series.utf8() {
|
477
477
|
s.into_iter().collect()
|
478
|
+
} else if let Ok(_s) = series.date() {
|
479
|
+
let a = RArray::with_capacity(series.len());
|
480
|
+
for v in series.iter() {
|
481
|
+
a.push::<Value>(Wrap(v).into()).unwrap();
|
482
|
+
}
|
483
|
+
a
|
478
484
|
} else {
|
479
485
|
unimplemented!();
|
480
486
|
}
|
@@ -595,44 +601,190 @@ impl RbSeries {
|
|
595
601
|
}
|
596
602
|
}
|
597
603
|
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
604
|
+
pub fn set_at_idx(&self, idx: &RbSeries, values: &RbSeries) -> RbResult<()> {
|
605
|
+
let mut s = self.series.borrow_mut();
|
606
|
+
match set_at_idx(s.clone(), &idx.series.borrow(), &values.series.borrow()) {
|
607
|
+
Ok(out) => {
|
608
|
+
*s = out;
|
609
|
+
Ok(())
|
610
|
+
}
|
611
|
+
Err(e) => Err(RbPolarsErr::from(e)),
|
612
|
+
}
|
602
613
|
}
|
614
|
+
}
|
603
615
|
|
604
|
-
|
605
|
-
|
606
|
-
|
616
|
+
macro_rules! impl_eq_num {
|
617
|
+
($name:ident, $type:ty) => {
|
618
|
+
impl RbSeries {
|
619
|
+
pub fn $name(&self, rhs: $type) -> RbResult<Self> {
|
620
|
+
let s = self.series.borrow().equal(rhs).map_err(RbPolarsErr::from)?;
|
621
|
+
Ok(RbSeries::new(s.into_series()))
|
622
|
+
}
|
623
|
+
}
|
624
|
+
};
|
625
|
+
}
|
607
626
|
|
608
|
-
|
609
|
-
|
610
|
-
|
627
|
+
impl_eq_num!(eq_u8, u8);
|
628
|
+
impl_eq_num!(eq_u16, u16);
|
629
|
+
impl_eq_num!(eq_u32, u32);
|
630
|
+
impl_eq_num!(eq_u64, u64);
|
631
|
+
impl_eq_num!(eq_i8, i8);
|
632
|
+
impl_eq_num!(eq_i16, i16);
|
633
|
+
impl_eq_num!(eq_i32, i32);
|
634
|
+
impl_eq_num!(eq_i64, i64);
|
635
|
+
impl_eq_num!(eq_f32, f32);
|
636
|
+
impl_eq_num!(eq_f64, f64);
|
637
|
+
// impl_eq_num!(eq_str, &str);
|
638
|
+
|
639
|
+
macro_rules! impl_neq_num {
|
640
|
+
($name:ident, $type:ty) => {
|
641
|
+
impl RbSeries {
|
642
|
+
pub fn $name(&self, rhs: $type) -> RbResult<Self> {
|
643
|
+
let s = self
|
644
|
+
.series
|
645
|
+
.borrow()
|
646
|
+
.not_equal(rhs)
|
647
|
+
.map_err(RbPolarsErr::from)?;
|
648
|
+
Ok(RbSeries::new(s.into_series()))
|
649
|
+
}
|
650
|
+
}
|
651
|
+
};
|
652
|
+
}
|
611
653
|
|
612
|
-
|
613
|
-
|
614
|
-
|
654
|
+
impl_neq_num!(neq_u8, u8);
|
655
|
+
impl_neq_num!(neq_u16, u16);
|
656
|
+
impl_neq_num!(neq_u32, u32);
|
657
|
+
impl_neq_num!(neq_u64, u64);
|
658
|
+
impl_neq_num!(neq_i8, i8);
|
659
|
+
impl_neq_num!(neq_i16, i16);
|
660
|
+
impl_neq_num!(neq_i32, i32);
|
661
|
+
impl_neq_num!(neq_i64, i64);
|
662
|
+
impl_neq_num!(neq_f32, f32);
|
663
|
+
impl_neq_num!(neq_f64, f64);
|
664
|
+
// impl_neq_num!(neq_str, &str);
|
665
|
+
|
666
|
+
macro_rules! impl_gt_num {
|
667
|
+
($name:ident, $type:ty) => {
|
668
|
+
impl RbSeries {
|
669
|
+
pub fn $name(&self, rhs: $type) -> RbResult<Self> {
|
670
|
+
let s = self.series.borrow().gt(rhs).map_err(RbPolarsErr::from)?;
|
671
|
+
Ok(RbSeries::new(s.into_series()))
|
672
|
+
}
|
673
|
+
}
|
674
|
+
};
|
675
|
+
}
|
615
676
|
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
|
677
|
+
impl_gt_num!(gt_u8, u8);
|
678
|
+
impl_gt_num!(gt_u16, u16);
|
679
|
+
impl_gt_num!(gt_u32, u32);
|
680
|
+
impl_gt_num!(gt_u64, u64);
|
681
|
+
impl_gt_num!(gt_i8, i8);
|
682
|
+
impl_gt_num!(gt_i16, i16);
|
683
|
+
impl_gt_num!(gt_i32, i32);
|
684
|
+
impl_gt_num!(gt_i64, i64);
|
685
|
+
impl_gt_num!(gt_f32, f32);
|
686
|
+
impl_gt_num!(gt_f64, f64);
|
687
|
+
// impl_gt_num!(gt_str, &str);
|
688
|
+
|
689
|
+
macro_rules! impl_gt_eq_num {
|
690
|
+
($name:ident, $type:ty) => {
|
691
|
+
impl RbSeries {
|
692
|
+
pub fn $name(&self, rhs: $type) -> RbResult<Self> {
|
693
|
+
let s = self.series.borrow().gt_eq(rhs).map_err(RbPolarsErr::from)?;
|
694
|
+
Ok(RbSeries::new(s.into_series()))
|
695
|
+
}
|
696
|
+
}
|
697
|
+
};
|
698
|
+
}
|
620
699
|
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
700
|
+
impl_gt_eq_num!(gt_eq_u8, u8);
|
701
|
+
impl_gt_eq_num!(gt_eq_u16, u16);
|
702
|
+
impl_gt_eq_num!(gt_eq_u32, u32);
|
703
|
+
impl_gt_eq_num!(gt_eq_u64, u64);
|
704
|
+
impl_gt_eq_num!(gt_eq_i8, i8);
|
705
|
+
impl_gt_eq_num!(gt_eq_i16, i16);
|
706
|
+
impl_gt_eq_num!(gt_eq_i32, i32);
|
707
|
+
impl_gt_eq_num!(gt_eq_i64, i64);
|
708
|
+
impl_gt_eq_num!(gt_eq_f32, f32);
|
709
|
+
impl_gt_eq_num!(gt_eq_f64, f64);
|
710
|
+
// impl_gt_eq_num!(gt_eq_str, &str);
|
711
|
+
|
712
|
+
macro_rules! impl_lt_num {
|
713
|
+
($name:ident, $type:ty) => {
|
714
|
+
impl RbSeries {
|
715
|
+
pub fn $name(&self, rhs: $type) -> RbResult<RbSeries> {
|
716
|
+
let s = self.series.borrow().lt(rhs).map_err(RbPolarsErr::from)?;
|
717
|
+
Ok(RbSeries::new(s.into_series()))
|
718
|
+
}
|
719
|
+
}
|
720
|
+
};
|
721
|
+
}
|
625
722
|
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
723
|
+
impl_lt_num!(lt_u8, u8);
|
724
|
+
impl_lt_num!(lt_u16, u16);
|
725
|
+
impl_lt_num!(lt_u32, u32);
|
726
|
+
impl_lt_num!(lt_u64, u64);
|
727
|
+
impl_lt_num!(lt_i8, i8);
|
728
|
+
impl_lt_num!(lt_i16, i16);
|
729
|
+
impl_lt_num!(lt_i32, i32);
|
730
|
+
impl_lt_num!(lt_i64, i64);
|
731
|
+
impl_lt_num!(lt_f32, f32);
|
732
|
+
impl_lt_num!(lt_f64, f64);
|
733
|
+
// impl_lt_num!(lt_str, &str);
|
734
|
+
|
735
|
+
macro_rules! impl_lt_eq_num {
|
736
|
+
($name:ident, $type:ty) => {
|
737
|
+
impl RbSeries {
|
738
|
+
pub fn $name(&self, rhs: $type) -> RbResult<Self> {
|
739
|
+
let s = self.series.borrow().lt_eq(rhs).map_err(RbPolarsErr::from)?;
|
740
|
+
Ok(RbSeries::new(s.into_series()))
|
741
|
+
}
|
742
|
+
}
|
743
|
+
};
|
744
|
+
}
|
745
|
+
|
746
|
+
impl_lt_eq_num!(lt_eq_u8, u8);
|
747
|
+
impl_lt_eq_num!(lt_eq_u16, u16);
|
748
|
+
impl_lt_eq_num!(lt_eq_u32, u32);
|
749
|
+
impl_lt_eq_num!(lt_eq_u64, u64);
|
750
|
+
impl_lt_eq_num!(lt_eq_i8, i8);
|
751
|
+
impl_lt_eq_num!(lt_eq_i16, i16);
|
752
|
+
impl_lt_eq_num!(lt_eq_i32, i32);
|
753
|
+
impl_lt_eq_num!(lt_eq_i64, i64);
|
754
|
+
impl_lt_eq_num!(lt_eq_f32, f32);
|
755
|
+
impl_lt_eq_num!(lt_eq_f64, f64);
|
756
|
+
// impl_lt_eq_num!(lt_eq_str, &str);
|
757
|
+
|
758
|
+
pub fn to_series_collection(rs: RArray) -> RbResult<Vec<Series>> {
|
759
|
+
let mut series = Vec::new();
|
760
|
+
for item in rs.each() {
|
761
|
+
series.push(item?.try_convert::<&RbSeries>()?.series.borrow().clone());
|
762
|
+
}
|
763
|
+
Ok(series)
|
634
764
|
}
|
635
765
|
|
636
766
|
pub fn to_rbseries_collection(s: Vec<Series>) -> Vec<RbSeries> {
|
637
|
-
s.into_iter().map(
|
767
|
+
s.into_iter().map(RbSeries::new).collect()
|
768
|
+
}
|
769
|
+
|
770
|
+
impl RbSeries {
|
771
|
+
pub fn new_opt_date(name: String, values: RArray, _strict: Option<bool>) -> RbResult<Self> {
|
772
|
+
let len = values.len();
|
773
|
+
let mut builder = PrimitiveChunkedBuilder::<Int32Type>::new(&name, len);
|
774
|
+
for item in values.each() {
|
775
|
+
let v = item?;
|
776
|
+
if v.is_nil() {
|
777
|
+
builder.append_null();
|
778
|
+
} else {
|
779
|
+
// convert to DateTime for UTC
|
780
|
+
let v: Value = v.funcall("to_datetime", ())?;
|
781
|
+
let v: Value = v.funcall("to_time", ())?;
|
782
|
+
let v: Value = v.funcall("to_i", ())?;
|
783
|
+
// TODO use strict
|
784
|
+
builder.append_value(v.try_convert::<i32>()? / 86400);
|
785
|
+
}
|
786
|
+
}
|
787
|
+
let ca: ChunkedArray<Int32Type> = builder.finish();
|
788
|
+
Ok(ca.into_date().into_series().into())
|
789
|
+
}
|
638
790
|
}
|
@@ -0,0 +1,91 @@
|
|
1
|
+
// use polars::export::arrow2::array::Array;
|
2
|
+
use polars::prelude::*;
|
3
|
+
|
4
|
+
pub fn set_at_idx(mut s: Series, idx: &Series, values: &Series) -> PolarsResult<Series> {
|
5
|
+
let logical_dtype = s.dtype().clone();
|
6
|
+
let idx = idx.cast(&IDX_DTYPE)?;
|
7
|
+
let idx = idx.rechunk();
|
8
|
+
let idx = idx.idx().unwrap();
|
9
|
+
let idx = idx.downcast_iter().next().unwrap();
|
10
|
+
|
11
|
+
// if idx.null_count() > 0 {
|
12
|
+
// return Err(PolarsError::ComputeError(
|
13
|
+
// "index values should not be null".into(),
|
14
|
+
// ));
|
15
|
+
// }
|
16
|
+
|
17
|
+
let idx = idx.values().as_slice();
|
18
|
+
|
19
|
+
let values = values.to_physical_repr().cast(&s.dtype().to_physical())?;
|
20
|
+
|
21
|
+
// do not shadow, otherwise s is not dropped immediately
|
22
|
+
// and we want to have mutable access
|
23
|
+
s = s.to_physical_repr().into_owned();
|
24
|
+
let mutable_s = s._get_inner_mut();
|
25
|
+
|
26
|
+
let s = match logical_dtype.to_physical() {
|
27
|
+
DataType::Int8 => {
|
28
|
+
let ca: &mut ChunkedArray<Int8Type> = mutable_s.as_mut();
|
29
|
+
let values = values.i8()?;
|
30
|
+
std::mem::take(ca).set_at_idx2(idx, values.into_iter())
|
31
|
+
}
|
32
|
+
DataType::Int16 => {
|
33
|
+
let ca: &mut ChunkedArray<Int16Type> = mutable_s.as_mut();
|
34
|
+
let values = values.i16()?;
|
35
|
+
std::mem::take(ca).set_at_idx2(idx, values.into_iter())
|
36
|
+
}
|
37
|
+
DataType::Int32 => {
|
38
|
+
let ca: &mut ChunkedArray<Int32Type> = mutable_s.as_mut();
|
39
|
+
let values = values.i32()?;
|
40
|
+
std::mem::take(ca).set_at_idx2(idx, values.into_iter())
|
41
|
+
}
|
42
|
+
DataType::Int64 => {
|
43
|
+
let ca: &mut ChunkedArray<Int64Type> = mutable_s.as_mut();
|
44
|
+
let values = values.i64()?;
|
45
|
+
std::mem::take(ca).set_at_idx2(idx, values.into_iter())
|
46
|
+
}
|
47
|
+
DataType::UInt8 => {
|
48
|
+
let ca: &mut ChunkedArray<UInt8Type> = mutable_s.as_mut();
|
49
|
+
let values = values.u8()?;
|
50
|
+
std::mem::take(ca).set_at_idx2(idx, values.into_iter())
|
51
|
+
}
|
52
|
+
DataType::UInt16 => {
|
53
|
+
let ca: &mut ChunkedArray<UInt16Type> = mutable_s.as_mut();
|
54
|
+
let values = values.u16()?;
|
55
|
+
std::mem::take(ca).set_at_idx2(idx, values.into_iter())
|
56
|
+
}
|
57
|
+
DataType::UInt32 => {
|
58
|
+
let ca: &mut ChunkedArray<UInt32Type> = mutable_s.as_mut();
|
59
|
+
let values = values.u32()?;
|
60
|
+
std::mem::take(ca).set_at_idx2(idx, values.into_iter())
|
61
|
+
}
|
62
|
+
DataType::UInt64 => {
|
63
|
+
let ca: &mut ChunkedArray<UInt64Type> = mutable_s.as_mut();
|
64
|
+
let values = values.u64()?;
|
65
|
+
std::mem::take(ca).set_at_idx2(idx, values.into_iter())
|
66
|
+
}
|
67
|
+
DataType::Float32 => {
|
68
|
+
let ca: &mut ChunkedArray<Float32Type> = mutable_s.as_mut();
|
69
|
+
let values = values.f32()?;
|
70
|
+
std::mem::take(ca).set_at_idx2(idx, values.into_iter())
|
71
|
+
}
|
72
|
+
DataType::Float64 => {
|
73
|
+
let ca: &mut ChunkedArray<Float64Type> = mutable_s.as_mut();
|
74
|
+
let values = values.f64()?;
|
75
|
+
std::mem::take(ca).set_at_idx2(idx, values.into_iter())
|
76
|
+
}
|
77
|
+
DataType::Boolean => {
|
78
|
+
let ca = s.bool()?;
|
79
|
+
let values = values.bool()?;
|
80
|
+
ca.set_at_idx2(idx, values)
|
81
|
+
}
|
82
|
+
DataType::Utf8 => {
|
83
|
+
let ca = s.utf8()?;
|
84
|
+
let values = values.utf8()?;
|
85
|
+
ca.set_at_idx2(idx, values)
|
86
|
+
}
|
87
|
+
_ => panic!("not yet implemented for dtype: {}", logical_dtype),
|
88
|
+
};
|
89
|
+
|
90
|
+
s.and_then(|s| s.cast(&logical_dtype))
|
91
|
+
}
|
@@ -0,0 +1,19 @@
|
|
1
|
+
use polars::prelude::*;
|
2
|
+
|
3
|
+
pub fn reinterpret(s: &Series, signed: bool) -> polars::prelude::PolarsResult<Series> {
|
4
|
+
match (s.dtype(), signed) {
|
5
|
+
(DataType::UInt64, true) => {
|
6
|
+
let ca = s.u64().unwrap();
|
7
|
+
Ok(ca.reinterpret_signed().into_series())
|
8
|
+
}
|
9
|
+
(DataType::UInt64, false) => Ok(s.clone()),
|
10
|
+
(DataType::Int64, false) => {
|
11
|
+
let ca = s.i64().unwrap();
|
12
|
+
Ok(ca.reinterpret_unsigned().into_series())
|
13
|
+
}
|
14
|
+
(DataType::Int64, true) => Ok(s.clone()),
|
15
|
+
_ => Err(PolarsError::ComputeError(
|
16
|
+
"reinterpret is only allowed for 64bit integers dtype, use cast otherwise".into(),
|
17
|
+
)),
|
18
|
+
}
|
19
|
+
}
|
@@ -0,0 +1,96 @@
|
|
1
|
+
module Polars
  # Wraps the native `RbBatchedCsv` reader and hands out its batches as
  # wrapped data frames.
  #
  # @private
  class BatchedCsvReader
    attr_accessor :_reader, :new_columns

    # Creates the native reader for `file`.
    #
    # `dtypes` may be a Hash (each value is mapped through
    # `Utils.rb_type_to_dtype`) or an Array (passed through unchanged); any
    # other non-nil value raises ArgumentError.
    def initialize(
      file,
      has_header: true,
      columns: nil,
      sep: ",",
      comment_char: nil,
      quote_char: '"',
      skip_rows: 0,
      dtypes: nil,
      null_values: nil,
      ignore_errors: false,
      parse_dates: false,
      n_threads: nil,
      infer_schema_length: 100,
      batch_size: 50_000,
      n_rows: nil,
      encoding: "utf8",
      low_memory: false,
      rechunk: true,
      skip_rows_after_header: 0,
      row_count_name: nil,
      row_count_offset: 0,
      sample_size: 1024,
      eol_char: "\n",
      new_columns: nil
    )
      # `path` stays nil when `file` is neither a String nor a Pathname.
      path = Utils.format_path(file) if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))

      dtype_list = nil
      dtype_slice = nil
      case dtypes
      when nil
        # no dtype overrides requested
      when Hash
        dtype_list = dtypes.map { |k, v| [k, Utils.rb_type_to_dtype(v)] }
      when Array
        dtype_slice = dtypes
      else
        raise ArgumentError, "dtype arg should be list or dict"
      end

      processed_null_values = Utils._process_null_values(null_values)
      projection, columns = Utils.handle_projection_columns(columns)

      # The native constructor takes its arguments positionally.
      self._reader = RbBatchedCsv.new(
        infer_schema_length,
        batch_size,
        has_header,
        ignore_errors,
        n_rows,
        skip_rows,
        projection,
        sep,
        rechunk,
        columns,
        encoding,
        n_threads,
        path,
        dtype_list,
        dtype_slice,
        low_memory,
        comment_char,
        quote_char,
        processed_null_values,
        parse_dates,
        skip_rows_after_header,
        Utils._prepare_row_count_args(row_count_name, row_count_offset),
        sample_size,
        eol_char
      )
      self.new_columns = new_columns
    end

    # Asks the native reader for `n` batches and wraps each one with
    # `Utils.wrap_df`, applying `new_columns` through `Utils._update_columns`
    # when it is set. Returns nil when the reader returns nil.
    def next_batches(n)
      batches = _reader.next_batches(n)
      return nil if batches.nil?

      batches.map do |df|
        frame = Utils.wrap_df(df)
        new_columns ? Utils._update_columns(frame, new_columns) : frame
      end
    end
  end
end
|
data/lib/polars/cat_expr.rb
CHANGED
@@ -1,11 +1,50 @@
|
|
1
1
|
module Polars
|
2
|
+
# Namespace for categorical related expressions.
|
2
3
|
class CatExpr
|
4
|
+
# @private
|
3
5
|
attr_accessor :_rbexpr
|
4
6
|
|
7
|
+
# @private
|
5
8
|
def initialize(expr)
|
6
9
|
self._rbexpr = expr._rbexpr
|
7
10
|
end
|
8
11
|
|
12
|
+
# Determine how this categorical series should be sorted.
|
13
|
+
#
|
14
|
+
# @param ordering ["physical", "lexical"]
|
15
|
+
# Ordering type:
|
16
|
+
#
|
17
|
+
# - 'physical' -> Use the physical representation of the categories to determine the order (default).
|
18
|
+
# - 'lexical' -> Use the string values to determine the ordering.
|
19
|
+
#
|
20
|
+
# @return [Expr]
|
21
|
+
#
|
22
|
+
# @example
|
23
|
+
# df = Polars::DataFrame.new(
|
24
|
+
# {"cats" => ["z", "z", "k", "a", "b"], "vals" => [3, 1, 2, 2, 3]}
|
25
|
+
# ).with_columns(
|
26
|
+
# [
|
27
|
+
# Polars.col("cats").cast(:cat).cat.set_ordering("lexical")
|
28
|
+
# ]
|
29
|
+
# )
|
30
|
+
# df.sort(["cats", "vals"])
|
31
|
+
# # =>
|
32
|
+
# # shape: (5, 2)
|
33
|
+
# # ┌──────┬──────┐
|
34
|
+
# # │ cats ┆ vals │
|
35
|
+
# # │ --- ┆ --- │
|
36
|
+
# # │ cat ┆ i64 │
|
37
|
+
# # ╞══════╪══════╡
|
38
|
+
# # │ a ┆ 2 │
|
39
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
40
|
+
# # │ b ┆ 3 │
|
41
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
42
|
+
# # │ k ┆ 2 │
|
43
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
44
|
+
# # │ z ┆ 1 │
|
45
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
46
|
+
# # │ z ┆ 3 │
|
47
|
+
# # └──────┴──────┘
|
9
48
|
def set_ordering(ordering)
|
10
49
|
Utils.wrap_expr(_rbexpr.cat_set_ordering(ordering))
|
11
50
|
end
|