polars-df 0.1.0 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/Cargo.lock +1946 -0
- data/Cargo.toml +5 -0
- data/ext/polars/Cargo.toml +31 -1
- data/ext/polars/src/batched_csv.rs +120 -0
- data/ext/polars/src/conversion.rs +336 -42
- data/ext/polars/src/dataframe.rs +409 -4
- data/ext/polars/src/error.rs +9 -0
- data/ext/polars/src/file.rs +8 -7
- data/ext/polars/src/lazy/apply.rs +7 -0
- data/ext/polars/src/lazy/dataframe.rs +436 -10
- data/ext/polars/src/lazy/dsl.rs +1134 -5
- data/ext/polars/src/lazy/meta.rs +41 -0
- data/ext/polars/src/lazy/mod.rs +2 -0
- data/ext/polars/src/lib.rs +390 -3
- data/ext/polars/src/series.rs +175 -13
- data/lib/polars/batched_csv_reader.rb +95 -0
- data/lib/polars/cat_expr.rb +13 -0
- data/lib/polars/data_frame.rb +892 -21
- data/lib/polars/date_time_expr.rb +143 -0
- data/lib/polars/expr.rb +503 -0
- data/lib/polars/io.rb +342 -2
- data/lib/polars/lazy_frame.rb +338 -6
- data/lib/polars/lazy_functions.rb +158 -11
- data/lib/polars/list_expr.rb +108 -0
- data/lib/polars/meta_expr.rb +33 -0
- data/lib/polars/series.rb +1304 -14
- data/lib/polars/string_expr.rb +117 -0
- data/lib/polars/struct_expr.rb +27 -0
- data/lib/polars/utils.rb +60 -0
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +15 -1
- metadata +13 -2
data/ext/polars/src/series.rs
CHANGED
@@ -1,11 +1,12 @@
|
|
1
|
-
use crate::conversion::wrap;
|
2
|
-
use crate::{RbDataFrame, RbPolarsErr, RbResult};
|
3
1
|
use magnus::exception::arg_error;
|
4
2
|
use magnus::{Error, RArray, Value};
|
5
3
|
use polars::prelude::*;
|
6
4
|
use polars::series::IsSorted;
|
7
5
|
use std::cell::RefCell;
|
8
6
|
|
7
|
+
use crate::conversion::*;
|
8
|
+
use crate::{RbDataFrame, RbPolarsErr, RbResult, RbValueError};
|
9
|
+
|
9
10
|
#[magnus::wrap(class = "Polars::RbSeries")]
|
10
11
|
pub struct RbSeries {
|
11
12
|
pub series: RefCell<Series>,
|
@@ -24,6 +25,14 @@ impl RbSeries {
|
|
24
25
|
}
|
25
26
|
}
|
26
27
|
|
28
|
+
pub fn is_sorted_flag(&self) -> bool {
|
29
|
+
matches!(self.series.borrow().is_sorted(), IsSorted::Ascending)
|
30
|
+
}
|
31
|
+
|
32
|
+
pub fn is_sorted_reverse_flag(&self) -> bool {
|
33
|
+
matches!(self.series.borrow().is_sorted(), IsSorted::Descending)
|
34
|
+
}
|
35
|
+
|
27
36
|
pub fn new_opt_bool(name: String, obj: RArray, strict: bool) -> RbResult<RbSeries> {
|
28
37
|
let len = obj.len();
|
29
38
|
let mut builder = BooleanChunkedBuilder::new(&name, len);
|
@@ -107,11 +116,33 @@ init_method_opt!(new_opt_f32, Float32Type, f32);
|
|
107
116
|
init_method_opt!(new_opt_f64, Float64Type, f64);
|
108
117
|
|
109
118
|
impl RbSeries {
|
110
|
-
pub fn new_str(name: String, val:
|
111
|
-
let
|
112
|
-
let mut s = Utf8Chunked::new(&name, v).into_series();
|
119
|
+
pub fn new_str(name: String, val: Wrap<Utf8Chunked>, _strict: bool) -> Self {
|
120
|
+
let mut s = val.0.into_series();
|
113
121
|
s.rename(&name);
|
114
|
-
|
122
|
+
RbSeries::new(s)
|
123
|
+
}
|
124
|
+
|
125
|
+
pub fn estimated_size(&self) -> usize {
|
126
|
+
self.series.borrow().estimated_size()
|
127
|
+
}
|
128
|
+
|
129
|
+
pub fn get_fmt(&self, index: usize, str_lengths: usize) -> String {
|
130
|
+
let val = format!("{}", self.series.borrow().get(index));
|
131
|
+
if let DataType::Utf8 | DataType::Categorical(_) = self.series.borrow().dtype() {
|
132
|
+
let v_trunc = &val[..val
|
133
|
+
.char_indices()
|
134
|
+
.take(str_lengths)
|
135
|
+
.last()
|
136
|
+
.map(|(i, c)| i + c.len_utf8())
|
137
|
+
.unwrap_or(0)];
|
138
|
+
if val == v_trunc {
|
139
|
+
val
|
140
|
+
} else {
|
141
|
+
format!("{}...", v_trunc)
|
142
|
+
}
|
143
|
+
} else {
|
144
|
+
val
|
145
|
+
}
|
115
146
|
}
|
116
147
|
|
117
148
|
pub fn rechunk(&self, in_place: bool) -> Option<Self> {
|
@@ -124,6 +155,10 @@ impl RbSeries {
|
|
124
155
|
}
|
125
156
|
}
|
126
157
|
|
158
|
+
pub fn get_idx(&self, idx: usize) -> Value {
|
159
|
+
Wrap(self.series.borrow().get(idx)).into()
|
160
|
+
}
|
161
|
+
|
127
162
|
pub fn bitand(&self, other: &RbSeries) -> RbResult<Self> {
|
128
163
|
let out = self
|
129
164
|
.series
|
@@ -163,16 +198,16 @@ impl RbSeries {
|
|
163
198
|
self.series.borrow_mut().rename(&name);
|
164
199
|
}
|
165
200
|
|
166
|
-
pub fn dtype(&self) ->
|
167
|
-
self.series.borrow().dtype().
|
201
|
+
pub fn dtype(&self) -> Value {
|
202
|
+
Wrap(self.series.borrow().dtype().clone()).into()
|
168
203
|
}
|
169
204
|
|
170
|
-
pub fn inner_dtype(&self) -> Option<
|
205
|
+
pub fn inner_dtype(&self) -> Option<Value> {
|
171
206
|
self.series
|
172
207
|
.borrow()
|
173
208
|
.dtype()
|
174
209
|
.inner_dtype()
|
175
|
-
.map(|dt| dt.
|
210
|
+
.map(|dt| Wrap(dt.clone()).into())
|
176
211
|
}
|
177
212
|
|
178
213
|
pub fn set_sorted(&self, reverse: bool) -> Self {
|
@@ -196,15 +231,15 @@ impl RbSeries {
|
|
196
231
|
}
|
197
232
|
|
198
233
|
pub fn max(&self) -> Value {
|
199
|
-
|
234
|
+
Wrap(self.series.borrow().max_as_series().get(0)).into()
|
200
235
|
}
|
201
236
|
|
202
237
|
pub fn min(&self) -> Value {
|
203
|
-
|
238
|
+
Wrap(self.series.borrow().min_as_series().get(0)).into()
|
204
239
|
}
|
205
240
|
|
206
241
|
pub fn sum(&self) -> Value {
|
207
|
-
|
242
|
+
Wrap(self.series.borrow().sum_as_series().get(0)).into()
|
208
243
|
}
|
209
244
|
|
210
245
|
pub fn n_chunks(&self) -> usize {
|
@@ -454,6 +489,111 @@ impl RbSeries {
|
|
454
489
|
}
|
455
490
|
}
|
456
491
|
|
492
|
+
pub fn quantile(
|
493
|
+
&self,
|
494
|
+
quantile: f64,
|
495
|
+
interpolation: Wrap<QuantileInterpolOptions>,
|
496
|
+
) -> RbResult<Value> {
|
497
|
+
Ok(Wrap(
|
498
|
+
self.series
|
499
|
+
.borrow()
|
500
|
+
.quantile_as_series(quantile, interpolation.0)
|
501
|
+
.map_err(|_| RbValueError::new_err("invalid quantile".into()))?
|
502
|
+
.get(0),
|
503
|
+
)
|
504
|
+
.into())
|
505
|
+
}
|
506
|
+
|
507
|
+
pub fn clone(&self) -> Self {
|
508
|
+
RbSeries::new(self.series.borrow().clone())
|
509
|
+
}
|
510
|
+
|
511
|
+
pub fn zip_with(&self, mask: &RbSeries, other: &RbSeries) -> RbResult<Self> {
|
512
|
+
let binding = mask.series.borrow();
|
513
|
+
let mask = binding.bool().map_err(RbPolarsErr::from)?;
|
514
|
+
let s = self
|
515
|
+
.series
|
516
|
+
.borrow()
|
517
|
+
.zip_with(mask, &other.series.borrow())
|
518
|
+
.map_err(RbPolarsErr::from)?;
|
519
|
+
Ok(RbSeries::new(s))
|
520
|
+
}
|
521
|
+
|
522
|
+
pub fn to_dummies(&self) -> RbResult<RbDataFrame> {
|
523
|
+
let df = self
|
524
|
+
.series
|
525
|
+
.borrow()
|
526
|
+
.to_dummies()
|
527
|
+
.map_err(RbPolarsErr::from)?;
|
528
|
+
Ok(df.into())
|
529
|
+
}
|
530
|
+
|
531
|
+
pub fn peak_max(&self) -> Self {
|
532
|
+
self.series.borrow().peak_max().into_series().into()
|
533
|
+
}
|
534
|
+
|
535
|
+
pub fn peak_min(&self) -> Self {
|
536
|
+
self.series.borrow().peak_min().into_series().into()
|
537
|
+
}
|
538
|
+
|
539
|
+
pub fn n_unique(&self) -> RbResult<usize> {
|
540
|
+
let n = self.series.borrow().n_unique().map_err(RbPolarsErr::from)?;
|
541
|
+
Ok(n)
|
542
|
+
}
|
543
|
+
|
544
|
+
pub fn floor(&self) -> RbResult<Self> {
|
545
|
+
let s = self.series.borrow().floor().map_err(RbPolarsErr::from)?;
|
546
|
+
Ok(s.into())
|
547
|
+
}
|
548
|
+
|
549
|
+
pub fn shrink_to_fit(&self) {
|
550
|
+
self.series.borrow_mut().shrink_to_fit();
|
551
|
+
}
|
552
|
+
|
553
|
+
pub fn dot(&self, other: &RbSeries) -> Option<f64> {
|
554
|
+
self.series.borrow().dot(&other.series.borrow())
|
555
|
+
}
|
556
|
+
|
557
|
+
pub fn skew(&self, bias: bool) -> RbResult<Option<f64>> {
|
558
|
+
let out = self.series.borrow().skew(bias).map_err(RbPolarsErr::from)?;
|
559
|
+
Ok(out)
|
560
|
+
}
|
561
|
+
|
562
|
+
pub fn kurtosis(&self, fisher: bool, bias: bool) -> RbResult<Option<f64>> {
|
563
|
+
let out = self
|
564
|
+
.series
|
565
|
+
.borrow()
|
566
|
+
.kurtosis(fisher, bias)
|
567
|
+
.map_err(RbPolarsErr::from)?;
|
568
|
+
Ok(out)
|
569
|
+
}
|
570
|
+
|
571
|
+
pub fn cast(&self, dtype: Wrap<DataType>, strict: bool) -> RbResult<Self> {
|
572
|
+
let dtype = dtype.0;
|
573
|
+
let out = if strict {
|
574
|
+
self.series.borrow().strict_cast(&dtype)
|
575
|
+
} else {
|
576
|
+
self.series.borrow().cast(&dtype)
|
577
|
+
};
|
578
|
+
let out = out.map_err(RbPolarsErr::from)?;
|
579
|
+
Ok(out.into())
|
580
|
+
}
|
581
|
+
|
582
|
+
pub fn time_unit(&self) -> Option<String> {
|
583
|
+
if let DataType::Datetime(tu, _) | DataType::Duration(tu) = self.series.borrow().dtype() {
|
584
|
+
Some(
|
585
|
+
match tu {
|
586
|
+
TimeUnit::Nanoseconds => "ns",
|
587
|
+
TimeUnit::Microseconds => "us",
|
588
|
+
TimeUnit::Milliseconds => "ms",
|
589
|
+
}
|
590
|
+
.to_string(),
|
591
|
+
)
|
592
|
+
} else {
|
593
|
+
None
|
594
|
+
}
|
595
|
+
}
|
596
|
+
|
457
597
|
// dispatch dynamically in future?
|
458
598
|
|
459
599
|
pub fn cumsum(&self, reverse: bool) -> Self {
|
@@ -468,8 +608,30 @@ impl RbSeries {
|
|
468
608
|
self.series.borrow().cummin(reverse).into()
|
469
609
|
}
|
470
610
|
|
611
|
+
pub fn cumprod(&self, reverse: bool) -> Self {
|
612
|
+
self.series.borrow().cumprod(reverse).into()
|
613
|
+
}
|
614
|
+
|
471
615
|
pub fn slice(&self, offset: i64, length: usize) -> Self {
|
472
616
|
let series = self.series.borrow().slice(offset, length);
|
473
617
|
series.into()
|
474
618
|
}
|
619
|
+
|
620
|
+
pub fn ceil(&self) -> RbResult<Self> {
|
621
|
+
let s = self.series.borrow().ceil().map_err(RbPolarsErr::from)?;
|
622
|
+
Ok(s.into())
|
623
|
+
}
|
624
|
+
|
625
|
+
pub fn round(&self, decimals: u32) -> RbResult<Self> {
|
626
|
+
let s = self
|
627
|
+
.series
|
628
|
+
.borrow()
|
629
|
+
.round(decimals)
|
630
|
+
.map_err(RbPolarsErr::from)?;
|
631
|
+
Ok(s.into())
|
632
|
+
}
|
633
|
+
}
|
634
|
+
|
635
|
+
pub fn to_rbseries_collection(s: Vec<Series>) -> Vec<RbSeries> {
|
636
|
+
s.into_iter().map(RbSeries::new).collect()
|
475
637
|
}
|
@@ -0,0 +1,95 @@
|
|
1
|
+
module Polars
|
2
|
+
class BatchedCsvReader
|
3
|
+
attr_accessor :_reader, :new_columns
|
4
|
+
|
5
|
+
def initialize(
|
6
|
+
file,
|
7
|
+
has_header: true,
|
8
|
+
columns: nil,
|
9
|
+
sep: ",",
|
10
|
+
comment_char: nil,
|
11
|
+
quote_char: '"',
|
12
|
+
skip_rows: 0,
|
13
|
+
dtypes: nil,
|
14
|
+
null_values: nil,
|
15
|
+
ignore_errors: false,
|
16
|
+
parse_dates: false,
|
17
|
+
n_threads: nil,
|
18
|
+
infer_schema_length: 100,
|
19
|
+
batch_size: 50_000,
|
20
|
+
n_rows: nil,
|
21
|
+
encoding: "utf8",
|
22
|
+
low_memory: false,
|
23
|
+
rechunk: true,
|
24
|
+
skip_rows_after_header: 0,
|
25
|
+
row_count_name: nil,
|
26
|
+
row_count_offset: 0,
|
27
|
+
sample_size: 1024,
|
28
|
+
eol_char: "\n",
|
29
|
+
new_columns: nil
|
30
|
+
)
|
31
|
+
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
32
|
+
path = Utils.format_path(file)
|
33
|
+
end
|
34
|
+
|
35
|
+
dtype_list = nil
|
36
|
+
dtype_slice = nil
|
37
|
+
if !dtypes.nil?
|
38
|
+
if dtypes.is_a?(Hash)
|
39
|
+
dtype_list = []
|
40
|
+
dtypes.each do|k, v|
|
41
|
+
dtype_list << [k, Utils.rb_type_to_dtype(v)]
|
42
|
+
end
|
43
|
+
elsif dtypes.is_a?(Array)
|
44
|
+
dtype_slice = dtypes
|
45
|
+
else
|
46
|
+
raise ArgumentError, "dtype arg should be list or dict"
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
processed_null_values = Utils._process_null_values(null_values)
|
51
|
+
projection, columns = Utils.handle_projection_columns(columns)
|
52
|
+
|
53
|
+
self._reader = RbBatchedCsv.new(
|
54
|
+
infer_schema_length,
|
55
|
+
batch_size,
|
56
|
+
has_header,
|
57
|
+
ignore_errors,
|
58
|
+
n_rows,
|
59
|
+
skip_rows,
|
60
|
+
projection,
|
61
|
+
sep,
|
62
|
+
rechunk,
|
63
|
+
columns,
|
64
|
+
encoding,
|
65
|
+
n_threads,
|
66
|
+
path,
|
67
|
+
dtype_list,
|
68
|
+
dtype_slice,
|
69
|
+
low_memory,
|
70
|
+
comment_char,
|
71
|
+
quote_char,
|
72
|
+
processed_null_values,
|
73
|
+
parse_dates,
|
74
|
+
skip_rows_after_header,
|
75
|
+
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
76
|
+
sample_size,
|
77
|
+
eol_char
|
78
|
+
)
|
79
|
+
self.new_columns = new_columns
|
80
|
+
end
|
81
|
+
|
82
|
+
def next_batches(n)
|
83
|
+
batches = _reader.next_batches(n)
|
84
|
+
if !batches.nil?
|
85
|
+
if new_columns
|
86
|
+
batches.map { |df| Utils._update_columns(Utils.wrap_df(df), new_columns) }
|
87
|
+
else
|
88
|
+
batches.map { |df| Utils.wrap_df(df) }
|
89
|
+
end
|
90
|
+
else
|
91
|
+
nil
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|