polars-df 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/Cargo.lock +1946 -0
- data/Cargo.toml +5 -0
- data/ext/polars/Cargo.toml +31 -1
- data/ext/polars/src/batched_csv.rs +120 -0
- data/ext/polars/src/conversion.rs +336 -42
- data/ext/polars/src/dataframe.rs +409 -4
- data/ext/polars/src/error.rs +9 -0
- data/ext/polars/src/file.rs +8 -7
- data/ext/polars/src/lazy/apply.rs +7 -0
- data/ext/polars/src/lazy/dataframe.rs +436 -10
- data/ext/polars/src/lazy/dsl.rs +1134 -5
- data/ext/polars/src/lazy/meta.rs +41 -0
- data/ext/polars/src/lazy/mod.rs +2 -0
- data/ext/polars/src/lib.rs +390 -3
- data/ext/polars/src/series.rs +175 -13
- data/lib/polars/batched_csv_reader.rb +95 -0
- data/lib/polars/cat_expr.rb +13 -0
- data/lib/polars/data_frame.rb +892 -21
- data/lib/polars/date_time_expr.rb +143 -0
- data/lib/polars/expr.rb +503 -0
- data/lib/polars/io.rb +342 -2
- data/lib/polars/lazy_frame.rb +338 -6
- data/lib/polars/lazy_functions.rb +158 -11
- data/lib/polars/list_expr.rb +108 -0
- data/lib/polars/meta_expr.rb +33 -0
- data/lib/polars/series.rb +1304 -14
- data/lib/polars/string_expr.rb +117 -0
- data/lib/polars/struct_expr.rb +27 -0
- data/lib/polars/utils.rb +60 -0
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +15 -1
- metadata +13 -2
data/ext/polars/src/series.rs
CHANGED
@@ -1,11 +1,12 @@
|
|
1
|
-
use crate::conversion::wrap;
|
2
|
-
use crate::{RbDataFrame, RbPolarsErr, RbResult};
|
3
1
|
use magnus::exception::arg_error;
|
4
2
|
use magnus::{Error, RArray, Value};
|
5
3
|
use polars::prelude::*;
|
6
4
|
use polars::series::IsSorted;
|
7
5
|
use std::cell::RefCell;
|
8
6
|
|
7
|
+
use crate::conversion::*;
|
8
|
+
use crate::{RbDataFrame, RbPolarsErr, RbResult, RbValueError};
|
9
|
+
|
9
10
|
#[magnus::wrap(class = "Polars::RbSeries")]
|
10
11
|
pub struct RbSeries {
|
11
12
|
pub series: RefCell<Series>,
|
@@ -24,6 +25,14 @@ impl RbSeries {
|
|
24
25
|
}
|
25
26
|
}
|
26
27
|
|
28
|
+
pub fn is_sorted_flag(&self) -> bool {
|
29
|
+
matches!(self.series.borrow().is_sorted(), IsSorted::Ascending)
|
30
|
+
}
|
31
|
+
|
32
|
+
pub fn is_sorted_reverse_flag(&self) -> bool {
|
33
|
+
matches!(self.series.borrow().is_sorted(), IsSorted::Descending)
|
34
|
+
}
|
35
|
+
|
27
36
|
pub fn new_opt_bool(name: String, obj: RArray, strict: bool) -> RbResult<RbSeries> {
|
28
37
|
let len = obj.len();
|
29
38
|
let mut builder = BooleanChunkedBuilder::new(&name, len);
|
@@ -107,11 +116,33 @@ init_method_opt!(new_opt_f32, Float32Type, f32);
|
|
107
116
|
init_method_opt!(new_opt_f64, Float64Type, f64);
|
108
117
|
|
109
118
|
impl RbSeries {
|
110
|
-
pub fn new_str(name: String, val:
|
111
|
-
let
|
112
|
-
let mut s = Utf8Chunked::new(&name, v).into_series();
|
119
|
+
pub fn new_str(name: String, val: Wrap<Utf8Chunked>, _strict: bool) -> Self {
|
120
|
+
let mut s = val.0.into_series();
|
113
121
|
s.rename(&name);
|
114
|
-
|
122
|
+
RbSeries::new(s)
|
123
|
+
}
|
124
|
+
|
125
|
+
pub fn estimated_size(&self) -> usize {
|
126
|
+
self.series.borrow().estimated_size()
|
127
|
+
}
|
128
|
+
|
129
|
+
pub fn get_fmt(&self, index: usize, str_lengths: usize) -> String {
|
130
|
+
let val = format!("{}", self.series.borrow().get(index));
|
131
|
+
if let DataType::Utf8 | DataType::Categorical(_) = self.series.borrow().dtype() {
|
132
|
+
let v_trunc = &val[..val
|
133
|
+
.char_indices()
|
134
|
+
.take(str_lengths)
|
135
|
+
.last()
|
136
|
+
.map(|(i, c)| i + c.len_utf8())
|
137
|
+
.unwrap_or(0)];
|
138
|
+
if val == v_trunc {
|
139
|
+
val
|
140
|
+
} else {
|
141
|
+
format!("{}...", v_trunc)
|
142
|
+
}
|
143
|
+
} else {
|
144
|
+
val
|
145
|
+
}
|
115
146
|
}
|
116
147
|
|
117
148
|
pub fn rechunk(&self, in_place: bool) -> Option<Self> {
|
@@ -124,6 +155,10 @@ impl RbSeries {
|
|
124
155
|
}
|
125
156
|
}
|
126
157
|
|
158
|
+
pub fn get_idx(&self, idx: usize) -> Value {
|
159
|
+
Wrap(self.series.borrow().get(idx)).into()
|
160
|
+
}
|
161
|
+
|
127
162
|
pub fn bitand(&self, other: &RbSeries) -> RbResult<Self> {
|
128
163
|
let out = self
|
129
164
|
.series
|
@@ -163,16 +198,16 @@ impl RbSeries {
|
|
163
198
|
self.series.borrow_mut().rename(&name);
|
164
199
|
}
|
165
200
|
|
166
|
-
pub fn dtype(&self) ->
|
167
|
-
self.series.borrow().dtype().
|
201
|
+
pub fn dtype(&self) -> Value {
|
202
|
+
Wrap(self.series.borrow().dtype().clone()).into()
|
168
203
|
}
|
169
204
|
|
170
|
-
pub fn inner_dtype(&self) -> Option<
|
205
|
+
pub fn inner_dtype(&self) -> Option<Value> {
|
171
206
|
self.series
|
172
207
|
.borrow()
|
173
208
|
.dtype()
|
174
209
|
.inner_dtype()
|
175
|
-
.map(|dt| dt.
|
210
|
+
.map(|dt| Wrap(dt.clone()).into())
|
176
211
|
}
|
177
212
|
|
178
213
|
pub fn set_sorted(&self, reverse: bool) -> Self {
|
@@ -196,15 +231,15 @@ impl RbSeries {
|
|
196
231
|
}
|
197
232
|
|
198
233
|
pub fn max(&self) -> Value {
|
199
|
-
|
234
|
+
Wrap(self.series.borrow().max_as_series().get(0)).into()
|
200
235
|
}
|
201
236
|
|
202
237
|
pub fn min(&self) -> Value {
|
203
|
-
|
238
|
+
Wrap(self.series.borrow().min_as_series().get(0)).into()
|
204
239
|
}
|
205
240
|
|
206
241
|
pub fn sum(&self) -> Value {
|
207
|
-
|
242
|
+
Wrap(self.series.borrow().sum_as_series().get(0)).into()
|
208
243
|
}
|
209
244
|
|
210
245
|
pub fn n_chunks(&self) -> usize {
|
@@ -454,6 +489,111 @@ impl RbSeries {
|
|
454
489
|
}
|
455
490
|
}
|
456
491
|
|
492
|
+
pub fn quantile(
|
493
|
+
&self,
|
494
|
+
quantile: f64,
|
495
|
+
interpolation: Wrap<QuantileInterpolOptions>,
|
496
|
+
) -> RbResult<Value> {
|
497
|
+
Ok(Wrap(
|
498
|
+
self.series
|
499
|
+
.borrow()
|
500
|
+
.quantile_as_series(quantile, interpolation.0)
|
501
|
+
.map_err(|_| RbValueError::new_err("invalid quantile".into()))?
|
502
|
+
.get(0),
|
503
|
+
)
|
504
|
+
.into())
|
505
|
+
}
|
506
|
+
|
507
|
+
pub fn clone(&self) -> Self {
|
508
|
+
RbSeries::new(self.series.borrow().clone())
|
509
|
+
}
|
510
|
+
|
511
|
+
pub fn zip_with(&self, mask: &RbSeries, other: &RbSeries) -> RbResult<Self> {
|
512
|
+
let binding = mask.series.borrow();
|
513
|
+
let mask = binding.bool().map_err(RbPolarsErr::from)?;
|
514
|
+
let s = self
|
515
|
+
.series
|
516
|
+
.borrow()
|
517
|
+
.zip_with(mask, &other.series.borrow())
|
518
|
+
.map_err(RbPolarsErr::from)?;
|
519
|
+
Ok(RbSeries::new(s))
|
520
|
+
}
|
521
|
+
|
522
|
+
pub fn to_dummies(&self) -> RbResult<RbDataFrame> {
|
523
|
+
let df = self
|
524
|
+
.series
|
525
|
+
.borrow()
|
526
|
+
.to_dummies()
|
527
|
+
.map_err(RbPolarsErr::from)?;
|
528
|
+
Ok(df.into())
|
529
|
+
}
|
530
|
+
|
531
|
+
/// Returns the result of `peak_max()` on the series, converted back into a
/// series and wrapped as an `RbSeries`.
pub fn peak_max(&self) -> Self {
    let peaks = self.series.borrow().peak_max();
    peaks.into_series().into()
}
|
534
|
+
|
535
|
+
/// Returns the result of `peak_min()` on the series, converted back into a
/// series and wrapped as an `RbSeries`.
pub fn peak_min(&self) -> Self {
    let peaks = self.series.borrow().peak_min();
    peaks.into_series().into()
}
|
538
|
+
|
539
|
+
pub fn n_unique(&self) -> RbResult<usize> {
|
540
|
+
let n = self.series.borrow().n_unique().map_err(RbPolarsErr::from)?;
|
541
|
+
Ok(n)
|
542
|
+
}
|
543
|
+
|
544
|
+
pub fn floor(&self) -> RbResult<Self> {
|
545
|
+
let s = self.series.borrow().floor().map_err(RbPolarsErr::from)?;
|
546
|
+
Ok(s.into())
|
547
|
+
}
|
548
|
+
|
549
|
+
pub fn shrink_to_fit(&self) {
|
550
|
+
self.series.borrow_mut().shrink_to_fit();
|
551
|
+
}
|
552
|
+
|
553
|
+
/// Returns the result of `Series::dot` between this series and `other`;
/// `None` is passed through unchanged when the underlying call yields it.
pub fn dot(&self, other: &RbSeries) -> Option<f64> {
    let lhs = self.series.borrow();
    let rhs = other.series.borrow();
    lhs.dot(&rhs)
}
|
556
|
+
|
557
|
+
pub fn skew(&self, bias: bool) -> RbResult<Option<f64>> {
|
558
|
+
let out = self.series.borrow().skew(bias).map_err(RbPolarsErr::from)?;
|
559
|
+
Ok(out)
|
560
|
+
}
|
561
|
+
|
562
|
+
pub fn kurtosis(&self, fisher: bool, bias: bool) -> RbResult<Option<f64>> {
|
563
|
+
let out = self
|
564
|
+
.series
|
565
|
+
.borrow()
|
566
|
+
.kurtosis(fisher, bias)
|
567
|
+
.map_err(RbPolarsErr::from)?;
|
568
|
+
Ok(out)
|
569
|
+
}
|
570
|
+
|
571
|
+
pub fn cast(&self, dtype: Wrap<DataType>, strict: bool) -> RbResult<Self> {
|
572
|
+
let dtype = dtype.0;
|
573
|
+
let out = if strict {
|
574
|
+
self.series.borrow().strict_cast(&dtype)
|
575
|
+
} else {
|
576
|
+
self.series.borrow().cast(&dtype)
|
577
|
+
};
|
578
|
+
let out = out.map_err(RbPolarsErr::from)?;
|
579
|
+
Ok(out.into())
|
580
|
+
}
|
581
|
+
|
582
|
+
pub fn time_unit(&self) -> Option<String> {
|
583
|
+
if let DataType::Datetime(tu, _) | DataType::Duration(tu) = self.series.borrow().dtype() {
|
584
|
+
Some(
|
585
|
+
match tu {
|
586
|
+
TimeUnit::Nanoseconds => "ns",
|
587
|
+
TimeUnit::Microseconds => "us",
|
588
|
+
TimeUnit::Milliseconds => "ms",
|
589
|
+
}
|
590
|
+
.to_string(),
|
591
|
+
)
|
592
|
+
} else {
|
593
|
+
None
|
594
|
+
}
|
595
|
+
}
|
596
|
+
|
457
597
|
// dispatch dynamically in future?
|
458
598
|
|
459
599
|
pub fn cumsum(&self, reverse: bool) -> Self {
|
@@ -468,8 +608,30 @@ impl RbSeries {
|
|
468
608
|
self.series.borrow().cummin(reverse).into()
|
469
609
|
}
|
470
610
|
|
611
|
+
/// Applies `cumprod(reverse)` to the series and wraps the result as an
/// `RbSeries`.
pub fn cumprod(&self, reverse: bool) -> Self {
    let product = self.series.borrow().cumprod(reverse);
    product.into()
}
|
614
|
+
|
471
615
|
/// Applies `slice(offset, length)` to the series and wraps the result as
/// an `RbSeries`.
pub fn slice(&self, offset: i64, length: usize) -> Self {
    self.series.borrow().slice(offset, length).into()
}
|
619
|
+
|
620
|
+
pub fn ceil(&self) -> RbResult<Self> {
|
621
|
+
let s = self.series.borrow().ceil().map_err(RbPolarsErr::from)?;
|
622
|
+
Ok(s.into())
|
623
|
+
}
|
624
|
+
|
625
|
+
pub fn round(&self, decimals: u32) -> RbResult<Self> {
|
626
|
+
let s = self
|
627
|
+
.series
|
628
|
+
.borrow()
|
629
|
+
.round(decimals)
|
630
|
+
.map_err(RbPolarsErr::from)?;
|
631
|
+
Ok(s.into())
|
632
|
+
}
|
633
|
+
}
|
634
|
+
|
635
|
+
pub fn to_rbseries_collection(s: Vec<Series>) -> Vec<RbSeries> {
|
636
|
+
s.into_iter().map(RbSeries::new).collect()
|
475
637
|
}
|
@@ -0,0 +1,95 @@
|
|
1
|
+
module Polars
  # Wraps the native RbBatchedCsv reader and hands back its batches as
  # wrapped data frames, optionally renaming their columns.
  class BatchedCsvReader
    attr_accessor :_reader, :new_columns

    # Builds the native reader from the given CSV options.
    #
    # `file` is turned into a path only when it is a String or Pathname;
    # for anything else a nil path is passed to the native reader.
    # `dtypes` may be a Hash (converted to name/dtype pairs) or an Array
    # (passed through as is); any other non-nil value raises ArgumentError.
    def initialize(
      file,
      has_header: true,
      columns: nil,
      sep: ",",
      comment_char: nil,
      quote_char: '"',
      skip_rows: 0,
      dtypes: nil,
      null_values: nil,
      ignore_errors: false,
      parse_dates: false,
      n_threads: nil,
      infer_schema_length: 100,
      batch_size: 50_000,
      n_rows: nil,
      encoding: "utf8",
      low_memory: false,
      rechunk: true,
      skip_rows_after_header: 0,
      row_count_name: nil,
      row_count_offset: 0,
      sample_size: 1024,
      eol_char: "\n",
      new_columns: nil
    )
      path_like = file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
      path = path_like ? Utils.format_path(file) : nil

      dtype_list = nil
      dtype_slice = nil
      case dtypes
      when nil
        # no dtype overrides requested
      when Hash
        dtype_list = dtypes.map { |name, type| [name, Utils.rb_type_to_dtype(type)] }
      when Array
        dtype_slice = dtypes
      else
        raise ArgumentError, "dtype arg should be list or dict"
      end

      processed_null_values = Utils._process_null_values(null_values)
      projection, columns = Utils.handle_projection_columns(columns)
      row_count = Utils._prepare_row_count_args(row_count_name, row_count_offset)

      # Arguments are positional; their order must match RbBatchedCsv.new.
      self._reader = RbBatchedCsv.new(
        infer_schema_length,
        batch_size,
        has_header,
        ignore_errors,
        n_rows,
        skip_rows,
        projection,
        sep,
        rechunk,
        columns,
        encoding,
        n_threads,
        path,
        dtype_list,
        dtype_slice,
        low_memory,
        comment_char,
        quote_char,
        processed_null_values,
        parse_dates,
        skip_rows_after_header,
        row_count,
        sample_size,
        eol_char
      )
      self.new_columns = new_columns
    end

    # Requests up to `n` batches from the native reader.
    #
    # Returns nil when the reader returns nil; otherwise an array of
    # wrapped data frames, with columns updated when `new_columns` is set.
    def next_batches(n)
      batches = _reader.next_batches(n)
      return nil if batches.nil?

      batches.map do |rb_df|
        df = Utils.wrap_df(rb_df)
        new_columns ? Utils._update_columns(df, new_columns) : df
      end
    end
  end
end
|