polars-df 0.1.0 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,11 +1,12 @@
1
- use crate::conversion::wrap;
2
- use crate::{RbDataFrame, RbPolarsErr, RbResult};
3
1
  use magnus::exception::arg_error;
4
2
  use magnus::{Error, RArray, Value};
5
3
  use polars::prelude::*;
6
4
  use polars::series::IsSorted;
7
5
  use std::cell::RefCell;
8
6
 
7
+ use crate::conversion::*;
8
+ use crate::{RbDataFrame, RbPolarsErr, RbResult, RbValueError};
9
+
9
10
  #[magnus::wrap(class = "Polars::RbSeries")]
10
11
  pub struct RbSeries {
11
12
  pub series: RefCell<Series>,
@@ -24,6 +25,14 @@ impl RbSeries {
24
25
  }
25
26
  }
26
27
 
28
+ pub fn is_sorted_flag(&self) -> bool {
29
+ matches!(self.series.borrow().is_sorted(), IsSorted::Ascending)
30
+ }
31
+
32
+ pub fn is_sorted_reverse_flag(&self) -> bool {
33
+ matches!(self.series.borrow().is_sorted(), IsSorted::Descending)
34
+ }
35
+
27
36
  pub fn new_opt_bool(name: String, obj: RArray, strict: bool) -> RbResult<RbSeries> {
28
37
  let len = obj.len();
29
38
  let mut builder = BooleanChunkedBuilder::new(&name, len);
@@ -107,11 +116,33 @@ init_method_opt!(new_opt_f32, Float32Type, f32);
107
116
  init_method_opt!(new_opt_f64, Float64Type, f64);
108
117
 
109
118
  impl RbSeries {
110
- pub fn new_str(name: String, val: RArray, _strict: bool) -> RbResult<Self> {
111
- let v = val.try_convert::<Vec<Option<String>>>()?;
112
- let mut s = Utf8Chunked::new(&name, v).into_series();
119
+ pub fn new_str(name: String, val: Wrap<Utf8Chunked>, _strict: bool) -> Self {
120
+ let mut s = val.0.into_series();
113
121
  s.rename(&name);
114
- Ok(RbSeries::new(s))
122
+ RbSeries::new(s)
123
+ }
124
+
125
+ pub fn estimated_size(&self) -> usize {
126
+ self.series.borrow().estimated_size()
127
+ }
128
+
129
+ pub fn get_fmt(&self, index: usize, str_lengths: usize) -> String {
130
+ let val = format!("{}", self.series.borrow().get(index));
131
+ if let DataType::Utf8 | DataType::Categorical(_) = self.series.borrow().dtype() {
132
+ let v_trunc = &val[..val
133
+ .char_indices()
134
+ .take(str_lengths)
135
+ .last()
136
+ .map(|(i, c)| i + c.len_utf8())
137
+ .unwrap_or(0)];
138
+ if val == v_trunc {
139
+ val
140
+ } else {
141
+ format!("{}...", v_trunc)
142
+ }
143
+ } else {
144
+ val
145
+ }
115
146
  }
116
147
 
117
148
  pub fn rechunk(&self, in_place: bool) -> Option<Self> {
@@ -124,6 +155,10 @@ impl RbSeries {
124
155
  }
125
156
  }
126
157
 
158
+ pub fn get_idx(&self, idx: usize) -> Value {
159
+ Wrap(self.series.borrow().get(idx)).into()
160
+ }
161
+
127
162
  pub fn bitand(&self, other: &RbSeries) -> RbResult<Self> {
128
163
  let out = self
129
164
  .series
@@ -163,16 +198,16 @@ impl RbSeries {
163
198
  self.series.borrow_mut().rename(&name);
164
199
  }
165
200
 
166
- pub fn dtype(&self) -> String {
167
- self.series.borrow().dtype().to_string()
201
+ pub fn dtype(&self) -> Value {
202
+ Wrap(self.series.borrow().dtype().clone()).into()
168
203
  }
169
204
 
170
- pub fn inner_dtype(&self) -> Option<String> {
205
+ pub fn inner_dtype(&self) -> Option<Value> {
171
206
  self.series
172
207
  .borrow()
173
208
  .dtype()
174
209
  .inner_dtype()
175
- .map(|dt| dt.to_string())
210
+ .map(|dt| Wrap(dt.clone()).into())
176
211
  }
177
212
 
178
213
  pub fn set_sorted(&self, reverse: bool) -> Self {
@@ -196,15 +231,15 @@ impl RbSeries {
196
231
  }
197
232
 
198
233
  pub fn max(&self) -> Value {
199
- wrap(self.series.borrow().max_as_series().get(0))
234
+ Wrap(self.series.borrow().max_as_series().get(0)).into()
200
235
  }
201
236
 
202
237
  pub fn min(&self) -> Value {
203
- wrap(self.series.borrow().min_as_series().get(0))
238
+ Wrap(self.series.borrow().min_as_series().get(0)).into()
204
239
  }
205
240
 
206
241
  pub fn sum(&self) -> Value {
207
- wrap(self.series.borrow().sum_as_series().get(0))
242
+ Wrap(self.series.borrow().sum_as_series().get(0)).into()
208
243
  }
209
244
 
210
245
  pub fn n_chunks(&self) -> usize {
@@ -454,6 +489,111 @@ impl RbSeries {
454
489
  }
455
490
  }
456
491
 
492
+ pub fn quantile(
493
+ &self,
494
+ quantile: f64,
495
+ interpolation: Wrap<QuantileInterpolOptions>,
496
+ ) -> RbResult<Value> {
497
+ Ok(Wrap(
498
+ self.series
499
+ .borrow()
500
+ .quantile_as_series(quantile, interpolation.0)
501
+ .map_err(|_| RbValueError::new_err("invalid quantile".into()))?
502
+ .get(0),
503
+ )
504
+ .into())
505
+ }
506
+
507
+ pub fn clone(&self) -> Self {
508
+ RbSeries::new(self.series.borrow().clone())
509
+ }
510
+
511
+ pub fn zip_with(&self, mask: &RbSeries, other: &RbSeries) -> RbResult<Self> {
512
+ let binding = mask.series.borrow();
513
+ let mask = binding.bool().map_err(RbPolarsErr::from)?;
514
+ let s = self
515
+ .series
516
+ .borrow()
517
+ .zip_with(mask, &other.series.borrow())
518
+ .map_err(RbPolarsErr::from)?;
519
+ Ok(RbSeries::new(s))
520
+ }
521
+
522
+ pub fn to_dummies(&self) -> RbResult<RbDataFrame> {
523
+ let df = self
524
+ .series
525
+ .borrow()
526
+ .to_dummies()
527
+ .map_err(RbPolarsErr::from)?;
528
+ Ok(df.into())
529
+ }
530
+
531
+ pub fn peak_max(&self) -> Self {
532
+ self.series.borrow().peak_max().into_series().into()
533
+ }
534
+
535
+ pub fn peak_min(&self) -> Self {
536
+ self.series.borrow().peak_min().into_series().into()
537
+ }
538
+
539
+ pub fn n_unique(&self) -> RbResult<usize> {
540
+ let n = self.series.borrow().n_unique().map_err(RbPolarsErr::from)?;
541
+ Ok(n)
542
+ }
543
+
544
+ pub fn floor(&self) -> RbResult<Self> {
545
+ let s = self.series.borrow().floor().map_err(RbPolarsErr::from)?;
546
+ Ok(s.into())
547
+ }
548
+
549
+ pub fn shrink_to_fit(&self) {
550
+ self.series.borrow_mut().shrink_to_fit();
551
+ }
552
+
553
+ pub fn dot(&self, other: &RbSeries) -> Option<f64> {
554
+ self.series.borrow().dot(&other.series.borrow())
555
+ }
556
+
557
+ pub fn skew(&self, bias: bool) -> RbResult<Option<f64>> {
558
+ let out = self.series.borrow().skew(bias).map_err(RbPolarsErr::from)?;
559
+ Ok(out)
560
+ }
561
+
562
+ pub fn kurtosis(&self, fisher: bool, bias: bool) -> RbResult<Option<f64>> {
563
+ let out = self
564
+ .series
565
+ .borrow()
566
+ .kurtosis(fisher, bias)
567
+ .map_err(RbPolarsErr::from)?;
568
+ Ok(out)
569
+ }
570
+
571
+ pub fn cast(&self, dtype: Wrap<DataType>, strict: bool) -> RbResult<Self> {
572
+ let dtype = dtype.0;
573
+ let out = if strict {
574
+ self.series.borrow().strict_cast(&dtype)
575
+ } else {
576
+ self.series.borrow().cast(&dtype)
577
+ };
578
+ let out = out.map_err(RbPolarsErr::from)?;
579
+ Ok(out.into())
580
+ }
581
+
582
+ pub fn time_unit(&self) -> Option<String> {
583
+ if let DataType::Datetime(tu, _) | DataType::Duration(tu) = self.series.borrow().dtype() {
584
+ Some(
585
+ match tu {
586
+ TimeUnit::Nanoseconds => "ns",
587
+ TimeUnit::Microseconds => "us",
588
+ TimeUnit::Milliseconds => "ms",
589
+ }
590
+ .to_string(),
591
+ )
592
+ } else {
593
+ None
594
+ }
595
+ }
596
+
457
597
  // dispatch dynamically in future?
458
598
 
459
599
  pub fn cumsum(&self, reverse: bool) -> Self {
@@ -468,8 +608,30 @@ impl RbSeries {
468
608
  self.series.borrow().cummin(reverse).into()
469
609
  }
470
610
 
611
+ pub fn cumprod(&self, reverse: bool) -> Self {
612
+ self.series.borrow().cumprod(reverse).into()
613
+ }
614
+
471
615
  pub fn slice(&self, offset: i64, length: usize) -> Self {
472
616
  let series = self.series.borrow().slice(offset, length);
473
617
  series.into()
474
618
  }
619
+
620
+ pub fn ceil(&self) -> RbResult<Self> {
621
+ let s = self.series.borrow().ceil().map_err(RbPolarsErr::from)?;
622
+ Ok(s.into())
623
+ }
624
+
625
+ pub fn round(&self, decimals: u32) -> RbResult<Self> {
626
+ let s = self
627
+ .series
628
+ .borrow()
629
+ .round(decimals)
630
+ .map_err(RbPolarsErr::from)?;
631
+ Ok(s.into())
632
+ }
633
+ }
634
+
635
+ pub fn to_rbseries_collection(s: Vec<Series>) -> Vec<RbSeries> {
636
+ s.into_iter().map(RbSeries::new).collect()
475
637
  }
@@ -0,0 +1,95 @@
1
module Polars
  # Thin Ruby front end over the native RbBatchedCsv reader.
  class BatchedCsvReader
    attr_accessor :_reader, :new_columns

    # Normalizes the keyword options and constructs the native reader.
    #
    # Only String (and Pathname, when that constant is defined) values of
    # +file+ are turned into a path; for anything else the path stays nil.
    #
    # +dtypes+ may be a Hash (column => type, each type passed through
    # Utils.rb_type_to_dtype), an Array (forwarded untouched) or nil.
    #
    # Raises ArgumentError when +dtypes+ is any other kind of object.
    def initialize(
      file,
      has_header: true,
      columns: nil,
      sep: ",",
      comment_char: nil,
      quote_char: '"',
      skip_rows: 0,
      dtypes: nil,
      null_values: nil,
      ignore_errors: false,
      parse_dates: false,
      n_threads: nil,
      infer_schema_length: 100,
      batch_size: 50_000,
      n_rows: nil,
      encoding: "utf8",
      low_memory: false,
      rechunk: true,
      skip_rows_after_header: 0,
      row_count_name: nil,
      row_count_offset: 0,
      sample_size: 1024,
      eol_char: "\n",
      new_columns: nil
    )
      path = Utils.format_path(file) if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))

      dtype_list = nil
      dtype_slice = nil
      if dtypes.is_a?(Hash)
        dtype_list = dtypes.map { |name, type| [name, Utils.rb_type_to_dtype(type)] }
      elsif dtypes.is_a?(Array)
        dtype_slice = dtypes
      elsif !dtypes.nil?
        raise ArgumentError, "dtype arg should be list or dict"
      end

      processed_null_values = Utils._process_null_values(null_values)
      projection, columns = Utils.handle_projection_columns(columns)
      row_count = Utils._prepare_row_count_args(row_count_name, row_count_offset)

      # The native constructor takes positional arguments, so order matters.
      self._reader = RbBatchedCsv.new(
        infer_schema_length,
        batch_size,
        has_header,
        ignore_errors,
        n_rows,
        skip_rows,
        projection,
        sep,
        rechunk,
        columns,
        encoding,
        n_threads,
        path,
        dtype_list,
        dtype_slice,
        low_memory,
        comment_char,
        quote_char,
        processed_null_values,
        parse_dates,
        skip_rows_after_header,
        row_count,
        sample_size,
        eol_char
      )
      self.new_columns = new_columns
    end

    # Requests +n+ batches from the native reader.
    #
    # Returns nil when the reader returns nil; otherwise an array in which
    # each batch has been passed through Utils.wrap_df and, when
    # +new_columns+ is set, through Utils._update_columns as well.
    def next_batches(n)
      batches = _reader.next_batches(n)
      return nil if batches.nil?

      batches.map do |df|
        wrapped = Utils.wrap_df(df)
        new_columns ? Utils._update_columns(wrapped, new_columns) : wrapped
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
module Polars
  # Holds the native expression taken from another expression object and
  # exposes the categorical ordering call on it.
  class CatExpr
    attr_accessor :_rbexpr

    # Copies the native expression out of +expr+ (read through its
    # +_rbexpr+ reader).
    def initialize(expr)
      native = expr._rbexpr
      self._rbexpr = native
    end

    # Calls +cat_set_ordering+ on the native expression with +ordering+ and
    # returns the result passed through Utils.wrap_expr.
    def set_ordering(ordering)
      reordered = _rbexpr.cat_set_ordering(ordering)
      Utils.wrap_expr(reordered)
    end
  end
end