polars-df 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,12 @@
1
- use crate::conversion::wrap;
2
- use crate::{RbDataFrame, RbPolarsErr, RbResult};
3
1
  use magnus::exception::arg_error;
4
2
  use magnus::{Error, RArray, Value};
5
3
  use polars::prelude::*;
6
4
  use polars::series::IsSorted;
7
5
  use std::cell::RefCell;
8
6
 
7
+ use crate::conversion::*;
8
+ use crate::{RbDataFrame, RbPolarsErr, RbResult, RbValueError};
9
+
9
10
  #[magnus::wrap(class = "Polars::RbSeries")]
10
11
  pub struct RbSeries {
11
12
  pub series: RefCell<Series>,
@@ -24,6 +25,14 @@ impl RbSeries {
24
25
  }
25
26
  }
26
27
 
28
+ pub fn is_sorted_flag(&self) -> bool {
29
+ matches!(self.series.borrow().is_sorted(), IsSorted::Ascending)
30
+ }
31
+
32
+ pub fn is_sorted_reverse_flag(&self) -> bool {
33
+ matches!(self.series.borrow().is_sorted(), IsSorted::Descending)
34
+ }
35
+
27
36
  pub fn new_opt_bool(name: String, obj: RArray, strict: bool) -> RbResult<RbSeries> {
28
37
  let len = obj.len();
29
38
  let mut builder = BooleanChunkedBuilder::new(&name, len);
@@ -107,11 +116,33 @@ init_method_opt!(new_opt_f32, Float32Type, f32);
107
116
  init_method_opt!(new_opt_f64, Float64Type, f64);
108
117
 
109
118
  impl RbSeries {
110
- pub fn new_str(name: String, val: RArray, _strict: bool) -> RbResult<Self> {
111
- let v = val.try_convert::<Vec<Option<String>>>()?;
112
- let mut s = Utf8Chunked::new(&name, v).into_series();
119
+ pub fn new_str(name: String, val: Wrap<Utf8Chunked>, _strict: bool) -> Self {
120
+ let mut s = val.0.into_series();
113
121
  s.rename(&name);
114
- Ok(RbSeries::new(s))
122
+ RbSeries::new(s)
123
+ }
124
+
125
+ pub fn estimated_size(&self) -> usize {
126
+ self.series.borrow().estimated_size()
127
+ }
128
+
129
+ pub fn get_fmt(&self, index: usize, str_lengths: usize) -> String {
130
+ let val = format!("{}", self.series.borrow().get(index));
131
+ if let DataType::Utf8 | DataType::Categorical(_) = self.series.borrow().dtype() {
132
+ let v_trunc = &val[..val
133
+ .char_indices()
134
+ .take(str_lengths)
135
+ .last()
136
+ .map(|(i, c)| i + c.len_utf8())
137
+ .unwrap_or(0)];
138
+ if val == v_trunc {
139
+ val
140
+ } else {
141
+ format!("{}...", v_trunc)
142
+ }
143
+ } else {
144
+ val
145
+ }
115
146
  }
116
147
 
117
148
  pub fn rechunk(&self, in_place: bool) -> Option<Self> {
@@ -124,6 +155,10 @@ impl RbSeries {
124
155
  }
125
156
  }
126
157
 
158
+ pub fn get_idx(&self, idx: usize) -> Value {
159
+ Wrap(self.series.borrow().get(idx)).into()
160
+ }
161
+
127
162
  pub fn bitand(&self, other: &RbSeries) -> RbResult<Self> {
128
163
  let out = self
129
164
  .series
@@ -163,16 +198,16 @@ impl RbSeries {
163
198
  self.series.borrow_mut().rename(&name);
164
199
  }
165
200
 
166
- pub fn dtype(&self) -> String {
167
- self.series.borrow().dtype().to_string()
201
+ pub fn dtype(&self) -> Value {
202
+ Wrap(self.series.borrow().dtype().clone()).into()
168
203
  }
169
204
 
170
- pub fn inner_dtype(&self) -> Option<String> {
205
+ pub fn inner_dtype(&self) -> Option<Value> {
171
206
  self.series
172
207
  .borrow()
173
208
  .dtype()
174
209
  .inner_dtype()
175
- .map(|dt| dt.to_string())
210
+ .map(|dt| Wrap(dt.clone()).into())
176
211
  }
177
212
 
178
213
  pub fn set_sorted(&self, reverse: bool) -> Self {
@@ -196,15 +231,15 @@ impl RbSeries {
196
231
  }
197
232
 
198
233
  pub fn max(&self) -> Value {
199
- wrap(self.series.borrow().max_as_series().get(0))
234
+ Wrap(self.series.borrow().max_as_series().get(0)).into()
200
235
  }
201
236
 
202
237
  pub fn min(&self) -> Value {
203
- wrap(self.series.borrow().min_as_series().get(0))
238
+ Wrap(self.series.borrow().min_as_series().get(0)).into()
204
239
  }
205
240
 
206
241
  pub fn sum(&self) -> Value {
207
- wrap(self.series.borrow().sum_as_series().get(0))
242
+ Wrap(self.series.borrow().sum_as_series().get(0)).into()
208
243
  }
209
244
 
210
245
  pub fn n_chunks(&self) -> usize {
@@ -454,6 +489,111 @@ impl RbSeries {
454
489
  }
455
490
  }
456
491
 
492
+ pub fn quantile(
493
+ &self,
494
+ quantile: f64,
495
+ interpolation: Wrap<QuantileInterpolOptions>,
496
+ ) -> RbResult<Value> {
497
+ Ok(Wrap(
498
+ self.series
499
+ .borrow()
500
+ .quantile_as_series(quantile, interpolation.0)
501
+ .map_err(|_| RbValueError::new_err("invalid quantile".into()))?
502
+ .get(0),
503
+ )
504
+ .into())
505
+ }
506
+
507
+ pub fn clone(&self) -> Self {
508
+ RbSeries::new(self.series.borrow().clone())
509
+ }
510
+
511
+ pub fn zip_with(&self, mask: &RbSeries, other: &RbSeries) -> RbResult<Self> {
512
+ let binding = mask.series.borrow();
513
+ let mask = binding.bool().map_err(RbPolarsErr::from)?;
514
+ let s = self
515
+ .series
516
+ .borrow()
517
+ .zip_with(mask, &other.series.borrow())
518
+ .map_err(RbPolarsErr::from)?;
519
+ Ok(RbSeries::new(s))
520
+ }
521
+
522
+ pub fn to_dummies(&self) -> RbResult<RbDataFrame> {
523
+ let df = self
524
+ .series
525
+ .borrow()
526
+ .to_dummies()
527
+ .map_err(RbPolarsErr::from)?;
528
+ Ok(df.into())
529
+ }
530
+
531
+ pub fn peak_max(&self) -> Self {
532
+ self.series.borrow().peak_max().into_series().into()
533
+ }
534
+
535
+ pub fn peak_min(&self) -> Self {
536
+ self.series.borrow().peak_min().into_series().into()
537
+ }
538
+
539
+ pub fn n_unique(&self) -> RbResult<usize> {
540
+ let n = self.series.borrow().n_unique().map_err(RbPolarsErr::from)?;
541
+ Ok(n)
542
+ }
543
+
544
+ pub fn floor(&self) -> RbResult<Self> {
545
+ let s = self.series.borrow().floor().map_err(RbPolarsErr::from)?;
546
+ Ok(s.into())
547
+ }
548
+
549
+ pub fn shrink_to_fit(&self) {
550
+ self.series.borrow_mut().shrink_to_fit();
551
+ }
552
+
553
+ pub fn dot(&self, other: &RbSeries) -> Option<f64> {
554
+ self.series.borrow().dot(&other.series.borrow())
555
+ }
556
+
557
+ pub fn skew(&self, bias: bool) -> RbResult<Option<f64>> {
558
+ let out = self.series.borrow().skew(bias).map_err(RbPolarsErr::from)?;
559
+ Ok(out)
560
+ }
561
+
562
+ pub fn kurtosis(&self, fisher: bool, bias: bool) -> RbResult<Option<f64>> {
563
+ let out = self
564
+ .series
565
+ .borrow()
566
+ .kurtosis(fisher, bias)
567
+ .map_err(RbPolarsErr::from)?;
568
+ Ok(out)
569
+ }
570
+
571
+ pub fn cast(&self, dtype: Wrap<DataType>, strict: bool) -> RbResult<Self> {
572
+ let dtype = dtype.0;
573
+ let out = if strict {
574
+ self.series.borrow().strict_cast(&dtype)
575
+ } else {
576
+ self.series.borrow().cast(&dtype)
577
+ };
578
+ let out = out.map_err(RbPolarsErr::from)?;
579
+ Ok(out.into())
580
+ }
581
+
582
+ pub fn time_unit(&self) -> Option<String> {
583
+ if let DataType::Datetime(tu, _) | DataType::Duration(tu) = self.series.borrow().dtype() {
584
+ Some(
585
+ match tu {
586
+ TimeUnit::Nanoseconds => "ns",
587
+ TimeUnit::Microseconds => "us",
588
+ TimeUnit::Milliseconds => "ms",
589
+ }
590
+ .to_string(),
591
+ )
592
+ } else {
593
+ None
594
+ }
595
+ }
596
+
457
597
  // dispatch dynamically in future?
458
598
 
459
599
  pub fn cumsum(&self, reverse: bool) -> Self {
@@ -468,8 +608,30 @@ impl RbSeries {
468
608
  self.series.borrow().cummin(reverse).into()
469
609
  }
470
610
 
611
+ pub fn cumprod(&self, reverse: bool) -> Self {
612
+ self.series.borrow().cumprod(reverse).into()
613
+ }
614
+
471
615
/// Returns the result of `slice(offset, length)` on the wrapped series as
/// a new `RbSeries`.
pub fn slice(&self, offset: i64, length: usize) -> Self {
    self.series.borrow().slice(offset, length).into()
}
619
+
620
+ pub fn ceil(&self) -> RbResult<Self> {
621
+ let s = self.series.borrow().ceil().map_err(RbPolarsErr::from)?;
622
+ Ok(s.into())
623
+ }
624
+
625
+ pub fn round(&self, decimals: u32) -> RbResult<Self> {
626
+ let s = self
627
+ .series
628
+ .borrow()
629
+ .round(decimals)
630
+ .map_err(RbPolarsErr::from)?;
631
+ Ok(s.into())
632
+ }
633
+ }
634
+
635
+ pub fn to_rbseries_collection(s: Vec<Series>) -> Vec<RbSeries> {
636
+ s.into_iter().map(RbSeries::new).collect()
475
637
  }
@@ -0,0 +1,95 @@
1
module Polars
  # Reads a CSV file in batches by delegating to the native +RbBatchedCsv+
  # reader, optionally renaming the columns of every batch it returns.
  class BatchedCsvReader
    attr_accessor :_reader, :new_columns

    # Builds the native reader from the given options.
    #
    # +file+ is passed through +Utils.format_path+ when it is a String or a
    # Pathname; for any other value the path handed to the native reader
    # stays nil.
    #
    # +dtypes+ may be nil, a Hash (each value is converted with
    # +Utils.rb_type_to_dtype+) or an Array (passed on unchanged). Any other
    # value raises ArgumentError.
    def initialize(
      file,
      has_header: true,
      columns: nil,
      sep: ",",
      comment_char: nil,
      quote_char: '"',
      skip_rows: 0,
      dtypes: nil,
      null_values: nil,
      ignore_errors: false,
      parse_dates: false,
      n_threads: nil,
      infer_schema_length: 100,
      batch_size: 50_000,
      n_rows: nil,
      encoding: "utf8",
      low_memory: false,
      rechunk: true,
      skip_rows_after_header: 0,
      row_count_name: nil,
      row_count_offset: 0,
      sample_size: 1024,
      eol_char: "\n",
      new_columns: nil
    )
      path_like = file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
      path = Utils.format_path(file) if path_like

      # Exactly one of these is filled in, depending on the shape of +dtypes+.
      dtype_list = nil
      dtype_slice = nil
      unless dtypes.nil?
        case dtypes
        when Hash
          dtype_list = dtypes.map { |name, type| [name, Utils.rb_type_to_dtype(type)] }
        when Array
          dtype_slice = dtypes
        else
          raise ArgumentError, "dtype arg should be list or dict"
        end
      end

      processed_null_values = Utils._process_null_values(null_values)
      projection, columns = Utils.handle_projection_columns(columns)
      row_count = Utils._prepare_row_count_args(row_count_name, row_count_offset)

      # The native constructor takes positional arguments; order matters.
      self._reader = RbBatchedCsv.new(
        infer_schema_length,
        batch_size,
        has_header,
        ignore_errors,
        n_rows,
        skip_rows,
        projection,
        sep,
        rechunk,
        columns,
        encoding,
        n_threads,
        path,
        dtype_list,
        dtype_slice,
        low_memory,
        comment_char,
        quote_char,
        processed_null_values,
        parse_dates,
        skip_rows_after_header,
        row_count,
        sample_size,
        eol_char
      )
      self.new_columns = new_columns
    end

    # Asks the native reader for +n+ batches.
    #
    # Returns nil when the reader returns nil. Otherwise returns an Array in
    # which every batch is wrapped with +Utils.wrap_df+ and, when
    # +new_columns+ is set, passed through +Utils._update_columns+.
    def next_batches(n)
      batches = _reader.next_batches(n)
      return nil if batches.nil?

      if new_columns
        batches.map { |df| Utils._update_columns(Utils.wrap_df(df), new_columns) }
      else
        batches.map { |df| Utils.wrap_df(df) }
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
module Polars
  # Holds the +_rbexpr+ of an expression and exposes the +cat_*+ operation
  # defined on it.
  class CatExpr
    attr_accessor :_rbexpr

    # Takes over the +_rbexpr+ of the given expression.
    def initialize(expr)
      @_rbexpr = expr._rbexpr
    end

    # Calls +cat_set_ordering+ with +ordering+ on the underlying +_rbexpr+
    # and returns the result wrapped by +Utils.wrap_expr+.
    def set_ordering(ordering)
      reordered = _rbexpr.cat_set_ordering(ordering)
      Utils.wrap_expr(reordered)
    end
  end
end