polars-df 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,11 +1,12 @@
1
- use crate::conversion::wrap;
2
- use crate::{RbDataFrame, RbPolarsErr, RbResult};
3
1
  use magnus::exception::arg_error;
4
2
  use magnus::{Error, RArray, Value};
5
3
  use polars::prelude::*;
6
4
  use polars::series::IsSorted;
7
5
  use std::cell::RefCell;
8
6
 
7
+ use crate::conversion::*;
8
+ use crate::{RbDataFrame, RbPolarsErr, RbResult, RbValueError};
9
+
9
10
  #[magnus::wrap(class = "Polars::RbSeries")]
10
11
  pub struct RbSeries {
11
12
  pub series: RefCell<Series>,
@@ -24,6 +25,14 @@ impl RbSeries {
24
25
  }
25
26
  }
26
27
 
28
+ pub fn is_sorted_flag(&self) -> bool {
29
+ matches!(self.series.borrow().is_sorted(), IsSorted::Ascending)
30
+ }
31
+
32
+ pub fn is_sorted_reverse_flag(&self) -> bool {
33
+ matches!(self.series.borrow().is_sorted(), IsSorted::Descending)
34
+ }
35
+
27
36
  pub fn new_opt_bool(name: String, obj: RArray, strict: bool) -> RbResult<RbSeries> {
28
37
  let len = obj.len();
29
38
  let mut builder = BooleanChunkedBuilder::new(&name, len);
@@ -114,6 +123,29 @@ impl RbSeries {
114
123
  Ok(RbSeries::new(s))
115
124
  }
116
125
 
126
+ pub fn estimated_size(&self) -> usize {
127
+ self.series.borrow().estimated_size()
128
+ }
129
+
130
+ pub fn get_fmt(&self, index: usize, str_lengths: usize) -> String {
131
+ let val = format!("{}", self.series.borrow().get(index));
132
+ if let DataType::Utf8 | DataType::Categorical(_) = self.series.borrow().dtype() {
133
+ let v_trunc = &val[..val
134
+ .char_indices()
135
+ .take(str_lengths)
136
+ .last()
137
+ .map(|(i, c)| i + c.len_utf8())
138
+ .unwrap_or(0)];
139
+ if val == v_trunc {
140
+ val
141
+ } else {
142
+ format!("{}...", v_trunc)
143
+ }
144
+ } else {
145
+ val
146
+ }
147
+ }
148
+
117
149
  pub fn rechunk(&self, in_place: bool) -> Option<Self> {
118
150
  let series = self.series.borrow_mut().rechunk();
119
151
  if in_place {
@@ -124,6 +156,10 @@ impl RbSeries {
124
156
  }
125
157
  }
126
158
 
159
+ pub fn get_idx(&self, idx: usize) -> Value {
160
+ Wrap(self.series.borrow().get(idx)).into()
161
+ }
162
+
127
163
  pub fn bitand(&self, other: &RbSeries) -> RbResult<Self> {
128
164
  let out = self
129
165
  .series
@@ -196,15 +232,15 @@ impl RbSeries {
196
232
  }
197
233
 
198
234
  pub fn max(&self) -> Value {
199
- wrap(self.series.borrow().max_as_series().get(0))
235
+ Wrap(self.series.borrow().max_as_series().get(0)).into()
200
236
  }
201
237
 
202
238
  pub fn min(&self) -> Value {
203
- wrap(self.series.borrow().min_as_series().get(0))
239
+ Wrap(self.series.borrow().min_as_series().get(0)).into()
204
240
  }
205
241
 
206
242
  pub fn sum(&self) -> Value {
207
- wrap(self.series.borrow().sum_as_series().get(0))
243
+ Wrap(self.series.borrow().sum_as_series().get(0)).into()
208
244
  }
209
245
 
210
246
  pub fn n_chunks(&self) -> usize {
@@ -454,6 +490,111 @@ impl RbSeries {
454
490
  }
455
491
  }
456
492
 
493
+ pub fn quantile(
494
+ &self,
495
+ quantile: f64,
496
+ interpolation: Wrap<QuantileInterpolOptions>,
497
+ ) -> RbResult<Value> {
498
+ Ok(Wrap(
499
+ self.series
500
+ .borrow()
501
+ .quantile_as_series(quantile, interpolation.0)
502
+ .map_err(|_| RbValueError::new_err("invalid quantile".into()))?
503
+ .get(0),
504
+ )
505
+ .into())
506
+ }
507
+
508
+ pub fn clone(&self) -> Self {
509
+ RbSeries::new(self.series.borrow().clone())
510
+ }
511
+
512
+ pub fn zip_with(&self, mask: &RbSeries, other: &RbSeries) -> RbResult<Self> {
513
+ let binding = mask.series.borrow();
514
+ let mask = binding.bool().map_err(RbPolarsErr::from)?;
515
+ let s = self
516
+ .series
517
+ .borrow()
518
+ .zip_with(mask, &other.series.borrow())
519
+ .map_err(RbPolarsErr::from)?;
520
+ Ok(RbSeries::new(s))
521
+ }
522
+
523
+ pub fn to_dummies(&self) -> RbResult<RbDataFrame> {
524
+ let df = self
525
+ .series
526
+ .borrow()
527
+ .to_dummies()
528
+ .map_err(RbPolarsErr::from)?;
529
+ Ok(df.into())
530
+ }
531
+
532
+ pub fn peak_max(&self) -> Self {
533
+ self.series.borrow().peak_max().into_series().into()
534
+ }
535
+
536
+ pub fn peak_min(&self) -> Self {
537
+ self.series.borrow().peak_min().into_series().into()
538
+ }
539
+
540
+ pub fn n_unique(&self) -> RbResult<usize> {
541
+ let n = self.series.borrow().n_unique().map_err(RbPolarsErr::from)?;
542
+ Ok(n)
543
+ }
544
+
545
+ pub fn floor(&self) -> RbResult<Self> {
546
+ let s = self.series.borrow().floor().map_err(RbPolarsErr::from)?;
547
+ Ok(s.into())
548
+ }
549
+
550
+ pub fn shrink_to_fit(&self) {
551
+ self.series.borrow_mut().shrink_to_fit();
552
+ }
553
+
554
+ pub fn dot(&self, other: &RbSeries) -> Option<f64> {
555
+ self.series.borrow().dot(&other.series.borrow())
556
+ }
557
+
558
+ pub fn skew(&self, bias: bool) -> RbResult<Option<f64>> {
559
+ let out = self.series.borrow().skew(bias).map_err(RbPolarsErr::from)?;
560
+ Ok(out)
561
+ }
562
+
563
+ pub fn kurtosis(&self, fisher: bool, bias: bool) -> RbResult<Option<f64>> {
564
+ let out = self
565
+ .series
566
+ .borrow()
567
+ .kurtosis(fisher, bias)
568
+ .map_err(RbPolarsErr::from)?;
569
+ Ok(out)
570
+ }
571
+
572
+ pub fn cast(&self, dtype: Wrap<DataType>, strict: bool) -> RbResult<Self> {
573
+ let dtype = dtype.0;
574
+ let out = if strict {
575
+ self.series.borrow().strict_cast(&dtype)
576
+ } else {
577
+ self.series.borrow().cast(&dtype)
578
+ };
579
+ let out = out.map_err(RbPolarsErr::from)?;
580
+ Ok(out.into())
581
+ }
582
+
583
+ pub fn time_unit(&self) -> Option<String> {
584
+ if let DataType::Datetime(tu, _) | DataType::Duration(tu) = self.series.borrow().dtype() {
585
+ Some(
586
+ match tu {
587
+ TimeUnit::Nanoseconds => "ns",
588
+ TimeUnit::Microseconds => "us",
589
+ TimeUnit::Milliseconds => "ms",
590
+ }
591
+ .to_string(),
592
+ )
593
+ } else {
594
+ None
595
+ }
596
+ }
597
+
457
598
  // dispatch dynamically in future?
458
599
 
459
600
  pub fn cumsum(&self, reverse: bool) -> Self {
@@ -468,8 +609,30 @@ impl RbSeries {
468
609
  self.series.borrow().cummin(reverse).into()
469
610
  }
470
611
 
612
+ pub fn cumprod(&self, reverse: bool) -> Self {
613
+ self.series.borrow().cumprod(reverse).into()
614
+ }
615
+
471
616
  pub fn slice(&self, offset: i64, length: usize) -> Self {
472
617
  let series = self.series.borrow().slice(offset, length);
473
618
  series.into()
474
619
  }
620
+
621
+ pub fn ceil(&self) -> RbResult<Self> {
622
+ let s = self.series.borrow().ceil().map_err(RbPolarsErr::from)?;
623
+ Ok(s.into())
624
+ }
625
+
626
+ pub fn round(&self, decimals: u32) -> RbResult<Self> {
627
+ let s = self
628
+ .series
629
+ .borrow()
630
+ .round(decimals)
631
+ .map_err(RbPolarsErr::from)?;
632
+ Ok(s.into())
633
+ }
634
+ }
635
+
636
+ pub fn to_rbseries_collection(s: Vec<Series>) -> Vec<RbSeries> {
637
+ s.into_iter().map(|v| RbSeries::new(v)).collect()
475
638
  }
@@ -0,0 +1,13 @@
1
module Polars
  # Expression wrapper that exposes categorical-specific operations.
  class CatExpr
    # The native expression object this wrapper operates on.
    attr_accessor :_rbexpr

    # Takes over the native expression held by +expr+.
    def initialize(expr)
      @_rbexpr = expr._rbexpr
    end

    # Calls +cat_set_ordering+ on the native expression with +ordering+ and
    # wraps the result via +Utils.wrap_expr+.
    def set_ordering(ordering)
      reordered = _rbexpr.cat_set_ordering(ordering)
      Utils.wrap_expr(reordered)
    end
  end
end
@@ -79,10 +79,51 @@ module Polars
79
79
  _df.columns
80
80
  end
81
81
 
82
+ def columns=(columns)
83
+ _df.set_column_names(columns)
84
+ end
85
+
82
86
  def dtypes
83
87
  _df.dtypes.map(&:to_sym)
84
88
  end
85
89
 
90
+ def schema
91
+ columns.zip(dtypes).to_h
92
+ end
93
+
94
+ # def ==(other)
95
+ # end
96
+
97
+ # def !=(other)
98
+ # end
99
+
100
+ # def >(other)
101
+ # end
102
+
103
+ # def <(other)
104
+ # end
105
+
106
+ # def >=(other)
107
+ # end
108
+
109
+ # def <=(other)
110
+ # end
111
+
112
+ # def *(other)
113
+ # end
114
+
115
+ # def /(other)
116
+ # end
117
+
118
+ # def +(other)
119
+ # end
120
+
121
+ # def -(other)
122
+ # end
123
+
124
+ # def %(other)
125
+ # end
126
+
86
127
  def to_s
87
128
  _df.to_s
88
129
  end
@@ -96,6 +137,25 @@ module Polars
96
137
  Utils.wrap_s(_df.column(name))
97
138
  end
98
139
 
140
+ # def []=(key, value)
141
+ # end
142
+
143
+ def to_h(as_series: true)
144
+ if as_series
145
+ get_columns.to_h { |s| [s.name, s] }
146
+ else
147
+ get_columns.to_h { |s| [s.name, s.to_a] }
148
+ end
149
+ end
150
+
151
+ # def to_hs / to_a
152
+ # end
153
+
154
+ # def to_numo
155
+ # end
156
+
157
+ # no to_pandas
158
+
99
159
  def to_series(index = 0)
100
160
  if index < 0
101
161
  index = columns.length + index
@@ -183,6 +243,12 @@ module Polars
183
243
  nil
184
244
  end
185
245
 
246
+ # def write_avro
247
+ # end
248
+
249
+ # def write_ipc
250
+ # end
251
+
186
252
  def write_parquet(
187
253
  file,
188
254
  compression: "zstd",
@@ -202,10 +268,43 @@ module Polars
202
268
  )
203
269
  end
204
270
 
271
+ def estimated_size(unit = "b")
272
+ sz = _df.estimated_size
273
+ Utils.scale_bytes(sz, to: unit)
274
+ end
275
+
276
+ # def transpose
277
+ # end
278
+
279
+ def reverse
280
+ select(Polars.col("*").reverse)
281
+ end
282
+
283
+ def rename(mapping)
284
+ lazy.rename(mapping).collect(no_optimization: true)
285
+ end
286
+
287
+ def insert_at_idx(index, series)
288
+ if index < 0
289
+ index = columns.length + index
290
+ end
291
+ _df.insert_at_idx(index, series._s)
292
+ self
293
+ end
294
+
205
295
  def filter(predicate)
206
296
  lazy.filter(predicate).collect
207
297
  end
208
298
 
299
+ # def describe
300
+ # end
301
+
302
+ # def find_idx_by_name
303
+ # end
304
+
305
+ # def replace_at_idx
306
+ # end
307
+
209
308
  def sort(by, reverse: false, nulls_last: false)
210
309
  _from_rbdf(_df.sort(by, reverse, nulls_last))
211
310
  end
@@ -214,6 +313,16 @@ module Polars
214
313
  _df.frame_equal(other._df, null_equal)
215
314
  end
216
315
 
316
+ # def replace
317
+ # end
318
+
319
+ def slice(offset, length = nil)
320
+ if !length.nil? && length < 0
321
+ length = height - offset + length
322
+ end
323
+ _from_rbdf(_df.slice(offset, length))
324
+ end
325
+
217
326
  def limit(n = 5)
218
327
  head(n)
219
328
  end
@@ -226,10 +335,31 @@ module Polars
226
335
  _from_rbdf(_df.tail(n))
227
336
  end
228
337
 
338
+ # def drop_nulls
339
+ # end
340
+
341
+ # def pipe
342
+ # end
343
+
344
+ # def with_row_count
345
+ # end
346
+
229
347
  def groupby(by, maintain_order: false)
230
348
  lazy.groupby(by, maintain_order: maintain_order)
231
349
  end
232
350
 
351
+ # def groupby_rolling
352
+ # end
353
+
354
+ # def groupby_dynamic
355
+ # end
356
+
357
+ # def upsample
358
+ # end
359
+
360
+ # def join_asof
361
+ # end
362
+
233
363
  def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
234
364
  lazy
235
365
  .join(
@@ -243,12 +373,79 @@ module Polars
243
373
  .collect(no_optimization: true)
244
374
  end
245
375
 
376
+ # def apply
377
+ # end
378
+
246
379
  def with_column(column)
247
380
  lazy
248
381
  .with_column(column)
249
382
  .collect(no_optimization: true, string_cache: false)
250
383
  end
251
384
 
385
+ # def hstack
386
+ # end
387
+
388
+ # def vstack
389
+ # end
390
+
391
+ # def extend
392
+ # end
393
+
394
+ # def drop
395
+ # end
396
+
397
+ # def drop_in_place
398
+ # end
399
+
400
+ # def cleared
401
+ # end
402
+
403
+ # clone handled by initialize_copy
404
+
405
+ def get_columns
406
+ _df.get_columns.map { |s| Utils.wrap_s(s) }
407
+ end
408
+
409
+ def get_column(name)
410
+ self[name]
411
+ end
412
+
413
+ # def fill_null
414
+ # end
415
+
416
+ def fill_nan(fill_value)
417
+ lazy.fill_nan(fill_value).collect(no_optimization: true)
418
+ end
419
+
420
+ # def explode
421
+ # end
422
+
423
+ # def pivot
424
+ # end
425
+
426
+ # def melt
427
+ # end
428
+
429
+ # def unstack
430
+ # end
431
+
432
+ # def partition_by
433
+ # end
434
+
435
+ # def shift
436
+ # end
437
+
438
+ # def shift_and_fill
439
+ # end
440
+
441
+ def is_duplicated
442
+ Utils.wrap_s(_df.is_duplicated)
443
+ end
444
+
445
+ def is_unique
446
+ Utils.wrap_s(_df.is_unique)
447
+ end
448
+
252
449
  def lazy
253
450
  wrap_ldf(_df.lazy)
254
451
  end
@@ -262,6 +459,56 @@ module Polars
262
459
  )
263
460
  end
264
461
 
462
+ def with_columns(exprs)
463
+ if !exprs.nil? && !exprs.is_a?(Array)
464
+ exprs = [exprs]
465
+ end
466
+ lazy
467
+ .with_columns(exprs)
468
+ .collect(no_optimization: true, string_cache: false)
469
+ end
470
+
471
+ def n_chunks(strategy: "first")
472
+ if strategy == "first"
473
+ _df.n_chunks
474
+ elsif strategy == "all"
475
+ get_columns.map(&:n_chunks)
476
+ else
477
+ raise ArgumentError, "Strategy: '{strategy}' not understood. Choose one of {{'first', 'all'}}"
478
+ end
479
+ end
480
+
481
+ def max(axis: 0)
482
+ if axis == 0
483
+ _from_rbdf(_df.max)
484
+ elsif axis == 1
485
+ Utils.wrap_s(_df.hmax)
486
+ else
487
+ raise ArgumentError, "Axis should be 0 or 1."
488
+ end
489
+ end
490
+
491
+ def min(axis: 0)
492
+ if axis == 0
493
+ _from_rbdf(_df.min)
494
+ elsif axis == 1
495
+ Utils.wrap_s(_df.hmin)
496
+ else
497
+ raise ArgumentError, "Axis should be 0 or 1."
498
+ end
499
+ end
500
+
501
+ def sum(axis: 0, null_strategy: "ignore")
502
+ case axis
503
+ when 0
504
+ _from_rbdf(_df.sum)
505
+ when 1
506
+ Utils.wrap_s(_df.hsum(null_strategy))
507
+ else
508
+ raise ArgumentError, "Axis should be 0 or 1."
509
+ end
510
+ end
511
+
265
512
  def mean(axis: 0, null_strategy: "ignore")
266
513
  case axis
267
514
  when 0
@@ -273,15 +520,33 @@ module Polars
273
520
  end
274
521
  end
275
522
 
276
- def with_columns(exprs)
277
- if !exprs.nil? && !exprs.is_a?(Array)
278
- exprs = [exprs]
279
- end
280
- lazy
281
- .with_columns(exprs)
282
- .collect(no_optimization: true, string_cache: false)
523
+ def std(ddof: 1)
524
+ _from_rbdf(_df.std(ddof))
525
+ end
526
+
527
+ def var(ddof: 1)
528
+ _from_rbdf(_df.var(ddof))
283
529
  end
284
530
 
531
+ def median
532
+ _from_rbdf(_df.median)
533
+ end
534
+
535
+ # def product
536
+ # end
537
+
538
+ # def quantile(quantile, interpolation: "nearest")
539
+ # end
540
+
541
+ # def to_dummies
542
+ # end
543
+
544
+ # def unique
545
+ # end
546
+
547
+ # def n_unique
548
+ # end
549
+
285
550
  def rechunk
286
551
  _from_rbdf(_df.rechunk)
287
552
  end
@@ -290,8 +555,48 @@ module Polars
290
555
  _from_rbdf(_df.null_count)
291
556
  end
292
557
 
558
+ # def sample
559
+ # end
560
+
561
+ # def fold
562
+ # end
563
+
564
+ # def row
565
+ # end
566
+
567
+ # def rows
568
+ # end
569
+
570
+ # def shrink_to_fit
571
+ # end
572
+
573
+ # def take_every
574
+ # end
575
+
576
+ # def hash_rows
577
+ # end
578
+
579
+ # def interpolate
580
+ # end
581
+
582
+ def is_empty
583
+ height == 0
584
+ end
585
+ alias_method :empty?, :is_empty
586
+
587
+ # def to_struct(name)
588
+ # end
589
+
590
+ # def unnest
591
+ # end
592
+
293
593
  private
294
594
 
595
+ def initialize_copy(other)
596
+ super
597
+ self._df = _df._clone
598
+ end
599
+
295
600
  def hash_to_rbdf(data)
296
601
  RbDataFrame.read_hash(data)
297
602
  end