polars-df 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,12 @@
1
- use crate::conversion::wrap;
2
- use crate::{RbDataFrame, RbPolarsErr, RbResult};
3
1
  use magnus::exception::arg_error;
4
2
  use magnus::{Error, RArray, Value};
5
3
  use polars::prelude::*;
6
4
  use polars::series::IsSorted;
7
5
  use std::cell::RefCell;
8
6
 
7
+ use crate::conversion::*;
8
+ use crate::{RbDataFrame, RbPolarsErr, RbResult, RbValueError};
9
+
9
10
  #[magnus::wrap(class = "Polars::RbSeries")]
10
11
  pub struct RbSeries {
11
12
  pub series: RefCell<Series>,
@@ -24,6 +25,14 @@ impl RbSeries {
24
25
  }
25
26
  }
26
27
 
28
+ pub fn is_sorted_flag(&self) -> bool {
29
+ matches!(self.series.borrow().is_sorted(), IsSorted::Ascending)
30
+ }
31
+
32
+ pub fn is_sorted_reverse_flag(&self) -> bool {
33
+ matches!(self.series.borrow().is_sorted(), IsSorted::Descending)
34
+ }
35
+
27
36
  pub fn new_opt_bool(name: String, obj: RArray, strict: bool) -> RbResult<RbSeries> {
28
37
  let len = obj.len();
29
38
  let mut builder = BooleanChunkedBuilder::new(&name, len);
@@ -114,6 +123,29 @@ impl RbSeries {
114
123
  Ok(RbSeries::new(s))
115
124
  }
116
125
 
126
+ pub fn estimated_size(&self) -> usize {
127
+ self.series.borrow().estimated_size()
128
+ }
129
+
130
+ pub fn get_fmt(&self, index: usize, str_lengths: usize) -> String {
131
+ let val = format!("{}", self.series.borrow().get(index));
132
+ if let DataType::Utf8 | DataType::Categorical(_) = self.series.borrow().dtype() {
133
+ let v_trunc = &val[..val
134
+ .char_indices()
135
+ .take(str_lengths)
136
+ .last()
137
+ .map(|(i, c)| i + c.len_utf8())
138
+ .unwrap_or(0)];
139
+ if val == v_trunc {
140
+ val
141
+ } else {
142
+ format!("{}...", v_trunc)
143
+ }
144
+ } else {
145
+ val
146
+ }
147
+ }
148
+
117
149
  pub fn rechunk(&self, in_place: bool) -> Option<Self> {
118
150
  let series = self.series.borrow_mut().rechunk();
119
151
  if in_place {
@@ -124,6 +156,10 @@ impl RbSeries {
124
156
  }
125
157
  }
126
158
 
159
+ pub fn get_idx(&self, idx: usize) -> Value {
160
+ Wrap(self.series.borrow().get(idx)).into()
161
+ }
162
+
127
163
  pub fn bitand(&self, other: &RbSeries) -> RbResult<Self> {
128
164
  let out = self
129
165
  .series
@@ -196,15 +232,15 @@ impl RbSeries {
196
232
  }
197
233
 
198
234
  pub fn max(&self) -> Value {
199
- wrap(self.series.borrow().max_as_series().get(0))
235
+ Wrap(self.series.borrow().max_as_series().get(0)).into()
200
236
  }
201
237
 
202
238
  pub fn min(&self) -> Value {
203
- wrap(self.series.borrow().min_as_series().get(0))
239
+ Wrap(self.series.borrow().min_as_series().get(0)).into()
204
240
  }
205
241
 
206
242
  pub fn sum(&self) -> Value {
207
- wrap(self.series.borrow().sum_as_series().get(0))
243
+ Wrap(self.series.borrow().sum_as_series().get(0)).into()
208
244
  }
209
245
 
210
246
  pub fn n_chunks(&self) -> usize {
@@ -454,6 +490,111 @@ impl RbSeries {
454
490
  }
455
491
  }
456
492
 
493
+ pub fn quantile(
494
+ &self,
495
+ quantile: f64,
496
+ interpolation: Wrap<QuantileInterpolOptions>,
497
+ ) -> RbResult<Value> {
498
+ Ok(Wrap(
499
+ self.series
500
+ .borrow()
501
+ .quantile_as_series(quantile, interpolation.0)
502
+ .map_err(|_| RbValueError::new_err("invalid quantile".into()))?
503
+ .get(0),
504
+ )
505
+ .into())
506
+ }
507
+
508
+ pub fn clone(&self) -> Self {
509
+ RbSeries::new(self.series.borrow().clone())
510
+ }
511
+
512
+ pub fn zip_with(&self, mask: &RbSeries, other: &RbSeries) -> RbResult<Self> {
513
+ let binding = mask.series.borrow();
514
+ let mask = binding.bool().map_err(RbPolarsErr::from)?;
515
+ let s = self
516
+ .series
517
+ .borrow()
518
+ .zip_with(mask, &other.series.borrow())
519
+ .map_err(RbPolarsErr::from)?;
520
+ Ok(RbSeries::new(s))
521
+ }
522
+
523
+ pub fn to_dummies(&self) -> RbResult<RbDataFrame> {
524
+ let df = self
525
+ .series
526
+ .borrow()
527
+ .to_dummies()
528
+ .map_err(RbPolarsErr::from)?;
529
+ Ok(df.into())
530
+ }
531
+
532
+ pub fn peak_max(&self) -> Self {
533
+ self.series.borrow().peak_max().into_series().into()
534
+ }
535
+
536
+ pub fn peak_min(&self) -> Self {
537
+ self.series.borrow().peak_min().into_series().into()
538
+ }
539
+
540
+ pub fn n_unique(&self) -> RbResult<usize> {
541
+ let n = self.series.borrow().n_unique().map_err(RbPolarsErr::from)?;
542
+ Ok(n)
543
+ }
544
+
545
+ pub fn floor(&self) -> RbResult<Self> {
546
+ let s = self.series.borrow().floor().map_err(RbPolarsErr::from)?;
547
+ Ok(s.into())
548
+ }
549
+
550
+ pub fn shrink_to_fit(&self) {
551
+ self.series.borrow_mut().shrink_to_fit();
552
+ }
553
+
554
+ pub fn dot(&self, other: &RbSeries) -> Option<f64> {
555
+ self.series.borrow().dot(&other.series.borrow())
556
+ }
557
+
558
+ pub fn skew(&self, bias: bool) -> RbResult<Option<f64>> {
559
+ let out = self.series.borrow().skew(bias).map_err(RbPolarsErr::from)?;
560
+ Ok(out)
561
+ }
562
+
563
+ pub fn kurtosis(&self, fisher: bool, bias: bool) -> RbResult<Option<f64>> {
564
+ let out = self
565
+ .series
566
+ .borrow()
567
+ .kurtosis(fisher, bias)
568
+ .map_err(RbPolarsErr::from)?;
569
+ Ok(out)
570
+ }
571
+
572
+ pub fn cast(&self, dtype: Wrap<DataType>, strict: bool) -> RbResult<Self> {
573
+ let dtype = dtype.0;
574
+ let out = if strict {
575
+ self.series.borrow().strict_cast(&dtype)
576
+ } else {
577
+ self.series.borrow().cast(&dtype)
578
+ };
579
+ let out = out.map_err(RbPolarsErr::from)?;
580
+ Ok(out.into())
581
+ }
582
+
583
+ pub fn time_unit(&self) -> Option<String> {
584
+ if let DataType::Datetime(tu, _) | DataType::Duration(tu) = self.series.borrow().dtype() {
585
+ Some(
586
+ match tu {
587
+ TimeUnit::Nanoseconds => "ns",
588
+ TimeUnit::Microseconds => "us",
589
+ TimeUnit::Milliseconds => "ms",
590
+ }
591
+ .to_string(),
592
+ )
593
+ } else {
594
+ None
595
+ }
596
+ }
597
+
457
598
  // dispatch dynamically in future?
458
599
 
459
600
  pub fn cumsum(&self, reverse: bool) -> Self {
@@ -468,8 +609,30 @@ impl RbSeries {
468
609
  self.series.borrow().cummin(reverse).into()
469
610
  }
470
611
 
612
+ pub fn cumprod(&self, reverse: bool) -> Self {
613
+ self.series.borrow().cumprod(reverse).into()
614
+ }
615
+
471
616
  pub fn slice(&self, offset: i64, length: usize) -> Self {
472
617
  let series = self.series.borrow().slice(offset, length);
473
618
  series.into()
474
619
  }
620
+
621
+ pub fn ceil(&self) -> RbResult<Self> {
622
+ let s = self.series.borrow().ceil().map_err(RbPolarsErr::from)?;
623
+ Ok(s.into())
624
+ }
625
+
626
+ pub fn round(&self, decimals: u32) -> RbResult<Self> {
627
+ let s = self
628
+ .series
629
+ .borrow()
630
+ .round(decimals)
631
+ .map_err(RbPolarsErr::from)?;
632
+ Ok(s.into())
633
+ }
634
+ }
635
+
636
+ pub fn to_rbseries_collection(s: Vec<Series>) -> Vec<RbSeries> {
637
+ s.into_iter().map(|v| RbSeries::new(v)).collect()
475
638
  }
@@ -0,0 +1,13 @@
1
+ module Polars
2
+ class CatExpr
3
+ attr_accessor :_rbexpr
4
+
5
+ def initialize(expr)
6
+ self._rbexpr = expr._rbexpr
7
+ end
8
+
9
+ def set_ordering(ordering)
10
+ Utils.wrap_expr(_rbexpr.cat_set_ordering(ordering))
11
+ end
12
+ end
13
+ end
@@ -79,10 +79,51 @@ module Polars
79
79
  _df.columns
80
80
  end
81
81
 
82
+ def columns=(columns)
83
+ _df.set_column_names(columns)
84
+ end
85
+
82
86
  def dtypes
83
87
  _df.dtypes.map(&:to_sym)
84
88
  end
85
89
 
90
+ def schema
91
+ columns.zip(dtypes).to_h
92
+ end
93
+
94
+ # def ==(other)
95
+ # end
96
+
97
+ # def !=(other)
98
+ # end
99
+
100
+ # def >(other)
101
+ # end
102
+
103
+ # def <(other)
104
+ # end
105
+
106
+ # def >=(other)
107
+ # end
108
+
109
+ # def <=(other)
110
+ # end
111
+
112
+ # def *(other)
113
+ # end
114
+
115
+ # def /(other)
116
+ # end
117
+
118
+ # def +(other)
119
+ # end
120
+
121
+ # def -(other)
122
+ # end
123
+
124
+ # def %(other)
125
+ # end
126
+
86
127
  def to_s
87
128
  _df.to_s
88
129
  end
@@ -96,6 +137,25 @@ module Polars
96
137
  Utils.wrap_s(_df.column(name))
97
138
  end
98
139
 
140
+ # def []=(key, value)
141
+ # end
142
+
143
+ def to_h(as_series: true)
144
+ if as_series
145
+ get_columns.to_h { |s| [s.name, s] }
146
+ else
147
+ get_columns.to_h { |s| [s.name, s.to_a] }
148
+ end
149
+ end
150
+
151
+ # def to_hs / to_a
152
+ # end
153
+
154
+ # def to_numo
155
+ # end
156
+
157
+ # no to_pandas
158
+
99
159
  def to_series(index = 0)
100
160
  if index < 0
101
161
  index = columns.length + index
@@ -183,6 +243,12 @@ module Polars
183
243
  nil
184
244
  end
185
245
 
246
+ # def write_avro
247
+ # end
248
+
249
+ # def write_ipc
250
+ # end
251
+
186
252
  def write_parquet(
187
253
  file,
188
254
  compression: "zstd",
@@ -202,10 +268,43 @@ module Polars
202
268
  )
203
269
  end
204
270
 
271
+ def estimated_size(unit = "b")
272
+ sz = _df.estimated_size
273
+ Utils.scale_bytes(sz, to: unit)
274
+ end
275
+
276
+ # def transpose
277
+ # end
278
+
279
+ def reverse
280
+ select(Polars.col("*").reverse)
281
+ end
282
+
283
+ def rename(mapping)
284
+ lazy.rename(mapping).collect(no_optimization: true)
285
+ end
286
+
287
+ def insert_at_idx(index, series)
288
+ if index < 0
289
+ index = columns.length + index
290
+ end
291
+ _df.insert_at_idx(index, series._s)
292
+ self
293
+ end
294
+
205
295
  def filter(predicate)
206
296
  lazy.filter(predicate).collect
207
297
  end
208
298
 
299
+ # def describe
300
+ # end
301
+
302
+ # def find_idx_by_name
303
+ # end
304
+
305
+ # def replace_at_idx
306
+ # end
307
+
209
308
  def sort(by, reverse: false, nulls_last: false)
210
309
  _from_rbdf(_df.sort(by, reverse, nulls_last))
211
310
  end
@@ -214,6 +313,16 @@ module Polars
214
313
  _df.frame_equal(other._df, null_equal)
215
314
  end
216
315
 
316
+ # def replace
317
+ # end
318
+
319
+ def slice(offset, length = nil)
320
+ if !length.nil? && length < 0
321
+ length = height - offset + length
322
+ end
323
+ _from_rbdf(_df.slice(offset, length))
324
+ end
325
+
217
326
  def limit(n = 5)
218
327
  head(n)
219
328
  end
@@ -226,10 +335,31 @@ module Polars
226
335
  _from_rbdf(_df.tail(n))
227
336
  end
228
337
 
338
+ # def drop_nulls
339
+ # end
340
+
341
+ # def pipe
342
+ # end
343
+
344
+ # def with_row_count
345
+ # end
346
+
229
347
  def groupby(by, maintain_order: false)
230
348
  lazy.groupby(by, maintain_order: maintain_order)
231
349
  end
232
350
 
351
+ # def groupby_rolling
352
+ # end
353
+
354
+ # def groupby_dynamic
355
+ # end
356
+
357
+ # def upsample
358
+ # end
359
+
360
+ # def join_asof
361
+ # end
362
+
233
363
  def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
234
364
  lazy
235
365
  .join(
@@ -243,12 +373,79 @@ module Polars
243
373
  .collect(no_optimization: true)
244
374
  end
245
375
 
376
+ # def apply
377
+ # end
378
+
246
379
  def with_column(column)
247
380
  lazy
248
381
  .with_column(column)
249
382
  .collect(no_optimization: true, string_cache: false)
250
383
  end
251
384
 
385
+ # def hstack
386
+ # end
387
+
388
+ # def vstack
389
+ # end
390
+
391
+ # def extend
392
+ # end
393
+
394
+ # def drop
395
+ # end
396
+
397
+ # def drop_in_place
398
+ # end
399
+
400
+ # def cleared
401
+ # end
402
+
403
+ # clone handled by initialize_copy
404
+
405
+ def get_columns
406
+ _df.get_columns.map { |s| Utils.wrap_s(s) }
407
+ end
408
+
409
+ def get_column(name)
410
+ self[name]
411
+ end
412
+
413
+ # def fill_null
414
+ # end
415
+
416
+ def fill_nan(fill_value)
417
+ lazy.fill_nan(fill_value).collect(no_optimization: true)
418
+ end
419
+
420
+ # def explode
421
+ # end
422
+
423
+ # def pivot
424
+ # end
425
+
426
+ # def melt
427
+ # end
428
+
429
+ # def unstack
430
+ # end
431
+
432
+ # def partition_by
433
+ # end
434
+
435
+ # def shift
436
+ # end
437
+
438
+ # def shift_and_fill
439
+ # end
440
+
441
+ def is_duplicated
442
+ Utils.wrap_s(_df.is_duplicated)
443
+ end
444
+
445
+ def is_unique
446
+ Utils.wrap_s(_df.is_unique)
447
+ end
448
+
252
449
  def lazy
253
450
  wrap_ldf(_df.lazy)
254
451
  end
@@ -262,6 +459,56 @@ module Polars
262
459
  )
263
460
  end
264
461
 
462
+ def with_columns(exprs)
463
+ if !exprs.nil? && !exprs.is_a?(Array)
464
+ exprs = [exprs]
465
+ end
466
+ lazy
467
+ .with_columns(exprs)
468
+ .collect(no_optimization: true, string_cache: false)
469
+ end
470
+
471
+ def n_chunks(strategy: "first")
472
+ if strategy == "first"
473
+ _df.n_chunks
474
+ elsif strategy == "all"
475
+ get_columns.map(&:n_chunks)
476
+ else
477
+ raise ArgumentError, "Strategy: '{strategy}' not understood. Choose one of {{'first', 'all'}}"
478
+ end
479
+ end
480
+
481
+ def max(axis: 0)
482
+ if axis == 0
483
+ _from_rbdf(_df.max)
484
+ elsif axis == 1
485
+ Utils.wrap_s(_df.hmax)
486
+ else
487
+ raise ArgumentError, "Axis should be 0 or 1."
488
+ end
489
+ end
490
+
491
+ def min(axis: 0)
492
+ if axis == 0
493
+ _from_rbdf(_df.min)
494
+ elsif axis == 1
495
+ Utils.wrap_s(_df.hmin)
496
+ else
497
+ raise ArgumentError, "Axis should be 0 or 1."
498
+ end
499
+ end
500
+
501
+ def sum(axis: 0, null_strategy: "ignore")
502
+ case axis
503
+ when 0
504
+ _from_rbdf(_df.sum)
505
+ when 1
506
+ Utils.wrap_s(_df.hsum(null_strategy))
507
+ else
508
+ raise ArgumentError, "Axis should be 0 or 1."
509
+ end
510
+ end
511
+
265
512
  def mean(axis: 0, null_strategy: "ignore")
266
513
  case axis
267
514
  when 0
@@ -273,15 +520,33 @@ module Polars
273
520
  end
274
521
  end
275
522
 
276
- def with_columns(exprs)
277
- if !exprs.nil? && !exprs.is_a?(Array)
278
- exprs = [exprs]
279
- end
280
- lazy
281
- .with_columns(exprs)
282
- .collect(no_optimization: true, string_cache: false)
523
+ def std(ddof: 1)
524
+ _from_rbdf(_df.std(ddof))
525
+ end
526
+
527
+ def var(ddof: 1)
528
+ _from_rbdf(_df.var(ddof))
283
529
  end
284
530
 
531
+ def median
532
+ _from_rbdf(_df.median)
533
+ end
534
+
535
+ # def product
536
+ # end
537
+
538
+ # def quantile(quantile, interpolation: "nearest")
539
+ # end
540
+
541
+ # def to_dummies
542
+ # end
543
+
544
+ # def unique
545
+ # end
546
+
547
+ # def n_unique
548
+ # end
549
+
285
550
  def rechunk
286
551
  _from_rbdf(_df.rechunk)
287
552
  end
@@ -290,8 +555,48 @@ module Polars
290
555
  _from_rbdf(_df.null_count)
291
556
  end
292
557
 
558
+ # def sample
559
+ # end
560
+
561
+ # def fold
562
+ # end
563
+
564
+ # def row
565
+ # end
566
+
567
+ # def rows
568
+ # end
569
+
570
+ # def shrink_to_fit
571
+ # end
572
+
573
+ # def take_every
574
+ # end
575
+
576
+ # def hash_rows
577
+ # end
578
+
579
+ # def interpolate
580
+ # end
581
+
582
+ def is_empty
583
+ height == 0
584
+ end
585
+ alias_method :empty?, :is_empty
586
+
587
+ # def to_struct(name)
588
+ # end
589
+
590
+ # def unnest
591
+ # end
592
+
293
593
  private
294
594
 
595
+ def initialize_copy(other)
596
+ super
597
+ self._df = _df._clone
598
+ end
599
+
295
600
  def hash_to_rbdf(data)
296
601
  RbDataFrame.read_hash(data)
297
602
  end