polars-df 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,6 @@
1
1
  use magnus::{r_hash::ForEach, Error, RArray, RHash, RString, Value};
2
2
  use polars::io::mmap::ReaderBytes;
3
+ use polars::io::RowCount;
3
4
  use polars::prelude::*;
4
5
  use std::cell::RefCell;
5
6
  use std::fs::File;
@@ -7,8 +8,9 @@ use std::io::{BufReader, BufWriter, Cursor};
7
8
  use std::ops::Deref;
8
9
  use std::path::PathBuf;
9
10
 
10
- use crate::conversion::parse_parquet_compression;
11
+ use crate::conversion::*;
11
12
  use crate::file::{get_file_like, get_mmap_bytes_reader};
13
+ use crate::series::to_rbseries_collection;
12
14
  use crate::{series, RbLazyFrame, RbPolarsErr, RbResult, RbSeries};
13
15
 
14
16
  #[magnus::wrap(class = "Polars::RbDataFrame")]
@@ -38,10 +40,98 @@ impl RbDataFrame {
38
40
  Ok(RbDataFrame::new(df))
39
41
  }
40
42
 
41
- pub fn read_csv(rb_f: Value, has_header: bool) -> RbResult<Self> {
43
+ pub fn estimated_size(&self) -> usize {
44
+ self.df.borrow().estimated_size()
45
+ }
46
+
47
+ pub fn read_csv(arguments: &[Value]) -> RbResult<Self> {
48
+ // start arguments
49
+ // this pattern is needed for more than 16
50
+ let rb_f: Value = arguments[0].try_convert()?;
51
+ let infer_schema_length: Option<usize> = arguments[1].try_convert()?;
52
+ let chunk_size: usize = arguments[2].try_convert()?;
53
+ let has_header: bool = arguments[3].try_convert()?;
54
+ let ignore_errors: bool = arguments[4].try_convert()?;
55
+ let n_rows: Option<usize> = arguments[5].try_convert()?;
56
+ let skip_rows: usize = arguments[6].try_convert()?;
57
+ let projection: Option<Vec<usize>> = arguments[7].try_convert()?;
58
+ let sep: String = arguments[8].try_convert()?;
59
+ let rechunk: bool = arguments[9].try_convert()?;
60
+ let columns: Option<Vec<String>> = arguments[10].try_convert()?;
61
+ let encoding: Wrap<CsvEncoding> = arguments[11].try_convert()?;
62
+ let n_threads: Option<usize> = arguments[12].try_convert()?;
63
+ let path: Option<String> = arguments[13].try_convert()?;
64
+ let overwrite_dtype: Option<Vec<(String, Wrap<DataType>)>> = arguments[14].try_convert()?;
65
+ // TODO fix
66
+ let overwrite_dtype_slice: Option<Vec<Wrap<DataType>>> = None; // arguments[15].try_convert()?;
67
+ let low_memory: bool = arguments[16].try_convert()?;
68
+ let comment_char: Option<String> = arguments[17].try_convert()?;
69
+ let quote_char: Option<String> = arguments[18].try_convert()?;
70
+ let null_values: Option<Wrap<NullValues>> = arguments[19].try_convert()?;
71
+ let parse_dates: bool = arguments[20].try_convert()?;
72
+ let skip_rows_after_header: usize = arguments[21].try_convert()?;
73
+ let row_count: Option<(String, IdxSize)> = arguments[22].try_convert()?;
74
+ let sample_size: usize = arguments[23].try_convert()?;
75
+ let eol_char: String = arguments[24].try_convert()?;
76
+ // end arguments
77
+
78
+ let null_values = null_values.map(|w| w.0);
79
+ let comment_char = comment_char.map(|s| s.as_bytes()[0]);
80
+ let eol_char = eol_char.as_bytes()[0];
81
+
82
+ let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
83
+
84
+ let quote_char = if let Some(s) = quote_char {
85
+ if s.is_empty() {
86
+ None
87
+ } else {
88
+ Some(s.as_bytes()[0])
89
+ }
90
+ } else {
91
+ None
92
+ };
93
+
94
+ let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
95
+ let fields = overwrite_dtype.iter().map(|(name, dtype)| {
96
+ let dtype = dtype.0.clone();
97
+ Field::new(name, dtype)
98
+ });
99
+ Schema::from(fields)
100
+ });
101
+
102
+ let overwrite_dtype_slice = overwrite_dtype_slice.map(|overwrite_dtype| {
103
+ overwrite_dtype
104
+ .iter()
105
+ .map(|dt| dt.0.clone())
106
+ .collect::<Vec<_>>()
107
+ });
108
+
42
109
  let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
43
110
  let df = CsvReader::new(mmap_bytes_r)
111
+ .infer_schema(infer_schema_length)
44
112
  .has_header(has_header)
113
+ .with_n_rows(n_rows)
114
+ .with_delimiter(sep.as_bytes()[0])
115
+ .with_skip_rows(skip_rows)
116
+ .with_ignore_parser_errors(ignore_errors)
117
+ .with_projection(projection)
118
+ .with_rechunk(rechunk)
119
+ .with_chunk_size(chunk_size)
120
+ .with_encoding(encoding.0)
121
+ .with_columns(columns)
122
+ .with_n_threads(n_threads)
123
+ .with_path(path)
124
+ .with_dtypes(overwrite_dtype.as_ref())
125
+ .with_dtypes_slice(overwrite_dtype_slice.as_deref())
126
+ .low_memory(low_memory)
127
+ .with_comment_char(comment_char)
128
+ .with_null_values(null_values)
129
+ .with_parse_dates(parse_dates)
130
+ .with_quote_char(quote_char)
131
+ .with_end_of_line_char(eol_char)
132
+ .with_skip_rows_after_header(skip_rows_after_header)
133
+ .with_row_count(row_count)
134
+ .sample_size(sample_size)
45
135
  .finish()
46
136
  .map_err(RbPolarsErr::from)?;
47
137
  Ok(df.into())
@@ -56,6 +146,27 @@ impl RbDataFrame {
56
146
  .map(|v| v.into())
57
147
  }
58
148
 
149
+ pub fn read_ipc(
150
+ rb_f: Value,
151
+ columns: Option<Vec<String>>,
152
+ projection: Option<Vec<usize>>,
153
+ n_rows: Option<usize>,
154
+ row_count: Option<(String, IdxSize)>,
155
+ memory_map: bool,
156
+ ) -> RbResult<Self> {
157
+ let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
158
+ let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
159
+ let df = IpcReader::new(mmap_bytes_r)
160
+ .with_projection(projection)
161
+ .with_columns(columns)
162
+ .with_n_rows(n_rows)
163
+ .with_row_count(row_count)
164
+ .memory_mapped(memory_map)
165
+ .finish()
166
+ .map_err(RbPolarsErr::from)?;
167
+ Ok(RbDataFrame::new(df))
168
+ }
169
+
59
170
  pub fn read_json(rb_f: Value) -> RbResult<Self> {
60
171
  // memmap the file first
61
172
  let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
@@ -180,6 +291,28 @@ impl RbDataFrame {
180
291
  Ok(())
181
292
  }
182
293
 
294
+ pub fn write_ipc(
295
+ &self,
296
+ rb_f: Value,
297
+ compression: Wrap<Option<IpcCompression>>,
298
+ ) -> RbResult<()> {
299
+ if let Ok(s) = rb_f.try_convert::<String>() {
300
+ let f = std::fs::File::create(&s).unwrap();
301
+ IpcWriter::new(f)
302
+ .with_compression(compression.0)
303
+ .finish(&mut self.df.borrow_mut())
304
+ .map_err(RbPolarsErr::from)?;
305
+ } else {
306
+ let mut buf = get_file_like(rb_f, true)?;
307
+
308
+ IpcWriter::new(&mut buf)
309
+ .with_compression(compression.0)
310
+ .finish(&mut self.df.borrow_mut())
311
+ .map_err(RbPolarsErr::from)?;
312
+ }
313
+ Ok(())
314
+ }
315
+
183
316
  pub fn write_parquet(
184
317
  &self,
185
318
  rb_f: Value,
@@ -213,6 +346,11 @@ impl RbDataFrame {
213
346
  format!("{}", self.df.borrow())
214
347
  }
215
348
 
349
+ pub fn get_columns(&self) -> Vec<RbSeries> {
350
+ let cols = self.df.borrow().get_columns().clone();
351
+ to_rbseries_collection(cols)
352
+ }
353
+
216
354
  pub fn columns(&self) -> Vec<String> {
217
355
  self.df
218
356
  .borrow()
@@ -222,14 +360,27 @@ impl RbDataFrame {
222
360
  .collect()
223
361
  }
224
362
 
225
- pub fn dtypes(&self) -> Vec<String> {
363
+ pub fn set_column_names(&self, names: Vec<String>) -> RbResult<()> {
364
+ self.df
365
+ .borrow_mut()
366
+ .set_column_names(&names)
367
+ .map_err(RbPolarsErr::from)?;
368
+ Ok(())
369
+ }
370
+
371
/// Returns one Ruby `Value` per column, built by converting `Wrap(dtype)` for
/// each series in the frame.
// The removed line shows that 0.1.0 returned each dtype's string form instead.
+ pub fn dtypes(&self) -> Vec<Value> {
226
372
  self.df
227
373
  .borrow()
228
374
  .iter()
229
- .map(|s| s.dtype().to_string())
375
+ .map(|s| Wrap(s.dtype().clone()).into())
230
376
  .collect()
231
377
  }
232
378
 
379
+ pub fn n_chunks(&self) -> RbResult<usize> {
380
+ let n = self.df.borrow().n_chunks().map_err(RbPolarsErr::from)?;
381
+ Ok(n)
382
+ }
383
+
233
384
  pub fn shape(&self) -> (usize, usize) {
234
385
  self.df.borrow().shape()
235
386
  }
@@ -258,6 +409,28 @@ impl RbDataFrame {
258
409
  .map_err(RbPolarsErr::from)
259
410
  }
260
411
 
412
+ pub fn select(&self, selection: Vec<String>) -> RbResult<Self> {
413
+ let df = self
414
+ .df
415
+ .borrow()
416
+ .select(selection)
417
+ .map_err(RbPolarsErr::from)?;
418
+ Ok(RbDataFrame::new(df))
419
+ }
420
+
421
+ pub fn take(&self, indices: Vec<IdxSize>) -> RbResult<Self> {
422
+ let indices = IdxCa::from_vec("", indices);
423
+ let df = self.df.borrow().take(&indices).map_err(RbPolarsErr::from)?;
424
+ Ok(RbDataFrame::new(df))
425
+ }
426
+
427
+ pub fn take_with_series(&self, indices: &RbSeries) -> RbResult<Self> {
428
+ let binding = indices.series.borrow();
429
+ let idx = binding.idx().map_err(RbPolarsErr::from)?;
430
+ let df = self.df.borrow().take(idx).map_err(RbPolarsErr::from)?;
431
+ Ok(RbDataFrame::new(df))
432
+ }
433
+
261
434
  pub fn sort(&self, by_column: String, reverse: bool, nulls_last: bool) -> RbResult<Self> {
262
435
  let df = self
263
436
  .df
@@ -273,6 +446,38 @@ impl RbDataFrame {
273
446
  Ok(RbDataFrame::new(df))
274
447
  }
275
448
 
449
+ pub fn replace(&self, column: String, new_col: &RbSeries) -> RbResult<()> {
450
+ self.df
451
+ .borrow_mut()
452
+ .replace(&column, new_col.series.borrow().clone())
453
+ .map_err(RbPolarsErr::from)?;
454
+ Ok(())
455
+ }
456
+
457
+ pub fn replace_at_idx(&self, index: usize, new_col: &RbSeries) -> RbResult<()> {
458
+ self.df
459
+ .borrow_mut()
460
+ .replace_at_idx(index, new_col.series.borrow().clone())
461
+ .map_err(RbPolarsErr::from)?;
462
+ Ok(())
463
+ }
464
+
465
+ pub fn insert_at_idx(&self, index: usize, new_col: &RbSeries) -> RbResult<()> {
466
+ self.df
467
+ .borrow_mut()
468
+ .insert_at_idx(index, new_col.series.borrow().clone())
469
+ .map_err(RbPolarsErr::from)?;
470
+ Ok(())
471
+ }
472
+
473
+ pub fn slice(&self, offset: usize, length: Option<usize>) -> Self {
474
+ let df = self.df.borrow().slice(
475
+ offset as i64,
476
+ length.unwrap_or_else(|| self.df.borrow().height()),
477
+ );
478
+ df.into()
479
+ }
480
+
276
481
  pub fn head(&self, length: Option<usize>) -> Self {
277
482
  self.df.borrow().head(length).into()
278
483
  }
@@ -281,6 +486,20 @@ impl RbDataFrame {
281
486
  self.df.borrow().tail(length).into()
282
487
  }
283
488
 
489
+ pub fn is_unique(&self) -> RbResult<RbSeries> {
490
+ let mask = self.df.borrow().is_unique().map_err(RbPolarsErr::from)?;
491
+ Ok(mask.into_series().into())
492
+ }
493
+
494
+ pub fn is_duplicated(&self) -> RbResult<RbSeries> {
495
+ let mask = self
496
+ .df
497
+ .borrow()
498
+ .is_duplicated()
499
+ .map_err(RbPolarsErr::from)?;
500
+ Ok(mask.into_series().into())
501
+ }
502
+
284
503
  pub fn frame_equal(&self, other: &RbDataFrame, null_equal: bool) -> bool {
285
504
  if null_equal {
286
505
  self.df.borrow().frame_equal_missing(&other.df.borrow())
@@ -289,16 +508,202 @@ impl RbDataFrame {
289
508
  }
290
509
  }
291
510
 
511
+ pub fn with_row_count(&self, name: String, offset: Option<IdxSize>) -> RbResult<Self> {
512
+ let df = self
513
+ .df
514
+ .borrow()
515
+ .with_row_count(&name, offset)
516
+ .map_err(RbPolarsErr::from)?;
517
+ Ok(df.into())
518
+ }
519
+
520
+ pub fn clone(&self) -> Self {
521
+ RbDataFrame::new(self.df.borrow().clone())
522
+ }
523
+
524
+ pub fn melt(
525
+ &self,
526
+ id_vars: Vec<String>,
527
+ value_vars: Vec<String>,
528
+ value_name: Option<String>,
529
+ variable_name: Option<String>,
530
+ ) -> RbResult<Self> {
531
+ let args = MeltArgs {
532
+ id_vars,
533
+ value_vars,
534
+ value_name,
535
+ variable_name,
536
+ };
537
+
538
+ let df = self.df.borrow().melt2(args).map_err(RbPolarsErr::from)?;
539
+ Ok(RbDataFrame::new(df))
540
+ }
541
+
542
+ pub fn partition_by(&self, groups: Vec<String>, stable: bool) -> RbResult<Vec<Self>> {
543
+ let out = if stable {
544
+ self.df.borrow().partition_by_stable(groups)
545
+ } else {
546
+ self.df.borrow().partition_by(groups)
547
+ }
548
+ .map_err(RbPolarsErr::from)?;
549
+ Ok(out.into_iter().map(RbDataFrame::new).collect())
550
+ }
551
+
552
+ pub fn shift(&self, periods: i64) -> Self {
553
+ self.df.borrow().shift(periods).into()
554
+ }
555
+
556
+ pub fn unique(
557
+ &self,
558
+ maintain_order: bool,
559
+ subset: Option<Vec<String>>,
560
+ keep: Wrap<UniqueKeepStrategy>,
561
+ ) -> RbResult<Self> {
562
+ let subset = subset.as_ref().map(|v| v.as_ref());
563
+ let df = match maintain_order {
564
+ true => self.df.borrow().unique_stable(subset, keep.0),
565
+ false => self.df.borrow().unique(subset, keep.0),
566
+ }
567
+ .map_err(RbPolarsErr::from)?;
568
+ Ok(df.into())
569
+ }
570
+
292
571
  pub fn lazy(&self) -> RbLazyFrame {
293
572
  self.df.borrow().clone().lazy().into()
294
573
  }
295
574
 
575
+ pub fn max(&self) -> Self {
576
+ self.df.borrow().max().into()
577
+ }
578
+
579
+ pub fn min(&self) -> Self {
580
+ self.df.borrow().min().into()
581
+ }
582
+
583
+ pub fn sum(&self) -> Self {
584
+ self.df.borrow().sum().into()
585
+ }
586
+
296
587
  pub fn mean(&self) -> Self {
297
588
  self.df.borrow().mean().into()
298
589
  }
299
590
 
591
+ pub fn std(&self, ddof: u8) -> Self {
592
+ self.df.borrow().std(ddof).into()
593
+ }
594
+
595
+ pub fn var(&self, ddof: u8) -> Self {
596
+ self.df.borrow().var(ddof).into()
597
+ }
598
+
599
+ pub fn median(&self) -> Self {
600
+ self.df.borrow().median().into()
601
+ }
602
+
603
+ pub fn hmean(&self, null_strategy: Wrap<NullStrategy>) -> RbResult<Option<RbSeries>> {
604
+ let s = self
605
+ .df
606
+ .borrow()
607
+ .hmean(null_strategy.0)
608
+ .map_err(RbPolarsErr::from)?;
609
+ Ok(s.map(|s| s.into()))
610
+ }
611
+
612
+ pub fn hmax(&self) -> RbResult<Option<RbSeries>> {
613
+ let s = self.df.borrow().hmax().map_err(RbPolarsErr::from)?;
614
+ Ok(s.map(|s| s.into()))
615
+ }
616
+
617
+ pub fn hmin(&self) -> RbResult<Option<RbSeries>> {
618
+ let s = self.df.borrow().hmin().map_err(RbPolarsErr::from)?;
619
+ Ok(s.map(|s| s.into()))
620
+ }
621
+
622
+ pub fn hsum(&self, null_strategy: Wrap<NullStrategy>) -> RbResult<Option<RbSeries>> {
623
+ let s = self
624
+ .df
625
+ .borrow()
626
+ .hsum(null_strategy.0)
627
+ .map_err(RbPolarsErr::from)?;
628
+ Ok(s.map(|s| s.into()))
629
+ }
630
+
631
+ pub fn quantile(
632
+ &self,
633
+ quantile: f64,
634
+ interpolation: Wrap<QuantileInterpolOptions>,
635
+ ) -> RbResult<Self> {
636
+ let df = self
637
+ .df
638
+ .borrow()
639
+ .quantile(quantile, interpolation.0)
640
+ .map_err(RbPolarsErr::from)?;
641
+ Ok(df.into())
642
+ }
643
+
644
+ pub fn to_dummies(&self, columns: Option<Vec<String>>) -> RbResult<Self> {
645
+ let df = match columns {
646
+ Some(cols) => self
647
+ .df
648
+ .borrow()
649
+ .columns_to_dummies(cols.iter().map(|x| x as &str).collect()),
650
+ None => self.df.borrow().to_dummies(),
651
+ }
652
+ .map_err(RbPolarsErr::from)?;
653
+ Ok(df.into())
654
+ }
655
+
300
656
  pub fn null_count(&self) -> Self {
301
657
  let df = self.df.borrow().null_count();
302
658
  df.into()
303
659
  }
660
+
661
+ pub fn shrink_to_fit(&self) {
662
+ self.df.borrow_mut().shrink_to_fit();
663
+ }
664
+
665
+ pub fn transpose(&self, include_header: bool, names: String) -> RbResult<Self> {
666
+ let mut df = self.df.borrow().transpose().map_err(RbPolarsErr::from)?;
667
+ if include_header {
668
+ let s = Utf8Chunked::from_iter_values(
669
+ &names,
670
+ self.df.borrow().get_columns().iter().map(|s| s.name()),
671
+ )
672
+ .into_series();
673
+ df.insert_at_idx(0, s).unwrap();
674
+ }
675
+ Ok(df.into())
676
+ }
677
+
678
+ pub fn upsample(
679
+ &self,
680
+ by: Vec<String>,
681
+ index_column: String,
682
+ every: String,
683
+ offset: String,
684
+ stable: bool,
685
+ ) -> RbResult<Self> {
686
+ let out = if stable {
687
+ self.df.borrow().upsample_stable(
688
+ by,
689
+ &index_column,
690
+ Duration::parse(&every),
691
+ Duration::parse(&offset),
692
+ )
693
+ } else {
694
+ self.df.borrow().upsample(
695
+ by,
696
+ &index_column,
697
+ Duration::parse(&every),
698
+ Duration::parse(&offset),
699
+ )
700
+ };
701
+ let out = out.map_err(RbPolarsErr::from)?;
702
+ Ok(out.into())
703
+ }
704
+
705
+ pub fn unnest(&self, names: Vec<String>) -> RbResult<Self> {
706
+ let df = self.df.borrow().unnest(names).map_err(RbPolarsErr::from)?;
707
+ Ok(df.into())
708
+ }
304
709
  }
@@ -1,5 +1,6 @@
1
1
  use magnus::exception::arg_error;
2
2
  use magnus::Error;
3
+ use polars::error::ArrowError;
3
4
  use polars::prelude::PolarsError;
4
5
 
5
6
  pub struct RbPolarsErr {}
@@ -10,6 +11,14 @@ impl RbPolarsErr {
10
11
  Error::runtime_error(e.to_string())
11
12
  }
12
13
 
14
+ pub fn arrow(e: ArrowError) -> Error {
15
+ Error::runtime_error(e.to_string())
16
+ }
17
+
18
+ pub fn io(e: std::io::Error) -> Error {
19
+ Error::runtime_error(e.to_string())
20
+ }
21
+
13
22
  pub fn other(message: String) -> Error {
14
23
  Error::runtime_error(message)
15
24
  }
@@ -1,18 +1,19 @@
1
1
  use magnus::{Error, RString, Value};
2
2
  use polars::io::mmap::MmapBytesReader;
3
- use std::fs::{File, OpenOptions};
3
+ use std::fs::File;
4
4
  use std::io::Cursor;
5
5
  use std::path::PathBuf;
6
6
 
7
7
  use crate::RbResult;
8
8
 
9
9
  /// Converts `f` to a `PathBuf` and opens the file at that path.
  ///
  /// With `truncate` set the file is opened via `File::create`; otherwise via
  /// `File::open`. I/O failures are returned as runtime errors carrying the
  /// message of the underlying error.
  // NOTE(review): the removed 0.1.0 lines always opened with write + create.
  // `File::open` yields a read-only handle per the std docs, so confirm that no
  // caller passing `truncate = false` expects to write.
  pub fn get_file_like(f: Value, truncate: bool) -> RbResult<File> {
10
- OpenOptions::new()
11
- .write(true)
12
- .create(true)
13
- .truncate(truncate)
14
- .open(f.try_convert::<PathBuf>()?)
15
- .map_err(|e| Error::runtime_error(e.to_string()))
10
+ let str_slice = f.try_convert::<PathBuf>()?;
11
+ let f = if truncate {
12
+ File::create(str_slice).map_err(|e| Error::runtime_error(e.to_string()))?
13
+ } else {
14
+ File::open(str_slice).map_err(|e| Error::runtime_error(e.to_string()))?
15
+ };
16
+ Ok(f)
16
17
  }
17
18
 
18
19
  pub fn get_mmap_bytes_reader(rb_f: Value) -> RbResult<Box<dyn MmapBytesReader>> {
@@ -0,0 +1,7 @@
1
+ use magnus::Value;
2
+ use polars::error::PolarsResult;
3
+ use polars::series::Series;
4
+
5
+ pub fn binary_lambda(_lambda: Value, _a: Series, _b: Series) -> PolarsResult<Series> {
6
+ todo!();
7
+ }