polars-df 0.1.0 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,6 @@
1
1
  use magnus::{r_hash::ForEach, Error, RArray, RHash, RString, Value};
2
2
  use polars::io::mmap::ReaderBytes;
3
+ use polars::io::RowCount;
3
4
  use polars::prelude::*;
4
5
  use std::cell::RefCell;
5
6
  use std::fs::File;
@@ -7,8 +8,9 @@ use std::io::{BufReader, BufWriter, Cursor};
7
8
  use std::ops::Deref;
8
9
  use std::path::PathBuf;
9
10
 
10
- use crate::conversion::parse_parquet_compression;
11
+ use crate::conversion::*;
11
12
  use crate::file::{get_file_like, get_mmap_bytes_reader};
13
+ use crate::series::to_rbseries_collection;
12
14
  use crate::{series, RbLazyFrame, RbPolarsErr, RbResult, RbSeries};
13
15
 
14
16
  #[magnus::wrap(class = "Polars::RbDataFrame")]
@@ -38,10 +40,98 @@ impl RbDataFrame {
38
40
  Ok(RbDataFrame::new(df))
39
41
  }
40
42
 
41
- pub fn read_csv(rb_f: Value, has_header: bool) -> RbResult<Self> {
43
+ pub fn estimated_size(&self) -> usize {
44
+ self.df.borrow().estimated_size()
45
+ }
46
+
47
+ pub fn read_csv(arguments: &[Value]) -> RbResult<Self> {
48
+ // start arguments
49
+ // this pattern is needed for more than 16
50
+ let rb_f: Value = arguments[0].try_convert()?;
51
+ let infer_schema_length: Option<usize> = arguments[1].try_convert()?;
52
+ let chunk_size: usize = arguments[2].try_convert()?;
53
+ let has_header: bool = arguments[3].try_convert()?;
54
+ let ignore_errors: bool = arguments[4].try_convert()?;
55
+ let n_rows: Option<usize> = arguments[5].try_convert()?;
56
+ let skip_rows: usize = arguments[6].try_convert()?;
57
+ let projection: Option<Vec<usize>> = arguments[7].try_convert()?;
58
+ let sep: String = arguments[8].try_convert()?;
59
+ let rechunk: bool = arguments[9].try_convert()?;
60
+ let columns: Option<Vec<String>> = arguments[10].try_convert()?;
61
+ let encoding: Wrap<CsvEncoding> = arguments[11].try_convert()?;
62
+ let n_threads: Option<usize> = arguments[12].try_convert()?;
63
+ let path: Option<String> = arguments[13].try_convert()?;
64
+ let overwrite_dtype: Option<Vec<(String, Wrap<DataType>)>> = arguments[14].try_convert()?;
65
+ // TODO fix
66
+ let overwrite_dtype_slice: Option<Vec<Wrap<DataType>>> = None; // arguments[15].try_convert()?;
67
+ let low_memory: bool = arguments[16].try_convert()?;
68
+ let comment_char: Option<String> = arguments[17].try_convert()?;
69
+ let quote_char: Option<String> = arguments[18].try_convert()?;
70
+ let null_values: Option<Wrap<NullValues>> = arguments[19].try_convert()?;
71
+ let parse_dates: bool = arguments[20].try_convert()?;
72
+ let skip_rows_after_header: usize = arguments[21].try_convert()?;
73
+ let row_count: Option<(String, IdxSize)> = arguments[22].try_convert()?;
74
+ let sample_size: usize = arguments[23].try_convert()?;
75
+ let eol_char: String = arguments[24].try_convert()?;
76
+ // end arguments
77
+
78
+ let null_values = null_values.map(|w| w.0);
79
+ let comment_char = comment_char.map(|s| s.as_bytes()[0]);
80
+ let eol_char = eol_char.as_bytes()[0];
81
+
82
+ let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
83
+
84
+ let quote_char = if let Some(s) = quote_char {
85
+ if s.is_empty() {
86
+ None
87
+ } else {
88
+ Some(s.as_bytes()[0])
89
+ }
90
+ } else {
91
+ None
92
+ };
93
+
94
+ let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
95
+ let fields = overwrite_dtype.iter().map(|(name, dtype)| {
96
+ let dtype = dtype.0.clone();
97
+ Field::new(name, dtype)
98
+ });
99
+ Schema::from(fields)
100
+ });
101
+
102
+ let overwrite_dtype_slice = overwrite_dtype_slice.map(|overwrite_dtype| {
103
+ overwrite_dtype
104
+ .iter()
105
+ .map(|dt| dt.0.clone())
106
+ .collect::<Vec<_>>()
107
+ });
108
+
42
109
  let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
43
110
  let df = CsvReader::new(mmap_bytes_r)
111
+ .infer_schema(infer_schema_length)
44
112
  .has_header(has_header)
113
+ .with_n_rows(n_rows)
114
+ .with_delimiter(sep.as_bytes()[0])
115
+ .with_skip_rows(skip_rows)
116
+ .with_ignore_parser_errors(ignore_errors)
117
+ .with_projection(projection)
118
+ .with_rechunk(rechunk)
119
+ .with_chunk_size(chunk_size)
120
+ .with_encoding(encoding.0)
121
+ .with_columns(columns)
122
+ .with_n_threads(n_threads)
123
+ .with_path(path)
124
+ .with_dtypes(overwrite_dtype.as_ref())
125
+ .with_dtypes_slice(overwrite_dtype_slice.as_deref())
126
+ .low_memory(low_memory)
127
+ .with_comment_char(comment_char)
128
+ .with_null_values(null_values)
129
+ .with_parse_dates(parse_dates)
130
+ .with_quote_char(quote_char)
131
+ .with_end_of_line_char(eol_char)
132
+ .with_skip_rows_after_header(skip_rows_after_header)
133
+ .with_row_count(row_count)
134
+ .sample_size(sample_size)
45
135
  .finish()
46
136
  .map_err(RbPolarsErr::from)?;
47
137
  Ok(df.into())
@@ -56,6 +146,27 @@ impl RbDataFrame {
56
146
  .map(|v| v.into())
57
147
  }
58
148
 
149
+ pub fn read_ipc(
150
+ rb_f: Value,
151
+ columns: Option<Vec<String>>,
152
+ projection: Option<Vec<usize>>,
153
+ n_rows: Option<usize>,
154
+ row_count: Option<(String, IdxSize)>,
155
+ memory_map: bool,
156
+ ) -> RbResult<Self> {
157
+ let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
158
+ let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
159
+ let df = IpcReader::new(mmap_bytes_r)
160
+ .with_projection(projection)
161
+ .with_columns(columns)
162
+ .with_n_rows(n_rows)
163
+ .with_row_count(row_count)
164
+ .memory_mapped(memory_map)
165
+ .finish()
166
+ .map_err(RbPolarsErr::from)?;
167
+ Ok(RbDataFrame::new(df))
168
+ }
169
+
59
170
  pub fn read_json(rb_f: Value) -> RbResult<Self> {
60
171
  // memmap the file first
61
172
  let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
@@ -180,6 +291,28 @@ impl RbDataFrame {
180
291
  Ok(())
181
292
  }
182
293
 
294
+ pub fn write_ipc(
295
+ &self,
296
+ rb_f: Value,
297
+ compression: Wrap<Option<IpcCompression>>,
298
+ ) -> RbResult<()> {
299
+ if let Ok(s) = rb_f.try_convert::<String>() {
300
+ let f = std::fs::File::create(&s).unwrap();
301
+ IpcWriter::new(f)
302
+ .with_compression(compression.0)
303
+ .finish(&mut self.df.borrow_mut())
304
+ .map_err(RbPolarsErr::from)?;
305
+ } else {
306
+ let mut buf = get_file_like(rb_f, true)?;
307
+
308
+ IpcWriter::new(&mut buf)
309
+ .with_compression(compression.0)
310
+ .finish(&mut self.df.borrow_mut())
311
+ .map_err(RbPolarsErr::from)?;
312
+ }
313
+ Ok(())
314
+ }
315
+
183
316
  pub fn write_parquet(
184
317
  &self,
185
318
  rb_f: Value,
@@ -213,6 +346,11 @@ impl RbDataFrame {
213
346
  format!("{}", self.df.borrow())
214
347
  }
215
348
 
349
+ pub fn get_columns(&self) -> Vec<RbSeries> {
350
+ let cols = self.df.borrow().get_columns().clone();
351
+ to_rbseries_collection(cols)
352
+ }
353
+
216
354
  pub fn columns(&self) -> Vec<String> {
217
355
  self.df
218
356
  .borrow()
@@ -222,14 +360,27 @@ impl RbDataFrame {
222
360
  .collect()
223
361
  }
224
362
 
225
- pub fn dtypes(&self) -> Vec<String> {
363
+ pub fn set_column_names(&self, names: Vec<String>) -> RbResult<()> {
364
+ self.df
365
+ .borrow_mut()
366
+ .set_column_names(&names)
367
+ .map_err(RbPolarsErr::from)?;
368
+ Ok(())
369
+ }
370
+
371
+ pub fn dtypes(&self) -> Vec<Value> {
226
372
  self.df
227
373
  .borrow()
228
374
  .iter()
229
- .map(|s| s.dtype().to_string())
375
+ .map(|s| Wrap(s.dtype().clone()).into())
230
376
  .collect()
231
377
  }
232
378
 
379
+ pub fn n_chunks(&self) -> RbResult<usize> {
380
+ let n = self.df.borrow().n_chunks().map_err(RbPolarsErr::from)?;
381
+ Ok(n)
382
+ }
383
+
233
384
  pub fn shape(&self) -> (usize, usize) {
234
385
  self.df.borrow().shape()
235
386
  }
@@ -258,6 +409,28 @@ impl RbDataFrame {
258
409
  .map_err(RbPolarsErr::from)
259
410
  }
260
411
 
412
+ pub fn select(&self, selection: Vec<String>) -> RbResult<Self> {
413
+ let df = self
414
+ .df
415
+ .borrow()
416
+ .select(selection)
417
+ .map_err(RbPolarsErr::from)?;
418
+ Ok(RbDataFrame::new(df))
419
+ }
420
+
421
+ pub fn take(&self, indices: Vec<IdxSize>) -> RbResult<Self> {
422
+ let indices = IdxCa::from_vec("", indices);
423
+ let df = self.df.borrow().take(&indices).map_err(RbPolarsErr::from)?;
424
+ Ok(RbDataFrame::new(df))
425
+ }
426
+
427
+ pub fn take_with_series(&self, indices: &RbSeries) -> RbResult<Self> {
428
+ let binding = indices.series.borrow();
429
+ let idx = binding.idx().map_err(RbPolarsErr::from)?;
430
+ let df = self.df.borrow().take(idx).map_err(RbPolarsErr::from)?;
431
+ Ok(RbDataFrame::new(df))
432
+ }
433
+
261
434
  pub fn sort(&self, by_column: String, reverse: bool, nulls_last: bool) -> RbResult<Self> {
262
435
  let df = self
263
436
  .df
@@ -273,6 +446,38 @@ impl RbDataFrame {
273
446
  Ok(RbDataFrame::new(df))
274
447
  }
275
448
 
449
+ pub fn replace(&self, column: String, new_col: &RbSeries) -> RbResult<()> {
450
+ self.df
451
+ .borrow_mut()
452
+ .replace(&column, new_col.series.borrow().clone())
453
+ .map_err(RbPolarsErr::from)?;
454
+ Ok(())
455
+ }
456
+
457
+ pub fn replace_at_idx(&self, index: usize, new_col: &RbSeries) -> RbResult<()> {
458
+ self.df
459
+ .borrow_mut()
460
+ .replace_at_idx(index, new_col.series.borrow().clone())
461
+ .map_err(RbPolarsErr::from)?;
462
+ Ok(())
463
+ }
464
+
465
+ pub fn insert_at_idx(&self, index: usize, new_col: &RbSeries) -> RbResult<()> {
466
+ self.df
467
+ .borrow_mut()
468
+ .insert_at_idx(index, new_col.series.borrow().clone())
469
+ .map_err(RbPolarsErr::from)?;
470
+ Ok(())
471
+ }
472
+
473
+ pub fn slice(&self, offset: usize, length: Option<usize>) -> Self {
474
+ let df = self.df.borrow().slice(
475
+ offset as i64,
476
+ length.unwrap_or_else(|| self.df.borrow().height()),
477
+ );
478
+ df.into()
479
+ }
480
+
276
481
  pub fn head(&self, length: Option<usize>) -> Self {
277
482
  self.df.borrow().head(length).into()
278
483
  }
@@ -281,6 +486,20 @@ impl RbDataFrame {
281
486
  self.df.borrow().tail(length).into()
282
487
  }
283
488
 
489
+ pub fn is_unique(&self) -> RbResult<RbSeries> {
490
+ let mask = self.df.borrow().is_unique().map_err(RbPolarsErr::from)?;
491
+ Ok(mask.into_series().into())
492
+ }
493
+
494
+ pub fn is_duplicated(&self) -> RbResult<RbSeries> {
495
+ let mask = self
496
+ .df
497
+ .borrow()
498
+ .is_duplicated()
499
+ .map_err(RbPolarsErr::from)?;
500
+ Ok(mask.into_series().into())
501
+ }
502
+
284
503
  pub fn frame_equal(&self, other: &RbDataFrame, null_equal: bool) -> bool {
285
504
  if null_equal {
286
505
  self.df.borrow().frame_equal_missing(&other.df.borrow())
@@ -289,16 +508,202 @@ impl RbDataFrame {
289
508
  }
290
509
  }
291
510
 
511
+ pub fn with_row_count(&self, name: String, offset: Option<IdxSize>) -> RbResult<Self> {
512
+ let df = self
513
+ .df
514
+ .borrow()
515
+ .with_row_count(&name, offset)
516
+ .map_err(RbPolarsErr::from)?;
517
+ Ok(df.into())
518
+ }
519
+
520
+ pub fn clone(&self) -> Self {
521
+ RbDataFrame::new(self.df.borrow().clone())
522
+ }
523
+
524
+ pub fn melt(
525
+ &self,
526
+ id_vars: Vec<String>,
527
+ value_vars: Vec<String>,
528
+ value_name: Option<String>,
529
+ variable_name: Option<String>,
530
+ ) -> RbResult<Self> {
531
+ let args = MeltArgs {
532
+ id_vars,
533
+ value_vars,
534
+ value_name,
535
+ variable_name,
536
+ };
537
+
538
+ let df = self.df.borrow().melt2(args).map_err(RbPolarsErr::from)?;
539
+ Ok(RbDataFrame::new(df))
540
+ }
541
+
542
+ pub fn partition_by(&self, groups: Vec<String>, stable: bool) -> RbResult<Vec<Self>> {
543
+ let out = if stable {
544
+ self.df.borrow().partition_by_stable(groups)
545
+ } else {
546
+ self.df.borrow().partition_by(groups)
547
+ }
548
+ .map_err(RbPolarsErr::from)?;
549
+ Ok(out.into_iter().map(RbDataFrame::new).collect())
550
+ }
551
+
552
+ pub fn shift(&self, periods: i64) -> Self {
553
+ self.df.borrow().shift(periods).into()
554
+ }
555
+
556
+ pub fn unique(
557
+ &self,
558
+ maintain_order: bool,
559
+ subset: Option<Vec<String>>,
560
+ keep: Wrap<UniqueKeepStrategy>,
561
+ ) -> RbResult<Self> {
562
+ let subset = subset.as_ref().map(|v| v.as_ref());
563
+ let df = match maintain_order {
564
+ true => self.df.borrow().unique_stable(subset, keep.0),
565
+ false => self.df.borrow().unique(subset, keep.0),
566
+ }
567
+ .map_err(RbPolarsErr::from)?;
568
+ Ok(df.into())
569
+ }
570
+
292
571
  pub fn lazy(&self) -> RbLazyFrame {
293
572
  self.df.borrow().clone().lazy().into()
294
573
  }
295
574
 
575
+ pub fn max(&self) -> Self {
576
+ self.df.borrow().max().into()
577
+ }
578
+
579
+ pub fn min(&self) -> Self {
580
+ self.df.borrow().min().into()
581
+ }
582
+
583
+ pub fn sum(&self) -> Self {
584
+ self.df.borrow().sum().into()
585
+ }
586
+
296
587
  pub fn mean(&self) -> Self {
297
588
  self.df.borrow().mean().into()
298
589
  }
299
590
 
591
+ pub fn std(&self, ddof: u8) -> Self {
592
+ self.df.borrow().std(ddof).into()
593
+ }
594
+
595
+ pub fn var(&self, ddof: u8) -> Self {
596
+ self.df.borrow().var(ddof).into()
597
+ }
598
+
599
+ pub fn median(&self) -> Self {
600
+ self.df.borrow().median().into()
601
+ }
602
+
603
+ pub fn hmean(&self, null_strategy: Wrap<NullStrategy>) -> RbResult<Option<RbSeries>> {
604
+ let s = self
605
+ .df
606
+ .borrow()
607
+ .hmean(null_strategy.0)
608
+ .map_err(RbPolarsErr::from)?;
609
+ Ok(s.map(|s| s.into()))
610
+ }
611
+
612
+ pub fn hmax(&self) -> RbResult<Option<RbSeries>> {
613
+ let s = self.df.borrow().hmax().map_err(RbPolarsErr::from)?;
614
+ Ok(s.map(|s| s.into()))
615
+ }
616
+
617
+ pub fn hmin(&self) -> RbResult<Option<RbSeries>> {
618
+ let s = self.df.borrow().hmin().map_err(RbPolarsErr::from)?;
619
+ Ok(s.map(|s| s.into()))
620
+ }
621
+
622
+ pub fn hsum(&self, null_strategy: Wrap<NullStrategy>) -> RbResult<Option<RbSeries>> {
623
+ let s = self
624
+ .df
625
+ .borrow()
626
+ .hsum(null_strategy.0)
627
+ .map_err(RbPolarsErr::from)?;
628
+ Ok(s.map(|s| s.into()))
629
+ }
630
+
631
+ pub fn quantile(
632
+ &self,
633
+ quantile: f64,
634
+ interpolation: Wrap<QuantileInterpolOptions>,
635
+ ) -> RbResult<Self> {
636
+ let df = self
637
+ .df
638
+ .borrow()
639
+ .quantile(quantile, interpolation.0)
640
+ .map_err(RbPolarsErr::from)?;
641
+ Ok(df.into())
642
+ }
643
+
644
+ pub fn to_dummies(&self, columns: Option<Vec<String>>) -> RbResult<Self> {
645
+ let df = match columns {
646
+ Some(cols) => self
647
+ .df
648
+ .borrow()
649
+ .columns_to_dummies(cols.iter().map(|x| x as &str).collect()),
650
+ None => self.df.borrow().to_dummies(),
651
+ }
652
+ .map_err(RbPolarsErr::from)?;
653
+ Ok(df.into())
654
+ }
655
+
300
656
  pub fn null_count(&self) -> Self {
301
657
  let df = self.df.borrow().null_count();
302
658
  df.into()
303
659
  }
660
+
661
+ pub fn shrink_to_fit(&self) {
662
+ self.df.borrow_mut().shrink_to_fit();
663
+ }
664
+
665
+ pub fn transpose(&self, include_header: bool, names: String) -> RbResult<Self> {
666
+ let mut df = self.df.borrow().transpose().map_err(RbPolarsErr::from)?;
667
+ if include_header {
668
+ let s = Utf8Chunked::from_iter_values(
669
+ &names,
670
+ self.df.borrow().get_columns().iter().map(|s| s.name()),
671
+ )
672
+ .into_series();
673
+ df.insert_at_idx(0, s).unwrap();
674
+ }
675
+ Ok(df.into())
676
+ }
677
+
678
+ pub fn upsample(
679
+ &self,
680
+ by: Vec<String>,
681
+ index_column: String,
682
+ every: String,
683
+ offset: String,
684
+ stable: bool,
685
+ ) -> RbResult<Self> {
686
+ let out = if stable {
687
+ self.df.borrow().upsample_stable(
688
+ by,
689
+ &index_column,
690
+ Duration::parse(&every),
691
+ Duration::parse(&offset),
692
+ )
693
+ } else {
694
+ self.df.borrow().upsample(
695
+ by,
696
+ &index_column,
697
+ Duration::parse(&every),
698
+ Duration::parse(&offset),
699
+ )
700
+ };
701
+ let out = out.map_err(RbPolarsErr::from)?;
702
+ Ok(out.into())
703
+ }
704
+
705
+ pub fn unnest(&self, names: Vec<String>) -> RbResult<Self> {
706
+ let df = self.df.borrow().unnest(names).map_err(RbPolarsErr::from)?;
707
+ Ok(df.into())
708
+ }
304
709
  }
@@ -1,5 +1,6 @@
1
1
  use magnus::exception::arg_error;
2
2
  use magnus::Error;
3
+ use polars::error::ArrowError;
3
4
  use polars::prelude::PolarsError;
4
5
 
5
6
  pub struct RbPolarsErr {}
@@ -10,6 +11,14 @@ impl RbPolarsErr {
10
11
  Error::runtime_error(e.to_string())
11
12
  }
12
13
 
14
+ pub fn arrow(e: ArrowError) -> Error {
15
+ Error::runtime_error(e.to_string())
16
+ }
17
+
18
+ pub fn io(e: std::io::Error) -> Error {
19
+ Error::runtime_error(e.to_string())
20
+ }
21
+
13
22
  pub fn other(message: String) -> Error {
14
23
  Error::runtime_error(message)
15
24
  }
@@ -1,18 +1,19 @@
1
1
  use magnus::{Error, RString, Value};
2
2
  use polars::io::mmap::MmapBytesReader;
3
- use std::fs::{File, OpenOptions};
3
+ use std::fs::File;
4
4
  use std::io::Cursor;
5
5
  use std::path::PathBuf;
6
6
 
7
7
  use crate::RbResult;
8
8
 
9
9
  pub fn get_file_like(f: Value, truncate: bool) -> RbResult<File> {
10
- OpenOptions::new()
11
- .write(true)
12
- .create(true)
13
- .truncate(truncate)
14
- .open(f.try_convert::<PathBuf>()?)
15
- .map_err(|e| Error::runtime_error(e.to_string()))
10
+ let str_slice = f.try_convert::<PathBuf>()?;
11
+ let f = if truncate {
12
+ File::create(str_slice).map_err(|e| Error::runtime_error(e.to_string()))?
13
+ } else {
14
+ File::open(str_slice).map_err(|e| Error::runtime_error(e.to_string()))?
15
+ };
16
+ Ok(f)
16
17
  }
17
18
 
18
19
  pub fn get_mmap_bytes_reader(rb_f: Value) -> RbResult<Box<dyn MmapBytesReader>> {
@@ -0,0 +1,7 @@
1
+ use magnus::Value;
2
+ use polars::error::PolarsResult;
3
+ use polars::series::Series;
4
+
5
+ pub fn binary_lambda(_lambda: Value, _a: Series, _b: Series) -> PolarsResult<Series> {
6
+ todo!();
7
+ }