polars-df 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. checksums.yaml +4 -4
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +8 -0
  4. data/Cargo.lock +2 -1
  5. data/README.md +1 -1
  6. data/ext/polars/Cargo.toml +7 -1
  7. data/ext/polars/src/batched_csv.rs +120 -0
  8. data/ext/polars/src/conversion.rs +139 -6
  9. data/ext/polars/src/dataframe.rs +360 -15
  10. data/ext/polars/src/error.rs +9 -0
  11. data/ext/polars/src/file.rs +8 -7
  12. data/ext/polars/src/lazy/apply.rs +7 -0
  13. data/ext/polars/src/lazy/dataframe.rs +135 -3
  14. data/ext/polars/src/lazy/dsl.rs +97 -2
  15. data/ext/polars/src/lazy/meta.rs +1 -1
  16. data/ext/polars/src/lazy/mod.rs +1 -0
  17. data/ext/polars/src/lib.rs +227 -12
  18. data/ext/polars/src/series.rs +190 -38
  19. data/ext/polars/src/set.rs +91 -0
  20. data/ext/polars/src/utils.rs +19 -0
  21. data/lib/polars/batched_csv_reader.rb +96 -0
  22. data/lib/polars/cat_expr.rb +39 -0
  23. data/lib/polars/data_frame.rb +2813 -100
  24. data/lib/polars/date_time_expr.rb +1282 -7
  25. data/lib/polars/exceptions.rb +20 -0
  26. data/lib/polars/expr.rb +631 -11
  27. data/lib/polars/expr_dispatch.rb +14 -0
  28. data/lib/polars/functions.rb +219 -0
  29. data/lib/polars/group_by.rb +517 -0
  30. data/lib/polars/io.rb +763 -4
  31. data/lib/polars/lazy_frame.rb +1415 -67
  32. data/lib/polars/lazy_functions.rb +430 -9
  33. data/lib/polars/lazy_group_by.rb +79 -0
  34. data/lib/polars/list_expr.rb +5 -0
  35. data/lib/polars/meta_expr.rb +21 -0
  36. data/lib/polars/series.rb +2244 -192
  37. data/lib/polars/slice.rb +104 -0
  38. data/lib/polars/string_expr.rb +663 -2
  39. data/lib/polars/struct_expr.rb +73 -0
  40. data/lib/polars/utils.rb +76 -3
  41. data/lib/polars/version.rb +2 -1
  42. data/lib/polars/when.rb +1 -0
  43. data/lib/polars/when_then.rb +1 -0
  44. data/lib/polars.rb +8 -2
  45. metadata +12 -2
data/ext/polars/src/dataframe.rs
@@ -1,15 +1,14 @@
- use magnus::{r_hash::ForEach, Error, RArray, RHash, RString, Value};
+ use magnus::{r_hash::ForEach, RArray, RHash, RString, Value};
  use polars::io::mmap::ReaderBytes;
+ use polars::io::RowCount;
  use polars::prelude::*;
  use std::cell::RefCell;
- use std::fs::File;
- use std::io::{BufReader, BufWriter, Cursor};
+ use std::io::{BufWriter, Cursor};
  use std::ops::Deref;
- use std::path::PathBuf;

  use crate::conversion::*;
  use crate::file::{get_file_like, get_mmap_bytes_reader};
- use crate::series::to_rbseries_collection;
+ use crate::series::{to_rbseries_collection, to_series_collection};
  use crate::{series, RbLazyFrame, RbPolarsErr, RbResult, RbSeries};

  #[magnus::wrap(class = "Polars::RbDataFrame")]
@@ -43,22 +42,141 @@ impl RbDataFrame
          self.df.borrow().estimated_size()
      }

-     pub fn read_csv(rb_f: Value, has_header: bool) -> RbResult<Self> {
+     pub fn read_csv(arguments: &[Value]) -> RbResult<Self> {
+         // start arguments
+         // this pattern is needed for more than 16
+         let rb_f: Value = arguments[0].try_convert()?;
+         let infer_schema_length: Option<usize> = arguments[1].try_convert()?;
+         let chunk_size: usize = arguments[2].try_convert()?;
+         let has_header: bool = arguments[3].try_convert()?;
+         let ignore_errors: bool = arguments[4].try_convert()?;
+         let n_rows: Option<usize> = arguments[5].try_convert()?;
+         let skip_rows: usize = arguments[6].try_convert()?;
+         let projection: Option<Vec<usize>> = arguments[7].try_convert()?;
+         let sep: String = arguments[8].try_convert()?;
+         let rechunk: bool = arguments[9].try_convert()?;
+         let columns: Option<Vec<String>> = arguments[10].try_convert()?;
+         let encoding: Wrap<CsvEncoding> = arguments[11].try_convert()?;
+         let n_threads: Option<usize> = arguments[12].try_convert()?;
+         let path: Option<String> = arguments[13].try_convert()?;
+         let overwrite_dtype: Option<Vec<(String, Wrap<DataType>)>> = arguments[14].try_convert()?;
+         // TODO fix
+         let overwrite_dtype_slice: Option<Vec<Wrap<DataType>>> = None; // arguments[15].try_convert()?;
+         let low_memory: bool = arguments[16].try_convert()?;
+         let comment_char: Option<String> = arguments[17].try_convert()?;
+         let quote_char: Option<String> = arguments[18].try_convert()?;
+         let null_values: Option<Wrap<NullValues>> = arguments[19].try_convert()?;
+         let parse_dates: bool = arguments[20].try_convert()?;
+         let skip_rows_after_header: usize = arguments[21].try_convert()?;
+         let row_count: Option<(String, IdxSize)> = arguments[22].try_convert()?;
+         let sample_size: usize = arguments[23].try_convert()?;
+         let eol_char: String = arguments[24].try_convert()?;
+         // end arguments
+
+         let null_values = null_values.map(|w| w.0);
+         let comment_char = comment_char.map(|s| s.as_bytes()[0]);
+         let eol_char = eol_char.as_bytes()[0];
+
+         let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+
+         let quote_char = if let Some(s) = quote_char {
+             if s.is_empty() {
+                 None
+             } else {
+                 Some(s.as_bytes()[0])
+             }
+         } else {
+             None
+         };
+
+         let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
+             let fields = overwrite_dtype.iter().map(|(name, dtype)| {
+                 let dtype = dtype.0.clone();
+                 Field::new(name, dtype)
+             });
+             Schema::from(fields)
+         });
+
+         let overwrite_dtype_slice = overwrite_dtype_slice.map(|overwrite_dtype| {
+             overwrite_dtype
+                 .iter()
+                 .map(|dt| dt.0.clone())
+                 .collect::<Vec<_>>()
+         });
+
          let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
          let df = CsvReader::new(mmap_bytes_r)
+             .infer_schema(infer_schema_length)
              .has_header(has_header)
+             .with_n_rows(n_rows)
+             .with_delimiter(sep.as_bytes()[0])
+             .with_skip_rows(skip_rows)
+             .with_ignore_parser_errors(ignore_errors)
+             .with_projection(projection)
+             .with_rechunk(rechunk)
+             .with_chunk_size(chunk_size)
+             .with_encoding(encoding.0)
+             .with_columns(columns)
+             .with_n_threads(n_threads)
+             .with_path(path)
+             .with_dtypes(overwrite_dtype.as_ref())
+             .with_dtypes_slice(overwrite_dtype_slice.as_deref())
+             .low_memory(low_memory)
+             .with_comment_char(comment_char)
+             .with_null_values(null_values)
+             .with_parse_dates(parse_dates)
+             .with_quote_char(quote_char)
+             .with_end_of_line_char(eol_char)
+             .with_skip_rows_after_header(skip_rows_after_header)
+             .with_row_count(row_count)
+             .sample_size(sample_size)
              .finish()
              .map_err(RbPolarsErr::from)?;
          Ok(df.into())
      }

-     pub fn read_parquet(path: PathBuf) -> RbResult<Self> {
-         let f = File::open(&path).map_err(|e| Error::runtime_error(e.to_string()))?;
-         let reader = BufReader::new(f);
-         ParquetReader::new(reader)
+     pub fn read_parquet(
+         rb_f: Value,
+         columns: Option<Vec<String>>,
+         projection: Option<Vec<usize>>,
+         n_rows: Option<usize>,
+         parallel: Wrap<ParallelStrategy>,
+         row_count: Option<(String, IdxSize)>,
+         low_memory: bool,
+     ) -> RbResult<Self> {
+         let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+         let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
+         let df = ParquetReader::new(mmap_bytes_r)
+             .with_projection(projection)
+             .with_columns(columns)
+             .read_parallel(parallel.0)
+             .with_n_rows(n_rows)
+             .with_row_count(row_count)
+             .set_low_memory(low_memory)
              .finish()
-             .map_err(RbPolarsErr::from)
-             .map(|v| v.into())
+             .map_err(RbPolarsErr::from)?;
+         Ok(RbDataFrame::new(df))
+     }
+
+     pub fn read_ipc(
+         rb_f: Value,
+         columns: Option<Vec<String>>,
+         projection: Option<Vec<usize>>,
+         n_rows: Option<usize>,
+         row_count: Option<(String, IdxSize)>,
+         memory_map: bool,
+     ) -> RbResult<Self> {
+         let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+         let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
+         let df = IpcReader::new(mmap_bytes_r)
+             .with_projection(projection)
+             .with_columns(columns)
+             .with_n_rows(n_rows)
+             .with_row_count(row_count)
+             .memory_mapped(memory_map)
+             .finish()
+             .map_err(RbPolarsErr::from)?;
+         Ok(RbDataFrame::new(df))
      }

      pub fn read_json(rb_f: Value) -> RbResult<Self> {
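Note on the `&[Value]` signature above: the `// this pattern is needed for more than 16` comment refers to magnus's cap of sixteen typed arguments per bound function, so the Ruby side passes every reader option in one positional array and each element is converted individually with `try_convert`. The matching binding lives in `lib.rs` (changed in this release but not shown on this page); a minimal sketch of what that registration presumably looks like, with an illustrative helper name, is:

    // Sketch only -- the real registration is in data/ext/polars/src/lib.rs,
    // which this page does not show. Arity -1 makes magnus hand the raw
    // argument slice to the Rust function instead of a fixed number of
    // typed parameters.
    use magnus::{function, Error, Object, RClass};

    use crate::dataframe::RbDataFrame;

    fn init_dataframe(class: RClass) -> Result<(), Error> {
        // Ruby callers must supply the options in the exact order read_csv indexes them.
        class.define_singleton_method("read_csv", function!(RbDataFrame::read_csv, -1))?;
        Ok(())
    }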
@@ -185,6 +303,77 @@ impl RbDataFrame
          Ok(())
      }

+     pub fn write_ipc(
+         &self,
+         rb_f: Value,
+         compression: Wrap<Option<IpcCompression>>,
+     ) -> RbResult<()> {
+         if let Ok(s) = rb_f.try_convert::<String>() {
+             let f = std::fs::File::create(&s).unwrap();
+             IpcWriter::new(f)
+                 .with_compression(compression.0)
+                 .finish(&mut self.df.borrow_mut())
+                 .map_err(RbPolarsErr::from)?;
+         } else {
+             let mut buf = get_file_like(rb_f, true)?;
+
+             IpcWriter::new(&mut buf)
+                 .with_compression(compression.0)
+                 .finish(&mut self.df.borrow_mut())
+                 .map_err(RbPolarsErr::from)?;
+         }
+         Ok(())
+     }
+
+     pub fn row_tuple(&self, idx: i64) -> Value {
+         let idx = if idx < 0 {
+             (self.df.borrow().height() as i64 + idx) as usize
+         } else {
+             idx as usize
+         };
+         RArray::from_vec(
+             self.df
+                 .borrow()
+                 .get_columns()
+                 .iter()
+                 .map(|s| match s.dtype() {
+                     DataType::Object(_) => {
+                         let obj: Option<&ObjectValue> = s.get_object(idx).map(|any| any.into());
+                         obj.unwrap().to_object()
+                     }
+                     _ => Wrap(s.get(idx)).into(),
+                 })
+                 .collect(),
+         )
+         .into()
+     }
+
+     pub fn row_tuples(&self) -> Value {
+         let df = &self.df;
+         RArray::from_vec(
+             (0..df.borrow().height())
+                 .map(|idx| {
+                     RArray::from_vec(
+                         self.df
+                             .borrow()
+                             .get_columns()
+                             .iter()
+                             .map(|s| match s.dtype() {
+                                 DataType::Object(_) => {
+                                     let obj: Option<&ObjectValue> =
+                                         s.get_object(idx).map(|any| any.into());
+                                     obj.unwrap().to_object()
+                                 }
+                                 _ => Wrap(s.get(idx)).into(),
+                             })
+                             .collect(),
+                     )
+                 })
+                 .collect(),
+         )
+         .into()
+     }
+
      pub fn write_parquet(
          &self,
          rb_f: Value,
@@ -210,6 +399,86 @@ impl RbDataFrame
          Ok(())
      }

+     pub fn add(&self, s: &RbSeries) -> RbResult<Self> {
+         let df = (&*self.df.borrow() + &*s.series.borrow()).map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn sub(&self, s: &RbSeries) -> RbResult<Self> {
+         let df = (&*self.df.borrow() - &*s.series.borrow()).map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn div(&self, s: &RbSeries) -> RbResult<Self> {
+         let df = (&*self.df.borrow() / &*s.series.borrow()).map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn mul(&self, s: &RbSeries) -> RbResult<Self> {
+         let df = (&*self.df.borrow() * &*s.series.borrow()).map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn rem(&self, s: &RbSeries) -> RbResult<Self> {
+         let df = (&*self.df.borrow() % &*s.series.borrow()).map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn add_df(&self, s: &Self) -> RbResult<Self> {
+         let df = (&*self.df.borrow() + &*s.df.borrow()).map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn sub_df(&self, s: &Self) -> RbResult<Self> {
+         let df = (&*self.df.borrow() - &*s.df.borrow()).map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn div_df(&self, s: &Self) -> RbResult<Self> {
+         let df = (&*self.df.borrow() / &*s.df.borrow()).map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn mul_df(&self, s: &Self) -> RbResult<Self> {
+         let df = (&*self.df.borrow() * &*s.df.borrow()).map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn rem_df(&self, s: &Self) -> RbResult<Self> {
+         let df = (&*self.df.borrow() % &*s.df.borrow()).map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn sample_n(
+         &self,
+         n: usize,
+         with_replacement: bool,
+         shuffle: bool,
+         seed: Option<u64>,
+     ) -> RbResult<Self> {
+         let df = self
+             .df
+             .borrow()
+             .sample_n(n, with_replacement, shuffle, seed)
+             .map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn sample_frac(
+         &self,
+         frac: f64,
+         with_replacement: bool,
+         shuffle: bool,
+         seed: Option<u64>,
+     ) -> RbResult<Self> {
+         let df = self
+             .df
+             .borrow()
+             .sample_frac(frac, with_replacement, shuffle, seed)
+             .map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
      pub fn rechunk(&self) -> Self {
          self.df.borrow().agg_chunks().into()
      }
@@ -240,11 +509,11 @@ impl RbDataFrame
          Ok(())
      }

-     pub fn dtypes(&self) -> Vec<String> {
+     pub fn dtypes(&self) -> Vec<Value> {
          self.df
              .borrow()
              .iter()
-             .map(|s| s.dtype().to_string())
+             .map(|s| Wrap(s.dtype().clone()).into())
              .collect()
      }

@@ -265,6 +534,73 @@ impl RbDataFrame
          self.df.borrow().width()
      }

+     pub fn hstack_mut(&self, columns: RArray) -> RbResult<()> {
+         let columns = to_series_collection(columns)?;
+         self.df
+             .borrow_mut()
+             .hstack_mut(&columns)
+             .map_err(RbPolarsErr::from)?;
+         Ok(())
+     }
+
+     pub fn hstack(&self, columns: RArray) -> RbResult<Self> {
+         let columns = to_series_collection(columns)?;
+         let df = self
+             .df
+             .borrow()
+             .hstack(&columns)
+             .map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn extend(&self, df: &RbDataFrame) -> RbResult<()> {
+         self.df
+             .borrow_mut()
+             .extend(&df.df.borrow())
+             .map_err(RbPolarsErr::from)?;
+         Ok(())
+     }
+
+     pub fn vstack_mut(&self, df: &RbDataFrame) -> RbResult<()> {
+         self.df
+             .borrow_mut()
+             .vstack_mut(&df.df.borrow())
+             .map_err(RbPolarsErr::from)?;
+         Ok(())
+     }
+
+     pub fn vstack(&self, df: &RbDataFrame) -> RbResult<Self> {
+         let df = self
+             .df
+             .borrow()
+             .vstack(&df.df.borrow())
+             .map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn drop_in_place(&self, name: String) -> RbResult<RbSeries> {
+         let s = self
+             .df
+             .borrow_mut()
+             .drop_in_place(&name)
+             .map_err(RbPolarsErr::from)?;
+         Ok(RbSeries::new(s))
+     }
+
+     pub fn drop_nulls(&self, subset: Option<Vec<String>>) -> RbResult<Self> {
+         let df = self
+             .df
+             .borrow()
+             .drop_nulls(subset.as_ref().map(|s| s.as_ref()))
+             .map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn drop(&self, name: String) -> RbResult<Self> {
+         let df = self.df.borrow().drop(&name).map_err(RbPolarsErr::from)?;
+         Ok(RbDataFrame::new(df))
+     }
+
      pub fn select_at_idx(&self, idx: usize) -> Option<RbSeries> {
          self.df
              .borrow()
@@ -272,6 +608,10 @@ impl RbDataFrame
              .map(|s| RbSeries::new(s.clone()))
      }

+     pub fn find_idx_by_name(&self, name: String) -> Option<usize> {
+         self.df.borrow().find_idx_by_name(&name)
+     }
+
      // TODO remove clone
      pub fn column(&self, name: String) -> RbResult<RbSeries> {
          self.df
@@ -418,7 +758,7 @@ impl RbDataFrame
              self.df.borrow().partition_by(groups)
          }
          .map_err(RbPolarsErr::from)?;
-         Ok(out.into_iter().map(|v| RbDataFrame::new(v)).collect())
+         Ok(out.into_iter().map(RbDataFrame::new).collect())
      }

      pub fn shift(&self, periods: i64) -> Self {
@@ -574,6 +914,11 @@ impl RbDataFrame
          Ok(out.into())
      }

+     pub fn to_struct(&self, name: String) -> RbSeries {
+         let s = self.df.borrow().clone().into_struct(&name);
+         s.into_series().into()
+     }
+
      pub fn unnest(&self, names: Vec<String>) -> RbResult<Self> {
          let df = self.df.borrow().unnest(names).map_err(RbPolarsErr::from)?;
          Ok(df.into())
data/ext/polars/src/error.rs
@@ -1,5 +1,6 @@
  use magnus::exception::arg_error;
  use magnus::Error;
+ use polars::error::ArrowError;
  use polars::prelude::PolarsError;

  pub struct RbPolarsErr {}
@@ -10,6 +11,14 @@ impl RbPolarsErr
          Error::runtime_error(e.to_string())
      }

+     pub fn arrow(e: ArrowError) -> Error {
+         Error::runtime_error(e.to_string())
+     }
+
+     pub fn io(e: std::io::Error) -> Error {
+         Error::runtime_error(e.to_string())
+     }
+
      pub fn other(message: String) -> Error {
          Error::runtime_error(message)
      }
data/ext/polars/src/file.rs
@@ -1,18 +1,19 @@
  use magnus::{Error, RString, Value};
  use polars::io::mmap::MmapBytesReader;
- use std::fs::{File, OpenOptions};
+ use std::fs::File;
  use std::io::Cursor;
  use std::path::PathBuf;

  use crate::RbResult;

  pub fn get_file_like(f: Value, truncate: bool) -> RbResult<File> {
-     OpenOptions::new()
-         .write(true)
-         .create(true)
-         .truncate(truncate)
-         .open(f.try_convert::<PathBuf>()?)
-         .map_err(|e| Error::runtime_error(e.to_string()))
+     let str_slice = f.try_convert::<PathBuf>()?;
+     let f = if truncate {
+         File::create(str_slice).map_err(|e| Error::runtime_error(e.to_string()))?
+     } else {
+         File::open(str_slice).map_err(|e| Error::runtime_error(e.to_string()))?
+     };
+     Ok(f)
  }

  pub fn get_mmap_bytes_reader(rb_f: Value) -> RbResult<Box<dyn MmapBytesReader>> {
data/ext/polars/src/lazy/apply.rs
@@ -0,0 +1,7 @@
+ use magnus::Value;
+ use polars::error::PolarsResult;
+ use polars::series::Series;
+
+ pub fn binary_lambda(_lambda: Value, _a: Series, _b: Series) -> PolarsResult<Series> {
+     todo!();
+ }
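`binary_lambda` is the callback the expression engine would route two-Series operations through (py-polars uses the same hook for fold/reduce-style expressions); in 0.1.3 it is only a stub that panics via `todo!()`. Purely as a hedged sketch of the shape it could eventually take, assuming `RbSeries` values can be passed to a Ruby block and converted back, which this release does not yet exercise:

    // Hypothetical sketch, not the shipped code (which is todo!()).
    use magnus::Value;
    use polars::error::{PolarsError, PolarsResult};
    use polars::series::Series;

    use crate::RbSeries;

    pub fn binary_lambda(lambda: Value, a: Series, b: Series) -> PolarsResult<Series> {
        // Hand both inputs to the Ruby block as Polars::RbSeries objects.
        let out: Value = lambda
            .funcall("call", (RbSeries::new(a), RbSeries::new(b)))
            .map_err(|e| PolarsError::ComputeError(format!("{:?}", e).into()))?;
        // Expect the block to return a series and unwrap it back into a polars Series.
        let rb_series: &RbSeries = out
            .try_convert()
            .map_err(|e| PolarsError::ComputeError(format!("{:?}", e).into()))?;
        Ok(rb_series.series.borrow().clone())
    }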
data/ext/polars/src/lazy/dataframe.rs
@@ -1,4 +1,5 @@
  use magnus::{RArray, RHash, Value};
+ use polars::io::RowCount;
  use polars::lazy::frame::{LazyFrame, LazyGroupBy};
  use polars::prelude::*;
  use std::cell::RefCell;
@@ -52,6 +53,137 @@ impl From<LazyFrame> for RbLazyFrame
  }

  impl RbLazyFrame {
+     pub fn new_from_ndjson(
+         path: String,
+         infer_schema_length: Option<usize>,
+         batch_size: Option<usize>,
+         n_rows: Option<usize>,
+         low_memory: bool,
+         rechunk: bool,
+         row_count: Option<(String, IdxSize)>,
+     ) -> RbResult<Self> {
+         let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+
+         let lf = LazyJsonLineReader::new(path)
+             .with_infer_schema_length(infer_schema_length)
+             .with_batch_size(batch_size)
+             .with_n_rows(n_rows)
+             .low_memory(low_memory)
+             .with_rechunk(rechunk)
+             .with_row_count(row_count)
+             .finish()
+             .map_err(RbPolarsErr::from)?;
+         Ok(lf.into())
+     }
+
+     pub fn new_from_csv(arguments: &[Value]) -> RbResult<Self> {
+         // start arguments
+         // this pattern is needed for more than 16
+         let path: String = arguments[0].try_convert()?;
+         let sep: String = arguments[1].try_convert()?;
+         let has_header: bool = arguments[2].try_convert()?;
+         let ignore_errors: bool = arguments[3].try_convert()?;
+         let skip_rows: usize = arguments[4].try_convert()?;
+         let n_rows: Option<usize> = arguments[5].try_convert()?;
+         let cache: bool = arguments[6].try_convert()?;
+         let overwrite_dtype: Option<Vec<(String, Wrap<DataType>)>> = arguments[7].try_convert()?;
+         let low_memory: bool = arguments[8].try_convert()?;
+         let comment_char: Option<String> = arguments[9].try_convert()?;
+         let quote_char: Option<String> = arguments[10].try_convert()?;
+         let null_values: Option<Wrap<NullValues>> = arguments[11].try_convert()?;
+         let infer_schema_length: Option<usize> = arguments[12].try_convert()?;
+         let with_schema_modify: Option<Value> = arguments[13].try_convert()?;
+         let rechunk: bool = arguments[14].try_convert()?;
+         let skip_rows_after_header: usize = arguments[15].try_convert()?;
+         let encoding: Wrap<CsvEncoding> = arguments[16].try_convert()?;
+         let row_count: Option<(String, IdxSize)> = arguments[17].try_convert()?;
+         let parse_dates: bool = arguments[18].try_convert()?;
+         let eol_char: String = arguments[19].try_convert()?;
+         // end arguments
+
+         let null_values = null_values.map(|w| w.0);
+         let comment_char = comment_char.map(|s| s.as_bytes()[0]);
+         let quote_char = quote_char.map(|s| s.as_bytes()[0]);
+         let delimiter = sep.as_bytes()[0];
+         let eol_char = eol_char.as_bytes()[0];
+
+         let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+
+         let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
+             let fields = overwrite_dtype
+                 .into_iter()
+                 .map(|(name, dtype)| Field::new(&name, dtype.0));
+             Schema::from(fields)
+         });
+         let r = LazyCsvReader::new(path)
+             .with_infer_schema_length(infer_schema_length)
+             .with_delimiter(delimiter)
+             .has_header(has_header)
+             .with_ignore_parser_errors(ignore_errors)
+             .with_skip_rows(skip_rows)
+             .with_n_rows(n_rows)
+             .with_cache(cache)
+             .with_dtype_overwrite(overwrite_dtype.as_ref())
+             .low_memory(low_memory)
+             .with_comment_char(comment_char)
+             .with_quote_char(quote_char)
+             .with_end_of_line_char(eol_char)
+             .with_rechunk(rechunk)
+             .with_skip_rows_after_header(skip_rows_after_header)
+             .with_encoding(encoding.0)
+             .with_row_count(row_count)
+             .with_parse_dates(parse_dates)
+             .with_null_values(null_values);
+
+         if let Some(_lambda) = with_schema_modify {
+             todo!();
+         }
+
+         Ok(r.finish().map_err(RbPolarsErr::from)?.into())
+     }
+
+     pub fn new_from_parquet(
+         path: String,
+         n_rows: Option<usize>,
+         cache: bool,
+         parallel: Wrap<ParallelStrategy>,
+         rechunk: bool,
+         row_count: Option<(String, IdxSize)>,
+         low_memory: bool,
+     ) -> RbResult<Self> {
+         let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+         let args = ScanArgsParquet {
+             n_rows,
+             cache,
+             parallel: parallel.0,
+             rechunk,
+             row_count,
+             low_memory,
+         };
+         let lf = LazyFrame::scan_parquet(path, args).map_err(RbPolarsErr::from)?;
+         Ok(lf.into())
+     }
+
+     pub fn new_from_ipc(
+         path: String,
+         n_rows: Option<usize>,
+         cache: bool,
+         rechunk: bool,
+         row_count: Option<(String, IdxSize)>,
+         memory_map: bool,
+     ) -> RbResult<Self> {
+         let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+         let args = ScanArgsIpc {
+             n_rows,
+             cache,
+             rechunk,
+             row_count,
+             memmap: memory_map,
+         };
+         let lf = LazyFrame::scan_ipc(path, args).map_err(RbPolarsErr::from)?;
+         Ok(lf.into())
+     }
+
      pub fn write_json(&self, rb_f: Value) -> RbResult<()> {
          let file = BufWriter::new(get_file_like(rb_f, true)?);
          serde_json::to_writer(file, &self.ldf.logical_plan)
@@ -388,9 +520,9 @@ impl RbLazyFrame
          Ok(self.get_schema()?.iter_names().cloned().collect())
      }

-     pub fn dtypes(&self) -> RbResult<Vec<String>> {
+     pub fn dtypes(&self) -> RbResult<Vec<Value>> {
          let schema = self.get_schema()?;
-         let iter = schema.iter_dtypes().map(|dt| dt.to_string());
+         let iter = schema.iter_dtypes().map(|dt| Wrap(dt.clone()).into());
          Ok(iter.collect())
      }

@@ -401,7 +533,7 @@ impl RbLazyFrame
          schema.iter_fields().for_each(|fld| {
              // TODO remove unwrap
              schema_dict
-                 .aset(fld.name().clone(), fld.data_type().to_string())
+                 .aset::<String, Value>(fld.name().clone(), Wrap(fld.data_type().clone()).into())
                  .unwrap();
          });
          Ok(schema_dict)