polars-df 0.1.1 → 0.1.3

Files changed (45)
  1. checksums.yaml +4 -4
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +8 -0
  4. data/Cargo.lock +2 -1
  5. data/README.md +1 -1
  6. data/ext/polars/Cargo.toml +7 -1
  7. data/ext/polars/src/batched_csv.rs +120 -0
  8. data/ext/polars/src/conversion.rs +139 -6
  9. data/ext/polars/src/dataframe.rs +360 -15
  10. data/ext/polars/src/error.rs +9 -0
  11. data/ext/polars/src/file.rs +8 -7
  12. data/ext/polars/src/lazy/apply.rs +7 -0
  13. data/ext/polars/src/lazy/dataframe.rs +135 -3
  14. data/ext/polars/src/lazy/dsl.rs +97 -2
  15. data/ext/polars/src/lazy/meta.rs +1 -1
  16. data/ext/polars/src/lazy/mod.rs +1 -0
  17. data/ext/polars/src/lib.rs +227 -12
  18. data/ext/polars/src/series.rs +190 -38
  19. data/ext/polars/src/set.rs +91 -0
  20. data/ext/polars/src/utils.rs +19 -0
  21. data/lib/polars/batched_csv_reader.rb +96 -0
  22. data/lib/polars/cat_expr.rb +39 -0
  23. data/lib/polars/data_frame.rb +2813 -100
  24. data/lib/polars/date_time_expr.rb +1282 -7
  25. data/lib/polars/exceptions.rb +20 -0
  26. data/lib/polars/expr.rb +631 -11
  27. data/lib/polars/expr_dispatch.rb +14 -0
  28. data/lib/polars/functions.rb +219 -0
  29. data/lib/polars/group_by.rb +517 -0
  30. data/lib/polars/io.rb +763 -4
  31. data/lib/polars/lazy_frame.rb +1415 -67
  32. data/lib/polars/lazy_functions.rb +430 -9
  33. data/lib/polars/lazy_group_by.rb +79 -0
  34. data/lib/polars/list_expr.rb +5 -0
  35. data/lib/polars/meta_expr.rb +21 -0
  36. data/lib/polars/series.rb +2244 -192
  37. data/lib/polars/slice.rb +104 -0
  38. data/lib/polars/string_expr.rb +663 -2
  39. data/lib/polars/struct_expr.rb +73 -0
  40. data/lib/polars/utils.rb +76 -3
  41. data/lib/polars/version.rb +2 -1
  42. data/lib/polars/when.rb +1 -0
  43. data/lib/polars/when_then.rb +1 -0
  44. data/lib/polars.rb +8 -2
  45. metadata +12 -2
data/ext/polars/src/dataframe.rs
@@ -1,15 +1,14 @@
- use magnus::{r_hash::ForEach, Error, RArray, RHash, RString, Value};
+ use magnus::{r_hash::ForEach, RArray, RHash, RString, Value};
  use polars::io::mmap::ReaderBytes;
+ use polars::io::RowCount;
  use polars::prelude::*;
  use std::cell::RefCell;
- use std::fs::File;
- use std::io::{BufReader, BufWriter, Cursor};
+ use std::io::{BufWriter, Cursor};
  use std::ops::Deref;
- use std::path::PathBuf;

  use crate::conversion::*;
  use crate::file::{get_file_like, get_mmap_bytes_reader};
- use crate::series::to_rbseries_collection;
+ use crate::series::{to_rbseries_collection, to_series_collection};
  use crate::{series, RbLazyFrame, RbPolarsErr, RbResult, RbSeries};

  #[magnus::wrap(class = "Polars::RbDataFrame")]
@@ -43,22 +42,141 @@ impl RbDataFrame
          self.df.borrow().estimated_size()
      }

-     pub fn read_csv(rb_f: Value, has_header: bool) -> RbResult<Self> {
+     pub fn read_csv(arguments: &[Value]) -> RbResult<Self> {
+         // start arguments
+         // this pattern is needed for more than 16
+         let rb_f: Value = arguments[0].try_convert()?;
+         let infer_schema_length: Option<usize> = arguments[1].try_convert()?;
+         let chunk_size: usize = arguments[2].try_convert()?;
+         let has_header: bool = arguments[3].try_convert()?;
+         let ignore_errors: bool = arguments[4].try_convert()?;
+         let n_rows: Option<usize> = arguments[5].try_convert()?;
+         let skip_rows: usize = arguments[6].try_convert()?;
+         let projection: Option<Vec<usize>> = arguments[7].try_convert()?;
+         let sep: String = arguments[8].try_convert()?;
+         let rechunk: bool = arguments[9].try_convert()?;
+         let columns: Option<Vec<String>> = arguments[10].try_convert()?;
+         let encoding: Wrap<CsvEncoding> = arguments[11].try_convert()?;
+         let n_threads: Option<usize> = arguments[12].try_convert()?;
+         let path: Option<String> = arguments[13].try_convert()?;
+         let overwrite_dtype: Option<Vec<(String, Wrap<DataType>)>> = arguments[14].try_convert()?;
+         // TODO fix
+         let overwrite_dtype_slice: Option<Vec<Wrap<DataType>>> = None; // arguments[15].try_convert()?;
+         let low_memory: bool = arguments[16].try_convert()?;
+         let comment_char: Option<String> = arguments[17].try_convert()?;
+         let quote_char: Option<String> = arguments[18].try_convert()?;
+         let null_values: Option<Wrap<NullValues>> = arguments[19].try_convert()?;
+         let parse_dates: bool = arguments[20].try_convert()?;
+         let skip_rows_after_header: usize = arguments[21].try_convert()?;
+         let row_count: Option<(String, IdxSize)> = arguments[22].try_convert()?;
+         let sample_size: usize = arguments[23].try_convert()?;
+         let eol_char: String = arguments[24].try_convert()?;
+         // end arguments
+
+         let null_values = null_values.map(|w| w.0);
+         let comment_char = comment_char.map(|s| s.as_bytes()[0]);
+         let eol_char = eol_char.as_bytes()[0];
+
+         let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+
+         let quote_char = if let Some(s) = quote_char {
+             if s.is_empty() {
+                 None
+             } else {
+                 Some(s.as_bytes()[0])
+             }
+         } else {
+             None
+         };
+
+         let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
+             let fields = overwrite_dtype.iter().map(|(name, dtype)| {
+                 let dtype = dtype.0.clone();
+                 Field::new(name, dtype)
+             });
+             Schema::from(fields)
+         });
+
+         let overwrite_dtype_slice = overwrite_dtype_slice.map(|overwrite_dtype| {
+             overwrite_dtype
+                 .iter()
+                 .map(|dt| dt.0.clone())
+                 .collect::<Vec<_>>()
+         });
+
          let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
          let df = CsvReader::new(mmap_bytes_r)
+             .infer_schema(infer_schema_length)
              .has_header(has_header)
+             .with_n_rows(n_rows)
+             .with_delimiter(sep.as_bytes()[0])
+             .with_skip_rows(skip_rows)
+             .with_ignore_parser_errors(ignore_errors)
+             .with_projection(projection)
+             .with_rechunk(rechunk)
+             .with_chunk_size(chunk_size)
+             .with_encoding(encoding.0)
+             .with_columns(columns)
+             .with_n_threads(n_threads)
+             .with_path(path)
+             .with_dtypes(overwrite_dtype.as_ref())
+             .with_dtypes_slice(overwrite_dtype_slice.as_deref())
+             .low_memory(low_memory)
+             .with_comment_char(comment_char)
+             .with_null_values(null_values)
+             .with_parse_dates(parse_dates)
+             .with_quote_char(quote_char)
+             .with_end_of_line_char(eol_char)
+             .with_skip_rows_after_header(skip_rows_after_header)
+             .with_row_count(row_count)
+             .sample_size(sample_size)
              .finish()
              .map_err(RbPolarsErr::from)?;
          Ok(df.into())
      }

-     pub fn read_parquet(path: PathBuf) -> RbResult<Self> {
-         let f = File::open(&path).map_err(|e| Error::runtime_error(e.to_string()))?;
-         let reader = BufReader::new(f);
-         ParquetReader::new(reader)
+     pub fn read_parquet(
+         rb_f: Value,
+         columns: Option<Vec<String>>,
+         projection: Option<Vec<usize>>,
+         n_rows: Option<usize>,
+         parallel: Wrap<ParallelStrategy>,
+         row_count: Option<(String, IdxSize)>,
+         low_memory: bool,
+     ) -> RbResult<Self> {
+         let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+         let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
+         let df = ParquetReader::new(mmap_bytes_r)
+             .with_projection(projection)
+             .with_columns(columns)
+             .read_parallel(parallel.0)
+             .with_n_rows(n_rows)
+             .with_row_count(row_count)
+             .set_low_memory(low_memory)
              .finish()
-             .map_err(RbPolarsErr::from)
-             .map(|v| v.into())
+             .map_err(RbPolarsErr::from)?;
+         Ok(RbDataFrame::new(df))
+     }
+
+     pub fn read_ipc(
+         rb_f: Value,
+         columns: Option<Vec<String>>,
+         projection: Option<Vec<usize>>,
+         n_rows: Option<usize>,
+         row_count: Option<(String, IdxSize)>,
+         memory_map: bool,
+     ) -> RbResult<Self> {
+         let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+         let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
+         let df = IpcReader::new(mmap_bytes_r)
+             .with_projection(projection)
+             .with_columns(columns)
+             .with_n_rows(n_rows)
+             .with_row_count(row_count)
+             .memory_mapped(memory_map)
+             .finish()
+             .map_err(RbPolarsErr::from)?;
+         Ok(RbDataFrame::new(df))
      }

      pub fn read_json(rb_f: Value) -> RbResult<Self> {
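The `arguments: &[Value]` signature above works around the argument limit noted in the "more than 16" comment: magnus' typed method macros stop at 16 Rust parameters, so wide readers take one slice of Ruby values and convert each positional argument themselves with `try_convert`. A minimal sketch of that pattern, not part of the gem (the function name and argument layout are made up; `RbResult` is the crate's `Result<T, magnus::Error>` alias used throughout this diff):

    use magnus::Value;

    use crate::RbResult;

    // Sketch only: convert a Ruby argument slice positionally, the way read_csv
    // above (and new_from_csv later in this diff) does. Indexing panics if Ruby
    // passes too few arguments, which the Ruby wrapper layer is expected to prevent.
    fn example_reader(arguments: &[Value]) -> RbResult<()> {
        let has_header: bool = arguments[0].try_convert()?;
        let n_rows: Option<usize> = arguments[1].try_convert()?; // Ruby nil becomes None
        let sep: String = arguments[2].try_convert()?;
        let _delimiter: u8 = sep.as_bytes()[0]; // single-byte separator, as in read_csv
        let _ = (has_header, n_rows);
        Ok(())
    }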
@@ -185,6 +303,77 @@ impl RbDataFrame
          Ok(())
      }

+     pub fn write_ipc(
+         &self,
+         rb_f: Value,
+         compression: Wrap<Option<IpcCompression>>,
+     ) -> RbResult<()> {
+         if let Ok(s) = rb_f.try_convert::<String>() {
+             let f = std::fs::File::create(&s).unwrap();
+             IpcWriter::new(f)
+                 .with_compression(compression.0)
+                 .finish(&mut self.df.borrow_mut())
+                 .map_err(RbPolarsErr::from)?;
+         } else {
+             let mut buf = get_file_like(rb_f, true)?;
+
+             IpcWriter::new(&mut buf)
+                 .with_compression(compression.0)
+                 .finish(&mut self.df.borrow_mut())
+                 .map_err(RbPolarsErr::from)?;
+         }
+         Ok(())
+     }
+
+     pub fn row_tuple(&self, idx: i64) -> Value {
+         let idx = if idx < 0 {
+             (self.df.borrow().height() as i64 + idx) as usize
+         } else {
+             idx as usize
+         };
+         RArray::from_vec(
+             self.df
+                 .borrow()
+                 .get_columns()
+                 .iter()
+                 .map(|s| match s.dtype() {
+                     DataType::Object(_) => {
+                         let obj: Option<&ObjectValue> = s.get_object(idx).map(|any| any.into());
+                         obj.unwrap().to_object()
+                     }
+                     _ => Wrap(s.get(idx)).into(),
+                 })
+                 .collect(),
+         )
+         .into()
+     }
+
+     pub fn row_tuples(&self) -> Value {
+         let df = &self.df;
+         RArray::from_vec(
+             (0..df.borrow().height())
+                 .map(|idx| {
+                     RArray::from_vec(
+                         self.df
+                             .borrow()
+                             .get_columns()
+                             .iter()
+                             .map(|s| match s.dtype() {
+                                 DataType::Object(_) => {
+                                     let obj: Option<&ObjectValue> =
+                                         s.get_object(idx).map(|any| any.into());
+                                     obj.unwrap().to_object()
+                                 }
+                                 _ => Wrap(s.get(idx)).into(),
+                             })
+                             .collect(),
+                     )
+                 })
+                 .collect(),
+         )
+         .into()
+     }
+
      pub fn write_parquet(
          &self,
          rb_f: Value,
@@ -210,6 +399,86 @@ impl RbDataFrame
          Ok(())
      }

+     pub fn add(&self, s: &RbSeries) -> RbResult<Self> {
+         let df = (&*self.df.borrow() + &*s.series.borrow()).map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn sub(&self, s: &RbSeries) -> RbResult<Self> {
+         let df = (&*self.df.borrow() - &*s.series.borrow()).map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn div(&self, s: &RbSeries) -> RbResult<Self> {
+         let df = (&*self.df.borrow() / &*s.series.borrow()).map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn mul(&self, s: &RbSeries) -> RbResult<Self> {
+         let df = (&*self.df.borrow() * &*s.series.borrow()).map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn rem(&self, s: &RbSeries) -> RbResult<Self> {
+         let df = (&*self.df.borrow() % &*s.series.borrow()).map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn add_df(&self, s: &Self) -> RbResult<Self> {
+         let df = (&*self.df.borrow() + &*s.df.borrow()).map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn sub_df(&self, s: &Self) -> RbResult<Self> {
+         let df = (&*self.df.borrow() - &*s.df.borrow()).map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn div_df(&self, s: &Self) -> RbResult<Self> {
+         let df = (&*self.df.borrow() / &*s.df.borrow()).map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn mul_df(&self, s: &Self) -> RbResult<Self> {
+         let df = (&*self.df.borrow() * &*s.df.borrow()).map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn rem_df(&self, s: &Self) -> RbResult<Self> {
+         let df = (&*self.df.borrow() % &*s.df.borrow()).map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn sample_n(
+         &self,
+         n: usize,
+         with_replacement: bool,
+         shuffle: bool,
+         seed: Option<u64>,
+     ) -> RbResult<Self> {
+         let df = self
+             .df
+             .borrow()
+             .sample_n(n, with_replacement, shuffle, seed)
+             .map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn sample_frac(
+         &self,
+         frac: f64,
+         with_replacement: bool,
+         shuffle: bool,
+         seed: Option<u64>,
+     ) -> RbResult<Self> {
+         let df = self
+             .df
+             .borrow()
+             .sample_frac(frac, with_replacement, shuffle, seed)
+             .map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
      pub fn rechunk(&self) -> Self {
          self.df.borrow().agg_chunks().into()
      }
@@ -240,11 +509,11 @@ impl RbDataFrame
          Ok(())
      }

-     pub fn dtypes(&self) -> Vec<String> {
+     pub fn dtypes(&self) -> Vec<Value> {
          self.df
              .borrow()
              .iter()
-             .map(|s| s.dtype().to_string())
+             .map(|s| Wrap(s.dtype().clone()).into())
              .collect()
      }

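`dtypes` now returns Ruby values built from the dtype itself instead of plain display strings, routed through the crate's `Wrap` newtype. The conversion lives in data/ext/polars/src/conversion.rs, which this page does not show; the sketch below is only an assumed shape of such an impl (a local newtype over the foreign `DataType` so a `From<...> for Value` conversion can be written without tripping the orphan rule), with a string rendering standing in for whatever Ruby representation the gem actually uses:

    use magnus::{RString, Value};
    use polars::prelude::DataType;

    // Assumed shape, not the gem's actual code.
    pub struct Wrap<T>(pub T);

    impl From<Wrap<DataType>> for Value {
        fn from(w: Wrap<DataType>) -> Self {
            // conversion.rs presumably builds a richer Ruby object here;
            // a plain string keeps the sketch self-contained.
            RString::new(&w.0.to_string()).into()
        }
    }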
@@ -265,6 +534,73 @@ impl RbDataFrame
          self.df.borrow().width()
      }

+     pub fn hstack_mut(&self, columns: RArray) -> RbResult<()> {
+         let columns = to_series_collection(columns)?;
+         self.df
+             .borrow_mut()
+             .hstack_mut(&columns)
+             .map_err(RbPolarsErr::from)?;
+         Ok(())
+     }
+
+     pub fn hstack(&self, columns: RArray) -> RbResult<Self> {
+         let columns = to_series_collection(columns)?;
+         let df = self
+             .df
+             .borrow()
+             .hstack(&columns)
+             .map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn extend(&self, df: &RbDataFrame) -> RbResult<()> {
+         self.df
+             .borrow_mut()
+             .extend(&df.df.borrow())
+             .map_err(RbPolarsErr::from)?;
+         Ok(())
+     }
+
+     pub fn vstack_mut(&self, df: &RbDataFrame) -> RbResult<()> {
+         self.df
+             .borrow_mut()
+             .vstack_mut(&df.df.borrow())
+             .map_err(RbPolarsErr::from)?;
+         Ok(())
+     }
+
+     pub fn vstack(&self, df: &RbDataFrame) -> RbResult<Self> {
+         let df = self
+             .df
+             .borrow()
+             .vstack(&df.df.borrow())
+             .map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn drop_in_place(&self, name: String) -> RbResult<RbSeries> {
+         let s = self
+             .df
+             .borrow_mut()
+             .drop_in_place(&name)
+             .map_err(RbPolarsErr::from)?;
+         Ok(RbSeries::new(s))
+     }
+
+     pub fn drop_nulls(&self, subset: Option<Vec<String>>) -> RbResult<Self> {
+         let df = self
+             .df
+             .borrow()
+             .drop_nulls(subset.as_ref().map(|s| s.as_ref()))
+             .map_err(RbPolarsErr::from)?;
+         Ok(df.into())
+     }
+
+     pub fn drop(&self, name: String) -> RbResult<Self> {
+         let df = self.df.borrow().drop(&name).map_err(RbPolarsErr::from)?;
+         Ok(RbDataFrame::new(df))
+     }
+
      pub fn select_at_idx(&self, idx: usize) -> Option<RbSeries> {
          self.df
              .borrow()
@@ -272,6 +608,10 @@ impl RbDataFrame
              .map(|s| RbSeries::new(s.clone()))
      }

+     pub fn find_idx_by_name(&self, name: String) -> Option<usize> {
+         self.df.borrow().find_idx_by_name(&name)
+     }
+
      // TODO remove clone
      pub fn column(&self, name: String) -> RbResult<RbSeries> {
          self.df
@@ -418,7 +758,7 @@ impl RbDataFrame
              self.df.borrow().partition_by(groups)
          }
          .map_err(RbPolarsErr::from)?;
-         Ok(out.into_iter().map(|v| RbDataFrame::new(v)).collect())
+         Ok(out.into_iter().map(RbDataFrame::new).collect())
      }

      pub fn shift(&self, periods: i64) -> Self {
@@ -574,6 +914,11 @@ impl RbDataFrame
          Ok(out.into())
      }

+     pub fn to_struct(&self, name: String) -> RbSeries {
+         let s = self.df.borrow().clone().into_struct(&name);
+         s.into_series().into()
+     }
+
      pub fn unnest(&self, names: Vec<String>) -> RbResult<Self> {
          let df = self.df.borrow().unnest(names).map_err(RbPolarsErr::from)?;
          Ok(df.into())
data/ext/polars/src/error.rs
@@ -1,5 +1,6 @@
  use magnus::exception::arg_error;
  use magnus::Error;
+ use polars::error::ArrowError;
  use polars::prelude::PolarsError;

  pub struct RbPolarsErr {}
@@ -10,6 +11,14 @@ impl RbPolarsErr
          Error::runtime_error(e.to_string())
      }

+     pub fn arrow(e: ArrowError) -> Error {
+         Error::runtime_error(e.to_string())
+     }
+
+     pub fn io(e: std::io::Error) -> Error {
+         Error::runtime_error(e.to_string())
+     }
+
      pub fn other(message: String) -> Error {
          Error::runtime_error(message)
      }
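The new `arrow` and `io` constructors follow the existing `from`/`other` helpers: each folds a non-Polars error into a Ruby `RuntimeError`. Their call sites are outside this hunk, so the snippet below is only an illustration of the intended usage (the function itself is hypothetical):

    use crate::{RbPolarsErr, RbResult};

    // Hypothetical call site: surface a std::io::Error to Ruby the same way
    // PolarsError is surfaced via .map_err(RbPolarsErr::from) elsewhere in this diff.
    fn create_output_file(path: &str) -> RbResult<std::fs::File> {
        std::fs::File::create(path).map_err(RbPolarsErr::io)
    }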
data/ext/polars/src/file.rs
@@ -1,18 +1,19 @@
  use magnus::{Error, RString, Value};
  use polars::io::mmap::MmapBytesReader;
- use std::fs::{File, OpenOptions};
+ use std::fs::File;
  use std::io::Cursor;
  use std::path::PathBuf;

  use crate::RbResult;

  pub fn get_file_like(f: Value, truncate: bool) -> RbResult<File> {
-     OpenOptions::new()
-         .write(true)
-         .create(true)
-         .truncate(truncate)
-         .open(f.try_convert::<PathBuf>()?)
-         .map_err(|e| Error::runtime_error(e.to_string()))
+     let str_slice = f.try_convert::<PathBuf>()?;
+     let f = if truncate {
+         File::create(str_slice).map_err(|e| Error::runtime_error(e.to_string()))?
+     } else {
+         File::open(str_slice).map_err(|e| Error::runtime_error(e.to_string()))?
+     };
+     Ok(f)
  }

  pub fn get_mmap_bytes_reader(rb_f: Value) -> RbResult<Box<dyn MmapBytesReader>> {
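The rewrite above makes the `truncate` flag choose between `File::create` (create or overwrite, the path used by writers such as write_ipc earlier in this diff) and `File::open` (read an existing file). A small sketch of both call shapes; `copy_through` itself is illustrative and not part of the gem:

    use magnus::Value;

    use crate::{RbPolarsErr, RbResult};

    // Sketch only: read from one Ruby-supplied path, write to another.
    fn copy_through(src: Value, dst: Value) -> RbResult<u64> {
        let mut input = get_file_like(src, false)?; // truncate = false: open for reading
        let mut output = get_file_like(dst, true)?; // truncate = true: create/overwrite
        std::io::copy(&mut input, &mut output).map_err(RbPolarsErr::io)
    }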
data/ext/polars/src/lazy/apply.rs
@@ -0,0 +1,7 @@
+ use magnus::Value;
+ use polars::error::PolarsResult;
+ use polars::series::Series;
+
+ pub fn binary_lambda(_lambda: Value, _a: Series, _b: Series) -> PolarsResult<Series> {
+     todo!();
+ }
data/ext/polars/src/lazy/dataframe.rs
@@ -1,4 +1,5 @@
  use magnus::{RArray, RHash, Value};
+ use polars::io::RowCount;
  use polars::lazy::frame::{LazyFrame, LazyGroupBy};
  use polars::prelude::*;
  use std::cell::RefCell;
@@ -52,6 +53,137 @@ impl From<LazyFrame> for RbLazyFrame
  }

  impl RbLazyFrame {
+     pub fn new_from_ndjson(
+         path: String,
+         infer_schema_length: Option<usize>,
+         batch_size: Option<usize>,
+         n_rows: Option<usize>,
+         low_memory: bool,
+         rechunk: bool,
+         row_count: Option<(String, IdxSize)>,
+     ) -> RbResult<Self> {
+         let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+
+         let lf = LazyJsonLineReader::new(path)
+             .with_infer_schema_length(infer_schema_length)
+             .with_batch_size(batch_size)
+             .with_n_rows(n_rows)
+             .low_memory(low_memory)
+             .with_rechunk(rechunk)
+             .with_row_count(row_count)
+             .finish()
+             .map_err(RbPolarsErr::from)?;
+         Ok(lf.into())
+     }
+
+     pub fn new_from_csv(arguments: &[Value]) -> RbResult<Self> {
+         // start arguments
+         // this pattern is needed for more than 16
+         let path: String = arguments[0].try_convert()?;
+         let sep: String = arguments[1].try_convert()?;
+         let has_header: bool = arguments[2].try_convert()?;
+         let ignore_errors: bool = arguments[3].try_convert()?;
+         let skip_rows: usize = arguments[4].try_convert()?;
+         let n_rows: Option<usize> = arguments[5].try_convert()?;
+         let cache: bool = arguments[6].try_convert()?;
+         let overwrite_dtype: Option<Vec<(String, Wrap<DataType>)>> = arguments[7].try_convert()?;
+         let low_memory: bool = arguments[8].try_convert()?;
+         let comment_char: Option<String> = arguments[9].try_convert()?;
+         let quote_char: Option<String> = arguments[10].try_convert()?;
+         let null_values: Option<Wrap<NullValues>> = arguments[11].try_convert()?;
+         let infer_schema_length: Option<usize> = arguments[12].try_convert()?;
+         let with_schema_modify: Option<Value> = arguments[13].try_convert()?;
+         let rechunk: bool = arguments[14].try_convert()?;
+         let skip_rows_after_header: usize = arguments[15].try_convert()?;
+         let encoding: Wrap<CsvEncoding> = arguments[16].try_convert()?;
+         let row_count: Option<(String, IdxSize)> = arguments[17].try_convert()?;
+         let parse_dates: bool = arguments[18].try_convert()?;
+         let eol_char: String = arguments[19].try_convert()?;
+         // end arguments
+
+         let null_values = null_values.map(|w| w.0);
+         let comment_char = comment_char.map(|s| s.as_bytes()[0]);
+         let quote_char = quote_char.map(|s| s.as_bytes()[0]);
+         let delimiter = sep.as_bytes()[0];
+         let eol_char = eol_char.as_bytes()[0];
+
+         let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+
+         let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
+             let fields = overwrite_dtype
+                 .into_iter()
+                 .map(|(name, dtype)| Field::new(&name, dtype.0));
+             Schema::from(fields)
+         });
+         let r = LazyCsvReader::new(path)
+             .with_infer_schema_length(infer_schema_length)
+             .with_delimiter(delimiter)
+             .has_header(has_header)
+             .with_ignore_parser_errors(ignore_errors)
+             .with_skip_rows(skip_rows)
+             .with_n_rows(n_rows)
+             .with_cache(cache)
+             .with_dtype_overwrite(overwrite_dtype.as_ref())
+             .low_memory(low_memory)
+             .with_comment_char(comment_char)
+             .with_quote_char(quote_char)
+             .with_end_of_line_char(eol_char)
+             .with_rechunk(rechunk)
+             .with_skip_rows_after_header(skip_rows_after_header)
+             .with_encoding(encoding.0)
+             .with_row_count(row_count)
+             .with_parse_dates(parse_dates)
+             .with_null_values(null_values);
+
+         if let Some(_lambda) = with_schema_modify {
+             todo!();
+         }
+
+         Ok(r.finish().map_err(RbPolarsErr::from)?.into())
+     }
+
+     pub fn new_from_parquet(
+         path: String,
+         n_rows: Option<usize>,
+         cache: bool,
+         parallel: Wrap<ParallelStrategy>,
+         rechunk: bool,
+         row_count: Option<(String, IdxSize)>,
+         low_memory: bool,
+     ) -> RbResult<Self> {
+         let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+         let args = ScanArgsParquet {
+             n_rows,
+             cache,
+             parallel: parallel.0,
+             rechunk,
+             row_count,
+             low_memory,
+         };
+         let lf = LazyFrame::scan_parquet(path, args).map_err(RbPolarsErr::from)?;
+         Ok(lf.into())
+     }
+
+     pub fn new_from_ipc(
+         path: String,
+         n_rows: Option<usize>,
+         cache: bool,
+         rechunk: bool,
+         row_count: Option<(String, IdxSize)>,
+         memory_map: bool,
+     ) -> RbResult<Self> {
+         let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+         let args = ScanArgsIpc {
+             n_rows,
+             cache,
+             rechunk,
+             row_count,
+             memmap: memory_map,
+         };
+         let lf = LazyFrame::scan_ipc(path, args).map_err(RbPolarsErr::from)?;
+         Ok(lf.into())
+     }
+
      pub fn write_json(&self, rb_f: Value) -> RbResult<()> {
          let file = BufWriter::new(get_file_like(rb_f, true)?);
          serde_json::to_writer(file, &self.ldf.logical_plan)
@@ -388,9 +520,9 @@ impl RbLazyFrame
          Ok(self.get_schema()?.iter_names().cloned().collect())
      }

-     pub fn dtypes(&self) -> RbResult<Vec<String>> {
+     pub fn dtypes(&self) -> RbResult<Vec<Value>> {
          let schema = self.get_schema()?;
-         let iter = schema.iter_dtypes().map(|dt| dt.to_string());
+         let iter = schema.iter_dtypes().map(|dt| Wrap(dt.clone()).into());
          Ok(iter.collect())
      }

@@ -401,7 +533,7 @@ impl RbLazyFrame
          schema.iter_fields().for_each(|fld| {
              // TODO remove unwrap
              schema_dict
-                 .aset(fld.name().clone(), fld.data_type().to_string())
+                 .aset::<String, Value>(fld.name().clone(), Wrap(fld.data_type().clone()).into())
                  .unwrap();
          });
          Ok(schema_dict)