polars-df 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
data/Cargo.toml CHANGED
@@ -4,7 +4,6 @@ members = ["ext/polars"]
4
4
  [patch.crates-io]
5
5
  jsonpath_lib = { git = "https://github.com/ritchie46/jsonpath", rev = "24eaf0b4416edff38a4d1b6b17bc4b9f3f047b4b" }
6
6
  halfbrown = { git = "https://github.com/Licenser/halfbrown", rev = "952023c5dd6461b009bb5ba66b9aa979bd75949f" }
7
- arrow2 = { git = "https://github.com/ankane/arrow2", rev = "ef0270922a217070ba9942567c0ff3263ae8c531" }
8
7
 
9
8
  [profile.release]
10
9
  strip = true
data/README.md CHANGED
@@ -50,6 +50,9 @@ From Parquet
50
50
 
51
51
  ```ruby
52
52
  Polars.read_parquet("file.parquet")
53
+
54
+ # or lazily with
55
+ Polars.scan_parquet("file.parquet")
53
56
  ```
54
57
 
55
58
  From Active Record
@@ -60,6 +63,32 @@ Polars.read_sql(User.all)
60
63
  Polars.read_sql("SELECT * FROM users")
61
64
  ```
62
65
 
66
+ From JSON
67
+
68
+ ```ruby
69
+ Polars.read_json("file.json")
70
+ # or
71
+ Polars.read_ndjson("file.ndjson")
72
+
73
+ # or lazily with
74
+ Polars.scan_ndjson("file.ndjson")
75
+ ```
76
+
77
+ From Feather / Arrow IPC
78
+
79
+ ```ruby
80
+ Polars.read_ipc("file.arrow")
81
+
82
+ # or lazily with
83
+ Polars.scan_ipc("file.arrow")
84
+ ```
85
+
86
+ From Avro
87
+
88
+ ```ruby
89
+ Polars.read_avro("file.avro")
90
+ ```
91
+
63
92
  From a hash
64
93
 
65
94
  ```ruby
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "polars"
3
- version = "0.3.1"
3
+ version = "0.4.0"
4
4
  license = "MIT"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
@@ -12,11 +12,12 @@ crate-type = ["cdylib"]
12
12
  [dependencies]
13
13
  ahash = "0.8"
14
14
  magnus = "0.5"
15
- polars-core = "0.27.0"
15
+ polars-core = "0.28.0"
16
16
  serde_json = "1"
17
+ smartstring = "1"
17
18
 
18
19
  [dependencies.polars]
19
- version = "0.27.0"
20
+ version = "0.28.0"
20
21
  features = [
21
22
  "abs",
22
23
  "arange",
@@ -44,6 +45,7 @@ features = [
44
45
  "ipc",
45
46
  "is_first",
46
47
  "is_in",
48
+ "is_unique",
47
49
  "json",
48
50
  "lazy",
49
51
  "lazy_regex",
@@ -7,11 +7,17 @@ use std::cell::RefCell;
7
7
  use std::path::PathBuf;
8
8
 
9
9
  use crate::conversion::*;
10
+ use crate::prelude::read_impl::OwnedBatchedCsvReaderMmap;
10
11
  use crate::{RbDataFrame, RbPolarsErr, RbResult};
11
12
 
13
+ pub enum BatchedReader {
14
+ MMap(OwnedBatchedCsvReaderMmap),
15
+ Read(OwnedBatchedCsvReader),
16
+ }
17
+
12
18
  #[magnus::wrap(class = "Polars::RbBatchedCsv")]
13
19
  pub struct RbBatchedCsv {
14
- pub reader: RefCell<OwnedBatchedCsvReader>,
20
+ pub reader: RefCell<BatchedReader>,
15
21
  }
16
22
 
17
23
  impl RbBatchedCsv {
@@ -38,7 +44,7 @@ impl RbBatchedCsv {
38
44
  let comment_char: Option<String> = arguments[16].try_convert()?;
39
45
  let quote_char: Option<String> = arguments[17].try_convert()?;
40
46
  let null_values: Option<Wrap<NullValues>> = arguments[18].try_convert()?;
41
- let parse_dates: bool = arguments[19].try_convert()?;
47
+ let try_parse_dates: bool = arguments[19].try_convert()?;
42
48
  let skip_rows_after_header: usize = arguments[20].try_convert()?;
43
49
  let row_count: Option<(String, IdxSize)> = arguments[21].try_convert()?;
44
50
  let sample_size: usize = arguments[22].try_convert()?;
@@ -95,14 +101,24 @@ impl RbBatchedCsv {
95
101
  .low_memory(low_memory)
96
102
  .with_comment_char(comment_char)
97
103
  .with_null_values(null_values)
98
- .with_parse_dates(parse_dates)
104
+ .with_try_parse_dates(try_parse_dates)
99
105
  .with_quote_char(quote_char)
100
106
  .with_end_of_line_char(eol_char)
101
107
  .with_skip_rows_after_header(skip_rows_after_header)
102
108
  .with_row_count(row_count)
103
- .sample_size(sample_size)
104
- .batched(overwrite_dtype.map(Arc::new))
105
- .map_err(RbPolarsErr::from)?;
109
+ .sample_size(sample_size);
110
+
111
+ let reader = if low_memory {
112
+ let reader = reader
113
+ .batched_read(overwrite_dtype.map(Arc::new))
114
+ .map_err(RbPolarsErr::from)?;
115
+ BatchedReader::Read(reader)
116
+ } else {
117
+ let reader = reader
118
+ .batched_mmap(overwrite_dtype.map(Arc::new))
119
+ .map_err(RbPolarsErr::from)?;
120
+ BatchedReader::MMap(reader)
121
+ };
106
122
 
107
123
  Ok(RbBatchedCsv {
108
124
  reader: RefCell::new(reader),
@@ -110,13 +126,12 @@ impl RbBatchedCsv {
110
126
  }
111
127
 
112
128
  pub fn next_batches(&self, n: usize) -> RbResult<Option<RArray>> {
113
- let batches = self
114
- .reader
115
- .borrow_mut()
116
- .next_batches(n)
117
- .map_err(RbPolarsErr::from)?;
118
- Ok(batches.map(|batches| {
119
- RArray::from_iter(batches.into_iter().map(|out| RbDataFrame::from(out.1)))
120
- }))
129
+ let batches = match &mut *self.reader.borrow_mut() {
130
+ BatchedReader::MMap(reader) => reader.next_batches(n),
131
+ BatchedReader::Read(reader) => reader.next_batches(n),
132
+ }
133
+ .map_err(RbPolarsErr::from)?;
134
+
135
+ Ok(batches.map(|batches| RArray::from_iter(batches.into_iter().map(RbDataFrame::from))))
121
136
  }
122
137
  }
@@ -1,3 +1,6 @@
1
+ use std::fmt::{Display, Formatter};
2
+ use std::hash::{Hash, Hasher};
3
+
1
4
  use magnus::{
2
5
  class, exception, r_hash::ForEach, ruby_handle::RubyHandle, Integer, IntoValue, Module, RArray,
3
6
  RFloat, RHash, RString, Symbol, TryConvert, Value, QNIL,
@@ -10,8 +13,7 @@ use polars::frame::NullStrategy;
10
13
  use polars::io::avro::AvroCompression;
11
14
  use polars::prelude::*;
12
15
  use polars::series::ops::NullBehavior;
13
- use std::fmt::{Display, Formatter};
14
- use std::hash::{Hash, Hasher};
16
+ use smartstring::alias::String as SmartString;
15
17
 
16
18
  use crate::{RbDataFrame, RbLazyFrame, RbPolarsErr, RbResult, RbSeries, RbValueError};
17
19
 
@@ -82,6 +84,22 @@ impl TryConvert for Wrap<Utf8Chunked> {
82
84
  }
83
85
  }
84
86
 
87
+ impl TryConvert for Wrap<BinaryChunked> {
88
+ fn try_convert(obj: Value) -> RbResult<Self> {
89
+ let (seq, len) = get_rbseq(obj)?;
90
+ let mut builder = BinaryChunkedBuilder::new("", len, len * 25);
91
+
92
+ for res in seq.each() {
93
+ let item = res?;
94
+ match item.try_convert::<RString>() {
95
+ Ok(val) => builder.append_value(unsafe { val.as_slice() }),
96
+ Err(_) => builder.append_null(),
97
+ }
98
+ }
99
+ Ok(Wrap(builder.finish()))
100
+ }
101
+ }
102
+
85
103
  impl TryConvert for Wrap<NullValues> {
86
104
  fn try_convert(ob: Value) -> RbResult<Self> {
87
105
  if let Ok(s) = ob.try_convert::<String>() {
@@ -98,6 +116,14 @@ impl TryConvert for Wrap<NullValues> {
98
116
  }
99
117
  }
100
118
 
119
+ fn struct_dict<'a>(vals: impl Iterator<Item = AnyValue<'a>>, flds: &[Field]) -> Value {
120
+ let dict = RHash::new();
121
+ for (fld, val) in flds.iter().zip(vals) {
122
+ dict.aset(fld.name().as_str(), Wrap(val)).unwrap()
123
+ }
124
+ dict.into_value()
125
+ }
126
+
101
127
  impl IntoValue for Wrap<AnyValue<'_>> {
102
128
  fn into_value_with(self, _: &RubyHandle) -> Value {
103
129
  match self.0 {
@@ -114,7 +140,7 @@ impl IntoValue for Wrap<AnyValue<'_>> {
114
140
  AnyValue::Null => *QNIL,
115
141
  AnyValue::Boolean(v) => Value::from(v),
116
142
  AnyValue::Utf8(v) => Value::from(v),
117
- AnyValue::Utf8Owned(_v) => todo!(),
143
+ AnyValue::Utf8Owned(v) => Value::from(v.as_str()),
118
144
  AnyValue::Categorical(_idx, _rev, _arr) => todo!(),
119
145
  AnyValue::Date(v) => class::time()
120
146
  .funcall::<_, _, Value>("at", (v * 86400,))
@@ -157,12 +183,19 @@ impl IntoValue for Wrap<AnyValue<'_>> {
157
183
  AnyValue::Duration(_v, _tu) => todo!(),
158
184
  AnyValue::Time(_v) => todo!(),
159
185
  AnyValue::List(v) => RbSeries::new(v).to_a().into_value(),
160
- ref _av @ AnyValue::Struct(_, _, _flds) => todo!(),
161
- AnyValue::StructOwned(_payload) => todo!(),
162
- AnyValue::Object(_v) => todo!(),
163
- AnyValue::ObjectOwned(_v) => todo!(),
164
- AnyValue::Binary(_v) => todo!(),
165
- AnyValue::BinaryOwned(_v) => todo!(),
186
+ ref av @ AnyValue::Struct(_, _, flds) => struct_dict(av._iter_struct_av(), flds),
187
+ AnyValue::StructOwned(payload) => struct_dict(payload.0.into_iter(), &payload.1),
188
+ AnyValue::Object(v) => {
189
+ let object = v.as_any().downcast_ref::<ObjectValue>().unwrap();
190
+ object.inner
191
+ }
192
+ AnyValue::ObjectOwned(v) => {
193
+ let object = v.0.as_any().downcast_ref::<ObjectValue>().unwrap();
194
+ object.inner
195
+ }
196
+ AnyValue::Binary(v) => RString::from_slice(v).into_value(),
197
+ AnyValue::BinaryOwned(v) => RString::from_slice(&v).into_value(),
198
+ AnyValue::Decimal(_v, _scale) => todo!(),
166
199
  }
167
200
  }
168
201
  }
@@ -182,7 +215,7 @@ impl IntoValue for Wrap<DataType> {
182
215
  DataType::UInt64 => pl.const_get::<_, Value>("UInt64").unwrap(),
183
216
  DataType::Float32 => pl.const_get::<_, Value>("Float32").unwrap(),
184
217
  DataType::Float64 => pl.const_get::<_, Value>("Float64").unwrap(),
185
- DataType::Decimal128(_) => todo!(),
218
+ DataType::Decimal(_precision, _scale) => todo!(),
186
219
  DataType::Boolean => pl.const_get::<_, Value>("Boolean").unwrap(),
187
220
  DataType::Utf8 => pl.const_get::<_, Value>("Utf8").unwrap(),
188
221
  DataType::Binary => pl.const_get::<_, Value>("Binary").unwrap(),
@@ -210,7 +243,7 @@ impl IntoValue for Wrap<DataType> {
210
243
  DataType::Struct(fields) => {
211
244
  let field_class = pl.const_get::<_, Value>("Field").unwrap();
212
245
  let iter = fields.iter().map(|fld| {
213
- let name = fld.name().clone();
246
+ let name = fld.name().as_str();
214
247
  let dtype = Wrap(fld.data_type().clone());
215
248
  field_class
216
249
  .funcall::<_, _, Value>("new", (name, dtype))
@@ -340,7 +373,7 @@ impl<'s> TryConvert for Wrap<AnyValue<'s>> {
340
373
  let n = 25;
341
374
  let dtype = any_values_to_dtype(&avs[..std::cmp::min(avs.len(), n)])
342
375
  .map_err(RbPolarsErr::from)?;
343
- let s = Series::from_any_values_and_dtype("", &avs, &dtype)
376
+ let s = Series::from_any_values_and_dtype("", &avs, &dtype, true)
344
377
  .map_err(RbPolarsErr::from)?;
345
378
  Ok(Wrap(AnyValue::List(s)))
346
379
  }
@@ -870,3 +903,11 @@ pub fn parse_parquet_compression(
870
903
  };
871
904
  Ok(parsed)
872
905
  }
906
+
907
+ pub(crate) fn strings_to_smartstrings<I, S>(container: I) -> Vec<SmartString>
908
+ where
909
+ I: IntoIterator<Item = S>,
910
+ S: AsRef<str>,
911
+ {
912
+ container.into_iter().map(|s| s.as_ref().into()).collect()
913
+ }
@@ -115,7 +115,7 @@ impl RbDataFrame {
115
115
  let comment_char: Option<String> = arguments[17].try_convert()?;
116
116
  let quote_char: Option<String> = arguments[18].try_convert()?;
117
117
  let null_values: Option<Wrap<NullValues>> = arguments[19].try_convert()?;
118
- let parse_dates: bool = arguments[20].try_convert()?;
118
+ let try_parse_dates: bool = arguments[20].try_convert()?;
119
119
  let skip_rows_after_header: usize = arguments[21].try_convert()?;
120
120
  let row_count: Option<(String, IdxSize)> = arguments[22].try_convert()?;
121
121
  let sample_size: usize = arguments[23].try_convert()?;
@@ -168,12 +168,12 @@ impl RbDataFrame {
168
168
  .with_columns(columns)
169
169
  .with_n_threads(n_threads)
170
170
  .with_path(path)
171
- .with_dtypes(overwrite_dtype.as_ref())
171
+ .with_dtypes(overwrite_dtype.map(Arc::new))
172
172
  .with_dtypes_slice(overwrite_dtype_slice.as_deref())
173
173
  .low_memory(low_memory)
174
174
  .with_comment_char(comment_char)
175
175
  .with_null_values(null_values)
176
- .with_parse_dates(parse_dates)
176
+ .with_try_parse_dates(try_parse_dates)
177
177
  .with_quote_char(quote_char)
178
178
  .with_end_of_line_char(eol_char)
179
179
  .with_skip_rows_after_header(skip_rows_after_header)
@@ -184,6 +184,7 @@ impl RbDataFrame {
184
184
  Ok(df.into())
185
185
  }
186
186
 
187
+ #[allow(clippy::too_many_arguments)]
187
188
  pub fn read_parquet(
188
189
  rb_f: Value,
189
190
  columns: Option<Vec<String>>,
@@ -192,6 +193,8 @@ impl RbDataFrame {
192
193
  parallel: Wrap<ParallelStrategy>,
193
194
  row_count: Option<(String, IdxSize)>,
194
195
  low_memory: bool,
196
+ use_statistics: bool,
197
+ rechunk: bool,
195
198
  ) -> RbResult<Self> {
196
199
  let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
197
200
  let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
@@ -202,6 +205,8 @@ impl RbDataFrame {
202
205
  .with_n_rows(n_rows)
203
206
  .with_row_count(row_count)
204
207
  .set_low_memory(low_memory)
208
+ .use_statistics(use_statistics)
209
+ .set_rechunk(rechunk)
205
210
  .finish()
206
211
  .map_err(RbPolarsErr::from)?;
207
212
  Ok(RbDataFrame::new(df))
@@ -254,7 +259,7 @@ impl RbDataFrame {
254
259
  use polars::io::avro::AvroWriter;
255
260
 
256
261
  if let Ok(s) = rb_f.try_convert::<String>() {
257
- let f = std::fs::File::create(&s).unwrap();
262
+ let f = std::fs::File::create(s).unwrap();
258
263
  AvroWriter::new(f)
259
264
  .with_compression(compression.0)
260
265
  .finish(&mut self.df.borrow_mut())
@@ -339,7 +344,7 @@ impl RbDataFrame {
339
344
  // ensure the new names are used
340
345
  if let Some(schema) = &schema_overwrite {
341
346
  for (new_name, name) in schema.0.iter_names().zip(names.iter_mut()) {
342
- *name = new_name.clone();
347
+ *name = new_name.to_string();
343
348
  }
344
349
  }
345
350
  let rbdf = Self::finish_from_rows(
@@ -348,17 +353,19 @@ impl RbDataFrame {
348
353
  schema_overwrite.map(|wrap| wrap.0),
349
354
  )?;
350
355
 
351
- rbdf.df
352
- .borrow_mut()
353
- .get_columns_mut()
354
- .iter_mut()
355
- .zip(&names)
356
- .for_each(|(s, name)| {
357
- s.rename(name);
358
- });
356
+ unsafe {
357
+ rbdf.df
358
+ .borrow_mut()
359
+ .get_columns_mut()
360
+ .iter_mut()
361
+ .zip(&names)
362
+ .for_each(|(s, name)| {
363
+ s.rename(name);
364
+ });
365
+ }
359
366
  let length = names.len();
360
367
  if names.into_iter().collect::<PlHashSet<_>>().len() != length {
361
- let err = PolarsError::SchemaMisMatch("duplicate column names found".into());
368
+ let err = PolarsError::SchemaMismatch("duplicate column names found".into());
362
369
  Err(RbPolarsErr::from(err))?;
363
370
  }
364
371
 
@@ -394,7 +401,7 @@ impl RbDataFrame {
394
401
  let null = null_value.unwrap_or_default();
395
402
 
396
403
  if let Ok(s) = rb_f.try_convert::<String>() {
397
- let f = std::fs::File::create(&s).unwrap();
404
+ let f = std::fs::File::create(s).unwrap();
398
405
  // no need for a buffered writer, because the csv writer does internal buffering
399
406
  CsvWriter::new(f)
400
407
  .has_header(has_header)
@@ -436,7 +443,7 @@ impl RbDataFrame {
436
443
  compression: Wrap<Option<IpcCompression>>,
437
444
  ) -> RbResult<()> {
438
445
  if let Ok(s) = rb_f.try_convert::<String>() {
439
- let f = std::fs::File::create(&s).unwrap();
446
+ let f = std::fs::File::create(s).unwrap();
440
447
  IpcWriter::new(f)
441
448
  .with_compression(compression.0)
442
449
  .finish(&mut self.df.borrow_mut())
@@ -524,7 +531,7 @@ impl RbDataFrame {
524
531
  let compression = parse_parquet_compression(&compression, compression_level)?;
525
532
 
526
533
  if let Ok(s) = rb_f.try_convert::<String>() {
527
- let f = std::fs::File::create(&s).unwrap();
534
+ let f = std::fs::File::create(s).unwrap();
528
535
  ParquetWriter::new(f)
529
536
  .with_compression(compression)
530
537
  .with_statistics(statistics)
@@ -627,7 +634,7 @@ impl RbDataFrame {
627
634
  }
628
635
 
629
636
  pub fn get_columns(&self) -> RArray {
630
- let cols = self.df.borrow().get_columns().clone();
637
+ let cols = self.df.borrow().get_columns().to_vec();
631
638
  to_rbseries_collection(cols)
632
639
  }
633
640
 
@@ -881,10 +888,11 @@ impl RbDataFrame {
881
888
  variable_name: Option<String>,
882
889
  ) -> RbResult<Self> {
883
890
  let args = MeltArgs {
884
- id_vars,
885
- value_vars,
886
- value_name,
887
- variable_name,
891
+ id_vars: strings_to_smartstrings(id_vars),
892
+ value_vars: strings_to_smartstrings(value_vars),
893
+ value_name: value_name.map(|s| s.into()),
894
+ variable_name: variable_name.map(|s| s.into()),
895
+ streamable: false,
888
896
  };
889
897
 
890
898
  let df = self.df.borrow().melt2(args).map_err(RbPolarsErr::from)?;
@@ -897,22 +905,26 @@ impl RbDataFrame {
897
905
  values: Vec<String>,
898
906
  index: Vec<String>,
899
907
  columns: Vec<String>,
900
- aggregate_expr: &RbExpr,
901
908
  maintain_order: bool,
902
909
  sort_columns: bool,
910
+ aggregate_expr: Option<&RbExpr>,
903
911
  separator: Option<String>,
904
912
  ) -> RbResult<Self> {
905
913
  let fun = match maintain_order {
906
914
  true => pivot_stable,
907
915
  false => pivot,
908
916
  };
917
+ let agg_expr = match aggregate_expr {
918
+ Some(aggregate_expr) => Some(aggregate_expr.inner.clone()),
919
+ None => None,
920
+ };
909
921
  let df = fun(
910
922
  &self.df.borrow(),
911
923
  values,
912
924
  index,
913
925
  columns,
914
- aggregate_expr.inner.clone(),
915
926
  sort_columns,
927
+ agg_expr,
916
928
  separator.as_deref(),
917
929
  )
918
930
  .map_err(RbPolarsErr::from)?;
@@ -933,21 +945,6 @@ impl RbDataFrame {
933
945
  self.df.borrow().shift(periods).into()
934
946
  }
935
947
 
936
- pub fn unique(
937
- &self,
938
- maintain_order: bool,
939
- subset: Option<Vec<String>>,
940
- keep: Wrap<UniqueKeepStrategy>,
941
- ) -> RbResult<Self> {
942
- let subset = subset.as_ref().map(|v| v.as_ref());
943
- let df = match maintain_order {
944
- true => self.df.borrow().unique_stable(subset, keep.0),
945
- false => self.df.borrow().unique(subset, keep.0),
946
- }
947
- .map_err(RbPolarsErr::from)?;
948
- Ok(df.into())
949
- }
950
-
951
948
  pub fn lazy(&self) -> RbLazyFrame {
952
949
  self.df.borrow().clone().lazy().into()
953
950
  }
@@ -4,6 +4,7 @@ use polars::lazy::frame::{LazyFrame, LazyGroupBy};
4
4
  use polars::prelude::*;
5
5
  use std::cell::RefCell;
6
6
  use std::io::{BufWriter, Read};
7
+ use std::path::PathBuf;
7
8
 
8
9
  use crate::conversion::*;
9
10
  use crate::file::get_file_like;
@@ -118,7 +119,7 @@ impl RbLazyFrame {
118
119
  let skip_rows_after_header: usize = arguments[15].try_convert()?;
119
120
  let encoding: Wrap<CsvEncoding> = arguments[16].try_convert()?;
120
121
  let row_count: Option<(String, IdxSize)> = arguments[17].try_convert()?;
121
- let parse_dates: bool = arguments[18].try_convert()?;
122
+ let try_parse_dates: bool = arguments[18].try_convert()?;
122
123
  let eol_char: String = arguments[19].try_convert()?;
123
124
  // end arguments
124
125
 
@@ -153,7 +154,7 @@ impl RbLazyFrame {
153
154
  .with_skip_rows_after_header(skip_rows_after_header)
154
155
  .with_encoding(encoding.0)
155
156
  .with_row_count(row_count)
156
- .with_parse_dates(parse_dates)
157
+ .with_try_parse_dates(try_parse_dates)
157
158
  .with_null_values(null_values);
158
159
 
159
160
  if let Some(_lambda) = with_schema_modify {
@@ -163,6 +164,7 @@ impl RbLazyFrame {
163
164
  Ok(r.finish().map_err(RbPolarsErr::from)?.into())
164
165
  }
165
166
 
167
+ #[allow(clippy::too_many_arguments)]
166
168
  pub fn new_from_parquet(
167
169
  path: String,
168
170
  n_rows: Option<usize>,
@@ -171,6 +173,7 @@ impl RbLazyFrame {
171
173
  rechunk: bool,
172
174
  row_count: Option<(String, IdxSize)>,
173
175
  low_memory: bool,
176
+ use_statistics: bool,
174
177
  ) -> RbResult<Self> {
175
178
  let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
176
179
  let args = ScanArgsParquet {
@@ -182,6 +185,7 @@ impl RbLazyFrame {
182
185
  low_memory,
183
186
  // TODO support cloud options
184
187
  cloud_options: None,
188
+ use_statistics,
185
189
  };
186
190
  let lf = LazyFrame::scan_parquet(path, args).map_err(RbPolarsErr::from)?;
187
191
  Ok(lf.into())
@@ -284,6 +288,32 @@ impl RbLazyFrame {
284
288
  Ok(df.into())
285
289
  }
286
290
 
291
+ #[allow(clippy::too_many_arguments)]
292
+ pub fn sink_parquet(
293
+ &self,
294
+ path: PathBuf,
295
+ compression: String,
296
+ compression_level: Option<i32>,
297
+ statistics: bool,
298
+ row_group_size: Option<usize>,
299
+ data_pagesize_limit: Option<usize>,
300
+ maintain_order: bool,
301
+ ) -> RbResult<()> {
302
+ let compression = parse_parquet_compression(&compression, compression_level)?;
303
+
304
+ let options = ParquetWriteOptions {
305
+ compression,
306
+ statistics,
307
+ row_group_size,
308
+ data_pagesize_limit,
309
+ maintain_order,
310
+ };
311
+
312
+ let ldf = self.ldf.clone();
313
+ ldf.sink_parquet(path, options).map_err(RbPolarsErr::from)?;
314
+ Ok(())
315
+ }
316
+
287
317
  pub fn fetch(&self, n_rows: usize) -> RbResult<RbDataFrame> {
288
318
  let ldf = self.ldf.clone();
289
319
  let df = ldf.fetch(n_rows).map_err(RbPolarsErr::from)?;
@@ -328,7 +358,7 @@ impl RbLazyFrame {
328
358
  let lazy_gb = ldf.groupby_rolling(
329
359
  by,
330
360
  RollingGroupOptions {
331
- index_column,
361
+ index_column: index_column.into(),
332
362
  period: Duration::parse(&period),
333
363
  offset: Duration::parse(&offset),
334
364
  closed_window,
@@ -359,7 +389,7 @@ impl RbLazyFrame {
359
389
  let lazy_gb = ldf.groupby_dynamic(
360
390
  by,
361
391
  DynamicGroupOptions {
362
- index_column,
392
+ index_column: index_column.into(),
363
393
  every: Duration::parse(&every),
364
394
  period: Duration::parse(&period),
365
395
  offset: Duration::parse(&offset),
@@ -415,10 +445,10 @@ impl RbLazyFrame {
415
445
  .force_parallel(force_parallel)
416
446
  .how(JoinType::AsOf(AsOfOptions {
417
447
  strategy: strategy.0,
418
- left_by,
419
- right_by,
448
+ left_by: left_by.map(strings_to_smartstrings),
449
+ right_by: right_by.map(strings_to_smartstrings),
420
450
  tolerance: tolerance.map(|t| t.0.into_static().unwrap()),
421
- tolerance_str,
451
+ tolerance_str: tolerance_str.map(|s| s.into()),
422
452
  }))
423
453
  .suffix(suffix)
424
454
  .finish()
@@ -570,12 +600,14 @@ impl RbLazyFrame {
570
600
  value_vars: Vec<String>,
571
601
  value_name: Option<String>,
572
602
  variable_name: Option<String>,
603
+ streamable: bool,
573
604
  ) -> Self {
574
605
  let args = MeltArgs {
575
- id_vars,
576
- value_vars,
577
- value_name,
578
- variable_name,
606
+ id_vars: strings_to_smartstrings(id_vars),
607
+ value_vars: strings_to_smartstrings(value_vars),
608
+ value_name: value_name.map(|s| s.into()),
609
+ variable_name: variable_name.map(|s| s.into()),
610
+ streamable,
579
611
  };
580
612
 
581
613
  let ldf = self.ldf.clone();
@@ -596,8 +628,10 @@ impl RbLazyFrame {
596
628
  self.ldf.clone().into()
597
629
  }
598
630
 
599
- pub fn columns(&self) -> RbResult<Vec<String>> {
600
- Ok(self.get_schema()?.iter_names().cloned().collect())
631
+ pub fn columns(&self) -> RbResult<RArray> {
632
+ let schema = self.get_schema()?;
633
+ let iter = schema.iter_names().map(|s| s.as_str());
634
+ Ok(RArray::from_iter(iter))
601
635
  }
602
636
 
603
637
  pub fn dtypes(&self) -> RbResult<RArray> {
@@ -614,7 +648,7 @@ impl RbLazyFrame {
614
648
  // TODO remove unwrap
615
649
  schema_dict
616
650
  .aset::<String, Value>(
617
- fld.name().clone(),
651
+ fld.name().to_string(),
618
652
  Wrap(fld.data_type().clone()).into_value(),
619
653
  )
620
654
  .unwrap();