polars-df 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Cargo.toml CHANGED
@@ -4,7 +4,6 @@ members = ["ext/polars"]
4
4
  [patch.crates-io]
5
5
  jsonpath_lib = { git = "https://github.com/ritchie46/jsonpath", rev = "24eaf0b4416edff38a4d1b6b17bc4b9f3f047b4b" }
6
6
  halfbrown = { git = "https://github.com/Licenser/halfbrown", rev = "952023c5dd6461b009bb5ba66b9aa979bd75949f" }
7
- arrow2 = { git = "https://github.com/ankane/arrow2", rev = "ef0270922a217070ba9942567c0ff3263ae8c531" }
8
7
 
9
8
  [profile.release]
10
9
  strip = true
data/README.md CHANGED
@@ -50,6 +50,9 @@ From Parquet
50
50
 
51
51
  ```ruby
52
52
  Polars.read_parquet("file.parquet")
53
+
54
+ # or lazily with
55
+ Polars.scan_parquet("file.parquet")
53
56
  ```
54
57
 
55
58
  From Active Record
@@ -60,6 +63,32 @@ Polars.read_sql(User.all)
60
63
  Polars.read_sql("SELECT * FROM users")
61
64
  ```
62
65
 
66
+ From JSON
67
+
68
+ ```ruby
69
+ Polars.read_json("file.json")
70
+ # or
71
+ Polars.read_ndjson("file.ndjson")
72
+
73
+ # or lazily with
74
+ Polars.scan_ndjson("file.ndjson")
75
+ ```
76
+
77
+ From Feather / Arrow IPC
78
+
79
+ ```ruby
80
+ Polars.read_ipc("file.arrow")
81
+
82
+ # or lazily with
83
+ Polars.scan_ipc("file.arrow")
84
+ ```
85
+
86
+ From Avro
87
+
88
+ ```ruby
89
+ Polars.read_avro("file.avro")
90
+ ```
91
+
63
92
  From a hash
64
93
 
65
94
  ```ruby
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "polars"
3
- version = "0.3.1"
3
+ version = "0.4.0"
4
4
  license = "MIT"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
@@ -12,11 +12,12 @@ crate-type = ["cdylib"]
12
12
  [dependencies]
13
13
  ahash = "0.8"
14
14
  magnus = "0.5"
15
- polars-core = "0.27.0"
15
+ polars-core = "0.28.0"
16
16
  serde_json = "1"
17
+ smartstring = "1"
17
18
 
18
19
  [dependencies.polars]
19
- version = "0.27.0"
20
+ version = "0.28.0"
20
21
  features = [
21
22
  "abs",
22
23
  "arange",
@@ -44,6 +45,7 @@ features = [
44
45
  "ipc",
45
46
  "is_first",
46
47
  "is_in",
48
+ "is_unique",
47
49
  "json",
48
50
  "lazy",
49
51
  "lazy_regex",
@@ -7,11 +7,17 @@ use std::cell::RefCell;
7
7
  use std::path::PathBuf;
8
8
 
9
9
  use crate::conversion::*;
10
+ use crate::prelude::read_impl::OwnedBatchedCsvReaderMmap;
10
11
  use crate::{RbDataFrame, RbPolarsErr, RbResult};
11
12
 
13
+ pub enum BatchedReader {
14
+ MMap(OwnedBatchedCsvReaderMmap),
15
+ Read(OwnedBatchedCsvReader),
16
+ }
17
+
12
18
  #[magnus::wrap(class = "Polars::RbBatchedCsv")]
13
19
  pub struct RbBatchedCsv {
14
- pub reader: RefCell<OwnedBatchedCsvReader>,
20
+ pub reader: RefCell<BatchedReader>,
15
21
  }
16
22
 
17
23
  impl RbBatchedCsv {
@@ -38,7 +44,7 @@ impl RbBatchedCsv {
38
44
  let comment_char: Option<String> = arguments[16].try_convert()?;
39
45
  let quote_char: Option<String> = arguments[17].try_convert()?;
40
46
  let null_values: Option<Wrap<NullValues>> = arguments[18].try_convert()?;
41
- let parse_dates: bool = arguments[19].try_convert()?;
47
+ let try_parse_dates: bool = arguments[19].try_convert()?;
42
48
  let skip_rows_after_header: usize = arguments[20].try_convert()?;
43
49
  let row_count: Option<(String, IdxSize)> = arguments[21].try_convert()?;
44
50
  let sample_size: usize = arguments[22].try_convert()?;
@@ -95,14 +101,24 @@ impl RbBatchedCsv {
95
101
  .low_memory(low_memory)
96
102
  .with_comment_char(comment_char)
97
103
  .with_null_values(null_values)
98
- .with_parse_dates(parse_dates)
104
+ .with_try_parse_dates(try_parse_dates)
99
105
  .with_quote_char(quote_char)
100
106
  .with_end_of_line_char(eol_char)
101
107
  .with_skip_rows_after_header(skip_rows_after_header)
102
108
  .with_row_count(row_count)
103
- .sample_size(sample_size)
104
- .batched(overwrite_dtype.map(Arc::new))
105
- .map_err(RbPolarsErr::from)?;
109
+ .sample_size(sample_size);
110
+
111
+ let reader = if low_memory {
112
+ let reader = reader
113
+ .batched_read(overwrite_dtype.map(Arc::new))
114
+ .map_err(RbPolarsErr::from)?;
115
+ BatchedReader::Read(reader)
116
+ } else {
117
+ let reader = reader
118
+ .batched_mmap(overwrite_dtype.map(Arc::new))
119
+ .map_err(RbPolarsErr::from)?;
120
+ BatchedReader::MMap(reader)
121
+ };
106
122
 
107
123
  Ok(RbBatchedCsv {
108
124
  reader: RefCell::new(reader),
@@ -110,13 +126,12 @@ impl RbBatchedCsv {
110
126
  }
111
127
 
112
128
  pub fn next_batches(&self, n: usize) -> RbResult<Option<RArray>> {
113
- let batches = self
114
- .reader
115
- .borrow_mut()
116
- .next_batches(n)
117
- .map_err(RbPolarsErr::from)?;
118
- Ok(batches.map(|batches| {
119
- RArray::from_iter(batches.into_iter().map(|out| RbDataFrame::from(out.1)))
120
- }))
129
+ let batches = match &mut *self.reader.borrow_mut() {
130
+ BatchedReader::MMap(reader) => reader.next_batches(n),
131
+ BatchedReader::Read(reader) => reader.next_batches(n),
132
+ }
133
+ .map_err(RbPolarsErr::from)?;
134
+
135
+ Ok(batches.map(|batches| RArray::from_iter(batches.into_iter().map(RbDataFrame::from))))
121
136
  }
122
137
  }
@@ -1,3 +1,6 @@
1
+ use std::fmt::{Display, Formatter};
2
+ use std::hash::{Hash, Hasher};
3
+
1
4
  use magnus::{
2
5
  class, exception, r_hash::ForEach, ruby_handle::RubyHandle, Integer, IntoValue, Module, RArray,
3
6
  RFloat, RHash, RString, Symbol, TryConvert, Value, QNIL,
@@ -10,8 +13,7 @@ use polars::frame::NullStrategy;
10
13
  use polars::io::avro::AvroCompression;
11
14
  use polars::prelude::*;
12
15
  use polars::series::ops::NullBehavior;
13
- use std::fmt::{Display, Formatter};
14
- use std::hash::{Hash, Hasher};
16
+ use smartstring::alias::String as SmartString;
15
17
 
16
18
  use crate::{RbDataFrame, RbLazyFrame, RbPolarsErr, RbResult, RbSeries, RbValueError};
17
19
 
@@ -82,6 +84,22 @@ impl TryConvert for Wrap<Utf8Chunked> {
82
84
  }
83
85
  }
84
86
 
87
+ impl TryConvert for Wrap<BinaryChunked> {
88
+ fn try_convert(obj: Value) -> RbResult<Self> {
89
+ let (seq, len) = get_rbseq(obj)?;
90
+ let mut builder = BinaryChunkedBuilder::new("", len, len * 25);
91
+
92
+ for res in seq.each() {
93
+ let item = res?;
94
+ match item.try_convert::<RString>() {
95
+ Ok(val) => builder.append_value(unsafe { val.as_slice() }),
96
+ Err(_) => builder.append_null(),
97
+ }
98
+ }
99
+ Ok(Wrap(builder.finish()))
100
+ }
101
+ }
102
+
85
103
  impl TryConvert for Wrap<NullValues> {
86
104
  fn try_convert(ob: Value) -> RbResult<Self> {
87
105
  if let Ok(s) = ob.try_convert::<String>() {
@@ -98,6 +116,14 @@ impl TryConvert for Wrap<NullValues> {
98
116
  }
99
117
  }
100
118
 
119
+ fn struct_dict<'a>(vals: impl Iterator<Item = AnyValue<'a>>, flds: &[Field]) -> Value {
120
+ let dict = RHash::new();
121
+ for (fld, val) in flds.iter().zip(vals) {
122
+ dict.aset(fld.name().as_str(), Wrap(val)).unwrap()
123
+ }
124
+ dict.into_value()
125
+ }
126
+
101
127
  impl IntoValue for Wrap<AnyValue<'_>> {
102
128
  fn into_value_with(self, _: &RubyHandle) -> Value {
103
129
  match self.0 {
@@ -114,7 +140,7 @@ impl IntoValue for Wrap<AnyValue<'_>> {
114
140
  AnyValue::Null => *QNIL,
115
141
  AnyValue::Boolean(v) => Value::from(v),
116
142
  AnyValue::Utf8(v) => Value::from(v),
117
- AnyValue::Utf8Owned(_v) => todo!(),
143
+ AnyValue::Utf8Owned(v) => Value::from(v.as_str()),
118
144
  AnyValue::Categorical(_idx, _rev, _arr) => todo!(),
119
145
  AnyValue::Date(v) => class::time()
120
146
  .funcall::<_, _, Value>("at", (v * 86400,))
@@ -157,12 +183,19 @@ impl IntoValue for Wrap<AnyValue<'_>> {
157
183
  AnyValue::Duration(_v, _tu) => todo!(),
158
184
  AnyValue::Time(_v) => todo!(),
159
185
  AnyValue::List(v) => RbSeries::new(v).to_a().into_value(),
160
- ref _av @ AnyValue::Struct(_, _, _flds) => todo!(),
161
- AnyValue::StructOwned(_payload) => todo!(),
162
- AnyValue::Object(_v) => todo!(),
163
- AnyValue::ObjectOwned(_v) => todo!(),
164
- AnyValue::Binary(_v) => todo!(),
165
- AnyValue::BinaryOwned(_v) => todo!(),
186
+ ref av @ AnyValue::Struct(_, _, flds) => struct_dict(av._iter_struct_av(), flds),
187
+ AnyValue::StructOwned(payload) => struct_dict(payload.0.into_iter(), &payload.1),
188
+ AnyValue::Object(v) => {
189
+ let object = v.as_any().downcast_ref::<ObjectValue>().unwrap();
190
+ object.inner
191
+ }
192
+ AnyValue::ObjectOwned(v) => {
193
+ let object = v.0.as_any().downcast_ref::<ObjectValue>().unwrap();
194
+ object.inner
195
+ }
196
+ AnyValue::Binary(v) => RString::from_slice(v).into_value(),
197
+ AnyValue::BinaryOwned(v) => RString::from_slice(&v).into_value(),
198
+ AnyValue::Decimal(_v, _scale) => todo!(),
166
199
  }
167
200
  }
168
201
  }
@@ -182,7 +215,7 @@ impl IntoValue for Wrap<DataType> {
182
215
  DataType::UInt64 => pl.const_get::<_, Value>("UInt64").unwrap(),
183
216
  DataType::Float32 => pl.const_get::<_, Value>("Float32").unwrap(),
184
217
  DataType::Float64 => pl.const_get::<_, Value>("Float64").unwrap(),
185
- DataType::Decimal128(_) => todo!(),
218
+ DataType::Decimal(_precision, _scale) => todo!(),
186
219
  DataType::Boolean => pl.const_get::<_, Value>("Boolean").unwrap(),
187
220
  DataType::Utf8 => pl.const_get::<_, Value>("Utf8").unwrap(),
188
221
  DataType::Binary => pl.const_get::<_, Value>("Binary").unwrap(),
@@ -210,7 +243,7 @@ impl IntoValue for Wrap<DataType> {
210
243
  DataType::Struct(fields) => {
211
244
  let field_class = pl.const_get::<_, Value>("Field").unwrap();
212
245
  let iter = fields.iter().map(|fld| {
213
- let name = fld.name().clone();
246
+ let name = fld.name().as_str();
214
247
  let dtype = Wrap(fld.data_type().clone());
215
248
  field_class
216
249
  .funcall::<_, _, Value>("new", (name, dtype))
@@ -340,7 +373,7 @@ impl<'s> TryConvert for Wrap<AnyValue<'s>> {
340
373
  let n = 25;
341
374
  let dtype = any_values_to_dtype(&avs[..std::cmp::min(avs.len(), n)])
342
375
  .map_err(RbPolarsErr::from)?;
343
- let s = Series::from_any_values_and_dtype("", &avs, &dtype)
376
+ let s = Series::from_any_values_and_dtype("", &avs, &dtype, true)
344
377
  .map_err(RbPolarsErr::from)?;
345
378
  Ok(Wrap(AnyValue::List(s)))
346
379
  }
@@ -870,3 +903,11 @@ pub fn parse_parquet_compression(
870
903
  };
871
904
  Ok(parsed)
872
905
  }
906
+
907
+ pub(crate) fn strings_to_smartstrings<I, S>(container: I) -> Vec<SmartString>
908
+ where
909
+ I: IntoIterator<Item = S>,
910
+ S: AsRef<str>,
911
+ {
912
+ container.into_iter().map(|s| s.as_ref().into()).collect()
913
+ }
@@ -115,7 +115,7 @@ impl RbDataFrame {
115
115
  let comment_char: Option<String> = arguments[17].try_convert()?;
116
116
  let quote_char: Option<String> = arguments[18].try_convert()?;
117
117
  let null_values: Option<Wrap<NullValues>> = arguments[19].try_convert()?;
118
- let parse_dates: bool = arguments[20].try_convert()?;
118
+ let try_parse_dates: bool = arguments[20].try_convert()?;
119
119
  let skip_rows_after_header: usize = arguments[21].try_convert()?;
120
120
  let row_count: Option<(String, IdxSize)> = arguments[22].try_convert()?;
121
121
  let sample_size: usize = arguments[23].try_convert()?;
@@ -168,12 +168,12 @@ impl RbDataFrame {
168
168
  .with_columns(columns)
169
169
  .with_n_threads(n_threads)
170
170
  .with_path(path)
171
- .with_dtypes(overwrite_dtype.as_ref())
171
+ .with_dtypes(overwrite_dtype.map(Arc::new))
172
172
  .with_dtypes_slice(overwrite_dtype_slice.as_deref())
173
173
  .low_memory(low_memory)
174
174
  .with_comment_char(comment_char)
175
175
  .with_null_values(null_values)
176
- .with_parse_dates(parse_dates)
176
+ .with_try_parse_dates(try_parse_dates)
177
177
  .with_quote_char(quote_char)
178
178
  .with_end_of_line_char(eol_char)
179
179
  .with_skip_rows_after_header(skip_rows_after_header)
@@ -184,6 +184,7 @@ impl RbDataFrame {
184
184
  Ok(df.into())
185
185
  }
186
186
 
187
+ #[allow(clippy::too_many_arguments)]
187
188
  pub fn read_parquet(
188
189
  rb_f: Value,
189
190
  columns: Option<Vec<String>>,
@@ -192,6 +193,8 @@ impl RbDataFrame {
192
193
  parallel: Wrap<ParallelStrategy>,
193
194
  row_count: Option<(String, IdxSize)>,
194
195
  low_memory: bool,
196
+ use_statistics: bool,
197
+ rechunk: bool,
195
198
  ) -> RbResult<Self> {
196
199
  let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
197
200
  let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
@@ -202,6 +205,8 @@ impl RbDataFrame {
202
205
  .with_n_rows(n_rows)
203
206
  .with_row_count(row_count)
204
207
  .set_low_memory(low_memory)
208
+ .use_statistics(use_statistics)
209
+ .set_rechunk(rechunk)
205
210
  .finish()
206
211
  .map_err(RbPolarsErr::from)?;
207
212
  Ok(RbDataFrame::new(df))
@@ -254,7 +259,7 @@ impl RbDataFrame {
254
259
  use polars::io::avro::AvroWriter;
255
260
 
256
261
  if let Ok(s) = rb_f.try_convert::<String>() {
257
- let f = std::fs::File::create(&s).unwrap();
262
+ let f = std::fs::File::create(s).unwrap();
258
263
  AvroWriter::new(f)
259
264
  .with_compression(compression.0)
260
265
  .finish(&mut self.df.borrow_mut())
@@ -339,7 +344,7 @@ impl RbDataFrame {
339
344
  // ensure the new names are used
340
345
  if let Some(schema) = &schema_overwrite {
341
346
  for (new_name, name) in schema.0.iter_names().zip(names.iter_mut()) {
342
- *name = new_name.clone();
347
+ *name = new_name.to_string();
343
348
  }
344
349
  }
345
350
  let rbdf = Self::finish_from_rows(
@@ -348,17 +353,19 @@ impl RbDataFrame {
348
353
  schema_overwrite.map(|wrap| wrap.0),
349
354
  )?;
350
355
 
351
- rbdf.df
352
- .borrow_mut()
353
- .get_columns_mut()
354
- .iter_mut()
355
- .zip(&names)
356
- .for_each(|(s, name)| {
357
- s.rename(name);
358
- });
356
+ unsafe {
357
+ rbdf.df
358
+ .borrow_mut()
359
+ .get_columns_mut()
360
+ .iter_mut()
361
+ .zip(&names)
362
+ .for_each(|(s, name)| {
363
+ s.rename(name);
364
+ });
365
+ }
359
366
  let length = names.len();
360
367
  if names.into_iter().collect::<PlHashSet<_>>().len() != length {
361
- let err = PolarsError::SchemaMisMatch("duplicate column names found".into());
368
+ let err = PolarsError::SchemaMismatch("duplicate column names found".into());
362
369
  Err(RbPolarsErr::from(err))?;
363
370
  }
364
371
 
@@ -394,7 +401,7 @@ impl RbDataFrame {
394
401
  let null = null_value.unwrap_or_default();
395
402
 
396
403
  if let Ok(s) = rb_f.try_convert::<String>() {
397
- let f = std::fs::File::create(&s).unwrap();
404
+ let f = std::fs::File::create(s).unwrap();
398
405
  // no need for a buffered writer, because the csv writer does internal buffering
399
406
  CsvWriter::new(f)
400
407
  .has_header(has_header)
@@ -436,7 +443,7 @@ impl RbDataFrame {
436
443
  compression: Wrap<Option<IpcCompression>>,
437
444
  ) -> RbResult<()> {
438
445
  if let Ok(s) = rb_f.try_convert::<String>() {
439
- let f = std::fs::File::create(&s).unwrap();
446
+ let f = std::fs::File::create(s).unwrap();
440
447
  IpcWriter::new(f)
441
448
  .with_compression(compression.0)
442
449
  .finish(&mut self.df.borrow_mut())
@@ -524,7 +531,7 @@ impl RbDataFrame {
524
531
  let compression = parse_parquet_compression(&compression, compression_level)?;
525
532
 
526
533
  if let Ok(s) = rb_f.try_convert::<String>() {
527
- let f = std::fs::File::create(&s).unwrap();
534
+ let f = std::fs::File::create(s).unwrap();
528
535
  ParquetWriter::new(f)
529
536
  .with_compression(compression)
530
537
  .with_statistics(statistics)
@@ -627,7 +634,7 @@ impl RbDataFrame {
627
634
  }
628
635
 
629
636
  pub fn get_columns(&self) -> RArray {
630
- let cols = self.df.borrow().get_columns().clone();
637
+ let cols = self.df.borrow().get_columns().to_vec();
631
638
  to_rbseries_collection(cols)
632
639
  }
633
640
 
@@ -881,10 +888,11 @@ impl RbDataFrame {
881
888
  variable_name: Option<String>,
882
889
  ) -> RbResult<Self> {
883
890
  let args = MeltArgs {
884
- id_vars,
885
- value_vars,
886
- value_name,
887
- variable_name,
891
+ id_vars: strings_to_smartstrings(id_vars),
892
+ value_vars: strings_to_smartstrings(value_vars),
893
+ value_name: value_name.map(|s| s.into()),
894
+ variable_name: variable_name.map(|s| s.into()),
895
+ streamable: false,
888
896
  };
889
897
 
890
898
  let df = self.df.borrow().melt2(args).map_err(RbPolarsErr::from)?;
@@ -897,22 +905,26 @@ impl RbDataFrame {
897
905
  values: Vec<String>,
898
906
  index: Vec<String>,
899
907
  columns: Vec<String>,
900
- aggregate_expr: &RbExpr,
901
908
  maintain_order: bool,
902
909
  sort_columns: bool,
910
+ aggregate_expr: Option<&RbExpr>,
903
911
  separator: Option<String>,
904
912
  ) -> RbResult<Self> {
905
913
  let fun = match maintain_order {
906
914
  true => pivot_stable,
907
915
  false => pivot,
908
916
  };
917
+ let agg_expr = match aggregate_expr {
918
+ Some(aggregate_expr) => Some(aggregate_expr.inner.clone()),
919
+ None => None,
920
+ };
909
921
  let df = fun(
910
922
  &self.df.borrow(),
911
923
  values,
912
924
  index,
913
925
  columns,
914
- aggregate_expr.inner.clone(),
915
926
  sort_columns,
927
+ agg_expr,
916
928
  separator.as_deref(),
917
929
  )
918
930
  .map_err(RbPolarsErr::from)?;
@@ -933,21 +945,6 @@ impl RbDataFrame {
933
945
  self.df.borrow().shift(periods).into()
934
946
  }
935
947
 
936
- pub fn unique(
937
- &self,
938
- maintain_order: bool,
939
- subset: Option<Vec<String>>,
940
- keep: Wrap<UniqueKeepStrategy>,
941
- ) -> RbResult<Self> {
942
- let subset = subset.as_ref().map(|v| v.as_ref());
943
- let df = match maintain_order {
944
- true => self.df.borrow().unique_stable(subset, keep.0),
945
- false => self.df.borrow().unique(subset, keep.0),
946
- }
947
- .map_err(RbPolarsErr::from)?;
948
- Ok(df.into())
949
- }
950
-
951
948
  pub fn lazy(&self) -> RbLazyFrame {
952
949
  self.df.borrow().clone().lazy().into()
953
950
  }
@@ -4,6 +4,7 @@ use polars::lazy::frame::{LazyFrame, LazyGroupBy};
4
4
  use polars::prelude::*;
5
5
  use std::cell::RefCell;
6
6
  use std::io::{BufWriter, Read};
7
+ use std::path::PathBuf;
7
8
 
8
9
  use crate::conversion::*;
9
10
  use crate::file::get_file_like;
@@ -118,7 +119,7 @@ impl RbLazyFrame {
118
119
  let skip_rows_after_header: usize = arguments[15].try_convert()?;
119
120
  let encoding: Wrap<CsvEncoding> = arguments[16].try_convert()?;
120
121
  let row_count: Option<(String, IdxSize)> = arguments[17].try_convert()?;
121
- let parse_dates: bool = arguments[18].try_convert()?;
122
+ let try_parse_dates: bool = arguments[18].try_convert()?;
122
123
  let eol_char: String = arguments[19].try_convert()?;
123
124
  // end arguments
124
125
 
@@ -153,7 +154,7 @@ impl RbLazyFrame {
153
154
  .with_skip_rows_after_header(skip_rows_after_header)
154
155
  .with_encoding(encoding.0)
155
156
  .with_row_count(row_count)
156
- .with_parse_dates(parse_dates)
157
+ .with_try_parse_dates(try_parse_dates)
157
158
  .with_null_values(null_values);
158
159
 
159
160
  if let Some(_lambda) = with_schema_modify {
@@ -163,6 +164,7 @@ impl RbLazyFrame {
163
164
  Ok(r.finish().map_err(RbPolarsErr::from)?.into())
164
165
  }
165
166
 
167
+ #[allow(clippy::too_many_arguments)]
166
168
  pub fn new_from_parquet(
167
169
  path: String,
168
170
  n_rows: Option<usize>,
@@ -171,6 +173,7 @@ impl RbLazyFrame {
171
173
  rechunk: bool,
172
174
  row_count: Option<(String, IdxSize)>,
173
175
  low_memory: bool,
176
+ use_statistics: bool,
174
177
  ) -> RbResult<Self> {
175
178
  let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
176
179
  let args = ScanArgsParquet {
@@ -182,6 +185,7 @@ impl RbLazyFrame {
182
185
  low_memory,
183
186
  // TODO support cloud options
184
187
  cloud_options: None,
188
+ use_statistics,
185
189
  };
186
190
  let lf = LazyFrame::scan_parquet(path, args).map_err(RbPolarsErr::from)?;
187
191
  Ok(lf.into())
@@ -284,6 +288,32 @@ impl RbLazyFrame {
284
288
  Ok(df.into())
285
289
  }
286
290
 
291
+ #[allow(clippy::too_many_arguments)]
292
+ pub fn sink_parquet(
293
+ &self,
294
+ path: PathBuf,
295
+ compression: String,
296
+ compression_level: Option<i32>,
297
+ statistics: bool,
298
+ row_group_size: Option<usize>,
299
+ data_pagesize_limit: Option<usize>,
300
+ maintain_order: bool,
301
+ ) -> RbResult<()> {
302
+ let compression = parse_parquet_compression(&compression, compression_level)?;
303
+
304
+ let options = ParquetWriteOptions {
305
+ compression,
306
+ statistics,
307
+ row_group_size,
308
+ data_pagesize_limit,
309
+ maintain_order,
310
+ };
311
+
312
+ let ldf = self.ldf.clone();
313
+ ldf.sink_parquet(path, options).map_err(RbPolarsErr::from)?;
314
+ Ok(())
315
+ }
316
+
287
317
  pub fn fetch(&self, n_rows: usize) -> RbResult<RbDataFrame> {
288
318
  let ldf = self.ldf.clone();
289
319
  let df = ldf.fetch(n_rows).map_err(RbPolarsErr::from)?;
@@ -328,7 +358,7 @@ impl RbLazyFrame {
328
358
  let lazy_gb = ldf.groupby_rolling(
329
359
  by,
330
360
  RollingGroupOptions {
331
- index_column,
361
+ index_column: index_column.into(),
332
362
  period: Duration::parse(&period),
333
363
  offset: Duration::parse(&offset),
334
364
  closed_window,
@@ -359,7 +389,7 @@ impl RbLazyFrame {
359
389
  let lazy_gb = ldf.groupby_dynamic(
360
390
  by,
361
391
  DynamicGroupOptions {
362
- index_column,
392
+ index_column: index_column.into(),
363
393
  every: Duration::parse(&every),
364
394
  period: Duration::parse(&period),
365
395
  offset: Duration::parse(&offset),
@@ -415,10 +445,10 @@ impl RbLazyFrame {
415
445
  .force_parallel(force_parallel)
416
446
  .how(JoinType::AsOf(AsOfOptions {
417
447
  strategy: strategy.0,
418
- left_by,
419
- right_by,
448
+ left_by: left_by.map(strings_to_smartstrings),
449
+ right_by: right_by.map(strings_to_smartstrings),
420
450
  tolerance: tolerance.map(|t| t.0.into_static().unwrap()),
421
- tolerance_str,
451
+ tolerance_str: tolerance_str.map(|s| s.into()),
422
452
  }))
423
453
  .suffix(suffix)
424
454
  .finish()
@@ -570,12 +600,14 @@ impl RbLazyFrame {
570
600
  value_vars: Vec<String>,
571
601
  value_name: Option<String>,
572
602
  variable_name: Option<String>,
603
+ streamable: bool,
573
604
  ) -> Self {
574
605
  let args = MeltArgs {
575
- id_vars,
576
- value_vars,
577
- value_name,
578
- variable_name,
606
+ id_vars: strings_to_smartstrings(id_vars),
607
+ value_vars: strings_to_smartstrings(value_vars),
608
+ value_name: value_name.map(|s| s.into()),
609
+ variable_name: variable_name.map(|s| s.into()),
610
+ streamable,
579
611
  };
580
612
 
581
613
  let ldf = self.ldf.clone();
@@ -596,8 +628,10 @@ impl RbLazyFrame {
596
628
  self.ldf.clone().into()
597
629
  }
598
630
 
599
- pub fn columns(&self) -> RbResult<Vec<String>> {
600
- Ok(self.get_schema()?.iter_names().cloned().collect())
631
+ pub fn columns(&self) -> RbResult<RArray> {
632
+ let schema = self.get_schema()?;
633
+ let iter = schema.iter_names().map(|s| s.as_str());
634
+ Ok(RArray::from_iter(iter))
601
635
  }
602
636
 
603
637
  pub fn dtypes(&self) -> RbResult<RArray> {
@@ -614,7 +648,7 @@ impl RbLazyFrame {
614
648
  // TODO remove unwrap
615
649
  schema_dict
616
650
  .aset::<String, Value>(
617
- fld.name().clone(),
651
+ fld.name().to_string(),
618
652
  Wrap(fld.data_type().clone()).into_value(),
619
653
  )
620
654
  .unwrap();