polars-df 0.21.1 → 0.23.0

This diff shows the changes between two publicly available versions of a package, as they appear in the supported public registry to which they were released. It is provided for informational purposes only.
Files changed (82)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +15 -0
  3. data/Cargo.lock +120 -90
  4. data/Cargo.toml +3 -0
  5. data/README.md +20 -7
  6. data/ext/polars/Cargo.toml +18 -12
  7. data/ext/polars/src/batched_csv.rs +4 -4
  8. data/ext/polars/src/catalog/unity.rs +96 -94
  9. data/ext/polars/src/conversion/any_value.rs +39 -37
  10. data/ext/polars/src/conversion/chunked_array.rs +36 -29
  11. data/ext/polars/src/conversion/datetime.rs +11 -0
  12. data/ext/polars/src/conversion/mod.rs +244 -51
  13. data/ext/polars/src/dataframe/construction.rs +5 -17
  14. data/ext/polars/src/dataframe/export.rs +17 -15
  15. data/ext/polars/src/dataframe/general.rs +15 -17
  16. data/ext/polars/src/dataframe/io.rs +1 -2
  17. data/ext/polars/src/dataframe/mod.rs +25 -1
  18. data/ext/polars/src/dataframe/serde.rs +23 -8
  19. data/ext/polars/src/exceptions.rs +8 -5
  20. data/ext/polars/src/expr/datatype.rs +4 -4
  21. data/ext/polars/src/expr/datetime.rs +22 -28
  22. data/ext/polars/src/expr/general.rs +3 -10
  23. data/ext/polars/src/expr/list.rs +8 -24
  24. data/ext/polars/src/expr/meta.rs +4 -6
  25. data/ext/polars/src/expr/mod.rs +2 -0
  26. data/ext/polars/src/expr/name.rs +11 -14
  27. data/ext/polars/src/expr/serde.rs +28 -0
  28. data/ext/polars/src/expr/string.rs +5 -10
  29. data/ext/polars/src/file.rs +20 -14
  30. data/ext/polars/src/functions/business.rs +0 -1
  31. data/ext/polars/src/functions/io.rs +7 -4
  32. data/ext/polars/src/functions/lazy.rs +7 -6
  33. data/ext/polars/src/functions/meta.rs +3 -3
  34. data/ext/polars/src/functions/string_cache.rs +3 -3
  35. data/ext/polars/src/interop/arrow/to_ruby.rs +3 -3
  36. data/ext/polars/src/interop/numo/numo_rs.rs +4 -3
  37. data/ext/polars/src/io/mod.rs +23 -3
  38. data/ext/polars/src/lazyframe/general.rs +35 -50
  39. data/ext/polars/src/lazyframe/mod.rs +16 -1
  40. data/ext/polars/src/lazyframe/optflags.rs +57 -0
  41. data/ext/polars/src/lazyframe/serde.rs +27 -3
  42. data/ext/polars/src/lib.rs +144 -19
  43. data/ext/polars/src/map/dataframe.rs +18 -15
  44. data/ext/polars/src/map/lazy.rs +6 -5
  45. data/ext/polars/src/map/series.rs +7 -6
  46. data/ext/polars/src/on_startup.rs +12 -5
  47. data/ext/polars/src/rb_modules.rs +2 -2
  48. data/ext/polars/src/series/aggregation.rs +49 -29
  49. data/ext/polars/src/series/construction.rs +2 -0
  50. data/ext/polars/src/series/export.rs +38 -33
  51. data/ext/polars/src/series/general.rs +69 -31
  52. data/ext/polars/src/series/mod.rs +29 -4
  53. data/lib/polars/array_expr.rb +1 -1
  54. data/lib/polars/data_frame.rb +119 -15
  55. data/lib/polars/data_types.rb +23 -6
  56. data/lib/polars/date_time_expr.rb +36 -15
  57. data/lib/polars/expr.rb +41 -32
  58. data/lib/polars/functions/business.rb +95 -0
  59. data/lib/polars/functions/lazy.rb +1 -1
  60. data/lib/polars/iceberg_dataset.rb +113 -0
  61. data/lib/polars/io/iceberg.rb +34 -0
  62. data/lib/polars/io/ipc.rb +28 -49
  63. data/lib/polars/io/parquet.rb +7 -4
  64. data/lib/polars/io/scan_options.rb +12 -3
  65. data/lib/polars/io/utils.rb +17 -0
  66. data/lib/polars/lazy_frame.rb +97 -10
  67. data/lib/polars/list_expr.rb +21 -13
  68. data/lib/polars/list_name_space.rb +33 -21
  69. data/lib/polars/meta_expr.rb +25 -0
  70. data/lib/polars/query_opt_flags.rb +50 -0
  71. data/lib/polars/scan_cast_options.rb +23 -1
  72. data/lib/polars/schema.rb +1 -1
  73. data/lib/polars/selectors.rb +8 -8
  74. data/lib/polars/series.rb +26 -2
  75. data/lib/polars/string_expr.rb +27 -28
  76. data/lib/polars/string_name_space.rb +18 -5
  77. data/lib/polars/utils/convert.rb +2 -2
  78. data/lib/polars/utils/serde.rb +17 -0
  79. data/lib/polars/utils/various.rb +4 -0
  80. data/lib/polars/version.rb +1 -1
  81. data/lib/polars.rb +6 -0
  82. metadata +10 -1
@@ -3,7 +3,7 @@ use std::io;
3
3
  use std::io::{Cursor, Read, Seek, SeekFrom, Write};
4
4
  use std::path::PathBuf;
5
5
 
6
- use magnus::{Error, RString, Ruby, Value, exception, prelude::*, value::Opaque};
6
+ use magnus::{Error, RString, Ruby, Value, prelude::*, value::Opaque};
7
7
  use polars::io::cloud::CloudOptions;
8
8
  use polars::io::mmap::MmapBytesReader;
9
9
  use polars::prelude::PlPath;
@@ -67,23 +67,25 @@ impl RbFileLikeObject {
67
67
  /// ruby object has a `read`, `write`, and `seek` methods in respect to parameters.
68
68
  /// Will return a `TypeError` if object does not have `read`, `seek`, and `write` methods.
69
69
  pub fn with_requirements(object: Value, read: bool, write: bool, seek: bool) -> RbResult<Self> {
70
+ let ruby = Ruby::get_with(object);
71
+
70
72
  if read && !object.respond_to("read", false)? {
71
73
  return Err(Error::new(
72
- exception::type_error(),
74
+ ruby.exception_type_error(),
73
75
  "Object does not have a .read() method.",
74
76
  ));
75
77
  }
76
78
 
77
79
  if seek && !object.respond_to("seek", false)? {
78
80
  return Err(Error::new(
79
- exception::type_error(),
81
+ ruby.exception_type_error(),
80
82
  "Object does not have a .seek() method.",
81
83
  ));
82
84
  }
83
85
 
84
86
  if write && !object.respond_to("write", false)? {
85
87
  return Err(Error::new(
86
- exception::type_error(),
88
+ ruby.exception_type_error(),
87
89
  "Object does not have a .write() method.",
88
90
  ));
89
91
  }
@@ -113,10 +115,10 @@ impl Read for RbFileLikeObject {
113
115
 
114
116
  impl Write for RbFileLikeObject {
115
117
  fn write(&mut self, buf: &[u8]) -> Result<usize, io::Error> {
116
- let rbbytes = RString::from_slice(buf);
118
+ let ruby = Ruby::get().unwrap();
119
+ let rbbytes = ruby.str_from_slice(buf);
117
120
 
118
- let number_bytes_written = Ruby::get()
119
- .unwrap()
121
+ let number_bytes_written = ruby
120
122
  .get_inner(self.inner)
121
123
  .funcall::<_, _, usize>("write", (rbbytes,))
122
124
  .map_err(rberr_to_io_err)?;
@@ -143,13 +145,13 @@ impl Seek for RbFileLikeObject {
143
145
  SeekFrom::End(i) => (2, i),
144
146
  };
145
147
 
146
- let new_position = Ruby::get()
147
- .unwrap()
148
- .get_inner(self.inner)
149
- .funcall("seek", (offset, whence))
148
+ let inner = Ruby::get().unwrap().get_inner(self.inner);
149
+
150
+ inner
151
+ .funcall::<_, _, Value>("seek", (offset, whence))
150
152
  .map_err(rberr_to_io_err)?;
151
153
 
152
- Ok(new_position)
154
+ inner.funcall("tell", ()).map_err(rberr_to_io_err)
153
155
  }
154
156
  }
155
157
 
@@ -265,8 +267,12 @@ pub fn get_mmap_bytes_reader_and_path<'a>(
265
267
  RbReadBytes::Bytes(v) => Ok((Box::new(Cursor::new(unsafe { v.as_slice() })), None)),
266
268
  RbReadBytes::Other(v) => {
267
269
  let path = PathBuf::try_convert(*v)?;
268
- let f = File::open(&path)
269
- .map_err(|e| Error::new(exception::runtime_error(), e.to_string()))?;
270
+ let f = File::open(&path).map_err(|e| {
271
+ Error::new(
272
+ Ruby::get().unwrap().exception_runtime_error(),
273
+ e.to_string(),
274
+ )
275
+ })?;
270
276
  Ok((Box::new(f), Some(path)))
271
277
  }
272
278
  }
@@ -2,7 +2,6 @@ use polars::lazy::dsl;
2
2
 
3
3
  use crate::RbExpr;
4
4
 
5
- // TODO add to Ruby
6
5
  pub fn business_day_count(
7
6
  start: &RbExpr,
8
7
  end: &RbExpr,
@@ -1,6 +1,6 @@
1
1
  use std::io::BufReader;
2
2
 
3
- use magnus::{RHash, Value};
3
+ use magnus::{RHash, Ruby, Value};
4
4
  use polars::prelude::ArrowSchema;
5
5
 
6
6
  use crate::conversion::Wrap;
@@ -16,7 +16,8 @@ pub fn read_ipc_schema(rb_f: Value) -> RbResult<RHash> {
16
16
  EitherRustRubyFile::Rb(mut r) => read_file_metadata(&mut r).map_err(RbPolarsErr::from)?,
17
17
  };
18
18
 
19
- let dict = RHash::new();
19
+ let ruby = Ruby::get_with(rb_f);
20
+ let dict = ruby.hash_new();
20
21
  fields_to_rbdict(&metadata.schema, &dict)?;
21
22
  Ok(dict)
22
23
  }
@@ -33,7 +34,8 @@ pub fn read_parquet_metadata(rb_f: Value) -> RbResult<RHash> {
33
34
  };
34
35
 
35
36
  let key_value_metadata = read_custom_key_value_metadata(metadata.key_value_metadata());
36
- let dict = RHash::new();
37
+ let ruby = Ruby::get_with(rb_f);
38
+ let dict = ruby.hash_new();
37
39
  for (key, value) in key_value_metadata.into_iter() {
38
40
  dict.aset(key.as_str(), value.as_str())?;
39
41
  }
@@ -51,7 +53,8 @@ pub fn read_parquet_schema(rb_f: Value) -> RbResult<RHash> {
51
53
  };
52
54
  let arrow_schema = infer_schema(&metadata).map_err(RbPolarsErr::from)?;
53
55
 
54
- let dict = RHash::new();
56
+ let ruby = Ruby::get_with(rb_f);
57
+ let dict = ruby.hash_new();
55
58
  fields_to_rbdict(&arrow_schema, &dict)?;
56
59
  Ok(dict)
57
60
  }
@@ -1,6 +1,6 @@
1
- use magnus::encoding::{self, EncodingCapable};
1
+ use magnus::encoding::EncodingCapable;
2
2
  use magnus::{
3
- Float, Integer, RArray, RString, Ruby, Value, class, prelude::*, typed_data::Obj, value::Opaque,
3
+ Float, Integer, RArray, RString, Ruby, Value, prelude::*, typed_data::Obj, value::Opaque,
4
4
  };
5
5
  use polars::lazy::dsl;
6
6
  use polars::prelude::*;
@@ -98,10 +98,10 @@ pub fn col(name: String) -> RbExpr {
98
98
  dsl::col(&name).into()
99
99
  }
100
100
 
101
- pub fn collect_all(lfs: RArray) -> RbResult<RArray> {
101
+ pub fn collect_all(ruby: &Ruby, lfs: RArray) -> RbResult<RArray> {
102
102
  let lfs = lfs.typecheck::<Obj<RbLazyFrame>>()?;
103
103
 
104
- Ok(RArray::from_iter(lfs.iter().map(|lf| {
104
+ Ok(ruby.ary_from_iter(lfs.iter().map(|lf| {
105
105
  let df = lf.ldf.borrow().clone().collect().unwrap();
106
106
  RbDataFrame::new(df)
107
107
  })))
@@ -283,7 +283,8 @@ pub fn fold(
283
283
  }
284
284
 
285
285
  pub fn lit(value: Value, allow_object: bool, is_scalar: bool) -> RbResult<RbExpr> {
286
- if value.is_kind_of(class::true_class()) || value.is_kind_of(class::false_class()) {
286
+ let ruby = Ruby::get_with(value);
287
+ if value.is_kind_of(ruby.class_true_class()) || value.is_kind_of(ruby.class_false_class()) {
287
288
  Ok(dsl::lit(bool::try_convert(value)?).into())
288
289
  } else if let Some(v) = Integer::from_value(value) {
289
290
  match v.to_i64() {
@@ -302,7 +303,7 @@ pub fn lit(value: Value, allow_object: bool, is_scalar: bool) -> RbResult<RbExpr
302
303
  } else if let Some(v) = Float::from_value(value) {
303
304
  Ok(dsl::lit(v.to_f64()).into())
304
305
  } else if let Some(v) = RString::from_value(value) {
305
- if v.enc_get() == encoding::Index::utf8() {
306
+ if v.enc_get() == ruby.utf8_encindex() {
306
307
  Ok(dsl::lit(v.to_string()?).into())
307
308
  } else {
308
309
  Ok(dsl::lit(unsafe { v.as_slice() }).into())
@@ -1,4 +1,4 @@
1
- use magnus::{IntoValue, Value};
1
+ use magnus::{IntoValue, Ruby, Value};
2
2
  use polars_core;
3
3
  use polars_core::POOL;
4
4
  use polars_core::fmt::FloatFmt;
@@ -7,8 +7,8 @@ use polars_core::prelude::IDX_DTYPE;
7
7
  use crate::conversion::Wrap;
8
8
  use crate::{RbResult, RbValueError};
9
9
 
10
- pub fn get_index_type() -> Value {
11
- Wrap(IDX_DTYPE).into_value()
10
+ pub fn get_index_type(ruby: &Ruby) -> Value {
11
+ Wrap(IDX_DTYPE).into_value_with(ruby)
12
12
  }
13
13
 
14
14
  pub fn thread_pool_size() -> usize {
@@ -1,5 +1,5 @@
1
1
  use crate::RbResult;
2
- use magnus::{RArray, Ruby, Value};
2
+ use magnus::{Ruby, Value};
3
3
 
4
4
  pub fn enable_string_cache() {
5
5
  // The string cache no longer exists.
@@ -18,7 +18,7 @@ pub fn using_string_cache() -> bool {
18
18
  pub struct RbStringCacheHolder {}
19
19
 
20
20
  impl RbStringCacheHolder {
21
- pub fn hold() -> RbResult<Value> {
22
- Ruby::get().unwrap().yield_splat(RArray::new())
21
+ pub fn hold(ruby: &Ruby) -> RbResult<Value> {
22
+ ruby.yield_splat(ruby.ary_new())
23
23
  }
24
24
  }
@@ -1,6 +1,6 @@
1
1
  use arrow::datatypes::ArrowDataType;
2
2
  use arrow::ffi;
3
- use magnus::{IntoValue, Value};
3
+ use magnus::{IntoValue, Ruby, Value};
4
4
  use polars::datatypes::CompatLevel;
5
5
  use polars::frame::DataFrame;
6
6
  use polars::prelude::{ArrayRef, ArrowField, PlSmallStr, PolarsResult, SchemaExt};
@@ -20,11 +20,11 @@ impl RbArrowArrayStream {
20
20
  }
21
21
  }
22
22
 
23
- pub(crate) fn dataframe_to_stream(df: &DataFrame) -> RbResult<Value> {
23
+ pub(crate) fn dataframe_to_stream(df: &DataFrame, ruby: &Ruby) -> RbResult<Value> {
24
24
  let iter = Box::new(DataFrameStreamIterator::new(df));
25
25
  let field = iter.field();
26
26
  let stream = ffi::export_iterator(iter, field);
27
- Ok(RbArrowArrayStream { stream }.into_value())
27
+ Ok(RbArrowArrayStream { stream }.into_value_with(ruby))
28
28
  }
29
29
 
30
30
  pub struct DataFrameStreamIterator {
@@ -1,4 +1,4 @@
1
- use magnus::{IntoValue, Module, RArray, RClass, RModule, Value, class, prelude::*};
1
+ use magnus::{IntoValue, Module, RClass, RModule, Ruby, Value, prelude::*};
2
2
 
3
3
  use crate::RbResult;
4
4
 
@@ -44,9 +44,10 @@ impl<T: Element> RbArray1<T> {
44
44
  where
45
45
  I: IntoIterator<Item = T>,
46
46
  {
47
- class::object()
47
+ let ruby = Ruby::get().unwrap();
48
+ ruby.class_object()
48
49
  .const_get::<_, RModule>("Numo")?
49
50
  .const_get::<_, RClass>(T::class_name())?
50
- .funcall("cast", (RArray::from_iter(values),))
51
+ .funcall("cast", (ruby.ary_from_iter(values),))
51
52
  }
52
53
  }
@@ -1,10 +1,11 @@
1
1
  use std::sync::Arc;
2
2
 
3
3
  use magnus::{TryConvert, Value, value::ReprValue};
4
+ use polars::prelude::default_values::DefaultFieldValues;
4
5
  use polars::prelude::deletion::DeletionFilesList;
5
6
  use polars::prelude::{
6
7
  CastColumnsPolicy, ColumnMapping, ExtraColumnsPolicy, MissingColumnsPolicy, PlSmallStr, Schema,
7
- UnifiedScanArgs,
8
+ TableStatistics, UnifiedScanArgs,
8
9
  };
9
10
  use polars_io::{HiveOptions, RowIndex};
10
11
  use polars_utils::IdxSize;
@@ -23,6 +24,12 @@ impl TryConvert for RbScanOptions {
23
24
  }
24
25
  }
25
26
 
27
+ impl TryConvert for Wrap<TableStatistics> {
28
+ fn try_convert(_ob: Value) -> RbResult<Self> {
29
+ todo!();
30
+ }
31
+ }
32
+
26
33
  impl RbScanOptions {
27
34
  pub fn extract_unified_scan_args(
28
35
  &self,
@@ -37,6 +44,10 @@ impl RbScanOptions {
37
44
  let include_file_paths: Option<Wrap<PlSmallStr>> =
38
45
  self.0.funcall("include_file_paths", ())?;
39
46
  let glob: bool = self.0.funcall("glob", ())?;
47
+ let hidden_file_prefix: Option<Vec<String>> = self.0.funcall("hidden_file_prefix", ())?;
48
+ let column_mapping: Option<Wrap<ColumnMapping>> = self.0.funcall("column_mapping", ())?;
49
+ let default_values: Option<Wrap<DefaultFieldValues>> =
50
+ self.0.funcall("default_values", ())?;
40
51
  let hive_partitioning: Option<bool> = self.0.funcall("hive_partitioning", ())?;
41
52
  let hive_schema: Option<Wrap<Schema>> = self.0.funcall("hive_schema", ())?;
42
53
  let try_parse_hive_dates: bool = self.0.funcall("try_parse_hive_dates", ())?;
@@ -47,7 +58,9 @@ impl RbScanOptions {
47
58
  let retries: usize = self.0.funcall("retries", ())?;
48
59
  let deletion_files: Option<Wrap<DeletionFilesList>> =
49
60
  self.0.funcall("deletion_files", ())?;
50
- let column_mapping: Option<Wrap<ColumnMapping>> = self.0.funcall("column_mapping", ())?;
61
+ let table_statistics: Option<Wrap<TableStatistics>> =
62
+ self.0.funcall("table_statistics", ())?;
63
+ let row_count: Option<(u64, u64)> = self.0.funcall("row_count", ())?;
51
64
 
52
65
  let cloud_options = storage_options;
53
66
 
@@ -86,7 +99,13 @@ impl RbScanOptions {
86
99
  rechunk,
87
100
  cache,
88
101
  glob,
102
+ hidden_file_prefix: hidden_file_prefix
103
+ .map(|x| x.into_iter().map(|x| (*x).into()).collect()),
89
104
  projection: None,
105
+ column_mapping: column_mapping.map(|x| x.0),
106
+ default_values: default_values
107
+ .map(|x| x.0)
108
+ .filter(|DefaultFieldValues::Iceberg(v)| !v.is_empty()),
90
109
  row_index,
91
110
  pre_slice: pre_slice.map(Slice::from),
92
111
  cast_columns_policy: cast_options.0,
@@ -94,7 +113,8 @@ impl RbScanOptions {
94
113
  extra_columns_policy: extra_columns.0,
95
114
  include_file_paths: include_file_paths.map(|x| x.0),
96
115
  deletion_files: DeletionFilesList::filter_empty(deletion_files.map(|x| x.0)),
97
- column_mapping: column_mapping.map(|x| x.0),
116
+ table_statistics: table_statistics.map(|x| x.0),
117
+ row_count,
98
118
  };
99
119
 
100
120
  Ok(unified_scan_args)
@@ -1,5 +1,5 @@
1
- use magnus::{IntoValue, RArray, RHash, TryConvert, Value, r_hash::ForEach, typed_data::Obj};
2
- use polars::io::{HiveOptions, RowIndex};
1
+ use magnus::{IntoValue, RArray, RHash, Ruby, TryConvert, Value, r_hash::ForEach, typed_data::Obj};
2
+ use polars::io::RowIndex;
3
3
  use polars::lazy::frame::LazyFrame;
4
4
  use polars::prelude::*;
5
5
  use polars_plan::dsl::ScanSources;
@@ -18,7 +18,10 @@ use crate::{RbDataFrame, RbExpr, RbLazyFrame, RbLazyGroupBy, RbPolarsErr, RbResu
18
18
  fn rbobject_to_first_path_and_scan_sources(obj: Value) -> RbResult<(Option<PlPath>, ScanSources)> {
19
19
  use crate::file::{RubyScanSourceInput, get_ruby_scan_source_input};
20
20
  Ok(match get_ruby_scan_source_input(obj, false)? {
21
- RubyScanSourceInput::Path(path) => (Some(path.clone()), ScanSources::Paths([path].into())),
21
+ RubyScanSourceInput::Path(path) => (
22
+ Some(path.clone()),
23
+ ScanSources::Paths(FromIterator::from_iter([path])),
24
+ ),
22
25
  RubyScanSourceInput::File(file) => (None, ScanSources::Files([file].into())),
23
26
  RubyScanSourceInput::Buffer(buff) => (None, ScanSources::Buffers([buff].into())),
24
27
  })
@@ -30,13 +33,12 @@ impl RbLazyFrame {
30
33
  source: Option<Value>,
31
34
  sources: Wrap<ScanSources>,
32
35
  infer_schema_length: Option<usize>,
33
- batch_size: Option<Wrap<NonZeroUsize>>,
36
+ batch_size: Option<NonZeroUsize>,
34
37
  n_rows: Option<usize>,
35
38
  low_memory: bool,
36
39
  rechunk: bool,
37
40
  row_index: Option<(String, IdxSize)>,
38
41
  ) -> RbResult<Self> {
39
- let batch_size = batch_size.map(|v| v.0);
40
42
  let row_index = row_index.map(|(name, offset)| RowIndex {
41
43
  name: name.into(),
42
44
  offset,
@@ -181,48 +183,28 @@ impl RbLazyFrame {
181
183
  Ok(lf.into())
182
184
  }
183
185
 
184
- #[allow(clippy::too_many_arguments)]
185
186
  pub fn new_from_ipc(
186
- source: Option<Value>,
187
187
  sources: Wrap<ScanSources>,
188
- n_rows: Option<usize>,
189
- cache: bool,
190
- rechunk: bool,
191
- row_index: Option<(String, IdxSize)>,
192
- hive_partitioning: Option<bool>,
193
- hive_schema: Option<Wrap<Schema>>,
194
- try_parse_hive_dates: bool,
195
- include_file_paths: Option<String>,
188
+ scan_options: RbScanOptions,
189
+ file_cache_ttl: Option<u64>,
196
190
  ) -> RbResult<Self> {
197
- let row_index = row_index.map(|(name, offset)| RowIndex {
198
- name: name.into(),
199
- offset,
200
- });
191
+ let options = IpcScanOptions;
201
192
 
202
- let hive_options = HiveOptions {
203
- enabled: hive_partitioning,
204
- hive_start_idx: 0,
205
- schema: hive_schema.map(|x| Arc::new(x.0)),
206
- try_parse_dates: try_parse_hive_dates,
207
- };
193
+ let sources = sources.0;
194
+ let first_path = sources.first_path().map(|p| p.into_owned());
208
195
 
209
- let args = ScanArgsIpc {
210
- n_rows,
211
- cache,
212
- rechunk,
213
- row_index,
214
- cloud_options: None,
215
- hive_options,
216
- include_file_paths: include_file_paths.map(|x| x.into()),
217
- };
196
+ let mut unified_scan_args =
197
+ scan_options.extract_unified_scan_args(first_path.as_ref().map(|p| p.as_ref()))?;
218
198
 
219
- let sources = sources.0;
220
- let (_first_path, sources) = match source {
221
- None => (sources.first_path().map(|p| p.into_owned()), sources),
222
- Some(source) => rbobject_to_first_path_and_scan_sources(source)?,
223
- };
199
+ if let Some(file_cache_ttl) = file_cache_ttl {
200
+ unified_scan_args
201
+ .cloud_options
202
+ .get_or_insert_default()
203
+ .file_cache_ttl = file_cache_ttl;
204
+ }
224
205
 
225
- let lf = LazyFrame::scan_ipc_sources(sources, args).map_err(RbPolarsErr::from)?;
206
+ let lf = LazyFrame::scan_ipc_sources(sources, options, unified_scan_args)
207
+ .map_err(RbPolarsErr::from)?;
226
208
  Ok(lf.into())
227
209
  }
228
210
 
@@ -407,13 +389,13 @@ impl RbLazyFrame {
407
389
  pub fn sink_ipc(
408
390
  &self,
409
391
  target: SinkTarget,
410
- compression: Option<Wrap<IpcCompression>>,
392
+ compression: Wrap<Option<IpcCompression>>,
411
393
  cloud_options: Option<Vec<(String, String)>>,
412
394
  retries: usize,
413
395
  sink_options: Wrap<SinkOptions>,
414
396
  ) -> RbResult<RbLazyFrame> {
415
397
  let options = IpcWriterOptions {
416
- compression: compression.map(|c| c.0),
398
+ compression: compression.0,
417
399
  ..Default::default()
418
400
  };
419
401
 
@@ -444,7 +426,7 @@ impl RbLazyFrame {
444
426
  let separator = u8::try_convert(arguments[3])?;
445
427
  let line_terminator = String::try_convert(arguments[4])?;
446
428
  let quote_char = u8::try_convert(arguments[5])?;
447
- let batch_size = Wrap::<NonZeroUsize>::try_convert(arguments[6])?;
429
+ let batch_size = NonZeroUsize::try_convert(arguments[6])?;
448
430
  let datetime_format = Option::<String>::try_convert(arguments[7])?;
449
431
  let date_format = Option::<String>::try_convert(arguments[8])?;
450
432
  let time_format = Option::<String>::try_convert(arguments[9])?;
@@ -477,7 +459,7 @@ impl RbLazyFrame {
477
459
  let options = CsvWriterOptions {
478
460
  include_bom,
479
461
  include_header,
480
- batch_size: batch_size.0,
462
+ batch_size,
481
463
  serialize_options,
482
464
  };
483
465
 
@@ -931,30 +913,33 @@ impl RbLazyFrame {
931
913
  self.ldf.borrow().clone().into()
932
914
  }
933
915
 
934
- pub fn collect_schema(&self) -> RbResult<RHash> {
935
- let schema = self
916
+ pub fn collect_schema(ruby: &Ruby, rb_self: &Self) -> RbResult<RHash> {
917
+ let schema = rb_self
936
918
  .ldf
937
919
  .borrow_mut()
938
920
  .collect_schema()
939
921
  .map_err(RbPolarsErr::from)?;
940
922
 
941
- let schema_dict = RHash::new();
923
+ let schema_dict = ruby.hash_new();
942
924
  schema.iter_fields().for_each(|fld| {
943
925
  schema_dict
944
926
  .aset::<String, Value>(
945
927
  fld.name().to_string(),
946
- Wrap(fld.dtype().clone()).into_value(),
928
+ Wrap(fld.dtype().clone()).into_value_with(ruby),
947
929
  )
948
930
  .unwrap();
949
931
  });
950
932
  Ok(schema_dict)
951
933
  }
952
934
 
953
- pub fn unnest(&self, columns: &RbSelector) -> Self {
935
+ pub fn unnest(&self, columns: &RbSelector, separator: Option<String>) -> Self {
954
936
  self.ldf
955
937
  .borrow()
956
938
  .clone()
957
- .unnest(columns.inner.clone())
939
+ .unnest(
940
+ columns.inner.clone(),
941
+ separator.as_deref().map(PlSmallStr::from_str),
942
+ )
958
943
  .into()
959
944
  }
960
945
 
@@ -1,8 +1,9 @@
1
1
  mod general;
2
+ mod optflags;
2
3
  mod serde;
3
4
  mod sink;
4
5
 
5
- use polars::lazy::frame::LazyFrame;
6
+ use polars::prelude::{LazyFrame, OptFlags};
6
7
  pub use sink::SinkTarget;
7
8
  use std::cell::RefCell;
8
9
 
@@ -12,6 +13,12 @@ pub struct RbLazyFrame {
12
13
  pub ldf: RefCell<LazyFrame>,
13
14
  }
14
15
 
16
+ #[magnus::wrap(class = "Polars::RbOptFlags")]
17
+ #[derive(Clone)]
18
+ pub struct RbOptFlags {
19
+ pub inner: RefCell<OptFlags>,
20
+ }
21
+
15
22
  impl From<LazyFrame> for RbLazyFrame {
16
23
  fn from(ldf: LazyFrame) -> Self {
17
24
  RbLazyFrame {
@@ -19,3 +26,11 @@ impl From<LazyFrame> for RbLazyFrame {
19
26
  }
20
27
  }
21
28
  }
29
+
30
+ impl From<OptFlags> for RbOptFlags {
31
+ fn from(inner: OptFlags) -> Self {
32
+ RbOptFlags {
33
+ inner: RefCell::new(inner),
34
+ }
35
+ }
36
+ }
@@ -0,0 +1,57 @@
1
+ use polars::prelude::OptFlags;
2
+
3
+ use super::RbOptFlags;
4
+
5
+ macro_rules! flag_getter_setters {
6
+ ($(($flag:ident, $getter:ident, $setter:ident, clear=$clear:literal))+) => {
7
+ impl RbOptFlags {
8
+ pub fn empty() -> Self {
9
+ Self {
10
+ inner: OptFlags::empty().into()
11
+ }
12
+ }
13
+
14
+ #[allow(clippy::should_implement_trait)]
15
+ pub fn default() -> Self {
16
+ Self { inner: OptFlags::default().into() }
17
+ }
18
+
19
+ pub fn no_optimizations(&self) {
20
+ $(if $clear {
21
+ self.inner.borrow_mut().remove(OptFlags::$flag);
22
+ })+
23
+ }
24
+
25
+ pub fn copy(&self) -> Self {
26
+ Self { inner: self.inner.clone() }
27
+ }
28
+
29
+ $(
30
+ pub fn $getter(&self) -> bool {
31
+ self.inner.borrow().contains(OptFlags::$flag)
32
+ }
33
+ pub fn $setter(&self, value: bool) {
34
+ self.inner.borrow_mut().set(OptFlags::$flag, value)
35
+ }
36
+ )+
37
+ }
38
+ };
39
+ }
40
+
41
+ flag_getter_setters! {
42
+ (TYPE_COERCION, get_type_coercion, set_type_coercion, clear=false)
43
+ (TYPE_CHECK, get_type_check, set_type_check, clear=false)
44
+
45
+ (PROJECTION_PUSHDOWN, get_projection_pushdown, set_projection_pushdown, clear=true)
46
+ (PREDICATE_PUSHDOWN, get_predicate_pushdown, set_predicate_pushdown, clear=true)
47
+ (CLUSTER_WITH_COLUMNS, get_cluster_with_columns, set_cluster_with_columns, clear=true)
48
+ (SIMPLIFY_EXPR, get_simplify_expression, set_simplify_expression, clear=true)
49
+ (SLICE_PUSHDOWN, get_slice_pushdown, set_slice_pushdown, clear=true)
50
+ (COMM_SUBPLAN_ELIM, get_comm_subplan_elim, set_comm_subplan_elim, clear=true)
51
+ (COMM_SUBEXPR_ELIM, get_comm_subexpr_elim, set_comm_subexpr_elim, clear=true)
52
+ (CHECK_ORDER_OBSERVE, get_check_order_observe, set_check_order_observe, clear=true)
53
+ (FAST_PROJECTION, get_fast_projection, set_fast_projection, clear=true)
54
+
55
+ (EAGER, get_eager, set_eager, clear=true)
56
+ (NEW_STREAMING, get_streaming, set_streaming, clear=true)
57
+ }
@@ -1,14 +1,38 @@
1
+ use std::io::Read;
2
+ #[cfg(feature = "serialize_binary")]
3
+ use std::io::{BufReader, BufWriter};
4
+
1
5
  use magnus::Value;
2
6
  use polars::lazy::frame::LazyFrame;
3
7
  use polars::prelude::*;
4
- use std::io::Read;
5
8
 
6
9
  use crate::file::get_file_like;
10
+ #[cfg(feature = "serialize_binary")]
11
+ use crate::utils::to_rb_err;
7
12
  use crate::{RbLazyFrame, RbResult, RbValueError};
8
13
 
9
14
  impl RbLazyFrame {
10
- // TODO change to serialize_json
11
- pub fn read_json(rb_f: Value) -> RbResult<Self> {
15
+ #[cfg(feature = "serialize_binary")]
16
+ pub fn serialize_binary(&self, rb_f: Value) -> RbResult<()> {
17
+ let file = get_file_like(rb_f, true)?;
18
+ let writer = BufWriter::new(file);
19
+ self.ldf
20
+ .borrow()
21
+ .logical_plan
22
+ .serialize_versioned(writer, Default::default())
23
+ .map_err(to_rb_err)
24
+ }
25
+
26
+ #[cfg(feature = "serialize_binary")]
27
+ pub fn deserialize_binary(rb_f: Value) -> RbResult<Self> {
28
+ let file = get_file_like(rb_f, false)?;
29
+ let reader = BufReader::new(file);
30
+
31
+ let lp: DslPlan = DslPlan::deserialize_versioned(reader).map_err(to_rb_err)?;
32
+ Ok(LazyFrame::from(lp).into())
33
+ }
34
+
35
+ pub fn deserialize_json(rb_f: Value) -> RbResult<Self> {
12
36
  // it is faster to first read to memory and then parse: https://github.com/serde-rs/json/issues/160
13
37
  // so don't bother with files.
14
38
  let mut json = String::new();