polars-df 0.20.0 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +11 -0
  3. data/Cargo.lock +192 -186
  4. data/LICENSE.txt +1 -1
  5. data/ext/polars/Cargo.toml +13 -9
  6. data/ext/polars/src/batched_csv.rs +2 -2
  7. data/ext/polars/src/catalog/mod.rs +1 -0
  8. data/ext/polars/src/catalog/unity.rs +450 -0
  9. data/ext/polars/src/conversion/any_value.rs +9 -19
  10. data/ext/polars/src/conversion/categorical.rs +30 -0
  11. data/ext/polars/src/conversion/chunked_array.rs +8 -8
  12. data/ext/polars/src/conversion/mod.rs +187 -109
  13. data/ext/polars/src/dataframe/construction.rs +2 -2
  14. data/ext/polars/src/dataframe/export.rs +2 -2
  15. data/ext/polars/src/dataframe/general.rs +4 -2
  16. data/ext/polars/src/dataframe/io.rs +2 -2
  17. data/ext/polars/src/exceptions.rs +1 -1
  18. data/ext/polars/src/expr/datatype.rs +14 -0
  19. data/ext/polars/src/expr/general.rs +22 -17
  20. data/ext/polars/src/expr/list.rs +21 -2
  21. data/ext/polars/src/expr/meta.rs +0 -34
  22. data/ext/polars/src/expr/mod.rs +3 -1
  23. data/ext/polars/src/expr/name.rs +2 -2
  24. data/ext/polars/src/expr/rolling.rs +1 -1
  25. data/ext/polars/src/expr/selector.rs +219 -0
  26. data/ext/polars/src/expr/string.rs +14 -6
  27. data/ext/polars/src/file.rs +11 -5
  28. data/ext/polars/src/functions/io.rs +2 -11
  29. data/ext/polars/src/functions/lazy.rs +22 -54
  30. data/ext/polars/src/functions/meta.rs +2 -2
  31. data/ext/polars/src/functions/misc.rs +1 -1
  32. data/ext/polars/src/functions/string_cache.rs +4 -5
  33. data/ext/polars/src/interop/numo/numo_rs.rs +1 -1
  34. data/ext/polars/src/interop/numo/to_numo_series.rs +1 -1
  35. data/ext/polars/src/io/mod.rs +102 -0
  36. data/ext/polars/src/lazyframe/general.rs +74 -112
  37. data/ext/polars/src/lazyframe/serde.rs +1 -1
  38. data/ext/polars/src/lazyframe/sink.rs +6 -6
  39. data/ext/polars/src/lib.rs +98 -20
  40. data/ext/polars/src/map/dataframe.rs +7 -7
  41. data/ext/polars/src/map/lazy.rs +1 -1
  42. data/ext/polars/src/map/mod.rs +31 -19
  43. data/ext/polars/src/map/series.rs +8 -8
  44. data/ext/polars/src/on_startup.rs +5 -2
  45. data/ext/polars/src/rb_modules.rs +1 -1
  46. data/ext/polars/src/series/construction.rs +11 -7
  47. data/ext/polars/src/series/export.rs +6 -4
  48. data/ext/polars/src/series/general.rs +12 -207
  49. data/ext/polars/src/series/import.rs +2 -2
  50. data/ext/polars/src/series/map.rs +227 -0
  51. data/ext/polars/src/series/mod.rs +2 -1
  52. data/ext/polars/src/series/scatter.rs +1 -1
  53. data/ext/polars/src/utils.rs +10 -2
  54. data/lib/polars/cat_name_space.rb +3 -43
  55. data/lib/polars/catalog/unity/catalog_info.rb +20 -0
  56. data/lib/polars/catalog/unity/column_info.rb +31 -0
  57. data/lib/polars/catalog/unity/namespace_info.rb +21 -0
  58. data/lib/polars/catalog/unity/table_info.rb +50 -0
  59. data/lib/polars/catalog.rb +448 -0
  60. data/lib/polars/convert.rb +10 -0
  61. data/lib/polars/data_frame.rb +151 -30
  62. data/lib/polars/data_types.rb +47 -3
  63. data/lib/polars/exceptions.rb +7 -2
  64. data/lib/polars/expr.rb +34 -31
  65. data/lib/polars/functions/col.rb +6 -5
  66. data/lib/polars/functions/lazy.rb +114 -15
  67. data/lib/polars/functions/repeat.rb +4 -0
  68. data/lib/polars/io/csv.rb +18 -0
  69. data/lib/polars/io/json.rb +16 -0
  70. data/lib/polars/io/ndjson.rb +13 -0
  71. data/lib/polars/io/parquet.rb +45 -63
  72. data/lib/polars/io/scan_options.rb +47 -0
  73. data/lib/polars/lazy_frame.rb +163 -75
  74. data/lib/polars/list_expr.rb +204 -7
  75. data/lib/polars/list_name_space.rb +120 -1
  76. data/lib/polars/meta_expr.rb +7 -22
  77. data/lib/polars/scan_cast_options.rb +64 -0
  78. data/lib/polars/schema.rb +6 -1
  79. data/lib/polars/selector.rb +138 -0
  80. data/lib/polars/selectors.rb +931 -202
  81. data/lib/polars/series.rb +34 -11
  82. data/lib/polars/string_expr.rb +24 -3
  83. data/lib/polars/string_name_space.rb +11 -0
  84. data/lib/polars/utils/parse.rb +40 -0
  85. data/lib/polars/utils.rb +5 -1
  86. data/lib/polars/version.rb +1 -1
  87. data/lib/polars.rb +8 -0
  88. metadata +17 -2
data/ext/polars/src/functions/lazy.rs +22 -54
@@ -1,13 +1,13 @@
1
1
  use magnus::encoding::{self, EncodingCapable};
2
2
  use magnus::{
3
- class, prelude::*, typed_data::Obj, value::Opaque, Float, Integer, RArray, RString, Ruby, Value,
3
+ Float, Integer, RArray, RString, Ruby, Value, class, prelude::*, typed_data::Obj, value::Opaque,
4
4
  };
5
5
  use polars::lazy::dsl;
6
6
  use polars::prelude::*;
7
7
 
8
- use crate::conversion::{get_lf, get_rbseq, Wrap};
8
+ use crate::conversion::{Wrap, get_lf, get_rbseq};
9
+ use crate::expr::datatype::RbDataTypeExpr;
9
10
  use crate::map::lazy::binary_lambda;
10
- use crate::prelude::vec_extract_wrapped;
11
11
  use crate::rb_exprs_to_exprs;
12
12
  use crate::{RbDataFrame, RbExpr, RbLazyFrame, RbPolarsErr, RbResult, RbSeries, RbValueError};
13
13
 
@@ -103,10 +103,6 @@ pub fn collect_all(lfs: RArray) -> RbResult<RArray> {
103
103
  })))
104
104
  }
105
105
 
106
- pub fn cols(names: Vec<String>) -> RbExpr {
107
- dsl::cols(names).into()
108
- }
109
-
110
106
  pub fn concat_lf(
111
107
  lfs: Value,
112
108
  rechunk: bool,
@@ -166,20 +162,24 @@ pub fn cum_fold(
166
162
  acc: &RbExpr,
167
163
  lambda: Value,
168
164
  exprs: RArray,
165
+ returns_scalar: bool,
166
+ return_dtype: Option<&RbDataTypeExpr>,
169
167
  include_init: bool,
170
168
  ) -> RbResult<RbExpr> {
171
169
  let exprs = rb_exprs_to_exprs(exprs)?;
172
170
  let lambda = Opaque::from(lambda);
173
-
174
- let func = move |a: Column, b: Column| {
175
- binary_lambda(
176
- Ruby::get().unwrap().get_inner(lambda),
177
- a.take_materialized_series(),
178
- b.take_materialized_series(),
179
- )
180
- .map(|v| v.map(Column::from))
181
- };
182
- Ok(dsl::cum_fold_exprs(acc.inner.clone(), func, exprs, include_init).into())
171
+ let func = PlanCallback::new(move |(a, b): (Series, Series)| {
172
+ binary_lambda(Ruby::get().unwrap().get_inner(lambda), a, b).map(|v| v.unwrap())
173
+ });
174
+ Ok(dsl::cum_fold_exprs(
175
+ acc.inner.clone(),
176
+ func,
177
+ exprs,
178
+ returns_scalar,
179
+ return_dtype.map(|v| v.inner.clone()),
180
+ include_init,
181
+ )
182
+ .into())
183
183
  }
184
184
 
185
185
  pub fn concat_lf_diagonal(
@@ -220,24 +220,6 @@ pub fn concat_lf_horizontal(lfs: RArray, parallel: bool) -> RbResult<RbLazyFrame
220
220
  Ok(lf.into())
221
221
  }
222
222
 
223
- pub fn dtype_cols(dtypes: RArray) -> RbResult<RbExpr> {
224
- let dtypes = dtypes
225
- .into_iter()
226
- .map(Wrap::<DataType>::try_convert)
227
- .collect::<RbResult<Vec<Wrap<DataType>>>>()?;
228
- let dtypes = vec_extract_wrapped(dtypes);
229
- Ok(dsl::dtype_cols(dtypes).into())
230
- }
231
-
232
- pub fn index_cols(indices: Vec<i64>) -> RbExpr {
233
- if indices.len() == 1 {
234
- dsl::nth(indices[0])
235
- } else {
236
- dsl::index_cols(indices)
237
- }
238
- .into()
239
- }
240
-
241
223
  #[allow(clippy::too_many_arguments)]
242
224
  pub fn duration(
243
225
  weeks: Option<&RbExpr>,
@@ -274,42 +256,28 @@ pub fn duration(
274
256
  dsl::duration(args).into()
275
257
  }
276
258
 
277
- pub fn first() -> RbExpr {
278
- dsl::first().into()
279
- }
280
-
281
259
  pub fn fold(
282
260
  acc: &RbExpr,
283
261
  lambda: Value,
284
262
  exprs: RArray,
285
263
  returns_scalar: bool,
286
- return_dtype: Option<Wrap<DataType>>,
264
+ return_dtype: Option<&RbDataTypeExpr>,
287
265
  ) -> RbResult<RbExpr> {
288
266
  let exprs = rb_exprs_to_exprs(exprs)?;
289
267
  let lambda = Opaque::from(lambda);
290
-
291
- let func = move |a: Column, b: Column| {
292
- binary_lambda(
293
- Ruby::get().unwrap().get_inner(lambda),
294
- a.take_materialized_series(),
295
- b.take_materialized_series(),
296
- )
297
- .map(|v| v.map(Column::from))
298
- };
268
+ let func = PlanCallback::new(move |(a, b): (Series, Series)| {
269
+ binary_lambda(Ruby::get().unwrap().get_inner(lambda), a, b).map(|v| v.unwrap())
270
+ });
299
271
  Ok(dsl::fold_exprs(
300
272
  acc.inner.clone(),
301
273
  func,
302
274
  exprs,
303
275
  returns_scalar,
304
- return_dtype.map(|w| w.0),
276
+ return_dtype.map(|w| w.inner.clone()),
305
277
  )
306
278
  .into())
307
279
  }
308
280
 
309
- pub fn last() -> RbExpr {
310
- dsl::last().into()
311
- }
312
-
313
281
  pub fn lit(value: Value, allow_object: bool, is_scalar: bool) -> RbResult<RbExpr> {
314
282
  if value.is_kind_of(class::true_class()) || value.is_kind_of(class::false_class()) {
315
283
  Ok(dsl::lit(bool::try_convert(value)?).into())
data/ext/polars/src/functions/meta.rs +2 -2
@@ -1,8 +1,8 @@
1
1
  use magnus::{IntoValue, Value};
2
2
  use polars_core;
3
+ use polars_core::POOL;
3
4
  use polars_core::fmt::FloatFmt;
4
5
  use polars_core::prelude::IDX_DTYPE;
5
- use polars_core::POOL;
6
6
 
7
7
  use crate::conversion::Wrap;
8
8
  use crate::{RbResult, RbValueError};
@@ -22,7 +22,7 @@ pub fn set_float_fmt(fmt: String) -> RbResult<()> {
22
22
  e => {
23
23
  return Err(RbValueError::new_err(format!(
24
24
  "fmt must be one of {{'full', 'mixed'}}, got {e}",
25
- )))
25
+ )));
26
26
  }
27
27
  };
28
28
  polars_core::fmt::set_float_fmt(fmt);
data/ext/polars/src/functions/misc.rs +1 -1
@@ -1,6 +1,6 @@
1
+ use crate::RbResult;
1
2
  use crate::conversion::Wrap;
2
3
  use crate::prelude::DataType;
3
- use crate::RbResult;
4
4
 
5
5
  pub fn dtype_str_repr(dtype: Wrap<DataType>) -> RbResult<String> {
6
6
  let dtype = dtype.0;
data/ext/polars/src/functions/string_cache.rs +4 -5
@@ -1,17 +1,17 @@
1
1
  use crate::RbResult;
2
2
  use magnus::{RArray, Ruby, Value};
3
- use polars_core::StringCacheHolder;
4
3
 
5
4
  pub fn enable_string_cache() {
6
- polars_core::enable_string_cache()
5
+ // The string cache no longer exists.
7
6
  }
8
7
 
9
8
  pub fn disable_string_cache() {
10
- polars_core::disable_string_cache()
9
+ // The string cache no longer exists.
11
10
  }
12
11
 
13
12
  pub fn using_string_cache() -> bool {
14
- polars_core::using_string_cache()
13
+ // The string cache no longer exists.
14
+ true
15
15
  }
16
16
 
17
17
  #[magnus::wrap(class = "Polars::RbStringCacheHolder")]
@@ -19,7 +19,6 @@ pub struct RbStringCacheHolder {}
19
19
 
20
20
  impl RbStringCacheHolder {
21
21
  pub fn hold() -> RbResult<Value> {
22
- let _hold = StringCacheHolder::hold();
23
22
  Ruby::get().unwrap().yield_splat(RArray::new())
24
23
  }
25
24
  }
data/ext/polars/src/interop/numo/numo_rs.rs +1 -1
@@ -1,4 +1,4 @@
1
- use magnus::{class, prelude::*, IntoValue, Module, RArray, RClass, RModule, Value};
1
+ use magnus::{IntoValue, Module, RArray, RClass, RModule, Value, class, prelude::*};
2
2
 
3
3
  use crate::RbResult;
4
4
 
data/ext/polars/src/interop/numo/to_numo_series.rs +1 -1
@@ -3,10 +3,10 @@ use num_traits::{Float, NumCast};
3
3
  use polars_core::prelude::*;
4
4
 
5
5
  use super::numo_rs::{Element, RbArray1};
6
+ use crate::RbResult;
6
7
  use crate::error::RbPolarsErr;
7
8
  use crate::raise_err;
8
9
  use crate::series::RbSeries;
9
- use crate::RbResult;
10
10
 
11
11
  impl RbSeries {
12
12
  /// Convert this Series to a Numo array.
data/ext/polars/src/io/mod.rs +102 -0
@@ -0,0 +1,102 @@
1
+ use std::sync::Arc;
2
+
3
+ use magnus::{TryConvert, Value, value::ReprValue};
4
+ use polars::prelude::deletion::DeletionFilesList;
5
+ use polars::prelude::{
6
+ CastColumnsPolicy, ColumnMapping, ExtraColumnsPolicy, MissingColumnsPolicy, PlSmallStr, Schema,
7
+ UnifiedScanArgs,
8
+ };
9
+ use polars_io::{HiveOptions, RowIndex};
10
+ use polars_utils::IdxSize;
11
+ use polars_utils::plpath::PlPathRef;
12
+ use polars_utils::slice_enum::Slice;
13
+
14
+ use crate::RbResult;
15
+ use crate::prelude::Wrap;
16
+
17
+ /// Interface to `class ScanOptions` on the Ruby side
18
+ pub struct RbScanOptions(Value);
19
+
20
+ impl TryConvert for RbScanOptions {
21
+ fn try_convert(ob: Value) -> RbResult<Self> {
22
+ Ok(Self(ob))
23
+ }
24
+ }
25
+
26
+ impl RbScanOptions {
27
+ pub fn extract_unified_scan_args(
28
+ &self,
29
+ // For cloud_options init
30
+ first_path: Option<PlPathRef>,
31
+ ) -> RbResult<UnifiedScanArgs> {
32
+ let row_index: Option<(Wrap<PlSmallStr>, IdxSize)> = self.0.funcall("row_index", ())?;
33
+ let pre_slice: Option<(i64, usize)> = self.0.funcall("pre_slice", ())?;
34
+ let cast_options: Wrap<CastColumnsPolicy> = self.0.funcall("cast_options", ())?;
35
+ let extra_columns: Wrap<ExtraColumnsPolicy> = self.0.funcall("extra_columns", ())?;
36
+ let missing_columns: Wrap<MissingColumnsPolicy> = self.0.funcall("missing_columns", ())?;
37
+ let include_file_paths: Option<Wrap<PlSmallStr>> =
38
+ self.0.funcall("include_file_paths", ())?;
39
+ let glob: bool = self.0.funcall("glob", ())?;
40
+ let hive_partitioning: Option<bool> = self.0.funcall("hive_partitioning", ())?;
41
+ let hive_schema: Option<Wrap<Schema>> = self.0.funcall("hive_schema", ())?;
42
+ let try_parse_hive_dates: bool = self.0.funcall("try_parse_hive_dates", ())?;
43
+ let rechunk: bool = self.0.funcall("rechunk", ())?;
44
+ let cache: bool = self.0.funcall("cache", ())?;
45
+ let storage_options: Option<Vec<(String, String)>> =
46
+ self.0.funcall("storage_options", ())?;
47
+ let retries: usize = self.0.funcall("retries", ())?;
48
+ let deletion_files: Option<Wrap<DeletionFilesList>> =
49
+ self.0.funcall("deletion_files", ())?;
50
+ let column_mapping: Option<Wrap<ColumnMapping>> = self.0.funcall("column_mapping", ())?;
51
+
52
+ let cloud_options = storage_options;
53
+
54
+ let cloud_options = if let Some(first_path) = first_path {
55
+ use crate::prelude::parse_cloud_options;
56
+
57
+ let first_path_url = first_path.to_str();
58
+ let cloud_options =
59
+ parse_cloud_options(first_path_url, cloud_options.unwrap_or_default())?;
60
+
61
+ Some(cloud_options.with_max_retries(retries))
62
+ } else {
63
+ None
64
+ };
65
+
66
+ let hive_schema = hive_schema.map(|s| Arc::new(s.0));
67
+
68
+ let row_index = row_index.map(|(name, offset)| RowIndex {
69
+ name: name.0,
70
+ offset,
71
+ });
72
+
73
+ let hive_options = HiveOptions {
74
+ enabled: hive_partitioning,
75
+ hive_start_idx: 0,
76
+ schema: hive_schema,
77
+ try_parse_dates: try_parse_hive_dates,
78
+ };
79
+
80
+ let unified_scan_args = UnifiedScanArgs {
81
+ // Schema is currently still stored inside the options per scan type, but we do eventually
82
+ // want to put it here instead.
83
+ schema: None,
84
+ cloud_options,
85
+ hive_options,
86
+ rechunk,
87
+ cache,
88
+ glob,
89
+ projection: None,
90
+ row_index,
91
+ pre_slice: pre_slice.map(Slice::from),
92
+ cast_columns_policy: cast_options.0,
93
+ missing_columns_policy: missing_columns.0,
94
+ extra_columns_policy: extra_columns.0,
95
+ include_file_paths: include_file_paths.map(|x| x.0),
96
+ deletion_files: DeletionFilesList::filter_empty(deletion_files.map(|x| x.0)),
97
+ column_mapping: column_mapping.map(|x| x.0),
98
+ };
99
+
100
+ Ok(unified_scan_args)
101
+ }
102
+ }
data/ext/polars/src/lazyframe/general.rs +74 -112
@@ -1,4 +1,4 @@
1
- use magnus::{r_hash::ForEach, typed_data::Obj, IntoValue, RArray, RHash, TryConvert, Value};
1
+ use magnus::{IntoValue, RArray, RHash, TryConvert, Value, r_hash::ForEach, typed_data::Obj};
2
2
  use polars::io::{HiveOptions, RowIndex};
3
3
  use polars::lazy::frame::LazyFrame;
4
4
  use polars::prelude::*;
@@ -6,16 +6,17 @@ use polars_plan::dsl::ScanSources;
6
6
  use std::cell::RefCell;
7
7
  use std::io::BufWriter;
8
8
  use std::num::NonZeroUsize;
9
- use std::path::PathBuf;
10
9
 
11
10
  use super::SinkTarget;
12
11
  use crate::conversion::*;
13
12
  use crate::expr::rb_exprs_to_exprs;
13
+ use crate::expr::selector::RbSelector;
14
14
  use crate::file::get_file_like;
15
+ use crate::io::RbScanOptions;
15
16
  use crate::{RbDataFrame, RbExpr, RbLazyFrame, RbLazyGroupBy, RbPolarsErr, RbResult, RbValueError};
16
17
 
17
- fn rbobject_to_first_path_and_scan_sources(obj: Value) -> RbResult<(Option<PathBuf>, ScanSources)> {
18
- use crate::file::{get_ruby_scan_source_input, RubyScanSourceInput};
18
+ fn rbobject_to_first_path_and_scan_sources(obj: Value) -> RbResult<(Option<PlPath>, ScanSources)> {
19
+ use crate::file::{RubyScanSourceInput, get_ruby_scan_source_input};
19
20
  Ok(match get_ruby_scan_source_input(obj, false)? {
20
21
  RubyScanSourceInput::Path(path) => (Some(path.clone()), ScanSources::Paths([path].into())),
21
22
  RubyScanSourceInput::File(file) => (None, ScanSources::Files([file].into())),
@@ -43,7 +44,7 @@ impl RbLazyFrame {
43
44
 
44
45
  let sources = sources.0;
45
46
  let (_first_path, sources) = match source {
46
- None => (sources.first_path().map(|p| p.to_path_buf()), sources),
47
+ None => (sources.first_path().map(|p| p.into_owned()), sources),
47
48
  Some(source) => rbobject_to_first_path_and_scan_sources(source)?,
48
49
  };
49
50
 
@@ -111,7 +112,7 @@ impl RbLazyFrame {
111
112
 
112
113
  let sources = sources.0;
113
114
  let (_first_path, sources) = match source {
114
- None => (sources.first_path().map(|p| p.to_path_buf()), sources),
115
+ None => (sources.first_path().map(|p| p.into_owned()), sources),
115
116
  Some(source) => rbobject_to_first_path_and_scan_sources(source)?,
116
117
  };
117
118
 
@@ -147,72 +148,35 @@ impl RbLazyFrame {
147
148
  Ok(r.finish().map_err(RbPolarsErr::from)?.into())
148
149
  }
149
150
 
150
- pub fn new_from_parquet(arguments: &[Value]) -> RbResult<Self> {
151
- let source = Option::<Value>::try_convert(arguments[0])?;
152
- let sources = Wrap::<ScanSources>::try_convert(arguments[1])?;
153
- let n_rows = Option::<usize>::try_convert(arguments[2])?;
154
- let cache = bool::try_convert(arguments[3])?;
155
- let parallel = Wrap::<ParallelStrategy>::try_convert(arguments[4])?;
156
- let rechunk = bool::try_convert(arguments[5])?;
157
- let row_index = Option::<(String, IdxSize)>::try_convert(arguments[6])?;
158
- let low_memory = bool::try_convert(arguments[7])?;
159
- let cloud_options = Option::<Vec<(String, String)>>::try_convert(arguments[8])?;
160
- let _credential_provider = Option::<Value>::try_convert(arguments[9])?;
161
- let use_statistics = bool::try_convert(arguments[10])?;
162
- let hive_partitioning = Option::<bool>::try_convert(arguments[11])?;
163
- let schema = Option::<Wrap<Schema>>::try_convert(arguments[12])?;
164
- let hive_schema = Option::<Wrap<Schema>>::try_convert(arguments[13])?;
165
- let try_parse_hive_dates = bool::try_convert(arguments[14])?;
166
- let retries = usize::try_convert(arguments[15])?;
167
- let glob = bool::try_convert(arguments[16])?;
168
- let include_file_paths = Option::<String>::try_convert(arguments[17])?;
169
- let allow_missing_columns = bool::try_convert(arguments[18])?;
151
+ pub fn new_from_parquet(
152
+ sources: Wrap<ScanSources>,
153
+ schema: Option<Wrap<Schema>>,
154
+ scan_options: RbScanOptions,
155
+ parallel: Wrap<ParallelStrategy>,
156
+ low_memory: bool,
157
+ use_statistics: bool,
158
+ ) -> RbResult<Self> {
159
+ use crate::utils::to_rb_err;
170
160
 
171
161
  let parallel = parallel.0;
172
- let hive_schema = hive_schema.map(|s| Arc::new(s.0));
173
-
174
- let row_index = row_index.map(|(name, offset)| RowIndex {
175
- name: name.into(),
176
- offset,
177
- });
178
162
 
179
- let hive_options = HiveOptions {
180
- enabled: hive_partitioning,
181
- hive_start_idx: 0,
182
- schema: hive_schema,
183
- try_parse_dates: try_parse_hive_dates,
184
- };
185
-
186
- let mut args = ScanArgsParquet {
187
- n_rows,
188
- cache,
163
+ let options = ParquetOptions {
164
+ schema: schema.map(|x| Arc::new(x.0)),
189
165
  parallel,
190
- rechunk,
191
- row_index,
192
166
  low_memory,
193
- cloud_options: None,
194
167
  use_statistics,
195
- schema: schema.map(|x| Arc::new(x.0)),
196
- hive_options,
197
- glob,
198
- include_file_paths: include_file_paths.map(|x| x.into()),
199
- allow_missing_columns,
200
168
  };
201
169
 
202
170
  let sources = sources.0;
203
- let (first_path, sources) = match source {
204
- None => (sources.first_path().map(|p| p.to_path_buf()), sources),
205
- Some(source) => rbobject_to_first_path_and_scan_sources(source)?,
206
- };
171
+ let first_path = sources.first_path().map(|p| p.into_owned());
207
172
 
208
- if let Some(first_path) = first_path {
209
- let first_path_url = first_path.to_string_lossy();
210
- let cloud_options =
211
- parse_cloud_options(&first_path_url, cloud_options.unwrap_or_default())?;
212
- args.cloud_options = Some(cloud_options.with_max_retries(retries));
213
- }
173
+ let unified_scan_args =
174
+ scan_options.extract_unified_scan_args(first_path.as_ref().map(|p| p.as_ref()))?;
214
175
 
215
- let lf = LazyFrame::scan_parquet_sources(sources, args).map_err(RbPolarsErr::from)?;
176
+ let lf: LazyFrame = DslBuilder::scan_parquet(sources, options, unified_scan_args)
177
+ .map_err(to_rb_err)?
178
+ .build()
179
+ .into();
216
180
 
217
181
  Ok(lf.into())
218
182
  }
@@ -254,7 +218,7 @@ impl RbLazyFrame {
254
218
 
255
219
  let sources = sources.0;
256
220
  let (_first_path, sources) = match source {
257
- None => (sources.first_path().map(|p| p.to_path_buf()), sources),
221
+ None => (sources.first_path().map(|p| p.into_owned()), sources),
258
222
  Some(source) => rbobject_to_first_path_and_scan_sources(source)?,
259
223
  };
260
224
 
@@ -265,7 +229,7 @@ impl RbLazyFrame {
265
229
  pub fn write_json(&self, rb_f: Value) -> RbResult<()> {
266
230
  let file = BufWriter::new(get_file_like(rb_f, true)?);
267
231
  serde_json::to_writer(file, &self.ldf.borrow().logical_plan)
268
- .map_err(|err| RbValueError::new_err(format!("{:?}", err)))?;
232
+ .map_err(|err| RbValueError::new_err(format!("{err:?}")))?;
269
233
  Ok(())
270
234
  }
271
235
 
@@ -399,10 +363,8 @@ impl RbLazyFrame {
399
363
  let cloud_options = match target.base_path() {
400
364
  None => None,
401
365
  Some(base_path) => {
402
- let cloud_options = parse_cloud_options(
403
- base_path.to_str().unwrap(),
404
- cloud_options.unwrap_or_default(),
405
- )?;
366
+ let cloud_options =
367
+ parse_cloud_options(base_path.to_str(), cloud_options.unwrap_or_default())?;
406
368
  Some(cloud_options.with_max_retries(retries))
407
369
  }
408
370
  };
@@ -434,10 +396,8 @@ impl RbLazyFrame {
434
396
  let cloud_options = match target.base_path() {
435
397
  None => None,
436
398
  Some(base_path) => {
437
- let cloud_options = parse_cloud_options(
438
- base_path.to_str().unwrap(),
439
- cloud_options.unwrap_or_default(),
440
- )?;
399
+ let cloud_options =
400
+ parse_cloud_options(base_path.to_str(), cloud_options.unwrap_or_default())?;
441
401
  Some(cloud_options.with_max_retries(retries))
442
402
  }
443
403
  };
@@ -466,11 +426,12 @@ impl RbLazyFrame {
466
426
  let time_format = Option::<String>::try_convert(arguments[9])?;
467
427
  let float_scientific = Option::<bool>::try_convert(arguments[10])?;
468
428
  let float_precision = Option::<usize>::try_convert(arguments[11])?;
469
- let null_value = Option::<String>::try_convert(arguments[12])?;
470
- let quote_style = Option::<Wrap<QuoteStyle>>::try_convert(arguments[13])?;
471
- let cloud_options = Option::<Vec<(String, String)>>::try_convert(arguments[14])?;
472
- let retries = usize::try_convert(arguments[15])?;
473
- let sink_options = Wrap::<SinkOptions>::try_convert(arguments[16])?;
429
+ let decimal_comma = bool::try_convert(arguments[12])?;
430
+ let null_value = Option::<String>::try_convert(arguments[13])?;
431
+ let quote_style = Option::<Wrap<QuoteStyle>>::try_convert(arguments[14])?;
432
+ let cloud_options = Option::<Vec<(String, String)>>::try_convert(arguments[15])?;
433
+ let retries = usize::try_convert(arguments[16])?;
434
+ let sink_options = Wrap::<SinkOptions>::try_convert(arguments[17])?;
474
435
 
475
436
  let quote_style = quote_style.map_or(QuoteStyle::default(), |wrap| wrap.0);
476
437
  let null_value = null_value.unwrap_or(SerializeOptions::default().null);
@@ -481,6 +442,7 @@ impl RbLazyFrame {
481
442
  datetime_format,
482
443
  float_scientific,
483
444
  float_precision,
445
+ decimal_comma,
484
446
  separator,
485
447
  quote_char,
486
448
  null: null_value,
@@ -498,10 +460,8 @@ impl RbLazyFrame {
498
460
  let cloud_options = match target.base_path() {
499
461
  None => None,
500
462
  Some(base_path) => {
501
- let cloud_options = parse_cloud_options(
502
- base_path.to_str().unwrap(),
503
- cloud_options.unwrap_or_default(),
504
- )?;
463
+ let cloud_options =
464
+ parse_cloud_options(base_path.to_str(), cloud_options.unwrap_or_default())?;
505
465
  Some(cloud_options.with_max_retries(retries))
506
466
  }
507
467
  };
@@ -529,10 +489,8 @@ impl RbLazyFrame {
529
489
  let cloud_options = match target.base_path() {
530
490
  None => None,
531
491
  Some(base_path) => {
532
- let cloud_options = parse_cloud_options(
533
- base_path.to_str().unwrap(),
534
- cloud_options.unwrap_or_default(),
535
- )?;
492
+ let cloud_options =
493
+ parse_cloud_options(base_path.to_str(), cloud_options.unwrap_or_default())?;
536
494
  Some(cloud_options.with_max_retries(retries))
537
495
  }
538
496
  };
@@ -546,12 +504,6 @@ impl RbLazyFrame {
546
504
  .map_err(Into::into)
547
505
  }
548
506
 
549
- pub fn fetch(&self, n_rows: usize) -> RbResult<RbDataFrame> {
550
- let ldf = self.ldf.borrow().clone();
551
- let df = ldf.fetch(n_rows).map_err(RbPolarsErr::from)?;
552
- Ok(df.into())
553
- }
554
-
555
507
  pub fn filter(&self, predicate: &RbExpr) -> Self {
556
508
  let ldf = self.ldf.borrow().clone();
557
509
  ldf.filter(predicate.inner.clone()).into()
@@ -689,15 +641,19 @@ impl RbLazyFrame {
689
641
  .allow_parallel(allow_parallel)
690
642
  .force_parallel(force_parallel)
691
643
  .coalesce(coalesce)
692
- .how(JoinType::AsOf(AsOfOptions {
644
+ .how(JoinType::AsOf(Box::new(AsOfOptions {
693
645
  strategy: strategy.0,
694
646
  left_by: left_by.map(strings_to_pl_smallstr),
695
647
  right_by: right_by.map(strings_to_pl_smallstr),
696
- tolerance: tolerance.map(|t| t.0.into_static()),
648
+ tolerance: tolerance.map(|t| {
649
+ let av = t.0.into_static();
650
+ let dtype = av.dtype();
651
+ Scalar::new(dtype, av)
652
+ }),
697
653
  tolerance_str: tolerance_str.map(|s| s.into()),
698
654
  allow_eq,
699
655
  check_sortedness,
700
- }))
656
+ })))
701
657
  .suffix(suffix)
702
658
  .finish()
703
659
  .into())
@@ -832,10 +788,12 @@ impl RbLazyFrame {
832
788
  out.into()
833
789
  }
834
790
 
835
- pub fn explode(&self, column: RArray) -> RbResult<Self> {
836
- let ldf = self.ldf.borrow().clone();
837
- let column = rb_exprs_to_exprs(column)?;
838
- Ok(ldf.explode(column).into())
791
+ pub fn explode(&self, subset: &RbSelector) -> Self {
792
+ self.ldf
793
+ .borrow()
794
+ .clone()
795
+ .explode(subset.inner.clone())
796
+ .into()
839
797
  }
840
798
 
841
799
  pub fn null_count(&self) -> Self {
@@ -846,10 +804,11 @@ impl RbLazyFrame {
846
804
  pub fn unique(
847
805
  &self,
848
806
  maintain_order: bool,
849
- subset: Option<Vec<String>>,
807
+ subset: Option<&RbSelector>,
850
808
  keep: Wrap<UniqueKeepStrategy>,
851
809
  ) -> RbResult<Self> {
852
810
  let ldf = self.ldf.borrow().clone();
811
+ let subset = subset.map(|e| e.inner.clone());
853
812
  Ok(match maintain_order {
854
813
  true => ldf.unique_stable_generic(subset, keep.0),
855
814
  false => ldf.unique_generic(subset, keep.0),
@@ -857,9 +816,11 @@ impl RbLazyFrame {
857
816
  .into())
858
817
  }
859
818
 
860
- pub fn drop_nulls(&self, subset: Option<Vec<String>>) -> Self {
861
- let ldf = self.ldf.borrow().clone();
862
- ldf.drop_nulls(subset.map(|v| v.into_iter().map(|s| col(&s)).collect()))
819
+ pub fn drop_nulls(&self, subset: Option<&RbSelector>) -> Self {
820
+ self.ldf
821
+ .borrow()
822
+ .clone()
823
+ .drop_nulls(subset.map(|e| e.inner.clone()))
863
824
  .into()
864
825
  }
865
826
 
@@ -875,16 +836,14 @@ impl RbLazyFrame {
875
836
 
876
837
  pub fn unpivot(
877
838
  &self,
878
- on: RArray,
879
- index: RArray,
839
+ on: &RbSelector,
840
+ index: &RbSelector,
880
841
  value_name: Option<String>,
881
842
  variable_name: Option<String>,
882
843
  ) -> RbResult<Self> {
883
- let on = rb_exprs_to_exprs(on)?;
884
- let index = rb_exprs_to_exprs(index)?;
885
844
  let args = UnpivotArgsDSL {
886
- on: on.into_iter().map(|e| e.into()).collect(),
887
- index: index.into_iter().map(|e| e.into()).collect(),
845
+ on: on.inner.clone(),
846
+ index: index.inner.clone(),
888
847
  value_name: value_name.map(|s| s.into()),
889
848
  variable_name: variable_name.map(|s| s.into()),
890
849
  };
@@ -898,9 +857,8 @@ impl RbLazyFrame {
898
857
  ldf.with_row_index(&name, offset).into()
899
858
  }
900
859
 
901
- pub fn drop(&self, cols: Vec<String>) -> Self {
902
- let ldf = self.ldf.borrow().clone();
903
- ldf.drop(cols).into()
860
+ pub fn drop(&self, columns: &RbSelector) -> Self {
861
+ self.ldf.borrow().clone().drop(columns.inner.clone()).into()
904
862
  }
905
863
 
906
864
  pub fn cast(&self, rb_dtypes: RHash, strict: bool) -> RbResult<Self> {
@@ -941,8 +899,12 @@ impl RbLazyFrame {
941
899
  Ok(schema_dict)
942
900
  }
943
901
 
944
- pub fn unnest(&self, cols: Vec<String>) -> Self {
945
- self.ldf.borrow().clone().unnest(cols).into()
902
+ pub fn unnest(&self, columns: &RbSelector) -> Self {
903
+ self.ldf
904
+ .borrow()
905
+ .clone()
906
+ .unnest(columns.inner.clone())
907
+ .into()
946
908
  }
947
909
 
948
910
  pub fn count(&self) -> Self {
data/ext/polars/src/lazyframe/serde.rs +1 -1
@@ -25,7 +25,7 @@ impl RbLazyFrame {
25
25
  let json = unsafe { std::mem::transmute::<&'_ str, &'static str>(json.as_str()) };
26
26
 
27
27
  let lp = serde_json::from_str::<DslPlan>(json)
28
- .map_err(|err| RbValueError::new_err(format!("{:?}", err)))?;
28
+ .map_err(|err| RbValueError::new_err(format!("{err:?}")))?;
29
29
  Ok(LazyFrame::from(lp).into())
30
30
  }
31
31
  }