polars-df 0.20.0 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/Cargo.lock +192 -186
- data/LICENSE.txt +1 -1
- data/ext/polars/Cargo.toml +13 -9
- data/ext/polars/src/batched_csv.rs +2 -2
- data/ext/polars/src/catalog/mod.rs +1 -0
- data/ext/polars/src/catalog/unity.rs +450 -0
- data/ext/polars/src/conversion/any_value.rs +9 -19
- data/ext/polars/src/conversion/categorical.rs +30 -0
- data/ext/polars/src/conversion/chunked_array.rs +8 -8
- data/ext/polars/src/conversion/mod.rs +187 -109
- data/ext/polars/src/dataframe/construction.rs +2 -2
- data/ext/polars/src/dataframe/export.rs +2 -2
- data/ext/polars/src/dataframe/general.rs +4 -2
- data/ext/polars/src/dataframe/io.rs +2 -2
- data/ext/polars/src/exceptions.rs +1 -1
- data/ext/polars/src/expr/datatype.rs +14 -0
- data/ext/polars/src/expr/general.rs +22 -17
- data/ext/polars/src/expr/list.rs +21 -2
- data/ext/polars/src/expr/meta.rs +0 -34
- data/ext/polars/src/expr/mod.rs +3 -1
- data/ext/polars/src/expr/name.rs +2 -2
- data/ext/polars/src/expr/rolling.rs +1 -1
- data/ext/polars/src/expr/selector.rs +219 -0
- data/ext/polars/src/expr/string.rs +14 -6
- data/ext/polars/src/file.rs +11 -5
- data/ext/polars/src/functions/io.rs +2 -11
- data/ext/polars/src/functions/lazy.rs +22 -54
- data/ext/polars/src/functions/meta.rs +2 -2
- data/ext/polars/src/functions/misc.rs +1 -1
- data/ext/polars/src/functions/string_cache.rs +4 -5
- data/ext/polars/src/interop/numo/numo_rs.rs +1 -1
- data/ext/polars/src/interop/numo/to_numo_series.rs +1 -1
- data/ext/polars/src/io/mod.rs +102 -0
- data/ext/polars/src/lazyframe/general.rs +74 -112
- data/ext/polars/src/lazyframe/serde.rs +1 -1
- data/ext/polars/src/lazyframe/sink.rs +6 -6
- data/ext/polars/src/lib.rs +98 -20
- data/ext/polars/src/map/dataframe.rs +7 -7
- data/ext/polars/src/map/lazy.rs +1 -1
- data/ext/polars/src/map/mod.rs +31 -19
- data/ext/polars/src/map/series.rs +8 -8
- data/ext/polars/src/on_startup.rs +5 -2
- data/ext/polars/src/rb_modules.rs +1 -1
- data/ext/polars/src/series/construction.rs +11 -7
- data/ext/polars/src/series/export.rs +6 -4
- data/ext/polars/src/series/general.rs +12 -207
- data/ext/polars/src/series/import.rs +2 -2
- data/ext/polars/src/series/map.rs +227 -0
- data/ext/polars/src/series/mod.rs +2 -1
- data/ext/polars/src/series/scatter.rs +1 -1
- data/ext/polars/src/utils.rs +10 -2
- data/lib/polars/cat_name_space.rb +3 -43
- data/lib/polars/catalog/unity/catalog_info.rb +20 -0
- data/lib/polars/catalog/unity/column_info.rb +31 -0
- data/lib/polars/catalog/unity/namespace_info.rb +21 -0
- data/lib/polars/catalog/unity/table_info.rb +50 -0
- data/lib/polars/catalog.rb +448 -0
- data/lib/polars/convert.rb +10 -0
- data/lib/polars/data_frame.rb +151 -30
- data/lib/polars/data_types.rb +47 -3
- data/lib/polars/exceptions.rb +7 -2
- data/lib/polars/expr.rb +34 -31
- data/lib/polars/functions/col.rb +6 -5
- data/lib/polars/functions/lazy.rb +114 -15
- data/lib/polars/functions/repeat.rb +4 -0
- data/lib/polars/io/csv.rb +18 -0
- data/lib/polars/io/json.rb +16 -0
- data/lib/polars/io/ndjson.rb +13 -0
- data/lib/polars/io/parquet.rb +45 -63
- data/lib/polars/io/scan_options.rb +47 -0
- data/lib/polars/lazy_frame.rb +163 -75
- data/lib/polars/list_expr.rb +204 -7
- data/lib/polars/list_name_space.rb +120 -1
- data/lib/polars/meta_expr.rb +7 -22
- data/lib/polars/scan_cast_options.rb +64 -0
- data/lib/polars/schema.rb +6 -1
- data/lib/polars/selector.rb +138 -0
- data/lib/polars/selectors.rb +931 -202
- data/lib/polars/series.rb +34 -11
- data/lib/polars/string_expr.rb +24 -3
- data/lib/polars/string_name_space.rb +11 -0
- data/lib/polars/utils/parse.rb +40 -0
- data/lib/polars/utils.rb +5 -1
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +8 -0
- metadata +17 -2
@@ -1,13 +1,13 @@
|
|
1
1
|
use magnus::encoding::{self, EncodingCapable};
|
2
2
|
use magnus::{
|
3
|
-
|
3
|
+
Float, Integer, RArray, RString, Ruby, Value, class, prelude::*, typed_data::Obj, value::Opaque,
|
4
4
|
};
|
5
5
|
use polars::lazy::dsl;
|
6
6
|
use polars::prelude::*;
|
7
7
|
|
8
|
-
use crate::conversion::{get_lf, get_rbseq
|
8
|
+
use crate::conversion::{Wrap, get_lf, get_rbseq};
|
9
|
+
use crate::expr::datatype::RbDataTypeExpr;
|
9
10
|
use crate::map::lazy::binary_lambda;
|
10
|
-
use crate::prelude::vec_extract_wrapped;
|
11
11
|
use crate::rb_exprs_to_exprs;
|
12
12
|
use crate::{RbDataFrame, RbExpr, RbLazyFrame, RbPolarsErr, RbResult, RbSeries, RbValueError};
|
13
13
|
|
@@ -103,10 +103,6 @@ pub fn collect_all(lfs: RArray) -> RbResult<RArray> {
|
|
103
103
|
})))
|
104
104
|
}
|
105
105
|
|
106
|
-
pub fn cols(names: Vec<String>) -> RbExpr {
|
107
|
-
dsl::cols(names).into()
|
108
|
-
}
|
109
|
-
|
110
106
|
pub fn concat_lf(
|
111
107
|
lfs: Value,
|
112
108
|
rechunk: bool,
|
@@ -166,20 +162,24 @@ pub fn cum_fold(
|
|
166
162
|
acc: &RbExpr,
|
167
163
|
lambda: Value,
|
168
164
|
exprs: RArray,
|
165
|
+
returns_scalar: bool,
|
166
|
+
return_dtype: Option<&RbDataTypeExpr>,
|
169
167
|
include_init: bool,
|
170
168
|
) -> RbResult<RbExpr> {
|
171
169
|
let exprs = rb_exprs_to_exprs(exprs)?;
|
172
170
|
let lambda = Opaque::from(lambda);
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
171
|
+
let func = PlanCallback::new(move |(a, b): (Series, Series)| {
|
172
|
+
binary_lambda(Ruby::get().unwrap().get_inner(lambda), a, b).map(|v| v.unwrap())
|
173
|
+
});
|
174
|
+
Ok(dsl::cum_fold_exprs(
|
175
|
+
acc.inner.clone(),
|
176
|
+
func,
|
177
|
+
exprs,
|
178
|
+
returns_scalar,
|
179
|
+
return_dtype.map(|v| v.inner.clone()),
|
180
|
+
include_init,
|
181
|
+
)
|
182
|
+
.into())
|
183
183
|
}
|
184
184
|
|
185
185
|
pub fn concat_lf_diagonal(
|
@@ -220,24 +220,6 @@ pub fn concat_lf_horizontal(lfs: RArray, parallel: bool) -> RbResult<RbLazyFrame
|
|
220
220
|
Ok(lf.into())
|
221
221
|
}
|
222
222
|
|
223
|
-
pub fn dtype_cols(dtypes: RArray) -> RbResult<RbExpr> {
|
224
|
-
let dtypes = dtypes
|
225
|
-
.into_iter()
|
226
|
-
.map(Wrap::<DataType>::try_convert)
|
227
|
-
.collect::<RbResult<Vec<Wrap<DataType>>>>()?;
|
228
|
-
let dtypes = vec_extract_wrapped(dtypes);
|
229
|
-
Ok(dsl::dtype_cols(dtypes).into())
|
230
|
-
}
|
231
|
-
|
232
|
-
pub fn index_cols(indices: Vec<i64>) -> RbExpr {
|
233
|
-
if indices.len() == 1 {
|
234
|
-
dsl::nth(indices[0])
|
235
|
-
} else {
|
236
|
-
dsl::index_cols(indices)
|
237
|
-
}
|
238
|
-
.into()
|
239
|
-
}
|
240
|
-
|
241
223
|
#[allow(clippy::too_many_arguments)]
|
242
224
|
pub fn duration(
|
243
225
|
weeks: Option<&RbExpr>,
|
@@ -274,42 +256,28 @@ pub fn duration(
|
|
274
256
|
dsl::duration(args).into()
|
275
257
|
}
|
276
258
|
|
277
|
-
pub fn first() -> RbExpr {
|
278
|
-
dsl::first().into()
|
279
|
-
}
|
280
|
-
|
281
259
|
pub fn fold(
|
282
260
|
acc: &RbExpr,
|
283
261
|
lambda: Value,
|
284
262
|
exprs: RArray,
|
285
263
|
returns_scalar: bool,
|
286
|
-
return_dtype: Option
|
264
|
+
return_dtype: Option<&RbDataTypeExpr>,
|
287
265
|
) -> RbResult<RbExpr> {
|
288
266
|
let exprs = rb_exprs_to_exprs(exprs)?;
|
289
267
|
let lambda = Opaque::from(lambda);
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
Ruby::get().unwrap().get_inner(lambda),
|
294
|
-
a.take_materialized_series(),
|
295
|
-
b.take_materialized_series(),
|
296
|
-
)
|
297
|
-
.map(|v| v.map(Column::from))
|
298
|
-
};
|
268
|
+
let func = PlanCallback::new(move |(a, b): (Series, Series)| {
|
269
|
+
binary_lambda(Ruby::get().unwrap().get_inner(lambda), a, b).map(|v| v.unwrap())
|
270
|
+
});
|
299
271
|
Ok(dsl::fold_exprs(
|
300
272
|
acc.inner.clone(),
|
301
273
|
func,
|
302
274
|
exprs,
|
303
275
|
returns_scalar,
|
304
|
-
return_dtype.map(|w| w.
|
276
|
+
return_dtype.map(|w| w.inner.clone()),
|
305
277
|
)
|
306
278
|
.into())
|
307
279
|
}
|
308
280
|
|
309
|
-
pub fn last() -> RbExpr {
|
310
|
-
dsl::last().into()
|
311
|
-
}
|
312
|
-
|
313
281
|
pub fn lit(value: Value, allow_object: bool, is_scalar: bool) -> RbResult<RbExpr> {
|
314
282
|
if value.is_kind_of(class::true_class()) || value.is_kind_of(class::false_class()) {
|
315
283
|
Ok(dsl::lit(bool::try_convert(value)?).into())
|
@@ -1,8 +1,8 @@
|
|
1
1
|
use magnus::{IntoValue, Value};
|
2
2
|
use polars_core;
|
3
|
+
use polars_core::POOL;
|
3
4
|
use polars_core::fmt::FloatFmt;
|
4
5
|
use polars_core::prelude::IDX_DTYPE;
|
5
|
-
use polars_core::POOL;
|
6
6
|
|
7
7
|
use crate::conversion::Wrap;
|
8
8
|
use crate::{RbResult, RbValueError};
|
@@ -22,7 +22,7 @@ pub fn set_float_fmt(fmt: String) -> RbResult<()> {
|
|
22
22
|
e => {
|
23
23
|
return Err(RbValueError::new_err(format!(
|
24
24
|
"fmt must be one of {{'full', 'mixed'}}, got {e}",
|
25
|
-
)))
|
25
|
+
)));
|
26
26
|
}
|
27
27
|
};
|
28
28
|
polars_core::fmt::set_float_fmt(fmt);
|
@@ -1,17 +1,17 @@
|
|
1
1
|
use crate::RbResult;
|
2
2
|
use magnus::{RArray, Ruby, Value};
|
3
|
-
use polars_core::StringCacheHolder;
|
4
3
|
|
5
4
|
pub fn enable_string_cache() {
|
6
|
-
|
5
|
+
// The string cache no longer exists.
|
7
6
|
}
|
8
7
|
|
9
8
|
pub fn disable_string_cache() {
|
10
|
-
|
9
|
+
// The string cache no longer exists.
|
11
10
|
}
|
12
11
|
|
13
12
|
pub fn using_string_cache() -> bool {
|
14
|
-
|
13
|
+
// The string cache no longer exists.
|
14
|
+
true
|
15
15
|
}
|
16
16
|
|
17
17
|
#[magnus::wrap(class = "Polars::RbStringCacheHolder")]
|
@@ -19,7 +19,6 @@ pub struct RbStringCacheHolder {}
|
|
19
19
|
|
20
20
|
impl RbStringCacheHolder {
|
21
21
|
pub fn hold() -> RbResult<Value> {
|
22
|
-
let _hold = StringCacheHolder::hold();
|
23
22
|
Ruby::get().unwrap().yield_splat(RArray::new())
|
24
23
|
}
|
25
24
|
}
|
@@ -3,10 +3,10 @@ use num_traits::{Float, NumCast};
|
|
3
3
|
use polars_core::prelude::*;
|
4
4
|
|
5
5
|
use super::numo_rs::{Element, RbArray1};
|
6
|
+
use crate::RbResult;
|
6
7
|
use crate::error::RbPolarsErr;
|
7
8
|
use crate::raise_err;
|
8
9
|
use crate::series::RbSeries;
|
9
|
-
use crate::RbResult;
|
10
10
|
|
11
11
|
impl RbSeries {
|
12
12
|
/// Convert this Series to a Numo array.
|
@@ -0,0 +1,102 @@
|
|
1
|
+
use std::sync::Arc;
|
2
|
+
|
3
|
+
use magnus::{TryConvert, Value, value::ReprValue};
|
4
|
+
use polars::prelude::deletion::DeletionFilesList;
|
5
|
+
use polars::prelude::{
|
6
|
+
CastColumnsPolicy, ColumnMapping, ExtraColumnsPolicy, MissingColumnsPolicy, PlSmallStr, Schema,
|
7
|
+
UnifiedScanArgs,
|
8
|
+
};
|
9
|
+
use polars_io::{HiveOptions, RowIndex};
|
10
|
+
use polars_utils::IdxSize;
|
11
|
+
use polars_utils::plpath::PlPathRef;
|
12
|
+
use polars_utils::slice_enum::Slice;
|
13
|
+
|
14
|
+
use crate::RbResult;
|
15
|
+
use crate::prelude::Wrap;
|
16
|
+
|
17
|
+
/// Interface to `class ScanOptions` on the Ruby side
|
18
|
+
pub struct RbScanOptions(Value);
|
19
|
+
|
20
|
+
impl TryConvert for RbScanOptions {
|
21
|
+
fn try_convert(ob: Value) -> RbResult<Self> {
|
22
|
+
Ok(Self(ob))
|
23
|
+
}
|
24
|
+
}
|
25
|
+
|
26
|
+
impl RbScanOptions {
|
27
|
+
pub fn extract_unified_scan_args(
|
28
|
+
&self,
|
29
|
+
// For cloud_options init
|
30
|
+
first_path: Option<PlPathRef>,
|
31
|
+
) -> RbResult<UnifiedScanArgs> {
|
32
|
+
let row_index: Option<(Wrap<PlSmallStr>, IdxSize)> = self.0.funcall("row_index", ())?;
|
33
|
+
let pre_slice: Option<(i64, usize)> = self.0.funcall("pre_slice", ())?;
|
34
|
+
let cast_options: Wrap<CastColumnsPolicy> = self.0.funcall("cast_options", ())?;
|
35
|
+
let extra_columns: Wrap<ExtraColumnsPolicy> = self.0.funcall("extra_columns", ())?;
|
36
|
+
let missing_columns: Wrap<MissingColumnsPolicy> = self.0.funcall("missing_columns", ())?;
|
37
|
+
let include_file_paths: Option<Wrap<PlSmallStr>> =
|
38
|
+
self.0.funcall("include_file_paths", ())?;
|
39
|
+
let glob: bool = self.0.funcall("glob", ())?;
|
40
|
+
let hive_partitioning: Option<bool> = self.0.funcall("hive_partitioning", ())?;
|
41
|
+
let hive_schema: Option<Wrap<Schema>> = self.0.funcall("hive_schema", ())?;
|
42
|
+
let try_parse_hive_dates: bool = self.0.funcall("try_parse_hive_dates", ())?;
|
43
|
+
let rechunk: bool = self.0.funcall("rechunk", ())?;
|
44
|
+
let cache: bool = self.0.funcall("cache", ())?;
|
45
|
+
let storage_options: Option<Vec<(String, String)>> =
|
46
|
+
self.0.funcall("storage_options", ())?;
|
47
|
+
let retries: usize = self.0.funcall("retries", ())?;
|
48
|
+
let deletion_files: Option<Wrap<DeletionFilesList>> =
|
49
|
+
self.0.funcall("deletion_files", ())?;
|
50
|
+
let column_mapping: Option<Wrap<ColumnMapping>> = self.0.funcall("column_mapping", ())?;
|
51
|
+
|
52
|
+
let cloud_options = storage_options;
|
53
|
+
|
54
|
+
let cloud_options = if let Some(first_path) = first_path {
|
55
|
+
use crate::prelude::parse_cloud_options;
|
56
|
+
|
57
|
+
let first_path_url = first_path.to_str();
|
58
|
+
let cloud_options =
|
59
|
+
parse_cloud_options(first_path_url, cloud_options.unwrap_or_default())?;
|
60
|
+
|
61
|
+
Some(cloud_options.with_max_retries(retries))
|
62
|
+
} else {
|
63
|
+
None
|
64
|
+
};
|
65
|
+
|
66
|
+
let hive_schema = hive_schema.map(|s| Arc::new(s.0));
|
67
|
+
|
68
|
+
let row_index = row_index.map(|(name, offset)| RowIndex {
|
69
|
+
name: name.0,
|
70
|
+
offset,
|
71
|
+
});
|
72
|
+
|
73
|
+
let hive_options = HiveOptions {
|
74
|
+
enabled: hive_partitioning,
|
75
|
+
hive_start_idx: 0,
|
76
|
+
schema: hive_schema,
|
77
|
+
try_parse_dates: try_parse_hive_dates,
|
78
|
+
};
|
79
|
+
|
80
|
+
let unified_scan_args = UnifiedScanArgs {
|
81
|
+
// Schema is currently still stored inside the options per scan type, but we do eventually
|
82
|
+
// want to put it here instead.
|
83
|
+
schema: None,
|
84
|
+
cloud_options,
|
85
|
+
hive_options,
|
86
|
+
rechunk,
|
87
|
+
cache,
|
88
|
+
glob,
|
89
|
+
projection: None,
|
90
|
+
row_index,
|
91
|
+
pre_slice: pre_slice.map(Slice::from),
|
92
|
+
cast_columns_policy: cast_options.0,
|
93
|
+
missing_columns_policy: missing_columns.0,
|
94
|
+
extra_columns_policy: extra_columns.0,
|
95
|
+
include_file_paths: include_file_paths.map(|x| x.0),
|
96
|
+
deletion_files: DeletionFilesList::filter_empty(deletion_files.map(|x| x.0)),
|
97
|
+
column_mapping: column_mapping.map(|x| x.0),
|
98
|
+
};
|
99
|
+
|
100
|
+
Ok(unified_scan_args)
|
101
|
+
}
|
102
|
+
}
|
@@ -1,4 +1,4 @@
|
|
1
|
-
use magnus::{
|
1
|
+
use magnus::{IntoValue, RArray, RHash, TryConvert, Value, r_hash::ForEach, typed_data::Obj};
|
2
2
|
use polars::io::{HiveOptions, RowIndex};
|
3
3
|
use polars::lazy::frame::LazyFrame;
|
4
4
|
use polars::prelude::*;
|
@@ -6,16 +6,17 @@ use polars_plan::dsl::ScanSources;
|
|
6
6
|
use std::cell::RefCell;
|
7
7
|
use std::io::BufWriter;
|
8
8
|
use std::num::NonZeroUsize;
|
9
|
-
use std::path::PathBuf;
|
10
9
|
|
11
10
|
use super::SinkTarget;
|
12
11
|
use crate::conversion::*;
|
13
12
|
use crate::expr::rb_exprs_to_exprs;
|
13
|
+
use crate::expr::selector::RbSelector;
|
14
14
|
use crate::file::get_file_like;
|
15
|
+
use crate::io::RbScanOptions;
|
15
16
|
use crate::{RbDataFrame, RbExpr, RbLazyFrame, RbLazyGroupBy, RbPolarsErr, RbResult, RbValueError};
|
16
17
|
|
17
|
-
fn rbobject_to_first_path_and_scan_sources(obj: Value) -> RbResult<(Option<
|
18
|
-
use crate::file::{
|
18
|
+
fn rbobject_to_first_path_and_scan_sources(obj: Value) -> RbResult<(Option<PlPath>, ScanSources)> {
|
19
|
+
use crate::file::{RubyScanSourceInput, get_ruby_scan_source_input};
|
19
20
|
Ok(match get_ruby_scan_source_input(obj, false)? {
|
20
21
|
RubyScanSourceInput::Path(path) => (Some(path.clone()), ScanSources::Paths([path].into())),
|
21
22
|
RubyScanSourceInput::File(file) => (None, ScanSources::Files([file].into())),
|
@@ -43,7 +44,7 @@ impl RbLazyFrame {
|
|
43
44
|
|
44
45
|
let sources = sources.0;
|
45
46
|
let (_first_path, sources) = match source {
|
46
|
-
None => (sources.first_path().map(|p| p.
|
47
|
+
None => (sources.first_path().map(|p| p.into_owned()), sources),
|
47
48
|
Some(source) => rbobject_to_first_path_and_scan_sources(source)?,
|
48
49
|
};
|
49
50
|
|
@@ -111,7 +112,7 @@ impl RbLazyFrame {
|
|
111
112
|
|
112
113
|
let sources = sources.0;
|
113
114
|
let (_first_path, sources) = match source {
|
114
|
-
None => (sources.first_path().map(|p| p.
|
115
|
+
None => (sources.first_path().map(|p| p.into_owned()), sources),
|
115
116
|
Some(source) => rbobject_to_first_path_and_scan_sources(source)?,
|
116
117
|
};
|
117
118
|
|
@@ -147,72 +148,35 @@ impl RbLazyFrame {
|
|
147
148
|
Ok(r.finish().map_err(RbPolarsErr::from)?.into())
|
148
149
|
}
|
149
150
|
|
150
|
-
pub fn new_from_parquet(
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
let cloud_options = Option::<Vec<(String, String)>>::try_convert(arguments[8])?;
|
160
|
-
let _credential_provider = Option::<Value>::try_convert(arguments[9])?;
|
161
|
-
let use_statistics = bool::try_convert(arguments[10])?;
|
162
|
-
let hive_partitioning = Option::<bool>::try_convert(arguments[11])?;
|
163
|
-
let schema = Option::<Wrap<Schema>>::try_convert(arguments[12])?;
|
164
|
-
let hive_schema = Option::<Wrap<Schema>>::try_convert(arguments[13])?;
|
165
|
-
let try_parse_hive_dates = bool::try_convert(arguments[14])?;
|
166
|
-
let retries = usize::try_convert(arguments[15])?;
|
167
|
-
let glob = bool::try_convert(arguments[16])?;
|
168
|
-
let include_file_paths = Option::<String>::try_convert(arguments[17])?;
|
169
|
-
let allow_missing_columns = bool::try_convert(arguments[18])?;
|
151
|
+
pub fn new_from_parquet(
|
152
|
+
sources: Wrap<ScanSources>,
|
153
|
+
schema: Option<Wrap<Schema>>,
|
154
|
+
scan_options: RbScanOptions,
|
155
|
+
parallel: Wrap<ParallelStrategy>,
|
156
|
+
low_memory: bool,
|
157
|
+
use_statistics: bool,
|
158
|
+
) -> RbResult<Self> {
|
159
|
+
use crate::utils::to_rb_err;
|
170
160
|
|
171
161
|
let parallel = parallel.0;
|
172
|
-
let hive_schema = hive_schema.map(|s| Arc::new(s.0));
|
173
|
-
|
174
|
-
let row_index = row_index.map(|(name, offset)| RowIndex {
|
175
|
-
name: name.into(),
|
176
|
-
offset,
|
177
|
-
});
|
178
162
|
|
179
|
-
let
|
180
|
-
|
181
|
-
hive_start_idx: 0,
|
182
|
-
schema: hive_schema,
|
183
|
-
try_parse_dates: try_parse_hive_dates,
|
184
|
-
};
|
185
|
-
|
186
|
-
let mut args = ScanArgsParquet {
|
187
|
-
n_rows,
|
188
|
-
cache,
|
163
|
+
let options = ParquetOptions {
|
164
|
+
schema: schema.map(|x| Arc::new(x.0)),
|
189
165
|
parallel,
|
190
|
-
rechunk,
|
191
|
-
row_index,
|
192
166
|
low_memory,
|
193
|
-
cloud_options: None,
|
194
167
|
use_statistics,
|
195
|
-
schema: schema.map(|x| Arc::new(x.0)),
|
196
|
-
hive_options,
|
197
|
-
glob,
|
198
|
-
include_file_paths: include_file_paths.map(|x| x.into()),
|
199
|
-
allow_missing_columns,
|
200
168
|
};
|
201
169
|
|
202
170
|
let sources = sources.0;
|
203
|
-
let
|
204
|
-
None => (sources.first_path().map(|p| p.to_path_buf()), sources),
|
205
|
-
Some(source) => rbobject_to_first_path_and_scan_sources(source)?,
|
206
|
-
};
|
171
|
+
let first_path = sources.first_path().map(|p| p.into_owned());
|
207
172
|
|
208
|
-
|
209
|
-
|
210
|
-
let cloud_options =
|
211
|
-
parse_cloud_options(&first_path_url, cloud_options.unwrap_or_default())?;
|
212
|
-
args.cloud_options = Some(cloud_options.with_max_retries(retries));
|
213
|
-
}
|
173
|
+
let unified_scan_args =
|
174
|
+
scan_options.extract_unified_scan_args(first_path.as_ref().map(|p| p.as_ref()))?;
|
214
175
|
|
215
|
-
let lf =
|
176
|
+
let lf: LazyFrame = DslBuilder::scan_parquet(sources, options, unified_scan_args)
|
177
|
+
.map_err(to_rb_err)?
|
178
|
+
.build()
|
179
|
+
.into();
|
216
180
|
|
217
181
|
Ok(lf.into())
|
218
182
|
}
|
@@ -254,7 +218,7 @@ impl RbLazyFrame {
|
|
254
218
|
|
255
219
|
let sources = sources.0;
|
256
220
|
let (_first_path, sources) = match source {
|
257
|
-
None => (sources.first_path().map(|p| p.
|
221
|
+
None => (sources.first_path().map(|p| p.into_owned()), sources),
|
258
222
|
Some(source) => rbobject_to_first_path_and_scan_sources(source)?,
|
259
223
|
};
|
260
224
|
|
@@ -265,7 +229,7 @@ impl RbLazyFrame {
|
|
265
229
|
pub fn write_json(&self, rb_f: Value) -> RbResult<()> {
|
266
230
|
let file = BufWriter::new(get_file_like(rb_f, true)?);
|
267
231
|
serde_json::to_writer(file, &self.ldf.borrow().logical_plan)
|
268
|
-
.map_err(|err| RbValueError::new_err(format!("{:?}"
|
232
|
+
.map_err(|err| RbValueError::new_err(format!("{err:?}")))?;
|
269
233
|
Ok(())
|
270
234
|
}
|
271
235
|
|
@@ -399,10 +363,8 @@ impl RbLazyFrame {
|
|
399
363
|
let cloud_options = match target.base_path() {
|
400
364
|
None => None,
|
401
365
|
Some(base_path) => {
|
402
|
-
let cloud_options =
|
403
|
-
base_path.to_str().
|
404
|
-
cloud_options.unwrap_or_default(),
|
405
|
-
)?;
|
366
|
+
let cloud_options =
|
367
|
+
parse_cloud_options(base_path.to_str(), cloud_options.unwrap_or_default())?;
|
406
368
|
Some(cloud_options.with_max_retries(retries))
|
407
369
|
}
|
408
370
|
};
|
@@ -434,10 +396,8 @@ impl RbLazyFrame {
|
|
434
396
|
let cloud_options = match target.base_path() {
|
435
397
|
None => None,
|
436
398
|
Some(base_path) => {
|
437
|
-
let cloud_options =
|
438
|
-
base_path.to_str().
|
439
|
-
cloud_options.unwrap_or_default(),
|
440
|
-
)?;
|
399
|
+
let cloud_options =
|
400
|
+
parse_cloud_options(base_path.to_str(), cloud_options.unwrap_or_default())?;
|
441
401
|
Some(cloud_options.with_max_retries(retries))
|
442
402
|
}
|
443
403
|
};
|
@@ -466,11 +426,12 @@ impl RbLazyFrame {
|
|
466
426
|
let time_format = Option::<String>::try_convert(arguments[9])?;
|
467
427
|
let float_scientific = Option::<bool>::try_convert(arguments[10])?;
|
468
428
|
let float_precision = Option::<usize>::try_convert(arguments[11])?;
|
469
|
-
let
|
470
|
-
let
|
471
|
-
let
|
472
|
-
let
|
473
|
-
let
|
429
|
+
let decimal_comma = bool::try_convert(arguments[12])?;
|
430
|
+
let null_value = Option::<String>::try_convert(arguments[13])?;
|
431
|
+
let quote_style = Option::<Wrap<QuoteStyle>>::try_convert(arguments[14])?;
|
432
|
+
let cloud_options = Option::<Vec<(String, String)>>::try_convert(arguments[15])?;
|
433
|
+
let retries = usize::try_convert(arguments[16])?;
|
434
|
+
let sink_options = Wrap::<SinkOptions>::try_convert(arguments[17])?;
|
474
435
|
|
475
436
|
let quote_style = quote_style.map_or(QuoteStyle::default(), |wrap| wrap.0);
|
476
437
|
let null_value = null_value.unwrap_or(SerializeOptions::default().null);
|
@@ -481,6 +442,7 @@ impl RbLazyFrame {
|
|
481
442
|
datetime_format,
|
482
443
|
float_scientific,
|
483
444
|
float_precision,
|
445
|
+
decimal_comma,
|
484
446
|
separator,
|
485
447
|
quote_char,
|
486
448
|
null: null_value,
|
@@ -498,10 +460,8 @@ impl RbLazyFrame {
|
|
498
460
|
let cloud_options = match target.base_path() {
|
499
461
|
None => None,
|
500
462
|
Some(base_path) => {
|
501
|
-
let cloud_options =
|
502
|
-
base_path.to_str().
|
503
|
-
cloud_options.unwrap_or_default(),
|
504
|
-
)?;
|
463
|
+
let cloud_options =
|
464
|
+
parse_cloud_options(base_path.to_str(), cloud_options.unwrap_or_default())?;
|
505
465
|
Some(cloud_options.with_max_retries(retries))
|
506
466
|
}
|
507
467
|
};
|
@@ -529,10 +489,8 @@ impl RbLazyFrame {
|
|
529
489
|
let cloud_options = match target.base_path() {
|
530
490
|
None => None,
|
531
491
|
Some(base_path) => {
|
532
|
-
let cloud_options =
|
533
|
-
base_path.to_str().
|
534
|
-
cloud_options.unwrap_or_default(),
|
535
|
-
)?;
|
492
|
+
let cloud_options =
|
493
|
+
parse_cloud_options(base_path.to_str(), cloud_options.unwrap_or_default())?;
|
536
494
|
Some(cloud_options.with_max_retries(retries))
|
537
495
|
}
|
538
496
|
};
|
@@ -546,12 +504,6 @@ impl RbLazyFrame {
|
|
546
504
|
.map_err(Into::into)
|
547
505
|
}
|
548
506
|
|
549
|
-
pub fn fetch(&self, n_rows: usize) -> RbResult<RbDataFrame> {
|
550
|
-
let ldf = self.ldf.borrow().clone();
|
551
|
-
let df = ldf.fetch(n_rows).map_err(RbPolarsErr::from)?;
|
552
|
-
Ok(df.into())
|
553
|
-
}
|
554
|
-
|
555
507
|
pub fn filter(&self, predicate: &RbExpr) -> Self {
|
556
508
|
let ldf = self.ldf.borrow().clone();
|
557
509
|
ldf.filter(predicate.inner.clone()).into()
|
@@ -689,15 +641,19 @@ impl RbLazyFrame {
|
|
689
641
|
.allow_parallel(allow_parallel)
|
690
642
|
.force_parallel(force_parallel)
|
691
643
|
.coalesce(coalesce)
|
692
|
-
.how(JoinType::AsOf(AsOfOptions {
|
644
|
+
.how(JoinType::AsOf(Box::new(AsOfOptions {
|
693
645
|
strategy: strategy.0,
|
694
646
|
left_by: left_by.map(strings_to_pl_smallstr),
|
695
647
|
right_by: right_by.map(strings_to_pl_smallstr),
|
696
|
-
tolerance: tolerance.map(|t|
|
648
|
+
tolerance: tolerance.map(|t| {
|
649
|
+
let av = t.0.into_static();
|
650
|
+
let dtype = av.dtype();
|
651
|
+
Scalar::new(dtype, av)
|
652
|
+
}),
|
697
653
|
tolerance_str: tolerance_str.map(|s| s.into()),
|
698
654
|
allow_eq,
|
699
655
|
check_sortedness,
|
700
|
-
}))
|
656
|
+
})))
|
701
657
|
.suffix(suffix)
|
702
658
|
.finish()
|
703
659
|
.into())
|
@@ -832,10 +788,12 @@ impl RbLazyFrame {
|
|
832
788
|
out.into()
|
833
789
|
}
|
834
790
|
|
835
|
-
pub fn explode(&self,
|
836
|
-
|
837
|
-
|
838
|
-
|
791
|
+
pub fn explode(&self, subset: &RbSelector) -> Self {
|
792
|
+
self.ldf
|
793
|
+
.borrow()
|
794
|
+
.clone()
|
795
|
+
.explode(subset.inner.clone())
|
796
|
+
.into()
|
839
797
|
}
|
840
798
|
|
841
799
|
pub fn null_count(&self) -> Self {
|
@@ -846,10 +804,11 @@ impl RbLazyFrame {
|
|
846
804
|
pub fn unique(
|
847
805
|
&self,
|
848
806
|
maintain_order: bool,
|
849
|
-
subset: Option
|
807
|
+
subset: Option<&RbSelector>,
|
850
808
|
keep: Wrap<UniqueKeepStrategy>,
|
851
809
|
) -> RbResult<Self> {
|
852
810
|
let ldf = self.ldf.borrow().clone();
|
811
|
+
let subset = subset.map(|e| e.inner.clone());
|
853
812
|
Ok(match maintain_order {
|
854
813
|
true => ldf.unique_stable_generic(subset, keep.0),
|
855
814
|
false => ldf.unique_generic(subset, keep.0),
|
@@ -857,9 +816,11 @@ impl RbLazyFrame {
|
|
857
816
|
.into())
|
858
817
|
}
|
859
818
|
|
860
|
-
pub fn drop_nulls(&self, subset: Option
|
861
|
-
|
862
|
-
|
819
|
+
pub fn drop_nulls(&self, subset: Option<&RbSelector>) -> Self {
|
820
|
+
self.ldf
|
821
|
+
.borrow()
|
822
|
+
.clone()
|
823
|
+
.drop_nulls(subset.map(|e| e.inner.clone()))
|
863
824
|
.into()
|
864
825
|
}
|
865
826
|
|
@@ -875,16 +836,14 @@ impl RbLazyFrame {
|
|
875
836
|
|
876
837
|
pub fn unpivot(
|
877
838
|
&self,
|
878
|
-
on:
|
879
|
-
index:
|
839
|
+
on: &RbSelector,
|
840
|
+
index: &RbSelector,
|
880
841
|
value_name: Option<String>,
|
881
842
|
variable_name: Option<String>,
|
882
843
|
) -> RbResult<Self> {
|
883
|
-
let on = rb_exprs_to_exprs(on)?;
|
884
|
-
let index = rb_exprs_to_exprs(index)?;
|
885
844
|
let args = UnpivotArgsDSL {
|
886
|
-
on: on.
|
887
|
-
index: index.
|
845
|
+
on: on.inner.clone(),
|
846
|
+
index: index.inner.clone(),
|
888
847
|
value_name: value_name.map(|s| s.into()),
|
889
848
|
variable_name: variable_name.map(|s| s.into()),
|
890
849
|
};
|
@@ -898,9 +857,8 @@ impl RbLazyFrame {
|
|
898
857
|
ldf.with_row_index(&name, offset).into()
|
899
858
|
}
|
900
859
|
|
901
|
-
pub fn drop(&self,
|
902
|
-
|
903
|
-
ldf.drop(cols).into()
|
860
|
+
pub fn drop(&self, columns: &RbSelector) -> Self {
|
861
|
+
self.ldf.borrow().clone().drop(columns.inner.clone()).into()
|
904
862
|
}
|
905
863
|
|
906
864
|
pub fn cast(&self, rb_dtypes: RHash, strict: bool) -> RbResult<Self> {
|
@@ -941,8 +899,12 @@ impl RbLazyFrame {
|
|
941
899
|
Ok(schema_dict)
|
942
900
|
}
|
943
901
|
|
944
|
-
pub fn unnest(&self,
|
945
|
-
self.ldf
|
902
|
+
pub fn unnest(&self, columns: &RbSelector) -> Self {
|
903
|
+
self.ldf
|
904
|
+
.borrow()
|
905
|
+
.clone()
|
906
|
+
.unnest(columns.inner.clone())
|
907
|
+
.into()
|
946
908
|
}
|
947
909
|
|
948
910
|
pub fn count(&self) -> Self {
|
@@ -25,7 +25,7 @@ impl RbLazyFrame {
|
|
25
25
|
let json = unsafe { std::mem::transmute::<&'_ str, &'static str>(json.as_str()) };
|
26
26
|
|
27
27
|
let lp = serde_json::from_str::<DslPlan>(json)
|
28
|
-
.map_err(|err| RbValueError::new_err(format!("{:?}"
|
28
|
+
.map_err(|err| RbValueError::new_err(format!("{err:?}")))?;
|
29
29
|
Ok(LazyFrame::from(lp).into())
|
30
30
|
}
|
31
31
|
}
|