polars-df 0.21.1 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -0
- data/Cargo.lock +120 -90
- data/Cargo.toml +3 -0
- data/README.md +20 -7
- data/ext/polars/Cargo.toml +18 -12
- data/ext/polars/src/batched_csv.rs +4 -4
- data/ext/polars/src/catalog/unity.rs +96 -94
- data/ext/polars/src/conversion/any_value.rs +39 -37
- data/ext/polars/src/conversion/chunked_array.rs +36 -29
- data/ext/polars/src/conversion/datetime.rs +11 -0
- data/ext/polars/src/conversion/mod.rs +244 -51
- data/ext/polars/src/dataframe/construction.rs +5 -17
- data/ext/polars/src/dataframe/export.rs +17 -15
- data/ext/polars/src/dataframe/general.rs +15 -17
- data/ext/polars/src/dataframe/io.rs +1 -2
- data/ext/polars/src/dataframe/mod.rs +25 -1
- data/ext/polars/src/dataframe/serde.rs +23 -8
- data/ext/polars/src/exceptions.rs +8 -5
- data/ext/polars/src/expr/datatype.rs +4 -4
- data/ext/polars/src/expr/datetime.rs +22 -28
- data/ext/polars/src/expr/general.rs +3 -10
- data/ext/polars/src/expr/list.rs +8 -24
- data/ext/polars/src/expr/meta.rs +4 -6
- data/ext/polars/src/expr/mod.rs +2 -0
- data/ext/polars/src/expr/name.rs +11 -14
- data/ext/polars/src/expr/serde.rs +28 -0
- data/ext/polars/src/expr/string.rs +5 -10
- data/ext/polars/src/file.rs +20 -14
- data/ext/polars/src/functions/business.rs +0 -1
- data/ext/polars/src/functions/io.rs +7 -4
- data/ext/polars/src/functions/lazy.rs +7 -6
- data/ext/polars/src/functions/meta.rs +3 -3
- data/ext/polars/src/functions/string_cache.rs +3 -3
- data/ext/polars/src/interop/arrow/to_ruby.rs +3 -3
- data/ext/polars/src/interop/numo/numo_rs.rs +4 -3
- data/ext/polars/src/io/mod.rs +23 -3
- data/ext/polars/src/lazyframe/general.rs +35 -50
- data/ext/polars/src/lazyframe/mod.rs +16 -1
- data/ext/polars/src/lazyframe/optflags.rs +57 -0
- data/ext/polars/src/lazyframe/serde.rs +27 -3
- data/ext/polars/src/lib.rs +144 -19
- data/ext/polars/src/map/dataframe.rs +18 -15
- data/ext/polars/src/map/lazy.rs +6 -5
- data/ext/polars/src/map/series.rs +7 -6
- data/ext/polars/src/on_startup.rs +12 -5
- data/ext/polars/src/rb_modules.rs +2 -2
- data/ext/polars/src/series/aggregation.rs +49 -29
- data/ext/polars/src/series/construction.rs +2 -0
- data/ext/polars/src/series/export.rs +38 -33
- data/ext/polars/src/series/general.rs +69 -31
- data/ext/polars/src/series/mod.rs +29 -4
- data/lib/polars/array_expr.rb +1 -1
- data/lib/polars/data_frame.rb +119 -15
- data/lib/polars/data_types.rb +23 -6
- data/lib/polars/date_time_expr.rb +36 -15
- data/lib/polars/expr.rb +41 -32
- data/lib/polars/functions/business.rb +95 -0
- data/lib/polars/functions/lazy.rb +1 -1
- data/lib/polars/iceberg_dataset.rb +113 -0
- data/lib/polars/io/iceberg.rb +34 -0
- data/lib/polars/io/ipc.rb +28 -49
- data/lib/polars/io/parquet.rb +7 -4
- data/lib/polars/io/scan_options.rb +12 -3
- data/lib/polars/io/utils.rb +17 -0
- data/lib/polars/lazy_frame.rb +97 -10
- data/lib/polars/list_expr.rb +21 -13
- data/lib/polars/list_name_space.rb +33 -21
- data/lib/polars/meta_expr.rb +25 -0
- data/lib/polars/query_opt_flags.rb +50 -0
- data/lib/polars/scan_cast_options.rb +23 -1
- data/lib/polars/schema.rb +1 -1
- data/lib/polars/selectors.rb +8 -8
- data/lib/polars/series.rb +26 -2
- data/lib/polars/string_expr.rb +27 -28
- data/lib/polars/string_name_space.rb +18 -5
- data/lib/polars/utils/convert.rb +2 -2
- data/lib/polars/utils/serde.rb +17 -0
- data/lib/polars/utils/various.rb +4 -0
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +6 -0
- metadata +10 -1
data/ext/polars/src/file.rs
CHANGED
|
@@ -3,7 +3,7 @@ use std::io;
|
|
|
3
3
|
use std::io::{Cursor, Read, Seek, SeekFrom, Write};
|
|
4
4
|
use std::path::PathBuf;
|
|
5
5
|
|
|
6
|
-
use magnus::{Error, RString, Ruby, Value,
|
|
6
|
+
use magnus::{Error, RString, Ruby, Value, prelude::*, value::Opaque};
|
|
7
7
|
use polars::io::cloud::CloudOptions;
|
|
8
8
|
use polars::io::mmap::MmapBytesReader;
|
|
9
9
|
use polars::prelude::PlPath;
|
|
@@ -67,23 +67,25 @@ impl RbFileLikeObject {
|
|
|
67
67
|
/// ruby object has a `read`, `write`, and `seek` methods in respect to parameters.
|
|
68
68
|
/// Will return a `TypeError` if object does not have `read`, `seek`, and `write` methods.
|
|
69
69
|
pub fn with_requirements(object: Value, read: bool, write: bool, seek: bool) -> RbResult<Self> {
|
|
70
|
+
let ruby = Ruby::get_with(object);
|
|
71
|
+
|
|
70
72
|
if read && !object.respond_to("read", false)? {
|
|
71
73
|
return Err(Error::new(
|
|
72
|
-
|
|
74
|
+
ruby.exception_type_error(),
|
|
73
75
|
"Object does not have a .read() method.",
|
|
74
76
|
));
|
|
75
77
|
}
|
|
76
78
|
|
|
77
79
|
if seek && !object.respond_to("seek", false)? {
|
|
78
80
|
return Err(Error::new(
|
|
79
|
-
|
|
81
|
+
ruby.exception_type_error(),
|
|
80
82
|
"Object does not have a .seek() method.",
|
|
81
83
|
));
|
|
82
84
|
}
|
|
83
85
|
|
|
84
86
|
if write && !object.respond_to("write", false)? {
|
|
85
87
|
return Err(Error::new(
|
|
86
|
-
|
|
88
|
+
ruby.exception_type_error(),
|
|
87
89
|
"Object does not have a .write() method.",
|
|
88
90
|
));
|
|
89
91
|
}
|
|
@@ -113,10 +115,10 @@ impl Read for RbFileLikeObject {
|
|
|
113
115
|
|
|
114
116
|
impl Write for RbFileLikeObject {
|
|
115
117
|
fn write(&mut self, buf: &[u8]) -> Result<usize, io::Error> {
|
|
116
|
-
let
|
|
118
|
+
let ruby = Ruby::get().unwrap();
|
|
119
|
+
let rbbytes = ruby.str_from_slice(buf);
|
|
117
120
|
|
|
118
|
-
let number_bytes_written =
|
|
119
|
-
.unwrap()
|
|
121
|
+
let number_bytes_written = ruby
|
|
120
122
|
.get_inner(self.inner)
|
|
121
123
|
.funcall::<_, _, usize>("write", (rbbytes,))
|
|
122
124
|
.map_err(rberr_to_io_err)?;
|
|
@@ -143,13 +145,13 @@ impl Seek for RbFileLikeObject {
|
|
|
143
145
|
SeekFrom::End(i) => (2, i),
|
|
144
146
|
};
|
|
145
147
|
|
|
146
|
-
let
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
.funcall("seek", (offset, whence))
|
|
148
|
+
let inner = Ruby::get().unwrap().get_inner(self.inner);
|
|
149
|
+
|
|
150
|
+
inner
|
|
151
|
+
.funcall::<_, _, Value>("seek", (offset, whence))
|
|
150
152
|
.map_err(rberr_to_io_err)?;
|
|
151
153
|
|
|
152
|
-
|
|
154
|
+
inner.funcall("tell", ()).map_err(rberr_to_io_err)
|
|
153
155
|
}
|
|
154
156
|
}
|
|
155
157
|
|
|
@@ -265,8 +267,12 @@ pub fn get_mmap_bytes_reader_and_path<'a>(
|
|
|
265
267
|
RbReadBytes::Bytes(v) => Ok((Box::new(Cursor::new(unsafe { v.as_slice() })), None)),
|
|
266
268
|
RbReadBytes::Other(v) => {
|
|
267
269
|
let path = PathBuf::try_convert(*v)?;
|
|
268
|
-
let f = File::open(&path)
|
|
269
|
-
|
|
270
|
+
let f = File::open(&path).map_err(|e| {
|
|
271
|
+
Error::new(
|
|
272
|
+
Ruby::get().unwrap().exception_runtime_error(),
|
|
273
|
+
e.to_string(),
|
|
274
|
+
)
|
|
275
|
+
})?;
|
|
270
276
|
Ok((Box::new(f), Some(path)))
|
|
271
277
|
}
|
|
272
278
|
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
use std::io::BufReader;
|
|
2
2
|
|
|
3
|
-
use magnus::{RHash, Value};
|
|
3
|
+
use magnus::{RHash, Ruby, Value};
|
|
4
4
|
use polars::prelude::ArrowSchema;
|
|
5
5
|
|
|
6
6
|
use crate::conversion::Wrap;
|
|
@@ -16,7 +16,8 @@ pub fn read_ipc_schema(rb_f: Value) -> RbResult<RHash> {
|
|
|
16
16
|
EitherRustRubyFile::Rb(mut r) => read_file_metadata(&mut r).map_err(RbPolarsErr::from)?,
|
|
17
17
|
};
|
|
18
18
|
|
|
19
|
-
let
|
|
19
|
+
let ruby = Ruby::get_with(rb_f);
|
|
20
|
+
let dict = ruby.hash_new();
|
|
20
21
|
fields_to_rbdict(&metadata.schema, &dict)?;
|
|
21
22
|
Ok(dict)
|
|
22
23
|
}
|
|
@@ -33,7 +34,8 @@ pub fn read_parquet_metadata(rb_f: Value) -> RbResult<RHash> {
|
|
|
33
34
|
};
|
|
34
35
|
|
|
35
36
|
let key_value_metadata = read_custom_key_value_metadata(metadata.key_value_metadata());
|
|
36
|
-
let
|
|
37
|
+
let ruby = Ruby::get_with(rb_f);
|
|
38
|
+
let dict = ruby.hash_new();
|
|
37
39
|
for (key, value) in key_value_metadata.into_iter() {
|
|
38
40
|
dict.aset(key.as_str(), value.as_str())?;
|
|
39
41
|
}
|
|
@@ -51,7 +53,8 @@ pub fn read_parquet_schema(rb_f: Value) -> RbResult<RHash> {
|
|
|
51
53
|
};
|
|
52
54
|
let arrow_schema = infer_schema(&metadata).map_err(RbPolarsErr::from)?;
|
|
53
55
|
|
|
54
|
-
let
|
|
56
|
+
let ruby = Ruby::get_with(rb_f);
|
|
57
|
+
let dict = ruby.hash_new();
|
|
55
58
|
fields_to_rbdict(&arrow_schema, &dict)?;
|
|
56
59
|
Ok(dict)
|
|
57
60
|
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
use magnus::encoding::
|
|
1
|
+
use magnus::encoding::EncodingCapable;
|
|
2
2
|
use magnus::{
|
|
3
|
-
Float, Integer, RArray, RString, Ruby, Value,
|
|
3
|
+
Float, Integer, RArray, RString, Ruby, Value, prelude::*, typed_data::Obj, value::Opaque,
|
|
4
4
|
};
|
|
5
5
|
use polars::lazy::dsl;
|
|
6
6
|
use polars::prelude::*;
|
|
@@ -98,10 +98,10 @@ pub fn col(name: String) -> RbExpr {
|
|
|
98
98
|
dsl::col(&name).into()
|
|
99
99
|
}
|
|
100
100
|
|
|
101
|
-
pub fn collect_all(lfs: RArray) -> RbResult<RArray> {
|
|
101
|
+
pub fn collect_all(ruby: &Ruby, lfs: RArray) -> RbResult<RArray> {
|
|
102
102
|
let lfs = lfs.typecheck::<Obj<RbLazyFrame>>()?;
|
|
103
103
|
|
|
104
|
-
Ok(
|
|
104
|
+
Ok(ruby.ary_from_iter(lfs.iter().map(|lf| {
|
|
105
105
|
let df = lf.ldf.borrow().clone().collect().unwrap();
|
|
106
106
|
RbDataFrame::new(df)
|
|
107
107
|
})))
|
|
@@ -283,7 +283,8 @@ pub fn fold(
|
|
|
283
283
|
}
|
|
284
284
|
|
|
285
285
|
pub fn lit(value: Value, allow_object: bool, is_scalar: bool) -> RbResult<RbExpr> {
|
|
286
|
-
|
|
286
|
+
let ruby = Ruby::get_with(value);
|
|
287
|
+
if value.is_kind_of(ruby.class_true_class()) || value.is_kind_of(ruby.class_false_class()) {
|
|
287
288
|
Ok(dsl::lit(bool::try_convert(value)?).into())
|
|
288
289
|
} else if let Some(v) = Integer::from_value(value) {
|
|
289
290
|
match v.to_i64() {
|
|
@@ -302,7 +303,7 @@ pub fn lit(value: Value, allow_object: bool, is_scalar: bool) -> RbResult<RbExpr
|
|
|
302
303
|
} else if let Some(v) = Float::from_value(value) {
|
|
303
304
|
Ok(dsl::lit(v.to_f64()).into())
|
|
304
305
|
} else if let Some(v) = RString::from_value(value) {
|
|
305
|
-
if v.enc_get() ==
|
|
306
|
+
if v.enc_get() == ruby.utf8_encindex() {
|
|
306
307
|
Ok(dsl::lit(v.to_string()?).into())
|
|
307
308
|
} else {
|
|
308
309
|
Ok(dsl::lit(unsafe { v.as_slice() }).into())
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
use magnus::{IntoValue, Value};
|
|
1
|
+
use magnus::{IntoValue, Ruby, Value};
|
|
2
2
|
use polars_core;
|
|
3
3
|
use polars_core::POOL;
|
|
4
4
|
use polars_core::fmt::FloatFmt;
|
|
@@ -7,8 +7,8 @@ use polars_core::prelude::IDX_DTYPE;
|
|
|
7
7
|
use crate::conversion::Wrap;
|
|
8
8
|
use crate::{RbResult, RbValueError};
|
|
9
9
|
|
|
10
|
-
pub fn get_index_type() -> Value {
|
|
11
|
-
Wrap(IDX_DTYPE).
|
|
10
|
+
pub fn get_index_type(ruby: &Ruby) -> Value {
|
|
11
|
+
Wrap(IDX_DTYPE).into_value_with(ruby)
|
|
12
12
|
}
|
|
13
13
|
|
|
14
14
|
pub fn thread_pool_size() -> usize {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
use crate::RbResult;
|
|
2
|
-
use magnus::{
|
|
2
|
+
use magnus::{Ruby, Value};
|
|
3
3
|
|
|
4
4
|
pub fn enable_string_cache() {
|
|
5
5
|
// The string cache no longer exists.
|
|
@@ -18,7 +18,7 @@ pub fn using_string_cache() -> bool {
|
|
|
18
18
|
pub struct RbStringCacheHolder {}
|
|
19
19
|
|
|
20
20
|
impl RbStringCacheHolder {
|
|
21
|
-
pub fn hold() -> RbResult<Value> {
|
|
22
|
-
|
|
21
|
+
pub fn hold(ruby: &Ruby) -> RbResult<Value> {
|
|
22
|
+
ruby.yield_splat(ruby.ary_new())
|
|
23
23
|
}
|
|
24
24
|
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
use arrow::datatypes::ArrowDataType;
|
|
2
2
|
use arrow::ffi;
|
|
3
|
-
use magnus::{IntoValue, Value};
|
|
3
|
+
use magnus::{IntoValue, Ruby, Value};
|
|
4
4
|
use polars::datatypes::CompatLevel;
|
|
5
5
|
use polars::frame::DataFrame;
|
|
6
6
|
use polars::prelude::{ArrayRef, ArrowField, PlSmallStr, PolarsResult, SchemaExt};
|
|
@@ -20,11 +20,11 @@ impl RbArrowArrayStream {
|
|
|
20
20
|
}
|
|
21
21
|
}
|
|
22
22
|
|
|
23
|
-
pub(crate) fn dataframe_to_stream(df: &DataFrame) -> RbResult<Value> {
|
|
23
|
+
pub(crate) fn dataframe_to_stream(df: &DataFrame, ruby: &Ruby) -> RbResult<Value> {
|
|
24
24
|
let iter = Box::new(DataFrameStreamIterator::new(df));
|
|
25
25
|
let field = iter.field();
|
|
26
26
|
let stream = ffi::export_iterator(iter, field);
|
|
27
|
-
Ok(RbArrowArrayStream { stream }.
|
|
27
|
+
Ok(RbArrowArrayStream { stream }.into_value_with(ruby))
|
|
28
28
|
}
|
|
29
29
|
|
|
30
30
|
pub struct DataFrameStreamIterator {
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
use magnus::{IntoValue, Module,
|
|
1
|
+
use magnus::{IntoValue, Module, RClass, RModule, Ruby, Value, prelude::*};
|
|
2
2
|
|
|
3
3
|
use crate::RbResult;
|
|
4
4
|
|
|
@@ -44,9 +44,10 @@ impl<T: Element> RbArray1<T> {
|
|
|
44
44
|
where
|
|
45
45
|
I: IntoIterator<Item = T>,
|
|
46
46
|
{
|
|
47
|
-
|
|
47
|
+
let ruby = Ruby::get().unwrap();
|
|
48
|
+
ruby.class_object()
|
|
48
49
|
.const_get::<_, RModule>("Numo")?
|
|
49
50
|
.const_get::<_, RClass>(T::class_name())?
|
|
50
|
-
.funcall("cast", (
|
|
51
|
+
.funcall("cast", (ruby.ary_from_iter(values),))
|
|
51
52
|
}
|
|
52
53
|
}
|
data/ext/polars/src/io/mod.rs
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
use std::sync::Arc;
|
|
2
2
|
|
|
3
3
|
use magnus::{TryConvert, Value, value::ReprValue};
|
|
4
|
+
use polars::prelude::default_values::DefaultFieldValues;
|
|
4
5
|
use polars::prelude::deletion::DeletionFilesList;
|
|
5
6
|
use polars::prelude::{
|
|
6
7
|
CastColumnsPolicy, ColumnMapping, ExtraColumnsPolicy, MissingColumnsPolicy, PlSmallStr, Schema,
|
|
7
|
-
UnifiedScanArgs,
|
|
8
|
+
TableStatistics, UnifiedScanArgs,
|
|
8
9
|
};
|
|
9
10
|
use polars_io::{HiveOptions, RowIndex};
|
|
10
11
|
use polars_utils::IdxSize;
|
|
@@ -23,6 +24,12 @@ impl TryConvert for RbScanOptions {
|
|
|
23
24
|
}
|
|
24
25
|
}
|
|
25
26
|
|
|
27
|
+
impl TryConvert for Wrap<TableStatistics> {
|
|
28
|
+
fn try_convert(_ob: Value) -> RbResult<Self> {
|
|
29
|
+
todo!();
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
|
|
26
33
|
impl RbScanOptions {
|
|
27
34
|
pub fn extract_unified_scan_args(
|
|
28
35
|
&self,
|
|
@@ -37,6 +44,10 @@ impl RbScanOptions {
|
|
|
37
44
|
let include_file_paths: Option<Wrap<PlSmallStr>> =
|
|
38
45
|
self.0.funcall("include_file_paths", ())?;
|
|
39
46
|
let glob: bool = self.0.funcall("glob", ())?;
|
|
47
|
+
let hidden_file_prefix: Option<Vec<String>> = self.0.funcall("hidden_file_prefix", ())?;
|
|
48
|
+
let column_mapping: Option<Wrap<ColumnMapping>> = self.0.funcall("column_mapping", ())?;
|
|
49
|
+
let default_values: Option<Wrap<DefaultFieldValues>> =
|
|
50
|
+
self.0.funcall("default_values", ())?;
|
|
40
51
|
let hive_partitioning: Option<bool> = self.0.funcall("hive_partitioning", ())?;
|
|
41
52
|
let hive_schema: Option<Wrap<Schema>> = self.0.funcall("hive_schema", ())?;
|
|
42
53
|
let try_parse_hive_dates: bool = self.0.funcall("try_parse_hive_dates", ())?;
|
|
@@ -47,7 +58,9 @@ impl RbScanOptions {
|
|
|
47
58
|
let retries: usize = self.0.funcall("retries", ())?;
|
|
48
59
|
let deletion_files: Option<Wrap<DeletionFilesList>> =
|
|
49
60
|
self.0.funcall("deletion_files", ())?;
|
|
50
|
-
let
|
|
61
|
+
let table_statistics: Option<Wrap<TableStatistics>> =
|
|
62
|
+
self.0.funcall("table_statistics", ())?;
|
|
63
|
+
let row_count: Option<(u64, u64)> = self.0.funcall("row_count", ())?;
|
|
51
64
|
|
|
52
65
|
let cloud_options = storage_options;
|
|
53
66
|
|
|
@@ -86,7 +99,13 @@ impl RbScanOptions {
|
|
|
86
99
|
rechunk,
|
|
87
100
|
cache,
|
|
88
101
|
glob,
|
|
102
|
+
hidden_file_prefix: hidden_file_prefix
|
|
103
|
+
.map(|x| x.into_iter().map(|x| (*x).into()).collect()),
|
|
89
104
|
projection: None,
|
|
105
|
+
column_mapping: column_mapping.map(|x| x.0),
|
|
106
|
+
default_values: default_values
|
|
107
|
+
.map(|x| x.0)
|
|
108
|
+
.filter(|DefaultFieldValues::Iceberg(v)| !v.is_empty()),
|
|
90
109
|
row_index,
|
|
91
110
|
pre_slice: pre_slice.map(Slice::from),
|
|
92
111
|
cast_columns_policy: cast_options.0,
|
|
@@ -94,7 +113,8 @@ impl RbScanOptions {
|
|
|
94
113
|
extra_columns_policy: extra_columns.0,
|
|
95
114
|
include_file_paths: include_file_paths.map(|x| x.0),
|
|
96
115
|
deletion_files: DeletionFilesList::filter_empty(deletion_files.map(|x| x.0)),
|
|
97
|
-
|
|
116
|
+
table_statistics: table_statistics.map(|x| x.0),
|
|
117
|
+
row_count,
|
|
98
118
|
};
|
|
99
119
|
|
|
100
120
|
Ok(unified_scan_args)
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
use magnus::{IntoValue, RArray, RHash, TryConvert, Value, r_hash::ForEach, typed_data::Obj};
|
|
2
|
-
use polars::io::
|
|
1
|
+
use magnus::{IntoValue, RArray, RHash, Ruby, TryConvert, Value, r_hash::ForEach, typed_data::Obj};
|
|
2
|
+
use polars::io::RowIndex;
|
|
3
3
|
use polars::lazy::frame::LazyFrame;
|
|
4
4
|
use polars::prelude::*;
|
|
5
5
|
use polars_plan::dsl::ScanSources;
|
|
@@ -18,7 +18,10 @@ use crate::{RbDataFrame, RbExpr, RbLazyFrame, RbLazyGroupBy, RbPolarsErr, RbResu
|
|
|
18
18
|
fn rbobject_to_first_path_and_scan_sources(obj: Value) -> RbResult<(Option<PlPath>, ScanSources)> {
|
|
19
19
|
use crate::file::{RubyScanSourceInput, get_ruby_scan_source_input};
|
|
20
20
|
Ok(match get_ruby_scan_source_input(obj, false)? {
|
|
21
|
-
RubyScanSourceInput::Path(path) => (
|
|
21
|
+
RubyScanSourceInput::Path(path) => (
|
|
22
|
+
Some(path.clone()),
|
|
23
|
+
ScanSources::Paths(FromIterator::from_iter([path])),
|
|
24
|
+
),
|
|
22
25
|
RubyScanSourceInput::File(file) => (None, ScanSources::Files([file].into())),
|
|
23
26
|
RubyScanSourceInput::Buffer(buff) => (None, ScanSources::Buffers([buff].into())),
|
|
24
27
|
})
|
|
@@ -30,13 +33,12 @@ impl RbLazyFrame {
|
|
|
30
33
|
source: Option<Value>,
|
|
31
34
|
sources: Wrap<ScanSources>,
|
|
32
35
|
infer_schema_length: Option<usize>,
|
|
33
|
-
batch_size: Option<
|
|
36
|
+
batch_size: Option<NonZeroUsize>,
|
|
34
37
|
n_rows: Option<usize>,
|
|
35
38
|
low_memory: bool,
|
|
36
39
|
rechunk: bool,
|
|
37
40
|
row_index: Option<(String, IdxSize)>,
|
|
38
41
|
) -> RbResult<Self> {
|
|
39
|
-
let batch_size = batch_size.map(|v| v.0);
|
|
40
42
|
let row_index = row_index.map(|(name, offset)| RowIndex {
|
|
41
43
|
name: name.into(),
|
|
42
44
|
offset,
|
|
@@ -181,48 +183,28 @@ impl RbLazyFrame {
|
|
|
181
183
|
Ok(lf.into())
|
|
182
184
|
}
|
|
183
185
|
|
|
184
|
-
#[allow(clippy::too_many_arguments)]
|
|
185
186
|
pub fn new_from_ipc(
|
|
186
|
-
source: Option<Value>,
|
|
187
187
|
sources: Wrap<ScanSources>,
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
rechunk: bool,
|
|
191
|
-
row_index: Option<(String, IdxSize)>,
|
|
192
|
-
hive_partitioning: Option<bool>,
|
|
193
|
-
hive_schema: Option<Wrap<Schema>>,
|
|
194
|
-
try_parse_hive_dates: bool,
|
|
195
|
-
include_file_paths: Option<String>,
|
|
188
|
+
scan_options: RbScanOptions,
|
|
189
|
+
file_cache_ttl: Option<u64>,
|
|
196
190
|
) -> RbResult<Self> {
|
|
197
|
-
let
|
|
198
|
-
name: name.into(),
|
|
199
|
-
offset,
|
|
200
|
-
});
|
|
191
|
+
let options = IpcScanOptions;
|
|
201
192
|
|
|
202
|
-
let
|
|
203
|
-
|
|
204
|
-
hive_start_idx: 0,
|
|
205
|
-
schema: hive_schema.map(|x| Arc::new(x.0)),
|
|
206
|
-
try_parse_dates: try_parse_hive_dates,
|
|
207
|
-
};
|
|
193
|
+
let sources = sources.0;
|
|
194
|
+
let first_path = sources.first_path().map(|p| p.into_owned());
|
|
208
195
|
|
|
209
|
-
let
|
|
210
|
-
|
|
211
|
-
cache,
|
|
212
|
-
rechunk,
|
|
213
|
-
row_index,
|
|
214
|
-
cloud_options: None,
|
|
215
|
-
hive_options,
|
|
216
|
-
include_file_paths: include_file_paths.map(|x| x.into()),
|
|
217
|
-
};
|
|
196
|
+
let mut unified_scan_args =
|
|
197
|
+
scan_options.extract_unified_scan_args(first_path.as_ref().map(|p| p.as_ref()))?;
|
|
218
198
|
|
|
219
|
-
let
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
199
|
+
if let Some(file_cache_ttl) = file_cache_ttl {
|
|
200
|
+
unified_scan_args
|
|
201
|
+
.cloud_options
|
|
202
|
+
.get_or_insert_default()
|
|
203
|
+
.file_cache_ttl = file_cache_ttl;
|
|
204
|
+
}
|
|
224
205
|
|
|
225
|
-
let lf = LazyFrame::scan_ipc_sources(sources,
|
|
206
|
+
let lf = LazyFrame::scan_ipc_sources(sources, options, unified_scan_args)
|
|
207
|
+
.map_err(RbPolarsErr::from)?;
|
|
226
208
|
Ok(lf.into())
|
|
227
209
|
}
|
|
228
210
|
|
|
@@ -407,13 +389,13 @@ impl RbLazyFrame {
|
|
|
407
389
|
pub fn sink_ipc(
|
|
408
390
|
&self,
|
|
409
391
|
target: SinkTarget,
|
|
410
|
-
compression: Option<
|
|
392
|
+
compression: Wrap<Option<IpcCompression>>,
|
|
411
393
|
cloud_options: Option<Vec<(String, String)>>,
|
|
412
394
|
retries: usize,
|
|
413
395
|
sink_options: Wrap<SinkOptions>,
|
|
414
396
|
) -> RbResult<RbLazyFrame> {
|
|
415
397
|
let options = IpcWriterOptions {
|
|
416
|
-
compression: compression.
|
|
398
|
+
compression: compression.0,
|
|
417
399
|
..Default::default()
|
|
418
400
|
};
|
|
419
401
|
|
|
@@ -444,7 +426,7 @@ impl RbLazyFrame {
|
|
|
444
426
|
let separator = u8::try_convert(arguments[3])?;
|
|
445
427
|
let line_terminator = String::try_convert(arguments[4])?;
|
|
446
428
|
let quote_char = u8::try_convert(arguments[5])?;
|
|
447
|
-
let batch_size =
|
|
429
|
+
let batch_size = NonZeroUsize::try_convert(arguments[6])?;
|
|
448
430
|
let datetime_format = Option::<String>::try_convert(arguments[7])?;
|
|
449
431
|
let date_format = Option::<String>::try_convert(arguments[8])?;
|
|
450
432
|
let time_format = Option::<String>::try_convert(arguments[9])?;
|
|
@@ -477,7 +459,7 @@ impl RbLazyFrame {
|
|
|
477
459
|
let options = CsvWriterOptions {
|
|
478
460
|
include_bom,
|
|
479
461
|
include_header,
|
|
480
|
-
batch_size
|
|
462
|
+
batch_size,
|
|
481
463
|
serialize_options,
|
|
482
464
|
};
|
|
483
465
|
|
|
@@ -931,30 +913,33 @@ impl RbLazyFrame {
|
|
|
931
913
|
self.ldf.borrow().clone().into()
|
|
932
914
|
}
|
|
933
915
|
|
|
934
|
-
pub fn collect_schema(&
|
|
935
|
-
let schema =
|
|
916
|
+
pub fn collect_schema(ruby: &Ruby, rb_self: &Self) -> RbResult<RHash> {
|
|
917
|
+
let schema = rb_self
|
|
936
918
|
.ldf
|
|
937
919
|
.borrow_mut()
|
|
938
920
|
.collect_schema()
|
|
939
921
|
.map_err(RbPolarsErr::from)?;
|
|
940
922
|
|
|
941
|
-
let schema_dict =
|
|
923
|
+
let schema_dict = ruby.hash_new();
|
|
942
924
|
schema.iter_fields().for_each(|fld| {
|
|
943
925
|
schema_dict
|
|
944
926
|
.aset::<String, Value>(
|
|
945
927
|
fld.name().to_string(),
|
|
946
|
-
Wrap(fld.dtype().clone()).
|
|
928
|
+
Wrap(fld.dtype().clone()).into_value_with(ruby),
|
|
947
929
|
)
|
|
948
930
|
.unwrap();
|
|
949
931
|
});
|
|
950
932
|
Ok(schema_dict)
|
|
951
933
|
}
|
|
952
934
|
|
|
953
|
-
pub fn unnest(&self, columns: &RbSelector) -> Self {
|
|
935
|
+
pub fn unnest(&self, columns: &RbSelector, separator: Option<String>) -> Self {
|
|
954
936
|
self.ldf
|
|
955
937
|
.borrow()
|
|
956
938
|
.clone()
|
|
957
|
-
.unnest(
|
|
939
|
+
.unnest(
|
|
940
|
+
columns.inner.clone(),
|
|
941
|
+
separator.as_deref().map(PlSmallStr::from_str),
|
|
942
|
+
)
|
|
958
943
|
.into()
|
|
959
944
|
}
|
|
960
945
|
|
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
mod general;
|
|
2
|
+
mod optflags;
|
|
2
3
|
mod serde;
|
|
3
4
|
mod sink;
|
|
4
5
|
|
|
5
|
-
use polars::
|
|
6
|
+
use polars::prelude::{LazyFrame, OptFlags};
|
|
6
7
|
pub use sink::SinkTarget;
|
|
7
8
|
use std::cell::RefCell;
|
|
8
9
|
|
|
@@ -12,6 +13,12 @@ pub struct RbLazyFrame {
|
|
|
12
13
|
pub ldf: RefCell<LazyFrame>,
|
|
13
14
|
}
|
|
14
15
|
|
|
16
|
+
#[magnus::wrap(class = "Polars::RbOptFlags")]
|
|
17
|
+
#[derive(Clone)]
|
|
18
|
+
pub struct RbOptFlags {
|
|
19
|
+
pub inner: RefCell<OptFlags>,
|
|
20
|
+
}
|
|
21
|
+
|
|
15
22
|
impl From<LazyFrame> for RbLazyFrame {
|
|
16
23
|
fn from(ldf: LazyFrame) -> Self {
|
|
17
24
|
RbLazyFrame {
|
|
@@ -19,3 +26,11 @@ impl From<LazyFrame> for RbLazyFrame {
|
|
|
19
26
|
}
|
|
20
27
|
}
|
|
21
28
|
}
|
|
29
|
+
|
|
30
|
+
impl From<OptFlags> for RbOptFlags {
|
|
31
|
+
fn from(inner: OptFlags) -> Self {
|
|
32
|
+
RbOptFlags {
|
|
33
|
+
inner: RefCell::new(inner),
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
use polars::prelude::OptFlags;
|
|
2
|
+
|
|
3
|
+
use super::RbOptFlags;
|
|
4
|
+
|
|
5
|
+
macro_rules! flag_getter_setters {
|
|
6
|
+
($(($flag:ident, $getter:ident, $setter:ident, clear=$clear:literal))+) => {
|
|
7
|
+
impl RbOptFlags {
|
|
8
|
+
pub fn empty() -> Self {
|
|
9
|
+
Self {
|
|
10
|
+
inner: OptFlags::empty().into()
|
|
11
|
+
}
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
#[allow(clippy::should_implement_trait)]
|
|
15
|
+
pub fn default() -> Self {
|
|
16
|
+
Self { inner: OptFlags::default().into() }
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
pub fn no_optimizations(&self) {
|
|
20
|
+
$(if $clear {
|
|
21
|
+
self.inner.borrow_mut().remove(OptFlags::$flag);
|
|
22
|
+
})+
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
pub fn copy(&self) -> Self {
|
|
26
|
+
Self { inner: self.inner.clone() }
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
$(
|
|
30
|
+
pub fn $getter(&self) -> bool {
|
|
31
|
+
self.inner.borrow().contains(OptFlags::$flag)
|
|
32
|
+
}
|
|
33
|
+
pub fn $setter(&self, value: bool) {
|
|
34
|
+
self.inner.borrow_mut().set(OptFlags::$flag, value)
|
|
35
|
+
}
|
|
36
|
+
)+
|
|
37
|
+
}
|
|
38
|
+
};
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
flag_getter_setters! {
|
|
42
|
+
(TYPE_COERCION, get_type_coercion, set_type_coercion, clear=false)
|
|
43
|
+
(TYPE_CHECK, get_type_check, set_type_check, clear=false)
|
|
44
|
+
|
|
45
|
+
(PROJECTION_PUSHDOWN, get_projection_pushdown, set_projection_pushdown, clear=true)
|
|
46
|
+
(PREDICATE_PUSHDOWN, get_predicate_pushdown, set_predicate_pushdown, clear=true)
|
|
47
|
+
(CLUSTER_WITH_COLUMNS, get_cluster_with_columns, set_cluster_with_columns, clear=true)
|
|
48
|
+
(SIMPLIFY_EXPR, get_simplify_expression, set_simplify_expression, clear=true)
|
|
49
|
+
(SLICE_PUSHDOWN, get_slice_pushdown, set_slice_pushdown, clear=true)
|
|
50
|
+
(COMM_SUBPLAN_ELIM, get_comm_subplan_elim, set_comm_subplan_elim, clear=true)
|
|
51
|
+
(COMM_SUBEXPR_ELIM, get_comm_subexpr_elim, set_comm_subexpr_elim, clear=true)
|
|
52
|
+
(CHECK_ORDER_OBSERVE, get_check_order_observe, set_check_order_observe, clear=true)
|
|
53
|
+
(FAST_PROJECTION, get_fast_projection, set_fast_projection, clear=true)
|
|
54
|
+
|
|
55
|
+
(EAGER, get_eager, set_eager, clear=true)
|
|
56
|
+
(NEW_STREAMING, get_streaming, set_streaming, clear=true)
|
|
57
|
+
}
|
|
@@ -1,14 +1,38 @@
|
|
|
1
|
+
use std::io::Read;
|
|
2
|
+
#[cfg(feature = "serialize_binary")]
|
|
3
|
+
use std::io::{BufReader, BufWriter};
|
|
4
|
+
|
|
1
5
|
use magnus::Value;
|
|
2
6
|
use polars::lazy::frame::LazyFrame;
|
|
3
7
|
use polars::prelude::*;
|
|
4
|
-
use std::io::Read;
|
|
5
8
|
|
|
6
9
|
use crate::file::get_file_like;
|
|
10
|
+
#[cfg(feature = "serialize_binary")]
|
|
11
|
+
use crate::utils::to_rb_err;
|
|
7
12
|
use crate::{RbLazyFrame, RbResult, RbValueError};
|
|
8
13
|
|
|
9
14
|
impl RbLazyFrame {
|
|
10
|
-
|
|
11
|
-
pub fn
|
|
15
|
+
#[cfg(feature = "serialize_binary")]
|
|
16
|
+
pub fn serialize_binary(&self, rb_f: Value) -> RbResult<()> {
|
|
17
|
+
let file = get_file_like(rb_f, true)?;
|
|
18
|
+
let writer = BufWriter::new(file);
|
|
19
|
+
self.ldf
|
|
20
|
+
.borrow()
|
|
21
|
+
.logical_plan
|
|
22
|
+
.serialize_versioned(writer, Default::default())
|
|
23
|
+
.map_err(to_rb_err)
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
#[cfg(feature = "serialize_binary")]
|
|
27
|
+
pub fn deserialize_binary(rb_f: Value) -> RbResult<Self> {
|
|
28
|
+
let file = get_file_like(rb_f, false)?;
|
|
29
|
+
let reader = BufReader::new(file);
|
|
30
|
+
|
|
31
|
+
let lp: DslPlan = DslPlan::deserialize_versioned(reader).map_err(to_rb_err)?;
|
|
32
|
+
Ok(LazyFrame::from(lp).into())
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
pub fn deserialize_json(rb_f: Value) -> RbResult<Self> {
|
|
12
36
|
// it is faster to first read to memory and then parse: https://github.com/serde-rs/json/issues/160
|
|
13
37
|
// so don't bother with files.
|
|
14
38
|
let mut json = String::new();
|