polars-df 0.21.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -0
- data/Cargo.lock +55 -48
- data/Cargo.toml +3 -0
- data/README.md +12 -0
- data/ext/polars/Cargo.toml +22 -11
- data/ext/polars/src/batched_csv.rs +4 -4
- data/ext/polars/src/catalog/unity.rs +96 -94
- data/ext/polars/src/conversion/any_value.rs +26 -30
- data/ext/polars/src/conversion/chunked_array.rs +32 -28
- data/ext/polars/src/conversion/datetime.rs +11 -0
- data/ext/polars/src/conversion/mod.rs +307 -34
- data/ext/polars/src/dataframe/construction.rs +4 -3
- data/ext/polars/src/dataframe/export.rs +17 -15
- data/ext/polars/src/dataframe/general.rs +15 -12
- data/ext/polars/src/dataframe/io.rs +1 -2
- data/ext/polars/src/dataframe/mod.rs +25 -1
- data/ext/polars/src/dataframe/serde.rs +23 -8
- data/ext/polars/src/exceptions.rs +8 -4
- data/ext/polars/src/expr/array.rs +73 -4
- data/ext/polars/src/expr/binary.rs +26 -1
- data/ext/polars/src/expr/bitwise.rs +39 -0
- data/ext/polars/src/expr/categorical.rs +20 -0
- data/ext/polars/src/expr/datatype.rs +24 -1
- data/ext/polars/src/expr/datetime.rs +58 -14
- data/ext/polars/src/expr/general.rs +87 -15
- data/ext/polars/src/expr/list.rs +32 -24
- data/ext/polars/src/expr/meta.rs +15 -6
- data/ext/polars/src/expr/mod.rs +3 -0
- data/ext/polars/src/expr/name.rs +19 -14
- data/ext/polars/src/expr/rolling.rs +20 -0
- data/ext/polars/src/expr/serde.rs +28 -0
- data/ext/polars/src/expr/string.rs +64 -10
- data/ext/polars/src/expr/struct.rs +9 -1
- data/ext/polars/src/file.rs +15 -9
- data/ext/polars/src/functions/business.rs +0 -1
- data/ext/polars/src/functions/io.rs +25 -3
- data/ext/polars/src/functions/lazy.rs +11 -6
- data/ext/polars/src/functions/meta.rs +3 -3
- data/ext/polars/src/functions/string_cache.rs +3 -3
- data/ext/polars/src/interop/arrow/to_ruby.rs +3 -3
- data/ext/polars/src/interop/numo/numo_rs.rs +4 -3
- data/ext/polars/src/io/mod.rs +6 -0
- data/ext/polars/src/lazyframe/general.rs +59 -9
- data/ext/polars/src/lazyframe/mod.rs +16 -1
- data/ext/polars/src/lazyframe/optflags.rs +58 -0
- data/ext/polars/src/lazyframe/serde.rs +27 -3
- data/ext/polars/src/lib.rs +261 -19
- data/ext/polars/src/map/dataframe.rs +20 -17
- data/ext/polars/src/map/lazy.rs +6 -5
- data/ext/polars/src/map/series.rs +8 -7
- data/ext/polars/src/on_startup.rs +12 -5
- data/ext/polars/src/rb_modules.rs +2 -2
- data/ext/polars/src/series/aggregation.rs +85 -28
- data/ext/polars/src/series/construction.rs +1 -0
- data/ext/polars/src/series/export.rs +37 -33
- data/ext/polars/src/series/general.rs +120 -21
- data/ext/polars/src/series/mod.rs +29 -4
- data/lib/polars/array_expr.rb +382 -3
- data/lib/polars/array_name_space.rb +281 -0
- data/lib/polars/binary_expr.rb +67 -0
- data/lib/polars/binary_name_space.rb +43 -0
- data/lib/polars/cat_expr.rb +224 -0
- data/lib/polars/cat_name_space.rb +138 -0
- data/lib/polars/config.rb +2 -2
- data/lib/polars/convert.rb +6 -6
- data/lib/polars/data_frame.rb +794 -27
- data/lib/polars/data_type_expr.rb +52 -0
- data/lib/polars/data_types.rb +26 -5
- data/lib/polars/date_time_expr.rb +252 -1
- data/lib/polars/date_time_name_space.rb +299 -0
- data/lib/polars/expr.rb +1248 -206
- data/lib/polars/functions/business.rb +95 -0
- data/lib/polars/functions/datatype.rb +21 -0
- data/lib/polars/functions/lazy.rb +14 -1
- data/lib/polars/io/csv.rb +1 -1
- data/lib/polars/io/iceberg.rb +27 -0
- data/lib/polars/io/json.rb +4 -4
- data/lib/polars/io/ndjson.rb +4 -4
- data/lib/polars/io/parquet.rb +32 -7
- data/lib/polars/io/scan_options.rb +4 -1
- data/lib/polars/lazy_frame.rb +1028 -28
- data/lib/polars/list_expr.rb +217 -17
- data/lib/polars/list_name_space.rb +231 -22
- data/lib/polars/meta_expr.rb +89 -0
- data/lib/polars/name_expr.rb +36 -0
- data/lib/polars/query_opt_flags.rb +50 -0
- data/lib/polars/scan_cast_options.rb +20 -1
- data/lib/polars/schema.rb +79 -3
- data/lib/polars/selector.rb +72 -0
- data/lib/polars/selectors.rb +3 -3
- data/lib/polars/series.rb +1053 -54
- data/lib/polars/string_expr.rb +436 -32
- data/lib/polars/string_name_space.rb +736 -50
- data/lib/polars/struct_expr.rb +103 -0
- data/lib/polars/struct_name_space.rb +19 -1
- data/lib/polars/utils/serde.rb +17 -0
- data/lib/polars/utils/various.rb +22 -1
- data/lib/polars/utils.rb +5 -1
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +6 -0
- metadata +11 -1
@@ -1,16 +1,17 @@
|
|
1
1
|
pub(crate) mod any_value;
|
2
2
|
mod categorical;
|
3
3
|
mod chunked_array;
|
4
|
+
mod datetime;
|
4
5
|
|
6
|
+
use std::collections::BTreeMap;
|
5
7
|
use std::fmt::{Debug, Display, Formatter};
|
6
8
|
use std::fs::File;
|
7
9
|
use std::hash::{Hash, Hasher};
|
8
|
-
use std::num::NonZeroUsize;
|
9
10
|
|
10
11
|
pub use categorical::RbCategories;
|
11
12
|
use magnus::{
|
12
|
-
IntoValue, Module, RArray, RHash, Ruby, Symbol, TryConvert, Value,
|
13
|
-
|
13
|
+
IntoValue, Module, RArray, RHash, Ruby, Symbol, TryConvert, Value, prelude::*, r_hash::ForEach,
|
14
|
+
try_convert::TryConvertOwned, value::Opaque,
|
14
15
|
};
|
15
16
|
use polars::chunked_array::object::PolarsObjectSafe;
|
16
17
|
use polars::chunked_array::ops::{FillNullLimit, FillNullStrategy};
|
@@ -18,9 +19,13 @@ use polars::datatypes::AnyValue;
|
|
18
19
|
use polars::frame::row::Row;
|
19
20
|
use polars::io::avro::AvroCompression;
|
20
21
|
use polars::io::cloud::CloudOptions;
|
22
|
+
use polars::prelude::default_values::{
|
23
|
+
DefaultFieldValues, IcebergIdentityTransformedPartitionFields,
|
24
|
+
};
|
21
25
|
use polars::prelude::deletion::DeletionFilesList;
|
22
26
|
use polars::prelude::*;
|
23
27
|
use polars::series::ops::NullBehavior;
|
28
|
+
use polars_core::schema::iceberg::IcebergSchema;
|
24
29
|
use polars_core::utils::arrow::array::Array;
|
25
30
|
use polars_core::utils::materialize_dyn_int;
|
26
31
|
use polars_plan::dsl::ScanSources;
|
@@ -29,7 +34,8 @@ use polars_utils::total_ord::{TotalEq, TotalHash};
|
|
29
34
|
|
30
35
|
use crate::file::{RubyScanSourceInput, get_ruby_scan_source_input};
|
31
36
|
use crate::object::OBJECT_NAME;
|
32
|
-
use crate::rb_modules::
|
37
|
+
use crate::rb_modules::pl_series;
|
38
|
+
use crate::utils::to_rb_err;
|
33
39
|
use crate::{RbDataFrame, RbLazyFrame, RbPolarsErr, RbResult, RbSeries, RbTypeError, RbValueError};
|
34
40
|
|
35
41
|
pub(crate) fn slice_extract_wrapped<T>(slice: &[Wrap<T>]) -> &[T] {
|
@@ -84,7 +90,7 @@ pub(crate) fn get_series(obj: Value) -> RbResult<Series> {
|
|
84
90
|
}
|
85
91
|
|
86
92
|
pub(crate) fn to_series(s: RbSeries) -> Value {
|
87
|
-
let series =
|
93
|
+
let series = pl_series();
|
88
94
|
series
|
89
95
|
.funcall::<_, _, Value>("_from_rbseries", (s,))
|
90
96
|
.unwrap()
|
@@ -119,16 +125,16 @@ impl TryConvert for Wrap<NullValues> {
|
|
119
125
|
}
|
120
126
|
}
|
121
127
|
|
122
|
-
fn struct_dict<'a>(vals: impl Iterator<Item = AnyValue<'a>>, flds: &[Field]) -> Value {
|
123
|
-
let dict =
|
128
|
+
fn struct_dict<'a>(ruby: &Ruby, vals: impl Iterator<Item = AnyValue<'a>>, flds: &[Field]) -> Value {
|
129
|
+
let dict = ruby.hash_new();
|
124
130
|
for (fld, val) in flds.iter().zip(vals) {
|
125
131
|
dict.aset(fld.name().as_str(), Wrap(val)).unwrap()
|
126
132
|
}
|
127
|
-
dict.
|
133
|
+
dict.as_value()
|
128
134
|
}
|
129
135
|
|
130
136
|
impl IntoValue for Wrap<DataType> {
|
131
|
-
fn into_value_with(self,
|
137
|
+
fn into_value_with(self, ruby: &Ruby) -> Value {
|
132
138
|
let pl = crate::rb_modules::polars();
|
133
139
|
|
134
140
|
match self.0 {
|
@@ -234,8 +240,10 @@ impl IntoValue for Wrap<DataType> {
|
|
234
240
|
let categories: Value = categories_class
|
235
241
|
.funcall("_from_rb_categories", (RbCategories::from(cats.clone()),))
|
236
242
|
.unwrap();
|
237
|
-
let kwargs =
|
238
|
-
kwargs
|
243
|
+
let kwargs = ruby.hash_new();
|
244
|
+
kwargs
|
245
|
+
.aset(ruby.to_symbol("categories"), categories)
|
246
|
+
.unwrap();
|
239
247
|
categorical_class.funcall("new", (kwargs,)).unwrap()
|
240
248
|
}
|
241
249
|
DataType::Enum(_, mapping) => {
|
@@ -262,7 +270,7 @@ impl IntoValue for Wrap<DataType> {
|
|
262
270
|
.funcall::<_, _, Value>("new", (name, dtype))
|
263
271
|
.unwrap()
|
264
272
|
});
|
265
|
-
let fields =
|
273
|
+
let fields = ruby.ary_from_iter(iter);
|
266
274
|
let struct_class = pl.const_get::<_, Value>("Struct").unwrap();
|
267
275
|
struct_class
|
268
276
|
.funcall::<_, _, Value>("new", (fields,))
|
@@ -273,7 +281,7 @@ impl IntoValue for Wrap<DataType> {
|
|
273
281
|
class.funcall("new", ()).unwrap()
|
274
282
|
}
|
275
283
|
DataType::Unknown(UnknownKind::Int(v)) => {
|
276
|
-
Wrap(materialize_dyn_int(v).dtype()).
|
284
|
+
Wrap(materialize_dyn_int(v).dtype()).into_value_with(ruby)
|
277
285
|
}
|
278
286
|
DataType::Unknown(_) => {
|
279
287
|
let class = pl.const_get::<_, Value>("Unknown").unwrap();
|
@@ -291,19 +299,19 @@ enum CategoricalOrdering {
|
|
291
299
|
}
|
292
300
|
|
293
301
|
impl IntoValue for Wrap<CategoricalOrdering> {
|
294
|
-
fn into_value_with(self,
|
295
|
-
"lexical".
|
302
|
+
fn into_value_with(self, ruby: &Ruby) -> Value {
|
303
|
+
"lexical".into_value_with(ruby)
|
296
304
|
}
|
297
305
|
}
|
298
306
|
|
299
307
|
impl IntoValue for Wrap<TimeUnit> {
|
300
|
-
fn into_value_with(self,
|
308
|
+
fn into_value_with(self, ruby: &Ruby) -> Value {
|
301
309
|
let tu = match self.0 {
|
302
310
|
TimeUnit::Nanoseconds => "ns",
|
303
311
|
TimeUnit::Microseconds => "us",
|
304
312
|
TimeUnit::Milliseconds => "ms",
|
305
313
|
};
|
306
|
-
tu.
|
314
|
+
tu.into_value_with(ruby)
|
307
315
|
}
|
308
316
|
}
|
309
317
|
|
@@ -317,7 +325,8 @@ impl TryConvert for Wrap<Field> {
|
|
317
325
|
|
318
326
|
impl TryConvert for Wrap<DataType> {
|
319
327
|
fn try_convert(ob: Value) -> RbResult<Self> {
|
320
|
-
let
|
328
|
+
let ruby = Ruby::get_with(ob);
|
329
|
+
let dtype = if ob.is_kind_of(ruby.class_class()) {
|
321
330
|
let name = ob.funcall::<_, _, String>("name", ())?;
|
322
331
|
match name.as_str() {
|
323
332
|
"Polars::Int8" => DataType::Int8,
|
@@ -524,6 +533,60 @@ impl TryConvert for Wrap<Schema> {
|
|
524
533
|
}
|
525
534
|
}
|
526
535
|
|
536
|
+
impl TryConvert for Wrap<ArrowSchema> {
|
537
|
+
fn try_convert(ob: Value) -> RbResult<Self> {
|
538
|
+
let ruby = Ruby::get_with(ob);
|
539
|
+
// TODO improve
|
540
|
+
let ob = RHash::try_convert(ob)?;
|
541
|
+
let fields: RArray = ob.aref(ruby.to_symbol("fields"))?;
|
542
|
+
let mut arrow_schema = ArrowSchema::with_capacity(fields.len());
|
543
|
+
for f in fields {
|
544
|
+
let f = RHash::try_convert(f)?;
|
545
|
+
let name: String = f.aref(ruby.to_symbol("name"))?;
|
546
|
+
let rb_dtype: String = f.aref(ruby.to_symbol("type"))?;
|
547
|
+
let dtype = match rb_dtype.as_str() {
|
548
|
+
"null" => ArrowDataType::Null,
|
549
|
+
"boolean" => ArrowDataType::Boolean,
|
550
|
+
"int8" => ArrowDataType::Int8,
|
551
|
+
"int16" => ArrowDataType::Int16,
|
552
|
+
"int32" => ArrowDataType::Int32,
|
553
|
+
"int64" => ArrowDataType::Int64,
|
554
|
+
"uint8" => ArrowDataType::UInt8,
|
555
|
+
"uint16" => ArrowDataType::UInt16,
|
556
|
+
"uint32" => ArrowDataType::UInt32,
|
557
|
+
"uint64" => ArrowDataType::UInt64,
|
558
|
+
"float16" => ArrowDataType::Float16,
|
559
|
+
"float32" => ArrowDataType::Float32,
|
560
|
+
"float64" => ArrowDataType::Float64,
|
561
|
+
"date32" => ArrowDataType::Date32,
|
562
|
+
"date64" => ArrowDataType::Date64,
|
563
|
+
"binary" => ArrowDataType::Binary,
|
564
|
+
"large_binary" => ArrowDataType::LargeBinary,
|
565
|
+
"string" => ArrowDataType::Utf8,
|
566
|
+
"large_string" => ArrowDataType::LargeUtf8,
|
567
|
+
"binary_view" => ArrowDataType::BinaryView,
|
568
|
+
"string_view" => ArrowDataType::Utf8View,
|
569
|
+
"unknown" => ArrowDataType::Unknown,
|
570
|
+
_ => todo!(),
|
571
|
+
};
|
572
|
+
let is_nullable = f.aref(ruby.to_symbol("nullable"))?;
|
573
|
+
let rb_metadata: RHash = f.aref(ruby.to_symbol("metadata"))?;
|
574
|
+
let mut metadata = BTreeMap::new();
|
575
|
+
rb_metadata.foreach(|k: String, v: String| {
|
576
|
+
metadata.insert(k.into(), v.into());
|
577
|
+
Ok(ForEach::Continue)
|
578
|
+
})?;
|
579
|
+
arrow_schema
|
580
|
+
.try_insert(
|
581
|
+
name.clone().into(),
|
582
|
+
ArrowField::new(name.into(), dtype, is_nullable).with_metadata(metadata),
|
583
|
+
)
|
584
|
+
.map_err(to_rb_err)?;
|
585
|
+
}
|
586
|
+
Ok(Wrap(arrow_schema))
|
587
|
+
}
|
588
|
+
}
|
589
|
+
|
527
590
|
impl TryConvert for Wrap<ScanSources> {
|
528
591
|
fn try_convert(ob: Value) -> RbResult<Self> {
|
529
592
|
let list = RArray::try_convert(ob)?;
|
@@ -665,7 +728,7 @@ impl From<&dyn PolarsObjectSafe> for &ObjectValue {
|
|
665
728
|
|
666
729
|
impl ObjectValue {
|
667
730
|
pub fn to_value(&self) -> Value {
|
668
|
-
self.clone().
|
731
|
+
self.clone().into_value_with(&Ruby::get().unwrap())
|
669
732
|
}
|
670
733
|
}
|
671
734
|
|
@@ -979,6 +1042,22 @@ impl TryConvert for Wrap<RankMethod> {
|
|
979
1042
|
}
|
980
1043
|
}
|
981
1044
|
|
1045
|
+
impl TryConvert for Wrap<Roll> {
|
1046
|
+
fn try_convert(ob: Value) -> RbResult<Self> {
|
1047
|
+
let parsed = match String::try_convert(ob)?.as_str() {
|
1048
|
+
"raise" => Roll::Raise,
|
1049
|
+
"forward" => Roll::Forward,
|
1050
|
+
"backward" => Roll::Backward,
|
1051
|
+
v => {
|
1052
|
+
return Err(RbValueError::new_err(format!(
|
1053
|
+
"`roll` must be one of {{'raise', 'forward', 'backward'}}, got {v}",
|
1054
|
+
)));
|
1055
|
+
}
|
1056
|
+
};
|
1057
|
+
Ok(Wrap(parsed))
|
1058
|
+
}
|
1059
|
+
}
|
1060
|
+
|
982
1061
|
impl TryConvert for Wrap<TimeUnit> {
|
983
1062
|
fn try_convert(ob: Value) -> RbResult<Self> {
|
984
1063
|
let parsed = match String::try_convert(ob)?.as_str() {
|
@@ -1156,7 +1235,109 @@ impl TryConvert for Wrap<CastColumnsPolicy> {
|
|
1156
1235
|
let out = Wrap(CastColumnsPolicy::ERROR_ON_MISMATCH);
|
1157
1236
|
return Ok(out);
|
1158
1237
|
}
|
1159
|
-
|
1238
|
+
|
1239
|
+
let integer_upcast = match &*ob.funcall::<_, _, String>("integer_cast", ())? {
|
1240
|
+
"upcast" => true,
|
1241
|
+
"forbid" => false,
|
1242
|
+
v => {
|
1243
|
+
return Err(RbValueError::new_err(format!(
|
1244
|
+
"unknown option for integer_cast: {v}"
|
1245
|
+
)));
|
1246
|
+
}
|
1247
|
+
};
|
1248
|
+
|
1249
|
+
let mut float_upcast = false;
|
1250
|
+
let mut float_downcast = false;
|
1251
|
+
|
1252
|
+
let float_cast_object: Value = ob.funcall("float_cast", ())?;
|
1253
|
+
|
1254
|
+
parse_multiple_options("float_cast", float_cast_object, |v| {
|
1255
|
+
match v {
|
1256
|
+
"forbid" => {}
|
1257
|
+
"upcast" => float_upcast = true,
|
1258
|
+
"downcast" => float_downcast = true,
|
1259
|
+
v => {
|
1260
|
+
return Err(RbValueError::new_err(format!(
|
1261
|
+
"unknown option for float_cast: {v}"
|
1262
|
+
)));
|
1263
|
+
}
|
1264
|
+
}
|
1265
|
+
|
1266
|
+
Ok(())
|
1267
|
+
})?;
|
1268
|
+
|
1269
|
+
let mut datetime_nanoseconds_downcast = false;
|
1270
|
+
let mut datetime_convert_timezone = false;
|
1271
|
+
|
1272
|
+
let datetime_cast_object: Value = ob.funcall("datetime_cast", ())?;
|
1273
|
+
|
1274
|
+
parse_multiple_options("datetime_cast", datetime_cast_object, |v| {
|
1275
|
+
match v {
|
1276
|
+
"forbid" => {}
|
1277
|
+
"nanosecond-downcast" => datetime_nanoseconds_downcast = true,
|
1278
|
+
"convert-timezone" => datetime_convert_timezone = true,
|
1279
|
+
v => {
|
1280
|
+
return Err(RbValueError::new_err(format!(
|
1281
|
+
"unknown option for datetime_cast: {v}"
|
1282
|
+
)));
|
1283
|
+
}
|
1284
|
+
};
|
1285
|
+
|
1286
|
+
Ok(())
|
1287
|
+
})?;
|
1288
|
+
|
1289
|
+
let missing_struct_fields =
|
1290
|
+
match &*ob.funcall::<_, _, String>("missing_struct_fields", ())? {
|
1291
|
+
"insert" => MissingColumnsPolicy::Insert,
|
1292
|
+
"raise" => MissingColumnsPolicy::Raise,
|
1293
|
+
v => {
|
1294
|
+
return Err(RbValueError::new_err(format!(
|
1295
|
+
"unknown option for missing_struct_fields: {v}"
|
1296
|
+
)));
|
1297
|
+
}
|
1298
|
+
};
|
1299
|
+
|
1300
|
+
let extra_struct_fields = match &*ob.funcall::<_, _, String>("extra_struct_fields", ())? {
|
1301
|
+
"ignore" => ExtraColumnsPolicy::Ignore,
|
1302
|
+
"raise" => ExtraColumnsPolicy::Raise,
|
1303
|
+
v => {
|
1304
|
+
return Err(RbValueError::new_err(format!(
|
1305
|
+
"unknown option for extra_struct_fields: {v}"
|
1306
|
+
)));
|
1307
|
+
}
|
1308
|
+
};
|
1309
|
+
|
1310
|
+
return Ok(Wrap(CastColumnsPolicy {
|
1311
|
+
integer_upcast,
|
1312
|
+
float_upcast,
|
1313
|
+
float_downcast,
|
1314
|
+
datetime_nanoseconds_downcast,
|
1315
|
+
datetime_microseconds_downcast: false,
|
1316
|
+
datetime_convert_timezone,
|
1317
|
+
null_upcast: true,
|
1318
|
+
missing_struct_fields,
|
1319
|
+
extra_struct_fields,
|
1320
|
+
}));
|
1321
|
+
|
1322
|
+
fn parse_multiple_options(
|
1323
|
+
parameter_name: &'static str,
|
1324
|
+
rb_object: Value,
|
1325
|
+
mut parser_func: impl FnMut(&str) -> RbResult<()>,
|
1326
|
+
) -> RbResult<()> {
|
1327
|
+
if let Ok(v) = String::try_convert(rb_object) {
|
1328
|
+
parser_func(&v)?;
|
1329
|
+
} else if let Ok(v) = RArray::try_convert(rb_object) {
|
1330
|
+
for v in v {
|
1331
|
+
parser_func(&String::try_convert(v)?)?;
|
1332
|
+
}
|
1333
|
+
} else {
|
1334
|
+
return Err(RbValueError::new_err(format!(
|
1335
|
+
"unknown type for {parameter_name}: {rb_object}"
|
1336
|
+
)));
|
1337
|
+
}
|
1338
|
+
|
1339
|
+
Ok(())
|
1340
|
+
}
|
1160
1341
|
}
|
1161
1342
|
}
|
1162
1343
|
|
@@ -1174,7 +1355,7 @@ pub fn parse_fill_null_strategy(
|
|
1174
1355
|
"one" => FillNullStrategy::One,
|
1175
1356
|
e => {
|
1176
1357
|
return Err(magnus::Error::new(
|
1177
|
-
|
1358
|
+
Ruby::get().unwrap().exception_runtime_error(),
|
1178
1359
|
format!(
|
1179
1360
|
"strategy must be one of {{'forward', 'backward', 'min', 'max', 'mean', 'zero', 'one'}}, got {e}",
|
1180
1361
|
),
|
@@ -1225,15 +1406,6 @@ pub fn parse_parquet_compression(
|
|
1225
1406
|
Ok(parsed)
|
1226
1407
|
}
|
1227
1408
|
|
1228
|
-
impl TryConvert for Wrap<NonZeroUsize> {
|
1229
|
-
fn try_convert(ob: Value) -> RbResult<Self> {
|
1230
|
-
let v = usize::try_convert(ob)?;
|
1231
|
-
NonZeroUsize::new(v)
|
1232
|
-
.map(Wrap)
|
1233
|
-
.ok_or(RbValueError::new_err("must be non-zero"))
|
1234
|
-
}
|
1235
|
-
}
|
1236
|
-
|
1237
1409
|
pub(crate) fn strings_to_pl_smallstr<I, S>(container: I) -> Vec<PlSmallStr>
|
1238
1410
|
where
|
1239
1411
|
I: IntoIterator<Item = S>,
|
@@ -1270,6 +1442,23 @@ impl TryConvert for RbCompatLevel {
|
|
1270
1442
|
}
|
1271
1443
|
}
|
1272
1444
|
|
1445
|
+
impl TryConvert for Wrap<UnicodeForm> {
|
1446
|
+
fn try_convert(ob: Value) -> RbResult<Self> {
|
1447
|
+
let parsed = match String::try_convert(ob)?.as_str() {
|
1448
|
+
"NFC" => UnicodeForm::NFC,
|
1449
|
+
"NFKC" => UnicodeForm::NFKC,
|
1450
|
+
"NFD" => UnicodeForm::NFD,
|
1451
|
+
"NFKD" => UnicodeForm::NFKD,
|
1452
|
+
v => {
|
1453
|
+
return Err(RbValueError::new_err(format!(
|
1454
|
+
"`form` must be one of {{'NFC', 'NFKC', 'NFD', 'NFKD'}}, got {v}",
|
1455
|
+
)));
|
1456
|
+
}
|
1457
|
+
};
|
1458
|
+
Ok(Wrap(parsed))
|
1459
|
+
}
|
1460
|
+
}
|
1461
|
+
|
1273
1462
|
impl TryConvert for Wrap<Option<TimeZone>> {
|
1274
1463
|
fn try_convert(ob: Value) -> RbResult<Self> {
|
1275
1464
|
let tz = Option::<Wrap<PlSmallStr>>::try_convert(ob)?;
|
@@ -1313,13 +1502,97 @@ impl TryConvert for Wrap<MissingColumnsPolicy> {
|
|
1313
1502
|
}
|
1314
1503
|
|
1315
1504
|
impl TryConvert for Wrap<ColumnMapping> {
|
1316
|
-
fn try_convert(
|
1317
|
-
|
1505
|
+
fn try_convert(ob: Value) -> RbResult<Self> {
|
1506
|
+
let (column_mapping_type, ob) = <(String, Value)>::try_convert(ob)?;
|
1507
|
+
|
1508
|
+
Ok(Wrap(match column_mapping_type.as_str() {
|
1509
|
+
"iceberg-column-mapping" => {
|
1510
|
+
let arrow_schema = Wrap::<ArrowSchema>::try_convert(ob)?;
|
1511
|
+
ColumnMapping::Iceberg(Arc::new(
|
1512
|
+
IcebergSchema::from_arrow_schema(&arrow_schema.0).map_err(to_rb_err)?,
|
1513
|
+
))
|
1514
|
+
}
|
1515
|
+
|
1516
|
+
v => {
|
1517
|
+
return Err(RbValueError::new_err(format!(
|
1518
|
+
"unknown column mapping type: {v}"
|
1519
|
+
)));
|
1520
|
+
}
|
1521
|
+
}))
|
1318
1522
|
}
|
1319
1523
|
}
|
1320
1524
|
|
1321
1525
|
impl TryConvert for Wrap<DeletionFilesList> {
|
1322
|
-
fn try_convert(
|
1323
|
-
|
1526
|
+
fn try_convert(ob: Value) -> RbResult<Self> {
|
1527
|
+
let (deletion_file_type, ob) = <(String, Value)>::try_convert(ob)?;
|
1528
|
+
|
1529
|
+
Ok(Wrap(match deletion_file_type.as_str() {
|
1530
|
+
"iceberg-position-delete" => {
|
1531
|
+
let dict = RHash::try_convert(ob)?;
|
1532
|
+
|
1533
|
+
let mut out = PlIndexMap::new();
|
1534
|
+
|
1535
|
+
dict.foreach(|k: usize, v: RArray| {
|
1536
|
+
let files = v
|
1537
|
+
.into_iter()
|
1538
|
+
.map(|x| {
|
1539
|
+
let x = String::try_convert(x)?;
|
1540
|
+
Ok(x)
|
1541
|
+
})
|
1542
|
+
.collect::<RbResult<Arc<[String]>>>()?;
|
1543
|
+
|
1544
|
+
if !files.is_empty() {
|
1545
|
+
out.insert(k, files);
|
1546
|
+
}
|
1547
|
+
|
1548
|
+
Ok(ForEach::Continue)
|
1549
|
+
})?;
|
1550
|
+
|
1551
|
+
DeletionFilesList::IcebergPositionDelete(Arc::new(out))
|
1552
|
+
}
|
1553
|
+
|
1554
|
+
v => {
|
1555
|
+
return Err(RbValueError::new_err(format!(
|
1556
|
+
"unknown deletion file type: {v}"
|
1557
|
+
)));
|
1558
|
+
}
|
1559
|
+
}))
|
1560
|
+
}
|
1561
|
+
}
|
1562
|
+
|
1563
|
+
impl TryConvert for Wrap<DefaultFieldValues> {
|
1564
|
+
fn try_convert(ob: Value) -> RbResult<Self> {
|
1565
|
+
let (default_values_type, ob) = <(String, Value)>::try_convert(ob)?;
|
1566
|
+
|
1567
|
+
Ok(Wrap(match &*default_values_type {
|
1568
|
+
"iceberg" => {
|
1569
|
+
let dict = RHash::try_convert(ob)?;
|
1570
|
+
|
1571
|
+
let mut out = PlIndexMap::new();
|
1572
|
+
|
1573
|
+
dict.foreach(|k: u32, v: Value| {
|
1574
|
+
let v: Result<Column, String> = if let Ok(s) = get_series(v) {
|
1575
|
+
Ok(s.into_column())
|
1576
|
+
} else {
|
1577
|
+
let err_msg = String::try_convert(v)?;
|
1578
|
+
Err(err_msg)
|
1579
|
+
};
|
1580
|
+
|
1581
|
+
out.insert(k, v);
|
1582
|
+
|
1583
|
+
Ok(ForEach::Continue)
|
1584
|
+
})?;
|
1585
|
+
|
1586
|
+
DefaultFieldValues::Iceberg(Arc::new(IcebergIdentityTransformedPartitionFields(
|
1587
|
+
out,
|
1588
|
+
)))
|
1589
|
+
}
|
1590
|
+
|
1591
|
+
v => {
|
1592
|
+
return Err(RbValueError::new_err(format!(
|
1593
|
+
"unknown deletion file type: {v}"
|
1594
|
+
)));
|
1595
|
+
}
|
1596
|
+
}))
|
1324
1597
|
}
|
1325
1598
|
}
|
@@ -1,4 +1,4 @@
|
|
1
|
-
use magnus::{RArray, RHash, Symbol, Value, prelude::*, r_hash::ForEach};
|
1
|
+
use magnus::{RArray, RHash, Ruby, Symbol, Value, prelude::*, r_hash::ForEach};
|
2
2
|
use polars::frame::row::{Row, rows_to_schema_supertypes, rows_to_supertypes};
|
3
3
|
use polars::prelude::*;
|
4
4
|
|
@@ -125,6 +125,7 @@ where
|
|
125
125
|
}
|
126
126
|
|
127
127
|
fn dicts_to_rows<'a>(data: &Value, names: &'a [String], _strict: bool) -> RbResult<Vec<Row<'a>>> {
|
128
|
+
let ruby = Ruby::get_with(*data);
|
128
129
|
let (data, len) = get_rbseq(*data)?;
|
129
130
|
let mut rows = Vec::with_capacity(len);
|
130
131
|
for d in data.into_iter() {
|
@@ -132,8 +133,8 @@ fn dicts_to_rows<'a>(data: &Value, names: &'a [String], _strict: bool) -> RbResu
|
|
132
133
|
|
133
134
|
let mut row = Vec::with_capacity(names.len());
|
134
135
|
for k in names.iter() {
|
135
|
-
// TODO improve performance
|
136
|
-
let val = match d.get(k.clone()).or_else(|| d.get(
|
136
|
+
// TODO improve performance (must work with GC)
|
137
|
+
let val = match d.get(k.clone()).or_else(|| d.get(ruby.to_symbol(k))) {
|
137
138
|
None => AnyValue::Null,
|
138
139
|
Some(val) => Wrap::<AnyValue>::try_convert(val)?.0,
|
139
140
|
};
|
@@ -1,4 +1,4 @@
|
|
1
|
-
use magnus::{IntoValue,
|
1
|
+
use magnus::{IntoValue, Ruby, Value, prelude::*};
|
2
2
|
|
3
3
|
use super::*;
|
4
4
|
use crate::RbResult;
|
@@ -6,14 +6,15 @@ use crate::conversion::{ObjectValue, Wrap};
|
|
6
6
|
use crate::interop::arrow::to_ruby::dataframe_to_stream;
|
7
7
|
|
8
8
|
impl RbDataFrame {
|
9
|
-
pub fn row_tuple(&
|
9
|
+
pub fn row_tuple(ruby: &Ruby, rb_self: &Self, idx: i64) -> Value {
|
10
10
|
let idx = if idx < 0 {
|
11
|
-
(
|
11
|
+
(rb_self.df.borrow().height() as i64 + idx) as usize
|
12
12
|
} else {
|
13
13
|
idx as usize
|
14
14
|
};
|
15
|
-
|
16
|
-
|
15
|
+
ruby.ary_from_iter(
|
16
|
+
rb_self
|
17
|
+
.df
|
17
18
|
.borrow()
|
18
19
|
.get_columns()
|
19
20
|
.iter()
|
@@ -22,17 +23,18 @@ impl RbDataFrame {
|
|
22
23
|
let obj: Option<&ObjectValue> = s.get_object(idx).map(|any| any.into());
|
23
24
|
obj.unwrap().to_value()
|
24
25
|
}
|
25
|
-
_ => Wrap(s.get(idx).unwrap()).
|
26
|
+
_ => Wrap(s.get(idx).unwrap()).into_value_with(ruby),
|
26
27
|
}),
|
27
28
|
)
|
28
29
|
.as_value()
|
29
30
|
}
|
30
31
|
|
31
|
-
pub fn row_tuples(&
|
32
|
-
let df = &
|
33
|
-
|
34
|
-
|
35
|
-
|
32
|
+
pub fn row_tuples(ruby: &Ruby, rb_self: &Self) -> Value {
|
33
|
+
let df = &rb_self.df;
|
34
|
+
ruby.ary_from_iter((0..df.borrow().height()).map(|idx| {
|
35
|
+
ruby.ary_from_iter(
|
36
|
+
rb_self
|
37
|
+
.df
|
36
38
|
.borrow()
|
37
39
|
.get_columns()
|
38
40
|
.iter()
|
@@ -41,15 +43,15 @@ impl RbDataFrame {
|
|
41
43
|
let obj: Option<&ObjectValue> = s.get_object(idx).map(|any| any.into());
|
42
44
|
obj.unwrap().to_value()
|
43
45
|
}
|
44
|
-
_ => Wrap(s.get(idx).unwrap()).
|
46
|
+
_ => Wrap(s.get(idx).unwrap()).into_value_with(ruby),
|
45
47
|
}),
|
46
48
|
)
|
47
49
|
}))
|
48
50
|
.as_value()
|
49
51
|
}
|
50
52
|
|
51
|
-
pub fn __arrow_c_stream__(&
|
52
|
-
|
53
|
-
dataframe_to_stream(&
|
53
|
+
pub fn __arrow_c_stream__(ruby: &Ruby, rb_self: &Self) -> RbResult<Value> {
|
54
|
+
rb_self.df.borrow_mut().align_chunks();
|
55
|
+
dataframe_to_stream(&rb_self.df.borrow(), ruby)
|
54
56
|
}
|
55
57
|
}
|
@@ -1,7 +1,7 @@
|
|
1
1
|
use std::hash::BuildHasher;
|
2
2
|
|
3
3
|
use either::Either;
|
4
|
-
use magnus::{IntoValue, RArray, Value, prelude
|
4
|
+
use magnus::{IntoValue, RArray, Ruby, Value, prelude::*};
|
5
5
|
use polars::prelude::pivot::{pivot, pivot_stable};
|
6
6
|
use polars::prelude::*;
|
7
7
|
|
@@ -149,12 +149,13 @@ impl RbDataFrame {
|
|
149
149
|
Ok(())
|
150
150
|
}
|
151
151
|
|
152
|
-
pub fn dtypes(&
|
153
|
-
|
154
|
-
|
152
|
+
pub fn dtypes(ruby: &Ruby, rb_self: &Self) -> RArray {
|
153
|
+
ruby.ary_from_iter(
|
154
|
+
rb_self
|
155
|
+
.df
|
155
156
|
.borrow()
|
156
157
|
.iter()
|
157
|
-
.map(|s| Wrap(s.dtype().clone()).
|
158
|
+
.map(|s| Wrap(s.dtype().clone()).into_value_with(ruby)),
|
158
159
|
)
|
159
160
|
}
|
160
161
|
|
@@ -393,18 +394,19 @@ impl RbDataFrame {
|
|
393
394
|
}
|
394
395
|
|
395
396
|
pub fn partition_by(
|
396
|
-
&
|
397
|
+
ruby: &Ruby,
|
398
|
+
rb_self: &Self,
|
397
399
|
by: Vec<String>,
|
398
400
|
maintain_order: bool,
|
399
401
|
include_key: bool,
|
400
402
|
) -> RbResult<RArray> {
|
401
403
|
let out = if maintain_order {
|
402
|
-
|
404
|
+
rb_self.df.borrow().partition_by_stable(by, include_key)
|
403
405
|
} else {
|
404
|
-
|
406
|
+
rb_self.df.borrow().partition_by(by, include_key)
|
405
407
|
}
|
406
408
|
.map_err(RbPolarsErr::from)?;
|
407
|
-
Ok(
|
409
|
+
Ok(ruby.ary_from_iter(out.into_iter().map(RbDataFrame::new)))
|
408
410
|
}
|
409
411
|
|
410
412
|
pub fn lazy(&self) -> RbLazyFrame {
|
@@ -440,12 +442,13 @@ impl RbDataFrame {
|
|
440
442
|
}
|
441
443
|
|
442
444
|
pub fn map_rows(
|
443
|
-
&
|
445
|
+
ruby: &Ruby,
|
446
|
+
rb_self: &Self,
|
444
447
|
lambda: Value,
|
445
448
|
output_type: Option<Wrap<DataType>>,
|
446
449
|
inference_size: usize,
|
447
450
|
) -> RbResult<(Value, bool)> {
|
448
|
-
let df = &
|
451
|
+
let df = &rb_self.df.borrow();
|
449
452
|
|
450
453
|
let output_type = output_type.map(|dt| dt.0);
|
451
454
|
let out = match output_type {
|
@@ -490,7 +493,7 @@ impl RbDataFrame {
|
|
490
493
|
_ => return apply_lambda_unknown(df, lambda, inference_size),
|
491
494
|
};
|
492
495
|
|
493
|
-
Ok((
|
496
|
+
Ok((ruby.obj_wrap(RbSeries::from(out)).as_value(), false))
|
494
497
|
}
|
495
498
|
|
496
499
|
pub fn shrink_to_fit(&self) {
|
@@ -249,14 +249,13 @@ impl RbDataFrame {
|
|
249
249
|
include_header: bool,
|
250
250
|
separator: u8,
|
251
251
|
quote_char: u8,
|
252
|
-
batch_size:
|
252
|
+
batch_size: NonZeroUsize,
|
253
253
|
datetime_format: Option<String>,
|
254
254
|
date_format: Option<String>,
|
255
255
|
time_format: Option<String>,
|
256
256
|
float_precision: Option<usize>,
|
257
257
|
null_value: Option<String>,
|
258
258
|
) -> RbResult<()> {
|
259
|
-
let batch_size = batch_size.0;
|
260
259
|
let null = null_value.unwrap_or_default();
|
261
260
|
let mut buf = get_file_like(rb_f, true)?;
|
262
261
|
CsvWriter::new(&mut buf)
|