polars-df 0.9.0 → 0.11.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +23 -0
- data/Cargo.lock +144 -57
- data/README.md +7 -6
- data/ext/polars/Cargo.toml +10 -6
- data/ext/polars/src/batched_csv.rs +53 -50
- data/ext/polars/src/conversion/anyvalue.rs +3 -2
- data/ext/polars/src/conversion/mod.rs +31 -67
- data/ext/polars/src/dataframe/construction.rs +186 -0
- data/ext/polars/src/dataframe/export.rs +48 -0
- data/ext/polars/src/dataframe/general.rs +607 -0
- data/ext/polars/src/dataframe/io.rs +463 -0
- data/ext/polars/src/dataframe/mod.rs +26 -0
- data/ext/polars/src/expr/array.rs +6 -2
- data/ext/polars/src/expr/datetime.rs +13 -4
- data/ext/polars/src/expr/general.rs +50 -9
- data/ext/polars/src/expr/list.rs +6 -2
- data/ext/polars/src/expr/rolling.rs +185 -69
- data/ext/polars/src/expr/string.rs +12 -33
- data/ext/polars/src/file.rs +158 -11
- data/ext/polars/src/functions/lazy.rs +20 -3
- data/ext/polars/src/functions/range.rs +74 -0
- data/ext/polars/src/functions/whenthen.rs +47 -17
- data/ext/polars/src/interop/mod.rs +1 -0
- data/ext/polars/src/interop/numo/mod.rs +2 -0
- data/ext/polars/src/interop/numo/to_numo_df.rs +23 -0
- data/ext/polars/src/interop/numo/to_numo_series.rs +60 -0
- data/ext/polars/src/lazyframe/mod.rs +111 -56
- data/ext/polars/src/lib.rs +68 -34
- data/ext/polars/src/map/dataframe.rs +17 -9
- data/ext/polars/src/map/lazy.rs +5 -25
- data/ext/polars/src/map/series.rs +7 -1
- data/ext/polars/src/series/aggregation.rs +47 -30
- data/ext/polars/src/series/export.rs +131 -49
- data/ext/polars/src/series/mod.rs +13 -133
- data/lib/polars/array_expr.rb +6 -2
- data/lib/polars/batched_csv_reader.rb +11 -3
- data/lib/polars/convert.rb +6 -1
- data/lib/polars/data_frame.rb +225 -370
- data/lib/polars/date_time_expr.rb +11 -4
- data/lib/polars/date_time_name_space.rb +14 -4
- data/lib/polars/dynamic_group_by.rb +2 -2
- data/lib/polars/exceptions.rb +4 -0
- data/lib/polars/expr.rb +1171 -54
- data/lib/polars/functions/lazy.rb +3 -3
- data/lib/polars/functions/range/date_range.rb +92 -0
- data/lib/polars/functions/range/datetime_range.rb +149 -0
- data/lib/polars/functions/range/time_range.rb +141 -0
- data/lib/polars/functions/whenthen.rb +74 -5
- data/lib/polars/group_by.rb +88 -23
- data/lib/polars/io/avro.rb +24 -0
- data/lib/polars/{io.rb → io/csv.rb} +307 -489
- data/lib/polars/io/database.rb +73 -0
- data/lib/polars/io/ipc.rb +247 -0
- data/lib/polars/io/json.rb +18 -0
- data/lib/polars/io/ndjson.rb +69 -0
- data/lib/polars/io/parquet.rb +226 -0
- data/lib/polars/lazy_frame.rb +55 -195
- data/lib/polars/lazy_group_by.rb +100 -3
- data/lib/polars/list_expr.rb +6 -2
- data/lib/polars/rolling_group_by.rb +2 -2
- data/lib/polars/series.rb +14 -12
- data/lib/polars/string_expr.rb +38 -36
- data/lib/polars/utils.rb +89 -1
- data/lib/polars/version.rb +1 -1
- data/lib/polars/whenthen.rb +83 -0
- data/lib/polars.rb +10 -3
- metadata +23 -8
- data/ext/polars/src/dataframe.rs +0 -1182
- data/lib/polars/when.rb +0 -16
- data/lib/polars/when_then.rb +0 -19
@@ -1,23 +1,19 @@
|
|
1
|
+
use std::cell::RefCell;
|
2
|
+
use std::path::PathBuf;
|
3
|
+
use std::sync::Mutex;
|
4
|
+
|
1
5
|
use magnus::{prelude::*, RArray, Value};
|
6
|
+
use polars::io::csv::read::OwnedBatchedCsvReader;
|
2
7
|
use polars::io::mmap::MmapBytesReader;
|
3
8
|
use polars::io::RowIndex;
|
4
|
-
use polars::prelude::read_impl::OwnedBatchedCsvReader;
|
5
9
|
use polars::prelude::*;
|
6
|
-
use std::cell::RefCell;
|
7
|
-
use std::path::PathBuf;
|
8
10
|
|
9
11
|
use crate::conversion::*;
|
10
|
-
use crate::prelude::read_impl::OwnedBatchedCsvReaderMmap;
|
11
12
|
use crate::{RbDataFrame, RbPolarsErr, RbResult};
|
12
13
|
|
13
|
-
pub enum BatchedReader {
|
14
|
-
MMap(OwnedBatchedCsvReaderMmap),
|
15
|
-
Read(OwnedBatchedCsvReader),
|
16
|
-
}
|
17
|
-
|
18
14
|
#[magnus::wrap(class = "Polars::RbBatchedCsv")]
|
19
15
|
pub struct RbBatchedCsv {
|
20
|
-
pub reader: RefCell<
|
16
|
+
pub reader: RefCell<Mutex<OwnedBatchedCsvReader>>,
|
21
17
|
}
|
22
18
|
|
23
19
|
impl RbBatchedCsv {
|
@@ -44,18 +40,23 @@ impl RbBatchedCsv {
|
|
44
40
|
let comment_prefix = Option::<String>::try_convert(arguments[16])?;
|
45
41
|
let quote_char = Option::<String>::try_convert(arguments[17])?;
|
46
42
|
let null_values = Option::<Wrap<NullValues>>::try_convert(arguments[18])?;
|
47
|
-
let
|
48
|
-
let
|
49
|
-
let
|
50
|
-
let
|
51
|
-
let
|
43
|
+
let missing_utf8_is_empty_string = bool::try_convert(arguments[19])?;
|
44
|
+
let try_parse_dates = bool::try_convert(arguments[20])?;
|
45
|
+
let skip_rows_after_header = usize::try_convert(arguments[21])?;
|
46
|
+
let row_index = Option::<(String, IdxSize)>::try_convert(arguments[22])?;
|
47
|
+
let sample_size = usize::try_convert(arguments[23])?;
|
48
|
+
let eol_char = String::try_convert(arguments[24])?;
|
49
|
+
let raise_if_empty = bool::try_convert(arguments[25])?;
|
50
|
+
let truncate_ragged_lines = bool::try_convert(arguments[26])?;
|
51
|
+
let decimal_comma = bool::try_convert(arguments[27])?;
|
52
52
|
// end arguments
|
53
53
|
|
54
54
|
let null_values = null_values.map(|w| w.0);
|
55
55
|
let eol_char = eol_char.as_bytes()[0];
|
56
|
-
|
57
|
-
|
58
|
-
|
56
|
+
let row_index = row_index.map(|(name, offset)| RowIndex {
|
57
|
+
name: Arc::from(name.as_str()),
|
58
|
+
offset,
|
59
|
+
});
|
59
60
|
let quote_char = if let Some(s) = quote_char {
|
60
61
|
if s.is_empty() {
|
61
62
|
None
|
@@ -85,53 +86,55 @@ impl RbBatchedCsv {
|
|
85
86
|
|
86
87
|
let file = std::fs::File::open(path).map_err(RbPolarsErr::io)?;
|
87
88
|
let reader = Box::new(file) as Box<dyn MmapBytesReader>;
|
88
|
-
let reader =
|
89
|
-
.
|
90
|
-
.
|
89
|
+
let reader = CsvReadOptions::default()
|
90
|
+
.with_infer_schema_length(infer_schema_length)
|
91
|
+
.with_has_header(has_header)
|
91
92
|
.with_n_rows(n_rows)
|
92
|
-
.with_separator(separator.as_bytes()[0])
|
93
93
|
.with_skip_rows(skip_rows)
|
94
94
|
.with_ignore_errors(ignore_errors)
|
95
|
-
.with_projection(projection)
|
95
|
+
.with_projection(projection.map(Arc::new))
|
96
96
|
.with_rechunk(rechunk)
|
97
97
|
.with_chunk_size(chunk_size)
|
98
|
-
.
|
99
|
-
.with_columns(columns)
|
98
|
+
.with_columns(columns.map(Arc::new))
|
100
99
|
.with_n_threads(n_threads)
|
101
|
-
.
|
102
|
-
.
|
103
|
-
.with_comment_prefix(comment_prefix.as_deref())
|
104
|
-
.with_null_values(null_values)
|
105
|
-
.with_try_parse_dates(try_parse_dates)
|
106
|
-
.with_quote_char(quote_char)
|
107
|
-
.with_end_of_line_char(eol_char)
|
100
|
+
.with_dtype_overwrite(overwrite_dtype_slice.map(Arc::new))
|
101
|
+
.with_low_memory(low_memory)
|
108
102
|
.with_skip_rows_after_header(skip_rows_after_header)
|
109
103
|
.with_row_index(row_index)
|
110
|
-
.
|
104
|
+
.with_sample_size(sample_size)
|
105
|
+
.with_raise_if_empty(raise_if_empty)
|
106
|
+
.with_parse_options(
|
107
|
+
CsvParseOptions::default()
|
108
|
+
.with_separator(separator.as_bytes()[0])
|
109
|
+
.with_encoding(encoding.0)
|
110
|
+
.with_missing_is_null(!missing_utf8_is_empty_string)
|
111
|
+
.with_comment_prefix(comment_prefix.as_deref())
|
112
|
+
.with_null_values(null_values)
|
113
|
+
.with_try_parse_dates(try_parse_dates)
|
114
|
+
.with_quote_char(quote_char)
|
115
|
+
.with_eol_char(eol_char)
|
116
|
+
.with_truncate_ragged_lines(truncate_ragged_lines)
|
117
|
+
.with_decimal_comma(decimal_comma),
|
118
|
+
)
|
119
|
+
.into_reader_with_file_handle(reader);
|
111
120
|
|
112
|
-
let reader =
|
113
|
-
|
114
|
-
|
115
|
-
.map_err(RbPolarsErr::from)?;
|
116
|
-
BatchedReader::Read(reader)
|
117
|
-
} else {
|
118
|
-
let reader = reader
|
119
|
-
.batched_mmap(overwrite_dtype.map(Arc::new))
|
120
|
-
.map_err(RbPolarsErr::from)?;
|
121
|
-
BatchedReader::MMap(reader)
|
122
|
-
};
|
121
|
+
let reader = reader
|
122
|
+
.batched(overwrite_dtype.map(Arc::new))
|
123
|
+
.map_err(RbPolarsErr::from)?;
|
123
124
|
|
124
125
|
Ok(RbBatchedCsv {
|
125
|
-
reader: RefCell::new(reader),
|
126
|
+
reader: RefCell::new(Mutex::new(reader)),
|
126
127
|
})
|
127
128
|
}
|
128
129
|
|
129
130
|
pub fn next_batches(&self, n: usize) -> RbResult<Option<RArray>> {
|
130
|
-
let
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
131
|
+
let reader = &self.reader;
|
132
|
+
let batches = reader
|
133
|
+
.borrow()
|
134
|
+
.lock()
|
135
|
+
.map_err(|e| RbPolarsErr::other(e.to_string()))?
|
136
|
+
.next_batches(n)
|
137
|
+
.map_err(RbPolarsErr::from)?;
|
135
138
|
|
136
139
|
Ok(batches.map(|batches| RArray::from_iter(batches.into_iter().map(RbDataFrame::from))))
|
137
140
|
}
|
@@ -3,8 +3,8 @@ use magnus::{
|
|
3
3
|
class, prelude::*, r_hash::ForEach, Float, Integer, IntoValue, RArray, RHash, RString, Ruby,
|
4
4
|
TryConvert, Value,
|
5
5
|
};
|
6
|
-
use polars::frame::row::any_values_to_dtype;
|
7
6
|
use polars::prelude::*;
|
7
|
+
use polars_core::utils::any_values_to_supertype_and_n_dtypes;
|
8
8
|
|
9
9
|
use super::{struct_dict, ObjectValue, Wrap};
|
10
10
|
|
@@ -120,7 +120,8 @@ impl<'s> TryConvert for Wrap<AnyValue<'s>> {
|
|
120
120
|
avs.push(Wrap::<AnyValue>::try_convert(item?)?.0)
|
121
121
|
}
|
122
122
|
|
123
|
-
let (dtype, _n_types) =
|
123
|
+
let (dtype, _n_types) =
|
124
|
+
any_values_to_supertype_and_n_dtypes(&avs).map_err(RbPolarsErr::from)?;
|
124
125
|
|
125
126
|
// push the rest
|
126
127
|
avs.reserve(list.len());
|
@@ -7,7 +7,7 @@ use std::num::NonZeroUsize;
|
|
7
7
|
|
8
8
|
use magnus::{
|
9
9
|
class, exception, prelude::*, r_hash::ForEach, value::Opaque, IntoValue, Module, RArray, RHash,
|
10
|
-
Ruby,
|
10
|
+
Ruby, TryConvert, Value,
|
11
11
|
};
|
12
12
|
use polars::chunked_array::object::PolarsObjectSafe;
|
13
13
|
use polars::chunked_array::ops::{FillNullLimit, FillNullStrategy};
|
@@ -18,6 +18,7 @@ use polars::io::avro::AvroCompression;
|
|
18
18
|
use polars::prelude::*;
|
19
19
|
use polars::series::ops::NullBehavior;
|
20
20
|
use polars_core::utils::arrow::array::Array;
|
21
|
+
use polars_core::utils::materialize_dyn_int;
|
21
22
|
use polars_utils::total_ord::{TotalEq, TotalHash};
|
22
23
|
use smartstring::alias::String as SmartString;
|
23
24
|
|
@@ -154,7 +155,7 @@ impl IntoValue for Wrap<DataType> {
|
|
154
155
|
let class = pl.const_get::<_, Value>("Float32").unwrap();
|
155
156
|
class.funcall("new", ()).unwrap()
|
156
157
|
}
|
157
|
-
DataType::Float64 => {
|
158
|
+
DataType::Float64 | DataType::Unknown(UnknownKind::Float) => {
|
158
159
|
let class = pl.const_get::<_, Value>("Float64").unwrap();
|
159
160
|
class.funcall("new", ()).unwrap()
|
160
161
|
}
|
@@ -168,7 +169,7 @@ impl IntoValue for Wrap<DataType> {
|
|
168
169
|
let class = pl.const_get::<_, Value>("Boolean").unwrap();
|
169
170
|
class.funcall("new", ()).unwrap()
|
170
171
|
}
|
171
|
-
DataType::String => {
|
172
|
+
DataType::String | DataType::Unknown(UnknownKind::Str) => {
|
172
173
|
let class = pl.const_get::<_, Value>("String").unwrap();
|
173
174
|
class.funcall("new", ()).unwrap()
|
174
175
|
}
|
@@ -217,10 +218,7 @@ impl IntoValue for Wrap<DataType> {
|
|
217
218
|
let class = pl.const_get::<_, Value>("Enum").unwrap();
|
218
219
|
let s = Series::from_arrow("category", categories.to_boxed()).unwrap();
|
219
220
|
let series = to_series(s.into());
|
220
|
-
class
|
221
|
-
.funcall::<_, _, Value>("new", (series,))
|
222
|
-
.unwrap()
|
223
|
-
.into()
|
221
|
+
class.funcall::<_, _, Value>("new", (series,)).unwrap()
|
224
222
|
}
|
225
223
|
DataType::Time => {
|
226
224
|
let class = pl.const_get::<_, Value>("Time").unwrap();
|
@@ -245,7 +243,10 @@ impl IntoValue for Wrap<DataType> {
|
|
245
243
|
let class = pl.const_get::<_, Value>("Null").unwrap();
|
246
244
|
class.funcall("new", ()).unwrap()
|
247
245
|
}
|
248
|
-
DataType::Unknown => {
|
246
|
+
DataType::Unknown(UnknownKind::Int(v)) => {
|
247
|
+
Wrap(materialize_dyn_int(v).dtype()).into_value()
|
248
|
+
}
|
249
|
+
DataType::Unknown(_) => {
|
249
250
|
let class = pl.const_get::<_, Value>("Unknown").unwrap();
|
250
251
|
class.funcall("new", ()).unwrap()
|
251
252
|
}
|
@@ -313,7 +314,7 @@ impl TryConvert for Wrap<DataType> {
|
|
313
314
|
"Polars::Object" => DataType::Object(OBJECT_NAME, None),
|
314
315
|
"Polars::List" => DataType::List(Box::new(DataType::Null)),
|
315
316
|
"Polars::Null" => DataType::Null,
|
316
|
-
"Polars::Unknown" => DataType::Unknown,
|
317
|
+
"Polars::Unknown" => DataType::Unknown(Default::default()),
|
317
318
|
dt => {
|
318
319
|
return Err(RbValueError::new_err(format!(
|
319
320
|
"{dt} is not a correct polars DataType.",
|
@@ -353,7 +354,7 @@ impl TryConvert for Wrap<DataType> {
|
|
353
354
|
"Polars::Float32" => DataType::Float32,
|
354
355
|
"Polars::Float64" => DataType::Float64,
|
355
356
|
"Polars::Null" => DataType::Null,
|
356
|
-
"Polars::Unknown" => DataType::Unknown,
|
357
|
+
"Polars::Unknown" => DataType::Unknown(Default::default()),
|
357
358
|
"Polars::Duration" => {
|
358
359
|
let time_unit: Value = ob.funcall("time_unit", ()).unwrap();
|
359
360
|
let time_unit = Wrap::<TimeUnit>::try_convert(time_unit)?.0;
|
@@ -413,7 +414,7 @@ impl TryConvert for Wrap<DataType> {
|
|
413
414
|
"obj" => DataType::Object(OBJECT_NAME, None),
|
414
415
|
"list" => DataType::List(Box::new(DataType::Boolean)),
|
415
416
|
"null" => DataType::Null,
|
416
|
-
"unk" => DataType::Unknown,
|
417
|
+
"unk" => DataType::Unknown(Default::default()),
|
417
418
|
_ => {
|
418
419
|
return Err(RbValueError::new_err(format!(
|
419
420
|
"{} is not a supported DataType.",
|
@@ -549,57 +550,6 @@ impl Default for ObjectValue {
|
|
549
550
|
}
|
550
551
|
}
|
551
552
|
|
552
|
-
pub(crate) fn dicts_to_rows(
|
553
|
-
records: &Value,
|
554
|
-
infer_schema_len: Option<usize>,
|
555
|
-
schema_columns: PlIndexSet<String>,
|
556
|
-
) -> RbResult<(Vec<Row>, Vec<String>)> {
|
557
|
-
let infer_schema_len = infer_schema_len.map(|n| std::cmp::max(1, n));
|
558
|
-
let (dicts, len) = get_rbseq(*records)?;
|
559
|
-
|
560
|
-
let key_names = {
|
561
|
-
if !schema_columns.is_empty() {
|
562
|
-
schema_columns
|
563
|
-
} else {
|
564
|
-
let mut inferred_keys = PlIndexSet::new();
|
565
|
-
for d in dicts.each().take(infer_schema_len.unwrap_or(usize::MAX)) {
|
566
|
-
let d = d?;
|
567
|
-
let d = RHash::try_convert(d)?;
|
568
|
-
|
569
|
-
d.foreach(|name: Value, _value: Value| {
|
570
|
-
if let Some(v) = Symbol::from_value(name) {
|
571
|
-
inferred_keys.insert(v.name()?.into());
|
572
|
-
} else {
|
573
|
-
inferred_keys.insert(String::try_convert(name)?);
|
574
|
-
};
|
575
|
-
Ok(ForEach::Continue)
|
576
|
-
})?;
|
577
|
-
}
|
578
|
-
inferred_keys
|
579
|
-
}
|
580
|
-
};
|
581
|
-
|
582
|
-
let mut rows = Vec::with_capacity(len);
|
583
|
-
|
584
|
-
for d in dicts.each() {
|
585
|
-
let d = d?;
|
586
|
-
let d = RHash::try_convert(d)?;
|
587
|
-
|
588
|
-
let mut row = Vec::with_capacity(key_names.len());
|
589
|
-
|
590
|
-
for k in key_names.iter() {
|
591
|
-
// TODO improve performance
|
592
|
-
let val = match d.get(k.clone()).or_else(|| d.get(Symbol::new(k))) {
|
593
|
-
None => AnyValue::Null,
|
594
|
-
Some(val) => Wrap::<AnyValue>::try_convert(val)?.0,
|
595
|
-
};
|
596
|
-
row.push(val)
|
597
|
-
}
|
598
|
-
rows.push(Row(row))
|
599
|
-
}
|
600
|
-
Ok((rows, key_names.into_iter().collect()))
|
601
|
-
}
|
602
|
-
|
603
553
|
impl TryConvert for Wrap<AsofStrategy> {
|
604
554
|
fn try_convert(ob: Value) -> RbResult<Self> {
|
605
555
|
let parsed = match String::try_convert(ob)?.as_str() {
|
@@ -736,12 +686,11 @@ impl TryConvert for Wrap<JoinType> {
|
|
736
686
|
let parsed = match String::try_convert(ob)?.as_str() {
|
737
687
|
"inner" => JoinType::Inner,
|
738
688
|
"left" => JoinType::Left,
|
739
|
-
"outer" => JoinType::Outer
|
740
|
-
"outer_coalesce" => JoinType::Outer
|
689
|
+
"outer" => JoinType::Outer,
|
690
|
+
"outer_coalesce" => JoinType::Outer,
|
741
691
|
"semi" => JoinType::Semi,
|
742
692
|
"anti" => JoinType::Anti,
|
743
|
-
|
744
|
-
// "cross" => JoinType::Cross,
|
693
|
+
"cross" => JoinType::Cross,
|
745
694
|
v => {
|
746
695
|
return Err(RbValueError::new_err(format!(
|
747
696
|
"how must be one of {{'inner', 'left', 'outer', 'semi', 'anti', 'cross'}}, got {}",
|
@@ -785,6 +734,21 @@ impl TryConvert for Wrap<ListToStructWidthStrategy> {
|
|
785
734
|
}
|
786
735
|
}
|
787
736
|
|
737
|
+
impl TryConvert for Wrap<NonExistent> {
|
738
|
+
fn try_convert(ob: Value) -> RbResult<Self> {
|
739
|
+
let parsed = match String::try_convert(ob)?.as_str() {
|
740
|
+
"null" => NonExistent::Null,
|
741
|
+
"raise" => NonExistent::Raise,
|
742
|
+
v => {
|
743
|
+
return Err(RbValueError::new_err(format!(
|
744
|
+
"`non_existent` must be one of {{'null', 'raise'}}, got {v}",
|
745
|
+
)))
|
746
|
+
}
|
747
|
+
};
|
748
|
+
Ok(Wrap(parsed))
|
749
|
+
}
|
750
|
+
}
|
751
|
+
|
788
752
|
impl TryConvert for Wrap<NullBehavior> {
|
789
753
|
fn try_convert(ob: Value) -> RbResult<Self> {
|
790
754
|
let parsed = match String::try_convert(ob)?.as_str() {
|
@@ -1066,7 +1030,7 @@ impl TryConvert for Wrap<NonZeroUsize> {
|
|
1066
1030
|
fn try_convert(ob: Value) -> RbResult<Self> {
|
1067
1031
|
let v = usize::try_convert(ob)?;
|
1068
1032
|
NonZeroUsize::new(v)
|
1069
|
-
.map(
|
1033
|
+
.map(Wrap)
|
1070
1034
|
.ok_or(RbValueError::new_err("must be non-zero".into()))
|
1071
1035
|
}
|
1072
1036
|
}
|
@@ -0,0 +1,186 @@
|
|
1
|
+
use magnus::{prelude::*, r_hash::ForEach, RArray, RHash, Symbol, Value};
|
2
|
+
use polars::frame::row::{rows_to_schema_supertypes, rows_to_supertypes, Row};
|
3
|
+
use polars::prelude::*;
|
4
|
+
|
5
|
+
use super::*;
|
6
|
+
use crate::conversion::*;
|
7
|
+
use crate::{RbPolarsErr, RbResult};
|
8
|
+
|
9
|
+
impl RbDataFrame {
|
10
|
+
pub fn from_rows(
|
11
|
+
rb_rows: RArray,
|
12
|
+
infer_schema_length: Option<usize>,
|
13
|
+
schema: Option<Wrap<Schema>>,
|
14
|
+
) -> RbResult<Self> {
|
15
|
+
let mut data = Vec::with_capacity(rb_rows.len());
|
16
|
+
for v in rb_rows.each() {
|
17
|
+
let rb_row = RArray::try_convert(v?)?;
|
18
|
+
let mut row = Vec::with_capacity(rb_row.len());
|
19
|
+
for val in rb_row.each() {
|
20
|
+
row.push(Wrap::<AnyValue>::try_convert(val?)?.0);
|
21
|
+
}
|
22
|
+
data.push(Row(row));
|
23
|
+
}
|
24
|
+
let schema = schema.map(|wrap| wrap.0);
|
25
|
+
finish_from_rows(data, schema, None, infer_schema_length)
|
26
|
+
}
|
27
|
+
|
28
|
+
pub fn from_hashes(
|
29
|
+
data: Value,
|
30
|
+
schema: Option<Wrap<Schema>>,
|
31
|
+
schema_overrides: Option<Wrap<Schema>>,
|
32
|
+
strict: bool,
|
33
|
+
infer_schema_length: Option<usize>,
|
34
|
+
) -> RbResult<Self> {
|
35
|
+
let schema = schema.map(|wrap| wrap.0);
|
36
|
+
let schema_overrides = schema_overrides.map(|wrap| wrap.0);
|
37
|
+
|
38
|
+
let names = get_schema_names(&data, schema.as_ref(), infer_schema_length)?;
|
39
|
+
let rows = dicts_to_rows(&data, &names, strict)?;
|
40
|
+
|
41
|
+
let schema = schema.or_else(|| {
|
42
|
+
Some(columns_names_to_empty_schema(
|
43
|
+
names.iter().map(String::as_str),
|
44
|
+
))
|
45
|
+
});
|
46
|
+
|
47
|
+
finish_from_rows(rows, schema, schema_overrides, infer_schema_length)
|
48
|
+
}
|
49
|
+
}
|
50
|
+
|
51
|
+
fn finish_from_rows(
|
52
|
+
rows: Vec<Row>,
|
53
|
+
schema: Option<Schema>,
|
54
|
+
schema_overrides: Option<Schema>,
|
55
|
+
infer_schema_length: Option<usize>,
|
56
|
+
) -> RbResult<RbDataFrame> {
|
57
|
+
// Object builder must be registered
|
58
|
+
crate::on_startup::register_object_builder();
|
59
|
+
|
60
|
+
let mut schema = if let Some(mut schema) = schema {
|
61
|
+
resolve_schema_overrides(&mut schema, schema_overrides);
|
62
|
+
update_schema_from_rows(&mut schema, &rows, infer_schema_length)?;
|
63
|
+
schema
|
64
|
+
} else {
|
65
|
+
rows_to_schema_supertypes(&rows, infer_schema_length).map_err(RbPolarsErr::from)?
|
66
|
+
};
|
67
|
+
|
68
|
+
// TODO: Remove this step when Decimals are supported properly.
|
69
|
+
// Erasing the decimal precision/scale here will just require us to infer it again later.
|
70
|
+
// https://github.com/pola-rs/polars/issues/14427
|
71
|
+
erase_decimal_precision_scale(&mut schema);
|
72
|
+
|
73
|
+
let df = DataFrame::from_rows_and_schema(&rows, &schema).map_err(RbPolarsErr::from)?;
|
74
|
+
Ok(df.into())
|
75
|
+
}
|
76
|
+
|
77
|
+
fn update_schema_from_rows(
|
78
|
+
schema: &mut Schema,
|
79
|
+
rows: &[Row],
|
80
|
+
infer_schema_length: Option<usize>,
|
81
|
+
) -> RbResult<()> {
|
82
|
+
let schema_is_complete = schema.iter_dtypes().all(|dtype| dtype.is_known());
|
83
|
+
if schema_is_complete {
|
84
|
+
return Ok(());
|
85
|
+
}
|
86
|
+
|
87
|
+
// TODO: Only infer dtypes for columns with an unknown dtype
|
88
|
+
let inferred_dtypes =
|
89
|
+
rows_to_supertypes(rows, infer_schema_length).map_err(RbPolarsErr::from)?;
|
90
|
+
let inferred_dtypes_slice = inferred_dtypes.as_slice();
|
91
|
+
|
92
|
+
for (i, dtype) in schema.iter_dtypes_mut().enumerate() {
|
93
|
+
if !dtype.is_known() {
|
94
|
+
*dtype = inferred_dtypes_slice.get(i).ok_or_else(|| {
|
95
|
+
polars_err!(SchemaMismatch: "the number of columns in the schema does not match the data")
|
96
|
+
})
|
97
|
+
.map_err(RbPolarsErr::from)?
|
98
|
+
.clone();
|
99
|
+
}
|
100
|
+
}
|
101
|
+
Ok(())
|
102
|
+
}
|
103
|
+
|
104
|
+
fn resolve_schema_overrides(schema: &mut Schema, schema_overrides: Option<Schema>) {
|
105
|
+
if let Some(overrides) = schema_overrides {
|
106
|
+
for (name, dtype) in overrides.into_iter() {
|
107
|
+
schema.set_dtype(name.as_str(), dtype);
|
108
|
+
}
|
109
|
+
}
|
110
|
+
}
|
111
|
+
|
112
|
+
fn erase_decimal_precision_scale(schema: &mut Schema) {
|
113
|
+
for dtype in schema.iter_dtypes_mut() {
|
114
|
+
if let DataType::Decimal(_, _) = dtype {
|
115
|
+
*dtype = DataType::Decimal(None, None)
|
116
|
+
}
|
117
|
+
}
|
118
|
+
}
|
119
|
+
|
120
|
+
fn columns_names_to_empty_schema<'a, I>(column_names: I) -> Schema
|
121
|
+
where
|
122
|
+
I: IntoIterator<Item = &'a str>,
|
123
|
+
{
|
124
|
+
let fields = column_names
|
125
|
+
.into_iter()
|
126
|
+
.map(|c| Field::new(c, DataType::Unknown(Default::default())));
|
127
|
+
Schema::from_iter(fields)
|
128
|
+
}
|
129
|
+
|
130
|
+
fn dicts_to_rows<'a>(data: &Value, names: &'a [String], _strict: bool) -> RbResult<Vec<Row<'a>>> {
|
131
|
+
let (data, len) = get_rbseq(*data)?;
|
132
|
+
let mut rows = Vec::with_capacity(len);
|
133
|
+
for d in data.each() {
|
134
|
+
let d = d?;
|
135
|
+
let d = RHash::try_convert(d)?;
|
136
|
+
|
137
|
+
let mut row = Vec::with_capacity(names.len());
|
138
|
+
for k in names.iter() {
|
139
|
+
// TODO improve performance
|
140
|
+
let val = match d.get(k.clone()).or_else(|| d.get(Symbol::new(k))) {
|
141
|
+
None => AnyValue::Null,
|
142
|
+
Some(val) => Wrap::<AnyValue>::try_convert(val)?.0,
|
143
|
+
};
|
144
|
+
row.push(val)
|
145
|
+
}
|
146
|
+
rows.push(Row(row))
|
147
|
+
}
|
148
|
+
Ok(rows)
|
149
|
+
}
|
150
|
+
|
151
|
+
fn get_schema_names(
|
152
|
+
data: &Value,
|
153
|
+
schema: Option<&Schema>,
|
154
|
+
infer_schema_length: Option<usize>,
|
155
|
+
) -> RbResult<Vec<String>> {
|
156
|
+
if let Some(schema) = schema {
|
157
|
+
Ok(schema.iter_names().map(|n| n.to_string()).collect())
|
158
|
+
} else {
|
159
|
+
infer_schema_names_from_data(data, infer_schema_length)
|
160
|
+
}
|
161
|
+
}
|
162
|
+
|
163
|
+
fn infer_schema_names_from_data(
|
164
|
+
data: &Value,
|
165
|
+
infer_schema_length: Option<usize>,
|
166
|
+
) -> RbResult<Vec<String>> {
|
167
|
+
let (data, data_len) = get_rbseq(*data)?;
|
168
|
+
let infer_schema_length = infer_schema_length
|
169
|
+
.map(|n| std::cmp::max(1, n))
|
170
|
+
.unwrap_or(data_len);
|
171
|
+
|
172
|
+
let mut names = PlIndexSet::new();
|
173
|
+
for d in data.each().take(infer_schema_length) {
|
174
|
+
let d = d?;
|
175
|
+
let d = RHash::try_convert(d)?;
|
176
|
+
d.foreach(|name: Value, _value: Value| {
|
177
|
+
if let Some(v) = Symbol::from_value(name) {
|
178
|
+
names.insert(v.name()?.into());
|
179
|
+
} else {
|
180
|
+
names.insert(String::try_convert(name)?);
|
181
|
+
};
|
182
|
+
Ok(ForEach::Continue)
|
183
|
+
})?;
|
184
|
+
}
|
185
|
+
Ok(names.into_iter().collect())
|
186
|
+
}
|
@@ -0,0 +1,48 @@
|
|
1
|
+
use magnus::{prelude::*, IntoValue, RArray, Value};
|
2
|
+
|
3
|
+
use super::*;
|
4
|
+
use crate::conversion::{ObjectValue, Wrap};
|
5
|
+
|
6
|
+
impl RbDataFrame {
|
7
|
+
pub fn row_tuple(&self, idx: i64) -> Value {
|
8
|
+
let idx = if idx < 0 {
|
9
|
+
(self.df.borrow().height() as i64 + idx) as usize
|
10
|
+
} else {
|
11
|
+
idx as usize
|
12
|
+
};
|
13
|
+
RArray::from_iter(
|
14
|
+
self.df
|
15
|
+
.borrow()
|
16
|
+
.get_columns()
|
17
|
+
.iter()
|
18
|
+
.map(|s| match s.dtype() {
|
19
|
+
DataType::Object(_, _) => {
|
20
|
+
let obj: Option<&ObjectValue> = s.get_object(idx).map(|any| any.into());
|
21
|
+
obj.unwrap().to_object()
|
22
|
+
}
|
23
|
+
_ => Wrap(s.get(idx).unwrap()).into_value(),
|
24
|
+
}),
|
25
|
+
)
|
26
|
+
.as_value()
|
27
|
+
}
|
28
|
+
|
29
|
+
pub fn row_tuples(&self) -> Value {
|
30
|
+
let df = &self.df;
|
31
|
+
RArray::from_iter((0..df.borrow().height()).map(|idx| {
|
32
|
+
RArray::from_iter(
|
33
|
+
self.df
|
34
|
+
.borrow()
|
35
|
+
.get_columns()
|
36
|
+
.iter()
|
37
|
+
.map(|s| match s.dtype() {
|
38
|
+
DataType::Object(_, _) => {
|
39
|
+
let obj: Option<&ObjectValue> = s.get_object(idx).map(|any| any.into());
|
40
|
+
obj.unwrap().to_object()
|
41
|
+
}
|
42
|
+
_ => Wrap(s.get(idx).unwrap()).into_value(),
|
43
|
+
}),
|
44
|
+
)
|
45
|
+
}))
|
46
|
+
.as_value()
|
47
|
+
}
|
48
|
+
}
|