polars-df 0.9.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +23 -0
- data/Cargo.lock +144 -57
- data/README.md +7 -6
- data/ext/polars/Cargo.toml +10 -6
- data/ext/polars/src/batched_csv.rs +53 -50
- data/ext/polars/src/conversion/anyvalue.rs +3 -2
- data/ext/polars/src/conversion/mod.rs +31 -67
- data/ext/polars/src/dataframe/construction.rs +186 -0
- data/ext/polars/src/dataframe/export.rs +48 -0
- data/ext/polars/src/dataframe/general.rs +607 -0
- data/ext/polars/src/dataframe/io.rs +463 -0
- data/ext/polars/src/dataframe/mod.rs +26 -0
- data/ext/polars/src/expr/array.rs +6 -2
- data/ext/polars/src/expr/datetime.rs +13 -4
- data/ext/polars/src/expr/general.rs +50 -9
- data/ext/polars/src/expr/list.rs +6 -2
- data/ext/polars/src/expr/rolling.rs +185 -69
- data/ext/polars/src/expr/string.rs +12 -33
- data/ext/polars/src/file.rs +158 -11
- data/ext/polars/src/functions/lazy.rs +20 -3
- data/ext/polars/src/functions/range.rs +74 -0
- data/ext/polars/src/functions/whenthen.rs +47 -17
- data/ext/polars/src/interop/mod.rs +1 -0
- data/ext/polars/src/interop/numo/mod.rs +2 -0
- data/ext/polars/src/interop/numo/to_numo_df.rs +23 -0
- data/ext/polars/src/interop/numo/to_numo_series.rs +60 -0
- data/ext/polars/src/lazyframe/mod.rs +111 -56
- data/ext/polars/src/lib.rs +68 -34
- data/ext/polars/src/map/dataframe.rs +17 -9
- data/ext/polars/src/map/lazy.rs +5 -25
- data/ext/polars/src/map/series.rs +7 -1
- data/ext/polars/src/series/aggregation.rs +47 -30
- data/ext/polars/src/series/export.rs +131 -49
- data/ext/polars/src/series/mod.rs +13 -133
- data/lib/polars/array_expr.rb +6 -2
- data/lib/polars/batched_csv_reader.rb +11 -3
- data/lib/polars/convert.rb +6 -1
- data/lib/polars/data_frame.rb +225 -370
- data/lib/polars/date_time_expr.rb +11 -4
- data/lib/polars/date_time_name_space.rb +14 -4
- data/lib/polars/dynamic_group_by.rb +2 -2
- data/lib/polars/exceptions.rb +4 -0
- data/lib/polars/expr.rb +1171 -54
- data/lib/polars/functions/lazy.rb +3 -3
- data/lib/polars/functions/range/date_range.rb +92 -0
- data/lib/polars/functions/range/datetime_range.rb +149 -0
- data/lib/polars/functions/range/time_range.rb +141 -0
- data/lib/polars/functions/whenthen.rb +74 -5
- data/lib/polars/group_by.rb +88 -23
- data/lib/polars/io/avro.rb +24 -0
- data/lib/polars/{io.rb → io/csv.rb} +307 -489
- data/lib/polars/io/database.rb +73 -0
- data/lib/polars/io/ipc.rb +247 -0
- data/lib/polars/io/json.rb +18 -0
- data/lib/polars/io/ndjson.rb +69 -0
- data/lib/polars/io/parquet.rb +226 -0
- data/lib/polars/lazy_frame.rb +55 -195
- data/lib/polars/lazy_group_by.rb +100 -3
- data/lib/polars/list_expr.rb +6 -2
- data/lib/polars/rolling_group_by.rb +2 -2
- data/lib/polars/series.rb +14 -12
- data/lib/polars/string_expr.rb +38 -36
- data/lib/polars/utils.rb +89 -1
- data/lib/polars/version.rb +1 -1
- data/lib/polars/whenthen.rb +83 -0
- data/lib/polars.rb +10 -3
- metadata +23 -8
- data/ext/polars/src/dataframe.rs +0 -1182
- data/lib/polars/when.rb +0 -16
- data/lib/polars/when_then.rb +0 -19
@@ -1,23 +1,19 @@
|
|
1
|
+
use std::cell::RefCell;
|
2
|
+
use std::path::PathBuf;
|
3
|
+
use std::sync::Mutex;
|
4
|
+
|
1
5
|
use magnus::{prelude::*, RArray, Value};
|
6
|
+
use polars::io::csv::read::OwnedBatchedCsvReader;
|
2
7
|
use polars::io::mmap::MmapBytesReader;
|
3
8
|
use polars::io::RowIndex;
|
4
|
-
use polars::prelude::read_impl::OwnedBatchedCsvReader;
|
5
9
|
use polars::prelude::*;
|
6
|
-
use std::cell::RefCell;
|
7
|
-
use std::path::PathBuf;
|
8
10
|
|
9
11
|
use crate::conversion::*;
|
10
|
-
use crate::prelude::read_impl::OwnedBatchedCsvReaderMmap;
|
11
12
|
use crate::{RbDataFrame, RbPolarsErr, RbResult};
|
12
13
|
|
13
|
-
pub enum BatchedReader {
|
14
|
-
MMap(OwnedBatchedCsvReaderMmap),
|
15
|
-
Read(OwnedBatchedCsvReader),
|
16
|
-
}
|
17
|
-
|
18
14
|
#[magnus::wrap(class = "Polars::RbBatchedCsv")]
|
19
15
|
pub struct RbBatchedCsv {
|
20
|
-
pub reader: RefCell<
|
16
|
+
pub reader: RefCell<Mutex<OwnedBatchedCsvReader>>,
|
21
17
|
}
|
22
18
|
|
23
19
|
impl RbBatchedCsv {
|
@@ -44,18 +40,23 @@ impl RbBatchedCsv {
|
|
44
40
|
let comment_prefix = Option::<String>::try_convert(arguments[16])?;
|
45
41
|
let quote_char = Option::<String>::try_convert(arguments[17])?;
|
46
42
|
let null_values = Option::<Wrap<NullValues>>::try_convert(arguments[18])?;
|
47
|
-
let
|
48
|
-
let
|
49
|
-
let
|
50
|
-
let
|
51
|
-
let
|
43
|
+
let missing_utf8_is_empty_string = bool::try_convert(arguments[19])?;
|
44
|
+
let try_parse_dates = bool::try_convert(arguments[20])?;
|
45
|
+
let skip_rows_after_header = usize::try_convert(arguments[21])?;
|
46
|
+
let row_index = Option::<(String, IdxSize)>::try_convert(arguments[22])?;
|
47
|
+
let sample_size = usize::try_convert(arguments[23])?;
|
48
|
+
let eol_char = String::try_convert(arguments[24])?;
|
49
|
+
let raise_if_empty = bool::try_convert(arguments[25])?;
|
50
|
+
let truncate_ragged_lines = bool::try_convert(arguments[26])?;
|
51
|
+
let decimal_comma = bool::try_convert(arguments[27])?;
|
52
52
|
// end arguments
|
53
53
|
|
54
54
|
let null_values = null_values.map(|w| w.0);
|
55
55
|
let eol_char = eol_char.as_bytes()[0];
|
56
|
-
|
57
|
-
|
58
|
-
|
56
|
+
let row_index = row_index.map(|(name, offset)| RowIndex {
|
57
|
+
name: Arc::from(name.as_str()),
|
58
|
+
offset,
|
59
|
+
});
|
59
60
|
let quote_char = if let Some(s) = quote_char {
|
60
61
|
if s.is_empty() {
|
61
62
|
None
|
@@ -85,53 +86,55 @@ impl RbBatchedCsv {
|
|
85
86
|
|
86
87
|
let file = std::fs::File::open(path).map_err(RbPolarsErr::io)?;
|
87
88
|
let reader = Box::new(file) as Box<dyn MmapBytesReader>;
|
88
|
-
let reader =
|
89
|
-
.
|
90
|
-
.
|
89
|
+
let reader = CsvReadOptions::default()
|
90
|
+
.with_infer_schema_length(infer_schema_length)
|
91
|
+
.with_has_header(has_header)
|
91
92
|
.with_n_rows(n_rows)
|
92
|
-
.with_separator(separator.as_bytes()[0])
|
93
93
|
.with_skip_rows(skip_rows)
|
94
94
|
.with_ignore_errors(ignore_errors)
|
95
|
-
.with_projection(projection)
|
95
|
+
.with_projection(projection.map(Arc::new))
|
96
96
|
.with_rechunk(rechunk)
|
97
97
|
.with_chunk_size(chunk_size)
|
98
|
-
.
|
99
|
-
.with_columns(columns)
|
98
|
+
.with_columns(columns.map(Arc::new))
|
100
99
|
.with_n_threads(n_threads)
|
101
|
-
.
|
102
|
-
.
|
103
|
-
.with_comment_prefix(comment_prefix.as_deref())
|
104
|
-
.with_null_values(null_values)
|
105
|
-
.with_try_parse_dates(try_parse_dates)
|
106
|
-
.with_quote_char(quote_char)
|
107
|
-
.with_end_of_line_char(eol_char)
|
100
|
+
.with_dtype_overwrite(overwrite_dtype_slice.map(Arc::new))
|
101
|
+
.with_low_memory(low_memory)
|
108
102
|
.with_skip_rows_after_header(skip_rows_after_header)
|
109
103
|
.with_row_index(row_index)
|
110
|
-
.
|
104
|
+
.with_sample_size(sample_size)
|
105
|
+
.with_raise_if_empty(raise_if_empty)
|
106
|
+
.with_parse_options(
|
107
|
+
CsvParseOptions::default()
|
108
|
+
.with_separator(separator.as_bytes()[0])
|
109
|
+
.with_encoding(encoding.0)
|
110
|
+
.with_missing_is_null(!missing_utf8_is_empty_string)
|
111
|
+
.with_comment_prefix(comment_prefix.as_deref())
|
112
|
+
.with_null_values(null_values)
|
113
|
+
.with_try_parse_dates(try_parse_dates)
|
114
|
+
.with_quote_char(quote_char)
|
115
|
+
.with_eol_char(eol_char)
|
116
|
+
.with_truncate_ragged_lines(truncate_ragged_lines)
|
117
|
+
.with_decimal_comma(decimal_comma),
|
118
|
+
)
|
119
|
+
.into_reader_with_file_handle(reader);
|
111
120
|
|
112
|
-
let reader =
|
113
|
-
|
114
|
-
|
115
|
-
.map_err(RbPolarsErr::from)?;
|
116
|
-
BatchedReader::Read(reader)
|
117
|
-
} else {
|
118
|
-
let reader = reader
|
119
|
-
.batched_mmap(overwrite_dtype.map(Arc::new))
|
120
|
-
.map_err(RbPolarsErr::from)?;
|
121
|
-
BatchedReader::MMap(reader)
|
122
|
-
};
|
121
|
+
let reader = reader
|
122
|
+
.batched(overwrite_dtype.map(Arc::new))
|
123
|
+
.map_err(RbPolarsErr::from)?;
|
123
124
|
|
124
125
|
Ok(RbBatchedCsv {
|
125
|
-
reader: RefCell::new(reader),
|
126
|
+
reader: RefCell::new(Mutex::new(reader)),
|
126
127
|
})
|
127
128
|
}
|
128
129
|
|
129
130
|
pub fn next_batches(&self, n: usize) -> RbResult<Option<RArray>> {
|
130
|
-
let
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
131
|
+
let reader = &self.reader;
|
132
|
+
let batches = reader
|
133
|
+
.borrow()
|
134
|
+
.lock()
|
135
|
+
.map_err(|e| RbPolarsErr::other(e.to_string()))?
|
136
|
+
.next_batches(n)
|
137
|
+
.map_err(RbPolarsErr::from)?;
|
135
138
|
|
136
139
|
Ok(batches.map(|batches| RArray::from_iter(batches.into_iter().map(RbDataFrame::from))))
|
137
140
|
}
|
@@ -3,8 +3,8 @@ use magnus::{
|
|
3
3
|
class, prelude::*, r_hash::ForEach, Float, Integer, IntoValue, RArray, RHash, RString, Ruby,
|
4
4
|
TryConvert, Value,
|
5
5
|
};
|
6
|
-
use polars::frame::row::any_values_to_dtype;
|
7
6
|
use polars::prelude::*;
|
7
|
+
use polars_core::utils::any_values_to_supertype_and_n_dtypes;
|
8
8
|
|
9
9
|
use super::{struct_dict, ObjectValue, Wrap};
|
10
10
|
|
@@ -120,7 +120,8 @@ impl<'s> TryConvert for Wrap<AnyValue<'s>> {
|
|
120
120
|
avs.push(Wrap::<AnyValue>::try_convert(item?)?.0)
|
121
121
|
}
|
122
122
|
|
123
|
-
let (dtype, _n_types) =
|
123
|
+
let (dtype, _n_types) =
|
124
|
+
any_values_to_supertype_and_n_dtypes(&avs).map_err(RbPolarsErr::from)?;
|
124
125
|
|
125
126
|
// push the rest
|
126
127
|
avs.reserve(list.len());
|
@@ -7,7 +7,7 @@ use std::num::NonZeroUsize;
|
|
7
7
|
|
8
8
|
use magnus::{
|
9
9
|
class, exception, prelude::*, r_hash::ForEach, value::Opaque, IntoValue, Module, RArray, RHash,
|
10
|
-
Ruby,
|
10
|
+
Ruby, TryConvert, Value,
|
11
11
|
};
|
12
12
|
use polars::chunked_array::object::PolarsObjectSafe;
|
13
13
|
use polars::chunked_array::ops::{FillNullLimit, FillNullStrategy};
|
@@ -18,6 +18,7 @@ use polars::io::avro::AvroCompression;
|
|
18
18
|
use polars::prelude::*;
|
19
19
|
use polars::series::ops::NullBehavior;
|
20
20
|
use polars_core::utils::arrow::array::Array;
|
21
|
+
use polars_core::utils::materialize_dyn_int;
|
21
22
|
use polars_utils::total_ord::{TotalEq, TotalHash};
|
22
23
|
use smartstring::alias::String as SmartString;
|
23
24
|
|
@@ -154,7 +155,7 @@ impl IntoValue for Wrap<DataType> {
|
|
154
155
|
let class = pl.const_get::<_, Value>("Float32").unwrap();
|
155
156
|
class.funcall("new", ()).unwrap()
|
156
157
|
}
|
157
|
-
DataType::Float64 => {
|
158
|
+
DataType::Float64 | DataType::Unknown(UnknownKind::Float) => {
|
158
159
|
let class = pl.const_get::<_, Value>("Float64").unwrap();
|
159
160
|
class.funcall("new", ()).unwrap()
|
160
161
|
}
|
@@ -168,7 +169,7 @@ impl IntoValue for Wrap<DataType> {
|
|
168
169
|
let class = pl.const_get::<_, Value>("Boolean").unwrap();
|
169
170
|
class.funcall("new", ()).unwrap()
|
170
171
|
}
|
171
|
-
DataType::String => {
|
172
|
+
DataType::String | DataType::Unknown(UnknownKind::Str) => {
|
172
173
|
let class = pl.const_get::<_, Value>("String").unwrap();
|
173
174
|
class.funcall("new", ()).unwrap()
|
174
175
|
}
|
@@ -217,10 +218,7 @@ impl IntoValue for Wrap<DataType> {
|
|
217
218
|
let class = pl.const_get::<_, Value>("Enum").unwrap();
|
218
219
|
let s = Series::from_arrow("category", categories.to_boxed()).unwrap();
|
219
220
|
let series = to_series(s.into());
|
220
|
-
class
|
221
|
-
.funcall::<_, _, Value>("new", (series,))
|
222
|
-
.unwrap()
|
223
|
-
.into()
|
221
|
+
class.funcall::<_, _, Value>("new", (series,)).unwrap()
|
224
222
|
}
|
225
223
|
DataType::Time => {
|
226
224
|
let class = pl.const_get::<_, Value>("Time").unwrap();
|
@@ -245,7 +243,10 @@ impl IntoValue for Wrap<DataType> {
|
|
245
243
|
let class = pl.const_get::<_, Value>("Null").unwrap();
|
246
244
|
class.funcall("new", ()).unwrap()
|
247
245
|
}
|
248
|
-
DataType::Unknown => {
|
246
|
+
DataType::Unknown(UnknownKind::Int(v)) => {
|
247
|
+
Wrap(materialize_dyn_int(v).dtype()).into_value()
|
248
|
+
}
|
249
|
+
DataType::Unknown(_) => {
|
249
250
|
let class = pl.const_get::<_, Value>("Unknown").unwrap();
|
250
251
|
class.funcall("new", ()).unwrap()
|
251
252
|
}
|
@@ -313,7 +314,7 @@ impl TryConvert for Wrap<DataType> {
|
|
313
314
|
"Polars::Object" => DataType::Object(OBJECT_NAME, None),
|
314
315
|
"Polars::List" => DataType::List(Box::new(DataType::Null)),
|
315
316
|
"Polars::Null" => DataType::Null,
|
316
|
-
"Polars::Unknown" => DataType::Unknown,
|
317
|
+
"Polars::Unknown" => DataType::Unknown(Default::default()),
|
317
318
|
dt => {
|
318
319
|
return Err(RbValueError::new_err(format!(
|
319
320
|
"{dt} is not a correct polars DataType.",
|
@@ -353,7 +354,7 @@ impl TryConvert for Wrap<DataType> {
|
|
353
354
|
"Polars::Float32" => DataType::Float32,
|
354
355
|
"Polars::Float64" => DataType::Float64,
|
355
356
|
"Polars::Null" => DataType::Null,
|
356
|
-
"Polars::Unknown" => DataType::Unknown,
|
357
|
+
"Polars::Unknown" => DataType::Unknown(Default::default()),
|
357
358
|
"Polars::Duration" => {
|
358
359
|
let time_unit: Value = ob.funcall("time_unit", ()).unwrap();
|
359
360
|
let time_unit = Wrap::<TimeUnit>::try_convert(time_unit)?.0;
|
@@ -413,7 +414,7 @@ impl TryConvert for Wrap<DataType> {
|
|
413
414
|
"obj" => DataType::Object(OBJECT_NAME, None),
|
414
415
|
"list" => DataType::List(Box::new(DataType::Boolean)),
|
415
416
|
"null" => DataType::Null,
|
416
|
-
"unk" => DataType::Unknown,
|
417
|
+
"unk" => DataType::Unknown(Default::default()),
|
417
418
|
_ => {
|
418
419
|
return Err(RbValueError::new_err(format!(
|
419
420
|
"{} is not a supported DataType.",
|
@@ -549,57 +550,6 @@ impl Default for ObjectValue {
|
|
549
550
|
}
|
550
551
|
}
|
551
552
|
|
552
|
-
pub(crate) fn dicts_to_rows(
|
553
|
-
records: &Value,
|
554
|
-
infer_schema_len: Option<usize>,
|
555
|
-
schema_columns: PlIndexSet<String>,
|
556
|
-
) -> RbResult<(Vec<Row>, Vec<String>)> {
|
557
|
-
let infer_schema_len = infer_schema_len.map(|n| std::cmp::max(1, n));
|
558
|
-
let (dicts, len) = get_rbseq(*records)?;
|
559
|
-
|
560
|
-
let key_names = {
|
561
|
-
if !schema_columns.is_empty() {
|
562
|
-
schema_columns
|
563
|
-
} else {
|
564
|
-
let mut inferred_keys = PlIndexSet::new();
|
565
|
-
for d in dicts.each().take(infer_schema_len.unwrap_or(usize::MAX)) {
|
566
|
-
let d = d?;
|
567
|
-
let d = RHash::try_convert(d)?;
|
568
|
-
|
569
|
-
d.foreach(|name: Value, _value: Value| {
|
570
|
-
if let Some(v) = Symbol::from_value(name) {
|
571
|
-
inferred_keys.insert(v.name()?.into());
|
572
|
-
} else {
|
573
|
-
inferred_keys.insert(String::try_convert(name)?);
|
574
|
-
};
|
575
|
-
Ok(ForEach::Continue)
|
576
|
-
})?;
|
577
|
-
}
|
578
|
-
inferred_keys
|
579
|
-
}
|
580
|
-
};
|
581
|
-
|
582
|
-
let mut rows = Vec::with_capacity(len);
|
583
|
-
|
584
|
-
for d in dicts.each() {
|
585
|
-
let d = d?;
|
586
|
-
let d = RHash::try_convert(d)?;
|
587
|
-
|
588
|
-
let mut row = Vec::with_capacity(key_names.len());
|
589
|
-
|
590
|
-
for k in key_names.iter() {
|
591
|
-
// TODO improve performance
|
592
|
-
let val = match d.get(k.clone()).or_else(|| d.get(Symbol::new(k))) {
|
593
|
-
None => AnyValue::Null,
|
594
|
-
Some(val) => Wrap::<AnyValue>::try_convert(val)?.0,
|
595
|
-
};
|
596
|
-
row.push(val)
|
597
|
-
}
|
598
|
-
rows.push(Row(row))
|
599
|
-
}
|
600
|
-
Ok((rows, key_names.into_iter().collect()))
|
601
|
-
}
|
602
|
-
|
603
553
|
impl TryConvert for Wrap<AsofStrategy> {
|
604
554
|
fn try_convert(ob: Value) -> RbResult<Self> {
|
605
555
|
let parsed = match String::try_convert(ob)?.as_str() {
|
@@ -736,12 +686,11 @@ impl TryConvert for Wrap<JoinType> {
|
|
736
686
|
let parsed = match String::try_convert(ob)?.as_str() {
|
737
687
|
"inner" => JoinType::Inner,
|
738
688
|
"left" => JoinType::Left,
|
739
|
-
"outer" => JoinType::Outer
|
740
|
-
"outer_coalesce" => JoinType::Outer
|
689
|
+
"outer" => JoinType::Outer,
|
690
|
+
"outer_coalesce" => JoinType::Outer,
|
741
691
|
"semi" => JoinType::Semi,
|
742
692
|
"anti" => JoinType::Anti,
|
743
|
-
|
744
|
-
// "cross" => JoinType::Cross,
|
693
|
+
"cross" => JoinType::Cross,
|
745
694
|
v => {
|
746
695
|
return Err(RbValueError::new_err(format!(
|
747
696
|
"how must be one of {{'inner', 'left', 'outer', 'semi', 'anti', 'cross'}}, got {}",
|
@@ -785,6 +734,21 @@ impl TryConvert for Wrap<ListToStructWidthStrategy> {
|
|
785
734
|
}
|
786
735
|
}
|
787
736
|
|
737
|
+
impl TryConvert for Wrap<NonExistent> {
|
738
|
+
fn try_convert(ob: Value) -> RbResult<Self> {
|
739
|
+
let parsed = match String::try_convert(ob)?.as_str() {
|
740
|
+
"null" => NonExistent::Null,
|
741
|
+
"raise" => NonExistent::Raise,
|
742
|
+
v => {
|
743
|
+
return Err(RbValueError::new_err(format!(
|
744
|
+
"`non_existent` must be one of {{'null', 'raise'}}, got {v}",
|
745
|
+
)))
|
746
|
+
}
|
747
|
+
};
|
748
|
+
Ok(Wrap(parsed))
|
749
|
+
}
|
750
|
+
}
|
751
|
+
|
788
752
|
impl TryConvert for Wrap<NullBehavior> {
|
789
753
|
fn try_convert(ob: Value) -> RbResult<Self> {
|
790
754
|
let parsed = match String::try_convert(ob)?.as_str() {
|
@@ -1066,7 +1030,7 @@ impl TryConvert for Wrap<NonZeroUsize> {
|
|
1066
1030
|
fn try_convert(ob: Value) -> RbResult<Self> {
|
1067
1031
|
let v = usize::try_convert(ob)?;
|
1068
1032
|
NonZeroUsize::new(v)
|
1069
|
-
.map(
|
1033
|
+
.map(Wrap)
|
1070
1034
|
.ok_or(RbValueError::new_err("must be non-zero".into()))
|
1071
1035
|
}
|
1072
1036
|
}
|
@@ -0,0 +1,186 @@
|
|
1
|
+
use magnus::{prelude::*, r_hash::ForEach, RArray, RHash, Symbol, Value};
|
2
|
+
use polars::frame::row::{rows_to_schema_supertypes, rows_to_supertypes, Row};
|
3
|
+
use polars::prelude::*;
|
4
|
+
|
5
|
+
use super::*;
|
6
|
+
use crate::conversion::*;
|
7
|
+
use crate::{RbPolarsErr, RbResult};
|
8
|
+
|
9
|
+
impl RbDataFrame {
|
10
|
+
pub fn from_rows(
|
11
|
+
rb_rows: RArray,
|
12
|
+
infer_schema_length: Option<usize>,
|
13
|
+
schema: Option<Wrap<Schema>>,
|
14
|
+
) -> RbResult<Self> {
|
15
|
+
let mut data = Vec::with_capacity(rb_rows.len());
|
16
|
+
for v in rb_rows.each() {
|
17
|
+
let rb_row = RArray::try_convert(v?)?;
|
18
|
+
let mut row = Vec::with_capacity(rb_row.len());
|
19
|
+
for val in rb_row.each() {
|
20
|
+
row.push(Wrap::<AnyValue>::try_convert(val?)?.0);
|
21
|
+
}
|
22
|
+
data.push(Row(row));
|
23
|
+
}
|
24
|
+
let schema = schema.map(|wrap| wrap.0);
|
25
|
+
finish_from_rows(data, schema, None, infer_schema_length)
|
26
|
+
}
|
27
|
+
|
28
|
+
pub fn from_hashes(
|
29
|
+
data: Value,
|
30
|
+
schema: Option<Wrap<Schema>>,
|
31
|
+
schema_overrides: Option<Wrap<Schema>>,
|
32
|
+
strict: bool,
|
33
|
+
infer_schema_length: Option<usize>,
|
34
|
+
) -> RbResult<Self> {
|
35
|
+
let schema = schema.map(|wrap| wrap.0);
|
36
|
+
let schema_overrides = schema_overrides.map(|wrap| wrap.0);
|
37
|
+
|
38
|
+
let names = get_schema_names(&data, schema.as_ref(), infer_schema_length)?;
|
39
|
+
let rows = dicts_to_rows(&data, &names, strict)?;
|
40
|
+
|
41
|
+
let schema = schema.or_else(|| {
|
42
|
+
Some(columns_names_to_empty_schema(
|
43
|
+
names.iter().map(String::as_str),
|
44
|
+
))
|
45
|
+
});
|
46
|
+
|
47
|
+
finish_from_rows(rows, schema, schema_overrides, infer_schema_length)
|
48
|
+
}
|
49
|
+
}
|
50
|
+
|
51
|
+
fn finish_from_rows(
|
52
|
+
rows: Vec<Row>,
|
53
|
+
schema: Option<Schema>,
|
54
|
+
schema_overrides: Option<Schema>,
|
55
|
+
infer_schema_length: Option<usize>,
|
56
|
+
) -> RbResult<RbDataFrame> {
|
57
|
+
// Object builder must be registered
|
58
|
+
crate::on_startup::register_object_builder();
|
59
|
+
|
60
|
+
let mut schema = if let Some(mut schema) = schema {
|
61
|
+
resolve_schema_overrides(&mut schema, schema_overrides);
|
62
|
+
update_schema_from_rows(&mut schema, &rows, infer_schema_length)?;
|
63
|
+
schema
|
64
|
+
} else {
|
65
|
+
rows_to_schema_supertypes(&rows, infer_schema_length).map_err(RbPolarsErr::from)?
|
66
|
+
};
|
67
|
+
|
68
|
+
// TODO: Remove this step when Decimals are supported properly.
|
69
|
+
// Erasing the decimal precision/scale here will just require us to infer it again later.
|
70
|
+
// https://github.com/pola-rs/polars/issues/14427
|
71
|
+
erase_decimal_precision_scale(&mut schema);
|
72
|
+
|
73
|
+
let df = DataFrame::from_rows_and_schema(&rows, &schema).map_err(RbPolarsErr::from)?;
|
74
|
+
Ok(df.into())
|
75
|
+
}
|
76
|
+
|
77
|
+
fn update_schema_from_rows(
|
78
|
+
schema: &mut Schema,
|
79
|
+
rows: &[Row],
|
80
|
+
infer_schema_length: Option<usize>,
|
81
|
+
) -> RbResult<()> {
|
82
|
+
let schema_is_complete = schema.iter_dtypes().all(|dtype| dtype.is_known());
|
83
|
+
if schema_is_complete {
|
84
|
+
return Ok(());
|
85
|
+
}
|
86
|
+
|
87
|
+
// TODO: Only infer dtypes for columns with an unknown dtype
|
88
|
+
let inferred_dtypes =
|
89
|
+
rows_to_supertypes(rows, infer_schema_length).map_err(RbPolarsErr::from)?;
|
90
|
+
let inferred_dtypes_slice = inferred_dtypes.as_slice();
|
91
|
+
|
92
|
+
for (i, dtype) in schema.iter_dtypes_mut().enumerate() {
|
93
|
+
if !dtype.is_known() {
|
94
|
+
*dtype = inferred_dtypes_slice.get(i).ok_or_else(|| {
|
95
|
+
polars_err!(SchemaMismatch: "the number of columns in the schema does not match the data")
|
96
|
+
})
|
97
|
+
.map_err(RbPolarsErr::from)?
|
98
|
+
.clone();
|
99
|
+
}
|
100
|
+
}
|
101
|
+
Ok(())
|
102
|
+
}
|
103
|
+
|
104
|
+
/// Applies each `(name, dtype)` pair from `schema_overrides` to `schema` via
/// `Schema::set_dtype`; does nothing when no overrides are given.
fn resolve_schema_overrides(schema: &mut Schema, schema_overrides: Option<Schema>) {
    let Some(overrides) = schema_overrides else {
        return;
    };
    overrides.into_iter().for_each(|(name, dtype)| {
        // The return value of `set_dtype` is intentionally ignored.
        schema.set_dtype(name.as_str(), dtype);
    });
}
|
111
|
+
|
112
|
+
fn erase_decimal_precision_scale(schema: &mut Schema) {
|
113
|
+
for dtype in schema.iter_dtypes_mut() {
|
114
|
+
if let DataType::Decimal(_, _) = dtype {
|
115
|
+
*dtype = DataType::Decimal(None, None)
|
116
|
+
}
|
117
|
+
}
|
118
|
+
}
|
119
|
+
|
120
|
+
fn columns_names_to_empty_schema<'a, I>(column_names: I) -> Schema
|
121
|
+
where
|
122
|
+
I: IntoIterator<Item = &'a str>,
|
123
|
+
{
|
124
|
+
let fields = column_names
|
125
|
+
.into_iter()
|
126
|
+
.map(|c| Field::new(c, DataType::Unknown(Default::default())));
|
127
|
+
Schema::from_iter(fields)
|
128
|
+
}
|
129
|
+
|
130
|
+
fn dicts_to_rows<'a>(data: &Value, names: &'a [String], _strict: bool) -> RbResult<Vec<Row<'a>>> {
|
131
|
+
let (data, len) = get_rbseq(*data)?;
|
132
|
+
let mut rows = Vec::with_capacity(len);
|
133
|
+
for d in data.each() {
|
134
|
+
let d = d?;
|
135
|
+
let d = RHash::try_convert(d)?;
|
136
|
+
|
137
|
+
let mut row = Vec::with_capacity(names.len());
|
138
|
+
for k in names.iter() {
|
139
|
+
// TODO improve performance
|
140
|
+
let val = match d.get(k.clone()).or_else(|| d.get(Symbol::new(k))) {
|
141
|
+
None => AnyValue::Null,
|
142
|
+
Some(val) => Wrap::<AnyValue>::try_convert(val)?.0,
|
143
|
+
};
|
144
|
+
row.push(val)
|
145
|
+
}
|
146
|
+
rows.push(Row(row))
|
147
|
+
}
|
148
|
+
Ok(rows)
|
149
|
+
}
|
150
|
+
|
151
|
+
fn get_schema_names(
|
152
|
+
data: &Value,
|
153
|
+
schema: Option<&Schema>,
|
154
|
+
infer_schema_length: Option<usize>,
|
155
|
+
) -> RbResult<Vec<String>> {
|
156
|
+
if let Some(schema) = schema {
|
157
|
+
Ok(schema.iter_names().map(|n| n.to_string()).collect())
|
158
|
+
} else {
|
159
|
+
infer_schema_names_from_data(data, infer_schema_length)
|
160
|
+
}
|
161
|
+
}
|
162
|
+
|
163
|
+
fn infer_schema_names_from_data(
|
164
|
+
data: &Value,
|
165
|
+
infer_schema_length: Option<usize>,
|
166
|
+
) -> RbResult<Vec<String>> {
|
167
|
+
let (data, data_len) = get_rbseq(*data)?;
|
168
|
+
let infer_schema_length = infer_schema_length
|
169
|
+
.map(|n| std::cmp::max(1, n))
|
170
|
+
.unwrap_or(data_len);
|
171
|
+
|
172
|
+
let mut names = PlIndexSet::new();
|
173
|
+
for d in data.each().take(infer_schema_length) {
|
174
|
+
let d = d?;
|
175
|
+
let d = RHash::try_convert(d)?;
|
176
|
+
d.foreach(|name: Value, _value: Value| {
|
177
|
+
if let Some(v) = Symbol::from_value(name) {
|
178
|
+
names.insert(v.name()?.into());
|
179
|
+
} else {
|
180
|
+
names.insert(String::try_convert(name)?);
|
181
|
+
};
|
182
|
+
Ok(ForEach::Continue)
|
183
|
+
})?;
|
184
|
+
}
|
185
|
+
Ok(names.into_iter().collect())
|
186
|
+
}
|
@@ -0,0 +1,48 @@
|
|
1
|
+
use magnus::{prelude::*, IntoValue, RArray, Value};
|
2
|
+
|
3
|
+
use super::*;
|
4
|
+
use crate::conversion::{ObjectValue, Wrap};
|
5
|
+
|
6
|
+
impl RbDataFrame {
|
7
|
+
pub fn row_tuple(&self, idx: i64) -> Value {
|
8
|
+
let idx = if idx < 0 {
|
9
|
+
(self.df.borrow().height() as i64 + idx) as usize
|
10
|
+
} else {
|
11
|
+
idx as usize
|
12
|
+
};
|
13
|
+
RArray::from_iter(
|
14
|
+
self.df
|
15
|
+
.borrow()
|
16
|
+
.get_columns()
|
17
|
+
.iter()
|
18
|
+
.map(|s| match s.dtype() {
|
19
|
+
DataType::Object(_, _) => {
|
20
|
+
let obj: Option<&ObjectValue> = s.get_object(idx).map(|any| any.into());
|
21
|
+
obj.unwrap().to_object()
|
22
|
+
}
|
23
|
+
_ => Wrap(s.get(idx).unwrap()).into_value(),
|
24
|
+
}),
|
25
|
+
)
|
26
|
+
.as_value()
|
27
|
+
}
|
28
|
+
|
29
|
+
pub fn row_tuples(&self) -> Value {
|
30
|
+
let df = &self.df;
|
31
|
+
RArray::from_iter((0..df.borrow().height()).map(|idx| {
|
32
|
+
RArray::from_iter(
|
33
|
+
self.df
|
34
|
+
.borrow()
|
35
|
+
.get_columns()
|
36
|
+
.iter()
|
37
|
+
.map(|s| match s.dtype() {
|
38
|
+
DataType::Object(_, _) => {
|
39
|
+
let obj: Option<&ObjectValue> = s.get_object(idx).map(|any| any.into());
|
40
|
+
obj.unwrap().to_object()
|
41
|
+
}
|
42
|
+
_ => Wrap(s.get(idx).unwrap()).into_value(),
|
43
|
+
}),
|
44
|
+
)
|
45
|
+
}))
|
46
|
+
.as_value()
|
47
|
+
}
|
48
|
+
}
|