polars-df 0.9.0 → 0.11.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (71) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +23 -0
  3. data/Cargo.lock +144 -57
  4. data/README.md +7 -6
  5. data/ext/polars/Cargo.toml +10 -6
  6. data/ext/polars/src/batched_csv.rs +53 -50
  7. data/ext/polars/src/conversion/anyvalue.rs +3 -2
  8. data/ext/polars/src/conversion/mod.rs +31 -67
  9. data/ext/polars/src/dataframe/construction.rs +186 -0
  10. data/ext/polars/src/dataframe/export.rs +48 -0
  11. data/ext/polars/src/dataframe/general.rs +607 -0
  12. data/ext/polars/src/dataframe/io.rs +463 -0
  13. data/ext/polars/src/dataframe/mod.rs +26 -0
  14. data/ext/polars/src/expr/array.rs +6 -2
  15. data/ext/polars/src/expr/datetime.rs +13 -4
  16. data/ext/polars/src/expr/general.rs +50 -9
  17. data/ext/polars/src/expr/list.rs +6 -2
  18. data/ext/polars/src/expr/rolling.rs +185 -69
  19. data/ext/polars/src/expr/string.rs +12 -33
  20. data/ext/polars/src/file.rs +158 -11
  21. data/ext/polars/src/functions/lazy.rs +20 -3
  22. data/ext/polars/src/functions/range.rs +74 -0
  23. data/ext/polars/src/functions/whenthen.rs +47 -17
  24. data/ext/polars/src/interop/mod.rs +1 -0
  25. data/ext/polars/src/interop/numo/mod.rs +2 -0
  26. data/ext/polars/src/interop/numo/to_numo_df.rs +23 -0
  27. data/ext/polars/src/interop/numo/to_numo_series.rs +60 -0
  28. data/ext/polars/src/lazyframe/mod.rs +111 -56
  29. data/ext/polars/src/lib.rs +68 -34
  30. data/ext/polars/src/map/dataframe.rs +17 -9
  31. data/ext/polars/src/map/lazy.rs +5 -25
  32. data/ext/polars/src/map/series.rs +7 -1
  33. data/ext/polars/src/series/aggregation.rs +47 -30
  34. data/ext/polars/src/series/export.rs +131 -49
  35. data/ext/polars/src/series/mod.rs +13 -133
  36. data/lib/polars/array_expr.rb +6 -2
  37. data/lib/polars/batched_csv_reader.rb +11 -3
  38. data/lib/polars/convert.rb +6 -1
  39. data/lib/polars/data_frame.rb +225 -370
  40. data/lib/polars/date_time_expr.rb +11 -4
  41. data/lib/polars/date_time_name_space.rb +14 -4
  42. data/lib/polars/dynamic_group_by.rb +2 -2
  43. data/lib/polars/exceptions.rb +4 -0
  44. data/lib/polars/expr.rb +1171 -54
  45. data/lib/polars/functions/lazy.rb +3 -3
  46. data/lib/polars/functions/range/date_range.rb +92 -0
  47. data/lib/polars/functions/range/datetime_range.rb +149 -0
  48. data/lib/polars/functions/range/time_range.rb +141 -0
  49. data/lib/polars/functions/whenthen.rb +74 -5
  50. data/lib/polars/group_by.rb +88 -23
  51. data/lib/polars/io/avro.rb +24 -0
  52. data/lib/polars/{io.rb → io/csv.rb} +307 -489
  53. data/lib/polars/io/database.rb +73 -0
  54. data/lib/polars/io/ipc.rb +247 -0
  55. data/lib/polars/io/json.rb +18 -0
  56. data/lib/polars/io/ndjson.rb +69 -0
  57. data/lib/polars/io/parquet.rb +226 -0
  58. data/lib/polars/lazy_frame.rb +55 -195
  59. data/lib/polars/lazy_group_by.rb +100 -3
  60. data/lib/polars/list_expr.rb +6 -2
  61. data/lib/polars/rolling_group_by.rb +2 -2
  62. data/lib/polars/series.rb +14 -12
  63. data/lib/polars/string_expr.rb +38 -36
  64. data/lib/polars/utils.rb +89 -1
  65. data/lib/polars/version.rb +1 -1
  66. data/lib/polars/whenthen.rb +83 -0
  67. data/lib/polars.rb +10 -3
  68. metadata +23 -8
  69. data/ext/polars/src/dataframe.rs +0 -1182
  70. data/lib/polars/when.rb +0 -16
  71. data/lib/polars/when_then.rb +0 -19
@@ -1,23 +1,19 @@
1
+ use std::cell::RefCell;
2
+ use std::path::PathBuf;
3
+ use std::sync::Mutex;
4
+
1
5
  use magnus::{prelude::*, RArray, Value};
6
+ use polars::io::csv::read::OwnedBatchedCsvReader;
2
7
  use polars::io::mmap::MmapBytesReader;
3
8
  use polars::io::RowIndex;
4
- use polars::prelude::read_impl::OwnedBatchedCsvReader;
5
9
  use polars::prelude::*;
6
- use std::cell::RefCell;
7
- use std::path::PathBuf;
8
10
 
9
11
  use crate::conversion::*;
10
- use crate::prelude::read_impl::OwnedBatchedCsvReaderMmap;
11
12
  use crate::{RbDataFrame, RbPolarsErr, RbResult};
12
13
 
13
- pub enum BatchedReader {
14
- MMap(OwnedBatchedCsvReaderMmap),
15
- Read(OwnedBatchedCsvReader),
16
- }
17
-
18
14
  #[magnus::wrap(class = "Polars::RbBatchedCsv")]
19
15
  pub struct RbBatchedCsv {
20
- pub reader: RefCell<BatchedReader>,
16
+ pub reader: RefCell<Mutex<OwnedBatchedCsvReader>>,
21
17
  }
22
18
 
23
19
  impl RbBatchedCsv {
@@ -44,18 +40,23 @@ impl RbBatchedCsv {
44
40
  let comment_prefix = Option::<String>::try_convert(arguments[16])?;
45
41
  let quote_char = Option::<String>::try_convert(arguments[17])?;
46
42
  let null_values = Option::<Wrap<NullValues>>::try_convert(arguments[18])?;
47
- let try_parse_dates = bool::try_convert(arguments[19])?;
48
- let skip_rows_after_header = usize::try_convert(arguments[20])?;
49
- let row_index = Option::<(String, IdxSize)>::try_convert(arguments[21])?;
50
- let sample_size = usize::try_convert(arguments[22])?;
51
- let eol_char = String::try_convert(arguments[23])?;
43
+ let missing_utf8_is_empty_string = bool::try_convert(arguments[19])?;
44
+ let try_parse_dates = bool::try_convert(arguments[20])?;
45
+ let skip_rows_after_header = usize::try_convert(arguments[21])?;
46
+ let row_index = Option::<(String, IdxSize)>::try_convert(arguments[22])?;
47
+ let sample_size = usize::try_convert(arguments[23])?;
48
+ let eol_char = String::try_convert(arguments[24])?;
49
+ let raise_if_empty = bool::try_convert(arguments[25])?;
50
+ let truncate_ragged_lines = bool::try_convert(arguments[26])?;
51
+ let decimal_comma = bool::try_convert(arguments[27])?;
52
52
  // end arguments
53
53
 
54
54
  let null_values = null_values.map(|w| w.0);
55
55
  let eol_char = eol_char.as_bytes()[0];
56
-
57
- let row_index = row_index.map(|(name, offset)| RowIndex { name, offset });
58
-
56
+ let row_index = row_index.map(|(name, offset)| RowIndex {
57
+ name: Arc::from(name.as_str()),
58
+ offset,
59
+ });
59
60
  let quote_char = if let Some(s) = quote_char {
60
61
  if s.is_empty() {
61
62
  None
@@ -85,53 +86,55 @@ impl RbBatchedCsv {
85
86
 
86
87
  let file = std::fs::File::open(path).map_err(RbPolarsErr::io)?;
87
88
  let reader = Box::new(file) as Box<dyn MmapBytesReader>;
88
- let reader = CsvReader::new(reader)
89
- .infer_schema(infer_schema_length)
90
- .has_header(has_header)
89
+ let reader = CsvReadOptions::default()
90
+ .with_infer_schema_length(infer_schema_length)
91
+ .with_has_header(has_header)
91
92
  .with_n_rows(n_rows)
92
- .with_separator(separator.as_bytes()[0])
93
93
  .with_skip_rows(skip_rows)
94
94
  .with_ignore_errors(ignore_errors)
95
- .with_projection(projection)
95
+ .with_projection(projection.map(Arc::new))
96
96
  .with_rechunk(rechunk)
97
97
  .with_chunk_size(chunk_size)
98
- .with_encoding(encoding.0)
99
- .with_columns(columns)
98
+ .with_columns(columns.map(Arc::new))
100
99
  .with_n_threads(n_threads)
101
- .with_dtypes_slice(overwrite_dtype_slice.as_deref())
102
- .low_memory(low_memory)
103
- .with_comment_prefix(comment_prefix.as_deref())
104
- .with_null_values(null_values)
105
- .with_try_parse_dates(try_parse_dates)
106
- .with_quote_char(quote_char)
107
- .with_end_of_line_char(eol_char)
100
+ .with_dtype_overwrite(overwrite_dtype_slice.map(Arc::new))
101
+ .with_low_memory(low_memory)
108
102
  .with_skip_rows_after_header(skip_rows_after_header)
109
103
  .with_row_index(row_index)
110
- .sample_size(sample_size);
104
+ .with_sample_size(sample_size)
105
+ .with_raise_if_empty(raise_if_empty)
106
+ .with_parse_options(
107
+ CsvParseOptions::default()
108
+ .with_separator(separator.as_bytes()[0])
109
+ .with_encoding(encoding.0)
110
+ .with_missing_is_null(!missing_utf8_is_empty_string)
111
+ .with_comment_prefix(comment_prefix.as_deref())
112
+ .with_null_values(null_values)
113
+ .with_try_parse_dates(try_parse_dates)
114
+ .with_quote_char(quote_char)
115
+ .with_eol_char(eol_char)
116
+ .with_truncate_ragged_lines(truncate_ragged_lines)
117
+ .with_decimal_comma(decimal_comma),
118
+ )
119
+ .into_reader_with_file_handle(reader);
111
120
 
112
- let reader = if low_memory {
113
- let reader = reader
114
- .batched_read(overwrite_dtype.map(Arc::new))
115
- .map_err(RbPolarsErr::from)?;
116
- BatchedReader::Read(reader)
117
- } else {
118
- let reader = reader
119
- .batched_mmap(overwrite_dtype.map(Arc::new))
120
- .map_err(RbPolarsErr::from)?;
121
- BatchedReader::MMap(reader)
122
- };
121
+ let reader = reader
122
+ .batched(overwrite_dtype.map(Arc::new))
123
+ .map_err(RbPolarsErr::from)?;
123
124
 
124
125
  Ok(RbBatchedCsv {
125
- reader: RefCell::new(reader),
126
+ reader: RefCell::new(Mutex::new(reader)),
126
127
  })
127
128
  }
128
129
 
129
130
  pub fn next_batches(&self, n: usize) -> RbResult<Option<RArray>> {
130
- let batches = match &mut *self.reader.borrow_mut() {
131
- BatchedReader::MMap(reader) => reader.next_batches(n),
132
- BatchedReader::Read(reader) => reader.next_batches(n),
133
- }
134
- .map_err(RbPolarsErr::from)?;
131
+ let reader = &self.reader;
132
+ let batches = reader
133
+ .borrow()
134
+ .lock()
135
+ .map_err(|e| RbPolarsErr::other(e.to_string()))?
136
+ .next_batches(n)
137
+ .map_err(RbPolarsErr::from)?;
135
138
 
136
139
  Ok(batches.map(|batches| RArray::from_iter(batches.into_iter().map(RbDataFrame::from))))
137
140
  }
@@ -3,8 +3,8 @@ use magnus::{
3
3
  class, prelude::*, r_hash::ForEach, Float, Integer, IntoValue, RArray, RHash, RString, Ruby,
4
4
  TryConvert, Value,
5
5
  };
6
- use polars::frame::row::any_values_to_dtype;
7
6
  use polars::prelude::*;
7
+ use polars_core::utils::any_values_to_supertype_and_n_dtypes;
8
8
 
9
9
  use super::{struct_dict, ObjectValue, Wrap};
10
10
 
@@ -120,7 +120,8 @@ impl<'s> TryConvert for Wrap<AnyValue<'s>> {
120
120
  avs.push(Wrap::<AnyValue>::try_convert(item?)?.0)
121
121
  }
122
122
 
123
- let (dtype, _n_types) = any_values_to_dtype(&avs).map_err(RbPolarsErr::from)?;
123
+ let (dtype, _n_types) =
124
+ any_values_to_supertype_and_n_dtypes(&avs).map_err(RbPolarsErr::from)?;
124
125
 
125
126
  // push the rest
126
127
  avs.reserve(list.len());
@@ -7,7 +7,7 @@ use std::num::NonZeroUsize;
7
7
 
8
8
  use magnus::{
9
9
  class, exception, prelude::*, r_hash::ForEach, value::Opaque, IntoValue, Module, RArray, RHash,
10
- Ruby, Symbol, TryConvert, Value,
10
+ Ruby, TryConvert, Value,
11
11
  };
12
12
  use polars::chunked_array::object::PolarsObjectSafe;
13
13
  use polars::chunked_array::ops::{FillNullLimit, FillNullStrategy};
@@ -18,6 +18,7 @@ use polars::io::avro::AvroCompression;
18
18
  use polars::prelude::*;
19
19
  use polars::series::ops::NullBehavior;
20
20
  use polars_core::utils::arrow::array::Array;
21
+ use polars_core::utils::materialize_dyn_int;
21
22
  use polars_utils::total_ord::{TotalEq, TotalHash};
22
23
  use smartstring::alias::String as SmartString;
23
24
 
@@ -154,7 +155,7 @@ impl IntoValue for Wrap<DataType> {
154
155
  let class = pl.const_get::<_, Value>("Float32").unwrap();
155
156
  class.funcall("new", ()).unwrap()
156
157
  }
157
- DataType::Float64 => {
158
+ DataType::Float64 | DataType::Unknown(UnknownKind::Float) => {
158
159
  let class = pl.const_get::<_, Value>("Float64").unwrap();
159
160
  class.funcall("new", ()).unwrap()
160
161
  }
@@ -168,7 +169,7 @@ impl IntoValue for Wrap<DataType> {
168
169
  let class = pl.const_get::<_, Value>("Boolean").unwrap();
169
170
  class.funcall("new", ()).unwrap()
170
171
  }
171
- DataType::String => {
172
+ DataType::String | DataType::Unknown(UnknownKind::Str) => {
172
173
  let class = pl.const_get::<_, Value>("String").unwrap();
173
174
  class.funcall("new", ()).unwrap()
174
175
  }
@@ -217,10 +218,7 @@ impl IntoValue for Wrap<DataType> {
217
218
  let class = pl.const_get::<_, Value>("Enum").unwrap();
218
219
  let s = Series::from_arrow("category", categories.to_boxed()).unwrap();
219
220
  let series = to_series(s.into());
220
- class
221
- .funcall::<_, _, Value>("new", (series,))
222
- .unwrap()
223
- .into()
221
+ class.funcall::<_, _, Value>("new", (series,)).unwrap()
224
222
  }
225
223
  DataType::Time => {
226
224
  let class = pl.const_get::<_, Value>("Time").unwrap();
@@ -245,7 +243,10 @@ impl IntoValue for Wrap<DataType> {
245
243
  let class = pl.const_get::<_, Value>("Null").unwrap();
246
244
  class.funcall("new", ()).unwrap()
247
245
  }
248
- DataType::Unknown => {
246
+ DataType::Unknown(UnknownKind::Int(v)) => {
247
+ Wrap(materialize_dyn_int(v).dtype()).into_value()
248
+ }
249
+ DataType::Unknown(_) => {
249
250
  let class = pl.const_get::<_, Value>("Unknown").unwrap();
250
251
  class.funcall("new", ()).unwrap()
251
252
  }
@@ -313,7 +314,7 @@ impl TryConvert for Wrap<DataType> {
313
314
  "Polars::Object" => DataType::Object(OBJECT_NAME, None),
314
315
  "Polars::List" => DataType::List(Box::new(DataType::Null)),
315
316
  "Polars::Null" => DataType::Null,
316
- "Polars::Unknown" => DataType::Unknown,
317
+ "Polars::Unknown" => DataType::Unknown(Default::default()),
317
318
  dt => {
318
319
  return Err(RbValueError::new_err(format!(
319
320
  "{dt} is not a correct polars DataType.",
@@ -353,7 +354,7 @@ impl TryConvert for Wrap<DataType> {
353
354
  "Polars::Float32" => DataType::Float32,
354
355
  "Polars::Float64" => DataType::Float64,
355
356
  "Polars::Null" => DataType::Null,
356
- "Polars::Unknown" => DataType::Unknown,
357
+ "Polars::Unknown" => DataType::Unknown(Default::default()),
357
358
  "Polars::Duration" => {
358
359
  let time_unit: Value = ob.funcall("time_unit", ()).unwrap();
359
360
  let time_unit = Wrap::<TimeUnit>::try_convert(time_unit)?.0;
@@ -413,7 +414,7 @@ impl TryConvert for Wrap<DataType> {
413
414
  "obj" => DataType::Object(OBJECT_NAME, None),
414
415
  "list" => DataType::List(Box::new(DataType::Boolean)),
415
416
  "null" => DataType::Null,
416
- "unk" => DataType::Unknown,
417
+ "unk" => DataType::Unknown(Default::default()),
417
418
  _ => {
418
419
  return Err(RbValueError::new_err(format!(
419
420
  "{} is not a supported DataType.",
@@ -549,57 +550,6 @@ impl Default for ObjectValue {
549
550
  }
550
551
  }
551
552
 
552
- pub(crate) fn dicts_to_rows(
553
- records: &Value,
554
- infer_schema_len: Option<usize>,
555
- schema_columns: PlIndexSet<String>,
556
- ) -> RbResult<(Vec<Row>, Vec<String>)> {
557
- let infer_schema_len = infer_schema_len.map(|n| std::cmp::max(1, n));
558
- let (dicts, len) = get_rbseq(*records)?;
559
-
560
- let key_names = {
561
- if !schema_columns.is_empty() {
562
- schema_columns
563
- } else {
564
- let mut inferred_keys = PlIndexSet::new();
565
- for d in dicts.each().take(infer_schema_len.unwrap_or(usize::MAX)) {
566
- let d = d?;
567
- let d = RHash::try_convert(d)?;
568
-
569
- d.foreach(|name: Value, _value: Value| {
570
- if let Some(v) = Symbol::from_value(name) {
571
- inferred_keys.insert(v.name()?.into());
572
- } else {
573
- inferred_keys.insert(String::try_convert(name)?);
574
- };
575
- Ok(ForEach::Continue)
576
- })?;
577
- }
578
- inferred_keys
579
- }
580
- };
581
-
582
- let mut rows = Vec::with_capacity(len);
583
-
584
- for d in dicts.each() {
585
- let d = d?;
586
- let d = RHash::try_convert(d)?;
587
-
588
- let mut row = Vec::with_capacity(key_names.len());
589
-
590
- for k in key_names.iter() {
591
- // TODO improve performance
592
- let val = match d.get(k.clone()).or_else(|| d.get(Symbol::new(k))) {
593
- None => AnyValue::Null,
594
- Some(val) => Wrap::<AnyValue>::try_convert(val)?.0,
595
- };
596
- row.push(val)
597
- }
598
- rows.push(Row(row))
599
- }
600
- Ok((rows, key_names.into_iter().collect()))
601
- }
602
-
603
553
  impl TryConvert for Wrap<AsofStrategy> {
604
554
  fn try_convert(ob: Value) -> RbResult<Self> {
605
555
  let parsed = match String::try_convert(ob)?.as_str() {
@@ -736,12 +686,11 @@ impl TryConvert for Wrap<JoinType> {
736
686
  let parsed = match String::try_convert(ob)?.as_str() {
737
687
  "inner" => JoinType::Inner,
738
688
  "left" => JoinType::Left,
739
- "outer" => JoinType::Outer { coalesce: false },
740
- "outer_coalesce" => JoinType::Outer { coalesce: true },
689
+ "outer" => JoinType::Outer,
690
+ "outer_coalesce" => JoinType::Outer,
741
691
  "semi" => JoinType::Semi,
742
692
  "anti" => JoinType::Anti,
743
- // #[cfg(feature = "cross_join")]
744
- // "cross" => JoinType::Cross,
693
+ "cross" => JoinType::Cross,
745
694
  v => {
746
695
  return Err(RbValueError::new_err(format!(
747
696
  "how must be one of {{'inner', 'left', 'outer', 'semi', 'anti', 'cross'}}, got {}",
@@ -785,6 +734,21 @@ impl TryConvert for Wrap<ListToStructWidthStrategy> {
785
734
  }
786
735
  }
787
736
 
737
+ impl TryConvert for Wrap<NonExistent> {
738
+ fn try_convert(ob: Value) -> RbResult<Self> {
739
+ let parsed = match String::try_convert(ob)?.as_str() {
740
+ "null" => NonExistent::Null,
741
+ "raise" => NonExistent::Raise,
742
+ v => {
743
+ return Err(RbValueError::new_err(format!(
744
+ "`non_existent` must be one of {{'null', 'raise'}}, got {v}",
745
+ )))
746
+ }
747
+ };
748
+ Ok(Wrap(parsed))
749
+ }
750
+ }
751
+
788
752
  impl TryConvert for Wrap<NullBehavior> {
789
753
  fn try_convert(ob: Value) -> RbResult<Self> {
790
754
  let parsed = match String::try_convert(ob)?.as_str() {
@@ -1066,7 +1030,7 @@ impl TryConvert for Wrap<NonZeroUsize> {
1066
1030
  fn try_convert(ob: Value) -> RbResult<Self> {
1067
1031
  let v = usize::try_convert(ob)?;
1068
1032
  NonZeroUsize::new(v)
1069
- .map(|v| Wrap(v))
1033
+ .map(Wrap)
1070
1034
  .ok_or(RbValueError::new_err("must be non-zero".into()))
1071
1035
  }
1072
1036
  }
@@ -0,0 +1,186 @@
1
+ use magnus::{prelude::*, r_hash::ForEach, RArray, RHash, Symbol, Value};
2
+ use polars::frame::row::{rows_to_schema_supertypes, rows_to_supertypes, Row};
3
+ use polars::prelude::*;
4
+
5
+ use super::*;
6
+ use crate::conversion::*;
7
+ use crate::{RbPolarsErr, RbResult};
8
+
9
+ impl RbDataFrame {
10
+ pub fn from_rows(
11
+ rb_rows: RArray,
12
+ infer_schema_length: Option<usize>,
13
+ schema: Option<Wrap<Schema>>,
14
+ ) -> RbResult<Self> {
15
+ let mut data = Vec::with_capacity(rb_rows.len());
16
+ for v in rb_rows.each() {
17
+ let rb_row = RArray::try_convert(v?)?;
18
+ let mut row = Vec::with_capacity(rb_row.len());
19
+ for val in rb_row.each() {
20
+ row.push(Wrap::<AnyValue>::try_convert(val?)?.0);
21
+ }
22
+ data.push(Row(row));
23
+ }
24
+ let schema = schema.map(|wrap| wrap.0);
25
+ finish_from_rows(data, schema, None, infer_schema_length)
26
+ }
27
+
28
+ pub fn from_hashes(
29
+ data: Value,
30
+ schema: Option<Wrap<Schema>>,
31
+ schema_overrides: Option<Wrap<Schema>>,
32
+ strict: bool,
33
+ infer_schema_length: Option<usize>,
34
+ ) -> RbResult<Self> {
35
+ let schema = schema.map(|wrap| wrap.0);
36
+ let schema_overrides = schema_overrides.map(|wrap| wrap.0);
37
+
38
+ let names = get_schema_names(&data, schema.as_ref(), infer_schema_length)?;
39
+ let rows = dicts_to_rows(&data, &names, strict)?;
40
+
41
+ let schema = schema.or_else(|| {
42
+ Some(columns_names_to_empty_schema(
43
+ names.iter().map(String::as_str),
44
+ ))
45
+ });
46
+
47
+ finish_from_rows(rows, schema, schema_overrides, infer_schema_length)
48
+ }
49
+ }
50
+
51
+ fn finish_from_rows(
52
+ rows: Vec<Row>,
53
+ schema: Option<Schema>,
54
+ schema_overrides: Option<Schema>,
55
+ infer_schema_length: Option<usize>,
56
+ ) -> RbResult<RbDataFrame> {
57
+ // Object builder must be registered
58
+ crate::on_startup::register_object_builder();
59
+
60
+ let mut schema = if let Some(mut schema) = schema {
61
+ resolve_schema_overrides(&mut schema, schema_overrides);
62
+ update_schema_from_rows(&mut schema, &rows, infer_schema_length)?;
63
+ schema
64
+ } else {
65
+ rows_to_schema_supertypes(&rows, infer_schema_length).map_err(RbPolarsErr::from)?
66
+ };
67
+
68
+ // TODO: Remove this step when Decimals are supported properly.
69
+ // Erasing the decimal precision/scale here will just require us to infer it again later.
70
+ // https://github.com/pola-rs/polars/issues/14427
71
+ erase_decimal_precision_scale(&mut schema);
72
+
73
+ let df = DataFrame::from_rows_and_schema(&rows, &schema).map_err(RbPolarsErr::from)?;
74
+ Ok(df.into())
75
+ }
76
+
77
+ fn update_schema_from_rows(
78
+ schema: &mut Schema,
79
+ rows: &[Row],
80
+ infer_schema_length: Option<usize>,
81
+ ) -> RbResult<()> {
82
+ let schema_is_complete = schema.iter_dtypes().all(|dtype| dtype.is_known());
83
+ if schema_is_complete {
84
+ return Ok(());
85
+ }
86
+
87
+ // TODO: Only infer dtypes for columns with an unknown dtype
88
+ let inferred_dtypes =
89
+ rows_to_supertypes(rows, infer_schema_length).map_err(RbPolarsErr::from)?;
90
+ let inferred_dtypes_slice = inferred_dtypes.as_slice();
91
+
92
+ for (i, dtype) in schema.iter_dtypes_mut().enumerate() {
93
+ if !dtype.is_known() {
94
+ *dtype = inferred_dtypes_slice.get(i).ok_or_else(|| {
95
+ polars_err!(SchemaMismatch: "the number of columns in the schema does not match the data")
96
+ })
97
+ .map_err(RbPolarsErr::from)?
98
+ .clone();
99
+ }
100
+ }
101
+ Ok(())
102
+ }
103
+
104
+ fn resolve_schema_overrides(schema: &mut Schema, schema_overrides: Option<Schema>) {
105
+ if let Some(overrides) = schema_overrides {
106
+ for (name, dtype) in overrides.into_iter() {
107
+ schema.set_dtype(name.as_str(), dtype);
108
+ }
109
+ }
110
+ }
111
+
112
+ fn erase_decimal_precision_scale(schema: &mut Schema) {
113
+ for dtype in schema.iter_dtypes_mut() {
114
+ if let DataType::Decimal(_, _) = dtype {
115
+ *dtype = DataType::Decimal(None, None)
116
+ }
117
+ }
118
+ }
119
+
120
+ fn columns_names_to_empty_schema<'a, I>(column_names: I) -> Schema
121
+ where
122
+ I: IntoIterator<Item = &'a str>,
123
+ {
124
+ let fields = column_names
125
+ .into_iter()
126
+ .map(|c| Field::new(c, DataType::Unknown(Default::default())));
127
+ Schema::from_iter(fields)
128
+ }
129
+
130
+ fn dicts_to_rows<'a>(data: &Value, names: &'a [String], _strict: bool) -> RbResult<Vec<Row<'a>>> {
131
+ let (data, len) = get_rbseq(*data)?;
132
+ let mut rows = Vec::with_capacity(len);
133
+ for d in data.each() {
134
+ let d = d?;
135
+ let d = RHash::try_convert(d)?;
136
+
137
+ let mut row = Vec::with_capacity(names.len());
138
+ for k in names.iter() {
139
+ // TODO improve performance
140
+ let val = match d.get(k.clone()).or_else(|| d.get(Symbol::new(k))) {
141
+ None => AnyValue::Null,
142
+ Some(val) => Wrap::<AnyValue>::try_convert(val)?.0,
143
+ };
144
+ row.push(val)
145
+ }
146
+ rows.push(Row(row))
147
+ }
148
+ Ok(rows)
149
+ }
150
+
151
+ fn get_schema_names(
152
+ data: &Value,
153
+ schema: Option<&Schema>,
154
+ infer_schema_length: Option<usize>,
155
+ ) -> RbResult<Vec<String>> {
156
+ if let Some(schema) = schema {
157
+ Ok(schema.iter_names().map(|n| n.to_string()).collect())
158
+ } else {
159
+ infer_schema_names_from_data(data, infer_schema_length)
160
+ }
161
+ }
162
+
163
+ fn infer_schema_names_from_data(
164
+ data: &Value,
165
+ infer_schema_length: Option<usize>,
166
+ ) -> RbResult<Vec<String>> {
167
+ let (data, data_len) = get_rbseq(*data)?;
168
+ let infer_schema_length = infer_schema_length
169
+ .map(|n| std::cmp::max(1, n))
170
+ .unwrap_or(data_len);
171
+
172
+ let mut names = PlIndexSet::new();
173
+ for d in data.each().take(infer_schema_length) {
174
+ let d = d?;
175
+ let d = RHash::try_convert(d)?;
176
+ d.foreach(|name: Value, _value: Value| {
177
+ if let Some(v) = Symbol::from_value(name) {
178
+ names.insert(v.name()?.into());
179
+ } else {
180
+ names.insert(String::try_convert(name)?);
181
+ };
182
+ Ok(ForEach::Continue)
183
+ })?;
184
+ }
185
+ Ok(names.into_iter().collect())
186
+ }
@@ -0,0 +1,48 @@
1
+ use magnus::{prelude::*, IntoValue, RArray, Value};
2
+
3
+ use super::*;
4
+ use crate::conversion::{ObjectValue, Wrap};
5
+
6
+ impl RbDataFrame {
7
+ pub fn row_tuple(&self, idx: i64) -> Value {
8
+ let idx = if idx < 0 {
9
+ (self.df.borrow().height() as i64 + idx) as usize
10
+ } else {
11
+ idx as usize
12
+ };
13
+ RArray::from_iter(
14
+ self.df
15
+ .borrow()
16
+ .get_columns()
17
+ .iter()
18
+ .map(|s| match s.dtype() {
19
+ DataType::Object(_, _) => {
20
+ let obj: Option<&ObjectValue> = s.get_object(idx).map(|any| any.into());
21
+ obj.unwrap().to_object()
22
+ }
23
+ _ => Wrap(s.get(idx).unwrap()).into_value(),
24
+ }),
25
+ )
26
+ .as_value()
27
+ }
28
+
29
+ pub fn row_tuples(&self) -> Value {
30
+ let df = &self.df;
31
+ RArray::from_iter((0..df.borrow().height()).map(|idx| {
32
+ RArray::from_iter(
33
+ self.df
34
+ .borrow()
35
+ .get_columns()
36
+ .iter()
37
+ .map(|s| match s.dtype() {
38
+ DataType::Object(_, _) => {
39
+ let obj: Option<&ObjectValue> = s.get_object(idx).map(|any| any.into());
40
+ obj.unwrap().to_object()
41
+ }
42
+ _ => Wrap(s.get(idx).unwrap()).into_value(),
43
+ }),
44
+ )
45
+ }))
46
+ .as_value()
47
+ }
48
+ }