polars-df 0.10.0 → 0.11.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (58)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +11 -0
  3. data/Cargo.lock +90 -48
  4. data/README.md +6 -6
  5. data/ext/polars/Cargo.toml +7 -5
  6. data/ext/polars/src/batched_csv.rs +53 -52
  7. data/ext/polars/src/conversion/mod.rs +13 -60
  8. data/ext/polars/src/dataframe/construction.rs +186 -0
  9. data/ext/polars/src/dataframe/export.rs +48 -0
  10. data/ext/polars/src/dataframe/general.rs +607 -0
  11. data/ext/polars/src/dataframe/io.rs +463 -0
  12. data/ext/polars/src/dataframe/mod.rs +26 -0
  13. data/ext/polars/src/expr/datetime.rs +6 -2
  14. data/ext/polars/src/expr/general.rs +28 -6
  15. data/ext/polars/src/expr/rolling.rs +185 -69
  16. data/ext/polars/src/expr/string.rs +9 -30
  17. data/ext/polars/src/functions/lazy.rs +2 -0
  18. data/ext/polars/src/functions/range.rs +74 -0
  19. data/ext/polars/src/interop/mod.rs +1 -0
  20. data/ext/polars/src/interop/numo/mod.rs +2 -0
  21. data/ext/polars/src/interop/numo/to_numo_df.rs +23 -0
  22. data/ext/polars/src/interop/numo/to_numo_series.rs +60 -0
  23. data/ext/polars/src/lazyframe/mod.rs +54 -38
  24. data/ext/polars/src/lib.rs +46 -21
  25. data/ext/polars/src/map/lazy.rs +5 -25
  26. data/ext/polars/src/map/series.rs +7 -1
  27. data/ext/polars/src/series/aggregation.rs +47 -30
  28. data/ext/polars/src/series/export.rs +131 -49
  29. data/ext/polars/src/series/mod.rs +1 -131
  30. data/lib/polars/batched_csv_reader.rb +9 -3
  31. data/lib/polars/convert.rb +6 -1
  32. data/lib/polars/data_frame.rb +83 -302
  33. data/lib/polars/date_time_expr.rb +1 -0
  34. data/lib/polars/date_time_name_space.rb +5 -1
  35. data/lib/polars/dynamic_group_by.rb +2 -2
  36. data/lib/polars/exceptions.rb +4 -0
  37. data/lib/polars/expr.rb +1134 -20
  38. data/lib/polars/functions/range/date_range.rb +92 -0
  39. data/lib/polars/functions/range/datetime_range.rb +149 -0
  40. data/lib/polars/functions/range/time_range.rb +141 -0
  41. data/lib/polars/group_by.rb +88 -23
  42. data/lib/polars/io/avro.rb +24 -0
  43. data/lib/polars/{io.rb → io/csv.rb} +296 -490
  44. data/lib/polars/io/database.rb +73 -0
  45. data/lib/polars/io/ipc.rb +247 -0
  46. data/lib/polars/io/json.rb +18 -0
  47. data/lib/polars/io/ndjson.rb +69 -0
  48. data/lib/polars/io/parquet.rb +226 -0
  49. data/lib/polars/lazy_frame.rb +23 -166
  50. data/lib/polars/lazy_group_by.rb +100 -3
  51. data/lib/polars/rolling_group_by.rb +2 -2
  52. data/lib/polars/series.rb +2 -2
  53. data/lib/polars/string_expr.rb +37 -36
  54. data/lib/polars/utils.rb +35 -1
  55. data/lib/polars/version.rb +1 -1
  56. data/lib/polars.rb +9 -1
  57. metadata +21 -5
  58. data/ext/polars/src/dataframe.rs +0 -1208
@@ -7,7 +7,7 @@ use std::num::NonZeroUsize;
7
7
 
8
8
  use magnus::{
9
9
  class, exception, prelude::*, r_hash::ForEach, value::Opaque, IntoValue, Module, RArray, RHash,
10
- Ruby, Symbol, TryConvert, Value,
10
+ Ruby, TryConvert, Value,
11
11
  };
12
12
  use polars::chunked_array::object::PolarsObjectSafe;
13
13
  use polars::chunked_array::ops::{FillNullLimit, FillNullStrategy};
@@ -18,6 +18,7 @@ use polars::io::avro::AvroCompression;
18
18
  use polars::prelude::*;
19
19
  use polars::series::ops::NullBehavior;
20
20
  use polars_core::utils::arrow::array::Array;
21
+ use polars_core::utils::materialize_dyn_int;
21
22
  use polars_utils::total_ord::{TotalEq, TotalHash};
22
23
  use smartstring::alias::String as SmartString;
23
24
 
@@ -154,7 +155,7 @@ impl IntoValue for Wrap<DataType> {
154
155
  let class = pl.const_get::<_, Value>("Float32").unwrap();
155
156
  class.funcall("new", ()).unwrap()
156
157
  }
157
- DataType::Float64 => {
158
+ DataType::Float64 | DataType::Unknown(UnknownKind::Float) => {
158
159
  let class = pl.const_get::<_, Value>("Float64").unwrap();
159
160
  class.funcall("new", ()).unwrap()
160
161
  }
@@ -168,7 +169,7 @@ impl IntoValue for Wrap<DataType> {
168
169
  let class = pl.const_get::<_, Value>("Boolean").unwrap();
169
170
  class.funcall("new", ()).unwrap()
170
171
  }
171
- DataType::String => {
172
+ DataType::String | DataType::Unknown(UnknownKind::Str) => {
172
173
  let class = pl.const_get::<_, Value>("String").unwrap();
173
174
  class.funcall("new", ()).unwrap()
174
175
  }
@@ -242,7 +243,10 @@ impl IntoValue for Wrap<DataType> {
242
243
  let class = pl.const_get::<_, Value>("Null").unwrap();
243
244
  class.funcall("new", ()).unwrap()
244
245
  }
245
- DataType::Unknown => {
246
+ DataType::Unknown(UnknownKind::Int(v)) => {
247
+ Wrap(materialize_dyn_int(v).dtype()).into_value()
248
+ }
249
+ DataType::Unknown(_) => {
246
250
  let class = pl.const_get::<_, Value>("Unknown").unwrap();
247
251
  class.funcall("new", ()).unwrap()
248
252
  }
@@ -310,7 +314,7 @@ impl TryConvert for Wrap<DataType> {
310
314
  "Polars::Object" => DataType::Object(OBJECT_NAME, None),
311
315
  "Polars::List" => DataType::List(Box::new(DataType::Null)),
312
316
  "Polars::Null" => DataType::Null,
313
- "Polars::Unknown" => DataType::Unknown,
317
+ "Polars::Unknown" => DataType::Unknown(Default::default()),
314
318
  dt => {
315
319
  return Err(RbValueError::new_err(format!(
316
320
  "{dt} is not a correct polars DataType.",
@@ -350,7 +354,7 @@ impl TryConvert for Wrap<DataType> {
350
354
  "Polars::Float32" => DataType::Float32,
351
355
  "Polars::Float64" => DataType::Float64,
352
356
  "Polars::Null" => DataType::Null,
353
- "Polars::Unknown" => DataType::Unknown,
357
+ "Polars::Unknown" => DataType::Unknown(Default::default()),
354
358
  "Polars::Duration" => {
355
359
  let time_unit: Value = ob.funcall("time_unit", ()).unwrap();
356
360
  let time_unit = Wrap::<TimeUnit>::try_convert(time_unit)?.0;
@@ -410,7 +414,7 @@ impl TryConvert for Wrap<DataType> {
410
414
  "obj" => DataType::Object(OBJECT_NAME, None),
411
415
  "list" => DataType::List(Box::new(DataType::Boolean)),
412
416
  "null" => DataType::Null,
413
- "unk" => DataType::Unknown,
417
+ "unk" => DataType::Unknown(Default::default()),
414
418
  _ => {
415
419
  return Err(RbValueError::new_err(format!(
416
420
  "{} is not a supported DataType.",
@@ -546,57 +550,6 @@ impl Default for ObjectValue {
546
550
  }
547
551
  }
548
552
 
549
- pub(crate) fn dicts_to_rows(
550
- records: &Value,
551
- infer_schema_len: Option<usize>,
552
- schema_columns: PlIndexSet<String>,
553
- ) -> RbResult<(Vec<Row>, Vec<String>)> {
554
- let infer_schema_len = infer_schema_len.map(|n| std::cmp::max(1, n));
555
- let (dicts, len) = get_rbseq(*records)?;
556
-
557
- let key_names = {
558
- if !schema_columns.is_empty() {
559
- schema_columns
560
- } else {
561
- let mut inferred_keys = PlIndexSet::new();
562
- for d in dicts.each().take(infer_schema_len.unwrap_or(usize::MAX)) {
563
- let d = d?;
564
- let d = RHash::try_convert(d)?;
565
-
566
- d.foreach(|name: Value, _value: Value| {
567
- if let Some(v) = Symbol::from_value(name) {
568
- inferred_keys.insert(v.name()?.into());
569
- } else {
570
- inferred_keys.insert(String::try_convert(name)?);
571
- };
572
- Ok(ForEach::Continue)
573
- })?;
574
- }
575
- inferred_keys
576
- }
577
- };
578
-
579
- let mut rows = Vec::with_capacity(len);
580
-
581
- for d in dicts.each() {
582
- let d = d?;
583
- let d = RHash::try_convert(d)?;
584
-
585
- let mut row = Vec::with_capacity(key_names.len());
586
-
587
- for k in key_names.iter() {
588
- // TODO improve performance
589
- let val = match d.get(k.clone()).or_else(|| d.get(Symbol::new(k))) {
590
- None => AnyValue::Null,
591
- Some(val) => Wrap::<AnyValue>::try_convert(val)?.0,
592
- };
593
- row.push(val)
594
- }
595
- rows.push(Row(row))
596
- }
597
- Ok((rows, key_names.into_iter().collect()))
598
- }
599
-
600
553
  impl TryConvert for Wrap<AsofStrategy> {
601
554
  fn try_convert(ob: Value) -> RbResult<Self> {
602
555
  let parsed = match String::try_convert(ob)?.as_str() {
@@ -733,8 +686,8 @@ impl TryConvert for Wrap<JoinType> {
733
686
  let parsed = match String::try_convert(ob)?.as_str() {
734
687
  "inner" => JoinType::Inner,
735
688
  "left" => JoinType::Left,
736
- "outer" => JoinType::Outer { coalesce: false },
737
- "outer_coalesce" => JoinType::Outer { coalesce: true },
689
+ "outer" => JoinType::Outer,
690
+ "outer_coalesce" => JoinType::Outer,
738
691
  "semi" => JoinType::Semi,
739
692
  "anti" => JoinType::Anti,
740
693
  "cross" => JoinType::Cross,
@@ -0,0 +1,186 @@
1
+ use magnus::{prelude::*, r_hash::ForEach, RArray, RHash, Symbol, Value};
2
+ use polars::frame::row::{rows_to_schema_supertypes, rows_to_supertypes, Row};
3
+ use polars::prelude::*;
4
+
5
+ use super::*;
6
+ use crate::conversion::*;
7
+ use crate::{RbPolarsErr, RbResult};
8
+
9
+ impl RbDataFrame {
10
+ pub fn from_rows(
11
+ rb_rows: RArray,
12
+ infer_schema_length: Option<usize>,
13
+ schema: Option<Wrap<Schema>>,
14
+ ) -> RbResult<Self> {
15
+ let mut data = Vec::with_capacity(rb_rows.len());
16
+ for v in rb_rows.each() {
17
+ let rb_row = RArray::try_convert(v?)?;
18
+ let mut row = Vec::with_capacity(rb_row.len());
19
+ for val in rb_row.each() {
20
+ row.push(Wrap::<AnyValue>::try_convert(val?)?.0);
21
+ }
22
+ data.push(Row(row));
23
+ }
24
+ let schema = schema.map(|wrap| wrap.0);
25
+ finish_from_rows(data, schema, None, infer_schema_length)
26
+ }
27
+
28
+ pub fn from_hashes(
29
+ data: Value,
30
+ schema: Option<Wrap<Schema>>,
31
+ schema_overrides: Option<Wrap<Schema>>,
32
+ strict: bool,
33
+ infer_schema_length: Option<usize>,
34
+ ) -> RbResult<Self> {
35
+ let schema = schema.map(|wrap| wrap.0);
36
+ let schema_overrides = schema_overrides.map(|wrap| wrap.0);
37
+
38
+ let names = get_schema_names(&data, schema.as_ref(), infer_schema_length)?;
39
+ let rows = dicts_to_rows(&data, &names, strict)?;
40
+
41
+ let schema = schema.or_else(|| {
42
+ Some(columns_names_to_empty_schema(
43
+ names.iter().map(String::as_str),
44
+ ))
45
+ });
46
+
47
+ finish_from_rows(rows, schema, schema_overrides, infer_schema_length)
48
+ }
49
+ }
50
+
51
+ fn finish_from_rows(
52
+ rows: Vec<Row>,
53
+ schema: Option<Schema>,
54
+ schema_overrides: Option<Schema>,
55
+ infer_schema_length: Option<usize>,
56
+ ) -> RbResult<RbDataFrame> {
57
+ // Object builder must be registered
58
+ crate::on_startup::register_object_builder();
59
+
60
+ let mut schema = if let Some(mut schema) = schema {
61
+ resolve_schema_overrides(&mut schema, schema_overrides);
62
+ update_schema_from_rows(&mut schema, &rows, infer_schema_length)?;
63
+ schema
64
+ } else {
65
+ rows_to_schema_supertypes(&rows, infer_schema_length).map_err(RbPolarsErr::from)?
66
+ };
67
+
68
+ // TODO: Remove this step when Decimals are supported properly.
69
+ // Erasing the decimal precision/scale here will just require us to infer it again later.
70
+ // https://github.com/pola-rs/polars/issues/14427
71
+ erase_decimal_precision_scale(&mut schema);
72
+
73
+ let df = DataFrame::from_rows_and_schema(&rows, &schema).map_err(RbPolarsErr::from)?;
74
+ Ok(df.into())
75
+ }
76
+
77
+ fn update_schema_from_rows(
78
+ schema: &mut Schema,
79
+ rows: &[Row],
80
+ infer_schema_length: Option<usize>,
81
+ ) -> RbResult<()> {
82
+ let schema_is_complete = schema.iter_dtypes().all(|dtype| dtype.is_known());
83
+ if schema_is_complete {
84
+ return Ok(());
85
+ }
86
+
87
+ // TODO: Only infer dtypes for columns with an unknown dtype
88
+ let inferred_dtypes =
89
+ rows_to_supertypes(rows, infer_schema_length).map_err(RbPolarsErr::from)?;
90
+ let inferred_dtypes_slice = inferred_dtypes.as_slice();
91
+
92
+ for (i, dtype) in schema.iter_dtypes_mut().enumerate() {
93
+ if !dtype.is_known() {
94
+ *dtype = inferred_dtypes_slice.get(i).ok_or_else(|| {
95
+ polars_err!(SchemaMismatch: "the number of columns in the schema does not match the data")
96
+ })
97
+ .map_err(RbPolarsErr::from)?
98
+ .clone();
99
+ }
100
+ }
101
+ Ok(())
102
+ }
103
+
104
+ fn resolve_schema_overrides(schema: &mut Schema, schema_overrides: Option<Schema>) {
105
+ if let Some(overrides) = schema_overrides {
106
+ for (name, dtype) in overrides.into_iter() {
107
+ schema.set_dtype(name.as_str(), dtype);
108
+ }
109
+ }
110
+ }
111
+
112
+ fn erase_decimal_precision_scale(schema: &mut Schema) {
113
+ for dtype in schema.iter_dtypes_mut() {
114
+ if let DataType::Decimal(_, _) = dtype {
115
+ *dtype = DataType::Decimal(None, None)
116
+ }
117
+ }
118
+ }
119
+
120
+ fn columns_names_to_empty_schema<'a, I>(column_names: I) -> Schema
121
+ where
122
+ I: IntoIterator<Item = &'a str>,
123
+ {
124
+ let fields = column_names
125
+ .into_iter()
126
+ .map(|c| Field::new(c, DataType::Unknown(Default::default())));
127
+ Schema::from_iter(fields)
128
+ }
129
+
130
+ fn dicts_to_rows<'a>(data: &Value, names: &'a [String], _strict: bool) -> RbResult<Vec<Row<'a>>> {
131
+ let (data, len) = get_rbseq(*data)?;
132
+ let mut rows = Vec::with_capacity(len);
133
+ for d in data.each() {
134
+ let d = d?;
135
+ let d = RHash::try_convert(d)?;
136
+
137
+ let mut row = Vec::with_capacity(names.len());
138
+ for k in names.iter() {
139
+ // TODO improve performance
140
+ let val = match d.get(k.clone()).or_else(|| d.get(Symbol::new(k))) {
141
+ None => AnyValue::Null,
142
+ Some(val) => Wrap::<AnyValue>::try_convert(val)?.0,
143
+ };
144
+ row.push(val)
145
+ }
146
+ rows.push(Row(row))
147
+ }
148
+ Ok(rows)
149
+ }
150
+
151
+ fn get_schema_names(
152
+ data: &Value,
153
+ schema: Option<&Schema>,
154
+ infer_schema_length: Option<usize>,
155
+ ) -> RbResult<Vec<String>> {
156
+ if let Some(schema) = schema {
157
+ Ok(schema.iter_names().map(|n| n.to_string()).collect())
158
+ } else {
159
+ infer_schema_names_from_data(data, infer_schema_length)
160
+ }
161
+ }
162
+
163
+ fn infer_schema_names_from_data(
164
+ data: &Value,
165
+ infer_schema_length: Option<usize>,
166
+ ) -> RbResult<Vec<String>> {
167
+ let (data, data_len) = get_rbseq(*data)?;
168
+ let infer_schema_length = infer_schema_length
169
+ .map(|n| std::cmp::max(1, n))
170
+ .unwrap_or(data_len);
171
+
172
+ let mut names = PlIndexSet::new();
173
+ for d in data.each().take(infer_schema_length) {
174
+ let d = d?;
175
+ let d = RHash::try_convert(d)?;
176
+ d.foreach(|name: Value, _value: Value| {
177
+ if let Some(v) = Symbol::from_value(name) {
178
+ names.insert(v.name()?.into());
179
+ } else {
180
+ names.insert(String::try_convert(name)?);
181
+ };
182
+ Ok(ForEach::Continue)
183
+ })?;
184
+ }
185
+ Ok(names.into_iter().collect())
186
+ }
@@ -0,0 +1,48 @@
1
+ use magnus::{prelude::*, IntoValue, RArray, Value};
2
+
3
+ use super::*;
4
+ use crate::conversion::{ObjectValue, Wrap};
5
+
6
+ impl RbDataFrame {
7
+ pub fn row_tuple(&self, idx: i64) -> Value {
8
+ let idx = if idx < 0 {
9
+ (self.df.borrow().height() as i64 + idx) as usize
10
+ } else {
11
+ idx as usize
12
+ };
13
+ RArray::from_iter(
14
+ self.df
15
+ .borrow()
16
+ .get_columns()
17
+ .iter()
18
+ .map(|s| match s.dtype() {
19
+ DataType::Object(_, _) => {
20
+ let obj: Option<&ObjectValue> = s.get_object(idx).map(|any| any.into());
21
+ obj.unwrap().to_object()
22
+ }
23
+ _ => Wrap(s.get(idx).unwrap()).into_value(),
24
+ }),
25
+ )
26
+ .as_value()
27
+ }
28
+
29
+ pub fn row_tuples(&self) -> Value {
30
+ let df = &self.df;
31
+ RArray::from_iter((0..df.borrow().height()).map(|idx| {
32
+ RArray::from_iter(
33
+ self.df
34
+ .borrow()
35
+ .get_columns()
36
+ .iter()
37
+ .map(|s| match s.dtype() {
38
+ DataType::Object(_, _) => {
39
+ let obj: Option<&ObjectValue> = s.get_object(idx).map(|any| any.into());
40
+ obj.unwrap().to_object()
41
+ }
42
+ _ => Wrap(s.get(idx).unwrap()).into_value(),
43
+ }),
44
+ )
45
+ }))
46
+ .as_value()
47
+ }
48
+ }