polars-df 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +11 -0
  3. data/Cargo.lock +90 -48
  4. data/README.md +6 -6
  5. data/ext/polars/Cargo.toml +7 -5
  6. data/ext/polars/src/batched_csv.rs +53 -52
  7. data/ext/polars/src/conversion/mod.rs +13 -60
  8. data/ext/polars/src/dataframe/construction.rs +186 -0
  9. data/ext/polars/src/dataframe/export.rs +48 -0
  10. data/ext/polars/src/dataframe/general.rs +607 -0
  11. data/ext/polars/src/dataframe/io.rs +463 -0
  12. data/ext/polars/src/dataframe/mod.rs +26 -0
  13. data/ext/polars/src/expr/datetime.rs +6 -2
  14. data/ext/polars/src/expr/general.rs +28 -6
  15. data/ext/polars/src/expr/rolling.rs +185 -69
  16. data/ext/polars/src/expr/string.rs +9 -30
  17. data/ext/polars/src/functions/lazy.rs +2 -0
  18. data/ext/polars/src/functions/range.rs +74 -0
  19. data/ext/polars/src/interop/mod.rs +1 -0
  20. data/ext/polars/src/interop/numo/mod.rs +2 -0
  21. data/ext/polars/src/interop/numo/to_numo_df.rs +23 -0
  22. data/ext/polars/src/interop/numo/to_numo_series.rs +60 -0
  23. data/ext/polars/src/lazyframe/mod.rs +54 -38
  24. data/ext/polars/src/lib.rs +46 -21
  25. data/ext/polars/src/map/lazy.rs +5 -25
  26. data/ext/polars/src/map/series.rs +7 -1
  27. data/ext/polars/src/series/aggregation.rs +47 -30
  28. data/ext/polars/src/series/export.rs +131 -49
  29. data/ext/polars/src/series/mod.rs +1 -131
  30. data/lib/polars/batched_csv_reader.rb +9 -3
  31. data/lib/polars/convert.rb +6 -1
  32. data/lib/polars/data_frame.rb +83 -302
  33. data/lib/polars/date_time_expr.rb +1 -0
  34. data/lib/polars/date_time_name_space.rb +5 -1
  35. data/lib/polars/dynamic_group_by.rb +2 -2
  36. data/lib/polars/exceptions.rb +4 -0
  37. data/lib/polars/expr.rb +1134 -20
  38. data/lib/polars/functions/range/date_range.rb +92 -0
  39. data/lib/polars/functions/range/datetime_range.rb +149 -0
  40. data/lib/polars/functions/range/time_range.rb +141 -0
  41. data/lib/polars/group_by.rb +88 -23
  42. data/lib/polars/io/avro.rb +24 -0
  43. data/lib/polars/{io.rb → io/csv.rb} +296 -490
  44. data/lib/polars/io/database.rb +73 -0
  45. data/lib/polars/io/ipc.rb +247 -0
  46. data/lib/polars/io/json.rb +18 -0
  47. data/lib/polars/io/ndjson.rb +69 -0
  48. data/lib/polars/io/parquet.rb +226 -0
  49. data/lib/polars/lazy_frame.rb +23 -166
  50. data/lib/polars/lazy_group_by.rb +100 -3
  51. data/lib/polars/rolling_group_by.rb +2 -2
  52. data/lib/polars/series.rb +2 -2
  53. data/lib/polars/string_expr.rb +37 -36
  54. data/lib/polars/utils.rb +35 -1
  55. data/lib/polars/version.rb +1 -1
  56. data/lib/polars.rb +9 -1
  57. metadata +21 -5
  58. data/ext/polars/src/dataframe.rs +0 -1208
@@ -7,7 +7,7 @@ use std::num::NonZeroUsize;
7
7
 
8
8
  use magnus::{
9
9
  class, exception, prelude::*, r_hash::ForEach, value::Opaque, IntoValue, Module, RArray, RHash,
10
- Ruby, Symbol, TryConvert, Value,
10
+ Ruby, TryConvert, Value,
11
11
  };
12
12
  use polars::chunked_array::object::PolarsObjectSafe;
13
13
  use polars::chunked_array::ops::{FillNullLimit, FillNullStrategy};
@@ -18,6 +18,7 @@ use polars::io::avro::AvroCompression;
18
18
  use polars::prelude::*;
19
19
  use polars::series::ops::NullBehavior;
20
20
  use polars_core::utils::arrow::array::Array;
21
+ use polars_core::utils::materialize_dyn_int;
21
22
  use polars_utils::total_ord::{TotalEq, TotalHash};
22
23
  use smartstring::alias::String as SmartString;
23
24
 
@@ -154,7 +155,7 @@ impl IntoValue for Wrap<DataType> {
154
155
  let class = pl.const_get::<_, Value>("Float32").unwrap();
155
156
  class.funcall("new", ()).unwrap()
156
157
  }
157
- DataType::Float64 => {
158
+ DataType::Float64 | DataType::Unknown(UnknownKind::Float) => {
158
159
  let class = pl.const_get::<_, Value>("Float64").unwrap();
159
160
  class.funcall("new", ()).unwrap()
160
161
  }
@@ -168,7 +169,7 @@ impl IntoValue for Wrap<DataType> {
168
169
  let class = pl.const_get::<_, Value>("Boolean").unwrap();
169
170
  class.funcall("new", ()).unwrap()
170
171
  }
171
- DataType::String => {
172
+ DataType::String | DataType::Unknown(UnknownKind::Str) => {
172
173
  let class = pl.const_get::<_, Value>("String").unwrap();
173
174
  class.funcall("new", ()).unwrap()
174
175
  }
@@ -242,7 +243,10 @@ impl IntoValue for Wrap<DataType> {
242
243
  let class = pl.const_get::<_, Value>("Null").unwrap();
243
244
  class.funcall("new", ()).unwrap()
244
245
  }
245
- DataType::Unknown => {
246
+ DataType::Unknown(UnknownKind::Int(v)) => {
247
+ Wrap(materialize_dyn_int(v).dtype()).into_value()
248
+ }
249
+ DataType::Unknown(_) => {
246
250
  let class = pl.const_get::<_, Value>("Unknown").unwrap();
247
251
  class.funcall("new", ()).unwrap()
248
252
  }
@@ -310,7 +314,7 @@ impl TryConvert for Wrap<DataType> {
310
314
  "Polars::Object" => DataType::Object(OBJECT_NAME, None),
311
315
  "Polars::List" => DataType::List(Box::new(DataType::Null)),
312
316
  "Polars::Null" => DataType::Null,
313
- "Polars::Unknown" => DataType::Unknown,
317
+ "Polars::Unknown" => DataType::Unknown(Default::default()),
314
318
  dt => {
315
319
  return Err(RbValueError::new_err(format!(
316
320
  "{dt} is not a correct polars DataType.",
@@ -350,7 +354,7 @@ impl TryConvert for Wrap<DataType> {
350
354
  "Polars::Float32" => DataType::Float32,
351
355
  "Polars::Float64" => DataType::Float64,
352
356
  "Polars::Null" => DataType::Null,
353
- "Polars::Unknown" => DataType::Unknown,
357
+ "Polars::Unknown" => DataType::Unknown(Default::default()),
354
358
  "Polars::Duration" => {
355
359
  let time_unit: Value = ob.funcall("time_unit", ()).unwrap();
356
360
  let time_unit = Wrap::<TimeUnit>::try_convert(time_unit)?.0;
@@ -410,7 +414,7 @@ impl TryConvert for Wrap<DataType> {
410
414
  "obj" => DataType::Object(OBJECT_NAME, None),
411
415
  "list" => DataType::List(Box::new(DataType::Boolean)),
412
416
  "null" => DataType::Null,
413
- "unk" => DataType::Unknown,
417
+ "unk" => DataType::Unknown(Default::default()),
414
418
  _ => {
415
419
  return Err(RbValueError::new_err(format!(
416
420
  "{} is not a supported DataType.",
@@ -546,57 +550,6 @@ impl Default for ObjectValue {
546
550
  }
547
551
  }
548
552
 
549
- pub(crate) fn dicts_to_rows(
550
- records: &Value,
551
- infer_schema_len: Option<usize>,
552
- schema_columns: PlIndexSet<String>,
553
- ) -> RbResult<(Vec<Row>, Vec<String>)> {
554
- let infer_schema_len = infer_schema_len.map(|n| std::cmp::max(1, n));
555
- let (dicts, len) = get_rbseq(*records)?;
556
-
557
- let key_names = {
558
- if !schema_columns.is_empty() {
559
- schema_columns
560
- } else {
561
- let mut inferred_keys = PlIndexSet::new();
562
- for d in dicts.each().take(infer_schema_len.unwrap_or(usize::MAX)) {
563
- let d = d?;
564
- let d = RHash::try_convert(d)?;
565
-
566
- d.foreach(|name: Value, _value: Value| {
567
- if let Some(v) = Symbol::from_value(name) {
568
- inferred_keys.insert(v.name()?.into());
569
- } else {
570
- inferred_keys.insert(String::try_convert(name)?);
571
- };
572
- Ok(ForEach::Continue)
573
- })?;
574
- }
575
- inferred_keys
576
- }
577
- };
578
-
579
- let mut rows = Vec::with_capacity(len);
580
-
581
- for d in dicts.each() {
582
- let d = d?;
583
- let d = RHash::try_convert(d)?;
584
-
585
- let mut row = Vec::with_capacity(key_names.len());
586
-
587
- for k in key_names.iter() {
588
- // TODO improve performance
589
- let val = match d.get(k.clone()).or_else(|| d.get(Symbol::new(k))) {
590
- None => AnyValue::Null,
591
- Some(val) => Wrap::<AnyValue>::try_convert(val)?.0,
592
- };
593
- row.push(val)
594
- }
595
- rows.push(Row(row))
596
- }
597
- Ok((rows, key_names.into_iter().collect()))
598
- }
599
-
600
553
  impl TryConvert for Wrap<AsofStrategy> {
601
554
  fn try_convert(ob: Value) -> RbResult<Self> {
602
555
  let parsed = match String::try_convert(ob)?.as_str() {
@@ -733,8 +686,8 @@ impl TryConvert for Wrap<JoinType> {
733
686
  let parsed = match String::try_convert(ob)?.as_str() {
734
687
  "inner" => JoinType::Inner,
735
688
  "left" => JoinType::Left,
736
- "outer" => JoinType::Outer { coalesce: false },
737
- "outer_coalesce" => JoinType::Outer { coalesce: true },
689
+ "outer" => JoinType::Outer,
690
+ "outer_coalesce" => JoinType::Outer,
738
691
  "semi" => JoinType::Semi,
739
692
  "anti" => JoinType::Anti,
740
693
  "cross" => JoinType::Cross,
@@ -0,0 +1,186 @@
1
+ use magnus::{prelude::*, r_hash::ForEach, RArray, RHash, Symbol, Value};
2
+ use polars::frame::row::{rows_to_schema_supertypes, rows_to_supertypes, Row};
3
+ use polars::prelude::*;
4
+
5
+ use super::*;
6
+ use crate::conversion::*;
7
+ use crate::{RbPolarsErr, RbResult};
8
+
9
+ impl RbDataFrame {
10
+ pub fn from_rows(
11
+ rb_rows: RArray,
12
+ infer_schema_length: Option<usize>,
13
+ schema: Option<Wrap<Schema>>,
14
+ ) -> RbResult<Self> {
15
+ let mut data = Vec::with_capacity(rb_rows.len());
16
+ for v in rb_rows.each() {
17
+ let rb_row = RArray::try_convert(v?)?;
18
+ let mut row = Vec::with_capacity(rb_row.len());
19
+ for val in rb_row.each() {
20
+ row.push(Wrap::<AnyValue>::try_convert(val?)?.0);
21
+ }
22
+ data.push(Row(row));
23
+ }
24
+ let schema = schema.map(|wrap| wrap.0);
25
+ finish_from_rows(data, schema, None, infer_schema_length)
26
+ }
27
+
28
+ pub fn from_hashes(
29
+ data: Value,
30
+ schema: Option<Wrap<Schema>>,
31
+ schema_overrides: Option<Wrap<Schema>>,
32
+ strict: bool,
33
+ infer_schema_length: Option<usize>,
34
+ ) -> RbResult<Self> {
35
+ let schema = schema.map(|wrap| wrap.0);
36
+ let schema_overrides = schema_overrides.map(|wrap| wrap.0);
37
+
38
+ let names = get_schema_names(&data, schema.as_ref(), infer_schema_length)?;
39
+ let rows = dicts_to_rows(&data, &names, strict)?;
40
+
41
+ let schema = schema.or_else(|| {
42
+ Some(columns_names_to_empty_schema(
43
+ names.iter().map(String::as_str),
44
+ ))
45
+ });
46
+
47
+ finish_from_rows(rows, schema, schema_overrides, infer_schema_length)
48
+ }
49
+ }
50
+
51
+ fn finish_from_rows(
52
+ rows: Vec<Row>,
53
+ schema: Option<Schema>,
54
+ schema_overrides: Option<Schema>,
55
+ infer_schema_length: Option<usize>,
56
+ ) -> RbResult<RbDataFrame> {
57
+ // Object builder must be registered
58
+ crate::on_startup::register_object_builder();
59
+
60
+ let mut schema = if let Some(mut schema) = schema {
61
+ resolve_schema_overrides(&mut schema, schema_overrides);
62
+ update_schema_from_rows(&mut schema, &rows, infer_schema_length)?;
63
+ schema
64
+ } else {
65
+ rows_to_schema_supertypes(&rows, infer_schema_length).map_err(RbPolarsErr::from)?
66
+ };
67
+
68
+ // TODO: Remove this step when Decimals are supported properly.
69
+ // Erasing the decimal precision/scale here will just require us to infer it again later.
70
+ // https://github.com/pola-rs/polars/issues/14427
71
+ erase_decimal_precision_scale(&mut schema);
72
+
73
+ let df = DataFrame::from_rows_and_schema(&rows, &schema).map_err(RbPolarsErr::from)?;
74
+ Ok(df.into())
75
+ }
76
+
77
+ fn update_schema_from_rows(
78
+ schema: &mut Schema,
79
+ rows: &[Row],
80
+ infer_schema_length: Option<usize>,
81
+ ) -> RbResult<()> {
82
+ let schema_is_complete = schema.iter_dtypes().all(|dtype| dtype.is_known());
83
+ if schema_is_complete {
84
+ return Ok(());
85
+ }
86
+
87
+ // TODO: Only infer dtypes for columns with an unknown dtype
88
+ let inferred_dtypes =
89
+ rows_to_supertypes(rows, infer_schema_length).map_err(RbPolarsErr::from)?;
90
+ let inferred_dtypes_slice = inferred_dtypes.as_slice();
91
+
92
+ for (i, dtype) in schema.iter_dtypes_mut().enumerate() {
93
+ if !dtype.is_known() {
94
+ *dtype = inferred_dtypes_slice.get(i).ok_or_else(|| {
95
+ polars_err!(SchemaMismatch: "the number of columns in the schema does not match the data")
96
+ })
97
+ .map_err(RbPolarsErr::from)?
98
+ .clone();
99
+ }
100
+ }
101
+ Ok(())
102
+ }
103
+
104
+ fn resolve_schema_overrides(schema: &mut Schema, schema_overrides: Option<Schema>) {
105
+ if let Some(overrides) = schema_overrides {
106
+ for (name, dtype) in overrides.into_iter() {
107
+ schema.set_dtype(name.as_str(), dtype);
108
+ }
109
+ }
110
+ }
111
+
112
+ fn erase_decimal_precision_scale(schema: &mut Schema) {
113
+ for dtype in schema.iter_dtypes_mut() {
114
+ if let DataType::Decimal(_, _) = dtype {
115
+ *dtype = DataType::Decimal(None, None)
116
+ }
117
+ }
118
+ }
119
+
120
+ fn columns_names_to_empty_schema<'a, I>(column_names: I) -> Schema
121
+ where
122
+ I: IntoIterator<Item = &'a str>,
123
+ {
124
+ let fields = column_names
125
+ .into_iter()
126
+ .map(|c| Field::new(c, DataType::Unknown(Default::default())));
127
+ Schema::from_iter(fields)
128
+ }
129
+
130
+ fn dicts_to_rows<'a>(data: &Value, names: &'a [String], _strict: bool) -> RbResult<Vec<Row<'a>>> {
131
+ let (data, len) = get_rbseq(*data)?;
132
+ let mut rows = Vec::with_capacity(len);
133
+ for d in data.each() {
134
+ let d = d?;
135
+ let d = RHash::try_convert(d)?;
136
+
137
+ let mut row = Vec::with_capacity(names.len());
138
+ for k in names.iter() {
139
+ // TODO improve performance
140
+ let val = match d.get(k.clone()).or_else(|| d.get(Symbol::new(k))) {
141
+ None => AnyValue::Null,
142
+ Some(val) => Wrap::<AnyValue>::try_convert(val)?.0,
143
+ };
144
+ row.push(val)
145
+ }
146
+ rows.push(Row(row))
147
+ }
148
+ Ok(rows)
149
+ }
150
+
151
+ fn get_schema_names(
152
+ data: &Value,
153
+ schema: Option<&Schema>,
154
+ infer_schema_length: Option<usize>,
155
+ ) -> RbResult<Vec<String>> {
156
+ if let Some(schema) = schema {
157
+ Ok(schema.iter_names().map(|n| n.to_string()).collect())
158
+ } else {
159
+ infer_schema_names_from_data(data, infer_schema_length)
160
+ }
161
+ }
162
+
163
+ fn infer_schema_names_from_data(
164
+ data: &Value,
165
+ infer_schema_length: Option<usize>,
166
+ ) -> RbResult<Vec<String>> {
167
+ let (data, data_len) = get_rbseq(*data)?;
168
+ let infer_schema_length = infer_schema_length
169
+ .map(|n| std::cmp::max(1, n))
170
+ .unwrap_or(data_len);
171
+
172
+ let mut names = PlIndexSet::new();
173
+ for d in data.each().take(infer_schema_length) {
174
+ let d = d?;
175
+ let d = RHash::try_convert(d)?;
176
+ d.foreach(|name: Value, _value: Value| {
177
+ if let Some(v) = Symbol::from_value(name) {
178
+ names.insert(v.name()?.into());
179
+ } else {
180
+ names.insert(String::try_convert(name)?);
181
+ };
182
+ Ok(ForEach::Continue)
183
+ })?;
184
+ }
185
+ Ok(names.into_iter().collect())
186
+ }
@@ -0,0 +1,48 @@
1
+ use magnus::{prelude::*, IntoValue, RArray, Value};
2
+
3
+ use super::*;
4
+ use crate::conversion::{ObjectValue, Wrap};
5
+
6
+ impl RbDataFrame {
7
+ pub fn row_tuple(&self, idx: i64) -> Value {
8
+ let idx = if idx < 0 {
9
+ (self.df.borrow().height() as i64 + idx) as usize
10
+ } else {
11
+ idx as usize
12
+ };
13
+ RArray::from_iter(
14
+ self.df
15
+ .borrow()
16
+ .get_columns()
17
+ .iter()
18
+ .map(|s| match s.dtype() {
19
+ DataType::Object(_, _) => {
20
+ let obj: Option<&ObjectValue> = s.get_object(idx).map(|any| any.into());
21
+ obj.unwrap().to_object()
22
+ }
23
+ _ => Wrap(s.get(idx).unwrap()).into_value(),
24
+ }),
25
+ )
26
+ .as_value()
27
+ }
28
+
29
+ pub fn row_tuples(&self) -> Value {
30
+ let df = &self.df;
31
+ RArray::from_iter((0..df.borrow().height()).map(|idx| {
32
+ RArray::from_iter(
33
+ self.df
34
+ .borrow()
35
+ .get_columns()
36
+ .iter()
37
+ .map(|s| match s.dtype() {
38
+ DataType::Object(_, _) => {
39
+ let obj: Option<&ObjectValue> = s.get_object(idx).map(|any| any.into());
40
+ obj.unwrap().to_object()
41
+ }
42
+ _ => Wrap(s.get(idx).unwrap()).into_value(),
43
+ }),
44
+ )
45
+ }))
46
+ .as_value()
47
+ }
48
+ }