polars-df 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +30 -1
  3. data/Cargo.lock +107 -59
  4. data/Cargo.toml +0 -3
  5. data/LICENSE.txt +1 -1
  6. data/README.md +2 -2
  7. data/ext/polars/Cargo.toml +15 -7
  8. data/ext/polars/src/batched_csv.rs +4 -4
  9. data/ext/polars/src/conversion/anyvalue.rs +185 -0
  10. data/ext/polars/src/conversion/chunked_array.rs +140 -0
  11. data/ext/polars/src/{conversion.rs → conversion/mod.rs} +260 -340
  12. data/ext/polars/src/dataframe.rs +69 -53
  13. data/ext/polars/src/expr/array.rs +74 -0
  14. data/ext/polars/src/expr/datetime.rs +22 -56
  15. data/ext/polars/src/expr/general.rs +61 -33
  16. data/ext/polars/src/expr/list.rs +52 -4
  17. data/ext/polars/src/expr/meta.rs +48 -0
  18. data/ext/polars/src/expr/rolling.rs +1 -0
  19. data/ext/polars/src/expr/string.rs +59 -8
  20. data/ext/polars/src/expr/struct.rs +8 -4
  21. data/ext/polars/src/functions/aggregation.rs +6 -0
  22. data/ext/polars/src/functions/lazy.rs +103 -48
  23. data/ext/polars/src/functions/meta.rs +45 -1
  24. data/ext/polars/src/functions/string_cache.rs +14 -0
  25. data/ext/polars/src/{lazyframe.rs → lazyframe/mod.rs} +138 -22
  26. data/ext/polars/src/lib.rs +226 -168
  27. data/ext/polars/src/series/aggregation.rs +20 -0
  28. data/ext/polars/src/series/mod.rs +25 -4
  29. data/lib/polars/array_expr.rb +449 -0
  30. data/lib/polars/array_name_space.rb +346 -0
  31. data/lib/polars/cat_expr.rb +24 -0
  32. data/lib/polars/cat_name_space.rb +75 -0
  33. data/lib/polars/config.rb +2 -2
  34. data/lib/polars/data_frame.rb +179 -43
  35. data/lib/polars/data_types.rb +191 -28
  36. data/lib/polars/date_time_expr.rb +31 -14
  37. data/lib/polars/exceptions.rb +12 -1
  38. data/lib/polars/expr.rb +866 -186
  39. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  40. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  41. data/lib/polars/functions/as_datatype.rb +248 -0
  42. data/lib/polars/functions/col.rb +47 -0
  43. data/lib/polars/functions/eager.rb +182 -0
  44. data/lib/polars/functions/lazy.rb +1280 -0
  45. data/lib/polars/functions/len.rb +49 -0
  46. data/lib/polars/functions/lit.rb +35 -0
  47. data/lib/polars/functions/random.rb +16 -0
  48. data/lib/polars/functions/range/date_range.rb +103 -0
  49. data/lib/polars/functions/range/int_range.rb +51 -0
  50. data/lib/polars/functions/repeat.rb +144 -0
  51. data/lib/polars/functions/whenthen.rb +27 -0
  52. data/lib/polars/functions.rb +29 -416
  53. data/lib/polars/group_by.rb +2 -2
  54. data/lib/polars/io.rb +18 -25
  55. data/lib/polars/lazy_frame.rb +367 -53
  56. data/lib/polars/list_expr.rb +152 -6
  57. data/lib/polars/list_name_space.rb +102 -0
  58. data/lib/polars/meta_expr.rb +175 -7
  59. data/lib/polars/series.rb +273 -34
  60. data/lib/polars/string_cache.rb +75 -0
  61. data/lib/polars/string_expr.rb +412 -96
  62. data/lib/polars/string_name_space.rb +4 -4
  63. data/lib/polars/testing.rb +507 -0
  64. data/lib/polars/utils.rb +52 -8
  65. data/lib/polars/version.rb +1 -1
  66. data/lib/polars.rb +15 -2
  67. metadata +35 -5
  68. data/lib/polars/lazy_functions.rb +0 -1181
@@ -6,12 +6,13 @@ use polars::frame::row::{rows_to_schema_supertypes, Row};
6
6
  use polars::frame::NullStrategy;
7
7
  use polars::io::avro::AvroCompression;
8
8
  use polars::io::mmap::ReaderBytes;
9
- use polars::io::RowCount;
9
+ use polars::io::RowIndex;
10
10
  use polars::prelude::pivot::{pivot, pivot_stable};
11
11
  use polars::prelude::*;
12
12
  use polars_core::utils::try_get_supertype;
13
13
  use std::cell::RefCell;
14
14
  use std::io::{BufWriter, Cursor};
15
+ use std::num::NonZeroUsize;
15
16
  use std::ops::Deref;
16
17
 
17
18
  use crate::conversion::*;
@@ -45,44 +46,51 @@ impl RbDataFrame {
45
46
  fn finish_from_rows(
46
47
  rows: Vec<Row>,
47
48
  infer_schema_length: Option<usize>,
48
- schema_overwrite: Option<Schema>,
49
+ schema: Option<Schema>,
50
+ schema_overrides_by_idx: Option<Vec<(usize, DataType)>>,
49
51
  ) -> RbResult<Self> {
50
- // object builder must be registered.
52
+ // Object builder must be registered
51
53
  crate::on_startup::register_object_builder();
52
54
 
53
- let schema =
55
+ let mut final_schema =
54
56
  rows_to_schema_supertypes(&rows, infer_schema_length.map(|n| std::cmp::max(1, n)))
55
57
  .map_err(RbPolarsErr::from)?;
56
- // replace inferred nulls with boolean
57
- let fields = schema.iter_fields().map(|mut fld| match fld.data_type() {
58
- DataType::Null => {
59
- // fld.coerce(DataType::Boolean);
60
- fld
61
- }
62
- DataType::Decimal(_, _) => {
63
- fld.coerce(DataType::Decimal(None, None));
64
- fld
58
+
59
+ // Erase scale from inferred decimals.
60
+ for dtype in final_schema.iter_dtypes_mut() {
61
+ if let DataType::Decimal(_, _) = dtype {
62
+ *dtype = DataType::Decimal(None, None)
65
63
  }
66
- _ => fld,
67
- });
68
- let mut schema = Schema::from_iter(fields);
64
+ }
69
65
 
70
- if let Some(schema_overwrite) = schema_overwrite {
71
- for (i, (name, dtype)) in schema_overwrite.into_iter().enumerate() {
72
- if let Some((name_, dtype_)) = schema.get_at_index_mut(i) {
66
+ // Integrate explicit/inferred schema.
67
+ if let Some(schema) = schema {
68
+ for (i, (name, dtype)) in schema.into_iter().enumerate() {
69
+ if let Some((name_, dtype_)) = final_schema.get_at_index_mut(i) {
73
70
  *name_ = name;
74
71
 
75
- // if user sets dtype unknown, we use the inferred datatype
72
+ // If schema dtype is Unknown, overwrite with inferred datatype.
76
73
  if !matches!(dtype, DataType::Unknown) {
77
74
  *dtype_ = dtype;
78
75
  }
79
76
  } else {
80
- schema.with_column(name, dtype);
77
+ final_schema.with_column(name, dtype);
81
78
  }
82
79
  }
83
80
  }
84
81
 
85
- let df = DataFrame::from_rows_and_schema(&rows, &schema).map_err(RbPolarsErr::from)?;
82
+ // Optional per-field overrides; these supersede default/inferred dtypes.
83
+ if let Some(overrides) = schema_overrides_by_idx {
84
+ for (i, dtype) in overrides {
85
+ if let Some((_, dtype_)) = final_schema.get_at_index_mut(i) {
86
+ if !matches!(dtype, DataType::Unknown) {
87
+ *dtype_ = dtype;
88
+ }
89
+ }
90
+ }
91
+ }
92
+ let df =
93
+ DataFrame::from_rows_and_schema(&rows, &final_schema).map_err(RbPolarsErr::from)?;
86
94
  Ok(df.into())
87
95
  }
88
96
 
@@ -125,7 +133,7 @@ impl RbDataFrame {
125
133
  let null_values = Option::<Wrap<NullValues>>::try_convert(arguments[19])?;
126
134
  let try_parse_dates = bool::try_convert(arguments[20])?;
127
135
  let skip_rows_after_header = usize::try_convert(arguments[21])?;
128
- let row_count = Option::<(String, IdxSize)>::try_convert(arguments[22])?;
136
+ let row_index = Option::<(String, IdxSize)>::try_convert(arguments[22])?;
129
137
  let sample_size = usize::try_convert(arguments[23])?;
130
138
  let eol_char = String::try_convert(arguments[24])?;
131
139
  // end arguments
@@ -133,7 +141,7 @@ impl RbDataFrame {
133
141
  let null_values = null_values.map(|w| w.0);
134
142
  let eol_char = eol_char.as_bytes()[0];
135
143
 
136
- let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
144
+ let row_index = row_index.map(|(name, offset)| RowIndex { name, offset });
137
145
 
138
146
  let quote_char = if let Some(s) = quote_char {
139
147
  if s.is_empty() {
@@ -186,7 +194,7 @@ impl RbDataFrame {
186
194
  .with_quote_char(quote_char)
187
195
  .with_end_of_line_char(eol_char)
188
196
  .with_skip_rows_after_header(skip_rows_after_header)
189
- .with_row_count(row_count)
197
+ .with_row_index(row_index)
190
198
  .sample_size(sample_size)
191
199
  .finish()
192
200
  .map_err(RbPolarsErr::from)?;
@@ -200,19 +208,19 @@ impl RbDataFrame {
200
208
  projection: Option<Vec<usize>>,
201
209
  n_rows: Option<usize>,
202
210
  parallel: Wrap<ParallelStrategy>,
203
- row_count: Option<(String, IdxSize)>,
211
+ row_index: Option<(String, IdxSize)>,
204
212
  low_memory: bool,
205
213
  use_statistics: bool,
206
214
  rechunk: bool,
207
215
  ) -> RbResult<Self> {
208
- let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
216
+ let row_index = row_index.map(|(name, offset)| RowIndex { name, offset });
209
217
  let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
210
218
  let df = ParquetReader::new(mmap_bytes_r)
211
219
  .with_projection(projection)
212
220
  .with_columns(columns)
213
221
  .read_parallel(parallel.0)
214
222
  .with_n_rows(n_rows)
215
- .with_row_count(row_count)
223
+ .with_row_index(row_index)
216
224
  .set_low_memory(low_memory)
217
225
  .use_statistics(use_statistics)
218
226
  .set_rechunk(rechunk)
@@ -226,16 +234,16 @@ impl RbDataFrame {
226
234
  columns: Option<Vec<String>>,
227
235
  projection: Option<Vec<usize>>,
228
236
  n_rows: Option<usize>,
229
- row_count: Option<(String, IdxSize)>,
237
+ row_index: Option<(String, IdxSize)>,
230
238
  memory_map: bool,
231
239
  ) -> RbResult<Self> {
232
- let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
240
+ let row_index = row_index.map(|(name, offset)| RowIndex { name, offset });
233
241
  let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
234
242
  let df = IpcReader::new(mmap_bytes_r)
235
243
  .with_projection(projection)
236
244
  .with_columns(columns)
237
245
  .with_n_rows(n_rows)
238
- .with_row_count(row_count)
246
+ .with_row_index(row_index)
239
247
  .memory_mapped(memory_map)
240
248
  .finish()
241
249
  .map_err(RbPolarsErr::from)?;
@@ -352,7 +360,7 @@ impl RbDataFrame {
352
360
  pub fn read_rows(
353
361
  rb_rows: RArray,
354
362
  infer_schema_length: Option<usize>,
355
- schema_overwrite: Option<Wrap<Schema>>,
363
+ schema: Option<Wrap<Schema>>,
356
364
  ) -> RbResult<Self> {
357
365
  let mut rows = Vec::with_capacity(rb_rows.len());
358
366
  for v in rb_rows.each() {
@@ -363,30 +371,34 @@ impl RbDataFrame {
363
371
  }
364
372
  rows.push(Row(row));
365
373
  }
366
- Self::finish_from_rows(
367
- rows,
368
- infer_schema_length,
369
- schema_overwrite.map(|wrap| wrap.0),
370
- )
374
+ Self::finish_from_rows(rows, infer_schema_length, schema.map(|wrap| wrap.0), None)
371
375
  }
372
376
 
373
377
  pub fn read_hashes(
374
378
  dicts: Value,
375
379
  infer_schema_length: Option<usize>,
376
- schema_overwrite: Option<Wrap<Schema>>,
380
+ schema: Option<Wrap<Schema>>,
381
+ schema_overrides: Option<Wrap<Schema>>,
377
382
  ) -> RbResult<Self> {
378
- let (rows, mut names) = dicts_to_rows(&dicts, infer_schema_length.unwrap_or(50))?;
383
+ let mut schema_columns = PlIndexSet::new();
384
+ if let Some(s) = &schema {
385
+ schema_columns.extend(s.0.iter_names().map(|n| n.to_string()))
386
+ }
387
+ let (rows, names) = dicts_to_rows(&dicts, infer_schema_length, schema_columns)?;
379
388
 
380
- // ensure the new names are used
381
- if let Some(schema) = &schema_overwrite {
382
- for (new_name, name) in schema.0.iter_names().zip(names.iter_mut()) {
383
- *name = new_name.to_string();
389
+ let mut schema_overrides_by_idx: Vec<(usize, DataType)> = Vec::new();
390
+ if let Some(overrides) = schema_overrides {
391
+ for (idx, name) in names.iter().enumerate() {
392
+ if let Some(dtype) = overrides.0.get(name) {
393
+ schema_overrides_by_idx.push((idx, dtype.clone()));
394
+ }
384
395
  }
385
396
  }
386
397
  let rbdf = Self::finish_from_rows(
387
398
  rows,
388
399
  infer_schema_length,
389
- schema_overwrite.map(|wrap| wrap.0),
400
+ schema.map(|wrap| wrap.0),
401
+ Some(schema_overrides_by_idx),
390
402
  )?;
391
403
 
392
404
  unsafe {
@@ -427,13 +439,14 @@ impl RbDataFrame {
427
439
  include_header: bool,
428
440
  separator: u8,
429
441
  quote_char: u8,
430
- batch_size: usize,
442
+ batch_size: Wrap<NonZeroUsize>,
431
443
  datetime_format: Option<String>,
432
444
  date_format: Option<String>,
433
445
  time_format: Option<String>,
434
446
  float_precision: Option<usize>,
435
447
  null_value: Option<String>,
436
448
  ) -> RbResult<()> {
449
+ let batch_size = batch_size.0;
437
450
  let null = null_value.unwrap_or_default();
438
451
 
439
452
  if let Ok(s) = String::try_convert(rb_f) {
@@ -794,12 +807,11 @@ impl RbDataFrame {
794
807
  self.df.borrow().get_column_index(&name)
795
808
  }
796
809
 
797
- // TODO remove clone
798
- pub fn column(&self, name: String) -> RbResult<RbSeries> {
810
+ pub fn get_column(&self, name: String) -> RbResult<RbSeries> {
799
811
  self.df
800
812
  .borrow()
801
813
  .column(&name)
802
- .map(|v| v.clone().into())
814
+ .map(|s| RbSeries::new(s.clone()))
803
815
  .map_err(RbPolarsErr::from)
804
816
  }
805
817
 
@@ -887,11 +899,11 @@ impl RbDataFrame {
887
899
  }
888
900
  }
889
901
 
890
- pub fn with_row_count(&self, name: String, offset: Option<IdxSize>) -> RbResult<Self> {
902
+ pub fn with_row_index(&self, name: String, offset: Option<IdxSize>) -> RbResult<Self> {
891
903
  let df = self
892
904
  .df
893
905
  .borrow()
894
- .with_row_count(&name, offset)
906
+ .with_row_index(&name, offset)
895
907
  .map_err(RbPolarsErr::from)?;
896
908
  Ok(df.into())
897
909
  }
@@ -922,9 +934,9 @@ impl RbDataFrame {
922
934
  #[allow(clippy::too_many_arguments)]
923
935
  pub fn pivot_expr(
924
936
  &self,
925
- values: Vec<String>,
926
937
  index: Vec<String>,
927
938
  columns: Vec<String>,
939
+ values: Option<Vec<String>>,
928
940
  maintain_order: bool,
929
941
  sort_columns: bool,
930
942
  aggregate_expr: Option<&RbExpr>,
@@ -937,9 +949,9 @@ impl RbDataFrame {
937
949
  let agg_expr = aggregate_expr.map(|aggregate_expr| aggregate_expr.inner.clone());
938
950
  let df = fun(
939
951
  &self.df.borrow(),
940
- values,
941
952
  index,
942
953
  columns,
954
+ values,
943
955
  sort_columns,
944
956
  agg_expr,
945
957
  separator.as_deref(),
@@ -1121,7 +1133,7 @@ impl RbDataFrame {
1121
1133
  };
1122
1134
  Ok(self
1123
1135
  .df
1124
- .borrow()
1136
+ .borrow_mut()
1125
1137
  .transpose(keep_names_as.as_deref(), new_col_names)
1126
1138
  .map_err(RbPolarsErr::from)?
1127
1139
  .into())
@@ -1163,4 +1175,8 @@ impl RbDataFrame {
1163
1175
  let df = self.df.borrow().unnest(names).map_err(RbPolarsErr::from)?;
1164
1176
  Ok(df.into())
1165
1177
  }
1178
+
1179
+ pub fn clear(&self) -> Self {
1180
+ self.df.borrow().clear().into()
1181
+ }
1166
1182
  }
@@ -1,3 +1,5 @@
1
+ use polars::prelude::*;
2
+
1
3
  use crate::RbExpr;
2
4
 
3
5
  impl RbExpr {
@@ -12,4 +14,76 @@ impl RbExpr {
12
14
  pub fn array_sum(&self) -> Self {
13
15
  self.inner.clone().arr().sum().into()
14
16
  }
17
+
18
+ pub fn arr_unique(&self, maintain_order: bool) -> Self {
19
+ if maintain_order {
20
+ self.inner.clone().arr().unique_stable().into()
21
+ } else {
22
+ self.inner.clone().arr().unique().into()
23
+ }
24
+ }
25
+
26
+ pub fn arr_to_list(&self) -> Self {
27
+ self.inner.clone().arr().to_list().into()
28
+ }
29
+
30
+ pub fn arr_all(&self) -> Self {
31
+ self.inner.clone().arr().all().into()
32
+ }
33
+
34
+ pub fn arr_any(&self) -> Self {
35
+ self.inner.clone().arr().any().into()
36
+ }
37
+
38
+ pub fn arr_sort(&self, descending: bool, nulls_last: bool) -> Self {
39
+ self.inner
40
+ .clone()
41
+ .arr()
42
+ .sort(SortOptions {
43
+ descending,
44
+ nulls_last,
45
+ ..Default::default()
46
+ })
47
+ .into()
48
+ }
49
+
50
+ pub fn arr_reverse(&self) -> Self {
51
+ self.inner.clone().arr().reverse().into()
52
+ }
53
+
54
+ pub fn arr_arg_min(&self) -> Self {
55
+ self.inner.clone().arr().arg_min().into()
56
+ }
57
+
58
+ pub fn arr_arg_max(&self) -> Self {
59
+ self.inner.clone().arr().arg_max().into()
60
+ }
61
+
62
+ pub fn arr_get(&self, index: &RbExpr) -> Self {
63
+ self.inner.clone().arr().get(index.inner.clone()).into()
64
+ }
65
+
66
+ pub fn arr_join(&self, separator: &RbExpr, ignore_nulls: bool) -> Self {
67
+ self.inner
68
+ .clone()
69
+ .arr()
70
+ .join(separator.inner.clone(), ignore_nulls)
71
+ .into()
72
+ }
73
+
74
+ pub fn arr_contains(&self, other: &RbExpr) -> Self {
75
+ self.inner
76
+ .clone()
77
+ .arr()
78
+ .contains(other.inner.clone())
79
+ .into()
80
+ }
81
+
82
+ pub fn arr_count_matches(&self, expr: &RbExpr) -> Self {
83
+ self.inner
84
+ .clone()
85
+ .arr()
86
+ .count_matches(expr.inner.clone())
87
+ .into()
88
+ }
15
89
  }
@@ -61,6 +61,14 @@ impl RbExpr {
61
61
  self.inner.clone().dt().month_end().into()
62
62
  }
63
63
 
64
+ pub fn dt_base_utc_offset(&self) -> Self {
65
+ self.inner.clone().dt().base_utc_offset().into()
66
+ }
67
+
68
+ pub fn dt_dst_offset(&self) -> Self {
69
+ self.inner.clone().dt().dst_offset().into()
70
+ }
71
+
64
72
  pub fn dt_round(&self, every: String, offset: String) -> Self {
65
73
  self.inner.clone().dt().round(&every, &offset).into()
66
74
  }
@@ -149,73 +157,31 @@ impl RbExpr {
149
157
  self.inner.clone().dt().timestamp(tu.0).into()
150
158
  }
151
159
 
152
- pub fn duration_days(&self) -> Self {
153
- self.inner
154
- .clone()
155
- .map(
156
- |s| Ok(Some(s.duration()?.days().into_series())),
157
- GetOutput::from_type(DataType::Int64),
158
- )
159
- .into()
160
+ pub fn dt_total_days(&self) -> Self {
161
+ self.inner.clone().dt().total_days().into()
160
162
  }
161
163
 
162
- pub fn duration_hours(&self) -> Self {
163
- self.inner
164
- .clone()
165
- .map(
166
- |s| Ok(Some(s.duration()?.hours().into_series())),
167
- GetOutput::from_type(DataType::Int64),
168
- )
169
- .into()
164
+ pub fn dt_total_hours(&self) -> Self {
165
+ self.inner.clone().dt().total_hours().into()
170
166
  }
171
167
 
172
- pub fn duration_minutes(&self) -> Self {
173
- self.inner
174
- .clone()
175
- .map(
176
- |s| Ok(Some(s.duration()?.minutes().into_series())),
177
- GetOutput::from_type(DataType::Int64),
178
- )
179
- .into()
168
+ pub fn dt_total_minutes(&self) -> Self {
169
+ self.inner.clone().dt().total_minutes().into()
180
170
  }
181
171
 
182
- pub fn duration_seconds(&self) -> Self {
183
- self.inner
184
- .clone()
185
- .map(
186
- |s| Ok(Some(s.duration()?.seconds().into_series())),
187
- GetOutput::from_type(DataType::Int64),
188
- )
189
- .into()
172
+ pub fn dt_total_seconds(&self) -> Self {
173
+ self.inner.clone().dt().total_seconds().into()
190
174
  }
191
175
 
192
- pub fn duration_milliseconds(&self) -> Self {
193
- self.inner
194
- .clone()
195
- .map(
196
- |s| Ok(Some(s.duration()?.milliseconds().into_series())),
197
- GetOutput::from_type(DataType::Int64),
198
- )
199
- .into()
176
+ pub fn dt_total_milliseconds(&self) -> Self {
177
+ self.inner.clone().dt().total_milliseconds().into()
200
178
  }
201
179
 
202
- pub fn duration_microseconds(&self) -> Self {
203
- self.inner
204
- .clone()
205
- .map(
206
- |s| Ok(Some(s.duration()?.microseconds().into_series())),
207
- GetOutput::from_type(DataType::Int64),
208
- )
209
- .into()
180
+ pub fn dt_total_microseconds(&self) -> Self {
181
+ self.inner.clone().dt().total_microseconds().into()
210
182
  }
211
183
 
212
- pub fn duration_nanoseconds(&self) -> Self {
213
- self.inner
214
- .clone()
215
- .map(
216
- |s| Ok(Some(s.duration()?.nanoseconds().into_series())),
217
- GetOutput::from_type(DataType::Int64),
218
- )
219
- .into()
184
+ pub fn dt_total_nanoseconds(&self) -> Self {
185
+ self.inner.clone().dt().total_nanoseconds().into()
220
186
  }
221
187
  }