polars-df 0.8.0 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +30 -1
  3. data/Cargo.lock +107 -59
  4. data/Cargo.toml +0 -3
  5. data/LICENSE.txt +1 -1
  6. data/README.md +2 -2
  7. data/ext/polars/Cargo.toml +15 -7
  8. data/ext/polars/src/batched_csv.rs +4 -4
  9. data/ext/polars/src/conversion/anyvalue.rs +185 -0
  10. data/ext/polars/src/conversion/chunked_array.rs +140 -0
  11. data/ext/polars/src/{conversion.rs → conversion/mod.rs} +260 -340
  12. data/ext/polars/src/dataframe.rs +69 -53
  13. data/ext/polars/src/expr/array.rs +74 -0
  14. data/ext/polars/src/expr/datetime.rs +22 -56
  15. data/ext/polars/src/expr/general.rs +61 -33
  16. data/ext/polars/src/expr/list.rs +52 -4
  17. data/ext/polars/src/expr/meta.rs +48 -0
  18. data/ext/polars/src/expr/rolling.rs +1 -0
  19. data/ext/polars/src/expr/string.rs +59 -8
  20. data/ext/polars/src/expr/struct.rs +8 -4
  21. data/ext/polars/src/functions/aggregation.rs +6 -0
  22. data/ext/polars/src/functions/lazy.rs +103 -48
  23. data/ext/polars/src/functions/meta.rs +45 -1
  24. data/ext/polars/src/functions/string_cache.rs +14 -0
  25. data/ext/polars/src/{lazyframe.rs → lazyframe/mod.rs} +138 -22
  26. data/ext/polars/src/lib.rs +226 -168
  27. data/ext/polars/src/series/aggregation.rs +20 -0
  28. data/ext/polars/src/series/mod.rs +25 -4
  29. data/lib/polars/array_expr.rb +449 -0
  30. data/lib/polars/array_name_space.rb +346 -0
  31. data/lib/polars/cat_expr.rb +24 -0
  32. data/lib/polars/cat_name_space.rb +75 -0
  33. data/lib/polars/config.rb +2 -2
  34. data/lib/polars/data_frame.rb +179 -43
  35. data/lib/polars/data_types.rb +191 -28
  36. data/lib/polars/date_time_expr.rb +31 -14
  37. data/lib/polars/exceptions.rb +12 -1
  38. data/lib/polars/expr.rb +866 -186
  39. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  40. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  41. data/lib/polars/functions/as_datatype.rb +248 -0
  42. data/lib/polars/functions/col.rb +47 -0
  43. data/lib/polars/functions/eager.rb +182 -0
  44. data/lib/polars/functions/lazy.rb +1280 -0
  45. data/lib/polars/functions/len.rb +49 -0
  46. data/lib/polars/functions/lit.rb +35 -0
  47. data/lib/polars/functions/random.rb +16 -0
  48. data/lib/polars/functions/range/date_range.rb +103 -0
  49. data/lib/polars/functions/range/int_range.rb +51 -0
  50. data/lib/polars/functions/repeat.rb +144 -0
  51. data/lib/polars/functions/whenthen.rb +27 -0
  52. data/lib/polars/functions.rb +29 -416
  53. data/lib/polars/group_by.rb +2 -2
  54. data/lib/polars/io.rb +18 -25
  55. data/lib/polars/lazy_frame.rb +367 -53
  56. data/lib/polars/list_expr.rb +152 -6
  57. data/lib/polars/list_name_space.rb +102 -0
  58. data/lib/polars/meta_expr.rb +175 -7
  59. data/lib/polars/series.rb +273 -34
  60. data/lib/polars/string_cache.rb +75 -0
  61. data/lib/polars/string_expr.rb +412 -96
  62. data/lib/polars/string_name_space.rb +4 -4
  63. data/lib/polars/testing.rb +507 -0
  64. data/lib/polars/utils.rb +52 -8
  65. data/lib/polars/version.rb +1 -1
  66. data/lib/polars.rb +15 -2
  67. metadata +35 -5
  68. data/lib/polars/lazy_functions.rb +0 -1181
@@ -6,12 +6,13 @@ use polars::frame::row::{rows_to_schema_supertypes, Row};
6
6
  use polars::frame::NullStrategy;
7
7
  use polars::io::avro::AvroCompression;
8
8
  use polars::io::mmap::ReaderBytes;
9
- use polars::io::RowCount;
9
+ use polars::io::RowIndex;
10
10
  use polars::prelude::pivot::{pivot, pivot_stable};
11
11
  use polars::prelude::*;
12
12
  use polars_core::utils::try_get_supertype;
13
13
  use std::cell::RefCell;
14
14
  use std::io::{BufWriter, Cursor};
15
+ use std::num::NonZeroUsize;
15
16
  use std::ops::Deref;
16
17
 
17
18
  use crate::conversion::*;
@@ -45,44 +46,51 @@ impl RbDataFrame {
45
46
  fn finish_from_rows(
46
47
  rows: Vec<Row>,
47
48
  infer_schema_length: Option<usize>,
48
- schema_overwrite: Option<Schema>,
49
+ schema: Option<Schema>,
50
+ schema_overrides_by_idx: Option<Vec<(usize, DataType)>>,
49
51
  ) -> RbResult<Self> {
50
- // object builder must be registered.
52
+ // Object builder must be registered
51
53
  crate::on_startup::register_object_builder();
52
54
 
53
- let schema =
55
+ let mut final_schema =
54
56
  rows_to_schema_supertypes(&rows, infer_schema_length.map(|n| std::cmp::max(1, n)))
55
57
  .map_err(RbPolarsErr::from)?;
56
- // replace inferred nulls with boolean
57
- let fields = schema.iter_fields().map(|mut fld| match fld.data_type() {
58
- DataType::Null => {
59
- // fld.coerce(DataType::Boolean);
60
- fld
61
- }
62
- DataType::Decimal(_, _) => {
63
- fld.coerce(DataType::Decimal(None, None));
64
- fld
58
+
59
+ // Erase scale from inferred decimals.
60
+ for dtype in final_schema.iter_dtypes_mut() {
61
+ if let DataType::Decimal(_, _) = dtype {
62
+ *dtype = DataType::Decimal(None, None)
65
63
  }
66
- _ => fld,
67
- });
68
- let mut schema = Schema::from_iter(fields);
64
+ }
69
65
 
70
- if let Some(schema_overwrite) = schema_overwrite {
71
- for (i, (name, dtype)) in schema_overwrite.into_iter().enumerate() {
72
- if let Some((name_, dtype_)) = schema.get_at_index_mut(i) {
66
+ // Integrate explicit/inferred schema.
67
+ if let Some(schema) = schema {
68
+ for (i, (name, dtype)) in schema.into_iter().enumerate() {
69
+ if let Some((name_, dtype_)) = final_schema.get_at_index_mut(i) {
73
70
  *name_ = name;
74
71
 
75
- // if user sets dtype unknown, we use the inferred datatype
72
+ // If the schema dtype is Unknown, keep the inferred datatype; otherwise the schema dtype replaces it.
76
73
  if !matches!(dtype, DataType::Unknown) {
77
74
  *dtype_ = dtype;
78
75
  }
79
76
  } else {
80
- schema.with_column(name, dtype);
77
+ final_schema.with_column(name, dtype);
81
78
  }
82
79
  }
83
80
  }
84
81
 
85
- let df = DataFrame::from_rows_and_schema(&rows, &schema).map_err(RbPolarsErr::from)?;
82
+ // Optional per-field overrides; these supersede default/inferred dtypes.
83
+ if let Some(overrides) = schema_overrides_by_idx {
84
+ for (i, dtype) in overrides {
85
+ if let Some((_, dtype_)) = final_schema.get_at_index_mut(i) {
86
+ if !matches!(dtype, DataType::Unknown) {
87
+ *dtype_ = dtype;
88
+ }
89
+ }
90
+ }
91
+ }
92
+ let df =
93
+ DataFrame::from_rows_and_schema(&rows, &final_schema).map_err(RbPolarsErr::from)?;
86
94
  Ok(df.into())
87
95
  }
88
96
 
@@ -125,7 +133,7 @@ impl RbDataFrame {
125
133
  let null_values = Option::<Wrap<NullValues>>::try_convert(arguments[19])?;
126
134
  let try_parse_dates = bool::try_convert(arguments[20])?;
127
135
  let skip_rows_after_header = usize::try_convert(arguments[21])?;
128
- let row_count = Option::<(String, IdxSize)>::try_convert(arguments[22])?;
136
+ let row_index = Option::<(String, IdxSize)>::try_convert(arguments[22])?;
129
137
  let sample_size = usize::try_convert(arguments[23])?;
130
138
  let eol_char = String::try_convert(arguments[24])?;
131
139
  // end arguments
@@ -133,7 +141,7 @@ impl RbDataFrame {
133
141
  let null_values = null_values.map(|w| w.0);
134
142
  let eol_char = eol_char.as_bytes()[0];
135
143
 
136
- let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
144
+ let row_index = row_index.map(|(name, offset)| RowIndex { name, offset });
137
145
 
138
146
  let quote_char = if let Some(s) = quote_char {
139
147
  if s.is_empty() {
@@ -186,7 +194,7 @@ impl RbDataFrame {
186
194
  .with_quote_char(quote_char)
187
195
  .with_end_of_line_char(eol_char)
188
196
  .with_skip_rows_after_header(skip_rows_after_header)
189
- .with_row_count(row_count)
197
+ .with_row_index(row_index)
190
198
  .sample_size(sample_size)
191
199
  .finish()
192
200
  .map_err(RbPolarsErr::from)?;
@@ -200,19 +208,19 @@ impl RbDataFrame {
200
208
  projection: Option<Vec<usize>>,
201
209
  n_rows: Option<usize>,
202
210
  parallel: Wrap<ParallelStrategy>,
203
- row_count: Option<(String, IdxSize)>,
211
+ row_index: Option<(String, IdxSize)>,
204
212
  low_memory: bool,
205
213
  use_statistics: bool,
206
214
  rechunk: bool,
207
215
  ) -> RbResult<Self> {
208
- let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
216
+ let row_index = row_index.map(|(name, offset)| RowIndex { name, offset });
209
217
  let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
210
218
  let df = ParquetReader::new(mmap_bytes_r)
211
219
  .with_projection(projection)
212
220
  .with_columns(columns)
213
221
  .read_parallel(parallel.0)
214
222
  .with_n_rows(n_rows)
215
- .with_row_count(row_count)
223
+ .with_row_index(row_index)
216
224
  .set_low_memory(low_memory)
217
225
  .use_statistics(use_statistics)
218
226
  .set_rechunk(rechunk)
@@ -226,16 +234,16 @@ impl RbDataFrame {
226
234
  columns: Option<Vec<String>>,
227
235
  projection: Option<Vec<usize>>,
228
236
  n_rows: Option<usize>,
229
- row_count: Option<(String, IdxSize)>,
237
+ row_index: Option<(String, IdxSize)>,
230
238
  memory_map: bool,
231
239
  ) -> RbResult<Self> {
232
- let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
240
+ let row_index = row_index.map(|(name, offset)| RowIndex { name, offset });
233
241
  let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
234
242
  let df = IpcReader::new(mmap_bytes_r)
235
243
  .with_projection(projection)
236
244
  .with_columns(columns)
237
245
  .with_n_rows(n_rows)
238
- .with_row_count(row_count)
246
+ .with_row_index(row_index)
239
247
  .memory_mapped(memory_map)
240
248
  .finish()
241
249
  .map_err(RbPolarsErr::from)?;
@@ -352,7 +360,7 @@ impl RbDataFrame {
352
360
  pub fn read_rows(
353
361
  rb_rows: RArray,
354
362
  infer_schema_length: Option<usize>,
355
- schema_overwrite: Option<Wrap<Schema>>,
363
+ schema: Option<Wrap<Schema>>,
356
364
  ) -> RbResult<Self> {
357
365
  let mut rows = Vec::with_capacity(rb_rows.len());
358
366
  for v in rb_rows.each() {
@@ -363,30 +371,34 @@ impl RbDataFrame {
363
371
  }
364
372
  rows.push(Row(row));
365
373
  }
366
- Self::finish_from_rows(
367
- rows,
368
- infer_schema_length,
369
- schema_overwrite.map(|wrap| wrap.0),
370
- )
374
+ Self::finish_from_rows(rows, infer_schema_length, schema.map(|wrap| wrap.0), None)
371
375
  }
372
376
 
373
377
  pub fn read_hashes(
374
378
  dicts: Value,
375
379
  infer_schema_length: Option<usize>,
376
- schema_overwrite: Option<Wrap<Schema>>,
380
+ schema: Option<Wrap<Schema>>,
381
+ schema_overrides: Option<Wrap<Schema>>,
377
382
  ) -> RbResult<Self> {
378
- let (rows, mut names) = dicts_to_rows(&dicts, infer_schema_length.unwrap_or(50))?;
383
+ let mut schema_columns = PlIndexSet::new();
384
+ if let Some(s) = &schema {
385
+ schema_columns.extend(s.0.iter_names().map(|n| n.to_string()))
386
+ }
387
+ let (rows, names) = dicts_to_rows(&dicts, infer_schema_length, schema_columns)?;
379
388
 
380
- // ensure the new names are used
381
- if let Some(schema) = &schema_overwrite {
382
- for (new_name, name) in schema.0.iter_names().zip(names.iter_mut()) {
383
- *name = new_name.to_string();
389
+ let mut schema_overrides_by_idx: Vec<(usize, DataType)> = Vec::new();
390
+ if let Some(overrides) = schema_overrides {
391
+ for (idx, name) in names.iter().enumerate() {
392
+ if let Some(dtype) = overrides.0.get(name) {
393
+ schema_overrides_by_idx.push((idx, dtype.clone()));
394
+ }
384
395
  }
385
396
  }
386
397
  let rbdf = Self::finish_from_rows(
387
398
  rows,
388
399
  infer_schema_length,
389
- schema_overwrite.map(|wrap| wrap.0),
400
+ schema.map(|wrap| wrap.0),
401
+ Some(schema_overrides_by_idx),
390
402
  )?;
391
403
 
392
404
  unsafe {
@@ -427,13 +439,14 @@ impl RbDataFrame {
427
439
  include_header: bool,
428
440
  separator: u8,
429
441
  quote_char: u8,
430
- batch_size: usize,
442
+ batch_size: Wrap<NonZeroUsize>,
431
443
  datetime_format: Option<String>,
432
444
  date_format: Option<String>,
433
445
  time_format: Option<String>,
434
446
  float_precision: Option<usize>,
435
447
  null_value: Option<String>,
436
448
  ) -> RbResult<()> {
449
+ let batch_size = batch_size.0;
437
450
  let null = null_value.unwrap_or_default();
438
451
 
439
452
  if let Ok(s) = String::try_convert(rb_f) {
@@ -794,12 +807,11 @@ impl RbDataFrame {
794
807
  self.df.borrow().get_column_index(&name)
795
808
  }
796
809
 
797
- // TODO remove clone
798
- pub fn column(&self, name: String) -> RbResult<RbSeries> {
810
+ pub fn get_column(&self, name: String) -> RbResult<RbSeries> {
799
811
  self.df
800
812
  .borrow()
801
813
  .column(&name)
802
- .map(|v| v.clone().into())
814
+ .map(|s| RbSeries::new(s.clone()))
803
815
  .map_err(RbPolarsErr::from)
804
816
  }
805
817
 
@@ -887,11 +899,11 @@ impl RbDataFrame {
887
899
  }
888
900
  }
889
901
 
890
- pub fn with_row_count(&self, name: String, offset: Option<IdxSize>) -> RbResult<Self> {
902
+ pub fn with_row_index(&self, name: String, offset: Option<IdxSize>) -> RbResult<Self> {
891
903
  let df = self
892
904
  .df
893
905
  .borrow()
894
- .with_row_count(&name, offset)
906
+ .with_row_index(&name, offset)
895
907
  .map_err(RbPolarsErr::from)?;
896
908
  Ok(df.into())
897
909
  }
@@ -922,9 +934,9 @@ impl RbDataFrame {
922
934
  #[allow(clippy::too_many_arguments)]
923
935
  pub fn pivot_expr(
924
936
  &self,
925
- values: Vec<String>,
926
937
  index: Vec<String>,
927
938
  columns: Vec<String>,
939
+ values: Option<Vec<String>>,
928
940
  maintain_order: bool,
929
941
  sort_columns: bool,
930
942
  aggregate_expr: Option<&RbExpr>,
@@ -937,9 +949,9 @@ impl RbDataFrame {
937
949
  let agg_expr = aggregate_expr.map(|aggregate_expr| aggregate_expr.inner.clone());
938
950
  let df = fun(
939
951
  &self.df.borrow(),
940
- values,
941
952
  index,
942
953
  columns,
954
+ values,
943
955
  sort_columns,
944
956
  agg_expr,
945
957
  separator.as_deref(),
@@ -1121,7 +1133,7 @@ impl RbDataFrame {
1121
1133
  };
1122
1134
  Ok(self
1123
1135
  .df
1124
- .borrow()
1136
+ .borrow_mut()
1125
1137
  .transpose(keep_names_as.as_deref(), new_col_names)
1126
1138
  .map_err(RbPolarsErr::from)?
1127
1139
  .into())
@@ -1163,4 +1175,8 @@ impl RbDataFrame {
1163
1175
  let df = self.df.borrow().unnest(names).map_err(RbPolarsErr::from)?;
1164
1176
  Ok(df.into())
1165
1177
  }
1178
+
1179
+ pub fn clear(&self) -> Self {
1180
+ self.df.borrow().clear().into()
1181
+ }
1166
1182
  }
@@ -1,3 +1,5 @@
1
+ use polars::prelude::*;
2
+
1
3
  use crate::RbExpr;
2
4
 
3
5
  impl RbExpr {
@@ -12,4 +14,76 @@ impl RbExpr {
12
14
  pub fn array_sum(&self) -> Self {
13
15
  self.inner.clone().arr().sum().into()
14
16
  }
17
+
18
+ pub fn arr_unique(&self, maintain_order: bool) -> Self {
19
+ if maintain_order {
20
+ self.inner.clone().arr().unique_stable().into()
21
+ } else {
22
+ self.inner.clone().arr().unique().into()
23
+ }
24
+ }
25
+
26
+ pub fn arr_to_list(&self) -> Self {
27
+ self.inner.clone().arr().to_list().into()
28
+ }
29
+
30
+ pub fn arr_all(&self) -> Self {
31
+ self.inner.clone().arr().all().into()
32
+ }
33
+
34
+ pub fn arr_any(&self) -> Self {
35
+ self.inner.clone().arr().any().into()
36
+ }
37
+
38
+ pub fn arr_sort(&self, descending: bool, nulls_last: bool) -> Self {
39
+ self.inner
40
+ .clone()
41
+ .arr()
42
+ .sort(SortOptions {
43
+ descending,
44
+ nulls_last,
45
+ ..Default::default()
46
+ })
47
+ .into()
48
+ }
49
+
50
+ pub fn arr_reverse(&self) -> Self {
51
+ self.inner.clone().arr().reverse().into()
52
+ }
53
+
54
+ pub fn arr_arg_min(&self) -> Self {
55
+ self.inner.clone().arr().arg_min().into()
56
+ }
57
+
58
+ pub fn arr_arg_max(&self) -> Self {
59
+ self.inner.clone().arr().arg_max().into()
60
+ }
61
+
62
+ pub fn arr_get(&self, index: &RbExpr) -> Self {
63
+ self.inner.clone().arr().get(index.inner.clone()).into()
64
+ }
65
+
66
+ pub fn arr_join(&self, separator: &RbExpr, ignore_nulls: bool) -> Self {
67
+ self.inner
68
+ .clone()
69
+ .arr()
70
+ .join(separator.inner.clone(), ignore_nulls)
71
+ .into()
72
+ }
73
+
74
+ pub fn arr_contains(&self, other: &RbExpr) -> Self {
75
+ self.inner
76
+ .clone()
77
+ .arr()
78
+ .contains(other.inner.clone())
79
+ .into()
80
+ }
81
+
82
+ pub fn arr_count_matches(&self, expr: &RbExpr) -> Self {
83
+ self.inner
84
+ .clone()
85
+ .arr()
86
+ .count_matches(expr.inner.clone())
87
+ .into()
88
+ }
15
89
  }
@@ -61,6 +61,14 @@ impl RbExpr {
61
61
  self.inner.clone().dt().month_end().into()
62
62
  }
63
63
 
64
+ pub fn dt_base_utc_offset(&self) -> Self {
65
+ self.inner.clone().dt().base_utc_offset().into()
66
+ }
67
+
68
+ pub fn dt_dst_offset(&self) -> Self {
69
+ self.inner.clone().dt().dst_offset().into()
70
+ }
71
+
64
72
  pub fn dt_round(&self, every: String, offset: String) -> Self {
65
73
  self.inner.clone().dt().round(&every, &offset).into()
66
74
  }
@@ -149,73 +157,31 @@ impl RbExpr {
149
157
  self.inner.clone().dt().timestamp(tu.0).into()
150
158
  }
151
159
 
152
- pub fn duration_days(&self) -> Self {
153
- self.inner
154
- .clone()
155
- .map(
156
- |s| Ok(Some(s.duration()?.days().into_series())),
157
- GetOutput::from_type(DataType::Int64),
158
- )
159
- .into()
160
+ pub fn dt_total_days(&self) -> Self {
161
+ self.inner.clone().dt().total_days().into()
160
162
  }
161
163
 
162
- pub fn duration_hours(&self) -> Self {
163
- self.inner
164
- .clone()
165
- .map(
166
- |s| Ok(Some(s.duration()?.hours().into_series())),
167
- GetOutput::from_type(DataType::Int64),
168
- )
169
- .into()
164
+ pub fn dt_total_hours(&self) -> Self {
165
+ self.inner.clone().dt().total_hours().into()
170
166
  }
171
167
 
172
- pub fn duration_minutes(&self) -> Self {
173
- self.inner
174
- .clone()
175
- .map(
176
- |s| Ok(Some(s.duration()?.minutes().into_series())),
177
- GetOutput::from_type(DataType::Int64),
178
- )
179
- .into()
168
+ pub fn dt_total_minutes(&self) -> Self {
169
+ self.inner.clone().dt().total_minutes().into()
180
170
  }
181
171
 
182
- pub fn duration_seconds(&self) -> Self {
183
- self.inner
184
- .clone()
185
- .map(
186
- |s| Ok(Some(s.duration()?.seconds().into_series())),
187
- GetOutput::from_type(DataType::Int64),
188
- )
189
- .into()
172
+ pub fn dt_total_seconds(&self) -> Self {
173
+ self.inner.clone().dt().total_seconds().into()
190
174
  }
191
175
 
192
- pub fn duration_milliseconds(&self) -> Self {
193
- self.inner
194
- .clone()
195
- .map(
196
- |s| Ok(Some(s.duration()?.milliseconds().into_series())),
197
- GetOutput::from_type(DataType::Int64),
198
- )
199
- .into()
176
+ pub fn dt_total_milliseconds(&self) -> Self {
177
+ self.inner.clone().dt().total_milliseconds().into()
200
178
  }
201
179
 
202
- pub fn duration_microseconds(&self) -> Self {
203
- self.inner
204
- .clone()
205
- .map(
206
- |s| Ok(Some(s.duration()?.microseconds().into_series())),
207
- GetOutput::from_type(DataType::Int64),
208
- )
209
- .into()
180
+ pub fn dt_total_microseconds(&self) -> Self {
181
+ self.inner.clone().dt().total_microseconds().into()
210
182
  }
211
183
 
212
- pub fn duration_nanoseconds(&self) -> Self {
213
- self.inner
214
- .clone()
215
- .map(
216
- |s| Ok(Some(s.duration()?.nanoseconds().into_series())),
217
- GetOutput::from_type(DataType::Int64),
218
- )
219
- .into()
184
+ pub fn dt_total_nanoseconds(&self) -> Self {
185
+ self.inner.clone().dt().total_nanoseconds().into()
220
186
  }
221
187
  }