polars-df 0.4.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +26 -0
  3. data/Cargo.lock +447 -410
  4. data/Cargo.toml +0 -1
  5. data/README.md +6 -5
  6. data/ext/polars/Cargo.toml +10 -5
  7. data/ext/polars/src/apply/dataframe.rs +2 -2
  8. data/ext/polars/src/{lazy/apply.rs → apply/lazy.rs} +1 -2
  9. data/ext/polars/src/apply/mod.rs +8 -3
  10. data/ext/polars/src/batched_csv.rs +7 -5
  11. data/ext/polars/src/conversion.rs +269 -59
  12. data/ext/polars/src/dataframe.rs +38 -40
  13. data/ext/polars/src/error.rs +6 -2
  14. data/ext/polars/src/expr/array.rs +15 -0
  15. data/ext/polars/src/expr/binary.rs +69 -0
  16. data/ext/polars/src/expr/categorical.rs +10 -0
  17. data/ext/polars/src/expr/datetime.rs +223 -0
  18. data/ext/polars/src/expr/general.rs +963 -0
  19. data/ext/polars/src/expr/list.rs +151 -0
  20. data/ext/polars/src/{lazy → expr}/meta.rs +16 -6
  21. data/ext/polars/src/expr/string.rs +314 -0
  22. data/ext/polars/src/expr/struct.rs +15 -0
  23. data/ext/polars/src/expr.rs +34 -0
  24. data/ext/polars/src/functions/eager.rs +93 -0
  25. data/ext/polars/src/functions/io.rs +34 -0
  26. data/ext/polars/src/functions/lazy.rs +249 -0
  27. data/ext/polars/src/functions/meta.rs +8 -0
  28. data/ext/polars/src/functions/mod.rs +5 -0
  29. data/ext/polars/src/functions/whenthen.rs +43 -0
  30. data/ext/polars/src/{lazy/dataframe.rs → lazyframe.rs} +26 -35
  31. data/ext/polars/src/lazygroupby.rs +29 -0
  32. data/ext/polars/src/lib.rs +223 -316
  33. data/ext/polars/src/object.rs +1 -1
  34. data/ext/polars/src/rb_modules.rs +12 -0
  35. data/ext/polars/src/series/aggregation.rs +83 -0
  36. data/ext/polars/src/series/arithmetic.rs +88 -0
  37. data/ext/polars/src/series/comparison.rs +251 -0
  38. data/ext/polars/src/series/construction.rs +190 -0
  39. data/ext/polars/src/series.rs +151 -551
  40. data/lib/polars/array_expr.rb +84 -0
  41. data/lib/polars/array_name_space.rb +77 -0
  42. data/lib/polars/batched_csv_reader.rb +1 -1
  43. data/lib/polars/convert.rb +2 -2
  44. data/lib/polars/data_frame.rb +289 -96
  45. data/lib/polars/data_types.rb +169 -33
  46. data/lib/polars/date_time_expr.rb +142 -2
  47. data/lib/polars/date_time_name_space.rb +17 -3
  48. data/lib/polars/expr.rb +145 -78
  49. data/lib/polars/functions.rb +0 -1
  50. data/lib/polars/group_by.rb +1 -22
  51. data/lib/polars/lazy_frame.rb +84 -31
  52. data/lib/polars/lazy_functions.rb +71 -32
  53. data/lib/polars/list_expr.rb +94 -45
  54. data/lib/polars/list_name_space.rb +13 -13
  55. data/lib/polars/rolling_group_by.rb +4 -2
  56. data/lib/polars/series.rb +249 -87
  57. data/lib/polars/string_expr.rb +277 -45
  58. data/lib/polars/string_name_space.rb +137 -22
  59. data/lib/polars/struct_name_space.rb +32 -0
  60. data/lib/polars/utils.rb +138 -54
  61. data/lib/polars/version.rb +1 -1
  62. data/lib/polars.rb +5 -2
  63. metadata +29 -11
  64. data/ext/polars/src/lazy/dsl.rs +0 -1775
  65. data/ext/polars/src/lazy/mod.rs +0 -5
  66. data/ext/polars/src/lazy/utils.rs +0 -13
  67. data/ext/polars/src/list_construction.rs +0 -100
  68. data/ext/polars/src/{numo.rs → series/export.rs} +0 -0
  69. data/ext/polars/src/{set.rs → series/set_at_idx.rs} +0 -0
@@ -48,20 +48,25 @@ impl RbDataFrame {
48
48
  crate::object::register_object_builder();
49
49
 
50
50
  let schema =
51
- rows_to_schema_supertypes(&rows, infer_schema_length).map_err(RbPolarsErr::from)?;
51
+ rows_to_schema_supertypes(&rows, infer_schema_length.map(|n| std::cmp::max(1, n)))
52
+ .map_err(RbPolarsErr::from)?;
52
53
  // replace inferred nulls with boolean
53
54
  let fields = schema.iter_fields().map(|mut fld| match fld.data_type() {
54
55
  DataType::Null => {
55
56
  fld.coerce(DataType::Boolean);
56
57
  fld
57
58
  }
59
+ DataType::Decimal(_, _) => {
60
+ fld.coerce(DataType::Decimal(None, None));
61
+ fld
62
+ }
58
63
  _ => fld,
59
64
  });
60
- let mut schema = Schema::from(fields);
65
+ let mut schema = Schema::from_iter(fields);
61
66
 
62
67
  if let Some(schema_overwrite) = schema_overwrite {
63
68
  for (i, (name, dtype)) in schema_overwrite.into_iter().enumerate() {
64
- if let Some((name_, dtype_)) = schema.get_index_mut(i) {
69
+ if let Some((name_, dtype_)) = schema.get_at_index_mut(i) {
65
70
  *name_ = name;
66
71
 
67
72
  // if user sets dtype unknown, we use the inferred datatype
@@ -139,11 +144,13 @@ impl RbDataFrame {
139
144
  };
140
145
 
141
146
  let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
142
- let fields = overwrite_dtype.iter().map(|(name, dtype)| {
143
- let dtype = dtype.0.clone();
144
- Field::new(name, dtype)
145
- });
146
- Schema::from(fields)
147
+ overwrite_dtype
148
+ .iter()
149
+ .map(|(name, dtype)| {
150
+ let dtype = dtype.0.clone();
151
+ Field::new(name, dtype)
152
+ })
153
+ .collect::<Schema>()
147
154
  });
148
155
 
149
156
  let overwrite_dtype_slice = overwrite_dtype_slice.map(|overwrite_dtype| {
@@ -449,12 +456,14 @@ impl RbDataFrame {
449
456
  .finish(&mut self.df.borrow_mut())
450
457
  .map_err(RbPolarsErr::from)?;
451
458
  } else {
452
- let mut buf = get_file_like(rb_f, true)?;
453
-
459
+ let mut buf = Cursor::new(Vec::new());
454
460
  IpcWriter::new(&mut buf)
455
461
  .with_compression(compression.0)
456
462
  .finish(&mut self.df.borrow_mut())
457
463
  .map_err(RbPolarsErr::from)?;
464
+ // TODO less copying
465
+ let rb_str = RString::from_slice(&buf.into_inner());
466
+ rb_f.funcall::<_, _, Value>("write", (rb_str,))?;
458
467
  }
459
468
  Ok(())
460
469
  }
@@ -512,12 +521,10 @@ impl RbDataFrame {
512
521
  }
513
522
  }
514
523
  }
515
- let st = st?;
524
+ let _st = st?;
516
525
 
517
- match st {
518
- // TODO
519
- _ => None,
520
- }
526
+ // TODO
527
+ None
521
528
  }
522
529
 
523
530
  pub fn write_parquet(
@@ -789,22 +796,6 @@ impl RbDataFrame {
789
796
  Ok(RbDataFrame::new(df))
790
797
  }
791
798
 
792
- pub fn sort(&self, by_column: String, reverse: bool, nulls_last: bool) -> RbResult<Self> {
793
- let df = self
794
- .df
795
- .borrow()
796
- .sort_with_options(
797
- &by_column,
798
- SortOptions {
799
- descending: reverse,
800
- nulls_last,
801
- multithreaded: true,
802
- },
803
- )
804
- .map_err(RbPolarsErr::from)?;
805
- Ok(RbDataFrame::new(df))
806
- }
807
-
808
799
  pub fn replace(&self, column: String, new_col: &RbSeries) -> RbResult<()> {
809
800
  self.df
810
801
  .borrow_mut()
@@ -914,10 +905,7 @@ impl RbDataFrame {
914
905
  true => pivot_stable,
915
906
  false => pivot,
916
907
  };
917
- let agg_expr = match aggregate_expr {
918
- Some(aggregate_expr) => Some(aggregate_expr.inner.clone()),
919
- None => None,
920
- };
908
+ let agg_expr = aggregate_expr.map(|aggregate_expr| aggregate_expr.inner.clone());
921
909
  let df = fun(
922
910
  &self.df.borrow(),
923
911
  values,
@@ -931,11 +919,16 @@ impl RbDataFrame {
931
919
  Ok(RbDataFrame::new(df))
932
920
  }
933
921
 
934
- pub fn partition_by(&self, groups: Vec<String>, stable: bool) -> RbResult<RArray> {
935
- let out = if stable {
936
- self.df.borrow().partition_by_stable(groups)
922
+ pub fn partition_by(
923
+ &self,
924
+ by: Vec<String>,
925
+ maintain_order: bool,
926
+ include_key: bool,
927
+ ) -> RbResult<RArray> {
928
+ let out = if maintain_order {
929
+ self.df.borrow().partition_by_stable(by, include_key)
937
930
  } else {
938
- self.df.borrow().partition_by(groups)
931
+ self.df.borrow().partition_by(by, include_key)
939
932
  }
940
933
  .map_err(RbPolarsErr::from)?;
941
934
  Ok(RArray::from_iter(out.into_iter().map(RbDataFrame::new)))
@@ -1022,13 +1015,18 @@ impl RbDataFrame {
1022
1015
  &self,
1023
1016
  columns: Option<Vec<String>>,
1024
1017
  separator: Option<String>,
1018
+ drop_first: bool,
1025
1019
  ) -> RbResult<Self> {
1026
1020
  let df = match columns {
1027
1021
  Some(cols) => self.df.borrow().columns_to_dummies(
1028
1022
  cols.iter().map(|x| x as &str).collect(),
1029
1023
  separator.as_deref(),
1024
+ drop_first,
1030
1025
  ),
1031
- None => self.df.borrow().to_dummies(separator.as_deref()),
1026
+ None => self
1027
+ .df
1028
+ .borrow()
1029
+ .to_dummies(separator.as_deref(), drop_first),
1032
1030
  }
1033
1031
  .map_err(RbPolarsErr::from)?;
1034
1032
  Ok(df.into())
@@ -22,9 +22,13 @@ impl RbPolarsErr {
22
22
  pub fn other(message: String) -> Error {
23
23
  Error::new(exception::runtime_error(), message)
24
24
  }
25
+ }
26
+
27
+ pub struct RbTypeError {}
25
28
 
26
- pub fn todo() -> Error {
27
- Error::new(exception::runtime_error(), "not implemented yet")
29
+ impl RbTypeError {
30
+ pub fn new_err(message: String) -> Error {
31
+ Error::new(exception::type_error(), message)
28
32
  }
29
33
  }
30
34
 
@@ -0,0 +1,15 @@
1
+ use crate::RbExpr;
2
+
3
+ impl RbExpr {
4
+ pub fn array_max(&self) -> Self {
5
+ self.inner.clone().arr().max().into()
6
+ }
7
+
8
+ pub fn array_min(&self) -> Self {
9
+ self.inner.clone().arr().min().into()
10
+ }
11
+
12
+ pub fn array_sum(&self) -> Self {
13
+ self.inner.clone().arr().sum().into()
14
+ }
15
+ }
@@ -0,0 +1,69 @@
1
+ use polars::prelude::*;
2
+
3
+ use crate::RbExpr;
4
+
5
+ impl RbExpr {
6
+ pub fn bin_contains(&self, lit: Vec<u8>) -> Self {
7
+ self.inner.clone().binary().contains_literal(lit).into()
8
+ }
9
+
10
+ pub fn bin_ends_with(&self, sub: Vec<u8>) -> Self {
11
+ self.inner.clone().binary().ends_with(sub).into()
12
+ }
13
+
14
+ pub fn bin_starts_with(&self, sub: Vec<u8>) -> Self {
15
+ self.inner.clone().binary().starts_with(sub).into()
16
+ }
17
+
18
+ pub fn bin_hex_decode(&self, strict: bool) -> Self {
19
+ self.clone()
20
+ .inner
21
+ .map(
22
+ move |s| {
23
+ s.binary()?
24
+ .hex_decode(strict)
25
+ .map(|s| Some(s.into_series()))
26
+ },
27
+ GetOutput::same_type(),
28
+ )
29
+ .with_fmt("bin.hex_decode")
30
+ .into()
31
+ }
32
+
33
+ pub fn bin_base64_decode(&self, strict: bool) -> Self {
34
+ self.clone()
35
+ .inner
36
+ .map(
37
+ move |s| {
38
+ s.binary()?
39
+ .base64_decode(strict)
40
+ .map(|s| Some(s.into_series()))
41
+ },
42
+ GetOutput::same_type(),
43
+ )
44
+ .with_fmt("bin.base64_decode")
45
+ .into()
46
+ }
47
+
48
+ pub fn bin_hex_encode(&self) -> Self {
49
+ self.clone()
50
+ .inner
51
+ .map(
52
+ move |s| s.binary().map(|s| Some(s.hex_encode().into_series())),
53
+ GetOutput::same_type(),
54
+ )
55
+ .with_fmt("bin.hex_encode")
56
+ .into()
57
+ }
58
+
59
+ pub fn bin_base64_encode(&self) -> Self {
60
+ self.clone()
61
+ .inner
62
+ .map(
63
+ move |s| s.binary().map(|s| Some(s.base64_encode().into_series())),
64
+ GetOutput::same_type(),
65
+ )
66
+ .with_fmt("bin.base64_encode")
67
+ .into()
68
+ }
69
+ }
@@ -0,0 +1,10 @@
1
+ use polars::prelude::*;
2
+
3
+ use crate::conversion::Wrap;
4
+ use crate::RbExpr;
5
+
6
+ impl RbExpr {
7
+ pub fn cat_set_ordering(&self, ordering: Wrap<CategoricalOrdering>) -> Self {
8
+ self.inner.clone().cat().set_ordering(ordering.0).into()
9
+ }
10
+ }
@@ -0,0 +1,223 @@
1
+ use polars::prelude::*;
2
+
3
+ use crate::conversion::Wrap;
4
+ use crate::RbExpr;
5
+
6
+ impl RbExpr {
7
+ pub fn dt_to_string(&self, format: String) -> Self {
8
+ self.inner.clone().dt().to_string(&format).into()
9
+ }
10
+
11
+ pub fn dt_offset_by(&self, by: String) -> Self {
12
+ let by = Duration::parse(&by);
13
+ self.inner.clone().dt().offset_by(by).into()
14
+ }
15
+
16
+ pub fn dt_epoch_seconds(&self) -> Self {
17
+ self.clone()
18
+ .inner
19
+ .map(
20
+ |s| {
21
+ s.timestamp(TimeUnit::Milliseconds)
22
+ .map(|ca| Some((ca / 1000).into_series()))
23
+ },
24
+ GetOutput::from_type(DataType::Int64),
25
+ )
26
+ .into()
27
+ }
28
+
29
+ pub fn dt_with_time_unit(&self, tu: Wrap<TimeUnit>) -> Self {
30
+ self.inner.clone().dt().with_time_unit(tu.0).into()
31
+ }
32
+
33
+ pub fn dt_convert_time_zone(&self, tz: TimeZone) -> Self {
34
+ self.inner.clone().dt().convert_time_zone(tz).into()
35
+ }
36
+
37
+ pub fn dt_cast_time_unit(&self, tu: Wrap<TimeUnit>) -> Self {
38
+ self.inner.clone().dt().cast_time_unit(tu.0).into()
39
+ }
40
+
41
+ pub fn dt_replace_time_zone(&self, tz: Option<String>, use_earliest: Option<bool>) -> Self {
42
+ self.inner
43
+ .clone()
44
+ .dt()
45
+ .replace_time_zone(tz, use_earliest)
46
+ .into()
47
+ }
48
+
49
+ #[allow(deprecated)]
50
+ pub fn dt_tz_localize(&self, tz: String) -> Self {
51
+ self.inner.clone().dt().tz_localize(tz).into()
52
+ }
53
+
54
+ pub fn dt_truncate(&self, every: String, offset: String) -> Self {
55
+ self.inner.clone().dt().truncate(&every, &offset).into()
56
+ }
57
+
58
+ pub fn dt_month_start(&self) -> Self {
59
+ self.inner.clone().dt().month_start().into()
60
+ }
61
+
62
+ pub fn dt_month_end(&self) -> Self {
63
+ self.inner.clone().dt().month_end().into()
64
+ }
65
+
66
+ pub fn dt_round(&self, every: String, offset: String) -> Self {
67
+ self.inner.clone().dt().round(&every, &offset).into()
68
+ }
69
+
70
+ pub fn dt_combine(&self, time: &Self, time_unit: Wrap<TimeUnit>) -> Self {
71
+ self.inner
72
+ .clone()
73
+ .dt()
74
+ .combine(time.inner.clone(), time_unit.0)
75
+ .into()
76
+ }
77
+
78
+ pub fn dt_year(&self) -> Self {
79
+ self.clone().inner.dt().year().into()
80
+ }
81
+
82
+ pub fn dt_is_leap_year(&self) -> Self {
83
+ self.clone().inner.dt().is_leap_year().into()
84
+ }
85
+
86
+ pub fn dt_iso_year(&self) -> Self {
87
+ self.clone().inner.dt().iso_year().into()
88
+ }
89
+
90
+ pub fn dt_quarter(&self) -> Self {
91
+ self.clone().inner.dt().quarter().into()
92
+ }
93
+
94
+ pub fn dt_month(&self) -> Self {
95
+ self.clone().inner.dt().month().into()
96
+ }
97
+
98
+ pub fn dt_week(&self) -> Self {
99
+ self.clone().inner.dt().week().into()
100
+ }
101
+
102
+ pub fn dt_weekday(&self) -> Self {
103
+ self.clone().inner.dt().weekday().into()
104
+ }
105
+
106
+ pub fn dt_day(&self) -> Self {
107
+ self.clone().inner.dt().day().into()
108
+ }
109
+
110
+ pub fn dt_ordinal_day(&self) -> Self {
111
+ self.clone().inner.dt().ordinal_day().into()
112
+ }
113
+
114
+ pub fn dt_time(&self) -> Self {
115
+ self.clone().inner.dt().time().into()
116
+ }
117
+
118
+ pub fn dt_date(&self) -> Self {
119
+ self.clone().inner.dt().date().into()
120
+ }
121
+
122
+ pub fn dt_datetime(&self) -> Self {
123
+ self.clone().inner.dt().datetime().into()
124
+ }
125
+
126
+ pub fn dt_hour(&self) -> Self {
127
+ self.clone().inner.dt().hour().into()
128
+ }
129
+
130
+ pub fn dt_minute(&self) -> Self {
131
+ self.clone().inner.dt().minute().into()
132
+ }
133
+
134
+ pub fn dt_second(&self) -> Self {
135
+ self.clone().inner.dt().second().into()
136
+ }
137
+
138
+ pub fn dt_millisecond(&self) -> Self {
139
+ self.clone().inner.dt().millisecond().into()
140
+ }
141
+
142
+ pub fn dt_microsecond(&self) -> Self {
143
+ self.clone().inner.dt().microsecond().into()
144
+ }
145
+
146
+ pub fn dt_nanosecond(&self) -> Self {
147
+ self.clone().inner.dt().nanosecond().into()
148
+ }
149
+
150
+ pub fn dt_timestamp(&self, tu: Wrap<TimeUnit>) -> Self {
151
+ self.inner.clone().dt().timestamp(tu.0).into()
152
+ }
153
+
154
+ pub fn duration_days(&self) -> Self {
155
+ self.inner
156
+ .clone()
157
+ .map(
158
+ |s| Ok(Some(s.duration()?.days().into_series())),
159
+ GetOutput::from_type(DataType::Int64),
160
+ )
161
+ .into()
162
+ }
163
+
164
+ pub fn duration_hours(&self) -> Self {
165
+ self.inner
166
+ .clone()
167
+ .map(
168
+ |s| Ok(Some(s.duration()?.hours().into_series())),
169
+ GetOutput::from_type(DataType::Int64),
170
+ )
171
+ .into()
172
+ }
173
+
174
+ pub fn duration_minutes(&self) -> Self {
175
+ self.inner
176
+ .clone()
177
+ .map(
178
+ |s| Ok(Some(s.duration()?.minutes().into_series())),
179
+ GetOutput::from_type(DataType::Int64),
180
+ )
181
+ .into()
182
+ }
183
+
184
+ pub fn duration_seconds(&self) -> Self {
185
+ self.inner
186
+ .clone()
187
+ .map(
188
+ |s| Ok(Some(s.duration()?.seconds().into_series())),
189
+ GetOutput::from_type(DataType::Int64),
190
+ )
191
+ .into()
192
+ }
193
+
194
+ pub fn duration_milliseconds(&self) -> Self {
195
+ self.inner
196
+ .clone()
197
+ .map(
198
+ |s| Ok(Some(s.duration()?.milliseconds().into_series())),
199
+ GetOutput::from_type(DataType::Int64),
200
+ )
201
+ .into()
202
+ }
203
+
204
+ pub fn duration_microseconds(&self) -> Self {
205
+ self.inner
206
+ .clone()
207
+ .map(
208
+ |s| Ok(Some(s.duration()?.microseconds().into_series())),
209
+ GetOutput::from_type(DataType::Int64),
210
+ )
211
+ .into()
212
+ }
213
+
214
+ pub fn duration_nanoseconds(&self) -> Self {
215
+ self.inner
216
+ .clone()
217
+ .map(
218
+ |s| Ok(Some(s.duration()?.nanoseconds().into_series())),
219
+ GetOutput::from_type(DataType::Int64),
220
+ )
221
+ .into()
222
+ }
223
+ }