polars-df 0.4.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (69) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +26 -0
  3. data/Cargo.lock +447 -410
  4. data/Cargo.toml +0 -1
  5. data/README.md +6 -5
  6. data/ext/polars/Cargo.toml +10 -5
  7. data/ext/polars/src/apply/dataframe.rs +2 -2
  8. data/ext/polars/src/{lazy/apply.rs → apply/lazy.rs} +1 -2
  9. data/ext/polars/src/apply/mod.rs +8 -3
  10. data/ext/polars/src/batched_csv.rs +7 -5
  11. data/ext/polars/src/conversion.rs +269 -59
  12. data/ext/polars/src/dataframe.rs +38 -40
  13. data/ext/polars/src/error.rs +6 -2
  14. data/ext/polars/src/expr/array.rs +15 -0
  15. data/ext/polars/src/expr/binary.rs +69 -0
  16. data/ext/polars/src/expr/categorical.rs +10 -0
  17. data/ext/polars/src/expr/datetime.rs +223 -0
  18. data/ext/polars/src/expr/general.rs +963 -0
  19. data/ext/polars/src/expr/list.rs +151 -0
  20. data/ext/polars/src/{lazy → expr}/meta.rs +16 -6
  21. data/ext/polars/src/expr/string.rs +314 -0
  22. data/ext/polars/src/expr/struct.rs +15 -0
  23. data/ext/polars/src/expr.rs +34 -0
  24. data/ext/polars/src/functions/eager.rs +93 -0
  25. data/ext/polars/src/functions/io.rs +34 -0
  26. data/ext/polars/src/functions/lazy.rs +249 -0
  27. data/ext/polars/src/functions/meta.rs +8 -0
  28. data/ext/polars/src/functions/mod.rs +5 -0
  29. data/ext/polars/src/functions/whenthen.rs +43 -0
  30. data/ext/polars/src/{lazy/dataframe.rs → lazyframe.rs} +26 -35
  31. data/ext/polars/src/lazygroupby.rs +29 -0
  32. data/ext/polars/src/lib.rs +223 -316
  33. data/ext/polars/src/object.rs +1 -1
  34. data/ext/polars/src/rb_modules.rs +12 -0
  35. data/ext/polars/src/series/aggregation.rs +83 -0
  36. data/ext/polars/src/series/arithmetic.rs +88 -0
  37. data/ext/polars/src/series/comparison.rs +251 -0
  38. data/ext/polars/src/series/construction.rs +190 -0
  39. data/ext/polars/src/series.rs +151 -551
  40. data/lib/polars/array_expr.rb +84 -0
  41. data/lib/polars/array_name_space.rb +77 -0
  42. data/lib/polars/batched_csv_reader.rb +1 -1
  43. data/lib/polars/convert.rb +2 -2
  44. data/lib/polars/data_frame.rb +289 -96
  45. data/lib/polars/data_types.rb +169 -33
  46. data/lib/polars/date_time_expr.rb +142 -2
  47. data/lib/polars/date_time_name_space.rb +17 -3
  48. data/lib/polars/expr.rb +145 -78
  49. data/lib/polars/functions.rb +0 -1
  50. data/lib/polars/group_by.rb +1 -22
  51. data/lib/polars/lazy_frame.rb +84 -31
  52. data/lib/polars/lazy_functions.rb +71 -32
  53. data/lib/polars/list_expr.rb +94 -45
  54. data/lib/polars/list_name_space.rb +13 -13
  55. data/lib/polars/rolling_group_by.rb +4 -2
  56. data/lib/polars/series.rb +249 -87
  57. data/lib/polars/string_expr.rb +277 -45
  58. data/lib/polars/string_name_space.rb +137 -22
  59. data/lib/polars/struct_name_space.rb +32 -0
  60. data/lib/polars/utils.rb +138 -54
  61. data/lib/polars/version.rb +1 -1
  62. data/lib/polars.rb +5 -2
  63. metadata +29 -11
  64. data/ext/polars/src/lazy/dsl.rs +0 -1775
  65. data/ext/polars/src/lazy/mod.rs +0 -5
  66. data/ext/polars/src/lazy/utils.rs +0 -13
  67. data/ext/polars/src/list_construction.rs +0 -100
  68. /data/ext/polars/src/{numo.rs → series/export.rs} +0 -0
  69. /data/ext/polars/src/{set.rs → series/set_at_idx.rs} +0 -0
@@ -48,20 +48,25 @@ impl RbDataFrame {
48
48
  crate::object::register_object_builder();
49
49
 
50
50
  let schema =
51
- rows_to_schema_supertypes(&rows, infer_schema_length).map_err(RbPolarsErr::from)?;
51
+ rows_to_schema_supertypes(&rows, infer_schema_length.map(|n| std::cmp::max(1, n)))
52
+ .map_err(RbPolarsErr::from)?;
52
53
  // replace inferred nulls with boolean
53
54
  let fields = schema.iter_fields().map(|mut fld| match fld.data_type() {
54
55
  DataType::Null => {
55
56
  fld.coerce(DataType::Boolean);
56
57
  fld
57
58
  }
59
+ DataType::Decimal(_, _) => {
60
+ fld.coerce(DataType::Decimal(None, None));
61
+ fld
62
+ }
58
63
  _ => fld,
59
64
  });
60
- let mut schema = Schema::from(fields);
65
+ let mut schema = Schema::from_iter(fields);
61
66
 
62
67
  if let Some(schema_overwrite) = schema_overwrite {
63
68
  for (i, (name, dtype)) in schema_overwrite.into_iter().enumerate() {
64
- if let Some((name_, dtype_)) = schema.get_index_mut(i) {
69
+ if let Some((name_, dtype_)) = schema.get_at_index_mut(i) {
65
70
  *name_ = name;
66
71
 
67
72
  // if user sets dtype unknown, we use the inferred datatype
@@ -139,11 +144,13 @@ impl RbDataFrame {
139
144
  };
140
145
 
141
146
  let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
142
- let fields = overwrite_dtype.iter().map(|(name, dtype)| {
143
- let dtype = dtype.0.clone();
144
- Field::new(name, dtype)
145
- });
146
- Schema::from(fields)
147
+ overwrite_dtype
148
+ .iter()
149
+ .map(|(name, dtype)| {
150
+ let dtype = dtype.0.clone();
151
+ Field::new(name, dtype)
152
+ })
153
+ .collect::<Schema>()
147
154
  });
148
155
 
149
156
  let overwrite_dtype_slice = overwrite_dtype_slice.map(|overwrite_dtype| {
@@ -449,12 +456,14 @@ impl RbDataFrame {
449
456
  .finish(&mut self.df.borrow_mut())
450
457
  .map_err(RbPolarsErr::from)?;
451
458
  } else {
452
- let mut buf = get_file_like(rb_f, true)?;
453
-
459
+ let mut buf = Cursor::new(Vec::new());
454
460
  IpcWriter::new(&mut buf)
455
461
  .with_compression(compression.0)
456
462
  .finish(&mut self.df.borrow_mut())
457
463
  .map_err(RbPolarsErr::from)?;
464
+ // TODO less copying
465
+ let rb_str = RString::from_slice(&buf.into_inner());
466
+ rb_f.funcall::<_, _, Value>("write", (rb_str,))?;
458
467
  }
459
468
  Ok(())
460
469
  }
@@ -512,12 +521,10 @@ impl RbDataFrame {
512
521
  }
513
522
  }
514
523
  }
515
- let st = st?;
524
+ let _st = st?;
516
525
 
517
- match st {
518
- // TODO
519
- _ => None,
520
- }
526
+ // TODO
527
+ None
521
528
  }
522
529
 
523
530
  pub fn write_parquet(
@@ -789,22 +796,6 @@ impl RbDataFrame {
789
796
  Ok(RbDataFrame::new(df))
790
797
  }
791
798
 
792
- pub fn sort(&self, by_column: String, reverse: bool, nulls_last: bool) -> RbResult<Self> {
793
- let df = self
794
- .df
795
- .borrow()
796
- .sort_with_options(
797
- &by_column,
798
- SortOptions {
799
- descending: reverse,
800
- nulls_last,
801
- multithreaded: true,
802
- },
803
- )
804
- .map_err(RbPolarsErr::from)?;
805
- Ok(RbDataFrame::new(df))
806
- }
807
-
808
799
  pub fn replace(&self, column: String, new_col: &RbSeries) -> RbResult<()> {
809
800
  self.df
810
801
  .borrow_mut()
@@ -914,10 +905,7 @@ impl RbDataFrame {
914
905
  true => pivot_stable,
915
906
  false => pivot,
916
907
  };
917
- let agg_expr = match aggregate_expr {
918
- Some(aggregate_expr) => Some(aggregate_expr.inner.clone()),
919
- None => None,
920
- };
908
+ let agg_expr = aggregate_expr.map(|aggregate_expr| aggregate_expr.inner.clone());
921
909
  let df = fun(
922
910
  &self.df.borrow(),
923
911
  values,
@@ -931,11 +919,16 @@ impl RbDataFrame {
931
919
  Ok(RbDataFrame::new(df))
932
920
  }
933
921
 
934
- pub fn partition_by(&self, groups: Vec<String>, stable: bool) -> RbResult<RArray> {
935
- let out = if stable {
936
- self.df.borrow().partition_by_stable(groups)
922
+ pub fn partition_by(
923
+ &self,
924
+ by: Vec<String>,
925
+ maintain_order: bool,
926
+ include_key: bool,
927
+ ) -> RbResult<RArray> {
928
+ let out = if maintain_order {
929
+ self.df.borrow().partition_by_stable(by, include_key)
937
930
  } else {
938
- self.df.borrow().partition_by(groups)
931
+ self.df.borrow().partition_by(by, include_key)
939
932
  }
940
933
  .map_err(RbPolarsErr::from)?;
941
934
  Ok(RArray::from_iter(out.into_iter().map(RbDataFrame::new)))
@@ -1022,13 +1015,18 @@ impl RbDataFrame {
1022
1015
  &self,
1023
1016
  columns: Option<Vec<String>>,
1024
1017
  separator: Option<String>,
1018
+ drop_first: bool,
1025
1019
  ) -> RbResult<Self> {
1026
1020
  let df = match columns {
1027
1021
  Some(cols) => self.df.borrow().columns_to_dummies(
1028
1022
  cols.iter().map(|x| x as &str).collect(),
1029
1023
  separator.as_deref(),
1024
+ drop_first,
1030
1025
  ),
1031
- None => self.df.borrow().to_dummies(separator.as_deref()),
1026
+ None => self
1027
+ .df
1028
+ .borrow()
1029
+ .to_dummies(separator.as_deref(), drop_first),
1032
1030
  }
1033
1031
  .map_err(RbPolarsErr::from)?;
1034
1032
  Ok(df.into())
@@ -22,9 +22,13 @@ impl RbPolarsErr {
22
22
  pub fn other(message: String) -> Error {
23
23
  Error::new(exception::runtime_error(), message)
24
24
  }
25
+ }
26
+
27
+ pub struct RbTypeError {}
25
28
 
26
- pub fn todo() -> Error {
27
- Error::new(exception::runtime_error(), "not implemented yet")
29
+ impl RbTypeError {
30
+ pub fn new_err(message: String) -> Error {
31
+ Error::new(exception::type_error(), message)
28
32
  }
29
33
  }
30
34
 
@@ -0,0 +1,15 @@
1
+ use crate::RbExpr;
2
+
3
+ impl RbExpr {
4
+ pub fn array_max(&self) -> Self {
5
+ self.inner.clone().arr().max().into()
6
+ }
7
+
8
+ pub fn array_min(&self) -> Self {
9
+ self.inner.clone().arr().min().into()
10
+ }
11
+
12
+ pub fn array_sum(&self) -> Self {
13
+ self.inner.clone().arr().sum().into()
14
+ }
15
+ }
@@ -0,0 +1,69 @@
1
+ use polars::prelude::*;
2
+
3
+ use crate::RbExpr;
4
+
5
+ impl RbExpr {
6
+ pub fn bin_contains(&self, lit: Vec<u8>) -> Self {
7
+ self.inner.clone().binary().contains_literal(lit).into()
8
+ }
9
+
10
+ pub fn bin_ends_with(&self, sub: Vec<u8>) -> Self {
11
+ self.inner.clone().binary().ends_with(sub).into()
12
+ }
13
+
14
+ pub fn bin_starts_with(&self, sub: Vec<u8>) -> Self {
15
+ self.inner.clone().binary().starts_with(sub).into()
16
+ }
17
+
18
+ pub fn bin_hex_decode(&self, strict: bool) -> Self {
19
+ self.clone()
20
+ .inner
21
+ .map(
22
+ move |s| {
23
+ s.binary()?
24
+ .hex_decode(strict)
25
+ .map(|s| Some(s.into_series()))
26
+ },
27
+ GetOutput::same_type(),
28
+ )
29
+ .with_fmt("bin.hex_decode")
30
+ .into()
31
+ }
32
+
33
+ pub fn bin_base64_decode(&self, strict: bool) -> Self {
34
+ self.clone()
35
+ .inner
36
+ .map(
37
+ move |s| {
38
+ s.binary()?
39
+ .base64_decode(strict)
40
+ .map(|s| Some(s.into_series()))
41
+ },
42
+ GetOutput::same_type(),
43
+ )
44
+ .with_fmt("bin.base64_decode")
45
+ .into()
46
+ }
47
+
48
+ pub fn bin_hex_encode(&self) -> Self {
49
+ self.clone()
50
+ .inner
51
+ .map(
52
+ move |s| s.binary().map(|s| Some(s.hex_encode().into_series())),
53
+ GetOutput::same_type(),
54
+ )
55
+ .with_fmt("bin.hex_encode")
56
+ .into()
57
+ }
58
+
59
+ pub fn bin_base64_encode(&self) -> Self {
60
+ self.clone()
61
+ .inner
62
+ .map(
63
+ move |s| s.binary().map(|s| Some(s.base64_encode().into_series())),
64
+ GetOutput::same_type(),
65
+ )
66
+ .with_fmt("bin.base64_encode")
67
+ .into()
68
+ }
69
+ }
@@ -0,0 +1,10 @@
1
+ use polars::prelude::*;
2
+
3
+ use crate::conversion::Wrap;
4
+ use crate::RbExpr;
5
+
6
+ impl RbExpr {
7
+ pub fn cat_set_ordering(&self, ordering: Wrap<CategoricalOrdering>) -> Self {
8
+ self.inner.clone().cat().set_ordering(ordering.0).into()
9
+ }
10
+ }
@@ -0,0 +1,223 @@
1
+ use polars::prelude::*;
2
+
3
+ use crate::conversion::Wrap;
4
+ use crate::RbExpr;
5
+
6
+ impl RbExpr {
7
+ pub fn dt_to_string(&self, format: String) -> Self {
8
+ self.inner.clone().dt().to_string(&format).into()
9
+ }
10
+
11
+ pub fn dt_offset_by(&self, by: String) -> Self {
12
+ let by = Duration::parse(&by);
13
+ self.inner.clone().dt().offset_by(by).into()
14
+ }
15
+
16
+ pub fn dt_epoch_seconds(&self) -> Self {
17
+ self.clone()
18
+ .inner
19
+ .map(
20
+ |s| {
21
+ s.timestamp(TimeUnit::Milliseconds)
22
+ .map(|ca| Some((ca / 1000).into_series()))
23
+ },
24
+ GetOutput::from_type(DataType::Int64),
25
+ )
26
+ .into()
27
+ }
28
+
29
+ pub fn dt_with_time_unit(&self, tu: Wrap<TimeUnit>) -> Self {
30
+ self.inner.clone().dt().with_time_unit(tu.0).into()
31
+ }
32
+
33
+ pub fn dt_convert_time_zone(&self, tz: TimeZone) -> Self {
34
+ self.inner.clone().dt().convert_time_zone(tz).into()
35
+ }
36
+
37
+ pub fn dt_cast_time_unit(&self, tu: Wrap<TimeUnit>) -> Self {
38
+ self.inner.clone().dt().cast_time_unit(tu.0).into()
39
+ }
40
+
41
+ pub fn dt_replace_time_zone(&self, tz: Option<String>, use_earliest: Option<bool>) -> Self {
42
+ self.inner
43
+ .clone()
44
+ .dt()
45
+ .replace_time_zone(tz, use_earliest)
46
+ .into()
47
+ }
48
+
49
+ #[allow(deprecated)]
50
+ pub fn dt_tz_localize(&self, tz: String) -> Self {
51
+ self.inner.clone().dt().tz_localize(tz).into()
52
+ }
53
+
54
+ pub fn dt_truncate(&self, every: String, offset: String) -> Self {
55
+ self.inner.clone().dt().truncate(&every, &offset).into()
56
+ }
57
+
58
+ pub fn dt_month_start(&self) -> Self {
59
+ self.inner.clone().dt().month_start().into()
60
+ }
61
+
62
+ pub fn dt_month_end(&self) -> Self {
63
+ self.inner.clone().dt().month_end().into()
64
+ }
65
+
66
+ pub fn dt_round(&self, every: String, offset: String) -> Self {
67
+ self.inner.clone().dt().round(&every, &offset).into()
68
+ }
69
+
70
+ pub fn dt_combine(&self, time: &Self, time_unit: Wrap<TimeUnit>) -> Self {
71
+ self.inner
72
+ .clone()
73
+ .dt()
74
+ .combine(time.inner.clone(), time_unit.0)
75
+ .into()
76
+ }
77
+
78
+ pub fn dt_year(&self) -> Self {
79
+ self.clone().inner.dt().year().into()
80
+ }
81
+
82
+ pub fn dt_is_leap_year(&self) -> Self {
83
+ self.clone().inner.dt().is_leap_year().into()
84
+ }
85
+
86
+ pub fn dt_iso_year(&self) -> Self {
87
+ self.clone().inner.dt().iso_year().into()
88
+ }
89
+
90
+ pub fn dt_quarter(&self) -> Self {
91
+ self.clone().inner.dt().quarter().into()
92
+ }
93
+
94
+ pub fn dt_month(&self) -> Self {
95
+ self.clone().inner.dt().month().into()
96
+ }
97
+
98
+ pub fn dt_week(&self) -> Self {
99
+ self.clone().inner.dt().week().into()
100
+ }
101
+
102
+ pub fn dt_weekday(&self) -> Self {
103
+ self.clone().inner.dt().weekday().into()
104
+ }
105
+
106
+ pub fn dt_day(&self) -> Self {
107
+ self.clone().inner.dt().day().into()
108
+ }
109
+
110
+ pub fn dt_ordinal_day(&self) -> Self {
111
+ self.clone().inner.dt().ordinal_day().into()
112
+ }
113
+
114
+ pub fn dt_time(&self) -> Self {
115
+ self.clone().inner.dt().time().into()
116
+ }
117
+
118
+ pub fn dt_date(&self) -> Self {
119
+ self.clone().inner.dt().date().into()
120
+ }
121
+
122
+ pub fn dt_datetime(&self) -> Self {
123
+ self.clone().inner.dt().datetime().into()
124
+ }
125
+
126
+ pub fn dt_hour(&self) -> Self {
127
+ self.clone().inner.dt().hour().into()
128
+ }
129
+
130
+ pub fn dt_minute(&self) -> Self {
131
+ self.clone().inner.dt().minute().into()
132
+ }
133
+
134
+ pub fn dt_second(&self) -> Self {
135
+ self.clone().inner.dt().second().into()
136
+ }
137
+
138
+ pub fn dt_millisecond(&self) -> Self {
139
+ self.clone().inner.dt().millisecond().into()
140
+ }
141
+
142
+ pub fn dt_microsecond(&self) -> Self {
143
+ self.clone().inner.dt().microsecond().into()
144
+ }
145
+
146
+ pub fn dt_nanosecond(&self) -> Self {
147
+ self.clone().inner.dt().nanosecond().into()
148
+ }
149
+
150
+ pub fn dt_timestamp(&self, tu: Wrap<TimeUnit>) -> Self {
151
+ self.inner.clone().dt().timestamp(tu.0).into()
152
+ }
153
+
154
+ pub fn duration_days(&self) -> Self {
155
+ self.inner
156
+ .clone()
157
+ .map(
158
+ |s| Ok(Some(s.duration()?.days().into_series())),
159
+ GetOutput::from_type(DataType::Int64),
160
+ )
161
+ .into()
162
+ }
163
+
164
+ pub fn duration_hours(&self) -> Self {
165
+ self.inner
166
+ .clone()
167
+ .map(
168
+ |s| Ok(Some(s.duration()?.hours().into_series())),
169
+ GetOutput::from_type(DataType::Int64),
170
+ )
171
+ .into()
172
+ }
173
+
174
+ pub fn duration_minutes(&self) -> Self {
175
+ self.inner
176
+ .clone()
177
+ .map(
178
+ |s| Ok(Some(s.duration()?.minutes().into_series())),
179
+ GetOutput::from_type(DataType::Int64),
180
+ )
181
+ .into()
182
+ }
183
+
184
+ pub fn duration_seconds(&self) -> Self {
185
+ self.inner
186
+ .clone()
187
+ .map(
188
+ |s| Ok(Some(s.duration()?.seconds().into_series())),
189
+ GetOutput::from_type(DataType::Int64),
190
+ )
191
+ .into()
192
+ }
193
+
194
+ pub fn duration_milliseconds(&self) -> Self {
195
+ self.inner
196
+ .clone()
197
+ .map(
198
+ |s| Ok(Some(s.duration()?.milliseconds().into_series())),
199
+ GetOutput::from_type(DataType::Int64),
200
+ )
201
+ .into()
202
+ }
203
+
204
+ pub fn duration_microseconds(&self) -> Self {
205
+ self.inner
206
+ .clone()
207
+ .map(
208
+ |s| Ok(Some(s.duration()?.microseconds().into_series())),
209
+ GetOutput::from_type(DataType::Int64),
210
+ )
211
+ .into()
212
+ }
213
+
214
+ pub fn duration_nanoseconds(&self) -> Self {
215
+ self.inner
216
+ .clone()
217
+ .map(
218
+ |s| Ok(Some(s.duration()?.nanoseconds().into_series())),
219
+ GetOutput::from_type(DataType::Int64),
220
+ )
221
+ .into()
222
+ }
223
+ }