polars-df 0.5.0 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (72) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +26 -0
  3. data/Cargo.lock +595 -709
  4. data/Cargo.toml +1 -0
  5. data/README.md +11 -9
  6. data/ext/polars/Cargo.toml +18 -10
  7. data/ext/polars/src/batched_csv.rs +26 -26
  8. data/ext/polars/src/conversion.rs +272 -136
  9. data/ext/polars/src/dataframe.rs +135 -94
  10. data/ext/polars/src/error.rs +8 -5
  11. data/ext/polars/src/expr/array.rs +15 -0
  12. data/ext/polars/src/expr/binary.rs +18 -6
  13. data/ext/polars/src/expr/datetime.rs +10 -12
  14. data/ext/polars/src/expr/general.rs +78 -264
  15. data/ext/polars/src/expr/list.rs +41 -28
  16. data/ext/polars/src/{expr.rs → expr/mod.rs} +5 -2
  17. data/ext/polars/src/expr/name.rs +44 -0
  18. data/ext/polars/src/expr/rolling.rs +196 -0
  19. data/ext/polars/src/expr/string.rs +94 -66
  20. data/ext/polars/src/file.rs +3 -3
  21. data/ext/polars/src/functions/aggregation.rs +35 -0
  22. data/ext/polars/src/functions/eager.rs +7 -31
  23. data/ext/polars/src/functions/io.rs +10 -10
  24. data/ext/polars/src/functions/lazy.rs +119 -54
  25. data/ext/polars/src/functions/meta.rs +30 -0
  26. data/ext/polars/src/functions/misc.rs +8 -0
  27. data/ext/polars/src/functions/mod.rs +5 -0
  28. data/ext/polars/src/functions/random.rs +6 -0
  29. data/ext/polars/src/functions/range.rs +46 -0
  30. data/ext/polars/src/functions/string_cache.rs +11 -0
  31. data/ext/polars/src/functions/whenthen.rs +7 -7
  32. data/ext/polars/src/lazyframe.rs +61 -44
  33. data/ext/polars/src/lib.rs +173 -84
  34. data/ext/polars/src/{apply → map}/dataframe.rs +28 -33
  35. data/ext/polars/src/{apply → map}/mod.rs +10 -6
  36. data/ext/polars/src/{apply → map}/series.rs +12 -16
  37. data/ext/polars/src/object.rs +2 -2
  38. data/ext/polars/src/rb_modules.rs +25 -6
  39. data/ext/polars/src/series/construction.rs +32 -6
  40. data/ext/polars/src/series/export.rs +2 -2
  41. data/ext/polars/src/series/set_at_idx.rs +33 -17
  42. data/ext/polars/src/series.rs +62 -42
  43. data/ext/polars/src/sql.rs +46 -0
  44. data/lib/polars/array_expr.rb +84 -0
  45. data/lib/polars/array_name_space.rb +77 -0
  46. data/lib/polars/batched_csv_reader.rb +1 -1
  47. data/lib/polars/config.rb +530 -0
  48. data/lib/polars/data_frame.rb +206 -131
  49. data/lib/polars/data_types.rb +163 -29
  50. data/lib/polars/date_time_expr.rb +13 -18
  51. data/lib/polars/date_time_name_space.rb +22 -28
  52. data/lib/polars/dynamic_group_by.rb +2 -2
  53. data/lib/polars/expr.rb +241 -151
  54. data/lib/polars/functions.rb +29 -38
  55. data/lib/polars/group_by.rb +38 -76
  56. data/lib/polars/io.rb +37 -2
  57. data/lib/polars/lazy_frame.rb +174 -95
  58. data/lib/polars/lazy_functions.rb +87 -63
  59. data/lib/polars/lazy_group_by.rb +7 -8
  60. data/lib/polars/list_expr.rb +40 -36
  61. data/lib/polars/list_name_space.rb +15 -15
  62. data/lib/polars/name_expr.rb +198 -0
  63. data/lib/polars/rolling_group_by.rb +6 -4
  64. data/lib/polars/series.rb +95 -28
  65. data/lib/polars/sql_context.rb +194 -0
  66. data/lib/polars/string_expr.rb +249 -69
  67. data/lib/polars/string_name_space.rb +155 -25
  68. data/lib/polars/utils.rb +119 -57
  69. data/lib/polars/version.rb +1 -1
  70. data/lib/polars.rb +6 -0
  71. metadata +21 -7
  72. /data/ext/polars/src/{apply → map}/lazy.rs +0 -0
@@ -1,12 +1,15 @@
1
- use magnus::{class, RArray, RString, Value};
1
+ use magnus::encoding::{self, EncodingCapable};
2
+ use magnus::{
3
+ class, prelude::*, typed_data::Obj, value::Opaque, Float, Integer, RArray, RString, Ruby, Value,
4
+ };
2
5
  use polars::lazy::dsl;
3
6
  use polars::prelude::*;
4
7
 
5
- use crate::apply::lazy::binary_lambda;
6
8
  use crate::conversion::{get_lf, get_rbseq, Wrap};
9
+ use crate::map::lazy::binary_lambda;
7
10
  use crate::prelude::vec_extract_wrapped;
8
11
  use crate::rb_exprs_to_exprs;
9
- use crate::{RbDataFrame, RbExpr, RbLazyFrame, RbPolarsErr, RbResult, RbSeries};
12
+ use crate::{RbDataFrame, RbExpr, RbLazyFrame, RbPolarsErr, RbResult, RbSeries, RbValueError};
10
13
 
11
14
  macro_rules! set_unwrapped_or_0 {
12
15
  ($($var:ident),+ $(,)?) => {
@@ -14,10 +17,6 @@ macro_rules! set_unwrapped_or_0 {
14
17
  };
15
18
  }
16
19
 
17
- pub fn arange(low: &RbExpr, high: &RbExpr, step: i64) -> RbExpr {
18
- dsl::arange(low.inner.clone(), high.inner.clone(), step).into()
19
- }
20
-
21
20
  pub fn arg_sort_by(by: RArray, descending: Vec<bool>) -> RbResult<RbExpr> {
22
21
  let by = rb_exprs_to_exprs(by)?;
23
22
  Ok(dsl::arg_sort_by(by, &descending).into())
@@ -29,7 +28,7 @@ pub fn arg_where(condition: &RbExpr) -> RbExpr {
29
28
 
30
29
  pub fn as_struct(exprs: RArray) -> RbResult<RbExpr> {
31
30
  let exprs = rb_exprs_to_exprs(exprs)?;
32
- Ok(dsl::as_struct(&exprs).into())
31
+ Ok(dsl::as_struct(exprs).into())
33
32
  }
34
33
 
35
34
  pub fn coalesce(exprs: RArray) -> RbResult<RbExpr> {
@@ -44,7 +43,7 @@ pub fn col(name: String) -> RbExpr {
44
43
  pub fn collect_all(lfs: RArray) -> RbResult<RArray> {
45
44
  let lfs = lfs
46
45
  .each()
47
- .map(|v| v?.try_convert::<&RbLazyFrame>())
46
+ .map(|v| <&RbLazyFrame>::try_convert(v?))
48
47
  .collect::<RbResult<Vec<&RbLazyFrame>>>()?;
49
48
 
50
49
  Ok(RArray::from_iter(lfs.iter().map(|lf| {
@@ -57,7 +56,12 @@ pub fn cols(names: Vec<String>) -> RbExpr {
57
56
  dsl::cols(names).into()
58
57
  }
59
58
 
60
- pub fn concat_lf(lfs: Value, rechunk: bool, parallel: bool) -> RbResult<RbLazyFrame> {
59
+ pub fn concat_lf(
60
+ lfs: Value,
61
+ rechunk: bool,
62
+ parallel: bool,
63
+ to_supertypes: bool,
64
+ ) -> RbResult<RbLazyFrame> {
61
65
  let (seq, len) = get_rbseq(lfs)?;
62
66
  let mut lfs = Vec::with_capacity(len);
63
67
 
@@ -67,40 +71,77 @@ pub fn concat_lf(lfs: Value, rechunk: bool, parallel: bool) -> RbResult<RbLazyFr
67
71
  lfs.push(lf);
68
72
  }
69
73
 
70
- let lf = polars::lazy::dsl::concat(lfs, rechunk, parallel).map_err(RbPolarsErr::from)?;
74
+ let lf = dsl::concat(
75
+ lfs,
76
+ UnionArgs {
77
+ rechunk,
78
+ parallel,
79
+ to_supertypes,
80
+ },
81
+ )
82
+ .map_err(RbPolarsErr::from)?;
83
+ Ok(lf.into())
84
+ }
85
+
86
+ pub fn concat_lf_diagonal(
87
+ lfs: RArray,
88
+ rechunk: bool,
89
+ parallel: bool,
90
+ to_supertypes: bool,
91
+ ) -> RbResult<RbLazyFrame> {
92
+ let iter = lfs.each();
93
+
94
+ let lfs = iter
95
+ .map(|item| {
96
+ let item = item?;
97
+ get_lf(item)
98
+ })
99
+ .collect::<RbResult<Vec<_>>>()?;
100
+
101
+ let lf = dsl::functions::concat_lf_diagonal(
102
+ lfs,
103
+ UnionArgs {
104
+ rechunk,
105
+ parallel,
106
+ to_supertypes,
107
+ },
108
+ )
109
+ .map_err(RbPolarsErr::from)?;
71
110
  Ok(lf.into())
72
111
  }
73
112
 
74
113
  #[allow(clippy::too_many_arguments)]
75
114
  pub fn duration(
115
+ weeks: Option<&RbExpr>,
76
116
  days: Option<&RbExpr>,
117
+ hours: Option<&RbExpr>,
118
+ minutes: Option<&RbExpr>,
77
119
  seconds: Option<&RbExpr>,
78
- nanoseconds: Option<&RbExpr>,
79
- microseconds: Option<&RbExpr>,
80
120
  milliseconds: Option<&RbExpr>,
81
- minutes: Option<&RbExpr>,
82
- hours: Option<&RbExpr>,
83
- weeks: Option<&RbExpr>,
121
+ microseconds: Option<&RbExpr>,
122
+ nanoseconds: Option<&RbExpr>,
123
+ time_unit: Wrap<TimeUnit>,
84
124
  ) -> RbExpr {
85
125
  set_unwrapped_or_0!(
126
+ weeks,
86
127
  days,
128
+ hours,
129
+ minutes,
87
130
  seconds,
88
- nanoseconds,
89
- microseconds,
90
131
  milliseconds,
91
- minutes,
92
- hours,
93
- weeks,
132
+ microseconds,
133
+ nanoseconds,
94
134
  );
95
135
  let args = DurationArgs {
136
+ weeks,
96
137
  days,
138
+ hours,
139
+ minutes,
97
140
  seconds,
98
- nanoseconds,
99
- microseconds,
100
141
  milliseconds,
101
- minutes,
102
- hours,
103
- weeks,
142
+ microseconds,
143
+ nanoseconds,
144
+ time_unit: time_unit.0,
104
145
  };
105
146
  dsl::duration(args).into()
106
147
  }
@@ -123,28 +164,27 @@ pub fn dtype_cols(dtypes: Vec<DataType>) -> RbExpr {
123
164
 
124
165
  pub fn fold(acc: &RbExpr, lambda: Value, exprs: RArray) -> RbResult<RbExpr> {
125
166
  let exprs = rb_exprs_to_exprs(exprs)?;
167
+ let lambda = Opaque::from(lambda);
126
168
 
127
- let func = move |a: Series, b: Series| binary_lambda(lambda, a, b);
169
+ let func =
170
+ move |a: Series, b: Series| binary_lambda(Ruby::get().unwrap().get_inner(lambda), a, b);
128
171
  Ok(polars::lazy::dsl::fold_exprs(acc.inner.clone(), func, exprs).into())
129
172
  }
130
173
 
131
174
  pub fn cumfold(acc: &RbExpr, lambda: Value, exprs: RArray, include_init: bool) -> RbResult<RbExpr> {
132
175
  let exprs = rb_exprs_to_exprs(exprs)?;
176
+ let lambda = Opaque::from(lambda);
133
177
 
134
- let func = move |a: Series, b: Series| binary_lambda(lambda, a, b);
135
- Ok(polars::lazy::dsl::cumfold_exprs(acc.inner.clone(), func, exprs, include_init).into())
178
+ let func =
179
+ move |a: Series, b: Series| binary_lambda(Ruby::get().unwrap().get_inner(lambda), a, b);
180
+ Ok(polars::lazy::dsl::cum_fold_exprs(acc.inner.clone(), func, exprs, include_init).into())
136
181
  }
137
182
 
138
- // TODO improve
139
- pub fn lit(value: Value) -> RbResult<RbExpr> {
140
- if value.is_nil() {
141
- Ok(dsl::lit(Null {}).into())
142
- } else if let Ok(series) = value.try_convert::<&RbSeries>() {
143
- Ok(dsl::lit(series.series.borrow().clone()).into())
144
- } else if let Some(v) = RString::from_value(value) {
145
- Ok(dsl::lit(v.try_convert::<String>()?).into())
146
- } else if value.is_kind_of(class::integer()) {
147
- match value.try_convert::<i64>() {
183
+ pub fn lit(value: Value, allow_object: bool) -> RbResult<RbExpr> {
184
+ if value.is_kind_of(class::true_class()) || value.is_kind_of(class::false_class()) {
185
+ Ok(dsl::lit(bool::try_convert(value)?).into())
186
+ } else if let Some(v) = Integer::from_value(value) {
187
+ match v.to_i64() {
148
188
  Ok(val) => {
149
189
  if val > 0 && val < i32::MAX as i64 || val < 0 && val > i32::MIN as i64 {
150
190
  Ok(dsl::lit(val as i32).into())
@@ -153,21 +193,51 @@ pub fn lit(value: Value) -> RbResult<RbExpr> {
153
193
  }
154
194
  }
155
195
  _ => {
156
- let val = value.try_convert::<u64>()?;
196
+ let val = v.to_u64()?;
157
197
  Ok(dsl::lit(val).into())
158
198
  }
159
199
  }
200
+ } else if let Some(v) = Float::from_value(value) {
201
+ Ok(dsl::lit(v.to_f64()).into())
202
+ } else if let Some(v) = RString::from_value(value) {
203
+ if v.enc_get() == encoding::Index::utf8() {
204
+ Ok(dsl::lit(v.to_string()?).into())
205
+ } else {
206
+ Ok(dsl::lit(unsafe { v.as_slice() }).into())
207
+ }
208
+ } else if let Ok(series) = Obj::<RbSeries>::try_convert(value) {
209
+ Ok(dsl::lit(series.series.borrow().clone()).into())
210
+ } else if value.is_nil() {
211
+ Ok(dsl::lit(Null {}).into())
212
+ } else if allow_object {
213
+ todo!()
160
214
  } else {
161
- Ok(dsl::lit(value.try_convert::<f64>()?).into())
215
+ Err(RbValueError::new_err(format!(
216
+ "could not convert value {:?} as a Literal",
217
+ value.to_string()
218
+ )))
162
219
  }
163
220
  }
164
221
 
165
- pub fn repeat(value: Value, n_times: &RbExpr) -> RbResult<RbExpr> {
166
- if value.is_nil() {
167
- Ok(polars::lazy::dsl::repeat(Null {}, n_times.inner.clone()).into())
168
- } else {
169
- todo!();
222
+ pub fn repeat(value: &RbExpr, n: &RbExpr, dtype: Option<Wrap<DataType>>) -> RbResult<RbExpr> {
223
+ let mut value = value.inner.clone();
224
+ let n = n.inner.clone();
225
+
226
+ if let Some(dtype) = dtype {
227
+ value = value.cast(dtype.0);
228
+ }
229
+
230
+ if let Expr::Literal(lv) = &value {
231
+ let av = lv.to_anyvalue().unwrap();
232
+ // Integer inputs that fit in Int32 are parsed as such
233
+ if let DataType::Int64 = av.dtype() {
234
+ let int_value = av.try_extract::<i64>().unwrap();
235
+ if int_value >= i32::MIN as i64 && int_value <= i32::MAX as i64 {
236
+ value = value.cast(DataType::Int32);
237
+ }
238
+ }
170
239
  }
240
+ Ok(dsl::repeat(value, n).into())
171
241
  }
172
242
 
173
243
  pub fn pearson_corr(a: &RbExpr, b: &RbExpr, ddof: u8) -> RbExpr {
@@ -179,8 +249,8 @@ pub fn spearman_rank_corr(a: &RbExpr, b: &RbExpr, ddof: u8, propagate_nans: bool
179
249
  .into()
180
250
  }
181
251
 
182
- pub fn cov(a: &RbExpr, b: &RbExpr) -> RbExpr {
183
- polars::lazy::dsl::cov(a.inner.clone(), b.inner.clone()).into()
252
+ pub fn cov(a: &RbExpr, b: &RbExpr, ddof: u8) -> RbExpr {
253
+ polars::lazy::dsl::cov(a.inner.clone(), b.inner.clone(), ddof).into()
184
254
  }
185
255
 
186
256
  pub fn concat_str(s: RArray, sep: String) -> RbResult<RbExpr> {
@@ -197,13 +267,8 @@ pub fn concat_lst(s: RArray) -> RbResult<RbExpr> {
197
267
  pub fn dtype_cols2(dtypes: RArray) -> RbResult<RbExpr> {
198
268
  let dtypes = dtypes
199
269
  .each()
200
- .map(|v| v?.try_convert::<Wrap<DataType>>())
270
+ .map(|v| Wrap::<DataType>::try_convert(v?))
201
271
  .collect::<RbResult<Vec<Wrap<DataType>>>>()?;
202
272
  let dtypes = vec_extract_wrapped(dtypes);
203
273
  Ok(crate::functions::lazy::dtype_cols(dtypes))
204
274
  }
205
-
206
- pub fn sum_exprs(exprs: RArray) -> RbResult<RbExpr> {
207
- let exprs = rb_exprs_to_exprs(exprs)?;
208
- Ok(polars::lazy::dsl::sum_exprs(exprs).into())
209
- }
@@ -1,8 +1,38 @@
1
1
  use magnus::{IntoValue, Value};
2
+ use polars_core;
3
+ use polars_core::fmt::FloatFmt;
2
4
  use polars_core::prelude::IDX_DTYPE;
5
+ use polars_core::POOL;
3
6
 
4
7
  use crate::conversion::Wrap;
8
+ use crate::{RbResult, RbValueError};
5
9
 
6
10
  pub fn get_idx_type() -> Value {
7
11
  Wrap(IDX_DTYPE).into_value()
8
12
  }
13
+
14
+ pub fn threadpool_size() -> usize {
15
+ POOL.current_num_threads()
16
+ }
17
+
18
+ pub fn set_float_fmt(fmt: String) -> RbResult<()> {
19
+ let fmt = match fmt.as_str() {
20
+ "full" => FloatFmt::Full,
21
+ "mixed" => FloatFmt::Mixed,
22
+ e => {
23
+ return Err(RbValueError::new_err(format!(
24
+ "fmt must be one of {{'full', 'mixed'}}, got {e}",
25
+ )))
26
+ }
27
+ };
28
+ polars_core::fmt::set_float_fmt(fmt);
29
+ Ok(())
30
+ }
31
+
32
+ pub fn get_float_fmt() -> RbResult<String> {
33
+ let strfmt = match polars_core::fmt::get_float_fmt() {
34
+ FloatFmt::Full => "full",
35
+ FloatFmt::Mixed => "mixed",
36
+ };
37
+ Ok(strfmt.to_string())
38
+ }
@@ -0,0 +1,8 @@
1
+ use crate::conversion::Wrap;
2
+ use crate::prelude::DataType;
3
+ use crate::RbResult;
4
+
5
+ pub fn dtype_str_repr(dtype: Wrap<DataType>) -> RbResult<String> {
6
+ let dtype = dtype.0;
7
+ Ok(dtype.to_string())
8
+ }
@@ -1,5 +1,10 @@
1
+ pub mod aggregation;
1
2
  pub mod eager;
2
3
  pub mod io;
3
4
  pub mod lazy;
4
5
  pub mod meta;
6
+ pub mod misc;
7
+ pub mod random;
8
+ pub mod range;
9
+ pub mod string_cache;
5
10
  pub mod whenthen;
@@ -0,0 +1,6 @@
1
+ use crate::RbResult;
2
+
3
+ pub fn set_random_seed(seed: u64) -> RbResult<()> {
4
+ polars_core::random::set_global_random_seed(seed);
5
+ Ok(())
6
+ }
@@ -0,0 +1,46 @@
1
+ use polars::lazy::dsl;
2
+ use polars_core::datatypes::{TimeUnit, TimeZone};
3
+
4
+ use crate::conversion::Wrap;
5
+ use crate::prelude::*;
6
+ use crate::RbExpr;
7
+
8
+ pub fn int_range(start: &RbExpr, end: &RbExpr, step: i64, dtype: Wrap<DataType>) -> RbExpr {
9
+ let dtype = dtype.0;
10
+
11
+ let mut result = dsl::int_range(start.inner.clone(), end.inner.clone(), step);
12
+
13
+ if dtype != DataType::Int64 {
14
+ result = result.cast(dtype)
15
+ }
16
+
17
+ result.into()
18
+ }
19
+
20
+ pub fn int_ranges(start: &RbExpr, end: &RbExpr, step: i64, dtype: Wrap<DataType>) -> RbExpr {
21
+ let dtype = dtype.0;
22
+
23
+ let mut result = dsl::int_ranges(start.inner.clone(), end.inner.clone(), step);
24
+
25
+ if dtype != DataType::Int64 {
26
+ result = result.cast(DataType::List(Box::new(dtype)))
27
+ }
28
+
29
+ result.into()
30
+ }
31
+
32
+ pub fn date_range(
33
+ start: &RbExpr,
34
+ end: &RbExpr,
35
+ every: String,
36
+ closed: Wrap<ClosedWindow>,
37
+ time_unit: Option<Wrap<TimeUnit>>,
38
+ time_zone: Option<TimeZone>,
39
+ ) -> RbExpr {
40
+ let start = start.inner.clone();
41
+ let end = end.inner.clone();
42
+ let every = Duration::parse(&every);
43
+ let closed = closed.0;
44
+ let time_unit = time_unit.map(|x| x.0);
45
+ dsl::date_range(start, end, every, closed, time_unit, time_zone).into()
46
+ }
@@ -0,0 +1,11 @@
1
+ pub fn enable_string_cache() {
2
+ polars_core::enable_string_cache()
3
+ }
4
+
5
+ pub fn disable_string_cache() {
6
+ polars_core::disable_string_cache()
7
+ }
8
+
9
+ pub fn using_string_cache() -> bool {
10
+ polars_core::using_string_cache()
11
+ }
@@ -16,23 +16,23 @@ impl From<dsl::When> for RbWhen {
16
16
 
17
17
  #[magnus::wrap(class = "Polars::RbWhenThen")]
18
18
  #[derive(Clone)]
19
- pub struct RbWhenThen {
20
- pub inner: dsl::WhenThen,
19
+ pub struct RbThen {
20
+ pub inner: dsl::Then,
21
21
  }
22
22
 
23
- impl From<dsl::WhenThen> for RbWhenThen {
24
- fn from(inner: dsl::WhenThen) -> Self {
25
- RbWhenThen { inner }
23
+ impl From<dsl::Then> for RbThen {
24
+ fn from(inner: dsl::Then) -> Self {
25
+ RbThen { inner }
26
26
  }
27
27
  }
28
28
 
29
29
  impl RbWhen {
30
- pub fn then(&self, expr: &RbExpr) -> RbWhenThen {
30
+ pub fn then(&self, expr: &RbExpr) -> RbThen {
31
31
  self.inner.clone().then(expr.inner.clone()).into()
32
32
  }
33
33
  }
34
34
 
35
- impl RbWhenThen {
35
+ impl RbThen {
36
36
  pub fn overwise(&self, expr: &RbExpr) -> RbExpr {
37
37
  self.inner.clone().otherwise(expr.inner.clone()).into()
38
38
  }