polars-df 0.5.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +26 -0
  3. data/Cargo.lock +595 -709
  4. data/Cargo.toml +1 -0
  5. data/README.md +11 -9
  6. data/ext/polars/Cargo.toml +18 -10
  7. data/ext/polars/src/batched_csv.rs +26 -26
  8. data/ext/polars/src/conversion.rs +272 -136
  9. data/ext/polars/src/dataframe.rs +135 -94
  10. data/ext/polars/src/error.rs +8 -5
  11. data/ext/polars/src/expr/array.rs +15 -0
  12. data/ext/polars/src/expr/binary.rs +18 -6
  13. data/ext/polars/src/expr/datetime.rs +10 -12
  14. data/ext/polars/src/expr/general.rs +78 -264
  15. data/ext/polars/src/expr/list.rs +41 -28
  16. data/ext/polars/src/{expr.rs → expr/mod.rs} +5 -2
  17. data/ext/polars/src/expr/name.rs +44 -0
  18. data/ext/polars/src/expr/rolling.rs +196 -0
  19. data/ext/polars/src/expr/string.rs +94 -66
  20. data/ext/polars/src/file.rs +3 -3
  21. data/ext/polars/src/functions/aggregation.rs +35 -0
  22. data/ext/polars/src/functions/eager.rs +7 -31
  23. data/ext/polars/src/functions/io.rs +10 -10
  24. data/ext/polars/src/functions/lazy.rs +119 -54
  25. data/ext/polars/src/functions/meta.rs +30 -0
  26. data/ext/polars/src/functions/misc.rs +8 -0
  27. data/ext/polars/src/functions/mod.rs +5 -0
  28. data/ext/polars/src/functions/random.rs +6 -0
  29. data/ext/polars/src/functions/range.rs +46 -0
  30. data/ext/polars/src/functions/string_cache.rs +11 -0
  31. data/ext/polars/src/functions/whenthen.rs +7 -7
  32. data/ext/polars/src/lazyframe.rs +61 -44
  33. data/ext/polars/src/lib.rs +173 -84
  34. data/ext/polars/src/{apply → map}/dataframe.rs +28 -33
  35. data/ext/polars/src/{apply → map}/mod.rs +10 -6
  36. data/ext/polars/src/{apply → map}/series.rs +12 -16
  37. data/ext/polars/src/object.rs +2 -2
  38. data/ext/polars/src/rb_modules.rs +25 -6
  39. data/ext/polars/src/series/construction.rs +32 -6
  40. data/ext/polars/src/series/export.rs +2 -2
  41. data/ext/polars/src/series/set_at_idx.rs +33 -17
  42. data/ext/polars/src/series.rs +62 -42
  43. data/ext/polars/src/sql.rs +46 -0
  44. data/lib/polars/array_expr.rb +84 -0
  45. data/lib/polars/array_name_space.rb +77 -0
  46. data/lib/polars/batched_csv_reader.rb +1 -1
  47. data/lib/polars/config.rb +530 -0
  48. data/lib/polars/data_frame.rb +206 -131
  49. data/lib/polars/data_types.rb +163 -29
  50. data/lib/polars/date_time_expr.rb +13 -18
  51. data/lib/polars/date_time_name_space.rb +22 -28
  52. data/lib/polars/dynamic_group_by.rb +2 -2
  53. data/lib/polars/expr.rb +241 -151
  54. data/lib/polars/functions.rb +29 -38
  55. data/lib/polars/group_by.rb +38 -76
  56. data/lib/polars/io.rb +37 -2
  57. data/lib/polars/lazy_frame.rb +174 -95
  58. data/lib/polars/lazy_functions.rb +87 -63
  59. data/lib/polars/lazy_group_by.rb +7 -8
  60. data/lib/polars/list_expr.rb +40 -36
  61. data/lib/polars/list_name_space.rb +15 -15
  62. data/lib/polars/name_expr.rb +198 -0
  63. data/lib/polars/rolling_group_by.rb +6 -4
  64. data/lib/polars/series.rb +95 -28
  65. data/lib/polars/sql_context.rb +194 -0
  66. data/lib/polars/string_expr.rb +249 -69
  67. data/lib/polars/string_name_space.rb +155 -25
  68. data/lib/polars/utils.rb +119 -57
  69. data/lib/polars/version.rb +1 -1
  70. data/lib/polars.rb +6 -0
  71. metadata +21 -7
  72. /data/ext/polars/src/{apply → map}/lazy.rs +0 -0
@@ -1,12 +1,15 @@
1
- use magnus::{class, RArray, RString, Value};
1
+ use magnus::encoding::{self, EncodingCapable};
2
+ use magnus::{
3
+ class, prelude::*, typed_data::Obj, value::Opaque, Float, Integer, RArray, RString, Ruby, Value,
4
+ };
2
5
  use polars::lazy::dsl;
3
6
  use polars::prelude::*;
4
7
 
5
- use crate::apply::lazy::binary_lambda;
6
8
  use crate::conversion::{get_lf, get_rbseq, Wrap};
9
+ use crate::map::lazy::binary_lambda;
7
10
  use crate::prelude::vec_extract_wrapped;
8
11
  use crate::rb_exprs_to_exprs;
9
- use crate::{RbDataFrame, RbExpr, RbLazyFrame, RbPolarsErr, RbResult, RbSeries};
12
+ use crate::{RbDataFrame, RbExpr, RbLazyFrame, RbPolarsErr, RbResult, RbSeries, RbValueError};
10
13
 
11
14
  macro_rules! set_unwrapped_or_0 {
12
15
  ($($var:ident),+ $(,)?) => {
@@ -14,10 +17,6 @@ macro_rules! set_unwrapped_or_0 {
14
17
  };
15
18
  }
16
19
 
17
- pub fn arange(low: &RbExpr, high: &RbExpr, step: i64) -> RbExpr {
18
- dsl::arange(low.inner.clone(), high.inner.clone(), step).into()
19
- }
20
-
21
20
  pub fn arg_sort_by(by: RArray, descending: Vec<bool>) -> RbResult<RbExpr> {
22
21
  let by = rb_exprs_to_exprs(by)?;
23
22
  Ok(dsl::arg_sort_by(by, &descending).into())
@@ -29,7 +28,7 @@ pub fn arg_where(condition: &RbExpr) -> RbExpr {
29
28
 
30
29
  pub fn as_struct(exprs: RArray) -> RbResult<RbExpr> {
31
30
  let exprs = rb_exprs_to_exprs(exprs)?;
32
- Ok(dsl::as_struct(&exprs).into())
31
+ Ok(dsl::as_struct(exprs).into())
33
32
  }
34
33
 
35
34
  pub fn coalesce(exprs: RArray) -> RbResult<RbExpr> {
@@ -44,7 +43,7 @@ pub fn col(name: String) -> RbExpr {
44
43
  pub fn collect_all(lfs: RArray) -> RbResult<RArray> {
45
44
  let lfs = lfs
46
45
  .each()
47
- .map(|v| v?.try_convert::<&RbLazyFrame>())
46
+ .map(|v| <&RbLazyFrame>::try_convert(v?))
48
47
  .collect::<RbResult<Vec<&RbLazyFrame>>>()?;
49
48
 
50
49
  Ok(RArray::from_iter(lfs.iter().map(|lf| {
@@ -57,7 +56,12 @@ pub fn cols(names: Vec<String>) -> RbExpr {
57
56
  dsl::cols(names).into()
58
57
  }
59
58
 
60
- pub fn concat_lf(lfs: Value, rechunk: bool, parallel: bool) -> RbResult<RbLazyFrame> {
59
+ pub fn concat_lf(
60
+ lfs: Value,
61
+ rechunk: bool,
62
+ parallel: bool,
63
+ to_supertypes: bool,
64
+ ) -> RbResult<RbLazyFrame> {
61
65
  let (seq, len) = get_rbseq(lfs)?;
62
66
  let mut lfs = Vec::with_capacity(len);
63
67
 
@@ -67,40 +71,77 @@ pub fn concat_lf(lfs: Value, rechunk: bool, parallel: bool) -> RbResult<RbLazyFr
67
71
  lfs.push(lf);
68
72
  }
69
73
 
70
- let lf = polars::lazy::dsl::concat(lfs, rechunk, parallel).map_err(RbPolarsErr::from)?;
74
+ let lf = dsl::concat(
75
+ lfs,
76
+ UnionArgs {
77
+ rechunk,
78
+ parallel,
79
+ to_supertypes,
80
+ },
81
+ )
82
+ .map_err(RbPolarsErr::from)?;
83
+ Ok(lf.into())
84
+ }
85
+
86
+ pub fn concat_lf_diagonal(
87
+ lfs: RArray,
88
+ rechunk: bool,
89
+ parallel: bool,
90
+ to_supertypes: bool,
91
+ ) -> RbResult<RbLazyFrame> {
92
+ let iter = lfs.each();
93
+
94
+ let lfs = iter
95
+ .map(|item| {
96
+ let item = item?;
97
+ get_lf(item)
98
+ })
99
+ .collect::<RbResult<Vec<_>>>()?;
100
+
101
+ let lf = dsl::functions::concat_lf_diagonal(
102
+ lfs,
103
+ UnionArgs {
104
+ rechunk,
105
+ parallel,
106
+ to_supertypes,
107
+ },
108
+ )
109
+ .map_err(RbPolarsErr::from)?;
71
110
  Ok(lf.into())
72
111
  }
73
112
 
74
113
  #[allow(clippy::too_many_arguments)]
75
114
  pub fn duration(
115
+ weeks: Option<&RbExpr>,
76
116
  days: Option<&RbExpr>,
117
+ hours: Option<&RbExpr>,
118
+ minutes: Option<&RbExpr>,
77
119
  seconds: Option<&RbExpr>,
78
- nanoseconds: Option<&RbExpr>,
79
- microseconds: Option<&RbExpr>,
80
120
  milliseconds: Option<&RbExpr>,
81
- minutes: Option<&RbExpr>,
82
- hours: Option<&RbExpr>,
83
- weeks: Option<&RbExpr>,
121
+ microseconds: Option<&RbExpr>,
122
+ nanoseconds: Option<&RbExpr>,
123
+ time_unit: Wrap<TimeUnit>,
84
124
  ) -> RbExpr {
85
125
  set_unwrapped_or_0!(
126
+ weeks,
86
127
  days,
128
+ hours,
129
+ minutes,
87
130
  seconds,
88
- nanoseconds,
89
- microseconds,
90
131
  milliseconds,
91
- minutes,
92
- hours,
93
- weeks,
132
+ microseconds,
133
+ nanoseconds,
94
134
  );
95
135
  let args = DurationArgs {
136
+ weeks,
96
137
  days,
138
+ hours,
139
+ minutes,
97
140
  seconds,
98
- nanoseconds,
99
- microseconds,
100
141
  milliseconds,
101
- minutes,
102
- hours,
103
- weeks,
142
+ microseconds,
143
+ nanoseconds,
144
+ time_unit: time_unit.0,
104
145
  };
105
146
  dsl::duration(args).into()
106
147
  }
@@ -123,28 +164,27 @@ pub fn dtype_cols(dtypes: Vec<DataType>) -> RbExpr {
123
164
 
124
165
  pub fn fold(acc: &RbExpr, lambda: Value, exprs: RArray) -> RbResult<RbExpr> {
125
166
  let exprs = rb_exprs_to_exprs(exprs)?;
167
+ let lambda = Opaque::from(lambda);
126
168
 
127
- let func = move |a: Series, b: Series| binary_lambda(lambda, a, b);
169
+ let func =
170
+ move |a: Series, b: Series| binary_lambda(Ruby::get().unwrap().get_inner(lambda), a, b);
128
171
  Ok(polars::lazy::dsl::fold_exprs(acc.inner.clone(), func, exprs).into())
129
172
  }
130
173
 
131
174
  pub fn cumfold(acc: &RbExpr, lambda: Value, exprs: RArray, include_init: bool) -> RbResult<RbExpr> {
132
175
  let exprs = rb_exprs_to_exprs(exprs)?;
176
+ let lambda = Opaque::from(lambda);
133
177
 
134
- let func = move |a: Series, b: Series| binary_lambda(lambda, a, b);
135
- Ok(polars::lazy::dsl::cumfold_exprs(acc.inner.clone(), func, exprs, include_init).into())
178
+ let func =
179
+ move |a: Series, b: Series| binary_lambda(Ruby::get().unwrap().get_inner(lambda), a, b);
180
+ Ok(polars::lazy::dsl::cum_fold_exprs(acc.inner.clone(), func, exprs, include_init).into())
136
181
  }
137
182
 
138
- // TODO improve
139
- pub fn lit(value: Value) -> RbResult<RbExpr> {
140
- if value.is_nil() {
141
- Ok(dsl::lit(Null {}).into())
142
- } else if let Ok(series) = value.try_convert::<&RbSeries>() {
143
- Ok(dsl::lit(series.series.borrow().clone()).into())
144
- } else if let Some(v) = RString::from_value(value) {
145
- Ok(dsl::lit(v.try_convert::<String>()?).into())
146
- } else if value.is_kind_of(class::integer()) {
147
- match value.try_convert::<i64>() {
183
+ pub fn lit(value: Value, allow_object: bool) -> RbResult<RbExpr> {
184
+ if value.is_kind_of(class::true_class()) || value.is_kind_of(class::false_class()) {
185
+ Ok(dsl::lit(bool::try_convert(value)?).into())
186
+ } else if let Some(v) = Integer::from_value(value) {
187
+ match v.to_i64() {
148
188
  Ok(val) => {
149
189
  if val > 0 && val < i32::MAX as i64 || val < 0 && val > i32::MIN as i64 {
150
190
  Ok(dsl::lit(val as i32).into())
@@ -153,21 +193,51 @@ pub fn lit(value: Value) -> RbResult<RbExpr> {
153
193
  }
154
194
  }
155
195
  _ => {
156
- let val = value.try_convert::<u64>()?;
196
+ let val = v.to_u64()?;
157
197
  Ok(dsl::lit(val).into())
158
198
  }
159
199
  }
200
+ } else if let Some(v) = Float::from_value(value) {
201
+ Ok(dsl::lit(v.to_f64()).into())
202
+ } else if let Some(v) = RString::from_value(value) {
203
+ if v.enc_get() == encoding::Index::utf8() {
204
+ Ok(dsl::lit(v.to_string()?).into())
205
+ } else {
206
+ Ok(dsl::lit(unsafe { v.as_slice() }).into())
207
+ }
208
+ } else if let Ok(series) = Obj::<RbSeries>::try_convert(value) {
209
+ Ok(dsl::lit(series.series.borrow().clone()).into())
210
+ } else if value.is_nil() {
211
+ Ok(dsl::lit(Null {}).into())
212
+ } else if allow_object {
213
+ todo!()
160
214
  } else {
161
- Ok(dsl::lit(value.try_convert::<f64>()?).into())
215
+ Err(RbValueError::new_err(format!(
216
+ "could not convert value {:?} as a Literal",
217
+ value.to_string()
218
+ )))
162
219
  }
163
220
  }
164
221
 
165
- pub fn repeat(value: Value, n_times: &RbExpr) -> RbResult<RbExpr> {
166
- if value.is_nil() {
167
- Ok(polars::lazy::dsl::repeat(Null {}, n_times.inner.clone()).into())
168
- } else {
169
- todo!();
222
+ pub fn repeat(value: &RbExpr, n: &RbExpr, dtype: Option<Wrap<DataType>>) -> RbResult<RbExpr> {
223
+ let mut value = value.inner.clone();
224
+ let n = n.inner.clone();
225
+
226
+ if let Some(dtype) = dtype {
227
+ value = value.cast(dtype.0);
228
+ }
229
+
230
+ if let Expr::Literal(lv) = &value {
231
+ let av = lv.to_anyvalue().unwrap();
232
+ // Integer inputs that fit in Int32 are parsed as such
233
+ if let DataType::Int64 = av.dtype() {
234
+ let int_value = av.try_extract::<i64>().unwrap();
235
+ if int_value >= i32::MIN as i64 && int_value <= i32::MAX as i64 {
236
+ value = value.cast(DataType::Int32);
237
+ }
238
+ }
170
239
  }
240
+ Ok(dsl::repeat(value, n).into())
171
241
  }
172
242
 
173
243
  pub fn pearson_corr(a: &RbExpr, b: &RbExpr, ddof: u8) -> RbExpr {
@@ -179,8 +249,8 @@ pub fn spearman_rank_corr(a: &RbExpr, b: &RbExpr, ddof: u8, propagate_nans: bool
179
249
  .into()
180
250
  }
181
251
 
182
- pub fn cov(a: &RbExpr, b: &RbExpr) -> RbExpr {
183
- polars::lazy::dsl::cov(a.inner.clone(), b.inner.clone()).into()
252
+ pub fn cov(a: &RbExpr, b: &RbExpr, ddof: u8) -> RbExpr {
253
+ polars::lazy::dsl::cov(a.inner.clone(), b.inner.clone(), ddof).into()
184
254
  }
185
255
 
186
256
  pub fn concat_str(s: RArray, sep: String) -> RbResult<RbExpr> {
@@ -197,13 +267,8 @@ pub fn concat_lst(s: RArray) -> RbResult<RbExpr> {
197
267
  pub fn dtype_cols2(dtypes: RArray) -> RbResult<RbExpr> {
198
268
  let dtypes = dtypes
199
269
  .each()
200
- .map(|v| v?.try_convert::<Wrap<DataType>>())
270
+ .map(|v| Wrap::<DataType>::try_convert(v?))
201
271
  .collect::<RbResult<Vec<Wrap<DataType>>>>()?;
202
272
  let dtypes = vec_extract_wrapped(dtypes);
203
273
  Ok(crate::functions::lazy::dtype_cols(dtypes))
204
274
  }
205
-
206
- pub fn sum_exprs(exprs: RArray) -> RbResult<RbExpr> {
207
- let exprs = rb_exprs_to_exprs(exprs)?;
208
- Ok(polars::lazy::dsl::sum_exprs(exprs).into())
209
- }
@@ -1,8 +1,38 @@
1
1
  use magnus::{IntoValue, Value};
2
+ use polars_core;
3
+ use polars_core::fmt::FloatFmt;
2
4
  use polars_core::prelude::IDX_DTYPE;
5
+ use polars_core::POOL;
3
6
 
4
7
  use crate::conversion::Wrap;
8
+ use crate::{RbResult, RbValueError};
5
9
 
6
10
  pub fn get_idx_type() -> Value {
7
11
  Wrap(IDX_DTYPE).into_value()
8
12
  }
13
+
14
+ pub fn threadpool_size() -> usize {
15
+ POOL.current_num_threads()
16
+ }
17
+
18
+ pub fn set_float_fmt(fmt: String) -> RbResult<()> {
19
+ let fmt = match fmt.as_str() {
20
+ "full" => FloatFmt::Full,
21
+ "mixed" => FloatFmt::Mixed,
22
+ e => {
23
+ return Err(RbValueError::new_err(format!(
24
+ "fmt must be one of {{'full', 'mixed'}}, got {e}",
25
+ )))
26
+ }
27
+ };
28
+ polars_core::fmt::set_float_fmt(fmt);
29
+ Ok(())
30
+ }
31
+
32
+ pub fn get_float_fmt() -> RbResult<String> {
33
+ let strfmt = match polars_core::fmt::get_float_fmt() {
34
+ FloatFmt::Full => "full",
35
+ FloatFmt::Mixed => "mixed",
36
+ };
37
+ Ok(strfmt.to_string())
38
+ }
@@ -0,0 +1,8 @@
1
+ use crate::conversion::Wrap;
2
+ use crate::prelude::DataType;
3
+ use crate::RbResult;
4
+
5
+ pub fn dtype_str_repr(dtype: Wrap<DataType>) -> RbResult<String> {
6
+ let dtype = dtype.0;
7
+ Ok(dtype.to_string())
8
+ }
@@ -1,5 +1,10 @@
1
+ pub mod aggregation;
1
2
  pub mod eager;
2
3
  pub mod io;
3
4
  pub mod lazy;
4
5
  pub mod meta;
6
+ pub mod misc;
7
+ pub mod random;
8
+ pub mod range;
9
+ pub mod string_cache;
5
10
  pub mod whenthen;
@@ -0,0 +1,6 @@
1
+ use crate::RbResult;
2
+
3
+ pub fn set_random_seed(seed: u64) -> RbResult<()> {
4
+ polars_core::random::set_global_random_seed(seed);
5
+ Ok(())
6
+ }
@@ -0,0 +1,46 @@
1
+ use polars::lazy::dsl;
2
+ use polars_core::datatypes::{TimeUnit, TimeZone};
3
+
4
+ use crate::conversion::Wrap;
5
+ use crate::prelude::*;
6
+ use crate::RbExpr;
7
+
8
+ pub fn int_range(start: &RbExpr, end: &RbExpr, step: i64, dtype: Wrap<DataType>) -> RbExpr {
9
+ let dtype = dtype.0;
10
+
11
+ let mut result = dsl::int_range(start.inner.clone(), end.inner.clone(), step);
12
+
13
+ if dtype != DataType::Int64 {
14
+ result = result.cast(dtype)
15
+ }
16
+
17
+ result.into()
18
+ }
19
+
20
+ pub fn int_ranges(start: &RbExpr, end: &RbExpr, step: i64, dtype: Wrap<DataType>) -> RbExpr {
21
+ let dtype = dtype.0;
22
+
23
+ let mut result = dsl::int_ranges(start.inner.clone(), end.inner.clone(), step);
24
+
25
+ if dtype != DataType::Int64 {
26
+ result = result.cast(DataType::List(Box::new(dtype)))
27
+ }
28
+
29
+ result.into()
30
+ }
31
+
32
+ pub fn date_range(
33
+ start: &RbExpr,
34
+ end: &RbExpr,
35
+ every: String,
36
+ closed: Wrap<ClosedWindow>,
37
+ time_unit: Option<Wrap<TimeUnit>>,
38
+ time_zone: Option<TimeZone>,
39
+ ) -> RbExpr {
40
+ let start = start.inner.clone();
41
+ let end = end.inner.clone();
42
+ let every = Duration::parse(&every);
43
+ let closed = closed.0;
44
+ let time_unit = time_unit.map(|x| x.0);
45
+ dsl::date_range(start, end, every, closed, time_unit, time_zone).into()
46
+ }
@@ -0,0 +1,11 @@
1
+ pub fn enable_string_cache() {
2
+ polars_core::enable_string_cache()
3
+ }
4
+
5
+ pub fn disable_string_cache() {
6
+ polars_core::disable_string_cache()
7
+ }
8
+
9
+ pub fn using_string_cache() -> bool {
10
+ polars_core::using_string_cache()
11
+ }
@@ -16,23 +16,23 @@ impl From<dsl::When> for RbWhen {
16
16
 
17
17
  #[magnus::wrap(class = "Polars::RbWhenThen")]
18
18
  #[derive(Clone)]
19
- pub struct RbWhenThen {
20
- pub inner: dsl::WhenThen,
19
+ pub struct RbThen {
20
+ pub inner: dsl::Then,
21
21
  }
22
22
 
23
- impl From<dsl::WhenThen> for RbWhenThen {
24
- fn from(inner: dsl::WhenThen) -> Self {
25
- RbWhenThen { inner }
23
+ impl From<dsl::Then> for RbThen {
24
+ fn from(inner: dsl::Then) -> Self {
25
+ RbThen { inner }
26
26
  }
27
27
  }
28
28
 
29
29
  impl RbWhen {
30
- pub fn then(&self, expr: &RbExpr) -> RbWhenThen {
30
+ pub fn then(&self, expr: &RbExpr) -> RbThen {
31
31
  self.inner.clone().then(expr.inner.clone()).into()
32
32
  }
33
33
  }
34
34
 
35
- impl RbWhenThen {
35
+ impl RbThen {
36
36
  pub fn overwise(&self, expr: &RbExpr) -> RbExpr {
37
37
  self.inner.clone().otherwise(expr.inner.clone()).into()
38
38
  }