polars-df 0.5.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +26 -0
  3. data/Cargo.lock +595 -709
  4. data/Cargo.toml +1 -0
  5. data/README.md +11 -9
  6. data/ext/polars/Cargo.toml +18 -10
  7. data/ext/polars/src/batched_csv.rs +26 -26
  8. data/ext/polars/src/conversion.rs +272 -136
  9. data/ext/polars/src/dataframe.rs +135 -94
  10. data/ext/polars/src/error.rs +8 -5
  11. data/ext/polars/src/expr/array.rs +15 -0
  12. data/ext/polars/src/expr/binary.rs +18 -6
  13. data/ext/polars/src/expr/datetime.rs +10 -12
  14. data/ext/polars/src/expr/general.rs +78 -264
  15. data/ext/polars/src/expr/list.rs +41 -28
  16. data/ext/polars/src/{expr.rs → expr/mod.rs} +5 -2
  17. data/ext/polars/src/expr/name.rs +44 -0
  18. data/ext/polars/src/expr/rolling.rs +196 -0
  19. data/ext/polars/src/expr/string.rs +94 -66
  20. data/ext/polars/src/file.rs +3 -3
  21. data/ext/polars/src/functions/aggregation.rs +35 -0
  22. data/ext/polars/src/functions/eager.rs +7 -31
  23. data/ext/polars/src/functions/io.rs +10 -10
  24. data/ext/polars/src/functions/lazy.rs +119 -54
  25. data/ext/polars/src/functions/meta.rs +30 -0
  26. data/ext/polars/src/functions/misc.rs +8 -0
  27. data/ext/polars/src/functions/mod.rs +5 -0
  28. data/ext/polars/src/functions/random.rs +6 -0
  29. data/ext/polars/src/functions/range.rs +46 -0
  30. data/ext/polars/src/functions/string_cache.rs +11 -0
  31. data/ext/polars/src/functions/whenthen.rs +7 -7
  32. data/ext/polars/src/lazyframe.rs +61 -44
  33. data/ext/polars/src/lib.rs +173 -84
  34. data/ext/polars/src/{apply → map}/dataframe.rs +28 -33
  35. data/ext/polars/src/{apply → map}/mod.rs +10 -6
  36. data/ext/polars/src/{apply → map}/series.rs +12 -16
  37. data/ext/polars/src/object.rs +2 -2
  38. data/ext/polars/src/rb_modules.rs +25 -6
  39. data/ext/polars/src/series/construction.rs +32 -6
  40. data/ext/polars/src/series/export.rs +2 -2
  41. data/ext/polars/src/series/set_at_idx.rs +33 -17
  42. data/ext/polars/src/series.rs +62 -42
  43. data/ext/polars/src/sql.rs +46 -0
  44. data/lib/polars/array_expr.rb +84 -0
  45. data/lib/polars/array_name_space.rb +77 -0
  46. data/lib/polars/batched_csv_reader.rb +1 -1
  47. data/lib/polars/config.rb +530 -0
  48. data/lib/polars/data_frame.rb +206 -131
  49. data/lib/polars/data_types.rb +163 -29
  50. data/lib/polars/date_time_expr.rb +13 -18
  51. data/lib/polars/date_time_name_space.rb +22 -28
  52. data/lib/polars/dynamic_group_by.rb +2 -2
  53. data/lib/polars/expr.rb +241 -151
  54. data/lib/polars/functions.rb +29 -38
  55. data/lib/polars/group_by.rb +38 -76
  56. data/lib/polars/io.rb +37 -2
  57. data/lib/polars/lazy_frame.rb +174 -95
  58. data/lib/polars/lazy_functions.rb +87 -63
  59. data/lib/polars/lazy_group_by.rb +7 -8
  60. data/lib/polars/list_expr.rb +40 -36
  61. data/lib/polars/list_name_space.rb +15 -15
  62. data/lib/polars/name_expr.rb +198 -0
  63. data/lib/polars/rolling_group_by.rb +6 -4
  64. data/lib/polars/series.rb +95 -28
  65. data/lib/polars/sql_context.rb +194 -0
  66. data/lib/polars/string_expr.rb +249 -69
  67. data/lib/polars/string_name_space.rb +155 -25
  68. data/lib/polars/utils.rb +119 -57
  69. data/lib/polars/version.rb +1 -1
  70. data/lib/polars.rb +6 -0
  71. metadata +21 -7
  72. data/ext/polars/src/{apply → map}/lazy.rs +0 -0
@@ -5,15 +5,14 @@ mod construction;
5
5
  mod export;
6
6
  mod set_at_idx;
7
7
 
8
- use magnus::{exception, Error, IntoValue, RArray, Value, QNIL};
8
+ use magnus::{exception, prelude::*, value::qnil, Error, IntoValue, RArray, Value};
9
9
  use polars::prelude::*;
10
10
  use polars::series::IsSorted;
11
11
  use std::cell::RefCell;
12
12
 
13
- use crate::apply::series::{call_lambda_and_extract, ApplyLambda};
14
13
  use crate::apply_method_all_arrow_series2;
15
14
  use crate::conversion::*;
16
- use crate::series::set_at_idx::set_at_idx;
15
+ use crate::map::series::{call_lambda_and_extract, ApplyLambda};
17
16
  use crate::{RbDataFrame, RbPolarsErr, RbResult};
18
17
 
19
18
  #[magnus::wrap(class = "Polars::RbSeries")]
@@ -38,7 +37,7 @@ impl RbSeries {
38
37
  pub fn to_series_collection(rs: RArray) -> RbResult<Vec<Series>> {
39
38
  let mut series = Vec::new();
40
39
  for item in rs.each() {
41
- series.push(item?.try_convert::<&RbSeries>()?.series.borrow().clone());
40
+ series.push(<&RbSeries>::try_convert(item?)?.series.borrow().clone());
42
41
  }
43
42
  Ok(series)
44
43
  }
@@ -303,7 +302,7 @@ impl RbSeries {
303
302
  pub fn to_a(&self) -> Value {
304
303
  let series = &self.series.borrow();
305
304
 
306
- fn to_list_recursive(series: &Series) -> Value {
305
+ fn to_a_recursive(series: &Series) -> Value {
307
306
  let rblist = match series.dtype() {
308
307
  DataType::Boolean => RArray::from_iter(series.bool().unwrap()).into_value(),
309
308
  DataType::UInt8 => RArray::from_iter(series.u8().unwrap()).into_value(),
@@ -325,7 +324,7 @@ impl RbSeries {
325
324
  let obj: Option<&ObjectValue> = series.get_object(i).map(|any| any.into());
326
325
  match obj {
327
326
  Some(val) => v.push(val.to_object()).unwrap(),
328
- None => v.push(QNIL).unwrap(),
327
+ None => v.push(qnil()).unwrap(),
329
328
  };
330
329
  }
331
330
  v.into_value()
@@ -333,13 +332,29 @@ impl RbSeries {
333
332
  DataType::List(_) => {
334
333
  let v = RArray::new();
335
334
  let ca = series.list().unwrap();
335
+ for opt_s in unsafe { ca.amortized_iter() } {
336
+ match opt_s {
337
+ None => {
338
+ v.push(qnil()).unwrap();
339
+ }
340
+ Some(s) => {
341
+ let rblst = to_a_recursive(s.as_ref());
342
+ v.push(rblst).unwrap();
343
+ }
344
+ }
345
+ }
346
+ v.into_value()
347
+ }
348
+ DataType::Array(_, _) => {
349
+ let v = RArray::new();
350
+ let ca = series.array().unwrap();
336
351
  for opt_s in ca.amortized_iter() {
337
352
  match opt_s {
338
353
  None => {
339
- v.push(QNIL).unwrap();
354
+ v.push(qnil()).unwrap();
340
355
  }
341
356
  Some(s) => {
342
- let rblst = to_list_recursive(s.as_ref());
357
+ let rblst = to_a_recursive(s.as_ref());
343
358
  v.push(rblst).unwrap();
344
359
  }
345
360
  }
@@ -347,18 +362,20 @@ impl RbSeries {
347
362
  v.into_value()
348
363
  }
349
364
  DataType::Date => {
350
- let a = RArray::with_capacity(series.len());
351
- for v in series.iter() {
352
- a.push::<Value>(Wrap(v).into_value()).unwrap();
353
- }
354
- return a.into_value();
365
+ let ca = series.date().unwrap();
366
+ return Wrap(ca).into_value();
367
+ }
368
+ DataType::Time => {
369
+ let ca = series.time().unwrap();
370
+ return Wrap(ca).into_value();
355
371
  }
356
372
  DataType::Datetime(_, _) => {
357
- let a = RArray::with_capacity(series.len());
358
- for v in series.iter() {
359
- a.push::<Value>(Wrap(v).into_value()).unwrap();
360
- }
361
- return a.into_value();
373
+ let ca = series.datetime().unwrap();
374
+ return Wrap(ca).into_value();
375
+ }
376
+ DataType::Decimal(_, _) => {
377
+ let ca = series.decimal().unwrap();
378
+ return Wrap(ca).into_value();
362
379
  }
363
380
  DataType::Utf8 => {
364
381
  let ca = series.utf8().unwrap();
@@ -376,15 +393,37 @@ impl RbSeries {
376
393
  let ca = series.binary().unwrap();
377
394
  return Wrap(ca).into_value();
378
395
  }
379
- DataType::Null | DataType::Unknown => {
396
+ DataType::Null => {
397
+ let null: Option<u8> = None;
398
+ let n = series.len();
399
+ let iter = std::iter::repeat(null).take(n);
400
+ use std::iter::{Repeat, Take};
401
+ struct NullIter {
402
+ iter: Take<Repeat<Option<u8>>>,
403
+ n: usize,
404
+ }
405
+ impl Iterator for NullIter {
406
+ type Item = Option<u8>;
407
+
408
+ fn next(&mut self) -> Option<Self::Item> {
409
+ self.iter.next()
410
+ }
411
+ fn size_hint(&self) -> (usize, Option<usize>) {
412
+ (self.n, Some(self.n))
413
+ }
414
+ }
415
+ impl ExactSizeIterator for NullIter {}
416
+
417
+ RArray::from_iter(NullIter { iter, n }).into_value()
418
+ }
419
+ DataType::Unknown => {
380
420
  panic!("to_a not implemented for null/unknown")
381
421
  }
382
- _ => todo!(),
383
422
  };
384
423
  rblist
385
424
  }
386
425
 
387
- to_list_recursive(series)
426
+ to_a_recursive(series)
388
427
  }
389
428
 
390
429
  pub fn clone(&self) -> Self {
@@ -594,23 +633,15 @@ impl RbSeries {
594
633
  Ok(RbSeries::new(s))
595
634
  }
596
635
 
597
- pub fn to_dummies(&self, sep: Option<String>) -> RbResult<RbDataFrame> {
636
+ pub fn to_dummies(&self, sep: Option<String>, drop_first: bool) -> RbResult<RbDataFrame> {
598
637
  let df = self
599
638
  .series
600
639
  .borrow()
601
- .to_dummies(sep.as_deref())
640
+ .to_dummies(sep.as_deref(), drop_first)
602
641
  .map_err(RbPolarsErr::from)?;
603
642
  Ok(df.into())
604
643
  }
605
644
 
606
- pub fn peak_max(&self) -> Self {
607
- self.series.borrow().peak_max().into_series().into()
608
- }
609
-
610
- pub fn peak_min(&self) -> Self {
611
- self.series.borrow().peak_min().into_series().into()
612
- }
613
-
614
645
  pub fn n_unique(&self) -> RbResult<usize> {
615
646
  let n = self.series.borrow().n_unique().map_err(RbPolarsErr::from)?;
616
647
  Ok(n)
@@ -668,17 +699,6 @@ impl RbSeries {
668
699
  None
669
700
  }
670
701
  }
671
-
672
- pub fn set_at_idx(&self, idx: &RbSeries, values: &RbSeries) -> RbResult<()> {
673
- let mut s = self.series.borrow_mut();
674
- match set_at_idx(s.clone(), &idx.series.borrow(), &values.series.borrow()) {
675
- Ok(out) => {
676
- *s = out;
677
- Ok(())
678
- }
679
- Err(e) => Err(RbPolarsErr::from(e)),
680
- }
681
- }
682
702
  }
683
703
 
684
704
  macro_rules! impl_set_with_mask {
@@ -0,0 +1,46 @@
1
+ use polars::sql::SQLContext;
2
+ use std::cell::RefCell;
3
+
4
+ use crate::{RbLazyFrame, RbPolarsErr, RbResult};
5
+
6
+ #[magnus::wrap(class = "Polars::RbSQLContext")]
7
+ #[repr(transparent)]
8
+ #[derive(Clone)]
9
+ pub struct RbSQLContext {
10
+ pub context: RefCell<SQLContext>,
11
+ }
12
+
13
+ #[allow(
14
+ clippy::wrong_self_convention,
15
+ clippy::should_implement_trait,
16
+ clippy::len_without_is_empty
17
+ )]
18
+ impl RbSQLContext {
19
+ #[allow(clippy::new_without_default)]
20
+ pub fn new() -> RbSQLContext {
21
+ RbSQLContext {
22
+ context: SQLContext::new().into(),
23
+ }
24
+ }
25
+
26
+ pub fn execute(&self, query: String) -> RbResult<RbLazyFrame> {
27
+ Ok(self
28
+ .context
29
+ .borrow_mut()
30
+ .execute(&query)
31
+ .map_err(RbPolarsErr::from)?
32
+ .into())
33
+ }
34
+
35
+ pub fn get_tables(&self) -> RbResult<Vec<String>> {
36
+ Ok(self.context.borrow().get_tables())
37
+ }
38
+
39
+ pub fn register(&self, name: String, lf: &RbLazyFrame) {
40
+ self.context.borrow_mut().register(&name, lf.ldf.clone())
41
+ }
42
+
43
+ pub fn unregister(&self, name: String) {
44
+ self.context.borrow_mut().unregister(&name)
45
+ }
46
+ }
@@ -0,0 +1,84 @@
1
+ module Polars
2
+ # Namespace for array related expressions.
3
+ class ArrayExpr
4
+ # @private
5
+ attr_accessor :_rbexpr
6
+
7
+ # @private
8
+ def initialize(expr)
9
+ self._rbexpr = expr._rbexpr
10
+ end
11
+
12
+ # Compute the min values of the sub-arrays.
13
+ #
14
+ # @return [Expr]
15
+ #
16
+ # @example
17
+ # df = Polars::DataFrame.new(
18
+ # {"a" => [[1, 2], [4, 3]]},
19
+ # schema: {"a" => Polars::Array.new(2, Polars::Int64)}
20
+ # )
21
+ # df.select(Polars.col("a").arr.min)
22
+ # # =>
23
+ # # shape: (2, 1)
24
+ # # ┌─────┐
25
+ # # │ a │
26
+ # # │ --- │
27
+ # # │ i64 │
28
+ # # ╞═════╡
29
+ # # │ 1 │
30
+ # # │ 3 │
31
+ # # └─────┘
32
+ def min
33
+ Utils.wrap_expr(_rbexpr.array_min)
34
+ end
35
+
36
+ # Compute the max values of the sub-arrays.
37
+ #
38
+ # @return [Expr]
39
+ #
40
+ # @example
41
+ # df = Polars::DataFrame.new(
42
+ # {"a" => [[1, 2], [4, 3]]},
43
+ # schema: {"a" => Polars::Array.new(2, Polars::Int64)}
44
+ # )
45
+ # df.select(Polars.col("a").arr.max)
46
+ # # =>
47
+ # # shape: (2, 1)
48
+ # # ┌─────┐
49
+ # # │ a │
50
+ # # │ --- │
51
+ # # │ i64 │
52
+ # # ╞═════╡
53
+ # # │ 2 │
54
+ # # │ 4 │
55
+ # # └─────┘
56
+ def max
57
+ Utils.wrap_expr(_rbexpr.array_max)
58
+ end
59
+
60
+ # Compute the sum values of the sub-arrays.
61
+ #
62
+ # @return [Expr]
63
+ #
64
+ # @example
65
+ # df = Polars::DataFrame.new(
66
+ # {"a" => [[1, 2], [4, 3]]},
67
+ # schema: {"a" => Polars::Array.new(2, Polars::Int64)}
68
+ # )
69
+ # df.select(Polars.col("a").arr.sum)
70
+ # # =>
71
+ # # shape: (2, 1)
72
+ # # ┌─────┐
73
+ # # │ a │
74
+ # # │ --- │
75
+ # # │ i64 │
76
+ # # ╞═════╡
77
+ # # │ 3 │
78
+ # # │ 7 │
79
+ # # └─────┘
80
+ def sum
81
+ Utils.wrap_expr(_rbexpr.array_sum)
82
+ end
83
+ end
84
+ end
@@ -0,0 +1,77 @@
1
+ module Polars
2
+ # Series.arr namespace.
3
+ class ArrayNameSpace
4
+ include ExprDispatch
5
+
6
+ self._accessor = "arr"
7
+
8
+ # @private
9
+ def initialize(series)
10
+ self._s = series._s
11
+ end
12
+
13
+ # Compute the min values of the sub-arrays.
14
+ #
15
+ # @return [Series]
16
+ #
17
+ # @example
18
+ # s = Polars::Series.new(
19
+ # "a", [[1, 2], [4, 3]], dtype: Polars::Array.new(2, Polars::Int64)
20
+ # )
21
+ # s.arr.min
22
+ # # =>
23
+ # # shape: (2,)
24
+ # # Series: 'a' [i64]
25
+ # # [
26
+ # # 1
27
+ # # 3
28
+ # # ]
29
+ def min
30
+ super
31
+ end
32
+
33
+ # Compute the max values of the sub-arrays.
34
+ #
35
+ # @return [Series]
36
+ #
37
+ # @example
38
+ # s = Polars::Series.new(
39
+ # "a", [[1, 2], [4, 3]], dtype: Polars::Array.new(2, Polars::Int64)
40
+ # )
41
+ # s.arr.max
42
+ # # =>
43
+ # # shape: (2,)
44
+ # # Series: 'a' [i64]
45
+ # # [
46
+ # # 2
47
+ # # 4
48
+ # # ]
49
+ def max
50
+ super
51
+ end
52
+
53
+ # Compute the sum values of the sub-arrays.
54
+ #
55
+ # @return [Series]
56
+ #
57
+ # @example
58
+ # df = Polars::DataFrame.new(
59
+ # {"a" => [[1, 2], [4, 3]]},
60
+ # schema: {"a" => Polars::Array.new(2, Polars::Int64)}
61
+ # )
62
+ # df.select(Polars.col("a").arr.sum)
63
+ # # =>
64
+ # # shape: (2, 1)
65
+ # # ┌─────┐
66
+ # # │ a │
67
+ # # │ --- │
68
+ # # │ i64 │
69
+ # # ╞═════╡
70
+ # # │ 3 │
71
+ # # │ 7 │
72
+ # # └─────┘
73
+ def sum
74
+ super
75
+ end
76
+ end
77
+ end
@@ -41,7 +41,7 @@ module Polars
41
41
  dtypes.each do|k, v|
42
42
  dtype_list << [k, Utils.rb_type_to_dtype(v)]
43
43
  end
44
- elsif dtypes.is_a?(Array)
44
+ elsif dtypes.is_a?(::Array)
45
45
  dtype_slice = dtypes
46
46
  else
47
47
  raise ArgumentError, "dtype arg should be list or dict"