polars-df 0.23.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +127 -1
  3. data/Cargo.lock +72 -58
  4. data/README.md +31 -27
  5. data/ext/polars/Cargo.toml +15 -6
  6. data/ext/polars/src/batched_csv.rs +35 -39
  7. data/ext/polars/src/c_api/allocator.rs +7 -0
  8. data/ext/polars/src/c_api/mod.rs +1 -0
  9. data/ext/polars/src/catalog/unity.rs +123 -101
  10. data/ext/polars/src/conversion/any_value.rs +13 -17
  11. data/ext/polars/src/conversion/chunked_array.rs +5 -5
  12. data/ext/polars/src/conversion/datetime.rs +3 -2
  13. data/ext/polars/src/conversion/mod.rs +50 -45
  14. data/ext/polars/src/dataframe/export.rs +13 -13
  15. data/ext/polars/src/dataframe/general.rs +223 -223
  16. data/ext/polars/src/dataframe/io.rs +27 -141
  17. data/ext/polars/src/dataframe/mod.rs +13 -5
  18. data/ext/polars/src/dataframe/serde.rs +1 -1
  19. data/ext/polars/src/error.rs +44 -7
  20. data/ext/polars/src/exceptions.rs +45 -12
  21. data/ext/polars/src/expr/array.rs +12 -0
  22. data/ext/polars/src/expr/datatype.rs +2 -2
  23. data/ext/polars/src/expr/datetime.rs +4 -5
  24. data/ext/polars/src/expr/general.rs +49 -13
  25. data/ext/polars/src/expr/list.rs +4 -0
  26. data/ext/polars/src/expr/meta.rs +8 -3
  27. data/ext/polars/src/expr/mod.rs +22 -6
  28. data/ext/polars/src/expr/name.rs +19 -8
  29. data/ext/polars/src/expr/rolling.rs +50 -1
  30. data/ext/polars/src/expr/string.rs +0 -1
  31. data/ext/polars/src/expr/struct.rs +7 -2
  32. data/ext/polars/src/file.rs +136 -103
  33. data/ext/polars/src/functions/aggregation.rs +9 -8
  34. data/ext/polars/src/functions/io.rs +81 -10
  35. data/ext/polars/src/functions/lazy.rs +95 -21
  36. data/ext/polars/src/functions/mod.rs +2 -0
  37. data/ext/polars/src/functions/range.rs +19 -3
  38. data/ext/polars/src/functions/strings.rs +6 -0
  39. data/ext/polars/src/functions/utils.rs +6 -0
  40. data/ext/polars/src/interop/arrow/mod.rs +50 -1
  41. data/ext/polars/src/interop/arrow/{to_ruby.rs → to_rb.rs} +30 -0
  42. data/ext/polars/src/interop/arrow/to_rust.rs +43 -0
  43. data/ext/polars/src/interop/numo/to_numo_df.rs +1 -1
  44. data/ext/polars/src/interop/numo/to_numo_series.rs +1 -1
  45. data/ext/polars/src/lazyframe/exitable.rs +39 -0
  46. data/ext/polars/src/lazyframe/general.rs +340 -236
  47. data/ext/polars/src/lazyframe/mod.rs +46 -10
  48. data/ext/polars/src/lazyframe/optflags.rs +5 -4
  49. data/ext/polars/src/lazyframe/serde.rs +11 -3
  50. data/ext/polars/src/lazyframe/sink.rs +10 -5
  51. data/ext/polars/src/lazygroupby.rs +6 -7
  52. data/ext/polars/src/lib.rs +141 -76
  53. data/ext/polars/src/map/dataframe.rs +12 -12
  54. data/ext/polars/src/map/lazy.rs +7 -5
  55. data/ext/polars/src/map/mod.rs +15 -8
  56. data/ext/polars/src/map/series.rs +3 -3
  57. data/ext/polars/src/on_startup.rs +16 -8
  58. data/ext/polars/src/prelude.rs +1 -0
  59. data/ext/polars/src/rb_modules.rs +19 -49
  60. data/ext/polars/src/series/aggregation.rs +79 -140
  61. data/ext/polars/src/series/arithmetic.rs +16 -22
  62. data/ext/polars/src/series/comparison.rs +101 -222
  63. data/ext/polars/src/series/construction.rs +17 -18
  64. data/ext/polars/src/series/export.rs +1 -1
  65. data/ext/polars/src/series/general.rs +254 -289
  66. data/ext/polars/src/series/import.rs +17 -0
  67. data/ext/polars/src/series/map.rs +178 -160
  68. data/ext/polars/src/series/mod.rs +28 -12
  69. data/ext/polars/src/series/scatter.rs +12 -9
  70. data/ext/polars/src/sql.rs +16 -9
  71. data/ext/polars/src/testing/frame.rs +31 -0
  72. data/ext/polars/src/testing/mod.rs +5 -0
  73. data/ext/polars/src/testing/series.rs +31 -0
  74. data/ext/polars/src/timeout.rs +105 -0
  75. data/ext/polars/src/utils.rs +159 -1
  76. data/lib/polars/array_expr.rb +81 -12
  77. data/lib/polars/array_name_space.rb +74 -7
  78. data/lib/polars/batched_csv_reader.rb +21 -21
  79. data/lib/polars/binary_name_space.rb +1 -1
  80. data/lib/polars/cat_expr.rb +7 -7
  81. data/lib/polars/config.rb +1 -1
  82. data/lib/polars/convert.rb +189 -34
  83. data/lib/polars/data_frame.rb +1066 -831
  84. data/lib/polars/data_frame_plot.rb +173 -0
  85. data/lib/polars/data_type_group.rb +1 -0
  86. data/lib/polars/data_types.rb +31 -12
  87. data/lib/polars/date_time_expr.rb +51 -69
  88. data/lib/polars/date_time_name_space.rb +80 -112
  89. data/lib/polars/dynamic_group_by.rb +7 -7
  90. data/lib/polars/exceptions.rb +50 -10
  91. data/lib/polars/expr.rb +470 -517
  92. data/lib/polars/functions/aggregation/horizontal.rb +0 -1
  93. data/lib/polars/functions/aggregation/vertical.rb +2 -3
  94. data/lib/polars/functions/as_datatype.rb +290 -8
  95. data/lib/polars/functions/eager.rb +204 -10
  96. data/lib/polars/functions/escape_regex.rb +21 -0
  97. data/lib/polars/functions/lazy.rb +409 -169
  98. data/lib/polars/functions/lit.rb +17 -1
  99. data/lib/polars/functions/range/int_range.rb +74 -2
  100. data/lib/polars/functions/range/linear_space.rb +77 -0
  101. data/lib/polars/functions/range/time_range.rb +1 -1
  102. data/lib/polars/functions/repeat.rb +3 -12
  103. data/lib/polars/functions/whenthen.rb +2 -2
  104. data/lib/polars/group_by.rb +72 -20
  105. data/lib/polars/iceberg_dataset.rb +1 -6
  106. data/lib/polars/in_process_query.rb +37 -0
  107. data/lib/polars/io/cloud.rb +18 -0
  108. data/lib/polars/io/csv.rb +265 -126
  109. data/lib/polars/io/database.rb +0 -1
  110. data/lib/polars/io/delta.rb +15 -7
  111. data/lib/polars/io/ipc.rb +24 -17
  112. data/lib/polars/io/ndjson.rb +161 -24
  113. data/lib/polars/io/parquet.rb +101 -38
  114. data/lib/polars/lazy_frame.rb +849 -558
  115. data/lib/polars/lazy_group_by.rb +327 -2
  116. data/lib/polars/list_expr.rb +94 -16
  117. data/lib/polars/list_name_space.rb +88 -24
  118. data/lib/polars/meta_expr.rb +42 -1
  119. data/lib/polars/name_expr.rb +41 -4
  120. data/lib/polars/query_opt_flags.rb +198 -2
  121. data/lib/polars/rolling_group_by.rb +3 -3
  122. data/lib/polars/schema.rb +21 -3
  123. data/lib/polars/selector.rb +37 -2
  124. data/lib/polars/selectors.rb +45 -9
  125. data/lib/polars/series.rb +1156 -728
  126. data/lib/polars/series_plot.rb +72 -0
  127. data/lib/polars/slice.rb +1 -1
  128. data/lib/polars/sql_context.rb +11 -4
  129. data/lib/polars/string_expr.rb +59 -68
  130. data/lib/polars/string_name_space.rb +51 -87
  131. data/lib/polars/struct_expr.rb +36 -18
  132. data/lib/polars/testing.rb +24 -273
  133. data/lib/polars/utils/constants.rb +2 -0
  134. data/lib/polars/utils/construction/data_frame.rb +410 -0
  135. data/lib/polars/utils/construction/series.rb +364 -0
  136. data/lib/polars/utils/construction/utils.rb +9 -0
  137. data/lib/polars/utils/deprecation.rb +11 -0
  138. data/lib/polars/utils/serde.rb +8 -3
  139. data/lib/polars/utils/unstable.rb +19 -0
  140. data/lib/polars/utils/various.rb +59 -0
  141. data/lib/polars/utils.rb +46 -47
  142. data/lib/polars/version.rb +1 -1
  143. data/lib/polars.rb +47 -1
  144. metadata +25 -6
  145. data/ext/polars/src/allocator.rs +0 -13
  146. data/lib/polars/plot.rb +0 -109
@@ -0,0 +1,105 @@
1
+ //! A global process-aborting timeout system, mainly intended for testing.
2
+
3
+ use std::cmp::Reverse;
4
+ use std::collections::BinaryHeap;
5
+ use std::sync::LazyLock;
6
+ use std::sync::mpsc::{Receiver, RecvTimeoutError, Sender, channel};
7
+ use std::time::Duration;
8
+
9
+ use polars::prelude::{InitHashMaps, PlHashSet};
10
+ use polars_utils::priority::Priority;
11
+ use polars_utils::relaxed_cell::RelaxedCell;
12
+
13
+ static TIMEOUT_REQUEST_HANDLER: LazyLock<Sender<TimeoutRequest>> = LazyLock::new(|| {
14
+ let (send, recv) = channel();
15
+ std::thread::Builder::new()
16
+ .name("polars-timeout".to_string())
17
+ .spawn(move || timeout_thread(recv))
18
+ .unwrap();
19
+ send
20
+ });
21
+
22
+ enum TimeoutRequest {
23
+ Start(Duration, u64),
24
+ Cancel(u64),
25
+ }
26
+
27
+ pub fn get_timeout() -> Option<Duration> {
28
+ static TIMEOUT_DISABLED: RelaxedCell<bool> = RelaxedCell::new_bool(false);
29
+
30
+ // Fast path so we don't have to keep checking environment variables. Make
31
+ // sure that if you want to use POLARS_TIMEOUT_MS it is set before the first
32
+ // polars call.
33
+ if TIMEOUT_DISABLED.load() {
34
+ return None;
35
+ }
36
+
37
+ let Ok(timeout) = std::env::var("POLARS_TIMEOUT_MS") else {
38
+ TIMEOUT_DISABLED.store(true);
39
+ return None;
40
+ };
41
+
42
+ match timeout.parse() {
43
+ Ok(ms) => Some(Duration::from_millis(ms)),
44
+ Err(e) => {
45
+ eprintln!("failed to parse POLARS_TIMEOUT_MS: {e:?}");
46
+ None
47
+ }
48
+ }
49
+ }
50
+
51
+ fn timeout_thread(recv: Receiver<TimeoutRequest>) {
52
+ let mut active_timeouts: PlHashSet<u64> = PlHashSet::new();
53
+ let mut shortest_timeout: BinaryHeap<Priority<Reverse<Duration>, u64>> = BinaryHeap::new();
54
+ loop {
55
+ // Remove cancelled requests.
56
+ while let Some(Priority(_, id)) = shortest_timeout.peek() {
57
+ if active_timeouts.contains(id) {
58
+ break;
59
+ }
60
+ shortest_timeout.pop();
61
+ }
62
+
63
+ let request = if let Some(Priority(timeout, _)) = shortest_timeout.peek() {
64
+ match recv.recv_timeout(timeout.0) {
65
+ Err(RecvTimeoutError::Timeout) => {
66
+ eprintln!("exiting the process, POLARS_TIMEOUT_MS exceeded");
67
+ std::thread::sleep(Duration::from_secs_f64(1.0));
68
+ std::process::exit(1);
69
+ }
70
+ r => r.unwrap(),
71
+ }
72
+ } else {
73
+ recv.recv().unwrap()
74
+ };
75
+
76
+ match request {
77
+ TimeoutRequest::Start(duration, id) => {
78
+ shortest_timeout.push(Priority(Reverse(duration), id));
79
+ active_timeouts.insert(id);
80
+ }
81
+ TimeoutRequest::Cancel(id) => {
82
+ active_timeouts.remove(&id);
83
+ }
84
+ }
85
+ }
86
+ }
87
+
88
+ pub fn schedule_polars_timeout() -> Option<u64> {
89
+ static TIMEOUT_ID: RelaxedCell<u64> = RelaxedCell::new_u64(0);
90
+
91
+ let timeout = get_timeout()?;
92
+ let id = TIMEOUT_ID.fetch_add(1);
93
+ TIMEOUT_REQUEST_HANDLER
94
+ .send(TimeoutRequest::Start(timeout, id))
95
+ .unwrap();
96
+ Some(id)
97
+ }
98
+
99
+ pub fn cancel_polars_timeout(opt_id: Option<u64>) {
100
+ if let Some(id) = opt_id {
101
+ TIMEOUT_REQUEST_HANDLER
102
+ .send(TimeoutRequest::Cancel(id))
103
+ .unwrap();
104
+ }
105
+ }
@@ -1,4 +1,16 @@
1
- use crate::{RbErr, RbPolarsErr};
1
+ use std::os::raw::c_void;
2
+ use std::panic::AssertUnwindSafe;
3
+
4
+ use magnus::Ruby;
5
+ use polars::frame::DataFrame;
6
+ use polars::series::IntoSeries;
7
+ use polars_error::PolarsResult;
8
+ use polars_error::signals::{KeyboardInterrupt, catch_keyboard_interrupt};
9
+ use rb_sys::{rb_thread_call_with_gvl, rb_thread_call_without_gvl};
10
+
11
+ use crate::exceptions::RbKeyboardInterrupt;
12
+ use crate::timeout::{cancel_polars_timeout, schedule_polars_timeout};
13
+ use crate::{RbDataFrame, RbErr, RbPolarsErr, RbResult, RbSeries};
2
14
 
3
15
  #[macro_export]
4
16
  macro_rules! apply_method_all_arrow_series2 {
@@ -10,14 +22,17 @@ macro_rules! apply_method_all_arrow_series2 {
10
22
  DataType::UInt16 => $self.u16().unwrap().$method($($args),*),
11
23
  DataType::UInt32 => $self.u32().unwrap().$method($($args),*),
12
24
  DataType::UInt64 => $self.u64().unwrap().$method($($args),*),
25
+ DataType::UInt128 => $self.u128().unwrap().$method($($args),*),
13
26
  DataType::Int8 => $self.i8().unwrap().$method($($args),*),
14
27
  DataType::Int16 => $self.i16().unwrap().$method($($args),*),
15
28
  DataType::Int32 => $self.i32().unwrap().$method($($args),*),
16
29
  DataType::Int64 => $self.i64().unwrap().$method($($args),*),
30
+ DataType::Int128 => $self.i128().unwrap().$method($($args),*),
17
31
  DataType::Float32 => $self.f32().unwrap().$method($($args),*),
18
32
  DataType::Float64 => $self.f64().unwrap().$method($($args),*),
19
33
  DataType::Date => $self.date().unwrap().physical().$method($($args),*),
20
34
  DataType::Datetime(_, _) => $self.datetime().unwrap().physical().$method($($args),*),
35
+ // TODO implement
21
36
  // DataType::List(_) => $self.list().unwrap().$method($($args),*),
22
37
  DataType::Struct(_) => $self.struct_().unwrap().$method($($args),*),
23
38
  dt => panic!("dtype {:?} not supported", dt)
@@ -30,3 +45,146 @@ macro_rules! apply_method_all_arrow_series2 {
30
45
  pub(crate) fn to_rb_err<E: Into<RbPolarsErr>>(e: E) -> RbErr {
31
46
  e.into().into()
32
47
  }
48
+
49
+ pub trait EnterPolarsExt {
50
+ fn enter_polars<T, E, F>(self, f: F) -> RbResult<T>
51
+ where
52
+ F: FnOnce() -> Result<T, E>,
53
+ E: Into<RbPolarsErr>;
54
+
55
+ #[inline(always)]
56
+ fn enter_polars_ok<T, F>(self, f: F) -> RbResult<T>
57
+ where
58
+ Self: Sized,
59
+ F: FnOnce() -> T,
60
+ {
61
+ self.enter_polars(move || RbResult::Ok(f()))
62
+ }
63
+
64
+ #[inline(always)]
65
+ fn enter_polars_df<F>(self, f: F) -> RbResult<RbDataFrame>
66
+ where
67
+ Self: Sized,
68
+ F: FnOnce() -> PolarsResult<DataFrame>,
69
+ {
70
+ self.enter_polars(f).map(RbDataFrame::new)
71
+ }
72
+
73
+ #[inline(always)]
74
+ fn enter_polars_series<T, F>(self, f: F) -> RbResult<RbSeries>
75
+ where
76
+ Self: Sized,
77
+ T: IntoSeries,
78
+ F: FnOnce() -> PolarsResult<T>,
79
+ {
80
+ self.enter_polars(f).map(|s| RbSeries::new(s.into_series()))
81
+ }
82
+
83
+ fn detach<T, F>(self, f: F) -> T
84
+ where
85
+ Self: Sized,
86
+ F: FnOnce() -> T,
87
+ {
88
+ if std::env::var("POLARS_GVL").is_ok() {
89
+ f()
90
+ } else {
91
+ let mut data = CallbackData {
92
+ func: Some(f),
93
+ result: None,
94
+ };
95
+
96
+ unsafe {
97
+ rb_thread_call_without_gvl(
98
+ Some(call_without_gvl::<F, T>),
99
+ &mut data as *mut _ as *mut c_void,
100
+ None,
101
+ std::ptr::null_mut(),
102
+ );
103
+ }
104
+
105
+ data.result.unwrap()
106
+ }
107
+ }
108
+ }
109
+
110
+ impl EnterPolarsExt for &Ruby {
111
+ fn enter_polars<T, E, F>(self, f: F) -> RbResult<T>
112
+ where
113
+ F: FnOnce() -> Result<T, E>,
114
+ E: Into<RbPolarsErr>,
115
+ {
116
+ let timeout = schedule_polars_timeout();
117
+ let ret = self.detach(|| catch_keyboard_interrupt(AssertUnwindSafe(f)));
118
+ cancel_polars_timeout(timeout);
119
+ match ret {
120
+ Ok(Ok(ret)) => Ok(ret),
121
+ Ok(Err(err)) => Err(RbErr::from(err.into())),
122
+ Err(KeyboardInterrupt) => Err(RbKeyboardInterrupt::new_err("")),
123
+ }
124
+ }
125
+ }
126
+
127
+ pub trait RubyAttach {
128
+ fn attach<T, F>(f: F) -> T
129
+ where
130
+ F: FnOnce(&Ruby) -> T;
131
+ }
132
+
133
+ unsafe extern "C" {
134
+ fn ruby_thread_has_gvl_p() -> std::ffi::c_int;
135
+ }
136
+
137
+ impl RubyAttach for Ruby {
138
+ fn attach<T, F>(f: F) -> T
139
+ where
140
+ F: FnOnce(&Ruby) -> T,
141
+ {
142
+ // recheck GVL state since cached value can be incorrect
143
+ // https://github.com/matsadler/magnus/pull/161
144
+ if let Ok(rb) = Ruby::get()
145
+ && unsafe { ruby_thread_has_gvl_p() } != 0
146
+ {
147
+ f(&rb)
148
+ } else {
149
+ let mut data = CallbackData {
150
+ func: Some(f),
151
+ result: None,
152
+ };
153
+
154
+ unsafe {
155
+ rb_thread_call_with_gvl(
156
+ Some(call_with_gvl::<F, T>),
157
+ &mut data as *mut _ as *mut c_void,
158
+ );
159
+ }
160
+
161
+ data.result.unwrap()
162
+ }
163
+ }
164
+ }
165
+
166
+ struct CallbackData<F, T> {
167
+ func: Option<F>,
168
+ result: Option<T>,
169
+ }
170
+
171
+ extern "C" fn call_without_gvl<F, T>(data: *mut c_void) -> *mut c_void
172
+ where
173
+ F: FnOnce() -> T,
174
+ {
175
+ let data = unsafe { &mut *(data as *mut CallbackData<F, T>) };
176
+ let func = data.func.take().unwrap();
177
+ data.result = Some(func());
178
+ std::ptr::null_mut()
179
+ }
180
+
181
+ extern "C" fn call_with_gvl<F, T>(data: *mut c_void) -> *mut c_void
182
+ where
183
+ F: FnOnce(&Ruby) -> T,
184
+ {
185
+ let rb = Ruby::get().unwrap();
186
+ let data = unsafe { &mut *(data as *mut CallbackData<F, T>) };
187
+ let func = data.func.take().unwrap();
188
+ data.result = Some(func(&rb));
189
+ std::ptr::null_mut()
190
+ }
@@ -191,7 +191,7 @@ module Polars
191
191
  # @example
192
192
  # df = Polars::DataFrame.new(
193
193
  # {"a" => [[1, 2], [4, 3]]},
194
- # schema: {"a" => Polars::Array.new(2, Polars::Int64)}
194
+ # schema: {"a" => Polars::Array.new(Polars::Int64, 2)}
195
195
  # )
196
196
  # df.select(Polars.col("a").arr.min)
197
197
  # # =>
@@ -215,7 +215,7 @@ module Polars
215
215
  # @example
216
216
  # df = Polars::DataFrame.new(
217
217
  # {"a" => [[1, 2], [4, 3]]},
218
- # schema: {"a" => Polars::Array.new(2, Polars::Int64)}
218
+ # schema: {"a" => Polars::Array.new(Polars::Int64, 2)}
219
219
  # )
220
220
  # df.select(Polars.col("a").arr.max)
221
221
  # # =>
@@ -239,7 +239,7 @@ module Polars
239
239
  # @example
240
240
  # df = Polars::DataFrame.new(
241
241
  # {"a" => [[1, 2], [4, 3]]},
242
- # schema: {"a" => Polars::Array.new(2, Polars::Int64)}
242
+ # schema: {"a" => Polars::Array.new(Polars::Int64, 2)}
243
243
  # )
244
244
  # df.select(Polars.col("a").arr.sum)
245
245
  # # =>
@@ -263,7 +263,7 @@ module Polars
263
263
  # @example
264
264
  # df = Polars::DataFrame.new(
265
265
  # {"a" => [[1, 2], [4, 3]]},
266
- # schema: {"a" => Polars::Array.new(2, Polars::Int64)}
266
+ # schema: {"a" => Polars::Array.new(Polars::Int64, 2)}
267
267
  # )
268
268
  # df.select(Polars.col("a").arr.std)
269
269
  # # =>
@@ -287,7 +287,7 @@ module Polars
287
287
  # @example
288
288
  # df = Polars::DataFrame.new(
289
289
  # {"a" => [[1, 2], [4, 3]]},
290
- # schema: {"a" => Polars::Array.new(2, Polars::Int64)}
290
+ # schema: {"a" => Polars::Array.new(Polars::Int64, 2)}
291
291
  # )
292
292
  # df.select(Polars.col("a").arr.var)
293
293
  # # =>
@@ -335,7 +335,7 @@ module Polars
335
335
  # @example
336
336
  # df = Polars::DataFrame.new(
337
337
  # {"a" => [[1, 2], [4, 3]]},
338
- # schema: {"a" => Polars::Array.new(2, Polars::Int64)}
338
+ # schema: {"a" => Polars::Array.new(Polars::Int64, 2)}
339
339
  # )
340
340
  # df.select(Polars.col("a").arr.median)
341
341
  # # =>
@@ -437,7 +437,7 @@ module Polars
437
437
  # @example
438
438
  # df = Polars::DataFrame.new(
439
439
  # {
440
- # "a": [
440
+ # "a" => [
441
441
  # [true, true],
442
442
  # [false, true],
443
443
  # [false, false],
@@ -472,7 +472,7 @@ module Polars
472
472
  # @example
473
473
  # df = Polars::DataFrame.new(
474
474
  # {
475
- # "a": [
475
+ # "a" => [
476
476
  # [true, true],
477
477
  # [false, true],
478
478
  # [false, false],
@@ -642,7 +642,7 @@ module Polars
642
642
  # {"arr" => [[1, 2, 3], [4, 5, 6], [7, 8, 9]], "idx" => [1, -2, 4]},
643
643
  # schema: {"arr" => Polars::Array.new(Polars::Int32, 3), "idx" => Polars::Int32}
644
644
  # )
645
- # df.with_columns(get: Polars.col("arr").arr.get("idx"))
645
+ # df.with_columns(get: Polars.col("arr").arr.get("idx", null_on_oob: true))
646
646
  # # =>
647
647
  # # shape: (3, 3)
648
648
  # # ┌───────────────┬─────┬──────┐
@@ -654,7 +654,7 @@ module Polars
654
654
  # # │ [4, 5, 6] ┆ -2 ┆ 5 │
655
655
  # # │ [7, 8, 9] ┆ 4 ┆ null │
656
656
  # # └───────────────┴─────┴──────┘
657
- def get(index, null_on_oob: true)
657
+ def get(index, null_on_oob: false)
658
658
  index = Utils.parse_into_expression(index)
659
659
  Utils.wrap_expr(_rbexpr.arr_get(index, null_on_oob))
660
660
  end
@@ -681,7 +681,7 @@ module Polars
681
681
  # # │ [7, 8, 9] ┆ 7 │
682
682
  # # └───────────────┴───────┘
683
683
  def first
684
- get(0)
684
+ get(0, null_on_oob: true)
685
685
  end
686
686
 
687
687
  # Get the last value of the sub-arrays.
@@ -706,7 +706,7 @@ module Polars
706
706
  # # │ [7, 8, 9] ┆ 9 │
707
707
  # # └───────────────┴──────┘
708
708
  def last
709
- get(-1)
709
+ get(-1, null_on_oob: true)
710
710
  end
711
711
 
712
712
  # Join all string items in a sub-array and place a separator between them.
@@ -914,5 +914,74 @@ module Polars
914
914
  n = Utils.parse_into_expression(n)
915
915
  Utils.wrap_expr(_rbexpr.arr_shift(n))
916
916
  end
917
+
918
+ # Run any polars expression against the arrays' elements.
919
+ #
920
+ # @param expr [Expr]
921
+ # Expression to run. Note that you can select an element with `Polars.element`.
922
+ # @param as_list [Boolean]
923
+ # Collect the resulting data as a list. This allows for expressions which
924
+ # output a variable amount of data.
925
+ #
926
+ # @return [Expr]
927
+ #
928
+ # @example
929
+ # df = Polars::DataFrame.new({"a" => [1, 8, 3], "b" => [4, 5, 2]})
930
+ # df.with_columns(rank: Polars.concat_arr("a", "b").arr.eval(Polars.element.rank))
931
+ # # =>
932
+ # # shape: (3, 3)
933
+ # # ┌─────┬─────┬───────────────┐
934
+ # # │ a ┆ b ┆ rank │
935
+ # # │ --- ┆ --- ┆ --- │
936
+ # # │ i64 ┆ i64 ┆ array[f64, 2] │
937
+ # # ╞═════╪═════╪═══════════════╡
938
+ # # │ 1 ┆ 4 ┆ [1.0, 2.0] │
939
+ # # │ 8 ┆ 5 ┆ [2.0, 1.0] │
940
+ # # │ 3 ┆ 2 ┆ [2.0, 1.0] │
941
+ # # └─────┴─────┴───────────────┘
942
+ def eval(expr, as_list: false)
943
+ Utils.wrap_expr(_rbexpr.arr_eval(expr._rbexpr, as_list))
944
+ end
945
+
946
+ # Run any polars aggregation expression against the arrays' elements.
947
+ #
948
+ # @param expr [Expr]
949
+ # Expression to run. Note that you can select an element with `Polars.element`.
950
+ #
951
+ # @return [Expr]
952
+ #
953
+ # @example
954
+ # df = Polars::Series.new(
955
+ # "a", [[1, nil], [42, 13], [nil, nil]], dtype: Polars::Array.new(Polars::Int64, 2)
956
+ # ).to_frame
957
+ # df.with_columns(null_count: Polars.col("a").arr.agg(Polars.element.null_count))
958
+ # # =>
959
+ # # shape: (3, 2)
960
+ # # ┌───────────────┬────────────┐
961
+ # # │ a ┆ null_count │
962
+ # # │ --- ┆ --- │
963
+ # # │ array[i64, 2] ┆ u32 │
964
+ # # ╞═══════════════╪════════════╡
965
+ # # │ [1, null] ┆ 1 │
966
+ # # │ [42, 13] ┆ 0 │
967
+ # # │ [null, null] ┆ 2 │
968
+ # # └───────────────┴────────────┘
969
+ #
970
+ # @example
971
+ # df.with_columns(no_nulls: Polars.col("a").arr.agg(Polars.element.drop_nulls))
972
+ # # =>
973
+ # # shape: (3, 2)
974
+ # # ┌───────────────┬───────────┐
975
+ # # │ a ┆ no_nulls │
976
+ # # │ --- ┆ --- │
977
+ # # │ array[i64, 2] ┆ list[i64] │
978
+ # # ╞═══════════════╪═══════════╡
979
+ # # │ [1, null] ┆ [1] │
980
+ # # │ [42, 13] ┆ [42, 13] │
981
+ # # │ [null, null] ┆ [] │
982
+ # # └───────────────┴───────────┘
983
+ def agg(expr)
984
+ Utils.wrap_expr(_rbexpr.arr_agg(expr._rbexpr))
985
+ end
917
986
  end
918
987
  end
@@ -16,7 +16,7 @@ module Polars
16
16
  #
17
17
  # @example
18
18
  # s = Polars::Series.new(
19
- # "a", [[1, 2], [4, 3]], dtype: Polars::Array.new(2, Polars::Int64)
19
+ # "a", [[1, 2], [4, 3]], dtype: Polars::Array.new(Polars::Int64, 2)
20
20
  # )
21
21
  # s.arr.min
22
22
  # # =>
@@ -36,7 +36,7 @@ module Polars
36
36
  #
37
37
  # @example
38
38
  # s = Polars::Series.new(
39
- # "a", [[1, 2], [4, 3]], dtype: Polars::Array.new(2, Polars::Int64)
39
+ # "a", [[1, 2], [4, 3]], dtype: Polars::Array.new(Polars::Int64, 2)
40
40
  # )
41
41
  # s.arr.max
42
42
  # # =>
@@ -57,7 +57,7 @@ module Polars
57
57
  # @example
58
58
  # df = Polars::DataFrame.new(
59
59
  # {"a" => [[1, 2], [4, 3]]},
60
- # schema: {"a" => Polars::Array.new(2, Polars::Int64)}
60
+ # schema: {"a" => Polars::Array.new(Polars::Int64, 2)}
61
61
  # )
62
62
  # df.select(Polars.col("a").arr.sum)
63
63
  # # =>
@@ -477,6 +477,10 @@ module Polars
477
477
  #
478
478
  # @param index [Integer]
479
479
  # Index to return per sublist
480
+ # @param null_on_oob [Boolean]
481
+ # Behavior if an index is out of bounds:
482
+ # true -> set as null
483
+ # false -> raise an error
480
484
  #
481
485
  # @return [Series]
482
486
  #
@@ -484,7 +488,7 @@ module Polars
484
488
  # s = Polars::Series.new(
485
489
  # "a", [[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype: Polars::Array.new(Polars::Int32, 3)
486
490
  # )
487
- # s.arr.get(Polars::Series.new([1, -2, 4]))
491
+ # s.arr.get(Polars::Series.new([1, -2, 4]), null_on_oob: true)
488
492
  # # =>
489
493
  # # shape: (3,)
490
494
  # # Series: 'a' [i32]
@@ -493,7 +497,7 @@ module Polars
493
497
  # # 5
494
498
  # # null
495
499
  # # ]
496
- def get(index)
500
+ def get(index, null_on_oob: false)
497
501
  super
498
502
  end
499
503
 
@@ -548,7 +552,7 @@ module Polars
548
552
  # @param ignore_nulls [Boolean]
549
553
  # Ignore null values (default).
550
554
  #
551
- # If set to `False`, null values will be propagated.
555
+ # If set to `false`, null values will be propagated.
552
556
  # If the sub-list contains any null values, the output is `nil`.
553
557
  #
554
558
  # @return [Series]
@@ -593,6 +597,8 @@ module Polars
593
597
  #
594
598
  # @param item [Object]
595
599
  # Item that will be checked for membership
600
+ # @param nulls_equal [Boolean]
601
+ # If true, treat null as a distinct value. Null values will not propagate.
596
602
  #
597
603
  # @return [Series]
598
604
  #
@@ -609,7 +615,7 @@ module Polars
609
615
  # # true
610
616
  # # false
611
617
  # # ]
612
- def contains(item)
618
+ def contains(item, nulls_equal: true)
613
619
  super
614
620
  end
615
621
 
@@ -700,5 +706,66 @@ module Polars
700
706
  def shift(n = 1)
701
707
  super
702
708
  end
709
+
710
+ # Run any polars expression against the arrays' elements.
711
+ #
712
+ # @param expr [Expr]
713
+ # Expression to run. Note that you can select an element with `Polars.element`.
714
+ # @param as_list [Boolean]
715
+ # Collect the resulting data as a list. This allows for expressions which
716
+ # output a variable amount of data.
717
+ #
718
+ # @return [Series]
719
+ #
720
+ # @example
721
+ # s = Polars::Series.new("a", [[1, 4], [8, 5], [3, 2]], dtype: Polars::Array.new(Polars::Int64, 2))
722
+ # s.arr.eval(Polars.element.rank)
723
+ # # =>
724
+ # # shape: (3,)
725
+ # # Series: 'a' [array[f64, 2]]
726
+ # # [
727
+ # # [1.0, 2.0]
728
+ # # [2.0, 1.0]
729
+ # # [2.0, 1.0]
730
+ # # ]
731
+ def eval(expr, as_list: false)
732
+ s = Utils.wrap_s(_s)
733
+ s.to_frame.select(F.col(s.name).arr.eval(expr, as_list: as_list)).to_series
734
+ end
735
+
736
+ # Run any polars aggregation expression against the arrays' elements.
737
+ #
738
+ # @param expr [Expr]
739
+ # Expression to run. Note that you can select an element with `Polars.element`.
740
+ #
741
+ # @return [Series]
742
+ #
743
+ # @example
744
+ # s = Polars::Series.new(
745
+ # "a", [[1, nil], [42, 13], [nil, nil]], dtype: Polars::Array.new(Polars::Int64, 2)
746
+ # )
747
+ # s.arr.agg(Polars.element.null_count)
748
+ # # =>
749
+ # # shape: (3,)
750
+ # # Series: 'a' [u32]
751
+ # # [
752
+ # # 1
753
+ # # 0
754
+ # # 2
755
+ # # ]
756
+ #
757
+ # @example
758
+ # s.arr.agg(Polars.element.drop_nulls)
759
+ # # =>
760
+ # # shape: (3,)
761
+ # # Series: 'a' [list[i64]]
762
+ # # [
763
+ # # [1]
764
+ # # [42, 13]
765
+ # # []
766
+ # # ]
767
+ def agg(expr)
768
+ super
769
+ end
703
770
  end
704
771
  end
@@ -4,18 +4,19 @@ module Polars
4
4
  attr_accessor :_reader, :new_columns
5
5
 
6
6
  def initialize(
7
- file,
7
+ source,
8
8
  has_header: true,
9
9
  columns: nil,
10
- sep: ",",
11
- comment_char: nil,
10
+ separator: ",",
11
+ comment_prefix: nil,
12
12
  quote_char: '"',
13
13
  skip_rows: 0,
14
- dtypes: nil,
14
+ skip_lines: 0,
15
+ schema_overrides: nil,
15
16
  null_values: nil,
16
17
  missing_utf8_is_empty_string: false,
17
18
  ignore_errors: false,
18
- parse_dates: false,
19
+ try_parse_dates: false,
19
20
  n_threads: nil,
20
21
  infer_schema_length: 100,
21
22
  batch_size: 50_000,
@@ -24,30 +25,28 @@ module Polars
24
25
  low_memory: false,
25
26
  rechunk: true,
26
27
  skip_rows_after_header: 0,
27
- row_count_name: nil,
28
- row_count_offset: 0,
28
+ row_index_name: nil,
29
+ row_index_offset: 0,
29
30
  eol_char: "\n",
30
31
  new_columns: nil,
31
32
  raise_if_empty: true,
32
33
  truncate_ragged_lines: false,
33
34
  decimal_comma: false
34
35
  )
35
- if Utils.pathlike?(file)
36
- path = Utils.normalize_filepath(file)
37
- end
36
+ path = Utils.normalize_filepath(source)
38
37
 
39
38
  dtype_list = nil
40
39
  dtype_slice = nil
41
- if !dtypes.nil?
42
- if dtypes.is_a?(Hash)
40
+ if !schema_overrides.nil?
41
+ if schema_overrides.is_a?(Hash)
43
42
  dtype_list = []
44
- dtypes.each do |k, v|
45
- dtype_list << [k, Utils.rb_type_to_dtype(v)]
43
+ schema_overrides.each do |k, v|
44
+ dtype_list << [k, Utils.parse_into_dtype(v)]
46
45
  end
47
- elsif dtypes.is_a?(::Array)
48
- dtype_slice = dtypes
46
+ elsif schema_overrides.is_a?(::Array)
47
+ dtype_slice = schema_overrides
49
48
  else
50
- raise ArgumentError, "dtype arg should be list or dict"
49
+ raise TypeError, "dtype arg should be array or hash"
51
50
  end
52
51
  end
53
52
 
@@ -61,8 +60,9 @@ module Polars
61
60
  ignore_errors,
62
61
  n_rows,
63
62
  skip_rows,
63
+ skip_lines,
64
64
  projection,
65
- sep,
65
+ separator,
66
66
  rechunk,
67
67
  columns,
68
68
  encoding,
@@ -71,13 +71,13 @@ module Polars
71
71
  dtype_list,
72
72
  dtype_slice,
73
73
  low_memory,
74
- comment_char,
74
+ comment_prefix,
75
75
  quote_char,
76
76
  processed_null_values,
77
77
  missing_utf8_is_empty_string,
78
- parse_dates,
78
+ try_parse_dates,
79
79
  skip_rows_after_header,
80
- Utils.parse_row_index_args(row_count_name, row_count_offset),
80
+ Utils.parse_row_index_args(row_index_name, row_index_offset),
81
81
  eol_char,
82
82
  raise_if_empty,
83
83
  truncate_ragged_lines,