polars-df 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,254 @@
1
+ pub mod dataframe;
2
+ pub mod series;
3
+
4
+ use magnus::{RHash, Value};
5
+ use polars::chunked_array::builder::get_list_builder;
6
+ use polars::prelude::*;
7
+ use polars_core::export::rayon::prelude::*;
8
+ use polars_core::utils::CustomIterTools;
9
+ use polars_core::POOL;
10
+
11
+ use crate::{ObjectValue, RbPolarsErr, RbResult, RbSeries, Wrap};
12
+
13
+ pub trait RbArrowPrimitiveType: PolarsNumericType {}
14
+
15
+ impl RbArrowPrimitiveType for UInt8Type {}
16
+ impl RbArrowPrimitiveType for UInt16Type {}
17
+ impl RbArrowPrimitiveType for UInt32Type {}
18
+ impl RbArrowPrimitiveType for UInt64Type {}
19
+ impl RbArrowPrimitiveType for Int8Type {}
20
+ impl RbArrowPrimitiveType for Int16Type {}
21
+ impl RbArrowPrimitiveType for Int32Type {}
22
+ impl RbArrowPrimitiveType for Int64Type {}
23
+ impl RbArrowPrimitiveType for Float32Type {}
24
+ impl RbArrowPrimitiveType for Float64Type {}
25
+
26
+ fn iterator_to_struct(
27
+ it: impl Iterator<Item = Option<Value>>,
28
+ init_null_count: usize,
29
+ first_value: AnyValue,
30
+ name: &str,
31
+ capacity: usize,
32
+ ) -> RbResult<RbSeries> {
33
+ let (vals, flds) = match &first_value {
34
+ AnyValue::Struct(vals, flds) => (&**vals, *flds),
35
+ AnyValue::StructOwned(payload) => (&*payload.0, &*payload.1),
36
+ _ => {
37
+ return Err(crate::error::ComputeError::new_err(format!(
38
+ "expected struct got {:?}",
39
+ first_value
40
+ )))
41
+ }
42
+ };
43
+
44
+ let struct_width = vals.len();
45
+
46
+ // every item in the struct is kept as its own buffer of anyvalues
47
+ // so as struct with 2 items: {a, b}
48
+ // will have
49
+ // [
50
+ // [ a values ]
51
+ // [ b values ]
52
+ // ]
53
+ let mut items = Vec::with_capacity(vals.len());
54
+ for item in vals {
55
+ let mut buf = Vec::with_capacity(capacity);
56
+ for _ in 0..init_null_count {
57
+ buf.push(AnyValue::Null);
58
+ }
59
+ buf.push(item.clone());
60
+ items.push(buf);
61
+ }
62
+
63
+ for dict in it {
64
+ match dict {
65
+ None => {
66
+ for field_items in &mut items {
67
+ field_items.push(AnyValue::Null);
68
+ }
69
+ }
70
+ Some(dict) => {
71
+ let dict = dict.try_convert::<RHash>()?;
72
+ if dict.len() != struct_width {
73
+ return Err(crate::error::ComputeError::new_err(
74
+ format!("Cannot create struct type.\n> The struct dtype expects {} fields, but it got a dict with {} fields.", struct_width, dict.len())
75
+ ));
76
+ }
77
+ // we ignore the keys of the rest of the dicts
78
+ // the first item determines the output name
79
+ todo!()
80
+ // for ((_, val), field_items) in dict.iter().zip(&mut items) {
81
+ // let item = val.try_convert::<Wrap<AnyValue>>()?;
82
+ // field_items.push(item.0)
83
+ // }
84
+ }
85
+ }
86
+ }
87
+
88
+ let fields = POOL.install(|| {
89
+ items
90
+ .par_iter()
91
+ .zip(flds)
92
+ .map(|(av, fld)| Series::new(fld.name(), av))
93
+ .collect::<Vec<_>>()
94
+ });
95
+
96
+ Ok(StructChunked::new(name, &fields)
97
+ .unwrap()
98
+ .into_series()
99
+ .into())
100
+ }
101
+
102
+ fn iterator_to_primitive<T>(
103
+ it: impl Iterator<Item = Option<T::Native>>,
104
+ init_null_count: usize,
105
+ first_value: Option<T::Native>,
106
+ name: &str,
107
+ capacity: usize,
108
+ ) -> ChunkedArray<T>
109
+ where
110
+ T: RbArrowPrimitiveType,
111
+ {
112
+ // safety: we know the iterators len
113
+ let mut ca: ChunkedArray<T> = unsafe {
114
+ if init_null_count > 0 {
115
+ (0..init_null_count)
116
+ .map(|_| None)
117
+ .chain(std::iter::once(first_value))
118
+ .chain(it)
119
+ .trust_my_length(capacity)
120
+ .collect_trusted()
121
+ } else if first_value.is_some() {
122
+ std::iter::once(first_value)
123
+ .chain(it)
124
+ .trust_my_length(capacity)
125
+ .collect_trusted()
126
+ } else {
127
+ it.collect()
128
+ }
129
+ };
130
+ debug_assert_eq!(ca.len(), capacity);
131
+ ca.rename(name);
132
+ ca
133
+ }
134
+
135
+ fn iterator_to_bool(
136
+ it: impl Iterator<Item = Option<bool>>,
137
+ init_null_count: usize,
138
+ first_value: Option<bool>,
139
+ name: &str,
140
+ capacity: usize,
141
+ ) -> ChunkedArray<BooleanType> {
142
+ // safety: we know the iterators len
143
+ let mut ca: BooleanChunked = unsafe {
144
+ if init_null_count > 0 {
145
+ (0..init_null_count)
146
+ .map(|_| None)
147
+ .chain(std::iter::once(first_value))
148
+ .chain(it)
149
+ .trust_my_length(capacity)
150
+ .collect_trusted()
151
+ } else if first_value.is_some() {
152
+ std::iter::once(first_value)
153
+ .chain(it)
154
+ .trust_my_length(capacity)
155
+ .collect_trusted()
156
+ } else {
157
+ it.collect()
158
+ }
159
+ };
160
+ debug_assert_eq!(ca.len(), capacity);
161
+ ca.rename(name);
162
+ ca
163
+ }
164
+
165
+ fn iterator_to_object(
166
+ it: impl Iterator<Item = Option<ObjectValue>>,
167
+ init_null_count: usize,
168
+ first_value: Option<ObjectValue>,
169
+ name: &str,
170
+ capacity: usize,
171
+ ) -> ObjectChunked<ObjectValue> {
172
+ // safety: we know the iterators len
173
+ let mut ca: ObjectChunked<ObjectValue> = unsafe {
174
+ if init_null_count > 0 {
175
+ (0..init_null_count)
176
+ .map(|_| None)
177
+ .chain(std::iter::once(first_value))
178
+ .chain(it)
179
+ .trust_my_length(capacity)
180
+ .collect_trusted()
181
+ } else if first_value.is_some() {
182
+ std::iter::once(first_value)
183
+ .chain(it)
184
+ .trust_my_length(capacity)
185
+ .collect_trusted()
186
+ } else {
187
+ it.collect()
188
+ }
189
+ };
190
+ debug_assert_eq!(ca.len(), capacity);
191
+ ca.rename(name);
192
+ ca
193
+ }
194
+
195
+ fn iterator_to_utf8(
196
+ it: impl Iterator<Item = Option<String>>,
197
+ init_null_count: usize,
198
+ first_value: Option<&str>,
199
+ name: &str,
200
+ capacity: usize,
201
+ ) -> Utf8Chunked {
202
+ let first_value = first_value.map(|v| v.to_string());
203
+
204
+ // safety: we know the iterators len
205
+ let mut ca: Utf8Chunked = unsafe {
206
+ if init_null_count > 0 {
207
+ (0..init_null_count)
208
+ .map(|_| None)
209
+ .chain(std::iter::once(first_value))
210
+ .chain(it)
211
+ .trust_my_length(capacity)
212
+ .collect_trusted()
213
+ } else if first_value.is_some() {
214
+ std::iter::once(first_value)
215
+ .chain(it)
216
+ .trust_my_length(capacity)
217
+ .collect_trusted()
218
+ } else {
219
+ it.collect()
220
+ }
221
+ };
222
+ debug_assert_eq!(ca.len(), capacity);
223
+ ca.rename(name);
224
+ ca
225
+ }
226
+
227
+ fn iterator_to_list(
228
+ dt: &DataType,
229
+ it: impl Iterator<Item = Option<Series>>,
230
+ init_null_count: usize,
231
+ first_value: Option<&Series>,
232
+ name: &str,
233
+ capacity: usize,
234
+ ) -> RbResult<ListChunked> {
235
+ let mut builder =
236
+ get_list_builder(dt, capacity * 5, capacity, name).map_err(RbPolarsErr::from)?;
237
+ for _ in 0..init_null_count {
238
+ builder.append_null()
239
+ }
240
+ builder.append_opt_series(first_value);
241
+ for opt_val in it {
242
+ match opt_val {
243
+ None => builder.append_null(),
244
+ Some(s) => {
245
+ if s.len() == 0 && s.dtype() != dt {
246
+ builder.append_series(&Series::full_null("", 0, dt))
247
+ } else {
248
+ builder.append_series(&s)
249
+ }
250
+ }
251
+ }
252
+ }
253
+ Ok(builder.finish())
254
+ }