polars-df 0.1.4 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,254 @@
1
+ pub mod dataframe;
2
+ pub mod series;
3
+
4
+ use magnus::{RHash, Value};
5
+ use polars::chunked_array::builder::get_list_builder;
6
+ use polars::prelude::*;
7
+ use polars_core::export::rayon::prelude::*;
8
+ use polars_core::utils::CustomIterTools;
9
+ use polars_core::POOL;
10
+
11
+ use crate::{ObjectValue, RbPolarsErr, RbResult, RbSeries, Wrap};
12
+
13
+ pub trait RbArrowPrimitiveType: PolarsNumericType {}
14
+
15
+ impl RbArrowPrimitiveType for UInt8Type {}
16
+ impl RbArrowPrimitiveType for UInt16Type {}
17
+ impl RbArrowPrimitiveType for UInt32Type {}
18
+ impl RbArrowPrimitiveType for UInt64Type {}
19
+ impl RbArrowPrimitiveType for Int8Type {}
20
+ impl RbArrowPrimitiveType for Int16Type {}
21
+ impl RbArrowPrimitiveType for Int32Type {}
22
+ impl RbArrowPrimitiveType for Int64Type {}
23
+ impl RbArrowPrimitiveType for Float32Type {}
24
+ impl RbArrowPrimitiveType for Float64Type {}
25
+
26
+ fn iterator_to_struct(
27
+ it: impl Iterator<Item = Option<Value>>,
28
+ init_null_count: usize,
29
+ first_value: AnyValue,
30
+ name: &str,
31
+ capacity: usize,
32
+ ) -> RbResult<RbSeries> {
33
+ let (vals, flds) = match &first_value {
34
+ AnyValue::Struct(vals, flds) => (&**vals, *flds),
35
+ AnyValue::StructOwned(payload) => (&*payload.0, &*payload.1),
36
+ _ => {
37
+ return Err(crate::error::ComputeError::new_err(format!(
38
+ "expected struct got {:?}",
39
+ first_value
40
+ )))
41
+ }
42
+ };
43
+
44
+ let struct_width = vals.len();
45
+
46
+ // every item in the struct is kept as its own buffer of anyvalues
47
+ // so as struct with 2 items: {a, b}
48
+ // will have
49
+ // [
50
+ // [ a values ]
51
+ // [ b values ]
52
+ // ]
53
+ let mut items = Vec::with_capacity(vals.len());
54
+ for item in vals {
55
+ let mut buf = Vec::with_capacity(capacity);
56
+ for _ in 0..init_null_count {
57
+ buf.push(AnyValue::Null);
58
+ }
59
+ buf.push(item.clone());
60
+ items.push(buf);
61
+ }
62
+
63
+ for dict in it {
64
+ match dict {
65
+ None => {
66
+ for field_items in &mut items {
67
+ field_items.push(AnyValue::Null);
68
+ }
69
+ }
70
+ Some(dict) => {
71
+ let dict = dict.try_convert::<RHash>()?;
72
+ if dict.len() != struct_width {
73
+ return Err(crate::error::ComputeError::new_err(
74
+ format!("Cannot create struct type.\n> The struct dtype expects {} fields, but it got a dict with {} fields.", struct_width, dict.len())
75
+ ));
76
+ }
77
+ // we ignore the keys of the rest of the dicts
78
+ // the first item determines the output name
79
+ todo!()
80
+ // for ((_, val), field_items) in dict.iter().zip(&mut items) {
81
+ // let item = val.try_convert::<Wrap<AnyValue>>()?;
82
+ // field_items.push(item.0)
83
+ // }
84
+ }
85
+ }
86
+ }
87
+
88
+ let fields = POOL.install(|| {
89
+ items
90
+ .par_iter()
91
+ .zip(flds)
92
+ .map(|(av, fld)| Series::new(fld.name(), av))
93
+ .collect::<Vec<_>>()
94
+ });
95
+
96
+ Ok(StructChunked::new(name, &fields)
97
+ .unwrap()
98
+ .into_series()
99
+ .into())
100
+ }
101
+
102
+ fn iterator_to_primitive<T>(
103
+ it: impl Iterator<Item = Option<T::Native>>,
104
+ init_null_count: usize,
105
+ first_value: Option<T::Native>,
106
+ name: &str,
107
+ capacity: usize,
108
+ ) -> ChunkedArray<T>
109
+ where
110
+ T: RbArrowPrimitiveType,
111
+ {
112
+ // safety: we know the iterators len
113
+ let mut ca: ChunkedArray<T> = unsafe {
114
+ if init_null_count > 0 {
115
+ (0..init_null_count)
116
+ .map(|_| None)
117
+ .chain(std::iter::once(first_value))
118
+ .chain(it)
119
+ .trust_my_length(capacity)
120
+ .collect_trusted()
121
+ } else if first_value.is_some() {
122
+ std::iter::once(first_value)
123
+ .chain(it)
124
+ .trust_my_length(capacity)
125
+ .collect_trusted()
126
+ } else {
127
+ it.collect()
128
+ }
129
+ };
130
+ debug_assert_eq!(ca.len(), capacity);
131
+ ca.rename(name);
132
+ ca
133
+ }
134
+
135
+ fn iterator_to_bool(
136
+ it: impl Iterator<Item = Option<bool>>,
137
+ init_null_count: usize,
138
+ first_value: Option<bool>,
139
+ name: &str,
140
+ capacity: usize,
141
+ ) -> ChunkedArray<BooleanType> {
142
+ // safety: we know the iterators len
143
+ let mut ca: BooleanChunked = unsafe {
144
+ if init_null_count > 0 {
145
+ (0..init_null_count)
146
+ .map(|_| None)
147
+ .chain(std::iter::once(first_value))
148
+ .chain(it)
149
+ .trust_my_length(capacity)
150
+ .collect_trusted()
151
+ } else if first_value.is_some() {
152
+ std::iter::once(first_value)
153
+ .chain(it)
154
+ .trust_my_length(capacity)
155
+ .collect_trusted()
156
+ } else {
157
+ it.collect()
158
+ }
159
+ };
160
+ debug_assert_eq!(ca.len(), capacity);
161
+ ca.rename(name);
162
+ ca
163
+ }
164
+
165
+ fn iterator_to_object(
166
+ it: impl Iterator<Item = Option<ObjectValue>>,
167
+ init_null_count: usize,
168
+ first_value: Option<ObjectValue>,
169
+ name: &str,
170
+ capacity: usize,
171
+ ) -> ObjectChunked<ObjectValue> {
172
+ // safety: we know the iterators len
173
+ let mut ca: ObjectChunked<ObjectValue> = unsafe {
174
+ if init_null_count > 0 {
175
+ (0..init_null_count)
176
+ .map(|_| None)
177
+ .chain(std::iter::once(first_value))
178
+ .chain(it)
179
+ .trust_my_length(capacity)
180
+ .collect_trusted()
181
+ } else if first_value.is_some() {
182
+ std::iter::once(first_value)
183
+ .chain(it)
184
+ .trust_my_length(capacity)
185
+ .collect_trusted()
186
+ } else {
187
+ it.collect()
188
+ }
189
+ };
190
+ debug_assert_eq!(ca.len(), capacity);
191
+ ca.rename(name);
192
+ ca
193
+ }
194
+
195
+ fn iterator_to_utf8(
196
+ it: impl Iterator<Item = Option<String>>,
197
+ init_null_count: usize,
198
+ first_value: Option<&str>,
199
+ name: &str,
200
+ capacity: usize,
201
+ ) -> Utf8Chunked {
202
+ let first_value = first_value.map(|v| v.to_string());
203
+
204
+ // safety: we know the iterators len
205
+ let mut ca: Utf8Chunked = unsafe {
206
+ if init_null_count > 0 {
207
+ (0..init_null_count)
208
+ .map(|_| None)
209
+ .chain(std::iter::once(first_value))
210
+ .chain(it)
211
+ .trust_my_length(capacity)
212
+ .collect_trusted()
213
+ } else if first_value.is_some() {
214
+ std::iter::once(first_value)
215
+ .chain(it)
216
+ .trust_my_length(capacity)
217
+ .collect_trusted()
218
+ } else {
219
+ it.collect()
220
+ }
221
+ };
222
+ debug_assert_eq!(ca.len(), capacity);
223
+ ca.rename(name);
224
+ ca
225
+ }
226
+
227
+ fn iterator_to_list(
228
+ dt: &DataType,
229
+ it: impl Iterator<Item = Option<Series>>,
230
+ init_null_count: usize,
231
+ first_value: Option<&Series>,
232
+ name: &str,
233
+ capacity: usize,
234
+ ) -> RbResult<ListChunked> {
235
+ let mut builder =
236
+ get_list_builder(dt, capacity * 5, capacity, name).map_err(RbPolarsErr::from)?;
237
+ for _ in 0..init_null_count {
238
+ builder.append_null()
239
+ }
240
+ builder.append_opt_series(first_value);
241
+ for opt_val in it {
242
+ match opt_val {
243
+ None => builder.append_null(),
244
+ Some(s) => {
245
+ if s.len() == 0 && s.dtype() != dt {
246
+ builder.append_series(&Series::full_null("", 0, dt))
247
+ } else {
248
+ builder.append_series(&s)
249
+ }
250
+ }
251
+ }
252
+ }
253
+ Ok(builder.finish())
254
+ }