polars-df 0.1.4 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/Cargo.toml CHANGED
@@ -3,6 +3,8 @@ members = ["ext/polars"]
3
3
 
4
4
  [patch.crates-io]
5
5
  jsonpath_lib = { git = "https://github.com/ritchie46/jsonpath", rev = "24eaf0b4416edff38a4d1b6b17bc4b9f3f047b4b" }
6
+ halfbrown = { git = "https://github.com/Licenser/halfbrown", rev = "952023c5dd6461b009bb5ba66b9aa979bd75949f" }
7
+ arrow2 = { git = "https://github.com/ankane/arrow2", rev = "9f36b2b97446e6dd495473e4361a70d863ac8027" }
6
8
 
7
9
  [profile.release]
8
10
  strip = true
data/LICENSE.txt CHANGED
@@ -1,5 +1,5 @@
1
1
  Copyright (c) 2020 Ritchie Vink
2
- Copyright (c) 2022 Andrew Kane
2
+ Copyright (c) 2022-2023 Andrew Kane
3
3
 
4
4
  Permission is hereby granted, free of charge, to any person obtaining a copy
5
5
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -12,8 +12,6 @@ Add this line to your application’s Gemfile:
12
12
  gem "polars-df"
13
13
  ```
14
14
 
15
- Note: Rust is currently required for installation, and it can take 15-20 minutes to compile the extension.
16
-
17
15
  ## Getting Started
18
16
 
19
17
  This library follows the [Polars Python API](https://pola-rs.github.io/polars/py-polars/html/reference/index.html).
@@ -1,6 +1,7 @@
1
1
  [package]
2
2
  name = "polars"
3
- version = "0.1.4"
3
+ version = "0.2.0"
4
+ license = "MIT"
4
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
5
6
  edition = "2021"
6
7
  publish = false
@@ -9,16 +10,19 @@ publish = false
9
10
  crate-type = ["cdylib"]
10
11
 
11
12
  [dependencies]
13
+ ahash = "0.8"
12
14
  magnus = "0.4"
13
- polars-core = "0.25.1"
15
+ polars-core = "0.26.1"
14
16
  serde_json = "1"
15
17
 
16
18
  [dependencies.polars]
17
- version = "0.25.1"
19
+ version = "0.26.1"
18
20
  features = [
19
21
  "abs",
20
22
  "arange",
21
23
  "arg_where",
24
+ "asof_join",
25
+ "avro",
22
26
  "concat_str",
23
27
  "cse",
24
28
  "csv-file",
@@ -53,6 +57,7 @@ features = [
53
57
  "partition_by",
54
58
  "pct_change",
55
59
  "performant",
60
+ "pivot",
56
61
  "product",
57
62
  "propagate_nans",
58
63
  "random",
@@ -61,6 +66,7 @@ features = [
61
66
  "repeat_by",
62
67
  "rolling_window",
63
68
  "round_series",
69
+ "row_hash",
64
70
  "search_sorted",
65
71
  "semi_anti_join",
66
72
  "serde-lazy",
@@ -0,0 +1,303 @@
1
+ use magnus::{class, RArray, TryConvert, Value};
2
+ use polars::prelude::*;
3
+ use polars_core::frame::row::{rows_to_schema_first_non_null, Row};
4
+ use polars_core::series::SeriesIter;
5
+
6
+ use super::*;
7
+ use crate::{RbDataFrame, RbPolarsErr, RbSeries, Wrap};
8
+
9
+ fn get_iters(df: &DataFrame) -> Vec<SeriesIter> {
10
+ df.get_columns().iter().map(|s| s.iter()).collect()
11
+ }
12
+
13
+ fn get_iters_skip(df: &DataFrame, skip: usize) -> Vec<std::iter::Skip<SeriesIter>> {
14
+ df.get_columns()
15
+ .iter()
16
+ .map(|s| s.iter().skip(skip))
17
+ .collect()
18
+ }
19
+
20
+ pub fn apply_lambda_unknown<'a>(
21
+ df: &'a DataFrame,
22
+ lambda: Value,
23
+ inference_size: usize,
24
+ ) -> RbResult<(Value, bool)> {
25
+ let mut null_count = 0;
26
+ let mut iters = get_iters(df);
27
+
28
+ for _ in 0..df.height() {
29
+ let iter = iters.iter_mut().map(|it| Wrap(it.next().unwrap()));
30
+ let arg = (iter.collect::<Vec<Wrap<AnyValue>>>(),);
31
+ let out: Value = lambda.funcall("call", arg)?;
32
+
33
+ if out.is_nil() {
34
+ null_count += 1;
35
+ continue;
36
+ } else if out.is_kind_of(class::true_class()) || out.is_kind_of(class::false_class()) {
37
+ let first_value = out.try_convert::<bool>().ok();
38
+ return Ok((
39
+ RbSeries::new(
40
+ apply_lambda_with_bool_out_type(df, lambda, null_count, first_value)
41
+ .into_series(),
42
+ )
43
+ .into(),
44
+ false,
45
+ ));
46
+ } else if out.is_kind_of(class::float()) {
47
+ let first_value = out.try_convert::<f64>().ok();
48
+
49
+ return Ok((
50
+ RbSeries::new(
51
+ apply_lambda_with_primitive_out_type::<Float64Type>(
52
+ df,
53
+ lambda,
54
+ null_count,
55
+ first_value,
56
+ )
57
+ .into_series(),
58
+ )
59
+ .into(),
60
+ false,
61
+ ));
62
+ } else if out.is_kind_of(class::integer()) {
63
+ let first_value = out.try_convert::<i64>().ok();
64
+ return Ok((
65
+ RbSeries::new(
66
+ apply_lambda_with_primitive_out_type::<Int64Type>(
67
+ df,
68
+ lambda,
69
+ null_count,
70
+ first_value,
71
+ )
72
+ .into_series(),
73
+ )
74
+ .into(),
75
+ false,
76
+ ));
77
+ // } else if out.is_kind_of(class::string()) {
78
+ // let first_value = out.try_convert::<String>().ok();
79
+ // return Ok((
80
+ // RbSeries::new(
81
+ // apply_lambda_with_utf8_out_type(df, lambda, null_count, first_value)
82
+ // .into_series(),
83
+ // )
84
+ // .into(),
85
+ // false,
86
+ // ));
87
+ } else if out.respond_to("_s", true)? {
88
+ let rb_rbseries: Value = out.funcall("_s", ()).unwrap();
89
+ let series = rb_rbseries
90
+ .try_convert::<&RbSeries>()
91
+ .unwrap()
92
+ .series
93
+ .borrow();
94
+ let dt = series.dtype();
95
+ return Ok((
96
+ RbSeries::new(
97
+ apply_lambda_with_list_out_type(df, lambda, null_count, Some(&series), dt)?
98
+ .into_series(),
99
+ )
100
+ .into(),
101
+ false,
102
+ ));
103
+ } else if out.try_convert::<Wrap<Row<'a>>>().is_ok() {
104
+ let first_value = out.try_convert::<Wrap<Row<'a>>>().unwrap().0;
105
+ return Ok((
106
+ RbDataFrame::from(
107
+ apply_lambda_with_rows_output(
108
+ df,
109
+ lambda,
110
+ null_count,
111
+ first_value,
112
+ inference_size,
113
+ )
114
+ .map_err(RbPolarsErr::from)?,
115
+ )
116
+ .into(),
117
+ true,
118
+ ));
119
+ } else if out.is_kind_of(class::array()) {
120
+ return Err(RbPolarsErr::other(
121
+ "A list output type is invalid. Do you mean to create polars List Series?\
122
+ Then return a Series object."
123
+ .into(),
124
+ ));
125
+ } else {
126
+ return Err(RbPolarsErr::other("Could not determine output type".into()));
127
+ }
128
+ }
129
+ Err(RbPolarsErr::other("Could not determine output type".into()))
130
+ }
131
+
132
+ fn apply_iter<T>(
133
+ df: &DataFrame,
134
+ lambda: Value,
135
+ init_null_count: usize,
136
+ skip: usize,
137
+ ) -> impl Iterator<Item = Option<T>> + '_
138
+ where
139
+ T: TryConvert,
140
+ {
141
+ let mut iters = get_iters_skip(df, init_null_count + skip);
142
+ ((init_null_count + skip)..df.height()).map(move |_| {
143
+ let iter = iters.iter_mut().map(|it| Wrap(it.next().unwrap()));
144
+ let tpl = (iter.collect::<Vec<Wrap<AnyValue>>>(),);
145
+ match lambda.funcall::<_, _, Value>("call", tpl) {
146
+ Ok(val) => val.try_convert::<T>().ok(),
147
+ Err(e) => panic!("ruby function failed {}", e),
148
+ }
149
+ })
150
+ }
151
+
152
+ /// Apply a lambda with a primitive output type
153
+ pub fn apply_lambda_with_primitive_out_type<D>(
154
+ df: &DataFrame,
155
+ lambda: Value,
156
+ init_null_count: usize,
157
+ first_value: Option<D::Native>,
158
+ ) -> ChunkedArray<D>
159
+ where
160
+ D: RbArrowPrimitiveType,
161
+ D::Native: Into<Value> + TryConvert,
162
+ {
163
+ let skip = usize::from(first_value.is_some());
164
+ if init_null_count == df.height() {
165
+ ChunkedArray::full_null("apply", df.height())
166
+ } else {
167
+ let iter = apply_iter(df, lambda, init_null_count, skip);
168
+ iterator_to_primitive(iter, init_null_count, first_value, "apply", df.height())
169
+ }
170
+ }
171
+
172
+ /// Apply a lambda with a boolean output type
173
+ pub fn apply_lambda_with_bool_out_type(
174
+ df: &DataFrame,
175
+ lambda: Value,
176
+ init_null_count: usize,
177
+ first_value: Option<bool>,
178
+ ) -> ChunkedArray<BooleanType> {
179
+ let skip = usize::from(first_value.is_some());
180
+ if init_null_count == df.height() {
181
+ ChunkedArray::full_null("apply", df.height())
182
+ } else {
183
+ let iter = apply_iter(df, lambda, init_null_count, skip);
184
+ iterator_to_bool(iter, init_null_count, first_value, "apply", df.height())
185
+ }
186
+ }
187
+
188
+ /// Apply a lambda with utf8 output type
189
+ pub fn apply_lambda_with_utf8_out_type(
190
+ df: &DataFrame,
191
+ lambda: Value,
192
+ init_null_count: usize,
193
+ first_value: Option<&str>,
194
+ ) -> Utf8Chunked {
195
+ let skip = usize::from(first_value.is_some());
196
+ if init_null_count == df.height() {
197
+ ChunkedArray::full_null("apply", df.height())
198
+ } else {
199
+ let iter = apply_iter::<String>(df, lambda, init_null_count, skip);
200
+ iterator_to_utf8(iter, init_null_count, first_value, "apply", df.height())
201
+ }
202
+ }
203
+
204
+ /// Apply a lambda with list output type
205
+ pub fn apply_lambda_with_list_out_type<'a>(
206
+ df: &'a DataFrame,
207
+ lambda: Value,
208
+ init_null_count: usize,
209
+ first_value: Option<&Series>,
210
+ dt: &DataType,
211
+ ) -> RbResult<ListChunked> {
212
+ let skip = usize::from(first_value.is_some());
213
+ if init_null_count == df.height() {
214
+ Ok(ChunkedArray::full_null("apply", df.height()))
215
+ } else {
216
+ let mut iters = get_iters_skip(df, init_null_count + skip);
217
+ let iter = ((init_null_count + skip)..df.height()).map(|_| {
218
+ let iter = iters.iter_mut().map(|it| Wrap(it.next().unwrap()));
219
+ let tpl = (iter.collect::<Vec<Wrap<AnyValue>>>(),);
220
+ match lambda.funcall::<_, _, Value>("call", tpl) {
221
+ Ok(val) => match val.funcall::<_, _, Value>("_s", ()) {
222
+ Ok(val) => val
223
+ .try_convert::<&RbSeries>()
224
+ .ok()
225
+ .map(|ps| ps.series.borrow().clone()),
226
+ Err(_) => {
227
+ if val.is_nil() {
228
+ None
229
+ } else {
230
+ panic!("should return a Series, got a {:?}", val)
231
+ }
232
+ }
233
+ },
234
+ Err(e) => panic!("ruby function failed {}", e),
235
+ }
236
+ });
237
+ iterator_to_list(dt, iter, init_null_count, first_value, "apply", df.height())
238
+ }
239
+ }
240
+
241
+ pub fn apply_lambda_with_rows_output<'a>(
242
+ df: &'a DataFrame,
243
+ lambda: Value,
244
+ init_null_count: usize,
245
+ first_value: Row<'a>,
246
+ inference_size: usize,
247
+ ) -> PolarsResult<DataFrame> {
248
+ let width = first_value.0.len();
249
+ let null_row = Row::new(vec![AnyValue::Null; width]);
250
+
251
+ let mut row_buf = Row::default();
252
+
253
+ let skip = 1;
254
+ let mut iters = get_iters_skip(df, init_null_count + skip);
255
+ let mut row_iter = ((init_null_count + skip)..df.height()).map(|_| {
256
+ let iter = iters.iter_mut().map(|it| Wrap(it.next().unwrap()));
257
+ let tpl = (iter.collect::<Vec<Wrap<AnyValue>>>(),);
258
+ match lambda.funcall::<_, _, Value>("call", tpl) {
259
+ Ok(val) => {
260
+ match val.try_convert::<RArray>().ok() {
261
+ Some(tuple) => {
262
+ row_buf.0.clear();
263
+ for v in tuple.each() {
264
+ let v = v.unwrap().try_convert::<Wrap<AnyValue>>().unwrap().0;
265
+ row_buf.0.push(v);
266
+ }
267
+ let ptr = &row_buf as *const Row;
268
+ // Safety:
269
+ // we know that row constructor of polars dataframe does not keep a reference
270
+ // to the row. Before we mutate the row buf again, the reference is dropped.
271
+ // we only cannot prove it to the compiler.
272
+ // we still do this because it saves a Vec allocation in a hot loop.
273
+ unsafe { &*ptr }
274
+ }
275
+ None => &null_row,
276
+ }
277
+ }
278
+ Err(e) => panic!("ruby function failed {}", e),
279
+ }
280
+ });
281
+
282
+ // first rows for schema inference
283
+ let mut buf = Vec::with_capacity(inference_size);
284
+ buf.push(first_value);
285
+ buf.extend((&mut row_iter).take(inference_size).cloned());
286
+ let schema = rows_to_schema_first_non_null(&buf, Some(50));
287
+
288
+ if init_null_count > 0 {
289
+ // Safety: we know the iterators size
290
+ let iter = unsafe {
291
+ (0..init_null_count)
292
+ .map(|_| &null_row)
293
+ .chain(buf.iter())
294
+ .chain(row_iter)
295
+ .trust_my_length(df.height())
296
+ };
297
+ DataFrame::from_rows_iter_and_schema(iter, &schema)
298
+ } else {
299
+ // Safety: we know the iterators size
300
+ let iter = unsafe { buf.iter().chain(row_iter).trust_my_length(df.height()) };
301
+ DataFrame::from_rows_iter_and_schema(iter, &schema)
302
+ }
303
+ }
@@ -0,0 +1,253 @@
1
+ pub mod dataframe;
2
+ pub mod series;
3
+
4
+ use magnus::{RHash, Value};
5
+ use polars::chunked_array::builder::get_list_builder;
6
+ use polars::prelude::*;
7
+ use polars_core::export::rayon::prelude::*;
8
+ use polars_core::utils::CustomIterTools;
9
+ use polars_core::POOL;
10
+
11
+ use crate::{ObjectValue, RbPolarsErr, RbResult, RbSeries, Wrap};
12
+
13
+ pub trait RbArrowPrimitiveType: PolarsNumericType {}
14
+
15
+ impl RbArrowPrimitiveType for UInt8Type {}
16
+ impl RbArrowPrimitiveType for UInt16Type {}
17
+ impl RbArrowPrimitiveType for UInt32Type {}
18
+ impl RbArrowPrimitiveType for UInt64Type {}
19
+ impl RbArrowPrimitiveType for Int8Type {}
20
+ impl RbArrowPrimitiveType for Int16Type {}
21
+ impl RbArrowPrimitiveType for Int32Type {}
22
+ impl RbArrowPrimitiveType for Int64Type {}
23
+ impl RbArrowPrimitiveType for Float32Type {}
24
+ impl RbArrowPrimitiveType for Float64Type {}
25
+
26
+ fn iterator_to_struct(
27
+ it: impl Iterator<Item = Option<Value>>,
28
+ init_null_count: usize,
29
+ first_value: AnyValue,
30
+ name: &str,
31
+ capacity: usize,
32
+ ) -> RbResult<RbSeries> {
33
+ let (vals, flds) = match &first_value {
34
+ av @ AnyValue::Struct(_, _, flds) => (av._iter_struct_av().collect::<Vec<_>>(), &**flds),
35
+ AnyValue::StructOwned(payload) => (payload.0.clone(), &*payload.1),
36
+ _ => {
37
+ return Err(crate::error::ComputeError::new_err(format!(
38
+ "expected struct got {first_value:?}",
39
+ )))
40
+ }
41
+ };
42
+
43
+ let struct_width = vals.len();
44
+
45
+ // every item in the struct is kept as its own buffer of anyvalues
46
+ // so as struct with 2 items: {a, b}
47
+ // will have
48
+ // [
49
+ // [ a values ]
50
+ // [ b values ]
51
+ // ]
52
+ let mut items = Vec::with_capacity(vals.len());
53
+ for item in vals {
54
+ let mut buf = Vec::with_capacity(capacity);
55
+ for _ in 0..init_null_count {
56
+ buf.push(AnyValue::Null);
57
+ }
58
+ buf.push(item.clone());
59
+ items.push(buf);
60
+ }
61
+
62
+ for dict in it {
63
+ match dict {
64
+ None => {
65
+ for field_items in &mut items {
66
+ field_items.push(AnyValue::Null);
67
+ }
68
+ }
69
+ Some(dict) => {
70
+ let dict = dict.try_convert::<RHash>()?;
71
+ if dict.len() != struct_width {
72
+ return Err(crate::error::ComputeError::new_err(
73
+ format!("Cannot create struct type.\n> The struct dtype expects {} fields, but it got a dict with {} fields.", struct_width, dict.len())
74
+ ));
75
+ }
76
+ // we ignore the keys of the rest of the dicts
77
+ // the first item determines the output name
78
+ todo!()
79
+ // for ((_, val), field_items) in dict.iter().zip(&mut items) {
80
+ // let item = val.try_convert::<Wrap<AnyValue>>()?;
81
+ // field_items.push(item.0)
82
+ // }
83
+ }
84
+ }
85
+ }
86
+
87
+ let fields = POOL.install(|| {
88
+ items
89
+ .par_iter()
90
+ .zip(flds)
91
+ .map(|(av, fld)| Series::new(fld.name(), av))
92
+ .collect::<Vec<_>>()
93
+ });
94
+
95
+ Ok(StructChunked::new(name, &fields)
96
+ .unwrap()
97
+ .into_series()
98
+ .into())
99
+ }
100
+
101
+ fn iterator_to_primitive<T>(
102
+ it: impl Iterator<Item = Option<T::Native>>,
103
+ init_null_count: usize,
104
+ first_value: Option<T::Native>,
105
+ name: &str,
106
+ capacity: usize,
107
+ ) -> ChunkedArray<T>
108
+ where
109
+ T: RbArrowPrimitiveType,
110
+ {
111
+ // safety: we know the iterators len
112
+ let mut ca: ChunkedArray<T> = unsafe {
113
+ if init_null_count > 0 {
114
+ (0..init_null_count)
115
+ .map(|_| None)
116
+ .chain(std::iter::once(first_value))
117
+ .chain(it)
118
+ .trust_my_length(capacity)
119
+ .collect_trusted()
120
+ } else if first_value.is_some() {
121
+ std::iter::once(first_value)
122
+ .chain(it)
123
+ .trust_my_length(capacity)
124
+ .collect_trusted()
125
+ } else {
126
+ it.collect()
127
+ }
128
+ };
129
+ debug_assert_eq!(ca.len(), capacity);
130
+ ca.rename(name);
131
+ ca
132
+ }
133
+
134
+ fn iterator_to_bool(
135
+ it: impl Iterator<Item = Option<bool>>,
136
+ init_null_count: usize,
137
+ first_value: Option<bool>,
138
+ name: &str,
139
+ capacity: usize,
140
+ ) -> ChunkedArray<BooleanType> {
141
+ // safety: we know the iterators len
142
+ let mut ca: BooleanChunked = unsafe {
143
+ if init_null_count > 0 {
144
+ (0..init_null_count)
145
+ .map(|_| None)
146
+ .chain(std::iter::once(first_value))
147
+ .chain(it)
148
+ .trust_my_length(capacity)
149
+ .collect_trusted()
150
+ } else if first_value.is_some() {
151
+ std::iter::once(first_value)
152
+ .chain(it)
153
+ .trust_my_length(capacity)
154
+ .collect_trusted()
155
+ } else {
156
+ it.collect()
157
+ }
158
+ };
159
+ debug_assert_eq!(ca.len(), capacity);
160
+ ca.rename(name);
161
+ ca
162
+ }
163
+
164
+ fn iterator_to_object(
165
+ it: impl Iterator<Item = Option<ObjectValue>>,
166
+ init_null_count: usize,
167
+ first_value: Option<ObjectValue>,
168
+ name: &str,
169
+ capacity: usize,
170
+ ) -> ObjectChunked<ObjectValue> {
171
+ // safety: we know the iterators len
172
+ let mut ca: ObjectChunked<ObjectValue> = unsafe {
173
+ if init_null_count > 0 {
174
+ (0..init_null_count)
175
+ .map(|_| None)
176
+ .chain(std::iter::once(first_value))
177
+ .chain(it)
178
+ .trust_my_length(capacity)
179
+ .collect_trusted()
180
+ } else if first_value.is_some() {
181
+ std::iter::once(first_value)
182
+ .chain(it)
183
+ .trust_my_length(capacity)
184
+ .collect_trusted()
185
+ } else {
186
+ it.collect()
187
+ }
188
+ };
189
+ debug_assert_eq!(ca.len(), capacity);
190
+ ca.rename(name);
191
+ ca
192
+ }
193
+
194
+ fn iterator_to_utf8(
195
+ it: impl Iterator<Item = Option<String>>,
196
+ init_null_count: usize,
197
+ first_value: Option<&str>,
198
+ name: &str,
199
+ capacity: usize,
200
+ ) -> Utf8Chunked {
201
+ let first_value = first_value.map(|v| v.to_string());
202
+
203
+ // safety: we know the iterators len
204
+ let mut ca: Utf8Chunked = unsafe {
205
+ if init_null_count > 0 {
206
+ (0..init_null_count)
207
+ .map(|_| None)
208
+ .chain(std::iter::once(first_value))
209
+ .chain(it)
210
+ .trust_my_length(capacity)
211
+ .collect_trusted()
212
+ } else if first_value.is_some() {
213
+ std::iter::once(first_value)
214
+ .chain(it)
215
+ .trust_my_length(capacity)
216
+ .collect_trusted()
217
+ } else {
218
+ it.collect()
219
+ }
220
+ };
221
+ debug_assert_eq!(ca.len(), capacity);
222
+ ca.rename(name);
223
+ ca
224
+ }
225
+
226
+ fn iterator_to_list(
227
+ dt: &DataType,
228
+ it: impl Iterator<Item = Option<Series>>,
229
+ init_null_count: usize,
230
+ first_value: Option<&Series>,
231
+ name: &str,
232
+ capacity: usize,
233
+ ) -> RbResult<ListChunked> {
234
+ let mut builder =
235
+ get_list_builder(dt, capacity * 5, capacity, name).map_err(RbPolarsErr::from)?;
236
+ for _ in 0..init_null_count {
237
+ builder.append_null()
238
+ }
239
+ builder.append_opt_series(first_value);
240
+ for opt_val in it {
241
+ match opt_val {
242
+ None => builder.append_null(),
243
+ Some(s) => {
244
+ if s.len() == 0 && s.dtype() != dt {
245
+ builder.append_series(&Series::full_null("", 0, dt))
246
+ } else {
247
+ builder.append_series(&s)
248
+ }
249
+ }
250
+ }
251
+ }
252
+ Ok(builder.finish())
253
+ }