polars-df 0.1.4 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6abc9619a425d8aaa0255864b063c41835349063aa4919df133ac5a4ceb972f2
4
- data.tar.gz: 78372a2a9eeddb3a8080b1d615991415b9ef7752752319e250f143841bfa67f3
3
+ metadata.gz: 3a08e866e51227716cd3cb4454835016a7d61e30e964fe76a8b99704dcb60a12
4
+ data.tar.gz: 1f30c3fdd47ebf52a311909aa26ba4b6d64e426622455854b9bbc660de1229b3
5
5
  SHA512:
6
- metadata.gz: e6fb27a50908c07e5f2f72c81171f07bfdf0999b5148421bdeb1ad7dc69cee1f0bae02021fa18fdad6d1740ea9273464daec513db5e3c7906d5839e77b7d6a66
7
- data.tar.gz: 2eb9df841575711a057dd1ca2986403667306ead52cf540491899ffaa184d4878c1bdfc1015e3f5831c12c668de0d4126cbab7c63d1770684e10012f3d28183f
6
+ metadata.gz: 1531fff4fc2fab8b2dc72709a69fb2890c215ae08e4223aa32262dbb4b0debb4b6f2fbab1e8138953871f5d02d462abfaba49cc7f22a66e25aa7d60f128a89bc
7
+ data.tar.gz: e1041d708e2f8046c14c565a65879fa4e5c6671cf526736a3f8418a82dfa70e17692a96d383e43f393d8761e5f29f717d63185ae1ed3f0793a6876be2d946fc0
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
+ ## 0.1.5 (2022-12-22)
2
+
3
+ - Added `read_avro` and `write_avro` methods
4
+ - Added more methods
5
+
1
6
  ## 0.1.4 (2022-12-02)
2
7
 
3
8
  - Added more methods
data/Cargo.lock CHANGED
@@ -8,6 +8,12 @@ version = "1.0.2"
8
8
  source = "registry+https://github.com/rust-lang/crates.io-index"
9
9
  checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
10
10
 
11
+ [[package]]
12
+ name = "adler32"
13
+ version = "1.2.0"
14
+ source = "registry+https://github.com/rust-lang/crates.io-index"
15
+ checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234"
16
+
11
17
  [[package]]
12
18
  name = "ahash"
13
19
  version = "0.7.6"
@@ -85,6 +91,7 @@ checksum = "ee6f62e41078c967a4c063fcbdfd3801a2a9632276402c045311c4d73d0845f3"
85
91
  dependencies = [
86
92
  "ahash 0.7.6",
87
93
  "arrow-format",
94
+ "avro-schema",
88
95
  "base64",
89
96
  "bytemuck",
90
97
  "chrono",
@@ -147,6 +154,20 @@ version = "1.1.0"
147
154
  source = "registry+https://github.com/rust-lang/crates.io-index"
148
155
  checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
149
156
 
157
+ [[package]]
158
+ name = "avro-schema"
159
+ version = "0.3.0"
160
+ source = "registry+https://github.com/rust-lang/crates.io-index"
161
+ checksum = "b5281855b39aba9684d2f47bf96983fbfd8f1725f12fabb0513a8ab879647bbd"
162
+ dependencies = [
163
+ "crc",
164
+ "fallible-streaming-iterator",
165
+ "libflate",
166
+ "serde",
167
+ "serde_json",
168
+ "snap",
169
+ ]
170
+
150
171
  [[package]]
151
172
  name = "base64"
152
173
  version = "0.13.1"
@@ -314,6 +335,21 @@ dependencies = [
314
335
  "unicode-width",
315
336
  ]
316
337
 
338
+ [[package]]
339
+ name = "crc"
340
+ version = "2.1.0"
341
+ source = "registry+https://github.com/rust-lang/crates.io-index"
342
+ checksum = "49fc9a695bca7f35f5f4c15cddc84415f66a74ea78eef08e90c5024f2b540e23"
343
+ dependencies = [
344
+ "crc-catalog",
345
+ ]
346
+
347
+ [[package]]
348
+ name = "crc-catalog"
349
+ version = "1.1.1"
350
+ source = "registry+https://github.com/rust-lang/crates.io-index"
351
+ checksum = "ccaeedb56da03b09f598226e25e80088cb4cd25f316e6e4df7d695f0feeb1403"
352
+
317
353
  [[package]]
318
354
  name = "crc32fast"
319
355
  version = "1.3.2"
@@ -824,6 +860,26 @@ version = "0.2.121"
824
860
  source = "registry+https://github.com/rust-lang/crates.io-index"
825
861
  checksum = "efaa7b300f3b5fe8eb6bf21ce3895e1751d9665086af2d64b42f19701015ff4f"
826
862
 
863
+ [[package]]
864
+ name = "libflate"
865
+ version = "1.2.0"
866
+ source = "registry+https://github.com/rust-lang/crates.io-index"
867
+ checksum = "05605ab2bce11bcfc0e9c635ff29ef8b2ea83f29be257ee7d730cac3ee373093"
868
+ dependencies = [
869
+ "adler32",
870
+ "crc32fast",
871
+ "libflate_lz77",
872
+ ]
873
+
874
+ [[package]]
875
+ name = "libflate_lz77"
876
+ version = "1.1.0"
877
+ source = "registry+https://github.com/rust-lang/crates.io-index"
878
+ checksum = "39a734c0493409afcd49deee13c006a04e3586b9761a03543c6272c9c51f2f5a"
879
+ dependencies = [
880
+ "rle-decode-fast",
881
+ ]
882
+
827
883
  [[package]]
828
884
  name = "libloading"
829
885
  version = "0.7.4"
@@ -903,8 +959,7 @@ dependencies = [
903
959
  [[package]]
904
960
  name = "magnus-macros"
905
961
  version = "0.2.0"
906
- source = "registry+https://github.com/rust-lang/crates.io-index"
907
- checksum = "acc8ba6908cb0f67a4e75cb48fc81a1f0e6a6dd1501936e0c9e2c7c8f9f18e05"
962
+ source = "git+https://github.com/matsadler/magnus#ae792419bed70107d4c930e1f8193272750b9fd2"
908
963
  dependencies = [
909
964
  "proc-macro2",
910
965
  "quote",
@@ -1222,8 +1277,9 @@ dependencies = [
1222
1277
 
1223
1278
  [[package]]
1224
1279
  name = "polars"
1225
- version = "0.1.4"
1280
+ version = "0.1.5"
1226
1281
  dependencies = [
1282
+ "ahash 0.8.2",
1227
1283
  "jemallocator",
1228
1284
  "magnus",
1229
1285
  "mimalloc",
@@ -1500,18 +1556,18 @@ dependencies = [
1500
1556
 
1501
1557
  [[package]]
1502
1558
  name = "rb-sys"
1503
- version = "0.9.44"
1559
+ version = "0.9.48"
1504
1560
  source = "registry+https://github.com/rust-lang/crates.io-index"
1505
- checksum = "31f48777b8161ff5c077ad74ce486ebe963ca8a92257512bab473b405a80d69f"
1561
+ checksum = "dfc6b8f3bf2d04b0180e243ceeb033b51ca267d839aa1c12fa25f262c17d0596"
1506
1562
  dependencies = [
1507
1563
  "rb-sys-build",
1508
1564
  ]
1509
1565
 
1510
1566
  [[package]]
1511
1567
  name = "rb-sys-build"
1512
- version = "0.9.44"
1568
+ version = "0.9.48"
1513
1569
  source = "registry+https://github.com/rust-lang/crates.io-index"
1514
- checksum = "a46785122aff7077527b78c2518d739c45dc0fbc410a2b8361076ff4bbf993f9"
1570
+ checksum = "2cd591ebf22c45a44e51192fbeebba473aea0fe2a708b0b24665a13010c58b8d"
1515
1571
  dependencies = [
1516
1572
  "bindgen",
1517
1573
  "regex",
@@ -1521,8 +1577,7 @@ dependencies = [
1521
1577
  [[package]]
1522
1578
  name = "rb-sys-env"
1523
1579
  version = "0.1.1"
1524
- source = "registry+https://github.com/rust-lang/crates.io-index"
1525
- checksum = "74c38752410925faeb82c400c06ba2fd9ee6aa8f719dd33994c9e53f5242d25f"
1580
+ source = "git+https://github.com/oxidize-rb/rb-sys#93c4f97a244168b9ebc2c5682275e7281421f4b8"
1526
1581
 
1527
1582
  [[package]]
1528
1583
  name = "redox_syscall"
@@ -1561,6 +1616,12 @@ version = "0.6.28"
1561
1616
  source = "registry+https://github.com/rust-lang/crates.io-index"
1562
1617
  checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848"
1563
1618
 
1619
+ [[package]]
1620
+ name = "rle-decode-fast"
1621
+ version = "1.0.3"
1622
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1623
+ checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422"
1624
+
1564
1625
  [[package]]
1565
1626
  name = "rustc-hash"
1566
1627
  version = "1.1.0"
data/Cargo.toml CHANGED
@@ -3,6 +3,8 @@ members = ["ext/polars"]
3
3
 
4
4
  [patch.crates-io]
5
5
  jsonpath_lib = { git = "https://github.com/ritchie46/jsonpath", rev = "24eaf0b4416edff38a4d1b6b17bc4b9f3f047b4b" }
6
+ magnus-macros = { git = "https://github.com/matsadler/magnus" }
7
+ rb-sys-env = { git = "https://github.com/oxidize-rb/rb-sys" }
6
8
 
7
9
  [profile.release]
8
10
  strip = true
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "polars"
3
- version = "0.1.4"
3
+ version = "0.1.5"
4
4
  authors = ["Andrew Kane <andrew@ankane.org>"]
5
5
  edition = "2021"
6
6
  publish = false
@@ -9,6 +9,7 @@ publish = false
9
9
  crate-type = ["cdylib"]
10
10
 
11
11
  [dependencies]
12
+ ahash = "0.8"
12
13
  magnus = "0.4"
13
14
  polars-core = "0.25.1"
14
15
  serde_json = "1"
@@ -19,6 +20,8 @@ features = [
19
20
  "abs",
20
21
  "arange",
21
22
  "arg_where",
23
+ "asof_join",
24
+ "avro",
22
25
  "concat_str",
23
26
  "cse",
24
27
  "csv-file",
@@ -53,6 +56,7 @@ features = [
53
56
  "partition_by",
54
57
  "pct_change",
55
58
  "performant",
59
+ "pivot",
56
60
  "product",
57
61
  "propagate_nans",
58
62
  "random",
@@ -61,6 +65,7 @@ features = [
61
65
  "repeat_by",
62
66
  "rolling_window",
63
67
  "round_series",
68
+ "row_hash",
64
69
  "search_sorted",
65
70
  "semi_anti_join",
66
71
  "serde-lazy",
@@ -0,0 +1,292 @@
1
+ use magnus::{class, RArray, TryConvert, Value};
2
+ use polars::prelude::*;
3
+ use polars_core::frame::row::{rows_to_schema_first_non_null, Row};
4
+
5
+ use super::*;
6
+ use crate::{RbDataFrame, RbPolarsErr, RbSeries, Wrap};
7
+
8
+ pub fn apply_lambda_unknown<'a>(
9
+ df: &'a DataFrame,
10
+ lambda: Value,
11
+ inference_size: usize,
12
+ ) -> RbResult<(Value, bool)> {
13
+ let columns = df.get_columns();
14
+ let mut null_count = 0;
15
+
16
+ for idx in 0..df.height() {
17
+ let iter = columns.iter().map(|s: &Series| Wrap(s.get(idx)));
18
+ let arg = (iter.collect::<Vec<Wrap<AnyValue>>>(),);
19
+ let out: Value = lambda.funcall("call", arg)?;
20
+
21
+ if out.is_nil() {
22
+ null_count += 1;
23
+ continue;
24
+ } else if out.is_kind_of(class::true_class()) || out.is_kind_of(class::false_class()) {
25
+ let first_value = out.try_convert::<bool>().ok();
26
+ return Ok((
27
+ RbSeries::new(
28
+ apply_lambda_with_bool_out_type(df, lambda, null_count, first_value)
29
+ .into_series(),
30
+ )
31
+ .into(),
32
+ false,
33
+ ));
34
+ } else if out.is_kind_of(class::float()) {
35
+ let first_value = out.try_convert::<f64>().ok();
36
+
37
+ return Ok((
38
+ RbSeries::new(
39
+ apply_lambda_with_primitive_out_type::<Float64Type>(
40
+ df,
41
+ lambda,
42
+ null_count,
43
+ first_value,
44
+ )
45
+ .into_series(),
46
+ )
47
+ .into(),
48
+ false,
49
+ ));
50
+ } else if out.is_kind_of(class::integer()) {
51
+ let first_value = out.try_convert::<i64>().ok();
52
+ return Ok((
53
+ RbSeries::new(
54
+ apply_lambda_with_primitive_out_type::<Int64Type>(
55
+ df,
56
+ lambda,
57
+ null_count,
58
+ first_value,
59
+ )
60
+ .into_series(),
61
+ )
62
+ .into(),
63
+ false,
64
+ ));
65
+ // } else if out.is_kind_of(class::string()) {
66
+ // let first_value = out.try_convert::<String>().ok();
67
+ // return Ok((
68
+ // RbSeries::new(
69
+ // apply_lambda_with_utf8_out_type(df, lambda, null_count, first_value)
70
+ // .into_series(),
71
+ // )
72
+ // .into(),
73
+ // false,
74
+ // ));
75
+ } else if out.respond_to("_s", true)? {
76
+ let rb_rbseries: Value = out.funcall("_s", ()).unwrap();
77
+ let series = rb_rbseries
78
+ .try_convert::<&RbSeries>()
79
+ .unwrap()
80
+ .series
81
+ .borrow();
82
+ let dt = series.dtype();
83
+ return Ok((
84
+ RbSeries::new(
85
+ apply_lambda_with_list_out_type(df, lambda, null_count, Some(&series), dt)?
86
+ .into_series(),
87
+ )
88
+ .into(),
89
+ false,
90
+ ));
91
+ } else if out.try_convert::<Wrap<Row<'a>>>().is_ok() {
92
+ let first_value = out.try_convert::<Wrap<Row<'a>>>().unwrap().0;
93
+ return Ok((
94
+ RbDataFrame::from(
95
+ apply_lambda_with_rows_output(
96
+ df,
97
+ lambda,
98
+ null_count,
99
+ first_value,
100
+ inference_size,
101
+ )
102
+ .map_err(RbPolarsErr::from)?,
103
+ )
104
+ .into(),
105
+ true,
106
+ ));
107
+ } else if out.is_kind_of(class::array()) {
108
+ return Err(RbPolarsErr::other(
109
+ "A list output type is invalid. Do you mean to create polars List Series?\
110
+ Then return a Series object."
111
+ .into(),
112
+ ));
113
+ } else {
114
+ return Err(RbPolarsErr::other("Could not determine output type".into()));
115
+ }
116
+ }
117
+ Err(RbPolarsErr::other("Could not determine output type".into()))
118
+ }
119
+
120
+ fn apply_iter<T>(
121
+ df: &DataFrame,
122
+ lambda: Value,
123
+ init_null_count: usize,
124
+ skip: usize,
125
+ ) -> impl Iterator<Item = Option<T>> + '_
126
+ where
127
+ T: TryConvert,
128
+ {
129
+ let columns = df.get_columns();
130
+ ((init_null_count + skip)..df.height()).map(move |idx| {
131
+ let iter = columns.iter().map(|s: &Series| Wrap(s.get(idx)));
132
+ let tpl = (iter.collect::<Vec<Wrap<AnyValue>>>(),);
133
+ match lambda.funcall::<_, _, Value>("call", tpl) {
134
+ Ok(val) => val.try_convert::<T>().ok(),
135
+ Err(e) => panic!("ruby function failed {}", e),
136
+ }
137
+ })
138
+ }
139
+
140
+ /// Apply a lambda with a primitive output type
141
+ pub fn apply_lambda_with_primitive_out_type<D>(
142
+ df: &DataFrame,
143
+ lambda: Value,
144
+ init_null_count: usize,
145
+ first_value: Option<D::Native>,
146
+ ) -> ChunkedArray<D>
147
+ where
148
+ D: RbArrowPrimitiveType,
149
+ D::Native: Into<Value> + TryConvert,
150
+ {
151
+ let skip = usize::from(first_value.is_some());
152
+ if init_null_count == df.height() {
153
+ ChunkedArray::full_null("apply", df.height())
154
+ } else {
155
+ let iter = apply_iter(df, lambda, init_null_count, skip);
156
+ iterator_to_primitive(iter, init_null_count, first_value, "apply", df.height())
157
+ }
158
+ }
159
+
160
+ /// Apply a lambda with a boolean output type
161
+ pub fn apply_lambda_with_bool_out_type(
162
+ df: &DataFrame,
163
+ lambda: Value,
164
+ init_null_count: usize,
165
+ first_value: Option<bool>,
166
+ ) -> ChunkedArray<BooleanType> {
167
+ let skip = usize::from(first_value.is_some());
168
+ if init_null_count == df.height() {
169
+ ChunkedArray::full_null("apply", df.height())
170
+ } else {
171
+ let iter = apply_iter(df, lambda, init_null_count, skip);
172
+ iterator_to_bool(iter, init_null_count, first_value, "apply", df.height())
173
+ }
174
+ }
175
+
176
+ /// Apply a lambda with utf8 output type
177
+ pub fn apply_lambda_with_utf8_out_type(
178
+ df: &DataFrame,
179
+ lambda: Value,
180
+ init_null_count: usize,
181
+ first_value: Option<&str>,
182
+ ) -> Utf8Chunked {
183
+ let skip = usize::from(first_value.is_some());
184
+ if init_null_count == df.height() {
185
+ ChunkedArray::full_null("apply", df.height())
186
+ } else {
187
+ let iter = apply_iter::<String>(df, lambda, init_null_count, skip);
188
+ iterator_to_utf8(iter, init_null_count, first_value, "apply", df.height())
189
+ }
190
+ }
191
+
192
+ /// Apply a lambda with list output type
193
+ pub fn apply_lambda_with_list_out_type<'a>(
194
+ df: &'a DataFrame,
195
+ lambda: Value,
196
+ init_null_count: usize,
197
+ first_value: Option<&Series>,
198
+ dt: &DataType,
199
+ ) -> RbResult<ListChunked> {
200
+ let columns = df.get_columns();
201
+
202
+ let skip = usize::from(first_value.is_some());
203
+ if init_null_count == df.height() {
204
+ Ok(ChunkedArray::full_null("apply", df.height()))
205
+ } else {
206
+ let iter = ((init_null_count + skip)..df.height()).map(|idx| {
207
+ let iter = columns.iter().map(|s: &Series| Wrap(s.get(idx)));
208
+ let tpl = (iter.collect::<Vec<Wrap<AnyValue>>>(),);
209
+ match lambda.funcall::<_, _, Value>("call", tpl) {
210
+ Ok(val) => match val.funcall::<_, _, Value>("_s", ()) {
211
+ Ok(val) => val
212
+ .try_convert::<&RbSeries>()
213
+ .ok()
214
+ .map(|ps| ps.series.borrow().clone()),
215
+ Err(_) => {
216
+ if val.is_nil() {
217
+ None
218
+ } else {
219
+ panic!("should return a Series, got a {:?}", val)
220
+ }
221
+ }
222
+ },
223
+ Err(e) => panic!("ruby function failed {}", e),
224
+ }
225
+ });
226
+ iterator_to_list(dt, iter, init_null_count, first_value, "apply", df.height())
227
+ }
228
+ }
229
+
230
+ pub fn apply_lambda_with_rows_output<'a>(
231
+ df: &'a DataFrame,
232
+ lambda: Value,
233
+ init_null_count: usize,
234
+ first_value: Row<'a>,
235
+ inference_size: usize,
236
+ ) -> PolarsResult<DataFrame> {
237
+ let columns = df.get_columns();
238
+ let width = first_value.0.len();
239
+ let null_row = Row::new(vec![AnyValue::Null; width]);
240
+
241
+ let mut row_buf = Row::default();
242
+
243
+ let skip = 1;
244
+ let mut row_iter = ((init_null_count + skip)..df.height()).map(|idx| {
245
+ let iter = columns.iter().map(|s: &Series| Wrap(s.get(idx)));
246
+ let tpl = (iter.collect::<Vec<Wrap<AnyValue>>>(),);
247
+ match lambda.funcall::<_, _, Value>("call", tpl) {
248
+ Ok(val) => {
249
+ match val.try_convert::<RArray>().ok() {
250
+ Some(tuple) => {
251
+ row_buf.0.clear();
252
+ for v in tuple.each() {
253
+ let v = v.unwrap().try_convert::<Wrap<AnyValue>>().unwrap().0;
254
+ row_buf.0.push(v);
255
+ }
256
+ let ptr = &row_buf as *const Row;
257
+ // Safety:
258
+ // we know that row constructor of polars dataframe does not keep a reference
259
+ // to the row. Before we mutate the row buf again, the reference is dropped.
260
+ // we only cannot prove it to the compiler.
261
+ // we still do this because it saves a Vec allocation in a hot loop.
262
+ unsafe { &*ptr }
263
+ }
264
+ None => &null_row,
265
+ }
266
+ }
267
+ Err(e) => panic!("ruby function failed {}", e),
268
+ }
269
+ });
270
+
271
+ // first rows for schema inference
272
+ let mut buf = Vec::with_capacity(inference_size);
273
+ buf.push(first_value);
274
+ buf.extend((&mut row_iter).take(inference_size).cloned());
275
+ let schema = rows_to_schema_first_non_null(&buf, Some(50));
276
+
277
+ if init_null_count > 0 {
278
+ // Safety: we know the iterators size
279
+ let iter = unsafe {
280
+ (0..init_null_count)
281
+ .map(|_| &null_row)
282
+ .chain(buf.iter())
283
+ .chain(row_iter)
284
+ .trust_my_length(df.height())
285
+ };
286
+ DataFrame::from_rows_iter_and_schema(iter, &schema)
287
+ } else {
288
+ // Safety: we know the iterators size
289
+ let iter = unsafe { buf.iter().chain(row_iter).trust_my_length(df.height()) };
290
+ DataFrame::from_rows_iter_and_schema(iter, &schema)
291
+ }
292
+ }