polars-df 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6abc9619a425d8aaa0255864b063c41835349063aa4919df133ac5a4ceb972f2
4
- data.tar.gz: 78372a2a9eeddb3a8080b1d615991415b9ef7752752319e250f143841bfa67f3
3
+ metadata.gz: 3a08e866e51227716cd3cb4454835016a7d61e30e964fe76a8b99704dcb60a12
4
+ data.tar.gz: 1f30c3fdd47ebf52a311909aa26ba4b6d64e426622455854b9bbc660de1229b3
5
5
  SHA512:
6
- metadata.gz: e6fb27a50908c07e5f2f72c81171f07bfdf0999b5148421bdeb1ad7dc69cee1f0bae02021fa18fdad6d1740ea9273464daec513db5e3c7906d5839e77b7d6a66
7
- data.tar.gz: 2eb9df841575711a057dd1ca2986403667306ead52cf540491899ffaa184d4878c1bdfc1015e3f5831c12c668de0d4126cbab7c63d1770684e10012f3d28183f
6
+ metadata.gz: 1531fff4fc2fab8b2dc72709a69fb2890c215ae08e4223aa32262dbb4b0debb4b6f2fbab1e8138953871f5d02d462abfaba49cc7f22a66e25aa7d60f128a89bc
7
+ data.tar.gz: e1041d708e2f8046c14c565a65879fa4e5c6671cf526736a3f8418a82dfa70e17692a96d383e43f393d8761e5f29f717d63185ae1ed3f0793a6876be2d946fc0
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
+ ## 0.1.5 (2022-12-22)
2
+
3
+ - Added `read_avro` and `write_avro` methods
4
+ - Added more methods
5
+
1
6
  ## 0.1.4 (2022-12-02)
2
7
 
3
8
  - Added more methods
data/Cargo.lock CHANGED
@@ -8,6 +8,12 @@ version = "1.0.2"
8
8
  source = "registry+https://github.com/rust-lang/crates.io-index"
9
9
  checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
10
10
 
11
+ [[package]]
12
+ name = "adler32"
13
+ version = "1.2.0"
14
+ source = "registry+https://github.com/rust-lang/crates.io-index"
15
+ checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234"
16
+
11
17
  [[package]]
12
18
  name = "ahash"
13
19
  version = "0.7.6"
@@ -85,6 +91,7 @@ checksum = "ee6f62e41078c967a4c063fcbdfd3801a2a9632276402c045311c4d73d0845f3"
85
91
  dependencies = [
86
92
  "ahash 0.7.6",
87
93
  "arrow-format",
94
+ "avro-schema",
88
95
  "base64",
89
96
  "bytemuck",
90
97
  "chrono",
@@ -147,6 +154,20 @@ version = "1.1.0"
147
154
  source = "registry+https://github.com/rust-lang/crates.io-index"
148
155
  checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
149
156
 
157
+ [[package]]
158
+ name = "avro-schema"
159
+ version = "0.3.0"
160
+ source = "registry+https://github.com/rust-lang/crates.io-index"
161
+ checksum = "b5281855b39aba9684d2f47bf96983fbfd8f1725f12fabb0513a8ab879647bbd"
162
+ dependencies = [
163
+ "crc",
164
+ "fallible-streaming-iterator",
165
+ "libflate",
166
+ "serde",
167
+ "serde_json",
168
+ "snap",
169
+ ]
170
+
150
171
  [[package]]
151
172
  name = "base64"
152
173
  version = "0.13.1"
@@ -314,6 +335,21 @@ dependencies = [
314
335
  "unicode-width",
315
336
  ]
316
337
 
338
+ [[package]]
339
+ name = "crc"
340
+ version = "2.1.0"
341
+ source = "registry+https://github.com/rust-lang/crates.io-index"
342
+ checksum = "49fc9a695bca7f35f5f4c15cddc84415f66a74ea78eef08e90c5024f2b540e23"
343
+ dependencies = [
344
+ "crc-catalog",
345
+ ]
346
+
347
+ [[package]]
348
+ name = "crc-catalog"
349
+ version = "1.1.1"
350
+ source = "registry+https://github.com/rust-lang/crates.io-index"
351
+ checksum = "ccaeedb56da03b09f598226e25e80088cb4cd25f316e6e4df7d695f0feeb1403"
352
+
317
353
  [[package]]
318
354
  name = "crc32fast"
319
355
  version = "1.3.2"
@@ -824,6 +860,26 @@ version = "0.2.121"
824
860
  source = "registry+https://github.com/rust-lang/crates.io-index"
825
861
  checksum = "efaa7b300f3b5fe8eb6bf21ce3895e1751d9665086af2d64b42f19701015ff4f"
826
862
 
863
+ [[package]]
864
+ name = "libflate"
865
+ version = "1.2.0"
866
+ source = "registry+https://github.com/rust-lang/crates.io-index"
867
+ checksum = "05605ab2bce11bcfc0e9c635ff29ef8b2ea83f29be257ee7d730cac3ee373093"
868
+ dependencies = [
869
+ "adler32",
870
+ "crc32fast",
871
+ "libflate_lz77",
872
+ ]
873
+
874
+ [[package]]
875
+ name = "libflate_lz77"
876
+ version = "1.1.0"
877
+ source = "registry+https://github.com/rust-lang/crates.io-index"
878
+ checksum = "39a734c0493409afcd49deee13c006a04e3586b9761a03543c6272c9c51f2f5a"
879
+ dependencies = [
880
+ "rle-decode-fast",
881
+ ]
882
+
827
883
  [[package]]
828
884
  name = "libloading"
829
885
  version = "0.7.4"
@@ -903,8 +959,7 @@ dependencies = [
903
959
  [[package]]
904
960
  name = "magnus-macros"
905
961
  version = "0.2.0"
906
- source = "registry+https://github.com/rust-lang/crates.io-index"
907
- checksum = "acc8ba6908cb0f67a4e75cb48fc81a1f0e6a6dd1501936e0c9e2c7c8f9f18e05"
962
+ source = "git+https://github.com/matsadler/magnus#ae792419bed70107d4c930e1f8193272750b9fd2"
908
963
  dependencies = [
909
964
  "proc-macro2",
910
965
  "quote",
@@ -1222,8 +1277,9 @@ dependencies = [
1222
1277
 
1223
1278
  [[package]]
1224
1279
  name = "polars"
1225
- version = "0.1.4"
1280
+ version = "0.1.5"
1226
1281
  dependencies = [
1282
+ "ahash 0.8.2",
1227
1283
  "jemallocator",
1228
1284
  "magnus",
1229
1285
  "mimalloc",
@@ -1500,18 +1556,18 @@ dependencies = [
1500
1556
 
1501
1557
  [[package]]
1502
1558
  name = "rb-sys"
1503
- version = "0.9.44"
1559
+ version = "0.9.48"
1504
1560
  source = "registry+https://github.com/rust-lang/crates.io-index"
1505
- checksum = "31f48777b8161ff5c077ad74ce486ebe963ca8a92257512bab473b405a80d69f"
1561
+ checksum = "dfc6b8f3bf2d04b0180e243ceeb033b51ca267d839aa1c12fa25f262c17d0596"
1506
1562
  dependencies = [
1507
1563
  "rb-sys-build",
1508
1564
  ]
1509
1565
 
1510
1566
  [[package]]
1511
1567
  name = "rb-sys-build"
1512
- version = "0.9.44"
1568
+ version = "0.9.48"
1513
1569
  source = "registry+https://github.com/rust-lang/crates.io-index"
1514
- checksum = "a46785122aff7077527b78c2518d739c45dc0fbc410a2b8361076ff4bbf993f9"
1570
+ checksum = "2cd591ebf22c45a44e51192fbeebba473aea0fe2a708b0b24665a13010c58b8d"
1515
1571
  dependencies = [
1516
1572
  "bindgen",
1517
1573
  "regex",
@@ -1521,8 +1577,7 @@ dependencies = [
1521
1577
  [[package]]
1522
1578
  name = "rb-sys-env"
1523
1579
  version = "0.1.1"
1524
- source = "registry+https://github.com/rust-lang/crates.io-index"
1525
- checksum = "74c38752410925faeb82c400c06ba2fd9ee6aa8f719dd33994c9e53f5242d25f"
1580
+ source = "git+https://github.com/oxidize-rb/rb-sys#93c4f97a244168b9ebc2c5682275e7281421f4b8"
1526
1581
 
1527
1582
  [[package]]
1528
1583
  name = "redox_syscall"
@@ -1561,6 +1616,12 @@ version = "0.6.28"
1561
1616
  source = "registry+https://github.com/rust-lang/crates.io-index"
1562
1617
  checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848"
1563
1618
 
1619
+ [[package]]
1620
+ name = "rle-decode-fast"
1621
+ version = "1.0.3"
1622
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1623
+ checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422"
1624
+
1564
1625
  [[package]]
1565
1626
  name = "rustc-hash"
1566
1627
  version = "1.1.0"
data/Cargo.toml CHANGED
@@ -3,6 +3,8 @@ members = ["ext/polars"]
3
3
 
4
4
  [patch.crates-io]
5
5
  jsonpath_lib = { git = "https://github.com/ritchie46/jsonpath", rev = "24eaf0b4416edff38a4d1b6b17bc4b9f3f047b4b" }
6
+ magnus-macros = { git = "https://github.com/matsadler/magnus" }
7
+ rb-sys-env = { git = "https://github.com/oxidize-rb/rb-sys" }
6
8
 
7
9
  [profile.release]
8
10
  strip = true
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "polars"
3
- version = "0.1.4"
3
+ version = "0.1.5"
4
4
  authors = ["Andrew Kane <andrew@ankane.org>"]
5
5
  edition = "2021"
6
6
  publish = false
@@ -9,6 +9,7 @@ publish = false
9
9
  crate-type = ["cdylib"]
10
10
 
11
11
  [dependencies]
12
+ ahash = "0.8"
12
13
  magnus = "0.4"
13
14
  polars-core = "0.25.1"
14
15
  serde_json = "1"
@@ -19,6 +20,8 @@ features = [
19
20
  "abs",
20
21
  "arange",
21
22
  "arg_where",
23
+ "asof_join",
24
+ "avro",
22
25
  "concat_str",
23
26
  "cse",
24
27
  "csv-file",
@@ -53,6 +56,7 @@ features = [
53
56
  "partition_by",
54
57
  "pct_change",
55
58
  "performant",
59
+ "pivot",
56
60
  "product",
57
61
  "propagate_nans",
58
62
  "random",
@@ -61,6 +65,7 @@ features = [
61
65
  "repeat_by",
62
66
  "rolling_window",
63
67
  "round_series",
68
+ "row_hash",
64
69
  "search_sorted",
65
70
  "semi_anti_join",
66
71
  "serde-lazy",
@@ -0,0 +1,292 @@
1
+ use magnus::{class, RArray, TryConvert, Value};
2
+ use polars::prelude::*;
3
+ use polars_core::frame::row::{rows_to_schema_first_non_null, Row};
4
+
5
+ use super::*;
6
+ use crate::{RbDataFrame, RbPolarsErr, RbSeries, Wrap};
7
+
8
+ pub fn apply_lambda_unknown<'a>(
9
+ df: &'a DataFrame,
10
+ lambda: Value,
11
+ inference_size: usize,
12
+ ) -> RbResult<(Value, bool)> {
13
+ let columns = df.get_columns();
14
+ let mut null_count = 0;
15
+
16
+ for idx in 0..df.height() {
17
+ let iter = columns.iter().map(|s: &Series| Wrap(s.get(idx)));
18
+ let arg = (iter.collect::<Vec<Wrap<AnyValue>>>(),);
19
+ let out: Value = lambda.funcall("call", arg)?;
20
+
21
+ if out.is_nil() {
22
+ null_count += 1;
23
+ continue;
24
+ } else if out.is_kind_of(class::true_class()) || out.is_kind_of(class::false_class()) {
25
+ let first_value = out.try_convert::<bool>().ok();
26
+ return Ok((
27
+ RbSeries::new(
28
+ apply_lambda_with_bool_out_type(df, lambda, null_count, first_value)
29
+ .into_series(),
30
+ )
31
+ .into(),
32
+ false,
33
+ ));
34
+ } else if out.is_kind_of(class::float()) {
35
+ let first_value = out.try_convert::<f64>().ok();
36
+
37
+ return Ok((
38
+ RbSeries::new(
39
+ apply_lambda_with_primitive_out_type::<Float64Type>(
40
+ df,
41
+ lambda,
42
+ null_count,
43
+ first_value,
44
+ )
45
+ .into_series(),
46
+ )
47
+ .into(),
48
+ false,
49
+ ));
50
+ } else if out.is_kind_of(class::integer()) {
51
+ let first_value = out.try_convert::<i64>().ok();
52
+ return Ok((
53
+ RbSeries::new(
54
+ apply_lambda_with_primitive_out_type::<Int64Type>(
55
+ df,
56
+ lambda,
57
+ null_count,
58
+ first_value,
59
+ )
60
+ .into_series(),
61
+ )
62
+ .into(),
63
+ false,
64
+ ));
65
+ // } else if out.is_kind_of(class::string()) {
66
+ // let first_value = out.try_convert::<String>().ok();
67
+ // return Ok((
68
+ // RbSeries::new(
69
+ // apply_lambda_with_utf8_out_type(df, lambda, null_count, first_value)
70
+ // .into_series(),
71
+ // )
72
+ // .into(),
73
+ // false,
74
+ // ));
75
+ } else if out.respond_to("_s", true)? {
76
+ let rb_rbseries: Value = out.funcall("_s", ()).unwrap();
77
+ let series = rb_rbseries
78
+ .try_convert::<&RbSeries>()
79
+ .unwrap()
80
+ .series
81
+ .borrow();
82
+ let dt = series.dtype();
83
+ return Ok((
84
+ RbSeries::new(
85
+ apply_lambda_with_list_out_type(df, lambda, null_count, Some(&series), dt)?
86
+ .into_series(),
87
+ )
88
+ .into(),
89
+ false,
90
+ ));
91
+ } else if out.try_convert::<Wrap<Row<'a>>>().is_ok() {
92
+ let first_value = out.try_convert::<Wrap<Row<'a>>>().unwrap().0;
93
+ return Ok((
94
+ RbDataFrame::from(
95
+ apply_lambda_with_rows_output(
96
+ df,
97
+ lambda,
98
+ null_count,
99
+ first_value,
100
+ inference_size,
101
+ )
102
+ .map_err(RbPolarsErr::from)?,
103
+ )
104
+ .into(),
105
+ true,
106
+ ));
107
+ } else if out.is_kind_of(class::array()) {
108
+ return Err(RbPolarsErr::other(
109
+ "A list output type is invalid. Do you mean to create polars List Series?\
110
+ Then return a Series object."
111
+ .into(),
112
+ ));
113
+ } else {
114
+ return Err(RbPolarsErr::other("Could not determine output type".into()));
115
+ }
116
+ }
117
+ Err(RbPolarsErr::other("Could not determine output type".into()))
118
+ }
119
+
120
+ fn apply_iter<T>(
121
+ df: &DataFrame,
122
+ lambda: Value,
123
+ init_null_count: usize,
124
+ skip: usize,
125
+ ) -> impl Iterator<Item = Option<T>> + '_
126
+ where
127
+ T: TryConvert,
128
+ {
129
+ let columns = df.get_columns();
130
+ ((init_null_count + skip)..df.height()).map(move |idx| {
131
+ let iter = columns.iter().map(|s: &Series| Wrap(s.get(idx)));
132
+ let tpl = (iter.collect::<Vec<Wrap<AnyValue>>>(),);
133
+ match lambda.funcall::<_, _, Value>("call", tpl) {
134
+ Ok(val) => val.try_convert::<T>().ok(),
135
+ Err(e) => panic!("ruby function failed {}", e),
136
+ }
137
+ })
138
+ }
139
+
140
+ /// Apply a lambda with a primitive output type
141
+ pub fn apply_lambda_with_primitive_out_type<D>(
142
+ df: &DataFrame,
143
+ lambda: Value,
144
+ init_null_count: usize,
145
+ first_value: Option<D::Native>,
146
+ ) -> ChunkedArray<D>
147
+ where
148
+ D: RbArrowPrimitiveType,
149
+ D::Native: Into<Value> + TryConvert,
150
+ {
151
+ let skip = usize::from(first_value.is_some());
152
+ if init_null_count == df.height() {
153
+ ChunkedArray::full_null("apply", df.height())
154
+ } else {
155
+ let iter = apply_iter(df, lambda, init_null_count, skip);
156
+ iterator_to_primitive(iter, init_null_count, first_value, "apply", df.height())
157
+ }
158
+ }
159
+
160
+ /// Apply a lambda with a boolean output type
161
+ pub fn apply_lambda_with_bool_out_type(
162
+ df: &DataFrame,
163
+ lambda: Value,
164
+ init_null_count: usize,
165
+ first_value: Option<bool>,
166
+ ) -> ChunkedArray<BooleanType> {
167
+ let skip = usize::from(first_value.is_some());
168
+ if init_null_count == df.height() {
169
+ ChunkedArray::full_null("apply", df.height())
170
+ } else {
171
+ let iter = apply_iter(df, lambda, init_null_count, skip);
172
+ iterator_to_bool(iter, init_null_count, first_value, "apply", df.height())
173
+ }
174
+ }
175
+
176
+ /// Apply a lambda with utf8 output type
177
+ pub fn apply_lambda_with_utf8_out_type(
178
+ df: &DataFrame,
179
+ lambda: Value,
180
+ init_null_count: usize,
181
+ first_value: Option<&str>,
182
+ ) -> Utf8Chunked {
183
+ let skip = usize::from(first_value.is_some());
184
+ if init_null_count == df.height() {
185
+ ChunkedArray::full_null("apply", df.height())
186
+ } else {
187
+ let iter = apply_iter::<String>(df, lambda, init_null_count, skip);
188
+ iterator_to_utf8(iter, init_null_count, first_value, "apply", df.height())
189
+ }
190
+ }
191
+
192
+ /// Apply a lambda with list output type
193
+ pub fn apply_lambda_with_list_out_type<'a>(
194
+ df: &'a DataFrame,
195
+ lambda: Value,
196
+ init_null_count: usize,
197
+ first_value: Option<&Series>,
198
+ dt: &DataType,
199
+ ) -> RbResult<ListChunked> {
200
+ let columns = df.get_columns();
201
+
202
+ let skip = usize::from(first_value.is_some());
203
+ if init_null_count == df.height() {
204
+ Ok(ChunkedArray::full_null("apply", df.height()))
205
+ } else {
206
+ let iter = ((init_null_count + skip)..df.height()).map(|idx| {
207
+ let iter = columns.iter().map(|s: &Series| Wrap(s.get(idx)));
208
+ let tpl = (iter.collect::<Vec<Wrap<AnyValue>>>(),);
209
+ match lambda.funcall::<_, _, Value>("call", tpl) {
210
+ Ok(val) => match val.funcall::<_, _, Value>("_s", ()) {
211
+ Ok(val) => val
212
+ .try_convert::<&RbSeries>()
213
+ .ok()
214
+ .map(|ps| ps.series.borrow().clone()),
215
+ Err(_) => {
216
+ if val.is_nil() {
217
+ None
218
+ } else {
219
+ panic!("should return a Series, got a {:?}", val)
220
+ }
221
+ }
222
+ },
223
+ Err(e) => panic!("ruby function failed {}", e),
224
+ }
225
+ });
226
+ iterator_to_list(dt, iter, init_null_count, first_value, "apply", df.height())
227
+ }
228
+ }
229
+
230
+ pub fn apply_lambda_with_rows_output<'a>(
231
+ df: &'a DataFrame,
232
+ lambda: Value,
233
+ init_null_count: usize,
234
+ first_value: Row<'a>,
235
+ inference_size: usize,
236
+ ) -> PolarsResult<DataFrame> {
237
+ let columns = df.get_columns();
238
+ let width = first_value.0.len();
239
+ let null_row = Row::new(vec![AnyValue::Null; width]);
240
+
241
+ let mut row_buf = Row::default();
242
+
243
+ let skip = 1;
244
+ let mut row_iter = ((init_null_count + skip)..df.height()).map(|idx| {
245
+ let iter = columns.iter().map(|s: &Series| Wrap(s.get(idx)));
246
+ let tpl = (iter.collect::<Vec<Wrap<AnyValue>>>(),);
247
+ match lambda.funcall::<_, _, Value>("call", tpl) {
248
+ Ok(val) => {
249
+ match val.try_convert::<RArray>().ok() {
250
+ Some(tuple) => {
251
+ row_buf.0.clear();
252
+ for v in tuple.each() {
253
+ let v = v.unwrap().try_convert::<Wrap<AnyValue>>().unwrap().0;
254
+ row_buf.0.push(v);
255
+ }
256
+ let ptr = &row_buf as *const Row;
257
+ // Safety:
258
+ // we know that row constructor of polars dataframe does not keep a reference
259
+ // to the row. Before we mutate the row buf again, the reference is dropped.
260
+ // we only cannot prove it to the compiler.
261
+ // we still do this because it saves a Vec allocation in a hot loop.
262
+ unsafe { &*ptr }
263
+ }
264
+ None => &null_row,
265
+ }
266
+ }
267
+ Err(e) => panic!("ruby function failed {}", e),
268
+ }
269
+ });
270
+
271
+ // first rows for schema inference
272
+ let mut buf = Vec::with_capacity(inference_size);
273
+ buf.push(first_value);
274
+ buf.extend((&mut row_iter).take(inference_size).cloned());
275
+ let schema = rows_to_schema_first_non_null(&buf, Some(50));
276
+
277
+ if init_null_count > 0 {
278
+ // Safety: we know the iterators size
279
+ let iter = unsafe {
280
+ (0..init_null_count)
281
+ .map(|_| &null_row)
282
+ .chain(buf.iter())
283
+ .chain(row_iter)
284
+ .trust_my_length(df.height())
285
+ };
286
+ DataFrame::from_rows_iter_and_schema(iter, &schema)
287
+ } else {
288
+ // Safety: we know the iterators size
289
+ let iter = unsafe { buf.iter().chain(row_iter).trust_my_length(df.height()) };
290
+ DataFrame::from_rows_iter_and_schema(iter, &schema)
291
+ }
292
+ }