polars-df 0.8.0 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +42 -1
  3. data/Cargo.lock +159 -66
  4. data/Cargo.toml +0 -3
  5. data/LICENSE.txt +1 -1
  6. data/README.md +3 -2
  7. data/ext/polars/Cargo.toml +18 -8
  8. data/ext/polars/src/batched_csv.rs +7 -5
  9. data/ext/polars/src/conversion/anyvalue.rs +186 -0
  10. data/ext/polars/src/conversion/chunked_array.rs +140 -0
  11. data/ext/polars/src/{conversion.rs → conversion/mod.rs} +273 -342
  12. data/ext/polars/src/dataframe.rs +108 -66
  13. data/ext/polars/src/expr/array.rs +78 -0
  14. data/ext/polars/src/expr/datetime.rs +29 -58
  15. data/ext/polars/src/expr/general.rs +83 -36
  16. data/ext/polars/src/expr/list.rs +58 -6
  17. data/ext/polars/src/expr/meta.rs +48 -0
  18. data/ext/polars/src/expr/rolling.rs +1 -0
  19. data/ext/polars/src/expr/string.rs +62 -11
  20. data/ext/polars/src/expr/struct.rs +8 -4
  21. data/ext/polars/src/file.rs +158 -11
  22. data/ext/polars/src/functions/aggregation.rs +6 -0
  23. data/ext/polars/src/functions/lazy.rs +120 -50
  24. data/ext/polars/src/functions/meta.rs +45 -1
  25. data/ext/polars/src/functions/string_cache.rs +14 -0
  26. data/ext/polars/src/functions/whenthen.rs +47 -17
  27. data/ext/polars/src/{lazyframe.rs → lazyframe/mod.rs} +195 -40
  28. data/ext/polars/src/lib.rs +246 -179
  29. data/ext/polars/src/map/dataframe.rs +17 -9
  30. data/ext/polars/src/series/aggregation.rs +20 -0
  31. data/ext/polars/src/series/mod.rs +35 -4
  32. data/lib/polars/array_expr.rb +453 -0
  33. data/lib/polars/array_name_space.rb +346 -0
  34. data/lib/polars/batched_csv_reader.rb +4 -2
  35. data/lib/polars/cat_expr.rb +24 -0
  36. data/lib/polars/cat_name_space.rb +75 -0
  37. data/lib/polars/config.rb +2 -2
  38. data/lib/polars/data_frame.rb +306 -96
  39. data/lib/polars/data_types.rb +191 -28
  40. data/lib/polars/date_time_expr.rb +41 -18
  41. data/lib/polars/date_time_name_space.rb +9 -3
  42. data/lib/polars/exceptions.rb +12 -1
  43. data/lib/polars/expr.rb +898 -215
  44. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  45. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  46. data/lib/polars/functions/as_datatype.rb +248 -0
  47. data/lib/polars/functions/col.rb +47 -0
  48. data/lib/polars/functions/eager.rb +182 -0
  49. data/lib/polars/functions/lazy.rb +1280 -0
  50. data/lib/polars/functions/len.rb +49 -0
  51. data/lib/polars/functions/lit.rb +35 -0
  52. data/lib/polars/functions/random.rb +16 -0
  53. data/lib/polars/functions/range/date_range.rb +103 -0
  54. data/lib/polars/functions/range/int_range.rb +51 -0
  55. data/lib/polars/functions/repeat.rb +144 -0
  56. data/lib/polars/functions/whenthen.rb +96 -0
  57. data/lib/polars/functions.rb +29 -416
  58. data/lib/polars/group_by.rb +2 -2
  59. data/lib/polars/io.rb +36 -31
  60. data/lib/polars/lazy_frame.rb +405 -88
  61. data/lib/polars/list_expr.rb +158 -8
  62. data/lib/polars/list_name_space.rb +102 -0
  63. data/lib/polars/meta_expr.rb +175 -7
  64. data/lib/polars/series.rb +282 -41
  65. data/lib/polars/string_cache.rb +75 -0
  66. data/lib/polars/string_expr.rb +413 -96
  67. data/lib/polars/string_name_space.rb +4 -4
  68. data/lib/polars/testing.rb +507 -0
  69. data/lib/polars/utils.rb +106 -8
  70. data/lib/polars/version.rb +1 -1
  71. data/lib/polars/whenthen.rb +83 -0
  72. data/lib/polars.rb +16 -4
  73. metadata +37 -8
  74. data/lib/polars/lazy_functions.rb +0 -1181
  75. data/lib/polars/when.rb +0 -16
  76. data/lib/polars/when_then.rb +0 -19
@@ -1,9 +1,10 @@
1
1
  use magnus::{IntoValue, RArray, RHash, TryConvert, Value};
2
- use polars::io::RowCount;
2
+ use polars::io::{HiveOptions, RowIndex};
3
3
  use polars::lazy::frame::LazyFrame;
4
4
  use polars::prelude::*;
5
5
  use std::cell::RefCell;
6
6
  use std::io::{BufWriter, Read};
7
+ use std::num::NonZeroUsize;
7
8
  use std::path::PathBuf;
8
9
 
9
10
  use crate::conversion::*;
@@ -55,13 +56,14 @@ impl RbLazyFrame {
55
56
  pub fn new_from_ndjson(
56
57
  path: String,
57
58
  infer_schema_length: Option<usize>,
58
- batch_size: Option<usize>,
59
+ batch_size: Option<Wrap<NonZeroUsize>>,
59
60
  n_rows: Option<usize>,
60
61
  low_memory: bool,
61
62
  rechunk: bool,
62
- row_count: Option<(String, IdxSize)>,
63
+ row_index: Option<(String, IdxSize)>,
63
64
  ) -> RbResult<Self> {
64
- let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
65
+ let batch_size = batch_size.map(|v| v.0);
66
+ let row_index = row_index.map(|(name, offset)| RowIndex { name, offset });
65
67
 
66
68
  let lf = LazyJsonLineReader::new(path)
67
69
  .with_infer_schema_length(infer_schema_length)
@@ -69,7 +71,7 @@ impl RbLazyFrame {
69
71
  .with_n_rows(n_rows)
70
72
  .low_memory(low_memory)
71
73
  .with_rechunk(rechunk)
72
- .with_row_count(row_count)
74
+ .with_row_index(row_index)
73
75
  .finish()
74
76
  .map_err(RbPolarsErr::from)?;
75
77
  Ok(lf.into())
@@ -95,17 +97,17 @@ impl RbLazyFrame {
95
97
  let rechunk = bool::try_convert(arguments[14])?;
96
98
  let skip_rows_after_header = usize::try_convert(arguments[15])?;
97
99
  let encoding = Wrap::<CsvEncoding>::try_convert(arguments[16])?;
98
- let row_count = Option::<(String, IdxSize)>::try_convert(arguments[17])?;
100
+ let row_index = Option::<(String, IdxSize)>::try_convert(arguments[17])?;
99
101
  let try_parse_dates = bool::try_convert(arguments[18])?;
100
102
  let eol_char = String::try_convert(arguments[19])?;
103
+ let truncate_ragged_lines = bool::try_convert(arguments[20])?;
101
104
  // end arguments
102
105
 
103
106
  let null_values = null_values.map(|w| w.0);
104
107
  let quote_char = quote_char.map(|s| s.as_bytes()[0]);
105
108
  let separator = separator.as_bytes()[0];
106
109
  let eol_char = eol_char.as_bytes()[0];
107
-
108
- let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
110
+ let row_index = row_index.map(|(name, offset)| RowIndex { name, offset });
109
111
 
110
112
  let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
111
113
  overwrite_dtype
@@ -113,6 +115,7 @@ impl RbLazyFrame {
113
115
  .map(|(name, dtype)| Field::new(&name, dtype.0))
114
116
  .collect::<Schema>()
115
117
  });
118
+
116
119
  let r = LazyCsvReader::new(path)
117
120
  .with_infer_schema_length(infer_schema_length)
118
121
  .with_separator(separator)
@@ -122,6 +125,7 @@ impl RbLazyFrame {
122
125
  .with_n_rows(n_rows)
123
126
  .with_cache(cache)
124
127
  .with_dtype_overwrite(overwrite_dtype.as_ref())
128
+ // TODO add with_schema
125
129
  .low_memory(low_memory)
126
130
  .with_comment_prefix(comment_prefix.as_deref())
127
131
  .with_quote_char(quote_char)
@@ -129,9 +133,11 @@ impl RbLazyFrame {
129
133
  .with_rechunk(rechunk)
130
134
  .with_skip_rows_after_header(skip_rows_after_header)
131
135
  .with_encoding(encoding.0)
132
- .with_row_count(row_count)
136
+ .with_row_index(row_index)
133
137
  .with_try_parse_dates(try_parse_dates)
134
- .with_null_values(null_values);
138
+ .with_null_values(null_values)
139
+ // TODO add with_missing_is_null
140
+ .truncate_ragged_lines(truncate_ragged_lines);
135
141
 
136
142
  if let Some(_lambda) = with_schema_modify {
137
143
  todo!();
@@ -142,30 +148,53 @@ impl RbLazyFrame {
142
148
 
143
149
  #[allow(clippy::too_many_arguments)]
144
150
  pub fn new_from_parquet(
145
- path: String,
151
+ path: Option<PathBuf>,
152
+ paths: Vec<PathBuf>,
146
153
  n_rows: Option<usize>,
147
154
  cache: bool,
148
155
  parallel: Wrap<ParallelStrategy>,
149
156
  rechunk: bool,
150
- row_count: Option<(String, IdxSize)>,
157
+ row_index: Option<(String, IdxSize)>,
151
158
  low_memory: bool,
152
159
  use_statistics: bool,
153
160
  hive_partitioning: bool,
161
+ hive_schema: Option<Wrap<Schema>>,
154
162
  ) -> RbResult<Self> {
155
- let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
163
+ let parallel = parallel.0;
164
+ let hive_schema = hive_schema.map(|s| Arc::new(s.0));
165
+
166
+ let first_path = if let Some(path) = &path {
167
+ path
168
+ } else {
169
+ paths
170
+ .first()
171
+ .ok_or_else(|| RbValueError::new_err("expected a path argument".to_string()))?
172
+ };
173
+
174
+ let row_index = row_index.map(|(name, offset)| RowIndex { name, offset });
175
+ let hive_options = HiveOptions {
176
+ enabled: hive_partitioning,
177
+ schema: hive_schema,
178
+ };
179
+
156
180
  let args = ScanArgsParquet {
157
181
  n_rows,
158
182
  cache,
159
- parallel: parallel.0,
183
+ parallel,
160
184
  rechunk,
161
- row_count,
185
+ row_index,
162
186
  low_memory,
163
- // TODO support cloud options
164
187
  cloud_options: None,
165
188
  use_statistics,
166
- hive_partitioning,
189
+ hive_options,
167
190
  };
168
- let lf = LazyFrame::scan_parquet(path, args).map_err(RbPolarsErr::from)?;
191
+
192
+ let lf = if path.is_some() {
193
+ LazyFrame::scan_parquet(first_path, args)
194
+ } else {
195
+ LazyFrame::scan_parquet_files(Arc::from(paths), args)
196
+ }
197
+ .map_err(RbPolarsErr::from)?;
169
198
  Ok(lf.into())
170
199
  }
171
200
 
@@ -174,16 +203,17 @@ impl RbLazyFrame {
174
203
  n_rows: Option<usize>,
175
204
  cache: bool,
176
205
  rechunk: bool,
177
- row_count: Option<(String, IdxSize)>,
206
+ row_index: Option<(String, IdxSize)>,
178
207
  memory_map: bool,
179
208
  ) -> RbResult<Self> {
180
- let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
209
+ let row_index = row_index.map(|(name, offset)| RowIndex { name, offset });
181
210
  let args = ScanArgsIpc {
182
211
  n_rows,
183
212
  cache,
184
213
  rechunk,
185
- row_count,
186
- memmap: memory_map,
214
+ row_index,
215
+ memory_map,
216
+ cloud_options: None,
187
217
  };
188
218
  let lf = LazyFrame::scan_ipc(path, args).map_err(RbPolarsErr::from)?;
189
219
  Ok(lf.into())
@@ -216,37 +246,42 @@ impl RbLazyFrame {
216
246
  projection_pushdown: bool,
217
247
  simplify_expr: bool,
218
248
  slice_pushdown: bool,
219
- cse: bool,
249
+ comm_subplan_elim: bool,
250
+ comm_subexpr_elim: bool,
220
251
  allow_streaming: bool,
221
252
  _eager: bool,
222
253
  ) -> RbLazyFrame {
223
254
  let ldf = self.ldf.clone();
224
- let ldf = ldf
255
+ let mut ldf = ldf
225
256
  .with_type_coercion(type_coercion)
226
257
  .with_predicate_pushdown(predicate_pushdown)
227
258
  .with_simplify_expr(simplify_expr)
228
259
  .with_slice_pushdown(slice_pushdown)
229
- .with_comm_subplan_elim(cse)
230
260
  .with_streaming(allow_streaming)
231
261
  ._with_eager(_eager)
232
262
  .with_projection_pushdown(projection_pushdown);
263
+
264
+ ldf = ldf.with_comm_subplan_elim(comm_subplan_elim);
265
+ ldf = ldf.with_comm_subexpr_elim(comm_subexpr_elim);
266
+
233
267
  ldf.into()
234
268
  }
235
269
 
236
270
  pub fn sort(
237
271
  &self,
238
272
  by_column: String,
239
- reverse: bool,
273
+ descending: bool,
240
274
  nulls_last: bool,
241
275
  maintain_order: bool,
276
+ multithreaded: bool,
242
277
  ) -> Self {
243
278
  let ldf = self.ldf.clone();
244
279
  ldf.sort(
245
- &by_column,
246
- SortOptions {
247
- descending: reverse,
280
+ [&by_column],
281
+ SortMultipleOptions {
282
+ descending: vec![descending],
248
283
  nulls_last,
249
- multithreaded: true,
284
+ multithreaded,
250
285
  maintain_order,
251
286
  },
252
287
  )
@@ -255,15 +290,24 @@ impl RbLazyFrame {
255
290
 
256
291
  pub fn sort_by_exprs(
257
292
  &self,
258
- by_column: RArray,
259
- reverse: Vec<bool>,
293
+ by: RArray,
294
+ descending: Vec<bool>,
260
295
  nulls_last: bool,
261
296
  maintain_order: bool,
297
+ multithreaded: bool,
262
298
  ) -> RbResult<Self> {
263
299
  let ldf = self.ldf.clone();
264
- let exprs = rb_exprs_to_exprs(by_column)?;
300
+ let exprs = rb_exprs_to_exprs(by)?;
265
301
  Ok(ldf
266
- .sort_by_exprs(exprs, reverse, nulls_last, maintain_order)
302
+ .sort_by_exprs(
303
+ exprs,
304
+ SortMultipleOptions {
305
+ descending,
306
+ nulls_last,
307
+ maintain_order,
308
+ multithreaded,
309
+ },
310
+ )
267
311
  .into())
268
312
  }
269
313
 
@@ -304,6 +348,76 @@ impl RbLazyFrame {
304
348
  Ok(())
305
349
  }
306
350
 
351
+ pub fn sink_ipc(
352
+ &self,
353
+ path: PathBuf,
354
+ compression: Option<Wrap<IpcCompression>>,
355
+ maintain_order: bool,
356
+ ) -> RbResult<()> {
357
+ let options = IpcWriterOptions {
358
+ compression: compression.map(|c| c.0),
359
+ maintain_order,
360
+ };
361
+
362
+ let ldf = self.ldf.clone();
363
+ ldf.sink_ipc(path, options).map_err(RbPolarsErr::from)?;
364
+ Ok(())
365
+ }
366
+
367
+ #[allow(clippy::too_many_arguments)]
368
+ pub fn sink_csv(
369
+ &self,
370
+ path: PathBuf,
371
+ include_bom: bool,
372
+ include_header: bool,
373
+ separator: u8,
374
+ line_terminator: String,
375
+ quote_char: u8,
376
+ batch_size: Wrap<NonZeroUsize>,
377
+ datetime_format: Option<String>,
378
+ date_format: Option<String>,
379
+ time_format: Option<String>,
380
+ float_precision: Option<usize>,
381
+ null_value: Option<String>,
382
+ quote_style: Option<Wrap<QuoteStyle>>,
383
+ maintain_order: bool,
384
+ ) -> RbResult<()> {
385
+ let quote_style = quote_style.map_or(QuoteStyle::default(), |wrap| wrap.0);
386
+ let null_value = null_value.unwrap_or(SerializeOptions::default().null);
387
+
388
+ let serialize_options = SerializeOptions {
389
+ date_format,
390
+ time_format,
391
+ datetime_format,
392
+ float_precision,
393
+ separator,
394
+ quote_char,
395
+ null: null_value,
396
+ line_terminator,
397
+ quote_style,
398
+ };
399
+
400
+ let options = CsvWriterOptions {
401
+ include_bom,
402
+ include_header,
403
+ maintain_order,
404
+ batch_size: batch_size.0,
405
+ serialize_options,
406
+ };
407
+
408
+ let ldf = self.ldf.clone();
409
+ ldf.sink_csv(path, options).map_err(RbPolarsErr::from)?;
410
+ Ok(())
411
+ }
412
+
413
+ pub fn sink_json(&self, path: PathBuf, maintain_order: bool) -> RbResult<()> {
414
+ let options = JsonWriterOptions { maintain_order };
415
+
416
+ let ldf = self.ldf.clone();
417
+ ldf.sink_json(path, options).map_err(RbPolarsErr::from)?;
418
+ Ok(())
419
+ }
420
+
307
421
  pub fn fetch(&self, n_rows: usize) -> RbResult<RbDataFrame> {
308
422
  let ldf = self.ldf.clone();
309
423
  let df = ldf.fetch(n_rows).map_err(RbPolarsErr::from)?;
@@ -321,6 +435,12 @@ impl RbLazyFrame {
321
435
  Ok(ldf.select(exprs).into())
322
436
  }
323
437
 
438
+ pub fn select_seq(&self, exprs: RArray) -> RbResult<Self> {
439
+ let ldf = self.ldf.clone();
440
+ let exprs = rb_exprs_to_exprs(exprs)?;
441
+ Ok(ldf.select_seq(exprs).into())
442
+ }
443
+
324
444
  pub fn group_by(&self, by: RArray, maintain_order: bool) -> RbResult<RbLazyGroupBy> {
325
445
  let ldf = self.ldf.clone();
326
446
  let by = rb_exprs_to_exprs(by)?;
@@ -334,7 +454,7 @@ impl RbLazyFrame {
334
454
  })
335
455
  }
336
456
 
337
- pub fn group_by_rolling(
457
+ pub fn rolling(
338
458
  &self,
339
459
  index_column: &RbExpr,
340
460
  period: String,
@@ -346,7 +466,7 @@ impl RbLazyFrame {
346
466
  let closed_window = closed.0;
347
467
  let ldf = self.ldf.clone();
348
468
  let by = rb_exprs_to_exprs(by)?;
349
- let lazy_gb = ldf.group_by_rolling(
469
+ let lazy_gb = ldf.rolling(
350
470
  index_column.inner.clone(),
351
471
  by,
352
472
  RollingGroupOptions {
@@ -459,6 +579,7 @@ impl RbLazyFrame {
459
579
  right_on: RArray,
460
580
  allow_parallel: bool,
461
581
  force_parallel: bool,
582
+ join_nulls: bool,
462
583
  how: Wrap<JoinType>,
463
584
  suffix: String,
464
585
  ) -> RbResult<Self> {
@@ -474,17 +595,28 @@ impl RbLazyFrame {
474
595
  .right_on(right_on)
475
596
  .allow_parallel(allow_parallel)
476
597
  .force_parallel(force_parallel)
598
+ .join_nulls(join_nulls)
477
599
  .how(how.0)
478
600
  .suffix(suffix)
479
601
  .finish()
480
602
  .into())
481
603
  }
482
604
 
605
+ pub fn with_column(&self, expr: &RbExpr) -> Self {
606
+ let ldf = self.ldf.clone();
607
+ ldf.with_column(expr.inner.clone()).into()
608
+ }
609
+
483
610
  pub fn with_columns(&self, exprs: RArray) -> RbResult<Self> {
484
611
  let ldf = self.ldf.clone();
485
612
  Ok(ldf.with_columns(rb_exprs_to_exprs(exprs)?).into())
486
613
  }
487
614
 
615
+ pub fn with_columns_seq(&self, exprs: RArray) -> RbResult<Self> {
616
+ let ldf = self.ldf.clone();
617
+ Ok(ldf.with_columns_seq(rb_exprs_to_exprs(exprs)?).into())
618
+ }
619
+
488
620
  pub fn rename(&self, existing: Vec<String>, new: Vec<String>) -> Self {
489
621
  let ldf = self.ldf.clone();
490
622
  ldf.rename(existing, new).into()
@@ -569,6 +701,11 @@ impl RbLazyFrame {
569
701
  Ok(ldf.explode(column).into())
570
702
  }
571
703
 
704
+ pub fn null_count(&self) -> Self {
705
+ let ldf = self.ldf.clone();
706
+ ldf.null_count().into()
707
+ }
708
+
572
709
  pub fn unique(
573
710
  &self,
574
711
  maintain_order: bool,
@@ -619,14 +756,18 @@ impl RbLazyFrame {
619
756
  ldf.melt(args).into()
620
757
  }
621
758
 
622
- pub fn with_row_count(&self, name: String, offset: Option<IdxSize>) -> Self {
759
+ pub fn with_row_index(&self, name: String, offset: Option<IdxSize>) -> Self {
623
760
  let ldf = self.ldf.clone();
624
- ldf.with_row_count(&name, offset).into()
761
+ ldf.with_row_index(&name, offset).into()
625
762
  }
626
763
 
627
- pub fn drop_columns(&self, cols: Vec<String>) -> Self {
764
+ pub fn drop(&self, cols: Vec<String>) -> Self {
628
765
  let ldf = self.ldf.clone();
629
- ldf.drop_columns(cols).into()
766
+ ldf.drop(cols).into()
767
+ }
768
+
769
+ pub fn cast_all(&self, dtype: Wrap<DataType>, strict: bool) -> Self {
770
+ self.ldf.clone().cast_all(dtype.0, strict).into()
630
771
  }
631
772
 
632
773
  pub fn clone(&self) -> Self {
@@ -668,4 +809,18 @@ impl RbLazyFrame {
668
809
  pub fn width(&self) -> RbResult<usize> {
669
810
  Ok(self.get_schema()?.len())
670
811
  }
812
+
813
+ pub fn count(&self) -> Self {
814
+ let ldf = self.ldf.clone();
815
+ ldf.count().into()
816
+ }
817
+
818
+ pub fn merge_sorted(&self, other: &Self, key: String) -> RbResult<Self> {
819
+ let out = self
820
+ .ldf
821
+ .clone()
822
+ .merge_sorted(other.ldf.clone(), &key)
823
+ .map_err(RbPolarsErr::from)?;
824
+ Ok(out.into())
825
+ }
671
826
  }