polars-df 0.8.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +42 -1
  3. data/Cargo.lock +159 -66
  4. data/Cargo.toml +0 -3
  5. data/LICENSE.txt +1 -1
  6. data/README.md +3 -2
  7. data/ext/polars/Cargo.toml +18 -8
  8. data/ext/polars/src/batched_csv.rs +7 -5
  9. data/ext/polars/src/conversion/anyvalue.rs +186 -0
  10. data/ext/polars/src/conversion/chunked_array.rs +140 -0
  11. data/ext/polars/src/{conversion.rs → conversion/mod.rs} +273 -342
  12. data/ext/polars/src/dataframe.rs +108 -66
  13. data/ext/polars/src/expr/array.rs +78 -0
  14. data/ext/polars/src/expr/datetime.rs +29 -58
  15. data/ext/polars/src/expr/general.rs +83 -36
  16. data/ext/polars/src/expr/list.rs +58 -6
  17. data/ext/polars/src/expr/meta.rs +48 -0
  18. data/ext/polars/src/expr/rolling.rs +1 -0
  19. data/ext/polars/src/expr/string.rs +62 -11
  20. data/ext/polars/src/expr/struct.rs +8 -4
  21. data/ext/polars/src/file.rs +158 -11
  22. data/ext/polars/src/functions/aggregation.rs +6 -0
  23. data/ext/polars/src/functions/lazy.rs +120 -50
  24. data/ext/polars/src/functions/meta.rs +45 -1
  25. data/ext/polars/src/functions/string_cache.rs +14 -0
  26. data/ext/polars/src/functions/whenthen.rs +47 -17
  27. data/ext/polars/src/{lazyframe.rs → lazyframe/mod.rs} +195 -40
  28. data/ext/polars/src/lib.rs +246 -179
  29. data/ext/polars/src/map/dataframe.rs +17 -9
  30. data/ext/polars/src/series/aggregation.rs +20 -0
  31. data/ext/polars/src/series/mod.rs +35 -4
  32. data/lib/polars/array_expr.rb +453 -0
  33. data/lib/polars/array_name_space.rb +346 -0
  34. data/lib/polars/batched_csv_reader.rb +4 -2
  35. data/lib/polars/cat_expr.rb +24 -0
  36. data/lib/polars/cat_name_space.rb +75 -0
  37. data/lib/polars/config.rb +2 -2
  38. data/lib/polars/data_frame.rb +306 -96
  39. data/lib/polars/data_types.rb +191 -28
  40. data/lib/polars/date_time_expr.rb +41 -18
  41. data/lib/polars/date_time_name_space.rb +9 -3
  42. data/lib/polars/exceptions.rb +12 -1
  43. data/lib/polars/expr.rb +898 -215
  44. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  45. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  46. data/lib/polars/functions/as_datatype.rb +248 -0
  47. data/lib/polars/functions/col.rb +47 -0
  48. data/lib/polars/functions/eager.rb +182 -0
  49. data/lib/polars/functions/lazy.rb +1280 -0
  50. data/lib/polars/functions/len.rb +49 -0
  51. data/lib/polars/functions/lit.rb +35 -0
  52. data/lib/polars/functions/random.rb +16 -0
  53. data/lib/polars/functions/range/date_range.rb +103 -0
  54. data/lib/polars/functions/range/int_range.rb +51 -0
  55. data/lib/polars/functions/repeat.rb +144 -0
  56. data/lib/polars/functions/whenthen.rb +96 -0
  57. data/lib/polars/functions.rb +29 -416
  58. data/lib/polars/group_by.rb +2 -2
  59. data/lib/polars/io.rb +36 -31
  60. data/lib/polars/lazy_frame.rb +405 -88
  61. data/lib/polars/list_expr.rb +158 -8
  62. data/lib/polars/list_name_space.rb +102 -0
  63. data/lib/polars/meta_expr.rb +175 -7
  64. data/lib/polars/series.rb +282 -41
  65. data/lib/polars/string_cache.rb +75 -0
  66. data/lib/polars/string_expr.rb +413 -96
  67. data/lib/polars/string_name_space.rb +4 -4
  68. data/lib/polars/testing.rb +507 -0
  69. data/lib/polars/utils.rb +106 -8
  70. data/lib/polars/version.rb +1 -1
  71. data/lib/polars/whenthen.rb +83 -0
  72. data/lib/polars.rb +16 -4
  73. metadata +37 -8
  74. data/lib/polars/lazy_functions.rb +0 -1181
  75. data/lib/polars/when.rb +0 -16
  76. data/lib/polars/when_then.rb +0 -19
@@ -1,9 +1,10 @@
1
1
  use magnus::{IntoValue, RArray, RHash, TryConvert, Value};
2
- use polars::io::RowCount;
2
+ use polars::io::{HiveOptions, RowIndex};
3
3
  use polars::lazy::frame::LazyFrame;
4
4
  use polars::prelude::*;
5
5
  use std::cell::RefCell;
6
6
  use std::io::{BufWriter, Read};
7
+ use std::num::NonZeroUsize;
7
8
  use std::path::PathBuf;
8
9
 
9
10
  use crate::conversion::*;
@@ -55,13 +56,14 @@ impl RbLazyFrame {
55
56
  pub fn new_from_ndjson(
56
57
  path: String,
57
58
  infer_schema_length: Option<usize>,
58
- batch_size: Option<usize>,
59
+ batch_size: Option<Wrap<NonZeroUsize>>,
59
60
  n_rows: Option<usize>,
60
61
  low_memory: bool,
61
62
  rechunk: bool,
62
- row_count: Option<(String, IdxSize)>,
63
+ row_index: Option<(String, IdxSize)>,
63
64
  ) -> RbResult<Self> {
64
- let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
65
+ let batch_size = batch_size.map(|v| v.0);
66
+ let row_index = row_index.map(|(name, offset)| RowIndex { name, offset });
65
67
 
66
68
  let lf = LazyJsonLineReader::new(path)
67
69
  .with_infer_schema_length(infer_schema_length)
@@ -69,7 +71,7 @@ impl RbLazyFrame {
69
71
  .with_n_rows(n_rows)
70
72
  .low_memory(low_memory)
71
73
  .with_rechunk(rechunk)
72
- .with_row_count(row_count)
74
+ .with_row_index(row_index)
73
75
  .finish()
74
76
  .map_err(RbPolarsErr::from)?;
75
77
  Ok(lf.into())
@@ -95,17 +97,17 @@ impl RbLazyFrame {
95
97
  let rechunk = bool::try_convert(arguments[14])?;
96
98
  let skip_rows_after_header = usize::try_convert(arguments[15])?;
97
99
  let encoding = Wrap::<CsvEncoding>::try_convert(arguments[16])?;
98
- let row_count = Option::<(String, IdxSize)>::try_convert(arguments[17])?;
100
+ let row_index = Option::<(String, IdxSize)>::try_convert(arguments[17])?;
99
101
  let try_parse_dates = bool::try_convert(arguments[18])?;
100
102
  let eol_char = String::try_convert(arguments[19])?;
103
+ let truncate_ragged_lines = bool::try_convert(arguments[20])?;
101
104
  // end arguments
102
105
 
103
106
  let null_values = null_values.map(|w| w.0);
104
107
  let quote_char = quote_char.map(|s| s.as_bytes()[0]);
105
108
  let separator = separator.as_bytes()[0];
106
109
  let eol_char = eol_char.as_bytes()[0];
107
-
108
- let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
110
+ let row_index = row_index.map(|(name, offset)| RowIndex { name, offset });
109
111
 
110
112
  let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
111
113
  overwrite_dtype
@@ -113,6 +115,7 @@ impl RbLazyFrame {
113
115
  .map(|(name, dtype)| Field::new(&name, dtype.0))
114
116
  .collect::<Schema>()
115
117
  });
118
+
116
119
  let r = LazyCsvReader::new(path)
117
120
  .with_infer_schema_length(infer_schema_length)
118
121
  .with_separator(separator)
@@ -122,6 +125,7 @@ impl RbLazyFrame {
122
125
  .with_n_rows(n_rows)
123
126
  .with_cache(cache)
124
127
  .with_dtype_overwrite(overwrite_dtype.as_ref())
128
+ // TODO add with_schema
125
129
  .low_memory(low_memory)
126
130
  .with_comment_prefix(comment_prefix.as_deref())
127
131
  .with_quote_char(quote_char)
@@ -129,9 +133,11 @@ impl RbLazyFrame {
129
133
  .with_rechunk(rechunk)
130
134
  .with_skip_rows_after_header(skip_rows_after_header)
131
135
  .with_encoding(encoding.0)
132
- .with_row_count(row_count)
136
+ .with_row_index(row_index)
133
137
  .with_try_parse_dates(try_parse_dates)
134
- .with_null_values(null_values);
138
+ .with_null_values(null_values)
139
+ // TODO add with_missing_is_null
140
+ .truncate_ragged_lines(truncate_ragged_lines);
135
141
 
136
142
  if let Some(_lambda) = with_schema_modify {
137
143
  todo!();
@@ -142,30 +148,53 @@ impl RbLazyFrame {
142
148
 
143
149
  #[allow(clippy::too_many_arguments)]
144
150
  pub fn new_from_parquet(
145
- path: String,
151
+ path: Option<PathBuf>,
152
+ paths: Vec<PathBuf>,
146
153
  n_rows: Option<usize>,
147
154
  cache: bool,
148
155
  parallel: Wrap<ParallelStrategy>,
149
156
  rechunk: bool,
150
- row_count: Option<(String, IdxSize)>,
157
+ row_index: Option<(String, IdxSize)>,
151
158
  low_memory: bool,
152
159
  use_statistics: bool,
153
160
  hive_partitioning: bool,
161
+ hive_schema: Option<Wrap<Schema>>,
154
162
  ) -> RbResult<Self> {
155
- let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
163
+ let parallel = parallel.0;
164
+ let hive_schema = hive_schema.map(|s| Arc::new(s.0));
165
+
166
+ let first_path = if let Some(path) = &path {
167
+ path
168
+ } else {
169
+ paths
170
+ .first()
171
+ .ok_or_else(|| RbValueError::new_err("expected a path argument".to_string()))?
172
+ };
173
+
174
+ let row_index = row_index.map(|(name, offset)| RowIndex { name, offset });
175
+ let hive_options = HiveOptions {
176
+ enabled: hive_partitioning,
177
+ schema: hive_schema,
178
+ };
179
+
156
180
  let args = ScanArgsParquet {
157
181
  n_rows,
158
182
  cache,
159
- parallel: parallel.0,
183
+ parallel,
160
184
  rechunk,
161
- row_count,
185
+ row_index,
162
186
  low_memory,
163
- // TODO support cloud options
164
187
  cloud_options: None,
165
188
  use_statistics,
166
- hive_partitioning,
189
+ hive_options,
167
190
  };
168
- let lf = LazyFrame::scan_parquet(path, args).map_err(RbPolarsErr::from)?;
191
+
192
+ let lf = if path.is_some() {
193
+ LazyFrame::scan_parquet(first_path, args)
194
+ } else {
195
+ LazyFrame::scan_parquet_files(Arc::from(paths), args)
196
+ }
197
+ .map_err(RbPolarsErr::from)?;
169
198
  Ok(lf.into())
170
199
  }
171
200
 
@@ -174,16 +203,17 @@ impl RbLazyFrame {
174
203
  n_rows: Option<usize>,
175
204
  cache: bool,
176
205
  rechunk: bool,
177
- row_count: Option<(String, IdxSize)>,
206
+ row_index: Option<(String, IdxSize)>,
178
207
  memory_map: bool,
179
208
  ) -> RbResult<Self> {
180
- let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
209
+ let row_index = row_index.map(|(name, offset)| RowIndex { name, offset });
181
210
  let args = ScanArgsIpc {
182
211
  n_rows,
183
212
  cache,
184
213
  rechunk,
185
- row_count,
186
- memmap: memory_map,
214
+ row_index,
215
+ memory_map,
216
+ cloud_options: None,
187
217
  };
188
218
  let lf = LazyFrame::scan_ipc(path, args).map_err(RbPolarsErr::from)?;
189
219
  Ok(lf.into())
@@ -216,37 +246,42 @@ impl RbLazyFrame {
216
246
  projection_pushdown: bool,
217
247
  simplify_expr: bool,
218
248
  slice_pushdown: bool,
219
- cse: bool,
249
+ comm_subplan_elim: bool,
250
+ comm_subexpr_elim: bool,
220
251
  allow_streaming: bool,
221
252
  _eager: bool,
222
253
  ) -> RbLazyFrame {
223
254
  let ldf = self.ldf.clone();
224
- let ldf = ldf
255
+ let mut ldf = ldf
225
256
  .with_type_coercion(type_coercion)
226
257
  .with_predicate_pushdown(predicate_pushdown)
227
258
  .with_simplify_expr(simplify_expr)
228
259
  .with_slice_pushdown(slice_pushdown)
229
- .with_comm_subplan_elim(cse)
230
260
  .with_streaming(allow_streaming)
231
261
  ._with_eager(_eager)
232
262
  .with_projection_pushdown(projection_pushdown);
263
+
264
+ ldf = ldf.with_comm_subplan_elim(comm_subplan_elim);
265
+ ldf = ldf.with_comm_subexpr_elim(comm_subexpr_elim);
266
+
233
267
  ldf.into()
234
268
  }
235
269
 
236
270
  pub fn sort(
237
271
  &self,
238
272
  by_column: String,
239
- reverse: bool,
273
+ descending: bool,
240
274
  nulls_last: bool,
241
275
  maintain_order: bool,
276
+ multithreaded: bool,
242
277
  ) -> Self {
243
278
  let ldf = self.ldf.clone();
244
279
  ldf.sort(
245
- &by_column,
246
- SortOptions {
247
- descending: reverse,
280
+ [&by_column],
281
+ SortMultipleOptions {
282
+ descending: vec![descending],
248
283
  nulls_last,
249
- multithreaded: true,
284
+ multithreaded,
250
285
  maintain_order,
251
286
  },
252
287
  )
@@ -255,15 +290,24 @@ impl RbLazyFrame {
255
290
 
256
291
  pub fn sort_by_exprs(
257
292
  &self,
258
- by_column: RArray,
259
- reverse: Vec<bool>,
293
+ by: RArray,
294
+ descending: Vec<bool>,
260
295
  nulls_last: bool,
261
296
  maintain_order: bool,
297
+ multithreaded: bool,
262
298
  ) -> RbResult<Self> {
263
299
  let ldf = self.ldf.clone();
264
- let exprs = rb_exprs_to_exprs(by_column)?;
300
+ let exprs = rb_exprs_to_exprs(by)?;
265
301
  Ok(ldf
266
- .sort_by_exprs(exprs, reverse, nulls_last, maintain_order)
302
+ .sort_by_exprs(
303
+ exprs,
304
+ SortMultipleOptions {
305
+ descending,
306
+ nulls_last,
307
+ maintain_order,
308
+ multithreaded,
309
+ },
310
+ )
267
311
  .into())
268
312
  }
269
313
 
@@ -304,6 +348,76 @@ impl RbLazyFrame {
304
348
  Ok(())
305
349
  }
306
350
 
351
+ pub fn sink_ipc(
352
+ &self,
353
+ path: PathBuf,
354
+ compression: Option<Wrap<IpcCompression>>,
355
+ maintain_order: bool,
356
+ ) -> RbResult<()> {
357
+ let options = IpcWriterOptions {
358
+ compression: compression.map(|c| c.0),
359
+ maintain_order,
360
+ };
361
+
362
+ let ldf = self.ldf.clone();
363
+ ldf.sink_ipc(path, options).map_err(RbPolarsErr::from)?;
364
+ Ok(())
365
+ }
366
+
367
+ #[allow(clippy::too_many_arguments)]
368
+ pub fn sink_csv(
369
+ &self,
370
+ path: PathBuf,
371
+ include_bom: bool,
372
+ include_header: bool,
373
+ separator: u8,
374
+ line_terminator: String,
375
+ quote_char: u8,
376
+ batch_size: Wrap<NonZeroUsize>,
377
+ datetime_format: Option<String>,
378
+ date_format: Option<String>,
379
+ time_format: Option<String>,
380
+ float_precision: Option<usize>,
381
+ null_value: Option<String>,
382
+ quote_style: Option<Wrap<QuoteStyle>>,
383
+ maintain_order: bool,
384
+ ) -> RbResult<()> {
385
+ let quote_style = quote_style.map_or(QuoteStyle::default(), |wrap| wrap.0);
386
+ let null_value = null_value.unwrap_or(SerializeOptions::default().null);
387
+
388
+ let serialize_options = SerializeOptions {
389
+ date_format,
390
+ time_format,
391
+ datetime_format,
392
+ float_precision,
393
+ separator,
394
+ quote_char,
395
+ null: null_value,
396
+ line_terminator,
397
+ quote_style,
398
+ };
399
+
400
+ let options = CsvWriterOptions {
401
+ include_bom,
402
+ include_header,
403
+ maintain_order,
404
+ batch_size: batch_size.0,
405
+ serialize_options,
406
+ };
407
+
408
+ let ldf = self.ldf.clone();
409
+ ldf.sink_csv(path, options).map_err(RbPolarsErr::from)?;
410
+ Ok(())
411
+ }
412
+
413
+ pub fn sink_json(&self, path: PathBuf, maintain_order: bool) -> RbResult<()> {
414
+ let options = JsonWriterOptions { maintain_order };
415
+
416
+ let ldf = self.ldf.clone();
417
+ ldf.sink_json(path, options).map_err(RbPolarsErr::from)?;
418
+ Ok(())
419
+ }
420
+
307
421
  pub fn fetch(&self, n_rows: usize) -> RbResult<RbDataFrame> {
308
422
  let ldf = self.ldf.clone();
309
423
  let df = ldf.fetch(n_rows).map_err(RbPolarsErr::from)?;
@@ -321,6 +435,12 @@ impl RbLazyFrame {
321
435
  Ok(ldf.select(exprs).into())
322
436
  }
323
437
 
438
+ pub fn select_seq(&self, exprs: RArray) -> RbResult<Self> {
439
+ let ldf = self.ldf.clone();
440
+ let exprs = rb_exprs_to_exprs(exprs)?;
441
+ Ok(ldf.select_seq(exprs).into())
442
+ }
443
+
324
444
  pub fn group_by(&self, by: RArray, maintain_order: bool) -> RbResult<RbLazyGroupBy> {
325
445
  let ldf = self.ldf.clone();
326
446
  let by = rb_exprs_to_exprs(by)?;
@@ -334,7 +454,7 @@ impl RbLazyFrame {
334
454
  })
335
455
  }
336
456
 
337
- pub fn group_by_rolling(
457
+ pub fn rolling(
338
458
  &self,
339
459
  index_column: &RbExpr,
340
460
  period: String,
@@ -346,7 +466,7 @@ impl RbLazyFrame {
346
466
  let closed_window = closed.0;
347
467
  let ldf = self.ldf.clone();
348
468
  let by = rb_exprs_to_exprs(by)?;
349
- let lazy_gb = ldf.group_by_rolling(
469
+ let lazy_gb = ldf.rolling(
350
470
  index_column.inner.clone(),
351
471
  by,
352
472
  RollingGroupOptions {
@@ -459,6 +579,7 @@ impl RbLazyFrame {
459
579
  right_on: RArray,
460
580
  allow_parallel: bool,
461
581
  force_parallel: bool,
582
+ join_nulls: bool,
462
583
  how: Wrap<JoinType>,
463
584
  suffix: String,
464
585
  ) -> RbResult<Self> {
@@ -474,17 +595,28 @@ impl RbLazyFrame {
474
595
  .right_on(right_on)
475
596
  .allow_parallel(allow_parallel)
476
597
  .force_parallel(force_parallel)
598
+ .join_nulls(join_nulls)
477
599
  .how(how.0)
478
600
  .suffix(suffix)
479
601
  .finish()
480
602
  .into())
481
603
  }
482
604
 
605
+ pub fn with_column(&self, expr: &RbExpr) -> Self {
606
+ let ldf = self.ldf.clone();
607
+ ldf.with_column(expr.inner.clone()).into()
608
+ }
609
+
483
610
  pub fn with_columns(&self, exprs: RArray) -> RbResult<Self> {
484
611
  let ldf = self.ldf.clone();
485
612
  Ok(ldf.with_columns(rb_exprs_to_exprs(exprs)?).into())
486
613
  }
487
614
 
615
+ pub fn with_columns_seq(&self, exprs: RArray) -> RbResult<Self> {
616
+ let ldf = self.ldf.clone();
617
+ Ok(ldf.with_columns_seq(rb_exprs_to_exprs(exprs)?).into())
618
+ }
619
+
488
620
  pub fn rename(&self, existing: Vec<String>, new: Vec<String>) -> Self {
489
621
  let ldf = self.ldf.clone();
490
622
  ldf.rename(existing, new).into()
@@ -569,6 +701,11 @@ impl RbLazyFrame {
569
701
  Ok(ldf.explode(column).into())
570
702
  }
571
703
 
704
+ pub fn null_count(&self) -> Self {
705
+ let ldf = self.ldf.clone();
706
+ ldf.null_count().into()
707
+ }
708
+
572
709
  pub fn unique(
573
710
  &self,
574
711
  maintain_order: bool,
@@ -619,14 +756,18 @@ impl RbLazyFrame {
619
756
  ldf.melt(args).into()
620
757
  }
621
758
 
622
- pub fn with_row_count(&self, name: String, offset: Option<IdxSize>) -> Self {
759
+ pub fn with_row_index(&self, name: String, offset: Option<IdxSize>) -> Self {
623
760
  let ldf = self.ldf.clone();
624
- ldf.with_row_count(&name, offset).into()
761
+ ldf.with_row_index(&name, offset).into()
625
762
  }
626
763
 
627
- pub fn drop_columns(&self, cols: Vec<String>) -> Self {
764
+ pub fn drop(&self, cols: Vec<String>) -> Self {
628
765
  let ldf = self.ldf.clone();
629
- ldf.drop_columns(cols).into()
766
+ ldf.drop(cols).into()
767
+ }
768
+
769
+ pub fn cast_all(&self, dtype: Wrap<DataType>, strict: bool) -> Self {
770
+ self.ldf.clone().cast_all(dtype.0, strict).into()
630
771
  }
631
772
 
632
773
  pub fn clone(&self) -> Self {
@@ -668,4 +809,18 @@ impl RbLazyFrame {
668
809
  pub fn width(&self) -> RbResult<usize> {
669
810
  Ok(self.get_schema()?.len())
670
811
  }
812
+
813
+ pub fn count(&self) -> Self {
814
+ let ldf = self.ldf.clone();
815
+ ldf.count().into()
816
+ }
817
+
818
+ pub fn merge_sorted(&self, other: &Self, key: String) -> RbResult<Self> {
819
+ let out = self
820
+ .ldf
821
+ .clone()
822
+ .merge_sorted(other.ldf.clone(), &key)
823
+ .map_err(RbPolarsErr::from)?;
824
+ Ok(out.into())
825
+ }
671
826
  }