polars-df 0.9.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +23 -0
  3. data/Cargo.lock +144 -57
  4. data/README.md +7 -6
  5. data/ext/polars/Cargo.toml +10 -6
  6. data/ext/polars/src/batched_csv.rs +53 -50
  7. data/ext/polars/src/conversion/anyvalue.rs +3 -2
  8. data/ext/polars/src/conversion/mod.rs +31 -67
  9. data/ext/polars/src/dataframe/construction.rs +186 -0
  10. data/ext/polars/src/dataframe/export.rs +48 -0
  11. data/ext/polars/src/dataframe/general.rs +607 -0
  12. data/ext/polars/src/dataframe/io.rs +463 -0
  13. data/ext/polars/src/dataframe/mod.rs +26 -0
  14. data/ext/polars/src/expr/array.rs +6 -2
  15. data/ext/polars/src/expr/datetime.rs +13 -4
  16. data/ext/polars/src/expr/general.rs +50 -9
  17. data/ext/polars/src/expr/list.rs +6 -2
  18. data/ext/polars/src/expr/rolling.rs +185 -69
  19. data/ext/polars/src/expr/string.rs +12 -33
  20. data/ext/polars/src/file.rs +158 -11
  21. data/ext/polars/src/functions/lazy.rs +20 -3
  22. data/ext/polars/src/functions/range.rs +74 -0
  23. data/ext/polars/src/functions/whenthen.rs +47 -17
  24. data/ext/polars/src/interop/mod.rs +1 -0
  25. data/ext/polars/src/interop/numo/mod.rs +2 -0
  26. data/ext/polars/src/interop/numo/to_numo_df.rs +23 -0
  27. data/ext/polars/src/interop/numo/to_numo_series.rs +60 -0
  28. data/ext/polars/src/lazyframe/mod.rs +111 -56
  29. data/ext/polars/src/lib.rs +68 -34
  30. data/ext/polars/src/map/dataframe.rs +17 -9
  31. data/ext/polars/src/map/lazy.rs +5 -25
  32. data/ext/polars/src/map/series.rs +7 -1
  33. data/ext/polars/src/series/aggregation.rs +47 -30
  34. data/ext/polars/src/series/export.rs +131 -49
  35. data/ext/polars/src/series/mod.rs +13 -133
  36. data/lib/polars/array_expr.rb +6 -2
  37. data/lib/polars/batched_csv_reader.rb +11 -3
  38. data/lib/polars/convert.rb +6 -1
  39. data/lib/polars/data_frame.rb +225 -370
  40. data/lib/polars/date_time_expr.rb +11 -4
  41. data/lib/polars/date_time_name_space.rb +14 -4
  42. data/lib/polars/dynamic_group_by.rb +2 -2
  43. data/lib/polars/exceptions.rb +4 -0
  44. data/lib/polars/expr.rb +1171 -54
  45. data/lib/polars/functions/lazy.rb +3 -3
  46. data/lib/polars/functions/range/date_range.rb +92 -0
  47. data/lib/polars/functions/range/datetime_range.rb +149 -0
  48. data/lib/polars/functions/range/time_range.rb +141 -0
  49. data/lib/polars/functions/whenthen.rb +74 -5
  50. data/lib/polars/group_by.rb +88 -23
  51. data/lib/polars/io/avro.rb +24 -0
  52. data/lib/polars/{io.rb → io/csv.rb} +307 -489
  53. data/lib/polars/io/database.rb +73 -0
  54. data/lib/polars/io/ipc.rb +247 -0
  55. data/lib/polars/io/json.rb +18 -0
  56. data/lib/polars/io/ndjson.rb +69 -0
  57. data/lib/polars/io/parquet.rb +226 -0
  58. data/lib/polars/lazy_frame.rb +55 -195
  59. data/lib/polars/lazy_group_by.rb +100 -3
  60. data/lib/polars/list_expr.rb +6 -2
  61. data/lib/polars/rolling_group_by.rb +2 -2
  62. data/lib/polars/series.rb +14 -12
  63. data/lib/polars/string_expr.rb +38 -36
  64. data/lib/polars/utils.rb +89 -1
  65. data/lib/polars/version.rb +1 -1
  66. data/lib/polars/whenthen.rb +83 -0
  67. data/lib/polars.rb +10 -3
  68. metadata +23 -8
  69. data/ext/polars/src/dataframe.rs +0 -1182
  70. data/lib/polars/when.rb +0 -16
  71. data/lib/polars/when_then.rb +0 -19
@@ -1,57 +1,139 @@
1
- use magnus::{class, prelude::*, Module, RArray, RClass, RModule, Value};
1
+ use magnus::{value::qnil, IntoValue, RArray, Value};
2
2
  use polars_core::prelude::*;
3
3
 
4
- use crate::{raise_err, RbPolarsErr, RbResult, RbSeries};
4
+ use crate::prelude::*;
5
+ use crate::RbSeries;
5
6
 
6
7
  impl RbSeries {
7
- /// For numeric types, this should only be called for Series with null types.
8
- /// This will cast to floats so that `nil = NAN`
9
- pub fn to_numo(&self) -> RbResult<Value> {
10
- let s = &self.series.borrow();
11
- match s.dtype() {
12
- DataType::String => {
13
- let ca = s.str().unwrap();
8
+ /// Convert this Series to a Ruby array.
9
+ /// This operation copies data.
10
+ pub fn to_a(&self) -> Value {
11
+ let series = &self.series.borrow();
14
12
 
15
- // TODO make more efficient
16
- let np_arr = RArray::from_iter(ca);
17
- class::object()
18
- .const_get::<_, RModule>("Numo")?
19
- .const_get::<_, RClass>("RObject")?
20
- .funcall("cast", (np_arr,))
21
- }
22
- dt if dt.is_numeric() => {
23
- if s.bit_repr_is_large() {
24
- let s = s.cast(&DataType::Float64).unwrap();
25
- let ca = s.f64().unwrap();
26
- // TODO make more efficient
27
- let np_arr = RArray::from_iter(ca.into_iter().map(|opt_v| match opt_v {
28
- Some(v) => v,
29
- None => f64::NAN,
30
- }));
31
- class::object()
32
- .const_get::<_, RModule>("Numo")?
33
- .const_get::<_, RClass>("DFloat")?
34
- .funcall("cast", (np_arr,))
35
- } else {
36
- let s = s.cast(&DataType::Float32).unwrap();
37
- let ca = s.f32().unwrap();
38
- // TODO make more efficient
39
- let np_arr = RArray::from_iter(ca.into_iter().map(|opt_v| match opt_v {
40
- Some(v) => v,
41
- None => f32::NAN,
42
- }));
43
- class::object()
44
- .const_get::<_, RModule>("Numo")?
45
- .const_get::<_, RClass>("SFloat")?
46
- .funcall("cast", (np_arr,))
47
- }
48
- }
49
- dt => {
50
- raise_err!(
51
- format!("'to_numo' not supported for dtype: {dt:?}"),
52
- ComputeError
53
- );
54
- }
13
+ fn to_a_recursive(series: &Series) -> Value {
14
+ let rblist = match series.dtype() {
15
+ DataType::Boolean => RArray::from_iter(series.bool().unwrap()).into_value(),
16
+ DataType::UInt8 => RArray::from_iter(series.u8().unwrap()).into_value(),
17
+ DataType::UInt16 => RArray::from_iter(series.u16().unwrap()).into_value(),
18
+ DataType::UInt32 => RArray::from_iter(series.u32().unwrap()).into_value(),
19
+ DataType::UInt64 => RArray::from_iter(series.u64().unwrap()).into_value(),
20
+ DataType::Int8 => RArray::from_iter(series.i8().unwrap()).into_value(),
21
+ DataType::Int16 => RArray::from_iter(series.i16().unwrap()).into_value(),
22
+ DataType::Int32 => RArray::from_iter(series.i32().unwrap()).into_value(),
23
+ DataType::Int64 => RArray::from_iter(series.i64().unwrap()).into_value(),
24
+ DataType::Float32 => RArray::from_iter(series.f32().unwrap()).into_value(),
25
+ DataType::Float64 => RArray::from_iter(series.f64().unwrap()).into_value(),
26
+ DataType::Categorical(_, _) | DataType::Enum(_, _) => {
27
+ RArray::from_iter(series.categorical().unwrap().iter_str()).into_value()
28
+ }
29
+ DataType::Object(_, _) => {
30
+ let v = RArray::with_capacity(series.len());
31
+ for i in 0..series.len() {
32
+ let obj: Option<&ObjectValue> = series.get_object(i).map(|any| any.into());
33
+ match obj {
34
+ Some(val) => v.push(val.to_object()).unwrap(),
35
+ None => v.push(qnil()).unwrap(),
36
+ };
37
+ }
38
+ v.into_value()
39
+ }
40
+ DataType::List(_) => {
41
+ let v = RArray::new();
42
+ let ca = series.list().unwrap();
43
+ for opt_s in unsafe { ca.amortized_iter() } {
44
+ match opt_s {
45
+ None => {
46
+ v.push(qnil()).unwrap();
47
+ }
48
+ Some(s) => {
49
+ let rblst = to_a_recursive(s.as_ref());
50
+ v.push(rblst).unwrap();
51
+ }
52
+ }
53
+ }
54
+ v.into_value()
55
+ }
56
+ DataType::Array(_, _) => {
57
+ let v = RArray::new();
58
+ let ca = series.array().unwrap();
59
+ for opt_s in ca.amortized_iter() {
60
+ match opt_s {
61
+ None => {
62
+ v.push(qnil()).unwrap();
63
+ }
64
+ Some(s) => {
65
+ let rblst = to_a_recursive(s.as_ref());
66
+ v.push(rblst).unwrap();
67
+ }
68
+ }
69
+ }
70
+ v.into_value()
71
+ }
72
+ DataType::Date => {
73
+ let ca = series.date().unwrap();
74
+ return Wrap(ca).into_value();
75
+ }
76
+ DataType::Time => {
77
+ let ca = series.time().unwrap();
78
+ return Wrap(ca).into_value();
79
+ }
80
+ DataType::Datetime(_, _) => {
81
+ let ca = series.datetime().unwrap();
82
+ return Wrap(ca).into_value();
83
+ }
84
+ DataType::Decimal(_, _) => {
85
+ let ca = series.decimal().unwrap();
86
+ return Wrap(ca).into_value();
87
+ }
88
+ DataType::String => {
89
+ let ca = series.str().unwrap();
90
+ return Wrap(ca).into_value();
91
+ }
92
+ DataType::Struct(_) => {
93
+ let ca = series.struct_().unwrap();
94
+ return Wrap(ca).into_value();
95
+ }
96
+ DataType::Duration(_) => {
97
+ let ca = series.duration().unwrap();
98
+ return Wrap(ca).into_value();
99
+ }
100
+ DataType::Binary => {
101
+ let ca = series.binary().unwrap();
102
+ return Wrap(ca).into_value();
103
+ }
104
+ DataType::Null => {
105
+ let null: Option<u8> = None;
106
+ let n = series.len();
107
+ let iter = std::iter::repeat(null).take(n);
108
+ use std::iter::{Repeat, Take};
109
+ struct NullIter {
110
+ iter: Take<Repeat<Option<u8>>>,
111
+ n: usize,
112
+ }
113
+ impl Iterator for NullIter {
114
+ type Item = Option<u8>;
115
+
116
+ fn next(&mut self) -> Option<Self::Item> {
117
+ self.iter.next()
118
+ }
119
+ fn size_hint(&self) -> (usize, Option<usize>) {
120
+ (self.n, Some(self.n))
121
+ }
122
+ }
123
+ impl ExactSizeIterator for NullIter {}
124
+
125
+ RArray::from_iter(NullIter { iter, n }).into_value()
126
+ }
127
+ DataType::Unknown(_) => {
128
+ panic!("to_a not implemented for unknown")
129
+ }
130
+ DataType::BinaryOffset => {
131
+ unreachable!()
132
+ }
133
+ };
134
+ rblist
55
135
  }
136
+
137
+ to_a_recursive(series)
56
138
  }
57
139
  }
@@ -5,7 +5,7 @@ mod construction;
5
5
  mod export;
6
6
  mod scatter;
7
7
 
8
- use magnus::{exception, prelude::*, value::qnil, Error, IntoValue, RArray, Value};
8
+ use magnus::{exception, prelude::*, Error, IntoValue, RArray, Value};
9
9
  use polars::prelude::*;
10
10
  use polars::series::IsSorted;
11
11
  use std::cell::RefCell;
@@ -233,8 +233,18 @@ impl RbSeries {
233
233
  }
234
234
  }
235
235
 
236
- pub fn sort(&self, descending: bool, nulls_last: bool) -> Self {
237
- (self.series.borrow_mut().sort(descending, nulls_last)).into()
236
+ pub fn sort(&self, descending: bool, nulls_last: bool, multithreaded: bool) -> RbResult<Self> {
237
+ Ok(self
238
+ .series
239
+ .borrow_mut()
240
+ .sort(
241
+ SortOptions::default()
242
+ .with_order_descending(descending)
243
+ .with_nulls_last(nulls_last)
244
+ .with_multithreaded(multithreaded),
245
+ )
246
+ .map_err(RbPolarsErr::from)?
247
+ .into())
238
248
  }
239
249
 
240
250
  pub fn value_counts(&self, sorted: bool) -> RbResult<RbDataFrame> {
@@ -315,136 +325,6 @@ impl RbSeries {
315
325
  self.series.borrow().len()
316
326
  }
317
327
 
318
- pub fn to_a(&self) -> Value {
319
- let series = &self.series.borrow();
320
-
321
- fn to_a_recursive(series: &Series) -> Value {
322
- let rblist = match series.dtype() {
323
- DataType::Boolean => RArray::from_iter(series.bool().unwrap()).into_value(),
324
- DataType::UInt8 => RArray::from_iter(series.u8().unwrap()).into_value(),
325
- DataType::UInt16 => RArray::from_iter(series.u16().unwrap()).into_value(),
326
- DataType::UInt32 => RArray::from_iter(series.u32().unwrap()).into_value(),
327
- DataType::UInt64 => RArray::from_iter(series.u64().unwrap()).into_value(),
328
- DataType::Int8 => RArray::from_iter(series.i8().unwrap()).into_value(),
329
- DataType::Int16 => RArray::from_iter(series.i16().unwrap()).into_value(),
330
- DataType::Int32 => RArray::from_iter(series.i32().unwrap()).into_value(),
331
- DataType::Int64 => RArray::from_iter(series.i64().unwrap()).into_value(),
332
- DataType::Float32 => RArray::from_iter(series.f32().unwrap()).into_value(),
333
- DataType::Float64 => RArray::from_iter(series.f64().unwrap()).into_value(),
334
- DataType::Categorical(_, _) | DataType::Enum(_, _) => {
335
- RArray::from_iter(series.categorical().unwrap().iter_str()).into_value()
336
- }
337
- DataType::Object(_, _) => {
338
- let v = RArray::with_capacity(series.len());
339
- for i in 0..series.len() {
340
- let obj: Option<&ObjectValue> = series.get_object(i).map(|any| any.into());
341
- match obj {
342
- Some(val) => v.push(val.to_object()).unwrap(),
343
- None => v.push(qnil()).unwrap(),
344
- };
345
- }
346
- v.into_value()
347
- }
348
- DataType::List(_) => {
349
- let v = RArray::new();
350
- let ca = series.list().unwrap();
351
- for opt_s in unsafe { ca.amortized_iter() } {
352
- match opt_s {
353
- None => {
354
- v.push(qnil()).unwrap();
355
- }
356
- Some(s) => {
357
- let rblst = to_a_recursive(s.as_ref());
358
- v.push(rblst).unwrap();
359
- }
360
- }
361
- }
362
- v.into_value()
363
- }
364
- DataType::Array(_, _) => {
365
- let v = RArray::new();
366
- let ca = series.array().unwrap();
367
- for opt_s in ca.amortized_iter() {
368
- match opt_s {
369
- None => {
370
- v.push(qnil()).unwrap();
371
- }
372
- Some(s) => {
373
- let rblst = to_a_recursive(s.as_ref());
374
- v.push(rblst).unwrap();
375
- }
376
- }
377
- }
378
- v.into_value()
379
- }
380
- DataType::Date => {
381
- let ca = series.date().unwrap();
382
- return Wrap(ca).into_value();
383
- }
384
- DataType::Time => {
385
- let ca = series.time().unwrap();
386
- return Wrap(ca).into_value();
387
- }
388
- DataType::Datetime(_, _) => {
389
- let ca = series.datetime().unwrap();
390
- return Wrap(ca).into_value();
391
- }
392
- DataType::Decimal(_, _) => {
393
- let ca = series.decimal().unwrap();
394
- return Wrap(ca).into_value();
395
- }
396
- DataType::String => {
397
- let ca = series.str().unwrap();
398
- return Wrap(ca).into_value();
399
- }
400
- DataType::Struct(_) => {
401
- let ca = series.struct_().unwrap();
402
- return Wrap(ca).into_value();
403
- }
404
- DataType::Duration(_) => {
405
- let ca = series.duration().unwrap();
406
- return Wrap(ca).into_value();
407
- }
408
- DataType::Binary => {
409
- let ca = series.binary().unwrap();
410
- return Wrap(ca).into_value();
411
- }
412
- DataType::Null => {
413
- let null: Option<u8> = None;
414
- let n = series.len();
415
- let iter = std::iter::repeat(null).take(n);
416
- use std::iter::{Repeat, Take};
417
- struct NullIter {
418
- iter: Take<Repeat<Option<u8>>>,
419
- n: usize,
420
- }
421
- impl Iterator for NullIter {
422
- type Item = Option<u8>;
423
-
424
- fn next(&mut self) -> Option<Self::Item> {
425
- self.iter.next()
426
- }
427
- fn size_hint(&self) -> (usize, Option<usize>) {
428
- (self.n, Some(self.n))
429
- }
430
- }
431
- impl ExactSizeIterator for NullIter {}
432
-
433
- RArray::from_iter(NullIter { iter, n }).into_value()
434
- }
435
- DataType::Unknown => {
436
- panic!("to_a not implemented for unknown")
437
- }
438
- DataType::BinaryOffset => {
439
- unreachable!()
440
- }
441
- };
442
- rblist
443
- }
444
-
445
- to_a_recursive(series)
446
- }
447
-
448
328
  pub fn clone(&self) -> Self {
449
329
  RbSeries::new(self.series.borrow().clone())
450
330
  }
@@ -333,6 +333,10 @@ module Polars
333
333
  #
334
334
  # @param index [Integer]
335
335
  # Index to return per sub-array
336
+ # @param null_on_oob [Boolean]
337
+ # Behavior if an index is out of bounds:
338
+ # true -> set as null
339
+ # false -> raise an error
336
340
  #
337
341
  # @return [Expr]
338
342
  #
@@ -353,9 +357,9 @@ module Polars
353
357
  # # │ [4, 5, 6] ┆ -2 ┆ 5 │
354
358
  # # │ [7, 8, 9] ┆ 4 ┆ null │
355
359
  # # └───────────────┴─────┴──────┘
356
- def get(index)
360
+ def get(index, null_on_oob: true)
357
361
  index = Utils.parse_as_expression(index)
358
- Utils.wrap_expr(_rbexpr.arr_get(index))
362
+ Utils.wrap_expr(_rbexpr.arr_get(index, null_on_oob))
359
363
  end
360
364
 
361
365
  # Get the first value of the sub-arrays.
@@ -13,6 +13,7 @@ module Polars
13
13
  skip_rows: 0,
14
14
  dtypes: nil,
15
15
  null_values: nil,
16
+ missing_utf8_is_empty_string: false,
16
17
  ignore_errors: false,
17
18
  parse_dates: false,
18
19
  n_threads: nil,
@@ -27,10 +28,13 @@ module Polars
27
28
  row_count_offset: 0,
28
29
  sample_size: 1024,
29
30
  eol_char: "\n",
30
- new_columns: nil
31
+ new_columns: nil,
32
+ raise_if_empty: true,
33
+ truncate_ragged_lines: false,
34
+ decimal_comma: false
31
35
  )
32
36
  if Utils.pathlike?(file)
33
- path = Utils.normalise_filepath(file)
37
+ path = Utils.normalize_filepath(file)
34
38
  end
35
39
 
36
40
  dtype_list = nil
@@ -71,11 +75,15 @@ module Polars
71
75
  comment_char,
72
76
  quote_char,
73
77
  processed_null_values,
78
+ missing_utf8_is_empty_string,
74
79
  parse_dates,
75
80
  skip_rows_after_header,
76
81
  Utils._prepare_row_count_args(row_count_name, row_count_offset),
77
82
  sample_size,
78
- eol_char
83
+ eol_char,
84
+ raise_if_empty,
85
+ truncate_ragged_lines,
86
+ decimal_comma
79
87
  )
80
88
  self.new_columns = new_columns
81
89
  end
@@ -27,7 +27,12 @@ module Polars
27
27
  # # │ 2 ┆ 4 │
28
28
  # # └─────┴─────┘
29
29
  def from_hash(data, schema: nil, columns: nil)
30
- DataFrame._from_hash(data, schema: schema || columns)
30
+ Utils.wrap_df(
31
+ DataFrame.hash_to_rbdf(
32
+ data,
33
+ schema: schema || columns
34
+ )
35
+ )
31
36
  end
32
37
 
33
38
  # Construct a DataFrame from a sequence of dictionaries. This operation clones data.