polars-df 0.9.0 → 0.11.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (71) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +23 -0
  3. data/Cargo.lock +144 -57
  4. data/README.md +7 -6
  5. data/ext/polars/Cargo.toml +10 -6
  6. data/ext/polars/src/batched_csv.rs +53 -50
  7. data/ext/polars/src/conversion/anyvalue.rs +3 -2
  8. data/ext/polars/src/conversion/mod.rs +31 -67
  9. data/ext/polars/src/dataframe/construction.rs +186 -0
  10. data/ext/polars/src/dataframe/export.rs +48 -0
  11. data/ext/polars/src/dataframe/general.rs +607 -0
  12. data/ext/polars/src/dataframe/io.rs +463 -0
  13. data/ext/polars/src/dataframe/mod.rs +26 -0
  14. data/ext/polars/src/expr/array.rs +6 -2
  15. data/ext/polars/src/expr/datetime.rs +13 -4
  16. data/ext/polars/src/expr/general.rs +50 -9
  17. data/ext/polars/src/expr/list.rs +6 -2
  18. data/ext/polars/src/expr/rolling.rs +185 -69
  19. data/ext/polars/src/expr/string.rs +12 -33
  20. data/ext/polars/src/file.rs +158 -11
  21. data/ext/polars/src/functions/lazy.rs +20 -3
  22. data/ext/polars/src/functions/range.rs +74 -0
  23. data/ext/polars/src/functions/whenthen.rs +47 -17
  24. data/ext/polars/src/interop/mod.rs +1 -0
  25. data/ext/polars/src/interop/numo/mod.rs +2 -0
  26. data/ext/polars/src/interop/numo/to_numo_df.rs +23 -0
  27. data/ext/polars/src/interop/numo/to_numo_series.rs +60 -0
  28. data/ext/polars/src/lazyframe/mod.rs +111 -56
  29. data/ext/polars/src/lib.rs +68 -34
  30. data/ext/polars/src/map/dataframe.rs +17 -9
  31. data/ext/polars/src/map/lazy.rs +5 -25
  32. data/ext/polars/src/map/series.rs +7 -1
  33. data/ext/polars/src/series/aggregation.rs +47 -30
  34. data/ext/polars/src/series/export.rs +131 -49
  35. data/ext/polars/src/series/mod.rs +13 -133
  36. data/lib/polars/array_expr.rb +6 -2
  37. data/lib/polars/batched_csv_reader.rb +11 -3
  38. data/lib/polars/convert.rb +6 -1
  39. data/lib/polars/data_frame.rb +225 -370
  40. data/lib/polars/date_time_expr.rb +11 -4
  41. data/lib/polars/date_time_name_space.rb +14 -4
  42. data/lib/polars/dynamic_group_by.rb +2 -2
  43. data/lib/polars/exceptions.rb +4 -0
  44. data/lib/polars/expr.rb +1171 -54
  45. data/lib/polars/functions/lazy.rb +3 -3
  46. data/lib/polars/functions/range/date_range.rb +92 -0
  47. data/lib/polars/functions/range/datetime_range.rb +149 -0
  48. data/lib/polars/functions/range/time_range.rb +141 -0
  49. data/lib/polars/functions/whenthen.rb +74 -5
  50. data/lib/polars/group_by.rb +88 -23
  51. data/lib/polars/io/avro.rb +24 -0
  52. data/lib/polars/{io.rb → io/csv.rb} +307 -489
  53. data/lib/polars/io/database.rb +73 -0
  54. data/lib/polars/io/ipc.rb +247 -0
  55. data/lib/polars/io/json.rb +18 -0
  56. data/lib/polars/io/ndjson.rb +69 -0
  57. data/lib/polars/io/parquet.rb +226 -0
  58. data/lib/polars/lazy_frame.rb +55 -195
  59. data/lib/polars/lazy_group_by.rb +100 -3
  60. data/lib/polars/list_expr.rb +6 -2
  61. data/lib/polars/rolling_group_by.rb +2 -2
  62. data/lib/polars/series.rb +14 -12
  63. data/lib/polars/string_expr.rb +38 -36
  64. data/lib/polars/utils.rb +89 -1
  65. data/lib/polars/version.rb +1 -1
  66. data/lib/polars/whenthen.rb +83 -0
  67. data/lib/polars.rb +10 -3
  68. metadata +23 -8
  69. data/ext/polars/src/dataframe.rs +0 -1182
  70. data/lib/polars/when.rb +0 -16
  71. data/lib/polars/when_then.rb +0 -19
@@ -1,57 +1,139 @@
1
- use magnus::{class, prelude::*, Module, RArray, RClass, RModule, Value};
1
+ use magnus::{value::qnil, IntoValue, RArray, Value};
2
2
  use polars_core::prelude::*;
3
3
 
4
- use crate::{raise_err, RbPolarsErr, RbResult, RbSeries};
4
+ use crate::prelude::*;
5
+ use crate::RbSeries;
5
6
 
6
7
  impl RbSeries {
7
- /// For numeric types, this should only be called for Series with null types.
8
- /// This will cast to floats so that `nil = NAN`
9
- pub fn to_numo(&self) -> RbResult<Value> {
10
- let s = &self.series.borrow();
11
- match s.dtype() {
12
- DataType::String => {
13
- let ca = s.str().unwrap();
8
+ /// Convert this Series to a Ruby array.
9
+ /// This operation copies data.
10
+ pub fn to_a(&self) -> Value {
11
+ let series = &self.series.borrow();
14
12
 
15
- // TODO make more efficient
16
- let np_arr = RArray::from_iter(ca);
17
- class::object()
18
- .const_get::<_, RModule>("Numo")?
19
- .const_get::<_, RClass>("RObject")?
20
- .funcall("cast", (np_arr,))
21
- }
22
- dt if dt.is_numeric() => {
23
- if s.bit_repr_is_large() {
24
- let s = s.cast(&DataType::Float64).unwrap();
25
- let ca = s.f64().unwrap();
26
- // TODO make more efficient
27
- let np_arr = RArray::from_iter(ca.into_iter().map(|opt_v| match opt_v {
28
- Some(v) => v,
29
- None => f64::NAN,
30
- }));
31
- class::object()
32
- .const_get::<_, RModule>("Numo")?
33
- .const_get::<_, RClass>("DFloat")?
34
- .funcall("cast", (np_arr,))
35
- } else {
36
- let s = s.cast(&DataType::Float32).unwrap();
37
- let ca = s.f32().unwrap();
38
- // TODO make more efficient
39
- let np_arr = RArray::from_iter(ca.into_iter().map(|opt_v| match opt_v {
40
- Some(v) => v,
41
- None => f32::NAN,
42
- }));
43
- class::object()
44
- .const_get::<_, RModule>("Numo")?
45
- .const_get::<_, RClass>("SFloat")?
46
- .funcall("cast", (np_arr,))
47
- }
48
- }
49
- dt => {
50
- raise_err!(
51
- format!("'to_numo' not supported for dtype: {dt:?}"),
52
- ComputeError
53
- );
54
- }
13
+ fn to_a_recursive(series: &Series) -> Value {
14
+ let rblist = match series.dtype() {
15
+ DataType::Boolean => RArray::from_iter(series.bool().unwrap()).into_value(),
16
+ DataType::UInt8 => RArray::from_iter(series.u8().unwrap()).into_value(),
17
+ DataType::UInt16 => RArray::from_iter(series.u16().unwrap()).into_value(),
18
+ DataType::UInt32 => RArray::from_iter(series.u32().unwrap()).into_value(),
19
+ DataType::UInt64 => RArray::from_iter(series.u64().unwrap()).into_value(),
20
+ DataType::Int8 => RArray::from_iter(series.i8().unwrap()).into_value(),
21
+ DataType::Int16 => RArray::from_iter(series.i16().unwrap()).into_value(),
22
+ DataType::Int32 => RArray::from_iter(series.i32().unwrap()).into_value(),
23
+ DataType::Int64 => RArray::from_iter(series.i64().unwrap()).into_value(),
24
+ DataType::Float32 => RArray::from_iter(series.f32().unwrap()).into_value(),
25
+ DataType::Float64 => RArray::from_iter(series.f64().unwrap()).into_value(),
26
+ DataType::Categorical(_, _) | DataType::Enum(_, _) => {
27
+ RArray::from_iter(series.categorical().unwrap().iter_str()).into_value()
28
+ }
29
+ DataType::Object(_, _) => {
30
+ let v = RArray::with_capacity(series.len());
31
+ for i in 0..series.len() {
32
+ let obj: Option<&ObjectValue> = series.get_object(i).map(|any| any.into());
33
+ match obj {
34
+ Some(val) => v.push(val.to_object()).unwrap(),
35
+ None => v.push(qnil()).unwrap(),
36
+ };
37
+ }
38
+ v.into_value()
39
+ }
40
+ DataType::List(_) => {
41
+ let v = RArray::new();
42
+ let ca = series.list().unwrap();
43
+ for opt_s in unsafe { ca.amortized_iter() } {
44
+ match opt_s {
45
+ None => {
46
+ v.push(qnil()).unwrap();
47
+ }
48
+ Some(s) => {
49
+ let rblst = to_a_recursive(s.as_ref());
50
+ v.push(rblst).unwrap();
51
+ }
52
+ }
53
+ }
54
+ v.into_value()
55
+ }
56
+ DataType::Array(_, _) => {
57
+ let v = RArray::new();
58
+ let ca = series.array().unwrap();
59
+ for opt_s in ca.amortized_iter() {
60
+ match opt_s {
61
+ None => {
62
+ v.push(qnil()).unwrap();
63
+ }
64
+ Some(s) => {
65
+ let rblst = to_a_recursive(s.as_ref());
66
+ v.push(rblst).unwrap();
67
+ }
68
+ }
69
+ }
70
+ v.into_value()
71
+ }
72
+ DataType::Date => {
73
+ let ca = series.date().unwrap();
74
+ return Wrap(ca).into_value();
75
+ }
76
+ DataType::Time => {
77
+ let ca = series.time().unwrap();
78
+ return Wrap(ca).into_value();
79
+ }
80
+ DataType::Datetime(_, _) => {
81
+ let ca = series.datetime().unwrap();
82
+ return Wrap(ca).into_value();
83
+ }
84
+ DataType::Decimal(_, _) => {
85
+ let ca = series.decimal().unwrap();
86
+ return Wrap(ca).into_value();
87
+ }
88
+ DataType::String => {
89
+ let ca = series.str().unwrap();
90
+ return Wrap(ca).into_value();
91
+ }
92
+ DataType::Struct(_) => {
93
+ let ca = series.struct_().unwrap();
94
+ return Wrap(ca).into_value();
95
+ }
96
+ DataType::Duration(_) => {
97
+ let ca = series.duration().unwrap();
98
+ return Wrap(ca).into_value();
99
+ }
100
+ DataType::Binary => {
101
+ let ca = series.binary().unwrap();
102
+ return Wrap(ca).into_value();
103
+ }
104
+ DataType::Null => {
105
+ let null: Option<u8> = None;
106
+ let n = series.len();
107
+ let iter = std::iter::repeat(null).take(n);
108
+ use std::iter::{Repeat, Take};
109
+ struct NullIter {
110
+ iter: Take<Repeat<Option<u8>>>,
111
+ n: usize,
112
+ }
113
+ impl Iterator for NullIter {
114
+ type Item = Option<u8>;
115
+
116
+ fn next(&mut self) -> Option<Self::Item> {
117
+ self.iter.next()
118
+ }
119
+ fn size_hint(&self) -> (usize, Option<usize>) {
120
+ (self.n, Some(self.n))
121
+ }
122
+ }
123
+ impl ExactSizeIterator for NullIter {}
124
+
125
+ RArray::from_iter(NullIter { iter, n }).into_value()
126
+ }
127
+ DataType::Unknown(_) => {
128
+ panic!("to_a not implemented for unknown")
129
+ }
130
+ DataType::BinaryOffset => {
131
+ unreachable!()
132
+ }
133
+ };
134
+ rblist
55
135
  }
136
+
137
+ to_a_recursive(series)
56
138
  }
57
139
  }
@@ -5,7 +5,7 @@ mod construction;
5
5
  mod export;
6
6
  mod scatter;
7
7
 
8
- use magnus::{exception, prelude::*, value::qnil, Error, IntoValue, RArray, Value};
8
+ use magnus::{exception, prelude::*, Error, IntoValue, RArray, Value};
9
9
  use polars::prelude::*;
10
10
  use polars::series::IsSorted;
11
11
  use std::cell::RefCell;
@@ -233,8 +233,18 @@ impl RbSeries {
233
233
  }
234
234
  }
235
235
 
236
- pub fn sort(&self, descending: bool, nulls_last: bool) -> Self {
237
- (self.series.borrow_mut().sort(descending, nulls_last)).into()
236
+ pub fn sort(&self, descending: bool, nulls_last: bool, multithreaded: bool) -> RbResult<Self> {
237
+ Ok(self
238
+ .series
239
+ .borrow_mut()
240
+ .sort(
241
+ SortOptions::default()
242
+ .with_order_descending(descending)
243
+ .with_nulls_last(nulls_last)
244
+ .with_multithreaded(multithreaded),
245
+ )
246
+ .map_err(RbPolarsErr::from)?
247
+ .into())
238
248
  }
239
249
 
240
250
  pub fn value_counts(&self, sorted: bool) -> RbResult<RbDataFrame> {
@@ -315,136 +325,6 @@ impl RbSeries {
315
325
  self.series.borrow().len()
316
326
  }
317
327
 
318
- pub fn to_a(&self) -> Value {
319
- let series = &self.series.borrow();
320
-
321
- fn to_a_recursive(series: &Series) -> Value {
322
- let rblist = match series.dtype() {
323
- DataType::Boolean => RArray::from_iter(series.bool().unwrap()).into_value(),
324
- DataType::UInt8 => RArray::from_iter(series.u8().unwrap()).into_value(),
325
- DataType::UInt16 => RArray::from_iter(series.u16().unwrap()).into_value(),
326
- DataType::UInt32 => RArray::from_iter(series.u32().unwrap()).into_value(),
327
- DataType::UInt64 => RArray::from_iter(series.u64().unwrap()).into_value(),
328
- DataType::Int8 => RArray::from_iter(series.i8().unwrap()).into_value(),
329
- DataType::Int16 => RArray::from_iter(series.i16().unwrap()).into_value(),
330
- DataType::Int32 => RArray::from_iter(series.i32().unwrap()).into_value(),
331
- DataType::Int64 => RArray::from_iter(series.i64().unwrap()).into_value(),
332
- DataType::Float32 => RArray::from_iter(series.f32().unwrap()).into_value(),
333
- DataType::Float64 => RArray::from_iter(series.f64().unwrap()).into_value(),
334
- DataType::Categorical(_, _) | DataType::Enum(_, _) => {
335
- RArray::from_iter(series.categorical().unwrap().iter_str()).into_value()
336
- }
337
- DataType::Object(_, _) => {
338
- let v = RArray::with_capacity(series.len());
339
- for i in 0..series.len() {
340
- let obj: Option<&ObjectValue> = series.get_object(i).map(|any| any.into());
341
- match obj {
342
- Some(val) => v.push(val.to_object()).unwrap(),
343
- None => v.push(qnil()).unwrap(),
344
- };
345
- }
346
- v.into_value()
347
- }
348
- DataType::List(_) => {
349
- let v = RArray::new();
350
- let ca = series.list().unwrap();
351
- for opt_s in unsafe { ca.amortized_iter() } {
352
- match opt_s {
353
- None => {
354
- v.push(qnil()).unwrap();
355
- }
356
- Some(s) => {
357
- let rblst = to_a_recursive(s.as_ref());
358
- v.push(rblst).unwrap();
359
- }
360
- }
361
- }
362
- v.into_value()
363
- }
364
- DataType::Array(_, _) => {
365
- let v = RArray::new();
366
- let ca = series.array().unwrap();
367
- for opt_s in ca.amortized_iter() {
368
- match opt_s {
369
- None => {
370
- v.push(qnil()).unwrap();
371
- }
372
- Some(s) => {
373
- let rblst = to_a_recursive(s.as_ref());
374
- v.push(rblst).unwrap();
375
- }
376
- }
377
- }
378
- v.into_value()
379
- }
380
- DataType::Date => {
381
- let ca = series.date().unwrap();
382
- return Wrap(ca).into_value();
383
- }
384
- DataType::Time => {
385
- let ca = series.time().unwrap();
386
- return Wrap(ca).into_value();
387
- }
388
- DataType::Datetime(_, _) => {
389
- let ca = series.datetime().unwrap();
390
- return Wrap(ca).into_value();
391
- }
392
- DataType::Decimal(_, _) => {
393
- let ca = series.decimal().unwrap();
394
- return Wrap(ca).into_value();
395
- }
396
- DataType::String => {
397
- let ca = series.str().unwrap();
398
- return Wrap(ca).into_value();
399
- }
400
- DataType::Struct(_) => {
401
- let ca = series.struct_().unwrap();
402
- return Wrap(ca).into_value();
403
- }
404
- DataType::Duration(_) => {
405
- let ca = series.duration().unwrap();
406
- return Wrap(ca).into_value();
407
- }
408
- DataType::Binary => {
409
- let ca = series.binary().unwrap();
410
- return Wrap(ca).into_value();
411
- }
412
- DataType::Null => {
413
- let null: Option<u8> = None;
414
- let n = series.len();
415
- let iter = std::iter::repeat(null).take(n);
416
- use std::iter::{Repeat, Take};
417
- struct NullIter {
418
- iter: Take<Repeat<Option<u8>>>,
419
- n: usize,
420
- }
421
- impl Iterator for NullIter {
422
- type Item = Option<u8>;
423
-
424
- fn next(&mut self) -> Option<Self::Item> {
425
- self.iter.next()
426
- }
427
- fn size_hint(&self) -> (usize, Option<usize>) {
428
- (self.n, Some(self.n))
429
- }
430
- }
431
- impl ExactSizeIterator for NullIter {}
432
-
433
- RArray::from_iter(NullIter { iter, n }).into_value()
434
- }
435
- DataType::Unknown => {
436
- panic!("to_a not implemented for unknown")
437
- }
438
- DataType::BinaryOffset => {
439
- unreachable!()
440
- }
441
- };
442
- rblist
443
- }
444
-
445
- to_a_recursive(series)
446
- }
447
-
448
328
  pub fn clone(&self) -> Self {
449
329
  RbSeries::new(self.series.borrow().clone())
450
330
  }
@@ -333,6 +333,10 @@ module Polars
333
333
  #
334
334
  # @param index [Integer]
335
335
  # Index to return per sub-array
336
+ # @param null_on_oob [Boolean]
337
+ # Behavior if an index is out of bounds:
338
+ # true -> set as null
339
+ # false -> raise an error
336
340
  #
337
341
  # @return [Expr]
338
342
  #
@@ -353,9 +357,9 @@ module Polars
353
357
  # # │ [4, 5, 6] ┆ -2 ┆ 5 │
354
358
  # # │ [7, 8, 9] ┆ 4 ┆ null │
355
359
  # # └───────────────┴─────┴──────┘
356
- def get(index)
360
+ def get(index, null_on_oob: true)
357
361
  index = Utils.parse_as_expression(index)
358
- Utils.wrap_expr(_rbexpr.arr_get(index))
362
+ Utils.wrap_expr(_rbexpr.arr_get(index, null_on_oob))
359
363
  end
360
364
 
361
365
  # Get the first value of the sub-arrays.
@@ -13,6 +13,7 @@ module Polars
13
13
  skip_rows: 0,
14
14
  dtypes: nil,
15
15
  null_values: nil,
16
+ missing_utf8_is_empty_string: false,
16
17
  ignore_errors: false,
17
18
  parse_dates: false,
18
19
  n_threads: nil,
@@ -27,10 +28,13 @@ module Polars
27
28
  row_count_offset: 0,
28
29
  sample_size: 1024,
29
30
  eol_char: "\n",
30
- new_columns: nil
31
+ new_columns: nil,
32
+ raise_if_empty: true,
33
+ truncate_ragged_lines: false,
34
+ decimal_comma: false
31
35
  )
32
36
  if Utils.pathlike?(file)
33
- path = Utils.normalise_filepath(file)
37
+ path = Utils.normalize_filepath(file)
34
38
  end
35
39
 
36
40
  dtype_list = nil
@@ -71,11 +75,15 @@ module Polars
71
75
  comment_char,
72
76
  quote_char,
73
77
  processed_null_values,
78
+ missing_utf8_is_empty_string,
74
79
  parse_dates,
75
80
  skip_rows_after_header,
76
81
  Utils._prepare_row_count_args(row_count_name, row_count_offset),
77
82
  sample_size,
78
- eol_char
83
+ eol_char,
84
+ raise_if_empty,
85
+ truncate_ragged_lines,
86
+ decimal_comma
79
87
  )
80
88
  self.new_columns = new_columns
81
89
  end
@@ -27,7 +27,12 @@ module Polars
27
27
  # # │ 2 ┆ 4 │
28
28
  # # └─────┴─────┘
29
29
  def from_hash(data, schema: nil, columns: nil)
30
- DataFrame._from_hash(data, schema: schema || columns)
30
+ Utils.wrap_df(
31
+ DataFrame.hash_to_rbdf(
32
+ data,
33
+ schema: schema || columns
34
+ )
35
+ )
31
36
  end
32
37
 
33
38
  # Construct a DataFrame from a sequence of dictionaries. This operation clones data.