polars-df 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +11 -0
  3. data/Cargo.lock +90 -48
  4. data/README.md +6 -6
  5. data/ext/polars/Cargo.toml +7 -5
  6. data/ext/polars/src/batched_csv.rs +53 -52
  7. data/ext/polars/src/conversion/mod.rs +13 -60
  8. data/ext/polars/src/dataframe/construction.rs +186 -0
  9. data/ext/polars/src/dataframe/export.rs +48 -0
  10. data/ext/polars/src/dataframe/general.rs +607 -0
  11. data/ext/polars/src/dataframe/io.rs +463 -0
  12. data/ext/polars/src/dataframe/mod.rs +26 -0
  13. data/ext/polars/src/expr/datetime.rs +6 -2
  14. data/ext/polars/src/expr/general.rs +28 -6
  15. data/ext/polars/src/expr/rolling.rs +185 -69
  16. data/ext/polars/src/expr/string.rs +9 -30
  17. data/ext/polars/src/functions/lazy.rs +2 -0
  18. data/ext/polars/src/functions/range.rs +74 -0
  19. data/ext/polars/src/interop/mod.rs +1 -0
  20. data/ext/polars/src/interop/numo/mod.rs +2 -0
  21. data/ext/polars/src/interop/numo/to_numo_df.rs +23 -0
  22. data/ext/polars/src/interop/numo/to_numo_series.rs +60 -0
  23. data/ext/polars/src/lazyframe/mod.rs +54 -38
  24. data/ext/polars/src/lib.rs +46 -21
  25. data/ext/polars/src/map/lazy.rs +5 -25
  26. data/ext/polars/src/map/series.rs +7 -1
  27. data/ext/polars/src/series/aggregation.rs +47 -30
  28. data/ext/polars/src/series/export.rs +131 -49
  29. data/ext/polars/src/series/mod.rs +1 -131
  30. data/lib/polars/batched_csv_reader.rb +9 -3
  31. data/lib/polars/convert.rb +6 -1
  32. data/lib/polars/data_frame.rb +83 -302
  33. data/lib/polars/date_time_expr.rb +1 -0
  34. data/lib/polars/date_time_name_space.rb +5 -1
  35. data/lib/polars/dynamic_group_by.rb +2 -2
  36. data/lib/polars/exceptions.rb +4 -0
  37. data/lib/polars/expr.rb +1134 -20
  38. data/lib/polars/functions/range/date_range.rb +92 -0
  39. data/lib/polars/functions/range/datetime_range.rb +149 -0
  40. data/lib/polars/functions/range/time_range.rb +141 -0
  41. data/lib/polars/group_by.rb +88 -23
  42. data/lib/polars/io/avro.rb +24 -0
  43. data/lib/polars/{io.rb → io/csv.rb} +296 -490
  44. data/lib/polars/io/database.rb +73 -0
  45. data/lib/polars/io/ipc.rb +247 -0
  46. data/lib/polars/io/json.rb +18 -0
  47. data/lib/polars/io/ndjson.rb +69 -0
  48. data/lib/polars/io/parquet.rb +226 -0
  49. data/lib/polars/lazy_frame.rb +23 -166
  50. data/lib/polars/lazy_group_by.rb +100 -3
  51. data/lib/polars/rolling_group_by.rb +2 -2
  52. data/lib/polars/series.rb +2 -2
  53. data/lib/polars/string_expr.rb +37 -36
  54. data/lib/polars/utils.rb +35 -1
  55. data/lib/polars/version.rb +1 -1
  56. data/lib/polars.rb +9 -1
  57. metadata +21 -5
  58. data/ext/polars/src/dataframe.rs +0 -1208
@@ -1,57 +1,139 @@
1
- use magnus::{class, prelude::*, Module, RArray, RClass, RModule, Value};
1
+ use magnus::{value::qnil, IntoValue, RArray, Value};
2
2
  use polars_core::prelude::*;
3
3
 
4
- use crate::{raise_err, RbPolarsErr, RbResult, RbSeries};
4
+ use crate::prelude::*;
5
+ use crate::RbSeries;
5
6
 
6
7
  impl RbSeries {
7
- /// For numeric types, this should only be called for Series with null types.
8
- /// This will cast to floats so that `nil = NAN`
9
- pub fn to_numo(&self) -> RbResult<Value> {
10
- let s = &self.series.borrow();
11
- match s.dtype() {
12
- DataType::String => {
13
- let ca = s.str().unwrap();
8
+ /// Convert this Series to a Ruby array.
9
+ /// This operation copies data.
10
+ pub fn to_a(&self) -> Value {
11
+ let series = &self.series.borrow();
14
12
 
15
- // TODO make more efficient
16
- let np_arr = RArray::from_iter(ca);
17
- class::object()
18
- .const_get::<_, RModule>("Numo")?
19
- .const_get::<_, RClass>("RObject")?
20
- .funcall("cast", (np_arr,))
21
- }
22
- dt if dt.is_numeric() => {
23
- if s.bit_repr_is_large() {
24
- let s = s.cast(&DataType::Float64).unwrap();
25
- let ca = s.f64().unwrap();
26
- // TODO make more efficient
27
- let np_arr = RArray::from_iter(ca.into_iter().map(|opt_v| match opt_v {
28
- Some(v) => v,
29
- None => f64::NAN,
30
- }));
31
- class::object()
32
- .const_get::<_, RModule>("Numo")?
33
- .const_get::<_, RClass>("DFloat")?
34
- .funcall("cast", (np_arr,))
35
- } else {
36
- let s = s.cast(&DataType::Float32).unwrap();
37
- let ca = s.f32().unwrap();
38
- // TODO make more efficient
39
- let np_arr = RArray::from_iter(ca.into_iter().map(|opt_v| match opt_v {
40
- Some(v) => v,
41
- None => f32::NAN,
42
- }));
43
- class::object()
44
- .const_get::<_, RModule>("Numo")?
45
- .const_get::<_, RClass>("SFloat")?
46
- .funcall("cast", (np_arr,))
47
- }
48
- }
49
- dt => {
50
- raise_err!(
51
- format!("'to_numo' not supported for dtype: {dt:?}"),
52
- ComputeError
53
- );
54
- }
13
+ fn to_a_recursive(series: &Series) -> Value {
14
+ let rblist = match series.dtype() {
15
+ DataType::Boolean => RArray::from_iter(series.bool().unwrap()).into_value(),
16
+ DataType::UInt8 => RArray::from_iter(series.u8().unwrap()).into_value(),
17
+ DataType::UInt16 => RArray::from_iter(series.u16().unwrap()).into_value(),
18
+ DataType::UInt32 => RArray::from_iter(series.u32().unwrap()).into_value(),
19
+ DataType::UInt64 => RArray::from_iter(series.u64().unwrap()).into_value(),
20
+ DataType::Int8 => RArray::from_iter(series.i8().unwrap()).into_value(),
21
+ DataType::Int16 => RArray::from_iter(series.i16().unwrap()).into_value(),
22
+ DataType::Int32 => RArray::from_iter(series.i32().unwrap()).into_value(),
23
+ DataType::Int64 => RArray::from_iter(series.i64().unwrap()).into_value(),
24
+ DataType::Float32 => RArray::from_iter(series.f32().unwrap()).into_value(),
25
+ DataType::Float64 => RArray::from_iter(series.f64().unwrap()).into_value(),
26
+ DataType::Categorical(_, _) | DataType::Enum(_, _) => {
27
+ RArray::from_iter(series.categorical().unwrap().iter_str()).into_value()
28
+ }
29
+ DataType::Object(_, _) => {
30
+ let v = RArray::with_capacity(series.len());
31
+ for i in 0..series.len() {
32
+ let obj: Option<&ObjectValue> = series.get_object(i).map(|any| any.into());
33
+ match obj {
34
+ Some(val) => v.push(val.to_object()).unwrap(),
35
+ None => v.push(qnil()).unwrap(),
36
+ };
37
+ }
38
+ v.into_value()
39
+ }
40
+ DataType::List(_) => {
41
+ let v = RArray::new();
42
+ let ca = series.list().unwrap();
43
+ for opt_s in unsafe { ca.amortized_iter() } {
44
+ match opt_s {
45
+ None => {
46
+ v.push(qnil()).unwrap();
47
+ }
48
+ Some(s) => {
49
+ let rblst = to_a_recursive(s.as_ref());
50
+ v.push(rblst).unwrap();
51
+ }
52
+ }
53
+ }
54
+ v.into_value()
55
+ }
56
+ DataType::Array(_, _) => {
57
+ let v = RArray::new();
58
+ let ca = series.array().unwrap();
59
+ for opt_s in ca.amortized_iter() {
60
+ match opt_s {
61
+ None => {
62
+ v.push(qnil()).unwrap();
63
+ }
64
+ Some(s) => {
65
+ let rblst = to_a_recursive(s.as_ref());
66
+ v.push(rblst).unwrap();
67
+ }
68
+ }
69
+ }
70
+ v.into_value()
71
+ }
72
+ DataType::Date => {
73
+ let ca = series.date().unwrap();
74
+ return Wrap(ca).into_value();
75
+ }
76
+ DataType::Time => {
77
+ let ca = series.time().unwrap();
78
+ return Wrap(ca).into_value();
79
+ }
80
+ DataType::Datetime(_, _) => {
81
+ let ca = series.datetime().unwrap();
82
+ return Wrap(ca).into_value();
83
+ }
84
+ DataType::Decimal(_, _) => {
85
+ let ca = series.decimal().unwrap();
86
+ return Wrap(ca).into_value();
87
+ }
88
+ DataType::String => {
89
+ let ca = series.str().unwrap();
90
+ return Wrap(ca).into_value();
91
+ }
92
+ DataType::Struct(_) => {
93
+ let ca = series.struct_().unwrap();
94
+ return Wrap(ca).into_value();
95
+ }
96
+ DataType::Duration(_) => {
97
+ let ca = series.duration().unwrap();
98
+ return Wrap(ca).into_value();
99
+ }
100
+ DataType::Binary => {
101
+ let ca = series.binary().unwrap();
102
+ return Wrap(ca).into_value();
103
+ }
104
+ DataType::Null => {
105
+ let null: Option<u8> = None;
106
+ let n = series.len();
107
+ let iter = std::iter::repeat(null).take(n);
108
+ use std::iter::{Repeat, Take};
109
+ struct NullIter {
110
+ iter: Take<Repeat<Option<u8>>>,
111
+ n: usize,
112
+ }
113
+ impl Iterator for NullIter {
114
+ type Item = Option<u8>;
115
+
116
+ fn next(&mut self) -> Option<Self::Item> {
117
+ self.iter.next()
118
+ }
119
+ fn size_hint(&self) -> (usize, Option<usize>) {
120
+ (self.n, Some(self.n))
121
+ }
122
+ }
123
+ impl ExactSizeIterator for NullIter {}
124
+
125
+ RArray::from_iter(NullIter { iter, n }).into_value()
126
+ }
127
+ DataType::Unknown(_) => {
128
+ panic!("to_a not implemented for unknown")
129
+ }
130
+ DataType::BinaryOffset => {
131
+ unreachable!()
132
+ }
133
+ };
134
+ rblist
55
135
  }
136
+
137
+ to_a_recursive(series)
56
138
  }
57
139
  }
@@ -5,7 +5,7 @@ mod construction;
5
5
  mod export;
6
6
  mod scatter;
7
7
 
8
- use magnus::{exception, prelude::*, value::qnil, Error, IntoValue, RArray, Value};
8
+ use magnus::{exception, prelude::*, Error, IntoValue, RArray, Value};
9
9
  use polars::prelude::*;
10
10
  use polars::series::IsSorted;
11
11
  use std::cell::RefCell;
@@ -325,136 +325,6 @@ impl RbSeries {
325
325
  self.series.borrow().len()
326
326
  }
327
327
 
328
- pub fn to_a(&self) -> Value {
329
- let series = &self.series.borrow();
330
-
331
- fn to_a_recursive(series: &Series) -> Value {
332
- let rblist = match series.dtype() {
333
- DataType::Boolean => RArray::from_iter(series.bool().unwrap()).into_value(),
334
- DataType::UInt8 => RArray::from_iter(series.u8().unwrap()).into_value(),
335
- DataType::UInt16 => RArray::from_iter(series.u16().unwrap()).into_value(),
336
- DataType::UInt32 => RArray::from_iter(series.u32().unwrap()).into_value(),
337
- DataType::UInt64 => RArray::from_iter(series.u64().unwrap()).into_value(),
338
- DataType::Int8 => RArray::from_iter(series.i8().unwrap()).into_value(),
339
- DataType::Int16 => RArray::from_iter(series.i16().unwrap()).into_value(),
340
- DataType::Int32 => RArray::from_iter(series.i32().unwrap()).into_value(),
341
- DataType::Int64 => RArray::from_iter(series.i64().unwrap()).into_value(),
342
- DataType::Float32 => RArray::from_iter(series.f32().unwrap()).into_value(),
343
- DataType::Float64 => RArray::from_iter(series.f64().unwrap()).into_value(),
344
- DataType::Categorical(_, _) | DataType::Enum(_, _) => {
345
- RArray::from_iter(series.categorical().unwrap().iter_str()).into_value()
346
- }
347
- DataType::Object(_, _) => {
348
- let v = RArray::with_capacity(series.len());
349
- for i in 0..series.len() {
350
- let obj: Option<&ObjectValue> = series.get_object(i).map(|any| any.into());
351
- match obj {
352
- Some(val) => v.push(val.to_object()).unwrap(),
353
- None => v.push(qnil()).unwrap(),
354
- };
355
- }
356
- v.into_value()
357
- }
358
- DataType::List(_) => {
359
- let v = RArray::new();
360
- let ca = series.list().unwrap();
361
- for opt_s in unsafe { ca.amortized_iter() } {
362
- match opt_s {
363
- None => {
364
- v.push(qnil()).unwrap();
365
- }
366
- Some(s) => {
367
- let rblst = to_a_recursive(s.as_ref());
368
- v.push(rblst).unwrap();
369
- }
370
- }
371
- }
372
- v.into_value()
373
- }
374
- DataType::Array(_, _) => {
375
- let v = RArray::new();
376
- let ca = series.array().unwrap();
377
- for opt_s in ca.amortized_iter() {
378
- match opt_s {
379
- None => {
380
- v.push(qnil()).unwrap();
381
- }
382
- Some(s) => {
383
- let rblst = to_a_recursive(s.as_ref());
384
- v.push(rblst).unwrap();
385
- }
386
- }
387
- }
388
- v.into_value()
389
- }
390
- DataType::Date => {
391
- let ca = series.date().unwrap();
392
- return Wrap(ca).into_value();
393
- }
394
- DataType::Time => {
395
- let ca = series.time().unwrap();
396
- return Wrap(ca).into_value();
397
- }
398
- DataType::Datetime(_, _) => {
399
- let ca = series.datetime().unwrap();
400
- return Wrap(ca).into_value();
401
- }
402
- DataType::Decimal(_, _) => {
403
- let ca = series.decimal().unwrap();
404
- return Wrap(ca).into_value();
405
- }
406
- DataType::String => {
407
- let ca = series.str().unwrap();
408
- return Wrap(ca).into_value();
409
- }
410
- DataType::Struct(_) => {
411
- let ca = series.struct_().unwrap();
412
- return Wrap(ca).into_value();
413
- }
414
- DataType::Duration(_) => {
415
- let ca = series.duration().unwrap();
416
- return Wrap(ca).into_value();
417
- }
418
- DataType::Binary => {
419
- let ca = series.binary().unwrap();
420
- return Wrap(ca).into_value();
421
- }
422
- DataType::Null => {
423
- let null: Option<u8> = None;
424
- let n = series.len();
425
- let iter = std::iter::repeat(null).take(n);
426
- use std::iter::{Repeat, Take};
427
- struct NullIter {
428
- iter: Take<Repeat<Option<u8>>>,
429
- n: usize,
430
- }
431
- impl Iterator for NullIter {
432
- type Item = Option<u8>;
433
-
434
- fn next(&mut self) -> Option<Self::Item> {
435
- self.iter.next()
436
- }
437
- fn size_hint(&self) -> (usize, Option<usize>) {
438
- (self.n, Some(self.n))
439
- }
440
- }
441
- impl ExactSizeIterator for NullIter {}
442
-
443
- RArray::from_iter(NullIter { iter, n }).into_value()
444
- }
445
- DataType::Unknown => {
446
- panic!("to_a not implemented for unknown")
447
- }
448
- DataType::BinaryOffset => {
449
- unreachable!()
450
- }
451
- };
452
- rblist
453
- }
454
-
455
- to_a_recursive(series)
456
- }
457
-
458
328
  pub fn clone(&self) -> Self {
459
329
  RbSeries::new(self.series.borrow().clone())
460
330
  }
@@ -13,6 +13,7 @@ module Polars
13
13
  skip_rows: 0,
14
14
  dtypes: nil,
15
15
  null_values: nil,
16
+ missing_utf8_is_empty_string: false,
16
17
  ignore_errors: false,
17
18
  parse_dates: false,
18
19
  n_threads: nil,
@@ -28,10 +29,12 @@ module Polars
28
29
  sample_size: 1024,
29
30
  eol_char: "\n",
30
31
  new_columns: nil,
31
- truncate_ragged_lines: false
32
+ raise_if_empty: true,
33
+ truncate_ragged_lines: false,
34
+ decimal_comma: false
32
35
  )
33
36
  if Utils.pathlike?(file)
34
- path = Utils.normalise_filepath(file)
37
+ path = Utils.normalize_filepath(file)
35
38
  end
36
39
 
37
40
  dtype_list = nil
@@ -72,12 +75,15 @@ module Polars
72
75
  comment_char,
73
76
  quote_char,
74
77
  processed_null_values,
78
+ missing_utf8_is_empty_string,
75
79
  parse_dates,
76
80
  skip_rows_after_header,
77
81
  Utils._prepare_row_count_args(row_count_name, row_count_offset),
78
82
  sample_size,
79
83
  eol_char,
80
- truncate_ragged_lines
84
+ raise_if_empty,
85
+ truncate_ragged_lines,
86
+ decimal_comma
81
87
  )
82
88
  self.new_columns = new_columns
83
89
  end
@@ -27,7 +27,12 @@ module Polars
27
27
  # # │ 2 ┆ 4 │
28
28
  # # └─────┴─────┘
29
29
  def from_hash(data, schema: nil, columns: nil)
30
- DataFrame._from_hash(data, schema: schema || columns)
30
+ Utils.wrap_df(
31
+ DataFrame.hash_to_rbdf(
32
+ data,
33
+ schema: schema || columns
34
+ )
35
+ )
31
36
  end
32
37
 
33
38
  # Construct a DataFrame from a sequence of dictionaries. This operation clones data.