polars-df 0.11.0 → 0.12.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (79)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/Cargo.lock +360 -361
  4. data/ext/polars/Cargo.toml +10 -7
  5. data/ext/polars/src/batched_csv.rs +1 -1
  6. data/ext/polars/src/conversion/any_value.rs +261 -0
  7. data/ext/polars/src/conversion/chunked_array.rs +4 -4
  8. data/ext/polars/src/conversion/mod.rs +51 -10
  9. data/ext/polars/src/dataframe/construction.rs +6 -8
  10. data/ext/polars/src/dataframe/general.rs +19 -29
  11. data/ext/polars/src/dataframe/io.rs +43 -33
  12. data/ext/polars/src/error.rs +26 -4
  13. data/ext/polars/src/expr/categorical.rs +0 -10
  14. data/ext/polars/src/expr/datetime.rs +4 -12
  15. data/ext/polars/src/expr/general.rs +123 -110
  16. data/ext/polars/src/expr/mod.rs +2 -2
  17. data/ext/polars/src/expr/rolling.rs +17 -9
  18. data/ext/polars/src/expr/string.rs +2 -6
  19. data/ext/polars/src/functions/eager.rs +10 -10
  20. data/ext/polars/src/functions/lazy.rs +21 -21
  21. data/ext/polars/src/functions/range.rs +6 -12
  22. data/ext/polars/src/interop/numo/to_numo_series.rs +2 -1
  23. data/ext/polars/src/lazyframe/mod.rs +81 -98
  24. data/ext/polars/src/lib.rs +55 -45
  25. data/ext/polars/src/map/dataframe.rs +2 -2
  26. data/ext/polars/src/rb_modules.rs +25 -1
  27. data/ext/polars/src/series/aggregation.rs +4 -2
  28. data/ext/polars/src/series/arithmetic.rs +21 -11
  29. data/ext/polars/src/series/construction.rs +56 -38
  30. data/ext/polars/src/series/export.rs +1 -1
  31. data/ext/polars/src/series/mod.rs +31 -10
  32. data/ext/polars/src/sql.rs +3 -1
  33. data/lib/polars/array_expr.rb +4 -4
  34. data/lib/polars/batched_csv_reader.rb +2 -2
  35. data/lib/polars/cat_expr.rb +0 -36
  36. data/lib/polars/cat_name_space.rb +0 -37
  37. data/lib/polars/data_frame.rb +93 -101
  38. data/lib/polars/data_types.rb +1 -1
  39. data/lib/polars/date_time_expr.rb +525 -573
  40. data/lib/polars/date_time_name_space.rb +263 -464
  41. data/lib/polars/dynamic_group_by.rb +3 -3
  42. data/lib/polars/exceptions.rb +3 -0
  43. data/lib/polars/expr.rb +367 -330
  44. data/lib/polars/expr_dispatch.rb +1 -1
  45. data/lib/polars/functions/aggregation/horizontal.rb +8 -8
  46. data/lib/polars/functions/as_datatype.rb +63 -40
  47. data/lib/polars/functions/lazy.rb +63 -14
  48. data/lib/polars/functions/lit.rb +1 -1
  49. data/lib/polars/functions/range/date_range.rb +18 -77
  50. data/lib/polars/functions/range/datetime_range.rb +4 -4
  51. data/lib/polars/functions/range/int_range.rb +2 -2
  52. data/lib/polars/functions/range/time_range.rb +4 -4
  53. data/lib/polars/functions/repeat.rb +1 -1
  54. data/lib/polars/functions/whenthen.rb +1 -1
  55. data/lib/polars/io/csv.rb +8 -8
  56. data/lib/polars/io/ipc.rb +3 -3
  57. data/lib/polars/io/json.rb +13 -2
  58. data/lib/polars/io/ndjson.rb +15 -4
  59. data/lib/polars/io/parquet.rb +5 -4
  60. data/lib/polars/lazy_frame.rb +120 -106
  61. data/lib/polars/lazy_group_by.rb +1 -1
  62. data/lib/polars/list_expr.rb +11 -11
  63. data/lib/polars/list_name_space.rb +5 -1
  64. data/lib/polars/rolling_group_by.rb +5 -7
  65. data/lib/polars/series.rb +105 -189
  66. data/lib/polars/string_expr.rb +42 -67
  67. data/lib/polars/string_name_space.rb +5 -4
  68. data/lib/polars/testing.rb +2 -2
  69. data/lib/polars/utils/constants.rb +9 -0
  70. data/lib/polars/utils/convert.rb +97 -0
  71. data/lib/polars/utils/parse.rb +89 -0
  72. data/lib/polars/utils/various.rb +76 -0
  73. data/lib/polars/utils/wrap.rb +19 -0
  74. data/lib/polars/utils.rb +4 -330
  75. data/lib/polars/version.rb +1 -1
  76. data/lib/polars/whenthen.rb +6 -6
  77. data/lib/polars.rb +11 -0
  78. metadata +9 -4
  79. data/ext/polars/src/conversion/anyvalue.rs +0 -186
@@ -1,11 +1,9 @@
1
1
  use magnus::{prelude::*, RString, Value};
2
2
  use polars::io::avro::AvroCompression;
3
- use polars::io::mmap::ReaderBytes;
4
3
  use polars::io::RowIndex;
5
4
  use polars::prelude::*;
6
5
  use std::io::{BufWriter, Cursor};
7
6
  use std::num::NonZeroUsize;
8
- use std::ops::Deref;
9
7
 
10
8
  use super::*;
11
9
  use crate::conversion::*;
@@ -93,7 +91,7 @@ impl RbDataFrame {
93
91
  .with_projection(projection.map(Arc::new))
94
92
  .with_rechunk(rechunk)
95
93
  .with_chunk_size(chunk_size)
96
- .with_columns(columns.map(Arc::new))
94
+ .with_columns(columns.map(Arc::from))
97
95
  .with_n_threads(n_threads)
98
96
  .with_schema_overwrite(overwrite_dtype.map(Arc::new))
99
97
  .with_dtype_overwrite(overwrite_dtype_slice.map(Arc::new))
@@ -168,41 +166,53 @@ impl RbDataFrame {
168
166
  Ok(RbDataFrame::new(df))
169
167
  }
170
168
 
171
- pub fn read_json(rb_f: Value) -> RbResult<Self> {
172
- // memmap the file first
169
+ pub fn read_json(
170
+ rb_f: Value,
171
+ infer_schema_length: Option<usize>,
172
+ schema: Option<Wrap<Schema>>,
173
+ schema_overrides: Option<Wrap<Schema>>,
174
+ ) -> RbResult<Self> {
173
175
  let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
174
- let mmap_read: ReaderBytes = (&mmap_bytes_r).into();
175
- let bytes = mmap_read.deref();
176
-
177
- // Happy path is our column oriented json as that is most performant
178
- // on failure we try
179
- match serde_json::from_slice::<DataFrame>(bytes) {
180
- Ok(df) => Ok(df.into()),
181
- // try arrow json reader instead
182
- // this is row oriented
183
- Err(e) => {
184
- let msg = format!("{e}");
185
- if msg.contains("successful parse invalid data") {
186
- let e = RbPolarsErr::from(PolarsError::ComputeError(msg.into()));
187
- Err(e)
188
- } else {
189
- let out = JsonReader::new(mmap_bytes_r)
190
- .with_json_format(JsonFormat::Json)
191
- .finish()
192
- .map_err(|e| RbPolarsErr::other(format!("{:?}", e)))?;
193
- Ok(out.into())
194
- }
195
- }
176
+
177
+ let mut builder = JsonReader::new(mmap_bytes_r)
178
+ .with_json_format(JsonFormat::Json)
179
+ .infer_schema_len(infer_schema_length.and_then(NonZeroUsize::new));
180
+
181
+ if let Some(schema) = schema {
182
+ builder = builder.with_schema(Arc::new(schema.0));
183
+ }
184
+
185
+ if let Some(schema) = schema_overrides.as_ref() {
186
+ builder = builder.with_schema_overwrite(&schema.0);
196
187
  }
188
+
189
+ let out = builder.finish().map_err(RbPolarsErr::from)?;
190
+ Ok(out.into())
197
191
  }
198
192
 
199
- pub fn read_ndjson(rb_f: Value) -> RbResult<Self> {
193
+ pub fn read_ndjson(
194
+ rb_f: Value,
195
+ ignore_errors: bool,
196
+ schema: Option<Wrap<Schema>>,
197
+ schema_overrides: Option<Wrap<Schema>>,
198
+ ) -> RbResult<Self> {
200
199
  let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
201
200
 
202
- let out = JsonReader::new(mmap_bytes_r)
201
+ let mut builder = JsonReader::new(mmap_bytes_r)
203
202
  .with_json_format(JsonFormat::JsonLines)
203
+ .with_ignore_errors(ignore_errors);
204
+
205
+ if let Some(schema) = schema {
206
+ builder = builder.with_schema(Arc::new(schema.0));
207
+ }
208
+
209
+ if let Some(schema) = schema_overrides.as_ref() {
210
+ builder = builder.with_schema_overwrite(&schema.0);
211
+ }
212
+
213
+ let out = builder
204
214
  .finish()
205
- .map_err(|e| RbPolarsErr::other(format!("{:?}", e)))?;
215
+ .map_err(|e| RbPolarsErr::other(format!("{e}")))?;
206
216
  Ok(out.into())
207
217
  }
208
218
 
@@ -335,7 +345,7 @@ impl RbDataFrame {
335
345
  rb_f: Value,
336
346
  compression: String,
337
347
  compression_level: Option<i32>,
338
- statistics: bool,
348
+ statistics: Wrap<StatisticsOptions>,
339
349
  row_group_size: Option<usize>,
340
350
  data_page_size: Option<usize>,
341
351
  ) -> RbResult<()> {
@@ -345,7 +355,7 @@ impl RbDataFrame {
345
355
  let f = std::fs::File::create(s).unwrap();
346
356
  ParquetWriter::new(f)
347
357
  .with_compression(compression)
348
- .with_statistics(statistics)
358
+ .with_statistics(statistics.0)
349
359
  .with_row_group_size(row_group_size)
350
360
  .with_data_page_size(data_page_size)
351
361
  .finish(&mut self.df.borrow_mut())
@@ -354,7 +364,7 @@ impl RbDataFrame {
354
364
  let buf = get_file_like(rb_f, true)?;
355
365
  ParquetWriter::new(buf)
356
366
  .with_compression(compression)
357
- .with_statistics(statistics)
367
+ .with_statistics(statistics.0)
358
368
  .with_row_group_size(row_group_size)
359
369
  .with_data_page_size(data_page_size)
360
370
  .finish(&mut self.df.borrow_mut())
@@ -2,20 +2,26 @@ use magnus::exception;
2
2
  use magnus::Error;
3
3
  use polars::prelude::PolarsError;
4
4
 
5
+ use crate::rb_modules;
6
+
5
7
  pub struct RbPolarsErr {}
6
8
 
7
9
  impl RbPolarsErr {
8
10
  // convert to Error instead of Self
9
11
  pub fn from(e: PolarsError) -> Error {
10
- Error::new(exception::runtime_error(), e.to_string())
12
+ match e {
13
+ PolarsError::ComputeError(err) => ComputeError::new_err(err.to_string()),
14
+ PolarsError::InvalidOperation(err) => InvalidOperationError::new_err(err.to_string()),
15
+ _ => Error::new(rb_modules::error(), e.to_string()),
16
+ }
11
17
  }
12
18
 
13
19
  pub fn io(e: std::io::Error) -> Error {
14
- Error::new(exception::runtime_error(), e.to_string())
20
+ Error::new(rb_modules::error(), e.to_string())
15
21
  }
16
22
 
17
23
  pub fn other(message: String) -> Error {
18
- Error::new(exception::runtime_error(), message)
24
+ Error::new(rb_modules::error(), message)
19
25
  }
20
26
  }
21
27
 
@@ -35,11 +41,27 @@ impl RbValueError {
35
41
  }
36
42
  }
37
43
 
44
+ pub struct RbOverflowError {}
45
+
46
+ impl RbOverflowError {
47
+ pub fn new_err(message: String) -> Error {
48
+ Error::new(exception::range_error(), message)
49
+ }
50
+ }
51
+
38
52
  pub struct ComputeError {}
39
53
 
40
54
  impl ComputeError {
41
55
  pub fn new_err(message: String) -> Error {
42
- Error::new(exception::runtime_error(), message)
56
+ Error::new(rb_modules::compute_error(), message)
57
+ }
58
+ }
59
+
60
+ pub struct InvalidOperationError {}
61
+
62
+ impl InvalidOperationError {
63
+ pub fn new_err(message: String) -> Error {
64
+ Error::new(rb_modules::invalid_operation_error(), message)
43
65
  }
44
66
  }
45
67
 
@@ -1,16 +1,6 @@
1
- use polars::prelude::*;
2
-
3
- use crate::conversion::Wrap;
4
1
  use crate::RbExpr;
5
2
 
6
3
  impl RbExpr {
7
- pub fn cat_set_ordering(&self, ordering: Wrap<CategoricalOrdering>) -> Self {
8
- self.inner
9
- .clone()
10
- .cast(DataType::Categorical(None, ordering.0))
11
- .into()
12
- }
13
-
14
4
  pub fn cat_get_categories(&self) -> Self {
15
5
  self.inner.clone().cat().get_categories().into()
16
6
  }
@@ -50,12 +50,8 @@ impl RbExpr {
50
50
  .into()
51
51
  }
52
52
 
53
- pub fn dt_truncate(&self, every: &Self, offset: String) -> Self {
54
- self.inner
55
- .clone()
56
- .dt()
57
- .truncate(every.inner.clone(), offset)
58
- .into()
53
+ pub fn dt_truncate(&self, every: &Self) -> Self {
54
+ self.inner.clone().dt().truncate(every.inner.clone()).into()
59
55
  }
60
56
 
61
57
  pub fn dt_month_start(&self) -> Self {
@@ -74,12 +70,8 @@ impl RbExpr {
74
70
  self.inner.clone().dt().dst_offset().into()
75
71
  }
76
72
 
77
- pub fn dt_round(&self, every: &Self, offset: String) -> Self {
78
- self.inner
79
- .clone()
80
- .dt()
81
- .round(every.inner.clone(), &offset)
82
- .into()
73
+ pub fn dt_round(&self, every: &Self) -> Self {
74
+ self.inner.clone().dt().round(every.inner.clone()).into()
83
75
  }
84
76
 
85
77
  pub fn dt_combine(&self, time: &Self, time_unit: Wrap<TimeUnit>) -> Self {