polars-df 0.11.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/Cargo.lock +360 -361
  4. data/ext/polars/Cargo.toml +10 -7
  5. data/ext/polars/src/batched_csv.rs +1 -1
  6. data/ext/polars/src/conversion/any_value.rs +261 -0
  7. data/ext/polars/src/conversion/chunked_array.rs +4 -4
  8. data/ext/polars/src/conversion/mod.rs +51 -10
  9. data/ext/polars/src/dataframe/construction.rs +6 -8
  10. data/ext/polars/src/dataframe/general.rs +19 -29
  11. data/ext/polars/src/dataframe/io.rs +43 -33
  12. data/ext/polars/src/error.rs +26 -4
  13. data/ext/polars/src/expr/categorical.rs +0 -10
  14. data/ext/polars/src/expr/datetime.rs +4 -12
  15. data/ext/polars/src/expr/general.rs +123 -110
  16. data/ext/polars/src/expr/mod.rs +2 -2
  17. data/ext/polars/src/expr/rolling.rs +17 -9
  18. data/ext/polars/src/expr/string.rs +2 -6
  19. data/ext/polars/src/functions/eager.rs +10 -10
  20. data/ext/polars/src/functions/lazy.rs +21 -21
  21. data/ext/polars/src/functions/range.rs +6 -12
  22. data/ext/polars/src/interop/numo/to_numo_series.rs +2 -1
  23. data/ext/polars/src/lazyframe/mod.rs +81 -98
  24. data/ext/polars/src/lib.rs +55 -45
  25. data/ext/polars/src/map/dataframe.rs +2 -2
  26. data/ext/polars/src/rb_modules.rs +25 -1
  27. data/ext/polars/src/series/aggregation.rs +4 -2
  28. data/ext/polars/src/series/arithmetic.rs +21 -11
  29. data/ext/polars/src/series/construction.rs +56 -38
  30. data/ext/polars/src/series/export.rs +1 -1
  31. data/ext/polars/src/series/mod.rs +31 -10
  32. data/ext/polars/src/sql.rs +3 -1
  33. data/lib/polars/array_expr.rb +4 -4
  34. data/lib/polars/batched_csv_reader.rb +2 -2
  35. data/lib/polars/cat_expr.rb +0 -36
  36. data/lib/polars/cat_name_space.rb +0 -37
  37. data/lib/polars/data_frame.rb +93 -101
  38. data/lib/polars/data_types.rb +1 -1
  39. data/lib/polars/date_time_expr.rb +525 -573
  40. data/lib/polars/date_time_name_space.rb +263 -464
  41. data/lib/polars/dynamic_group_by.rb +3 -3
  42. data/lib/polars/exceptions.rb +3 -0
  43. data/lib/polars/expr.rb +367 -330
  44. data/lib/polars/expr_dispatch.rb +1 -1
  45. data/lib/polars/functions/aggregation/horizontal.rb +8 -8
  46. data/lib/polars/functions/as_datatype.rb +63 -40
  47. data/lib/polars/functions/lazy.rb +63 -14
  48. data/lib/polars/functions/lit.rb +1 -1
  49. data/lib/polars/functions/range/date_range.rb +18 -77
  50. data/lib/polars/functions/range/datetime_range.rb +4 -4
  51. data/lib/polars/functions/range/int_range.rb +2 -2
  52. data/lib/polars/functions/range/time_range.rb +4 -4
  53. data/lib/polars/functions/repeat.rb +1 -1
  54. data/lib/polars/functions/whenthen.rb +1 -1
  55. data/lib/polars/io/csv.rb +8 -8
  56. data/lib/polars/io/ipc.rb +3 -3
  57. data/lib/polars/io/json.rb +13 -2
  58. data/lib/polars/io/ndjson.rb +15 -4
  59. data/lib/polars/io/parquet.rb +5 -4
  60. data/lib/polars/lazy_frame.rb +120 -106
  61. data/lib/polars/lazy_group_by.rb +1 -1
  62. data/lib/polars/list_expr.rb +11 -11
  63. data/lib/polars/list_name_space.rb +5 -1
  64. data/lib/polars/rolling_group_by.rb +5 -7
  65. data/lib/polars/series.rb +105 -189
  66. data/lib/polars/string_expr.rb +42 -67
  67. data/lib/polars/string_name_space.rb +5 -4
  68. data/lib/polars/testing.rb +2 -2
  69. data/lib/polars/utils/constants.rb +9 -0
  70. data/lib/polars/utils/convert.rb +97 -0
  71. data/lib/polars/utils/parse.rb +89 -0
  72. data/lib/polars/utils/various.rb +76 -0
  73. data/lib/polars/utils/wrap.rb +19 -0
  74. data/lib/polars/utils.rb +4 -330
  75. data/lib/polars/version.rb +1 -1
  76. data/lib/polars/whenthen.rb +6 -6
  77. data/lib/polars.rb +11 -0
  78. metadata +9 -4
  79. data/ext/polars/src/conversion/anyvalue.rs +0 -186
@@ -1,11 +1,9 @@
1
1
  use magnus::{prelude::*, RString, Value};
2
2
  use polars::io::avro::AvroCompression;
3
- use polars::io::mmap::ReaderBytes;
4
3
  use polars::io::RowIndex;
5
4
  use polars::prelude::*;
6
5
  use std::io::{BufWriter, Cursor};
7
6
  use std::num::NonZeroUsize;
8
- use std::ops::Deref;
9
7
 
10
8
  use super::*;
11
9
  use crate::conversion::*;
@@ -93,7 +91,7 @@ impl RbDataFrame {
93
91
  .with_projection(projection.map(Arc::new))
94
92
  .with_rechunk(rechunk)
95
93
  .with_chunk_size(chunk_size)
96
- .with_columns(columns.map(Arc::new))
94
+ .with_columns(columns.map(Arc::from))
97
95
  .with_n_threads(n_threads)
98
96
  .with_schema_overwrite(overwrite_dtype.map(Arc::new))
99
97
  .with_dtype_overwrite(overwrite_dtype_slice.map(Arc::new))
@@ -168,41 +166,53 @@ impl RbDataFrame {
168
166
  Ok(RbDataFrame::new(df))
169
167
  }
170
168
 
171
- pub fn read_json(rb_f: Value) -> RbResult<Self> {
172
- // memmap the file first
169
+ pub fn read_json(
170
+ rb_f: Value,
171
+ infer_schema_length: Option<usize>,
172
+ schema: Option<Wrap<Schema>>,
173
+ schema_overrides: Option<Wrap<Schema>>,
174
+ ) -> RbResult<Self> {
173
175
  let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
174
- let mmap_read: ReaderBytes = (&mmap_bytes_r).into();
175
- let bytes = mmap_read.deref();
176
-
177
- // Happy path is our column oriented json as that is most performant
178
- // on failure we try
179
- match serde_json::from_slice::<DataFrame>(bytes) {
180
- Ok(df) => Ok(df.into()),
181
- // try arrow json reader instead
182
- // this is row oriented
183
- Err(e) => {
184
- let msg = format!("{e}");
185
- if msg.contains("successful parse invalid data") {
186
- let e = RbPolarsErr::from(PolarsError::ComputeError(msg.into()));
187
- Err(e)
188
- } else {
189
- let out = JsonReader::new(mmap_bytes_r)
190
- .with_json_format(JsonFormat::Json)
191
- .finish()
192
- .map_err(|e| RbPolarsErr::other(format!("{:?}", e)))?;
193
- Ok(out.into())
194
- }
195
- }
176
+
177
+ let mut builder = JsonReader::new(mmap_bytes_r)
178
+ .with_json_format(JsonFormat::Json)
179
+ .infer_schema_len(infer_schema_length.and_then(NonZeroUsize::new));
180
+
181
+ if let Some(schema) = schema {
182
+ builder = builder.with_schema(Arc::new(schema.0));
183
+ }
184
+
185
+ if let Some(schema) = schema_overrides.as_ref() {
186
+ builder = builder.with_schema_overwrite(&schema.0);
196
187
  }
188
+
189
+ let out = builder.finish().map_err(RbPolarsErr::from)?;
190
+ Ok(out.into())
197
191
  }
198
192
 
199
- pub fn read_ndjson(rb_f: Value) -> RbResult<Self> {
193
+ pub fn read_ndjson(
194
+ rb_f: Value,
195
+ ignore_errors: bool,
196
+ schema: Option<Wrap<Schema>>,
197
+ schema_overrides: Option<Wrap<Schema>>,
198
+ ) -> RbResult<Self> {
200
199
  let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
201
200
 
202
- let out = JsonReader::new(mmap_bytes_r)
201
+ let mut builder = JsonReader::new(mmap_bytes_r)
203
202
  .with_json_format(JsonFormat::JsonLines)
203
+ .with_ignore_errors(ignore_errors);
204
+
205
+ if let Some(schema) = schema {
206
+ builder = builder.with_schema(Arc::new(schema.0));
207
+ }
208
+
209
+ if let Some(schema) = schema_overrides.as_ref() {
210
+ builder = builder.with_schema_overwrite(&schema.0);
211
+ }
212
+
213
+ let out = builder
204
214
  .finish()
205
- .map_err(|e| RbPolarsErr::other(format!("{:?}", e)))?;
215
+ .map_err(|e| RbPolarsErr::other(format!("{e}")))?;
206
216
  Ok(out.into())
207
217
  }
208
218
 
@@ -335,7 +345,7 @@ impl RbDataFrame {
335
345
  rb_f: Value,
336
346
  compression: String,
337
347
  compression_level: Option<i32>,
338
- statistics: bool,
348
+ statistics: Wrap<StatisticsOptions>,
339
349
  row_group_size: Option<usize>,
340
350
  data_page_size: Option<usize>,
341
351
  ) -> RbResult<()> {
@@ -345,7 +355,7 @@ impl RbDataFrame {
345
355
  let f = std::fs::File::create(s).unwrap();
346
356
  ParquetWriter::new(f)
347
357
  .with_compression(compression)
348
- .with_statistics(statistics)
358
+ .with_statistics(statistics.0)
349
359
  .with_row_group_size(row_group_size)
350
360
  .with_data_page_size(data_page_size)
351
361
  .finish(&mut self.df.borrow_mut())
@@ -354,7 +364,7 @@ impl RbDataFrame {
354
364
  let buf = get_file_like(rb_f, true)?;
355
365
  ParquetWriter::new(buf)
356
366
  .with_compression(compression)
357
- .with_statistics(statistics)
367
+ .with_statistics(statistics.0)
358
368
  .with_row_group_size(row_group_size)
359
369
  .with_data_page_size(data_page_size)
360
370
  .finish(&mut self.df.borrow_mut())
@@ -2,20 +2,26 @@ use magnus::exception;
2
2
  use magnus::Error;
3
3
  use polars::prelude::PolarsError;
4
4
 
5
+ use crate::rb_modules;
6
+
5
7
  pub struct RbPolarsErr {}
6
8
 
7
9
  impl RbPolarsErr {
8
10
  // convert to Error instead of Self
9
11
  pub fn from(e: PolarsError) -> Error {
10
- Error::new(exception::runtime_error(), e.to_string())
12
+ match e {
13
+ PolarsError::ComputeError(err) => ComputeError::new_err(err.to_string()),
14
+ PolarsError::InvalidOperation(err) => InvalidOperationError::new_err(err.to_string()),
15
+ _ => Error::new(rb_modules::error(), e.to_string()),
16
+ }
11
17
  }
12
18
 
13
19
  pub fn io(e: std::io::Error) -> Error {
14
- Error::new(exception::runtime_error(), e.to_string())
20
+ Error::new(rb_modules::error(), e.to_string())
15
21
  }
16
22
 
17
23
  pub fn other(message: String) -> Error {
18
- Error::new(exception::runtime_error(), message)
24
+ Error::new(rb_modules::error(), message)
19
25
  }
20
26
  }
21
27
 
@@ -35,11 +41,27 @@ impl RbValueError {
35
41
  }
36
42
  }
37
43
 
44
+ pub struct RbOverflowError {}
45
+
46
+ impl RbOverflowError {
47
+ pub fn new_err(message: String) -> Error {
48
+ Error::new(exception::range_error(), message)
49
+ }
50
+ }
51
+
38
52
  pub struct ComputeError {}
39
53
 
40
54
  impl ComputeError {
41
55
  pub fn new_err(message: String) -> Error {
42
- Error::new(exception::runtime_error(), message)
56
+ Error::new(rb_modules::compute_error(), message)
57
+ }
58
+ }
59
+
60
+ pub struct InvalidOperationError {}
61
+
62
+ impl InvalidOperationError {
63
+ pub fn new_err(message: String) -> Error {
64
+ Error::new(rb_modules::invalid_operation_error(), message)
43
65
  }
44
66
  }
45
67
 
@@ -1,16 +1,6 @@
1
- use polars::prelude::*;
2
-
3
- use crate::conversion::Wrap;
4
1
  use crate::RbExpr;
5
2
 
6
3
  impl RbExpr {
7
- pub fn cat_set_ordering(&self, ordering: Wrap<CategoricalOrdering>) -> Self {
8
- self.inner
9
- .clone()
10
- .cast(DataType::Categorical(None, ordering.0))
11
- .into()
12
- }
13
-
14
4
  pub fn cat_get_categories(&self) -> Self {
15
5
  self.inner.clone().cat().get_categories().into()
16
6
  }
@@ -50,12 +50,8 @@ impl RbExpr {
50
50
  .into()
51
51
  }
52
52
 
53
- pub fn dt_truncate(&self, every: &Self, offset: String) -> Self {
54
- self.inner
55
- .clone()
56
- .dt()
57
- .truncate(every.inner.clone(), offset)
58
- .into()
53
+ pub fn dt_truncate(&self, every: &Self) -> Self {
54
+ self.inner.clone().dt().truncate(every.inner.clone()).into()
59
55
  }
60
56
 
61
57
  pub fn dt_month_start(&self) -> Self {
@@ -74,12 +70,8 @@ impl RbExpr {
74
70
  self.inner.clone().dt().dst_offset().into()
75
71
  }
76
72
 
77
- pub fn dt_round(&self, every: &Self, offset: String) -> Self {
78
- self.inner
79
- .clone()
80
- .dt()
81
- .round(every.inner.clone(), &offset)
82
- .into()
73
+ pub fn dt_round(&self, every: &Self) -> Self {
74
+ self.inner.clone().dt().round(every.inner.clone()).into()
83
75
  }
84
76
 
85
77
  pub fn dt_combine(&self, time: &Self, time_unit: Wrap<TimeUnit>) -> Self {