polars-df 0.12.0 → 0.13.0

@@ -1,10 +1,10 @@
  [package]
  name = "polars"
- version = "0.12.0"
+ version = "0.13.0"
  license = "MIT"
  authors = ["Andrew Kane <andrew@ankane.org>"]
  edition = "2021"
- rust-version = "1.76.0"
+ rust-version = "1.80.0"
  publish = false

  [lib]
@@ -15,14 +15,14 @@ ahash = "0.8"
  chrono = "0.4"
  either = "1.8"
  magnus = "0.7"
- polars-core = "=0.41.3"
- polars-parquet = "=0.41.3"
- polars-utils = "=0.41.3"
+ polars-core = "=0.42.0"
+ polars-parquet = "=0.42.0"
+ polars-utils = "=0.42.0"
  serde_json = "1"
  smartstring = "1"

  [dependencies.polars]
- version = "=0.41.3"
+ version = "=0.42.0"
  features = [
  "abs",
  "approx_unique",
@@ -117,5 +117,5 @@ features = [
  [target.'cfg(target_os = "linux")'.dependencies]
  jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }

- [target.'cfg(not(target_os = "linux"))'.dependencies]
+ [target.'cfg(not(any(target_os = "linux", target_os = "windows")))'.dependencies]
  mimalloc = { version = "0.1", default-features = false }
@@ -0,0 +1,13 @@
+ #[cfg(target_os = "linux")]
+ use jemallocator::Jemalloc;
+
+ #[cfg(not(any(target_os = "linux", target_os = "windows")))]
+ use mimalloc::MiMalloc;
+
+ #[global_allocator]
+ #[cfg(target_os = "linux")]
+ static ALLOC: Jemalloc = Jemalloc;
+
+ #[global_allocator]
+ #[cfg(not(any(target_os = "linux", target_os = "windows")))]
+ static ALLOC: MiMalloc = MiMalloc;
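
This new allocator module keeps jemalloc as the global allocator on Linux and mimalloc on other Unix-like targets; Windows matches neither cfg, so it now falls back to Rust's default system allocator. A minimal standalone sketch of the same pattern, assuming the jemallocator and mimalloc crates from the Cargo.toml hunk above:

    // The two statics share a name but never coexist: the cfg predicates are disjoint.
    #[cfg(target_os = "linux")]
    #[global_allocator]
    static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; // Linux -> jemalloc

    #[cfg(not(any(target_os = "linux", target_os = "windows")))]
    #[global_allocator]
    static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; // macOS, BSD, ... -> mimalloc

    // On Windows neither item is compiled, so the default system allocator is used.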
@@ -56,17 +56,15 @@ impl IntoValue for Wrap<&BinaryChunked> {
  }

  impl IntoValue for Wrap<&StructChunked> {
- fn into_value_with(self, _: &Ruby) -> Value {
+ fn into_value_with(self, ruby: &Ruby) -> Value {
  let s = self.0.clone().into_series();
  // todo! iterate its chunks and flatten.
  // make series::iter() accept a chunk index.
  let s = s.rechunk();
- let iter = s.iter().map(|av| {
- if let AnyValue::Struct(_, _, flds) = av {
- struct_dict(av._iter_struct_av(), flds)
- } else {
- unreachable!()
- }
+ let iter = s.iter().map(|av| match av {
+ AnyValue::Struct(_, _, flds) => struct_dict(av._iter_struct_av(), flds),
+ AnyValue::Null => ruby.qnil().as_value(),
+ _ => unreachable!(),
  });

  RArray::from_iter(iter).into_value()
@@ -26,12 +26,6 @@ use crate::object::OBJECT_NAME;
  use crate::rb_modules::series;
  use crate::{RbDataFrame, RbLazyFrame, RbPolarsErr, RbResult, RbSeries, RbTypeError, RbValueError};

- pub(crate) fn slice_to_wrapped<T>(slice: &[T]) -> &[Wrap<T>] {
- // Safety:
- // Wrap is transparent.
- unsafe { std::mem::transmute(slice) }
- }
-
  pub(crate) fn slice_extract_wrapped<T>(slice: &[Wrap<T>]) -> &[T] {
  // Safety:
  // Wrap is transparent.
@@ -348,12 +348,11 @@ impl RbDataFrame {
  value_name: Option<String>,
  variable_name: Option<String>,
  ) -> RbResult<Self> {
- let args = UnpivotArgs {
+ let args = UnpivotArgsIR {
  on: strings_to_smartstrings(on),
  index: strings_to_smartstrings(index),
  value_name: value_name.map(|s| s.into()),
  variable_name: variable_name.map(|s| s.into()),
- streamable: false,
  };

  let df = self.df.borrow().unpivot2(args).map_err(RbPolarsErr::from)?;
@@ -145,7 +145,7 @@ impl RbDataFrame {
  .with_projection(projection)
  .with_columns(columns)
  .read_parallel(parallel.0)
- .with_n_rows(n_rows)
+ .with_slice(n_rows.map(|x| (0, x)))
  .with_row_index(row_index)
  .set_low_memory(low_memory)
  .use_statistics(use_statistics)
@@ -156,7 +156,7 @@ impl RbDataFrame {
  .with_projection(projection)
  .with_columns(columns)
  .read_parallel(parallel.0)
- .with_n_rows(n_rows)
+ .with_slice(n_rows.map(|x| (0, x)))
  .with_row_index(row_index)
  .use_statistics(use_statistics)
  .set_rechunk(rechunk)
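
Both Parquet reader hunks replace the removed with_n_rows builder call with with_slice, which takes an optional (offset, length) pair, so a plain row limit becomes a slice starting at offset 0. A small sketch of just that mapping (the value 1_000 is illustrative):

    // Mapping an optional row limit onto the (offset, length) form used above.
    let n_rows: Option<usize> = Some(1_000);
    let slice: Option<(usize, usize)> = n_rows.map(|n| (0, n));
    assert_eq!(slice, Some((0, 1_000)));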
@@ -7,15 +7,11 @@ impl RbExpr {
  self.inner == other.inner
  }

- pub fn meta_pop(&self) -> RArray {
- RArray::from_iter(
- self.inner
- .clone()
- .meta()
- .pop()
- .into_iter()
- .map(RbExpr::from),
- )
+ pub fn meta_pop(&self) -> RbResult<RArray> {
+ let exprs = self.inner.clone().meta().pop().map_err(RbPolarsErr::from)?;
+ Ok(RArray::from_iter(
+ exprs.iter().map(|e| RbExpr::from(e.clone())),
+ ))
  }

  pub fn meta_root_names(&self) -> Vec<String> {
@@ -64,6 +64,6 @@ pub fn concat_df_horizontal(seq: RArray) -> RbResult<RbDataFrame> {
  for item in seq.into_iter() {
  dfs.push(get_df(item)?);
  }
- let df = functions::concat_df_horizontal(&dfs).map_err(RbPolarsErr::from)?;
+ let df = functions::concat_df_horizontal(&dfs, true).map_err(RbPolarsErr::from)?;
  Ok(df.into())
  }
@@ -162,6 +162,7 @@ impl RbLazyFrame {
  hive_schema: Option<Wrap<Schema>>,
  try_parse_hive_dates: bool,
  glob: bool,
+ include_file_paths: Option<String>,
  ) -> RbResult<Self> {
  let parallel = parallel.0;
  let hive_schema = hive_schema.map(|s| Arc::new(s.0));
@@ -196,6 +197,7 @@ impl RbLazyFrame {
  use_statistics,
  hive_options,
  glob,
+ include_file_paths: include_file_paths.map(Arc::from),
  };

  let lf = if path.is_some() {
@@ -207,6 +209,7 @@ impl RbLazyFrame {
  Ok(lf.into())
  }

+ #[allow(clippy::too_many_arguments)]
  pub fn new_from_ipc(
  path: String,
  n_rows: Option<usize>,
@@ -214,12 +217,23 @@ impl RbLazyFrame {
  rechunk: bool,
  row_index: Option<(String, IdxSize)>,
  memory_map: bool,
+ hive_partitioning: Option<bool>,
+ hive_schema: Option<Wrap<Schema>>,
+ try_parse_hive_dates: bool,
+ include_file_paths: Option<String>,
  ) -> RbResult<Self> {
  let row_index = row_index.map(|(name, offset)| RowIndex {
  name: Arc::from(name.as_str()),
  offset,
  });

+ let hive_options = HiveOptions {
+ enabled: hive_partitioning,
+ hive_start_idx: 0,
+ schema: hive_schema.map(|x| Arc::new(x.0)),
+ try_parse_dates: try_parse_hive_dates,
+ };
+
  let args = ScanArgsIpc {
  n_rows,
  cache,
@@ -227,6 +241,8 @@ impl RbLazyFrame {
  row_index,
  memory_map,
  cloud_options: None,
+ hive_options,
+ include_file_paths: include_file_paths.map(Arc::from),
  };
  let lf = LazyFrame::scan_ipc(path, args).map_err(RbPolarsErr::from)?;
  Ok(lf.into())
@@ -348,7 +364,7 @@ impl RbLazyFrame {
  compression_level: Option<i32>,
  statistics: Wrap<StatisticsOptions>,
  row_group_size: Option<usize>,
- data_pagesize_limit: Option<usize>,
+ data_page_size: Option<usize>,
  maintain_order: bool,
  ) -> RbResult<()> {
  let compression = parse_parquet_compression(&compression, compression_level)?;
@@ -357,7 +373,7 @@ impl RbLazyFrame {
  compression,
  statistics: statistics.0,
  row_group_size,
- data_pagesize_limit,
+ data_page_size,
  maintain_order,
  };

@@ -752,22 +768,22 @@ impl RbLazyFrame {

  pub fn unpivot(
  &self,
- on: Vec<String>,
- index: Vec<String>,
+ on: RArray,
+ index: RArray,
  value_name: Option<String>,
  variable_name: Option<String>,
- streamable: bool,
- ) -> Self {
- let args = UnpivotArgs {
- on: strings_to_smartstrings(on),
- index: strings_to_smartstrings(index),
+ ) -> RbResult<Self> {
+ let on = rb_exprs_to_exprs(on)?;
+ let index = rb_exprs_to_exprs(index)?;
+ let args = UnpivotArgsDSL {
+ on: on.into_iter().map(|e| e.into()).collect(),
+ index: index.into_iter().map(|e| e.into()).collect(),
  value_name: value_name.map(|s| s.into()),
  variable_name: variable_name.map(|s| s.into()),
- streamable,
  };

  let ldf = self.ldf.borrow().clone();
- ldf.unpivot(args).into()
+ Ok(ldf.unpivot(args).into())
  }

  pub fn with_row_index(&self, name: String, offset: Option<IdxSize>) -> Self {
@@ -1,3 +1,4 @@
+ mod allocator;
  mod batched_csv;
  mod conversion;
  mod dataframe;
@@ -31,20 +32,6 @@ use magnus::{define_module, function, method, prelude::*, Error, Ruby};
  use series::RbSeries;
  use sql::RbSQLContext;

- #[cfg(target_os = "linux")]
- use jemallocator::Jemalloc;
-
- #[cfg(not(target_os = "linux"))]
- use mimalloc::MiMalloc;
-
- #[global_allocator]
- #[cfg(target_os = "linux")]
- static GLOBAL: Jemalloc = Jemalloc;
-
- #[global_allocator]
- #[cfg(not(target_os = "linux"))]
- static GLOBAL: MiMalloc = MiMalloc;
-
  type RbResult<T> = Result<T, Error>;

  #[magnus::init]
@@ -722,9 +709,9 @@ fn init(ruby: &Ruby) -> RbResult<()> {
  class.define_singleton_method("new_from_csv", function!(RbLazyFrame::new_from_csv, -1))?;
  class.define_singleton_method(
  "new_from_parquet",
- function!(RbLazyFrame::new_from_parquet, 13),
+ function!(RbLazyFrame::new_from_parquet, 14),
  )?;
- class.define_singleton_method("new_from_ipc", function!(RbLazyFrame::new_from_ipc, 6))?;
+ class.define_singleton_method("new_from_ipc", function!(RbLazyFrame::new_from_ipc, 10))?;
  class.define_method("write_json", method!(RbLazyFrame::write_json, 1))?;
  class.define_method("describe_plan", method!(RbLazyFrame::describe_plan, 0))?;
  class.define_method(
@@ -780,7 +767,7 @@ fn init(ruby: &Ruby) -> RbResult<()> {
  class.define_method("drop_nulls", method!(RbLazyFrame::drop_nulls, 1))?;
  class.define_method("slice", method!(RbLazyFrame::slice, 2))?;
  class.define_method("tail", method!(RbLazyFrame::tail, 1))?;
- class.define_method("unpivot", method!(RbLazyFrame::unpivot, 5))?;
+ class.define_method("unpivot", method!(RbLazyFrame::unpivot, 4))?;
  class.define_method("with_row_index", method!(RbLazyFrame::with_row_index, 2))?;
  class.define_method("drop", method!(RbLazyFrame::drop, 1))?;
  class.define_method("cast_all", method!(RbLazyFrame::cast_all, 2))?;
@@ -878,7 +865,7 @@ fn init(ruby: &Ruby) -> RbResult<()> {
  class.define_method("arg_max", method!(RbSeries::arg_max, 0))?;
  class.define_method("take_with_series", method!(RbSeries::take_with_series, 1))?;
  class.define_method("null_count", method!(RbSeries::null_count, 0))?;
- class.define_method("has_validity", method!(RbSeries::has_validity, 0))?;
+ class.define_method("has_nulls", method!(RbSeries::has_nulls, 0))?;
  class.define_method("sample_n", method!(RbSeries::sample_n, 4))?;
  class.define_method("sample_frac", method!(RbSeries::sample_frac, 4))?;
  class.define_method("equals", method!(RbSeries::equals, 4))?;
@@ -93,7 +93,7 @@ fn iterator_to_struct(
  .collect::<Vec<_>>()
  });

- Ok(StructChunked::new(name, &fields)
+ Ok(StructChunked::from_series(name, &fields)
  .unwrap()
  .into_series()
  .into())
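
polars 0.42 renames the StructChunked constructor used here from new to from_series; like new before it, it returns a PolarsResult, hence the unchanged .unwrap() in the hunk. A hedged sketch of building a struct Series with it, assuming the same (name, &[Series]) signature as the call above (field names and values are illustrative only):

    use polars::prelude::*;

    // Build a struct Series "point" from two hypothetical field Series.
    fn build_struct() -> PolarsResult<Series> {
        let a = Series::new("a", &[1i32, 2, 3]);
        let b = Series::new("b", &["x", "y", "z"]);
        Ok(StructChunked::from_series("point", &[a, b])?.into_series())
    }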