polars-df 0.13.0 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +30 -0
  3. data/Cargo.lock +1368 -319
  4. data/LICENSE.txt +1 -0
  5. data/README.md +1 -2
  6. data/ext/polars/Cargo.toml +15 -6
  7. data/ext/polars/src/batched_csv.rs +10 -13
  8. data/ext/polars/src/conversion/any_value.rs +37 -21
  9. data/ext/polars/src/conversion/chunked_array.rs +3 -3
  10. data/ext/polars/src/conversion/mod.rs +159 -46
  11. data/ext/polars/src/dataframe/construction.rs +4 -7
  12. data/ext/polars/src/dataframe/export.rs +9 -2
  13. data/ext/polars/src/dataframe/general.rs +22 -16
  14. data/ext/polars/src/dataframe/io.rs +78 -174
  15. data/ext/polars/src/dataframe/mod.rs +1 -0
  16. data/ext/polars/src/dataframe/serde.rs +15 -0
  17. data/ext/polars/src/error.rs +31 -48
  18. data/ext/polars/src/exceptions.rs +24 -0
  19. data/ext/polars/src/expr/binary.rs +4 -42
  20. data/ext/polars/src/expr/datetime.rs +16 -7
  21. data/ext/polars/src/expr/general.rs +14 -23
  22. data/ext/polars/src/expr/list.rs +18 -11
  23. data/ext/polars/src/expr/name.rs +3 -2
  24. data/ext/polars/src/expr/rolling.rs +6 -7
  25. data/ext/polars/src/expr/string.rs +17 -37
  26. data/ext/polars/src/file.rs +59 -22
  27. data/ext/polars/src/functions/business.rs +15 -0
  28. data/ext/polars/src/functions/io.rs +6 -6
  29. data/ext/polars/src/functions/lazy.rs +17 -8
  30. data/ext/polars/src/functions/mod.rs +1 -0
  31. data/ext/polars/src/functions/range.rs +4 -2
  32. data/ext/polars/src/interop/arrow/mod.rs +1 -0
  33. data/ext/polars/src/interop/arrow/to_ruby.rs +83 -0
  34. data/ext/polars/src/interop/mod.rs +1 -0
  35. data/ext/polars/src/lazyframe/general.rs +877 -0
  36. data/ext/polars/src/lazyframe/mod.rs +3 -825
  37. data/ext/polars/src/lazyframe/serde.rs +31 -0
  38. data/ext/polars/src/lib.rs +44 -13
  39. data/ext/polars/src/map/dataframe.rs +46 -14
  40. data/ext/polars/src/map/lazy.rs +65 -4
  41. data/ext/polars/src/map/mod.rs +17 -16
  42. data/ext/polars/src/map/series.rs +106 -64
  43. data/ext/polars/src/on_startup.rs +2 -2
  44. data/ext/polars/src/series/aggregation.rs +1 -5
  45. data/ext/polars/src/series/arithmetic.rs +10 -10
  46. data/ext/polars/src/series/construction.rs +52 -25
  47. data/ext/polars/src/series/export.rs +1 -1
  48. data/ext/polars/src/series/general.rs +643 -0
  49. data/ext/polars/src/series/import.rs +55 -0
  50. data/ext/polars/src/series/mod.rs +11 -638
  51. data/ext/polars/src/series/scatter.rs +2 -2
  52. data/ext/polars/src/utils.rs +0 -20
  53. data/lib/polars/batched_csv_reader.rb +0 -2
  54. data/lib/polars/binary_expr.rb +133 -9
  55. data/lib/polars/binary_name_space.rb +101 -6
  56. data/lib/polars/config.rb +4 -0
  57. data/lib/polars/data_frame.rb +285 -62
  58. data/lib/polars/data_type_group.rb +28 -0
  59. data/lib/polars/data_types.rb +2 -0
  60. data/lib/polars/date_time_expr.rb +244 -0
  61. data/lib/polars/date_time_name_space.rb +87 -0
  62. data/lib/polars/expr.rb +109 -8
  63. data/lib/polars/functions/as_datatype.rb +51 -2
  64. data/lib/polars/functions/col.rb +1 -1
  65. data/lib/polars/functions/eager.rb +1 -3
  66. data/lib/polars/functions/lazy.rb +88 -10
  67. data/lib/polars/functions/range/time_range.rb +21 -21
  68. data/lib/polars/io/csv.rb +14 -16
  69. data/lib/polars/io/database.rb +2 -2
  70. data/lib/polars/io/ipc.rb +14 -12
  71. data/lib/polars/io/ndjson.rb +10 -0
  72. data/lib/polars/io/parquet.rb +168 -111
  73. data/lib/polars/lazy_frame.rb +649 -15
  74. data/lib/polars/list_name_space.rb +169 -0
  75. data/lib/polars/selectors.rb +1144 -0
  76. data/lib/polars/series.rb +470 -40
  77. data/lib/polars/string_cache.rb +27 -1
  78. data/lib/polars/string_expr.rb +0 -1
  79. data/lib/polars/string_name_space.rb +73 -3
  80. data/lib/polars/struct_name_space.rb +31 -7
  81. data/lib/polars/utils/various.rb +5 -1
  82. data/lib/polars/utils.rb +45 -10
  83. data/lib/polars/version.rb +1 -1
  84. data/lib/polars.rb +2 -1
  85. metadata +14 -4
  86. data/lib/polars/functions.rb +0 -57
data/LICENSE.txt CHANGED
@@ -1,5 +1,6 @@
1
1
  Copyright (c) 2020 Ritchie Vink
2
2
  Copyright (c) 2022-2024 Andrew Kane
3
+ Some portions Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
4
 
4
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
5
6
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -17,8 +17,7 @@ gem "polars-df"
17
17
  This library follows the [Polars Python API](https://pola-rs.github.io/polars/py-polars/html/reference/index.html).
18
18
 
19
19
  ```ruby
20
- Polars.read_csv("iris.csv")
21
- .lazy
20
+ Polars.scan_csv("iris.csv")
22
21
  .filter(Polars.col("sepal_length") > 5)
23
22
  .group_by("species")
24
23
  .agg(Polars.all.sum)
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "polars"
3
- version = "0.13.0"
3
+ version = "0.15.0"
4
4
  license = "MIT"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
@@ -12,17 +12,20 @@ crate-type = ["cdylib"]
12
12
 
13
13
  [dependencies]
14
14
  ahash = "0.8"
15
+ arrow = { package = "polars-arrow", version = "=0.44.2" }
16
+ bytes = "1"
15
17
  chrono = "0.4"
16
18
  either = "1.8"
17
19
  magnus = "0.7"
18
- polars-core = "=0.42.0"
19
- polars-parquet = "=0.42.0"
20
- polars-utils = "=0.42.0"
20
+ polars-core = "=0.44.2"
21
+ polars-plan = "=0.44.2"
22
+ polars-parquet = "=0.44.2"
23
+ polars-utils = "=0.44.2"
24
+ regex = "1"
21
25
  serde_json = "1"
22
- smartstring = "1"
23
26
 
24
27
  [dependencies.polars]
25
- version = "=0.42.0"
28
+ version = "=0.44.2"
26
29
  features = [
27
30
  "abs",
28
31
  "approx_unique",
@@ -31,7 +34,11 @@ features = [
31
34
  "array_count",
32
35
  "asof_join",
33
36
  "avro",
37
+ "aws",
38
+ "azure",
34
39
  "binary_encoding",
40
+ "business",
41
+ "cloud",
35
42
  "concat_str",
36
43
  "cov",
37
44
  "cross_join",
@@ -52,6 +59,8 @@ features = [
52
59
  "extract_jsonpath",
53
60
  "find_many",
54
61
  "fmt",
62
+ "gcp",
63
+ "http",
55
64
  "interpolate",
56
65
  "ipc",
57
66
  "ipc_streaming",
@@ -34,8 +34,7 @@ impl RbBatchedCsv {
34
34
  let n_threads = Option::<usize>::try_convert(arguments[11])?;
35
35
  let path = PathBuf::try_convert(arguments[12])?;
36
36
  let overwrite_dtype = Option::<Vec<(String, Wrap<DataType>)>>::try_convert(arguments[13])?;
37
- // TODO fix
38
- let overwrite_dtype_slice = Option::<Vec<Wrap<DataType>>>::None; // Option::<Vec<Wrap<DataType>>>::try_convert(arguments[14])?;
37
+ let overwrite_dtype_slice = Option::<Vec<Wrap<DataType>>>::try_convert(arguments[14])?;
39
38
  let low_memory = bool::try_convert(arguments[15])?;
40
39
  let comment_prefix = Option::<String>::try_convert(arguments[16])?;
41
40
  let quote_char = Option::<String>::try_convert(arguments[17])?;
@@ -44,17 +43,16 @@ impl RbBatchedCsv {
44
43
  let try_parse_dates = bool::try_convert(arguments[20])?;
45
44
  let skip_rows_after_header = usize::try_convert(arguments[21])?;
46
45
  let row_index = Option::<(String, IdxSize)>::try_convert(arguments[22])?;
47
- let sample_size = usize::try_convert(arguments[23])?;
48
- let eol_char = String::try_convert(arguments[24])?;
49
- let raise_if_empty = bool::try_convert(arguments[25])?;
50
- let truncate_ragged_lines = bool::try_convert(arguments[26])?;
51
- let decimal_comma = bool::try_convert(arguments[27])?;
46
+ let eol_char = String::try_convert(arguments[23])?;
47
+ let raise_if_empty = bool::try_convert(arguments[24])?;
48
+ let truncate_ragged_lines = bool::try_convert(arguments[25])?;
49
+ let decimal_comma = bool::try_convert(arguments[26])?;
52
50
  // end arguments
53
51
 
54
52
  let null_values = null_values.map(|w| w.0);
55
53
  let eol_char = eol_char.as_bytes()[0];
56
54
  let row_index = row_index.map(|(name, offset)| RowIndex {
57
- name: Arc::from(name.as_str()),
55
+ name: name.into(),
58
56
  offset,
59
57
  });
60
58
  let quote_char = if let Some(s) = quote_char {
@@ -72,7 +70,7 @@ impl RbBatchedCsv {
72
70
  .iter()
73
71
  .map(|(name, dtype)| {
74
72
  let dtype = dtype.0.clone();
75
- Field::new(name, dtype)
73
+ Field::new((&**name).into(), dtype)
76
74
  })
77
75
  .collect::<Schema>()
78
76
  });
@@ -84,7 +82,7 @@ impl RbBatchedCsv {
84
82
  .collect::<Vec<_>>()
85
83
  });
86
84
 
87
- let file = std::fs::File::open(path).map_err(RbPolarsErr::io)?;
85
+ let file = std::fs::File::open(path).map_err(RbPolarsErr::from)?;
88
86
  let reader = Box::new(file) as Box<dyn MmapBytesReader>;
89
87
  let reader = CsvReadOptions::default()
90
88
  .with_infer_schema_length(infer_schema_length)
@@ -95,13 +93,12 @@ impl RbBatchedCsv {
95
93
  .with_projection(projection.map(Arc::new))
96
94
  .with_rechunk(rechunk)
97
95
  .with_chunk_size(chunk_size)
98
- .with_columns(columns.map(Arc::from))
96
+ .with_columns(columns.map(|x| x.into_iter().map(PlSmallStr::from_string).collect()))
99
97
  .with_n_threads(n_threads)
100
98
  .with_dtype_overwrite(overwrite_dtype_slice.map(Arc::new))
101
99
  .with_low_memory(low_memory)
102
100
  .with_skip_rows_after_header(skip_rows_after_header)
103
101
  .with_row_index(row_index)
104
- .with_sample_size(sample_size)
105
102
  .with_raise_if_empty(raise_if_empty)
106
103
  .with_parse_options(
107
104
  CsvParseOptions::default()
@@ -132,7 +129,7 @@ impl RbBatchedCsv {
132
129
  let batches = reader
133
130
  .borrow()
134
131
  .lock()
135
- .map_err(|e| RbPolarsErr::other(e.to_string()))?
132
+ .map_err(|e| RbPolarsErr::Other(e.to_string()))?
136
133
  .next_batches(n)
137
134
  .map_err(RbPolarsErr::from)?;
138
135
 
@@ -7,9 +7,9 @@ use polars_core::utils::any_values_to_supertype_and_n_dtypes;
7
7
 
8
8
  use super::{struct_dict, ObjectValue, Wrap};
9
9
 
10
- use crate::error::RbOverflowError;
10
+ use crate::exceptions::RbOverflowError;
11
11
  use crate::rb_modules::utils;
12
- use crate::{RbPolarsErr, RbResult, RbSeries};
12
+ use crate::{RbErr, RbPolarsErr, RbResult, RbSeries};
13
13
 
14
14
  impl IntoValue for Wrap<AnyValue<'_>> {
15
15
  fn into_value_with(self, ruby: &Ruby) -> Value {
@@ -47,12 +47,20 @@ pub(crate) fn any_value_into_rb_object(av: AnyValue, ruby: &Ruby) -> Value {
47
47
  };
48
48
  s.into_value()
49
49
  }
50
+ AnyValue::CategoricalOwned(idx, rev, arr) | AnyValue::EnumOwned(idx, rev, arr) => {
51
+ let s = if arr.is_null() {
52
+ rev.get(idx)
53
+ } else {
54
+ unsafe { arr.deref_unchecked().value(idx as usize) }
55
+ };
56
+ s.into_value()
57
+ }
50
58
  AnyValue::Date(v) => utils().funcall("_to_ruby_date", (v,)).unwrap(),
51
59
  AnyValue::Datetime(v, time_unit, time_zone) => {
52
- let time_unit = time_unit.to_ascii();
53
- utils()
54
- .funcall("_to_ruby_datetime", (v, time_unit, time_zone.clone()))
55
- .unwrap()
60
+ datetime_to_rb_object(v, time_unit, time_zone)
61
+ }
62
+ AnyValue::DatetimeOwned(v, time_unit, time_zone) => {
63
+ datetime_to_rb_object(v, time_unit, time_zone.as_ref().map(AsRef::as_ref))
56
64
  }
57
65
  AnyValue::Duration(v, time_unit) => {
58
66
  let time_unit = time_unit.to_ascii();
@@ -66,11 +74,11 @@ pub(crate) fn any_value_into_rb_object(av: AnyValue, ruby: &Ruby) -> Value {
66
74
  AnyValue::StructOwned(payload) => struct_dict(payload.0.into_iter(), &payload.1),
67
75
  AnyValue::Object(v) => {
68
76
  let object = v.as_any().downcast_ref::<ObjectValue>().unwrap();
69
- object.to_object()
77
+ object.to_value()
70
78
  }
71
79
  AnyValue::ObjectOwned(v) => {
72
80
  let object = v.0.as_any().downcast_ref::<ObjectValue>().unwrap();
73
- object.to_object()
81
+ object.to_value()
74
82
  }
75
83
  AnyValue::Binary(v) => RString::from_slice(v).into_value(),
76
84
  AnyValue::BinaryOwned(v) => RString::from_slice(&v).into_value(),
@@ -80,6 +88,13 @@ pub(crate) fn any_value_into_rb_object(av: AnyValue, ruby: &Ruby) -> Value {
80
88
  }
81
89
  }
82
90
 
91
+ fn datetime_to_rb_object(v: i64, tu: TimeUnit, tz: Option<&TimeZone>) -> Value {
92
+ let tu = tu.to_ascii();
93
+ utils()
94
+ .funcall("_to_ruby_datetime", (v, tu, tz.map(|v| v.to_string())))
95
+ .unwrap()
96
+ }
97
+
83
98
  pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<AnyValue<'s>> {
84
99
  // Conversion functions.
85
100
  fn get_null(_ob: Value, _strict: bool) -> RbResult<AnyValue<'static>> {
@@ -122,7 +137,10 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
122
137
  fn get_list(ob: Value, _strict: bool) -> RbResult<AnyValue<'static>> {
123
138
  let v = RArray::from_value(ob).unwrap();
124
139
  if v.is_empty() {
125
- Ok(AnyValue::List(Series::new_empty("", &DataType::Null)))
140
+ Ok(AnyValue::List(Series::new_empty(
141
+ PlSmallStr::EMPTY,
142
+ &DataType::Null,
143
+ )))
126
144
  } else {
127
145
  let list = v;
128
146
 
@@ -142,7 +160,7 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
142
160
  avs.push(Wrap::<AnyValue>::try_convert(item)?.0)
143
161
  }
144
162
 
145
- let s = Series::from_any_values_and_dtype("", &avs, &dtype, true)
163
+ let s = Series::from_any_values_and_dtype(PlSmallStr::EMPTY, &avs, &dtype, true)
146
164
  .map_err(RbPolarsErr::from)?;
147
165
  Ok(AnyValue::List(s))
148
166
  }
@@ -158,11 +176,10 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
158
176
  let len = dict.len();
159
177
  let mut keys = Vec::with_capacity(len);
160
178
  let mut vals = Vec::with_capacity(len);
161
- dict.foreach(|k: Value, v: Value| {
162
- let key = String::try_convert(k)?;
163
- let val = Wrap::<AnyValue>::try_convert(v)?.0;
179
+ dict.foreach(|key: String, val: Wrap<AnyValue>| {
180
+ let val = val.0;
164
181
  let dtype = DataType::from(&val);
165
- keys.push(Field::new(&key, dtype));
182
+ keys.push(Field::new(key.into(), dtype));
166
183
  vals.push(val);
167
184
  Ok(ForEach::Continue)
168
185
  })?;
@@ -184,7 +201,7 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
184
201
  let v = sec * 1_000_000_000 + nsec;
185
202
  // TODO support time zone when possible
186
203
  // https://github.com/pola-rs/polars/issues/9103
187
- Ok(AnyValue::Datetime(v, TimeUnit::Nanoseconds, &None))
204
+ Ok(AnyValue::Datetime(v, TimeUnit::Nanoseconds, None))
188
205
  }
189
206
 
190
207
  fn get_datetime(ob: Value, _strict: bool) -> RbResult<AnyValue<'static>> {
@@ -193,7 +210,7 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
193
210
  Ok(AnyValue::Datetime(
194
211
  sec * 1_000_000_000 + nsec,
195
212
  TimeUnit::Nanoseconds,
196
- &None,
213
+ None,
197
214
  ))
198
215
  }
199
216
 
@@ -218,7 +235,9 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
218
235
 
219
236
  let (sign, digits, _, exp): (i8, String, i32, i32) = ob.funcall("split", ()).unwrap();
220
237
  let (mut v, scale) = abs_decimal_from_digits(digits, exp).ok_or_else(|| {
221
- RbPolarsErr::other("BigDecimal is too large to fit in Decimal128".into())
238
+ RbErr::from(RbPolarsErr::Other(
239
+ "BigDecimal is too large to fit in Decimal128".into(),
240
+ ))
222
241
  })?;
223
242
  if sign < 0 {
224
243
  // TODO better error
@@ -253,9 +272,6 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
253
272
  } else if ob.is_kind_of(crate::rb_modules::bigdecimal()) {
254
273
  get_decimal(ob, strict)
255
274
  } else {
256
- Err(RbPolarsErr::other(format!(
257
- "object type not supported {:?}",
258
- ob
259
- )))
275
+ Err(RbPolarsErr::Other(format!("object type not supported {:?}", ob)).into())
260
276
  }
261
277
  }
@@ -9,7 +9,7 @@ use crate::RbResult;
9
9
  impl TryConvert for Wrap<StringChunked> {
10
10
  fn try_convert(obj: Value) -> RbResult<Self> {
11
11
  let (seq, len) = get_rbseq(obj)?;
12
- let mut builder = StringChunkedBuilder::new("", len);
12
+ let mut builder = StringChunkedBuilder::new(PlSmallStr::EMPTY, len);
13
13
 
14
14
  for res in seq.into_iter() {
15
15
  let item = res;
@@ -25,7 +25,7 @@ impl TryConvert for Wrap<StringChunked> {
25
25
  impl TryConvert for Wrap<BinaryChunked> {
26
26
  fn try_convert(obj: Value) -> RbResult<Self> {
27
27
  let (seq, len) = get_rbseq(obj)?;
28
- let mut builder = BinaryChunkedBuilder::new("", len);
28
+ let mut builder = BinaryChunkedBuilder::new(PlSmallStr::EMPTY, len);
29
29
 
30
30
  for res in seq.into_iter() {
31
31
  let item = res;
@@ -90,7 +90,7 @@ impl IntoValue for Wrap<&DatetimeChunked> {
90
90
  fn into_value_with(self, _: &Ruby) -> Value {
91
91
  let utils = utils();
92
92
  let time_unit = Wrap(self.0.time_unit()).into_value();
93
- let time_zone = self.0.time_zone().clone().into_value();
93
+ let time_zone = self.0.time_zone().as_deref().map(|v| v.into_value());
94
94
  let iter = self.0.into_iter().map(|opt_v| {
95
95
  opt_v.map(|v| {
96
96
  utils