polars-df 0.13.0 → 0.15.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (86) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +30 -0
  3. data/Cargo.lock +1368 -319
  4. data/LICENSE.txt +1 -0
  5. data/README.md +1 -2
  6. data/ext/polars/Cargo.toml +15 -6
  7. data/ext/polars/src/batched_csv.rs +10 -13
  8. data/ext/polars/src/conversion/any_value.rs +37 -21
  9. data/ext/polars/src/conversion/chunked_array.rs +3 -3
  10. data/ext/polars/src/conversion/mod.rs +159 -46
  11. data/ext/polars/src/dataframe/construction.rs +4 -7
  12. data/ext/polars/src/dataframe/export.rs +9 -2
  13. data/ext/polars/src/dataframe/general.rs +22 -16
  14. data/ext/polars/src/dataframe/io.rs +78 -174
  15. data/ext/polars/src/dataframe/mod.rs +1 -0
  16. data/ext/polars/src/dataframe/serde.rs +15 -0
  17. data/ext/polars/src/error.rs +31 -48
  18. data/ext/polars/src/exceptions.rs +24 -0
  19. data/ext/polars/src/expr/binary.rs +4 -42
  20. data/ext/polars/src/expr/datetime.rs +16 -7
  21. data/ext/polars/src/expr/general.rs +14 -23
  22. data/ext/polars/src/expr/list.rs +18 -11
  23. data/ext/polars/src/expr/name.rs +3 -2
  24. data/ext/polars/src/expr/rolling.rs +6 -7
  25. data/ext/polars/src/expr/string.rs +17 -37
  26. data/ext/polars/src/file.rs +59 -22
  27. data/ext/polars/src/functions/business.rs +15 -0
  28. data/ext/polars/src/functions/io.rs +6 -6
  29. data/ext/polars/src/functions/lazy.rs +17 -8
  30. data/ext/polars/src/functions/mod.rs +1 -0
  31. data/ext/polars/src/functions/range.rs +4 -2
  32. data/ext/polars/src/interop/arrow/mod.rs +1 -0
  33. data/ext/polars/src/interop/arrow/to_ruby.rs +83 -0
  34. data/ext/polars/src/interop/mod.rs +1 -0
  35. data/ext/polars/src/lazyframe/general.rs +877 -0
  36. data/ext/polars/src/lazyframe/mod.rs +3 -825
  37. data/ext/polars/src/lazyframe/serde.rs +31 -0
  38. data/ext/polars/src/lib.rs +44 -13
  39. data/ext/polars/src/map/dataframe.rs +46 -14
  40. data/ext/polars/src/map/lazy.rs +65 -4
  41. data/ext/polars/src/map/mod.rs +17 -16
  42. data/ext/polars/src/map/series.rs +106 -64
  43. data/ext/polars/src/on_startup.rs +2 -2
  44. data/ext/polars/src/series/aggregation.rs +1 -5
  45. data/ext/polars/src/series/arithmetic.rs +10 -10
  46. data/ext/polars/src/series/construction.rs +52 -25
  47. data/ext/polars/src/series/export.rs +1 -1
  48. data/ext/polars/src/series/general.rs +643 -0
  49. data/ext/polars/src/series/import.rs +55 -0
  50. data/ext/polars/src/series/mod.rs +11 -638
  51. data/ext/polars/src/series/scatter.rs +2 -2
  52. data/ext/polars/src/utils.rs +0 -20
  53. data/lib/polars/batched_csv_reader.rb +0 -2
  54. data/lib/polars/binary_expr.rb +133 -9
  55. data/lib/polars/binary_name_space.rb +101 -6
  56. data/lib/polars/config.rb +4 -0
  57. data/lib/polars/data_frame.rb +285 -62
  58. data/lib/polars/data_type_group.rb +28 -0
  59. data/lib/polars/data_types.rb +2 -0
  60. data/lib/polars/date_time_expr.rb +244 -0
  61. data/lib/polars/date_time_name_space.rb +87 -0
  62. data/lib/polars/expr.rb +109 -8
  63. data/lib/polars/functions/as_datatype.rb +51 -2
  64. data/lib/polars/functions/col.rb +1 -1
  65. data/lib/polars/functions/eager.rb +1 -3
  66. data/lib/polars/functions/lazy.rb +88 -10
  67. data/lib/polars/functions/range/time_range.rb +21 -21
  68. data/lib/polars/io/csv.rb +14 -16
  69. data/lib/polars/io/database.rb +2 -2
  70. data/lib/polars/io/ipc.rb +14 -12
  71. data/lib/polars/io/ndjson.rb +10 -0
  72. data/lib/polars/io/parquet.rb +168 -111
  73. data/lib/polars/lazy_frame.rb +649 -15
  74. data/lib/polars/list_name_space.rb +169 -0
  75. data/lib/polars/selectors.rb +1144 -0
  76. data/lib/polars/series.rb +470 -40
  77. data/lib/polars/string_cache.rb +27 -1
  78. data/lib/polars/string_expr.rb +0 -1
  79. data/lib/polars/string_name_space.rb +73 -3
  80. data/lib/polars/struct_name_space.rb +31 -7
  81. data/lib/polars/utils/various.rb +5 -1
  82. data/lib/polars/utils.rb +45 -10
  83. data/lib/polars/version.rb +1 -1
  84. data/lib/polars.rb +2 -1
  85. metadata +14 -4
  86. data/lib/polars/functions.rb +0 -57
data/LICENSE.txt CHANGED
@@ -1,5 +1,6 @@
1
1
  Copyright (c) 2020 Ritchie Vink
2
2
  Copyright (c) 2022-2024 Andrew Kane
3
+ Some portions Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
4
 
4
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
5
6
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -17,8 +17,7 @@ gem "polars-df"
17
17
  This library follows the [Polars Python API](https://pola-rs.github.io/polars/py-polars/html/reference/index.html).
18
18
 
19
19
  ```ruby
20
- Polars.read_csv("iris.csv")
21
- .lazy
20
+ Polars.scan_csv("iris.csv")
22
21
  .filter(Polars.col("sepal_length") > 5)
23
22
  .group_by("species")
24
23
  .agg(Polars.all.sum)
data/ext/polars/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "polars"
3
- version = "0.13.0"
3
+ version = "0.15.0"
4
4
  license = "MIT"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
@@ -12,17 +12,20 @@ crate-type = ["cdylib"]
12
12
 
13
13
  [dependencies]
14
14
  ahash = "0.8"
15
+ arrow = { package = "polars-arrow", version = "=0.44.2" }
16
+ bytes = "1"
15
17
  chrono = "0.4"
16
18
  either = "1.8"
17
19
  magnus = "0.7"
18
- polars-core = "=0.42.0"
19
- polars-parquet = "=0.42.0"
20
- polars-utils = "=0.42.0"
20
+ polars-core = "=0.44.2"
21
+ polars-plan = "=0.44.2"
22
+ polars-parquet = "=0.44.2"
23
+ polars-utils = "=0.44.2"
24
+ regex = "1"
21
25
  serde_json = "1"
22
- smartstring = "1"
23
26
 
24
27
  [dependencies.polars]
25
- version = "=0.42.0"
28
+ version = "=0.44.2"
26
29
  features = [
27
30
  "abs",
28
31
  "approx_unique",
@@ -31,7 +34,11 @@ features = [
31
34
  "array_count",
32
35
  "asof_join",
33
36
  "avro",
37
+ "aws",
38
+ "azure",
34
39
  "binary_encoding",
40
+ "business",
41
+ "cloud",
35
42
  "concat_str",
36
43
  "cov",
37
44
  "cross_join",
@@ -52,6 +59,8 @@ features = [
52
59
  "extract_jsonpath",
53
60
  "find_many",
54
61
  "fmt",
62
+ "gcp",
63
+ "http",
55
64
  "interpolate",
56
65
  "ipc",
57
66
  "ipc_streaming",
data/ext/polars/src/batched_csv.rs CHANGED
@@ -34,8 +34,7 @@ impl RbBatchedCsv {
34
34
  let n_threads = Option::<usize>::try_convert(arguments[11])?;
35
35
  let path = PathBuf::try_convert(arguments[12])?;
36
36
  let overwrite_dtype = Option::<Vec<(String, Wrap<DataType>)>>::try_convert(arguments[13])?;
37
- // TODO fix
38
- let overwrite_dtype_slice = Option::<Vec<Wrap<DataType>>>::None; // Option::<Vec<Wrap<DataType>>>::try_convert(arguments[14])?;
37
+ let overwrite_dtype_slice = Option::<Vec<Wrap<DataType>>>::try_convert(arguments[14])?;
39
38
  let low_memory = bool::try_convert(arguments[15])?;
40
39
  let comment_prefix = Option::<String>::try_convert(arguments[16])?;
41
40
  let quote_char = Option::<String>::try_convert(arguments[17])?;
@@ -44,17 +43,16 @@ impl RbBatchedCsv {
44
43
  let try_parse_dates = bool::try_convert(arguments[20])?;
45
44
  let skip_rows_after_header = usize::try_convert(arguments[21])?;
46
45
  let row_index = Option::<(String, IdxSize)>::try_convert(arguments[22])?;
47
- let sample_size = usize::try_convert(arguments[23])?;
48
- let eol_char = String::try_convert(arguments[24])?;
49
- let raise_if_empty = bool::try_convert(arguments[25])?;
50
- let truncate_ragged_lines = bool::try_convert(arguments[26])?;
51
- let decimal_comma = bool::try_convert(arguments[27])?;
46
+ let eol_char = String::try_convert(arguments[23])?;
47
+ let raise_if_empty = bool::try_convert(arguments[24])?;
48
+ let truncate_ragged_lines = bool::try_convert(arguments[25])?;
49
+ let decimal_comma = bool::try_convert(arguments[26])?;
52
50
  // end arguments
53
51
 
54
52
  let null_values = null_values.map(|w| w.0);
55
53
  let eol_char = eol_char.as_bytes()[0];
56
54
  let row_index = row_index.map(|(name, offset)| RowIndex {
57
- name: Arc::from(name.as_str()),
55
+ name: name.into(),
58
56
  offset,
59
57
  });
60
58
  let quote_char = if let Some(s) = quote_char {
@@ -72,7 +70,7 @@ impl RbBatchedCsv {
72
70
  .iter()
73
71
  .map(|(name, dtype)| {
74
72
  let dtype = dtype.0.clone();
75
- Field::new(name, dtype)
73
+ Field::new((&**name).into(), dtype)
76
74
  })
77
75
  .collect::<Schema>()
78
76
  });
@@ -84,7 +82,7 @@ impl RbBatchedCsv {
84
82
  .collect::<Vec<_>>()
85
83
  });
86
84
 
87
- let file = std::fs::File::open(path).map_err(RbPolarsErr::io)?;
85
+ let file = std::fs::File::open(path).map_err(RbPolarsErr::from)?;
88
86
  let reader = Box::new(file) as Box<dyn MmapBytesReader>;
89
87
  let reader = CsvReadOptions::default()
90
88
  .with_infer_schema_length(infer_schema_length)
@@ -95,13 +93,12 @@ impl RbBatchedCsv {
95
93
  .with_projection(projection.map(Arc::new))
96
94
  .with_rechunk(rechunk)
97
95
  .with_chunk_size(chunk_size)
98
- .with_columns(columns.map(Arc::from))
96
+ .with_columns(columns.map(|x| x.into_iter().map(PlSmallStr::from_string).collect()))
99
97
  .with_n_threads(n_threads)
100
98
  .with_dtype_overwrite(overwrite_dtype_slice.map(Arc::new))
101
99
  .with_low_memory(low_memory)
102
100
  .with_skip_rows_after_header(skip_rows_after_header)
103
101
  .with_row_index(row_index)
104
- .with_sample_size(sample_size)
105
102
  .with_raise_if_empty(raise_if_empty)
106
103
  .with_parse_options(
107
104
  CsvParseOptions::default()
@@ -132,7 +129,7 @@ impl RbBatchedCsv {
132
129
  let batches = reader
133
130
  .borrow()
134
131
  .lock()
135
- .map_err(|e| RbPolarsErr::other(e.to_string()))?
132
+ .map_err(|e| RbPolarsErr::Other(e.to_string()))?
136
133
  .next_batches(n)
137
134
  .map_err(RbPolarsErr::from)?;
138
135
 
data/ext/polars/src/conversion/any_value.rs CHANGED
@@ -7,9 +7,9 @@ use polars_core::utils::any_values_to_supertype_and_n_dtypes;
7
7
 
8
8
  use super::{struct_dict, ObjectValue, Wrap};
9
9
 
10
- use crate::error::RbOverflowError;
10
+ use crate::exceptions::RbOverflowError;
11
11
  use crate::rb_modules::utils;
12
- use crate::{RbPolarsErr, RbResult, RbSeries};
12
+ use crate::{RbErr, RbPolarsErr, RbResult, RbSeries};
13
13
 
14
14
  impl IntoValue for Wrap<AnyValue<'_>> {
15
15
  fn into_value_with(self, ruby: &Ruby) -> Value {
@@ -47,12 +47,20 @@ pub(crate) fn any_value_into_rb_object(av: AnyValue, ruby: &Ruby) -> Value {
47
47
  };
48
48
  s.into_value()
49
49
  }
50
+ AnyValue::CategoricalOwned(idx, rev, arr) | AnyValue::EnumOwned(idx, rev, arr) => {
51
+ let s = if arr.is_null() {
52
+ rev.get(idx)
53
+ } else {
54
+ unsafe { arr.deref_unchecked().value(idx as usize) }
55
+ };
56
+ s.into_value()
57
+ }
50
58
  AnyValue::Date(v) => utils().funcall("_to_ruby_date", (v,)).unwrap(),
51
59
  AnyValue::Datetime(v, time_unit, time_zone) => {
52
- let time_unit = time_unit.to_ascii();
53
- utils()
54
- .funcall("_to_ruby_datetime", (v, time_unit, time_zone.clone()))
55
- .unwrap()
60
+ datetime_to_rb_object(v, time_unit, time_zone)
61
+ }
62
+ AnyValue::DatetimeOwned(v, time_unit, time_zone) => {
63
+ datetime_to_rb_object(v, time_unit, time_zone.as_ref().map(AsRef::as_ref))
56
64
  }
57
65
  AnyValue::Duration(v, time_unit) => {
58
66
  let time_unit = time_unit.to_ascii();
@@ -66,11 +74,11 @@ pub(crate) fn any_value_into_rb_object(av: AnyValue, ruby: &Ruby) -> Value {
66
74
  AnyValue::StructOwned(payload) => struct_dict(payload.0.into_iter(), &payload.1),
67
75
  AnyValue::Object(v) => {
68
76
  let object = v.as_any().downcast_ref::<ObjectValue>().unwrap();
69
- object.to_object()
77
+ object.to_value()
70
78
  }
71
79
  AnyValue::ObjectOwned(v) => {
72
80
  let object = v.0.as_any().downcast_ref::<ObjectValue>().unwrap();
73
- object.to_object()
81
+ object.to_value()
74
82
  }
75
83
  AnyValue::Binary(v) => RString::from_slice(v).into_value(),
76
84
  AnyValue::BinaryOwned(v) => RString::from_slice(&v).into_value(),
@@ -80,6 +88,13 @@ pub(crate) fn any_value_into_rb_object(av: AnyValue, ruby: &Ruby) -> Value {
80
88
  }
81
89
  }
82
90
 
91
+ fn datetime_to_rb_object(v: i64, tu: TimeUnit, tz: Option<&TimeZone>) -> Value {
92
+ let tu = tu.to_ascii();
93
+ utils()
94
+ .funcall("_to_ruby_datetime", (v, tu, tz.map(|v| v.to_string())))
95
+ .unwrap()
96
+ }
97
+
83
98
  pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<AnyValue<'s>> {
84
99
  // Conversion functions.
85
100
  fn get_null(_ob: Value, _strict: bool) -> RbResult<AnyValue<'static>> {
@@ -122,7 +137,10 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
122
137
  fn get_list(ob: Value, _strict: bool) -> RbResult<AnyValue<'static>> {
123
138
  let v = RArray::from_value(ob).unwrap();
124
139
  if v.is_empty() {
125
- Ok(AnyValue::List(Series::new_empty("", &DataType::Null)))
140
+ Ok(AnyValue::List(Series::new_empty(
141
+ PlSmallStr::EMPTY,
142
+ &DataType::Null,
143
+ )))
126
144
  } else {
127
145
  let list = v;
128
146
 
@@ -142,7 +160,7 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
142
160
  avs.push(Wrap::<AnyValue>::try_convert(item)?.0)
143
161
  }
144
162
 
145
- let s = Series::from_any_values_and_dtype("", &avs, &dtype, true)
163
+ let s = Series::from_any_values_and_dtype(PlSmallStr::EMPTY, &avs, &dtype, true)
146
164
  .map_err(RbPolarsErr::from)?;
147
165
  Ok(AnyValue::List(s))
148
166
  }
@@ -158,11 +176,10 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
158
176
  let len = dict.len();
159
177
  let mut keys = Vec::with_capacity(len);
160
178
  let mut vals = Vec::with_capacity(len);
161
- dict.foreach(|k: Value, v: Value| {
162
- let key = String::try_convert(k)?;
163
- let val = Wrap::<AnyValue>::try_convert(v)?.0;
179
+ dict.foreach(|key: String, val: Wrap<AnyValue>| {
180
+ let val = val.0;
164
181
  let dtype = DataType::from(&val);
165
- keys.push(Field::new(&key, dtype));
182
+ keys.push(Field::new(key.into(), dtype));
166
183
  vals.push(val);
167
184
  Ok(ForEach::Continue)
168
185
  })?;
@@ -184,7 +201,7 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
184
201
  let v = sec * 1_000_000_000 + nsec;
185
202
  // TODO support time zone when possible
186
203
  // https://github.com/pola-rs/polars/issues/9103
187
- Ok(AnyValue::Datetime(v, TimeUnit::Nanoseconds, &None))
204
+ Ok(AnyValue::Datetime(v, TimeUnit::Nanoseconds, None))
188
205
  }
189
206
 
190
207
  fn get_datetime(ob: Value, _strict: bool) -> RbResult<AnyValue<'static>> {
@@ -193,7 +210,7 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
193
210
  Ok(AnyValue::Datetime(
194
211
  sec * 1_000_000_000 + nsec,
195
212
  TimeUnit::Nanoseconds,
196
- &None,
213
+ None,
197
214
  ))
198
215
  }
199
216
 
@@ -218,7 +235,9 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
218
235
 
219
236
  let (sign, digits, _, exp): (i8, String, i32, i32) = ob.funcall("split", ()).unwrap();
220
237
  let (mut v, scale) = abs_decimal_from_digits(digits, exp).ok_or_else(|| {
221
- RbPolarsErr::other("BigDecimal is too large to fit in Decimal128".into())
238
+ RbErr::from(RbPolarsErr::Other(
239
+ "BigDecimal is too large to fit in Decimal128".into(),
240
+ ))
222
241
  })?;
223
242
  if sign < 0 {
224
243
  // TODO better error
@@ -253,9 +272,6 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
253
272
  } else if ob.is_kind_of(crate::rb_modules::bigdecimal()) {
254
273
  get_decimal(ob, strict)
255
274
  } else {
256
- Err(RbPolarsErr::other(format!(
257
- "object type not supported {:?}",
258
- ob
259
- )))
275
+ Err(RbPolarsErr::Other(format!("object type not supported {:?}", ob)).into())
260
276
  }
261
277
  }
data/ext/polars/src/conversion/chunked_array.rs CHANGED
@@ -9,7 +9,7 @@ use crate::RbResult;
9
9
  impl TryConvert for Wrap<StringChunked> {
10
10
  fn try_convert(obj: Value) -> RbResult<Self> {
11
11
  let (seq, len) = get_rbseq(obj)?;
12
- let mut builder = StringChunkedBuilder::new("", len);
12
+ let mut builder = StringChunkedBuilder::new(PlSmallStr::EMPTY, len);
13
13
 
14
14
  for res in seq.into_iter() {
15
15
  let item = res;
@@ -25,7 +25,7 @@ impl TryConvert for Wrap<StringChunked> {
25
25
  impl TryConvert for Wrap<BinaryChunked> {
26
26
  fn try_convert(obj: Value) -> RbResult<Self> {
27
27
  let (seq, len) = get_rbseq(obj)?;
28
- let mut builder = BinaryChunkedBuilder::new("", len);
28
+ let mut builder = BinaryChunkedBuilder::new(PlSmallStr::EMPTY, len);
29
29
 
30
30
  for res in seq.into_iter() {
31
31
  let item = res;
@@ -90,7 +90,7 @@ impl IntoValue for Wrap<&DatetimeChunked> {
90
90
  fn into_value_with(self, _: &Ruby) -> Value {
91
91
  let utils = utils();
92
92
  let time_unit = Wrap(self.0.time_unit()).into_value();
93
- let time_zone = self.0.time_zone().clone().into_value();
93
+ let time_zone = self.0.time_zone().as_deref().map(|v| v.into_value());
94
94
  let iter = self.0.into_iter().map(|opt_v| {
95
95
  opt_v.map(|v| {
96
96
  utils