polars-df 0.14.0 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +35 -0
  3. data/Cargo.lock +1523 -378
  4. data/LICENSE.txt +1 -0
  5. data/README.md +38 -4
  6. data/ext/polars/Cargo.toml +15 -5
  7. data/ext/polars/src/batched_csv.rs +7 -10
  8. data/ext/polars/src/conversion/any_value.rs +31 -21
  9. data/ext/polars/src/conversion/mod.rs +155 -48
  10. data/ext/polars/src/dataframe/construction.rs +0 -3
  11. data/ext/polars/src/dataframe/export.rs +9 -2
  12. data/ext/polars/src/dataframe/general.rs +15 -57
  13. data/ext/polars/src/dataframe/io.rs +77 -169
  14. data/ext/polars/src/dataframe/mod.rs +1 -0
  15. data/ext/polars/src/dataframe/serde.rs +15 -0
  16. data/ext/polars/src/error.rs +31 -48
  17. data/ext/polars/src/exceptions.rs +24 -0
  18. data/ext/polars/src/expr/binary.rs +4 -42
  19. data/ext/polars/src/expr/datetime.rs +5 -4
  20. data/ext/polars/src/expr/general.rs +16 -22
  21. data/ext/polars/src/expr/list.rs +18 -11
  22. data/ext/polars/src/expr/meta.rs +6 -2
  23. data/ext/polars/src/expr/rolling.rs +6 -7
  24. data/ext/polars/src/expr/string.rs +9 -36
  25. data/ext/polars/src/file.rs +78 -23
  26. data/ext/polars/src/functions/aggregation.rs +4 -4
  27. data/ext/polars/src/functions/business.rs +15 -0
  28. data/ext/polars/src/functions/io.rs +34 -13
  29. data/ext/polars/src/functions/lazy.rs +22 -12
  30. data/ext/polars/src/functions/meta.rs +1 -1
  31. data/ext/polars/src/functions/mod.rs +1 -0
  32. data/ext/polars/src/interop/arrow/mod.rs +1 -0
  33. data/ext/polars/src/interop/arrow/to_ruby.rs +83 -0
  34. data/ext/polars/src/interop/mod.rs +1 -0
  35. data/ext/polars/src/lazyframe/general.rs +920 -0
  36. data/ext/polars/src/lazyframe/mod.rs +3 -827
  37. data/ext/polars/src/lazyframe/serde.rs +31 -0
  38. data/ext/polars/src/lib.rs +54 -27
  39. data/ext/polars/src/map/dataframe.rs +10 -6
  40. data/ext/polars/src/map/lazy.rs +65 -4
  41. data/ext/polars/src/map/mod.rs +9 -8
  42. data/ext/polars/src/on_startup.rs +1 -1
  43. data/ext/polars/src/series/aggregation.rs +1 -5
  44. data/ext/polars/src/series/arithmetic.rs +10 -10
  45. data/ext/polars/src/series/construction.rs +2 -2
  46. data/ext/polars/src/series/export.rs +1 -1
  47. data/ext/polars/src/series/general.rs +631 -0
  48. data/ext/polars/src/series/import.rs +55 -0
  49. data/ext/polars/src/series/mod.rs +11 -638
  50. data/ext/polars/src/series/scatter.rs +2 -2
  51. data/ext/polars/src/utils.rs +0 -20
  52. data/lib/polars/batched_csv_reader.rb +0 -2
  53. data/lib/polars/binary_expr.rb +133 -9
  54. data/lib/polars/binary_name_space.rb +101 -6
  55. data/lib/polars/config.rb +4 -0
  56. data/lib/polars/data_frame.rb +452 -101
  57. data/lib/polars/data_type_group.rb +28 -0
  58. data/lib/polars/data_types.rb +3 -1
  59. data/lib/polars/date_time_expr.rb +244 -0
  60. data/lib/polars/date_time_name_space.rb +87 -0
  61. data/lib/polars/expr.rb +103 -2
  62. data/lib/polars/functions/aggregation/horizontal.rb +10 -4
  63. data/lib/polars/functions/as_datatype.rb +51 -2
  64. data/lib/polars/functions/col.rb +1 -1
  65. data/lib/polars/functions/eager.rb +1 -3
  66. data/lib/polars/functions/lazy.rb +95 -13
  67. data/lib/polars/functions/range/time_range.rb +21 -21
  68. data/lib/polars/io/csv.rb +14 -16
  69. data/lib/polars/io/database.rb +2 -2
  70. data/lib/polars/io/delta.rb +126 -0
  71. data/lib/polars/io/ipc.rb +14 -4
  72. data/lib/polars/io/ndjson.rb +10 -0
  73. data/lib/polars/io/parquet.rb +168 -111
  74. data/lib/polars/lazy_frame.rb +684 -20
  75. data/lib/polars/list_name_space.rb +169 -0
  76. data/lib/polars/selectors.rb +1226 -0
  77. data/lib/polars/series.rb +465 -35
  78. data/lib/polars/string_cache.rb +27 -1
  79. data/lib/polars/string_expr.rb +0 -1
  80. data/lib/polars/string_name_space.rb +73 -3
  81. data/lib/polars/struct_name_space.rb +31 -7
  82. data/lib/polars/utils/various.rb +5 -1
  83. data/lib/polars/utils.rb +45 -10
  84. data/lib/polars/version.rb +1 -1
  85. data/lib/polars.rb +17 -1
  86. metadata +16 -9
  87. data/lib/polars/functions.rb +0 -57
data/LICENSE.txt CHANGED
@@ -1,5 +1,6 @@
1
1
  Copyright (c) 2020 Ritchie Vink
2
2
  Copyright (c) 2022-2024 Andrew Kane
3
+ Some portions Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
4
 
4
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
5
6
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -14,18 +14,17 @@ gem "polars-df"
14
14
 
15
15
  ## Getting Started
16
16
 
17
- This library follows the [Polars Python API](https://pola-rs.github.io/polars/py-polars/html/reference/index.html).
17
+ This library follows the [Polars Python API](https://docs.pola.rs/api/python/stable/reference/index.html).
18
18
 
19
19
  ```ruby
20
- Polars.read_csv("iris.csv")
21
- .lazy
20
+ Polars.scan_csv("iris.csv")
22
21
  .filter(Polars.col("sepal_length") > 5)
23
22
  .group_by("species")
24
23
  .agg(Polars.all.sum)
25
24
  .collect
26
25
  ```
27
26
 
28
- You can follow [Polars tutorials](https://pola-rs.github.io/polars-book/user-guide/) and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems.
27
+ You can follow [Polars tutorials](https://docs.pola.rs/user-guide/getting-started/) and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems.
29
28
 
30
29
  ## Reference
31
30
 
@@ -89,6 +88,15 @@ From Avro
89
88
  Polars.read_avro("file.avro")
90
89
  ```
91
90
 
91
+ From Delta Lake (requires [deltalake-rb](https://github.com/ankane/delta-ruby)) [experimental, unreleased]
92
+
93
+ ```ruby
94
+ Polars.read_delta("./table")
95
+
96
+ # or lazily with
97
+ Polars.scan_delta("./table")
98
+ ```
99
+
92
100
  From a hash
93
101
 
94
102
  ```ruby
@@ -337,6 +345,32 @@ Parquet
337
345
  df.write_parquet("file.parquet")
338
346
  ```
339
347
 
348
+ JSON
349
+
350
+ ```ruby
351
+ df.write_json("file.json")
352
+ # or
353
+ df.write_ndjson("file.ndjson")
354
+ ```
355
+
356
+ Feather / Arrow IPC
357
+
358
+ ```ruby
359
+ df.write_ipc("file.arrow")
360
+ ```
361
+
362
+ Avro
363
+
364
+ ```ruby
365
+ df.write_avro("file.avro")
366
+ ```
367
+
368
+ Delta Lake [experimental, unreleased]
369
+
370
+ ```ruby
371
+ df.write_delta("./table")
372
+ ```
373
+
340
374
  Numo array
341
375
 
342
376
  ```ruby
data/ext/polars/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "polars"
3
- version = "0.14.0"
3
+ version = "0.16.0"
4
4
  license = "MIT"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
@@ -12,16 +12,20 @@ crate-type = ["cdylib"]
12
12
 
13
13
  [dependencies]
14
14
  ahash = "0.8"
15
+ arrow = { package = "polars-arrow", version = "=0.45.1" }
16
+ bytes = "1"
15
17
  chrono = "0.4"
16
18
  either = "1.8"
17
19
  magnus = "0.7"
18
- polars-core = "=0.43.1"
19
- polars-parquet = "=0.43.1"
20
- polars-utils = "=0.43.1"
20
+ polars-core = "=0.45.1"
21
+ polars-plan = "=0.45.1"
22
+ polars-parquet = "=0.45.1"
23
+ polars-utils = "=0.45.1"
24
+ regex = "1"
21
25
  serde_json = "1"
22
26
 
23
27
  [dependencies.polars]
24
- version = "=0.43.1"
28
+ version = "=0.45.1"
25
29
  features = [
26
30
  "abs",
27
31
  "approx_unique",
@@ -30,7 +34,11 @@ features = [
30
34
  "array_count",
31
35
  "asof_join",
32
36
  "avro",
37
+ "aws",
38
+ "azure",
33
39
  "binary_encoding",
40
+ "business",
41
+ "cloud",
34
42
  "concat_str",
35
43
  "cov",
36
44
  "cross_join",
@@ -51,6 +59,8 @@ features = [
51
59
  "extract_jsonpath",
52
60
  "find_many",
53
61
  "fmt",
62
+ "gcp",
63
+ "http",
54
64
  "interpolate",
55
65
  "ipc",
56
66
  "ipc_streaming",
data/ext/polars/src/batched_csv.rs CHANGED
@@ -34,8 +34,7 @@ impl RbBatchedCsv {
34
34
  let n_threads = Option::<usize>::try_convert(arguments[11])?;
35
35
  let path = PathBuf::try_convert(arguments[12])?;
36
36
  let overwrite_dtype = Option::<Vec<(String, Wrap<DataType>)>>::try_convert(arguments[13])?;
37
- // TODO fix
38
- let overwrite_dtype_slice = Option::<Vec<Wrap<DataType>>>::None; // Option::<Vec<Wrap<DataType>>>::try_convert(arguments[14])?;
37
+ let overwrite_dtype_slice = Option::<Vec<Wrap<DataType>>>::try_convert(arguments[14])?;
39
38
  let low_memory = bool::try_convert(arguments[15])?;
40
39
  let comment_prefix = Option::<String>::try_convert(arguments[16])?;
41
40
  let quote_char = Option::<String>::try_convert(arguments[17])?;
@@ -44,11 +43,10 @@ impl RbBatchedCsv {
44
43
  let try_parse_dates = bool::try_convert(arguments[20])?;
45
44
  let skip_rows_after_header = usize::try_convert(arguments[21])?;
46
45
  let row_index = Option::<(String, IdxSize)>::try_convert(arguments[22])?;
47
- let sample_size = usize::try_convert(arguments[23])?;
48
- let eol_char = String::try_convert(arguments[24])?;
49
- let raise_if_empty = bool::try_convert(arguments[25])?;
50
- let truncate_ragged_lines = bool::try_convert(arguments[26])?;
51
- let decimal_comma = bool::try_convert(arguments[27])?;
46
+ let eol_char = String::try_convert(arguments[23])?;
47
+ let raise_if_empty = bool::try_convert(arguments[24])?;
48
+ let truncate_ragged_lines = bool::try_convert(arguments[25])?;
49
+ let decimal_comma = bool::try_convert(arguments[26])?;
52
50
  // end arguments
53
51
 
54
52
  let null_values = null_values.map(|w| w.0);
@@ -84,7 +82,7 @@ impl RbBatchedCsv {
84
82
  .collect::<Vec<_>>()
85
83
  });
86
84
 
87
- let file = std::fs::File::open(path).map_err(RbPolarsErr::io)?;
85
+ let file = std::fs::File::open(path).map_err(RbPolarsErr::from)?;
88
86
  let reader = Box::new(file) as Box<dyn MmapBytesReader>;
89
87
  let reader = CsvReadOptions::default()
90
88
  .with_infer_schema_length(infer_schema_length)
@@ -101,7 +99,6 @@ impl RbBatchedCsv {
101
99
  .with_low_memory(low_memory)
102
100
  .with_skip_rows_after_header(skip_rows_after_header)
103
101
  .with_row_index(row_index)
104
- .with_sample_size(sample_size)
105
102
  .with_raise_if_empty(raise_if_empty)
106
103
  .with_parse_options(
107
104
  CsvParseOptions::default()
@@ -132,7 +129,7 @@ impl RbBatchedCsv {
132
129
  let batches = reader
133
130
  .borrow()
134
131
  .lock()
135
- .map_err(|e| RbPolarsErr::other(e.to_string()))?
132
+ .map_err(|e| RbPolarsErr::Other(e.to_string()))?
136
133
  .next_batches(n)
137
134
  .map_err(RbPolarsErr::from)?;
138
135
 
data/ext/polars/src/conversion/any_value.rs CHANGED
@@ -7,9 +7,9 @@ use polars_core::utils::any_values_to_supertype_and_n_dtypes;
7
7
 
8
8
  use super::{struct_dict, ObjectValue, Wrap};
9
9
 
10
- use crate::error::RbOverflowError;
10
+ use crate::exceptions::RbOverflowError;
11
11
  use crate::rb_modules::utils;
12
- use crate::{RbPolarsErr, RbResult, RbSeries};
12
+ use crate::{RbErr, RbPolarsErr, RbResult, RbSeries};
13
13
 
14
14
  impl IntoValue for Wrap<AnyValue<'_>> {
15
15
  fn into_value_with(self, ruby: &Ruby) -> Value {
@@ -47,15 +47,20 @@ pub(crate) fn any_value_into_rb_object(av: AnyValue, ruby: &Ruby) -> Value {
47
47
  };
48
48
  s.into_value()
49
49
  }
50
+ AnyValue::CategoricalOwned(idx, rev, arr) | AnyValue::EnumOwned(idx, rev, arr) => {
51
+ let s = if arr.is_null() {
52
+ rev.get(idx)
53
+ } else {
54
+ unsafe { arr.deref_unchecked().value(idx as usize) }
55
+ };
56
+ s.into_value()
57
+ }
50
58
  AnyValue::Date(v) => utils().funcall("_to_ruby_date", (v,)).unwrap(),
51
59
  AnyValue::Datetime(v, time_unit, time_zone) => {
52
- let time_unit = time_unit.to_ascii();
53
- utils()
54
- .funcall(
55
- "_to_ruby_datetime",
56
- (v, time_unit, time_zone.as_ref().map(|v| v.to_string())),
57
- )
58
- .unwrap()
60
+ datetime_to_rb_object(v, time_unit, time_zone)
61
+ }
62
+ AnyValue::DatetimeOwned(v, time_unit, time_zone) => {
63
+ datetime_to_rb_object(v, time_unit, time_zone.as_ref().map(AsRef::as_ref))
59
64
  }
60
65
  AnyValue::Duration(v, time_unit) => {
61
66
  let time_unit = time_unit.to_ascii();
@@ -69,11 +74,11 @@ pub(crate) fn any_value_into_rb_object(av: AnyValue, ruby: &Ruby) -> Value {
69
74
  AnyValue::StructOwned(payload) => struct_dict(payload.0.into_iter(), &payload.1),
70
75
  AnyValue::Object(v) => {
71
76
  let object = v.as_any().downcast_ref::<ObjectValue>().unwrap();
72
- object.to_object()
77
+ object.to_value()
73
78
  }
74
79
  AnyValue::ObjectOwned(v) => {
75
80
  let object = v.0.as_any().downcast_ref::<ObjectValue>().unwrap();
76
- object.to_object()
81
+ object.to_value()
77
82
  }
78
83
  AnyValue::Binary(v) => RString::from_slice(v).into_value(),
79
84
  AnyValue::BinaryOwned(v) => RString::from_slice(&v).into_value(),
@@ -83,6 +88,13 @@ pub(crate) fn any_value_into_rb_object(av: AnyValue, ruby: &Ruby) -> Value {
83
88
  }
84
89
  }
85
90
 
91
+ fn datetime_to_rb_object(v: i64, tu: TimeUnit, tz: Option<&TimeZone>) -> Value {
92
+ let tu = tu.to_ascii();
93
+ utils()
94
+ .funcall("_to_ruby_datetime", (v, tu, tz.map(|v| v.to_string())))
95
+ .unwrap()
96
+ }
97
+
86
98
  pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<AnyValue<'s>> {
87
99
  // Conversion functions.
88
100
  fn get_null(_ob: Value, _strict: bool) -> RbResult<AnyValue<'static>> {
@@ -164,9 +176,8 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
164
176
  let len = dict.len();
165
177
  let mut keys = Vec::with_capacity(len);
166
178
  let mut vals = Vec::with_capacity(len);
167
- dict.foreach(|k: Value, v: Value| {
168
- let key = String::try_convert(k)?;
169
- let val = Wrap::<AnyValue>::try_convert(v)?.0;
179
+ dict.foreach(|key: String, val: Wrap<AnyValue>| {
180
+ let val = val.0;
170
181
  let dtype = DataType::from(&val);
171
182
  keys.push(Field::new(key.into(), dtype));
172
183
  vals.push(val);
@@ -190,7 +201,7 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
190
201
  let v = sec * 1_000_000_000 + nsec;
191
202
  // TODO support time zone when possible
192
203
  // https://github.com/pola-rs/polars/issues/9103
193
- Ok(AnyValue::Datetime(v, TimeUnit::Nanoseconds, &None))
204
+ Ok(AnyValue::Datetime(v, TimeUnit::Nanoseconds, None))
194
205
  }
195
206
 
196
207
  fn get_datetime(ob: Value, _strict: bool) -> RbResult<AnyValue<'static>> {
@@ -199,7 +210,7 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
199
210
  Ok(AnyValue::Datetime(
200
211
  sec * 1_000_000_000 + nsec,
201
212
  TimeUnit::Nanoseconds,
202
- &None,
213
+ None,
203
214
  ))
204
215
  }
205
216
 
@@ -224,7 +235,9 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
224
235
 
225
236
  let (sign, digits, _, exp): (i8, String, i32, i32) = ob.funcall("split", ()).unwrap();
226
237
  let (mut v, scale) = abs_decimal_from_digits(digits, exp).ok_or_else(|| {
227
- RbPolarsErr::other("BigDecimal is too large to fit in Decimal128".into())
238
+ RbErr::from(RbPolarsErr::Other(
239
+ "BigDecimal is too large to fit in Decimal128".into(),
240
+ ))
228
241
  })?;
229
242
  if sign < 0 {
230
243
  // TODO better error
@@ -259,9 +272,6 @@ pub(crate) fn rb_object_to_any_value<'s>(ob: Value, strict: bool) -> RbResult<An
259
272
  } else if ob.is_kind_of(crate::rb_modules::bigdecimal()) {
260
273
  get_decimal(ob, strict)
261
274
  } else {
262
- Err(RbPolarsErr::other(format!(
263
- "object type not supported {:?}",
264
- ob
265
- )))
275
+ Err(RbPolarsErr::Other(format!("object type not supported {:?}", ob)).into())
266
276
  }
267
277
  }