polars-df 0.1.1 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45)
  1. checksums.yaml +4 -4
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +8 -0
  4. data/Cargo.lock +2 -1
  5. data/README.md +1 -1
  6. data/ext/polars/Cargo.toml +7 -1
  7. data/ext/polars/src/batched_csv.rs +120 -0
  8. data/ext/polars/src/conversion.rs +139 -6
  9. data/ext/polars/src/dataframe.rs +360 -15
  10. data/ext/polars/src/error.rs +9 -0
  11. data/ext/polars/src/file.rs +8 -7
  12. data/ext/polars/src/lazy/apply.rs +7 -0
  13. data/ext/polars/src/lazy/dataframe.rs +135 -3
  14. data/ext/polars/src/lazy/dsl.rs +97 -2
  15. data/ext/polars/src/lazy/meta.rs +1 -1
  16. data/ext/polars/src/lazy/mod.rs +1 -0
  17. data/ext/polars/src/lib.rs +227 -12
  18. data/ext/polars/src/series.rs +190 -38
  19. data/ext/polars/src/set.rs +91 -0
  20. data/ext/polars/src/utils.rs +19 -0
  21. data/lib/polars/batched_csv_reader.rb +96 -0
  22. data/lib/polars/cat_expr.rb +39 -0
  23. data/lib/polars/data_frame.rb +2813 -100
  24. data/lib/polars/date_time_expr.rb +1282 -7
  25. data/lib/polars/exceptions.rb +20 -0
  26. data/lib/polars/expr.rb +631 -11
  27. data/lib/polars/expr_dispatch.rb +14 -0
  28. data/lib/polars/functions.rb +219 -0
  29. data/lib/polars/group_by.rb +517 -0
  30. data/lib/polars/io.rb +763 -4
  31. data/lib/polars/lazy_frame.rb +1415 -67
  32. data/lib/polars/lazy_functions.rb +430 -9
  33. data/lib/polars/lazy_group_by.rb +79 -0
  34. data/lib/polars/list_expr.rb +5 -0
  35. data/lib/polars/meta_expr.rb +21 -0
  36. data/lib/polars/series.rb +2244 -192
  37. data/lib/polars/slice.rb +104 -0
  38. data/lib/polars/string_expr.rb +663 -2
  39. data/lib/polars/struct_expr.rb +73 -0
  40. data/lib/polars/utils.rb +76 -3
  41. data/lib/polars/version.rb +2 -1
  42. data/lib/polars/when.rb +1 -0
  43. data/lib/polars/when_then.rb +1 -0
  44. data/lib/polars.rb +8 -2
  45. metadata +12 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2a39332a375d211cf0e04ea95cdb4fe315c843b3df474ee3fef135ec63eb0516
4
- data.tar.gz: 41881b2aa8da2bb2850fbcaba11ec1409204ec99b7d36d1470a7536684b02509
3
+ metadata.gz: 9e6fb732e5dafe2fde285322554bd9159483cbbdf17d6e2bba9cba9a83563b47
4
+ data.tar.gz: 1b4249d0c0100f136973c601b8404cb6d92abc632d5ed0476bd93bc5360a11dc
5
5
  SHA512:
6
- metadata.gz: 17ba7456808346b025d52d14d5fa6d91dc0d18385d0ee796c1b821eb6272875e15927e34da8f41c1872e4b158bfd32165ce1ee4adc5cebf4474fc7e86ad98e31
7
- data.tar.gz: 4c3ef1aa3ed5c6e0620d555bab73d20e8b7cd495e26c97f4ddd41f14ed8d3883331cff36158fbe4e1dbc3b86825fa83dfa4b832f901db35a73a9c021b8dd955c
6
+ metadata.gz: d9414d6f60c489e2b3b72885288822083ba8c04bac4053f4e34c1d53ee805d164f17fe4b8b3a8f4ff562550bcc657f374bea6e250b52985367f601ea50e3037f
7
+ data.tar.gz: 9e3a7cfe105f03ec20e9c26aa38c1475074ccc1ea057a170a97b7068b41943d561d50af49bb1d1f74b7705809dc1375900f542ab93683ba627dea080274f6d91
data/.yardopts ADDED
@@ -0,0 +1,3 @@
1
+ --no-private
2
+ --markup markdown
3
+ --embed-mixins
data/CHANGELOG.md CHANGED
@@ -1,3 +1,11 @@
1
+ ## 0.1.3 (2022-11-27)
2
+
3
+ - Added more methods
4
+
5
+ ## 0.1.2 (2022-11-25)
6
+
7
+ - Added more methods
8
+
1
9
  ## 0.1.1 (2022-11-23)
2
10
 
3
11
  - Added more methods
data/Cargo.lock CHANGED
@@ -1160,7 +1160,7 @@ dependencies = [
1160
1160
 
1161
1161
  [[package]]
1162
1162
  name = "polars"
1163
- version = "0.1.0"
1163
+ version = "0.1.3"
1164
1164
  dependencies = [
1165
1165
  "magnus",
1166
1166
  "polars 0.25.1",
@@ -1217,6 +1217,7 @@ dependencies = [
1217
1217
  "rayon",
1218
1218
  "regex",
1219
1219
  "serde",
1220
+ "serde_json",
1220
1221
  "smartstring",
1221
1222
  "thiserror",
1222
1223
  ]
data/README.md CHANGED
@@ -27,7 +27,7 @@ Polars.read_csv("iris.csv")
27
27
  .collect
28
28
  ```
29
29
 
30
- You can follow [Polars tutorials](https://pola-rs.github.io/polars-book/user-guide/introduction.html) and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems. Note that many methods and options are missing at the moment.
30
+ You can follow [Polars tutorials](https://pola-rs.github.io/polars-book/user-guide/introduction.html) and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems. Some methods are missing at the moment.
31
31
 
32
32
  ## Examples
33
33
 
data/ext/polars/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "polars"
3
- version = "0.1.1"
3
+ version = "0.1.3"
4
4
  authors = ["Andrew Kane <andrew@ankane.org>"]
5
5
  edition = "2021"
6
6
  publish = false
@@ -17,10 +17,12 @@ version = "0.25.1"
17
17
  features = [
18
18
  "abs",
19
19
  "arange",
20
+ "arg_where",
20
21
  "concat_str",
21
22
  "csv-file",
22
23
  "cum_agg",
23
24
  "cumulative_eval",
25
+ "dataframe_arithmetic",
24
26
  "date_offset",
25
27
  "diagonal_concat",
26
28
  "diff",
@@ -38,16 +40,20 @@ features = [
38
40
  "lazy",
39
41
  "lazy_regex",
40
42
  "list_eval",
43
+ "list_to_struct",
41
44
  "log",
42
45
  "meta",
43
46
  "mode",
44
47
  "moment",
48
+ "object",
45
49
  "parquet",
46
50
  "partition_by",
47
51
  "pct_change",
48
52
  "product",
53
+ "propagate_nans",
49
54
  "random",
50
55
  "rank",
56
+ "reinterpret",
51
57
  "repeat_by",
52
58
  "rolling_window",
53
59
  "round_series",
data/ext/polars/src/batched_csv.rs ADDED
@@ -0,0 +1,120 @@
1
+ use magnus::Value;
2
+ use polars::io::mmap::MmapBytesReader;
3
+ use polars::io::RowCount;
4
+ use polars::prelude::read_impl::OwnedBatchedCsvReader;
5
+ use polars::prelude::*;
6
+ use std::cell::RefCell;
7
+ use std::path::PathBuf;
8
+
9
+ use crate::conversion::*;
10
+ use crate::{RbDataFrame, RbPolarsErr, RbResult};
11
+
12
+ #[magnus::wrap(class = "Polars::RbBatchedCsv")]
13
+ pub struct RbBatchedCsv {
14
+ pub reader: RefCell<OwnedBatchedCsvReader>,
15
+ }
16
+
17
+ impl RbBatchedCsv {
18
+ pub fn new(arguments: &[Value]) -> RbResult<Self> {
19
+ // start arguments
20
+ // this pattern is needed for more than 16
21
+ let infer_schema_length: Option<usize> = arguments[0].try_convert()?;
22
+ let chunk_size: usize = arguments[1].try_convert()?;
23
+ let has_header: bool = arguments[2].try_convert()?;
24
+ let ignore_errors: bool = arguments[3].try_convert()?;
25
+ let n_rows: Option<usize> = arguments[4].try_convert()?;
26
+ let skip_rows: usize = arguments[5].try_convert()?;
27
+ let projection: Option<Vec<usize>> = arguments[6].try_convert()?;
28
+ let sep: String = arguments[7].try_convert()?;
29
+ let rechunk: bool = arguments[8].try_convert()?;
30
+ let columns: Option<Vec<String>> = arguments[9].try_convert()?;
31
+ let encoding: Wrap<CsvEncoding> = arguments[10].try_convert()?;
32
+ let n_threads: Option<usize> = arguments[11].try_convert()?;
33
+ let path: PathBuf = arguments[12].try_convert()?;
34
+ let overwrite_dtype: Option<Vec<(String, Wrap<DataType>)>> = arguments[13].try_convert()?;
35
+ // TODO fix
36
+ let overwrite_dtype_slice: Option<Vec<Wrap<DataType>>> = None; // arguments[14].try_convert()?;
37
+ let low_memory: bool = arguments[15].try_convert()?;
38
+ let comment_char: Option<String> = arguments[16].try_convert()?;
39
+ let quote_char: Option<String> = arguments[17].try_convert()?;
40
+ let null_values: Option<Wrap<NullValues>> = arguments[18].try_convert()?;
41
+ let parse_dates: bool = arguments[19].try_convert()?;
42
+ let skip_rows_after_header: usize = arguments[20].try_convert()?;
43
+ let row_count: Option<(String, IdxSize)> = arguments[21].try_convert()?;
44
+ let sample_size: usize = arguments[22].try_convert()?;
45
+ let eol_char: String = arguments[23].try_convert()?;
46
+ // end arguments
47
+
48
+ let null_values = null_values.map(|w| w.0);
49
+ let comment_char = comment_char.map(|s| s.as_bytes()[0]);
50
+ let eol_char = eol_char.as_bytes()[0];
51
+
52
+ let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
53
+
54
+ let quote_char = if let Some(s) = quote_char {
55
+ if s.is_empty() {
56
+ None
57
+ } else {
58
+ Some(s.as_bytes()[0])
59
+ }
60
+ } else {
61
+ None
62
+ };
63
+
64
+ let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
65
+ let fields = overwrite_dtype.iter().map(|(name, dtype)| {
66
+ let dtype = dtype.0.clone();
67
+ Field::new(name, dtype)
68
+ });
69
+ Schema::from(fields)
70
+ });
71
+
72
+ let overwrite_dtype_slice = overwrite_dtype_slice.map(|overwrite_dtype| {
73
+ overwrite_dtype
74
+ .iter()
75
+ .map(|dt| dt.0.clone())
76
+ .collect::<Vec<_>>()
77
+ });
78
+
79
+ let file = std::fs::File::open(path).map_err(RbPolarsErr::io)?;
80
+ let reader = Box::new(file) as Box<dyn MmapBytesReader>;
81
+ let reader = CsvReader::new(reader)
82
+ .infer_schema(infer_schema_length)
83
+ .has_header(has_header)
84
+ .with_n_rows(n_rows)
85
+ .with_delimiter(sep.as_bytes()[0])
86
+ .with_skip_rows(skip_rows)
87
+ .with_ignore_parser_errors(ignore_errors)
88
+ .with_projection(projection)
89
+ .with_rechunk(rechunk)
90
+ .with_chunk_size(chunk_size)
91
+ .with_encoding(encoding.0)
92
+ .with_columns(columns)
93
+ .with_n_threads(n_threads)
94
+ .with_dtypes_slice(overwrite_dtype_slice.as_deref())
95
+ .low_memory(low_memory)
96
+ .with_comment_char(comment_char)
97
+ .with_null_values(null_values)
98
+ .with_parse_dates(parse_dates)
99
+ .with_quote_char(quote_char)
100
+ .with_end_of_line_char(eol_char)
101
+ .with_skip_rows_after_header(skip_rows_after_header)
102
+ .with_row_count(row_count)
103
+ .sample_size(sample_size)
104
+ .batched(overwrite_dtype.map(Arc::new))
105
+ .map_err(RbPolarsErr::from)?;
106
+
107
+ Ok(RbBatchedCsv {
108
+ reader: RefCell::new(reader),
109
+ })
110
+ }
111
+
112
+ pub fn next_batches(&self, n: usize) -> RbResult<Option<Vec<RbDataFrame>>> {
113
+ let batches = self
114
+ .reader
115
+ .borrow_mut()
116
+ .next_batches(n)
117
+ .map_err(RbPolarsErr::from)?;
118
+ Ok(batches.map(|batches| batches.into_iter().map(|out| out.1.into()).collect()))
119
+ }
120
+ }
data/ext/polars/src/conversion.rs CHANGED
@@ -1,11 +1,12 @@
1
- use magnus::{TryConvert, Value, QNIL};
1
+ use magnus::{class, RArray, Symbol, TryConvert, Value, QNIL};
2
+ use polars::chunked_array::object::PolarsObjectSafe;
2
3
  use polars::chunked_array::ops::{FillNullLimit, FillNullStrategy};
3
4
  use polars::datatypes::AnyValue;
4
5
  use polars::frame::DataFrame;
5
6
  use polars::prelude::*;
6
7
  use polars::series::ops::NullBehavior;
7
8
 
8
- use crate::{RbDataFrame, RbPolarsErr, RbResult, RbValueError};
9
+ use crate::{RbDataFrame, RbPolarsErr, RbResult, RbSeries, RbValueError};
9
10
 
10
11
  pub struct Wrap<T>(pub T);
11
12
 
@@ -15,14 +16,57 @@ impl<T> From<T> for Wrap<T> {
15
16
  }
16
17
  }
17
18
 
19
+ pub fn get_rbseq(obj: Value) -> RbResult<(RArray, usize)> {
20
+ let seq: RArray = obj.try_convert()?;
21
+ let len = seq.len();
22
+ Ok((seq, len))
23
+ }
24
+
18
25
  pub fn get_df(obj: Value) -> RbResult<DataFrame> {
19
26
  let rbdf = obj.funcall::<_, _, &RbDataFrame>("_df", ())?;
20
27
  Ok(rbdf.df.borrow().clone())
21
28
  }
22
29
 
23
- impl Into<Value> for Wrap<AnyValue<'_>> {
24
- fn into(self) -> Value {
25
- match self.0 {
30
+ pub fn get_series(obj: Value) -> RbResult<Series> {
31
+ let rbs = obj.funcall::<_, _, &RbSeries>("_s", ())?;
32
+ Ok(rbs.series.borrow().clone())
33
+ }
34
+
35
+ impl TryConvert for Wrap<Utf8Chunked> {
36
+ fn try_convert(obj: Value) -> RbResult<Self> {
37
+ let (seq, len) = get_rbseq(obj)?;
38
+ let mut builder = Utf8ChunkedBuilder::new("", len, len * 25);
39
+
40
+ for res in seq.each() {
41
+ let item = res?;
42
+ match item.try_convert::<String>() {
43
+ Ok(val) => builder.append_value(&val),
44
+ Err(_) => builder.append_null(),
45
+ }
46
+ }
47
+ Ok(Wrap(builder.finish()))
48
+ }
49
+ }
50
+
51
+ impl TryConvert for Wrap<NullValues> {
52
+ fn try_convert(ob: Value) -> RbResult<Self> {
53
+ if let Ok(s) = ob.try_convert::<String>() {
54
+ Ok(Wrap(NullValues::AllColumnsSingle(s)))
55
+ } else if let Ok(s) = ob.try_convert::<Vec<String>>() {
56
+ Ok(Wrap(NullValues::AllColumns(s)))
57
+ } else if let Ok(s) = ob.try_convert::<Vec<(String, String)>>() {
58
+ Ok(Wrap(NullValues::Named(s)))
59
+ } else {
60
+ Err(RbPolarsErr::other(
61
+ "could not extract value from null_values argument".into(),
62
+ ))
63
+ }
64
+ }
65
+ }
66
+
67
+ impl From<Wrap<AnyValue<'_>>> for Value {
68
+ fn from(w: Wrap<AnyValue<'_>>) -> Self {
69
+ match w.0 {
26
70
  AnyValue::UInt8(v) => Value::from(v),
27
71
  AnyValue::UInt16(v) => Value::from(v),
28
72
  AnyValue::UInt32(v) => Value::from(v),
@@ -36,11 +80,24 @@ impl Into<Value> for Wrap<AnyValue<'_>> {
36
80
  AnyValue::Null => *QNIL,
37
81
  AnyValue::Boolean(v) => Value::from(v),
38
82
  AnyValue::Utf8(v) => Value::from(v),
83
+ AnyValue::Date(v) => class::time()
84
+ .funcall::<_, _, Value>("at", (v * 86400,))
85
+ .unwrap()
86
+ .funcall::<_, _, Value>("utc", ())
87
+ .unwrap()
88
+ .funcall::<_, _, Value>("to_date", ())
89
+ .unwrap(),
39
90
  _ => todo!(),
40
91
  }
41
92
  }
42
93
  }
43
94
 
95
+ impl From<Wrap<DataType>> for Value {
96
+ fn from(w: Wrap<DataType>) -> Self {
97
+ Symbol::from(w.0.to_string()).into()
98
+ }
99
+ }
100
+
44
101
  impl TryConvert for Wrap<DataType> {
45
102
  fn try_convert(ob: Value) -> RbResult<Self> {
46
103
  let dtype = match ob.try_convert::<String>()?.as_str() {
@@ -53,10 +110,19 @@ impl TryConvert for Wrap<DataType> {
53
110
  "i32" => DataType::Int32,
54
111
  "i64" => DataType::Int64,
55
112
  "str" => DataType::Utf8,
113
+ "bin" => DataType::Binary,
56
114
  "bool" => DataType::Boolean,
115
+ "cat" => DataType::Categorical(None),
116
+ "date" => DataType::Date,
117
+ "datetime" => DataType::Datetime(TimeUnit::Microseconds, None),
57
118
  "f32" => DataType::Float32,
119
+ "time" => DataType::Time,
120
+ "dur" => DataType::Duration(TimeUnit::Microseconds),
58
121
  "f64" => DataType::Float64,
59
- "date" => DataType::Date,
122
+ // "obj" => DataType::Object(OBJECT_NAME),
123
+ "list" => DataType::List(Box::new(DataType::Boolean)),
124
+ "null" => DataType::Null,
125
+ "unk" => DataType::Unknown,
60
126
  _ => {
61
127
  return Err(RbValueError::new_err(format!(
62
128
  "{} is not a supported DataType.",
@@ -118,6 +184,39 @@ impl TryConvert for Wrap<ClosedWindow> {
118
184
  }
119
185
  }
120
186
 
187
+ impl TryConvert for Wrap<CsvEncoding> {
188
+ fn try_convert(ob: Value) -> RbResult<Self> {
189
+ let parsed = match ob.try_convert::<String>()?.as_str() {
190
+ "utf8" => CsvEncoding::Utf8,
191
+ "utf8-lossy" => CsvEncoding::LossyUtf8,
192
+ v => {
193
+ return Err(RbValueError::new_err(format!(
194
+ "encoding must be one of {{'utf8', 'utf8-lossy'}}, got {}",
195
+ v
196
+ )))
197
+ }
198
+ };
199
+ Ok(Wrap(parsed))
200
+ }
201
+ }
202
+
203
+ impl TryConvert for Wrap<Option<IpcCompression>> {
204
+ fn try_convert(ob: Value) -> RbResult<Self> {
205
+ let parsed = match ob.try_convert::<String>()?.as_str() {
206
+ "uncompressed" => None,
207
+ "lz4" => Some(IpcCompression::LZ4),
208
+ "zstd" => Some(IpcCompression::ZSTD),
209
+ v => {
210
+ return Err(RbValueError::new_err(format!(
211
+ "compression must be one of {{'uncompressed', 'lz4', 'zstd'}}, got {}",
212
+ v
213
+ )))
214
+ }
215
+ };
216
+ Ok(Wrap(parsed))
217
+ }
218
+ }
219
+
121
220
  impl TryConvert for Wrap<JoinType> {
122
221
  fn try_convert(ob: Value) -> RbResult<Self> {
123
222
  let parsed = match ob.try_convert::<String>()?.as_str() {
@@ -171,6 +270,24 @@ impl TryConvert for Wrap<NullStrategy> {
171
270
  }
172
271
  }
173
272
 
273
+ impl TryConvert for Wrap<ParallelStrategy> {
274
+ fn try_convert(ob: Value) -> RbResult<Self> {
275
+ let parsed = match ob.try_convert::<String>()?.as_str() {
276
+ "auto" => ParallelStrategy::Auto,
277
+ "columns" => ParallelStrategy::Columns,
278
+ "row_groups" => ParallelStrategy::RowGroups,
279
+ "none" => ParallelStrategy::None,
280
+ v => {
281
+ return Err(RbValueError::new_err(format!(
282
+ "parallel must be one of {{'auto', 'columns', 'row_groups', 'none'}}, got {}",
283
+ v
284
+ )))
285
+ }
286
+ };
287
+ Ok(Wrap(parsed))
288
+ }
289
+ }
290
+
174
291
  impl TryConvert for Wrap<QuantileInterpolOptions> {
175
292
  fn try_convert(ob: Value) -> RbResult<Self> {
176
293
  let parsed = match ob.try_convert::<String>()?.as_str() {
@@ -307,3 +424,19 @@ pub fn parse_parquet_compression(
307
424
  };
308
425
  Ok(parsed)
309
426
  }
427
+
428
+ pub struct ObjectValue {
429
+ pub inner: Value,
430
+ }
431
+
432
+ impl From<&dyn PolarsObjectSafe> for &ObjectValue {
433
+ fn from(val: &dyn PolarsObjectSafe) -> Self {
434
+ unsafe { &*(val as *const dyn PolarsObjectSafe as *const ObjectValue) }
435
+ }
436
+ }
437
+
438
+ impl ObjectValue {
439
+ pub fn to_object(&self) -> Value {
440
+ self.inner
441
+ }
442
+ }