polars-df 0.1.1 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +3 -0
- data/CHANGELOG.md +8 -0
- data/Cargo.lock +2 -1
- data/README.md +1 -1
- data/ext/polars/Cargo.toml +7 -1
- data/ext/polars/src/batched_csv.rs +120 -0
- data/ext/polars/src/conversion.rs +139 -6
- data/ext/polars/src/dataframe.rs +360 -15
- data/ext/polars/src/error.rs +9 -0
- data/ext/polars/src/file.rs +8 -7
- data/ext/polars/src/lazy/apply.rs +7 -0
- data/ext/polars/src/lazy/dataframe.rs +135 -3
- data/ext/polars/src/lazy/dsl.rs +97 -2
- data/ext/polars/src/lazy/meta.rs +1 -1
- data/ext/polars/src/lazy/mod.rs +1 -0
- data/ext/polars/src/lib.rs +227 -12
- data/ext/polars/src/series.rs +190 -38
- data/ext/polars/src/set.rs +91 -0
- data/ext/polars/src/utils.rs +19 -0
- data/lib/polars/batched_csv_reader.rb +96 -0
- data/lib/polars/cat_expr.rb +39 -0
- data/lib/polars/data_frame.rb +2813 -100
- data/lib/polars/date_time_expr.rb +1282 -7
- data/lib/polars/exceptions.rb +20 -0
- data/lib/polars/expr.rb +631 -11
- data/lib/polars/expr_dispatch.rb +14 -0
- data/lib/polars/functions.rb +219 -0
- data/lib/polars/group_by.rb +517 -0
- data/lib/polars/io.rb +763 -4
- data/lib/polars/lazy_frame.rb +1415 -67
- data/lib/polars/lazy_functions.rb +430 -9
- data/lib/polars/lazy_group_by.rb +79 -0
- data/lib/polars/list_expr.rb +5 -0
- data/lib/polars/meta_expr.rb +21 -0
- data/lib/polars/series.rb +2244 -192
- data/lib/polars/slice.rb +104 -0
- data/lib/polars/string_expr.rb +663 -2
- data/lib/polars/struct_expr.rb +73 -0
- data/lib/polars/utils.rb +76 -3
- data/lib/polars/version.rb +2 -1
- data/lib/polars/when.rb +1 -0
- data/lib/polars/when_then.rb +1 -0
- data/lib/polars.rb +8 -2
- metadata +12 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9e6fb732e5dafe2fde285322554bd9159483cbbdf17d6e2bba9cba9a83563b47
|
4
|
+
data.tar.gz: 1b4249d0c0100f136973c601b8404cb6d92abc632d5ed0476bd93bc5360a11dc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d9414d6f60c489e2b3b72885288822083ba8c04bac4053f4e34c1d53ee805d164f17fe4b8b3a8f4ff562550bcc657f374bea6e250b52985367f601ea50e3037f
|
7
|
+
data.tar.gz: 9e3a7cfe105f03ec20e9c26aa38c1475074ccc1ea057a170a97b7068b41943d561d50af49bb1d1f74b7705809dc1375900f542ab93683ba627dea080274f6d91
|
data/.yardopts
ADDED
data/CHANGELOG.md
CHANGED
data/Cargo.lock
CHANGED
@@ -1160,7 +1160,7 @@ dependencies = [
|
|
1160
1160
|
|
1161
1161
|
[[package]]
|
1162
1162
|
name = "polars"
|
1163
|
-
version = "0.1.1"
|
1163
|
+
version = "0.1.3"
|
1164
1164
|
dependencies = [
|
1165
1165
|
"magnus",
|
1166
1166
|
"polars 0.25.1",
|
@@ -1217,6 +1217,7 @@ dependencies = [
|
|
1217
1217
|
"rayon",
|
1218
1218
|
"regex",
|
1219
1219
|
"serde",
|
1220
|
+
"serde_json",
|
1220
1221
|
"smartstring",
|
1221
1222
|
"thiserror",
|
1222
1223
|
]
|
data/README.md
CHANGED
@@ -27,7 +27,7 @@ Polars.read_csv("iris.csv")
|
|
27
27
|
.collect
|
28
28
|
```
|
29
29
|
|
30
|
-
You can follow [Polars tutorials](https://pola-rs.github.io/polars-book/user-guide/introduction.html) and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems.
|
30
|
+
You can follow [Polars tutorials](https://pola-rs.github.io/polars-book/user-guide/introduction.html) and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems. Some methods are missing at the moment.
|
31
31
|
|
32
32
|
## Examples
|
33
33
|
|
data/ext/polars/Cargo.toml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
[package]
|
2
2
|
name = "polars"
|
3
|
-
version = "0.1.1"
|
3
|
+
version = "0.1.3"
|
4
4
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
5
5
|
edition = "2021"
|
6
6
|
publish = false
|
@@ -17,10 +17,12 @@ version = "0.25.1"
|
|
17
17
|
features = [
|
18
18
|
"abs",
|
19
19
|
"arange",
|
20
|
+
"arg_where",
|
20
21
|
"concat_str",
|
21
22
|
"csv-file",
|
22
23
|
"cum_agg",
|
23
24
|
"cumulative_eval",
|
25
|
+
"dataframe_arithmetic",
|
24
26
|
"date_offset",
|
25
27
|
"diagonal_concat",
|
26
28
|
"diff",
|
@@ -38,16 +40,20 @@ features = [
|
|
38
40
|
"lazy",
|
39
41
|
"lazy_regex",
|
40
42
|
"list_eval",
|
43
|
+
"list_to_struct",
|
41
44
|
"log",
|
42
45
|
"meta",
|
43
46
|
"mode",
|
44
47
|
"moment",
|
48
|
+
"object",
|
45
49
|
"parquet",
|
46
50
|
"partition_by",
|
47
51
|
"pct_change",
|
48
52
|
"product",
|
53
|
+
"propagate_nans",
|
49
54
|
"random",
|
50
55
|
"rank",
|
56
|
+
"reinterpret",
|
51
57
|
"repeat_by",
|
52
58
|
"rolling_window",
|
53
59
|
"round_series",
|
@@ -0,0 +1,120 @@
|
|
1
|
+
use magnus::Value;
|
2
|
+
use polars::io::mmap::MmapBytesReader;
|
3
|
+
use polars::io::RowCount;
|
4
|
+
use polars::prelude::read_impl::OwnedBatchedCsvReader;
|
5
|
+
use polars::prelude::*;
|
6
|
+
use std::cell::RefCell;
|
7
|
+
use std::path::PathBuf;
|
8
|
+
|
9
|
+
use crate::conversion::*;
|
10
|
+
use crate::{RbDataFrame, RbPolarsErr, RbResult};
|
11
|
+
|
12
|
+
#[magnus::wrap(class = "Polars::RbBatchedCsv")]
|
13
|
+
pub struct RbBatchedCsv {
|
14
|
+
pub reader: RefCell<OwnedBatchedCsvReader>,
|
15
|
+
}
|
16
|
+
|
17
|
+
impl RbBatchedCsv {
|
18
|
+
pub fn new(arguments: &[Value]) -> RbResult<Self> {
|
19
|
+
// start arguments
|
20
|
+
// this pattern is needed for more than 16
|
21
|
+
let infer_schema_length: Option<usize> = arguments[0].try_convert()?;
|
22
|
+
let chunk_size: usize = arguments[1].try_convert()?;
|
23
|
+
let has_header: bool = arguments[2].try_convert()?;
|
24
|
+
let ignore_errors: bool = arguments[3].try_convert()?;
|
25
|
+
let n_rows: Option<usize> = arguments[4].try_convert()?;
|
26
|
+
let skip_rows: usize = arguments[5].try_convert()?;
|
27
|
+
let projection: Option<Vec<usize>> = arguments[6].try_convert()?;
|
28
|
+
let sep: String = arguments[7].try_convert()?;
|
29
|
+
let rechunk: bool = arguments[8].try_convert()?;
|
30
|
+
let columns: Option<Vec<String>> = arguments[9].try_convert()?;
|
31
|
+
let encoding: Wrap<CsvEncoding> = arguments[10].try_convert()?;
|
32
|
+
let n_threads: Option<usize> = arguments[11].try_convert()?;
|
33
|
+
let path: PathBuf = arguments[12].try_convert()?;
|
34
|
+
let overwrite_dtype: Option<Vec<(String, Wrap<DataType>)>> = arguments[13].try_convert()?;
|
35
|
+
// TODO fix
|
36
|
+
let overwrite_dtype_slice: Option<Vec<Wrap<DataType>>> = None; // arguments[14].try_convert()?;
|
37
|
+
let low_memory: bool = arguments[15].try_convert()?;
|
38
|
+
let comment_char: Option<String> = arguments[16].try_convert()?;
|
39
|
+
let quote_char: Option<String> = arguments[17].try_convert()?;
|
40
|
+
let null_values: Option<Wrap<NullValues>> = arguments[18].try_convert()?;
|
41
|
+
let parse_dates: bool = arguments[19].try_convert()?;
|
42
|
+
let skip_rows_after_header: usize = arguments[20].try_convert()?;
|
43
|
+
let row_count: Option<(String, IdxSize)> = arguments[21].try_convert()?;
|
44
|
+
let sample_size: usize = arguments[22].try_convert()?;
|
45
|
+
let eol_char: String = arguments[23].try_convert()?;
|
46
|
+
// end arguments
|
47
|
+
|
48
|
+
let null_values = null_values.map(|w| w.0);
|
49
|
+
let comment_char = comment_char.map(|s| s.as_bytes()[0]);
|
50
|
+
let eol_char = eol_char.as_bytes()[0];
|
51
|
+
|
52
|
+
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
53
|
+
|
54
|
+
let quote_char = if let Some(s) = quote_char {
|
55
|
+
if s.is_empty() {
|
56
|
+
None
|
57
|
+
} else {
|
58
|
+
Some(s.as_bytes()[0])
|
59
|
+
}
|
60
|
+
} else {
|
61
|
+
None
|
62
|
+
};
|
63
|
+
|
64
|
+
let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
|
65
|
+
let fields = overwrite_dtype.iter().map(|(name, dtype)| {
|
66
|
+
let dtype = dtype.0.clone();
|
67
|
+
Field::new(name, dtype)
|
68
|
+
});
|
69
|
+
Schema::from(fields)
|
70
|
+
});
|
71
|
+
|
72
|
+
let overwrite_dtype_slice = overwrite_dtype_slice.map(|overwrite_dtype| {
|
73
|
+
overwrite_dtype
|
74
|
+
.iter()
|
75
|
+
.map(|dt| dt.0.clone())
|
76
|
+
.collect::<Vec<_>>()
|
77
|
+
});
|
78
|
+
|
79
|
+
let file = std::fs::File::open(path).map_err(RbPolarsErr::io)?;
|
80
|
+
let reader = Box::new(file) as Box<dyn MmapBytesReader>;
|
81
|
+
let reader = CsvReader::new(reader)
|
82
|
+
.infer_schema(infer_schema_length)
|
83
|
+
.has_header(has_header)
|
84
|
+
.with_n_rows(n_rows)
|
85
|
+
.with_delimiter(sep.as_bytes()[0])
|
86
|
+
.with_skip_rows(skip_rows)
|
87
|
+
.with_ignore_parser_errors(ignore_errors)
|
88
|
+
.with_projection(projection)
|
89
|
+
.with_rechunk(rechunk)
|
90
|
+
.with_chunk_size(chunk_size)
|
91
|
+
.with_encoding(encoding.0)
|
92
|
+
.with_columns(columns)
|
93
|
+
.with_n_threads(n_threads)
|
94
|
+
.with_dtypes_slice(overwrite_dtype_slice.as_deref())
|
95
|
+
.low_memory(low_memory)
|
96
|
+
.with_comment_char(comment_char)
|
97
|
+
.with_null_values(null_values)
|
98
|
+
.with_parse_dates(parse_dates)
|
99
|
+
.with_quote_char(quote_char)
|
100
|
+
.with_end_of_line_char(eol_char)
|
101
|
+
.with_skip_rows_after_header(skip_rows_after_header)
|
102
|
+
.with_row_count(row_count)
|
103
|
+
.sample_size(sample_size)
|
104
|
+
.batched(overwrite_dtype.map(Arc::new))
|
105
|
+
.map_err(RbPolarsErr::from)?;
|
106
|
+
|
107
|
+
Ok(RbBatchedCsv {
|
108
|
+
reader: RefCell::new(reader),
|
109
|
+
})
|
110
|
+
}
|
111
|
+
|
112
|
+
pub fn next_batches(&self, n: usize) -> RbResult<Option<Vec<RbDataFrame>>> {
|
113
|
+
let batches = self
|
114
|
+
.reader
|
115
|
+
.borrow_mut()
|
116
|
+
.next_batches(n)
|
117
|
+
.map_err(RbPolarsErr::from)?;
|
118
|
+
Ok(batches.map(|batches| batches.into_iter().map(|out| out.1.into()).collect()))
|
119
|
+
}
|
120
|
+
}
|
@@ -1,11 +1,12 @@
|
|
1
|
-
use magnus::{TryConvert, Value, QNIL};
|
1
|
+
use magnus::{class, RArray, Symbol, TryConvert, Value, QNIL};
|
2
|
+
use polars::chunked_array::object::PolarsObjectSafe;
|
2
3
|
use polars::chunked_array::ops::{FillNullLimit, FillNullStrategy};
|
3
4
|
use polars::datatypes::AnyValue;
|
4
5
|
use polars::frame::DataFrame;
|
5
6
|
use polars::prelude::*;
|
6
7
|
use polars::series::ops::NullBehavior;
|
7
8
|
|
8
|
-
use crate::{RbDataFrame, RbPolarsErr, RbResult, RbValueError};
|
9
|
+
use crate::{RbDataFrame, RbPolarsErr, RbResult, RbSeries, RbValueError};
|
9
10
|
|
10
11
|
pub struct Wrap<T>(pub T);
|
11
12
|
|
@@ -15,14 +16,57 @@ impl<T> From<T> for Wrap<T> {
|
|
15
16
|
}
|
16
17
|
}
|
17
18
|
|
19
|
+
/// Converts `obj` into a Ruby array and returns it together with its length.
///
/// Fails with the conversion error when `obj` is not convertible to `RArray`.
pub fn get_rbseq(obj: Value) -> RbResult<(RArray, usize)> {
    let array = obj.try_convert::<RArray>()?;
    let count = array.len();
    Ok((array, count))
}
|
24
|
+
|
18
25
|
pub fn get_df(obj: Value) -> RbResult<DataFrame> {
|
19
26
|
let rbdf = obj.funcall::<_, _, &RbDataFrame>("_df", ())?;
|
20
27
|
Ok(rbdf.df.borrow().clone())
|
21
28
|
}
|
22
29
|
|
23
|
-
|
24
|
-
|
25
|
-
|
30
|
+
pub fn get_series(obj: Value) -> RbResult<Series> {
|
31
|
+
let rbs = obj.funcall::<_, _, &RbSeries>("_s", ())?;
|
32
|
+
Ok(rbs.series.borrow().clone())
|
33
|
+
}
|
34
|
+
|
35
|
+
impl TryConvert for Wrap<Utf8Chunked> {
|
36
|
+
fn try_convert(obj: Value) -> RbResult<Self> {
|
37
|
+
let (seq, len) = get_rbseq(obj)?;
|
38
|
+
let mut builder = Utf8ChunkedBuilder::new("", len, len * 25);
|
39
|
+
|
40
|
+
for res in seq.each() {
|
41
|
+
let item = res?;
|
42
|
+
match item.try_convert::<String>() {
|
43
|
+
Ok(val) => builder.append_value(&val),
|
44
|
+
Err(_) => builder.append_null(),
|
45
|
+
}
|
46
|
+
}
|
47
|
+
Ok(Wrap(builder.finish()))
|
48
|
+
}
|
49
|
+
}
|
50
|
+
|
51
|
+
impl TryConvert for Wrap<NullValues> {
|
52
|
+
fn try_convert(ob: Value) -> RbResult<Self> {
|
53
|
+
if let Ok(s) = ob.try_convert::<String>() {
|
54
|
+
Ok(Wrap(NullValues::AllColumnsSingle(s)))
|
55
|
+
} else if let Ok(s) = ob.try_convert::<Vec<String>>() {
|
56
|
+
Ok(Wrap(NullValues::AllColumns(s)))
|
57
|
+
} else if let Ok(s) = ob.try_convert::<Vec<(String, String)>>() {
|
58
|
+
Ok(Wrap(NullValues::Named(s)))
|
59
|
+
} else {
|
60
|
+
Err(RbPolarsErr::other(
|
61
|
+
"could not extract value from null_values argument".into(),
|
62
|
+
))
|
63
|
+
}
|
64
|
+
}
|
65
|
+
}
|
66
|
+
|
67
|
+
impl From<Wrap<AnyValue<'_>>> for Value {
|
68
|
+
fn from(w: Wrap<AnyValue<'_>>) -> Self {
|
69
|
+
match w.0 {
|
26
70
|
AnyValue::UInt8(v) => Value::from(v),
|
27
71
|
AnyValue::UInt16(v) => Value::from(v),
|
28
72
|
AnyValue::UInt32(v) => Value::from(v),
|
@@ -36,11 +80,24 @@ impl Into<Value> for Wrap<AnyValue<'_>> {
|
|
36
80
|
AnyValue::Null => *QNIL,
|
37
81
|
AnyValue::Boolean(v) => Value::from(v),
|
38
82
|
AnyValue::Utf8(v) => Value::from(v),
|
83
|
+
AnyValue::Date(v) => class::time()
|
84
|
+
.funcall::<_, _, Value>("at", (v * 86400,))
|
85
|
+
.unwrap()
|
86
|
+
.funcall::<_, _, Value>("utc", ())
|
87
|
+
.unwrap()
|
88
|
+
.funcall::<_, _, Value>("to_date", ())
|
89
|
+
.unwrap(),
|
39
90
|
_ => todo!(),
|
40
91
|
}
|
41
92
|
}
|
42
93
|
}
|
43
94
|
|
95
|
+
impl From<Wrap<DataType>> for Value {
|
96
|
+
fn from(w: Wrap<DataType>) -> Self {
|
97
|
+
Symbol::from(w.0.to_string()).into()
|
98
|
+
}
|
99
|
+
}
|
100
|
+
|
44
101
|
impl TryConvert for Wrap<DataType> {
|
45
102
|
fn try_convert(ob: Value) -> RbResult<Self> {
|
46
103
|
let dtype = match ob.try_convert::<String>()?.as_str() {
|
@@ -53,10 +110,19 @@ impl TryConvert for Wrap<DataType> {
|
|
53
110
|
"i32" => DataType::Int32,
|
54
111
|
"i64" => DataType::Int64,
|
55
112
|
"str" => DataType::Utf8,
|
113
|
+
"bin" => DataType::Binary,
|
56
114
|
"bool" => DataType::Boolean,
|
115
|
+
"cat" => DataType::Categorical(None),
|
116
|
+
"date" => DataType::Date,
|
117
|
+
"datetime" => DataType::Datetime(TimeUnit::Microseconds, None),
|
57
118
|
"f32" => DataType::Float32,
|
119
|
+
"time" => DataType::Time,
|
120
|
+
"dur" => DataType::Duration(TimeUnit::Microseconds),
|
58
121
|
"f64" => DataType::Float64,
|
59
|
-
"
|
122
|
+
// "obj" => DataType::Object(OBJECT_NAME),
|
123
|
+
"list" => DataType::List(Box::new(DataType::Boolean)),
|
124
|
+
"null" => DataType::Null,
|
125
|
+
"unk" => DataType::Unknown,
|
60
126
|
_ => {
|
61
127
|
return Err(RbValueError::new_err(format!(
|
62
128
|
"{} is not a supported DataType.",
|
@@ -118,6 +184,39 @@ impl TryConvert for Wrap<ClosedWindow> {
|
|
118
184
|
}
|
119
185
|
}
|
120
186
|
|
187
|
+
impl TryConvert for Wrap<CsvEncoding> {
|
188
|
+
fn try_convert(ob: Value) -> RbResult<Self> {
|
189
|
+
let parsed = match ob.try_convert::<String>()?.as_str() {
|
190
|
+
"utf8" => CsvEncoding::Utf8,
|
191
|
+
"utf8-lossy" => CsvEncoding::LossyUtf8,
|
192
|
+
v => {
|
193
|
+
return Err(RbValueError::new_err(format!(
|
194
|
+
"encoding must be one of {{'utf8', 'utf8-lossy'}}, got {}",
|
195
|
+
v
|
196
|
+
)))
|
197
|
+
}
|
198
|
+
};
|
199
|
+
Ok(Wrap(parsed))
|
200
|
+
}
|
201
|
+
}
|
202
|
+
|
203
|
+
impl TryConvert for Wrap<Option<IpcCompression>> {
|
204
|
+
fn try_convert(ob: Value) -> RbResult<Self> {
|
205
|
+
let parsed = match ob.try_convert::<String>()?.as_str() {
|
206
|
+
"uncompressed" => None,
|
207
|
+
"lz4" => Some(IpcCompression::LZ4),
|
208
|
+
"zstd" => Some(IpcCompression::ZSTD),
|
209
|
+
v => {
|
210
|
+
return Err(RbValueError::new_err(format!(
|
211
|
+
"compression must be one of {{'uncompressed', 'lz4', 'zstd'}}, got {}",
|
212
|
+
v
|
213
|
+
)))
|
214
|
+
}
|
215
|
+
};
|
216
|
+
Ok(Wrap(parsed))
|
217
|
+
}
|
218
|
+
}
|
219
|
+
|
121
220
|
impl TryConvert for Wrap<JoinType> {
|
122
221
|
fn try_convert(ob: Value) -> RbResult<Self> {
|
123
222
|
let parsed = match ob.try_convert::<String>()?.as_str() {
|
@@ -171,6 +270,24 @@ impl TryConvert for Wrap<NullStrategy> {
|
|
171
270
|
}
|
172
271
|
}
|
173
272
|
|
273
|
+
impl TryConvert for Wrap<ParallelStrategy> {
|
274
|
+
fn try_convert(ob: Value) -> RbResult<Self> {
|
275
|
+
let parsed = match ob.try_convert::<String>()?.as_str() {
|
276
|
+
"auto" => ParallelStrategy::Auto,
|
277
|
+
"columns" => ParallelStrategy::Columns,
|
278
|
+
"row_groups" => ParallelStrategy::RowGroups,
|
279
|
+
"none" => ParallelStrategy::None,
|
280
|
+
v => {
|
281
|
+
return Err(RbValueError::new_err(format!(
|
282
|
+
"parallel must be one of {{'auto', 'columns', 'row_groups', 'none'}}, got {}",
|
283
|
+
v
|
284
|
+
)))
|
285
|
+
}
|
286
|
+
};
|
287
|
+
Ok(Wrap(parsed))
|
288
|
+
}
|
289
|
+
}
|
290
|
+
|
174
291
|
impl TryConvert for Wrap<QuantileInterpolOptions> {
|
175
292
|
fn try_convert(ob: Value) -> RbResult<Self> {
|
176
293
|
let parsed = match ob.try_convert::<String>()?.as_str() {
|
@@ -307,3 +424,19 @@ pub fn parse_parquet_compression(
|
|
307
424
|
};
|
308
425
|
Ok(parsed)
|
309
426
|
}
|
427
|
+
|
428
|
+
pub struct ObjectValue {
|
429
|
+
pub inner: Value,
|
430
|
+
}
|
431
|
+
|
432
|
+
impl From<&dyn PolarsObjectSafe> for &ObjectValue {
|
433
|
+
fn from(val: &dyn PolarsObjectSafe) -> Self {
|
434
|
+
unsafe { &*(val as *const dyn PolarsObjectSafe as *const ObjectValue) }
|
435
|
+
}
|
436
|
+
}
|
437
|
+
|
438
|
+
impl ObjectValue {
|
439
|
+
pub fn to_object(&self) -> Value {
|
440
|
+
self.inner
|
441
|
+
}
|
442
|
+
}
|