polars-df 0.1.0
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +20 -0
- data/README.md +93 -0
- data/ext/polars/Cargo.toml +35 -0
- data/ext/polars/extconf.rb +4 -0
- data/ext/polars/src/conversion.rs +115 -0
- data/ext/polars/src/dataframe.rs +304 -0
- data/ext/polars/src/error.rs +24 -0
- data/ext/polars/src/file.rs +28 -0
- data/ext/polars/src/lazy/dataframe.rs +123 -0
- data/ext/polars/src/lazy/dsl.rs +298 -0
- data/ext/polars/src/lazy/mod.rs +3 -0
- data/ext/polars/src/lazy/utils.rs +13 -0
- data/ext/polars/src/lib.rs +256 -0
- data/ext/polars/src/series.rs +475 -0
- data/lib/polars/data_frame.rb +315 -0
- data/lib/polars/expr.rb +233 -0
- data/lib/polars/functions.rb +45 -0
- data/lib/polars/io.rb +39 -0
- data/lib/polars/lazy_frame.rb +139 -0
- data/lib/polars/lazy_functions.rb +121 -0
- data/lib/polars/lazy_group_by.rb +13 -0
- data/lib/polars/series.rb +261 -0
- data/lib/polars/string_expr.rb +17 -0
- data/lib/polars/utils.rb +47 -0
- data/lib/polars/version.rb +3 -0
- data/lib/polars/when.rb +15 -0
- data/lib/polars/when_then.rb +18 -0
- data/lib/polars-df.rb +1 -0
- data/lib/polars.rb +25 -0
- metadata +87 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: a2b7504ba8b5c20977f2699f849c2f8a8c28f7912d695f17f35eb8b492699fe6
+  data.tar.gz: d38b420fa7e192e38c91b7f7b621117fbdbb981a779e2f54ae7a9d9066e355ba
+SHA512:
+  metadata.gz: 7ac246cbb2c93433cd17078e534a372ed0e2d00c1cdd27bcf454cb4685b8bd48fd6d66f5060a9132a2fece3f877503f67e3b5f8b65da467942174a753c2d889b
+  data.tar.gz: 5e3f62085f6b02708534ed3ee3e4bdfe0e608c8c6fdd688e0e534858e53b63ec0d57a255d98bfc3385e609053385e354536efcf5e5469a0e17c9aecbaa9ea879
data/CHANGELOG.md
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
+Copyright (c) 2020 Ritchie Vink
+Copyright (c) 2022 Andrew Kane
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,93 @@
+# Polars Ruby
+
+:fire: Blazingly fast DataFrames for Ruby, powered by [Polars](https://github.com/pola-rs/polars)
+
+[![Build Status](https://github.com/ankane/polars-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/polars-ruby/actions)
+
+## Installation
+
+Add this line to your application’s Gemfile:
+
+```ruby
+gem "polars-df"
+```
+
+Note: Rust is currently required for installation, and it can take 15-20 minutes to compile the extension.
+
+## Getting Started
+
+This library follows the [Polars Python API](https://pola-rs.github.io/polars/py-polars/html/reference/index.html).
+
+```ruby
+Polars.read_csv("iris.csv")
+  .lazy
+  .filter(Polars.col("sepal_length") > 5)
+  .groupby("species")
+  .agg(Polars.all.sum)
+  .collect
+```
+
+You can follow [Polars tutorials](https://pola-rs.github.io/polars-book/user-guide/introduction.html) and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems. Note that many methods and options are missing at the moment.
+
+## Examples
+
+### Creating DataFrames
+
+From a CSV
+
+```ruby
+Polars.read_csv("file.csv")
+```
+
+From Parquet
+
+```ruby
+Polars.read_parquet("file.parquet")
+```
+
+From Active Record
+
+```ruby
+Polars::DataFrame.new(User.all)
+```
+
+From a hash
+
+```ruby
+Polars::DataFrame.new({
+  a: [1, 2, 3],
+  b: ["one", "two", "three"]
+})
+```
+
+From an array of series
+
+```ruby
+Polars::DataFrame.new([
+  Polars::Series.new("a", [1, 2, 3]),
+  Polars::Series.new("b", ["one", "two", "three"])
+])
+```
+
+## History
+
+View the [changelog](CHANGELOG.md)
+
+## Contributing
+
+Everyone is encouraged to help improve this project. Here are a few ways you can help:
+
+- [Report bugs](https://github.com/ankane/polars-ruby/issues)
+- Fix bugs and [submit pull requests](https://github.com/ankane/polars-ruby/pulls)
+- Write, clarify, or fix documentation
+- Suggest or add new features
+
+To get started with development:
+
+```sh
+git clone https://github.com/ankane/polars-ruby.git
+cd polars-ruby
+bundle install
+bundle exec rake compile
+bundle exec rake test
+```
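The README's claim that Python Polars code converts to Ruby in many cases is easiest to see side by side. The hedged sketch below reuses only calls that already appear in the README above (`read_csv`, `lazy`, `filter`, `Polars.col`, `groupby`, `agg`, `collect`); the py-polars lines in the comments are shown purely for comparison.

```ruby
# py-polars:
#   pl.read_csv("iris.csv").lazy() \
#     .filter(pl.col("sepal_length") > 5) \
#     .groupby("species").agg(pl.all().sum()).collect()
#
# polars-df: the same chain, with pl replaced by Polars and Python's
# trailing () dropped where Ruby allows it
Polars.read_csv("iris.csv")
  .lazy
  .filter(Polars.col("sepal_length") > 5)
  .groupby("species")
  .agg(Polars.all.sum)
  .collect
```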
data/ext/polars/Cargo.toml
ADDED
@@ -0,0 +1,35 @@
+[package]
+name = "polars"
+version = "0.1.0"
+authors = ["Andrew Kane <andrew@ankane.org>"]
+edition = "2021"
+publish = false
+
+[lib]
+crate-type = ["cdylib"]
+
+[dependencies]
+magnus = "0.4"
+serde_json = "1"
+
+[dependencies.polars]
+version = "0.25.1"
+features = [
+  "arange",
+  "csv-file",
+  "cum_agg",
+  "diagonal_concat",
+  "dtype-full",
+  "fmt",
+  "horizontal_concat",
+  "interpolate",
+  "ipc",
+  "json",
+  "lazy",
+  "lazy_regex",
+  "parquet",
+  "semi_anti_join",
+  "serde-lazy",
+  "strings",
+  "trigonometry",
+]
data/ext/polars/src/conversion.rs
ADDED
@@ -0,0 +1,115 @@
+use magnus::{Value, QNIL};
+use polars::chunked_array::ops::{FillNullLimit, FillNullStrategy};
+use polars::datatypes::AnyValue;
+use polars::frame::DataFrame;
+use polars::prelude::*;
+
+use crate::{RbDataFrame, RbResult, RbValueError};
+
+pub fn wrap(val: AnyValue) -> Value {
+    match val {
+        AnyValue::UInt8(v) => Value::from(v),
+        AnyValue::UInt16(v) => Value::from(v),
+        AnyValue::UInt32(v) => Value::from(v),
+        AnyValue::UInt64(v) => Value::from(v),
+        AnyValue::Int8(v) => Value::from(v),
+        AnyValue::Int16(v) => Value::from(v),
+        AnyValue::Int32(v) => Value::from(v),
+        AnyValue::Int64(v) => Value::from(v),
+        AnyValue::Float32(v) => Value::from(v),
+        AnyValue::Float64(v) => Value::from(v),
+        AnyValue::Null => *QNIL,
+        AnyValue::Boolean(v) => Value::from(v),
+        AnyValue::Utf8(v) => Value::from(v),
+        _ => todo!(),
+    }
+}
+
+pub fn parse_fill_null_strategy(
+    strategy: &str,
+    limit: FillNullLimit,
+) -> RbResult<FillNullStrategy> {
+    let parsed = match strategy {
+        "forward" => FillNullStrategy::Forward(limit),
+        "backward" => FillNullStrategy::Backward(limit),
+        "min" => FillNullStrategy::Min,
+        "max" => FillNullStrategy::Max,
+        "mean" => FillNullStrategy::Mean,
+        "zero" => FillNullStrategy::Zero,
+        "one" => FillNullStrategy::One,
+        e => {
+            return Err(magnus::Error::runtime_error(format!(
+                "strategy must be one of {{'forward', 'backward', 'min', 'max', 'mean', 'zero', 'one'}}, got {}",
+                e,
+            )))
+        }
+    };
+    Ok(parsed)
+}
+
+pub fn wrap_join_type(ob: &str) -> RbResult<JoinType> {
+    let parsed = match ob {
+        "inner" => JoinType::Inner,
+        "left" => JoinType::Left,
+        "outer" => JoinType::Outer,
+        "semi" => JoinType::Semi,
+        "anti" => JoinType::Anti,
+        // #[cfg(feature = "cross_join")]
+        // "cross" => JoinType::Cross,
+        v => {
+            return Err(RbValueError::new_err(format!(
+                "how must be one of {{'inner', 'left', 'outer', 'semi', 'anti', 'cross'}}, got {}",
+                v
+            )))
+        }
+    };
+    Ok(parsed)
+}
+
+pub fn get_df(obj: Value) -> RbResult<DataFrame> {
+    let rbdf = obj.funcall::<_, _, &RbDataFrame>("_df", ())?;
+    Ok(rbdf.df.borrow().clone())
+}
+
+pub fn parse_parquet_compression(
+    compression: &str,
+    compression_level: Option<i32>,
+) -> RbResult<ParquetCompression> {
+    let parsed = match compression {
+        "uncompressed" => ParquetCompression::Uncompressed,
+        "snappy" => ParquetCompression::Snappy,
+        "gzip" => ParquetCompression::Gzip(
+            compression_level
+                .map(|lvl| {
+                    GzipLevel::try_new(lvl as u8)
+                        .map_err(|e| RbValueError::new_err(format!("{:?}", e)))
+                })
+                .transpose()?,
+        ),
+        "lzo" => ParquetCompression::Lzo,
+        "brotli" => ParquetCompression::Brotli(
+            compression_level
+                .map(|lvl| {
+                    BrotliLevel::try_new(lvl as u32)
+                        .map_err(|e| RbValueError::new_err(format!("{:?}", e)))
+                })
+                .transpose()?,
+        ),
+        "lz4" => ParquetCompression::Lz4Raw,
+        "zstd" => ParquetCompression::Zstd(
+            compression_level
+                .map(|lvl| {
+                    ZstdLevel::try_new(lvl)
+                        .map_err(|e| RbValueError::new_err(format!("{:?}", e)))
+                })
+                .transpose()?,
+        ),
+        e => {
+            return Err(RbValueError::new_err(format!(
+                "compression must be one of {{'uncompressed', 'snappy', 'gzip', 'lzo', 'brotli', 'lz4', 'zstd'}}, got {}",
+                e
+            )))
+        }
+    };
+    Ok(parsed)
+}
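conversion.rs is where option strings coming from Ruby are mapped onto Polars enums: join types for `how`, fill-null strategies, and Parquet compression codecs (with optional levels for gzip, brotli, and zstd). A hedged sketch of how the compression strings would reach `write_parquet` in dataframe.rs below — only the accepted string values come from this diff; the keyword argument names are assumed to mirror py-polars and are not confirmed here.

```ruby
df = Polars::DataFrame.new({a: [1, 2, 3]})

# parse_parquet_compression accepts "uncompressed", "snappy", "gzip",
# "lzo", "brotli", "lz4", and "zstd"; gzip/brotli/zstd take an optional
# level (the strings are from the Rust code above, the keyword names
# below are assumptions)
df.write_parquet("out.parquet", compression: "zstd", compression_level: 3)
df.write_parquet("plain.parquet", compression: "uncompressed")
```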
data/ext/polars/src/dataframe.rs
ADDED
@@ -0,0 +1,304 @@
+use magnus::{r_hash::ForEach, Error, RArray, RHash, RString, Value};
+use polars::io::mmap::ReaderBytes;
+use polars::prelude::*;
+use std::cell::RefCell;
+use std::fs::File;
+use std::io::{BufReader, BufWriter, Cursor};
+use std::ops::Deref;
+use std::path::PathBuf;
+
+use crate::conversion::parse_parquet_compression;
+use crate::file::{get_file_like, get_mmap_bytes_reader};
+use crate::{series, RbLazyFrame, RbPolarsErr, RbResult, RbSeries};
+
+#[magnus::wrap(class = "Polars::RbDataFrame")]
+pub struct RbDataFrame {
+    pub df: RefCell<DataFrame>,
+}
+
+impl From<DataFrame> for RbDataFrame {
+    fn from(df: DataFrame) -> Self {
+        RbDataFrame::new(df)
+    }
+}
+
+impl RbDataFrame {
+    pub fn new(df: DataFrame) -> Self {
+        RbDataFrame {
+            df: RefCell::new(df),
+        }
+    }
+
+    pub fn init(columns: RArray) -> RbResult<Self> {
+        let mut cols = Vec::new();
+        for i in columns.each() {
+            cols.push(i?.try_convert::<&RbSeries>()?.series.borrow().clone());
+        }
+        let df = DataFrame::new(cols).map_err(RbPolarsErr::from)?;
+        Ok(RbDataFrame::new(df))
+    }
+
+    pub fn read_csv(rb_f: Value, has_header: bool) -> RbResult<Self> {
+        let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
+        let df = CsvReader::new(mmap_bytes_r)
+            .has_header(has_header)
+            .finish()
+            .map_err(RbPolarsErr::from)?;
+        Ok(df.into())
+    }
+
+    pub fn read_parquet(path: PathBuf) -> RbResult<Self> {
+        let f = File::open(&path).map_err(|e| Error::runtime_error(e.to_string()))?;
+        let reader = BufReader::new(f);
+        ParquetReader::new(reader)
+            .finish()
+            .map_err(RbPolarsErr::from)
+            .map(|v| v.into())
+    }
+
+    pub fn read_json(rb_f: Value) -> RbResult<Self> {
+        // memmap the file first
+        let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
+        let mmap_read: ReaderBytes = (&mmap_bytes_r).into();
+        let bytes = mmap_read.deref();
+
+        // Happy path is our column oriented json as that is most performant
+        // on failure we try
+        match serde_json::from_slice::<DataFrame>(bytes) {
+            Ok(df) => Ok(df.into()),
+            // try arrow json reader instead
+            // this is row oriented
+            Err(_) => {
+                let out = JsonReader::new(mmap_bytes_r)
+                    .with_json_format(JsonFormat::Json)
+                    .finish()
+                    .map_err(|e| RbPolarsErr::other(format!("{:?}", e)))?;
+                Ok(out.into())
+            }
+        }
+    }
+
+    pub fn read_ndjson(rb_f: Value) -> RbResult<Self> {
+        let mmap_bytes_r = get_mmap_bytes_reader(rb_f)?;
+
+        let out = JsonReader::new(mmap_bytes_r)
+            .with_json_format(JsonFormat::JsonLines)
+            .finish()
+            .map_err(|e| RbPolarsErr::other(format!("{:?}", e)))?;
+        Ok(out.into())
+    }
+
+    pub fn write_json(&self, rb_f: Value, pretty: bool, row_oriented: bool) -> RbResult<()> {
+        let file = BufWriter::new(get_file_like(rb_f, true)?);
+
+        let r = match (pretty, row_oriented) {
+            (_, true) => JsonWriter::new(file)
+                .with_json_format(JsonFormat::Json)
+                .finish(&mut self.df.borrow_mut()),
+            (true, _) => serde_json::to_writer_pretty(file, &*self.df.borrow())
+                .map_err(|e| PolarsError::ComputeError(format!("{:?}", e).into())),
+            (false, _) => serde_json::to_writer(file, &*self.df.borrow())
+                .map_err(|e| PolarsError::ComputeError(format!("{:?}", e).into())),
+        };
+        r.map_err(|e| RbPolarsErr::other(format!("{:?}", e)))?;
+        Ok(())
+    }
+
+    pub fn write_ndjson(&self, rb_f: Value) -> RbResult<()> {
+        let file = BufWriter::new(get_file_like(rb_f, true)?);
+
+        let r = JsonWriter::new(file)
+            .with_json_format(JsonFormat::JsonLines)
+            .finish(&mut self.df.borrow_mut());
+
+        r.map_err(|e| RbPolarsErr::other(format!("{:?}", e)))?;
+        Ok(())
+    }
+
+    pub fn read_hash(data: RHash) -> RbResult<Self> {
+        let mut cols: Vec<Series> = Vec::new();
+        data.foreach(|name: String, values: Value| {
+            let obj: Value = series().funcall("new", (name, values))?;
+            let rbseries = obj.funcall::<_, _, &RbSeries>("_s", ())?;
+            cols.push(rbseries.series.borrow().clone());
+            Ok(ForEach::Continue)
+        })?;
+        let df = DataFrame::new(cols).map_err(RbPolarsErr::from)?;
+        Ok(df.into())
+    }
+
+    #[allow(clippy::too_many_arguments)]
+    pub fn write_csv(
+        &self,
+        rb_f: Value,
+        has_header: bool,
+        sep: u8,
+        quote: u8,
+        batch_size: usize,
+        datetime_format: Option<String>,
+        date_format: Option<String>,
+        time_format: Option<String>,
+        float_precision: Option<usize>,
+        null_value: Option<String>,
+    ) -> RbResult<()> {
+        let null = null_value.unwrap_or_default();
+
+        if let Ok(s) = rb_f.try_convert::<String>() {
+            let f = std::fs::File::create(&s).unwrap();
+            // no need for a buffered writer, because the csv writer does internal buffering
+            CsvWriter::new(f)
+                .has_header(has_header)
+                .with_delimiter(sep)
+                .with_quoting_char(quote)
+                .with_batch_size(batch_size)
+                .with_datetime_format(datetime_format)
+                .with_date_format(date_format)
+                .with_time_format(time_format)
+                .with_float_precision(float_precision)
+                .with_null_value(null)
+                .finish(&mut self.df.borrow_mut())
+                .map_err(RbPolarsErr::from)?;
+        } else {
+            let mut buf = Cursor::new(Vec::new());
+            CsvWriter::new(&mut buf)
+                .has_header(has_header)
+                .with_delimiter(sep)
+                .with_quoting_char(quote)
+                .with_batch_size(batch_size)
+                .with_datetime_format(datetime_format)
+                .with_date_format(date_format)
+                .with_time_format(time_format)
+                .with_float_precision(float_precision)
+                .with_null_value(null)
+                .finish(&mut self.df.borrow_mut())
+                .map_err(RbPolarsErr::from)?;
+            // TODO less copying
+            let rb_str = RString::from_slice(&buf.into_inner());
+            rb_f.funcall::<_, _, Value>("write", (rb_str,))?;
+        }
+
+        Ok(())
+    }
+
+    pub fn write_parquet(
+        &self,
+        rb_f: Value,
+        compression: String,
+        compression_level: Option<i32>,
+        statistics: bool,
+        row_group_size: Option<usize>,
+    ) -> RbResult<()> {
+        let compression = parse_parquet_compression(&compression, compression_level)?;
+
+        if let Ok(s) = rb_f.try_convert::<String>() {
+            let f = std::fs::File::create(&s).unwrap();
+            ParquetWriter::new(f)
+                .with_compression(compression)
+                .with_statistics(statistics)
+                .with_row_group_size(row_group_size)
+                .finish(&mut self.df.borrow_mut())
+                .map_err(RbPolarsErr::from)?;
+        } else {
+            todo!();
+        }
+
+        Ok(())
+    }
+
+    pub fn rechunk(&self) -> Self {
+        self.df.borrow().agg_chunks().into()
+    }
+
+    pub fn to_s(&self) -> String {
+        format!("{}", self.df.borrow())
+    }
+
+    pub fn columns(&self) -> Vec<String> {
+        self.df
+            .borrow()
+            .get_column_names()
+            .iter()
+            .map(|v| v.to_string())
+            .collect()
+    }
+
+    pub fn dtypes(&self) -> Vec<String> {
+        self.df
+            .borrow()
+            .iter()
+            .map(|s| s.dtype().to_string())
+            .collect()
+    }
+
+    pub fn shape(&self) -> (usize, usize) {
+        self.df.borrow().shape()
+    }
+
+    pub fn height(&self) -> usize {
+        self.df.borrow().height()
+    }
+
+    pub fn width(&self) -> usize {
+        self.df.borrow().width()
+    }
+
+    pub fn select_at_idx(&self, idx: usize) -> Option<RbSeries> {
+        self.df
+            .borrow()
+            .select_at_idx(idx)
+            .map(|s| RbSeries::new(s.clone()))
+    }
+
+    // TODO remove clone
+    pub fn column(&self, name: String) -> RbResult<RbSeries> {
+        self.df
+            .borrow()
+            .column(&name)
+            .map(|v| v.clone().into())
+            .map_err(RbPolarsErr::from)
+    }
+
+    pub fn sort(&self, by_column: String, reverse: bool, nulls_last: bool) -> RbResult<Self> {
+        let df = self
+            .df
+            .borrow()
+            .sort_with_options(
+                &by_column,
+                SortOptions {
+                    descending: reverse,
+                    nulls_last,
+                },
+            )
+            .map_err(RbPolarsErr::from)?;
+        Ok(RbDataFrame::new(df))
+    }
+
+    pub fn head(&self, length: Option<usize>) -> Self {
+        self.df.borrow().head(length).into()
+    }
+
+    pub fn tail(&self, length: Option<usize>) -> Self {
+        self.df.borrow().tail(length).into()
+    }
+
+    pub fn frame_equal(&self, other: &RbDataFrame, null_equal: bool) -> bool {
+        if null_equal {
+            self.df.borrow().frame_equal_missing(&other.df.borrow())
+        } else {
+            self.df.borrow().frame_equal(&other.df.borrow())
+        }
+    }
+
+    pub fn lazy(&self) -> RbLazyFrame {
+        self.df.borrow().clone().lazy().into()
+    }
+
+    pub fn mean(&self) -> Self {
+        self.df.borrow().mean().into()
+    }
+
+    pub fn null_count(&self) -> Self {
+        let df = self.df.borrow().null_count();
+        df.into()
+    }
+}
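These RbDataFrame methods back the Ruby DataFrame class in data/lib/polars/data_frame.rb (not shown in full here). A hedged sketch of the kind of round trip they enable — the exact Ruby method signatures at 0.1.0 are assumed to follow the py-polars names used below, and the hash constructor is the one from the README:

```ruby
df = Polars::DataFrame.new({a: [3, 1, 2], b: ["x", "y", "z"]})

df.shape    # backed by RbDataFrame#shape, assumed to map the Rust tuple to [3, 2]
df.columns  # backed by RbDataFrame#columns => ["a", "b"]
df.dtypes   # backed by RbDataFrame#dtypes

# sort and head map to the Rust methods of the same name
df.sort("a").head(2)

# write_csv accepts a path (the Rust side opens a File) or an IO object
df.write_csv("tmp.csv")
Polars.read_csv("tmp.csv")
```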
data/ext/polars/src/error.rs
ADDED
@@ -0,0 +1,24 @@
+use magnus::exception::arg_error;
+use magnus::Error;
+use polars::prelude::PolarsError;
+
+pub struct RbPolarsErr {}
+
+impl RbPolarsErr {
+    // convert to Error instead of Self
+    pub fn from(e: PolarsError) -> Error {
+        Error::runtime_error(e.to_string())
+    }
+
+    pub fn other(message: String) -> Error {
+        Error::runtime_error(message)
+    }
+}
+
+pub struct RbValueError {}
+
+impl RbValueError {
+    pub fn new_err(message: String) -> Error {
+        Error::new(arg_error(), message)
+    }
+}
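error.rs determines how failures surface in Ruby: PolarsError values and other internal failures go through Error::runtime_error and raise RuntimeError, while RbValueError (used for bad option strings in conversion.rs) is built on arg_error and raises ArgumentError. A hedged sketch from the Ruby side, assuming the reader forwards I/O failures unchanged:

```ruby
begin
  Polars.read_csv("does_not_exist.csv")
rescue RuntimeError => e
  # Polars and I/O errors arrive as RuntimeError via RbPolarsErr
  warn e.message
end

# Invalid option strings (an unknown join type or Parquet codec from
# conversion.rs) would instead raise ArgumentError via RbValueError.
```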
data/ext/polars/src/file.rs
ADDED
@@ -0,0 +1,28 @@
+use magnus::{Error, RString, Value};
+use polars::io::mmap::MmapBytesReader;
+use std::fs::{File, OpenOptions};
+use std::io::Cursor;
+use std::path::PathBuf;
+
+use crate::RbResult;
+
+pub fn get_file_like(f: Value, truncate: bool) -> RbResult<File> {
+    OpenOptions::new()
+        .write(true)
+        .create(true)
+        .truncate(truncate)
+        .open(f.try_convert::<PathBuf>()?)
+        .map_err(|e| Error::runtime_error(e.to_string()))
+}
+
+pub fn get_mmap_bytes_reader(rb_f: Value) -> RbResult<Box<dyn MmapBytesReader>> {
+    if let Ok(bytes) = rb_f.funcall::<_, _, RString>("read", ()) {
+        let bytes = unsafe { bytes.as_slice() };
+        // TODO avoid copy
+        Ok(Box::new(Cursor::new(bytes.to_vec())))
+    } else {
+        let p = rb_f.try_convert::<PathBuf>()?;
+        let f = File::open(p).map_err(|e| Error::runtime_error(e.to_string()))?;
+        Ok(Box::new(f))
+    }
+}
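file.rs shows the two input shapes the readers accept: any object that responds to read (its bytes are buffered into a Cursor) and a plain file path (opened directly). Assuming Polars.read_csv passes its argument straight through to the Rust reader, both of these should work:

```ruby
require "stringio"

# A path: File::open on the Rust side
Polars.read_csv("data.csv")

# An IO-like object: the Rust side calls #read and buffers the bytes
csv = StringIO.new("a,b\n1,2\n3,4\n")
Polars.read_csv(csv)
```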