polars-df 0.1.0 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
data/Cargo.toml ADDED
@@ -0,0 +1,5 @@
1
+ [workspace]
2
+ members = ["ext/polars"]
3
+
4
+ [profile.release]
5
+ strip = true
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "polars"
3
- version = "0.1.0"
3
+ version = "0.1.2"
4
4
  authors = ["Andrew Kane <andrew@ankane.org>"]
5
5
  edition = "2021"
6
6
  publish = false
@@ -15,21 +15,51 @@ serde_json = "1"
15
15
  [dependencies.polars]
16
16
  version = "0.25.1"
17
17
  features = [
18
+ "abs",
18
19
  "arange",
20
+ "concat_str",
19
21
  "csv-file",
20
22
  "cum_agg",
23
+ "cumulative_eval",
24
+ "date_offset",
21
25
  "diagonal_concat",
26
+ "diff",
27
+ "dot_product",
22
28
  "dtype-full",
29
+ "dynamic_groupby",
30
+ "ewma",
23
31
  "fmt",
24
32
  "horizontal_concat",
25
33
  "interpolate",
26
34
  "ipc",
35
+ "is_first",
36
+ "is_in",
27
37
  "json",
28
38
  "lazy",
29
39
  "lazy_regex",
40
+ "list_eval",
41
+ "log",
42
+ "meta",
43
+ "mode",
44
+ "moment",
30
45
  "parquet",
46
+ "partition_by",
47
+ "pct_change",
48
+ "product",
49
+ "random",
50
+ "rank",
51
+ "repeat_by",
52
+ "rolling_window",
53
+ "round_series",
54
+ "search_sorted",
31
55
  "semi_anti_join",
32
56
  "serde-lazy",
57
+ "sign",
58
+ "string_justify",
33
59
  "strings",
60
+ "timezones",
61
+ "to_dummies",
62
+ "top_k",
34
63
  "trigonometry",
64
+ "unique_counts",
35
65
  ]
@@ -0,0 +1,120 @@
1
+ use magnus::Value;
2
+ use polars::io::mmap::MmapBytesReader;
3
+ use polars::io::RowCount;
4
+ use polars::prelude::read_impl::OwnedBatchedCsvReader;
5
+ use polars::prelude::*;
6
+ use std::cell::RefCell;
7
+ use std::path::PathBuf;
8
+
9
+ use crate::conversion::*;
10
+ use crate::{RbDataFrame, RbPolarsErr, RbResult};
11
+
12
+ #[magnus::wrap(class = "Polars::RbBatchedCsv")]
13
+ pub struct RbBatchedCsv {
14
+ pub reader: RefCell<OwnedBatchedCsvReader>,
15
+ }
16
+
17
+ impl RbBatchedCsv {
18
+ pub fn new(arguments: &[Value]) -> RbResult<Self> {
19
+ // start arguments
20
+ // this pattern is needed for more than 16
21
+ let infer_schema_length: Option<usize> = arguments[0].try_convert()?;
22
+ let chunk_size: usize = arguments[1].try_convert()?;
23
+ let has_header: bool = arguments[2].try_convert()?;
24
+ let ignore_errors: bool = arguments[3].try_convert()?;
25
+ let n_rows: Option<usize> = arguments[4].try_convert()?;
26
+ let skip_rows: usize = arguments[5].try_convert()?;
27
+ let projection: Option<Vec<usize>> = arguments[6].try_convert()?;
28
+ let sep: String = arguments[7].try_convert()?;
29
+ let rechunk: bool = arguments[8].try_convert()?;
30
+ let columns: Option<Vec<String>> = arguments[9].try_convert()?;
31
+ let encoding: Wrap<CsvEncoding> = arguments[10].try_convert()?;
32
+ let n_threads: Option<usize> = arguments[11].try_convert()?;
33
+ let path: PathBuf = arguments[12].try_convert()?;
34
+ let overwrite_dtype: Option<Vec<(String, Wrap<DataType>)>> = arguments[13].try_convert()?;
35
+ // TODO fix
36
+ let overwrite_dtype_slice: Option<Vec<Wrap<DataType>>> = None; // arguments[14].try_convert()?;
37
+ let low_memory: bool = arguments[15].try_convert()?;
38
+ let comment_char: Option<String> = arguments[16].try_convert()?;
39
+ let quote_char: Option<String> = arguments[17].try_convert()?;
40
+ let null_values: Option<Wrap<NullValues>> = arguments[18].try_convert()?;
41
+ let parse_dates: bool = arguments[19].try_convert()?;
42
+ let skip_rows_after_header: usize = arguments[20].try_convert()?;
43
+ let row_count: Option<(String, IdxSize)> = arguments[21].try_convert()?;
44
+ let sample_size: usize = arguments[22].try_convert()?;
45
+ let eol_char: String = arguments[23].try_convert()?;
46
+ // end arguments
47
+
48
+ let null_values = null_values.map(|w| w.0);
49
+ let comment_char = comment_char.map(|s| s.as_bytes()[0]);
50
+ let eol_char = eol_char.as_bytes()[0];
51
+
52
+ let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
53
+
54
+ let quote_char = if let Some(s) = quote_char {
55
+ if s.is_empty() {
56
+ None
57
+ } else {
58
+ Some(s.as_bytes()[0])
59
+ }
60
+ } else {
61
+ None
62
+ };
63
+
64
+ let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
65
+ let fields = overwrite_dtype.iter().map(|(name, dtype)| {
66
+ let dtype = dtype.0.clone();
67
+ Field::new(name, dtype)
68
+ });
69
+ Schema::from(fields)
70
+ });
71
+
72
+ let overwrite_dtype_slice = overwrite_dtype_slice.map(|overwrite_dtype| {
73
+ overwrite_dtype
74
+ .iter()
75
+ .map(|dt| dt.0.clone())
76
+ .collect::<Vec<_>>()
77
+ });
78
+
79
+ let file = std::fs::File::open(path).map_err(RbPolarsErr::io)?;
80
+ let reader = Box::new(file) as Box<dyn MmapBytesReader>;
81
+ let reader = CsvReader::new(reader)
82
+ .infer_schema(infer_schema_length)
83
+ .has_header(has_header)
84
+ .with_n_rows(n_rows)
85
+ .with_delimiter(sep.as_bytes()[0])
86
+ .with_skip_rows(skip_rows)
87
+ .with_ignore_parser_errors(ignore_errors)
88
+ .with_projection(projection)
89
+ .with_rechunk(rechunk)
90
+ .with_chunk_size(chunk_size)
91
+ .with_encoding(encoding.0)
92
+ .with_columns(columns)
93
+ .with_n_threads(n_threads)
94
+ .with_dtypes_slice(overwrite_dtype_slice.as_deref())
95
+ .low_memory(low_memory)
96
+ .with_comment_char(comment_char)
97
+ .with_null_values(null_values)
98
+ .with_parse_dates(parse_dates)
99
+ .with_quote_char(quote_char)
100
+ .with_end_of_line_char(eol_char)
101
+ .with_skip_rows_after_header(skip_rows_after_header)
102
+ .with_row_count(row_count)
103
+ .sample_size(sample_size)
104
+ .batched(overwrite_dtype.map(Arc::new))
105
+ .map_err(RbPolarsErr::from)?;
106
+
107
+ Ok(RbBatchedCsv {
108
+ reader: RefCell::new(reader),
109
+ })
110
+ }
111
+
112
+ pub fn next_batches(&self, n: usize) -> RbResult<Option<Vec<RbDataFrame>>> {
113
+ let batches = self
114
+ .reader
115
+ .borrow_mut()
116
+ .next_batches(n)
117
+ .map_err(RbPolarsErr::from)?;
118
+ Ok(batches.map(|batches| batches.into_iter().map(|out| out.1.into()).collect()))
119
+ }
120
+ }
@@ -1,27 +1,345 @@
1
- use magnus::{Value, QNIL};
1
+ use magnus::{RArray, Symbol, TryConvert, Value, QNIL};
2
2
  use polars::chunked_array::ops::{FillNullLimit, FillNullStrategy};
3
3
  use polars::datatypes::AnyValue;
4
4
  use polars::frame::DataFrame;
5
5
  use polars::prelude::*;
6
+ use polars::series::ops::NullBehavior;
6
7
 
7
- use crate::{RbDataFrame, RbResult, RbValueError};
8
+ use crate::{RbDataFrame, RbPolarsErr, RbResult, RbSeries, RbValueError};
8
9
 
9
- pub fn wrap(val: AnyValue) -> Value {
10
- match val {
11
- AnyValue::UInt8(v) => Value::from(v),
12
- AnyValue::UInt16(v) => Value::from(v),
13
- AnyValue::UInt32(v) => Value::from(v),
14
- AnyValue::UInt64(v) => Value::from(v),
15
- AnyValue::Int8(v) => Value::from(v),
16
- AnyValue::Int16(v) => Value::from(v),
17
- AnyValue::Int32(v) => Value::from(v),
18
- AnyValue::Int64(v) => Value::from(v),
19
- AnyValue::Float32(v) => Value::from(v),
20
- AnyValue::Float64(v) => Value::from(v),
21
- AnyValue::Null => *QNIL,
22
- AnyValue::Boolean(v) => Value::from(v),
23
- AnyValue::Utf8(v) => Value::from(v),
24
- _ => todo!(),
10
/// Transparent newtype used to attach conversion trait impls to types
/// declared in other crates.
pub struct Wrap<T>(pub T);

/// Any value can be lifted into its wrapper with `.into()`.
impl<T> From<T> for Wrap<T> {
    fn from(inner: T) -> Self {
        Self(inner)
    }
}
17
+
18
+ pub fn get_rbseq(obj: Value) -> RbResult<(RArray, usize)> {
19
+ let seq: RArray = obj.try_convert()?;
20
+ let len = seq.len();
21
+ Ok((seq, len))
22
+ }
23
+
24
+ pub fn get_df(obj: Value) -> RbResult<DataFrame> {
25
+ let rbdf = obj.funcall::<_, _, &RbDataFrame>("_df", ())?;
26
+ Ok(rbdf.df.borrow().clone())
27
+ }
28
+
29
+ pub fn get_series(obj: Value) -> RbResult<Series> {
30
+ let rbs = obj.funcall::<_, _, &RbSeries>("_s", ())?;
31
+ Ok(rbs.series.borrow().clone())
32
+ }
33
+
34
+ impl TryConvert for Wrap<Utf8Chunked> {
35
+ fn try_convert(obj: Value) -> RbResult<Self> {
36
+ let (seq, len) = get_rbseq(obj)?;
37
+ let mut builder = Utf8ChunkedBuilder::new("", len, len * 25);
38
+
39
+ for res in seq.each() {
40
+ let item = res?;
41
+ match item.try_convert::<String>() {
42
+ Ok(val) => builder.append_value(&val),
43
+ Err(_) => builder.append_null(),
44
+ }
45
+ }
46
+ Ok(Wrap(builder.finish()))
47
+ }
48
+ }
49
+
50
+ impl TryConvert for Wrap<NullValues> {
51
+ fn try_convert(ob: Value) -> RbResult<Self> {
52
+ if let Ok(s) = ob.try_convert::<String>() {
53
+ Ok(Wrap(NullValues::AllColumnsSingle(s)))
54
+ } else if let Ok(s) = ob.try_convert::<Vec<String>>() {
55
+ Ok(Wrap(NullValues::AllColumns(s)))
56
+ } else if let Ok(s) = ob.try_convert::<Vec<(String, String)>>() {
57
+ Ok(Wrap(NullValues::Named(s)))
58
+ } else {
59
+ Err(RbPolarsErr::other(
60
+ "could not extract value from null_values argument".into(),
61
+ ))
62
+ }
63
+ }
64
+ }
65
+
66
+ impl From<Wrap<AnyValue<'_>>> for Value {
67
+ fn from(w: Wrap<AnyValue<'_>>) -> Self {
68
+ match w.0 {
69
+ AnyValue::UInt8(v) => Value::from(v),
70
+ AnyValue::UInt16(v) => Value::from(v),
71
+ AnyValue::UInt32(v) => Value::from(v),
72
+ AnyValue::UInt64(v) => Value::from(v),
73
+ AnyValue::Int8(v) => Value::from(v),
74
+ AnyValue::Int16(v) => Value::from(v),
75
+ AnyValue::Int32(v) => Value::from(v),
76
+ AnyValue::Int64(v) => Value::from(v),
77
+ AnyValue::Float32(v) => Value::from(v),
78
+ AnyValue::Float64(v) => Value::from(v),
79
+ AnyValue::Null => *QNIL,
80
+ AnyValue::Boolean(v) => Value::from(v),
81
+ AnyValue::Utf8(v) => Value::from(v),
82
+ _ => todo!(),
83
+ }
84
+ }
85
+ }
86
+
87
+ impl From<Wrap<DataType>> for Value {
88
+ fn from(w: Wrap<DataType>) -> Self {
89
+ Symbol::from(w.0.to_string()).into()
90
+ }
91
+ }
92
+
93
+ impl TryConvert for Wrap<DataType> {
94
+ fn try_convert(ob: Value) -> RbResult<Self> {
95
+ let dtype = match ob.try_convert::<String>()?.as_str() {
96
+ "u8" => DataType::UInt8,
97
+ "u16" => DataType::UInt16,
98
+ "u32" => DataType::UInt32,
99
+ "u64" => DataType::UInt64,
100
+ "i8" => DataType::Int8,
101
+ "i16" => DataType::Int16,
102
+ "i32" => DataType::Int32,
103
+ "i64" => DataType::Int64,
104
+ "str" => DataType::Utf8,
105
+ "bool" => DataType::Boolean,
106
+ "f32" => DataType::Float32,
107
+ "f64" => DataType::Float64,
108
+ "date" => DataType::Date,
109
+ _ => {
110
+ return Err(RbValueError::new_err(format!(
111
+ "{} is not a supported DataType.",
112
+ ob
113
+ )))
114
+ }
115
+ };
116
+ Ok(Wrap(dtype))
117
+ }
118
+ }
119
+
120
+ impl<'s> TryConvert for Wrap<AnyValue<'s>> {
121
+ fn try_convert(ob: Value) -> RbResult<Self> {
122
+ // TODO improve
123
+ if let Ok(v) = ob.try_convert::<i64>() {
124
+ Ok(AnyValue::Int64(v).into())
125
+ } else if let Ok(v) = ob.try_convert::<f64>() {
126
+ Ok(AnyValue::Float64(v).into())
127
+ } else {
128
+ Err(RbPolarsErr::other(format!(
129
+ "object type not supported {:?}",
130
+ ob
131
+ )))
132
+ }
133
+ }
134
+ }
135
+
136
+ impl TryConvert for Wrap<CategoricalOrdering> {
137
+ fn try_convert(ob: Value) -> RbResult<Self> {
138
+ let parsed = match ob.try_convert::<String>()?.as_str() {
139
+ "physical" => CategoricalOrdering::Physical,
140
+ "lexical" => CategoricalOrdering::Lexical,
141
+ v => {
142
+ return Err(RbValueError::new_err(format!(
143
+ "ordering must be one of {{'physical', 'lexical'}}, got {}",
144
+ v
145
+ )))
146
+ }
147
+ };
148
+ Ok(Wrap(parsed))
149
+ }
150
+ }
151
+
152
+ impl TryConvert for Wrap<ClosedWindow> {
153
+ fn try_convert(ob: Value) -> RbResult<Self> {
154
+ let parsed = match ob.try_convert::<String>()?.as_str() {
155
+ "left" => ClosedWindow::Left,
156
+ "right" => ClosedWindow::Right,
157
+ "both" => ClosedWindow::Both,
158
+ "none" => ClosedWindow::None,
159
+ v => {
160
+ return Err(RbValueError::new_err(format!(
161
+ "closed must be one of {{'left', 'right', 'both', 'none'}}, got {}",
162
+ v
163
+ )))
164
+ }
165
+ };
166
+ Ok(Wrap(parsed))
167
+ }
168
+ }
169
+
170
+ impl TryConvert for Wrap<CsvEncoding> {
171
+ fn try_convert(ob: Value) -> RbResult<Self> {
172
+ let parsed = match ob.try_convert::<String>()?.as_str() {
173
+ "utf8" => CsvEncoding::Utf8,
174
+ "utf8-lossy" => CsvEncoding::LossyUtf8,
175
+ v => {
176
+ return Err(RbValueError::new_err(format!(
177
+ "encoding must be one of {{'utf8', 'utf8-lossy'}}, got {}",
178
+ v
179
+ )))
180
+ }
181
+ };
182
+ Ok(Wrap(parsed))
183
+ }
184
+ }
185
+
186
+ impl TryConvert for Wrap<Option<IpcCompression>> {
187
+ fn try_convert(ob: Value) -> RbResult<Self> {
188
+ let parsed = match ob.try_convert::<String>()?.as_str() {
189
+ "uncompressed" => None,
190
+ "lz4" => Some(IpcCompression::LZ4),
191
+ "zstd" => Some(IpcCompression::ZSTD),
192
+ v => {
193
+ return Err(RbValueError::new_err(format!(
194
+ "compression must be one of {{'uncompressed', 'lz4', 'zstd'}}, got {}",
195
+ v
196
+ )))
197
+ }
198
+ };
199
+ Ok(Wrap(parsed))
200
+ }
201
+ }
202
+
203
+ impl TryConvert for Wrap<JoinType> {
204
+ fn try_convert(ob: Value) -> RbResult<Self> {
205
+ let parsed = match ob.try_convert::<String>()?.as_str() {
206
+ "inner" => JoinType::Inner,
207
+ "left" => JoinType::Left,
208
+ "outer" => JoinType::Outer,
209
+ "semi" => JoinType::Semi,
210
+ "anti" => JoinType::Anti,
211
+ // #[cfg(feature = "cross_join")]
212
+ // "cross" => JoinType::Cross,
213
+ v => {
214
+ return Err(RbValueError::new_err(format!(
215
+ "how must be one of {{'inner', 'left', 'outer', 'semi', 'anti', 'cross'}}, got {}",
216
+ v
217
+ )))
218
+ }
219
+ };
220
+ Ok(Wrap(parsed))
221
+ }
222
+ }
223
+
224
+ impl TryConvert for Wrap<NullBehavior> {
225
+ fn try_convert(ob: Value) -> RbResult<Self> {
226
+ let parsed = match ob.try_convert::<String>()?.as_str() {
227
+ "drop" => NullBehavior::Drop,
228
+ "ignore" => NullBehavior::Ignore,
229
+ v => {
230
+ return Err(RbValueError::new_err(format!(
231
+ "null behavior must be one of {{'drop', 'ignore'}}, got {}",
232
+ v
233
+ )))
234
+ }
235
+ };
236
+ Ok(Wrap(parsed))
237
+ }
238
+ }
239
+
240
+ impl TryConvert for Wrap<NullStrategy> {
241
+ fn try_convert(ob: Value) -> RbResult<Self> {
242
+ let parsed = match ob.try_convert::<String>()?.as_str() {
243
+ "ignore" => NullStrategy::Ignore,
244
+ "propagate" => NullStrategy::Propagate,
245
+ v => {
246
+ return Err(RbValueError::new_err(format!(
247
+ "null strategy must be one of {{'ignore', 'propagate'}}, got {}",
248
+ v
249
+ )))
250
+ }
251
+ };
252
+ Ok(Wrap(parsed))
253
+ }
254
+ }
255
+
256
+ impl TryConvert for Wrap<ParallelStrategy> {
257
+ fn try_convert(ob: Value) -> RbResult<Self> {
258
+ let parsed = match ob.try_convert::<String>()?.as_str() {
259
+ "auto" => ParallelStrategy::Auto,
260
+ "columns" => ParallelStrategy::Columns,
261
+ "row_groups" => ParallelStrategy::RowGroups,
262
+ "none" => ParallelStrategy::None,
263
+ v => {
264
+ return Err(RbValueError::new_err(format!(
265
+ "parallel must be one of {{'auto', 'columns', 'row_groups', 'none'}}, got {}",
266
+ v
267
+ )))
268
+ }
269
+ };
270
+ Ok(Wrap(parsed))
271
+ }
272
+ }
273
+
274
+ impl TryConvert for Wrap<QuantileInterpolOptions> {
275
+ fn try_convert(ob: Value) -> RbResult<Self> {
276
+ let parsed = match ob.try_convert::<String>()?.as_str() {
277
+ "lower" => QuantileInterpolOptions::Lower,
278
+ "higher" => QuantileInterpolOptions::Higher,
279
+ "nearest" => QuantileInterpolOptions::Nearest,
280
+ "linear" => QuantileInterpolOptions::Linear,
281
+ "midpoint" => QuantileInterpolOptions::Midpoint,
282
+ v => {
283
+ return Err(RbValueError::new_err(format!(
284
+ "interpolation must be one of {{'lower', 'higher', 'nearest', 'linear', 'midpoint'}}, got {}",
285
+ v
286
+ )))
287
+ }
288
+ };
289
+ Ok(Wrap(parsed))
290
+ }
291
+ }
292
+
293
+ impl TryConvert for Wrap<RankMethod> {
294
+ fn try_convert(ob: Value) -> RbResult<Self> {
295
+ let parsed = match ob.try_convert::<String>()?.as_str() {
296
+ "min" => RankMethod::Min,
297
+ "max" => RankMethod::Max,
298
+ "average" => RankMethod::Average,
299
+ "dense" => RankMethod::Dense,
300
+ "ordinal" => RankMethod::Ordinal,
301
+ "random" => RankMethod::Random,
302
+ v => {
303
+ return Err(RbValueError::new_err(format!(
304
+ "method must be one of {{'min', 'max', 'average', 'dense', 'ordinal', 'random'}}, got {}",
305
+ v
306
+ )))
307
+ }
308
+ };
309
+ Ok(Wrap(parsed))
310
+ }
311
+ }
312
+
313
+ impl TryConvert for Wrap<TimeUnit> {
314
+ fn try_convert(ob: Value) -> RbResult<Self> {
315
+ let parsed = match ob.try_convert::<String>()?.as_str() {
316
+ "ns" => TimeUnit::Nanoseconds,
317
+ "us" => TimeUnit::Microseconds,
318
+ "ms" => TimeUnit::Milliseconds,
319
+ v => {
320
+ return Err(RbValueError::new_err(format!(
321
+ "time unit must be one of {{'ns', 'us', 'ms'}}, got {}",
322
+ v
323
+ )))
324
+ }
325
+ };
326
+ Ok(Wrap(parsed))
327
+ }
328
+ }
329
+
330
+ impl TryConvert for Wrap<UniqueKeepStrategy> {
331
+ fn try_convert(ob: Value) -> RbResult<Self> {
332
+ let parsed = match ob.try_convert::<String>()?.as_str() {
333
+ "first" => UniqueKeepStrategy::First,
334
+ "last" => UniqueKeepStrategy::Last,
335
+ v => {
336
+ return Err(RbValueError::new_err(format!(
337
+ "keep must be one of {{'first', 'last'}}, got {}",
338
+ v
339
+ )))
340
+ }
341
+ };
342
+ Ok(Wrap(parsed))
25
343
  }
26
344
  }
27
345
 
@@ -47,30 +365,6 @@ pub fn parse_fill_null_strategy(
47
365
  Ok(parsed)
48
366
  }
49
367
 
50
- pub fn wrap_join_type(ob: &str) -> RbResult<JoinType> {
51
- let parsed = match ob {
52
- "inner" => JoinType::Inner,
53
- "left" => JoinType::Left,
54
- "outer" => JoinType::Outer,
55
- "semi" => JoinType::Semi,
56
- "anti" => JoinType::Anti,
57
- // #[cfg(feature = "cross_join")]
58
- // "cross" => JoinType::Cross,
59
- v => {
60
- return Err(RbValueError::new_err(format!(
61
- "how must be one of {{'inner', 'left', 'outer', 'semi', 'anti', 'cross'}}, got {}",
62
- v
63
- )))
64
- }
65
- };
66
- Ok(parsed)
67
- }
68
-
69
- pub fn get_df(obj: Value) -> RbResult<DataFrame> {
70
- let rbdf = obj.funcall::<_, _, &RbDataFrame>("_df", ())?;
71
- Ok(rbdf.df.borrow().clone())
72
- }
73
-
74
368
  pub fn parse_parquet_compression(
75
369
  compression: &str,
76
370
  compression_level: Option<i32>,