polars-df 0.14.0 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +35 -0
- data/Cargo.lock +1523 -378
- data/LICENSE.txt +1 -0
- data/README.md +38 -4
- data/ext/polars/Cargo.toml +15 -5
- data/ext/polars/src/batched_csv.rs +7 -10
- data/ext/polars/src/conversion/any_value.rs +31 -21
- data/ext/polars/src/conversion/mod.rs +155 -48
- data/ext/polars/src/dataframe/construction.rs +0 -3
- data/ext/polars/src/dataframe/export.rs +9 -2
- data/ext/polars/src/dataframe/general.rs +15 -57
- data/ext/polars/src/dataframe/io.rs +77 -169
- data/ext/polars/src/dataframe/mod.rs +1 -0
- data/ext/polars/src/dataframe/serde.rs +15 -0
- data/ext/polars/src/error.rs +31 -48
- data/ext/polars/src/exceptions.rs +24 -0
- data/ext/polars/src/expr/binary.rs +4 -42
- data/ext/polars/src/expr/datetime.rs +5 -4
- data/ext/polars/src/expr/general.rs +16 -22
- data/ext/polars/src/expr/list.rs +18 -11
- data/ext/polars/src/expr/meta.rs +6 -2
- data/ext/polars/src/expr/rolling.rs +6 -7
- data/ext/polars/src/expr/string.rs +9 -36
- data/ext/polars/src/file.rs +78 -23
- data/ext/polars/src/functions/aggregation.rs +4 -4
- data/ext/polars/src/functions/business.rs +15 -0
- data/ext/polars/src/functions/io.rs +34 -13
- data/ext/polars/src/functions/lazy.rs +22 -12
- data/ext/polars/src/functions/meta.rs +1 -1
- data/ext/polars/src/functions/mod.rs +1 -0
- data/ext/polars/src/interop/arrow/mod.rs +1 -0
- data/ext/polars/src/interop/arrow/to_ruby.rs +83 -0
- data/ext/polars/src/interop/mod.rs +1 -0
- data/ext/polars/src/lazyframe/general.rs +920 -0
- data/ext/polars/src/lazyframe/mod.rs +3 -827
- data/ext/polars/src/lazyframe/serde.rs +31 -0
- data/ext/polars/src/lib.rs +54 -27
- data/ext/polars/src/map/dataframe.rs +10 -6
- data/ext/polars/src/map/lazy.rs +65 -4
- data/ext/polars/src/map/mod.rs +9 -8
- data/ext/polars/src/on_startup.rs +1 -1
- data/ext/polars/src/series/aggregation.rs +1 -5
- data/ext/polars/src/series/arithmetic.rs +10 -10
- data/ext/polars/src/series/construction.rs +2 -2
- data/ext/polars/src/series/export.rs +1 -1
- data/ext/polars/src/series/general.rs +631 -0
- data/ext/polars/src/series/import.rs +55 -0
- data/ext/polars/src/series/mod.rs +11 -638
- data/ext/polars/src/series/scatter.rs +2 -2
- data/ext/polars/src/utils.rs +0 -20
- data/lib/polars/batched_csv_reader.rb +0 -2
- data/lib/polars/binary_expr.rb +133 -9
- data/lib/polars/binary_name_space.rb +101 -6
- data/lib/polars/config.rb +4 -0
- data/lib/polars/data_frame.rb +452 -101
- data/lib/polars/data_type_group.rb +28 -0
- data/lib/polars/data_types.rb +3 -1
- data/lib/polars/date_time_expr.rb +244 -0
- data/lib/polars/date_time_name_space.rb +87 -0
- data/lib/polars/expr.rb +103 -2
- data/lib/polars/functions/aggregation/horizontal.rb +10 -4
- data/lib/polars/functions/as_datatype.rb +51 -2
- data/lib/polars/functions/col.rb +1 -1
- data/lib/polars/functions/eager.rb +1 -3
- data/lib/polars/functions/lazy.rb +95 -13
- data/lib/polars/functions/range/time_range.rb +21 -21
- data/lib/polars/io/csv.rb +14 -16
- data/lib/polars/io/database.rb +2 -2
- data/lib/polars/io/delta.rb +126 -0
- data/lib/polars/io/ipc.rb +14 -4
- data/lib/polars/io/ndjson.rb +10 -0
- data/lib/polars/io/parquet.rb +168 -111
- data/lib/polars/lazy_frame.rb +684 -20
- data/lib/polars/list_name_space.rb +169 -0
- data/lib/polars/selectors.rb +1226 -0
- data/lib/polars/series.rb +465 -35
- data/lib/polars/string_cache.rb +27 -1
- data/lib/polars/string_expr.rb +0 -1
- data/lib/polars/string_name_space.rb +73 -3
- data/lib/polars/struct_name_space.rb +31 -7
- data/lib/polars/utils/various.rb +5 -1
- data/lib/polars/utils.rb +45 -10
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +17 -1
- metadata +16 -9
- data/lib/polars/functions.rb +0 -57
@@ -1,16 +1,8 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
mod general;
|
2
|
+
mod serde;
|
3
|
+
|
3
4
|
use polars::lazy::frame::LazyFrame;
|
4
|
-
use polars::prelude::*;
|
5
5
|
use std::cell::RefCell;
|
6
|
-
use std::io::{BufWriter, Read};
|
7
|
-
use std::num::NonZeroUsize;
|
8
|
-
use std::path::PathBuf;
|
9
|
-
|
10
|
-
use crate::conversion::*;
|
11
|
-
use crate::expr::rb_exprs_to_exprs;
|
12
|
-
use crate::file::get_file_like;
|
13
|
-
use crate::{RbDataFrame, RbExpr, RbLazyGroupBy, RbPolarsErr, RbResult, RbValueError};
|
14
6
|
|
15
7
|
#[magnus::wrap(class = "Polars::RbLazyFrame")]
|
16
8
|
#[derive(Clone)]
|
@@ -25,819 +17,3 @@ impl From<LazyFrame> for RbLazyFrame {
|
|
25
17
|
}
|
26
18
|
}
|
27
19
|
}
|
28
|
-
|
29
|
-
impl RbLazyFrame {
|
30
|
-
pub fn read_json(rb_f: Value) -> RbResult<Self> {
|
31
|
-
// it is faster to first read to memory and then parse: https://github.com/serde-rs/json/issues/160
|
32
|
-
// so don't bother with files.
|
33
|
-
let mut json = String::new();
|
34
|
-
let _ = get_file_like(rb_f, false)?
|
35
|
-
.read_to_string(&mut json)
|
36
|
-
.unwrap();
|
37
|
-
|
38
|
-
// Safety
|
39
|
-
// we skipped the serializing/deserializing of the static in lifetime in `DataType`
|
40
|
-
// so we actually don't have a lifetime at all when serializing.
|
41
|
-
|
42
|
-
// &str still has a lifetime. Bit its ok, because we drop it immediately
|
43
|
-
// in this scope
|
44
|
-
let json = unsafe { std::mem::transmute::<&'_ str, &'static str>(json.as_str()) };
|
45
|
-
|
46
|
-
let lp = serde_json::from_str::<DslPlan>(json)
|
47
|
-
.map_err(|err| RbValueError::new_err(format!("{:?}", err)))?;
|
48
|
-
Ok(LazyFrame::from(lp).into())
|
49
|
-
}
|
50
|
-
|
51
|
-
pub fn new_from_ndjson(
|
52
|
-
path: String,
|
53
|
-
infer_schema_length: Option<usize>,
|
54
|
-
batch_size: Option<Wrap<NonZeroUsize>>,
|
55
|
-
n_rows: Option<usize>,
|
56
|
-
low_memory: bool,
|
57
|
-
rechunk: bool,
|
58
|
-
row_index: Option<(String, IdxSize)>,
|
59
|
-
) -> RbResult<Self> {
|
60
|
-
let batch_size = batch_size.map(|v| v.0);
|
61
|
-
let row_index = row_index.map(|(name, offset)| RowIndex {
|
62
|
-
name: name.into(),
|
63
|
-
offset,
|
64
|
-
});
|
65
|
-
|
66
|
-
let lf = LazyJsonLineReader::new(path)
|
67
|
-
.with_infer_schema_length(infer_schema_length.and_then(NonZeroUsize::new))
|
68
|
-
.with_batch_size(batch_size)
|
69
|
-
.with_n_rows(n_rows)
|
70
|
-
.low_memory(low_memory)
|
71
|
-
.with_rechunk(rechunk)
|
72
|
-
.with_row_index(row_index)
|
73
|
-
.finish()
|
74
|
-
.map_err(RbPolarsErr::from)?;
|
75
|
-
Ok(lf.into())
|
76
|
-
}
|
77
|
-
|
78
|
-
pub fn new_from_csv(arguments: &[Value]) -> RbResult<Self> {
|
79
|
-
// start arguments
|
80
|
-
// this pattern is needed for more than 16
|
81
|
-
let path = String::try_convert(arguments[0])?;
|
82
|
-
let separator = String::try_convert(arguments[1])?;
|
83
|
-
let has_header = bool::try_convert(arguments[2])?;
|
84
|
-
let ignore_errors = bool::try_convert(arguments[3])?;
|
85
|
-
let skip_rows = usize::try_convert(arguments[4])?;
|
86
|
-
let n_rows = Option::<usize>::try_convert(arguments[5])?;
|
87
|
-
let cache = bool::try_convert(arguments[6])?;
|
88
|
-
let overwrite_dtype = Option::<Vec<(String, Wrap<DataType>)>>::try_convert(arguments[7])?;
|
89
|
-
let low_memory = bool::try_convert(arguments[8])?;
|
90
|
-
let comment_prefix = Option::<String>::try_convert(arguments[9])?;
|
91
|
-
let quote_char = Option::<String>::try_convert(arguments[10])?;
|
92
|
-
let null_values = Option::<Wrap<NullValues>>::try_convert(arguments[11])?;
|
93
|
-
let infer_schema_length = Option::<usize>::try_convert(arguments[12])?;
|
94
|
-
let with_schema_modify = Option::<Value>::try_convert(arguments[13])?;
|
95
|
-
let rechunk = bool::try_convert(arguments[14])?;
|
96
|
-
let skip_rows_after_header = usize::try_convert(arguments[15])?;
|
97
|
-
let encoding = Wrap::<CsvEncoding>::try_convert(arguments[16])?;
|
98
|
-
let row_index = Option::<(String, IdxSize)>::try_convert(arguments[17])?;
|
99
|
-
let try_parse_dates = bool::try_convert(arguments[18])?;
|
100
|
-
let eol_char = String::try_convert(arguments[19])?;
|
101
|
-
let truncate_ragged_lines = bool::try_convert(arguments[20])?;
|
102
|
-
// end arguments
|
103
|
-
|
104
|
-
let null_values = null_values.map(|w| w.0);
|
105
|
-
let quote_char = quote_char.map(|s| s.as_bytes()[0]);
|
106
|
-
let separator = separator.as_bytes()[0];
|
107
|
-
let eol_char = eol_char.as_bytes()[0];
|
108
|
-
let row_index = row_index.map(|(name, offset)| RowIndex {
|
109
|
-
name: name.into(),
|
110
|
-
offset,
|
111
|
-
});
|
112
|
-
|
113
|
-
let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
|
114
|
-
overwrite_dtype
|
115
|
-
.into_iter()
|
116
|
-
.map(|(name, dtype)| Field::new((&*name).into(), dtype.0))
|
117
|
-
.collect::<Schema>()
|
118
|
-
});
|
119
|
-
|
120
|
-
let r = LazyCsvReader::new(path)
|
121
|
-
.with_infer_schema_length(infer_schema_length)
|
122
|
-
.with_separator(separator)
|
123
|
-
.with_has_header(has_header)
|
124
|
-
.with_ignore_errors(ignore_errors)
|
125
|
-
.with_skip_rows(skip_rows)
|
126
|
-
.with_n_rows(n_rows)
|
127
|
-
.with_cache(cache)
|
128
|
-
.with_dtype_overwrite(overwrite_dtype.map(Arc::new))
|
129
|
-
// TODO add with_schema
|
130
|
-
.with_low_memory(low_memory)
|
131
|
-
.with_comment_prefix(comment_prefix.map(|x| x.into()))
|
132
|
-
.with_quote_char(quote_char)
|
133
|
-
.with_eol_char(eol_char)
|
134
|
-
.with_rechunk(rechunk)
|
135
|
-
.with_skip_rows_after_header(skip_rows_after_header)
|
136
|
-
.with_encoding(encoding.0)
|
137
|
-
.with_row_index(row_index)
|
138
|
-
.with_try_parse_dates(try_parse_dates)
|
139
|
-
.with_null_values(null_values)
|
140
|
-
// TODO add with_missing_is_null
|
141
|
-
.with_truncate_ragged_lines(truncate_ragged_lines);
|
142
|
-
|
143
|
-
if let Some(_lambda) = with_schema_modify {
|
144
|
-
todo!();
|
145
|
-
}
|
146
|
-
|
147
|
-
Ok(r.finish().map_err(RbPolarsErr::from)?.into())
|
148
|
-
}
|
149
|
-
|
150
|
-
#[allow(clippy::too_many_arguments)]
|
151
|
-
pub fn new_from_parquet(
|
152
|
-
path: Option<PathBuf>,
|
153
|
-
paths: Vec<PathBuf>,
|
154
|
-
n_rows: Option<usize>,
|
155
|
-
cache: bool,
|
156
|
-
parallel: Wrap<ParallelStrategy>,
|
157
|
-
rechunk: bool,
|
158
|
-
row_index: Option<(String, IdxSize)>,
|
159
|
-
low_memory: bool,
|
160
|
-
use_statistics: bool,
|
161
|
-
hive_partitioning: Option<bool>,
|
162
|
-
hive_schema: Option<Wrap<Schema>>,
|
163
|
-
try_parse_hive_dates: bool,
|
164
|
-
glob: bool,
|
165
|
-
include_file_paths: Option<String>,
|
166
|
-
) -> RbResult<Self> {
|
167
|
-
let parallel = parallel.0;
|
168
|
-
let hive_schema = hive_schema.map(|s| Arc::new(s.0));
|
169
|
-
|
170
|
-
let first_path = if let Some(path) = &path {
|
171
|
-
path
|
172
|
-
} else {
|
173
|
-
paths
|
174
|
-
.first()
|
175
|
-
.ok_or_else(|| RbValueError::new_err("expected a path argument".to_string()))?
|
176
|
-
};
|
177
|
-
|
178
|
-
let row_index = row_index.map(|(name, offset)| RowIndex {
|
179
|
-
name: name.into(),
|
180
|
-
offset,
|
181
|
-
});
|
182
|
-
let hive_options = HiveOptions {
|
183
|
-
enabled: hive_partitioning,
|
184
|
-
hive_start_idx: 0,
|
185
|
-
schema: hive_schema,
|
186
|
-
try_parse_dates: try_parse_hive_dates,
|
187
|
-
};
|
188
|
-
|
189
|
-
let args = ScanArgsParquet {
|
190
|
-
n_rows,
|
191
|
-
cache,
|
192
|
-
parallel,
|
193
|
-
rechunk,
|
194
|
-
row_index,
|
195
|
-
low_memory,
|
196
|
-
cloud_options: None,
|
197
|
-
use_statistics,
|
198
|
-
hive_options,
|
199
|
-
glob,
|
200
|
-
include_file_paths: include_file_paths.map(|x| x.into()),
|
201
|
-
};
|
202
|
-
|
203
|
-
let lf = if path.is_some() {
|
204
|
-
LazyFrame::scan_parquet(first_path, args)
|
205
|
-
} else {
|
206
|
-
LazyFrame::scan_parquet_files(Arc::from(paths), args)
|
207
|
-
}
|
208
|
-
.map_err(RbPolarsErr::from)?;
|
209
|
-
Ok(lf.into())
|
210
|
-
}
|
211
|
-
|
212
|
-
#[allow(clippy::too_many_arguments)]
|
213
|
-
pub fn new_from_ipc(
|
214
|
-
path: String,
|
215
|
-
n_rows: Option<usize>,
|
216
|
-
cache: bool,
|
217
|
-
rechunk: bool,
|
218
|
-
row_index: Option<(String, IdxSize)>,
|
219
|
-
hive_partitioning: Option<bool>,
|
220
|
-
hive_schema: Option<Wrap<Schema>>,
|
221
|
-
try_parse_hive_dates: bool,
|
222
|
-
include_file_paths: Option<String>,
|
223
|
-
) -> RbResult<Self> {
|
224
|
-
let row_index = row_index.map(|(name, offset)| RowIndex {
|
225
|
-
name: name.into(),
|
226
|
-
offset,
|
227
|
-
});
|
228
|
-
|
229
|
-
let hive_options = HiveOptions {
|
230
|
-
enabled: hive_partitioning,
|
231
|
-
hive_start_idx: 0,
|
232
|
-
schema: hive_schema.map(|x| Arc::new(x.0)),
|
233
|
-
try_parse_dates: try_parse_hive_dates,
|
234
|
-
};
|
235
|
-
|
236
|
-
let args = ScanArgsIpc {
|
237
|
-
n_rows,
|
238
|
-
cache,
|
239
|
-
rechunk,
|
240
|
-
row_index,
|
241
|
-
cloud_options: None,
|
242
|
-
hive_options,
|
243
|
-
include_file_paths: include_file_paths.map(|x| x.into()),
|
244
|
-
};
|
245
|
-
let lf = LazyFrame::scan_ipc(path, args).map_err(RbPolarsErr::from)?;
|
246
|
-
Ok(lf.into())
|
247
|
-
}
|
248
|
-
|
249
|
-
pub fn write_json(&self, rb_f: Value) -> RbResult<()> {
|
250
|
-
let file = BufWriter::new(get_file_like(rb_f, true)?);
|
251
|
-
serde_json::to_writer(file, &self.ldf.borrow().logical_plan)
|
252
|
-
.map_err(|err| RbValueError::new_err(format!("{:?}", err)))?;
|
253
|
-
Ok(())
|
254
|
-
}
|
255
|
-
|
256
|
-
pub fn describe_plan(&self) -> RbResult<String> {
|
257
|
-
self.ldf
|
258
|
-
.borrow()
|
259
|
-
.describe_plan()
|
260
|
-
.map_err(RbPolarsErr::from)
|
261
|
-
.map_err(Into::into)
|
262
|
-
}
|
263
|
-
|
264
|
-
pub fn describe_optimized_plan(&self) -> RbResult<String> {
|
265
|
-
let result = self
|
266
|
-
.ldf
|
267
|
-
.borrow()
|
268
|
-
.describe_optimized_plan()
|
269
|
-
.map_err(RbPolarsErr::from)?;
|
270
|
-
Ok(result)
|
271
|
-
}
|
272
|
-
|
273
|
-
#[allow(clippy::too_many_arguments)]
|
274
|
-
pub fn optimization_toggle(
|
275
|
-
&self,
|
276
|
-
type_coercion: bool,
|
277
|
-
predicate_pushdown: bool,
|
278
|
-
projection_pushdown: bool,
|
279
|
-
simplify_expr: bool,
|
280
|
-
slice_pushdown: bool,
|
281
|
-
comm_subplan_elim: bool,
|
282
|
-
comm_subexpr_elim: bool,
|
283
|
-
allow_streaming: bool,
|
284
|
-
_eager: bool,
|
285
|
-
) -> RbLazyFrame {
|
286
|
-
let ldf = self.ldf.borrow().clone();
|
287
|
-
let mut ldf = ldf
|
288
|
-
.with_type_coercion(type_coercion)
|
289
|
-
.with_predicate_pushdown(predicate_pushdown)
|
290
|
-
.with_simplify_expr(simplify_expr)
|
291
|
-
.with_slice_pushdown(slice_pushdown)
|
292
|
-
.with_streaming(allow_streaming)
|
293
|
-
._with_eager(_eager)
|
294
|
-
.with_projection_pushdown(projection_pushdown);
|
295
|
-
|
296
|
-
ldf = ldf.with_comm_subplan_elim(comm_subplan_elim);
|
297
|
-
ldf = ldf.with_comm_subexpr_elim(comm_subexpr_elim);
|
298
|
-
|
299
|
-
ldf.into()
|
300
|
-
}
|
301
|
-
|
302
|
-
pub fn sort(
|
303
|
-
&self,
|
304
|
-
by_column: String,
|
305
|
-
descending: bool,
|
306
|
-
nulls_last: bool,
|
307
|
-
maintain_order: bool,
|
308
|
-
multithreaded: bool,
|
309
|
-
) -> Self {
|
310
|
-
let ldf = self.ldf.borrow().clone();
|
311
|
-
ldf.sort(
|
312
|
-
[&by_column],
|
313
|
-
SortMultipleOptions {
|
314
|
-
descending: vec![descending],
|
315
|
-
nulls_last: vec![nulls_last],
|
316
|
-
multithreaded,
|
317
|
-
maintain_order,
|
318
|
-
},
|
319
|
-
)
|
320
|
-
.into()
|
321
|
-
}
|
322
|
-
|
323
|
-
pub fn sort_by_exprs(
|
324
|
-
&self,
|
325
|
-
by: RArray,
|
326
|
-
descending: Vec<bool>,
|
327
|
-
nulls_last: Vec<bool>,
|
328
|
-
maintain_order: bool,
|
329
|
-
multithreaded: bool,
|
330
|
-
) -> RbResult<Self> {
|
331
|
-
let ldf = self.ldf.borrow().clone();
|
332
|
-
let exprs = rb_exprs_to_exprs(by)?;
|
333
|
-
Ok(ldf
|
334
|
-
.sort_by_exprs(
|
335
|
-
exprs,
|
336
|
-
SortMultipleOptions {
|
337
|
-
descending,
|
338
|
-
nulls_last,
|
339
|
-
maintain_order,
|
340
|
-
multithreaded,
|
341
|
-
},
|
342
|
-
)
|
343
|
-
.into())
|
344
|
-
}
|
345
|
-
|
346
|
-
pub fn cache(&self) -> Self {
|
347
|
-
let ldf = self.ldf.borrow().clone();
|
348
|
-
ldf.cache().into()
|
349
|
-
}
|
350
|
-
|
351
|
-
pub fn collect(&self) -> RbResult<RbDataFrame> {
|
352
|
-
let ldf = self.ldf.borrow().clone();
|
353
|
-
let df = ldf.collect().map_err(RbPolarsErr::from)?;
|
354
|
-
Ok(df.into())
|
355
|
-
}
|
356
|
-
|
357
|
-
#[allow(clippy::too_many_arguments)]
|
358
|
-
pub fn sink_parquet(
|
359
|
-
&self,
|
360
|
-
path: PathBuf,
|
361
|
-
compression: String,
|
362
|
-
compression_level: Option<i32>,
|
363
|
-
statistics: Wrap<StatisticsOptions>,
|
364
|
-
row_group_size: Option<usize>,
|
365
|
-
data_page_size: Option<usize>,
|
366
|
-
maintain_order: bool,
|
367
|
-
) -> RbResult<()> {
|
368
|
-
let compression = parse_parquet_compression(&compression, compression_level)?;
|
369
|
-
|
370
|
-
let options = ParquetWriteOptions {
|
371
|
-
compression,
|
372
|
-
statistics: statistics.0,
|
373
|
-
row_group_size,
|
374
|
-
data_page_size,
|
375
|
-
maintain_order,
|
376
|
-
};
|
377
|
-
|
378
|
-
let ldf = self.ldf.borrow().clone();
|
379
|
-
ldf.sink_parquet(path, options).map_err(RbPolarsErr::from)?;
|
380
|
-
Ok(())
|
381
|
-
}
|
382
|
-
|
383
|
-
pub fn sink_ipc(
|
384
|
-
&self,
|
385
|
-
path: PathBuf,
|
386
|
-
compression: Option<Wrap<IpcCompression>>,
|
387
|
-
maintain_order: bool,
|
388
|
-
) -> RbResult<()> {
|
389
|
-
let options = IpcWriterOptions {
|
390
|
-
compression: compression.map(|c| c.0),
|
391
|
-
maintain_order,
|
392
|
-
};
|
393
|
-
|
394
|
-
let ldf = self.ldf.borrow().clone();
|
395
|
-
ldf.sink_ipc(path, options).map_err(RbPolarsErr::from)?;
|
396
|
-
Ok(())
|
397
|
-
}
|
398
|
-
|
399
|
-
#[allow(clippy::too_many_arguments)]
|
400
|
-
pub fn sink_csv(
|
401
|
-
&self,
|
402
|
-
path: PathBuf,
|
403
|
-
include_bom: bool,
|
404
|
-
include_header: bool,
|
405
|
-
separator: u8,
|
406
|
-
line_terminator: String,
|
407
|
-
quote_char: u8,
|
408
|
-
batch_size: Wrap<NonZeroUsize>,
|
409
|
-
datetime_format: Option<String>,
|
410
|
-
date_format: Option<String>,
|
411
|
-
time_format: Option<String>,
|
412
|
-
float_scientific: Option<bool>,
|
413
|
-
float_precision: Option<usize>,
|
414
|
-
null_value: Option<String>,
|
415
|
-
quote_style: Option<Wrap<QuoteStyle>>,
|
416
|
-
maintain_order: bool,
|
417
|
-
) -> RbResult<()> {
|
418
|
-
let quote_style = quote_style.map_or(QuoteStyle::default(), |wrap| wrap.0);
|
419
|
-
let null_value = null_value.unwrap_or(SerializeOptions::default().null);
|
420
|
-
|
421
|
-
let serialize_options = SerializeOptions {
|
422
|
-
date_format,
|
423
|
-
time_format,
|
424
|
-
datetime_format,
|
425
|
-
float_scientific,
|
426
|
-
float_precision,
|
427
|
-
separator,
|
428
|
-
quote_char,
|
429
|
-
null: null_value,
|
430
|
-
line_terminator,
|
431
|
-
quote_style,
|
432
|
-
};
|
433
|
-
|
434
|
-
let options = CsvWriterOptions {
|
435
|
-
include_bom,
|
436
|
-
include_header,
|
437
|
-
maintain_order,
|
438
|
-
batch_size: batch_size.0,
|
439
|
-
serialize_options,
|
440
|
-
};
|
441
|
-
|
442
|
-
let ldf = self.ldf.borrow().clone();
|
443
|
-
ldf.sink_csv(path, options).map_err(RbPolarsErr::from)?;
|
444
|
-
Ok(())
|
445
|
-
}
|
446
|
-
|
447
|
-
pub fn sink_json(&self, path: PathBuf, maintain_order: bool) -> RbResult<()> {
|
448
|
-
let options = JsonWriterOptions { maintain_order };
|
449
|
-
|
450
|
-
let ldf = self.ldf.borrow().clone();
|
451
|
-
ldf.sink_json(path, options).map_err(RbPolarsErr::from)?;
|
452
|
-
Ok(())
|
453
|
-
}
|
454
|
-
|
455
|
-
pub fn fetch(&self, n_rows: usize) -> RbResult<RbDataFrame> {
|
456
|
-
let ldf = self.ldf.borrow().clone();
|
457
|
-
let df = ldf.fetch(n_rows).map_err(RbPolarsErr::from)?;
|
458
|
-
Ok(df.into())
|
459
|
-
}
|
460
|
-
|
461
|
-
pub fn filter(&self, predicate: &RbExpr) -> Self {
|
462
|
-
let ldf = self.ldf.borrow().clone();
|
463
|
-
ldf.filter(predicate.inner.clone()).into()
|
464
|
-
}
|
465
|
-
|
466
|
-
pub fn select(&self, exprs: RArray) -> RbResult<Self> {
|
467
|
-
let ldf = self.ldf.borrow().clone();
|
468
|
-
let exprs = rb_exprs_to_exprs(exprs)?;
|
469
|
-
Ok(ldf.select(exprs).into())
|
470
|
-
}
|
471
|
-
|
472
|
-
pub fn select_seq(&self, exprs: RArray) -> RbResult<Self> {
|
473
|
-
let ldf = self.ldf.borrow().clone();
|
474
|
-
let exprs = rb_exprs_to_exprs(exprs)?;
|
475
|
-
Ok(ldf.select_seq(exprs).into())
|
476
|
-
}
|
477
|
-
|
478
|
-
pub fn group_by(&self, by: RArray, maintain_order: bool) -> RbResult<RbLazyGroupBy> {
|
479
|
-
let ldf = self.ldf.borrow().clone();
|
480
|
-
let by = rb_exprs_to_exprs(by)?;
|
481
|
-
let lazy_gb = if maintain_order {
|
482
|
-
ldf.group_by_stable(by)
|
483
|
-
} else {
|
484
|
-
ldf.group_by(by)
|
485
|
-
};
|
486
|
-
Ok(RbLazyGroupBy {
|
487
|
-
lgb: RefCell::new(Some(lazy_gb)),
|
488
|
-
})
|
489
|
-
}
|
490
|
-
|
491
|
-
pub fn rolling(
|
492
|
-
&self,
|
493
|
-
index_column: &RbExpr,
|
494
|
-
period: String,
|
495
|
-
offset: String,
|
496
|
-
closed: Wrap<ClosedWindow>,
|
497
|
-
by: RArray,
|
498
|
-
) -> RbResult<RbLazyGroupBy> {
|
499
|
-
let closed_window = closed.0;
|
500
|
-
let ldf = self.ldf.borrow().clone();
|
501
|
-
let by = rb_exprs_to_exprs(by)?;
|
502
|
-
let lazy_gb = ldf.rolling(
|
503
|
-
index_column.inner.clone(),
|
504
|
-
by,
|
505
|
-
RollingGroupOptions {
|
506
|
-
index_column: "".into(),
|
507
|
-
period: Duration::parse(&period),
|
508
|
-
offset: Duration::parse(&offset),
|
509
|
-
closed_window,
|
510
|
-
},
|
511
|
-
);
|
512
|
-
|
513
|
-
Ok(RbLazyGroupBy {
|
514
|
-
lgb: RefCell::new(Some(lazy_gb)),
|
515
|
-
})
|
516
|
-
}
|
517
|
-
|
518
|
-
#[allow(clippy::too_many_arguments)]
|
519
|
-
pub fn group_by_dynamic(
|
520
|
-
&self,
|
521
|
-
index_column: &RbExpr,
|
522
|
-
every: String,
|
523
|
-
period: String,
|
524
|
-
offset: String,
|
525
|
-
label: Wrap<Label>,
|
526
|
-
include_boundaries: bool,
|
527
|
-
closed: Wrap<ClosedWindow>,
|
528
|
-
by: RArray,
|
529
|
-
start_by: Wrap<StartBy>,
|
530
|
-
) -> RbResult<RbLazyGroupBy> {
|
531
|
-
let closed_window = closed.0;
|
532
|
-
let by = rb_exprs_to_exprs(by)?;
|
533
|
-
let ldf = self.ldf.borrow().clone();
|
534
|
-
let lazy_gb = ldf.group_by_dynamic(
|
535
|
-
index_column.inner.clone(),
|
536
|
-
by,
|
537
|
-
DynamicGroupOptions {
|
538
|
-
every: Duration::parse(&every),
|
539
|
-
period: Duration::parse(&period),
|
540
|
-
offset: Duration::parse(&offset),
|
541
|
-
label: label.0,
|
542
|
-
include_boundaries,
|
543
|
-
closed_window,
|
544
|
-
start_by: start_by.0,
|
545
|
-
..Default::default()
|
546
|
-
},
|
547
|
-
);
|
548
|
-
|
549
|
-
Ok(RbLazyGroupBy {
|
550
|
-
lgb: RefCell::new(Some(lazy_gb)),
|
551
|
-
})
|
552
|
-
}
|
553
|
-
|
554
|
-
pub fn with_context(&self, contexts: RArray) -> RbResult<Self> {
|
555
|
-
let contexts = contexts
|
556
|
-
.into_iter()
|
557
|
-
.map(TryConvert::try_convert)
|
558
|
-
.collect::<RbResult<Vec<&RbLazyFrame>>>()?;
|
559
|
-
let contexts = contexts
|
560
|
-
.into_iter()
|
561
|
-
.map(|ldf| ldf.ldf.borrow().clone())
|
562
|
-
.collect::<Vec<_>>();
|
563
|
-
Ok(self.ldf.borrow().clone().with_context(contexts).into())
|
564
|
-
}
|
565
|
-
|
566
|
-
#[allow(clippy::too_many_arguments)]
|
567
|
-
pub fn join_asof(
|
568
|
-
&self,
|
569
|
-
other: &RbLazyFrame,
|
570
|
-
left_on: &RbExpr,
|
571
|
-
right_on: &RbExpr,
|
572
|
-
left_by: Option<Vec<String>>,
|
573
|
-
right_by: Option<Vec<String>>,
|
574
|
-
allow_parallel: bool,
|
575
|
-
force_parallel: bool,
|
576
|
-
suffix: String,
|
577
|
-
strategy: Wrap<AsofStrategy>,
|
578
|
-
tolerance: Option<Wrap<AnyValue<'_>>>,
|
579
|
-
tolerance_str: Option<String>,
|
580
|
-
) -> RbResult<Self> {
|
581
|
-
let ldf = self.ldf.borrow().clone();
|
582
|
-
let other = other.ldf.borrow().clone();
|
583
|
-
let left_on = left_on.inner.clone();
|
584
|
-
let right_on = right_on.inner.clone();
|
585
|
-
Ok(ldf
|
586
|
-
.join_builder()
|
587
|
-
.with(other)
|
588
|
-
.left_on([left_on])
|
589
|
-
.right_on([right_on])
|
590
|
-
.allow_parallel(allow_parallel)
|
591
|
-
.force_parallel(force_parallel)
|
592
|
-
.how(JoinType::AsOf(AsOfOptions {
|
593
|
-
strategy: strategy.0,
|
594
|
-
left_by: left_by.map(strings_to_pl_smallstr),
|
595
|
-
right_by: right_by.map(strings_to_pl_smallstr),
|
596
|
-
tolerance: tolerance.map(|t| t.0.into_static().unwrap()),
|
597
|
-
tolerance_str: tolerance_str.map(|s| s.into()),
|
598
|
-
}))
|
599
|
-
.suffix(suffix)
|
600
|
-
.finish()
|
601
|
-
.into())
|
602
|
-
}
|
603
|
-
|
604
|
-
#[allow(clippy::too_many_arguments)]
|
605
|
-
pub fn join(
|
606
|
-
&self,
|
607
|
-
other: &RbLazyFrame,
|
608
|
-
left_on: RArray,
|
609
|
-
right_on: RArray,
|
610
|
-
allow_parallel: bool,
|
611
|
-
force_parallel: bool,
|
612
|
-
join_nulls: bool,
|
613
|
-
how: Wrap<JoinType>,
|
614
|
-
suffix: String,
|
615
|
-
) -> RbResult<Self> {
|
616
|
-
let ldf = self.ldf.borrow().clone();
|
617
|
-
let other = other.ldf.borrow().clone();
|
618
|
-
let left_on = rb_exprs_to_exprs(left_on)?;
|
619
|
-
let right_on = rb_exprs_to_exprs(right_on)?;
|
620
|
-
|
621
|
-
Ok(ldf
|
622
|
-
.join_builder()
|
623
|
-
.with(other)
|
624
|
-
.left_on(left_on)
|
625
|
-
.right_on(right_on)
|
626
|
-
.allow_parallel(allow_parallel)
|
627
|
-
.force_parallel(force_parallel)
|
628
|
-
.join_nulls(join_nulls)
|
629
|
-
.how(how.0)
|
630
|
-
.suffix(suffix)
|
631
|
-
.finish()
|
632
|
-
.into())
|
633
|
-
}
|
634
|
-
|
635
|
-
pub fn with_column(&self, expr: &RbExpr) -> Self {
|
636
|
-
let ldf = self.ldf.borrow().clone();
|
637
|
-
ldf.with_column(expr.inner.clone()).into()
|
638
|
-
}
|
639
|
-
|
640
|
-
pub fn with_columns(&self, exprs: RArray) -> RbResult<Self> {
|
641
|
-
let ldf = self.ldf.borrow().clone();
|
642
|
-
Ok(ldf.with_columns(rb_exprs_to_exprs(exprs)?).into())
|
643
|
-
}
|
644
|
-
|
645
|
-
pub fn with_columns_seq(&self, exprs: RArray) -> RbResult<Self> {
|
646
|
-
let ldf = self.ldf.borrow().clone();
|
647
|
-
Ok(ldf.with_columns_seq(rb_exprs_to_exprs(exprs)?).into())
|
648
|
-
}
|
649
|
-
|
650
|
-
pub fn rename(&self, existing: Vec<String>, new: Vec<String>) -> Self {
|
651
|
-
let ldf = self.ldf.borrow().clone();
|
652
|
-
ldf.rename(existing, new).into()
|
653
|
-
}
|
654
|
-
|
655
|
-
pub fn reverse(&self) -> Self {
|
656
|
-
let ldf = self.ldf.borrow().clone();
|
657
|
-
ldf.reverse().into()
|
658
|
-
}
|
659
|
-
|
660
|
-
pub fn shift(&self, n: &RbExpr, fill_value: Option<&RbExpr>) -> Self {
|
661
|
-
let lf = self.ldf.borrow().clone();
|
662
|
-
let out = match fill_value {
|
663
|
-
Some(v) => lf.shift_and_fill(n.inner.clone(), v.inner.clone()),
|
664
|
-
None => lf.shift(n.inner.clone()),
|
665
|
-
};
|
666
|
-
out.into()
|
667
|
-
}
|
668
|
-
|
669
|
-
pub fn fill_nan(&self, fill_value: &RbExpr) -> Self {
|
670
|
-
let ldf = self.ldf.borrow().clone();
|
671
|
-
ldf.fill_nan(fill_value.inner.clone()).into()
|
672
|
-
}
|
673
|
-
|
674
|
-
pub fn min(&self) -> Self {
|
675
|
-
let ldf = self.ldf.borrow().clone();
|
676
|
-
let out = ldf.min();
|
677
|
-
out.into()
|
678
|
-
}
|
679
|
-
|
680
|
-
pub fn max(&self) -> Self {
|
681
|
-
let ldf = self.ldf.borrow().clone();
|
682
|
-
let out = ldf.max();
|
683
|
-
out.into()
|
684
|
-
}
|
685
|
-
|
686
|
-
pub fn sum(&self) -> Self {
|
687
|
-
let ldf = self.ldf.borrow().clone();
|
688
|
-
let out = ldf.sum();
|
689
|
-
out.into()
|
690
|
-
}
|
691
|
-
|
692
|
-
pub fn mean(&self) -> Self {
|
693
|
-
let ldf = self.ldf.borrow().clone();
|
694
|
-
let out = ldf.mean();
|
695
|
-
out.into()
|
696
|
-
}
|
697
|
-
|
698
|
-
pub fn std(&self, ddof: u8) -> Self {
|
699
|
-
let ldf = self.ldf.borrow().clone();
|
700
|
-
let out = ldf.std(ddof);
|
701
|
-
out.into()
|
702
|
-
}
|
703
|
-
|
704
|
-
pub fn var(&self, ddof: u8) -> Self {
|
705
|
-
let ldf = self.ldf.borrow().clone();
|
706
|
-
let out = ldf.var(ddof);
|
707
|
-
out.into()
|
708
|
-
}
|
709
|
-
|
710
|
-
pub fn median(&self) -> Self {
|
711
|
-
let ldf = self.ldf.borrow().clone();
|
712
|
-
let out = ldf.median();
|
713
|
-
out.into()
|
714
|
-
}
|
715
|
-
|
716
|
-
pub fn quantile(
|
717
|
-
&self,
|
718
|
-
quantile: &RbExpr,
|
719
|
-
interpolation: Wrap<QuantileInterpolOptions>,
|
720
|
-
) -> Self {
|
721
|
-
let ldf = self.ldf.borrow().clone();
|
722
|
-
let out = ldf.quantile(quantile.inner.clone(), interpolation.0);
|
723
|
-
out.into()
|
724
|
-
}
|
725
|
-
|
726
|
-
pub fn explode(&self, column: RArray) -> RbResult<Self> {
|
727
|
-
let ldf = self.ldf.borrow().clone();
|
728
|
-
let column = rb_exprs_to_exprs(column)?;
|
729
|
-
Ok(ldf.explode(column).into())
|
730
|
-
}
|
731
|
-
|
732
|
-
pub fn null_count(&self) -> Self {
|
733
|
-
let ldf = self.ldf.borrow().clone();
|
734
|
-
ldf.null_count().into()
|
735
|
-
}
|
736
|
-
|
737
|
-
pub fn unique(
|
738
|
-
&self,
|
739
|
-
maintain_order: bool,
|
740
|
-
subset: Option<Vec<String>>,
|
741
|
-
keep: Wrap<UniqueKeepStrategy>,
|
742
|
-
) -> RbResult<Self> {
|
743
|
-
let ldf = self.ldf.borrow().clone();
|
744
|
-
Ok(match maintain_order {
|
745
|
-
true => ldf.unique_stable_generic(subset, keep.0),
|
746
|
-
false => ldf.unique_generic(subset, keep.0),
|
747
|
-
}
|
748
|
-
.into())
|
749
|
-
}
|
750
|
-
|
751
|
-
pub fn drop_nulls(&self, subset: Option<Vec<String>>) -> Self {
|
752
|
-
let ldf = self.ldf.borrow().clone();
|
753
|
-
ldf.drop_nulls(subset.map(|v| v.into_iter().map(|s| col(&s)).collect()))
|
754
|
-
.into()
|
755
|
-
}
|
756
|
-
|
757
|
-
pub fn slice(&self, offset: i64, len: Option<IdxSize>) -> Self {
|
758
|
-
let ldf = self.ldf.borrow().clone();
|
759
|
-
ldf.slice(offset, len.unwrap_or(IdxSize::MAX)).into()
|
760
|
-
}
|
761
|
-
|
762
|
-
pub fn tail(&self, n: IdxSize) -> Self {
|
763
|
-
let ldf = self.ldf.borrow().clone();
|
764
|
-
ldf.tail(n).into()
|
765
|
-
}
|
766
|
-
|
767
|
-
pub fn unpivot(
|
768
|
-
&self,
|
769
|
-
on: RArray,
|
770
|
-
index: RArray,
|
771
|
-
value_name: Option<String>,
|
772
|
-
variable_name: Option<String>,
|
773
|
-
) -> RbResult<Self> {
|
774
|
-
let on = rb_exprs_to_exprs(on)?;
|
775
|
-
let index = rb_exprs_to_exprs(index)?;
|
776
|
-
let args = UnpivotArgsDSL {
|
777
|
-
on: on.into_iter().map(|e| e.into()).collect(),
|
778
|
-
index: index.into_iter().map(|e| e.into()).collect(),
|
779
|
-
value_name: value_name.map(|s| s.into()),
|
780
|
-
variable_name: variable_name.map(|s| s.into()),
|
781
|
-
};
|
782
|
-
|
783
|
-
let ldf = self.ldf.borrow().clone();
|
784
|
-
Ok(ldf.unpivot(args).into())
|
785
|
-
}
|
786
|
-
|
787
|
-
pub fn with_row_index(&self, name: String, offset: Option<IdxSize>) -> Self {
|
788
|
-
let ldf = self.ldf.borrow().clone();
|
789
|
-
ldf.with_row_index(&name, offset).into()
|
790
|
-
}
|
791
|
-
|
792
|
-
pub fn drop(&self, cols: Vec<String>) -> Self {
|
793
|
-
let ldf = self.ldf.borrow().clone();
|
794
|
-
ldf.drop(cols).into()
|
795
|
-
}
|
796
|
-
|
797
|
-
pub fn cast_all(&self, dtype: Wrap<DataType>, strict: bool) -> Self {
|
798
|
-
self.ldf.borrow().clone().cast_all(dtype.0, strict).into()
|
799
|
-
}
|
800
|
-
|
801
|
-
pub fn clone(&self) -> Self {
|
802
|
-
self.ldf.borrow().clone().into()
|
803
|
-
}
|
804
|
-
|
805
|
-
pub fn collect_schema(&self) -> RbResult<RHash> {
|
806
|
-
let schema = self
|
807
|
-
.ldf
|
808
|
-
.borrow_mut()
|
809
|
-
.collect_schema()
|
810
|
-
.map_err(RbPolarsErr::from)?;
|
811
|
-
|
812
|
-
let schema_dict = RHash::new();
|
813
|
-
schema.iter_fields().for_each(|fld| {
|
814
|
-
// TODO remove unwrap
|
815
|
-
schema_dict
|
816
|
-
.aset::<String, Value>(
|
817
|
-
fld.name().to_string(),
|
818
|
-
Wrap(fld.dtype().clone()).into_value(),
|
819
|
-
)
|
820
|
-
.unwrap();
|
821
|
-
});
|
822
|
-
Ok(schema_dict)
|
823
|
-
}
|
824
|
-
|
825
|
-
pub fn unnest(&self, cols: Vec<String>) -> Self {
|
826
|
-
self.ldf.borrow().clone().unnest(cols).into()
|
827
|
-
}
|
828
|
-
|
829
|
-
pub fn count(&self) -> Self {
|
830
|
-
let ldf = self.ldf.borrow().clone();
|
831
|
-
ldf.count().into()
|
832
|
-
}
|
833
|
-
|
834
|
-
pub fn merge_sorted(&self, other: &Self, key: String) -> RbResult<Self> {
|
835
|
-
let out = self
|
836
|
-
.ldf
|
837
|
-
.borrow()
|
838
|
-
.clone()
|
839
|
-
.merge_sorted(other.ldf.borrow().clone(), &key)
|
840
|
-
.map_err(RbPolarsErr::from)?;
|
841
|
-
Ok(out.into())
|
842
|
-
}
|
843
|
-
}
|