polars-df 0.14.0 → 0.15.0
The following lists the files changed between polars-df 0.14.0 and 0.15.0, with added/removed line counts per file.
- checksums.yaml +4 -4
- data/CHANGELOG.md +25 -0
- data/Cargo.lock +1296 -283
- data/LICENSE.txt +1 -0
- data/README.md +1 -2
- data/ext/polars/Cargo.toml +15 -5
- data/ext/polars/src/batched_csv.rs +7 -10
- data/ext/polars/src/conversion/any_value.rs +31 -21
- data/ext/polars/src/conversion/mod.rs +125 -28
- data/ext/polars/src/dataframe/construction.rs +0 -3
- data/ext/polars/src/dataframe/export.rs +9 -2
- data/ext/polars/src/dataframe/general.rs +16 -11
- data/ext/polars/src/dataframe/io.rs +73 -169
- data/ext/polars/src/dataframe/mod.rs +1 -0
- data/ext/polars/src/dataframe/serde.rs +15 -0
- data/ext/polars/src/error.rs +31 -48
- data/ext/polars/src/exceptions.rs +24 -0
- data/ext/polars/src/expr/binary.rs +4 -42
- data/ext/polars/src/expr/datetime.rs +5 -4
- data/ext/polars/src/expr/general.rs +13 -22
- data/ext/polars/src/expr/list.rs +18 -11
- data/ext/polars/src/expr/rolling.rs +6 -7
- data/ext/polars/src/expr/string.rs +9 -36
- data/ext/polars/src/file.rs +59 -22
- data/ext/polars/src/functions/business.rs +15 -0
- data/ext/polars/src/functions/lazy.rs +17 -8
- data/ext/polars/src/functions/mod.rs +1 -0
- data/ext/polars/src/interop/arrow/mod.rs +1 -0
- data/ext/polars/src/interop/arrow/to_ruby.rs +83 -0
- data/ext/polars/src/interop/mod.rs +1 -0
- data/ext/polars/src/lazyframe/general.rs +877 -0
- data/ext/polars/src/lazyframe/mod.rs +3 -827
- data/ext/polars/src/lazyframe/serde.rs +31 -0
- data/ext/polars/src/lib.rs +45 -14
- data/ext/polars/src/map/dataframe.rs +10 -6
- data/ext/polars/src/map/lazy.rs +65 -4
- data/ext/polars/src/map/mod.rs +9 -8
- data/ext/polars/src/on_startup.rs +1 -1
- data/ext/polars/src/series/aggregation.rs +1 -5
- data/ext/polars/src/series/arithmetic.rs +10 -10
- data/ext/polars/src/series/construction.rs +2 -2
- data/ext/polars/src/series/export.rs +1 -1
- data/ext/polars/src/series/general.rs +643 -0
- data/ext/polars/src/series/import.rs +55 -0
- data/ext/polars/src/series/mod.rs +11 -638
- data/ext/polars/src/series/scatter.rs +2 -2
- data/ext/polars/src/utils.rs +0 -20
- data/lib/polars/batched_csv_reader.rb +0 -2
- data/lib/polars/binary_expr.rb +133 -9
- data/lib/polars/binary_name_space.rb +101 -6
- data/lib/polars/config.rb +4 -0
- data/lib/polars/data_frame.rb +275 -52
- data/lib/polars/data_type_group.rb +28 -0
- data/lib/polars/data_types.rb +2 -0
- data/lib/polars/date_time_expr.rb +244 -0
- data/lib/polars/date_time_name_space.rb +87 -0
- data/lib/polars/expr.rb +103 -2
- data/lib/polars/functions/as_datatype.rb +51 -2
- data/lib/polars/functions/col.rb +1 -1
- data/lib/polars/functions/eager.rb +1 -3
- data/lib/polars/functions/lazy.rb +88 -10
- data/lib/polars/functions/range/time_range.rb +21 -21
- data/lib/polars/io/csv.rb +14 -16
- data/lib/polars/io/database.rb +2 -2
- data/lib/polars/io/ipc.rb +14 -4
- data/lib/polars/io/ndjson.rb +10 -0
- data/lib/polars/io/parquet.rb +168 -111
- data/lib/polars/lazy_frame.rb +649 -15
- data/lib/polars/list_name_space.rb +169 -0
- data/lib/polars/selectors.rb +1144 -0
- data/lib/polars/series.rb +465 -35
- data/lib/polars/string_cache.rb +27 -1
- data/lib/polars/string_expr.rb +0 -1
- data/lib/polars/string_name_space.rb +73 -3
- data/lib/polars/struct_name_space.rb +31 -7
- data/lib/polars/utils/various.rb +5 -1
- data/lib/polars/utils.rb +45 -10
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +2 -1
- metadata +14 -4
- data/lib/polars/functions.rb +0 -57
@@ -0,0 +1,877 @@
|
|
1
|
+
use magnus::{r_hash::ForEach, typed_data::Obj, IntoValue, RArray, RHash, TryConvert, Value};
|
2
|
+
use polars::io::{HiveOptions, RowIndex};
|
3
|
+
use polars::lazy::frame::LazyFrame;
|
4
|
+
use polars::prelude::*;
|
5
|
+
use polars_plan::plans::ScanSources;
|
6
|
+
use std::cell::RefCell;
|
7
|
+
use std::io::BufWriter;
|
8
|
+
use std::num::NonZeroUsize;
|
9
|
+
use std::path::PathBuf;
|
10
|
+
|
11
|
+
use crate::conversion::*;
|
12
|
+
use crate::expr::rb_exprs_to_exprs;
|
13
|
+
use crate::file::get_file_like;
|
14
|
+
use crate::{RbDataFrame, RbExpr, RbLazyFrame, RbLazyGroupBy, RbPolarsErr, RbResult, RbValueError};
|
15
|
+
|
16
|
+
/// Convert a single Ruby value into a `ScanSources` collection, also returning
/// the first filesystem path when the input was a path (used by callers that
/// need a representative path, e.g. to derive cloud options).
///
/// Returns `(Some(path), sources)` for a path input; `(None, sources)` for an
/// open file handle or an in-memory buffer.
fn rbobject_to_first_path_and_scan_sources(obj: Value) -> RbResult<(Option<PathBuf>, ScanSources)> {
    use crate::file::{get_ruby_scan_source_input, RubyScanSourceInput};
    Ok(match get_ruby_scan_source_input(obj, false)? {
        // Path input: keep a copy of the path alongside the single-element source list.
        RubyScanSourceInput::Path(path) => (Some(path.clone()), ScanSources::Paths([path].into())),
        RubyScanSourceInput::File(file) => (None, ScanSources::Files([file].into())),
        RubyScanSourceInput::Buffer(buff) => (None, ScanSources::Buffers([buff].into())),
    })
}
|
24
|
+
|
25
|
+
impl RbLazyFrame {
    /// Build a lazy scan over newline-delimited JSON from either a single Ruby
    /// source object or an already-converted `ScanSources` collection.
    ///
    /// `source` takes precedence over `sources` when present. The first path is
    /// extracted but currently unused (`_first_path`).
    #[allow(clippy::too_many_arguments)]
    pub fn new_from_ndjson(
        source: Option<Value>,
        sources: Wrap<ScanSources>,
        infer_schema_length: Option<usize>,
        batch_size: Option<Wrap<NonZeroUsize>>,
        n_rows: Option<usize>,
        low_memory: bool,
        rechunk: bool,
        row_index: Option<(String, IdxSize)>,
    ) -> RbResult<Self> {
        let batch_size = batch_size.map(|v| v.0);
        // (name, offset) tuple from Ruby → polars RowIndex.
        let row_index = row_index.map(|(name, offset)| RowIndex {
            name: name.into(),
            offset,
        });

        let sources = sources.0;
        let (_first_path, sources) = match source {
            None => (sources.first_path().map(|p| p.to_path_buf()), sources),
            Some(source) => rbobject_to_first_path_and_scan_sources(source)?,
        };

        let r = LazyJsonLineReader::new_with_sources(sources);

        let lf = r
            .with_infer_schema_length(infer_schema_length.and_then(NonZeroUsize::new))
            .with_batch_size(batch_size)
            .with_n_rows(n_rows)
            .low_memory(low_memory)
            .with_rechunk(rechunk)
            // .with_schema(schema.map(|schema| Arc::new(schema.0)))
            // .with_schema_overwrite(schema_overrides.map(|x| Arc::new(x.0)))
            .with_row_index(row_index)
            // .with_ignore_errors(ignore_errors)
            // .with_include_file_paths(include_file_paths.map(|x| x.into()))
            .finish()
            .map_err(RbPolarsErr::from)?;

        Ok(lf.into())
    }

    /// Build a lazy CSV scan. Arguments arrive as a raw slice because magnus
    /// method signatures cap out below the number of options needed here; each
    /// slot is converted by position (note `sources` lives at index 21, after
    /// the older positional arguments).
    pub fn new_from_csv(arguments: &[Value]) -> RbResult<Self> {
        // start arguments
        // this pattern is needed for more than 16
        let source = Option::<Value>::try_convert(arguments[0])?;
        let sources = Wrap::<ScanSources>::try_convert(arguments[21])?;
        let separator = String::try_convert(arguments[1])?;
        let has_header = bool::try_convert(arguments[2])?;
        let ignore_errors = bool::try_convert(arguments[3])?;
        let skip_rows = usize::try_convert(arguments[4])?;
        let n_rows = Option::<usize>::try_convert(arguments[5])?;
        let cache = bool::try_convert(arguments[6])?;
        let overwrite_dtype = Option::<Vec<(String, Wrap<DataType>)>>::try_convert(arguments[7])?;
        let low_memory = bool::try_convert(arguments[8])?;
        let comment_prefix = Option::<String>::try_convert(arguments[9])?;
        let quote_char = Option::<String>::try_convert(arguments[10])?;
        let null_values = Option::<Wrap<NullValues>>::try_convert(arguments[11])?;
        let infer_schema_length = Option::<usize>::try_convert(arguments[12])?;
        let with_schema_modify = Option::<Value>::try_convert(arguments[13])?;
        let rechunk = bool::try_convert(arguments[14])?;
        let skip_rows_after_header = usize::try_convert(arguments[15])?;
        let encoding = Wrap::<CsvEncoding>::try_convert(arguments[16])?;
        let row_index = Option::<(String, IdxSize)>::try_convert(arguments[17])?;
        let try_parse_dates = bool::try_convert(arguments[18])?;
        let eol_char = String::try_convert(arguments[19])?;
        let truncate_ragged_lines = bool::try_convert(arguments[20])?;
        // end arguments

        let null_values = null_values.map(|w| w.0);
        // NOTE(review): as_bytes()[0] panics on an empty string — presumably the
        // Ruby layer guarantees single-character separators/quotes; confirm.
        let quote_char = quote_char.map(|s| s.as_bytes()[0]);
        let separator = separator.as_bytes()[0];
        let eol_char = eol_char.as_bytes()[0];
        let row_index = row_index.map(|(name, offset)| RowIndex {
            name: name.into(),
            offset,
        });

        // (column name, dtype) pairs → Schema used as a dtype overwrite.
        let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
            overwrite_dtype
                .into_iter()
                .map(|(name, dtype)| Field::new((&*name).into(), dtype.0))
                .collect::<Schema>()
        });

        let sources = sources.0;
        let (_first_path, sources) = match source {
            None => (sources.first_path().map(|p| p.to_path_buf()), sources),
            Some(source) => rbobject_to_first_path_and_scan_sources(source)?,
        };

        let r = LazyCsvReader::new_with_sources(sources);

        let r = r
            .with_infer_schema_length(infer_schema_length)
            .with_separator(separator)
            .with_has_header(has_header)
            .with_ignore_errors(ignore_errors)
            .with_skip_rows(skip_rows)
            .with_n_rows(n_rows)
            .with_cache(cache)
            .with_dtype_overwrite(overwrite_dtype.map(Arc::new))
            // TODO add with_schema
            .with_low_memory(low_memory)
            .with_comment_prefix(comment_prefix.map(|x| x.into()))
            .with_quote_char(quote_char)
            .with_eol_char(eol_char)
            .with_rechunk(rechunk)
            .with_skip_rows_after_header(skip_rows_after_header)
            .with_encoding(encoding.0)
            .with_row_index(row_index)
            .with_try_parse_dates(try_parse_dates)
            .with_null_values(null_values)
            // TODO add with_missing_is_null
            .with_truncate_ragged_lines(truncate_ragged_lines);

        // NOTE(review): schema-modify callbacks are not implemented yet; passing
        // one raises a Rust panic via todo!() rather than a Ruby error.
        if let Some(_lambda) = with_schema_modify {
            todo!();
        }

        Ok(r.finish().map_err(RbPolarsErr::from)?.into())
    }

    /// Build a lazy Parquet scan. Uses the raw-slice argument pattern (19
    /// positional slots). When the resolved source has a first path, cloud
    /// options are parsed from it and a retry count is applied.
    pub fn new_from_parquet(arguments: &[Value]) -> RbResult<Self> {
        let source = Option::<Value>::try_convert(arguments[0])?;
        let sources = Wrap::<ScanSources>::try_convert(arguments[1])?;
        let n_rows = Option::<usize>::try_convert(arguments[2])?;
        let cache = bool::try_convert(arguments[3])?;
        let parallel = Wrap::<ParallelStrategy>::try_convert(arguments[4])?;
        let rechunk = bool::try_convert(arguments[5])?;
        let row_index = Option::<(String, IdxSize)>::try_convert(arguments[6])?;
        let low_memory = bool::try_convert(arguments[7])?;
        let cloud_options = Option::<Vec<(String, String)>>::try_convert(arguments[8])?;
        // Accepted but not used yet (underscore-prefixed).
        let _credential_provider = Option::<Value>::try_convert(arguments[9])?;
        let use_statistics = bool::try_convert(arguments[10])?;
        let hive_partitioning = Option::<bool>::try_convert(arguments[11])?;
        let schema = Option::<Wrap<Schema>>::try_convert(arguments[12])?;
        let hive_schema = Option::<Wrap<Schema>>::try_convert(arguments[13])?;
        let try_parse_hive_dates = bool::try_convert(arguments[14])?;
        let retries = usize::try_convert(arguments[15])?;
        let glob = bool::try_convert(arguments[16])?;
        let include_file_paths = Option::<String>::try_convert(arguments[17])?;
        let allow_missing_columns = bool::try_convert(arguments[18])?;

        let parallel = parallel.0;
        let hive_schema = hive_schema.map(|s| Arc::new(s.0));

        let row_index = row_index.map(|(name, offset)| RowIndex {
            name: name.into(),
            offset,
        });

        let hive_options = HiveOptions {
            enabled: hive_partitioning,
            hive_start_idx: 0,
            schema: hive_schema,
            try_parse_dates: try_parse_hive_dates,
        };

        // cloud_options filled in below only when a first path is known.
        let mut args = ScanArgsParquet {
            n_rows,
            cache,
            parallel,
            rechunk,
            row_index,
            low_memory,
            cloud_options: None,
            use_statistics,
            schema: schema.map(|x| Arc::new(x.0)),
            hive_options,
            glob,
            include_file_paths: include_file_paths.map(|x| x.into()),
            allow_missing_columns,
        };

        let sources = sources.0;
        let (first_path, sources) = match source {
            None => (sources.first_path().map(|p| p.to_path_buf()), sources),
            Some(source) => rbobject_to_first_path_and_scan_sources(source)?,
        };

        // Cloud credentials/retries are derived from the first path's URL; file
        // and buffer sources therefore never get cloud options.
        if let Some(first_path) = first_path {
            let first_path_url = first_path.to_string_lossy();
            let cloud_options =
                parse_cloud_options(&first_path_url, cloud_options.unwrap_or_default())?;
            args.cloud_options = Some(cloud_options.with_max_retries(retries));
        }

        let lf = LazyFrame::scan_parquet_sources(sources, args).map_err(RbPolarsErr::from)?;

        Ok(lf.into())
    }

    /// Build a lazy Arrow IPC scan with optional hive-partition handling.
    #[allow(clippy::too_many_arguments)]
    pub fn new_from_ipc(
        source: Option<Value>,
        sources: Wrap<ScanSources>,
        n_rows: Option<usize>,
        cache: bool,
        rechunk: bool,
        row_index: Option<(String, IdxSize)>,
        hive_partitioning: Option<bool>,
        hive_schema: Option<Wrap<Schema>>,
        try_parse_hive_dates: bool,
        include_file_paths: Option<String>,
    ) -> RbResult<Self> {
        let row_index = row_index.map(|(name, offset)| RowIndex {
            name: name.into(),
            offset,
        });

        let hive_options = HiveOptions {
            enabled: hive_partitioning,
            hive_start_idx: 0,
            schema: hive_schema.map(|x| Arc::new(x.0)),
            try_parse_dates: try_parse_hive_dates,
        };

        let args = ScanArgsIpc {
            n_rows,
            cache,
            rechunk,
            row_index,
            cloud_options: None,
            hive_options,
            include_file_paths: include_file_paths.map(|x| x.into()),
        };

        let sources = sources.0;
        let (_first_path, sources) = match source {
            None => (sources.first_path().map(|p| p.to_path_buf()), sources),
            Some(source) => rbobject_to_first_path_and_scan_sources(source)?,
        };

        let lf = LazyFrame::scan_ipc_sources(sources, args).map_err(RbPolarsErr::from)?;
        Ok(lf.into())
    }

    /// Serialize the (unoptimized) logical plan as JSON to a Ruby IO object.
    pub fn write_json(&self, rb_f: Value) -> RbResult<()> {
        let file = BufWriter::new(get_file_like(rb_f, true)?);
        serde_json::to_writer(file, &self.ldf.borrow().logical_plan)
            .map_err(|err| RbValueError::new_err(format!("{:?}", err)))?;
        Ok(())
    }

    /// Human-readable description of the unoptimized logical plan.
    pub fn describe_plan(&self) -> RbResult<String> {
        self.ldf
            .borrow()
            .describe_plan()
            .map_err(RbPolarsErr::from)
            .map_err(Into::into)
    }

    /// Human-readable description of the optimized logical plan.
    pub fn describe_optimized_plan(&self) -> RbResult<String> {
        let result = self
            .ldf
            .borrow()
            .describe_optimized_plan()
            .map_err(RbPolarsErr::from)?;
        Ok(result)
    }

    /// Return a new frame with the given optimizer passes enabled/disabled.
    /// `_eager` is forwarded via the internal `_with_eager` toggle.
    #[allow(clippy::too_many_arguments)]
    pub fn optimization_toggle(
        &self,
        type_coercion: bool,
        predicate_pushdown: bool,
        projection_pushdown: bool,
        simplify_expr: bool,
        slice_pushdown: bool,
        comm_subplan_elim: bool,
        comm_subexpr_elim: bool,
        allow_streaming: bool,
        _eager: bool,
    ) -> RbLazyFrame {
        let ldf = self.ldf.borrow().clone();
        let mut ldf = ldf
            .with_type_coercion(type_coercion)
            .with_predicate_pushdown(predicate_pushdown)
            .with_simplify_expr(simplify_expr)
            .with_slice_pushdown(slice_pushdown)
            .with_streaming(allow_streaming)
            ._with_eager(_eager)
            .with_projection_pushdown(projection_pushdown);

        ldf = ldf.with_comm_subplan_elim(comm_subplan_elim);
        ldf = ldf.with_comm_subexpr_elim(comm_subexpr_elim);

        ldf.into()
    }

    /// Sort by a single named column.
    pub fn sort(
        &self,
        by_column: String,
        descending: bool,
        nulls_last: bool,
        maintain_order: bool,
        multithreaded: bool,
    ) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.sort(
            [&by_column],
            SortMultipleOptions {
                descending: vec![descending],
                nulls_last: vec![nulls_last],
                multithreaded,
                maintain_order,
            },
        )
        .into()
    }

    /// Sort by a list of expressions with per-expression descending/nulls_last flags.
    pub fn sort_by_exprs(
        &self,
        by: RArray,
        descending: Vec<bool>,
        nulls_last: Vec<bool>,
        maintain_order: bool,
        multithreaded: bool,
    ) -> RbResult<Self> {
        let ldf = self.ldf.borrow().clone();
        let exprs = rb_exprs_to_exprs(by)?;
        Ok(ldf
            .sort_by_exprs(
                exprs,
                SortMultipleOptions {
                    descending,
                    nulls_last,
                    maintain_order,
                    multithreaded,
                },
            )
            .into())
    }

    /// Mark this plan node for caching.
    pub fn cache(&self) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.cache().into()
    }

    /// Execute the plan and materialize a DataFrame.
    pub fn collect(&self) -> RbResult<RbDataFrame> {
        let ldf = self.ldf.borrow().clone();
        let df = ldf.collect().map_err(RbPolarsErr::from)?;
        Ok(df.into())
    }

    /// Stream the query result to a Parquet file at `path`.
    #[allow(clippy::too_many_arguments)]
    pub fn sink_parquet(
        &self,
        path: PathBuf,
        compression: String,
        compression_level: Option<i32>,
        statistics: Wrap<StatisticsOptions>,
        row_group_size: Option<usize>,
        data_page_size: Option<usize>,
        maintain_order: bool,
    ) -> RbResult<()> {
        // Validates the compression name/level combination.
        let compression = parse_parquet_compression(&compression, compression_level)?;

        let options = ParquetWriteOptions {
            compression,
            statistics: statistics.0,
            row_group_size,
            data_page_size,
            maintain_order,
        };

        let ldf = self.ldf.borrow().clone();
        ldf.sink_parquet(path, options).map_err(RbPolarsErr::from)?;
        Ok(())
    }

    /// Stream the query result to an Arrow IPC file at `path`.
    pub fn sink_ipc(
        &self,
        path: PathBuf,
        compression: Option<Wrap<IpcCompression>>,
        maintain_order: bool,
    ) -> RbResult<()> {
        let options = IpcWriterOptions {
            compression: compression.map(|c| c.0),
            maintain_order,
        };

        let ldf = self.ldf.borrow().clone();
        ldf.sink_ipc(path, options).map_err(RbPolarsErr::from)?;
        Ok(())
    }

    /// Stream the query result to a CSV file at `path`.
    #[allow(clippy::too_many_arguments)]
    pub fn sink_csv(
        &self,
        path: PathBuf,
        include_bom: bool,
        include_header: bool,
        separator: u8,
        line_terminator: String,
        quote_char: u8,
        batch_size: Wrap<NonZeroUsize>,
        datetime_format: Option<String>,
        date_format: Option<String>,
        time_format: Option<String>,
        float_scientific: Option<bool>,
        float_precision: Option<usize>,
        null_value: Option<String>,
        quote_style: Option<Wrap<QuoteStyle>>,
        maintain_order: bool,
    ) -> RbResult<()> {
        // Fall back to polars defaults when quote style / null repr not given.
        let quote_style = quote_style.map_or(QuoteStyle::default(), |wrap| wrap.0);
        let null_value = null_value.unwrap_or(SerializeOptions::default().null);

        let serialize_options = SerializeOptions {
            date_format,
            time_format,
            datetime_format,
            float_scientific,
            float_precision,
            separator,
            quote_char,
            null: null_value,
            line_terminator,
            quote_style,
        };

        let options = CsvWriterOptions {
            include_bom,
            include_header,
            maintain_order,
            batch_size: batch_size.0,
            serialize_options,
        };

        let ldf = self.ldf.borrow().clone();
        ldf.sink_csv(path, options).map_err(RbPolarsErr::from)?;
        Ok(())
    }

    /// Stream the query result to an NDJSON file at `path`.
    pub fn sink_json(&self, path: PathBuf, maintain_order: bool) -> RbResult<()> {
        let options = JsonWriterOptions { maintain_order };

        let ldf = self.ldf.borrow().clone();
        ldf.sink_json(path, options).map_err(RbPolarsErr::from)?;
        Ok(())
    }

    /// Collect a limited number of rows (debugging aid).
    pub fn fetch(&self, n_rows: usize) -> RbResult<RbDataFrame> {
        let ldf = self.ldf.borrow().clone();
        let df = ldf.fetch(n_rows).map_err(RbPolarsErr::from)?;
        Ok(df.into())
    }

    /// Filter rows by a boolean predicate expression.
    pub fn filter(&self, predicate: &RbExpr) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.filter(predicate.inner.clone()).into()
    }

    /// Project to the given expressions.
    pub fn select(&self, exprs: RArray) -> RbResult<Self> {
        let ldf = self.ldf.borrow().clone();
        let exprs = rb_exprs_to_exprs(exprs)?;
        Ok(ldf.select(exprs).into())
    }

    /// Like `select`, but expressions are evaluated sequentially rather than
    /// in parallel.
    pub fn select_seq(&self, exprs: RArray) -> RbResult<Self> {
        let ldf = self.ldf.borrow().clone();
        let exprs = rb_exprs_to_exprs(exprs)?;
        Ok(ldf.select_seq(exprs).into())
    }

    /// Group by expressions; `maintain_order` selects the stable variant.
    pub fn group_by(&self, by: RArray, maintain_order: bool) -> RbResult<RbLazyGroupBy> {
        let ldf = self.ldf.borrow().clone();
        let by = rb_exprs_to_exprs(by)?;
        let lazy_gb = if maintain_order {
            ldf.group_by_stable(by)
        } else {
            ldf.group_by(by)
        };
        Ok(RbLazyGroupBy {
            lgb: RefCell::new(Some(lazy_gb)),
        })
    }

    /// Rolling (temporal window) group-by over `index_column`.
    /// `period`/`offset` are polars duration strings (e.g. "1d").
    pub fn rolling(
        &self,
        index_column: &RbExpr,
        period: String,
        offset: String,
        closed: Wrap<ClosedWindow>,
        by: RArray,
    ) -> RbResult<RbLazyGroupBy> {
        let closed_window = closed.0;
        let ldf = self.ldf.borrow().clone();
        let by = rb_exprs_to_exprs(by)?;
        let lazy_gb = ldf.rolling(
            index_column.inner.clone(),
            by,
            RollingGroupOptions {
                // Name is resolved from the expression; left empty here.
                index_column: "".into(),
                period: Duration::parse(&period),
                offset: Duration::parse(&offset),
                closed_window,
            },
        );

        Ok(RbLazyGroupBy {
            lgb: RefCell::new(Some(lazy_gb)),
        })
    }

    /// Dynamic (fixed-interval) temporal group-by over `index_column`.
    #[allow(clippy::too_many_arguments)]
    pub fn group_by_dynamic(
        &self,
        index_column: &RbExpr,
        every: String,
        period: String,
        offset: String,
        label: Wrap<Label>,
        include_boundaries: bool,
        closed: Wrap<ClosedWindow>,
        by: RArray,
        start_by: Wrap<StartBy>,
    ) -> RbResult<RbLazyGroupBy> {
        let closed_window = closed.0;
        let by = rb_exprs_to_exprs(by)?;
        let ldf = self.ldf.borrow().clone();
        let lazy_gb = ldf.group_by_dynamic(
            index_column.inner.clone(),
            by,
            DynamicGroupOptions {
                every: Duration::parse(&every),
                period: Duration::parse(&period),
                offset: Duration::parse(&offset),
                label: label.0,
                include_boundaries,
                closed_window,
                start_by: start_by.0,
                ..Default::default()
            },
        );

        Ok(RbLazyGroupBy {
            lgb: RefCell::new(Some(lazy_gb)),
        })
    }

    /// Register other LazyFrames as extra contexts for column resolution.
    pub fn with_context(&self, contexts: RArray) -> RbResult<Self> {
        let contexts = contexts.typecheck::<Obj<RbLazyFrame>>()?;
        let contexts = contexts
            .into_iter()
            .map(|ldf| ldf.ldf.borrow().clone())
            .collect::<Vec<_>>();
        Ok(self.ldf.borrow().clone().with_context(contexts).into())
    }

    /// As-of join against `other` (nearest-key join on sorted keys).
    /// `tolerance` is an AnyValue bound; `tolerance_str` a duration string.
    #[allow(clippy::too_many_arguments)]
    pub fn join_asof(
        &self,
        other: &RbLazyFrame,
        left_on: &RbExpr,
        right_on: &RbExpr,
        left_by: Option<Vec<String>>,
        right_by: Option<Vec<String>>,
        allow_parallel: bool,
        force_parallel: bool,
        suffix: String,
        strategy: Wrap<AsofStrategy>,
        tolerance: Option<Wrap<AnyValue<'_>>>,
        tolerance_str: Option<String>,
        coalesce: bool,
    ) -> RbResult<Self> {
        let coalesce = if coalesce {
            JoinCoalesce::CoalesceColumns
        } else {
            JoinCoalesce::KeepColumns
        };
        let ldf = self.ldf.borrow().clone();
        let other = other.ldf.borrow().clone();
        let left_on = left_on.inner.clone();
        let right_on = right_on.inner.clone();
        Ok(ldf
            .join_builder()
            .with(other)
            .left_on([left_on])
            .right_on([right_on])
            .allow_parallel(allow_parallel)
            .force_parallel(force_parallel)
            .coalesce(coalesce)
            .how(JoinType::AsOf(AsOfOptions {
                strategy: strategy.0,
                left_by: left_by.map(strings_to_pl_smallstr),
                right_by: right_by.map(strings_to_pl_smallstr),
                tolerance: tolerance.map(|t| t.0.into_static()),
                tolerance_str: tolerance_str.map(|s| s.into()),
            }))
            .suffix(suffix)
            .finish()
            .into())
    }

    /// General join. `coalesce: None` defers to the join type's default
    /// coalescing behavior.
    #[allow(clippy::too_many_arguments)]
    pub fn join(
        &self,
        other: &RbLazyFrame,
        left_on: RArray,
        right_on: RArray,
        allow_parallel: bool,
        force_parallel: bool,
        join_nulls: bool,
        how: Wrap<JoinType>,
        suffix: String,
        validate: Wrap<JoinValidation>,
        coalesce: Option<bool>,
    ) -> RbResult<Self> {
        let coalesce = match coalesce {
            None => JoinCoalesce::JoinSpecific,
            Some(true) => JoinCoalesce::CoalesceColumns,
            Some(false) => JoinCoalesce::KeepColumns,
        };
        let ldf = self.ldf.borrow().clone();
        let other = other.ldf.borrow().clone();
        let left_on = rb_exprs_to_exprs(left_on)?;
        let right_on = rb_exprs_to_exprs(right_on)?;

        Ok(ldf
            .join_builder()
            .with(other)
            .left_on(left_on)
            .right_on(right_on)
            .allow_parallel(allow_parallel)
            .force_parallel(force_parallel)
            .join_nulls(join_nulls)
            .how(how.0)
            .validate(validate.0)
            .coalesce(coalesce)
            .suffix(suffix)
            .finish()
            .into())
    }

    /// Add or replace a single column.
    pub fn with_column(&self, expr: &RbExpr) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.with_column(expr.inner.clone()).into()
    }

    /// Add or replace multiple columns.
    pub fn with_columns(&self, exprs: RArray) -> RbResult<Self> {
        let ldf = self.ldf.borrow().clone();
        Ok(ldf.with_columns(rb_exprs_to_exprs(exprs)?).into())
    }

    /// Like `with_columns`, but expressions are evaluated sequentially.
    pub fn with_columns_seq(&self, exprs: RArray) -> RbResult<Self> {
        let ldf = self.ldf.borrow().clone();
        Ok(ldf.with_columns_seq(rb_exprs_to_exprs(exprs)?).into())
    }

    /// Rename columns; `strict` controls whether missing names error.
    pub fn rename(&self, existing: Vec<String>, new: Vec<String>, strict: bool) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.rename(existing, new, strict).into()
    }

    /// Reverse row order.
    pub fn reverse(&self) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.reverse().into()
    }

    /// Shift rows by `n`; with `fill_value`, vacated slots are filled instead
    /// of nulled.
    pub fn shift(&self, n: &RbExpr, fill_value: Option<&RbExpr>) -> Self {
        let lf = self.ldf.borrow().clone();
        let out = match fill_value {
            Some(v) => lf.shift_and_fill(n.inner.clone(), v.inner.clone()),
            None => lf.shift(n.inner.clone()),
        };
        out.into()
    }

    /// Replace NaN values in float columns.
    pub fn fill_nan(&self, fill_value: &RbExpr) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.fill_nan(fill_value.inner.clone()).into()
    }

    /// Column-wise minimum aggregation.
    pub fn min(&self) -> Self {
        let ldf = self.ldf.borrow().clone();
        let out = ldf.min();
        out.into()
    }

    /// Column-wise maximum aggregation.
    pub fn max(&self) -> Self {
        let ldf = self.ldf.borrow().clone();
        let out = ldf.max();
        out.into()
    }

    /// Column-wise sum aggregation.
    pub fn sum(&self) -> Self {
        let ldf = self.ldf.borrow().clone();
        let out = ldf.sum();
        out.into()
    }

    /// Column-wise mean aggregation.
    pub fn mean(&self) -> Self {
        let ldf = self.ldf.borrow().clone();
        let out = ldf.mean();
        out.into()
    }

    /// Column-wise standard deviation with `ddof` delta degrees of freedom.
    pub fn std(&self, ddof: u8) -> Self {
        let ldf = self.ldf.borrow().clone();
        let out = ldf.std(ddof);
        out.into()
    }

    /// Column-wise variance with `ddof` delta degrees of freedom.
    pub fn var(&self, ddof: u8) -> Self {
        let ldf = self.ldf.borrow().clone();
        let out = ldf.var(ddof);
        out.into()
    }

    /// Column-wise median aggregation.
    pub fn median(&self) -> Self {
        let ldf = self.ldf.borrow().clone();
        let out = ldf.median();
        out.into()
    }

    /// Column-wise quantile with the given interpolation method.
    pub fn quantile(&self, quantile: &RbExpr, interpolation: Wrap<QuantileMethod>) -> Self {
        let ldf = self.ldf.borrow().clone();
        let out = ldf.quantile(quantile.inner.clone(), interpolation.0);
        out.into()
    }

    /// Explode list columns into rows.
    pub fn explode(&self, column: RArray) -> RbResult<Self> {
        let ldf = self.ldf.borrow().clone();
        let column = rb_exprs_to_exprs(column)?;
        Ok(ldf.explode(column).into())
    }

    /// Count nulls per column.
    pub fn null_count(&self) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.null_count().into()
    }

    /// Drop duplicate rows, optionally restricted to a column subset.
    pub fn unique(
        &self,
        maintain_order: bool,
        subset: Option<Vec<String>>,
        keep: Wrap<UniqueKeepStrategy>,
    ) -> RbResult<Self> {
        let ldf = self.ldf.borrow().clone();
        Ok(match maintain_order {
            true => ldf.unique_stable_generic(subset, keep.0),
            false => ldf.unique_generic(subset, keep.0),
        }
        .into())
    }

    /// Drop rows containing nulls, optionally restricted to a column subset.
    pub fn drop_nulls(&self, subset: Option<Vec<String>>) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.drop_nulls(subset.map(|v| v.into_iter().map(|s| col(&s)).collect()))
            .into()
    }

    /// Slice rows; `len: None` means "to the end".
    pub fn slice(&self, offset: i64, len: Option<IdxSize>) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.slice(offset, len.unwrap_or(IdxSize::MAX)).into()
    }

    /// Last `n` rows.
    pub fn tail(&self, n: IdxSize) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.tail(n).into()
    }

    /// Unpivot (melt) `on` columns against `index` columns.
    pub fn unpivot(
        &self,
        on: RArray,
        index: RArray,
        value_name: Option<String>,
        variable_name: Option<String>,
    ) -> RbResult<Self> {
        let on = rb_exprs_to_exprs(on)?;
        let index = rb_exprs_to_exprs(index)?;
        let args = UnpivotArgsDSL {
            on: on.into_iter().map(|e| e.into()).collect(),
            index: index.into_iter().map(|e| e.into()).collect(),
            value_name: value_name.map(|s| s.into()),
            variable_name: variable_name.map(|s| s.into()),
        };

        let ldf = self.ldf.borrow().clone();
        Ok(ldf.unpivot(args).into())
    }

    /// Prepend a row-index column starting at `offset` (default handled upstream).
    pub fn with_row_index(&self, name: String, offset: Option<IdxSize>) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.with_row_index(&name, offset).into()
    }

    /// Drop the given columns.
    pub fn drop(&self, cols: Vec<String>) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.drop(cols).into()
    }

    /// Cast the given columns (name → dtype hash) with optional strictness.
    pub fn cast(&self, rb_dtypes: RHash, strict: bool) -> RbResult<Self> {
        // Collect Ruby hash entries first; magnus foreach can't borrow across
        // the closure otherwise.
        let mut dtypes = Vec::new();
        rb_dtypes.foreach(|k: String, v: Wrap<DataType>| {
            dtypes.push((k, v.0));
            Ok(ForEach::Continue)
        })?;
        let mut cast_map = PlHashMap::with_capacity(dtypes.len());
        cast_map.extend(dtypes.iter().map(|(k, v)| (k.as_ref(), v.clone())));
        Ok(self.ldf.borrow().clone().cast(cast_map, strict).into())
    }

    /// Cast every column to a single dtype.
    pub fn cast_all(&self, dtype: Wrap<DataType>, strict: bool) -> Self {
        self.ldf.borrow().clone().cast_all(dtype.0, strict).into()
    }

    /// Cheap clone of the lazy plan (plans are reference-counted internally).
    pub fn clone(&self) -> Self {
        self.ldf.borrow().clone().into()
    }

    /// Resolve and return the schema as a Ruby hash of name → dtype.
    pub fn collect_schema(&self) -> RbResult<RHash> {
        let schema = self
            .ldf
            .borrow_mut()
            .collect_schema()
            .map_err(RbPolarsErr::from)?;

        let schema_dict = RHash::new();
        schema.iter_fields().for_each(|fld| {
            schema_dict
                .aset::<String, Value>(
                    fld.name().to_string(),
                    Wrap(fld.dtype().clone()).into_value(),
                )
                .unwrap();
        });
        Ok(schema_dict)
    }

    /// Unnest struct columns into their fields.
    pub fn unnest(&self, cols: Vec<String>) -> Self {
        self.ldf.borrow().clone().unnest(cols).into()
    }

    /// Row count as a one-row frame.
    pub fn count(&self) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.count().into()
    }

    /// Merge two frames sorted by `key`, preserving sortedness.
    pub fn merge_sorted(&self, other: &Self, key: String) -> RbResult<Self> {
        let out = self
            .ldf
            .borrow()
            .clone()
            .merge_sorted(other.ldf.borrow().clone(), &key)
            .map_err(RbPolarsErr::from)?;
        Ok(out.into())
    }
}
|