polars-df 0.5.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +26 -0
- data/Cargo.lock +595 -709
- data/Cargo.toml +1 -0
- data/README.md +11 -9
- data/ext/polars/Cargo.toml +18 -10
- data/ext/polars/src/batched_csv.rs +26 -26
- data/ext/polars/src/conversion.rs +272 -136
- data/ext/polars/src/dataframe.rs +135 -94
- data/ext/polars/src/error.rs +8 -5
- data/ext/polars/src/expr/array.rs +15 -0
- data/ext/polars/src/expr/binary.rs +18 -6
- data/ext/polars/src/expr/datetime.rs +10 -12
- data/ext/polars/src/expr/general.rs +78 -264
- data/ext/polars/src/expr/list.rs +41 -28
- data/ext/polars/src/{expr.rs → expr/mod.rs} +5 -2
- data/ext/polars/src/expr/name.rs +44 -0
- data/ext/polars/src/expr/rolling.rs +196 -0
- data/ext/polars/src/expr/string.rs +94 -66
- data/ext/polars/src/file.rs +3 -3
- data/ext/polars/src/functions/aggregation.rs +35 -0
- data/ext/polars/src/functions/eager.rs +7 -31
- data/ext/polars/src/functions/io.rs +10 -10
- data/ext/polars/src/functions/lazy.rs +119 -54
- data/ext/polars/src/functions/meta.rs +30 -0
- data/ext/polars/src/functions/misc.rs +8 -0
- data/ext/polars/src/functions/mod.rs +5 -0
- data/ext/polars/src/functions/random.rs +6 -0
- data/ext/polars/src/functions/range.rs +46 -0
- data/ext/polars/src/functions/string_cache.rs +11 -0
- data/ext/polars/src/functions/whenthen.rs +7 -7
- data/ext/polars/src/lazyframe.rs +61 -44
- data/ext/polars/src/lib.rs +173 -84
- data/ext/polars/src/{apply → map}/dataframe.rs +28 -33
- data/ext/polars/src/{apply → map}/mod.rs +10 -6
- data/ext/polars/src/{apply → map}/series.rs +12 -16
- data/ext/polars/src/object.rs +2 -2
- data/ext/polars/src/rb_modules.rs +25 -6
- data/ext/polars/src/series/construction.rs +32 -6
- data/ext/polars/src/series/export.rs +2 -2
- data/ext/polars/src/series/set_at_idx.rs +33 -17
- data/ext/polars/src/series.rs +62 -42
- data/ext/polars/src/sql.rs +46 -0
- data/lib/polars/array_expr.rb +84 -0
- data/lib/polars/array_name_space.rb +77 -0
- data/lib/polars/batched_csv_reader.rb +1 -1
- data/lib/polars/config.rb +530 -0
- data/lib/polars/data_frame.rb +206 -131
- data/lib/polars/data_types.rb +163 -29
- data/lib/polars/date_time_expr.rb +13 -18
- data/lib/polars/date_time_name_space.rb +22 -28
- data/lib/polars/dynamic_group_by.rb +2 -2
- data/lib/polars/expr.rb +241 -151
- data/lib/polars/functions.rb +29 -38
- data/lib/polars/group_by.rb +38 -76
- data/lib/polars/io.rb +37 -2
- data/lib/polars/lazy_frame.rb +174 -95
- data/lib/polars/lazy_functions.rb +87 -63
- data/lib/polars/lazy_group_by.rb +7 -8
- data/lib/polars/list_expr.rb +40 -36
- data/lib/polars/list_name_space.rb +15 -15
- data/lib/polars/name_expr.rb +198 -0
- data/lib/polars/rolling_group_by.rb +6 -4
- data/lib/polars/series.rb +95 -28
- data/lib/polars/sql_context.rb +194 -0
- data/lib/polars/string_expr.rb +249 -69
- data/lib/polars/string_name_space.rb +155 -25
- data/lib/polars/utils.rb +119 -57
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +6 -0
- metadata +21 -7
- /data/ext/polars/src/{apply → map}/lazy.rs +0 -0
data/ext/polars/src/lazyframe.rs
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
use magnus::{IntoValue, RArray, RHash, Value};
|
1
|
+
use magnus::{IntoValue, RArray, RHash, TryConvert, Value};
|
2
2
|
use polars::io::RowCount;
|
3
3
|
use polars::lazy::frame::LazyFrame;
|
4
4
|
use polars::prelude::*;
|
@@ -78,32 +78,32 @@ impl RbLazyFrame {
|
|
78
78
|
pub fn new_from_csv(arguments: &[Value]) -> RbResult<Self> {
|
79
79
|
// start arguments
|
80
80
|
// this pattern is needed for more than 16
|
81
|
-
let path
|
82
|
-
let
|
83
|
-
let has_header
|
84
|
-
let ignore_errors
|
85
|
-
let skip_rows
|
86
|
-
let n_rows
|
87
|
-
let cache
|
88
|
-
let overwrite_dtype
|
89
|
-
let low_memory
|
90
|
-
let comment_char
|
91
|
-
let quote_char
|
92
|
-
let null_values
|
93
|
-
let infer_schema_length
|
94
|
-
let with_schema_modify
|
95
|
-
let rechunk
|
96
|
-
let skip_rows_after_header
|
97
|
-
let encoding
|
98
|
-
let row_count
|
99
|
-
let try_parse_dates
|
100
|
-
let eol_char
|
81
|
+
let path = String::try_convert(arguments[0])?;
|
82
|
+
let separator = String::try_convert(arguments[1])?;
|
83
|
+
let has_header = bool::try_convert(arguments[2])?;
|
84
|
+
let ignore_errors = bool::try_convert(arguments[3])?;
|
85
|
+
let skip_rows = usize::try_convert(arguments[4])?;
|
86
|
+
let n_rows = Option::<usize>::try_convert(arguments[5])?;
|
87
|
+
let cache = bool::try_convert(arguments[6])?;
|
88
|
+
let overwrite_dtype = Option::<Vec<(String, Wrap<DataType>)>>::try_convert(arguments[7])?;
|
89
|
+
let low_memory = bool::try_convert(arguments[8])?;
|
90
|
+
let comment_char = Option::<String>::try_convert(arguments[9])?;
|
91
|
+
let quote_char = Option::<String>::try_convert(arguments[10])?;
|
92
|
+
let null_values = Option::<Wrap<NullValues>>::try_convert(arguments[11])?;
|
93
|
+
let infer_schema_length = Option::<usize>::try_convert(arguments[12])?;
|
94
|
+
let with_schema_modify = Option::<Value>::try_convert(arguments[13])?;
|
95
|
+
let rechunk = bool::try_convert(arguments[14])?;
|
96
|
+
let skip_rows_after_header = usize::try_convert(arguments[15])?;
|
97
|
+
let encoding = Wrap::<CsvEncoding>::try_convert(arguments[16])?;
|
98
|
+
let row_count = Option::<(String, IdxSize)>::try_convert(arguments[17])?;
|
99
|
+
let try_parse_dates = bool::try_convert(arguments[18])?;
|
100
|
+
let eol_char = String::try_convert(arguments[19])?;
|
101
101
|
// end arguments
|
102
102
|
|
103
103
|
let null_values = null_values.map(|w| w.0);
|
104
104
|
let comment_char = comment_char.map(|s| s.as_bytes()[0]);
|
105
105
|
let quote_char = quote_char.map(|s| s.as_bytes()[0]);
|
106
|
-
let
|
106
|
+
let separator = separator.as_bytes()[0];
|
107
107
|
let eol_char = eol_char.as_bytes()[0];
|
108
108
|
|
109
109
|
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
@@ -116,7 +116,7 @@ impl RbLazyFrame {
|
|
116
116
|
});
|
117
117
|
let r = LazyCsvReader::new(path)
|
118
118
|
.with_infer_schema_length(infer_schema_length)
|
119
|
-
.
|
119
|
+
.with_separator(separator)
|
120
120
|
.has_header(has_header)
|
121
121
|
.with_ignore_errors(ignore_errors)
|
122
122
|
.with_skip_rows(skip_rows)
|
@@ -151,6 +151,7 @@ impl RbLazyFrame {
|
|
151
151
|
row_count: Option<(String, IdxSize)>,
|
152
152
|
low_memory: bool,
|
153
153
|
use_statistics: bool,
|
154
|
+
hive_partitioning: bool,
|
154
155
|
) -> RbResult<Self> {
|
155
156
|
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
156
157
|
let args = ScanArgsParquet {
|
@@ -163,6 +164,7 @@ impl RbLazyFrame {
|
|
163
164
|
// TODO support cloud options
|
164
165
|
cloud_options: None,
|
165
166
|
use_statistics,
|
167
|
+
hive_partitioning,
|
166
168
|
};
|
167
169
|
let lf = LazyFrame::scan_parquet(path, args).map_err(RbPolarsErr::from)?;
|
168
170
|
Ok(lf.into())
|
@@ -217,6 +219,7 @@ impl RbLazyFrame {
|
|
217
219
|
slice_pushdown: bool,
|
218
220
|
cse: bool,
|
219
221
|
allow_streaming: bool,
|
222
|
+
_eager: bool,
|
220
223
|
) -> RbLazyFrame {
|
221
224
|
let ldf = self.ldf.clone();
|
222
225
|
let ldf = ldf
|
@@ -224,13 +227,20 @@ impl RbLazyFrame {
|
|
224
227
|
.with_predicate_pushdown(predicate_pushdown)
|
225
228
|
.with_simplify_expr(simplify_expr)
|
226
229
|
.with_slice_pushdown(slice_pushdown)
|
227
|
-
.
|
230
|
+
.with_comm_subplan_elim(cse)
|
228
231
|
.with_streaming(allow_streaming)
|
232
|
+
._with_eager(_eager)
|
229
233
|
.with_projection_pushdown(projection_pushdown);
|
230
234
|
ldf.into()
|
231
235
|
}
|
232
236
|
|
233
|
-
pub fn sort(
|
237
|
+
pub fn sort(
|
238
|
+
&self,
|
239
|
+
by_column: String,
|
240
|
+
reverse: bool,
|
241
|
+
nulls_last: bool,
|
242
|
+
maintain_order: bool,
|
243
|
+
) -> Self {
|
234
244
|
let ldf = self.ldf.clone();
|
235
245
|
ldf.sort(
|
236
246
|
&by_column,
|
@@ -238,6 +248,7 @@ impl RbLazyFrame {
|
|
238
248
|
descending: reverse,
|
239
249
|
nulls_last,
|
240
250
|
multithreaded: true,
|
251
|
+
maintain_order,
|
241
252
|
},
|
242
253
|
)
|
243
254
|
.into()
|
@@ -248,10 +259,13 @@ impl RbLazyFrame {
|
|
248
259
|
by_column: RArray,
|
249
260
|
reverse: Vec<bool>,
|
250
261
|
nulls_last: bool,
|
262
|
+
maintain_order: bool,
|
251
263
|
) -> RbResult<Self> {
|
252
264
|
let ldf = self.ldf.clone();
|
253
265
|
let exprs = rb_exprs_to_exprs(by_column)?;
|
254
|
-
Ok(ldf
|
266
|
+
Ok(ldf
|
267
|
+
.sort_by_exprs(exprs, reverse, nulls_last, maintain_order)
|
268
|
+
.into())
|
255
269
|
}
|
256
270
|
|
257
271
|
pub fn cache(&self) -> Self {
|
@@ -308,31 +322,32 @@ impl RbLazyFrame {
|
|
308
322
|
Ok(ldf.select(exprs).into())
|
309
323
|
}
|
310
324
|
|
311
|
-
pub fn
|
325
|
+
pub fn group_by(&self, by: RArray, maintain_order: bool) -> RbResult<RbLazyGroupBy> {
|
312
326
|
let ldf = self.ldf.clone();
|
313
327
|
let by = rb_exprs_to_exprs(by)?;
|
314
328
|
let lazy_gb = if maintain_order {
|
315
|
-
ldf.
|
329
|
+
ldf.group_by_stable(by)
|
316
330
|
} else {
|
317
|
-
ldf.
|
331
|
+
ldf.group_by(by)
|
318
332
|
};
|
319
333
|
Ok(RbLazyGroupBy {
|
320
334
|
lgb: RefCell::new(Some(lazy_gb)),
|
321
335
|
})
|
322
336
|
}
|
323
337
|
|
324
|
-
pub fn
|
338
|
+
pub fn group_by_rolling(
|
325
339
|
&self,
|
326
340
|
index_column: &RbExpr,
|
327
341
|
period: String,
|
328
342
|
offset: String,
|
329
343
|
closed: Wrap<ClosedWindow>,
|
330
344
|
by: RArray,
|
345
|
+
check_sorted: bool,
|
331
346
|
) -> RbResult<RbLazyGroupBy> {
|
332
347
|
let closed_window = closed.0;
|
333
348
|
let ldf = self.ldf.clone();
|
334
349
|
let by = rb_exprs_to_exprs(by)?;
|
335
|
-
let lazy_gb = ldf.
|
350
|
+
let lazy_gb = ldf.group_by_rolling(
|
336
351
|
index_column.inner.clone(),
|
337
352
|
by,
|
338
353
|
RollingGroupOptions {
|
@@ -340,6 +355,7 @@ impl RbLazyFrame {
|
|
340
355
|
period: Duration::parse(&period),
|
341
356
|
offset: Duration::parse(&offset),
|
342
357
|
closed_window,
|
358
|
+
check_sorted,
|
343
359
|
},
|
344
360
|
);
|
345
361
|
|
@@ -349,32 +365,34 @@ impl RbLazyFrame {
|
|
349
365
|
}
|
350
366
|
|
351
367
|
#[allow(clippy::too_many_arguments)]
|
352
|
-
pub fn
|
368
|
+
pub fn group_by_dynamic(
|
353
369
|
&self,
|
354
370
|
index_column: &RbExpr,
|
355
371
|
every: String,
|
356
372
|
period: String,
|
357
373
|
offset: String,
|
358
|
-
|
374
|
+
label: Wrap<Label>,
|
359
375
|
include_boundaries: bool,
|
360
376
|
closed: Wrap<ClosedWindow>,
|
361
377
|
by: RArray,
|
362
378
|
start_by: Wrap<StartBy>,
|
379
|
+
check_sorted: bool,
|
363
380
|
) -> RbResult<RbLazyGroupBy> {
|
364
381
|
let closed_window = closed.0;
|
365
382
|
let by = rb_exprs_to_exprs(by)?;
|
366
383
|
let ldf = self.ldf.clone();
|
367
|
-
let lazy_gb = ldf.
|
384
|
+
let lazy_gb = ldf.group_by_dynamic(
|
368
385
|
index_column.inner.clone(),
|
369
386
|
by,
|
370
387
|
DynamicGroupOptions {
|
371
388
|
every: Duration::parse(&every),
|
372
389
|
period: Duration::parse(&period),
|
373
390
|
offset: Duration::parse(&offset),
|
374
|
-
|
391
|
+
label: label.0,
|
375
392
|
include_boundaries,
|
376
393
|
closed_window,
|
377
394
|
start_by: start_by.0,
|
395
|
+
check_sorted,
|
378
396
|
..Default::default()
|
379
397
|
},
|
380
398
|
);
|
@@ -387,7 +405,7 @@ impl RbLazyFrame {
|
|
387
405
|
pub fn with_context(&self, contexts: RArray) -> RbResult<Self> {
|
388
406
|
let contexts = contexts
|
389
407
|
.each()
|
390
|
-
.map(|v| v.unwrap()
|
408
|
+
.map(|v| TryConvert::try_convert(v.unwrap()))
|
391
409
|
.collect::<RbResult<Vec<&RbLazyFrame>>>()?;
|
392
410
|
let contexts = contexts
|
393
411
|
.into_iter()
|
@@ -478,14 +496,13 @@ impl RbLazyFrame {
|
|
478
496
|
ldf.reverse().into()
|
479
497
|
}
|
480
498
|
|
481
|
-
pub fn shift(&self,
|
482
|
-
let
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
ldf.shift_and_fill(periods, fill_value.inner.clone()).into()
|
499
|
+
pub fn shift(&self, n: &RbExpr, fill_value: Option<&RbExpr>) -> Self {
|
500
|
+
let lf = self.ldf.clone();
|
501
|
+
let out = match fill_value {
|
502
|
+
Some(v) => lf.shift_and_fill(n.inner.clone(), v.inner.clone()),
|
503
|
+
None => lf.shift(n.inner.clone()),
|
504
|
+
};
|
505
|
+
out.into()
|
489
506
|
}
|
490
507
|
|
491
508
|
pub fn fill_nan(&self, fill_value: &RbExpr) -> Self {
|