polars-df 0.14.0 → 0.15.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (81) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +25 -0
  3. data/Cargo.lock +1296 -283
  4. data/LICENSE.txt +1 -0
  5. data/README.md +1 -2
  6. data/ext/polars/Cargo.toml +15 -5
  7. data/ext/polars/src/batched_csv.rs +7 -10
  8. data/ext/polars/src/conversion/any_value.rs +31 -21
  9. data/ext/polars/src/conversion/mod.rs +125 -28
  10. data/ext/polars/src/dataframe/construction.rs +0 -3
  11. data/ext/polars/src/dataframe/export.rs +9 -2
  12. data/ext/polars/src/dataframe/general.rs +16 -11
  13. data/ext/polars/src/dataframe/io.rs +73 -169
  14. data/ext/polars/src/dataframe/mod.rs +1 -0
  15. data/ext/polars/src/dataframe/serde.rs +15 -0
  16. data/ext/polars/src/error.rs +31 -48
  17. data/ext/polars/src/exceptions.rs +24 -0
  18. data/ext/polars/src/expr/binary.rs +4 -42
  19. data/ext/polars/src/expr/datetime.rs +5 -4
  20. data/ext/polars/src/expr/general.rs +13 -22
  21. data/ext/polars/src/expr/list.rs +18 -11
  22. data/ext/polars/src/expr/rolling.rs +6 -7
  23. data/ext/polars/src/expr/string.rs +9 -36
  24. data/ext/polars/src/file.rs +59 -22
  25. data/ext/polars/src/functions/business.rs +15 -0
  26. data/ext/polars/src/functions/lazy.rs +17 -8
  27. data/ext/polars/src/functions/mod.rs +1 -0
  28. data/ext/polars/src/interop/arrow/mod.rs +1 -0
  29. data/ext/polars/src/interop/arrow/to_ruby.rs +83 -0
  30. data/ext/polars/src/interop/mod.rs +1 -0
  31. data/ext/polars/src/lazyframe/general.rs +877 -0
  32. data/ext/polars/src/lazyframe/mod.rs +3 -827
  33. data/ext/polars/src/lazyframe/serde.rs +31 -0
  34. data/ext/polars/src/lib.rs +45 -14
  35. data/ext/polars/src/map/dataframe.rs +10 -6
  36. data/ext/polars/src/map/lazy.rs +65 -4
  37. data/ext/polars/src/map/mod.rs +9 -8
  38. data/ext/polars/src/on_startup.rs +1 -1
  39. data/ext/polars/src/series/aggregation.rs +1 -5
  40. data/ext/polars/src/series/arithmetic.rs +10 -10
  41. data/ext/polars/src/series/construction.rs +2 -2
  42. data/ext/polars/src/series/export.rs +1 -1
  43. data/ext/polars/src/series/general.rs +643 -0
  44. data/ext/polars/src/series/import.rs +55 -0
  45. data/ext/polars/src/series/mod.rs +11 -638
  46. data/ext/polars/src/series/scatter.rs +2 -2
  47. data/ext/polars/src/utils.rs +0 -20
  48. data/lib/polars/batched_csv_reader.rb +0 -2
  49. data/lib/polars/binary_expr.rb +133 -9
  50. data/lib/polars/binary_name_space.rb +101 -6
  51. data/lib/polars/config.rb +4 -0
  52. data/lib/polars/data_frame.rb +275 -52
  53. data/lib/polars/data_type_group.rb +28 -0
  54. data/lib/polars/data_types.rb +2 -0
  55. data/lib/polars/date_time_expr.rb +244 -0
  56. data/lib/polars/date_time_name_space.rb +87 -0
  57. data/lib/polars/expr.rb +103 -2
  58. data/lib/polars/functions/as_datatype.rb +51 -2
  59. data/lib/polars/functions/col.rb +1 -1
  60. data/lib/polars/functions/eager.rb +1 -3
  61. data/lib/polars/functions/lazy.rb +88 -10
  62. data/lib/polars/functions/range/time_range.rb +21 -21
  63. data/lib/polars/io/csv.rb +14 -16
  64. data/lib/polars/io/database.rb +2 -2
  65. data/lib/polars/io/ipc.rb +14 -4
  66. data/lib/polars/io/ndjson.rb +10 -0
  67. data/lib/polars/io/parquet.rb +168 -111
  68. data/lib/polars/lazy_frame.rb +649 -15
  69. data/lib/polars/list_name_space.rb +169 -0
  70. data/lib/polars/selectors.rb +1144 -0
  71. data/lib/polars/series.rb +465 -35
  72. data/lib/polars/string_cache.rb +27 -1
  73. data/lib/polars/string_expr.rb +0 -1
  74. data/lib/polars/string_name_space.rb +73 -3
  75. data/lib/polars/struct_name_space.rb +31 -7
  76. data/lib/polars/utils/various.rb +5 -1
  77. data/lib/polars/utils.rb +45 -10
  78. data/lib/polars/version.rb +1 -1
  79. data/lib/polars.rb +2 -1
  80. metadata +14 -4
  81. data/lib/polars/functions.rb +0 -57
@@ -0,0 +1,877 @@
1
+ use magnus::{r_hash::ForEach, typed_data::Obj, IntoValue, RArray, RHash, TryConvert, Value};
2
+ use polars::io::{HiveOptions, RowIndex};
3
+ use polars::lazy::frame::LazyFrame;
4
+ use polars::prelude::*;
5
+ use polars_plan::plans::ScanSources;
6
+ use std::cell::RefCell;
7
+ use std::io::BufWriter;
8
+ use std::num::NonZeroUsize;
9
+ use std::path::PathBuf;
10
+
11
+ use crate::conversion::*;
12
+ use crate::expr::rb_exprs_to_exprs;
13
+ use crate::file::get_file_like;
14
+ use crate::{RbDataFrame, RbExpr, RbLazyFrame, RbLazyGroupBy, RbPolarsErr, RbResult, RbValueError};
15
+
16
+ fn rbobject_to_first_path_and_scan_sources(obj: Value) -> RbResult<(Option<PathBuf>, ScanSources)> {
17
+ use crate::file::{get_ruby_scan_source_input, RubyScanSourceInput};
18
+ Ok(match get_ruby_scan_source_input(obj, false)? {
19
+ RubyScanSourceInput::Path(path) => (Some(path.clone()), ScanSources::Paths([path].into())),
20
+ RubyScanSourceInput::File(file) => (None, ScanSources::Files([file].into())),
21
+ RubyScanSourceInput::Buffer(buff) => (None, ScanSources::Buffers([buff].into())),
22
+ })
23
+ }
24
+
25
/// Ruby-facing bindings for the polars `LazyFrame`.
///
/// Every method clones the inner `LazyFrame` out of the `RefCell` before
/// operating on it, because polars' builder-style API consumes `self`.
impl RbLazyFrame {
    /// Build a lazy NDJSON (newline-delimited JSON) scan.
    ///
    /// `source` is an optional single Ruby object (path, IO, or buffer); when
    /// `None`, the pre-converted `sources` set is used as-is. The commented-out
    /// builder calls mark reader options not yet exposed to Ruby.
    #[allow(clippy::too_many_arguments)]
    pub fn new_from_ndjson(
        source: Option<Value>,
        sources: Wrap<ScanSources>,
        infer_schema_length: Option<usize>,
        batch_size: Option<Wrap<NonZeroUsize>>,
        n_rows: Option<usize>,
        low_memory: bool,
        rechunk: bool,
        row_index: Option<(String, IdxSize)>,
    ) -> RbResult<Self> {
        let batch_size = batch_size.map(|v| v.0);
        let row_index = row_index.map(|(name, offset)| RowIndex {
            name: name.into(),
            offset,
        });

        let sources = sources.0;
        // Prefer the explicit single `source` object when given; otherwise keep
        // the pre-built source set (its first path is recorded but unused here).
        let (_first_path, sources) = match source {
            None => (sources.first_path().map(|p| p.to_path_buf()), sources),
            Some(source) => rbobject_to_first_path_and_scan_sources(source)?,
        };

        let r = LazyJsonLineReader::new_with_sources(sources);

        let lf = r
            .with_infer_schema_length(infer_schema_length.and_then(NonZeroUsize::new))
            .with_batch_size(batch_size)
            .with_n_rows(n_rows)
            .low_memory(low_memory)
            .with_rechunk(rechunk)
            // .with_schema(schema.map(|schema| Arc::new(schema.0)))
            // .with_schema_overwrite(schema_overrides.map(|x| Arc::new(x.0)))
            .with_row_index(row_index)
            // .with_ignore_errors(ignore_errors)
            // .with_include_file_paths(include_file_paths.map(|x| x.into()))
            .finish()
            .map_err(RbPolarsErr::from)?;

        Ok(lf.into())
    }

    /// Build a lazy CSV scan.
    ///
    /// Takes a flat `&[Value]` slice instead of named parameters because the
    /// call has more than 16 arguments (binding-layer limit); the index of
    /// each argument is fixed by the Ruby caller. Note `sources` arrives at
    /// index 21, out of positional order.
    pub fn new_from_csv(arguments: &[Value]) -> RbResult<Self> {
        // start arguments
        // this pattern is needed for more than 16
        let source = Option::<Value>::try_convert(arguments[0])?;
        let sources = Wrap::<ScanSources>::try_convert(arguments[21])?;
        let separator = String::try_convert(arguments[1])?;
        let has_header = bool::try_convert(arguments[2])?;
        let ignore_errors = bool::try_convert(arguments[3])?;
        let skip_rows = usize::try_convert(arguments[4])?;
        let n_rows = Option::<usize>::try_convert(arguments[5])?;
        let cache = bool::try_convert(arguments[6])?;
        let overwrite_dtype = Option::<Vec<(String, Wrap<DataType>)>>::try_convert(arguments[7])?;
        let low_memory = bool::try_convert(arguments[8])?;
        let comment_prefix = Option::<String>::try_convert(arguments[9])?;
        let quote_char = Option::<String>::try_convert(arguments[10])?;
        let null_values = Option::<Wrap<NullValues>>::try_convert(arguments[11])?;
        let infer_schema_length = Option::<usize>::try_convert(arguments[12])?;
        let with_schema_modify = Option::<Value>::try_convert(arguments[13])?;
        let rechunk = bool::try_convert(arguments[14])?;
        let skip_rows_after_header = usize::try_convert(arguments[15])?;
        let encoding = Wrap::<CsvEncoding>::try_convert(arguments[16])?;
        let row_index = Option::<(String, IdxSize)>::try_convert(arguments[17])?;
        let try_parse_dates = bool::try_convert(arguments[18])?;
        let eol_char = String::try_convert(arguments[19])?;
        let truncate_ragged_lines = bool::try_convert(arguments[20])?;
        // end arguments

        let null_values = null_values.map(|w| w.0);
        // NOTE(review): `as_bytes()[0]` panics if the Ruby side passes an empty
        // string for quote_char/separator/eol_char — presumably validated by the
        // Ruby wrapper; confirm.
        let quote_char = quote_char.map(|s| s.as_bytes()[0]);
        let separator = separator.as_bytes()[0];
        let eol_char = eol_char.as_bytes()[0];
        let row_index = row_index.map(|(name, offset)| RowIndex {
            name: name.into(),
            offset,
        });

        // Per-column dtype overrides become a partial Schema.
        let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
            overwrite_dtype
                .into_iter()
                .map(|(name, dtype)| Field::new((&*name).into(), dtype.0))
                .collect::<Schema>()
        });

        let sources = sources.0;
        let (_first_path, sources) = match source {
            None => (sources.first_path().map(|p| p.to_path_buf()), sources),
            Some(source) => rbobject_to_first_path_and_scan_sources(source)?,
        };

        let r = LazyCsvReader::new_with_sources(sources);

        let r = r
            .with_infer_schema_length(infer_schema_length)
            .with_separator(separator)
            .with_has_header(has_header)
            .with_ignore_errors(ignore_errors)
            .with_skip_rows(skip_rows)
            .with_n_rows(n_rows)
            .with_cache(cache)
            .with_dtype_overwrite(overwrite_dtype.map(Arc::new))
            // TODO add with_schema
            .with_low_memory(low_memory)
            .with_comment_prefix(comment_prefix.map(|x| x.into()))
            .with_quote_char(quote_char)
            .with_eol_char(eol_char)
            .with_rechunk(rechunk)
            .with_skip_rows_after_header(skip_rows_after_header)
            .with_encoding(encoding.0)
            .with_row_index(row_index)
            .with_try_parse_dates(try_parse_dates)
            .with_null_values(null_values)
            // TODO add with_missing_is_null
            .with_truncate_ragged_lines(truncate_ragged_lines);

        // Schema-modification callback not implemented yet: a non-nil lambda
        // currently panics via todo!().
        if let Some(_lambda) = with_schema_modify {
            todo!();
        }

        Ok(r.finish().map_err(RbPolarsErr::from)?.into())
    }

    /// Build a lazy Parquet scan (flat argument slice, same >16-args pattern
    /// as `new_from_csv`). Cloud options are only resolved when the input
    /// yields a first path (used to infer the scheme); `credential_provider`
    /// is accepted but currently ignored.
    pub fn new_from_parquet(arguments: &[Value]) -> RbResult<Self> {
        let source = Option::<Value>::try_convert(arguments[0])?;
        let sources = Wrap::<ScanSources>::try_convert(arguments[1])?;
        let n_rows = Option::<usize>::try_convert(arguments[2])?;
        let cache = bool::try_convert(arguments[3])?;
        let parallel = Wrap::<ParallelStrategy>::try_convert(arguments[4])?;
        let rechunk = bool::try_convert(arguments[5])?;
        let row_index = Option::<(String, IdxSize)>::try_convert(arguments[6])?;
        let low_memory = bool::try_convert(arguments[7])?;
        let cloud_options = Option::<Vec<(String, String)>>::try_convert(arguments[8])?;
        let _credential_provider = Option::<Value>::try_convert(arguments[9])?;
        let use_statistics = bool::try_convert(arguments[10])?;
        let hive_partitioning = Option::<bool>::try_convert(arguments[11])?;
        let schema = Option::<Wrap<Schema>>::try_convert(arguments[12])?;
        let hive_schema = Option::<Wrap<Schema>>::try_convert(arguments[13])?;
        let try_parse_hive_dates = bool::try_convert(arguments[14])?;
        let retries = usize::try_convert(arguments[15])?;
        let glob = bool::try_convert(arguments[16])?;
        let include_file_paths = Option::<String>::try_convert(arguments[17])?;
        let allow_missing_columns = bool::try_convert(arguments[18])?;

        let parallel = parallel.0;
        let hive_schema = hive_schema.map(|s| Arc::new(s.0));

        let row_index = row_index.map(|(name, offset)| RowIndex {
            name: name.into(),
            offset,
        });

        let hive_options = HiveOptions {
            enabled: hive_partitioning,
            hive_start_idx: 0,
            schema: hive_schema,
            try_parse_dates: try_parse_hive_dates,
        };

        // cloud_options is filled in below only when a first path is known.
        let mut args = ScanArgsParquet {
            n_rows,
            cache,
            parallel,
            rechunk,
            row_index,
            low_memory,
            cloud_options: None,
            use_statistics,
            schema: schema.map(|x| Arc::new(x.0)),
            hive_options,
            glob,
            include_file_paths: include_file_paths.map(|x| x.into()),
            allow_missing_columns,
        };

        let sources = sources.0;
        let (first_path, sources) = match source {
            None => (sources.first_path().map(|p| p.to_path_buf()), sources),
            Some(source) => rbobject_to_first_path_and_scan_sources(source)?,
        };

        if let Some(first_path) = first_path {
            let first_path_url = first_path.to_string_lossy();
            let cloud_options =
                parse_cloud_options(&first_path_url, cloud_options.unwrap_or_default())?;
            args.cloud_options = Some(cloud_options.with_max_retries(retries));
        }

        let lf = LazyFrame::scan_parquet_sources(sources, args).map_err(RbPolarsErr::from)?;

        Ok(lf.into())
    }

    /// Build a lazy Arrow IPC scan. Cloud options are not wired up here
    /// (always `None`), unlike the Parquet path.
    #[allow(clippy::too_many_arguments)]
    pub fn new_from_ipc(
        source: Option<Value>,
        sources: Wrap<ScanSources>,
        n_rows: Option<usize>,
        cache: bool,
        rechunk: bool,
        row_index: Option<(String, IdxSize)>,
        hive_partitioning: Option<bool>,
        hive_schema: Option<Wrap<Schema>>,
        try_parse_hive_dates: bool,
        include_file_paths: Option<String>,
    ) -> RbResult<Self> {
        let row_index = row_index.map(|(name, offset)| RowIndex {
            name: name.into(),
            offset,
        });

        let hive_options = HiveOptions {
            enabled: hive_partitioning,
            hive_start_idx: 0,
            schema: hive_schema.map(|x| Arc::new(x.0)),
            try_parse_dates: try_parse_hive_dates,
        };

        let args = ScanArgsIpc {
            n_rows,
            cache,
            rechunk,
            row_index,
            cloud_options: None,
            hive_options,
            include_file_paths: include_file_paths.map(|x| x.into()),
        };

        let sources = sources.0;
        let (_first_path, sources) = match source {
            None => (sources.first_path().map(|p| p.to_path_buf()), sources),
            Some(source) => rbobject_to_first_path_and_scan_sources(source)?,
        };

        let lf = LazyFrame::scan_ipc_sources(sources, args).map_err(RbPolarsErr::from)?;
        Ok(lf.into())
    }

    /// Serialize the logical plan as JSON to a Ruby IO object.
    pub fn write_json(&self, rb_f: Value) -> RbResult<()> {
        let file = BufWriter::new(get_file_like(rb_f, true)?);
        serde_json::to_writer(file, &self.ldf.borrow().logical_plan)
            .map_err(|err| RbValueError::new_err(format!("{:?}", err)))?;
        Ok(())
    }

    /// Render the unoptimized logical plan as text.
    pub fn describe_plan(&self) -> RbResult<String> {
        self.ldf
            .borrow()
            .describe_plan()
            .map_err(RbPolarsErr::from)
            .map_err(Into::into)
    }

    /// Render the optimized logical plan as text.
    pub fn describe_optimized_plan(&self) -> RbResult<String> {
        let result = self
            .ldf
            .borrow()
            .describe_optimized_plan()
            .map_err(RbPolarsErr::from)?;
        Ok(result)
    }

    /// Toggle individual query-optimizer passes, returning a new frame.
    /// `_eager` is forwarded via the internal `_with_eager` hook.
    #[allow(clippy::too_many_arguments)]
    pub fn optimization_toggle(
        &self,
        type_coercion: bool,
        predicate_pushdown: bool,
        projection_pushdown: bool,
        simplify_expr: bool,
        slice_pushdown: bool,
        comm_subplan_elim: bool,
        comm_subexpr_elim: bool,
        allow_streaming: bool,
        _eager: bool,
    ) -> RbLazyFrame {
        let ldf = self.ldf.borrow().clone();
        let mut ldf = ldf
            .with_type_coercion(type_coercion)
            .with_predicate_pushdown(predicate_pushdown)
            .with_simplify_expr(simplify_expr)
            .with_slice_pushdown(slice_pushdown)
            .with_streaming(allow_streaming)
            ._with_eager(_eager)
            .with_projection_pushdown(projection_pushdown);

        ldf = ldf.with_comm_subplan_elim(comm_subplan_elim);
        ldf = ldf.with_comm_subexpr_elim(comm_subexpr_elim);

        ldf.into()
    }

    /// Sort by a single named column.
    pub fn sort(
        &self,
        by_column: String,
        descending: bool,
        nulls_last: bool,
        maintain_order: bool,
        multithreaded: bool,
    ) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.sort(
            [&by_column],
            SortMultipleOptions {
                descending: vec![descending],
                nulls_last: vec![nulls_last],
                multithreaded,
                maintain_order,
            },
        )
        .into()
    }

    /// Sort by one or more expressions; `descending`/`nulls_last` are
    /// per-expression flags (assumed same length as `by` — enforced upstream).
    pub fn sort_by_exprs(
        &self,
        by: RArray,
        descending: Vec<bool>,
        nulls_last: Vec<bool>,
        maintain_order: bool,
        multithreaded: bool,
    ) -> RbResult<Self> {
        let ldf = self.ldf.borrow().clone();
        let exprs = rb_exprs_to_exprs(by)?;
        Ok(ldf
            .sort_by_exprs(
                exprs,
                SortMultipleOptions {
                    descending,
                    nulls_last,
                    maintain_order,
                    multithreaded,
                },
            )
            .into())
    }

    /// Mark this plan for caching of its result.
    pub fn cache(&self) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.cache().into()
    }

    /// Execute the plan and materialize a DataFrame.
    pub fn collect(&self) -> RbResult<RbDataFrame> {
        let ldf = self.ldf.borrow().clone();
        let df = ldf.collect().map_err(RbPolarsErr::from)?;
        Ok(df.into())
    }

    /// Stream the query result to a Parquet file at `path`.
    #[allow(clippy::too_many_arguments)]
    pub fn sink_parquet(
        &self,
        path: PathBuf,
        compression: String,
        compression_level: Option<i32>,
        statistics: Wrap<StatisticsOptions>,
        row_group_size: Option<usize>,
        data_page_size: Option<usize>,
        maintain_order: bool,
    ) -> RbResult<()> {
        let compression = parse_parquet_compression(&compression, compression_level)?;

        let options = ParquetWriteOptions {
            compression,
            statistics: statistics.0,
            row_group_size,
            data_page_size,
            maintain_order,
        };

        let ldf = self.ldf.borrow().clone();
        ldf.sink_parquet(path, options).map_err(RbPolarsErr::from)?;
        Ok(())
    }

    /// Stream the query result to an Arrow IPC file at `path`.
    pub fn sink_ipc(
        &self,
        path: PathBuf,
        compression: Option<Wrap<IpcCompression>>,
        maintain_order: bool,
    ) -> RbResult<()> {
        let options = IpcWriterOptions {
            compression: compression.map(|c| c.0),
            maintain_order,
        };

        let ldf = self.ldf.borrow().clone();
        ldf.sink_ipc(path, options).map_err(RbPolarsErr::from)?;
        Ok(())
    }

    /// Stream the query result to a CSV file at `path`.
    #[allow(clippy::too_many_arguments)]
    pub fn sink_csv(
        &self,
        path: PathBuf,
        include_bom: bool,
        include_header: bool,
        separator: u8,
        line_terminator: String,
        quote_char: u8,
        batch_size: Wrap<NonZeroUsize>,
        datetime_format: Option<String>,
        date_format: Option<String>,
        time_format: Option<String>,
        float_scientific: Option<bool>,
        float_precision: Option<usize>,
        null_value: Option<String>,
        quote_style: Option<Wrap<QuoteStyle>>,
        maintain_order: bool,
    ) -> RbResult<()> {
        let quote_style = quote_style.map_or(QuoteStyle::default(), |wrap| wrap.0);
        // Fall back to polars' default null representation when none is given.
        let null_value = null_value.unwrap_or(SerializeOptions::default().null);

        let serialize_options = SerializeOptions {
            date_format,
            time_format,
            datetime_format,
            float_scientific,
            float_precision,
            separator,
            quote_char,
            null: null_value,
            line_terminator,
            quote_style,
        };

        let options = CsvWriterOptions {
            include_bom,
            include_header,
            maintain_order,
            batch_size: batch_size.0,
            serialize_options,
        };

        let ldf = self.ldf.borrow().clone();
        ldf.sink_csv(path, options).map_err(RbPolarsErr::from)?;
        Ok(())
    }

    /// Stream the query result to an NDJSON file at `path`.
    pub fn sink_json(&self, path: PathBuf, maintain_order: bool) -> RbResult<()> {
        let options = JsonWriterOptions { maintain_order };

        let ldf = self.ldf.borrow().clone();
        ldf.sink_json(path, options).map_err(RbPolarsErr::from)?;
        Ok(())
    }

    /// Collect a small preview of at most `n_rows` rows (debug helper).
    pub fn fetch(&self, n_rows: usize) -> RbResult<RbDataFrame> {
        let ldf = self.ldf.borrow().clone();
        let df = ldf.fetch(n_rows).map_err(RbPolarsErr::from)?;
        Ok(df.into())
    }

    /// Keep rows where `predicate` evaluates to true.
    pub fn filter(&self, predicate: &RbExpr) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.filter(predicate.inner.clone()).into()
    }

    /// Project to the given expressions.
    pub fn select(&self, exprs: RArray) -> RbResult<Self> {
        let ldf = self.ldf.borrow().clone();
        let exprs = rb_exprs_to_exprs(exprs)?;
        Ok(ldf.select(exprs).into())
    }

    /// Like `select`, but expressions are evaluated sequentially.
    pub fn select_seq(&self, exprs: RArray) -> RbResult<Self> {
        let ldf = self.ldf.borrow().clone();
        let exprs = rb_exprs_to_exprs(exprs)?;
        Ok(ldf.select_seq(exprs).into())
    }

    /// Group by expressions; `maintain_order` selects the stable variant.
    pub fn group_by(&self, by: RArray, maintain_order: bool) -> RbResult<RbLazyGroupBy> {
        let ldf = self.ldf.borrow().clone();
        let by = rb_exprs_to_exprs(by)?;
        let lazy_gb = if maintain_order {
            ldf.group_by_stable(by)
        } else {
            ldf.group_by(by)
        };
        Ok(RbLazyGroupBy {
            lgb: RefCell::new(Some(lazy_gb)),
        })
    }

    /// Rolling (temporal window) group-by over `index_column`.
    /// The options' `index_column` name is left empty because the column is
    /// passed as an expression.
    pub fn rolling(
        &self,
        index_column: &RbExpr,
        period: String,
        offset: String,
        closed: Wrap<ClosedWindow>,
        by: RArray,
    ) -> RbResult<RbLazyGroupBy> {
        let closed_window = closed.0;
        let ldf = self.ldf.borrow().clone();
        let by = rb_exprs_to_exprs(by)?;
        let lazy_gb = ldf.rolling(
            index_column.inner.clone(),
            by,
            RollingGroupOptions {
                index_column: "".into(),
                period: Duration::parse(&period),
                offset: Duration::parse(&offset),
                closed_window,
            },
        );

        Ok(RbLazyGroupBy {
            lgb: RefCell::new(Some(lazy_gb)),
        })
    }

    /// Dynamic (calendar-aware) group-by over `index_column`.
    #[allow(clippy::too_many_arguments)]
    pub fn group_by_dynamic(
        &self,
        index_column: &RbExpr,
        every: String,
        period: String,
        offset: String,
        label: Wrap<Label>,
        include_boundaries: bool,
        closed: Wrap<ClosedWindow>,
        by: RArray,
        start_by: Wrap<StartBy>,
    ) -> RbResult<RbLazyGroupBy> {
        let closed_window = closed.0;
        let by = rb_exprs_to_exprs(by)?;
        let ldf = self.ldf.borrow().clone();
        let lazy_gb = ldf.group_by_dynamic(
            index_column.inner.clone(),
            by,
            DynamicGroupOptions {
                every: Duration::parse(&every),
                period: Duration::parse(&period),
                offset: Duration::parse(&offset),
                label: label.0,
                include_boundaries,
                closed_window,
                start_by: start_by.0,
                ..Default::default()
            },
        );

        Ok(RbLazyGroupBy {
            lgb: RefCell::new(Some(lazy_gb)),
        })
    }

    /// Register other lazy frames as queryable contexts for this plan.
    pub fn with_context(&self, contexts: RArray) -> RbResult<Self> {
        let contexts = contexts.typecheck::<Obj<RbLazyFrame>>()?;
        let contexts = contexts
            .into_iter()
            .map(|ldf| ldf.ldf.borrow().clone())
            .collect::<Vec<_>>();
        Ok(self.ldf.borrow().clone().with_context(contexts).into())
    }

    /// As-of join against `other`. `coalesce` maps a plain bool onto
    /// polars' `JoinCoalesce` enum (no "join-specific" default here).
    #[allow(clippy::too_many_arguments)]
    pub fn join_asof(
        &self,
        other: &RbLazyFrame,
        left_on: &RbExpr,
        right_on: &RbExpr,
        left_by: Option<Vec<String>>,
        right_by: Option<Vec<String>>,
        allow_parallel: bool,
        force_parallel: bool,
        suffix: String,
        strategy: Wrap<AsofStrategy>,
        tolerance: Option<Wrap<AnyValue<'_>>>,
        tolerance_str: Option<String>,
        coalesce: bool,
    ) -> RbResult<Self> {
        let coalesce = if coalesce {
            JoinCoalesce::CoalesceColumns
        } else {
            JoinCoalesce::KeepColumns
        };
        let ldf = self.ldf.borrow().clone();
        let other = other.ldf.borrow().clone();
        let left_on = left_on.inner.clone();
        let right_on = right_on.inner.clone();
        Ok(ldf
            .join_builder()
            .with(other)
            .left_on([left_on])
            .right_on([right_on])
            .allow_parallel(allow_parallel)
            .force_parallel(force_parallel)
            .coalesce(coalesce)
            .how(JoinType::AsOf(AsOfOptions {
                strategy: strategy.0,
                left_by: left_by.map(strings_to_pl_smallstr),
                right_by: right_by.map(strings_to_pl_smallstr),
                // into_static detaches the tolerance value from the Ruby lifetime
                tolerance: tolerance.map(|t| t.0.into_static()),
                tolerance_str: tolerance_str.map(|s| s.into()),
            }))
            .suffix(suffix)
            .finish()
            .into())
    }

    /// General join against `other`. A `None` coalesce defers the decision
    /// to the join type's default behavior.
    #[allow(clippy::too_many_arguments)]
    pub fn join(
        &self,
        other: &RbLazyFrame,
        left_on: RArray,
        right_on: RArray,
        allow_parallel: bool,
        force_parallel: bool,
        join_nulls: bool,
        how: Wrap<JoinType>,
        suffix: String,
        validate: Wrap<JoinValidation>,
        coalesce: Option<bool>,
    ) -> RbResult<Self> {
        let coalesce = match coalesce {
            None => JoinCoalesce::JoinSpecific,
            Some(true) => JoinCoalesce::CoalesceColumns,
            Some(false) => JoinCoalesce::KeepColumns,
        };
        let ldf = self.ldf.borrow().clone();
        let other = other.ldf.borrow().clone();
        let left_on = rb_exprs_to_exprs(left_on)?;
        let right_on = rb_exprs_to_exprs(right_on)?;

        Ok(ldf
            .join_builder()
            .with(other)
            .left_on(left_on)
            .right_on(right_on)
            .allow_parallel(allow_parallel)
            .force_parallel(force_parallel)
            .join_nulls(join_nulls)
            .how(how.0)
            .validate(validate.0)
            .coalesce(coalesce)
            .suffix(suffix)
            .finish()
            .into())
    }

    /// Add or replace a single column.
    pub fn with_column(&self, expr: &RbExpr) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.with_column(expr.inner.clone()).into()
    }

    /// Add or replace multiple columns.
    pub fn with_columns(&self, exprs: RArray) -> RbResult<Self> {
        let ldf = self.ldf.borrow().clone();
        Ok(ldf.with_columns(rb_exprs_to_exprs(exprs)?).into())
    }

    /// Like `with_columns`, but expressions are evaluated sequentially.
    pub fn with_columns_seq(&self, exprs: RArray) -> RbResult<Self> {
        let ldf = self.ldf.borrow().clone();
        Ok(ldf.with_columns_seq(rb_exprs_to_exprs(exprs)?).into())
    }

    /// Rename columns pairwise (`existing[i]` → `new[i]`); `strict` controls
    /// whether missing source columns are an error.
    pub fn rename(&self, existing: Vec<String>, new: Vec<String>, strict: bool) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.rename(existing, new, strict).into()
    }

    /// Reverse row order.
    pub fn reverse(&self) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.reverse().into()
    }

    /// Shift rows by `n`; optionally fill the vacated slots.
    pub fn shift(&self, n: &RbExpr, fill_value: Option<&RbExpr>) -> Self {
        let lf = self.ldf.borrow().clone();
        let out = match fill_value {
            Some(v) => lf.shift_and_fill(n.inner.clone(), v.inner.clone()),
            None => lf.shift(n.inner.clone()),
        };
        out.into()
    }

    /// Replace NaN values in float columns.
    pub fn fill_nan(&self, fill_value: &RbExpr) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.fill_nan(fill_value.inner.clone()).into()
    }

    /// Aggregate every column to its minimum.
    pub fn min(&self) -> Self {
        let ldf = self.ldf.borrow().clone();
        let out = ldf.min();
        out.into()
    }

    /// Aggregate every column to its maximum.
    pub fn max(&self) -> Self {
        let ldf = self.ldf.borrow().clone();
        let out = ldf.max();
        out.into()
    }

    /// Aggregate every column to its sum.
    pub fn sum(&self) -> Self {
        let ldf = self.ldf.borrow().clone();
        let out = ldf.sum();
        out.into()
    }

    /// Aggregate every column to its mean.
    pub fn mean(&self) -> Self {
        let ldf = self.ldf.borrow().clone();
        let out = ldf.mean();
        out.into()
    }

    /// Aggregate every column to its standard deviation (`ddof` delta
    /// degrees of freedom).
    pub fn std(&self, ddof: u8) -> Self {
        let ldf = self.ldf.borrow().clone();
        let out = ldf.std(ddof);
        out.into()
    }

    /// Aggregate every column to its variance (`ddof` delta degrees of freedom).
    pub fn var(&self, ddof: u8) -> Self {
        let ldf = self.ldf.borrow().clone();
        let out = ldf.var(ddof);
        out.into()
    }

    /// Aggregate every column to its median.
    pub fn median(&self) -> Self {
        let ldf = self.ldf.borrow().clone();
        let out = ldf.median();
        out.into()
    }

    /// Aggregate every column to the given quantile.
    pub fn quantile(&self, quantile: &RbExpr, interpolation: Wrap<QuantileMethod>) -> Self {
        let ldf = self.ldf.borrow().clone();
        let out = ldf.quantile(quantile.inner.clone(), interpolation.0);
        out.into()
    }

    /// Explode list columns selected by `column` into one row per element.
    pub fn explode(&self, column: RArray) -> RbResult<Self> {
        let ldf = self.ldf.borrow().clone();
        let column = rb_exprs_to_exprs(column)?;
        Ok(ldf.explode(column).into())
    }

    /// Count nulls per column.
    pub fn null_count(&self) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.null_count().into()
    }

    /// Drop duplicate rows, optionally considering only `subset` columns.
    pub fn unique(
        &self,
        maintain_order: bool,
        subset: Option<Vec<String>>,
        keep: Wrap<UniqueKeepStrategy>,
    ) -> RbResult<Self> {
        let ldf = self.ldf.borrow().clone();
        Ok(match maintain_order {
            true => ldf.unique_stable_generic(subset, keep.0),
            false => ldf.unique_generic(subset, keep.0),
        }
        .into())
    }

    /// Drop rows with nulls, optionally checking only `subset` columns.
    pub fn drop_nulls(&self, subset: Option<Vec<String>>) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.drop_nulls(subset.map(|v| v.into_iter().map(|s| col(&s)).collect()))
            .into()
    }

    /// Slice `len` rows starting at `offset` (negative offsets count from the
    /// end); `None` length means "to the end".
    pub fn slice(&self, offset: i64, len: Option<IdxSize>) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.slice(offset, len.unwrap_or(IdxSize::MAX)).into()
    }

    /// Keep only the last `n` rows.
    pub fn tail(&self, n: IdxSize) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.tail(n).into()
    }

    /// Unpivot (melt) `on` columns into long format keyed by `index` columns.
    pub fn unpivot(
        &self,
        on: RArray,
        index: RArray,
        value_name: Option<String>,
        variable_name: Option<String>,
    ) -> RbResult<Self> {
        let on = rb_exprs_to_exprs(on)?;
        let index = rb_exprs_to_exprs(index)?;
        let args = UnpivotArgsDSL {
            on: on.into_iter().map(|e| e.into()).collect(),
            index: index.into_iter().map(|e| e.into()).collect(),
            value_name: value_name.map(|s| s.into()),
            variable_name: variable_name.map(|s| s.into()),
        };

        let ldf = self.ldf.borrow().clone();
        Ok(ldf.unpivot(args).into())
    }

    /// Prepend a row-index column named `name`, starting at `offset`.
    pub fn with_row_index(&self, name: String, offset: Option<IdxSize>) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.with_row_index(&name, offset).into()
    }

    /// Drop the named columns.
    pub fn drop(&self, cols: Vec<String>) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.drop(cols).into()
    }

    /// Cast named columns to new dtypes taken from a Ruby hash
    /// (`name => dtype`); `strict` controls overflow/invalid-cast errors.
    pub fn cast(&self, rb_dtypes: RHash, strict: bool) -> RbResult<Self> {
        // Collect the hash into owned pairs first, then build the borrow-keyed
        // map polars expects.
        let mut dtypes = Vec::new();
        rb_dtypes.foreach(|k: String, v: Wrap<DataType>| {
            dtypes.push((k, v.0));
            Ok(ForEach::Continue)
        })?;
        let mut cast_map = PlHashMap::with_capacity(dtypes.len());
        cast_map.extend(dtypes.iter().map(|(k, v)| (k.as_ref(), v.clone())));
        Ok(self.ldf.borrow().clone().cast(cast_map, strict).into())
    }

    /// Cast every column to a single dtype.
    pub fn cast_all(&self, dtype: Wrap<DataType>, strict: bool) -> Self {
        self.ldf.borrow().clone().cast_all(dtype.0, strict).into()
    }

    /// Cheap clone of the plan (the underlying LazyFrame is cloned).
    pub fn clone(&self) -> Self {
        self.ldf.borrow().clone().into()
    }

    /// Resolve and return the schema as a Ruby hash of name => dtype.
    /// Needs `borrow_mut` because schema resolution may cache on the frame.
    pub fn collect_schema(&self) -> RbResult<RHash> {
        let schema = self
            .ldf
            .borrow_mut()
            .collect_schema()
            .map_err(RbPolarsErr::from)?;

        let schema_dict = RHash::new();
        schema.iter_fields().for_each(|fld| {
            schema_dict
                .aset::<String, Value>(
                    fld.name().to_string(),
                    Wrap(fld.dtype().clone()).into_value(),
                )
                // aset only fails on a frozen hash; this one is freshly created
                .unwrap();
        });
        Ok(schema_dict)
    }

    /// Flatten struct columns into their fields.
    pub fn unnest(&self, cols: Vec<String>) -> Self {
        self.ldf.borrow().clone().unnest(cols).into()
    }

    /// Count rows.
    pub fn count(&self) -> Self {
        let ldf = self.ldf.borrow().clone();
        ldf.count().into()
    }

    /// Merge two frames that are each sorted by `key`, preserving sortedness.
    pub fn merge_sorted(&self, other: &Self, key: String) -> RbResult<Self> {
        let out = self
            .ldf
            .borrow()
            .clone()
            .merge_sorted(other.ldf.borrow().clone(), &key)
            .map_err(RbPolarsErr::from)?;
        Ok(out.into())
    }
}