polars-df 0.14.0 → 0.16.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (87):
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +35 -0
  3. data/Cargo.lock +1523 -378
  4. data/LICENSE.txt +1 -0
  5. data/README.md +38 -4
  6. data/ext/polars/Cargo.toml +15 -5
  7. data/ext/polars/src/batched_csv.rs +7 -10
  8. data/ext/polars/src/conversion/any_value.rs +31 -21
  9. data/ext/polars/src/conversion/mod.rs +155 -48
  10. data/ext/polars/src/dataframe/construction.rs +0 -3
  11. data/ext/polars/src/dataframe/export.rs +9 -2
  12. data/ext/polars/src/dataframe/general.rs +15 -57
  13. data/ext/polars/src/dataframe/io.rs +77 -169
  14. data/ext/polars/src/dataframe/mod.rs +1 -0
  15. data/ext/polars/src/dataframe/serde.rs +15 -0
  16. data/ext/polars/src/error.rs +31 -48
  17. data/ext/polars/src/exceptions.rs +24 -0
  18. data/ext/polars/src/expr/binary.rs +4 -42
  19. data/ext/polars/src/expr/datetime.rs +5 -4
  20. data/ext/polars/src/expr/general.rs +16 -22
  21. data/ext/polars/src/expr/list.rs +18 -11
  22. data/ext/polars/src/expr/meta.rs +6 -2
  23. data/ext/polars/src/expr/rolling.rs +6 -7
  24. data/ext/polars/src/expr/string.rs +9 -36
  25. data/ext/polars/src/file.rs +78 -23
  26. data/ext/polars/src/functions/aggregation.rs +4 -4
  27. data/ext/polars/src/functions/business.rs +15 -0
  28. data/ext/polars/src/functions/io.rs +34 -13
  29. data/ext/polars/src/functions/lazy.rs +22 -12
  30. data/ext/polars/src/functions/meta.rs +1 -1
  31. data/ext/polars/src/functions/mod.rs +1 -0
  32. data/ext/polars/src/interop/arrow/mod.rs +1 -0
  33. data/ext/polars/src/interop/arrow/to_ruby.rs +83 -0
  34. data/ext/polars/src/interop/mod.rs +1 -0
  35. data/ext/polars/src/lazyframe/general.rs +920 -0
  36. data/ext/polars/src/lazyframe/mod.rs +3 -827
  37. data/ext/polars/src/lazyframe/serde.rs +31 -0
  38. data/ext/polars/src/lib.rs +54 -27
  39. data/ext/polars/src/map/dataframe.rs +10 -6
  40. data/ext/polars/src/map/lazy.rs +65 -4
  41. data/ext/polars/src/map/mod.rs +9 -8
  42. data/ext/polars/src/on_startup.rs +1 -1
  43. data/ext/polars/src/series/aggregation.rs +1 -5
  44. data/ext/polars/src/series/arithmetic.rs +10 -10
  45. data/ext/polars/src/series/construction.rs +2 -2
  46. data/ext/polars/src/series/export.rs +1 -1
  47. data/ext/polars/src/series/general.rs +631 -0
  48. data/ext/polars/src/series/import.rs +55 -0
  49. data/ext/polars/src/series/mod.rs +11 -638
  50. data/ext/polars/src/series/scatter.rs +2 -2
  51. data/ext/polars/src/utils.rs +0 -20
  52. data/lib/polars/batched_csv_reader.rb +0 -2
  53. data/lib/polars/binary_expr.rb +133 -9
  54. data/lib/polars/binary_name_space.rb +101 -6
  55. data/lib/polars/config.rb +4 -0
  56. data/lib/polars/data_frame.rb +452 -101
  57. data/lib/polars/data_type_group.rb +28 -0
  58. data/lib/polars/data_types.rb +3 -1
  59. data/lib/polars/date_time_expr.rb +244 -0
  60. data/lib/polars/date_time_name_space.rb +87 -0
  61. data/lib/polars/expr.rb +103 -2
  62. data/lib/polars/functions/aggregation/horizontal.rb +10 -4
  63. data/lib/polars/functions/as_datatype.rb +51 -2
  64. data/lib/polars/functions/col.rb +1 -1
  65. data/lib/polars/functions/eager.rb +1 -3
  66. data/lib/polars/functions/lazy.rb +95 -13
  67. data/lib/polars/functions/range/time_range.rb +21 -21
  68. data/lib/polars/io/csv.rb +14 -16
  69. data/lib/polars/io/database.rb +2 -2
  70. data/lib/polars/io/delta.rb +126 -0
  71. data/lib/polars/io/ipc.rb +14 -4
  72. data/lib/polars/io/ndjson.rb +10 -0
  73. data/lib/polars/io/parquet.rb +168 -111
  74. data/lib/polars/lazy_frame.rb +684 -20
  75. data/lib/polars/list_name_space.rb +169 -0
  76. data/lib/polars/selectors.rb +1226 -0
  77. data/lib/polars/series.rb +465 -35
  78. data/lib/polars/string_cache.rb +27 -1
  79. data/lib/polars/string_expr.rb +0 -1
  80. data/lib/polars/string_name_space.rb +73 -3
  81. data/lib/polars/struct_name_space.rb +31 -7
  82. data/lib/polars/utils/various.rb +5 -1
  83. data/lib/polars/utils.rb +45 -10
  84. data/lib/polars/version.rb +1 -1
  85. data/lib/polars.rb +17 -1
  86. metadata +16 -9
  87. data/lib/polars/functions.rb +0 -57
@@ -0,0 +1,920 @@
1
+ use magnus::{r_hash::ForEach, typed_data::Obj, IntoValue, RArray, RHash, TryConvert, Value};
2
+ use polars::io::{HiveOptions, RowIndex};
3
+ use polars::lazy::frame::LazyFrame;
4
+ use polars::prelude::*;
5
+ use polars_plan::plans::ScanSources;
6
+ use std::cell::RefCell;
7
+ use std::io::BufWriter;
8
+ use std::num::NonZeroUsize;
9
+ use std::path::PathBuf;
10
+
11
+ use crate::conversion::*;
12
+ use crate::expr::rb_exprs_to_exprs;
13
+ use crate::file::get_file_like;
14
+ use crate::{RbDataFrame, RbExpr, RbLazyFrame, RbLazyGroupBy, RbPolarsErr, RbResult, RbValueError};
15
+
16
+ fn rbobject_to_first_path_and_scan_sources(obj: Value) -> RbResult<(Option<PathBuf>, ScanSources)> {
17
+ use crate::file::{get_ruby_scan_source_input, RubyScanSourceInput};
18
+ Ok(match get_ruby_scan_source_input(obj, false)? {
19
+ RubyScanSourceInput::Path(path) => (Some(path.clone()), ScanSources::Paths([path].into())),
20
+ RubyScanSourceInput::File(file) => (None, ScanSources::Files([file].into())),
21
+ RubyScanSourceInput::Buffer(buff) => (None, ScanSources::Buffers([buff].into())),
22
+ })
23
+ }
24
+
25
+ impl RbLazyFrame {
26
+ #[allow(clippy::too_many_arguments)]
27
+ pub fn new_from_ndjson(
28
+ source: Option<Value>,
29
+ sources: Wrap<ScanSources>,
30
+ infer_schema_length: Option<usize>,
31
+ batch_size: Option<Wrap<NonZeroUsize>>,
32
+ n_rows: Option<usize>,
33
+ low_memory: bool,
34
+ rechunk: bool,
35
+ row_index: Option<(String, IdxSize)>,
36
+ ) -> RbResult<Self> {
37
+ let batch_size = batch_size.map(|v| v.0);
38
+ let row_index = row_index.map(|(name, offset)| RowIndex {
39
+ name: name.into(),
40
+ offset,
41
+ });
42
+
43
+ let sources = sources.0;
44
+ let (_first_path, sources) = match source {
45
+ None => (sources.first_path().map(|p| p.to_path_buf()), sources),
46
+ Some(source) => rbobject_to_first_path_and_scan_sources(source)?,
47
+ };
48
+
49
+ let r = LazyJsonLineReader::new_with_sources(sources);
50
+
51
+ let lf = r
52
+ .with_infer_schema_length(infer_schema_length.and_then(NonZeroUsize::new))
53
+ .with_batch_size(batch_size)
54
+ .with_n_rows(n_rows)
55
+ .low_memory(low_memory)
56
+ .with_rechunk(rechunk)
57
+ // .with_schema(schema.map(|schema| Arc::new(schema.0)))
58
+ // .with_schema_overwrite(schema_overrides.map(|x| Arc::new(x.0)))
59
+ .with_row_index(row_index)
60
+ // .with_ignore_errors(ignore_errors)
61
+ // .with_include_file_paths(include_file_paths.map(|x| x.into()))
62
+ .finish()
63
+ .map_err(RbPolarsErr::from)?;
64
+
65
+ Ok(lf.into())
66
+ }
67
+
68
+ pub fn new_from_csv(arguments: &[Value]) -> RbResult<Self> {
69
+ // start arguments
70
+ // this pattern is needed for more than 16
71
+ let source = Option::<Value>::try_convert(arguments[0])?;
72
+ let sources = Wrap::<ScanSources>::try_convert(arguments[21])?;
73
+ let separator = String::try_convert(arguments[1])?;
74
+ let has_header = bool::try_convert(arguments[2])?;
75
+ let ignore_errors = bool::try_convert(arguments[3])?;
76
+ let skip_rows = usize::try_convert(arguments[4])?;
77
+ let n_rows = Option::<usize>::try_convert(arguments[5])?;
78
+ let cache = bool::try_convert(arguments[6])?;
79
+ let overwrite_dtype = Option::<Vec<(String, Wrap<DataType>)>>::try_convert(arguments[7])?;
80
+ let low_memory = bool::try_convert(arguments[8])?;
81
+ let comment_prefix = Option::<String>::try_convert(arguments[9])?;
82
+ let quote_char = Option::<String>::try_convert(arguments[10])?;
83
+ let null_values = Option::<Wrap<NullValues>>::try_convert(arguments[11])?;
84
+ let infer_schema_length = Option::<usize>::try_convert(arguments[12])?;
85
+ let with_schema_modify = Option::<Value>::try_convert(arguments[13])?;
86
+ let rechunk = bool::try_convert(arguments[14])?;
87
+ let skip_rows_after_header = usize::try_convert(arguments[15])?;
88
+ let encoding = Wrap::<CsvEncoding>::try_convert(arguments[16])?;
89
+ let row_index = Option::<(String, IdxSize)>::try_convert(arguments[17])?;
90
+ let try_parse_dates = bool::try_convert(arguments[18])?;
91
+ let eol_char = String::try_convert(arguments[19])?;
92
+ let truncate_ragged_lines = bool::try_convert(arguments[20])?;
93
+ // end arguments
94
+
95
+ let null_values = null_values.map(|w| w.0);
96
+ let quote_char = quote_char.map(|s| s.as_bytes()[0]);
97
+ let separator = separator.as_bytes()[0];
98
+ let eol_char = eol_char.as_bytes()[0];
99
+ let row_index = row_index.map(|(name, offset)| RowIndex {
100
+ name: name.into(),
101
+ offset,
102
+ });
103
+
104
+ let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
105
+ overwrite_dtype
106
+ .into_iter()
107
+ .map(|(name, dtype)| Field::new((&*name).into(), dtype.0))
108
+ .collect::<Schema>()
109
+ });
110
+
111
+ let sources = sources.0;
112
+ let (_first_path, sources) = match source {
113
+ None => (sources.first_path().map(|p| p.to_path_buf()), sources),
114
+ Some(source) => rbobject_to_first_path_and_scan_sources(source)?,
115
+ };
116
+
117
+ let r = LazyCsvReader::new_with_sources(sources);
118
+
119
+ let r = r
120
+ .with_infer_schema_length(infer_schema_length)
121
+ .with_separator(separator)
122
+ .with_has_header(has_header)
123
+ .with_ignore_errors(ignore_errors)
124
+ .with_skip_rows(skip_rows)
125
+ .with_n_rows(n_rows)
126
+ .with_cache(cache)
127
+ .with_dtype_overwrite(overwrite_dtype.map(Arc::new))
128
+ // TODO add with_schema
129
+ .with_low_memory(low_memory)
130
+ .with_comment_prefix(comment_prefix.map(|x| x.into()))
131
+ .with_quote_char(quote_char)
132
+ .with_eol_char(eol_char)
133
+ .with_rechunk(rechunk)
134
+ .with_skip_rows_after_header(skip_rows_after_header)
135
+ .with_encoding(encoding.0)
136
+ .with_row_index(row_index)
137
+ .with_try_parse_dates(try_parse_dates)
138
+ .with_null_values(null_values)
139
+ // TODO add with_missing_is_null
140
+ .with_truncate_ragged_lines(truncate_ragged_lines);
141
+
142
+ if let Some(_lambda) = with_schema_modify {
143
+ todo!();
144
+ }
145
+
146
+ Ok(r.finish().map_err(RbPolarsErr::from)?.into())
147
+ }
148
+
149
+ pub fn new_from_parquet(arguments: &[Value]) -> RbResult<Self> {
150
+ let source = Option::<Value>::try_convert(arguments[0])?;
151
+ let sources = Wrap::<ScanSources>::try_convert(arguments[1])?;
152
+ let n_rows = Option::<usize>::try_convert(arguments[2])?;
153
+ let cache = bool::try_convert(arguments[3])?;
154
+ let parallel = Wrap::<ParallelStrategy>::try_convert(arguments[4])?;
155
+ let rechunk = bool::try_convert(arguments[5])?;
156
+ let row_index = Option::<(String, IdxSize)>::try_convert(arguments[6])?;
157
+ let low_memory = bool::try_convert(arguments[7])?;
158
+ let cloud_options = Option::<Vec<(String, String)>>::try_convert(arguments[8])?;
159
+ let _credential_provider = Option::<Value>::try_convert(arguments[9])?;
160
+ let use_statistics = bool::try_convert(arguments[10])?;
161
+ let hive_partitioning = Option::<bool>::try_convert(arguments[11])?;
162
+ let schema = Option::<Wrap<Schema>>::try_convert(arguments[12])?;
163
+ let hive_schema = Option::<Wrap<Schema>>::try_convert(arguments[13])?;
164
+ let try_parse_hive_dates = bool::try_convert(arguments[14])?;
165
+ let retries = usize::try_convert(arguments[15])?;
166
+ let glob = bool::try_convert(arguments[16])?;
167
+ let include_file_paths = Option::<String>::try_convert(arguments[17])?;
168
+ let allow_missing_columns = bool::try_convert(arguments[18])?;
169
+
170
+ let parallel = parallel.0;
171
+ let hive_schema = hive_schema.map(|s| Arc::new(s.0));
172
+
173
+ let row_index = row_index.map(|(name, offset)| RowIndex {
174
+ name: name.into(),
175
+ offset,
176
+ });
177
+
178
+ let hive_options = HiveOptions {
179
+ enabled: hive_partitioning,
180
+ hive_start_idx: 0,
181
+ schema: hive_schema,
182
+ try_parse_dates: try_parse_hive_dates,
183
+ };
184
+
185
+ let mut args = ScanArgsParquet {
186
+ n_rows,
187
+ cache,
188
+ parallel,
189
+ rechunk,
190
+ row_index,
191
+ low_memory,
192
+ cloud_options: None,
193
+ use_statistics,
194
+ schema: schema.map(|x| Arc::new(x.0)),
195
+ hive_options,
196
+ glob,
197
+ include_file_paths: include_file_paths.map(|x| x.into()),
198
+ allow_missing_columns,
199
+ };
200
+
201
+ let sources = sources.0;
202
+ let (first_path, sources) = match source {
203
+ None => (sources.first_path().map(|p| p.to_path_buf()), sources),
204
+ Some(source) => rbobject_to_first_path_and_scan_sources(source)?,
205
+ };
206
+
207
+ if let Some(first_path) = first_path {
208
+ let first_path_url = first_path.to_string_lossy();
209
+ let cloud_options =
210
+ parse_cloud_options(&first_path_url, cloud_options.unwrap_or_default())?;
211
+ args.cloud_options = Some(cloud_options.with_max_retries(retries));
212
+ }
213
+
214
+ let lf = LazyFrame::scan_parquet_sources(sources, args).map_err(RbPolarsErr::from)?;
215
+
216
+ Ok(lf.into())
217
+ }
218
+
219
+ #[allow(clippy::too_many_arguments)]
220
+ pub fn new_from_ipc(
221
+ source: Option<Value>,
222
+ sources: Wrap<ScanSources>,
223
+ n_rows: Option<usize>,
224
+ cache: bool,
225
+ rechunk: bool,
226
+ row_index: Option<(String, IdxSize)>,
227
+ hive_partitioning: Option<bool>,
228
+ hive_schema: Option<Wrap<Schema>>,
229
+ try_parse_hive_dates: bool,
230
+ include_file_paths: Option<String>,
231
+ ) -> RbResult<Self> {
232
+ let row_index = row_index.map(|(name, offset)| RowIndex {
233
+ name: name.into(),
234
+ offset,
235
+ });
236
+
237
+ let hive_options = HiveOptions {
238
+ enabled: hive_partitioning,
239
+ hive_start_idx: 0,
240
+ schema: hive_schema.map(|x| Arc::new(x.0)),
241
+ try_parse_dates: try_parse_hive_dates,
242
+ };
243
+
244
+ let args = ScanArgsIpc {
245
+ n_rows,
246
+ cache,
247
+ rechunk,
248
+ row_index,
249
+ cloud_options: None,
250
+ hive_options,
251
+ include_file_paths: include_file_paths.map(|x| x.into()),
252
+ };
253
+
254
+ let sources = sources.0;
255
+ let (_first_path, sources) = match source {
256
+ None => (sources.first_path().map(|p| p.to_path_buf()), sources),
257
+ Some(source) => rbobject_to_first_path_and_scan_sources(source)?,
258
+ };
259
+
260
+ let lf = LazyFrame::scan_ipc_sources(sources, args).map_err(RbPolarsErr::from)?;
261
+ Ok(lf.into())
262
+ }
263
+
264
+ pub fn write_json(&self, rb_f: Value) -> RbResult<()> {
265
+ let file = BufWriter::new(get_file_like(rb_f, true)?);
266
+ serde_json::to_writer(file, &self.ldf.borrow().logical_plan)
267
+ .map_err(|err| RbValueError::new_err(format!("{:?}", err)))?;
268
+ Ok(())
269
+ }
270
+
271
+ pub fn describe_plan(&self) -> RbResult<String> {
272
+ self.ldf
273
+ .borrow()
274
+ .describe_plan()
275
+ .map_err(RbPolarsErr::from)
276
+ .map_err(Into::into)
277
+ }
278
+
279
+ pub fn describe_optimized_plan(&self) -> RbResult<String> {
280
+ let result = self
281
+ .ldf
282
+ .borrow()
283
+ .describe_optimized_plan()
284
+ .map_err(RbPolarsErr::from)?;
285
+ Ok(result)
286
+ }
287
+
288
+ #[allow(clippy::too_many_arguments)]
289
+ pub fn optimization_toggle(
290
+ &self,
291
+ type_coercion: bool,
292
+ predicate_pushdown: bool,
293
+ projection_pushdown: bool,
294
+ simplify_expr: bool,
295
+ slice_pushdown: bool,
296
+ comm_subplan_elim: bool,
297
+ comm_subexpr_elim: bool,
298
+ allow_streaming: bool,
299
+ _eager: bool,
300
+ ) -> RbLazyFrame {
301
+ let ldf = self.ldf.borrow().clone();
302
+ let mut ldf = ldf
303
+ .with_type_coercion(type_coercion)
304
+ .with_predicate_pushdown(predicate_pushdown)
305
+ .with_simplify_expr(simplify_expr)
306
+ .with_slice_pushdown(slice_pushdown)
307
+ .with_streaming(allow_streaming)
308
+ ._with_eager(_eager)
309
+ .with_projection_pushdown(projection_pushdown);
310
+
311
+ ldf = ldf.with_comm_subplan_elim(comm_subplan_elim);
312
+ ldf = ldf.with_comm_subexpr_elim(comm_subexpr_elim);
313
+
314
+ ldf.into()
315
+ }
316
+
317
+ pub fn sort(
318
+ &self,
319
+ by_column: String,
320
+ descending: bool,
321
+ nulls_last: bool,
322
+ maintain_order: bool,
323
+ multithreaded: bool,
324
+ ) -> Self {
325
+ let ldf = self.ldf.borrow().clone();
326
+ ldf.sort(
327
+ [&by_column],
328
+ SortMultipleOptions {
329
+ descending: vec![descending],
330
+ nulls_last: vec![nulls_last],
331
+ multithreaded,
332
+ maintain_order,
333
+ limit: None,
334
+ },
335
+ )
336
+ .into()
337
+ }
338
+
339
+ pub fn sort_by_exprs(
340
+ &self,
341
+ by: RArray,
342
+ descending: Vec<bool>,
343
+ nulls_last: Vec<bool>,
344
+ maintain_order: bool,
345
+ multithreaded: bool,
346
+ ) -> RbResult<Self> {
347
+ let ldf = self.ldf.borrow().clone();
348
+ let exprs = rb_exprs_to_exprs(by)?;
349
+ Ok(ldf
350
+ .sort_by_exprs(
351
+ exprs,
352
+ SortMultipleOptions {
353
+ descending,
354
+ nulls_last,
355
+ maintain_order,
356
+ multithreaded,
357
+ limit: None,
358
+ },
359
+ )
360
+ .into())
361
+ }
362
+
363
+ pub fn cache(&self) -> Self {
364
+ let ldf = self.ldf.borrow().clone();
365
+ ldf.cache().into()
366
+ }
367
+
368
+ pub fn collect(&self) -> RbResult<RbDataFrame> {
369
+ let ldf = self.ldf.borrow().clone();
370
+ let df = ldf.collect().map_err(RbPolarsErr::from)?;
371
+ Ok(df.into())
372
+ }
373
+
374
+ #[allow(clippy::too_many_arguments)]
375
+ pub fn sink_parquet(
376
+ &self,
377
+ path: PathBuf,
378
+ compression: String,
379
+ compression_level: Option<i32>,
380
+ statistics: Wrap<StatisticsOptions>,
381
+ row_group_size: Option<usize>,
382
+ data_page_size: Option<usize>,
383
+ maintain_order: bool,
384
+ cloud_options: Option<Vec<(String, String)>>,
385
+ retries: usize,
386
+ ) -> RbResult<()> {
387
+ let compression = parse_parquet_compression(&compression, compression_level)?;
388
+
389
+ let options = ParquetWriteOptions {
390
+ compression,
391
+ statistics: statistics.0,
392
+ row_group_size,
393
+ data_page_size,
394
+ maintain_order,
395
+ };
396
+
397
+ let cloud_options = {
398
+ let cloud_options =
399
+ parse_cloud_options(path.to_str().unwrap(), cloud_options.unwrap_or_default())?;
400
+ Some(cloud_options.with_max_retries(retries))
401
+ };
402
+
403
+ let ldf = self.ldf.borrow().clone();
404
+ ldf.sink_parquet(&path, options, cloud_options)
405
+ .map_err(RbPolarsErr::from)?;
406
+ Ok(())
407
+ }
408
+
409
+ pub fn sink_ipc(
410
+ &self,
411
+ path: PathBuf,
412
+ compression: Option<Wrap<IpcCompression>>,
413
+ maintain_order: bool,
414
+ cloud_options: Option<Vec<(String, String)>>,
415
+ retries: usize,
416
+ ) -> RbResult<()> {
417
+ let options = IpcWriterOptions {
418
+ compression: compression.map(|c| c.0),
419
+ maintain_order,
420
+ };
421
+
422
+ let cloud_options = {
423
+ let cloud_options =
424
+ parse_cloud_options(path.to_str().unwrap(), cloud_options.unwrap_or_default())?;
425
+ Some(cloud_options.with_max_retries(retries))
426
+ };
427
+
428
+ let ldf = self.ldf.borrow().clone();
429
+ ldf.sink_ipc(&path, options, cloud_options)
430
+ .map_err(RbPolarsErr::from)?;
431
+ Ok(())
432
+ }
433
+
434
+ #[allow(clippy::too_many_arguments)]
435
+ pub fn sink_csv(
436
+ &self,
437
+ path: PathBuf,
438
+ include_bom: bool,
439
+ include_header: bool,
440
+ separator: u8,
441
+ line_terminator: String,
442
+ quote_char: u8,
443
+ batch_size: Wrap<NonZeroUsize>,
444
+ datetime_format: Option<String>,
445
+ date_format: Option<String>,
446
+ time_format: Option<String>,
447
+ float_scientific: Option<bool>,
448
+ float_precision: Option<usize>,
449
+ null_value: Option<String>,
450
+ quote_style: Option<Wrap<QuoteStyle>>,
451
+ maintain_order: bool,
452
+ ) -> RbResult<()> {
453
+ // TODO
454
+ let cloud_options = None;
455
+
456
+ let quote_style = quote_style.map_or(QuoteStyle::default(), |wrap| wrap.0);
457
+ let null_value = null_value.unwrap_or(SerializeOptions::default().null);
458
+
459
+ let serialize_options = SerializeOptions {
460
+ date_format,
461
+ time_format,
462
+ datetime_format,
463
+ float_scientific,
464
+ float_precision,
465
+ separator,
466
+ quote_char,
467
+ null: null_value,
468
+ line_terminator,
469
+ quote_style,
470
+ };
471
+
472
+ let options = CsvWriterOptions {
473
+ include_bom,
474
+ include_header,
475
+ maintain_order,
476
+ batch_size: batch_size.0,
477
+ serialize_options,
478
+ };
479
+
480
+ let cloud_options = {
481
+ let cloud_options =
482
+ parse_cloud_options(path.to_str().unwrap(), cloud_options.unwrap_or_default())?;
483
+ Some(cloud_options)
484
+ };
485
+
486
+ let ldf = self.ldf.borrow().clone();
487
+ ldf.sink_csv(&path, options, cloud_options)
488
+ .map_err(RbPolarsErr::from)?;
489
+ Ok(())
490
+ }
491
+
492
+ pub fn sink_json(
493
+ &self,
494
+ path: PathBuf,
495
+ maintain_order: bool,
496
+ cloud_options: Option<Vec<(String, String)>>,
497
+ retries: usize,
498
+ ) -> RbResult<()> {
499
+ let options = JsonWriterOptions { maintain_order };
500
+
501
+ let cloud_options = {
502
+ let cloud_options =
503
+ parse_cloud_options(path.to_str().unwrap(), cloud_options.unwrap_or_default())?;
504
+ Some(cloud_options.with_max_retries(retries))
505
+ };
506
+
507
+ let ldf = self.ldf.borrow().clone();
508
+ ldf.sink_json(&path, options, cloud_options)
509
+ .map_err(RbPolarsErr::from)?;
510
+ Ok(())
511
+ }
512
+
513
+ pub fn fetch(&self, n_rows: usize) -> RbResult<RbDataFrame> {
514
+ let ldf = self.ldf.borrow().clone();
515
+ let df = ldf.fetch(n_rows).map_err(RbPolarsErr::from)?;
516
+ Ok(df.into())
517
+ }
518
+
519
+ pub fn filter(&self, predicate: &RbExpr) -> Self {
520
+ let ldf = self.ldf.borrow().clone();
521
+ ldf.filter(predicate.inner.clone()).into()
522
+ }
523
+
524
+ pub fn select(&self, exprs: RArray) -> RbResult<Self> {
525
+ let ldf = self.ldf.borrow().clone();
526
+ let exprs = rb_exprs_to_exprs(exprs)?;
527
+ Ok(ldf.select(exprs).into())
528
+ }
529
+
530
+ pub fn select_seq(&self, exprs: RArray) -> RbResult<Self> {
531
+ let ldf = self.ldf.borrow().clone();
532
+ let exprs = rb_exprs_to_exprs(exprs)?;
533
+ Ok(ldf.select_seq(exprs).into())
534
+ }
535
+
536
+ pub fn group_by(&self, by: RArray, maintain_order: bool) -> RbResult<RbLazyGroupBy> {
537
+ let ldf = self.ldf.borrow().clone();
538
+ let by = rb_exprs_to_exprs(by)?;
539
+ let lazy_gb = if maintain_order {
540
+ ldf.group_by_stable(by)
541
+ } else {
542
+ ldf.group_by(by)
543
+ };
544
+ Ok(RbLazyGroupBy {
545
+ lgb: RefCell::new(Some(lazy_gb)),
546
+ })
547
+ }
548
+
549
+ pub fn rolling(
550
+ &self,
551
+ index_column: &RbExpr,
552
+ period: String,
553
+ offset: String,
554
+ closed: Wrap<ClosedWindow>,
555
+ by: RArray,
556
+ ) -> RbResult<RbLazyGroupBy> {
557
+ let closed_window = closed.0;
558
+ let ldf = self.ldf.borrow().clone();
559
+ let by = rb_exprs_to_exprs(by)?;
560
+ let lazy_gb = ldf.rolling(
561
+ index_column.inner.clone(),
562
+ by,
563
+ RollingGroupOptions {
564
+ index_column: "".into(),
565
+ period: Duration::parse(&period),
566
+ offset: Duration::parse(&offset),
567
+ closed_window,
568
+ },
569
+ );
570
+
571
+ Ok(RbLazyGroupBy {
572
+ lgb: RefCell::new(Some(lazy_gb)),
573
+ })
574
+ }
575
+
576
+ #[allow(clippy::too_many_arguments)]
577
+ pub fn group_by_dynamic(
578
+ &self,
579
+ index_column: &RbExpr,
580
+ every: String,
581
+ period: String,
582
+ offset: String,
583
+ label: Wrap<Label>,
584
+ include_boundaries: bool,
585
+ closed: Wrap<ClosedWindow>,
586
+ by: RArray,
587
+ start_by: Wrap<StartBy>,
588
+ ) -> RbResult<RbLazyGroupBy> {
589
+ let closed_window = closed.0;
590
+ let by = rb_exprs_to_exprs(by)?;
591
+ let ldf = self.ldf.borrow().clone();
592
+ let lazy_gb = ldf.group_by_dynamic(
593
+ index_column.inner.clone(),
594
+ by,
595
+ DynamicGroupOptions {
596
+ every: Duration::parse(&every),
597
+ period: Duration::parse(&period),
598
+ offset: Duration::parse(&offset),
599
+ label: label.0,
600
+ include_boundaries,
601
+ closed_window,
602
+ start_by: start_by.0,
603
+ ..Default::default()
604
+ },
605
+ );
606
+
607
+ Ok(RbLazyGroupBy {
608
+ lgb: RefCell::new(Some(lazy_gb)),
609
+ })
610
+ }
611
+
612
+ pub fn with_context(&self, contexts: RArray) -> RbResult<Self> {
613
+ let contexts = contexts.typecheck::<Obj<RbLazyFrame>>()?;
614
+ let contexts = contexts
615
+ .into_iter()
616
+ .map(|ldf| ldf.ldf.borrow().clone())
617
+ .collect::<Vec<_>>();
618
+ Ok(self.ldf.borrow().clone().with_context(contexts).into())
619
+ }
620
+
621
+ #[allow(clippy::too_many_arguments)]
622
+ pub fn join_asof(
623
+ &self,
624
+ other: &RbLazyFrame,
625
+ left_on: &RbExpr,
626
+ right_on: &RbExpr,
627
+ left_by: Option<Vec<String>>,
628
+ right_by: Option<Vec<String>>,
629
+ allow_parallel: bool,
630
+ force_parallel: bool,
631
+ suffix: String,
632
+ strategy: Wrap<AsofStrategy>,
633
+ tolerance: Option<Wrap<AnyValue<'_>>>,
634
+ tolerance_str: Option<String>,
635
+ coalesce: bool,
636
+ ) -> RbResult<Self> {
637
+ let coalesce = if coalesce {
638
+ JoinCoalesce::CoalesceColumns
639
+ } else {
640
+ JoinCoalesce::KeepColumns
641
+ };
642
+ let ldf = self.ldf.borrow().clone();
643
+ let other = other.ldf.borrow().clone();
644
+ let left_on = left_on.inner.clone();
645
+ let right_on = right_on.inner.clone();
646
+ Ok(ldf
647
+ .join_builder()
648
+ .with(other)
649
+ .left_on([left_on])
650
+ .right_on([right_on])
651
+ .allow_parallel(allow_parallel)
652
+ .force_parallel(force_parallel)
653
+ .coalesce(coalesce)
654
+ .how(JoinType::AsOf(AsOfOptions {
655
+ strategy: strategy.0,
656
+ left_by: left_by.map(strings_to_pl_smallstr),
657
+ right_by: right_by.map(strings_to_pl_smallstr),
658
+ tolerance: tolerance.map(|t| t.0.into_static()),
659
+ tolerance_str: tolerance_str.map(|s| s.into()),
660
+ }))
661
+ .suffix(suffix)
662
+ .finish()
663
+ .into())
664
+ }
665
+
666
+ #[allow(clippy::too_many_arguments)]
667
+ pub fn join(
668
+ &self,
669
+ other: &RbLazyFrame,
670
+ left_on: RArray,
671
+ right_on: RArray,
672
+ allow_parallel: bool,
673
+ force_parallel: bool,
674
+ join_nulls: bool,
675
+ how: Wrap<JoinType>,
676
+ suffix: String,
677
+ validate: Wrap<JoinValidation>,
678
+ coalesce: Option<bool>,
679
+ ) -> RbResult<Self> {
680
+ let coalesce = match coalesce {
681
+ None => JoinCoalesce::JoinSpecific,
682
+ Some(true) => JoinCoalesce::CoalesceColumns,
683
+ Some(false) => JoinCoalesce::KeepColumns,
684
+ };
685
+ let ldf = self.ldf.borrow().clone();
686
+ let other = other.ldf.borrow().clone();
687
+ let left_on = rb_exprs_to_exprs(left_on)?;
688
+ let right_on = rb_exprs_to_exprs(right_on)?;
689
+
690
+ Ok(ldf
691
+ .join_builder()
692
+ .with(other)
693
+ .left_on(left_on)
694
+ .right_on(right_on)
695
+ .allow_parallel(allow_parallel)
696
+ .force_parallel(force_parallel)
697
+ .join_nulls(join_nulls)
698
+ .how(how.0)
699
+ .validate(validate.0)
700
+ .coalesce(coalesce)
701
+ .suffix(suffix)
702
+ .finish()
703
+ .into())
704
+ }
705
+
706
+ pub fn with_column(&self, expr: &RbExpr) -> Self {
707
+ let ldf = self.ldf.borrow().clone();
708
+ ldf.with_column(expr.inner.clone()).into()
709
+ }
710
+
711
+ pub fn with_columns(&self, exprs: RArray) -> RbResult<Self> {
712
+ let ldf = self.ldf.borrow().clone();
713
+ Ok(ldf.with_columns(rb_exprs_to_exprs(exprs)?).into())
714
+ }
715
+
716
+ pub fn with_columns_seq(&self, exprs: RArray) -> RbResult<Self> {
717
+ let ldf = self.ldf.borrow().clone();
718
+ Ok(ldf.with_columns_seq(rb_exprs_to_exprs(exprs)?).into())
719
+ }
720
+
721
+ pub fn rename(&self, existing: Vec<String>, new: Vec<String>, strict: bool) -> Self {
722
+ let ldf = self.ldf.borrow().clone();
723
+ ldf.rename(existing, new, strict).into()
724
+ }
725
+
726
+ pub fn reverse(&self) -> Self {
727
+ let ldf = self.ldf.borrow().clone();
728
+ ldf.reverse().into()
729
+ }
730
+
731
+ pub fn shift(&self, n: &RbExpr, fill_value: Option<&RbExpr>) -> Self {
732
+ let lf = self.ldf.borrow().clone();
733
+ let out = match fill_value {
734
+ Some(v) => lf.shift_and_fill(n.inner.clone(), v.inner.clone()),
735
+ None => lf.shift(n.inner.clone()),
736
+ };
737
+ out.into()
738
+ }
739
+
740
+ pub fn fill_nan(&self, fill_value: &RbExpr) -> Self {
741
+ let ldf = self.ldf.borrow().clone();
742
+ ldf.fill_nan(fill_value.inner.clone()).into()
743
+ }
744
+
745
+ pub fn min(&self) -> Self {
746
+ let ldf = self.ldf.borrow().clone();
747
+ let out = ldf.min();
748
+ out.into()
749
+ }
750
+
751
+ pub fn max(&self) -> Self {
752
+ let ldf = self.ldf.borrow().clone();
753
+ let out = ldf.max();
754
+ out.into()
755
+ }
756
+
757
+ pub fn sum(&self) -> Self {
758
+ let ldf = self.ldf.borrow().clone();
759
+ let out = ldf.sum();
760
+ out.into()
761
+ }
762
+
763
+ pub fn mean(&self) -> Self {
764
+ let ldf = self.ldf.borrow().clone();
765
+ let out = ldf.mean();
766
+ out.into()
767
+ }
768
+
769
+ pub fn std(&self, ddof: u8) -> Self {
770
+ let ldf = self.ldf.borrow().clone();
771
+ let out = ldf.std(ddof);
772
+ out.into()
773
+ }
774
+
775
+ pub fn var(&self, ddof: u8) -> Self {
776
+ let ldf = self.ldf.borrow().clone();
777
+ let out = ldf.var(ddof);
778
+ out.into()
779
+ }
780
+
781
+ pub fn median(&self) -> Self {
782
+ let ldf = self.ldf.borrow().clone();
783
+ let out = ldf.median();
784
+ out.into()
785
+ }
786
+
787
+ pub fn quantile(&self, quantile: &RbExpr, interpolation: Wrap<QuantileMethod>) -> Self {
788
+ let ldf = self.ldf.borrow().clone();
789
+ let out = ldf.quantile(quantile.inner.clone(), interpolation.0);
790
+ out.into()
791
+ }
792
+
793
+ pub fn explode(&self, column: RArray) -> RbResult<Self> {
794
+ let ldf = self.ldf.borrow().clone();
795
+ let column = rb_exprs_to_exprs(column)?;
796
+ Ok(ldf.explode(column).into())
797
+ }
798
+
799
+ pub fn null_count(&self) -> Self {
800
+ let ldf = self.ldf.borrow().clone();
801
+ ldf.null_count().into()
802
+ }
803
+
804
+ pub fn unique(
805
+ &self,
806
+ maintain_order: bool,
807
+ subset: Option<Vec<String>>,
808
+ keep: Wrap<UniqueKeepStrategy>,
809
+ ) -> RbResult<Self> {
810
+ let ldf = self.ldf.borrow().clone();
811
+ Ok(match maintain_order {
812
+ true => ldf.unique_stable_generic(subset, keep.0),
813
+ false => ldf.unique_generic(subset, keep.0),
814
+ }
815
+ .into())
816
+ }
817
+
818
+ pub fn drop_nulls(&self, subset: Option<Vec<String>>) -> Self {
819
+ let ldf = self.ldf.borrow().clone();
820
+ ldf.drop_nulls(subset.map(|v| v.into_iter().map(|s| col(&s)).collect()))
821
+ .into()
822
+ }
823
+
824
+ pub fn slice(&self, offset: i64, len: Option<IdxSize>) -> Self {
825
+ let ldf = self.ldf.borrow().clone();
826
+ ldf.slice(offset, len.unwrap_or(IdxSize::MAX)).into()
827
+ }
828
+
829
+ pub fn tail(&self, n: IdxSize) -> Self {
830
+ let ldf = self.ldf.borrow().clone();
831
+ ldf.tail(n).into()
832
+ }
833
+
834
+ pub fn unpivot(
835
+ &self,
836
+ on: RArray,
837
+ index: RArray,
838
+ value_name: Option<String>,
839
+ variable_name: Option<String>,
840
+ ) -> RbResult<Self> {
841
+ let on = rb_exprs_to_exprs(on)?;
842
+ let index = rb_exprs_to_exprs(index)?;
843
+ let args = UnpivotArgsDSL {
844
+ on: on.into_iter().map(|e| e.into()).collect(),
845
+ index: index.into_iter().map(|e| e.into()).collect(),
846
+ value_name: value_name.map(|s| s.into()),
847
+ variable_name: variable_name.map(|s| s.into()),
848
+ };
849
+
850
+ let ldf = self.ldf.borrow().clone();
851
+ Ok(ldf.unpivot(args).into())
852
+ }
853
+
854
+ pub fn with_row_index(&self, name: String, offset: Option<IdxSize>) -> Self {
855
+ let ldf = self.ldf.borrow().clone();
856
+ ldf.with_row_index(&name, offset).into()
857
+ }
858
+
859
+ pub fn drop(&self, cols: Vec<String>) -> Self {
860
+ let ldf = self.ldf.borrow().clone();
861
+ ldf.drop(cols).into()
862
+ }
863
+
864
+ pub fn cast(&self, rb_dtypes: RHash, strict: bool) -> RbResult<Self> {
865
+ let mut dtypes = Vec::new();
866
+ rb_dtypes.foreach(|k: String, v: Wrap<DataType>| {
867
+ dtypes.push((k, v.0));
868
+ Ok(ForEach::Continue)
869
+ })?;
870
+ let mut cast_map = PlHashMap::with_capacity(dtypes.len());
871
+ cast_map.extend(dtypes.iter().map(|(k, v)| (k.as_ref(), v.clone())));
872
+ Ok(self.ldf.borrow().clone().cast(cast_map, strict).into())
873
+ }
874
+
875
+ pub fn cast_all(&self, dtype: Wrap<DataType>, strict: bool) -> Self {
876
+ self.ldf.borrow().clone().cast_all(dtype.0, strict).into()
877
+ }
878
+
879
+ pub fn clone(&self) -> Self {
880
+ self.ldf.borrow().clone().into()
881
+ }
882
+
883
+ pub fn collect_schema(&self) -> RbResult<RHash> {
884
+ let schema = self
885
+ .ldf
886
+ .borrow_mut()
887
+ .collect_schema()
888
+ .map_err(RbPolarsErr::from)?;
889
+
890
+ let schema_dict = RHash::new();
891
+ schema.iter_fields().for_each(|fld| {
892
+ schema_dict
893
+ .aset::<String, Value>(
894
+ fld.name().to_string(),
895
+ Wrap(fld.dtype().clone()).into_value(),
896
+ )
897
+ .unwrap();
898
+ });
899
+ Ok(schema_dict)
900
+ }
901
+
902
+ pub fn unnest(&self, cols: Vec<String>) -> Self {
903
+ self.ldf.borrow().clone().unnest(cols).into()
904
+ }
905
+
906
+ pub fn count(&self) -> Self {
907
+ let ldf = self.ldf.borrow().clone();
908
+ ldf.count().into()
909
+ }
910
+
911
+ pub fn merge_sorted(&self, other: &Self, key: String) -> RbResult<Self> {
912
+ let out = self
913
+ .ldf
914
+ .borrow()
915
+ .clone()
916
+ .merge_sorted(other.ldf.borrow().clone(), &key)
917
+ .map_err(RbPolarsErr::from)?;
918
+ Ok(out.into())
919
+ }
920
+ }