polars-df 0.2.0-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +7 -0
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +33 -0
  4. data/Cargo.lock +2230 -0
  5. data/Cargo.toml +10 -0
  6. data/LICENSE-THIRD-PARTY.txt +38828 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +91 -0
  9. data/lib/polars/3.0/polars.so +0 -0
  10. data/lib/polars/3.1/polars.so +0 -0
  11. data/lib/polars/3.2/polars.so +0 -0
  12. data/lib/polars/batched_csv_reader.rb +96 -0
  13. data/lib/polars/cat_expr.rb +52 -0
  14. data/lib/polars/cat_name_space.rb +54 -0
  15. data/lib/polars/convert.rb +100 -0
  16. data/lib/polars/data_frame.rb +4833 -0
  17. data/lib/polars/data_types.rb +122 -0
  18. data/lib/polars/date_time_expr.rb +1418 -0
  19. data/lib/polars/date_time_name_space.rb +1484 -0
  20. data/lib/polars/dynamic_group_by.rb +52 -0
  21. data/lib/polars/exceptions.rb +20 -0
  22. data/lib/polars/expr.rb +5307 -0
  23. data/lib/polars/expr_dispatch.rb +22 -0
  24. data/lib/polars/functions.rb +453 -0
  25. data/lib/polars/group_by.rb +558 -0
  26. data/lib/polars/io.rb +814 -0
  27. data/lib/polars/lazy_frame.rb +2442 -0
  28. data/lib/polars/lazy_functions.rb +1195 -0
  29. data/lib/polars/lazy_group_by.rb +93 -0
  30. data/lib/polars/list_expr.rb +610 -0
  31. data/lib/polars/list_name_space.rb +346 -0
  32. data/lib/polars/meta_expr.rb +54 -0
  33. data/lib/polars/rolling_group_by.rb +35 -0
  34. data/lib/polars/series.rb +3730 -0
  35. data/lib/polars/slice.rb +104 -0
  36. data/lib/polars/string_expr.rb +972 -0
  37. data/lib/polars/string_name_space.rb +690 -0
  38. data/lib/polars/struct_expr.rb +100 -0
  39. data/lib/polars/struct_name_space.rb +64 -0
  40. data/lib/polars/utils.rb +192 -0
  41. data/lib/polars/version.rb +4 -0
  42. data/lib/polars/when.rb +16 -0
  43. data/lib/polars/when_then.rb +19 -0
  44. data/lib/polars-df.rb +1 -0
  45. data/lib/polars.rb +50 -0
  46. metadata +89 -0
@@ -0,0 +1,2442 @@
1
+ module Polars
2
+ # Representation of a Lazy computation graph/query against a DataFrame.
3
+ class LazyFrame
4
+ # @private
5
+ attr_accessor :_ldf
6
+
7
+ # @private
8
+ def self._from_rbldf(rb_ldf)
9
+ ldf = LazyFrame.allocate
10
+ ldf._ldf = rb_ldf
11
+ ldf
12
+ end
13
+
14
+ # @private
15
+ def self._scan_csv(
16
+ file,
17
+ has_header: true,
18
+ sep: ",",
19
+ comment_char: nil,
20
+ quote_char: '"',
21
+ skip_rows: 0,
22
+ dtypes: nil,
23
+ null_values: nil,
24
+ ignore_errors: false,
25
+ cache: true,
26
+ with_column_names: nil,
27
+ infer_schema_length: 100,
28
+ n_rows: nil,
29
+ encoding: "utf8",
30
+ low_memory: false,
31
+ rechunk: true,
32
+ skip_rows_after_header: 0,
33
+ row_count_name: nil,
34
+ row_count_offset: 0,
35
+ parse_dates: false,
36
+ eol_char: "\n"
37
+ )
38
+ dtype_list = nil
39
+ if !dtypes.nil?
40
+ dtype_list = []
41
+ dtypes.each do |k, v|
42
+ dtype_list << [k, Utils.rb_type_to_dtype(v)]
43
+ end
44
+ end
45
+ processed_null_values = Utils._process_null_values(null_values)
46
+
47
+ _from_rbldf(
48
+ RbLazyFrame.new_from_csv(
49
+ file,
50
+ sep,
51
+ has_header,
52
+ ignore_errors,
53
+ skip_rows,
54
+ n_rows,
55
+ cache,
56
+ dtype_list,
57
+ low_memory,
58
+ comment_char,
59
+ quote_char,
60
+ processed_null_values,
61
+ infer_schema_length,
62
+ with_column_names,
63
+ rechunk,
64
+ skip_rows_after_header,
65
+ encoding,
66
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
67
+ parse_dates,
68
+ eol_char
69
+ )
70
+ )
71
+ end
72
+
73
+ # @private
74
+ def self._scan_parquet(
75
+ file,
76
+ n_rows: nil,
77
+ cache: true,
78
+ parallel: "auto",
79
+ rechunk: true,
80
+ row_count_name: nil,
81
+ row_count_offset: 0,
82
+ storage_options: nil,
83
+ low_memory: false
84
+ )
85
+ _from_rbldf(
86
+ RbLazyFrame.new_from_parquet(
87
+ file,
88
+ n_rows,
89
+ cache,
90
+ parallel,
91
+ rechunk,
92
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
93
+ low_memory
94
+ )
95
+ )
96
+ end
97
+
98
+ # @private
99
+ def self._scan_ipc(
100
+ file,
101
+ n_rows: nil,
102
+ cache: true,
103
+ rechunk: true,
104
+ row_count_name: nil,
105
+ row_count_offset: 0,
106
+ storage_options: nil,
107
+ memory_map: true
108
+ )
109
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
110
+ file = Utils.format_path(file)
111
+ end
112
+
113
+ _from_rbldf(
114
+ RbLazyFrame.new_from_ipc(
115
+ file,
116
+ n_rows,
117
+ cache,
118
+ rechunk,
119
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
120
+ memory_map
121
+ )
122
+ )
123
+ end
124
+
125
+ # @private
126
+ def self._scan_ndjson(
127
+ file,
128
+ infer_schema_length: nil,
129
+ batch_size: nil,
130
+ n_rows: nil,
131
+ low_memory: false,
132
+ rechunk: true,
133
+ row_count_name: nil,
134
+ row_count_offset: 0
135
+ )
136
+ _from_rbldf(
137
+ RbLazyFrame.new_from_ndjson(
138
+ file,
139
+ infer_schema_length,
140
+ batch_size,
141
+ n_rows,
142
+ low_memory,
143
+ rechunk,
144
+ Utils._prepare_row_count_args(row_count_name, row_count_offset)
145
+ )
146
+ )
147
+ end
148
+
149
+ # def self.from_json
150
+ # end
151
+
152
+ # Read a logical plan from a JSON file to construct a LazyFrame.
153
+ #
154
+ # @param file [String]
155
+ # Path to a file or a file-like object.
156
+ #
157
+ # @return [LazyFrame]
158
+ def self.read_json(file)
159
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
160
+ file = Utils.format_path(file)
161
+ end
162
+
163
+ Utils.wrap_ldf(RbLazyFrame.read_json(file))
164
+ end
165
+
166
+ # Get or set column names.
167
+ #
168
+ # @return [Array]
169
+ #
170
+ # @example
171
+ # df = (
172
+ # Polars::DataFrame.new(
173
+ # {
174
+ # "foo" => [1, 2, 3],
175
+ # "bar" => [6, 7, 8],
176
+ # "ham" => ["a", "b", "c"]
177
+ # }
178
+ # )
179
+ # .lazy
180
+ # .select(["foo", "bar"])
181
+ # )
182
+ # df.columns
183
+ # # => ["foo", "bar"]
184
+ def columns
185
+ _ldf.columns
186
+ end
187
+
188
+ # Get dtypes of columns in LazyFrame.
189
+ #
190
+ # @return [Array]
191
+ #
192
+ # @example
193
+ # lf = Polars::DataFrame.new(
194
+ # {
195
+ # "foo" => [1, 2, 3],
196
+ # "bar" => [6.0, 7.0, 8.0],
197
+ # "ham" => ["a", "b", "c"]
198
+ # }
199
+ # ).lazy
200
+ # lf.dtypes
201
+ # # => [Polars::Int64, Polars::Float64, Polars::Utf8]
202
+ def dtypes
203
+ _ldf.dtypes
204
+ end
205
+
206
+ # Get the schema.
207
+ #
208
+ # @return [Hash]
209
+ #
210
+ # @example
211
+ # lf = Polars::DataFrame.new(
212
+ # {
213
+ # "foo" => [1, 2, 3],
214
+ # "bar" => [6.0, 7.0, 8.0],
215
+ # "ham" => ["a", "b", "c"]
216
+ # }
217
+ # ).lazy
218
+ # lf.schema
219
+ # # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::Utf8}
220
+ def schema
221
+ _ldf.schema
222
+ end
223
+
224
+ # Get the width of the LazyFrame.
225
+ #
226
+ # @return [Integer]
227
+ #
228
+ # @example
229
+ # lf = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]}).lazy
230
+ # lf.width
231
+ # # => 2
232
+ def width
233
+ _ldf.width
234
+ end
235
+
236
+ # Check if LazyFrame includes key.
237
+ #
238
+ # @return [Boolean]
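+ #
+ # @example A minimal usage sketch (illustrative sample data):
+ # lf = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [6, 7, 8]}).lazy
+ # lf.include?("foo")
+ # # => true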
239
+ def include?(key)
240
+ columns.include?(key)
241
+ end
242
+
243
+ # clone handled by initialize_copy
244
+
245
+ # def [](item)
246
+ # end
247
+
248
+ # Returns a string representing the LazyFrame.
249
+ #
250
+ # @return [String]
251
+ def to_s
252
+ <<~EOS
253
+ naive plan: (run LazyFrame#describe_optimized_plan to see the optimized plan)
254
+
255
+ #{describe_plan}
256
+ EOS
257
+ end
258
+
259
+ # Write the logical plan of this LazyFrame to a file in JSON format.
260
+ #
261
+ # @param file [String]
262
+ # File path to which the result should be written.
263
+ #
264
+ # @return [nil]
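+ #
+ # @example A possible round trip (illustrative; "plan.json" is a hypothetical path):
+ # lf = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy.select("a")
+ # lf.write_json("plan.json")
+ # Polars::LazyFrame.read_json("plan.json")
+ # # => a LazyFrame reconstructed from the written plan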
265
+ def write_json(file)
266
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
267
+ file = Utils.format_path(file)
268
+ end
269
+ _ldf.write_json(file)
270
+ nil
271
+ end
272
+
273
+ # Offers a structured way to apply a sequence of user-defined functions (UDFs).
274
+ #
275
+ # @param func [Object]
276
+ # Callable; will receive the frame as the first parameter,
277
+ # followed by any given args/kwargs.
278
+ # @param args [Object]
279
+ # Arguments to pass to the UDF.
280
+ # @param kwargs [Object]
281
+ # Keyword arguments to pass to the UDF.
282
+ #
283
+ # @return [LazyFrame]
284
+ #
285
+ # @example
286
+ # cast_str_to_int = lambda do |data, col_name:|
287
+ # data.with_column(Polars.col(col_name).cast(:i64))
288
+ # end
289
+ #
290
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]}).lazy
291
+ # df.pipe(cast_str_to_int, col_name: "b").collect
292
+ # # =>
293
+ # # shape: (4, 2)
294
+ # # ┌─────┬─────┐
295
+ # # │ a ┆ b │
296
+ # # │ --- ┆ --- │
297
+ # # │ i64 ┆ i64 │
298
+ # # ╞═════╪═════╡
299
+ # # │ 1 ┆ 10 │
300
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
301
+ # # │ 2 ┆ 20 │
302
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
303
+ # # │ 3 ┆ 30 │
304
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
305
+ # # │ 4 ┆ 40 │
306
+ # # └─────┴─────┘
307
+ def pipe(func, *args, **kwargs, &block)
308
+ func.call(self, *args, **kwargs, &block)
309
+ end
310
+
311
+ # Create a string representation of the unoptimized query plan.
312
+ #
313
+ # @return [String]
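+ #
+ # @example A usage sketch (illustrative; the exact plan text depends on the query):
+ # lf = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy.filter(Polars.col("a") > 1)
+ # puts lf.describe_plan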
314
+ def describe_plan
315
+ _ldf.describe_plan
316
+ end
317
+
318
+ # Create a string representation of the optimized query plan.
319
+ #
320
+ # @return [String]
321
+ def describe_optimized_plan(
322
+ type_coercion: true,
323
+ predicate_pushdown: true,
324
+ projection_pushdown: true,
325
+ simplify_expression: true,
326
+ slice_pushdown: true,
327
+ common_subplan_elimination: true,
328
+ allow_streaming: false
329
+ )
330
+ ldf = _ldf.optimization_toggle(
331
+ type_coercion,
332
+ predicate_pushdown,
333
+ projection_pushdown,
334
+ simplify_expression,
335
+ slice_pushdown,
336
+ common_subplan_elimination,
337
+ allow_streaming
338
+ )
339
+
340
+ ldf.describe_optimized_plan
341
+ end
342
+
343
+ # def show_graph
344
+ # end
345
+
346
+ # Sort the DataFrame.
347
+ #
348
+ # Sorting can be done by:
349
+ #
350
+ # - A single column name
351
+ # - An expression
352
+ # - Multiple expressions
353
+ #
354
+ # @param by [Object]
355
+ # Column (expressions) to sort by.
356
+ # @param reverse [Boolean]
357
+ # Sort in descending order.
358
+ # @param nulls_last [Boolean]
359
+ # Place null values last. Can only be used if sorted by a single column.
360
+ #
361
+ # @return [LazyFrame]
362
+ #
363
+ # @example
364
+ # df = Polars::DataFrame.new(
365
+ # {
366
+ # "foo" => [1, 2, 3],
367
+ # "bar" => [6.0, 7.0, 8.0],
368
+ # "ham" => ["a", "b", "c"]
369
+ # }
370
+ # ).lazy
371
+ # df.sort("foo", reverse: true).collect
372
+ # # =>
373
+ # # shape: (3, 3)
374
+ # # ┌─────┬─────┬─────┐
375
+ # # │ foo ┆ bar ┆ ham │
376
+ # # │ --- ┆ --- ┆ --- │
377
+ # # │ i64 ┆ f64 ┆ str │
378
+ # # ╞═════╪═════╪═════╡
379
+ # # │ 3 ┆ 8.0 ┆ c │
380
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
381
+ # # │ 2 ┆ 7.0 ┆ b │
382
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
383
+ # # │ 1 ┆ 6.0 ┆ a │
384
+ # # └─────┴─────┴─────┘
385
+ def sort(by, reverse: false, nulls_last: false)
386
+ if by.is_a?(String)
387
+ return _from_rbldf(_ldf.sort(by, reverse, nulls_last))
388
+ end
389
+ if Utils.bool?(reverse)
390
+ reverse = [reverse]
391
+ end
392
+
393
+ by = Utils.selection_to_rbexpr_list(by)
394
+ _from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last))
395
+ end
396
+
397
+ # def profile
398
+ # end
399
+
400
+ # Collect into a DataFrame.
401
+ #
402
+ # Note: use {#fetch} if you want to run your query on the first `n` rows
403
+ # only. This can be a huge time saver in debugging queries.
404
+ #
405
+ # @param type_coercion [Boolean]
406
+ # Do type coercion optimization.
407
+ # @param predicate_pushdown [Boolean]
408
+ # Do predicate pushdown optimization.
409
+ # @param projection_pushdown [Boolean]
410
+ # Do projection pushdown optimization.
411
+ # @param simplify_expression [Boolean]
412
+ # Run simplify expressions optimization.
413
+ # @param string_cache [Boolean]
414
+ # This argument is deprecated. Please set the string cache globally.
415
+ # The argument will be ignored.
416
+ # @param no_optimization [Boolean]
417
+ # Turn off (certain) optimizations.
418
+ # @param slice_pushdown [Boolean]
419
+ # Slice pushdown optimization.
420
+ # @param common_subplan_elimination [Boolean]
421
+ # Will try to cache branching subplans that occur on self-joins or unions.
422
+ # @param allow_streaming [Boolean]
423
+ # Run parts of the query in a streaming fashion (this is in an alpha state)
424
+ #
425
+ # @return [DataFrame]
426
+ #
427
+ # @example
428
+ # df = Polars::DataFrame.new(
429
+ # {
430
+ # "a" => ["a", "b", "a", "b", "b", "c"],
431
+ # "b" => [1, 2, 3, 4, 5, 6],
432
+ # "c" => [6, 5, 4, 3, 2, 1]
433
+ # }
434
+ # ).lazy
435
+ # df.groupby("a", maintain_order: true).agg(Polars.all.sum).collect
436
+ # # =>
437
+ # # shape: (3, 3)
438
+ # # ┌─────┬─────┬─────┐
439
+ # # │ a ┆ b ┆ c │
440
+ # # │ --- ┆ --- ┆ --- │
441
+ # # │ str ┆ i64 ┆ i64 │
442
+ # # ╞═════╪═════╪═════╡
443
+ # # │ a ┆ 4 ┆ 10 │
444
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
445
+ # # │ b ┆ 11 ┆ 10 │
446
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
447
+ # # │ c ┆ 6 ┆ 1 │
448
+ # # └─────┴─────┴─────┘
449
+ def collect(
450
+ type_coercion: true,
451
+ predicate_pushdown: true,
452
+ projection_pushdown: true,
453
+ simplify_expression: true,
454
+ string_cache: false,
455
+ no_optimization: false,
456
+ slice_pushdown: true,
457
+ common_subplan_elimination: true,
458
+ allow_streaming: false
459
+ )
460
+ if no_optimization
461
+ predicate_pushdown = false
462
+ projection_pushdown = false
463
+ slice_pushdown = false
464
+ common_subplan_elimination = false
465
+ end
466
+
467
+ if allow_streaming
468
+ common_subplan_elimination = false
469
+ end
470
+
471
+ ldf = _ldf.optimization_toggle(
472
+ type_coercion,
473
+ predicate_pushdown,
474
+ projection_pushdown,
475
+ simplify_expression,
476
+ slice_pushdown,
477
+ common_subplan_elimination,
478
+ allow_streaming
479
+ )
480
+ Utils.wrap_df(ldf.collect)
481
+ end
482
+
483
+ # Collect a small number of rows for debugging purposes.
484
+ #
485
+ # Fetch is like a {#collect} operation, but it overwrites the number of rows
486
+ # read by every scan operation. This is a utility that helps debug a query on a
487
+ # smaller number of rows.
488
+ #
489
+ # Note that the fetch does not guarantee the final number of rows in the
490
+ # DataFrame. Filter, join operations and a lower number of rows available in the
491
+ # scanned file influence the final number of rows.
492
+ #
493
+ # @param n_rows [Integer]
494
+ # Collect n_rows from the data sources.
495
+ # @param type_coercion [Boolean]
496
+ # Run type coercion optimization.
497
+ # @param predicate_pushdown [Boolean]
498
+ # Run predicate pushdown optimization.
499
+ # @param projection_pushdown [Boolean]
500
+ # Run projection pushdown optimization.
501
+ # @param simplify_expression [Boolean]
502
+ # Run simplify expressions optimization.
503
+ # @param string_cache [Boolean]
504
+ # This argument is deprecated. Please set the string cache globally.
505
+ # The argument will be ignored.
506
+ # @param no_optimization [Boolean]
507
+ # Turn off optimizations.
508
+ # @param slice_pushdown [Boolean]
509
+ # Slice pushdown optimization
510
+ # @param common_subplan_elimination [Boolean]
511
+ # Will try to cache branching subplans that occur on self-joins or unions.
512
+ # @param allow_streaming [Boolean]
513
+ # Run parts of the query in a streaming fashion (this is in an alpha state)
514
+ #
515
+ # @return [DataFrame]
516
+ #
517
+ # @example
518
+ # df = Polars::DataFrame.new(
519
+ # {
520
+ # "a" => ["a", "b", "a", "b", "b", "c"],
521
+ # "b" => [1, 2, 3, 4, 5, 6],
522
+ # "c" => [6, 5, 4, 3, 2, 1]
523
+ # }
524
+ # ).lazy
525
+ # df.groupby("a", maintain_order: true).agg(Polars.all.sum).fetch(2)
526
+ # # =>
527
+ # # shape: (2, 3)
528
+ # # ┌─────┬─────┬─────┐
529
+ # # │ a ┆ b ┆ c │
530
+ # # │ --- ┆ --- ┆ --- │
531
+ # # │ str ┆ i64 ┆ i64 │
532
+ # # ╞═════╪═════╪═════╡
533
+ # # │ a ┆ 1 ┆ 6 │
534
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
535
+ # # │ b ┆ 2 ┆ 5 │
536
+ # # └─────┴─────┴─────┘
537
+ def fetch(
538
+ n_rows = 500,
539
+ type_coercion: true,
540
+ predicate_pushdown: true,
541
+ projection_pushdown: true,
542
+ simplify_expression: true,
543
+ string_cache: false,
544
+ no_optimization: false,
545
+ slice_pushdown: true,
546
+ common_subplan_elimination: true,
547
+ allow_streaming: false
548
+ )
549
+ if no_optimization
550
+ predicate_pushdown = false
551
+ projection_pushdown = false
552
+ slice_pushdown = false
553
+ common_subplan_elimination = false
554
+ end
555
+
556
+ ldf = _ldf.optimization_toggle(
557
+ type_coercion,
558
+ predicate_pushdown,
559
+ projection_pushdown,
560
+ simplify_expression,
561
+ slice_pushdown,
562
+ common_subplan_elimination,
563
+ allow_streaming
564
+ )
565
+ Utils.wrap_df(ldf.fetch(n_rows))
566
+ end
567
+
568
+ # Return lazy representation, i.e. itself.
569
+ #
570
+ # Useful for writing code that expects either a `DataFrame` or
571
+ # `LazyFrame`.
572
+ #
573
+ # @return [LazyFrame]
574
+ #
575
+ # @example
576
+ # df = Polars::DataFrame.new(
577
+ # {
578
+ # "a" => [nil, 2, 3, 4],
579
+ # "b" => [0.5, nil, 2.5, 13],
580
+ # "c" => [true, true, false, nil]
581
+ # }
582
+ # )
583
+ # df.lazy
584
+ def lazy
585
+ self
586
+ end
587
+
588
+ # Cache the result once the execution of the physical plan hits this node.
589
+ #
590
+ # @return [LazyFrame]
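+ #
+ # @example A possible pattern (illustrative sample data): cache a filtered frame reused by two queries.
+ # base = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [4, 5, 6]}).lazy
+ # shared = base.filter(Polars.col("a") > 1).cache
+ # shared.select("a").collect
+ # shared.select("b").collect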
591
+ def cache
592
+ _from_rbldf(_ldf.cache)
593
+ end
594
+
595
+ # Create an empty copy of the current LazyFrame.
596
+ #
597
+ # The copy has an identical schema but no data.
598
+ #
599
+ # @return [LazyFrame]
600
+ #
601
+ # @example
602
+ # df = Polars::DataFrame.new(
603
+ # {
604
+ # "a" => [nil, 2, 3, 4],
605
+ # "b" => [0.5, nil, 2.5, 13],
606
+ # "c" => [true, true, false, nil],
607
+ # }
608
+ # ).lazy
609
+ # df.cleared.fetch
610
+ # # =>
611
+ # # shape: (0, 3)
612
+ # # ┌─────┬─────┬──────┐
613
+ # # │ a ┆ b ┆ c │
614
+ # # │ --- ┆ --- ┆ --- │
615
+ # # │ i64 ┆ f64 ┆ bool │
616
+ # # ╞═════╪═════╪══════╡
617
+ # # └─────┴─────┴──────┘
618
+ def cleared
619
+ DataFrame.new(columns: schema).lazy
620
+ end
621
+
622
+ # Filter the rows in the DataFrame based on a predicate expression.
623
+ #
624
+ # @param predicate [Object]
625
+ # Expression that evaluates to a boolean Series.
626
+ #
627
+ # @return [LazyFrame]
628
+ #
629
+ # @example Filter on one condition:
630
+ # lf = Polars::DataFrame.new(
631
+ # {
632
+ # "foo" => [1, 2, 3],
633
+ # "bar" => [6, 7, 8],
634
+ # "ham" => ["a", "b", "c"]
635
+ # }
636
+ # ).lazy
637
+ # lf.filter(Polars.col("foo") < 3).collect
638
+ # # =>
639
+ # # shape: (2, 3)
640
+ # # ┌─────┬─────┬─────┐
641
+ # # │ foo ┆ bar ┆ ham │
642
+ # # │ --- ┆ --- ┆ --- │
643
+ # # │ i64 ┆ i64 ┆ str │
644
+ # # ╞═════╪═════╪═════╡
645
+ # # │ 1 ┆ 6 ┆ a │
646
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
647
+ # # │ 2 ┆ 7 ┆ b │
648
+ # # └─────┴─────┴─────┘
649
+ #
650
+ # @example Filter on multiple conditions:
651
+ # lf.filter((Polars.col("foo") < 3) & (Polars.col("ham") == "a")).collect
652
+ # # =>
653
+ # # shape: (1, 3)
654
+ # # ┌─────┬─────┬─────┐
655
+ # # │ foo ┆ bar ┆ ham │
656
+ # # │ --- ┆ --- ┆ --- │
657
+ # # │ i64 ┆ i64 ┆ str │
658
+ # # ╞═════╪═════╪═════╡
659
+ # # │ 1 ┆ 6 ┆ a │
660
+ # # └─────┴─────┴─────┘
661
+ def filter(predicate)
662
+ _from_rbldf(
663
+ _ldf.filter(
664
+ Utils.expr_to_lit_or_expr(predicate, str_to_lit: false)._rbexpr
665
+ )
666
+ )
667
+ end
668
+
669
+ # Select columns from this DataFrame.
670
+ #
671
+ # @param exprs [Object]
672
+ # Column or columns to select.
673
+ #
674
+ # @return [LazyFrame]
675
+ #
676
+ # @example
677
+ # df = Polars::DataFrame.new(
678
+ # {
679
+ # "foo" => [1, 2, 3],
680
+ # "bar" => [6, 7, 8],
681
+ # "ham" => ["a", "b", "c"],
682
+ # }
683
+ # ).lazy
684
+ # df.select("foo").collect
685
+ # # =>
686
+ # # shape: (3, 1)
687
+ # # ┌─────┐
688
+ # # │ foo │
689
+ # # │ --- │
690
+ # # │ i64 │
691
+ # # ╞═════╡
692
+ # # │ 1 │
693
+ # # ├╌╌╌╌╌┤
694
+ # # │ 2 │
695
+ # # ├╌╌╌╌╌┤
696
+ # # │ 3 │
697
+ # # └─────┘
698
+ #
699
+ # @example
700
+ # df.select(["foo", "bar"]).collect
701
+ # # =>
702
+ # # shape: (3, 2)
703
+ # # ┌─────┬─────┐
704
+ # # │ foo ┆ bar │
705
+ # # │ --- ┆ --- │
706
+ # # │ i64 ┆ i64 │
707
+ # # ╞═════╪═════╡
708
+ # # │ 1 ┆ 6 │
709
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
710
+ # # │ 2 ┆ 7 │
711
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
712
+ # # │ 3 ┆ 8 │
713
+ # # └─────┴─────┘
714
+ #
715
+ # @example
716
+ # df.select(Polars.col("foo") + 1).collect
717
+ # # =>
718
+ # # shape: (3, 1)
719
+ # # ┌─────┐
720
+ # # │ foo │
721
+ # # │ --- │
722
+ # # │ i64 │
723
+ # # ╞═════╡
724
+ # # │ 2 │
725
+ # # ├╌╌╌╌╌┤
726
+ # # │ 3 │
727
+ # # ├╌╌╌╌╌┤
728
+ # # │ 4 │
729
+ # # └─────┘
730
+ #
731
+ # @example
732
+ # df.select([Polars.col("foo") + 1, Polars.col("bar") + 1]).collect
733
+ # # =>
734
+ # # shape: (3, 2)
735
+ # # ┌─────┬─────┐
736
+ # # │ foo ┆ bar │
737
+ # # │ --- ┆ --- │
738
+ # # │ i64 ┆ i64 │
739
+ # # ╞═════╪═════╡
740
+ # # │ 2 ┆ 7 │
741
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
742
+ # # │ 3 ┆ 8 │
743
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
744
+ # # │ 4 ┆ 9 │
745
+ # # └─────┴─────┘
746
+ #
747
+ # @example
748
+ # df.select(Polars.when(Polars.col("foo") > 2).then(10).otherwise(0)).collect
749
+ # # =>
750
+ # # shape: (3, 1)
751
+ # # ┌─────────┐
752
+ # # │ literal │
753
+ # # │ --- │
754
+ # # │ i64 │
755
+ # # ╞═════════╡
756
+ # # │ 0 │
757
+ # # ├╌╌╌╌╌╌╌╌╌┤
758
+ # # │ 0 │
759
+ # # ├╌╌╌╌╌╌╌╌╌┤
760
+ # # │ 10 │
761
+ # # └─────────┘
762
+ def select(exprs)
763
+ exprs = Utils.selection_to_rbexpr_list(exprs)
764
+ _from_rbldf(_ldf.select(exprs))
765
+ end
766
+
767
+ # Start a groupby operation.
768
+ #
769
+ # @param by [Object]
770
+ # Column(s) to group by.
771
+ # @param maintain_order [Boolean]
772
+ # Make sure that the order of the groups remains consistent. This is more
773
+ # expensive than a default groupby.
774
+ #
775
+ # @return [LazyGroupBy]
776
+ #
777
+ # @example
778
+ # df = Polars::DataFrame.new(
779
+ # {
780
+ # "a" => ["a", "b", "a", "b", "b", "c"],
781
+ # "b" => [1, 2, 3, 4, 5, 6],
782
+ # "c" => [6, 5, 4, 3, 2, 1]
783
+ # }
784
+ # ).lazy
785
+ # df.groupby("a", maintain_order: true).agg(Polars.col("b").sum).collect
786
+ # # =>
787
+ # # shape: (3, 2)
788
+ # # ┌─────┬─────┐
789
+ # # │ a ┆ b │
790
+ # # │ --- ┆ --- │
791
+ # # │ str ┆ i64 │
792
+ # # ╞═════╪═════╡
793
+ # # │ a ┆ 4 │
794
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
795
+ # # │ b ┆ 11 │
796
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
797
+ # # │ c ┆ 6 │
798
+ # # └─────┴─────┘
799
+ def groupby(by, maintain_order: false)
800
+ rbexprs_by = Utils.selection_to_rbexpr_list(by)
801
+ lgb = _ldf.groupby(rbexprs_by, maintain_order)
802
+ LazyGroupBy.new(lgb, self.class)
803
+ end
804
+
805
+ # Create rolling groups based on a time column.
806
+ #
807
+ # Also works for index values of type `:i32` or `:i64`.
808
+ #
809
+ # Different from a `groupby_dynamic`, the windows are now determined by the
810
+ # individual values and are not of constant intervals. For constant intervals
811
+ # use *groupby_dynamic*.
812
+ #
813
+ # The `period` and `offset` arguments are created either from a timedelta, or
814
+ # by using the following string language:
815
+ #
816
+ # - 1ns (1 nanosecond)
817
+ # - 1us (1 microsecond)
818
+ # - 1ms (1 millisecond)
819
+ # - 1s (1 second)
820
+ # - 1m (1 minute)
821
+ # - 1h (1 hour)
822
+ # - 1d (1 day)
823
+ # - 1w (1 week)
824
+ # - 1mo (1 calendar month)
825
+ # - 1y (1 calendar year)
826
+ # - 1i (1 index count)
827
+ #
828
+ # Or combine them:
829
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
830
+ #
831
+ # In case of a groupby_rolling on an integer column, the windows are defined by:
832
+ #
833
+ # - "1i" # length 1
834
+ # - "10i" # length 10
835
+ #
836
+ # @param index_column [Object]
837
+ # Column used to group based on the time window.
838
+ # Often of type Date/Datetime.
839
+ # This column must be sorted in ascending order. If not, the output will not
840
+ # make sense.
841
+ #
842
+ # In case of a rolling groupby on indices, dtype needs to be one of
843
+ # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
844
+ # performance matters use an `:i64` column.
845
+ # @param period [Object]
846
+ # Length of the window.
847
+ # @param offset [Object]
848
+ # Offset of the window. Default is -period.
849
+ # @param closed ["right", "left", "both", "none"]
850
+ # Define whether the temporal window interval is closed or not.
851
+ # @param by [Object]
852
+ # Also group by this column/these columns.
853
+ #
854
+ # @return [LazyGroupBy]
855
+ #
856
+ # @example
857
+ # dates = [
858
+ # "2020-01-01 13:45:48",
859
+ # "2020-01-01 16:42:13",
860
+ # "2020-01-01 16:45:09",
861
+ # "2020-01-02 18:12:48",
862
+ # "2020-01-03 19:45:32",
863
+ # "2020-01-08 23:16:43"
864
+ # ]
865
+ # df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
866
+ # Polars.col("dt").str.strptime(:datetime)
867
+ # )
868
+ # df.groupby_rolling(index_column: "dt", period: "2d").agg(
869
+ # [
870
+ # Polars.sum("a").alias("sum_a"),
871
+ # Polars.min("a").alias("min_a"),
872
+ # Polars.max("a").alias("max_a")
873
+ # ]
874
+ # )
875
+ # # =>
876
+ # # shape: (6, 4)
877
+ # # ┌─────────────────────┬───────┬───────┬───────┐
878
+ # # │ dt ┆ sum_a ┆ min_a ┆ max_a │
879
+ # # │ --- ┆ --- ┆ --- ┆ --- │
880
+ # # │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │
881
+ # # ╞═════════════════════╪═══════╪═══════╪═══════╡
882
+ # # │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │
883
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
884
+ # # │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │
885
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
886
+ # # │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │
887
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
888
+ # # │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │
889
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
890
+ # # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
891
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
892
+ # # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
893
+ # # └─────────────────────┴───────┴───────┴───────┘
894
+ def groupby_rolling(
895
+ index_column:,
896
+ period:,
897
+ offset: nil,
898
+ closed: "right",
899
+ by: nil
900
+ )
901
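+ # default the offset to the negative period so that each window ends at its
+ # index value (with the default closed: "right")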
+ if offset.nil?
902
+ offset = "-#{period}"
903
+ end
904
+
905
+ rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
906
+ period = Utils._timedelta_to_pl_duration(period)
907
+ offset = Utils._timedelta_to_pl_duration(offset)
908
+
909
+ lgb = _ldf.groupby_rolling(
910
+ index_column, period, offset, closed, rbexprs_by
911
+ )
912
+ LazyGroupBy.new(lgb, self.class)
913
+ end
914
+
915
+ # Group based on a time value (or index value of type `:i32`, `:i64`).
916
+ #
917
+ # Time windows are calculated and rows are assigned to windows. Different from a
918
+ # normal groupby, a row can be a member of multiple groups. The time/index
919
+ # window could be seen as a rolling window, with a window size determined by
920
+ # dates/times/values instead of slots in the DataFrame.
921
+ #
922
+ # A window is defined by:
923
+ #
924
+ # - every: interval of the window
925
+ # - period: length of the window
926
+ # - offset: offset of the window
927
+ #
928
+ # The `every`, `period` and `offset` arguments are created with
929
+ # the following string language:
930
+ #
931
+ # - 1ns (1 nanosecond)
932
+ # - 1us (1 microsecond)
933
+ # - 1ms (1 millisecond)
934
+ # - 1s (1 second)
935
+ # - 1m (1 minute)
936
+ # - 1h (1 hour)
937
+ # - 1d (1 day)
938
+ # - 1w (1 week)
939
+ # - 1mo (1 calendar month)
940
+ # - 1y (1 calendar year)
941
+ # - 1i (1 index count)
942
+ #
943
+ # Or combine them:
944
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
945
+ #
946
+ # In case of a groupby_dynamic on an integer column, the windows are defined by:
947
+ #
948
+ # - "1i" # length 1
949
+ # - "10i" # length 10
950
+ #
951
+ # @param index_column
952
+ # Column used to group based on the time window.
953
+ # Often of type Date/Datetime.
954
+ # This column must be sorted in ascending order. If not, the output will not
955
+ # make sense.
956
+ #
957
+ # In case of a dynamic groupby on indices, dtype needs to be one of
958
+ # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
959
+ # performance matters use an `:i64` column.
960
+ # @param every
961
+ # Interval of the window.
962
+ # @param period
963
+ # Length of the window. If nil, it is equal to `every`.
964
+ # @param offset
965
+ # Offset of the window. If nil and period is nil, it will be equal to negative
966
+ # `every`.
967
+ # @param truncate
968
+ # Truncate the time value to the window lower bound.
969
+ # @param include_boundaries
970
+ # Add the lower and upper bound of the window to the "_lower_boundary" and
971
+ # "_upper_bound" columns. This will impact performance because it's harder to
972
+ # parallelize
973
+ # @param closed ["right", "left", "both", "none"]
974
+ # Define whether the temporal window interval is closed or not.
975
+ # @param by
976
+ # Also group by this column/these columns
977
+ #
978
+ # @return [LazyGroupBy]
979
+ #
980
+ # @example
981
+ # df = Polars::DataFrame.new(
982
+ # {
983
+ # "time" => Polars.date_range(
984
+ # DateTime.new(2021, 12, 16),
985
+ # DateTime.new(2021, 12, 16, 3),
986
+ # "30m"
987
+ # ),
988
+ # "n" => 0..6
989
+ # }
990
+ # )
991
+ # # =>
992
+ # # shape: (7, 2)
993
+ # # ┌─────────────────────┬─────┐
994
+ # # │ time ┆ n │
995
+ # # │ --- ┆ --- │
996
+ # # │ datetime[μs] ┆ i64 │
997
+ # # ╞═════════════════════╪═════╡
998
+ # # │ 2021-12-16 00:00:00 ┆ 0 │
999
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1000
+ # # │ 2021-12-16 00:30:00 ┆ 1 │
1001
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1002
+ # # │ 2021-12-16 01:00:00 ┆ 2 │
1003
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1004
+ # # │ 2021-12-16 01:30:00 ┆ 3 │
1005
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1006
+ # # │ 2021-12-16 02:00:00 ┆ 4 │
1007
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1008
+ # # │ 2021-12-16 02:30:00 ┆ 5 │
1009
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1010
+ # # │ 2021-12-16 03:00:00 ┆ 6 │
1011
+ # # └─────────────────────┴─────┘
1012
+ #
1013
+ # @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
1014
+ # df.groupby_dynamic("time", every: "1h", closed: "right").agg(
1015
+ # [
1016
+ # Polars.col("time").min.alias("time_min"),
1017
+ # Polars.col("time").max.alias("time_max")
1018
+ # ]
1019
+ # )
1020
+ # # =>
1021
+ # # shape: (4, 3)
1022
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┐
1023
+ # # │ time ┆ time_min ┆ time_max │
1024
+ # # │ --- ┆ --- ┆ --- │
1025
+ # # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │
1026
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╡
1027
+ # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │
1028
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1029
+ # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │
1030
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1031
+ # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │
1032
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1033
+ # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │
1034
+ # # └─────────────────────┴─────────────────────┴─────────────────────┘
1035
+ #
1036
+ # @example The window boundaries can also be added to the aggregation result.
1037
+ # df.groupby_dynamic(
1038
+ # "time", every: "1h", include_boundaries: true, closed: "right"
1039
+ # ).agg([Polars.col("time").count.alias("time_count")])
1040
+ # # =>
1041
+ # # shape: (4, 4)
1042
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
1043
+ # # │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
1044
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1045
+ # # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
1046
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
1047
+ # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
1048
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1049
+ # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │
1050
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1051
+ # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
1052
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1053
+ # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
1054
+ # # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
1055
+ #
1056
+ # @example When closed="left", should not include right end of interval.
1057
+ # df.groupby_dynamic("time", every: "1h", closed: "left").agg(
1058
+ # [
1059
+ # Polars.col("time").count.alias("time_count"),
1060
+ # Polars.col("time").list.alias("time_agg_list")
1061
+ # ]
1062
+ # )
1063
+ # # =>
1064
+ # # shape: (4, 3)
1065
+ # # ┌─────────────────────┬────────────┬─────────────────────────────────────┐
1066
+ # # │ time ┆ time_count ┆ time_agg_list │
1067
+ # # │ --- ┆ --- ┆ --- │
1068
+ # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
1069
+ # # ╞═════════════════════╪════════════╪═════════════════════════════════════╡
1070
+ # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16... │
1071
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1072
+ # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16... │
1073
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1074
+ # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16... │
1075
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1076
+ # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
1077
+ # # └─────────────────────┴────────────┴─────────────────────────────────────┘
1078
+ #
1079
+ # @example When closed="both" the time values at the window boundaries belong to 2 groups.
1080
+ # df.groupby_dynamic("time", every: "1h", closed: "both").agg(
1081
+ # [Polars.col("time").count.alias("time_count")]
1082
+ # )
1083
+ # # =>
1084
+ # # shape: (5, 2)
1085
+ # # ┌─────────────────────┬────────────┐
1086
+ # # │ time ┆ time_count │
1087
+ # # │ --- ┆ --- │
1088
+ # # │ datetime[μs] ┆ u32 │
1089
+ # # ╞═════════════════════╪════════════╡
1090
+ # # │ 2021-12-15 23:00:00 ┆ 1 │
1091
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1092
+ # # │ 2021-12-16 00:00:00 ┆ 3 │
1093
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1094
+ # # │ 2021-12-16 01:00:00 ┆ 3 │
1095
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1096
+ # # │ 2021-12-16 02:00:00 ┆ 3 │
1097
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1098
+ # # │ 2021-12-16 03:00:00 ┆ 1 │
1099
+ # # └─────────────────────┴────────────┘
1100
+ #
1101
+ # @example Dynamic groupbys can also be combined with grouping on normal keys.
1102
+ # df = Polars::DataFrame.new(
1103
+ # {
1104
+ # "time" => Polars.date_range(
1105
+ # DateTime.new(2021, 12, 16),
1106
+ # DateTime.new(2021, 12, 16, 3),
1107
+ # "30m"
1108
+ # ),
1109
+ # "groups" => ["a", "a", "a", "b", "b", "a", "a"]
1110
+ # }
1111
+ # )
1112
+ # df.groupby_dynamic(
1113
+ # "time",
1114
+ # every: "1h",
1115
+ # closed: "both",
1116
+ # by: "groups",
1117
+ # include_boundaries: true
1118
+ # ).agg([Polars.col("time").count.alias("time_count")])
1119
+ # # =>
1120
+ # # shape: (7, 5)
1121
+ # # ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
1122
+ # # │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
1123
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1124
+ # # │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
1125
+ # # ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
1126
+ # # │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
1127
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1128
+ # # │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │
1129
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1130
+ # # │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │
1131
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1132
+ # # │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
1133
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1134
+ # # │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │
1135
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1136
+ # # │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
1137
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1138
+ # # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
1139
+ # # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
1140
+ #
1141
+ # @example Dynamic groupby on an index column.
1142
+ # df = Polars::DataFrame.new(
1143
+ # {
1144
+ # "idx" => Polars.arange(0, 6, eager: true),
1145
+ # "A" => ["A", "A", "B", "B", "B", "C"]
1146
+ # }
1147
+ # )
1148
+ # df.groupby_dynamic(
1149
+ # "idx",
1150
+ # every: "2i",
1151
+ # period: "3i",
1152
+ # include_boundaries: true,
1153
+ # closed: "right"
1154
+ # ).agg(Polars.col("A").list.alias("A_agg_list"))
1155
+ # # =>
1156
+ # # shape: (3, 4)
1157
+ # # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
1158
+ # # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
1159
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1160
+ # # │ i64 ┆ i64 ┆ i64 ┆ list[str] │
1161
+ # # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
1162
+ # # │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
1163
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1164
+ # # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
1165
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1166
+ # # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
1167
+ # # └─────────────────┴─────────────────┴─────┴─────────────────┘
1168
+ def groupby_dynamic(
1169
+ index_column,
1170
+ every:,
1171
+ period: nil,
1172
+ offset: nil,
1173
+ truncate: true,
1174
+ include_boundaries: false,
1175
+ closed: "left",
1176
+ by: nil,
1177
+ start_by: "window"
1178
+ )
1179
+ if offset.nil?
1180
+ if period.nil?
1181
+ offset = "-#{every}"
1182
+ else
1183
+ offset = "0ns"
1184
+ end
1185
+ end
1186
+
1187
+ if period.nil?
1188
+ period = every
1189
+ end
1190
+
1191
+ period = Utils._timedelta_to_pl_duration(period)
1192
+ offset = Utils._timedelta_to_pl_duration(offset)
1193
+ every = Utils._timedelta_to_pl_duration(every)
1194
+
1195
+ rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
1196
+ lgb = _ldf.groupby_dynamic(
1197
+ index_column,
1198
+ every,
1199
+ period,
1200
+ offset,
1201
+ truncate,
1202
+ include_boundaries,
1203
+ closed,
1204
+ rbexprs_by,
1205
+ start_by
1206
+ )
1207
+ LazyGroupBy.new(lgb, self.class)
1208
+ end
1209
+
1210
+ # Perform an asof join.
1211
+ #
1212
+ # This is similar to a left-join except that we match on nearest key rather than
1213
+ # equal keys.
1214
+ #
1215
+ # Both DataFrames must be sorted by the join_asof key.
1216
+ #
1217
+ # For each row in the left DataFrame:
1218
+ #
1219
+ # - A "backward" search selects the last row in the right DataFrame whose 'on' key is less than or equal to the left's key.
1220
+ # - A "forward" search selects the first row in the right DataFrame whose 'on' key is greater than or equal to the left's key.
1221
+ #
1222
+ # The default is "backward".
1223
+ #
1224
+ # @param other [LazyFrame]
1225
+ # Lazy DataFrame to join with.
1226
+ # @param left_on [String]
1227
+ # Join column of the left DataFrame.
1228
+ # @param right_on [String]
1229
+ # Join column of the right DataFrame.
1230
+ # @param on [String]
1231
+ # Join column of both DataFrames. If set, `left_on` and `right_on` should be
1232
+ # nil.
1233
+ # @param by [Object]
1234
+ # Join on these columns before doing asof join.
1235
+ # @param by_left [Object]
1236
+ # Join on these columns before doing asof join.
1237
+ # @param by_right [Object]
1238
+ # Join on these columns before doing asof join.
1239
+ # @param strategy ["backward", "forward"]
1240
+ # Join strategy.
1241
+ # @param suffix [String]
1242
+ # Suffix to append to columns with a duplicate name.
1243
+ # @param tolerance [Object]
1244
+ # Numeric tolerance. By setting this, the join will only be done if the near
1245
+ # keys are within this distance. If an asof join is done on columns of dtype
1246
+ # "Date", "Datetime", "Duration" or "Time" you use the following string
1247
+ # language:
1248
+ #
1249
+ # - 1ns (1 nanosecond)
1250
+ # - 1us (1 microsecond)
1251
+ # - 1ms (1 millisecond)
1252
+ # - 1s (1 second)
1253
+ # - 1m (1 minute)
1254
+ # - 1h (1 hour)
1255
+ # - 1d (1 day)
1256
+ # - 1w (1 week)
1257
+ # - 1mo (1 calendar month)
1258
+ # - 1y (1 calendar year)
1259
+ # - 1i (1 index count)
1260
+ #
1261
+ # Or combine them:
1262
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
1263
+ #
1264
+ # @param allow_parallel [Boolean]
1265
+ # Allow the physical plan to optionally evaluate the computation of both
1266
+ # DataFrames up to the join in parallel.
1267
+ # @param force_parallel [Boolean]
1268
+ # Force the physical plan to evaluate the computation of both DataFrames up to
1269
+ # the join in parallel.
1270
+ #
1271
+ # @return [LazyFrame]
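+ #
+ # @example A sketch of an asof join on sorted integer keys (illustrative sample data):
+ # gdp = Polars::DataFrame.new({"year" => [2016, 2017, 2018], "gdp" => [4164, 4411, 4566]}).lazy
+ # pop = Polars::DataFrame.new({"year" => [2016, 2018, 2019], "pop" => [82.2, 82.9, 83.7]}).lazy
+ # pop.join_asof(gdp, on: "year", strategy: "backward").collect
+ # # each population row gets the most recent gdp value at or before its year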
1272
+ def join_asof(
1273
+ other,
1274
+ left_on: nil,
1275
+ right_on: nil,
1276
+ on: nil,
1277
+ by_left: nil,
1278
+ by_right: nil,
1279
+ by: nil,
1280
+ strategy: "backward",
1281
+ suffix: "_right",
1282
+ tolerance: nil,
1283
+ allow_parallel: true,
1284
+ force_parallel: false
1285
+ )
1286
+ if !other.is_a?(LazyFrame)
1287
+ raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
1288
+ end
1289
+
1290
+ if on.is_a?(String)
1291
+ left_on = on
1292
+ right_on = on
1293
+ end
1294
+
1295
+ if left_on.nil? || right_on.nil?
1296
+ raise ArgumentError, "You should pass the column to join on as an argument."
1297
+ end
1298
+
1299
+ if by_left.is_a?(String) || by_left.is_a?(Expr)
1300
+ by_left_ = [by_left]
1301
+ else
1302
+ by_left_ = by_left
1303
+ end
1304
+
1305
+ if by_right.is_a?(String) || by_right.is_a?(Expr)
1306
+ by_right_ = [by_right]
1307
+ else
1308
+ by_right_ = by_right
1309
+ end
1310
+
1311
+ if by.is_a?(String)
1312
+ by_left_ = [by]
1313
+ by_right_ = [by]
1314
+ elsif by.is_a?(Array)
1315
+ by_left_ = by
1316
+ by_right_ = by
1317
+ end
1318
+
1319
+ tolerance_str = nil
1320
+ tolerance_num = nil
1321
+ if tolerance.is_a?(String)
1322
+ tolerance_str = tolerance
1323
+ else
1324
+ tolerance_num = tolerance
1325
+ end
1326
+
1327
+ _from_rbldf(
1328
+ _ldf.join_asof(
1329
+ other._ldf,
1330
+ Polars.col(left_on)._rbexpr,
1331
+ Polars.col(right_on)._rbexpr,
1332
+ by_left_,
1333
+ by_right_,
1334
+ allow_parallel,
1335
+ force_parallel,
1336
+ suffix,
1337
+ strategy,
1338
+ tolerance_num,
1339
+ tolerance_str
1340
+ )
1341
+ )
1342
+ end
1343
+
1344
+ # Add a join operation to the Logical Plan.
1345
+ #
1346
+ # @param other [LazyFrame]
1347
+ # Lazy DataFrame to join with.
1348
+ # @param left_on [Object]
1349
+ # Join column of the left DataFrame.
1350
+ # @param right_on [Object]
1351
+ # Join column of the right DataFrame.
1352
+ # @param on [Object]
1353
+ # Join column of both DataFrames. If set, `left_on` and `right_on` should be
1354
+ # nil.
1355
+ # @param how ["inner", "left", "outer", "semi", "anti", "cross"]
1356
+ # Join strategy.
1357
+ # @param suffix [String]
1358
+ # Suffix to append to columns with a duplicate name.
1359
+ # @param allow_parallel [Boolean]
1360
+ # Allow the physical plan to optionally evaluate the computation of both
1361
+ # DataFrames up to the join in parallel.
1362
+ # @param force_parallel [Boolean]
1363
+ # Force the physical plan to evaluate the computation of both DataFrames up to
1364
+ # the join in parallel.
1365
+ #
1366
+ # @return [LazyFrame]
1367
+ #
1368
+ # @example
1369
+ # df = Polars::DataFrame.new(
1370
+ # {
1371
+ # "foo" => [1, 2, 3],
1372
+ # "bar" => [6.0, 7.0, 8.0],
1373
+ # "ham" => ["a", "b", "c"]
1374
+ # }
1375
+ # ).lazy
1376
+ # other_df = Polars::DataFrame.new(
1377
+ # {
1378
+ # "apple" => ["x", "y", "z"],
1379
+ # "ham" => ["a", "b", "d"]
1380
+ # }
1381
+ # ).lazy
1382
+ # df.join(other_df, on: "ham").collect
1383
+ # # =>
1384
+ # # shape: (2, 4)
1385
+ # # ┌─────┬─────┬─────┬───────┐
1386
+ # # │ foo ┆ bar ┆ ham ┆ apple │
1387
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1388
+ # # │ i64 ┆ f64 ┆ str ┆ str │
1389
+ # # ╞═════╪═════╪═════╪═══════╡
1390
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
1391
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1392
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
1393
+ # # └─────┴─────┴─────┴───────┘
1394
+ #
1395
+ # @example
1396
+ # df.join(other_df, on: "ham", how: "outer").collect
1397
+ # # =>
1398
+ # # shape: (4, 4)
1399
+ # # ┌──────┬──────┬─────┬───────┐
1400
+ # # │ foo ┆ bar ┆ ham ┆ apple │
1401
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1402
+ # # │ i64 ┆ f64 ┆ str ┆ str │
1403
+ # # ╞══════╪══════╪═════╪═══════╡
1404
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
1405
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1406
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
1407
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1408
+ # # │ null ┆ null ┆ d ┆ z │
1409
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1410
+ # # │ 3 ┆ 8.0 ┆ c ┆ null │
1411
+ # # └──────┴──────┴─────┴───────┘
1412
+ #
1413
+ # @example
1414
+ # df.join(other_df, on: "ham", how: "left").collect
1415
+ # # =>
1416
+ # # shape: (3, 4)
1417
+ # # ┌─────┬─────┬─────┬───────┐
1418
+ # # │ foo ┆ bar ┆ ham ┆ apple │
1419
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1420
+ # # │ i64 ┆ f64 ┆ str ┆ str │
1421
+ # # ╞═════╪═════╪═════╪═══════╡
1422
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
1423
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1424
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
1425
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1426
+ # # │ 3 ┆ 8.0 ┆ c ┆ null │
1427
+ # # └─────┴─────┴─────┴───────┘
1428
+ #
1429
+ # @example
1430
+ # df.join(other_df, on: "ham", how: "semi").collect
1431
+ # # =>
1432
+ # # shape: (2, 3)
1433
+ # # ┌─────┬─────┬─────┐
1434
+ # # │ foo ┆ bar ┆ ham │
1435
+ # # │ --- ┆ --- ┆ --- │
1436
+ # # │ i64 ┆ f64 ┆ str │
1437
+ # # ╞═════╪═════╪═════╡
1438
+ # # │ 1 ┆ 6.0 ┆ a │
1439
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1440
+ # # │ 2 ┆ 7.0 ┆ b │
1441
+ # # └─────┴─────┴─────┘
1442
+ #
1443
+ # @example
1444
+ # df.join(other_df, on: "ham", how: "anti").collect
1445
+ # # =>
1446
+ # # shape: (1, 3)
1447
+ # # ┌─────┬─────┬─────┐
1448
+ # # │ foo ┆ bar ┆ ham │
1449
+ # # │ --- ┆ --- ┆ --- │
1450
+ # # │ i64 ┆ f64 ┆ str │
1451
+ # # ╞═════╪═════╪═════╡
1452
+ # # │ 3 ┆ 8.0 ┆ c │
1453
+ # # └─────┴─────┴─────┘
1454
+ def join(
1455
+ other,
1456
+ left_on: nil,
1457
+ right_on: nil,
1458
+ on: nil,
1459
+ how: "inner",
1460
+ suffix: "_right",
1461
+ allow_parallel: true,
1462
+ force_parallel: false
1463
+ )
1464
+ if !other.is_a?(LazyFrame)
1465
+ raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
1466
+ end
1467
+
1468
+ if how == "cross"
1469
+ return _from_rbldf(
1470
+ _ldf.join(
1471
+ other._ldf, [], [], allow_parallel, force_parallel, how, suffix
1472
+ )
1473
+ )
1474
+ end
1475
+
1476
+ if !on.nil?
1477
+ rbexprs = Utils.selection_to_rbexpr_list(on)
1478
+ rbexprs_left = rbexprs
1479
+ rbexprs_right = rbexprs
1480
+ elsif !left_on.nil? && !right_on.nil?
1481
+ rbexprs_left = Utils.selection_to_rbexpr_list(left_on)
1482
+ rbexprs_right = Utils.selection_to_rbexpr_list(right_on)
1483
+ else
1484
+ raise ArgumentError, "must specify `on` OR `left_on` and `right_on`"
1485
+ end
1486
+
1487
+ _from_rbldf(
1488
+ _ldf.join(
1489
+ other._ldf,
1490
+ rbexprs_left,
1491
+ rbexprs_right,
1492
+ allow_parallel,
1493
+ force_parallel,
1494
+ how,
1495
+ suffix
1496
+ )
1497
+ )
1498
+ end
1499
+
1500
+ # Add or overwrite multiple columns in a DataFrame.
1501
+ #
1502
+ # @param exprs [Object]
1503
+ # List of Expressions that evaluate to columns.
1504
+ #
1505
+ # @return [LazyFrame]
1506
+ #
1507
+ # @example
1508
+ # ldf = Polars::DataFrame.new(
1509
+ # {
1510
+ # "a" => [1, 2, 3, 4],
1511
+ # "b" => [0.5, 4, 10, 13],
1512
+ # "c" => [true, true, false, true]
1513
+ # }
1514
+ # ).lazy
1515
+ # ldf.with_columns(
1516
+ # [
1517
+ # (Polars.col("a") ** 2).alias("a^2"),
1518
+ # (Polars.col("b") / 2).alias("b/2"),
1519
+ # (Polars.col("c").is_not).alias("not c")
1520
+ # ]
1521
+ # ).collect
1522
+ # # =>
1523
+ # # shape: (4, 6)
1524
+ # # ┌─────┬──────┬───────┬──────┬──────┬───────┐
1525
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
1526
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1527
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │
1528
+ # # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
1529
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │
1530
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1531
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │
1532
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1533
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
1534
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1535
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
1536
+ # # └─────┴──────┴───────┴──────┴──────┴───────┘
1537
+ def with_columns(exprs)
1538
+ exprs =
1539
+ if exprs.nil?
1540
+ []
1541
+ elsif exprs.is_a?(Expr)
1542
+ [exprs]
1543
+ else
1544
+ exprs.to_a
1545
+ end
1546
+
1547
+ rbexprs = []
1548
+ exprs.each do |e|
1549
+ case e
1550
+ when Expr
1551
+ rbexprs << e._rbexpr
1552
+ when Series
1553
+ rbexprs << Utils.lit(e)._rbexpr
1554
+ else
1555
+ raise ArgumentError, "Expected an expression, got #{e}"
1556
+ end
1557
+ end
1558
+
1559
+ _from_rbldf(_ldf.with_columns(rbexprs))
1560
+ end
1561
+
1562
+ # Add an external context to the computation graph.
1563
+ #
1564
+ # This allows expressions to also access columns from DataFrames
1565
+ # that are not part of this one.
1566
+ #
1567
+ # @param other [Object]
1568
+ # Lazy DataFrame to join with.
1569
+ #
1570
+ # @return [LazyFrame]
1571
+ #
1572
+ # @example
1573
+ # df_a = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => ["a", "c", nil]}).lazy
1574
+ # df_other = Polars::DataFrame.new({"c" => ["foo", "ham"]})
1575
+ # (
1576
+ # df_a.with_context(df_other.lazy).select(
1577
+ # [Polars.col("b") + Polars.col("c").first]
1578
+ # )
1579
+ # ).collect
1580
+ # # =>
1581
+ # # shape: (3, 1)
1582
+ # # ┌──────┐
1583
+ # # │ b │
1584
+ # # │ --- │
1585
+ # # │ str │
1586
+ # # ╞══════╡
1587
+ # # │ afoo │
1588
+ # # ├╌╌╌╌╌╌┤
1589
+ # # │ cfoo │
1590
+ # # ├╌╌╌╌╌╌┤
1591
+ # # │ null │
1592
+ # # └──────┘
1593
+ def with_context(other)
1594
+ if !other.is_a?(Array)
1595
+ other = [other]
1596
+ end
1597
+
1598
+ _from_rbldf(_ldf.with_context(other.map(&:_ldf)))
1599
+ end
1600
+
1601
+ # Add or overwrite column in a DataFrame.
1602
+ #
1603
+ # @param column [Object]
1604
+ # Expression that evaluates to column or a Series to use.
1605
+ #
1606
+ # @return [LazyFrame]
1607
+ #
1608
+ # @example
1609
+ # df = Polars::DataFrame.new(
1610
+ # {
1611
+ # "a" => [1, 3, 5],
1612
+ # "b" => [2, 4, 6]
1613
+ # }
1614
+ # ).lazy
1615
+ # df.with_column((Polars.col("b") ** 2).alias("b_squared")).collect
1616
+ # # =>
1617
+ # # shape: (3, 3)
1618
+ # # ┌─────┬─────┬───────────┐
1619
+ # # │ a ┆ b ┆ b_squared │
1620
+ # # │ --- ┆ --- ┆ --- │
1621
+ # # │ i64 ┆ i64 ┆ f64 │
1622
+ # # ╞═════╪═════╪═══════════╡
1623
+ # # │ 1 ┆ 2 ┆ 4.0 │
1624
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
1625
+ # # │ 3 ┆ 4 ┆ 16.0 │
1626
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
1627
+ # # │ 5 ┆ 6 ┆ 36.0 │
1628
+ # # └─────┴─────┴───────────┘
1629
+ #
1630
+ # @example
1631
+ # df.with_column(Polars.col("a") ** 2).collect
1632
+ # # =>
1633
+ # # shape: (3, 2)
1634
+ # # ┌──────┬─────┐
1635
+ # # │ a ┆ b │
1636
+ # # │ --- ┆ --- │
1637
+ # # │ f64 ┆ i64 │
1638
+ # # ╞══════╪═════╡
1639
+ # # │ 1.0 ┆ 2 │
1640
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
1641
+ # # │ 9.0 ┆ 4 │
1642
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
1643
+ # # │ 25.0 ┆ 6 │
1644
+ # # └──────┴─────┘
1645
+ def with_column(column)
1646
+ with_columns([column])
1647
+ end
1648
+
1649
+ # Remove one or multiple columns from a DataFrame.
1650
+ #
1651
+ # @param columns [Object]
1652
+ # - Name of the column that should be removed.
1653
+ # - List of column names.
1654
+ #
1655
+ # @return [LazyFrame]
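+ #
+ # @example A usage sketch (illustrative sample data):
+ # df = Polars::DataFrame.new({"foo" => [1, 2], "bar" => [6, 7], "ham" => ["a", "b"]}).lazy
+ # df.drop("ham").columns
+ # # => ["foo", "bar"]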
1656
+ def drop(columns)
1657
+ if columns.is_a?(String)
1658
+ columns = [columns]
1659
+ end
1660
+ _from_rbldf(_ldf.drop_columns(columns))
1661
+ end
1662
+
1663
+ # Rename column names.
1664
+ #
1665
+ # @param mapping [Hash]
1666
+ # Key value pairs that map from old name to new name.
1667
+ #
1668
+ # @return [LazyFrame]
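+ #
+ # @example A usage sketch (illustrative sample data):
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [6, 7, 8]}).lazy
+ # df.rename({"foo" => "apple"}).columns
+ # # => ["apple", "bar"]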
1669
+ def rename(mapping)
1670
+ existing = mapping.keys
1671
+ _new = mapping.values
1672
+ _from_rbldf(_ldf.rename(existing, _new))
1673
+ end
1674
+
1675
+ # Reverse the DataFrame.
1676
+ #
1677
+ # @return [LazyFrame]
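+ #
+ # @example A usage sketch (illustrative sample data):
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy
+ # df.reverse.collect
+ # # => a DataFrame with the rows in reverse order (3, 2, 1)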
1678
+ def reverse
1679
+ _from_rbldf(_ldf.reverse)
1680
+ end
1681
+
1682
+ # Shift the values by a given period.
1683
+ #
1684
+ # @param periods [Integer]
1685
+ # Number of places to shift (may be negative).
1686
+ #
1687
+ # @return [LazyFrame]
1688
+ #
1689
+ # @example
1690
+ # df = Polars::DataFrame.new(
1691
+ # {
1692
+ # "a" => [1, 3, 5],
1693
+ # "b" => [2, 4, 6]
1694
+ # }
1695
+ # ).lazy
1696
+ # df.shift(1).collect
1697
+ # # =>
1698
+ # # shape: (3, 2)
1699
+ # # ┌──────┬──────┐
1700
+ # # │ a ┆ b │
1701
+ # # │ --- ┆ --- │
1702
+ # # │ i64 ┆ i64 │
1703
+ # # ╞══════╪══════╡
1704
+ # # │ null ┆ null │
1705
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1706
+ # # │ 1 ┆ 2 │
1707
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1708
+ # # │ 3 ┆ 4 │
1709
+ # # └──────┴──────┘
1710
+ #
1711
+ # @example
1712
+ # df.shift(-1).collect
1713
+ # # =>
1714
+ # # shape: (3, 2)
1715
+ # # ┌──────┬──────┐
1716
+ # # │ a ┆ b │
1717
+ # # │ --- ┆ --- │
1718
+ # # │ i64 ┆ i64 │
1719
+ # # ╞══════╪══════╡
1720
+ # # │ 3 ┆ 4 │
1721
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1722
+ # # │ 5 ┆ 6 │
1723
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1724
+ # # │ null ┆ null │
1725
+ # # └──────┴──────┘
1726
+ def shift(periods)
1727
+ _from_rbldf(_ldf.shift(periods))
1728
+ end
1729
+
1730
+ # Shift the values by a given period and fill the resulting null values.
1731
+ #
1732
+ # @param periods [Integer]
1733
+ # Number of places to shift (may be negative).
1734
+ # @param fill_value [Object]
1735
+ # Fill `nil` values with the result of this expression.
1736
+ #
1737
+ # @return [LazyFrame]
1738
+ #
1739
+ # @example
1740
+ # df = Polars::DataFrame.new(
1741
+ # {
1742
+ # "a" => [1, 3, 5],
1743
+ # "b" => [2, 4, 6]
1744
+ # }
1745
+ # ).lazy
1746
+ # df.shift_and_fill(1, 0).collect
1747
+ # # =>
1748
+ # # shape: (3, 2)
1749
+ # # ┌─────┬─────┐
1750
+ # # │ a ┆ b │
1751
+ # # │ --- ┆ --- │
1752
+ # # │ i64 ┆ i64 │
1753
+ # # ╞═════╪═════╡
1754
+ # # │ 0 ┆ 0 │
1755
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1756
+ # # │ 1 ┆ 2 │
1757
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1758
+ # # │ 3 ┆ 4 │
1759
+ # # └─────┴─────┘
1760
+ #
1761
+ # @example
1762
+ # df.shift_and_fill(-1, 0).collect
1763
+ # # =>
1764
+ # # shape: (3, 2)
1765
+ # # ┌─────┬─────┐
1766
+ # # │ a ┆ b │
1767
+ # # │ --- ┆ --- │
1768
+ # # │ i64 ┆ i64 │
1769
+ # # ╞═════╪═════╡
1770
+ # # │ 3 ┆ 4 │
1771
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1772
+ # # │ 5 ┆ 6 │
1773
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1774
+ # # │ 0 ┆ 0 │
1775
+ # # └─────┴─────┘
1776
+ def shift_and_fill(periods, fill_value)
1777
+ if !fill_value.is_a?(Expr)
1778
+ fill_value = Polars.lit(fill_value)
1779
+ end
1780
+ _from_rbldf(_ldf.shift_and_fill(periods, fill_value._rbexpr))
1781
+ end
1782
+
1783
+ # Get a slice of this DataFrame.
1784
+ #
1785
+ # @param offset [Integer]
1786
+ # Start index. Negative indexing is supported.
1787
+ # @param length [Integer]
1788
+ # Length of the slice. If set to `nil`, all rows starting at the offset
1789
+ # will be selected.
1790
+ #
1791
+ # @return [LazyFrame]
1792
+ #
1793
+ # @example
1794
+ # df = Polars::DataFrame.new(
1795
+ # {
1796
+ # "a" => ["x", "y", "z"],
1797
+ # "b" => [1, 3, 5],
1798
+ # "c" => [2, 4, 6]
1799
+ # }
1800
+ # ).lazy
1801
+ # df.slice(1, 2).collect
1802
+ # # =>
1803
+ # # shape: (2, 3)
1804
+ # # ┌─────┬─────┬─────┐
1805
+ # # │ a ┆ b ┆ c │
1806
+ # # │ --- ┆ --- ┆ --- │
1807
+ # # │ str ┆ i64 ┆ i64 │
1808
+ # # ╞═════╪═════╪═════╡
1809
+ # # │ y ┆ 3 ┆ 4 │
1810
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1811
+ # # │ z ┆ 5 ┆ 6 │
1812
+ # # └─────┴─────┴─────┘
1813
+ def slice(offset, length = nil)
1814
+ if length && length < 0
1815
+ raise ArgumentError, "Negative slice lengths (#{length}) are invalid for LazyFrame"
1816
+ end
1817
+ _from_rbldf(_ldf.slice(offset, length))
1818
+ end
1819
+
1820
+ # Get the first `n` rows.
1821
+ #
1822
+ # Alias for {#head}.
1823
+ #
1824
+ # @param n [Integer]
1825
+ # Number of rows to return.
1826
+ #
1827
+ # @return [LazyFrame]
1828
+ #
1829
+ # @note
1830
+ # Consider using the {#fetch} operation if you only want to test your
1831
+ # query. The {#fetch} operation will load the first `n` rows at the scan
1832
+ # level, whereas {#head}/{#limit} are applied at the end.
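+ #
+ # A minimal usage sketch (toy frame assumed; result described rather than rendered):
+ #
+ # @example
+ #   df = Polars::DataFrame.new({"a" => [1, 3, 5], "b" => [2, 4, 6]}).lazy
+ #   df.limit(2).collect
+ #   # expected: the first two rows, (1, 2) and (3, 4)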
1833
+ def limit(n = 5)
1834
+ head(n)
1835
+ end
1836
+
1837
+ # Get the first `n` rows.
1838
+ #
1839
+ # @param n [Integer]
1840
+ # Number of rows to return.
1841
+ #
1842
+ # @return [LazyFrame]
1843
+ #
1844
+ # @note
1845
+ # Consider using the {#fetch} operation if you only want to test your
1846
+ # query. The {#fetch} operation will load the first `n` rows at the scan
1847
+ # level, whereas {#head}/{#limit} are applied at the end.
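+ #
+ # A minimal usage sketch, using the same toy frame as {#limit} (result described rather than rendered):
+ #
+ # @example
+ #   df = Polars::DataFrame.new({"a" => [1, 3, 5], "b" => [2, 4, 6]}).lazy
+ #   df.head(2).collect
+ #   # expected: the first two rows, (1, 2) and (3, 4)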
1848
+ def head(n = 5)
1849
+ slice(0, n)
1850
+ end
1851
+
1852
+ # Get the last `n` rows.
1853
+ #
1854
+ # @param n [Integer]
1855
+ # Number of rows.
1856
+ #
1857
+ # @return [LazyFrame]
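+ #
+ # A minimal usage sketch (toy frame assumed; result described rather than rendered):
+ #
+ # @example
+ #   df = Polars::DataFrame.new({"a" => [1, 3, 5], "b" => [2, 4, 6]}).lazy
+ #   df.tail(2).collect
+ #   # expected: the last two rows, (3, 4) and (5, 6)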
1858
+ def tail(n = 5)
1859
+ _from_rbldf(_ldf.tail(n))
1860
+ end
1861
+
1862
+ # Get the last row of the DataFrame.
1863
+ #
1864
+ # @return [LazyFrame]
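+ #
+ # @example A minimal sketch (toy frame assumed)
+ #   Polars::DataFrame.new({"a" => [1, 3, 5]}).lazy.last.collect
+ #   # expected: a single-row DataFrame containing 5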
1865
+ def last
1866
+ tail(1)
1867
+ end
1868
+
1869
+ # Get the first row of the DataFrame.
1870
+ #
1871
+ # @return [LazyFrame]
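+ #
+ # @example A minimal sketch (toy frame assumed)
+ #   Polars::DataFrame.new({"a" => [1, 3, 5]}).lazy.first.collect
+ #   # expected: a single-row DataFrame containing 1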
1872
+ def first
1873
+ slice(0, 1)
1874
+ end
1875
+
1876
+ # Add a column at index 0 that counts the rows.
1877
+ #
1878
+ # @param name [String]
1879
+ # Name of the column to add.
1880
+ # @param offset [Integer]
1881
+ # Start the row count at this offset.
1882
+ #
1883
+ # @return [LazyFrame]
1884
+ #
1885
+ # @note
1886
+ # This can have a negative effect on query performance.
1887
+ # This may, for instance, block predicate pushdown optimization.
1888
+ #
1889
+ # @example
1890
+ # df = Polars::DataFrame.new(
1891
+ # {
1892
+ # "a" => [1, 3, 5],
1893
+ # "b" => [2, 4, 6]
1894
+ # }
1895
+ # ).lazy
1896
+ # df.with_row_count.collect
1897
+ # # =>
1898
+ # # shape: (3, 3)
1899
+ # # ┌────────┬─────┬─────┐
1900
+ # # │ row_nr ┆ a ┆ b │
1901
+ # # │ --- ┆ --- ┆ --- │
1902
+ # # │ u32 ┆ i64 ┆ i64 │
1903
+ # # ╞════════╪═════╪═════╡
1904
+ # # │ 0 ┆ 1 ┆ 2 │
1905
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1906
+ # # │ 1 ┆ 3 ┆ 4 │
1907
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1908
+ # # │ 2 ┆ 5 ┆ 6 │
1909
+ # # └────────┴─────┴─────┘
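+ #
+ # The `offset` argument shifts where the count starts; a short sketch
+ # (expected result described, not rendered output):
+ #
+ # @example
+ #   df.with_row_count(offset: 1).collect
+ #   # expected: the row_nr column starts at 1 instead of 0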
1910
+ def with_row_count(name: "row_nr", offset: 0)
1911
+ _from_rbldf(_ldf.with_row_count(name, offset))
1912
+ end
1913
+
1914
+ # Take every nth row in the LazyFrame and return as a new LazyFrame.
1915
+ #
1916
+ # @param n [Integer]
+ # Gather every n-th row.
+ #
+ # @return [LazyFrame]
1917
+ #
1918
+ # @example
1919
+ # s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]}).lazy
1920
+ # s.take_every(2).collect
1921
+ # # =>
1922
+ # # shape: (2, 2)
1923
+ # # ┌─────┬─────┐
1924
+ # # │ a ┆ b │
1925
+ # # │ --- ┆ --- │
1926
+ # # │ i64 ┆ i64 │
1927
+ # # ╞═════╪═════╡
1928
+ # # │ 1 ┆ 5 │
1929
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1930
+ # # │ 3 ┆ 7 │
1931
+ # # └─────┴─────┘
1932
+ def take_every(n)
1933
+ select(Utils.col("*").take_every(n))
1934
+ end
1935
+
1936
+ # Fill null values using the specified value or strategy.
1937
+ #
1938
+ # @param value [Object]
+ # Value used to fill the null values.
+ # @param strategy [nil, "forward", "backward", "min", "max", "mean", "zero", "one"]
+ # Strategy used to fill the null values.
+ # @param limit [Integer]
+ # Number of consecutive null values to fill when using the "forward" or "backward" strategy.
+ # @param matches_supertype [Boolean]
+ # Accepted for signature compatibility; this method does not currently use it.
+ #
+ # @return [LazyFrame]
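+ #
+ # A minimal sketch, assuming a column with a single missing value
+ # (expected results described rather than rendered):
+ #
+ # @example
+ #   df = Polars::DataFrame.new({"a" => [1, nil, 3]}).lazy
+ #   df.fill_null(99).collect
+ #   # expected: the null in "a" is replaced by 99
+ #
+ # @example
+ #   df.fill_null(strategy: "forward").collect
+ #   # expected: the null in "a" is replaced by the previous value, 1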
1939
+ def fill_null(value = nil, strategy: nil, limit: nil, matches_supertype: nil)
1940
+ select(Polars.all.fill_null(value, strategy: strategy, limit: limit))
1941
+ end
1942
+
1943
+ # Fill floating point NaN values.
1944
+ #
1945
+ # @param fill_value [Object]
1946
+ # Value to fill the NaN values with.
1947
+ #
1948
+ # @return [LazyFrame]
1949
+ #
1950
+ # @note
1951
+ # Note that floating point NaN (Not a Number) values are not missing values!
1952
+ # To replace missing values, use `fill_null` instead.
1953
+ #
1954
+ # @example
1955
+ # df = Polars::DataFrame.new(
1956
+ # {
1957
+ # "a" => [1.5, 2, Float::NAN, 4],
1958
+ # "b" => [0.5, 4, Float::NAN, 13],
1959
+ # }
1960
+ # ).lazy
1961
+ # df.fill_nan(99).collect
1962
+ # # =>
1963
+ # # shape: (4, 2)
1964
+ # # ┌──────┬──────┐
1965
+ # # │ a ┆ b │
1966
+ # # │ --- ┆ --- │
1967
+ # # │ f64 ┆ f64 │
1968
+ # # ╞══════╪══════╡
1969
+ # # │ 1.5 ┆ 0.5 │
1970
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1971
+ # # │ 2.0 ┆ 4.0 │
1972
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1973
+ # # │ 99.0 ┆ 99.0 │
1974
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1975
+ # # │ 4.0 ┆ 13.0 │
1976
+ # # └──────┴──────┘
1977
+ def fill_nan(fill_value)
1978
+ if !fill_value.is_a?(Expr)
1979
+ fill_value = Utils.lit(fill_value)
1980
+ end
1981
+ _from_rbldf(_ldf.fill_nan(fill_value._rbexpr))
1982
+ end
1983
+
1984
+ # Aggregate the columns in the DataFrame to their standard deviation value.
1985
+ #
1986
+ # @return [LazyFrame]
1987
+ #
1988
+ # @example
1989
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1990
+ # df.std.collect
1991
+ # # =>
1992
+ # # shape: (1, 2)
1993
+ # # ┌──────────┬─────┐
1994
+ # # │ a ┆ b │
1995
+ # # │ --- ┆ --- │
1996
+ # # │ f64 ┆ f64 │
1997
+ # # ╞══════════╪═════╡
1998
+ # # │ 1.290994 ┆ 0.5 │
1999
+ # # └──────────┴─────┘
2000
+ #
2001
+ # @example
2002
+ # df.std(ddof: 0).collect
2003
+ # # =>
2004
+ # # shape: (1, 2)
2005
+ # # ┌──────────┬──────────┐
2006
+ # # │ a ┆ b │
2007
+ # # │ --- ┆ --- │
2008
+ # # │ f64 ┆ f64 │
2009
+ # # ╞══════════╪══════════╡
2010
+ # # │ 1.118034 ┆ 0.433013 │
2011
+ # # └──────────┴──────────┘
2012
+ def std(ddof: 1)
2013
+ _from_rbldf(_ldf.std(ddof))
2014
+ end
2015
+
2016
+ # Aggregate the columns in the DataFrame to their variance value.
2017
+ #
2018
+ # @return [LazyFrame]
2019
+ #
2020
+ # @example
2021
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2022
+ # df.var.collect
2023
+ # # =>
2024
+ # # shape: (1, 2)
2025
+ # # ┌──────────┬──────┐
2026
+ # # │ a ┆ b │
2027
+ # # │ --- ┆ --- │
2028
+ # # │ f64 ┆ f64 │
2029
+ # # ╞══════════╪══════╡
2030
+ # # │ 1.666667 ┆ 0.25 │
2031
+ # # └──────────┴──────┘
2032
+ #
2033
+ # @example
2034
+ # df.var(ddof: 0).collect
2035
+ # # =>
2036
+ # # shape: (1, 2)
2037
+ # # ┌──────┬────────┐
2038
+ # # │ a ┆ b │
2039
+ # # │ --- ┆ --- │
2040
+ # # │ f64 ┆ f64 │
2041
+ # # ╞══════╪════════╡
2042
+ # # │ 1.25 ┆ 0.1875 │
2043
+ # # └──────┴────────┘
2044
+ def var(ddof: 1)
2045
+ _from_rbldf(_ldf.var(ddof))
2046
+ end
2047
+
2048
+ # Aggregate the columns in the DataFrame to their maximum value.
2049
+ #
2050
+ # @return [LazyFrame]
2051
+ #
2052
+ # @example
2053
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2054
+ # df.max.collect
2055
+ # # =>
2056
+ # # shape: (1, 2)
2057
+ # # ┌─────┬─────┐
2058
+ # # │ a ┆ b │
2059
+ # # │ --- ┆ --- │
2060
+ # # │ i64 ┆ i64 │
2061
+ # # ╞═════╪═════╡
2062
+ # # │ 4 ┆ 2 │
2063
+ # # └─────┴─────┘
2064
+ def max
2065
+ _from_rbldf(_ldf.max)
2066
+ end
2067
+
2068
+ # Aggregate the columns in the DataFrame to their minimum value.
2069
+ #
2070
+ # @return [LazyFrame]
2071
+ #
2072
+ # @example
2073
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2074
+ # df.min.collect
2075
+ # # =>
2076
+ # # shape: (1, 2)
2077
+ # # ┌─────┬─────┐
2078
+ # # │ a ┆ b │
2079
+ # # │ --- ┆ --- │
2080
+ # # │ i64 ┆ i64 │
2081
+ # # ╞═════╪═════╡
2082
+ # # │ 1 ┆ 1 │
2083
+ # # └─────┴─────┘
2084
+ def min
2085
+ _from_rbldf(_ldf.min)
2086
+ end
2087
+
2088
+ # Aggregate the columns in the DataFrame to their sum value.
2089
+ #
2090
+ # @return [LazyFrame]
2091
+ #
2092
+ # @example
2093
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2094
+ # df.sum.collect
2095
+ # # =>
2096
+ # # shape: (1, 2)
2097
+ # # ┌─────┬─────┐
2098
+ # # │ a ┆ b │
2099
+ # # │ --- ┆ --- │
2100
+ # # │ i64 ┆ i64 │
2101
+ # # ╞═════╪═════╡
2102
+ # # │ 10 ┆ 5 │
2103
+ # # └─────┴─────┘
2104
+ def sum
2105
+ _from_rbldf(_ldf.sum)
2106
+ end
2107
+
2108
+ # Aggregate the columns in the DataFrame to their mean value.
2109
+ #
2110
+ # @return [LazyFrame]
2111
+ #
2112
+ # @example
2113
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2114
+ # df.mean.collect
2115
+ # # =>
2116
+ # # shape: (1, 2)
2117
+ # # ┌─────┬──────┐
2118
+ # # │ a ┆ b │
2119
+ # # │ --- ┆ --- │
2120
+ # # │ f64 ┆ f64 │
2121
+ # # ╞═════╪══════╡
2122
+ # # │ 2.5 ┆ 1.25 │
2123
+ # # └─────┴──────┘
2124
+ def mean
2125
+ _from_rbldf(_ldf.mean)
2126
+ end
2127
+
2128
+ # Aggregate the columns in the DataFrame to their median value.
2129
+ #
2130
+ # @return [LazyFrame]
2131
+ #
2132
+ # @example
2133
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2134
+ # df.median.collect
2135
+ # # =>
2136
+ # # shape: (1, 2)
2137
+ # # ┌─────┬─────┐
2138
+ # # │ a ┆ b │
2139
+ # # │ --- ┆ --- │
2140
+ # # │ f64 ┆ f64 │
2141
+ # # ╞═════╪═════╡
2142
+ # # │ 2.5 ┆ 1.0 │
2143
+ # # └─────┴─────┘
2144
+ def median
2145
+ _from_rbldf(_ldf.median)
2146
+ end
2147
+
2148
+ # Aggregate the columns in the DataFrame to their quantile value.
2149
+ #
2150
+ # @param quantile [Float]
2151
+ # Quantile between 0.0 and 1.0.
2152
+ # @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
2153
+ # Interpolation method.
2154
+ #
2155
+ # @return [LazyFrame]
2156
+ #
2157
+ # @example
2158
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2159
+ # df.quantile(0.7).collect
2160
+ # # =>
2161
+ # # shape: (1, 2)
2162
+ # # ┌─────┬─────┐
2163
+ # # │ a ┆ b │
2164
+ # # │ --- ┆ --- │
2165
+ # # │ f64 ┆ f64 │
2166
+ # # ╞═════╪═════╡
2167
+ # # │ 3.0 ┆ 1.0 │
2168
+ # # └─────┴─────┘
2169
+ def quantile(quantile, interpolation: "nearest")
2170
+ quantile = Utils.expr_to_lit_or_expr(quantile, str_to_lit: false)
2171
+ _from_rbldf(_ldf.quantile(quantile._rbexpr, interpolation))
2172
+ end
2173
+
2174
+ # Explode lists to long format.
2175
+ #
2176
+ # @return [LazyFrame]
2177
+ #
2178
+ # @example
2179
+ # df = Polars::DataFrame.new(
2180
+ # {
2181
+ # "letters" => ["a", "a", "b", "c"],
2182
+ # "numbers" => [[1], [2, 3], [4, 5], [6, 7, 8]],
2183
+ # }
2184
+ # ).lazy
2185
+ # df.explode("numbers").collect
2186
+ # # =>
2187
+ # # shape: (8, 2)
2188
+ # # ┌─────────┬─────────┐
2189
+ # # │ letters ┆ numbers │
2190
+ # # │ --- ┆ --- │
2191
+ # # │ str ┆ i64 │
2192
+ # # ╞═════════╪═════════╡
2193
+ # # │ a ┆ 1 │
2194
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2195
+ # # │ a ┆ 2 │
2196
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2197
+ # # │ a ┆ 3 │
2198
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2199
+ # # │ b ┆ 4 │
2200
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2201
+ # # │ b ┆ 5 │
2202
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2203
+ # # │ c ┆ 6 │
2204
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2205
+ # # │ c ┆ 7 │
2206
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2207
+ # # │ c ┆ 8 │
2208
+ # # └─────────┴─────────┘
2209
+ def explode(columns)
2210
+ columns = Utils.selection_to_rbexpr_list(columns)
2211
+ _from_rbldf(_ldf.explode(columns))
2212
+ end
2213
+
2214
+ # Drop duplicate rows from this DataFrame.
2215
+ #
2216
+ # Note that this fails if there is a column of type `List` in the DataFrame or
2217
+ # subset.
2218
+ #
2219
+ # @param maintain_order [Boolean]
2220
+ # Keep the same order as the original DataFrame. This requires more work to
2221
+ # compute.
2222
+ # @param subset [Object]
2223
+ # Subset to use to compare rows.
2224
+ # @param keep ["first", "last"]
2225
+ # Which of the duplicate rows to keep.
2226
+ #
2227
+ # @return [LazyFrame]
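+ #
+ # A minimal sketch, assuming one duplicated row (expected result described):
+ #
+ # @example
+ #   df = Polars::DataFrame.new({"a" => [1, 1, 2], "b" => ["x", "x", "y"]}).lazy
+ #   df.unique.collect
+ #   # expected: two rows remain, (1, "x") and (2, "y")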
2228
+ def unique(maintain_order: true, subset: nil, keep: "first")
2229
+ if !subset.nil? && !subset.is_a?(Array)
2230
+ subset = [subset]
2231
+ end
2232
+ _from_rbldf(_ldf.unique(maintain_order, subset, keep))
2233
+ end
2234
+
2235
+ # Drop rows with null values from this LazyFrame.
2236
+ #
2237
+ # @param subset [Object]
2238
+ # Subset of column(s) on which `drop_nulls` will be applied.
2239
+ #
2240
+ # @return [LazyFrame]
2241
+ #
2242
+ # @example
2243
+ # df = Polars::DataFrame.new(
2244
+ # {
2245
+ # "foo" => [1, 2, 3],
2246
+ # "bar" => [6, nil, 8],
2247
+ # "ham" => ["a", "b", "c"]
2248
+ # }
2249
+ # )
2250
+ # df.lazy.drop_nulls.collect
2251
+ # # =>
2252
+ # # shape: (2, 3)
2253
+ # # ┌─────┬─────┬─────┐
2254
+ # # │ foo ┆ bar ┆ ham │
2255
+ # # │ --- ┆ --- ┆ --- │
2256
+ # # │ i64 ┆ i64 ┆ str │
2257
+ # # ╞═════╪═════╪═════╡
2258
+ # # │ 1 ┆ 6 ┆ a │
2259
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
2260
+ # # │ 3 ┆ 8 ┆ c │
2261
+ # # └─────┴─────┴─────┘
2262
+ def drop_nulls(subset: nil)
2263
+ if !subset.nil? && !subset.is_a?(Array)
2264
+ subset = [subset]
2265
+ end
2266
+ _from_rbldf(_ldf.drop_nulls(subset))
2267
+ end
2268
+
2269
+ # Unpivot a DataFrame from wide to long format.
2270
+ #
2271
+ # Optionally leaves identifiers set.
2272
+ #
2273
+ # This function is useful to massage a DataFrame into a format where one or more
2274
+ # columns are identifier variables (id_vars), while all other columns, considered
2275
+ # measured variables (value_vars), are "unpivoted" to the row axis, leaving just
2276
+ # two non-identifier columns, 'variable' and 'value'.
2277
+ #
2278
+ # @param id_vars [Object]
2279
+ # Columns to use as identifier variables.
2280
+ # @param value_vars [Object]
2281
+ # Columns to unpivot into the `value` column (the measured variables).
2282
+ # If `value_vars` is empty all columns that are not in `id_vars` will be used.
2283
+ # @param variable_name [String]
2284
+ # Name to give to the `variable` column. Defaults to "variable"
2285
+ # @param value_name [String]
2286
+ # Name to give to the `value` column. Defaults to "value"
2287
+ #
2288
+ # @return [LazyFrame]
2289
+ #
2290
+ # @example
2291
+ # df = Polars::DataFrame.new(
2292
+ # {
2293
+ # "a" => ["x", "y", "z"],
2294
+ # "b" => [1, 3, 5],
2295
+ # "c" => [2, 4, 6]
2296
+ # }
2297
+ # ).lazy
2298
+ # df.melt(id_vars: "a", value_vars: ["b", "c"]).collect
2299
+ # # =>
2300
+ # # shape: (6, 3)
2301
+ # # ┌─────┬──────────┬───────┐
2302
+ # # │ a ┆ variable ┆ value │
2303
+ # # │ --- ┆ --- ┆ --- │
2304
+ # # │ str ┆ str ┆ i64 │
2305
+ # # ╞═════╪══════════╪═══════╡
2306
+ # # │ x ┆ b ┆ 1 │
2307
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2308
+ # # │ y ┆ b ┆ 3 │
2309
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2310
+ # # │ z ┆ b ┆ 5 │
2311
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2312
+ # # │ x ┆ c ┆ 2 │
2313
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2314
+ # # │ y ┆ c ┆ 4 │
2315
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2316
+ # # │ z ┆ c ┆ 6 │
2317
+ # # └─────┴──────────┴───────┘
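+ #
+ # When `value_vars` is omitted, every column not in `id_vars` is unpivoted;
+ # a sketch of that default (expected result described, not rendered output):
+ #
+ # @example
+ #   df.melt(id_vars: "a").collect
+ #   # expected: the same six rows as above, since "b" and "c" are both unpivoted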
2318
+ def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
2319
+ if value_vars.is_a?(String)
2320
+ value_vars = [value_vars]
2321
+ end
2322
+ if id_vars.is_a?(String)
2323
+ id_vars = [id_vars]
2324
+ end
2325
+ if value_vars.nil?
2326
+ value_vars = []
2327
+ end
2328
+ if id_vars.nil?
2329
+ id_vars = []
2330
+ end
2331
+ _from_rbldf(
2332
+ _ldf.melt(id_vars, value_vars, value_name, variable_name)
2333
+ )
2334
+ end
2335
+
2336
+ # def map
2337
+ # end
2338
+
2339
+ # Interpolate intermediate values. The interpolation method is linear.
2340
+ #
2341
+ # @return [LazyFrame]
2342
+ #
2343
+ # @example
2344
+ # df = Polars::DataFrame.new(
2345
+ # {
2346
+ # "foo" => [1, nil, 9, 10],
2347
+ # "bar" => [6, 7, 9, nil],
2348
+ # "baz" => [1, nil, nil, 9]
2349
+ # }
2350
+ # ).lazy
2351
+ # df.interpolate.collect
2352
+ # # =>
2353
+ # # shape: (4, 3)
2354
+ # # ┌─────┬──────┬─────┐
2355
+ # # │ foo ┆ bar ┆ baz │
2356
+ # # │ --- ┆ --- ┆ --- │
2357
+ # # │ i64 ┆ i64 ┆ i64 │
2358
+ # # ╞═════╪══════╪═════╡
2359
+ # # │ 1 ┆ 6 ┆ 1 │
2360
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
2361
+ # # │ 5 ┆ 7 ┆ 3 │
2362
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
2363
+ # # │ 9 ┆ 9 ┆ 6 │
2364
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
2365
+ # # │ 10 ┆ null ┆ 9 │
2366
+ # # └─────┴──────┴─────┘
2367
+ def interpolate
2368
+ select(Utils.col("*").interpolate)
2369
+ end
2370
+
2371
+ # Decompose a struct into its fields.
2372
+ #
2373
+ # The fields will be inserted into the `DataFrame` at the location of the
2374
+ # `struct` type.
2375
+ #
2376
+ # @param names [Object]
2377
+ # Names of the struct columns that will be decomposed into their fields.
2378
+ #
2379
+ # @return [LazyFrame]
2380
+ #
2381
+ # @example
2382
+ # df = (
2383
+ # Polars::DataFrame.new(
2384
+ # {
2385
+ # "before" => ["foo", "bar"],
2386
+ # "t_a" => [1, 2],
2387
+ # "t_b" => ["a", "b"],
2388
+ # "t_c" => [true, nil],
2389
+ # "t_d" => [[1, 2], [3]],
2390
+ # "after" => ["baz", "womp"]
2391
+ # }
2392
+ # )
2393
+ # .lazy
2394
+ # .select(
2395
+ # ["before", Polars.struct(Polars.col("^t_.$")).alias("t_struct"), "after"]
2396
+ # )
2397
+ # )
2398
+ # df.fetch
2399
+ # # =>
2400
+ # # shape: (2, 3)
2401
+ # # ┌────────┬─────────────────────┬───────┐
2402
+ # # │ before ┆ t_struct ┆ after │
2403
+ # # │ --- ┆ --- ┆ --- │
2404
+ # # │ str ┆ struct[4] ┆ str │
2405
+ # # ╞════════╪═════════════════════╪═══════╡
2406
+ # # │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │
2407
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2408
+ # # │ bar ┆ {2,"b",null,[3]} ┆ womp │
2409
+ # # └────────┴─────────────────────┴───────┘
2410
+ #
2411
+ # @example
2412
+ # df.unnest("t_struct").fetch
2413
+ # # =>
2414
+ # # shape: (2, 6)
2415
+ # # ┌────────┬─────┬─────┬──────┬───────────┬───────┐
2416
+ # # │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │
2417
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
2418
+ # # │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │
2419
+ # # ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
2420
+ # # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │
2421
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2422
+ # # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
2423
+ # # └────────┴─────┴─────┴──────┴───────────┴───────┘
2424
+ def unnest(names)
2425
+ if names.is_a?(String)
2426
+ names = [names]
2427
+ end
2428
+ _from_rbldf(_ldf.unnest(names))
2429
+ end
2430
+
2431
+ private
2432
+
2433
+ def initialize_copy(other)
2434
+ super
2435
+ self._ldf = _ldf._clone
2436
+ end
2437
+
2438
+ def _from_rbldf(rb_ldf)
2439
+ self.class._from_rbldf(rb_ldf)
2440
+ end
2441
+ end
2442
+ end