polars-df 0.2.0-arm64-darwin

Files changed (46)
  1. checksums.yaml +7 -0
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +33 -0
  4. data/Cargo.lock +2230 -0
  5. data/Cargo.toml +10 -0
  6. data/LICENSE-THIRD-PARTY.txt +38856 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +91 -0
  9. data/lib/polars/3.0/polars.bundle +0 -0
  10. data/lib/polars/3.1/polars.bundle +0 -0
  11. data/lib/polars/3.2/polars.bundle +0 -0
  12. data/lib/polars/batched_csv_reader.rb +96 -0
  13. data/lib/polars/cat_expr.rb +52 -0
  14. data/lib/polars/cat_name_space.rb +54 -0
  15. data/lib/polars/convert.rb +100 -0
  16. data/lib/polars/data_frame.rb +4833 -0
  17. data/lib/polars/data_types.rb +122 -0
  18. data/lib/polars/date_time_expr.rb +1418 -0
  19. data/lib/polars/date_time_name_space.rb +1484 -0
  20. data/lib/polars/dynamic_group_by.rb +52 -0
  21. data/lib/polars/exceptions.rb +20 -0
  22. data/lib/polars/expr.rb +5307 -0
  23. data/lib/polars/expr_dispatch.rb +22 -0
  24. data/lib/polars/functions.rb +453 -0
  25. data/lib/polars/group_by.rb +558 -0
  26. data/lib/polars/io.rb +814 -0
  27. data/lib/polars/lazy_frame.rb +2442 -0
  28. data/lib/polars/lazy_functions.rb +1195 -0
  29. data/lib/polars/lazy_group_by.rb +93 -0
  30. data/lib/polars/list_expr.rb +610 -0
  31. data/lib/polars/list_name_space.rb +346 -0
  32. data/lib/polars/meta_expr.rb +54 -0
  33. data/lib/polars/rolling_group_by.rb +35 -0
  34. data/lib/polars/series.rb +3730 -0
  35. data/lib/polars/slice.rb +104 -0
  36. data/lib/polars/string_expr.rb +972 -0
  37. data/lib/polars/string_name_space.rb +690 -0
  38. data/lib/polars/struct_expr.rb +100 -0
  39. data/lib/polars/struct_name_space.rb +64 -0
  40. data/lib/polars/utils.rb +192 -0
  41. data/lib/polars/version.rb +4 -0
  42. data/lib/polars/when.rb +16 -0
  43. data/lib/polars/when_then.rb +19 -0
  44. data/lib/polars-df.rb +1 -0
  45. data/lib/polars.rb +50 -0
  46. metadata +89 -0
data/lib/polars/lazy_frame.rb
@@ -0,0 +1,2442 @@
1
+ module Polars
2
+ # Representation of a Lazy computation graph/query against a DataFrame.
3
+ class LazyFrame
4
+ # @private
5
+ attr_accessor :_ldf
6
+
7
+ # @private
8
+ def self._from_rbldf(rb_ldf)
9
+ ldf = LazyFrame.allocate
10
+ ldf._ldf = rb_ldf
11
+ ldf
12
+ end
13
+
14
+ # @private
15
+ def self._scan_csv(
16
+ file,
17
+ has_header: true,
18
+ sep: ",",
19
+ comment_char: nil,
20
+ quote_char: '"',
21
+ skip_rows: 0,
22
+ dtypes: nil,
23
+ null_values: nil,
24
+ ignore_errors: false,
25
+ cache: true,
26
+ with_column_names: nil,
27
+ infer_schema_length: 100,
28
+ n_rows: nil,
29
+ encoding: "utf8",
30
+ low_memory: false,
31
+ rechunk: true,
32
+ skip_rows_after_header: 0,
33
+ row_count_name: nil,
34
+ row_count_offset: 0,
35
+ parse_dates: false,
36
+ eol_char: "\n"
37
+ )
38
+ dtype_list = nil
39
+ if !dtypes.nil?
40
+ dtype_list = []
41
+ dtypes.each do |k, v|
42
+ dtype_list << [k, Utils.rb_type_to_dtype(v)]
43
+ end
44
+ end
45
+ processed_null_values = Utils._process_null_values(null_values)
46
+
47
+ _from_rbldf(
48
+ RbLazyFrame.new_from_csv(
49
+ file,
50
+ sep,
51
+ has_header,
52
+ ignore_errors,
53
+ skip_rows,
54
+ n_rows,
55
+ cache,
56
+ dtype_list,
57
+ low_memory,
58
+ comment_char,
59
+ quote_char,
60
+ processed_null_values,
61
+ infer_schema_length,
62
+ with_column_names,
63
+ rechunk,
64
+ skip_rows_after_header,
65
+ encoding,
66
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
67
+ parse_dates,
68
+ eol_char
69
+ )
70
+ )
71
+ end
72
+
73
+ # @private
74
+ def self._scan_parquet(
75
+ file,
76
+ n_rows: nil,
77
+ cache: true,
78
+ parallel: "auto",
79
+ rechunk: true,
80
+ row_count_name: nil,
81
+ row_count_offset: 0,
82
+ storage_options: nil,
83
+ low_memory: false
84
+ )
85
+ _from_rbldf(
86
+ RbLazyFrame.new_from_parquet(
87
+ file,
88
+ n_rows,
89
+ cache,
90
+ parallel,
91
+ rechunk,
92
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
93
+ low_memory
94
+ )
95
+ )
96
+ end
97
+
98
+ # @private
99
+ def self._scan_ipc(
100
+ file,
101
+ n_rows: nil,
102
+ cache: true,
103
+ rechunk: true,
104
+ row_count_name: nil,
105
+ row_count_offset: 0,
106
+ storage_options: nil,
107
+ memory_map: true
108
+ )
109
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
110
+ file = Utils.format_path(file)
111
+ end
112
+
113
+ _from_rbldf(
114
+ RbLazyFrame.new_from_ipc(
115
+ file,
116
+ n_rows,
117
+ cache,
118
+ rechunk,
119
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
120
+ memory_map
121
+ )
122
+ )
123
+ end
124
+
125
+ # @private
126
+ def self._scan_ndjson(
127
+ file,
128
+ infer_schema_length: nil,
129
+ batch_size: nil,
130
+ n_rows: nil,
131
+ low_memory: false,
132
+ rechunk: true,
133
+ row_count_name: nil,
134
+ row_count_offset: 0
135
+ )
136
+ _from_rbldf(
137
+ RbLazyFrame.new_from_ndjson(
138
+ file,
139
+ infer_schema_length,
140
+ batch_size,
141
+ n_rows,
142
+ low_memory,
143
+ rechunk,
144
+ Utils._prepare_row_count_args(row_count_name, row_count_offset)
145
+ )
146
+ )
147
+ end
148
+
149
+ # def self.from_json
150
+ # end
151
+
152
+ # Read a logical plan from a JSON file to construct a LazyFrame.
153
+ #
154
+ # @param file [String]
155
+ # Path to a file or a file-like object.
156
+ #
157
+ # @return [LazyFrame]
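+ #
+ # @example Minimal sketch, assuming a plan was previously saved to a hypothetical local "plan.json":
+ # lf = Polars::LazyFrame.read_json("plan.json")
+ # lf.collect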
158
+ def self.read_json(file)
159
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
160
+ file = Utils.format_path(file)
161
+ end
162
+
163
+ Utils.wrap_ldf(RbLazyFrame.read_json(file))
164
+ end
165
+
166
+ # Get or set column names.
167
+ #
168
+ # @return [Array]
169
+ #
170
+ # @example
171
+ # df = (
172
+ # Polars::DataFrame.new(
173
+ # {
174
+ # "foo" => [1, 2, 3],
175
+ # "bar" => [6, 7, 8],
176
+ # "ham" => ["a", "b", "c"]
177
+ # }
178
+ # )
179
+ # .lazy
180
+ # .select(["foo", "bar"])
181
+ # )
182
+ # df.columns
183
+ # # => ["foo", "bar"]
184
+ def columns
185
+ _ldf.columns
186
+ end
187
+
188
+ # Get dtypes of columns in LazyFrame.
189
+ #
190
+ # @return [Array]
191
+ #
192
+ # @example
193
+ # lf = Polars::DataFrame.new(
194
+ # {
195
+ # "foo" => [1, 2, 3],
196
+ # "bar" => [6.0, 7.0, 8.0],
197
+ # "ham" => ["a", "b", "c"]
198
+ # }
199
+ # ).lazy
200
+ # lf.dtypes
201
+ # # => [Polars::Int64, Polars::Float64, Polars::Utf8]
202
+ def dtypes
203
+ _ldf.dtypes
204
+ end
205
+
206
+ # Get the schema.
207
+ #
208
+ # @return [Hash]
209
+ #
210
+ # @example
211
+ # lf = Polars::DataFrame.new(
212
+ # {
213
+ # "foo" => [1, 2, 3],
214
+ # "bar" => [6.0, 7.0, 8.0],
215
+ # "ham" => ["a", "b", "c"]
216
+ # }
217
+ # ).lazy
218
+ # lf.schema
219
+ # # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::Utf8}
220
+ def schema
221
+ _ldf.schema
222
+ end
223
+
224
+ # Get the width of the LazyFrame.
225
+ #
226
+ # @return [Integer]
227
+ #
228
+ # @example
229
+ # lf = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]}).lazy
230
+ # lf.width
231
+ # # => 2
232
+ def width
233
+ _ldf.width
234
+ end
235
+
236
+ # Check if LazyFrame includes key.
237
+ #
238
+ # @return [Boolean]
239
+ def include?(key)
240
+ columns.include?(key)
241
+ end
242
+
243
+ # clone handled by initialize_copy
244
+
245
+ # def [](item)
246
+ # end
247
+
248
+ # Returns a string representing the LazyFrame.
249
+ #
250
+ # @return [String]
251
+ def to_s
252
+ <<~EOS
253
+ naive plan: (run LazyFrame#describe_optimized_plan to see the optimized plan)
254
+
255
+ #{describe_plan}
256
+ EOS
257
+ end
258
+
259
+ # Write the logical plan of this LazyFrame to a file or string in JSON format.
260
+ #
261
+ # @param file [String]
262
+ # File path to which the result should be written.
263
+ #
264
+ # @return [nil]
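+ #
+ # @example Minimal sketch, assuming "plan.json" is a hypothetical writable path:
+ # lf = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy
+ # lf.write_json("plan.json")
+ # # => nil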
265
+ def write_json(file)
266
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
267
+ file = Utils.format_path(file)
268
+ end
269
+ _ldf.write_json(file)
270
+ nil
271
+ end
272
+
273
+ # Offers a structured way to apply a sequence of user-defined functions (UDFs).
274
+ #
275
+ # @param func [Object]
276
+ # Callable; will receive the frame as the first parameter,
277
+ # followed by any given args/kwargs.
278
+ # @param args [Object]
279
+ # Arguments to pass to the UDF.
280
+ # @param kwargs [Object]
281
+ # Keyword arguments to pass to the UDF.
282
+ #
283
+ # @return [LazyFrame]
284
+ #
285
+ # @example
286
+ # cast_str_to_int = lambda do |data, col_name:|
287
+ # data.with_column(Polars.col(col_name).cast(:i64))
288
+ # end
289
+ #
290
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]}).lazy
291
+ # df.pipe(cast_str_to_int, col_name: "b").collect()
292
+ # # =>
293
+ # # shape: (4, 2)
294
+ # # ┌─────┬─────┐
295
+ # # │ a ┆ b │
296
+ # # │ --- ┆ --- │
297
+ # # │ i64 ┆ i64 │
298
+ # # ╞═════╪═════╡
299
+ # # │ 1 ┆ 10 │
300
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
301
+ # # │ 2 ┆ 20 │
302
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
303
+ # # │ 3 ┆ 30 │
304
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
305
+ # # │ 4 ┆ 40 │
306
+ # # └─────┴─────┘
307
+ def pipe(func, *args, **kwargs, &block)
308
+ func.call(self, *args, **kwargs, &block)
309
+ end
310
+
311
+ # Create a string representation of the unoptimized query plan.
312
+ #
313
+ # @return [String]
314
+ def describe_plan
315
+ _ldf.describe_plan
316
+ end
317
+
318
+ # Create a string representation of the optimized query plan.
319
+ #
320
+ # @return [String]
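+ #
+ # @example Minimal sketch of printing the optimized plan for a simple query:
+ # lf = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy
+ # puts lf.filter(Polars.col("a") > 1).describe_optimized_plan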
321
+ def describe_optimized_plan(
322
+ type_coercion: true,
323
+ predicate_pushdown: true,
324
+ projection_pushdown: true,
325
+ simplify_expression: true,
326
+ slice_pushdown: true,
327
+ common_subplan_elimination: true,
328
+ allow_streaming: false
329
+ )
330
+ ldf = _ldf.optimization_toggle(
331
+ type_coercion,
332
+ predicate_pushdown,
333
+ projection_pushdown,
334
+ simplify_expression,
335
+ slice_pushdown,
336
+ common_subplan_elimination,
337
+ allow_streaming,
338
+ )
339
+
340
+ ldf.describe_optimized_plan
341
+ end
342
+
343
+ # def show_graph
344
+ # end
345
+
346
+ # Sort the DataFrame.
347
+ #
348
+ # Sorting can be done by:
349
+ #
350
+ # - A single column name
351
+ # - An expression
352
+ # - Multiple expressions
353
+ #
354
+ # @param by [Object]
355
+ # Column (expressions) to sort by.
356
+ # @param reverse [Boolean]
357
+ # Sort in descending order.
358
+ # @param nulls_last [Boolean]
359
+ # Place null values last. Can only be used if sorted by a single column.
360
+ #
361
+ # @return [LazyFrame]
362
+ #
363
+ # @example
364
+ # df = Polars::DataFrame.new(
365
+ # {
366
+ # "foo" => [1, 2, 3],
367
+ # "bar" => [6.0, 7.0, 8.0],
368
+ # "ham" => ["a", "b", "c"]
369
+ # }
370
+ # ).lazy
371
+ # df.sort("foo", reverse: true).collect
372
+ # # =>
373
+ # # shape: (3, 3)
374
+ # # ┌─────┬─────┬─────┐
375
+ # # │ foo ┆ bar ┆ ham │
376
+ # # │ --- ┆ --- ┆ --- │
377
+ # # │ i64 ┆ f64 ┆ str │
378
+ # # ╞═════╪═════╪═════╡
379
+ # # │ 3 ┆ 8.0 ┆ c │
380
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
381
+ # # │ 2 ┆ 7.0 ┆ b │
382
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
383
+ # # │ 1 ┆ 6.0 ┆ a │
384
+ # # └─────┴─────┴─────┘
385
+ def sort(by, reverse: false, nulls_last: false)
386
+ if by.is_a?(String)
387
+ return _from_rbldf(_ldf.sort(by, reverse, nulls_last))
388
+ end
389
+ if Utils.bool?(reverse)
390
+ reverse = [reverse]
391
+ end
392
+
393
+ by = Utils.selection_to_rbexpr_list(by)
394
+ _from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last))
395
+ end
396
+
397
+ # def profile
398
+ # end
399
+
400
+ # Collect into a DataFrame.
401
+ #
402
+ # Note: use {#fetch} if you want to run your query on the first `n` rows
403
+ # only. This can be a huge time saver in debugging queries.
404
+ #
405
+ # @param type_coercion [Boolean]
406
+ # Do type coercion optimization.
407
+ # @param predicate_pushdown [Boolean]
408
+ # Do predicate pushdown optimization.
409
+ # @param projection_pushdown [Boolean]
410
+ # Do projection pushdown optimization.
411
+ # @param simplify_expression [Boolean]
412
+ # Run simplify expressions optimization.
413
+ # @param string_cache [Boolean]
414
+ # This argument is deprecated. Please set the string cache globally.
415
+ # The argument will be ignored.
416
+ # @param no_optimization [Boolean]
417
+ # Turn off (certain) optimizations.
418
+ # @param slice_pushdown [Boolean]
419
+ # Slice pushdown optimization.
420
+ # @param common_subplan_elimination [Boolean]
421
+ # Will try to cache branching subplans that occur on self-joins or unions.
422
+ # @param allow_streaming [Boolean]
423
+ # Run parts of the query in a streaming fashion (this is in an alpha state)
424
+ #
425
+ # @return [DataFrame]
426
+ #
427
+ # @example
428
+ # df = Polars::DataFrame.new(
429
+ # {
430
+ # "a" => ["a", "b", "a", "b", "b", "c"],
431
+ # "b" => [1, 2, 3, 4, 5, 6],
432
+ # "c" => [6, 5, 4, 3, 2, 1]
433
+ # }
434
+ # ).lazy
435
+ # df.groupby("a", maintain_order: true).agg(Polars.all.sum).collect
436
+ # # =>
437
+ # # shape: (3, 3)
438
+ # # ┌─────┬─────┬─────┐
439
+ # # │ a ┆ b ┆ c │
440
+ # # │ --- ┆ --- ┆ --- │
441
+ # # │ str ┆ i64 ┆ i64 │
442
+ # # ╞═════╪═════╪═════╡
443
+ # # │ a ┆ 4 ┆ 10 │
444
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
445
+ # # │ b ┆ 11 ┆ 10 │
446
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
447
+ # # │ c ┆ 6 ┆ 1 │
448
+ # # └─────┴─────┴─────┘
449
+ def collect(
450
+ type_coercion: true,
451
+ predicate_pushdown: true,
452
+ projection_pushdown: true,
453
+ simplify_expression: true,
454
+ string_cache: false,
455
+ no_optimization: false,
456
+ slice_pushdown: true,
457
+ common_subplan_elimination: true,
458
+ allow_streaming: false
459
+ )
460
+ if no_optimization
461
+ predicate_pushdown = false
462
+ projection_pushdown = false
463
+ slice_pushdown = false
464
+ common_subplan_elimination = false
465
+ end
466
+
467
+ if allow_streaming
468
+ common_subplan_elimination = false
469
+ end
470
+
471
+ ldf = _ldf.optimization_toggle(
472
+ type_coercion,
473
+ predicate_pushdown,
474
+ projection_pushdown,
475
+ simplify_expression,
476
+ slice_pushdown,
477
+ common_subplan_elimination,
478
+ allow_streaming
479
+ )
480
+ Utils.wrap_df(ldf.collect)
481
+ end
482
+
483
+ # Collect a small number of rows for debugging purposes.
484
+ #
485
+ # Fetch is like a {#collect} operation, but it overwrites the number of rows
486
+ # read by every scan operation. This is a utility that helps debug a query on a
487
+ # smaller number of rows.
488
+ #
489
+ # Note that the fetch does not guarantee the final number of rows in the
490
+ # DataFrame. Filter, join operations and a lower number of rows available in the
491
+ # scanned file influence the final number of rows.
492
+ #
493
+ # @param n_rows [Integer]
494
+ # Collect n_rows from the data sources.
495
+ # @param type_coercion [Boolean]
496
+ # Run type coercion optimization.
497
+ # @param predicate_pushdown [Boolean]
498
+ # Run predicate pushdown optimization.
499
+ # @param projection_pushdown [Boolean]
500
+ # Run projection pushdown optimization.
501
+ # @param simplify_expression [Boolean]
502
+ # Run simplify expressions optimization.
503
+ # @param string_cache [Boolean]
504
+ # This argument is deprecated. Please set the string cache globally.
505
+ # The argument will be ignored.
506
+ # @param no_optimization [Boolean]
507
+ # Turn off optimizations.
508
+ # @param slice_pushdown [Boolean]
509
+ # Slice pushdown optimization
510
+ # @param common_subplan_elimination [Boolean]
511
+ # Will try to cache branching subplans that occur on self-joins or unions.
512
+ # @param allow_streaming [Boolean]
513
+ # Run parts of the query in a streaming fashion (this is in an alpha state)
514
+ #
515
+ # @return [DataFrame]
516
+ #
517
+ # @example
518
+ # df = Polars::DataFrame.new(
519
+ # {
520
+ # "a" => ["a", "b", "a", "b", "b", "c"],
521
+ # "b" => [1, 2, 3, 4, 5, 6],
522
+ # "c" => [6, 5, 4, 3, 2, 1]
523
+ # }
524
+ # ).lazy
525
+ # df.groupby("a", maintain_order: true).agg(Polars.all.sum).fetch(2)
526
+ # # =>
527
+ # # shape: (2, 3)
528
+ # # ┌─────┬─────┬─────┐
529
+ # # │ a ┆ b ┆ c │
530
+ # # │ --- ┆ --- ┆ --- │
531
+ # # │ str ┆ i64 ┆ i64 │
532
+ # # ╞═════╪═════╪═════╡
533
+ # # │ a ┆ 1 ┆ 6 │
534
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
535
+ # # │ b ┆ 2 ┆ 5 │
536
+ # # └─────┴─────┴─────┘
537
+ def fetch(
538
+ n_rows = 500,
539
+ type_coercion: true,
540
+ predicate_pushdown: true,
541
+ projection_pushdown: true,
542
+ simplify_expression: true,
543
+ string_cache: false,
544
+ no_optimization: false,
545
+ slice_pushdown: true,
546
+ common_subplan_elimination: true,
547
+ allow_streaming: false
548
+ )
549
+ if no_optimization
550
+ predicate_pushdown = false
551
+ projection_pushdown = false
552
+ slice_pushdown = false
553
+ common_subplan_elimination = false
554
+ end
555
+
556
+ ldf = _ldf.optimization_toggle(
557
+ type_coercion,
558
+ predicate_pushdown,
559
+ projection_pushdown,
560
+ simplify_expression,
561
+ slice_pushdown,
562
+ common_subplan_elimination,
563
+ allow_streaming
564
+ )
565
+ Utils.wrap_df(ldf.fetch(n_rows))
566
+ end
567
+
568
+ # Return lazy representation, i.e. itself.
569
+ #
570
+ # Useful for writing code that expects either a `DataFrame` or
571
+ # `LazyFrame`.
572
+ #
573
+ # @return [LazyFrame]
574
+ #
575
+ # @example
576
+ # df = Polars::DataFrame.new(
577
+ # {
578
+ # "a" => [nil, 2, 3, 4],
579
+ # "b" => [0.5, nil, 2.5, 13],
580
+ # "c" => [true, true, false, nil]
581
+ # }
582
+ # )
583
+ # df.lazy
584
+ def lazy
585
+ self
586
+ end
587
+
588
+ # Cache the result once the execution of the physical plan hits this node.
589
+ #
590
+ # @return [LazyFrame]
591
+ def cache
592
+ _from_rbldf(_ldf.cache)
593
+ end
594
+
595
+ # Create an empty copy of the current LazyFrame.
596
+ #
597
+ # The copy has an identical schema but no data.
598
+ #
599
+ # @return [LazyFrame]
600
+ #
601
+ # @example
602
+ # df = Polars::DataFrame.new(
603
+ # {
604
+ # "a" => [nil, 2, 3, 4],
605
+ # "b" => [0.5, nil, 2.5, 13],
606
+ # "c" => [true, true, false, nil],
607
+ # }
608
+ # ).lazy
609
+ # df.cleared.fetch
610
+ # # =>
611
+ # # shape: (0, 3)
612
+ # # ┌─────┬─────┬──────┐
613
+ # # │ a ┆ b ┆ c │
614
+ # # │ --- ┆ --- ┆ --- │
615
+ # # │ i64 ┆ f64 ┆ bool │
616
+ # # ╞═════╪═════╪══════╡
617
+ # # └─────┴─────┴──────┘
618
+ def cleared
619
+ DataFrame.new(columns: schema).lazy
620
+ end
621
+
622
+ # Filter the rows in the DataFrame based on a predicate expression.
623
+ #
624
+ # @param predicate [Object]
625
+ # Expression that evaluates to a boolean Series.
626
+ #
627
+ # @return [LazyFrame]
628
+ #
629
+ # @example Filter on one condition:
630
+ # lf = Polars::DataFrame.new(
631
+ # {
632
+ # "foo" => [1, 2, 3],
633
+ # "bar" => [6, 7, 8],
634
+ # "ham" => ["a", "b", "c"]
635
+ # }
636
+ # ).lazy
637
+ # lf.filter(Polars.col("foo") < 3).collect
638
+ # # =>
639
+ # # shape: (2, 3)
640
+ # # ┌─────┬─────┬─────┐
641
+ # # │ foo ┆ bar ┆ ham │
642
+ # # │ --- ┆ --- ┆ --- │
643
+ # # │ i64 ┆ i64 ┆ str │
644
+ # # ╞═════╪═════╪═════╡
645
+ # # │ 1 ┆ 6 ┆ a │
646
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
647
+ # # │ 2 ┆ 7 ┆ b │
648
+ # # └─────┴─────┴─────┘
649
+ #
650
+ # @example Filter on multiple conditions:
651
+ # lf.filter((Polars.col("foo") < 3) & (Polars.col("ham") == "a")).collect
652
+ # # =>
653
+ # # shape: (1, 3)
654
+ # # ┌─────┬─────┬─────┐
655
+ # # │ foo ┆ bar ┆ ham │
656
+ # # │ --- ┆ --- ┆ --- │
657
+ # # │ i64 ┆ i64 ┆ str │
658
+ # # ╞═════╪═════╪═════╡
659
+ # # │ 1 ┆ 6 ┆ a │
660
+ # # └─────┴─────┴─────┘
661
+ def filter(predicate)
662
+ _from_rbldf(
663
+ _ldf.filter(
664
+ Utils.expr_to_lit_or_expr(predicate, str_to_lit: false)._rbexpr
665
+ )
666
+ )
667
+ end
668
+
669
+ # Select columns from this DataFrame.
670
+ #
671
+ # @param exprs [Object]
672
+ # Column or columns to select.
673
+ #
674
+ # @return [LazyFrame]
675
+ #
676
+ # @example
677
+ # df = Polars::DataFrame.new(
678
+ # {
679
+ # "foo" => [1, 2, 3],
680
+ # "bar" => [6, 7, 8],
681
+ # "ham" => ["a", "b", "c"],
682
+ # }
683
+ # ).lazy
684
+ # df.select("foo").collect
685
+ # # =>
686
+ # # shape: (3, 1)
687
+ # # ┌─────┐
688
+ # # │ foo │
689
+ # # │ --- │
690
+ # # │ i64 │
691
+ # # ╞═════╡
692
+ # # │ 1 │
693
+ # # ├╌╌╌╌╌┤
694
+ # # │ 2 │
695
+ # # ├╌╌╌╌╌┤
696
+ # # │ 3 │
697
+ # # └─────┘
698
+ #
699
+ # @example
700
+ # df.select(["foo", "bar"]).collect
701
+ # # =>
702
+ # # shape: (3, 2)
703
+ # # ┌─────┬─────┐
704
+ # # │ foo ┆ bar │
705
+ # # │ --- ┆ --- │
706
+ # # │ i64 ┆ i64 │
707
+ # # ╞═════╪═════╡
708
+ # # │ 1 ┆ 6 │
709
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
710
+ # # │ 2 ┆ 7 │
711
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
712
+ # # │ 3 ┆ 8 │
713
+ # # └─────┴─────┘
714
+ #
715
+ # @example
716
+ # df.select(Polars.col("foo") + 1).collect
717
+ # # =>
718
+ # # shape: (3, 1)
719
+ # # ┌─────┐
720
+ # # │ foo │
721
+ # # │ --- │
722
+ # # │ i64 │
723
+ # # ╞═════╡
724
+ # # │ 2 │
725
+ # # ├╌╌╌╌╌┤
726
+ # # │ 3 │
727
+ # # ├╌╌╌╌╌┤
728
+ # # │ 4 │
729
+ # # └─────┘
730
+ #
731
+ # @example
732
+ # df.select([Polars.col("foo") + 1, Polars.col("bar") + 1]).collect
733
+ # # =>
734
+ # # shape: (3, 2)
735
+ # # ┌─────┬─────┐
736
+ # # │ foo ┆ bar │
737
+ # # │ --- ┆ --- │
738
+ # # │ i64 ┆ i64 │
739
+ # # ╞═════╪═════╡
740
+ # # │ 2 ┆ 7 │
741
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
742
+ # # │ 3 ┆ 8 │
743
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
744
+ # # │ 4 ┆ 9 │
745
+ # # └─────┴─────┘
746
+ #
747
+ # @example
748
+ # df.select(Polars.when(Polars.col("foo") > 2).then(10).otherwise(0)).collect
749
+ # # =>
750
+ # # shape: (3, 1)
751
+ # # ┌─────────┐
752
+ # # │ literal │
753
+ # # │ --- │
754
+ # # │ i64 │
755
+ # # ╞═════════╡
756
+ # # │ 0 │
757
+ # # ├╌╌╌╌╌╌╌╌╌┤
758
+ # # │ 0 │
759
+ # # ├╌╌╌╌╌╌╌╌╌┤
760
+ # # │ 10 │
761
+ # # └─────────┘
762
+ def select(exprs)
763
+ exprs = Utils.selection_to_rbexpr_list(exprs)
764
+ _from_rbldf(_ldf.select(exprs))
765
+ end
766
+
767
+ # Start a groupby operation.
768
+ #
769
+ # @param by [Object]
770
+ # Column(s) to group by.
771
+ # @param maintain_order [Boolean]
772
+ # Make sure that the order of the groups remain consistent. This is more
773
+ # expensive than a default groupby.
774
+ #
775
+ # @return [LazyGroupBy]
776
+ #
777
+ # @example
778
+ # df = Polars::DataFrame.new(
779
+ # {
780
+ # "a" => ["a", "b", "a", "b", "b", "c"],
781
+ # "b" => [1, 2, 3, 4, 5, 6],
782
+ # "c" => [6, 5, 4, 3, 2, 1]
783
+ # }
784
+ # ).lazy
785
+ # df.groupby("a", maintain_order: true).agg(Polars.col("b").sum).collect
786
+ # # =>
787
+ # # shape: (3, 2)
788
+ # # ┌─────┬─────┐
789
+ # # │ a ┆ b │
790
+ # # │ --- ┆ --- │
791
+ # # │ str ┆ i64 │
792
+ # # ╞═════╪═════╡
793
+ # # │ a ┆ 4 │
794
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
795
+ # # │ b ┆ 11 │
796
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
797
+ # # │ c ┆ 6 │
798
+ # # └─────┴─────┘
799
+ def groupby(by, maintain_order: false)
800
+ rbexprs_by = Utils.selection_to_rbexpr_list(by)
801
+ lgb = _ldf.groupby(rbexprs_by, maintain_order)
802
+ LazyGroupBy.new(lgb, self.class)
803
+ end
804
+
805
+ # Create rolling groups based on a time column.
806
+ #
807
+ # Also works for index values of type `:i32` or `:i64`.
808
+ #
809
+ # Unlike `groupby_dynamic`, the windows are determined by the individual
+ # values rather than constant intervals. For constant intervals use
+ # *groupby_dynamic*.
812
+ #
813
+ # The `period` and `offset` arguments are created either from a timedelta, or
814
+ # by using the following string language:
815
+ #
816
+ # - 1ns (1 nanosecond)
817
+ # - 1us (1 microsecond)
818
+ # - 1ms (1 millisecond)
819
+ # - 1s (1 second)
820
+ # - 1m (1 minute)
821
+ # - 1h (1 hour)
822
+ # - 1d (1 day)
823
+ # - 1w (1 week)
824
+ # - 1mo (1 calendar month)
825
+ # - 1y (1 calendar year)
826
+ # - 1i (1 index count)
827
+ #
828
+ # Or combine them:
829
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
830
+ #
831
+ # In case of a groupby_rolling on an integer column, the windows are defined by:
832
+ #
833
+ # - "1i" # length 1
834
+ # - "10i" # length 10
835
+ #
836
+ # @param index_column [Object]
837
+ # Column used to group based on the time window.
838
+ # Often of type Date/Datetime.
839
+ # This column must be sorted in ascending order. If not the output will not
840
+ # make sense.
841
+ #
842
+ # In case of a rolling groupby on indices, dtype needs to be one of
843
+ # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
844
+ # performance matters use an `:i64` column.
845
+ # @param period [Object]
846
+ # Length of the window.
847
+ # @param offset [Object]
848
+ # Offset of the window. Default is -period.
849
+ # @param closed ["right", "left", "both", "none"]
850
+ # Define whether the temporal window interval is closed or not.
851
+ # @param by [Object]
852
+ # Also group by this column/these columns.
853
+ #
854
+ # @return [LazyGroupBy]
855
+ #
856
+ # @example
857
+ # dates = [
858
+ # "2020-01-01 13:45:48",
859
+ # "2020-01-01 16:42:13",
860
+ # "2020-01-01 16:45:09",
861
+ # "2020-01-02 18:12:48",
862
+ # "2020-01-03 19:45:32",
863
+ # "2020-01-08 23:16:43"
864
+ # ]
865
+ # df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
866
+ # Polars.col("dt").str.strptime(:datetime)
867
+ # )
868
+ # df.groupby_rolling(index_column: "dt", period: "2d").agg(
869
+ # [
870
+ # Polars.sum("a").alias("sum_a"),
871
+ # Polars.min("a").alias("min_a"),
872
+ # Polars.max("a").alias("max_a")
873
+ # ]
874
+ # )
875
+ # # =>
876
+ # # shape: (6, 4)
877
+ # # ┌─────────────────────┬───────┬───────┬───────┐
878
+ # # │ dt ┆ sum_a ┆ min_a ┆ max_a │
879
+ # # │ --- ┆ --- ┆ --- ┆ --- │
880
+ # # │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │
881
+ # # ╞═════════════════════╪═══════╪═══════╪═══════╡
882
+ # # │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │
883
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
884
+ # # │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │
885
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
886
+ # # │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │
887
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
888
+ # # │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │
889
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
890
+ # # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
891
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
892
+ # # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
893
+ # # └─────────────────────┴───────┴───────┴───────┘
894
+ def groupby_rolling(
895
+ index_column:,
896
+ period:,
897
+ offset: nil,
898
+ closed: "right",
899
+ by: nil
900
+ )
901
+ if offset.nil?
902
+ offset = "-#{period}"
903
+ end
904
+
905
+ rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
906
+ period = Utils._timedelta_to_pl_duration(period)
907
+ offset = Utils._timedelta_to_pl_duration(offset)
908
+
909
+ lgb = _ldf.groupby_rolling(
910
+ index_column, period, offset, closed, rbexprs_by
911
+ )
912
+ LazyGroupBy.new(lgb, self.class)
913
+ end
914
+
915
+ # Group based on a time value (or index value of type `:i32`, `:i64`).
916
+ #
917
+ # Time windows are calculated and rows are assigned to windows. Different from a
918
+ # normal groupby is that a row can be member of multiple groups. The time/index
919
+ # window could be seen as a rolling window, with a window size determined by
920
+ # dates/times/values instead of slots in the DataFrame.
921
+ #
922
+ # A window is defined by:
923
+ #
924
+ # - every: interval of the window
925
+ # - period: length of the window
926
+ # - offset: offset of the window
927
+ #
928
+ # The `every`, `period` and `offset` arguments are created with
929
+ # the following string language:
930
+ #
931
+ # - 1ns (1 nanosecond)
932
+ # - 1us (1 microsecond)
933
+ # - 1ms (1 millisecond)
934
+ # - 1s (1 second)
935
+ # - 1m (1 minute)
936
+ # - 1h (1 hour)
937
+ # - 1d (1 day)
938
+ # - 1w (1 week)
939
+ # - 1mo (1 calendar month)
940
+ # - 1y (1 calendar year)
941
+ # - 1i (1 index count)
942
+ #
943
+ # Or combine them:
944
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
945
+ #
946
+ # In case of a groupby_dynamic on an integer column, the windows are defined by:
947
+ #
948
+ # - "1i" # length 1
949
+ # - "10i" # length 10
950
+ #
951
+ # @param index_column
952
+ # Column used to group based on the time window.
953
+ # Often of type Date/Datetime.
954
+ # This column must be sorted in ascending order. If not the output will not
955
+ # make sense.
956
+ #
957
+ # In case of a dynamic groupby on indices, dtype needs to be one of
958
+ # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
959
+ # performance matters use an `:i64` column.
960
+ # @param every
961
+ # Interval of the window.
962
+ # @param period
963
+ # Length of the window. If nil, it is equal to `every`.
964
+ # @param offset
965
+ # Offset of the window. If nil and `period` is nil, it will be equal to
+ # negative `every`.
967
+ # @param truncate
968
+ # Truncate the time value to the window lower bound.
969
+ # @param include_boundaries
970
+ # Add the lower and upper bound of the window to the "_lower_bound" and
971
+ # "_upper_bound" columns. This will impact performance because it's harder to
972
+ # parallelize
973
+ # @param closed ["right", "left", "both", "none"]
974
+ # Define whether the temporal window interval is closed or not.
975
+ # @param by
976
+ # Also group by this column/these columns
977
+ #
978
+ # @return [LazyGroupBy]
979
+ #
980
+ # @example
981
+ # df = Polars::DataFrame.new(
982
+ # {
983
+ # "time" => Polars.date_range(
984
+ # DateTime.new(2021, 12, 16),
985
+ # DateTime.new(2021, 12, 16, 3),
986
+ # "30m"
987
+ # ),
988
+ # "n" => 0..6
989
+ # }
990
+ # )
991
+ # # =>
992
+ # # shape: (7, 2)
993
+ # # ┌─────────────────────┬─────┐
994
+ # # │ time ┆ n │
995
+ # # │ --- ┆ --- │
996
+ # # │ datetime[μs] ┆ i64 │
997
+ # # ╞═════════════════════╪═════╡
998
+ # # │ 2021-12-16 00:00:00 ┆ 0 │
999
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1000
+ # # │ 2021-12-16 00:30:00 ┆ 1 │
1001
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1002
+ # # │ 2021-12-16 01:00:00 ┆ 2 │
1003
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1004
+ # # │ 2021-12-16 01:30:00 ┆ 3 │
1005
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1006
+ # # │ 2021-12-16 02:00:00 ┆ 4 │
1007
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1008
+ # # │ 2021-12-16 02:30:00 ┆ 5 │
1009
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1010
+ # # │ 2021-12-16 03:00:00 ┆ 6 │
1011
+ # # └─────────────────────┴─────┘
1012
+ #
1013
+ # @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
1014
+ # df.groupby_dynamic("time", every: "1h", closed: "right").agg(
1015
+ # [
1016
+ # Polars.col("time").min.alias("time_min"),
1017
+ # Polars.col("time").max.alias("time_max")
1018
+ # ]
1019
+ # )
1020
+ # # =>
1021
+ # # shape: (4, 3)
1022
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┐
1023
+ # # │ time ┆ time_min ┆ time_max │
1024
+ # # │ --- ┆ --- ┆ --- │
1025
+ # # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │
1026
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╡
1027
+ # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │
1028
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1029
+ # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │
1030
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1031
+ # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │
1032
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1033
+ # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │
1034
+ # # └─────────────────────┴─────────────────────┴─────────────────────┘
1035
+ #
1036
+ # @example The window boundaries can also be added to the aggregation result.
1037
+ # df.groupby_dynamic(
1038
+ # "time", every: "1h", include_boundaries: true, closed: "right"
1039
+ # ).agg([Polars.col("time").count.alias("time_count")])
1040
+ # # =>
1041
+ # # shape: (4, 4)
1042
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
1043
+ # # │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
1044
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1045
+ # # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
1046
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
1047
+ # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
1048
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1049
+ # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │
1050
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1051
+ # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
1052
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1053
+ # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
1054
+ # # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
1055
+ #
1056
+ # @example When closed="left", should not include right end of interval.
1057
+ # df.groupby_dynamic("time", every: "1h", closed: "left").agg(
1058
+ # [
1059
+ # Polars.col("time").count.alias("time_count"),
1060
+ # Polars.col("time").list.alias("time_agg_list")
1061
+ # ]
1062
+ # )
1063
+ # # =>
1064
+ # # shape: (4, 3)
1065
+ # # ┌─────────────────────┬────────────┬─────────────────────────────────────┐
1066
+ # # │ time ┆ time_count ┆ time_agg_list │
1067
+ # # │ --- ┆ --- ┆ --- │
1068
+ # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
1069
+ # # ╞═════════════════════╪════════════╪═════════════════════════════════════╡
1070
+ # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16... │
1071
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1072
+ # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16... │
1073
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1074
+ # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16... │
1075
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1076
+ # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
1077
+ # # └─────────────────────┴────────────┴─────────────────────────────────────┘
1078
+ #
1079
+ # @example When closed="both" the time values at the window boundaries belong to 2 groups.
1080
+ # df.groupby_dynamic("time", every: "1h", closed: "both").agg(
1081
+ # [Polars.col("time").count.alias("time_count")]
1082
+ # )
1083
+ # # =>
1084
+ # # shape: (5, 2)
1085
+ # # ┌─────────────────────┬────────────┐
1086
+ # # │ time ┆ time_count │
1087
+ # # │ --- ┆ --- │
1088
+ # # │ datetime[μs] ┆ u32 │
1089
+ # # ╞═════════════════════╪════════════╡
1090
+ # # │ 2021-12-15 23:00:00 ┆ 1 │
1091
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1092
+ # # │ 2021-12-16 00:00:00 ┆ 3 │
1093
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1094
+ # # │ 2021-12-16 01:00:00 ┆ 3 │
1095
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1096
+ # # │ 2021-12-16 02:00:00 ┆ 3 │
1097
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1098
+ # # │ 2021-12-16 03:00:00 ┆ 1 │
1099
+ # # └─────────────────────┴────────────┘
1100
+ #
1101
+ # @example Dynamic groupbys can also be combined with grouping on normal keys.
1102
+ # df = Polars::DataFrame.new(
1103
+ # {
1104
+ # "time" => Polars.date_range(
1105
+ # DateTime.new(2021, 12, 16),
1106
+ # DateTime.new(2021, 12, 16, 3),
1107
+ # "30m"
1108
+ # ),
1109
+ # "groups" => ["a", "a", "a", "b", "b", "a", "a"]
1110
+ # }
1111
+ # )
1112
+ # df.groupby_dynamic(
1113
+ # "time",
1114
+ # every: "1h",
1115
+ # closed: "both",
1116
+ # by: "groups",
1117
+ # include_boundaries: true
1118
+ # ).agg([Polars.col("time").count.alias("time_count")])
1119
+ # # =>
1120
+ # # shape: (7, 5)
1121
+ # # ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
1122
+ # # │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
1123
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1124
+ # # │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
1125
+ # # ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
1126
+ # # │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
1127
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1128
+ # # │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │
1129
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1130
+ # # │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │
1131
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1132
+ # # │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
1133
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1134
+ # # │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │
1135
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1136
+ # # │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
1137
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1138
+ # # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
1139
+ # # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
1140
+ #
1141
+ # @example Dynamic groupby on an index column.
1142
+ # df = Polars::DataFrame.new(
1143
+ # {
1144
+ # "idx" => Polars.arange(0, 6, eager: true),
1145
+ # "A" => ["A", "A", "B", "B", "B", "C"]
1146
+ # }
1147
+ # )
1148
+ # df.groupby_dynamic(
1149
+ # "idx",
1150
+ # every: "2i",
1151
+ # period: "3i",
1152
+ # include_boundaries: true,
1153
+ # closed: "right"
1154
+ # ).agg(Polars.col("A").list.alias("A_agg_list"))
1155
+ # # =>
1156
+ # # shape: (3, 4)
1157
+ # # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
1158
+ # # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
1159
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1160
+ # # │ i64 ┆ i64 ┆ i64 ┆ list[str] │
1161
+ # # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
1162
+ # # │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
1163
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1164
+ # # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
1165
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1166
+ # # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
1167
+ # # └─────────────────┴─────────────────┴─────┴─────────────────┘
1168
+ def groupby_dynamic(
1169
+ index_column,
1170
+ every:,
1171
+ period: nil,
1172
+ offset: nil,
1173
+ truncate: true,
1174
+ include_boundaries: false,
1175
+ closed: "left",
1176
+ by: nil,
1177
+ start_by: "window"
1178
+ )
1179
+ if offset.nil?
1180
+ if period.nil?
1181
+ offset = "-#{every}"
1182
+ else
1183
+ offset = "0ns"
1184
+ end
1185
+ end
1186
+
1187
+ if period.nil?
1188
+ period = every
1189
+ end
1190
+
1191
+ period = Utils._timedelta_to_pl_duration(period)
1192
+ offset = Utils._timedelta_to_pl_duration(offset)
1193
+ every = Utils._timedelta_to_pl_duration(every)
1194
+
1195
+ rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
1196
+ lgb = _ldf.groupby_dynamic(
1197
+ index_column,
1198
+ every,
1199
+ period,
1200
+ offset,
1201
+ truncate,
1202
+ include_boundaries,
1203
+ closed,
1204
+ rbexprs_by,
1205
+ start_by
1206
+ )
1207
+ LazyGroupBy.new(lgb, self.class)
1208
+ end
1209
+
1210
+ # Perform an asof join.
1211
+ #
1212
+ # This is similar to a left-join except that we match on nearest key rather than
1213
+ # equal keys.
1214
+ #
1215
+ # Both DataFrames must be sorted by the join_asof key.
1216
+ #
1217
+ # For each row in the left DataFrame:
1218
+ #
1219
+ # - A "backward" search selects the last row in the right DataFrame whose 'on' key is less than or equal to the left's key.
1220
+ # - A "forward" search selects the first row in the right DataFrame whose 'on' key is greater than or equal to the left's key.
1221
+ #
1222
+ # The default is "backward".
1223
+ #
1224
+ # @param other [LazyFrame]
1225
+ # Lazy DataFrame to join with.
1226
+ # @param left_on [String]
1227
+ # Join column of the left DataFrame.
1228
+ # @param right_on [String]
1229
+ # Join column of the right DataFrame.
1230
+ # @param on [String]
1231
+ # Join column of both DataFrames. If set, `left_on` and `right_on` should be
1232
+ # nil.
1233
+ # @param by [Object]
1234
+ # Join on these columns before doing asof join.
1235
+ # @param by_left [Object]
1236
+ # Join on these columns before doing asof join.
1237
+ # @param by_right [Object]
1238
+ # Join on these columns before doing asof join.
1239
+ # @param strategy ["backward", "forward"]
1240
+ # Join strategy.
1241
+ # @param suffix [String]
1242
+ # Suffix to append to columns with a duplicate name.
1243
+ # @param tolerance [Object]
1244
+ # Numeric tolerance. If set, the join will only match keys that are within
+ # this distance. If an asof join is done on columns of dtype "Date",
+ # "Datetime", "Duration" or "Time", you can use the following string
+ # language:
1248
+ #
1249
+ # - 1ns (1 nanosecond)
1250
+ # - 1us (1 microsecond)
1251
+ # - 1ms (1 millisecond)
1252
+ # - 1s (1 second)
1253
+ # - 1m (1 minute)
1254
+ # - 1h (1 hour)
1255
+ # - 1d (1 day)
1256
+ # - 1w (1 week)
1257
+ # - 1mo (1 calendar month)
1258
+ # - 1y (1 calendar year)
1259
+ # - 1i (1 index count)
1260
+ #
1261
+ # Or combine them:
1262
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
1263
+ #
1264
+ # @param allow_parallel [Boolean]
1265
+ # Allow the physical plan to optionally evaluate the computation of both
1266
+ # DataFrames up to the join in parallel.
1267
+ # @param force_parallel [Boolean]
1268
+ # Force the physical plan to evaluate the computation of both DataFrames up to
1269
+ # the join in parallel.
1270
+ #
1271
+ # @return [LazyFrame]
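+ #
+ # @example Minimal sketch, assuming both frames are sorted by "time":
+ # quotes = Polars::DataFrame.new({"time" => [1, 2, 3, 4], "bid" => [100, 101, 102, 103]}).lazy
+ # trades = Polars::DataFrame.new({"time" => [1, 3, 4], "qty" => [10, 20, 30]}).lazy
+ # trades.join_asof(quotes, on: "time", strategy: "backward").collect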
1272
+ def join_asof(
1273
+ other,
1274
+ left_on: nil,
1275
+ right_on: nil,
1276
+ on: nil,
1277
+ by_left: nil,
1278
+ by_right: nil,
1279
+ by: nil,
1280
+ strategy: "backward",
1281
+ suffix: "_right",
1282
+ tolerance: nil,
1283
+ allow_parallel: true,
1284
+ force_parallel: false
1285
+ )
1286
+ if !other.is_a?(LazyFrame)
1287
+ raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
1288
+ end
1289
+
1290
+ if on.is_a?(String)
1291
+ left_on = on
1292
+ right_on = on
1293
+ end
1294
+
1295
+ if left_on.nil? || right_on.nil?
1296
+ raise ArgumentError, "You should pass the column to join on as an argument."
1297
+ end
1298
+
1299
+ if by_left.is_a?(String) || by_left.is_a?(Expr)
1300
+ by_left_ = [by_left]
1301
+ else
1302
+ by_left_ = by_left
1303
+ end
1304
+
1305
+ if by_right.is_a?(String) || by_right.is_a?(Expr)
1306
+ by_right_ = [by_right]
1307
+ else
1308
+ by_right_ = by_right
1309
+ end
1310
+
1311
+ if by.is_a?(String)
1312
+ by_left_ = [by]
1313
+ by_right_ = [by]
1314
+ elsif by.is_a?(Array)
1315
+ by_left_ = by
1316
+ by_right_ = by
1317
+ end
1318
+
1319
+ tolerance_str = nil
1320
+ tolerance_num = nil
1321
+ if tolerance.is_a?(String)
1322
+ tolerance_str = tolerance
1323
+ else
1324
+ tolerance_num = tolerance
1325
+ end
1326
+
1327
+ _from_rbldf(
1328
+ _ldf.join_asof(
1329
+ other._ldf,
1330
+ Polars.col(left_on)._rbexpr,
1331
+ Polars.col(right_on)._rbexpr,
1332
+ by_left_,
1333
+ by_right_,
1334
+ allow_parallel,
1335
+ force_parallel,
1336
+ suffix,
1337
+ strategy,
1338
+ tolerance_num,
1339
+ tolerance_str
1340
+ )
1341
+ )
1342
+ end
1343
+
1344
+ # Add a join operation to the Logical Plan.
1345
+ #
1346
+ # @param other [LazyFrame]
1347
+ # Lazy DataFrame to join with.
1348
+ # @param left_on [Object]
1349
+ # Join column of the left DataFrame.
1350
+ # @param right_on [Object]
1351
+ # Join column of the right DataFrame.
1352
+ # @param on [Object]
+ # Join column of both DataFrames. If set, `left_on` and `right_on` should be
+ # nil.
1355
+ # @param how ["inner", "left", "outer", "semi", "anti", "cross"]
1356
+ # Join strategy.
1357
+ # @param suffix [String]
1358
+ # Suffix to append to columns with a duplicate name.
1359
+ # @param allow_parallel [Boolean]
1360
+ # Allow the physical plan to optionally evaluate the computation of both
1361
+ # DataFrames up to the join in parallel.
1362
+ # @param force_parallel [Boolean]
1363
+ # Force the physical plan to evaluate the computation of both DataFrames up to
1364
+ # the join in parallel.
1365
+ #
1366
+ # @return [LazyFrame]
1367
+ #
1368
+ # @example
1369
+ # df = Polars::DataFrame.new(
1370
+ # {
1371
+ # "foo" => [1, 2, 3],
1372
+ # "bar" => [6.0, 7.0, 8.0],
1373
+ # "ham" => ["a", "b", "c"]
1374
+ # }
1375
+ # ).lazy
1376
+ # other_df = Polars::DataFrame.new(
1377
+ # {
1378
+ # "apple" => ["x", "y", "z"],
1379
+ # "ham" => ["a", "b", "d"]
1380
+ # }
1381
+ # ).lazy
1382
+ # df.join(other_df, on: "ham").collect
1383
+ # # =>
1384
+ # # shape: (2, 4)
1385
+ # # ┌─────┬─────┬─────┬───────┐
1386
+ # # │ foo ┆ bar ┆ ham ┆ apple │
1387
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1388
+ # # │ i64 ┆ f64 ┆ str ┆ str │
1389
+ # # ╞═════╪═════╪═════╪═══════╡
1390
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
1391
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1392
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
1393
+ # # └─────┴─────┴─────┴───────┘
1394
+ #
1395
+ # @example
1396
+ # df.join(other_df, on: "ham", how: "outer").collect
1397
+ # # =>
1398
+ # # shape: (4, 4)
1399
+ # # ┌──────┬──────┬─────┬───────┐
1400
+ # # │ foo ┆ bar ┆ ham ┆ apple │
1401
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1402
+ # # │ i64 ┆ f64 ┆ str ┆ str │
1403
+ # # ╞══════╪══════╪═════╪═══════╡
1404
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
1405
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1406
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
1407
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1408
+ # # │ null ┆ null ┆ d ┆ z │
1409
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1410
+ # # │ 3 ┆ 8.0 ┆ c ┆ null │
1411
+ # # └──────┴──────┴─────┴───────┘
1412
+ #
1413
+ # @example
1414
+ # df.join(other_df, on: "ham", how: "left").collect
1415
+ # # =>
1416
+ # # shape: (3, 4)
1417
+ # # ┌─────┬─────┬─────┬───────┐
1418
+ # # │ foo ┆ bar ┆ ham ┆ apple │
1419
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1420
+ # # │ i64 ┆ f64 ┆ str ┆ str │
1421
+ # # ╞═════╪═════╪═════╪═══════╡
1422
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
1423
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1424
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
1425
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1426
+ # # │ 3 ┆ 8.0 ┆ c ┆ null │
1427
+ # # └─────┴─────┴─────┴───────┘
1428
+ #
1429
+ # @example
1430
+ # df.join(other_df, on: "ham", how: "semi").collect
1431
+ # # =>
1432
+ # # shape: (2, 3)
1433
+ # # ┌─────┬─────┬─────┐
1434
+ # # │ foo ┆ bar ┆ ham │
1435
+ # # │ --- ┆ --- ┆ --- │
1436
+ # # │ i64 ┆ f64 ┆ str │
1437
+ # # ╞═════╪═════╪═════╡
1438
+ # # │ 1 ┆ 6.0 ┆ a │
1439
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1440
+ # # │ 2 ┆ 7.0 ┆ b │
1441
+ # # └─────┴─────┴─────┘
1442
+ #
1443
+ # @example
1444
+ # df.join(other_df, on: "ham", how: "anti").collect
1445
+ # # =>
1446
+ # # shape: (1, 3)
1447
+ # # ┌─────┬─────┬─────┐
1448
+ # # │ foo ┆ bar ┆ ham │
1449
+ # # │ --- ┆ --- ┆ --- │
1450
+ # # │ i64 ┆ f64 ┆ str │
1451
+ # # ╞═════╪═════╪═════╡
1452
+ # # │ 3 ┆ 8.0 ┆ c │
1453
+ # # └─────┴─────┴─────┘
1454
+ def join(
1455
+ other,
1456
+ left_on: nil,
1457
+ right_on: nil,
1458
+ on: nil,
1459
+ how: "inner",
1460
+ suffix: "_right",
1461
+ allow_parallel: true,
1462
+ force_parallel: false
1463
+ )
1464
+ if !other.is_a?(LazyFrame)
1465
+ raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
1466
+ end
1467
+
1468
+ if how == "cross"
1469
+ return _from_rbldf(
1470
+ _ldf.join(
1471
+ other._ldf, [], [], allow_parallel, force_parallel, how, suffix
1472
+ )
1473
+ )
1474
+ end
1475
+
1476
+ if !on.nil?
1477
+ rbexprs = Utils.selection_to_rbexpr_list(on)
1478
+ rbexprs_left = rbexprs
1479
+ rbexprs_right = rbexprs
1480
+ elsif !left_on.nil? && !right_on.nil?
1481
+ rbexprs_left = Utils.selection_to_rbexpr_list(left_on)
1482
+ rbexprs_right = Utils.selection_to_rbexpr_list(right_on)
1483
+ else
1484
+ raise ArgumentError, "must specify `on` OR `left_on` and `right_on`"
1485
+ end
1486
+
1487
+ _from_rbldf(
1488
+ self._ldf.join(
1489
+ other._ldf,
1490
+ rbexprs_left,
1491
+ rbexprs_right,
1492
+ allow_parallel,
1493
+ force_parallel,
1494
+ how,
1495
+ suffix,
1496
+ )
1497
+ )
1498
+ end
1499
+
1500
+ # Add or overwrite multiple columns in a DataFrame.
1501
+ #
1502
+ # @param exprs [Object]
1503
+ # List of Expressions that evaluate to columns.
1504
+ #
1505
+ # @return [LazyFrame]
1506
+ #
1507
+ # @example
1508
+ # ldf = Polars::DataFrame.new(
1509
+ # {
1510
+ # "a" => [1, 2, 3, 4],
1511
+ # "b" => [0.5, 4, 10, 13],
1512
+ # "c" => [true, true, false, true]
1513
+ # }
1514
+ # ).lazy
1515
+ # ldf.with_columns(
1516
+ # [
1517
+ # (Polars.col("a") ** 2).alias("a^2"),
1518
+ # (Polars.col("b") / 2).alias("b/2"),
1519
+ # (Polars.col("c").is_not).alias("not c")
1520
+ # ]
1521
+ # ).collect
1522
+ # # =>
1523
+ # # shape: (4, 6)
1524
+ # # ┌─────┬──────┬───────┬──────┬──────┬───────┐
1525
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
1526
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1527
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │
1528
+ # # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
1529
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │
1530
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1531
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │
1532
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1533
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
1534
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1535
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
1536
+ # # └─────┴──────┴───────┴──────┴──────┴───────┘
1537
+ def with_columns(exprs)
1538
+ exprs =
1539
+ if exprs.nil?
1540
+ []
1541
+ elsif exprs.is_a?(Expr)
1542
+ [exprs]
1543
+ else
1544
+ exprs.to_a
1545
+ end
1546
+
1547
+ rbexprs = []
1548
+ exprs.each do |e|
1549
+ case e
1550
+ when Expr
1551
+ rbexprs << e._rbexpr
1552
+ when Series
1553
+ rbexprs << Utils.lit(e)._rbexpr
1554
+ else
1555
+ raise ArgumentError, "Expected an expression, got #{e}"
1556
+ end
1557
+ end
1558
+
1559
+ _from_rbldf(_ldf.with_columns(rbexprs))
1560
+ end
1561
+
1562
+ # Add an external context to the computation graph.
1563
+ #
1564
+ # This allows expressions to also access columns from DataFrames
1565
+ # that are not part of this one.
1566
+ #
1567
+ # @param other [Object]
1568
+ # Lazy DataFrame to join with.
1569
+ #
1570
+ # @return [LazyFrame]
1571
+ #
1572
+ # @example
1573
+ # df_a = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => ["a", "c", nil]}).lazy
1574
+ # df_other = Polars::DataFrame.new({"c" => ["foo", "ham"]})
1575
+ # (
1576
+ # df_a.with_context(df_other.lazy).select(
1577
+ # [Polars.col("b") + Polars.col("c").first]
1578
+ # )
1579
+ # ).collect
1580
+ # # =>
1581
+ # # shape: (3, 1)
1582
+ # # ┌──────┐
1583
+ # # │ b │
1584
+ # # │ --- │
1585
+ # # │ str │
1586
+ # # ╞══════╡
1587
+ # # │ afoo │
1588
+ # # ├╌╌╌╌╌╌┤
1589
+ # # │ cfoo │
1590
+ # # ├╌╌╌╌╌╌┤
1591
+ # # │ null │
1592
+ # # └──────┘
1593
+ def with_context(other)
1594
+ if !other.is_a?(Array)
1595
+ other = [other]
1596
+ end
1597
+
1598
+ _from_rbldf(_ldf.with_context(other.map(&:_ldf)))
1599
+ end
1600
+
1601
+ # Add or overwrite column in a DataFrame.
1602
+ #
1603
+ # @param column [Object]
1604
+ # Expression that evaluates to column or a Series to use.
1605
+ #
1606
+ # @return [LazyFrame]
1607
+ #
1608
+ # @example
1609
+ # df = Polars::DataFrame.new(
1610
+ # {
1611
+ # "a" => [1, 3, 5],
1612
+ # "b" => [2, 4, 6]
1613
+ # }
1614
+ # ).lazy
1615
+ # df.with_column((Polars.col("b") ** 2).alias("b_squared")).collect
1616
+ # # =>
1617
+ # # shape: (3, 3)
1618
+ # # ┌─────┬─────┬───────────┐
1619
+ # # │ a ┆ b ┆ b_squared │
1620
+ # # │ --- ┆ --- ┆ --- │
1621
+ # # │ i64 ┆ i64 ┆ f64 │
1622
+ # # ╞═════╪═════╪═══════════╡
1623
+ # # │ 1 ┆ 2 ┆ 4.0 │
1624
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
1625
+ # # │ 3 ┆ 4 ┆ 16.0 │
1626
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
1627
+ # # │ 5 ┆ 6 ┆ 36.0 │
1628
+ # # └─────┴─────┴───────────┘
1629
+ #
1630
+ # @example
1631
+ # df.with_column(Polars.col("a") ** 2).collect
1632
+ # # =>
1633
+ # # shape: (3, 2)
1634
+ # # ┌──────┬─────┐
1635
+ # # │ a ┆ b │
1636
+ # # │ --- ┆ --- │
1637
+ # # │ f64 ┆ i64 │
1638
+ # # ╞══════╪═════╡
1639
+ # # │ 1.0 ┆ 2 │
1640
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
1641
+ # # │ 9.0 ┆ 4 │
1642
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
1643
+ # # │ 25.0 ┆ 6 │
1644
+ # # └──────┴─────┘
1645
+ def with_column(column)
1646
+ with_columns([column])
1647
+ end
1648
+
1649
+ # Remove one or multiple columns from a DataFrame.
1650
+ #
1651
+ # @param columns [Object]
1652
+ # - Name of the column that should be removed.
1653
+ # - List of column names.
1654
+ #
1655
+ # @return [LazyFrame]
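+ #
+ # @example Minimal sketch:
+ # df = Polars::DataFrame.new({"a" => [1, 2], "b" => [3, 4], "c" => [5, 6]}).lazy
+ # df.drop("c").collect.columns
+ # # => ["a", "b"]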
1656
+ def drop(columns)
1657
+ if columns.is_a?(String)
1658
+ columns = [columns]
1659
+ end
1660
+ _from_rbldf(_ldf.drop_columns(columns))
1661
+ end
1662
+
1663
+ # Rename column names.
1664
+ #
1665
+ # @param mapping [Hash]
1666
+ # Key value pairs that map from old name to new name.
1667
+ #
1668
+ # @return [LazyFrame]
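+ #
+ # @example Minimal sketch:
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3]}).lazy
+ # df.rename({"foo" => "bar"}).collect.columns
+ # # => ["bar"]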
1669
+ def rename(mapping)
1670
+ existing = mapping.keys
1671
+ _new = mapping.values
1672
+ _from_rbldf(_ldf.rename(existing, _new))
1673
+ end
1674
+
1675
+ # Reverse the DataFrame.
1676
+ #
1677
+ # @return [LazyFrame]
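+ #
+ # @example Minimal sketch:
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy
+ # df.reverse.collect["a"].to_a
+ # # => [3, 2, 1]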
1678
+ def reverse
1679
+ _from_rbldf(_ldf.reverse)
1680
+ end
1681
+
1682
+ # Shift the values by a given period.
1683
+ #
1684
+ # @param periods [Integer]
1685
+ # Number of places to shift (may be negative).
1686
+ #
1687
+ # @return [LazyFrame]
1688
+ #
1689
+ # @example
1690
+ # df = Polars::DataFrame.new(
1691
+ # {
1692
+ # "a" => [1, 3, 5],
1693
+ # "b" => [2, 4, 6]
1694
+ # }
1695
+ # ).lazy
1696
+ # df.shift(1).collect
1697
+ # # =>
1698
+ # # shape: (3, 2)
1699
+ # # ┌──────┬──────┐
1700
+ # # │ a ┆ b │
1701
+ # # │ --- ┆ --- │
1702
+ # # │ i64 ┆ i64 │
1703
+ # # ╞══════╪══════╡
1704
+ # # │ null ┆ null │
1705
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1706
+ # # │ 1 ┆ 2 │
1707
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1708
+ # # │ 3 ┆ 4 │
1709
+ # # └──────┴──────┘
1710
+ #
1711
+ # @example
1712
+ # df.shift(-1).collect
1713
+ # # =>
1714
+ # # shape: (3, 2)
1715
+ # # ┌──────┬──────┐
1716
+ # # │ a ┆ b │
1717
+ # # │ --- ┆ --- │
1718
+ # # │ i64 ┆ i64 │
1719
+ # # ╞══════╪══════╡
1720
+ # # │ 3 ┆ 4 │
1721
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1722
+ # # │ 5 ┆ 6 │
1723
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1724
+ # # │ null ┆ null │
1725
+ # # └──────┴──────┘
1726
+ def shift(periods)
1727
+ _from_rbldf(_ldf.shift(periods))
1728
+ end
1729
+
1730
+ # Shift the values by a given period and fill the resulting null values.
1731
+ #
1732
+ # @param periods [Integer]
1733
+ # Number of places to shift (may be negative).
1734
+ # @param fill_value [Object]
1735
+ # Fill `nil` values with the result of this expression.
1736
+ #
1737
+ # @return [LazyFrame]
1738
+ #
1739
+ # @example
1740
+ # df = Polars::DataFrame.new(
1741
+ # {
1742
+ # "a" => [1, 3, 5],
1743
+ # "b" => [2, 4, 6]
1744
+ # }
1745
+ # ).lazy
1746
+ # df.shift_and_fill(1, 0).collect
1747
+ # # =>
1748
+ # # shape: (3, 2)
1749
+ # # ┌─────┬─────┐
1750
+ # # │ a ┆ b │
1751
+ # # │ --- ┆ --- │
1752
+ # # │ i64 ┆ i64 │
1753
+ # # ╞═════╪═════╡
1754
+ # # │ 0 ┆ 0 │
1755
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1756
+ # # │ 1 ┆ 2 │
1757
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1758
+ # # │ 3 ┆ 4 │
1759
+ # # └─────┴─────┘
1760
+ #
1761
+ # @example
1762
+ # df.shift_and_fill(-1, 0).collect
1763
+ # # =>
1764
+ # # shape: (3, 2)
1765
+ # # ┌─────┬─────┐
1766
+ # # │ a ┆ b │
1767
+ # # │ --- ┆ --- │
1768
+ # # │ i64 ┆ i64 │
1769
+ # # ╞═════╪═════╡
1770
+ # # │ 3 ┆ 4 │
1771
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1772
+ # # │ 5 ┆ 6 │
1773
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1774
+ # # │ 0 ┆ 0 │
1775
+ # # └─────┴─────┘
1776
+ def shift_and_fill(periods, fill_value)
1777
+ if !fill_value.is_a?(Expr)
1778
+ fill_value = Polars.lit(fill_value)
1779
+ end
1780
+ _from_rbldf(_ldf.shift_and_fill(periods, fill_value._rbexpr))
1781
+ end
1782
+
1783
+ # Get a slice of this DataFrame.
1784
+ #
1785
+ # @param offset [Integer]
1786
+ # Start index. Negative indexing is supported.
1787
+ # @param length [Integer]
1788
+ # Length of the slice. If set to `nil`, all rows starting at the offset
1789
+ # will be selected.
1790
+ #
1791
+ # @return [LazyFrame]
1792
+ #
1793
+ # @example
1794
+ # df = Polars::DataFrame.new(
1795
+ # {
1796
+ # "a" => ["x", "y", "z"],
1797
+ # "b" => [1, 3, 5],
1798
+ # "c" => [2, 4, 6]
1799
+ # }
1800
+ # ).lazy
1801
+ # df.slice(1, 2).collect
1802
+ # # =>
1803
+ # # shape: (2, 3)
1804
+ # # ┌─────┬─────┬─────┐
1805
+ # # │ a ┆ b ┆ c │
1806
+ # # │ --- ┆ --- ┆ --- │
1807
+ # # │ str ┆ i64 ┆ i64 │
1808
+ # # ╞═════╪═════╪═════╡
1809
+ # # │ y ┆ 3 ┆ 4 │
1810
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1811
+ # # │ z ┆ 5 ┆ 6 │
1812
+ # # └─────┴─────┴─────┘
1813
+ def slice(offset, length = nil)
1814
+ if length && length < 0
1815
+ raise ArgumentError, "Negative slice lengths (#{length}) are invalid for LazyFrame"
1816
+ end
1817
+ _from_rbldf(_ldf.slice(offset, length))
1818
+ end
1819
+
1820
+ # Get the first `n` rows.
1821
+ #
1822
+ # Alias for {#head}.
1823
+ #
1824
+ # @param n [Integer]
1825
+ # Number of rows to return.
1826
+ #
1827
+ # @return [LazyFrame]
1828
+ #
1829
+ # @note
1830
+ # Consider using the {#fetch} operation if you only want to test your
1831
+ # query. The {#fetch} operation will load the first `n` rows at the scan
1832
+ #   level, whereas {#head}/{#limit} are applied at the end of the query.
1833
+ def limit(n = 5)
1834
+ head(n)
1835
+ end
1836
+
1837
+ # Get the first `n` rows.
1838
+ #
1839
+ # @param n [Integer]
1840
+ # Number of rows to return.
1841
+ #
1842
+ # @return [LazyFrame]
1843
+ #
1844
+ # @note
1845
+ # Consider using the {#fetch} operation if you only want to test your
1846
+ # query. The {#fetch} operation will load the first `n` rows at the scan
1847
+ #   level, whereas {#head}/{#limit} are applied at the end of the query.
1848
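+ #
+ # An illustrative sketch (output omitted), using a hypothetical frame:
+ #
+ # @example
+ #   df = Polars::DataFrame.new({"a" => [1, 3, 5], "b" => [2, 4, 6]}).lazy # hypothetical data
+ #   df.head(2).collect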
+ def head(n = 5)
1849
+ slice(0, n)
1850
+ end
1851
+
1852
+ # Get the last `n` rows.
1853
+ #
1854
+ # @param n [Integer]
1855
+ # Number of rows.
1856
+ #
1857
+ # @return [LazyFrame]
1858
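+ #
+ # An illustrative sketch (output omitted), using a hypothetical frame:
+ #
+ # @example
+ #   df = Polars::DataFrame.new({"a" => [1, 3, 5], "b" => [2, 4, 6]}).lazy # hypothetical data
+ #   df.tail(2).collect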
+ def tail(n = 5)
1859
+ _from_rbldf(_ldf.tail(n))
1860
+ end
1861
+
1862
+ # Get the last row of the DataFrame.
1863
+ #
1864
+ # @return [LazyFrame]
1865
+ def last
1866
+ tail(1)
1867
+ end
1868
+
1869
+ # Get the first row of the DataFrame.
1870
+ #
1871
+ # @return [LazyFrame]
1872
+ def first
1873
+ slice(0, 1)
1874
+ end
1875
+
1876
+ # Add a column at index 0 that counts the rows.
1877
+ #
1878
+ # @param name [String]
1879
+ # Name of the column to add.
1880
+ # @param offset [Integer]
1881
+ # Start the row count at this offset.
1882
+ #
1883
+ # @return [LazyFrame]
1884
+ #
1885
+ # @note
1886
+ # This can have a negative effect on query performance.
1887
+ # This may, for instance, block predicate pushdown optimization.
1888
+ #
1889
+ # @example
1890
+ # df = Polars::DataFrame.new(
1891
+ # {
1892
+ # "a" => [1, 3, 5],
1893
+ # "b" => [2, 4, 6]
1894
+ # }
1895
+ # ).lazy
1896
+ # df.with_row_count.collect
1897
+ # # =>
1898
+ # # shape: (3, 3)
1899
+ # # ┌────────┬─────┬─────┐
1900
+ # # │ row_nr ┆ a ┆ b │
1901
+ # # │ --- ┆ --- ┆ --- │
1902
+ # # │ u32 ┆ i64 ┆ i64 │
1903
+ # # ╞════════╪═════╪═════╡
1904
+ # # │ 0 ┆ 1 ┆ 2 │
1905
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1906
+ # # │ 1 ┆ 3 ┆ 4 │
1907
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1908
+ # # │ 2 ┆ 5 ┆ 6 │
1909
+ # # └────────┴─────┴─────┘
1910
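+ #
+ # A further illustrative call (output omitted), reusing the frame above; the
+ # column name "id" is arbitrary:
+ #
+ # @example
+ #   df.with_row_count(name: "id", offset: 1).collect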
+ def with_row_count(name: "row_nr", offset: 0)
1911
+ _from_rbldf(_ldf.with_row_count(name, offset))
1912
+ end
1913
+
1914
+ # Take every nth row in the LazyFrame and return as a new LazyFrame.
1915
+ #
1916
+ # @param n [Integer]
+ #   Take every n-th row.
+ #
+ # @return [LazyFrame]
1917
+ #
1918
+ # @example
1919
+ # s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]}).lazy
1920
+ # s.take_every(2).collect
1921
+ # # =>
1922
+ # # shape: (2, 2)
1923
+ # # ┌─────┬─────┐
1924
+ # # │ a ┆ b │
1925
+ # # │ --- ┆ --- │
1926
+ # # │ i64 ┆ i64 │
1927
+ # # ╞═════╪═════╡
1928
+ # # │ 1 ┆ 5 │
1929
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1930
+ # # │ 3 ┆ 7 │
1931
+ # # └─────┴─────┘
1932
+ def take_every(n)
1933
+ select(Utils.col("*").take_every(n))
1934
+ end
1935
+
1936
+ # Fill null values using the specified value or strategy.
1937
+ #
1938
+ # @param value [Object]
+ #   Value used to fill null values.
+ # @param strategy [nil, "forward", "backward", "min", "max", "mean", "zero", "one"]
+ #   Strategy used to fill null values.
+ # @param limit [Integer]
+ #   Number of consecutive null values to fill when using the "forward" or
+ #   "backward" strategy.
+ # @param matches_supertype [Boolean]
+ #   Fill all matching supertypes of the fill `value` literal.
+ #
+ # @return [LazyFrame]
1939
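+ #
+ # An illustrative sketch (output omitted), using a hypothetical frame:
+ #
+ # @example
+ #   df = Polars::DataFrame.new({"a" => [1, nil, 3], "b" => [4, 5, nil]}).lazy # hypothetical data
+ #   df.fill_null(99).collect
+ #   df.fill_null(strategy: "forward").collect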
+ def fill_null(value = nil, strategy: nil, limit: nil, matches_supertype: nil)
1940
+ select(Polars.all.fill_null(value, strategy: strategy, limit: limit))
1941
+ end
1942
+
1943
+ # Fill floating point NaN values.
1944
+ #
1945
+ # @param fill_value [Object]
1946
+ # Value to fill the NaN values with.
1947
+ #
1948
+ # @return [LazyFrame]
1949
+ #
1950
+ # @note
1951
+ #   Note that floating point NaN (Not a Number) values are not missing values!
1952
+ # To replace missing values, use `fill_null` instead.
1953
+ #
1954
+ # @example
1955
+ # df = Polars::DataFrame.new(
1956
+ # {
1957
+ # "a" => [1.5, 2, Float::NAN, 4],
1958
+ # "b" => [0.5, 4, Float::NAN, 13],
1959
+ # }
1960
+ # ).lazy
1961
+ # df.fill_nan(99).collect
1962
+ # # =>
1963
+ # # shape: (4, 2)
1964
+ # # ┌──────┬──────┐
1965
+ # # │ a ┆ b │
1966
+ # # │ --- ┆ --- │
1967
+ # # │ f64 ┆ f64 │
1968
+ # # ╞══════╪══════╡
1969
+ # # │ 1.5 ┆ 0.5 │
1970
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1971
+ # # │ 2.0 ┆ 4.0 │
1972
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1973
+ # # │ 99.0 ┆ 99.0 │
1974
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1975
+ # # │ 4.0 ┆ 13.0 │
1976
+ # # └──────┴──────┘
1977
+ def fill_nan(fill_value)
1978
+ if !fill_value.is_a?(Expr)
1979
+ fill_value = Utils.lit(fill_value)
1980
+ end
1981
+ _from_rbldf(_ldf.fill_nan(fill_value._rbexpr))
1982
+ end
1983
+
1984
+ # Aggregate the columns in the DataFrame to their standard deviation value.
1985
+ #
1986
+ # @return [LazyFrame]
1987
+ #
1988
+ # @example
1989
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1990
+ # df.std.collect
1991
+ # # =>
1992
+ # # shape: (1, 2)
1993
+ # # ┌──────────┬─────┐
1994
+ # # │ a ┆ b │
1995
+ # # │ --- ┆ --- │
1996
+ # # │ f64 ┆ f64 │
1997
+ # # ╞══════════╪═════╡
1998
+ # # │ 1.290994 ┆ 0.5 │
1999
+ # # └──────────┴─────┘
2000
+ #
2001
+ # @example
2002
+ # df.std(ddof: 0).collect
2003
+ # # =>
2004
+ # # shape: (1, 2)
2005
+ # # ┌──────────┬──────────┐
2006
+ # # │ a ┆ b │
2007
+ # # │ --- ┆ --- │
2008
+ # # │ f64 ┆ f64 │
2009
+ # # ╞══════════╪══════════╡
2010
+ # # │ 1.118034 ┆ 0.433013 │
2011
+ # # └──────────┴──────────┘
2012
+ def std(ddof: 1)
2013
+ _from_rbldf(_ldf.std(ddof))
2014
+ end
2015
+
2016
+ # Aggregate the columns in the DataFrame to their variance value.
2017
+ #
2018
+ # @return [LazyFrame]
2019
+ #
2020
+ # @example
2021
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2022
+ # df.var.collect
2023
+ # # =>
2024
+ # # shape: (1, 2)
2025
+ # # ┌──────────┬──────┐
2026
+ # # │ a ┆ b │
2027
+ # # │ --- ┆ --- │
2028
+ # # │ f64 ┆ f64 │
2029
+ # # ╞══════════╪══════╡
2030
+ # # │ 1.666667 ┆ 0.25 │
2031
+ # # └──────────┴──────┘
2032
+ #
2033
+ # @example
2034
+ # df.var(ddof: 0).collect
2035
+ # # =>
2036
+ # # shape: (1, 2)
2037
+ # # ┌──────┬────────┐
2038
+ # # │ a ┆ b │
2039
+ # # │ --- ┆ --- │
2040
+ # # │ f64 ┆ f64 │
2041
+ # # ╞══════╪════════╡
2042
+ # # │ 1.25 ┆ 0.1875 │
2043
+ # # └──────┴────────┘
2044
+ def var(ddof: 1)
2045
+ _from_rbldf(_ldf.var(ddof))
2046
+ end
2047
+
2048
+ # Aggregate the columns in the DataFrame to their maximum value.
2049
+ #
2050
+ # @return [LazyFrame]
2051
+ #
2052
+ # @example
2053
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2054
+ # df.max.collect
2055
+ # # =>
2056
+ # # shape: (1, 2)
2057
+ # # ┌─────┬─────┐
2058
+ # # │ a ┆ b │
2059
+ # # │ --- ┆ --- │
2060
+ # # │ i64 ┆ i64 │
2061
+ # # ╞═════╪═════╡
2062
+ # # │ 4 ┆ 2 │
2063
+ # # └─────┴─────┘
2064
+ def max
2065
+ _from_rbldf(_ldf.max)
2066
+ end
2067
+
2068
+ # Aggregate the columns in the DataFrame to their minimum value.
2069
+ #
2070
+ # @return [LazyFrame]
2071
+ #
2072
+ # @example
2073
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2074
+ # df.min.collect
2075
+ # # =>
2076
+ # # shape: (1, 2)
2077
+ # # ┌─────┬─────┐
2078
+ # # │ a ┆ b │
2079
+ # # │ --- ┆ --- │
2080
+ # # │ i64 ┆ i64 │
2081
+ # # ╞═════╪═════╡
2082
+ # # │ 1 ┆ 1 │
2083
+ # # └─────┴─────┘
2084
+ def min
2085
+ _from_rbldf(_ldf.min)
2086
+ end
2087
+
2088
+ # Aggregate the columns in the DataFrame to their sum value.
2089
+ #
2090
+ # @return [LazyFrame]
2091
+ #
2092
+ # @example
2093
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2094
+ # df.sum.collect
2095
+ # # =>
2096
+ # # shape: (1, 2)
2097
+ # # ┌─────┬─────┐
2098
+ # # │ a ┆ b │
2099
+ # # │ --- ┆ --- │
2100
+ # # │ i64 ┆ i64 │
2101
+ # # ╞═════╪═════╡
2102
+ # # │ 10 ┆ 5 │
2103
+ # # └─────┴─────┘
2104
+ def sum
2105
+ _from_rbldf(_ldf.sum)
2106
+ end
2107
+
2108
+ # Aggregate the columns in the DataFrame to their mean value.
2109
+ #
2110
+ # @return [LazyFrame]
2111
+ #
2112
+ # @example
2113
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2114
+ # df.mean.collect
2115
+ # # =>
2116
+ # # shape: (1, 2)
2117
+ # # ┌─────┬──────┐
2118
+ # # │ a ┆ b │
2119
+ # # │ --- ┆ --- │
2120
+ # # │ f64 ┆ f64 │
2121
+ # # ╞═════╪══════╡
2122
+ # # │ 2.5 ┆ 1.25 │
2123
+ # # └─────┴──────┘
2124
+ def mean
2125
+ _from_rbldf(_ldf.mean)
2126
+ end
2127
+
2128
+ # Aggregate the columns in the DataFrame to their median value.
2129
+ #
2130
+ # @return [LazyFrame]
2131
+ #
2132
+ # @example
2133
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2134
+ # df.median.collect
2135
+ # # =>
2136
+ # # shape: (1, 2)
2137
+ # # ┌─────┬─────┐
2138
+ # # │ a ┆ b │
2139
+ # # │ --- ┆ --- │
2140
+ # # │ f64 ┆ f64 │
2141
+ # # ╞═════╪═════╡
2142
+ # # │ 2.5 ┆ 1.0 │
2143
+ # # └─────┴─────┘
2144
+ def median
2145
+ _from_rbldf(_ldf.median)
2146
+ end
2147
+
2148
+ # Aggregate the columns in the DataFrame to their quantile value.
2149
+ #
2150
+ # @param quantile [Float]
2151
+ # Quantile between 0.0 and 1.0.
2152
+ # @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
2153
+ # Interpolation method.
2154
+ #
2155
+ # @return [LazyFrame]
2156
+ #
2157
+ # @example
2158
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2159
+ # df.quantile(0.7).collect
2160
+ # # =>
2161
+ # # shape: (1, 2)
2162
+ # # ┌─────┬─────┐
2163
+ # # │ a ┆ b │
2164
+ # # │ --- ┆ --- │
2165
+ # # │ f64 ┆ f64 │
2166
+ # # ╞═════╪═════╡
2167
+ # # │ 3.0 ┆ 1.0 │
2168
+ # # └─────┴─────┘
2169
+ def quantile(quantile, interpolation: "nearest")
2170
+ quantile = Utils.expr_to_lit_or_expr(quantile, str_to_lit: false)
2171
+ _from_rbldf(_ldf.quantile(quantile._rbexpr, interpolation))
2172
+ end
2173
+
2174
+ # Explode lists to long format.
2175
+ #
2176
+ # @return [LazyFrame]
2177
+ #
2178
+ # @example
2179
+ # df = Polars::DataFrame.new(
2180
+ # {
2181
+ # "letters" => ["a", "a", "b", "c"],
2182
+ # "numbers" => [[1], [2, 3], [4, 5], [6, 7, 8]],
2183
+ # }
2184
+ # ).lazy
2185
+ # df.explode("numbers").collect
2186
+ # # =>
2187
+ # # shape: (8, 2)
2188
+ # # ┌─────────┬─────────┐
2189
+ # # │ letters ┆ numbers │
2190
+ # # │ --- ┆ --- │
2191
+ # # │ str ┆ i64 │
2192
+ # # ╞═════════╪═════════╡
2193
+ # # │ a ┆ 1 │
2194
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2195
+ # # │ a ┆ 2 │
2196
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2197
+ # # │ a ┆ 3 │
2198
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2199
+ # # │ b ┆ 4 │
2200
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2201
+ # # │ b ┆ 5 │
2202
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2203
+ # # │ c ┆ 6 │
2204
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2205
+ # # │ c ┆ 7 │
2206
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2207
+ # # │ c ┆ 8 │
2208
+ # # └─────────┴─────────┘
2209
+ def explode(columns)
2210
+ columns = Utils.selection_to_rbexpr_list(columns)
2211
+ _from_rbldf(_ldf.explode(columns))
2212
+ end
2213
+
2214
+ # Drop duplicate rows from this DataFrame.
2215
+ #
2216
+ # Note that this fails if there is a column of type `List` in the DataFrame or
2217
+ # subset.
2218
+ #
2219
+ # @param maintain_order [Boolean]
2220
+ # Keep the same order as the original DataFrame. This requires more work to
2221
+ # compute.
2222
+ # @param subset [Object]
2223
+ # Subset to use to compare rows.
2224
+ # @param keep ["first", "last"]
2225
+ # Which of the duplicate rows to keep.
2226
+ #
2227
+ # @return [LazyFrame]
2228
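+ #
+ # An illustrative sketch (output omitted), using a hypothetical frame:
+ #
+ # @example
+ #   df = Polars::DataFrame.new({"a" => [1, 1, 2], "b" => ["x", "y", "y"]}).lazy # hypothetical data
+ #   df.unique(subset: ["a"], keep: "last").collect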
+ def unique(maintain_order: true, subset: nil, keep: "first")
2229
+ if !subset.nil? && !subset.is_a?(Array)
2230
+ subset = [subset]
2231
+ end
2232
+ _from_rbldf(_ldf.unique(maintain_order, subset, keep))
2233
+ end
2234
+
2235
+ # Drop rows with null values from this LazyFrame.
2236
+ #
2237
+ # @param subset [Object]
2238
+ # Subset of column(s) on which `drop_nulls` will be applied.
2239
+ #
2240
+ # @return [LazyFrame]
2241
+ #
2242
+ # @example
2243
+ # df = Polars::DataFrame.new(
2244
+ # {
2245
+ # "foo" => [1, 2, 3],
2246
+ # "bar" => [6, nil, 8],
2247
+ # "ham" => ["a", "b", "c"]
2248
+ # }
2249
+ # )
2250
+ # df.lazy.drop_nulls.collect
2251
+ # # =>
2252
+ # # shape: (2, 3)
2253
+ # # ┌─────┬─────┬─────┐
2254
+ # # │ foo ┆ bar ┆ ham │
2255
+ # # │ --- ┆ --- ┆ --- │
2256
+ # # │ i64 ┆ i64 ┆ str │
2257
+ # # ╞═════╪═════╪═════╡
2258
+ # # │ 1 ┆ 6 ┆ a │
2259
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
2260
+ # # │ 3 ┆ 8 ┆ c │
2261
+ # # └─────┴─────┴─────┘
2262
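+ #
+ # A further illustrative call (output omitted), reusing the frame above:
+ #
+ # @example
+ #   df.lazy.drop_nulls(subset: ["bar"]).collect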
+ def drop_nulls(subset: nil)
2263
+ if !subset.nil? && !subset.is_a?(Array)
2264
+ subset = [subset]
2265
+ end
2266
+ _from_rbldf(_ldf.drop_nulls(subset))
2267
+ end
2268
+
2269
+ # Unpivot a DataFrame from wide to long format.
2270
+ #
2271
+ # Optionally leaves identifiers set.
2272
+ #
2273
+ # This function is useful to massage a DataFrame into a format where one or more
2274
+ # columns are identifier variables (id_vars), while all other columns, considered
2275
+ # measured variables (value_vars), are "unpivoted" to the row axis, leaving just
2276
+ # two non-identifier columns, 'variable' and 'value'.
2277
+ #
2278
+ # @param id_vars [Object]
2279
+ # Columns to use as identifier variables.
2280
+ # @param value_vars [Object]
2281
+ #   Values to use as value variables.
2282
+ # If `value_vars` is empty all columns that are not in `id_vars` will be used.
2283
+ # @param variable_name [String]
2284
+ #   Name to give to the `variable` column. Defaults to "variable"
2285
+ # @param value_name [String]
2286
+ # Name to give to the `value` column. Defaults to "value"
2287
+ #
2288
+ # @return [LazyFrame]
2289
+ #
2290
+ # @example
2291
+ # df = Polars::DataFrame.new(
2292
+ # {
2293
+ # "a" => ["x", "y", "z"],
2294
+ # "b" => [1, 3, 5],
2295
+ # "c" => [2, 4, 6]
2296
+ # }
2297
+ # ).lazy
2298
+ # df.melt(id_vars: "a", value_vars: ["b", "c"]).collect
2299
+ # # =>
2300
+ # # shape: (6, 3)
2301
+ # # ┌─────┬──────────┬───────┐
2302
+ # # │ a ┆ variable ┆ value │
2303
+ # # │ --- ┆ --- ┆ --- │
2304
+ # # │ str ┆ str ┆ i64 │
2305
+ # # ╞═════╪══════════╪═══════╡
2306
+ # # │ x ┆ b ┆ 1 │
2307
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2308
+ # # │ y ┆ b ┆ 3 │
2309
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2310
+ # # │ z ┆ b ┆ 5 │
2311
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2312
+ # # │ x ┆ c ┆ 2 │
2313
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2314
+ # # │ y ┆ c ┆ 4 │
2315
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2316
+ # # │ z ┆ c ┆ 6 │
2317
+ # # └─────┴──────────┴───────┘
2318
+ def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
2319
+ if value_vars.is_a?(String)
2320
+ value_vars = [value_vars]
2321
+ end
2322
+ if id_vars.is_a?(String)
2323
+ id_vars = [id_vars]
2324
+ end
2325
+ if value_vars.nil?
2326
+ value_vars = []
2327
+ end
2328
+ if id_vars.nil?
2329
+ id_vars = []
2330
+ end
2331
+ _from_rbldf(
2332
+ _ldf.melt(id_vars, value_vars, value_name, variable_name)
2333
+ )
2334
+ end
2335
+
2336
+ # def map
2337
+ # end
2338
+
2339
+ # Interpolate intermediate values. The interpolation method is linear.
2340
+ #
2341
+ # @return [LazyFrame]
2342
+ #
2343
+ # @example
2344
+ # df = Polars::DataFrame.new(
2345
+ # {
2346
+ # "foo" => [1, nil, 9, 10],
2347
+ # "bar" => [6, 7, 9, nil],
2348
+ # "baz" => [1, nil, nil, 9]
2349
+ # }
2350
+ # ).lazy
2351
+ # df.interpolate.collect
2352
+ # # =>
2353
+ # # shape: (4, 3)
2354
+ # # ┌─────┬──────┬─────┐
2355
+ # # │ foo ┆ bar ┆ baz │
2356
+ # # │ --- ┆ --- ┆ --- │
2357
+ # # │ i64 ┆ i64 ┆ i64 │
2358
+ # # ╞═════╪══════╪═════╡
2359
+ # # │ 1 ┆ 6 ┆ 1 │
2360
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
2361
+ # # │ 5 ┆ 7 ┆ 3 │
2362
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
2363
+ # # │ 9 ┆ 9 ┆ 6 │
2364
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
2365
+ # # │ 10 ┆ null ┆ 9 │
2366
+ # # └─────┴──────┴─────┘
2367
+ def interpolate
2368
+ select(Utils.col("*").interpolate)
2369
+ end
2370
+
2371
+ # Decompose a struct into its fields.
2372
+ #
2373
+ # The fields will be inserted into the `DataFrame` on the location of the
2374
+ # `struct` type.
2375
+ #
2376
+ # @param names [Object]
2377
+ #   Names of the struct columns that will be decomposed into their fields.
2378
+ #
2379
+ # @return [LazyFrame]
2380
+ #
2381
+ # @example
2382
+ # df = (
2383
+ # Polars::DataFrame.new(
2384
+ # {
2385
+ # "before" => ["foo", "bar"],
2386
+ # "t_a" => [1, 2],
2387
+ # "t_b" => ["a", "b"],
2388
+ # "t_c" => [true, nil],
2389
+ # "t_d" => [[1, 2], [3]],
2390
+ # "after" => ["baz", "womp"]
2391
+ # }
2392
+ # )
2393
+ # .lazy
2394
+ # .select(
2395
+ # ["before", Polars.struct(Polars.col("^t_.$")).alias("t_struct"), "after"]
2396
+ # )
2397
+ # )
2398
+ # df.fetch
2399
+ # # =>
2400
+ # # shape: (2, 3)
2401
+ # # ┌────────┬─────────────────────┬───────┐
2402
+ # # │ before ┆ t_struct ┆ after │
2403
+ # # │ --- ┆ --- ┆ --- │
2404
+ # # │ str ┆ struct[4] ┆ str │
2405
+ # # ╞════════╪═════════════════════╪═══════╡
2406
+ # # │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │
2407
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2408
+ # # │ bar ┆ {2,"b",null,[3]} ┆ womp │
2409
+ # # └────────┴─────────────────────┴───────┘
2410
+ #
2411
+ # @example
2412
+ # df.unnest("t_struct").fetch
2413
+ # # =>
2414
+ # # shape: (2, 6)
2415
+ # # ┌────────┬─────┬─────┬──────┬───────────┬───────┐
2416
+ # # │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │
2417
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
2418
+ # # │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │
2419
+ # # ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
2420
+ # # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │
2421
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2422
+ # # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
2423
+ # # └────────┴─────┴─────┴──────┴───────────┴───────┘
2424
+ def unnest(names)
2425
+ if names.is_a?(String)
2426
+ names = [names]
2427
+ end
2428
+ _from_rbldf(_ldf.unnest(names))
2429
+ end
2430
+
2431
+ private
2432
+
2433
+ def initialize_copy(other)
2434
+ super
2435
+ self._ldf = _ldf._clone
2436
+ end
2437
+
2438
+ def _from_rbldf(rb_ldf)
2439
+ self.class._from_rbldf(rb_ldf)
2440
+ end
2441
+ end
2442
+ end