polars-df 0.10.0-x86_64-linux-musl

Files changed (67)
  1. checksums.yaml +7 -0
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +175 -0
  4. data/Cargo.lock +2536 -0
  5. data/Cargo.toml +6 -0
  6. data/LICENSE-THIRD-PARTY.txt +38726 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +437 -0
  9. data/lib/polars/3.1/polars.so +0 -0
  10. data/lib/polars/3.2/polars.so +0 -0
  11. data/lib/polars/3.3/polars.so +0 -0
  12. data/lib/polars/array_expr.rb +537 -0
  13. data/lib/polars/array_name_space.rb +423 -0
  14. data/lib/polars/batched_csv_reader.rb +98 -0
  15. data/lib/polars/binary_expr.rb +77 -0
  16. data/lib/polars/binary_name_space.rb +66 -0
  17. data/lib/polars/cat_expr.rb +72 -0
  18. data/lib/polars/cat_name_space.rb +125 -0
  19. data/lib/polars/config.rb +530 -0
  20. data/lib/polars/convert.rb +93 -0
  21. data/lib/polars/data_frame.rb +5418 -0
  22. data/lib/polars/data_types.rb +466 -0
  23. data/lib/polars/date_time_expr.rb +1444 -0
  24. data/lib/polars/date_time_name_space.rb +1484 -0
  25. data/lib/polars/dynamic_group_by.rb +52 -0
  26. data/lib/polars/exceptions.rb +31 -0
  27. data/lib/polars/expr.rb +6105 -0
  28. data/lib/polars/expr_dispatch.rb +22 -0
  29. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  30. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  31. data/lib/polars/functions/as_datatype.rb +248 -0
  32. data/lib/polars/functions/col.rb +47 -0
  33. data/lib/polars/functions/eager.rb +182 -0
  34. data/lib/polars/functions/lazy.rb +1280 -0
  35. data/lib/polars/functions/len.rb +49 -0
  36. data/lib/polars/functions/lit.rb +35 -0
  37. data/lib/polars/functions/random.rb +16 -0
  38. data/lib/polars/functions/range/date_range.rb +103 -0
  39. data/lib/polars/functions/range/int_range.rb +51 -0
  40. data/lib/polars/functions/repeat.rb +144 -0
  41. data/lib/polars/functions/whenthen.rb +96 -0
  42. data/lib/polars/functions.rb +57 -0
  43. data/lib/polars/group_by.rb +548 -0
  44. data/lib/polars/io.rb +890 -0
  45. data/lib/polars/lazy_frame.rb +2833 -0
  46. data/lib/polars/lazy_group_by.rb +84 -0
  47. data/lib/polars/list_expr.rb +791 -0
  48. data/lib/polars/list_name_space.rb +445 -0
  49. data/lib/polars/meta_expr.rb +222 -0
  50. data/lib/polars/name_expr.rb +198 -0
  51. data/lib/polars/plot.rb +109 -0
  52. data/lib/polars/rolling_group_by.rb +37 -0
  53. data/lib/polars/series.rb +4527 -0
  54. data/lib/polars/slice.rb +104 -0
  55. data/lib/polars/sql_context.rb +194 -0
  56. data/lib/polars/string_cache.rb +75 -0
  57. data/lib/polars/string_expr.rb +1519 -0
  58. data/lib/polars/string_name_space.rb +810 -0
  59. data/lib/polars/struct_expr.rb +98 -0
  60. data/lib/polars/struct_name_space.rb +96 -0
  61. data/lib/polars/testing.rb +507 -0
  62. data/lib/polars/utils.rb +422 -0
  63. data/lib/polars/version.rb +4 -0
  64. data/lib/polars/whenthen.rb +83 -0
  65. data/lib/polars-df.rb +1 -0
  66. data/lib/polars.rb +72 -0
  67. metadata +125 -0
@@ -0,0 +1,2833 @@
1
+ module Polars
2
+ # Representation of a Lazy computation graph/query against a DataFrame.
3
+ class LazyFrame
4
+ # @private
5
+ attr_accessor :_ldf
6
+
7
+ # Create a new LazyFrame.
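+ #
+ # @example A minimal sketch (hypothetical data):
+ #   lf = Polars::LazyFrame.new({"a" => [1, 2, 3], "b" => ["x", "y", "z"]})
+ #   lf.collect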
8
+ def initialize(data = nil, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
9
+ self._ldf = (
10
+ DataFrame.new(
11
+ data,
12
+ schema: schema,
13
+ schema_overrides: schema_overrides,
14
+ orient: orient,
15
+ infer_schema_length: infer_schema_length,
16
+ nan_to_null: nan_to_null
17
+ )
18
+ .lazy
19
+ ._ldf
20
+ )
21
+ end
22
+
23
+ # @private
24
+ def self._from_rbldf(rb_ldf)
25
+ ldf = LazyFrame.allocate
26
+ ldf._ldf = rb_ldf
27
+ ldf
28
+ end
29
+
30
+ # @private
31
+ def self._scan_csv(
32
+ file,
33
+ has_header: true,
34
+ sep: ",",
35
+ comment_char: nil,
36
+ quote_char: '"',
37
+ skip_rows: 0,
38
+ dtypes: nil,
39
+ null_values: nil,
40
+ ignore_errors: false,
41
+ cache: true,
42
+ with_column_names: nil,
43
+ infer_schema_length: 100,
44
+ n_rows: nil,
45
+ encoding: "utf8",
46
+ low_memory: false,
47
+ rechunk: true,
48
+ skip_rows_after_header: 0,
49
+ row_count_name: nil,
50
+ row_count_offset: 0,
51
+ parse_dates: false,
52
+ eol_char: "\n",
53
+ truncate_ragged_lines: true
54
+ )
55
+ dtype_list = nil
56
+ if !dtypes.nil?
57
+ dtype_list = []
58
+ dtypes.each do |k, v|
59
+ dtype_list << [k, Utils.rb_type_to_dtype(v)]
60
+ end
61
+ end
62
+ processed_null_values = Utils._process_null_values(null_values)
63
+
64
+ _from_rbldf(
65
+ RbLazyFrame.new_from_csv(
66
+ file,
67
+ sep,
68
+ has_header,
69
+ ignore_errors,
70
+ skip_rows,
71
+ n_rows,
72
+ cache,
73
+ dtype_list,
74
+ low_memory,
75
+ comment_char,
76
+ quote_char,
77
+ processed_null_values,
78
+ infer_schema_length,
79
+ with_column_names,
80
+ rechunk,
81
+ skip_rows_after_header,
82
+ encoding,
83
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
84
+ parse_dates,
85
+ eol_char,
86
+ truncate_ragged_lines
87
+ )
88
+ )
89
+ end
90
+
91
+ # @private
92
+ def self._scan_parquet(
93
+ file,
94
+ n_rows: nil,
95
+ cache: true,
96
+ parallel: "auto",
97
+ rechunk: true,
98
+ row_count_name: nil,
99
+ row_count_offset: 0,
100
+ storage_options: nil,
101
+ low_memory: false,
102
+ use_statistics: true,
103
+ hive_partitioning: true
104
+ )
105
+ _from_rbldf(
106
+ RbLazyFrame.new_from_parquet(
107
+ file,
108
+ [],
109
+ n_rows,
110
+ cache,
111
+ parallel,
112
+ rechunk,
113
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
114
+ low_memory,
115
+ use_statistics,
116
+ hive_partitioning,
117
+ nil
118
+ )
119
+ )
120
+ end
121
+
122
+ # @private
123
+ def self._scan_ipc(
124
+ file,
125
+ n_rows: nil,
126
+ cache: true,
127
+ rechunk: true,
128
+ row_count_name: nil,
129
+ row_count_offset: 0,
130
+ storage_options: nil,
131
+ memory_map: true
132
+ )
133
+ if Utils.pathlike?(file)
134
+ file = Utils.normalise_filepath(file)
135
+ end
136
+
137
+ _from_rbldf(
138
+ RbLazyFrame.new_from_ipc(
139
+ file,
140
+ n_rows,
141
+ cache,
142
+ rechunk,
143
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
144
+ memory_map
145
+ )
146
+ )
147
+ end
148
+
149
+ # @private
150
+ def self._scan_ndjson(
151
+ file,
152
+ infer_schema_length: nil,
153
+ batch_size: nil,
154
+ n_rows: nil,
155
+ low_memory: false,
156
+ rechunk: true,
157
+ row_count_name: nil,
158
+ row_count_offset: 0
159
+ )
160
+ _from_rbldf(
161
+ RbLazyFrame.new_from_ndjson(
162
+ file,
163
+ infer_schema_length,
164
+ batch_size,
165
+ n_rows,
166
+ low_memory,
167
+ rechunk,
168
+ Utils._prepare_row_count_args(row_count_name, row_count_offset)
169
+ )
170
+ )
171
+ end
172
+
173
+ # def self.from_json
174
+ # end
175
+
176
+ # Read a logical plan from a JSON file to construct a LazyFrame.
177
+ #
178
+ # @param file [String]
179
+ # Path to a file or a file-like object.
180
+ #
181
+ # @return [LazyFrame]
182
+ def self.read_json(file)
183
+ if Utils.pathlike?(file)
184
+ file = Utils.normalise_filepath(file)
185
+ end
186
+
187
+ Utils.wrap_ldf(RbLazyFrame.read_json(file))
188
+ end
189
+
190
+ # Get or set column names.
191
+ #
192
+ # @return [Array]
193
+ #
194
+ # @example
195
+ # df = (
196
+ # Polars::DataFrame.new(
197
+ # {
198
+ # "foo" => [1, 2, 3],
199
+ # "bar" => [6, 7, 8],
200
+ # "ham" => ["a", "b", "c"]
201
+ # }
202
+ # )
203
+ # .lazy
204
+ # .select(["foo", "bar"])
205
+ # )
206
+ # df.columns
207
+ # # => ["foo", "bar"]
208
+ def columns
209
+ _ldf.columns
210
+ end
211
+
212
+ # Get dtypes of columns in LazyFrame.
213
+ #
214
+ # @return [Array]
215
+ #
216
+ # @example
217
+ # lf = Polars::DataFrame.new(
218
+ # {
219
+ # "foo" => [1, 2, 3],
220
+ # "bar" => [6.0, 7.0, 8.0],
221
+ # "ham" => ["a", "b", "c"]
222
+ # }
223
+ # ).lazy
224
+ # lf.dtypes
225
+ # # => [Polars::Int64, Polars::Float64, Polars::String]
226
+ def dtypes
227
+ _ldf.dtypes
228
+ end
229
+
230
+ # Get the schema.
231
+ #
232
+ # @return [Hash]
233
+ #
234
+ # @example
235
+ # lf = Polars::DataFrame.new(
236
+ # {
237
+ # "foo" => [1, 2, 3],
238
+ # "bar" => [6.0, 7.0, 8.0],
239
+ # "ham" => ["a", "b", "c"]
240
+ # }
241
+ # ).lazy
242
+ # lf.schema
243
+ # # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String}
244
+ def schema
245
+ _ldf.schema
246
+ end
247
+
248
+ # Get the width of the LazyFrame.
249
+ #
250
+ # @return [Integer]
251
+ #
252
+ # @example
253
+ # lf = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]}).lazy
254
+ # lf.width
255
+ # # => 2
256
+ def width
257
+ _ldf.width
258
+ end
259
+
260
+ # Check if the LazyFrame includes the given column.
261
+ #
262
+ # @return [Boolean]
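+ #
+ # @example A minimal sketch (hypothetical data):
+ #   lf = Polars::DataFrame.new({"foo" => [1], "bar" => [2]}).lazy
+ #   lf.include?("foo")
+ #   # => true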
263
+ def include?(key)
264
+ columns.include?(key)
265
+ end
266
+
267
+ # clone handled by initialize_copy
268
+
269
+ # def [](item)
270
+ # end
271
+
272
+ # Returns a string representing the LazyFrame.
273
+ #
274
+ # @return [String]
275
+ def to_s
276
+ <<~EOS
277
+ naive plan: (run LazyFrame#describe_optimized_plan to see the optimized plan)
278
+
279
+ #{describe_plan}
280
+ EOS
281
+ end
282
+
283
+ # Write the logical plan of this LazyFrame to a file or string in JSON format.
284
+ #
285
+ # @param file [String]
286
+ # File path to which the result should be written.
287
+ #
288
+ # @return [nil]
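+ #
+ # @example A round-trip sketch (hypothetical path):
+ #   lf = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy.select("a")
+ #   lf.write_json("plan.json")
+ #   Polars::LazyFrame.read_json("plan.json")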
289
+ def write_json(file)
290
+ if Utils.pathlike?(file)
291
+ file = Utils.normalise_filepath(file)
292
+ end
293
+ _ldf.write_json(file)
294
+ nil
295
+ end
296
+
297
+ # Offers a structured way to apply a sequence of user-defined functions (UDFs).
298
+ #
299
+ # @param func [Object]
300
+ # Callable; will receive the frame as the first parameter,
301
+ # followed by any given args/kwargs.
302
+ # @param args [Object]
303
+ # Arguments to pass to the UDF.
304
+ # @param kwargs [Object]
305
+ # Keyword arguments to pass to the UDF.
306
+ #
307
+ # @return [LazyFrame]
308
+ #
309
+ # @example
310
+ # cast_str_to_int = lambda do |data, col_name:|
311
+ # data.with_column(Polars.col(col_name).cast(:i64))
312
+ # end
313
+ #
314
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]}).lazy
315
+ # df.pipe(cast_str_to_int, col_name: "b").collect
316
+ # # =>
317
+ # # shape: (4, 2)
318
+ # # ┌─────┬─────┐
319
+ # # │ a ┆ b │
320
+ # # │ --- ┆ --- │
321
+ # # │ i64 ┆ i64 │
322
+ # # ╞═════╪═════╡
323
+ # # │ 1 ┆ 10 │
324
+ # # │ 2 ┆ 20 │
325
+ # # │ 3 ┆ 30 │
326
+ # # │ 4 ┆ 40 │
327
+ # # └─────┴─────┘
328
+ def pipe(func, *args, **kwargs, &block)
329
+ func.call(self, *args, **kwargs, &block)
330
+ end
331
+
332
+ # Create a string representation of the unoptimized query plan.
333
+ #
334
+ # @return [String]
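+ #
+ # @example A minimal sketch (hypothetical data; output omitted):
+ #   lf = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy.filter(Polars.col("a") > 1)
+ #   puts lf.describe_plan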
335
+ def describe_plan
336
+ _ldf.describe_plan
337
+ end
338
+
339
+ # Create a string representation of the optimized query plan.
340
+ #
341
+ # @return [String]
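+ #
+ # @example A minimal sketch (hypothetical data; output omitted):
+ #   lf = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy.select(Polars.col("a") * 2)
+ #   puts lf.describe_optimized_plan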
342
+ def describe_optimized_plan(
343
+ type_coercion: true,
344
+ predicate_pushdown: true,
345
+ projection_pushdown: true,
346
+ simplify_expression: true,
347
+ slice_pushdown: true,
348
+ common_subplan_elimination: true,
349
+ comm_subexpr_elim: true,
350
+ allow_streaming: false
351
+ )
352
+ ldf = _ldf.optimization_toggle(
353
+ type_coercion,
354
+ predicate_pushdown,
355
+ projection_pushdown,
356
+ simplify_expression,
357
+ slice_pushdown,
358
+ common_subplan_elimination,
359
+ comm_subexpr_elim,
360
+ allow_streaming,
361
+ false
362
+ )
363
+
364
+ ldf.describe_optimized_plan
365
+ end
366
+
367
+ # def show_graph
368
+ # end
369
+
370
+ # Sort the DataFrame.
371
+ #
372
+ # Sorting can be done by:
373
+ #
374
+ # - A single column name
375
+ # - An expression
376
+ # - Multiple expressions
377
+ #
378
+ # @param by [Object]
379
+ # Column (expressions) to sort by.
380
+ # @param reverse [Boolean]
381
+ # Sort in descending order.
382
+ # @param nulls_last [Boolean]
383
+ # Place null values last. Can only be used if sorted by a single column.
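+ # @param maintain_order [Boolean]
+ # Whether the order should be maintained if elements are equal.
+ # @param multithreaded [Boolean]
+ # Sort using multiple threads.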
384
+ #
385
+ # @return [LazyFrame]
386
+ #
387
+ # @example
388
+ # df = Polars::DataFrame.new(
389
+ # {
390
+ # "foo" => [1, 2, 3],
391
+ # "bar" => [6.0, 7.0, 8.0],
392
+ # "ham" => ["a", "b", "c"]
393
+ # }
394
+ # ).lazy
395
+ # df.sort("foo", reverse: true).collect
396
+ # # =>
397
+ # # shape: (3, 3)
398
+ # # ┌─────┬─────┬─────┐
399
+ # # │ foo ┆ bar ┆ ham │
400
+ # # │ --- ┆ --- ┆ --- │
401
+ # # │ i64 ┆ f64 ┆ str │
402
+ # # ╞═════╪═════╪═════╡
403
+ # # │ 3 ┆ 8.0 ┆ c │
404
+ # # │ 2 ┆ 7.0 ┆ b │
405
+ # # │ 1 ┆ 6.0 ┆ a │
406
+ # # └─────┴─────┴─────┘
407
+ def sort(by, reverse: false, nulls_last: false, maintain_order: false, multithreaded: true)
408
+ if by.is_a?(::String)
409
+ return _from_rbldf(_ldf.sort(by, reverse, nulls_last, maintain_order, multithreaded))
410
+ end
411
+ if Utils.bool?(reverse)
412
+ reverse = [reverse]
413
+ end
414
+
415
+ by = Utils.selection_to_rbexpr_list(by)
416
+ _from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last, maintain_order, multithreaded))
417
+ end
418
+
419
+ # def profile
420
+ # end
421
+
422
+ # Collect into a DataFrame.
423
+ #
424
+ # Note: use {#fetch} if you want to run your query on the first `n` rows
425
+ # only. This can be a huge time saver in debugging queries.
426
+ #
427
+ # @param type_coercion [Boolean]
428
+ # Do type coercion optimization.
429
+ # @param predicate_pushdown [Boolean]
430
+ # Do predicate pushdown optimization.
431
+ # @param projection_pushdown [Boolean]
432
+ # Do projection pushdown optimization.
433
+ # @param simplify_expression [Boolean]
434
+ # Run simplify expressions optimization.
435
+ # @param string_cache [Boolean]
436
+ # This argument is deprecated. Please set the string cache globally.
437
+ # The argument will be ignored.
438
+ # @param no_optimization [Boolean]
439
+ # Turn off (certain) optimizations.
440
+ # @param slice_pushdown [Boolean]
441
+ # Slice pushdown optimization.
442
+ # @param common_subplan_elimination [Boolean]
443
+ # Will try to cache branching subplans that occur on self-joins or unions.
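+ # @param comm_subexpr_elim [Boolean]
+ # Common subexpressions will be cached and reused.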
444
+ # @param allow_streaming [Boolean]
445
+ # Run parts of the query in a streaming fashion (this is in an alpha state)
446
+ #
447
+ # @return [DataFrame]
448
+ #
449
+ # @example
450
+ # df = Polars::DataFrame.new(
451
+ # {
452
+ # "a" => ["a", "b", "a", "b", "b", "c"],
453
+ # "b" => [1, 2, 3, 4, 5, 6],
454
+ # "c" => [6, 5, 4, 3, 2, 1]
455
+ # }
456
+ # ).lazy
457
+ # df.group_by("a", maintain_order: true).agg(Polars.all.sum).collect
458
+ # # =>
459
+ # # shape: (3, 3)
460
+ # # ┌─────┬─────┬─────┐
461
+ # # │ a ┆ b ┆ c │
462
+ # # │ --- ┆ --- ┆ --- │
463
+ # # │ str ┆ i64 ┆ i64 │
464
+ # # ╞═════╪═════╪═════╡
465
+ # # │ a ┆ 4 ┆ 10 │
466
+ # # │ b ┆ 11 ┆ 10 │
467
+ # # │ c ┆ 6 ┆ 1 │
468
+ # # └─────┴─────┴─────┘
469
+ def collect(
470
+ type_coercion: true,
471
+ predicate_pushdown: true,
472
+ projection_pushdown: true,
473
+ simplify_expression: true,
474
+ string_cache: false,
475
+ no_optimization: false,
476
+ slice_pushdown: true,
477
+ common_subplan_elimination: true,
478
+ comm_subexpr_elim: true,
479
+ allow_streaming: false,
480
+ _eager: false
481
+ )
482
+ if no_optimization
483
+ predicate_pushdown = false
484
+ projection_pushdown = false
485
+ slice_pushdown = false
486
+ common_subplan_elimination = false
487
+ comm_subexpr_elim = false
488
+ end
489
+
490
+ if allow_streaming
491
+ common_subplan_elimination = false
492
+ end
493
+
494
+ ldf = _ldf.optimization_toggle(
495
+ type_coercion,
496
+ predicate_pushdown,
497
+ projection_pushdown,
498
+ simplify_expression,
499
+ slice_pushdown,
500
+ common_subplan_elimination,
501
+ comm_subexpr_elim,
502
+ allow_streaming,
503
+ _eager
504
+ )
505
+ Utils.wrap_df(ldf.collect)
506
+ end
507
+
508
+ # Persists a LazyFrame at the provided path.
509
+ #
510
+ # This allows streaming results that are larger than RAM to be written to disk.
511
+ #
512
+ # @param path [String]
513
+ # File path to which the file should be written.
514
+ # @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
515
+ # Choose "zstd" for good compression performance.
516
+ # Choose "lz4" for fast compression/decompression.
517
+ # Choose "snappy" for more backwards compatibility guarantees
518
+ # when you deal with older parquet readers.
519
+ # @param compression_level [Integer]
520
+ # The level of compression to use. Higher compression means smaller files on
521
+ # disk.
522
+ #
523
+ # - "gzip" : min-level: 0, max-level: 10.
524
+ # - "brotli" : min-level: 0, max-level: 11.
525
+ # - "zstd" : min-level: 1, max-level: 22.
526
+ # @param statistics [Boolean]
527
+ # Write statistics to the parquet headers. This requires extra compute.
528
+ # @param row_group_size [Integer]
529
+ # Size of the row groups in number of rows.
530
+ # If `nil` (default), the chunks of the `DataFrame` are
531
+ # used. Writing in smaller chunks may reduce memory pressure and improve
532
+ # writing speeds.
533
+ # @param data_pagesize_limit [Integer]
534
+ # Size limit of individual data pages.
535
+ # If not set, defaults to 1024 * 1024 bytes.
536
+ # @param maintain_order [Boolean]
537
+ # Maintain the order in which data is processed.
538
+ # Setting this to `false` will be slightly faster.
539
+ # @param type_coercion [Boolean]
540
+ # Do type coercion optimization.
541
+ # @param predicate_pushdown [Boolean]
542
+ # Do predicate pushdown optimization.
543
+ # @param projection_pushdown [Boolean]
544
+ # Do projection pushdown optimization.
545
+ # @param simplify_expression [Boolean]
546
+ # Run simplify expressions optimization.
547
+ # @param no_optimization [Boolean]
548
+ # Turn off (certain) optimizations.
549
+ # @param slice_pushdown [Boolean]
550
+ # Slice pushdown optimization.
551
+ #
552
+ # @return [nil]
553
+ #
554
+ # @example
555
+ # lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
556
+ # lf.sink_parquet("out.parquet")
557
+ def sink_parquet(
558
+ path,
559
+ compression: "zstd",
560
+ compression_level: nil,
561
+ statistics: false,
562
+ row_group_size: nil,
563
+ data_pagesize_limit: nil,
564
+ maintain_order: true,
565
+ type_coercion: true,
566
+ predicate_pushdown: true,
567
+ projection_pushdown: true,
568
+ simplify_expression: true,
569
+ no_optimization: false,
570
+ slice_pushdown: true
571
+ )
572
+ lf = _set_sink_optimizations(
573
+ type_coercion: type_coercion,
574
+ predicate_pushdown: predicate_pushdown,
575
+ projection_pushdown: projection_pushdown,
576
+ simplify_expression: simplify_expression,
577
+ slice_pushdown: slice_pushdown,
578
+ no_optimization: no_optimization
579
+ )
580
+
581
+ lf.sink_parquet(
582
+ path,
583
+ compression,
584
+ compression_level,
585
+ statistics,
586
+ row_group_size,
587
+ data_pagesize_limit,
588
+ maintain_order
589
+ )
590
+ end
591
+
592
+ # Evaluate the query in streaming mode and write to an IPC file.
593
+ #
594
+ # This allows streaming results that are larger than RAM to be written to disk.
595
+ #
596
+ # @param path [String]
597
+ # File path to which the file should be written.
598
+ # @param compression ["lz4", "zstd"]
599
+ # Choose "zstd" for good compression performance.
600
+ # Choose "lz4" for fast compression/decompression.
601
+ # @param maintain_order [Boolean]
602
+ # Maintain the order in which data is processed.
603
+ # Setting this to `false` will be slightly faster.
604
+ # @param type_coercion [Boolean]
605
+ # Do type coercion optimization.
606
+ # @param predicate_pushdown [Boolean]
607
+ # Do predicate pushdown optimization.
608
+ # @param projection_pushdown [Boolean]
609
+ # Do projection pushdown optimization.
610
+ # @param simplify_expression [Boolean]
611
+ # Run simplify expressions optimization.
612
+ # @param slice_pushdown [Boolean]
613
+ # Slice pushdown optimization.
614
+ # @param no_optimization [Boolean]
615
+ # Turn off (certain) optimizations.
616
+ #
617
+ # @return [nil]
618
+ #
619
+ # @example
620
+ # lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
621
+ # lf.sink_ipc("out.arrow")
622
+ def sink_ipc(
623
+ path,
624
+ compression: "zstd",
625
+ maintain_order: true,
626
+ type_coercion: true,
627
+ predicate_pushdown: true,
628
+ projection_pushdown: true,
629
+ simplify_expression: true,
630
+ slice_pushdown: true,
631
+ no_optimization: false
632
+ )
633
+ lf = _set_sink_optimizations(
634
+ type_coercion: type_coercion,
635
+ predicate_pushdown: predicate_pushdown,
636
+ projection_pushdown: projection_pushdown,
637
+ simplify_expression: simplify_expression,
638
+ slice_pushdown: slice_pushdown,
639
+ no_optimization: no_optimization
640
+ )
641
+
642
+ lf.sink_ipc(
643
+ path,
644
+ compression,
645
+ maintain_order
646
+ )
647
+ end
648
+
649
+ # Evaluate the query in streaming mode and write to a CSV file.
650
+ #
651
+ # This allows streaming results that are larger than RAM to be written to disk.
652
+ #
653
+ # @param path [String]
654
+ # File path to which the file should be written.
655
+ # @param include_bom [Boolean]
656
+ # Whether to include UTF-8 BOM in the CSV output.
657
+ # @param include_header [Boolean]
658
+ # Whether to include header in the CSV output.
659
+ # @param separator [String]
660
+ # Separate CSV fields with this symbol.
661
+ # @param line_terminator [String]
662
+ # String used to end each row.
663
+ # @param quote_char [String]
664
+ # Byte to use as quoting character.
665
+ # @param batch_size [Integer]
666
+ # Number of rows that will be processed per thread.
667
+ # @param datetime_format [String]
668
+ # A format string, with the specifiers defined by the
669
+ # chrono (https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
670
+ # Rust crate. If no format is specified, the default fractional-second
671
+ # precision is inferred from the maximum timeunit found in the frame's
672
+ # Datetime columns (if any).
673
+ # @param date_format [String]
674
+ # A format string, with the specifiers defined by the
675
+ # chrono (https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
676
+ # Rust crate.
677
+ # @param time_format [String]
678
+ # A format string, with the specifiers defined by the
679
+ # chrono (https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
680
+ # Rust crate.
681
+ # @param float_precision [Integer]
682
+ # Number of decimal places to write, applied to both `Float32` and
683
+ # `Float64` datatypes.
684
+ # @param null_value [String]
685
+ # A string representing null values (defaulting to the empty string).
686
+ # @param quote_style ["necessary", "always", "non_numeric", "never"]
687
+ # Determines the quoting strategy used.
688
+ #
689
+ # - necessary (default): This puts quotes around fields only when necessary.
690
+ # They are necessary when fields contain a quote,
691
+ # delimiter or record terminator.
692
+ # Quotes are also necessary when writing an empty record
693
+ # (which is indistinguishable from a record with one empty field).
694
+ # This is the default.
695
+ # - always: This puts quotes around every field. Always.
696
+ # - never: This never puts quotes around fields, even if that results in
697
+ # invalid CSV data (e.g.: by not quoting strings containing the
698
+ # separator).
699
+ # - non_numeric: This puts quotes around all fields that are non-numeric.
700
+ # Namely, when writing a field that does not parse as a valid float
701
+ # or integer, then quotes will be used even if they aren't strictly
702
+ # necessary.
703
+ # @param maintain_order [Boolean]
704
+ # Maintain the order in which data is processed.
705
+ # Setting this to `false` will be slightly faster.
706
+ # @param type_coercion [Boolean]
707
+ # Do type coercion optimization.
708
+ # @param predicate_pushdown [Boolean]
709
+ # Do predicate pushdown optimization.
710
+ # @param projection_pushdown [Boolean]
711
+ # Do projection pushdown optimization.
712
+ # @param simplify_expression [Boolean]
713
+ # Run simplify expressions optimization.
714
+ # @param slice_pushdown [Boolean]
715
+ # Slice pushdown optimization.
716
+ # @param no_optimization [Boolean]
717
+ # Turn off (certain) optimizations.
718
+ #
719
+ # @return [nil]
720
+ #
721
+ # @example
722
+ # lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
723
+ # lf.sink_csv("out.csv")
724
+ def sink_csv(
725
+ path,
726
+ include_bom: false,
727
+ include_header: true,
728
+ separator: ",",
729
+ line_terminator: "\n",
730
+ quote_char: '"',
731
+ batch_size: 1024,
732
+ datetime_format: nil,
733
+ date_format: nil,
734
+ time_format: nil,
735
+ float_precision: nil,
736
+ null_value: nil,
737
+ quote_style: nil,
738
+ maintain_order: true,
739
+ type_coercion: true,
740
+ predicate_pushdown: true,
741
+ projection_pushdown: true,
742
+ simplify_expression: true,
743
+ slice_pushdown: true,
744
+ no_optimization: false
745
+ )
746
+ Utils._check_arg_is_1byte("separator", separator, false)
747
+ Utils._check_arg_is_1byte("quote_char", quote_char, false)
748
+
749
+ lf = _set_sink_optimizations(
750
+ type_coercion: type_coercion,
751
+ predicate_pushdown: predicate_pushdown,
752
+ projection_pushdown: projection_pushdown,
753
+ simplify_expression: simplify_expression,
754
+ slice_pushdown: slice_pushdown,
755
+ no_optimization: no_optimization
756
+ )
757
+
758
+ lf.sink_csv(
759
+ path,
760
+ include_bom,
761
+ include_header,
762
+ separator.ord,
763
+ line_terminator,
764
+ quote_char.ord,
765
+ batch_size,
766
+ datetime_format,
767
+ date_format,
768
+ time_format,
769
+ float_precision,
770
+ null_value,
771
+ quote_style,
772
+ maintain_order
773
+ )
774
+ end
775
+
776
+ # Evaluate the query in streaming mode and write to an NDJSON file.
777
+ #
778
+ # This allows streaming results that are larger than RAM to be written to disk.
779
+ #
780
+ # @param path [String]
781
+ # File path to which the file should be written.
782
+ # @param maintain_order [Boolean]
783
+ # Maintain the order in which data is processed.
784
+ # Setting this to `false` will be slightly faster.
785
+ # @param type_coercion [Boolean]
786
+ # Do type coercion optimization.
787
+ # @param predicate_pushdown [Boolean]
788
+ # Do predicate pushdown optimization.
789
+ # @param projection_pushdown [Boolean]
790
+ # Do projection pushdown optimization.
791
+ # @param simplify_expression [Boolean]
792
+ # Run simplify expressions optimization.
793
+ # @param slice_pushdown [Boolean]
794
+ # Slice pushdown optimization.
795
+ # @param no_optimization [Boolean]
796
+ # Turn off (certain) optimizations.
797
+ #
798
+ # @return [nil]
799
+ #
800
+ # @example
801
+ # lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
802
+ # lf.sink_ndjson("out.ndjson")
803
+ def sink_ndjson(
804
+ path,
805
+ maintain_order: true,
806
+ type_coercion: true,
807
+ predicate_pushdown: true,
808
+ projection_pushdown: true,
809
+ simplify_expression: true,
810
+ slice_pushdown: true,
811
+ no_optimization: false
812
+ )
813
+ lf = _set_sink_optimizations(
814
+ type_coercion: type_coercion,
815
+ predicate_pushdown: predicate_pushdown,
816
+ projection_pushdown: projection_pushdown,
817
+ simplify_expression: simplify_expression,
818
+ slice_pushdown: slice_pushdown,
819
+ no_optimization: no_optimization
820
+ )
821
+
822
+ lf.sink_json(path, maintain_order)
823
+ end
824
+
825
+ # @private
826
+ def _set_sink_optimizations(
827
+ type_coercion: true,
828
+ predicate_pushdown: true,
829
+ projection_pushdown: true,
830
+ simplify_expression: true,
831
+ slice_pushdown: true,
832
+ no_optimization: false
833
+ )
834
+ if no_optimization
835
+ predicate_pushdown = false
836
+ projection_pushdown = false
837
+ slice_pushdown = false
838
+ end
839
+
840
+ _ldf.optimization_toggle(
841
+ type_coercion,
842
+ predicate_pushdown,
843
+ projection_pushdown,
844
+ simplify_expression,
845
+ slice_pushdown,
846
+ false,
847
+ false,
848
+ true,
849
+ false
850
+ )
851
+ end
852
+
853
+ # Collect a small number of rows for debugging purposes.
854
+ #
855
+ # Fetch is like a {#collect} operation, but it overwrites the number of rows
856
+ # read by every scan operation. This is a utility that helps debug a query on a
857
+ # smaller number of rows.
858
+ #
859
+ # Note that the fetch does not guarantee the final number of rows in the
860
+ # DataFrame. Filters, join operations, and a lower number of rows available in the
861
+ # scanned file all influence the final number of rows.
862
+ #
863
+ # @param n_rows [Integer]
864
+ # Collect n_rows from the data sources.
865
+ # @param type_coercion [Boolean]
866
+ # Run type coercion optimization.
867
+ # @param predicate_pushdown [Boolean]
868
+ # Run predicate pushdown optimization.
869
+ # @param projection_pushdown [Boolean]
870
+ # Run projection pushdown optimization.
871
+ # @param simplify_expression [Boolean]
872
+ # Run simplify expressions optimization.
873
+ # @param string_cache [Boolean]
874
+ # This argument is deprecated. Please set the string cache globally.
875
+ # The argument will be ignored.
876
+ # @param no_optimization [Boolean]
877
+ # Turn off optimizations.
878
+ # @param slice_pushdown [Boolean]
879
+ # Slice pushdown optimization
880
+ # @param common_subplan_elimination [Boolean]
881
+ # Will try to cache branching subplans that occur on self-joins or unions.
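+ # @param comm_subexpr_elim [Boolean]
+ # Common subexpressions will be cached and reused.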
882
+ # @param allow_streaming [Boolean]
883
+ # Run parts of the query in a streaming fashion (this is in an alpha state)
884
+ #
885
+ # @return [DataFrame]
886
+ #
887
+ # @example
888
+ # df = Polars::DataFrame.new(
889
+ # {
890
+ # "a" => ["a", "b", "a", "b", "b", "c"],
891
+ # "b" => [1, 2, 3, 4, 5, 6],
892
+ # "c" => [6, 5, 4, 3, 2, 1]
893
+ # }
894
+ # ).lazy
895
+ # df.group_by("a", maintain_order: true).agg(Polars.all.sum).fetch(2)
896
+ # # =>
897
+ # # shape: (2, 3)
898
+ # # ┌─────┬─────┬─────┐
899
+ # # │ a ┆ b ┆ c │
900
+ # # │ --- ┆ --- ┆ --- │
901
+ # # │ str ┆ i64 ┆ i64 │
902
+ # # ╞═════╪═════╪═════╡
903
+ # # │ a ┆ 1 ┆ 6 │
904
+ # # │ b ┆ 2 ┆ 5 │
905
+ # # └─────┴─────┴─────┘
906
+ def fetch(
907
+ n_rows = 500,
908
+ type_coercion: true,
909
+ predicate_pushdown: true,
910
+ projection_pushdown: true,
911
+ simplify_expression: true,
912
+ string_cache: false,
913
+ no_optimization: false,
914
+ slice_pushdown: true,
915
+ common_subplan_elimination: true,
916
+ comm_subexpr_elim: true,
917
+ allow_streaming: false
918
+ )
919
+ if no_optimization
920
+ predicate_pushdown = false
921
+ projection_pushdown = false
922
+ slice_pushdown = false
923
+ common_subplan_elimination = false
924
+ end
925
+
926
+ ldf = _ldf.optimization_toggle(
927
+ type_coercion,
928
+ predicate_pushdown,
929
+ projection_pushdown,
930
+ simplify_expression,
931
+ slice_pushdown,
932
+ common_subplan_elimination,
933
+ comm_subexpr_elim,
934
+ allow_streaming,
935
+ false
936
+ )
937
+ Utils.wrap_df(ldf.fetch(n_rows))
938
+ end
939
+
940
+ # Return lazy representation, i.e. itself.
941
+ #
942
+ # Useful for writing code that expects either a `DataFrame` or
943
+ # `LazyFrame`.
944
+ #
945
+ # @return [LazyFrame]
946
+ #
947
+ # @example
948
+ # df = Polars::DataFrame.new(
949
+ # {
950
+ # "a" => [nil, 2, 3, 4],
951
+ # "b" => [0.5, nil, 2.5, 13],
952
+ # "c" => [true, true, false, nil]
953
+ # }
954
+ # )
955
+ # df.lazy
956
+ def lazy
957
+ self
958
+ end
959
+
960
+ # Cache the result once the execution of the physical plan hits this node.
961
+ #
962
+ # @return [LazyFrame]
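+ #
+ # @example A minimal sketch (hypothetical data):
+ #   lf = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy.select(Polars.col("a") * 2).cache
+ #   # the cached node can be reused when the plan branches, e.g. in a self-join
+ #   lf.join(lf, on: "a").collect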
963
+ def cache
964
+ _from_rbldf(_ldf.cache)
965
+ end
966
+
967
+ # TODO
968
+ # def cast
969
+ # end
970
+
971
+ # Create an empty copy of the current LazyFrame.
972
+ #
973
+ # The copy has an identical schema but no data.
974
+ #
975
+ # @return [LazyFrame]
976
+ #
977
+ # @example
978
+ # lf = Polars::LazyFrame.new(
979
+ # {
980
+ # "a" => [nil, 2, 3, 4],
981
+ # "b" => [0.5, nil, 2.5, 13],
982
+ # "c" => [true, true, false, nil],
983
+ # }
984
+ # )
985
+ # lf.clear.fetch
986
+ # # =>
987
+ # # shape: (0, 3)
988
+ # # ┌─────┬─────┬──────┐
989
+ # # │ a ┆ b ┆ c │
990
+ # # │ --- ┆ --- ┆ --- │
991
+ # # │ i64 ┆ f64 ┆ bool │
992
+ # # ╞═════╪═════╪══════╡
993
+ # # └─────┴─────┴──────┘
994
+ #
995
+ # @example
996
+ # lf.clear(2).fetch
997
+ # # =>
998
+ # # shape: (2, 3)
999
+ # # ┌──────┬──────┬──────┐
1000
+ # # │ a ┆ b ┆ c │
1001
+ # # │ --- ┆ --- ┆ --- │
1002
+ # # │ i64 ┆ f64 ┆ bool │
1003
+ # # ╞══════╪══════╪══════╡
1004
+ # # │ null ┆ null ┆ null │
1005
+ # # │ null ┆ null ┆ null │
1006
+ # # └──────┴──────┴──────┘
1007
+ def clear(n = 0)
1008
+ DataFrame.new(columns: schema).clear(n).lazy
1009
+ end
1010
+ alias_method :cleared, :clear
1011
+
1012
+ # Filter the rows in the DataFrame based on a predicate expression.
1013
+ #
1014
+ # @param predicate [Object]
1015
+ # Expression that evaluates to a boolean Series.
1016
+ #
1017
+ # @return [LazyFrame]
1018
+ #
1019
+ # @example Filter on one condition:
1020
+ # lf = Polars::DataFrame.new(
1021
+ # {
1022
+ # "foo" => [1, 2, 3],
1023
+ # "bar" => [6, 7, 8],
1024
+ # "ham" => ["a", "b", "c"]
1025
+ # }
1026
+ # ).lazy
1027
+ # lf.filter(Polars.col("foo") < 3).collect
1028
+ # # =>
1029
+ # # shape: (2, 3)
1030
+ # # ┌─────┬─────┬─────┐
1031
+ # # │ foo ┆ bar ┆ ham │
1032
+ # # │ --- ┆ --- ┆ --- │
1033
+ # # │ i64 ┆ i64 ┆ str │
1034
+ # # ╞═════╪═════╪═════╡
1035
+ # # │ 1 ┆ 6 ┆ a │
1036
+ # # │ 2 ┆ 7 ┆ b │
1037
+ # # └─────┴─────┴─────┘
1038
+ #
1039
+ # @example Filter on multiple conditions:
1040
+ # lf.filter((Polars.col("foo") < 3) & (Polars.col("ham") == "a")).collect
1041
+ # # =>
1042
+ # # shape: (1, 3)
1043
+ # # ┌─────┬─────┬─────┐
1044
+ # # │ foo ┆ bar ┆ ham │
1045
+ # # │ --- ┆ --- ┆ --- │
1046
+ # # │ i64 ┆ i64 ┆ str │
1047
+ # # ╞═════╪═════╪═════╡
1048
+ # # │ 1 ┆ 6 ┆ a │
1049
+ # # └─────┴─────┴─────┘
1050
+ def filter(predicate)
1051
+ _from_rbldf(
1052
+ _ldf.filter(
1053
+ Utils.expr_to_lit_or_expr(predicate, str_to_lit: false)._rbexpr
1054
+ )
1055
+ )
1056
+ end
1057
+
1058
+ # Select columns from this DataFrame.
1059
+ #
1060
+ # @param exprs [Array]
1061
+ # Column(s) to select, specified as positional arguments.
1062
+ # Accepts expression input. Strings are parsed as column names,
1063
+ # other non-expression inputs are parsed as literals.
1064
+ # @param named_exprs [Hash]
1065
+ # Additional columns to select, specified as keyword arguments.
1066
+ # The columns will be renamed to the keyword used.
1067
+ #
1068
+ # @return [LazyFrame]
1069
+ #
1070
+ # @example
1071
+ # df = Polars::DataFrame.new(
1072
+ # {
1073
+ # "foo" => [1, 2, 3],
1074
+ # "bar" => [6, 7, 8],
1075
+ # "ham" => ["a", "b", "c"],
1076
+ # }
1077
+ # ).lazy
1078
+ # df.select("foo").collect
1079
+ # # =>
1080
+ # # shape: (3, 1)
1081
+ # # ┌─────┐
1082
+ # # │ foo │
1083
+ # # │ --- │
1084
+ # # │ i64 │
1085
+ # # ╞═════╡
1086
+ # # │ 1 │
1087
+ # # │ 2 │
1088
+ # # │ 3 │
1089
+ # # └─────┘
1090
+ #
1091
+ # @example
1092
+ # df.select(["foo", "bar"]).collect
1093
+ # # =>
1094
+ # # shape: (3, 2)
1095
+ # # ┌─────┬─────┐
1096
+ # # │ foo ┆ bar │
1097
+ # # │ --- ┆ --- │
1098
+ # # │ i64 ┆ i64 │
1099
+ # # ╞═════╪═════╡
1100
+ # # │ 1 ┆ 6 │
1101
+ # # │ 2 ┆ 7 │
1102
+ # # │ 3 ┆ 8 │
1103
+ # # └─────┴─────┘
1104
+ #
1105
+ # @example
1106
+ # df.select(Polars.col("foo") + 1).collect
1107
+ # # =>
1108
+ # # shape: (3, 1)
1109
+ # # ┌─────┐
1110
+ # # │ foo │
1111
+ # # │ --- │
1112
+ # # │ i64 │
1113
+ # # ╞═════╡
1114
+ # # │ 2 │
1115
+ # # │ 3 │
1116
+ # # │ 4 │
1117
+ # # └─────┘
1118
+ #
1119
+ # @example
1120
+ # df.select([Polars.col("foo") + 1, Polars.col("bar") + 1]).collect
1121
+ # # =>
1122
+ # # shape: (3, 2)
1123
+ # # ┌─────┬─────┐
1124
+ # # │ foo ┆ bar │
1125
+ # # │ --- ┆ --- │
1126
+ # # │ i64 ┆ i64 │
1127
+ # # ╞═════╪═════╡
1128
+ # # │ 2 ┆ 7 │
1129
+ # # │ 3 ┆ 8 │
1130
+ # # │ 4 ┆ 9 │
1131
+ # # └─────┴─────┘
1132
+ #
1133
+ # @example
1134
+ # df.select(Polars.when(Polars.col("foo") > 2).then(10).otherwise(0)).collect
1135
+ # # =>
1136
+ # # shape: (3, 1)
1137
+ # # ┌─────────┐
1138
+ # # │ literal │
1139
+ # # │ --- │
1140
+ # # │ i64 │
1141
+ # # ╞═════════╡
1142
+ # # │ 0 │
1143
+ # # │ 0 │
1144
+ # # │ 10 │
1145
+ # # └─────────┘
1146
+ def select(*exprs, **named_exprs)
1147
+ structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
1148
+
1149
+ rbexprs = Utils.parse_as_list_of_expressions(
1150
+ *exprs, **named_exprs, __structify: structify
1151
+ )
1152
+ _from_rbldf(_ldf.select(rbexprs))
1153
+ end
1154
+
1155
+ # Start a group by operation.
1156
+ #
1157
+ # @param by [Object]
1158
+ # Column(s) to group by.
1159
+ # @param maintain_order [Boolean]
1160
+ # Make sure that the order of the groups remains consistent. This is more
1161
+ # expensive than a default group by.
1162
+ #
1163
+ # @return [LazyGroupBy]
1164
+ #
1165
+ # @example
1166
+ # df = Polars::DataFrame.new(
1167
+ # {
1168
+ # "a" => ["a", "b", "a", "b", "b", "c"],
1169
+ # "b" => [1, 2, 3, 4, 5, 6],
1170
+ # "c" => [6, 5, 4, 3, 2, 1]
1171
+ # }
1172
+ # ).lazy
1173
+ # df.group_by("a", maintain_order: true).agg(Polars.col("b").sum).collect
1174
+ # # =>
1175
+ # # shape: (3, 2)
1176
+ # # ┌─────┬─────┐
1177
+ # # │ a ┆ b │
1178
+ # # │ --- ┆ --- │
1179
+ # # │ str ┆ i64 │
1180
+ # # ╞═════╪═════╡
1181
+ # # │ a ┆ 4 │
1182
+ # # │ b ┆ 11 │
1183
+ # # │ c ┆ 6 │
1184
+ # # └─────┴─────┘
1185
+ def group_by(by, maintain_order: false)
1186
+ rbexprs_by = Utils.selection_to_rbexpr_list(by)
1187
+ lgb = _ldf.group_by(rbexprs_by, maintain_order)
1188
+ LazyGroupBy.new(lgb)
1189
+ end
1190
+ alias_method :groupby, :group_by
1191
+ alias_method :group, :group_by
1192
+
1193
+ # Create rolling groups based on a time column.
1194
+ #
1195
+ # Also works for index values of type `:i32` or `:i64`.
1196
+ #
1197
+ # Unlike a `dynamic_group_by`, the windows are determined by the
1198
+ # individual values rather than constant intervals. For constant intervals
1199
+ # use *group_by_dynamic*.
1200
+ #
1201
+ # The `period` and `offset` arguments are created either from a timedelta, or
1202
+ # by using the following string language:
1203
+ #
1204
+ # - 1ns (1 nanosecond)
1205
+ # - 1us (1 microsecond)
1206
+ # - 1ms (1 millisecond)
1207
+ # - 1s (1 second)
1208
+ # - 1m (1 minute)
1209
+ # - 1h (1 hour)
1210
+ # - 1d (1 day)
1211
+ # - 1w (1 week)
1212
+ # - 1mo (1 calendar month)
1213
+ # - 1y (1 calendar year)
1214
+ # - 1i (1 index count)
1215
+ #
1216
+ # Or combine them:
1217
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
1218
+ #
1219
+ # In case of a group_by_rolling on an integer column, the windows are defined by:
1220
+ #
1221
+ # - "1i" # length 1
1222
+ # - "10i" # length 10
1223
+ #
1224
+ # @param index_column [Object]
1225
+ # Column used to group based on the time window.
1226
+ # Often of type Date/Datetime.
1227
+ # This column must be sorted in ascending order. If not the output will not
1228
+ # make sense.
1229
+ #
1230
+ # In case of a rolling group by on indices, dtype needs to be one of
1231
+ # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
1232
+ # performance matters use an `:i64` column.
1233
+ # @param period [Object]
1234
+ # Length of the window.
1235
+ # @param offset [Object]
1236
+ # Offset of the window. Default is -period.
1237
+ # @param closed ["right", "left", "both", "none"]
1238
+ # Define whether the temporal window interval is closed or not.
1239
+ # @param by [Object]
1240
+ # Also group by this column/these columns.
1241
+ # @param check_sorted [Boolean]
1242
+ # When the `by` argument is given, Polars cannot check sortedness
1243
+ # by the metadata and has to do a full scan on the index column to
1244
+ # verify data is sorted. This is expensive. If you are sure the
1245
+ # data within the by groups is sorted, you can set this to `false`.
1246
+ # Doing so incorrectly will lead to incorrect output
1247
+ #
1248
+ # @return [LazyFrame]
1249
+ #
1250
+ # @example
1251
+ # dates = [
1252
+ # "2020-01-01 13:45:48",
1253
+ # "2020-01-01 16:42:13",
1254
+ # "2020-01-01 16:45:09",
1255
+ # "2020-01-02 18:12:48",
1256
+ # "2020-01-03 19:45:32",
1257
+ # "2020-01-08 23:16:43"
1258
+ # ]
1259
+ # df = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
1260
+ # Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
1261
+ # )
1262
+ # df.rolling(index_column: "dt", period: "2d").agg(
1263
+ # [
1264
+ # Polars.sum("a").alias("sum_a"),
1265
+ # Polars.min("a").alias("min_a"),
1266
+ # Polars.max("a").alias("max_a")
1267
+ # ]
1268
+ # ).collect
1269
+ # # =>
1270
+ # # shape: (6, 4)
1271
+ # # ┌─────────────────────┬───────┬───────┬───────┐
1272
+ # # │ dt ┆ sum_a ┆ min_a ┆ max_a │
1273
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1274
+ # # │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │
1275
+ # # ╞═════════════════════╪═══════╪═══════╪═══════╡
1276
+ # # │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │
1277
+ # # │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │
1278
+ # # │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │
1279
+ # # │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │
1280
+ # # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
1281
+ # # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
1282
+ # # └─────────────────────┴───────┴───────┴───────┘
1283
+ def rolling(
1284
+ index_column:,
1285
+ period:,
1286
+ offset: nil,
1287
+ closed: "right",
1288
+ by: nil,
1289
+ check_sorted: true
1290
+ )
1291
+ index_column = Utils.parse_as_expression(index_column)
1292
+ if offset.nil?
1293
+ offset = "-#{period}"
1294
+ end
1295
+
1296
+ rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
1297
+ period = Utils._timedelta_to_pl_duration(period)
1298
+ offset = Utils._timedelta_to_pl_duration(offset)
1299
+
1300
+ lgb = _ldf.rolling(
1301
+ index_column, period, offset, closed, rbexprs_by, check_sorted
1302
+ )
1303
+ LazyGroupBy.new(lgb)
1304
+ end
1305
+ alias_method :group_by_rolling, :rolling
1306
+ alias_method :groupby_rolling, :rolling
1307
+
1308
+ # Group based on a time value (or index value of type `:i32`, `:i64`).
1309
+ #
1310
+ # Time windows are calculated and rows are assigned to windows. Unlike a
1311
+ # normal group by, a row can be a member of multiple groups. The time/index
1312
+ # window could be seen as a rolling window, with a window size determined by
1313
+ # dates/times/values instead of slots in the DataFrame.
1314
+ #
1315
+ # A window is defined by:
1316
+ #
1317
+ # - every: interval of the window
1318
+ # - period: length of the window
1319
+ # - offset: offset of the window
1320
+ #
1321
+ # The `every`, `period` and `offset` arguments are created with
1322
+ # the following string language:
1323
+ #
1324
+ # - 1ns (1 nanosecond)
1325
+ # - 1us (1 microsecond)
1326
+ # - 1ms (1 millisecond)
1327
+ # - 1s (1 second)
1328
+ # - 1m (1 minute)
1329
+ # - 1h (1 hour)
1330
+ # - 1d (1 day)
1331
+ # - 1w (1 week)
1332
+ # - 1mo (1 calendar month)
1333
+ # - 1y (1 calendar year)
1334
+ # - 1i (1 index count)
1335
+ #
1336
+ # Or combine them:
1337
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
1338
+ #
1339
+ # In case of a group_by_dynamic on an integer column, the windows are defined by:
1340
+ #
1341
+ # - "1i" # length 1
1342
+ # - "10i" # length 10
1343
+ #
1344
+ # @param index_column [Object]
1345
+ # Column used to group based on the time window.
1346
+ # Often of type Date/Datetime.
1347
+ # This column must be sorted in ascending order. If not the output will not
1348
+ # make sense.
1349
+ #
1350
+ # In case of a dynamic group by on indices, dtype needs to be one of
1351
+ # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
1352
+ # performance matters use an `:i64` column.
1353
+ # @param every [Object]
1354
+ # Interval of the window.
1355
+ # @param period [Object]
1356
+ # Length of the window. If nil, it is equal to `every`.
1357
+ # @param offset [Object]
1358
+ # Offset of the window. If nil and `period` is nil, it will be equal to negative
1359
+ # `every`.
1360
+ # @param truncate [Boolean]
1361
+ # Truncate the time value to the window lower bound.
1362
+ # @param include_boundaries [Boolean]
1363
+ # Add the lower and upper bound of the window to the "_lower_boundary" and
1364
+ # "_upper_boundary" columns. This will impact performance because it's harder to
1365
+ # parallelize.
1366
+ # @param closed ["right", "left", "both", "none"]
1367
+ # Define whether the temporal window interval is closed or not.
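+ # @param label [String]
+ # Define which label to use for the window ("left", "right", or "datapoint").
+ # @param start_by [String]
+ # The strategy to determine the start of the first window by ("window",
+ # "datapoint", or a day of the week).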
1368
+ # @param by [Object]
1369
+ # Also group by this column/these columns
1370
+ # @param check_sorted [Boolean]
1371
+ # When the `by` argument is given, Polars cannot check sortedness
1372
+ # by the metadata and has to do a full scan on the index column to
1373
+ # verify data is sorted. This is expensive. If you are sure the
1374
+ # data within the by groups is sorted, you can set this to `false`.
1375
+ # Doing so incorrectly will lead to incorrect output.
1376
+ #
1377
+ # @return [DataFrame]
1378
+ #
1379
+ # @example
1380
+ # df = Polars::DataFrame.new(
1381
+ # {
1382
+ # "time" => Polars.date_range(
1383
+ # DateTime.new(2021, 12, 16),
1384
+ # DateTime.new(2021, 12, 16, 3),
1385
+ # "30m"
1386
+ # ),
1387
+ # "n" => 0..6
1388
+ # }
1389
+ # )
1390
+ # # =>
1391
+ # # shape: (7, 2)
1392
+ # # ┌─────────────────────┬─────┐
1393
+ # # │ time ┆ n │
1394
+ # # │ --- ┆ --- │
1395
+ # # │ datetime[μs] ┆ i64 │
1396
+ # # ╞═════════════════════╪═════╡
1397
+ # # │ 2021-12-16 00:00:00 ┆ 0 │
1398
+ # # │ 2021-12-16 00:30:00 ┆ 1 │
1399
+ # # │ 2021-12-16 01:00:00 ┆ 2 │
1400
+ # # │ 2021-12-16 01:30:00 ┆ 3 │
1401
+ # # │ 2021-12-16 02:00:00 ┆ 4 │
1402
+ # # │ 2021-12-16 02:30:00 ┆ 5 │
1403
+ # # │ 2021-12-16 03:00:00 ┆ 6 │
1404
+ # # └─────────────────────┴─────┘
1405
+ #
1406
+ # @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
1407
+ # df.group_by_dynamic("time", every: "1h", closed: "right").agg(
1408
+ # [
1409
+ # Polars.col("time").min.alias("time_min"),
1410
+ # Polars.col("time").max.alias("time_max")
1411
+ # ]
1412
+ # )
1413
+ # # =>
1414
+ # # shape: (4, 3)
1415
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┐
1416
+ # # │ time ┆ time_min ┆ time_max │
1417
+ # # │ --- ┆ --- ┆ --- │
1418
+ # # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │
1419
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╡
1420
+ # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │
1421
+ # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │
1422
+ # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │
1423
+ # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │
1424
+ # # └─────────────────────┴─────────────────────┴─────────────────────┘
1425
+ #
1426
+ # @example The window boundaries can also be added to the aggregation result.
1427
+ # df.group_by_dynamic(
1428
+ # "time", every: "1h", include_boundaries: true, closed: "right"
1429
+ # ).agg([Polars.col("time").count.alias("time_count")])
1430
+ # # =>
1431
+ # # shape: (4, 4)
1432
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
1433
+ # # │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
1434
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1435
+ # # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
1436
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
1437
+ # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
1438
+ # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │
1439
+ # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
1440
+ # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
1441
+ # # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
1442
+ #
1443
+ # @example When closed="left", should not include right end of interval.
1444
+ # df.group_by_dynamic("time", every: "1h", closed: "left").agg(
1445
+ # [
1446
+ # Polars.col("time").count.alias("time_count"),
1447
+ # Polars.col("time").alias("time_agg_list")
1448
+ # ]
1449
+ # )
1450
+ # # =>
1451
+ # # shape: (4, 3)
1452
+ # # ┌─────────────────────┬────────────┬───────────────────────────────────┐
1453
+ # # │ time ┆ time_count ┆ time_agg_list │
1454
+ # # │ --- ┆ --- ┆ --- │
1455
+ # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
1456
+ # # ╞═════════════════════╪════════════╪═══════════════════════════════════╡
1457
+ # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │
1458
+ # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │
1459
+ # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │
1460
+ # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
1461
+ # # └─────────────────────┴────────────┴───────────────────────────────────┘
1462
+ #
1463
+ # @example When closed="both" the time values at the window boundaries belong to 2 groups.
1464
+ # df.group_by_dynamic("time", every: "1h", closed: "both").agg(
1465
+ # [Polars.col("time").count.alias("time_count")]
1466
+ # )
1467
+ # # =>
1468
+ # # shape: (5, 2)
1469
+ # # ┌─────────────────────┬────────────┐
1470
+ # # │ time ┆ time_count │
1471
+ # # │ --- ┆ --- │
1472
+ # # │ datetime[μs] ┆ u32 │
1473
+ # # ╞═════════════════════╪════════════╡
1474
+ # # │ 2021-12-15 23:00:00 ┆ 1 │
1475
+ # # │ 2021-12-16 00:00:00 ┆ 3 │
1476
+ # # │ 2021-12-16 01:00:00 ┆ 3 │
1477
+ # # │ 2021-12-16 02:00:00 ┆ 3 │
1478
+ # # │ 2021-12-16 03:00:00 ┆ 1 │
1479
+ # # └─────────────────────┴────────────┘
1480
+ #
1481
+ # @example Dynamic group bys can also be combined with grouping on normal keys.
1482
+ # df = Polars::DataFrame.new(
1483
+ # {
1484
+ # "time" => Polars.date_range(
1485
+ # DateTime.new(2021, 12, 16),
1486
+ # DateTime.new(2021, 12, 16, 3),
1487
+ # "30m"
1488
+ # ),
1489
+ # "groups" => ["a", "a", "a", "b", "b", "a", "a"]
1490
+ # }
1491
+ # )
1492
+ # df.group_by_dynamic(
1493
+ # "time",
1494
+ # every: "1h",
1495
+ # closed: "both",
1496
+ # by: "groups",
1497
+ # include_boundaries: true
1498
+ # ).agg([Polars.col("time").count.alias("time_count")])
1499
+ # # =>
1500
+ # # shape: (7, 5)
1501
+ # # ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
1502
+ # # │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
1503
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1504
+ # # │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
1505
+ # # ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
1506
+ # # │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
1507
+ # # │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │
1508
+ # # │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │
1509
+ # # │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
1510
+ # # │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │
1511
+ # # │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
1512
+ # # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
1513
+ # # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
1514
+ #
1515
+ # @example Dynamic group by on an index column.
1516
+ # df = Polars::DataFrame.new(
1517
+ # {
1518
+ # "idx" => Polars.arange(0, 6, eager: true),
1519
+ # "A" => ["A", "A", "B", "B", "B", "C"]
1520
+ # }
1521
+ # )
1522
+ # df.group_by_dynamic(
1523
+ # "idx",
1524
+ # every: "2i",
1525
+ # period: "3i",
1526
+ # include_boundaries: true,
1527
+ # closed: "right"
1528
+ # ).agg(Polars.col("A").alias("A_agg_list"))
1529
+ # # =>
1530
+ # # shape: (4, 4)
1531
+ # # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
1532
+ # # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
1533
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1534
+ # # │ i64 ┆ i64 ┆ i64 ┆ list[str] │
1535
+ # # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
1536
+ # # │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │
1537
+ # # │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
1538
+ # # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
1539
+ # # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
1540
+ # # └─────────────────┴─────────────────┴─────┴─────────────────┘
1541
+ def group_by_dynamic(
1542
+ index_column,
1543
+ every:,
1544
+ period: nil,
1545
+ offset: nil,
1546
+ truncate: nil,
1547
+ include_boundaries: false,
1548
+ closed: "left",
1549
+ label: "left",
1550
+ by: nil,
1551
+ start_by: "window",
1552
+ check_sorted: true
1553
+ )
1554
+ if !truncate.nil?
1555
+ label = truncate ? "left" : "datapoint"
1556
+ end
1557
+
1558
+ index_column = Utils.expr_to_lit_or_expr(index_column, str_to_lit: false)
1559
+ if offset.nil?
1560
+ offset = period.nil? ? "-#{every}" : "0ns"
1561
+ end
1562
+
1563
+ if period.nil?
1564
+ period = every
1565
+ end
1566
+
1567
+ period = Utils._timedelta_to_pl_duration(period)
1568
+ offset = Utils._timedelta_to_pl_duration(offset)
1569
+ every = Utils._timedelta_to_pl_duration(every)
1570
+
1571
+ rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
1572
+ lgb = _ldf.group_by_dynamic(
1573
+ index_column._rbexpr,
1574
+ every,
1575
+ period,
1576
+ offset,
1577
+ label,
1578
+ include_boundaries,
1579
+ closed,
1580
+ rbexprs_by,
1581
+ start_by,
1582
+ check_sorted
1583
+ )
1584
+ LazyGroupBy.new(lgb)
1585
+ end
1586
+ alias_method :groupby_dynamic, :group_by_dynamic
1587
+
1588
+ # Perform an asof join.
1589
+ #
1590
+ # This is similar to a left-join except that we match on nearest key rather than
1591
+ # equal keys.
1592
+ #
1593
+ # Both DataFrames must be sorted by the join_asof key.
1594
+ #
1595
+ # For each row in the left DataFrame:
1596
+ #
1597
+ # - A "backward" search selects the last row in the right DataFrame whose 'on' key is less than or equal to the left's key.
1598
+ # - A "forward" search selects the first row in the right DataFrame whose 'on' key is greater than or equal to the left's key.
1599
+ #
1600
+ # The default is "backward".
1601
+ #
1602
+ # @param other [LazyFrame]
1603
+ # Lazy DataFrame to join with.
1604
+ # @param left_on [String]
1605
+ # Join column of the left DataFrame.
1606
+ # @param right_on [String]
1607
+ # Join column of the right DataFrame.
1608
+ # @param on [String]
1609
+ # Join column of both DataFrames. If set, `left_on` and `right_on` should be
1610
+ # nil.
1611
+ # @param by [Object]
1612
+ # Join on these columns before doing asof join.
1613
+ # @param by_left [Object]
1614
+ # Join on these columns before doing asof join.
1615
+ # @param by_right [Object]
1616
+ # Join on these columns before doing asof join.
1617
+ # @param strategy ["backward", "forward"]
1618
+ # Join strategy.
1619
+ # @param suffix [String]
1620
+ # Suffix to append to columns with a duplicate name.
1621
+ # @param tolerance [Object]
1622
+ # Numeric tolerance. By setting this, the join will only be done if the near
1623
+ # keys are within this distance. If an asof join is done on columns of dtype
1624
+ # "Date", "Datetime", "Duration" or "Time", you may use the following string
1625
+ # language:
1626
+ #
1627
+ # - 1ns (1 nanosecond)
1628
+ # - 1us (1 microsecond)
1629
+ # - 1ms (1 millisecond)
1630
+ # - 1s (1 second)
1631
+ # - 1m (1 minute)
1632
+ # - 1h (1 hour)
1633
+ # - 1d (1 day)
1634
+ # - 1w (1 week)
1635
+ # - 1mo (1 calendar month)
1636
+ # - 1y (1 calendar year)
1637
+ # - 1i (1 index count)
1638
+ #
1639
+ # Or combine them:
1640
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
1641
+ #
1642
+ # @param allow_parallel [Boolean]
1643
+ # Allow the physical plan to optionally evaluate the computation of both
1644
+ # DataFrames up to the join in parallel.
1645
+ # @param force_parallel [Boolean]
1646
+ # Force the physical plan to evaluate the computation of both DataFrames up to
1647
+ # the join in parallel.
1648
+ #
1649
+ # @return [LazyFrame]
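+ #
+ # @example A minimal sketch (hypothetical data), matching on nearest integer keys:
+ #   left = Polars::LazyFrame.new({"a" => [1, 5, 10], "left_val" => ["a", "b", "c"]})
+ #   right = Polars::LazyFrame.new({"a" => [1, 2, 3, 6, 7], "right_val" => [1, 2, 3, 6, 7]})
+ #   # backward search matches 1 -> 1, 5 -> 3, 10 -> 7
+ #   left.join_asof(right, on: "a").collect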
1650
+ def join_asof(
1651
+ other,
1652
+ left_on: nil,
1653
+ right_on: nil,
1654
+ on: nil,
1655
+ by_left: nil,
1656
+ by_right: nil,
1657
+ by: nil,
1658
+ strategy: "backward",
1659
+ suffix: "_right",
1660
+ tolerance: nil,
1661
+ allow_parallel: true,
1662
+ force_parallel: false
1663
+ )
1664
+ if !other.is_a?(LazyFrame)
1665
+ raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
1666
+ end
1667
+
1668
+ if on.is_a?(::String)
1669
+ left_on = on
1670
+ right_on = on
1671
+ end
1672
+
1673
+ if left_on.nil? || right_on.nil?
1674
+ raise ArgumentError, "You should pass the column to join on as an argument."
1675
+ end
1676
+
1677
+ if by_left.is_a?(::String) || by_left.is_a?(Expr)
1678
+ by_left_ = [by_left]
1679
+ else
1680
+ by_left_ = by_left
1681
+ end
1682
+
1683
+ if by_right.is_a?(::String) || by_right.is_a?(Expr)
1684
+ by_right_ = [by_right]
1685
+ else
1686
+ by_right_ = by_right
1687
+ end
1688
+
1689
+ if by.is_a?(::String)
1690
+ by_left_ = [by]
1691
+ by_right_ = [by]
1692
+ elsif by.is_a?(::Array)
1693
+ by_left_ = by
1694
+ by_right_ = by
1695
+ end
1696
+
1697
+ tolerance_str = nil
1698
+ tolerance_num = nil
1699
+ if tolerance.is_a?(::String)
1700
+ tolerance_str = tolerance
1701
+ else
1702
+ tolerance_num = tolerance
1703
+ end
1704
+
1705
+ _from_rbldf(
1706
+ _ldf.join_asof(
1707
+ other._ldf,
1708
+ Polars.col(left_on)._rbexpr,
1709
+ Polars.col(right_on)._rbexpr,
1710
+ by_left_,
1711
+ by_right_,
1712
+ allow_parallel,
1713
+ force_parallel,
1714
+ suffix,
1715
+ strategy,
1716
+ tolerance_num,
1717
+ tolerance_str
1718
+ )
1719
+ )
1720
+ end
1721
+
1722
+ # Add a join operation to the Logical Plan.
1723
+ #
1724
+ # @param other [LazyFrame]
1725
+ # Lazy DataFrame to join with.
1726
+ # @param left_on [Object]
1727
+ # Join column of the left DataFrame.
1728
+ # @param right_on [Object]
1729
+ # Join column of the right DataFrame.
1730
+ # @param on [Object]
1731
+ # Join column of both DataFrames. If set, `left_on` and `right_on` should be
1732
+ # nil.
1733
+ # @param how ["inner", "left", "outer", "semi", "anti", "cross"]
1734
+ # Join strategy.
1735
+ # @param suffix [String]
1736
+ # Suffix to append to columns with a duplicate name.
1737
+ # @param join_nulls [Boolean]
1738
+ # Join on null values. By default null values will never produce matches.
1739
+ # @param allow_parallel [Boolean]
1740
+ # Allow the physical plan to optionally evaluate the computation of both
1741
+ # DataFrames up to the join in parallel.
1742
+ # @param force_parallel [Boolean]
1743
+ # Force the physical plan to evaluate the computation of both DataFrames up to
1744
+ # the join in parallel.
1745
+ #
1746
+ # @return [LazyFrame]
1747
+ #
1748
+ # @example
1749
+ # df = Polars::DataFrame.new(
1750
+ # {
1751
+ # "foo" => [1, 2, 3],
1752
+ # "bar" => [6.0, 7.0, 8.0],
1753
+ # "ham" => ["a", "b", "c"]
1754
+ # }
1755
+ # ).lazy
1756
+ # other_df = Polars::DataFrame.new(
1757
+ # {
1758
+ # "apple" => ["x", "y", "z"],
1759
+ # "ham" => ["a", "b", "d"]
1760
+ # }
1761
+ # ).lazy
1762
+ # df.join(other_df, on: "ham").collect
1763
+ # # =>
1764
+ # # shape: (2, 4)
1765
+ # # ┌─────┬─────┬─────┬───────┐
1766
+ # # │ foo ┆ bar ┆ ham ┆ apple │
1767
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1768
+ # # │ i64 ┆ f64 ┆ str ┆ str │
1769
+ # # ╞═════╪═════╪═════╪═══════╡
1770
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
1771
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
1772
+ # # └─────┴─────┴─────┴───────┘
1773
+ #
1774
+ # @example
1775
+ # df.join(other_df, on: "ham", how: "outer").collect
1776
+ # # =>
1777
+ # # shape: (4, 5)
1778
+ # # ┌──────┬──────┬──────┬───────┬───────────┐
1779
+ # # │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │
1780
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1781
+ # # │ i64 ┆ f64 ┆ str ┆ str ┆ str │
1782
+ # # ╞══════╪══════╪══════╪═══════╪═══════════╡
1783
+ # # │ 1 ┆ 6.0 ┆ a ┆ x ┆ a │
1784
+ # # │ 2 ┆ 7.0 ┆ b ┆ y ┆ b │
1785
+ # # │ null ┆ null ┆ null ┆ z ┆ d │
1786
+ # # │ 3 ┆ 8.0 ┆ c ┆ null ┆ null │
1787
+ # # └──────┴──────┴──────┴───────┴───────────┘
1788
+ #
1789
+ # @example
1790
+ # df.join(other_df, on: "ham", how: "left").collect
1791
+ # # =>
1792
+ # # shape: (3, 4)
1793
+ # # ┌─────┬─────┬─────┬───────┐
1794
+ # # │ foo ┆ bar ┆ ham ┆ apple │
1795
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1796
+ # # │ i64 ┆ f64 ┆ str ┆ str │
1797
+ # # ╞═════╪═════╪═════╪═══════╡
1798
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
1799
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
1800
+ # # │ 3 ┆ 8.0 ┆ c ┆ null │
1801
+ # # └─────┴─────┴─────┴───────┘
1802
+ #
1803
+ # @example
1804
+ # df.join(other_df, on: "ham", how: "semi").collect
1805
+ # # =>
1806
+ # # shape: (2, 3)
1807
+ # # ┌─────┬─────┬─────┐
1808
+ # # │ foo ┆ bar ┆ ham │
1809
+ # # │ --- ┆ --- ┆ --- │
1810
+ # # │ i64 ┆ f64 ┆ str │
1811
+ # # ╞═════╪═════╪═════╡
1812
+ # # │ 1 ┆ 6.0 ┆ a │
1813
+ # # │ 2 ┆ 7.0 ┆ b │
1814
+ # # └─────┴─────┴─────┘
1815
+ #
1816
+ # @example
1817
+ # df.join(other_df, on: "ham", how: "anti").collect
1818
+ # # =>
1819
+ # # shape: (1, 3)
1820
+ # # ┌─────┬─────┬─────┐
1821
+ # # │ foo ┆ bar ┆ ham │
1822
+ # # │ --- ┆ --- ┆ --- │
1823
+ # # │ i64 ┆ f64 ┆ str │
1824
+ # # ╞═════╪═════╪═════╡
1825
+ # # │ 3 ┆ 8.0 ┆ c │
1826
+ # # └─────┴─────┴─────┘
1827
+ def join(
1828
+ other,
1829
+ left_on: nil,
1830
+ right_on: nil,
1831
+ on: nil,
1832
+ how: "inner",
1833
+ suffix: "_right",
1834
+ join_nulls: false,
1835
+ allow_parallel: true,
1836
+ force_parallel: false
1837
+ )
1838
+ if !other.is_a?(LazyFrame)
1839
+ raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
1840
+ end
1841
+
1842
+ if how == "cross"
1843
+ return _from_rbldf(
1844
+ _ldf.join(
1845
+ other._ldf, [], [], allow_parallel, force_parallel, join_nulls, how, suffix
1846
+ )
1847
+ )
1848
+ end
1849
+
1850
+ if !on.nil?
1851
+ rbexprs = Utils.selection_to_rbexpr_list(on)
1852
+ rbexprs_left = rbexprs
1853
+ rbexprs_right = rbexprs
1854
+ elsif !left_on.nil? && !right_on.nil?
1855
+ rbexprs_left = Utils.selection_to_rbexpr_list(left_on)
1856
+ rbexprs_right = Utils.selection_to_rbexpr_list(right_on)
1857
+ else
1858
+ raise ArgumentError, "must specify `on` OR `left_on` and `right_on`"
1859
+ end
1860
+
1861
+ _from_rbldf(
1862
+ self._ldf.join(
1863
+ other._ldf,
1864
+ rbexprs_left,
1865
+ rbexprs_right,
1866
+ allow_parallel,
1867
+ force_parallel,
1868
+ join_nulls,
1869
+ how,
1870
+ suffix
1871
+ )
1872
+ )
1873
+ end
1874
+
1875
+ # Add or overwrite multiple columns in a DataFrame.
1876
+ #
1877
+ # @param exprs [Object]
1878
+ # List of Expressions that evaluate to columns.
1879
+ #
1880
+ # @return [LazyFrame]
1881
+ #
1882
+ # @example
1883
+ # ldf = Polars::DataFrame.new(
1884
+ # {
1885
+ # "a" => [1, 2, 3, 4],
1886
+ # "b" => [0.5, 4, 10, 13],
1887
+ # "c" => [true, true, false, true]
1888
+ # }
1889
+ # ).lazy
1890
+ # ldf.with_columns(
1891
+ # [
1892
+ # (Polars.col("a") ** 2).alias("a^2"),
1893
+ # (Polars.col("b") / 2).alias("b/2"),
1894
+ # (Polars.col("c").is_not).alias("not c")
1895
+ # ]
1896
+ # ).collect
1897
+ # # =>
1898
+ # # shape: (4, 6)
1899
+ # # ┌─────┬──────┬───────┬─────┬──────┬───────┐
1900
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
1901
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1902
+ # # │ i64 ┆ f64 ┆ bool ┆ i64 ┆ f64 ┆ bool │
1903
+ # # ╞═════╪══════╪═══════╪═════╪══════╪═══════╡
1904
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1 ┆ 0.25 ┆ false │
1905
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4 ┆ 2.0 ┆ false │
1906
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9 ┆ 5.0 ┆ true │
1907
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16 ┆ 6.5 ┆ false │
1908
+ # # └─────┴──────┴───────┴─────┴──────┴───────┘
1909
+ def with_columns(*exprs, **named_exprs)
1910
+ structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
1911
+ rbexprs = Utils.parse_as_list_of_expressions(*exprs, **named_exprs, __structify: structify)
1912
+
1913
+ _from_rbldf(_ldf.with_columns(rbexprs))
1914
+ end
1915
+
1916
+ # Add an external context to the computation graph.
1917
+ #
1918
+ # This allows expressions to also access columns from DataFrames
1919
+ # that are not part of this one.
1920
+ #
1921
+ # @param other [Object]
1922
+ # Lazy DataFrame to join with.
1923
+ #
1924
+ # @return [LazyFrame]
1925
+ #
1926
+ # @example
1927
+ # df_a = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => ["a", "c", nil]}).lazy
1928
+ # df_other = Polars::DataFrame.new({"c" => ["foo", "ham"]})
1929
+ # (
1930
+ # df_a.with_context(df_other.lazy).select(
1931
+ # [Polars.col("b") + Polars.col("c").first]
1932
+ # )
1933
+ # ).collect
1934
+ # # =>
1935
+ # # shape: (3, 1)
1936
+ # # ┌──────┐
1937
+ # # │ b │
1938
+ # # │ --- │
1939
+ # # │ str │
1940
+ # # ╞══════╡
1941
+ # # │ afoo │
1942
+ # # │ cfoo │
1943
+ # # │ null │
1944
+ # # └──────┘
1945
+ def with_context(other)
1946
+ if !other.is_a?(::Array)
1947
+ other = [other]
1948
+ end
1949
+
1950
+ _from_rbldf(_ldf.with_context(other.map(&:_ldf)))
1951
+ end
1952
+
1953
+ # Add or overwrite column in a DataFrame.
1954
+ #
1955
+ # @param column [Object]
1956
+ # Expression that evaluates to column or a Series to use.
1957
+ #
1958
+ # @return [LazyFrame]
1959
+ #
1960
+ # @example
1961
+ # df = Polars::DataFrame.new(
1962
+ # {
1963
+ # "a" => [1, 3, 5],
1964
+ # "b" => [2, 4, 6]
1965
+ # }
1966
+ # ).lazy
1967
+ # df.with_column((Polars.col("b") ** 2).alias("b_squared")).collect
1968
+ # # =>
1969
+ # # shape: (3, 3)
1970
+ # # ┌─────┬─────┬───────────┐
1971
+ # # │ a ┆ b ┆ b_squared │
1972
+ # # │ --- ┆ --- ┆ --- │
1973
+ # # │ i64 ┆ i64 ┆ i64 │
1974
+ # # ╞═════╪═════╪═══════════╡
1975
+ # # │ 1 ┆ 2 ┆ 4 │
1976
+ # # │ 3 ┆ 4 ┆ 16 │
1977
+ # # │ 5 ┆ 6 ┆ 36 │
1978
+ # # └─────┴─────┴───────────┘
1979
+ #
1980
+ # @example
1981
+ # df.with_column(Polars.col("a") ** 2).collect
1982
+ # # =>
1983
+ # # shape: (3, 2)
1984
+ # # ┌─────┬─────┐
1985
+ # # │ a ┆ b │
1986
+ # # │ --- ┆ --- │
1987
+ # # │ i64 ┆ i64 │
1988
+ # # ╞═════╪═════╡
1989
+ # # │ 1 ┆ 2 │
1990
+ # # │ 9 ┆ 4 │
1991
+ # # │ 25 ┆ 6 │
1992
+ # # └─────┴─────┘
1993
+ def with_column(column)
1994
+ with_columns([column])
1995
+ end
1996
+
1997
+ # Remove one or multiple columns from a DataFrame.
1998
+ #
1999
+ # @param columns [Object]
2000
+ # - Name of the column that should be removed.
2001
+ # - List of column names.
2002
+ #
2003
+ # @return [LazyFrame]
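+ #
+ # @example An illustrative sketch with hypothetical data:
+ # ldf = Polars::DataFrame.new({"a" => [1], "b" => [2], "c" => [3]}).lazy
+ # ldf.drop("a", "b").collect
+ # # the result keeps only column "c"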
2004
+ def drop(*columns)
2005
+ drop_cols = Utils._expand_selectors(self, *columns)
2006
+ _from_rbldf(_ldf.drop(drop_cols))
2007
+ end
2008
+
2009
+ # Rename column names.
2010
+ #
2011
+ # @param mapping [Hash]
2012
+ # Key value pairs that map from old name to new name.
2013
+ #
2014
+ # @return [LazyFrame]
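+ #
+ # @example An illustrative sketch with hypothetical data:
+ # ldf = Polars::DataFrame.new({"foo" => [1, 2], "bar" => [6, 7]}).lazy
+ # ldf.rename({"foo" => "apple"}).collect
+ # # the result has columns "apple" and "bar"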
2015
+ def rename(mapping)
2016
+ existing = mapping.keys
2017
+ _new = mapping.values
2018
+ _from_rbldf(_ldf.rename(existing, _new))
2019
+ end
2020
+
2021
+ # Reverse the DataFrame.
2022
+ #
2023
+ # @return [LazyFrame]
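+ #
+ # @example An illustrative sketch with hypothetical data:
+ # ldf = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy
+ # ldf.reverse.collect
+ # # rows come back in the order 3, 2, 1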
2024
+ def reverse
2025
+ _from_rbldf(_ldf.reverse)
2026
+ end
2027
+
2028
+ # Shift the values by a given period.
2029
+ #
2030
+ # @param n [Integer]
2031
+ # Number of places to shift (may be negative).
2032
+ # @param fill_value [Object]
2033
+ # Fill the resulting null values with this value.
2034
+ #
2035
+ # @return [LazyFrame]
2036
+ #
2037
+ # @example
2038
+ # df = Polars::DataFrame.new(
2039
+ # {
2040
+ # "a" => [1, 3, 5],
2041
+ # "b" => [2, 4, 6]
2042
+ # }
2043
+ # ).lazy
2044
+ # df.shift(1).collect
2045
+ # # =>
2046
+ # # shape: (3, 2)
2047
+ # # ┌──────┬──────┐
2048
+ # # │ a ┆ b │
2049
+ # # │ --- ┆ --- │
2050
+ # # │ i64 ┆ i64 │
2051
+ # # ╞══════╪══════╡
2052
+ # # │ null ┆ null │
2053
+ # # │ 1 ┆ 2 │
2054
+ # # │ 3 ┆ 4 │
2055
+ # # └──────┴──────┘
2056
+ #
2057
+ # @example
2058
+ # df.shift(-1).collect
2059
+ # # =>
2060
+ # # shape: (3, 2)
2061
+ # # ┌──────┬──────┐
2062
+ # # │ a ┆ b │
2063
+ # # │ --- ┆ --- │
2064
+ # # │ i64 ┆ i64 │
2065
+ # # ╞══════╪══════╡
2066
+ # # │ 3 ┆ 4 │
2067
+ # # │ 5 ┆ 6 │
2068
+ # # │ null ┆ null │
2069
+ # # └──────┴──────┘
2070
+ def shift(n, fill_value: nil)
2071
+ if !fill_value.nil?
2072
+ fill_value = Utils.parse_as_expression(fill_value, str_as_lit: true)
2073
+ end
2074
+ n = Utils.parse_as_expression(n)
2075
+ _from_rbldf(_ldf.shift(n, fill_value))
2076
+ end
2077
+
2078
+ # Shift the values by a given period and fill the resulting null values.
2079
+ #
2080
+ # @param periods [Integer]
2081
+ # Number of places to shift (may be negative).
2082
+ # @param fill_value [Object]
2083
+ # Fill `nil` values with the result of this expression.
2084
+ #
2085
+ # @return [LazyFrame]
2086
+ #
2087
+ # @example
2088
+ # df = Polars::DataFrame.new(
2089
+ # {
2090
+ # "a" => [1, 3, 5],
2091
+ # "b" => [2, 4, 6]
2092
+ # }
2093
+ # ).lazy
2094
+ # df.shift_and_fill(1, 0).collect
2095
+ # # =>
2096
+ # # shape: (3, 2)
2097
+ # # ┌─────┬─────┐
2098
+ # # │ a ┆ b │
2099
+ # # │ --- ┆ --- │
2100
+ # # │ i64 ┆ i64 │
2101
+ # # ╞═════╪═════╡
2102
+ # # │ 0 ┆ 0 │
2103
+ # # │ 1 ┆ 2 │
2104
+ # # │ 3 ┆ 4 │
2105
+ # # └─────┴─────┘
2106
+ #
2107
+ # @example
2108
+ # df.shift_and_fill(-1, 0).collect
2109
+ # # =>
2110
+ # # shape: (3, 2)
2111
+ # # ┌─────┬─────┐
2112
+ # # │ a ┆ b │
2113
+ # # │ --- ┆ --- │
2114
+ # # │ i64 ┆ i64 │
2115
+ # # ╞═════╪═════╡
2116
+ # # │ 3 ┆ 4 │
2117
+ # # │ 5 ┆ 6 │
2118
+ # # │ 0 ┆ 0 │
2119
+ # # └─────┴─────┘
2120
+ def shift_and_fill(periods, fill_value)
2121
+ shift(periods, fill_value: fill_value)
2122
+ end
2123
+
2124
+ # Get a slice of this DataFrame.
2125
+ #
2126
+ # @param offset [Integer]
2127
+ # Start index. Negative indexing is supported.
2128
+ # @param length [Integer]
2129
+ # Length of the slice. If set to `nil`, all rows starting at the offset
2130
+ # will be selected.
2131
+ #
2132
+ # @return [LazyFrame]
2133
+ #
2134
+ # @example
2135
+ # df = Polars::DataFrame.new(
2136
+ # {
2137
+ # "a" => ["x", "y", "z"],
2138
+ # "b" => [1, 3, 5],
2139
+ # "c" => [2, 4, 6]
2140
+ # }
2141
+ # ).lazy
2142
+ # df.slice(1, 2).collect
2143
+ # # =>
2144
+ # # shape: (2, 3)
2145
+ # # ┌─────┬─────┬─────┐
2146
+ # # │ a ┆ b ┆ c │
2147
+ # # │ --- ┆ --- ┆ --- │
2148
+ # # │ str ┆ i64 ┆ i64 │
2149
+ # # ╞═════╪═════╪═════╡
2150
+ # # │ y ┆ 3 ┆ 4 │
2151
+ # # │ z ┆ 5 ┆ 6 │
2152
+ # # └─────┴─────┴─────┘
2153
+ def slice(offset, length = nil)
2154
+ if length && length < 0
2155
+ raise ArgumentError, "Negative slice lengths (#{length}) are invalid for LazyFrame"
2156
+ end
2157
+ _from_rbldf(_ldf.slice(offset, length))
2158
+ end
2159
+
2160
+ # Get the first `n` rows.
2161
+ #
2162
+ # Alias for {#head}.
2163
+ #
2164
+ # @param n [Integer]
2165
+ # Number of rows to return.
2166
+ #
2167
+ # @return [LazyFrame]
2168
+ #
2169
+ # @note
2170
+ # Consider using the {#fetch} operation if you only want to test your
2171
+ # query. The {#fetch} operation will load the first `n` rows at the scan
2172
+ # level, whereas {#head} and {#limit} are applied at the end.
2173
+ def limit(n = 5)
2174
+ head(n)
2175
+ end
2176
+
2177
+ # Get the first `n` rows.
2178
+ #
2179
+ # @param n [Integer]
2180
+ # Number of rows to return.
2181
+ #
2182
+ # @return [LazyFrame]
2183
+ #
2184
+ # @note
2185
+ # Consider using the {#fetch} operation if you only want to test your
2186
+ # query. The {#fetch} operation will load the first `n` rows at the scan
2187
+ # level, whereas {#head} and {#limit} are applied at the end.
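+ #
+ # @example An illustrative sketch with hypothetical data:
+ # ldf = Polars::DataFrame.new({"a" => [1, 2, 3, 4]}).lazy
+ # ldf.head(2).collect
+ # # returns the first two rows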
2188
+ def head(n = 5)
2189
+ slice(0, n)
2190
+ end
2191
+
2192
+ # Get the last `n` rows.
2193
+ #
2194
+ # @param n [Integer]
2195
+ # Number of rows.
2196
+ #
2197
+ # @return [LazyFrame]
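+ #
+ # @example An illustrative sketch with hypothetical data:
+ # ldf = Polars::DataFrame.new({"a" => [1, 2, 3, 4]}).lazy
+ # ldf.tail(2).collect
+ # # returns the last two rows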
2198
+ def tail(n = 5)
2199
+ _from_rbldf(_ldf.tail(n))
2200
+ end
2201
+
2202
+ # Get the last row of the DataFrame.
2203
+ #
2204
+ # @return [LazyFrame]
2205
+ def last
2206
+ tail(1)
2207
+ end
2208
+
2209
+ # Get the first row of the DataFrame.
2210
+ #
2211
+ # @return [LazyFrame]
2212
+ def first
2213
+ slice(0, 1)
2214
+ end
2215
+
2216
+ # Add a column at index 0 that counts the rows.
2217
+ #
2218
+ # @param name [String]
2219
+ # Name of the column to add.
2220
+ # @param offset [Integer]
2221
+ # Start the row count at this offset.
2222
+ #
2223
+ # @return [LazyFrame]
2224
+ #
2225
+ # @note
2226
+ # This can have a negative effect on query performance.
2227
+ # This may, for instance, block predicate pushdown optimization.
2228
+ #
2229
+ # @example
2230
+ # df = Polars::DataFrame.new(
2231
+ # {
2232
+ # "a" => [1, 3, 5],
2233
+ # "b" => [2, 4, 6]
2234
+ # }
2235
+ # ).lazy
2236
+ # df.with_row_index.collect
2237
+ # # =>
2238
+ # # shape: (3, 3)
2239
+ # # ┌────────┬─────┬─────┐
2240
+ # # │ row_nr ┆ a ┆ b │
2241
+ # # │ --- ┆ --- ┆ --- │
2242
+ # # │ u32 ┆ i64 ┆ i64 │
2243
+ # # ╞════════╪═════╪═════╡
2244
+ # # │ 0 ┆ 1 ┆ 2 │
2245
+ # # │ 1 ┆ 3 ┆ 4 │
2246
+ # # │ 2 ┆ 5 ┆ 6 │
2247
+ # # └────────┴─────┴─────┘
2248
+ def with_row_index(name: "row_nr", offset: 0)
2249
+ _from_rbldf(_ldf.with_row_index(name, offset))
2250
+ end
2251
+ alias_method :with_row_count, :with_row_index
2252
+
2253
+ # Take every nth row in the LazyFrame and return as a new LazyFrame.
2254
+ #
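+ # @param n [Integer]
+ # Gather every n-th row, starting from the first row.
+ #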
2255
+ # @return [LazyFrame]
2256
+ #
2257
+ # @example
2258
+ # s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]}).lazy
2259
+ # s.take_every(2).collect
2260
+ # # =>
2261
+ # # shape: (2, 2)
2262
+ # # ┌─────┬─────┐
2263
+ # # │ a ┆ b │
2264
+ # # │ --- ┆ --- │
2265
+ # # │ i64 ┆ i64 │
2266
+ # # ╞═════╪═════╡
2267
+ # # │ 1 ┆ 5 │
2268
+ # # │ 3 ┆ 7 │
2269
+ # # └─────┴─────┘
2270
+ def take_every(n)
2271
+ select(Utils.col("*").take_every(n))
2272
+ end
2273
+
2274
+ # Fill null values using the specified value or strategy.
2275
+ #
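+ # @param value [Object]
+ # Value used to fill the null values with.
+ # @param strategy [nil, "forward", "backward", "min", "max", "mean", "zero", "one"]
+ # Strategy used to fill the null values.
+ # @param limit [Integer]
+ # Number of consecutive null values to fill when using the "forward" or
+ # "backward" strategy.
+ # @param matches_supertype [Boolean]
+ # Accepted for API compatibility; this argument is not currently used by this
+ # method.
+ #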
2276
+ # @return [LazyFrame]
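+ #
+ # @example An illustrative sketch with hypothetical data:
+ # ldf = Polars::DataFrame.new({"a" => [1, nil, 3]}).lazy
+ # ldf.fill_null(99).collect
+ # # the null is replaced by 99; a strategy can be used instead:
+ # ldf.fill_null(strategy: "forward").collect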
2277
+ def fill_null(value = nil, strategy: nil, limit: nil, matches_supertype: nil)
2278
+ select(Polars.all.fill_null(value, strategy: strategy, limit: limit))
2279
+ end
2280
+
2281
+ # Fill floating point NaN values.
2282
+ #
2283
+ # @param fill_value [Object]
2284
+ # Value to fill the NaN values with.
2285
+ #
2286
+ # @return [LazyFrame]
2287
+ #
2288
+ # @note
2289
+ # Note that floating point NaN (Not a Number) are not missing values!
2290
+ # To replace missing values, use `fill_null` instead.
2291
+ #
2292
+ # @example
2293
+ # df = Polars::DataFrame.new(
2294
+ # {
2295
+ # "a" => [1.5, 2, Float::NAN, 4],
2296
+ # "b" => [0.5, 4, Float::NAN, 13],
2297
+ # }
2298
+ # ).lazy
2299
+ # df.fill_nan(99).collect
2300
+ # # =>
2301
+ # # shape: (4, 2)
2302
+ # # ┌──────┬──────┐
2303
+ # # │ a ┆ b │
2304
+ # # │ --- ┆ --- │
2305
+ # # │ f64 ┆ f64 │
2306
+ # # ╞══════╪══════╡
2307
+ # # │ 1.5 ┆ 0.5 │
2308
+ # # │ 2.0 ┆ 4.0 │
2309
+ # # │ 99.0 ┆ 99.0 │
2310
+ # # │ 4.0 ┆ 13.0 │
2311
+ # # └──────┴──────┘
2312
+ def fill_nan(fill_value)
2313
+ if !fill_value.is_a?(Expr)
2314
+ fill_value = Utils.lit(fill_value)
2315
+ end
2316
+ _from_rbldf(_ldf.fill_nan(fill_value._rbexpr))
2317
+ end
2318
+
2319
+ # Aggregate the columns in the DataFrame to their standard deviation value.
2320
+ #
2321
+ # @return [LazyFrame]
2322
+ #
2323
+ # @example
2324
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2325
+ # df.std.collect
2326
+ # # =>
2327
+ # # shape: (1, 2)
2328
+ # # ┌──────────┬─────┐
2329
+ # # │ a ┆ b │
2330
+ # # │ --- ┆ --- │
2331
+ # # │ f64 ┆ f64 │
2332
+ # # ╞══════════╪═════╡
2333
+ # # │ 1.290994 ┆ 0.5 │
2334
+ # # └──────────┴─────┘
2335
+ #
2336
+ # @example
2337
+ # df.std(ddof: 0).collect
2338
+ # # =>
2339
+ # # shape: (1, 2)
2340
+ # # ┌──────────┬──────────┐
2341
+ # # │ a ┆ b │
2342
+ # # │ --- ┆ --- │
2343
+ # # │ f64 ┆ f64 │
2344
+ # # ╞══════════╪══════════╡
2345
+ # # │ 1.118034 ┆ 0.433013 │
2346
+ # # └──────────┴──────────┘
2347
+ def std(ddof: 1)
2348
+ _from_rbldf(_ldf.std(ddof))
2349
+ end
2350
+
2351
+ # Aggregate the columns in the DataFrame to their variance value.
2352
+ #
2353
+ # @return [LazyFrame]
2354
+ #
2355
+ # @example
2356
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2357
+ # df.var.collect
2358
+ # # =>
2359
+ # # shape: (1, 2)
2360
+ # # ┌──────────┬──────┐
2361
+ # # │ a ┆ b │
2362
+ # # │ --- ┆ --- │
2363
+ # # │ f64 ┆ f64 │
2364
+ # # ╞══════════╪══════╡
2365
+ # # │ 1.666667 ┆ 0.25 │
2366
+ # # └──────────┴──────┘
2367
+ #
2368
+ # @example
2369
+ # df.var(ddof: 0).collect
2370
+ # # =>
2371
+ # # shape: (1, 2)
2372
+ # # ┌──────┬────────┐
2373
+ # # │ a ┆ b │
2374
+ # # │ --- ┆ --- │
2375
+ # # │ f64 ┆ f64 │
2376
+ # # ╞══════╪════════╡
2377
+ # # │ 1.25 ┆ 0.1875 │
2378
+ # # └──────┴────────┘
2379
+ def var(ddof: 1)
2380
+ _from_rbldf(_ldf.var(ddof))
2381
+ end
2382
+
2383
+ # Aggregate the columns in the DataFrame to their maximum value.
2384
+ #
2385
+ # @return [LazyFrame]
2386
+ #
2387
+ # @example
2388
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2389
+ # df.max.collect
2390
+ # # =>
2391
+ # # shape: (1, 2)
2392
+ # # ┌─────┬─────┐
2393
+ # # │ a ┆ b │
2394
+ # # │ --- ┆ --- │
2395
+ # # │ i64 ┆ i64 │
2396
+ # # ╞═════╪═════╡
2397
+ # # │ 4 ┆ 2 │
2398
+ # # └─────┴─────┘
2399
+ def max
2400
+ _from_rbldf(_ldf.max)
2401
+ end
2402
+
2403
+ # Aggregate the columns in the DataFrame to their minimum value.
2404
+ #
2405
+ # @return [LazyFrame]
2406
+ #
2407
+ # @example
2408
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2409
+ # df.min.collect
2410
+ # # =>
2411
+ # # shape: (1, 2)
2412
+ # # ┌─────┬─────┐
2413
+ # # │ a ┆ b │
2414
+ # # │ --- ┆ --- │
2415
+ # # │ i64 ┆ i64 │
2416
+ # # ╞═════╪═════╡
2417
+ # # │ 1 ┆ 1 │
2418
+ # # └─────┴─────┘
2419
+ def min
2420
+ _from_rbldf(_ldf.min)
2421
+ end
2422
+
2423
+ # Aggregate the columns in the DataFrame to their sum value.
2424
+ #
2425
+ # @return [LazyFrame]
2426
+ #
2427
+ # @example
2428
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2429
+ # df.sum.collect
2430
+ # # =>
2431
+ # # shape: (1, 2)
2432
+ # # ┌─────┬─────┐
2433
+ # # │ a ┆ b │
2434
+ # # │ --- ┆ --- │
2435
+ # # │ i64 ┆ i64 │
2436
+ # # ╞═════╪═════╡
2437
+ # # │ 10 ┆ 5 │
2438
+ # # └─────┴─────┘
2439
+ def sum
2440
+ _from_rbldf(_ldf.sum)
2441
+ end
2442
+
2443
+ # Aggregate the columns in the DataFrame to their mean value.
2444
+ #
2445
+ # @return [LazyFrame]
2446
+ #
2447
+ # @example
2448
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2449
+ # df.mean.collect
2450
+ # # =>
2451
+ # # shape: (1, 2)
2452
+ # # ┌─────┬──────┐
2453
+ # # │ a ┆ b │
2454
+ # # │ --- ┆ --- │
2455
+ # # │ f64 ┆ f64 │
2456
+ # # ╞═════╪══════╡
2457
+ # # │ 2.5 ┆ 1.25 │
2458
+ # # └─────┴──────┘
2459
+ def mean
2460
+ _from_rbldf(_ldf.mean)
2461
+ end
2462
+
2463
+ # Aggregate the columns in the DataFrame to their median value.
2464
+ #
2465
+ # @return [LazyFrame]
2466
+ #
2467
+ # @example
2468
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2469
+ # df.median.collect
2470
+ # # =>
2471
+ # # shape: (1, 2)
2472
+ # # ┌─────┬─────┐
2473
+ # # │ a ┆ b │
2474
+ # # │ --- ┆ --- │
2475
+ # # │ f64 ┆ f64 │
2476
+ # # ╞═════╪═════╡
2477
+ # # │ 2.5 ┆ 1.0 │
2478
+ # # └─────┴─────┘
2479
+ def median
2480
+ _from_rbldf(_ldf.median)
2481
+ end
2482
+
2483
+ # Aggregate the columns in the DataFrame to their quantile value.
2484
+ #
2485
+ # @param quantile [Float]
2486
+ # Quantile between 0.0 and 1.0.
2487
+ # @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
2488
+ # Interpolation method.
2489
+ #
2490
+ # @return [LazyFrame]
2491
+ #
2492
+ # @example
2493
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2494
+ # df.quantile(0.7).collect
2495
+ # # =>
2496
+ # # shape: (1, 2)
2497
+ # # ┌─────┬─────┐
2498
+ # # │ a ┆ b │
2499
+ # # │ --- ┆ --- │
2500
+ # # │ f64 ┆ f64 │
2501
+ # # ╞═════╪═════╡
2502
+ # # │ 3.0 ┆ 1.0 │
2503
+ # # └─────┴─────┘
2504
+ def quantile(quantile, interpolation: "nearest")
2505
+ quantile = Utils.expr_to_lit_or_expr(quantile, str_to_lit: false)
2506
+ _from_rbldf(_ldf.quantile(quantile._rbexpr, interpolation))
2507
+ end
2508
+
2509
+ # Explode lists to long format.
2510
+ #
2511
+ # @return [LazyFrame]
2512
+ #
2513
+ # @example
2514
+ # df = Polars::DataFrame.new(
2515
+ # {
2516
+ # "letters" => ["a", "a", "b", "c"],
2517
+ # "numbers" => [[1], [2, 3], [4, 5], [6, 7, 8]],
2518
+ # }
2519
+ # ).lazy
2520
+ # df.explode("numbers").collect
2521
+ # # =>
2522
+ # # shape: (8, 2)
2523
+ # # ┌─────────┬─────────┐
2524
+ # # │ letters ┆ numbers │
2525
+ # # │ --- ┆ --- │
2526
+ # # │ str ┆ i64 │
2527
+ # # ╞═════════╪═════════╡
2528
+ # # │ a ┆ 1 │
2529
+ # # │ a ┆ 2 │
2530
+ # # │ a ┆ 3 │
2531
+ # # │ b ┆ 4 │
2532
+ # # │ b ┆ 5 │
2533
+ # # │ c ┆ 6 │
2534
+ # # │ c ┆ 7 │
2535
+ # # │ c ┆ 8 │
2536
+ # # └─────────┴─────────┘
2537
+ def explode(columns)
2538
+ columns = Utils.selection_to_rbexpr_list(columns)
2539
+ _from_rbldf(_ldf.explode(columns))
2540
+ end
2541
+
2542
+ # Drop duplicate rows from this DataFrame.
2543
+ #
2544
+ # Note that this fails if there is a column of type `List` in the DataFrame or
2545
+ # subset.
2546
+ #
2547
+ # @param maintain_order [Boolean]
2548
+ # Keep the same order as the original DataFrame. This requires more work to
2549
+ # compute.
2550
+ # @param subset [Object]
2551
+ # Subset to use to compare rows.
2552
+ # @param keep ["first", "last"]
2553
+ # Which of the duplicate rows to keep.
2554
+ #
2555
+ # @return [LazyFrame]
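+ #
+ # @example An illustrative sketch with hypothetical data:
+ # ldf = Polars::DataFrame.new({"a" => [1, 1, 2], "b" => ["x", "x", "y"]}).lazy
+ # ldf.unique.collect
+ # # the duplicate row (1, "x") appears only once in the result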
2556
+ def unique(maintain_order: true, subset: nil, keep: "first")
2557
+ if !subset.nil? && !subset.is_a?(::Array)
2558
+ subset = [subset]
2559
+ end
2560
+ _from_rbldf(_ldf.unique(maintain_order, subset, keep))
2561
+ end
2562
+
2563
+ # Drop rows with null values from this LazyFrame.
2564
+ #
2565
+ # @param subset [Object]
2566
+ # Subset of column(s) on which `drop_nulls` will be applied.
2567
+ #
2568
+ # @return [LazyFrame]
2569
+ #
2570
+ # @example
2571
+ # df = Polars::DataFrame.new(
2572
+ # {
2573
+ # "foo" => [1, 2, 3],
2574
+ # "bar" => [6, nil, 8],
2575
+ # "ham" => ["a", "b", "c"]
2576
+ # }
2577
+ # )
2578
+ # df.lazy.drop_nulls.collect
2579
+ # # =>
2580
+ # # shape: (2, 3)
2581
+ # # ┌─────┬─────┬─────┐
2582
+ # # │ foo ┆ bar ┆ ham │
2583
+ # # │ --- ┆ --- ┆ --- │
2584
+ # # │ i64 ┆ i64 ┆ str │
2585
+ # # ╞═════╪═════╪═════╡
2586
+ # # │ 1 ┆ 6 ┆ a │
2587
+ # # │ 3 ┆ 8 ┆ c │
2588
+ # # └─────┴─────┴─────┘
2589
+ def drop_nulls(subset: nil)
2590
+ if !subset.nil? && !subset.is_a?(::Array)
2591
+ subset = [subset]
2592
+ end
2593
+ _from_rbldf(_ldf.drop_nulls(subset))
2594
+ end
2595
+
2596
+ # Unpivot a DataFrame from wide to long format.
2597
+ #
2598
+ # Optionally leaves identifiers set.
2599
+ #
2600
+ # This function is useful to massage a DataFrame into a format where one or more
2601
+ # columns are identifier variables (id_vars), while all other columns, considered
2602
+ # measured variables (value_vars), are "unpivoted" to the row axis, leaving just
2603
+ # two non-identifier columns, 'variable' and 'value'.
2604
+ #
2605
+ # @param id_vars [Object]
2606
+ # Columns to use as identifier variables.
2607
+ # @param value_vars [Object]
2608
+ # Columns to use as value variables.
2609
+ # If `value_vars` is empty, all columns that are not in `id_vars` will be used.
2610
+ # @param variable_name [String]
2611
+ # Name to give to the `value` column. Defaults to "variable"
2612
+ # @param value_name [String]
2613
+ # Name to give to the `value` column. Defaults to "value"
2614
+ # @param streamable [Boolean]
2615
+ # Allow this node to run in the streaming engine.
2616
+ # If this runs in streaming, the output of the melt operation
2617
+ # will not have a stable ordering.
2618
+ #
2619
+ # @return [LazyFrame]
2620
+ #
2621
+ # @example
2622
+ # df = Polars::DataFrame.new(
2623
+ # {
2624
+ # "a" => ["x", "y", "z"],
2625
+ # "b" => [1, 3, 5],
2626
+ # "c" => [2, 4, 6]
2627
+ # }
2628
+ # ).lazy
2629
+ # df.melt(id_vars: "a", value_vars: ["b", "c"]).collect
2630
+ # # =>
2631
+ # # shape: (6, 3)
2632
+ # # ┌─────┬──────────┬───────┐
2633
+ # # │ a ┆ variable ┆ value │
2634
+ # # │ --- ┆ --- ┆ --- │
2635
+ # # │ str ┆ str ┆ i64 │
2636
+ # # ╞═════╪══════════╪═══════╡
2637
+ # # │ x ┆ b ┆ 1 │
2638
+ # # │ y ┆ b ┆ 3 │
2639
+ # # │ z ┆ b ┆ 5 │
2640
+ # # │ x ┆ c ┆ 2 │
2641
+ # # │ y ┆ c ┆ 4 │
2642
+ # # │ z ┆ c ┆ 6 │
2643
+ # # └─────┴──────────┴───────┘
2644
+ def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil, streamable: true)
2645
+ if value_vars.is_a?(::String)
2646
+ value_vars = [value_vars]
2647
+ end
2648
+ if id_vars.is_a?(::String)
2649
+ id_vars = [id_vars]
2650
+ end
2651
+ if value_vars.nil?
2652
+ value_vars = []
2653
+ end
2654
+ if id_vars.nil?
2655
+ id_vars = []
2656
+ end
2657
+ _from_rbldf(
2658
+ _ldf.melt(id_vars, value_vars, value_name, variable_name, streamable)
2659
+ )
2660
+ end
2661
+
2662
+ # def map
2663
+ # end
2664
+
2665
+ # Interpolate intermediate values. The interpolation method is linear.
2666
+ #
2667
+ # @return [LazyFrame]
2668
+ #
2669
+ # @example
2670
+ # df = Polars::DataFrame.new(
2671
+ # {
2672
+ # "foo" => [1, nil, 9, 10],
2673
+ # "bar" => [6, 7, 9, nil],
2674
+ # "baz" => [1, nil, nil, 9]
2675
+ # }
2676
+ # ).lazy
2677
+ # df.interpolate.collect
2678
+ # # =>
2679
+ # # shape: (4, 3)
2680
+ # # ┌──────┬──────┬──────────┐
2681
+ # # │ foo ┆ bar ┆ baz │
2682
+ # # │ --- ┆ --- ┆ --- │
2683
+ # # │ f64 ┆ f64 ┆ f64 │
2684
+ # # ╞══════╪══════╪══════════╡
2685
+ # # │ 1.0 ┆ 6.0 ┆ 1.0 │
2686
+ # # │ 5.0 ┆ 7.0 ┆ 3.666667 │
2687
+ # # │ 9.0 ┆ 9.0 ┆ 6.333333 │
2688
+ # # │ 10.0 ┆ null ┆ 9.0 │
2689
+ # # └──────┴──────┴──────────┘
2690
+ def interpolate
2691
+ select(Utils.col("*").interpolate)
2692
+ end
2693
+
2694
+ # Decompose a struct into its fields.
2695
+ #
2696
+ # The fields will be inserted into the `DataFrame` at the location of the
2697
+ # `struct` type.
2698
+ #
2699
+ # @param names [Object]
2700
+ # Names of the struct columns that will be decomposed into their fields.
2701
+ #
2702
+ # @return [LazyFrame]
2703
+ #
2704
+ # @example
2705
+ # df = (
2706
+ # Polars::DataFrame.new(
2707
+ # {
2708
+ # "before" => ["foo", "bar"],
2709
+ # "t_a" => [1, 2],
2710
+ # "t_b" => ["a", "b"],
2711
+ # "t_c" => [true, nil],
2712
+ # "t_d" => [[1, 2], [3]],
2713
+ # "after" => ["baz", "womp"]
2714
+ # }
2715
+ # )
2716
+ # .lazy
2717
+ # .select(
2718
+ # ["before", Polars.struct(Polars.col("^t_.$")).alias("t_struct"), "after"]
2719
+ # )
2720
+ # )
2721
+ # df.fetch
2722
+ # # =>
2723
+ # # shape: (2, 3)
2724
+ # # ┌────────┬─────────────────────┬───────┐
2725
+ # # │ before ┆ t_struct ┆ after │
2726
+ # # │ --- ┆ --- ┆ --- │
2727
+ # # │ str ┆ struct[4] ┆ str │
2728
+ # # ╞════════╪═════════════════════╪═══════╡
2729
+ # # │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │
2730
+ # # │ bar ┆ {2,"b",null,[3]} ┆ womp │
2731
+ # # └────────┴─────────────────────┴───────┘
2732
+ #
2733
+ # @example
2734
+ # df.unnest("t_struct").fetch
2735
+ # # =>
2736
+ # # shape: (2, 6)
2737
+ # # ┌────────┬─────┬─────┬──────┬───────────┬───────┐
2738
+ # # │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │
2739
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
2740
+ # # │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │
2741
+ # # ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
2742
+ # # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │
2743
+ # # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
2744
+ # # └────────┴─────┴─────┴──────┴───────────┴───────┘
2745
+ def unnest(names)
2746
+ if names.is_a?(::String)
2747
+ names = [names]
2748
+ end
2749
+ _from_rbldf(_ldf.unnest(names))
2750
+ end
2751
+
2752
+ # Take two sorted DataFrames and merge them by the sorted key.
2753
+ #
2754
+ # The output of this operation will also be sorted.
2755
+ # It is the caller's responsibility to ensure that the frames are sorted
2756
+ # by that key; otherwise the output will not make sense.
2757
+ #
2758
+ # The schemas of both LazyFrames must be equal.
2759
+ #
2760
+ # @param other [DataFrame]
2761
+ # Other DataFrame that must be merged
2762
+ # @param key [String]
2763
+ # Key that is sorted.
2764
+ #
2765
+ # @return [LazyFrame]
2766
+ #
2767
+ # @example
2768
+ # df0 = Polars::LazyFrame.new(
2769
+ # {"name" => ["steve", "elise", "bob"], "age" => [42, 44, 18]}
2770
+ # ).sort("age")
2771
+ # df1 = Polars::LazyFrame.new(
2772
+ # {"name" => ["anna", "megan", "steve", "thomas"], "age" => [21, 33, 42, 20]}
2773
+ # ).sort("age")
2774
+ # df0.merge_sorted(df1, "age").collect
2775
+ # # =>
2776
+ # # shape: (7, 2)
2777
+ # # ┌────────┬─────┐
2778
+ # # │ name ┆ age │
2779
+ # # │ --- ┆ --- │
2780
+ # # │ str ┆ i64 │
2781
+ # # ╞════════╪═════╡
2782
+ # # │ bob ┆ 18 │
2783
+ # # │ thomas ┆ 20 │
2784
+ # # │ anna ┆ 21 │
2785
+ # # │ megan ┆ 33 │
2786
+ # # │ steve ┆ 42 │
2787
+ # # │ steve ┆ 42 │
2788
+ # # │ elise ┆ 44 │
2789
+ # # └────────┴─────┘
2790
+ def merge_sorted(other, key)
2791
+ _from_rbldf(_ldf.merge_sorted(other._ldf, key))
2792
+ end
2793
+
2794
+ # Indicate that one or multiple columns are sorted.
2795
+ #
2796
+ # @param column [Object]
2797
+ # Column(s) that are sorted.
2798
+ # @param more_columns [Object]
2799
+ # Additional columns that are sorted, specified as positional arguments.
2800
+ # @param descending [Boolean]
2801
+ # Whether the columns are sorted in descending order.
2802
+ #
2803
+ # @return [LazyFrame]
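+ #
+ # @example An illustrative sketch with hypothetical data:
+ # ldf = Polars::DataFrame.new({"time" => [1, 2, 3]}).lazy
+ # ldf.set_sorted("time")
+ # # the flag is a promise to the query optimizer and is not validated;
+ # # marking an unsorted column as sorted can produce incorrect results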
2804
+ def set_sorted(
2805
+ column,
2806
+ *more_columns,
2807
+ descending: false
2808
+ )
2809
+ columns = Utils.selection_to_rbexpr_list(column)
2810
+ if more_columns.any?
2811
+ columns.concat(Utils.selection_to_rbexpr_list(more_columns))
2812
+ end
2813
+ with_columns(
2814
+ columns.map { |e| Utils.wrap_expr(e).set_sorted(descending: descending) }
2815
+ )
2816
+ end
2817
+
2818
+ # TODO
2819
+ # def update
2820
+ # end
2821
+
2822
+ private
2823
+
2824
+ def initialize_copy(other)
2825
+ super
2826
+ self._ldf = _ldf._clone
2827
+ end
2828
+
2829
+ def _from_rbldf(rb_ldf)
2830
+ self.class._from_rbldf(rb_ldf)
2831
+ end
2832
+ end
2833
+ end