polars-df 0.10.0-x86_64-linux-musl

Files changed (67)
  1. checksums.yaml +7 -0
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +175 -0
  4. data/Cargo.lock +2536 -0
  5. data/Cargo.toml +6 -0
  6. data/LICENSE-THIRD-PARTY.txt +38726 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +437 -0
  9. data/lib/polars/3.1/polars.so +0 -0
  10. data/lib/polars/3.2/polars.so +0 -0
  11. data/lib/polars/3.3/polars.so +0 -0
  12. data/lib/polars/array_expr.rb +537 -0
  13. data/lib/polars/array_name_space.rb +423 -0
  14. data/lib/polars/batched_csv_reader.rb +98 -0
  15. data/lib/polars/binary_expr.rb +77 -0
  16. data/lib/polars/binary_name_space.rb +66 -0
  17. data/lib/polars/cat_expr.rb +72 -0
  18. data/lib/polars/cat_name_space.rb +125 -0
  19. data/lib/polars/config.rb +530 -0
  20. data/lib/polars/convert.rb +93 -0
  21. data/lib/polars/data_frame.rb +5418 -0
  22. data/lib/polars/data_types.rb +466 -0
  23. data/lib/polars/date_time_expr.rb +1444 -0
  24. data/lib/polars/date_time_name_space.rb +1484 -0
  25. data/lib/polars/dynamic_group_by.rb +52 -0
  26. data/lib/polars/exceptions.rb +31 -0
  27. data/lib/polars/expr.rb +6105 -0
  28. data/lib/polars/expr_dispatch.rb +22 -0
  29. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  30. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  31. data/lib/polars/functions/as_datatype.rb +248 -0
  32. data/lib/polars/functions/col.rb +47 -0
  33. data/lib/polars/functions/eager.rb +182 -0
  34. data/lib/polars/functions/lazy.rb +1280 -0
  35. data/lib/polars/functions/len.rb +49 -0
  36. data/lib/polars/functions/lit.rb +35 -0
  37. data/lib/polars/functions/random.rb +16 -0
  38. data/lib/polars/functions/range/date_range.rb +103 -0
  39. data/lib/polars/functions/range/int_range.rb +51 -0
  40. data/lib/polars/functions/repeat.rb +144 -0
  41. data/lib/polars/functions/whenthen.rb +96 -0
  42. data/lib/polars/functions.rb +57 -0
  43. data/lib/polars/group_by.rb +548 -0
  44. data/lib/polars/io.rb +890 -0
  45. data/lib/polars/lazy_frame.rb +2833 -0
  46. data/lib/polars/lazy_group_by.rb +84 -0
  47. data/lib/polars/list_expr.rb +791 -0
  48. data/lib/polars/list_name_space.rb +445 -0
  49. data/lib/polars/meta_expr.rb +222 -0
  50. data/lib/polars/name_expr.rb +198 -0
  51. data/lib/polars/plot.rb +109 -0
  52. data/lib/polars/rolling_group_by.rb +37 -0
  53. data/lib/polars/series.rb +4527 -0
  54. data/lib/polars/slice.rb +104 -0
  55. data/lib/polars/sql_context.rb +194 -0
  56. data/lib/polars/string_cache.rb +75 -0
  57. data/lib/polars/string_expr.rb +1519 -0
  58. data/lib/polars/string_name_space.rb +810 -0
  59. data/lib/polars/struct_expr.rb +98 -0
  60. data/lib/polars/struct_name_space.rb +96 -0
  61. data/lib/polars/testing.rb +507 -0
  62. data/lib/polars/utils.rb +422 -0
  63. data/lib/polars/version.rb +4 -0
  64. data/lib/polars/whenthen.rb +83 -0
  65. data/lib/polars-df.rb +1 -0
  66. data/lib/polars.rb +72 -0
  67. metadata +125 -0
data/lib/polars/lazy_frame.rb
@@ -0,0 +1,2833 @@
+ module Polars
+   # Representation of a Lazy computation graph/query against a DataFrame.
+   class LazyFrame
+     # @private
+     attr_accessor :_ldf
+
+     # Create a new LazyFrame.
+     def initialize(data = nil, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
+       self._ldf = (
+         DataFrame.new(
+           data,
+           schema: schema,
+           schema_overrides: schema_overrides,
+           orient: orient,
+           infer_schema_length: infer_schema_length,
+           nan_to_null: nan_to_null
+         )
+         .lazy
+         ._ldf
+       )
+     end
+
+     # @private
+     def self._from_rbldf(rb_ldf)
+       ldf = LazyFrame.allocate
+       ldf._ldf = rb_ldf
+       ldf
+     end
+
+     # @private
+     def self._scan_csv(
+       file,
+       has_header: true,
+       sep: ",",
+       comment_char: nil,
+       quote_char: '"',
+       skip_rows: 0,
+       dtypes: nil,
+       null_values: nil,
+       ignore_errors: false,
+       cache: true,
+       with_column_names: nil,
+       infer_schema_length: 100,
+       n_rows: nil,
+       encoding: "utf8",
+       low_memory: false,
+       rechunk: true,
+       skip_rows_after_header: 0,
+       row_count_name: nil,
+       row_count_offset: 0,
+       parse_dates: false,
+       eol_char: "\n",
+       truncate_ragged_lines: true
+     )
+       dtype_list = nil
+       if !dtypes.nil?
+         dtype_list = []
+         dtypes.each do |k, v|
+           dtype_list << [k, Utils.rb_type_to_dtype(v)]
+         end
+       end
+       processed_null_values = Utils._process_null_values(null_values)
+
+       _from_rbldf(
+         RbLazyFrame.new_from_csv(
+           file,
+           sep,
+           has_header,
+           ignore_errors,
+           skip_rows,
+           n_rows,
+           cache,
+           dtype_list,
+           low_memory,
+           comment_char,
+           quote_char,
+           processed_null_values,
+           infer_schema_length,
+           with_column_names,
+           rechunk,
+           skip_rows_after_header,
+           encoding,
+           Utils._prepare_row_count_args(row_count_name, row_count_offset),
+           parse_dates,
+           eol_char,
+           truncate_ragged_lines
+         )
+       )
+     end
+
+     # @private
+     def self._scan_parquet(
+       file,
+       n_rows: nil,
+       cache: true,
+       parallel: "auto",
+       rechunk: true,
+       row_count_name: nil,
+       row_count_offset: 0,
+       storage_options: nil,
+       low_memory: false,
+       use_statistics: true,
+       hive_partitioning: true
+     )
+       _from_rbldf(
+         RbLazyFrame.new_from_parquet(
+           file,
+           [],
+           n_rows,
+           cache,
+           parallel,
+           rechunk,
+           Utils._prepare_row_count_args(row_count_name, row_count_offset),
+           low_memory,
+           use_statistics,
+           hive_partitioning,
+           nil
+         )
+       )
+     end
+
+     # @private
+     def self._scan_ipc(
+       file,
+       n_rows: nil,
+       cache: true,
+       rechunk: true,
+       row_count_name: nil,
+       row_count_offset: 0,
+       storage_options: nil,
+       memory_map: true
+     )
+       if Utils.pathlike?(file)
+         file = Utils.normalise_filepath(file)
+       end
+
+       _from_rbldf(
+         RbLazyFrame.new_from_ipc(
+           file,
+           n_rows,
+           cache,
+           rechunk,
+           Utils._prepare_row_count_args(row_count_name, row_count_offset),
+           memory_map
+         )
+       )
+     end
+
+     # @private
+     def self._scan_ndjson(
+       file,
+       infer_schema_length: nil,
+       batch_size: nil,
+       n_rows: nil,
+       low_memory: false,
+       rechunk: true,
+       row_count_name: nil,
+       row_count_offset: 0
+     )
+       _from_rbldf(
+         RbLazyFrame.new_from_ndjson(
+           file,
+           infer_schema_length,
+           batch_size,
+           n_rows,
+           low_memory,
+           rechunk,
+           Utils._prepare_row_count_args(row_count_name, row_count_offset)
+         )
+       )
+     end
+
+     # def self.from_json
+     # end
+
+     # Read a logical plan from a JSON file to construct a LazyFrame.
+     #
+     # @param file [String]
+     #   Path to a file or a file-like object.
+     #
+     # @return [LazyFrame]
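+     #
+     # @example A round-trip sketch ("plan.json" is an illustrative path; the
+     #   JSON layout is an internal format and may change between versions):
+     #   Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy.write_json("plan.json")
+     #   Polars::LazyFrame.read_json("plan.json").collect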
+     def self.read_json(file)
+       if Utils.pathlike?(file)
+         file = Utils.normalise_filepath(file)
+       end
+
+       Utils.wrap_ldf(RbLazyFrame.read_json(file))
+     end
+
+     # Get the column names.
+     #
+     # @return [Array]
+     #
+     # @example
+     #   df = (
+     #     Polars::DataFrame.new(
+     #       {
+     #         "foo" => [1, 2, 3],
+     #         "bar" => [6, 7, 8],
+     #         "ham" => ["a", "b", "c"]
+     #       }
+     #     )
+     #     .lazy
+     #     .select(["foo", "bar"])
+     #   )
+     #   df.columns
+     #   # => ["foo", "bar"]
+     def columns
+       _ldf.columns
+     end
+
+     # Get dtypes of columns in LazyFrame.
+     #
+     # @return [Array]
+     #
+     # @example
+     #   lf = Polars::DataFrame.new(
+     #     {
+     #       "foo" => [1, 2, 3],
+     #       "bar" => [6.0, 7.0, 8.0],
+     #       "ham" => ["a", "b", "c"]
+     #     }
+     #   ).lazy
+     #   lf.dtypes
+     #   # => [Polars::Int64, Polars::Float64, Polars::String]
+     def dtypes
+       _ldf.dtypes
+     end
+
+     # Get the schema.
+     #
+     # @return [Hash]
+     #
+     # @example
+     #   lf = Polars::DataFrame.new(
+     #     {
+     #       "foo" => [1, 2, 3],
+     #       "bar" => [6.0, 7.0, 8.0],
+     #       "ham" => ["a", "b", "c"]
+     #     }
+     #   ).lazy
+     #   lf.schema
+     #   # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String}
+     def schema
+       _ldf.schema
+     end
+
+     # Get the width of the LazyFrame.
+     #
+     # @return [Integer]
+     #
+     # @example
+     #   lf = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]}).lazy
+     #   lf.width
+     #   # => 2
+     def width
+       _ldf.width
+     end
+
+     # Check if the LazyFrame includes the given column name.
+     #
+     # @return [Boolean]
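+     #
+     # @example A small membership-check sketch:
+     #   lf = Polars::DataFrame.new({"foo" => [1, 2, 3]}).lazy
+     #   lf.include?("foo")
+     #   # => true
+     #   lf.include?("bar")
+     #   # => false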
+     def include?(key)
+       columns.include?(key)
+     end
+
+     # clone handled by initialize_copy
+
+     # def [](item)
+     # end
+
+     # Returns a string representing the LazyFrame.
+     #
+     # @return [String]
+     def to_s
+       <<~EOS
+         naive plan: (run LazyFrame#describe_optimized_plan to see the optimized plan)
+
+         #{describe_plan}
+       EOS
+     end
+
+     # Write the logical plan of this LazyFrame to a file or string in JSON format.
+     #
+     # @param file [String]
+     #   File path to which the result should be written.
+     #
+     # @return [nil]
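+     #
+     # @example A minimal sketch ("plan.json" is an illustrative path); the plan
+     #   can be restored later with +read_json+:
+     #   lf = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy.select("a")
+     #   lf.write_json("plan.json")
+     #   # => nil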
+     def write_json(file)
+       if Utils.pathlike?(file)
+         file = Utils.normalise_filepath(file)
+       end
+       _ldf.write_json(file)
+       nil
+     end
+
+     # Offers a structured way to apply a sequence of user-defined functions (UDFs).
+     #
+     # @param func [Object]
+     #   Callable; will receive the frame as the first parameter,
+     #   followed by any given args/kwargs.
+     # @param args [Object]
+     #   Arguments to pass to the UDF.
+     # @param kwargs [Object]
+     #   Keyword arguments to pass to the UDF.
+     #
+     # @return [LazyFrame]
+     #
+     # @example
+     #   cast_str_to_int = lambda do |data, col_name:|
+     #     data.with_column(Polars.col(col_name).cast(:i64))
+     #   end
+     #
+     #   df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]}).lazy
+     #   df.pipe(cast_str_to_int, col_name: "b").collect
+     #   # =>
+     #   # shape: (4, 2)
+     #   # ┌─────┬─────┐
+     #   # │ a   ┆ b   │
+     #   # │ --- ┆ --- │
+     #   # │ i64 ┆ i64 │
+     #   # ╞═════╪═════╡
+     #   # │ 1   ┆ 10  │
+     #   # │ 2   ┆ 20  │
+     #   # │ 3   ┆ 30  │
+     #   # │ 4   ┆ 40  │
+     #   # └─────┴─────┘
+     def pipe(func, *args, **kwargs, &block)
+       func.call(self, *args, **kwargs, &block)
+     end
+
+     # Create a string representation of the unoptimized query plan.
+     #
+     # @return [String]
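+     #
+     # @example A sketch; the exact plan text depends on the Polars version:
+     #   lf = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy.filter(Polars.col("a") > 1)
+     #   puts lf.describe_plan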
+     def describe_plan
+       _ldf.describe_plan
+     end
+
+     # Create a string representation of the optimized query plan.
+     #
+     # @return [String]
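+     #
+     # @example Compare with {#describe_plan} to see what the optimizer changed
+     #   (output text is version-dependent):
+     #   lf = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy.select("a").filter(Polars.col("a") > 1)
+     #   puts lf.describe_optimized_plan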
+     def describe_optimized_plan(
+       type_coercion: true,
+       predicate_pushdown: true,
+       projection_pushdown: true,
+       simplify_expression: true,
+       slice_pushdown: true,
+       common_subplan_elimination: true,
+       comm_subexpr_elim: true,
+       allow_streaming: false
+     )
+       ldf = _ldf.optimization_toggle(
+         type_coercion,
+         predicate_pushdown,
+         projection_pushdown,
+         simplify_expression,
+         slice_pushdown,
+         common_subplan_elimination,
+         comm_subexpr_elim,
+         allow_streaming,
+         false
+       )
+
+       ldf.describe_optimized_plan
+     end
+
+     # def show_graph
+     # end
+
+     # Sort the LazyFrame.
+     #
+     # Sorting can be done by:
+     #
+     # - A single column name
+     # - An expression
+     # - Multiple expressions
+     #
+     # @param by [Object]
+     #   Column (expressions) to sort by.
+     # @param reverse [Boolean]
+     #   Sort in descending order.
+     # @param nulls_last [Boolean]
+     #   Place null values last. Can only be used if sorted by a single column.
+     #
+     # @return [LazyFrame]
+     #
+     # @example
+     #   df = Polars::DataFrame.new(
+     #     {
+     #       "foo" => [1, 2, 3],
+     #       "bar" => [6.0, 7.0, 8.0],
+     #       "ham" => ["a", "b", "c"]
+     #     }
+     #   ).lazy
+     #   df.sort("foo", reverse: true).collect
+     #   # =>
+     #   # shape: (3, 3)
+     #   # ┌─────┬─────┬─────┐
+     #   # │ foo ┆ bar ┆ ham │
+     #   # │ --- ┆ --- ┆ --- │
+     #   # │ i64 ┆ f64 ┆ str │
+     #   # ╞═════╪═════╪═════╡
+     #   # │ 3   ┆ 8.0 ┆ c   │
+     #   # │ 2   ┆ 7.0 ┆ b   │
+     #   # │ 1   ┆ 6.0 ┆ a   │
+     #   # └─────┴─────┴─────┘
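+     #
+     # @example Sorting by multiple columns or expressions; `reverse` may then be
+     #   an array of the same length (a sketch):
+     #   df.sort([Polars.col("foo"), Polars.col("ham")], reverse: [true, false]).collect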
+     def sort(by, reverse: false, nulls_last: false, maintain_order: false, multithreaded: true)
+       if by.is_a?(::String)
+         return _from_rbldf(_ldf.sort(by, reverse, nulls_last, maintain_order, multithreaded))
+       end
+       if Utils.bool?(reverse)
+         reverse = [reverse]
+       end
+
+       by = Utils.selection_to_rbexpr_list(by)
+       _from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last, maintain_order, multithreaded))
+     end
+
+     # def profile
+     # end
+
+     # Collect into a DataFrame.
+     #
+     # Note: use {#fetch} if you want to run your query on the first `n` rows
+     # only. This can be a huge time saver in debugging queries.
+     #
+     # @param type_coercion [Boolean]
+     #   Do type coercion optimization.
+     # @param predicate_pushdown [Boolean]
+     #   Do predicate pushdown optimization.
+     # @param projection_pushdown [Boolean]
+     #   Do projection pushdown optimization.
+     # @param simplify_expression [Boolean]
+     #   Run simplify expressions optimization.
+     # @param string_cache [Boolean]
+     #   This argument is deprecated. Please set the string cache globally.
+     #   The argument will be ignored.
+     # @param no_optimization [Boolean]
+     #   Turn off (certain) optimizations.
+     # @param slice_pushdown [Boolean]
+     #   Slice pushdown optimization.
+     # @param common_subplan_elimination [Boolean]
+     #   Will try to cache branching subplans that occur on self-joins or unions.
+     # @param allow_streaming [Boolean]
+     #   Run parts of the query in a streaming fashion (this is in an alpha state).
+     #
+     # @return [DataFrame]
+     #
+     # @example
+     #   df = Polars::DataFrame.new(
+     #     {
+     #       "a" => ["a", "b", "a", "b", "b", "c"],
+     #       "b" => [1, 2, 3, 4, 5, 6],
+     #       "c" => [6, 5, 4, 3, 2, 1]
+     #     }
+     #   ).lazy
+     #   df.group_by("a", maintain_order: true).agg(Polars.all.sum).collect
+     #   # =>
+     #   # shape: (3, 3)
+     #   # ┌─────┬─────┬─────┐
+     #   # │ a   ┆ b   ┆ c   │
+     #   # │ --- ┆ --- ┆ --- │
+     #   # │ str ┆ i64 ┆ i64 │
+     #   # ╞═════╪═════╪═════╡
+     #   # │ a   ┆ 4   ┆ 10  │
+     #   # │ b   ┆ 11  ┆ 10  │
+     #   # │ c   ┆ 6   ┆ 1   │
+     #   # └─────┴─────┴─────┘
+     def collect(
+       type_coercion: true,
+       predicate_pushdown: true,
+       projection_pushdown: true,
+       simplify_expression: true,
+       string_cache: false,
+       no_optimization: false,
+       slice_pushdown: true,
+       common_subplan_elimination: true,
+       comm_subexpr_elim: true,
+       allow_streaming: false,
+       _eager: false
+     )
+       if no_optimization
+         predicate_pushdown = false
+         projection_pushdown = false
+         slice_pushdown = false
+         common_subplan_elimination = false
+         comm_subexpr_elim = false
+       end
+
+       if allow_streaming
+         common_subplan_elimination = false
+       end
+
+       ldf = _ldf.optimization_toggle(
+         type_coercion,
+         predicate_pushdown,
+         projection_pushdown,
+         simplify_expression,
+         slice_pushdown,
+         common_subplan_elimination,
+         comm_subexpr_elim,
+         allow_streaming,
+         _eager
+       )
+       Utils.wrap_df(ldf.collect)
+     end
+
+     # Evaluate the query in streaming mode and write to a Parquet file at the provided path.
+     #
+     # This allows streaming results that are larger than RAM to be written to disk.
+     #
+     # @param path [String]
+     #   File path to which the file should be written.
+     # @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
+     #   Choose "zstd" for good compression performance.
+     #   Choose "lz4" for fast compression/decompression.
+     #   Choose "snappy" for more backwards compatibility guarantees
+     #   when you deal with older parquet readers.
+     # @param compression_level [Integer]
+     #   The level of compression to use. Higher compression means smaller files on
+     #   disk.
+     #
+     #   - "gzip" : min-level: 0, max-level: 10.
+     #   - "brotli" : min-level: 0, max-level: 11.
+     #   - "zstd" : min-level: 1, max-level: 22.
+     # @param statistics [Boolean]
+     #   Write statistics to the parquet headers. This requires extra compute.
+     # @param row_group_size [Integer]
+     #   Size of the row groups in number of rows.
+     #   If `nil` (default), the chunks of the `DataFrame` are
+     #   used. Writing in smaller chunks may reduce memory pressure and improve
+     #   writing speeds.
+     # @param data_pagesize_limit [Integer]
+     #   Size limit of individual data pages.
+     #   If not set, defaults to 1024 * 1024 bytes.
+     # @param maintain_order [Boolean]
+     #   Maintain the order in which data is processed.
+     #   Setting this to `false` will be slightly faster.
+     # @param type_coercion [Boolean]
+     #   Do type coercion optimization.
+     # @param predicate_pushdown [Boolean]
+     #   Do predicate pushdown optimization.
+     # @param projection_pushdown [Boolean]
+     #   Do projection pushdown optimization.
+     # @param simplify_expression [Boolean]
+     #   Run simplify expressions optimization.
+     # @param no_optimization [Boolean]
+     #   Turn off (certain) optimizations.
+     # @param slice_pushdown [Boolean]
+     #   Slice pushdown optimization.
+     #
+     # @return [DataFrame]
+     #
+     # @example
+     #   lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+     #   lf.sink_parquet("out.parquet")
+     def sink_parquet(
+       path,
+       compression: "zstd",
+       compression_level: nil,
+       statistics: false,
+       row_group_size: nil,
+       data_pagesize_limit: nil,
+       maintain_order: true,
+       type_coercion: true,
+       predicate_pushdown: true,
+       projection_pushdown: true,
+       simplify_expression: true,
+       no_optimization: false,
+       slice_pushdown: true
+     )
+       lf = _set_sink_optimizations(
+         type_coercion: type_coercion,
+         predicate_pushdown: predicate_pushdown,
+         projection_pushdown: projection_pushdown,
+         simplify_expression: simplify_expression,
+         slice_pushdown: slice_pushdown,
+         no_optimization: no_optimization
+       )
+
+       lf.sink_parquet(
+         path,
+         compression,
+         compression_level,
+         statistics,
+         row_group_size,
+         data_pagesize_limit,
+         maintain_order
+       )
+     end
+
+     # Evaluate the query in streaming mode and write to an IPC file.
+     #
+     # This allows streaming results that are larger than RAM to be written to disk.
+     #
+     # @param path [String]
+     #   File path to which the file should be written.
+     # @param compression ["lz4", "zstd"]
+     #   Choose "zstd" for good compression performance.
+     #   Choose "lz4" for fast compression/decompression.
+     # @param maintain_order [Boolean]
+     #   Maintain the order in which data is processed.
+     #   Setting this to `false` will be slightly faster.
+     # @param type_coercion [Boolean]
+     #   Do type coercion optimization.
+     # @param predicate_pushdown [Boolean]
+     #   Do predicate pushdown optimization.
+     # @param projection_pushdown [Boolean]
+     #   Do projection pushdown optimization.
+     # @param simplify_expression [Boolean]
+     #   Run simplify expressions optimization.
+     # @param slice_pushdown [Boolean]
+     #   Slice pushdown optimization.
+     # @param no_optimization [Boolean]
+     #   Turn off (certain) optimizations.
+     #
+     # @return [DataFrame]
+     #
+     # @example
+     #   lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+     #   lf.sink_ipc("out.arrow")
+     def sink_ipc(
+       path,
+       compression: "zstd",
+       maintain_order: true,
+       type_coercion: true,
+       predicate_pushdown: true,
+       projection_pushdown: true,
+       simplify_expression: true,
+       slice_pushdown: true,
+       no_optimization: false
+     )
+       lf = _set_sink_optimizations(
+         type_coercion: type_coercion,
+         predicate_pushdown: predicate_pushdown,
+         projection_pushdown: projection_pushdown,
+         simplify_expression: simplify_expression,
+         slice_pushdown: slice_pushdown,
+         no_optimization: no_optimization
+       )
+
+       lf.sink_ipc(
+         path,
+         compression,
+         maintain_order
+       )
+     end
+
+     # Evaluate the query in streaming mode and write to a CSV file.
+     #
+     # This allows streaming results that are larger than RAM to be written to disk.
+     #
+     # @param path [String]
+     #   File path to which the file should be written.
+     # @param include_bom [Boolean]
+     #   Whether to include UTF-8 BOM in the CSV output.
+     # @param include_header [Boolean]
+     #   Whether to include header in the CSV output.
+     # @param separator [String]
+     #   Separate CSV fields with this symbol.
+     # @param line_terminator [String]
+     #   String used to end each row.
+     # @param quote_char [String]
+     #   Byte to use as quoting character.
+     # @param batch_size [Integer]
+     #   Number of rows that will be processed per thread.
+     # @param datetime_format [String]
+     #   A format string, with the specifiers defined by the chrono Rust crate
+     #   (https://docs.rs/chrono/latest/chrono/format/strftime/index.html).
+     #   If no format specified, the default fractional-second
+     #   precision is inferred from the maximum timeunit found in the frame's
+     #   Datetime cols (if any).
+     # @param date_format [String]
+     #   A format string, with the specifiers defined by the chrono Rust crate
+     #   (https://docs.rs/chrono/latest/chrono/format/strftime/index.html).
+     # @param time_format [String]
+     #   A format string, with the specifiers defined by the chrono Rust crate
+     #   (https://docs.rs/chrono/latest/chrono/format/strftime/index.html).
+     # @param float_precision [Integer]
+     #   Number of decimal places to write, applied to both `Float32` and
+     #   `Float64` datatypes.
+     # @param null_value [String]
+     #   A string representing null values (defaulting to the empty string).
+     # @param quote_style ["necessary", "always", "non_numeric", "never"]
+     #   Determines the quoting strategy used.
+     #
+     #   - necessary (default): This puts quotes around fields only when necessary.
+     #     They are necessary when fields contain a quote,
+     #     delimiter or record terminator.
+     #     Quotes are also necessary when writing an empty record
+     #     (which is indistinguishable from a record with one empty field).
+     #     This is the default.
+     #   - always: This puts quotes around every field. Always.
+     #   - never: This never puts quotes around fields, even if that results in
+     #     invalid CSV data (e.g. by not quoting strings containing the
+     #     separator).
+     #   - non_numeric: This puts quotes around all fields that are non-numeric.
+     #     Namely, when writing a field that does not parse as a valid float
+     #     or integer, then quotes will be used even if they aren't strictly
+     #     necessary.
+     # @param maintain_order [Boolean]
+     #   Maintain the order in which data is processed.
+     #   Setting this to `false` will be slightly faster.
+     # @param type_coercion [Boolean]
+     #   Do type coercion optimization.
+     # @param predicate_pushdown [Boolean]
+     #   Do predicate pushdown optimization.
+     # @param projection_pushdown [Boolean]
+     #   Do projection pushdown optimization.
+     # @param simplify_expression [Boolean]
+     #   Run simplify expressions optimization.
+     # @param slice_pushdown [Boolean]
+     #   Slice pushdown optimization.
+     # @param no_optimization [Boolean]
+     #   Turn off (certain) optimizations.
+     #
+     # @return [DataFrame]
+     #
+     # @example
+     #   lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+     #   lf.sink_csv("out.csv")
+     def sink_csv(
+       path,
+       include_bom: false,
+       include_header: true,
+       separator: ",",
+       line_terminator: "\n",
+       quote_char: '"',
+       batch_size: 1024,
+       datetime_format: nil,
+       date_format: nil,
+       time_format: nil,
+       float_precision: nil,
+       null_value: nil,
+       quote_style: nil,
+       maintain_order: true,
+       type_coercion: true,
+       predicate_pushdown: true,
+       projection_pushdown: true,
+       simplify_expression: true,
+       slice_pushdown: true,
+       no_optimization: false
+     )
+       Utils._check_arg_is_1byte("separator", separator, false)
+       Utils._check_arg_is_1byte("quote_char", quote_char, false)
+
+       lf = _set_sink_optimizations(
+         type_coercion: type_coercion,
+         predicate_pushdown: predicate_pushdown,
+         projection_pushdown: projection_pushdown,
+         simplify_expression: simplify_expression,
+         slice_pushdown: slice_pushdown,
+         no_optimization: no_optimization
+       )
+
+       lf.sink_csv(
+         path,
+         include_bom,
+         include_header,
+         separator.ord,
+         line_terminator,
+         quote_char.ord,
+         batch_size,
+         datetime_format,
+         date_format,
+         time_format,
+         float_precision,
+         null_value,
+         quote_style,
+         maintain_order
+       )
+     end
+
+     # Evaluate the query in streaming mode and write to an NDJSON file.
+     #
+     # This allows streaming results that are larger than RAM to be written to disk.
+     #
+     # @param path [String]
+     #   File path to which the file should be written.
+     # @param maintain_order [Boolean]
+     #   Maintain the order in which data is processed.
+     #   Setting this to `false` will be slightly faster.
+     # @param type_coercion [Boolean]
+     #   Do type coercion optimization.
+     # @param predicate_pushdown [Boolean]
+     #   Do predicate pushdown optimization.
+     # @param projection_pushdown [Boolean]
+     #   Do projection pushdown optimization.
+     # @param simplify_expression [Boolean]
+     #   Run simplify expressions optimization.
+     # @param slice_pushdown [Boolean]
+     #   Slice pushdown optimization.
+     # @param no_optimization [Boolean]
+     #   Turn off (certain) optimizations.
+     #
+     # @return [DataFrame]
+     #
+     # @example
+     #   lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+     #   lf.sink_ndjson("out.ndjson")
+     def sink_ndjson(
+       path,
+       maintain_order: true,
+       type_coercion: true,
+       predicate_pushdown: true,
+       projection_pushdown: true,
+       simplify_expression: true,
+       slice_pushdown: true,
+       no_optimization: false
+     )
+       lf = _set_sink_optimizations(
+         type_coercion: type_coercion,
+         predicate_pushdown: predicate_pushdown,
+         projection_pushdown: projection_pushdown,
+         simplify_expression: simplify_expression,
+         slice_pushdown: slice_pushdown,
+         no_optimization: no_optimization
+       )
+
+       lf.sink_json(path, maintain_order)
+     end
+
+     # @private
+     def _set_sink_optimizations(
+       type_coercion: true,
+       predicate_pushdown: true,
+       projection_pushdown: true,
+       simplify_expression: true,
+       slice_pushdown: true,
+       no_optimization: false
+     )
+       if no_optimization
+         predicate_pushdown = false
+         projection_pushdown = false
+         slice_pushdown = false
+       end
+
+       _ldf.optimization_toggle(
+         type_coercion,
+         predicate_pushdown,
+         projection_pushdown,
+         simplify_expression,
+         slice_pushdown,
+         false,
+         false,
+         true,
+         false
+       )
+     end
+
+     # Collect a small number of rows for debugging purposes.
+     #
+     # Fetch is like a {#collect} operation, but it overwrites the number of rows
+     # read by every scan operation. This is a utility that helps debug a query on a
+     # smaller number of rows.
+     #
+     # Note that the fetch does not guarantee the final number of rows in the
+     # DataFrame. Filter, join operations and a lower number of rows available in the
+     # scanned file influence the final number of rows.
+     #
+     # @param n_rows [Integer]
+     #   Collect n_rows from the data sources.
+     # @param type_coercion [Boolean]
+     #   Run type coercion optimization.
+     # @param predicate_pushdown [Boolean]
+     #   Run predicate pushdown optimization.
+     # @param projection_pushdown [Boolean]
+     #   Run projection pushdown optimization.
+     # @param simplify_expression [Boolean]
+     #   Run simplify expressions optimization.
+     # @param string_cache [Boolean]
+     #   This argument is deprecated. Please set the string cache globally.
+     #   The argument will be ignored.
+     # @param no_optimization [Boolean]
+     #   Turn off optimizations.
+     # @param slice_pushdown [Boolean]
+     #   Slice pushdown optimization.
+     # @param common_subplan_elimination [Boolean]
+     #   Will try to cache branching subplans that occur on self-joins or unions.
+     # @param allow_streaming [Boolean]
+     #   Run parts of the query in a streaming fashion (this is in an alpha state).
+     #
+     # @return [DataFrame]
+     #
+     # @example
+     #   df = Polars::DataFrame.new(
+     #     {
+     #       "a" => ["a", "b", "a", "b", "b", "c"],
+     #       "b" => [1, 2, 3, 4, 5, 6],
+     #       "c" => [6, 5, 4, 3, 2, 1]
+     #     }
+     #   ).lazy
+     #   df.group_by("a", maintain_order: true).agg(Polars.all.sum).fetch(2)
+     #   # =>
+     #   # shape: (2, 3)
+     #   # ┌─────┬─────┬─────┐
+     #   # │ a   ┆ b   ┆ c   │
+     #   # │ --- ┆ --- ┆ --- │
+     #   # │ str ┆ i64 ┆ i64 │
+     #   # ╞═════╪═════╪═════╡
+     #   # │ a   ┆ 1   ┆ 6   │
+     #   # │ b   ┆ 2   ┆ 5   │
+     #   # └─────┴─────┴─────┘
+     def fetch(
+       n_rows = 500,
+       type_coercion: true,
+       predicate_pushdown: true,
+       projection_pushdown: true,
+       simplify_expression: true,
+       string_cache: false,
+       no_optimization: false,
+       slice_pushdown: true,
+       common_subplan_elimination: true,
+       comm_subexpr_elim: true,
+       allow_streaming: false
+     )
+       if no_optimization
+         predicate_pushdown = false
+         projection_pushdown = false
+         slice_pushdown = false
+         common_subplan_elimination = false
+       end
+
+       ldf = _ldf.optimization_toggle(
+         type_coercion,
+         predicate_pushdown,
+         projection_pushdown,
+         simplify_expression,
+         slice_pushdown,
+         common_subplan_elimination,
+         comm_subexpr_elim,
+         allow_streaming,
+         false
+       )
+       Utils.wrap_df(ldf.fetch(n_rows))
+     end
+
+     # Return lazy representation, i.e. itself.
+     #
+     # Useful for writing code that expects either a `DataFrame` or
+     # `LazyFrame`.
+     #
+     # @return [LazyFrame]
+     #
+     # @example
+     #   df = Polars::DataFrame.new(
+     #     {
+     #       "a" => [nil, 2, 3, 4],
+     #       "b" => [0.5, nil, 2.5, 13],
+     #       "c" => [true, true, false, nil]
+     #     }
+     #   )
+     #   df.lazy
+     def lazy
+       self
+     end
+
+     # Cache the result once the execution of the physical plan hits this node.
+     #
+     # @return [LazyFrame]
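+     #
+     # @example A sketch: within a single query, the cached subplan feeding both
+     #   sides of the concat is computed once.
+     #   lf = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy.cache
+     #   Polars.concat([lf, lf]).collect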
+     def cache
+       _from_rbldf(_ldf.cache)
+     end
+
+     # TODO
+     # def cast
+     # end
+
+     # Create an empty copy of the current LazyFrame.
+     #
+     # The copy has an identical schema but no data.
+     #
+     # @return [LazyFrame]
+     #
+     # @example
+     #   lf = Polars::LazyFrame.new(
+     #     {
+     #       "a" => [nil, 2, 3, 4],
+     #       "b" => [0.5, nil, 2.5, 13],
+     #       "c" => [true, true, false, nil]
+     #     }
+     #   ).lazy
+     #   lf.clear.fetch
+     #   # =>
+     #   # shape: (0, 3)
+     #   # ┌─────┬─────┬──────┐
+     #   # │ a   ┆ b   ┆ c    │
+     #   # │ --- ┆ --- ┆ ---  │
+     #   # │ i64 ┆ f64 ┆ bool │
+     #   # ╞═════╪═════╪══════╡
+     #   # └─────┴─────┴──────┘
+     #
+     # @example
+     #   lf.clear(2).fetch
+     #   # =>
+     #   # shape: (2, 3)
+     #   # ┌──────┬──────┬──────┐
+     #   # │ a    ┆ b    ┆ c    │
+     #   # │ ---  ┆ ---  ┆ ---  │
+     #   # │ i64  ┆ f64  ┆ bool │
+     #   # ╞══════╪══════╪══════╡
+     #   # │ null ┆ null ┆ null │
+     #   # │ null ┆ null ┆ null │
+     #   # └──────┴──────┴──────┘
+     def clear(n = 0)
+       DataFrame.new(columns: schema).clear(n).lazy
+     end
+     alias_method :cleared, :clear
+
+     # Filter the rows in the DataFrame based on a predicate expression.
+     #
+     # @param predicate [Object]
+     #   Expression that evaluates to a boolean Series.
+     #
+     # @return [LazyFrame]
+     #
+     # @example Filter on one condition:
+     #   lf = Polars::DataFrame.new(
+     #     {
+     #       "foo" => [1, 2, 3],
+     #       "bar" => [6, 7, 8],
+     #       "ham" => ["a", "b", "c"]
+     #     }
+     #   ).lazy
+     #   lf.filter(Polars.col("foo") < 3).collect
+     #   # =>
+     #   # shape: (2, 3)
+     #   # ┌─────┬─────┬─────┐
+     #   # │ foo ┆ bar ┆ ham │
+     #   # │ --- ┆ --- ┆ --- │
+     #   # │ i64 ┆ i64 ┆ str │
+     #   # ╞═════╪═════╪═════╡
+     #   # │ 1   ┆ 6   ┆ a   │
+     #   # │ 2   ┆ 7   ┆ b   │
+     #   # └─────┴─────┴─────┘
+     #
+     # @example Filter on multiple conditions:
+     #   lf.filter((Polars.col("foo") < 3) & (Polars.col("ham") == "a")).collect
+     #   # =>
+     #   # shape: (1, 3)
+     #   # ┌─────┬─────┬─────┐
+     #   # │ foo ┆ bar ┆ ham │
+     #   # │ --- ┆ --- ┆ --- │
+     #   # │ i64 ┆ i64 ┆ str │
+     #   # ╞═════╪═════╪═════╡
+     #   # │ 1   ┆ 6   ┆ a   │
+     #   # └─────┴─────┴─────┘
+     def filter(predicate)
+       _from_rbldf(
+         _ldf.filter(
+           Utils.expr_to_lit_or_expr(predicate, str_to_lit: false)._rbexpr
+         )
+       )
+     end
+
+     # Select columns from this DataFrame.
+     #
+     # @param exprs [Array]
+     #   Column(s) to select, specified as positional arguments.
+     #   Accepts expression input. Strings are parsed as column names,
+     #   other non-expression inputs are parsed as literals.
+     # @param named_exprs [Hash]
+     #   Additional columns to select, specified as keyword arguments.
+     #   The columns will be renamed to the keyword used.
+     #
+     # @return [LazyFrame]
+     #
+     # @example
+     #   df = Polars::DataFrame.new(
+     #     {
+     #       "foo" => [1, 2, 3],
+     #       "bar" => [6, 7, 8],
+     #       "ham" => ["a", "b", "c"]
+     #     }
+     #   ).lazy
+     #   df.select("foo").collect
+     #   # =>
+     #   # shape: (3, 1)
+     #   # ┌─────┐
+     #   # │ foo │
+     #   # │ --- │
+     #   # │ i64 │
+     #   # ╞═════╡
+     #   # │ 1   │
+     #   # │ 2   │
+     #   # │ 3   │
+     #   # └─────┘
+     #
+     # @example
+     #   df.select(["foo", "bar"]).collect
+     #   # =>
+     #   # shape: (3, 2)
+     #   # ┌─────┬─────┐
+     #   # │ foo ┆ bar │
+     #   # │ --- ┆ --- │
+     #   # │ i64 ┆ i64 │
+     #   # ╞═════╪═════╡
+     #   # │ 1   ┆ 6   │
+     #   # │ 2   ┆ 7   │
+     #   # │ 3   ┆ 8   │
+     #   # └─────┴─────┘
+     #
+     # @example
+     #   df.select(Polars.col("foo") + 1).collect
+     #   # =>
+     #   # shape: (3, 1)
+     #   # ┌─────┐
+     #   # │ foo │
+     #   # │ --- │
+     #   # │ i64 │
+     #   # ╞═════╡
+     #   # │ 2   │
+     #   # │ 3   │
+     #   # │ 4   │
+     #   # └─────┘
+     #
+     # @example
+     #   df.select([Polars.col("foo") + 1, Polars.col("bar") + 1]).collect
+     #   # =>
+     #   # shape: (3, 2)
+     #   # ┌─────┬─────┐
+     #   # │ foo ┆ bar │
+     #   # │ --- ┆ --- │
+     #   # │ i64 ┆ i64 │
+     #   # ╞═════╪═════╡
+     #   # │ 2   ┆ 7   │
+     #   # │ 3   ┆ 8   │
+     #   # │ 4   ┆ 9   │
+     #   # └─────┴─────┘
+     #
+     # @example
+     #   df.select(Polars.when(Polars.col("foo") > 2).then(10).otherwise(0)).collect
+     #   # =>
+     #   # shape: (3, 1)
+     #   # ┌─────────┐
+     #   # │ literal │
+     #   # │ ---     │
+     #   # │ i64     │
+     #   # ╞═════════╡
+     #   # │ 0       │
+     #   # │ 0       │
+     #   # │ 10      │
+     #   # └─────────┘
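+     #
+     # @example Keyword arguments name the selected expressions (a sketch):
+     #   df.select(next_foo: Polars.col("foo") + 1).collect
+     #   # returns a single column named "next_foo" with values [2, 3, 4]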
+     def select(*exprs, **named_exprs)
+       structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
+
+       rbexprs = Utils.parse_as_list_of_expressions(
+         *exprs, **named_exprs, __structify: structify
+       )
+       _from_rbldf(_ldf.select(rbexprs))
+     end
+
+     # Start a group by operation.
+     #
+     # @param by [Object]
+     #   Column(s) to group by.
+     # @param maintain_order [Boolean]
+     #   Make sure that the order of the groups remains consistent. This is more
+     #   expensive than a default group by.
+     #
+     # @return [LazyGroupBy]
+     #
+     # @example
+     #   df = Polars::DataFrame.new(
+     #     {
+     #       "a" => ["a", "b", "a", "b", "b", "c"],
+     #       "b" => [1, 2, 3, 4, 5, 6],
+     #       "c" => [6, 5, 4, 3, 2, 1]
+     #     }
+     #   ).lazy
+     #   df.group_by("a", maintain_order: true).agg(Polars.col("b").sum).collect
+     #   # =>
+     #   # shape: (3, 2)
+     #   # ┌─────┬─────┐
+     #   # │ a   ┆ b   │
+     #   # │ --- ┆ --- │
+     #   # │ str ┆ i64 │
+     #   # ╞═════╪═════╡
+     #   # │ a   ┆ 4   │
+     #   # │ b   ┆ 11  │
+     #   # │ c   ┆ 6   │
+     #   # └─────┴─────┘
+     def group_by(by, maintain_order: false)
+       rbexprs_by = Utils.selection_to_rbexpr_list(by)
+       lgb = _ldf.group_by(rbexprs_by, maintain_order)
+       LazyGroupBy.new(lgb)
+     end
+     alias_method :groupby, :group_by
+     alias_method :group, :group_by
+
+     # Create rolling groups based on a time column.
+     #
+     # Also works for index values of type `:i32` or `:i64`.
+     #
+     # Different from a `group_by_dynamic`, the windows are determined by the
+     # individual values and are not of constant intervals. For constant intervals
+     # use {#group_by_dynamic}.
+     #
+     # The `period` and `offset` arguments are created either from a timedelta, or
+     # by using the following string language:
+     #
+     # - 1ns (1 nanosecond)
+     # - 1us (1 microsecond)
+     # - 1ms (1 millisecond)
+     # - 1s (1 second)
+     # - 1m (1 minute)
+     # - 1h (1 hour)
+     # - 1d (1 day)
+     # - 1w (1 week)
+     # - 1mo (1 calendar month)
+     # - 1y (1 calendar year)
+     # - 1i (1 index count)
+     #
+     # Or combine them:
+     # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+     #
+     # In case of a group_by_rolling on an integer column, the windows are defined by:
+     #
+     # - "1i" # length 1
+     # - "10i" # length 10
+     #
+     # @param index_column [Object]
+     #   Column used to group based on the time window.
+     #   Often of type Date/Datetime.
+     #   This column must be sorted in ascending order. If not, the output will
+     #   not make sense.
+     #
+     #   In case of a rolling group by on indices, dtype needs to be one of
+     #   `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
+     #   performance matters use an `:i64` column.
+     # @param period [Object]
+     #   Length of the window.
+     # @param offset [Object]
+     #   Offset of the window. Default is -period.
+     # @param closed ["right", "left", "both", "none"]
+     #   Define whether the temporal window interval is closed or not.
+     # @param by [Object]
+     #   Also group by this column/these columns.
+     # @param check_sorted [Boolean]
+     #   When the `by` argument is given, Polars cannot check sortedness
+     #   by the metadata and has to do a full scan on the index column to
+     #   verify data is sorted. This is expensive. If you are sure the
+     #   data within the by groups is sorted, you can set this to `false`.
+     #   Doing so incorrectly will lead to incorrect output.
+     #
+     # @return [LazyGroupBy]
+     #
+     # @example
+     #   dates = [
+     #     "2020-01-01 13:45:48",
+     #     "2020-01-01 16:42:13",
+     #     "2020-01-01 16:45:09",
+     #     "2020-01-02 18:12:48",
+     #     "2020-01-03 19:45:32",
+     #     "2020-01-08 23:16:43"
+     #   ]
+     #   df = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
+     #     Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
+     #   )
+     #   df.rolling(index_column: "dt", period: "2d").agg(
+     #     [
+     #       Polars.sum("a").alias("sum_a"),
+     #       Polars.min("a").alias("min_a"),
+     #       Polars.max("a").alias("max_a")
+     #     ]
+     #   ).collect
+     #   # =>
+     #   # shape: (6, 4)
+     #   # ┌─────────────────────┬───────┬───────┬───────┐
+     #   # │ dt                  ┆ sum_a ┆ min_a ┆ max_a │
+     #   # │ ---                 ┆ ---   ┆ ---   ┆ ---   │
+     #   # │ datetime[μs]        ┆ i64   ┆ i64   ┆ i64   │
+     #   # ╞═════════════════════╪═══════╪═══════╪═══════╡
+     #   # │ 2020-01-01 13:45:48 ┆ 3     ┆ 3     ┆ 3     │
+     #   # │ 2020-01-01 16:42:13 ┆ 10    ┆ 3     ┆ 7     │
+     #   # │ 2020-01-01 16:45:09 ┆ 15    ┆ 3     ┆ 7     │
+     #   # │ 2020-01-02 18:12:48 ┆ 24    ┆ 3     ┆ 9     │
+     #   # │ 2020-01-03 19:45:32 ┆ 11    ┆ 2     ┆ 9     │
+     #   # │ 2020-01-08 23:16:43 ┆ 1     ┆ 1     ┆ 1     │
+     #   # └─────────────────────┴───────┴───────┴───────┘
+     def rolling(
+       index_column:,
+       period:,
+       offset: nil,
+       closed: "right",
+       by: nil,
+       check_sorted: true
+     )
+       index_column = Utils.parse_as_expression(index_column)
+       if offset.nil?
+         offset = "-#{period}"
+       end
+
+       rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
+       period = Utils._timedelta_to_pl_duration(period)
+       offset = Utils._timedelta_to_pl_duration(offset)
+
+       lgb = _ldf.rolling(
+         index_column, period, offset, closed, rbexprs_by, check_sorted
+       )
+       LazyGroupBy.new(lgb)
+     end
+     alias_method :group_by_rolling, :rolling
+     alias_method :groupby_rolling, :rolling
+
+     # Group based on a time value (or index value of type `:i32`, `:i64`).
+     #
+     # Time windows are calculated and rows are assigned to windows. Different from a
+     # normal group by, a row can be a member of multiple groups. The time/index
+     # window could be seen as a rolling window, with a window size determined by
+     # dates/times/values instead of slots in the DataFrame.
+     #
+     # A window is defined by:
+     #
+     # - every: interval of the window
+     # - period: length of the window
+     # - offset: offset of the window
+     #
+     # The `every`, `period` and `offset` arguments are created with
+     # the following string language:
+     #
+     # - 1ns (1 nanosecond)
+     # - 1us (1 microsecond)
+     # - 1ms (1 millisecond)
+     # - 1s (1 second)
+     # - 1m (1 minute)
+     # - 1h (1 hour)
+     # - 1d (1 day)
+     # - 1w (1 week)
+     # - 1mo (1 calendar month)
+     # - 1y (1 calendar year)
+     # - 1i (1 index count)
+     #
+     # Or combine them:
+     # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+     #
+     # In case of a group_by_dynamic on an integer column, the windows are defined by:
+     #
+     # - "1i" # length 1
+     # - "10i" # length 10
+     #
+     # @param index_column [Object]
+     #   Column used to group based on the time window.
+     #   Often of type Date/Datetime.
+     #   This column must be sorted in ascending order. If not, the output will
+     #   not make sense.
+     #
+     #   In case of a dynamic group by on indices, dtype needs to be one of
+     #   `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
+     #   performance matters use an `:i64` column.
+     # @param every [Object]
+     #   Interval of the window.
+     # @param period [Object]
+     #   Length of the window. If nil, it is equal to `every`.
+     # @param offset [Object]
+     #   Offset of the window. If nil and `period` is nil, it will be equal to
+     #   negative `every`.
+     # @param truncate [Boolean]
+     #   Truncate the time value to the window lower bound.
+     # @param include_boundaries [Boolean]
+     #   Add the lower and upper bound of the window to the "_lower_bound" and
+     #   "_upper_bound" columns. This will impact performance because it's harder to
+     #   parallelize.
+     # @param closed ["right", "left", "both", "none"]
+     #   Define whether the temporal window interval is closed or not.
+     # @param by [Object]
+     #   Also group by this column/these columns.
+     # @param check_sorted [Boolean]
+     #   When the `by` argument is given, Polars cannot check sortedness
+     #   by the metadata and has to do a full scan on the index column to
+     #   verify data is sorted. This is expensive. If you are sure the
+     #   data within the by groups is sorted, you can set this to `false`.
+     #   Doing so incorrectly will lead to incorrect output.
+     #
+     # @return [LazyGroupBy]
+     #
+     # @example
+     #   df = Polars::DataFrame.new(
+     #     {
+     #       "time" => Polars.date_range(
+     #         DateTime.new(2021, 12, 16),
+     #         DateTime.new(2021, 12, 16, 3),
+     #         "30m"
+     #       ),
+     #       "n" => 0..6
+     #     }
+     #   )
+     #   # =>
+     #   # shape: (7, 2)
+     #   # ┌─────────────────────┬─────┐
+     #   # │ time                ┆ n   │
+     #   # │ ---                 ┆ --- │
+     #   # │ datetime[μs]        ┆ i64 │
+     #   # ╞═════════════════════╪═════╡
+     #   # │ 2021-12-16 00:00:00 ┆ 0   │
+     #   # │ 2021-12-16 00:30:00 ┆ 1   │
+     #   # │ 2021-12-16 01:00:00 ┆ 2   │
+     #   # │ 2021-12-16 01:30:00 ┆ 3   │
+     #   # │ 2021-12-16 02:00:00 ┆ 4   │
+     #   # │ 2021-12-16 02:30:00 ┆ 5   │
+     #   # │ 2021-12-16 03:00:00 ┆ 6   │
+     #   # └─────────────────────┴─────┘
+     #
+     # @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
+     #   df.group_by_dynamic("time", every: "1h", closed: "right").agg(
+     #     [
+     #       Polars.col("time").min.alias("time_min"),
+     #       Polars.col("time").max.alias("time_max")
+     #     ]
+     #   )
+     #   # =>
+     #   # shape: (4, 3)
+     #   # ┌─────────────────────┬─────────────────────┬─────────────────────┐
+     #   # │ time                ┆ time_min            ┆ time_max            │
+     #   # │ ---                 ┆ ---                 ┆ ---                 │
+     #   # │ datetime[μs]        ┆ datetime[μs]        ┆ datetime[μs]        │
+     #   # ╞═════════════════════╪═════════════════════╪═════════════════════╡
+     #   # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │
+     #   # │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │
+     #   # │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │
+     #   # │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │
+     #   # └─────────────────────┴─────────────────────┴─────────────────────┘
+     #
+     # @example The window boundaries can also be added to the aggregation result.
+     #   df.group_by_dynamic(
+     #     "time", every: "1h", include_boundaries: true, closed: "right"
+     #   ).agg([Polars.col("time").count.alias("time_count")])
+     #   # =>
+     #   # shape: (4, 4)
+     #   # ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
+     #   # │ _lower_boundary     ┆ _upper_boundary     ┆ time                ┆ time_count │
+     #   # │ ---                 ┆ ---                 ┆ ---                 ┆ ---        │
+     #   # │ datetime[μs]        ┆ datetime[μs]        ┆ datetime[μs]        ┆ u32        │
+     #   # ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
+     #   # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1          │
+     #   # │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2          │
+     #   # │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2          │
+     #   # │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2          │
+     #   # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
+     #
+     # @example When closed="left", should not include right end of interval.
+     #   df.group_by_dynamic("time", every: "1h", closed: "left").agg(
+     #     [
+     #       Polars.col("time").count.alias("time_count"),
+     #       Polars.col("time").alias("time_agg_list")
+     #     ]
+     #   )
+     #   # =>
+     #   # shape: (4, 3)
+     #   # ┌─────────────────────┬────────────┬───────────────────────────────────┐
+     #   # │ time                ┆ time_count ┆ time_agg_list                     │
+     #   # │ ---                 ┆ ---        ┆ ---                               │
+     #   # │ datetime[μs]        ┆ u32        ┆ list[datetime[μs]]                │
+     #   # ╞═════════════════════╪════════════╪═══════════════════════════════════╡
+     #   # │ 2021-12-16 00:00:00 ┆ 2          ┆ [2021-12-16 00:00:00, 2021-12-16… │
+     #   # │ 2021-12-16 01:00:00 ┆ 2          ┆ [2021-12-16 01:00:00, 2021-12-16… │
+     #   # │ 2021-12-16 02:00:00 ┆ 2          ┆ [2021-12-16 02:00:00, 2021-12-16… │
+     #   # │ 2021-12-16 03:00:00 ┆ 1          ┆ [2021-12-16 03:00:00]             │
+     #   # └─────────────────────┴────────────┴───────────────────────────────────┘
+     #
+     # @example When closed="both" the time values at the window boundaries belong to 2 groups.
+     #   df.group_by_dynamic("time", every: "1h", closed: "both").agg(
+     #     [Polars.col("time").count.alias("time_count")]
+     #   )
+     #   # =>
+     #   # shape: (5, 2)
+     #   # ┌─────────────────────┬────────────┐
+     #   # │ time                ┆ time_count │
+     #   # │ ---                 ┆ ---        │
+     #   # │ datetime[μs]        ┆ u32        │
+     #   # ╞═════════════════════╪════════════╡
+     #   # │ 2021-12-15 23:00:00 ┆ 1          │
+     #   # │ 2021-12-16 00:00:00 ┆ 3          │
+     #   # │ 2021-12-16 01:00:00 ┆ 3          │
+     #   # │ 2021-12-16 02:00:00 ┆ 3          │
+     #   # │ 2021-12-16 03:00:00 ┆ 1          │
+     #   # └─────────────────────┴────────────┘
+     #
+     # @example Dynamic group bys can also be combined with grouping on normal keys.
+     #   df = Polars::DataFrame.new(
+     #     {
+     #       "time" => Polars.date_range(
+     #         DateTime.new(2021, 12, 16),
+     #         DateTime.new(2021, 12, 16, 3),
+     #         "30m"
+     #       ),
+     #       "groups" => ["a", "a", "a", "b", "b", "a", "a"]
+     #     }
+     #   )
+     #   df.group_by_dynamic(
+     #     "time",
+     #     every: "1h",
+     #     closed: "both",
+     #     by: "groups",
+     #     include_boundaries: true
+     #   ).agg([Polars.col("time").count.alias("time_count")])
+     #   # =>
+     #   # shape: (7, 5)
+     #   # ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
+     #   # │ groups ┆ _lower_boundary     ┆ _upper_boundary     ┆ time                ┆ time_count │
+     #   # │ ---    ┆ ---                 ┆ ---                 ┆ ---                 ┆ ---        │
+     #   # │ str    ┆ datetime[μs]        ┆ datetime[μs]        ┆ datetime[μs]        ┆ u32        │
+     #   # ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
+     #   # │ a      ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1          │
+     #   # │ a      ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3          │
+     #   # │ a      ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1          │
+     #   # │ a      ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2          │
+     #   # │ a      ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1          │
+     #   # │ b      ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2          │
+     #   # │ b      ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1          │
+     #   # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
+     #
+     # @example Dynamic group by on an index column.
+     #   df = Polars::DataFrame.new(
+     #     {
+     #       "idx" => Polars.arange(0, 6, eager: true),
+     #       "A" => ["A", "A", "B", "B", "B", "C"]
+     #     }
+     #   )
+     #   df.group_by_dynamic(
+     #     "idx",
+     #     every: "2i",
+     #     period: "3i",
+     #     include_boundaries: true,
+     #     closed: "right"
+     #   ).agg(Polars.col("A").alias("A_agg_list"))
+     #   # =>
+     #   # shape: (4, 4)
+     #   # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
+     #   # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list      │
+     #   # │ ---             ┆ ---             ┆ --- ┆ ---             │
+     #   # │ i64             ┆ i64             ┆ i64 ┆ list[str]       │
+     #   # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
+     #   # │ -2              ┆ 1               ┆ -2  ┆ ["A", "A"]      │
+     #   # │ 0               ┆ 3               ┆ 0   ┆ ["A", "B", "B"] │
+     #   # │ 2               ┆ 5               ┆ 2   ┆ ["B", "B", "C"] │
+     #   # │ 4               ┆ 7               ┆ 4   ┆ ["C"]           │
+     #   # └─────────────────┴─────────────────┴─────┴─────────────────┘
+     def group_by_dynamic(
+       index_column,
+       every:,
+       period: nil,
+       offset: nil,
+       truncate: nil,
+       include_boundaries: false,
+       closed: "left",
+       label: "left",
+       by: nil,
+       start_by: "window",
+       check_sorted: true
+     )
+       if !truncate.nil?
+         label = truncate ? "left" : "datapoint"
+       end
+
+       index_column = Utils.expr_to_lit_or_expr(index_column, str_to_lit: false)
+       if offset.nil?
+         offset = period.nil? ? "-#{every}" : "0ns"
+       end
+
+       if period.nil?
+         period = every
+       end
+
+       period = Utils._timedelta_to_pl_duration(period)
+       offset = Utils._timedelta_to_pl_duration(offset)
+       every = Utils._timedelta_to_pl_duration(every)
+
+       rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
+       lgb = _ldf.group_by_dynamic(
+         index_column._rbexpr,
+         every,
+         period,
+         offset,
+         label,
+         include_boundaries,
+         closed,
+         rbexprs_by,
+         start_by,
+         check_sorted
+       )
+       LazyGroupBy.new(lgb)
+     end
+     alias_method :groupby_dynamic, :group_by_dynamic
+
+     # Perform an asof join.
+     #
+     # This is similar to a left-join except that we match on nearest key rather than
+     # equal keys.
+     #
+     # Both DataFrames must be sorted by the join_asof key.
+     #
+     # For each row in the left DataFrame:
+     #
+     # - A "backward" search selects the last row in the right DataFrame whose 'on' key is less than or equal to the left's key.
+     # - A "forward" search selects the first row in the right DataFrame whose 'on' key is greater than or equal to the left's key.
+     #
+     # The default is "backward".
+     #
+     # @param other [LazyFrame]
+     #   Lazy DataFrame to join with.
+     # @param left_on [String]
+     #   Join column of the left DataFrame.
+     # @param right_on [String]
+     #   Join column of the right DataFrame.
+     # @param on [String]
+     #   Join column of both DataFrames. If set, `left_on` and `right_on` should be
+     #   nil.
+     # @param by [Object]
+     #   Join on these columns before doing asof join.
+     # @param by_left [Object]
+     #   Join on these columns before doing asof join.
+     # @param by_right [Object]
+     #   Join on these columns before doing asof join.
+     # @param strategy ["backward", "forward"]
+     #   Join strategy.
+     # @param suffix [String]
+     #   Suffix to append to columns with a duplicate name.
+     # @param tolerance [Object]
+     #   Numeric tolerance. By setting this the join will only be done if the near
+     #   keys are within this distance. If an asof join is done on columns of dtype
+     #   "Date", "Datetime", "Duration" or "Time" you use the following string
+     #   language:
+     #
+     #   - 1ns (1 nanosecond)
+     #   - 1us (1 microsecond)
+     #   - 1ms (1 millisecond)
+     #   - 1s (1 second)
+     #   - 1m (1 minute)
+     #   - 1h (1 hour)
+     #   - 1d (1 day)
+     #   - 1w (1 week)
+     #   - 1mo (1 calendar month)
+     #   - 1y (1 calendar year)
+     #   - 1i (1 index count)
+     #
+     #   Or combine them:
+     #   "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+     #
+     # @param allow_parallel [Boolean]
+     #   Allow the physical plan to optionally evaluate the computation of both
+     #   DataFrames up to the join in parallel.
+     # @param force_parallel [Boolean]
+     #   Force the physical plan to evaluate the computation of both DataFrames up to
+     #   the join in parallel.
+     #
+     # @return [LazyFrame]
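+     #
+     # @example A backward asof join on sorted date keys (a sketch with made-up data):
+     #   gdp = Polars::DataFrame.new(
+     #     {"date" => [Date.new(2020, 1, 1), Date.new(2020, 2, 1)], "gdp" => [100, 110]}
+     #   ).lazy
+     #   population = Polars::DataFrame.new(
+     #     {"date" => [Date.new(2020, 1, 15)], "population" => [1.1]}
+     #   ).lazy
+     #   population.join_asof(gdp, on: "date", strategy: "backward").collect
+     #   # each population row gets the most recent gdp value at or before its date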
1650
+ def join_asof(
+ other,
+ left_on: nil,
+ right_on: nil,
+ on: nil,
+ by_left: nil,
+ by_right: nil,
+ by: nil,
+ strategy: "backward",
+ suffix: "_right",
+ tolerance: nil,
+ allow_parallel: true,
+ force_parallel: false
+ )
+ if !other.is_a?(LazyFrame)
+ raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
+ end
+
+ if on.is_a?(::String)
+ left_on = on
+ right_on = on
+ end
+
+ if left_on.nil? || right_on.nil?
+ raise ArgumentError, "You should pass the column to join on as an argument."
+ end
+
+ if by_left.is_a?(::String) || by_left.is_a?(Expr)
+ by_left_ = [by_left]
+ else
+ by_left_ = by_left
+ end
+
+ if by_right.is_a?(::String) || by_right.is_a?(Expr)
+ by_right_ = [by_right]
+ else
+ by_right_ = by_right
+ end
+
+ if by.is_a?(::String)
+ by_left_ = [by]
+ by_right_ = [by]
+ elsif by.is_a?(::Array)
+ by_left_ = by
+ by_right_ = by
+ end
+
+ tolerance_str = nil
+ tolerance_num = nil
+ if tolerance.is_a?(::String)
+ tolerance_str = tolerance
+ else
+ tolerance_num = tolerance
+ end
+
+ _from_rbldf(
+ _ldf.join_asof(
+ other._ldf,
+ Polars.col(left_on)._rbexpr,
+ Polars.col(right_on)._rbexpr,
+ by_left_,
+ by_right_,
+ allow_parallel,
+ force_parallel,
+ suffix,
+ strategy,
+ tolerance_num,
+ tolerance_str
+ )
+ )
+ end
+
+ # Add a join operation to the Logical Plan.
+ #
+ # @param other [LazyFrame]
+ # Lazy DataFrame to join with.
+ # @param left_on [Object]
+ # Join column of the left DataFrame.
+ # @param right_on [Object]
+ # Join column of the right DataFrame.
+ # @param on [Object]
+ # Join column of both DataFrames. If set, `left_on` and `right_on` should be
+ # `nil`.
+ # @param how ["inner", "left", "outer", "semi", "anti", "cross"]
+ # Join strategy.
+ # @param suffix [String]
+ # Suffix to append to columns with a duplicate name.
+ # @param join_nulls [Boolean]
+ # Join on null values. By default null values will never produce matches.
+ # @param allow_parallel [Boolean]
+ # Allow the physical plan to optionally evaluate the computation of both
+ # DataFrames up to the join in parallel.
+ # @param force_parallel [Boolean]
+ # Force the physical plan to evaluate the computation of both DataFrames up to
+ # the join in parallel.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "foo" => [1, 2, 3],
+ # "bar" => [6.0, 7.0, 8.0],
+ # "ham" => ["a", "b", "c"]
+ # }
+ # ).lazy
+ # other_df = Polars::DataFrame.new(
+ # {
+ # "apple" => ["x", "y", "z"],
+ # "ham" => ["a", "b", "d"]
+ # }
+ # ).lazy
+ # df.join(other_df, on: "ham").collect
+ # # =>
+ # # shape: (2, 4)
+ # # ┌─────┬─────┬─────┬───────┐
+ # # │ foo ┆ bar ┆ ham ┆ apple │
+ # # │ --- ┆ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ f64 ┆ str ┆ str │
+ # # ╞═════╪═════╪═════╪═══════╡
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
+ # # └─────┴─────┴─────┴───────┘
+ #
+ # @example
+ # df.join(other_df, on: "ham", how: "outer").collect
+ # # =>
+ # # shape: (4, 5)
+ # # ┌──────┬──────┬──────┬───────┬───────────┐
+ # # │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ f64 ┆ str ┆ str ┆ str │
+ # # ╞══════╪══════╪══════╪═══════╪═══════════╡
+ # # │ 1 ┆ 6.0 ┆ a ┆ x ┆ a │
+ # # │ 2 ┆ 7.0 ┆ b ┆ y ┆ b │
+ # # │ null ┆ null ┆ null ┆ z ┆ d │
+ # # │ 3 ┆ 8.0 ┆ c ┆ null ┆ null │
+ # # └──────┴──────┴──────┴───────┴───────────┘
+ #
+ # @example
+ # df.join(other_df, on: "ham", how: "left").collect
+ # # =>
+ # # shape: (3, 4)
+ # # ┌─────┬─────┬─────┬───────┐
+ # # │ foo ┆ bar ┆ ham ┆ apple │
+ # # │ --- ┆ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ f64 ┆ str ┆ str │
+ # # ╞═════╪═════╪═════╪═══════╡
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
+ # # │ 3 ┆ 8.0 ┆ c ┆ null │
+ # # └─────┴─────┴─────┴───────┘
+ #
+ # @example
+ # df.join(other_df, on: "ham", how: "semi").collect
+ # # =>
+ # # shape: (2, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ bar ┆ ham │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ f64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ 1 ┆ 6.0 ┆ a │
+ # # │ 2 ┆ 7.0 ┆ b │
+ # # └─────┴─────┴─────┘
+ #
+ # @example
+ # df.join(other_df, on: "ham", how: "anti").collect
+ # # =>
+ # # shape: (1, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ bar ┆ ham │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ f64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ 3 ┆ 8.0 ┆ c │
+ # # └─────┴─────┴─────┘
+ def join(
+ other,
+ left_on: nil,
+ right_on: nil,
+ on: nil,
+ how: "inner",
+ suffix: "_right",
+ join_nulls: false,
+ allow_parallel: true,
+ force_parallel: false
+ )
+ if !other.is_a?(LazyFrame)
+ raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
+ end
+
+ if how == "cross"
+ return _from_rbldf(
+ _ldf.join(
+ other._ldf, [], [], allow_parallel, force_parallel, join_nulls, how, suffix
+ )
+ )
+ end
+
+ if !on.nil?
+ rbexprs = Utils.selection_to_rbexpr_list(on)
+ rbexprs_left = rbexprs
+ rbexprs_right = rbexprs
+ elsif !left_on.nil? && !right_on.nil?
+ rbexprs_left = Utils.selection_to_rbexpr_list(left_on)
+ rbexprs_right = Utils.selection_to_rbexpr_list(right_on)
+ else
+ raise ArgumentError, "must specify `on` OR `left_on` and `right_on`"
+ end
+
+ _from_rbldf(
+ self._ldf.join(
+ other._ldf,
+ rbexprs_left,
+ rbexprs_right,
+ allow_parallel,
+ force_parallel,
+ join_nulls,
+ how,
+ suffix
+ )
+ )
+ end
+
+ # Add or overwrite multiple columns in a DataFrame.
+ #
+ # @param exprs [Object]
+ # List of Expressions that evaluate to columns.
+ # @param named_exprs [Hash]
+ # Additional columns to add, specified as keyword arguments; the keywords are
+ # used as column names.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # ldf = Polars::DataFrame.new(
+ # {
+ # "a" => [1, 2, 3, 4],
+ # "b" => [0.5, 4, 10, 13],
+ # "c" => [true, true, false, true]
+ # }
+ # ).lazy
+ # ldf.with_columns(
+ # [
+ # (Polars.col("a") ** 2).alias("a^2"),
+ # (Polars.col("b") / 2).alias("b/2"),
+ # (Polars.col("c").is_not).alias("not c")
+ # ]
+ # ).collect
+ # # =>
+ # # shape: (4, 6)
+ # # ┌─────┬──────┬───────┬─────┬──────┬───────┐
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ f64 ┆ bool ┆ i64 ┆ f64 ┆ bool │
+ # # ╞═════╪══════╪═══════╪═════╪══════╪═══════╡
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1 ┆ 0.25 ┆ false │
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4 ┆ 2.0 ┆ false │
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9 ┆ 5.0 ┆ true │
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16 ┆ 6.5 ┆ false │
+ # # └─────┴──────┴───────┴─────┴──────┴───────┘
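+ #
+ # @example A sketch of the keyword form (not from the original docs; column name invented); keyword arguments name the new columns:
+ # ldf.with_columns(d: Polars.col("a") * 2).collect.columns
+ # # => ["a", "b", "c", "d"]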
+ def with_columns(*exprs, **named_exprs)
+ structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
+ rbexprs = Utils.parse_as_list_of_expressions(*exprs, **named_exprs, __structify: structify)
+
+ _from_rbldf(_ldf.with_columns(rbexprs))
+ end
+
+ # Add an external context to the computation graph.
+ #
+ # This allows expressions to also access columns from DataFrames
+ # that are not part of this one.
+ #
+ # @param other [Object]
+ # LazyFrame(s) to make available as extra context.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df_a = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => ["a", "c", nil]}).lazy
+ # df_other = Polars::DataFrame.new({"c" => ["foo", "ham"]})
+ # (
+ # df_a.with_context(df_other.lazy).select(
+ # [Polars.col("b") + Polars.col("c").first]
+ # )
+ # ).collect
+ # # =>
+ # # shape: (3, 1)
+ # # ┌──────┐
+ # # │ b │
+ # # │ --- │
+ # # │ str │
+ # # ╞══════╡
+ # # │ afoo │
+ # # │ cfoo │
+ # # │ null │
+ # # └──────┘
+ def with_context(other)
+ if !other.is_a?(::Array)
+ other = [other]
+ end
+
+ _from_rbldf(_ldf.with_context(other.map(&:_ldf)))
+ end
+
+ # Add or overwrite a column in a DataFrame.
+ #
+ # @param column [Object]
+ # Expression that evaluates to a column, or a Series to use.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => [1, 3, 5],
+ # "b" => [2, 4, 6]
+ # }
+ # ).lazy
+ # df.with_column((Polars.col("b") ** 2).alias("b_squared")).collect
+ # # =>
+ # # shape: (3, 3)
+ # # ┌─────┬─────┬───────────┐
+ # # │ a ┆ b ┆ b_squared │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ i64 ┆ i64 │
+ # # ╞═════╪═════╪═══════════╡
+ # # │ 1 ┆ 2 ┆ 4 │
+ # # │ 3 ┆ 4 ┆ 16 │
+ # # │ 5 ┆ 6 ┆ 36 │
+ # # └─────┴─────┴───────────┘
+ #
+ # @example
+ # df.with_column(Polars.col("a") ** 2).collect
+ # # =>
+ # # shape: (3, 2)
+ # # ┌─────┬─────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ 1 ┆ 2 │
+ # # │ 9 ┆ 4 │
+ # # │ 25 ┆ 6 │
+ # # └─────┴─────┘
+ def with_column(column)
+ with_columns([column])
+ end
+
+ # Remove one or multiple columns from a DataFrame.
+ #
+ # @param columns [Object]
+ # - Name of the column that should be removed.
+ # - List of column names.
+ #
+ # @return [LazyFrame]
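+ #
+ # @example A small sketch with invented data:
+ # df = Polars::DataFrame.new({"a" => [1, 2], "b" => [3, 4], "c" => [5, 6]}).lazy
+ # df.drop("b", "c").collect.columns
+ # # => ["a"]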
+ def drop(*columns)
+ drop_cols = Utils._expand_selectors(self, *columns)
+ _from_rbldf(_ldf.drop(drop_cols))
+ end
+
+ # Rename column names.
+ #
+ # @param mapping [Hash]
+ # Key-value pairs that map from old name to new name.
+ #
+ # @return [LazyFrame]
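+ #
+ # @example A small sketch with invented data:
+ # df = Polars::DataFrame.new({"foo" => [1, 2], "bar" => [3, 4]}).lazy
+ # df.rename({"foo" => "apple"}).collect.columns
+ # # => ["apple", "bar"]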
+ def rename(mapping)
+ existing = mapping.keys
+ _new = mapping.values
+ _from_rbldf(_ldf.rename(existing, _new))
+ end
+
+ # Reverse the DataFrame.
+ #
+ # @return [LazyFrame]
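+ #
+ # @example A small sketch with invented data:
+ # Polars::LazyFrame.new({"a" => [1, 2, 3]}).reverse.collect["a"].to_a
+ # # => [3, 2, 1]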
+ def reverse
+ _from_rbldf(_ldf.reverse)
+ end
+
+ # Shift the values by a given period.
+ #
+ # @param n [Integer]
+ # Number of places to shift (may be negative).
+ # @param fill_value [Object]
+ # Fill the resulting null values with this value.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => [1, 3, 5],
+ # "b" => [2, 4, 6]
+ # }
+ # ).lazy
+ # df.shift(1).collect
+ # # =>
+ # # shape: (3, 2)
+ # # ┌──────┬──────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞══════╪══════╡
+ # # │ null ┆ null │
+ # # │ 1 ┆ 2 │
+ # # │ 3 ┆ 4 │
+ # # └──────┴──────┘
+ #
+ # @example
+ # df.shift(-1).collect
+ # # =>
+ # # shape: (3, 2)
+ # # ┌──────┬──────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞══════╪══════╡
+ # # │ 3 ┆ 4 │
+ # # │ 5 ┆ 6 │
+ # # │ null ┆ null │
+ # # └──────┴──────┘
+ def shift(n, fill_value: nil)
+ if !fill_value.nil?
+ fill_value = Utils.parse_as_expression(fill_value, str_as_lit: true)
+ end
+ n = Utils.parse_as_expression(n)
+ _from_rbldf(_ldf.shift(n, fill_value))
+ end
+
+ # Shift the values by a given period and fill the resulting null values.
+ #
+ # @param periods [Integer]
+ # Number of places to shift (may be negative).
+ # @param fill_value [Object]
+ # Fill `nil` values with the result of this expression.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => [1, 3, 5],
+ # "b" => [2, 4, 6]
+ # }
+ # ).lazy
+ # df.shift_and_fill(1, 0).collect
+ # # =>
+ # # shape: (3, 2)
+ # # ┌─────┬─────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ 0 ┆ 0 │
+ # # │ 1 ┆ 2 │
+ # # │ 3 ┆ 4 │
+ # # └─────┴─────┘
+ #
+ # @example
+ # df.shift_and_fill(-1, 0).collect
+ # # =>
+ # # shape: (3, 2)
+ # # ┌─────┬─────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ 3 ┆ 4 │
+ # # │ 5 ┆ 6 │
+ # # │ 0 ┆ 0 │
+ # # └─────┴─────┘
+ def shift_and_fill(periods, fill_value)
+ shift(periods, fill_value: fill_value)
+ end
+
+ # Get a slice of this DataFrame.
+ #
+ # @param offset [Integer]
+ # Start index. Negative indexing is supported.
+ # @param length [Integer]
+ # Length of the slice. If set to `nil`, all rows starting at the offset
+ # will be selected.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => ["x", "y", "z"],
+ # "b" => [1, 3, 5],
+ # "c" => [2, 4, 6]
+ # }
+ # ).lazy
+ # df.slice(1, 2).collect
+ # # =>
+ # # shape: (2, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ a ┆ b ┆ c │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ i64 │
+ # # ╞═════╪═════╪═════╡
+ # # │ y ┆ 3 ┆ 4 │
+ # # │ z ┆ 5 ┆ 6 │
+ # # └─────┴─────┴─────┘
+ def slice(offset, length = nil)
+ if length && length < 0
+ raise ArgumentError, "Negative slice lengths (#{length}) are invalid for LazyFrame"
+ end
+ _from_rbldf(_ldf.slice(offset, length))
+ end
+
+ # Get the first `n` rows.
+ #
+ # Alias for {#head}.
+ #
+ # @param n [Integer]
+ # Number of rows to return.
+ #
+ # @return [LazyFrame]
+ #
+ # @note
+ # Consider using the {#fetch} operation if you only want to test your
+ # query. The {#fetch} operation will load the first `n` rows at the scan
+ # level, whereas {#head}/{#limit} are applied at the end.
+ def limit(n = 5)
+ head(n)
+ end
+
+ # Get the first `n` rows.
+ #
+ # @param n [Integer]
+ # Number of rows to return.
+ #
+ # @return [LazyFrame]
+ #
+ # @note
+ # Consider using the {#fetch} operation if you only want to test your
+ # query. The {#fetch} operation will load the first `n` rows at the scan
+ # level, whereas {#head}/{#limit} are applied at the end.
+ def head(n = 5)
+ slice(0, n)
+ end
+
+ # Get the last `n` rows.
+ #
+ # @param n [Integer]
+ # Number of rows.
+ #
+ # @return [LazyFrame]
+ def tail(n = 5)
+ _from_rbldf(_ldf.tail(n))
+ end
+
+ # Get the last row of the DataFrame.
+ #
+ # @return [LazyFrame]
+ def last
+ tail(1)
+ end
+
+ # Get the first row of the DataFrame.
+ #
+ # @return [LazyFrame]
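+ #
+ # @example A small sketch with invented data:
+ # Polars::LazyFrame.new({"a" => [1, 2, 3]}).first.collect["a"].to_a
+ # # => [1]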
+ def first
+ slice(0, 1)
+ end
+
+ # Add a column at index 0 that counts the rows.
+ #
+ # @param name [String]
+ # Name of the column to add.
+ # @param offset [Integer]
+ # Start the row count at this offset.
+ #
+ # @return [LazyFrame]
+ #
+ # @note
+ # This can have a negative effect on query performance.
+ # This may, for instance, block predicate pushdown optimization.
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => [1, 3, 5],
+ # "b" => [2, 4, 6]
+ # }
+ # ).lazy
+ # df.with_row_index.collect
+ # # =>
+ # # shape: (3, 3)
+ # # ┌────────┬─────┬─────┐
+ # # │ row_nr ┆ a ┆ b │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ u32 ┆ i64 ┆ i64 │
+ # # ╞════════╪═════╪═════╡
+ # # │ 0 ┆ 1 ┆ 2 │
+ # # │ 1 ┆ 3 ┆ 4 │
+ # # │ 2 ┆ 5 ┆ 6 │
+ # # └────────┴─────┴─────┘
+ def with_row_index(name: "row_nr", offset: 0)
+ _from_rbldf(_ldf.with_row_index(name, offset))
+ end
+ alias_method :with_row_count, :with_row_index
+
+ # Take every nth row in the LazyFrame and return as a new LazyFrame.
+ #
+ # @param n [Integer]
+ # Gather every n-th row.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]}).lazy
+ # s.take_every(2).collect
+ # # =>
+ # # shape: (2, 2)
+ # # ┌─────┬─────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ 1 ┆ 5 │
+ # # │ 3 ┆ 7 │
+ # # └─────┴─────┘
+ def take_every(n)
+ select(Utils.col("*").take_every(n))
+ end
+
+ # Fill null values using the specified value or strategy.
+ #
+ # @param value [Object]
+ # Value used to fill null values.
+ # @param strategy [nil, "forward", "backward", "min", "max", "mean", "zero", "one"]
+ # Strategy used to fill null values.
+ # @param limit [Integer]
+ # Number of consecutive null values to fill when using the "forward" or
+ # "backward" strategy.
+ #
+ # @return [LazyFrame]
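+ #
+ # @example A small sketch with invented data:
+ # Polars::LazyFrame.new({"a" => [1, nil, 3]}).fill_null(99).collect["a"].to_a
+ # # => [1, 99, 3]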
+ def fill_null(value = nil, strategy: nil, limit: nil, matches_supertype: nil)
+ select(Polars.all.fill_null(value, strategy: strategy, limit: limit))
+ end
+
+ # Fill floating point NaN values.
+ #
+ # @param fill_value [Object]
+ # Value to fill the NaN values with.
+ #
+ # @return [LazyFrame]
+ #
+ # @note
+ # Note that floating point NaN (Not a Number) values are not missing values!
+ # To replace missing values, use `fill_null` instead.
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => [1.5, 2, Float::NAN, 4],
+ # "b" => [0.5, 4, Float::NAN, 13],
+ # }
+ # ).lazy
+ # df.fill_nan(99).collect
+ # # =>
+ # # shape: (4, 2)
+ # # ┌──────┬──────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ f64 ┆ f64 │
+ # # ╞══════╪══════╡
+ # # │ 1.5 ┆ 0.5 │
+ # # │ 2.0 ┆ 4.0 │
+ # # │ 99.0 ┆ 99.0 │
+ # # │ 4.0 ┆ 13.0 │
+ # # └──────┴──────┘
+ def fill_nan(fill_value)
+ if !fill_value.is_a?(Expr)
+ fill_value = Utils.lit(fill_value)
+ end
+ _from_rbldf(_ldf.fill_nan(fill_value._rbexpr))
+ end
+
+ # Aggregate the columns in the DataFrame to their standard deviation value.
+ #
+ # @param ddof [Integer]
+ # "Delta Degrees of Freedom": the divisor used in the calculation is N - ddof,
+ # where N represents the number of elements.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
+ # df.std.collect
+ # # =>
+ # # shape: (1, 2)
+ # # ┌──────────┬─────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ f64 ┆ f64 │
+ # # ╞══════════╪═════╡
+ # # │ 1.290994 ┆ 0.5 │
+ # # └──────────┴─────┘
+ #
+ # @example
+ # df.std(ddof: 0).collect
+ # # =>
+ # # shape: (1, 2)
+ # # ┌──────────┬──────────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ f64 ┆ f64 │
+ # # ╞══════════╪══════════╡
+ # # │ 1.118034 ┆ 0.433013 │
+ # # └──────────┴──────────┘
+ def std(ddof: 1)
+ _from_rbldf(_ldf.std(ddof))
+ end
+
+ # Aggregate the columns in the DataFrame to their variance value.
+ #
+ # @param ddof [Integer]
+ # "Delta Degrees of Freedom": the divisor used in the calculation is N - ddof,
+ # where N represents the number of elements.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
+ # df.var.collect
+ # # =>
+ # # shape: (1, 2)
+ # # ┌──────────┬──────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ f64 ┆ f64 │
+ # # ╞══════════╪══════╡
+ # # │ 1.666667 ┆ 0.25 │
+ # # └──────────┴──────┘
+ #
+ # @example
+ # df.var(ddof: 0).collect
+ # # =>
+ # # shape: (1, 2)
+ # # ┌──────┬────────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ f64 ┆ f64 │
+ # # ╞══════╪════════╡
+ # # │ 1.25 ┆ 0.1875 │
+ # # └──────┴────────┘
+ def var(ddof: 1)
+ _from_rbldf(_ldf.var(ddof))
+ end
+
+ # Aggregate the columns in the DataFrame to their maximum value.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
+ # df.max.collect
+ # # =>
+ # # shape: (1, 2)
+ # # ┌─────┬─────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ 4 ┆ 2 │
+ # # └─────┴─────┘
+ def max
+ _from_rbldf(_ldf.max)
+ end
+
+ # Aggregate the columns in the DataFrame to their minimum value.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
+ # df.min.collect
+ # # =>
+ # # shape: (1, 2)
+ # # ┌─────┬─────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ 1 ┆ 1 │
+ # # └─────┴─────┘
+ def min
+ _from_rbldf(_ldf.min)
+ end
+
+ # Aggregate the columns in the DataFrame to their sum value.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
+ # df.sum.collect
+ # # =>
+ # # shape: (1, 2)
+ # # ┌─────┬─────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ 10 ┆ 5 │
+ # # └─────┴─────┘
+ def sum
+ _from_rbldf(_ldf.sum)
+ end
+
+ # Aggregate the columns in the DataFrame to their mean value.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
+ # df.mean.collect
+ # # =>
+ # # shape: (1, 2)
+ # # ┌─────┬──────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ f64 ┆ f64 │
+ # # ╞═════╪══════╡
+ # # │ 2.5 ┆ 1.25 │
+ # # └─────┴──────┘
+ def mean
+ _from_rbldf(_ldf.mean)
+ end
+
+ # Aggregate the columns in the DataFrame to their median value.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
+ # df.median.collect
+ # # =>
+ # # shape: (1, 2)
+ # # ┌─────┬─────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ f64 ┆ f64 │
+ # # ╞═════╪═════╡
+ # # │ 2.5 ┆ 1.0 │
+ # # └─────┴─────┘
+ def median
+ _from_rbldf(_ldf.median)
+ end
+
+ # Aggregate the columns in the DataFrame to their quantile value.
+ #
+ # @param quantile [Float]
+ # Quantile between 0.0 and 1.0.
+ # @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
+ # Interpolation method.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
+ # df.quantile(0.7).collect
+ # # =>
+ # # shape: (1, 2)
+ # # ┌─────┬─────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ f64 ┆ f64 │
+ # # ╞═════╪═════╡
+ # # │ 3.0 ┆ 1.0 │
+ # # └─────┴─────┘
+ def quantile(quantile, interpolation: "nearest")
+ quantile = Utils.expr_to_lit_or_expr(quantile, str_to_lit: false)
+ _from_rbldf(_ldf.quantile(quantile._rbexpr, interpolation))
+ end
+
+ # Explode lists to long format.
+ #
+ # @param columns [Object]
+ # Name of the column(s) to explode.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "letters" => ["a", "a", "b", "c"],
+ # "numbers" => [[1], [2, 3], [4, 5], [6, 7, 8]],
+ # }
+ # ).lazy
+ # df.explode("numbers").collect
+ # # =>
+ # # shape: (8, 2)
+ # # ┌─────────┬─────────┐
+ # # │ letters ┆ numbers │
+ # # │ --- ┆ --- │
+ # # │ str ┆ i64 │
+ # # ╞═════════╪═════════╡
+ # # │ a ┆ 1 │
+ # # │ a ┆ 2 │
+ # # │ a ┆ 3 │
+ # # │ b ┆ 4 │
+ # # │ b ┆ 5 │
+ # # │ c ┆ 6 │
+ # # │ c ┆ 7 │
+ # # │ c ┆ 8 │
+ # # └─────────┴─────────┘
+ def explode(columns)
+ columns = Utils.selection_to_rbexpr_list(columns)
+ _from_rbldf(_ldf.explode(columns))
+ end
+
+ # Drop duplicate rows from this DataFrame.
+ #
+ # Note that this fails if there is a column of type `List` in the DataFrame or
+ # subset.
+ #
+ # @param maintain_order [Boolean]
+ # Keep the same order as the original DataFrame. This requires more work to
+ # compute.
+ # @param subset [Object]
+ # Subset to use to compare rows.
+ # @param keep ["first", "last"]
+ # Which of the duplicate rows to keep.
+ #
+ # @return [LazyFrame]
+ def unique(maintain_order: true, subset: nil, keep: "first")
+ if !subset.nil? && !subset.is_a?(::Array)
+ subset = [subset]
+ end
+ _from_rbldf(_ldf.unique(maintain_order, subset, keep))
+ end
+
+ # Drop rows with null values from this LazyFrame.
+ #
+ # @param subset [Object]
+ # Subset of column(s) on which `drop_nulls` will be applied.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "foo" => [1, 2, 3],
+ # "bar" => [6, nil, 8],
+ # "ham" => ["a", "b", "c"]
+ # }
+ # )
+ # df.lazy.drop_nulls.collect
+ # # =>
+ # # shape: (2, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ bar ┆ ham │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ i64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ 1 ┆ 6 ┆ a │
+ # # │ 3 ┆ 8 ┆ c │
+ # # └─────┴─────┴─────┘
+ def drop_nulls(subset: nil)
+ if !subset.nil? && !subset.is_a?(::Array)
+ subset = [subset]
+ end
+ _from_rbldf(_ldf.drop_nulls(subset))
+ end
+
+ # Unpivot a DataFrame from wide to long format.
+ #
+ # Optionally leaves identifiers set.
+ #
+ # This function is useful to massage a DataFrame into a format where one or more
+ # columns are identifier variables (id_vars), while all other columns, considered
+ # measured variables (value_vars), are "unpivoted" to the row axis, leaving just
+ # two non-identifier columns, 'variable' and 'value'.
+ #
+ # @param id_vars [Object]
+ # Columns to use as identifier variables.
+ # @param value_vars [Object]
+ # Columns to use as value variables.
+ # If `value_vars` is empty all columns that are not in `id_vars` will be used.
+ # @param variable_name [String]
+ # Name to give to the `variable` column. Defaults to "variable".
+ # @param value_name [String]
+ # Name to give to the `value` column. Defaults to "value".
+ # @param streamable [Boolean]
+ # Allow this node to run in the streaming engine.
+ # If this runs in streaming, the output of the melt operation
+ # will not have a stable ordering.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => ["x", "y", "z"],
+ # "b" => [1, 3, 5],
+ # "c" => [2, 4, 6]
+ # }
+ # ).lazy
+ # df.melt(id_vars: "a", value_vars: ["b", "c"]).collect
+ # # =>
+ # # shape: (6, 3)
+ # # ┌─────┬──────────┬───────┐
+ # # │ a ┆ variable ┆ value │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ str ┆ str ┆ i64 │
+ # # ╞═════╪══════════╪═══════╡
+ # # │ x ┆ b ┆ 1 │
+ # # │ y ┆ b ┆ 3 │
+ # # │ z ┆ b ┆ 5 │
+ # # │ x ┆ c ┆ 2 │
+ # # │ y ┆ c ┆ 4 │
+ # # │ z ┆ c ┆ 6 │
+ # # └─────┴──────────┴───────┘
+ def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil, streamable: true)
+ if value_vars.is_a?(::String)
+ value_vars = [value_vars]
+ end
+ if id_vars.is_a?(::String)
+ id_vars = [id_vars]
+ end
+ if value_vars.nil?
+ value_vars = []
+ end
+ if id_vars.nil?
+ id_vars = []
+ end
+ _from_rbldf(
+ _ldf.melt(id_vars, value_vars, value_name, variable_name, streamable)
+ )
+ end
+
+ # def map
+ # end
+
+ # Interpolate intermediate values. The interpolation method is linear.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "foo" => [1, nil, 9, 10],
+ # "bar" => [6, 7, 9, nil],
+ # "baz" => [1, nil, nil, 9]
+ # }
+ # ).lazy
+ # df.interpolate.collect
+ # # =>
+ # # shape: (4, 3)
+ # # ┌──────┬──────┬──────────┐
+ # # │ foo ┆ bar ┆ baz │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ f64 ┆ f64 ┆ f64 │
+ # # ╞══════╪══════╪══════════╡
+ # # │ 1.0 ┆ 6.0 ┆ 1.0 │
+ # # │ 5.0 ┆ 7.0 ┆ 3.666667 │
+ # # │ 9.0 ┆ 9.0 ┆ 6.333333 │
+ # # │ 10.0 ┆ null ┆ 9.0 │
+ # # └──────┴──────┴──────────┘
+ def interpolate
+ select(Utils.col("*").interpolate)
+ end
+
+ # Decompose a struct into its fields.
+ #
+ # The fields will be inserted into the `DataFrame` at the location of the
+ # `struct` type.
+ #
+ # @param names [Object]
+ # Names of the struct columns that will be decomposed into their fields.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = (
+ # Polars::DataFrame.new(
+ # {
+ # "before" => ["foo", "bar"],
+ # "t_a" => [1, 2],
+ # "t_b" => ["a", "b"],
+ # "t_c" => [true, nil],
+ # "t_d" => [[1, 2], [3]],
+ # "after" => ["baz", "womp"]
+ # }
+ # )
+ # .lazy
+ # .select(
+ # ["before", Polars.struct(Polars.col("^t_.$")).alias("t_struct"), "after"]
+ # )
+ # )
+ # df.fetch
+ # # =>
+ # # shape: (2, 3)
+ # # ┌────────┬─────────────────────┬───────┐
+ # # │ before ┆ t_struct ┆ after │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ str ┆ struct[4] ┆ str │
+ # # ╞════════╪═════════════════════╪═══════╡
+ # # │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │
+ # # │ bar ┆ {2,"b",null,[3]} ┆ womp │
+ # # └────────┴─────────────────────┴───────┘
+ #
+ # @example
+ # df.unnest("t_struct").fetch
+ # # =>
+ # # shape: (2, 6)
+ # # ┌────────┬─────┬─────┬──────┬───────────┬───────┐
+ # # │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │
+ # # ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
+ # # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │
+ # # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
+ # # └────────┴─────┴─────┴──────┴───────────┴───────┘
+ def unnest(names)
+ if names.is_a?(::String)
+ names = [names]
+ end
+ _from_rbldf(_ldf.unnest(names))
+ end
+
+ # Take two sorted DataFrames and merge them by the sorted key.
+ #
+ # The output of this operation will also be sorted.
+ # It is the caller's responsibility to ensure that the frames are sorted
+ # by that key; otherwise the output will not make sense.
+ #
+ # The schemas of both LazyFrames must be equal.
+ #
+ # @param other [LazyFrame]
+ # Other LazyFrame that must be merged.
+ # @param key [String]
+ # Key that is sorted.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df0 = Polars::LazyFrame.new(
+ # {"name" => ["steve", "elise", "bob"], "age" => [42, 44, 18]}
+ # ).sort("age")
+ # df1 = Polars::LazyFrame.new(
+ # {"name" => ["anna", "megan", "steve", "thomas"], "age" => [21, 33, 42, 20]}
+ # ).sort("age")
+ # df0.merge_sorted(df1, "age").collect
+ # # =>
+ # # shape: (7, 2)
+ # # ┌────────┬─────┐
+ # # │ name ┆ age │
+ # # │ --- ┆ --- │
+ # # │ str ┆ i64 │
+ # # ╞════════╪═════╡
+ # # │ bob ┆ 18 │
+ # # │ thomas ┆ 20 │
+ # # │ anna ┆ 21 │
+ # # │ megan ┆ 33 │
+ # # │ steve ┆ 42 │
+ # # │ steve ┆ 42 │
+ # # │ elise ┆ 44 │
+ # # └────────┴─────┘
+ def merge_sorted(other, key)
+ _from_rbldf(_ldf.merge_sorted(other._ldf, key))
+ end
+
+ # Indicate that one or multiple columns are sorted.
+ #
+ # @param column [Object]
+ # Columns that are sorted
+ # @param more_columns [Object]
+ # Additional columns that are sorted, specified as positional arguments.
+ # @param descending [Boolean]
+ # Whether the columns are sorted in descending order.
+ #
+ # @return [LazyFrame]
+ def set_sorted(
+ column,
+ *more_columns,
+ descending: false
+ )
+ columns = Utils.selection_to_rbexpr_list(column)
+ if more_columns.any?
+ columns.concat(Utils.selection_to_rbexpr_list(more_columns))
+ end
+ with_columns(
+ columns.map { |e| Utils.wrap_expr(e).set_sorted(descending: descending) }
+ )
+ end
+
+ # TODO
+ # def update
+ # end
+
+ private
+
+ def initialize_copy(other)
+ super
+ self._ldf = _ldf._clone
+ end
+
+ def _from_rbldf(rb_ldf)
+ self.class._from_rbldf(rb_ldf)
+ end
+ end
+ end