polars-df 0.13.0-x64-mingw-ucrt

Files changed (80)
  1. checksums.yaml +7 -0
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +208 -0
  4. data/Cargo.lock +2556 -0
  5. data/Cargo.toml +6 -0
  6. data/LICENSE-THIRD-PARTY.txt +39278 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +437 -0
  9. data/lib/polars/3.1/polars.so +0 -0
  10. data/lib/polars/3.2/polars.so +0 -0
  11. data/lib/polars/3.3/polars.so +0 -0
  12. data/lib/polars/array_expr.rb +537 -0
  13. data/lib/polars/array_name_space.rb +423 -0
  14. data/lib/polars/batched_csv_reader.rb +104 -0
  15. data/lib/polars/binary_expr.rb +77 -0
  16. data/lib/polars/binary_name_space.rb +66 -0
  17. data/lib/polars/cat_expr.rb +36 -0
  18. data/lib/polars/cat_name_space.rb +88 -0
  19. data/lib/polars/config.rb +530 -0
  20. data/lib/polars/convert.rb +98 -0
  21. data/lib/polars/data_frame.rb +5191 -0
  22. data/lib/polars/data_types.rb +466 -0
  23. data/lib/polars/date_time_expr.rb +1397 -0
  24. data/lib/polars/date_time_name_space.rb +1287 -0
  25. data/lib/polars/dynamic_group_by.rb +52 -0
  26. data/lib/polars/exceptions.rb +38 -0
  27. data/lib/polars/expr.rb +7256 -0
  28. data/lib/polars/expr_dispatch.rb +22 -0
  29. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  30. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  31. data/lib/polars/functions/as_datatype.rb +271 -0
  32. data/lib/polars/functions/col.rb +47 -0
  33. data/lib/polars/functions/eager.rb +182 -0
  34. data/lib/polars/functions/lazy.rb +1329 -0
  35. data/lib/polars/functions/len.rb +49 -0
  36. data/lib/polars/functions/lit.rb +35 -0
  37. data/lib/polars/functions/random.rb +16 -0
  38. data/lib/polars/functions/range/date_range.rb +136 -0
  39. data/lib/polars/functions/range/datetime_range.rb +149 -0
  40. data/lib/polars/functions/range/int_range.rb +51 -0
  41. data/lib/polars/functions/range/time_range.rb +141 -0
  42. data/lib/polars/functions/repeat.rb +144 -0
  43. data/lib/polars/functions/whenthen.rb +96 -0
  44. data/lib/polars/functions.rb +57 -0
  45. data/lib/polars/group_by.rb +613 -0
  46. data/lib/polars/io/avro.rb +24 -0
  47. data/lib/polars/io/csv.rb +696 -0
  48. data/lib/polars/io/database.rb +73 -0
  49. data/lib/polars/io/ipc.rb +275 -0
  50. data/lib/polars/io/json.rb +29 -0
  51. data/lib/polars/io/ndjson.rb +80 -0
  52. data/lib/polars/io/parquet.rb +233 -0
  53. data/lib/polars/lazy_frame.rb +2708 -0
  54. data/lib/polars/lazy_group_by.rb +181 -0
  55. data/lib/polars/list_expr.rb +791 -0
  56. data/lib/polars/list_name_space.rb +449 -0
  57. data/lib/polars/meta_expr.rb +222 -0
  58. data/lib/polars/name_expr.rb +198 -0
  59. data/lib/polars/plot.rb +109 -0
  60. data/lib/polars/rolling_group_by.rb +35 -0
  61. data/lib/polars/series.rb +4444 -0
  62. data/lib/polars/slice.rb +104 -0
  63. data/lib/polars/sql_context.rb +194 -0
  64. data/lib/polars/string_cache.rb +75 -0
  65. data/lib/polars/string_expr.rb +1495 -0
  66. data/lib/polars/string_name_space.rb +811 -0
  67. data/lib/polars/struct_expr.rb +98 -0
  68. data/lib/polars/struct_name_space.rb +96 -0
  69. data/lib/polars/testing.rb +507 -0
  70. data/lib/polars/utils/constants.rb +9 -0
  71. data/lib/polars/utils/convert.rb +97 -0
  72. data/lib/polars/utils/parse.rb +89 -0
  73. data/lib/polars/utils/various.rb +76 -0
  74. data/lib/polars/utils/wrap.rb +19 -0
  75. data/lib/polars/utils.rb +130 -0
  76. data/lib/polars/version.rb +4 -0
  77. data/lib/polars/whenthen.rb +83 -0
  78. data/lib/polars-df.rb +1 -0
  79. data/lib/polars.rb +91 -0
  80. metadata +138 -0
data/lib/polars/lazy_frame.rb
@@ -0,0 +1,2708 @@
+ module Polars
+ # Representation of a Lazy computation graph/query against a DataFrame.
+ class LazyFrame
+ # @private
+ attr_accessor :_ldf
+
+ # Create a new LazyFrame.
+ def initialize(data = nil, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
+ self._ldf = (
+ DataFrame.new(
+ data,
+ schema: schema,
+ schema_overrides: schema_overrides,
+ orient: orient,
+ infer_schema_length: infer_schema_length,
+ nan_to_null: nan_to_null
+ )
+ .lazy
+ ._ldf
+ )
+ end
+
+ # @private
+ def self._from_rbldf(rb_ldf)
+ ldf = LazyFrame.allocate
+ ldf._ldf = rb_ldf
+ ldf
+ end
+
+ # def self.from_json
+ # end
+
+ # Read a logical plan from a JSON file to construct a LazyFrame.
+ #
+ # @param file [String]
+ # Path to a file or a file-like object.
+ #
+ # @return [LazyFrame]
+ def self.read_json(file)
+ if Utils.pathlike?(file)
+ file = Utils.normalize_filepath(file)
+ end
+
+ Utils.wrap_ldf(RbLazyFrame.read_json(file))
+ end
+
+ # Get column names.
+ #
+ # @return [Array]
+ #
+ # @example
+ # df = (
+ # Polars::DataFrame.new(
+ # {
+ # "foo" => [1, 2, 3],
+ # "bar" => [6, 7, 8],
+ # "ham" => ["a", "b", "c"]
+ # }
+ # )
+ # .lazy
+ # .select(["foo", "bar"])
+ # )
+ # df.columns
+ # # => ["foo", "bar"]
+ def columns
+ _ldf.collect_schema.keys
+ end
+
+ # Get dtypes of columns in LazyFrame.
+ #
+ # @return [Array]
+ #
+ # @example
+ # lf = Polars::DataFrame.new(
+ # {
+ # "foo" => [1, 2, 3],
+ # "bar" => [6.0, 7.0, 8.0],
+ # "ham" => ["a", "b", "c"]
+ # }
+ # ).lazy
+ # lf.dtypes
+ # # => [Polars::Int64, Polars::Float64, Polars::String]
+ def dtypes
+ _ldf.collect_schema.values
+ end
+
+ # Get the schema.
+ #
+ # @return [Hash]
+ #
+ # @example
+ # lf = Polars::DataFrame.new(
+ # {
+ # "foo" => [1, 2, 3],
+ # "bar" => [6.0, 7.0, 8.0],
+ # "ham" => ["a", "b", "c"]
+ # }
+ # ).lazy
+ # lf.schema
+ # # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String}
+ def schema
+ _ldf.collect_schema
+ end
+
+ # Get the width of the LazyFrame.
+ #
+ # @return [Integer]
+ #
+ # @example
+ # lf = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]}).lazy
+ # lf.width
+ # # => 2
+ def width
+ _ldf.collect_schema.length
+ end
+
+ # Check if LazyFrame includes key.
+ #
+ # @return [Boolean]
+ def include?(key)
+ columns.include?(key)
+ end
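A minimal usage sketch for `include?` (the frame and column names here are hypothetical):

  lf = Polars::DataFrame.new({"foo" => [1, 2, 3]}).lazy
  lf.include?("foo") # => true
  lf.include?("baz") # => false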
+
+ # clone handled by initialize_copy
+
+ # def [](item)
+ # end
+
+ # Returns a string representing the LazyFrame.
+ #
+ # @return [String]
+ def to_s
+ <<~EOS
+ naive plan: (run LazyFrame#describe_optimized_plan to see the optimized plan)
+
+ #{describe_plan}
+ EOS
+ end
+
+ # Write the logical plan of this LazyFrame to a file or string in JSON format.
+ #
+ # @param file [String]
+ # File path to which the result should be written.
+ #
+ # @return [nil]
+ def write_json(file)
+ if Utils.pathlike?(file)
+ file = Utils.normalize_filepath(file)
+ end
+ _ldf.write_json(file)
+ nil
+ end
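Since `write_json` serializes the logical plan and `read_json` reconstructs it, the two round-trip a query; a minimal sketch (the file path is illustrative):

  lf = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy.select("a")
  lf.write_json("plan.json") # persist the logical plan
  Polars::LazyFrame.read_json("plan.json").collect # rebuild and execute the same plan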
+
+ # Offers a structured way to apply a sequence of user-defined functions (UDFs).
+ #
+ # @param func [Object]
+ # Callable; will receive the frame as the first parameter,
+ # followed by any given args/kwargs.
+ # @param args [Object]
+ # Arguments to pass to the UDF.
+ # @param kwargs [Object]
+ # Keyword arguments to pass to the UDF.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # cast_str_to_int = lambda do |data, col_name:|
+ # data.with_column(Polars.col(col_name).cast(:i64))
+ # end
+ #
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]}).lazy
+ # df.pipe(cast_str_to_int, col_name: "b").collect
+ # # =>
+ # # shape: (4, 2)
+ # # ┌─────┬─────┐
+ # # │ a   ┆ b   │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ 1   ┆ 10  │
+ # # │ 2   ┆ 20  │
+ # # │ 3   ┆ 30  │
+ # # │ 4   ┆ 40  │
+ # # └─────┴─────┘
+ def pipe(func, *args, **kwargs, &block)
+ func.call(self, *args, **kwargs, &block)
+ end
+
+ # Create a string representation of the unoptimized query plan.
+ #
+ # @return [String]
+ def describe_plan
+ _ldf.describe_plan
+ end
+
+ # Create a string representation of the optimized query plan.
+ #
+ # @return [String]
+ def describe_optimized_plan(
+ type_coercion: true,
+ predicate_pushdown: true,
+ projection_pushdown: true,
+ simplify_expression: true,
+ slice_pushdown: true,
+ common_subplan_elimination: true,
+ comm_subexpr_elim: true,
+ allow_streaming: false
+ )
+ ldf = _ldf.optimization_toggle(
+ type_coercion,
+ predicate_pushdown,
+ projection_pushdown,
+ simplify_expression,
+ slice_pushdown,
+ common_subplan_elimination,
+ comm_subexpr_elim,
+ allow_streaming,
+ false
+ )
+
+ ldf.describe_optimized_plan
+ end
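A sketch contrasting the two plan descriptions on a simple query (column names are hypothetical):

  lf = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [4, 5, 6]}).lazy
  query = lf.filter(Polars.col("a") > 1).select("b")
  puts query.describe_plan # unoptimized logical plan
  puts query.describe_optimized_plan # same plan after pushdown optimizations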
+
+ # def show_graph
+ # end
+
+ # Sort the DataFrame.
+ #
+ # Sorting can be done by:
+ #
+ # - A single column name
+ # - An expression
+ # - Multiple expressions
+ #
+ # @param by [Object]
+ # Column(s) or expression(s) to sort by.
+ # @param reverse [Boolean]
+ # Sort in descending order.
+ # @param nulls_last [Boolean]
+ # Place null values last.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "foo" => [1, 2, 3],
+ # "bar" => [6.0, 7.0, 8.0],
+ # "ham" => ["a", "b", "c"]
+ # }
+ # ).lazy
+ # df.sort("foo", reverse: true).collect
+ # # =>
+ # # shape: (3, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ bar ┆ ham │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ f64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ 3   ┆ 8.0 ┆ c   │
+ # # │ 2   ┆ 7.0 ┆ b   │
+ # # │ 1   ┆ 6.0 ┆ a   │
+ # # └─────┴─────┴─────┘
+ def sort(by, *more_by, reverse: false, nulls_last: false, maintain_order: false, multithreaded: true)
+ if by.is_a?(::String) && more_by.empty?
+ return _from_rbldf(
+ _ldf.sort(
+ by, reverse, nulls_last, maintain_order, multithreaded
+ )
+ )
+ end
+
+ by = Utils.parse_into_list_of_expressions(by, *more_by)
+ reverse = Utils.extend_bool(reverse, by.length, "reverse", "by")
+ nulls_last = Utils.extend_bool(nulls_last, by.length, "nulls_last", "by")
+ _from_rbldf(
+ _ldf.sort_by_exprs(
+ by, reverse, nulls_last, maintain_order, multithreaded
+ )
+ )
+ end
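When several sort keys are given, `reverse` and `nulls_last` are broadcast to the number of keys via `Utils.extend_bool` above, so a per-key array should also be accepted; a sketch under that assumption:

  lf = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [6.0, 7.0, 8.0]}).lazy
  # sort by "foo" ascending, then "bar" descending
  lf.sort("foo", "bar", reverse: [false, true]).collect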
+
+ # def profile
+ # end
+
+ # Collect into a DataFrame.
+ #
+ # Note: use {#fetch} if you want to run your query on the first `n` rows
+ # only. This can be a huge time saver in debugging queries.
+ #
+ # @param type_coercion [Boolean]
+ # Do type coercion optimization.
+ # @param predicate_pushdown [Boolean]
+ # Do predicate pushdown optimization.
+ # @param projection_pushdown [Boolean]
+ # Do projection pushdown optimization.
+ # @param simplify_expression [Boolean]
+ # Run simplify expressions optimization.
+ # @param string_cache [Boolean]
+ # This argument is deprecated. Please set the string cache globally.
+ # The argument will be ignored.
+ # @param no_optimization [Boolean]
+ # Turn off (certain) optimizations.
+ # @param slice_pushdown [Boolean]
+ # Slice pushdown optimization.
+ # @param common_subplan_elimination [Boolean]
+ # Will try to cache branching subplans that occur on self-joins or unions.
+ # @param allow_streaming [Boolean]
+ # Run parts of the query in a streaming fashion (this is in an alpha state).
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => ["a", "b", "a", "b", "b", "c"],
+ # "b" => [1, 2, 3, 4, 5, 6],
+ # "c" => [6, 5, 4, 3, 2, 1]
+ # }
+ # ).lazy
+ # df.group_by("a", maintain_order: true).agg(Polars.all.sum).collect
+ # # =>
+ # # shape: (3, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ a   ┆ b   ┆ c   │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ i64 │
+ # # ╞═════╪═════╪═════╡
+ # # │ a   ┆ 4   ┆ 10  │
+ # # │ b   ┆ 11  ┆ 10  │
+ # # │ c   ┆ 6   ┆ 1   │
+ # # └─────┴─────┴─────┘
+ def collect(
+ type_coercion: true,
+ predicate_pushdown: true,
+ projection_pushdown: true,
+ simplify_expression: true,
+ string_cache: false,
+ no_optimization: false,
+ slice_pushdown: true,
+ common_subplan_elimination: true,
+ comm_subexpr_elim: true,
+ allow_streaming: false,
+ _eager: false
+ )
+ if no_optimization
+ predicate_pushdown = false
+ projection_pushdown = false
+ slice_pushdown = false
+ common_subplan_elimination = false
+ comm_subexpr_elim = false
+ end
+
+ if allow_streaming
+ common_subplan_elimination = false
+ end
+
+ ldf = _ldf.optimization_toggle(
+ type_coercion,
+ predicate_pushdown,
+ projection_pushdown,
+ simplify_expression,
+ slice_pushdown,
+ common_subplan_elimination,
+ comm_subexpr_elim,
+ allow_streaming,
+ _eager
+ )
+ Utils.wrap_df(ldf.collect)
+ end
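Since `no_optimization: true` merely switches the pushdown and caching flags off (as the body above shows), it is handy when checking whether an optimization changes a result; a minimal sketch:

  lf = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy.filter(Polars.col("a") > 1)
  lf.collect(no_optimization: true) # run the raw, unoptimized plan
  lf.collect(allow_streaming: true) # run parts of the plan in streaming mode (alpha)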
+
+ # Evaluate the query in streaming mode and write to a Parquet file.
+ #
+ # This allows streaming results that are larger than RAM to be written to disk.
+ #
+ # @param path [String]
+ # File path to which the file should be written.
+ # @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
+ # Choose "zstd" for good compression performance.
+ # Choose "lz4" for fast compression/decompression.
+ # Choose "snappy" for more backwards compatibility guarantees
+ # when you deal with older parquet readers.
+ # @param compression_level [Integer]
+ # The level of compression to use. Higher compression means smaller files on
+ # disk.
+ #
+ # - "gzip" : min-level: 0, max-level: 10.
+ # - "brotli" : min-level: 0, max-level: 11.
+ # - "zstd" : min-level: 1, max-level: 22.
+ # @param statistics [Boolean, "full", Hash]
+ # Write statistics to the parquet headers. This requires extra compute. Pass
+ # `"full"` to enable all statistics, or a Hash to toggle individual ones.
+ # @param row_group_size [Integer]
+ # Size of the row groups in number of rows.
+ # If `nil` (default), the chunks of the `DataFrame` are
+ # used. Writing in smaller chunks may reduce memory pressure and improve
+ # writing speeds.
+ # @param data_pagesize_limit [Integer]
+ # Size limit of individual data pages.
+ # If not set, defaults to 1024 * 1024 bytes.
+ # @param maintain_order [Boolean]
+ # Maintain the order in which data is processed.
+ # Setting this to `false` will be slightly faster.
+ # @param type_coercion [Boolean]
+ # Do type coercion optimization.
+ # @param predicate_pushdown [Boolean]
+ # Do predicate pushdown optimization.
+ # @param projection_pushdown [Boolean]
+ # Do projection pushdown optimization.
+ # @param simplify_expression [Boolean]
+ # Run simplify expressions optimization.
+ # @param no_optimization [Boolean]
+ # Turn off (certain) optimizations.
+ # @param slice_pushdown [Boolean]
+ # Slice pushdown optimization.
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ # lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+ # lf.sink_parquet("out.parquet")
+ def sink_parquet(
+ path,
+ compression: "zstd",
+ compression_level: nil,
+ statistics: true,
+ row_group_size: nil,
+ data_pagesize_limit: nil,
+ maintain_order: true,
+ type_coercion: true,
+ predicate_pushdown: true,
+ projection_pushdown: true,
+ simplify_expression: true,
+ no_optimization: false,
+ slice_pushdown: true
+ )
+ lf = _set_sink_optimizations(
+ type_coercion: type_coercion,
+ predicate_pushdown: predicate_pushdown,
+ projection_pushdown: projection_pushdown,
+ simplify_expression: simplify_expression,
+ slice_pushdown: slice_pushdown,
+ no_optimization: no_optimization
+ )
+
+ if statistics == true
+ statistics = {
+ min: true,
+ max: true,
+ distinct_count: false,
+ null_count: true
+ }
+ elsif statistics == false
+ statistics = {}
+ elsif statistics == "full"
+ statistics = {
+ min: true,
+ max: true,
+ distinct_count: true,
+ null_count: true
+ }
+ end
+
+ lf.sink_parquet(
+ path,
+ compression,
+ compression_level,
+ statistics,
+ row_group_size,
+ data_pagesize_limit,
+ maintain_order
+ )
+ end
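As the branches above show, `statistics` also accepts `"full"` in addition to `true`/`false`; a sketch with an illustrative input path:

  lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
  lf.sink_parquet("out.parquet", compression: "zstd", statistics: "full")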
+
+ # Evaluate the query in streaming mode and write to an IPC file.
+ #
+ # This allows streaming results that are larger than RAM to be written to disk.
+ #
+ # @param path [String]
+ # File path to which the file should be written.
+ # @param compression ["lz4", "zstd"]
+ # Choose "zstd" for good compression performance.
+ # Choose "lz4" for fast compression/decompression.
+ # @param maintain_order [Boolean]
+ # Maintain the order in which data is processed.
+ # Setting this to `false` will be slightly faster.
+ # @param type_coercion [Boolean]
+ # Do type coercion optimization.
+ # @param predicate_pushdown [Boolean]
+ # Do predicate pushdown optimization.
+ # @param projection_pushdown [Boolean]
+ # Do projection pushdown optimization.
+ # @param simplify_expression [Boolean]
+ # Run simplify expressions optimization.
+ # @param slice_pushdown [Boolean]
+ # Slice pushdown optimization.
+ # @param no_optimization [Boolean]
+ # Turn off (certain) optimizations.
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ # lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+ # lf.sink_ipc("out.arrow")
+ def sink_ipc(
+ path,
+ compression: "zstd",
+ maintain_order: true,
+ type_coercion: true,
+ predicate_pushdown: true,
+ projection_pushdown: true,
+ simplify_expression: true,
+ slice_pushdown: true,
+ no_optimization: false
+ )
+ lf = _set_sink_optimizations(
+ type_coercion: type_coercion,
+ predicate_pushdown: predicate_pushdown,
+ projection_pushdown: projection_pushdown,
+ simplify_expression: simplify_expression,
+ slice_pushdown: slice_pushdown,
+ no_optimization: no_optimization
+ )
+
+ lf.sink_ipc(
+ path,
+ compression,
+ maintain_order
+ )
+ end
+
+ # Evaluate the query in streaming mode and write to a CSV file.
+ #
+ # This allows streaming results that are larger than RAM to be written to disk.
+ #
+ # @param path [String]
+ # File path to which the file should be written.
+ # @param include_bom [Boolean]
+ # Whether to include UTF-8 BOM in the CSV output.
+ # @param include_header [Boolean]
+ # Whether to include header in the CSV output.
+ # @param separator [String]
+ # Separate CSV fields with this symbol.
+ # @param line_terminator [String]
+ # String used to end each row.
+ # @param quote_char [String]
+ # Byte to use as quoting character.
+ # @param batch_size [Integer]
+ # Number of rows that will be processed per thread.
+ # @param datetime_format [String]
+ # A format string, with the specifiers defined by the chrono Rust crate
+ # (https://docs.rs/chrono/latest/chrono/format/strftime/index.html).
+ # If no format is specified, the default fractional-second
+ # precision is inferred from the maximum timeunit found in the frame's
+ # Datetime cols (if any).
+ # @param date_format [String]
+ # A format string, with the specifiers defined by the chrono Rust crate
+ # (https://docs.rs/chrono/latest/chrono/format/strftime/index.html).
+ # @param time_format [String]
+ # A format string, with the specifiers defined by the chrono Rust crate
+ # (https://docs.rs/chrono/latest/chrono/format/strftime/index.html).
+ # @param float_scientific [Boolean]
+ # Whether to use scientific notation for `Float32` and `Float64` values
+ # (`nil` chooses automatically).
+ # @param float_precision [Integer]
+ # Number of decimal places to write, applied to both `Float32` and
+ # `Float64` datatypes.
+ # @param null_value [String]
+ # A string representing null values (defaulting to the empty string).
+ # @param quote_style ["necessary", "always", "non_numeric", "never"]
+ # Determines the quoting strategy used.
+ #
+ # - necessary (default): This puts quotes around fields only when necessary.
+ # They are necessary when fields contain a quote,
+ # separator or record terminator.
+ # Quotes are also necessary when writing an empty record
+ # (which is indistinguishable from a record with one empty field).
+ # - always: This puts quotes around every field.
+ # - never: This never puts quotes around fields, even if that results in
+ # invalid CSV data (e.g. by not quoting strings containing the
+ # separator).
+ # - non_numeric: This puts quotes around all fields that are non-numeric.
+ # Namely, when writing a field that does not parse as a valid float
+ # or integer, quotes will be used even if they aren't strictly
+ # necessary.
+ # @param maintain_order [Boolean]
+ # Maintain the order in which data is processed.
+ # Setting this to `false` will be slightly faster.
+ # @param type_coercion [Boolean]
+ # Do type coercion optimization.
+ # @param predicate_pushdown [Boolean]
+ # Do predicate pushdown optimization.
+ # @param projection_pushdown [Boolean]
+ # Do projection pushdown optimization.
+ # @param simplify_expression [Boolean]
+ # Run simplify expressions optimization.
+ # @param slice_pushdown [Boolean]
+ # Slice pushdown optimization.
+ # @param no_optimization [Boolean]
+ # Turn off (certain) optimizations.
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ # lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+ # lf.sink_csv("out.csv")
+ def sink_csv(
+ path,
+ include_bom: false,
+ include_header: true,
+ separator: ",",
+ line_terminator: "\n",
+ quote_char: '"',
+ batch_size: 1024,
+ datetime_format: nil,
+ date_format: nil,
+ time_format: nil,
+ float_scientific: nil,
+ float_precision: nil,
+ null_value: nil,
+ quote_style: nil,
+ maintain_order: true,
+ type_coercion: true,
+ predicate_pushdown: true,
+ projection_pushdown: true,
+ simplify_expression: true,
+ slice_pushdown: true,
+ no_optimization: false
+ )
+ Utils._check_arg_is_1byte("separator", separator, false)
+ Utils._check_arg_is_1byte("quote_char", quote_char, false)
+
+ lf = _set_sink_optimizations(
+ type_coercion: type_coercion,
+ predicate_pushdown: predicate_pushdown,
+ projection_pushdown: projection_pushdown,
+ simplify_expression: simplify_expression,
+ slice_pushdown: slice_pushdown,
+ no_optimization: no_optimization
+ )
+
+ lf.sink_csv(
+ path,
+ include_bom,
+ include_header,
+ separator.ord,
+ line_terminator,
+ quote_char.ord,
+ batch_size,
+ datetime_format,
+ date_format,
+ time_format,
+ float_scientific,
+ float_precision,
+ null_value,
+ quote_style,
+ maintain_order
+ )
+ end
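A sketch combining several of the formatting options above (all values illustrative):

  lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
  lf.sink_csv(
    "out.csv",
    separator: ";", # must be a single byte, checked above
    null_value: "NA",
    quote_style: "non_numeric"
  )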
+
+ # Evaluate the query in streaming mode and write to an NDJSON file.
+ #
+ # This allows streaming results that are larger than RAM to be written to disk.
+ #
+ # @param path [String]
+ # File path to which the file should be written.
+ # @param maintain_order [Boolean]
+ # Maintain the order in which data is processed.
+ # Setting this to `false` will be slightly faster.
+ # @param type_coercion [Boolean]
+ # Do type coercion optimization.
+ # @param predicate_pushdown [Boolean]
+ # Do predicate pushdown optimization.
+ # @param projection_pushdown [Boolean]
+ # Do projection pushdown optimization.
+ # @param simplify_expression [Boolean]
+ # Run simplify expressions optimization.
+ # @param slice_pushdown [Boolean]
+ # Slice pushdown optimization.
+ # @param no_optimization [Boolean]
+ # Turn off (certain) optimizations.
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ # lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+ # lf.sink_ndjson("out.ndjson")
+ def sink_ndjson(
+ path,
+ maintain_order: true,
+ type_coercion: true,
+ predicate_pushdown: true,
+ projection_pushdown: true,
+ simplify_expression: true,
+ slice_pushdown: true,
+ no_optimization: false
+ )
+ lf = _set_sink_optimizations(
+ type_coercion: type_coercion,
+ predicate_pushdown: predicate_pushdown,
+ projection_pushdown: projection_pushdown,
+ simplify_expression: simplify_expression,
+ slice_pushdown: slice_pushdown,
+ no_optimization: no_optimization
+ )
+
+ lf.sink_json(path, maintain_order)
+ end
+
+ # @private
+ def _set_sink_optimizations(
+ type_coercion: true,
+ predicate_pushdown: true,
+ projection_pushdown: true,
+ simplify_expression: true,
+ slice_pushdown: true,
+ no_optimization: false
+ )
+ if no_optimization
+ predicate_pushdown = false
+ projection_pushdown = false
+ slice_pushdown = false
+ end
+
+ _ldf.optimization_toggle(
+ type_coercion,
+ predicate_pushdown,
+ projection_pushdown,
+ simplify_expression,
+ slice_pushdown,
+ false, # common_subplan_elimination (argument order as in `collect` above)
+ false, # comm_subexpr_elim
+ true, # allow_streaming
+ false # _eager
+ )
+ end
+
+ # Collect a small number of rows for debugging purposes.
+ #
+ # Fetch is like a {#collect} operation, but it overwrites the number of rows
+ # read by every scan operation. This is a utility that helps debug a query on a
+ # smaller number of rows.
+ #
+ # Note that fetch does not guarantee the final number of rows in the
+ # DataFrame. Filters, join operations, and fewer rows being available in the
+ # scanned file can all influence the final number of rows.
+ #
+ # @param n_rows [Integer]
+ # Collect n_rows from the data sources.
+ # @param type_coercion [Boolean]
+ # Run type coercion optimization.
+ # @param predicate_pushdown [Boolean]
+ # Run predicate pushdown optimization.
+ # @param projection_pushdown [Boolean]
+ # Run projection pushdown optimization.
+ # @param simplify_expression [Boolean]
+ # Run simplify expressions optimization.
+ # @param string_cache [Boolean]
+ # This argument is deprecated. Please set the string cache globally.
+ # The argument will be ignored.
+ # @param no_optimization [Boolean]
+ # Turn off optimizations.
+ # @param slice_pushdown [Boolean]
+ # Slice pushdown optimization.
+ # @param common_subplan_elimination [Boolean]
+ # Will try to cache branching subplans that occur on self-joins or unions.
+ # @param allow_streaming [Boolean]
+ # Run parts of the query in a streaming fashion (this is in an alpha state).
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => ["a", "b", "a", "b", "b", "c"],
+ # "b" => [1, 2, 3, 4, 5, 6],
+ # "c" => [6, 5, 4, 3, 2, 1]
+ # }
+ # ).lazy
+ # df.group_by("a", maintain_order: true).agg(Polars.all.sum).fetch(2)
+ # # =>
+ # # shape: (2, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ a   ┆ b   ┆ c   │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ i64 │
+ # # ╞═════╪═════╪═════╡
+ # # │ a   ┆ 1   ┆ 6   │
+ # # │ b   ┆ 2   ┆ 5   │
+ # # └─────┴─────┴─────┘
+ def fetch(
+ n_rows = 500,
+ type_coercion: true,
+ predicate_pushdown: true,
+ projection_pushdown: true,
+ simplify_expression: true,
+ string_cache: false,
+ no_optimization: false,
+ slice_pushdown: true,
+ common_subplan_elimination: true,
+ comm_subexpr_elim: true,
+ allow_streaming: false
+ )
+ if no_optimization
+ predicate_pushdown = false
+ projection_pushdown = false
+ slice_pushdown = false
+ common_subplan_elimination = false
+ end
+
+ ldf = _ldf.optimization_toggle(
+ type_coercion,
+ predicate_pushdown,
+ projection_pushdown,
+ simplify_expression,
+ slice_pushdown,
+ common_subplan_elimination,
+ comm_subexpr_elim,
+ allow_streaming,
+ false
+ )
+ Utils.wrap_df(ldf.fetch(n_rows))
+ end
+
+ # Return lazy representation, i.e. itself.
+ #
+ # Useful for writing code that expects either a `DataFrame` or
+ # `LazyFrame`.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => [nil, 2, 3, 4],
+ # "b" => [0.5, nil, 2.5, 13],
+ # "c" => [true, true, false, nil]
+ # }
+ # )
+ # df.lazy
+ def lazy
+ self
+ end
+
+ # Cache the result once the execution of the physical plan hits this node.
+ #
+ # @return [LazyFrame]
+ def cache
+ _from_rbldf(_ldf.cache)
+ end
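A sketch of where `cache` helps: one filtered subplan reused by two downstream queries (column names hypothetical):

  base = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy.filter(Polars.col("a") > 1).cache
  doubled = base.select(Polars.col("a") * 2)
  total = base.select(Polars.col("a").sum)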
+
+ # TODO
+ # def cast
+ # end
+
+ # Create an empty copy of the current LazyFrame.
+ #
+ # The copy has an identical schema but no data.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # lf = Polars::LazyFrame.new(
+ # {
+ # "a" => [nil, 2, 3, 4],
+ # "b" => [0.5, nil, 2.5, 13],
+ # "c" => [true, true, false, nil],
+ # }
+ # )
+ # lf.clear.fetch
+ # # =>
+ # # shape: (0, 3)
+ # # ┌─────┬─────┬──────┐
+ # # │ a   ┆ b   ┆ c    │
+ # # │ --- ┆ --- ┆ ---  │
+ # # │ i64 ┆ f64 ┆ bool │
+ # # ╞═════╪═════╪══════╡
+ # # └─────┴─────┴──────┘
+ #
+ # @example
+ # lf.clear(2).fetch
+ # # =>
+ # # shape: (2, 3)
+ # # ┌──────┬──────┬──────┐
+ # # │ a    ┆ b    ┆ c    │
+ # # │ ---  ┆ ---  ┆ ---  │
+ # # │ i64  ┆ f64  ┆ bool │
+ # # ╞══════╪══════╪══════╡
+ # # │ null ┆ null ┆ null │
+ # # │ null ┆ null ┆ null │
+ # # └──────┴──────┴──────┘
+ def clear(n = 0)
+ DataFrame.new(schema: schema).clear(n).lazy
+ end
+ alias_method :cleared, :clear
+
+ # Filter the rows in the DataFrame based on a predicate expression.
+ #
+ # @param predicate [Object]
+ # Expression that evaluates to a boolean Series.
+ #
+ # @return [LazyFrame]
+ #
+ # @example Filter on one condition:
+ # lf = Polars::DataFrame.new(
+ # {
+ # "foo" => [1, 2, 3],
+ # "bar" => [6, 7, 8],
+ # "ham" => ["a", "b", "c"]
+ # }
+ # ).lazy
+ # lf.filter(Polars.col("foo") < 3).collect
+ # # =>
+ # # shape: (2, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ bar ┆ ham │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ i64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ 1   ┆ 6   ┆ a   │
+ # # │ 2   ┆ 7   ┆ b   │
+ # # └─────┴─────┴─────┘
+ #
+ # @example Filter on multiple conditions:
+ # lf.filter((Polars.col("foo") < 3) & (Polars.col("ham") == "a")).collect
+ # # =>
+ # # shape: (1, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ bar ┆ ham │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ i64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ 1   ┆ 6   ┆ a   │
+ # # └─────┴─────┴─────┘
+ def filter(predicate)
+ _from_rbldf(
+ _ldf.filter(
+ Utils.parse_into_expression(predicate, str_as_lit: false)
+ )
+ )
+ end
+
+ # Select columns from this DataFrame.
+ #
+ # @param exprs [Array]
+ # Column(s) to select, specified as positional arguments.
+ # Accepts expression input. Strings are parsed as column names,
+ # other non-expression inputs are parsed as literals.
+ # @param named_exprs [Hash]
+ # Additional columns to select, specified as keyword arguments.
+ # The columns will be renamed to the keyword used.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "foo" => [1, 2, 3],
+ # "bar" => [6, 7, 8],
+ # "ham" => ["a", "b", "c"],
+ # }
+ # ).lazy
+ # df.select("foo").collect
+ # # =>
+ # # shape: (3, 1)
+ # # ┌─────┐
+ # # │ foo │
+ # # │ --- │
+ # # │ i64 │
+ # # ╞═════╡
+ # # │ 1   │
+ # # │ 2   │
+ # # │ 3   │
+ # # └─────┘
+ #
+ # @example
+ # df.select(["foo", "bar"]).collect
+ # # =>
+ # # shape: (3, 2)
+ # # ┌─────┬─────┐
+ # # │ foo ┆ bar │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ 1   ┆ 6   │
+ # # │ 2   ┆ 7   │
+ # # │ 3   ┆ 8   │
+ # # └─────┴─────┘
+ #
+ # @example
+ # df.select(Polars.col("foo") + 1).collect
+ # # =>
+ # # shape: (3, 1)
+ # # ┌─────┐
+ # # │ foo │
+ # # │ --- │
+ # # │ i64 │
+ # # ╞═════╡
+ # # │ 2   │
+ # # │ 3   │
+ # # │ 4   │
+ # # └─────┘
+ #
+ # @example
+ # df.select([Polars.col("foo") + 1, Polars.col("bar") + 1]).collect
+ # # =>
+ # # shape: (3, 2)
+ # # ┌─────┬─────┐
+ # # │ foo ┆ bar │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ 2   ┆ 7   │
+ # # │ 3   ┆ 8   │
+ # # │ 4   ┆ 9   │
+ # # └─────┴─────┘
+ #
+ # @example
+ # df.select(Polars.when(Polars.col("foo") > 2).then(10).otherwise(0)).collect
+ # # =>
+ # # shape: (3, 1)
+ # # ┌─────────┐
+ # # │ literal │
+ # # │ ---     │
+ # # │ i32     │
+ # # ╞═════════╡
+ # # │ 0       │
+ # # │ 0       │
+ # # │ 10      │
+ # # └─────────┘
+ def select(*exprs, **named_exprs)
+ structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
+
+ rbexprs = Utils.parse_into_list_of_expressions(
+ *exprs, **named_exprs, __structify: structify
+ )
+ _from_rbldf(_ldf.select(rbexprs))
+ end
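As noted in the `named_exprs` doc above, keyword arguments select an expression and rename it to the keyword; a sketch reusing the frame from the examples:

  df.select("foo", bar_plus_one: Polars.col("bar") + 1).collect
  # => columns "foo" and "bar_plus_one"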
+
+ # Start a group by operation.
+ #
+ # @param by [Array]
+ # Column(s) to group by.
+ # @param maintain_order [Boolean]
+ # Make sure that the order of the groups remains consistent. This is more
+ # expensive than a default group by.
+ # @param named_by [Hash]
+ # Additional columns to group by, specified as keyword arguments.
+ # The columns will be renamed to the keyword used.
+ # @return [LazyGroupBy]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => ["a", "b", "a", "b", "b", "c"],
+ # "b" => [1, 2, 3, 4, 5, 6],
+ # "c" => [6, 5, 4, 3, 2, 1]
+ # }
+ # ).lazy
+ # df.group_by("a", maintain_order: true).agg(Polars.col("b").sum).collect
+ # # =>
+ # # shape: (3, 2)
+ # # ┌─────┬─────┐
+ # # │ a   ┆ b   │
+ # # │ --- ┆ --- │
+ # # │ str ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ a   ┆ 4   │
+ # # │ b   ┆ 11  │
+ # # │ c   ┆ 6   │
+ # # └─────┴─────┘
+ def group_by(*by, maintain_order: false, **named_by)
+ exprs = Utils.parse_into_list_of_expressions(*by, **named_by)
+ lgb = _ldf.group_by(exprs, maintain_order)
+ LazyGroupBy.new(lgb)
+ end
+ alias_method :groupby, :group_by
+ alias_method :group, :group_by
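The `named_by` keywords behave like the keyword form of `select`: the grouping expression is renamed to the keyword; a sketch on the frame from the example above:

  df.group_by(half: Polars.col("b") / 2, maintain_order: true).agg(Polars.col("c").sum).collect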
+
+ # Create rolling groups based on a time column.
+ #
+ # Also works for index values of type `:i32` or `:i64`.
+ #
+ # Unlike `group_by_dynamic`, the windows are determined by the individual
+ # values rather than by constant intervals. For constant intervals use
+ # `group_by_dynamic`.
+ #
+ # The `period` and `offset` arguments are created either from a timedelta, or
+ # by using the following string language:
+ #
+ # - 1ns (1 nanosecond)
+ # - 1us (1 microsecond)
+ # - 1ms (1 millisecond)
+ # - 1s (1 second)
+ # - 1m (1 minute)
+ # - 1h (1 hour)
+ # - 1d (1 day)
+ # - 1w (1 week)
+ # - 1mo (1 calendar month)
+ # - 1y (1 calendar year)
+ # - 1i (1 index count)
+ #
+ # Or combine them:
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+ #
+ # In case of a group_by_rolling on an integer column, the windows are defined by:
+ #
+ # - "1i" # length 1
+ # - "10i" # length 10
+ #
+ # @param index_column [Object]
+ # Column used to group based on the time window.
+ # Often of type Date/Datetime.
+ # This column must be sorted in ascending order. If it is not, the output
+ # will not make sense.
+ #
+ # In case of a rolling group by on indices, dtype needs to be one of
+ # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
+ # performance matters use an `:i64` column.
+ # @param period [Object]
+ # Length of the window.
+ # @param offset [Object]
+ # Offset of the window. Default is -period.
+ # @param closed ["right", "left", "both", "none"]
+ # Define whether the temporal window interval is closed or not.
+ # @param by [Object]
+ # Also group by this column/these columns.
+ #
+ # @return [LazyGroupBy]
+ #
+ # @example
+ # dates = [
+ # "2020-01-01 13:45:48",
+ # "2020-01-01 16:42:13",
+ # "2020-01-01 16:45:09",
+ # "2020-01-02 18:12:48",
+ # "2020-01-03 19:45:32",
+ # "2020-01-08 23:16:43"
+ # ]
+ # df = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
+ # Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
+ # )
+ # df.rolling(index_column: "dt", period: "2d").agg(
+ # [
+ # Polars.sum("a").alias("sum_a"),
+ # Polars.min("a").alias("min_a"),
+ # Polars.max("a").alias("max_a")
+ # ]
+ # ).collect
+ # # =>
+ # # shape: (6, 4)
+ # # ┌─────────────────────┬───────┬───────┬───────┐
+ # # │ dt                  ┆ sum_a ┆ min_a ┆ max_a │
+ # # │ ---                 ┆ ---   ┆ ---   ┆ ---   │
+ # # │ datetime[μs]        ┆ i64   ┆ i64   ┆ i64   │
+ # # ╞═════════════════════╪═══════╪═══════╪═══════╡
+ # # │ 2020-01-01 13:45:48 ┆ 3     ┆ 3     ┆ 3     │
+ # # │ 2020-01-01 16:42:13 ┆ 10    ┆ 3     ┆ 7     │
+ # # │ 2020-01-01 16:45:09 ┆ 15    ┆ 3     ┆ 7     │
+ # # │ 2020-01-02 18:12:48 ┆ 24    ┆ 3     ┆ 9     │
+ # # │ 2020-01-03 19:45:32 ┆ 11    ┆ 2     ┆ 9     │
+ # # │ 2020-01-08 23:16:43 ┆ 1     ┆ 1     ┆ 1     │
+ # # └─────────────────────┴───────┴───────┴───────┘
+ def rolling(
+ index_column:,
+ period:,
+ offset: nil,
+ closed: "right",
+ by: nil
+ )
+ index_column = Utils.parse_into_expression(index_column)
+ if offset.nil?
+ offset = Utils.negate_duration_string(Utils.parse_as_duration_string(period))
+ end
+
+ rbexprs_by = (
+ !by.nil? ? Utils.parse_into_list_of_expressions(by) : []
+ )
+ period = Utils.parse_as_duration_string(period)
+ offset = Utils.parse_as_duration_string(offset)
+
+ lgb = _ldf.rolling(index_column, period, offset, closed, rbexprs_by)
+ LazyGroupBy.new(lgb)
+ end
+ alias_method :group_by_rolling, :rolling
+ alias_method :groupby_rolling, :rolling
+
+ # Group based on a time value (or index value of type `:i32`, `:i64`).
+ #
+ # Time windows are calculated and rows are assigned to windows. Unlike a
+ # normal group by, a row can be a member of multiple groups. The time/index
+ # window could be seen as a rolling window, with a window size determined by
+ # dates/times/values instead of slots in the DataFrame.
+ #
+ # A window is defined by:
+ #
+ # - every: interval of the window
+ # - period: length of the window
+ # - offset: offset of the window
+ #
+ # The `every`, `period` and `offset` arguments are created with
+ # the following string language:
+ #
+ # - 1ns (1 nanosecond)
+ # - 1us (1 microsecond)
+ # - 1ms (1 millisecond)
+ # - 1s (1 second)
+ # - 1m (1 minute)
+ # - 1h (1 hour)
+ # - 1d (1 day)
+ # - 1w (1 week)
+ # - 1mo (1 calendar month)
+ # - 1y (1 calendar year)
+ # - 1i (1 index count)
+ #
+ # Or combine them:
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+ #
+ # In case of a group_by_dynamic on an integer column, the windows are defined by:
+ #
+ # - "1i" # length 1
+ # - "10i" # length 10
+ #
+ # @param index_column [Object]
+ # Column used to group based on the time window.
+ # Often of type Date/Datetime.
+ # This column must be sorted in ascending order. If it is not, the output
+ # will not make sense.
+ #
+ # In case of a dynamic group by on indices, dtype needs to be one of
+ # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
+ # performance matters use an `:i64` column.
+ # @param every [Object]
+ # Interval of the window.
+ # @param period [Object]
+ # Length of the window; if nil, it is equal to `every`.
+ # @param offset [Object]
+ # Offset of the window; if nil and `period` is nil, it will be equal to
+ # negative `every`, otherwise to zero.
+ # @param truncate [Boolean]
+ # Truncate the time value to the window lower bound.
+ # @param include_boundaries [Boolean]
+ # Add the lower and upper bound of the window to the "_lower_boundary" and
+ # "_upper_boundary" columns. This will impact performance because it's harder
+ # to parallelize.
+ # @param closed ["right", "left", "both", "none"]
+ # Define whether the temporal window interval is closed or not.
+ # @param by [Object]
+ # Also group by this column/these columns.
+ #
+ # @return [LazyGroupBy]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "time" => Polars.datetime_range(
+ # DateTime.new(2021, 12, 16),
+ # DateTime.new(2021, 12, 16, 3),
+ # "30m",
+ # time_unit: "us",
+ # eager: true
+ # ),
+ # "n" => 0..6
+ # }
+ # )
+ # # =>
+ # # shape: (7, 2)
+ # # ┌─────────────────────┬─────┐
+ # # │ time                ┆ n   │
+ # # │ ---                 ┆ --- │
+ # # │ datetime[μs]        ┆ i64 │
+ # # ╞═════════════════════╪═════╡
+ # # │ 2021-12-16 00:00:00 ┆ 0   │
+ # # │ 2021-12-16 00:30:00 ┆ 1   │
+ # # │ 2021-12-16 01:00:00 ┆ 2   │
+ # # │ 2021-12-16 01:30:00 ┆ 3   │
+ # # │ 2021-12-16 02:00:00 ┆ 4   │
+ # # │ 2021-12-16 02:30:00 ┆ 5   │
+ # # │ 2021-12-16 03:00:00 ┆ 6   │
+ # # └─────────────────────┴─────┘
+ #
+ # @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
+ # df.group_by_dynamic("time", every: "1h", closed: "right").agg(
+ # [
+ # Polars.col("time").min.alias("time_min"),
+ # Polars.col("time").max.alias("time_max")
+ # ]
+ # )
+ # # =>
+ # # shape: (4, 3)
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┐
+ # # │ time                ┆ time_min            ┆ time_max            │
+ # # │ ---                 ┆ ---                 ┆ ---                 │
+ # # │ datetime[μs]        ┆ datetime[μs]        ┆ datetime[μs]        │
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╡
+ # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │
+ # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │
+ # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │
+ # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │
+ # # └─────────────────────┴─────────────────────┴─────────────────────┘
+ #
+ # @example The window boundaries can also be added to the aggregation result.
+ # df.group_by_dynamic(
+ # "time", every: "1h", include_boundaries: true, closed: "right"
+ # ).agg([Polars.col("time").count.alias("time_count")])
+ # # =>
+ # # shape: (4, 4)
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
+ # # │ _lower_boundary     ┆ _upper_boundary     ┆ time                ┆ time_count │
+ # # │ ---                 ┆ ---                 ┆ ---                 ┆ ---        │
+ # # │ datetime[μs]        ┆ datetime[μs]        ┆ datetime[μs]        ┆ u32        │
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
+ # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1          │
+ # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2          │
+ # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2          │
+ # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2          │
+ # # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
+ #
+ # @example When closed="left", should not include right end of interval.
+ # df.group_by_dynamic("time", every: "1h", closed: "left").agg(
+ # [
+ # Polars.col("time").count.alias("time_count"),
+ # Polars.col("time").alias("time_agg_list")
+ # ]
+ # )
+ # # =>
+ # # shape: (4, 3)
+ # # ┌─────────────────────┬────────────┬─────────────────────────────────┐
+ # # │ time                ┆ time_count ┆ time_agg_list                   │
+ # # │ ---                 ┆ ---        ┆ ---                             │
+ # # │ datetime[μs]        ┆ u32        ┆ list[datetime[μs]]              │
+ # # ╞═════════════════════╪════════════╪═════════════════════════════════╡
+ # # │ 2021-12-16 00:00:00 ┆ 2          ┆ [2021-12-16 00:00:00, 2021-12-… │
+ # # │ 2021-12-16 01:00:00 ┆ 2          ┆ [2021-12-16 01:00:00, 2021-12-… │
+ # # │ 2021-12-16 02:00:00 ┆ 2          ┆ [2021-12-16 02:00:00, 2021-12-… │
+ # # │ 2021-12-16 03:00:00 ┆ 1          ┆ [2021-12-16 03:00:00]           │
+ # # └─────────────────────┴────────────┴─────────────────────────────────┘
+ #
+ # @example When closed="both" the time values at the window boundaries belong to 2 groups.
+ # df.group_by_dynamic("time", every: "1h", closed: "both").agg(
+ # [Polars.col("time").count.alias("time_count")]
+ # )
+ # # =>
+ # # shape: (5, 2)
+ # # ┌─────────────────────┬────────────┐
+ # # │ time                ┆ time_count │
+ # # │ ---                 ┆ ---        │
+ # # │ datetime[μs]        ┆ u32        │
+ # # ╞═════════════════════╪════════════╡
+ # # │ 2021-12-15 23:00:00 ┆ 1          │
+ # # │ 2021-12-16 00:00:00 ┆ 3          │
+ # # │ 2021-12-16 01:00:00 ┆ 3          │
+ # # │ 2021-12-16 02:00:00 ┆ 3          │
+ # # │ 2021-12-16 03:00:00 ┆ 1          │
+ # # └─────────────────────┴────────────┘
+ #
+ # @example Dynamic group bys can also be combined with grouping on normal keys.
+ # df = Polars::DataFrame.new(
+ # {
+ # "time" => Polars.datetime_range(
+ # DateTime.new(2021, 12, 16),
+ # DateTime.new(2021, 12, 16, 3),
+ # "30m",
+ # time_unit: "us",
+ # eager: true
+ # ),
+ # "groups" => ["a", "a", "a", "b", "b", "a", "a"]
+ # }
+ # )
+ # df.group_by_dynamic(
+ # "time",
+ # every: "1h",
+ # closed: "both",
+ # by: "groups",
+ # include_boundaries: true
+ # ).agg([Polars.col("time").count.alias("time_count")])
+ # # =>
+ # # shape: (7, 5)
+ # # ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
+ # # │ groups ┆ _lower_boundary     ┆ _upper_boundary     ┆ time                ┆ time_count │
+ # # │ ---    ┆ ---                 ┆ ---                 ┆ ---                 ┆ ---        │
+ # # │ str    ┆ datetime[μs]        ┆ datetime[μs]        ┆ datetime[μs]        ┆ u32        │
+ # # ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
+ # # │ a      ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1          │
+ # # │ a      ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3          │
+ # # │ a      ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1          │
+ # # │ a      ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2          │
+ # # │ a      ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1          │
+ # # │ b      ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2          │
+ # # │ b      ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1          │
+ # # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
+ #
+ # @example Dynamic group by on an index column.
+ # df = Polars::DataFrame.new(
+ # {
+ # "idx" => Polars.arange(0, 6, eager: true),
+ # "A" => ["A", "A", "B", "B", "B", "C"]
+ # }
+ # )
+ # df.group_by_dynamic(
+ # "idx",
+ # every: "2i",
+ # period: "3i",
+ # include_boundaries: true,
+ # closed: "right"
+ # ).agg(Polars.col("A").alias("A_agg_list"))
+ # # =>
+ # # shape: (4, 4)
+ # # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
+ # # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list      │
+ # # │ ---             ┆ ---             ┆ --- ┆ ---             │
+ # # │ i64             ┆ i64             ┆ i64 ┆ list[str]       │
+ # # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
+ # # │ -2              ┆ 1               ┆ -2  ┆ ["A", "A"]      │
+ # # │ 0               ┆ 3               ┆ 0   ┆ ["A", "B", "B"] │
+ # # │ 2               ┆ 5               ┆ 2   ┆ ["B", "B", "C"] │
+ # # │ 4               ┆ 7               ┆ 4   ┆ ["C"]           │
+ # # └─────────────────┴─────────────────┴─────┴─────────────────┘
+ def group_by_dynamic(
+ index_column,
+ every:,
+ period: nil,
+ offset: nil,
+ truncate: nil,
+ include_boundaries: false,
+ closed: "left",
+ label: "left",
+ by: nil,
+ start_by: "window"
+ )
+ if !truncate.nil?
+ label = truncate ? "left" : "datapoint"
+ end
+
+ index_column = Utils.parse_into_expression(index_column, str_as_lit: false)
+ if offset.nil?
+ offset = period.nil? ? "-#{every}" : "0ns"
+ end
+
+ if period.nil?
+ period = every
+ end
+
+ period = Utils.parse_as_duration_string(period)
+ offset = Utils.parse_as_duration_string(offset)
+ every = Utils.parse_as_duration_string(every)
+
+ rbexprs_by = by.nil? ? [] : Utils.parse_into_list_of_expressions(by)
+ lgb = _ldf.group_by_dynamic(
+ index_column,
+ every,
+ period,
+ offset,
+ label,
+ include_boundaries,
+ closed,
+ rbexprs_by,
+ start_by
+ )
+ LazyGroupBy.new(lgb)
+ end
+ alias_method :groupby_dynamic, :group_by_dynamic
+
+ # Perform an asof join.
+ #
+ # This is similar to a left-join except that we match on nearest key rather than
+ # equal keys.
+ #
+ # Both DataFrames must be sorted by the join_asof key.
+ #
+ # For each row in the left DataFrame:
+ #
+ # - A "backward" search selects the last row in the right DataFrame whose 'on' key is less than or equal to the left's key.
+ # - A "forward" search selects the first row in the right DataFrame whose 'on' key is greater than or equal to the left's key.
+ #
+ # The default is "backward".
+ #
+ # @param other [LazyFrame]
+ # Lazy DataFrame to join with.
+ # @param left_on [String]
+ # Join column of the left DataFrame.
+ # @param right_on [String]
+ # Join column of the right DataFrame.
+ # @param on [String]
+ # Join column of both DataFrames. If set, `left_on` and `right_on` should be
+ # nil.
+ # @param by [Object]
+ # Join on these columns before doing asof join.
+ # @param by_left [Object]
+ # Join on these columns before doing asof join.
+ # @param by_right [Object]
+ # Join on these columns before doing asof join.
+ # @param strategy ["backward", "forward"]
+ # Join strategy.
+ # @param suffix [String]
+ # Suffix to append to columns with a duplicate name.
+ # @param tolerance [Object]
+ # Numeric tolerance. When set, the join will only match keys within this
+ # distance. If an asof join is done on columns of dtype
+ # "Date", "Datetime", "Duration" or "Time", you can use the following string
+ # language:
+ #
+ # - 1ns (1 nanosecond)
+ # - 1us (1 microsecond)
+ # - 1ms (1 millisecond)
+ # - 1s (1 second)
+ # - 1m (1 minute)
+ # - 1h (1 hour)
+ # - 1d (1 day)
+ # - 1w (1 week)
+ # - 1mo (1 calendar month)
+ # - 1y (1 calendar year)
+ # - 1i (1 index count)
+ #
+ # Or combine them:
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+ #
+ # @param allow_parallel [Boolean]
+ # Allow the physical plan to optionally evaluate the computation of both
+ # DataFrames up to the join in parallel.
+ # @param force_parallel [Boolean]
+ # Force the physical plan to evaluate the computation of both DataFrames up to
+ # the join in parallel.
+ #
+ # @return [LazyFrame]
+ def join_asof(
+ other,
+ left_on: nil,
+ right_on: nil,
+ on: nil,
+ by_left: nil,
+ by_right: nil,
+ by: nil,
+ strategy: "backward",
+ suffix: "_right",
+ tolerance: nil,
+ allow_parallel: true,
+ force_parallel: false
+ )
+ if !other.is_a?(LazyFrame)
+ raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
+ end
+
+ if on.is_a?(::String)
+ left_on = on
+ right_on = on
+ end
+
+ if left_on.nil? || right_on.nil?
+ raise ArgumentError, "You should pass the column to join on as an argument."
+ end
+
+ if by_left.is_a?(::String) || by_left.is_a?(Expr)
+ by_left_ = [by_left]
+ else
+ by_left_ = by_left
+ end
+
+ if by_right.is_a?(::String) || by_right.is_a?(Expr)
+ by_right_ = [by_right]
+ else
+ by_right_ = by_right
+ end
+
+ if by.is_a?(::String)
+ by_left_ = [by]
+ by_right_ = [by]
+ elsif by.is_a?(::Array)
+ by_left_ = by
+ by_right_ = by
+ end
+
+ tolerance_str = nil
+ tolerance_num = nil
+ if tolerance.is_a?(::String)
+ tolerance_str = tolerance
+ else
+ tolerance_num = tolerance
+ end
+
+ _from_rbldf(
+ _ldf.join_asof(
+ other._ldf,
+ Polars.col(left_on)._rbexpr,
+ Polars.col(right_on)._rbexpr,
+ by_left_,
+ by_right_,
+ allow_parallel,
+ force_parallel,
+ suffix,
+ strategy,
+ tolerance_num,
+ tolerance_str
+ )
+ )
+ end
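join_asof has no example above, so here is a minimal sketch (data and column names are hypothetical); both frames are sorted on the `on` key, as required:

  quotes = Polars::DataFrame.new({"time" => [1, 5, 10], "quote" => [100, 101, 102]}).lazy
  trades = Polars::DataFrame.new({"time" => [2, 6, 11], "size" => [7, 8, 9]}).lazy
  # for each trade, take the most recent quote at or before its time
  trades.join_asof(quotes, on: "time", strategy: "backward").collect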
+
+ # Add a join operation to the Logical Plan.
+ #
+ # @param other [LazyFrame]
+ # Lazy DataFrame to join with.
+ # @param left_on [Object]
+ # Join column of the left DataFrame.
+ # @param right_on [Object]
+ # Join column of the right DataFrame.
+ # @param on [Object]
+ # Join column of both DataFrames. If set, `left_on` and `right_on` should be
+ # nil.
+ # @param how ["inner", "left", "full", "semi", "anti", "cross"]
+ # Join strategy.
+ # @param suffix [String]
+ # Suffix to append to columns with a duplicate name.
+ # @param join_nulls [Boolean]
+ # Join on null values. By default null values will never produce matches.
+ # @param allow_parallel [Boolean]
+ # Allow the physical plan to optionally evaluate the computation of both
+ # DataFrames up to the join in parallel.
+ # @param force_parallel [Boolean]
+ # Force the physical plan to evaluate the computation of both DataFrames up to
+ # the join in parallel.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "foo" => [1, 2, 3],
+ # "bar" => [6.0, 7.0, 8.0],
+ # "ham" => ["a", "b", "c"]
+ # }
+ # ).lazy
+ # other_df = Polars::DataFrame.new(
+ # {
+ # "apple" => ["x", "y", "z"],
+ # "ham" => ["a", "b", "d"]
+ # }
+ # ).lazy
+ # df.join(other_df, on: "ham").collect
+ # # =>
+ # # shape: (2, 4)
+ # # ┌─────┬─────┬─────┬───────┐
+ # # │ foo ┆ bar ┆ ham ┆ apple │
+ # # │ --- ┆ --- ┆ --- ┆ ---   │
+ # # │ i64 ┆ f64 ┆ str ┆ str   │
+ # # ╞═════╪═════╪═════╪═══════╡
+ # # │ 1   ┆ 6.0 ┆ a   ┆ x     │
+ # # │ 2   ┆ 7.0 ┆ b   ┆ y     │
+ # # └─────┴─────┴─────┴───────┘
+ #
+ # @example
+ # df.join(other_df, on: "ham", how: "full").collect
+ # # =>
+ # # shape: (4, 5)
+ # # ┌──────┬──────┬──────┬───────┬───────────┐
+ # # │ foo  ┆ bar  ┆ ham  ┆ apple ┆ ham_right │
+ # # │ ---  ┆ ---  ┆ ---  ┆ ---   ┆ ---       │
+ # # │ i64  ┆ f64  ┆ str  ┆ str   ┆ str       │
+ # # ╞══════╪══════╪══════╪═══════╪═══════════╡
+ # # │ 1    ┆ 6.0  ┆ a    ┆ x     ┆ a         │
+ # # │ 2    ┆ 7.0  ┆ b    ┆ y     ┆ b         │
+ # # │ null ┆ null ┆ null ┆ z     ┆ d         │
+ # # │ 3    ┆ 8.0  ┆ c    ┆ null  ┆ null      │
+ # # └──────┴──────┴──────┴───────┴───────────┘
+ #
+ # @example
+ # df.join(other_df, on: "ham", how: "left").collect
+ # # =>
+ # # shape: (3, 4)
+ # # ┌─────┬─────┬─────┬───────┐
+ # # │ foo ┆ bar ┆ ham ┆ apple │
+ # # │ --- ┆ --- ┆ --- ┆ ---   │
+ # # │ i64 ┆ f64 ┆ str ┆ str   │
+ # # ╞═════╪═════╪═════╪═══════╡
+ # # │ 1   ┆ 6.0 ┆ a   ┆ x     │
+ # # │ 2   ┆ 7.0 ┆ b   ┆ y     │
+ # # │ 3   ┆ 8.0 ┆ c   ┆ null  │
+ # # └─────┴─────┴─────┴───────┘
+ #
+ # @example
+ # df.join(other_df, on: "ham", how: "semi").collect
+ # # =>
+ # # shape: (2, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ bar ┆ ham │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ f64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ 1   ┆ 6.0 ┆ a   │
+ # # │ 2   ┆ 7.0 ┆ b   │
+ # # └─────┴─────┴─────┘
+ #
+ # @example
+ # df.join(other_df, on: "ham", how: "anti").collect
+ # # =>
+ # # shape: (1, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ bar ┆ ham │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ f64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ 3   ┆ 8.0 ┆ c   │
+ # # └─────┴─────┴─────┘
+ def join(
+ other,
+ left_on: nil,
+ right_on: nil,
+ on: nil,
+ how: "inner",
+ suffix: "_right",
+ join_nulls: false,
+ allow_parallel: true,
+ force_parallel: false
+ )
+ if !other.is_a?(LazyFrame)
+ raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
+ end
+
+ if how == "outer"
+ how = "full"
+ elsif how == "cross"
+ return _from_rbldf(
+ _ldf.join(
1722
+ other._ldf, [], [], allow_parallel, force_parallel, join_nulls, how, suffix
1723
+ )
1724
+ )
1725
+ end
1726
+
1727
+ if !on.nil?
1728
+ rbexprs = Utils.parse_into_list_of_expressions(on)
1729
+ rbexprs_left = rbexprs
1730
+ rbexprs_right = rbexprs
1731
+ elsif !left_on.nil? && !right_on.nil?
1732
+ rbexprs_left = Utils.parse_into_list_of_expressions(left_on)
1733
+ rbexprs_right = Utils.parse_into_list_of_expressions(right_on)
1734
+ else
1735
+ raise ArgumentError, "must specify `on` OR `left_on` and `right_on`"
1736
+ end
1737
+
1738
+ _from_rbldf(
1739
+ self._ldf.join(
1740
+ other._ldf,
1741
+ rbexprs_left,
1742
+ rbexprs_right,
1743
+ allow_parallel,
1744
+ force_parallel,
1745
+ join_nulls,
1746
+ how,
1747
+ suffix,
1748
+ )
1749
+ )
1750
+ end
1751
+
1752
+ # Add or overwrite multiple columns in a DataFrame.
1753
+ #
1754
+ # @param exprs [Object]
1755
+ # List of expressions that evaluate to columns.
+ # @param named_exprs [Hash]
+ # Named expressions to add as columns, passed as keyword arguments.
1756
+ #
1757
+ # @return [LazyFrame]
1758
+ #
1759
+ # @example
1760
+ # ldf = Polars::DataFrame.new(
1761
+ # {
1762
+ # "a" => [1, 2, 3, 4],
1763
+ # "b" => [0.5, 4, 10, 13],
1764
+ # "c" => [true, true, false, true]
1765
+ # }
1766
+ # ).lazy
1767
+ # ldf.with_columns(
1768
+ # [
1769
+ # (Polars.col("a") ** 2).alias("a^2"),
1770
+ # (Polars.col("b") / 2).alias("b/2"),
1771
+ # (Polars.col("c").is_not).alias("not c")
1772
+ # ]
1773
+ # ).collect
1774
+ # # =>
1775
+ # # shape: (4, 6)
1776
+ # # ┌─────┬──────┬───────┬─────┬──────┬───────┐
1777
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
1778
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1779
+ # # │ i64 ┆ f64 ┆ bool ┆ i64 ┆ f64 ┆ bool │
1780
+ # # ╞═════╪══════╪═══════╪═════╪══════╪═══════╡
1781
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1 ┆ 0.25 ┆ false │
1782
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4 ┆ 2.0 ┆ false │
1783
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9 ┆ 5.0 ┆ true │
1784
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16 ┆ 6.5 ┆ false │
1785
+ # # └─────┴──────┴───────┴─────┴──────┴───────┘
1786
+ def with_columns(*exprs, **named_exprs)
1787
+ structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
1788
+
1789
+ rbexprs = Utils.parse_into_list_of_expressions(*exprs, **named_exprs, __structify: structify)
1790
+
1791
+ _from_rbldf(_ldf.with_columns(rbexprs))
1792
+ end
1793
+
1794
+ # Add an external context to the computation graph.
1795
+ #
1796
+ # This allows expressions to also access columns from DataFrames
1797
+ # that are not part of this one.
1798
+ #
1799
+ # @param other [Object]
1800
+ # Lazy DataFrame(s) to add as external context.
1801
+ #
1802
+ # @return [LazyFrame]
1803
+ #
1804
+ # @example
1805
+ # df_a = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => ["a", "c", nil]}).lazy
1806
+ # df_other = Polars::DataFrame.new({"c" => ["foo", "ham"]})
1807
+ # (
1808
+ # df_a.with_context(df_other.lazy).select(
1809
+ # [Polars.col("b") + Polars.col("c").first]
1810
+ # )
1811
+ # ).collect
1812
+ # # =>
1813
+ # # shape: (3, 1)
1814
+ # # ┌──────┐
1815
+ # # │ b │
1816
+ # # │ --- │
1817
+ # # │ str │
1818
+ # # ╞══════╡
1819
+ # # │ afoo │
1820
+ # # │ cfoo │
1821
+ # # │ null │
1822
+ # # └──────┘
1823
+ def with_context(other)
1824
+ if !other.is_a?(::Array)
1825
+ other = [other]
1826
+ end
1827
+
1828
+ _from_rbldf(_ldf.with_context(other.map(&:_ldf)))
1829
+ end
1830
+
1831
+ # Add or overwrite a column in a DataFrame.
1832
+ #
1833
+ # @param column [Object]
1834
+ # Expression that evaluates to column or a Series to use.
1835
+ #
1836
+ # @return [LazyFrame]
1837
+ #
1838
+ # @example
1839
+ # df = Polars::DataFrame.new(
1840
+ # {
1841
+ # "a" => [1, 3, 5],
1842
+ # "b" => [2, 4, 6]
1843
+ # }
1844
+ # ).lazy
1845
+ # df.with_column((Polars.col("b") ** 2).alias("b_squared")).collect
1846
+ # # =>
1847
+ # # shape: (3, 3)
1848
+ # # ┌─────┬─────┬───────────┐
1849
+ # # │ a ┆ b ┆ b_squared │
1850
+ # # │ --- ┆ --- ┆ --- │
1851
+ # # │ i64 ┆ i64 ┆ i64 │
1852
+ # # ╞═════╪═════╪═══════════╡
1853
+ # # │ 1 ┆ 2 ┆ 4 │
1854
+ # # │ 3 ┆ 4 ┆ 16 │
1855
+ # # │ 5 ┆ 6 ┆ 36 │
1856
+ # # └─────┴─────┴───────────┘
1857
+ #
1858
+ # @example
1859
+ # df.with_column(Polars.col("a") ** 2).collect
1860
+ # # =>
1861
+ # # shape: (3, 2)
1862
+ # # ┌─────┬─────┐
1863
+ # # │ a ┆ b │
1864
+ # # │ --- ┆ --- │
1865
+ # # │ i64 ┆ i64 │
1866
+ # # ╞═════╪═════╡
1867
+ # # │ 1 ┆ 2 │
1868
+ # # │ 9 ┆ 4 │
1869
+ # # │ 25 ┆ 6 │
1870
+ # # └─────┴─────┘
1871
+ def with_column(column)
1872
+ with_columns([column])
1873
+ end
1874
+
1875
+ # Remove one or multiple columns from a DataFrame.
1876
+ #
1877
+ # @param columns [Object]
1878
+ # - Name of the column that should be removed.
1879
+ # - List of column names.
1880
+ #
1881
+ # @return [LazyFrame]
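+ #
+ # A minimal sketch (input frame and expected output are illustrative
+ # assumptions, not taken from upstream docs):
+ #
+ # @example
+ # lf = Polars::DataFrame.new({"foo" => [1, 2], "bar" => ["a", "b"]}).lazy
+ # lf.drop("bar").collect.columns
+ # # => ["foo"]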
1882
+ def drop(*columns)
1883
+ drop_cols = Utils._expand_selectors(self, *columns)
1884
+ _from_rbldf(_ldf.drop(drop_cols))
1885
+ end
1886
+
1887
+ # Rename column names.
1888
+ #
1889
+ # @param mapping [Hash]
1890
+ # Key-value pairs that map from old name to new name.
1891
+ #
1892
+ # @return [LazyFrame]
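+ #
+ # A minimal sketch (illustrative frame; the resulting column order is
+ # assumed to be preserved):
+ #
+ # @example
+ # lf = Polars::DataFrame.new({"a" => [1, 2], "b" => [3, 4]}).lazy
+ # lf.rename({"a" => "apple"}).collect.columns
+ # # => ["apple", "b"]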
1893
+ def rename(mapping)
1894
+ existing = mapping.keys
1895
+ _new = mapping.values
1896
+ _from_rbldf(_ldf.rename(existing, _new))
1897
+ end
1898
+
1899
+ # Reverse the DataFrame.
1900
+ #
1901
+ # @return [LazyFrame]
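+ #
+ # A minimal sketch (illustrative input; the output follows from reversing
+ # the row order):
+ #
+ # @example
+ # lf = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy
+ # lf.reverse.collect["a"].to_a
+ # # => [3, 2, 1]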
1902
+ def reverse
1903
+ _from_rbldf(_ldf.reverse)
1904
+ end
1905
+
1906
+ # Shift the values by a given period.
1907
+ #
1908
+ # @param n [Integer]
1909
+ # Number of places to shift (may be negative).
1910
+ # @param fill_value [Object]
1911
+ # Fill the resulting null values with this value.
1912
+ #
1913
+ # @return [LazyFrame]
1914
+ #
1915
+ # @example
1916
+ # df = Polars::DataFrame.new(
1917
+ # {
1918
+ # "a" => [1, 3, 5],
1919
+ # "b" => [2, 4, 6]
1920
+ # }
1921
+ # ).lazy
1922
+ # df.shift(1).collect
1923
+ # # =>
1924
+ # # shape: (3, 2)
1925
+ # # ┌──────┬──────┐
1926
+ # # │ a ┆ b │
1927
+ # # │ --- ┆ --- │
1928
+ # # │ i64 ┆ i64 │
1929
+ # # ╞══════╪══════╡
1930
+ # # │ null ┆ null │
1931
+ # # │ 1 ┆ 2 │
1932
+ # # │ 3 ┆ 4 │
1933
+ # # └──────┴──────┘
1934
+ #
1935
+ # @example
1936
+ # df.shift(-1).collect
1937
+ # # =>
1938
+ # # shape: (3, 2)
1939
+ # # ┌──────┬──────┐
1940
+ # # │ a ┆ b │
1941
+ # # │ --- ┆ --- │
1942
+ # # │ i64 ┆ i64 │
1943
+ # # ╞══════╪══════╡
1944
+ # # │ 3 ┆ 4 │
1945
+ # # │ 5 ┆ 6 │
1946
+ # # │ null ┆ null │
1947
+ # # └──────┴──────┘
1948
+ def shift(n, fill_value: nil)
1949
+ if !fill_value.nil?
1950
+ fill_value = Utils.parse_into_expression(fill_value, str_as_lit: true)
1951
+ end
1952
+ n = Utils.parse_into_expression(n)
1953
+ _from_rbldf(_ldf.shift(n, fill_value))
1954
+ end
1955
+
1956
+ # Shift the values by a given period and fill the resulting null values.
1957
+ #
1958
+ # @param periods [Integer]
1959
+ # Number of places to shift (may be negative).
1960
+ # @param fill_value [Object]
1961
+ # Fill `nil` values with the result of this expression.
1962
+ #
1963
+ # @return [LazyFrame]
1964
+ #
1965
+ # @example
1966
+ # df = Polars::DataFrame.new(
1967
+ # {
1968
+ # "a" => [1, 3, 5],
1969
+ # "b" => [2, 4, 6]
1970
+ # }
1971
+ # ).lazy
1972
+ # df.shift_and_fill(1, 0).collect
1973
+ # # =>
1974
+ # # shape: (3, 2)
1975
+ # # ┌─────┬─────┐
1976
+ # # │ a ┆ b │
1977
+ # # │ --- ┆ --- │
1978
+ # # │ i64 ┆ i64 │
1979
+ # # ╞═════╪═════╡
1980
+ # # │ 0 ┆ 0 │
1981
+ # # │ 1 ┆ 2 │
1982
+ # # │ 3 ┆ 4 │
1983
+ # # └─────┴─────┘
1984
+ #
1985
+ # @example
1986
+ # df.shift_and_fill(-1, 0).collect
1987
+ # # =>
1988
+ # # shape: (3, 2)
1989
+ # # ┌─────┬─────┐
1990
+ # # │ a ┆ b │
1991
+ # # │ --- ┆ --- │
1992
+ # # │ i64 ┆ i64 │
1993
+ # # ╞═════╪═════╡
1994
+ # # │ 3 ┆ 4 │
1995
+ # # │ 5 ┆ 6 │
1996
+ # # │ 0 ┆ 0 │
1997
+ # # └─────┴─────┘
1998
+ def shift_and_fill(periods, fill_value)
1999
+ shift(periods, fill_value: fill_value)
2000
+ end
2001
+
2002
+ # Get a slice of this DataFrame.
2003
+ #
2004
+ # @param offset [Integer]
2005
+ # Start index. Negative indexing is supported.
2006
+ # @param length [Integer]
2007
+ # Length of the slice. If set to `nil`, all rows starting at the offset
2008
+ # will be selected.
2009
+ #
2010
+ # @return [LazyFrame]
2011
+ #
2012
+ # @example
2013
+ # df = Polars::DataFrame.new(
2014
+ # {
2015
+ # "a" => ["x", "y", "z"],
2016
+ # "b" => [1, 3, 5],
2017
+ # "c" => [2, 4, 6]
2018
+ # }
2019
+ # ).lazy
2020
+ # df.slice(1, 2).collect
2021
+ # # =>
2022
+ # # shape: (2, 3)
2023
+ # # ┌─────┬─────┬─────┐
2024
+ # # │ a ┆ b ┆ c │
2025
+ # # │ --- ┆ --- ┆ --- │
2026
+ # # │ str ┆ i64 ┆ i64 │
2027
+ # # ╞═════╪═════╪═════╡
2028
+ # # │ y ┆ 3 ┆ 4 │
2029
+ # # │ z ┆ 5 ┆ 6 │
2030
+ # # └─────┴─────┴─────┘
2031
+ def slice(offset, length = nil)
2032
+ if length && length < 0
2033
+ raise ArgumentError, "Negative slice lengths (#{length}) are invalid for LazyFrame"
2034
+ end
2035
+ _from_rbldf(_ldf.slice(offset, length))
2036
+ end
2037
+
2038
+ # Get the first `n` rows.
2039
+ #
2040
+ # Alias for {#head}.
2041
+ #
2042
+ # @param n [Integer]
2043
+ # Number of rows to return.
2044
+ #
2045
+ # @return [LazyFrame]
2046
+ #
2047
+ # @note
2048
+ # Consider using the {#fetch} operation if you only want to test your
2049
+ # query. The {#fetch} operation will load the first `n` rows at the scan
2050
+ # level, whereas {#head}/{#limit} are applied at the end.
2051
+ def limit(n = 5)
2052
+ head(n)
2053
+ end
2054
+
2055
+ # Get the first `n` rows.
2056
+ #
2057
+ # @param n [Integer]
2058
+ # Number of rows to return.
2059
+ #
2060
+ # @return [LazyFrame]
2061
+ #
2062
+ # @note
2063
+ # Consider using the {#fetch} operation if you only want to test your
2064
+ # query. The {#fetch} operation will load the first `n` rows at the scan
2065
+ # level, whereas {#head}/{#limit} are applied at the end.
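+ #
+ # A minimal sketch (illustrative input):
+ #
+ # @example
+ # lf = Polars::DataFrame.new({"a" => [1, 2, 3, 4, 5, 6]}).lazy
+ # lf.head(3).collect["a"].to_a
+ # # => [1, 2, 3]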
2066
+ def head(n = 5)
2067
+ slice(0, n)
2068
+ end
2069
+
2070
+ # Get the last `n` rows.
2071
+ #
2072
+ # @param n [Integer]
2073
+ # Number of rows.
2074
+ #
2075
+ # @return [LazyFrame]
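+ #
+ # A minimal sketch (illustrative input):
+ #
+ # @example
+ # lf = Polars::DataFrame.new({"a" => [1, 2, 3, 4, 5, 6]}).lazy
+ # lf.tail(2).collect["a"].to_a
+ # # => [5, 6]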
2076
+ def tail(n = 5)
2077
+ _from_rbldf(_ldf.tail(n))
2078
+ end
2079
+
2080
+ # Get the last row of the DataFrame.
2081
+ #
2082
+ # @return [LazyFrame]
2083
+ def last
2084
+ tail(1)
2085
+ end
2086
+
2087
+ # Get the first row of the DataFrame.
2088
+ #
2089
+ # @return [LazyFrame]
2090
+ def first
2091
+ slice(0, 1)
2092
+ end
2093
+
2094
+ # Add a column at index 0 that counts the rows.
2095
+ #
2096
+ # @param name [String]
2097
+ # Name of the column to add.
2098
+ # @param offset [Integer]
2099
+ # Start the row count at this offset.
2100
+ #
2101
+ # @return [LazyFrame]
2102
+ #
2103
+ # @note
2104
+ # This can have a negative effect on query performance.
2105
+ # This may, for instance, block predicate pushdown optimization.
2106
+ #
2107
+ # @example
2108
+ # df = Polars::DataFrame.new(
2109
+ # {
2110
+ # "a" => [1, 3, 5],
2111
+ # "b" => [2, 4, 6]
2112
+ # }
2113
+ # ).lazy
2114
+ # df.with_row_index.collect
2115
+ # # =>
2116
+ # # shape: (3, 3)
2117
+ # # ┌───────┬─────┬─────┐
2118
+ # # │ index ┆ a ┆ b │
2119
+ # # │ --- ┆ --- ┆ --- │
2120
+ # # │ u32 ┆ i64 ┆ i64 │
2121
+ # # ╞═══════╪═════╪═════╡
2122
+ # # │ 0 ┆ 1 ┆ 2 │
2123
+ # # │ 1 ┆ 3 ┆ 4 │
2124
+ # # │ 2 ┆ 5 ┆ 6 │
2125
+ # # └───────┴─────┴─────┘
2126
+ def with_row_index(name: "index", offset: 0)
2127
+ _from_rbldf(_ldf.with_row_index(name, offset))
2128
+ end
2129
+ alias_method :with_row_count, :with_row_index
2130
+
2131
+ # Take every nth row in the LazyFrame and return as a new LazyFrame.
2132
+ #
+ # @param n [Integer]
+ # Gather every n-th row.
2133
+ # @return [LazyFrame]
2134
+ #
2135
+ # @example
2136
+ # s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]}).lazy
2137
+ # s.take_every(2).collect
2138
+ # # =>
2139
+ # # shape: (2, 2)
2140
+ # # ┌─────┬─────┐
2141
+ # # │ a ┆ b │
2142
+ # # │ --- ┆ --- │
2143
+ # # │ i64 ┆ i64 │
2144
+ # # ╞═════╪═════╡
2145
+ # # │ 1 ┆ 5 │
2146
+ # # │ 3 ┆ 7 │
2147
+ # # └─────┴─────┘
2148
+ def take_every(n)
2149
+ select(F.col("*").take_every(n))
2150
+ end
2151
+
2152
+ # Fill null values using the specified value or strategy.
2153
+ #
+ # @param value [Object]
+ # Value used to fill the null values.
+ # @param strategy [nil, "forward", "backward", "min", "max", "mean", "zero", "one"]
+ # Strategy used to fill null values.
+ # @param limit [Integer]
+ # Number of consecutive null values to fill when using the "forward" or
+ # "backward" strategy.
+ # @param matches_supertype [Boolean]
+ # Fill all matching supertypes of the fill `value` (not currently
+ # forwarded by this method).
2154
+ # @return [LazyFrame]
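+ #
+ # A minimal sketch (illustrative input; a literal fill value is used):
+ #
+ # @example
+ # lf = Polars::DataFrame.new({"a" => [1, nil, 3]}).lazy
+ # lf.fill_null(99).collect["a"].to_a
+ # # => [1, 99, 3]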
2155
+ def fill_null(value = nil, strategy: nil, limit: nil, matches_supertype: nil)
2156
+ select(Polars.all.fill_null(value, strategy: strategy, limit: limit))
2157
+ end
2158
+
2159
+ # Fill floating point NaN values.
2160
+ #
2161
+ # @param fill_value [Object]
2162
+ # Value to fill the NaN values with.
2163
+ #
2164
+ # @return [LazyFrame]
2165
+ #
2166
+ # @note
2167
+ # Note that floating point NaN (Not a Number) values are not missing values!
2168
+ # To replace missing values, use `fill_null` instead.
2169
+ #
2170
+ # @example
2171
+ # df = Polars::DataFrame.new(
2172
+ # {
2173
+ # "a" => [1.5, 2, Float::NAN, 4],
2174
+ # "b" => [0.5, 4, Float::NAN, 13],
2175
+ # }
2176
+ # ).lazy
2177
+ # df.fill_nan(99).collect
2178
+ # # =>
2179
+ # # shape: (4, 2)
2180
+ # # ┌──────┬──────┐
2181
+ # # │ a ┆ b │
2182
+ # # │ --- ┆ --- │
2183
+ # # │ f64 ┆ f64 │
2184
+ # # ╞══════╪══════╡
2185
+ # # │ 1.5 ┆ 0.5 │
2186
+ # # │ 2.0 ┆ 4.0 │
2187
+ # # │ 99.0 ┆ 99.0 │
2188
+ # # │ 4.0 ┆ 13.0 │
2189
+ # # └──────┴──────┘
2190
+ def fill_nan(fill_value)
2191
+ if !fill_value.is_a?(Expr)
2192
+ fill_value = F.lit(fill_value)
2193
+ end
2194
+ _from_rbldf(_ldf.fill_nan(fill_value._rbexpr))
2195
+ end
2196
+
2197
+ # Aggregate the columns in the DataFrame to their standard deviation value.
2198
+ #
2199
+ # @return [LazyFrame]
2200
+ #
2201
+ # @example
2202
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2203
+ # df.std.collect
2204
+ # # =>
2205
+ # # shape: (1, 2)
2206
+ # # ┌──────────┬─────┐
2207
+ # # │ a ┆ b │
2208
+ # # │ --- ┆ --- │
2209
+ # # │ f64 ┆ f64 │
2210
+ # # ╞══════════╪═════╡
2211
+ # # │ 1.290994 ┆ 0.5 │
2212
+ # # └──────────┴─────┘
2213
+ #
2214
+ # @example
2215
+ # df.std(ddof: 0).collect
2216
+ # # =>
2217
+ # # shape: (1, 2)
2218
+ # # ┌──────────┬──────────┐
2219
+ # # │ a ┆ b │
2220
+ # # │ --- ┆ --- │
2221
+ # # │ f64 ┆ f64 │
2222
+ # # ╞══════════╪══════════╡
2223
+ # # │ 1.118034 ┆ 0.433013 │
2224
+ # # └──────────┴──────────┘
2225
+ def std(ddof: 1)
2226
+ _from_rbldf(_ldf.std(ddof))
2227
+ end
2228
+
2229
+ # Aggregate the columns in the DataFrame to their variance value.
2230
+ #
2231
+ # @return [LazyFrame]
2232
+ #
2233
+ # @example
2234
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2235
+ # df.var.collect
2236
+ # # =>
2237
+ # # shape: (1, 2)
2238
+ # # ┌──────────┬──────┐
2239
+ # # │ a ┆ b │
2240
+ # # │ --- ┆ --- │
2241
+ # # │ f64 ┆ f64 │
2242
+ # # ╞══════════╪══════╡
2243
+ # # │ 1.666667 ┆ 0.25 │
2244
+ # # └──────────┴──────┘
2245
+ #
2246
+ # @example
2247
+ # df.var(ddof: 0).collect
2248
+ # # =>
2249
+ # # shape: (1, 2)
2250
+ # # ┌──────┬────────┐
2251
+ # # │ a ┆ b │
2252
+ # # │ --- ┆ --- │
2253
+ # # │ f64 ┆ f64 │
2254
+ # # ╞══════╪════════╡
2255
+ # # │ 1.25 ┆ 0.1875 │
2256
+ # # └──────┴────────┘
2257
+ def var(ddof: 1)
2258
+ _from_rbldf(_ldf.var(ddof))
2259
+ end
2260
+
2261
+ # Aggregate the columns in the DataFrame to their maximum value.
2262
+ #
2263
+ # @return [LazyFrame]
2264
+ #
2265
+ # @example
2266
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2267
+ # df.max.collect
2268
+ # # =>
2269
+ # # shape: (1, 2)
2270
+ # # ┌─────┬─────┐
2271
+ # # │ a ┆ b │
2272
+ # # │ --- ┆ --- │
2273
+ # # │ i64 ┆ i64 │
2274
+ # # ╞═════╪═════╡
2275
+ # # │ 4 ┆ 2 │
2276
+ # # └─────┴─────┘
2277
+ def max
2278
+ _from_rbldf(_ldf.max)
2279
+ end
2280
+
2281
+ # Aggregate the columns in the DataFrame to their minimum value.
2282
+ #
2283
+ # @return [LazyFrame]
2284
+ #
2285
+ # @example
2286
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2287
+ # df.min.collect
2288
+ # # =>
2289
+ # # shape: (1, 2)
2290
+ # # ┌─────┬─────┐
2291
+ # # │ a ┆ b │
2292
+ # # │ --- ┆ --- │
2293
+ # # │ i64 ┆ i64 │
2294
+ # # ╞═════╪═════╡
2295
+ # # │ 1 ┆ 1 │
2296
+ # # └─────┴─────┘
2297
+ def min
2298
+ _from_rbldf(_ldf.min)
2299
+ end
2300
+
2301
+ # Aggregate the columns in the DataFrame to their sum value.
2302
+ #
2303
+ # @return [LazyFrame]
2304
+ #
2305
+ # @example
2306
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2307
+ # df.sum.collect
2308
+ # # =>
2309
+ # # shape: (1, 2)
2310
+ # # ┌─────┬─────┐
2311
+ # # │ a ┆ b │
2312
+ # # │ --- ┆ --- │
2313
+ # # │ i64 ┆ i64 │
2314
+ # # ╞═════╪═════╡
2315
+ # # │ 10 ┆ 5 │
2316
+ # # └─────┴─────┘
2317
+ def sum
2318
+ _from_rbldf(_ldf.sum)
2319
+ end
2320
+
2321
+ # Aggregate the columns in the DataFrame to their mean value.
2322
+ #
2323
+ # @return [LazyFrame]
2324
+ #
2325
+ # @example
2326
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2327
+ # df.mean.collect
2328
+ # # =>
2329
+ # # shape: (1, 2)
2330
+ # # ┌─────┬──────┐
2331
+ # # │ a ┆ b │
2332
+ # # │ --- ┆ --- │
2333
+ # # │ f64 ┆ f64 │
2334
+ # # ╞═════╪══════╡
2335
+ # # │ 2.5 ┆ 1.25 │
2336
+ # # └─────┴──────┘
2337
+ def mean
2338
+ _from_rbldf(_ldf.mean)
2339
+ end
2340
+
2341
+ # Aggregate the columns in the DataFrame to their median value.
2342
+ #
2343
+ # @return [LazyFrame]
2344
+ #
2345
+ # @example
2346
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2347
+ # df.median.collect
2348
+ # # =>
2349
+ # # shape: (1, 2)
2350
+ # # ┌─────┬─────┐
2351
+ # # │ a ┆ b │
2352
+ # # │ --- ┆ --- │
2353
+ # # │ f64 ┆ f64 │
2354
+ # # ╞═════╪═════╡
2355
+ # # │ 2.5 ┆ 1.0 │
2356
+ # # └─────┴─────┘
2357
+ def median
2358
+ _from_rbldf(_ldf.median)
2359
+ end
2360
+
2361
+ # Aggregate the columns in the DataFrame to their quantile value.
2362
+ #
2363
+ # @param quantile [Float]
2364
+ # Quantile between 0.0 and 1.0.
2365
+ # @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
2366
+ # Interpolation method.
2367
+ #
2368
+ # @return [LazyFrame]
2369
+ #
2370
+ # @example
2371
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
2372
+ # df.quantile(0.7).collect
2373
+ # # =>
2374
+ # # shape: (1, 2)
2375
+ # # ┌─────┬─────┐
2376
+ # # │ a ┆ b │
2377
+ # # │ --- ┆ --- │
2378
+ # # │ f64 ┆ f64 │
2379
+ # # ╞═════╪═════╡
2380
+ # # │ 3.0 ┆ 1.0 │
2381
+ # # └─────┴─────┘
2382
+ def quantile(quantile, interpolation: "nearest")
2383
+ quantile = Utils.parse_into_expression(quantile, str_as_lit: false)
2384
+ _from_rbldf(_ldf.quantile(quantile, interpolation))
2385
+ end
2386
+
2387
+ # Explode lists to long format.
2388
+ #
+ # @param columns [Object]
+ # Name of the column(s) to explode.
2389
+ # @return [LazyFrame]
2390
+ #
2391
+ # @example
2392
+ # df = Polars::DataFrame.new(
2393
+ # {
2394
+ # "letters" => ["a", "a", "b", "c"],
2395
+ # "numbers" => [[1], [2, 3], [4, 5], [6, 7, 8]],
2396
+ # }
2397
+ # ).lazy
2398
+ # df.explode("numbers").collect
2399
+ # # =>
2400
+ # # shape: (8, 2)
2401
+ # # ┌─────────┬─────────┐
2402
+ # # │ letters ┆ numbers │
2403
+ # # │ --- ┆ --- │
2404
+ # # │ str ┆ i64 │
2405
+ # # ╞═════════╪═════════╡
2406
+ # # │ a ┆ 1 │
2407
+ # # │ a ┆ 2 │
2408
+ # # │ a ┆ 3 │
2409
+ # # │ b ┆ 4 │
2410
+ # # │ b ┆ 5 │
2411
+ # # │ c ┆ 6 │
2412
+ # # │ c ┆ 7 │
2413
+ # # │ c ┆ 8 │
2414
+ # # └─────────┴─────────┘
2415
+ def explode(columns)
2416
+ columns = Utils.parse_into_list_of_expressions(columns)
2417
+ _from_rbldf(_ldf.explode(columns))
2418
+ end
2419
+
2420
+ # Drop duplicate rows from this DataFrame.
2421
+ #
2422
+ # Note that this fails if there is a column of type `List` in the DataFrame or
2423
+ # subset.
2424
+ #
2425
+ # @param maintain_order [Boolean]
2426
+ # Keep the same order as the original DataFrame. This requires more work to
2427
+ # compute.
2428
+ # @param subset [Object]
2429
+ # Subset to use to compare rows.
2430
+ # @param keep ["first", "last"]
2431
+ # Which of the duplicate rows to keep.
2432
+ #
2433
+ # @return [LazyFrame]
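+ #
+ # A minimal sketch (illustrative input; with the default `keep: "first"`,
+ # the first of each group of duplicate rows is retained):
+ #
+ # @example
+ # lf = Polars::DataFrame.new({"a" => [1, 1, 2], "b" => ["x", "x", "y"]}).lazy
+ # lf.unique.collect.height
+ # # => 2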
2434
+ def unique(maintain_order: true, subset: nil, keep: "first")
2435
+ if !subset.nil? && !subset.is_a?(::Array)
2436
+ subset = [subset]
2437
+ end
2438
+ _from_rbldf(_ldf.unique(maintain_order, subset, keep))
2439
+ end
2440
+
2441
+ # Drop rows with null values from this LazyFrame.
2442
+ #
2443
+ # @param subset [Object]
2444
+ # Subset of column(s) on which `drop_nulls` will be applied.
2445
+ #
2446
+ # @return [LazyFrame]
2447
+ #
2448
+ # @example
2449
+ # df = Polars::DataFrame.new(
2450
+ # {
2451
+ # "foo" => [1, 2, 3],
2452
+ # "bar" => [6, nil, 8],
2453
+ # "ham" => ["a", "b", "c"]
2454
+ # }
2455
+ # )
2456
+ # df.lazy.drop_nulls.collect
2457
+ # # =>
2458
+ # # shape: (2, 3)
2459
+ # # ┌─────┬─────┬─────┐
2460
+ # # │ foo ┆ bar ┆ ham │
2461
+ # # │ --- ┆ --- ┆ --- │
2462
+ # # │ i64 ┆ i64 ┆ str │
2463
+ # # ╞═════╪═════╪═════╡
2464
+ # # │ 1 ┆ 6 ┆ a │
2465
+ # # │ 3 ┆ 8 ┆ c │
2466
+ # # └─────┴─────┴─────┘
2467
+ def drop_nulls(subset: nil)
2468
+ if !subset.nil? && !subset.is_a?(::Array)
2469
+ subset = [subset]
2470
+ end
2471
+ _from_rbldf(_ldf.drop_nulls(subset))
2472
+ end
2473
+
2474
+ # Unpivot a DataFrame from wide to long format.
2475
+ #
2476
+ # Optionally leaves identifiers set.
2477
+ #
2478
+ # This function is useful to massage a DataFrame into a format where one or more
2479
+ # columns are identifier variables (index) while all other columns, considered
2480
+ # measured variables (on), are "unpivoted" to the row axis leaving just
2481
+ # two non-identifier columns, 'variable' and 'value'.
2482
+ #
2483
+ # @param on [Object]
2484
+ # Column(s) or selector(s) to use as values variables; if `on`
2485
+ # is empty, all columns that are not in `index` will be used.
2486
+ # @param index [Object]
2487
+ # Column(s) or selector(s) to use as identifier variables.
2488
+ # @param variable_name [String]
2489
+ # Name to give to the `variable` column. Defaults to "variable"
2490
+ # @param value_name [String]
2491
+ # Name to give to the `value` column. Defaults to "value"
2492
+ # @param streamable [Boolean]
2493
+ # Allow this node to run in the streaming engine (deprecated).
2494
+ # If this runs in streaming, the output of the unpivot operation
2495
+ # will not have a stable ordering.
2496
+ #
2497
+ # @return [LazyFrame]
2498
+ #
2499
+ # @example
2500
+ # lf = Polars::LazyFrame.new(
2501
+ # {
2502
+ # "a" => ["x", "y", "z"],
2503
+ # "b" => [1, 3, 5],
2504
+ # "c" => [2, 4, 6]
2505
+ # }
2506
+ # )
2507
+ # lf.unpivot(Polars::Selectors.numeric, index: "a").collect
2508
+ # # =>
2509
+ # # shape: (6, 3)
2510
+ # # ┌─────┬──────────┬───────┐
2511
+ # # │ a ┆ variable ┆ value │
2512
+ # # │ --- ┆ --- ┆ --- │
2513
+ # # │ str ┆ str ┆ i64 │
2514
+ # # ╞═════╪══════════╪═══════╡
2515
+ # # │ x ┆ b ┆ 1 │
2516
+ # # │ y ┆ b ┆ 3 │
2517
+ # # │ z ┆ b ┆ 5 │
2518
+ # # │ x ┆ c ┆ 2 │
2519
+ # # │ y ┆ c ┆ 4 │
2520
+ # # │ z ┆ c ┆ 6 │
2521
+ # # └─────┴──────────┴───────┘
2522
+ def unpivot(
2523
+ on,
2524
+ index: nil,
2525
+ variable_name: nil,
2526
+ value_name: nil,
2527
+ streamable: true
2528
+ )
2529
+ if !streamable
2530
+ warn "The `streamable` parameter for `LazyFrame.unpivot` is deprecated"
2531
+ end
2532
+
2533
+ on = on.nil? ? [] : Utils._expand_selectors(self, on)
2534
+ index = index.nil? ? [] : Utils._expand_selectors(self, index)
2535
+
2536
+ _from_rbldf(
2537
+ _ldf.unpivot(on, index, value_name, variable_name)
2538
+ )
2539
+ end
2540
+ alias_method :melt, :unpivot
2541
+
2542
+ # def map
2543
+ # end
2544
+
2545
+ # Interpolate intermediate values. The interpolation method is linear.
2546
+ #
2547
+ # @return [LazyFrame]
2548
+ #
2549
+ # @example
2550
+ # df = Polars::DataFrame.new(
2551
+ # {
2552
+ # "foo" => [1, nil, 9, 10],
2553
+ # "bar" => [6, 7, 9, nil],
2554
+ # "baz" => [1, nil, nil, 9]
2555
+ # }
2556
+ # ).lazy
2557
+ # df.interpolate.collect
2558
+ # # =>
2559
+ # # shape: (4, 3)
2560
+ # # ┌──────┬──────┬──────────┐
2561
+ # # │ foo ┆ bar ┆ baz │
2562
+ # # │ --- ┆ --- ┆ --- │
2563
+ # # │ f64 ┆ f64 ┆ f64 │
2564
+ # # ╞══════╪══════╪══════════╡
2565
+ # # │ 1.0 ┆ 6.0 ┆ 1.0 │
2566
+ # # │ 5.0 ┆ 7.0 ┆ 3.666667 │
2567
+ # # │ 9.0 ┆ 9.0 ┆ 6.333333 │
2568
+ # # │ 10.0 ┆ null ┆ 9.0 │
2569
+ # # └──────┴──────┴──────────┘
2570
+ def interpolate
2571
+ select(F.col("*").interpolate)
2572
+ end
2573
+
2574
+ # Decompose a struct into its fields.
2575
+ #
2576
+ # The fields will be inserted into the `DataFrame` on the location of the
2577
+ # `struct` type.
2578
+ #
2579
+ # @param names [Object]
2580
+ # Names of the struct columns that will be decomposed into their fields.
2581
+ #
2582
+ # @return [LazyFrame]
2583
+ #
2584
+ # @example
2585
+ # df = (
2586
+ # Polars::DataFrame.new(
2587
+ # {
2588
+ # "before" => ["foo", "bar"],
2589
+ # "t_a" => [1, 2],
2590
+ # "t_b" => ["a", "b"],
2591
+ # "t_c" => [true, nil],
2592
+ # "t_d" => [[1, 2], [3]],
2593
+ # "after" => ["baz", "womp"]
2594
+ # }
2595
+ # )
2596
+ # .lazy
2597
+ # .select(
2598
+ # ["before", Polars.struct(Polars.col("^t_.$")).alias("t_struct"), "after"]
2599
+ # )
2600
+ # )
2601
+ # df.fetch
2602
+ # # =>
2603
+ # # shape: (2, 3)
2604
+ # # ┌────────┬─────────────────────┬───────┐
2605
+ # # │ before ┆ t_struct ┆ after │
2606
+ # # │ --- ┆ --- ┆ --- │
2607
+ # # │ str ┆ struct[4] ┆ str │
2608
+ # # ╞════════╪═════════════════════╪═══════╡
2609
+ # # │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │
2610
+ # # │ bar ┆ {2,"b",null,[3]} ┆ womp │
2611
+ # # └────────┴─────────────────────┴───────┘
2612
+ #
2613
+ # @example
2614
+ # df.unnest("t_struct").fetch
2615
+ # # =>
2616
+ # # shape: (2, 6)
2617
+ # # ┌────────┬─────┬─────┬──────┬───────────┬───────┐
2618
+ # # │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │
2619
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
2620
+ # # │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │
2621
+ # # ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
2622
+ # # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │
2623
+ # # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
2624
+ # # └────────┴─────┴─────┴──────┴───────────┴───────┘
2625
+ def unnest(names)
2626
+ if names.is_a?(::String)
2627
+ names = [names]
2628
+ end
2629
+ _from_rbldf(_ldf.unnest(names))
2630
+ end
2631
+
2632
+ # Take two sorted DataFrames and merge them by the sorted key.
2633
+ #
2634
+ # The output of this operation will also be sorted.
2635
+ # It is the caller's responsibility to ensure that the frames are sorted
2636
+ # by that key; otherwise, the output will not make sense.
2637
+ #
2638
+ # The schemas of both LazyFrames must be equal.
2639
+ #
2640
+ # @param other [LazyFrame]
2641
+ # Other LazyFrame that must be merged.
2642
+ # @param key [String]
2643
+ # Key that is sorted.
2644
+ #
2645
+ # @return [LazyFrame]
2646
+ #
2647
+ # @example
2648
+ # df0 = Polars::LazyFrame.new(
2649
+ # {"name" => ["steve", "elise", "bob"], "age" => [42, 44, 18]}
2650
+ # ).sort("age")
2651
+ # df1 = Polars::LazyFrame.new(
2652
+ # {"name" => ["anna", "megan", "steve", "thomas"], "age" => [21, 33, 42, 20]}
2653
+ # ).sort("age")
2654
+ # df0.merge_sorted(df1, "age").collect
2655
+ # # =>
2656
+ # # shape: (7, 2)
2657
+ # # ┌────────┬─────┐
2658
+ # # │ name ┆ age │
2659
+ # # │ --- ┆ --- │
2660
+ # # │ str ┆ i64 │
2661
+ # # ╞════════╪═════╡
2662
+ # # │ bob ┆ 18 │
2663
+ # # │ thomas ┆ 20 │
2664
+ # # │ anna ┆ 21 │
2665
+ # # │ megan ┆ 33 │
2666
+ # # │ steve ┆ 42 │
2667
+ # # │ steve ┆ 42 │
2668
+ # # │ elise ┆ 44 │
2669
+ # # └────────┴─────┘
2670
+ def merge_sorted(other, key)
2671
+ _from_rbldf(_ldf.merge_sorted(other._ldf, key))
2672
+ end
2673
+
2674
+ # Indicate that a column is sorted.
2675
+ #
2676
+ # @param column [String]
2677
+ # Name of the column that is sorted.
2678
+ # @param descending [Boolean]
2679
+ # Whether the column is sorted in descending order.
2680
+ #
2681
+ # @return [LazyFrame]
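+ #
+ # A minimal sketch (illustrative; this only sets a sortedness flag that
+ # downstream optimizations may use, it does not reorder the data):
+ #
+ # @example
+ # lf = Polars::DataFrame.new({"ts" => [1, 2, 3]}).lazy
+ # lf.set_sorted("ts").collect["ts"].to_a
+ # # => [1, 2, 3]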
2682
+ def set_sorted(
2683
+ column,
2684
+ descending: false
2685
+ )
2686
+ if !Utils.strlike?(column)
2687
+ msg = "expected a 'str' for argument 'column' in 'set_sorted'"
2688
+ raise TypeError, msg
2689
+ end
2690
+ with_columns(F.col(column).set_sorted(descending: descending))
2691
+ end
2692
+
2693
+ # TODO
2694
+ # def update
2695
+ # end
2696
+
2697
+ private
2698
+
2699
+ def initialize_copy(other)
2700
+ super
2701
+ self._ldf = _ldf._clone
2702
+ end
2703
+
2704
+ def _from_rbldf(rb_ldf)
2705
+ self.class._from_rbldf(rb_ldf)
2706
+ end
2707
+ end
2708
+ end