polars-df 0.13.0-x64-mingw-ucrt

Files changed (80)
  1. checksums.yaml +7 -0
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +208 -0
  4. data/Cargo.lock +2556 -0
  5. data/Cargo.toml +6 -0
  6. data/LICENSE-THIRD-PARTY.txt +39278 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +437 -0
  9. data/lib/polars/3.1/polars.so +0 -0
  10. data/lib/polars/3.2/polars.so +0 -0
  11. data/lib/polars/3.3/polars.so +0 -0
  12. data/lib/polars/array_expr.rb +537 -0
  13. data/lib/polars/array_name_space.rb +423 -0
  14. data/lib/polars/batched_csv_reader.rb +104 -0
  15. data/lib/polars/binary_expr.rb +77 -0
  16. data/lib/polars/binary_name_space.rb +66 -0
  17. data/lib/polars/cat_expr.rb +36 -0
  18. data/lib/polars/cat_name_space.rb +88 -0
  19. data/lib/polars/config.rb +530 -0
  20. data/lib/polars/convert.rb +98 -0
  21. data/lib/polars/data_frame.rb +5191 -0
  22. data/lib/polars/data_types.rb +466 -0
  23. data/lib/polars/date_time_expr.rb +1397 -0
  24. data/lib/polars/date_time_name_space.rb +1287 -0
  25. data/lib/polars/dynamic_group_by.rb +52 -0
  26. data/lib/polars/exceptions.rb +38 -0
  27. data/lib/polars/expr.rb +7256 -0
  28. data/lib/polars/expr_dispatch.rb +22 -0
  29. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  30. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  31. data/lib/polars/functions/as_datatype.rb +271 -0
  32. data/lib/polars/functions/col.rb +47 -0
  33. data/lib/polars/functions/eager.rb +182 -0
  34. data/lib/polars/functions/lazy.rb +1329 -0
  35. data/lib/polars/functions/len.rb +49 -0
  36. data/lib/polars/functions/lit.rb +35 -0
  37. data/lib/polars/functions/random.rb +16 -0
  38. data/lib/polars/functions/range/date_range.rb +136 -0
  39. data/lib/polars/functions/range/datetime_range.rb +149 -0
  40. data/lib/polars/functions/range/int_range.rb +51 -0
  41. data/lib/polars/functions/range/time_range.rb +141 -0
  42. data/lib/polars/functions/repeat.rb +144 -0
  43. data/lib/polars/functions/whenthen.rb +96 -0
  44. data/lib/polars/functions.rb +57 -0
  45. data/lib/polars/group_by.rb +613 -0
  46. data/lib/polars/io/avro.rb +24 -0
  47. data/lib/polars/io/csv.rb +696 -0
  48. data/lib/polars/io/database.rb +73 -0
  49. data/lib/polars/io/ipc.rb +275 -0
  50. data/lib/polars/io/json.rb +29 -0
  51. data/lib/polars/io/ndjson.rb +80 -0
  52. data/lib/polars/io/parquet.rb +233 -0
  53. data/lib/polars/lazy_frame.rb +2708 -0
  54. data/lib/polars/lazy_group_by.rb +181 -0
  55. data/lib/polars/list_expr.rb +791 -0
  56. data/lib/polars/list_name_space.rb +449 -0
  57. data/lib/polars/meta_expr.rb +222 -0
  58. data/lib/polars/name_expr.rb +198 -0
  59. data/lib/polars/plot.rb +109 -0
  60. data/lib/polars/rolling_group_by.rb +35 -0
  61. data/lib/polars/series.rb +4444 -0
  62. data/lib/polars/slice.rb +104 -0
  63. data/lib/polars/sql_context.rb +194 -0
  64. data/lib/polars/string_cache.rb +75 -0
  65. data/lib/polars/string_expr.rb +1495 -0
  66. data/lib/polars/string_name_space.rb +811 -0
  67. data/lib/polars/struct_expr.rb +98 -0
  68. data/lib/polars/struct_name_space.rb +96 -0
  69. data/lib/polars/testing.rb +507 -0
  70. data/lib/polars/utils/constants.rb +9 -0
  71. data/lib/polars/utils/convert.rb +97 -0
  72. data/lib/polars/utils/parse.rb +89 -0
  73. data/lib/polars/utils/various.rb +76 -0
  74. data/lib/polars/utils/wrap.rb +19 -0
  75. data/lib/polars/utils.rb +130 -0
  76. data/lib/polars/version.rb +4 -0
  77. data/lib/polars/whenthen.rb +83 -0
  78. data/lib/polars-df.rb +1 -0
  79. data/lib/polars.rb +91 -0
  80. metadata +138 -0
data/lib/polars/lazy_frame.rb
@@ -0,0 +1,2708 @@
+ module Polars
+   # Representation of a Lazy computation graph/query against a DataFrame.
+   class LazyFrame
+     # @private
+     attr_accessor :_ldf
+
+     # Create a new LazyFrame.
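+     #
+     # @example Build a LazyFrame directly from a hash of columns:
+     #   lf = Polars::LazyFrame.new({"a" => [1, 2, 3], "b" => ["x", "y", "z"]})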
+     def initialize(data = nil, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
+       self._ldf = (
+         DataFrame.new(
+           data,
+           schema: schema,
+           schema_overrides: schema_overrides,
+           orient: orient,
+           infer_schema_length: infer_schema_length,
+           nan_to_null: nan_to_null
+         )
+         .lazy
+         ._ldf
+       )
+     end
+
+     # @private
+     def self._from_rbldf(rb_ldf)
+       ldf = LazyFrame.allocate
+       ldf._ldf = rb_ldf
+       ldf
+     end
+
+     # def self.from_json
+     # end
+
+     # Read a logical plan from a JSON file to construct a LazyFrame.
+     #
+     # @param file [String]
+     #   Path to a file or a file-like object.
+     #
+     # @return [LazyFrame]
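+     #
+     # @example A round-trip sketch ("plan.json" is an assumed local path):
+     #   lf = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy.select(Polars.col("a") * 2)
+     #   lf.write_json("plan.json")
+     #   Polars::LazyFrame.read_json("plan.json").collect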
+     def self.read_json(file)
+       if Utils.pathlike?(file)
+         file = Utils.normalize_filepath(file)
+       end
+
+       Utils.wrap_ldf(RbLazyFrame.read_json(file))
+     end
+
+     # Get or set column names.
+     #
+     # @return [Array]
+     #
+     # @example
+     #   df = (
+     #     Polars::DataFrame.new(
+     #       {
+     #         "foo" => [1, 2, 3],
+     #         "bar" => [6, 7, 8],
+     #         "ham" => ["a", "b", "c"]
+     #       }
+     #     )
+     #     .lazy
+     #     .select(["foo", "bar"])
+     #   )
+     #   df.columns
+     #   # => ["foo", "bar"]
+     def columns
+       _ldf.collect_schema.keys
+     end
+
+     # Get dtypes of columns in LazyFrame.
+     #
+     # @return [Array]
+     #
+     # @example
+     #   lf = Polars::DataFrame.new(
+     #     {
+     #       "foo" => [1, 2, 3],
+     #       "bar" => [6.0, 7.0, 8.0],
+     #       "ham" => ["a", "b", "c"]
+     #     }
+     #   ).lazy
+     #   lf.dtypes
+     #   # => [Polars::Int64, Polars::Float64, Polars::String]
+     def dtypes
+       _ldf.collect_schema.values
+     end
+
+     # Get the schema.
+     #
+     # @return [Hash]
+     #
+     # @example
+     #   lf = Polars::DataFrame.new(
+     #     {
+     #       "foo" => [1, 2, 3],
+     #       "bar" => [6.0, 7.0, 8.0],
+     #       "ham" => ["a", "b", "c"]
+     #     }
+     #   ).lazy
+     #   lf.schema
+     #   # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String}
+     def schema
+       _ldf.collect_schema
+     end
+
+     # Get the width of the LazyFrame.
+     #
+     # @return [Integer]
+     #
+     # @example
+     #   lf = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]}).lazy
+     #   lf.width
+     #   # => 2
+     def width
+       _ldf.collect_schema.length
+     end
+
+     # Check if the LazyFrame includes the given column name.
+     #
+     # @return [Boolean]
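+     #
+     # @example
+     #   lf = Polars::DataFrame.new({"foo" => [1], "bar" => [2]}).lazy
+     #   lf.include?("foo")
+     #   # => true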
+     def include?(key)
+       columns.include?(key)
+     end
+
+     # clone handled by initialize_copy
+
+     # def [](item)
+     # end
+
+     # Returns a string representing the LazyFrame.
+     #
+     # @return [String]
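+     #
+     # @example A sketch; the exact plan text depends on the Polars version:
+     #   puts Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy.to_s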
+     def to_s
+       <<~EOS
+         naive plan: (run LazyFrame#describe_optimized_plan to see the optimized plan)
+
+         #{describe_plan}
+       EOS
+     end
+
+     # Write the logical plan of this LazyFrame to a file or string in JSON format.
+     #
+     # @param file [String]
+     #   File path to which the result should be written.
+     #
+     # @return [nil]
+     def write_json(file)
+       if Utils.pathlike?(file)
+         file = Utils.normalize_filepath(file)
+       end
+       _ldf.write_json(file)
+       nil
+     end
+
+     # Offers a structured way to apply a sequence of user-defined functions (UDFs).
+     #
+     # @param func [Object]
+     #   Callable; will receive the frame as the first parameter,
+     #   followed by any given args/kwargs.
+     # @param args [Object]
+     #   Arguments to pass to the UDF.
+     # @param kwargs [Object]
+     #   Keyword arguments to pass to the UDF.
+     #
+     # @return [LazyFrame]
+     #
+     # @example
+     #   cast_str_to_int = lambda do |data, col_name:|
+     #     data.with_column(Polars.col(col_name).cast(:i64))
+     #   end
+     #
+     #   df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]}).lazy
+     #   df.pipe(cast_str_to_int, col_name: "b").collect
+     #   # =>
+     #   # shape: (4, 2)
+     #   # ┌─────┬─────┐
+     #   # │ a   ┆ b   │
+     #   # │ --- ┆ --- │
+     #   # │ i64 ┆ i64 │
+     #   # ╞═════╪═════╡
+     #   # │ 1   ┆ 10  │
+     #   # │ 2   ┆ 20  │
+     #   # │ 3   ┆ 30  │
+     #   # │ 4   ┆ 40  │
+     #   # └─────┴─────┘
+     def pipe(func, *args, **kwargs, &block)
+       func.call(self, *args, **kwargs, &block)
+     end
+
+     # Create a string representation of the unoptimized query plan.
+     #
+     # @return [String]
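+     #
+     # @example A sketch; the plan text varies by version:
+     #   lf = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy.filter(Polars.col("a") > 1)
+     #   puts lf.describe_plan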
+     def describe_plan
+       _ldf.describe_plan
+     end
+
+     # Create a string representation of the optimized query plan.
+     #
+     # @return [String]
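+     #
+     # @example A sketch comparing plans with and without predicate pushdown:
+     #   lf = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy.filter(Polars.col("a") > 1)
+     #   puts lf.describe_optimized_plan
+     #   puts lf.describe_optimized_plan(predicate_pushdown: false)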
+     def describe_optimized_plan(
+       type_coercion: true,
+       predicate_pushdown: true,
+       projection_pushdown: true,
+       simplify_expression: true,
+       slice_pushdown: true,
+       common_subplan_elimination: true,
+       comm_subexpr_elim: true,
+       allow_streaming: false
+     )
+       ldf = _ldf.optimization_toggle(
+         type_coercion,
+         predicate_pushdown,
+         projection_pushdown,
+         simplify_expression,
+         slice_pushdown,
+         common_subplan_elimination,
+         comm_subexpr_elim,
+         allow_streaming,
+         false
+       )
+
+       ldf.describe_optimized_plan
+     end
+
+     # def show_graph
+     # end
+
+     # Sort the DataFrame.
+     #
+     # Sorting can be done by:
+     #
+     # - A single column name
+     # - An expression
+     # - Multiple expressions
+     #
+     # @param by [Object]
+     #   Column (expressions) to sort by.
+     # @param reverse [Boolean]
+     #   Sort in descending order.
+     # @param nulls_last [Boolean]
+     #   Place null values last. Can only be used if sorted by a single column.
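+     # @param maintain_order [Boolean]
+     #   Whether the order should be maintained if elements are equal.
+     # @param multithreaded [Boolean]
+     #   Sort using multiple threads.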
+     #
+     # @return [LazyFrame]
+     #
+     # @example
+     #   df = Polars::DataFrame.new(
+     #     {
+     #       "foo" => [1, 2, 3],
+     #       "bar" => [6.0, 7.0, 8.0],
+     #       "ham" => ["a", "b", "c"]
+     #     }
+     #   ).lazy
+     #   df.sort("foo", reverse: true).collect
+     #   # =>
+     #   # shape: (3, 3)
+     #   # ┌─────┬─────┬─────┐
+     #   # │ foo ┆ bar ┆ ham │
+     #   # │ --- ┆ --- ┆ --- │
+     #   # │ i64 ┆ f64 ┆ str │
+     #   # ╞═════╪═════╪═════╡
+     #   # │ 3   ┆ 8.0 ┆ c   │
+     #   # │ 2   ┆ 7.0 ┆ b   │
+     #   # │ 1   ┆ 6.0 ┆ a   │
+     #   # └─────┴─────┴─────┘
+     def sort(by, *more_by, reverse: false, nulls_last: false, maintain_order: false, multithreaded: true)
+       if by.is_a?(::String) && more_by.empty?
+         return _from_rbldf(
+           _ldf.sort(
+             by, reverse, nulls_last, maintain_order, multithreaded
+           )
+         )
+       end
+
+       by = Utils.parse_into_list_of_expressions(by, *more_by)
+       reverse = Utils.extend_bool(reverse, by.length, "reverse", "by")
+       nulls_last = Utils.extend_bool(nulls_last, by.length, "nulls_last", "by")
+       _from_rbldf(
+         _ldf.sort_by_exprs(
+           by, reverse, nulls_last, maintain_order, multithreaded
+         )
+       )
+     end
+
+     # def profile
+     # end
+
+     # Collect into a DataFrame.
+     #
+     # Note: use {#fetch} if you want to run your query on the first `n` rows
+     # only. This can be a huge time saver in debugging queries.
+     #
+     # @param type_coercion [Boolean]
+     #   Do type coercion optimization.
+     # @param predicate_pushdown [Boolean]
+     #   Do predicate pushdown optimization.
+     # @param projection_pushdown [Boolean]
+     #   Do projection pushdown optimization.
+     # @param simplify_expression [Boolean]
+     #   Run simplify expressions optimization.
+     # @param string_cache [Boolean]
+     #   This argument is deprecated. Please set the string cache globally.
+     #   The argument will be ignored.
+     # @param no_optimization [Boolean]
+     #   Turn off (certain) optimizations.
+     # @param slice_pushdown [Boolean]
+     #   Slice pushdown optimization.
+     # @param common_subplan_elimination [Boolean]
+     #   Will try to cache branching subplans that occur on self-joins or unions.
+     # @param allow_streaming [Boolean]
+     #   Run parts of the query in a streaming fashion (this is in an alpha state).
+     #
+     # @return [DataFrame]
+     #
+     # @example
+     #   df = Polars::DataFrame.new(
+     #     {
+     #       "a" => ["a", "b", "a", "b", "b", "c"],
+     #       "b" => [1, 2, 3, 4, 5, 6],
+     #       "c" => [6, 5, 4, 3, 2, 1]
+     #     }
+     #   ).lazy
+     #   df.group_by("a", maintain_order: true).agg(Polars.all.sum).collect
+     #   # =>
+     #   # shape: (3, 3)
+     #   # ┌─────┬─────┬─────┐
+     #   # │ a   ┆ b   ┆ c   │
+     #   # │ --- ┆ --- ┆ --- │
+     #   # │ str ┆ i64 ┆ i64 │
+     #   # ╞═════╪═════╪═════╡
+     #   # │ a   ┆ 4   ┆ 10  │
+     #   # │ b   ┆ 11  ┆ 10  │
+     #   # │ c   ┆ 6   ┆ 1   │
+     #   # └─────┴─────┴─────┘
+     def collect(
+       type_coercion: true,
+       predicate_pushdown: true,
+       projection_pushdown: true,
+       simplify_expression: true,
+       string_cache: false,
+       no_optimization: false,
+       slice_pushdown: true,
+       common_subplan_elimination: true,
+       comm_subexpr_elim: true,
+       allow_streaming: false,
+       _eager: false
+     )
+       if no_optimization
+         predicate_pushdown = false
+         projection_pushdown = false
+         slice_pushdown = false
+         common_subplan_elimination = false
+         comm_subexpr_elim = false
+       end
+
+       if allow_streaming
+         common_subplan_elimination = false
+       end
+
+       ldf = _ldf.optimization_toggle(
+         type_coercion,
+         predicate_pushdown,
+         projection_pushdown,
+         simplify_expression,
+         slice_pushdown,
+         common_subplan_elimination,
+         comm_subexpr_elim,
+         allow_streaming,
+         _eager
+       )
+       Utils.wrap_df(ldf.collect)
+     end
+
+     # Evaluate the query in streaming mode and write to a Parquet file.
+     #
+     # This allows streaming results that are larger than RAM to be written to disk.
+     #
+     # @param path [String]
+     #   File path to which the file should be written.
+     # @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
+     #   Choose "zstd" for good compression performance.
+     #   Choose "lz4" for fast compression/decompression.
+     #   Choose "snappy" for more backwards compatibility guarantees
+     #   when you deal with older parquet readers.
+     # @param compression_level [Integer]
+     #   The level of compression to use. Higher compression means smaller files on
+     #   disk.
+     #
+     #   - "gzip" : min-level: 0, max-level: 10.
+     #   - "brotli" : min-level: 0, max-level: 11.
+     #   - "zstd" : min-level: 1, max-level: 22.
+     # @param statistics [Boolean]
+     #   Write statistics to the parquet headers. This requires extra compute.
+     # @param row_group_size [Integer]
+     #   Size of the row groups in number of rows.
+     #   If `nil` (default), the chunks of the `DataFrame` are
+     #   used. Writing in smaller chunks may reduce memory pressure and improve
+     #   writing speeds.
+     # @param data_pagesize_limit [Integer]
+     #   Size limit of individual data pages.
+     #   If not set, defaults to 1024 * 1024 bytes.
+     # @param maintain_order [Boolean]
+     #   Maintain the order in which data is processed.
+     #   Setting this to `false` will be slightly faster.
+     # @param type_coercion [Boolean]
+     #   Do type coercion optimization.
+     # @param predicate_pushdown [Boolean]
+     #   Do predicate pushdown optimization.
+     # @param projection_pushdown [Boolean]
+     #   Do projection pushdown optimization.
+     # @param simplify_expression [Boolean]
+     #   Run simplify expressions optimization.
+     # @param no_optimization [Boolean]
+     #   Turn off (certain) optimizations.
+     # @param slice_pushdown [Boolean]
+     #   Slice pushdown optimization.
+     #
+     # @return [DataFrame]
+     #
+     # @example
+     #   lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+     #   lf.sink_parquet("out.parquet")
+     def sink_parquet(
+       path,
+       compression: "zstd",
+       compression_level: nil,
+       statistics: true,
+       row_group_size: nil,
+       data_pagesize_limit: nil,
+       maintain_order: true,
+       type_coercion: true,
+       predicate_pushdown: true,
+       projection_pushdown: true,
+       simplify_expression: true,
+       no_optimization: false,
+       slice_pushdown: true
+     )
+       lf = _set_sink_optimizations(
+         type_coercion: type_coercion,
+         predicate_pushdown: predicate_pushdown,
+         projection_pushdown: projection_pushdown,
+         simplify_expression: simplify_expression,
+         slice_pushdown: slice_pushdown,
+         no_optimization: no_optimization
+       )
+
+       if statistics == true
+         statistics = {
+           min: true,
+           max: true,
+           distinct_count: false,
+           null_count: true
+         }
+       elsif statistics == false
+         statistics = {}
+       elsif statistics == "full"
+         statistics = {
+           min: true,
+           max: true,
+           distinct_count: true,
+           null_count: true
+         }
+       end
+
+       lf.sink_parquet(
+         path,
+         compression,
+         compression_level,
+         statistics,
+         row_group_size,
+         data_pagesize_limit,
+         maintain_order
+       )
+     end
+
+     # Evaluate the query in streaming mode and write to an IPC file.
+     #
+     # This allows streaming results that are larger than RAM to be written to disk.
+     #
+     # @param path [String]
+     #   File path to which the file should be written.
+     # @param compression ["lz4", "zstd"]
+     #   Choose "zstd" for good compression performance.
+     #   Choose "lz4" for fast compression/decompression.
+     # @param maintain_order [Boolean]
+     #   Maintain the order in which data is processed.
+     #   Setting this to `false` will be slightly faster.
+     # @param type_coercion [Boolean]
+     #   Do type coercion optimization.
+     # @param predicate_pushdown [Boolean]
+     #   Do predicate pushdown optimization.
+     # @param projection_pushdown [Boolean]
+     #   Do projection pushdown optimization.
+     # @param simplify_expression [Boolean]
+     #   Run simplify expressions optimization.
+     # @param slice_pushdown [Boolean]
+     #   Slice pushdown optimization.
+     # @param no_optimization [Boolean]
+     #   Turn off (certain) optimizations.
+     #
+     # @return [DataFrame]
+     #
+     # @example
+     #   lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+     #   lf.sink_ipc("out.arrow")
+     def sink_ipc(
+       path,
+       compression: "zstd",
+       maintain_order: true,
+       type_coercion: true,
+       predicate_pushdown: true,
+       projection_pushdown: true,
+       simplify_expression: true,
+       slice_pushdown: true,
+       no_optimization: false
+     )
+       lf = _set_sink_optimizations(
+         type_coercion: type_coercion,
+         predicate_pushdown: predicate_pushdown,
+         projection_pushdown: projection_pushdown,
+         simplify_expression: simplify_expression,
+         slice_pushdown: slice_pushdown,
+         no_optimization: no_optimization
+       )
+
+       lf.sink_ipc(
+         path,
+         compression,
+         maintain_order
+       )
+     end
+
+     # Evaluate the query in streaming mode and write to a CSV file.
+     #
+     # This allows streaming results that are larger than RAM to be written to disk.
+     #
+     # @param path [String]
+     #   File path to which the file should be written.
+     # @param include_bom [Boolean]
+     #   Whether to include UTF-8 BOM in the CSV output.
+     # @param include_header [Boolean]
+     #   Whether to include header in the CSV output.
+     # @param separator [String]
+     #   Separate CSV fields with this symbol.
+     # @param line_terminator [String]
+     #   String used to end each row.
+     # @param quote_char [String]
+     #   Byte to use as quoting character.
+     # @param batch_size [Integer]
+     #   Number of rows that will be processed per thread.
+     # @param datetime_format [String]
+     #   A format string, with the specifiers defined by the
+     #   [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
+     #   Rust crate. If no format is specified, the default fractional-second
+     #   precision is inferred from the maximum timeunit found in the frame's
+     #   Datetime cols (if any).
+     # @param date_format [String]
+     #   A format string, with the specifiers defined by the
+     #   [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
+     #   Rust crate.
+     # @param time_format [String]
+     #   A format string, with the specifiers defined by the
+     #   [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
+     #   Rust crate.
+     # @param float_precision [Integer]
+     #   Number of decimal places to write, applied to both `Float32` and
+     #   `Float64` datatypes.
+     # @param null_value [String]
+     #   A string representing null values (defaulting to the empty string).
+     # @param quote_style ["necessary", "always", "non_numeric", "never"]
+     #   Determines the quoting strategy used.
+     #
+     #   - necessary (default): This puts quotes around fields only when necessary.
+     #     They are necessary when fields contain a quote,
+     #     delimiter or record terminator.
+     #     Quotes are also necessary when writing an empty record
+     #     (which is indistinguishable from a record with one empty field).
+     #   - always: This puts quotes around every field. Always.
+     #   - never: This never puts quotes around fields, even if that results in
+     #     invalid CSV data (e.g. by not quoting strings containing the
+     #     separator).
+     #   - non_numeric: This puts quotes around all fields that are non-numeric.
+     #     Namely, when writing a field that does not parse as a valid float
+     #     or integer, then quotes will be used even if they aren't strictly
+     #     necessary.
+     # @param maintain_order [Boolean]
+     #   Maintain the order in which data is processed.
+     #   Setting this to `false` will be slightly faster.
+     # @param type_coercion [Boolean]
+     #   Do type coercion optimization.
+     # @param predicate_pushdown [Boolean]
+     #   Do predicate pushdown optimization.
+     # @param projection_pushdown [Boolean]
+     #   Do projection pushdown optimization.
+     # @param simplify_expression [Boolean]
+     #   Run simplify expressions optimization.
+     # @param slice_pushdown [Boolean]
+     #   Slice pushdown optimization.
+     # @param no_optimization [Boolean]
+     #   Turn off (certain) optimizations.
+     #
+     # @return [DataFrame]
+     #
+     # @example
+     #   lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+     #   lf.sink_csv("out.csv")
+     def sink_csv(
+       path,
+       include_bom: false,
+       include_header: true,
+       separator: ",",
+       line_terminator: "\n",
+       quote_char: '"',
+       batch_size: 1024,
+       datetime_format: nil,
+       date_format: nil,
+       time_format: nil,
+       float_scientific: nil,
+       float_precision: nil,
+       null_value: nil,
+       quote_style: nil,
+       maintain_order: true,
+       type_coercion: true,
+       predicate_pushdown: true,
+       projection_pushdown: true,
+       simplify_expression: true,
+       slice_pushdown: true,
+       no_optimization: false
+     )
+       Utils._check_arg_is_1byte("separator", separator, false)
+       Utils._check_arg_is_1byte("quote_char", quote_char, false)
+
+       lf = _set_sink_optimizations(
+         type_coercion: type_coercion,
+         predicate_pushdown: predicate_pushdown,
+         projection_pushdown: projection_pushdown,
+         simplify_expression: simplify_expression,
+         slice_pushdown: slice_pushdown,
+         no_optimization: no_optimization
+       )
+
+       lf.sink_csv(
+         path,
+         include_bom,
+         include_header,
+         separator.ord,
+         line_terminator,
+         quote_char.ord,
+         batch_size,
+         datetime_format,
+         date_format,
+         time_format,
+         float_scientific,
+         float_precision,
+         null_value,
+         quote_style,
+         maintain_order
+       )
+     end
+
+     # Evaluate the query in streaming mode and write to an NDJSON file.
+     #
+     # This allows streaming results that are larger than RAM to be written to disk.
+     #
+     # @param path [String]
+     #   File path to which the file should be written.
+     # @param maintain_order [Boolean]
+     #   Maintain the order in which data is processed.
+     #   Setting this to `false` will be slightly faster.
+     # @param type_coercion [Boolean]
+     #   Do type coercion optimization.
+     # @param predicate_pushdown [Boolean]
+     #   Do predicate pushdown optimization.
+     # @param projection_pushdown [Boolean]
+     #   Do projection pushdown optimization.
+     # @param simplify_expression [Boolean]
+     #   Run simplify expressions optimization.
+     # @param slice_pushdown [Boolean]
+     #   Slice pushdown optimization.
+     # @param no_optimization [Boolean]
+     #   Turn off (certain) optimizations.
+     #
+     # @return [DataFrame]
+     #
+     # @example
+     #   lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+     #   lf.sink_ndjson("out.ndjson")
+     def sink_ndjson(
+       path,
+       maintain_order: true,
+       type_coercion: true,
+       predicate_pushdown: true,
+       projection_pushdown: true,
+       simplify_expression: true,
+       slice_pushdown: true,
+       no_optimization: false
+     )
+       lf = _set_sink_optimizations(
+         type_coercion: type_coercion,
+         predicate_pushdown: predicate_pushdown,
+         projection_pushdown: projection_pushdown,
+         simplify_expression: simplify_expression,
+         slice_pushdown: slice_pushdown,
+         no_optimization: no_optimization
+       )
+
+       lf.sink_json(path, maintain_order)
+     end
+
+     # @private
+     def _set_sink_optimizations(
+       type_coercion: true,
+       predicate_pushdown: true,
+       projection_pushdown: true,
+       simplify_expression: true,
+       slice_pushdown: true,
+       no_optimization: false
+     )
+       if no_optimization
+         predicate_pushdown = false
+         projection_pushdown = false
+         slice_pushdown = false
+       end
+
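+       # The trailing positional flags appear to mirror the `optimization_toggle`
+       # order used in `collect` above: common_subplan_elimination, comm_subexpr_elim,
+       # allow_streaming, _eager. Streaming stays on, since sinks run in streaming mode.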
+       _ldf.optimization_toggle(
+         type_coercion,
+         predicate_pushdown,
+         projection_pushdown,
+         simplify_expression,
+         slice_pushdown,
+         false,
+         false,
+         true,
+         false
+       )
+     end
+
+     # Collect a small number of rows for debugging purposes.
+     #
+     # Fetch is like a {#collect} operation, but it overwrites the number of rows
+     # read by every scan operation. This is a utility that helps debug a query on a
+     # smaller number of rows.
+     #
+     # Note that fetch does not guarantee the final number of rows in the
+     # DataFrame: filters, join operations, and fewer rows being available in the
+     # scanned file can all reduce the final row count.
+     #
+     # @param n_rows [Integer]
+     #   Collect n_rows from the data sources.
+     # @param type_coercion [Boolean]
+     #   Run type coercion optimization.
+     # @param predicate_pushdown [Boolean]
+     #   Run predicate pushdown optimization.
+     # @param projection_pushdown [Boolean]
+     #   Run projection pushdown optimization.
+     # @param simplify_expression [Boolean]
+     #   Run simplify expressions optimization.
+     # @param string_cache [Boolean]
+     #   This argument is deprecated. Please set the string cache globally.
+     #   The argument will be ignored.
+     # @param no_optimization [Boolean]
+     #   Turn off optimizations.
+     # @param slice_pushdown [Boolean]
+     #   Slice pushdown optimization.
+     # @param common_subplan_elimination [Boolean]
+     #   Will try to cache branching subplans that occur on self-joins or unions.
+     # @param allow_streaming [Boolean]
+     #   Run parts of the query in a streaming fashion (this is in an alpha state).
+     #
+     # @return [DataFrame]
+     #
+     # @example
+     #   df = Polars::DataFrame.new(
+     #     {
+     #       "a" => ["a", "b", "a", "b", "b", "c"],
+     #       "b" => [1, 2, 3, 4, 5, 6],
+     #       "c" => [6, 5, 4, 3, 2, 1]
+     #     }
+     #   ).lazy
+     #   df.group_by("a", maintain_order: true).agg(Polars.all.sum).fetch(2)
+     #   # =>
+     #   # shape: (2, 3)
+     #   # ┌─────┬─────┬─────┐
+     #   # │ a   ┆ b   ┆ c   │
+     #   # │ --- ┆ --- ┆ --- │
+     #   # │ str ┆ i64 ┆ i64 │
+     #   # ╞═════╪═════╪═════╡
+     #   # │ a   ┆ 1   ┆ 6   │
+     #   # │ b   ┆ 2   ┆ 5   │
+     #   # └─────┴─────┴─────┘
+     def fetch(
+       n_rows = 500,
+       type_coercion: true,
+       predicate_pushdown: true,
+       projection_pushdown: true,
+       simplify_expression: true,
+       string_cache: false,
+       no_optimization: false,
+       slice_pushdown: true,
+       common_subplan_elimination: true,
+       comm_subexpr_elim: true,
+       allow_streaming: false
+     )
+       if no_optimization
+         predicate_pushdown = false
+         projection_pushdown = false
+         slice_pushdown = false
+         common_subplan_elimination = false
+       end
+
+       ldf = _ldf.optimization_toggle(
+         type_coercion,
+         predicate_pushdown,
+         projection_pushdown,
+         simplify_expression,
+         slice_pushdown,
+         common_subplan_elimination,
+         comm_subexpr_elim,
+         allow_streaming,
+         false
+       )
+       Utils.wrap_df(ldf.fetch(n_rows))
+     end
+
+     # Return lazy representation, i.e. itself.
+     #
+     # Useful for writing code that expects either a `DataFrame` or
+     # `LazyFrame`.
+     #
+     # @return [LazyFrame]
+     #
+     # @example
+     #   df = Polars::DataFrame.new(
+     #     {
+     #       "a" => [nil, 2, 3, 4],
+     #       "b" => [0.5, nil, 2.5, 13],
+     #       "c" => [true, true, false, nil]
+     #     }
+     #   )
+     #   df.lazy
+     def lazy
+       self
+     end
+
+     # Cache the result once the execution of the physical plan hits this node.
+     #
+     # @return [LazyFrame]
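+     #
+     # @example A sketch; caching helps when a subplan is reused, e.g. in a self-join:
+     #   lf = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy.cache
+     #   lf.join(lf, on: "a").collect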
+     def cache
+       _from_rbldf(_ldf.cache)
+     end
+
+     # TODO
+     # def cast
+     # end
+
+     # Create an empty copy of the current LazyFrame.
+     #
+     # The copy has an identical schema but no data.
+     #
+     # @return [LazyFrame]
+     #
+     # @example
+     #   lf = Polars::LazyFrame.new(
+     #     {
+     #       "a" => [nil, 2, 3, 4],
+     #       "b" => [0.5, nil, 2.5, 13],
+     #       "c" => [true, true, false, nil],
+     #     }
+     #   ).lazy
+     #   lf.clear.fetch
+     #   # =>
+     #   # shape: (0, 3)
+     #   # ┌─────┬─────┬──────┐
+     #   # │ a   ┆ b   ┆ c    │
+     #   # │ --- ┆ --- ┆ ---  │
+     #   # │ i64 ┆ f64 ┆ bool │
+     #   # ╞═════╪═════╪══════╡
+     #   # └─────┴─────┴──────┘
+     #
+     # @example
+     #   lf.clear(2).fetch
+     #   # =>
+     #   # shape: (2, 3)
+     #   # ┌──────┬──────┬──────┐
+     #   # │ a    ┆ b    ┆ c    │
+     #   # │ ---  ┆ ---  ┆ ---  │
+     #   # │ i64  ┆ f64  ┆ bool │
+     #   # ╞══════╪══════╪══════╡
+     #   # │ null ┆ null ┆ null │
+     #   # │ null ┆ null ┆ null │
+     #   # └──────┴──────┴──────┘
+     def clear(n = 0)
+       DataFrame.new(columns: schema).clear(n).lazy
+     end
+     alias_method :cleared, :clear
+
+     # Filter the rows in the DataFrame based on a predicate expression.
+     #
+     # @param predicate [Object]
+     #   Expression that evaluates to a boolean Series.
+     #
+     # @return [LazyFrame]
+     #
+     # @example Filter on one condition:
+     #   lf = Polars::DataFrame.new(
+     #     {
+     #       "foo" => [1, 2, 3],
+     #       "bar" => [6, 7, 8],
+     #       "ham" => ["a", "b", "c"]
+     #     }
+     #   ).lazy
+     #   lf.filter(Polars.col("foo") < 3).collect
+     #   # =>
+     #   # shape: (2, 3)
+     #   # ┌─────┬─────┬─────┐
+     #   # │ foo ┆ bar ┆ ham │
+     #   # │ --- ┆ --- ┆ --- │
+     #   # │ i64 ┆ i64 ┆ str │
+     #   # ╞═════╪═════╪═════╡
+     #   # │ 1   ┆ 6   ┆ a   │
+     #   # │ 2   ┆ 7   ┆ b   │
+     #   # └─────┴─────┴─────┘
+     #
+     # @example Filter on multiple conditions:
+     #   lf.filter((Polars.col("foo") < 3) & (Polars.col("ham") == "a")).collect
+     #   # =>
+     #   # shape: (1, 3)
+     #   # ┌─────┬─────┬─────┐
+     #   # │ foo ┆ bar ┆ ham │
+     #   # │ --- ┆ --- ┆ --- │
+     #   # │ i64 ┆ i64 ┆ str │
+     #   # ╞═════╪═════╪═════╡
+     #   # │ 1   ┆ 6   ┆ a   │
+     #   # └─────┴─────┴─────┘
+     def filter(predicate)
+       _from_rbldf(
+         _ldf.filter(
+           Utils.parse_into_expression(predicate, str_as_lit: false)
+         )
+       )
+     end
+
+     # Select columns from this DataFrame.
+     #
+     # @param exprs [Array]
+     #   Column(s) to select, specified as positional arguments.
+     #   Accepts expression input. Strings are parsed as column names,
+     #   other non-expression inputs are parsed as literals.
+     # @param named_exprs [Hash]
+     #   Additional columns to select, specified as keyword arguments.
+     #   The columns will be renamed to the keyword used.
+     #
+     # @return [LazyFrame]
+     #
+     # @example
+     #   df = Polars::DataFrame.new(
+     #     {
+     #       "foo" => [1, 2, 3],
+     #       "bar" => [6, 7, 8],
+     #       "ham" => ["a", "b", "c"],
+     #     }
+     #   ).lazy
+     #   df.select("foo").collect
+     #   # =>
+     #   # shape: (3, 1)
+     #   # ┌─────┐
+     #   # │ foo │
+     #   # │ --- │
+     #   # │ i64 │
+     #   # ╞═════╡
+     #   # │ 1   │
+     #   # │ 2   │
+     #   # │ 3   │
+     #   # └─────┘
+     #
+     # @example
+     #   df.select(["foo", "bar"]).collect
+     #   # =>
+     #   # shape: (3, 2)
+     #   # ┌─────┬─────┐
+     #   # │ foo ┆ bar │
+     #   # │ --- ┆ --- │
+     #   # │ i64 ┆ i64 │
+     #   # ╞═════╪═════╡
+     #   # │ 1   ┆ 6   │
+     #   # │ 2   ┆ 7   │
+     #   # │ 3   ┆ 8   │
+     #   # └─────┴─────┘
+     #
+     # @example
+     #   df.select(Polars.col("foo") + 1).collect
+     #   # =>
+     #   # shape: (3, 1)
+     #   # ┌─────┐
+     #   # │ foo │
+     #   # │ --- │
+     #   # │ i64 │
+     #   # ╞═════╡
+     #   # │ 2   │
+     #   # │ 3   │
+     #   # │ 4   │
+     #   # └─────┘
+     #
+     # @example
+     #   df.select([Polars.col("foo") + 1, Polars.col("bar") + 1]).collect
+     #   # =>
+     #   # shape: (3, 2)
+     #   # ┌─────┬─────┐
+     #   # │ foo ┆ bar │
+     #   # │ --- ┆ --- │
+     #   # │ i64 ┆ i64 │
+     #   # ╞═════╪═════╡
+     #   # │ 2   ┆ 7   │
+     #   # │ 3   ┆ 8   │
+     #   # │ 4   ┆ 9   │
+     #   # └─────┴─────┘
+     #
+     # @example
+     #   df.select(Polars.when(Polars.col("foo") > 2).then(10).otherwise(0)).collect
+     #   # =>
+     #   # shape: (3, 1)
+     #   # ┌─────────┐
+     #   # │ literal │
+     #   # │ ---     │
+     #   # │ i32     │
+     #   # ╞═════════╡
+     #   # │ 0       │
+     #   # │ 0       │
+     #   # │ 10      │
+     #   # └─────────┘
+     def select(*exprs, **named_exprs)
+       structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
+
+       rbexprs = Utils.parse_into_list_of_expressions(
+         *exprs, **named_exprs, __structify: structify
+       )
+       _from_rbldf(_ldf.select(rbexprs))
+     end
+
+     # Start a group by operation.
+     #
+     # @param by [Array]
+     #   Column(s) to group by.
+     # @param maintain_order [Boolean]
+     #   Make sure that the order of the groups remains consistent. This is more
+     #   expensive than a default group by.
+     # @param named_by [Hash]
+     #   Additional columns to group by, specified as keyword arguments.
+     #   The columns will be renamed to the keyword used.
+     # @return [LazyGroupBy]
+     #
+     # @example
+     #   df = Polars::DataFrame.new(
+     #     {
+     #       "a" => ["a", "b", "a", "b", "b", "c"],
+     #       "b" => [1, 2, 3, 4, 5, 6],
+     #       "c" => [6, 5, 4, 3, 2, 1]
+     #     }
+     #   ).lazy
+     #   df.group_by("a", maintain_order: true).agg(Polars.col("b").sum).collect
+     #   # =>
+     #   # shape: (3, 2)
+     #   # ┌─────┬─────┐
+     #   # │ a   ┆ b   │
+     #   # │ --- ┆ --- │
+     #   # │ str ┆ i64 │
+     #   # ╞═════╪═════╡
+     #   # │ a   ┆ 4   │
+     #   # │ b   ┆ 11  │
+     #   # │ c   ┆ 6   │
+     #   # └─────┴─────┘
+     def group_by(*by, maintain_order: false, **named_by)
+       exprs = Utils.parse_into_list_of_expressions(*by, **named_by)
+       lgb = _ldf.group_by(exprs, maintain_order)
+       LazyGroupBy.new(lgb)
+     end
+     alias_method :groupby, :group_by
+     alias_method :group, :group_by
+
+     # Create rolling groups based on a time column.
+     #
+     # Also works for index values of type `:i32` or `:i64`.
+     #
+     # Unlike `group_by_dynamic`, the windows are determined by the individual
+     # values rather than constant intervals. For constant intervals use
+     # `group_by_dynamic`.
+     #
+     # The `period` and `offset` arguments are created either from a timedelta, or
+     # by using the following string language:
+     #
+     # - 1ns (1 nanosecond)
+     # - 1us (1 microsecond)
+     # - 1ms (1 millisecond)
+     # - 1s (1 second)
+     # - 1m (1 minute)
+     # - 1h (1 hour)
+     # - 1d (1 day)
+     # - 1w (1 week)
+     # - 1mo (1 calendar month)
+     # - 1y (1 calendar year)
+     # - 1i (1 index count)
+     #
+     # Or combine them:
+     # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+     #
+     # In case of a group_by_rolling on an integer column, the windows are defined by:
+     #
+     # - "1i" # length 1
+     # - "10i" # length 10
+     #
+     # @param index_column [Object]
+     #   Column used to group based on the time window.
+     #   Often of type Date/Datetime.
+     #   This column must be sorted in ascending order; if not, the output will
+     #   not make sense.
+     #
+     #   In case of a rolling group by on indices, dtype needs to be one of
+     #   `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
+     #   performance matters use an `:i64` column.
+     # @param period [Object]
+     #   Length of the window.
+     # @param offset [Object]
+     #   Offset of the window. Default is -period.
+     # @param closed ["right", "left", "both", "none"]
+     #   Define whether the temporal window interval is closed or not.
+     # @param by [Object]
+     #   Also group by this column/these columns.
+     #
+     # @return [LazyFrame]
+     #
+     # @example
+     #   dates = [
+     #     "2020-01-01 13:45:48",
+     #     "2020-01-01 16:42:13",
+     #     "2020-01-01 16:45:09",
+     #     "2020-01-02 18:12:48",
+     #     "2020-01-03 19:45:32",
+     #     "2020-01-08 23:16:43"
+     #   ]
+     #   df = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
+     #     Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
+     #   )
+     #   df.rolling(index_column: "dt", period: "2d").agg(
+     #     [
+     #       Polars.sum("a").alias("sum_a"),
+     #       Polars.min("a").alias("min_a"),
+     #       Polars.max("a").alias("max_a")
+     #     ]
+     #   ).collect
+     #   # =>
+     #   # shape: (6, 4)
+     #   # ┌─────────────────────┬───────┬───────┬───────┐
+     #   # │ dt                  ┆ sum_a ┆ min_a ┆ max_a │
+     #   # │ ---                 ┆ ---   ┆ ---   ┆ ---   │
+     #   # │ datetime[μs]        ┆ i64   ┆ i64   ┆ i64   │
+     #   # ╞═════════════════════╪═══════╪═══════╪═══════╡
+     #   # │ 2020-01-01 13:45:48 ┆ 3     ┆ 3     ┆ 3     │
+     #   # │ 2020-01-01 16:42:13 ┆ 10    ┆ 3     ┆ 7     │
+     #   # │ 2020-01-01 16:45:09 ┆ 15    ┆ 3     ┆ 7     │
+     #   # │ 2020-01-02 18:12:48 ┆ 24    ┆ 3     ┆ 9     │
+     #   # │ 2020-01-03 19:45:32 ┆ 11    ┆ 2     ┆ 9     │
+     #   # │ 2020-01-08 23:16:43 ┆ 1     ┆ 1     ┆ 1     │
+     #   # └─────────────────────┴───────┴───────┴───────┘
+     def rolling(
+       index_column:,
+       period:,
+       offset: nil,
+       closed: "right",
+       by: nil
+     )
+       index_column = Utils.parse_into_expression(index_column)
+       if offset.nil?
+         offset = Utils.negate_duration_string(Utils.parse_as_duration_string(period))
+       end
+
+       rbexprs_by = (
+         !by.nil? ? Utils.parse_into_list_of_expressions(by) : []
+       )
+       period = Utils.parse_as_duration_string(period)
+       offset = Utils.parse_as_duration_string(offset)
+
+       lgb = _ldf.rolling(index_column, period, offset, closed, rbexprs_by)
+       LazyGroupBy.new(lgb)
+     end
+     alias_method :group_by_rolling, :rolling
+     alias_method :groupby_rolling, :rolling
+
+     # Group based on a time value (or index value of type `:i32`, `:i64`).
+     #
+     # Time windows are calculated and rows are assigned to windows. Unlike a
+     # normal group by, a row can be a member of multiple groups. The time/index
+     # window could be seen as a rolling window, with a window size determined by
+     # dates/times/values instead of slots in the DataFrame.
+     #
+     # A window is defined by:
+     #
+     # - every: interval of the window
+     # - period: length of the window
+     # - offset: offset of the window
+     #
+     # The `every`, `period` and `offset` arguments are created with
+     # the following string language:
+     #
+     # - 1ns (1 nanosecond)
+     # - 1us (1 microsecond)
+     # - 1ms (1 millisecond)
+     # - 1s (1 second)
+     # - 1m (1 minute)
+     # - 1h (1 hour)
+     # - 1d (1 day)
+     # - 1w (1 week)
+     # - 1mo (1 calendar month)
+     # - 1y (1 calendar year)
+     # - 1i (1 index count)
+     #
+     # Or combine them:
+     # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+     #
+     # In case of a group_by_dynamic on an integer column, the windows are defined by:
+     #
+     # - "1i" # length 1
+     # - "10i" # length 10
+     #
+     # @param index_column [Object]
+     #   Column used to group based on the time window.
+     #   Often of type Date/Datetime.
+     #   This column must be sorted in ascending order; if not, the output will
+     #   not make sense.
+     #
+     #   In case of a dynamic group by on indices, dtype needs to be one of
+     #   `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
+     #   performance matters use an `:i64` column.
+     # @param every [Object]
+     #   Interval of the window.
+     # @param period [Object]
+     #   Length of the window; if nil, it is equal to `every`.
+     # @param offset [Object]
+     #   Offset of the window; if nil and `period` is nil, it will be equal to
+     #   negative `every`.
+     # @param truncate [Boolean]
+     #   Truncate the time value to the window lower bound.
+     # @param include_boundaries [Boolean]
+     #   Add the lower and upper bound of the window to the "_lower_boundary" and
+     #   "_upper_boundary" columns. This will impact performance because it's
+     #   harder to parallelize.
+     # @param closed ["right", "left", "both", "none"]
+     #   Define whether the temporal window interval is closed or not.
+     # @param by [Object]
+     #   Also group by this column/these columns.
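+     # @param label ["left", "right", "datapoint"]
+     #   Define which label to use for the window:
+     #
+     #   - "left": lower boundary of the window
+     #   - "right": upper boundary of the window
+     #   - "datapoint": the first value of the index column in the given window
+     # @param start_by ["window", "datapoint"]
+     #   The strategy to determine the start of the first window by.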
+     #
+     # @return [DataFrame]
+     #
+     # @example
+     #   df = Polars::DataFrame.new(
+     #     {
+     #       "time" => Polars.datetime_range(
+     #         DateTime.new(2021, 12, 16),
+     #         DateTime.new(2021, 12, 16, 3),
+     #         "30m",
+     #         time_unit: "us",
+     #         eager: true
+     #       ),
+     #       "n" => 0..6
+     #     }
+     #   )
+     #   # =>
+     #   # shape: (7, 2)
+     #   # ┌─────────────────────┬─────┐
+     #   # │ time                ┆ n   │
+     #   # │ ---                 ┆ --- │
+     #   # │ datetime[μs]        ┆ i64 │
+     #   # ╞═════════════════════╪═════╡
+     #   # │ 2021-12-16 00:00:00 ┆ 0   │
+     #   # │ 2021-12-16 00:30:00 ┆ 1   │
+     #   # │ 2021-12-16 01:00:00 ┆ 2   │
+     #   # │ 2021-12-16 01:30:00 ┆ 3   │
+     #   # │ 2021-12-16 02:00:00 ┆ 4   │
+     #   # │ 2021-12-16 02:30:00 ┆ 5   │
+     #   # │ 2021-12-16 03:00:00 ┆ 6   │
+     #   # └─────────────────────┴─────┘
+     #
+     # @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
+     #   df.group_by_dynamic("time", every: "1h", closed: "right").agg(
+     #     [
+     #       Polars.col("time").min.alias("time_min"),
+     #       Polars.col("time").max.alias("time_max")
+     #     ]
+     #   )
+     #   # =>
+     #   # shape: (4, 3)
+     #   # ┌─────────────────────┬─────────────────────┬─────────────────────┐
+     #   # │ time                ┆ time_min            ┆ time_max            │
+     #   # │ ---                 ┆ ---                 ┆ ---                 │
+     #   # │ datetime[μs]        ┆ datetime[μs]        ┆ datetime[μs]        │
+     #   # ╞═════════════════════╪═════════════════════╪═════════════════════╡
+     #   # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │
+     #   # │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │
+     #   # │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │
+     #   # │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │
+     #   # └─────────────────────┴─────────────────────┴─────────────────────┘
+     #
+     # @example The window boundaries can also be added to the aggregation result.
+     #   df.group_by_dynamic(
+     #     "time", every: "1h", include_boundaries: true, closed: "right"
+     #   ).agg([Polars.col("time").count.alias("time_count")])
+     #   # =>
+     #   # shape: (4, 4)
+     #   # ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
+     #   # │ _lower_boundary     ┆ _upper_boundary     ┆ time                ┆ time_count │
+     #   # │ ---                 ┆ ---                 ┆ ---                 ┆ ---        │
+     #   # │ datetime[μs]        ┆ datetime[μs]        ┆ datetime[μs]        ┆ u32        │
+     #   # ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
+     #   # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1          │
+     #   # │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2          │
+     #   # │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2          │
+     #   # │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2          │
+     #   # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
+     #
+     # @example When closed="left", should not include right end of interval.
+     #   df.group_by_dynamic("time", every: "1h", closed: "left").agg(
+     #     [
+     #       Polars.col("time").count.alias("time_count"),
+     #       Polars.col("time").alias("time_agg_list")
+     #     ]
+     #   )
+     #   # =>
+     #   # shape: (4, 3)
+     #   # ┌─────────────────────┬────────────┬─────────────────────────────────┐
+     #   # │ time                ┆ time_count ┆ time_agg_list                   │
+     #   # │ ---                 ┆ ---        ┆ ---                             │
+     #   # │ datetime[μs]        ┆ u32        ┆ list[datetime[μs]]              │
+     #   # ╞═════════════════════╪════════════╪═════════════════════════════════╡
+     #   # │ 2021-12-16 00:00:00 ┆ 2          ┆ [2021-12-16 00:00:00, 2021-12-… │
+     #   # │ 2021-12-16 01:00:00 ┆ 2          ┆ [2021-12-16 01:00:00, 2021-12-… │
+     #   # │ 2021-12-16 02:00:00 ┆ 2          ┆ [2021-12-16 02:00:00, 2021-12-… │
+     #   # │ 2021-12-16 03:00:00 ┆ 1          ┆ [2021-12-16 03:00:00]           │
+     #   # └─────────────────────┴────────────┴─────────────────────────────────┘
+     #
+     # @example When closed="both" the time values at the window boundaries belong to 2 groups.
+     #   df.group_by_dynamic("time", every: "1h", closed: "both").agg(
+     #     [Polars.col("time").count.alias("time_count")]
+     #   )
+     #   # =>
+     #   # shape: (5, 2)
+     #   # ┌─────────────────────┬────────────┐
+     #   # │ time                ┆ time_count │
+     #   # │ ---                 ┆ ---        │
+     #   # │ datetime[μs]        ┆ u32        │
+     #   # ╞═════════════════════╪════════════╡
+     #   # │ 2021-12-15 23:00:00 ┆ 1          │
+     #   # │ 2021-12-16 00:00:00 ┆ 3          │
+     #   # │ 2021-12-16 01:00:00 ┆ 3          │
+     #   # │ 2021-12-16 02:00:00 ┆ 3          │
+     #   # │ 2021-12-16 03:00:00 ┆ 1          │
+     #   # └─────────────────────┴────────────┘
+     #
+     # @example Dynamic group bys can also be combined with grouping on normal keys.
+     #   df = Polars::DataFrame.new(
+     #     {
+     #       "time" => Polars.datetime_range(
+     #         DateTime.new(2021, 12, 16),
+     #         DateTime.new(2021, 12, 16, 3),
+     #         "30m",
+     #         time_unit: "us",
+     #         eager: true
+     #       ),
+     #       "groups" => ["a", "a", "a", "b", "b", "a", "a"]
+     #     }
+     #   )
+     #   df.group_by_dynamic(
+     #     "time",
+     #     every: "1h",
+     #     closed: "both",
+     #     by: "groups",
+     #     include_boundaries: true
+     #   ).agg([Polars.col("time").count.alias("time_count")])
+     #   # =>
+     #   # shape: (7, 5)
+     #   # ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
+     #   # │ groups ┆ _lower_boundary     ┆ _upper_boundary     ┆ time                ┆ time_count │
+     #   # │ ---    ┆ ---                 ┆ ---                 ┆ ---                 ┆ ---        │
+     #   # │ str    ┆ datetime[μs]        ┆ datetime[μs]        ┆ datetime[μs]        ┆ u32        │
+     #   # ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
+     #   # │ a      ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1          │
+     #   # │ a      ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3          │
+     #   # │ a      ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1          │
+     #   # │ a      ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2          │
+     #   # │ a      ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1          │
+     #   # │ b      ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2          │
+     #   # │ b      ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1          │
+     #   # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
+     #
+     # @example Dynamic group by on an index column.
+     #   df = Polars::DataFrame.new(
+     #     {
+     #       "idx" => Polars.arange(0, 6, eager: true),
+     #       "A" => ["A", "A", "B", "B", "B", "C"]
+     #     }
+     #   )
+     #   df.group_by_dynamic(
+     #     "idx",
+     #     every: "2i",
+     #     period: "3i",
+     #     include_boundaries: true,
+     #     closed: "right"
+     #   ).agg(Polars.col("A").alias("A_agg_list"))
+     #   # =>
+     #   # shape: (4, 4)
+     #   # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
+     #   # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list      │
+     #   # │ ---             ┆ ---             ┆ --- ┆ ---             │
+     #   # │ i64             ┆ i64             ┆ i64 ┆ list[str]       │
+     #   # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
+     #   # │ -2              ┆ 1               ┆ -2  ┆ ["A", "A"]      │
+     #   # │ 0               ┆ 3               ┆ 0   ┆ ["A", "B", "B"] │
+     #   # │ 2               ┆ 5               ┆ 2   ┆ ["B", "B", "C"] │
+     #   # │ 4               ┆ 7               ┆ 4   ┆ ["C"]           │
+     #   # └─────────────────┴─────────────────┴─────┴─────────────────┘
+     def group_by_dynamic(
+       index_column,
+       every:,
+       period: nil,
+       offset: nil,
+       truncate: nil,
+       include_boundaries: false,
+       closed: "left",
+       label: "left",
+       by: nil,
+       start_by: "window"
+     )
+       if !truncate.nil?
+         label = truncate ? "left" : "datapoint"
+       end
+
+       index_column = Utils.parse_into_expression(index_column, str_as_lit: false)
+       if offset.nil?
+         offset = period.nil? ? "-#{every}" : "0ns"
+       end
+
+       if period.nil?
+         period = every
+       end
+
+       period = Utils.parse_as_duration_string(period)
+       offset = Utils.parse_as_duration_string(offset)
+       every = Utils.parse_as_duration_string(every)
+
+       rbexprs_by = by.nil? ? [] : Utils.parse_into_list_of_expressions(by)
+       lgb = _ldf.group_by_dynamic(
+         index_column,
+         every,
+         period,
+         offset,
+         label,
+         include_boundaries,
+         closed,
+         rbexprs_by,
+         start_by
+       )
+       LazyGroupBy.new(lgb)
+     end
+     alias_method :groupby_dynamic, :group_by_dynamic
+
+     # Perform an asof join.
+     #
+     # This is similar to a left-join except that we match on nearest key rather than
+     # equal keys.
+     #
+     # Both DataFrames must be sorted by the join_asof key.
+     #
+     # For each row in the left DataFrame:
+     #
+     # - A "backward" search selects the last row in the right DataFrame whose 'on' key is less than or equal to the left's key.
+     # - A "forward" search selects the first row in the right DataFrame whose 'on' key is greater than or equal to the left's key.
+     #
+     # The default is "backward".
+     #
+     # @param other [LazyFrame]
+     #   Lazy DataFrame to join with.
+     # @param left_on [String]
+     #   Join column of the left DataFrame.
+     # @param right_on [String]
+     #   Join column of the right DataFrame.
+     # @param on [String]
+     #   Join column of both DataFrames. If set, `left_on` and `right_on` should
+     #   be nil.
+     # @param by [Object]
+     #   Join on these columns before doing asof join.
+     # @param by_left [Object]
+     #   Join on these columns before doing asof join.
+     # @param by_right [Object]
+     #   Join on these columns before doing asof join.
+     # @param strategy ["backward", "forward"]
+     #   Join strategy.
+     # @param suffix [String]
+     #   Suffix to append to columns with a duplicate name.
+     # @param tolerance [Object]
+     #   Numeric tolerance. By setting this, the join will only be done if the
+     #   nearest keys are within this distance. If an asof join is done on columns
+     #   of dtype "Date", "Datetime", "Duration" or "Time", you can use the
+     #   following string language:
+     #
+     #   - 1ns (1 nanosecond)
+     #   - 1us (1 microsecond)
+     #   - 1ms (1 millisecond)
+     #   - 1s (1 second)
+     #   - 1m (1 minute)
+     #   - 1h (1 hour)
+     #   - 1d (1 day)
+     #   - 1w (1 week)
+     #   - 1mo (1 calendar month)
+     #   - 1y (1 calendar year)
+     #   - 1i (1 index count)
+     #
+     #   Or combine them:
+     #   "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+     #
+     # @param allow_parallel [Boolean]
+     #   Allow the physical plan to optionally evaluate the computation of both
+     #   DataFrames up to the join in parallel.
+     # @param force_parallel [Boolean]
+     #   Force the physical plan to evaluate the computation of both DataFrames up to
+     #   the join in parallel.
+     #
+     # @return [LazyFrame]
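+     #
+     # @example A minimal sketch (illustrative data; both frames are pre-sorted by "time"):
+     #   quotes = Polars::DataFrame.new({"time" => [1, 2, 4], "quote" => [100, 101, 102]}).lazy
+     #   trades = Polars::DataFrame.new({"time" => [2, 3, 5], "size" => [9, 8, 7]}).lazy
+     #   trades.join_asof(quotes, on: "time").collect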
+     def join_asof(
+       other,
+       left_on: nil,
+       right_on: nil,
+       on: nil,
+       by_left: nil,
+       by_right: nil,
+       by: nil,
+       strategy: "backward",
+       suffix: "_right",
+       tolerance: nil,
+       allow_parallel: true,
+       force_parallel: false
+     )
+       if !other.is_a?(LazyFrame)
+         raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
+       end
+
+       if on.is_a?(::String)
+         left_on = on
+         right_on = on
+       end
+
+       if left_on.nil? || right_on.nil?
+         raise ArgumentError, "You should pass the column to join on as an argument."
+       end
+
+       if by_left.is_a?(::String) || by_left.is_a?(Expr)
+         by_left_ = [by_left]
+       else
+         by_left_ = by_left
+       end
+
+       if by_right.is_a?(::String) || by_right.is_a?(Expr)
+         by_right_ = [by_right]
+       else
+         by_right_ = by_right
+       end
+
+       if by.is_a?(::String)
+         by_left_ = [by]
+         by_right_ = [by]
+       elsif by.is_a?(::Array)
+         by_left_ = by
+         by_right_ = by
+       end
+
+       tolerance_str = nil
+       tolerance_num = nil
+       if tolerance.is_a?(::String)
+         tolerance_str = tolerance
+       else
+         tolerance_num = tolerance
+       end
+
+       _from_rbldf(
+         _ldf.join_asof(
+           other._ldf,
+           Polars.col(left_on)._rbexpr,
+           Polars.col(right_on)._rbexpr,
+           by_left_,
+           by_right_,
+           allow_parallel,
+           force_parallel,
+           suffix,
+           strategy,
+           tolerance_num,
+           tolerance_str
+         )
+       )
+     end
+
+     # Add a join operation to the Logical Plan.
+     #
+     # @param other [LazyFrame]
+     #   Lazy DataFrame to join with.
+     # @param left_on [Object]
+     #   Join column of the left DataFrame.
+     # @param right_on [Object]
+     #   Join column of the right DataFrame.
+     # @param on [Object]
+     #   Join column of both DataFrames. If set, `left_on` and `right_on` should
+     #   be nil.
+     # @param how ["inner", "left", "full", "semi", "anti", "cross"]
+     #   Join strategy.
+     # @param suffix [String]
+     #   Suffix to append to columns with a duplicate name.
+     # @param join_nulls [Boolean]
+     #   Join on null values. By default null values will never produce matches.
+     # @param allow_parallel [Boolean]
+     #   Allow the physical plan to optionally evaluate the computation of both
+     #   DataFrames up to the join in parallel.
+     # @param force_parallel [Boolean]
+     #   Force the physical plan to evaluate the computation of both DataFrames up to
+     #   the join in parallel.
+     #
+     # @return [LazyFrame]
+     #
+     # @example
+     #   df = Polars::DataFrame.new(
+     #     {
+     #       "foo" => [1, 2, 3],
+     #       "bar" => [6.0, 7.0, 8.0],
+     #       "ham" => ["a", "b", "c"]
+     #     }
+     #   ).lazy
+     #   other_df = Polars::DataFrame.new(
+     #     {
+     #       "apple" => ["x", "y", "z"],
+     #       "ham" => ["a", "b", "d"]
+     #     }
+     #   ).lazy
+     #   df.join(other_df, on: "ham").collect
+     #   # =>
+     #   # shape: (2, 4)
+     #   # ┌─────┬─────┬─────┬───────┐
+     #   # │ foo ┆ bar ┆ ham ┆ apple │
+     #   # │ --- ┆ --- ┆ --- ┆ ---   │
+     #   # │ i64 ┆ f64 ┆ str ┆ str   │
+     #   # ╞═════╪═════╪═════╪═══════╡
+     #   # │ 1   ┆ 6.0 ┆ a   ┆ x     │
+     #   # │ 2   ┆ 7.0 ┆ b   ┆ y     │
+     #   # └─────┴─────┴─────┴───────┘
+     #
+     # @example
+     #   df.join(other_df, on: "ham", how: "full").collect
+     #   # =>
+     #   # shape: (4, 5)
+     #   # ┌──────┬──────┬──────┬───────┬───────────┐
+     #   # │ foo  ┆ bar  ┆ ham  ┆ apple ┆ ham_right │
+     #   # │ ---  ┆ ---  ┆ ---  ┆ ---   ┆ ---       │
+     #   # │ i64  ┆ f64  ┆ str  ┆ str   ┆ str       │
+     #   # ╞══════╪══════╪══════╪═══════╪═══════════╡
+     #   # │ 1    ┆ 6.0  ┆ a    ┆ x     ┆ a         │
+     #   # │ 2    ┆ 7.0  ┆ b    ┆ y     ┆ b         │
+     #   # │ null ┆ null ┆ null ┆ z     ┆ d         │
+     #   # │ 3    ┆ 8.0  ┆ c    ┆ null  ┆ null      │
+     #   # └──────┴──────┴──────┴───────┴───────────┘
+     #
+     # @example
+     #   df.join(other_df, on: "ham", how: "left").collect
+     #   # =>
+     #   # shape: (3, 4)
+     #   # ┌─────┬─────┬─────┬───────┐
+     #   # │ foo ┆ bar ┆ ham ┆ apple │
+     #   # │ --- ┆ --- ┆ --- ┆ ---   │
+     #   # │ i64 ┆ f64 ┆ str ┆ str   │
+     #   # ╞═════╪═════╪═════╪═══════╡
+     #   # │ 1   ┆ 6.0 ┆ a   ┆ x     │
+     #   # │ 2   ┆ 7.0 ┆ b   ┆ y     │
+     #   # │ 3   ┆ 8.0 ┆ c   ┆ null  │
+     #   # └─────┴─────┴─────┴───────┘
+     #
+     # @example
+     #   df.join(other_df, on: "ham", how: "semi").collect
+     #   # =>
+     #   # shape: (2, 3)
+     #   # ┌─────┬─────┬─────┐
+     #   # │ foo ┆ bar ┆ ham │
+     #   # │ --- ┆ --- ┆ --- │
+     #   # │ i64 ┆ f64 ┆ str │
+     #   # ╞═════╪═════╪═════╡
+     #   # │ 1   ┆ 6.0 ┆ a   │
+     #   # │ 2   ┆ 7.0 ┆ b   │
+     #   # └─────┴─────┴─────┘
+     #
+     # @example
+     #   df.join(other_df, on: "ham", how: "anti").collect
+     #   # =>
+     #   # shape: (1, 3)
+     #   # ┌─────┬─────┬─────┐
+     #   # │ foo ┆ bar ┆ ham │
+     #   # │ --- ┆ --- ┆ --- │
+     #   # │ i64 ┆ f64 ┆ str │
+     #   # ╞═════╪═════╪═════╡
+     #   # │ 3   ┆ 8.0 ┆ c   │
+     #   # └─────┴─────┴─────┘
+     def join(
+       other,
+       left_on: nil,
+       right_on: nil,
+       on: nil,
+       how: "inner",
+       suffix: "_right",
+       join_nulls: false,
+       allow_parallel: true,
+       force_parallel: false
+     )
+       if !other.is_a?(LazyFrame)
+         raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
+       end
+
+ if how == "outer"
1718
+ how = "full"
1719
+ elsif how == "cross"
1720
+ return _from_rbldf(
1721
+ _ldf.join(
1722
+ other._ldf, [], [], allow_parallel, join_nulls, force_parallel, how, suffix
1723
+ )
1724
+ )
1725
+ end
1726
+
1727
+ if !on.nil?
1728
+ rbexprs = Utils.parse_into_list_of_expressions(on)
1729
+ rbexprs_left = rbexprs
1730
+ rbexprs_right = rbexprs
1731
+ elsif !left_on.nil? && !right_on.nil?
1732
+ rbexprs_left = Utils.parse_into_list_of_expressions(left_on)
1733
+ rbexprs_right = Utils.parse_into_list_of_expressions(right_on)
1734
+ else
1735
+ raise ArgumentError, "must specify `on` OR `left_on` and `right_on`"
1736
+ end
1737
+
1738
+ _from_rbldf(
1739
+ self._ldf.join(
1740
+ other._ldf,
1741
+ rbexprs_left,
1742
+ rbexprs_right,
1743
+ allow_parallel,
1744
+ force_parallel,
1745
+ join_nulls,
1746
+ how,
1747
+ suffix,
1748
+ )
1749
+ )
1750
+ end
1751
+
+ # Add or overwrite multiple columns in a DataFrame.
+ #
+ # @param exprs [Object]
+ #   List of Expressions that evaluate to columns.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ #   ldf = Polars::DataFrame.new(
+ #     {
+ #       "a" => [1, 2, 3, 4],
+ #       "b" => [0.5, 4, 10, 13],
+ #       "c" => [true, true, false, true]
+ #     }
+ #   ).lazy
+ #   ldf.with_columns(
+ #     [
+ #       (Polars.col("a") ** 2).alias("a^2"),
+ #       (Polars.col("b") / 2).alias("b/2"),
+ #       (Polars.col("c").is_not).alias("not c")
+ #     ]
+ #   ).collect
+ #   # =>
+ #   # shape: (4, 6)
+ #   # ┌─────┬──────┬───────┬─────┬──────┬───────┐
+ #   # │ a   ┆ b    ┆ c     ┆ a^2 ┆ b/2  ┆ not c │
+ #   # │ --- ┆ ---  ┆ ---   ┆ --- ┆ ---  ┆ ---   │
+ #   # │ i64 ┆ f64  ┆ bool  ┆ i64 ┆ f64  ┆ bool  │
+ #   # ╞═════╪══════╪═══════╪═════╪══════╪═══════╡
+ #   # │ 1   ┆ 0.5  ┆ true  ┆ 1   ┆ 0.25 ┆ false │
+ #   # │ 2   ┆ 4.0  ┆ true  ┆ 4   ┆ 2.0  ┆ false │
+ #   # │ 3   ┆ 10.0 ┆ false ┆ 9   ┆ 5.0  ┆ true  │
+ #   # │ 4   ┆ 13.0 ┆ true  ┆ 16  ┆ 6.5  ┆ false │
+ #   # └─────┴──────┴───────┴─────┴──────┴───────┘
+ def with_columns(*exprs, **named_exprs)
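+   # POLARS_AUTO_STRUCTIFY opts in to automatic struct creation for
+   # multi-output expressions; any value other than "0" enables it.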
+   structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
+
+   rbexprs = Utils.parse_into_list_of_expressions(*exprs, **named_exprs, __structify: structify)
+
+   _from_rbldf(_ldf.with_columns(rbexprs))
+ end
+
+ # Add an external context to the computation graph.
+ #
+ # This allows expressions to also access columns from DataFrames
+ # that are not part of this one.
+ #
+ # @param other [Object]
+ #   LazyFrame or list of LazyFrames to use as an external context.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ #   df_a = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => ["a", "c", nil]}).lazy
+ #   df_other = Polars::DataFrame.new({"c" => ["foo", "ham"]})
+ #   (
+ #     df_a.with_context(df_other.lazy).select(
+ #       [Polars.col("b") + Polars.col("c").first]
+ #     )
+ #   ).collect
+ #   # =>
+ #   # shape: (3, 1)
+ #   # ┌──────┐
+ #   # │ b    │
+ #   # │ ---  │
+ #   # │ str  │
+ #   # ╞══════╡
+ #   # │ afoo │
+ #   # │ cfoo │
+ #   # │ null │
+ #   # └──────┘
+ def with_context(other)
+   if !other.is_a?(::Array)
+     other = [other]
+   end
+
+   _from_rbldf(_ldf.with_context(other.map(&:_ldf)))
+ end
+
+ # Add or overwrite a column in a DataFrame.
+ #
+ # @param column [Object]
+ #   Expression that evaluates to column or a Series to use.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ #   df = Polars::DataFrame.new(
+ #     {
+ #       "a" => [1, 3, 5],
+ #       "b" => [2, 4, 6]
+ #     }
+ #   ).lazy
+ #   df.with_column((Polars.col("b") ** 2).alias("b_squared")).collect
+ #   # =>
+ #   # shape: (3, 3)
+ #   # ┌─────┬─────┬───────────┐
+ #   # │ a   ┆ b   ┆ b_squared │
+ #   # │ --- ┆ --- ┆ ---       │
+ #   # │ i64 ┆ i64 ┆ i64       │
+ #   # ╞═════╪═════╪═══════════╡
+ #   # │ 1   ┆ 2   ┆ 4         │
+ #   # │ 3   ┆ 4   ┆ 16        │
+ #   # │ 5   ┆ 6   ┆ 36        │
+ #   # └─────┴─────┴───────────┘
+ #
+ # @example
+ #   df.with_column(Polars.col("a") ** 2).collect
+ #   # =>
+ #   # shape: (3, 2)
+ #   # ┌─────┬─────┐
+ #   # │ a   ┆ b   │
+ #   # │ --- ┆ --- │
+ #   # │ i64 ┆ i64 │
+ #   # ╞═════╪═════╡
+ #   # │ 1   ┆ 2   │
+ #   # │ 9   ┆ 4   │
+ #   # │ 25  ┆ 6   │
+ #   # └─────┴─────┘
+ def with_column(column)
+   with_columns([column])
+ end
+
+ # Remove one or multiple columns from a DataFrame.
+ #
+ # @param columns [Object]
+ #   - Name of the column that should be removed.
+ #   - List of column names.
+ #
+ # @return [LazyFrame]
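+ #
+ # @example Illustrative sketch (hypothetical frame)
+ #   lf = Polars::LazyFrame.new(
+ #     {
+ #       "foo" => [1, 2, 3],
+ #       "bar" => [6.0, 7.0, 8.0],
+ #       "ham" => ["a", "b", "c"]
+ #     }
+ #   )
+ #   lf.drop("ham").collect
+ #   # =>
+ #   # shape: (3, 2)
+ #   # ┌─────┬─────┐
+ #   # │ foo ┆ bar │
+ #   # │ --- ┆ --- │
+ #   # │ i64 ┆ f64 │
+ #   # ╞═════╪═════╡
+ #   # │ 1   ┆ 6.0 │
+ #   # │ 2   ┆ 7.0 │
+ #   # │ 3   ┆ 8.0 │
+ #   # └─────┴─────┘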
+ def drop(*columns)
+   drop_cols = Utils._expand_selectors(self, *columns)
+   _from_rbldf(_ldf.drop(drop_cols))
+ end
+
+ # Rename column names.
+ #
+ # @param mapping [Hash]
+ #   Key-value pairs that map from old name to new name.
+ #
+ # @return [LazyFrame]
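+ #
+ # @example Illustrative sketch (hypothetical frame)
+ #   lf = Polars::LazyFrame.new({"foo" => [1, 2, 3], "bar" => [6, 7, 8]})
+ #   lf.rename({"foo" => "apple"}).collect
+ #   # =>
+ #   # shape: (3, 2)
+ #   # ┌───────┬─────┐
+ #   # │ apple ┆ bar │
+ #   # │ ---   ┆ --- │
+ #   # │ i64   ┆ i64 │
+ #   # ╞═══════╪═════╡
+ #   # │ 1     ┆ 6   │
+ #   # │ 2     ┆ 7   │
+ #   # │ 3     ┆ 8   │
+ #   # └───────┴─────┘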
+ def rename(mapping)
+   existing = mapping.keys
+   _new = mapping.values
+   _from_rbldf(_ldf.rename(existing, _new))
+ end
+
+ # Reverse the DataFrame.
+ #
+ # @return [LazyFrame]
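+ #
+ # @example Illustrative sketch (hypothetical frame)
+ #   lf = Polars::LazyFrame.new({"key" => ["a", "b", "c"], "val" => [1, 2, 3]})
+ #   lf.reverse.collect
+ #   # =>
+ #   # shape: (3, 2)
+ #   # ┌─────┬─────┐
+ #   # │ key ┆ val │
+ #   # │ --- ┆ --- │
+ #   # │ str ┆ i64 │
+ #   # ╞═════╪═════╡
+ #   # │ c   ┆ 3   │
+ #   # │ b   ┆ 2   │
+ #   # │ a   ┆ 1   │
+ #   # └─────┴─────┘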
+ def reverse
+   _from_rbldf(_ldf.reverse)
+ end
+
+ # Shift the values by a given period.
+ #
+ # @param n [Integer]
+ #   Number of places to shift (may be negative).
+ # @param fill_value [Object]
+ #   Fill the resulting null values with this value.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ #   df = Polars::DataFrame.new(
+ #     {
+ #       "a" => [1, 3, 5],
+ #       "b" => [2, 4, 6]
+ #     }
+ #   ).lazy
+ #   df.shift(1).collect
+ #   # =>
+ #   # shape: (3, 2)
+ #   # ┌──────┬──────┐
+ #   # │ a    ┆ b    │
+ #   # │ ---  ┆ ---  │
+ #   # │ i64  ┆ i64  │
+ #   # ╞══════╪══════╡
+ #   # │ null ┆ null │
+ #   # │ 1    ┆ 2    │
+ #   # │ 3    ┆ 4    │
+ #   # └──────┴──────┘
+ #
+ # @example
+ #   df.shift(-1).collect
+ #   # =>
+ #   # shape: (3, 2)
+ #   # ┌──────┬──────┐
+ #   # │ a    ┆ b    │
+ #   # │ ---  ┆ ---  │
+ #   # │ i64  ┆ i64  │
+ #   # ╞══════╪══════╡
+ #   # │ 3    ┆ 4    │
+ #   # │ 5    ┆ 6    │
+ #   # │ null ┆ null │
+ #   # └──────┴──────┘
+ def shift(n, fill_value: nil)
+   if !fill_value.nil?
+     fill_value = Utils.parse_into_expression(fill_value, str_as_lit: true)
+   end
+   n = Utils.parse_into_expression(n)
+   _from_rbldf(_ldf.shift(n, fill_value))
+ end
+
+ # Shift the values by a given period and fill the resulting null values.
+ #
+ # @param periods [Integer]
+ #   Number of places to shift (may be negative).
+ # @param fill_value [Object]
+ #   Fill `nil` values with the result of this expression.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ #   df = Polars::DataFrame.new(
+ #     {
+ #       "a" => [1, 3, 5],
+ #       "b" => [2, 4, 6]
+ #     }
+ #   ).lazy
+ #   df.shift_and_fill(1, 0).collect
+ #   # =>
+ #   # shape: (3, 2)
+ #   # ┌─────┬─────┐
+ #   # │ a   ┆ b   │
+ #   # │ --- ┆ --- │
+ #   # │ i64 ┆ i64 │
+ #   # ╞═════╪═════╡
+ #   # │ 0   ┆ 0   │
+ #   # │ 1   ┆ 2   │
+ #   # │ 3   ┆ 4   │
+ #   # └─────┴─────┘
+ #
+ # @example
+ #   df.shift_and_fill(-1, 0).collect
+ #   # =>
+ #   # shape: (3, 2)
+ #   # ┌─────┬─────┐
+ #   # │ a   ┆ b   │
+ #   # │ --- ┆ --- │
+ #   # │ i64 ┆ i64 │
+ #   # ╞═════╪═════╡
+ #   # │ 3   ┆ 4   │
+ #   # │ 5   ┆ 6   │
+ #   # │ 0   ┆ 0   │
+ #   # └─────┴─────┘
+ def shift_and_fill(periods, fill_value)
+   shift(periods, fill_value: fill_value)
+ end
+
+ # Get a slice of this DataFrame.
+ #
+ # @param offset [Integer]
+ #   Start index. Negative indexing is supported.
+ # @param length [Integer]
+ #   Length of the slice. If set to `nil`, all rows starting at the offset
+ #   will be selected.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ #   df = Polars::DataFrame.new(
+ #     {
+ #       "a" => ["x", "y", "z"],
+ #       "b" => [1, 3, 5],
+ #       "c" => [2, 4, 6]
+ #     }
+ #   ).lazy
+ #   df.slice(1, 2).collect
+ #   # =>
+ #   # shape: (2, 3)
+ #   # ┌─────┬─────┬─────┐
+ #   # │ a   ┆ b   ┆ c   │
+ #   # │ --- ┆ --- ┆ --- │
+ #   # │ str ┆ i64 ┆ i64 │
+ #   # ╞═════╪═════╪═════╡
+ #   # │ y   ┆ 3   ┆ 4   │
+ #   # │ z   ┆ 5   ┆ 6   │
+ #   # └─────┴─────┴─────┘
+ def slice(offset, length = nil)
+   if length && length < 0
+     raise ArgumentError, "Negative slice lengths (#{length}) are invalid for LazyFrame"
+   end
+   _from_rbldf(_ldf.slice(offset, length))
+ end
+
+ # Get the first `n` rows.
+ #
+ # Alias for {#head}.
+ #
+ # @param n [Integer]
+ #   Number of rows to return.
+ #
+ # @return [LazyFrame]
+ #
+ # @note
+ #   Consider using the {#fetch} operation if you only want to test your
+ #   query. The {#fetch} operation will load the first `n` rows at the scan
+ #   level, whereas {#head}/{#limit} are applied at the end.
+ def limit(n = 5)
+   head(n)
+ end
+
+ # Get the first `n` rows.
+ #
+ # @param n [Integer]
+ #   Number of rows to return.
+ #
+ # @return [LazyFrame]
+ #
+ # @note
+ #   Consider using the {#fetch} operation if you only want to test your
+ #   query. The {#fetch} operation will load the first `n` rows at the scan
+ #   level, whereas {#head}/{#limit} are applied at the end.
+ def head(n = 5)
+   slice(0, n)
+ end
+
+ # Get the last `n` rows.
+ #
+ # @param n [Integer]
+ #   Number of rows.
+ #
+ # @return [LazyFrame]
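+ #
+ # @example Illustrative sketch (hypothetical frame)
+ #   lf = Polars::LazyFrame.new({"a" => [1, 2, 3, 4, 5, 6]})
+ #   lf.tail(2).collect
+ #   # =>
+ #   # shape: (2, 1)
+ #   # ┌─────┐
+ #   # │ a   │
+ #   # │ --- │
+ #   # │ i64 │
+ #   # ╞═════╡
+ #   # │ 5   │
+ #   # │ 6   │
+ #   # └─────┘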
+ def tail(n = 5)
+   _from_rbldf(_ldf.tail(n))
+ end
+
+ # Get the last row of the DataFrame.
+ #
+ # @return [LazyFrame]
+ def last
+   tail(1)
+ end
+
+ # Get the first row of the DataFrame.
+ #
+ # @return [LazyFrame]
+ def first
+   slice(0, 1)
+ end
+
+ # Add a column at index 0 that counts the rows.
+ #
+ # @param name [String]
+ #   Name of the column to add.
+ # @param offset [Integer]
+ #   Start the row count at this offset.
+ #
+ # @return [LazyFrame]
+ #
+ # @note
+ #   This can have a negative effect on query performance.
+ #   This may, for instance, block predicate pushdown optimization.
+ #
+ # @example
+ #   df = Polars::DataFrame.new(
+ #     {
+ #       "a" => [1, 3, 5],
+ #       "b" => [2, 4, 6]
+ #     }
+ #   ).lazy
+ #   df.with_row_index.collect
+ #   # =>
+ #   # shape: (3, 3)
+ #   # ┌───────┬─────┬─────┐
+ #   # │ index ┆ a   ┆ b   │
+ #   # │ ---   ┆ --- ┆ --- │
+ #   # │ u32   ┆ i64 ┆ i64 │
+ #   # ╞═══════╪═════╪═════╡
+ #   # │ 0     ┆ 1   ┆ 2   │
+ #   # │ 1     ┆ 3   ┆ 4   │
+ #   # │ 2     ┆ 5   ┆ 6   │
+ #   # └───────┴─────┴─────┘
+ def with_row_index(name: "index", offset: 0)
+   _from_rbldf(_ldf.with_row_index(name, offset))
+ end
+ alias_method :with_row_count, :with_row_index
+
+ # Take every nth row in the LazyFrame and return as a new LazyFrame.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ #   s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]}).lazy
+ #   s.take_every(2).collect
+ #   # =>
+ #   # shape: (2, 2)
+ #   # ┌─────┬─────┐
+ #   # │ a   ┆ b   │
+ #   # │ --- ┆ --- │
+ #   # │ i64 ┆ i64 │
+ #   # ╞═════╪═════╡
+ #   # │ 1   ┆ 5   │
+ #   # │ 3   ┆ 7   │
+ #   # └─────┴─────┘
+ def take_every(n)
+   select(F.col("*").take_every(n))
+ end
+
+ # Fill null values using the specified value or strategy.
+ #
+ # @return [LazyFrame]
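+ #
+ # @example Illustrative sketch (hypothetical frame)
+ #   lf = Polars::LazyFrame.new({"a" => [1, nil, 3], "b" => [1.5, 2.5, nil]})
+ #   lf.fill_null(99).collect
+ #   # =>
+ #   # shape: (3, 2)
+ #   # ┌─────┬──────┐
+ #   # │ a   ┆ b    │
+ #   # │ --- ┆ ---  │
+ #   # │ i64 ┆ f64  │
+ #   # ╞═════╪══════╡
+ #   # │ 1   ┆ 1.5  │
+ #   # │ 99  ┆ 2.5  │
+ #   # │ 3   ┆ 99.0 │
+ #   # └─────┴──────┘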
+ def fill_null(value = nil, strategy: nil, limit: nil, matches_supertype: nil)
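+   # NOTE: `matches_supertype` is accepted here but is not forwarded to the
+   # expression-level `fill_null` call below.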
+   select(Polars.all.fill_null(value, strategy: strategy, limit: limit))
+ end
+
+ # Fill floating point NaN values.
+ #
+ # @param fill_value [Object]
+ #   Value to fill the NaN values with.
+ #
+ # @return [LazyFrame]
+ #
+ # @note
+ #   Note that floating point NaN (Not a Number) values are not missing values!
+ #   To replace missing values, use `fill_null` instead.
+ #
+ # @example
+ #   df = Polars::DataFrame.new(
+ #     {
+ #       "a" => [1.5, 2, Float::NAN, 4],
+ #       "b" => [0.5, 4, Float::NAN, 13]
+ #     }
+ #   ).lazy
+ #   df.fill_nan(99).collect
+ #   # =>
+ #   # shape: (4, 2)
+ #   # ┌──────┬──────┐
+ #   # │ a    ┆ b    │
+ #   # │ ---  ┆ ---  │
+ #   # │ f64  ┆ f64  │
+ #   # ╞══════╪══════╡
+ #   # │ 1.5  ┆ 0.5  │
+ #   # │ 2.0  ┆ 4.0  │
+ #   # │ 99.0 ┆ 99.0 │
+ #   # │ 4.0  ┆ 13.0 │
+ #   # └──────┴──────┘
+ def fill_nan(fill_value)
+   if !fill_value.is_a?(Expr)
+     fill_value = F.lit(fill_value)
+   end
+   _from_rbldf(_ldf.fill_nan(fill_value._rbexpr))
+ end
+
+ # Aggregate the columns in the DataFrame to their standard deviation value.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ #   df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
+ #   df.std.collect
+ #   # =>
+ #   # shape: (1, 2)
+ #   # ┌──────────┬─────┐
+ #   # │ a        ┆ b   │
+ #   # │ ---      ┆ --- │
+ #   # │ f64      ┆ f64 │
+ #   # ╞══════════╪═════╡
+ #   # │ 1.290994 ┆ 0.5 │
+ #   # └──────────┴─────┘
+ #
+ # @example
+ #   df.std(ddof: 0).collect
+ #   # =>
+ #   # shape: (1, 2)
+ #   # ┌──────────┬──────────┐
+ #   # │ a        ┆ b        │
+ #   # │ ---      ┆ ---      │
+ #   # │ f64      ┆ f64      │
+ #   # ╞══════════╪══════════╡
+ #   # │ 1.118034 ┆ 0.433013 │
+ #   # └──────────┴──────────┘
+ def std(ddof: 1)
+   _from_rbldf(_ldf.std(ddof))
+ end
+
+ # Aggregate the columns in the DataFrame to their variance value.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ #   df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
+ #   df.var.collect
+ #   # =>
+ #   # shape: (1, 2)
+ #   # ┌──────────┬──────┐
+ #   # │ a        ┆ b    │
+ #   # │ ---      ┆ ---  │
+ #   # │ f64      ┆ f64  │
+ #   # ╞══════════╪══════╡
+ #   # │ 1.666667 ┆ 0.25 │
+ #   # └──────────┴──────┘
+ #
+ # @example
+ #   df.var(ddof: 0).collect
+ #   # =>
+ #   # shape: (1, 2)
+ #   # ┌──────┬────────┐
+ #   # │ a    ┆ b      │
+ #   # │ ---  ┆ ---    │
+ #   # │ f64  ┆ f64    │
+ #   # ╞══════╪════════╡
+ #   # │ 1.25 ┆ 0.1875 │
+ #   # └──────┴────────┘
+ def var(ddof: 1)
+   _from_rbldf(_ldf.var(ddof))
+ end
+
+ # Aggregate the columns in the DataFrame to their maximum value.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ #   df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
+ #   df.max.collect
+ #   # =>
+ #   # shape: (1, 2)
+ #   # ┌─────┬─────┐
+ #   # │ a   ┆ b   │
+ #   # │ --- ┆ --- │
+ #   # │ i64 ┆ i64 │
+ #   # ╞═════╪═════╡
+ #   # │ 4   ┆ 2   │
+ #   # └─────┴─────┘
+ def max
+   _from_rbldf(_ldf.max)
+ end
+
+ # Aggregate the columns in the DataFrame to their minimum value.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ #   df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
+ #   df.min.collect
+ #   # =>
+ #   # shape: (1, 2)
+ #   # ┌─────┬─────┐
+ #   # │ a   ┆ b   │
+ #   # │ --- ┆ --- │
+ #   # │ i64 ┆ i64 │
+ #   # ╞═════╪═════╡
+ #   # │ 1   ┆ 1   │
+ #   # └─────┴─────┘
+ def min
+   _from_rbldf(_ldf.min)
+ end
+
+ # Aggregate the columns in the DataFrame to their sum value.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ #   df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
+ #   df.sum.collect
+ #   # =>
+ #   # shape: (1, 2)
+ #   # ┌─────┬─────┐
+ #   # │ a   ┆ b   │
+ #   # │ --- ┆ --- │
+ #   # │ i64 ┆ i64 │
+ #   # ╞═════╪═════╡
+ #   # │ 10  ┆ 5   │
+ #   # └─────┴─────┘
+ def sum
+   _from_rbldf(_ldf.sum)
+ end
+
+ # Aggregate the columns in the DataFrame to their mean value.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ #   df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
+ #   df.mean.collect
+ #   # =>
+ #   # shape: (1, 2)
+ #   # ┌─────┬──────┐
+ #   # │ a   ┆ b    │
+ #   # │ --- ┆ ---  │
+ #   # │ f64 ┆ f64  │
+ #   # ╞═════╪══════╡
+ #   # │ 2.5 ┆ 1.25 │
+ #   # └─────┴──────┘
+ def mean
+   _from_rbldf(_ldf.mean)
+ end
+
+ # Aggregate the columns in the DataFrame to their median value.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ #   df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
+ #   df.median.collect
+ #   # =>
+ #   # shape: (1, 2)
+ #   # ┌─────┬─────┐
+ #   # │ a   ┆ b   │
+ #   # │ --- ┆ --- │
+ #   # │ f64 ┆ f64 │
+ #   # ╞═════╪═════╡
+ #   # │ 2.5 ┆ 1.0 │
+ #   # └─────┴─────┘
+ def median
+   _from_rbldf(_ldf.median)
+ end
+
+ # Aggregate the columns in the DataFrame to their quantile value.
+ #
+ # @param quantile [Float]
+ #   Quantile between 0.0 and 1.0.
+ # @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
+ #   Interpolation method.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ #   df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
+ #   df.quantile(0.7).collect
+ #   # =>
+ #   # shape: (1, 2)
+ #   # ┌─────┬─────┐
+ #   # │ a   ┆ b   │
+ #   # │ --- ┆ --- │
+ #   # │ f64 ┆ f64 │
+ #   # ╞═════╪═════╡
+ #   # │ 3.0 ┆ 1.0 │
+ #   # └─────┴─────┘
+ def quantile(quantile, interpolation: "nearest")
+   quantile = Utils.parse_into_expression(quantile, str_as_lit: false)
+   _from_rbldf(_ldf.quantile(quantile, interpolation))
+ end
+
+ # Explode lists to long format.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ #   df = Polars::DataFrame.new(
+ #     {
+ #       "letters" => ["a", "a", "b", "c"],
+ #       "numbers" => [[1], [2, 3], [4, 5], [6, 7, 8]]
+ #     }
+ #   ).lazy
+ #   df.explode("numbers").collect
+ #   # =>
+ #   # shape: (8, 2)
+ #   # ┌─────────┬─────────┐
+ #   # │ letters ┆ numbers │
+ #   # │ ---     ┆ ---     │
+ #   # │ str     ┆ i64     │
+ #   # ╞═════════╪═════════╡
+ #   # │ a       ┆ 1       │
+ #   # │ a       ┆ 2       │
+ #   # │ a       ┆ 3       │
+ #   # │ b       ┆ 4       │
+ #   # │ b       ┆ 5       │
+ #   # │ c       ┆ 6       │
+ #   # │ c       ┆ 7       │
+ #   # │ c       ┆ 8       │
+ #   # └─────────┴─────────┘
+ def explode(columns)
+   columns = Utils.parse_into_list_of_expressions(columns)
+   _from_rbldf(_ldf.explode(columns))
+ end
+
+ # Drop duplicate rows from this DataFrame.
+ #
+ # Note that this fails if there is a column of type `List` in the DataFrame or
+ # subset.
+ #
+ # @param maintain_order [Boolean]
+ #   Keep the same order as the original DataFrame. This requires more work to
+ #   compute.
+ # @param subset [Object]
+ #   Subset to use to compare rows.
+ # @param keep ["first", "last"]
+ #   Which of the duplicate rows to keep.
+ #
+ # @return [LazyFrame]
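+ #
+ # @example Illustrative sketch (hypothetical frame)
+ #   lf = Polars::LazyFrame.new({"a" => [1, 1, 2], "b" => ["x", "x", "y"]})
+ #   lf.unique.collect
+ #   # =>
+ #   # shape: (2, 2)
+ #   # ┌─────┬─────┐
+ #   # │ a   ┆ b   │
+ #   # │ --- ┆ --- │
+ #   # │ i64 ┆ str │
+ #   # ╞═════╪═════╡
+ #   # │ 1   ┆ x   │
+ #   # │ 2   ┆ y   │
+ #   # └─────┴─────┘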
+ def unique(maintain_order: true, subset: nil, keep: "first")
+   if !subset.nil? && !subset.is_a?(::Array)
+     subset = [subset]
+   end
+   _from_rbldf(_ldf.unique(maintain_order, subset, keep))
+ end
+
+ # Drop rows with null values from this LazyFrame.
+ #
+ # @param subset [Object]
+ #   Subset of column(s) on which `drop_nulls` will be applied.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ #   df = Polars::DataFrame.new(
+ #     {
+ #       "foo" => [1, 2, 3],
+ #       "bar" => [6, nil, 8],
+ #       "ham" => ["a", "b", "c"]
+ #     }
+ #   )
+ #   df.lazy.drop_nulls.collect
+ #   # =>
+ #   # shape: (2, 3)
+ #   # ┌─────┬─────┬─────┐
+ #   # │ foo ┆ bar ┆ ham │
+ #   # │ --- ┆ --- ┆ --- │
+ #   # │ i64 ┆ i64 ┆ str │
+ #   # ╞═════╪═════╪═════╡
+ #   # │ 1   ┆ 6   ┆ a   │
+ #   # │ 3   ┆ 8   ┆ c   │
+ #   # └─────┴─────┴─────┘
+ def drop_nulls(subset: nil)
+   if !subset.nil? && !subset.is_a?(::Array)
+     subset = [subset]
+   end
+   _from_rbldf(_ldf.drop_nulls(subset))
+ end
+
+ # Unpivot a DataFrame from wide to long format.
+ #
+ # Optionally leaves identifiers set.
+ #
+ # This function is useful to massage a DataFrame into a format where one or more
+ # columns are identifier variables (index) while all other columns, considered
+ # measured variables (on), are "unpivoted" to the row axis leaving just
+ # two non-identifier columns, 'variable' and 'value'.
+ #
+ # @param on [Object]
+ #   Column(s) or selector(s) to use as values variables; if `on`
+ #   is empty all columns that are not in `index` will be used.
+ # @param index [Object]
+ #   Column(s) or selector(s) to use as identifier variables.
+ # @param variable_name [String]
+ #   Name to give to the `variable` column. Defaults to "variable".
+ # @param value_name [String]
+ #   Name to give to the `value` column. Defaults to "value".
+ # @param streamable [Boolean]
+ #   Allow this node to run in the streaming engine.
+ #   If this runs in streaming, the output of the unpivot operation
+ #   will not have a stable ordering.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ #   lf = Polars::LazyFrame.new(
+ #     {
+ #       "a" => ["x", "y", "z"],
+ #       "b" => [1, 3, 5],
+ #       "c" => [2, 4, 6]
+ #     }
+ #   )
+ #   lf.unpivot(Polars::Selectors.numeric, index: "a").collect
+ #   # =>
+ #   # shape: (6, 3)
+ #   # ┌─────┬──────────┬───────┐
+ #   # │ a   ┆ variable ┆ value │
+ #   # │ --- ┆ ---      ┆ ---   │
+ #   # │ str ┆ str      ┆ i64   │
+ #   # ╞═════╪══════════╪═══════╡
+ #   # │ x   ┆ b        ┆ 1     │
+ #   # │ y   ┆ b        ┆ 3     │
+ #   # │ z   ┆ b        ┆ 5     │
+ #   # │ x   ┆ c        ┆ 2     │
+ #   # │ y   ┆ c        ┆ 4     │
+ #   # │ z   ┆ c        ┆ 6     │
+ #   # └─────┴──────────┴───────┘
+ def unpivot(
+   on,
+   index: nil,
+   variable_name: nil,
+   value_name: nil,
+   streamable: true
+ )
+   if !streamable
+     warn "The `streamable` parameter for `LazyFrame.unpivot` is deprecated"
+   end
+
+   on = on.nil? ? [] : Utils._expand_selectors(self, on)
+   index = index.nil? ? [] : Utils._expand_selectors(self, index)
+
+   _from_rbldf(
+     _ldf.unpivot(on, index, value_name, variable_name)
+   )
+ end
+ alias_method :melt, :unpivot
+
+ # def map
+ # end
+
+ # Interpolate intermediate values. The interpolation method is linear.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ #   df = Polars::DataFrame.new(
+ #     {
+ #       "foo" => [1, nil, 9, 10],
+ #       "bar" => [6, 7, 9, nil],
+ #       "baz" => [1, nil, nil, 9]
+ #     }
+ #   ).lazy
+ #   df.interpolate.collect
+ #   # =>
+ #   # shape: (4, 3)
+ #   # ┌──────┬──────┬──────────┐
+ #   # │ foo  ┆ bar  ┆ baz      │
+ #   # │ ---  ┆ ---  ┆ ---      │
+ #   # │ f64  ┆ f64  ┆ f64      │
+ #   # ╞══════╪══════╪══════════╡
+ #   # │ 1.0  ┆ 6.0  ┆ 1.0      │
+ #   # │ 5.0  ┆ 7.0  ┆ 3.666667 │
+ #   # │ 9.0  ┆ 9.0  ┆ 6.333333 │
+ #   # │ 10.0 ┆ null ┆ 9.0      │
+ #   # └──────┴──────┴──────────┘
+ def interpolate
+   select(F.col("*").interpolate)
+ end
+
+ # Decompose a struct into its fields.
+ #
+ # The fields will be inserted into the `DataFrame` on the location of the
+ # `struct` type.
+ #
+ # @param names [Object]
+ #   Names of the struct columns that will be decomposed into their fields.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ #   df = (
+ #     Polars::DataFrame.new(
+ #       {
+ #         "before" => ["foo", "bar"],
+ #         "t_a" => [1, 2],
+ #         "t_b" => ["a", "b"],
+ #         "t_c" => [true, nil],
+ #         "t_d" => [[1, 2], [3]],
+ #         "after" => ["baz", "womp"]
+ #       }
+ #     )
+ #     .lazy
+ #     .select(
+ #       ["before", Polars.struct(Polars.col("^t_.$")).alias("t_struct"), "after"]
+ #     )
+ #   )
+ #   df.fetch
+ #   # =>
+ #   # shape: (2, 3)
+ #   # ┌────────┬─────────────────────┬───────┐
+ #   # │ before ┆ t_struct            ┆ after │
+ #   # │ ---    ┆ ---                 ┆ ---   │
+ #   # │ str    ┆ struct[4]           ┆ str   │
+ #   # ╞════════╪═════════════════════╪═══════╡
+ #   # │ foo    ┆ {1,"a",true,[1, 2]} ┆ baz   │
+ #   # │ bar    ┆ {2,"b",null,[3]}    ┆ womp  │
+ #   # └────────┴─────────────────────┴───────┘
+ #
+ # @example
+ #   df.unnest("t_struct").fetch
+ #   # =>
+ #   # shape: (2, 6)
+ #   # ┌────────┬─────┬─────┬──────┬───────────┬───────┐
+ #   # │ before ┆ t_a ┆ t_b ┆ t_c  ┆ t_d       ┆ after │
+ #   # │ ---    ┆ --- ┆ --- ┆ ---  ┆ ---       ┆ ---   │
+ #   # │ str    ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str   │
+ #   # ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
+ #   # │ foo    ┆ 1   ┆ a   ┆ true ┆ [1, 2]    ┆ baz   │
+ #   # │ bar    ┆ 2   ┆ b   ┆ null ┆ [3]       ┆ womp  │
+ #   # └────────┴─────┴─────┴──────┴───────────┴───────┘
+ def unnest(names)
+   if names.is_a?(::String)
+     names = [names]
+   end
+   _from_rbldf(_ldf.unnest(names))
+ end
+
+ # Take two sorted DataFrames and merge them by the sorted key.
+ #
+ # The output of this operation will also be sorted.
+ # It is the caller's responsibility to ensure the frames are sorted
+ # by that key, otherwise the output will not make sense.
+ #
+ # The schemas of both LazyFrames must be equal.
+ #
+ # @param other [DataFrame]
+ #   Other DataFrame that must be merged.
+ # @param key [String]
+ #   Key that is sorted.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ #   df0 = Polars::LazyFrame.new(
+ #     {"name" => ["steve", "elise", "bob"], "age" => [42, 44, 18]}
+ #   ).sort("age")
+ #   df1 = Polars::LazyFrame.new(
+ #     {"name" => ["anna", "megan", "steve", "thomas"], "age" => [21, 33, 42, 20]}
+ #   ).sort("age")
+ #   df0.merge_sorted(df1, "age").collect
+ #   # =>
+ #   # shape: (7, 2)
+ #   # ┌────────┬─────┐
+ #   # │ name   ┆ age │
+ #   # │ ---    ┆ --- │
+ #   # │ str    ┆ i64 │
+ #   # ╞════════╪═════╡
+ #   # │ bob    ┆ 18  │
+ #   # │ thomas ┆ 20  │
+ #   # │ anna   ┆ 21  │
+ #   # │ megan  ┆ 33  │
+ #   # │ steve  ┆ 42  │
+ #   # │ steve  ┆ 42  │
+ #   # │ elise  ┆ 44  │
+ #   # └────────┴─────┘
+ def merge_sorted(other, key)
+   _from_rbldf(_ldf.merge_sorted(other._ldf, key))
+ end
+
+ # Indicate that one or multiple columns are sorted.
+ #
+ # @param column [Object]
+ #   Columns that are sorted.
+ # @param descending [Boolean]
+ #   Whether the columns are sorted in descending order.
+ #
+ # @return [LazyFrame]
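+ #
+ # @example Illustrative sketch (hypothetical frame); the flag is a promise, not a check
+ #   lf = Polars::LazyFrame.new({"a" => [1, 2, 3]})
+ #   # Marks "a" as sorted so later operations may rely on it; the data itself
+ #   # is not validated, so the caller must be right.
+ #   lf.set_sorted("a").collect
+ #   # =>
+ #   # shape: (3, 1)
+ #   # ┌─────┐
+ #   # │ a   │
+ #   # │ --- │
+ #   # │ i64 │
+ #   # ╞═════╡
+ #   # │ 1   │
+ #   # │ 2   │
+ #   # │ 3   │
+ #   # └─────┘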
+ def set_sorted(
+   column,
+   descending: false
+ )
+   if !Utils.strlike?(column)
+     msg = "expected a 'str' for argument 'column' in 'set_sorted'"
+     raise TypeError, msg
+   end
+   with_columns(F.col(column).set_sorted(descending: descending))
+ end
+
+ # TODO
+ # def update
+ # end
+
+ private
+
+ def initialize_copy(other)
+   super
+   self._ldf = _ldf._clone
+ end
+
+ def _from_rbldf(rb_ldf)
+   self.class._from_rbldf(rb_ldf)
+ end
+ end
+ end