polars-df 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -152,29 +152,98 @@ module Polars
152
152
  # def self.read_json
153
153
  # end
154
154
 
155
- # def columns
156
- # end
155
+ # Get or set column names.
156
+ #
157
+ # @return [Array]
158
+ #
159
+ # @example
160
+ # df = (
161
+ # Polars::DataFrame.new(
162
+ # {
163
+ # "foo" => [1, 2, 3],
164
+ # "bar" => [6, 7, 8],
165
+ # "ham" => ["a", "b", "c"]
166
+ # }
167
+ # )
168
+ # .lazy
169
+ # .select(["foo", "bar"])
170
+ # )
171
+ # df.columns
172
+ # # => ["foo", "bar"]
173
+ def columns
174
+ _ldf.columns
175
+ end
157
176
 
158
- # def dtypes
159
- # end
177
+ # Get dtypes of columns in LazyFrame.
178
+ #
179
+ # @return [Array]
180
+ #
181
+ # @example
182
+ # lf = Polars::DataFrame.new(
183
+ # {
184
+ # "foo" => [1, 2, 3],
185
+ # "bar" => [6.0, 7.0, 8.0],
186
+ # "ham" => ["a", "b", "c"]
187
+ # }
188
+ # ).lazy
189
+ # lf.dtypes
190
+ # # => [:i64, :f64, :str]
191
+ def dtypes
192
+ _ldf.dtypes
193
+ end
160
194
 
161
- # def schema
162
- # end
195
+ # Get the schema.
196
+ #
197
+ # @return [Hash]
198
+ #
199
+ # @example
200
+ # lf = Polars::DataFrame.new(
201
+ # {
202
+ # "foo" => [1, 2, 3],
203
+ # "bar" => [6.0, 7.0, 8.0],
204
+ # "ham" => ["a", "b", "c"]
205
+ # }
206
+ # ).lazy
207
+ # lf.schema
208
+ # # => {"foo"=>:i64, "bar"=>:f64, "ham"=>:str}
209
+ def schema
210
+ _ldf.schema
211
+ end
163
212
 
164
- # def width
165
- # end
213
+ # Get the width of the LazyFrame.
214
+ #
215
+ # @return [Integer]
216
+ #
217
+ # @example
218
+ # lf = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]}).lazy
219
+ # lf.width
220
+ # # => 2
221
+ def width
222
+ _ldf.width
223
+ end
166
224
 
167
- # def include?(key)
168
- # end
225
+ # Check if LazyFrame includes key.
226
+ #
227
+ # @return [Boolean]
228
+ def include?(key)
229
+ columns.include?(key)
230
+ end
169
231
 
170
232
  # clone handled by initialize_copy
171
233
 
172
234
  # def [](item)
173
235
  # end
174
236
 
175
- # def to_s
176
- # end
177
- # alias_method :inspect, :to_s
237
+ # Returns a string representing the LazyFrame.
238
+ #
239
+ # @return [String]
240
+ def to_s
241
+ <<~EOS
242
+ naive plan: (run LazyFrame#describe_optimized_plan to see the optimized plan)
243
+
244
+ #{describe_plan}
245
+ EOS
246
+ end
178
247
 
179
248
  # def write_json
180
249
  # end
@@ -182,22 +251,125 @@ module Polars
182
251
  # def pipe
183
252
  # end
184
253
 
185
- # def describe_plan
186
- # end
254
+ # Create a string representation of the unoptimized query plan.
255
+ #
256
+ # @return [String]
257
+ def describe_plan
258
+ _ldf.describe_plan
259
+ end
187
260
 
261
+ # Create a string representation of the optimized query plan.
262
+ #
263
+ # @return [String]
188
264
  # def describe_optimized_plan
189
265
  # end
190
266
 
191
267
  # def show_graph
192
268
  # end
193
269
 
194
- # def sort
195
- # end
270
+ # Sort the DataFrame.
271
+ #
272
+ # Sorting can be done by:
273
+ #
274
+ # - A single column name
275
+ # - An expression
276
+ # - Multiple expressions
277
+ #
278
+ # @param by [Object]
279
+ # Column (expressions) to sort by.
280
+ # @param reverse [Boolean]
281
+ # Sort in descending order.
282
+ # @param nulls_last [Boolean]
283
+ # Place null values last. Can only be used if sorted by a single column.
284
+ #
285
+ # @return [LazyFrame]
286
+ #
287
+ # @example
288
+ # df = Polars::DataFrame.new(
289
+ # {
290
+ # "foo" => [1, 2, 3],
291
+ # "bar" => [6.0, 7.0, 8.0],
292
+ # "ham" => ["a", "b", "c"]
293
+ # }
294
+ # ).lazy
295
+ # df.sort("foo", reverse: true).collect
296
+ # # =>
297
+ # # shape: (3, 3)
298
+ # # ┌─────┬─────┬─────┐
299
+ # # │ foo ┆ bar ┆ ham │
300
+ # # │ --- ┆ --- ┆ --- │
301
+ # # │ i64 ┆ f64 ┆ str │
302
+ # # ╞═════╪═════╪═════╡
303
+ # # │ 3 ┆ 8.0 ┆ c │
304
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
305
+ # # │ 2 ┆ 7.0 ┆ b │
306
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
307
+ # # │ 1 ┆ 6.0 ┆ a │
308
+ # # └─────┴─────┴─────┘
309
+ def sort(by, reverse: false, nulls_last: false)
310
+ if by.is_a?(String)
311
+ _from_rbldf(_ldf.sort(by, reverse, nulls_last))
312
+ end
313
+ if Utils.bool?(reverse)
314
+ reverse = [reverse]
315
+ end
316
+
317
+ by = Utils.selection_to_rbexpr_list(by)
318
+ _from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last))
319
+ end
196
320
 
197
321
  # def profile
198
322
  # end
199
323
 
324
+ # Collect into a DataFrame.
325
+ #
326
+ # Note: use {#fetch} if you want to run your query on the first `n` rows
327
+ # only. This can be a huge time saver in debugging queries.
328
+ #
329
+ # @param type_coercion [Boolean]
330
+ # Do type coercion optimization.
331
+ # @param predicate_pushdown [Boolean]
332
+ # Do predicate pushdown optimization.
333
+ # @param projection_pushdown [Boolean]
334
+ # Do projection pushdown optimization.
335
+ # @param simplify_expression [Boolean]
336
+ # Run simplify expressions optimization.
337
+ # @param string_cache [Boolean]
338
+ # This argument is deprecated. Please set the string cache globally.
339
+ # The argument will be ignored.
340
+ # @param no_optimization [Boolean]
341
+ # Turn off (certain) optimizations.
342
+ # @param slice_pushdown [Boolean]
343
+ # Slice pushdown optimization.
344
+ # @param common_subplan_elimination [Boolean]
345
+ # Will try to cache branching subplans that occur on self-joins or unions.
346
+ # @param allow_streaming [Boolean]
347
+ # Run parts of the query in a streaming fashion (this is in an alpha state)
200
348
  #
349
+ # @return [DataFrame]
350
+ #
351
+ # @example
352
+ # df = Polars::DataFrame.new(
353
+ # {
354
+ # "a" => ["a", "b", "a", "b", "b", "c"],
355
+ # "b" => [1, 2, 3, 4, 5, 6],
356
+ # "c" => [6, 5, 4, 3, 2, 1]
357
+ # }
358
+ # ).lazy
359
+ # df.groupby("a", maintain_order: true).agg(Polars.all.sum).collect
360
+ # # =>
361
+ # # shape: (3, 3)
362
+ # # ┌─────┬─────┬─────┐
363
+ # # │ a ┆ b ┆ c │
364
+ # # │ --- ┆ --- ┆ --- │
365
+ # # │ str ┆ i64 ┆ i64 │
366
+ # # ╞═════╪═════╪═════╡
367
+ # # │ a ┆ 4 ┆ 10 │
368
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
369
+ # # │ b ┆ 11 ┆ 10 │
370
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
371
+ # # │ c ┆ 6 ┆ 1 │
372
+ # # └─────┴─────┴─────┘
201
373
  def collect(
202
374
  type_coercion: true,
203
375
  predicate_pushdown: true,
@@ -232,21 +404,184 @@ module Polars
232
404
  Utils.wrap_df(ldf.collect)
233
405
  end
234
406
 
235
- # def fetch
236
- # end
407
+ # Collect a small number of rows for debugging purposes.
408
+ #
409
+ # Fetch is like a {#collect} operation, but it overwrites the number of rows
410
+ # read by every scan operation. This is a utility that helps debug a query on a
411
+ # smaller number of rows.
412
+ #
413
+ # Note that the fetch does not guarantee the final number of rows in the
414
+ # DataFrame. Filter, join operations and a lower number of rows available in the
415
+ # scanned file influence the final number of rows.
416
+ #
417
+ # @param n_rows [Integer]
418
+ # Collect n_rows from the data sources.
419
+ # @param type_coercion [Boolean]
420
+ # Run type coercion optimization.
421
+ # @param predicate_pushdown [Boolean]
422
+ # Run predicate pushdown optimization.
423
+ # @param projection_pushdown [Boolean]
424
+ # Run projection pushdown optimization.
425
+ # @param simplify_expression [Boolean]
426
+ # Run simplify expressions optimization.
427
+ # @param string_cache [Boolean]
428
+ # This argument is deprecated. Please set the string cache globally.
429
+ # The argument will be ignored.
430
+ # @param no_optimization [Boolean]
431
+ # Turn off optimizations.
432
+ # @param slice_pushdown [Boolean]
433
+ # Slice pushdown optimization.
434
+ # @param common_subplan_elimination [Boolean]
435
+ # Will try to cache branching subplans that occur on self-joins or unions.
436
+ # @param allow_streaming [Boolean]
437
+ # Run parts of the query in a streaming fashion (this is in an alpha state)
438
+ #
439
+ # @return [DataFrame]
440
+ #
441
+ # @example
442
+ # df = Polars::DataFrame.new(
443
+ # {
444
+ # "a" => ["a", "b", "a", "b", "b", "c"],
445
+ # "b" => [1, 2, 3, 4, 5, 6],
446
+ # "c" => [6, 5, 4, 3, 2, 1]
447
+ # }
448
+ # ).lazy
449
+ # df.groupby("a", maintain_order: true).agg(Polars.all.sum).fetch(2)
450
+ # # =>
451
+ # # shape: (2, 3)
452
+ # # ┌─────┬─────┬─────┐
453
+ # # │ a ┆ b ┆ c │
454
+ # # │ --- ┆ --- ┆ --- │
455
+ # # │ str ┆ i64 ┆ i64 │
456
+ # # ╞═════╪═════╪═════╡
457
+ # # │ a ┆ 1 ┆ 6 │
458
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
459
+ # # │ b ┆ 2 ┆ 5 │
460
+ # # └─────┴─────┴─────┘
461
+ def fetch(
462
+ n_rows = 500,
463
+ type_coercion: true,
464
+ predicate_pushdown: true,
465
+ projection_pushdown: true,
466
+ simplify_expression: true,
467
+ string_cache: false,
468
+ no_optimization: false,
469
+ slice_pushdown: true,
470
+ common_subplan_elimination: true,
471
+ allow_streaming: false
472
+ )
473
+ if no_optimization
474
+ predicate_pushdown = false
475
+ projection_pushdown = false
476
+ slice_pushdown = false
477
+ common_subplan_elimination = false
478
+ end
479
+
480
+ ldf = _ldf.optimization_toggle(
481
+ type_coercion,
482
+ predicate_pushdown,
483
+ projection_pushdown,
484
+ simplify_expression,
485
+ slice_pushdown,
486
+ common_subplan_elimination,
487
+ allow_streaming
488
+ )
489
+ Utils.wrap_df(ldf.fetch(n_rows))
490
+ end
237
491
 
492
+ # Return lazy representation, i.e. itself.
238
493
  #
494
+ # Useful for writing code that expects either a `DataFrame` or
495
+ # `LazyFrame`.
496
+ #
497
+ # @return [LazyFrame]
498
+ #
499
+ # @example
500
+ # df = Polars::DataFrame.new(
501
+ # {
502
+ # "a" => [nil, 2, 3, 4],
503
+ # "b" => [0.5, nil, 2.5, 13],
504
+ # "c" => [true, true, false, nil]
505
+ # }
506
+ # )
507
+ # df.lazy
239
508
  def lazy
240
509
  self
241
510
  end
242
511
 
243
- # def cache
244
- # end
512
+ # Cache the result once the execution of the physical plan hits this node.
513
+ #
514
+ # @return [LazyFrame]
515
+ def cache
516
+ _from_rbldf(_ldf.cache)
517
+ end
245
518
 
246
- # def cleared
247
- # end
519
+ # Create an empty copy of the current LazyFrame.
520
+ #
521
+ # The copy has an identical schema but no data.
522
+ #
523
+ # @return [LazyFrame]
524
+ #
525
+ # @example
526
+ # df = Polars::DataFrame.new(
527
+ # {
528
+ # "a" => [nil, 2, 3, 4],
529
+ # "b" => [0.5, nil, 2.5, 13],
530
+ # "c" => [true, true, false, nil],
531
+ # }
532
+ # ).lazy
533
+ # df.cleared.fetch
534
+ # # =>
535
+ # # shape: (0, 3)
536
+ # # ┌─────┬─────┬──────┐
537
+ # # │ a ┆ b ┆ c │
538
+ # # │ --- ┆ --- ┆ --- │
539
+ # # │ i64 ┆ f64 ┆ bool │
540
+ # # ╞═════╪═════╪══════╡
541
+ # # └─────┴─────┴──────┘
542
+ def cleared
543
+ DataFrame.new(columns: schema).lazy
544
+ end
248
545
 
546
+ # Filter the rows in the DataFrame based on a predicate expression.
249
547
  #
548
+ # @param predicate [Object]
549
+ # Expression that evaluates to a boolean Series.
550
+ #
551
+ # @return [LazyFrame]
552
+ #
553
+ # @example Filter on one condition:
554
+ # lf = Polars::DataFrame.new(
555
+ # {
556
+ # "foo" => [1, 2, 3],
557
+ # "bar" => [6, 7, 8],
558
+ # "ham" => ["a", "b", "c"]
559
+ # }
560
+ # ).lazy
561
+ # lf.filter(Polars.col("foo") < 3).collect
562
+ # # =>
563
+ # # shape: (2, 3)
564
+ # # ┌─────┬─────┬─────┐
565
+ # # │ foo ┆ bar ┆ ham │
566
+ # # │ --- ┆ --- ┆ --- │
567
+ # # │ i64 ┆ i64 ┆ str │
568
+ # # ╞═════╪═════╪═════╡
569
+ # # │ 1 ┆ 6 ┆ a │
570
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
571
+ # # │ 2 ┆ 7 ┆ b │
572
+ # # └─────┴─────┴─────┘
573
+ #
574
+ # @example Filter on multiple conditions:
575
+ # lf.filter((Polars.col("foo") < 3) & (Polars.col("ham") == "a")).collect
576
+ # # =>
577
+ # # shape: (1, 3)
578
+ # # ┌─────┬─────┬─────┐
579
+ # # │ foo ┆ bar ┆ ham │
580
+ # # │ --- ┆ --- ┆ --- │
581
+ # # │ i64 ┆ i64 ┆ str │
582
+ # # ╞═════╪═════╪═════╡
583
+ # # │ 1 ┆ 6 ┆ a │
584
+ # # └─────┴─────┴─────┘
250
585
  def filter(predicate)
251
586
  _from_rbldf(
252
587
  _ldf.filter(
@@ -255,11 +590,136 @@ module Polars
255
590
  )
256
591
  end
257
592
 
593
+ # Select columns from this DataFrame.
594
+ #
595
+ # @param exprs [Object]
596
+ # Column or columns to select.
597
+ #
598
+ # @return [LazyFrame]
599
+ #
600
+ # @example
601
+ # df = Polars::DataFrame.new(
602
+ # {
603
+ # "foo" => [1, 2, 3],
604
+ # "bar" => [6, 7, 8],
605
+ # "ham" => ["a", "b", "c"],
606
+ # }
607
+ # ).lazy
608
+ # df.select("foo").collect
609
+ # # =>
610
+ # # shape: (3, 1)
611
+ # # ┌─────┐
612
+ # # │ foo │
613
+ # # │ --- │
614
+ # # │ i64 │
615
+ # # ╞═════╡
616
+ # # │ 1 │
617
+ # # ├╌╌╌╌╌┤
618
+ # # │ 2 │
619
+ # # ├╌╌╌╌╌┤
620
+ # # │ 3 │
621
+ # # └─────┘
622
+ #
623
+ # @example
624
+ # df.select(["foo", "bar"]).collect
625
+ # # =>
626
+ # # shape: (3, 2)
627
+ # # ┌─────┬─────┐
628
+ # # │ foo ┆ bar │
629
+ # # │ --- ┆ --- │
630
+ # # │ i64 ┆ i64 │
631
+ # # ╞═════╪═════╡
632
+ # # │ 1 ┆ 6 │
633
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
634
+ # # │ 2 ┆ 7 │
635
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
636
+ # # │ 3 ┆ 8 │
637
+ # # └─────┴─────┘
638
+ #
639
+ # @example
640
+ # df.select(Polars.col("foo") + 1).collect
641
+ # # =>
642
+ # # shape: (3, 1)
643
+ # # ┌─────┐
644
+ # # │ foo │
645
+ # # │ --- │
646
+ # # │ i64 │
647
+ # # ╞═════╡
648
+ # # │ 2 │
649
+ # # ├╌╌╌╌╌┤
650
+ # # │ 3 │
651
+ # # ├╌╌╌╌╌┤
652
+ # # │ 4 │
653
+ # # └─────┘
654
+ #
655
+ # @example
656
+ # df.select([Polars.col("foo") + 1, Polars.col("bar") + 1]).collect
657
+ # # =>
658
+ # # shape: (3, 2)
659
+ # # ┌─────┬─────┐
660
+ # # │ foo ┆ bar │
661
+ # # │ --- ┆ --- │
662
+ # # │ i64 ┆ i64 │
663
+ # # ╞═════╪═════╡
664
+ # # │ 2 ┆ 7 │
665
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
666
+ # # │ 3 ┆ 8 │
667
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
668
+ # # │ 4 ┆ 9 │
669
+ # # └─────┴─────┘
670
+ #
671
+ # @example
672
+ # df.select(Polars.when(Polars.col("foo") > 2).then(10).otherwise(0)).collect
673
+ # # =>
674
+ # # shape: (3, 1)
675
+ # # ┌─────────┐
676
+ # # │ literal │
677
+ # # │ --- │
678
+ # # │ i64 │
679
+ # # ╞═════════╡
680
+ # # │ 0 │
681
+ # # ├╌╌╌╌╌╌╌╌╌┤
682
+ # # │ 0 │
683
+ # # ├╌╌╌╌╌╌╌╌╌┤
684
+ # # │ 10 │
685
+ # # └─────────┘
258
686
  def select(exprs)
259
687
  exprs = Utils.selection_to_rbexpr_list(exprs)
260
688
  _from_rbldf(_ldf.select(exprs))
261
689
  end
262
690
 
691
+ # Start a groupby operation.
692
+ #
693
+ # @param by [Object]
694
+ # Column(s) to group by.
695
+ # @param maintain_order [Boolean]
696
+ # Make sure that the order of the groups remain consistent. This is more
697
+ # expensive than a default groupby.
698
+ #
699
+ # @return [LazyGroupBy]
700
+ #
701
+ # @example
702
+ # df = Polars::DataFrame.new(
703
+ # {
704
+ # "a" => ["a", "b", "a", "b", "b", "c"],
705
+ # "b" => [1, 2, 3, 4, 5, 6],
706
+ # "c" => [6, 5, 4, 3, 2, 1]
707
+ # }
708
+ # ).lazy
709
+ # df.groupby("a", maintain_order: true).agg(Polars.col("b").sum).collect
710
+ # # =>
711
+ # # shape: (3, 2)
712
+ # # ┌─────┬─────┐
713
+ # # │ a ┆ b │
714
+ # # │ --- ┆ --- │
715
+ # # │ str ┆ i64 │
716
+ # # ╞═════╪═════╡
717
+ # # │ a ┆ 4 │
718
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
719
+ # # │ b ┆ 11 │
720
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
721
+ # # │ c ┆ 6 │
722
+ # # └─────┴─────┘
263
723
  def groupby(by, maintain_order: false)
264
724
  rbexprs_by = Utils.selection_to_rbexpr_list(by)
265
725
  lgb = _ldf.groupby(rbexprs_by, maintain_order)
@@ -275,7 +735,116 @@ module Polars
275
735
  # def join_asof
276
736
  # end
277
737
 
738
+ # Add a join operation to the Logical Plan.
739
+ #
740
+ # @param other [LazyFrame]
741
+ # Lazy DataFrame to join with.
742
+ # @param left_on [Object]
743
+ # Join column of the left DataFrame.
744
+ # @param right_on [Object]
745
+ # Join column of the right DataFrame.
746
+ # @param on [Object]
747
+ # Join column of both DataFrames. If set, `left_on` and `right_on` should be
748
+ # `nil`.
749
+ # @param how ["inner", "left", "outer", "semi", "anti", "cross"]
750
+ # Join strategy.
751
+ # @param suffix [String]
752
+ # Suffix to append to columns with a duplicate name.
753
+ # @param allow_parallel [Boolean]
754
+ # Allow the physical plan to optionally evaluate the computation of both
755
+ # DataFrames up to the join in parallel.
756
+ # @param force_parallel [Boolean]
757
+ # Force the physical plan to evaluate the computation of both DataFrames up to
758
+ # the join in parallel.
278
759
  #
760
+ # @return [LazyFrame]
761
+ #
762
+ # @example
763
+ # df = Polars::DataFrame.new(
764
+ # {
765
+ # "foo" => [1, 2, 3],
766
+ # "bar" => [6.0, 7.0, 8.0],
767
+ # "ham" => ["a", "b", "c"]
768
+ # }
769
+ # ).lazy
770
+ # other_df = Polars::DataFrame.new(
771
+ # {
772
+ # "apple" => ["x", "y", "z"],
773
+ # "ham" => ["a", "b", "d"]
774
+ # }
775
+ # ).lazy
776
+ # df.join(other_df, on: "ham").collect
777
+ # # =>
778
+ # # shape: (2, 4)
779
+ # # ┌─────┬─────┬─────┬───────┐
780
+ # # │ foo ┆ bar ┆ ham ┆ apple │
781
+ # # │ --- ┆ --- ┆ --- ┆ --- │
782
+ # # │ i64 ┆ f64 ┆ str ┆ str │
783
+ # # ╞═════╪═════╪═════╪═══════╡
784
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
785
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
786
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
787
+ # # └─────┴─────┴─────┴───────┘
788
+ #
789
+ # @example
790
+ # df.join(other_df, on: "ham", how: "outer").collect
791
+ # # =>
792
+ # # shape: (4, 4)
793
+ # # ┌──────┬──────┬─────┬───────┐
794
+ # # │ foo ┆ bar ┆ ham ┆ apple │
795
+ # # │ --- ┆ --- ┆ --- ┆ --- │
796
+ # # │ i64 ┆ f64 ┆ str ┆ str │
797
+ # # ╞══════╪══════╪═════╪═══════╡
798
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
799
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
800
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
801
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
802
+ # # │ null ┆ null ┆ d ┆ z │
803
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
804
+ # # │ 3 ┆ 8.0 ┆ c ┆ null │
805
+ # # └──────┴──────┴─────┴───────┘
806
+ #
807
+ # @example
808
+ # df.join(other_df, on: "ham", how: "left").collect
809
+ # # =>
810
+ # # shape: (3, 4)
811
+ # # ┌─────┬─────┬─────┬───────┐
812
+ # # │ foo ┆ bar ┆ ham ┆ apple │
813
+ # # │ --- ┆ --- ┆ --- ┆ --- │
814
+ # # │ i64 ┆ f64 ┆ str ┆ str │
815
+ # # ╞═════╪═════╪═════╪═══════╡
816
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
817
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
818
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
819
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
820
+ # # │ 3 ┆ 8.0 ┆ c ┆ null │
821
+ # # └─────┴─────┴─────┴───────┘
822
+ #
823
+ # @example
824
+ # df.join(other_df, on: "ham", how: "semi").collect
825
+ # # =>
826
+ # # shape: (2, 3)
827
+ # # ┌─────┬─────┬─────┐
828
+ # # │ foo ┆ bar ┆ ham │
829
+ # # │ --- ┆ --- ┆ --- │
830
+ # # │ i64 ┆ f64 ┆ str │
831
+ # # ╞═════╪═════╪═════╡
832
+ # # │ 1 ┆ 6.0 ┆ a │
833
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
834
+ # # │ 2 ┆ 7.0 ┆ b │
835
+ # # └─────┴─────┴─────┘
836
+ #
837
+ # @example
838
+ # df.join(other_df, on: "ham", how: "anti").collect
839
+ # # =>
840
+ # # shape: (1, 3)
841
+ # # ┌─────┬─────┬─────┐
842
+ # # │ foo ┆ bar ┆ ham │
843
+ # # │ --- ┆ --- ┆ --- │
844
+ # # │ i64 ┆ f64 ┆ str │
845
+ # # ╞═════╪═════╪═════╡
846
+ # # │ 3 ┆ 8.0 ┆ c │
847
+ # # └─────┴─────┴─────┘
279
848
  def join(
280
849
  other,
281
850
  left_on: nil,
@@ -322,6 +891,43 @@ module Polars
322
891
  )
323
892
  end
324
893
 
894
+ # Add or overwrite multiple columns in a DataFrame.
895
+ #
896
+ # @param exprs [Object]
897
+ # List of Expressions that evaluate to columns.
898
+ #
899
+ # @return [LazyFrame]
900
+ #
901
+ # @example
902
+ # ldf = Polars::DataFrame.new(
903
+ # {
904
+ # "a" => [1, 2, 3, 4],
905
+ # "b" => [0.5, 4, 10, 13],
906
+ # "c" => [true, true, false, true]
907
+ # }
908
+ # ).lazy
909
+ # ldf.with_columns(
910
+ # [
911
+ # (Polars.col("a") ** 2).alias("a^2"),
912
+ # (Polars.col("b") / 2).alias("b/2"),
913
+ # (Polars.col("c").is_not()).alias("not c")
914
+ # ]
915
+ # ).collect
916
+ # # =>
917
+ # # shape: (4, 6)
918
+ # # ┌─────┬──────┬───────┬──────┬──────┬───────┐
919
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
920
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
921
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │
922
+ # # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
923
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │
924
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
925
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │
926
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
927
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
928
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
929
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
930
+ # # └─────┴──────┴───────┴──────┴──────┴───────┘
325
931
  def with_columns(exprs)
326
932
  exprs =
327
933
  if exprs.nil?
@@ -350,58 +956,343 @@ module Polars
350
956
  # def with_context
351
957
  # end
352
958
 
959
+ # Add or overwrite column in a DataFrame.
960
+ #
961
+ # @param column [Object]
962
+ # Expression that evaluates to column or a Series to use.
963
+ #
964
+ # @return [LazyFrame]
353
965
  #
966
+ # @example
967
+ # df = Polars::DataFrame.new(
968
+ # {
969
+ # "a" => [1, 3, 5],
970
+ # "b" => [2, 4, 6]
971
+ # }
972
+ # ).lazy
973
+ # df.with_column((Polars.col("b") ** 2).alias("b_squared")).collect
974
+ # # =>
975
+ # # shape: (3, 3)
976
+ # # ┌─────┬─────┬───────────┐
977
+ # # │ a ┆ b ┆ b_squared │
978
+ # # │ --- ┆ --- ┆ --- │
979
+ # # │ i64 ┆ i64 ┆ f64 │
980
+ # # ╞═════╪═════╪═══════════╡
981
+ # # │ 1 ┆ 2 ┆ 4.0 │
982
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
983
+ # # │ 3 ┆ 4 ┆ 16.0 │
984
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
985
+ # # │ 5 ┆ 6 ┆ 36.0 │
986
+ # # └─────┴─────┴───────────┘
987
+ #
988
+ # @example
989
+ # df.with_column(Polars.col("a") ** 2).collect
990
+ # # =>
991
+ # # shape: (3, 2)
992
+ # # ┌──────┬─────┐
993
+ # # │ a ┆ b │
994
+ # # │ --- ┆ --- │
995
+ # # │ f64 ┆ i64 │
996
+ # # ╞══════╪═════╡
997
+ # # │ 1.0 ┆ 2 │
998
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
999
+ # # │ 9.0 ┆ 4 │
1000
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
1001
+ # # │ 25.0 ┆ 6 │
1002
+ # # └──────┴─────┘
354
1003
  def with_column(column)
355
1004
  with_columns([column])
356
1005
  end
357
1006
 
358
- # def drop
359
- # end
1007
+ # Remove one or multiple columns from a DataFrame.
1008
+ #
1009
+ # @param columns [Object]
1010
+ # - Name of the column that should be removed.
1011
+ # - List of column names.
1012
+ #
1013
+ # @return [LazyFrame]
1014
+ def drop(columns)
1015
+ if columns.is_a?(String)
1016
+ columns = [columns]
1017
+ end
1018
+ _from_rbldf(_ldf.drop_columns(columns))
1019
+ end
360
1020
 
1021
+ # Rename column names.
1022
+ #
1023
+ # @param mapping [Hash]
1024
+ # Key value pairs that map from old name to new name.
361
1025
  #
1026
+ # @return [LazyFrame]
362
1027
  def rename(mapping)
363
1028
  existing = mapping.keys
364
1029
  _new = mapping.values
365
1030
  _from_rbldf(_ldf.rename(existing, _new))
366
1031
  end
367
1032
 
368
- # def reverse
369
- # end
1033
+ # Reverse the DataFrame.
1034
+ #
1035
+ # @return [LazyFrame]
1036
+ def reverse
1037
+ _from_rbldf(_ldf.reverse)
1038
+ end
370
1039
 
371
- # def shift
372
- # end
1040
+ # Shift the values by a given period.
1041
+ #
1042
+ # @param periods [Integer]
1043
+ # Number of places to shift (may be negative).
1044
+ #
1045
+ # @return [LazyFrame]
1046
+ #
1047
+ # @example
1048
+ # df = Polars::DataFrame.new(
1049
+ # {
1050
+ # "a" => [1, 3, 5],
1051
+ # "b" => [2, 4, 6]
1052
+ # }
1053
+ # ).lazy
1054
+ # df.shift(1).collect
1055
+ # # =>
1056
+ # # shape: (3, 2)
1057
+ # # ┌──────┬──────┐
1058
+ # # │ a ┆ b │
1059
+ # # │ --- ┆ --- │
1060
+ # # │ i64 ┆ i64 │
1061
+ # # ╞══════╪══════╡
1062
+ # # │ null ┆ null │
1063
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1064
+ # # │ 1 ┆ 2 │
1065
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1066
+ # # │ 3 ┆ 4 │
1067
+ # # └──────┴──────┘
1068
+ #
1069
+ # @example
1070
+ # df.shift(-1).collect
1071
+ # # =>
1072
+ # # shape: (3, 2)
1073
+ # # ┌──────┬──────┐
1074
+ # # │ a ┆ b │
1075
+ # # │ --- ┆ --- │
1076
+ # # │ i64 ┆ i64 │
1077
+ # # ╞══════╪══════╡
1078
+ # # │ 3 ┆ 4 │
1079
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1080
+ # # │ 5 ┆ 6 │
1081
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1082
+ # # │ null ┆ null │
1083
+ # # └──────┴──────┘
1084
+ def shift(periods)
1085
+ _from_rbldf(_ldf.shift(periods))
1086
+ end
373
1087
 
374
- # def shift_and_fill
375
- # end
1088
+ # Shift the values by a given period and fill the resulting null values.
1089
+ #
1090
+ # @param periods [Integer]
1091
+ # Number of places to shift (may be negative).
1092
+ # @param fill_value [Object]
1093
+ # Fill `nil` values with the result of this expression.
1094
+ #
1095
+ # @return [LazyFrame]
1096
+ #
1097
+ # @example
1098
+ # df = Polars::DataFrame.new(
1099
+ # {
1100
+ # "a" => [1, 3, 5],
1101
+ # "b" => [2, 4, 6]
1102
+ # }
1103
+ # ).lazy
1104
+ # df.shift_and_fill(1, 0).collect
1105
+ # # =>
1106
+ # # shape: (3, 2)
1107
+ # # ┌─────┬─────┐
1108
+ # # │ a ┆ b │
1109
+ # # │ --- ┆ --- │
1110
+ # # │ i64 ┆ i64 │
1111
+ # # ╞═════╪═════╡
1112
+ # # │ 0 ┆ 0 │
1113
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1114
+ # # │ 1 ┆ 2 │
1115
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1116
+ # # │ 3 ┆ 4 │
1117
+ # # └─────┴─────┘
1118
+ #
1119
+ # @example
1120
+ # df.shift_and_fill(-1, 0).collect
1121
+ # # =>
1122
+ # # shape: (3, 2)
1123
+ # # ┌─────┬─────┐
1124
+ # # │ a ┆ b │
1125
+ # # │ --- ┆ --- │
1126
+ # # │ i64 ┆ i64 │
1127
+ # # ╞═════╪═════╡
1128
+ # # │ 3 ┆ 4 │
1129
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1130
+ # # │ 5 ┆ 6 │
1131
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1132
+ # # │ 0 ┆ 0 │
1133
+ # # └─────┴─────┘
1134
+ def shift_and_fill(periods, fill_value)
1135
+ if !fill_value.is_a?(Expr)
1136
+ fill_value = Polars.lit(fill_value)
1137
+ end
1138
+ _from_rbldf(_ldf.shift_and_fill(periods, fill_value._rbexpr))
1139
+ end
376
1140
 
377
- # def slice
378
- # end
1141
+ # Get a slice of this DataFrame.
1142
+ #
1143
+ # @param offset [Integer]
1144
+ # Start index. Negative indexing is supported.
1145
+ # @param length [Integer]
1146
+ # Length of the slice. If set to `nil`, all rows starting at the offset
1147
+ # will be selected.
1148
+ #
1149
+ # @return [LazyFrame]
1150
+ #
1151
+ # @example
1152
+ # df = Polars::DataFrame.new(
1153
+ # {
1154
+ # "a" => ["x", "y", "z"],
1155
+ # "b" => [1, 3, 5],
1156
+ # "c" => [2, 4, 6]
1157
+ # }
1158
+ # ).lazy
1159
+ # df.slice(1, 2).collect
1160
+ # # =>
1161
+ # # shape: (2, 3)
1162
+ # # ┌─────┬─────┬─────┐
1163
+ # # │ a ┆ b ┆ c │
1164
+ # # │ --- ┆ --- ┆ --- │
1165
+ # # │ str ┆ i64 ┆ i64 │
1166
+ # # ╞═════╪═════╪═════╡
1167
+ # # │ y ┆ 3 ┆ 4 │
1168
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1169
+ # # │ z ┆ 5 ┆ 6 │
1170
+ # # └─────┴─────┴─────┘
1171
+ def slice(offset, length = nil)
1172
+ if length && length < 0
1173
+ raise ArgumentError, "Negative slice lengths (#{length}) are invalid for LazyFrame"
1174
+ end
1175
+ _from_rbldf(_ldf.slice(offset, length))
1176
+ end
379
1177
 
380
- # def limit
381
- # end
1178
+ # Get the first `n` rows.
1179
+ #
1180
+ # Alias for {#head}.
1181
+ #
1182
+ # @param n [Integer]
1183
+ # Number of rows to return.
1184
+ #
1185
+ # @return [LazyFrame]
1186
+ #
1187
+ # @note
1188
+ # Consider using the {#fetch} operation if you only want to test your
1189
+ # query. The {#fetch} operation will load the first `n` rows at the scan
1190
+ # level, whereas the {#head}/{#limit} are applied at the end.
1191
+ def limit(n = 5)
1192
+ head(5)
1193
+ end
382
1194
 
383
- # def head
384
- # end
1195
+ # Get the first `n` rows.
1196
+ #
1197
+ # @param n [Integer]
1198
+ # Number of rows to return.
1199
+ #
1200
+ # @return [LazyFrame]
1201
+ #
1202
+ # @note
1203
+ # Consider using the {#fetch} operation if you only want to test your
1204
+ # query. The {#fetch} operation will load the first `n` rows at the scan
1205
+ # level, whereas the {#head}/{#limit} are applied at the end.
1206
+ def head(n = 5)
1207
+ slice(0, n)
1208
+ end
385
1209
 
386
- # def tail
387
- # end
1210
+ # Get the last `n` rows.
1211
+ #
1212
+ # @param n [Integer]
1213
+ # Number of rows.
1214
+ #
1215
+ # @return [LazyFrame]
1216
+ def tail(n = 5)
1217
+ _from_rbldf(_ldf.tail(n))
1218
+ end
388
1219
 
389
- # def last
390
- # end
1220
+ # Get the last row of the DataFrame.
1221
+ #
1222
+ # @return [LazyFrame]
1223
+ def last
1224
+ tail(1)
1225
+ end
391
1226
 
392
- # def first
393
- # end
1227
+ # Get the first row of the DataFrame.
1228
+ #
1229
+ # @return [LazyFrame]
1230
+ def first
1231
+ slice(0, 1)
1232
+ end
394
1233
 
395
1234
  # def with_row_count
396
1235
  # end
397
1236
 
398
- # def take_every
399
- # end
1237
+ # Take every nth row in the LazyFrame and return as a new LazyFrame.
1238
+ #
1239
+ # @return [LazyFrame]
1240
+ #
1241
+ # @example
1242
+ # s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]}).lazy
1243
+ # s.take_every(2).collect
1244
+ # # =>
1245
+ # # shape: (2, 2)
1246
+ # # ┌─────┬─────┐
1247
+ # # │ a ┆ b │
1248
+ # # │ --- ┆ --- │
1249
+ # # │ i64 ┆ i64 │
1250
+ # # ╞═════╪═════╡
1251
+ # # │ 1 ┆ 5 │
1252
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1253
+ # # │ 3 ┆ 7 │
1254
+ # # └─────┴─────┘
1255
+ def take_every(n)
1256
+ select(Utils.col("*").take_every(n))
1257
+ end
400
1258
 
401
1259
  # def fill_null
402
1260
  # end
403
1261
 
1262
+ # Fill floating point NaN values.
1263
+ #
1264
+ # @param fill_value [Object]
1265
+ # Value to fill the NaN values with.
1266
+ #
1267
+ # @return [LazyFrame]
1268
+ #
1269
+ # @note
1270
+ # Note that floating point NaN (Not a Number) are not missing values!
1271
+ # To replace missing values, use `fill_null` instead.
404
1272
  #
1273
+ # @example
1274
+ # df = Polars::DataFrame.new(
1275
+ # {
1276
+ # "a" => [1.5, 2, Float::NAN, 4],
1277
+ # "b" => [0.5, 4, Float::NAN, 13],
1278
+ # }
1279
+ # ).lazy
1280
+ # df.fill_nan(99).collect
1281
+ # # =>
1282
+ # # shape: (4, 2)
1283
+ # # ┌──────┬──────┐
1284
+ # # │ a ┆ b │
1285
+ # # │ --- ┆ --- │
1286
+ # # │ f64 ┆ f64 │
1287
+ # # ╞══════╪══════╡
1288
+ # # │ 1.5 ┆ 0.5 │
1289
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1290
+ # # │ 2.0 ┆ 4.0 │
1291
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1292
+ # # │ 99.0 ┆ 99.0 │
1293
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1294
+ # # │ 4.0 ┆ 13.0 │
1295
+ # # └──────┴──────┘
405
1296
  def fill_nan(fill_value)
406
1297
  if !fill_value.is_a?(Expr)
407
1298
  fill_value = Utils.lit(fill_value)
@@ -409,38 +1300,255 @@ module Polars
409
1300
  _from_rbldf(_ldf.fill_nan(fill_value._rbexpr))
410
1301
  end
411
1302
 
412
- # def std
413
- # end
1303
+ # Aggregate the columns in the LazyFrame to their standard deviation value.
1304
+ #
1305
+ # @return [LazyFrame]
1306
+ #
1307
+ # @example
1308
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1309
+ # df.std.collect
1310
+ # # =>
1311
+ # # shape: (1, 2)
1312
+ # # ┌──────────┬─────┐
1313
+ # # │ a ┆ b │
1314
+ # # │ --- ┆ --- │
1315
+ # # │ f64 ┆ f64 │
1316
+ # # ╞══════════╪═════╡
1317
+ # # │ 1.290994 ┆ 0.5 │
1318
+ # # └──────────┴─────┘
1319
+ #
1320
+ # @example
1321
+ # df.std(ddof: 0).collect
1322
+ # # =>
1323
+ # # shape: (1, 2)
1324
+ # # ┌──────────┬──────────┐
1325
+ # # │ a ┆ b │
1326
+ # # │ --- ┆ --- │
1327
+ # # │ f64 ┆ f64 │
1328
+ # # ╞══════════╪══════════╡
1329
+ # # │ 1.118034 ┆ 0.433013 │
1330
+ # # └──────────┴──────────┘
1331
+ def std(ddof: 1)
1332
+ _from_rbldf(_ldf.std(ddof))
1333
+ end
414
1334
 
415
- # def var
416
- # end
1335
+ # Aggregate the columns in the LazyFrame to their variance value.
1336
+ #
1337
+ # @return [LazyFrame]
1338
+ #
1339
+ # @example
1340
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1341
+ # df.var.collect
1342
+ # # =>
1343
+ # # shape: (1, 2)
1344
+ # # ┌──────────┬──────┐
1345
+ # # │ a ┆ b │
1346
+ # # │ --- ┆ --- │
1347
+ # # │ f64 ┆ f64 │
1348
+ # # ╞══════════╪══════╡
1349
+ # # │ 1.666667 ┆ 0.25 │
1350
+ # # └──────────┴──────┘
1351
+ #
1352
+ # @example
1353
+ # df.var(ddof: 0).collect
1354
+ # # =>
1355
+ # # shape: (1, 2)
1356
+ # # ┌──────┬────────┐
1357
+ # # │ a ┆ b │
1358
+ # # │ --- ┆ --- │
1359
+ # # │ f64 ┆ f64 │
1360
+ # # ╞══════╪════════╡
1361
+ # # │ 1.25 ┆ 0.1875 │
1362
+ # # └──────┴────────┘
1363
+ def var(ddof: 1)
1364
+ _from_rbldf(_ldf.var(ddof))
1365
+ end
417
1366
 
418
- # def max
419
- # end
1367
+ # Aggregate the columns in the LazyFrame to their maximum value.
1368
+ #
1369
+ # @return [LazyFrame]
1370
+ #
1371
+ # @example
1372
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1373
+ # df.max.collect
1374
+ # # =>
1375
+ # # shape: (1, 2)
1376
+ # # ┌─────┬─────┐
1377
+ # # │ a ┆ b │
1378
+ # # │ --- ┆ --- │
1379
+ # # │ i64 ┆ i64 │
1380
+ # # ╞═════╪═════╡
1381
+ # # │ 4 ┆ 2 │
1382
+ # # └─────┴─────┘
1383
+ def max
1384
+ _from_rbldf(_ldf.max)
1385
+ end
420
1386
 
421
- # def min
422
- # end
1387
+ # Aggregate the columns in the LazyFrame to their minimum value.
1388
+ #
1389
+ # @return [LazyFrame]
1390
+ #
1391
+ # @example
1392
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1393
+ # df.min.collect
1394
+ # # =>
1395
+ # # shape: (1, 2)
1396
+ # # ┌─────┬─────┐
1397
+ # # │ a ┆ b │
1398
+ # # │ --- ┆ --- │
1399
+ # # │ i64 ┆ i64 │
1400
+ # # ╞═════╪═════╡
1401
+ # # │ 1 ┆ 1 │
1402
+ # # └─────┴─────┘
1403
+ def min
1404
+ _from_rbldf(_ldf.min)
1405
+ end
423
1406
 
424
- # def sum
425
- # end
1407
+ # Aggregate the columns in the LazyFrame to their sum value.
1408
+ #
1409
+ # @return [LazyFrame]
1410
+ #
1411
+ # @example
1412
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1413
+ # df.sum.collect
1414
+ # # =>
1415
+ # # shape: (1, 2)
1416
+ # # ┌─────┬─────┐
1417
+ # # │ a ┆ b │
1418
+ # # │ --- ┆ --- │
1419
+ # # │ i64 ┆ i64 │
1420
+ # # ╞═════╪═════╡
1421
+ # # │ 10 ┆ 5 │
1422
+ # # └─────┴─────┘
1423
+ def sum
1424
+ _from_rbldf(_ldf.sum)
1425
+ end
426
1426
 
427
- # def mean
428
- # end
1427
+ # Aggregate the columns in the LazyFrame to their mean value.
1428
+ #
1429
+ # @return [LazyFrame]
1430
+ #
1431
+ # @example
1432
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1433
+ # df.mean.collect
1434
+ # # =>
1435
+ # # shape: (1, 2)
1436
+ # # ┌─────┬──────┐
1437
+ # # │ a ┆ b │
1438
+ # # │ --- ┆ --- │
1439
+ # # │ f64 ┆ f64 │
1440
+ # # ╞═════╪══════╡
1441
+ # # │ 2.5 ┆ 1.25 │
1442
+ # # └─────┴──────┘
1443
+ def mean
1444
+ _from_rbldf(_ldf.mean)
1445
+ end
429
1446
 
430
- # def median
431
- # end
1447
+ # Aggregate the columns in the LazyFrame to their median value.
1448
+ #
1449
+ # @return [LazyFrame]
1450
+ #
1451
+ # @example
1452
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1453
+ # df.median.collect
1454
+ # # =>
1455
+ # # shape: (1, 2)
1456
+ # # ┌─────┬─────┐
1457
+ # # │ a ┆ b │
1458
+ # # │ --- ┆ --- │
1459
+ # # │ f64 ┆ f64 │
1460
+ # # ╞═════╪═════╡
1461
+ # # │ 2.5 ┆ 1.0 │
1462
+ # # └─────┴─────┘
1463
+ def median
1464
+ _from_rbldf(_ldf.median)
1465
+ end
432
1466
 
433
- # def quantile
434
- # end
1467
+ # Aggregate the columns in the LazyFrame to their quantile value.
1468
+ #
1469
+ # @param quantile [Float]
1470
+ # Quantile between 0.0 and 1.0.
1471
+ # @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
1472
+ # Interpolation method.
1473
+ #
1474
+ # @return [LazyFrame]
1475
+ #
1476
+ # @example
1477
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1478
+ # df.quantile(0.7).collect
1479
+ # # =>
1480
+ # # shape: (1, 2)
1481
+ # # ┌─────┬─────┐
1482
+ # # │ a ┆ b │
1483
+ # # │ --- ┆ --- │
1484
+ # # │ f64 ┆ f64 │
1485
+ # # ╞═════╪═════╡
1486
+ # # │ 3.0 ┆ 1.0 │
1487
+ # # └─────┴─────┘
1488
+ def quantile(quantile, interpolation: "nearest")
1489
+ _from_rbldf(_ldf.quantile(quantile, interpolation))
1490
+ end
435
1491
 
1492
+ # Explode lists to long format.
1493
+ #
1494
+ # @return [LazyFrame]
436
1495
  #
1496
+ # @example
1497
+ # df = Polars::DataFrame.new(
1498
+ # {
1499
+ # "letters" => ["a", "a", "b", "c"],
1500
+ # "numbers" => [[1], [2, 3], [4, 5], [6, 7, 8]],
1501
+ # }
1502
+ # ).lazy
1503
+ # df.explode("numbers").collect
1504
+ # # =>
1505
+ # # shape: (8, 2)
1506
+ # # ┌─────────┬─────────┐
1507
+ # # │ letters ┆ numbers │
1508
+ # # │ --- ┆ --- │
1509
+ # # │ str ┆ i64 │
1510
+ # # ╞═════════╪═════════╡
1511
+ # # │ a ┆ 1 │
1512
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
1513
+ # # │ a ┆ 2 │
1514
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
1515
+ # # │ a ┆ 3 │
1516
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
1517
+ # # │ b ┆ 4 │
1518
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
1519
+ # # │ b ┆ 5 │
1520
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
1521
+ # # │ c ┆ 6 │
1522
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
1523
+ # # │ c ┆ 7 │
1524
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
1525
+ # # │ c ┆ 8 │
1526
+ # # └─────────┴─────────┘
437
1527
  def explode(columns)
438
1528
  columns = Utils.selection_to_rbexpr_list(columns)
439
1529
  _from_rbldf(_ldf.explode(columns))
440
1530
  end
441
1531
 
442
- # def unique
443
- # end
1532
+ # Drop duplicate rows from this LazyFrame.
1533
+ #
1534
+ # Note that this fails if there is a column of type `List` in the LazyFrame or
1535
+ # subset.
1536
+ #
1537
+ # @param maintain_order [Boolean]
1538
+ # Keep the same order as the original LazyFrame. This requires more work to
1539
+ # compute.
1540
+ # @param subset [Object]
1541
+ # Subset to use to compare rows.
1542
+ # @param keep ["first", "last"]
1543
+ # Which of the duplicate rows to keep.
1544
+ #
1545
+ # @return [LazyFrame]
1546
+ def unique(maintain_order: true, subset: nil, keep: "first")
1547
+ if !subset.nil? && !subset.is_a?(Array)
1548
+ subset = [subset]
1549
+ end
1550
+ _from_rbldf(_ldf.unique(maintain_order, subset, keep))
1551
+ end
444
1552
 
445
1553
  # def drop_nulls
446
1554
  # end
@@ -451,11 +1559,97 @@ module Polars
451
1559
  # def map
452
1560
  # end
453
1561
 
454
- # def interpolate
455
- # end
1562
+ # Interpolate intermediate values. The interpolation method is linear.
1563
+ #
1564
+ # @return [LazyFrame]
1565
+ #
1566
+ # @example
1567
+ # df = Polars::DataFrame.new(
1568
+ # {
1569
+ # "foo" => [1, nil, 9, 10],
1570
+ # "bar" => [6, 7, 9, nil],
1571
+ # "baz" => [1, nil, nil, 9]
1572
+ # }
1573
+ # ).lazy
1574
+ # df.interpolate.collect
1575
+ # # =>
1576
+ # # shape: (4, 3)
1577
+ # # ┌─────┬──────┬─────┐
1578
+ # # │ foo ┆ bar ┆ baz │
1579
+ # # │ --- ┆ --- ┆ --- │
1580
+ # # │ i64 ┆ i64 ┆ i64 │
1581
+ # # ╞═════╪══════╪═════╡
1582
+ # # │ 1 ┆ 6 ┆ 1 │
1583
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
1584
+ # # │ 5 ┆ 7 ┆ 3 │
1585
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
1586
+ # # │ 9 ┆ 9 ┆ 6 │
1587
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
1588
+ # # │ 10 ┆ null ┆ 9 │
1589
+ # # └─────┴──────┴─────┘
1590
+ def interpolate
1591
+ select(Utils.col("*").interpolate)
1592
+ end
456
1593
 
457
- # def unnest
458
- # end
1594
+ # Decompose a struct into its fields.
1595
+ #
1596
+ # The fields will be inserted into the `LazyFrame` at the location of the
1597
+ # `struct` type.
1598
+ #
1599
+ # @param names [Object]
1600
+ # Names of the struct columns that will be decomposed into their fields.
1601
+ #
1602
+ # @return [LazyFrame]
1603
+ #
1604
+ # @example
1605
+ # df = (
1606
+ # Polars::DataFrame.new(
1607
+ # {
1608
+ # "before" => ["foo", "bar"],
1609
+ # "t_a" => [1, 2],
1610
+ # "t_b" => ["a", "b"],
1611
+ # "t_c" => [true, nil],
1612
+ # "t_d" => [[1, 2], [3]],
1613
+ # "after" => ["baz", "womp"]
1614
+ # }
1615
+ # )
1616
+ # .lazy
1617
+ # .select(
1618
+ # ["before", Polars.struct(Polars.col("^t_.$")).alias("t_struct"), "after"]
1619
+ # )
1620
+ # )
1621
+ # df.fetch
1622
+ # # =>
1623
+ # # shape: (2, 3)
1624
+ # # ┌────────┬─────────────────────┬───────┐
1625
+ # # │ before ┆ t_struct ┆ after │
1626
+ # # │ --- ┆ --- ┆ --- │
1627
+ # # │ str ┆ struct[4] ┆ str │
1628
+ # # ╞════════╪═════════════════════╪═══════╡
1629
+ # # │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │
1630
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1631
+ # # │ bar ┆ {2,"b",null,[3]} ┆ womp │
1632
+ # # └────────┴─────────────────────┴───────┘
1633
+ #
1634
+ # @example
1635
+ # df.unnest("t_struct").fetch
1636
+ # # =>
1637
+ # # shape: (2, 6)
1638
+ # # ┌────────┬─────┬─────┬──────┬───────────┬───────┐
1639
+ # # │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │
1640
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1641
+ # # │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │
1642
+ # # ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
1643
+ # # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │
1644
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1645
+ # # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
1646
+ # # └────────┴─────┴─────┴──────┴───────────┴───────┘
1647
+ def unnest(names)
1648
+ if names.is_a?(String)
1649
+ names = [names]
1650
+ end
1651
+ _from_rbldf(_ldf.unnest(names))
1652
+ end
459
1653
 
460
1654
  private
461
1655