polars-df 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -152,29 +152,98 @@ module Polars
152
152
  # def self.read_json
153
153
  # end
154
154
 
155
- # def columns
156
- # end
155
+ # Get or set column names.
156
+ #
157
+ # @return [Array]
158
+ #
159
+ # @example
160
+ # df = (
161
+ # Polars::DataFrame.new(
162
+ # {
163
+ # "foo" => [1, 2, 3],
164
+ # "bar" => [6, 7, 8],
165
+ # "ham" => ["a", "b", "c"]
166
+ # }
167
+ # )
168
+ # .lazy
169
+ # .select(["foo", "bar"])
170
+ # )
171
+ # df.columns
172
+ # # => ["foo", "bar"]
173
def columns
  # The column names are whatever the wrapped `_ldf` object reports.
  _ldf.columns
end
157
176
 
158
- # def dtypes
159
- # end
177
+ # Get dtypes of columns in LazyFrame.
178
+ #
179
+ # @return [Array]
180
+ #
181
+ # @example
182
+ # lf = Polars::DataFrame.new(
183
+ # {
184
+ # "foo" => [1, 2, 3],
185
+ # "bar" => [6.0, 7.0, 8.0],
186
+ # "ham" => ["a", "b", "c"]
187
+ # }
188
+ # ).lazy
189
+ # lf.dtypes
190
+ # # => [:i64, :f64, :str]
191
def dtypes
  # Delegates straight to the wrapped `_ldf` object.
  _ldf.dtypes
end
160
194
 
161
- # def schema
162
- # end
195
+ # Get the schema.
196
+ #
197
+ # @return [Hash]
198
+ #
199
+ # @example
200
+ # lf = Polars::DataFrame.new(
201
+ # {
202
+ # "foo" => [1, 2, 3],
203
+ # "bar" => [6.0, 7.0, 8.0],
204
+ # "ham" => ["a", "b", "c"]
205
+ # }
206
+ # ).lazy
207
+ # lf.schema
208
+ # # => {"foo"=>:i64, "bar"=>:f64, "ham"=>:str}
209
def schema
  # Delegates straight to the wrapped `_ldf` object.
  _ldf.schema
end
163
212
 
164
- # def width
165
- # end
213
+ # Get the width of the LazyFrame.
214
+ #
215
+ # @return [Integer]
216
+ #
217
+ # @example
218
+ # lf = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]}).lazy
219
+ # lf.width
220
+ # # => 2
221
def width
  # Delegates straight to the wrapped `_ldf` object.
  _ldf.width
end
166
224
 
167
- # def include?(key)
168
- # end
225
+ # Check if LazyFrame includes key.
226
+ #
227
+ # @return [Boolean]
228
def include?(key)
  # Membership is tested against the result of #columns.
  columns.include?(key)
end
169
231
 
170
232
  # clone handled by initialize_copy
171
233
 
172
234
  # def [](item)
173
235
  # end
174
236
 
175
- # def to_s
176
- # end
177
- # alias_method :inspect, :to_s
237
+ # Returns a string representing the LazyFrame.
238
+ #
239
+ # @return [String]
240
def to_s
  # Header line, a blank line, the unoptimized plan, then a trailing newline.
  header = "naive plan: (run LazyFrame#describe_optimized_plan to see the optimized plan)"
  "#{header}\n\n#{describe_plan}\n"
end
178
247
 
179
248
  # def write_json
180
249
  # end
@@ -182,22 +251,125 @@ module Polars
182
251
  # def pipe
183
252
  # end
184
253
 
185
- # def describe_plan
186
- # end
254
+ # Create a string representation of the unoptimized query plan.
255
+ #
256
+ # @return [String]
257
def describe_plan
  # Delegates straight to the wrapped `_ldf` object.
  _ldf.describe_plan
end
187
260
 
261
+ # Create a string representation of the optimized query plan.
262
+ #
263
+ # @return [String]
188
264
  # def describe_optimized_plan
189
265
  # end
190
266
 
191
267
  # def show_graph
192
268
  # end
193
269
 
194
- # def sort
195
- # end
270
+ # Sort the DataFrame.
271
+ #
272
+ # Sorting can be done by:
273
+ #
274
+ # - A single column name
275
+ # - An expression
276
+ # - Multiple expressions
277
+ #
278
+ # @param by [Object]
279
+ # Column (expressions) to sort by.
280
+ # @param reverse [Boolean]
281
+ # Sort in descending order.
282
+ # @param nulls_last [Boolean]
283
+ # Place null values last. Can only be used if sorted by a single column.
284
+ #
285
+ # @return [LazyFrame]
286
+ #
287
+ # @example
288
+ # df = Polars::DataFrame.new(
289
+ # {
290
+ # "foo" => [1, 2, 3],
291
+ # "bar" => [6.0, 7.0, 8.0],
292
+ # "ham" => ["a", "b", "c"]
293
+ # }
294
+ # ).lazy
295
+ # df.sort("foo", reverse: true).collect
296
+ # # =>
297
+ # # shape: (3, 3)
298
+ # # ┌─────┬─────┬─────┐
299
+ # # │ foo ┆ bar ┆ ham │
300
+ # # │ --- ┆ --- ┆ --- │
301
+ # # │ i64 ┆ f64 ┆ str │
302
+ # # ╞═════╪═════╪═════╡
303
+ # # │ 3 ┆ 8.0 ┆ c │
304
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
305
+ # # │ 2 ┆ 7.0 ┆ b │
306
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
307
+ # # │ 1 ┆ 6.0 ┆ a │
308
+ # # └─────┴─────┴─────┘
309
def sort(by, reverse: false, nulls_last: false)
  # A single column name goes through the name-based sort. Return immediately
  # so the expression-based path below does not run as well and replace this
  # result.
  return _from_rbldf(_ldf.sort(by, reverse, nulls_last)) if by.is_a?(String)

  # The expression-based sort is handed `reverse` as an array, so a lone
  # boolean is wrapped before being passed on.
  reverse = [reverse] if Utils.bool?(reverse)

  by = Utils.selection_to_rbexpr_list(by)
  _from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last))
end
196
320
 
197
321
  # def profile
198
322
  # end
199
323
 
324
+ # Collect into a DataFrame.
325
+ #
326
+ # Note: use {#fetch} if you want to run your query on the first `n` rows
327
+ # only. This can be a huge time saver in debugging queries.
328
+ #
329
+ # @param type_coercion [Boolean]
330
+ # Do type coercion optimization.
331
+ # @param predicate_pushdown [Boolean]
332
+ # Do predicate pushdown optimization.
333
+ # @param projection_pushdown [Boolean]
334
+ # Do projection pushdown optimization.
335
+ # @param simplify_expression [Boolean]
336
+ # Run simplify expressions optimization.
337
+ # @param string_cache [Boolean]
338
+ # This argument is deprecated. Please set the string cache globally.
339
+ # The argument will be ignored
340
+ # @param no_optimization [Boolean]
341
+ # Turn off (certain) optimizations.
342
+ # @param slice_pushdown [Boolean]
343
+ # Slice pushdown optimization.
344
+ # @param common_subplan_elimination [Boolean]
345
+ # Will try to cache branching subplans that occur on self-joins or unions.
346
+ # @param allow_streaming [Boolean]
347
+ # Run parts of the query in a streaming fashion (this is in an alpha state)
200
348
  #
349
+ # @return [DataFrame]
350
+ #
351
+ # @example
352
+ # df = Polars::DataFrame.new(
353
+ # {
354
+ # "a" => ["a", "b", "a", "b", "b", "c"],
355
+ # "b" => [1, 2, 3, 4, 5, 6],
356
+ # "c" => [6, 5, 4, 3, 2, 1]
357
+ # }
358
+ # ).lazy
359
+ # df.groupby("a", maintain_order: true).agg(Polars.all.sum).collect
360
+ # # =>
361
+ # # shape: (3, 3)
362
+ # # ┌─────┬─────┬─────┐
363
+ # # │ a ┆ b ┆ c │
364
+ # # │ --- ┆ --- ┆ --- │
365
+ # # │ str ┆ i64 ┆ i64 │
366
+ # # ╞═════╪═════╪═════╡
367
+ # # │ a ┆ 4 ┆ 10 │
368
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
369
+ # # │ b ┆ 11 ┆ 10 │
370
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
371
+ # # │ c ┆ 6 ┆ 1 │
372
+ # # └─────┴─────┴─────┘
201
373
  def collect(
202
374
  type_coercion: true,
203
375
  predicate_pushdown: true,
@@ -232,21 +404,184 @@ module Polars
232
404
  Utils.wrap_df(ldf.collect)
233
405
  end
234
406
 
235
- # def fetch
236
- # end
407
+ # Collect a small number of rows for debugging purposes.
408
+ #
409
+ # Fetch is like a {#collect} operation, but it overwrites the number of rows
410
+ # read by every scan operation. This is a utility that helps debug a query on a
411
+ # smaller number of rows.
412
+ #
413
+ # Note that the fetch does not guarantee the final number of rows in the
414
+ # DataFrame. Filter, join operations and a lower number of rows available in the
415
+ # scanned file influence the final number of rows.
416
+ #
417
+ # @param n_rows [Integer]
418
+ # Collect n_rows from the data sources.
419
+ # @param type_coercion [Boolean]
420
+ # Run type coercion optimization.
421
+ # @param predicate_pushdown [Boolean]
422
+ # Run predicate pushdown optimization.
423
+ # @param projection_pushdown [Boolean]
424
+ # Run projection pushdown optimization.
425
+ # @param simplify_expression [Boolean]
426
+ # Run simplify expressions optimization.
427
+ # @param string_cache [Boolean]
428
+ # This argument is deprecated. Please set the string cache globally.
429
+ # The argument will be ignored
430
+ # @param no_optimization [Boolean]
431
+ # Turn off optimizations.
432
+ # @param slice_pushdown [Boolean]
433
+ # Slice pushdown optimization
434
+ # @param common_subplan_elimination [Boolean]
435
+ # Will try to cache branching subplans that occur on self-joins or unions.
436
+ # @param allow_streaming [Boolean]
437
+ # Run parts of the query in a streaming fashion (this is in an alpha state)
438
+ #
439
+ # @return [DataFrame]
440
+ #
441
+ # @example
442
+ # df = Polars::DataFrame.new(
443
+ # {
444
+ # "a" => ["a", "b", "a", "b", "b", "c"],
445
+ # "b" => [1, 2, 3, 4, 5, 6],
446
+ # "c" => [6, 5, 4, 3, 2, 1]
447
+ # }
448
+ # ).lazy
449
+ # df.groupby("a", maintain_order: true).agg(Polars.all.sum).fetch(2)
450
+ # # =>
451
+ # # shape: (2, 3)
452
+ # # ┌─────┬─────┬─────┐
453
+ # # │ a ┆ b ┆ c │
454
+ # # │ --- ┆ --- ┆ --- │
455
+ # # │ str ┆ i64 ┆ i64 │
456
+ # # ╞═════╪═════╪═════╡
457
+ # # │ a ┆ 1 ┆ 6 │
458
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
459
+ # # │ b ┆ 2 ┆ 5 │
460
+ # # └─────┴─────┴─────┘
461
def fetch(
  n_rows = 500,
  type_coercion: true,
  predicate_pushdown: true,
  projection_pushdown: true,
  simplify_expression: true,
  string_cache: false,
  no_optimization: false,
  slice_pushdown: true,
  common_subplan_elimination: true,
  allow_streaming: false
)
  # `no_optimization` forces the three pushdown passes and subplan elimination
  # to false; the remaining toggles keep the caller's values.
  # `string_cache` is accepted but not read in this method.
  optimize = !no_optimization

  toggled = _ldf.optimization_toggle(
    type_coercion,
    optimize && predicate_pushdown,
    optimize && projection_pushdown,
    simplify_expression,
    optimize && slice_pushdown,
    optimize && common_subplan_elimination,
    allow_streaming
  )
  Utils.wrap_df(toggled.fetch(n_rows))
end
237
491
 
492
+ # Return lazy representation, i.e. itself.
238
493
  #
494
+ # Useful for writing code that expects either a `DataFrame` or
495
+ # `LazyFrame`.
496
+ #
497
+ # @return [LazyFrame]
498
+ #
499
+ # @example
500
+ # df = Polars::DataFrame.new(
501
+ # {
502
+ # "a" => [nil, 2, 3, 4],
503
+ # "b" => [0.5, nil, 2.5, 13],
504
+ # "c" => [true, true, false, nil]
505
+ # }
506
+ # )
507
+ # df.lazy
239
508
  def lazy
240
509
  self
241
510
  end
242
511
 
243
- # def cache
244
- # end
512
+ # Cache the result once the execution of the physical plan hits this node.
513
+ #
514
+ # @return [LazyFrame]
515
def cache
  # Ask the wrapped `_ldf` object for its cached variant, then re-wrap it.
  cached = _ldf.cache
  _from_rbldf(cached)
end
245
518
 
246
- # def cleared
247
- # end
519
+ # Create an empty copy of the current LazyFrame.
520
+ #
521
+ # The copy has an identical schema but no data.
522
+ #
523
+ # @return [LazyFrame]
524
+ #
525
+ # @example
526
+ # df = Polars::DataFrame.new(
527
+ # {
528
+ # "a" => [nil, 2, 3, 4],
529
+ # "b" => [0.5, nil, 2.5, 13],
530
+ # "c" => [true, true, false, nil],
531
+ # }
532
+ # ).lazy
533
+ # df.cleared.fetch
534
+ # # =>
535
+ # # shape: (0, 3)
536
+ # # ┌─────┬─────┬──────┐
537
+ # # │ a ┆ b ┆ c │
538
+ # # │ --- ┆ --- ┆ --- │
539
+ # # │ i64 ┆ f64 ┆ bool │
540
+ # # ╞═════╪═════╪══════╡
541
+ # # └─────┴─────┴──────┘
542
def cleared
  # Build a DataFrame passing only this frame's schema as `columns:`,
  # then convert it back to its lazy form.
  schema_only = DataFrame.new(columns: schema)
  schema_only.lazy
end
248
545
 
546
+ # Filter the rows in the DataFrame based on a predicate expression.
249
547
  #
548
+ # @param predicate [Object]
549
+ # Expression that evaluates to a boolean Series.
550
+ #
551
+ # @return [LazyFrame]
552
+ #
553
+ # @example Filter on one condition:
554
+ # lf = Polars::DataFrame.new(
555
+ # {
556
+ # "foo" => [1, 2, 3],
557
+ # "bar" => [6, 7, 8],
558
+ # "ham" => ["a", "b", "c"]
559
+ # }
560
+ # ).lazy
561
+ # lf.filter(Polars.col("foo") < 3).collect()
562
+ # # =>
563
+ # # shape: (2, 3)
564
+ # # ┌─────┬─────┬─────┐
565
+ # # │ foo ┆ bar ┆ ham │
566
+ # # │ --- ┆ --- ┆ --- │
567
+ # # │ i64 ┆ i64 ┆ str │
568
+ # # ╞═════╪═════╪═════╡
569
+ # # │ 1 ┆ 6 ┆ a │
570
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
571
+ # # │ 2 ┆ 7 ┆ b │
572
+ # # └─────┴─────┴─────┘
573
+ #
574
+ # @example Filter on multiple conditions:
575
+ # lf.filter((Polars.col("foo") < 3) & (Polars.col("ham") == "a")).collect
576
+ # # =>
577
+ # # shape: (1, 3)
578
+ # # ┌─────┬─────┬─────┐
579
+ # # │ foo ┆ bar ┆ ham │
580
+ # # │ --- ┆ --- ┆ --- │
581
+ # # │ i64 ┆ i64 ┆ str │
582
+ # # ╞═════╪═════╪═════╡
583
+ # # │ 1 ┆ 6 ┆ a │
584
+ # # └─────┴─────┴─────┘
250
585
  def filter(predicate)
251
586
  _from_rbldf(
252
587
  _ldf.filter(
@@ -255,11 +590,136 @@ module Polars
255
590
  )
256
591
  end
257
592
 
593
+ # Select columns from this DataFrame.
594
+ #
595
+ # @param exprs [Object]
596
+ # Column or columns to select.
597
+ #
598
+ # @return [LazyFrame]
599
+ #
600
+ # @example
601
+ # df = Polars::DataFrame.new(
602
+ # {
603
+ # "foo" => [1, 2, 3],
604
+ # "bar" => [6, 7, 8],
605
+ # "ham" => ["a", "b", "c"],
606
+ # }
607
+ # ).lazy
608
+ # df.select("foo").collect
609
+ # # =>
610
+ # # shape: (3, 1)
611
+ # # ┌─────┐
612
+ # # │ foo │
613
+ # # │ --- │
614
+ # # │ i64 │
615
+ # # ╞═════╡
616
+ # # │ 1 │
617
+ # # ├╌╌╌╌╌┤
618
+ # # │ 2 │
619
+ # # ├╌╌╌╌╌┤
620
+ # # │ 3 │
621
+ # # └─────┘
622
+ #
623
+ # @example
624
+ # df.select(["foo", "bar"]).collect
625
+ # # =>
626
+ # # shape: (3, 2)
627
+ # # ┌─────┬─────┐
628
+ # # │ foo ┆ bar │
629
+ # # │ --- ┆ --- │
630
+ # # │ i64 ┆ i64 │
631
+ # # ╞═════╪═════╡
632
+ # # │ 1 ┆ 6 │
633
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
634
+ # # │ 2 ┆ 7 │
635
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
636
+ # # │ 3 ┆ 8 │
637
+ # # └─────┴─────┘
638
+ #
639
+ # @example
640
+ # df.select(Polars.col("foo") + 1).collect
641
+ # # =>
642
+ # # shape: (3, 1)
643
+ # # ┌─────┐
644
+ # # │ foo │
645
+ # # │ --- │
646
+ # # │ i64 │
647
+ # # ╞═════╡
648
+ # # │ 2 │
649
+ # # ├╌╌╌╌╌┤
650
+ # # │ 3 │
651
+ # # ├╌╌╌╌╌┤
652
+ # # │ 4 │
653
+ # # └─────┘
654
+ #
655
+ # @example
656
+ # df.select([Polars.col("foo") + 1, Polars.col("bar") + 1]).collect
657
+ # # =>
658
+ # # shape: (3, 2)
659
+ # # ┌─────┬─────┐
660
+ # # │ foo ┆ bar │
661
+ # # │ --- ┆ --- │
662
+ # # │ i64 ┆ i64 │
663
+ # # ╞═════╪═════╡
664
+ # # │ 2 ┆ 7 │
665
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
666
+ # # │ 3 ┆ 8 │
667
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
668
+ # # │ 4 ┆ 9 │
669
+ # # └─────┴─────┘
670
+ #
671
+ # @example
672
+ # df.select(Polars.when(Polars.col("foo") > 2).then(10).otherwise(0)).collect
673
+ # # =>
674
+ # # shape: (3, 1)
675
+ # # ┌─────────┐
676
+ # # │ literal │
677
+ # # │ --- │
678
+ # # │ i64 │
679
+ # # ╞═════════╡
680
+ # # │ 0 │
681
+ # # ├╌╌╌╌╌╌╌╌╌┤
682
+ # # │ 0 │
683
+ # # ├╌╌╌╌╌╌╌╌╌┤
684
+ # # │ 10 │
685
+ # # └─────────┘
258
686
  def select(exprs)
259
687
  exprs = Utils.selection_to_rbexpr_list(exprs)
260
688
  _from_rbldf(_ldf.select(exprs))
261
689
  end
262
690
 
691
+ # Start a groupby operation.
692
+ #
693
+ # @param by [Object]
694
+ # Column(s) to group by.
695
+ # @param maintain_order [Boolean]
696
+ # Make sure that the order of the groups remain consistent. This is more
697
+ # expensive than a default groupby.
698
+ #
699
+ # @return [LazyGroupBy]
700
+ #
701
+ # @example
702
+ # df = Polars::DataFrame.new(
703
+ # {
704
+ # "a" => ["a", "b", "a", "b", "b", "c"],
705
+ # "b" => [1, 2, 3, 4, 5, 6],
706
+ # "c" => [6, 5, 4, 3, 2, 1]
707
+ # }
708
+ # ).lazy
709
+ # df.groupby("a", maintain_order: true).agg(Polars.col("b").sum).collect
710
+ # # =>
711
+ # # shape: (3, 2)
712
+ # # ┌─────┬─────┐
713
+ # # │ a ┆ b │
714
+ # # │ --- ┆ --- │
715
+ # # │ str ┆ i64 │
716
+ # # ╞═════╪═════╡
717
+ # # │ a ┆ 4 │
718
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
719
+ # # │ b ┆ 11 │
720
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
721
+ # # │ c ┆ 6 │
722
+ # # └─────┴─────┘
263
723
  def groupby(by, maintain_order: false)
264
724
  rbexprs_by = Utils.selection_to_rbexpr_list(by)
265
725
  lgb = _ldf.groupby(rbexprs_by, maintain_order)
@@ -275,7 +735,116 @@ module Polars
275
735
  # def join_asof
276
736
  # end
277
737
 
738
+ # Add a join operation to the Logical Plan.
739
+ #
740
+ # @param other [LazyFrame]
741
+ # Lazy DataFrame to join with.
742
+ # @param left_on [Object]
743
+ # Join column of the left DataFrame.
744
+ # @param right_on [Object]
745
+ # Join column of the right DataFrame.
746
+ # @param on Object
747
+ # Join column of both DataFrames. If set, `left_on` and `right_on` should be
748
+ # None.
749
+ # @param how ["inner", "left", "outer", "semi", "anti", "cross"]
750
+ # Join strategy.
751
+ # @param suffix [String]
752
+ # Suffix to append to columns with a duplicate name.
753
+ # @param allow_parallel [Boolean]
754
+ # Allow the physical plan to optionally evaluate the computation of both
755
+ # DataFrames up to the join in parallel.
756
+ # @param force_parallel [Boolean]
757
+ # Force the physical plan to evaluate the computation of both DataFrames up to
758
+ # the join in parallel.
278
759
  #
760
+ # @return [LazyFrame]
761
+ #
762
+ # @example
763
+ # df = Polars::DataFrame.new(
764
+ # {
765
+ # "foo" => [1, 2, 3],
766
+ # "bar" => [6.0, 7.0, 8.0],
767
+ # "ham" => ["a", "b", "c"]
768
+ # }
769
+ # ).lazy
770
+ # other_df = Polars::DataFrame.new(
771
+ # {
772
+ # "apple" => ["x", "y", "z"],
773
+ # "ham" => ["a", "b", "d"]
774
+ # }
775
+ # ).lazy
776
+ # df.join(other_df, on: "ham").collect
777
+ # # =>
778
+ # # shape: (2, 4)
779
+ # # ┌─────┬─────┬─────┬───────┐
780
+ # # │ foo ┆ bar ┆ ham ┆ apple │
781
+ # # │ --- ┆ --- ┆ --- ┆ --- │
782
+ # # │ i64 ┆ f64 ┆ str ┆ str │
783
+ # # ╞═════╪═════╪═════╪═══════╡
784
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
785
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
786
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
787
+ # # └─────┴─────┴─────┴───────┘
788
+ #
789
+ # @example
790
+ # df.join(other_df, on: "ham", how: "outer").collect
791
+ # # =>
792
+ # # shape: (4, 4)
793
+ # # ┌──────┬──────┬─────┬───────┐
794
+ # # │ foo ┆ bar ┆ ham ┆ apple │
795
+ # # │ --- ┆ --- ┆ --- ┆ --- │
796
+ # # │ i64 ┆ f64 ┆ str ┆ str │
797
+ # # ╞══════╪══════╪═════╪═══════╡
798
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
799
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
800
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
801
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
802
+ # # │ null ┆ null ┆ d ┆ z │
803
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
804
+ # # │ 3 ┆ 8.0 ┆ c ┆ null │
805
+ # # └──────┴──────┴─────┴───────┘
806
+ #
807
+ # @example
808
+ # df.join(other_df, on: "ham", how: "left").collect
809
+ # # =>
810
+ # # shape: (3, 4)
811
+ # # ┌─────┬─────┬─────┬───────┐
812
+ # # │ foo ┆ bar ┆ ham ┆ apple │
813
+ # # │ --- ┆ --- ┆ --- ┆ --- │
814
+ # # │ i64 ┆ f64 ┆ str ┆ str │
815
+ # # ╞═════╪═════╪═════╪═══════╡
816
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
817
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
818
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
819
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
820
+ # # │ 3 ┆ 8.0 ┆ c ┆ null │
821
+ # # └─────┴─────┴─────┴───────┘
822
+ #
823
+ # @example
824
+ # df.join(other_df, on: "ham", how: "semi").collect
825
+ # # =>
826
+ # # shape: (2, 3)
827
+ # # ┌─────┬─────┬─────┐
828
+ # # │ foo ┆ bar ┆ ham │
829
+ # # │ --- ┆ --- ┆ --- │
830
+ # # │ i64 ┆ f64 ┆ str │
831
+ # # ╞═════╪═════╪═════╡
832
+ # # │ 1 ┆ 6.0 ┆ a │
833
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
834
+ # # │ 2 ┆ 7.0 ┆ b │
835
+ # # └─────┴─────┴─────┘
836
+ #
837
+ # @example
838
+ # df.join(other_df, on: "ham", how: "anti").collect
839
+ # # =>
840
+ # # shape: (1, 3)
841
+ # # ┌─────┬─────┬─────┐
842
+ # # │ foo ┆ bar ┆ ham │
843
+ # # │ --- ┆ --- ┆ --- │
844
+ # # │ i64 ┆ f64 ┆ str │
845
+ # # ╞═════╪═════╪═════╡
846
+ # # │ 3 ┆ 8.0 ┆ c │
847
+ # # └─────┴─────┴─────┘
279
848
  def join(
280
849
  other,
281
850
  left_on: nil,
@@ -322,6 +891,43 @@ module Polars
322
891
  )
323
892
  end
324
893
 
894
+ # Add or overwrite multiple columns in a DataFrame.
895
+ #
896
+ # @param exprs [Object]
897
+ # List of Expressions that evaluate to columns.
898
+ #
899
+ # @return [LazyFrame]
900
+ #
901
+ # @example
902
+ # ldf = Polars::DataFrame.new(
903
+ # {
904
+ # "a" => [1, 2, 3, 4],
905
+ # "b" => [0.5, 4, 10, 13],
906
+ # "c" => [true, true, false, true]
907
+ # }
908
+ # ).lazy
909
+ # ldf.with_columns(
910
+ # [
911
+ # (Polars.col("a") ** 2).alias("a^2"),
912
+ # (Polars.col("b") / 2).alias("b/2"),
913
+ # (Polars.col("c").is_not()).alias("not c")
914
+ # ]
915
+ # ).collect
916
+ # # =>
917
+ # # shape: (4, 6)
918
+ # # ┌─────┬──────┬───────┬──────┬──────┬───────┐
919
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
920
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
921
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │
922
+ # # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
923
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │
924
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
925
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │
926
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
927
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
928
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
929
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
930
+ # # └─────┴──────┴───────┴──────┴──────┴───────┘
325
931
  def with_columns(exprs)
326
932
  exprs =
327
933
  if exprs.nil?
@@ -350,58 +956,343 @@ module Polars
350
956
  # def with_context
351
957
  # end
352
958
 
959
+ # Add or overwrite column in a DataFrame.
960
+ #
961
+ # @param column [Object]
962
+ # Expression that evaluates to column or a Series to use.
963
+ #
964
+ # @return [LazyFrame]
353
965
  #
966
+ # @example
967
+ # df = Polars::DataFrame.new(
968
+ # {
969
+ # "a" => [1, 3, 5],
970
+ # "b" => [2, 4, 6]
971
+ # }
972
+ # ).lazy
973
+ # df.with_column((Polars.col("b") ** 2).alias("b_squared")).collect
974
+ # # =>
975
+ # # shape: (3, 3)
976
+ # # ┌─────┬─────┬───────────┐
977
+ # # │ a ┆ b ┆ b_squared │
978
+ # # │ --- ┆ --- ┆ --- │
979
+ # # │ i64 ┆ i64 ┆ f64 │
980
+ # # ╞═════╪═════╪═══════════╡
981
+ # # │ 1 ┆ 2 ┆ 4.0 │
982
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
983
+ # # │ 3 ┆ 4 ┆ 16.0 │
984
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
985
+ # # │ 5 ┆ 6 ┆ 36.0 │
986
+ # # └─────┴─────┴───────────┘
987
+ #
988
+ # @example
989
+ # df.with_column(Polars.col("a") ** 2).collect
990
+ # # =>
991
+ # # shape: (3, 2)
992
+ # # ┌──────┬─────┐
993
+ # # │ a ┆ b │
994
+ # # │ --- ┆ --- │
995
+ # # │ f64 ┆ i64 │
996
+ # # ╞══════╪═════╡
997
+ # # │ 1.0 ┆ 2 │
998
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
999
+ # # │ 9.0 ┆ 4 │
1000
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
1001
+ # # │ 25.0 ┆ 6 │
1002
+ # # └──────┴─────┘
354
1003
  def with_column(column)
355
1004
  with_columns([column])
356
1005
  end
357
1006
 
358
- # def drop
359
- # end
1007
+ # Remove one or multiple columns from a DataFrame.
1008
+ #
1009
+ # @param columns [Object]
1010
+ # - Name of the column that should be removed.
1011
+ # - List of column names.
1012
+ #
1013
+ # @return [LazyFrame]
1014
def drop(columns)
  # A lone column name is wrapped in an array; anything else is passed on as given.
  names = columns.is_a?(String) ? [columns] : columns
  _from_rbldf(_ldf.drop_columns(names))
end
360
1020
 
1021
+ # Rename column names.
1022
+ #
1023
+ # @param mapping [Hash]
1024
+ # Key value pairs that map from old name to new name.
361
1025
  #
1026
+ # @return [LazyFrame]
362
1027
  def rename(mapping)
363
1028
  existing = mapping.keys
364
1029
  _new = mapping.values
365
1030
  _from_rbldf(_ldf.rename(existing, _new))
366
1031
  end
367
1032
 
368
- # def reverse
369
- # end
1033
+ # Reverse the DataFrame.
1034
+ #
1035
+ # @return [LazyFrame]
1036
def reverse
  # Ask the wrapped `_ldf` object for the reversed plan, then re-wrap it.
  reversed = _ldf.reverse
  _from_rbldf(reversed)
end
370
1039
 
371
- # def shift
372
- # end
1040
+ # Shift the values by a given period.
1041
+ #
1042
+ # @param periods [Integer]
1043
+ # Number of places to shift (may be negative).
1044
+ #
1045
+ # @return [LazyFrame]
1046
+ #
1047
+ # @example
1048
+ # df = Polars::DataFrame.new(
1049
+ # {
1050
+ # "a" => [1, 3, 5],
1051
+ # "b" => [2, 4, 6]
1052
+ # }
1053
+ # ).lazy
1054
+ # df.shift(1).collect
1055
+ # # =>
1056
+ # # shape: (3, 2)
1057
+ # # ┌──────┬──────┐
1058
+ # # │ a ┆ b │
1059
+ # # │ --- ┆ --- │
1060
+ # # │ i64 ┆ i64 │
1061
+ # # ╞══════╪══════╡
1062
+ # # │ null ┆ null │
1063
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1064
+ # # │ 1 ┆ 2 │
1065
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1066
+ # # │ 3 ┆ 4 │
1067
+ # # └──────┴──────┘
1068
+ #
1069
+ # @example
1070
+ # df.shift(-1).collect
1071
+ # # =>
1072
+ # # shape: (3, 2)
1073
+ # # ┌──────┬──────┐
1074
+ # # │ a ┆ b │
1075
+ # # │ --- ┆ --- │
1076
+ # # │ i64 ┆ i64 │
1077
+ # # ╞══════╪══════╡
1078
+ # # │ 3 ┆ 4 │
1079
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1080
+ # # │ 5 ┆ 6 │
1081
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1082
+ # # │ null ┆ null │
1083
+ # # └──────┴──────┘
1084
def shift(periods)
  # `periods` is forwarded untouched to the wrapped `_ldf` object.
  shifted = _ldf.shift(periods)
  _from_rbldf(shifted)
end
373
1087
 
374
- # def shift_and_fill
375
- # end
1088
+ # Shift the values by a given period and fill the resulting null values.
1089
+ #
1090
+ # @param periods [Integer]
1091
+ # Number of places to shift (may be negative).
1092
+ # @param fill_value [Object]
1093
+ # Fill `nil` values with the result of this expression.
1094
+ #
1095
+ # @return [LazyFrame]
1096
+ #
1097
+ # @example
1098
+ # df = Polars::DataFrame.new(
1099
+ # {
1100
+ # "a" => [1, 3, 5],
1101
+ # "b" => [2, 4, 6]
1102
+ # }
1103
+ # ).lazy
1104
+ # df.shift_and_fill(1, 0).collect
1105
+ # # =>
1106
+ # # shape: (3, 2)
1107
+ # # ┌─────┬─────┐
1108
+ # # │ a ┆ b │
1109
+ # # │ --- ┆ --- │
1110
+ # # │ i64 ┆ i64 │
1111
+ # # ╞═════╪═════╡
1112
+ # # │ 0 ┆ 0 │
1113
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1114
+ # # │ 1 ┆ 2 │
1115
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1116
+ # # │ 3 ┆ 4 │
1117
+ # # └─────┴─────┘
1118
+ #
1119
+ # @example
1120
+ # df.shift_and_fill(-1, 0).collect
1121
+ # # =>
1122
+ # # shape: (3, 2)
1123
+ # # ┌─────┬─────┐
1124
+ # # │ a ┆ b │
1125
+ # # │ --- ┆ --- │
1126
+ # # │ i64 ┆ i64 │
1127
+ # # ╞═════╪═════╡
1128
+ # # │ 3 ┆ 4 │
1129
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1130
+ # # │ 5 ┆ 6 │
1131
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1132
+ # # │ 0 ┆ 0 │
1133
+ # # └─────┴─────┘
1134
def shift_and_fill(periods, fill_value)
  # Anything that is not already an Expr is turned into one with Polars.lit,
  # because the call below needs the value's `_rbexpr`.
  fill_expr = fill_value.is_a?(Expr) ? fill_value : Polars.lit(fill_value)
  _from_rbldf(_ldf.shift_and_fill(periods, fill_expr._rbexpr))
end
376
1140
 
377
- # def slice
378
- # end
1141
+ # Get a slice of this DataFrame.
1142
+ #
1143
+ # @param offset [Integer]
1144
+ # Start index. Negative indexing is supported.
1145
+ # @param length [Integer]
1146
+ # Length of the slice. If set to `nil`, all rows starting at the offset
1147
+ # will be selected.
1148
+ #
1149
+ # @return [LazyFrame]
1150
+ #
1151
+ # @example
1152
+ # df = Polars::DataFrame.new(
1153
+ # {
1154
+ # "a" => ["x", "y", "z"],
1155
+ # "b" => [1, 3, 5],
1156
+ # "c" => [2, 4, 6]
1157
+ # }
1158
+ # ).lazy
1159
+ # df.slice(1, 2).collect
1160
+ # # =>
1161
+ # # shape: (2, 3)
1162
+ # # ┌─────┬─────┬─────┐
1163
+ # # │ a ┆ b ┆ c │
1164
+ # # │ --- ┆ --- ┆ --- │
1165
+ # # │ str ┆ i64 ┆ i64 │
1166
+ # # ╞═════╪═════╪═════╡
1167
+ # # │ y ┆ 3 ┆ 4 │
1168
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1169
+ # # │ z ┆ 5 ┆ 6 │
1170
+ # # └─────┴─────┴─────┘
1171
def slice(offset, length = nil)
  # A nil length is allowed and skips the check; a negative length is rejected.
  negative_length = length && length < 0
  raise ArgumentError, "Negative slice lengths (#{length}) are invalid for LazyFrame" if negative_length

  _from_rbldf(_ldf.slice(offset, length))
end
379
1177
 
380
- # def limit
381
- # end
1178
+ # Get the first `n` rows.
1179
+ #
1180
+ # Alias for {#head}.
1181
+ #
1182
+ # @param n [Integer]
1183
+ # Number of rows to return.
1184
+ #
1185
+ # @return [LazyFrame]
1186
+ #
1187
+ # @note
1188
+ # Consider using the {#fetch} operation if you only want to test your
1189
+ # query. The {#fetch} operation will load the first `n` rows at the scan
1190
+ # level, whereas the {#head}/{#limit} are applied at the end.
1191
def limit(n = 5)
  # Forward the caller's `n` to #head; a hard-coded 5 would silently ignore
  # the argument.
  head(n)
end
382
1194
 
383
- # def head
384
- # end
1195
+ # Get the first `n` rows.
1196
+ #
1197
+ # @param n [Integer]
1198
+ # Number of rows to return.
1199
+ #
1200
+ # @return [LazyFrame]
1201
+ #
1202
+ # @note
1203
+ # Consider using the {#fetch} operation if you only want to test your
1204
+ # query. The {#fetch} operation will load the first `n` rows at the scan
1205
+ # level, whereas the {#head}/{#limit} are applied at the end.
1206
def head(n = 5)
  # Implemented as a slice starting at offset 0 with length `n`.
  slice(0, n)
end
385
1209
 
386
- # def tail
387
- # end
1210
+ # Get the last `n` rows.
1211
+ #
1212
+ # @param n [Integer]
1213
+ # Number of rows.
1214
+ #
1215
+ # @return [LazyFrame]
1216
def tail(n = 5)
  # `n` is forwarded untouched to the wrapped `_ldf` object.
  trailing = _ldf.tail(n)
  _from_rbldf(trailing)
end
388
1219
 
389
- # def last
390
- # end
1220
+ # Get the last row of the DataFrame.
1221
+ #
1222
+ # @return [LazyFrame]
1223
def last
  # Implemented as #tail with a count of 1.
  tail(1)
end
391
1226
 
392
- # def first
393
- # end
1227
+ # Get the first row of the DataFrame.
1228
+ #
1229
+ # @return [LazyFrame]
1230
def first
  # Implemented as a slice starting at offset 0 with length 1.
  slice(0, 1)
end
394
1233
 
395
1234
  # def with_row_count
396
1235
  # end
397
1236
 
398
- # def take_every
399
- # end
1237
+ # Take every nth row in the LazyFrame and return as a new LazyFrame.
1238
+ #
1239
+ # @return [LazyFrame]
1240
+ #
1241
+ # @example
1242
+ # s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]}).lazy
1243
+ # s.take_every(2).collect
1244
+ # # =>
1245
+ # # shape: (2, 2)
1246
+ # # ┌─────┬─────┐
1247
+ # # │ a ┆ b │
1248
+ # # │ --- ┆ --- │
1249
+ # # │ i64 ┆ i64 │
1250
+ # # ╞═════╪═════╡
1251
+ # # │ 1 ┆ 5 │
1252
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1253
+ # # │ 3 ┆ 7 │
1254
+ # # └─────┴─────┘
1255
def take_every(n)
  # Build a `take_every(n)` expression on the "*" column selector and run it through #select.
  every_nth = Utils.col("*").take_every(n)
  select(every_nth)
end
400
1258
 
401
1259
  # def fill_null
402
1260
  # end
403
1261
 
1262
+ # Fill floating point NaN values.
1263
+ #
1264
+ # @param fill_value [Object]
1265
+ # Value to fill the NaN values with.
1266
+ #
1267
+ # @return [LazyFrame]
1268
+ #
1269
+ # @note
1270
+ # Note that floating point NaN (Not a Number) are not missing values!
1271
+ # To replace missing values, use `fill_null` instead.
404
1272
  #
1273
+ # @example
1274
+ # df = Polars::DataFrame.new(
1275
+ # {
1276
+ # "a" => [1.5, 2, Float::NAN, 4],
1277
+ # "b" => [0.5, 4, Float::NAN, 13],
1278
+ # }
1279
+ # ).lazy
1280
+ # df.fill_nan(99).collect
1281
+ # # =>
1282
+ # # shape: (4, 2)
1283
+ # # ┌──────┬──────┐
1284
+ # # │ a ┆ b │
1285
+ # # │ --- ┆ --- │
1286
+ # # │ f64 ┆ f64 │
1287
+ # # ╞══════╪══════╡
1288
+ # # │ 1.5 ┆ 0.5 │
1289
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1290
+ # # │ 2.0 ┆ 4.0 │
1291
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1292
+ # # │ 99.0 ┆ 99.0 │
1293
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1294
+ # # │ 4.0 ┆ 13.0 │
1295
+ # # └──────┴──────┘
405
1296
  def fill_nan(fill_value)
406
1297
  if !fill_value.is_a?(Expr)
407
1298
  fill_value = Utils.lit(fill_value)
@@ -409,38 +1300,255 @@ module Polars
409
1300
  _from_rbldf(_ldf.fill_nan(fill_value._rbexpr))
410
1301
  end
411
1302
 
412
- # def std
413
- # end
1303
+ # Aggregate the columns in the LazyFrame to their standard deviation value.
1304
+ #
1305
+ # @return [LazyFrame]
1306
+ #
1307
+ # @example
1308
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1309
+ # df.std.collect
1310
+ # # =>
1311
+ # # shape: (1, 2)
1312
+ # # ┌──────────┬─────┐
1313
+ # # │ a ┆ b │
1314
+ # # │ --- ┆ --- │
1315
+ # # │ f64 ┆ f64 │
1316
+ # # ╞══════════╪═════╡
1317
+ # # │ 1.290994 ┆ 0.5 │
1318
+ # # └──────────┴─────┘
1319
+ #
1320
+ # @example
1321
+ # df.std(ddof: 0).collect
1322
+ # # =>
1323
+ # # shape: (1, 2)
1324
+ # # ┌──────────┬──────────┐
1325
+ # # │ a ┆ b │
1326
+ # # │ --- ┆ --- │
1327
+ # # │ f64 ┆ f64 │
1328
+ # # ╞══════════╪══════════╡
1329
+ # # │ 1.118034 ┆ 0.433013 │
1330
+ # # └──────────┴──────────┘
1331
+ def std(ddof: 1)
1332
+ _from_rbldf(_ldf.std(ddof))
1333
+ end
414
1334
 
415
- # def var
416
- # end
1335
+ # Aggregate the columns in the LazyFrame to their variance value.
1336
+ #
1337
+ # @return [LazyFrame]
1338
+ #
1339
+ # @example
1340
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1341
+ # df.var.collect
1342
+ # # =>
1343
+ # # shape: (1, 2)
1344
+ # # ┌──────────┬──────┐
1345
+ # # │ a ┆ b │
1346
+ # # │ --- ┆ --- │
1347
+ # # │ f64 ┆ f64 │
1348
+ # # ╞══════════╪══════╡
1349
+ # # │ 1.666667 ┆ 0.25 │
1350
+ # # └──────────┴──────┘
1351
+ #
1352
+ # @example
1353
+ # df.var(ddof: 0).collect
1354
+ # # =>
1355
+ # # shape: (1, 2)
1356
+ # # ┌──────┬────────┐
1357
+ # # │ a ┆ b │
1358
+ # # │ --- ┆ --- │
1359
+ # # │ f64 ┆ f64 │
1360
+ # # ╞══════╪════════╡
1361
+ # # │ 1.25 ┆ 0.1875 │
1362
+ # # └──────┴────────┘
1363
+ def var(ddof: 1)
1364
+ _from_rbldf(_ldf.var(ddof))
1365
+ end
417
1366
 
418
- # def max
419
- # end
1367
+ # Aggregate the columns in the LazyFrame to their maximum value.
1368
+ #
1369
+ # @return [LazyFrame]
1370
+ #
1371
+ # @example
1372
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1373
+ # df.max.collect
1374
+ # # =>
1375
+ # # shape: (1, 2)
1376
+ # # ┌─────┬─────┐
1377
+ # # │ a ┆ b │
1378
+ # # │ --- ┆ --- │
1379
+ # # │ i64 ┆ i64 │
1380
+ # # ╞═════╪═════╡
1381
+ # # │ 4 ┆ 2 │
1382
+ # # └─────┴─────┘
1383
+ def max
1384
+ _from_rbldf(_ldf.max)
1385
+ end
420
1386
 
421
- # def min
422
- # end
1387
+ # Aggregate the columns in the LazyFrame to their minimum value.
1388
+ #
1389
+ # @return [LazyFrame]
1390
+ #
1391
+ # @example
1392
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1393
+ # df.min.collect
1394
+ # # =>
1395
+ # # shape: (1, 2)
1396
+ # # ┌─────┬─────┐
1397
+ # # │ a ┆ b │
1398
+ # # │ --- ┆ --- │
1399
+ # # │ i64 ┆ i64 │
1400
+ # # ╞═════╪═════╡
1401
+ # # │ 1 ┆ 1 │
1402
+ # # └─────┴─────┘
1403
+ def min
1404
+ _from_rbldf(_ldf.min)
1405
+ end
423
1406
 
424
- # def sum
425
- # end
1407
+ # Aggregate the columns in the LazyFrame to their sum value.
1408
+ #
1409
+ # @return [LazyFrame]
1410
+ #
1411
+ # @example
1412
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1413
+ # df.sum.collect
1414
+ # # =>
1415
+ # # shape: (1, 2)
1416
+ # # ┌─────┬─────┐
1417
+ # # │ a ┆ b │
1418
+ # # │ --- ┆ --- │
1419
+ # # │ i64 ┆ i64 │
1420
+ # # ╞═════╪═════╡
1421
+ # # │ 10 ┆ 5 │
1422
+ # # └─────┴─────┘
1423
+ def sum
1424
+ _from_rbldf(_ldf.sum)
1425
+ end
426
1426
 
427
- # def mean
428
- # end
1427
+ # Aggregate the columns in the LazyFrame to their mean value.
1428
+ #
1429
+ # @return [LazyFrame]
1430
+ #
1431
+ # @example
1432
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1433
+ # df.mean.collect
1434
+ # # =>
1435
+ # # shape: (1, 2)
1436
+ # # ┌─────┬──────┐
1437
+ # # │ a ┆ b │
1438
+ # # │ --- ┆ --- │
1439
+ # # │ f64 ┆ f64 │
1440
+ # # ╞═════╪══════╡
1441
+ # # │ 2.5 ┆ 1.25 │
1442
+ # # └─────┴──────┘
1443
+ def mean
1444
+ _from_rbldf(_ldf.mean)
1445
+ end
429
1446
 
430
- # def median
431
- # end
1447
+ # Aggregate the columns in the LazyFrame to their median value.
1448
+ #
1449
+ # @return [LazyFrame]
1450
+ #
1451
+ # @example
1452
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1453
+ # df.median.collect
1454
+ # # =>
1455
+ # # shape: (1, 2)
1456
+ # # ┌─────┬─────┐
1457
+ # # │ a ┆ b │
1458
+ # # │ --- ┆ --- │
1459
+ # # │ f64 ┆ f64 │
1460
+ # # ╞═════╪═════╡
1461
+ # # │ 2.5 ┆ 1.0 │
1462
+ # # └─────┴─────┘
1463
+ def median
1464
+ _from_rbldf(_ldf.median)
1465
+ end
432
1466
 
433
- # def quantile
434
- # end
1467
+ # Aggregate the columns in the LazyFrame to their quantile value.
1468
+ #
1469
+ # @param quantile [Float]
1470
+ # Quantile between 0.0 and 1.0.
1471
+ # @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
1472
+ # Interpolation method.
1473
+ #
1474
+ # @return [LazyFrame]
1475
+ #
1476
+ # @example
1477
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1478
+ # df.quantile(0.7).collect
1479
+ # # =>
1480
+ # # shape: (1, 2)
1481
+ # # ┌─────┬─────┐
1482
+ # # │ a ┆ b │
1483
+ # # │ --- ┆ --- │
1484
+ # # │ f64 ┆ f64 │
1485
+ # # ╞═════╪═════╡
1486
+ # # │ 3.0 ┆ 1.0 │
1487
+ # # └─────┴─────┘
1488
+ def quantile(quantile, interpolation: "nearest")
1489
+ _from_rbldf(_ldf.quantile(quantile, interpolation))
1490
+ end
435
1491
 
1492
+ # Explode lists to long format.
1493
+ #
1494
+ # @return [LazyFrame]
436
1495
  #
1496
+ # @example
1497
+ # df = Polars::DataFrame.new(
1498
+ # {
1499
+ # "letters" => ["a", "a", "b", "c"],
1500
+ # "numbers" => [[1], [2, 3], [4, 5], [6, 7, 8]],
1501
+ # }
1502
+ # ).lazy
1503
+ # df.explode("numbers").collect
1504
+ # # =>
1505
+ # # shape: (8, 2)
1506
+ # # ┌─────────┬─────────┐
1507
+ # # │ letters ┆ numbers │
1508
+ # # │ --- ┆ --- │
1509
+ # # │ str ┆ i64 │
1510
+ # # ╞═════════╪═════════╡
1511
+ # # │ a ┆ 1 │
1512
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
1513
+ # # │ a ┆ 2 │
1514
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
1515
+ # # │ a ┆ 3 │
1516
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
1517
+ # # │ b ┆ 4 │
1518
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
1519
+ # # │ b ┆ 5 │
1520
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
1521
+ # # │ c ┆ 6 │
1522
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
1523
+ # # │ c ┆ 7 │
1524
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
1525
+ # # │ c ┆ 8 │
1526
+ # # └─────────┴─────────┘
437
1527
  def explode(columns)
438
1528
  columns = Utils.selection_to_rbexpr_list(columns)
439
1529
  _from_rbldf(_ldf.explode(columns))
440
1530
  end
441
1531
 
442
- # def unique
443
- # end
1532
+ # Drop duplicate rows from this LazyFrame.
1533
+ #
1534
+ # Note that this fails if there is a column of type `List` in the LazyFrame or
1535
+ # subset.
1536
+ #
1537
+ # @param maintain_order [Boolean]
1538
+ # Keep the same order as the original LazyFrame. This requires more work to
1539
+ # compute.
1540
+ # @param subset [Object]
1541
+ # Subset to use to compare rows.
1542
+ # @param keep ["first", "last"]
1543
+ # Which of the duplicate rows to keep.
1544
+ #
1545
+ # @return [LazyFrame]
1546
+ def unique(maintain_order: true, subset: nil, keep: "first")
1547
+ if !subset.nil? && !subset.is_a?(Array)
1548
+ subset = [subset]
1549
+ end
1550
+ _from_rbldf(_ldf.unique(maintain_order, subset, keep))
1551
+ end
444
1552
 
445
1553
  # def drop_nulls
446
1554
  # end
@@ -451,11 +1559,97 @@ module Polars
451
1559
  # def map
452
1560
  # end
453
1561
 
454
- # def interpolate
455
- # end
1562
+ # Interpolate intermediate values. The interpolation method is linear.
1563
+ #
1564
+ # @return [LazyFrame]
1565
+ #
1566
+ # @example
1567
+ # df = Polars::DataFrame.new(
1568
+ # {
1569
+ # "foo" => [1, nil, 9, 10],
1570
+ # "bar" => [6, 7, 9, nil],
1571
+ # "baz" => [1, nil, nil, 9]
1572
+ # }
1573
+ # ).lazy
1574
+ # df.interpolate.collect
1575
+ # # =>
1576
+ # # shape: (4, 3)
1577
+ # # ┌─────┬──────┬─────┐
1578
+ # # │ foo ┆ bar ┆ baz │
1579
+ # # │ --- ┆ --- ┆ --- │
1580
+ # # │ i64 ┆ i64 ┆ i64 │
1581
+ # # ╞═════╪══════╪═════╡
1582
+ # # │ 1 ┆ 6 ┆ 1 │
1583
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
1584
+ # # │ 5 ┆ 7 ┆ 3 │
1585
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
1586
+ # # │ 9 ┆ 9 ┆ 6 │
1587
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
1588
+ # # │ 10 ┆ null ┆ 9 │
1589
+ # # └─────┴──────┴─────┘
1590
+ def interpolate
1591
+ select(Utils.col("*").interpolate)
1592
+ end
456
1593
 
457
- # def unnest
458
- # end
1594
+ # Decompose a struct into its fields.
1595
+ #
1596
+ # The fields will be inserted into the `LazyFrame` at the location of the
1597
+ # `struct` type.
1598
+ #
1599
+ # @param names [Object]
1600
+ # Names of the struct columns that will be decomposed into their fields.
1601
+ #
1602
+ # @return [LazyFrame]
1603
+ #
1604
+ # @example
1605
+ # df = (
1606
+ # Polars::DataFrame.new(
1607
+ # {
1608
+ # "before" => ["foo", "bar"],
1609
+ # "t_a" => [1, 2],
1610
+ # "t_b" => ["a", "b"],
1611
+ # "t_c" => [true, nil],
1612
+ # "t_d" => [[1, 2], [3]],
1613
+ # "after" => ["baz", "womp"]
1614
+ # }
1615
+ # )
1616
+ # .lazy
1617
+ # .select(
1618
+ # ["before", Polars.struct(Polars.col("^t_.$")).alias("t_struct"), "after"]
1619
+ # )
1620
+ # )
1621
+ # df.fetch
1622
+ # # =>
1623
+ # # shape: (2, 3)
1624
+ # # ┌────────┬─────────────────────┬───────┐
1625
+ # # │ before ┆ t_struct ┆ after │
1626
+ # # │ --- ┆ --- ┆ --- │
1627
+ # # │ str ┆ struct[4] ┆ str │
1628
+ # # ╞════════╪═════════════════════╪═══════╡
1629
+ # # │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │
1630
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1631
+ # # │ bar ┆ {2,"b",null,[3]} ┆ womp │
1632
+ # # └────────┴─────────────────────┴───────┘
1633
+ #
1634
+ # @example
1635
+ # df.unnest("t_struct").fetch
1636
+ # # =>
1637
+ # # shape: (2, 6)
1638
+ # # ┌────────┬─────┬─────┬──────┬───────────┬───────┐
1639
+ # # │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │
1640
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1641
+ # # │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │
1642
+ # # ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
1643
+ # # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │
1644
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1645
+ # # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
1646
+ # # └────────┴─────┴─────┴──────┴───────────┴───────┘
1647
+ def unnest(names)
1648
+ if names.is_a?(String)
1649
+ names = [names]
1650
+ end
1651
+ _from_rbldf(_ldf.unnest(names))
1652
+ end
459
1653
 
460
1654
  private
461
1655