polars-df 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -155,12 +155,35 @@ module Polars
155
155
  end
156
156
 
157
157
  # @private
158
- def self._read_parquet(file)
158
+ def self._read_parquet(
159
+ file,
160
+ columns: nil,
161
+ n_rows: nil,
162
+ parallel: "auto",
163
+ row_count_name: nil,
164
+ row_count_offset: 0,
165
+ low_memory: false
166
+ )
159
167
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
160
168
  file = Utils.format_path(file)
161
169
  end
162
170
 
163
- _from_rbdf(RbDataFrame.read_parquet(file))
171
+ if file.is_a?(String) && file.include?("*")
172
+ raise Todo
173
+ end
174
+
175
+ projection, columns = Utils.handle_projection_columns(columns)
176
+ _from_rbdf(
177
+ RbDataFrame.read_parquet(
178
+ file,
179
+ columns,
180
+ projection,
181
+ n_rows,
182
+ parallel,
183
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
184
+ low_memory
185
+ )
186
+ )
164
187
  end
165
188
 
166
189
  # def self._read_avro
@@ -259,11 +282,13 @@ module Polars
259
282
  # @return [Array]
260
283
  #
261
284
  # @example
262
- # df = Polars::DataFrame.new({
263
- # "foo" => [1, 2, 3],
264
- # "bar" => [6, 7, 8],
265
- # "ham" => ["a", "b", "c"]
266
- # })
285
+ # df = Polars::DataFrame.new(
286
+ # {
287
+ # "foo" => [1, 2, 3],
288
+ # "bar" => [6, 7, 8],
289
+ # "ham" => ["a", "b", "c"]
290
+ # }
291
+ # )
267
292
  # df.columns
268
293
  # # => ["foo", "bar", "ham"]
269
294
  def columns
@@ -279,11 +304,13 @@ module Polars
279
304
  # @return [Object]
280
305
  #
281
306
  # @example
282
- # df = Polars::DataFrame.new({
283
- # "foo" => [1, 2, 3],
284
- # "bar" => [6, 7, 8],
285
- # "ham" => ["a", "b", "c"]
286
- # })
307
+ # df = Polars::DataFrame.new(
308
+ # {
309
+ # "foo" => [1, 2, 3],
310
+ # "bar" => [6, 7, 8],
311
+ # "ham" => ["a", "b", "c"]
312
+ # }
313
+ # )
287
314
  # df.columns = ["apple", "banana", "orange"]
288
315
  # df
289
316
  # # =>
@@ -308,11 +335,13 @@ module Polars
308
335
  # @return [Array]
309
336
  #
310
337
  # @example
311
- # df = Polars::DataFrame.new({
312
- # "foo" => [1, 2, 3],
313
- # "bar" => [6.0, 7.0, 8.0],
314
- # "ham" => ["a", "b", "c"]
315
- # })
338
+ # df = Polars::DataFrame.new(
339
+ # {
340
+ # "foo" => [1, 2, 3],
341
+ # "bar" => [6.0, 7.0, 8.0],
342
+ # "ham" => ["a", "b", "c"]
343
+ # }
344
+ # )
316
345
  # df.dtypes
317
346
  # # => [:i64, :f64, :str]
318
347
  def dtypes
@@ -324,56 +353,132 @@ module Polars
324
353
  # @return [Hash]
325
354
  #
326
355
  # @example
327
- # df = Polars::DataFrame.new({
328
- # "foo" => [1, 2, 3],
329
- # "bar" => [6.0, 7.0, 8.0],
330
- # "ham" => ["a", "b", "c"]
331
- # })
356
+ # df = Polars::DataFrame.new(
357
+ # {
358
+ # "foo" => [1, 2, 3],
359
+ # "bar" => [6.0, 7.0, 8.0],
360
+ # "ham" => ["a", "b", "c"]
361
+ # }
362
+ # )
332
363
  # df.schema
333
364
  # # => {"foo"=>:i64, "bar"=>:f64, "ham"=>:str}
334
365
  def schema
335
366
  columns.zip(dtypes).to_h
336
367
  end
337
368
 
338
- # def ==(other)
339
- # end
369
+ # Equal.
370
+ #
371
+ # @return [DataFrame]
372
+ def ==(other)
373
+ _comp(other, "eq")
374
+ end
340
375
 
341
- # def !=(other)
342
- # end
376
+ # Not equal.
377
+ #
378
+ # @return [DataFrame]
379
+ def !=(other)
380
+ _comp(other, "neq")
381
+ end
343
382
 
344
- # def >(other)
345
- # end
383
+ # Greater than.
384
+ #
385
+ # @return [DataFrame]
386
+ def >(other)
387
+ _comp(other, "gt")
388
+ end
346
389
 
347
- # def <(other)
348
- # end
390
+ # Less than.
391
+ #
392
+ # @return [DataFrame]
393
+ def <(other)
394
+ _comp(other, "lt")
395
+ end
349
396
 
350
- # def >=(other)
351
- # end
397
+ # Greater than or equal.
398
+ #
399
+ # @return [DataFrame]
400
+ def >=(other)
401
+ _comp(other, "gt_eq")
402
+ end
352
403
 
353
- # def <=(other)
354
- # end
404
+ # Less than or equal.
405
+ #
406
+ # @return [DataFrame]
407
+ def <=(other)
408
+ _comp(other, "lt_eq")
409
+ end
355
410
 
356
- # def *(other)
357
- # end
411
+ # Performs multiplication.
412
+ #
413
+ # @return [DataFrame]
414
+ def *(other)
415
+ if other.is_a?(DataFrame)
416
+ return _from_rbdf(_df.mul_df(other._df))
417
+ end
358
418
 
359
- # def /(other)
360
- # end
419
+ other = _prepare_other_arg(other)
420
+ _from_rbdf(_df.mul(other._s))
421
+ end
361
422
 
362
- # def +(other)
363
- # end
423
+ # Performs division.
424
+ #
425
+ # @return [DataFrame]
426
+ def /(other)
427
+ if other.is_a?(DataFrame)
428
+ return _from_rbdf(_df.div_df(other._df))
429
+ end
364
430
 
365
- # def -(other)
366
- # end
431
+ other = _prepare_other_arg(other)
432
+ _from_rbdf(_df.div(other._s))
433
+ end
367
434
 
368
- # def %(other)
369
- # end
435
+ # Performs addition.
436
+ #
437
+ # @return [DataFrame]
438
+ def +(other)
439
+ if other.is_a?(DataFrame)
440
+ return _from_rbdf(_df.add_df(other._df))
441
+ end
442
+
443
+ other = _prepare_other_arg(other)
444
+ _from_rbdf(_df.add(other._s))
445
+ end
446
+
447
+ # Performs subtraction.
448
+ #
449
+ # @return [DataFrame]
450
+ def -(other)
451
+ if other.is_a?(DataFrame)
452
+ return _from_rbdf(_df.sub_df(other._df))
453
+ end
454
+
455
+ other = _prepare_other_arg(other)
456
+ _from_rbdf(_df.sub(other._s))
457
+ end
458
+
459
+ # Returns the modulo.
460
+ #
461
+ # @return [DataFrame]
462
+ def %(other)
463
+ if other.is_a?(DataFrame)
464
+ return _from_rbdf(_df.rem_df(other._df))
465
+ end
466
+
467
+ other = _prepare_other_arg(other)
468
+ _from_rbdf(_df.rem(other._s))
469
+ end
370
470
 
471
+ # Returns a string representing the DataFrame.
371
472
  #
473
+ # @return [String]
372
474
  def to_s
373
475
  _df.to_s
374
476
  end
375
477
  alias_method :inspect, :to_s
376
478
 
479
+ # Check if DataFrame includes column.
480
+ #
481
+ # @return [Boolean]
377
482
  def include?(name)
378
483
  columns.include?(name)
379
484
  end
@@ -387,9 +492,78 @@ module Polars
387
492
  # def _pos_idxs
388
493
  # end
389
494
 
495
+ # Returns subset of the DataFrame.
390
496
  #
391
- def [](name)
392
- Utils.wrap_s(_df.column(name))
497
+ # @return [Object]
498
+ def [](*args)
499
+ if args.size == 2
500
+ row_selection, col_selection = args
501
+
502
+ # df[.., unknown]
503
+ if row_selection.is_a?(Range)
504
+
505
+ # multiple slices
506
+ # df[.., ..]
507
+ if col_selection.is_a?(Range)
508
+ raise Todo
509
+ end
510
+ end
511
+
512
+ # df[2, ..] (select row as df)
513
+ if row_selection.is_a?(Integer)
514
+ if col_selection.is_a?(Array)
515
+ df = self[0.., col_selection]
516
+ return df.slice(row_selection, 1)
517
+ end
518
+ # df[2, "a"]
519
+ if col_selection.is_a?(String)
520
+ return self[col_selection][row_selection]
521
+ end
522
+ end
523
+
524
+ # column selection can be "a" and ["a", "b"]
525
+ if col_selection.is_a?(String)
526
+ col_selection = [col_selection]
527
+ end
528
+
529
+ # df[.., 1]
530
+ if col_selection.is_a?(Integer)
531
+ series = to_series(col_selection)
532
+ return series[row_selection]
533
+ end
534
+
535
+ if col_selection.is_a?(Array)
536
+ # df[.., [1, 2]]
537
+ if is_int_sequence(col_selection)
538
+ series_list = col_selection.map { |i| to_series(i) }
539
+ df = self.class.new(series_list)
540
+ return df[row_selection]
541
+ end
542
+ end
543
+
544
+ df = self[col_selection]
545
+ return df[row_selection]
546
+ elsif args.size == 1
547
+ item = args[0]
548
+
549
+ # select single column
550
+ # df["foo"]
551
+ if item.is_a?(String)
552
+ return Utils.wrap_s(_df.column(item))
553
+ end
554
+
555
+ # df[idx]
556
+ if item.is_a?(Integer)
557
+ return slice(_pos_idx(item, dim: 0), 1)
558
+ end
559
+
560
+ # df[..]
561
+ if item.is_a?(Range)
562
+ return Slice.new(self).apply(item)
563
+ end
564
+ end
565
+
566
+ raise ArgumentError, "Cannot get item of type: #{item.class.name}"
393
567
  end
394
568
 
395
569
  # def []=(key, value)
@@ -397,7 +571,9 @@ module Polars
397
571
 
398
572
  # no to_arrow
399
573
 
574
+ # Convert DataFrame to a hash mapping column name to values.
400
575
  #
576
+ # @return [Hash]
401
577
  def to_h(as_series: true)
402
578
  if as_series
403
579
  get_columns.to_h { |s| [s.name, s] }
@@ -422,11 +598,13 @@ module Polars
422
598
  # @return [Series]
423
599
  #
424
600
  # @example
425
- # df = Polars::DataFrame.new({
426
- # "foo" => [1, 2, 3],
427
- # "bar" => [6, 7, 8],
428
- # "ham" => ["a", "b", "c"]
429
- # })
601
+ # df = Polars::DataFrame.new(
602
+ # {
603
+ # "foo" => [1, 2, 3],
604
+ # "bar" => [6, 7, 8],
605
+ # "ham" => ["a", "b", "c"]
606
+ # }
607
+ # )
430
608
  # df.to_series(1)
431
609
  # # =>
432
610
  # # shape: (3,)
@@ -519,11 +697,13 @@ module Polars
519
697
  # @return [String, nil]
520
698
  #
521
699
  # @example
522
- # df = Polars::DataFrame.new({
523
- # "foo" => [1, 2, 3, 4, 5],
524
- # "bar" => [6, 7, 8, 9, 10],
525
- # "ham" => ["a", "b", "c", "d", "e"]
526
- # })
700
+ # df = Polars::DataFrame.new(
701
+ # {
702
+ # "foo" => [1, 2, 3, 4, 5],
703
+ # "bar" => [6, 7, 8, 9, 10],
704
+ # "ham" => ["a", "b", "c", "d", "e"]
705
+ # }
706
+ # )
527
707
  # df.write_csv("file.csv")
528
708
  def write_csv(
529
709
  file = nil,
@@ -694,10 +874,12 @@ module Polars
694
874
  # @return [DataFrame]
695
875
  #
696
876
  # @example
697
- # df = Polars::DataFrame.new({
698
- # "key" => ["a", "b", "c"],
699
- # "val" => [1, 2, 3]
700
- # })
877
+ # df = Polars::DataFrame.new(
878
+ # {
879
+ # "key" => ["a", "b", "c"],
880
+ # "val" => [1, 2, 3]
881
+ # }
882
+ # )
701
883
  # df.reverse()
702
884
  # # =>
703
885
  # # shape: (3, 2)
@@ -724,11 +906,13 @@ module Polars
724
906
  # @return [DataFrame]
725
907
  #
726
908
  # @example
727
- # df = Polars::DataFrame.new({
728
- # "foo" => [1, 2, 3],
729
- # "bar" => [6, 7, 8],
730
- # "ham" => ["a", "b", "c"]
731
- # })
909
+ # df = Polars::DataFrame.new(
910
+ # {
911
+ # "foo" => [1, 2, 3],
912
+ # "bar" => [6, 7, 8],
913
+ # "ham" => ["a", "b", "c"]
914
+ # }
915
+ # )
732
916
  # df.rename({"foo" => "apple"})
733
917
  # # =>
734
918
  # # shape: (3, 3)
@@ -775,11 +959,13 @@ module Polars
775
959
  # # └─────┴─────┴─────┘
776
960
  #
777
961
  # @example
778
- # df = Polars::DataFrame.new({
779
- # "a" => [1, 2, 3, 4],
780
- # "b" => [0.5, 4, 10, 13],
781
- # "c" => [true, true, false, true]
782
- # })
962
+ # df = Polars::DataFrame.new(
963
+ # {
964
+ # "a" => [1, 2, 3, 4],
965
+ # "b" => [0.5, 4, 10, 13],
966
+ # "c" => [true, true, false, true]
967
+ # }
968
+ # )
783
969
  # s = Polars::Series.new("d", [-2.5, 15, 20.5, 0])
784
970
  # df.insert_at_idx(3, s)
785
971
  # # =>
@@ -805,63 +991,560 @@ module Polars
805
991
  self
806
992
  end
807
993
 
994
+ # Filter the rows in the DataFrame based on a predicate expression.
995
+ #
996
+ # @param predicate [Expr]
997
+ # Expression that evaluates to a boolean Series.
998
+ #
999
+ # @return [DataFrame]
1000
+ #
1001
+ # @example Filter on one condition:
1002
+ # df = Polars::DataFrame.new(
1003
+ # {
1004
+ # "foo" => [1, 2, 3],
1005
+ # "bar" => [6, 7, 8],
1006
+ # "ham" => ["a", "b", "c"]
1007
+ # }
1008
+ # )
1009
+ # df.filter(Polars.col("foo") < 3)
1010
+ # # =>
1011
+ # # shape: (2, 3)
1012
+ # # ┌─────┬─────┬─────┐
1013
+ # # │ foo ┆ bar ┆ ham │
1014
+ # # │ --- ┆ --- ┆ --- │
1015
+ # # │ i64 ┆ i64 ┆ str │
1016
+ # # ╞═════╪═════╪═════╡
1017
+ # # │ 1 ┆ 6 ┆ a │
1018
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1019
+ # # │ 2 ┆ 7 ┆ b │
1020
+ # # └─────┴─────┴─────┘
1021
+ #
1022
+ # @example Filter on multiple conditions:
1023
+ # df.filter((Polars.col("foo") < 3) & (Polars.col("ham") == "a"))
1024
+ # # =>
1025
+ # # shape: (1, 3)
1026
+ # # ┌─────┬─────┬─────┐
1027
+ # # │ foo ┆ bar ┆ ham │
1028
+ # # │ --- ┆ --- ┆ --- │
1029
+ # # │ i64 ┆ i64 ┆ str │
1030
+ # # ╞═════╪═════╪═════╡
1031
+ # # │ 1 ┆ 6 ┆ a │
1032
+ # # └─────┴─────┴─────┘
808
1033
  def filter(predicate)
809
1034
  lazy.filter(predicate).collect
810
1035
  end
811
1036
 
812
- # def describe
813
- # end
1037
+ # Summary statistics for a DataFrame.
1038
+ #
1039
+ # @return [DataFrame]
1040
+ #
1041
+ # @example
1042
+ # df = Polars::DataFrame.new(
1043
+ # {
1044
+ # "a" => [1.0, 2.8, 3.0],
1045
+ # "b" => [4, 5, nil],
1046
+ # "c" => [true, false, true],
1047
+ # "d" => [nil, "b", "c"],
1048
+ # "e" => ["usd", "eur", nil]
1049
+ # }
1050
+ # )
1051
+ # df.describe
1052
+ # # =>
1053
+ # # shape: (7, 6)
1054
+ # # ┌────────────┬──────────┬──────────┬──────┬──────┬──────┐
1055
+ # # │ describe ┆ a ┆ b ┆ c ┆ d ┆ e │
1056
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1057
+ # # │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str │
1058
+ # # ╞════════════╪══════════╪══════════╪══════╪══════╪══════╡
1059
+ # # │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 │
1060
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1061
+ # # │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 │
1062
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1063
+ # # │ mean ┆ 2.266667 ┆ 4.5 ┆ null ┆ null ┆ null │
1064
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1065
+ # # │ std ┆ 1.101514 ┆ 0.707107 ┆ null ┆ null ┆ null │
1066
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1067
+ # # │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur │
1068
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1069
+ # # │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd │
1070
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1071
+ # # │ median ┆ 2.8 ┆ 4.5 ┆ null ┆ null ┆ null │
1072
+ # # └────────────┴──────────┴──────────┴──────┴──────┴──────┘
1073
+ def describe
1074
+ describe_cast = lambda do |stat|
1075
+ columns = []
1076
+ self.columns.each_with_index do |s, i|
1077
+ if self[s].is_numeric || self[s].is_boolean
1078
+ columns << stat[0.., i].cast(:f64)
1079
+ else
1080
+ # for dates, strings, etc, we cast to string so that all
1081
+ # statistics can be shown
1082
+ columns << stat[0.., i].cast(:str)
1083
+ end
1084
+ end
1085
+ self.class.new(columns)
1086
+ end
814
1087
 
815
- # def find_idx_by_name
816
- # end
1088
+ summary = _from_rbdf(
1089
+ Polars.concat(
1090
+ [
1091
+ describe_cast.(
1092
+ self.class.new(columns.to_h { |c| [c, [height]] })
1093
+ ),
1094
+ describe_cast.(null_count),
1095
+ describe_cast.(mean),
1096
+ describe_cast.(std),
1097
+ describe_cast.(min),
1098
+ describe_cast.(max),
1099
+ describe_cast.(median)
1100
+ ]
1101
+ )._df
1102
+ )
1103
+ summary.insert_at_idx(
1104
+ 0,
1105
+ Polars::Series.new(
1106
+ "describe",
1107
+ ["count", "null_count", "mean", "std", "min", "max", "median"],
1108
+ )
1109
+ )
1110
+ summary
1111
+ end
817
1112
 
818
- # def replace_at_idx
819
- # end
1113
+ # Find the index of a column by name.
1114
+ #
1115
+ # @param name [String]
1116
+ # Name of the column to find.
1117
+ #
1118
+ # @return [Integer]
1119
+ #
1120
+ # @example
1121
+ # df = Polars::DataFrame.new(
1122
+ # {"foo" => [1, 2, 3], "bar" => [6, 7, 8], "ham" => ["a", "b", "c"]}
1123
+ # )
1124
+ # df.find_idx_by_name("ham")
1125
+ # # => 2
1126
+ def find_idx_by_name(name)
1127
+ _df.find_idx_by_name(name)
1128
+ end
1129
+
1130
+ # Replace a column at an index location.
1131
+ #
1132
+ # @param index [Integer]
1133
+ # Column index.
1134
+ # @param series [Series]
1135
+ # Series that will replace the column.
1136
+ #
1137
+ # @return [DataFrame]
1138
+ #
1139
+ # @example
1140
+ # df = Polars::DataFrame.new(
1141
+ # {
1142
+ # "foo" => [1, 2, 3],
1143
+ # "bar" => [6, 7, 8],
1144
+ # "ham" => ["a", "b", "c"]
1145
+ # }
1146
+ # )
1147
+ # s = Polars::Series.new("apple", [10, 20, 30])
1148
+ # df.replace_at_idx(0, s)
1149
+ # # =>
1150
+ # # shape: (3, 3)
1151
+ # # ┌───────┬─────┬─────┐
1152
+ # # │ apple ┆ bar ┆ ham │
1153
+ # # │ --- ┆ --- ┆ --- │
1154
+ # # │ i64 ┆ i64 ┆ str │
1155
+ # # ╞═══════╪═════╪═════╡
1156
+ # # │ 10 ┆ 6 ┆ a │
1157
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1158
+ # # │ 20 ┆ 7 ┆ b │
1159
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1160
+ # # │ 30 ┆ 8 ┆ c │
1161
+ # # └───────┴─────┴─────┘
1162
+ def replace_at_idx(index, series)
1163
+ if index < 0
1164
+ index = columns.length + index
1165
+ end
1166
+ _df.replace_at_idx(index, series._s)
1167
+ self
1168
+ end
820
1169
 
1170
+ # Sort the DataFrame by column.
1171
+ #
1172
+ # @param by [String, Expr, Array]
1173
+ # By which column to sort.
1174
+ # @param reverse [Boolean]
1175
+ # Reverse/descending sort.
1176
+ # @param nulls_last [Boolean]
1177
+ # Place null values last. Can only be used if sorted by a single column.
1178
+ #
1179
+ # @return [DataFrame]
1180
+ #
1181
+ # @example
1182
+ # df = Polars::DataFrame.new(
1183
+ # {
1184
+ # "foo" => [1, 2, 3],
1185
+ # "bar" => [6.0, 7.0, 8.0],
1186
+ # "ham" => ["a", "b", "c"]
1187
+ # }
1188
+ # )
1189
+ # df.sort("foo", reverse: true)
1190
+ # # =>
1191
+ # # shape: (3, 3)
1192
+ # # ┌─────┬─────┬─────┐
1193
+ # # │ foo ┆ bar ┆ ham │
1194
+ # # │ --- ┆ --- ┆ --- │
1195
+ # # │ i64 ┆ f64 ┆ str │
1196
+ # # ╞═════╪═════╪═════╡
1197
+ # # │ 3 ┆ 8.0 ┆ c │
1198
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1199
+ # # │ 2 ┆ 7.0 ┆ b │
1200
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1201
+ # # │ 1 ┆ 6.0 ┆ a │
1202
+ # # └─────┴─────┴─────┘
821
1203
  #
1204
+ # @example Sort by multiple columns.
1205
+ # df.sort(
1206
+ # [Polars.col("foo"), Polars.col("bar")**2],
1207
+ # reverse: [true, false]
1208
+ # )
1209
+ # # =>
1210
+ # # shape: (3, 3)
1211
+ # # ┌─────┬─────┬─────┐
1212
+ # # │ foo ┆ bar ┆ ham │
1213
+ # # │ --- ┆ --- ┆ --- │
1214
+ # # │ i64 ┆ f64 ┆ str │
1215
+ # # ╞═════╪═════╪═════╡
1216
+ # # │ 3 ┆ 8.0 ┆ c │
1217
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1218
+ # # │ 2 ┆ 7.0 ┆ b │
1219
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1220
+ # # │ 1 ┆ 6.0 ┆ a │
1221
+ # # └─────┴─────┴─────┘
822
1222
  def sort(by, reverse: false, nulls_last: false)
823
- _from_rbdf(_df.sort(by, reverse, nulls_last))
1223
+ if by.is_a?(Array) || by.is_a?(Expr)
1224
+ lazy
1225
+ .sort(by, reverse: reverse, nulls_last: nulls_last)
1226
+ .collect(no_optimization: true, string_cache: false)
1227
+ else
1228
+ _from_rbdf(_df.sort(by, reverse, nulls_last))
1229
+ end
824
1230
  end
825
1231
 
1232
+ # Check if DataFrame is equal to other.
1233
+ #
1234
+ # @param other [DataFrame]
1235
+ # DataFrame to compare with.
1236
+ # @param null_equal [Boolean]
1237
+ # Consider null values as equal.
1238
+ #
1239
+ # @return [Boolean]
1240
+ #
1241
+ # @example
1242
+ # df1 = Polars::DataFrame.new(
1243
+ # {
1244
+ # "foo" => [1, 2, 3],
1245
+ # "bar" => [6.0, 7.0, 8.0],
1246
+ # "ham" => ["a", "b", "c"]
1247
+ # }
1248
+ # )
1249
+ # df2 = Polars::DataFrame.new(
1250
+ # {
1251
+ # "foo" => [3, 2, 1],
1252
+ # "bar" => [8.0, 7.0, 6.0],
1253
+ # "ham" => ["c", "b", "a"]
1254
+ # }
1255
+ # )
1256
+ # df1.frame_equal(df1)
1257
+ # # => true
1258
+ # df1.frame_equal(df2)
1259
+ # # => false
826
1260
  def frame_equal(other, null_equal: true)
827
1261
  _df.frame_equal(other._df, null_equal)
828
1262
  end
829
1263
 
830
- # def replace
831
- # end
832
-
1264
+ # Replace a column by a new Series.
833
1265
  #
834
- def slice(offset, length = nil)
835
- if !length.nil? && length < 0
836
- length = height - offset + length
837
- end
838
- _from_rbdf(_df.slice(offset, length))
1266
+ # @param column [String]
1267
+ # Column to replace.
1268
+ # @param new_col [Series]
1269
+ # New column to insert.
1270
+ #
1271
+ # @return [DataFrame]
1272
+ #
1273
+ # @example
1274
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
1275
+ # s = Polars::Series.new([10, 20, 30])
1276
+ # df.replace("foo", s)
1277
+ # # =>
1278
+ # # shape: (3, 2)
1279
+ # # ┌─────┬─────┐
1280
+ # # │ foo ┆ bar │
1281
+ # # │ --- ┆ --- │
1282
+ # # │ i64 ┆ i64 │
1283
+ # # ╞═════╪═════╡
1284
+ # # │ 10 ┆ 4 │
1285
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1286
+ # # │ 20 ┆ 5 │
1287
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1288
+ # # │ 30 ┆ 6 │
1289
+ # # └─────┴─────┘
1290
+ def replace(column, new_col)
1291
+ _df.replace(column, new_col._s)
1292
+ self
839
1293
  end
840
1294
 
1295
+ # Get a slice of this DataFrame.
1296
+ #
1297
+ # @param offset [Integer]
1298
+ # Start index. Negative indexing is supported.
1299
+ # @param length [Integer, nil]
1300
+ # Length of the slice. If set to `nil`, all rows starting at the offset
1301
+ # will be selected.
1302
+ #
1303
+ # @return [DataFrame]
1304
+ #
1305
+ # @example
1306
+ # df = Polars::DataFrame.new(
1307
+ # {
1308
+ # "foo" => [1, 2, 3],
1309
+ # "bar" => [6.0, 7.0, 8.0],
1310
+ # "ham" => ["a", "b", "c"]
1311
+ # }
1312
+ # )
1313
+ # df.slice(1, 2)
1314
+ # # =>
1315
+ # # shape: (2, 3)
1316
+ # # ┌─────┬─────┬─────┐
1317
+ # # │ foo ┆ bar ┆ ham │
1318
+ # # │ --- ┆ --- ┆ --- │
1319
+ # # │ i64 ┆ f64 ┆ str │
1320
+ # # ╞═════╪═════╪═════╡
1321
+ # # │ 2 ┆ 7.0 ┆ b │
1322
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1323
+ # # │ 3 ┆ 8.0 ┆ c │
1324
+ # # └─────┴─────┴─────┘
1325
+ def slice(offset, length = nil)
1326
+ if !length.nil? && length < 0
1327
+ length = height - offset + length
1328
+ end
1329
+ _from_rbdf(_df.slice(offset, length))
1330
+ end
1331
+
1332
+ # Get the first `n` rows.
1333
+ #
1334
+ # Alias for {#head}.
1335
+ #
1336
+ # @param n [Integer]
1337
+ # Number of rows to return.
1338
+ #
1339
+ # @return [DataFrame]
1340
+ #
1341
+ # @example
1342
+ # df = Polars::DataFrame.new(
1343
+ # {"foo" => [1, 2, 3, 4, 5, 6], "bar" => ["a", "b", "c", "d", "e", "f"]}
1344
+ # )
1345
+ # df.limit(4)
1346
+ # # =>
1347
+ # # shape: (4, 2)
1348
+ # # ┌─────┬─────┐
1349
+ # # │ foo ┆ bar │
1350
+ # # │ --- ┆ --- │
1351
+ # # │ i64 ┆ str │
1352
+ # # ╞═════╪═════╡
1353
+ # # │ 1 ┆ a │
1354
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1355
+ # # │ 2 ┆ b │
1356
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1357
+ # # │ 3 ┆ c │
1358
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1359
+ # # │ 4 ┆ d │
1360
+ # # └─────┴─────┘
841
1361
  def limit(n = 5)
842
1362
  head(n)
843
1363
  end
844
1364
 
1365
+ # Get the first `n` rows.
1366
+ #
1367
+ # @param n [Integer]
1368
+ # Number of rows to return.
1369
+ #
1370
+ # @return [DataFrame]
1371
+ #
1372
+ # @example
1373
+ # df = Polars::DataFrame.new(
1374
+ # {
1375
+ # "foo" => [1, 2, 3, 4, 5],
1376
+ # "bar" => [6, 7, 8, 9, 10],
1377
+ # "ham" => ["a", "b", "c", "d", "e"]
1378
+ # }
1379
+ # )
1380
+ # df.head(3)
1381
+ # # =>
1382
+ # # shape: (3, 3)
1383
+ # # ┌─────┬─────┬─────┐
1384
+ # # │ foo ┆ bar ┆ ham │
1385
+ # # │ --- ┆ --- ┆ --- │
1386
+ # # │ i64 ┆ i64 ┆ str │
1387
+ # # ╞═════╪═════╪═════╡
1388
+ # # │ 1 ┆ 6 ┆ a │
1389
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1390
+ # # │ 2 ┆ 7 ┆ b │
1391
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1392
+ # # │ 3 ┆ 8 ┆ c │
1393
+ # # └─────┴─────┴─────┘
845
1394
  def head(n = 5)
846
1395
  _from_rbdf(_df.head(n))
847
1396
  end
848
1397
 
1398
+ # Get the last `n` rows.
1399
+ #
1400
+ # @param n [Integer]
1401
+ # Number of rows to return.
1402
+ #
1403
+ # @return [DataFrame]
1404
+ #
1405
+ # @example
1406
+ # df = Polars::DataFrame.new(
1407
+ # {
1408
+ # "foo" => [1, 2, 3, 4, 5],
1409
+ # "bar" => [6, 7, 8, 9, 10],
1410
+ # "ham" => ["a", "b", "c", "d", "e"]
1411
+ # }
1412
+ # )
1413
+ # df.tail(3)
1414
+ # # =>
1415
+ # # shape: (3, 3)
1416
+ # # ┌─────┬─────┬─────┐
1417
+ # # │ foo ┆ bar ┆ ham │
1418
+ # # │ --- ┆ --- ┆ --- │
1419
+ # # │ i64 ┆ i64 ┆ str │
1420
+ # # ╞═════╪═════╪═════╡
1421
+ # # │ 3 ┆ 8 ┆ c │
1422
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1423
+ # # │ 4 ┆ 9 ┆ d │
1424
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1425
+ # # │ 5 ┆ 10 ┆ e │
1426
+ # # └─────┴─────┴─────┘
849
1427
  def tail(n = 5)
850
1428
  _from_rbdf(_df.tail(n))
851
1429
  end
852
1430
 
853
- # def drop_nulls
854
- # end
1431
+ # Return a new DataFrame where the null values are dropped.
1432
+ #
1433
+ # @param subset [Object]
1434
+ # Subset of column(s) on which `drop_nulls` will be applied.
1435
+ #
1436
+ # @return [DataFrame]
1437
+ #
1438
+ # @example
1439
+ # df = Polars::DataFrame.new(
1440
+ # {
1441
+ # "foo" => [1, 2, 3],
1442
+ # "bar" => [6, nil, 8],
1443
+ # "ham" => ["a", "b", "c"]
1444
+ # }
1445
+ # )
1446
+ # df.drop_nulls
1447
+ # # =>
1448
+ # # shape: (2, 3)
1449
+ # # ┌─────┬─────┬─────┐
1450
+ # # │ foo ┆ bar ┆ ham │
1451
+ # # │ --- ┆ --- ┆ --- │
1452
+ # # │ i64 ┆ i64 ┆ str │
1453
+ # # ╞═════╪═════╪═════╡
1454
+ # # │ 1 ┆ 6 ┆ a │
1455
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1456
+ # # │ 3 ┆ 8 ┆ c │
1457
+ # # └─────┴─────┴─────┘
1458
+ def drop_nulls(subset: nil)
1459
+ if subset.is_a?(String)
1460
+ subset = [subset]
1461
+ end
1462
+ _from_rbdf(_df.drop_nulls(subset))
1463
+ end
855
1464
 
856
1465
  # def pipe
857
1466
  # end
858
1467
 
859
- # def with_row_count
860
- # end
861
-
1468
+ # Add a column at index 0 that counts the rows.
1469
+ #
1470
+ # @param name [String]
1471
+ # Name of the column to add.
1472
+ # @param offset [Integer]
1473
+ # Start the row count at this offset.
1474
+ #
1475
+ # @return [DataFrame]
1476
+ #
1477
+ # @example
1478
+ # df = Polars::DataFrame.new(
1479
+ # {
1480
+ # "a" => [1, 3, 5],
1481
+ # "b" => [2, 4, 6]
1482
+ # }
1483
+ # )
1484
+ # df.with_row_count
1485
+ # # =>
1486
+ # # shape: (3, 3)
1487
+ # # ┌────────┬─────┬─────┐
1488
+ # # │ row_nr ┆ a ┆ b │
1489
+ # # │ --- ┆ --- ┆ --- │
1490
+ # # │ u32 ┆ i64 ┆ i64 │
1491
+ # # ╞════════╪═════╪═════╡
1492
+ # # │ 0 ┆ 1 ┆ 2 │
1493
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1494
+ # # │ 1 ┆ 3 ┆ 4 │
1495
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1496
+ # # │ 2 ┆ 5 ┆ 6 │
1497
+ # # └────────┴─────┴─────┘
1498
+ def with_row_count(name: "row_nr", offset: 0)
1499
+ _from_rbdf(_df.with_row_count(name, offset))
1500
+ end
1501
+
1502
+ # Start a groupby operation.
862
1503
  #
1504
+ # @param by [Object]
1505
+ # Column(s) to group by.
1506
+ # @param maintain_order [Boolean]
1507
+ # Make sure that the order of the groups remains consistent. This is more
1508
+ # expensive than a default groupby. Note that this only works in expression
1509
+ # aggregations.
1510
+ #
1511
+ # @return [GroupBy]
1512
+ #
1513
+ # @example
1514
+ # df = Polars::DataFrame.new(
1515
+ # {
1516
+ # "a" => ["a", "b", "a", "b", "b", "c"],
1517
+ # "b" => [1, 2, 3, 4, 5, 6],
1518
+ # "c" => [6, 5, 4, 3, 2, 1]
1519
+ # }
1520
+ # )
1521
+ # df.groupby("a").agg(Polars.col("b").sum).sort("a")
1522
+ # # =>
1523
+ # # shape: (3, 2)
1524
+ # # ┌─────┬─────┐
1525
+ # # │ a ┆ b │
1526
+ # # │ --- ┆ --- │
1527
+ # # │ str ┆ i64 │
1528
+ # # ╞═════╪═════╡
1529
+ # # │ a ┆ 4 │
1530
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1531
+ # # │ b ┆ 11 │
1532
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1533
+ # # │ c ┆ 6 │
1534
+ # # └─────┴─────┘
863
1535
  def groupby(by, maintain_order: false)
864
- lazy.groupby(by, maintain_order: maintain_order)
1536
+ if !Utils.bool?(maintain_order)
1537
+ raise TypeError, "invalid input for groupby arg `maintain_order`: #{maintain_order}."
1538
+ end
1539
+ if by.is_a?(String)
1540
+ by = [by]
1541
+ end
1542
+ GroupBy.new(
1543
+ _df,
1544
+ by,
1545
+ self.class,
1546
+ maintain_order: maintain_order
1547
+ )
865
1548
  end
866
1549
 
867
1550
  # def groupby_rolling
@@ -876,7 +1559,109 @@ module Polars
876
1559
  # def join_asof
877
1560
  # end
878
1561
 
1562
+ # Join in SQL-like fashion.
1563
+ #
1564
+ # @param other [DataFrame]
1565
+ # DataFrame to join with.
1566
+ # @param left_on [Object]
1567
+ # Name(s) of the left join column(s).
1568
+ # @param right_on [Object]
1569
+ # Name(s) of the right join column(s).
1570
+ # @param on [Object]
1571
+ # Name(s) of the join columns in both DataFrames.
1572
+ # @param how ["inner", "left", "outer", "semi", "anti", "cross"]
1573
+ # Join strategy.
1574
+ # @param suffix [String]
1575
+ # Suffix to append to columns with a duplicate name.
1576
+ #
1577
+ # @return [DataFrame]
1578
+ #
1579
+ # @example
1580
+ # df = Polars::DataFrame.new(
1581
+ # {
1582
+ # "foo" => [1, 2, 3],
1583
+ # "bar" => [6.0, 7.0, 8.0],
1584
+ # "ham" => ["a", "b", "c"]
1585
+ # }
1586
+ # )
1587
+ # other_df = Polars::DataFrame.new(
1588
+ # {
1589
+ # "apple" => ["x", "y", "z"],
1590
+ # "ham" => ["a", "b", "d"]
1591
+ # }
1592
+ # )
1593
+ # df.join(other_df, on: "ham")
1594
+ # # =>
1595
+ # # shape: (2, 4)
1596
+ # # ┌─────┬─────┬─────┬───────┐
1597
+ # # │ foo ┆ bar ┆ ham ┆ apple │
1598
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1599
+ # # │ i64 ┆ f64 ┆ str ┆ str │
1600
+ # # ╞═════╪═════╪═════╪═══════╡
1601
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
1602
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1603
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
1604
+ # # └─────┴─────┴─────┴───────┘
1605
+ #
1606
+ # @example
1607
+ # df.join(other_df, on: "ham", how: "outer")
1608
+ # # =>
1609
+ # # shape: (4, 4)
1610
+ # # ┌──────┬──────┬─────┬───────┐
1611
+ # # │ foo ┆ bar ┆ ham ┆ apple │
1612
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1613
+ # # │ i64 ┆ f64 ┆ str ┆ str │
1614
+ # # ╞══════╪══════╪═════╪═══════╡
1615
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
1616
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1617
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
1618
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1619
+ # # │ null ┆ null ┆ d ┆ z │
1620
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1621
+ # # │ 3 ┆ 8.0 ┆ c ┆ null │
1622
+ # # └──────┴──────┴─────┴───────┘
1623
+ #
1624
+ # @example
1625
+ # df.join(other_df, on: "ham", how: "left")
1626
+ # # =>
1627
+ # # shape: (3, 4)
1628
+ # # ┌─────┬─────┬─────┬───────┐
1629
+ # # │ foo ┆ bar ┆ ham ┆ apple │
1630
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1631
+ # # │ i64 ┆ f64 ┆ str ┆ str │
1632
+ # # ╞═════╪═════╪═════╪═══════╡
1633
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
1634
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1635
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
1636
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1637
+ # # │ 3 ┆ 8.0 ┆ c ┆ null │
1638
+ # # └─────┴─────┴─────┴───────┘
879
1639
  #
1640
+ # @example
1641
+ # df.join(other_df, on: "ham", how: "semi")
1642
+ # # =>
1643
+ # # shape: (2, 3)
1644
+ # # ┌─────┬─────┬─────┐
1645
+ # # │ foo ┆ bar ┆ ham │
1646
+ # # │ --- ┆ --- ┆ --- │
1647
+ # # │ i64 ┆ f64 ┆ str │
1648
+ # # ╞═════╪═════╪═════╡
1649
+ # # │ 1 ┆ 6.0 ┆ a │
1650
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1651
+ # # │ 2 ┆ 7.0 ┆ b │
1652
+ # # └─────┴─────┴─────┘
1653
+ #
1654
+ # @example
1655
+ # df.join(other_df, on: "ham", how: "anti")
1656
+ # # =>
1657
+ # # shape: (1, 3)
1658
+ # # ┌─────┬─────┬─────┐
1659
+ # # │ foo ┆ bar ┆ ham │
1660
+ # # │ --- ┆ --- ┆ --- │
1661
+ # # │ i64 ┆ f64 ┆ str │
1662
+ # # ╞═════╪═════╪═════╡
1663
+ # # │ 3 ┆ 8.0 ┆ c │
1664
+ # # └─────┴─────┴─────┘
880
1665
  def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
881
1666
  lazy
882
1667
  .join(
@@ -893,41 +1678,322 @@ module Polars
893
1678
  # def apply
894
1679
  # end
895
1680
 
1681
+ # Return a new DataFrame with the column added or replaced.
1682
+ #
1683
+ # @param column [Object]
1684
+ # Series, where the name of the Series refers to the column in the DataFrame.
1685
+ #
1686
+ # @return [DataFrame]
896
1687
  #
1688
+ # @example Added
1689
+ # df = Polars::DataFrame.new(
1690
+ # {
1691
+ # "a" => [1, 3, 5],
1692
+ # "b" => [2, 4, 6]
1693
+ # }
1694
+ # )
1695
+ # df.with_column((Polars.col("b") ** 2).alias("b_squared"))
1696
+ # # =>
1697
+ # # shape: (3, 3)
1698
+ # # ┌─────┬─────┬───────────┐
1699
+ # # │ a ┆ b ┆ b_squared │
1700
+ # # │ --- ┆ --- ┆ --- │
1701
+ # # │ i64 ┆ i64 ┆ f64 │
1702
+ # # ╞═════╪═════╪═══════════╡
1703
+ # # │ 1 ┆ 2 ┆ 4.0 │
1704
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
1705
+ # # │ 3 ┆ 4 ┆ 16.0 │
1706
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
1707
+ # # │ 5 ┆ 6 ┆ 36.0 │
1708
+ # # └─────┴─────┴───────────┘
1709
+ #
1710
+ # @example Replaced
1711
+ # df.with_column(Polars.col("a") ** 2)
1712
+ # # =>
1713
+ # # shape: (3, 2)
1714
+ # # ┌──────┬─────┐
1715
+ # # │ a ┆ b │
1716
+ # # │ --- ┆ --- │
1717
+ # # │ f64 ┆ i64 │
1718
+ # # ╞══════╪═════╡
1719
+ # # │ 1.0 ┆ 2 │
1720
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
1721
+ # # │ 9.0 ┆ 4 │
1722
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
1723
+ # # │ 25.0 ┆ 6 │
1724
+ # # └──────┴─────┘
897
1725
  def with_column(column)
898
1726
  lazy
899
1727
  .with_column(column)
900
1728
  .collect(no_optimization: true, string_cache: false)
901
1729
  end
902
1730
 
903
- # def hstack
904
- # end
1731
+ # Return a new DataFrame grown horizontally by stacking multiple Series to it.
1732
+ #
1733
+ # @param columns [Object]
1734
+ # Series to stack.
1735
+ # @param in_place [Boolean]
1736
+ # Modify in place.
1737
+ #
1738
+ # @return [DataFrame]
1739
+ #
1740
+ # @example
1741
+ # df = Polars::DataFrame.new(
1742
+ # {
1743
+ # "foo" => [1, 2, 3],
1744
+ # "bar" => [6, 7, 8],
1745
+ # "ham" => ["a", "b", "c"]
1746
+ # }
1747
+ # )
1748
+ # x = Polars::Series.new("apple", [10, 20, 30])
1749
+ # df.hstack([x])
1750
+ # # =>
1751
+ # # shape: (3, 4)
1752
+ # # ┌─────┬─────┬─────┬───────┐
1753
+ # # │ foo ┆ bar ┆ ham ┆ apple │
1754
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1755
+ # # │ i64 ┆ i64 ┆ str ┆ i64 │
1756
+ # # ╞═════╪═════╪═════╪═══════╡
1757
+ # # │ 1 ┆ 6 ┆ a ┆ 10 │
1758
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1759
+ # # │ 2 ┆ 7 ┆ b ┆ 20 │
1760
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1761
+ # # │ 3 ┆ 8 ┆ c ┆ 30 │
1762
+ # # └─────┴─────┴─────┴───────┘
1763
+ def hstack(columns, in_place: false)
1764
+ if !columns.is_a?(Array)
1765
+ columns = columns.get_columns
1766
+ end
1767
+ if in_place
1768
+ _df.hstack_mut(columns.map(&:_s))
1769
+ self
1770
+ else
1771
+ _from_rbdf(_df.hstack(columns.map(&:_s)))
1772
+ end
1773
+ end
905
1774
 
906
- # def vstack
907
- # end
1775
+ # Grow this DataFrame vertically by stacking a DataFrame to it.
1776
+ #
1777
+ # @param df [DataFrame]
1778
+ # DataFrame to stack.
1779
+ # @param in_place [Boolean]
1780
+ # Modify in place.
1781
+ #
1782
+ # @return [DataFrame]
1783
+ #
1784
+ # @example
1785
+ # df1 = Polars::DataFrame.new(
1786
+ # {
1787
+ # "foo" => [1, 2],
1788
+ # "bar" => [6, 7],
1789
+ # "ham" => ["a", "b"]
1790
+ # }
1791
+ # )
1792
+ # df2 = Polars::DataFrame.new(
1793
+ # {
1794
+ # "foo" => [3, 4],
1795
+ # "bar" => [8, 9],
1796
+ # "ham" => ["c", "d"]
1797
+ # }
1798
+ # )
1799
+ # df1.vstack(df2)
1800
+ # # =>
1801
+ # # shape: (4, 3)
1802
+ # # ┌─────┬─────┬─────┐
1803
+ # # │ foo ┆ bar ┆ ham │
1804
+ # # │ --- ┆ --- ┆ --- │
1805
+ # # │ i64 ┆ i64 ┆ str │
1806
+ # # ╞═════╪═════╪═════╡
1807
+ # # │ 1 ┆ 6 ┆ a │
1808
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1809
+ # # │ 2 ┆ 7 ┆ b │
1810
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1811
+ # # │ 3 ┆ 8 ┆ c │
1812
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1813
+ # # │ 4 ┆ 9 ┆ d │
1814
+ # # └─────┴─────┴─────┘
1815
+ def vstack(df, in_place: false)
1816
+ if in_place
1817
+ _df.vstack_mut(df._df)
1818
+ self
1819
+ else
1820
+ _from_rbdf(_df.vstack(df._df))
1821
+ end
1822
+ end
908
1823
 
1824
+ # Extend the memory backed by this `DataFrame` with the values from `other`.
1825
+ #
1826
+ # Different from `vstack` which adds the chunks from `other` to the chunks of this
1827
+ # `DataFrame`, `extend` appends the data from `other` to the underlying memory
1828
+ # locations and thus may cause a reallocation.
1829
+ #
1830
+ # If this does not cause a reallocation, the resulting data structure will not
1831
+ # have any extra chunks and thus will yield faster queries.
1832
+ #
1833
+ # Prefer `extend` over `vstack` when you want to do a query after a single append.
1834
+ # For instance during online operations where you add `n` rows and rerun a query.
1835
+ #
1836
+ # Prefer `vstack` over `extend` when you want to append many times before doing a
1837
+ # query. For instance when you read in multiple files and want to store them in a
1838
+ # single `DataFrame`. In the latter case, finish the sequence of `vstack`
1839
+ # operations with a `rechunk`.
1840
+ #
1841
+ # @param other [DataFrame]
1842
+ # DataFrame to vertically add.
1843
+ #
1844
+ # @return [DataFrame]
909
1845
  #
1846
+ # @example
1847
+ # df1 = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
1848
+ # df2 = Polars::DataFrame.new({"foo" => [10, 20, 30], "bar" => [40, 50, 60]})
1849
+ # df1.extend(df2)
1850
+ # # =>
1851
+ # # shape: (6, 2)
1852
+ # # ┌─────┬─────┐
1853
+ # # │ foo ┆ bar │
1854
+ # # │ --- ┆ --- │
1855
+ # # │ i64 ┆ i64 │
1856
+ # # ╞═════╪═════╡
1857
+ # # │ 1 ┆ 4 │
1858
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1859
+ # # │ 2 ┆ 5 │
1860
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1861
+ # # │ 3 ┆ 6 │
1862
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1863
+ # # │ 10 ┆ 40 │
1864
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1865
+ # # │ 20 ┆ 50 │
1866
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1867
+ # # │ 30 ┆ 60 │
1868
+ # # └─────┴─────┘
910
1869
  def extend(other)
911
1870
  _df.extend(other._df)
912
1871
  self
913
1872
  end
914
1873
 
915
- # def drop
916
- # end
1874
+ # Remove column from DataFrame and return as new.
1875
+ #
1876
+ # @param columns [Object]
1877
+ # Column(s) to drop.
1878
+ #
1879
+ # @return [DataFrame]
1880
+ #
1881
+ # @example
1882
+ # df = Polars::DataFrame.new(
1883
+ # {
1884
+ # "foo" => [1, 2, 3],
1885
+ # "bar" => [6.0, 7.0, 8.0],
1886
+ # "ham" => ["a", "b", "c"]
1887
+ # }
1888
+ # )
1889
+ # df.drop("ham")
1890
+ # # =>
1891
+ # # shape: (3, 2)
1892
+ # # ┌─────┬─────┐
1893
+ # # │ foo ┆ bar │
1894
+ # # │ --- ┆ --- │
1895
+ # # │ i64 ┆ f64 │
1896
+ # # ╞═════╪═════╡
1897
+ # # │ 1 ┆ 6.0 │
1898
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1899
+ # # │ 2 ┆ 7.0 │
1900
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1901
+ # # │ 3 ┆ 8.0 │
1902
+ # # └─────┴─────┘
1903
+ def drop(columns)
1904
+ if columns.is_a?(Array)
1905
+ df = clone
1906
+ columns.each do |n|
1907
+ df._df.drop_in_place(n)
1908
+ end
1909
+ df
1910
+ else
1911
+ _from_rbdf(_df.drop(columns))
1912
+ end
1913
+ end
917
1914
 
918
- # def drop_in_place
919
- # end
1915
+ # Drop in place.
1916
+ #
1917
+ # @param name [Object]
1918
+ # Column to drop.
1919
+ #
1920
+ # @return [Series]
1921
+ #
1922
+ # @example
1923
+ # df = Polars::DataFrame.new(
1924
+ # {
1925
+ # "foo" => [1, 2, 3],
1926
+ # "bar" => [6, 7, 8],
1927
+ # "ham" => ["a", "b", "c"]
1928
+ # }
1929
+ # )
1930
+ # df.drop_in_place("ham")
1931
+ # # =>
1932
+ # # shape: (3,)
1933
+ # # Series: 'ham' [str]
1934
+ # # [
1935
+ # # "a"
1936
+ # # "b"
1937
+ # # "c"
1938
+ # # ]
1939
+ def drop_in_place(name)
1940
+ Utils.wrap_s(_df.drop_in_place(name))
1941
+ end
920
1942
 
921
- # def cleared
922
- # end
1943
+ # Create an empty copy of the current DataFrame.
1944
+ #
1945
+ # Returns a DataFrame with identical schema but no data.
1946
+ #
1947
+ # @return [DataFrame]
1948
+ #
1949
+ # @example
1950
+ # df = Polars::DataFrame.new(
1951
+ # {
1952
+ # "a" => [nil, 2, 3, 4],
1953
+ # "b" => [0.5, nil, 2.5, 13],
1954
+ # "c" => [true, true, false, nil]
1955
+ # }
1956
+ # )
1957
+ # df.cleared
1958
+ # # =>
1959
+ # # shape: (0, 3)
1960
+ # # ┌─────┬─────┬──────┐
1961
+ # # │ a ┆ b ┆ c │
1962
+ # # │ --- ┆ --- ┆ --- │
1963
+ # # │ i64 ┆ f64 ┆ bool │
1964
+ # # ╞═════╪═════╪══════╡
1965
+ # # └─────┴─────┴──────┘
1966
+ def cleared
1967
+ height > 0 ? head(0) : clone
1968
+ end
923
1969
 
924
1970
  # clone handled by initialize_copy
925
1971
 
1972
+ # Get the DataFrame as an Array of Series.
926
1973
  #
1974
+ # @return [Array]
927
1975
  def get_columns
928
1976
  _df.get_columns.map { |s| Utils.wrap_s(s) }
929
1977
  end
930
1978
 
1979
+ # Get a single column as Series by name.
1980
+ #
1981
+ # @param name [String]
1982
+ # Name of the column to retrieve.
1983
+ #
1984
+ # @return [Series]
1985
+ #
1986
+ # @example
1987
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
1988
+ # df.get_column("foo")
1989
+ # # =>
1990
+ # # shape: (3,)
1991
+ # # Series: 'foo' [i64]
1992
+ # # [
1993
+ # # 1
1994
+ # # 2
1995
+ # # 3
1996
+ # # ]
931
1997
  def get_column(name)
932
1998
  self[name]
933
1999
  end
@@ -935,13 +2001,85 @@ module Polars
935
2001
  # def fill_null
936
2002
  # end
937
2003
 
2004
+ # Fill floating point NaN values by an Expression evaluation.
2005
+ #
2006
+ # @param fill_value [Object]
2007
+ # Value to fill NaN with.
2008
+ #
2009
+ # @return [DataFrame]
2010
+ #
2011
+ # @note
2012
+ # Note that floating point NaNs (Not a Number) are not missing values!
2013
+ # To replace missing values, use `fill_null`.
938
2014
  #
2015
+ # @example
2016
+ # df = Polars::DataFrame.new(
2017
+ # {
2018
+ # "a" => [1.5, 2, Float::NAN, 4],
2019
+ # "b" => [0.5, 4, Float::NAN, 13]
2020
+ # }
2021
+ # )
2022
+ # df.fill_nan(99)
2023
+ # # =>
2024
+ # # shape: (4, 2)
2025
+ # # ┌──────┬──────┐
2026
+ # # │ a ┆ b │
2027
+ # # │ --- ┆ --- │
2028
+ # # │ f64 ┆ f64 │
2029
+ # # ╞══════╪══════╡
2030
+ # # │ 1.5 ┆ 0.5 │
2031
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2032
+ # # │ 2.0 ┆ 4.0 │
2033
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2034
+ # # │ 99.0 ┆ 99.0 │
2035
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2036
+ # # │ 4.0 ┆ 13.0 │
2037
+ # # └──────┴──────┘
939
2038
  def fill_nan(fill_value)
940
2039
  lazy.fill_nan(fill_value).collect(no_optimization: true)
941
2040
  end
942
2041
 
943
- # def explode
944
- # end
2042
+ # Explode `DataFrame` to long format by exploding a column with Lists.
2043
+ #
2044
+ # @param columns [Object]
2045
+ # Column of LargeList type.
2046
+ #
2047
+ # @return [DataFrame]
2048
+ #
2049
+ # @example
2050
+ # df = Polars::DataFrame.new(
2051
+ # {
2052
+ # "letters" => ["a", "a", "b", "c"],
2053
+ # "numbers" => [[1], [2, 3], [4, 5], [6, 7, 8]]
2054
+ # }
2055
+ # )
2056
+ # df.explode("numbers")
2057
+ # # =>
2058
+ # # shape: (8, 2)
2059
+ # # ┌─────────┬─────────┐
2060
+ # # │ letters ┆ numbers │
2061
+ # # │ --- ┆ --- │
2062
+ # # │ str ┆ i64 │
2063
+ # # ╞═════════╪═════════╡
2064
+ # # │ a ┆ 1 │
2065
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2066
+ # # │ a ┆ 2 │
2067
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2068
+ # # │ a ┆ 3 │
2069
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2070
+ # # │ b ┆ 4 │
2071
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2072
+ # # │ b ┆ 5 │
2073
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2074
+ # # │ c ┆ 6 │
2075
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2076
+ # # │ c ┆ 7 │
2077
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2078
+ # # │ c ┆ 8 │
2079
+ # # └─────────┴─────────┘
2080
+ def explode(columns)
2081
+ lazy.explode(columns).collect(no_optimization: true)
2082
+ end
945
2083
 
946
2084
  # def pivot
947
2085
  # end
@@ -955,25 +2093,242 @@ module Polars
955
2093
  # def partition_by
956
2094
  # end
957
2095
 
958
- # def shift
959
- # end
960
-
961
- # def shift_and_fill
962
- # end
2096
+ # Shift values by the given period.
2097
+ #
2098
+ # @param periods [Integer]
2099
+ # Number of places to shift (may be negative).
2100
+ #
2101
+ # @return [DataFrame]
2102
+ #
2103
+ # @example
2104
+ # df = Polars::DataFrame.new(
2105
+ # {
2106
+ # "foo" => [1, 2, 3],
2107
+ # "bar" => [6, 7, 8],
2108
+ # "ham" => ["a", "b", "c"]
2109
+ # }
2110
+ # )
2111
+ # df.shift(1)
2112
+ # # =>
2113
+ # # shape: (3, 3)
2114
+ # # ┌──────┬──────┬──────┐
2115
+ # # │ foo ┆ bar ┆ ham │
2116
+ # # │ --- ┆ --- ┆ --- │
2117
+ # # │ i64 ┆ i64 ┆ str │
2118
+ # # ╞══════╪══════╪══════╡
2119
+ # # │ null ┆ null ┆ null │
2120
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2121
+ # # │ 1 ┆ 6 ┆ a │
2122
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2123
+ # # │ 2 ┆ 7 ┆ b │
2124
+ # # └──────┴──────┴──────┘
2125
+ #
2126
+ # @example
2127
+ # df.shift(-1)
2128
+ # # =>
2129
+ # # shape: (3, 3)
2130
+ # # ┌──────┬──────┬──────┐
2131
+ # # │ foo ┆ bar ┆ ham │
2132
+ # # │ --- ┆ --- ┆ --- │
2133
+ # # │ i64 ┆ i64 ┆ str │
2134
+ # # ╞══════╪══════╪══════╡
2135
+ # # │ 2 ┆ 7 ┆ b │
2136
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2137
+ # # │ 3 ┆ 8 ┆ c │
2138
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2139
+ # # │ null ┆ null ┆ null │
2140
+ # # └──────┴──────┴──────┘
2141
+ def shift(periods)
2142
+ _from_rbdf(_df.shift(periods))
2143
+ end
2144
+
2145
+ # Shift the values by a given period and fill the resulting null values.
2146
+ #
2147
+ # @param periods [Integer]
2148
+ # Number of places to shift (may be negative).
2149
+ # @param fill_value [Object]
2150
+ # Fill nil values with this value.
2151
+ #
2152
+ # @return [DataFrame]
2153
+ #
2154
+ # @example
2155
+ # df = Polars::DataFrame.new(
2156
+ # {
2157
+ # "foo" => [1, 2, 3],
2158
+ # "bar" => [6, 7, 8],
2159
+ # "ham" => ["a", "b", "c"]
2160
+ # }
2161
+ # )
2162
+ # df.shift_and_fill(1, 0)
2163
+ # # =>
2164
+ # # shape: (3, 3)
2165
+ # # ┌─────┬─────┬─────┐
2166
+ # # │ foo ┆ bar ┆ ham │
2167
+ # # │ --- ┆ --- ┆ --- │
2168
+ # # │ i64 ┆ i64 ┆ str │
2169
+ # # ╞═════╪═════╪═════╡
2170
+ # # │ 0 ┆ 0 ┆ 0 │
2171
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
2172
+ # # │ 1 ┆ 6 ┆ a │
2173
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
2174
+ # # │ 2 ┆ 7 ┆ b │
2175
+ # # └─────┴─────┴─────┘
2176
+ def shift_and_fill(periods, fill_value)
2177
+ lazy
2178
+ .shift_and_fill(periods, fill_value)
2179
+ .collect(no_optimization: true, string_cache: false)
2180
+ end
963
2181
 
2182
+ # Get a mask of all duplicated rows in this DataFrame.
2183
+ #
2184
+ # @return [Series]
964
2185
  #
2186
+ # @example
2187
+ # df = Polars::DataFrame.new(
2188
+ # {
2189
+ # "a" => [1, 2, 3, 1],
2190
+ # "b" => ["x", "y", "z", "x"],
2191
+ # }
2192
+ # )
2193
+ # df.is_duplicated
2194
+ # # =>
2195
+ # # shape: (4,)
2196
+ # # Series: '' [bool]
2197
+ # # [
2198
+ # # true
2199
+ # # false
2200
+ # # false
2201
+ # # true
2202
+ # # ]
965
2203
  def is_duplicated
966
2204
  Utils.wrap_s(_df.is_duplicated)
967
2205
  end
968
2206
 
2207
+ # Get a mask of all unique rows in this DataFrame.
2208
+ #
2209
+ # @return [Series]
2210
+ #
2211
+ # @example
2212
+ # df = Polars::DataFrame.new(
2213
+ # {
2214
+ # "a" => [1, 2, 3, 1],
2215
+ # "b" => ["x", "y", "z", "x"]
2216
+ # }
2217
+ # )
2218
+ # df.is_unique
2219
+ # # =>
2220
+ # # shape: (4,)
2221
+ # # Series: '' [bool]
2222
+ # # [
2223
+ # # false
2224
+ # # true
2225
+ # # true
2226
+ # # false
2227
+ # # ]
969
2228
  def is_unique
970
2229
  Utils.wrap_s(_df.is_unique)
971
2230
  end
972
2231
 
2232
+ # Start a lazy query from this point.
2233
+ #
2234
+ # @return [LazyFrame]
973
2235
  def lazy
974
2236
  wrap_ldf(_df.lazy)
975
2237
  end
976
2238
 
2239
+ # Select columns from this DataFrame.
2240
+ #
2241
+ # @param exprs [Object]
2242
+ # Column or columns to select.
2243
+ #
2244
+ # @return [DataFrame]
2245
+ #
2246
+ # @example
2247
+ # df = Polars::DataFrame.new(
2248
+ # {
2249
+ # "foo" => [1, 2, 3],
2250
+ # "bar" => [6, 7, 8],
2251
+ # "ham" => ["a", "b", "c"]
2252
+ # }
2253
+ # )
2254
+ # df.select("foo")
2255
+ # # =>
2256
+ # # shape: (3, 1)
2257
+ # # ┌─────┐
2258
+ # # │ foo │
2259
+ # # │ --- │
2260
+ # # │ i64 │
2261
+ # # ╞═════╡
2262
+ # # │ 1 │
2263
+ # # ├╌╌╌╌╌┤
2264
+ # # │ 2 │
2265
+ # # ├╌╌╌╌╌┤
2266
+ # # │ 3 │
2267
+ # # └─────┘
2268
+ #
2269
+ # @example
2270
+ # df.select(["foo", "bar"])
2271
+ # # =>
2272
+ # # shape: (3, 2)
2273
+ # # ┌─────┬─────┐
2274
+ # # │ foo ┆ bar │
2275
+ # # │ --- ┆ --- │
2276
+ # # │ i64 ┆ i64 │
2277
+ # # ╞═════╪═════╡
2278
+ # # │ 1 ┆ 6 │
2279
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
2280
+ # # │ 2 ┆ 7 │
2281
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
2282
+ # # │ 3 ┆ 8 │
2283
+ # # └─────┴─────┘
2284
+ #
2285
+ # @example
2286
+ # df.select(Polars.col("foo") + 1)
2287
+ # # =>
2288
+ # # shape: (3, 1)
2289
+ # # ┌─────┐
2290
+ # # │ foo │
2291
+ # # │ --- │
2292
+ # # │ i64 │
2293
+ # # ╞═════╡
2294
+ # # │ 2 │
2295
+ # # ├╌╌╌╌╌┤
2296
+ # # │ 3 │
2297
+ # # ├╌╌╌╌╌┤
2298
+ # # │ 4 │
2299
+ # # └─────┘
2300
+ #
2301
+ # @example
2302
+ # df.select([Polars.col("foo") + 1, Polars.col("bar") + 1])
2303
+ # # =>
2304
+ # # shape: (3, 2)
2305
+ # # ┌─────┬─────┐
2306
+ # # │ foo ┆ bar │
2307
+ # # │ --- ┆ --- │
2308
+ # # │ i64 ┆ i64 │
2309
+ # # ╞═════╪═════╡
2310
+ # # │ 2 ┆ 7 │
2311
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
2312
+ # # │ 3 ┆ 8 │
2313
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
2314
+ # # │ 4 ┆ 9 │
2315
+ # # └─────┴─────┘
2316
+ #
2317
+ # @example
2318
+ # df.select(Polars.when(Polars.col("foo") > 2).then(10).otherwise(0))
2319
+ # # =>
2320
+ # # shape: (3, 1)
2321
+ # # ┌─────────┐
2322
+ # # │ literal │
2323
+ # # │ --- │
2324
+ # # │ i64 │
2325
+ # # ╞═════════╡
2326
+ # # │ 0 │
2327
+ # # ├╌╌╌╌╌╌╌╌╌┤
2328
+ # # │ 0 │
2329
+ # # ├╌╌╌╌╌╌╌╌╌┤
2330
+ # # │ 10 │
2331
+ # # └─────────┘
977
2332
  def select(exprs)
978
2333
  _from_rbdf(
979
2334
  lazy
@@ -983,6 +2338,43 @@ module Polars
983
2338
  )
984
2339
  end
985
2340
 
2341
+ # Add or overwrite multiple columns in a DataFrame.
2342
+ #
2343
+ # @param exprs [Array]
2344
+ # Array of Expressions that evaluate to columns.
2345
+ #
2346
+ # @return [DataFrame]
2347
+ #
2348
+ # @example
2349
+ # df = Polars::DataFrame.new(
2350
+ # {
2351
+ # "a" => [1, 2, 3, 4],
2352
+ # "b" => [0.5, 4, 10, 13],
2353
+ # "c" => [true, true, false, true]
2354
+ # }
2355
+ # )
2356
+ # df.with_columns(
2357
+ # [
2358
+ # (Polars.col("a") ** 2).alias("a^2"),
2359
+ # (Polars.col("b") / 2).alias("b/2"),
2360
+ # (Polars.col("c").is_not()).alias("not c")
2361
+ # ]
2362
+ # )
2363
+ # # =>
2364
+ # # shape: (4, 6)
2365
+ # # ┌─────┬──────┬───────┬──────┬──────┬───────┐
2366
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
2367
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
2368
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │
2369
+ # # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
2370
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │
2371
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2372
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │
2373
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2374
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
2375
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2376
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
2377
+ # # └─────┴──────┴───────┴──────┴──────┴───────┘
986
2378
  def with_columns(exprs)
987
2379
  if !exprs.nil? && !exprs.is_a?(Array)
988
2380
  exprs = [exprs]
@@ -992,6 +2384,26 @@ module Polars
992
2384
  .collect(no_optimization: true, string_cache: false)
993
2385
  end
994
2386
 
2387
+ # Get number of chunks used by the ChunkedArrays of this DataFrame.
2388
+ #
2389
+ # @param strategy ["first", "all"]
2390
+ # Return the number of chunks of the 'first' column,
2391
+ # or 'all' columns in this DataFrame.
2392
+ #
2393
+ # @return [Object]
2394
+ #
2395
+ # @example
2396
+ # df = Polars::DataFrame.new(
2397
+ # {
2398
+ # "a" => [1, 2, 3, 4],
2399
+ # "b" => [0.5, 4, 10, 13],
2400
+ # "c" => [true, true, false, true]
2401
+ # }
2402
+ # )
2403
+ # df.n_chunks
2404
+ # # => 1
2405
+ # df.n_chunks(strategy: "all")
2406
+ # # => [1, 1, 1]
995
2407
  def n_chunks(strategy: "first")
996
2408
  if strategy == "first"
997
2409
  _df.n_chunks
@@ -1002,6 +2414,28 @@ module Polars
1002
2414
  end
1003
2415
  end
1004
2416
 
2417
+ # Aggregate the columns of this DataFrame to their maximum value.
2418
+ #
2419
+ # @return [DataFrame]
2420
+ #
2421
+ # @example
2422
+ # df = Polars::DataFrame.new(
2423
+ # {
2424
+ # "foo" => [1, 2, 3],
2425
+ # "bar" => [6, 7, 8],
2426
+ # "ham" => ["a", "b", "c"]
2427
+ # }
2428
+ # )
2429
+ # df.max
2430
+ # # =>
2431
+ # # shape: (1, 3)
2432
+ # # ┌─────┬─────┬─────┐
2433
+ # # │ foo ┆ bar ┆ ham │
2434
+ # # │ --- ┆ --- ┆ --- │
2435
+ # # │ i64 ┆ i64 ┆ str │
2436
+ # # ╞═════╪═════╪═════╡
2437
+ # # │ 3 ┆ 8 ┆ c │
2438
+ # # └─────┴─────┴─────┘
1005
2439
  def max(axis: 0)
1006
2440
  if axis == 0
1007
2441
  _from_rbdf(_df.max)
@@ -1012,6 +2446,28 @@ module Polars
1012
2446
  end
1013
2447
  end
1014
2448
 
2449
+ # Aggregate the columns of this DataFrame to their minimum value.
2450
+ #
2451
+ # @return [DataFrame]
2452
+ #
2453
+ # @example
2454
+ # df = Polars::DataFrame.new(
2455
+ # {
2456
+ # "foo" => [1, 2, 3],
2457
+ # "bar" => [6, 7, 8],
2458
+ # "ham" => ["a", "b", "c"]
2459
+ # }
2460
+ # )
2461
+ # df.min
2462
+ # # =>
2463
+ # # shape: (1, 3)
2464
+ # # ┌─────┬─────┬─────┐
2465
+ # # │ foo ┆ bar ┆ ham │
2466
+ # # │ --- ┆ --- ┆ --- │
2467
+ # # │ i64 ┆ i64 ┆ str │
2468
+ # # ╞═════╪═════╪═════╡
2469
+ # # │ 1 ┆ 6 ┆ a │
2470
+ # # └─────┴─────┴─────┘
1015
2471
  def min(axis: 0)
1016
2472
  if axis == 0
1017
2473
  _from_rbdf(_df.min)
@@ -1022,6 +2478,44 @@ module Polars
1022
2478
  end
1023
2479
  end
1024
2480
 
2481
+ # Aggregate the columns of this DataFrame to their sum value.
2482
+ #
2483
+ # @param axis [Integer]
2484
+ # Either 0 or 1.
2485
+ # @param null_strategy ["ignore", "propagate"]
2486
+ # This argument is only used if axis == 1.
2487
+ #
2488
+ # @return [DataFrame]
2489
+ #
2490
+ # @example
2491
+ # df = Polars::DataFrame.new(
2492
+ # {
2493
+ # "foo" => [1, 2, 3],
2494
+ # "bar" => [6, 7, 8],
2495
+ # "ham" => ["a", "b", "c"],
2496
+ # }
2497
+ # )
2498
+ # df.sum
2499
+ # # =>
2500
+ # # shape: (1, 3)
2501
+ # # ┌─────┬─────┬──────┐
2502
+ # # │ foo ┆ bar ┆ ham │
2503
+ # # │ --- ┆ --- ┆ --- │
2504
+ # # │ i64 ┆ i64 ┆ str │
2505
+ # # ╞═════╪═════╪══════╡
2506
+ # # │ 6 ┆ 21 ┆ null │
2507
+ # # └─────┴─────┴──────┘
2508
+ #
2509
+ # @example
2510
+ # df.sum(axis: 1)
2511
+ # # =>
2512
+ # # shape: (3,)
2513
+ # # Series: 'foo' [str]
2514
+ # # [
2515
+ # # "16a"
2516
+ # # "27b"
2517
+ # # "38c"
2518
+ # # ]
1025
2519
  def sum(axis: 0, null_strategy: "ignore")
1026
2520
  case axis
1027
2521
  when 0
@@ -1033,6 +2527,33 @@ module Polars
1033
2527
  end
1034
2528
  end
1035
2529
 
2530
+ # Aggregate the columns of this DataFrame to their mean value.
2531
+ #
2532
+ # @param axis [Integer]
2533
+ # Either 0 or 1.
2534
+ # @param null_strategy ["ignore", "propagate"]
2535
+ # This argument is only used if axis == 1.
2536
+ #
2537
+ # @return [DataFrame]
2538
+ #
2539
+ # @example
2540
+ # df = Polars::DataFrame.new(
2541
+ # {
2542
+ # "foo" => [1, 2, 3],
2543
+ # "bar" => [6, 7, 8],
2544
+ # "ham" => ["a", "b", "c"]
2545
+ # }
2546
+ # )
2547
+ # df.mean
2548
+ # # =>
2549
+ # # shape: (1, 3)
2550
+ # # ┌─────┬─────┬──────┐
2551
+ # # │ foo ┆ bar ┆ ham │
2552
+ # # │ --- ┆ --- ┆ --- │
2553
+ # # │ f64 ┆ f64 ┆ str │
2554
+ # # ╞═════╪═════╪══════╡
2555
+ # # │ 2.0 ┆ 7.0 ┆ null │
2556
+ # # └─────┴─────┴──────┘
1036
2557
  def mean(axis: 0, null_strategy: "ignore")
1037
2558
  case axis
1038
2559
  when 0
@@ -1044,77 +2565,633 @@ module Polars
1044
2565
  end
1045
2566
  end
1046
2567
 
2568
+ # Aggregate the columns of this DataFrame to their standard deviation value.
2569
+ #
2570
+ # @param ddof [Integer]
2571
+ # Degrees of freedom
2572
+ #
2573
+ # @return [DataFrame]
2574
+ #
2575
+ # @example
2576
+ # df = Polars::DataFrame.new(
2577
+ # {
2578
+ # "foo" => [1, 2, 3],
2579
+ # "bar" => [6, 7, 8],
2580
+ # "ham" => ["a", "b", "c"]
2581
+ # }
2582
+ # )
2583
+ # df.std
2584
+ # # =>
2585
+ # # shape: (1, 3)
2586
+ # # ┌─────┬─────┬──────┐
2587
+ # # │ foo ┆ bar ┆ ham │
2588
+ # # │ --- ┆ --- ┆ --- │
2589
+ # # │ f64 ┆ f64 ┆ str │
2590
+ # # ╞═════╪═════╪══════╡
2591
+ # # │ 1.0 ┆ 1.0 ┆ null │
2592
+ # # └─────┴─────┴──────┘
2593
+ #
2594
+ # @example
2595
+ # df.std(ddof: 0)
2596
+ # # =>
2597
+ # # shape: (1, 3)
2598
+ # # ┌──────────┬──────────┬──────┐
2599
+ # # │ foo ┆ bar ┆ ham │
2600
+ # # │ --- ┆ --- ┆ --- │
2601
+ # # │ f64 ┆ f64 ┆ str │
2602
+ # # ╞══════════╪══════════╪══════╡
2603
+ # # │ 0.816497 ┆ 0.816497 ┆ null │
2604
+ # # └──────────┴──────────┴──────┘
1047
2605
  def std(ddof: 1)
1048
2606
  _from_rbdf(_df.std(ddof))
1049
2607
  end
1050
2608
 
2609
+ # Aggregate the columns of this DataFrame to their variance value.
2610
+ #
2611
+ # @param ddof [Integer]
2612
+ # Degrees of freedom
2613
+ #
2614
+ # @return [DataFrame]
2615
+ #
2616
+ # @example
2617
+ # df = Polars::DataFrame.new(
2618
+ # {
2619
+ # "foo" => [1, 2, 3],
2620
+ # "bar" => [6, 7, 8],
2621
+ # "ham" => ["a", "b", "c"]
2622
+ # }
2623
+ # )
2624
+ # df.var
2625
+ # # =>
2626
+ # # shape: (1, 3)
2627
+ # # ┌─────┬─────┬──────┐
2628
+ # # │ foo ┆ bar ┆ ham │
2629
+ # # │ --- ┆ --- ┆ --- │
2630
+ # # │ f64 ┆ f64 ┆ str │
2631
+ # # ╞═════╪═════╪══════╡
2632
+ # # │ 1.0 ┆ 1.0 ┆ null │
2633
+ # # └─────┴─────┴──────┘
2634
+ #
2635
+ # @example
2636
+ # df.var(ddof: 0)
2637
+ # # =>
2638
+ # # shape: (1, 3)
2639
+ # # ┌──────────┬──────────┬──────┐
2640
+ # # │ foo ┆ bar ┆ ham │
2641
+ # # │ --- ┆ --- ┆ --- │
2642
+ # # │ f64 ┆ f64 ┆ str │
2643
+ # # ╞══════════╪══════════╪══════╡
2644
+ # # │ 0.666667 ┆ 0.666667 ┆ null │
2645
+ # # └──────────┴──────────┴──────┘
1051
2646
  def var(ddof: 1)
1052
2647
  _from_rbdf(_df.var(ddof))
1053
2648
  end
1054
2649
 
2650
+ # Aggregate the columns of this DataFrame to their median value.
2651
+ #
2652
+ # @return [DataFrame]
2653
+ #
2654
+ # @example
2655
+ # df = Polars::DataFrame.new(
2656
+ # {
2657
+ # "foo" => [1, 2, 3],
2658
+ # "bar" => [6, 7, 8],
2659
+ # "ham" => ["a", "b", "c"]
2660
+ # }
2661
+ # )
2662
+ # df.median
2663
+ # # =>
2664
+ # # shape: (1, 3)
2665
+ # # ┌─────┬─────┬──────┐
2666
+ # # │ foo ┆ bar ┆ ham │
2667
+ # # │ --- ┆ --- ┆ --- │
2668
+ # # │ f64 ┆ f64 ┆ str │
2669
+ # # ╞═════╪═════╪══════╡
2670
+ # # │ 2.0 ┆ 7.0 ┆ null │
2671
+ # # └─────┴─────┴──────┘
1055
2672
  def median
1056
2673
  _from_rbdf(_df.median)
1057
2674
  end
1058
2675
 
1059
- # def product
1060
- # end
2676
+ # Aggregate the columns of this DataFrame to their product values.
2677
+ #
2678
+ # @return [DataFrame]
2679
+ #
2680
+ # @example
2681
+ # df = Polars::DataFrame.new(
2682
+ # {
2683
+ # "a" => [1, 2, 3],
2684
+ # "b" => [0.5, 4, 10],
2685
+ # "c" => [true, true, false]
2686
+ # }
2687
+ # )
2688
+ # df.product
2689
+ # # =>
2690
+ # # shape: (1, 3)
2691
+ # # ┌─────┬──────┬─────┐
2692
+ # # │ a ┆ b ┆ c │
2693
+ # # │ --- ┆ --- ┆ --- │
2694
+ # # │ i64 ┆ f64 ┆ i64 │
2695
+ # # ╞═════╪══════╪═════╡
2696
+ # # │ 6 ┆ 20.0 ┆ 0 │
2697
+ # # └─────┴──────┴─────┘
2698
+ def product
2699
+ select(Polars.all.product)
2700
+ end
2701
+
2702
+ # Aggregate the columns of this DataFrame to their quantile value.
2703
+ #
2704
+ # @param quantile [Float]
2705
+ # Quantile between 0.0 and 1.0.
2706
+ # @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
2707
+ # Interpolation method.
2708
+ #
2709
+ # @return [DataFrame]
2710
+ #
2711
+ # @example
2712
+ # df = Polars::DataFrame.new(
2713
+ # {
2714
+ # "foo" => [1, 2, 3],
2715
+ # "bar" => [6, 7, 8],
2716
+ # "ham" => ["a", "b", "c"]
2717
+ # }
2718
+ # )
2719
+ # df.quantile(0.5, interpolation: "nearest")
2720
+ # # =>
2721
+ # # shape: (1, 3)
2722
+ # # ┌─────┬─────┬──────┐
2723
+ # # │ foo ┆ bar ┆ ham │
2724
+ # # │ --- ┆ --- ┆ --- │
2725
+ # # │ f64 ┆ f64 ┆ str │
2726
+ # # ╞═════╪═════╪══════╡
2727
+ # # │ 2.0 ┆ 7.0 ┆ null │
2728
+ # # └─────┴─────┴──────┘
2729
+ def quantile(quantile, interpolation: "nearest")
2730
+ _from_rbdf(_df.quantile(quantile, interpolation))
2731
+ end
2732
+
2733
+ # Get one hot encoded dummy variables.
2734
+ #
2735
+ # @param columns
2736
+ # A subset of columns to convert to dummy variables. `nil` means
2737
+ # "all columns".
2738
+ #
2739
+ # @return [DataFrame]
2740
+ #
2741
+ # @example
2742
+ # df = Polars::DataFrame.new(
2743
+ # {
2744
+ # "foo" => [1, 2],
2745
+ # "bar" => [3, 4],
2746
+ # "ham" => ["a", "b"]
2747
+ # }
2748
+ # )
2749
+ # df.to_dummies
2750
+ # # =>
2751
+ # # shape: (2, 6)
2752
+ # # ┌───────┬───────┬───────┬───────┬───────┬───────┐
2753
+ # # │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │
2754
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
2755
+ # # │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │
2756
+ # # ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡
2757
+ # # │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │
2758
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2759
+ # # │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │
2760
+ # # └───────┴───────┴───────┴───────┴───────┴───────┘
2761
+ def to_dummies(columns: nil)
2762
+ if columns.is_a?(String)
2763
+ columns = [columns]
2764
+ end
2765
+ _from_rbdf(_df.to_dummies(columns))
2766
+ end
1061
2767
 
1062
- # def quantile(quantile, interpolation: "nearest")
1063
- # end
2768
+ # Drop duplicate rows from this DataFrame.
2769
+ #
2770
+ # @param maintain_order [Boolean]
2771
+ # Keep the same order as the original DataFrame. This requires more work to
2772
+ # compute.
2773
+ # @param subset [Object]
2774
+ # Subset to use to compare rows.
2775
+ # @param keep ["first", "last"]
2776
+ # Which of the duplicate rows to keep (in conjunction with `subset`).
2777
+ #
2778
+ # @return [DataFrame]
2779
+ #
2780
+ # @note
2781
+ # Note that this fails if there is a column of type `List` in the DataFrame or
2782
+ # subset.
2783
+ #
2784
+ # @example
2785
+ # df = Polars::DataFrame.new(
2786
+ # {
2787
+ # "a" => [1, 1, 2, 3, 4, 5],
2788
+ # "b" => [0.5, 0.5, 1.0, 2.0, 3.0, 3.0],
2789
+ # "c" => [true, true, true, false, true, true]
2790
+ # }
2791
+ # )
2792
+ # df.unique
2793
+ # # =>
2794
+ # # shape: (5, 3)
2795
+ # # ┌─────┬─────┬───────┐
2796
+ # # │ a ┆ b ┆ c │
2797
+ # # │ --- ┆ --- ┆ --- │
2798
+ # # │ i64 ┆ f64 ┆ bool │
2799
+ # # ╞═════╪═════╪═══════╡
2800
+ # # │ 1 ┆ 0.5 ┆ true │
2801
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2802
+ # # │ 2 ┆ 1.0 ┆ true │
2803
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2804
+ # # │ 3 ┆ 2.0 ┆ false │
2805
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2806
+ # # │ 4 ┆ 3.0 ┆ true │
2807
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2808
+ # # │ 5 ┆ 3.0 ┆ true │
2809
+ # # └─────┴─────┴───────┘
2810
+ def unique(maintain_order: true, subset: nil, keep: "first")
2811
+ if !subset.nil?
2812
+ if subset.is_a?(String)
2813
+ subset = [subset]
2814
+ elsif !subset.is_a?(Array)
2815
+ subset = subset.to_a
2816
+ end
2817
+ end
1064
2818
 
1065
- # def to_dummies
1066
- # end
2819
+ _from_rbdf(_df.unique(maintain_order, subset, keep))
2820
+ end
1067
2821
 
1068
- # def unique
1069
- # end
2822
+ # Return the number of unique rows, or the number of unique row-subsets.
2823
+ #
2824
+ # @param subset [Object]
2825
+ # One or more columns/expressions that define what to count;
2826
+ # omit to return the count of unique rows.
2827
+ #
2828
+ # @return [DataFrame]
2829
+ #
2830
+ # @example
2831
+ # df = Polars::DataFrame.new(
2832
+ # {
2833
+ # "a" => [1, 1, 2, 3, 4, 5],
2834
+ # "b" => [0.5, 0.5, 1.0, 2.0, 3.0, 3.0],
2835
+ # "c" => [true, true, true, false, true, true]
2836
+ # }
2837
+ # )
2838
+ # df.n_unique
2839
+ # # => 5
2840
+ #
2841
+ # @example Simple columns subset
2842
+ # df.n_unique(subset: ["b", "c"])
2843
+ # # => 4
2844
+ #
2845
+ # @example Expression subset
2846
+ # df.n_unique(
2847
+ # subset: [
2848
+ # (Polars.col("a").floordiv(2)),
2849
+ # (Polars.col("c") | (Polars.col("b") >= 2))
2850
+ # ]
2851
+ # )
2852
+ # # => 3
2853
+ def n_unique(subset: nil)
2854
+ if subset.is_a?(StringIO)
2855
+ subset = [Polars.col(subset)]
2856
+ elsif subset.is_a?(Expr)
2857
+ subset = [subset]
2858
+ end
1070
2859
 
1071
- # def n_unique
1072
- # end
2860
+ if subset.is_a?(Array) && subset.length == 1
2861
+ expr = Utils.expr_to_lit_or_expr(subset[0], str_to_lit: false)
2862
+ else
2863
+ struct_fields = subset.nil? ? Polars.all : subset
2864
+ expr = Polars.struct(struct_fields)
2865
+ end
2866
+
2867
+ df = lazy.select(expr.n_unique).collect
2868
+ df.is_empty ? 0 : df.row(0)[0]
2869
+ end
2870
+
2871
+ # Rechunk the data in this DataFrame to a contiguous allocation.
1073
2872
  #
2873
+ # This will make sure all subsequent operations have optimal and predictable
2874
+ # performance.
1074
2875
  #
2876
+ # @return [DataFrame]
1075
2877
def rechunk
  # Rechunk the underlying frame, then wrap the result via _from_rbdf.
  rechunked = _df.rechunk
  _from_rbdf(rechunked)
end
1078
2880
 
2881
+ # Create a new DataFrame that shows the null counts per column.
2882
+ #
2883
+ # @return [DataFrame]
2884
+ #
2885
+ # @example
2886
+ # df = Polars::DataFrame.new(
2887
+ # {
2888
+ # "foo" => [1, nil, 3],
2889
+ # "bar" => [6, 7, nil],
2890
+ # "ham" => ["a", "b", "c"]
2891
+ # }
2892
+ # )
2893
+ # df.null_count
2894
+ # # =>
2895
+ # # shape: (1, 3)
2896
+ # # ┌─────┬─────┬─────┐
2897
+ # # │ foo ┆ bar ┆ ham │
2898
+ # # │ --- ┆ --- ┆ --- │
2899
+ # # │ u32 ┆ u32 ┆ u32 │
2900
+ # # ╞═════╪═════╪═════╡
2901
+ # # │ 1 ┆ 1 ┆ 0 │
2902
+ # # └─────┴─────┴─────┘
1079
2903
def null_count
  # Ask the underlying frame for its null counts and wrap the result.
  counts = _df.null_count
  _from_rbdf(counts)
end
1082
2906
 
1083
- # def sample
1084
- # end
2907
+ # Sample from this DataFrame.
2908
+ #
2909
+ # @param n [Integer]
2910
+ # Number of items to return. Cannot be used with `frac`. Defaults to 1 if
2911
+ # `frac` is nil.
2912
+ # @param frac [Float]
2913
+ # Fraction of items to return. Cannot be used with `n`.
2914
+ # @param with_replacement [Boolean]
2915
+ # Allow values to be sampled more than once.
2916
+ # @param shuffle [Boolean]
2917
+ # Shuffle the order of sampled data points.
2918
+ # @param seed [Integer]
2919
+ # Seed for the random number generator. If set to nil (default), a random
2920
+ # seed is used.
2921
+ #
2922
+ # @return [DataFrame]
2923
+ #
2924
+ # @example
2925
+ # df = Polars::DataFrame.new(
2926
+ # {
2927
+ # "foo" => [1, 2, 3],
2928
+ # "bar" => [6, 7, 8],
2929
+ # "ham" => ["a", "b", "c"]
2930
+ # }
2931
+ # )
2932
+ # df.sample(n: 2, seed: 0)
2933
+ # # =>
2934
+ # # shape: (2, 3)
2935
+ # # ┌─────┬─────┬─────┐
2936
+ # # │ foo ┆ bar ┆ ham │
2937
+ # # │ --- ┆ --- ┆ --- │
2938
+ # # │ i64 ┆ i64 ┆ str │
2939
+ # # ╞═════╪═════╪═════╡
2940
+ # # │ 3 ┆ 8 ┆ c │
2941
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
2942
+ # # │ 2 ┆ 7 ┆ b │
2943
+ # # └─────┴─────┴─────┘
2944
def sample(
  n: nil,
  frac: nil,
  with_replacement: false,
  shuffle: false,
  seed: nil
)
  if !n.nil? && !frac.nil?
    raise ArgumentError, "cannot specify both `n` and `frac`"
  end

  # Fractional sampling. The explicit `return` is required: without it the
  # result was discarded and execution fell through to `sample_n` with n = 1.
  if n.nil? && !frac.nil?
    return _from_rbdf(
      _df.sample_frac(frac, with_replacement, shuffle, seed)
    )
  end

  # Neither `n` nor `frac` given: sample a single row.
  n = 1 if n.nil?
  _from_rbdf(_df.sample_n(n, with_replacement, shuffle, seed))
end
1085
2966
 
1086
2967
  # def fold
1087
2968
  # end
1088
2969
 
1089
- # def row
1090
- # end
2970
+ # Get a row as tuple, either by index or by predicate.
2971
+ #
2972
+ # @param index [Object]
2973
+ # Row index.
2974
+ # @param by_predicate [Object]
2975
+ # Select the row according to a given expression/predicate.
2976
+ #
2977
+ # @return [Object]
2978
+ #
2979
+ # @note
2980
+ # The `index` and `by_predicate` params are mutually exclusive. Additionally,
2981
+ # to ensure clarity, the `by_predicate` parameter must be supplied by keyword.
2982
+ #
2983
+ # When using `by_predicate` it is an error condition if anything other than
2984
+ # one row is returned; more than one row raises `TooManyRowsReturned`, and
2985
+ # zero rows will raise `NoRowsReturned` (both inherit from `RowsException`).
2986
+ #
2987
+ # @example Return the row at the given index
2988
+ # df = Polars::DataFrame.new(
2989
+ # {
2990
+ # "foo" => [1, 2, 3],
2991
+ # "bar" => [6, 7, 8],
2992
+ # "ham" => ["a", "b", "c"]
2993
+ # }
2994
+ # )
2995
+ # df.row(2)
2996
+ # # => [3, 8, "c"]
2997
+ #
2998
+ # @example Return the row that matches the given predicate
2999
+ # df.row(by_predicate: Polars.col("ham") == "b")
3000
+ # # => [2, 7, "b"]
3001
def row(index = nil, by_predicate: nil)
  if !index.nil? && !by_predicate.nil?
    raise ArgumentError, "Cannot set both 'index' and 'by_predicate'; mutually exclusive"
  elsif index.is_a?(Expr)
    # An expression passed positionally is almost certainly a misplaced predicate.
    raise TypeError, "Expressions should be passed to the 'by_predicate' param"
  elsif index.is_a?(Integer)
    _df.row_tuple(index)
  elsif by_predicate.is_a?(Expr)
    # The predicate must select exactly one row.
    rows = filter(by_predicate).rows
    n_rows = rows.length
    if n_rows > 1
      raise TooManyRowsReturned, "Predicate #{by_predicate} returned #{n_rows} rows"
    elsif n_rows == 0
      # Interpolate the predicate; the message previously carried a literal
      # Python-style "{by_predicate!s}" placeholder.
      raise NoRowsReturned, "Predicate <#{by_predicate}> returned no rows"
    end
    rows[0]
  else
    raise ArgumentError, "One of 'index' or 'by_predicate' must be set"
  end
end
1091
3021
 
1092
- # def rows
1093
- # end
3022
+ # Convert columnar data to rows as Ruby arrays.
3023
+ #
3024
+ # @return [Array]
3025
+ #
3026
+ # @example
3027
+ # df = Polars::DataFrame.new(
3028
+ # {
3029
+ # "a" => [1, 3, 5],
3030
+ # "b" => [2, 4, 6]
3031
+ # }
3032
+ # )
3033
+ # df.rows
3034
+ # # => [[1, 2], [3, 4], [5, 6]]
3035
def rows
  # Delegate to the underlying frame's `row_tuples`.
  frame = _df
  frame.row_tuples
end
1094
3038
 
1095
- # def shrink_to_fit
1096
- # end
3039
+ # Shrink DataFrame memory usage.
3040
+ #
3041
+ # Shrinks to fit the exact capacity needed to hold the data.
3042
+ #
3043
+ # @return [DataFrame]
3044
def shrink_to_fit(in_place: false)
  # Operate on this frame when `in_place` is truthy, otherwise on a clone;
  # either way the frame that was shrunk is the one returned.
  target = in_place ? self : clone
  target._df.shrink_to_fit
  target
end
1097
3054
 
1098
- # def take_every
1099
- # end
3055
+ # Take every nth row in the DataFrame and return as a new DataFrame.
3056
+ #
3057
+ # @return [DataFrame]
3058
+ #
3059
+ # @example
3060
+ # s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]})
3061
+ # s.take_every(2)
3062
+ # # =>
3063
+ # # shape: (2, 2)
3064
+ # # ┌─────┬─────┐
3065
+ # # │ a ┆ b │
3066
+ # # │ --- ┆ --- │
3067
+ # # │ i64 ┆ i64 │
3068
+ # # ╞═════╪═════╡
3069
+ # # │ 1 ┆ 5 │
3070
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
3071
+ # # │ 3 ┆ 7 │
3072
+ # # └─────┴─────┘
3073
def take_every(n)
  # Apply `take_every(n)` to the "*" column selector, then select the result.
  every_nth = Utils.col("*").take_every(n)
  select(every_nth)
end
1100
3076
 
1101
3077
  # def hash_rows
1102
3078
  # end
1103
3079
 
1104
- # def interpolate
1105
- # end
1106
-
3080
+ # Interpolate intermediate values. The interpolation method is linear.
3081
+ #
3082
+ # @return [DataFrame]
3083
+ #
3084
+ # @example
3085
+ # df = Polars::DataFrame.new(
3086
+ # {
3087
+ # "foo" => [1, nil, 9, 10],
3088
+ # "bar" => [6, 7, 9, nil],
3089
+ # "baz" => [1, nil, nil, 9]
3090
+ # }
3091
+ # )
3092
+ # df.interpolate
3093
+ # # =>
3094
+ # # shape: (4, 3)
3095
+ # # ┌─────┬──────┬─────┐
3096
+ # # │ foo ┆ bar ┆ baz │
3097
+ # # │ --- ┆ --- ┆ --- │
3098
+ # # │ i64 ┆ i64 ┆ i64 │
3099
+ # # ╞═════╪══════╪═════╡
3100
+ # # │ 1 ┆ 6 ┆ 1 │
3101
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
3102
+ # # │ 5 ┆ 7 ┆ 3 │
3103
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
3104
+ # # │ 9 ┆ 9 ┆ 6 │
3105
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
3106
+ # # │ 10 ┆ null ┆ 9 │
3107
+ # # └─────┴──────┴─────┘
3108
def interpolate
  # Apply `interpolate` to the "*" column selector, then select the result.
  interpolated = Utils.col("*").interpolate
  select(interpolated)
end
3111
+
3112
+ # Check if the dataframe is empty.
3113
+ #
3114
+ # @return [Boolean]
1107
3115
  #
3116
+ # @example
3117
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
3118
+ # df.is_empty
3119
+ # # => false
3120
+ # df.filter(Polars.col("foo") > 99).is_empty
3121
+ # # => true
1108
3122
def is_empty
  # The frame is empty when its height is zero.
  height.zero?
end
1111
3125
  alias_method :empty?, :is_empty
1112
3126
 
1113
- # def to_struct(name)
1114
- # end
3127
+ # Convert a `DataFrame` to a `Series` of type `Struct`.
3128
+ #
3129
+ # @param name [String]
3130
+ # Name for the struct Series
3131
+ #
3132
+ # @return [Series]
3133
+ #
3134
+ # @example
3135
+ # df = Polars::DataFrame.new(
3136
+ # {
3137
+ # "a" => [1, 2, 3, 4, 5],
3138
+ # "b" => ["one", "two", "three", "four", "five"]
3139
+ # }
3140
+ # )
3141
+ # df.to_struct("nums")
3142
+ # # =>
3143
+ # # shape: (5,)
3144
+ # # Series: 'nums' [struct[2]]
3145
+ # # [
3146
+ # # {1,"one"}
3147
+ # # {2,"two"}
3148
+ # # {3,"three"}
3149
+ # # {4,"four"}
3150
+ # # {5,"five"}
3151
+ # # ]
3152
def to_struct(name)
  # Build the struct on the underlying frame, then wrap it with Utils.wrap_s.
  struct_series = _df.to_struct(name)
  Utils.wrap_s(struct_series)
end
1115
3155
 
1116
- # def unnest
1117
- # end
3156
+ # Decompose a struct into its fields.
3157
+ #
3158
+ # The fields will be inserted into the `DataFrame` on the location of the
3159
+ # `struct` type.
3160
+ #
3161
+ # @param names [Object]
3162
+ # Names of the struct columns that will be decomposed by its fields
3163
+ #
3164
+ # @return [DataFrame]
3165
+ #
3166
+ # @example
3167
+ # df = Polars::DataFrame.new(
3168
+ # {
3169
+ # "before" => ["foo", "bar"],
3170
+ # "t_a" => [1, 2],
3171
+ # "t_b" => ["a", "b"],
3172
+ # "t_c" => [true, nil],
3173
+ # "t_d" => [[1, 2], [3]],
3174
+ # "after" => ["baz", "womp"]
3175
+ # }
3176
+ # ).select(["before", Polars.struct(Polars.col("^t_.$")).alias("t_struct"), "after"])
3177
+ # df.unnest("t_struct")
3178
+ # # =>
3179
+ # # shape: (2, 6)
3180
+ # # ┌────────┬─────┬─────┬──────┬───────────┬───────┐
3181
+ # # │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │
3182
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3183
+ # # │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │
3184
+ # # ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
3185
+ # # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │
3186
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
3187
+ # # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
3188
+ # # └────────┴─────┴─────┴──────┴───────────┴───────┘
3189
def unnest(names)
  # A single column name is shorthand for a one-element list.
  names = [names] if names.is_a?(String)
  _from_rbdf(_df.unnest(names))
end
1118
3195
 
1119
3196
  private
1120
3197
 
@@ -1127,7 +3204,7 @@ module Polars
1127
3204
  if !columns.nil?
1128
3205
  columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)
1129
3206
 
1130
- if !data && dtypes
3207
+ if data.empty? && dtypes
1131
3208
  data_series = columns.map { |name| Series.new(name, [], dtype: dtypes[name])._s }
1132
3209
  else
1133
3210
  data_series = data.map { |name, values| Series.new(name, values, dtype: dtypes[name])._s }
@@ -1147,7 +3224,7 @@ module Polars
1147
3224
  if columns.nil?
1148
3225
  data
1149
3226
  else
1150
- if !data
3227
+ if data.empty?
1151
3228
  columns.map { |c| Series.new(c, nil)._s }
1152
3229
  elsif data.length == columns.length
1153
3230
  columns.each_with_index do |c, i|
@@ -1182,5 +3259,75 @@ module Polars
1182
3259
def _from_rbdf(rb_df)
  # Instance-level shortcut for the class-level `_from_rbdf`.
  klass = self.class
  klass._from_rbdf(rb_df)
end
3262
+
3263
# Dispatch a comparison to the DataFrame or the non-DataFrame implementation,
# depending on the type of `other`.
def _comp(other, op)
  return _compare_to_other_df(other, op) if other.is_a?(DataFrame)

  _compare_to_non_df(other, op)
end
3270
+
3271
# Compare this frame with another DataFrame, column by column.
#
# `op` is one of "eq", "neq", "gt", "lt", "gt_eq", "lt_eq". Raises
# ArgumentError when the column names or shapes differ, or when `op` is not
# one of the supported names. Returns the result of selecting one comparison
# expression per column.
def _compare_to_other_df(other, op)
  # These raised NameError before because of a misspelled `ArgmentError`.
  if columns != other.columns
    raise ArgumentError, "DataFrame columns do not match"
  end
  if shape != other.shape
    raise ArgumentError, "DataFrame dimensions do not match"
  end

  # Put both frames side by side, with the other frame's columns renamed by
  # a suffix so each column can be compared against its counterpart.
  suffix = "__POLARS_CMP_OTHER"
  other_renamed = other.select(Polars.all.suffix(suffix))
  combined = Polars.concat([self, other_renamed], how: "horizontal")

  operator =
    case op
    when "eq" then :==
    when "neq" then :!=
    when "gt" then :>
    when "lt" then :<
    when "gt_eq" then :>=
    when "lt_eq" then :<=
    else
      raise ArgumentError, "got unexpected comparison operator: #{op}"
    end

  expr = columns.map do |n|
    Polars.col(n).public_send(operator, Polars.col("#{n}#{suffix}"))
  end

  combined.select(expr)
end
3302
+
3303
# Compare every column against a non-DataFrame value.
#
# `op` is one of "eq", "neq", "gt", "lt", "gt_eq", "lt_eq"; any other value
# raises ArgumentError.
def _compare_to_non_df(other, op)
  predicate =
    case op
    when "eq" then Polars.all == other
    when "neq" then Polars.all != other
    when "gt" then Polars.all > other
    when "lt" then Polars.all < other
    when "gt_eq" then Polars.all >= other
    when "lt_eq" then Polars.all <= other
    else
      raise ArgumentError, "got unexpected comparison operator: #{op}"
    end

  select(predicate)
end
3321
+
3322
# Coerce an operand to a Series: a Series is returned unchanged, an Array is
# rejected with ArgumentError, and any other value is wrapped in a
# single-element Series with an empty name.
def _prepare_other_arg(other)
  return other if other.is_a?(Series)
  raise ArgumentError, "Operation not supported." if other.is_a?(Array)

  Series.new("", [other])
end
1185
3332
  end
1186
3333
  end