polars-df 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -155,12 +155,35 @@ module Polars
155
155
  end
156
156
 
157
157
  # @private
158
- def self._read_parquet(file)
158
+ def self._read_parquet(
159
+ file,
160
+ columns: nil,
161
+ n_rows: nil,
162
+ parallel: "auto",
163
+ row_count_name: nil,
164
+ row_count_offset: 0,
165
+ low_memory: false
166
+ )
159
167
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
160
168
  file = Utils.format_path(file)
161
169
  end
162
170
 
163
- _from_rbdf(RbDataFrame.read_parquet(file))
171
+ if file.is_a?(String) && file.include?("*")
172
+ raise Todo
173
+ end
174
+
175
+ projection, columns = Utils.handle_projection_columns(columns)
176
+ _from_rbdf(
177
+ RbDataFrame.read_parquet(
178
+ file,
179
+ columns,
180
+ projection,
181
+ n_rows,
182
+ parallel,
183
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
184
+ low_memory
185
+ )
186
+ )
164
187
  end
165
188
 
166
189
  # def self._read_avro
@@ -259,11 +282,13 @@ module Polars
259
282
  # @return [Array]
260
283
  #
261
284
  # @example
262
- # df = Polars::DataFrame.new({
263
- # "foo" => [1, 2, 3],
264
- # "bar" => [6, 7, 8],
265
- # "ham" => ["a", "b", "c"]
266
- # })
285
+ # df = Polars::DataFrame.new(
286
+ # {
287
+ # "foo" => [1, 2, 3],
288
+ # "bar" => [6, 7, 8],
289
+ # "ham" => ["a", "b", "c"]
290
+ # }
291
+ # )
267
292
  # df.columns
268
293
  # # => ["foo", "bar", "ham"]
269
294
  def columns
@@ -279,11 +304,13 @@ module Polars
279
304
  # @return [Object]
280
305
  #
281
306
  # @example
282
- # df = Polars::DataFrame.new({
283
- # "foo" => [1, 2, 3],
284
- # "bar" => [6, 7, 8],
285
- # "ham" => ["a", "b", "c"]
286
- # })
307
+ # df = Polars::DataFrame.new(
308
+ # {
309
+ # "foo" => [1, 2, 3],
310
+ # "bar" => [6, 7, 8],
311
+ # "ham" => ["a", "b", "c"]
312
+ # }
313
+ # )
287
314
  # df.columns = ["apple", "banana", "orange"]
288
315
  # df
289
316
  # # =>
@@ -308,11 +335,13 @@ module Polars
308
335
  # @return [Array]
309
336
  #
310
337
  # @example
311
- # df = Polars::DataFrame.new({
312
- # "foo" => [1, 2, 3],
313
- # "bar" => [6.0, 7.0, 8.0],
314
- # "ham" => ["a", "b", "c"]
315
- # })
338
+ # df = Polars::DataFrame.new(
339
+ # {
340
+ # "foo" => [1, 2, 3],
341
+ # "bar" => [6.0, 7.0, 8.0],
342
+ # "ham" => ["a", "b", "c"]
343
+ # }
344
+ # )
316
345
  # df.dtypes
317
346
  # # => [:i64, :f64, :str]
318
347
  def dtypes
@@ -324,56 +353,132 @@ module Polars
324
353
  # @return [Hash]
325
354
  #
326
355
  # @example
327
- # df = Polars::DataFrame.new({
328
- # "foo" => [1, 2, 3],
329
- # "bar" => [6.0, 7.0, 8.0],
330
- # "ham" => ["a", "b", "c"]
331
- # })
356
+ # df = Polars::DataFrame.new(
357
+ # {
358
+ # "foo" => [1, 2, 3],
359
+ # "bar" => [6.0, 7.0, 8.0],
360
+ # "ham" => ["a", "b", "c"]
361
+ # }
362
+ # )
332
363
  # df.schema
333
364
  # # => {"foo"=>:i64, "bar"=>:f64, "ham"=>:str}
334
365
  def schema
335
366
  columns.zip(dtypes).to_h
336
367
  end
337
368
 
338
- # def ==(other)
339
- # end
369
+ # Equal.
370
+ #
371
+ # @return [DataFrame]
372
+ def ==(other)
373
+ _comp(other, "eq")
374
+ end
340
375
 
341
- # def !=(other)
342
- # end
376
+ # Not equal.
377
+ #
378
+ # @return [DataFrame]
379
+ def !=(other)
380
+ _comp(other, "neq")
381
+ end
343
382
 
344
- # def >(other)
345
- # end
383
+ # Greater than.
384
+ #
385
+ # @return [DataFrame]
386
+ def >(other)
387
+ _comp(other, "gt")
388
+ end
346
389
 
347
- # def <(other)
348
- # end
390
+ # Less than.
391
+ #
392
+ # @return [DataFrame]
393
+ def <(other)
394
+ _comp(other, "lt")
395
+ end
349
396
 
350
- # def >=(other)
351
- # end
397
+ # Greater than or equal.
398
+ #
399
+ # @return [DataFrame]
400
+ def >=(other)
401
+ _comp(other, "gt_eq")
402
+ end
352
403
 
353
- # def <=(other)
354
- # end
404
+ # Less than or equal.
405
+ #
406
+ # @return [DataFrame]
407
+ def <=(other)
408
+ _comp(other, "lt_eq")
409
+ end
355
410
 
356
- # def *(other)
357
- # end
411
+ # Performs multiplication.
412
+ #
413
+ # @return [DataFrame]
414
+ def *(other)
415
+ if other.is_a?(DataFrame)
416
+ return _from_rbdf(_df.mul_df(other._df))
417
+ end
358
418
 
359
- # def /(other)
360
- # end
419
+ other = _prepare_other_arg(other)
420
+ _from_rbdf(_df.mul(other._s))
421
+ end
361
422
 
362
- # def +(other)
363
- # end
423
+ # Performs division.
424
+ #
425
+ # @return [DataFrame]
426
+ def /(other)
427
+ if other.is_a?(DataFrame)
428
+ return _from_rbdf(_df.div_df(other._df))
429
+ end
364
430
 
365
- # def -(other)
366
- # end
431
+ other = _prepare_other_arg(other)
432
+ _from_rbdf(_df.div(other._s))
433
+ end
367
434
 
368
- # def %(other)
369
- # end
435
+ # Performs addition.
436
+ #
437
+ # @return [DataFrame]
438
+ def +(other)
439
+ if other.is_a?(DataFrame)
440
+ return _from_rbdf(_df.add_df(other._df))
441
+ end
442
+
443
+ other = _prepare_other_arg(other)
444
+ _from_rbdf(_df.add(other._s))
445
+ end
446
+
447
+ # Performs subtraction.
448
+ #
449
+ # @return [DataFrame]
450
+ def -(other)
451
+ if other.is_a?(DataFrame)
452
+ return _from_rbdf(_df.sub_df(other._df))
453
+ end
454
+
455
+ other = _prepare_other_arg(other)
456
+ _from_rbdf(_df.sub(other._s))
457
+ end
458
+
459
+ # Returns the modulo.
460
+ #
461
+ # @return [DataFrame]
462
+ def %(other)
463
+ if other.is_a?(DataFrame)
464
+ return _from_rbdf(_df.rem_df(other._df))
465
+ end
466
+
467
+ other = _prepare_other_arg(other)
468
+ _from_rbdf(_df.rem(other._s))
469
+ end
370
470
 
471
+ # Returns a string representing the DataFrame.
371
472
  #
473
+ # @return [String]
372
474
  def to_s
373
475
  _df.to_s
374
476
  end
375
477
  alias_method :inspect, :to_s
376
478
 
479
+ # Check if DataFrame includes column.
480
+ #
481
+ # @return [Boolean]
377
482
  def include?(name)
378
483
  columns.include?(name)
379
484
  end
@@ -387,9 +492,78 @@ module Polars
387
492
  # def _pos_idxs
388
493
  # end
389
494
 
495
+ # Returns subset of the DataFrame.
390
496
  #
391
- def [](name)
392
- Utils.wrap_s(_df.column(name))
497
+ # @return [Object]
498
+ def [](*args)
499
+ if args.size == 2
500
+ row_selection, col_selection = args
501
+
502
+ # df[.., unknown]
503
+ if row_selection.is_a?(Range)
504
+
505
+ # multiple slices
506
+ # df[.., ..]
507
+ if col_selection.is_a?(Range)
508
+ raise Todo
509
+ end
510
+ end
511
+
512
+ # df[2, ..] (select row as df)
513
+ if row_selection.is_a?(Integer)
514
+ if col_selection.is_a?(Array)
515
+ df = self[0.., col_selection]
516
+ return df.slice(row_selection, 1)
517
+ end
518
+ # df[2, "a"]
519
+ if col_selection.is_a?(String)
520
+ return self[col_selection][row_selection]
521
+ end
522
+ end
523
+
524
+ # column selection can be "a" or ["a", "b"]
525
+ if col_selection.is_a?(String)
526
+ col_selection = [col_selection]
527
+ end
528
+
529
+ # df[.., 1]
530
+ if col_selection.is_a?(Integer)
531
+ series = to_series(col_selection)
532
+ return series[row_selection]
533
+ end
534
+
535
+ if col_selection.is_a?(Array)
536
+ # df[.., [1, 2]]
537
+ if is_int_sequence(col_selection)
538
+ series_list = col_selection.map { |i| to_series(i) }
539
+ df = self.class.new(series_list)
540
+ return df[row_selection]
541
+ end
542
+ end
543
+
544
+ df = self[col_selection]
545
+ return df[row_selection]
546
+ elsif args.size == 1
547
+ item = args[0]
548
+
549
+ # select single column
550
+ # df["foo"]
551
+ if item.is_a?(String)
552
+ return Utils.wrap_s(_df.column(item))
553
+ end
554
+
555
+ # df[idx]
556
+ if item.is_a?(Integer)
557
+ return slice(_pos_idx(item, dim: 0), 1)
558
+ end
559
+
560
+ # df[..]
561
+ if item.is_a?(Range)
562
+ return Slice.new(self).apply(item)
563
+ end
564
+ end
565
+
566
+ raise ArgumentError, "Cannot get item of type: #{item.class.name}"
393
567
  end
394
568
 
395
569
  # def []=(key, value)
@@ -397,7 +571,9 @@ module Polars
397
571
 
398
572
  # no to_arrow
399
573
 
574
+ # Convert DataFrame to a hash mapping column name to values.
400
575
  #
576
+ # @return [Hash]
401
577
  def to_h(as_series: true)
402
578
  if as_series
403
579
  get_columns.to_h { |s| [s.name, s] }
@@ -422,11 +598,13 @@ module Polars
422
598
  # @return [Series]
423
599
  #
424
600
  # @example
425
- # df = Polars::DataFrame.new({
426
- # "foo" => [1, 2, 3],
427
- # "bar" => [6, 7, 8],
428
- # "ham" => ["a", "b", "c"]
429
- # })
601
+ # df = Polars::DataFrame.new(
602
+ # {
603
+ # "foo" => [1, 2, 3],
604
+ # "bar" => [6, 7, 8],
605
+ # "ham" => ["a", "b", "c"]
606
+ # }
607
+ # )
430
608
  # df.to_series(1)
431
609
  # # =>
432
610
  # # shape: (3,)
@@ -519,11 +697,13 @@ module Polars
519
697
  # @return [String, nil]
520
698
  #
521
699
  # @example
522
- # df = Polars::DataFrame.new({
523
- # "foo" => [1, 2, 3, 4, 5],
524
- # "bar" => [6, 7, 8, 9, 10],
525
- # "ham" => ["a", "b", "c", "d", "e"]
526
- # })
700
+ # df = Polars::DataFrame.new(
701
+ # {
702
+ # "foo" => [1, 2, 3, 4, 5],
703
+ # "bar" => [6, 7, 8, 9, 10],
704
+ # "ham" => ["a", "b", "c", "d", "e"]
705
+ # }
706
+ # )
527
707
  # df.write_csv("file.csv")
528
708
  def write_csv(
529
709
  file = nil,
@@ -694,10 +874,12 @@ module Polars
694
874
  # @return [DataFrame]
695
875
  #
696
876
  # @example
697
- # df = Polars::DataFrame.new({
698
- # "key" => ["a", "b", "c"],
699
- # "val" => [1, 2, 3]
700
- # })
877
+ # df = Polars::DataFrame.new(
878
+ # {
879
+ # "key" => ["a", "b", "c"],
880
+ # "val" => [1, 2, 3]
881
+ # }
882
+ # )
701
883
  # df.reverse()
702
884
  # # =>
703
885
  # # shape: (3, 2)
@@ -724,11 +906,13 @@ module Polars
724
906
  # @return [DataFrame]
725
907
  #
726
908
  # @example
727
- # df = Polars::DataFrame.new({
728
- # "foo" => [1, 2, 3],
729
- # "bar" => [6, 7, 8],
730
- # "ham" => ["a", "b", "c"]
731
- # })
909
+ # df = Polars::DataFrame.new(
910
+ # {
911
+ # "foo" => [1, 2, 3],
912
+ # "bar" => [6, 7, 8],
913
+ # "ham" => ["a", "b", "c"]
914
+ # }
915
+ # )
732
916
  # df.rename({"foo" => "apple"})
733
917
  # # =>
734
918
  # # shape: (3, 3)
@@ -775,11 +959,13 @@ module Polars
775
959
  # # └─────┴─────┴─────┘
776
960
  #
777
961
  # @example
778
- # df = Polars::DataFrame.new({
779
- # "a" => [1, 2, 3, 4],
780
- # "b" => [0.5, 4, 10, 13],
781
- # "c" => [true, true, false, true]
782
- # })
962
+ # df = Polars::DataFrame.new(
963
+ # {
964
+ # "a" => [1, 2, 3, 4],
965
+ # "b" => [0.5, 4, 10, 13],
966
+ # "c" => [true, true, false, true]
967
+ # }
968
+ # )
783
969
  # s = Polars::Series.new("d", [-2.5, 15, 20.5, 0])
784
970
  # df.insert_at_idx(3, s)
785
971
  # # =>
@@ -805,63 +991,560 @@ module Polars
805
991
  self
806
992
  end
807
993
 
994
+ # Filter the rows in the DataFrame based on a predicate expression.
995
+ #
996
+ # @param predicate [Expr]
997
+ # Expression that evaluates to a boolean Series.
998
+ #
999
+ # @return [DataFrame]
1000
+ #
1001
+ # @example Filter on one condition:
1002
+ # df = Polars::DataFrame.new(
1003
+ # {
1004
+ # "foo" => [1, 2, 3],
1005
+ # "bar" => [6, 7, 8],
1006
+ # "ham" => ["a", "b", "c"]
1007
+ # }
1008
+ # )
1009
+ # df.filter(Polars.col("foo") < 3)
1010
+ # # =>
1011
+ # # shape: (2, 3)
1012
+ # # ┌─────┬─────┬─────┐
1013
+ # # │ foo ┆ bar ┆ ham │
1014
+ # # │ --- ┆ --- ┆ --- │
1015
+ # # │ i64 ┆ i64 ┆ str │
1016
+ # # ╞═════╪═════╪═════╡
1017
+ # # │ 1 ┆ 6 ┆ a │
1018
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1019
+ # # │ 2 ┆ 7 ┆ b │
1020
+ # # └─────┴─────┴─────┘
1021
+ #
1022
+ # @example Filter on multiple conditions:
1023
+ # df.filter((Polars.col("foo") < 3) & (Polars.col("ham") == "a"))
1024
+ # # =>
1025
+ # # shape: (1, 3)
1026
+ # # ┌─────┬─────┬─────┐
1027
+ # # │ foo ┆ bar ┆ ham │
1028
+ # # │ --- ┆ --- ┆ --- │
1029
+ # # │ i64 ┆ i64 ┆ str │
1030
+ # # ╞═════╪═════╪═════╡
1031
+ # # │ 1 ┆ 6 ┆ a │
1032
+ # # └─────┴─────┴─────┘
808
1033
  def filter(predicate)
809
1034
  lazy.filter(predicate).collect
810
1035
  end
811
1036
 
812
- # def describe
813
- # end
1037
+ # Summary statistics for a DataFrame.
1038
+ #
1039
+ # @return [DataFrame]
1040
+ #
1041
+ # @example
1042
+ # df = Polars::DataFrame.new(
1043
+ # {
1044
+ # "a" => [1.0, 2.8, 3.0],
1045
+ # "b" => [4, 5, nil],
1046
+ # "c" => [true, false, true],
1047
+ # "d" => [nil, "b", "c"],
1048
+ # "e" => ["usd", "eur", nil]
1049
+ # }
1050
+ # )
1051
+ # df.describe
1052
+ # # =>
1053
+ # # shape: (7, 6)
1054
+ # # ┌────────────┬──────────┬──────────┬──────┬──────┬──────┐
1055
+ # # │ describe ┆ a ┆ b ┆ c ┆ d ┆ e │
1056
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1057
+ # # │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str │
1058
+ # # ╞════════════╪══════════╪══════════╪══════╪══════╪══════╡
1059
+ # # │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 │
1060
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1061
+ # # │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 │
1062
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1063
+ # # │ mean ┆ 2.266667 ┆ 4.5 ┆ null ┆ null ┆ null │
1064
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1065
+ # # │ std ┆ 1.101514 ┆ 0.707107 ┆ null ┆ null ┆ null │
1066
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1067
+ # # │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur │
1068
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1069
+ # # │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd │
1070
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1071
+ # # │ median ┆ 2.8 ┆ 4.5 ┆ null ┆ null ┆ null │
1072
+ # # └────────────┴──────────┴──────────┴──────┴──────┴──────┘
1073
+ def describe
1074
+ describe_cast = lambda do |stat|
1075
+ columns = []
1076
+ self.columns.each_with_index do |s, i|
1077
+ if self[s].is_numeric || self[s].is_boolean
1078
+ columns << stat[0.., i].cast(:f64)
1079
+ else
1080
+ # for dates, strings, etc, we cast to string so that all
1081
+ # statistics can be shown
1082
+ columns << stat[0.., i].cast(:str)
1083
+ end
1084
+ end
1085
+ self.class.new(columns)
1086
+ end
814
1087
 
815
- # def find_idx_by_name
816
- # end
1088
+ summary = _from_rbdf(
1089
+ Polars.concat(
1090
+ [
1091
+ describe_cast.(
1092
+ self.class.new(columns.to_h { |c| [c, [height]] })
1093
+ ),
1094
+ describe_cast.(null_count),
1095
+ describe_cast.(mean),
1096
+ describe_cast.(std),
1097
+ describe_cast.(min),
1098
+ describe_cast.(max),
1099
+ describe_cast.(median)
1100
+ ]
1101
+ )._df
1102
+ )
1103
+ summary.insert_at_idx(
1104
+ 0,
1105
+ Polars::Series.new(
1106
+ "describe",
1107
+ ["count", "null_count", "mean", "std", "min", "max", "median"],
1108
+ )
1109
+ )
1110
+ summary
1111
+ end
817
1112
 
818
- # def replace_at_idx
819
- # end
1113
+ # Find the index of a column by name.
1114
+ #
1115
+ # @param name [String]
1116
+ # Name of the column to find.
1117
+ #
1118
+ # @return [Series]
1119
+ #
1120
+ # @example
1121
+ # df = Polars::DataFrame.new(
1122
+ # {"foo" => [1, 2, 3], "bar" => [6, 7, 8], "ham" => ["a", "b", "c"]}
1123
+ # )
1124
+ # df.find_idx_by_name("ham")
1125
+ # # => 2
1126
+ def find_idx_by_name(name)
1127
+ _df.find_idx_by_name(name)
1128
+ end
1129
+
1130
+ # Replace a column at an index location.
1131
+ #
1132
+ # @param index [Integer]
1133
+ # Column index.
1134
+ # @param series [Series]
1135
+ # Series that will replace the column.
1136
+ #
1137
+ # @return [DataFrame]
1138
+ #
1139
+ # @example
1140
+ # df = Polars::DataFrame.new(
1141
+ # {
1142
+ # "foo" => [1, 2, 3],
1143
+ # "bar" => [6, 7, 8],
1144
+ # "ham" => ["a", "b", "c"]
1145
+ # }
1146
+ # )
1147
+ # s = Polars::Series.new("apple", [10, 20, 30])
1148
+ # df.replace_at_idx(0, s)
1149
+ # # =>
1150
+ # # shape: (3, 3)
1151
+ # # ┌───────┬─────┬─────┐
1152
+ # # │ apple ┆ bar ┆ ham │
1153
+ # # │ --- ┆ --- ┆ --- │
1154
+ # # │ i64 ┆ i64 ┆ str │
1155
+ # # ╞═══════╪═════╪═════╡
1156
+ # # │ 10 ┆ 6 ┆ a │
1157
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1158
+ # # │ 20 ┆ 7 ┆ b │
1159
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1160
+ # # │ 30 ┆ 8 ┆ c │
1161
+ # # └───────┴─────┴─────┘
1162
+ def replace_at_idx(index, series)
1163
+ if index < 0
1164
+ index = columns.length + index
1165
+ end
1166
+ _df.replace_at_idx(index, series._s)
1167
+ self
1168
+ end
820
1169
 
1170
+ # Sort the DataFrame by column.
1171
+ #
1172
+ # @param by [String]
1173
+ # By which column to sort.
1174
+ # @param reverse [Boolean]
1175
+ # Reverse/descending sort.
1176
+ # @param nulls_last [Boolean]
1177
+ # Place null values last. Can only be used if sorted by a single column.
1178
+ #
1179
+ # @return [DataFrame]
1180
+ #
1181
+ # @example
1182
+ # df = Polars::DataFrame.new(
1183
+ # {
1184
+ # "foo" => [1, 2, 3],
1185
+ # "bar" => [6.0, 7.0, 8.0],
1186
+ # "ham" => ["a", "b", "c"]
1187
+ # }
1188
+ # )
1189
+ # df.sort("foo", reverse: true)
1190
+ # # =>
1191
+ # # shape: (3, 3)
1192
+ # # ┌─────┬─────┬─────┐
1193
+ # # │ foo ┆ bar ┆ ham │
1194
+ # # │ --- ┆ --- ┆ --- │
1195
+ # # │ i64 ┆ f64 ┆ str │
1196
+ # # ╞═════╪═════╪═════╡
1197
+ # # │ 3 ┆ 8.0 ┆ c │
1198
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1199
+ # # │ 2 ┆ 7.0 ┆ b │
1200
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1201
+ # # │ 1 ┆ 6.0 ┆ a │
1202
+ # # └─────┴─────┴─────┘
821
1203
  #
1204
+ # @example Sort by multiple columns.
1205
+ # df.sort(
1206
+ # [Polars.col("foo"), Polars.col("bar")**2],
1207
+ # reverse: [true, false]
1208
+ # )
1209
+ # # =>
1210
+ # # shape: (3, 3)
1211
+ # # ┌─────┬─────┬─────┐
1212
+ # # │ foo ┆ bar ┆ ham │
1213
+ # # │ --- ┆ --- ┆ --- │
1214
+ # # │ i64 ┆ f64 ┆ str │
1215
+ # # ╞═════╪═════╪═════╡
1216
+ # # │ 3 ┆ 8.0 ┆ c │
1217
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1218
+ # # │ 2 ┆ 7.0 ┆ b │
1219
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1220
+ # # │ 1 ┆ 6.0 ┆ a │
1221
+ # # └─────┴─────┴─────┘
822
1222
  def sort(by, reverse: false, nulls_last: false)
823
- _from_rbdf(_df.sort(by, reverse, nulls_last))
1223
+ if by.is_a?(Array) || by.is_a?(Expr)
1224
+ lazy
1225
+ .sort(by, reverse: reverse, nulls_last: nulls_last)
1226
+ .collect(no_optimization: true, string_cache: false)
1227
+ else
1228
+ _from_rbdf(_df.sort(by, reverse, nulls_last))
1229
+ end
824
1230
  end
825
1231
 
1232
+ # Check if DataFrame is equal to other.
1233
+ #
1234
+ # @param other [DataFrame]
1235
+ # DataFrame to compare with.
1236
+ # @param null_equal [Boolean]
1237
+ # Consider null values as equal.
1238
+ #
1239
+ # @return [Boolean]
1240
+ #
1241
+ # @example
1242
+ # df1 = Polars::DataFrame.new(
1243
+ # {
1244
+ # "foo" => [1, 2, 3],
1245
+ # "bar" => [6.0, 7.0, 8.0],
1246
+ # "ham" => ["a", "b", "c"]
1247
+ # }
1248
+ # )
1249
+ # df2 = Polars::DataFrame.new(
1250
+ # {
1251
+ # "foo" => [3, 2, 1],
1252
+ # "bar" => [8.0, 7.0, 6.0],
1253
+ # "ham" => ["c", "b", "a"]
1254
+ # }
1255
+ # )
1256
+ # df1.frame_equal(df1)
1257
+ # # => true
1258
+ # df1.frame_equal(df2)
1259
+ # # => false
826
1260
  def frame_equal(other, null_equal: true)
827
1261
  _df.frame_equal(other._df, null_equal)
828
1262
  end
829
1263
 
830
- # def replace
831
- # end
832
-
1264
+ # Replace a column by a new Series.
833
1265
  #
834
- def slice(offset, length = nil)
835
- if !length.nil? && length < 0
836
- length = height - offset + length
837
- end
838
- _from_rbdf(_df.slice(offset, length))
1266
+ # @param column [String]
1267
+ # Column to replace.
1268
+ # @param new_col [Series]
1269
+ # New column to insert.
1270
+ #
1271
+ # @return [DataFrame]
1272
+ #
1273
+ # @example
1274
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
1275
+ # s = Polars::Series.new([10, 20, 30])
1276
+ # df.replace("foo", s)
1277
+ # # =>
1278
+ # # shape: (3, 2)
1279
+ # # ┌─────┬─────┐
1280
+ # # │ foo ┆ bar │
1281
+ # # │ --- ┆ --- │
1282
+ # # │ i64 ┆ i64 │
1283
+ # # ╞═════╪═════╡
1284
+ # # │ 10 ┆ 4 │
1285
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1286
+ # # │ 20 ┆ 5 │
1287
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1288
+ # # │ 30 ┆ 6 │
1289
+ # # └─────┴─────┘
1290
+ def replace(column, new_col)
1291
+ _df.replace(column, new_col._s)
1292
+ self
839
1293
  end
840
1294
 
1295
+ # Get a slice of this DataFrame.
1296
+ #
1297
+ # @param offset [Integer]
1298
+ # Start index. Negative indexing is supported.
1299
+ # @param length [Integer, nil]
1300
+ # Length of the slice. If set to `nil`, all rows starting at the offset
1301
+ # will be selected.
1302
+ #
1303
+ # @return [DataFrame]
1304
+ #
1305
+ # @example
1306
+ # df = Polars::DataFrame.new(
1307
+ # {
1308
+ # "foo" => [1, 2, 3],
1309
+ # "bar" => [6.0, 7.0, 8.0],
1310
+ # "ham" => ["a", "b", "c"]
1311
+ # }
1312
+ # )
1313
+ # df.slice(1, 2)
1314
+ # # =>
1315
+ # # shape: (2, 3)
1316
+ # # ┌─────┬─────┬─────┐
1317
+ # # │ foo ┆ bar ┆ ham │
1318
+ # # │ --- ┆ --- ┆ --- │
1319
+ # # │ i64 ┆ f64 ┆ str │
1320
+ # # ╞═════╪═════╪═════╡
1321
+ # # │ 2 ┆ 7.0 ┆ b │
1322
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1323
+ # # │ 3 ┆ 8.0 ┆ c │
1324
+ # # └─────┴─────┴─────┘
1325
+ def slice(offset, length = nil)
1326
+ if !length.nil? && length < 0
1327
+ length = height - offset + length
1328
+ end
1329
+ _from_rbdf(_df.slice(offset, length))
1330
+ end
1331
+
1332
+ # Get the first `n` rows.
1333
+ #
1334
+ # Alias for {#head}.
1335
+ #
1336
+ # @param n [Integer]
1337
+ # Number of rows to return.
1338
+ #
1339
+ # @return [DataFrame]
1340
+ #
1341
+ # @example
1342
+ # df = Polars::DataFrame.new(
1343
+ # {"foo" => [1, 2, 3, 4, 5, 6], "bar" => ["a", "b", "c", "d", "e", "f"]}
1344
+ # )
1345
+ # df.limit(4)
1346
+ # # =>
1347
+ # # shape: (4, 2)
1348
+ # # ┌─────┬─────┐
1349
+ # # │ foo ┆ bar │
1350
+ # # │ --- ┆ --- │
1351
+ # # │ i64 ┆ str │
1352
+ # # ╞═════╪═════╡
1353
+ # # │ 1 ┆ a │
1354
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1355
+ # # │ 2 ┆ b │
1356
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1357
+ # # │ 3 ┆ c │
1358
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1359
+ # # │ 4 ┆ d │
1360
+ # # └─────┴─────┘
841
1361
  def limit(n = 5)
842
1362
  head(n)
843
1363
  end
844
1364
 
1365
+ # Get the first `n` rows.
1366
+ #
1367
+ # @param n [Integer]
1368
+ # Number of rows to return.
1369
+ #
1370
+ # @return [DataFrame]
1371
+ #
1372
+ # @example
1373
+ # df = Polars::DataFrame.new(
1374
+ # {
1375
+ # "foo" => [1, 2, 3, 4, 5],
1376
+ # "bar" => [6, 7, 8, 9, 10],
1377
+ # "ham" => ["a", "b", "c", "d", "e"]
1378
+ # }
1379
+ # )
1380
+ # df.head(3)
1381
+ # # =>
1382
+ # # shape: (3, 3)
1383
+ # # ┌─────┬─────┬─────┐
1384
+ # # │ foo ┆ bar ┆ ham │
1385
+ # # │ --- ┆ --- ┆ --- │
1386
+ # # │ i64 ┆ i64 ┆ str │
1387
+ # # ╞═════╪═════╪═════╡
1388
+ # # │ 1 ┆ 6 ┆ a │
1389
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1390
+ # # │ 2 ┆ 7 ┆ b │
1391
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1392
+ # # │ 3 ┆ 8 ┆ c │
1393
+ # # └─────┴─────┴─────┘
845
1394
  def head(n = 5)
846
1395
  _from_rbdf(_df.head(n))
847
1396
  end
848
1397
 
1398
+ # Get the last `n` rows.
1399
+ #
1400
+ # @param n [Integer]
1401
+ # Number of rows to return.
1402
+ #
1403
+ # @return [DataFrame]
1404
+ #
1405
+ # @example
1406
+ # df = Polars::DataFrame.new(
1407
+ # {
1408
+ # "foo" => [1, 2, 3, 4, 5],
1409
+ # "bar" => [6, 7, 8, 9, 10],
1410
+ # "ham" => ["a", "b", "c", "d", "e"]
1411
+ # }
1412
+ # )
1413
+ # df.tail(3)
1414
+ # # =>
1415
+ # # shape: (3, 3)
1416
+ # # ┌─────┬─────┬─────┐
1417
+ # # │ foo ┆ bar ┆ ham │
1418
+ # # │ --- ┆ --- ┆ --- │
1419
+ # # │ i64 ┆ i64 ┆ str │
1420
+ # # ╞═════╪═════╪═════╡
1421
+ # # │ 3 ┆ 8 ┆ c │
1422
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1423
+ # # │ 4 ┆ 9 ┆ d │
1424
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1425
+ # # │ 5 ┆ 10 ┆ e │
1426
+ # # └─────┴─────┴─────┘
849
1427
  def tail(n = 5)
850
1428
  _from_rbdf(_df.tail(n))
851
1429
  end
852
1430
 
853
- # def drop_nulls
854
- # end
1431
+ # Return a new DataFrame where the null values are dropped.
1432
+ #
1433
+ # @param subset [Object]
1434
+ # Subset of column(s) on which `drop_nulls` will be applied.
1435
+ #
1436
+ # @return [DataFrame]
1437
+ #
1438
+ # @example
1439
+ # df = Polars::DataFrame.new(
1440
+ # {
1441
+ # "foo" => [1, 2, 3],
1442
+ # "bar" => [6, nil, 8],
1443
+ # "ham" => ["a", "b", "c"]
1444
+ # }
1445
+ # )
1446
+ # df.drop_nulls
1447
+ # # =>
1448
+ # # shape: (2, 3)
1449
+ # # ┌─────┬─────┬─────┐
1450
+ # # │ foo ┆ bar ┆ ham │
1451
+ # # │ --- ┆ --- ┆ --- │
1452
+ # # │ i64 ┆ i64 ┆ str │
1453
+ # # ╞═════╪═════╪═════╡
1454
+ # # │ 1 ┆ 6 ┆ a │
1455
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1456
+ # # │ 3 ┆ 8 ┆ c │
1457
+ # # └─────┴─────┴─────┘
1458
+ def drop_nulls(subset: nil)
1459
+ if subset.is_a?(String)
1460
+ subset = [subset]
1461
+ end
1462
+ _from_rbdf(_df.drop_nulls(subset))
1463
+ end
855
1464
 
856
1465
  # def pipe
857
1466
  # end
858
1467
 
859
- # def with_row_count
860
- # end
861
-
1468
+ # Add a column at index 0 that counts the rows.
1469
+ #
1470
+ # @param name [String]
1471
+ # Name of the column to add.
1472
+ # @param offset [Integer]
1473
+ # Start the row count at this offset.
1474
+ #
1475
+ # @return [DataFrame]
1476
+ #
1477
+ # @example
1478
+ # df = Polars::DataFrame.new(
1479
+ # {
1480
+ # "a" => [1, 3, 5],
1481
+ # "b" => [2, 4, 6]
1482
+ # }
1483
+ # )
1484
+ # df.with_row_count
1485
+ # # =>
1486
+ # # shape: (3, 3)
1487
+ # # ┌────────┬─────┬─────┐
1488
+ # # │ row_nr ┆ a ┆ b │
1489
+ # # │ --- ┆ --- ┆ --- │
1490
+ # # │ u32 ┆ i64 ┆ i64 │
1491
+ # # ╞════════╪═════╪═════╡
1492
+ # # │ 0 ┆ 1 ┆ 2 │
1493
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1494
+ # # │ 1 ┆ 3 ┆ 4 │
1495
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1496
+ # # │ 2 ┆ 5 ┆ 6 │
1497
+ # # └────────┴─────┴─────┘
1498
+ def with_row_count(name: "row_nr", offset: 0)
1499
+ _from_rbdf(_df.with_row_count(name, offset))
1500
+ end
1501
+
1502
+ # Start a groupby operation.
862
1503
  #
1504
+ # @param by [Object]
1505
+ # Column(s) to group by.
1506
+ # @param maintain_order [Boolean]
1507
+ # Make sure that the order of the groups remain consistent. This is more
1508
+ # expensive than a default groupby. Note that this only works in expression
1509
+ # aggregations.
1510
+ #
1511
+ # @return [GroupBy]
1512
+ #
1513
+ # @example
1514
+ # df = Polars::DataFrame.new(
1515
+ # {
1516
+ # "a" => ["a", "b", "a", "b", "b", "c"],
1517
+ # "b" => [1, 2, 3, 4, 5, 6],
1518
+ # "c" => [6, 5, 4, 3, 2, 1]
1519
+ # }
1520
+ # )
1521
+ # df.groupby("a").agg(Polars.col("b").sum).sort("a")
1522
+ # # =>
1523
+ # # shape: (3, 2)
1524
+ # # ┌─────┬─────┐
1525
+ # # │ a ┆ b │
1526
+ # # │ --- ┆ --- │
1527
+ # # │ str ┆ i64 │
1528
+ # # ╞═════╪═════╡
1529
+ # # │ a ┆ 4 │
1530
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1531
+ # # │ b ┆ 11 │
1532
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1533
+ # # │ c ┆ 6 │
1534
+ # # └─────┴─────┘
863
1535
  def groupby(by, maintain_order: false)
864
- lazy.groupby(by, maintain_order: maintain_order)
1536
+ if !Utils.bool?(maintain_order)
1537
+ raise TypeError, "invalid input for groupby arg `maintain_order`: #{maintain_order}."
1538
+ end
1539
+ if by.is_a?(String)
1540
+ by = [by]
1541
+ end
1542
+ GroupBy.new(
1543
+ _df,
1544
+ by,
1545
+ self.class,
1546
+ maintain_order: maintain_order
1547
+ )
865
1548
  end
866
1549
 
867
1550
  # def groupby_rolling
@@ -876,7 +1559,109 @@ module Polars
876
1559
  # def join_asof
877
1560
  # end
878
1561
 
1562
+ # Join in SQL-like fashion.
1563
+ #
1564
+ # @param other [DataFrame]
1565
+ # DataFrame to join with.
1566
+ # @param left_on [Object]
1567
+ # Name(s) of the left join column(s).
1568
+ # @param right_on [Object]
1569
+ # Name(s) of the right join column(s).
1570
+ # @param on [Object]
1571
+ # Name(s) of the join columns in both DataFrames.
1572
+ # @param how ["inner", "left", "outer", "semi", "anti", "cross"]
1573
+ # Join strategy.
1574
+ # @param suffix [String]
1575
+ # Suffix to append to columns with a duplicate name.
1576
+ #
1577
+ # @return [DataFrame]
1578
+ #
1579
+ # @example
1580
+ # df = Polars::DataFrame.new(
1581
+ # {
1582
+ # "foo" => [1, 2, 3],
1583
+ # "bar" => [6.0, 7.0, 8.0],
1584
+ # "ham" => ["a", "b", "c"]
1585
+ # }
1586
+ # )
1587
+ # other_df = Polars::DataFrame.new(
1588
+ # {
1589
+ # "apple" => ["x", "y", "z"],
1590
+ # "ham" => ["a", "b", "d"]
1591
+ # }
1592
+ # )
1593
+ # df.join(other_df, on: "ham")
1594
+ # # =>
1595
+ # # shape: (2, 4)
1596
+ # # ┌─────┬─────┬─────┬───────┐
1597
+ # # │ foo ┆ bar ┆ ham ┆ apple │
1598
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1599
+ # # │ i64 ┆ f64 ┆ str ┆ str │
1600
+ # # ╞═════╪═════╪═════╪═══════╡
1601
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
1602
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1603
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
1604
+ # # └─────┴─────┴─────┴───────┘
1605
+ #
1606
+ # @example
1607
+ # df.join(other_df, on: "ham", how: "outer")
1608
+ # # =>
1609
+ # # shape: (4, 4)
1610
+ # # ┌──────┬──────┬─────┬───────┐
1611
+ # # │ foo ┆ bar ┆ ham ┆ apple │
1612
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1613
+ # # │ i64 ┆ f64 ┆ str ┆ str │
1614
+ # # ╞══════╪══════╪═════╪═══════╡
1615
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
1616
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1617
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
1618
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1619
+ # # │ null ┆ null ┆ d ┆ z │
1620
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1621
+ # # │ 3 ┆ 8.0 ┆ c ┆ null │
1622
+ # # └──────┴──────┴─────┴───────┘
1623
+ #
1624
+ # @example
1625
+ # df.join(other_df, on: "ham", how: "left")
1626
+ # # =>
1627
+ # # shape: (3, 4)
1628
+ # # ┌─────┬─────┬─────┬───────┐
1629
+ # # │ foo ┆ bar ┆ ham ┆ apple │
1630
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1631
+ # # │ i64 ┆ f64 ┆ str ┆ str │
1632
+ # # ╞═════╪═════╪═════╪═══════╡
1633
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
1634
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1635
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
1636
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1637
+ # # │ 3 ┆ 8.0 ┆ c ┆ null │
1638
+ # # └─────┴─────┴─────┴───────┘
879
1639
  #
1640
+ # @example
1641
+ # df.join(other_df, on: "ham", how: "semi")
1642
+ # # =>
1643
+ # # shape: (2, 3)
1644
+ # # ┌─────┬─────┬─────┐
1645
+ # # │ foo ┆ bar ┆ ham │
1646
+ # # │ --- ┆ --- ┆ --- │
1647
+ # # │ i64 ┆ f64 ┆ str │
1648
+ # # ╞═════╪═════╪═════╡
1649
+ # # │ 1 ┆ 6.0 ┆ a │
1650
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1651
+ # # │ 2 ┆ 7.0 ┆ b │
1652
+ # # └─────┴─────┴─────┘
1653
+ #
1654
+ # @example
1655
+ # df.join(other_df, on: "ham", how: "anti")
1656
+ # # =>
1657
+ # # shape: (1, 3)
1658
+ # # ┌─────┬─────┬─────┐
1659
+ # # │ foo ┆ bar ┆ ham │
1660
+ # # │ --- ┆ --- ┆ --- │
1661
+ # # │ i64 ┆ f64 ┆ str │
1662
+ # # ╞═════╪═════╪═════╡
1663
+ # # │ 3 ┆ 8.0 ┆ c │
1664
+ # # └─────┴─────┴─────┘
880
1665
  def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
881
1666
  lazy
882
1667
  .join(
@@ -893,41 +1678,322 @@ module Polars
893
1678
  # def apply
894
1679
  # end
895
1680
 
1681
+ # Return a new DataFrame with the column added or replaced.
1682
+ #
1683
+ # @param column [Object]
1684
+ # Series, where the name of the Series refers to the column in the DataFrame.
1685
+ #
1686
+ # @return [DataFrame]
896
1687
  #
1688
+ # @example Added
1689
+ # df = Polars::DataFrame.new(
1690
+ # {
1691
+ # "a" => [1, 3, 5],
1692
+ # "b" => [2, 4, 6]
1693
+ # }
1694
+ # )
1695
+ # df.with_column((Polars.col("b") ** 2).alias("b_squared"))
1696
+ # # =>
1697
+ # # shape: (3, 3)
1698
+ # # ┌─────┬─────┬───────────┐
1699
+ # # │ a ┆ b ┆ b_squared │
1700
+ # # │ --- ┆ --- ┆ --- │
1701
+ # # │ i64 ┆ i64 ┆ f64 │
1702
+ # # ╞═════╪═════╪═══════════╡
1703
+ # # │ 1 ┆ 2 ┆ 4.0 │
1704
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
1705
+ # # │ 3 ┆ 4 ┆ 16.0 │
1706
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
1707
+ # # │ 5 ┆ 6 ┆ 36.0 │
1708
+ # # └─────┴─────┴───────────┘
1709
+ #
1710
+ # @example Replaced
1711
+ # df.with_column(Polars.col("a") ** 2)
1712
+ # # =>
1713
+ # # shape: (3, 2)
1714
+ # # ┌──────┬─────┐
1715
+ # # │ a ┆ b │
1716
+ # # │ --- ┆ --- │
1717
+ # # │ f64 ┆ i64 │
1718
+ # # ╞══════╪═════╡
1719
+ # # │ 1.0 ┆ 2 │
1720
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
1721
+ # # │ 9.0 ┆ 4 │
1722
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
1723
+ # # │ 25.0 ┆ 6 │
1724
+ # # └──────┴─────┘
897
1725
def with_column(column)
  # Build the single-column lazy query first, then materialize it eagerly
  # with the optimizer and the string cache both switched off.
  query = lazy.with_column(column)
  query.collect(no_optimization: true, string_cache: false)
end
902
1730
 
903
- # def hstack
904
- # end
1731
+ # Return a new DataFrame grown horizontally by stacking multiple Series to it.
1732
+ #
1733
+ # @param columns [Object]
1734
+ # Series to stack.
1735
+ # @param in_place [Boolean]
1736
+ # Modify in place.
1737
+ #
1738
+ # @return [DataFrame]
1739
+ #
1740
+ # @example
1741
+ # df = Polars::DataFrame.new(
1742
+ # {
1743
+ # "foo" => [1, 2, 3],
1744
+ # "bar" => [6, 7, 8],
1745
+ # "ham" => ["a", "b", "c"]
1746
+ # }
1747
+ # )
1748
+ # x = Polars::Series.new("apple", [10, 20, 30])
1749
+ # df.hstack([x])
1750
+ # # =>
1751
+ # # shape: (3, 4)
1752
+ # # ┌─────┬─────┬─────┬───────┐
1753
+ # # │ foo ┆ bar ┆ ham ┆ apple │
1754
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1755
+ # # │ i64 ┆ i64 ┆ str ┆ i64 │
1756
+ # # ╞═════╪═════╪═════╪═══════╡
1757
+ # # │ 1 ┆ 6 ┆ a ┆ 10 │
1758
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1759
+ # # │ 2 ┆ 7 ┆ b ┆ 20 │
1760
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1761
+ # # │ 3 ┆ 8 ┆ c ┆ 30 │
1762
+ # # └─────┴─────┴─────┴───────┘
1763
def hstack(columns, in_place: false)
  # Anything that is not an Array is asked for its columns via `get_columns`.
  columns = columns.get_columns unless columns.is_a?(Array)

  # Unwrap every Series to the native handle exposed by `_s`.
  native_series = columns.map(&:_s)
  return _from_rbdf(_df.hstack(native_series)) unless in_place

  # In-place variant: mutate the underlying frame and hand back the receiver.
  _df.hstack_mut(native_series)
  self
end
905
1774
 
906
- # def vstack
907
- # end
1775
+ # Grow this DataFrame vertically by stacking a DataFrame to it.
1776
+ #
1777
+ # @param df [DataFrame]
1778
+ # DataFrame to stack.
1779
+ # @param in_place [Boolean]
1780
+ # Modify in place
1781
+ #
1782
+ # @return [DataFrame]
1783
+ #
1784
+ # @example
1785
+ # df1 = Polars::DataFrame.new(
1786
+ # {
1787
+ # "foo" => [1, 2],
1788
+ # "bar" => [6, 7],
1789
+ # "ham" => ["a", "b"]
1790
+ # }
1791
+ # )
1792
+ # df2 = Polars::DataFrame.new(
1793
+ # {
1794
+ # "foo" => [3, 4],
1795
+ # "bar" => [8, 9],
1796
+ # "ham" => ["c", "d"]
1797
+ # }
1798
+ # )
1799
+ # df1.vstack(df2)
1800
+ # # =>
1801
+ # # shape: (4, 3)
1802
+ # # ┌─────┬─────┬─────┐
1803
+ # # │ foo ┆ bar ┆ ham │
1804
+ # # │ --- ┆ --- ┆ --- │
1805
+ # # │ i64 ┆ i64 ┆ str │
1806
+ # # ╞═════╪═════╪═════╡
1807
+ # # │ 1 ┆ 6 ┆ a │
1808
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1809
+ # # │ 2 ┆ 7 ┆ b │
1810
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1811
+ # # │ 3 ┆ 8 ┆ c │
1812
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1813
+ # # │ 4 ┆ 9 ┆ d │
1814
+ # # └─────┴─────┴─────┘
1815
def vstack(df, in_place: false)
  # Default path: wrap the stacked result, leaving the receiver untouched.
  return _from_rbdf(_df.vstack(df._df)) unless in_place

  # In-place path: mutate the underlying frame and return the receiver.
  _df.vstack_mut(df._df)
  self
end
908
1823
 
1824
+ # Extend the memory backed by this `DataFrame` with the values from `other`.
1825
+ #
1826
+ # Different from `vstack` which adds the chunks from `other` to the chunks of this
1827
+ # `DataFrame`, `extend` appends the data from `other` to the underlying memory
1828
+ # locations and thus may cause a reallocation.
1829
+ #
1830
+ # If this does not cause a reallocation, the resulting data structure will not
1831
+ # have any extra chunks and thus will yield faster queries.
1832
+ #
1833
+ # Prefer `extend` over `vstack` when you want to do a query after a single append.
1834
+ # For instance during online operations where you add `n` rows and rerun a query.
1835
+ #
1836
+ # Prefer `vstack` over `extend` when you want to append many times before doing a
1837
+ # query. For instance when you read in multiple files and want to store them in a
1838
+ # single `DataFrame`. In the latter case, finish the sequence of `vstack`
1839
+ # operations with a `rechunk`.
1840
+ #
1841
+ # @param other [DataFrame]
1842
+ # DataFrame to vertically add.
1843
+ #
1844
+ # @return [DataFrame]
909
1845
  #
1846
+ # @example
1847
+ # df1 = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
1848
+ # df2 = Polars::DataFrame.new({"foo" => [10, 20, 30], "bar" => [40, 50, 60]})
1849
+ # df1.extend(df2)
1850
+ # # =>
1851
+ # # shape: (6, 2)
1852
+ # # ┌─────┬─────┐
1853
+ # # │ foo ┆ bar │
1854
+ # # │ --- ┆ --- │
1855
+ # # │ i64 ┆ i64 │
1856
+ # # ╞═════╪═════╡
1857
+ # # │ 1 ┆ 4 │
1858
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1859
+ # # │ 2 ┆ 5 │
1860
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1861
+ # # │ 3 ┆ 6 │
1862
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1863
+ # # │ 10 ┆ 40 │
1864
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1865
+ # # │ 20 ┆ 50 │
1866
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1867
+ # # │ 30 ┆ 60 │
1868
+ # # └─────┴─────┘
910
1869
def extend(other)
  # `tap` runs the mutation on the underlying frame and then returns the
  # receiver itself, so calls can be chained.
  tap { _df.extend(other._df) }
end
914
1873
 
915
- # def drop
916
- # end
1874
+ # Remove column(s) from the DataFrame and return the result as a new DataFrame.
1875
+ #
1876
+ # @param columns [Object]
1877
+ # Column(s) to drop.
1878
+ #
1879
+ # @return [DataFrame]
1880
+ #
1881
+ # @example
1882
+ # df = Polars::DataFrame.new(
1883
+ # {
1884
+ # "foo" => [1, 2, 3],
1885
+ # "bar" => [6.0, 7.0, 8.0],
1886
+ # "ham" => ["a", "b", "c"]
1887
+ # }
1888
+ # )
1889
+ # df.drop("ham")
1890
+ # # =>
1891
+ # # shape: (3, 2)
1892
+ # # ┌─────┬─────┐
1893
+ # # │ foo ┆ bar │
1894
+ # # │ --- ┆ --- │
1895
+ # # │ i64 ┆ f64 │
1896
+ # # ╞═════╪═════╡
1897
+ # # │ 1 ┆ 6.0 │
1898
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1899
+ # # │ 2 ┆ 7.0 │
1900
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1901
+ # # │ 3 ┆ 8.0 │
1902
+ # # └─────┴─────┘
1903
def drop(columns)
  # A single (non-Array) argument is handed straight to the underlying frame.
  return _from_rbdf(_df.drop(columns)) unless columns.is_a?(Array)

  # For an Array, work on a clone and remove the names one at a time;
  # `each_with_object` returns that clone once all names are processed.
  columns.each_with_object(clone) do |name, copy|
    copy._df.drop_in_place(name)
  end
end
917
1914
 
918
- # def drop_in_place
919
- # end
1915
+ # Drop a single column in place and return the dropped column as a Series.
1916
+ #
1917
+ # @param name [Object]
1918
+ # Column to drop.
1919
+ #
1920
+ # @return [Series]
1921
+ #
1922
+ # @example
1923
+ # df = Polars::DataFrame.new(
1924
+ # {
1925
+ # "foo" => [1, 2, 3],
1926
+ # "bar" => [6, 7, 8],
1927
+ # "ham" => ["a", "b", "c"]
1928
+ # }
1929
+ # )
1930
+ # df.drop_in_place("ham")
1931
+ # # =>
1932
+ # # shape: (3,)
1933
+ # # Series: 'ham' [str]
1934
+ # # [
1935
+ # # "a"
1936
+ # # "b"
1937
+ # # "c"
1938
+ # # ]
1939
def drop_in_place(name)
  # The underlying frame hands back the removed column; wrap it before returning.
  removed = _df.drop_in_place(name)
  Utils.wrap_s(removed)
end
920
1942
 
921
- # def cleared
922
- # end
1943
+ # Create an empty copy of the current DataFrame.
1944
+ #
1945
+ # Returns a DataFrame with identical schema but no data.
1946
+ #
1947
+ # @return [DataFrame]
1948
+ #
1949
+ # @example
1950
+ # df = Polars::DataFrame.new(
1951
+ # {
1952
+ # "a" => [nil, 2, 3, 4],
1953
+ # "b" => [0.5, nil, 2.5, 13],
1954
+ # "c" => [true, true, false, nil]
1955
+ # }
1956
+ # )
1957
+ # df.cleared
1958
+ # # =>
1959
+ # # shape: (0, 3)
1960
+ # # ┌─────┬─────┬──────┐
1961
+ # # │ a ┆ b ┆ c │
1962
+ # # │ --- ┆ --- ┆ --- │
1963
+ # # │ i64 ┆ f64 ┆ bool │
1964
+ # # ╞═════╪═════╪══════╡
1965
+ # # └─────┴─────┴──────┘
1966
def cleared
  # A frame with no rows only needs to be copied.
  return clone unless height > 0

  # Otherwise take a zero-row head of the current frame.
  head(0)
end
923
1969
 
924
1970
  # clone handled by initialize_copy
925
1971
 
1972
+ # Get the DataFrame as an Array of Series.
926
1973
  #
1974
+ # @return [Array]
927
1975
def get_columns
  # Fetch the native column handles, then wrap each one individually.
  native_columns = _df.get_columns
  native_columns.map do |native|
    Utils.wrap_s(native)
  end
end
930
1978
 
1979
+ # Get a single column as Series by name.
1980
+ #
1981
+ # @param name [String]
1982
+ # Name of the column to retrieve.
1983
+ #
1984
+ # @return [Series]
1985
+ #
1986
+ # @example
1987
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
1988
+ # df.get_column("foo")
1989
+ # # =>
1990
+ # # shape: (3,)
1991
+ # # Series: 'foo' [i64]
1992
+ # # [
1993
+ # # 1
1994
+ # # 2
1995
+ # # 3
1996
+ # # ]
931
1997
def get_column(name)
  # Named alias for the index operator: the lookup itself is done by `[]`.
  self.[](name)
end
@@ -935,13 +2001,85 @@ module Polars
935
2001
  # def fill_null
936
2002
  # end
937
2003
 
2004
+ # Fill floating point NaN values by an Expression evaluation.
2005
+ #
2006
+ # @param fill_value [Object]
2007
+ # Value to fill NaN with.
2008
+ #
2009
+ # @return [DataFrame]
2010
+ #
2011
+ # @note
2012
+ # Note that floating point NaNs (Not a Number) are not missing values!
2013
+ # To replace missing values, use `fill_null`.
938
2014
  #
2015
+ # @example
2016
+ # df = Polars::DataFrame.new(
2017
+ # {
2018
+ # "a" => [1.5, 2, Float::NAN, 4],
2019
+ # "b" => [0.5, 4, Float::NAN, 13]
2020
+ # }
2021
+ # )
2022
+ # df.fill_nan(99)
2023
+ # # =>
2024
+ # # shape: (4, 2)
2025
+ # # ┌──────┬──────┐
2026
+ # # │ a ┆ b │
2027
+ # # │ --- ┆ --- │
2028
+ # # │ f64 ┆ f64 │
2029
+ # # ╞══════╪══════╡
2030
+ # # │ 1.5 ┆ 0.5 │
2031
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2032
+ # # │ 2.0 ┆ 4.0 │
2033
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2034
+ # # │ 99.0 ┆ 99.0 │
2035
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2036
+ # # │ 4.0 ┆ 13.0 │
2037
+ # # └──────┴──────┘
939
2038
def fill_nan(fill_value)
  # Express the fill as a lazy query, then collect it without optimization.
  query = lazy.fill_nan(fill_value)
  query.collect(no_optimization: true)
end
942
2041
 
943
- # def explode
944
- # end
2042
+ # Explode `DataFrame` to long format by exploding a column with Lists.
2043
+ #
2044
+ # @param columns [Object]
2045
+ # Column of LargeList type.
2046
+ #
2047
+ # @return [DataFrame]
2048
+ #
2049
+ # @example
2050
+ # df = Polars::DataFrame.new(
2051
+ # {
2052
+ # "letters" => ["a", "a", "b", "c"],
2053
+ # "numbers" => [[1], [2, 3], [4, 5], [6, 7, 8]]
2054
+ # }
2055
+ # )
2056
+ # df.explode("numbers")
2057
+ # # =>
2058
+ # # shape: (8, 2)
2059
+ # # ┌─────────┬─────────┐
2060
+ # # │ letters ┆ numbers │
2061
+ # # │ --- ┆ --- │
2062
+ # # │ str ┆ i64 │
2063
+ # # ╞═════════╪═════════╡
2064
+ # # │ a ┆ 1 │
2065
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2066
+ # # │ a ┆ 2 │
2067
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2068
+ # # │ a ┆ 3 │
2069
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2070
+ # # │ b ┆ 4 │
2071
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2072
+ # # │ b ┆ 5 │
2073
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2074
+ # # │ c ┆ 6 │
2075
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2076
+ # # │ c ┆ 7 │
2077
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2078
+ # # │ c ┆ 8 │
2079
+ # # └─────────┴─────────┘
2080
def explode(columns)
  # Express the explode as a lazy query, then collect it without optimization.
  query = lazy.explode(columns)
  query.collect(no_optimization: true)
end
945
2083
 
946
2084
  # def pivot
947
2085
  # end
@@ -955,25 +2093,242 @@ module Polars
955
2093
  # def partition_by
956
2094
  # end
957
2095
 
958
- # def shift
959
- # end
960
-
961
- # def shift_and_fill
962
- # end
2096
+ # Shift values by the given period.
2097
+ #
2098
+ # @param periods [Integer]
2099
+ # Number of places to shift (may be negative).
2100
+ #
2101
+ # @return [DataFrame]
2102
+ #
2103
+ # @example
2104
+ # df = Polars::DataFrame.new(
2105
+ # {
2106
+ # "foo" => [1, 2, 3],
2107
+ # "bar" => [6, 7, 8],
2108
+ # "ham" => ["a", "b", "c"]
2109
+ # }
2110
+ # )
2111
+ # df.shift(1)
2112
+ # # =>
2113
+ # # shape: (3, 3)
2114
+ # # ┌──────┬──────┬──────┐
2115
+ # # │ foo ┆ bar ┆ ham │
2116
+ # # │ --- ┆ --- ┆ --- │
2117
+ # # │ i64 ┆ i64 ┆ str │
2118
+ # # ╞══════╪══════╪══════╡
2119
+ # # │ null ┆ null ┆ null │
2120
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2121
+ # # │ 1 ┆ 6 ┆ a │
2122
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2123
+ # # │ 2 ┆ 7 ┆ b │
2124
+ # # └──────┴──────┴──────┘
2125
+ #
2126
+ # @example
2127
+ # df.shift(-1)
2128
+ # # =>
2129
+ # # shape: (3, 3)
2130
+ # # ┌──────┬──────┬──────┐
2131
+ # # │ foo ┆ bar ┆ ham │
2132
+ # # │ --- ┆ --- ┆ --- │
2133
+ # # │ i64 ┆ i64 ┆ str │
2134
+ # # ╞══════╪══════╪══════╡
2135
+ # # │ 2 ┆ 7 ┆ b │
2136
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2137
+ # # │ 3 ┆ 8 ┆ c │
2138
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2139
+ # # │ null ┆ null ┆ null │
2140
+ # # └──────┴──────┴──────┘
2141
def shift(periods)
  # The underlying frame performs the shift; wrap its result as a DataFrame.
  shifted = _df.shift(periods)
  _from_rbdf(shifted)
end
2144
+
2145
+ # Shift the values by a given period and fill the resulting null values.
2146
+ #
2147
+ # @param periods [Integer]
2148
+ # Number of places to shift (may be negative).
2149
+ # @param fill_value [Object]
2150
+ # Fill the resulting nil values with this value.
2151
+ #
2152
+ # @return [DataFrame]
2153
+ #
2154
+ # @example
2155
+ # df = Polars::DataFrame.new(
2156
+ # {
2157
+ # "foo" => [1, 2, 3],
2158
+ # "bar" => [6, 7, 8],
2159
+ # "ham" => ["a", "b", "c"]
2160
+ # }
2161
+ # )
2162
+ # df.shift_and_fill(1, 0)
2163
+ # # =>
2164
+ # # shape: (3, 3)
2165
+ # # ┌─────┬─────┬─────┐
2166
+ # # │ foo ┆ bar ┆ ham │
2167
+ # # │ --- ┆ --- ┆ --- │
2168
+ # # │ i64 ┆ i64 ┆ str │
2169
+ # # ╞═════╪═════╪═════╡
2170
+ # # │ 0 ┆ 0 ┆ 0 │
2171
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
2172
+ # # │ 1 ┆ 6 ┆ a │
2173
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
2174
+ # # │ 2 ┆ 7 ┆ b │
2175
+ # # └─────┴─────┴─────┘
2176
def shift_and_fill(periods, fill_value)
  # Build the lazy shift-and-fill query, then materialize it eagerly with
  # the optimizer and the string cache both switched off.
  query = lazy.shift_and_fill(periods, fill_value)
  query.collect(no_optimization: true, string_cache: false)
end
963
2181
 
2182
+ # Get a mask of all duplicated rows in this DataFrame.
2183
+ #
2184
+ # @return [Series]
964
2185
  #
2186
+ # @example
2187
+ # df = Polars::DataFrame.new(
2188
+ # {
2189
+ # "a" => [1, 2, 3, 1],
2190
+ # "b" => ["x", "y", "z", "x"],
2191
+ # }
2192
+ # )
2193
+ # df.is_duplicated
2194
+ # # =>
2195
+ # # shape: (4,)
2196
+ # # Series: '' [bool]
2197
+ # # [
2198
+ # # true
2199
+ # # false
2200
+ # # false
2201
+ # # true
2202
+ # # ]
965
2203
def is_duplicated
  # The underlying frame computes the mask; wrap it before returning.
  mask = _df.is_duplicated
  Utils.wrap_s(mask)
end
968
2206
 
2207
+ # Get a mask of all unique rows in this DataFrame.
2208
+ #
2209
+ # @return [Series]
2210
+ #
2211
+ # @example
2212
+ # df = Polars::DataFrame.new(
2213
+ # {
2214
+ # "a" => [1, 2, 3, 1],
2215
+ # "b" => ["x", "y", "z", "x"]
2216
+ # }
2217
+ # )
2218
+ # df.is_unique
2219
+ # # =>
2220
+ # # shape: (4,)
2221
+ # # Series: '' [bool]
2222
+ # # [
2223
+ # # false
2224
+ # # true
2225
+ # # true
2226
+ # # false
2227
+ # # ]
969
2228
def is_unique
  # The underlying frame computes the mask; wrap it before returning.
  mask = _df.is_unique
  Utils.wrap_s(mask)
end
972
2231
 
2232
+ # Start a lazy query from this point.
2233
+ #
2234
+ # @return [LazyFrame]
973
2235
def lazy
  # Ask the underlying frame for its lazy counterpart and wrap the result.
  native_lazy = _df.lazy
  wrap_ldf(native_lazy)
end
976
2238
 
2239
+ # Select columns from this DataFrame.
2240
+ #
2241
+ # @param exprs [Object]
2242
+ # Column or columns to select.
2243
+ #
2244
+ # @return [DataFrame]
2245
+ #
2246
+ # @example
2247
+ # df = Polars::DataFrame.new(
2248
+ # {
2249
+ # "foo" => [1, 2, 3],
2250
+ # "bar" => [6, 7, 8],
2251
+ # "ham" => ["a", "b", "c"]
2252
+ # }
2253
+ # )
2254
+ # df.select("foo")
2255
+ # # =>
2256
+ # # shape: (3, 1)
2257
+ # # ┌─────┐
2258
+ # # │ foo │
2259
+ # # │ --- │
2260
+ # # │ i64 │
2261
+ # # ╞═════╡
2262
+ # # │ 1 │
2263
+ # # ├╌╌╌╌╌┤
2264
+ # # │ 2 │
2265
+ # # ├╌╌╌╌╌┤
2266
+ # # │ 3 │
2267
+ # # └─────┘
2268
+ #
2269
+ # @example
2270
+ # df.select(["foo", "bar"])
2271
+ # # =>
2272
+ # # shape: (3, 2)
2273
+ # # ┌─────┬─────┐
2274
+ # # │ foo ┆ bar │
2275
+ # # │ --- ┆ --- │
2276
+ # # │ i64 ┆ i64 │
2277
+ # # ╞═════╪═════╡
2278
+ # # │ 1 ┆ 6 │
2279
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
2280
+ # # │ 2 ┆ 7 │
2281
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
2282
+ # # │ 3 ┆ 8 │
2283
+ # # └─────┴─────┘
2284
+ #
2285
+ # @example
2286
+ # df.select(Polars.col("foo") + 1)
2287
+ # # =>
2288
+ # # shape: (3, 1)
2289
+ # # ┌─────┐
2290
+ # # │ foo │
2291
+ # # │ --- │
2292
+ # # │ i64 │
2293
+ # # ╞═════╡
2294
+ # # │ 2 │
2295
+ # # ├╌╌╌╌╌┤
2296
+ # # │ 3 │
2297
+ # # ├╌╌╌╌╌┤
2298
+ # # │ 4 │
2299
+ # # └─────┘
2300
+ #
2301
+ # @example
2302
+ # df.select([Polars.col("foo") + 1, Polars.col("bar") + 1])
2303
+ # # =>
2304
+ # # shape: (3, 2)
2305
+ # # ┌─────┬─────┐
2306
+ # # │ foo ┆ bar │
2307
+ # # │ --- ┆ --- │
2308
+ # # │ i64 ┆ i64 │
2309
+ # # ╞═════╪═════╡
2310
+ # # │ 2 ┆ 7 │
2311
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
2312
+ # # │ 3 ┆ 8 │
2313
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
2314
+ # # │ 4 ┆ 9 │
2315
+ # # └─────┴─────┘
2316
+ #
2317
+ # @example
2318
+ # df.select(Polars.when(Polars.col("foo") > 2).then(10).otherwise(0))
2319
+ # # =>
2320
+ # # shape: (3, 1)
2321
+ # # ┌─────────┐
2322
+ # # │ literal │
2323
+ # # │ --- │
2324
+ # # │ i64 │
2325
+ # # ╞═════════╡
2326
+ # # │ 0 │
2327
+ # # ├╌╌╌╌╌╌╌╌╌┤
2328
+ # # │ 0 │
2329
+ # # ├╌╌╌╌╌╌╌╌╌┤
2330
+ # # │ 10 │
2331
+ # # └─────────┘
977
2332
  def select(exprs)
978
2333
  _from_rbdf(
979
2334
  lazy
@@ -983,6 +2338,43 @@ module Polars
983
2338
  )
984
2339
  end
985
2340
 
2341
+ # Add or overwrite multiple columns in a DataFrame.
2342
+ #
2343
+ # @param exprs [Array]
2344
+ # Array of Expressions that evaluate to columns.
2345
+ #
2346
+ # @return [DataFrame]
2347
+ #
2348
+ # @example
2349
+ # df = Polars::DataFrame.new(
2350
+ # {
2351
+ # "a" => [1, 2, 3, 4],
2352
+ # "b" => [0.5, 4, 10, 13],
2353
+ # "c" => [true, true, false, true]
2354
+ # }
2355
+ # )
2356
+ # df.with_columns(
2357
+ # [
2358
+ # (Polars.col("a") ** 2).alias("a^2"),
2359
+ # (Polars.col("b") / 2).alias("b/2"),
2360
+ # (Polars.col("c").is_not()).alias("not c")
2361
+ # ]
2362
+ # )
2363
+ # # =>
2364
+ # # shape: (4, 6)
2365
+ # # ┌─────┬──────┬───────┬──────┬──────┬───────┐
2366
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
2367
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
2368
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │
2369
+ # # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
2370
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │
2371
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2372
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │
2373
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2374
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
2375
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2376
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
2377
+ # # └─────┴──────┴───────┴──────┴──────┴───────┘
986
2378
  def with_columns(exprs)
987
2379
  if !exprs.nil? && !exprs.is_a?(Array)
988
2380
  exprs = [exprs]
@@ -992,6 +2384,26 @@ module Polars
992
2384
  .collect(no_optimization: true, string_cache: false)
993
2385
  end
994
2386
 
2387
+ # Get number of chunks used by the ChunkedArrays of this DataFrame.
2388
+ #
2389
+ # @param strategy ["first", "all"]
2390
+ # Return the number of chunks of the 'first' column,
2391
+ # or 'all' columns in this DataFrame.
2392
+ #
2393
+ # @return [Object]
2394
+ #
2395
+ # @example
2396
+ # df = Polars::DataFrame.new(
2397
+ # {
2398
+ # "a" => [1, 2, 3, 4],
2399
+ # "b" => [0.5, 4, 10, 13],
2400
+ # "c" => [true, true, false, true]
2401
+ # }
2402
+ # )
2403
+ # df.n_chunks
2404
+ # # => 1
2405
+ # df.n_chunks(strategy: "all")
2406
+ # # => [1, 1, 1]
995
2407
  def n_chunks(strategy: "first")
996
2408
  if strategy == "first"
997
2409
  _df.n_chunks
@@ -1002,6 +2414,28 @@ module Polars
1002
2414
  end
1003
2415
  end
1004
2416
 
2417
+ # Aggregate the columns of this DataFrame to their maximum value.
2418
+ #
2419
+ # @return [DataFrame]
2420
+ #
2421
+ # @example
2422
+ # df = Polars::DataFrame.new(
2423
+ # {
2424
+ # "foo" => [1, 2, 3],
2425
+ # "bar" => [6, 7, 8],
2426
+ # "ham" => ["a", "b", "c"]
2427
+ # }
2428
+ # )
2429
+ # df.max
2430
+ # # =>
2431
+ # # shape: (1, 3)
2432
+ # # ┌─────┬─────┬─────┐
2433
+ # # │ foo ┆ bar ┆ ham │
2434
+ # # │ --- ┆ --- ┆ --- │
2435
+ # # │ i64 ┆ i64 ┆ str │
2436
+ # # ╞═════╪═════╪═════╡
2437
+ # # │ 3 ┆ 8 ┆ c │
2438
+ # # └─────┴─────┴─────┘
1005
2439
  def max(axis: 0)
1006
2440
  if axis == 0
1007
2441
  _from_rbdf(_df.max)
@@ -1012,6 +2446,28 @@ module Polars
1012
2446
  end
1013
2447
  end
1014
2448
 
2449
+ # Aggregate the columns of this DataFrame to their minimum value.
2450
+ #
2451
+ # @return [DataFrame]
2452
+ #
2453
+ # @example
2454
+ # df = Polars::DataFrame.new(
2455
+ # {
2456
+ # "foo" => [1, 2, 3],
2457
+ # "bar" => [6, 7, 8],
2458
+ # "ham" => ["a", "b", "c"]
2459
+ # }
2460
+ # )
2461
+ # df.min
2462
+ # # =>
2463
+ # # shape: (1, 3)
2464
+ # # ┌─────┬─────┬─────┐
2465
+ # # │ foo ┆ bar ┆ ham │
2466
+ # # │ --- ┆ --- ┆ --- │
2467
+ # # │ i64 ┆ i64 ┆ str │
2468
+ # # ╞═════╪═════╪═════╡
2469
+ # # │ 1 ┆ 6 ┆ a │
2470
+ # # └─────┴─────┴─────┘
1015
2471
  def min(axis: 0)
1016
2472
  if axis == 0
1017
2473
  _from_rbdf(_df.min)
@@ -1022,6 +2478,44 @@ module Polars
1022
2478
  end
1023
2479
  end
1024
2480
 
2481
+ # Aggregate the columns of this DataFrame to their sum value.
2482
+ #
2483
+ # @param axis [Integer]
2484
+ # Either 0 or 1.
2485
+ # @param null_strategy ["ignore", "propagate"]
2486
+ # This argument is only used if axis == 1.
2487
+ #
2488
+ # @return [DataFrame, Series]
2489
+ #
2490
+ # @example
2491
+ # df = Polars::DataFrame.new(
2492
+ # {
2493
+ # "foo" => [1, 2, 3],
2494
+ # "bar" => [6, 7, 8],
2495
+ # "ham" => ["a", "b", "c"],
2496
+ # }
2497
+ # )
2498
+ # df.sum
2499
+ # # =>
2500
+ # # shape: (1, 3)
2501
+ # # ┌─────┬─────┬──────┐
2502
+ # # │ foo ┆ bar ┆ ham │
2503
+ # # │ --- ┆ --- ┆ --- │
2504
+ # # │ i64 ┆ i64 ┆ str │
2505
+ # # ╞═════╪═════╪══════╡
2506
+ # # │ 6 ┆ 21 ┆ null │
2507
+ # # └─────┴─────┴──────┘
2508
+ #
2509
+ # @example
2510
+ # df.sum(axis: 1)
2511
+ # # =>
2512
+ # # shape: (3,)
2513
+ # # Series: 'foo' [str]
2514
+ # # [
2515
+ # # "16a"
2516
+ # # "27b"
2517
+ # # "38c"
2518
+ # # ]
1025
2519
  def sum(axis: 0, null_strategy: "ignore")
1026
2520
  case axis
1027
2521
  when 0
@@ -1033,6 +2527,33 @@ module Polars
1033
2527
  end
1034
2528
  end
1035
2529
 
2530
+ # Aggregate the columns of this DataFrame to their mean value.
2531
+ #
2532
+ # @param axis [Integer]
2533
+ # Either 0 or 1.
2534
+ # @param null_strategy ["ignore", "propagate"]
2535
+ # This argument is only used if axis == 1.
2536
+ #
2537
+ # @return [DataFrame]
2538
+ #
2539
+ # @example
2540
+ # df = Polars::DataFrame.new(
2541
+ # {
2542
+ # "foo" => [1, 2, 3],
2543
+ # "bar" => [6, 7, 8],
2544
+ # "ham" => ["a", "b", "c"]
2545
+ # }
2546
+ # )
2547
+ # df.mean
2548
+ # # =>
2549
+ # # shape: (1, 3)
2550
+ # # ┌─────┬─────┬──────┐
2551
+ # # │ foo ┆ bar ┆ ham │
2552
+ # # │ --- ┆ --- ┆ --- │
2553
+ # # │ f64 ┆ f64 ┆ str │
2554
+ # # ╞═════╪═════╪══════╡
2555
+ # # │ 2.0 ┆ 7.0 ┆ null │
2556
+ # # └─────┴─────┴──────┘
1036
2557
  def mean(axis: 0, null_strategy: "ignore")
1037
2558
  case axis
1038
2559
  when 0
@@ -1044,77 +2565,633 @@ module Polars
1044
2565
  end
1045
2566
  end
1046
2567
 
2568
+ # Aggregate the columns of this DataFrame to their standard deviation value.
2569
+ #
2570
+ # @param ddof [Integer]
2571
+ # Degrees of freedom
2572
+ #
2573
+ # @return [DataFrame]
2574
+ #
2575
+ # @example
2576
+ # df = Polars::DataFrame.new(
2577
+ # {
2578
+ # "foo" => [1, 2, 3],
2579
+ # "bar" => [6, 7, 8],
2580
+ # "ham" => ["a", "b", "c"]
2581
+ # }
2582
+ # )
2583
+ # df.std
2584
+ # # =>
2585
+ # # shape: (1, 3)
2586
+ # # ┌─────┬─────┬──────┐
2587
+ # # │ foo ┆ bar ┆ ham │
2588
+ # # │ --- ┆ --- ┆ --- │
2589
+ # # │ f64 ┆ f64 ┆ str │
2590
+ # # ╞═════╪═════╪══════╡
2591
+ # # │ 1.0 ┆ 1.0 ┆ null │
2592
+ # # └─────┴─────┴──────┘
2593
+ #
2594
+ # @example
2595
+ # df.std(ddof: 0)
2596
+ # # =>
2597
+ # # shape: (1, 3)
2598
+ # # ┌──────────┬──────────┬──────┐
2599
+ # # │ foo ┆ bar ┆ ham │
2600
+ # # │ --- ┆ --- ┆ --- │
2601
+ # # │ f64 ┆ f64 ┆ str │
2602
+ # # ╞══════════╪══════════╪══════╡
2603
+ # # │ 0.816497 ┆ 0.816497 ┆ null │
2604
+ # # └──────────┴──────────┴──────┘
1047
2605
def std(ddof: 1)
  # `ddof` is passed positionally to the underlying frame.
  aggregated = _df.std(ddof)
  _from_rbdf(aggregated)
end
1050
2608
 
2609
+ # Aggregate the columns of this DataFrame to their variance value.
2610
+ #
2611
+ # @param ddof [Integer]
2612
+ # Degrees of freedom
2613
+ #
2614
+ # @return [DataFrame]
2615
+ #
2616
+ # @example
2617
+ # df = Polars::DataFrame.new(
2618
+ # {
2619
+ # "foo" => [1, 2, 3],
2620
+ # "bar" => [6, 7, 8],
2621
+ # "ham" => ["a", "b", "c"]
2622
+ # }
2623
+ # )
2624
+ # df.var
2625
+ # # =>
2626
+ # # shape: (1, 3)
2627
+ # # ┌─────┬─────┬──────┐
2628
+ # # │ foo ┆ bar ┆ ham │
2629
+ # # │ --- ┆ --- ┆ --- │
2630
+ # # │ f64 ┆ f64 ┆ str │
2631
+ # # ╞═════╪═════╪══════╡
2632
+ # # │ 1.0 ┆ 1.0 ┆ null │
2633
+ # # └─────┴─────┴──────┘
2634
+ #
2635
+ # @example
2636
+ # df.var(ddof: 0)
2637
+ # # =>
2638
+ # # shape: (1, 3)
2639
+ # # ┌──────────┬──────────┬──────┐
2640
+ # # │ foo ┆ bar ┆ ham │
2641
+ # # │ --- ┆ --- ┆ --- │
2642
+ # # │ f64 ┆ f64 ┆ str │
2643
+ # # ╞══════════╪══════════╪══════╡
2644
+ # # │ 0.666667 ┆ 0.666667 ┆ null │
2645
+ # # └──────────┴──────────┴──────┘
1051
2646
def var(ddof: 1)
  # `ddof` is passed positionally to the underlying frame.
  aggregated = _df.var(ddof)
  _from_rbdf(aggregated)
end
1054
2649
 
2650
+ # Aggregate the columns of this DataFrame to their median value.
2651
+ #
2652
+ # @return [DataFrame]
2653
+ #
2654
+ # @example
2655
+ # df = Polars::DataFrame.new(
2656
+ # {
2657
+ # "foo" => [1, 2, 3],
2658
+ # "bar" => [6, 7, 8],
2659
+ # "ham" => ["a", "b", "c"]
2660
+ # }
2661
+ # )
2662
+ # df.median
2663
+ # # =>
2664
+ # # shape: (1, 3)
2665
+ # # ┌─────┬─────┬──────┐
2666
+ # # │ foo ┆ bar ┆ ham │
2667
+ # # │ --- ┆ --- ┆ --- │
2668
+ # # │ f64 ┆ f64 ┆ str │
2669
+ # # ╞═════╪═════╪══════╡
2670
+ # # │ 2.0 ┆ 7.0 ┆ null │
2671
+ # # └─────┴─────┴──────┘
1055
2672
def median
  # The underlying frame aggregates; wrap its result as a DataFrame.
  aggregated = _df.median
  _from_rbdf(aggregated)
end
1058
2675
 
1059
- # def product
1060
- # end
2676
+ # Aggregate the columns of this DataFrame to their product values.
2677
+ #
2678
+ # @return [DataFrame]
2679
+ #
2680
+ # @example
2681
+ # df = Polars::DataFrame.new(
2682
+ # {
2683
+ # "a" => [1, 2, 3],
2684
+ # "b" => [0.5, 4, 10],
2685
+ # "c" => [true, true, false]
2686
+ # }
2687
+ # )
2688
+ # df.product
2689
+ # # =>
2690
+ # # shape: (1, 3)
2691
+ # # ┌─────┬──────┬─────┐
2692
+ # # │ a ┆ b ┆ c │
2693
+ # # │ --- ┆ --- ┆ --- │
2694
+ # # │ i64 ┆ f64 ┆ i64 │
2695
+ # # ╞═════╪══════╪═════╡
2696
+ # # │ 6 ┆ 20.0 ┆ 0 │
2697
+ # # └─────┴──────┴─────┘
2698
def product
  # Implemented as a selection: apply `product` to the all-columns expression.
  every_column = Polars.all
  select(every_column.product)
end
2701
+
2702
+ # Aggregate the columns of this DataFrame to their quantile value.
2703
+ #
2704
+ # @param quantile [Float]
2705
+ # Quantile between 0.0 and 1.0.
2706
+ # @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
2707
+ # Interpolation method.
2708
+ #
2709
+ # @return [DataFrame]
2710
+ #
2711
+ # @example
2712
+ # df = Polars::DataFrame.new(
2713
+ # {
2714
+ # "foo" => [1, 2, 3],
2715
+ # "bar" => [6, 7, 8],
2716
+ # "ham" => ["a", "b", "c"]
2717
+ # }
2718
+ # )
2719
+ # df.quantile(0.5, interpolation: "nearest")
2720
+ # # =>
2721
+ # # shape: (1, 3)
2722
+ # # ┌─────┬─────┬──────┐
2723
+ # # │ foo ┆ bar ┆ ham │
2724
+ # # │ --- ┆ --- ┆ --- │
2725
+ # # │ f64 ┆ f64 ┆ str │
2726
+ # # ╞═════╪═════╪══════╡
2727
+ # # │ 2.0 ┆ 7.0 ┆ null │
2728
+ # # └─────┴─────┴──────┘
2729
def quantile(quantile, interpolation: "nearest")
  # Both arguments are passed positionally to the underlying frame.
  aggregated = _df.quantile(quantile, interpolation)
  _from_rbdf(aggregated)
end
2732
+
2733
+ # Get one hot encoded dummy variables.
2734
+ #
2735
+ # @param columns [Object]
2736
+ # A subset of columns to convert to dummy variables. `nil` means
2737
+ # "all columns".
2738
+ #
2739
+ # @return [DataFrame]
2740
+ #
2741
+ # @example
2742
+ # df = Polars::DataFrame.new(
2743
+ # {
2744
+ # "foo" => [1, 2],
2745
+ # "bar" => [3, 4],
2746
+ # "ham" => ["a", "b"]
2747
+ # }
2748
+ # )
2749
+ # df.to_dummies
2750
+ # # =>
2751
+ # # shape: (2, 6)
2752
+ # # ┌───────┬───────┬───────┬───────┬───────┬───────┐
2753
+ # # │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │
2754
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
2755
+ # # │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │
2756
+ # # ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡
2757
+ # # │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │
2758
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2759
+ # # │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │
2760
+ # # └───────┴───────┴───────┴───────┴───────┴───────┘
2761
def to_dummies(columns: nil)
  # A lone column name is promoted to a one-element list; any other value
  # (including nil) is passed through unchanged.
  columns = [columns] if columns.is_a?(String)

  encoded = _df.to_dummies(columns)
  _from_rbdf(encoded)
end
1061
2767
 
1062
- # def quantile(quantile, interpolation: "nearest")
1063
- # end
2768
+ # Drop duplicate rows from this DataFrame.
2769
+ #
2770
+ # @param maintain_order [Boolean]
2771
+ # Keep the same order as the original DataFrame. This requires more work to
2772
+ # compute.
2773
+ # @param subset [Object]
2774
+ # Subset to use to compare rows.
2775
+ # @param keep ["first", "last"]
2776
+ # Which of the duplicate rows to keep (in conjunction with `subset`).
2777
+ #
2778
+ # @return [DataFrame]
2779
+ #
2780
+ # @note
2781
+ # Note that this fails if there is a column of type `List` in the DataFrame or
2782
+ # subset.
2783
+ #
2784
+ # @example
2785
+ # df = Polars::DataFrame.new(
2786
+ # {
2787
+ # "a" => [1, 1, 2, 3, 4, 5],
2788
+ # "b" => [0.5, 0.5, 1.0, 2.0, 3.0, 3.0],
2789
+ # "c" => [true, true, true, false, true, true]
2790
+ # }
2791
+ # )
2792
+ # df.unique
2793
+ # # =>
2794
+ # # shape: (5, 3)
2795
+ # # ┌─────┬─────┬───────┐
2796
+ # # │ a ┆ b ┆ c │
2797
+ # # │ --- ┆ --- ┆ --- │
2798
+ # # │ i64 ┆ f64 ┆ bool │
2799
+ # # ╞═════╪═════╪═══════╡
2800
+ # # │ 1 ┆ 0.5 ┆ true │
2801
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2802
+ # # │ 2 ┆ 1.0 ┆ true │
2803
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2804
+ # # │ 3 ┆ 2.0 ┆ false │
2805
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2806
+ # # │ 4 ┆ 3.0 ┆ true │
2807
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2808
+ # # │ 5 ┆ 3.0 ┆ true │
2809
+ # # └─────┴─────┴───────┘
2810
def unique(maintain_order: true, subset: nil, keep: "first")
  # Normalize `subset`: nil and Arrays pass through untouched, a lone
  # String becomes a one-element Array, anything else is converted via `to_a`.
  subset =
    case subset
    when nil, Array then subset
    when String then [subset]
    else subset.to_a
    end

  deduplicated = _df.unique(maintain_order, subset, keep)
  _from_rbdf(deduplicated)
end
1067
2821
 
1068
- # def unique
1069
- # end
2822
+ # Return the number of unique rows, or the number of unique row-subsets.
2823
+ #
2824
+ # @param subset [Object]
2825
+ # One or more columns/expressions that define what to count;
2826
+ # omit to return the count of unique rows.
2827
+ #
2828
+ # @return [DataFrame]
2829
+ #
2830
+ # @example
2831
+ # df = Polars::DataFrame.new(
2832
+ # {
2833
+ # "a" => [1, 1, 2, 3, 4, 5],
2834
+ # "b" => [0.5, 0.5, 1.0, 2.0, 3.0, 3.0],
2835
+ # "c" => [true, true, true, false, true, true]
2836
+ # }
2837
+ # )
2838
+ # df.n_unique
2839
+ # # => 5
2840
+ #
2841
+ # @example Simple columns subset
2842
+ # df.n_unique(subset: ["b", "c"])
2843
+ # # => 4
2844
+ #
2845
+ # @example Expression subset
2846
+ # df.n_unique(
2847
+ # subset: [
2848
+ # (Polars.col("a").floordiv(2)),
2849
+ # (Polars.col("c") | (Polars.col("b") >= 2))
2850
+ # ]
2851
+ # )
2852
+ # # => 3
2853
def n_unique(subset: nil)
  # Normalize a lone column name or a lone expression into a one-element
  # Array. The name check must be against String: the previous check against
  # StringIO never matched a column name, so a plain name was not wrapped in
  # `Polars.col` and fell through to the struct branch below.
  if subset.is_a?(String)
    subset = [Polars.col(subset)]
  elsif subset.is_a?(Expr)
    subset = [subset]
  end

  expr =
    if subset.is_a?(Array) && subset.length == 1
      # A single column/expression is counted directly.
      Utils.expr_to_lit_or_expr(subset[0], str_to_lit: false)
    else
      # Several fields (or all columns when no subset is given) are combined
      # into one struct expression so that whole rows are compared.
      Polars.struct(subset.nil? ? Polars.all : subset)
    end

  # The query yields a single value; an empty result is reported as 0.
  df = lazy.select(expr.n_unique).collect
  df.is_empty ? 0 : df.row(0)[0]
end
2870
+
2871
+ # Rechunk the data in this DataFrame to a contiguous allocation.
1073
2872
  #
2873
+ # This will make sure all subsequent operations have optimal and predictable
2874
+ # performance.
1074
2875
  #
2876
+ # @return [DataFrame]
1075
2877
  def rechunk
1076
2878
  _from_rbdf(_df.rechunk)
1077
2879
  end
1078
2880
 
2881
+ # Create a new DataFrame that shows the null counts per column.
2882
+ #
2883
+ # @return [DataFrame]
2884
+ #
2885
+ # @example
2886
+ # df = Polars::DataFrame.new(
2887
+ # {
2888
+ # "foo" => [1, nil, 3],
2889
+ # "bar" => [6, 7, nil],
2890
+ # "ham" => ["a", "b", "c"]
2891
+ # }
2892
+ # )
2893
+ # df.null_count
2894
+ # # =>
2895
+ # # shape: (1, 3)
2896
+ # # ┌─────┬─────┬─────┐
2897
+ # # │ foo ┆ bar ┆ ham │
2898
+ # # │ --- ┆ --- ┆ --- │
2899
+ # # │ u32 ┆ u32 ┆ u32 │
2900
+ # # ╞═════╪═════╪═════╡
2901
+ # # │ 1 ┆ 1 ┆ 0 │
2902
+ # # └─────┴─────┴─────┘
1079
2903
  def null_count
1080
2904
  _from_rbdf(_df.null_count)
1081
2905
  end
1082
2906
 
1083
- # def sample
1084
- # end
2907
+ # Sample from this DataFrame.
2908
+ #
2909
+ # @param n [Integer]
2910
+ # Number of items to return. Cannot be used with `frac`. Defaults to 1 if
2911
+ # `frac` is nil.
2912
+ # @param frac [Float]
2913
+ # Fraction of items to return. Cannot be used with `n`.
2914
+ # @param with_replacement [Boolean]
2915
+ # Allow values to be sampled more than once.
2916
+ # @param shuffle [Boolean]
2917
+ # Shuffle the order of sampled data points.
2918
+ # @param seed [Integer]
2919
+ # Seed for the random number generator. If set to nil (default), a random
2920
+ # seed is used.
2921
+ #
2922
+ # @return [DataFrame]
2923
+ #
2924
+ # @example
2925
+ # df = Polars::DataFrame.new(
2926
+ # {
2927
+ # "foo" => [1, 2, 3],
2928
+ # "bar" => [6, 7, 8],
2929
+ # "ham" => ["a", "b", "c"]
2930
+ # }
2931
+ # )
2932
+ # df.sample(n: 2, seed: 0)
2933
+ # # =>
2934
+ # # shape: (2, 3)
2935
+ # # ┌─────┬─────┬─────┐
2936
+ # # │ foo ┆ bar ┆ ham │
2937
+ # # │ --- ┆ --- ┆ --- │
2938
+ # # │ i64 ┆ i64 ┆ str │
2939
+ # # ╞═════╪═════╪═════╡
2940
+ # # │ 3 ┆ 8 ┆ c │
2941
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
2942
+ # # │ 2 ┆ 7 ┆ b │
2943
+ # # └─────┴─────┴─────┘
2944
+ def sample(
2945
+ n: nil,
2946
+ frac: nil,
2947
+ with_replacement: false,
2948
+ shuffle: false,
2949
+ seed: nil
2950
+ )
2951
+ if !n.nil? && !frac.nil?
2952
+ raise ArgumentError, "cannot specify both `n` and `frac`"
2953
+ end
2954
+
2955
+ if n.nil? && !frac.nil?
2956
+ _from_rbdf(
2957
+ _df.sample_frac(frac, with_replacement, shuffle, seed)
2958
+ )
2959
+ end
2960
+
2961
+ if n.nil?
2962
+ n = 1
2963
+ end
2964
+ _from_rbdf(_df.sample_n(n, with_replacement, shuffle, seed))
2965
+ end
1085
2966
 
1086
2967
  # def fold
1087
2968
  # end
1088
2969
 
1089
- # def row
1090
- # end
2970
+ # Get a row as tuple, either by index or by predicate.
2971
+ #
2972
+ # @param index [Object]
2973
+ # Row index.
2974
+ # @param by_predicate [Object]
2975
+ # Select the row according to a given expression/predicate.
2976
+ #
2977
+ # @return [Object]
2978
+ #
2979
+ # @note
2980
+ # The `index` and `by_predicate` params are mutually exclusive. Additionally,
2981
+ # to ensure clarity, the `by_predicate` parameter must be supplied by keyword.
2982
+ #
2983
+ # When using `by_predicate` it is an error condition if anything other than
2984
+ # one row is returned; more than one row raises `TooManyRowsReturned`, and
2985
+ # zero rows will raise `NoRowsReturned` (both inherit from `RowsException`).
2986
+ #
2987
+ # @example Return the row at the given index
2988
+ # df = Polars::DataFrame.new(
2989
+ # {
2990
+ # "foo" => [1, 2, 3],
2991
+ # "bar" => [6, 7, 8],
2992
+ # "ham" => ["a", "b", "c"]
2993
+ # }
2994
+ # )
2995
+ # df.row(2)
2996
+ # # => [3, 8, "c"]
2997
+ #
2998
+ # @example Return the row that matches the given predicate
2999
+ # df.row(by_predicate: Polars.col("ham") == "b")
3000
+ # # => [2, 7, "b"]
3001
+ def row(index = nil, by_predicate: nil)
3002
+ if !index.nil? && !by_predicate.nil?
3003
+ raise ArgumentError, "Cannot set both 'index' and 'by_predicate'; mutually exclusive"
3004
+ elsif index.is_a?(Expr)
3005
+ raise TypeError, "Expressions should be passed to the 'by_predicate' param"
3006
+ elsif index.is_a?(Integer)
3007
+ _df.row_tuple(index)
3008
+ elsif by_predicate.is_a?(Expr)
3009
+ rows = filter(by_predicate).rows
3010
+ n_rows = rows.length
3011
+ if n_rows > 1
3012
+ raise TooManyRowsReturned, "Predicate #{by_predicate} returned #{n_rows} rows"
3013
+ elsif n_rows == 0
3014
+ raise NoRowsReturned, "Predicate <{by_predicate!s}> returned no rows"
3015
+ end
3016
+ rows[0]
3017
+ else
3018
+ raise ArgumentError, "One of 'index' or 'by_predicate' must be set"
3019
+ end
3020
+ end
1091
3021
 
1092
- # def rows
1093
- # end
3022
+ # Convert columnar data to rows as Ruby arrays.
3023
+ #
3024
+ # @return [Array]
3025
+ #
3026
+ # @example
3027
+ # df = Polars::DataFrame.new(
3028
+ # {
3029
+ # "a" => [1, 3, 5],
3030
+ # "b" => [2, 4, 6]
3031
+ # }
3032
+ # )
3033
+ # df.rows
3034
+ # # => [[1, 2], [3, 4], [5, 6]]
3035
+ def rows
3036
+ _df.row_tuples
3037
+ end
1094
3038
 
1095
- # def shrink_to_fit
1096
- # end
3039
+ # Shrink DataFrame memory usage.
3040
+ #
3041
+ # Shrinks to fit the exact capacity needed to hold the data.
3042
+ #
3043
+ # @return [DataFrame]
3044
+ def shrink_to_fit(in_place: false)
3045
+ if in_place
3046
+ _df.shrink_to_fit
3047
+ self
3048
+ else
3049
+ df = clone
3050
+ df._df.shrink_to_fit
3051
+ df
3052
+ end
3053
+ end
1097
3054
 
1098
- # def take_every
1099
- # end
3055
+ # Take every nth row in the DataFrame and return as a new DataFrame.
3056
+ #
3057
+ # @return [DataFrame]
3058
+ #
3059
+ # @example
3060
+ # s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]})
3061
+ # s.take_every(2)
3062
+ # # =>
3063
+ # # shape: (2, 2)
3064
+ # # ┌─────┬─────┐
3065
+ # # │ a ┆ b │
3066
+ # # │ --- ┆ --- │
3067
+ # # │ i64 ┆ i64 │
3068
+ # # ╞═════╪═════╡
3069
+ # # │ 1 ┆ 5 │
3070
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
3071
+ # # │ 3 ┆ 7 │
3072
+ # # └─────┴─────┘
3073
+ def take_every(n)
3074
+ select(Utils.col("*").take_every(n))
3075
+ end
1100
3076
 
1101
3077
  # def hash_rows
1102
3078
  # end
1103
3079
 
1104
- # def interpolate
1105
- # end
1106
-
3080
+ # Interpolate intermediate values. The interpolation method is linear.
3081
+ #
3082
+ # @return [DataFrame]
3083
+ #
3084
+ # @example
3085
+ # df = Polars::DataFrame.new(
3086
+ # {
3087
+ # "foo" => [1, nil, 9, 10],
3088
+ # "bar" => [6, 7, 9, nil],
3089
+ # "baz" => [1, nil, nil, 9]
3090
+ # }
3091
+ # )
3092
+ # df.interpolate
3093
+ # # =>
3094
+ # # shape: (4, 3)
3095
+ # # ┌─────┬──────┬─────┐
3096
+ # # │ foo ┆ bar ┆ baz │
3097
+ # # │ --- ┆ --- ┆ --- │
3098
+ # # │ i64 ┆ i64 ┆ i64 │
3099
+ # # ╞═════╪══════╪═════╡
3100
+ # # │ 1 ┆ 6 ┆ 1 │
3101
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
3102
+ # # │ 5 ┆ 7 ┆ 3 │
3103
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
3104
+ # # │ 9 ┆ 9 ┆ 6 │
3105
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
3106
+ # # │ 10 ┆ null ┆ 9 │
3107
+ # # └─────┴──────┴─────┘
3108
+ def interpolate
3109
+ select(Utils.col("*").interpolate)
3110
+ end
3111
+
3112
+ # Check if the dataframe is empty.
3113
+ #
3114
+ # @return [Boolean]
1107
3115
  #
3116
+ # @example
3117
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
3118
+ # df.is_empty
3119
+ # # => false
3120
+ # df.filter(Polars.col("foo") > 99).is_empty
3121
+ # # => true
1108
3122
  def is_empty
1109
3123
  height == 0
1110
3124
  end
1111
3125
  alias_method :empty?, :is_empty
1112
3126
 
1113
- # def to_struct(name)
1114
- # end
3127
+ # Convert a `DataFrame` to a `Series` of type `Struct`.
3128
+ #
3129
+ # @param name [String]
3130
+ # Name for the struct Series
3131
+ #
3132
+ # @return [Series]
3133
+ #
3134
+ # @example
3135
+ # df = Polars::DataFrame.new(
3136
+ # {
3137
+ # "a" => [1, 2, 3, 4, 5],
3138
+ # "b" => ["one", "two", "three", "four", "five"]
3139
+ # }
3140
+ # )
3141
+ # df.to_struct("nums")
3142
+ # # =>
3143
+ # # shape: (5,)
3144
+ # # Series: 'nums' [struct[2]]
3145
+ # # [
3146
+ # # {1,"one"}
3147
+ # # {2,"two"}
3148
+ # # {3,"three"}
3149
+ # # {4,"four"}
3150
+ # # {5,"five"}
3151
+ # # ]
3152
+ def to_struct(name)
3153
+ Utils.wrap_s(_df.to_struct(name))
3154
+ end
1115
3155
 
1116
- # def unnest
1117
- # end
3156
+ # Decompose a struct into its fields.
3157
+ #
3158
+ # The fields will be inserted into the `DataFrame` on the location of the
3159
+ # `struct` type.
3160
+ #
3161
+ # @param names [Object]
3162
+ # Names of the struct columns that will be decomposed by its fields
3163
+ #
3164
+ # @return [DataFrame]
3165
+ #
3166
+ # @example
3167
+ # df = Polars::DataFrame.new(
3168
+ # {
3169
+ # "before" => ["foo", "bar"],
3170
+ # "t_a" => [1, 2],
3171
+ # "t_b" => ["a", "b"],
3172
+ # "t_c" => [true, nil],
3173
+ # "t_d" => [[1, 2], [3]],
3174
+ # "after" => ["baz", "womp"]
3175
+ # }
3176
+ # ).select(["before", Polars.struct(Polars.col("^t_.$")).alias("t_struct"), "after"])
3177
+ # df.unnest("t_struct")
3178
+ # # =>
3179
+ # # shape: (2, 6)
3180
+ # # ┌────────┬─────┬─────┬──────┬───────────┬───────┐
3181
+ # # │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │
3182
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3183
+ # # │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │
3184
+ # # ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
3185
+ # # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │
3186
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
3187
+ # # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
3188
+ # # └────────┴─────┴─────┴──────┴───────────┴───────┘
3189
+ def unnest(names)
3190
+ if names.is_a?(String)
3191
+ names = [names]
3192
+ end
3193
+ _from_rbdf(_df.unnest(names))
3194
+ end
1118
3195
 
1119
3196
  private
1120
3197
 
@@ -1127,7 +3204,7 @@ module Polars
1127
3204
  if !columns.nil?
1128
3205
  columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)
1129
3206
 
1130
- if !data && dtypes
3207
+ if data.empty? && dtypes
1131
3208
  data_series = columns.map { |name| Series.new(name, [], dtype: dtypes[name])._s }
1132
3209
  else
1133
3210
  data_series = data.map { |name, values| Series.new(name, values, dtype: dtypes[name])._s }
@@ -1147,7 +3224,7 @@ module Polars
1147
3224
  if columns.nil?
1148
3225
  data
1149
3226
  else
1150
- if !data
3227
+ if data.empty?
1151
3228
  columns.map { |c| Series.new(c, nil)._s }
1152
3229
  elsif data.length == columns.length
1153
3230
  columns.each_with_index do |c, i|
@@ -1182,5 +3259,75 @@ module Polars
1182
3259
  def _from_rbdf(rb_df)
1183
3260
  self.class._from_rbdf(rb_df)
1184
3261
  end
3262
+
3263
+ def _comp(other, op)
3264
+ if other.is_a?(DataFrame)
3265
+ _compare_to_other_df(other, op)
3266
+ else
3267
+ _compare_to_non_df(other, op)
3268
+ end
3269
+ end
3270
+
3271
+ def _compare_to_other_df(other, op)
3272
+ if columns != other.columns
3273
+ raise ArgmentError, "DataFrame columns do not match"
3274
+ end
3275
+ if shape != other.shape
3276
+ raise ArgmentError, "DataFrame dimensions do not match"
3277
+ end
3278
+
3279
+ suffix = "__POLARS_CMP_OTHER"
3280
+ other_renamed = other.select(Polars.all.suffix(suffix))
3281
+ combined = Polars.concat([self, other_renamed], how: "horizontal")
3282
+
3283
+ expr = case op
3284
+ when "eq"
3285
+ columns.map { |n| Polars.col(n) == Polars.col("#{n}#{suffix}") }
3286
+ when "neq"
3287
+ columns.map { |n| Polars.col(n) != Polars.col("#{n}#{suffix}") }
3288
+ when "gt"
3289
+ columns.map { |n| Polars.col(n) > Polars.col("#{n}#{suffix}") }
3290
+ when "lt"
3291
+ columns.map { |n| Polars.col(n) < Polars.col("#{n}#{suffix}") }
3292
+ when "gt_eq"
3293
+ columns.map { |n| Polars.col(n) >= Polars.col("#{n}#{suffix}") }
3294
+ when "lt_eq"
3295
+ columns.map { |n| Polars.col(n) <= Polars.col("#{n}#{suffix}") }
3296
+ else
3297
+ raise ArgumentError, "got unexpected comparison operator: #{op}"
3298
+ end
3299
+
3300
+ combined.select(expr)
3301
+ end
3302
+
3303
+ def _compare_to_non_df(other, op)
3304
+ case op
3305
+ when "eq"
3306
+ select(Polars.all == other)
3307
+ when "neq"
3308
+ select(Polars.all != other)
3309
+ when "gt"
3310
+ select(Polars.all > other)
3311
+ when "lt"
3312
+ select(Polars.all < other)
3313
+ when "gt_eq"
3314
+ select(Polars.all >= other)
3315
+ when "lt_eq"
3316
+ select(Polars.all <= other)
3317
+ else
3318
+ raise ArgumentError, "got unexpected comparison operator: #{op}"
3319
+ end
3320
+ end
3321
+
3322
+ def _prepare_other_arg(other)
3323
+ if !other.is_a?(Series)
3324
+ if other.is_a?(Array)
3325
+ raise ArgumentError, "Operation not supported."
3326
+ end
3327
+
3328
+ other = Series.new("", [other])
3329
+ end
3330
+ other
3331
+ end
1185
3332
  end
1186
3333
  end