polars-df 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +3 -0
- data/CHANGELOG.md +4 -0
- data/Cargo.lock +2 -1
- data/README.md +1 -1
- data/ext/polars/Cargo.toml +7 -1
- data/ext/polars/src/conversion.rs +35 -2
- data/ext/polars/src/dataframe.rs +228 -11
- data/ext/polars/src/lazy/dataframe.rs +3 -3
- data/ext/polars/src/lazy/dsl.rs +59 -2
- data/ext/polars/src/lib.rs +151 -10
- data/ext/polars/src/series.rs +182 -29
- data/ext/polars/src/set.rs +91 -0
- data/ext/polars/src/utils.rs +19 -0
- data/lib/polars/batched_csv_reader.rb +1 -0
- data/lib/polars/cat_expr.rb +39 -0
- data/lib/polars/data_frame.rb +2284 -137
- data/lib/polars/date_time_expr.rb +1282 -7
- data/lib/polars/exceptions.rb +20 -0
- data/lib/polars/expr.rb +612 -7
- data/lib/polars/expr_dispatch.rb +14 -0
- data/lib/polars/functions.rb +219 -0
- data/lib/polars/group_by.rb +517 -0
- data/lib/polars/io.rb +421 -2
- data/lib/polars/lazy_frame.rb +1261 -67
- data/lib/polars/lazy_functions.rb +288 -10
- data/lib/polars/lazy_group_by.rb +79 -0
- data/lib/polars/list_expr.rb +5 -0
- data/lib/polars/meta_expr.rb +21 -0
- data/lib/polars/series.rb +1476 -212
- data/lib/polars/slice.rb +104 -0
- data/lib/polars/string_expr.rb +663 -2
- data/lib/polars/struct_expr.rb +73 -0
- data/lib/polars/utils.rb +43 -3
- data/lib/polars/version.rb +2 -1
- data/lib/polars/when.rb +1 -0
- data/lib/polars/when_then.rb +1 -0
- data/lib/polars.rb +7 -10
- metadata +9 -2
data/lib/polars/data_frame.rb
CHANGED
@@ -155,12 +155,35 @@ module Polars
|
|
155
155
|
end
|
156
156
|
|
157
157
|
# @private
|
158
|
-
def self._read_parquet(
|
158
|
+
def self._read_parquet(
|
159
|
+
file,
|
160
|
+
columns: nil,
|
161
|
+
n_rows: nil,
|
162
|
+
parallel: "auto",
|
163
|
+
row_count_name: nil,
|
164
|
+
row_count_offset: 0,
|
165
|
+
low_memory: false
|
166
|
+
)
|
159
167
|
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
160
168
|
file = Utils.format_path(file)
|
161
169
|
end
|
162
170
|
|
163
|
-
|
171
|
+
if file.is_a?(String) && file.include?("*")
|
172
|
+
raise Todo
|
173
|
+
end
|
174
|
+
|
175
|
+
projection, columns = Utils.handle_projection_columns(columns)
|
176
|
+
_from_rbdf(
|
177
|
+
RbDataFrame.read_parquet(
|
178
|
+
file,
|
179
|
+
columns,
|
180
|
+
projection,
|
181
|
+
n_rows,
|
182
|
+
parallel,
|
183
|
+
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
184
|
+
low_memory
|
185
|
+
)
|
186
|
+
)
|
164
187
|
end
|
165
188
|
|
166
189
|
# def self._read_avro
|
@@ -259,11 +282,13 @@ module Polars
|
|
259
282
|
# @return [Array]
|
260
283
|
#
|
261
284
|
# @example
|
262
|
-
# df = Polars::DataFrame.new(
|
263
|
-
#
|
264
|
-
#
|
265
|
-
#
|
266
|
-
#
|
285
|
+
# df = Polars::DataFrame.new(
|
286
|
+
# {
|
287
|
+
# "foo" => [1, 2, 3],
|
288
|
+
# "bar" => [6, 7, 8],
|
289
|
+
# "ham" => ["a", "b", "c"]
|
290
|
+
# }
|
291
|
+
# )
|
267
292
|
# df.columns
|
268
293
|
# # => ["foo", "bar", "ham"]
|
269
294
|
def columns
|
@@ -279,11 +304,13 @@ module Polars
|
|
279
304
|
# @return [Object]
|
280
305
|
#
|
281
306
|
# @example
|
282
|
-
# df = Polars::DataFrame.new(
|
283
|
-
#
|
284
|
-
#
|
285
|
-
#
|
286
|
-
#
|
307
|
+
# df = Polars::DataFrame.new(
|
308
|
+
# {
|
309
|
+
# "foo" => [1, 2, 3],
|
310
|
+
# "bar" => [6, 7, 8],
|
311
|
+
# "ham" => ["a", "b", "c"]
|
312
|
+
# }
|
313
|
+
# )
|
287
314
|
# df.columns = ["apple", "banana", "orange"]
|
288
315
|
# df
|
289
316
|
# # =>
|
@@ -308,11 +335,13 @@ module Polars
|
|
308
335
|
# @return [Array]
|
309
336
|
#
|
310
337
|
# @example
|
311
|
-
# df = Polars::DataFrame.new(
|
312
|
-
#
|
313
|
-
#
|
314
|
-
#
|
315
|
-
#
|
338
|
+
# df = Polars::DataFrame.new(
|
339
|
+
# {
|
340
|
+
# "foo" => [1, 2, 3],
|
341
|
+
# "bar" => [6.0, 7.0, 8.0],
|
342
|
+
# "ham" => ["a", "b", "c"]
|
343
|
+
# }
|
344
|
+
# )
|
316
345
|
# df.dtypes
|
317
346
|
# # => [:i64, :f64, :str]
|
318
347
|
def dtypes
|
@@ -324,56 +353,132 @@ module Polars
|
|
324
353
|
# @return [Hash]
|
325
354
|
#
|
326
355
|
# @example
|
327
|
-
# df = Polars::DataFrame.new(
|
328
|
-
#
|
329
|
-
#
|
330
|
-
#
|
331
|
-
#
|
356
|
+
# df = Polars::DataFrame.new(
|
357
|
+
# {
|
358
|
+
# "foo" => [1, 2, 3],
|
359
|
+
# "bar" => [6.0, 7.0, 8.0],
|
360
|
+
# "ham" => ["a", "b", "c"]
|
361
|
+
# }
|
362
|
+
# )
|
332
363
|
# df.schema
|
333
364
|
# # => {"foo"=>:i64, "bar"=>:f64, "ham"=>:str}
|
334
365
|
def schema
|
335
366
|
columns.zip(dtypes).to_h
|
336
367
|
end
|
337
368
|
|
338
|
-
#
|
339
|
-
#
|
369
|
+
# Equal.
|
370
|
+
#
|
371
|
+
# @return [DataFrame]
|
372
|
+
def ==(other)
|
373
|
+
_comp(other, "eq")
|
374
|
+
end
|
340
375
|
|
341
|
-
#
|
342
|
-
#
|
376
|
+
# Not equal.
|
377
|
+
#
|
378
|
+
# @return [DataFrame]
|
379
|
+
def !=(other)
|
380
|
+
_comp(other, "neq")
|
381
|
+
end
|
343
382
|
|
344
|
-
#
|
345
|
-
#
|
383
|
+
# Greater than.
|
384
|
+
#
|
385
|
+
# @return [DataFrame]
|
386
|
+
def >(other)
|
387
|
+
_comp(other, "gt")
|
388
|
+
end
|
346
389
|
|
347
|
-
#
|
348
|
-
#
|
390
|
+
# Less than.
|
391
|
+
#
|
392
|
+
# @return [DataFrame]
|
393
|
+
def <(other)
|
394
|
+
_comp(other, "lt")
|
395
|
+
end
|
349
396
|
|
350
|
-
#
|
351
|
-
#
|
397
|
+
# Greater than or equal.
|
398
|
+
#
|
399
|
+
# @return [DataFrame]
|
400
|
+
def >=(other)
|
401
|
+
_comp(other, "gt_eq")
|
402
|
+
end
|
352
403
|
|
353
|
-
#
|
354
|
-
#
|
404
|
+
# Less than or equal.
|
405
|
+
#
|
406
|
+
# @return [DataFrame]
|
407
|
+
def <=(other)
|
408
|
+
_comp(other, "lt_eq")
|
409
|
+
end
|
355
410
|
|
356
|
-
#
|
357
|
-
#
|
411
|
+
# Performs multiplication.
|
412
|
+
#
|
413
|
+
# @return [DataFrame]
|
414
|
+
def *(other)
|
415
|
+
if other.is_a?(DataFrame)
|
416
|
+
return _from_rbdf(_df.mul_df(other._df))
|
417
|
+
end
|
358
418
|
|
359
|
-
|
360
|
-
|
419
|
+
other = _prepare_other_arg(other)
|
420
|
+
_from_rbdf(_df.mul(other._s))
|
421
|
+
end
|
361
422
|
|
362
|
-
#
|
363
|
-
#
|
423
|
+
# Performs division.
|
424
|
+
#
|
425
|
+
# @return [DataFrame]
|
426
|
+
def /(other)
|
427
|
+
if other.is_a?(DataFrame)
|
428
|
+
return _from_rbdf(_df.div_df(other._df))
|
429
|
+
end
|
364
430
|
|
365
|
-
|
366
|
-
|
431
|
+
other = _prepare_other_arg(other)
|
432
|
+
_from_rbdf(_df.div(other._s))
|
433
|
+
end
|
367
434
|
|
368
|
-
#
|
369
|
-
#
|
435
|
+
# Performs addition.
|
436
|
+
#
|
437
|
+
# @return [DataFrame]
|
438
|
+
def +(other)
|
439
|
+
if other.is_a?(DataFrame)
|
440
|
+
return _from_rbdf(_df.add_df(other._df))
|
441
|
+
end
|
442
|
+
|
443
|
+
other = _prepare_other_arg(other)
|
444
|
+
_from_rbdf(_df.add(other._s))
|
445
|
+
end
|
446
|
+
|
447
|
+
# Performs subtraction.
|
448
|
+
#
|
449
|
+
# @return [DataFrame]
|
450
|
+
def -(other)
|
451
|
+
if other.is_a?(DataFrame)
|
452
|
+
return _from_rbdf(_df.sub_df(other._df))
|
453
|
+
end
|
454
|
+
|
455
|
+
other = _prepare_other_arg(other)
|
456
|
+
_from_rbdf(_df.sub(other._s))
|
457
|
+
end
|
458
|
+
|
459
|
+
# Returns the modulo.
|
460
|
+
#
|
461
|
+
# @return [DataFrame]
|
462
|
+
def %(other)
|
463
|
+
if other.is_a?(DataFrame)
|
464
|
+
return _from_rbdf(_df.rem_df(other._df))
|
465
|
+
end
|
466
|
+
|
467
|
+
other = _prepare_other_arg(other)
|
468
|
+
_from_rbdf(_df.rem(other._s))
|
469
|
+
end
|
370
470
|
|
471
|
+
# Returns a string representing the DataFrame.
|
371
472
|
#
|
473
|
+
# @return [String]
|
372
474
|
def to_s
|
373
475
|
_df.to_s
|
374
476
|
end
|
375
477
|
alias_method :inspect, :to_s
|
376
478
|
|
479
|
+
# Check if DataFrame includes column.
|
480
|
+
#
|
481
|
+
# @return [Boolean]
|
377
482
|
def include?(name)
|
378
483
|
columns.include?(name)
|
379
484
|
end
|
@@ -387,9 +492,78 @@ module Polars
|
|
387
492
|
# def _pos_idxs
|
388
493
|
# end
|
389
494
|
|
495
|
+
# Returns subset of the DataFrame.
|
390
496
|
#
|
391
|
-
|
392
|
-
|
497
|
+
# @return [Object]
|
498
|
+
def [](*args)
|
499
|
+
if args.size == 2
|
500
|
+
row_selection, col_selection = args
|
501
|
+
|
502
|
+
# df[.., unknown]
|
503
|
+
if row_selection.is_a?(Range)
|
504
|
+
|
505
|
+
# multiple slices
|
506
|
+
# df[.., ..]
|
507
|
+
if col_selection.is_a?(Range)
|
508
|
+
raise Todo
|
509
|
+
end
|
510
|
+
end
|
511
|
+
|
512
|
+
# df[2, ..] (select row as df)
|
513
|
+
if row_selection.is_a?(Integer)
|
514
|
+
if col_selection.is_a?(Array)
|
515
|
+
df = self[0.., col_selection]
|
516
|
+
return df.slice(row_selection, 1)
|
517
|
+
end
|
518
|
+
# df[2, "a"]
|
519
|
+
if col_selection.is_a?(String)
|
520
|
+
return self[col_selection][row_selection]
|
521
|
+
end
|
522
|
+
end
|
523
|
+
|
524
|
+
# column selection can be "a" and ["a", "b"]
|
525
|
+
if col_selection.is_a?(String)
|
526
|
+
col_selection = [col_selection]
|
527
|
+
end
|
528
|
+
|
529
|
+
# df[.., 1]
|
530
|
+
if col_selection.is_a?(Integer)
|
531
|
+
series = to_series(col_selection)
|
532
|
+
return series[row_selection]
|
533
|
+
end
|
534
|
+
|
535
|
+
if col_selection.is_a?(Array)
|
536
|
+
# df[.., [1, 2]]
|
537
|
+
if is_int_sequence(col_selection)
|
538
|
+
series_list = col_selection.map { |i| to_series(i) }
|
539
|
+
df = self.class.new(series_list)
|
540
|
+
return df[row_selection]
|
541
|
+
end
|
542
|
+
end
|
543
|
+
|
544
|
+
df = self[col_selection]
|
545
|
+
return df[row_selection]
|
546
|
+
elsif args.size == 1
|
547
|
+
item = args[0]
|
548
|
+
|
549
|
+
# select single column
|
550
|
+
# df["foo"]
|
551
|
+
if item.is_a?(String)
|
552
|
+
return Utils.wrap_s(_df.column(item))
|
553
|
+
end
|
554
|
+
|
555
|
+
# df[idx]
|
556
|
+
if item.is_a?(Integer)
|
557
|
+
return slice(_pos_idx(item, dim: 0), 1)
|
558
|
+
end
|
559
|
+
|
560
|
+
# df[..]
|
561
|
+
if item.is_a?(Range)
|
562
|
+
return Slice.new(self).apply(item)
|
563
|
+
end
|
564
|
+
end
|
565
|
+
|
566
|
+
raise ArgumentError, "Cannot get item of type: #{item.class.name}"
|
393
567
|
end
|
394
568
|
|
395
569
|
# def []=(key, value)
|
@@ -397,7 +571,9 @@ module Polars
|
|
397
571
|
|
398
572
|
# no to_arrow
|
399
573
|
|
574
|
+
# Convert DataFrame to a hash mapping column name to values.
|
400
575
|
#
|
576
|
+
# @return [Hash]
|
401
577
|
def to_h(as_series: true)
|
402
578
|
if as_series
|
403
579
|
get_columns.to_h { |s| [s.name, s] }
|
@@ -422,11 +598,13 @@ module Polars
|
|
422
598
|
# @return [Series]
|
423
599
|
#
|
424
600
|
# @example
|
425
|
-
# df = Polars::DataFrame.new(
|
426
|
-
#
|
427
|
-
#
|
428
|
-
#
|
429
|
-
#
|
601
|
+
# df = Polars::DataFrame.new(
|
602
|
+
# {
|
603
|
+
# "foo" => [1, 2, 3],
|
604
|
+
# "bar" => [6, 7, 8],
|
605
|
+
# "ham" => ["a", "b", "c"]
|
606
|
+
# }
|
607
|
+
# )
|
430
608
|
# df.to_series(1)
|
431
609
|
# # =>
|
432
610
|
# # shape: (3,)
|
@@ -519,11 +697,13 @@ module Polars
|
|
519
697
|
# @return [String, nil]
|
520
698
|
#
|
521
699
|
# @example
|
522
|
-
# df = Polars::DataFrame.new(
|
523
|
-
#
|
524
|
-
#
|
525
|
-
#
|
526
|
-
#
|
700
|
+
# df = Polars::DataFrame.new(
|
701
|
+
# {
|
702
|
+
# "foo" => [1, 2, 3, 4, 5],
|
703
|
+
# "bar" => [6, 7, 8, 9, 10],
|
704
|
+
# "ham" => ["a", "b", "c", "d", "e"]
|
705
|
+
# }
|
706
|
+
# )
|
527
707
|
# df.write_csv("file.csv")
|
528
708
|
def write_csv(
|
529
709
|
file = nil,
|
@@ -694,10 +874,12 @@ module Polars
|
|
694
874
|
# @return [DataFrame]
|
695
875
|
#
|
696
876
|
# @example
|
697
|
-
# df = Polars::DataFrame.new(
|
698
|
-
#
|
699
|
-
#
|
700
|
-
#
|
877
|
+
# df = Polars::DataFrame.new(
|
878
|
+
# {
|
879
|
+
# "key" => ["a", "b", "c"],
|
880
|
+
# "val" => [1, 2, 3]
|
881
|
+
# }
|
882
|
+
# )
|
701
883
|
# df.reverse()
|
702
884
|
# # =>
|
703
885
|
# # shape: (3, 2)
|
@@ -724,11 +906,13 @@ module Polars
|
|
724
906
|
# @return [DataFrame]
|
725
907
|
#
|
726
908
|
# @example
|
727
|
-
# df = Polars::DataFrame.new(
|
728
|
-
#
|
729
|
-
#
|
730
|
-
#
|
731
|
-
#
|
909
|
+
# df = Polars::DataFrame.new(
|
910
|
+
# {
|
911
|
+
# "foo" => [1, 2, 3],
|
912
|
+
# "bar" => [6, 7, 8],
|
913
|
+
# "ham" => ["a", "b", "c"]
|
914
|
+
# }
|
915
|
+
# )
|
732
916
|
# df.rename({"foo" => "apple"})
|
733
917
|
# # =>
|
734
918
|
# # shape: (3, 3)
|
@@ -775,11 +959,13 @@ module Polars
|
|
775
959
|
# # └─────┴─────┴─────┘
|
776
960
|
#
|
777
961
|
# @example
|
778
|
-
# df = Polars::DataFrame.new(
|
779
|
-
#
|
780
|
-
#
|
781
|
-
#
|
782
|
-
#
|
962
|
+
# df = Polars::DataFrame.new(
|
963
|
+
# {
|
964
|
+
# "a" => [1, 2, 3, 4],
|
965
|
+
# "b" => [0.5, 4, 10, 13],
|
966
|
+
# "c" => [true, true, false, true]
|
967
|
+
# }
|
968
|
+
# )
|
783
969
|
# s = Polars::Series.new("d", [-2.5, 15, 20.5, 0])
|
784
970
|
# df.insert_at_idx(3, s)
|
785
971
|
# # =>
|
@@ -805,63 +991,560 @@ module Polars
|
|
805
991
|
self
|
806
992
|
end
|
807
993
|
|
994
|
+
# Filter the rows in the DataFrame based on a predicate expression.
|
995
|
+
#
|
996
|
+
# @param predicate [Expr]
|
997
|
+
# Expression that evaluates to a boolean Series.
|
998
|
+
#
|
999
|
+
# @return [DataFrame]
|
1000
|
+
#
|
1001
|
+
# @example Filter on one condition:
|
1002
|
+
# df = Polars::DataFrame.new(
|
1003
|
+
# {
|
1004
|
+
# "foo" => [1, 2, 3],
|
1005
|
+
# "bar" => [6, 7, 8],
|
1006
|
+
# "ham" => ["a", "b", "c"]
|
1007
|
+
# }
|
1008
|
+
# )
|
1009
|
+
# df.filter(Polars.col("foo") < 3)
|
1010
|
+
# # =>
|
1011
|
+
# # shape: (2, 3)
|
1012
|
+
# # ┌─────┬─────┬─────┐
|
1013
|
+
# # │ foo ┆ bar ┆ ham │
|
1014
|
+
# # │ --- ┆ --- ┆ --- │
|
1015
|
+
# # │ i64 ┆ i64 ┆ str │
|
1016
|
+
# # ╞═════╪═════╪═════╡
|
1017
|
+
# # │ 1 ┆ 6 ┆ a │
|
1018
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1019
|
+
# # │ 2 ┆ 7 ┆ b │
|
1020
|
+
# # └─────┴─────┴─────┘
|
1021
|
+
#
|
1022
|
+
# @example Filter on multiple conditions:
|
1023
|
+
# df.filter((Polars.col("foo") < 3) & (Polars.col("ham") == "a"))
|
1024
|
+
# # =>
|
1025
|
+
# # shape: (1, 3)
|
1026
|
+
# # ┌─────┬─────┬─────┐
|
1027
|
+
# # │ foo ┆ bar ┆ ham │
|
1028
|
+
# # │ --- ┆ --- ┆ --- │
|
1029
|
+
# # │ i64 ┆ i64 ┆ str │
|
1030
|
+
# # ╞═════╪═════╪═════╡
|
1031
|
+
# # │ 1 ┆ 6 ┆ a │
|
1032
|
+
# # └─────┴─────┴─────┘
|
808
1033
|
def filter(predicate)
|
809
1034
|
lazy.filter(predicate).collect
|
810
1035
|
end
|
811
1036
|
|
812
|
-
#
|
813
|
-
#
|
1037
|
+
# Summary statistics for a DataFrame.
|
1038
|
+
#
|
1039
|
+
# @return [DataFrame]
|
1040
|
+
#
|
1041
|
+
# @example
|
1042
|
+
# df = Polars::DataFrame.new(
|
1043
|
+
# {
|
1044
|
+
# "a" => [1.0, 2.8, 3.0],
|
1045
|
+
# "b" => [4, 5, nil],
|
1046
|
+
# "c" => [true, false, true],
|
1047
|
+
# "d" => [nil, "b", "c"],
|
1048
|
+
# "e" => ["usd", "eur", nil]
|
1049
|
+
# }
|
1050
|
+
# )
|
1051
|
+
# df.describe
|
1052
|
+
# # =>
|
1053
|
+
# # shape: (7, 6)
|
1054
|
+
# # ┌────────────┬──────────┬──────────┬──────┬──────┬──────┐
|
1055
|
+
# # │ describe ┆ a ┆ b ┆ c ┆ d ┆ e │
|
1056
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
1057
|
+
# # │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str │
|
1058
|
+
# # ╞════════════╪══════════╪══════════╪══════╪══════╪══════╡
|
1059
|
+
# # │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 │
|
1060
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1061
|
+
# # │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 │
|
1062
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1063
|
+
# # │ mean ┆ 2.266667 ┆ 4.5 ┆ null ┆ null ┆ null │
|
1064
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1065
|
+
# # │ std ┆ 1.101514 ┆ 0.707107 ┆ null ┆ null ┆ null │
|
1066
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1067
|
+
# # │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur │
|
1068
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1069
|
+
# # │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd │
|
1070
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1071
|
+
# # │ median ┆ 2.8 ┆ 4.5 ┆ null ┆ null ┆ null │
|
1072
|
+
# # └────────────┴──────────┴──────────┴──────┴──────┴──────┘
|
1073
|
+
def describe
|
1074
|
+
describe_cast = lambda do |stat|
|
1075
|
+
columns = []
|
1076
|
+
self.columns.each_with_index do |s, i|
|
1077
|
+
if self[s].is_numeric || self[s].is_boolean
|
1078
|
+
columns << stat[0.., i].cast(:f64)
|
1079
|
+
else
|
1080
|
+
# for dates, strings, etc, we cast to string so that all
|
1081
|
+
# statistics can be shown
|
1082
|
+
columns << stat[0.., i].cast(:str)
|
1083
|
+
end
|
1084
|
+
end
|
1085
|
+
self.class.new(columns)
|
1086
|
+
end
|
814
1087
|
|
815
|
-
|
816
|
-
|
1088
|
+
summary = _from_rbdf(
|
1089
|
+
Polars.concat(
|
1090
|
+
[
|
1091
|
+
describe_cast.(
|
1092
|
+
self.class.new(columns.to_h { |c| [c, [height]] })
|
1093
|
+
),
|
1094
|
+
describe_cast.(null_count),
|
1095
|
+
describe_cast.(mean),
|
1096
|
+
describe_cast.(std),
|
1097
|
+
describe_cast.(min),
|
1098
|
+
describe_cast.(max),
|
1099
|
+
describe_cast.(median)
|
1100
|
+
]
|
1101
|
+
)._df
|
1102
|
+
)
|
1103
|
+
summary.insert_at_idx(
|
1104
|
+
0,
|
1105
|
+
Polars::Series.new(
|
1106
|
+
"describe",
|
1107
|
+
["count", "null_count", "mean", "std", "min", "max", "median"],
|
1108
|
+
)
|
1109
|
+
)
|
1110
|
+
summary
|
1111
|
+
end
|
817
1112
|
|
818
|
-
#
|
819
|
-
#
|
1113
|
+
# Find the index of a column by name.
|
1114
|
+
#
|
1115
|
+
# @param name [String]
|
1116
|
+
# Name of the column to find.
|
1117
|
+
#
|
1118
|
+
# @return [Integer]
|
1119
|
+
#
|
1120
|
+
# @example
|
1121
|
+
# df = Polars::DataFrame.new(
|
1122
|
+
# {"foo" => [1, 2, 3], "bar" => [6, 7, 8], "ham" => ["a", "b", "c"]}
|
1123
|
+
# )
|
1124
|
+
# df.find_idx_by_name("ham")
|
1125
|
+
# # => 2
|
1126
|
+
def find_idx_by_name(name)
|
1127
|
+
_df.find_idx_by_name(name)
|
1128
|
+
end
|
1129
|
+
|
1130
|
+
# Replace a column at an index location.
|
1131
|
+
#
|
1132
|
+
# @param index [Integer]
|
1133
|
+
# Column index.
|
1134
|
+
# @param series [Series]
|
1135
|
+
# Series that will replace the column.
|
1136
|
+
#
|
1137
|
+
# @return [DataFrame]
|
1138
|
+
#
|
1139
|
+
# @example
|
1140
|
+
# df = Polars::DataFrame.new(
|
1141
|
+
# {
|
1142
|
+
# "foo" => [1, 2, 3],
|
1143
|
+
# "bar" => [6, 7, 8],
|
1144
|
+
# "ham" => ["a", "b", "c"]
|
1145
|
+
# }
|
1146
|
+
# )
|
1147
|
+
# s = Polars::Series.new("apple", [10, 20, 30])
|
1148
|
+
# df.replace_at_idx(0, s)
|
1149
|
+
# # =>
|
1150
|
+
# # shape: (3, 3)
|
1151
|
+
# # ┌───────┬─────┬─────┐
|
1152
|
+
# # │ apple ┆ bar ┆ ham │
|
1153
|
+
# # │ --- ┆ --- ┆ --- │
|
1154
|
+
# # │ i64 ┆ i64 ┆ str │
|
1155
|
+
# # ╞═══════╪═════╪═════╡
|
1156
|
+
# # │ 10 ┆ 6 ┆ a │
|
1157
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1158
|
+
# # │ 20 ┆ 7 ┆ b │
|
1159
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1160
|
+
# # │ 30 ┆ 8 ┆ c │
|
1161
|
+
# # └───────┴─────┴─────┘
|
1162
|
+
def replace_at_idx(index, series)
|
1163
|
+
if index < 0
|
1164
|
+
index = columns.length + index
|
1165
|
+
end
|
1166
|
+
_df.replace_at_idx(index, series._s)
|
1167
|
+
self
|
1168
|
+
end
|
820
1169
|
|
1170
|
+
# Sort the DataFrame by column.
|
1171
|
+
#
|
1172
|
+
# @param by [String]
|
1173
|
+
# By which column to sort.
|
1174
|
+
# @param reverse [Boolean]
|
1175
|
+
# Reverse/descending sort.
|
1176
|
+
# @param nulls_last [Boolean]
|
1177
|
+
# Place null values last. Can only be used if sorted by a single column.
|
1178
|
+
#
|
1179
|
+
# @return [DataFrame]
|
1180
|
+
#
|
1181
|
+
# @example
|
1182
|
+
# df = Polars::DataFrame.new(
|
1183
|
+
# {
|
1184
|
+
# "foo" => [1, 2, 3],
|
1185
|
+
# "bar" => [6.0, 7.0, 8.0],
|
1186
|
+
# "ham" => ["a", "b", "c"]
|
1187
|
+
# }
|
1188
|
+
# )
|
1189
|
+
# df.sort("foo", reverse: true)
|
1190
|
+
# # =>
|
1191
|
+
# # shape: (3, 3)
|
1192
|
+
# # ┌─────┬─────┬─────┐
|
1193
|
+
# # │ foo ┆ bar ┆ ham │
|
1194
|
+
# # │ --- ┆ --- ┆ --- │
|
1195
|
+
# # │ i64 ┆ f64 ┆ str │
|
1196
|
+
# # ╞═════╪═════╪═════╡
|
1197
|
+
# # │ 3 ┆ 8.0 ┆ c │
|
1198
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1199
|
+
# # │ 2 ┆ 7.0 ┆ b │
|
1200
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1201
|
+
# # │ 1 ┆ 6.0 ┆ a │
|
1202
|
+
# # └─────┴─────┴─────┘
|
821
1203
|
#
|
1204
|
+
# @example Sort by multiple columns.
|
1205
|
+
# df.sort(
|
1206
|
+
# [Polars.col("foo"), Polars.col("bar")**2],
|
1207
|
+
# reverse: [true, false]
|
1208
|
+
# )
|
1209
|
+
# # =>
|
1210
|
+
# # shape: (3, 3)
|
1211
|
+
# # ┌─────┬─────┬─────┐
|
1212
|
+
# # │ foo ┆ bar ┆ ham │
|
1213
|
+
# # │ --- ┆ --- ┆ --- │
|
1214
|
+
# # │ i64 ┆ f64 ┆ str │
|
1215
|
+
# # ╞═════╪═════╪═════╡
|
1216
|
+
# # │ 3 ┆ 8.0 ┆ c │
|
1217
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1218
|
+
# # │ 2 ┆ 7.0 ┆ b │
|
1219
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1220
|
+
# # │ 1 ┆ 6.0 ┆ a │
|
1221
|
+
# # └─────┴─────┴─────┘
|
822
1222
|
def sort(by, reverse: false, nulls_last: false)
|
823
|
-
|
1223
|
+
if by.is_a?(Array) || by.is_a?(Expr)
|
1224
|
+
lazy
|
1225
|
+
.sort(by, reverse: reverse, nulls_last: nulls_last)
|
1226
|
+
.collect(no_optimization: true, string_cache: false)
|
1227
|
+
else
|
1228
|
+
_from_rbdf(_df.sort(by, reverse, nulls_last))
|
1229
|
+
end
|
824
1230
|
end
|
825
1231
|
|
1232
|
+
# Check if DataFrame is equal to other.
|
1233
|
+
#
|
1234
|
+
# @param other [DataFrame]
|
1235
|
+
# DataFrame to compare with.
|
1236
|
+
# @param null_equal [Boolean]
|
1237
|
+
# Consider null values as equal.
|
1238
|
+
#
|
1239
|
+
# @return [Boolean]
|
1240
|
+
#
|
1241
|
+
# @example
|
1242
|
+
# df1 = Polars::DataFrame.new(
|
1243
|
+
# {
|
1244
|
+
# "foo" => [1, 2, 3],
|
1245
|
+
# "bar" => [6.0, 7.0, 8.0],
|
1246
|
+
# "ham" => ["a", "b", "c"]
|
1247
|
+
# }
|
1248
|
+
# )
|
1249
|
+
# df2 = Polars::DataFrame.new(
|
1250
|
+
# {
|
1251
|
+
# "foo" => [3, 2, 1],
|
1252
|
+
# "bar" => [8.0, 7.0, 6.0],
|
1253
|
+
# "ham" => ["c", "b", "a"]
|
1254
|
+
# }
|
1255
|
+
# )
|
1256
|
+
# df1.frame_equal(df1)
|
1257
|
+
# # => true
|
1258
|
+
# df1.frame_equal(df2)
|
1259
|
+
# # => false
|
826
1260
|
def frame_equal(other, null_equal: true)
|
827
1261
|
_df.frame_equal(other._df, null_equal)
|
828
1262
|
end
|
829
1263
|
|
830
|
-
#
|
831
|
-
# end
|
832
|
-
|
1264
|
+
# Replace a column by a new Series.
|
833
1265
|
#
|
834
|
-
|
835
|
-
|
836
|
-
|
837
|
-
|
838
|
-
|
1266
|
+
# @param column [String]
|
1267
|
+
# Column to replace.
|
1268
|
+
# @param new_col [Series]
|
1269
|
+
# New column to insert.
|
1270
|
+
#
|
1271
|
+
# @return [DataFrame]
|
1272
|
+
#
|
1273
|
+
# @example
|
1274
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
|
1275
|
+
# s = Polars::Series.new([10, 20, 30])
|
1276
|
+
# df.replace("foo", s)
|
1277
|
+
# # =>
|
1278
|
+
# # shape: (3, 2)
|
1279
|
+
# # ┌─────┬─────┐
|
1280
|
+
# # │ foo ┆ bar │
|
1281
|
+
# # │ --- ┆ --- │
|
1282
|
+
# # │ i64 ┆ i64 │
|
1283
|
+
# # ╞═════╪═════╡
|
1284
|
+
# # │ 10 ┆ 4 │
|
1285
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1286
|
+
# # │ 20 ┆ 5 │
|
1287
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1288
|
+
# # │ 30 ┆ 6 │
|
1289
|
+
# # └─────┴─────┘
|
1290
|
+
def replace(column, new_col)
|
1291
|
+
_df.replace(column, new_col._s)
|
1292
|
+
self
|
839
1293
|
end
|
840
1294
|
|
1295
|
+
# Get a slice of this DataFrame.
|
1296
|
+
#
|
1297
|
+
# @param offset [Integer]
|
1298
|
+
# Start index. Negative indexing is supported.
|
1299
|
+
# @param length [Integer, nil]
|
1300
|
+
# Length of the slice. If set to `nil`, all rows starting at the offset
|
1301
|
+
# will be selected.
|
1302
|
+
#
|
1303
|
+
# @return [DataFrame]
|
1304
|
+
#
|
1305
|
+
# @example
|
1306
|
+
# df = Polars::DataFrame.new(
|
1307
|
+
# {
|
1308
|
+
# "foo" => [1, 2, 3],
|
1309
|
+
# "bar" => [6.0, 7.0, 8.0],
|
1310
|
+
# "ham" => ["a", "b", "c"]
|
1311
|
+
# }
|
1312
|
+
# )
|
1313
|
+
# df.slice(1, 2)
|
1314
|
+
# # =>
|
1315
|
+
# # shape: (2, 3)
|
1316
|
+
# # ┌─────┬─────┬─────┐
|
1317
|
+
# # │ foo ┆ bar ┆ ham │
|
1318
|
+
# # │ --- ┆ --- ┆ --- │
|
1319
|
+
# # │ i64 ┆ f64 ┆ str │
|
1320
|
+
# # ╞═════╪═════╪═════╡
|
1321
|
+
# # │ 2 ┆ 7.0 ┆ b │
|
1322
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1323
|
+
# # │ 3 ┆ 8.0 ┆ c │
|
1324
|
+
# # └─────┴─────┴─────┘
|
1325
|
+
def slice(offset, length = nil)
|
1326
|
+
if !length.nil? && length < 0
|
1327
|
+
length = height - offset + length
|
1328
|
+
end
|
1329
|
+
_from_rbdf(_df.slice(offset, length))
|
1330
|
+
end
|
1331
|
+
|
1332
|
+
# Get the first `n` rows.
|
1333
|
+
#
|
1334
|
+
# Alias for {#head}.
|
1335
|
+
#
|
1336
|
+
# @param n [Integer]
|
1337
|
+
# Number of rows to return.
|
1338
|
+
#
|
1339
|
+
# @return [DataFrame]
|
1340
|
+
#
|
1341
|
+
# @example
|
1342
|
+
# df = Polars::DataFrame.new(
|
1343
|
+
# {"foo" => [1, 2, 3, 4, 5, 6], "bar" => ["a", "b", "c", "d", "e", "f"]}
|
1344
|
+
# )
|
1345
|
+
# df.limit(4)
|
1346
|
+
# # =>
|
1347
|
+
# # shape: (4, 2)
|
1348
|
+
# # ┌─────┬─────┐
|
1349
|
+
# # │ foo ┆ bar │
|
1350
|
+
# # │ --- ┆ --- │
|
1351
|
+
# # │ i64 ┆ str │
|
1352
|
+
# # ╞═════╪═════╡
|
1353
|
+
# # │ 1 ┆ a │
|
1354
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1355
|
+
# # │ 2 ┆ b │
|
1356
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1357
|
+
# # │ 3 ┆ c │
|
1358
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1359
|
+
# # │ 4 ┆ d │
|
1360
|
+
# # └─────┴─────┘
|
841
1361
|
def limit(n = 5)
|
842
1362
|
head(n)
|
843
1363
|
end
|
844
1364
|
|
1365
|
+
# Get the first `n` rows.
|
1366
|
+
#
|
1367
|
+
# @param n [Integer]
|
1368
|
+
# Number of rows to return.
|
1369
|
+
#
|
1370
|
+
# @return [DataFrame]
|
1371
|
+
#
|
1372
|
+
# @example
|
1373
|
+
# df = Polars::DataFrame.new(
|
1374
|
+
# {
|
1375
|
+
# "foo" => [1, 2, 3, 4, 5],
|
1376
|
+
# "bar" => [6, 7, 8, 9, 10],
|
1377
|
+
# "ham" => ["a", "b", "c", "d", "e"]
|
1378
|
+
# }
|
1379
|
+
# )
|
1380
|
+
# df.head(3)
|
1381
|
+
# # =>
|
1382
|
+
# # shape: (3, 3)
|
1383
|
+
# # ┌─────┬─────┬─────┐
|
1384
|
+
# # │ foo ┆ bar ┆ ham │
|
1385
|
+
# # │ --- ┆ --- ┆ --- │
|
1386
|
+
# # │ i64 ┆ i64 ┆ str │
|
1387
|
+
# # ╞═════╪═════╪═════╡
|
1388
|
+
# # │ 1 ┆ 6 ┆ a │
|
1389
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1390
|
+
# # │ 2 ┆ 7 ┆ b │
|
1391
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1392
|
+
# # │ 3 ┆ 8 ┆ c │
|
1393
|
+
# # └─────┴─────┴─────┘
|
845
1394
|
def head(n = 5)
|
846
1395
|
_from_rbdf(_df.head(n))
|
847
1396
|
end
|
848
1397
|
|
1398
|
+
# Get the last `n` rows.
|
1399
|
+
#
|
1400
|
+
# @param n [Integer]
|
1401
|
+
# Number of rows to return.
|
1402
|
+
#
|
1403
|
+
# @return [DataFrame]
|
1404
|
+
#
|
1405
|
+
# @example
|
1406
|
+
# df = Polars::DataFrame.new(
|
1407
|
+
# {
|
1408
|
+
# "foo" => [1, 2, 3, 4, 5],
|
1409
|
+
# "bar" => [6, 7, 8, 9, 10],
|
1410
|
+
# "ham" => ["a", "b", "c", "d", "e"]
|
1411
|
+
# }
|
1412
|
+
# )
|
1413
|
+
# df.tail(3)
|
1414
|
+
# # =>
|
1415
|
+
# # shape: (3, 3)
|
1416
|
+
# # ┌─────┬─────┬─────┐
|
1417
|
+
# # │ foo ┆ bar ┆ ham │
|
1418
|
+
# # │ --- ┆ --- ┆ --- │
|
1419
|
+
# # │ i64 ┆ i64 ┆ str │
|
1420
|
+
# # ╞═════╪═════╪═════╡
|
1421
|
+
# # │ 3 ┆ 8 ┆ c │
|
1422
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1423
|
+
# # │ 4 ┆ 9 ┆ d │
|
1424
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1425
|
+
# # │ 5 ┆ 10 ┆ e │
|
1426
|
+
# # └─────┴─────┴─────┘
|
849
1427
|
def tail(n = 5)
|
850
1428
|
_from_rbdf(_df.tail(n))
|
851
1429
|
end
|
852
1430
|
|
853
|
-
#
|
854
|
-
#
|
1431
|
+
# Return a new DataFrame where the null values are dropped.
|
1432
|
+
#
|
1433
|
+
# @param subset [Object]
|
1434
|
+
# Subset of column(s) on which `drop_nulls` will be applied.
|
1435
|
+
#
|
1436
|
+
# @return [DataFrame]
|
1437
|
+
#
|
1438
|
+
# @example
|
1439
|
+
# df = Polars::DataFrame.new(
|
1440
|
+
# {
|
1441
|
+
# "foo" => [1, 2, 3],
|
1442
|
+
# "bar" => [6, nil, 8],
|
1443
|
+
# "ham" => ["a", "b", "c"]
|
1444
|
+
# }
|
1445
|
+
# )
|
1446
|
+
# df.drop_nulls
|
1447
|
+
# # =>
|
1448
|
+
# # shape: (2, 3)
|
1449
|
+
# # ┌─────┬─────┬─────┐
|
1450
|
+
# # │ foo ┆ bar ┆ ham │
|
1451
|
+
# # │ --- ┆ --- ┆ --- │
|
1452
|
+
# # │ i64 ┆ i64 ┆ str │
|
1453
|
+
# # ╞═════╪═════╪═════╡
|
1454
|
+
# # │ 1 ┆ 6 ┆ a │
|
1455
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1456
|
+
# # │ 3 ┆ 8 ┆ c │
|
1457
|
+
# # └─────┴─────┴─────┘
|
1458
|
+
def drop_nulls(subset: nil)
|
1459
|
+
if subset.is_a?(String)
|
1460
|
+
subset = [subset]
|
1461
|
+
end
|
1462
|
+
_from_rbdf(_df.drop_nulls(subset))
|
1463
|
+
end
|
855
1464
|
|
856
1465
|
# def pipe
|
857
1466
|
# end
|
858
1467
|
|
859
|
-
#
|
860
|
-
#
|
861
|
-
|
1468
|
+
# Add a column at index 0 that counts the rows.
|
1469
|
+
#
|
1470
|
+
# @param name [String]
|
1471
|
+
# Name of the column to add.
|
1472
|
+
# @param offset [Integer]
|
1473
|
+
# Start the row count at this offset.
|
1474
|
+
#
|
1475
|
+
# @return [DataFrame]
|
1476
|
+
#
|
1477
|
+
# @example
|
1478
|
+
# df = Polars::DataFrame.new(
|
1479
|
+
# {
|
1480
|
+
# "a" => [1, 3, 5],
|
1481
|
+
# "b" => [2, 4, 6]
|
1482
|
+
# }
|
1483
|
+
# )
|
1484
|
+
# df.with_row_count
|
1485
|
+
# # =>
|
1486
|
+
# # shape: (3, 3)
|
1487
|
+
# # ┌────────┬─────┬─────┐
|
1488
|
+
# # │ row_nr ┆ a ┆ b │
|
1489
|
+
# # │ --- ┆ --- ┆ --- │
|
1490
|
+
# # │ u32 ┆ i64 ┆ i64 │
|
1491
|
+
# # ╞════════╪═════╪═════╡
|
1492
|
+
# # │ 0 ┆ 1 ┆ 2 │
|
1493
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1494
|
+
# # │ 1 ┆ 3 ┆ 4 │
|
1495
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1496
|
+
# # │ 2 ┆ 5 ┆ 6 │
|
1497
|
+
# # └────────┴─────┴─────┘
|
1498
|
+
def with_row_count(name: "row_nr", offset: 0)
|
1499
|
+
_from_rbdf(_df.with_row_count(name, offset))
|
1500
|
+
end
|
1501
|
+
|
1502
|
+
# Start a groupby operation.
|
862
1503
|
#
|
1504
|
+
# @param by [Object]
|
1505
|
+
# Column(s) to group by.
|
1506
|
+
# @param maintain_order [Boolean]
|
1507
|
+
# Make sure that the order of the groups remains consistent. This is more
|
1508
|
+
# expensive than a default groupby. Note that this only works in expression
|
1509
|
+
# aggregations.
|
1510
|
+
#
|
1511
|
+
# @return [GroupBy]
|
1512
|
+
#
|
1513
|
+
# @example
|
1514
|
+
# df = Polars::DataFrame.new(
|
1515
|
+
# {
|
1516
|
+
# "a" => ["a", "b", "a", "b", "b", "c"],
|
1517
|
+
# "b" => [1, 2, 3, 4, 5, 6],
|
1518
|
+
# "c" => [6, 5, 4, 3, 2, 1]
|
1519
|
+
# }
|
1520
|
+
# )
|
1521
|
+
# df.groupby("a").agg(Polars.col("b").sum).sort("a")
|
1522
|
+
# # =>
|
1523
|
+
# # shape: (3, 2)
|
1524
|
+
# # ┌─────┬─────┐
|
1525
|
+
# # │ a ┆ b │
|
1526
|
+
# # │ --- ┆ --- │
|
1527
|
+
# # │ str ┆ i64 │
|
1528
|
+
# # ╞═════╪═════╡
|
1529
|
+
# # │ a ┆ 4 │
|
1530
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1531
|
+
# # │ b ┆ 11 │
|
1532
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1533
|
+
# # │ c ┆ 6 │
|
1534
|
+
# # └─────┴─────┘
|
863
1535
|
def groupby(by, maintain_order: false)
|
864
|
-
|
1536
|
+
if !Utils.bool?(maintain_order)
|
1537
|
+
raise TypeError, "invalid input for groupby arg `maintain_order`: #{maintain_order}."
|
1538
|
+
end
|
1539
|
+
if by.is_a?(String)
|
1540
|
+
by = [by]
|
1541
|
+
end
|
1542
|
+
GroupBy.new(
|
1543
|
+
_df,
|
1544
|
+
by,
|
1545
|
+
self.class,
|
1546
|
+
maintain_order: maintain_order
|
1547
|
+
)
|
865
1548
|
end
|
866
1549
|
|
867
1550
|
# def groupby_rolling
|
@@ -876,7 +1559,109 @@ module Polars
|
|
876
1559
|
# def join_asof
|
877
1560
|
# end
|
878
1561
|
|
1562
|
+
# Join in SQL-like fashion.
|
1563
|
+
#
|
1564
|
+
# @param other [DataFrame]
|
1565
|
+
# DataFrame to join with.
|
1566
|
+
# @param left_on [Object]
|
1567
|
+
# Name(s) of the left join column(s).
|
1568
|
+
# @param right_on [Object]
|
1569
|
+
# Name(s) of the right join column(s).
|
1570
|
+
# @param on [Object]
|
1571
|
+
# Name(s) of the join columns in both DataFrames.
|
1572
|
+
# @param how ["inner", "left", "outer", "semi", "anti", "cross"]
|
1573
|
+
# Join strategy.
|
1574
|
+
# @param suffix [String]
|
1575
|
+
# Suffix to append to columns with a duplicate name.
|
1576
|
+
#
|
1577
|
+
# @return [DataFrame]
|
1578
|
+
#
|
1579
|
+
# @example
|
1580
|
+
# df = Polars::DataFrame.new(
|
1581
|
+
# {
|
1582
|
+
# "foo" => [1, 2, 3],
|
1583
|
+
# "bar" => [6.0, 7.0, 8.0],
|
1584
|
+
# "ham" => ["a", "b", "c"]
|
1585
|
+
# }
|
1586
|
+
# )
|
1587
|
+
# other_df = Polars::DataFrame.new(
|
1588
|
+
# {
|
1589
|
+
# "apple" => ["x", "y", "z"],
|
1590
|
+
# "ham" => ["a", "b", "d"]
|
1591
|
+
# }
|
1592
|
+
# )
|
1593
|
+
# df.join(other_df, on: "ham")
|
1594
|
+
# # =>
|
1595
|
+
# # shape: (2, 4)
|
1596
|
+
# # ┌─────┬─────┬─────┬───────┐
|
1597
|
+
# # │ foo ┆ bar ┆ ham ┆ apple │
|
1598
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
1599
|
+
# # │ i64 ┆ f64 ┆ str ┆ str │
|
1600
|
+
# # ╞═════╪═════╪═════╪═══════╡
|
1601
|
+
# # │ 1 ┆ 6.0 ┆ a ┆ x │
|
1602
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1603
|
+
# # │ 2 ┆ 7.0 ┆ b ┆ y │
|
1604
|
+
# # └─────┴─────┴─────┴───────┘
|
1605
|
+
#
|
1606
|
+
# @example
|
1607
|
+
# df.join(other_df, on: "ham", how: "outer")
|
1608
|
+
# # =>
|
1609
|
+
# # shape: (4, 4)
|
1610
|
+
# # ┌──────┬──────┬─────┬───────┐
|
1611
|
+
# # │ foo ┆ bar ┆ ham ┆ apple │
|
1612
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
1613
|
+
# # │ i64 ┆ f64 ┆ str ┆ str │
|
1614
|
+
# # ╞══════╪══════╪═════╪═══════╡
|
1615
|
+
# # │ 1 ┆ 6.0 ┆ a ┆ x │
|
1616
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1617
|
+
# # │ 2 ┆ 7.0 ┆ b ┆ y │
|
1618
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1619
|
+
# # │ null ┆ null ┆ d ┆ z │
|
1620
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1621
|
+
# # │ 3 ┆ 8.0 ┆ c ┆ null │
|
1622
|
+
# # └──────┴──────┴─────┴───────┘
|
1623
|
+
#
|
1624
|
+
# @example
|
1625
|
+
# df.join(other_df, on: "ham", how: "left")
|
1626
|
+
# # =>
|
1627
|
+
# # shape: (3, 4)
|
1628
|
+
# # ┌─────┬─────┬─────┬───────┐
|
1629
|
+
# # │ foo ┆ bar ┆ ham ┆ apple │
|
1630
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
1631
|
+
# # │ i64 ┆ f64 ┆ str ┆ str │
|
1632
|
+
# # ╞═════╪═════╪═════╪═══════╡
|
1633
|
+
# # │ 1 ┆ 6.0 ┆ a ┆ x │
|
1634
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1635
|
+
# # │ 2 ┆ 7.0 ┆ b ┆ y │
|
1636
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1637
|
+
# # │ 3 ┆ 8.0 ┆ c ┆ null │
|
1638
|
+
# # └─────┴─────┴─────┴───────┘
|
879
1639
|
#
|
1640
|
+
# @example
|
1641
|
+
# df.join(other_df, on: "ham", how: "semi")
|
1642
|
+
# # =>
|
1643
|
+
# # shape: (2, 3)
|
1644
|
+
# # ┌─────┬─────┬─────┐
|
1645
|
+
# # │ foo ┆ bar ┆ ham │
|
1646
|
+
# # │ --- ┆ --- ┆ --- │
|
1647
|
+
# # │ i64 ┆ f64 ┆ str │
|
1648
|
+
# # ╞═════╪═════╪═════╡
|
1649
|
+
# # │ 1 ┆ 6.0 ┆ a │
|
1650
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1651
|
+
# # │ 2 ┆ 7.0 ┆ b │
|
1652
|
+
# # └─────┴─────┴─────┘
|
1653
|
+
#
|
1654
|
+
# @example
|
1655
|
+
# df.join(other_df, on: "ham", how: "anti")
|
1656
|
+
# # =>
|
1657
|
+
# # shape: (1, 3)
|
1658
|
+
# # ┌─────┬─────┬─────┐
|
1659
|
+
# # │ foo ┆ bar ┆ ham │
|
1660
|
+
# # │ --- ┆ --- ┆ --- │
|
1661
|
+
# # │ i64 ┆ f64 ┆ str │
|
1662
|
+
# # ╞═════╪═════╪═════╡
|
1663
|
+
# # │ 3 ┆ 8.0 ┆ c │
|
1664
|
+
# # └─────┴─────┴─────┘
|
880
1665
|
def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
|
881
1666
|
lazy
|
882
1667
|
.join(
|
@@ -893,41 +1678,322 @@ module Polars
|
|
893
1678
|
# def apply
|
894
1679
|
# end
|
895
1680
|
|
1681
|
+
# Return a new DataFrame with the column added or replaced.
|
1682
|
+
#
|
1683
|
+
# @param column [Object]
|
1684
|
+
# Series, where the name of the Series refers to the column in the DataFrame.
|
1685
|
+
#
|
1686
|
+
# @return [DataFrame]
|
896
1687
|
#
|
1688
|
+
# @example Added
|
1689
|
+
# df = Polars::DataFrame.new(
|
1690
|
+
# {
|
1691
|
+
# "a" => [1, 3, 5],
|
1692
|
+
# "b" => [2, 4, 6]
|
1693
|
+
# }
|
1694
|
+
# )
|
1695
|
+
# df.with_column((Polars.col("b") ** 2).alias("b_squared"))
|
1696
|
+
# # =>
|
1697
|
+
# # shape: (3, 3)
|
1698
|
+
# # ┌─────┬─────┬───────────┐
|
1699
|
+
# # │ a ┆ b ┆ b_squared │
|
1700
|
+
# # │ --- ┆ --- ┆ --- │
|
1701
|
+
# # │ i64 ┆ i64 ┆ f64 │
|
1702
|
+
# # ╞═════╪═════╪═══════════╡
|
1703
|
+
# # │ 1 ┆ 2 ┆ 4.0 │
|
1704
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
|
1705
|
+
# # │ 3 ┆ 4 ┆ 16.0 │
|
1706
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
|
1707
|
+
# # │ 5 ┆ 6 ┆ 36.0 │
|
1708
|
+
# # └─────┴─────┴───────────┘
|
1709
|
+
#
|
1710
|
+
# @example Replaced
|
1711
|
+
# df.with_column(Polars.col("a") ** 2)
|
1712
|
+
# # =>
|
1713
|
+
# # shape: (3, 2)
|
1714
|
+
# # ┌──────┬─────┐
|
1715
|
+
# # │ a ┆ b │
|
1716
|
+
# # │ --- ┆ --- │
|
1717
|
+
# # │ f64 ┆ i64 │
|
1718
|
+
# # ╞══════╪═════╡
|
1719
|
+
# # │ 1.0 ┆ 2 │
|
1720
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1721
|
+
# # │ 9.0 ┆ 4 │
|
1722
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1723
|
+
# # │ 25.0 ┆ 6 │
|
1724
|
+
# # └──────┴─────┘
|
897
1725
|
def with_column(column)
|
898
1726
|
lazy
|
899
1727
|
.with_column(column)
|
900
1728
|
.collect(no_optimization: true, string_cache: false)
|
901
1729
|
end
|
902
1730
|
|
903
|
-
#
|
904
|
-
#
|
1731
|
+
# Return a new DataFrame grown horizontally by stacking multiple Series to it.
|
1732
|
+
#
|
1733
|
+
# @param columns [Object]
|
1734
|
+
# Series to stack.
|
1735
|
+
# @param in_place [Boolean]
|
1736
|
+
# Modify in place.
|
1737
|
+
#
|
1738
|
+
# @return [DataFrame]
|
1739
|
+
#
|
1740
|
+
# @example
|
1741
|
+
# df = Polars::DataFrame.new(
|
1742
|
+
# {
|
1743
|
+
# "foo" => [1, 2, 3],
|
1744
|
+
# "bar" => [6, 7, 8],
|
1745
|
+
# "ham" => ["a", "b", "c"]
|
1746
|
+
# }
|
1747
|
+
# )
|
1748
|
+
# x = Polars::Series.new("apple", [10, 20, 30])
|
1749
|
+
# df.hstack([x])
|
1750
|
+
# # =>
|
1751
|
+
# # shape: (3, 4)
|
1752
|
+
# # ┌─────┬─────┬─────┬───────┐
|
1753
|
+
# # │ foo ┆ bar ┆ ham ┆ apple │
|
1754
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
1755
|
+
# # │ i64 ┆ i64 ┆ str ┆ i64 │
|
1756
|
+
# # ╞═════╪═════╪═════╪═══════╡
|
1757
|
+
# # │ 1 ┆ 6 ┆ a ┆ 10 │
|
1758
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1759
|
+
# # │ 2 ┆ 7 ┆ b ┆ 20 │
|
1760
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1761
|
+
# # │ 3 ┆ 8 ┆ c ┆ 30 │
|
1762
|
+
# # └─────┴─────┴─────┴───────┘
|
1763
|
+
def hstack(columns, in_place: false)
|
1764
|
+
if !columns.is_a?(Array)
|
1765
|
+
columns = columns.get_columns
|
1766
|
+
end
|
1767
|
+
if in_place
|
1768
|
+
_df.hstack_mut(columns.map(&:_s))
|
1769
|
+
self
|
1770
|
+
else
|
1771
|
+
_from_rbdf(_df.hstack(columns.map(&:_s)))
|
1772
|
+
end
|
1773
|
+
end
|
905
1774
|
|
906
|
-
#
|
907
|
-
#
|
1775
|
+
# Grow this DataFrame vertically by stacking a DataFrame to it.
|
1776
|
+
#
|
1777
|
+
# @param df [DataFrame]
|
1778
|
+
# DataFrame to stack.
|
1779
|
+
# @param in_place [Boolean]
|
1780
|
+
# Modify in place.
|
1781
|
+
#
|
1782
|
+
# @return [DataFrame]
|
1783
|
+
#
|
1784
|
+
# @example
|
1785
|
+
# df1 = Polars::DataFrame.new(
|
1786
|
+
# {
|
1787
|
+
# "foo" => [1, 2],
|
1788
|
+
# "bar" => [6, 7],
|
1789
|
+
# "ham" => ["a", "b"]
|
1790
|
+
# }
|
1791
|
+
# )
|
1792
|
+
# df2 = Polars::DataFrame.new(
|
1793
|
+
# {
|
1794
|
+
# "foo" => [3, 4],
|
1795
|
+
# "bar" => [8, 9],
|
1796
|
+
# "ham" => ["c", "d"]
|
1797
|
+
# }
|
1798
|
+
# )
|
1799
|
+
# df1.vstack(df2)
|
1800
|
+
# # =>
|
1801
|
+
# # shape: (4, 3)
|
1802
|
+
# # ┌─────┬─────┬─────┐
|
1803
|
+
# # │ foo ┆ bar ┆ ham │
|
1804
|
+
# # │ --- ┆ --- ┆ --- │
|
1805
|
+
# # │ i64 ┆ i64 ┆ str │
|
1806
|
+
# # ╞═════╪═════╪═════╡
|
1807
|
+
# # │ 1 ┆ 6 ┆ a │
|
1808
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1809
|
+
# # │ 2 ┆ 7 ┆ b │
|
1810
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1811
|
+
# # │ 3 ┆ 8 ┆ c │
|
1812
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1813
|
+
# # │ 4 ┆ 9 ┆ d │
|
1814
|
+
# # └─────┴─────┴─────┘
|
1815
|
+
def vstack(df, in_place: false)
|
1816
|
+
if in_place
|
1817
|
+
_df.vstack_mut(df._df)
|
1818
|
+
self
|
1819
|
+
else
|
1820
|
+
_from_rbdf(_df.vstack(df._df))
|
1821
|
+
end
|
1822
|
+
end
|
908
1823
|
|
1824
|
+
# Extend the memory backed by this `DataFrame` with the values from `other`.
|
1825
|
+
#
|
1826
|
+
# Different from `vstack` which adds the chunks from `other` to the chunks of this
|
1827
|
+
# `DataFrame`, `extend` appends the data from `other` to the underlying memory
|
1828
|
+
# locations and thus may cause a reallocation.
|
1829
|
+
#
|
1830
|
+
# If this does not cause a reallocation, the resulting data structure will not
|
1831
|
+
# have any extra chunks and thus will yield faster queries.
|
1832
|
+
#
|
1833
|
+
# Prefer `extend` over `vstack` when you want to do a query after a single append.
|
1834
|
+
# For instance during online operations where you add `n` rows and rerun a query.
|
1835
|
+
#
|
1836
|
+
# Prefer `vstack` over `extend` when you want to append many times before doing a
|
1837
|
+
# query. For instance when you read in multiple files and want to store them in a
|
1838
|
+
# single `DataFrame`. In the latter case, finish the sequence of `vstack`
|
1839
|
+
# operations with a `rechunk`.
|
1840
|
+
#
|
1841
|
+
# @param other [DataFrame]
|
1842
|
+
# DataFrame to vertically add.
|
1843
|
+
#
|
1844
|
+
# @return [DataFrame]
|
909
1845
|
#
|
1846
|
+
# @example
|
1847
|
+
# df1 = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
|
1848
|
+
# df2 = Polars::DataFrame.new({"foo" => [10, 20, 30], "bar" => [40, 50, 60]})
|
1849
|
+
# df1.extend(df2)
|
1850
|
+
# # =>
|
1851
|
+
# # shape: (6, 2)
|
1852
|
+
# # ┌─────┬─────┐
|
1853
|
+
# # │ foo ┆ bar │
|
1854
|
+
# # │ --- ┆ --- │
|
1855
|
+
# # │ i64 ┆ i64 │
|
1856
|
+
# # ╞═════╪═════╡
|
1857
|
+
# # │ 1 ┆ 4 │
|
1858
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1859
|
+
# # │ 2 ┆ 5 │
|
1860
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1861
|
+
# # │ 3 ┆ 6 │
|
1862
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1863
|
+
# # │ 10 ┆ 40 │
|
1864
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1865
|
+
# # │ 20 ┆ 50 │
|
1866
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1867
|
+
# # │ 30 ┆ 60 │
|
1868
|
+
# # └─────┴─────┘
|
910
1869
|
def extend(other)
|
911
1870
|
_df.extend(other._df)
|
912
1871
|
self
|
913
1872
|
end
|
914
1873
|
|
915
|
-
#
|
916
|
-
#
|
1874
|
+
# Remove column from DataFrame and return as new.
|
1875
|
+
#
|
1876
|
+
# @param columns [Object]
|
1877
|
+
# Column(s) to drop.
|
1878
|
+
#
|
1879
|
+
# @return [DataFrame]
|
1880
|
+
#
|
1881
|
+
# @example
|
1882
|
+
# df = Polars::DataFrame.new(
|
1883
|
+
# {
|
1884
|
+
# "foo" => [1, 2, 3],
|
1885
|
+
# "bar" => [6.0, 7.0, 8.0],
|
1886
|
+
# "ham" => ["a", "b", "c"]
|
1887
|
+
# }
|
1888
|
+
# )
|
1889
|
+
# df.drop("ham")
|
1890
|
+
# # =>
|
1891
|
+
# # shape: (3, 2)
|
1892
|
+
# # ┌─────┬─────┐
|
1893
|
+
# # │ foo ┆ bar │
|
1894
|
+
# # │ --- ┆ --- │
|
1895
|
+
# # │ i64 ┆ f64 │
|
1896
|
+
# # ╞═════╪═════╡
|
1897
|
+
# # │ 1 ┆ 6.0 │
|
1898
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1899
|
+
# # │ 2 ┆ 7.0 │
|
1900
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1901
|
+
# # │ 3 ┆ 8.0 │
|
1902
|
+
# # └─────┴─────┘
|
1903
|
+
def drop(columns)
|
1904
|
+
if columns.is_a?(Array)
|
1905
|
+
df = clone
|
1906
|
+
columns.each do |n|
|
1907
|
+
df._df.drop_in_place(n)
|
1908
|
+
end
|
1909
|
+
df
|
1910
|
+
else
|
1911
|
+
_from_rbdf(_df.drop(columns))
|
1912
|
+
end
|
1913
|
+
end
|
917
1914
|
|
918
|
-
#
|
919
|
-
#
|
1915
|
+
# Drop in place.
|
1916
|
+
#
|
1917
|
+
# @param name [Object]
|
1918
|
+
# Column to drop.
|
1919
|
+
#
|
1920
|
+
# @return [Series]
|
1921
|
+
#
|
1922
|
+
# @example
|
1923
|
+
# df = Polars::DataFrame.new(
|
1924
|
+
# {
|
1925
|
+
# "foo" => [1, 2, 3],
|
1926
|
+
# "bar" => [6, 7, 8],
|
1927
|
+
# "ham" => ["a", "b", "c"]
|
1928
|
+
# }
|
1929
|
+
# )
|
1930
|
+
# df.drop_in_place("ham")
|
1931
|
+
# # =>
|
1932
|
+
# # shape: (3,)
|
1933
|
+
# # Series: 'ham' [str]
|
1934
|
+
# # [
|
1935
|
+
# # "a"
|
1936
|
+
# # "b"
|
1937
|
+
# # "c"
|
1938
|
+
# # ]
|
1939
|
+
def drop_in_place(name)
|
1940
|
+
Utils.wrap_s(_df.drop_in_place(name))
|
1941
|
+
end
|
920
1942
|
|
921
|
-
#
|
922
|
-
#
|
1943
|
+
# Create an empty copy of the current DataFrame.
|
1944
|
+
#
|
1945
|
+
# Returns a DataFrame with identical schema but no data.
|
1946
|
+
#
|
1947
|
+
# @return [DataFrame]
|
1948
|
+
#
|
1949
|
+
# @example
|
1950
|
+
# df = Polars::DataFrame.new(
|
1951
|
+
# {
|
1952
|
+
# "a" => [nil, 2, 3, 4],
|
1953
|
+
# "b" => [0.5, nil, 2.5, 13],
|
1954
|
+
# "c" => [true, true, false, nil]
|
1955
|
+
# }
|
1956
|
+
# )
|
1957
|
+
# df.cleared
|
1958
|
+
# # =>
|
1959
|
+
# # shape: (0, 3)
|
1960
|
+
# # ┌─────┬─────┬──────┐
|
1961
|
+
# # │ a ┆ b ┆ c │
|
1962
|
+
# # │ --- ┆ --- ┆ --- │
|
1963
|
+
# # │ i64 ┆ f64 ┆ bool │
|
1964
|
+
# # ╞═════╪═════╪══════╡
|
1965
|
+
# # └─────┴─────┴──────┘
|
1966
|
+
def cleared
|
1967
|
+
height > 0 ? head(0) : clone
|
1968
|
+
end
|
923
1969
|
|
924
1970
|
# clone handled by initialize_copy
|
925
1971
|
|
1972
|
+
# Get the DataFrame as an Array of Series.
|
926
1973
|
#
|
1974
|
+
# @return [Array]
|
927
1975
|
def get_columns
|
928
1976
|
_df.get_columns.map { |s| Utils.wrap_s(s) }
|
929
1977
|
end
|
930
1978
|
|
1979
|
+
# Get a single column as Series by name.
|
1980
|
+
#
|
1981
|
+
# @param name [String]
|
1982
|
+
# Name of the column to retrieve.
|
1983
|
+
#
|
1984
|
+
# @return [Series]
|
1985
|
+
#
|
1986
|
+
# @example
|
1987
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
|
1988
|
+
# df.get_column("foo")
|
1989
|
+
# # =>
|
1990
|
+
# # shape: (3,)
|
1991
|
+
# # Series: 'foo' [i64]
|
1992
|
+
# # [
|
1993
|
+
# # 1
|
1994
|
+
# # 2
|
1995
|
+
# # 3
|
1996
|
+
# # ]
|
931
1997
|
def get_column(name)
|
932
1998
|
self[name]
|
933
1999
|
end
|
@@ -935,13 +2001,85 @@ module Polars
|
|
935
2001
|
# def fill_null
|
936
2002
|
# end
|
937
2003
|
|
2004
|
+
# Fill floating point NaN values by an Expression evaluation.
|
2005
|
+
#
|
2006
|
+
# @param fill_value [Object]
|
2007
|
+
# Value to fill NaN with.
|
2008
|
+
#
|
2009
|
+
# @return [DataFrame]
|
2010
|
+
#
|
2011
|
+
# @note
|
2012
|
+
# Note that floating point NaNs (Not a Number) are not missing values!
|
2013
|
+
# To replace missing values, use `fill_null`.
|
938
2014
|
#
|
2015
|
+
# @example
|
2016
|
+
# df = Polars::DataFrame.new(
|
2017
|
+
# {
|
2018
|
+
# "a" => [1.5, 2, Float::NAN, 4],
|
2019
|
+
# "b" => [0.5, 4, Float::NAN, 13]
|
2020
|
+
# }
|
2021
|
+
# )
|
2022
|
+
# df.fill_nan(99)
|
2023
|
+
# # =>
|
2024
|
+
# # shape: (4, 2)
|
2025
|
+
# # ┌──────┬──────┐
|
2026
|
+
# # │ a ┆ b │
|
2027
|
+
# # │ --- ┆ --- │
|
2028
|
+
# # │ f64 ┆ f64 │
|
2029
|
+
# # ╞══════╪══════╡
|
2030
|
+
# # │ 1.5 ┆ 0.5 │
|
2031
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2032
|
+
# # │ 2.0 ┆ 4.0 │
|
2033
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2034
|
+
# # │ 99.0 ┆ 99.0 │
|
2035
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2036
|
+
# # │ 4.0 ┆ 13.0 │
|
2037
|
+
# # └──────┴──────┘
|
939
2038
|
def fill_nan(fill_value)
|
940
2039
|
lazy.fill_nan(fill_value).collect(no_optimization: true)
|
941
2040
|
end
|
942
2041
|
|
943
|
-
#
|
944
|
-
#
|
2042
|
+
# Explode `DataFrame` to long format by exploding a column with Lists.
|
2043
|
+
#
|
2044
|
+
# @param columns [Object]
|
2045
|
+
# Column of LargeList type.
|
2046
|
+
#
|
2047
|
+
# @return [DataFrame]
|
2048
|
+
#
|
2049
|
+
# @example
|
2050
|
+
# df = Polars::DataFrame.new(
|
2051
|
+
# {
|
2052
|
+
# "letters" => ["a", "a", "b", "c"],
|
2053
|
+
# "numbers" => [[1], [2, 3], [4, 5], [6, 7, 8]]
|
2054
|
+
# }
|
2055
|
+
# )
|
2056
|
+
# df.explode("numbers")
|
2057
|
+
# # =>
|
2058
|
+
# # shape: (8, 2)
|
2059
|
+
# # ┌─────────┬─────────┐
|
2060
|
+
# # │ letters ┆ numbers │
|
2061
|
+
# # │ --- ┆ --- │
|
2062
|
+
# # │ str ┆ i64 │
|
2063
|
+
# # ╞═════════╪═════════╡
|
2064
|
+
# # │ a ┆ 1 │
|
2065
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
2066
|
+
# # │ a ┆ 2 │
|
2067
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
2068
|
+
# # │ a ┆ 3 │
|
2069
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
2070
|
+
# # │ b ┆ 4 │
|
2071
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
2072
|
+
# # │ b ┆ 5 │
|
2073
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
2074
|
+
# # │ c ┆ 6 │
|
2075
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
2076
|
+
# # │ c ┆ 7 │
|
2077
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
2078
|
+
# # │ c ┆ 8 │
|
2079
|
+
# # └─────────┴─────────┘
|
2080
|
+
def explode(columns)
|
2081
|
+
lazy.explode(columns).collect(no_optimization: true)
|
2082
|
+
end
|
945
2083
|
|
946
2084
|
# def pivot
|
947
2085
|
# end
|
@@ -955,25 +2093,242 @@ module Polars
|
|
955
2093
|
# def partition_by
|
956
2094
|
# end
|
957
2095
|
|
958
|
-
#
|
959
|
-
#
|
960
|
-
|
961
|
-
#
|
962
|
-
#
|
2096
|
+
# Shift values by the given period.
|
2097
|
+
#
|
2098
|
+
# @param periods [Integer]
|
2099
|
+
# Number of places to shift (may be negative).
|
2100
|
+
#
|
2101
|
+
# @return [DataFrame]
|
2102
|
+
#
|
2103
|
+
# @example
|
2104
|
+
# df = Polars::DataFrame.new(
|
2105
|
+
# {
|
2106
|
+
# "foo" => [1, 2, 3],
|
2107
|
+
# "bar" => [6, 7, 8],
|
2108
|
+
# "ham" => ["a", "b", "c"]
|
2109
|
+
# }
|
2110
|
+
# )
|
2111
|
+
# df.shift(1)
|
2112
|
+
# # =>
|
2113
|
+
# # shape: (3, 3)
|
2114
|
+
# # ┌──────┬──────┬──────┐
|
2115
|
+
# # │ foo ┆ bar ┆ ham │
|
2116
|
+
# # │ --- ┆ --- ┆ --- │
|
2117
|
+
# # │ i64 ┆ i64 ┆ str │
|
2118
|
+
# # ╞══════╪══════╪══════╡
|
2119
|
+
# # │ null ┆ null ┆ null │
|
2120
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2121
|
+
# # │ 1 ┆ 6 ┆ a │
|
2122
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2123
|
+
# # │ 2 ┆ 7 ┆ b │
|
2124
|
+
# # └──────┴──────┴──────┘
|
2125
|
+
#
|
2126
|
+
# @example
|
2127
|
+
# df.shift(-1)
|
2128
|
+
# # =>
|
2129
|
+
# # shape: (3, 3)
|
2130
|
+
# # ┌──────┬──────┬──────┐
|
2131
|
+
# # │ foo ┆ bar ┆ ham │
|
2132
|
+
# # │ --- ┆ --- ┆ --- │
|
2133
|
+
# # │ i64 ┆ i64 ┆ str │
|
2134
|
+
# # ╞══════╪══════╪══════╡
|
2135
|
+
# # │ 2 ┆ 7 ┆ b │
|
2136
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2137
|
+
# # │ 3 ┆ 8 ┆ c │
|
2138
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2139
|
+
# # │ null ┆ null ┆ null │
|
2140
|
+
# # └──────┴──────┴──────┘
|
2141
|
+
def shift(periods)
|
2142
|
+
_from_rbdf(_df.shift(periods))
|
2143
|
+
end
|
2144
|
+
|
2145
|
+
# Shift the values by a given period and fill the resulting null values.
|
2146
|
+
#
|
2147
|
+
# @param periods [Integer]
|
2148
|
+
# Number of places to shift (may be negative).
|
2149
|
+
# @param fill_value [Object]
|
2150
|
+
# Fill nil values with this value.
|
2151
|
+
#
|
2152
|
+
# @return [DataFrame]
|
2153
|
+
#
|
2154
|
+
# @example
|
2155
|
+
# df = Polars::DataFrame.new(
|
2156
|
+
# {
|
2157
|
+
# "foo" => [1, 2, 3],
|
2158
|
+
# "bar" => [6, 7, 8],
|
2159
|
+
# "ham" => ["a", "b", "c"]
|
2160
|
+
# }
|
2161
|
+
# )
|
2162
|
+
# df.shift_and_fill(1, 0)
|
2163
|
+
# # =>
|
2164
|
+
# # shape: (3, 3)
|
2165
|
+
# # ┌─────┬─────┬─────┐
|
2166
|
+
# # │ foo ┆ bar ┆ ham │
|
2167
|
+
# # │ --- ┆ --- ┆ --- │
|
2168
|
+
# # │ i64 ┆ i64 ┆ str │
|
2169
|
+
# # ╞═════╪═════╪═════╡
|
2170
|
+
# # │ 0 ┆ 0 ┆ 0 │
|
2171
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
2172
|
+
# # │ 1 ┆ 6 ┆ a │
|
2173
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
2174
|
+
# # │ 2 ┆ 7 ┆ b │
|
2175
|
+
# # └─────┴─────┴─────┘
|
2176
|
+
def shift_and_fill(periods, fill_value)
|
2177
|
+
lazy
|
2178
|
+
.shift_and_fill(periods, fill_value)
|
2179
|
+
.collect(no_optimization: true, string_cache: false)
|
2180
|
+
end
|
963
2181
|
|
2182
|
+
# Get a mask of all duplicated rows in this DataFrame.
|
2183
|
+
#
|
2184
|
+
# @return [Series]
|
964
2185
|
#
|
2186
|
+
# @example
|
2187
|
+
# df = Polars::DataFrame.new(
|
2188
|
+
# {
|
2189
|
+
# "a" => [1, 2, 3, 1],
|
2190
|
+
# "b" => ["x", "y", "z", "x"],
|
2191
|
+
# }
|
2192
|
+
# )
|
2193
|
+
# df.is_duplicated
|
2194
|
+
# # =>
|
2195
|
+
# # shape: (4,)
|
2196
|
+
# # Series: '' [bool]
|
2197
|
+
# # [
|
2198
|
+
# # true
|
2199
|
+
# # false
|
2200
|
+
# # false
|
2201
|
+
# # true
|
2202
|
+
# # ]
|
965
2203
|
def is_duplicated
|
966
2204
|
Utils.wrap_s(_df.is_duplicated)
|
967
2205
|
end
|
968
2206
|
|
2207
|
+
# Get a mask of all unique rows in this DataFrame.
|
2208
|
+
#
|
2209
|
+
# @return [Series]
|
2210
|
+
#
|
2211
|
+
# @example
|
2212
|
+
# df = Polars::DataFrame.new(
|
2213
|
+
# {
|
2214
|
+
# "a" => [1, 2, 3, 1],
|
2215
|
+
# "b" => ["x", "y", "z", "x"]
|
2216
|
+
# }
|
2217
|
+
# )
|
2218
|
+
# df.is_unique
|
2219
|
+
# # =>
|
2220
|
+
# # shape: (4,)
|
2221
|
+
# # Series: '' [bool]
|
2222
|
+
# # [
|
2223
|
+
# # false
|
2224
|
+
# # true
|
2225
|
+
# # true
|
2226
|
+
# # false
|
2227
|
+
# # ]
|
969
2228
|
def is_unique
|
970
2229
|
Utils.wrap_s(_df.is_unique)
|
971
2230
|
end
|
972
2231
|
|
2232
|
+
# Start a lazy query from this point.
|
2233
|
+
#
|
2234
|
+
# @return [LazyFrame]
|
973
2235
|
def lazy
|
974
2236
|
wrap_ldf(_df.lazy)
|
975
2237
|
end
|
976
2238
|
|
2239
|
+
# Select columns from this DataFrame.
|
2240
|
+
#
|
2241
|
+
# @param exprs [Object]
|
2242
|
+
# Column or columns to select.
|
2243
|
+
#
|
2244
|
+
# @return [DataFrame]
|
2245
|
+
#
|
2246
|
+
# @example
|
2247
|
+
# df = Polars::DataFrame.new(
|
2248
|
+
# {
|
2249
|
+
# "foo" => [1, 2, 3],
|
2250
|
+
# "bar" => [6, 7, 8],
|
2251
|
+
# "ham" => ["a", "b", "c"]
|
2252
|
+
# }
|
2253
|
+
# )
|
2254
|
+
# df.select("foo")
|
2255
|
+
# # =>
|
2256
|
+
# # shape: (3, 1)
|
2257
|
+
# # ┌─────┐
|
2258
|
+
# # │ foo │
|
2259
|
+
# # │ --- │
|
2260
|
+
# # │ i64 │
|
2261
|
+
# # ╞═════╡
|
2262
|
+
# # │ 1 │
|
2263
|
+
# # ├╌╌╌╌╌┤
|
2264
|
+
# # │ 2 │
|
2265
|
+
# # ├╌╌╌╌╌┤
|
2266
|
+
# # │ 3 │
|
2267
|
+
# # └─────┘
|
2268
|
+
#
|
2269
|
+
# @example
|
2270
|
+
# df.select(["foo", "bar"])
|
2271
|
+
# # =>
|
2272
|
+
# # shape: (3, 2)
|
2273
|
+
# # ┌─────┬─────┐
|
2274
|
+
# # │ foo ┆ bar │
|
2275
|
+
# # │ --- ┆ --- │
|
2276
|
+
# # │ i64 ┆ i64 │
|
2277
|
+
# # ╞═════╪═════╡
|
2278
|
+
# # │ 1 ┆ 6 │
|
2279
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
2280
|
+
# # │ 2 ┆ 7 │
|
2281
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
2282
|
+
# # │ 3 ┆ 8 │
|
2283
|
+
# # └─────┴─────┘
|
2284
|
+
#
|
2285
|
+
# @example
|
2286
|
+
# df.select(Polars.col("foo") + 1)
|
2287
|
+
# # =>
|
2288
|
+
# # shape: (3, 1)
|
2289
|
+
# # ┌─────┐
|
2290
|
+
# # │ foo │
|
2291
|
+
# # │ --- │
|
2292
|
+
# # │ i64 │
|
2293
|
+
# # ╞═════╡
|
2294
|
+
# # │ 2 │
|
2295
|
+
# # ├╌╌╌╌╌┤
|
2296
|
+
# # │ 3 │
|
2297
|
+
# # ├╌╌╌╌╌┤
|
2298
|
+
# # │ 4 │
|
2299
|
+
# # └─────┘
|
2300
|
+
#
|
2301
|
+
# @example
|
2302
|
+
# df.select([Polars.col("foo") + 1, Polars.col("bar") + 1])
|
2303
|
+
# # =>
|
2304
|
+
# # shape: (3, 2)
|
2305
|
+
# # ┌─────┬─────┐
|
2306
|
+
# # │ foo ┆ bar │
|
2307
|
+
# # │ --- ┆ --- │
|
2308
|
+
# # │ i64 ┆ i64 │
|
2309
|
+
# # ╞═════╪═════╡
|
2310
|
+
# # │ 2 ┆ 7 │
|
2311
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
2312
|
+
# # │ 3 ┆ 8 │
|
2313
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
2314
|
+
# # │ 4 ┆ 9 │
|
2315
|
+
# # └─────┴─────┘
|
2316
|
+
#
|
2317
|
+
# @example
|
2318
|
+
# df.select(Polars.when(Polars.col("foo") > 2).then(10).otherwise(0))
|
2319
|
+
# # =>
|
2320
|
+
# # shape: (3, 1)
|
2321
|
+
# # ┌─────────┐
|
2322
|
+
# # │ literal │
|
2323
|
+
# # │ --- │
|
2324
|
+
# # │ i64 │
|
2325
|
+
# # ╞═════════╡
|
2326
|
+
# # │ 0 │
|
2327
|
+
# # ├╌╌╌╌╌╌╌╌╌┤
|
2328
|
+
# # │ 0 │
|
2329
|
+
# # ├╌╌╌╌╌╌╌╌╌┤
|
2330
|
+
# # │ 10 │
|
2331
|
+
# # └─────────┘
|
977
2332
|
def select(exprs)
|
978
2333
|
_from_rbdf(
|
979
2334
|
lazy
|
@@ -983,6 +2338,43 @@ module Polars
|
|
983
2338
|
)
|
984
2339
|
end
|
985
2340
|
|
2341
|
+
# Add or overwrite multiple columns in a DataFrame.
|
2342
|
+
#
|
2343
|
+
# @param exprs [Array]
|
2344
|
+
# Array of Expressions that evaluate to columns.
|
2345
|
+
#
|
2346
|
+
# @return [DataFrame]
|
2347
|
+
#
|
2348
|
+
# @example
|
2349
|
+
# df = Polars::DataFrame.new(
|
2350
|
+
# {
|
2351
|
+
# "a" => [1, 2, 3, 4],
|
2352
|
+
# "b" => [0.5, 4, 10, 13],
|
2353
|
+
# "c" => [true, true, false, true]
|
2354
|
+
# }
|
2355
|
+
# )
|
2356
|
+
# df.with_columns(
|
2357
|
+
# [
|
2358
|
+
# (Polars.col("a") ** 2).alias("a^2"),
|
2359
|
+
# (Polars.col("b") / 2).alias("b/2"),
|
2360
|
+
# (Polars.col("c").is_not()).alias("not c")
|
2361
|
+
# ]
|
2362
|
+
# )
|
2363
|
+
# # =>
|
2364
|
+
# # shape: (4, 6)
|
2365
|
+
# # ┌─────┬──────┬───────┬──────┬──────┬───────┐
|
2366
|
+
# # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
|
2367
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
2368
|
+
# # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │
|
2369
|
+
# # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
|
2370
|
+
# # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │
|
2371
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2372
|
+
# # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │
|
2373
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2374
|
+
# # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
|
2375
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2376
|
+
# # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
|
2377
|
+
# # └─────┴──────┴───────┴──────┴──────┴───────┘
|
986
2378
|
def with_columns(exprs)
|
987
2379
|
if !exprs.nil? && !exprs.is_a?(Array)
|
988
2380
|
exprs = [exprs]
|
@@ -992,6 +2384,26 @@ module Polars
|
|
992
2384
|
.collect(no_optimization: true, string_cache: false)
|
993
2385
|
end
|
994
2386
|
|
2387
|
+
# Get number of chunks used by the ChunkedArrays of this DataFrame.
|
2388
|
+
#
|
2389
|
+
# @param strategy ["first", "all"]
|
2390
|
+
# Return the number of chunks of the 'first' column,
|
2391
|
+
# or 'all' columns in this DataFrame.
|
2392
|
+
#
|
2393
|
+
# @return [Object]
|
2394
|
+
#
|
2395
|
+
# @example
|
2396
|
+
# df = Polars::DataFrame.new(
|
2397
|
+
# {
|
2398
|
+
# "a" => [1, 2, 3, 4],
|
2399
|
+
# "b" => [0.5, 4, 10, 13],
|
2400
|
+
# "c" => [true, true, false, true]
|
2401
|
+
# }
|
2402
|
+
# )
|
2403
|
+
# df.n_chunks
|
2404
|
+
# # => 1
|
2405
|
+
# df.n_chunks(strategy: "all")
|
2406
|
+
# # => [1, 1, 1]
|
995
2407
|
def n_chunks(strategy: "first")
|
996
2408
|
if strategy == "first"
|
997
2409
|
_df.n_chunks
|
@@ -1002,6 +2414,28 @@ module Polars
|
|
1002
2414
|
end
|
1003
2415
|
end
|
1004
2416
|
|
2417
|
+
# Aggregate the columns of this DataFrame to their maximum value.
|
2418
|
+
#
|
2419
|
+
# @return [DataFrame]
|
2420
|
+
#
|
2421
|
+
# @example
|
2422
|
+
# df = Polars::DataFrame.new(
|
2423
|
+
# {
|
2424
|
+
# "foo" => [1, 2, 3],
|
2425
|
+
# "bar" => [6, 7, 8],
|
2426
|
+
# "ham" => ["a", "b", "c"]
|
2427
|
+
# }
|
2428
|
+
# )
|
2429
|
+
# df.max
|
2430
|
+
# # =>
|
2431
|
+
# # shape: (1, 3)
|
2432
|
+
# # ┌─────┬─────┬─────┐
|
2433
|
+
# # │ foo ┆ bar ┆ ham │
|
2434
|
+
# # │ --- ┆ --- ┆ --- │
|
2435
|
+
# # │ i64 ┆ i64 ┆ str │
|
2436
|
+
# # ╞═════╪═════╪═════╡
|
2437
|
+
# # │ 3 ┆ 8 ┆ c │
|
2438
|
+
# # └─────┴─────┴─────┘
|
1005
2439
|
def max(axis: 0)
|
1006
2440
|
if axis == 0
|
1007
2441
|
_from_rbdf(_df.max)
|
@@ -1012,6 +2446,28 @@ module Polars
|
|
1012
2446
|
end
|
1013
2447
|
end
|
1014
2448
|
|
2449
|
+
# Aggregate the columns of this DataFrame to their minimum value.
|
2450
|
+
#
|
2451
|
+
# @return [DataFrame]
|
2452
|
+
#
|
2453
|
+
# @example
|
2454
|
+
# df = Polars::DataFrame.new(
|
2455
|
+
# {
|
2456
|
+
# "foo" => [1, 2, 3],
|
2457
|
+
# "bar" => [6, 7, 8],
|
2458
|
+
# "ham" => ["a", "b", "c"]
|
2459
|
+
# }
|
2460
|
+
# )
|
2461
|
+
# df.min
|
2462
|
+
# # =>
|
2463
|
+
# # shape: (1, 3)
|
2464
|
+
# # ┌─────┬─────┬─────┐
|
2465
|
+
# # │ foo ┆ bar ┆ ham │
|
2466
|
+
# # │ --- ┆ --- ┆ --- │
|
2467
|
+
# # │ i64 ┆ i64 ┆ str │
|
2468
|
+
# # ╞═════╪═════╪═════╡
|
2469
|
+
# # │ 1 ┆ 6 ┆ a │
|
2470
|
+
# # └─────┴─────┴─────┘
|
1015
2471
|
def min(axis: 0)
|
1016
2472
|
if axis == 0
|
1017
2473
|
_from_rbdf(_df.min)
|
@@ -1022,6 +2478,44 @@ module Polars
|
|
1022
2478
|
end
|
1023
2479
|
end
|
1024
2480
|
|
2481
|
+
# Aggregate the columns of this DataFrame to their sum value.
|
2482
|
+
#
|
2483
|
+
# @param axis [Integer]
|
2484
|
+
# Either 0 or 1.
|
2485
|
+
# @param null_strategy ["ignore", "propagate"]
|
2486
|
+
# This argument is only used if axis == 1.
|
2487
|
+
#
|
2488
|
+
# @return [DataFrame]
|
2489
|
+
#
|
2490
|
+
# @example
|
2491
|
+
# df = Polars::DataFrame.new(
|
2492
|
+
# {
|
2493
|
+
# "foo" => [1, 2, 3],
|
2494
|
+
# "bar" => [6, 7, 8],
|
2495
|
+
# "ham" => ["a", "b", "c"],
|
2496
|
+
# }
|
2497
|
+
# )
|
2498
|
+
# df.sum
|
2499
|
+
# # =>
|
2500
|
+
# # shape: (1, 3)
|
2501
|
+
# # ┌─────┬─────┬──────┐
|
2502
|
+
# # │ foo ┆ bar ┆ ham │
|
2503
|
+
# # │ --- ┆ --- ┆ --- │
|
2504
|
+
# # │ i64 ┆ i64 ┆ str │
|
2505
|
+
# # ╞═════╪═════╪══════╡
|
2506
|
+
# # │ 6 ┆ 21 ┆ null │
|
2507
|
+
# # └─────┴─────┴──────┘
|
2508
|
+
#
|
2509
|
+
# @example
|
2510
|
+
# df.sum(axis: 1)
|
2511
|
+
# # =>
|
2512
|
+
# # shape: (3,)
|
2513
|
+
# # Series: 'foo' [str]
|
2514
|
+
# # [
|
2515
|
+
# # "16a"
|
2516
|
+
# # "27b"
|
2517
|
+
# # "38c"
|
2518
|
+
# # ]
|
1025
2519
|
def sum(axis: 0, null_strategy: "ignore")
|
1026
2520
|
case axis
|
1027
2521
|
when 0
|
@@ -1033,6 +2527,33 @@ module Polars
|
|
1033
2527
|
end
|
1034
2528
|
end
|
1035
2529
|
|
2530
|
+
# Aggregate the columns of this DataFrame to their mean value.
|
2531
|
+
#
|
2532
|
+
# @param axis [Integer]
|
2533
|
+
# Either 0 or 1.
|
2534
|
+
# @param null_strategy ["ignore", "propagate"]
|
2535
|
+
# This argument is only used if axis == 1.
|
2536
|
+
#
|
2537
|
+
# @return [DataFrame]
|
2538
|
+
#
|
2539
|
+
# @example
|
2540
|
+
# df = Polars::DataFrame.new(
|
2541
|
+
# {
|
2542
|
+
# "foo" => [1, 2, 3],
|
2543
|
+
# "bar" => [6, 7, 8],
|
2544
|
+
# "ham" => ["a", "b", "c"]
|
2545
|
+
# }
|
2546
|
+
# )
|
2547
|
+
# df.mean
|
2548
|
+
# # =>
|
2549
|
+
# # shape: (1, 3)
|
2550
|
+
# # ┌─────┬─────┬──────┐
|
2551
|
+
# # │ foo ┆ bar ┆ ham │
|
2552
|
+
# # │ --- ┆ --- ┆ --- │
|
2553
|
+
# # │ f64 ┆ f64 ┆ str │
|
2554
|
+
# # ╞═════╪═════╪══════╡
|
2555
|
+
# # │ 2.0 ┆ 7.0 ┆ null │
|
2556
|
+
# # └─────┴─────┴──────┘
|
1036
2557
|
def mean(axis: 0, null_strategy: "ignore")
|
1037
2558
|
case axis
|
1038
2559
|
when 0
|
@@ -1044,77 +2565,633 @@ module Polars
|
|
1044
2565
|
end
|
1045
2566
|
end
|
1046
2567
|
|
2568
|
+
# Aggregate the columns of this DataFrame to their standard deviation value.
|
2569
|
+
#
|
2570
|
+
# @param ddof [Integer]
|
2571
|
+
# Degrees of freedom
|
2572
|
+
#
|
2573
|
+
# @return [DataFrame]
|
2574
|
+
#
|
2575
|
+
# @example
|
2576
|
+
# df = Polars::DataFrame.new(
|
2577
|
+
# {
|
2578
|
+
# "foo" => [1, 2, 3],
|
2579
|
+
# "bar" => [6, 7, 8],
|
2580
|
+
# "ham" => ["a", "b", "c"]
|
2581
|
+
# }
|
2582
|
+
# )
|
2583
|
+
# df.std
|
2584
|
+
# # =>
|
2585
|
+
# # shape: (1, 3)
|
2586
|
+
# # ┌─────┬─────┬──────┐
|
2587
|
+
# # │ foo ┆ bar ┆ ham │
|
2588
|
+
# # │ --- ┆ --- ┆ --- │
|
2589
|
+
# # │ f64 ┆ f64 ┆ str │
|
2590
|
+
# # ╞═════╪═════╪══════╡
|
2591
|
+
# # │ 1.0 ┆ 1.0 ┆ null │
|
2592
|
+
# # └─────┴─────┴──────┘
|
2593
|
+
#
|
2594
|
+
# @example
|
2595
|
+
# df.std(ddof: 0)
|
2596
|
+
# # =>
|
2597
|
+
# # shape: (1, 3)
|
2598
|
+
# # ┌──────────┬──────────┬──────┐
|
2599
|
+
# # │ foo ┆ bar ┆ ham │
|
2600
|
+
# # │ --- ┆ --- ┆ --- │
|
2601
|
+
# # │ f64 ┆ f64 ┆ str │
|
2602
|
+
# # ╞══════════╪══════════╪══════╡
|
2603
|
+
# # │ 0.816497 ┆ 0.816497 ┆ null │
|
2604
|
+
# # └──────────┴──────────┴──────┘
|
1047
2605
|
def std(ddof: 1)
|
1048
2606
|
_from_rbdf(_df.std(ddof))
|
1049
2607
|
end
|
1050
2608
|
|
2609
|
+
# Aggregate the columns of this DataFrame to their variance value.
|
2610
|
+
#
|
2611
|
+
# @param ddof [Integer]
|
2612
|
+
# Degrees of freedom
|
2613
|
+
#
|
2614
|
+
# @return [DataFrame]
|
2615
|
+
#
|
2616
|
+
# @example
|
2617
|
+
# df = Polars::DataFrame.new(
|
2618
|
+
# {
|
2619
|
+
# "foo" => [1, 2, 3],
|
2620
|
+
# "bar" => [6, 7, 8],
|
2621
|
+
# "ham" => ["a", "b", "c"]
|
2622
|
+
# }
|
2623
|
+
# )
|
2624
|
+
# df.var
|
2625
|
+
# # =>
|
2626
|
+
# # shape: (1, 3)
|
2627
|
+
# # ┌─────┬─────┬──────┐
|
2628
|
+
# # │ foo ┆ bar ┆ ham │
|
2629
|
+
# # │ --- ┆ --- ┆ --- │
|
2630
|
+
# # │ f64 ┆ f64 ┆ str │
|
2631
|
+
# # ╞═════╪═════╪══════╡
|
2632
|
+
# # │ 1.0 ┆ 1.0 ┆ null │
|
2633
|
+
# # └─────┴─────┴──────┘
|
2634
|
+
#
|
2635
|
+
# @example
|
2636
|
+
# df.var(ddof: 0)
|
2637
|
+
# # =>
|
2638
|
+
# # shape: (1, 3)
|
2639
|
+
# # ┌──────────┬──────────┬──────┐
|
2640
|
+
# # │ foo ┆ bar ┆ ham │
|
2641
|
+
# # │ --- ┆ --- ┆ --- │
|
2642
|
+
# # │ f64 ┆ f64 ┆ str │
|
2643
|
+
# # ╞══════════╪══════════╪══════╡
|
2644
|
+
# # │ 0.666667 ┆ 0.666667 ┆ null │
|
2645
|
+
# # └──────────┴──────────┴──────┘
|
1051
2646
|
def var(ddof: 1)
  # Compute on the underlying `_df` object, then wrap the result via `_from_rbdf`.
  rb_result = _df.var(ddof)
  _from_rbdf(rb_result)
end
|
1054
2649
|
|
2650
|
+
# Aggregate the columns of this DataFrame to their median value.
|
2651
|
+
#
|
2652
|
+
# @return [DataFrame]
|
2653
|
+
#
|
2654
|
+
# @example
|
2655
|
+
# df = Polars::DataFrame.new(
|
2656
|
+
# {
|
2657
|
+
# "foo" => [1, 2, 3],
|
2658
|
+
# "bar" => [6, 7, 8],
|
2659
|
+
# "ham" => ["a", "b", "c"]
|
2660
|
+
# }
|
2661
|
+
# )
|
2662
|
+
# df.median
|
2663
|
+
# # =>
|
2664
|
+
# # shape: (1, 3)
|
2665
|
+
# # ┌─────┬─────┬──────┐
|
2666
|
+
# # │ foo ┆ bar ┆ ham │
|
2667
|
+
# # │ --- ┆ --- ┆ --- │
|
2668
|
+
# # │ f64 ┆ f64 ┆ str │
|
2669
|
+
# # ╞═════╪═════╪══════╡
|
2670
|
+
# # │ 2.0 ┆ 7.0 ┆ null │
|
2671
|
+
# # └─────┴─────┴──────┘
|
1055
2672
|
def median
  # Compute on the underlying `_df` object, then wrap the result via `_from_rbdf`.
  rb_result = _df.median
  _from_rbdf(rb_result)
end
|
1058
2675
|
|
1059
|
-
#
|
1060
|
-
#
|
2676
|
+
# Aggregate the columns of this DataFrame to their product values.
|
2677
|
+
#
|
2678
|
+
# @return [DataFrame]
|
2679
|
+
#
|
2680
|
+
# @example
|
2681
|
+
# df = Polars::DataFrame.new(
|
2682
|
+
# {
|
2683
|
+
# "a" => [1, 2, 3],
|
2684
|
+
# "b" => [0.5, 4, 10],
|
2685
|
+
# "c" => [true, true, false]
|
2686
|
+
# }
|
2687
|
+
# )
|
2688
|
+
# df.product
|
2689
|
+
# # =>
|
2690
|
+
# # shape: (1, 3)
|
2691
|
+
# # ┌─────┬──────┬─────┐
|
2692
|
+
# # │ a ┆ b ┆ c │
|
2693
|
+
# # │ --- ┆ --- ┆ --- │
|
2694
|
+
# # │ i64 ┆ f64 ┆ i64 │
|
2695
|
+
# # ╞═════╪══════╪═════╡
|
2696
|
+
# # │ 6 ┆ 20.0 ┆ 0 │
|
2697
|
+
# # └─────┴──────┴─────┘
|
2698
|
+
def product
  # Build the product expression over every column, then select it.
  product_expr = Polars.all.product
  select(product_expr)
end
|
2701
|
+
|
2702
|
+
# Aggregate the columns of this DataFrame to their quantile value.
|
2703
|
+
#
|
2704
|
+
# @param quantile [Float]
|
2705
|
+
# Quantile between 0.0 and 1.0.
|
2706
|
+
# @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
|
2707
|
+
# Interpolation method.
|
2708
|
+
#
|
2709
|
+
# @return [DataFrame]
|
2710
|
+
#
|
2711
|
+
# @example
|
2712
|
+
# df = Polars::DataFrame.new(
|
2713
|
+
# {
|
2714
|
+
# "foo" => [1, 2, 3],
|
2715
|
+
# "bar" => [6, 7, 8],
|
2716
|
+
# "ham" => ["a", "b", "c"]
|
2717
|
+
# }
|
2718
|
+
# )
|
2719
|
+
# df.quantile(0.5, interpolation: "nearest")
|
2720
|
+
# # =>
|
2721
|
+
# # shape: (1, 3)
|
2722
|
+
# # ┌─────┬─────┬──────┐
|
2723
|
+
# # │ foo ┆ bar ┆ ham │
|
2724
|
+
# # │ --- ┆ --- ┆ --- │
|
2725
|
+
# # │ f64 ┆ f64 ┆ str │
|
2726
|
+
# # ╞═════╪═════╪══════╡
|
2727
|
+
# # │ 2.0 ┆ 7.0 ┆ null │
|
2728
|
+
# # └─────┴─────┴──────┘
|
2729
|
+
def quantile(quantile, interpolation: "nearest")
  # Both arguments are forwarded positionally to `_df.quantile`.
  rb_result = _df.quantile(quantile, interpolation)
  _from_rbdf(rb_result)
end
|
2732
|
+
|
2733
|
+
# Get one hot encoded dummy variables.
|
2734
|
+
#
|
2735
|
+
# @param columns
|
2736
|
+
# A subset of columns to convert to dummy variables. `nil` means
|
2737
|
+
# "all columns".
|
2738
|
+
#
|
2739
|
+
# @return [DataFrame]
|
2740
|
+
#
|
2741
|
+
# @example
|
2742
|
+
# df = Polars::DataFrame.new(
|
2743
|
+
# {
|
2744
|
+
# "foo" => [1, 2],
|
2745
|
+
# "bar" => [3, 4],
|
2746
|
+
# "ham" => ["a", "b"]
|
2747
|
+
# }
|
2748
|
+
# )
|
2749
|
+
# df.to_dummies
|
2750
|
+
# # =>
|
2751
|
+
# # shape: (2, 6)
|
2752
|
+
# # ┌───────┬───────┬───────┬───────┬───────┬───────┐
|
2753
|
+
# # │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │
|
2754
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
2755
|
+
# # │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │
|
2756
|
+
# # ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡
|
2757
|
+
# # │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │
|
2758
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2759
|
+
# # │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │
|
2760
|
+
# # └───────┴───────┴───────┴───────┴───────┴───────┘
|
2761
|
+
def to_dummies(columns: nil)
  # A single column name is wrapped in an Array; any other value passes through.
  columns = [columns] if columns.is_a?(String)
  rb_result = _df.to_dummies(columns)
  _from_rbdf(rb_result)
end
|
1061
2767
|
|
1062
|
-
#
|
1063
|
-
#
|
2768
|
+
# Drop duplicate rows from this DataFrame.
|
2769
|
+
#
|
2770
|
+
# @param maintain_order [Boolean]
|
2771
|
+
# Keep the same order as the original DataFrame. This requires more work to
|
2772
|
+
# compute.
|
2773
|
+
# @param subset [Object]
|
2774
|
+
# Subset to use to compare rows.
|
2775
|
+
# @param keep ["first", "last"]
|
2776
|
+
# Which of the duplicate rows to keep (in conjunction with `subset`).
|
2777
|
+
#
|
2778
|
+
# @return [DataFrame]
|
2779
|
+
#
|
2780
|
+
# @note
|
2781
|
+
# Note that this fails if there is a column of type `List` in the DataFrame or
|
2782
|
+
# subset.
|
2783
|
+
#
|
2784
|
+
# @example
|
2785
|
+
# df = Polars::DataFrame.new(
|
2786
|
+
# {
|
2787
|
+
# "a" => [1, 1, 2, 3, 4, 5],
|
2788
|
+
# "b" => [0.5, 0.5, 1.0, 2.0, 3.0, 3.0],
|
2789
|
+
# "c" => [true, true, true, false, true, true]
|
2790
|
+
# }
|
2791
|
+
# )
|
2792
|
+
# df.unique
|
2793
|
+
# # =>
|
2794
|
+
# # shape: (5, 3)
|
2795
|
+
# # ┌─────┬─────┬───────┐
|
2796
|
+
# # │ a ┆ b ┆ c │
|
2797
|
+
# # │ --- ┆ --- ┆ --- │
|
2798
|
+
# # │ i64 ┆ f64 ┆ bool │
|
2799
|
+
# # ╞═════╪═════╪═══════╡
|
2800
|
+
# # │ 1 ┆ 0.5 ┆ true │
|
2801
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2802
|
+
# # │ 2 ┆ 1.0 ┆ true │
|
2803
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2804
|
+
# # │ 3 ┆ 2.0 ┆ false │
|
2805
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2806
|
+
# # │ 4 ┆ 3.0 ┆ true │
|
2807
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2808
|
+
# # │ 5 ┆ 3.0 ┆ true │
|
2809
|
+
# # └─────┴─────┴───────┘
|
2810
|
+
def unique(maintain_order: true, subset: nil, keep: "first")
  # Normalize `subset`: nil and Arrays pass through, a String becomes a
  # one-element Array, and anything else is converted with `to_a`.
  unless subset.nil? || subset.is_a?(Array)
    subset = subset.is_a?(String) ? [subset] : subset.to_a
  end

  rb_result = _df.unique(maintain_order, subset, keep)
  _from_rbdf(rb_result)
end
|
1067
2821
|
|
1068
|
-
#
|
1069
|
-
#
|
2822
|
+
# Return the number of unique rows, or the number of unique row-subsets.
|
2823
|
+
#
|
2824
|
+
# @param subset [Object]
|
2825
|
+
# One or more columns/expressions that define what to count;
|
2826
|
+
# omit to return the count of unique rows.
|
2827
|
+
#
|
2828
|
+
# @return [DataFrame]
|
2829
|
+
#
|
2830
|
+
# @example
|
2831
|
+
# df = Polars::DataFrame.new(
|
2832
|
+
# {
|
2833
|
+
# "a" => [1, 1, 2, 3, 4, 5],
|
2834
|
+
# "b" => [0.5, 0.5, 1.0, 2.0, 3.0, 3.0],
|
2835
|
+
# "c" => [true, true, true, false, true, true]
|
2836
|
+
# }
|
2837
|
+
# )
|
2838
|
+
# df.n_unique
|
2839
|
+
# # => 5
|
2840
|
+
#
|
2841
|
+
# @example Simple columns subset
|
2842
|
+
# df.n_unique(subset: ["b", "c"])
|
2843
|
+
# # => 4
|
2844
|
+
#
|
2845
|
+
# @example Expression subset
|
2846
|
+
# df.n_unique(
|
2847
|
+
# subset: [
|
2848
|
+
# (Polars.col("a").floordiv(2)),
|
2849
|
+
# (Polars.col("c") | (Polars.col("b") >= 2))
|
2850
|
+
# ]
|
2851
|
+
# )
|
2852
|
+
# # => 3
|
2853
|
+
def n_unique(subset: nil)
  # Normalize a single column name or a single expression into a
  # one-element list. The name check must be against String (not StringIO),
  # otherwise a plain column name is never converted to `Polars.col`.
  if subset.is_a?(String)
    subset = [Polars.col(subset)]
  elsif subset.is_a?(Expr)
    subset = [subset]
  end

  if subset.is_a?(Array) && subset.length == 1
    # A single entry is counted directly.
    expr = Utils.expr_to_lit_or_expr(subset[0], str_to_lit: false)
  else
    # Several entries (or none, meaning all columns) are combined into one
    # struct expression so the count is taken over whole rows.
    struct_fields = subset.nil? ? Polars.all : subset
    expr = Polars.struct(struct_fields)
  end

  df = lazy.select(expr.n_unique).collect
  df.is_empty ? 0 : df.row(0)[0]
end
|
2870
|
+
|
2871
|
+
# Rechunk the data in this DataFrame to a contiguous allocation.
|
1073
2872
|
|
2873
|
+
# This will make sure all subsequent operations have optimal and predictable
|
2874
|
+
# performance.
|
1074
2875
|
#
|
2876
|
+
# @return [DataFrame]
|
1075
2877
|
def rechunk
  # Rechunk on the underlying `_df` object, then wrap the result via `_from_rbdf`.
  rb_result = _df.rechunk
  _from_rbdf(rb_result)
end
|
1078
2880
|
|
2881
|
+
# Create a new DataFrame that shows the null counts per column.
|
2882
|
+
#
|
2883
|
+
# @return [DataFrame]
|
2884
|
+
#
|
2885
|
+
# @example
|
2886
|
+
# df = Polars::DataFrame.new(
|
2887
|
+
# {
|
2888
|
+
# "foo" => [1, nil, 3],
|
2889
|
+
# "bar" => [6, 7, nil],
|
2890
|
+
# "ham" => ["a", "b", "c"]
|
2891
|
+
# }
|
2892
|
+
# )
|
2893
|
+
# df.null_count
|
2894
|
+
# # =>
|
2895
|
+
# # shape: (1, 3)
|
2896
|
+
# # ┌─────┬─────┬─────┐
|
2897
|
+
# # │ foo ┆ bar ┆ ham │
|
2898
|
+
# # │ --- ┆ --- ┆ --- │
|
2899
|
+
# # │ u32 ┆ u32 ┆ u32 │
|
2900
|
+
# # ╞═════╪═════╪═════╡
|
2901
|
+
# # │ 1 ┆ 1 ┆ 0 │
|
2902
|
+
# # └─────┴─────┴─────┘
|
1079
2903
|
def null_count
  # Count on the underlying `_df` object, then wrap the result via `_from_rbdf`.
  rb_result = _df.null_count
  _from_rbdf(rb_result)
end
|
1082
2906
|
|
1083
|
-
#
|
1084
|
-
#
|
2907
|
+
# Sample from this DataFrame.
|
2908
|
+
#
|
2909
|
+
# @param n [Integer]
|
2910
|
+
# Number of items to return. Cannot be used with `frac`. Defaults to 1 if
|
2911
|
+
# `frac` is nil.
|
2912
|
+
# @param frac [Float]
|
2913
|
+
# Fraction of items to return. Cannot be used with `n`.
|
2914
|
+
# @param with_replacement [Boolean]
|
2915
|
+
# Allow values to be sampled more than once.
|
2916
|
+
# @param shuffle [Boolean]
|
2917
|
+
# Shuffle the order of sampled data points.
|
2918
|
+
# @param seed [Integer]
|
2919
|
+
# Seed for the random number generator. If set to nil (default), a random
|
2920
|
+
# seed is used.
|
2921
|
+
#
|
2922
|
+
# @return [DataFrame]
|
2923
|
+
#
|
2924
|
+
# @example
|
2925
|
+
# df = Polars::DataFrame.new(
|
2926
|
+
# {
|
2927
|
+
# "foo" => [1, 2, 3],
|
2928
|
+
# "bar" => [6, 7, 8],
|
2929
|
+
# "ham" => ["a", "b", "c"]
|
2930
|
+
# }
|
2931
|
+
# )
|
2932
|
+
# df.sample(n: 2, seed: 0)
|
2933
|
+
# # =>
|
2934
|
+
# # shape: (2, 3)
|
2935
|
+
# # ┌─────┬─────┬─────┐
|
2936
|
+
# # │ foo ┆ bar ┆ ham │
|
2937
|
+
# # │ --- ┆ --- ┆ --- │
|
2938
|
+
# # │ i64 ┆ i64 ┆ str │
|
2939
|
+
# # ╞═════╪═════╪═════╡
|
2940
|
+
# # │ 3 ┆ 8 ┆ c │
|
2941
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
2942
|
+
# # │ 2 ┆ 7 ┆ b │
|
2943
|
+
# # └─────┴─────┴─────┘
|
2944
|
+
def sample(
  n: nil,
  frac: nil,
  with_replacement: false,
  shuffle: false,
  seed: nil
)
  # `n` and `frac` are mutually exclusive.
  if !n.nil? && !frac.nil?
    raise ArgumentError, "cannot specify both `n` and `frac`"
  end

  # Fractional sampling must return immediately. Without the explicit
  # `return`, its result was discarded and the method fell through to the
  # `sample_n` call below with `n` defaulted to 1.
  if n.nil? && !frac.nil?
    return _from_rbdf(
      _df.sample_frac(frac, with_replacement, shuffle, seed)
    )
  end

  # Neither `n` nor `frac` given: sample a single row.
  n = 1 if n.nil?
  _from_rbdf(_df.sample_n(n, with_replacement, shuffle, seed))
end
|
1085
2966
|
|
1086
2967
|
# def fold
|
1087
2968
|
# end
|
1088
2969
|
|
1089
|
-
#
|
1090
|
-
#
|
2970
|
+
# Get a row as tuple, either by index or by predicate.
|
2971
|
+
#
|
2972
|
+
# @param index [Object]
|
2973
|
+
# Row index.
|
2974
|
+
# @param by_predicate [Object]
|
2975
|
+
# Select the row according to a given expression/predicate.
|
2976
|
+
#
|
2977
|
+
# @return [Object]
|
2978
|
+
#
|
2979
|
+
# @note
|
2980
|
+
# The `index` and `by_predicate` params are mutually exclusive. Additionally,
|
2981
|
+
# to ensure clarity, the `by_predicate` parameter must be supplied by keyword.
|
2982
|
+
#
|
2983
|
+
# When using `by_predicate` it is an error condition if anything other than
|
2984
|
+
# one row is returned; more than one row raises `TooManyRowsReturned`, and
|
2985
|
+
# zero rows will raise `NoRowsReturned` (both inherit from `RowsException`).
|
2986
|
+
#
|
2987
|
+
# @example Return the row at the given index
|
2988
|
+
# df = Polars::DataFrame.new(
|
2989
|
+
# {
|
2990
|
+
# "foo" => [1, 2, 3],
|
2991
|
+
# "bar" => [6, 7, 8],
|
2992
|
+
# "ham" => ["a", "b", "c"]
|
2993
|
+
# }
|
2994
|
+
# )
|
2995
|
+
# df.row(2)
|
2996
|
+
# # => [3, 8, "c"]
|
2997
|
+
#
|
2998
|
+
# @example Return the row that matches the given predicate
|
2999
|
+
# df.row(by_predicate: Polars.col("ham") == "b")
|
3000
|
+
# # => [2, 7, "b"]
|
3001
|
+
def row(index = nil, by_predicate: nil)
  if !index.nil? && !by_predicate.nil?
    raise ArgumentError, "Cannot set both 'index' and 'by_predicate'; mutually exclusive"
  elsif index.is_a?(Expr)
    raise TypeError, "Expressions should be passed to the 'by_predicate' param"
  elsif index.is_a?(Integer)
    _df.row_tuple(index)
  elsif by_predicate.is_a?(Expr)
    # A predicate must match exactly one row; anything else is an error.
    rows = filter(by_predicate).rows
    n_rows = rows.length
    if n_rows > 1
      raise TooManyRowsReturned, "Predicate #{by_predicate} returned #{n_rows} rows"
    elsif n_rows == 0
      # Ruby interpolation is `#{...}`; the previous `{by_predicate!s}` was
      # emitted literally instead of showing the predicate.
      raise NoRowsReturned, "Predicate <#{by_predicate}> returned no rows"
    end
    rows[0]
  else
    raise ArgumentError, "One of 'index' or 'by_predicate' must be set"
  end
end
|
1091
3021
|
|
1092
|
-
#
|
1093
|
-
#
|
3022
|
+
# Convert columnar data to rows as Ruby arrays.
|
3023
|
+
#
|
3024
|
+
# @return [Array]
|
3025
|
+
#
|
3026
|
+
# @example
|
3027
|
+
# df = Polars::DataFrame.new(
|
3028
|
+
# {
|
3029
|
+
# "a" => [1, 3, 5],
|
3030
|
+
# "b" => [2, 4, 6]
|
3031
|
+
# }
|
3032
|
+
# )
|
3033
|
+
# df.rows
|
3034
|
+
# # => [[1, 2], [3, 4], [5, 6]]
|
3035
|
+
def rows
  # The row list comes straight from the underlying `_df` object.
  tuples = _df.row_tuples
  tuples
end
|
1094
3038
|
|
1095
|
-
#
|
1096
|
-
#
|
3039
|
+
# Shrink DataFrame memory usage.
|
3040
|
+
#
|
3041
|
+
# Shrinks to fit the exact capacity needed to hold the data.
|
3042
|
+
#
|
3043
|
+
# @return [DataFrame]
|
3044
|
+
def shrink_to_fit(in_place: false)
  # Default path: shrink a clone and return it, leaving the receiver alone.
  unless in_place
    copy = clone
    copy._df.shrink_to_fit
    return copy
  end

  # In-place path: shrink this frame's own `_df` and return self.
  _df.shrink_to_fit
  self
end
|
1097
3054
|
|
1098
|
-
#
|
1099
|
-
#
|
3055
|
+
# Take every nth row in the DataFrame and return as a new DataFrame.
|
3056
|
+
#
|
3057
|
+
# @return [DataFrame]
|
3058
|
+
#
|
3059
|
+
# @example
|
3060
|
+
# s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]})
|
3061
|
+
# s.take_every(2)
|
3062
|
+
# # =>
|
3063
|
+
# # shape: (2, 2)
|
3064
|
+
# # ┌─────┬─────┐
|
3065
|
+
# # │ a ┆ b │
|
3066
|
+
# # │ --- ┆ --- │
|
3067
|
+
# # │ i64 ┆ i64 │
|
3068
|
+
# # ╞═════╪═════╡
|
3069
|
+
# # │ 1 ┆ 5 │
|
3070
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
3071
|
+
# # │ 3 ┆ 7 │
|
3072
|
+
# # └─────┴─────┘
|
3073
|
+
def take_every(n)
  # Apply `take_every(n)` to the wildcard column expression and select it.
  every_nth = Utils.col("*").take_every(n)
  select(every_nth)
end
|
1100
3076
|
|
1101
3077
|
# def hash_rows
|
1102
3078
|
# end
|
1103
3079
|
|
1104
|
-
#
|
1105
|
-
#
|
1106
|
-
|
3080
|
+
# Interpolate intermediate values. The interpolation method is linear.
|
3081
|
+
#
|
3082
|
+
# @return [DataFrame]
|
3083
|
+
#
|
3084
|
+
# @example
|
3085
|
+
# df = Polars::DataFrame.new(
|
3086
|
+
# {
|
3087
|
+
# "foo" => [1, nil, 9, 10],
|
3088
|
+
# "bar" => [6, 7, 9, nil],
|
3089
|
+
# "baz" => [1, nil, nil, 9]
|
3090
|
+
# }
|
3091
|
+
# )
|
3092
|
+
# df.interpolate
|
3093
|
+
# # =>
|
3094
|
+
# # shape: (4, 3)
|
3095
|
+
# # ┌─────┬──────┬─────┐
|
3096
|
+
# # │ foo ┆ bar ┆ baz │
|
3097
|
+
# # │ --- ┆ --- ┆ --- │
|
3098
|
+
# # │ i64 ┆ i64 ┆ i64 │
|
3099
|
+
# # ╞═════╪══════╪═════╡
|
3100
|
+
# # │ 1 ┆ 6 ┆ 1 │
|
3101
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
|
3102
|
+
# # │ 5 ┆ 7 ┆ 3 │
|
3103
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
|
3104
|
+
# # │ 9 ┆ 9 ┆ 6 │
|
3105
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
|
3106
|
+
# # │ 10 ┆ null ┆ 9 │
|
3107
|
+
# # └─────┴──────┴─────┘
|
3108
|
+
def interpolate
  # Apply `interpolate` to the wildcard column expression and select it.
  interpolated = Utils.col("*").interpolate
  select(interpolated)
end
|
3111
|
+
|
3112
|
+
# Check if the dataframe is empty.
|
3113
|
+
#
|
3114
|
+
# @return [Boolean]
|
1107
3115
|
#
|
3116
|
+
# @example
|
3117
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
|
3118
|
+
# df.is_empty
|
3119
|
+
# # => false
|
3120
|
+
# df.filter(Polars.col("foo") > 99).is_empty
|
3121
|
+
# # => true
|
1108
3122
|
def is_empty
  # The frame is empty when `height` equals zero.
  row_count = height
  row_count == 0
end
|
1111
3125
|
alias_method :empty?, :is_empty
|
1112
3126
|
|
1113
|
-
#
|
1114
|
-
#
|
3127
|
+
# Convert a `DataFrame` to a `Series` of type `Struct`.
|
3128
|
+
#
|
3129
|
+
# @param name [String]
|
3130
|
+
# Name for the struct Series
|
3131
|
+
#
|
3132
|
+
# @return [Series]
|
3133
|
+
#
|
3134
|
+
# @example
|
3135
|
+
# df = Polars::DataFrame.new(
|
3136
|
+
# {
|
3137
|
+
# "a" => [1, 2, 3, 4, 5],
|
3138
|
+
# "b" => ["one", "two", "three", "four", "five"]
|
3139
|
+
# }
|
3140
|
+
# )
|
3141
|
+
# df.to_struct("nums")
|
3142
|
+
# # =>
|
3143
|
+
# # shape: (5,)
|
3144
|
+
# # Series: 'nums' [struct[2]]
|
3145
|
+
# # [
|
3146
|
+
# # {1,"one"}
|
3147
|
+
# # {2,"two"}
|
3148
|
+
# # {3,"three"}
|
3149
|
+
# # {4,"four"}
|
3150
|
+
# # {5,"five"}
|
3151
|
+
# # ]
|
3152
|
+
def to_struct(name)
  # Build the struct on the underlying `_df` object, then wrap it with `Utils.wrap_s`.
  rb_series = _df.to_struct(name)
  Utils.wrap_s(rb_series)
end
|
1115
3155
|
|
1116
|
-
#
|
1117
|
-
#
|
3156
|
+
# Decompose a struct into its fields.
|
3157
|
+
#
|
3158
|
+
# The fields will be inserted into the `DataFrame` on the location of the
|
3159
|
+
# `struct` type.
|
3160
|
+
#
|
3161
|
+
# @param names [Object]
|
3162
|
+
# Names of the struct columns that will be decomposed by its fields
|
3163
|
+
#
|
3164
|
+
# @return [DataFrame]
|
3165
|
+
#
|
3166
|
+
# @example
|
3167
|
+
# df = Polars::DataFrame.new(
|
3168
|
+
# {
|
3169
|
+
# "before" => ["foo", "bar"],
|
3170
|
+
# "t_a" => [1, 2],
|
3171
|
+
# "t_b" => ["a", "b"],
|
3172
|
+
# "t_c" => [true, nil],
|
3173
|
+
# "t_d" => [[1, 2], [3]],
|
3174
|
+
# "after" => ["baz", "womp"]
|
3175
|
+
# }
|
3176
|
+
# ).select(["before", Polars.struct(Polars.col("^t_.$")).alias("t_struct"), "after"])
|
3177
|
+
# df.unnest("t_struct")
|
3178
|
+
# # =>
|
3179
|
+
# # shape: (2, 6)
|
3180
|
+
# # ┌────────┬─────┬─────┬──────┬───────────┬───────┐
|
3181
|
+
# # │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │
|
3182
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
3183
|
+
# # │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │
|
3184
|
+
# # ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
|
3185
|
+
# # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │
|
3186
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
3187
|
+
# # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
|
3188
|
+
# # └────────┴─────┴─────┴──────┴───────────┴───────┘
|
3189
|
+
def unnest(names)
  # A single column name is wrapped in an Array; any other value passes through.
  names = [names] if names.is_a?(String)
  rb_result = _df.unnest(names)
  _from_rbdf(rb_result)
end
|
1118
3195
|
|
1119
3196
|
private
|
1120
3197
|
|
@@ -1127,7 +3204,7 @@ module Polars
|
|
1127
3204
|
if !columns.nil?
|
1128
3205
|
columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)
|
1129
3206
|
|
1130
|
-
if
|
3207
|
+
if data.empty? && dtypes
|
1131
3208
|
data_series = columns.map { |name| Series.new(name, [], dtype: dtypes[name])._s }
|
1132
3209
|
else
|
1133
3210
|
data_series = data.map { |name, values| Series.new(name, values, dtype: dtypes[name])._s }
|
@@ -1147,7 +3224,7 @@ module Polars
|
|
1147
3224
|
if columns.nil?
|
1148
3225
|
data
|
1149
3226
|
else
|
1150
|
-
if
|
3227
|
+
if data.empty?
|
1151
3228
|
columns.map { |c| Series.new(c, nil)._s }
|
1152
3229
|
elsif data.length == columns.length
|
1153
3230
|
columns.each_with_index do |c, i|
|
@@ -1182,5 +3259,75 @@ module Polars
|
|
1182
3259
|
def _from_rbdf(rb_df)
  # Instance-level shortcut that forwards to the class-level `_from_rbdf`.
  klass = self.class
  klass._from_rbdf(rb_df)
end
|
3262
|
+
|
3263
|
+
def _comp(other, op)
  # Frame-to-frame comparisons take their own path; everything else goes
  # through the non-DataFrame comparison.
  return _compare_to_other_df(other, op) if other.is_a?(DataFrame)

  _compare_to_non_df(other, op)
end
|
3270
|
+
|
3271
|
+
def _compare_to_other_df(other, op)
  # Both frames must have identical column names and identical shape.
  # (The exception class was previously misspelled, which raised NameError.)
  if columns != other.columns
    raise ArgumentError, "DataFrame columns do not match"
  end
  if shape != other.shape
    raise ArgumentError, "DataFrame dimensions do not match"
  end

  # Rename the other frame's columns with a suffix and concatenate it
  # horizontally, so each column can be compared with its counterpart.
  suffix = "__POLARS_CMP_OTHER"
  other_renamed = other.select(Polars.all.suffix(suffix))
  combined = Polars.concat([self, other_renamed], how: "horizontal")

  expr = case op
  when "eq"
    columns.map { |n| Polars.col(n) == Polars.col("#{n}#{suffix}") }
  when "neq"
    columns.map { |n| Polars.col(n) != Polars.col("#{n}#{suffix}") }
  when "gt"
    columns.map { |n| Polars.col(n) > Polars.col("#{n}#{suffix}") }
  when "lt"
    columns.map { |n| Polars.col(n) < Polars.col("#{n}#{suffix}") }
  when "gt_eq"
    columns.map { |n| Polars.col(n) >= Polars.col("#{n}#{suffix}") }
  when "lt_eq"
    columns.map { |n| Polars.col(n) <= Polars.col("#{n}#{suffix}") }
  else
    raise ArgumentError, "got unexpected comparison operator: #{op}"
  end

  combined.select(expr)
end
|
3302
|
+
|
3303
|
+
def _compare_to_non_df(other, op)
  # Pick the comparison expression for `op` over all columns, then select it.
  predicate =
    case op
    when "eq" then Polars.all == other
    when "neq" then Polars.all != other
    when "gt" then Polars.all > other
    when "lt" then Polars.all < other
    when "gt_eq" then Polars.all >= other
    when "lt_eq" then Polars.all <= other
    else
      raise ArgumentError, "got unexpected comparison operator: #{op}"
    end

  select(predicate)
end
|
3321
|
+
|
3322
|
+
def _prepare_other_arg(other)
  # A Series is returned untouched.
  return other if other.is_a?(Series)

  # Arrays are rejected; any other value is wrapped in a one-element,
  # unnamed Series.
  raise ArgumentError, "Operation not supported." if other.is_a?(Array)

  Series.new("", [other])
end
|
1185
3332
|
end
|
1186
3333
|
end
|