polars-df 0.2.0-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +33 -0
  4. data/Cargo.lock +2230 -0
  5. data/Cargo.toml +10 -0
  6. data/LICENSE-THIRD-PARTY.txt +38856 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +91 -0
  9. data/lib/polars/3.0/polars.bundle +0 -0
  10. data/lib/polars/3.1/polars.bundle +0 -0
  11. data/lib/polars/3.2/polars.bundle +0 -0
  12. data/lib/polars/batched_csv_reader.rb +96 -0
  13. data/lib/polars/cat_expr.rb +52 -0
  14. data/lib/polars/cat_name_space.rb +54 -0
  15. data/lib/polars/convert.rb +100 -0
  16. data/lib/polars/data_frame.rb +4833 -0
  17. data/lib/polars/data_types.rb +122 -0
  18. data/lib/polars/date_time_expr.rb +1418 -0
  19. data/lib/polars/date_time_name_space.rb +1484 -0
  20. data/lib/polars/dynamic_group_by.rb +52 -0
  21. data/lib/polars/exceptions.rb +20 -0
  22. data/lib/polars/expr.rb +5307 -0
  23. data/lib/polars/expr_dispatch.rb +22 -0
  24. data/lib/polars/functions.rb +453 -0
  25. data/lib/polars/group_by.rb +558 -0
  26. data/lib/polars/io.rb +814 -0
  27. data/lib/polars/lazy_frame.rb +2442 -0
  28. data/lib/polars/lazy_functions.rb +1195 -0
  29. data/lib/polars/lazy_group_by.rb +93 -0
  30. data/lib/polars/list_expr.rb +610 -0
  31. data/lib/polars/list_name_space.rb +346 -0
  32. data/lib/polars/meta_expr.rb +54 -0
  33. data/lib/polars/rolling_group_by.rb +35 -0
  34. data/lib/polars/series.rb +3730 -0
  35. data/lib/polars/slice.rb +104 -0
  36. data/lib/polars/string_expr.rb +972 -0
  37. data/lib/polars/string_name_space.rb +690 -0
  38. data/lib/polars/struct_expr.rb +100 -0
  39. data/lib/polars/struct_name_space.rb +64 -0
  40. data/lib/polars/utils.rb +192 -0
  41. data/lib/polars/version.rb +4 -0
  42. data/lib/polars/when.rb +16 -0
  43. data/lib/polars/when_then.rb +19 -0
  44. data/lib/polars-df.rb +1 -0
  45. data/lib/polars.rb +50 -0
  46. metadata +89 -0
@@ -0,0 +1,4833 @@
1
+ module Polars
2
+ # Two-dimensional data structure representing data as a table with rows and columns.
3
+ class DataFrame
4
+ # @private
5
+ attr_accessor :_df
6
+
7
+ # Create a new DataFrame.
8
+ #
9
+ # @param data [Hash, Array, Series, nil]
10
+ # Two-dimensional data in various forms. Hash must contain Arrays.
11
+ # Array may contain Series.
12
+ # @param columns [Array, Hash, nil]
13
+ # Column labels to use for resulting DataFrame. If specified, overrides any
14
+ # labels already present in the data. Must match data dimensions.
15
+ # @param orient ["col", "row", nil]
16
+ # Whether to interpret two-dimensional data as columns or as rows. If `nil`,
17
+ # the orientation is inferred by matching the columns and data dimensions. If
18
+ # this does not yield conclusive results, column orientation is used.
19
def initialize(data = nil, columns: nil, orient: nil)
  # Convert ActiveRecord relations/results to a plain Hash of columns first.
  if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
    result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
    converted = {}
    result.columns.each_with_index do |name, idx|
      converted[name] = result.rows.map { |row| row[idx] }
    end
    data = converted
  end

  self._df =
    case data
    when nil
      self.class.hash_to_rbdf({}, columns: columns)
    when Hash
      # Symbol keys are normalized to strings before building the frame.
      stringified = data.transform_keys { |k| k.is_a?(Symbol) ? k.to_s : k }
      self.class.hash_to_rbdf(stringified, columns: columns)
    when Array
      self.class.sequence_to_rbdf(data, columns: columns, orient: orient)
    when Series
      self.class.series_to_rbdf(data, columns: columns)
    else
      raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
    end
end
41
+
42
+ # @private
43
# @private
# Wrap a native RbDataFrame handle without running +initialize+.
def self._from_rbdf(rb_df)
  instance = DataFrame.allocate
  instance._df = rb_df
  instance
end
48
+
49
+ # @private
50
# @private
# Build a DataFrame from an array of row hashes via the native reader.
def self._from_hashes(data, infer_schema_length: 100, schema: nil)
  _from_rbdf(RbDataFrame.read_hashes(data, infer_schema_length, schema))
end
54
+
55
+ # @private
56
# @private
# Build a DataFrame from a column-oriented Hash.
def self._from_hash(data, columns: nil)
  rbdf = hash_to_rbdf(data, columns: columns)
  _from_rbdf(rbdf)
end
59
+
60
+ # def self._from_records
61
+ # end
62
+
63
+ # def self._from_numo
64
+ # end
65
+
66
+ # no self._from_arrow
67
+
68
+ # no self._from_pandas
69
+
70
+ # @private
71
# @private
# Backend entry point for reading CSV data into a DataFrame.
#
# Fix: the `sep` default was written as `sep: str = ","` — a Python
# type-annotation remnant that silently created a stray local `str`.
# The effective default (",") is unchanged, so callers are unaffected.
def self._read_csv(
  file,
  has_header: true,
  columns: nil,
  sep: ",",
  comment_char: nil,
  quote_char: '"',
  skip_rows: 0,
  dtypes: nil,
  null_values: nil,
  ignore_errors: false,
  parse_dates: false,
  n_threads: nil,
  infer_schema_length: 100,
  batch_size: 8192,
  n_rows: nil,
  encoding: "utf8",
  low_memory: false,
  rechunk: true,
  skip_rows_after_header: 0,
  row_count_name: nil,
  row_count_offset: 0,
  sample_size: 1024,
  eol_char: "\n"
)
  # Resolve a filesystem path when given one; IO-like objects keep path nil.
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
    path = Utils.format_path(file)
  else
    path = nil
  end

  # dtypes may be a Hash (per-column) or an Array (positional); anything else is an error.
  dtype_list = nil
  dtype_slice = nil
  if !dtypes.nil?
    if dtypes.is_a?(Hash)
      dtype_list = dtypes.map { |k, v| [k, Utils.rb_type_to_dtype(v)] }
    elsif dtypes.is_a?(Array)
      dtype_slice = dtypes
    else
      raise ArgumentError, "dtype arg should be list or dict"
    end
  end

  processed_null_values = Utils._process_null_values(null_values)

  columns = [columns] if columns.is_a?(String)
  # Glob patterns are not supported yet for eager CSV reads.
  if file.is_a?(String) && file.include?("*")
    raise Todo
  end

  projection, columns = Utils.handle_projection_columns(columns)

  _from_rbdf(
    RbDataFrame.read_csv(
      file,
      infer_schema_length,
      batch_size,
      has_header,
      ignore_errors,
      n_rows,
      skip_rows,
      projection,
      sep,
      rechunk,
      columns,
      encoding,
      n_threads,
      path,
      dtype_list,
      dtype_slice,
      low_memory,
      comment_char,
      quote_char,
      processed_null_values,
      parse_dates,
      skip_rows_after_header,
      Utils._prepare_row_count_args(row_count_name, row_count_offset),
      sample_size,
      eol_char
    )
  )
end
161
+
162
+ # @private
163
# @private
# Backend entry point for reading a Parquet file into a DataFrame.
def self._read_parquet(
  file,
  columns: nil,
  n_rows: nil,
  parallel: "auto",
  row_count_name: nil,
  row_count_offset: 0,
  low_memory: false
)
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
    file = Utils.format_path(file)
  end

  # Glob patterns are not supported yet.
  raise Todo if file.is_a?(String) && file.include?("*")

  projection, columns = Utils.handle_projection_columns(columns)
  _from_rbdf(
    RbDataFrame.read_parquet(
      file,
      columns,
      projection,
      n_rows,
      parallel,
      Utils._prepare_row_count_args(row_count_name, row_count_offset),
      low_memory
    )
  )
end
193
+
194
+ # @private
195
# @private
# Backend entry point for reading an Avro file into a DataFrame.
def self._read_avro(file, columns: nil, n_rows: nil)
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
    file = Utils.format_path(file)
  end
  projection, columns = Utils.handle_projection_columns(columns)
  _from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
end
202
+
203
+ # @private
204
# @private
# Backend entry point for reading an Arrow IPC file into a DataFrame.
def self._read_ipc(
  file,
  columns: nil,
  n_rows: nil,
  row_count_name: nil,
  row_count_offset: 0,
  rechunk: true,
  memory_map: true
)
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
    file = Utils.format_path(file)
  end
  columns = [columns] if columns.is_a?(String)

  # Glob patterns are not supported yet.
  raise Todo if file.is_a?(String) && file.include?("*")

  projection, columns = Utils.handle_projection_columns(columns)
  _from_rbdf(
    RbDataFrame.read_ipc(
      file,
      columns,
      projection,
      n_rows,
      Utils._prepare_row_count_args(row_count_name, row_count_offset),
      memory_map
    )
  )
end
236
+
237
+ # @private
238
# @private
# Backend entry point for reading a JSON file into a DataFrame.
def self._read_json(file)
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
    file = Utils.format_path(file)
  end
  _from_rbdf(RbDataFrame.read_json(file))
end
245
+
246
+ # @private
247
# @private
# Backend entry point for reading newline-delimited JSON into a DataFrame.
def self._read_ndjson(file)
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
    file = Utils.format_path(file)
  end
  _from_rbdf(RbDataFrame.read_ndjson(file))
end
254
+
255
+ # Get the shape of the DataFrame.
256
+ #
257
+ # @return [Array]
258
+ #
259
+ # @example
260
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
261
+ # df.shape
262
+ # # => [5, 1]
263
def shape = _df.shape
266
+
267
+ # Get the height of the DataFrame.
268
+ #
269
+ # @return [Integer]
270
+ #
271
+ # @example
272
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
273
+ # df.height
274
+ # # => 5
275
def height = _df.height
278
+
279
+ # Get the width of the DataFrame.
280
+ #
281
+ # @return [Integer]
282
+ #
283
+ # @example
284
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
285
+ # df.width
286
+ # # => 1
287
def width = _df.width
290
+
291
+ # Get column names.
292
+ #
293
+ # @return [Array]
294
+ #
295
+ # @example
296
+ # df = Polars::DataFrame.new(
297
+ # {
298
+ # "foo" => [1, 2, 3],
299
+ # "bar" => [6, 7, 8],
300
+ # "ham" => ["a", "b", "c"]
301
+ # }
302
+ # )
303
+ # df.columns
304
+ # # => ["foo", "bar", "ham"]
305
def columns = _df.columns
308
+
309
+ # Change the column names of the DataFrame.
310
+ #
311
+ # @param columns [Array]
312
+ # A list with new names for the DataFrame.
313
+ # The length of the list should be equal to the width of the DataFrame.
314
+ #
315
+ # @return [Object]
316
+ #
317
+ # @example
318
+ # df = Polars::DataFrame.new(
319
+ # {
320
+ # "foo" => [1, 2, 3],
321
+ # "bar" => [6, 7, 8],
322
+ # "ham" => ["a", "b", "c"]
323
+ # }
324
+ # )
325
+ # df.columns = ["apple", "banana", "orange"]
326
+ # df
327
+ # # =>
328
+ # # shape: (3, 3)
329
+ # # ┌───────┬────────┬────────┐
330
+ # # │ apple ┆ banana ┆ orange │
331
+ # # │ --- ┆ --- ┆ --- │
332
+ # # │ i64 ┆ i64 ┆ str │
333
+ # # ╞═══════╪════════╪════════╡
334
+ # # │ 1 ┆ 6 ┆ a │
335
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
336
+ # # │ 2 ┆ 7 ┆ b │
337
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
338
+ # # │ 3 ┆ 8 ┆ c │
339
+ # # └───────┴────────┴────────┘
340
def columns=(names)
  _df.set_column_names(names)
end
343
+
344
+ # Get dtypes of columns in DataFrame. Dtypes can also be found in column headers when printing the DataFrame.
345
+ #
346
+ # @return [Array]
347
+ #
348
+ # @example
349
+ # df = Polars::DataFrame.new(
350
+ # {
351
+ # "foo" => [1, 2, 3],
352
+ # "bar" => [6.0, 7.0, 8.0],
353
+ # "ham" => ["a", "b", "c"]
354
+ # }
355
+ # )
356
+ # df.dtypes
357
+ # # => [Polars::Int64, Polars::Float64, Polars::Utf8]
358
def dtypes = _df.dtypes
361
+
362
+ # Get the schema.
363
+ #
364
+ # @return [Hash]
365
+ #
366
+ # @example
367
+ # df = Polars::DataFrame.new(
368
+ # {
369
+ # "foo" => [1, 2, 3],
370
+ # "bar" => [6.0, 7.0, 8.0],
371
+ # "ham" => ["a", "b", "c"]
372
+ # }
373
+ # )
374
+ # df.schema
375
+ # # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::Utf8}
376
def schema
  # Pair each column name with its dtype; both lists share the same order.
  Hash[columns.zip(dtypes)]
end
379
+
380
+ # Equal.
381
+ #
382
+ # @return [DataFrame]
383
def ==(rhs)
  _comp(rhs, "eq")
end
386
+
387
+ # Not equal.
388
+ #
389
+ # @return [DataFrame]
390
def !=(rhs)
  _comp(rhs, "neq")
end
393
+
394
+ # Greater than.
395
+ #
396
+ # @return [DataFrame]
397
def >(rhs)
  _comp(rhs, "gt")
end
400
+
401
+ # Less than.
402
+ #
403
+ # @return [DataFrame]
404
def <(rhs)
  _comp(rhs, "lt")
end
407
+
408
+ # Greater than or equal.
409
+ #
410
+ # @return [DataFrame]
411
def >=(rhs)
  _comp(rhs, "gt_eq")
end
414
+
415
+ # Less than or equal.
416
+ #
417
+ # @return [DataFrame]
418
def <=(rhs)
  _comp(rhs, "lt_eq")
end
421
+
422
+ # Performs multiplication.
423
+ #
424
+ # @return [DataFrame]
425
def *(other)
  # Frame-by-frame multiplication takes a separate native path.
  return _from_rbdf(_df.mul_df(other._df)) if other.is_a?(DataFrame)

  _from_rbdf(_df.mul(_prepare_other_arg(other)._s))
end
433
+
434
+ # Performs division.
435
+ #
436
+ # @return [DataFrame]
437
def /(other)
  # Frame-by-frame division takes a separate native path.
  return _from_rbdf(_df.div_df(other._df)) if other.is_a?(DataFrame)

  _from_rbdf(_df.div(_prepare_other_arg(other)._s))
end
445
+
446
+ # Performs addition.
447
+ #
448
+ # @return [DataFrame]
449
def +(other)
  # Frame-by-frame addition takes a separate native path.
  return _from_rbdf(_df.add_df(other._df)) if other.is_a?(DataFrame)

  _from_rbdf(_df.add(_prepare_other_arg(other)._s))
end
457
+
458
+ # Performs subtraction.
459
+ #
460
+ # @return [DataFrame]
461
def -(other)
  # Frame-by-frame subtraction takes a separate native path.
  return _from_rbdf(_df.sub_df(other._df)) if other.is_a?(DataFrame)

  _from_rbdf(_df.sub(_prepare_other_arg(other)._s))
end
469
+
470
+ # Returns the modulo.
471
+ #
472
+ # @return [DataFrame]
473
def %(other)
  # Frame-by-frame modulo takes a separate native path.
  return _from_rbdf(_df.rem_df(other._df)) if other.is_a?(DataFrame)

  _from_rbdf(_df.rem(_prepare_other_arg(other)._s))
end
481
+
482
+ # Returns a string representing the DataFrame.
483
+ #
484
+ # @return [String]
485
def to_s = _df.to_s
alias_method :inspect, :to_s
489
+
490
+ # Check if DataFrame includes column.
491
+ #
492
+ # @return [Boolean]
493
def include?(name) = columns.include?(name)
496
+
497
+ # def each
498
+ # end
499
+
500
+ # Returns subset of the DataFrame.
501
+ #
502
+ # @return [Object]
503
# Returns subset of the DataFrame.
#
# Supports df[row, col], df["name"], df[idx], df[range], and df[[names]].
#
# Fix: calling with zero or 3+ arguments previously hit the final `raise`
# referencing the undefined local `item`, crashing with NameError instead
# of a meaningful ArgumentError.
#
# @return [Object]
def [](*args)
  if args.size == 2
    row_selection, col_selection = args

    # df[.., ..] — multiple slices are not yet supported
    if row_selection.is_a?(Range) && col_selection.is_a?(Range)
      raise Todo
    end

    # df[2, ..] (select row as df)
    if row_selection.is_a?(Integer)
      if col_selection.is_a?(Array)
        df = self[0.., col_selection]
        return df.slice(row_selection, 1)
      end
      # df[2, "a"]
      if col_selection.is_a?(String)
        return self[col_selection][row_selection]
      end
    end

    # column selection can be "a" and ["a", "b"]
    if col_selection.is_a?(String)
      col_selection = [col_selection]
    end

    # df[.., 1]
    if col_selection.is_a?(Integer)
      series = to_series(col_selection)
      return series[row_selection]
    end

    if col_selection.is_a?(Array)
      # df[.., [1, 2]]
      if is_int_sequence(col_selection)
        series_list = col_selection.map { |i| to_series(i) }
        df = self.class.new(series_list)
        return df[row_selection]
      end
    end

    df = self[col_selection]
    return df[row_selection]
  elsif args.size == 1
    item = args[0]

    # select single column: df["foo"]
    if item.is_a?(String)
      return Utils.wrap_s(_df.column(item))
    end

    # df[idx]
    if item.is_a?(Integer)
      return slice(_pos_idx(item, 0), 1)
    end

    # df[..]
    if item.is_a?(Range)
      return Slice.new(self).apply(item)
    end

    # select multiple columns: df[["foo", "bar"]]
    if Utils.is_str_sequence(item, allow_str: false)
      return _from_rbdf(_df.select(item))
    end

    raise ArgumentError, "Cannot get item of type: #{item.class.name}"
  end

  # Previously raised NameError here (undefined `item`) for other arities.
  raise ArgumentError, "Cannot get item with #{args.size} arguments"
end
579
+
580
+ # Set item.
581
+ #
582
+ # @return [Object]
583
+ # def []=(key, value)
584
+ # if key.is_a?(String)
585
+ # raise TypeError, "'DataFrame' object does not support 'Series' assignment by index. Use 'DataFrame.with_columns'"
586
+ # end
587
+
588
+ # raise Todo
589
+ # end
590
+
591
+ # no to_arrow
592
+
593
+ # Convert DataFrame to a hash mapping column name to values.
594
+ #
595
+ # @return [Hash]
596
# Convert DataFrame to a hash mapping column name to values.
def to_h(as_series: true)
  get_columns.to_h { |s| [s.name, as_series ? s : s.to_a] }
end
603
+
604
# Convert every row to a hash.
#
# Note that this is slow.
#
# @return [Array]
#
# @example
#   df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
#   df.to_hashes
#   # => [{"foo"=>1, "bar"=>4}, {"foo"=>2, "bar"=>5}, {"foo"=>3, "bar"=>6}]
def to_hashes
  rbdf = _df
  names = columns

  height.times.map do |i|
    names.zip(rbdf.row_tuple(i)).to_h
  end
end
622
+
623
+ # def to_numo
624
+ # end
625
+
626
+ # no to_pandas
627
+
628
+ # Select column as Series at index location.
629
+ #
630
+ # @param index [Integer]
631
+ # Location of selection.
632
+ #
633
+ # @return [Series]
634
+ #
635
+ # @example
636
+ # df = Polars::DataFrame.new(
637
+ # {
638
+ # "foo" => [1, 2, 3],
639
+ # "bar" => [6, 7, 8],
640
+ # "ham" => ["a", "b", "c"]
641
+ # }
642
+ # )
643
+ # df.to_series(1)
644
+ # # =>
645
+ # # shape: (3,)
646
+ # # Series: 'bar' [i64]
647
+ # # [
648
+ # # 6
649
+ # # 7
650
+ # # 8
651
+ # # ]
652
def to_series(index = 0)
  # Negative indices count from the end, Array-style.
  index += columns.length if index < 0
  Utils.wrap_s(_df.select_at_idx(index))
end
658
+
659
+ # Serialize to JSON representation.
660
+ #
661
+ # @return [nil]
662
+ #
663
+ # @param file [String]
664
+ # File path to which the result should be written.
665
+ # @param pretty [Boolean]
666
+ # Pretty serialize json.
667
+ # @param row_oriented [Boolean]
668
+ # Write to row oriented json. This is slower, but more common.
669
+ #
670
+ # @see #write_ndjson
671
def write_json(file, pretty: false, row_oriented: false)
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
    file = Utils.format_path(file)
  end
  _df.write_json(file, pretty, row_oriented)
  nil
end
683
+
684
+ # Serialize to newline delimited JSON representation.
685
+ #
686
+ # @param file [String]
687
+ # File path to which the result should be written.
688
+ #
689
+ # @return [nil]
690
def write_ndjson(file)
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
    file = Utils.format_path(file)
  end
  _df.write_ndjson(file)
  nil
end
698
+
699
+ # Write to comma-separated values (CSV) file.
700
+ #
701
+ # @param file [String, nil]
702
+ # File path to which the result should be written. If set to `nil`
703
+ # (default), the output is returned as a string instead.
704
+ # @param has_header [Boolean]
705
+ # Whether to include header in the CSV output.
706
+ # @param sep [String]
707
+ # Separate CSV fields with this symbol.
708
+ # @param quote [String]
709
+ # Byte to use as quoting character.
710
+ # @param batch_size [Integer]
711
+ # Number of rows that will be processed per thread.
712
+ # @param datetime_format [String, nil]
713
+ # A format string, with the specifiers defined by the
714
+ # [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
715
+ # Rust crate. If no format specified, the default fractional-second
716
+ # precision is inferred from the maximum timeunit found in the frame's
717
+ # Datetime cols (if any).
718
+ # @param date_format [String, nil]
719
+ # A format string, with the specifiers defined by the
720
+ # [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
721
+ # Rust crate.
722
+ # @param time_format [String, nil]
723
+ # A format string, with the specifiers defined by the
724
+ # [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
725
+ # Rust crate.
726
+ # @param float_precision [Integer, nil]
727
+ # Number of decimal places to write, applied to both `:f32` and
728
+ # `:f64` datatypes.
729
+ # @param null_value [String, nil]
730
+ # A string representing null values (defaulting to the empty string).
731
+ #
732
+ # @return [String, nil]
733
+ #
734
+ # @example
735
+ # df = Polars::DataFrame.new(
736
+ # {
737
+ # "foo" => [1, 2, 3, 4, 5],
738
+ # "bar" => [6, 7, 8, 9, 10],
739
+ # "ham" => ["a", "b", "c", "d", "e"]
740
+ # }
741
+ # )
742
+ # df.write_csv("file.csv")
743
def write_csv(
  file = nil,
  has_header: true,
  sep: ",",
  quote: '"',
  batch_size: 1024,
  datetime_format: nil,
  date_format: nil,
  time_format: nil,
  float_precision: nil,
  null_value: nil
)
  # Guard clauses: the native writer takes single-byte sep/quote characters.
  raise ArgumentError, "only single byte separator is allowed" if sep.length > 1
  raise ArgumentError, "only single byte quote char is allowed" if quote.length > 1
  # An empty null marker is treated the same as no marker at all.
  null_value = nil if null_value == ""

  if file.nil?
    # No destination given: write to an in-memory buffer and return a string.
    buffer = StringIO.new
    buffer.set_encoding(Encoding::BINARY)
    _df.write_csv(
      buffer,
      has_header,
      sep.ord,
      quote.ord,
      batch_size,
      datetime_format,
      date_format,
      time_format,
      float_precision,
      null_value
    )
    return buffer.string.force_encoding(Encoding::UTF_8)
  end

  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
    file = Utils.format_path(file)
  end

  _df.write_csv(
    file,
    has_header,
    sep.ord,
    quote.ord,
    batch_size,
    datetime_format,
    date_format,
    time_format,
    float_precision,
    null_value
  )
  nil
end
799
+
800
+ # Write to Apache Avro file.
801
+ #
802
+ # @param file [String]
803
+ # File path to which the file should be written.
804
+ # @param compression ["uncompressed", "snappy", "deflate"]
805
+ # Compression method. Defaults to "uncompressed".
806
+ #
807
+ # @return [nil]
808
def write_avro(file, compression = "uncompressed")
  compression = "uncompressed" if compression.nil?
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
    file = Utils.format_path(file)
  end
  _df.write_avro(file, compression)
end
818
+
819
+ # Write to Arrow IPC binary stream or Feather file.
820
+ #
821
+ # @param file [String]
822
+ # File path to which the file should be written.
823
+ # @param compression ["uncompressed", "lz4", "zstd"]
824
+ # Compression method. Defaults to "uncompressed".
825
+ #
826
+ # @return [nil]
827
def write_ipc(file, compression: "uncompressed")
  compression = "uncompressed" if compression.nil?
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
    file = Utils.format_path(file)
  end
  _df.write_ipc(file, compression)
end
837
+
838
+ # Write to Apache Parquet file.
839
+ #
840
+ # @param file [String]
841
+ # File path to which the file should be written.
842
+ # @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
843
+ # Choose "zstd" for good compression performance.
844
+ # Choose "lz4" for fast compression/decompression.
845
+ # Choose "snappy" for more backwards compatibility guarantees
846
+ # when you deal with older parquet readers.
847
+ # @param compression_level [Integer, nil]
848
+ # The level of compression to use. Higher compression means smaller files on
849
+ # disk.
850
+ #
851
+ # - "gzip" : min-level: 0, max-level: 10.
852
+ # - "brotli" : min-level: 0, max-level: 11.
853
+ # - "zstd" : min-level: 1, max-level: 22.
854
+ # @param statistics [Boolean]
855
+ # Write statistics to the parquet headers. This requires extra compute.
856
+ # @param row_group_size [Integer, nil]
857
+ # Size of the row groups in number of rows.
858
+ # If `nil` (default), the chunks of the DataFrame are
859
+ # used. Writing in smaller chunks may reduce memory pressure and improve
860
+ # writing speeds.
861
+ #
862
+ # @return [nil]
863
def write_parquet(
  file,
  compression: "zstd",
  compression_level: nil,
  statistics: false,
  row_group_size: nil
)
  compression = "uncompressed" if compression.nil?
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
    file = Utils.format_path(file)
  end
  _df.write_parquet(file, compression, compression_level, statistics, row_group_size)
end
881
+
882
+ # Return an estimation of the total (heap) allocated size of the DataFrame.
883
+ #
884
+ # Estimated size is given in the specified unit (bytes by default).
885
+ #
886
+ # This estimation is the sum of the size of its buffers, validity, including
887
+ # nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the
888
+ # size of 2 arrays is not the sum of the sizes computed from this function. In
889
+ # particular, StructArray's size is an upper bound.
890
+ #
891
+ # When an array is sliced, its allocated size remains constant because the buffer
892
+ # unchanged. However, this function will yield a smaller number. This is because
893
+ # this function returns the visible size of the buffer, not its total capacity.
894
+ #
895
+ # FFI buffers are included in this estimation.
896
+ #
897
+ # @param unit ["b", "kb", "mb", "gb", "tb"]
898
+ # Scale the returned size to the given unit.
899
+ #
900
+ # @return [Numeric]
901
+ #
902
+ # @example
903
+ # df = Polars::DataFrame.new(
904
+ # {
905
+ # "x" => 1_000_000.times.to_a.reverse,
906
+ # "y" => 1_000_000.times.map { |v| v / 1000.0 },
907
+ # "z" => 1_000_000.times.map(&:to_s)
908
+ # },
909
+ # columns: {"x" => :u32, "y" => :f64, "z" => :str}
910
+ # )
911
+ # df.estimated_size
912
+ # # => 25888898
913
+ # df.estimated_size("mb")
914
+ # # => 24.689577102661133
915
def estimated_size(unit = "b")
  Utils.scale_bytes(_df.estimated_size, to: unit)
end
919
+
920
+ # Transpose a DataFrame over the diagonal.
921
+ #
922
+ # @param include_header [Boolean]
923
+ # If set, the column names will be added as first column.
924
+ # @param header_name [String]
925
+ # If `include_header` is set, this determines the name of the column that will
926
+ # be inserted.
927
+ # @param column_names [Array]
928
+ # Optional generator/iterator that yields column names. Will be used to
929
+ # replace the columns in the DataFrame.
930
+ #
931
+ # @return [DataFrame]
932
+ #
933
+ # @note
934
+ # This is a very expensive operation. Perhaps you can do it differently.
935
+ #
936
+ # @example
937
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [1, 2, 3]})
938
+ # df.transpose(include_header: true)
939
+ # # =>
940
+ # # shape: (2, 4)
941
+ # # ┌────────┬──────────┬──────────┬──────────┐
942
+ # # │ column ┆ column_0 ┆ column_1 ┆ column_2 │
943
+ # # │ --- ┆ --- ┆ --- ┆ --- │
944
+ # # │ str ┆ i64 ┆ i64 ┆ i64 │
945
+ # # ╞════════╪══════════╪══════════╪══════════╡
946
+ # # │ a ┆ 1 ┆ 2 ┆ 3 │
947
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
948
+ # # │ b ┆ 1 ┆ 2 ┆ 3 │
949
+ # # └────────┴──────────┴──────────┴──────────┘
950
+ #
951
+ # @example Replace the auto-generated column names with a list
952
+ # df.transpose(include_header: false, column_names: ["a", "b", "c"])
953
+ # # =>
954
+ # # shape: (2, 3)
955
+ # # ┌─────┬─────┬─────┐
956
+ # # │ a ┆ b ┆ c │
957
+ # # │ --- ┆ --- ┆ --- │
958
+ # # │ i64 ┆ i64 ┆ i64 │
959
+ # # ╞═════╪═════╪═════╡
960
+ # # │ 1 ┆ 2 ┆ 3 │
961
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
962
+ # # │ 1 ┆ 2 ┆ 3 │
963
+ # # └─────┴─────┴─────┘
964
+ #
965
+ # @example Include the header as a separate column
966
+ # df.transpose(
967
+ # include_header: true, header_name: "foo", column_names: ["a", "b", "c"]
968
+ # )
969
+ # # =>
970
+ # # shape: (2, 4)
971
+ # # ┌─────┬─────┬─────┬─────┐
972
+ # # │ foo ┆ a ┆ b ┆ c │
973
+ # # │ --- ┆ --- ┆ --- ┆ --- │
974
+ # # │ str ┆ i64 ┆ i64 ┆ i64 │
975
+ # # ╞═════╪═════╪═════╪═════╡
976
+ # # │ a ┆ 1 ┆ 2 ┆ 3 │
977
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
978
+ # # │ b ┆ 1 ┆ 2 ┆ 3 │
979
+ # # └─────┴─────┴─────┴─────┘
980
def transpose(include_header: false, header_name: "column", column_names: nil)
  df = _from_rbdf(_df.transpose(include_header, header_name))
  return df if column_names.nil?

  # When the header column is included it keeps its name; only the
  # remaining columns are renamed from the supplied enumerable.
  names = include_header ? [header_name] : []
  remaining = df.width - names.length
  supplier = column_names.each
  remaining.times { names << supplier.next }
  df.columns = names
  df
end
998
+
999
+ # Reverse the DataFrame.
1000
+ #
1001
+ # @return [DataFrame]
1002
+ #
1003
+ # @example
1004
+ # df = Polars::DataFrame.new(
1005
+ # {
1006
+ # "key" => ["a", "b", "c"],
1007
+ # "val" => [1, 2, 3]
1008
+ # }
1009
+ # )
1010
+ # df.reverse
1011
+ # # =>
1012
+ # # shape: (3, 2)
1013
+ # # ┌─────┬─────┐
1014
+ # # │ key ┆ val │
1015
+ # # │ --- ┆ --- │
1016
+ # # │ str ┆ i64 │
1017
+ # # ╞═════╪═════╡
1018
+ # # │ c ┆ 3 │
1019
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1020
+ # # │ b ┆ 2 │
1021
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1022
+ # # │ a ┆ 1 │
1023
+ # # └─────┴─────┘
1024
def reverse = select(Polars.col("*").reverse)
1027
+
1028
+ # Rename column names.
1029
+ #
1030
+ # @param mapping [Hash]
1031
+ # Key value pairs that map from old name to new name.
1032
+ #
1033
+ # @return [DataFrame]
1034
+ #
1035
+ # @example
1036
+ # df = Polars::DataFrame.new(
1037
+ # {
1038
+ # "foo" => [1, 2, 3],
1039
+ # "bar" => [6, 7, 8],
1040
+ # "ham" => ["a", "b", "c"]
1041
+ # }
1042
+ # )
1043
+ # df.rename({"foo" => "apple"})
1044
+ # # =>
1045
+ # # shape: (3, 3)
1046
+ # # ┌───────┬─────┬─────┐
1047
+ # # │ apple ┆ bar ┆ ham │
1048
+ # # │ --- ┆ --- ┆ --- │
1049
+ # # │ i64 ┆ i64 ┆ str │
1050
+ # # ╞═══════╪═════╪═════╡
1051
+ # # │ 1 ┆ 6 ┆ a │
1052
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1053
+ # # │ 2 ┆ 7 ┆ b │
1054
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1055
+ # # │ 3 ┆ 8 ┆ c │
1056
+ # # └───────┴─────┴─────┘
1057
def rename(mapping)
  # Delegate to the lazy engine; optimization is skipped for a plain rename.
  lazy.rename(mapping).collect(no_optimization: true)
end
1060
+
1061
+ # Insert a Series at a certain column index. This operation is in place.
1062
+ #
1063
+ # @param index [Integer]
1064
+ # Column to insert the new `Series` column.
1065
+ # @param series [Series]
1066
+ # `Series` to insert.
1067
+ #
1068
+ # @return [DataFrame]
1069
+ #
1070
+ # @example
1071
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
1072
+ # s = Polars::Series.new("baz", [97, 98, 99])
1073
+ # df.insert_at_idx(1, s)
1074
+ # # =>
1075
+ # # shape: (3, 3)
1076
+ # # ┌─────┬─────┬─────┐
1077
+ # # │ foo ┆ baz ┆ bar │
1078
+ # # │ --- ┆ --- ┆ --- │
1079
+ # # │ i64 ┆ i64 ┆ i64 │
1080
+ # # ╞═════╪═════╪═════╡
1081
+ # # │ 1 ┆ 97 ┆ 4 │
1082
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1083
+ # # │ 2 ┆ 98 ┆ 5 │
1084
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1085
+ # # │ 3 ┆ 99 ┆ 6 │
1086
+ # # └─────┴─────┴─────┘
1087
+ #
1088
+ # @example
1089
+ # df = Polars::DataFrame.new(
1090
+ # {
1091
+ # "a" => [1, 2, 3, 4],
1092
+ # "b" => [0.5, 4, 10, 13],
1093
+ # "c" => [true, true, false, true]
1094
+ # }
1095
+ # )
1096
+ # s = Polars::Series.new("d", [-2.5, 15, 20.5, 0])
1097
+ # df.insert_at_idx(3, s)
1098
+ # # =>
1099
+ # # shape: (4, 4)
1100
+ # # ┌─────┬──────┬───────┬──────┐
1101
+ # # │ a ┆ b ┆ c ┆ d │
1102
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1103
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 │
1104
+ # # ╞═════╪══════╪═══════╪══════╡
1105
+ # # │ 1 ┆ 0.5 ┆ true ┆ -2.5 │
1106
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1107
+ # # │ 2 ┆ 4.0 ┆ true ┆ 15.0 │
1108
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1109
+ # # │ 3 ┆ 10.0 ┆ false ┆ 20.5 │
1110
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1111
+ # # │ 4 ┆ 13.0 ┆ true ┆ 0.0 │
1112
+ # # └─────┴──────┴───────┴──────┘
1113
def insert_at_idx(index, series)
  # Normalize a negative position to an absolute column index before
  # handing off to the native frame. Mutates in place; returns self so
  # calls can be chained.
  index += columns.length if index < 0
  _df.insert_at_idx(index, series._s)
  self
end
1120
+
1121
+ # Filter the rows in the DataFrame based on a predicate expression.
1122
+ #
1123
+ # @param predicate [Expr]
1124
+ # Expression that evaluates to a boolean Series.
1125
+ #
1126
+ # @return [DataFrame]
1127
+ #
1128
+ # @example Filter on one condition:
1129
+ # df = Polars::DataFrame.new(
1130
+ # {
1131
+ # "foo" => [1, 2, 3],
1132
+ # "bar" => [6, 7, 8],
1133
+ # "ham" => ["a", "b", "c"]
1134
+ # }
1135
+ # )
1136
+ # df.filter(Polars.col("foo") < 3)
1137
+ # # =>
1138
+ # # shape: (2, 3)
1139
+ # # ┌─────┬─────┬─────┐
1140
+ # # │ foo ┆ bar ┆ ham │
1141
+ # # │ --- ┆ --- ┆ --- │
1142
+ # # │ i64 ┆ i64 ┆ str │
1143
+ # # ╞═════╪═════╪═════╡
1144
+ # # │ 1 ┆ 6 ┆ a │
1145
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1146
+ # # │ 2 ┆ 7 ┆ b │
1147
+ # # └─────┴─────┴─────┘
1148
+ #
1149
+ # @example Filter on multiple conditions:
1150
+ # df.filter((Polars.col("foo") < 3) & (Polars.col("ham") == "a"))
1151
+ # # =>
1152
+ # # shape: (1, 3)
1153
+ # # ┌─────┬─────┬─────┐
1154
+ # # │ foo ┆ bar ┆ ham │
1155
+ # # │ --- ┆ --- ┆ --- │
1156
+ # # │ i64 ┆ i64 ┆ str │
1157
+ # # ╞═════╪═════╪═════╡
1158
+ # # │ 1 ┆ 6 ┆ a │
1159
+ # # └─────┴─────┴─────┘
1160
def filter(predicate)
  # Delegate to the lazy engine and materialize the result eagerly.
  filtered = lazy.filter(predicate)
  filtered.collect
end
1163
+
1164
+ # Summary statistics for a DataFrame.
1165
+ #
1166
+ # @return [DataFrame]
1167
+ #
1168
+ # @example
1169
+ # df = Polars::DataFrame.new(
1170
+ # {
1171
+ # "a" => [1.0, 2.8, 3.0],
1172
+ # "b" => [4, 5, nil],
1173
+ # "c" => [true, false, true],
1174
+ # "d" => [nil, "b", "c"],
1175
+ # "e" => ["usd", "eur", nil]
1176
+ # }
1177
+ # )
1178
+ # df.describe
1179
+ # # =>
1180
+ # # shape: (7, 6)
1181
+ # # ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┐
1182
+ # # │ describe ┆ a ┆ b ┆ c ┆ d ┆ e │
1183
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1184
+ # # │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str │
1185
+ # # ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╡
1186
+ # # │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 │
1187
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1188
+ # # │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 │
1189
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1190
+ # # │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null │
1191
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1192
+ # # │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null │
1193
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1194
+ # # │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur │
1195
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1196
+ # # │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd │
1197
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1198
+ # # │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null │
1199
+ # # └────────────┴──────────┴──────────┴──────────┴──────┴──────┘
1200
def describe
  # Casts every column of a one-row statistic frame to a common type so
  # the per-statistic frames can be vertically concatenated: numeric and
  # boolean columns become :f64, everything else becomes :str.
  describe_cast = lambda do |stat|
    columns = []
    self.columns.each_with_index do |s, i|
      if self[s].is_numeric || self[s].is_boolean
        columns << stat[0.., i].cast(:f64)
      else
        # for dates, strings, etc, we cast to string so that all
        # statistics can be shown
        columns << stat[0.., i].cast(:str)
      end
    end
    self.class.new(columns)
  end

  # Build one single-row frame per statistic (count is synthesized from
  # the frame height) and stack them into a 7-row summary frame.
  summary = _from_rbdf(
    Polars.concat(
      [
        describe_cast.(
          self.class.new(columns.to_h { |c| [c, [height]] })
        ),
        describe_cast.(null_count),
        describe_cast.(mean),
        describe_cast.(std),
        describe_cast.(min),
        describe_cast.(max),
        describe_cast.(median)
      ]
    )._df
  )
  # Prepend a label column naming each statistic row. insert_at_idx
  # mutates `summary` in place; the return value below is the same frame.
  summary.insert_at_idx(
    0,
    Polars::Series.new(
      "describe",
      ["count", "null_count", "mean", "std", "min", "max", "median"],
    )
  )
  summary
end
1239
+
1240
+ # Find the index of a column by name.
1241
+ #
1242
+ # @param name [String]
1243
+ # Name of the column to find.
1244
+ #
1245
+ # @return [Integer]
1246
+ #
1247
+ # @example
1248
+ # df = Polars::DataFrame.new(
1249
+ # {"foo" => [1, 2, 3], "bar" => [6, 7, 8], "ham" => ["a", "b", "c"]}
1250
+ # )
1251
+ # df.find_idx_by_name("ham")
1252
+ # # => 2
1253
def find_idx_by_name(name)
  # Pure delegation to the native frame's column lookup.
  _df.find_idx_by_name(name)
end
1256
+
1257
+ # Replace a column at an index location.
1258
+ #
1259
+ # @param index [Integer]
1260
+ # Column index.
1261
+ # @param series [Series]
1262
+ # Series that will replace the column.
1263
+ #
1264
+ # @return [DataFrame]
1265
+ #
1266
+ # @example
1267
+ # df = Polars::DataFrame.new(
1268
+ # {
1269
+ # "foo" => [1, 2, 3],
1270
+ # "bar" => [6, 7, 8],
1271
+ # "ham" => ["a", "b", "c"]
1272
+ # }
1273
+ # )
1274
+ # s = Polars::Series.new("apple", [10, 20, 30])
1275
+ # df.replace_at_idx(0, s)
1276
+ # # =>
1277
+ # # shape: (3, 3)
1278
+ # # ┌───────┬─────┬─────┐
1279
+ # # │ apple ┆ bar ┆ ham │
1280
+ # # │ --- ┆ --- ┆ --- │
1281
+ # # │ i64 ┆ i64 ┆ str │
1282
+ # # ╞═══════╪═════╪═════╡
1283
+ # # │ 10 ┆ 6 ┆ a │
1284
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1285
+ # # │ 20 ┆ 7 ┆ b │
1286
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1287
+ # # │ 30 ┆ 8 ┆ c │
1288
+ # # └───────┴─────┴─────┘
1289
def replace_at_idx(index, series)
  # Negative positions count back from the last column.
  index += columns.length if index < 0
  # In-place replacement on the native frame; return self for chaining.
  _df.replace_at_idx(index, series._s)
  self
end
1296
+
1297
+ # Sort the DataFrame by column.
1298
+ #
1299
+ # @param by [String]
1300
+ # By which column to sort.
1301
+ # @param reverse [Boolean]
1302
+ # Reverse/descending sort.
1303
+ # @param nulls_last [Boolean]
1304
+ # Place null values last. Can only be used if sorted by a single column.
1305
+ #
1306
+ # @return [DataFrame]
1307
+ #
1308
+ # @example
1309
+ # df = Polars::DataFrame.new(
1310
+ # {
1311
+ # "foo" => [1, 2, 3],
1312
+ # "bar" => [6.0, 7.0, 8.0],
1313
+ # "ham" => ["a", "b", "c"]
1314
+ # }
1315
+ # )
1316
+ # df.sort("foo", reverse: true)
1317
+ # # =>
1318
+ # # shape: (3, 3)
1319
+ # # ┌─────┬─────┬─────┐
1320
+ # # │ foo ┆ bar ┆ ham │
1321
+ # # │ --- ┆ --- ┆ --- │
1322
+ # # │ i64 ┆ f64 ┆ str │
1323
+ # # ╞═════╪═════╪═════╡
1324
+ # # │ 3 ┆ 8.0 ┆ c │
1325
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1326
+ # # │ 2 ┆ 7.0 ┆ b │
1327
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1328
+ # # │ 1 ┆ 6.0 ┆ a │
1329
+ # # └─────┴─────┴─────┘
1330
+ #
1331
+ # @example Sort by multiple columns.
1332
+ # df.sort(
1333
+ # [Polars.col("foo"), Polars.col("bar")**2],
1334
+ # reverse: [true, false]
1335
+ # )
1336
+ # # =>
1337
+ # # shape: (3, 3)
1338
+ # # ┌─────┬─────┬─────┐
1339
+ # # │ foo ┆ bar ┆ ham │
1340
+ # # │ --- ┆ --- ┆ --- │
1341
+ # # │ i64 ┆ f64 ┆ str │
1342
+ # # ╞═════╪═════╪═════╡
1343
+ # # │ 3 ┆ 8.0 ┆ c │
1344
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1345
+ # # │ 2 ┆ 7.0 ┆ b │
1346
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1347
+ # # │ 1 ┆ 6.0 ┆ a │
1348
+ # # └─────┴─────┴─────┘
1349
def sort(by, reverse: false, nulls_last: false)
  # A plain column name can be sorted eagerly on the native frame;
  # arrays and expressions must go through the lazy engine.
  unless by.is_a?(Array) || by.is_a?(Expr)
    return _from_rbdf(_df.sort(by, reverse, nulls_last))
  end

  lazy
    .sort(by, reverse: reverse, nulls_last: nulls_last)
    .collect(no_optimization: true, string_cache: false)
end
1358
+
1359
+ # Check if DataFrame is equal to other.
1360
+ #
1361
+ # @param other [DataFrame]
1362
+ # DataFrame to compare with.
1363
+ # @param null_equal [Boolean]
1364
+ # Consider null values as equal.
1365
+ #
1366
+ # @return [Boolean]
1367
+ #
1368
+ # @example
1369
+ # df1 = Polars::DataFrame.new(
1370
+ # {
1371
+ # "foo" => [1, 2, 3],
1372
+ # "bar" => [6.0, 7.0, 8.0],
1373
+ # "ham" => ["a", "b", "c"]
1374
+ # }
1375
+ # )
1376
+ # df2 = Polars::DataFrame.new(
1377
+ # {
1378
+ # "foo" => [3, 2, 1],
1379
+ # "bar" => [8.0, 7.0, 6.0],
1380
+ # "ham" => ["c", "b", "a"]
1381
+ # }
1382
+ # )
1383
+ # df1.frame_equal(df1)
1384
+ # # => true
1385
+ # df1.frame_equal(df2)
1386
+ # # => false
1387
def frame_equal(other, null_equal: true)
  # Compare this frame's native handle against the other frame's.
  _df.frame_equal(other._df, null_equal)
end
1390
+
1391
+ # Replace a column by a new Series.
1392
+ #
1393
+ # @param column [String]
1394
+ # Column to replace.
1395
+ # @param new_col [Series]
1396
+ # New column to insert.
1397
+ #
1398
+ # @return [DataFrame]
1399
+ #
1400
+ # @example
1401
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
1402
+ # s = Polars::Series.new([10, 20, 30])
1403
+ # df.replace("foo", s)
1404
+ # # =>
1405
+ # # shape: (3, 2)
1406
+ # # ┌─────┬─────┐
1407
+ # # │ foo ┆ bar │
1408
+ # # │ --- ┆ --- │
1409
+ # # │ i64 ┆ i64 │
1410
+ # # ╞═════╪═════╡
1411
+ # # │ 10 ┆ 4 │
1412
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1413
+ # # │ 20 ┆ 5 │
1414
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1415
+ # # │ 30 ┆ 6 │
1416
+ # # └─────┴─────┘
1417
def replace(column, new_col)
  # Mutates the underlying native frame in place; return self so calls
  # can be chained.
  _df.replace(column, new_col._s)
  self
end
1421
+
1422
+ # Get a slice of this DataFrame.
1423
+ #
1424
+ # @param offset [Integer]
1425
+ # Start index. Negative indexing is supported.
1426
+ # @param length [Integer, nil]
1427
+ # Length of the slice. If set to `nil`, all rows starting at the offset
1428
+ # will be selected.
1429
+ #
1430
+ # @return [DataFrame]
1431
+ #
1432
+ # @example
1433
+ # df = Polars::DataFrame.new(
1434
+ # {
1435
+ # "foo" => [1, 2, 3],
1436
+ # "bar" => [6.0, 7.0, 8.0],
1437
+ # "ham" => ["a", "b", "c"]
1438
+ # }
1439
+ # )
1440
+ # df.slice(1, 2)
1441
+ # # =>
1442
+ # # shape: (2, 3)
1443
+ # # ┌─────┬─────┬─────┐
1444
+ # # │ foo ┆ bar ┆ ham │
1445
+ # # │ --- ┆ --- ┆ --- │
1446
+ # # │ i64 ┆ f64 ┆ str │
1447
+ # # ╞═════╪═════╪═════╡
1448
+ # # │ 2 ┆ 7.0 ┆ b │
1449
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1450
+ # # │ 3 ┆ 8.0 ┆ c │
1451
+ # # └─────┴─────┴─────┘
1452
def slice(offset, length = nil)
  # A negative length means "stop that many rows short of the end";
  # translate it into an absolute row count. (`length` is an Integer or
  # nil here, so truthiness is equivalent to a nil check.)
  length = height - offset + length if length && length < 0
  _from_rbdf(_df.slice(offset, length))
end
1458
+
1459
+ # Get the first `n` rows.
1460
+ #
1461
+ # Alias for {#head}.
1462
+ #
1463
+ # @param n [Integer]
1464
+ # Number of rows to return.
1465
+ #
1466
+ # @return [DataFrame]
1467
+ #
1468
+ # @example
1469
+ # df = Polars::DataFrame.new(
1470
+ # {"foo" => [1, 2, 3, 4, 5, 6], "bar" => ["a", "b", "c", "d", "e", "f"]}
1471
+ # )
1472
+ # df.limit(4)
1473
+ # # =>
1474
+ # # shape: (4, 2)
1475
+ # # ┌─────┬─────┐
1476
+ # # │ foo ┆ bar │
1477
+ # # │ --- ┆ --- │
1478
+ # # │ i64 ┆ str │
1479
+ # # ╞═════╪═════╡
1480
+ # # │ 1 ┆ a │
1481
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1482
+ # # │ 2 ┆ b │
1483
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1484
+ # # │ 3 ┆ c │
1485
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1486
+ # # │ 4 ┆ d │
1487
+ # # └─────┴─────┘
1488
def limit(n = 5)
  # Pure alias: delegates to #head.
  head(n)
end
1491
+
1492
+ # Get the first `n` rows.
1493
+ #
1494
+ # @param n [Integer]
1495
+ # Number of rows to return.
1496
+ #
1497
+ # @return [DataFrame]
1498
+ #
1499
+ # @example
1500
+ # df = Polars::DataFrame.new(
1501
+ # {
1502
+ # "foo" => [1, 2, 3, 4, 5],
1503
+ # "bar" => [6, 7, 8, 9, 10],
1504
+ # "ham" => ["a", "b", "c", "d", "e"]
1505
+ # }
1506
+ # )
1507
+ # df.head(3)
1508
+ # # =>
1509
+ # # shape: (3, 3)
1510
+ # # ┌─────┬─────┬─────┐
1511
+ # # │ foo ┆ bar ┆ ham │
1512
+ # # │ --- ┆ --- ┆ --- │
1513
+ # # │ i64 ┆ i64 ┆ str │
1514
+ # # ╞═════╪═════╪═════╡
1515
+ # # │ 1 ┆ 6 ┆ a │
1516
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1517
+ # # │ 2 ┆ 7 ┆ b │
1518
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1519
+ # # │ 3 ┆ 8 ┆ c │
1520
+ # # └─────┴─────┴─────┘
1521
def head(n = 5)
  # Take the first n rows from the native frame and wrap the result.
  rb_df = _df.head(n)
  _from_rbdf(rb_df)
end
1524
+
1525
+ # Get the last `n` rows.
1526
+ #
1527
+ # @param n [Integer]
1528
+ # Number of rows to return.
1529
+ #
1530
+ # @return [DataFrame]
1531
+ #
1532
+ # @example
1533
+ # df = Polars::DataFrame.new(
1534
+ # {
1535
+ # "foo" => [1, 2, 3, 4, 5],
1536
+ # "bar" => [6, 7, 8, 9, 10],
1537
+ # "ham" => ["a", "b", "c", "d", "e"]
1538
+ # }
1539
+ # )
1540
+ # df.tail(3)
1541
+ # # =>
1542
+ # # shape: (3, 3)
1543
+ # # ┌─────┬─────┬─────┐
1544
+ # # │ foo ┆ bar ┆ ham │
1545
+ # # │ --- ┆ --- ┆ --- │
1546
+ # # │ i64 ┆ i64 ┆ str │
1547
+ # # ╞═════╪═════╪═════╡
1548
+ # # │ 3 ┆ 8 ┆ c │
1549
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1550
+ # # │ 4 ┆ 9 ┆ d │
1551
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1552
+ # # │ 5 ┆ 10 ┆ e │
1553
+ # # └─────┴─────┴─────┘
1554
def tail(n = 5)
  # Take the last n rows from the native frame and wrap the result.
  rb_df = _df.tail(n)
  _from_rbdf(rb_df)
end
1557
+
1558
+ # Return a new DataFrame where the null values are dropped.
1559
+ #
1560
+ # @param subset [Object]
1561
+ # Subset of column(s) on which `drop_nulls` will be applied.
1562
+ #
1563
+ # @return [DataFrame]
1564
+ #
1565
+ # @example
1566
+ # df = Polars::DataFrame.new(
1567
+ # {
1568
+ # "foo" => [1, 2, 3],
1569
+ # "bar" => [6, nil, 8],
1570
+ # "ham" => ["a", "b", "c"]
1571
+ # }
1572
+ # )
1573
+ # df.drop_nulls
1574
+ # # =>
1575
+ # # shape: (2, 3)
1576
+ # # ┌─────┬─────┬─────┐
1577
+ # # │ foo ┆ bar ┆ ham │
1578
+ # # │ --- ┆ --- ┆ --- │
1579
+ # # │ i64 ┆ i64 ┆ str │
1580
+ # # ╞═════╪═════╪═════╡
1581
+ # # │ 1 ┆ 6 ┆ a │
1582
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1583
+ # # │ 3 ┆ 8 ┆ c │
1584
+ # # └─────┴─────┴─────┘
1585
def drop_nulls(subset: nil)
  # Accept a single column name as a convenience for a one-element list.
  subset = [subset] if subset.is_a?(String)
  _from_rbdf(_df.drop_nulls(subset))
end
1591
+
1592
+ # Offers a structured way to apply a sequence of user-defined functions (UDFs).
1593
+ #
1594
+ # @param func [Object]
1595
+ # Callable; will receive the frame as the first parameter,
1596
+ # followed by any given args/kwargs.
1597
+ # @param args [Object]
1598
+ # Arguments to pass to the UDF.
1599
+ # @param kwargs [Object]
1600
+ # Keyword arguments to pass to the UDF.
1601
+ #
1602
+ # @return [Object]
1603
+ #
1604
+ # @note
1605
+ # It is recommended to use LazyFrame when piping operations, in order
1606
+ # to fully take advantage of query optimization and parallelization.
1607
+ # See {#lazy}.
1608
+ #
1609
+ # @example
1610
+ # cast_str_to_int = lambda do |data, col_name:|
1611
+ # data.with_column(Polars.col(col_name).cast(:i64))
1612
+ # end
1613
+ #
1614
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]})
1615
+ # df.pipe(cast_str_to_int, col_name: "b")
1616
+ # # =>
1617
+ # # shape: (4, 2)
1618
+ # # ┌─────┬─────┐
1619
+ # # │ a ┆ b │
1620
+ # # │ --- ┆ --- │
1621
+ # # │ i64 ┆ i64 │
1622
+ # # ╞═════╪═════╡
1623
+ # # │ 1 ┆ 10 │
1624
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1625
+ # # │ 2 ┆ 20 │
1626
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1627
+ # # │ 3 ┆ 30 │
1628
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1629
+ # # │ 4 ┆ 40 │
1630
+ # # └─────┴─────┘
1631
def pipe(func, *args, **kwargs, &block)
  # Hand this frame to the callable as its first argument, forwarding
  # any positional/keyword arguments and an optional block untouched.
  func.(self, *args, **kwargs, &block)
end
1634
+
1635
+ # Add a column at index 0 that counts the rows.
1636
+ #
1637
+ # @param name [String]
1638
+ # Name of the column to add.
1639
+ # @param offset [Integer]
1640
+ # Start the row count at this offset.
1641
+ #
1642
+ # @return [DataFrame]
1643
+ #
1644
+ # @example
1645
+ # df = Polars::DataFrame.new(
1646
+ # {
1647
+ # "a" => [1, 3, 5],
1648
+ # "b" => [2, 4, 6]
1649
+ # }
1650
+ # )
1651
+ # df.with_row_count
1652
+ # # =>
1653
+ # # shape: (3, 3)
1654
+ # # ┌────────┬─────┬─────┐
1655
+ # # │ row_nr ┆ a ┆ b │
1656
+ # # │ --- ┆ --- ┆ --- │
1657
+ # # │ u32 ┆ i64 ┆ i64 │
1658
+ # # ╞════════╪═════╪═════╡
1659
+ # # │ 0 ┆ 1 ┆ 2 │
1660
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1661
+ # # │ 1 ┆ 3 ┆ 4 │
1662
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1663
+ # # │ 2 ┆ 5 ┆ 6 │
1664
+ # # └────────┴─────┴─────┘
1665
def with_row_count(name: "row_nr", offset: 0)
  # Delegate to the native frame, then wrap the result.
  counted = _df.with_row_count(name, offset)
  _from_rbdf(counted)
end
1668
+
1669
+ # Start a groupby operation.
1670
+ #
1671
+ # @param by [Object]
1672
+ # Column(s) to group by.
1673
+ # @param maintain_order [Boolean]
1674
+ # Make sure that the order of the groups remain consistent. This is more
1675
+ # expensive than a default groupby. Note that this only works in expression
1676
+ # aggregations.
1677
+ #
1678
+ # @return [GroupBy]
1679
+ #
1680
+ # @example
1681
+ # df = Polars::DataFrame.new(
1682
+ # {
1683
+ # "a" => ["a", "b", "a", "b", "b", "c"],
1684
+ # "b" => [1, 2, 3, 4, 5, 6],
1685
+ # "c" => [6, 5, 4, 3, 2, 1]
1686
+ # }
1687
+ # )
1688
+ # df.groupby("a").agg(Polars.col("b").sum).sort("a")
1689
+ # # =>
1690
+ # # shape: (3, 2)
1691
+ # # ┌─────┬─────┐
1692
+ # # │ a ┆ b │
1693
+ # # │ --- ┆ --- │
1694
+ # # │ str ┆ i64 │
1695
+ # # ╞═════╪═════╡
1696
+ # # │ a ┆ 4 │
1697
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1698
+ # # │ b ┆ 11 │
1699
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1700
+ # # │ c ┆ 6 │
1701
+ # # └─────┴─────┘
1702
def groupby(by, maintain_order: false)
  # Reject non-boolean values early with a descriptive error.
  unless Utils.bool?(maintain_order)
    raise TypeError, "invalid input for groupby arg `maintain_order`: #{maintain_order}."
  end
  # A single column name is treated as a one-element key list.
  by = [by] if by.is_a?(String)
  GroupBy.new(_df, by, self.class, maintain_order: maintain_order)
end
1716
+
1717
+ # Create rolling groups based on a time column.
1718
+ #
1719
+ # Also works for index values of type `:i32` or `:i64`.
1720
+ #
1721
+ # Different from a `dynamic_groupby` the windows are now determined by the
1722
+ # individual values and are not of constant intervals. For constant intervals use
1723
+ # *groupby_dynamic*
1724
+ #
1725
+ # The `period` and `offset` arguments are created either from a timedelta, or
1726
+ # by using the following string language:
1727
+ #
1728
+ # - 1ns (1 nanosecond)
1729
+ # - 1us (1 microsecond)
1730
+ # - 1ms (1 millisecond)
1731
+ # - 1s (1 second)
1732
+ # - 1m (1 minute)
1733
+ # - 1h (1 hour)
1734
+ # - 1d (1 day)
1735
+ # - 1w (1 week)
1736
+ # - 1mo (1 calendar month)
1737
+ # - 1y (1 calendar year)
1738
+ # - 1i (1 index count)
1739
+ #
1740
+ # Or combine them:
1741
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
1742
+ #
1743
+ # In case of a groupby_rolling on an integer column, the windows are defined by:
1744
+ #
1745
+ # - **"1i" # length 1**
1746
+ # - **"10i" # length 10**
1747
+ #
1748
+ # @param index_column [Object]
1749
+ # Column used to group based on the time window.
1750
+ # Often to type Date/Datetime
1751
+ # This column must be sorted in ascending order. If not the output will not
1752
+ # make sense.
1753
+ #
1754
+ # In case of a rolling groupby on indices, dtype needs to be one of
1755
+ # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
1756
+ # performance matters use an `:i64` column.
1757
+ # @param period [Object]
1758
+ # Length of the window.
1759
+ # @param offset [Object]
1760
+ # Offset of the window. Default is -period.
1761
+ # @param closed ["right", "left", "both", "none"]
1762
+ # Define whether the temporal window interval is closed or not.
1763
+ # @param by [Object]
1764
+ # Also group by this column/these columns.
1765
+ #
1766
+ # @return [RollingGroupBy]
1767
+ #
1768
+ # @example
1769
+ # dates = [
1770
+ # "2020-01-01 13:45:48",
1771
+ # "2020-01-01 16:42:13",
1772
+ # "2020-01-01 16:45:09",
1773
+ # "2020-01-02 18:12:48",
1774
+ # "2020-01-03 19:45:32",
1775
+ # "2020-01-08 23:16:43"
1776
+ # ]
1777
+ # df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
1778
+ # Polars.col("dt").str.strptime(:datetime)
1779
+ # )
1780
+ # df.groupby_rolling(index_column: "dt", period: "2d").agg(
1781
+ # [
1782
+ # Polars.sum("a").alias("sum_a"),
1783
+ # Polars.min("a").alias("min_a"),
1784
+ # Polars.max("a").alias("max_a")
1785
+ # ]
1786
+ # )
1787
+ # # =>
1788
+ # # shape: (6, 4)
1789
+ # # ┌─────────────────────┬───────┬───────┬───────┐
1790
+ # # │ dt ┆ sum_a ┆ min_a ┆ max_a │
1791
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1792
+ # # │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │
1793
+ # # ╞═════════════════════╪═══════╪═══════╪═══════╡
1794
+ # # │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │
1795
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1796
+ # # │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │
1797
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1798
+ # # │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │
1799
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1800
+ # # │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │
1801
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1802
+ # # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
1803
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1804
+ # # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
1805
+ # # └─────────────────────┴───────┴───────┴───────┘
1806
def groupby_rolling(index_column:, period:, offset: nil, closed: "right", by: nil)
  # All window semantics live in RollingGroupBy; this method only
  # packages the arguments.
  RollingGroupBy.new(self, index_column, period, offset, closed, by)
end
1815
+
1816
+ # Group based on a time value (or index value of type `:i32`, `:i64`).
1817
+ #
1818
+ # Time windows are calculated and rows are assigned to windows. Different from a
1819
+ # normal groupby is that a row can be member of multiple groups. The time/index
1820
+ # window could be seen as a rolling window, with a window size determined by
1821
+ # dates/times/values instead of slots in the DataFrame.
1822
+ #
1823
+ # A window is defined by:
1824
+ #
1825
+ # - every: interval of the window
1826
+ # - period: length of the window
1827
+ # - offset: offset of the window
1828
+ #
1829
+ # The `every`, `period` and `offset` arguments are created with
1830
+ # the following string language:
1831
+ #
1832
+ # - 1ns (1 nanosecond)
1833
+ # - 1us (1 microsecond)
1834
+ # - 1ms (1 millisecond)
1835
+ # - 1s (1 second)
1836
+ # - 1m (1 minute)
1837
+ # - 1h (1 hour)
1838
+ # - 1d (1 day)
1839
+ # - 1w (1 week)
1840
+ # - 1mo (1 calendar month)
1841
+ # - 1y (1 calendar year)
1842
+ # - 1i (1 index count)
1843
+ #
1844
+ # Or combine them:
1845
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
1846
+ #
1847
+ # In case of a groupby_dynamic on an integer column, the windows are defined by:
1848
+ #
1849
+ # - "1i" # length 1
1850
+ # - "10i" # length 10
1851
+ #
1852
+ # @param index_column
1853
+ # Column used to group based on the time window.
1854
+ # Often to type Date/Datetime
1855
+ # This column must be sorted in ascending order. If not the output will not
1856
+ # make sense.
1857
+ #
1858
+ # In case of a dynamic groupby on indices, dtype needs to be one of
1859
+ # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
1860
+ # performance matters use an `:i64` column.
1861
+ # @param every
1862
+ # Interval of the window.
1863
+ # @param period
1864
+ # Length of the window, if None it is equal to 'every'.
1865
+ # @param offset
1866
+ # Offset of the window if None and period is None it will be equal to negative
1867
+ # `every`.
1868
+ # @param truncate
1869
+ # Truncate the time value to the window lower bound.
1870
+ # @param include_boundaries
1871
+ # Add the lower and upper bound of the window to the "_lower_bound" and
1872
+ # "_upper_bound" columns. This will impact performance because it's harder to
1873
+ # parallelize
1874
+ # @param closed ["right", "left", "both", "none"]
1875
+ # Define whether the temporal window interval is closed or not.
1876
+ # @param by
1877
+ # Also group by this column/these columns
1878
+ #
1879
+ # @return [DynamicGroupBy]
1880
+ #
1881
+ # @example
1882
+ # df = Polars::DataFrame.new(
1883
+ # {
1884
+ # "time" => Polars.date_range(
1885
+ # DateTime.new(2021, 12, 16),
1886
+ # DateTime.new(2021, 12, 16, 3),
1887
+ # "30m"
1888
+ # ),
1889
+ # "n" => 0..6
1890
+ # }
1891
+ # )
1892
+ # # =>
1893
+ # # shape: (7, 2)
1894
+ # # ┌─────────────────────┬─────┐
1895
+ # # │ time ┆ n │
1896
+ # # │ --- ┆ --- │
1897
+ # # │ datetime[μs] ┆ i64 │
1898
+ # # ╞═════════════════════╪═════╡
1899
+ # # │ 2021-12-16 00:00:00 ┆ 0 │
1900
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1901
+ # # │ 2021-12-16 00:30:00 ┆ 1 │
1902
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1903
+ # # │ 2021-12-16 01:00:00 ┆ 2 │
1904
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1905
+ # # │ 2021-12-16 01:30:00 ┆ 3 │
1906
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1907
+ # # │ 2021-12-16 02:00:00 ┆ 4 │
1908
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1909
+ # # │ 2021-12-16 02:30:00 ┆ 5 │
1910
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1911
+ # # │ 2021-12-16 03:00:00 ┆ 6 │
1912
+ # # └─────────────────────┴─────┘
1913
+ #
1914
+ # @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
1915
+ # df.groupby_dynamic("time", every: "1h", closed: "right").agg(
1916
+ # [
1917
+ # Polars.col("time").min.alias("time_min"),
1918
+ # Polars.col("time").max.alias("time_max")
1919
+ # ]
1920
+ # )
1921
+ # # =>
1922
+ # # shape: (4, 3)
1923
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┐
1924
+ # # │ time ┆ time_min ┆ time_max │
1925
+ # # │ --- ┆ --- ┆ --- │
1926
+ # # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │
1927
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╡
1928
+ # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │
1929
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1930
+ # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │
1931
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1932
+ # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │
1933
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1934
+ # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │
1935
+ # # └─────────────────────┴─────────────────────┴─────────────────────┘
1936
+ #
1937
+ # @example The window boundaries can also be added to the aggregation result.
1938
+ # df.groupby_dynamic(
1939
+ # "time", every: "1h", include_boundaries: true, closed: "right"
1940
+ # ).agg([Polars.col("time").count.alias("time_count")])
1941
+ # # =>
1942
+ # # shape: (4, 4)
1943
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
1944
+ # # │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
1945
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1946
+ # # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
1947
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
1948
+ # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
1949
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1950
+ # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │
1951
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1952
+ # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
1953
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1954
+ # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
1955
+ # # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
1956
+ #
1957
+ # @example When closed="left", should not include right end of interval.
1958
+ # df.groupby_dynamic("time", every: "1h", closed: "left").agg(
1959
+ # [
1960
+ # Polars.col("time").count.alias("time_count"),
1961
+ # Polars.col("time").list.alias("time_agg_list")
1962
+ # ]
1963
+ # )
1964
+ # # =>
1965
+ # # shape: (4, 3)
1966
+ # # ┌─────────────────────┬────────────┬─────────────────────────────────────┐
1967
+ # # │ time ┆ time_count ┆ time_agg_list │
1968
+ # # │ --- ┆ --- ┆ --- │
1969
+ # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
1970
+ # # ╞═════════════════════╪════════════╪═════════════════════════════════════╡
1971
+ # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16... │
1972
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1973
+ # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16... │
1974
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1975
+ # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16... │
1976
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1977
+ # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
1978
+ # # └─────────────────────┴────────────┴─────────────────────────────────────┘
1979
+ #
1980
+ # @example When closed="both" the time values at the window boundaries belong to 2 groups.
1981
+ # df.groupby_dynamic("time", every: "1h", closed: "both").agg(
1982
+ # [Polars.col("time").count.alias("time_count")]
1983
+ # )
1984
+ # # =>
1985
+ # # shape: (5, 2)
1986
+ # # ┌─────────────────────┬────────────┐
1987
+ # # │ time ┆ time_count │
1988
+ # # │ --- ┆ --- │
1989
+ # # │ datetime[μs] ┆ u32 │
1990
+ # # ╞═════════════════════╪════════════╡
1991
+ # # │ 2021-12-15 23:00:00 ┆ 1 │
1992
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1993
+ # # │ 2021-12-16 00:00:00 ┆ 3 │
1994
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1995
+ # # │ 2021-12-16 01:00:00 ┆ 3 │
1996
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1997
+ # # │ 2021-12-16 02:00:00 ┆ 3 │
1998
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1999
+ # # │ 2021-12-16 03:00:00 ┆ 1 │
2000
+ # # └─────────────────────┴────────────┘
2001
+ #
2002
+ # @example Dynamic groupbys can also be combined with grouping on normal keys.
2003
+ # df = Polars::DataFrame.new(
2004
+ # {
2005
+ # "time" => Polars.date_range(
2006
+ # DateTime.new(2021, 12, 16),
2007
+ # DateTime.new(2021, 12, 16, 3),
2008
+ # "30m"
2009
+ # ),
2010
+ # "groups" => ["a", "a", "a", "b", "b", "a", "a"]
2011
+ # }
2012
+ # )
2013
+ # df.groupby_dynamic(
2014
+ # "time",
2015
+ # every: "1h",
2016
+ # closed: "both",
2017
+ # by: "groups",
2018
+ # include_boundaries: true
2019
+ # ).agg([Polars.col("time").count.alias("time_count")])
2020
+ # # =>
2021
+ # # shape: (7, 5)
2022
+ # # ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
2023
+ # # │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
2024
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
2025
+ # # │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
2026
+ # # ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
2027
+ # # │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
2028
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
2029
+ # # │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │
2030
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
2031
+ # # │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │
2032
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
2033
+ # # │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
2034
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
2035
+ # # │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │
2036
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
2037
+ # # │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
2038
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
2039
+ # # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
2040
+ # # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
2041
+ #
2042
+ # @example Dynamic groupby on an index column.
2043
+ # df = Polars::DataFrame.new(
2044
+ # {
2045
+ # "idx" => Polars.arange(0, 6, eager: true),
2046
+ # "A" => ["A", "A", "B", "B", "B", "C"]
2047
+ # }
2048
+ # )
2049
+ # df.groupby_dynamic(
2050
+ # "idx",
2051
+ # every: "2i",
2052
+ # period: "3i",
2053
+ # include_boundaries: true,
2054
+ # closed: "right"
2055
+ # ).agg(Polars.col("A").list.alias("A_agg_list"))
2056
+ # # =>
2057
+ # # shape: (3, 4)
2058
+ # # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
2059
+ # # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
2060
+ # # │ --- ┆ --- ┆ --- ┆ --- │
2061
+ # # │ i64 ┆ i64 ┆ i64 ┆ list[str] │
2062
+ # # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
2063
+ # # │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
2064
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
2065
+ # # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
2066
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
2067
+ # # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
2068
+ # # └─────────────────┴─────────────────┴─────┴─────────────────┘
2069
def groupby_dynamic(
  index_column,
  every:,
  period: nil,
  offset: nil,
  truncate: true,
  include_boundaries: false,
  closed: "left",
  by: nil,
  start_by: "window"
)
  # All window configuration is handed straight to the helper object,
  # which drives the lazy engine when an aggregation is requested.
  DynamicGroupBy.new(
    self, index_column, every, period, offset,
    truncate, include_boundaries, closed, by, start_by
  )
end
2093
+
2094
+ # Upsample a DataFrame at a regular frequency.
2095
+ #
2096
+ # @param time_column [Object]
2097
+ # time column will be used to determine a date_range.
2098
+ # Note that this column has to be sorted for the output to make sense.
2099
+ # @param every [String]
2100
+ # interval will start 'every' duration
2101
+ # @param offset [String]
2102
+ # change the start of the date_range by this offset.
2103
+ # @param by [Object]
2104
+ # First group by these columns and then upsample for every group
2105
+ # @param maintain_order [Boolean]
2106
+ # Keep the ordering predictable. This is slower.
2107
+ #
2108
+ # The `every` and `offset` arguments are created with
2109
+ # the following string language:
2110
+ #
2111
+ # - 1ns (1 nanosecond)
2112
+ # - 1us (1 microsecond)
2113
+ # - 1ms (1 millisecond)
2114
+ # - 1s (1 second)
2115
+ # - 1m (1 minute)
2116
+ # - 1h (1 hour)
2117
+ # - 1d (1 day)
2118
+ # - 1w (1 week)
2119
+ # - 1mo (1 calendar month)
2120
+ # - 1y (1 calendar year)
2121
+ # - 1i (1 index count)
2122
+ #
2123
+ # Or combine them:
2124
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
2125
+ #
2126
+ # @return [DataFrame]
2127
+ #
2128
+ # @example Upsample a DataFrame by a certain interval.
2129
+ # df = Polars::DataFrame.new(
2130
+ # {
2131
+ # "time" => [
2132
+ # DateTime.new(2021, 2, 1),
2133
+ # DateTime.new(2021, 4, 1),
2134
+ # DateTime.new(2021, 5, 1),
2135
+ # DateTime.new(2021, 6, 1)
2136
+ # ],
2137
+ # "groups" => ["A", "B", "A", "B"],
2138
+ # "values" => [0, 1, 2, 3]
2139
+ # }
2140
+ # )
2141
+ # df.upsample(
2142
+ # time_column: "time", every: "1mo", by: "groups", maintain_order: true
2143
+ # ).select(Polars.all.forward_fill)
2144
+ # # =>
2145
+ # # shape: (7, 3)
2146
+ # # ┌─────────────────────┬────────┬────────┐
2147
+ # # │ time ┆ groups ┆ values │
2148
+ # # │ --- ┆ --- ┆ --- │
2149
+ # # │ datetime[ns] ┆ str ┆ i64 │
2150
+ # # ╞═════════════════════╪════════╪════════╡
2151
+ # # │ 2021-02-01 00:00:00 ┆ A ┆ 0 │
2152
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
2153
+ # # │ 2021-03-01 00:00:00 ┆ A ┆ 0 │
2154
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
2155
+ # # │ 2021-04-01 00:00:00 ┆ A ┆ 0 │
2156
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
2157
+ # # │ 2021-05-01 00:00:00 ┆ A ┆ 2 │
2158
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
2159
+ # # │ 2021-04-01 00:00:00 ┆ B ┆ 1 │
2160
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
2161
+ # # │ 2021-05-01 00:00:00 ┆ B ┆ 1 │
2162
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
2163
+ # # │ 2021-06-01 00:00:00 ┆ B ┆ 3 │
2164
+ # # └─────────────────────┴────────┴────────┘
2165
def upsample(
  time_column:,
  every:,
  offset: nil,
  by: nil,
  maintain_order: false
)
  # Normalize optional arguments: no grouping columns, zero offset.
  by = [] if by.nil?
  by = [by] if by.is_a?(String)
  offset = "0ns" if offset.nil?

  # Convert duration objects to the polars duration string language.
  every = Utils._timedelta_to_pl_duration(every)
  offset = Utils._timedelta_to_pl_duration(offset)

  _from_rbdf(_df.upsample(by, time_column, every, offset, maintain_order))
end
2189
+
2190
+ # Perform an asof join.
2191
+ #
2192
+ # This is similar to a left-join except that we match on nearest key rather than
2193
+ # equal keys.
2194
+ #
2195
+ # Both DataFrames must be sorted by the asof_join key.
2196
+ #
2197
+ # For each row in the left DataFrame:
2198
+ #
2199
+ # - A "backward" search selects the last row in the right DataFrame whose 'on' key is less than or equal to the left's key.
2200
+ # - A "forward" search selects the first row in the right DataFrame whose 'on' key is greater than or equal to the left's key.
2201
+ #
2202
+ # The default is "backward".
2203
+ #
2204
+ # @param other [DataFrame]
2205
+ # DataFrame to join with.
2206
+ # @param left_on [String]
2207
+ # Join column of the left DataFrame.
2208
+ # @param right_on [String]
2209
+ # Join column of the right DataFrame.
2210
+ # @param on [String]
2211
+ # Join column of both DataFrames. If set, `left_on` and `right_on` should be
2212
+ # None.
2213
+ # @param by [Object]
2214
+ # join on these columns before doing asof join
2215
+ # @param by_left [Object]
2216
+ # join on these columns before doing asof join
2217
+ # @param by_right [Object]
2218
+ # join on these columns before doing asof join
2219
+ # @param strategy ["backward", "forward"]
2220
+ # Join strategy.
2221
+ # @param suffix [String]
2222
+ # Suffix to append to columns with a duplicate name.
2223
+ # @param tolerance [Object]
2224
+ # Numeric tolerance. By setting this the join will only be done if the near
2225
+ # keys are within this distance. If an asof join is done on columns of dtype
2226
+ # "Date", "Datetime", "Duration" or "Time" you use the following string
2227
+ # language:
2228
+ #
2229
+ # - 1ns (1 nanosecond)
2230
+ # - 1us (1 microsecond)
2231
+ # - 1ms (1 millisecond)
2232
+ # - 1s (1 second)
2233
+ # - 1m (1 minute)
2234
+ # - 1h (1 hour)
2235
+ # - 1d (1 day)
2236
+ # - 1w (1 week)
2237
+ # - 1mo (1 calendar month)
2238
+ # - 1y (1 calendar year)
2239
+ # - 1i (1 index count)
2240
+ #
2241
+ # Or combine them:
2242
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
2243
+ #
2244
+ # @param allow_parallel [Boolean]
2245
+ # Allow the physical plan to optionally evaluate the computation of both
2246
+ # DataFrames up to the join in parallel.
2247
+ # @param force_parallel [Boolean]
2248
+ # Force the physical plan to evaluate the computation of both DataFrames up to
2249
+ # the join in parallel.
2250
+ #
2251
+ # @return [DataFrame]
2252
+ #
2253
+ # @example
2254
+ # gdp = Polars::DataFrame.new(
2255
+ # {
2256
+ # "date" => [
2257
+ # DateTime.new(2016, 1, 1),
2258
+ # DateTime.new(2017, 1, 1),
2259
+ # DateTime.new(2018, 1, 1),
2260
+ # DateTime.new(2019, 1, 1),
2261
+ # ], # note record date: Jan 1st (sorted!)
2262
+ # "gdp" => [4164, 4411, 4566, 4696]
2263
+ # }
2264
+ # )
2265
+ # population = Polars::DataFrame.new(
2266
+ # {
2267
+ # "date" => [
2268
+ # DateTime.new(2016, 5, 12),
2269
+ # DateTime.new(2017, 5, 12),
2270
+ # DateTime.new(2018, 5, 12),
2271
+ # DateTime.new(2019, 5, 12),
2272
+ # ], # note record date: May 12th (sorted!)
2273
+ # "population" => [82.19, 82.66, 83.12, 83.52]
2274
+ # }
2275
+ # )
2276
+ # population.join_asof(
2277
+ # gdp, left_on: "date", right_on: "date", strategy: "backward"
2278
+ # )
2279
+ # # =>
2280
+ # # shape: (4, 3)
2281
+ # # ┌─────────────────────┬────────────┬──────┐
2282
+ # # │ date ┆ population ┆ gdp │
2283
+ # # │ --- ┆ --- ┆ --- │
2284
+ # # │ datetime[ns] ┆ f64 ┆ i64 │
2285
+ # # ╞═════════════════════╪════════════╪══════╡
2286
+ # # │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │
2287
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2288
+ # # │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │
2289
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2290
+ # # │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │
2291
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2292
+ # # │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │
2293
+ # # └─────────────────────┴────────────┴──────┘
2294
def join_asof(
  other,
  left_on: nil,
  right_on: nil,
  on: nil,
  by_left: nil,
  by_right: nil,
  by: nil,
  strategy: "backward",
  suffix: "_right",
  tolerance: nil,
  allow_parallel: true,
  force_parallel: false
)
  # Delegate to the lazy engine, then materialize immediately.
  # Optimizations are disabled because the plan is a single join node.
  joined = lazy.join_asof(
    other.lazy,
    left_on: left_on,
    right_on: right_on,
    on: on,
    by_left: by_left,
    by_right: by_right,
    by: by,
    strategy: strategy,
    suffix: suffix,
    tolerance: tolerance,
    allow_parallel: allow_parallel,
    force_parallel: force_parallel
  )
  joined.collect(no_optimization: true)
end
2325
+
2326
+ # Join in SQL-like fashion.
2327
+ #
2328
+ # @param other [DataFrame]
2329
+ # DataFrame to join with.
2330
+ # @param left_on [Object]
2331
+ # Name(s) of the left join column(s).
2332
+ # @param right_on [Object]
2333
+ # Name(s) of the right join column(s).
2334
+ # @param on [Object]
2335
+ # Name(s) of the join columns in both DataFrames.
2336
+ # @param how ["inner", "left", "outer", "semi", "anti", "cross"]
2337
+ # Join strategy.
2338
+ # @param suffix [String]
2339
+ # Suffix to append to columns with a duplicate name.
2340
+ #
2341
+ # @return [DataFrame]
2342
+ #
2343
+ # @example
2344
+ # df = Polars::DataFrame.new(
2345
+ # {
2346
+ # "foo" => [1, 2, 3],
2347
+ # "bar" => [6.0, 7.0, 8.0],
2348
+ # "ham" => ["a", "b", "c"]
2349
+ # }
2350
+ # )
2351
+ # other_df = Polars::DataFrame.new(
2352
+ # {
2353
+ # "apple" => ["x", "y", "z"],
2354
+ # "ham" => ["a", "b", "d"]
2355
+ # }
2356
+ # )
2357
+ # df.join(other_df, on: "ham")
2358
+ # # =>
2359
+ # # shape: (2, 4)
2360
+ # # ┌─────┬─────┬─────┬───────┐
2361
+ # # │ foo ┆ bar ┆ ham ┆ apple │
2362
+ # # │ --- ┆ --- ┆ --- ┆ --- │
2363
+ # # │ i64 ┆ f64 ┆ str ┆ str │
2364
+ # # ╞═════╪═════╪═════╪═══════╡
2365
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
2366
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2367
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
2368
+ # # └─────┴─────┴─────┴───────┘
2369
+ #
2370
+ # @example
2371
+ # df.join(other_df, on: "ham", how: "outer")
2372
+ # # =>
2373
+ # # shape: (4, 4)
2374
+ # # ┌──────┬──────┬─────┬───────┐
2375
+ # # │ foo ┆ bar ┆ ham ┆ apple │
2376
+ # # │ --- ┆ --- ┆ --- ┆ --- │
2377
+ # # │ i64 ┆ f64 ┆ str ┆ str │
2378
+ # # ╞══════╪══════╪═════╪═══════╡
2379
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
2380
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2381
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
2382
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2383
+ # # │ null ┆ null ┆ d ┆ z │
2384
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2385
+ # # │ 3 ┆ 8.0 ┆ c ┆ null │
2386
+ # # └──────┴──────┴─────┴───────┘
2387
+ #
2388
+ # @example
2389
+ # df.join(other_df, on: "ham", how: "left")
2390
+ # # =>
2391
+ # # shape: (3, 4)
2392
+ # # ┌─────┬─────┬─────┬───────┐
2393
+ # # │ foo ┆ bar ┆ ham ┆ apple │
2394
+ # # │ --- ┆ --- ┆ --- ┆ --- │
2395
+ # # │ i64 ┆ f64 ┆ str ┆ str │
2396
+ # # ╞═════╪═════╪═════╪═══════╡
2397
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
2398
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2399
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
2400
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2401
+ # # │ 3 ┆ 8.0 ┆ c ┆ null │
2402
+ # # └─────┴─────┴─────┴───────┘
2403
+ #
2404
+ # @example
2405
+ # df.join(other_df, on: "ham", how: "semi")
2406
+ # # =>
2407
+ # # shape: (2, 3)
2408
+ # # ┌─────┬─────┬─────┐
2409
+ # # │ foo ┆ bar ┆ ham │
2410
+ # # │ --- ┆ --- ┆ --- │
2411
+ # # │ i64 ┆ f64 ┆ str │
2412
+ # # ╞═════╪═════╪═════╡
2413
+ # # │ 1 ┆ 6.0 ┆ a │
2414
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
2415
+ # # │ 2 ┆ 7.0 ┆ b │
2416
+ # # └─────┴─────┴─────┘
2417
+ #
2418
+ # @example
2419
+ # df.join(other_df, on: "ham", how: "anti")
2420
+ # # =>
2421
+ # # shape: (1, 3)
2422
+ # # ┌─────┬─────┬─────┐
2423
+ # # │ foo ┆ bar ┆ ham │
2424
+ # # │ --- ┆ --- ┆ --- │
2425
+ # # │ i64 ┆ f64 ┆ str │
2426
+ # # ╞═════╪═════╪═════╡
2427
+ # # │ 3 ┆ 8.0 ┆ c │
2428
+ # # └─────┴─────┴─────┘
2429
# Delegates to the lazy engine and collects eagerly. The trailing comma
# after the last keyword argument was dropped for consistency with
# `join_asof` and the rest of the file.
def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
  lazy
    .join(
      other.lazy,
      left_on: left_on,
      right_on: right_on,
      on: on,
      how: how,
      suffix: suffix
    )
    .collect(no_optimization: true)
end
2441
+
2442
+ # Apply a custom/user-defined function (UDF) over the rows of the DataFrame.
2443
+ #
2444
+ # The UDF will receive each row as a tuple of values: `udf(row)`.
2445
+ #
2446
+ # Implementing logic using a Ruby function is almost always _significantly_
2447
+ # slower and more memory intensive than implementing the same logic using
2448
+ # the native expression API because:
2449
+ #
2450
+ # - The native expression engine runs in Rust; UDFs run in Ruby.
2451
+ # - Use of Ruby UDFs forces the DataFrame to be materialized in memory.
2452
+ # - Polars-native expressions can be parallelised (UDFs cannot).
2453
+ # - Polars-native expressions can be logically optimised (UDFs cannot).
2454
+ #
2455
+ # Wherever possible you should strongly prefer the native expression API
2456
+ # to achieve the best performance.
2457
+ #
2458
+ # @param return_dtype [Symbol]
2459
+ # Output type of the operation. If none given, Polars tries to infer the type.
2460
+ # @param inference_size [Integer]
2461
+ # Only used in the case when the custom function returns rows.
2462
+ # This uses the first `n` rows to determine the output schema
2463
+ #
2464
+ # @return [Object]
2465
+ #
2466
+ # @note
2467
+ # The frame-level `apply` cannot track column names (as the UDF is a black-box
2468
+ # that may arbitrarily drop, rearrange, transform, or add new columns); if you
2469
+ # want to apply a UDF such that column names are preserved, you should use the
2470
+ # expression-level `apply` syntax instead.
2471
+ #
2472
+ # @example
2473
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [-1, 5, 8]})
2474
+ #
2475
+ # @example Return a DataFrame by mapping each row to a tuple:
2476
+ # df.apply { |t| [t[0] * 2, t[1] * 3] }
2477
+ # # =>
2478
+ # # shape: (3, 2)
2479
+ # # ┌──────────┬──────────┐
2480
+ # # │ column_0 ┆ column_1 │
2481
+ # # │ --- ┆ --- │
2482
+ # # │ i64 ┆ i64 │
2483
+ # # ╞══════════╪══════════╡
2484
+ # # │ 2 ┆ -3 │
2485
+ # # ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
2486
+ # # │ 4 ┆ 15 │
2487
+ # # ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
2488
+ # # │ 6 ┆ 24 │
2489
+ # # └──────────┴──────────┘
2490
+ #
2491
+ # @example Return a Series by mapping each row to a scalar:
2492
+ # df.apply { |t| t[0] * 2 + t[1] }
2493
+ # # =>
2494
+ # # shape: (3, 1)
2495
+ # # ┌───────┐
2496
+ # # │ apply │
2497
+ # # │ --- │
2498
+ # # │ i64 │
2499
+ # # ╞═══════╡
2500
+ # # │ 1 │
2501
+ # # ├╌╌╌╌╌╌╌┤
2502
+ # # │ 9 │
2503
+ # # ├╌╌╌╌╌╌╌┤
2504
+ # # │ 14 │
2505
+ # # └───────┘
2506
def apply(return_dtype: nil, inference_size: 256, &f)
  # The native call reports whether the UDF produced a frame or a series.
  result, produced_frame = _df.apply(f, return_dtype, inference_size)
  return _from_rbdf(result) if produced_frame

  # A scalar-per-row UDF yields a Series; promote it to a one-column frame.
  _from_rbdf(Utils.wrap_s(result).to_frame._df)
end
2514
+
2515
+ # Return a new DataFrame with the column added or replaced.
2516
+ #
2517
+ # @param column [Object]
2518
+ # Series, where the name of the Series refers to the column in the DataFrame.
2519
+ #
2520
+ # @return [DataFrame]
2521
+ #
2522
+ # @example Added
2523
+ # df = Polars::DataFrame.new(
2524
+ # {
2525
+ # "a" => [1, 3, 5],
2526
+ # "b" => [2, 4, 6]
2527
+ # }
2528
+ # )
2529
+ # df.with_column((Polars.col("b") ** 2).alias("b_squared"))
2530
+ # # =>
2531
+ # # shape: (3, 3)
2532
+ # # ┌─────┬─────┬───────────┐
2533
+ # # │ a ┆ b ┆ b_squared │
2534
+ # # │ --- ┆ --- ┆ --- │
2535
+ # # │ i64 ┆ i64 ┆ f64 │
2536
+ # # ╞═════╪═════╪═══════════╡
2537
+ # # │ 1 ┆ 2 ┆ 4.0 │
2538
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
2539
+ # # │ 3 ┆ 4 ┆ 16.0 │
2540
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
2541
+ # # │ 5 ┆ 6 ┆ 36.0 │
2542
+ # # └─────┴─────┴───────────┘
2543
+ #
2544
+ # @example Replaced
2545
+ # df.with_column(Polars.col("a") ** 2)
2546
+ # # =>
2547
+ # # shape: (3, 2)
2548
+ # # ┌──────┬─────┐
2549
+ # # │ a ┆ b │
2550
+ # # │ --- ┆ --- │
2551
+ # # │ f64 ┆ i64 │
2552
+ # # ╞══════╪═════╡
2553
+ # # │ 1.0 ┆ 2 │
2554
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
2555
+ # # │ 9.0 ┆ 4 │
2556
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
2557
+ # # │ 25.0 ┆ 6 │
2558
+ # # └──────┴─────┘
2559
def with_column(column)
  # Single-expression projection via the lazy engine; collected eagerly.
  lazy.with_column(column).collect(no_optimization: true, string_cache: false)
end
2564
+
2565
+ # Return a new DataFrame grown horizontally by stacking multiple Series to it.
2566
+ #
2567
+ # @param columns [Object]
2568
+ # Series to stack.
2569
+ # @param in_place [Boolean]
2570
+ # Modify in place.
2571
+ #
2572
+ # @return [DataFrame]
2573
+ #
2574
+ # @example
2575
+ # df = Polars::DataFrame.new(
2576
+ # {
2577
+ # "foo" => [1, 2, 3],
2578
+ # "bar" => [6, 7, 8],
2579
+ # "ham" => ["a", "b", "c"]
2580
+ # }
2581
+ # )
2582
+ # x = Polars::Series.new("apple", [10, 20, 30])
2583
+ # df.hstack([x])
2584
+ # # =>
2585
+ # # shape: (3, 4)
2586
+ # # ┌─────┬─────┬─────┬───────┐
2587
+ # # │ foo ┆ bar ┆ ham ┆ apple │
2588
+ # # │ --- ┆ --- ┆ --- ┆ --- │
2589
+ # # │ i64 ┆ i64 ┆ str ┆ i64 │
2590
+ # # ╞═════╪═════╪═════╪═══════╡
2591
+ # # │ 1 ┆ 6 ┆ a ┆ 10 │
2592
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2593
+ # # │ 2 ┆ 7 ┆ b ┆ 20 │
2594
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2595
+ # # │ 3 ┆ 8 ┆ c ┆ 30 │
2596
+ # # └─────┴─────┴─────┴───────┘
2597
def hstack(columns, in_place: false)
  # Accept another DataFrame as well as an array of Series.
  columns = columns.get_columns unless columns.is_a?(Array)
  rb_series = columns.map(&:_s)

  if in_place
    _df.hstack_mut(rb_series)
    self
  else
    _from_rbdf(_df.hstack(rb_series))
  end
end
2608
+
2609
+ # Grow this DataFrame vertically by stacking a DataFrame to it.
2610
+ #
2611
+ # @param df [DataFrame]
2612
+ # DataFrame to stack.
2613
+ # @param in_place [Boolean]
2614
+ # Modify in place
2615
+ #
2616
+ # @return [DataFrame]
2617
+ #
2618
+ # @example
2619
+ # df1 = Polars::DataFrame.new(
2620
+ # {
2621
+ # "foo" => [1, 2],
2622
+ # "bar" => [6, 7],
2623
+ # "ham" => ["a", "b"]
2624
+ # }
2625
+ # )
2626
+ # df2 = Polars::DataFrame.new(
2627
+ # {
2628
+ # "foo" => [3, 4],
2629
+ # "bar" => [8, 9],
2630
+ # "ham" => ["c", "d"]
2631
+ # }
2632
+ # )
2633
+ # df1.vstack(df2)
2634
+ # # =>
2635
+ # # shape: (4, 3)
2636
+ # # ┌─────┬─────┬─────┐
2637
+ # # │ foo ┆ bar ┆ ham │
2638
+ # # │ --- ┆ --- ┆ --- │
2639
+ # # │ i64 ┆ i64 ┆ str │
2640
+ # # ╞═════╪═════╪═════╡
2641
+ # # │ 1 ┆ 6 ┆ a │
2642
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
2643
+ # # │ 2 ┆ 7 ┆ b │
2644
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
2645
+ # # │ 3 ┆ 8 ┆ c │
2646
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
2647
+ # # │ 4 ┆ 9 ┆ d │
2648
+ # # └─────┴─────┴─────┘
2649
def vstack(df, in_place: false)
  # Out-of-place: build a brand-new frame from the stacked chunks.
  return _from_rbdf(_df.vstack(df._df)) unless in_place

  # In-place: mutate the underlying frame and return self for chaining.
  _df.vstack_mut(df._df)
  self
end
2657
+
2658
+ # Extend the memory backed by this `DataFrame` with the values from `other`.
2659
+ #
2660
+ # Different from `vstack` which adds the chunks from `other` to the chunks of this
2661
+ # `DataFrame` `extend` appends the data from `other` to the underlying memory
2662
+ # locations and thus may cause a reallocation.
2663
+ #
2664
+ # If this does not cause a reallocation, the resulting data structure will not
2665
+ # have any extra chunks and thus will yield faster queries.
2666
+ #
2667
+ # Prefer `extend` over `vstack` when you want to do a query after a single append.
2668
+ # For instance during online operations where you add `n` rows and rerun a query.
2669
+ #
2670
+ # Prefer `vstack` over `extend` when you want to append many times before doing a
2671
+ # query. For instance when you read in multiple files and when to store them in a
2672
+ # single `DataFrame`. In the latter case, finish the sequence of `vstack`
2673
+ # operations with a `rechunk`.
2674
+ #
2675
+ # @param other [DataFrame]
2676
+ # DataFrame to vertically add.
2677
+ #
2678
+ # @return [DataFrame]
2679
+ #
2680
+ # @example
2681
+ # df1 = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
2682
+ # df2 = Polars::DataFrame.new({"foo" => [10, 20, 30], "bar" => [40, 50, 60]})
2683
+ # df1.extend(df2)
2684
+ # # =>
2685
+ # # shape: (6, 2)
2686
+ # # ┌─────┬─────┐
2687
+ # # │ foo ┆ bar │
2688
+ # # │ --- ┆ --- │
2689
+ # # │ i64 ┆ i64 │
2690
+ # # ╞═════╪═════╡
2691
+ # # │ 1 ┆ 4 │
2692
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
2693
+ # # │ 2 ┆ 5 │
2694
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
2695
+ # # │ 3 ┆ 6 │
2696
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
2697
+ # # │ 10 ┆ 40 │
2698
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
2699
+ # # │ 20 ┆ 50 │
2700
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
2701
+ # # │ 30 ┆ 60 │
2702
+ # # └─────┴─────┘
2703
def extend(other)
  # Mutates the underlying frame; tap returns self so calls can chain.
  tap { _df.extend(other._df) }
end
2707
+
2708
+ # Remove column from DataFrame and return as new.
2709
+ #
2710
+ # @param columns [Object]
2711
+ # Column(s) to drop.
2712
+ #
2713
+ # @return [DataFrame]
2714
+ #
2715
+ # @example
2716
+ # df = Polars::DataFrame.new(
2717
+ # {
2718
+ # "foo" => [1, 2, 3],
2719
+ # "bar" => [6.0, 7.0, 8.0],
2720
+ # "ham" => ["a", "b", "c"]
2721
+ # }
2722
+ # )
2723
+ # df.drop("ham")
2724
+ # # =>
2725
+ # # shape: (3, 2)
2726
+ # # ┌─────┬─────┐
2727
+ # # │ foo ┆ bar │
2728
+ # # │ --- ┆ --- │
2729
+ # # │ i64 ┆ f64 │
2730
+ # # ╞═════╪═════╡
2731
+ # # │ 1 ┆ 6.0 │
2732
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
2733
+ # # │ 2 ┆ 7.0 │
2734
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
2735
+ # # │ 3 ┆ 8.0 │
2736
+ # # └─────┴─────┘
2737
def drop(columns)
  # Single column name: the native drop returns a new frame directly.
  return _from_rbdf(_df.drop(columns)) unless columns.is_a?(Array)

  # Several columns: remove them one at a time from a copy.
  columns.each_with_object(clone) do |name, df|
    df._df.drop_in_place(name)
  end
end
2748
+
2749
+ # Drop in place.
2750
+ #
2751
+ # @param name [Object]
2752
+ # Column to drop.
2753
+ #
2754
+ # @return [Series]
2755
+ #
2756
+ # @example
2757
+ # df = Polars::DataFrame.new(
2758
+ # {
2759
+ # "foo" => [1, 2, 3],
2760
+ # "bar" => [6, 7, 8],
2761
+ # "ham" => ["a", "b", "c"]
2762
+ # }
2763
+ # )
2764
+ # df.drop_in_place("ham")
2765
+ # # =>
2766
+ # # shape: (3,)
2767
+ # # Series: 'ham' [str]
2768
+ # # [
2769
+ # # "a"
2770
+ # # "b"
2771
+ # # "c"
2772
+ # # ]
2773
def drop_in_place(name)
  # The native call removes the column and hands back the raw series.
  removed = _df.drop_in_place(name)
  Utils.wrap_s(removed)
end
2776
+
2777
+ # Create an empty copy of the current DataFrame.
2778
+ #
2779
+ # Returns a DataFrame with identical schema but no data.
2780
+ #
2781
+ # @return [DataFrame]
2782
+ #
2783
+ # @example
2784
+ # df = Polars::DataFrame.new(
2785
+ # {
2786
+ # "a" => [nil, 2, 3, 4],
2787
+ # "b" => [0.5, nil, 2.5, 13],
2788
+ # "c" => [true, true, false, nil]
2789
+ # }
2790
+ # )
2791
+ # df.cleared
2792
+ # # =>
2793
+ # # shape: (0, 3)
2794
+ # # ┌─────┬─────┬──────┐
2795
+ # # │ a ┆ b ┆ c │
2796
+ # # │ --- ┆ --- ┆ --- │
2797
+ # # │ i64 ┆ f64 ┆ bool │
2798
+ # # ╞═════╪═════╪══════╡
2799
+ # # └─────┴─────┴──────┘
2800
def cleared
  # An already-empty frame is simply cloned; otherwise head(0)
  # produces a zero-row frame with an identical schema.
  return clone if height.zero?

  head(0)
end
2803
+
2804
+ # clone handled by initialize_copy
2805
+
2806
+ # Get the DataFrame as a Array of Series.
2807
+ #
2808
+ # @return [Array]
2809
def get_columns
  # Wrap each raw native series in the Ruby-level Series class.
  _df.get_columns.map { |rb_s| Utils.wrap_s(rb_s) }
end
2812
+
2813
+ # Get a single column as Series by name.
2814
+ #
2815
+ # @param name [String]
2816
+ # Name of the column to retrieve.
2817
+ #
2818
+ # @return [Series]
2819
+ #
2820
+ # @example
2821
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
2822
+ # df.get_column("foo")
2823
+ # # =>
2824
+ # # shape: (3,)
2825
+ # # Series: 'foo' [i64]
2826
+ # # [
2827
+ # # 1
2828
+ # # 2
2829
+ # # 3
2830
+ # # ]
2831
def get_column(name)
  # Plain delegation to the indexing operator.
  self[name]
end
2834
+
2835
+ # Fill null values using the specified value or strategy.
2836
+ #
2837
+ # @param value [Numeric]
2838
+ # Value used to fill null values.
2839
+ # @param strategy [nil, "forward", "backward", "min", "max", "mean", "zero", "one"]
2840
+ # Strategy used to fill null values.
2841
+ # @param limit [Integer]
2842
+ # Number of consecutive null values to fill when using the 'forward' or
2843
+ # 'backward' strategy.
2844
+ # @param matches_supertype [Boolean]
2845
+ # Fill all matching supertype of the fill `value`.
2846
+ #
2847
+ # @return [DataFrame]
2848
+ #
2849
+ # @example
2850
+ # df = Polars::DataFrame.new(
2851
+ # {
2852
+ # "a" => [1, 2, nil, 4],
2853
+ # "b" => [0.5, 4, nil, 13]
2854
+ # }
2855
+ # )
2856
+ # df.fill_null(99)
2857
+ # # =>
2858
+ # # shape: (4, 2)
2859
+ # # ┌─────┬──────┐
2860
+ # # │ a ┆ b │
2861
+ # # │ --- ┆ --- │
2862
+ # # │ i64 ┆ f64 │
2863
+ # # ╞═════╪══════╡
2864
+ # # │ 1 ┆ 0.5 │
2865
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
2866
+ # # │ 2 ┆ 4.0 │
2867
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
2868
+ # # │ 99 ┆ 99.0 │
2869
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
2870
+ # # │ 4 ┆ 13.0 │
2871
+ # # └─────┴──────┘
2872
+ #
2873
+ # @example
2874
+ # df.fill_null(strategy: "forward")
2875
+ # # =>
2876
+ # # shape: (4, 2)
2877
+ # # ┌─────┬──────┐
2878
+ # # │ a ┆ b │
2879
+ # # │ --- ┆ --- │
2880
+ # # │ i64 ┆ f64 │
2881
+ # # ╞═════╪══════╡
2882
+ # # │ 1 ┆ 0.5 │
2883
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
2884
+ # # │ 2 ┆ 4.0 │
2885
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
2886
+ # # │ 2 ┆ 4.0 │
2887
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
2888
+ # # │ 4 ┆ 13.0 │
2889
+ # # └─────┴──────┘
2890
+ #
2891
+ # @example
2892
+ # df.fill_null(strategy: "max")
2893
+ # # =>
2894
+ # # shape: (4, 2)
2895
+ # # ┌─────┬──────┐
2896
+ # # │ a ┆ b │
2897
+ # # │ --- ┆ --- │
2898
+ # # │ i64 ┆ f64 │
2899
+ # # ╞═════╪══════╡
2900
+ # # │ 1 ┆ 0.5 │
2901
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
2902
+ # # │ 2 ┆ 4.0 │
2903
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
2904
+ # # │ 4 ┆ 13.0 │
2905
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
2906
+ # # │ 4 ┆ 13.0 │
2907
+ # # └─────┴──────┘
2908
+ #
2909
+ # @example
2910
+ # df.fill_null(strategy: "zero")
2911
+ # # =>
2912
+ # # shape: (4, 2)
2913
+ # # ┌─────┬──────┐
2914
+ # # │ a ┆ b │
2915
+ # # │ --- ┆ --- │
2916
+ # # │ i64 ┆ f64 │
2917
+ # # ╞═════╪══════╡
2918
+ # # │ 1 ┆ 0.5 │
2919
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
2920
+ # # │ 2 ┆ 4.0 │
2921
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
2922
+ # # │ 0 ┆ 0.0 │
2923
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
2924
+ # # │ 4 ┆ 13.0 │
2925
+ # # └─────┴──────┘
2926
# Delegates to the lazy engine. `collect` already yields a DataFrame, so
# the previous `_from_rbdf(... ._df)` unwrap/re-wrap round-trip was
# redundant; this now matches the style of #fill_nan and #explode.
def fill_null(value = nil, strategy: nil, limit: nil, matches_supertype: true)
  lazy
    .fill_null(value, strategy: strategy, limit: limit, matches_supertype: matches_supertype)
    .collect(no_optimization: true)
end
2934
+
2935
+ # Fill floating point NaN values by an Expression evaluation.
2936
+ #
2937
+ # @param fill_value [Object]
2938
+ # Value to fill NaN with.
2939
+ #
2940
+ # @return [DataFrame]
2941
+ #
2942
+ # @note
2943
+ # Note that floating point NaNs (Not a Number) are not missing values!
2944
+ # To replace missing values, use `fill_null`.
2945
+ #
2946
+ # @example
2947
+ # df = Polars::DataFrame.new(
2948
+ # {
2949
+ # "a" => [1.5, 2, Float::NAN, 4],
2950
+ # "b" => [0.5, 4, Float::NAN, 13]
2951
+ # }
2952
+ # )
2953
+ # df.fill_nan(99)
2954
+ # # =>
2955
+ # # shape: (4, 2)
2956
+ # # ┌──────┬──────┐
2957
+ # # │ a ┆ b │
2958
+ # # │ --- ┆ --- │
2959
+ # # │ f64 ┆ f64 │
2960
+ # # ╞══════╪══════╡
2961
+ # # │ 1.5 ┆ 0.5 │
2962
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2963
+ # # │ 2.0 ┆ 4.0 │
2964
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2965
+ # # │ 99.0 ┆ 99.0 │
2966
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2967
+ # # │ 4.0 ┆ 13.0 │
2968
+ # # └──────┴──────┘
2969
def fill_nan(fill_value)
  # NaN is a valid float value, distinct from null (see #fill_null).
  ldf = lazy.fill_nan(fill_value)
  ldf.collect(no_optimization: true)
end
2972
+
2973
+ # Explode `DataFrame` to long format by exploding a column with Lists.
2974
+ #
2975
+ # @param columns [Object]
2976
+ # Column of LargeList type.
2977
+ #
2978
+ # @return [DataFrame]
2979
+ #
2980
+ # @example
2981
+ # df = Polars::DataFrame.new(
2982
+ # {
2983
+ # "letters" => ["a", "a", "b", "c"],
2984
+ # "numbers" => [[1], [2, 3], [4, 5], [6, 7, 8]]
2985
+ # }
2986
+ # )
2987
+ # df.explode("numbers")
2988
+ # # =>
2989
+ # # shape: (8, 2)
2990
+ # # ┌─────────┬─────────┐
2991
+ # # │ letters ┆ numbers │
2992
+ # # │ --- ┆ --- │
2993
+ # # │ str ┆ i64 │
2994
+ # # ╞═════════╪═════════╡
2995
+ # # │ a ┆ 1 │
2996
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2997
+ # # │ a ┆ 2 │
2998
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2999
+ # # │ a ┆ 3 │
3000
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
3001
+ # # │ b ┆ 4 │
3002
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
3003
+ # # │ b ┆ 5 │
3004
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
3005
+ # # │ c ┆ 6 │
3006
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
3007
+ # # │ c ┆ 7 │
3008
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
3009
+ # # │ c ┆ 8 │
3010
+ # # └─────────┴─────────┘
3011
def explode(columns)
  # One row per list element, routed through the lazy engine.
  lazy
    .explode(columns)
    .collect(no_optimization: true)
end
3014
+
3015
+ # Create a spreadsheet-style pivot table as a DataFrame.
3016
+ #
3017
+ # @param values [Object]
3018
+ # Column values to aggregate. Can be multiple columns if the *columns*
3019
+ # arguments contains multiple columns as well
3020
+ # @param index [Object]
3021
+ # One or multiple keys to group by
3022
+ # @param columns [Object]
3023
+ # Columns whose values will be used as the header of the output DataFrame
3024
+ # @param aggregate_fn ["first", "sum", "max", "min", "mean", "median", "last", "count"]
3025
+ # A predefined aggregate function str or an expression.
3026
+ # @param maintain_order [Object]
3027
+ # Sort the grouped keys so that the output order is predictable.
3028
+ # @param sort_columns [Object]
3029
+ # Sort the transposed columns by name. Default is by order of discovery.
3030
+ #
3031
+ # @return [DataFrame]
3032
+ #
3033
+ # @example
3034
+ # df = Polars::DataFrame.new(
3035
+ # {
3036
+ # "foo" => ["one", "one", "one", "two", "two", "two"],
3037
+ # "bar" => ["A", "B", "C", "A", "B", "C"],
3038
+ # "baz" => [1, 2, 3, 4, 5, 6]
3039
+ # }
3040
+ # )
3041
+ # df.pivot(values: "baz", index: "foo", columns: "bar")
3042
+ # # =>
3043
+ # # shape: (2, 4)
3044
+ # # ┌─────┬─────┬─────┬─────┐
3045
+ # # │ foo ┆ A ┆ B ┆ C │
3046
+ # # │ --- ┆ --- ┆ --- ┆ --- │
3047
+ # # │ str ┆ i64 ┆ i64 ┆ i64 │
3048
+ # # ╞═════╪═════╪═════╪═════╡
3049
+ # # │ one ┆ 1 ┆ 2 ┆ 3 │
3050
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
3051
+ # # │ two ┆ 4 ┆ 5 ┆ 6 │
3052
+ # # └─────┴─────┴─────┴─────┘
3053
def pivot(
  values:,
  index:,
  columns:,
  aggregate_fn: "first",
  maintain_order: true,
  sort_columns: false
)
  # Normalize the three column selectors to arrays.
  values = [values] if values.is_a?(String)
  index = [index] if index.is_a?(String)
  columns = [columns] if columns.is_a?(String)

  # Map the predefined aggregation names to their expressions. All but
  # "count" are element-wise methods, so dispatch by name.
  if aggregate_fn.is_a?(String)
    aggregate_fn =
      case aggregate_fn
      when "first", "sum", "max", "min", "mean", "median", "last"
        Polars.element.public_send(aggregate_fn)
      when "count"
        Polars.count
      else
        raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
      end
  end

  _from_rbdf(
    _df.pivot_expr(
      values,
      index,
      columns,
      aggregate_fn._rbexpr,
      maintain_order,
      sort_columns
    )
  )
end
3105
+
3106
+ # Unpivot a DataFrame from wide to long format.
3107
+ #
3108
+ # Optionally leaves identifiers set.
3109
+ #
3110
+ # This function is useful to massage a DataFrame into a format where one or more
3111
+ # columns are identifier variables (id_vars), while all other columns, considered
3112
+ # measured variables (value_vars), are "unpivoted" to the row axis, leaving just
3113
+ # two non-identifier columns, 'variable' and 'value'.
3114
+ #
3115
+ # @param id_vars [Object]
3116
+ # Columns to use as identifier variables.
3117
+ # @param value_vars [Object]
3118
+ # Values to use as identifier variables.
3119
+ # If `value_vars` is empty all columns that are not in `id_vars` will be used.
3120
+ # @param variable_name [String]
3121
+ # Name to give to the `variable` column. Defaults to "variable"
3122
+ # @param value_name [String]
3123
+ # Name to give to the `value` column. Defaults to "value"
3124
+ #
3125
+ # @return [DataFrame]
3126
+ #
3127
+ # @example
3128
+ # df = Polars::DataFrame.new(
3129
+ # {
3130
+ # "a" => ["x", "y", "z"],
3131
+ # "b" => [1, 3, 5],
3132
+ # "c" => [2, 4, 6]
3133
+ # }
3134
+ # )
3135
+ # df.melt(id_vars: "a", value_vars: ["b", "c"])
3136
+ # # =>
3137
+ # # shape: (6, 3)
3138
+ # # ┌─────┬──────────┬───────┐
3139
+ # # │ a ┆ variable ┆ value │
3140
+ # # │ --- ┆ --- ┆ --- │
3141
+ # # │ str ┆ str ┆ i64 │
3142
+ # # ╞═════╪══════════╪═══════╡
3143
+ # # │ x ┆ b ┆ 1 │
3144
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
3145
+ # # │ y ┆ b ┆ 3 │
3146
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
3147
+ # # │ z ┆ b ┆ 5 │
3148
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
3149
+ # # │ x ┆ c ┆ 2 │
3150
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
3151
+ # # │ y ┆ c ┆ 4 │
3152
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
3153
+ # # │ z ┆ c ┆ 6 │
3154
+ # # └─────┴──────────┴───────┘
3155
# Unpivot a DataFrame from wide to long format.
#
# @return [DataFrame]
def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
  # A bare column name becomes a one-element list; nil becomes an empty
  # list (meaning "all remaining columns" for value_vars).
  id_vars = [id_vars] if id_vars.is_a?(String)
  value_vars = [value_vars] if value_vars.is_a?(String)
  id_vars ||= []
  value_vars ||= []

  _from_rbdf(_df.melt(id_vars, value_vars, value_name, variable_name))
end
3172
+
3173
+ # Unstack a long table to a wide form without doing an aggregation.
3174
+ #
3175
+ # This can be much faster than a pivot, because it can skip the grouping phase.
3176
+ #
3177
+ # @note
3178
+ # This functionality is experimental and may be subject to changes
3179
+ # without it being considered a breaking change.
3180
+ #
3181
+ # @param step [Integer]
3182
+ # Number of rows in the unstacked frame.
3183
+ # @param how ["vertical", "horizontal"]
3184
+ # Direction of the unstack.
3185
+ # @param columns [Object]
3186
+ # Column to include in the operation.
3187
+ # @param fill_values [Object]
3188
+ # Fill values that don't fit the new size with this value.
3189
+ #
3190
+ # @return [DataFrame]
3191
+ #
3192
+ # @example
3193
+ # df = Polars::DataFrame.new(
3194
+ # {
3195
+ # "col1" => "A".."I",
3196
+ # "col2" => Polars.arange(0, 9, eager: true)
3197
+ # }
3198
+ # )
3199
+ # # =>
3200
+ # # shape: (9, 2)
3201
+ # # ┌──────┬──────┐
3202
+ # # │ col1 ┆ col2 │
3203
+ # # │ --- ┆ --- │
3204
+ # # │ str ┆ i64 │
3205
+ # # ╞══════╪══════╡
3206
+ # # │ A ┆ 0 │
3207
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3208
+ # # │ B ┆ 1 │
3209
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3210
+ # # │ C ┆ 2 │
3211
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3212
+ # # │ D ┆ 3 │
3213
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3214
+ # # │ ... ┆ ... │
3215
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3216
+ # # │ F ┆ 5 │
3217
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3218
+ # # │ G ┆ 6 │
3219
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3220
+ # # │ H ┆ 7 │
3221
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3222
+ # # │ I ┆ 8 │
3223
+ # # └──────┴──────┘
3224
+ #
3225
+ # @example
3226
+ # df.unstack(step: 3, how: "vertical")
3227
+ # # =>
3228
+ # # shape: (3, 6)
3229
+ # # ┌────────┬────────┬────────┬────────┬────────┬────────┐
3230
+ # # │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │
3231
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3232
+ # # │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │
3233
+ # # ╞════════╪════════╪════════╪════════╪════════╪════════╡
3234
+ # # │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │
3235
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
3236
+ # # │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │
3237
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
3238
+ # # │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │
3239
+ # # └────────┴────────┴────────┴────────┴────────┴────────┘
3240
+ #
3241
+ # @example
3242
+ # df.unstack(step: 3, how: "horizontal")
3243
+ # # =>
3244
+ # # shape: (3, 6)
3245
+ # # ┌────────┬────────┬────────┬────────┬────────┬────────┐
3246
+ # # │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │
3247
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3248
+ # # │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │
3249
+ # # ╞════════╪════════╪════════╪════════╪════════╪════════╡
3250
+ # # │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │
3251
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
3252
+ # # │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │
3253
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
3254
+ # # │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │
3255
+ # # └────────┴────────┴────────┴────────┴────────┴────────┘
3256
# Unstack a long table to a wide form without doing an aggregation.
def unstack(step:, how: "vertical", columns: nil, fill_values: nil)
  # Work on a projection when a column subset is requested.
  if !columns.nil?
    df = select(columns)
  else
    df = self
  end

  # `step` fixes one grid dimension; derive the other from the height.
  height = df.height
  if how == "vertical"
    n_rows = step
    n_cols = (height / n_rows.to_f).ceil
  else
    n_cols = step
    n_rows = (height / n_cols.to_f).ceil
  end

  # Pad each column so the data fills the n_cols * n_rows grid exactly.
  n_fill = n_cols * n_rows - height

  if n_fill > 0
    # A scalar fill value is reused for every column.
    if !fill_values.is_a?(Array)
      fill_values = [fill_values] * df.width
    end

    df = df.select(
      df.get_columns.zip(fill_values).map do |s, next_fill|
        s.extend_constant(next_fill, n_fill)
      end
    )
  end

  # For a horizontal unstack, reorder rows so the vertical slicing below
  # produces row-major output; the helper column is dropped afterwards.
  if how == "horizontal"
    df = (
      df.with_column(
        (Polars.arange(0, n_cols * n_rows, eager: true) % n_cols).alias(
          "__sort_order"
        )
      )
      .sort("__sort_order")
      .drop("__sort_order")
    )
  end

  # Zero-pad the slice index so generated column names sort lexicographically.
  zfill_val = Math.log10(n_cols).floor + 1
  slices =
    df.get_columns.flat_map do |s|
      n_cols.times.map do |slice_nbr|
        s.slice(slice_nbr * n_rows, n_rows).alias("%s_%0#{zfill_val}d" % [s.name, slice_nbr])
      end
    end

  _from_rbdf(DataFrame.new(slices)._df)
end
3308
+
3309
+ # Split into multiple DataFrames partitioned by groups.
3310
+ #
3311
+ # @param groups [Object]
3312
+ # Groups to partition by.
3313
+ # @param maintain_order [Boolean]
3314
+ # Keep predictable output order. This is slower as it requires an extra sort
3315
+ # operation.
3316
+ # @param as_dict [Boolean]
3317
+ # If true, return the partitions in a dictionary keyed by the distinct group
3318
+ # values instead of a list.
3319
+ #
3320
+ # @return [Object]
3321
+ #
3322
+ # @example
3323
+ # df = Polars::DataFrame.new(
3324
+ # {
3325
+ # "foo" => ["A", "A", "B", "B", "C"],
3326
+ # "N" => [1, 2, 2, 4, 2],
3327
+ # "bar" => ["k", "l", "m", "m", "l"]
3328
+ # }
3329
+ # )
3330
+ # df.partition_by("foo", maintain_order: true)
3331
+ # # =>
3332
+ # # [shape: (2, 3)
3333
+ # # ┌─────┬─────┬─────┐
3334
+ # # │ foo ┆ N ┆ bar │
3335
+ # # │ --- ┆ --- ┆ --- │
3336
+ # # │ str ┆ i64 ┆ str │
3337
+ # # ╞═════╪═════╪═════╡
3338
+ # # │ A ┆ 1 ┆ k │
3339
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
3340
+ # # │ A ┆ 2 ┆ l │
3341
+ # # └─────┴─────┴─────┘, shape: (2, 3)
3342
+ # # ┌─────┬─────┬─────┐
3343
+ # # │ foo ┆ N ┆ bar │
3344
+ # # │ --- ┆ --- ┆ --- │
3345
+ # # │ str ┆ i64 ┆ str │
3346
+ # # ╞═════╪═════╪═════╡
3347
+ # # │ B ┆ 2 ┆ m │
3348
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
3349
+ # # │ B ┆ 4 ┆ m │
3350
+ # # └─────┴─────┴─────┘, shape: (1, 3)
3351
+ # # ┌─────┬─────┬─────┐
3352
+ # # │ foo ┆ N ┆ bar │
3353
+ # # │ --- ┆ --- ┆ --- │
3354
+ # # │ str ┆ i64 ┆ str │
3355
+ # # ╞═════╪═════╪═════╡
3356
+ # # │ C ┆ 2 ┆ l │
3357
+ # # └─────┴─────┴─────┘]
3358
+ #
3359
+ # @example
3360
+ # df.partition_by("foo", maintain_order: true, as_dict: true)
3361
+ # # =>
3362
+ # # {"A"=>shape: (2, 3)
3363
+ # # ┌─────┬─────┬─────┐
3364
+ # # │ foo ┆ N ┆ bar │
3365
+ # # │ --- ┆ --- ┆ --- │
3366
+ # # │ str ┆ i64 ┆ str │
3367
+ # # ╞═════╪═════╪═════╡
3368
+ # # │ A ┆ 1 ┆ k │
3369
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
3370
+ # # │ A ┆ 2 ┆ l │
3371
+ # # └─────┴─────┴─────┘, "B"=>shape: (2, 3)
3372
+ # # ┌─────┬─────┬─────┐
3373
+ # # │ foo ┆ N ┆ bar │
3374
+ # # │ --- ┆ --- ┆ --- │
3375
+ # # │ str ┆ i64 ┆ str │
3376
+ # # ╞═════╪═════╪═════╡
3377
+ # # │ B ┆ 2 ┆ m │
3378
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
3379
+ # # │ B ┆ 4 ┆ m │
3380
+ # # └─────┴─────┴─────┘, "C"=>shape: (1, 3)
3381
+ # # ┌─────┬─────┬─────┐
3382
+ # # │ foo ┆ N ┆ bar │
3383
+ # # │ --- ┆ --- ┆ --- │
3384
+ # # │ str ┆ i64 ┆ str │
3385
+ # # ╞═════╪═════╪═════╡
3386
+ # # │ C ┆ 2 ┆ l │
3387
+ # # └─────┴─────┴─────┘}
3388
# Split into multiple DataFrames partitioned by groups.
#
# @return [Object] an Array of DataFrames, or a Hash when `as_dict` is true.
def partition_by(groups, maintain_order: true, as_dict: false)
  # Normalize the group specification to an array of column names.
  if groups.is_a?(String)
    groups = [groups]
  elsif !groups.is_a?(Array)
    groups = Array(groups)
  end

  partitions = _df.partition_by(groups, maintain_order).map { |df| _from_rbdf(df) }
  return partitions unless as_dict

  # Key by the scalar group value for a single group column, otherwise by
  # the tuple of group values from the first row.
  if groups.length == 1
    partitions.to_h { |df| [df[groups][0, 0], df] }
  else
    partitions.to_h { |df| [df[groups].row(0), df] }
  end
end
3413
+
3414
+ # Shift values by the given period.
3415
+ #
3416
+ # @param periods [Integer]
3417
+ # Number of places to shift (may be negative).
3418
+ #
3419
+ # @return [DataFrame]
3420
+ #
3421
+ # @example
3422
+ # df = Polars::DataFrame.new(
3423
+ # {
3424
+ # "foo" => [1, 2, 3],
3425
+ # "bar" => [6, 7, 8],
3426
+ # "ham" => ["a", "b", "c"]
3427
+ # }
3428
+ # )
3429
+ # df.shift(1)
3430
+ # # =>
3431
+ # # shape: (3, 3)
3432
+ # # ┌──────┬──────┬──────┐
3433
+ # # │ foo ┆ bar ┆ ham │
3434
+ # # │ --- ┆ --- ┆ --- │
3435
+ # # │ i64 ┆ i64 ┆ str │
3436
+ # # ╞══════╪══════╪══════╡
3437
+ # # │ null ┆ null ┆ null │
3438
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3439
+ # # │ 1 ┆ 6 ┆ a │
3440
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3441
+ # # │ 2 ┆ 7 ┆ b │
3442
+ # # └──────┴──────┴──────┘
3443
+ #
3444
+ # @example
3445
+ # df.shift(-1)
3446
+ # # =>
3447
+ # # shape: (3, 3)
3448
+ # # ┌──────┬──────┬──────┐
3449
+ # # │ foo ┆ bar ┆ ham │
3450
+ # # │ --- ┆ --- ┆ --- │
3451
+ # # │ i64 ┆ i64 ┆ str │
3452
+ # # ╞══════╪══════╪══════╡
3453
+ # # │ 2 ┆ 7 ┆ b │
3454
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3455
+ # # │ 3 ┆ 8 ┆ c │
3456
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3457
+ # # │ null ┆ null ┆ null │
3458
+ # # └──────┴──────┴──────┘
3459
# Shift values by the given period (negative shifts backwards).
def shift(periods)
  shifted = _df.shift(periods)
  _from_rbdf(shifted)
end
3462
+
3463
+ # Shift the values by a given period and fill the resulting null values.
3464
+ #
3465
+ # @param periods [Integer]
3466
+ # Number of places to shift (may be negative).
3467
+ # @param fill_value [Object]
3468
+ # fill nil values with this value.
3469
+ #
3470
+ # @return [DataFrame]
3471
+ #
3472
+ # @example
3473
+ # df = Polars::DataFrame.new(
3474
+ # {
3475
+ # "foo" => [1, 2, 3],
3476
+ # "bar" => [6, 7, 8],
3477
+ # "ham" => ["a", "b", "c"]
3478
+ # }
3479
+ # )
3480
+ # df.shift_and_fill(1, 0)
3481
+ # # =>
3482
+ # # shape: (3, 3)
3483
+ # # ┌─────┬─────┬─────┐
3484
+ # # │ foo ┆ bar ┆ ham │
3485
+ # # │ --- ┆ --- ┆ --- │
3486
+ # # │ i64 ┆ i64 ┆ str │
3487
+ # # ╞═════╪═════╪═════╡
3488
+ # # │ 0 ┆ 0 ┆ 0 │
3489
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
3490
+ # # │ 1 ┆ 6 ┆ a │
3491
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
3492
+ # # │ 2 ┆ 7 ┆ b │
3493
+ # # └─────┴─────┴─────┘
3494
# Shift the values by a given period and fill the resulting null values.
def shift_and_fill(periods, fill_value)
  # Delegate to the lazy engine and materialize immediately.
  shifted = lazy.shift_and_fill(periods, fill_value)
  shifted.collect(no_optimization: true, string_cache: false)
end
3499
+
3500
+ # Get a mask of all duplicated rows in this DataFrame.
3501
+ #
3502
+ # @return [Series]
3503
+ #
3504
+ # @example
3505
+ # df = Polars::DataFrame.new(
3506
+ # {
3507
+ # "a" => [1, 2, 3, 1],
3508
+ # "b" => ["x", "y", "z", "x"],
3509
+ # }
3510
+ # )
3511
+ # df.is_duplicated
3512
+ # # =>
3513
+ # # shape: (4,)
3514
+ # # Series: '' [bool]
3515
+ # # [
3516
+ # # true
3517
+ # # false
3518
+ # # false
3519
+ # # true
3520
+ # # ]
3521
# Get a boolean mask of all duplicated rows in this DataFrame.
def is_duplicated
  mask = _df.is_duplicated
  Utils.wrap_s(mask)
end
3524
+
3525
+ # Get a mask of all unique rows in this DataFrame.
3526
+ #
3527
+ # @return [Series]
3528
+ #
3529
+ # @example
3530
+ # df = Polars::DataFrame.new(
3531
+ # {
3532
+ # "a" => [1, 2, 3, 1],
3533
+ # "b" => ["x", "y", "z", "x"]
3534
+ # }
3535
+ # )
3536
+ # df.is_unique
3537
+ # # =>
3538
+ # # shape: (4,)
3539
+ # # Series: '' [bool]
3540
+ # # [
3541
+ # # false
3542
+ # # true
3543
+ # # true
3544
+ # # false
3545
+ # # ]
3546
# Get a boolean mask of all unique rows in this DataFrame.
def is_unique
  mask = _df.is_unique
  Utils.wrap_s(mask)
end
3549
+
3550
+ # Start a lazy query from this point.
3551
+ #
3552
+ # @return [LazyFrame]
3553
# Start a lazy query from this point.
#
# @return [LazyFrame]
def lazy
  ldf = _df.lazy
  wrap_ldf(ldf)
end
3556
+
3557
+ # Select columns from this DataFrame.
3558
+ #
3559
+ # @param exprs [Object]
3560
+ # Column or columns to select.
3561
+ #
3562
+ # @return [DataFrame]
3563
+ #
3564
+ # @example
3565
+ # df = Polars::DataFrame.new(
3566
+ # {
3567
+ # "foo" => [1, 2, 3],
3568
+ # "bar" => [6, 7, 8],
3569
+ # "ham" => ["a", "b", "c"]
3570
+ # }
3571
+ # )
3572
+ # df.select("foo")
3573
+ # # =>
3574
+ # # shape: (3, 1)
3575
+ # # ┌─────┐
3576
+ # # │ foo │
3577
+ # # │ --- │
3578
+ # # │ i64 │
3579
+ # # ╞═════╡
3580
+ # # │ 1 │
3581
+ # # ├╌╌╌╌╌┤
3582
+ # # │ 2 │
3583
+ # # ├╌╌╌╌╌┤
3584
+ # # │ 3 │
3585
+ # # └─────┘
3586
+ #
3587
+ # @example
3588
+ # df.select(["foo", "bar"])
3589
+ # # =>
3590
+ # # shape: (3, 2)
3591
+ # # ┌─────┬─────┐
3592
+ # # │ foo ┆ bar │
3593
+ # # │ --- ┆ --- │
3594
+ # # │ i64 ┆ i64 │
3595
+ # # ╞═════╪═════╡
3596
+ # # │ 1 ┆ 6 │
3597
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
3598
+ # # │ 2 ┆ 7 │
3599
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
3600
+ # # │ 3 ┆ 8 │
3601
+ # # └─────┴─────┘
3602
+ #
3603
+ # @example
3604
+ # df.select(Polars.col("foo") + 1)
3605
+ # # =>
3606
+ # # shape: (3, 1)
3607
+ # # ┌─────┐
3608
+ # # │ foo │
3609
+ # # │ --- │
3610
+ # # │ i64 │
3611
+ # # ╞═════╡
3612
+ # # │ 2 │
3613
+ # # ├╌╌╌╌╌┤
3614
+ # # │ 3 │
3615
+ # # ├╌╌╌╌╌┤
3616
+ # # │ 4 │
3617
+ # # └─────┘
3618
+ #
3619
+ # @example
3620
+ # df.select([Polars.col("foo") + 1, Polars.col("bar") + 1])
3621
+ # # =>
3622
+ # # shape: (3, 2)
3623
+ # # ┌─────┬─────┐
3624
+ # # │ foo ┆ bar │
3625
+ # # │ --- ┆ --- │
3626
+ # # │ i64 ┆ i64 │
3627
+ # # ╞═════╪═════╡
3628
+ # # │ 2 ┆ 7 │
3629
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
3630
+ # # │ 3 ┆ 8 │
3631
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
3632
+ # # │ 4 ┆ 9 │
3633
+ # # └─────┴─────┘
3634
+ #
3635
+ # @example
3636
+ # df.select(Polars.when(Polars.col("foo") > 2).then(10).otherwise(0))
3637
+ # # =>
3638
+ # # shape: (3, 1)
3639
+ # # ┌─────────┐
3640
+ # # │ literal │
3641
+ # # │ --- │
3642
+ # # │ i64 │
3643
+ # # ╞═════════╡
3644
+ # # │ 0 │
3645
+ # # ├╌╌╌╌╌╌╌╌╌┤
3646
+ # # │ 0 │
3647
+ # # ├╌╌╌╌╌╌╌╌╌┤
3648
+ # # │ 10 │
3649
+ # # └─────────┘
3650
# Select columns from this DataFrame.
#
# @param exprs [Object] column name(s) or expression(s) to select.
# @return [DataFrame]
def select(exprs)
  # Run through the lazy engine without optimizations and rewrap eagerly.
  selected = lazy.select(exprs).collect(no_optimization: true, string_cache: false)
  _from_rbdf(selected._df)
end
3658
+
3659
+ # Add or overwrite multiple columns in a DataFrame.
3660
+ #
3661
+ # @param exprs [Array]
3662
+ # Array of Expressions that evaluate to columns.
3663
+ #
3664
+ # @return [DataFrame]
3665
+ #
3666
+ # @example
3667
+ # df = Polars::DataFrame.new(
3668
+ # {
3669
+ # "a" => [1, 2, 3, 4],
3670
+ # "b" => [0.5, 4, 10, 13],
3671
+ # "c" => [true, true, false, true]
3672
+ # }
3673
+ # )
3674
+ # df.with_columns(
3675
+ # [
3676
+ # (Polars.col("a") ** 2).alias("a^2"),
3677
+ # (Polars.col("b") / 2).alias("b/2"),
3678
+ # (Polars.col("c").is_not).alias("not c")
3679
+ # ]
3680
+ # )
3681
+ # # =>
3682
+ # # shape: (4, 6)
3683
+ # # ┌─────┬──────┬───────┬──────┬──────┬───────┐
3684
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
3685
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3686
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │
3687
+ # # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
3688
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │
3689
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
3690
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │
3691
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
3692
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
3693
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
3694
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
3695
+ # # └─────┴──────┴───────┴──────┴──────┴───────┘
3696
# Add or overwrite multiple columns in a DataFrame.
#
# @param exprs [Array] expressions that evaluate to columns.
# @return [DataFrame]
def with_columns(exprs)
  # Wrap a single expression in an array; nil is passed through untouched.
  exprs = [exprs] unless exprs.nil? || exprs.is_a?(Array)
  lazy.with_columns(exprs).collect(no_optimization: true, string_cache: false)
end
3704
+
3705
# Get number of chunks used by the ChunkedArrays of this DataFrame.
#
# @param strategy ["first", "all"]
#   Return the number of chunks of the 'first' column,
#   or 'all' columns in this DataFrame.
#
# @return [Object] an Integer for "first", an Array of Integers for "all".
#
# @example
#   df = Polars::DataFrame.new(
#     {
#       "a" => [1, 2, 3, 4],
#       "b" => [0.5, 4, 10, 13],
#       "c" => [true, true, false, true]
#     }
#   )
#   df.n_chunks
#   # => 1
#   df.n_chunks(strategy: "all")
#   # => [1, 1, 1]
def n_chunks(strategy: "first")
  case strategy
  when "first"
    _df.n_chunks
  when "all"
    get_columns.map(&:n_chunks)
  else
    # Bug fix: the message previously contained the literal text
    # "'{strategy}'" and "{{'first', 'all'}}" (Python-style placeholders)
    # instead of Ruby interpolation, so the offending value was never shown.
    raise ArgumentError, "Strategy: '#{strategy}' not understood. Choose one of {'first', 'all'}"
  end
end
3734
+
3735
+ # Aggregate the columns of this DataFrame to their maximum value.
3736
+ #
3737
+ # @return [DataFrame]
3738
+ #
3739
+ # @example
3740
+ # df = Polars::DataFrame.new(
3741
+ # {
3742
+ # "foo" => [1, 2, 3],
3743
+ # "bar" => [6, 7, 8],
3744
+ # "ham" => ["a", "b", "c"]
3745
+ # }
3746
+ # )
3747
+ # df.max
3748
+ # # =>
3749
+ # # shape: (1, 3)
3750
+ # # ┌─────┬─────┬─────┐
3751
+ # # │ foo ┆ bar ┆ ham │
3752
+ # # │ --- ┆ --- ┆ --- │
3753
+ # # │ i64 ┆ i64 ┆ str │
3754
+ # # ╞═════╪═════╪═════╡
3755
+ # # │ 3 ┆ 8 ┆ c │
3756
+ # # └─────┴─────┴─────┘
3757
# Aggregate the columns of this DataFrame to their maximum value.
#
# @return [DataFrame] for axis 0, [Series] for axis 1.
def max(axis: 0)
  case axis
  when 0 then _from_rbdf(_df.max)
  when 1 then Utils.wrap_s(_df.hmax)
  else
    raise ArgumentError, "Axis should be 0 or 1."
  end
end
3766
+
3767
+ # Aggregate the columns of this DataFrame to their minimum value.
3768
+ #
3769
+ # @return [DataFrame]
3770
+ #
3771
+ # @example
3772
+ # df = Polars::DataFrame.new(
3773
+ # {
3774
+ # "foo" => [1, 2, 3],
3775
+ # "bar" => [6, 7, 8],
3776
+ # "ham" => ["a", "b", "c"]
3777
+ # }
3778
+ # )
3779
+ # df.min
3780
+ # # =>
3781
+ # # shape: (1, 3)
3782
+ # # ┌─────┬─────┬─────┐
3783
+ # # │ foo ┆ bar ┆ ham │
3784
+ # # │ --- ┆ --- ┆ --- │
3785
+ # # │ i64 ┆ i64 ┆ str │
3786
+ # # ╞═════╪═════╪═════╡
3787
+ # # │ 1 ┆ 6 ┆ a │
3788
+ # # └─────┴─────┴─────┘
3789
# Aggregate the columns of this DataFrame to their minimum value.
#
# @return [DataFrame] for axis 0, [Series] for axis 1.
def min(axis: 0)
  case axis
  when 0 then _from_rbdf(_df.min)
  when 1 then Utils.wrap_s(_df.hmin)
  else
    raise ArgumentError, "Axis should be 0 or 1."
  end
end
3798
+
3799
+ # Aggregate the columns of this DataFrame to their sum value.
3800
+ #
3801
+ # @param axis [Integer]
3802
+ # Either 0 or 1.
3803
+ # @param null_strategy ["ignore", "propagate"]
3804
+ # This argument is only used if axis == 1.
3805
+ #
3806
+ # @return [DataFrame]
3807
+ #
3808
+ # @example
3809
+ # df = Polars::DataFrame.new(
3810
+ # {
3811
+ # "foo" => [1, 2, 3],
3812
+ # "bar" => [6, 7, 8],
3813
+ # "ham" => ["a", "b", "c"],
3814
+ # }
3815
+ # )
3816
+ # df.sum
3817
+ # # =>
3818
+ # # shape: (1, 3)
3819
+ # # ┌─────┬─────┬──────┐
3820
+ # # │ foo ┆ bar ┆ ham │
3821
+ # # │ --- ┆ --- ┆ --- │
3822
+ # # │ i64 ┆ i64 ┆ str │
3823
+ # # ╞═════╪═════╪══════╡
3824
+ # # │ 6 ┆ 21 ┆ null │
3825
+ # # └─────┴─────┴──────┘
3826
+ #
3827
+ # @example
3828
+ # df.sum(axis: 1)
3829
+ # # =>
3830
+ # # shape: (3,)
3831
+ # # Series: 'foo' [str]
3832
+ # # [
3833
+ # # "16a"
3834
+ # # "27b"
3835
+ # # "38c"
3836
+ # # ]
3837
# Aggregate the columns of this DataFrame to their sum value.
#
# @param axis [Integer] 0 for per-column sums, 1 for a row-wise sum.
# @param null_strategy ["ignore", "propagate"] only used when axis == 1.
# @return [DataFrame] for axis 0, [Series] for axis 1.
def sum(axis: 0, null_strategy: "ignore")
  if axis == 0
    _from_rbdf(_df.sum)
  elsif axis == 1
    Utils.wrap_s(_df.hsum(null_strategy))
  else
    raise ArgumentError, "Axis should be 0 or 1."
  end
end
3847
+
3848
+ # Aggregate the columns of this DataFrame to their mean value.
3849
+ #
3850
+ # @param axis [Integer]
3851
+ # Either 0 or 1.
3852
+ # @param null_strategy ["ignore", "propagate"]
3853
+ # This argument is only used if axis == 1.
3854
+ #
3855
+ # @return [DataFrame]
3856
+ #
3857
+ # @example
3858
+ # df = Polars::DataFrame.new(
3859
+ # {
3860
+ # "foo" => [1, 2, 3],
3861
+ # "bar" => [6, 7, 8],
3862
+ # "ham" => ["a", "b", "c"]
3863
+ # }
3864
+ # )
3865
+ # df.mean
3866
+ # # =>
3867
+ # # shape: (1, 3)
3868
+ # # ┌─────┬─────┬──────┐
3869
+ # # │ foo ┆ bar ┆ ham │
3870
+ # # │ --- ┆ --- ┆ --- │
3871
+ # # │ f64 ┆ f64 ┆ str │
3872
+ # # ╞═════╪═════╪══════╡
3873
+ # # │ 2.0 ┆ 7.0 ┆ null │
3874
+ # # └─────┴─────┴──────┘
3875
# Aggregate the columns of this DataFrame to their mean value.
#
# @param axis [Integer] 0 for per-column means, 1 for a row-wise mean.
# @param null_strategy ["ignore", "propagate"] only used when axis == 1.
# @return [DataFrame] for axis 0, [Series] for axis 1.
def mean(axis: 0, null_strategy: "ignore")
  if axis == 0
    _from_rbdf(_df.mean)
  elsif axis == 1
    Utils.wrap_s(_df.hmean(null_strategy))
  else
    raise ArgumentError, "Axis should be 0 or 1."
  end
end
3885
+
3886
+ # Aggregate the columns of this DataFrame to their standard deviation value.
3887
+ #
3888
+ # @param ddof [Integer]
3889
+ # Degrees of freedom
3890
+ #
3891
+ # @return [DataFrame]
3892
+ #
3893
+ # @example
3894
+ # df = Polars::DataFrame.new(
3895
+ # {
3896
+ # "foo" => [1, 2, 3],
3897
+ # "bar" => [6, 7, 8],
3898
+ # "ham" => ["a", "b", "c"]
3899
+ # }
3900
+ # )
3901
+ # df.std
3902
+ # # =>
3903
+ # # shape: (1, 3)
3904
+ # # ┌─────┬─────┬──────┐
3905
+ # # │ foo ┆ bar ┆ ham │
3906
+ # # │ --- ┆ --- ┆ --- │
3907
+ # # │ f64 ┆ f64 ┆ str │
3908
+ # # ╞═════╪═════╪══════╡
3909
+ # # │ 1.0 ┆ 1.0 ┆ null │
3910
+ # # └─────┴─────┴──────┘
3911
+ #
3912
+ # @example
3913
+ # df.std(ddof: 0)
3914
+ # # =>
3915
+ # # shape: (1, 3)
3916
+ # # ┌──────────┬──────────┬──────┐
3917
+ # # │ foo ┆ bar ┆ ham │
3918
+ # # │ --- ┆ --- ┆ --- │
3919
+ # # │ f64 ┆ f64 ┆ str │
3920
+ # # ╞══════════╪══════════╪══════╡
3921
+ # # │ 0.816497 ┆ 0.816497 ┆ null │
3922
+ # # └──────────┴──────────┴──────┘
3923
# Aggregate the columns of this DataFrame to their standard deviation value.
#
# @param ddof [Integer] degrees of freedom.
def std(ddof: 1)
  result = _df.std(ddof)
  _from_rbdf(result)
end
3926
+
3927
+ # Aggregate the columns of this DataFrame to their variance value.
3928
+ #
3929
+ # @param ddof [Integer]
3930
+ # Degrees of freedom
3931
+ #
3932
+ # @return [DataFrame]
3933
+ #
3934
+ # @example
3935
+ # df = Polars::DataFrame.new(
3936
+ # {
3937
+ # "foo" => [1, 2, 3],
3938
+ # "bar" => [6, 7, 8],
3939
+ # "ham" => ["a", "b", "c"]
3940
+ # }
3941
+ # )
3942
+ # df.var
3943
+ # # =>
3944
+ # # shape: (1, 3)
3945
+ # # ┌─────┬─────┬──────┐
3946
+ # # │ foo ┆ bar ┆ ham │
3947
+ # # │ --- ┆ --- ┆ --- │
3948
+ # # │ f64 ┆ f64 ┆ str │
3949
+ # # ╞═════╪═════╪══════╡
3950
+ # # │ 1.0 ┆ 1.0 ┆ null │
3951
+ # # └─────┴─────┴──────┘
3952
+ #
3953
+ # @example
3954
+ # df.var(ddof: 0)
3955
+ # # =>
3956
+ # # shape: (1, 3)
3957
+ # # ┌──────────┬──────────┬──────┐
3958
+ # # │ foo ┆ bar ┆ ham │
3959
+ # # │ --- ┆ --- ┆ --- │
3960
+ # # │ f64 ┆ f64 ┆ str │
3961
+ # # ╞══════════╪══════════╪══════╡
3962
+ # # │ 0.666667 ┆ 0.666667 ┆ null │
3963
+ # # └──────────┴──────────┴──────┘
3964
# Aggregate the columns of this DataFrame to their variance value.
#
# @param ddof [Integer] degrees of freedom.
def var(ddof: 1)
  result = _df.var(ddof)
  _from_rbdf(result)
end
3967
+
3968
+ # Aggregate the columns of this DataFrame to their median value.
3969
+ #
3970
+ # @return [DataFrame]
3971
+ #
3972
+ # @example
3973
+ # df = Polars::DataFrame.new(
3974
+ # {
3975
+ # "foo" => [1, 2, 3],
3976
+ # "bar" => [6, 7, 8],
3977
+ # "ham" => ["a", "b", "c"]
3978
+ # }
3979
+ # )
3980
+ # df.median
3981
+ # # =>
3982
+ # # shape: (1, 3)
3983
+ # # ┌─────┬─────┬──────┐
3984
+ # # │ foo ┆ bar ┆ ham │
3985
+ # # │ --- ┆ --- ┆ --- │
3986
+ # # │ f64 ┆ f64 ┆ str │
3987
+ # # ╞═════╪═════╪══════╡
3988
+ # # │ 2.0 ┆ 7.0 ┆ null │
3989
+ # # └─────┴─────┴──────┘
3990
# Aggregate the columns of this DataFrame to their median value.
def median
  result = _df.median
  _from_rbdf(result)
end
3993
+
3994
+ # Aggregate the columns of this DataFrame to their product values.
3995
+ #
3996
+ # @return [DataFrame]
3997
+ #
3998
+ # @example
3999
+ # df = Polars::DataFrame.new(
4000
+ # {
4001
+ # "a" => [1, 2, 3],
4002
+ # "b" => [0.5, 4, 10],
4003
+ # "c" => [true, true, false]
4004
+ # }
4005
+ # )
4006
+ # df.product
4007
+ # # =>
4008
+ # # shape: (1, 3)
4009
+ # # ┌─────┬──────┬─────┐
4010
+ # # │ a ┆ b ┆ c │
4011
+ # # │ --- ┆ --- ┆ --- │
4012
+ # # │ i64 ┆ f64 ┆ i64 │
4013
+ # # ╞═════╪══════╪═════╡
4014
+ # # │ 6 ┆ 20.0 ┆ 0 │
4015
+ # # └─────┴──────┴─────┘
4016
# Aggregate the columns of this DataFrame to their product values.
def product
  product_of_all = Polars.all.product
  select(product_of_all)
end
4019
+
4020
+ # Aggregate the columns of this DataFrame to their quantile value.
4021
+ #
4022
+ # @param quantile [Float]
4023
+ # Quantile between 0.0 and 1.0.
4024
+ # @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
4025
+ # Interpolation method.
4026
+ #
4027
+ # @return [DataFrame]
4028
+ #
4029
+ # @example
4030
+ # df = Polars::DataFrame.new(
4031
+ # {
4032
+ # "foo" => [1, 2, 3],
4033
+ # "bar" => [6, 7, 8],
4034
+ # "ham" => ["a", "b", "c"]
4035
+ # }
4036
+ # )
4037
+ # df.quantile(0.5, interpolation: "nearest")
4038
+ # # =>
4039
+ # # shape: (1, 3)
4040
+ # # ┌─────┬─────┬──────┐
4041
+ # # │ foo ┆ bar ┆ ham │
4042
+ # # │ --- ┆ --- ┆ --- │
4043
+ # # │ f64 ┆ f64 ┆ str │
4044
+ # # ╞═════╪═════╪══════╡
4045
+ # # │ 2.0 ┆ 7.0 ┆ null │
4046
+ # # └─────┴─────┴──────┘
4047
# Aggregate the columns of this DataFrame to their quantile value.
#
# @param quantile [Float] quantile between 0.0 and 1.0.
# @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
def quantile(quantile, interpolation: "nearest")
  result = _df.quantile(quantile, interpolation)
  _from_rbdf(result)
end
4050
+
4051
+ # Get one hot encoded dummy variables.
4052
+ #
4053
+ # @param columns
4054
+ # A subset of columns to convert to dummy variables. `nil` means
4055
+ # "all columns".
4056
+ #
4057
+ # @return [DataFrame]
4058
+ #
4059
+ # @example
4060
+ # df = Polars::DataFrame.new(
4061
+ # {
4062
+ # "foo" => [1, 2],
4063
+ # "bar" => [3, 4],
4064
+ # "ham" => ["a", "b"]
4065
+ # }
4066
+ # )
4067
+ # df.to_dummies
4068
+ # # =>
4069
+ # # shape: (2, 6)
4070
+ # # ┌───────┬───────┬───────┬───────┬───────┬───────┐
4071
+ # # │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │
4072
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
4073
+ # # │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │
4074
+ # # ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡
4075
+ # # │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │
4076
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
4077
+ # # │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │
4078
+ # # └───────┴───────┴───────┴───────┴───────┴───────┘
4079
# Get one hot encoded dummy variables.
#
# @param columns [Object] subset of columns to encode; nil means all columns.
def to_dummies(columns: nil)
  # A single column name is wrapped in an array.
  cols = columns.is_a?(String) ? [columns] : columns
  _from_rbdf(_df.to_dummies(cols))
end
4085
+
4086
+ # Drop duplicate rows from this DataFrame.
4087
+ #
4088
+ # @param maintain_order [Boolean]
4089
+ # Keep the same order as the original DataFrame. This requires more work to
4090
+ # compute.
4091
+ # @param subset [Object]
4092
+ # Subset to use to compare rows.
4093
+ # @param keep ["first", "last"]
4094
+ # Which of the duplicate rows to keep (in conjunction with `subset`).
4095
+ #
4096
+ # @return [DataFrame]
4097
+ #
4098
+ # @note
4099
+ # Note that this fails if there is a column of type `List` in the DataFrame or
4100
+ # subset.
4101
+ #
4102
+ # @example
4103
+ # df = Polars::DataFrame.new(
4104
+ # {
4105
+ # "a" => [1, 1, 2, 3, 4, 5],
4106
+ # "b" => [0.5, 0.5, 1.0, 2.0, 3.0, 3.0],
4107
+ # "c" => [true, true, true, false, true, true]
4108
+ # }
4109
+ # )
4110
+ # df.unique
4111
+ # # =>
4112
+ # # shape: (5, 3)
4113
+ # # ┌─────┬─────┬───────┐
4114
+ # # │ a ┆ b ┆ c │
4115
+ # # │ --- ┆ --- ┆ --- │
4116
+ # # │ i64 ┆ f64 ┆ bool │
4117
+ # # ╞═════╪═════╪═══════╡
4118
+ # # │ 1 ┆ 0.5 ┆ true │
4119
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
4120
+ # # │ 2 ┆ 1.0 ┆ true │
4121
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
4122
+ # # │ 3 ┆ 2.0 ┆ false │
4123
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
4124
+ # # │ 4 ┆ 3.0 ┆ true │
4125
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
4126
+ # # │ 5 ┆ 3.0 ┆ true │
4127
+ # # └─────┴─────┴───────┘
4128
# Drop duplicate rows from this DataFrame.
#
# @param maintain_order [Boolean] keep the original row order (slower).
# @param subset [Object] columns to compare rows on; nil means all columns.
# @param keep ["first", "last"] which duplicate to keep.
def unique(maintain_order: true, subset: nil, keep: "first")
  unless subset.nil?
    # Normalize the subset spec to an array of column names.
    subset = [subset] if subset.is_a?(String)
    subset = subset.to_a unless subset.is_a?(Array)
  end

  _from_rbdf(_df.unique(maintain_order, subset, keep))
end
4139
+
4140
# Return the number of unique rows, or the number of unique row-subsets.
#
# @param subset [Object]
#   One or more columns/expressions that define what to count;
#   omit to return the count of unique rows.
#
# @return [Integer]
#
# @example
#   df = Polars::DataFrame.new(
#     {
#       "a" => [1, 1, 2, 3, 4, 5],
#       "b" => [0.5, 0.5, 1.0, 2.0, 3.0, 3.0],
#       "c" => [true, true, true, false, true, true]
#     }
#   )
#   df.n_unique
#   # => 5
#
# @example Simple columns subset
#   df.n_unique(subset: ["b", "c"])
#   # => 4
#
# @example Expression subset
#   df.n_unique(
#     subset: [
#       (Polars.col("a").floordiv(2)),
#       (Polars.col("c") | (Polars.col("b") >= 2))
#     ]
#   )
#   # => 3
def n_unique(subset: nil)
  # Bug fix: a plain column-name String was previously tested against
  # StringIO and therefore fell through unwrapped, breaking the
  # `n_unique(subset: "a")` form.
  if subset.is_a?(String)
    subset = [Polars.col(subset)]
  elsif subset.is_a?(Expr)
    subset = [subset]
  end

  if subset.is_a?(Array) && subset.length == 1
    expr = Utils.expr_to_lit_or_expr(subset[0], str_to_lit: false)
  else
    # Count distinct combinations by packing the columns into a struct.
    struct_fields = subset.nil? ? Polars.all : subset
    expr = Polars.struct(struct_fields)
  end

  df = lazy.select(expr.n_unique).collect
  df.is_empty ? 0 : df.row(0)[0]
end
4188
+
4189
+ # Rechunk the data in this DataFrame to a contiguous allocation.
4190
+ #
4191
+ # This will make sure all subsequent operations have optimal and predictable
4192
+ # performance.
4193
+ #
4194
+ # @return [DataFrame]
4195
# Rechunk the data in this DataFrame to a contiguous allocation.
def rechunk
  contiguous = _df.rechunk
  _from_rbdf(contiguous)
end
4198
+
4199
+ # Create a new DataFrame that shows the null counts per column.
4200
+ #
4201
+ # @return [DataFrame]
4202
+ #
4203
+ # @example
4204
+ # df = Polars::DataFrame.new(
4205
+ # {
4206
+ # "foo" => [1, nil, 3],
4207
+ # "bar" => [6, 7, nil],
4208
+ # "ham" => ["a", "b", "c"]
4209
+ # }
4210
+ # )
4211
+ # df.null_count
4212
+ # # =>
4213
+ # # shape: (1, 3)
4214
+ # # ┌─────┬─────┬─────┐
4215
+ # # │ foo ┆ bar ┆ ham │
4216
+ # # │ --- ┆ --- ┆ --- │
4217
+ # # │ u32 ┆ u32 ┆ u32 │
4218
+ # # ╞═════╪═════╪═════╡
4219
+ # # │ 1 ┆ 1 ┆ 0 │
4220
+ # # └─────┴─────┴─────┘
4221
# Create a new DataFrame that shows the null counts per column.
def null_count
  counts = _df.null_count
  _from_rbdf(counts)
end
4224
+
4225
# Sample from this DataFrame.
#
# @param n [Integer]
#   Number of items to return. Cannot be used with `frac`. Defaults to 1 if
#   `frac` is nil.
# @param frac [Float]
#   Fraction of items to return. Cannot be used with `n`.
# @param with_replacement [Boolean]
#   Allow values to be sampled more than once.
# @param shuffle [Boolean]
#   Shuffle the order of sampled data points.
# @param seed [Integer]
#   Seed for the random number generator. If set to nil (default), a random
#   seed is used.
#
# @return [DataFrame]
#
# @example
#   df = Polars::DataFrame.new(
#     {
#       "foo" => [1, 2, 3],
#       "bar" => [6, 7, 8],
#       "ham" => ["a", "b", "c"]
#     }
#   )
#   df.sample(n: 2, seed: 0)
#   # =>
#   # shape: (2, 3)
#   # ┌─────┬─────┬─────┐
#   # │ foo ┆ bar ┆ ham │
#   # │ --- ┆ --- ┆ --- │
#   # │ i64 ┆ i64 ┆ str │
#   # ╞═════╪═════╪═════╡
#   # │ 3   ┆ 8   ┆ c   │
#   # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
#   # │ 2   ┆ 7   ┆ b   │
#   # └─────┴─────┴─────┘
def sample(
  n: nil,
  frac: nil,
  with_replacement: false,
  shuffle: false,
  seed: nil
)
  if !n.nil? && !frac.nil?
    raise ArgumentError, "cannot specify both `n` and `frac`"
  end

  # Bug fix: the fraction-based sample was previously computed but not
  # returned, so execution fell through and `frac` was silently ignored
  # (a single row was sampled via sample_n instead).
  if n.nil? && !frac.nil?
    return _from_rbdf(
      _df.sample_frac(frac, with_replacement, shuffle, seed)
    )
  end

  # Default to sampling a single row when neither `n` nor `frac` is given.
  n = 1 if n.nil?
  _from_rbdf(_df.sample_n(n, with_replacement, shuffle, seed))
end
4284
+
4285
+ # Apply a horizontal reduction on a DataFrame.
4286
+ #
4287
+ # This can be used to effectively determine aggregations on a row level, and can
4288
+ # be applied to any DataType that can be supercasted (casted to a similar parent
4289
+ # type).
4290
+ #
4291
+ # An example of the supercast rules when applying an arithmetic operation on two
4292
+ # DataTypes are for instance:
4293
+ #
4294
+ # i8 + str = str
4295
+ # f32 + i64 = f32
4296
+ # f32 + f64 = f64
4297
+ #
4298
+ # @return [Series]
4299
+ #
4300
+ # @example A horizontal sum operation:
4301
+ # df = Polars::DataFrame.new(
4302
+ # {
4303
+ # "a" => [2, 1, 3],
4304
+ # "b" => [1, 2, 3],
4305
+ # "c" => [1.0, 2.0, 3.0]
4306
+ # }
4307
+ # )
4308
+ # df.fold { |s1, s2| s1 + s2 }
4309
+ # # =>
4310
+ # # shape: (3,)
4311
+ # # Series: 'a' [f64]
4312
+ # # [
4313
+ # # 4.0
4314
+ # # 5.0
4315
+ # # 9.0
4316
+ # # ]
4317
+ #
4318
+ # @example A horizontal minimum operation:
4319
+ # df = Polars::DataFrame.new({"a" => [2, 1, 3], "b" => [1, 2, 3], "c" => [1.0, 2.0, 3.0]})
4320
+ # df.fold { |s1, s2| s1.zip_with(s1 < s2, s2) }
4321
+ # # =>
4322
+ # # shape: (3,)
4323
+ # # Series: 'a' [f64]
4324
+ # # [
4325
+ # # 1.0
4326
+ # # 1.0
4327
+ # # 3.0
4328
+ # # ]
4329
+ #
4330
+ # @example A horizontal string concatenation:
4331
+ # df = Polars::DataFrame.new(
4332
+ # {
4333
+ # "a" => ["foo", "bar", 2],
4334
+ # "b" => [1, 2, 3],
4335
+ # "c" => [1.0, 2.0, 3.0]
4336
+ # }
4337
+ # )
4338
+ # df.fold { |s1, s2| s1 + s2 }
4339
+ # # =>
4340
+ # # shape: (3,)
4341
+ # # Series: 'a' [str]
4342
+ # # [
4343
+ # # "foo11.0"
4344
+ # # "bar22.0"
4345
+ # # null
4346
+ # # ]
4347
+ #
4348
+ # @example A horizontal boolean or, similar to a row-wise .any():
4349
+ # df = Polars::DataFrame.new(
4350
+ # {
4351
+ # "a" => [false, false, true],
4352
+ # "b" => [false, true, false]
4353
+ # }
4354
+ # )
4355
+ # df.fold { |s1, s2| s1 | s2 }
4356
+ # # =>
4357
+ # # shape: (3,)
4358
+ # # Series: 'a' [bool]
4359
+ # # [
4360
+ # # false
4361
+ # # true
4362
+ # # true
4363
+ # # ]
4364
def fold(&operation)
  # Left-fold across columns: seed the accumulator with the first column's
  # Series, then combine it with each subsequent column in order. With a
  # single column the block is never called and that column is returned.
  (1...width).reduce(to_series(0)) do |acc, i|
    operation.call(acc, to_series(i))
  end
end
4372
+
4373
+ # Get a row as tuple, either by index or by predicate.
4374
+ #
4375
+ # @param index [Object]
4376
+ # Row index.
4377
+ # @param by_predicate [Object]
4378
+ # Select the row according to a given expression/predicate.
4379
+ #
4380
+ # @return [Object]
4381
+ #
4382
+ # @note
4383
+ # The `index` and `by_predicate` params are mutually exclusive. Additionally,
4384
+ # to ensure clarity, the `by_predicate` parameter must be supplied by keyword.
4385
+ #
4386
+ # When using `by_predicate` it is an error condition if anything other than
4387
+ # one row is returned; more than one row raises `TooManyRowsReturned`, and
4388
+ # zero rows will raise `NoRowsReturned` (both inherit from `RowsException`).
4389
+ #
4390
+ # @example Return the row at the given index
4391
+ # df = Polars::DataFrame.new(
4392
+ # {
4393
+ # "foo" => [1, 2, 3],
4394
+ # "bar" => [6, 7, 8],
4395
+ # "ham" => ["a", "b", "c"]
4396
+ # }
4397
+ # )
4398
+ # df.row(2)
4399
+ # # => [3, 8, "c"]
4400
+ #
4401
+ # @example Return the row that matches the given predicate
4402
+ # df.row(by_predicate: Polars.col("ham") == "b")
4403
+ # # => [2, 7, "b"]
4404
def row(index = nil, by_predicate: nil)
  if !index.nil? && !by_predicate.nil?
    raise ArgumentError, "Cannot set both 'index' and 'by_predicate'; mutually exclusive"
  elsif index.is_a?(Expr)
    raise TypeError, "Expressions should be passed to the 'by_predicate' param"
  elsif index.is_a?(Integer)
    _df.row_tuple(index)
  elsif by_predicate.is_a?(Expr)
    rows = filter(by_predicate).rows
    n_rows = rows.length
    if n_rows > 1
      raise TooManyRowsReturned, "Predicate #{by_predicate} returned #{n_rows} rows"
    elsif n_rows == 0
      # Fixed: the message previously contained the un-ported Python
      # f-string placeholder "<{by_predicate!s}>" instead of interpolating.
      raise NoRowsReturned, "Predicate <#{by_predicate}> returned no rows"
    end
    rows[0]
  else
    raise ArgumentError, "One of 'index' or 'by_predicate' must be set"
  end
end
4424
+
4425
+ # Convert columnar data to rows as Ruby arrays.
4426
+ #
4427
+ # @return [Array]
4428
+ #
4429
+ # @example
4430
+ # df = Polars::DataFrame.new(
4431
+ # {
4432
+ # "a" => [1, 3, 5],
4433
+ # "b" => [2, 4, 6]
4434
+ # }
4435
+ # )
4436
+ # df.rows
4437
+ # # => [[1, 2], [3, 4], [5, 6]]
4438
def rows
  # Materialize every row as a Ruby Array via the native frame.
  _df.row_tuples
end
4441
+
4442
+ # Shrink DataFrame memory usage.
4443
+ #
4444
+ # Shrinks to fit the exact capacity needed to hold the data.
4445
+ #
4446
+ # @return [DataFrame]
4447
def shrink_to_fit(in_place: false)
  # Shrink either the receiver itself or a fresh clone, depending on
  # `in_place`; the shrunken frame is always returned.
  target = in_place ? self : clone
  target._df.shrink_to_fit
  target
end
4457
+
4458
+ # Take every nth row in the DataFrame and return as a new DataFrame.
4459
+ #
4460
+ # @return [DataFrame]
4461
+ #
4462
+ # @example
4463
+ # s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]})
4464
+ # s.take_every(2)
4465
+ # # =>
4466
+ # # shape: (2, 2)
4467
+ # # ┌─────┬─────┐
4468
+ # # │ a ┆ b │
4469
+ # # │ --- ┆ --- │
4470
+ # # │ i64 ┆ i64 │
4471
+ # # ╞═════╪═════╡
4472
+ # # │ 1 ┆ 5 │
4473
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
4474
+ # # │ 3 ┆ 7 │
4475
+ # # └─────┴─────┘
4476
def take_every(n)
  # Implemented as a selection applying `take_every` to every column.
  select(Utils.col("*").take_every(n))
end
4479
+
4480
+ # Hash and combine the rows in this DataFrame.
4481
+ #
4482
+ # The hash value is of type `:u64`.
4483
+ #
4484
+ # @param seed [Integer]
4485
+ # Random seed parameter. Defaults to 0.
4486
+ # @param seed_1 [Integer]
4487
+ # Random seed parameter. Defaults to `seed` if not set.
4488
+ # @param seed_2 [Integer]
4489
+ # Random seed parameter. Defaults to `seed` if not set.
4490
+ # @param seed_3 [Integer]
4491
+ # Random seed parameter. Defaults to `seed` if not set.
4492
+ #
4493
+ # @return [Series]
4494
+ #
4495
+ # @example
4496
+ # df = Polars::DataFrame.new(
4497
+ # {
4498
+ # "foo" => [1, nil, 3, 4],
4499
+ # "ham" => ["a", "b", nil, "d"]
4500
+ # }
4501
+ # )
4502
+ # df.hash_rows(seed: 42)
4503
+ # # =>
4504
+ # # shape: (4,)
4505
+ # # Series: '' [u64]
4506
+ # # [
4507
+ # # 4238614331852490969
4508
+ # # 17976148875586754089
4509
+ # # 4702262519505526977
4510
+ # # 18144177983981041107
4511
+ # # ]
4512
def hash_rows(seed: 0, seed_1: nil, seed_2: nil, seed_3: nil)
  # Each of the three auxiliary keys falls back to `seed` when not given.
  k1, k2, k3 = [seed_1, seed_2, seed_3].map { |s| s.nil? ? seed : s }
  Utils.wrap_s(_df.hash_rows(seed, k1, k2, k3))
end
4519
+
4520
+ # Interpolate intermediate values. The interpolation method is linear.
4521
+ #
4522
+ # @return [DataFrame]
4523
+ #
4524
+ # @example
4525
+ # df = Polars::DataFrame.new(
4526
+ # {
4527
+ # "foo" => [1, nil, 9, 10],
4528
+ # "bar" => [6, 7, 9, nil],
4529
+ # "baz" => [1, nil, nil, 9]
4530
+ # }
4531
+ # )
4532
+ # df.interpolate
4533
+ # # =>
4534
+ # # shape: (4, 3)
4535
+ # # ┌─────┬──────┬─────┐
4536
+ # # │ foo ┆ bar ┆ baz │
4537
+ # # │ --- ┆ --- ┆ --- │
4538
+ # # │ i64 ┆ i64 ┆ i64 │
4539
+ # # ╞═════╪══════╪═════╡
4540
+ # # │ 1 ┆ 6 ┆ 1 │
4541
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
4542
+ # # │ 5 ┆ 7 ┆ 3 │
4543
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
4544
+ # # │ 9 ┆ 9 ┆ 6 │
4545
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
4546
+ # # │ 10 ┆ null ┆ 9 │
4547
+ # # └─────┴──────┴─────┘
4548
def interpolate
  # Linear interpolation applied column-wise via an all-columns expression.
  select(Utils.col("*").interpolate)
end
4551
+
4552
+ # Check if the dataframe is empty.
4553
+ #
4554
+ # @return [Boolean]
4555
+ #
4556
+ # @example
4557
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
4558
+ # df.is_empty
4559
+ # # => false
4560
+ # df.filter(Polars.col("foo") > 99).is_empty
4561
+ # # => true
4562
def is_empty
  # Emptiness is decided solely by row count; column count is irrelevant.
  height.zero?
end
alias_method :empty?, :is_empty
4566
+
4567
+ # Convert a `DataFrame` to a `Series` of type `Struct`.
4568
+ #
4569
+ # @param name [String]
4570
+ # Name for the struct Series
4571
+ #
4572
+ # @return [Series]
4573
+ #
4574
+ # @example
4575
+ # df = Polars::DataFrame.new(
4576
+ # {
4577
+ # "a" => [1, 2, 3, 4, 5],
4578
+ # "b" => ["one", "two", "three", "four", "five"]
4579
+ # }
4580
+ # )
4581
+ # df.to_struct("nums")
4582
+ # # =>
4583
+ # # shape: (5,)
4584
+ # # Series: 'nums' [struct[2]]
4585
+ # # [
4586
+ # # {1,"one"}
4587
+ # # {2,"two"}
4588
+ # # {3,"three"}
4589
+ # # {4,"four"}
4590
+ # # {5,"five"}
4591
+ # # ]
4592
def to_struct(name)
  # Pack all columns into a single struct-typed Series named `name`.
  Utils.wrap_s(_df.to_struct(name))
end
4595
+
4596
+ # Decompose a struct into its fields.
4597
+ #
4598
+ # The fields will be inserted into the `DataFrame` on the location of the
4599
+ # `struct` type.
4600
+ #
4601
+ # @param names [Object]
4602
+ # Names of the struct columns that will be decomposed by its fields
4603
+ #
4604
+ # @return [DataFrame]
4605
+ #
4606
+ # @example
4607
+ # df = Polars::DataFrame.new(
4608
+ # {
4609
+ # "before" => ["foo", "bar"],
4610
+ # "t_a" => [1, 2],
4611
+ # "t_b" => ["a", "b"],
4612
+ # "t_c" => [true, nil],
4613
+ # "t_d" => [[1, 2], [3]],
4614
+ # "after" => ["baz", "womp"]
4615
+ # }
4616
+ # ).select(["before", Polars.struct(Polars.col("^t_.$")).alias("t_struct"), "after"])
4617
+ # df.unnest("t_struct")
4618
+ # # =>
4619
+ # # shape: (2, 6)
4620
+ # # ┌────────┬─────┬─────┬──────┬───────────┬───────┐
4621
+ # # │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │
4622
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
4623
+ # # │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │
4624
+ # # ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
4625
+ # # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │
4626
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
4627
+ # # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
4628
+ # # └────────┴─────┴─────┴──────┴───────────┴───────┘
4629
def unnest(names)
  # Accept either a single column name or a list of names.
  names = [names] if names.is_a?(String)
  _from_rbdf(_df.unnest(names))
end
4635
+
4636
+ private
4637
+
4638
def initialize_copy(other)
  super
  # Deep-copy the native frame handle so the clone does not share
  # mutable state with the source DataFrame.
  self._df = _df._clone
end
4642
+
4643
def _pos_idx(idx, dim)
  # Normalize a possibly-negative index to a non-negative position by
  # offsetting against the size of dimension `dim` (an index into `shape`).
  idx >= 0 ? idx : shape[dim] + idx
end
4650
+
4651
+ # def _pos_idxs
4652
+ # end
4653
+
4654
+ # @private
4655
def self.hash_to_rbdf(data, columns: nil)
  # Fast path: without an explicit column spec, let the native reader
  # infer everything from the hash.
  return RbDataFrame.read_hash(data) if columns.nil?

  columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)

  data_series =
    if data.empty? && dtypes
      # No data at all: build empty, correctly-typed series per column.
      columns.map { |name| Series.new(name, [], dtype: dtypes[name])._s }
    else
      data.map { |name, values| Series.new(name, values, dtype: dtypes[name])._s }
    end

  RbDataFrame.new(_handle_columns_arg(data_series, columns: columns))
end
4670
+
4671
+ # @private
4672
def self._unpack_columns(columns, lookup_names: nil, n_expected: nil)
  # Normalize a columns spec — an Array of names, an Array of
  # [name, dtype] pairs, or a Hash of name => dtype — into the pair
  # [column_names, dtypes_by_name].
  columns = columns.to_a if columns.is_a?(Hash)
  cols = columns || []

  column_names = cols.each_with_index.map do |col, i|
    col.is_a?(String) ? (col || "column_#{i}") : col[0]
  end
  if column_names.empty? && n_expected
    column_names = Array.new(n_expected) { |i| "column_#{i}" }
  end
  # TODO zip_longest
  lookup = column_names.zip(lookup_names || []).to_h

  dtypes = cols.reject { |col| col.is_a?(String) || !col[1] }.to_h do |col|
    [lookup[col[0]] || col[0], col[1]]
  end

  [column_names, dtypes]
end
4697
+
4698
def self._handle_columns_arg(data, columns: nil)
  # Reconcile a list of native series with an optional list of column
  # names: no names → pass through; empty data → build empty named
  # series; matching lengths → rename in order; otherwise error.
  return data if columns.nil?

  if data.empty?
    columns.map { |c| Series.new(c, nil)._s }
  elsif data.length == columns.length
    # not in-place?
    columns.each_with_index { |c, i| data[i].rename(c) }
    data
  else
    raise ArgumentError, "Dimensions of columns arg must match data dimensions."
  end
end
4715
+
4716
+ # @private
4717
def self.sequence_to_rbdf(data, columns: nil, orient: nil)
  # Empty sequence: defer to the hash-based constructor so the columns
  # spec is still honored.
  return hash_to_rbdf({}, columns: columns) if data.empty?

  data_series = nil
  if data[0].is_a?(Series)
    # A sequence of Series: collect their native handles directly.
    data_series = data.map { |s| s._s }
  elsif data[0].is_a?(Array)
    # Infer orientation from the shape when not given explicitly.
    if orient.nil? && !columns.nil?
      orient = columns.length == data.length ? "col" : "row"
    end

    case orient
    when "row"
      raise Todo
    when "col", nil
      raise Todo
    else
      raise ArgumentError, "orient must be one of {{'col', 'row', nil}}, got #{orient} instead."
    end
  end

  data_series = _handle_columns_arg(data_series, columns: columns)
  RbDataFrame.new(data_series)
end
4746
+
4747
+ # @private
4748
def self.series_to_rbdf(data, columns: nil)
  # Renaming via `columns` is not yet implemented for a single Series.
  raise Todo if columns
  RbDataFrame.new([data._s])
end
4754
+
4755
def wrap_ldf(ldf)
  # Wrap a native lazy-frame handle in a Polars::LazyFrame.
  LazyFrame._from_rbldf(ldf)
end
4758
+
4759
def _from_rbdf(rb_df)
  # Instance-level convenience wrapper around the class-level constructor.
  self.class._from_rbdf(rb_df)
end
4762
+
4763
def _comp(other, op)
  # Dispatch on the comparand: frame-vs-frame or frame-vs-scalar.
  other.is_a?(DataFrame) ? _compare_to_other_df(other, op) : _compare_to_non_df(other, op)
end
4770
+
4771
def _compare_to_other_df(other, op)
  # Element-wise comparison of two frames with identical schema and shape.
  # Fixed: `ArgumentError` was previously misspelled `ArgmentError`, which
  # would have raised NameError instead of the intended error.
  if columns != other.columns
    raise ArgumentError, "DataFrame columns do not match"
  end
  if shape != other.shape
    raise ArgumentError, "DataFrame dimensions do not match"
  end

  # Concatenate both frames horizontally (other's columns suffixed to avoid
  # name collisions), then compare column-by-column with the requested op.
  suffix = "__POLARS_CMP_OTHER"
  other_renamed = other.select(Polars.all.suffix(suffix))
  combined = Polars.concat([self, other_renamed], how: "horizontal")

  expr = case op
  when "eq"
    columns.map { |n| Polars.col(n) == Polars.col("#{n}#{suffix}") }
  when "neq"
    columns.map { |n| Polars.col(n) != Polars.col("#{n}#{suffix}") }
  when "gt"
    columns.map { |n| Polars.col(n) > Polars.col("#{n}#{suffix}") }
  when "lt"
    columns.map { |n| Polars.col(n) < Polars.col("#{n}#{suffix}") }
  when "gt_eq"
    columns.map { |n| Polars.col(n) >= Polars.col("#{n}#{suffix}") }
  when "lt_eq"
    columns.map { |n| Polars.col(n) <= Polars.col("#{n}#{suffix}") }
  else
    raise ArgumentError, "got unexpected comparison operator: #{op}"
  end

  combined.select(expr)
end
4802
+
4803
def _compare_to_non_df(other, op)
  # Broadcast comparison of every column against a scalar-like value.
  # Lambdas keep expression construction lazy: only the requested
  # comparison is ever built.
  comparisons = {
    "eq" => -> { Polars.all == other },
    "neq" => -> { Polars.all != other },
    "gt" => -> { Polars.all > other },
    "lt" => -> { Polars.all < other },
    "gt_eq" => -> { Polars.all >= other },
    "lt_eq" => -> { Polars.all <= other }
  }
  builder = comparisons[op]
  if builder.nil?
    raise ArgumentError, "got unexpected comparison operator: #{op}"
  end
  select(builder.call)
end
4821
+
4822
def _prepare_other_arg(other)
  # Series pass through untouched; Arrays are rejected outright; any
  # other value is boxed into an unnamed one-element Series.
  return other if other.is_a?(Series)
  raise ArgumentError, "Operation not supported." if other.is_a?(Array)

  Series.new("", [other])
end
4832
+ end
4833
+ end