polars-df 0.1.1 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +8 -0
  4. data/Cargo.lock +2 -1
  5. data/README.md +1 -1
  6. data/ext/polars/Cargo.toml +7 -1
  7. data/ext/polars/src/batched_csv.rs +120 -0
  8. data/ext/polars/src/conversion.rs +139 -6
  9. data/ext/polars/src/dataframe.rs +360 -15
  10. data/ext/polars/src/error.rs +9 -0
  11. data/ext/polars/src/file.rs +8 -7
  12. data/ext/polars/src/lazy/apply.rs +7 -0
  13. data/ext/polars/src/lazy/dataframe.rs +135 -3
  14. data/ext/polars/src/lazy/dsl.rs +97 -2
  15. data/ext/polars/src/lazy/meta.rs +1 -1
  16. data/ext/polars/src/lazy/mod.rs +1 -0
  17. data/ext/polars/src/lib.rs +227 -12
  18. data/ext/polars/src/series.rs +190 -38
  19. data/ext/polars/src/set.rs +91 -0
  20. data/ext/polars/src/utils.rs +19 -0
  21. data/lib/polars/batched_csv_reader.rb +96 -0
  22. data/lib/polars/cat_expr.rb +39 -0
  23. data/lib/polars/data_frame.rb +2813 -100
  24. data/lib/polars/date_time_expr.rb +1282 -7
  25. data/lib/polars/exceptions.rb +20 -0
  26. data/lib/polars/expr.rb +631 -11
  27. data/lib/polars/expr_dispatch.rb +14 -0
  28. data/lib/polars/functions.rb +219 -0
  29. data/lib/polars/group_by.rb +517 -0
  30. data/lib/polars/io.rb +763 -4
  31. data/lib/polars/lazy_frame.rb +1415 -67
  32. data/lib/polars/lazy_functions.rb +430 -9
  33. data/lib/polars/lazy_group_by.rb +79 -0
  34. data/lib/polars/list_expr.rb +5 -0
  35. data/lib/polars/meta_expr.rb +21 -0
  36. data/lib/polars/series.rb +2244 -192
  37. data/lib/polars/slice.rb +104 -0
  38. data/lib/polars/string_expr.rb +663 -2
  39. data/lib/polars/struct_expr.rb +73 -0
  40. data/lib/polars/utils.rb +76 -3
  41. data/lib/polars/version.rb +2 -1
  42. data/lib/polars/when.rb +1 -0
  43. data/lib/polars/when_then.rb +1 -0
  44. data/lib/polars.rb +8 -2
  45. metadata +12 -2
@@ -1,8 +1,22 @@
1
1
  module Polars
2
+ # Two-dimensional data structure representing data as a table with rows and columns.
2
3
  class DataFrame
4
+ # @private
3
5
  attr_accessor :_df
4
6
 
5
- def initialize(data = nil)
7
+ # Create a new DataFrame.
8
+ #
9
+ # @param data [Hash, Array, Series, nil]
10
+ # Two-dimensional data in various forms. Hash must contain Arrays.
11
+ # Array may contain Series.
12
+ # @param columns [Array, Hash, nil]
13
+ # Column labels to use for resulting DataFrame. If specified, overrides any
14
+ # labels already present in the data. Must match data dimensions.
15
+ # @param orient ["col", "row", nil]
16
+ # Whether to interpret two-dimensional data as columns or as rows. If `nil`,
17
+ # the orientation is inferred by matching the columns and data dimensions. If
18
+ # this does not yield conclusive results, column orientation is used.
19
+ def initialize(data = nil, columns: nil, orient: nil)
6
20
  if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
7
21
  result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
8
22
  data = {}
@@ -12,41 +26,204 @@ module Polars
12
26
  end
13
27
 
14
28
  if data.nil?
15
- self._df = hash_to_rbdf({})
29
+ self._df = hash_to_rbdf({}, columns: columns)
16
30
  elsif data.is_a?(Hash)
17
31
  data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
18
- self._df = hash_to_rbdf(data)
32
+ self._df = hash_to_rbdf(data, columns: columns)
19
33
  elsif data.is_a?(Array)
20
- self._df = sequence_to_rbdf(data)
34
+ self._df = sequence_to_rbdf(data, columns: columns, orient: orient)
21
35
  elsif data.is_a?(Series)
22
- self._df = series_to_rbdf(data)
36
+ self._df = series_to_rbdf(data, columns: columns)
23
37
  else
24
38
  raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
25
39
  end
26
40
  end
27
41
 
42
+ # @private
28
43
  def self._from_rbdf(rb_df)
29
44
  df = DataFrame.allocate
30
45
  df._df = rb_df
31
46
  df
32
47
  end
33
48
 
34
- def self._read_csv(file, has_header: true)
49
+ # def self._from_hashes
50
+ # end
51
+
52
+ # def self._from_hash
53
+ # end
54
+
55
+ # def self._from_records
56
+ # end
57
+
58
+ # def self._from_numo
59
+ # end
60
+
61
+ # no self._from_arrow
62
+
63
+ # no self._from_pandas
64
+
65
+ # @private
66
+ def self._read_csv(
67
+ file,
68
+ has_header: true,
69
+ columns: nil,
70
+ sep: str = ",",
71
+ comment_char: nil,
72
+ quote_char: '"',
73
+ skip_rows: 0,
74
+ dtypes: nil,
75
+ null_values: nil,
76
+ ignore_errors: false,
77
+ parse_dates: false,
78
+ n_threads: nil,
79
+ infer_schema_length: 100,
80
+ batch_size: 8192,
81
+ n_rows: nil,
82
+ encoding: "utf8",
83
+ low_memory: false,
84
+ rechunk: true,
85
+ skip_rows_after_header: 0,
86
+ row_count_name: nil,
87
+ row_count_offset: 0,
88
+ sample_size: 1024,
89
+ eol_char: "\n"
90
+ )
91
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
92
+ path = Utils.format_path(file)
93
+ else
94
+ path = nil
95
+ # if defined?(StringIO) && file.is_a?(StringIO)
96
+ # file = file.string
97
+ # end
98
+ end
99
+
100
+ dtype_list = nil
101
+ dtype_slice = nil
102
+ if !dtypes.nil?
103
+ if dtypes.is_a?(Hash)
104
+ dtype_list = []
105
+ dtypes.each do|k, v|
106
+ dtype_list << [k, Utils.rb_type_to_dtype(v)]
107
+ end
108
+ elsif dtypes.is_a?(Array)
109
+ dtype_slice = dtypes
110
+ else
111
+ raise ArgumentError, "dtype arg should be list or dict"
112
+ end
113
+ end
114
+
115
+ processed_null_values = Utils._process_null_values(null_values)
116
+
117
+ if columns.is_a?(String)
118
+ columns = [columns]
119
+ end
120
+ if file.is_a?(String) && file.include?("*")
121
+ raise Todo
122
+ end
123
+
124
+ projection, columns = Utils.handle_projection_columns(columns)
125
+
126
+ _from_rbdf(
127
+ RbDataFrame.read_csv(
128
+ file,
129
+ infer_schema_length,
130
+ batch_size,
131
+ has_header,
132
+ ignore_errors,
133
+ n_rows,
134
+ skip_rows,
135
+ projection,
136
+ sep,
137
+ rechunk,
138
+ columns,
139
+ encoding,
140
+ n_threads,
141
+ path,
142
+ dtype_list,
143
+ dtype_slice,
144
+ low_memory,
145
+ comment_char,
146
+ quote_char,
147
+ processed_null_values,
148
+ parse_dates,
149
+ skip_rows_after_header,
150
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
151
+ sample_size,
152
+ eol_char
153
+ )
154
+ )
155
+ end
156
+
157
+ # @private
158
+ def self._read_parquet(
159
+ file,
160
+ columns: nil,
161
+ n_rows: nil,
162
+ parallel: "auto",
163
+ row_count_name: nil,
164
+ row_count_offset: 0,
165
+ low_memory: false
166
+ )
35
167
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
36
168
  file = Utils.format_path(file)
37
169
  end
38
170
 
39
- _from_rbdf(RbDataFrame.read_csv(file, has_header))
171
+ if file.is_a?(String) && file.include?("*")
172
+ raise Todo
173
+ end
174
+
175
+ projection, columns = Utils.handle_projection_columns(columns)
176
+ _from_rbdf(
177
+ RbDataFrame.read_parquet(
178
+ file,
179
+ columns,
180
+ projection,
181
+ n_rows,
182
+ parallel,
183
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
184
+ low_memory
185
+ )
186
+ )
40
187
  end
41
188
 
42
- def self._read_parquet(file)
189
+ # def self._read_avro
190
+ # end
191
+
192
+ # @private
193
+ def self._read_ipc(
194
+ file,
195
+ columns: nil,
196
+ n_rows: nil,
197
+ row_count_name: nil,
198
+ row_count_offset: 0,
199
+ rechunk: true,
200
+ memory_map: true
201
+ )
43
202
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
44
203
  file = Utils.format_path(file)
45
204
  end
205
+ if columns.is_a?(String)
206
+ columns = [columns]
207
+ end
208
+
209
+ if file.is_a?(String) && file.include?("*")
210
+ raise Todo
211
+ end
46
212
 
47
- _from_rbdf(RbDataFrame.read_parquet(file))
213
+ projection, columns = Utils.handle_projection_columns(columns)
214
+ _from_rbdf(
215
+ RbDataFrame.read_ipc(
216
+ file,
217
+ columns,
218
+ projection,
219
+ n_rows,
220
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
221
+ memory_map
222
+ )
223
+ )
48
224
  end
49
225
 
226
+ # @private
50
227
  def self._read_json(file)
51
228
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
52
229
  file = Utils.format_path(file)
@@ -55,6 +232,7 @@ module Polars
55
232
  _from_rbdf(RbDataFrame.read_json(file))
56
233
  end
57
234
 
235
+ # @private
58
236
  def self._read_ndjson(file)
59
237
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
60
238
  file = Utils.format_path(file)
@@ -63,83 +241,339 @@ module Polars
63
241
  _from_rbdf(RbDataFrame.read_ndjson(file))
64
242
  end
65
243
 
244
+ # Get the shape of the DataFrame.
245
+ #
246
+ # @return [Array]
247
+ #
248
+ # @example
249
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
250
+ # df.shape
251
+ # # => [5, 1]
66
252
  def shape
67
253
  _df.shape
68
254
  end
69
255
 
256
+ # Get the height of the DataFrame.
257
+ #
258
+ # @return [Integer]
259
+ #
260
+ # @example
261
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
262
+ # df.height
263
+ # # => 5
70
264
  def height
71
265
  _df.height
72
266
  end
73
267
 
268
+ # Get the width of the DataFrame.
269
+ #
270
+ # @return [Integer]
271
+ #
272
+ # @example
273
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
274
+ # df.width
275
+ # # => 1
74
276
  def width
75
277
  _df.width
76
278
  end
77
279
 
280
+ # Get column names.
281
+ #
282
+ # @return [Array]
283
+ #
284
+ # @example
285
+ # df = Polars::DataFrame.new(
286
+ # {
287
+ # "foo" => [1, 2, 3],
288
+ # "bar" => [6, 7, 8],
289
+ # "ham" => ["a", "b", "c"]
290
+ # }
291
+ # )
292
+ # df.columns
293
+ # # => ["foo", "bar", "ham"]
78
294
  def columns
79
295
  _df.columns
80
296
  end
81
297
 
298
+ # Change the column names of the DataFrame.
299
+ #
300
+ # @param columns [Array]
301
+ # A list with new names for the DataFrame.
302
+ # The length of the list should be equal to the width of the DataFrame.
303
+ #
304
+ # @return [Object]
305
+ #
306
+ # @example
307
+ # df = Polars::DataFrame.new(
308
+ # {
309
+ # "foo" => [1, 2, 3],
310
+ # "bar" => [6, 7, 8],
311
+ # "ham" => ["a", "b", "c"]
312
+ # }
313
+ # )
314
+ # df.columns = ["apple", "banana", "orange"]
315
+ # df
316
+ # # =>
317
+ # # shape: (3, 3)
318
+ # # ┌───────┬────────┬────────┐
319
+ # # │ apple ┆ banana ┆ orange │
320
+ # # │ --- ┆ --- ┆ --- │
321
+ # # │ i64 ┆ i64 ┆ str │
322
+ # # ╞═══════╪════════╪════════╡
323
+ # # │ 1 ┆ 6 ┆ a │
324
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
325
+ # # │ 2 ┆ 7 ┆ b │
326
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
327
+ # # │ 3 ┆ 8 ┆ c │
328
+ # # └───────┴────────┴────────┘
82
329
  def columns=(columns)
83
330
  _df.set_column_names(columns)
84
331
  end
85
332
 
333
+ # Get dtypes of columns in DataFrame. Dtypes can also be found in column headers when printing the DataFrame.
334
+ #
335
+ # @return [Array]
336
+ #
337
+ # @example
338
+ # df = Polars::DataFrame.new(
339
+ # {
340
+ # "foo" => [1, 2, 3],
341
+ # "bar" => [6.0, 7.0, 8.0],
342
+ # "ham" => ["a", "b", "c"]
343
+ # }
344
+ # )
345
+ # df.dtypes
346
+ # # => [:i64, :f64, :str]
86
347
  def dtypes
87
- _df.dtypes.map(&:to_sym)
348
+ _df.dtypes
88
349
  end
89
350
 
351
+ # Get the schema.
352
+ #
353
+ # @return [Hash]
354
+ #
355
+ # @example
356
+ # df = Polars::DataFrame.new(
357
+ # {
358
+ # "foo" => [1, 2, 3],
359
+ # "bar" => [6.0, 7.0, 8.0],
360
+ # "ham" => ["a", "b", "c"]
361
+ # }
362
+ # )
363
+ # df.schema
364
+ # # => {"foo"=>:i64, "bar"=>:f64, "ham"=>:str}
90
365
  def schema
91
366
  columns.zip(dtypes).to_h
92
367
  end
93
368
 
94
- # def ==(other)
95
- # end
369
+ # Equal.
370
+ #
371
+ # @return [DataFrame]
372
+ def ==(other)
373
+ _comp(other, "eq")
374
+ end
96
375
 
97
- # def !=(other)
98
- # end
376
+ # Not equal.
377
+ #
378
+ # @return [DataFrame]
379
+ def !=(other)
380
+ _comp(other, "neq")
381
+ end
99
382
 
100
- # def >(other)
101
- # end
383
+ # Greater than.
384
+ #
385
+ # @return [DataFrame]
386
+ def >(other)
387
+ _comp(other, "gt")
388
+ end
102
389
 
103
- # def <(other)
104
- # end
390
+ # Less than.
391
+ #
392
+ # @return [DataFrame]
393
+ def <(other)
394
+ _comp(other, "lt")
395
+ end
105
396
 
106
- # def >=(other)
107
- # end
397
+ # Greater than or equal.
398
+ #
399
+ # @return [DataFrame]
400
+ def >=(other)
401
+ _comp(other, "gt_eq")
402
+ end
108
403
 
109
- # def <=(other)
110
- # end
404
+ # Less than or equal.
405
+ #
406
+ # @return [DataFrame]
407
+ def <=(other)
408
+ _comp(other, "lt_eq")
409
+ end
111
410
 
112
- # def *(other)
113
- # end
411
+ # Performs multiplication.
412
+ #
413
+ # @return [DataFrame]
414
+ def *(other)
415
+ if other.is_a?(DataFrame)
416
+ return _from_rbdf(_df.mul_df(other._df))
417
+ end
114
418
 
115
- # def /(other)
116
- # end
419
+ other = _prepare_other_arg(other)
420
+ _from_rbdf(_df.mul(other._s))
421
+ end
117
422
 
118
- # def +(other)
119
- # end
423
+ # Performs division.
424
+ #
425
+ # @return [DataFrame]
426
+ def /(other)
427
+ if other.is_a?(DataFrame)
428
+ return _from_rbdf(_df.div_df(other._df))
429
+ end
120
430
 
121
- # def -(other)
122
- # end
431
+ other = _prepare_other_arg(other)
432
+ _from_rbdf(_df.div(other._s))
433
+ end
123
434
 
124
- # def %(other)
125
- # end
435
+ # Performs addition.
436
+ #
437
+ # @return [DataFrame]
438
+ def +(other)
439
+ if other.is_a?(DataFrame)
440
+ return _from_rbdf(_df.add_df(other._df))
441
+ end
442
+
443
+ other = _prepare_other_arg(other)
444
+ _from_rbdf(_df.add(other._s))
445
+ end
446
+
447
+ # Performs subtraction.
448
+ #
449
+ # @return [DataFrame]
450
+ def -(other)
451
+ if other.is_a?(DataFrame)
452
+ return _from_rbdf(_df.sub_df(other._df))
453
+ end
454
+
455
+ other = _prepare_other_arg(other)
456
+ _from_rbdf(_df.sub(other._s))
457
+ end
458
+
459
+ # Returns the modulo.
460
+ #
461
+ # @return [DataFrame]
462
+ def %(other)
463
+ if other.is_a?(DataFrame)
464
+ return _from_rbdf(_df.rem_df(other._df))
465
+ end
466
+
467
+ other = _prepare_other_arg(other)
468
+ _from_rbdf(_df.rem(other._s))
469
+ end
126
470
 
471
+ # Returns a string representing the DataFrame.
472
+ #
473
+ # @return [String]
127
474
  def to_s
128
475
  _df.to_s
129
476
  end
130
477
  alias_method :inspect, :to_s
131
478
 
479
+ # Check if DataFrame includes column.
480
+ #
481
+ # @return [Boolean]
132
482
  def include?(name)
133
483
  columns.include?(name)
134
484
  end
135
485
 
136
- def [](name)
137
- Utils.wrap_s(_df.column(name))
486
+ # def each
487
+ # end
488
+
489
+ # def _pos_idx
490
+ # end
491
+
492
+ # def _pos_idxs
493
+ # end
494
+
495
+ # Returns subset of the DataFrame.
496
+ #
497
+ # @return [Object]
498
+ def [](*args)
499
+ if args.size == 2
500
+ row_selection, col_selection = args
501
+
502
+ # df[.., unknown]
503
+ if row_selection.is_a?(Range)
504
+
505
+ # multiple slices
506
+ # df[.., ..]
507
+ if col_selection.is_a?(Range)
508
+ raise Todo
509
+ end
510
+ end
511
+
512
+ # df[2, ..] (select row as df)
513
+ if row_selection.is_a?(Integer)
514
+ if col_selection.is_a?(Array)
515
+ df = self[0.., col_selection]
516
+ return df.slice(row_selection, 1)
517
+ end
518
+ # df[2, "a"]
519
+ if col_selection.is_a?(String)
520
+ return self[col_selection][row_selection]
521
+ end
522
+ end
523
+
524
+ # column selection can be "a" or ["a", "b"]
525
+ if col_selection.is_a?(String)
526
+ col_selection = [col_selection]
527
+ end
528
+
529
+ # df[.., 1]
530
+ if col_selection.is_a?(Integer)
531
+ series = to_series(col_selection)
532
+ return series[row_selection]
533
+ end
534
+
535
+ if col_selection.is_a?(Array)
536
+ # df[.., [1, 2]]
537
+ if is_int_sequence(col_selection)
538
+ series_list = col_selection.map { |i| to_series(i) }
539
+ df = self.class.new(series_list)
540
+ return df[row_selection]
541
+ end
542
+ end
543
+
544
+ df = self[col_selection]
545
+ return df[row_selection]
546
+ elsif args.size == 1
547
+ item = args[0]
548
+
549
+ # select single column
550
+ # df["foo"]
551
+ if item.is_a?(String)
552
+ return Utils.wrap_s(_df.column(item))
553
+ end
554
+
555
+ # df[idx]
556
+ if item.is_a?(Integer)
557
+ return slice(_pos_idx(item, dim: 0), 1)
558
+ end
559
+
560
+ # df[..]
561
+ if item.is_a?(Range)
562
+ return Slice.new(self).apply(item)
563
+ end
564
+ end
565
+
566
+ raise ArgumentError, "Cannot get item of type: #{item.class.name}"
138
567
  end
139
568
 
140
569
  # def []=(key, value)
141
570
  # end
142
571
 
572
+ # no to_arrow
573
+
574
+ # Convert DataFrame to a hash mapping column name to values.
575
+ #
576
+ # @return [Hash]
143
577
  def to_h(as_series: true)
144
578
  if as_series
145
579
  get_columns.to_h { |s| [s.name, s] }
@@ -148,7 +582,7 @@ module Polars
148
582
  end
149
583
  end
150
584
 
151
- # def to_hs / to_a
585
+ # def to_hashes / to_a
152
586
  # end
153
587
 
154
588
  # def to_numo
@@ -156,6 +590,30 @@ module Polars
156
590
 
157
591
  # no to_pandas
158
592
 
593
+ # Select column as Series at index location.
594
+ #
595
+ # @param index [Integer]
596
+ # Location of selection.
597
+ #
598
+ # @return [Series]
599
+ #
600
+ # @example
601
+ # df = Polars::DataFrame.new(
602
+ # {
603
+ # "foo" => [1, 2, 3],
604
+ # "bar" => [6, 7, 8],
605
+ # "ham" => ["a", "b", "c"]
606
+ # }
607
+ # )
608
+ # df.to_series(1)
609
+ # # =>
610
+ # # shape: (3,)
611
+ # # Series: 'bar' [i64]
612
+ # # [
613
+ # # 6
614
+ # # 7
615
+ # # 8
616
+ # # ]
159
617
  def to_series(index = 0)
160
618
  if index < 0
161
619
  index = columns.length + index
@@ -163,6 +621,18 @@ module Polars
163
621
  Utils.wrap_s(_df.select_at_idx(index))
164
622
  end
165
623
 
624
+ # Serialize to JSON representation.
625
+ #
626
+ # @return [nil]
627
+ #
628
+ # @param file [String]
629
+ # File path to which the result should be written.
630
+ # @param pretty [Boolean]
631
+ # Pretty-print the serialized JSON.
632
+ # @param row_oriented [Boolean]
633
+ # Write row-oriented JSON. This is slower, but more common.
634
+ #
635
+ # @see #write_ndjson
166
636
  def write_json(
167
637
  file,
168
638
  pretty: false,
@@ -176,6 +646,12 @@ module Polars
176
646
  nil
177
647
  end
178
648
 
649
+ # Serialize to newline delimited JSON representation.
650
+ #
651
+ # @param file [String]
652
+ # File path to which the result should be written.
653
+ #
654
+ # @return [nil]
179
655
  def write_ndjson(file)
180
656
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
181
657
  file = Utils.format_path(file)
@@ -185,6 +661,50 @@ module Polars
185
661
  nil
186
662
  end
187
663
 
664
+ # Write to comma-separated values (CSV) file.
665
+ #
666
+ # @param file [String, nil]
667
+ # File path to which the result should be written. If set to `nil`
668
+ # (default), the output is returned as a string instead.
669
+ # @param has_header [Boolean]
670
+ # Whether to include header in the CSV output.
671
+ # @param sep [String]
672
+ # Separate CSV fields with this symbol.
673
+ # @param quote [String]
674
+ # Byte to use as quoting character.
675
+ # @param batch_size [Integer]
676
+ # Number of rows that will be processed per thread.
677
+ # @param datetime_format [String, nil]
678
+ # A format string, with the specifiers defined by the
679
+ # [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
680
+ # Rust crate. If no format specified, the default fractional-second
681
+ # precision is inferred from the maximum timeunit found in the frame's
682
+ # Datetime cols (if any).
683
+ # @param date_format [String, nil]
684
+ # A format string, with the specifiers defined by the
685
+ # [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
686
+ # Rust crate.
687
+ # @param time_format [String, nil]
688
+ # A format string, with the specifiers defined by the
689
+ # [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
690
+ # Rust crate.
691
+ # @param float_precision [Integer, nil]
692
+ # Number of decimal places to write, applied to both `:f32` and
693
+ # `:f64` datatypes.
694
+ # @param null_value [String, nil]
695
+ # A string representing null values (defaulting to the empty string).
696
+ #
697
+ # @return [String, nil]
698
+ #
699
+ # @example
700
+ # df = Polars::DataFrame.new(
701
+ # {
702
+ # "foo" => [1, 2, 3, 4, 5],
703
+ # "bar" => [6, 7, 8, 9, 10],
704
+ # "ham" => ["a", "b", "c", "d", "e"]
705
+ # }
706
+ # )
707
+ # df.write_csv("file.csv")
188
708
  def write_csv(
189
709
  file = nil,
190
710
  has_header: true,
@@ -220,8 +740,7 @@ module Polars
220
740
  float_precision,
221
741
  null_value
222
742
  )
223
- buffer.rewind
224
- return buffer.read.force_encoding(Encoding::UTF_8)
743
+ return buffer.string.force_encoding(Encoding::UTF_8)
225
744
  end
226
745
 
227
746
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
@@ -246,9 +765,50 @@ module Polars
246
765
  # def write_avro
247
766
  # end
248
767
 
249
- # def write_ipc
250
- # end
768
+ # Write to Arrow IPC binary stream or Feather file.
769
+ #
770
+ # @param file [String]
771
+ # File path to which the file should be written.
772
+ # @param compression ["uncompressed", "lz4", "zstd"]
773
+ # Compression method. Defaults to "uncompressed".
774
+ #
775
+ # @return [nil]
776
+ def write_ipc(file, compression: "uncompressed")
777
+ if compression.nil?
778
+ compression = "uncompressed"
779
+ end
780
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
781
+ file = Utils.format_path(file)
782
+ end
251
783
 
784
+ _df.write_ipc(file, compression)
785
+ end
786
+
787
+ # Write to Apache Parquet file.
788
+ #
789
+ # @param file [String]
790
+ # File path to which the file should be written.
791
+ # @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
792
+ # Choose "zstd" for good compression performance.
793
+ # Choose "lz4" for fast compression/decompression.
794
+ # Choose "snappy" for more backwards compatibility guarantees
795
+ # when you deal with older parquet readers.
796
+ # @param compression_level [Integer, nil]
797
+ # The level of compression to use. Higher compression means smaller files on
798
+ # disk.
799
+ #
800
+ # - "gzip" : min-level: 0, max-level: 10.
801
+ # - "brotli" : min-level: 0, max-level: 11.
802
+ # - "zstd" : min-level: 1, max-level: 22.
803
+ # @param statistics [Boolean]
804
+ # Write statistics to the parquet headers. This requires extra compute.
805
+ # @param row_group_size [Integer, nil]
806
+ # Size of the row groups in number of rows.
807
+ # If `nil` (default), the chunks of the DataFrame are
808
+ # used. Writing in smaller chunks may reduce memory pressure and improve
809
+ # writing speeds.
810
+ #
811
+ # @return [nil]
252
812
  def write_parquet(
253
813
  file,
254
814
  compression: "zstd",
@@ -268,6 +828,39 @@ module Polars
268
828
  )
269
829
  end
270
830
 
831
+ # Return an estimation of the total (heap) allocated size of the DataFrame.
832
+ #
833
+ # Estimated size is given in the specified unit (bytes by default).
834
+ #
835
+ # This estimation is the sum of the sizes of its buffers and validity bitmaps, including
836
+ # nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the
837
+ # size of 2 arrays is not the sum of the sizes computed from this function. In
838
+ # particular, StructArray's size is an upper bound.
839
+ #
840
+ # When an array is sliced, its allocated size remains constant because the buffer is
841
+ # unchanged. However, this function will yield a smaller number. This is because
842
+ # this function returns the visible size of the buffer, not its total capacity.
843
+ #
844
+ # FFI buffers are included in this estimation.
845
+ #
846
+ # @param unit ["b", "kb", "mb", "gb", "tb"]
847
+ # Scale the returned size to the given unit.
848
+ #
849
+ # @return [Numeric]
850
+ #
851
+ # @example
852
+ # df = Polars::DataFrame.new(
853
+ # {
854
+ # "x" => 1_000_000.times.to_a.reverse,
855
+ # "y" => 1_000_000.times.map { |v| v / 1000.0 },
856
+ # "z" => 1_000_000.times.map(&:to_s)
857
+ # },
858
+ # columns: {"x" => :u32, "y" => :f64, "z" => :str}
859
+ # )
860
+ # df.estimated_size
861
+ # # => 25888898
862
+ # df.estimated_size("mb")
863
+ # # => 24.689577102661133
271
864
  def estimated_size(unit = "b")
272
865
  sz = _df.estimated_size
273
866
  Utils.scale_bytes(sz, to: unit)
@@ -276,14 +869,120 @@ module Polars
276
869
  # def transpose
277
870
  # end
278
871
 
872
+ # Reverse the DataFrame.
873
+ #
874
+ # @return [DataFrame]
875
+ #
876
+ # @example
877
+ # df = Polars::DataFrame.new(
878
+ # {
879
+ # "key" => ["a", "b", "c"],
880
+ # "val" => [1, 2, 3]
881
+ # }
882
+ # )
883
+ # df.reverse
884
+ # # =>
885
+ # # shape: (3, 2)
886
+ # # ┌─────┬─────┐
887
+ # # │ key ┆ val │
888
+ # # │ --- ┆ --- │
889
+ # # │ str ┆ i64 │
890
+ # # ╞═════╪═════╡
891
+ # # │ c ┆ 3 │
892
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
893
+ # # │ b ┆ 2 │
894
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
895
+ # # │ a ┆ 1 │
896
+ # # └─────┴─────┘
279
897
  def reverse
280
898
  select(Polars.col("*").reverse)
281
899
  end
282
900
 
901
+ # Rename column names.
902
+ #
903
+ # @param mapping [Hash]
904
+ # Key value pairs that map from old name to new name.
905
+ #
906
+ # @return [DataFrame]
907
+ #
908
+ # @example
909
+ # df = Polars::DataFrame.new(
910
+ # {
911
+ # "foo" => [1, 2, 3],
912
+ # "bar" => [6, 7, 8],
913
+ # "ham" => ["a", "b", "c"]
914
+ # }
915
+ # )
916
+ # df.rename({"foo" => "apple"})
917
+ # # =>
918
+ # # shape: (3, 3)
919
+ # # ┌───────┬─────┬─────┐
920
+ # # │ apple ┆ bar ┆ ham │
921
+ # # │ --- ┆ --- ┆ --- │
922
+ # # │ i64 ┆ i64 ┆ str │
923
+ # # ╞═══════╪═════╪═════╡
924
+ # # │ 1 ┆ 6 ┆ a │
925
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
926
+ # # │ 2 ┆ 7 ┆ b │
927
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
928
+ # # │ 3 ┆ 8 ┆ c │
929
+ # # └───────┴─────┴─────┘
283
930
  def rename(mapping)
284
931
  lazy.rename(mapping).collect(no_optimization: true)
285
932
  end
286
933
 
934
+ # Insert a Series at a certain column index. This operation is in place.
935
+ #
936
+ # @param index [Integer]
937
+ # Column index at which to insert the new `Series`.
938
+ # @param series [Series]
939
+ # `Series` to insert.
940
+ #
941
+ # @return [DataFrame]
942
+ #
943
+ # @example
944
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
945
+ # s = Polars::Series.new("baz", [97, 98, 99])
946
+ # df.insert_at_idx(1, s)
947
+ # # =>
948
+ # # shape: (3, 3)
949
+ # # ┌─────┬─────┬─────┐
950
+ # # │ foo ┆ baz ┆ bar │
951
+ # # │ --- ┆ --- ┆ --- │
952
+ # # │ i64 ┆ i64 ┆ i64 │
953
+ # # ╞═════╪═════╪═════╡
954
+ # # │ 1 ┆ 97 ┆ 4 │
955
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
956
+ # # │ 2 ┆ 98 ┆ 5 │
957
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
958
+ # # │ 3 ┆ 99 ┆ 6 │
959
+ # # └─────┴─────┴─────┘
960
+ #
961
+ # @example
962
+ # df = Polars::DataFrame.new(
963
+ # {
964
+ # "a" => [1, 2, 3, 4],
965
+ # "b" => [0.5, 4, 10, 13],
966
+ # "c" => [true, true, false, true]
967
+ # }
968
+ # )
969
+ # s = Polars::Series.new("d", [-2.5, 15, 20.5, 0])
970
+ # df.insert_at_idx(3, s)
971
+ # # =>
972
+ # # shape: (4, 4)
973
+ # # ┌─────┬──────┬───────┬──────┐
974
+ # # │ a ┆ b ┆ c ┆ d │
975
+ # # │ --- ┆ --- ┆ --- ┆ --- │
976
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 │
977
+ # # ╞═════╪══════╪═══════╪══════╡
978
+ # # │ 1 ┆ 0.5 ┆ true ┆ -2.5 │
979
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
980
+ # # │ 2 ┆ 4.0 ┆ true ┆ 15.0 │
981
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
982
+ # # │ 3 ┆ 10.0 ┆ false ┆ 20.5 │
983
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
984
+ # # │ 4 ┆ 13.0 ┆ true ┆ 0.0 │
985
+ # # └─────┴──────┴───────┴──────┘
287
986
  def insert_at_idx(index, series)
288
987
  if index < 0
289
988
  index = columns.length + index
@@ -292,30 +991,337 @@ module Polars
292
991
  self
293
992
  end
294
993
 
994
+ # Filter the rows in the DataFrame based on a predicate expression.
995
+ #
996
+ # @param predicate [Expr]
997
+ # Expression that evaluates to a boolean Series.
998
+ #
999
+ # @return [DataFrame]
1000
+ #
1001
+ # @example Filter on one condition:
1002
+ # df = Polars::DataFrame.new(
1003
+ # {
1004
+ # "foo" => [1, 2, 3],
1005
+ # "bar" => [6, 7, 8],
1006
+ # "ham" => ["a", "b", "c"]
1007
+ # }
1008
+ # )
1009
+ # df.filter(Polars.col("foo") < 3)
1010
+ # # =>
1011
+ # # shape: (2, 3)
1012
+ # # ┌─────┬─────┬─────┐
1013
+ # # │ foo ┆ bar ┆ ham │
1014
+ # # │ --- ┆ --- ┆ --- │
1015
+ # # │ i64 ┆ i64 ┆ str │
1016
+ # # ╞═════╪═════╪═════╡
1017
+ # # │ 1 ┆ 6 ┆ a │
1018
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1019
+ # # │ 2 ┆ 7 ┆ b │
1020
+ # # └─────┴─────┴─────┘
1021
+ #
1022
+ # @example Filter on multiple conditions:
1023
+ # df.filter((Polars.col("foo") < 3) & (Polars.col("ham") == "a"))
1024
+ # # =>
1025
+ # # shape: (1, 3)
1026
+ # # ┌─────┬─────┬─────┐
1027
+ # # │ foo ┆ bar ┆ ham │
1028
+ # # │ --- ┆ --- ┆ --- │
1029
+ # # │ i64 ┆ i64 ┆ str │
1030
+ # # ╞═════╪═════╪═════╡
1031
+ # # │ 1 ┆ 6 ┆ a │
1032
+ # # └─────┴─────┴─────┘
295
1033
  def filter(predicate)
296
1034
  lazy.filter(predicate).collect
297
1035
  end
298
1036
 
299
- # def describe
300
- # end
1037
+ # Summary statistics for a DataFrame.
1038
+ #
1039
+ # @return [DataFrame]
1040
+ #
1041
+ # @example
1042
+ # df = Polars::DataFrame.new(
1043
+ # {
1044
+ # "a" => [1.0, 2.8, 3.0],
1045
+ # "b" => [4, 5, nil],
1046
+ # "c" => [true, false, true],
1047
+ # "d" => [nil, "b", "c"],
1048
+ # "e" => ["usd", "eur", nil]
1049
+ # }
1050
+ # )
1051
+ # df.describe
1052
+ # # =>
1053
+ # # shape: (7, 6)
1054
+ # # ┌────────────┬──────────┬──────────┬──────┬──────┬──────┐
1055
+ # # │ describe ┆ a ┆ b ┆ c ┆ d ┆ e │
1056
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1057
+ # # │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str │
1058
+ # # ╞════════════╪══════════╪══════════╪══════╪══════╪══════╡
1059
+ # # │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 │
1060
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1061
+ # # │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 │
1062
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1063
+ # # │ mean ┆ 2.266667 ┆ 4.5 ┆ null ┆ null ┆ null │
1064
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1065
+ # # │ std ┆ 1.101514 ┆ 0.707107 ┆ null ┆ null ┆ null │
1066
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1067
+ # # │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur │
1068
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1069
+ # # │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd │
1070
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1071
+ # # │ median ┆ 2.8 ┆ 4.5 ┆ null ┆ null ┆ null │
1072
+ # # └────────────┴──────────┴──────────┴──────┴──────┴──────┘
1073
+ def describe
1074
+ describe_cast = lambda do |stat|
1075
+ columns = []
1076
+ self.columns.each_with_index do |s, i|
1077
+ if self[s].is_numeric || self[s].is_boolean
1078
+ columns << stat[0.., i].cast(:f64)
1079
+ else
1080
+ # for dates, strings, etc, we cast to string so that all
1081
+ # statistics can be shown
1082
+ columns << stat[0.., i].cast(:str)
1083
+ end
1084
+ end
1085
+ self.class.new(columns)
1086
+ end
301
1087
 
302
- # def find_idx_by_name
303
- # end
1088
+ summary = _from_rbdf(
1089
+ Polars.concat(
1090
+ [
1091
+ describe_cast.(
1092
+ self.class.new(columns.to_h { |c| [c, [height]] })
1093
+ ),
1094
+ describe_cast.(null_count),
1095
+ describe_cast.(mean),
1096
+ describe_cast.(std),
1097
+ describe_cast.(min),
1098
+ describe_cast.(max),
1099
+ describe_cast.(median)
1100
+ ]
1101
+ )._df
1102
+ )
1103
+ summary.insert_at_idx(
1104
+ 0,
1105
+ Polars::Series.new(
1106
+ "describe",
1107
+ ["count", "null_count", "mean", "std", "min", "max", "median"],
1108
+ )
1109
+ )
1110
+ summary
1111
+ end
304
1112
 
305
- # def replace_at_idx
306
- # end
1113
+ # Find the index of a column by name.
1114
+ #
1115
+ # @param name [String]
1116
+ # Name of the column to find.
1117
+ #
1118
+ # @return [Integer]
1119
+ #
1120
+ # @example
1121
+ # df = Polars::DataFrame.new(
1122
+ # {"foo" => [1, 2, 3], "bar" => [6, 7, 8], "ham" => ["a", "b", "c"]}
1123
+ # )
1124
+ # df.find_idx_by_name("ham")
1125
+ # # => 2
1126
+ def find_idx_by_name(name)
1127
+ _df.find_idx_by_name(name)
1128
+ end
1129
+
1130
+ # Replace a column at an index location.
1131
+ #
1132
+ # @param index [Integer]
1133
+ # Column index.
1134
+ # @param series [Series]
1135
+ # Series that will replace the column.
1136
+ #
1137
+ # @return [DataFrame]
1138
+ #
1139
+ # @example
1140
+ # df = Polars::DataFrame.new(
1141
+ # {
1142
+ # "foo" => [1, 2, 3],
1143
+ # "bar" => [6, 7, 8],
1144
+ # "ham" => ["a", "b", "c"]
1145
+ # }
1146
+ # )
1147
+ # s = Polars::Series.new("apple", [10, 20, 30])
1148
+ # df.replace_at_idx(0, s)
1149
+ # # =>
1150
+ # # shape: (3, 3)
1151
+ # # ┌───────┬─────┬─────┐
1152
+ # # │ apple ┆ bar ┆ ham │
1153
+ # # │ --- ┆ --- ┆ --- │
1154
+ # # │ i64 ┆ i64 ┆ str │
1155
+ # # ╞═══════╪═════╪═════╡
1156
+ # # │ 10 ┆ 6 ┆ a │
1157
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1158
+ # # │ 20 ┆ 7 ┆ b │
1159
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1160
+ # # │ 30 ┆ 8 ┆ c │
1161
+ # # └───────┴─────┴─────┘
1162
+ def replace_at_idx(index, series)
1163
+ if index < 0
1164
+ index = columns.length + index
1165
+ end
1166
+ _df.replace_at_idx(index, series._s)
1167
+ self
1168
+ end
307
1169
 
1170
+ # Sort the DataFrame by column.
1171
+ #
1172
+ # @param by [String, Expr, Array]
1173
+ # By which column to sort.
1174
+ # @param reverse [Boolean]
1175
+ # Reverse/descending sort.
1176
+ # @param nulls_last [Boolean]
1177
+ # Place null values last. Can only be used if sorted by a single column.
1178
+ #
1179
+ # @return [DataFrame]
1180
+ #
1181
+ # @example
1182
+ # df = Polars::DataFrame.new(
1183
+ # {
1184
+ # "foo" => [1, 2, 3],
1185
+ # "bar" => [6.0, 7.0, 8.0],
1186
+ # "ham" => ["a", "b", "c"]
1187
+ # }
1188
+ # )
1189
+ # df.sort("foo", reverse: true)
1190
+ # # =>
1191
+ # # shape: (3, 3)
1192
+ # # ┌─────┬─────┬─────┐
1193
+ # # │ foo ┆ bar ┆ ham │
1194
+ # # │ --- ┆ --- ┆ --- │
1195
+ # # │ i64 ┆ f64 ┆ str │
1196
+ # # ╞═════╪═════╪═════╡
1197
+ # # │ 3 ┆ 8.0 ┆ c │
1198
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1199
+ # # │ 2 ┆ 7.0 ┆ b │
1200
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1201
+ # # │ 1 ┆ 6.0 ┆ a │
1202
+ # # └─────┴─────┴─────┘
1203
+ #
1204
+ # @example Sort by multiple columns.
1205
+ # df.sort(
1206
+ # [Polars.col("foo"), Polars.col("bar")**2],
1207
+ # reverse: [true, false]
1208
+ # )
1209
+ # # =>
1210
+ # # shape: (3, 3)
1211
+ # # ┌─────┬─────┬─────┐
1212
+ # # │ foo ┆ bar ┆ ham │
1213
+ # # │ --- ┆ --- ┆ --- │
1214
+ # # │ i64 ┆ f64 ┆ str │
1215
+ # # ╞═════╪═════╪═════╡
1216
+ # # │ 3 ┆ 8.0 ┆ c │
1217
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1218
+ # # │ 2 ┆ 7.0 ┆ b │
1219
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1220
+ # # │ 1 ┆ 6.0 ┆ a │
1221
+ # # └─────┴─────┴─────┘
308
1222
  def sort(by, reverse: false, nulls_last: false)
309
- _from_rbdf(_df.sort(by, reverse, nulls_last))
1223
+ if by.is_a?(Array) || by.is_a?(Expr)
1224
+ lazy
1225
+ .sort(by, reverse: reverse, nulls_last: nulls_last)
1226
+ .collect(no_optimization: true, string_cache: false)
1227
+ else
1228
+ _from_rbdf(_df.sort(by, reverse, nulls_last))
1229
+ end
310
1230
  end
311
1231
 
1232
+ # Check if DataFrame is equal to other.
1233
+ #
1234
+ # @param other [DataFrame]
1235
+ # DataFrame to compare with.
1236
+ # @param null_equal [Boolean]
1237
+ # Consider null values as equal.
1238
+ #
1239
+ # @return [Boolean]
1240
+ #
1241
+ # @example
1242
+ # df1 = Polars::DataFrame.new(
1243
+ # {
1244
+ # "foo" => [1, 2, 3],
1245
+ # "bar" => [6.0, 7.0, 8.0],
1246
+ # "ham" => ["a", "b", "c"]
1247
+ # }
1248
+ # )
1249
+ # df2 = Polars::DataFrame.new(
1250
+ # {
1251
+ # "foo" => [3, 2, 1],
1252
+ # "bar" => [8.0, 7.0, 6.0],
1253
+ # "ham" => ["c", "b", "a"]
1254
+ # }
1255
+ # )
1256
+ # df1.frame_equal(df1)
1257
+ # # => true
1258
+ # df1.frame_equal(df2)
1259
+ # # => false
312
1260
  def frame_equal(other, null_equal: true)
313
1261
  _df.frame_equal(other._df, null_equal)
314
1262
  end
315
1263
 
316
- # def replace
317
- # end
1264
+ # Replace a column by a new Series.
1265
+ #
1266
+ # @param column [String]
1267
+ # Column to replace.
1268
+ # @param new_col [Series]
1269
+ # New column to insert.
1270
+ #
1271
+ # @return [DataFrame]
1272
+ #
1273
+ # @example
1274
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
1275
+ # s = Polars::Series.new([10, 20, 30])
1276
+ # df.replace("foo", s)
1277
+ # # =>
1278
+ # # shape: (3, 2)
1279
+ # # ┌─────┬─────┐
1280
+ # # │ foo ┆ bar │
1281
+ # # │ --- ┆ --- │
1282
+ # # │ i64 ┆ i64 │
1283
+ # # ╞═════╪═════╡
1284
+ # # │ 10 ┆ 4 │
1285
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1286
+ # # │ 20 ┆ 5 │
1287
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1288
+ # # │ 30 ┆ 6 │
1289
+ # # └─────┴─────┘
1290
+ def replace(column, new_col)
1291
+ _df.replace(column, new_col._s)
1292
+ self
1293
+ end
318
1294
 
1295
+ # Get a slice of this DataFrame.
1296
+ #
1297
+ # @param offset [Integer]
1298
+ # Start index. Negative indexing is supported.
1299
+ # @param length [Integer, nil]
1300
+ # Length of the slice. If set to `nil`, all rows starting at the offset
1301
+ # will be selected.
1302
+ #
1303
+ # @return [DataFrame]
1304
+ #
1305
+ # @example
1306
+ # df = Polars::DataFrame.new(
1307
+ # {
1308
+ # "foo" => [1, 2, 3],
1309
+ # "bar" => [6.0, 7.0, 8.0],
1310
+ # "ham" => ["a", "b", "c"]
1311
+ # }
1312
+ # )
1313
+ # df.slice(1, 2)
1314
+ # # =>
1315
+ # # shape: (2, 3)
1316
+ # # ┌─────┬─────┬─────┐
1317
+ # # │ foo ┆ bar ┆ ham │
1318
+ # # │ --- ┆ --- ┆ --- │
1319
+ # # │ i64 ┆ f64 ┆ str │
1320
+ # # ╞═════╪═════╪═════╡
1321
+ # # │ 2 ┆ 7.0 ┆ b │
1322
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1323
+ # # │ 3 ┆ 8.0 ┆ c │
1324
+ # # └─────┴─────┴─────┘
319
1325
  def slice(offset, length = nil)
320
1326
  if !length.nil? && length < 0
321
1327
  length = height - offset + length
@@ -323,29 +1329,222 @@ module Polars
323
1329
  _from_rbdf(_df.slice(offset, length))
324
1330
  end
325
1331
 
1332
+ # Get the first `n` rows.
1333
+ #
1334
+ # Alias for {#head}.
1335
+ #
1336
+ # @param n [Integer]
1337
+ # Number of rows to return.
1338
+ #
1339
+ # @return [DataFrame]
1340
+ #
1341
+ # @example
1342
+ # df = Polars::DataFrame.new(
1343
+ # {"foo" => [1, 2, 3, 4, 5, 6], "bar" => ["a", "b", "c", "d", "e", "f"]}
1344
+ # )
1345
+ # df.limit(4)
1346
+ # # =>
1347
+ # # shape: (4, 2)
1348
+ # # ┌─────┬─────┐
1349
+ # # │ foo ┆ bar │
1350
+ # # │ --- ┆ --- │
1351
+ # # │ i64 ┆ str │
1352
+ # # ╞═════╪═════╡
1353
+ # # │ 1 ┆ a │
1354
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1355
+ # # │ 2 ┆ b │
1356
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1357
+ # # │ 3 ┆ c │
1358
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1359
+ # # │ 4 ┆ d │
1360
+ # # └─────┴─────┘
326
1361
  def limit(n = 5)
327
1362
  head(n)
328
1363
  end
329
1364
 
1365
+ # Get the first `n` rows.
1366
+ #
1367
+ # @param n [Integer]
1368
+ # Number of rows to return.
1369
+ #
1370
+ # @return [DataFrame]
1371
+ #
1372
+ # @example
1373
+ # df = Polars::DataFrame.new(
1374
+ # {
1375
+ # "foo" => [1, 2, 3, 4, 5],
1376
+ # "bar" => [6, 7, 8, 9, 10],
1377
+ # "ham" => ["a", "b", "c", "d", "e"]
1378
+ # }
1379
+ # )
1380
+ # df.head(3)
1381
+ # # =>
1382
+ # # shape: (3, 3)
1383
+ # # ┌─────┬─────┬─────┐
1384
+ # # │ foo ┆ bar ┆ ham │
1385
+ # # │ --- ┆ --- ┆ --- │
1386
+ # # │ i64 ┆ i64 ┆ str │
1387
+ # # ╞═════╪═════╪═════╡
1388
+ # # │ 1 ┆ 6 ┆ a │
1389
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1390
+ # # │ 2 ┆ 7 ┆ b │
1391
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1392
+ # # │ 3 ┆ 8 ┆ c │
1393
+ # # └─────┴─────┴─────┘
330
1394
  def head(n = 5)
331
1395
  _from_rbdf(_df.head(n))
332
1396
  end
333
1397
 
1398
+ # Get the last `n` rows.
1399
+ #
1400
+ # @param n [Integer]
1401
+ # Number of rows to return.
1402
+ #
1403
+ # @return [DataFrame]
1404
+ #
1405
+ # @example
1406
+ # df = Polars::DataFrame.new(
1407
+ # {
1408
+ # "foo" => [1, 2, 3, 4, 5],
1409
+ # "bar" => [6, 7, 8, 9, 10],
1410
+ # "ham" => ["a", "b", "c", "d", "e"]
1411
+ # }
1412
+ # )
1413
+ # df.tail(3)
1414
+ # # =>
1415
+ # # shape: (3, 3)
1416
+ # # ┌─────┬─────┬─────┐
1417
+ # # │ foo ┆ bar ┆ ham │
1418
+ # # │ --- ┆ --- ┆ --- │
1419
+ # # │ i64 ┆ i64 ┆ str │
1420
+ # # ╞═════╪═════╪═════╡
1421
+ # # │ 3 ┆ 8 ┆ c │
1422
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1423
+ # # │ 4 ┆ 9 ┆ d │
1424
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1425
+ # # │ 5 ┆ 10 ┆ e │
1426
+ # # └─────┴─────┴─────┘
334
1427
  def tail(n = 5)
335
1428
  _from_rbdf(_df.tail(n))
336
1429
  end
337
1430
 
338
- # def drop_nulls
339
- # end
1431
+ # Return a new DataFrame where the null values are dropped.
1432
+ #
1433
+ # @param subset [Object]
1434
+ # Subset of column(s) on which `drop_nulls` will be applied.
1435
+ #
1436
+ # @return [DataFrame]
1437
+ #
1438
+ # @example
1439
+ # df = Polars::DataFrame.new(
1440
+ # {
1441
+ # "foo" => [1, 2, 3],
1442
+ # "bar" => [6, nil, 8],
1443
+ # "ham" => ["a", "b", "c"]
1444
+ # }
1445
+ # )
1446
+ # df.drop_nulls
1447
+ # # =>
1448
+ # # shape: (2, 3)
1449
+ # # ┌─────┬─────┬─────┐
1450
+ # # │ foo ┆ bar ┆ ham │
1451
+ # # │ --- ┆ --- ┆ --- │
1452
+ # # │ i64 ┆ i64 ┆ str │
1453
+ # # ╞═════╪═════╪═════╡
1454
+ # # │ 1 ┆ 6 ┆ a │
1455
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1456
+ # # │ 3 ┆ 8 ┆ c │
1457
+ # # └─────┴─────┴─────┘
1458
+ def drop_nulls(subset: nil)
1459
+ if subset.is_a?(String)
1460
+ subset = [subset]
1461
+ end
1462
+ _from_rbdf(_df.drop_nulls(subset))
1463
+ end
340
1464
 
341
1465
  # def pipe
342
1466
  # end
343
1467
 
344
- # def with_row_count
345
- # end
1468
+ # Add a column at index 0 that counts the rows.
1469
+ #
1470
+ # @param name [String]
1471
+ # Name of the column to add.
1472
+ # @param offset [Integer]
1473
+ # Start the row count at this offset.
1474
+ #
1475
+ # @return [DataFrame]
1476
+ #
1477
+ # @example
1478
+ # df = Polars::DataFrame.new(
1479
+ # {
1480
+ # "a" => [1, 3, 5],
1481
+ # "b" => [2, 4, 6]
1482
+ # }
1483
+ # )
1484
+ # df.with_row_count
1485
+ # # =>
1486
+ # # shape: (3, 3)
1487
+ # # ┌────────┬─────┬─────┐
1488
+ # # │ row_nr ┆ a ┆ b │
1489
+ # # │ --- ┆ --- ┆ --- │
1490
+ # # │ u32 ┆ i64 ┆ i64 │
1491
+ # # ╞════════╪═════╪═════╡
1492
+ # # │ 0 ┆ 1 ┆ 2 │
1493
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1494
+ # # │ 1 ┆ 3 ┆ 4 │
1495
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1496
+ # # │ 2 ┆ 5 ┆ 6 │
1497
+ # # └────────┴─────┴─────┘
1498
+ def with_row_count(name: "row_nr", offset: 0)
1499
+ _from_rbdf(_df.with_row_count(name, offset))
1500
+ end
346
1501
 
1502
+ # Start a groupby operation.
1503
+ #
1504
+ # @param by [Object]
1505
+ # Column(s) to group by.
1506
+ # @param maintain_order [Boolean]
1507
+ # Make sure that the order of the groups remains consistent. This is more
1508
+ # expensive than a default groupby. Note that this only works in expression
1509
+ # aggregations.
1510
+ #
1511
+ # @return [GroupBy]
1512
+ #
1513
+ # @example
1514
+ # df = Polars::DataFrame.new(
1515
+ # {
1516
+ # "a" => ["a", "b", "a", "b", "b", "c"],
1517
+ # "b" => [1, 2, 3, 4, 5, 6],
1518
+ # "c" => [6, 5, 4, 3, 2, 1]
1519
+ # }
1520
+ # )
1521
+ # df.groupby("a").agg(Polars.col("b").sum).sort("a")
1522
+ # # =>
1523
+ # # shape: (3, 2)
1524
+ # # ┌─────┬─────┐
1525
+ # # │ a ┆ b │
1526
+ # # │ --- ┆ --- │
1527
+ # # │ str ┆ i64 │
1528
+ # # ╞═════╪═════╡
1529
+ # # │ a ┆ 4 │
1530
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1531
+ # # │ b ┆ 11 │
1532
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1533
+ # # │ c ┆ 6 │
1534
+ # # └─────┴─────┘
347
1535
  def groupby(by, maintain_order: false)
348
- lazy.groupby(by, maintain_order: maintain_order)
1536
+ if !Utils.bool?(maintain_order)
1537
+ raise TypeError, "invalid input for groupby arg `maintain_order`: #{maintain_order}."
1538
+ end
1539
+ if by.is_a?(String)
1540
+ by = [by]
1541
+ end
1542
+ GroupBy.new(
1543
+ _df,
1544
+ by,
1545
+ self.class,
1546
+ maintain_order: maintain_order
1547
+ )
349
1548
  end
350
1549
 
351
1550
  # def groupby_rolling
@@ -360,6 +1559,109 @@ module Polars
360
1559
  # def join_asof
361
1560
  # end
362
1561
 
1562
+ # Join in SQL-like fashion.
1563
+ #
1564
+ # @param other [DataFrame]
1565
+ # DataFrame to join with.
1566
+ # @param left_on [Object]
1567
+ # Name(s) of the left join column(s).
1568
+ # @param right_on [Object]
1569
+ # Name(s) of the right join column(s).
1570
+ # @param on [Object]
1571
+ # Name(s) of the join columns in both DataFrames.
1572
+ # @param how ["inner", "left", "outer", "semi", "anti", "cross"]
1573
+ # Join strategy.
1574
+ # @param suffix [String]
1575
+ # Suffix to append to columns with a duplicate name.
1576
+ #
1577
+ # @return [DataFrame]
1578
+ #
1579
+ # @example
1580
+ # df = Polars::DataFrame.new(
1581
+ # {
1582
+ # "foo" => [1, 2, 3],
1583
+ # "bar" => [6.0, 7.0, 8.0],
1584
+ # "ham" => ["a", "b", "c"]
1585
+ # }
1586
+ # )
1587
+ # other_df = Polars::DataFrame.new(
1588
+ # {
1589
+ # "apple" => ["x", "y", "z"],
1590
+ # "ham" => ["a", "b", "d"]
1591
+ # }
1592
+ # )
1593
+ # df.join(other_df, on: "ham")
1594
+ # # =>
1595
+ # # shape: (2, 4)
1596
+ # # ┌─────┬─────┬─────┬───────┐
1597
+ # # │ foo ┆ bar ┆ ham ┆ apple │
1598
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1599
+ # # │ i64 ┆ f64 ┆ str ┆ str │
1600
+ # # ╞═════╪═════╪═════╪═══════╡
1601
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
1602
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1603
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
1604
+ # # └─────┴─────┴─────┴───────┘
1605
+ #
1606
+ # @example
1607
+ # df.join(other_df, on: "ham", how: "outer")
1608
+ # # =>
1609
+ # # shape: (4, 4)
1610
+ # # ┌──────┬──────┬─────┬───────┐
1611
+ # # │ foo ┆ bar ┆ ham ┆ apple │
1612
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1613
+ # # │ i64 ┆ f64 ┆ str ┆ str │
1614
+ # # ╞══════╪══════╪═════╪═══════╡
1615
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
1616
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1617
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
1618
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1619
+ # # │ null ┆ null ┆ d ┆ z │
1620
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1621
+ # # │ 3 ┆ 8.0 ┆ c ┆ null │
1622
+ # # └──────┴──────┴─────┴───────┘
1623
+ #
1624
+ # @example
1625
+ # df.join(other_df, on: "ham", how: "left")
1626
+ # # =>
1627
+ # # shape: (3, 4)
1628
+ # # ┌─────┬─────┬─────┬───────┐
1629
+ # # │ foo ┆ bar ┆ ham ┆ apple │
1630
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1631
+ # # │ i64 ┆ f64 ┆ str ┆ str │
1632
+ # # ╞═════╪═════╪═════╪═══════╡
1633
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
1634
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1635
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
1636
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1637
+ # # │ 3 ┆ 8.0 ┆ c ┆ null │
1638
+ # # └─────┴─────┴─────┴───────┘
1639
+ #
1640
+ # @example
1641
+ # df.join(other_df, on: "ham", how: "semi")
1642
+ # # =>
1643
+ # # shape: (2, 3)
1644
+ # # ┌─────┬─────┬─────┐
1645
+ # # │ foo ┆ bar ┆ ham │
1646
+ # # │ --- ┆ --- ┆ --- │
1647
+ # # │ i64 ┆ f64 ┆ str │
1648
+ # # ╞═════╪═════╪═════╡
1649
+ # # │ 1 ┆ 6.0 ┆ a │
1650
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1651
+ # # │ 2 ┆ 7.0 ┆ b │
1652
+ # # └─────┴─────┴─────┘
1653
+ #
1654
+ # @example
1655
+ # df.join(other_df, on: "ham", how: "anti")
1656
+ # # =>
1657
+ # # shape: (1, 3)
1658
+ # # ┌─────┬─────┬─────┐
1659
+ # # │ foo ┆ bar ┆ ham │
1660
+ # # │ --- ┆ --- ┆ --- │
1661
+ # # │ i64 ┆ f64 ┆ str │
1662
+ # # ╞═════╪═════╪═════╡
1663
+ # # │ 3 ┆ 8.0 ┆ c │
1664
+ # # └─────┴─────┴─────┘
363
1665
  def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
364
1666
  lazy
365
1667
  .join(
@@ -376,36 +1678,322 @@ module Polars
376
1678
  # def apply
377
1679
  # end
378
1680
 
1681
+ # Return a new DataFrame with the column added or replaced.
1682
+ #
1683
+ # @param column [Object]
1684
+ # Series, where the name of the Series refers to the column in the DataFrame.
1685
+ #
1686
+ # @return [DataFrame]
1687
+ #
1688
+ # @example Added
1689
+ # df = Polars::DataFrame.new(
1690
+ # {
1691
+ # "a" => [1, 3, 5],
1692
+ # "b" => [2, 4, 6]
1693
+ # }
1694
+ # )
1695
+ # df.with_column((Polars.col("b") ** 2).alias("b_squared"))
1696
+ # # =>
1697
+ # # shape: (3, 3)
1698
+ # # ┌─────┬─────┬───────────┐
1699
+ # # │ a ┆ b ┆ b_squared │
1700
+ # # │ --- ┆ --- ┆ --- │
1701
+ # # │ i64 ┆ i64 ┆ f64 │
1702
+ # # ╞═════╪═════╪═══════════╡
1703
+ # # │ 1 ┆ 2 ┆ 4.0 │
1704
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
1705
+ # # │ 3 ┆ 4 ┆ 16.0 │
1706
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
1707
+ # # │ 5 ┆ 6 ┆ 36.0 │
1708
+ # # └─────┴─────┴───────────┘
1709
+ #
1710
+ # @example Replaced
1711
+ # df.with_column(Polars.col("a") ** 2)
1712
+ # # =>
1713
+ # # shape: (3, 2)
1714
+ # # ┌──────┬─────┐
1715
+ # # │ a ┆ b │
1716
+ # # │ --- ┆ --- │
1717
+ # # │ f64 ┆ i64 │
1718
+ # # ╞══════╪═════╡
1719
+ # # │ 1.0 ┆ 2 │
1720
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
1721
+ # # │ 9.0 ┆ 4 │
1722
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
1723
+ # # │ 25.0 ┆ 6 │
1724
+ # # └──────┴─────┘
379
1725
  def with_column(column)
380
1726
  lazy
381
1727
  .with_column(column)
382
1728
  .collect(no_optimization: true, string_cache: false)
383
1729
  end
384
1730
 
385
- # def hstack
386
- # end
1731
+ # Return a new DataFrame grown horizontally by stacking multiple Series to it.
1732
+ #
1733
+ # @param columns [Object]
1734
+ # Series to stack.
1735
+ # @param in_place [Boolean]
1736
+ # Modify in place.
1737
+ #
1738
+ # @return [DataFrame]
1739
+ #
1740
+ # @example
1741
+ # df = Polars::DataFrame.new(
1742
+ # {
1743
+ # "foo" => [1, 2, 3],
1744
+ # "bar" => [6, 7, 8],
1745
+ # "ham" => ["a", "b", "c"]
1746
+ # }
1747
+ # )
1748
+ # x = Polars::Series.new("apple", [10, 20, 30])
1749
+ # df.hstack([x])
1750
+ # # =>
1751
+ # # shape: (3, 4)
1752
+ # # ┌─────┬─────┬─────┬───────┐
1753
+ # # │ foo ┆ bar ┆ ham ┆ apple │
1754
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1755
+ # # │ i64 ┆ i64 ┆ str ┆ i64 │
1756
+ # # ╞═════╪═════╪═════╪═══════╡
1757
+ # # │ 1 ┆ 6 ┆ a ┆ 10 │
1758
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1759
+ # # │ 2 ┆ 7 ┆ b ┆ 20 │
1760
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1761
+ # # │ 3 ┆ 8 ┆ c ┆ 30 │
1762
+ # # └─────┴─────┴─────┴───────┘
1763
+ def hstack(columns, in_place: false)
1764
+ if !columns.is_a?(Array)
1765
+ columns = columns.get_columns
1766
+ end
1767
+ if in_place
1768
+ _df.hstack_mut(columns.map(&:_s))
1769
+ self
1770
+ else
1771
+ _from_rbdf(_df.hstack(columns.map(&:_s)))
1772
+ end
1773
+ end
387
1774
 
388
- # def vstack
389
- # end
1775
+ # Grow this DataFrame vertically by stacking a DataFrame to it.
1776
+ #
1777
+ # @param df [DataFrame]
1778
+ # DataFrame to stack.
1779
+ # @param in_place [Boolean]
1780
+ # Modify in place.
1781
+ #
1782
+ # @return [DataFrame]
1783
+ #
1784
+ # @example
1785
+ # df1 = Polars::DataFrame.new(
1786
+ # {
1787
+ # "foo" => [1, 2],
1788
+ # "bar" => [6, 7],
1789
+ # "ham" => ["a", "b"]
1790
+ # }
1791
+ # )
1792
+ # df2 = Polars::DataFrame.new(
1793
+ # {
1794
+ # "foo" => [3, 4],
1795
+ # "bar" => [8, 9],
1796
+ # "ham" => ["c", "d"]
1797
+ # }
1798
+ # )
1799
+ # df1.vstack(df2)
1800
+ # # =>
1801
+ # # shape: (4, 3)
1802
+ # # ┌─────┬─────┬─────┐
1803
+ # # │ foo ┆ bar ┆ ham │
1804
+ # # │ --- ┆ --- ┆ --- │
1805
+ # # │ i64 ┆ i64 ┆ str │
1806
+ # # ╞═════╪═════╪═════╡
1807
+ # # │ 1 ┆ 6 ┆ a │
1808
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1809
+ # # │ 2 ┆ 7 ┆ b │
1810
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1811
+ # # │ 3 ┆ 8 ┆ c │
1812
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1813
+ # # │ 4 ┆ 9 ┆ d │
1814
+ # # └─────┴─────┴─────┘
1815
+ def vstack(df, in_place: false)
1816
+ if in_place
1817
+ _df.vstack_mut(df._df)
1818
+ self
1819
+ else
1820
+ _from_rbdf(_df.vstack(df._df))
1821
+ end
1822
+ end
390
1823
 
391
- # def extend
392
- # end
1824
+ # Extend the memory backed by this `DataFrame` with the values from `other`.
1825
+ #
1826
+ # Different from `vstack` which adds the chunks from `other` to the chunks of this
1827
+ # `DataFrame`, `extend` appends the data from `other` to the underlying memory
1828
+ # locations and thus may cause a reallocation.
1829
+ #
1830
+ # If this does not cause a reallocation, the resulting data structure will not
1831
+ # have any extra chunks and thus will yield faster queries.
1832
+ #
1833
+ # Prefer `extend` over `vstack` when you want to do a query after a single append.
1834
+ # For instance during online operations where you add `n` rows and rerun a query.
1835
+ #
1836
+ # Prefer `vstack` over `extend` when you want to append many times before doing a
1837
+ # query. For instance when you read in multiple files and want to store them in a
1838
+ # single `DataFrame`. In the latter case, finish the sequence of `vstack`
1839
+ # operations with a `rechunk`.
1840
+ #
1841
+ # @param other [DataFrame]
1842
+ # DataFrame to vertically add.
1843
+ #
1844
+ # @return [DataFrame]
1845
+ #
1846
+ # @example
1847
+ # df1 = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
1848
+ # df2 = Polars::DataFrame.new({"foo" => [10, 20, 30], "bar" => [40, 50, 60]})
1849
+ # df1.extend(df2)
1850
+ # # =>
1851
+ # # shape: (6, 2)
1852
+ # # ┌─────┬─────┐
1853
+ # # │ foo ┆ bar │
1854
+ # # │ --- ┆ --- │
1855
+ # # │ i64 ┆ i64 │
1856
+ # # ╞═════╪═════╡
1857
+ # # │ 1 ┆ 4 │
1858
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1859
+ # # │ 2 ┆ 5 │
1860
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1861
+ # # │ 3 ┆ 6 │
1862
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1863
+ # # │ 10 ┆ 40 │
1864
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1865
+ # # │ 20 ┆ 50 │
1866
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1867
+ # # │ 30 ┆ 60 │
1868
+ # # └─────┴─────┘
1869
+ def extend(other)
1870
+ _df.extend(other._df)
1871
+ self
1872
+ end
393
1873
 
394
- # def drop
395
- # end
1874
+ # Remove column(s) from the DataFrame and return the result as a new DataFrame.
1875
+ #
1876
+ # @param columns [Object]
1877
+ # Column(s) to drop.
1878
+ #
1879
+ # @return [DataFrame]
1880
+ #
1881
+ # @example
1882
+ # df = Polars::DataFrame.new(
1883
+ # {
1884
+ # "foo" => [1, 2, 3],
1885
+ # "bar" => [6.0, 7.0, 8.0],
1886
+ # "ham" => ["a", "b", "c"]
1887
+ # }
1888
+ # )
1889
+ # df.drop("ham")
1890
+ # # =>
1891
+ # # shape: (3, 2)
1892
+ # # ┌─────┬─────┐
1893
+ # # │ foo ┆ bar │
1894
+ # # │ --- ┆ --- │
1895
+ # # │ i64 ┆ f64 │
1896
+ # # ╞═════╪═════╡
1897
+ # # │ 1 ┆ 6.0 │
1898
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1899
+ # # │ 2 ┆ 7.0 │
1900
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1901
+ # # │ 3 ┆ 8.0 │
1902
+ # # └─────┴─────┘
1903
+ def drop(columns)
1904
+ if columns.is_a?(Array)
1905
+ df = clone
1906
+ columns.each do |n|
1907
+ df._df.drop_in_place(n)
1908
+ end
1909
+ df
1910
+ else
1911
+ _from_rbdf(_df.drop(columns))
1912
+ end
1913
+ end
396
1914
 
397
- # def drop_in_place
398
- # end
1915
+ # Drop in place.
1916
+ #
1917
+ # @param name [Object]
1918
+ # Column to drop.
1919
+ #
1920
+ # @return [Series]
1921
+ #
1922
+ # @example
1923
+ # df = Polars::DataFrame.new(
1924
+ # {
1925
+ # "foo" => [1, 2, 3],
1926
+ # "bar" => [6, 7, 8],
1927
+ # "ham" => ["a", "b", "c"]
1928
+ # }
1929
+ # )
1930
+ # df.drop_in_place("ham")
1931
+ # # =>
1932
+ # # shape: (3,)
1933
+ # # Series: 'ham' [str]
1934
+ # # [
1935
+ # # "a"
1936
+ # # "b"
1937
+ # # "c"
1938
+ # # ]
1939
+ def drop_in_place(name)
1940
+ Utils.wrap_s(_df.drop_in_place(name))
1941
+ end
399
1942
 
400
- # def cleared
401
- # end
1943
+ # Create an empty copy of the current DataFrame.
1944
+ #
1945
+ # Returns a DataFrame with identical schema but no data.
1946
+ #
1947
+ # @return [DataFrame]
1948
+ #
1949
+ # @example
1950
+ # df = Polars::DataFrame.new(
1951
+ # {
1952
+ # "a" => [nil, 2, 3, 4],
1953
+ # "b" => [0.5, nil, 2.5, 13],
1954
+ # "c" => [true, true, false, nil]
1955
+ # }
1956
+ # )
1957
+ # df.cleared
1958
+ # # =>
1959
+ # # shape: (0, 3)
1960
+ # # ┌─────┬─────┬──────┐
1961
+ # # │ a ┆ b ┆ c │
1962
+ # # │ --- ┆ --- ┆ --- │
1963
+ # # │ i64 ┆ f64 ┆ bool │
1964
+ # # ╞═════╪═════╪══════╡
1965
+ # # └─────┴─────┴──────┘
1966
+ def cleared
1967
+ height > 0 ? head(0) : clone
1968
+ end
402
1969
 
403
1970
  # clone handled by initialize_copy
404
1971
 
1972
+ # Get the DataFrame as an Array of Series.
1973
+ #
1974
+ # @return [Array]
405
1975
  def get_columns
406
1976
  _df.get_columns.map { |s| Utils.wrap_s(s) }
407
1977
  end
408
1978
 
1979
+ # Get a single column as Series by name.
1980
+ #
1981
+ # @param name [String]
1982
+ # Name of the column to retrieve.
1983
+ #
1984
+ # @return [Series]
1985
+ #
1986
+ # @example
1987
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
1988
+ # df.get_column("foo")
1989
+ # # =>
1990
+ # # shape: (3,)
1991
+ # # Series: 'foo' [i64]
1992
+ # # [
1993
+ # # 1
1994
+ # # 2
1995
+ # # 3
1996
+ # # ]
409
1997
  def get_column(name)
410
1998
  self[name]
411
1999
  end
@@ -413,12 +2001,85 @@ module Polars
413
2001
  # def fill_null
414
2002
  # end
415
2003
 
2004
+ # Fill floating point NaN values by an Expression evaluation.
2005
+ #
2006
+ # @param fill_value [Object]
2007
+ # Value to fill NaN with.
2008
+ #
2009
+ # @return [DataFrame]
2010
+ #
2011
+ # @note
2012
+ # Note that floating point NaNs (Not a Number) are not missing values!
2013
+ # To replace missing values, use `fill_null`.
2014
+ #
2015
+ # @example
2016
+ # df = Polars::DataFrame.new(
2017
+ # {
2018
+ # "a" => [1.5, 2, Float::NAN, 4],
2019
+ # "b" => [0.5, 4, Float::NAN, 13]
2020
+ # }
2021
+ # )
2022
+ # df.fill_nan(99)
2023
+ # # =>
2024
+ # # shape: (4, 2)
2025
+ # # ┌──────┬──────┐
2026
+ # # │ a ┆ b │
2027
+ # # │ --- ┆ --- │
2028
+ # # │ f64 ┆ f64 │
2029
+ # # ╞══════╪══════╡
2030
+ # # │ 1.5 ┆ 0.5 │
2031
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2032
+ # # │ 2.0 ┆ 4.0 │
2033
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2034
+ # # │ 99.0 ┆ 99.0 │
2035
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2036
+ # # │ 4.0 ┆ 13.0 │
2037
+ # # └──────┴──────┘
416
2038
  def fill_nan(fill_value)
417
2039
  lazy.fill_nan(fill_value).collect(no_optimization: true)
418
2040
  end
419
2041
 
420
- # def explode
421
- # end
2042
+ # Explode `DataFrame` to long format by exploding a column with Lists.
2043
+ #
2044
+ # @param columns [Object]
2045
+ # Column of LargeList type.
2046
+ #
2047
+ # @return [DataFrame]
2048
+ #
2049
+ # @example
2050
+ # df = Polars::DataFrame.new(
2051
+ # {
2052
+ # "letters" => ["a", "a", "b", "c"],
2053
+ # "numbers" => [[1], [2, 3], [4, 5], [6, 7, 8]]
2054
+ # }
2055
+ # )
2056
+ # df.explode("numbers")
2057
+ # # =>
2058
+ # # shape: (8, 2)
2059
+ # # ┌─────────┬─────────┐
2060
+ # # │ letters ┆ numbers │
2061
+ # # │ --- ┆ --- │
2062
+ # # │ str ┆ i64 │
2063
+ # # ╞═════════╪═════════╡
2064
+ # # │ a ┆ 1 │
2065
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2066
+ # # │ a ┆ 2 │
2067
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2068
+ # # │ a ┆ 3 │
2069
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2070
+ # # │ b ┆ 4 │
2071
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2072
+ # # │ b ┆ 5 │
2073
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2074
+ # # │ c ┆ 6 │
2075
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2076
+ # # │ c ┆ 7 │
2077
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2078
+ # # │ c ┆ 8 │
2079
+ # # └─────────┴─────────┘
2080
+ def explode(columns)
2081
+ lazy.explode(columns).collect(no_optimization: true)
2082
+ end
422
2083
 
423
2084
  # def pivot
424
2085
  # end
@@ -432,24 +2093,242 @@ module Polars
432
2093
  # def partition_by
433
2094
  # end
434
2095
 
435
- # def shift
436
- # end
2096
+ # Shift values by the given period.
2097
+ #
2098
+ # @param periods [Integer]
2099
+ # Number of places to shift (may be negative).
2100
+ #
2101
+ # @return [DataFrame]
2102
+ #
2103
+ # @example
2104
+ # df = Polars::DataFrame.new(
2105
+ # {
2106
+ # "foo" => [1, 2, 3],
2107
+ # "bar" => [6, 7, 8],
2108
+ # "ham" => ["a", "b", "c"]
2109
+ # }
2110
+ # )
2111
+ # df.shift(1)
2112
+ # # =>
2113
+ # # shape: (3, 3)
2114
+ # # ┌──────┬──────┬──────┐
2115
+ # # │ foo ┆ bar ┆ ham │
2116
+ # # │ --- ┆ --- ┆ --- │
2117
+ # # │ i64 ┆ i64 ┆ str │
2118
+ # # ╞══════╪══════╪══════╡
2119
+ # # │ null ┆ null ┆ null │
2120
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2121
+ # # │ 1 ┆ 6 ┆ a │
2122
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2123
+ # # │ 2 ┆ 7 ┆ b │
2124
+ # # └──────┴──────┴──────┘
2125
+ #
2126
+ # @example
2127
+ # df.shift(-1)
2128
+ # # =>
2129
+ # # shape: (3, 3)
2130
+ # # ┌──────┬──────┬──────┐
2131
+ # # │ foo ┆ bar ┆ ham │
2132
+ # # │ --- ┆ --- ┆ --- │
2133
+ # # │ i64 ┆ i64 ┆ str │
2134
+ # # ╞══════╪══════╪══════╡
2135
+ # # │ 2 ┆ 7 ┆ b │
2136
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2137
+ # # │ 3 ┆ 8 ┆ c │
2138
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2139
+ # # │ null ┆ null ┆ null │
2140
+ # # └──────┴──────┴──────┘
2141
+ def shift(periods)
2142
+ _from_rbdf(_df.shift(periods))
2143
+ end
437
2144
 
438
- # def shift_and_fill
439
- # end
2145
+ # Shift the values by a given period and fill the resulting null values.
2146
+ #
2147
+ # @param periods [Integer]
2148
+ # Number of places to shift (may be negative).
2149
+ # @param fill_value [Object]
2150
+ # Fill nil values with this value.
2151
+ #
2152
+ # @return [DataFrame]
2153
+ #
2154
+ # @example
2155
+ # df = Polars::DataFrame.new(
2156
+ # {
2157
+ # "foo" => [1, 2, 3],
2158
+ # "bar" => [6, 7, 8],
2159
+ # "ham" => ["a", "b", "c"]
2160
+ # }
2161
+ # )
2162
+ # df.shift_and_fill(1, 0)
2163
+ # # =>
2164
+ # # shape: (3, 3)
2165
+ # # ┌─────┬─────┬─────┐
2166
+ # # │ foo ┆ bar ┆ ham │
2167
+ # # │ --- ┆ --- ┆ --- │
2168
+ # # │ i64 ┆ i64 ┆ str │
2169
+ # # ╞═════╪═════╪═════╡
2170
+ # # │ 0 ┆ 0 ┆ 0 │
2171
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
2172
+ # # │ 1 ┆ 6 ┆ a │
2173
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
2174
+ # # │ 2 ┆ 7 ┆ b │
2175
+ # # └─────┴─────┴─────┘
2176
+ def shift_and_fill(periods, fill_value)
2177
+ lazy
2178
+ .shift_and_fill(periods, fill_value)
2179
+ .collect(no_optimization: true, string_cache: false)
2180
+ end
440
2181
 
2182
+ # Get a mask of all duplicated rows in this DataFrame.
2183
+ #
2184
+ # @return [Series]
2185
+ #
2186
+ # @example
2187
+ # df = Polars::DataFrame.new(
2188
+ # {
2189
+ # "a" => [1, 2, 3, 1],
2190
+ # "b" => ["x", "y", "z", "x"],
2191
+ # }
2192
+ # )
2193
+ # df.is_duplicated
2194
+ # # =>
2195
+ # # shape: (4,)
2196
+ # # Series: '' [bool]
2197
+ # # [
2198
+ # # true
2199
+ # # false
2200
+ # # false
2201
+ # # true
2202
+ # # ]
441
2203
  def is_duplicated
442
2204
  Utils.wrap_s(_df.is_duplicated)
443
2205
  end
444
2206
 
2207
+ # Get a mask of all unique rows in this DataFrame.
2208
+ #
2209
+ # @return [Series]
2210
+ #
2211
+ # @example
2212
+ # df = Polars::DataFrame.new(
2213
+ # {
2214
+ # "a" => [1, 2, 3, 1],
2215
+ # "b" => ["x", "y", "z", "x"]
2216
+ # }
2217
+ # )
2218
+ # df.is_unique
2219
+ # # =>
2220
+ # # shape: (4,)
2221
+ # # Series: '' [bool]
2222
+ # # [
2223
+ # # false
2224
+ # # true
2225
+ # # true
2226
+ # # false
2227
+ # # ]
445
2228
  def is_unique
446
2229
  Utils.wrap_s(_df.is_unique)
447
2230
  end
448
2231
 
2232
+ # Start a lazy query from this point.
2233
+ #
2234
+ # @return [LazyFrame]
449
2235
  def lazy
450
2236
  wrap_ldf(_df.lazy)
451
2237
  end
452
2238
 
2239
+ # Select columns from this DataFrame.
2240
+ #
2241
+ # @param exprs [Object]
2242
+ # Column or columns to select.
2243
+ #
2244
+ # @return [DataFrame]
2245
+ #
2246
+ # @example
2247
+ # df = Polars::DataFrame.new(
2248
+ # {
2249
+ # "foo" => [1, 2, 3],
2250
+ # "bar" => [6, 7, 8],
2251
+ # "ham" => ["a", "b", "c"]
2252
+ # }
2253
+ # )
2254
+ # df.select("foo")
2255
+ # # =>
2256
+ # # shape: (3, 1)
2257
+ # # ┌─────┐
2258
+ # # │ foo │
2259
+ # # │ --- │
2260
+ # # │ i64 │
2261
+ # # ╞═════╡
2262
+ # # │ 1 │
2263
+ # # ├╌╌╌╌╌┤
2264
+ # # │ 2 │
2265
+ # # ├╌╌╌╌╌┤
2266
+ # # │ 3 │
2267
+ # # └─────┘
2268
+ #
2269
+ # @example
2270
+ # df.select(["foo", "bar"])
2271
+ # # =>
2272
+ # # shape: (3, 2)
2273
+ # # ┌─────┬─────┐
2274
+ # # │ foo ┆ bar │
2275
+ # # │ --- ┆ --- │
2276
+ # # │ i64 ┆ i64 │
2277
+ # # ╞═════╪═════╡
2278
+ # # │ 1 ┆ 6 │
2279
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
2280
+ # # │ 2 ┆ 7 │
2281
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
2282
+ # # │ 3 ┆ 8 │
2283
+ # # └─────┴─────┘
2284
+ #
2285
+ # @example
2286
+ # df.select(Polars.col("foo") + 1)
2287
+ # # =>
2288
+ # # shape: (3, 1)
2289
+ # # ┌─────┐
2290
+ # # │ foo │
2291
+ # # │ --- │
2292
+ # # │ i64 │
2293
+ # # ╞═════╡
2294
+ # # │ 2 │
2295
+ # # ├╌╌╌╌╌┤
2296
+ # # │ 3 │
2297
+ # # ├╌╌╌╌╌┤
2298
+ # # │ 4 │
2299
+ # # └─────┘
2300
+ #
2301
+ # @example
2302
+ # df.select([Polars.col("foo") + 1, Polars.col("bar") + 1])
2303
+ # # =>
2304
+ # # shape: (3, 2)
2305
+ # # ┌─────┬─────┐
2306
+ # # │ foo ┆ bar │
2307
+ # # │ --- ┆ --- │
2308
+ # # │ i64 ┆ i64 │
2309
+ # # ╞═════╪═════╡
2310
+ # # │ 2 ┆ 7 │
2311
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
2312
+ # # │ 3 ┆ 8 │
2313
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
2314
+ # # │ 4 ┆ 9 │
2315
+ # # └─────┴─────┘
2316
+ #
2317
+ # @example
2318
+ # df.select(Polars.when(Polars.col("foo") > 2).then(10).otherwise(0))
2319
+ # # =>
2320
+ # # shape: (3, 1)
2321
+ # # ┌─────────┐
2322
+ # # │ literal │
2323
+ # # │ --- │
2324
+ # # │ i64 │
2325
+ # # ╞═════════╡
2326
+ # # │ 0 │
2327
+ # # ├╌╌╌╌╌╌╌╌╌┤
2328
+ # # │ 0 │
2329
+ # # ├╌╌╌╌╌╌╌╌╌┤
2330
+ # # │ 10 │
2331
+ # # └─────────┘
453
2332
  def select(exprs)
454
2333
  _from_rbdf(
455
2334
  lazy
@@ -459,6 +2338,43 @@ module Polars
459
2338
  )
460
2339
  end
461
2340
 
2341
+ # Add or overwrite multiple columns in a DataFrame.
2342
+ #
2343
+ # @param exprs [Array]
2344
+ # Array of Expressions that evaluate to columns.
2345
+ #
2346
+ # @return [DataFrame]
2347
+ #
2348
+ # @example
2349
+ # df = Polars::DataFrame.new(
2350
+ # {
2351
+ # "a" => [1, 2, 3, 4],
2352
+ # "b" => [0.5, 4, 10, 13],
2353
+ # "c" => [true, true, false, true]
2354
+ # }
2355
+ # )
2356
+ # df.with_columns(
2357
+ # [
2358
+ # (Polars.col("a") ** 2).alias("a^2"),
2359
+ # (Polars.col("b") / 2).alias("b/2"),
2360
+ # (Polars.col("c").is_not()).alias("not c")
2361
+ # ]
2362
+ # )
2363
+ # # =>
2364
+ # # shape: (4, 6)
2365
+ # # ┌─────┬──────┬───────┬──────┬──────┬───────┐
2366
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
2367
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
2368
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │
2369
+ # # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
2370
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │
2371
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2372
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │
2373
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2374
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
2375
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2376
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
2377
+ # # └─────┴──────┴───────┴──────┴──────┴───────┘
462
2378
  def with_columns(exprs)
463
2379
  if !exprs.nil? && !exprs.is_a?(Array)
464
2380
  exprs = [exprs]
@@ -468,6 +2384,26 @@ module Polars
468
2384
  .collect(no_optimization: true, string_cache: false)
469
2385
  end
470
2386
 
2387
+ # Get number of chunks used by the ChunkedArrays of this DataFrame.
2388
+ #
2389
+ # @param strategy ["first", "all"]
2390
+ # Return the number of chunks of the 'first' column,
2391
+ # or 'all' columns in this DataFrame.
2392
+ #
2393
+ # @return [Object]
2394
+ #
2395
+ # @example
2396
+ # df = Polars::DataFrame.new(
2397
+ # {
2398
+ # "a" => [1, 2, 3, 4],
2399
+ # "b" => [0.5, 4, 10, 13],
2400
+ # "c" => [true, true, false, true]
2401
+ # }
2402
+ # )
2403
+ # df.n_chunks
2404
+ # # => 1
2405
+ # df.n_chunks(strategy: "all")
2406
+ # # => [1, 1, 1]
471
2407
  def n_chunks(strategy: "first")
472
2408
  if strategy == "first"
473
2409
  _df.n_chunks
@@ -478,6 +2414,28 @@ module Polars
478
2414
  end
479
2415
  end
480
2416
 
2417
+ # Aggregate the columns of this DataFrame to their maximum value.
2418
+ #
2419
+ # @return [DataFrame]
2420
+ #
2421
+ # @example
2422
+ # df = Polars::DataFrame.new(
2423
+ # {
2424
+ # "foo" => [1, 2, 3],
2425
+ # "bar" => [6, 7, 8],
2426
+ # "ham" => ["a", "b", "c"]
2427
+ # }
2428
+ # )
2429
+ # df.max
2430
+ # # =>
2431
+ # # shape: (1, 3)
2432
+ # # ┌─────┬─────┬─────┐
2433
+ # # │ foo ┆ bar ┆ ham │
2434
+ # # │ --- ┆ --- ┆ --- │
2435
+ # # │ i64 ┆ i64 ┆ str │
2436
+ # # ╞═════╪═════╪═════╡
2437
+ # # │ 3 ┆ 8 ┆ c │
2438
+ # # └─────┴─────┴─────┘
481
2439
  def max(axis: 0)
482
2440
  if axis == 0
483
2441
  _from_rbdf(_df.max)
@@ -488,6 +2446,28 @@ module Polars
488
2446
  end
489
2447
  end
490
2448
 
2449
+ # Aggregate the columns of this DataFrame to their minimum value.
2450
+ #
2451
+ # @return [DataFrame]
2452
+ #
2453
+ # @example
2454
+ # df = Polars::DataFrame.new(
2455
+ # {
2456
+ # "foo" => [1, 2, 3],
2457
+ # "bar" => [6, 7, 8],
2458
+ # "ham" => ["a", "b", "c"]
2459
+ # }
2460
+ # )
2461
+ # df.min
2462
+ # # =>
2463
+ # # shape: (1, 3)
2464
+ # # ┌─────┬─────┬─────┐
2465
+ # # │ foo ┆ bar ┆ ham │
2466
+ # # │ --- ┆ --- ┆ --- │
2467
+ # # │ i64 ┆ i64 ┆ str │
2468
+ # # ╞═════╪═════╪═════╡
2469
+ # # │ 1 ┆ 6 ┆ a │
2470
+ # # └─────┴─────┴─────┘
491
2471
  def min(axis: 0)
492
2472
  if axis == 0
493
2473
  _from_rbdf(_df.min)
@@ -498,6 +2478,44 @@ module Polars
498
2478
  end
499
2479
  end
500
2480
 
2481
+ # Aggregate the columns of this DataFrame to their sum value.
2482
+ #
2483
+ # @param axis [Integer]
2484
+ # Either 0 or 1.
2485
+ # @param null_strategy ["ignore", "propagate"]
2486
+ # This argument is only used if axis == 1.
2487
+ #
2488
+ # @return [DataFrame]
2489
+ #
2490
+ # @example
2491
+ # df = Polars::DataFrame.new(
2492
+ # {
2493
+ # "foo" => [1, 2, 3],
2494
+ # "bar" => [6, 7, 8],
2495
+ # "ham" => ["a", "b", "c"],
2496
+ # }
2497
+ # )
2498
+ # df.sum
2499
+ # # =>
2500
+ # # shape: (1, 3)
2501
+ # # ┌─────┬─────┬──────┐
2502
+ # # │ foo ┆ bar ┆ ham │
2503
+ # # │ --- ┆ --- ┆ --- │
2504
+ # # │ i64 ┆ i64 ┆ str │
2505
+ # # ╞═════╪═════╪══════╡
2506
+ # # │ 6 ┆ 21 ┆ null │
2507
+ # # └─────┴─────┴──────┘
2508
+ #
2509
+ # @example
2510
+ # df.sum(axis: 1)
2511
+ # # =>
2512
+ # # shape: (3,)
2513
+ # # Series: 'foo' [str]
2514
+ # # [
2515
+ # # "16a"
2516
+ # # "27b"
2517
+ # # "38c"
2518
+ # # ]
501
2519
  def sum(axis: 0, null_strategy: "ignore")
502
2520
  case axis
503
2521
  when 0
@@ -509,6 +2527,33 @@ module Polars
509
2527
  end
510
2528
  end
511
2529
 
2530
+ # Aggregate the columns of this DataFrame to their mean value.
2531
+ #
2532
+ # @param axis [Integer]
2533
+ # Either 0 or 1.
2534
+ # @param null_strategy ["ignore", "propagate"]
2535
+ # This argument is only used if axis == 1.
2536
+ #
2537
+ # @return [DataFrame]
2538
+ #
2539
+ # @example
2540
+ # df = Polars::DataFrame.new(
2541
+ # {
2542
+ # "foo" => [1, 2, 3],
2543
+ # "bar" => [6, 7, 8],
2544
+ # "ham" => ["a", "b", "c"]
2545
+ # }
2546
+ # )
2547
+ # df.mean
2548
+ # # =>
2549
+ # # shape: (1, 3)
2550
+ # # ┌─────┬─────┬──────┐
2551
+ # # │ foo ┆ bar ┆ ham │
2552
+ # # │ --- ┆ --- ┆ --- │
2553
+ # # │ f64 ┆ f64 ┆ str │
2554
+ # # ╞═════╪═════╪══════╡
2555
+ # # │ 2.0 ┆ 7.0 ┆ null │
2556
+ # # └─────┴─────┴──────┘
512
2557
  def mean(axis: 0, null_strategy: "ignore")
513
2558
  case axis
514
2559
  when 0
@@ -520,75 +2565,633 @@ module Polars
520
2565
  end
521
2566
  end
522
2567
 
2568
+ # Aggregate the columns of this DataFrame to their standard deviation value.
2569
+ #
2570
+ # @param ddof [Integer]
2571
+ # Degrees of freedom
2572
+ #
2573
+ # @return [DataFrame]
2574
+ #
2575
+ # @example
2576
+ # df = Polars::DataFrame.new(
2577
+ # {
2578
+ # "foo" => [1, 2, 3],
2579
+ # "bar" => [6, 7, 8],
2580
+ # "ham" => ["a", "b", "c"]
2581
+ # }
2582
+ # )
2583
+ # df.std
2584
+ # # =>
2585
+ # # shape: (1, 3)
2586
+ # # ┌─────┬─────┬──────┐
2587
+ # # │ foo ┆ bar ┆ ham │
2588
+ # # │ --- ┆ --- ┆ --- │
2589
+ # # │ f64 ┆ f64 ┆ str │
2590
+ # # ╞═════╪═════╪══════╡
2591
+ # # │ 1.0 ┆ 1.0 ┆ null │
2592
+ # # └─────┴─────┴──────┘
2593
+ #
2594
+ # @example
2595
+ # df.std(ddof: 0)
2596
+ # # =>
2597
+ # # shape: (1, 3)
2598
+ # # ┌──────────┬──────────┬──────┐
2599
+ # # │ foo ┆ bar ┆ ham │
2600
+ # # │ --- ┆ --- ┆ --- │
2601
+ # # │ f64 ┆ f64 ┆ str │
2602
+ # # ╞══════════╪══════════╪══════╡
2603
+ # # │ 0.816497 ┆ 0.816497 ┆ null │
2604
+ # # └──────────┴──────────┴──────┘
523
2605
  def std(ddof: 1)
524
2606
  _from_rbdf(_df.std(ddof))
525
2607
  end
526
2608
 
2609
+ # Aggregate the columns of this DataFrame to their variance value.
2610
+ #
2611
+ # @param ddof [Integer]
2612
+ # Degrees of freedom
2613
+ #
2614
+ # @return [DataFrame]
2615
+ #
2616
+ # @example
2617
+ # df = Polars::DataFrame.new(
2618
+ # {
2619
+ # "foo" => [1, 2, 3],
2620
+ # "bar" => [6, 7, 8],
2621
+ # "ham" => ["a", "b", "c"]
2622
+ # }
2623
+ # )
2624
+ # df.var
2625
+ # # =>
2626
+ # # shape: (1, 3)
2627
+ # # ┌─────┬─────┬──────┐
2628
+ # # │ foo ┆ bar ┆ ham │
2629
+ # # │ --- ┆ --- ┆ --- │
2630
+ # # │ f64 ┆ f64 ┆ str │
2631
+ # # ╞═════╪═════╪══════╡
2632
+ # # │ 1.0 ┆ 1.0 ┆ null │
2633
+ # # └─────┴─────┴──────┘
2634
+ #
2635
+ # @example
2636
+ # df.var(ddof: 0)
2637
+ # # =>
2638
+ # # shape: (1, 3)
2639
+ # # ┌──────────┬──────────┬──────┐
2640
+ # # │ foo ┆ bar ┆ ham │
2641
+ # # │ --- ┆ --- ┆ --- │
2642
+ # # │ f64 ┆ f64 ┆ str │
2643
+ # # ╞══════════╪══════════╪══════╡
2644
+ # # │ 0.666667 ┆ 0.666667 ┆ null │
2645
+ # # └──────────┴──────────┴──────┘
527
2646
  def var(ddof: 1)
528
2647
  _from_rbdf(_df.var(ddof))
529
2648
  end
530
2649
 
2650
+ # Aggregate the columns of this DataFrame to their median value.
2651
+ #
2652
+ # @return [DataFrame]
2653
+ #
2654
+ # @example
2655
+ # df = Polars::DataFrame.new(
2656
+ # {
2657
+ # "foo" => [1, 2, 3],
2658
+ # "bar" => [6, 7, 8],
2659
+ # "ham" => ["a", "b", "c"]
2660
+ # }
2661
+ # )
2662
+ # df.median
2663
+ # # =>
2664
+ # # shape: (1, 3)
2665
+ # # ┌─────┬─────┬──────┐
2666
+ # # │ foo ┆ bar ┆ ham │
2667
+ # # │ --- ┆ --- ┆ --- │
2668
+ # # │ f64 ┆ f64 ┆ str │
2669
+ # # ╞═════╪═════╪══════╡
2670
+ # # │ 2.0 ┆ 7.0 ┆ null │
2671
+ # # └─────┴─────┴──────┘
531
2672
  def median
532
2673
  _from_rbdf(_df.median)
533
2674
  end
534
2675
 
535
- # def product
536
- # end
2676
+ # Aggregate the columns of this DataFrame to their product values.
2677
+ #
2678
+ # @return [DataFrame]
2679
+ #
2680
+ # @example
2681
+ # df = Polars::DataFrame.new(
2682
+ # {
2683
+ # "a" => [1, 2, 3],
2684
+ # "b" => [0.5, 4, 10],
2685
+ # "c" => [true, true, false]
2686
+ # }
2687
+ # )
2688
+ # df.product
2689
+ # # =>
2690
+ # # shape: (1, 3)
2691
+ # # ┌─────┬──────┬─────┐
2692
+ # # │ a ┆ b ┆ c │
2693
+ # # │ --- ┆ --- ┆ --- │
2694
+ # # │ i64 ┆ f64 ┆ i64 │
2695
+ # # ╞═════╪══════╪═════╡
2696
+ # # │ 6 ┆ 20.0 ┆ 0 │
2697
+ # # └─────┴──────┴─────┘
2698
+ def product
2699
+ select(Polars.all.product)
2700
+ end
537
2701
 
538
- # def quantile(quantile, interpolation: "nearest")
539
- # end
2702
+ # Aggregate the columns of this DataFrame to their quantile value.
2703
+ #
2704
+ # @param quantile [Float]
2705
+ # Quantile between 0.0 and 1.0.
2706
+ # @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
2707
+ # Interpolation method.
2708
+ #
2709
+ # @return [DataFrame]
2710
+ #
2711
+ # @example
2712
+ # df = Polars::DataFrame.new(
2713
+ # {
2714
+ # "foo" => [1, 2, 3],
2715
+ # "bar" => [6, 7, 8],
2716
+ # "ham" => ["a", "b", "c"]
2717
+ # }
2718
+ # )
2719
+ # df.quantile(0.5, interpolation: "nearest")
2720
+ # # =>
2721
+ # # shape: (1, 3)
2722
+ # # ┌─────┬─────┬──────┐
2723
+ # # │ foo ┆ bar ┆ ham │
2724
+ # # │ --- ┆ --- ┆ --- │
2725
+ # # │ f64 ┆ f64 ┆ str │
2726
+ # # ╞═════╪═════╪══════╡
2727
+ # # │ 2.0 ┆ 7.0 ┆ null │
2728
+ # # └─────┴─────┴──────┘
2729
+ def quantile(quantile, interpolation: "nearest")
2730
+ _from_rbdf(_df.quantile(quantile, interpolation))
2731
+ end
540
2732
 
541
- # def to_dummies
542
- # end
2733
+ # Get one hot encoded dummy variables.
2734
+ #
2735
+ # @param columns
2736
+ # A subset of columns to convert to dummy variables. `nil` means
2737
+ # "all columns".
2738
+ #
2739
+ # @return [DataFrame]
2740
+ #
2741
+ # @example
2742
+ # df = Polars::DataFrame.new(
2743
+ # {
2744
+ # "foo" => [1, 2],
2745
+ # "bar" => [3, 4],
2746
+ # "ham" => ["a", "b"]
2747
+ # }
2748
+ # )
2749
+ # df.to_dummies
2750
+ # # =>
2751
+ # # shape: (2, 6)
2752
+ # # ┌───────┬───────┬───────┬───────┬───────┬───────┐
2753
+ # # │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │
2754
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
2755
+ # # │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │
2756
+ # # ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡
2757
+ # # │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │
2758
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2759
+ # # │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │
2760
+ # # └───────┴───────┴───────┴───────┴───────┴───────┘
2761
def to_dummies(columns: nil)
  # A lone column name is treated as a one-element list; anything else
  # (including nil) is handed to the native frame unchanged.
  selected = columns.is_a?(String) ? [columns] : columns
  _from_rbdf(_df.to_dummies(selected))
end
543
2767
 
544
- # def unique
545
- # end
2768
+ # Drop duplicate rows from this DataFrame.
2769
+ #
2770
+ # @param maintain_order [Boolean]
2771
+ # Keep the same order as the original DataFrame. This requires more work to
2772
+ # compute.
2773
+ # @param subset [Object]
2774
+ # Subset to use to compare rows.
2775
+ # @param keep ["first", "last"]
2776
+ # Which of the duplicate rows to keep (in conjunction with `subset`).
2777
+ #
2778
+ # @return [DataFrame]
2779
+ #
2780
+ # @note
2781
+ # Note that this fails if there is a column of type `List` in the DataFrame or
2782
+ # subset.
2783
+ #
2784
+ # @example
2785
+ # df = Polars::DataFrame.new(
2786
+ # {
2787
+ # "a" => [1, 1, 2, 3, 4, 5],
2788
+ # "b" => [0.5, 0.5, 1.0, 2.0, 3.0, 3.0],
2789
+ # "c" => [true, true, true, false, true, true]
2790
+ # }
2791
+ # )
2792
+ # df.unique
2793
+ # # =>
2794
+ # # shape: (5, 3)
2795
+ # # ┌─────┬─────┬───────┐
2796
+ # # │ a ┆ b ┆ c │
2797
+ # # │ --- ┆ --- ┆ --- │
2798
+ # # │ i64 ┆ f64 ┆ bool │
2799
+ # # ╞═════╪═════╪═══════╡
2800
+ # # │ 1 ┆ 0.5 ┆ true │
2801
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2802
+ # # │ 2 ┆ 1.0 ┆ true │
2803
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2804
+ # # │ 3 ┆ 2.0 ┆ false │
2805
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2806
+ # # │ 4 ┆ 3.0 ┆ true │
2807
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2808
+ # # │ 5 ┆ 3.0 ┆ true │
2809
+ # # └─────┴─────┴───────┘
2810
def unique(maintain_order: true, subset: nil, keep: "first")
  # Normalize `subset` to either nil or an Array before passing it on:
  # a single name is wrapped, an Array is used as-is, and any other
  # non-nil value is converted with `to_a`.
  key_columns =
    case subset
    when nil then nil
    when String then [subset]
    when Array then subset
    else subset.to_a
    end

  _from_rbdf(_df.unique(maintain_order, key_columns, keep))
end
2821
+
2822
+ # Return the number of unique rows, or the number of unique row-subsets.
2823
+ #
2824
+ # @param subset [Object]
2825
+ # One or more columns/expressions that define what to count;
2826
+ # omit to return the count of unique rows.
2827
+ #
2828
+ # @return [Integer]
2829
+ #
2830
+ # @example
2831
+ # df = Polars::DataFrame.new(
2832
+ # {
2833
+ # "a" => [1, 1, 2, 3, 4, 5],
2834
+ # "b" => [0.5, 0.5, 1.0, 2.0, 3.0, 3.0],
2835
+ # "c" => [true, true, true, false, true, true]
2836
+ # }
2837
+ # )
2838
+ # df.n_unique
2839
+ # # => 5
2840
+ #
2841
+ # @example Simple columns subset
2842
+ # df.n_unique(subset: ["b", "c"])
2843
+ # # => 4
2844
+ #
2845
+ # @example Expression subset
2846
+ # df.n_unique(
2847
+ # subset: [
2848
+ # (Polars.col("a").floordiv(2)),
2849
+ # (Polars.col("c") | (Polars.col("b") >= 2))
2850
+ # ]
2851
+ # )
2852
+ # # => 3
2853
def n_unique(subset: nil)
  # Normalize a single column name or a single expression into a
  # one-element list. The name check must be against String: the previous
  # StringIO check never matched a plain column name, so the name was passed
  # on unwrapped instead of being turned into a column expression.
  if subset.is_a?(String)
    subset = [Polars.col(subset)]
  elsif subset.is_a?(Expr)
    subset = [subset]
  end

  if subset.is_a?(Array) && subset.length == 1
    # A single key is counted directly.
    expr = Utils.expr_to_lit_or_expr(subset[0], str_to_lit: false)
  else
    # Several keys (or nil, meaning all columns) are combined into one
    # struct expression so that distinct combinations are counted.
    struct_fields = subset.nil? ? Polars.all : subset
    expr = Polars.struct(struct_fields)
  end

  df = lazy.select(expr.n_unique).collect
  # An empty result frame means there is nothing to count.
  df.is_empty ? 0 : df.row(0)[0]
end
549
2870
 
2871
+ # Rechunk the data in this DataFrame to a contiguous allocation.
2872
+ #
2873
+ # This will make sure all subsequent operations have optimal and predictable
2874
+ # performance.
2875
+ #
2876
+ # @return [DataFrame]
550
2877
  def rechunk
551
2878
  _from_rbdf(_df.rechunk)
552
2879
  end
553
2880
 
2881
+ # Create a new DataFrame that shows the null counts per column.
2882
+ #
2883
+ # @return [DataFrame]
2884
+ #
2885
+ # @example
2886
+ # df = Polars::DataFrame.new(
2887
+ # {
2888
+ # "foo" => [1, nil, 3],
2889
+ # "bar" => [6, 7, nil],
2890
+ # "ham" => ["a", "b", "c"]
2891
+ # }
2892
+ # )
2893
+ # df.null_count
2894
+ # # =>
2895
+ # # shape: (1, 3)
2896
+ # # ┌─────┬─────┬─────┐
2897
+ # # │ foo ┆ bar ┆ ham │
2898
+ # # │ --- ┆ --- ┆ --- │
2899
+ # # │ u32 ┆ u32 ┆ u32 │
2900
+ # # ╞═════╪═════╪═════╡
2901
+ # # │ 1 ┆ 1 ┆ 0 │
2902
+ # # └─────┴─────┴─────┘
554
2903
  def null_count
555
2904
  _from_rbdf(_df.null_count)
556
2905
  end
557
2906
 
558
- # def sample
559
- # end
2907
+ # Sample from this DataFrame.
2908
+ #
2909
+ # @param n [Integer]
2910
+ # Number of items to return. Cannot be used with `frac`. Defaults to 1 if
2911
+ # `frac` is nil.
2912
+ # @param frac [Float]
2913
+ # Fraction of items to return. Cannot be used with `n`.
2914
+ # @param with_replacement [Boolean]
2915
+ # Allow values to be sampled more than once.
2916
+ # @param shuffle [Boolean]
2917
+ # Shuffle the order of sampled data points.
2918
+ # @param seed [Integer]
2919
+ # Seed for the random number generator. If set to nil (default), a random
2920
+ # seed is used.
2921
+ #
2922
+ # @return [DataFrame]
2923
+ #
2924
+ # @example
2925
+ # df = Polars::DataFrame.new(
2926
+ # {
2927
+ # "foo" => [1, 2, 3],
2928
+ # "bar" => [6, 7, 8],
2929
+ # "ham" => ["a", "b", "c"]
2930
+ # }
2931
+ # )
2932
+ # df.sample(n: 2, seed: 0)
2933
+ # # =>
2934
+ # # shape: (2, 3)
2935
+ # # ┌─────┬─────┬─────┐
2936
+ # # │ foo ┆ bar ┆ ham │
2937
+ # # │ --- ┆ --- ┆ --- │
2938
+ # # │ i64 ┆ i64 ┆ str │
2939
+ # # ╞═════╪═════╪═════╡
2940
+ # # │ 3 ┆ 8 ┆ c │
2941
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
2942
+ # # │ 2 ┆ 7 ┆ b │
2943
+ # # └─────┴─────┴─────┘
2944
def sample(
  n: nil,
  frac: nil,
  with_replacement: false,
  shuffle: false,
  seed: nil
)
  if !n.nil? && !frac.nil?
    raise ArgumentError, "cannot specify both `n` and `frac`"
  end

  # Fraction-based sampling has to return here. Previously the result of
  # this branch was discarded and execution fell through to `sample_n`
  # with n = 1, so `frac` was effectively ignored.
  unless frac.nil?
    return _from_rbdf(_df.sample_frac(frac, with_replacement, shuffle, seed))
  end

  # Neither `n` nor `frac` given: sample a single row.
  n = 1 if n.nil?
  _from_rbdf(_df.sample_n(n, with_replacement, shuffle, seed))
end
560
2966
 
561
2967
  # def fold
562
2968
  # end
563
2969
 
564
- # def row
565
- # end
2970
+ # Get a row as tuple, either by index or by predicate.
2971
+ #
2972
+ # @param index [Object]
2973
+ # Row index.
2974
+ # @param by_predicate [Object]
2975
+ # Select the row according to a given expression/predicate.
2976
+ #
2977
+ # @return [Object]
2978
+ #
2979
+ # @note
2980
+ # The `index` and `by_predicate` params are mutually exclusive. Additionally,
2981
+ # to ensure clarity, the `by_predicate` parameter must be supplied by keyword.
2982
+ #
2983
+ # When using `by_predicate` it is an error condition if anything other than
2984
+ # one row is returned; more than one row raises `TooManyRowsReturned`, and
2985
+ # zero rows will raise `NoRowsReturned` (both inherit from `RowsException`).
2986
+ #
2987
+ # @example Return the row at the given index
2988
+ # df = Polars::DataFrame.new(
2989
+ # {
2990
+ # "foo" => [1, 2, 3],
2991
+ # "bar" => [6, 7, 8],
2992
+ # "ham" => ["a", "b", "c"]
2993
+ # }
2994
+ # )
2995
+ # df.row(2)
2996
+ # # => [3, 8, "c"]
2997
+ #
2998
+ # @example Return the row that matches the given predicate
2999
+ # df.row(by_predicate: Polars.col("ham") == "b")
3000
+ # # => [2, 7, "b"]
3001
def row(index = nil, by_predicate: nil)
  if !index.nil? && !by_predicate.nil?
    raise ArgumentError, "Cannot set both 'index' and 'by_predicate'; mutually exclusive"
  elsif index.is_a?(Expr)
    # An expression passed positionally is almost certainly a misplaced predicate.
    raise TypeError, "Expressions should be passed to the 'by_predicate' param"
  elsif index.is_a?(Integer)
    _df.row_tuple(index)
  elsif by_predicate.is_a?(Expr)
    rows = filter(by_predicate).rows
    n_rows = rows.length
    # Exactly one matching row is required.
    if n_rows > 1
      raise TooManyRowsReturned, "Predicate #{by_predicate} returned #{n_rows} rows"
    elsif n_rows == 0
      # Use Ruby interpolation; the message previously contained a literal
      # Python f-string placeholder and never showed the predicate.
      raise NoRowsReturned, "Predicate <#{by_predicate}> returned no rows"
    end
    rows[0]
  else
    raise ArgumentError, "One of 'index' or 'by_predicate' must be set"
  end
end
566
3021
 
567
- # def rows
568
- # end
3022
+ # Convert columnar data to rows as Ruby arrays.
3023
+ #
3024
+ # @return [Array]
3025
+ #
3026
+ # @example
3027
+ # df = Polars::DataFrame.new(
3028
+ # {
3029
+ # "a" => [1, 3, 5],
3030
+ # "b" => [2, 4, 6]
3031
+ # }
3032
+ # )
3033
+ # df.rows
3034
+ # # => [[1, 2], [3, 4], [5, 6]]
3035
+ def rows
3036
+ _df.row_tuples
3037
+ end
569
3038
 
570
- # def shrink_to_fit
571
- # end
3039
+ # Shrink DataFrame memory usage.
3040
+ #
3041
+ # Shrinks to fit the exact capacity needed to hold the data.
3042
+ #
3043
+ # @return [DataFrame]
3044
def shrink_to_fit(in_place: false)
  # Default path: shrink a clone and return it, leaving the receiver alone.
  unless in_place
    copy = clone
    copy._df.shrink_to_fit
    return copy
  end

  # In-place path: shrink this frame's own data and return the receiver.
  _df.shrink_to_fit
  self
end
572
3054
 
573
- # def take_every
574
- # end
3055
+ # Take every nth row in the DataFrame and return as a new DataFrame.
3056
+ #
3057
+ # @return [DataFrame]
3058
+ #
3059
+ # @example
3060
+ # s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]})
3061
+ # s.take_every(2)
3062
+ # # =>
3063
+ # # shape: (2, 2)
3064
+ # # ┌─────┬─────┐
3065
+ # # │ a ┆ b │
3066
+ # # │ --- ┆ --- │
3067
+ # # │ i64 ┆ i64 │
3068
+ # # ╞═════╪═════╡
3069
+ # # │ 1 ┆ 5 │
3070
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
3071
+ # # │ 3 ┆ 7 │
3072
+ # # └─────┴─────┘
3073
+ def take_every(n)
3074
+ select(Utils.col("*").take_every(n))
3075
+ end
575
3076
 
576
3077
  # def hash_rows
577
3078
  # end
578
3079
 
579
- # def interpolate
580
- # end
3080
+ # Interpolate intermediate values. The interpolation method is linear.
3081
+ #
3082
+ # @return [DataFrame]
3083
+ #
3084
+ # @example
3085
+ # df = Polars::DataFrame.new(
3086
+ # {
3087
+ # "foo" => [1, nil, 9, 10],
3088
+ # "bar" => [6, 7, 9, nil],
3089
+ # "baz" => [1, nil, nil, 9]
3090
+ # }
3091
+ # )
3092
+ # df.interpolate
3093
+ # # =>
3094
+ # # shape: (4, 3)
3095
+ # # ┌─────┬──────┬─────┐
3096
+ # # │ foo ┆ bar ┆ baz │
3097
+ # # │ --- ┆ --- ┆ --- │
3098
+ # # │ i64 ┆ i64 ┆ i64 │
3099
+ # # ╞═════╪══════╪═════╡
3100
+ # # │ 1 ┆ 6 ┆ 1 │
3101
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
3102
+ # # │ 5 ┆ 7 ┆ 3 │
3103
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
3104
+ # # │ 9 ┆ 9 ┆ 6 │
3105
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
3106
+ # # │ 10 ┆ null ┆ 9 │
3107
+ # # └─────┴──────┴─────┘
3108
+ def interpolate
3109
+ select(Utils.col("*").interpolate)
3110
+ end
581
3111
 
3112
+ # Check if the dataframe is empty.
3113
+ #
3114
+ # @return [Boolean]
3115
+ #
3116
+ # @example
3117
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
3118
+ # df.is_empty
3119
+ # # => false
3120
+ # df.filter(Polars.col("foo") > 99).is_empty
3121
+ # # => true
582
3122
  def is_empty
583
3123
  height == 0
584
3124
  end
585
3125
  alias_method :empty?, :is_empty
586
3126
 
587
- # def to_struct(name)
588
- # end
3127
+ # Convert a `DataFrame` to a `Series` of type `Struct`.
3128
+ #
3129
+ # @param name [String]
3130
+ # Name for the struct Series
3131
+ #
3132
+ # @return [Series]
3133
+ #
3134
+ # @example
3135
+ # df = Polars::DataFrame.new(
3136
+ # {
3137
+ # "a" => [1, 2, 3, 4, 5],
3138
+ # "b" => ["one", "two", "three", "four", "five"]
3139
+ # }
3140
+ # )
3141
+ # df.to_struct("nums")
3142
+ # # =>
3143
+ # # shape: (5,)
3144
+ # # Series: 'nums' [struct[2]]
3145
+ # # [
3146
+ # # {1,"one"}
3147
+ # # {2,"two"}
3148
+ # # {3,"three"}
3149
+ # # {4,"four"}
3150
+ # # {5,"five"}
3151
+ # # ]
3152
+ def to_struct(name)
3153
+ Utils.wrap_s(_df.to_struct(name))
3154
+ end
589
3155
 
590
- # def unnest
591
- # end
3156
+ # Decompose a struct into its fields.
3157
+ #
3158
+ # The fields will be inserted into the `DataFrame` on the location of the
3159
+ # `struct` type.
3160
+ #
3161
+ # @param names [Object]
3162
+ # Names of the struct columns that will be decomposed by its fields
3163
+ #
3164
+ # @return [DataFrame]
3165
+ #
3166
+ # @example
3167
+ # df = Polars::DataFrame.new(
3168
+ # {
3169
+ # "before" => ["foo", "bar"],
3170
+ # "t_a" => [1, 2],
3171
+ # "t_b" => ["a", "b"],
3172
+ # "t_c" => [true, nil],
3173
+ # "t_d" => [[1, 2], [3]],
3174
+ # "after" => ["baz", "womp"]
3175
+ # }
3176
+ # ).select(["before", Polars.struct(Polars.col("^t_.$")).alias("t_struct"), "after"])
3177
+ # df.unnest("t_struct")
3178
+ # # =>
3179
+ # # shape: (2, 6)
3180
+ # # ┌────────┬─────┬─────┬──────┬───────────┬───────┐
3181
+ # # │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │
3182
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3183
+ # # │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │
3184
+ # # ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
3185
+ # # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │
3186
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
3187
+ # # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
3188
+ # # └────────┴─────┴─────┴──────┴───────────┴───────┘
3189
def unnest(names)
  # A single struct column name is wrapped in a list; other values pass through.
  struct_columns = names.is_a?(String) ? [names] : names
  _from_rbdf(_df.unnest(struct_columns))
end
592
3195
 
593
3196
  private
594
3197
 
@@ -597,15 +3200,55 @@ module Polars
597
3200
  self._df = _df._clone
598
3201
  end
599
3202
 
600
- def hash_to_rbdf(data)
3203
def hash_to_rbdf(data, columns: nil)
  # Without a columns spec the hash is read directly.
  return RbDataFrame.read_hash(data) if columns.nil?

  columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)

  # With no data, build one empty series per requested column; otherwise
  # build one series per hash entry, using the dtype looked up by name.
  data_series =
    if data.empty? && dtypes
      columns.map { |name| Series.new(name, [], dtype: dtypes[name])._s }
    else
      data.map { |name, values| Series.new(name, values, dtype: dtypes[name])._s }
    end

  RbDataFrame.new(_handle_columns_arg(data_series, columns: columns))
end
603
3218
 
604
- def sequence_to_rbdf(data)
3219
+ def _unpack_columns(columns, lookup_names: nil)
3220
+ [columns.keys, columns]
3221
+ end
3222
+
3223
def _handle_columns_arg(data, columns: nil)
  # No column names requested: return the series untouched.
  return data if columns.nil?

  # No data: create one series per requested name with nil values.
  return columns.map { |c| Series.new(c, nil)._s } if data.empty?

  unless data.length == columns.length
    raise ArgumentError, "Dimensions of columns arg must match data dimensions."
  end

  # Rename each series positionally to the requested name.
  # not in-place?
  data.zip(columns) { |series, c| series.rename(c) }
  data
end
3240
+
3241
def sequence_to_rbdf(data, columns: nil, orient: nil)
  # Column overrides and orientation are not supported yet.
  raise Todo if columns || orient

  # Collect the underlying native series of every element.
  RbDataFrame.new(data.map { |series| series._s })
end
607
3247
 
608
- def series_to_rbdf(data)
3248
def series_to_rbdf(data, columns: nil)
  # Column overrides are not supported yet.
  raise Todo if columns

  # A single series becomes a one-column frame.
  native_series = data._s
  RbDataFrame.new([native_series])
end
611
3254
 
@@ -616,5 +3259,75 @@ module Polars
616
3259
# Instance-level shortcut that forwards to the class-level `_from_rbdf`
# of the receiver's own class.
def _from_rbdf(rb_df)
  klass = self.class
  klass._from_rbdf(rb_df)
end
3262
+
3263
# Dispatches a comparison: frame-vs-frame when `other` is a DataFrame,
# frame-vs-value for everything else.
def _comp(other, op)
  return _compare_to_other_df(other, op) if other.is_a?(DataFrame)

  _compare_to_non_df(other, op)
end
3270
+
3271
# Compares this frame column-by-column against another frame.
#
# The other frame's columns are suffixed, concatenated horizontally next to
# this frame's columns, and one comparison expression per column is selected.
#
# @param other [DataFrame] frame whose `columns` and `shape` must equal this frame's
# @param op [String] one of "eq", "neq", "gt", "lt", "gt_eq", "lt_eq"
# @return the result of `select` on the combined frame (presumably a
#   DataFrame with one comparison result column per input column)
# @raise [ArgumentError] if columns differ, shapes differ, or `op` is unknown
def _compare_to_other_df(other, op)
  if columns != other.columns
    raise ArgumentError, "DataFrame columns do not match"
  end
  if shape != other.shape
    raise ArgumentError, "DataFrame dimensions do not match"
  end

  # Suffix the right-hand columns so both sides can live in one frame.
  suffix = "__POLARS_CMP_OTHER"
  other_renamed = other.select(Polars.all.suffix(suffix))
  combined = Polars.concat([self, other_renamed], how: "horizontal")

  expr = case op
  when "eq"
    columns.map { |n| Polars.col(n) == Polars.col("#{n}#{suffix}") }
  when "neq"
    columns.map { |n| Polars.col(n) != Polars.col("#{n}#{suffix}") }
  when "gt"
    columns.map { |n| Polars.col(n) > Polars.col("#{n}#{suffix}") }
  when "lt"
    columns.map { |n| Polars.col(n) < Polars.col("#{n}#{suffix}") }
  when "gt_eq"
    columns.map { |n| Polars.col(n) >= Polars.col("#{n}#{suffix}") }
  when "lt_eq"
    columns.map { |n| Polars.col(n) <= Polars.col("#{n}#{suffix}") }
  else
    raise ArgumentError, "got unexpected comparison operator: #{op}"
  end

  combined.select(expr)
end
3302
+
3303
# Compares every column of this frame against a single non-frame value.
#
# @param other the value placed on the right-hand side of the comparison
# @param op [String] one of "eq", "neq", "gt", "lt", "gt_eq", "lt_eq"
# @raise [ArgumentError] if `op` is not one of the supported operators
def _compare_to_non_df(other, op)
  # The expression is only built once the operator is known to be valid.
  predicate =
    case op
    when "eq" then Polars.all == other
    when "neq" then Polars.all != other
    when "gt" then Polars.all > other
    when "lt" then Polars.all < other
    when "gt_eq" then Polars.all >= other
    when "lt_eq" then Polars.all <= other
    else
      raise ArgumentError, "got unexpected comparison operator: #{op}"
    end

  select(predicate)
end
3321
+
3322
# Normalizes the right-hand operand of an operation to a Series.
#
# A Series is returned as-is, an Array is rejected, and any other value is
# wrapped in a one-element Series with an empty name.
#
# @raise [ArgumentError] when `other` is an Array
def _prepare_other_arg(other)
  return other if other.is_a?(Series)
  raise ArgumentError, "Operation not supported." if other.is_a?(Array)

  Series.new("", [other])
end
619
3332
  end
620
3333
  end