polars-df 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +8 -0
  4. data/Cargo.lock +2 -1
  5. data/README.md +1 -1
  6. data/ext/polars/Cargo.toml +7 -1
  7. data/ext/polars/src/batched_csv.rs +120 -0
  8. data/ext/polars/src/conversion.rs +139 -6
  9. data/ext/polars/src/dataframe.rs +360 -15
  10. data/ext/polars/src/error.rs +9 -0
  11. data/ext/polars/src/file.rs +8 -7
  12. data/ext/polars/src/lazy/apply.rs +7 -0
  13. data/ext/polars/src/lazy/dataframe.rs +135 -3
  14. data/ext/polars/src/lazy/dsl.rs +97 -2
  15. data/ext/polars/src/lazy/meta.rs +1 -1
  16. data/ext/polars/src/lazy/mod.rs +1 -0
  17. data/ext/polars/src/lib.rs +227 -12
  18. data/ext/polars/src/series.rs +190 -38
  19. data/ext/polars/src/set.rs +91 -0
  20. data/ext/polars/src/utils.rs +19 -0
  21. data/lib/polars/batched_csv_reader.rb +96 -0
  22. data/lib/polars/cat_expr.rb +39 -0
  23. data/lib/polars/data_frame.rb +2813 -100
  24. data/lib/polars/date_time_expr.rb +1282 -7
  25. data/lib/polars/exceptions.rb +20 -0
  26. data/lib/polars/expr.rb +631 -11
  27. data/lib/polars/expr_dispatch.rb +14 -0
  28. data/lib/polars/functions.rb +219 -0
  29. data/lib/polars/group_by.rb +517 -0
  30. data/lib/polars/io.rb +763 -4
  31. data/lib/polars/lazy_frame.rb +1415 -67
  32. data/lib/polars/lazy_functions.rb +430 -9
  33. data/lib/polars/lazy_group_by.rb +79 -0
  34. data/lib/polars/list_expr.rb +5 -0
  35. data/lib/polars/meta_expr.rb +21 -0
  36. data/lib/polars/series.rb +2244 -192
  37. data/lib/polars/slice.rb +104 -0
  38. data/lib/polars/string_expr.rb +663 -2
  39. data/lib/polars/struct_expr.rb +73 -0
  40. data/lib/polars/utils.rb +76 -3
  41. data/lib/polars/version.rb +2 -1
  42. data/lib/polars/when.rb +1 -0
  43. data/lib/polars/when_then.rb +1 -0
  44. data/lib/polars.rb +8 -2
  45. metadata +12 -2
@@ -1,8 +1,22 @@
1
1
  module Polars
2
+ # Two-dimensional data structure representing data as a table with rows and columns.
2
3
  class DataFrame
4
+ # @private
3
5
  attr_accessor :_df
4
6
 
5
- def initialize(data = nil)
7
+ # Create a new DataFrame.
8
+ #
9
+ # @param data [Hash, Array, Series, nil]
10
+ # Two-dimensional data in various forms. Hash must contain Arrays.
11
+ # Array may contain Series.
12
+ # @param columns [Array, Hash, nil]
13
+ # Column labels to use for resulting DataFrame. If specified, overrides any
14
+ # labels already present in the data. Must match data dimensions.
15
+ # @param orient ["col", "row", nil]
16
+ # Whether to interpret two-dimensional data as columns or as rows. If `nil`,
17
+ # the orientation is inferred by matching the columns and data dimensions. If
18
+ # this does not yield conclusive results, column orientation is used.
19
+ def initialize(data = nil, columns: nil, orient: nil)
6
20
  if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
7
21
  result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
8
22
  data = {}
@@ -12,41 +26,204 @@ module Polars
12
26
  end
13
27
 
14
28
  if data.nil?
15
- self._df = hash_to_rbdf({})
29
+ self._df = hash_to_rbdf({}, columns: columns)
16
30
  elsif data.is_a?(Hash)
17
31
  data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
18
- self._df = hash_to_rbdf(data)
32
+ self._df = hash_to_rbdf(data, columns: columns)
19
33
  elsif data.is_a?(Array)
20
- self._df = sequence_to_rbdf(data)
34
+ self._df = sequence_to_rbdf(data, columns: columns, orient: orient)
21
35
  elsif data.is_a?(Series)
22
- self._df = series_to_rbdf(data)
36
+ self._df = series_to_rbdf(data, columns: columns)
23
37
  else
24
38
  raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
25
39
  end
26
40
  end
27
41
 
42
+ # @private
28
43
  def self._from_rbdf(rb_df)
29
44
  df = DataFrame.allocate
30
45
  df._df = rb_df
31
46
  df
32
47
  end
33
48
 
34
- def self._read_csv(file, has_header: true)
49
+ # def self._from_hashes
50
+ # end
51
+
52
+ # def self._from_hash
53
+ # end
54
+
55
+ # def self._from_records
56
+ # end
57
+
58
+ # def self._from_numo
59
+ # end
60
+
61
+ # no self._from_arrow
62
+
63
+ # no self._from_pandas
64
+
65
+ # @private
66
+ def self._read_csv(
67
+ file,
68
+ has_header: true,
69
+ columns: nil,
70
+ sep: str = ",",
71
+ comment_char: nil,
72
+ quote_char: '"',
73
+ skip_rows: 0,
74
+ dtypes: nil,
75
+ null_values: nil,
76
+ ignore_errors: false,
77
+ parse_dates: false,
78
+ n_threads: nil,
79
+ infer_schema_length: 100,
80
+ batch_size: 8192,
81
+ n_rows: nil,
82
+ encoding: "utf8",
83
+ low_memory: false,
84
+ rechunk: true,
85
+ skip_rows_after_header: 0,
86
+ row_count_name: nil,
87
+ row_count_offset: 0,
88
+ sample_size: 1024,
89
+ eol_char: "\n"
90
+ )
91
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
92
+ path = Utils.format_path(file)
93
+ else
94
+ path = nil
95
+ # if defined?(StringIO) && file.is_a?(StringIO)
96
+ # file = file.string
97
+ # end
98
+ end
99
+
100
+ dtype_list = nil
101
+ dtype_slice = nil
102
+ if !dtypes.nil?
103
+ if dtypes.is_a?(Hash)
104
+ dtype_list = []
105
+ dtypes.each do|k, v|
106
+ dtype_list << [k, Utils.rb_type_to_dtype(v)]
107
+ end
108
+ elsif dtypes.is_a?(Array)
109
+ dtype_slice = dtypes
110
+ else
111
+ raise ArgumentError, "dtype arg should be list or dict"
112
+ end
113
+ end
114
+
115
+ processed_null_values = Utils._process_null_values(null_values)
116
+
117
+ if columns.is_a?(String)
118
+ columns = [columns]
119
+ end
120
+ if file.is_a?(String) && file.include?("*")
121
+ raise Todo
122
+ end
123
+
124
+ projection, columns = Utils.handle_projection_columns(columns)
125
+
126
+ _from_rbdf(
127
+ RbDataFrame.read_csv(
128
+ file,
129
+ infer_schema_length,
130
+ batch_size,
131
+ has_header,
132
+ ignore_errors,
133
+ n_rows,
134
+ skip_rows,
135
+ projection,
136
+ sep,
137
+ rechunk,
138
+ columns,
139
+ encoding,
140
+ n_threads,
141
+ path,
142
+ dtype_list,
143
+ dtype_slice,
144
+ low_memory,
145
+ comment_char,
146
+ quote_char,
147
+ processed_null_values,
148
+ parse_dates,
149
+ skip_rows_after_header,
150
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
151
+ sample_size,
152
+ eol_char
153
+ )
154
+ )
155
+ end
156
+
157
+ # @private
158
+ def self._read_parquet(
159
+ file,
160
+ columns: nil,
161
+ n_rows: nil,
162
+ parallel: "auto",
163
+ row_count_name: nil,
164
+ row_count_offset: 0,
165
+ low_memory: false
166
+ )
35
167
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
36
168
  file = Utils.format_path(file)
37
169
  end
38
170
 
39
- _from_rbdf(RbDataFrame.read_csv(file, has_header))
171
+ if file.is_a?(String) && file.include?("*")
172
+ raise Todo
173
+ end
174
+
175
+ projection, columns = Utils.handle_projection_columns(columns)
176
+ _from_rbdf(
177
+ RbDataFrame.read_parquet(
178
+ file,
179
+ columns,
180
+ projection,
181
+ n_rows,
182
+ parallel,
183
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
184
+ low_memory
185
+ )
186
+ )
40
187
  end
41
188
 
42
- def self._read_parquet(file)
189
+ # def self._read_avro
190
+ # end
191
+
192
+ # @private
193
+ def self._read_ipc(
194
+ file,
195
+ columns: nil,
196
+ n_rows: nil,
197
+ row_count_name: nil,
198
+ row_count_offset: 0,
199
+ rechunk: true,
200
+ memory_map: true
201
+ )
43
202
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
44
203
  file = Utils.format_path(file)
45
204
  end
205
+ if columns.is_a?(String)
206
+ columns = [columns]
207
+ end
208
+
209
+ if file.is_a?(String) && file.include?("*")
210
+ raise Todo
211
+ end
46
212
 
47
- _from_rbdf(RbDataFrame.read_parquet(file))
213
+ projection, columns = Utils.handle_projection_columns(columns)
214
+ _from_rbdf(
215
+ RbDataFrame.read_ipc(
216
+ file,
217
+ columns,
218
+ projection,
219
+ n_rows,
220
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
221
+ memory_map
222
+ )
223
+ )
48
224
  end
49
225
 
226
+ # @private
50
227
  def self._read_json(file)
51
228
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
52
229
  file = Utils.format_path(file)
@@ -55,6 +232,7 @@ module Polars
55
232
  _from_rbdf(RbDataFrame.read_json(file))
56
233
  end
57
234
 
235
+ # @private
58
236
  def self._read_ndjson(file)
59
237
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
60
238
  file = Utils.format_path(file)
@@ -63,83 +241,339 @@ module Polars
63
241
  _from_rbdf(RbDataFrame.read_ndjson(file))
64
242
  end
65
243
 
244
+ # Get the shape of the DataFrame.
245
+ #
246
+ # @return [Array]
247
+ #
248
+ # @example
249
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
250
+ # df.shape
251
+ # # => [5, 1]
66
252
  def shape
67
253
  _df.shape
68
254
  end
69
255
 
256
+ # Get the height of the DataFrame.
257
+ #
258
+ # @return [Integer]
259
+ #
260
+ # @example
261
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
262
+ # df.height
263
+ # # => 5
70
264
  def height
71
265
  _df.height
72
266
  end
73
267
 
268
+ # Get the width of the DataFrame.
269
+ #
270
+ # @return [Integer]
271
+ #
272
+ # @example
273
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
274
+ # df.width
275
+ # # => 1
74
276
  def width
75
277
  _df.width
76
278
  end
77
279
 
280
+ # Get column names.
281
+ #
282
+ # @return [Array]
283
+ #
284
+ # @example
285
+ # df = Polars::DataFrame.new(
286
+ # {
287
+ # "foo" => [1, 2, 3],
288
+ # "bar" => [6, 7, 8],
289
+ # "ham" => ["a", "b", "c"]
290
+ # }
291
+ # )
292
+ # df.columns
293
+ # # => ["foo", "bar", "ham"]
78
294
  def columns
79
295
  _df.columns
80
296
  end
81
297
 
298
+ # Change the column names of the DataFrame.
299
+ #
300
+ # @param columns [Array]
301
+ # A list with new names for the DataFrame.
302
+ # The length of the list should be equal to the width of the DataFrame.
303
+ #
304
+ # @return [Object]
305
+ #
306
+ # @example
307
+ # df = Polars::DataFrame.new(
308
+ # {
309
+ # "foo" => [1, 2, 3],
310
+ # "bar" => [6, 7, 8],
311
+ # "ham" => ["a", "b", "c"]
312
+ # }
313
+ # )
314
+ # df.columns = ["apple", "banana", "orange"]
315
+ # df
316
+ # # =>
317
+ # # shape: (3, 3)
318
+ # # ┌───────┬────────┬────────┐
319
+ # # │ apple ┆ banana ┆ orange │
320
+ # # │ --- ┆ --- ┆ --- │
321
+ # # │ i64 ┆ i64 ┆ str │
322
+ # # ╞═══════╪════════╪════════╡
323
+ # # │ 1 ┆ 6 ┆ a │
324
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
325
+ # # │ 2 ┆ 7 ┆ b │
326
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
327
+ # # │ 3 ┆ 8 ┆ c │
328
+ # # └───────┴────────┴────────┘
82
329
  def columns=(columns)
83
330
  _df.set_column_names(columns)
84
331
  end
85
332
 
333
+ # Get dtypes of columns in DataFrame. Dtypes can also be found in column headers when printing the DataFrame.
334
+ #
335
+ # @return [Array]
336
+ #
337
+ # @example
338
+ # df = Polars::DataFrame.new(
339
+ # {
340
+ # "foo" => [1, 2, 3],
341
+ # "bar" => [6.0, 7.0, 8.0],
342
+ # "ham" => ["a", "b", "c"]
343
+ # }
344
+ # )
345
+ # df.dtypes
346
+ # # => [:i64, :f64, :str]
86
347
  def dtypes
87
- _df.dtypes.map(&:to_sym)
348
+ _df.dtypes
88
349
  end
89
350
 
351
+ # Get the schema.
352
+ #
353
+ # @return [Hash]
354
+ #
355
+ # @example
356
+ # df = Polars::DataFrame.new(
357
+ # {
358
+ # "foo" => [1, 2, 3],
359
+ # "bar" => [6.0, 7.0, 8.0],
360
+ # "ham" => ["a", "b", "c"]
361
+ # }
362
+ # )
363
+ # df.schema
364
+ # # => {"foo"=>:i64, "bar"=>:f64, "ham"=>:str}
90
365
  def schema
91
366
  columns.zip(dtypes).to_h
92
367
  end
93
368
 
94
- # def ==(other)
95
- # end
369
+ # Equal.
370
+ #
371
+ # @return [DataFrame]
372
+ def ==(other)
373
+ _comp(other, "eq")
374
+ end
96
375
 
97
- # def !=(other)
98
- # end
376
+ # Not equal.
377
+ #
378
+ # @return [DataFrame]
379
+ def !=(other)
380
+ _comp(other, "neq")
381
+ end
99
382
 
100
- # def >(other)
101
- # end
383
+ # Greater than.
384
+ #
385
+ # @return [DataFrame]
386
+ def >(other)
387
+ _comp(other, "gt")
388
+ end
102
389
 
103
- # def <(other)
104
- # end
390
+ # Less than.
391
+ #
392
+ # @return [DataFrame]
393
+ def <(other)
394
+ _comp(other, "lt")
395
+ end
105
396
 
106
- # def >=(other)
107
- # end
397
+ # Greater than or equal.
398
+ #
399
+ # @return [DataFrame]
400
+ def >=(other)
401
+ _comp(other, "gt_eq")
402
+ end
108
403
 
109
- # def <=(other)
110
- # end
404
+ # Less than or equal.
405
+ #
406
+ # @return [DataFrame]
407
+ def <=(other)
408
+ _comp(other, "lt_eq")
409
+ end
111
410
 
112
- # def *(other)
113
- # end
411
+ # Performs multiplication.
412
+ #
413
+ # @return [DataFrame]
414
+ def *(other)
415
+ if other.is_a?(DataFrame)
416
+ return _from_rbdf(_df.mul_df(other._df))
417
+ end
114
418
 
115
- # def /(other)
116
- # end
419
+ other = _prepare_other_arg(other)
420
+ _from_rbdf(_df.mul(other._s))
421
+ end
117
422
 
118
- # def +(other)
119
- # end
423
+ # Performs division.
424
+ #
425
+ # @return [DataFrame]
426
+ def /(other)
427
+ if other.is_a?(DataFrame)
428
+ return _from_rbdf(_df.div_df(other._df))
429
+ end
120
430
 
121
- # def -(other)
122
- # end
431
+ other = _prepare_other_arg(other)
432
+ _from_rbdf(_df.div(other._s))
433
+ end
123
434
 
124
- # def %(other)
125
- # end
435
+ # Performs addition.
436
+ #
437
+ # @return [DataFrame]
438
+ def +(other)
439
+ if other.is_a?(DataFrame)
440
+ return _from_rbdf(_df.add_df(other._df))
441
+ end
442
+
443
+ other = _prepare_other_arg(other)
444
+ _from_rbdf(_df.add(other._s))
445
+ end
446
+
447
+ # Performs subtraction.
448
+ #
449
+ # @return [DataFrame]
450
+ def -(other)
451
+ if other.is_a?(DataFrame)
452
+ return _from_rbdf(_df.sub_df(other._df))
453
+ end
454
+
455
+ other = _prepare_other_arg(other)
456
+ _from_rbdf(_df.sub(other._s))
457
+ end
458
+
459
+ # Returns the modulo.
460
+ #
461
+ # @return [DataFrame]
462
+ def %(other)
463
+ if other.is_a?(DataFrame)
464
+ return _from_rbdf(_df.rem_df(other._df))
465
+ end
466
+
467
+ other = _prepare_other_arg(other)
468
+ _from_rbdf(_df.rem(other._s))
469
+ end
126
470
 
471
+ # Returns a string representing the DataFrame.
472
+ #
473
+ # @return [String]
127
474
  def to_s
128
475
  _df.to_s
129
476
  end
130
477
  alias_method :inspect, :to_s
131
478
 
479
+ # Check if DataFrame includes column.
480
+ #
481
+ # @return [Boolean]
132
482
  def include?(name)
133
483
  columns.include?(name)
134
484
  end
135
485
 
136
- def [](name)
137
- Utils.wrap_s(_df.column(name))
486
+ # def each
487
+ # end
488
+
489
+ # def _pos_idx
490
+ # end
491
+
492
+ # def _pos_idxs
493
+ # end
494
+
495
+ # Returns subset of the DataFrame.
496
+ #
497
+ # @return [Object]
498
+ def [](*args)
499
+ if args.size == 2
500
+ row_selection, col_selection = args
501
+
502
+ # df[.., unknown]
503
+ if row_selection.is_a?(Range)
504
+
505
+ # multiple slices
506
+ # df[.., ..]
507
+ if col_selection.is_a?(Range)
508
+ raise Todo
509
+ end
510
+ end
511
+
512
+ # df[2, ..] (select row as df)
513
+ if row_selection.is_a?(Integer)
514
+ if col_selection.is_a?(Array)
515
+ df = self[0.., col_selection]
516
+ return df.slice(row_selection, 1)
517
+ end
518
+ # df[2, "a"]
519
+ if col_selection.is_a?(String)
520
+ return self[col_selection][row_selection]
521
+ end
522
+ end
523
+
524
+ # column selection can be "a" and ["a", "b"]
525
+ if col_selection.is_a?(String)
526
+ col_selection = [col_selection]
527
+ end
528
+
529
+ # df[.., 1]
530
+ if col_selection.is_a?(Integer)
531
+ series = to_series(col_selection)
532
+ return series[row_selection]
533
+ end
534
+
535
+ if col_selection.is_a?(Array)
536
+ # df[.., [1, 2]]
537
+ if is_int_sequence(col_selection)
538
+ series_list = col_selection.map { |i| to_series(i) }
539
+ df = self.class.new(series_list)
540
+ return df[row_selection]
541
+ end
542
+ end
543
+
544
+ df = self[col_selection]
545
+ return df[row_selection]
546
+ elsif args.size == 1
547
+ item = args[0]
548
+
549
+ # select single column
550
+ # df["foo"]
551
+ if item.is_a?(String)
552
+ return Utils.wrap_s(_df.column(item))
553
+ end
554
+
555
+ # df[idx]
556
+ if item.is_a?(Integer)
557
+ return slice(_pos_idx(item, dim: 0), 1)
558
+ end
559
+
560
+ # df[..]
561
+ if item.is_a?(Range)
562
+ return Slice.new(self).apply(item)
563
+ end
564
+ end
565
+
566
+ raise ArgumentError, "Cannot get item of type: #{item.class.name}"
138
567
  end
139
568
 
140
569
  # def []=(key, value)
141
570
  # end
142
571
 
572
+ # no to_arrow
573
+
574
+ # Convert DataFrame to a hash mapping column name to values.
575
+ #
576
+ # @return [Hash]
143
577
  def to_h(as_series: true)
144
578
  if as_series
145
579
  get_columns.to_h { |s| [s.name, s] }
@@ -148,7 +582,7 @@ module Polars
148
582
  end
149
583
  end
150
584
 
151
- # def to_hs / to_a
585
+ # def to_hashes / to_a
152
586
  # end
153
587
 
154
588
  # def to_numo
@@ -156,6 +590,30 @@ module Polars
156
590
 
157
591
  # no to_pandas
158
592
 
593
+ # Select column as Series at index location.
594
+ #
595
+ # @param index [Integer]
596
+ # Location of selection.
597
+ #
598
+ # @return [Series]
599
+ #
600
+ # @example
601
+ # df = Polars::DataFrame.new(
602
+ # {
603
+ # "foo" => [1, 2, 3],
604
+ # "bar" => [6, 7, 8],
605
+ # "ham" => ["a", "b", "c"]
606
+ # }
607
+ # )
608
+ # df.to_series(1)
609
+ # # =>
610
+ # # shape: (3,)
611
+ # # Series: 'bar' [i64]
612
+ # # [
613
+ # # 6
614
+ # # 7
615
+ # # 8
616
+ # # ]
159
617
  def to_series(index = 0)
160
618
  if index < 0
161
619
  index = columns.length + index
@@ -163,6 +621,18 @@ module Polars
163
621
  Utils.wrap_s(_df.select_at_idx(index))
164
622
  end
165
623
 
624
+ # Serialize to JSON representation.
625
+ #
626
+ # @return [nil]
627
+ #
628
+ # @param file [String]
629
+ # File path to which the result should be written.
630
+ # @param pretty [Boolean]
631
+ # Pretty serialize json.
632
+ # @param row_oriented [Boolean]
633
+ # Write to row oriented json. This is slower, but more common.
634
+ #
635
+ # @see #write_ndjson
166
636
  def write_json(
167
637
  file,
168
638
  pretty: false,
@@ -176,6 +646,12 @@ module Polars
176
646
  nil
177
647
  end
178
648
 
649
+ # Serialize to newline delimited JSON representation.
650
+ #
651
+ # @param file [String]
652
+ # File path to which the result should be written.
653
+ #
654
+ # @return [nil]
179
655
  def write_ndjson(file)
180
656
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
181
657
  file = Utils.format_path(file)
@@ -185,6 +661,50 @@ module Polars
185
661
  nil
186
662
  end
187
663
 
664
+ # Write to comma-separated values (CSV) file.
665
+ #
666
+ # @param file [String, nil]
667
+ # File path to which the result should be written. If set to `nil`
668
+ # (default), the output is returned as a string instead.
669
+ # @param has_header [Boolean]
670
+ # Whether to include header in the CSV output.
671
+ # @param sep [String]
672
+ # Separate CSV fields with this symbol.
673
+ # @param quote [String]
674
+ # Byte to use as quoting character.
675
+ # @param batch_size [Integer]
676
+ # Number of rows that will be processed per thread.
677
+ # @param datetime_format [String, nil]
678
+ # A format string, with the specifiers defined by the
679
+ # [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
680
+ # Rust crate. If no format specified, the default fractional-second
681
+ # precision is inferred from the maximum timeunit found in the frame's
682
+ # Datetime cols (if any).
683
+ # @param date_format [String, nil]
684
+ # A format string, with the specifiers defined by the
685
+ # [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
686
+ # Rust crate.
687
+ # @param time_format [String, nil]
688
+ # A format string, with the specifiers defined by the
689
+ # [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
690
+ # Rust crate.
691
+ # @param float_precision [Integer, nil]
692
+ # Number of decimal places to write, applied to both `:f32` and
693
+ # `:f64` datatypes.
694
+ # @param null_value [String, nil]
695
+ # A string representing null values (defaulting to the empty string).
696
+ #
697
+ # @return [String, nil]
698
+ #
699
+ # @example
700
+ # df = Polars::DataFrame.new(
701
+ # {
702
+ # "foo" => [1, 2, 3, 4, 5],
703
+ # "bar" => [6, 7, 8, 9, 10],
704
+ # "ham" => ["a", "b", "c", "d", "e"]
705
+ # }
706
+ # )
707
+ # df.write_csv("file.csv")
188
708
  def write_csv(
189
709
  file = nil,
190
710
  has_header: true,
@@ -220,8 +740,7 @@ module Polars
220
740
  float_precision,
221
741
  null_value
222
742
  )
223
- buffer.rewind
224
- return buffer.read.force_encoding(Encoding::UTF_8)
743
+ return buffer.string.force_encoding(Encoding::UTF_8)
225
744
  end
226
745
 
227
746
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
@@ -246,9 +765,50 @@ module Polars
246
765
  # def write_avro
247
766
  # end
248
767
 
249
- # def write_ipc
250
- # end
768
+ # Write to Arrow IPC binary stream or Feather file.
769
+ #
770
+ # @param file [String]
771
+ # File path to which the file should be written.
772
+ # @param compression ["uncompressed", "lz4", "zstd"]
773
+ # Compression method. Defaults to "uncompressed".
774
+ #
775
+ # @return [nil]
776
+ def write_ipc(file, compression: "uncompressed")
777
+ if compression.nil?
778
+ compression = "uncompressed"
779
+ end
780
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
781
+ file = Utils.format_path(file)
782
+ end
251
783
 
784
+ _df.write_ipc(file, compression)
785
+ end
786
+
787
+ # Write to Apache Parquet file.
788
+ #
789
+ # @param file [String]
790
+ # File path to which the file should be written.
791
+ # @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
792
+ # Choose "zstd" for good compression performance.
793
+ # Choose "lz4" for fast compression/decompression.
794
+ # Choose "snappy" for more backwards compatibility guarantees
795
+ # when you deal with older parquet readers.
796
+ # @param compression_level [Integer, nil]
797
+ # The level of compression to use. Higher compression means smaller files on
798
+ # disk.
799
+ #
800
+ # - "gzip" : min-level: 0, max-level: 10.
801
+ # - "brotli" : min-level: 0, max-level: 11.
802
+ # - "zstd" : min-level: 1, max-level: 22.
803
+ # @param statistics [Boolean]
804
+ # Write statistics to the parquet headers. This requires extra compute.
805
+ # @param row_group_size [Integer, nil]
806
+ # Size of the row groups in number of rows.
807
+ # If `nil` (default), the chunks of the DataFrame are
808
+ # used. Writing in smaller chunks may reduce memory pressure and improve
809
+ # writing speeds.
810
+ #
811
+ # @return [nil]
252
812
  def write_parquet(
253
813
  file,
254
814
  compression: "zstd",
@@ -268,6 +828,39 @@ module Polars
268
828
  )
269
829
  end
270
830
 
831
+ # Return an estimation of the total (heap) allocated size of the DataFrame.
832
+ #
833
+ # Estimated size is given in the specified unit (bytes by default).
834
+ #
835
+ # This estimation is the sum of the size of its buffers, validity, including
836
+ # nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the
837
+ # size of 2 arrays is not the sum of the sizes computed from this function. In
838
+ # particular, StructArray's size is an upper bound.
839
+ #
840
+ # When an array is sliced, its allocated size remains constant because the buffer
841
+ # is unchanged. However, this function will yield a smaller number. This is because
842
+ # this function returns the visible size of the buffer, not its total capacity.
843
+ #
844
+ # FFI buffers are included in this estimation.
845
+ #
846
+ # @param unit ["b", "kb", "mb", "gb", "tb"]
847
+ # Scale the returned size to the given unit.
848
+ #
849
+ # @return [Numeric]
850
+ #
851
+ # @example
852
+ # df = Polars::DataFrame.new(
853
+ # {
854
+ # "x" => 1_000_000.times.to_a.reverse,
855
+ # "y" => 1_000_000.times.map { |v| v / 1000.0 },
856
+ # "z" => 1_000_000.times.map(&:to_s)
857
+ # },
858
+ # columns: {"x" => :u32, "y" => :f64, "z" => :str}
859
+ # )
860
+ # df.estimated_size
861
+ # # => 25888898
862
+ # df.estimated_size("mb")
863
+ # # => 24.689577102661133
271
864
  def estimated_size(unit = "b")
272
865
  sz = _df.estimated_size
273
866
  Utils.scale_bytes(sz, to: unit)
@@ -276,14 +869,120 @@ module Polars
276
869
  # def transpose
277
870
  # end
278
871
 
872
+ # Reverse the DataFrame.
873
+ #
874
+ # @return [DataFrame]
875
+ #
876
+ # @example
877
+ # df = Polars::DataFrame.new(
878
+ # {
879
+ # "key" => ["a", "b", "c"],
880
+ # "val" => [1, 2, 3]
881
+ # }
882
+ # )
883
+ # df.reverse()
884
+ # # =>
885
+ # # shape: (3, 2)
886
+ # # ┌─────┬─────┐
887
+ # # │ key ┆ val │
888
+ # # │ --- ┆ --- │
889
+ # # │ str ┆ i64 │
890
+ # # ╞═════╪═════╡
891
+ # # │ c ┆ 3 │
892
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
893
+ # # │ b ┆ 2 │
894
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
895
+ # # │ a ┆ 1 │
896
+ # # └─────┴─────┘
279
897
  def reverse
280
898
  select(Polars.col("*").reverse)
281
899
  end
282
900
 
901
+ # Rename column names.
902
+ #
903
+ # @param mapping [Hash]
904
+ # Key value pairs that map from old name to new name.
905
+ #
906
+ # @return [DataFrame]
907
+ #
908
+ # @example
909
+ # df = Polars::DataFrame.new(
910
+ # {
911
+ # "foo" => [1, 2, 3],
912
+ # "bar" => [6, 7, 8],
913
+ # "ham" => ["a", "b", "c"]
914
+ # }
915
+ # )
916
+ # df.rename({"foo" => "apple"})
917
+ # # =>
918
+ # # shape: (3, 3)
919
+ # # ┌───────┬─────┬─────┐
920
+ # # │ apple ┆ bar ┆ ham │
921
+ # # │ --- ┆ --- ┆ --- │
922
+ # # │ i64 ┆ i64 ┆ str │
923
+ # # ╞═══════╪═════╪═════╡
924
+ # # │ 1 ┆ 6 ┆ a │
925
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
926
+ # # │ 2 ┆ 7 ┆ b │
927
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
928
+ # # │ 3 ┆ 8 ┆ c │
929
+ # # └───────┴─────┴─────┘
283
930
  def rename(mapping)
284
931
  lazy.rename(mapping).collect(no_optimization: true)
285
932
  end
286
933
 
934
+ # Insert a Series at a certain column index. This operation is in place.
935
+ #
936
+ # @param index [Integer]
937
+ # Column index at which to insert the new `Series`.
938
+ # @param series [Series]
939
+ # `Series` to insert.
940
+ #
941
+ # @return [DataFrame]
942
+ #
943
+ # @example
944
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
945
+ # s = Polars::Series.new("baz", [97, 98, 99])
946
+ # df.insert_at_idx(1, s)
947
+ # # =>
948
+ # # shape: (3, 3)
949
+ # # ┌─────┬─────┬─────┐
950
+ # # │ foo ┆ baz ┆ bar │
951
+ # # │ --- ┆ --- ┆ --- │
952
+ # # │ i64 ┆ i64 ┆ i64 │
953
+ # # ╞═════╪═════╪═════╡
954
+ # # │ 1 ┆ 97 ┆ 4 │
955
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
956
+ # # │ 2 ┆ 98 ┆ 5 │
957
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
958
+ # # │ 3 ┆ 99 ┆ 6 │
959
+ # # └─────┴─────┴─────┘
960
+ #
961
+ # @example
962
+ # df = Polars::DataFrame.new(
963
+ # {
964
+ # "a" => [1, 2, 3, 4],
965
+ # "b" => [0.5, 4, 10, 13],
966
+ # "c" => [true, true, false, true]
967
+ # }
968
+ # )
969
+ # s = Polars::Series.new("d", [-2.5, 15, 20.5, 0])
970
+ # df.insert_at_idx(3, s)
971
+ # # =>
972
+ # # shape: (4, 4)
973
+ # # ┌─────┬──────┬───────┬──────┐
974
+ # # │ a ┆ b ┆ c ┆ d │
975
+ # # │ --- ┆ --- ┆ --- ┆ --- │
976
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 │
977
+ # # ╞═════╪══════╪═══════╪══════╡
978
+ # # │ 1 ┆ 0.5 ┆ true ┆ -2.5 │
979
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
980
+ # # │ 2 ┆ 4.0 ┆ true ┆ 15.0 │
981
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
982
+ # # │ 3 ┆ 10.0 ┆ false ┆ 20.5 │
983
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
984
+ # # │ 4 ┆ 13.0 ┆ true ┆ 0.0 │
985
+ # # └─────┴──────┴───────┴──────┘
287
986
  def insert_at_idx(index, series)
288
987
  if index < 0
289
988
  index = columns.length + index
@@ -292,30 +991,337 @@ module Polars
292
991
  self
293
992
  end
294
993
 
994
+ # Filter the rows in the DataFrame based on a predicate expression.
995
+ #
996
+ # @param predicate [Expr]
997
+ # Expression that evaluates to a boolean Series.
998
+ #
999
+ # @return [DataFrame]
1000
+ #
1001
+ # @example Filter on one condition:
1002
+ # df = Polars::DataFrame.new(
1003
+ # {
1004
+ # "foo" => [1, 2, 3],
1005
+ # "bar" => [6, 7, 8],
1006
+ # "ham" => ["a", "b", "c"]
1007
+ # }
1008
+ # )
1009
+ # df.filter(Polars.col("foo") < 3)
1010
+ # # =>
1011
+ # # shape: (2, 3)
1012
+ # # ┌─────┬─────┬─────┐
1013
+ # # │ foo ┆ bar ┆ ham │
1014
+ # # │ --- ┆ --- ┆ --- │
1015
+ # # │ i64 ┆ i64 ┆ str │
1016
+ # # ╞═════╪═════╪═════╡
1017
+ # # │ 1 ┆ 6 ┆ a │
1018
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1019
+ # # │ 2 ┆ 7 ┆ b │
1020
+ # # └─────┴─────┴─────┘
1021
+ #
1022
+ # @example Filter on multiple conditions:
1023
+ # df.filter((Polars.col("foo") < 3) & (Polars.col("ham") == "a"))
1024
+ # # =>
1025
+ # # shape: (1, 3)
1026
+ # # ┌─────┬─────┬─────┐
1027
+ # # │ foo ┆ bar ┆ ham │
1028
+ # # │ --- ┆ --- ┆ --- │
1029
+ # # │ i64 ┆ i64 ┆ str │
1030
+ # # ╞═════╪═════╪═════╡
1031
+ # # │ 1 ┆ 6 ┆ a │
1032
+ # # └─────┴─────┴─────┘
295
1033
  def filter(predicate)
296
1034
  lazy.filter(predicate).collect
297
1035
  end
298
1036
 
299
- # def describe
300
- # end
1037
+ # Summary statistics for a DataFrame.
1038
+ #
1039
+ # @return [DataFrame]
1040
+ #
1041
+ # @example
1042
+ # df = Polars::DataFrame.new(
1043
+ # {
1044
+ # "a" => [1.0, 2.8, 3.0],
1045
+ # "b" => [4, 5, nil],
1046
+ # "c" => [true, false, true],
1047
+ # "d" => [nil, "b", "c"],
1048
+ # "e" => ["usd", "eur", nil]
1049
+ # }
1050
+ # )
1051
+ # df.describe
1052
+ # # =>
1053
+ # # shape: (7, 6)
1054
+ # # ┌────────────┬──────────┬──────────┬──────┬──────┬──────┐
1055
+ # # │ describe ┆ a ┆ b ┆ c ┆ d ┆ e │
1056
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1057
+ # # │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str │
1058
+ # # ╞════════════╪══════════╪══════════╪══════╪══════╪══════╡
1059
+ # # │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 │
1060
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1061
+ # # │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 │
1062
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1063
+ # # │ mean ┆ 2.266667 ┆ 4.5 ┆ null ┆ null ┆ null │
1064
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1065
+ # # │ std ┆ 1.101514 ┆ 0.707107 ┆ null ┆ null ┆ null │
1066
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1067
+ # # │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur │
1068
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1069
+ # # │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd │
1070
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1071
+ # # │ median ┆ 2.8 ┆ 4.5 ┆ null ┆ null ┆ null │
1072
+ # # └────────────┴──────────┴──────────┴──────┴──────┴──────┘
1073
+ def describe
1074
+ describe_cast = lambda do |stat|
1075
+ columns = []
1076
+ self.columns.each_with_index do |s, i|
1077
+ if self[s].is_numeric || self[s].is_boolean
1078
+ columns << stat[0.., i].cast(:f64)
1079
+ else
1080
+ # for dates, strings, etc, we cast to string so that all
1081
+ # statistics can be shown
1082
+ columns << stat[0.., i].cast(:str)
1083
+ end
1084
+ end
1085
+ self.class.new(columns)
1086
+ end
301
1087
 
302
- # def find_idx_by_name
303
- # end
1088
+ summary = _from_rbdf(
1089
+ Polars.concat(
1090
+ [
1091
+ describe_cast.(
1092
+ self.class.new(columns.to_h { |c| [c, [height]] })
1093
+ ),
1094
+ describe_cast.(null_count),
1095
+ describe_cast.(mean),
1096
+ describe_cast.(std),
1097
+ describe_cast.(min),
1098
+ describe_cast.(max),
1099
+ describe_cast.(median)
1100
+ ]
1101
+ )._df
1102
+ )
1103
+ summary.insert_at_idx(
1104
+ 0,
1105
+ Polars::Series.new(
1106
+ "describe",
1107
+ ["count", "null_count", "mean", "std", "min", "max", "median"],
1108
+ )
1109
+ )
1110
+ summary
1111
+ end
304
1112
 
305
- # def replace_at_idx
306
- # end
1113
+ # Find the index of a column by name.
1114
+ #
1115
+ # @param name [String]
1116
+ # Name of the column to find.
1117
+ #
1118
+ # @return [Integer]
1119
+ #
1120
+ # @example
1121
+ # df = Polars::DataFrame.new(
1122
+ # {"foo" => [1, 2, 3], "bar" => [6, 7, 8], "ham" => ["a", "b", "c"]}
1123
+ # )
1124
+ # df.find_idx_by_name("ham")
1125
+ # # => 2
1126
+ def find_idx_by_name(name)
1127
+ _df.find_idx_by_name(name)
1128
+ end
1129
+
1130
+ # Replace a column at an index location.
1131
+ #
1132
+ # @param index [Integer]
1133
+ # Column index.
1134
+ # @param series [Series]
1135
+ # Series that will replace the column.
1136
+ #
1137
+ # @return [DataFrame]
1138
+ #
1139
+ # @example
1140
+ # df = Polars::DataFrame.new(
1141
+ # {
1142
+ # "foo" => [1, 2, 3],
1143
+ # "bar" => [6, 7, 8],
1144
+ # "ham" => ["a", "b", "c"]
1145
+ # }
1146
+ # )
1147
+ # s = Polars::Series.new("apple", [10, 20, 30])
1148
+ # df.replace_at_idx(0, s)
1149
+ # # =>
1150
+ # # shape: (3, 3)
1151
+ # # ┌───────┬─────┬─────┐
1152
+ # # │ apple ┆ bar ┆ ham │
1153
+ # # │ --- ┆ --- ┆ --- │
1154
+ # # │ i64 ┆ i64 ┆ str │
1155
+ # # ╞═══════╪═════╪═════╡
1156
+ # # │ 10 ┆ 6 ┆ a │
1157
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1158
+ # # │ 20 ┆ 7 ┆ b │
1159
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1160
+ # # │ 30 ┆ 8 ┆ c │
1161
+ # # └───────┴─────┴─────┘
1162
+ def replace_at_idx(index, series)
1163
+ if index < 0
1164
+ index = columns.length + index
1165
+ end
1166
+ _df.replace_at_idx(index, series._s)
1167
+ self
1168
+ end
307
1169
 
1170
+ # Sort the DataFrame by column.
1171
+ #
1172
+ # @param by [String, Expr, Array]
1173
+ # By which column to sort.
1174
+ # @param reverse [Boolean]
1175
+ # Reverse/descending sort.
1176
+ # @param nulls_last [Boolean]
1177
+ # Place null values last. Can only be used if sorted by a single column.
1178
+ #
1179
+ # @return [DataFrame]
1180
+ #
1181
+ # @example
1182
+ # df = Polars::DataFrame.new(
1183
+ # {
1184
+ # "foo" => [1, 2, 3],
1185
+ # "bar" => [6.0, 7.0, 8.0],
1186
+ # "ham" => ["a", "b", "c"]
1187
+ # }
1188
+ # )
1189
+ # df.sort("foo", reverse: true)
1190
+ # # =>
1191
+ # # shape: (3, 3)
1192
+ # # ┌─────┬─────┬─────┐
1193
+ # # │ foo ┆ bar ┆ ham │
1194
+ # # │ --- ┆ --- ┆ --- │
1195
+ # # │ i64 ┆ f64 ┆ str │
1196
+ # # ╞═════╪═════╪═════╡
1197
+ # # │ 3 ┆ 8.0 ┆ c │
1198
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1199
+ # # │ 2 ┆ 7.0 ┆ b │
1200
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1201
+ # # │ 1 ┆ 6.0 ┆ a │
1202
+ # # └─────┴─────┴─────┘
1203
+ #
1204
+ # @example Sort by multiple columns.
1205
+ # df.sort(
1206
+ # [Polars.col("foo"), Polars.col("bar")**2],
1207
+ # reverse: [true, false]
1208
+ # )
1209
+ # # =>
1210
+ # # shape: (3, 3)
1211
+ # # ┌─────┬─────┬─────┐
1212
+ # # │ foo ┆ bar ┆ ham │
1213
+ # # │ --- ┆ --- ┆ --- │
1214
+ # # │ i64 ┆ f64 ┆ str │
1215
+ # # ╞═════╪═════╪═════╡
1216
+ # # │ 3 ┆ 8.0 ┆ c │
1217
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1218
+ # # │ 2 ┆ 7.0 ┆ b │
1219
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1220
+ # # │ 1 ┆ 6.0 ┆ a │
1221
+ # # └─────┴─────┴─────┘
308
1222
  def sort(by, reverse: false, nulls_last: false)
309
- _from_rbdf(_df.sort(by, reverse, nulls_last))
1223
+ if by.is_a?(Array) || by.is_a?(Expr)
1224
+ lazy
1225
+ .sort(by, reverse: reverse, nulls_last: nulls_last)
1226
+ .collect(no_optimization: true, string_cache: false)
1227
+ else
1228
+ _from_rbdf(_df.sort(by, reverse, nulls_last))
1229
+ end
310
1230
  end
311
1231
 
1232
+ # Check if DataFrame is equal to other.
1233
+ #
1234
+ # @param other [DataFrame]
1235
+ # DataFrame to compare with.
1236
+ # @param null_equal [Boolean]
1237
+ # Consider null values as equal.
1238
+ #
1239
+ # @return [Boolean]
1240
+ #
1241
+ # @example
1242
+ # df1 = Polars::DataFrame.new(
1243
+ # {
1244
+ # "foo" => [1, 2, 3],
1245
+ # "bar" => [6.0, 7.0, 8.0],
1246
+ # "ham" => ["a", "b", "c"]
1247
+ # }
1248
+ # )
1249
+ # df2 = Polars::DataFrame.new(
1250
+ # {
1251
+ # "foo" => [3, 2, 1],
1252
+ # "bar" => [8.0, 7.0, 6.0],
1253
+ # "ham" => ["c", "b", "a"]
1254
+ # }
1255
+ # )
1256
+ # df1.frame_equal(df1)
1257
+ # # => true
1258
+ # df1.frame_equal(df2)
1259
+ # # => false
312
1260
  def frame_equal(other, null_equal: true)
313
1261
  _df.frame_equal(other._df, null_equal)
314
1262
  end
315
1263
 
316
- # def replace
317
- # end
1264
+ # Replace a column by a new Series.
1265
+ #
1266
+ # @param column [String]
1267
+ # Column to replace.
1268
+ # @param new_col [Series]
1269
+ # New column to insert.
1270
+ #
1271
+ # @return [DataFrame]
1272
+ #
1273
+ # @example
1274
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
1275
+ # s = Polars::Series.new([10, 20, 30])
1276
+ # df.replace("foo", s)
1277
+ # # =>
1278
+ # # shape: (3, 2)
1279
+ # # ┌─────┬─────┐
1280
+ # # │ foo ┆ bar │
1281
+ # # │ --- ┆ --- │
1282
+ # # │ i64 ┆ i64 │
1283
+ # # ╞═════╪═════╡
1284
+ # # │ 10 ┆ 4 │
1285
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1286
+ # # │ 20 ┆ 5 │
1287
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1288
+ # # │ 30 ┆ 6 │
1289
+ # # └─────┴─────┘
1290
+ def replace(column, new_col)
1291
+ _df.replace(column, new_col._s)
1292
+ self
1293
+ end
318
1294
 
1295
+ # Get a slice of this DataFrame.
1296
+ #
1297
+ # @param offset [Integer]
1298
+ # Start index. Negative indexing is supported.
1299
+ # @param length [Integer, nil]
1300
+ # Length of the slice. If set to `nil`, all rows starting at the offset
1301
+ # will be selected.
1302
+ #
1303
+ # @return [DataFrame]
1304
+ #
1305
+ # @example
1306
+ # df = Polars::DataFrame.new(
1307
+ # {
1308
+ # "foo" => [1, 2, 3],
1309
+ # "bar" => [6.0, 7.0, 8.0],
1310
+ # "ham" => ["a", "b", "c"]
1311
+ # }
1312
+ # )
1313
+ # df.slice(1, 2)
1314
+ # # =>
1315
+ # # shape: (2, 3)
1316
+ # # ┌─────┬─────┬─────┐
1317
+ # # │ foo ┆ bar ┆ ham │
1318
+ # # │ --- ┆ --- ┆ --- │
1319
+ # # │ i64 ┆ f64 ┆ str │
1320
+ # # ╞═════╪═════╪═════╡
1321
+ # # │ 2 ┆ 7.0 ┆ b │
1322
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1323
+ # # │ 3 ┆ 8.0 ┆ c │
1324
+ # # └─────┴─────┴─────┘
319
1325
  def slice(offset, length = nil)
320
1326
  if !length.nil? && length < 0
321
1327
  length = height - offset + length
@@ -323,29 +1329,222 @@ module Polars
323
1329
  _from_rbdf(_df.slice(offset, length))
324
1330
  end
325
1331
 
1332
+ # Get the first `n` rows.
1333
+ #
1334
+ # Alias for {#head}.
1335
+ #
1336
+ # @param n [Integer]
1337
+ # Number of rows to return.
1338
+ #
1339
+ # @return [DataFrame]
1340
+ #
1341
+ # @example
1342
+ # df = Polars::DataFrame.new(
1343
+ # {"foo" => [1, 2, 3, 4, 5, 6], "bar" => ["a", "b", "c", "d", "e", "f"]}
1344
+ # )
1345
+ # df.limit(4)
1346
+ # # =>
1347
+ # # shape: (4, 2)
1348
+ # # ┌─────┬─────┐
1349
+ # # │ foo ┆ bar │
1350
+ # # │ --- ┆ --- │
1351
+ # # │ i64 ┆ str │
1352
+ # # ╞═════╪═════╡
1353
+ # # │ 1 ┆ a │
1354
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1355
+ # # │ 2 ┆ b │
1356
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1357
+ # # │ 3 ┆ c │
1358
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1359
+ # # │ 4 ┆ d │
1360
+ # # └─────┴─────┘
326
1361
  def limit(n = 5)
327
1362
  head(n)
328
1363
  end
329
1364
 
1365
+ # Get the first `n` rows.
1366
+ #
1367
+ # @param n [Integer]
1368
+ # Number of rows to return.
1369
+ #
1370
+ # @return [DataFrame]
1371
+ #
1372
+ # @example
1373
+ # df = Polars::DataFrame.new(
1374
+ # {
1375
+ # "foo" => [1, 2, 3, 4, 5],
1376
+ # "bar" => [6, 7, 8, 9, 10],
1377
+ # "ham" => ["a", "b", "c", "d", "e"]
1378
+ # }
1379
+ # )
1380
+ # df.head(3)
1381
+ # # =>
1382
+ # # shape: (3, 3)
1383
+ # # ┌─────┬─────┬─────┐
1384
+ # # │ foo ┆ bar ┆ ham │
1385
+ # # │ --- ┆ --- ┆ --- │
1386
+ # # │ i64 ┆ i64 ┆ str │
1387
+ # # ╞═════╪═════╪═════╡
1388
+ # # │ 1 ┆ 6 ┆ a │
1389
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1390
+ # # │ 2 ┆ 7 ┆ b │
1391
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1392
+ # # │ 3 ┆ 8 ┆ c │
1393
+ # # └─────┴─────┴─────┘
330
1394
  def head(n = 5)
331
1395
  _from_rbdf(_df.head(n))
332
1396
  end
333
1397
 
1398
+ # Get the last `n` rows.
1399
+ #
1400
+ # @param n [Integer]
1401
+ # Number of rows to return.
1402
+ #
1403
+ # @return [DataFrame]
1404
+ #
1405
+ # @example
1406
+ # df = Polars::DataFrame.new(
1407
+ # {
1408
+ # "foo" => [1, 2, 3, 4, 5],
1409
+ # "bar" => [6, 7, 8, 9, 10],
1410
+ # "ham" => ["a", "b", "c", "d", "e"]
1411
+ # }
1412
+ # )
1413
+ # df.tail(3)
1414
+ # # =>
1415
+ # # shape: (3, 3)
1416
+ # # ┌─────┬─────┬─────┐
1417
+ # # │ foo ┆ bar ┆ ham │
1418
+ # # │ --- ┆ --- ┆ --- │
1419
+ # # │ i64 ┆ i64 ┆ str │
1420
+ # # ╞═════╪═════╪═════╡
1421
+ # # │ 3 ┆ 8 ┆ c │
1422
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1423
+ # # │ 4 ┆ 9 ┆ d │
1424
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1425
+ # # │ 5 ┆ 10 ┆ e │
1426
+ # # └─────┴─────┴─────┘
334
1427
  def tail(n = 5)
335
1428
  _from_rbdf(_df.tail(n))
336
1429
  end
337
1430
 
338
- # def drop_nulls
339
- # end
1431
+ # Return a new DataFrame where the null values are dropped.
1432
+ #
1433
+ # @param subset [Object]
1434
+ # Subset of column(s) on which `drop_nulls` will be applied.
1435
+ #
1436
+ # @return [DataFrame]
1437
+ #
1438
+ # @example
1439
+ # df = Polars::DataFrame.new(
1440
+ # {
1441
+ # "foo" => [1, 2, 3],
1442
+ # "bar" => [6, nil, 8],
1443
+ # "ham" => ["a", "b", "c"]
1444
+ # }
1445
+ # )
1446
+ # df.drop_nulls
1447
+ # # =>
1448
+ # # shape: (2, 3)
1449
+ # # ┌─────┬─────┬─────┐
1450
+ # # │ foo ┆ bar ┆ ham │
1451
+ # # │ --- ┆ --- ┆ --- │
1452
+ # # │ i64 ┆ i64 ┆ str │
1453
+ # # ╞═════╪═════╪═════╡
1454
+ # # │ 1 ┆ 6 ┆ a │
1455
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1456
+ # # │ 3 ┆ 8 ┆ c │
1457
+ # # └─────┴─────┴─────┘
1458
+ def drop_nulls(subset: nil)
1459
+ if subset.is_a?(String)
1460
+ subset = [subset]
1461
+ end
1462
+ _from_rbdf(_df.drop_nulls(subset))
1463
+ end
340
1464
 
341
1465
  # def pipe
342
1466
  # end
343
1467
 
344
- # def with_row_count
345
- # end
1468
+ # Add a column at index 0 that counts the rows.
1469
+ #
1470
+ # @param name [String]
1471
+ # Name of the column to add.
1472
+ # @param offset [Integer]
1473
+ # Start the row count at this offset.
1474
+ #
1475
+ # @return [DataFrame]
1476
+ #
1477
+ # @example
1478
+ # df = Polars::DataFrame.new(
1479
+ # {
1480
+ # "a" => [1, 3, 5],
1481
+ # "b" => [2, 4, 6]
1482
+ # }
1483
+ # )
1484
+ # df.with_row_count
1485
+ # # =>
1486
+ # # shape: (3, 3)
1487
+ # # ┌────────┬─────┬─────┐
1488
+ # # │ row_nr ┆ a ┆ b │
1489
+ # # │ --- ┆ --- ┆ --- │
1490
+ # # │ u32 ┆ i64 ┆ i64 │
1491
+ # # ╞════════╪═════╪═════╡
1492
+ # # │ 0 ┆ 1 ┆ 2 │
1493
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1494
+ # # │ 1 ┆ 3 ┆ 4 │
1495
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1496
+ # # │ 2 ┆ 5 ┆ 6 │
1497
+ # # └────────┴─────┴─────┘
1498
+ def with_row_count(name: "row_nr", offset: 0)
1499
+ _from_rbdf(_df.with_row_count(name, offset))
1500
+ end
346
1501
 
1502
+ # Start a groupby operation.
1503
+ #
1504
+ # @param by [Object]
1505
+ # Column(s) to group by.
1506
+ # @param maintain_order [Boolean]
1507
+ # Make sure that the order of the groups remains consistent. This is more
1508
+ # expensive than a default groupby. Note that this only works in expression
1509
+ # aggregations.
1510
+ #
1511
+ # @return [GroupBy]
1512
+ #
1513
+ # @example
1514
+ # df = Polars::DataFrame.new(
1515
+ # {
1516
+ # "a" => ["a", "b", "a", "b", "b", "c"],
1517
+ # "b" => [1, 2, 3, 4, 5, 6],
1518
+ # "c" => [6, 5, 4, 3, 2, 1]
1519
+ # }
1520
+ # )
1521
+ # df.groupby("a").agg(Polars.col("b").sum).sort("a")
1522
+ # # =>
1523
+ # # shape: (3, 2)
1524
+ # # ┌─────┬─────┐
1525
+ # # │ a ┆ b │
1526
+ # # │ --- ┆ --- │
1527
+ # # │ str ┆ i64 │
1528
+ # # ╞═════╪═════╡
1529
+ # # │ a ┆ 4 │
1530
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1531
+ # # │ b ┆ 11 │
1532
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1533
+ # # │ c ┆ 6 │
1534
+ # # └─────┴─────┘
347
1535
  def groupby(by, maintain_order: false)
348
- lazy.groupby(by, maintain_order: maintain_order)
1536
+ if !Utils.bool?(maintain_order)
1537
+ raise TypeError, "invalid input for groupby arg `maintain_order`: #{maintain_order}."
1538
+ end
1539
+ if by.is_a?(String)
1540
+ by = [by]
1541
+ end
1542
+ GroupBy.new(
1543
+ _df,
1544
+ by,
1545
+ self.class,
1546
+ maintain_order: maintain_order
1547
+ )
349
1548
  end
350
1549
 
351
1550
  # def groupby_rolling
@@ -360,6 +1559,109 @@ module Polars
360
1559
  # def join_asof
361
1560
  # end
362
1561
 
1562
+ # Join in SQL-like fashion.
1563
+ #
1564
+ # @param other [DataFrame]
1565
+ # DataFrame to join with.
1566
+ # @param left_on [Object]
1567
+ # Name(s) of the left join column(s).
1568
+ # @param right_on [Object]
1569
+ # Name(s) of the right join column(s).
1570
+ # @param on [Object]
1571
+ # Name(s) of the join columns in both DataFrames.
1572
+ # @param how ["inner", "left", "outer", "semi", "anti", "cross"]
1573
+ # Join strategy.
1574
+ # @param suffix [String]
1575
+ # Suffix to append to columns with a duplicate name.
1576
+ #
1577
+ # @return [DataFrame]
1578
+ #
1579
+ # @example
1580
+ # df = Polars::DataFrame.new(
1581
+ # {
1582
+ # "foo" => [1, 2, 3],
1583
+ # "bar" => [6.0, 7.0, 8.0],
1584
+ # "ham" => ["a", "b", "c"]
1585
+ # }
1586
+ # )
1587
+ # other_df = Polars::DataFrame.new(
1588
+ # {
1589
+ # "apple" => ["x", "y", "z"],
1590
+ # "ham" => ["a", "b", "d"]
1591
+ # }
1592
+ # )
1593
+ # df.join(other_df, on: "ham")
1594
+ # # =>
1595
+ # # shape: (2, 4)
1596
+ # # ┌─────┬─────┬─────┬───────┐
1597
+ # # │ foo ┆ bar ┆ ham ┆ apple │
1598
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1599
+ # # │ i64 ┆ f64 ┆ str ┆ str │
1600
+ # # ╞═════╪═════╪═════╪═══════╡
1601
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
1602
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1603
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
1604
+ # # └─────┴─────┴─────┴───────┘
1605
+ #
1606
+ # @example
1607
+ # df.join(other_df, on: "ham", how: "outer")
1608
+ # # =>
1609
+ # # shape: (4, 4)
1610
+ # # ┌──────┬──────┬─────┬───────┐
1611
+ # # │ foo ┆ bar ┆ ham ┆ apple │
1612
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1613
+ # # │ i64 ┆ f64 ┆ str ┆ str │
1614
+ # # ╞══════╪══════╪═════╪═══════╡
1615
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
1616
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1617
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
1618
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1619
+ # # │ null ┆ null ┆ d ┆ z │
1620
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1621
+ # # │ 3 ┆ 8.0 ┆ c ┆ null │
1622
+ # # └──────┴──────┴─────┴───────┘
1623
+ #
1624
+ # @example
1625
+ # df.join(other_df, on: "ham", how: "left")
1626
+ # # =>
1627
+ # # shape: (3, 4)
1628
+ # # ┌─────┬─────┬─────┬───────┐
1629
+ # # │ foo ┆ bar ┆ ham ┆ apple │
1630
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1631
+ # # │ i64 ┆ f64 ┆ str ┆ str │
1632
+ # # ╞═════╪═════╪═════╪═══════╡
1633
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
1634
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1635
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
1636
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1637
+ # # │ 3 ┆ 8.0 ┆ c ┆ null │
1638
+ # # └─────┴─────┴─────┴───────┘
1639
+ #
1640
+ # @example
1641
+ # df.join(other_df, on: "ham", how: "semi")
1642
+ # # =>
1643
+ # # shape: (2, 3)
1644
+ # # ┌─────┬─────┬─────┐
1645
+ # # │ foo ┆ bar ┆ ham │
1646
+ # # │ --- ┆ --- ┆ --- │
1647
+ # # │ i64 ┆ f64 ┆ str │
1648
+ # # ╞═════╪═════╪═════╡
1649
+ # # │ 1 ┆ 6.0 ┆ a │
1650
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1651
+ # # │ 2 ┆ 7.0 ┆ b │
1652
+ # # └─────┴─────┴─────┘
1653
+ #
1654
+ # @example
1655
+ # df.join(other_df, on: "ham", how: "anti")
1656
+ # # =>
1657
+ # # shape: (1, 3)
1658
+ # # ┌─────┬─────┬─────┐
1659
+ # # │ foo ┆ bar ┆ ham │
1660
+ # # │ --- ┆ --- ┆ --- │
1661
+ # # │ i64 ┆ f64 ┆ str │
1662
+ # # ╞═════╪═════╪═════╡
1663
+ # # │ 3 ┆ 8.0 ┆ c │
1664
+ # # └─────┴─────┴─────┘
363
1665
  def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
364
1666
  lazy
365
1667
  .join(
@@ -376,36 +1678,322 @@ module Polars
376
1678
  # def apply
377
1679
  # end
378
1680
 
1681
+ # Return a new DataFrame with the column added or replaced.
1682
+ #
1683
+ # @param column [Object]
1684
+ # Series, where the name of the Series refers to the column in the DataFrame.
1685
+ #
1686
+ # @return [DataFrame]
1687
+ #
1688
+ # @example Added
1689
+ # df = Polars::DataFrame.new(
1690
+ # {
1691
+ # "a" => [1, 3, 5],
1692
+ # "b" => [2, 4, 6]
1693
+ # }
1694
+ # )
1695
+ # df.with_column((Polars.col("b") ** 2).alias("b_squared"))
1696
+ # # =>
1697
+ # # shape: (3, 3)
1698
+ # # ┌─────┬─────┬───────────┐
1699
+ # # │ a ┆ b ┆ b_squared │
1700
+ # # │ --- ┆ --- ┆ --- │
1701
+ # # │ i64 ┆ i64 ┆ f64 │
1702
+ # # ╞═════╪═════╪═══════════╡
1703
+ # # │ 1 ┆ 2 ┆ 4.0 │
1704
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
1705
+ # # │ 3 ┆ 4 ┆ 16.0 │
1706
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
1707
+ # # │ 5 ┆ 6 ┆ 36.0 │
1708
+ # # └─────┴─────┴───────────┘
1709
+ #
1710
+ # @example Replaced
1711
+ # df.with_column(Polars.col("a") ** 2)
1712
+ # # =>
1713
+ # # shape: (3, 2)
1714
+ # # ┌──────┬─────┐
1715
+ # # │ a ┆ b │
1716
+ # # │ --- ┆ --- │
1717
+ # # │ f64 ┆ i64 │
1718
+ # # ╞══════╪═════╡
1719
+ # # │ 1.0 ┆ 2 │
1720
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
1721
+ # # │ 9.0 ┆ 4 │
1722
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
1723
+ # # │ 25.0 ┆ 6 │
1724
+ # # └──────┴─────┘
379
1725
  def with_column(column)
380
1726
  lazy
381
1727
  .with_column(column)
382
1728
  .collect(no_optimization: true, string_cache: false)
383
1729
  end
384
1730
 
385
- # def hstack
386
- # end
1731
+ # Return a new DataFrame grown horizontally by stacking multiple Series to it.
1732
+ #
1733
+ # @param columns [Object]
1734
+ # Series to stack.
1735
+ # @param in_place [Boolean]
1736
+ # Modify in place.
1737
+ #
1738
+ # @return [DataFrame]
1739
+ #
1740
+ # @example
1741
+ # df = Polars::DataFrame.new(
1742
+ # {
1743
+ # "foo" => [1, 2, 3],
1744
+ # "bar" => [6, 7, 8],
1745
+ # "ham" => ["a", "b", "c"]
1746
+ # }
1747
+ # )
1748
+ # x = Polars::Series.new("apple", [10, 20, 30])
1749
+ # df.hstack([x])
1750
+ # # =>
1751
+ # # shape: (3, 4)
1752
+ # # ┌─────┬─────┬─────┬───────┐
1753
+ # # │ foo ┆ bar ┆ ham ┆ apple │
1754
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1755
+ # # │ i64 ┆ i64 ┆ str ┆ i64 │
1756
+ # # ╞═════╪═════╪═════╪═══════╡
1757
+ # # │ 1 ┆ 6 ┆ a ┆ 10 │
1758
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1759
+ # # │ 2 ┆ 7 ┆ b ┆ 20 │
1760
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1761
+ # # │ 3 ┆ 8 ┆ c ┆ 30 │
1762
+ # # └─────┴─────┴─────┴───────┘
1763
+ def hstack(columns, in_place: false)
1764
+ if !columns.is_a?(Array)
1765
+ columns = columns.get_columns
1766
+ end
1767
+ if in_place
1768
+ _df.hstack_mut(columns.map(&:_s))
1769
+ self
1770
+ else
1771
+ _from_rbdf(_df.hstack(columns.map(&:_s)))
1772
+ end
1773
+ end
387
1774
 
388
- # def vstack
389
- # end
1775
+ # Grow this DataFrame vertically by stacking a DataFrame to it.
1776
+ #
1777
+ # @param df [DataFrame]
1778
+ # DataFrame to stack.
1779
+ # @param in_place [Boolean]
1780
+ # Modify in place
1781
+ #
1782
+ # @return [DataFrame]
1783
+ #
1784
+ # @example
1785
+ # df1 = Polars::DataFrame.new(
1786
+ # {
1787
+ # "foo" => [1, 2],
1788
+ # "bar" => [6, 7],
1789
+ # "ham" => ["a", "b"]
1790
+ # }
1791
+ # )
1792
+ # df2 = Polars::DataFrame.new(
1793
+ # {
1794
+ # "foo" => [3, 4],
1795
+ # "bar" => [8, 9],
1796
+ # "ham" => ["c", "d"]
1797
+ # }
1798
+ # )
1799
+ # df1.vstack(df2)
1800
+ # # =>
1801
+ # # shape: (4, 3)
1802
+ # # ┌─────┬─────┬─────┐
1803
+ # # │ foo ┆ bar ┆ ham │
1804
+ # # │ --- ┆ --- ┆ --- │
1805
+ # # │ i64 ┆ i64 ┆ str │
1806
+ # # ╞═════╪═════╪═════╡
1807
+ # # │ 1 ┆ 6 ┆ a │
1808
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1809
+ # # │ 2 ┆ 7 ┆ b │
1810
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1811
+ # # │ 3 ┆ 8 ┆ c │
1812
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1813
+ # # │ 4 ┆ 9 ┆ d │
1814
+ # # └─────┴─────┴─────┘
1815
+ def vstack(df, in_place: false)
1816
+ if in_place
1817
+ _df.vstack_mut(df._df)
1818
+ self
1819
+ else
1820
+ _from_rbdf(_df.vstack(df._df))
1821
+ end
1822
+ end
390
1823
 
391
- # def extend
392
- # end
1824
+ # Extend the memory backed by this `DataFrame` with the values from `other`.
1825
+ #
1826
+ # Different from `vstack` which adds the chunks from `other` to the chunks of this
1827
+ # `DataFrame`, `extend` appends the data from `other` to the underlying memory
1828
+ # locations and thus may cause a reallocation.
1829
+ #
1830
+ # If this does not cause a reallocation, the resulting data structure will not
1831
+ # have any extra chunks and thus will yield faster queries.
1832
+ #
1833
+ # Prefer `extend` over `vstack` when you want to do a query after a single append.
1834
+ # For instance during online operations where you add `n` rows and rerun a query.
1835
+ #
1836
+ # Prefer `vstack` over `extend` when you want to append many times before doing a
1837
+ # query. For instance when you read in multiple files and want to store them in a
1838
+ # single `DataFrame`. In the latter case, finish the sequence of `vstack`
1839
+ # operations with a `rechunk`.
1840
+ #
1841
+ # @param other [DataFrame]
1842
+ # DataFrame to vertically add.
1843
+ #
1844
+ # @return [DataFrame]
1845
+ #
1846
+ # @example
1847
+ # df1 = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
1848
+ # df2 = Polars::DataFrame.new({"foo" => [10, 20, 30], "bar" => [40, 50, 60]})
1849
+ # df1.extend(df2)
1850
+ # # =>
1851
+ # # shape: (6, 2)
1852
+ # # ┌─────┬─────┐
1853
+ # # │ foo ┆ bar │
1854
+ # # │ --- ┆ --- │
1855
+ # # │ i64 ┆ i64 │
1856
+ # # ╞═════╪═════╡
1857
+ # # │ 1 ┆ 4 │
1858
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1859
+ # # │ 2 ┆ 5 │
1860
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1861
+ # # │ 3 ┆ 6 │
1862
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1863
+ # # │ 10 ┆ 40 │
1864
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1865
+ # # │ 20 ┆ 50 │
1866
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1867
+ # # │ 30 ┆ 60 │
1868
+ # # └─────┴─────┘
1869
+ def extend(other)
1870
+ _df.extend(other._df)
1871
+ self
1872
+ end
393
1873
 
394
- # def drop
395
- # end
1874
+ # Remove column from DataFrame and return as new.
1875
+ #
1876
+ # @param columns [Object]
1877
+ # Column(s) to drop.
1878
+ #
1879
+ # @return [DataFrame]
1880
+ #
1881
+ # @example
1882
+ # df = Polars::DataFrame.new(
1883
+ # {
1884
+ # "foo" => [1, 2, 3],
1885
+ # "bar" => [6.0, 7.0, 8.0],
1886
+ # "ham" => ["a", "b", "c"]
1887
+ # }
1888
+ # )
1889
+ # df.drop("ham")
1890
+ # # =>
1891
+ # # shape: (3, 2)
1892
+ # # ┌─────┬─────┐
1893
+ # # │ foo ┆ bar │
1894
+ # # │ --- ┆ --- │
1895
+ # # │ i64 ┆ f64 │
1896
+ # # ╞═════╪═════╡
1897
+ # # │ 1 ┆ 6.0 │
1898
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1899
+ # # │ 2 ┆ 7.0 │
1900
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1901
+ # # │ 3 ┆ 8.0 │
1902
+ # # └─────┴─────┘
1903
+ def drop(columns)
1904
+ if columns.is_a?(Array)
1905
+ df = clone
1906
+ columns.each do |n|
1907
+ df._df.drop_in_place(n)
1908
+ end
1909
+ df
1910
+ else
1911
+ _from_rbdf(_df.drop(columns))
1912
+ end
1913
+ end
396
1914
 
397
- # def drop_in_place
398
- # end
1915
+ # Drop in place.
1916
+ #
1917
+ # @param name [Object]
1918
+ # Column to drop.
1919
+ #
1920
+ # @return [Series]
1921
+ #
1922
+ # @example
1923
+ # df = Polars::DataFrame.new(
1924
+ # {
1925
+ # "foo" => [1, 2, 3],
1926
+ # "bar" => [6, 7, 8],
1927
+ # "ham" => ["a", "b", "c"]
1928
+ # }
1929
+ # )
1930
+ # df.drop_in_place("ham")
1931
+ # # =>
1932
+ # # shape: (3,)
1933
+ # # Series: 'ham' [str]
1934
+ # # [
1935
+ # # "a"
1936
+ # # "b"
1937
+ # # "c"
1938
+ # # ]
1939
+ def drop_in_place(name)
1940
+ Utils.wrap_s(_df.drop_in_place(name))
1941
+ end
399
1942
 
400
- # def cleared
401
- # end
1943
+ # Create an empty copy of the current DataFrame.
1944
+ #
1945
+ # Returns a DataFrame with identical schema but no data.
1946
+ #
1947
+ # @return [DataFrame]
1948
+ #
1949
+ # @example
1950
+ # df = Polars::DataFrame.new(
1951
+ # {
1952
+ # "a" => [nil, 2, 3, 4],
1953
+ # "b" => [0.5, nil, 2.5, 13],
1954
+ # "c" => [true, true, false, nil]
1955
+ # }
1956
+ # )
1957
+ # df.cleared
1958
+ # # =>
1959
+ # # shape: (0, 3)
1960
+ # # ┌─────┬─────┬──────┐
1961
+ # # │ a ┆ b ┆ c │
1962
+ # # │ --- ┆ --- ┆ --- │
1963
+ # # │ i64 ┆ f64 ┆ bool │
1964
+ # # ╞═════╪═════╪══════╡
1965
+ # # └─────┴─────┴──────┘
1966
+ def cleared
1967
+ height > 0 ? head(0) : clone
1968
+ end
402
1969
 
403
1970
  # clone handled by initialize_copy
404
1971
 
1972
+ # Get the DataFrame as an Array of Series.
1973
+ #
1974
+ # @return [Array]
405
1975
  def get_columns
406
1976
  _df.get_columns.map { |s| Utils.wrap_s(s) }
407
1977
  end
408
1978
 
1979
+ # Get a single column as Series by name.
1980
+ #
1981
+ # @param name [String]
1982
+ # Name of the column to retrieve.
1983
+ #
1984
+ # @return [Series]
1985
+ #
1986
+ # @example
1987
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
1988
+ # df.get_column("foo")
1989
+ # # =>
1990
+ # # shape: (3,)
1991
+ # # Series: 'foo' [i64]
1992
+ # # [
1993
+ # # 1
1994
+ # # 2
1995
+ # # 3
1996
+ # # ]
409
1997
  def get_column(name)
410
1998
  self[name]
411
1999
  end
@@ -413,12 +2001,85 @@ module Polars
413
2001
  # def fill_null
414
2002
  # end
415
2003
 
2004
+ # Fill floating point NaN values by an Expression evaluation.
2005
+ #
2006
+ # @param fill_value [Object]
2007
+ # Value to fill NaN with.
2008
+ #
2009
+ # @return [DataFrame]
2010
+ #
2011
+ # @note
2012
+ # Note that floating point NaNs (Not a Number) are not missing values!
2013
+ # To replace missing values, use `fill_null`.
2014
+ #
2015
+ # @example
2016
+ # df = Polars::DataFrame.new(
2017
+ # {
2018
+ # "a" => [1.5, 2, Float::NAN, 4],
2019
+ # "b" => [0.5, 4, Float::NAN, 13]
2020
+ # }
2021
+ # )
2022
+ # df.fill_nan(99)
2023
+ # # =>
2024
+ # # shape: (4, 2)
2025
+ # # ┌──────┬──────┐
2026
+ # # │ a ┆ b │
2027
+ # # │ --- ┆ --- │
2028
+ # # │ f64 ┆ f64 │
2029
+ # # ╞══════╪══════╡
2030
+ # # │ 1.5 ┆ 0.5 │
2031
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2032
+ # # │ 2.0 ┆ 4.0 │
2033
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2034
+ # # │ 99.0 ┆ 99.0 │
2035
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2036
+ # # │ 4.0 ┆ 13.0 │
2037
+ # # └──────┴──────┘
416
2038
  def fill_nan(fill_value)
417
2039
  lazy.fill_nan(fill_value).collect(no_optimization: true)
418
2040
  end
419
2041
 
420
- # def explode
421
- # end
2042
+ # Explode `DataFrame` to long format by exploding a column with Lists.
2043
+ #
2044
+ # @param columns [Object]
2045
+ # Column of LargeList type.
2046
+ #
2047
+ # @return [DataFrame]
2048
+ #
2049
+ # @example
2050
+ # df = Polars::DataFrame.new(
2051
+ # {
2052
+ # "letters" => ["a", "a", "b", "c"],
2053
+ # "numbers" => [[1], [2, 3], [4, 5], [6, 7, 8]]
2054
+ # }
2055
+ # )
2056
+ # df.explode("numbers")
2057
+ # # =>
2058
+ # # shape: (8, 2)
2059
+ # # ┌─────────┬─────────┐
2060
+ # # │ letters ┆ numbers │
2061
+ # # │ --- ┆ --- │
2062
+ # # │ str ┆ i64 │
2063
+ # # ╞═════════╪═════════╡
2064
+ # # │ a ┆ 1 │
2065
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2066
+ # # │ a ┆ 2 │
2067
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2068
+ # # │ a ┆ 3 │
2069
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2070
+ # # │ b ┆ 4 │
2071
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2072
+ # # │ b ┆ 5 │
2073
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2074
+ # # │ c ┆ 6 │
2075
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2076
+ # # │ c ┆ 7 │
2077
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
2078
+ # # │ c ┆ 8 │
2079
+ # # └─────────┴─────────┘
2080
+ def explode(columns)
2081
+ lazy.explode(columns).collect(no_optimization: true)
2082
+ end
422
2083
 
423
2084
  # def pivot
424
2085
  # end
@@ -432,24 +2093,242 @@ module Polars
432
2093
  # def partition_by
433
2094
  # end
434
2095
 
435
- # def shift
436
- # end
2096
+ # Shift values by the given period.
2097
+ #
2098
+ # @param periods [Integer]
2099
+ # Number of places to shift (may be negative).
2100
+ #
2101
+ # @return [DataFrame]
2102
+ #
2103
+ # @example
2104
+ # df = Polars::DataFrame.new(
2105
+ # {
2106
+ # "foo" => [1, 2, 3],
2107
+ # "bar" => [6, 7, 8],
2108
+ # "ham" => ["a", "b", "c"]
2109
+ # }
2110
+ # )
2111
+ # df.shift(1)
2112
+ # # =>
2113
+ # # shape: (3, 3)
2114
+ # # ┌──────┬──────┬──────┐
2115
+ # # │ foo ┆ bar ┆ ham │
2116
+ # # │ --- ┆ --- ┆ --- │
2117
+ # # │ i64 ┆ i64 ┆ str │
2118
+ # # ╞══════╪══════╪══════╡
2119
+ # # │ null ┆ null ┆ null │
2120
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2121
+ # # │ 1 ┆ 6 ┆ a │
2122
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2123
+ # # │ 2 ┆ 7 ┆ b │
2124
+ # # └──────┴──────┴──────┘
2125
+ #
2126
+ # @example
2127
+ # df.shift(-1)
2128
+ # # =>
2129
+ # # shape: (3, 3)
2130
+ # # ┌──────┬──────┬──────┐
2131
+ # # │ foo ┆ bar ┆ ham │
2132
+ # # │ --- ┆ --- ┆ --- │
2133
+ # # │ i64 ┆ i64 ┆ str │
2134
+ # # ╞══════╪══════╪══════╡
2135
+ # # │ 2 ┆ 7 ┆ b │
2136
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2137
+ # # │ 3 ┆ 8 ┆ c │
2138
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2139
+ # # │ null ┆ null ┆ null │
2140
+ # # └──────┴──────┴──────┘
2141
def shift(periods)
  # Shift on the native frame, then wrap the result back into a DataFrame.
  shifted = _df.shift(periods)
  _from_rbdf(shifted)
end
437
2144
 
438
- # def shift_and_fill
439
- # end
2145
+ # Shift the values by a given period and fill the resulting null values.
2146
+ #
2147
+ # @param periods [Integer]
2148
+ # Number of places to shift (may be negative).
2149
+ # @param fill_value [Object]
2150
+ # Fill nil values with this value.
2151
+ #
2152
+ # @return [DataFrame]
2153
+ #
2154
+ # @example
2155
+ # df = Polars::DataFrame.new(
2156
+ # {
2157
+ # "foo" => [1, 2, 3],
2158
+ # "bar" => [6, 7, 8],
2159
+ # "ham" => ["a", "b", "c"]
2160
+ # }
2161
+ # )
2162
+ # df.shift_and_fill(1, 0)
2163
+ # # =>
2164
+ # # shape: (3, 3)
2165
+ # # ┌─────┬─────┬─────┐
2166
+ # # │ foo ┆ bar ┆ ham │
2167
+ # # │ --- ┆ --- ┆ --- │
2168
+ # # │ i64 ┆ i64 ┆ str │
2169
+ # # ╞═════╪═════╪═════╡
2170
+ # # │ 0 ┆ 0 ┆ 0 │
2171
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
2172
+ # # │ 1 ┆ 6 ┆ a │
2173
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
2174
+ # # │ 2 ┆ 7 ┆ b │
2175
+ # # └─────┴─────┴─────┘
2176
def shift_and_fill(periods, fill_value)
  # Delegate to the lazy engine; collect without optimization or string cache.
  plan = lazy.shift_and_fill(periods, fill_value)
  plan.collect(no_optimization: true, string_cache: false)
end
440
2181
 
2182
+ # Get a mask of all duplicated rows in this DataFrame.
2183
+ #
2184
+ # @return [Series]
2185
+ #
2186
+ # @example
2187
+ # df = Polars::DataFrame.new(
2188
+ # {
2189
+ # "a" => [1, 2, 3, 1],
2190
+ # "b" => ["x", "y", "z", "x"],
2191
+ # }
2192
+ # )
2193
+ # df.is_duplicated
2194
+ # # =>
2195
+ # # shape: (4,)
2196
+ # # Series: '' [bool]
2197
+ # # [
2198
+ # # true
2199
+ # # false
2200
+ # # false
2201
+ # # true
2202
+ # # ]
441
2203
  def is_duplicated
442
2204
  Utils.wrap_s(_df.is_duplicated)
443
2205
  end
444
2206
 
2207
+ # Get a mask of all unique rows in this DataFrame.
2208
+ #
2209
+ # @return [Series]
2210
+ #
2211
+ # @example
2212
+ # df = Polars::DataFrame.new(
2213
+ # {
2214
+ # "a" => [1, 2, 3, 1],
2215
+ # "b" => ["x", "y", "z", "x"]
2216
+ # }
2217
+ # )
2218
+ # df.is_unique
2219
+ # # =>
2220
+ # # shape: (4,)
2221
+ # # Series: '' [bool]
2222
+ # # [
2223
+ # # false
2224
+ # # true
2225
+ # # true
2226
+ # # false
2227
+ # # ]
445
2228
  def is_unique
446
2229
  Utils.wrap_s(_df.is_unique)
447
2230
  end
448
2231
 
2232
+ # Start a lazy query from this point.
2233
+ #
2234
+ # @return [LazyFrame]
449
2235
  def lazy
450
2236
  wrap_ldf(_df.lazy)
451
2237
  end
452
2238
 
2239
+ # Select columns from this DataFrame.
2240
+ #
2241
+ # @param exprs [Object]
2242
+ # Column or columns to select.
2243
+ #
2244
+ # @return [DataFrame]
2245
+ #
2246
+ # @example
2247
+ # df = Polars::DataFrame.new(
2248
+ # {
2249
+ # "foo" => [1, 2, 3],
2250
+ # "bar" => [6, 7, 8],
2251
+ # "ham" => ["a", "b", "c"]
2252
+ # }
2253
+ # )
2254
+ # df.select("foo")
2255
+ # # =>
2256
+ # # shape: (3, 1)
2257
+ # # ┌─────┐
2258
+ # # │ foo │
2259
+ # # │ --- │
2260
+ # # │ i64 │
2261
+ # # ╞═════╡
2262
+ # # │ 1 │
2263
+ # # ├╌╌╌╌╌┤
2264
+ # # │ 2 │
2265
+ # # ├╌╌╌╌╌┤
2266
+ # # │ 3 │
2267
+ # # └─────┘
2268
+ #
2269
+ # @example
2270
+ # df.select(["foo", "bar"])
2271
+ # # =>
2272
+ # # shape: (3, 2)
2273
+ # # ┌─────┬─────┐
2274
+ # # │ foo ┆ bar │
2275
+ # # │ --- ┆ --- │
2276
+ # # │ i64 ┆ i64 │
2277
+ # # ╞═════╪═════╡
2278
+ # # │ 1 ┆ 6 │
2279
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
2280
+ # # │ 2 ┆ 7 │
2281
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
2282
+ # # │ 3 ┆ 8 │
2283
+ # # └─────┴─────┘
2284
+ #
2285
+ # @example
2286
+ # df.select(Polars.col("foo") + 1)
2287
+ # # =>
2288
+ # # shape: (3, 1)
2289
+ # # ┌─────┐
2290
+ # # │ foo │
2291
+ # # │ --- │
2292
+ # # │ i64 │
2293
+ # # ╞═════╡
2294
+ # # │ 2 │
2295
+ # # ├╌╌╌╌╌┤
2296
+ # # │ 3 │
2297
+ # # ├╌╌╌╌╌┤
2298
+ # # │ 4 │
2299
+ # # └─────┘
2300
+ #
2301
+ # @example
2302
+ # df.select([Polars.col("foo") + 1, Polars.col("bar") + 1])
2303
+ # # =>
2304
+ # # shape: (3, 2)
2305
+ # # ┌─────┬─────┐
2306
+ # # │ foo ┆ bar │
2307
+ # # │ --- ┆ --- │
2308
+ # # │ i64 ┆ i64 │
2309
+ # # ╞═════╪═════╡
2310
+ # # │ 2 ┆ 7 │
2311
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
2312
+ # # │ 3 ┆ 8 │
2313
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
2314
+ # # │ 4 ┆ 9 │
2315
+ # # └─────┴─────┘
2316
+ #
2317
+ # @example
2318
+ # df.select(Polars.when(Polars.col("foo") > 2).then(10).otherwise(0))
2319
+ # # =>
2320
+ # # shape: (3, 1)
2321
+ # # ┌─────────┐
2322
+ # # │ literal │
2323
+ # # │ --- │
2324
+ # # │ i64 │
2325
+ # # ╞═════════╡
2326
+ # # │ 0 │
2327
+ # # ├╌╌╌╌╌╌╌╌╌┤
2328
+ # # │ 0 │
2329
+ # # ├╌╌╌╌╌╌╌╌╌┤
2330
+ # # │ 10 │
2331
+ # # └─────────┘
453
2332
  def select(exprs)
454
2333
  _from_rbdf(
455
2334
  lazy
@@ -459,6 +2338,43 @@ module Polars
459
2338
  )
460
2339
  end
461
2340
 
2341
+ # Add or overwrite multiple columns in a DataFrame.
2342
+ #
2343
+ # @param exprs [Array]
2344
+ # Array of Expressions that evaluate to columns.
2345
+ #
2346
+ # @return [DataFrame]
2347
+ #
2348
+ # @example
2349
+ # df = Polars::DataFrame.new(
2350
+ # {
2351
+ # "a" => [1, 2, 3, 4],
2352
+ # "b" => [0.5, 4, 10, 13],
2353
+ # "c" => [true, true, false, true]
2354
+ # }
2355
+ # )
2356
+ # df.with_columns(
2357
+ # [
2358
+ # (Polars.col("a") ** 2).alias("a^2"),
2359
+ # (Polars.col("b") / 2).alias("b/2"),
2360
+ # (Polars.col("c").is_not()).alias("not c")
2361
+ # ]
2362
+ # )
2363
+ # # =>
2364
+ # # shape: (4, 6)
2365
+ # # ┌─────┬──────┬───────┬──────┬──────┬───────┐
2366
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
2367
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
2368
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │
2369
+ # # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
2370
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │
2371
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2372
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │
2373
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2374
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
2375
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2376
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
2377
+ # # └─────┴──────┴───────┴──────┴──────┴───────┘
462
2378
  def with_columns(exprs)
463
2379
  if !exprs.nil? && !exprs.is_a?(Array)
464
2380
  exprs = [exprs]
@@ -468,6 +2384,26 @@ module Polars
468
2384
  .collect(no_optimization: true, string_cache: false)
469
2385
  end
470
2386
 
2387
+ # Get number of chunks used by the ChunkedArrays of this DataFrame.
2388
+ #
2389
+ # @param strategy ["first", "all"]
2390
+ # Return the number of chunks of the 'first' column,
2391
+ # or 'all' columns in this DataFrame.
2392
+ #
2393
+ # @return [Object]
2394
+ #
2395
+ # @example
2396
+ # df = Polars::DataFrame.new(
2397
+ # {
2398
+ # "a" => [1, 2, 3, 4],
2399
+ # "b" => [0.5, 4, 10, 13],
2400
+ # "c" => [true, true, false, true]
2401
+ # }
2402
+ # )
2403
+ # df.n_chunks
2404
+ # # => 1
2405
+ # df.n_chunks(strategy: "all")
2406
+ # # => [1, 1, 1]
471
2407
  def n_chunks(strategy: "first")
472
2408
  if strategy == "first"
473
2409
  _df.n_chunks
@@ -478,6 +2414,28 @@ module Polars
478
2414
  end
479
2415
  end
480
2416
 
2417
+ # Aggregate the columns of this DataFrame to their maximum value.
2418
+ #
2419
+ # @return [DataFrame]
2420
+ #
2421
+ # @example
2422
+ # df = Polars::DataFrame.new(
2423
+ # {
2424
+ # "foo" => [1, 2, 3],
2425
+ # "bar" => [6, 7, 8],
2426
+ # "ham" => ["a", "b", "c"]
2427
+ # }
2428
+ # )
2429
+ # df.max
2430
+ # # =>
2431
+ # # shape: (1, 3)
2432
+ # # ┌─────┬─────┬─────┐
2433
+ # # │ foo ┆ bar ┆ ham │
2434
+ # # │ --- ┆ --- ┆ --- │
2435
+ # # │ i64 ┆ i64 ┆ str │
2436
+ # # ╞═════╪═════╪═════╡
2437
+ # # │ 3 ┆ 8 ┆ c │
2438
+ # # └─────┴─────┴─────┘
481
2439
  def max(axis: 0)
482
2440
  if axis == 0
483
2441
  _from_rbdf(_df.max)
@@ -488,6 +2446,28 @@ module Polars
488
2446
  end
489
2447
  end
490
2448
 
2449
+ # Aggregate the columns of this DataFrame to their minimum value.
2450
+ #
2451
+ # @return [DataFrame]
2452
+ #
2453
+ # @example
2454
+ # df = Polars::DataFrame.new(
2455
+ # {
2456
+ # "foo" => [1, 2, 3],
2457
+ # "bar" => [6, 7, 8],
2458
+ # "ham" => ["a", "b", "c"]
2459
+ # }
2460
+ # )
2461
+ # df.min
2462
+ # # =>
2463
+ # # shape: (1, 3)
2464
+ # # ┌─────┬─────┬─────┐
2465
+ # # │ foo ┆ bar ┆ ham │
2466
+ # # │ --- ┆ --- ┆ --- │
2467
+ # # │ i64 ┆ i64 ┆ str │
2468
+ # # ╞═════╪═════╪═════╡
2469
+ # # │ 1 ┆ 6 ┆ a │
2470
+ # # └─────┴─────┴─────┘
491
2471
  def min(axis: 0)
492
2472
  if axis == 0
493
2473
  _from_rbdf(_df.min)
@@ -498,6 +2478,44 @@ module Polars
498
2478
  end
499
2479
  end
500
2480
 
2481
+ # Aggregate the columns of this DataFrame to their sum value.
2482
+ #
2483
+ # @param axis [Integer]
2484
+ # Either 0 or 1.
2485
+ # @param null_strategy ["ignore", "propagate"]
2486
+ # This argument is only used if axis == 1.
2487
+ #
2488
+ # @return [DataFrame]
2489
+ #
2490
+ # @example
2491
+ # df = Polars::DataFrame.new(
2492
+ # {
2493
+ # "foo" => [1, 2, 3],
2494
+ # "bar" => [6, 7, 8],
2495
+ # "ham" => ["a", "b", "c"],
2496
+ # }
2497
+ # )
2498
+ # df.sum
2499
+ # # =>
2500
+ # # shape: (1, 3)
2501
+ # # ┌─────┬─────┬──────┐
2502
+ # # │ foo ┆ bar ┆ ham │
2503
+ # # │ --- ┆ --- ┆ --- │
2504
+ # # │ i64 ┆ i64 ┆ str │
2505
+ # # ╞═════╪═════╪══════╡
2506
+ # # │ 6 ┆ 21 ┆ null │
2507
+ # # └─────┴─────┴──────┘
2508
+ #
2509
+ # @example
2510
+ # df.sum(axis: 1)
2511
+ # # =>
2512
+ # # shape: (3,)
2513
+ # # Series: 'foo' [str]
2514
+ # # [
2515
+ # # "16a"
2516
+ # # "27b"
2517
+ # # "38c"
2518
+ # # ]
501
2519
  def sum(axis: 0, null_strategy: "ignore")
502
2520
  case axis
503
2521
  when 0
@@ -509,6 +2527,33 @@ module Polars
509
2527
  end
510
2528
  end
511
2529
 
2530
+ # Aggregate the columns of this DataFrame to their mean value.
2531
+ #
2532
+ # @param axis [Integer]
2533
+ # Either 0 or 1.
2534
+ # @param null_strategy ["ignore", "propagate"]
2535
+ # This argument is only used if axis == 1.
2536
+ #
2537
+ # @return [DataFrame]
2538
+ #
2539
+ # @example
2540
+ # df = Polars::DataFrame.new(
2541
+ # {
2542
+ # "foo" => [1, 2, 3],
2543
+ # "bar" => [6, 7, 8],
2544
+ # "ham" => ["a", "b", "c"]
2545
+ # }
2546
+ # )
2547
+ # df.mean
2548
+ # # =>
2549
+ # # shape: (1, 3)
2550
+ # # ┌─────┬─────┬──────┐
2551
+ # # │ foo ┆ bar ┆ ham │
2552
+ # # │ --- ┆ --- ┆ --- │
2553
+ # # │ f64 ┆ f64 ┆ str │
2554
+ # # ╞═════╪═════╪══════╡
2555
+ # # │ 2.0 ┆ 7.0 ┆ null │
2556
+ # # └─────┴─────┴──────┘
512
2557
  def mean(axis: 0, null_strategy: "ignore")
513
2558
  case axis
514
2559
  when 0
@@ -520,75 +2565,633 @@ module Polars
520
2565
  end
521
2566
  end
522
2567
 
2568
+ # Aggregate the columns of this DataFrame to their standard deviation value.
2569
+ #
2570
+ # @param ddof [Integer]
2571
+ # Degrees of freedom
2572
+ #
2573
+ # @return [DataFrame]
2574
+ #
2575
+ # @example
2576
+ # df = Polars::DataFrame.new(
2577
+ # {
2578
+ # "foo" => [1, 2, 3],
2579
+ # "bar" => [6, 7, 8],
2580
+ # "ham" => ["a", "b", "c"]
2581
+ # }
2582
+ # )
2583
+ # df.std
2584
+ # # =>
2585
+ # # shape: (1, 3)
2586
+ # # ┌─────┬─────┬──────┐
2587
+ # # │ foo ┆ bar ┆ ham │
2588
+ # # │ --- ┆ --- ┆ --- │
2589
+ # # │ f64 ┆ f64 ┆ str │
2590
+ # # ╞═════╪═════╪══════╡
2591
+ # # │ 1.0 ┆ 1.0 ┆ null │
2592
+ # # └─────┴─────┴──────┘
2593
+ #
2594
+ # @example
2595
+ # df.std(ddof: 0)
2596
+ # # =>
2597
+ # # shape: (1, 3)
2598
+ # # ┌──────────┬──────────┬──────┐
2599
+ # # │ foo ┆ bar ┆ ham │
2600
+ # # │ --- ┆ --- ┆ --- │
2601
+ # # │ f64 ┆ f64 ┆ str │
2602
+ # # ╞══════════╪══════════╪══════╡
2603
+ # # │ 0.816497 ┆ 0.816497 ┆ null │
2604
+ # # └──────────┴──────────┴──────┘
523
2605
  def std(ddof: 1)
524
2606
  _from_rbdf(_df.std(ddof))
525
2607
  end
526
2608
 
2609
+ # Aggregate the columns of this DataFrame to their variance value.
2610
+ #
2611
+ # @param ddof [Integer]
2612
+ # Degrees of freedom
2613
+ #
2614
+ # @return [DataFrame]
2615
+ #
2616
+ # @example
2617
+ # df = Polars::DataFrame.new(
2618
+ # {
2619
+ # "foo" => [1, 2, 3],
2620
+ # "bar" => [6, 7, 8],
2621
+ # "ham" => ["a", "b", "c"]
2622
+ # }
2623
+ # )
2624
+ # df.var
2625
+ # # =>
2626
+ # # shape: (1, 3)
2627
+ # # ┌─────┬─────┬──────┐
2628
+ # # │ foo ┆ bar ┆ ham │
2629
+ # # │ --- ┆ --- ┆ --- │
2630
+ # # │ f64 ┆ f64 ┆ str │
2631
+ # # ╞═════╪═════╪══════╡
2632
+ # # │ 1.0 ┆ 1.0 ┆ null │
2633
+ # # └─────┴─────┴──────┘
2634
+ #
2635
+ # @example
2636
+ # df.var(ddof: 0)
2637
+ # # =>
2638
+ # # shape: (1, 3)
2639
+ # # ┌──────────┬──────────┬──────┐
2640
+ # # │ foo ┆ bar ┆ ham │
2641
+ # # │ --- ┆ --- ┆ --- │
2642
+ # # │ f64 ┆ f64 ┆ str │
2643
+ # # ╞══════════╪══════════╪══════╡
2644
+ # # │ 0.666667 ┆ 0.666667 ┆ null │
2645
+ # # └──────────┴──────────┴──────┘
527
2646
  def var(ddof: 1)
528
2647
  _from_rbdf(_df.var(ddof))
529
2648
  end
530
2649
 
2650
+ # Aggregate the columns of this DataFrame to their median value.
2651
+ #
2652
+ # @return [DataFrame]
2653
+ #
2654
+ # @example
2655
+ # df = Polars::DataFrame.new(
2656
+ # {
2657
+ # "foo" => [1, 2, 3],
2658
+ # "bar" => [6, 7, 8],
2659
+ # "ham" => ["a", "b", "c"]
2660
+ # }
2661
+ # )
2662
+ # df.median
2663
+ # # =>
2664
+ # # shape: (1, 3)
2665
+ # # ┌─────┬─────┬──────┐
2666
+ # # │ foo ┆ bar ┆ ham │
2667
+ # # │ --- ┆ --- ┆ --- │
2668
+ # # │ f64 ┆ f64 ┆ str │
2669
+ # # ╞═════╪═════╪══════╡
2670
+ # # │ 2.0 ┆ 7.0 ┆ null │
2671
+ # # └─────┴─────┴──────┘
531
2672
  def median
532
2673
  _from_rbdf(_df.median)
533
2674
  end
534
2675
 
535
- # def product
536
- # end
2676
+ # Aggregate the columns of this DataFrame to their product values.
2677
+ #
2678
+ # @return [DataFrame]
2679
+ #
2680
+ # @example
2681
+ # df = Polars::DataFrame.new(
2682
+ # {
2683
+ # "a" => [1, 2, 3],
2684
+ # "b" => [0.5, 4, 10],
2685
+ # "c" => [true, true, false]
2686
+ # }
2687
+ # )
2688
+ # df.product
2689
+ # # =>
2690
+ # # shape: (1, 3)
2691
+ # # ┌─────┬──────┬─────┐
2692
+ # # │ a ┆ b ┆ c │
2693
+ # # │ --- ┆ --- ┆ --- │
2694
+ # # │ i64 ┆ f64 ┆ i64 │
2695
+ # # ╞═════╪══════╪═════╡
2696
+ # # │ 6 ┆ 20.0 ┆ 0 │
2697
+ # # └─────┴──────┴─────┘
2698
def product
  # Apply the product aggregation to every column via the wildcard expression.
  every_column = Polars.all
  select(every_column.product)
end
537
2701
 
538
- # def quantile(quantile, interpolation: "nearest")
539
- # end
2702
+ # Aggregate the columns of this DataFrame to their quantile value.
2703
+ #
2704
+ # @param quantile [Float]
2705
+ # Quantile between 0.0 and 1.0.
2706
+ # @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
2707
+ # Interpolation method.
2708
+ #
2709
+ # @return [DataFrame]
2710
+ #
2711
+ # @example
2712
+ # df = Polars::DataFrame.new(
2713
+ # {
2714
+ # "foo" => [1, 2, 3],
2715
+ # "bar" => [6, 7, 8],
2716
+ # "ham" => ["a", "b", "c"]
2717
+ # }
2718
+ # )
2719
+ # df.quantile(0.5, interpolation: "nearest")
2720
+ # # =>
2721
+ # # shape: (1, 3)
2722
+ # # ┌─────┬─────┬──────┐
2723
+ # # │ foo ┆ bar ┆ ham │
2724
+ # # │ --- ┆ --- ┆ --- │
2725
+ # # │ f64 ┆ f64 ┆ str │
2726
+ # # ╞═════╪═════╪══════╡
2727
+ # # │ 2.0 ┆ 7.0 ┆ null │
2728
+ # # └─────┴─────┴──────┘
2729
def quantile(quantile, interpolation: "nearest")
  # Compute on the native frame, then wrap the result.
  aggregated = _df.quantile(quantile, interpolation)
  _from_rbdf(aggregated)
end
540
2732
 
541
- # def to_dummies
542
- # end
2733
+ # Get one hot encoded dummy variables.
2734
+ #
2735
+ # @param columns [Object]
2736
+ # A subset of columns to convert to dummy variables. `nil` means
2737
+ # "all columns".
2738
+ #
2739
+ # @return [DataFrame]
2740
+ #
2741
+ # @example
2742
+ # df = Polars::DataFrame.new(
2743
+ # {
2744
+ # "foo" => [1, 2],
2745
+ # "bar" => [3, 4],
2746
+ # "ham" => ["a", "b"]
2747
+ # }
2748
+ # )
2749
+ # df.to_dummies
2750
+ # # =>
2751
+ # # shape: (2, 6)
2752
+ # # ┌───────┬───────┬───────┬───────┬───────┬───────┐
2753
+ # # │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │
2754
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
2755
+ # # │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │
2756
+ # # ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡
2757
+ # # │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │
2758
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2759
+ # # │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │
2760
+ # # └───────┴───────┴───────┴───────┴───────┴───────┘
2761
def to_dummies(columns: nil)
  # A single column name is shorthand for a one-element list.
  columns = [columns] if columns.is_a?(String)
  encoded = _df.to_dummies(columns)
  _from_rbdf(encoded)
end
543
2767
 
544
- # def unique
545
- # end
2768
+ # Drop duplicate rows from this DataFrame.
2769
+ #
2770
+ # @param maintain_order [Boolean]
2771
+ # Keep the same order as the original DataFrame. This requires more work to
2772
+ # compute.
2773
+ # @param subset [Object]
2774
+ # Subset to use to compare rows.
2775
+ # @param keep ["first", "last"]
2776
+ # Which of the duplicate rows to keep (in conjunction with `subset`).
2777
+ #
2778
+ # @return [DataFrame]
2779
+ #
2780
+ # @note
2781
+ # Note that this fails if there is a column of type `List` in the DataFrame or
2782
+ # subset.
2783
+ #
2784
+ # @example
2785
+ # df = Polars::DataFrame.new(
2786
+ # {
2787
+ # "a" => [1, 1, 2, 3, 4, 5],
2788
+ # "b" => [0.5, 0.5, 1.0, 2.0, 3.0, 3.0],
2789
+ # "c" => [true, true, true, false, true, true]
2790
+ # }
2791
+ # )
2792
+ # df.unique
2793
+ # # =>
2794
+ # # shape: (5, 3)
2795
+ # # ┌─────┬─────┬───────┐
2796
+ # # │ a ┆ b ┆ c │
2797
+ # # │ --- ┆ --- ┆ --- │
2798
+ # # │ i64 ┆ f64 ┆ bool │
2799
+ # # ╞═════╪═════╪═══════╡
2800
+ # # │ 1 ┆ 0.5 ┆ true │
2801
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2802
+ # # │ 2 ┆ 1.0 ┆ true │
2803
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2804
+ # # │ 3 ┆ 2.0 ┆ false │
2805
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2806
+ # # │ 4 ┆ 3.0 ┆ true │
2807
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2808
+ # # │ 5 ┆ 3.0 ┆ true │
2809
+ # # └─────┴─────┴───────┘
2810
def unique(maintain_order: true, subset: nil, keep: "first")
  # Normalize `subset` to an Array (or leave it nil): a lone String becomes a
  # one-element list, any other non-Array value is converted with `to_a`.
  unless subset.nil? || subset.is_a?(Array)
    subset = subset.is_a?(String) ? [subset] : subset.to_a
  end

  deduplicated = _df.unique(maintain_order, subset, keep)
  _from_rbdf(deduplicated)
end
2821
+
2822
+ # Return the number of unique rows, or the number of unique row-subsets.
2823
+ #
2824
+ # @param subset [Object]
2825
+ # One or more columns/expressions that define what to count;
2826
+ # omit to return the count of unique rows.
2827
+ #
2828
+ # @return [Integer]
2829
+ #
2830
+ # @example
2831
+ # df = Polars::DataFrame.new(
2832
+ # {
2833
+ # "a" => [1, 1, 2, 3, 4, 5],
2834
+ # "b" => [0.5, 0.5, 1.0, 2.0, 3.0, 3.0],
2835
+ # "c" => [true, true, true, false, true, true]
2836
+ # }
2837
+ # )
2838
+ # df.n_unique
2839
+ # # => 5
2840
+ #
2841
+ # @example Simple columns subset
2842
+ # df.n_unique(subset: ["b", "c"])
2843
+ # # => 4
2844
+ #
2845
+ # @example Expression subset
2846
+ # df.n_unique(
2847
+ # subset: [
2848
+ # (Polars.col("a").floordiv(2)),
2849
+ # (Polars.col("c") | (Polars.col("b") >= 2))
2850
+ # ]
2851
+ # )
2852
+ # # => 3
2853
def n_unique(subset: nil)
  # A bare column name or a single expression is shorthand for a one-element
  # subset. (The check must be against String, not StringIO, otherwise a
  # column name is never converted into a column expression.)
  if subset.is_a?(String)
    subset = [Polars.col(subset)]
  elsif subset.is_a?(Expr)
    subset = [subset]
  end

  if subset.is_a?(Array) && subset.length == 1
    # A single column/expression can be counted directly.
    expr = Utils.expr_to_lit_or_expr(subset[0], str_to_lit: false)
  else
    # Several columns (or all of them when `subset` is nil) are packed into
    # a struct so that each distinct row combination is counted once.
    struct_fields = subset.nil? ? Polars.all : subset
    expr = Polars.struct(struct_fields)
  end

  df = lazy.select(expr.n_unique).collect
  df.is_empty ? 0 : df.row(0)[0]
end
549
2870
 
2871
+ # Rechunk the data in this DataFrame to a contiguous allocation.
2872
+ #
2873
+ # This will make sure all subsequent operations have optimal and predictable
2874
+ # performance.
2875
+ #
2876
+ # @return [DataFrame]
550
2877
  def rechunk
551
2878
  _from_rbdf(_df.rechunk)
552
2879
  end
553
2880
 
2881
+ # Create a new DataFrame that shows the null counts per column.
2882
+ #
2883
+ # @return [DataFrame]
2884
+ #
2885
+ # @example
2886
+ # df = Polars::DataFrame.new(
2887
+ # {
2888
+ # "foo" => [1, nil, 3],
2889
+ # "bar" => [6, 7, nil],
2890
+ # "ham" => ["a", "b", "c"]
2891
+ # }
2892
+ # )
2893
+ # df.null_count
2894
+ # # =>
2895
+ # # shape: (1, 3)
2896
+ # # ┌─────┬─────┬─────┐
2897
+ # # │ foo ┆ bar ┆ ham │
2898
+ # # │ --- ┆ --- ┆ --- │
2899
+ # # │ u32 ┆ u32 ┆ u32 │
2900
+ # # ╞═════╪═════╪═════╡
2901
+ # # │ 1 ┆ 1 ┆ 0 │
2902
+ # # └─────┴─────┴─────┘
554
2903
  def null_count
555
2904
  _from_rbdf(_df.null_count)
556
2905
  end
557
2906
 
558
- # def sample
559
- # end
2907
+ # Sample from this DataFrame.
2908
+ #
2909
+ # @param n [Integer]
2910
+ # Number of items to return. Cannot be used with `frac`. Defaults to 1 if
2911
+ # `frac` is nil.
2912
+ # @param frac [Float]
2913
+ # Fraction of items to return. Cannot be used with `n`.
2914
+ # @param with_replacement [Boolean]
2915
+ # Allow values to be sampled more than once.
2916
+ # @param shuffle [Boolean]
2917
+ # Shuffle the order of sampled data points.
2918
+ # @param seed [Integer]
2919
+ # Seed for the random number generator. If set to nil (default), a random
2920
+ # seed is used.
2921
+ #
2922
+ # @return [DataFrame]
2923
+ #
2924
+ # @example
2925
+ # df = Polars::DataFrame.new(
2926
+ # {
2927
+ # "foo" => [1, 2, 3],
2928
+ # "bar" => [6, 7, 8],
2929
+ # "ham" => ["a", "b", "c"]
2930
+ # }
2931
+ # )
2932
+ # df.sample(n: 2, seed: 0)
2933
+ # # =>
2934
+ # # shape: (2, 3)
2935
+ # # ┌─────┬─────┬─────┐
2936
+ # # │ foo ┆ bar ┆ ham │
2937
+ # # │ --- ┆ --- ┆ --- │
2938
+ # # │ i64 ┆ i64 ┆ str │
2939
+ # # ╞═════╪═════╪═════╡
2940
+ # # │ 3 ┆ 8 ┆ c │
2941
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
2942
+ # # │ 2 ┆ 7 ┆ b │
2943
+ # # └─────┴─────┴─────┘
2944
def sample(
  n: nil,
  frac: nil,
  with_replacement: false,
  shuffle: false,
  seed: nil
)
  if !n.nil? && !frac.nil?
    raise ArgumentError, "cannot specify both `n` and `frac`"
  end

  # Fraction-based sampling must return here; without the early return the
  # result was discarded and the method fell through to `sample_n` with n = 1.
  unless frac.nil?
    return _from_rbdf(
      _df.sample_frac(frac, with_replacement, shuffle, seed)
    )
  end

  # Neither `n` nor `frac` given: sample a single row.
  n = 1 if n.nil?
  _from_rbdf(_df.sample_n(n, with_replacement, shuffle, seed))
end
560
2966
 
561
2967
  # def fold
562
2968
  # end
563
2969
 
564
- # def row
565
- # end
2970
+ # Get a row as tuple, either by index or by predicate.
2971
+ #
2972
+ # @param index [Object]
2973
+ # Row index.
2974
+ # @param by_predicate [Object]
2975
+ # Select the row according to a given expression/predicate.
2976
+ #
2977
+ # @return [Object]
2978
+ #
2979
+ # @note
2980
+ # The `index` and `by_predicate` params are mutually exclusive. Additionally,
2981
+ # to ensure clarity, the `by_predicate` parameter must be supplied by keyword.
2982
+ #
2983
+ # When using `by_predicate` it is an error condition if anything other than
2984
+ # one row is returned; more than one row raises `TooManyRowsReturned`, and
2985
+ # zero rows will raise `NoRowsReturned` (both inherit from `RowsException`).
2986
+ #
2987
+ # @example Return the row at the given index
2988
+ # df = Polars::DataFrame.new(
2989
+ # {
2990
+ # "foo" => [1, 2, 3],
2991
+ # "bar" => [6, 7, 8],
2992
+ # "ham" => ["a", "b", "c"]
2993
+ # }
2994
+ # )
2995
+ # df.row(2)
2996
+ # # => [3, 8, "c"]
2997
+ #
2998
+ # @example Return the row that matches the given predicate
2999
+ # df.row(by_predicate: Polars.col("ham") == "b")
3000
+ # # => [2, 7, "b"]
3001
def row(index = nil, by_predicate: nil)
  if !index.nil? && !by_predicate.nil?
    raise ArgumentError, "Cannot set both 'index' and 'by_predicate'; mutually exclusive"
  elsif index.is_a?(Expr)
    # Guard against passing the predicate positionally.
    raise TypeError, "Expressions should be passed to the 'by_predicate' param"
  elsif index.is_a?(Integer)
    _df.row_tuple(index)
  elsif by_predicate.is_a?(Expr)
    rows = filter(by_predicate).rows
    n_rows = rows.length
    # The predicate must match exactly one row.
    if n_rows > 1
      raise TooManyRowsReturned, "Predicate #{by_predicate} returned #{n_rows} rows"
    elsif n_rows == 0
      # Interpolate the predicate (previously a literal Python f-string leftover).
      raise NoRowsReturned, "Predicate #{by_predicate} returned no rows"
    end
    rows[0]
  else
    raise ArgumentError, "One of 'index' or 'by_predicate' must be set"
  end
end
566
3021
 
567
- # def rows
568
- # end
3022
+ # Convert columnar data to rows as Ruby arrays.
3023
+ #
3024
+ # @return [Array]
3025
+ #
3026
+ # @example
3027
+ # df = Polars::DataFrame.new(
3028
+ # {
3029
+ # "a" => [1, 3, 5],
3030
+ # "b" => [2, 4, 6]
3031
+ # }
3032
+ # )
3033
+ # df.rows
3034
+ # # => [[1, 2], [3, 4], [5, 6]]
3035
def rows
  # The native frame produces one Ruby array per row.
  native = _df
  native.row_tuples
end
569
3038
 
570
- # def shrink_to_fit
571
- # end
3039
+ # Shrink DataFrame memory usage.
3040
+ #
3041
+ # Shrinks to fit the exact capacity needed to hold the data.
3042
+ #
3043
+ # @return [DataFrame]
3044
def shrink_to_fit(in_place: false)
  # Operate on this frame when asked to, otherwise on a clone of it.
  target = in_place ? self : clone
  target._df.shrink_to_fit
  target
end
572
3054
 
573
- # def take_every
574
- # end
3055
+ # Take every nth row in the DataFrame and return as a new DataFrame.
3056
+ #
3057
+ # @return [DataFrame]
3058
+ #
3059
+ # @example
3060
+ # s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]})
3061
+ # s.take_every(2)
3062
+ # # =>
3063
+ # # shape: (2, 2)
3064
+ # # ┌─────┬─────┐
3065
+ # # │ a ┆ b │
3066
+ # # │ --- ┆ --- │
3067
+ # # │ i64 ┆ i64 │
3068
+ # # ╞═════╪═════╡
3069
+ # # │ 1 ┆ 5 │
3070
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
3071
+ # # │ 3 ┆ 7 │
3072
+ # # └─────┴─────┘
3073
def take_every(n)
  # Select every nth value from all columns using the wildcard column.
  all_columns = Utils.col("*")
  select(all_columns.take_every(n))
end
575
3076
 
576
3077
  # def hash_rows
577
3078
  # end
578
3079
 
579
- # def interpolate
580
- # end
3080
+ # Interpolate intermediate values. The interpolation method is linear.
3081
+ #
3082
+ # @return [DataFrame]
3083
+ #
3084
+ # @example
3085
+ # df = Polars::DataFrame.new(
3086
+ # {
3087
+ # "foo" => [1, nil, 9, 10],
3088
+ # "bar" => [6, 7, 9, nil],
3089
+ # "baz" => [1, nil, nil, 9]
3090
+ # }
3091
+ # )
3092
+ # df.interpolate
3093
+ # # =>
3094
+ # # shape: (4, 3)
3095
+ # # ┌─────┬──────┬─────┐
3096
+ # # │ foo ┆ bar ┆ baz │
3097
+ # # │ --- ┆ --- ┆ --- │
3098
+ # # │ i64 ┆ i64 ┆ i64 │
3099
+ # # ╞═════╪══════╪═════╡
3100
+ # # │ 1 ┆ 6 ┆ 1 │
3101
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
3102
+ # # │ 5 ┆ 7 ┆ 3 │
3103
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
3104
+ # # │ 9 ┆ 9 ┆ 6 │
3105
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
3106
+ # # │ 10 ┆ null ┆ 9 │
3107
+ # # └─────┴──────┴─────┘
3108
def interpolate
  # Interpolate every column using the wildcard column expression.
  all_columns = Utils.col("*")
  select(all_columns.interpolate)
end
581
3111
 
3112
+ # Check if the dataframe is empty.
3113
+ #
3114
+ # @return [Boolean]
3115
+ #
3116
+ # @example
3117
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
3118
+ # df.is_empty
3119
+ # # => false
3120
+ # df.filter(Polars.col("foo") > 99).is_empty
3121
+ # # => true
582
3122
  def is_empty
583
3123
  height == 0
584
3124
  end
585
3125
  alias_method :empty?, :is_empty
586
3126
 
587
- # def to_struct(name)
588
- # end
3127
+ # Convert a `DataFrame` to a `Series` of type `Struct`.
3128
+ #
3129
+ # @param name [String]
3130
+ # Name for the struct Series
3131
+ #
3132
+ # @return [Series]
3133
+ #
3134
+ # @example
3135
+ # df = Polars::DataFrame.new(
3136
+ # {
3137
+ # "a" => [1, 2, 3, 4, 5],
3138
+ # "b" => ["one", "two", "three", "four", "five"]
3139
+ # }
3140
+ # )
3141
+ # df.to_struct("nums")
3142
+ # # =>
3143
+ # # shape: (5,)
3144
+ # # Series: 'nums' [struct[2]]
3145
+ # # [
3146
+ # # {1,"one"}
3147
+ # # {2,"two"}
3148
+ # # {3,"three"}
3149
+ # # {4,"four"}
3150
+ # # {5,"five"}
3151
+ # # ]
3152
def to_struct(name)
  # Build the native struct series, then wrap it as a Series.
  native_series = _df.to_struct(name)
  Utils.wrap_s(native_series)
end
589
3155
 
590
- # def unnest
591
- # end
3156
+ # Decompose a struct into its fields.
3157
+ #
3158
+ # The fields will be inserted into the `DataFrame` on the location of the
3159
+ # `struct` type.
3160
+ #
3161
+ # @param names [Object]
3162
+ # Names of the struct columns that will be decomposed by its fields
3163
+ #
3164
+ # @return [DataFrame]
3165
+ #
3166
+ # @example
3167
+ # df = Polars::DataFrame.new(
3168
+ # {
3169
+ # "before" => ["foo", "bar"],
3170
+ # "t_a" => [1, 2],
3171
+ # "t_b" => ["a", "b"],
3172
+ # "t_c" => [true, nil],
3173
+ # "t_d" => [[1, 2], [3]],
3174
+ # "after" => ["baz", "womp"]
3175
+ # }
3176
+ # ).select(["before", Polars.struct(Polars.col("^t_.$")).alias("t_struct"), "after"])
3177
+ # df.unnest("t_struct")
3178
+ # # =>
3179
+ # # shape: (2, 6)
3180
+ # # ┌────────┬─────┬─────┬──────┬───────────┬───────┐
3181
+ # # │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │
3182
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3183
+ # # │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │
3184
+ # # ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
3185
+ # # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │
3186
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
3187
+ # # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
3188
+ # # └────────┴─────┴─────┴──────┴───────────┴───────┘
3189
def unnest(names)
  # A single column name is shorthand for a one-element list.
  names = [names] if names.is_a?(String)
  flattened = _df.unnest(names)
  _from_rbdf(flattened)
end
592
3195
 
593
3196
  private
594
3197
 
@@ -597,15 +3200,55 @@ module Polars
597
3200
  self._df = _df._clone
598
3201
  end
599
3202
 
600
- def hash_to_rbdf(data)
3203
def hash_to_rbdf(data, columns: nil)
  # Without a column spec the hash is handed straight to the native reader.
  return RbDataFrame.read_hash(data) if columns.nil?

  columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)

  data_series =
    if data.empty? && dtypes
      # No data: build one empty series per declared column.
      columns.map { |name| Series.new(name, [], dtype: dtypes[name])._s }
    else
      data.map { |name, values| Series.new(name, values, dtype: dtypes[name])._s }
    end

  data_series = _handle_columns_arg(data_series, columns: columns)
  RbDataFrame.new(data_series)
end
603
3218
 
604
- def sequence_to_rbdf(data)
3219
def _unpack_columns(columns, lookup_names: nil)
  # Returns the hash's keys together with the hash itself; `lookup_names`
  # is accepted but not used here.
  names = columns.keys
  [names, columns]
end
3222
+
3223
def _handle_columns_arg(data, columns: nil)
  # Nothing to rename when no column names were supplied.
  return data if columns.nil?

  # No data at all: create one series per requested column name.
  return columns.map { |c| Series.new(c, nil)._s } if data.empty?

  if data.length != columns.length
    raise ArgumentError, "Dimensions of columns arg must match data dimensions."
  end

  # not in-place?
  columns.each_with_index { |c, i| data[i].rename(c) }
  data
end
3240
+
3241
def sequence_to_rbdf(data, columns: nil, orient: nil)
  # Column overrides and orientation are not handled here.
  raise Todo if columns || orient

  native_series = data.map { |series| series._s }
  RbDataFrame.new(native_series)
end
 
608
- def series_to_rbdf(data)
3248
def series_to_rbdf(data, columns: nil)
  # Column overrides are not handled here.
  raise Todo if columns

  native_series = data._s
  RbDataFrame.new([native_series])
end
611
3254
 
@@ -616,5 +3259,75 @@ module Polars
616
3259
  def _from_rbdf(rb_df)
617
3260
  self.class._from_rbdf(rb_df)
618
3261
  end
3262
+
3263
# Dispatch a comparison to the DataFrame or non-DataFrame implementation,
# depending on the type of `other`. `op` is passed through unchanged.
def _comp(other, op)
  return _compare_to_other_df(other, op) if other.is_a?(DataFrame)

  _compare_to_non_df(other, op)
end
3270
+
3271
# Compare this frame with another DataFrame, column by column.
#
# The other frame's columns are given a temporary suffix, both frames are
# concatenated horizontally, and one comparison expression per column
# (own column vs. suffixed column) is selected from the combined frame.
#
# @param other [DataFrame] Frame with the same columns and shape.
# @param op [String] One of "eq", "neq", "gt", "lt", "gt_eq", "lt_eq".
# @return [DataFrame] Result of selecting the comparison expressions.
# @raise [ArgumentError]
#   If the columns or shapes differ, or `op` is not a known operator.
def _compare_to_other_df(other, op)
  if columns != other.columns
    raise ArgumentError, "DataFrame columns do not match"
  end
  if shape != other.shape
    raise ArgumentError, "DataFrame dimensions do not match"
  end

  suffix = "__POLARS_CMP_OTHER"
  other_renamed = other.select(Polars.all.suffix(suffix))
  combined = Polars.concat([self, other_renamed], how: "horizontal")

  # Translate the operator name into the Ruby operator method to invoke.
  operator =
    case op
    when "eq" then :==
    when "neq" then :!=
    when "gt" then :>
    when "lt" then :<
    when "gt_eq" then :>=
    when "lt_eq" then :<=
    else
      raise ArgumentError, "got unexpected comparison operator: #{op}"
    end

  expr = columns.map do |n|
    Polars.col(n).public_send(operator, Polars.col("#{n}#{suffix}"))
  end

  combined.select(expr)
end
3302
+
3303
# Compare every column of this frame against a non-DataFrame value by
# selecting `Polars.all <operator> other`.
#
# `op` must be one of "eq", "neq", "gt", "lt", "gt_eq", "lt_eq"; anything
# else raises ArgumentError.
def _compare_to_non_df(other, op)
  operators = {
    "eq" => :==,
    "neq" => :!=,
    "gt" => :>,
    "lt" => :<,
    "gt_eq" => :>=,
    "lt_eq" => :<=
  }

  operator = operators[op]
  if operator.nil?
    raise ArgumentError, "got unexpected comparison operator: #{op}"
  end

  select(Polars.all.public_send(operator, other))
end
3321
+
3322
# Normalize the right-hand operand to a Series.
#
# A Series is returned unchanged, an Array is rejected with ArgumentError,
# and any other value is wrapped in a one-element Series with an empty name.
def _prepare_other_arg(other)
  return other if other.is_a?(Series)
  raise ArgumentError, "Operation not supported." if other.is_a?(Array)

  Series.new("", [other])
end
619
3332
  end
620
3333
  end