polars-df 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +8 -0
  4. data/Cargo.lock +2 -1
  5. data/README.md +1 -1
  6. data/ext/polars/Cargo.toml +7 -1
  7. data/ext/polars/src/batched_csv.rs +120 -0
  8. data/ext/polars/src/conversion.rs +139 -6
  9. data/ext/polars/src/dataframe.rs +360 -15
  10. data/ext/polars/src/error.rs +9 -0
  11. data/ext/polars/src/file.rs +8 -7
  12. data/ext/polars/src/lazy/apply.rs +7 -0
  13. data/ext/polars/src/lazy/dataframe.rs +135 -3
  14. data/ext/polars/src/lazy/dsl.rs +97 -2
  15. data/ext/polars/src/lazy/meta.rs +1 -1
  16. data/ext/polars/src/lazy/mod.rs +1 -0
  17. data/ext/polars/src/lib.rs +227 -12
  18. data/ext/polars/src/series.rs +190 -38
  19. data/ext/polars/src/set.rs +91 -0
  20. data/ext/polars/src/utils.rs +19 -0
  21. data/lib/polars/batched_csv_reader.rb +96 -0
  22. data/lib/polars/cat_expr.rb +39 -0
  23. data/lib/polars/data_frame.rb +2813 -100
  24. data/lib/polars/date_time_expr.rb +1282 -7
  25. data/lib/polars/exceptions.rb +20 -0
  26. data/lib/polars/expr.rb +631 -11
  27. data/lib/polars/expr_dispatch.rb +14 -0
  28. data/lib/polars/functions.rb +219 -0
  29. data/lib/polars/group_by.rb +517 -0
  30. data/lib/polars/io.rb +763 -4
  31. data/lib/polars/lazy_frame.rb +1415 -67
  32. data/lib/polars/lazy_functions.rb +430 -9
  33. data/lib/polars/lazy_group_by.rb +79 -0
  34. data/lib/polars/list_expr.rb +5 -0
  35. data/lib/polars/meta_expr.rb +21 -0
  36. data/lib/polars/series.rb +2244 -192
  37. data/lib/polars/slice.rb +104 -0
  38. data/lib/polars/string_expr.rb +663 -2
  39. data/lib/polars/struct_expr.rb +73 -0
  40. data/lib/polars/utils.rb +76 -3
  41. data/lib/polars/version.rb +2 -1
  42. data/lib/polars/when.rb +1 -0
  43. data/lib/polars/when_then.rb +1 -0
  44. data/lib/polars.rb +8 -2
  45. metadata +12 -2
@@ -1,36 +1,249 @@
1
1
  module Polars
2
+ # Representation of a Lazy computation graph/query against a DataFrame.
2
3
  class LazyFrame
4
+ # @private
3
5
  attr_accessor :_ldf
4
6
 
7
+ # @private
5
8
  def self._from_rbldf(rb_ldf)
6
9
  ldf = LazyFrame.allocate
7
10
  ldf._ldf = rb_ldf
8
11
  ldf
9
12
  end
10
13
 
11
- # def columns
12
- # end
14
+ # @private
15
+ def self._scan_csv(
16
+ file,
17
+ has_header: true,
18
+ sep: ",",
19
+ comment_char: nil,
20
+ quote_char: '"',
21
+ skip_rows: 0,
22
+ dtypes: nil,
23
+ null_values: nil,
24
+ ignore_errors: false,
25
+ cache: true,
26
+ with_column_names: nil,
27
+ infer_schema_length: 100,
28
+ n_rows: nil,
29
+ encoding: "utf8",
30
+ low_memory: false,
31
+ rechunk: true,
32
+ skip_rows_after_header: 0,
33
+ row_count_name: nil,
34
+ row_count_offset: 0,
35
+ parse_dates: false,
36
+ eol_char: "\n"
37
+ )
38
+ dtype_list = nil
39
+ if !dtypes.nil?
40
+ dtype_list = []
41
+ dtypes.each do |k, v|
42
+ dtype_list << [k, Utils.rb_type_to_dtype(v)]
43
+ end
44
+ end
45
+ processed_null_values = Utils._process_null_values(null_values)
13
46
 
14
- # def dtypes
15
- # end
47
+ _from_rbldf(
48
+ RbLazyFrame.new_from_csv(
49
+ file,
50
+ sep,
51
+ has_header,
52
+ ignore_errors,
53
+ skip_rows,
54
+ n_rows,
55
+ cache,
56
+ dtype_list,
57
+ low_memory,
58
+ comment_char,
59
+ quote_char,
60
+ processed_null_values,
61
+ infer_schema_length,
62
+ with_column_names,
63
+ rechunk,
64
+ skip_rows_after_header,
65
+ encoding,
66
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
67
+ parse_dates,
68
+ eol_char
69
+ )
70
+ )
71
+ end
16
72
 
17
- # def schema
18
- # end
73
+ # @private
74
+ def self._scan_parquet(
75
+ file,
76
+ n_rows: nil,
77
+ cache: true,
78
+ parallel: "auto",
79
+ rechunk: true,
80
+ row_count_name: nil,
81
+ row_count_offset: 0,
82
+ storage_options: nil,
83
+ low_memory: false
84
+ )
85
+ _from_rbldf(
86
+ RbLazyFrame.new_from_parquet(
87
+ file,
88
+ n_rows,
89
+ cache,
90
+ parallel,
91
+ rechunk,
92
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
93
+ low_memory
94
+ )
95
+ )
96
+ end
97
+
98
+ # @private
99
+ def self._scan_ipc(
100
+ file,
101
+ n_rows: nil,
102
+ cache: true,
103
+ rechunk: true,
104
+ row_count_name: nil,
105
+ row_count_offset: 0,
106
+ storage_options: nil,
107
+ memory_map: true
108
+ )
109
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
110
+ file = Utils.format_path(file)
111
+ end
19
112
 
20
- # def width
113
+ _from_rbldf(
114
+ RbLazyFrame.new_from_ipc(
115
+ file,
116
+ n_rows,
117
+ cache,
118
+ rechunk,
119
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
120
+ memory_map
121
+ )
122
+ )
123
+ end
124
+
125
+ # @private
126
+ def self._scan_ndjson(
127
+ file,
128
+ infer_schema_length: nil,
129
+ batch_size: nil,
130
+ n_rows: nil,
131
+ low_memory: false,
132
+ rechunk: true,
133
+ row_count_name: nil,
134
+ row_count_offset: 0
135
+ )
136
+ _from_rbldf(
137
+ RbLazyFrame.new_from_ndjson(
138
+ file,
139
+ infer_schema_length,
140
+ batch_size,
141
+ n_rows,
142
+ low_memory,
143
+ rechunk,
144
+ Utils._prepare_row_count_args(row_count_name, row_count_offset)
145
+ )
146
+ )
147
+ end
148
+
149
+ # def self.from_json
21
150
  # end
22
151
 
23
- # def include?(key)
152
+ # def self.read_json
24
153
  # end
25
154
 
155
+ # Get or set column names.
156
+ #
157
+ # @return [Array]
158
+ #
159
+ # @example
160
+ # df = (
161
+ # Polars::DataFrame.new(
162
+ # {
163
+ # "foo" => [1, 2, 3],
164
+ # "bar" => [6, 7, 8],
165
+ # "ham" => ["a", "b", "c"]
166
+ # }
167
+ # )
168
+ # .lazy
169
+ # .select(["foo", "bar"])
170
+ # )
171
+ # df.columns
172
+ # # => ["foo", "bar"]
173
+ def columns
174
+ _ldf.columns
175
+ end
176
+
177
+ # Get dtypes of columns in LazyFrame.
178
+ #
179
+ # @return [Array]
180
+ #
181
+ # @example
182
+ # lf = Polars::DataFrame.new(
183
+ # {
184
+ # "foo" => [1, 2, 3],
185
+ # "bar" => [6.0, 7.0, 8.0],
186
+ # "ham" => ["a", "b", "c"]
187
+ # }
188
+ # ).lazy
189
+ # lf.dtypes
190
+ # # => [:i64, :f64, :str]
191
+ def dtypes
192
+ _ldf.dtypes
193
+ end
194
+
195
+ # Get the schema.
196
+ #
197
+ # @return [Hash]
198
+ #
199
+ # @example
200
+ # lf = Polars::DataFrame.new(
201
+ # {
202
+ # "foo" => [1, 2, 3],
203
+ # "bar" => [6.0, 7.0, 8.0],
204
+ # "ham" => ["a", "b", "c"]
205
+ # }
206
+ # ).lazy
207
+ # lf.schema
208
+ # # => {"foo"=>:i64, "bar"=>:f64, "ham"=>:str}
209
+ def schema
210
+ _ldf.schema
211
+ end
212
+
213
+ # Get the width of the LazyFrame.
214
+ #
215
+ # @return [Integer]
216
+ #
217
+ # @example
218
+ # lf = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]}).lazy
219
+ # lf.width
220
+ # # => 2
221
+ def width
222
+ _ldf.width
223
+ end
224
+
225
+ # Check if LazyFrame includes key.
226
+ #
227
+ # @return [Boolean]
228
+ def include?(key)
229
+ columns.include?(key)
230
+ end
231
+
26
232
  # clone handled by initialize_copy
27
233
 
28
234
  # def [](item)
29
235
  # end
30
236
 
31
- # def to_s
32
- # end
33
- # alias_method :inspect, :to_s
237
+ # Returns a string representing the LazyFrame.
238
+ #
239
+ # @return [String]
240
+ def to_s
241
+ <<~EOS
242
+ naive plan: (run LazyFrame#describe_optimized_plan to see the optimized plan)
243
+
244
+ #{describe_plan}
245
+ EOS
246
+ end
34
247
 
35
248
  # def write_json
36
249
  # end
@@ -38,21 +251,125 @@ module Polars
38
251
  # def pipe
39
252
  # end
40
253
 
41
- # def describe_plan
42
- # end
254
+ # Create a string representation of the unoptimized query plan.
255
+ #
256
+ # @return [String]
257
+ def describe_plan
258
+ _ldf.describe_plan
259
+ end
43
260
 
261
+ # Create a string representation of the optimized query plan.
262
+ #
263
+ # @return [String]
44
264
  # def describe_optimized_plan
45
265
  # end
46
266
 
47
267
  # def show_graph
48
268
  # end
49
269
 
50
- # def sort
51
- # end
270
+ # Sort the DataFrame.
271
+ #
272
+ # Sorting can be done by:
273
+ #
274
+ # - A single column name
275
+ # - An expression
276
+ # - Multiple expressions
277
+ #
278
+ # @param by [Object]
279
+ # Column (expressions) to sort by.
280
+ # @param reverse [Boolean]
281
+ # Sort in descending order.
282
+ # @param nulls_last [Boolean]
283
+ # Place null values last. Can only be used if sorted by a single column.
284
+ #
285
+ # @return [LazyFrame]
286
+ #
287
+ # @example
288
+ # df = Polars::DataFrame.new(
289
+ # {
290
+ # "foo" => [1, 2, 3],
291
+ # "bar" => [6.0, 7.0, 8.0],
292
+ # "ham" => ["a", "b", "c"]
293
+ # }
294
+ # ).lazy
295
+ # df.sort("foo", reverse: true).collect
296
+ # # =>
297
+ # # shape: (3, 3)
298
+ # # ┌─────┬─────┬─────┐
299
+ # # │ foo ┆ bar ┆ ham │
300
+ # # │ --- ┆ --- ┆ --- │
301
+ # # │ i64 ┆ f64 ┆ str │
302
+ # # ╞═════╪═════╪═════╡
303
+ # # │ 3 ┆ 8.0 ┆ c │
304
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
305
+ # # │ 2 ┆ 7.0 ┆ b │
306
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
307
+ # # │ 1 ┆ 6.0 ┆ a │
308
+ # # └─────┴─────┴─────┘
309
def sort(by, reverse: false, nulls_last: false)
  # A single column name goes straight to the native `sort`. The explicit
  # `return` matters: without it the LazyFrame built here is discarded and
  # execution falls through to the expression-based path below.
  if by.is_a?(String)
    return _from_rbldf(_ldf.sort(by, reverse, nulls_last))
  end

  # On the expression path a lone boolean `reverse` is wrapped in an array
  # before being handed to `sort_by_exprs`.
  reverse = [reverse] if Utils.bool?(reverse)

  by = Utils.selection_to_rbexpr_list(by)
  _from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last))
end
52
320
 
53
321
  # def profile
54
322
  # end
55
323
 
324
+ # Collect into a DataFrame.
325
+ #
326
+ # Note: use {#fetch} if you want to run your query on the first `n` rows
327
+ # only. This can be a huge time saver in debugging queries.
328
+ #
329
+ # @param type_coercion [Boolean]
330
+ # Do type coercion optimization.
331
+ # @param predicate_pushdown [Boolean]
332
+ # Do predicate pushdown optimization.
333
+ # @param projection_pushdown [Boolean]
334
+ # Do projection pushdown optimization.
335
+ # @param simplify_expression [Boolean]
336
+ # Run simplify expressions optimization.
337
+ # @param string_cache [Boolean]
338
+ # This argument is deprecated. Please set the string cache globally.
339
+ # The argument will be ignored
340
+ # @param no_optimization [Boolean]
341
+ # Turn off (certain) optimizations.
342
+ # @param slice_pushdown [Boolean]
343
+ # Slice pushdown optimization.
344
+ # @param common_subplan_elimination [Boolean]
345
+ # Will try to cache branching subplans that occur on self-joins or unions.
346
+ # @param allow_streaming [Boolean]
347
+ # Run parts of the query in a streaming fashion (this is in an alpha state)
348
+ #
349
+ # @return [DataFrame]
350
+ #
351
+ # @example
352
+ # df = Polars::DataFrame.new(
353
+ # {
354
+ # "a" => ["a", "b", "a", "b", "b", "c"],
355
+ # "b" => [1, 2, 3, 4, 5, 6],
356
+ # "c" => [6, 5, 4, 3, 2, 1]
357
+ # }
358
+ # ).lazy
359
+ # df.groupby("a", maintain_order: true).agg(Polars.all.sum).collect
360
+ # # =>
361
+ # # shape: (3, 3)
362
+ # # ┌─────┬─────┬─────┐
363
+ # # │ a ┆ b ┆ c │
364
+ # # │ --- ┆ --- ┆ --- │
365
+ # # │ str ┆ i64 ┆ i64 │
366
+ # # ╞═════╪═════╪═════╡
367
+ # # │ a ┆ 4 ┆ 10 │
368
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
369
+ # # │ b ┆ 11 ┆ 10 │
370
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
371
+ # # │ c ┆ 6 ┆ 1 │
372
+ # # └─────┴─────┴─────┘
56
373
  def collect(
57
374
  type_coercion: true,
58
375
  predicate_pushdown: true,
@@ -87,19 +404,184 @@ module Polars
87
404
  Utils.wrap_df(ldf.collect)
88
405
  end
89
406
 
90
- # def fetch
91
- # end
407
+ # Collect a small number of rows for debugging purposes.
408
+ #
409
+ # Fetch is like a {#collect} operation, but it overwrites the number of rows
410
+ # read by every scan operation. This is a utility that helps debug a query on a
411
+ # smaller number of rows.
412
+ #
413
+ # Note that the fetch does not guarantee the final number of rows in the
414
+ # DataFrame. Filter, join operations and a lower number of rows available in the
415
+ # scanned file influence the final number of rows.
416
+ #
417
+ # @param n_rows [Integer]
418
+ # Collect n_rows from the data sources.
419
+ # @param type_coercion [Boolean]
420
+ # Run type coercion optimization.
421
+ # @param predicate_pushdown [Boolean]
422
+ # Run predicate pushdown optimization.
423
+ # @param projection_pushdown [Boolean]
424
+ # Run projection pushdown optimization.
425
+ # @param simplify_expression [Boolean]
426
+ # Run simplify expressions optimization.
427
+ # @param string_cache [Boolean]
428
+ # This argument is deprecated. Please set the string cache globally.
429
+ # The argument will be ignored
430
+ # @param no_optimization [Boolean]
431
+ # Turn off optimizations.
432
+ # @param slice_pushdown [Boolean]
433
+ # Slice pushdown optimization
434
+ # @param common_subplan_elimination [Boolean]
435
+ # Will try to cache branching subplans that occur on self-joins or unions.
436
+ # @param allow_streaming [Boolean]
437
+ # Run parts of the query in a streaming fashion (this is in an alpha state)
438
+ #
439
+ # @return [DataFrame]
440
+ #
441
+ # @example
442
+ # df = Polars::DataFrame.new(
443
+ # {
444
+ # "a" => ["a", "b", "a", "b", "b", "c"],
445
+ # "b" => [1, 2, 3, 4, 5, 6],
446
+ # "c" => [6, 5, 4, 3, 2, 1]
447
+ # }
448
+ # ).lazy
449
+ # df.groupby("a", maintain_order: true).agg(Polars.all.sum).fetch(2)
450
+ # # =>
451
+ # # shape: (2, 3)
452
+ # # ┌─────┬─────┬─────┐
453
+ # # │ a ┆ b ┆ c │
454
+ # # │ --- ┆ --- ┆ --- │
455
+ # # │ str ┆ i64 ┆ i64 │
456
+ # # ╞═════╪═════╪═════╡
457
+ # # │ a ┆ 1 ┆ 6 │
458
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
459
+ # # │ b ┆ 2 ┆ 5 │
460
+ # # └─────┴─────┴─────┘
461
+ def fetch(
462
+ n_rows = 500,
463
+ type_coercion: true,
464
+ predicate_pushdown: true,
465
+ projection_pushdown: true,
466
+ simplify_expression: true,
467
+ string_cache: false,
468
+ no_optimization: false,
469
+ slice_pushdown: true,
470
+ common_subplan_elimination: true,
471
+ allow_streaming: false
472
+ )
473
+ if no_optimization
474
+ predicate_pushdown = false
475
+ projection_pushdown = false
476
+ slice_pushdown = false
477
+ common_subplan_elimination = false
478
+ end
92
479
 
480
+ ldf = _ldf.optimization_toggle(
481
+ type_coercion,
482
+ predicate_pushdown,
483
+ projection_pushdown,
484
+ simplify_expression,
485
+ slice_pushdown,
486
+ common_subplan_elimination,
487
+ allow_streaming
488
+ )
489
+ Utils.wrap_df(ldf.fetch(n_rows))
490
+ end
491
+
492
+ # Return lazy representation, i.e. itself.
493
+ #
494
+ # Useful for writing code that expects either a `DataFrame` or
495
+ # `LazyFrame`.
496
+ #
497
+ # @return [LazyFrame]
498
+ #
499
+ # @example
500
+ # df = Polars::DataFrame.new(
501
+ # {
502
+ # "a" => [nil, 2, 3, 4],
503
+ # "b" => [0.5, nil, 2.5, 13],
504
+ # "c" => [true, true, false, nil]
505
+ # }
506
+ # )
507
+ # df.lazy
93
508
  def lazy
94
509
  self
95
510
  end
96
511
 
97
- # def cache
98
- # end
512
+ # Cache the result once the execution of the physical plan hits this node.
513
+ #
514
+ # @return [LazyFrame]
515
+ def cache
516
+ _from_rbldf(_ldf.cache)
517
+ end
99
518
 
100
- # def cleared
101
- # end
519
+ # Create an empty copy of the current LazyFrame.
520
+ #
521
+ # The copy has an identical schema but no data.
522
+ #
523
+ # @return [LazyFrame]
524
+ #
525
+ # @example
526
+ # df = Polars::DataFrame.new(
527
+ # {
528
+ # "a" => [nil, 2, 3, 4],
529
+ # "b" => [0.5, nil, 2.5, 13],
530
+ # "c" => [true, true, false, nil],
531
+ # }
532
+ # ).lazy
533
+ # df.cleared.fetch
534
+ # # =>
535
+ # # shape: (0, 3)
536
+ # # ┌─────┬─────┬──────┐
537
+ # # │ a ┆ b ┆ c │
538
+ # # │ --- ┆ --- ┆ --- │
539
+ # # │ i64 ┆ f64 ┆ bool │
540
+ # # ╞═════╪═════╪══════╡
541
+ # # └─────┴─────┴──────┘
542
+ def cleared
543
+ DataFrame.new(columns: schema).lazy
544
+ end
102
545
 
546
+ # Filter the rows in the DataFrame based on a predicate expression.
547
+ #
548
+ # @param predicate [Object]
549
+ # Expression that evaluates to a boolean Series.
550
+ #
551
+ # @return [LazyFrame]
552
+ #
553
+ # @example Filter on one condition:
554
+ # lf = Polars::DataFrame.new(
555
+ # {
556
+ # "foo" => [1, 2, 3],
557
+ # "bar" => [6, 7, 8],
558
+ # "ham" => ["a", "b", "c"]
559
+ # }
560
+ # ).lazy
561
+ # lf.filter(Polars.col("foo") < 3).collect()
562
+ # # =>
563
+ # # shape: (2, 3)
564
+ # # ┌─────┬─────┬─────┐
565
+ # # │ foo ┆ bar ┆ ham │
566
+ # # │ --- ┆ --- ┆ --- │
567
+ # # │ i64 ┆ i64 ┆ str │
568
+ # # ╞═════╪═════╪═════╡
569
+ # # │ 1 ┆ 6 ┆ a │
570
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
571
+ # # │ 2 ┆ 7 ┆ b │
572
+ # # └─────┴─────┴─────┘
573
+ #
574
+ # @example Filter on multiple conditions:
575
+ # lf.filter((Polars.col("foo") < 3) & (Polars.col("ham") == "a")).collect
576
+ # # =>
577
+ # # shape: (1, 3)
578
+ # # ┌─────┬─────┬─────┐
579
+ # # │ foo ┆ bar ┆ ham │
580
+ # # │ --- ┆ --- ┆ --- │
581
+ # # │ i64 ┆ i64 ┆ str │
582
+ # # ╞═════╪═════╪═════╡
583
+ # # │ 1 ┆ 6 ┆ a │
584
+ # # └─────┴─────┴─────┘
103
585
  def filter(predicate)
104
586
  _from_rbldf(
105
587
  _ldf.filter(
@@ -108,11 +590,136 @@ module Polars
108
590
  )
109
591
  end
110
592
 
593
+ # Select columns from this DataFrame.
594
+ #
595
+ # @param exprs [Object]
596
+ # Column or columns to select.
597
+ #
598
+ # @return [LazyFrame]
599
+ #
600
+ # @example
601
+ # df = Polars::DataFrame.new(
602
+ # {
603
+ # "foo" => [1, 2, 3],
604
+ # "bar" => [6, 7, 8],
605
+ # "ham" => ["a", "b", "c"],
606
+ # }
607
+ # ).lazy
608
+ # df.select("foo").collect
609
+ # # =>
610
+ # # shape: (3, 1)
611
+ # # ┌─────┐
612
+ # # │ foo │
613
+ # # │ --- │
614
+ # # │ i64 │
615
+ # # ╞═════╡
616
+ # # │ 1 │
617
+ # # ├╌╌╌╌╌┤
618
+ # # │ 2 │
619
+ # # ├╌╌╌╌╌┤
620
+ # # │ 3 │
621
+ # # └─────┘
622
+ #
623
+ # @example
624
+ # df.select(["foo", "bar"]).collect
625
+ # # =>
626
+ # # shape: (3, 2)
627
+ # # ┌─────┬─────┐
628
+ # # │ foo ┆ bar │
629
+ # # │ --- ┆ --- │
630
+ # # │ i64 ┆ i64 │
631
+ # # ╞═════╪═════╡
632
+ # # │ 1 ┆ 6 │
633
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
634
+ # # │ 2 ┆ 7 │
635
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
636
+ # # │ 3 ┆ 8 │
637
+ # # └─────┴─────┘
638
+ #
639
+ # @example
640
+ # df.select(Polars.col("foo") + 1).collect
641
+ # # =>
642
+ # # shape: (3, 1)
643
+ # # ┌─────┐
644
+ # # │ foo │
645
+ # # │ --- │
646
+ # # │ i64 │
647
+ # # ╞═════╡
648
+ # # │ 2 │
649
+ # # ├╌╌╌╌╌┤
650
+ # # │ 3 │
651
+ # # ├╌╌╌╌╌┤
652
+ # # │ 4 │
653
+ # # └─────┘
654
+ #
655
+ # @example
656
+ # df.select([Polars.col("foo") + 1, Polars.col("bar") + 1]).collect
657
+ # # =>
658
+ # # shape: (3, 2)
659
+ # # ┌─────┬─────┐
660
+ # # │ foo ┆ bar │
661
+ # # │ --- ┆ --- │
662
+ # # │ i64 ┆ i64 │
663
+ # # ╞═════╪═════╡
664
+ # # │ 2 ┆ 7 │
665
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
666
+ # # │ 3 ┆ 8 │
667
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
668
+ # # │ 4 ┆ 9 │
669
+ # # └─────┴─────┘
670
+ #
671
+ # @example
672
+ # df.select(Polars.when(Polars.col("foo") > 2).then(10).otherwise(0)).collect
673
+ # # =>
674
+ # # shape: (3, 1)
675
+ # # ┌─────────┐
676
+ # # │ literal │
677
+ # # │ --- │
678
+ # # │ i64 │
679
+ # # ╞═════════╡
680
+ # # │ 0 │
681
+ # # ├╌╌╌╌╌╌╌╌╌┤
682
+ # # │ 0 │
683
+ # # ├╌╌╌╌╌╌╌╌╌┤
684
+ # # │ 10 │
685
+ # # └─────────┘
111
686
  def select(exprs)
112
687
  exprs = Utils.selection_to_rbexpr_list(exprs)
113
688
  _from_rbldf(_ldf.select(exprs))
114
689
  end
115
690
 
691
+ # Start a groupby operation.
692
+ #
693
+ # @param by [Object]
694
+ # Column(s) to group by.
695
+ # @param maintain_order [Boolean]
696
+ # Make sure that the order of the groups remain consistent. This is more
697
+ # expensive than a default groupby.
698
+ #
699
+ # @return [LazyGroupBy]
700
+ #
701
+ # @example
702
+ # df = Polars::DataFrame.new(
703
+ # {
704
+ # "a" => ["a", "b", "a", "b", "b", "c"],
705
+ # "b" => [1, 2, 3, 4, 5, 6],
706
+ # "c" => [6, 5, 4, 3, 2, 1]
707
+ # }
708
+ # ).lazy
709
+ # df.groupby("a", maintain_order: true).agg(Polars.col("b").sum).collect
710
+ # # =>
711
+ # # shape: (3, 2)
712
+ # # ┌─────┬─────┐
713
+ # # │ a ┆ b │
714
+ # # │ --- ┆ --- │
715
+ # # │ str ┆ i64 │
716
+ # # ╞═════╪═════╡
717
+ # # │ a ┆ 4 │
718
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
719
+ # # │ b ┆ 11 │
720
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
721
+ # # │ c ┆ 6 │
722
+ # # └─────┴─────┘
116
723
  def groupby(by, maintain_order: false)
117
724
  rbexprs_by = Utils.selection_to_rbexpr_list(by)
118
725
  lgb = _ldf.groupby(rbexprs_by, maintain_order)
@@ -128,6 +735,116 @@ module Polars
128
735
  # def join_asof
129
736
  # end
130
737
 
738
+ # Add a join operation to the Logical Plan.
739
+ #
740
+ # @param other [LazyFrame]
741
+ # Lazy DataFrame to join with.
742
+ # @param left_on [Object]
743
+ # Join column of the left DataFrame.
744
+ # @param right_on [Object]
745
+ # Join column of the right DataFrame.
746
+ # @param on [Object]
747
+ # Join column of both DataFrames. If set, `left_on` and `right_on` should be
748
+ # `nil`.
749
+ # @param how ["inner", "left", "outer", "semi", "anti", "cross"]
750
+ # Join strategy.
751
+ # @param suffix [String]
752
+ # Suffix to append to columns with a duplicate name.
753
+ # @param allow_parallel [Boolean]
754
+ # Allow the physical plan to optionally evaluate the computation of both
755
+ # DataFrames up to the join in parallel.
756
+ # @param force_parallel [Boolean]
757
+ # Force the physical plan to evaluate the computation of both DataFrames up to
758
+ # the join in parallel.
759
+ #
760
+ # @return [LazyFrame]
761
+ #
762
+ # @example
763
+ # df = Polars::DataFrame.new(
764
+ # {
765
+ # "foo" => [1, 2, 3],
766
+ # "bar" => [6.0, 7.0, 8.0],
767
+ # "ham" => ["a", "b", "c"]
768
+ # }
769
+ # ).lazy
770
+ # other_df = Polars::DataFrame.new(
771
+ # {
772
+ # "apple" => ["x", "y", "z"],
773
+ # "ham" => ["a", "b", "d"]
774
+ # }
775
+ # ).lazy
776
+ # df.join(other_df, on: "ham").collect
777
+ # # =>
778
+ # # shape: (2, 4)
779
+ # # ┌─────┬─────┬─────┬───────┐
780
+ # # │ foo ┆ bar ┆ ham ┆ apple │
781
+ # # │ --- ┆ --- ┆ --- ┆ --- │
782
+ # # │ i64 ┆ f64 ┆ str ┆ str │
783
+ # # ╞═════╪═════╪═════╪═══════╡
784
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
785
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
786
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
787
+ # # └─────┴─────┴─────┴───────┘
788
+ #
789
+ # @example
790
+ # df.join(other_df, on: "ham", how: "outer").collect
791
+ # # =>
792
+ # # shape: (4, 4)
793
+ # # ┌──────┬──────┬─────┬───────┐
794
+ # # │ foo ┆ bar ┆ ham ┆ apple │
795
+ # # │ --- ┆ --- ┆ --- ┆ --- │
796
+ # # │ i64 ┆ f64 ┆ str ┆ str │
797
+ # # ╞══════╪══════╪═════╪═══════╡
798
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
799
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
800
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
801
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
802
+ # # │ null ┆ null ┆ d ┆ z │
803
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
804
+ # # │ 3 ┆ 8.0 ┆ c ┆ null │
805
+ # # └──────┴──────┴─────┴───────┘
806
+ #
807
+ # @example
808
+ # df.join(other_df, on: "ham", how: "left").collect
809
+ # # =>
810
+ # # shape: (3, 4)
811
+ # # ┌─────┬─────┬─────┬───────┐
812
+ # # │ foo ┆ bar ┆ ham ┆ apple │
813
+ # # │ --- ┆ --- ┆ --- ┆ --- │
814
+ # # │ i64 ┆ f64 ┆ str ┆ str │
815
+ # # ╞═════╪═════╪═════╪═══════╡
816
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
817
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
818
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
819
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
820
+ # # │ 3 ┆ 8.0 ┆ c ┆ null │
821
+ # # └─────┴─────┴─────┴───────┘
822
+ #
823
+ # @example
824
+ # df.join(other_df, on: "ham", how: "semi").collect
825
+ # # =>
826
+ # # shape: (2, 3)
827
+ # # ┌─────┬─────┬─────┐
828
+ # # │ foo ┆ bar ┆ ham │
829
+ # # │ --- ┆ --- ┆ --- │
830
+ # # │ i64 ┆ f64 ┆ str │
831
+ # # ╞═════╪═════╪═════╡
832
+ # # │ 1 ┆ 6.0 ┆ a │
833
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
834
+ # # │ 2 ┆ 7.0 ┆ b │
835
+ # # └─────┴─────┴─────┘
836
+ #
837
+ # @example
838
+ # df.join(other_df, on: "ham", how: "anti").collect
839
+ # # =>
840
+ # # shape: (1, 3)
841
+ # # ┌─────┬─────┬─────┐
842
+ # # │ foo ┆ bar ┆ ham │
843
+ # # │ --- ┆ --- ┆ --- │
844
+ # # │ i64 ┆ f64 ┆ str │
845
+ # # ╞═════╪═════╪═════╡
846
+ # # │ 3 ┆ 8.0 ┆ c │
847
+ # # └─────┴─────┴─────┘
131
848
  def join(
132
849
  other,
133
850
  left_on: nil,
@@ -174,6 +891,43 @@ module Polars
174
891
  )
175
892
  end
176
893
 
894
+ # Add or overwrite multiple columns in a DataFrame.
895
+ #
896
+ # @param exprs [Object]
897
+ # List of Expressions that evaluate to columns.
898
+ #
899
+ # @return [LazyFrame]
900
+ #
901
+ # @example
902
+ # ldf = Polars::DataFrame.new(
903
+ # {
904
+ # "a" => [1, 2, 3, 4],
905
+ # "b" => [0.5, 4, 10, 13],
906
+ # "c" => [true, true, false, true]
907
+ # }
908
+ # ).lazy
909
+ # ldf.with_columns(
910
+ # [
911
+ # (Polars.col("a") ** 2).alias("a^2"),
912
+ # (Polars.col("b") / 2).alias("b/2"),
913
+ # (Polars.col("c").is_not()).alias("not c")
914
+ # ]
915
+ # ).collect
916
+ # # =>
917
+ # # shape: (4, 6)
918
+ # # ┌─────┬──────┬───────┬──────┬──────┬───────┐
919
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
920
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
921
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │
922
+ # # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
923
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │
924
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
925
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │
926
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
927
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
928
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
929
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
930
+ # # └─────┴──────┴───────┴──────┴──────┴───────┘
177
931
  def with_columns(exprs)
178
932
  exprs =
179
933
  if exprs.nil?
@@ -202,55 +956,343 @@ module Polars
202
956
  # def with_context
203
957
  # end
204
958
 
959
+ # Add or overwrite column in a DataFrame.
960
+ #
961
+ # @param column [Object]
962
+ # Expression that evaluates to column or a Series to use.
963
+ #
964
+ # @return [LazyFrame]
965
+ #
966
+ # @example
967
+ # df = Polars::DataFrame.new(
968
+ # {
969
+ # "a" => [1, 3, 5],
970
+ # "b" => [2, 4, 6]
971
+ # }
972
+ # ).lazy
973
+ # df.with_column((Polars.col("b") ** 2).alias("b_squared")).collect
974
+ # # =>
975
+ # # shape: (3, 3)
976
+ # # ┌─────┬─────┬───────────┐
977
+ # # │ a ┆ b ┆ b_squared │
978
+ # # │ --- ┆ --- ┆ --- │
979
+ # # │ i64 ┆ i64 ┆ f64 │
980
+ # # ╞═════╪═════╪═══════════╡
981
+ # # │ 1 ┆ 2 ┆ 4.0 │
982
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
983
+ # # │ 3 ┆ 4 ┆ 16.0 │
984
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
985
+ # # │ 5 ┆ 6 ┆ 36.0 │
986
+ # # └─────┴─────┴───────────┘
987
+ #
988
+ # @example
989
+ # df.with_column(Polars.col("a") ** 2).collect
990
+ # # =>
991
+ # # shape: (3, 2)
992
+ # # ┌──────┬─────┐
993
+ # # │ a ┆ b │
994
+ # # │ --- ┆ --- │
995
+ # # │ f64 ┆ i64 │
996
+ # # ╞══════╪═════╡
997
+ # # │ 1.0 ┆ 2 │
998
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
999
+ # # │ 9.0 ┆ 4 │
1000
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
1001
+ # # │ 25.0 ┆ 6 │
1002
+ # # └──────┴─────┘
205
1003
  def with_column(column)
206
1004
  with_columns([column])
207
1005
  end
208
1006
 
209
- # def drop
210
- # end
1007
+ # Remove one or multiple columns from a DataFrame.
1008
+ #
1009
+ # @param columns [Object]
1010
+ # - Name of the column that should be removed.
1011
+ # - List of column names.
1012
+ #
1013
+ # @return [LazyFrame]
1014
+ def drop(columns)
1015
+ if columns.is_a?(String)
1016
+ columns = [columns]
1017
+ end
1018
+ _from_rbldf(_ldf.drop_columns(columns))
1019
+ end
211
1020
 
1021
+ # Rename column names.
1022
+ #
1023
+ # @param mapping [Hash]
1024
+ # Key value pairs that map from old name to new name.
1025
+ #
1026
+ # @return [LazyFrame]
212
1027
  def rename(mapping)
213
1028
  existing = mapping.keys
214
1029
  _new = mapping.values
215
1030
  _from_rbldf(_ldf.rename(existing, _new))
216
1031
  end
217
1032
 
218
- # def reverse
219
- # end
1033
+ # Reverse the DataFrame.
1034
+ #
1035
+ # @return [LazyFrame]
1036
+ def reverse
1037
+ _from_rbldf(_ldf.reverse)
1038
+ end
220
1039
 
221
- # def shift
222
- # end
1040
+ # Shift the values by a given period.
1041
+ #
1042
+ # @param periods [Integer]
1043
+ # Number of places to shift (may be negative).
1044
+ #
1045
+ # @return [LazyFrame]
1046
+ #
1047
+ # @example
1048
+ # df = Polars::DataFrame.new(
1049
+ # {
1050
+ # "a" => [1, 3, 5],
1051
+ # "b" => [2, 4, 6]
1052
+ # }
1053
+ # ).lazy
1054
+ # df.shift(1).collect
1055
+ # # =>
1056
+ # # shape: (3, 2)
1057
+ # # ┌──────┬──────┐
1058
+ # # │ a ┆ b │
1059
+ # # │ --- ┆ --- │
1060
+ # # │ i64 ┆ i64 │
1061
+ # # ╞══════╪══════╡
1062
+ # # │ null ┆ null │
1063
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1064
+ # # │ 1 ┆ 2 │
1065
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1066
+ # # │ 3 ┆ 4 │
1067
+ # # └──────┴──────┘
1068
+ #
1069
+ # @example
1070
+ # df.shift(-1).collect
1071
+ # # =>
1072
+ # # shape: (3, 2)
1073
+ # # ┌──────┬──────┐
1074
+ # # │ a ┆ b │
1075
+ # # │ --- ┆ --- │
1076
+ # # │ i64 ┆ i64 │
1077
+ # # ╞══════╪══════╡
1078
+ # # │ 3 ┆ 4 │
1079
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1080
+ # # │ 5 ┆ 6 │
1081
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1082
+ # # │ null ┆ null │
1083
+ # # └──────┴──────┘
1084
+ def shift(periods)
1085
+ _from_rbldf(_ldf.shift(periods))
1086
+ end
223
1087
 
224
- # def shift_and_fill
225
- # end
1088
+ # Shift the values by a given period and fill the resulting null values.
1089
+ #
1090
+ # @param periods [Integer]
1091
+ # Number of places to shift (may be negative).
1092
+ # @param fill_value [Object]
1093
+ # Fill `nil` values with the result of this expression.
1094
+ #
1095
+ # @return [LazyFrame]
1096
+ #
1097
+ # @example
1098
+ # df = Polars::DataFrame.new(
1099
+ # {
1100
+ # "a" => [1, 3, 5],
1101
+ # "b" => [2, 4, 6]
1102
+ # }
1103
+ # ).lazy
1104
+ # df.shift_and_fill(1, 0).collect
1105
+ # # =>
1106
+ # # shape: (3, 2)
1107
+ # # ┌─────┬─────┐
1108
+ # # │ a ┆ b │
1109
+ # # │ --- ┆ --- │
1110
+ # # │ i64 ┆ i64 │
1111
+ # # ╞═════╪═════╡
1112
+ # # │ 0 ┆ 0 │
1113
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1114
+ # # │ 1 ┆ 2 │
1115
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1116
+ # # │ 3 ┆ 4 │
1117
+ # # └─────┴─────┘
1118
+ #
1119
+ # @example
1120
+ # df.shift_and_fill(-1, 0).collect
1121
+ # # =>
1122
+ # # shape: (3, 2)
1123
+ # # ┌─────┬─────┐
1124
+ # # │ a ┆ b │
1125
+ # # │ --- ┆ --- │
1126
+ # # │ i64 ┆ i64 │
1127
+ # # ╞═════╪═════╡
1128
+ # # │ 3 ┆ 4 │
1129
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1130
+ # # │ 5 ┆ 6 │
1131
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1132
+ # # │ 0 ┆ 0 │
1133
+ # # └─────┴─────┘
1134
+ def shift_and_fill(periods, fill_value)
1135
+ if !fill_value.is_a?(Expr)
1136
+ fill_value = Polars.lit(fill_value)
1137
+ end
1138
+ _from_rbldf(_ldf.shift_and_fill(periods, fill_value._rbexpr))
1139
+ end
226
1140
 
227
- # def slice
228
- # end
1141
+ # Get a slice of this DataFrame.
1142
+ #
1143
+ # @param offset [Integer]
1144
+ # Start index. Negative indexing is supported.
1145
+ # @param length [Integer]
1146
+ # Length of the slice. If set to `nil`, all rows starting at the offset
1147
+ # will be selected.
1148
+ #
1149
+ # @return [LazyFrame]
1150
+ #
1151
+ # @example
1152
+ # df = Polars::DataFrame.new(
1153
+ # {
1154
+ # "a" => ["x", "y", "z"],
1155
+ # "b" => [1, 3, 5],
1156
+ # "c" => [2, 4, 6]
1157
+ # }
1158
+ # ).lazy
1159
+ # df.slice(1, 2).collect
1160
+ # # =>
1161
+ # # shape: (2, 3)
1162
+ # # ┌─────┬─────┬─────┐
1163
+ # # │ a ┆ b ┆ c │
1164
+ # # │ --- ┆ --- ┆ --- │
1165
+ # # │ str ┆ i64 ┆ i64 │
1166
+ # # ╞═════╪═════╪═════╡
1167
+ # # │ y ┆ 3 ┆ 4 │
1168
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1169
+ # # │ z ┆ 5 ┆ 6 │
1170
+ # # └─────┴─────┴─────┘
1171
+ def slice(offset, length = nil)
1172
+ if length && length < 0
1173
+ raise ArgumentError, "Negative slice lengths (#{length}) are invalid for LazyFrame"
1174
+ end
1175
+ _from_rbldf(_ldf.slice(offset, length))
1176
+ end
229
1177
 
230
- # def limit
231
- # end
1178
+ # Get the first `n` rows.
1179
+ #
1180
+ # Alias for {#head}.
1181
+ #
1182
+ # @param n [Integer]
1183
+ # Number of rows to return.
1184
+ #
1185
+ # @return [LazyFrame]
1186
+ #
1187
+ # @note
1188
+ # Consider using the {#fetch} operation if you only want to test your
1189
+ # query. The {#fetch} operation will load the first `n` rows at the scan
1190
+ # level, whereas the {#head}/{#limit} are applied at the end.
1191
+ def limit(n = 5)
1192
+ head(5)
1193
+ end
232
1194
 
233
- # def head
234
- # end
1195
+ # Get the first `n` rows.
1196
+ #
1197
+ # @param n [Integer]
1198
+ # Number of rows to return.
1199
+ #
1200
+ # @return [LazyFrame]
1201
+ #
1202
+ # @note
1203
+ # Consider using the {#fetch} operation if you only want to test your
1204
+ # query. The {#fetch} operation will load the first `n` rows at the scan
1205
+ # level, whereas the {#head}/{#limit} are applied at the end.
1206
+ def head(n = 5)
1207
+ slice(0, n)
1208
+ end
235
1209
 
236
- # def tail
237
- # end
1210
+ # Get the last `n` rows.
1211
+ #
1212
+ # @param n [Integer]
1213
+ # Number of rows.
1214
+ #
1215
+ # @return [LazyFrame]
1216
+ def tail(n = 5)
1217
+ _from_rbldf(_ldf.tail(n))
1218
+ end
238
1219
 
239
- # def last
240
- # end
1220
+ # Get the last row of the DataFrame.
1221
+ #
1222
+ # @return [LazyFrame]
1223
+ def last
1224
+ tail(1)
1225
+ end
241
1226
 
242
- # def first
243
- # end
1227
+ # Get the first row of the DataFrame.
1228
+ #
1229
+ # @return [LazyFrame]
1230
+ def first
1231
+ slice(0, 1)
1232
+ end
244
1233
 
245
1234
  # def with_row_count
246
1235
  # end
247
1236
 
248
- # def take_every
249
- # end
1237
+ # Take every nth row in the LazyFrame and return as a new LazyFrame.
1238
+ #
1239
+ # @return [LazyFrame]
1240
+ #
1241
+ # @example
1242
+ # s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]}).lazy
1243
+ # s.take_every(2).collect
1244
+ # # =>
1245
+ # # shape: (2, 2)
1246
+ # # ┌─────┬─────┐
1247
+ # # │ a ┆ b │
1248
+ # # │ --- ┆ --- │
1249
+ # # │ i64 ┆ i64 │
1250
+ # # ╞═════╪═════╡
1251
+ # # │ 1 ┆ 5 │
1252
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1253
+ # # │ 3 ┆ 7 │
1254
+ # # └─────┴─────┘
1255
+ def take_every(n)
1256
+ select(Utils.col("*").take_every(n))
1257
+ end
250
1258
 
251
1259
  # def fill_null
252
1260
  # end
253
1261
 
1262
+ # Fill floating point NaN values.
1263
+ #
1264
+ # @param fill_value [Object]
1265
+ # Value to fill the NaN values with.
1266
+ #
1267
+ # @return [LazyFrame]
1268
+ #
1269
+ # @note
1270
+ # Note that floating point NaN (Not a Number) are not missing values!
1271
+ # To replace missing values, use `fill_null` instead.
1272
+ #
1273
+ # @example
1274
+ # df = Polars::DataFrame.new(
1275
+ # {
1276
+ # "a" => [1.5, 2, Float::NAN, 4],
1277
+ # "b" => [0.5, 4, Float::NAN, 13],
1278
+ # }
1279
+ # ).lazy
1280
+ # df.fill_nan(99).collect
1281
+ # # =>
1282
+ # # shape: (4, 2)
1283
+ # # ┌──────┬──────┐
1284
+ # # │ a ┆ b │
1285
+ # # │ --- ┆ --- │
1286
+ # # │ f64 ┆ f64 │
1287
+ # # ╞══════╪══════╡
1288
+ # # │ 1.5 ┆ 0.5 │
1289
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1290
+ # # │ 2.0 ┆ 4.0 │
1291
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1292
+ # # │ 99.0 ┆ 99.0 │
1293
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1294
+ # # │ 4.0 ┆ 13.0 │
1295
+ # # └──────┴──────┘
254
1296
  def fill_nan(fill_value)
255
1297
  if !fill_value.is_a?(Expr)
256
1298
  fill_value = Utils.lit(fill_value)
@@ -258,35 +1300,255 @@ module Polars
258
1300
  _from_rbldf(_ldf.fill_nan(fill_value._rbexpr))
259
1301
  end
260
1302
 
261
- # def std
262
- # end
1303
+ # Aggregate the columns in the LazyFrame to their standard deviation value.
1304
+ #
1305
+ # @return [LazyFrame]
1306
+ #
1307
+ # @example
1308
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1309
+ # df.std.collect
1310
+ # # =>
1311
+ # # shape: (1, 2)
1312
+ # # ┌──────────┬─────┐
1313
+ # # │ a ┆ b │
1314
+ # # │ --- ┆ --- │
1315
+ # # │ f64 ┆ f64 │
1316
+ # # ╞══════════╪═════╡
1317
+ # # │ 1.290994 ┆ 0.5 │
1318
+ # # └──────────┴─────┘
1319
+ #
1320
+ # @example
1321
+ # df.std(ddof: 0).collect
1322
+ # # =>
1323
+ # # shape: (1, 2)
1324
+ # # ┌──────────┬──────────┐
1325
+ # # │ a ┆ b │
1326
+ # # │ --- ┆ --- │
1327
+ # # │ f64 ┆ f64 │
1328
+ # # ╞══════════╪══════════╡
1329
+ # # │ 1.118034 ┆ 0.433013 │
1330
+ # # └──────────┴──────────┘
1331
+ def std(ddof: 1)
1332
+ _from_rbldf(_ldf.std(ddof))
1333
+ end
263
1334
 
264
- # def var
265
- # end
1335
+ # Aggregate the columns in the LazyFrame to their variance value.
1336
+ #
1337
+ # @return [LazyFrame]
1338
+ #
1339
+ # @example
1340
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1341
+ # df.var.collect
1342
+ # # =>
1343
+ # # shape: (1, 2)
1344
+ # # ┌──────────┬──────┐
1345
+ # # │ a ┆ b │
1346
+ # # │ --- ┆ --- │
1347
+ # # │ f64 ┆ f64 │
1348
+ # # ╞══════════╪══════╡
1349
+ # # │ 1.666667 ┆ 0.25 │
1350
+ # # └──────────┴──────┘
1351
+ #
1352
+ # @example
1353
+ # df.var(ddof: 0).collect
1354
+ # # =>
1355
+ # # shape: (1, 2)
1356
+ # # ┌──────┬────────┐
1357
+ # # │ a ┆ b │
1358
+ # # │ --- ┆ --- │
1359
+ # # │ f64 ┆ f64 │
1360
+ # # ╞══════╪════════╡
1361
+ # # │ 1.25 ┆ 0.1875 │
1362
+ # # └──────┴────────┘
1363
+ def var(ddof: 1)
1364
+ _from_rbldf(_ldf.var(ddof))
1365
+ end
266
1366
 
267
- # def max
268
- # end
1367
+ # Aggregate the columns in the LazyFrame to their maximum value.
1368
+ #
1369
+ # @return [LazyFrame]
1370
+ #
1371
+ # @example
1372
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1373
+ # df.max.collect
1374
+ # # =>
1375
+ # # shape: (1, 2)
1376
+ # # ┌─────┬─────┐
1377
+ # # │ a ┆ b │
1378
+ # # │ --- ┆ --- │
1379
+ # # │ i64 ┆ i64 │
1380
+ # # ╞═════╪═════╡
1381
+ # # │ 4 ┆ 2 │
1382
+ # # └─────┴─────┘
1383
+ def max
1384
+ _from_rbldf(_ldf.max)
1385
+ end
269
1386
 
270
- # def min
271
- # end
1387
+ # Aggregate the columns in the LazyFrame to their minimum value.
1388
+ #
1389
+ # @return [LazyFrame]
1390
+ #
1391
+ # @example
1392
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1393
+ # df.min.collect
1394
+ # # =>
1395
+ # # shape: (1, 2)
1396
+ # # ┌─────┬─────┐
1397
+ # # │ a ┆ b │
1398
+ # # │ --- ┆ --- │
1399
+ # # │ i64 ┆ i64 │
1400
+ # # ╞═════╪═════╡
1401
+ # # │ 1 ┆ 1 │
1402
+ # # └─────┴─────┘
1403
+ def min
1404
+ _from_rbldf(_ldf.min)
1405
+ end
272
1406
 
273
- # def sum
274
- # end
1407
+ # Aggregate the columns in the LazyFrame to their sum value.
1408
+ #
1409
+ # @return [LazyFrame]
1410
+ #
1411
+ # @example
1412
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1413
+ # df.sum.collect
1414
+ # # =>
1415
+ # # shape: (1, 2)
1416
+ # # ┌─────┬─────┐
1417
+ # # │ a ┆ b │
1418
+ # # │ --- ┆ --- │
1419
+ # # │ i64 ┆ i64 │
1420
+ # # ╞═════╪═════╡
1421
+ # # │ 10 ┆ 5 │
1422
+ # # └─────┴─────┘
1423
+ def sum
1424
+ _from_rbldf(_ldf.sum)
1425
+ end
275
1426
 
276
- # def mean
277
- # end
1427
+ # Aggregate the columns in the LazyFrame to their mean value.
1428
+ #
1429
+ # @return [LazyFrame]
1430
+ #
1431
+ # @example
1432
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1433
+ # df.mean.collect
1434
+ # # =>
1435
+ # # shape: (1, 2)
1436
+ # # ┌─────┬──────┐
1437
+ # # │ a ┆ b │
1438
+ # # │ --- ┆ --- │
1439
+ # # │ f64 ┆ f64 │
1440
+ # # ╞═════╪══════╡
1441
+ # # │ 2.5 ┆ 1.25 │
1442
+ # # └─────┴──────┘
1443
+ def mean
1444
+ _from_rbldf(_ldf.mean)
1445
+ end
278
1446
 
279
- # def median
280
- # end
1447
+ # Aggregate the columns in the LazyFrame to their median value.
1448
+ #
1449
+ # @return [LazyFrame]
1450
+ #
1451
+ # @example
1452
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1453
+ # df.median.collect
1454
+ # # =>
1455
+ # # shape: (1, 2)
1456
+ # # ┌─────┬─────┐
1457
+ # # │ a ┆ b │
1458
+ # # │ --- ┆ --- │
1459
+ # # │ f64 ┆ f64 │
1460
+ # # ╞═════╪═════╡
1461
+ # # │ 2.5 ┆ 1.0 │
1462
+ # # └─────┴─────┘
1463
+ def median
1464
+ _from_rbldf(_ldf.median)
1465
+ end
281
1466
 
282
- # def quantile
283
- # end
1467
+ # Aggregate the columns in the LazyFrame to their quantile value.
1468
+ #
1469
+ # @param quantile [Float]
1470
+ # Quantile between 0.0 and 1.0.
1471
+ # @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
1472
+ # Interpolation method.
1473
+ #
1474
+ # @return [LazyFrame]
1475
+ #
1476
+ # @example
1477
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
1478
+ # df.quantile(0.7).collect
1479
+ # # =>
1480
+ # # shape: (1, 2)
1481
+ # # ┌─────┬─────┐
1482
+ # # │ a ┆ b │
1483
+ # # │ --- ┆ --- │
1484
+ # # │ f64 ┆ f64 │
1485
+ # # ╞═════╪═════╡
1486
+ # # │ 3.0 ┆ 1.0 │
1487
+ # # └─────┴─────┘
1488
+ def quantile(quantile, interpolation: "nearest")
1489
+ _from_rbldf(_ldf.quantile(quantile, interpolation))
1490
+ end
284
1491
 
285
- # def explode
286
- # end
1492
+ # Explode lists to long format.
1493
+ #
1494
+ # @return [LazyFrame]
1495
+ #
1496
+ # @example
1497
+ # df = Polars::DataFrame.new(
1498
+ # {
1499
+ # "letters" => ["a", "a", "b", "c"],
1500
+ # "numbers" => [[1], [2, 3], [4, 5], [6, 7, 8]],
1501
+ # }
1502
+ # ).lazy
1503
+ # df.explode("numbers").collect
1504
+ # # =>
1505
+ # # shape: (8, 2)
1506
+ # # ┌─────────┬─────────┐
1507
+ # # │ letters ┆ numbers │
1508
+ # # │ --- ┆ --- │
1509
+ # # │ str ┆ i64 │
1510
+ # # ╞═════════╪═════════╡
1511
+ # # │ a ┆ 1 │
1512
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
1513
+ # # │ a ┆ 2 │
1514
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
1515
+ # # │ a ┆ 3 │
1516
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
1517
+ # # │ b ┆ 4 │
1518
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
1519
+ # # │ b ┆ 5 │
1520
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
1521
+ # # │ c ┆ 6 │
1522
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
1523
+ # # │ c ┆ 7 │
1524
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
1525
+ # # │ c ┆ 8 │
1526
+ # # └─────────┴─────────┘
1527
+ def explode(columns)
1528
+ columns = Utils.selection_to_rbexpr_list(columns)
1529
+ _from_rbldf(_ldf.explode(columns))
1530
+ end
287
1531
 
288
- # def unique
289
- # end
1532
+ # Drop duplicate rows from this LazyFrame.
1533
+ #
1534
+ # Note that this fails if there is a column of type `List` in the DataFrame or
1535
+ # subset.
1536
+ #
1537
+ # @param maintain_order [Boolean]
1538
+ # Keep the same order as the original DataFrame. This requires more work to
1539
+ # compute.
1540
+ # @param subset [Object]
1541
+ # Subset to use to compare rows.
1542
+ # @param keep ["first", "last"]
1543
+ # Which of the duplicate rows to keep.
1544
+ #
1545
+ # @return [LazyFrame]
1546
+ def unique(maintain_order: true, subset: nil, keep: "first")
1547
+ if !subset.nil? && !subset.is_a?(Array)
1548
+ subset = [subset]
1549
+ end
1550
+ _from_rbldf(_ldf.unique(maintain_order, subset, keep))
1551
+ end
290
1552
 
291
1553
  # def drop_nulls
292
1554
  # end
@@ -297,11 +1559,97 @@ module Polars
297
1559
  # def map
298
1560
  # end
299
1561
 
300
- # def interpolate
301
- # end
1562
+ # Interpolate intermediate values. The interpolation method is linear.
1563
+ #
1564
+ # @return [LazyFrame]
1565
+ #
1566
+ # @example
1567
+ # df = Polars::DataFrame.new(
1568
+ # {
1569
+ # "foo" => [1, nil, 9, 10],
1570
+ # "bar" => [6, 7, 9, nil],
1571
+ # "baz" => [1, nil, nil, 9]
1572
+ # }
1573
+ # ).lazy
1574
+ # df.interpolate.collect
1575
+ # # =>
1576
+ # # shape: (4, 3)
1577
+ # # ┌─────┬──────┬─────┐
1578
+ # # │ foo ┆ bar ┆ baz │
1579
+ # # │ --- ┆ --- ┆ --- │
1580
+ # # │ i64 ┆ i64 ┆ i64 │
1581
+ # # ╞═════╪══════╪═════╡
1582
+ # # │ 1 ┆ 6 ┆ 1 │
1583
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
1584
+ # # │ 5 ┆ 7 ┆ 3 │
1585
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
1586
+ # # │ 9 ┆ 9 ┆ 6 │
1587
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
1588
+ # # │ 10 ┆ null ┆ 9 │
1589
+ # # └─────┴──────┴─────┘
1590
+ def interpolate
1591
+ select(Utils.col("*").interpolate)
1592
+ end
302
1593
 
303
- # def unnest
304
- # end
1594
+ # Decompose a struct into its fields.
1595
+ #
1596
+ # The fields will be inserted into the `LazyFrame` at the location of the
1597
+ # `struct` column.
1598
+ #
1599
+ # @param names [Object]
1600
+ # Names of the struct columns that will be decomposed into their fields.
1601
+ #
1602
+ # @return [LazyFrame]
1603
+ #
1604
+ # @example
1605
+ # df = (
1606
+ # Polars::DataFrame.new(
1607
+ # {
1608
+ # "before" => ["foo", "bar"],
1609
+ # "t_a" => [1, 2],
1610
+ # "t_b" => ["a", "b"],
1611
+ # "t_c" => [true, nil],
1612
+ # "t_d" => [[1, 2], [3]],
1613
+ # "after" => ["baz", "womp"]
1614
+ # }
1615
+ # )
1616
+ # .lazy
1617
+ # .select(
1618
+ # ["before", Polars.struct(Polars.col("^t_.$")).alias("t_struct"), "after"]
1619
+ # )
1620
+ # )
1621
+ # df.fetch
1622
+ # # =>
1623
+ # # shape: (2, 3)
1624
+ # # ┌────────┬─────────────────────┬───────┐
1625
+ # # │ before ┆ t_struct ┆ after │
1626
+ # # │ --- ┆ --- ┆ --- │
1627
+ # # │ str ┆ struct[4] ┆ str │
1628
+ # # ╞════════╪═════════════════════╪═══════╡
1629
+ # # │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │
1630
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1631
+ # # │ bar ┆ {2,"b",null,[3]} ┆ womp │
1632
+ # # └────────┴─────────────────────┴───────┘
1633
+ #
1634
+ # @example
1635
+ # df.unnest("t_struct").fetch
1636
+ # # =>
1637
+ # # shape: (2, 6)
1638
+ # # ┌────────┬─────┬─────┬──────┬───────────┬───────┐
1639
+ # # │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │
1640
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1641
+ # # │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │
1642
+ # # ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
1643
+ # # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │
1644
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1645
+ # # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
1646
+ # # └────────┴─────┴─────┴──────┴───────────┴───────┘
1647
+ def unnest(names)
1648
+ if names.is_a?(String)
1649
+ names = [names]
1650
+ end
1651
+ _from_rbldf(_ldf.unnest(names))
1652
+ end
305
1653
 
306
1654
  private
307
1655