polars-df 0.8.0-arm64-darwin → 0.10.0-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +42 -1
  3. data/Cargo.lock +159 -66
  4. data/Cargo.toml +0 -3
  5. data/LICENSE-THIRD-PARTY.txt +3112 -1613
  6. data/LICENSE.txt +1 -1
  7. data/README.md +3 -2
  8. data/lib/polars/3.1/polars.bundle +0 -0
  9. data/lib/polars/3.2/polars.bundle +0 -0
  10. data/lib/polars/3.3/polars.bundle +0 -0
  11. data/lib/polars/array_expr.rb +453 -0
  12. data/lib/polars/array_name_space.rb +346 -0
  13. data/lib/polars/batched_csv_reader.rb +4 -2
  14. data/lib/polars/cat_expr.rb +24 -0
  15. data/lib/polars/cat_name_space.rb +75 -0
  16. data/lib/polars/config.rb +2 -2
  17. data/lib/polars/data_frame.rb +306 -96
  18. data/lib/polars/data_types.rb +191 -28
  19. data/lib/polars/date_time_expr.rb +41 -18
  20. data/lib/polars/date_time_name_space.rb +9 -3
  21. data/lib/polars/exceptions.rb +12 -1
  22. data/lib/polars/expr.rb +898 -215
  23. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  24. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  25. data/lib/polars/functions/as_datatype.rb +248 -0
  26. data/lib/polars/functions/col.rb +47 -0
  27. data/lib/polars/functions/eager.rb +182 -0
  28. data/lib/polars/functions/lazy.rb +1280 -0
  29. data/lib/polars/functions/len.rb +49 -0
  30. data/lib/polars/functions/lit.rb +35 -0
  31. data/lib/polars/functions/random.rb +16 -0
  32. data/lib/polars/functions/range/date_range.rb +103 -0
  33. data/lib/polars/functions/range/int_range.rb +51 -0
  34. data/lib/polars/functions/repeat.rb +144 -0
  35. data/lib/polars/functions/whenthen.rb +96 -0
  36. data/lib/polars/functions.rb +29 -416
  37. data/lib/polars/group_by.rb +2 -2
  38. data/lib/polars/io.rb +36 -31
  39. data/lib/polars/lazy_frame.rb +405 -88
  40. data/lib/polars/list_expr.rb +158 -8
  41. data/lib/polars/list_name_space.rb +102 -0
  42. data/lib/polars/meta_expr.rb +175 -7
  43. data/lib/polars/series.rb +282 -41
  44. data/lib/polars/string_cache.rb +75 -0
  45. data/lib/polars/string_expr.rb +413 -96
  46. data/lib/polars/string_name_space.rb +4 -4
  47. data/lib/polars/testing.rb +507 -0
  48. data/lib/polars/utils.rb +106 -8
  49. data/lib/polars/version.rb +1 -1
  50. data/lib/polars/whenthen.rb +83 -0
  51. data/lib/polars.rb +16 -4
  52. metadata +34 -6
  53. data/lib/polars/lazy_functions.rb +0 -1181
  54. data/lib/polars/when.rb +0 -16
  55. data/lib/polars/when_then.rb +0 -19
@@ -1,1181 +0,0 @@
1
- module Polars
2
- module LazyFunctions
3
- # Return an expression representing a column in a DataFrame.
4
- #
5
- # @return [Expr]
6
- def col(name)
7
- if name.is_a?(Series)
8
- name = name.to_a
9
- end
10
-
11
- if name.is_a?(Class) && name < DataType
12
- name = [name]
13
- end
14
-
15
- if name.is_a?(DataType)
16
- Utils.wrap_expr(_dtype_cols([name]))
17
- elsif name.is_a?(::Array)
18
- if name.length == 0 || Utils.strlike?(name[0])
19
- name = name.map { |v| v.is_a?(Symbol) ? v.to_s : v }
20
- Utils.wrap_expr(RbExpr.cols(name))
21
- elsif Utils.is_polars_dtype(name[0])
22
- Utils.wrap_expr(_dtype_cols(name))
23
- else
24
- raise ArgumentError, "Expected list values to be all `str` or all `DataType`"
25
- end
26
- else
27
- name = name.to_s if name.is_a?(Symbol)
28
- Utils.wrap_expr(RbExpr.col(name))
29
- end
30
- end
31
-
32
- # Alias for an element being evaluated in an `eval` expression.
33
- #
34
- # @return [Expr]
35
- #
36
- # @example A horizontal rank computation by taking the elements of a list
37
- # df = Polars::DataFrame.new({"a" => [1, 8, 3], "b" => [4, 5, 2]})
38
- # df.with_column(
39
- # Polars.concat_list(["a", "b"]).list.eval(Polars.element.rank).alias("rank")
40
- # )
41
- # # =>
42
- # # shape: (3, 3)
43
- # # ┌─────┬─────┬────────────┐
44
- # # │ a ┆ b ┆ rank │
45
- # # │ --- ┆ --- ┆ --- │
46
- # # │ i64 ┆ i64 ┆ list[f64] │
47
- # # ╞═════╪═════╪════════════╡
48
- # # │ 1 ┆ 4 ┆ [1.0, 2.0] │
49
- # # │ 8 ┆ 5 ┆ [2.0, 1.0] │
50
- # # │ 3 ┆ 2 ┆ [2.0, 1.0] │
51
- # # └─────┴─────┴────────────┘
52
- def element
53
- col("")
54
- end
55
-
56
- # Count the number of values in this column/context.
57
- #
58
- # @param column [String, Series, nil]
59
- # If dtype is:
60
- #
61
- # * `Series` : count the values in the series.
62
- # * `String` : count the values in this column.
63
- # * `nil` : count the number of values in this context.
64
- #
65
- # @return [Expr, Integer]
66
- def count(column = nil)
67
- if column.nil?
68
- return Utils.wrap_expr(RbExpr.count)
69
- end
70
-
71
- if column.is_a?(Series)
72
- column.len
73
- else
74
- col(column).count
75
- end
76
- end
77
-
78
- # Aggregate to list.
79
- #
80
- # @return [Expr]
81
- def to_list(name)
82
- col(name).list
83
- end
84
-
85
- # Get the standard deviation.
86
- #
87
- # @return [Object]
88
- def std(column, ddof: 1)
89
- if column.is_a?(Series)
90
- column.std(ddof: ddof)
91
- else
92
- col(column).std(ddof: ddof)
93
- end
94
- end
95
-
96
- # Get the variance.
97
- #
98
- # @return [Object]
99
- def var(column, ddof: 1)
100
- if column.is_a?(Series)
101
- column.var(ddof: ddof)
102
- else
103
- col(column).var(ddof: ddof)
104
- end
105
- end
106
-
107
- # Get the maximum value.
108
- #
109
- # @param column [Object]
110
- # Column(s) to be used in aggregation.
111
- #
112
- # @return [Expr, Object]
113
- def max(column)
114
- if column.is_a?(Series)
115
- column.max
116
- else
117
- col(column).max
118
- end
119
- end
120
-
121
- # Get the minimum value.
122
- #
123
- # @param column [Object]
124
- # Column(s) to be used in aggregation.
125
- #
126
- # @return [Expr, Object]
127
- def min(column)
128
- if column.is_a?(Series)
129
- column.min
130
- else
131
- col(column).min
132
- end
133
- end
134
-
135
- # Sum values in a column/Series, or horizontally across list of columns/expressions.
136
- #
137
- # @return [Object]
138
- def sum(column)
139
- if column.is_a?(Series)
140
- column.sum
141
- elsif Utils.strlike?(column)
142
- col(column.to_s).sum
143
- elsif column.is_a?(::Array)
144
- exprs = Utils.selection_to_rbexpr_list(column)
145
- Utils.wrap_expr(_sum_horizontal(exprs))
146
- else
147
- fold(lit(0).cast(:u32), ->(a, b) { a + b }, column).alias("sum")
148
- end
149
- end
150
-
151
- # Get the mean value.
152
- #
153
- # @return [Expr, Float]
154
- def mean(column)
155
- if column.is_a?(Series)
156
- column.mean
157
- else
158
- col(column).mean
159
- end
160
- end
161
-
162
- # Get the mean value.
163
- #
164
- # @return [Expr, Float]
165
- def avg(column)
166
- mean(column)
167
- end
168
-
169
- # Get the median value.
170
- #
171
- # @return [Object]
172
- def median(column)
173
- if column.is_a?(Series)
174
- column.median
175
- else
176
- col(column).median
177
- end
178
- end
179
-
180
- # Count unique values.
181
- #
182
- # @return [Object]
183
- def n_unique(column)
184
- if column.is_a?(Series)
185
- column.n_unique
186
- else
187
- col(column).n_unique
188
- end
189
- end
190
-
191
- # Get the first value.
192
- #
193
- # @return [Object]
194
- def first(column = nil)
195
- if column.nil?
196
- return Utils.wrap_expr(RbExpr.first)
197
- end
198
-
199
- if column.is_a?(Series)
200
- if column.len > 0
201
- column[0]
202
- else
203
- raise IndexError, "The series is empty, so no first value can be returned."
204
- end
205
- else
206
- col(column).first
207
- end
208
- end
209
-
210
- # Get the last value.
211
- #
212
- # Depending on the input type this function does different things:
213
- #
214
- # - nil -> expression to take last column of a context.
215
- # - String -> syntactic sugar for `Polars.col(..).last`
216
- # - Series -> Take last value in `Series`
217
- #
218
- # @return [Object]
219
- def last(column = nil)
220
- if column.nil?
221
- return Utils.wrap_expr(_last)
222
- end
223
-
224
- if column.is_a?(Series)
225
- if column.len > 0
226
- return column[-1]
227
- else
228
- raise IndexError, "The series is empty, so no last value can be returned"
229
- end
230
- end
231
- col(column).last
232
- end
233
-
234
- # Get the first `n` rows.
235
- #
236
- # @param column [Object]
237
- # Column name or Series.
238
- # @param n [Integer]
239
- # Number of rows to return.
240
- #
241
- # @return [Object]
242
- def head(column, n = 10)
243
- if column.is_a?(Series)
244
- column.head(n)
245
- else
246
- col(column).head(n)
247
- end
248
- end
249
-
250
- # Get the last `n` rows.
251
- #
252
- # @param column [Object]
253
- # Column name or Series.
254
- # @param n [Integer]
255
- # Number of rows to return.
256
- #
257
- # @return [Object]
258
- def tail(column, n = 10)
259
- if column.is_a?(Series)
260
- column.tail(n)
261
- else
262
- col(column).tail(n)
263
- end
264
- end
265
-
266
- # Return an expression representing a literal value.
267
- #
268
- # @return [Expr]
269
- def lit(value, dtype: nil, allow_object: nil)
270
- if value.is_a?(::Time) || value.is_a?(::DateTime)
271
- time_unit = dtype&.time_unit || "ns"
272
- time_zone = dtype.&time_zone
273
- e = lit(Utils._datetime_to_pl_timestamp(value, time_unit)).cast(Datetime.new(time_unit))
274
- if time_zone
275
- return e.dt.replace_time_zone(time_zone.to_s)
276
- else
277
- return e
278
- end
279
- elsif value.is_a?(::Date)
280
- return lit(::Time.utc(value.year, value.month, value.day)).cast(Date)
281
- elsif value.is_a?(Polars::Series)
282
- name = value.name
283
- value = value._s
284
- e = Utils.wrap_expr(RbExpr.lit(value, allow_object))
285
- if name == ""
286
- return e
287
- end
288
- return e.alias(name)
289
- elsif (defined?(Numo::NArray) && value.is_a?(Numo::NArray)) || value.is_a?(::Array)
290
- return lit(Series.new("", value))
291
- elsif dtype
292
- return Utils.wrap_expr(RbExpr.lit(value, allow_object)).cast(dtype)
293
- end
294
-
295
- Utils.wrap_expr(RbExpr.lit(value, allow_object))
296
- end
297
-
298
- # Cumulatively sum values in a column/Series, or horizontally across list of columns/expressions.
299
- #
300
- # @param column [Object]
301
- # Column(s) to be used in aggregation.
302
- #
303
- # @return [Object]
304
- #
305
- # @example
306
- # df = Polars::DataFrame.new(
307
- # {
308
- # "a" => [1, 2],
309
- # "b" => [3, 4],
310
- # "c" => [5, 6]
311
- # }
312
- # )
313
- # # =>
314
- # # shape: (2, 3)
315
- # # ┌─────┬─────┬─────┐
316
- # # │ a ┆ b ┆ c │
317
- # # │ --- ┆ --- ┆ --- │
318
- # # │ i64 ┆ i64 ┆ i64 │
319
- # # ╞═════╪═════╪═════╡
320
- # # │ 1 ┆ 3 ┆ 5 │
321
- # # │ 2 ┆ 4 ┆ 6 │
322
- # # └─────┴─────┴─────┘
323
- #
324
- # @example Cumulatively sum a column by name:
325
- # df.select(Polars.cumsum("a"))
326
- # # =>
327
- # # shape: (2, 1)
328
- # # ┌─────┐
329
- # # │ a │
330
- # # │ --- │
331
- # # │ i64 │
332
- # # ╞═════╡
333
- # # │ 1 │
334
- # # │ 3 │
335
- # # └─────┘
336
- #
337
- # @example Cumulatively sum a list of columns/expressions horizontally:
338
- # df.with_column(Polars.cumsum(["a", "c"]))
339
- # # =>
340
- # # shape: (2, 4)
341
- # # ┌─────┬─────┬─────┬───────────┐
342
- # # │ a ┆ b ┆ c ┆ cumsum │
343
- # # │ --- ┆ --- ┆ --- ┆ --- │
344
- # # │ i64 ┆ i64 ┆ i64 ┆ struct[2] │
345
- # # ╞═════╪═════╪═════╪═══════════╡
346
- # # │ 1 ┆ 3 ┆ 5 ┆ {1,6} │
347
- # # │ 2 ┆ 4 ┆ 6 ┆ {2,8} │
348
- # # └─────┴─────┴─────┴───────────┘
349
- def cumsum(column)
350
- if column.is_a?(Series)
351
- column.cumsum
352
- elsif Utils.strlike?(column)
353
- col(column).cumsum
354
- else
355
- cumfold(lit(0).cast(:u32), ->(a, b) { a + b }, column).alias("cumsum")
356
- end
357
- end
358
-
359
- # Compute the Spearman rank correlation between two columns.
360
- #
361
- # Missing data will be excluded from the computation.
362
- #
363
- # @param a [Object]
364
- # Column name or Expression.
365
- # @param b [Object]
366
- # Column name or Expression.
367
- # @param ddof [Integer]
368
- # Delta degrees of freedom
369
- # @param propagate_nans [Boolean]
370
- # If `true` any `NaN` encountered will lead to `NaN` in the output.
371
- # Defaults to `false` where `NaN` are regarded as larger than any finite number
372
- # and thus lead to the highest rank.
373
- #
374
- # @return [Expr]
375
- def spearman_rank_corr(a, b, ddof: 1, propagate_nans: false)
376
- if Utils.strlike?(a)
377
- a = col(a)
378
- end
379
- if Utils.strlike?(b)
380
- b = col(b)
381
- end
382
- Utils.wrap_expr(RbExpr.spearman_rank_corr(a._rbexpr, b._rbexpr, ddof, propagate_nans))
383
- end
384
-
385
- # Compute the Pearson correlation between two columns.
386
- #
387
- # @param a [Object]
388
- # Column name or Expression.
389
- # @param b [Object]
390
- # Column name or Expression.
391
- # @param ddof [Integer]
392
- # Delta degrees of freedom
393
- #
394
- # @return [Expr]
395
- def pearson_corr(a, b, ddof: 1)
396
- if Utils.strlike?(a)
397
- a = col(a)
398
- end
399
- if Utils.strlike?(b)
400
- b = col(b)
401
- end
402
- Utils.wrap_expr(RbExpr.pearson_corr(a._rbexpr, b._rbexpr, ddof))
403
- end
404
-
405
- # Compute the covariance between two columns/expressions.
406
- #
407
- # @param a [Object]
408
- # Column name or Expression.
409
- # @param b [Object]
410
- # Column name or Expression.
411
- #
412
- # @return [Expr]
413
- def cov(a, b)
414
- if Utils.strlike?(a)
415
- a = col(a)
416
- end
417
- if Utils.strlike?(b)
418
- b = col(b)
419
- end
420
- Utils.wrap_expr(RbExpr.cov(a._rbexpr, b._rbexpr))
421
- end
422
-
423
- # def map
424
- # end
425
-
426
- # def apply
427
- # end
428
-
429
- # Accumulate over multiple columns horizontally/row wise with a left fold.
430
- #
431
- # @return [Expr]
432
- def fold(acc, f, exprs)
433
- acc = Utils.expr_to_lit_or_expr(acc, str_to_lit: true)
434
- if exprs.is_a?(Expr)
435
- exprs = [exprs]
436
- end
437
-
438
- exprs = Utils.selection_to_rbexpr_list(exprs)
439
- Utils.wrap_expr(RbExpr.fold(acc._rbexpr, f, exprs))
440
- end
441
-
442
- # def reduce
443
- # end
444
-
445
- # Cumulatively accumulate over multiple columns horizontally/row wise with a left fold.
446
- #
447
- # Every cumulative result is added as a separate field in a Struct column.
448
- #
449
- # @param acc [Object]
450
- # Accumulator Expression. This is the value that will be initialized when the fold
451
- # starts. For a sum this could for instance be lit(0).
452
- # @param f [Object]
453
- # Function to apply over the accumulator and the value.
454
- # Fn(acc, value) -> new_value
455
- # @param exprs [Object]
456
- # Expressions to aggregate over. May also be a wildcard expression.
457
- # @param include_init [Boolean]
458
- # Include the initial accumulator state as struct field.
459
- #
460
- # @return [Object]
461
- #
462
- # @note
463
- # If you simply want the first encountered expression as accumulator,
464
- # consider using `cumreduce`.
465
- def cumfold(acc, f, exprs, include_init: false)
466
- acc = Utils.expr_to_lit_or_expr(acc, str_to_lit: true)
467
- if exprs.is_a?(Expr)
468
- exprs = [exprs]
469
- end
470
-
471
- exprs = Utils.selection_to_rbexpr_list(exprs)
472
- Utils.wrap_expr(RbExpr.cumfold(acc._rbexpr, f, exprs, include_init))
473
- end
474
-
475
- # def cumreduce
476
- # end
477
-
478
- # Evaluate columnwise or elementwise with a bitwise OR operation.
479
- #
480
- # @return [Expr]
481
- def any(name)
482
- if Utils.strlike?(name)
483
- col(name).any
484
- else
485
- fold(lit(false), ->(a, b) { a.cast(:bool) | b.cast(:bool) }, name).alias("any")
486
- end
487
- end
488
-
489
- # Exclude certain columns from a wildcard/regex selection.
490
- #
491
- # @param columns [Object]
492
- # Column(s) to exclude from selection
493
- # This can be:
494
- #
495
- # - a column name, or multiple column names
496
- # - a regular expression starting with `^` and ending with `$`
497
- # - a dtype or multiple dtypes
498
- #
499
- # @return [Object]
500
- #
501
- # @example
502
- # df = Polars::DataFrame.new(
503
- # {
504
- # "aa" => [1, 2, 3],
505
- # "ba" => ["a", "b", nil],
506
- # "cc" => [nil, 2.5, 1.5]
507
- # }
508
- # )
509
- # # =>
510
- # # shape: (3, 3)
511
- # # ┌─────┬──────┬──────┐
512
- # # │ aa ┆ ba ┆ cc │
513
- # # │ --- ┆ --- ┆ --- │
514
- # # │ i64 ┆ str ┆ f64 │
515
- # # ╞═════╪══════╪══════╡
516
- # # │ 1 ┆ a ┆ null │
517
- # # │ 2 ┆ b ┆ 2.5 │
518
- # # │ 3 ┆ null ┆ 1.5 │
519
- # # └─────┴──────┴──────┘
520
- #
521
- # @example Exclude by column name(s):
522
- # df.select(Polars.exclude("ba"))
523
- # # =>
524
- # # shape: (3, 2)
525
- # # ┌─────┬──────┐
526
- # # │ aa ┆ cc │
527
- # # │ --- ┆ --- │
528
- # # │ i64 ┆ f64 │
529
- # # ╞═════╪══════╡
530
- # # │ 1 ┆ null │
531
- # # │ 2 ┆ 2.5 │
532
- # # │ 3 ┆ 1.5 │
533
- # # └─────┴──────┘
534
- #
535
- # @example Exclude by regex, e.g. removing all columns whose names end with the letter "a":
536
- # df.select(Polars.exclude("^.*a$"))
537
- # # =>
538
- # # shape: (3, 1)
539
- # # ┌──────┐
540
- # # │ cc │
541
- # # │ --- │
542
- # # │ f64 │
543
- # # ╞══════╡
544
- # # │ null │
545
- # # │ 2.5 │
546
- # # │ 1.5 │
547
- # # └──────┘
548
- def exclude(columns)
549
- col("*").exclude(columns)
550
- end
551
-
552
- # Do one of two things.
553
- #
554
- # * function can do a columnwise or elementwise AND operation
555
- # * a wildcard column selection
556
- #
557
- # @param name [Object]
558
- # If given this function will apply a bitwise & on the columns.
559
- #
560
- # @return [Expr]
561
- #
562
- # @example Sum all columns
563
- # df = Polars::DataFrame.new(
564
- # {"a" => [1, 2, 3], "b" => ["hello", "foo", "bar"], "c" => [1, 1, 1]}
565
- # )
566
- # df.select(Polars.all.sum)
567
- # # =>
568
- # # shape: (1, 3)
569
- # # ┌─────┬──────┬─────┐
570
- # # │ a ┆ b ┆ c │
571
- # # │ --- ┆ --- ┆ --- │
572
- # # │ i64 ┆ str ┆ i64 │
573
- # # ╞═════╪══════╪═════╡
574
- # # │ 6 ┆ null ┆ 3 │
575
- # # └─────┴──────┴─────┘
576
- def all(name = nil)
577
- if name.nil?
578
- col("*")
579
- elsif Utils.strlike?(name)
580
- col(name).all
581
- else
582
- raise Todo
583
- end
584
- end
585
-
586
- # Syntactic sugar for `Polars.col("foo").agg_groups`.
587
- #
588
- # @return [Object]
589
- def groups(column)
590
- col(column).agg_groups
591
- end
592
-
593
- # Syntactic sugar for `Polars.col("foo").quantile(...)`.
594
- #
595
- # @param column [String]
596
- # Column name.
597
- # @param quantile [Float]
598
- # Quantile between 0.0 and 1.0.
599
- # @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
600
- # Interpolation method.
601
- #
602
- # @return [Expr]
603
- def quantile(column, quantile, interpolation: "nearest")
604
- col(column).quantile(quantile, interpolation: interpolation)
605
- end
606
-
607
- # Create a range expression (or Series).
608
- #
609
- # This can be used in a `select`, `with_column`, etc. Be sure that the resulting
610
- # range size is equal to the length of the DataFrame you are collecting.
611
- #
612
- # @param start [Integer, Expr, Series]
613
- # Lower bound of range.
614
- # @param stop [Integer, Expr, Series]
615
- # Upper bound of range.
616
- # @param step [Integer]
617
- # Step size of the range.
618
- # @param eager [Boolean]
619
- # If eager evaluation is `true`, a Series is returned instead of an Expr.
620
- # @param dtype [Symbol]
621
- # Apply an explicit integer dtype to the resulting expression (default is `Int64`).
622
- #
623
- # @return [Expr, Series]
624
- #
625
- # @example
626
- # Polars.arange(0, 3, eager: true)
627
- # # =>
628
- # # shape: (3,)
629
- # # Series: 'arange' [i64]
630
- # # [
631
- # # 0
632
- # # 1
633
- # # 2
634
- # # ]
635
- def int_range(start, stop, step: 1, eager: false, dtype: nil)
636
- start = Utils.parse_as_expression(start)
637
- stop = Utils.parse_as_expression(stop)
638
- dtype ||= Int64
639
- dtype = dtype.to_s if dtype.is_a?(Symbol)
640
- result = Utils.wrap_expr(RbExpr.int_range(start, stop, step, dtype)).alias("arange")
641
-
642
- if eager
643
- return select(result).to_series
644
- end
645
-
646
- result
647
- end
648
- alias_method :arange, :int_range
649
-
650
- # Find the indexes that would sort the columns.
651
- #
652
- # Argsort by multiple columns. The first column will be used for the ordering.
653
- # If there are duplicates in the first column, the second column will be used to
654
- # determine the ordering and so on.
655
- #
656
- # @param exprs [Object]
657
- # Columns used to determine the ordering.
658
- # @param reverse [Boolean]
659
- # Default is ascending.
660
- #
661
- # @return [Expr]
662
- def arg_sort_by(exprs, reverse: false)
663
- if !exprs.is_a?(::Array)
664
- exprs = [exprs]
665
- end
666
- if reverse == true || reverse == false
667
- reverse = [reverse] * exprs.length
668
- end
669
- exprs = Utils.selection_to_rbexpr_list(exprs)
670
- Utils.wrap_expr(RbExpr.arg_sort_by(exprs, reverse))
671
- end
672
- alias_method :argsort_by, :arg_sort_by
673
-
674
- # Create polars `Duration` from distinct time components.
675
- #
676
- # @return [Expr]
677
- #
678
- # @example
679
- # df = Polars::DataFrame.new(
680
- # {
681
- # "datetime" => [DateTime.new(2022, 1, 1), DateTime.new(2022, 1, 2)],
682
- # "add" => [1, 2]
683
- # }
684
- # )
685
- # df.select(
686
- # [
687
- # (Polars.col("datetime") + Polars.duration(weeks: "add")).alias("add_weeks"),
688
- # (Polars.col("datetime") + Polars.duration(days: "add")).alias("add_days"),
689
- # (Polars.col("datetime") + Polars.duration(seconds: "add")).alias("add_seconds"),
690
- # (Polars.col("datetime") + Polars.duration(milliseconds: "add")).alias(
691
- # "add_milliseconds"
692
- # ),
693
- # (Polars.col("datetime") + Polars.duration(hours: "add")).alias("add_hours")
694
- # ]
695
- # )
696
- # # =>
697
- # # shape: (2, 5)
698
- # # ┌─────────────────────┬─────────────────────┬─────────────────────┬─────────────────────────┬─────────────────────┐
699
- # # │ add_weeks ┆ add_days ┆ add_seconds ┆ add_milliseconds ┆ add_hours │
700
- # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
701
- # # │ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] │
702
- # # ╞═════════════════════╪═════════════════════╪═════════════════════╪═════════════════════════╪═════════════════════╡
703
- # # │ 2022-01-08 00:00:00 ┆ 2022-01-02 00:00:00 ┆ 2022-01-01 00:00:01 ┆ 2022-01-01 00:00:00.001 ┆ 2022-01-01 01:00:00 │
704
- # # │ 2022-01-16 00:00:00 ┆ 2022-01-04 00:00:00 ┆ 2022-01-02 00:00:02 ┆ 2022-01-02 00:00:00.002 ┆ 2022-01-02 02:00:00 │
705
- # # └─────────────────────┴─────────────────────┴─────────────────────┴─────────────────────────┴─────────────────────┘
706
- def duration(
707
- weeks: nil,
708
- days: nil,
709
- hours: nil,
710
- minutes: nil,
711
- seconds: nil,
712
- milliseconds: nil,
713
- microseconds: nil,
714
- nanoseconds: nil,
715
- time_unit: "us"
716
- )
717
- if !weeks.nil?
718
- weeks = Utils.expr_to_lit_or_expr(weeks, str_to_lit: false)._rbexpr
719
- end
720
- if !days.nil?
721
- days = Utils.expr_to_lit_or_expr(days, str_to_lit: false)._rbexpr
722
- end
723
- if !hours.nil?
724
- hours = Utils.expr_to_lit_or_expr(hours, str_to_lit: false)._rbexpr
725
- end
726
- if !minutes.nil?
727
- minutes = Utils.expr_to_lit_or_expr(minutes, str_to_lit: false)._rbexpr
728
- end
729
- if !seconds.nil?
730
- seconds = Utils.expr_to_lit_or_expr(seconds, str_to_lit: false)._rbexpr
731
- end
732
- if !milliseconds.nil?
733
- milliseconds = Utils.expr_to_lit_or_expr(milliseconds, str_to_lit: false)._rbexpr
734
- end
735
- if !microseconds.nil?
736
- microseconds = Utils.expr_to_lit_or_expr(microseconds, str_to_lit: false)._rbexpr
737
- end
738
- if !nanoseconds.nil?
739
- nanoseconds = Utils.expr_to_lit_or_expr(nanoseconds, str_to_lit: false)._rbexpr
740
- end
741
-
742
- Utils.wrap_expr(
743
- _rb_duration(
744
- weeks,
745
- days,
746
- hours,
747
- minutes,
748
- seconds,
749
- milliseconds,
750
- microseconds,
751
- nanoseconds,
752
- time_unit
753
- )
754
- )
755
- end
756
-
757
- # Horizontally concat Utf8 Series in linear time. Non-Utf8 columns are cast to Utf8.
758
- #
759
- # @param exprs [Object]
760
- # Columns to concat into a Utf8 Series.
761
- # @param sep [String]
762
- # String value that will be used to separate the values.
763
- #
764
- # @return [Expr]
765
- #
766
- # @example
767
- # df = Polars::DataFrame.new(
768
- # {
769
- # "a" => [1, 2, 3],
770
- # "b" => ["dogs", "cats", nil],
771
- # "c" => ["play", "swim", "walk"]
772
- # }
773
- # )
774
- # df.with_columns(
775
- # [
776
- # Polars.concat_str(
777
- # [
778
- # Polars.col("a") * 2,
779
- # Polars.col("b"),
780
- # Polars.col("c")
781
- # ],
782
- # sep: " "
783
- # ).alias("full_sentence")
784
- # ]
785
- # )
786
- # # =>
787
- # # shape: (3, 4)
788
- # # ┌─────┬──────┬──────┬───────────────┐
789
- # # │ a ┆ b ┆ c ┆ full_sentence │
790
- # # │ --- ┆ --- ┆ --- ┆ --- │
791
- # # │ i64 ┆ str ┆ str ┆ str │
792
- # # ╞═════╪══════╪══════╪═══════════════╡
793
- # # │ 1 ┆ dogs ┆ play ┆ 2 dogs play │
794
- # # │ 2 ┆ cats ┆ swim ┆ 4 cats swim │
795
- # # │ 3 ┆ null ┆ walk ┆ null │
796
- # # └─────┴──────┴──────┴───────────────┘
797
- def concat_str(exprs, sep: "")
798
- exprs = Utils.selection_to_rbexpr_list(exprs)
799
- return Utils.wrap_expr(RbExpr.concat_str(exprs, sep))
800
- end
801
-
802
- # Format expressions as a string.
803
- #
804
- # @param fstring [String]
805
- # A string with placeholders.
806
- # For example: "hello_{}" or "{}_world"
807
- # @param args [Object]
808
- # Expression(s) that fill the placeholders
809
- #
810
- # @return [Expr]
811
- #
812
- # @example
813
- # df = Polars::DataFrame.new(
814
- # {
815
- # "a": ["a", "b", "c"],
816
- # "b": [1, 2, 3]
817
- # }
818
- # )
819
- # df.select(
820
- # [
821
- # Polars.format("foo_{}_bar_{}", Polars.col("a"), "b").alias("fmt")
822
- # ]
823
- # )
824
- # # =>
825
- # # shape: (3, 1)
826
- # # ┌─────────────┐
827
- # # │ fmt │
828
- # # │ --- │
829
- # # │ str │
830
- # # ╞═════════════╡
831
- # # │ foo_a_bar_1 │
832
- # # │ foo_b_bar_2 │
833
- # # │ foo_c_bar_3 │
834
- # # └─────────────┘
835
- def format(fstring, *args)
836
- if fstring.scan("{}").length != args.length
837
- raise ArgumentError, "number of placeholders should equal the number of arguments"
838
- end
839
-
840
- exprs = []
841
-
842
- arguments = args.each
843
- fstring.split(/(\{\})/).each do |s|
844
- if s == "{}"
845
- e = Utils.expr_to_lit_or_expr(arguments.next, str_to_lit: false)
846
- exprs << e
847
- elsif s.length > 0
848
- exprs << lit(s)
849
- end
850
- end
851
-
852
- concat_str(exprs, sep: "")
853
- end
854
-
855
- # Concat the arrays in a Series dtype List in linear time.
856
- #
857
- # @return [Expr]
858
- def concat_list(exprs)
859
- exprs = Utils.selection_to_rbexpr_list(exprs)
860
- Utils.wrap_expr(RbExpr.concat_lst(exprs))
861
- end
862
-
863
- # Collect multiple LazyFrames at the same time.
864
- #
865
- # This runs all the computation graphs in parallel on Polars threadpool.
866
- #
867
- # @param lazy_frames [Array]
868
- # A list of LazyFrames to collect.
869
- # @param type_coercion [Boolean]
870
- # Do type coercion optimization.
871
- # @param predicate_pushdown [Boolean]
872
- # Do predicate pushdown optimization.
873
- # @param projection_pushdown [Boolean]
874
- # Do projection pushdown optimization.
875
- # @param simplify_expression [Boolean]
876
- # Run simplify expressions optimization.
877
- # @param string_cache [Boolean]
878
- # This argument is deprecated and will be ignored
879
- # @param no_optimization [Boolean]
880
- # Turn off optimizations.
881
- # @param slice_pushdown [Boolean]
882
- # Slice pushdown optimization.
883
- # @param common_subplan_elimination [Boolean]
884
- # Will try to cache branching subplans that occur on self-joins or unions.
885
- # @param allow_streaming [Boolean]
886
- # Run parts of the query in a streaming fashion (this is in an alpha state)
887
- #
888
- # @return [Array]
889
- def collect_all(
890
- lazy_frames,
891
- type_coercion: true,
892
- predicate_pushdown: true,
893
- projection_pushdown: true,
894
- simplify_expression: true,
895
- string_cache: false,
896
- no_optimization: false,
897
- slice_pushdown: true,
898
- common_subplan_elimination: true,
899
- allow_streaming: false
900
- )
901
- if no_optimization
902
- predicate_pushdown = false
903
- projection_pushdown = false
904
- slice_pushdown = false
905
- common_subplan_elimination = false
906
- end
907
-
908
- prepared = []
909
-
910
- lazy_frames.each do |lf|
911
- ldf = lf._ldf.optimization_toggle(
912
- type_coercion,
913
- predicate_pushdown,
914
- projection_pushdown,
915
- simplify_expression,
916
- slice_pushdown,
917
- common_subplan_elimination,
918
- allow_streaming,
919
- false
920
- )
921
- prepared << ldf
922
- end
923
-
924
- out = _collect_all(prepared)
925
-
926
- # wrap the rbdataframes into dataframe
927
- result = out.map { |rbdf| Utils.wrap_df(rbdf) }
928
-
929
- result
930
- end
931
-
932
- # Run polars expressions without a context.
933
- #
934
- # @return [DataFrame]
935
- def select(exprs)
936
- DataFrame.new([]).select(exprs)
937
- end
938
-
939
- # Collect several columns into a Series of dtype Struct.
940
- #
941
- # @param exprs [Object]
942
- # Columns/Expressions to collect into a Struct
943
- # @param eager [Boolean]
944
- # Evaluate immediately
945
- #
946
- # @return [Object]
947
- #
948
- # @example
949
- # Polars::DataFrame.new(
950
- # {
951
- # "int" => [1, 2],
952
- # "str" => ["a", "b"],
953
- # "bool" => [true, nil],
954
- # "list" => [[1, 2], [3]],
955
- # }
956
- # ).select([Polars.struct(Polars.all).alias("my_struct")])
957
- # # =>
958
- # # shape: (2, 1)
959
- # # ┌─────────────────────┐
960
- # # │ my_struct │
961
- # # │ --- │
962
- # # │ struct[4] │
963
- # # ╞═════════════════════╡
964
- # # │ {1,"a",true,[1, 2]} │
965
- # # │ {2,"b",null,[3]} │
966
- # # └─────────────────────┘
967
- #
968
- # @example Only collect specific columns as a struct:
969
- # df = Polars::DataFrame.new(
970
- # {"a" => [1, 2, 3, 4], "b" => ["one", "two", "three", "four"], "c" => [9, 8, 7, 6]}
971
- # )
972
- # df.with_column(Polars.struct(Polars.col(["a", "b"])).alias("a_and_b"))
973
- # # =>
974
- # # shape: (4, 4)
975
- # # ┌─────┬───────┬─────┬─────────────┐
976
- # # │ a ┆ b ┆ c ┆ a_and_b │
977
- # # │ --- ┆ --- ┆ --- ┆ --- │
978
- # # │ i64 ┆ str ┆ i64 ┆ struct[2] │
979
- # # ╞═════╪═══════╪═════╪═════════════╡
980
- # # │ 1 ┆ one ┆ 9 ┆ {1,"one"} │
981
- # # │ 2 ┆ two ┆ 8 ┆ {2,"two"} │
982
- # # │ 3 ┆ three ┆ 7 ┆ {3,"three"} │
983
- # # │ 4 ┆ four ┆ 6 ┆ {4,"four"} │
984
- # # └─────┴───────┴─────┴─────────────┘
985
- def struct(exprs, eager: false)
986
- if eager
987
- Polars.select(struct(exprs, eager: false)).to_series
988
- end
989
- exprs = Utils.selection_to_rbexpr_list(exprs)
990
- Utils.wrap_expr(_as_struct(exprs))
991
- end
992
-
993
- # Repeat a single value n times.
994
- #
995
- # @param value [Object]
996
- # Value to repeat.
997
- # @param n [Integer]
998
- # Repeat `n` times.
999
- # @param eager [Boolean]
1000
- # Run eagerly and collect into a `Series`.
1001
- # @param name [String]
1002
- # Only used in `eager` mode. As expression, use `alias`.
1003
- #
1004
- # @return [Expr, Series]
1005
- def repeat(value, n, dtype: nil, eager: false, name: nil)
1006
- if !name.nil?
1007
- warn "the `name` argument is deprecated. Use the `alias` method instead."
1008
- end
1009
-
1010
- if n.is_a?(Integer)
1011
- n = lit(n)
1012
- end
1013
-
1014
- value = Utils.parse_as_expression(value, str_as_lit: true)
1015
- expr = Utils.wrap_expr(RbExpr.repeat(value, n._rbexpr, dtype))
1016
- if !name.nil?
1017
- expr = expr.alias(name)
1018
- end
1019
- if eager
1020
- return select(expr).to_series
1021
- end
1022
- expr
1023
- end
1024
-
1025
# Return indices where `condition` evaluates `true`.
#
# @param condition [Expr, Series]
#   Boolean expression to evaluate. Must be a `Series` when `eager` is true.
# @param eager [Boolean]
#   Whether to apply this function eagerly (as opposed to lazily).
#
# @return [Expr, Series]
#
# @raise [ArgumentError]
#   If `eager` is true and `condition` is not a `Series`.
#
# @example
#   df = Polars::DataFrame.new({"a" => [1, 2, 3, 4, 5]})
#   df.select(
#     [
#       Polars.arg_where(Polars.col("a") % 2 == 0)
#     ]
#   ).to_series
#   # =>
#   # shape: (2,)
#   # Series: 'a' [u32]
#   # [
#   #         1
#   #         3
#   # ]
def arg_where(condition, eager: false)
  unless eager
    condition = Utils.expr_to_lit_or_expr(condition, str_to_lit: true)
    return Utils.wrap_expr(_arg_where(condition._rbexpr))
  end

  unless condition.is_a?(Series)
    # The message uses the Ruby keyword spelling (`eager: true`), matching
    # the wording used by `from_epoch`.
    raise ArgumentError, "expected 'Series' in 'arg_where' if 'eager: true', got #{condition.class.name}"
  end

  # Evaluate the lazy form against a one-column frame built from the Series.
  condition.to_frame.select(arg_where(Polars.col(condition.name))).to_series
end
1059
-
1060
# Folds the expressions from left to right, keeping the first non-null value.
#
# @param exprs [Object]
#   Expressions to coalesce.
# @param more_exprs [Array<Object>]
#   Additional expressions to coalesce, given as positional arguments.
#
# @return [Expr]
#
# @example
#   df = Polars::DataFrame.new(
#     [
#       [nil, 1.0, 1.0],
#       [nil, 2.0, 2.0],
#       [nil, nil, 3.0],
#       [nil, nil, nil]
#     ],
#     columns: [["a", :f64], ["b", :f64], ["c", :f64]]
#   )
#   df.with_column(Polars.coalesce(["a", "b", "c", 99.9]).alias("d"))
#   # =>
#   # shape: (4, 4)
#   # ┌──────┬──────┬──────┬──────┐
#   # │ a    ┆ b    ┆ c    ┆ d    │
#   # │ ---  ┆ ---  ┆ ---  ┆ ---  │
#   # │ f64  ┆ f64  ┆ f64  ┆ f64  │
#   # ╞══════╪══════╪══════╪══════╡
#   # │ null ┆ 1.0  ┆ 1.0  ┆ 1.0  │
#   # │ null ┆ 2.0  ┆ 2.0  ┆ 2.0  │
#   # │ null ┆ null ┆ 3.0  ┆ 3.0  │
#   # │ null ┆ null ┆ null ┆ 99.9 │
#   # └──────┴──────┴──────┴──────┘
def coalesce(exprs, *more_exprs)
  rbexprs = Utils.selection_to_rbexpr_list(exprs)

  # Test for emptiness rather than `any?`: `any?` is false when every extra
  # argument is nil or false, which would silently drop those arguments.
  # `+=` builds a new list instead of mutating the helper's return value.
  rbexprs += Utils.selection_to_rbexpr_list(more_exprs) unless more_exprs.empty?

  Utils.wrap_expr(_coalesce_exprs(rbexprs))
end
1097
-
1098
# Utility function that parses an epoch timestamp (or Unix time) to Polars Date(time).
#
# Depending on the `unit` provided, this function will return a different dtype:
# - unit: "d" returns Polars::Date
# - unit: "s" returns Polars::Datetime["us"]
# - unit: "ms" returns Polars::Datetime["ms"]
# - unit: "us" returns Polars::Datetime["us"]
# - unit: "ns" returns Polars::Datetime["ns"]
#
# @param column [Object]
#   Series or expression to parse integers to Datetime. A string-like value
#   is treated as a column name; any other non-Series, non-Expr value is
#   passed to `Series.new`.
# @param unit [String]
#   The unit of the timesteps since epoch time.
# @param eager [Boolean]
#   If eager evaluation is `true`, a Series is returned instead of an Expr.
#
# @return [Object]
#
# @raise [ArgumentError]
#   If `unit` is not supported, or if `eager` is true and `column` did not
#   resolve to a Series.
#
# @example
#   df = Polars::DataFrame.new({"timestamp" => [1666683077, 1666683099]}).lazy
#   df.select(Polars.from_epoch(Polars.col("timestamp"), unit: "s")).collect
#   # =>
#   # shape: (2, 1)
#   # ┌─────────────────────┐
#   # │ timestamp           │
#   # │ ---                 │
#   # │ datetime[μs]        │
#   # ╞═════════════════════╡
#   # │ 2022-10-25 07:31:17 │
#   # │ 2022-10-25 07:31:39 │
#   # └─────────────────────┘
def from_epoch(column, unit: "s", eager: false)
  if Utils.strlike?(column)
    column = col(column)
  elsif !column.is_a?(Series) && !column.is_a?(Expr)
    column = Series.new(column)
  end

  expr =
    case unit
    when "d"
      column.cast(Date)
    when "s"
      # Seconds are scaled to microseconds before casting to a "us" Datetime.
      (column.cast(Int64) * 1_000_000).cast(Datetime.new("us"))
    else
      unless Utils::DTYPE_TEMPORAL_UNITS.include?(unit)
        # Single braces: the doubled `{{ }}` was a Python f-string escape
        # that Ruby would print literally.
        raise ArgumentError, "'unit' must be one of {'ns', 'us', 'ms', 's', 'd'}, got '#{unit}'."
      end
      column.cast(Datetime.new(unit))
    end

  return expr unless eager

  # Eager evaluation needs concrete data, so a column-name string or an Expr
  # is rejected here.
  unless column.is_a?(Series)
    raise ArgumentError, "expected Series or Array if eager: true, got #{column.class.name}"
  end

  column.to_frame.select(expr).to_series
end
1156
-
1157
# Begin a "when, then, otherwise" conditional expression.
#
# @param expr [Object]
#   Condition to branch on; it is converted with `Utils.expr_to_lit_or_expr`
#   before use.
#
# @return [When]
#
# @example
#   df = Polars::DataFrame.new({"foo" => [1, 3, 4], "bar" => [3, 4, 0]})
#   df.with_column(Polars.when(Polars.col("foo") > 2).then(Polars.lit(1)).otherwise(Polars.lit(-1)))
#   # =>
#   # shape: (3, 3)
#   # ┌─────┬─────┬─────────┐
#   # │ foo ┆ bar ┆ literal │
#   # │ --- ┆ --- ┆ ---     │
#   # │ i64 ┆ i64 ┆ i32     │
#   # ╞═════╪═════╪═════════╡
#   # │ 1   ┆ 3   ┆ -1      │
#   # │ 3   ┆ 4   ┆ 1       │
#   # │ 4   ┆ 0   ┆ 1       │
#   # └─────┴─────┴─────────┘
def when(expr)
  predicate = Utils.expr_to_lit_or_expr(expr)
  When.new(RbExpr.when(predicate._rbexpr))
end
1180
- end
1181
- end