polars-df 0.8.0-arm64-darwin → 0.10.0-arm64-darwin

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +42 -1
  3. data/Cargo.lock +159 -66
  4. data/Cargo.toml +0 -3
  5. data/LICENSE-THIRD-PARTY.txt +3112 -1613
  6. data/LICENSE.txt +1 -1
  7. data/README.md +3 -2
  8. data/lib/polars/3.1/polars.bundle +0 -0
  9. data/lib/polars/3.2/polars.bundle +0 -0
  10. data/lib/polars/3.3/polars.bundle +0 -0
  11. data/lib/polars/array_expr.rb +453 -0
  12. data/lib/polars/array_name_space.rb +346 -0
  13. data/lib/polars/batched_csv_reader.rb +4 -2
  14. data/lib/polars/cat_expr.rb +24 -0
  15. data/lib/polars/cat_name_space.rb +75 -0
  16. data/lib/polars/config.rb +2 -2
  17. data/lib/polars/data_frame.rb +306 -96
  18. data/lib/polars/data_types.rb +191 -28
  19. data/lib/polars/date_time_expr.rb +41 -18
  20. data/lib/polars/date_time_name_space.rb +9 -3
  21. data/lib/polars/exceptions.rb +12 -1
  22. data/lib/polars/expr.rb +898 -215
  23. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  24. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  25. data/lib/polars/functions/as_datatype.rb +248 -0
  26. data/lib/polars/functions/col.rb +47 -0
  27. data/lib/polars/functions/eager.rb +182 -0
  28. data/lib/polars/functions/lazy.rb +1280 -0
  29. data/lib/polars/functions/len.rb +49 -0
  30. data/lib/polars/functions/lit.rb +35 -0
  31. data/lib/polars/functions/random.rb +16 -0
  32. data/lib/polars/functions/range/date_range.rb +103 -0
  33. data/lib/polars/functions/range/int_range.rb +51 -0
  34. data/lib/polars/functions/repeat.rb +144 -0
  35. data/lib/polars/functions/whenthen.rb +96 -0
  36. data/lib/polars/functions.rb +29 -416
  37. data/lib/polars/group_by.rb +2 -2
  38. data/lib/polars/io.rb +36 -31
  39. data/lib/polars/lazy_frame.rb +405 -88
  40. data/lib/polars/list_expr.rb +158 -8
  41. data/lib/polars/list_name_space.rb +102 -0
  42. data/lib/polars/meta_expr.rb +175 -7
  43. data/lib/polars/series.rb +282 -41
  44. data/lib/polars/string_cache.rb +75 -0
  45. data/lib/polars/string_expr.rb +413 -96
  46. data/lib/polars/string_name_space.rb +4 -4
  47. data/lib/polars/testing.rb +507 -0
  48. data/lib/polars/utils.rb +106 -8
  49. data/lib/polars/version.rb +1 -1
  50. data/lib/polars/whenthen.rb +83 -0
  51. data/lib/polars.rb +16 -4
  52. metadata +34 -6
  53. data/lib/polars/lazy_functions.rb +0 -1181
  54. data/lib/polars/when.rb +0 -16
  55. data/lib/polars/when_then.rb +0 -19
@@ -1,1181 +0,0 @@
1
- module Polars
2
- module LazyFunctions
3
- # Return an expression representing a column in a DataFrame.
4
- #
5
- # @return [Expr]
6
- def col(name)
7
- if name.is_a?(Series)
8
- name = name.to_a
9
- end
10
-
11
- if name.is_a?(Class) && name < DataType
12
- name = [name]
13
- end
14
-
15
- if name.is_a?(DataType)
16
- Utils.wrap_expr(_dtype_cols([name]))
17
- elsif name.is_a?(::Array)
18
- if name.length == 0 || Utils.strlike?(name[0])
19
- name = name.map { |v| v.is_a?(Symbol) ? v.to_s : v }
20
- Utils.wrap_expr(RbExpr.cols(name))
21
- elsif Utils.is_polars_dtype(name[0])
22
- Utils.wrap_expr(_dtype_cols(name))
23
- else
24
- raise ArgumentError, "Expected list values to be all `str` or all `DataType`"
25
- end
26
- else
27
- name = name.to_s if name.is_a?(Symbol)
28
- Utils.wrap_expr(RbExpr.col(name))
29
- end
30
- end
31
-
32
- # Alias for an element being evaluated in an `eval` expression.
33
- #
34
- # @return [Expr]
35
- #
36
- # @example A horizontal rank computation by taking the elements of a list
37
- # df = Polars::DataFrame.new({"a" => [1, 8, 3], "b" => [4, 5, 2]})
38
- # df.with_column(
39
- # Polars.concat_list(["a", "b"]).list.eval(Polars.element.rank).alias("rank")
40
- # )
41
- # # =>
42
- # # shape: (3, 3)
43
- # # ┌─────┬─────┬────────────┐
44
- # # │ a ┆ b ┆ rank │
45
- # # │ --- ┆ --- ┆ --- │
46
- # # │ i64 ┆ i64 ┆ list[f64] │
47
- # # ╞═════╪═════╪════════════╡
48
- # # │ 1 ┆ 4 ┆ [1.0, 2.0] │
49
- # # │ 8 ┆ 5 ┆ [2.0, 1.0] │
50
- # # │ 3 ┆ 2 ┆ [2.0, 1.0] │
51
- # # └─────┴─────┴────────────┘
52
- def element
53
- col("")
54
- end
55
-
56
- # Count the number of values in this column/context.
57
- #
58
- # @param column [String, Series, nil]
59
- # If dtype is:
60
- #
61
- # * `Series` : count the values in the series.
62
- # * `String` : count the values in this column.
63
- # * `nil` : count the number of values in this context.
64
- #
65
- # @return [Expr, Integer]
66
- def count(column = nil)
67
- if column.nil?
68
- return Utils.wrap_expr(RbExpr.count)
69
- end
70
-
71
- if column.is_a?(Series)
72
- column.len
73
- else
74
- col(column).count
75
- end
76
- end
77
-
78
- # Aggregate to list.
79
- #
80
- # @return [Expr]
81
- def to_list(name)
82
- col(name).list
83
- end
84
-
85
- # Get the standard deviation.
86
- #
87
- # @return [Object]
88
- def std(column, ddof: 1)
89
- if column.is_a?(Series)
90
- column.std(ddof: ddof)
91
- else
92
- col(column).std(ddof: ddof)
93
- end
94
- end
95
-
96
- # Get the variance.
97
- #
98
- # @return [Object]
99
- def var(column, ddof: 1)
100
- if column.is_a?(Series)
101
- column.var(ddof: ddof)
102
- else
103
- col(column).var(ddof: ddof)
104
- end
105
- end
106
-
107
- # Get the maximum value.
108
- #
109
- # @param column [Object]
110
- # Column(s) to be used in aggregation.
111
- #
112
- # @return [Expr, Object]
113
- def max(column)
114
- if column.is_a?(Series)
115
- column.max
116
- else
117
- col(column).max
118
- end
119
- end
120
-
121
- # Get the minimum value.
122
- #
123
- # @param column [Object]
124
- # Column(s) to be used in aggregation.
125
- #
126
- # @return [Expr, Object]
127
- def min(column)
128
- if column.is_a?(Series)
129
- column.min
130
- else
131
- col(column).min
132
- end
133
- end
134
-
135
- # Sum values in a column/Series, or horizontally across list of columns/expressions.
136
- #
137
- # @return [Object]
138
- def sum(column)
139
- if column.is_a?(Series)
140
- column.sum
141
- elsif Utils.strlike?(column)
142
- col(column.to_s).sum
143
- elsif column.is_a?(::Array)
144
- exprs = Utils.selection_to_rbexpr_list(column)
145
- Utils.wrap_expr(_sum_horizontal(exprs))
146
- else
147
- fold(lit(0).cast(:u32), ->(a, b) { a + b }, column).alias("sum")
148
- end
149
- end
150
-
151
- # Get the mean value.
152
- #
153
- # @return [Expr, Float]
154
- def mean(column)
155
- if column.is_a?(Series)
156
- column.mean
157
- else
158
- col(column).mean
159
- end
160
- end
161
-
162
- # Get the mean value.
163
- #
164
- # @return [Expr, Float]
165
- def avg(column)
166
- mean(column)
167
- end
168
-
169
- # Get the median value.
170
- #
171
- # @return [Object]
172
- def median(column)
173
- if column.is_a?(Series)
174
- column.median
175
- else
176
- col(column).median
177
- end
178
- end
179
-
180
- # Count unique values.
181
- #
182
- # @return [Object]
183
- def n_unique(column)
184
- if column.is_a?(Series)
185
- column.n_unique
186
- else
187
- col(column).n_unique
188
- end
189
- end
190
-
191
- # Get the first value.
192
- #
193
- # @return [Object]
194
- def first(column = nil)
195
- if column.nil?
196
- return Utils.wrap_expr(RbExpr.first)
197
- end
198
-
199
- if column.is_a?(Series)
200
- if column.len > 0
201
- column[0]
202
- else
203
- raise IndexError, "The series is empty, so no first value can be returned."
204
- end
205
- else
206
- col(column).first
207
- end
208
- end
209
-
210
- # Get the last value.
211
- #
212
- # Depending on the input type this function does different things:
213
- #
214
- # - nil -> expression to take last column of a context.
215
- # - String -> syntactic sugar for `Polars.col(..).last`
216
- # - Series -> Take last value in `Series`
217
- #
218
- # @return [Object]
219
- def last(column = nil)
220
- if column.nil?
221
- return Utils.wrap_expr(_last)
222
- end
223
-
224
- if column.is_a?(Series)
225
- if column.len > 0
226
- return column[-1]
227
- else
228
- raise IndexError, "The series is empty, so no last value can be returned"
229
- end
230
- end
231
- col(column).last
232
- end
233
-
234
- # Get the first `n` rows.
235
- #
236
- # @param column [Object]
237
- # Column name or Series.
238
- # @param n [Integer]
239
- # Number of rows to return.
240
- #
241
- # @return [Object]
242
- def head(column, n = 10)
243
- if column.is_a?(Series)
244
- column.head(n)
245
- else
246
- col(column).head(n)
247
- end
248
- end
249
-
250
- # Get the last `n` rows.
251
- #
252
- # @param column [Object]
253
- # Column name or Series.
254
- # @param n [Integer]
255
- # Number of rows to return.
256
- #
257
- # @return [Object]
258
- def tail(column, n = 10)
259
- if column.is_a?(Series)
260
- column.tail(n)
261
- else
262
- col(column).tail(n)
263
- end
264
- end
265
-
266
- # Return an expression representing a literal value.
267
- #
268
- # @return [Expr]
269
- def lit(value, dtype: nil, allow_object: nil)
270
- if value.is_a?(::Time) || value.is_a?(::DateTime)
271
- time_unit = dtype&.time_unit || "ns"
272
- time_zone = dtype.&time_zone
273
- e = lit(Utils._datetime_to_pl_timestamp(value, time_unit)).cast(Datetime.new(time_unit))
274
- if time_zone
275
- return e.dt.replace_time_zone(time_zone.to_s)
276
- else
277
- return e
278
- end
279
- elsif value.is_a?(::Date)
280
- return lit(::Time.utc(value.year, value.month, value.day)).cast(Date)
281
- elsif value.is_a?(Polars::Series)
282
- name = value.name
283
- value = value._s
284
- e = Utils.wrap_expr(RbExpr.lit(value, allow_object))
285
- if name == ""
286
- return e
287
- end
288
- return e.alias(name)
289
- elsif (defined?(Numo::NArray) && value.is_a?(Numo::NArray)) || value.is_a?(::Array)
290
- return lit(Series.new("", value))
291
- elsif dtype
292
- return Utils.wrap_expr(RbExpr.lit(value, allow_object)).cast(dtype)
293
- end
294
-
295
- Utils.wrap_expr(RbExpr.lit(value, allow_object))
296
- end
297
-
298
- # Cumulatively sum values in a column/Series, or horizontally across list of columns/expressions.
299
- #
300
- # @param column [Object]
301
- # Column(s) to be used in aggregation.
302
- #
303
- # @return [Object]
304
- #
305
- # @example
306
- # df = Polars::DataFrame.new(
307
- # {
308
- # "a" => [1, 2],
309
- # "b" => [3, 4],
310
- # "c" => [5, 6]
311
- # }
312
- # )
313
- # # =>
314
- # # shape: (2, 3)
315
- # # ┌─────┬─────┬─────┐
316
- # # │ a ┆ b ┆ c │
317
- # # │ --- ┆ --- ┆ --- │
318
- # # │ i64 ┆ i64 ┆ i64 │
319
- # # ╞═════╪═════╪═════╡
320
- # # │ 1 ┆ 3 ┆ 5 │
321
- # # │ 2 ┆ 4 ┆ 6 │
322
- # # └─────┴─────┴─────┘
323
- #
324
- # @example Cumulatively sum a column by name:
325
- # df.select(Polars.cumsum("a"))
326
- # # =>
327
- # # shape: (2, 1)
328
- # # ┌─────┐
329
- # # │ a │
330
- # # │ --- │
331
- # # │ i64 │
332
- # # ╞═════╡
333
- # # │ 1 │
334
- # # │ 3 │
335
- # # └─────┘
336
- #
337
- # @example Cumulatively sum a list of columns/expressions horizontally:
338
- # df.with_column(Polars.cumsum(["a", "c"]))
339
- # # =>
340
- # # shape: (2, 4)
341
- # # ┌─────┬─────┬─────┬───────────┐
342
- # # │ a ┆ b ┆ c ┆ cumsum │
343
- # # │ --- ┆ --- ┆ --- ┆ --- │
344
- # # │ i64 ┆ i64 ┆ i64 ┆ struct[2] │
345
- # # ╞═════╪═════╪═════╪═══════════╡
346
- # # │ 1 ┆ 3 ┆ 5 ┆ {1,6} │
347
- # # │ 2 ┆ 4 ┆ 6 ┆ {2,8} │
348
- # # └─────┴─────┴─────┴───────────┘
349
- def cumsum(column)
350
- if column.is_a?(Series)
351
- column.cumsum
352
- elsif Utils.strlike?(column)
353
- col(column).cumsum
354
- else
355
- cumfold(lit(0).cast(:u32), ->(a, b) { a + b }, column).alias("cumsum")
356
- end
357
- end
358
-
359
- # Compute the spearman rank correlation between two columns.
360
- #
361
- # Missing data will be excluded from the computation.
362
- #
363
- # @param a [Object]
364
- # Column name or Expression.
365
- # @param b [Object]
366
- # Column name or Expression.
367
- # @param ddof [Integer]
368
- # Delta degrees of freedom
369
- # @param propagate_nans [Boolean]
370
- # If `true`, any `NaN` encountered will lead to `NaN` in the output.
371
- # Defaults to `false`, where `NaN` values are regarded as larger than any finite number
372
- # and thus lead to the highest rank.
373
- #
374
- # @return [Expr]
375
- def spearman_rank_corr(a, b, ddof: 1, propagate_nans: false)
376
- if Utils.strlike?(a)
377
- a = col(a)
378
- end
379
- if Utils.strlike?(b)
380
- b = col(b)
381
- end
382
- Utils.wrap_expr(RbExpr.spearman_rank_corr(a._rbexpr, b._rbexpr, ddof, propagate_nans))
383
- end
384
-
385
- # Compute the pearson's correlation between two columns.
386
- #
387
- # @param a [Object]
388
- # Column name or Expression.
389
- # @param b [Object]
390
- # Column name or Expression.
391
- # @param ddof [Integer]
392
- # Delta degrees of freedom
393
- #
394
- # @return [Expr]
395
- def pearson_corr(a, b, ddof: 1)
396
- if Utils.strlike?(a)
397
- a = col(a)
398
- end
399
- if Utils.strlike?(b)
400
- b = col(b)
401
- end
402
- Utils.wrap_expr(RbExpr.pearson_corr(a._rbexpr, b._rbexpr, ddof))
403
- end
404
-
405
- # Compute the covariance between two columns/expressions.
406
- #
407
- # @param a [Object]
408
- # Column name or Expression.
409
- # @param b [Object]
410
- # Column name or Expression.
411
- #
412
- # @return [Expr]
413
- def cov(a, b)
414
- if Utils.strlike?(a)
415
- a = col(a)
416
- end
417
- if Utils.strlike?(b)
418
- b = col(b)
419
- end
420
- Utils.wrap_expr(RbExpr.cov(a._rbexpr, b._rbexpr))
421
- end
422
-
423
- # def map
424
- # end
425
-
426
- # def apply
427
- # end
428
-
429
- # Accumulate over multiple columns horizontally/row wise with a left fold.
430
- #
431
- # @return [Expr]
432
- def fold(acc, f, exprs)
433
- acc = Utils.expr_to_lit_or_expr(acc, str_to_lit: true)
434
- if exprs.is_a?(Expr)
435
- exprs = [exprs]
436
- end
437
-
438
- exprs = Utils.selection_to_rbexpr_list(exprs)
439
- Utils.wrap_expr(RbExpr.fold(acc._rbexpr, f, exprs))
440
- end
441
-
442
- # def reduce
443
- # end
444
-
445
- # Cumulatively accumulate over multiple columns horizontally/row wise with a left fold.
446
- #
447
- # Every cumulative result is added as a separate field in a Struct column.
448
- #
449
- # @param acc [Object]
450
- # Accumulator Expression. This is the value that will be initialized when the fold
451
- # starts. For a sum this could for instance be lit(0).
452
- # @param f [Object]
453
- # Function to apply over the accumulator and the value.
454
- # Fn(acc, value) -> new_value
455
- # @param exprs [Object]
456
- # Expressions to aggregate over. May also be a wildcard expression.
457
- # @param include_init [Boolean]
458
- # Include the initial accumulator state as struct field.
459
- #
460
- # @return [Object]
461
- #
462
- # @note
463
- # If you simply want the first encountered expression as accumulator,
464
- # consider using `cumreduce`.
465
- def cumfold(acc, f, exprs, include_init: false)
466
- acc = Utils.expr_to_lit_or_expr(acc, str_to_lit: true)
467
- if exprs.is_a?(Expr)
468
- exprs = [exprs]
469
- end
470
-
471
- exprs = Utils.selection_to_rbexpr_list(exprs)
472
- Utils.wrap_expr(RbExpr.cumfold(acc._rbexpr, f, exprs, include_init))
473
- end
474
-
475
- # def cumreduce
476
- # end
477
-
478
- # Evaluate columnwise or elementwise with a bitwise OR operation.
479
- #
480
- # @return [Expr]
481
- def any(name)
482
- if Utils.strlike?(name)
483
- col(name).any
484
- else
485
- fold(lit(false), ->(a, b) { a.cast(:bool) | b.cast(:bool) }, name).alias("any")
486
- end
487
- end
488
-
489
- # Exclude certain columns from a wildcard/regex selection.
490
- #
491
- # @param columns [Object]
492
- # Column(s) to exclude from selection
493
- # This can be:
494
- #
495
- # - a column name, or multiple column names
496
- # - a regular expression starting with `^` and ending with `$`
497
- # - a dtype or multiple dtypes
498
- #
499
- # @return [Object]
500
- #
501
- # @example
502
- # df = Polars::DataFrame.new(
503
- # {
504
- # "aa" => [1, 2, 3],
505
- # "ba" => ["a", "b", nil],
506
- # "cc" => [nil, 2.5, 1.5]
507
- # }
508
- # )
509
- # # =>
510
- # # shape: (3, 3)
511
- # # ┌─────┬──────┬──────┐
512
- # # │ aa ┆ ba ┆ cc │
513
- # # │ --- ┆ --- ┆ --- │
514
- # # │ i64 ┆ str ┆ f64 │
515
- # # ╞═════╪══════╪══════╡
516
- # # │ 1 ┆ a ┆ null │
517
- # # │ 2 ┆ b ┆ 2.5 │
518
- # # │ 3 ┆ null ┆ 1.5 │
519
- # # └─────┴──────┴──────┘
520
- #
521
- # @example Exclude by column name(s):
522
- # df.select(Polars.exclude("ba"))
523
- # # =>
524
- # # shape: (3, 2)
525
- # # ┌─────┬──────┐
526
- # # │ aa ┆ cc │
527
- # # │ --- ┆ --- │
528
- # # │ i64 ┆ f64 │
529
- # # ╞═════╪══════╡
530
- # # │ 1 ┆ null │
531
- # # │ 2 ┆ 2.5 │
532
- # # │ 3 ┆ 1.5 │
533
- # # └─────┴──────┘
534
- #
535
- # @example Exclude by regex, e.g. removing all columns whose names end with the letter "a":
536
- # df.select(Polars.exclude("^.*a$"))
537
- # # =>
538
- # # shape: (3, 1)
539
- # # ┌──────┐
540
- # # │ cc │
541
- # # │ --- │
542
- # # │ f64 │
543
- # # ╞══════╡
544
- # # │ null │
545
- # # │ 2.5 │
546
- # # │ 1.5 │
547
- # # └──────┘
548
- def exclude(columns)
549
- col("*").exclude(columns)
550
- end
551
-
552
- # Do one of two things.
553
- #
554
- # * function can do a columnwise or elementwise AND operation
555
- # * a wildcard column selection
556
- #
557
- # @param name [Object]
558
- # If given this function will apply a bitwise & on the columns.
559
- #
560
- # @return [Expr]
561
- #
562
- # @example Sum all columns
563
- # df = Polars::DataFrame.new(
564
- # {"a" => [1, 2, 3], "b" => ["hello", "foo", "bar"], "c" => [1, 1, 1]}
565
- # )
566
- # df.select(Polars.all.sum)
567
- # # =>
568
- # # shape: (1, 3)
569
- # # ┌─────┬──────┬─────┐
570
- # # │ a ┆ b ┆ c │
571
- # # │ --- ┆ --- ┆ --- │
572
- # # │ i64 ┆ str ┆ i64 │
573
- # # ╞═════╪══════╪═════╡
574
- # # │ 6 ┆ null ┆ 3 │
575
- # # └─────┴──────┴─────┘
576
- def all(name = nil)
577
- if name.nil?
578
- col("*")
579
- elsif Utils.strlike?(name)
580
- col(name).all
581
- else
582
- raise Todo
583
- end
584
- end
585
-
586
- # Syntactic sugar for `Polars.col("foo").agg_groups`.
587
- #
588
- # @return [Object]
589
- def groups(column)
590
- col(column).agg_groups
591
- end
592
-
593
- # Syntactic sugar for `Polars.col("foo").quantile(...)`.
594
- #
595
- # @param column [String]
596
- # Column name.
597
- # @param quantile [Float]
598
- # Quantile between 0.0 and 1.0.
599
- # @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
600
- # Interpolation method.
601
- #
602
- # @return [Expr]
603
- def quantile(column, quantile, interpolation: "nearest")
604
- col(column).quantile(quantile, interpolation: interpolation)
605
- end
606
-
607
- # Create a range expression (or Series).
608
- #
609
- # This can be used in a `select`, `with_column`, etc. Be sure that the resulting
610
- # range size is equal to the length of the DataFrame you are collecting.
611
- #
612
- # @param start [Integer, Expr, Series]
613
- # Lower bound of range.
614
- # @param stop [Integer, Expr, Series]
615
- # Upper bound of range.
616
- # @param step [Integer]
617
- # Step size of the range.
618
- # @param eager [Boolean]
619
- # If eager evaluation is `true`, a Series is returned instead of an Expr.
620
- # @param dtype [Symbol]
621
- # Apply an explicit integer dtype to the resulting expression (default is `Int64`).
622
- #
623
- # @return [Expr, Series]
624
- #
625
- # @example
626
- # Polars.arange(0, 3, eager: true)
627
- # # =>
628
- # # shape: (3,)
629
- # # Series: 'arange' [i64]
630
- # # [
631
- # # 0
632
- # # 1
633
- # # 2
634
- # # ]
635
- def int_range(start, stop, step: 1, eager: false, dtype: nil)
636
- start = Utils.parse_as_expression(start)
637
- stop = Utils.parse_as_expression(stop)
638
- dtype ||= Int64
639
- dtype = dtype.to_s if dtype.is_a?(Symbol)
640
- result = Utils.wrap_expr(RbExpr.int_range(start, stop, step, dtype)).alias("arange")
641
-
642
- if eager
643
- return select(result).to_series
644
- end
645
-
646
- result
647
- end
648
- alias_method :arange, :int_range
649
-
650
- # Find the indexes that would sort the columns.
651
- #
652
- # Argsort by multiple columns. The first column will be used for the ordering.
653
- # If there are duplicates in the first column, the second column will be used to
654
- # determine the ordering and so on.
655
- #
656
- # @param exprs [Object]
657
- # Columns used to determine the ordering.
658
- # @param reverse [Boolean]
659
- # Default is ascending.
660
- #
661
- # @return [Expr]
662
- def arg_sort_by(exprs, reverse: false)
663
- if !exprs.is_a?(::Array)
664
- exprs = [exprs]
665
- end
666
- if reverse == true || reverse == false
667
- reverse = [reverse] * exprs.length
668
- end
669
- exprs = Utils.selection_to_rbexpr_list(exprs)
670
- Utils.wrap_expr(RbExpr.arg_sort_by(exprs, reverse))
671
- end
672
- alias_method :argsort_by, :arg_sort_by
673
-
674
- # Create polars `Duration` from distinct time components.
675
- #
676
- # @return [Expr]
677
- #
678
- # @example
679
- # df = Polars::DataFrame.new(
680
- # {
681
- # "datetime" => [DateTime.new(2022, 1, 1), DateTime.new(2022, 1, 2)],
682
- # "add" => [1, 2]
683
- # }
684
- # )
685
- # df.select(
686
- # [
687
- # (Polars.col("datetime") + Polars.duration(weeks: "add")).alias("add_weeks"),
688
- # (Polars.col("datetime") + Polars.duration(days: "add")).alias("add_days"),
689
- # (Polars.col("datetime") + Polars.duration(seconds: "add")).alias("add_seconds"),
690
- # (Polars.col("datetime") + Polars.duration(milliseconds: "add")).alias(
691
- # "add_milliseconds"
692
- # ),
693
- # (Polars.col("datetime") + Polars.duration(hours: "add")).alias("add_hours")
694
- # ]
695
- # )
696
- # # =>
697
- # # shape: (2, 5)
698
- # # ┌─────────────────────┬─────────────────────┬─────────────────────┬─────────────────────────┬─────────────────────┐
699
- # # │ add_weeks ┆ add_days ┆ add_seconds ┆ add_milliseconds ┆ add_hours │
700
- # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
701
- # # │ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] │
702
- # # ╞═════════════════════╪═════════════════════╪═════════════════════╪═════════════════════════╪═════════════════════╡
703
- # # │ 2022-01-08 00:00:00 ┆ 2022-01-02 00:00:00 ┆ 2022-01-01 00:00:01 ┆ 2022-01-01 00:00:00.001 ┆ 2022-01-01 01:00:00 │
704
- # # │ 2022-01-16 00:00:00 ┆ 2022-01-04 00:00:00 ┆ 2022-01-02 00:00:02 ┆ 2022-01-02 00:00:00.002 ┆ 2022-01-02 02:00:00 │
705
- # # └─────────────────────┴─────────────────────┴─────────────────────┴─────────────────────────┴─────────────────────┘
706
- def duration(
707
- weeks: nil,
708
- days: nil,
709
- hours: nil,
710
- minutes: nil,
711
- seconds: nil,
712
- milliseconds: nil,
713
- microseconds: nil,
714
- nanoseconds: nil,
715
- time_unit: "us"
716
- )
717
- if !weeks.nil?
718
- weeks = Utils.expr_to_lit_or_expr(weeks, str_to_lit: false)._rbexpr
719
- end
720
- if !days.nil?
721
- days = Utils.expr_to_lit_or_expr(days, str_to_lit: false)._rbexpr
722
- end
723
- if !hours.nil?
724
- hours = Utils.expr_to_lit_or_expr(hours, str_to_lit: false)._rbexpr
725
- end
726
- if !minutes.nil?
727
- minutes = Utils.expr_to_lit_or_expr(minutes, str_to_lit: false)._rbexpr
728
- end
729
- if !seconds.nil?
730
- seconds = Utils.expr_to_lit_or_expr(seconds, str_to_lit: false)._rbexpr
731
- end
732
- if !milliseconds.nil?
733
- milliseconds = Utils.expr_to_lit_or_expr(milliseconds, str_to_lit: false)._rbexpr
734
- end
735
- if !microseconds.nil?
736
- microseconds = Utils.expr_to_lit_or_expr(microseconds, str_to_lit: false)._rbexpr
737
- end
738
- if !nanoseconds.nil?
739
- nanoseconds = Utils.expr_to_lit_or_expr(nanoseconds, str_to_lit: false)._rbexpr
740
- end
741
-
742
- Utils.wrap_expr(
743
- _rb_duration(
744
- weeks,
745
- days,
746
- hours,
747
- minutes,
748
- seconds,
749
- milliseconds,
750
- microseconds,
751
- nanoseconds,
752
- time_unit
753
- )
754
- )
755
- end
756
-
757
- # Horizontally concat Utf8 Series in linear time. Non-Utf8 columns are cast to Utf8.
758
- #
759
- # @param exprs [Object]
760
- # Columns to concat into a Utf8 Series.
761
- # @param sep [String]
762
- # String value that will be used to separate the values.
763
- #
764
- # @return [Expr]
765
- #
766
- # @example
767
- # df = Polars::DataFrame.new(
768
- # {
769
- # "a" => [1, 2, 3],
770
- # "b" => ["dogs", "cats", nil],
771
- # "c" => ["play", "swim", "walk"]
772
- # }
773
- # )
774
- # df.with_columns(
775
- # [
776
- # Polars.concat_str(
777
- # [
778
- # Polars.col("a") * 2,
779
- # Polars.col("b"),
780
- # Polars.col("c")
781
- # ],
782
- # sep: " "
783
- # ).alias("full_sentence")
784
- # ]
785
- # )
786
- # # =>
787
- # # shape: (3, 4)
788
- # # ┌─────┬──────┬──────┬───────────────┐
789
- # # │ a ┆ b ┆ c ┆ full_sentence │
790
- # # │ --- ┆ --- ┆ --- ┆ --- │
791
- # # │ i64 ┆ str ┆ str ┆ str │
792
- # # ╞═════╪══════╪══════╪═══════════════╡
793
- # # │ 1 ┆ dogs ┆ play ┆ 2 dogs play │
794
- # # │ 2 ┆ cats ┆ swim ┆ 4 cats swim │
795
- # # │ 3 ┆ null ┆ walk ┆ null │
796
- # # └─────┴──────┴──────┴───────────────┘
797
- def concat_str(exprs, sep: "")
798
- exprs = Utils.selection_to_rbexpr_list(exprs)
799
- return Utils.wrap_expr(RbExpr.concat_str(exprs, sep))
800
- end
801
-
802
- # Format expressions as a string.
803
- #
804
- # @param fstring [String]
805
- # A string with placeholders.
806
- # For example: "hello_{}" or "{}_world"
807
- # @param args [Object]
808
- # Expression(s) that fill the placeholders
809
- #
810
- # @return [Expr]
811
- #
812
- # @example
813
- # df = Polars::DataFrame.new(
814
- # {
815
- # "a": ["a", "b", "c"],
816
- # "b": [1, 2, 3]
817
- # }
818
- # )
819
- # df.select(
820
- # [
821
- # Polars.format("foo_{}_bar_{}", Polars.col("a"), "b").alias("fmt")
822
- # ]
823
- # )
824
- # # =>
825
- # # shape: (3, 1)
826
- # # ┌─────────────┐
827
- # # │ fmt │
828
- # # │ --- │
829
- # # │ str │
830
- # # ╞═════════════╡
831
- # # │ foo_a_bar_1 │
832
- # # │ foo_b_bar_2 │
833
- # # │ foo_c_bar_3 │
834
- # # └─────────────┘
835
- def format(fstring, *args)
836
- if fstring.scan("{}").length != args.length
837
- raise ArgumentError, "number of placeholders should equal the number of arguments"
838
- end
839
-
840
- exprs = []
841
-
842
- arguments = args.each
843
- fstring.split(/(\{\})/).each do |s|
844
- if s == "{}"
845
- e = Utils.expr_to_lit_or_expr(arguments.next, str_to_lit: false)
846
- exprs << e
847
- elsif s.length > 0
848
- exprs << lit(s)
849
- end
850
- end
851
-
852
- concat_str(exprs, sep: "")
853
- end
854
-
855
- # Concat the arrays in a Series dtype List in linear time.
856
- #
857
- # @return [Expr]
858
- def concat_list(exprs)
859
- exprs = Utils.selection_to_rbexpr_list(exprs)
860
- Utils.wrap_expr(RbExpr.concat_lst(exprs))
861
- end
862
-
863
- # Collect multiple LazyFrames at the same time.
864
- #
865
- # This runs all the computation graphs in parallel on Polars threadpool.
866
- #
867
- # @param lazy_frames [Array<LazyFrame>]
868
- # A list of LazyFrames to collect.
869
- # @param type_coercion [Boolean]
870
- # Do type coercion optimization.
871
- # @param predicate_pushdown [Boolean]
872
- # Do predicate pushdown optimization.
873
- # @param projection_pushdown [Boolean]
874
- # Do projection pushdown optimization.
875
- # @param simplify_expression [Boolean]
876
- # Run simplify expressions optimization.
877
- # @param string_cache [Boolean]
878
- # This argument is deprecated and will be ignored
879
- # @param no_optimization [Boolean]
880
- # Turn off optimizations.
881
- # @param slice_pushdown [Boolean]
882
- # Slice pushdown optimization.
883
- # @param common_subplan_elimination [Boolean]
884
- # Will try to cache branching subplans that occur on self-joins or unions.
885
- # @param allow_streaming [Boolean]
886
- # Run parts of the query in a streaming fashion (this is in an alpha state)
887
- #
888
- # @return [Array]
889
- def collect_all(
890
- lazy_frames,
891
- type_coercion: true,
892
- predicate_pushdown: true,
893
- projection_pushdown: true,
894
- simplify_expression: true,
895
- string_cache: false,
896
- no_optimization: false,
897
- slice_pushdown: true,
898
- common_subplan_elimination: true,
899
- allow_streaming: false
900
- )
901
- if no_optimization
902
- predicate_pushdown = false
903
- projection_pushdown = false
904
- slice_pushdown = false
905
- common_subplan_elimination = false
906
- end
907
-
908
- prepared = []
909
-
910
- lazy_frames.each do |lf|
911
- ldf = lf._ldf.optimization_toggle(
912
- type_coercion,
913
- predicate_pushdown,
914
- projection_pushdown,
915
- simplify_expression,
916
- slice_pushdown,
917
- common_subplan_elimination,
918
- allow_streaming,
919
- false
920
- )
921
- prepared << ldf
922
- end
923
-
924
- out = _collect_all(prepared)
925
-
926
- # wrap the rbdataframes into dataframe
927
- result = out.map { |rbdf| Utils.wrap_df(rbdf) }
928
-
929
- result
930
- end
931
-
932
- # Run polars expressions without a context.
933
- #
934
- # @return [DataFrame]
935
- def select(exprs)
936
- DataFrame.new([]).select(exprs)
937
- end
938
-
939
- # Collect several columns into a Series of dtype Struct.
940
- #
941
- # @param exprs [Object]
942
- # Columns/Expressions to collect into a Struct
943
- # @param eager [Boolean]
944
- # Evaluate immediately
945
- #
946
- # @return [Object]
947
- #
948
- # @example
949
- # Polars::DataFrame.new(
950
- # {
951
- # "int" => [1, 2],
952
- # "str" => ["a", "b"],
953
- # "bool" => [true, nil],
954
- # "list" => [[1, 2], [3]],
955
- # }
956
- # ).select([Polars.struct(Polars.all).alias("my_struct")])
957
- # # =>
958
- # # shape: (2, 1)
959
- # # ┌─────────────────────┐
960
- # # │ my_struct │
961
- # # │ --- │
962
- # # │ struct[4] │
963
- # # ╞═════════════════════╡
964
- # # │ {1,"a",true,[1, 2]} │
965
- # # │ {2,"b",null,[3]} │
966
- # # └─────────────────────┘
967
- #
968
- # @example Only collect specific columns as a struct:
969
- # df = Polars::DataFrame.new(
970
- # {"a" => [1, 2, 3, 4], "b" => ["one", "two", "three", "four"], "c" => [9, 8, 7, 6]}
971
- # )
972
- # df.with_column(Polars.struct(Polars.col(["a", "b"])).alias("a_and_b"))
973
- # # =>
974
- # # shape: (4, 4)
975
- # # ┌─────┬───────┬─────┬─────────────┐
976
- # # │ a ┆ b ┆ c ┆ a_and_b │
977
- # # │ --- ┆ --- ┆ --- ┆ --- │
978
- # # │ i64 ┆ str ┆ i64 ┆ struct[2] │
979
- # # ╞═════╪═══════╪═════╪═════════════╡
980
- # # │ 1 ┆ one ┆ 9 ┆ {1,"one"} │
981
- # # │ 2 ┆ two ┆ 8 ┆ {2,"two"} │
982
- # # │ 3 ┆ three ┆ 7 ┆ {3,"three"} │
983
- # # │ 4 ┆ four ┆ 6 ┆ {4,"four"} │
984
- # # └─────┴───────┴─────┴─────────────┘
985
- def struct(exprs, eager: false)
986
- if eager
987
- Polars.select(struct(exprs, eager: false)).to_series
988
- end
989
- exprs = Utils.selection_to_rbexpr_list(exprs)
990
- Utils.wrap_expr(_as_struct(exprs))
991
- end
992
-
993
- # Repeat a single value n times.
994
- #
995
- # @param value [Object]
996
- # Value to repeat.
997
- # @param n [Integer]
998
- # Repeat `n` times.
999
- # @param eager [Boolean]
1000
- # Run eagerly and collect into a `Series`.
1001
- # @param name [String]
1002
- # Only used in `eager` mode. As expression, use `alias`.
1003
- #
1004
- # @return [Expr, Series]
1005
- def repeat(value, n, dtype: nil, eager: false, name: nil)
1006
- if !name.nil?
1007
- warn "the `name` argument is deprecated. Use the `alias` method instead."
1008
- end
1009
-
1010
- if n.is_a?(Integer)
1011
- n = lit(n)
1012
- end
1013
-
1014
- value = Utils.parse_as_expression(value, str_as_lit: true)
1015
- expr = Utils.wrap_expr(RbExpr.repeat(value, n._rbexpr, dtype))
1016
- if !name.nil?
1017
- expr = expr.alias(name)
1018
- end
1019
- if eager
1020
- return select(expr).to_series
1021
- end
1022
- expr
1023
- end
1024
-
1025
# Return indices where `condition` evaluates `true`.
#
# @param condition [Expr, Series]
#   Boolean expression to evaluate. Must be a `Series` when `eager` is true.
# @param eager [Boolean]
#   Whether to apply this function eagerly (as opposed to lazily).
#
# @return [Expr, Series]
#   An `Expr` in lazy mode, a `Series` in eager mode.
#
# @raise [ArgumentError]
#   If `eager` is true and `condition` is not a `Series`.
#
# @example
#   df = Polars::DataFrame.new({"a" => [1, 2, 3, 4, 5]})
#   df.select(
#     [
#       Polars.arg_where(Polars.col("a") % 2 == 0)
#     ]
#   ).to_series
#   # =>
#   # shape: (2,)
#   # Series: 'a' [u32]
#   # [
#   #         1
#   #         3
#   # ]
def arg_where(condition, eager: false)
  unless eager
    condition = Utils.expr_to_lit_or_expr(condition, str_to_lit: true)
    return Utils.wrap_expr(_arg_where(condition._rbexpr))
  end

  unless condition.is_a?(Series)
    # Message uses Ruby keyword-argument syntax (`eager: true`), matching
    # the wording of the equivalent error raised by `from_epoch`.
    raise ArgumentError, "expected 'Series' in 'arg_where' if 'eager: true', got #{condition.class.name}"
  end

  # Evaluate the lazy form against a single-column frame built from the Series.
  condition.to_frame.select(arg_where(Polars.col(condition.name))).to_series
end
1059
-
1060
# Folds the expressions from left to right, keeping the first non-null value.
#
# @param exprs [Object]
#   Expressions to coalesce.
# @param more_exprs [Array<Object>]
#   Additional expressions, given as extra positional arguments. They are
#   appended after `exprs`.
#
# @return [Expr]
#
# @example
#   df = Polars::DataFrame.new(
#     [
#       [nil, 1.0, 1.0],
#       [nil, 2.0, 2.0],
#       [nil, nil, 3.0],
#       [nil, nil, nil]
#     ],
#     columns: [["a", :f64], ["b", :f64], ["c", :f64]]
#   )
#   df.with_column(Polars.coalesce(["a", "b", "c", 99.9]).alias("d"))
#   # =>
#   # shape: (4, 4)
#   # ┌──────┬──────┬──────┬──────┐
#   # │ a    ┆ b    ┆ c    ┆ d    │
#   # │ ---  ┆ ---  ┆ ---  ┆ ---  │
#   # │ f64  ┆ f64  ┆ f64  ┆ f64  │
#   # ╞══════╪══════╪══════╪══════╡
#   # │ null ┆ 1.0  ┆ 1.0  ┆ 1.0  │
#   # │ null ┆ 2.0  ┆ 2.0  ┆ 2.0  │
#   # │ null ┆ null ┆ 3.0  ┆ 3.0  │
#   # │ null ┆ null ┆ null ┆ 99.9 │
#   # └──────┴──────┴──────┴──────┘
def coalesce(exprs, *more_exprs)
  rbexprs = Utils.selection_to_rbexpr_list(exprs)
  rbexprs.concat(Utils.selection_to_rbexpr_list(more_exprs)) if more_exprs.any?
  Utils.wrap_expr(_coalesce_exprs(rbexprs))
end
1097
-
1098
# Utility function that parses an epoch timestamp (or Unix time) to Polars Date(time).
#
# Depending on the `unit` provided, this function will return a different dtype:
# - unit: "d" returns Date
# - unit: "s" returns Datetime["us"] (Datetime's default)
# - unit: "ms" returns Datetime["ms"]
# - unit: "us" returns Datetime["us"]
# - unit: "ns" returns Datetime["ns"]
#
# @param column [Object]
#   Series or expression to parse integers to Datetime. A string-like value
#   is treated as a column name; anything that is neither a `Series` nor an
#   `Expr` is wrapped in a new `Series`.
# @param unit [String]
#   The unit of the timesteps since epoch time.
# @param eager [Boolean]
#   If eager evaluation is `true`, a Series is returned instead of an Expr.
#
# @return [Object]
#
# @raise [ArgumentError]
#   If `unit` is not supported, or if `eager` is true and `column` did not
#   resolve to a `Series`.
#
# @example
#   df = Polars::DataFrame.new({"timestamp" => [1666683077, 1666683099]}).lazy
#   df.select(Polars.from_epoch(Polars.col("timestamp"), unit: "s")).collect
#   # =>
#   # shape: (2, 1)
#   # ┌─────────────────────┐
#   # │ timestamp           │
#   # │ ---                 │
#   # │ datetime[μs]        │
#   # ╞═════════════════════╡
#   # │ 2022-10-25 07:31:17 │
#   # │ 2022-10-25 07:31:39 │
#   # └─────────────────────┘
def from_epoch(column, unit: "s", eager: false)
  if Utils.strlike?(column)
    column = col(column)
  elsif !column.is_a?(Series) && !column.is_a?(Expr)
    column = Series.new(column)
  end

  if unit == "d"
    expr = column.cast(Date)
  elsif unit == "s"
    # Scale seconds to microseconds before casting to Datetime("us").
    expr = (column.cast(Int64) * 1_000_000).cast(Datetime.new("us"))
  elsif Utils::DTYPE_TEMPORAL_UNITS.include?(unit)
    expr = column.cast(Datetime.new(unit))
  else
    # Single braces: the doubled `{{ }}` was a Python f-string escape that
    # Ruby would print literally.
    raise ArgumentError, "'unit' must be one of {'ns', 'us', 'ms', 's', 'd'}, got '#{unit}'."
  end

  return expr unless eager

  unless column.is_a?(Series)
    raise ArgumentError, "expected Series or Array if eager: true, got #{column.class.name}"
  end

  column.to_frame.select(expr).to_series
end
1156
-
1157
# Start a "when, then, otherwise" expression.
#
# @param expr [Object]
#   Condition that opens the chain; it is converted with
#   `Utils.expr_to_lit_or_expr` before use.
#
# @return [When]
#
# @example
#   df = Polars::DataFrame.new({"foo" => [1, 3, 4], "bar" => [3, 4, 0]})
#   df.with_column(Polars.when(Polars.col("foo") > 2).then(Polars.lit(1)).otherwise(Polars.lit(-1)))
#   # =>
#   # shape: (3, 3)
#   # ┌─────┬─────┬─────────┐
#   # │ foo ┆ bar ┆ literal │
#   # │ --- ┆ --- ┆ ---     │
#   # │ i64 ┆ i64 ┆ i32     │
#   # ╞═════╪═════╪═════════╡
#   # │ 1   ┆ 3   ┆ -1      │
#   # │ 3   ┆ 4   ┆ 1       │
#   # │ 4   ┆ 0   ┆ 1       │
#   # └─────┴─────┴─────────┘
def when(expr)
  predicate = Utils.expr_to_lit_or_expr(expr)
  When.new(RbExpr.when(predicate._rbexpr))
end
1180
- end
1181
- end