polars-df 0.7.0-arm64-darwin → 0.9.0-arm64-darwin

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +41 -0
  3. data/Cargo.lock +353 -237
  4. data/Cargo.toml +0 -3
  5. data/LICENSE-THIRD-PARTY.txt +4014 -3495
  6. data/LICENSE.txt +1 -1
  7. data/README.md +2 -2
  8. data/lib/polars/3.1/polars.bundle +0 -0
  9. data/lib/polars/3.2/polars.bundle +0 -0
  10. data/lib/polars/{3.0 → 3.3}/polars.bundle +0 -0
  11. data/lib/polars/array_expr.rb +449 -0
  12. data/lib/polars/array_name_space.rb +346 -0
  13. data/lib/polars/cat_expr.rb +24 -0
  14. data/lib/polars/cat_name_space.rb +75 -0
  15. data/lib/polars/config.rb +2 -2
  16. data/lib/polars/data_frame.rb +248 -108
  17. data/lib/polars/data_types.rb +195 -29
  18. data/lib/polars/date_time_expr.rb +41 -24
  19. data/lib/polars/date_time_name_space.rb +12 -12
  20. data/lib/polars/exceptions.rb +12 -1
  21. data/lib/polars/expr.rb +1080 -195
  22. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  23. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  24. data/lib/polars/functions/as_datatype.rb +248 -0
  25. data/lib/polars/functions/col.rb +47 -0
  26. data/lib/polars/functions/eager.rb +182 -0
  27. data/lib/polars/functions/lazy.rb +1280 -0
  28. data/lib/polars/functions/len.rb +49 -0
  29. data/lib/polars/functions/lit.rb +35 -0
  30. data/lib/polars/functions/random.rb +16 -0
  31. data/lib/polars/functions/range/date_range.rb +103 -0
  32. data/lib/polars/functions/range/int_range.rb +51 -0
  33. data/lib/polars/functions/repeat.rb +144 -0
  34. data/lib/polars/functions/whenthen.rb +27 -0
  35. data/lib/polars/functions.rb +29 -416
  36. data/lib/polars/group_by.rb +3 -3
  37. data/lib/polars/io.rb +21 -28
  38. data/lib/polars/lazy_frame.rb +390 -76
  39. data/lib/polars/list_expr.rb +152 -6
  40. data/lib/polars/list_name_space.rb +102 -0
  41. data/lib/polars/meta_expr.rb +175 -7
  42. data/lib/polars/series.rb +557 -59
  43. data/lib/polars/sql_context.rb +1 -1
  44. data/lib/polars/string_cache.rb +75 -0
  45. data/lib/polars/string_expr.rb +412 -96
  46. data/lib/polars/string_name_space.rb +4 -4
  47. data/lib/polars/struct_expr.rb +1 -1
  48. data/lib/polars/struct_name_space.rb +1 -1
  49. data/lib/polars/testing.rb +507 -0
  50. data/lib/polars/utils.rb +64 -20
  51. data/lib/polars/version.rb +1 -1
  52. data/lib/polars.rb +15 -2
  53. metadata +36 -7
  54. data/lib/polars/lazy_functions.rb +0 -1197
@@ -1,1197 +0,0 @@
1
- module Polars
2
- module LazyFunctions
3
- # Return an expression representing a column in a DataFrame.
4
- #
5
- # @return [Expr]
6
def col(name)
  # Normalize first: a Series contributes its values, a DataType subclass
  # becomes a one-element list.
  name = name.to_a if name.is_a?(Series)
  name = [name] if name.is_a?(Class) && name < DataType

  # A DataType instance selects columns by dtype.
  return Utils.wrap_expr(_dtype_cols([name])) if name.is_a?(DataType)

  # Anything that is not an array is treated as a single column name.
  unless name.is_a?(::Array)
    name = name.to_s if name.is_a?(Symbol)
    return Utils.wrap_expr(RbExpr.col(name))
  end

  # Arrays are dispatched on their first element.
  if name.length == 0 || Utils.strlike?(name[0])
    names = name.map { |entry| entry.is_a?(Symbol) ? entry.to_s : entry }
    Utils.wrap_expr(RbExpr.cols(names))
  elsif Utils.is_polars_dtype(name[0])
    Utils.wrap_expr(_dtype_cols(name))
  else
    raise ArgumentError, "Expected list values to be all `str` or all `DataType`"
  end
end
31
-
32
- # Alias for an element being evaluated in an `eval` expression.
33
- #
34
- # @return [Expr]
35
- #
36
- # @example A horizontal rank computation by taking the elements of a list
37
- # df = Polars::DataFrame.new({"a" => [1, 8, 3], "b" => [4, 5, 2]})
38
- # df.with_column(
39
- # Polars.concat_list(["a", "b"]).list.eval(Polars.element.rank).alias("rank")
40
- # )
41
- # # =>
42
- # # shape: (3, 3)
43
- # # ┌─────┬─────┬────────────┐
44
- # # │ a ┆ b ┆ rank │
45
- # # │ --- ┆ --- ┆ --- │
46
- # # │ i64 ┆ i64 ┆ list[f64] │
47
- # # ╞═════╪═════╪════════════╡
48
- # # │ 1 ┆ 4 ┆ [1.0, 2.0] │
49
- # # │ 8 ┆ 5 ┆ [2.0, 1.0] │
50
- # # │ 3 ┆ 2 ┆ [2.0, 1.0] │
51
- # # └─────┴─────┴────────────┘
52
def element
  # The element alias is expressed as the column with the empty name.
  empty_name = ""
  col(empty_name)
end
55
-
56
- # Count the number of values in this column/context.
57
- #
58
- # @param column [String, Series, nil]
59
- # If dtype is:
60
- #
61
- # * `Series` : count the values in the series.
62
- # * `String` : count the values in this column.
63
- # * `nil` : count the number of values in this context.
64
- #
65
- # @return [Expr, Integer]
66
def count(column = nil)
  # No argument: count the number of values in this context.
  return Utils.wrap_expr(RbExpr.count) if column.nil?

  # A Series is measured directly; any other input goes through `col`.
  column.is_a?(Series) ? column.len : col(column).count
end
77
-
78
- # Aggregate to list.
79
- #
80
- # @return [Expr]
81
def to_list(name)
  # Resolve the column, then return its `list` accessor.
  column_expr = col(name)
  column_expr.list
end
84
-
85
- # Get the standard deviation.
86
- #
87
- # @return [Object]
88
def std(column, ddof: 1)
  # A Series is used as is; any other input is resolved through `col`.
  target = column.is_a?(Series) ? column : col(column)
  target.std(ddof: ddof)
end
95
-
96
- # Get the variance.
97
- #
98
- # @return [Object]
99
def var(column, ddof: 1)
  # A Series is used as is; any other input is resolved through `col`.
  target = column.is_a?(Series) ? column : col(column)
  target.var(ddof: ddof)
end
106
-
107
- # Get the maximum value.
108
- #
109
- # @param column [Object]
110
- # Column(s) to be used in aggregation. Will lead to different behavior based on
111
- # the input:
112
- #
113
- # - [String, Series] -> aggregate the maximum value of that column.
114
- # - [Array<Expr>] -> aggregate the maximum value horizontally.
115
- #
116
- # @return [Expr, Object]
117
def max(column)
  return column.max if column.is_a?(Series)
  return col(column).max if Utils.strlike?(column)

  # Anything else is treated as a selection of expressions.
  # TODO
  rb_exprs = Utils.selection_to_rbexpr_list(column)
  Utils.wrap_expr(_max_exprs(rb_exprs))
end
128
-
129
- # Get the minimum value.
130
- #
131
- # @param column [Object]
132
- # Column(s) to be used in aggregation. Will lead to different behavior based on
133
- # the input:
134
- #
135
- # - [String, Series] -> aggregate the minimum value of that column.
136
- # - [Array<Expr>] -> aggregate the minimum value horizontally.
137
- #
138
- # @return [Expr, Object]
139
def min(column)
  return column.min if column.is_a?(Series)
  return col(column).min if Utils.strlike?(column)

  # Anything else is treated as a selection of expressions.
  # TODO
  rb_exprs = Utils.selection_to_rbexpr_list(column)
  Utils.wrap_expr(_min_exprs(rb_exprs))
end
150
-
151
- # Sum values in a column/Series, or horizontally across list of columns/expressions.
152
- #
153
- # @return [Object]
154
def sum(column)
  return column.sum if column.is_a?(Series)
  return col(column.to_s).sum if Utils.strlike?(column)

  if column.is_a?(::Array)
    # Arrays are summed horizontally.
    rb_exprs = Utils.selection_to_rbexpr_list(column)
    Utils.wrap_expr(_sum_horizontal(rb_exprs))
  else
    # Any other input is folded with `+`, starting from a u32 zero.
    adder = ->(a, b) { a + b }
    fold(lit(0).cast(:u32), adder, column).alias("sum")
  end
end
166
-
167
- # Get the mean value.
168
- #
169
- # @return [Expr, Float]
170
def mean(column)
  # A Series is used as is; any other input is resolved through `col`.
  target = column.is_a?(Series) ? column : col(column)
  target.mean
end
177
-
178
- # Get the mean value.
179
- #
180
- # @return [Expr, Float]
181
def avg(column)
  # Delegates to `mean`.
  average = mean(column)
  average
end
184
-
185
- # Get the median value.
186
- #
187
- # @return [Object]
188
def median(column)
  # A Series is used as is; any other input is resolved through `col`.
  target = column.is_a?(Series) ? column : col(column)
  target.median
end
195
-
196
- # Count unique values.
197
- #
198
- # @return [Object]
199
def n_unique(column)
  # A Series is used as is; any other input is resolved through `col`.
  target = column.is_a?(Series) ? column : col(column)
  target.n_unique
end
206
-
207
- # Get the first value.
208
- #
209
- # @return [Object]
210
def first(column = nil)
  return Utils.wrap_expr(RbExpr.first) if column.nil?
  return col(column).first unless column.is_a?(Series)

  # Series input: an empty Series has no first value to hand back.
  unless column.len > 0
    raise IndexError, "The series is empty, so no first value can be returned."
  end

  column[0]
end
225
-
226
- # Get the last value.
227
- #
228
- # Depending on the input type this function does different things:
229
- #
230
- # - nil -> expression to take last column of a context.
231
- # - String -> syntactic sugar for `Polars.col(..).last`
232
- # - Series -> Take last value in `Series`
233
- #
234
- # @return [Object]
235
def last(column = nil)
  return Utils.wrap_expr(_last) if column.nil?
  return col(column).last unless column.is_a?(Series)

  # Series input: an empty Series has no last value to hand back.
  unless column.len > 0
    raise IndexError, "The series is empty, so no last value can be returned"
  end

  column[-1]
end
249
-
250
- # Get the first `n` rows.
251
- #
252
- # @param column [Object]
253
- # Column name or Series.
254
- # @param n [Integer]
255
- # Number of rows to return.
256
- #
257
- # @return [Object]
258
def head(column, n = 10)
  # A Series is used as is; any other input is resolved through `col`.
  target = column.is_a?(Series) ? column : col(column)
  target.head(n)
end
265
-
266
- # Get the last `n` rows.
267
- #
268
- # @param column [Object]
269
- # Column name or Series.
270
- # @param n [Integer]
271
- # Number of rows to return.
272
- #
273
- # @return [Object]
274
def tail(column, n = 10)
  # A Series is used as is; any other input is resolved through `col`.
  target = column.is_a?(Series) ? column : col(column)
  target.tail(n)
end
281
-
282
# Return an expression representing a literal value.
#
# @param value [Object]
#   Value to wrap. `Time`/`DateTime`, `Date`, `Series`, `Array` and
#   `Numo::NArray` inputs get dedicated handling; anything else is passed
#   to `RbExpr.lit` unchanged.
# @param dtype [Object, nil]
#   Optional data type. For `Time`/`DateTime` values its `time_unit` and
#   `time_zone` are read; for other scalar values the literal is cast to it.
# @param allow_object [Boolean, nil]
#   Forwarded unchanged to `RbExpr.lit`.
#
# @return [Expr]
def lit(value, dtype: nil, allow_object: nil)
  if value.is_a?(::Time) || value.is_a?(::DateTime)
    time_unit = dtype&.time_unit || "ns"
    # Safe navigation (`&.`), not the `&` method: a nil dtype means no
    # time zone, and a dtype with a time zone must not raise here.
    time_zone = dtype&.time_zone
    e = lit(Utils._datetime_to_pl_timestamp(value, time_unit)).cast(Datetime.new(time_unit))
    return time_zone ? e.dt.replace_time_zone(time_zone.to_s) : e
  elsif value.is_a?(::Date)
    # Dates go through a UTC midnight timestamp and are cast back to Date.
    return lit(::Time.utc(value.year, value.month, value.day)).cast(Date)
  elsif value.is_a?(Polars::Series)
    name = value.name
    e = Utils.wrap_expr(RbExpr.lit(value._s, allow_object))
    # An unnamed Series yields an un-aliased literal.
    return name == "" ? e : e.alias(name)
  elsif (defined?(Numo::NArray) && value.is_a?(Numo::NArray)) || value.is_a?(::Array)
    return lit(Series.new("", value))
  elsif dtype
    return Utils.wrap_expr(RbExpr.lit(value, allow_object)).cast(dtype)
  end

  Utils.wrap_expr(RbExpr.lit(value, allow_object))
end
313
-
314
- # Cumulatively sum values in a column/Series, or horizontally across list of columns/expressions.
315
- #
316
- # @param column [Object]
317
- # Column(s) to be used in aggregation.
318
- #
319
- # @return [Object]
320
- #
321
- # @example
322
- # df = Polars::DataFrame.new(
323
- # {
324
- # "a" => [1, 2],
325
- # "b" => [3, 4],
326
- # "c" => [5, 6]
327
- # }
328
- # )
329
- # # =>
330
- # # shape: (2, 3)
331
- # # ┌─────┬─────┬─────┐
332
- # # │ a ┆ b ┆ c │
333
- # # │ --- ┆ --- ┆ --- │
334
- # # │ i64 ┆ i64 ┆ i64 │
335
- # # ╞═════╪═════╪═════╡
336
- # # │ 1 ┆ 3 ┆ 5 │
337
- # # │ 2 ┆ 4 ┆ 6 │
338
- # # └─────┴─────┴─────┘
339
- #
340
- # @example Cumulatively sum a column by name:
341
- # df.select(Polars.cumsum("a"))
342
- # # =>
343
- # # shape: (2, 1)
344
- # # ┌─────┐
345
- # # │ a │
346
- # # │ --- │
347
- # # │ i64 │
348
- # # ╞═════╡
349
- # # │ 1 │
350
- # # │ 3 │
351
- # # └─────┘
352
- #
353
- # @example Cumulatively sum a list of columns/expressions horizontally:
354
- # df.with_column(Polars.cumsum(["a", "c"]))
355
- # # =>
356
- # # shape: (2, 4)
357
- # # ┌─────┬─────┬─────┬───────────┐
358
- # # │ a ┆ b ┆ c ┆ cumsum │
359
- # # │ --- ┆ --- ┆ --- ┆ --- │
360
- # # │ i64 ┆ i64 ┆ i64 ┆ struct[2] │
361
- # # ╞═════╪═════╪═════╪═══════════╡
362
- # # │ 1 ┆ 3 ┆ 5 ┆ {1,6} │
363
- # # │ 2 ┆ 4 ┆ 6 ┆ {2,8} │
364
- # # └─────┴─────┴─────┴───────────┘
365
def cumsum(column)
  return column.cumsum if column.is_a?(Series)
  return col(column).cumsum if Utils.strlike?(column)

  # Any other input is accumulated horizontally with `+`, starting from a u32 zero.
  adder = ->(a, b) { a + b }
  cumfold(lit(0).cast(:u32), adder, column).alias("cumsum")
end
374
-
375
- # Compute the spearman rank correlation between two columns.
376
- #
377
- # Missing data will be excluded from the computation.
378
- #
379
- # @param a [Object]
380
- # Column name or Expression.
381
- # @param b [Object]
382
- # Column name or Expression.
383
- # @param ddof [Integer]
384
- # Delta degrees of freedom
385
- # @param propagate_nans [Boolean]
386
- # If `true`, any `NaN` encountered will lead to `NaN` in the output.
387
- # Defaults to `false`, where `NaN` values are regarded as larger than any finite number
388
- # and thus lead to the highest rank.
389
- #
390
- # @return [Expr]
391
def spearman_rank_corr(a, b, ddof: 1, propagate_nans: false)
  # String-like operands are column names; turn them into expressions.
  lhs, rhs = [a, b].map { |operand| Utils.strlike?(operand) ? col(operand) : operand }
  Utils.wrap_expr(RbExpr.spearman_rank_corr(lhs._rbexpr, rhs._rbexpr, ddof, propagate_nans))
end
400
-
401
- # Compute the pearson's correlation between two columns.
402
- #
403
- # @param a [Object]
404
- # Column name or Expression.
405
- # @param b [Object]
406
- # Column name or Expression.
407
- # @param ddof [Integer]
408
- # Delta degrees of freedom
409
- #
410
- # @return [Expr]
411
def pearson_corr(a, b, ddof: 1)
  # String-like operands are column names; turn them into expressions.
  lhs, rhs = [a, b].map { |operand| Utils.strlike?(operand) ? col(operand) : operand }
  Utils.wrap_expr(RbExpr.pearson_corr(lhs._rbexpr, rhs._rbexpr, ddof))
end
420
-
421
- # Compute the covariance between two columns/ expressions.
422
- #
423
- # @param a [Object]
424
- # Column name or Expression.
425
- # @param b [Object]
426
- # Column name or Expression.
427
- #
428
- # @return [Expr]
429
def cov(a, b)
  # String-like operands are column names; turn them into expressions.
  lhs, rhs = [a, b].map { |operand| Utils.strlike?(operand) ? col(operand) : operand }
  Utils.wrap_expr(RbExpr.cov(lhs._rbexpr, rhs._rbexpr))
end
438
-
439
- # def map
440
- # end
441
-
442
- # def apply
443
- # end
444
-
445
- # Accumulate over multiple columns horizontally/row wise with a left fold.
446
- #
447
- # @return [Expr]
448
def fold(acc, f, exprs)
  initial = Utils.expr_to_lit_or_expr(acc, str_to_lit: true)
  # A lone expression is wrapped so it is handled like a list.
  exprs = [exprs] if exprs.is_a?(Expr)

  rb_exprs = Utils.selection_to_rbexpr_list(exprs)
  Utils.wrap_expr(RbExpr.fold(initial._rbexpr, f, rb_exprs))
end
457
-
458
- # def reduce
459
- # end
460
-
461
- # Cumulatively accumulate over multiple columns horizontally/row wise with a left fold.
462
- #
463
- # Every cumulative result is added as a separate field in a Struct column.
464
- #
465
- # @param acc [Object]
466
- # Accumulator Expression. This is the value that will be initialized when the fold
467
- # starts. For a sum this could for instance be lit(0).
468
- # @param f [Object]
469
- # Function to apply over the accumulator and the value.
470
- # Fn(acc, value) -> new_value
471
- # @param exprs [Object]
472
- # Expressions to aggregate over. May also be a wildcard expression.
473
- # @param include_init [Boolean]
474
- # Include the initial accumulator state as struct field.
475
- #
476
- # @return [Object]
477
- #
478
- # @note
479
- # If you simply want the first encountered expression as accumulator,
480
- # consider using `cumreduce`.
481
def cumfold(acc, f, exprs, include_init: false)
  initial = Utils.expr_to_lit_or_expr(acc, str_to_lit: true)
  # A lone expression is wrapped so it is handled like a list.
  exprs = [exprs] if exprs.is_a?(Expr)

  rb_exprs = Utils.selection_to_rbexpr_list(exprs)
  Utils.wrap_expr(RbExpr.cumfold(initial._rbexpr, f, rb_exprs, include_init))
end
490
-
491
- # def cumreduce
492
- # end
493
-
494
- # Evaluate columnwise or elementwise with a bitwise OR operation.
495
- #
496
- # @return [Expr]
497
def any(name)
  return col(name).any if Utils.strlike?(name)

  # Non-string input: OR the boolean casts together, starting from `false`.
  either = ->(a, b) { a.cast(:bool) | b.cast(:bool) }
  fold(lit(false), either, name).alias("any")
end
504
-
505
- # Exclude certain columns from a wildcard/regex selection.
506
- #
507
- # @param columns [Object]
508
- # Column(s) to exclude from selection
509
- # This can be:
510
- #
511
- # - a column name, or multiple column names
512
- # - a regular expression starting with `^` and ending with `$`
513
- # - a dtype or multiple dtypes
514
- #
515
- # @return [Object]
516
- #
517
- # @example
518
- # df = Polars::DataFrame.new(
519
- # {
520
- # "aa" => [1, 2, 3],
521
- # "ba" => ["a", "b", nil],
522
- # "cc" => [nil, 2.5, 1.5]
523
- # }
524
- # )
525
- # # =>
526
- # # shape: (3, 3)
527
- # # ┌─────┬──────┬──────┐
528
- # # │ aa ┆ ba ┆ cc │
529
- # # │ --- ┆ --- ┆ --- │
530
- # # │ i64 ┆ str ┆ f64 │
531
- # # ╞═════╪══════╪══════╡
532
- # # │ 1 ┆ a ┆ null │
533
- # # │ 2 ┆ b ┆ 2.5 │
534
- # # │ 3 ┆ null ┆ 1.5 │
535
- # # └─────┴──────┴──────┘
536
- #
537
- # @example Exclude by column name(s):
538
- # df.select(Polars.exclude("ba"))
539
- # # =>
540
- # # shape: (3, 2)
541
- # # ┌─────┬──────┐
542
- # # │ aa ┆ cc │
543
- # # │ --- ┆ --- │
544
- # # │ i64 ┆ f64 │
545
- # # ╞═════╪══════╡
546
- # # │ 1 ┆ null │
547
- # # │ 2 ┆ 2.5 │
548
- # # │ 3 ┆ 1.5 │
549
- # # └─────┴──────┘
550
- #
551
- # @example Exclude by regex, e.g. removing all columns whose names end with the letter "a":
552
- # df.select(Polars.exclude("^.*a$"))
553
- # # =>
554
- # # shape: (3, 1)
555
- # # ┌──────┐
556
- # # │ cc │
557
- # # │ --- │
558
- # # │ f64 │
559
- # # ╞══════╡
560
- # # │ null │
561
- # # │ 2.5 │
562
- # # │ 1.5 │
563
- # # └──────┘
564
def exclude(columns)
  # Start from the wildcard selection and exclude the given columns from it.
  everything = col("*")
  everything.exclude(columns)
end
567
-
568
- # Do one of two things.
569
- #
570
- # * function can do a columnwise or elementwise AND operation
571
- # * a wildcard column selection
572
- #
573
- # @param name [Object]
574
- # If given this function will apply a bitwise & on the columns.
575
- #
576
- # @return [Expr]
577
- #
578
- # @example Sum all columns
579
- # df = Polars::DataFrame.new(
580
- # {"a" => [1, 2, 3], "b" => ["hello", "foo", "bar"], "c" => [1, 1, 1]}
581
- # )
582
- # df.select(Polars.all.sum)
583
- # # =>
584
- # # shape: (1, 3)
585
- # # ┌─────┬──────┬─────┐
586
- # # │ a ┆ b ┆ c │
587
- # # │ --- ┆ --- ┆ --- │
588
- # # │ i64 ┆ str ┆ i64 │
589
- # # ╞═════╪══════╪═════╡
590
- # # │ 6 ┆ null ┆ 3 │
591
- # # └─────┴──────┴─────┘
592
def all(name = nil)
  # Without an argument this is the wildcard column selection.
  return col("*") if name.nil?

  # Only string-like names are supported beyond the wildcard form.
  raise Todo unless Utils.strlike?(name)

  col(name).all
end
601
-
602
- # Syntactic sugar for `Polars.col("foo").agg_groups`.
603
- #
604
- # @return [Object]
605
def groups(column)
  # Resolve the column, then ask for its group aggregation.
  column_expr = col(column)
  column_expr.agg_groups
end
608
-
609
- # Syntactic sugar for `Polars.col("foo").quantile(...)`.
610
- #
611
- # @param column [String]
612
- # Column name.
613
- # @param quantile [Float]
614
- # Quantile between 0.0 and 1.0.
615
- # @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
616
- # Interpolation method.
617
- #
618
- # @return [Expr]
619
def quantile(column, quantile, interpolation: "nearest")
  # Resolve the column, then forward the quantile and interpolation method.
  column_expr = col(column)
  column_expr.quantile(quantile, interpolation: interpolation)
end
622
-
623
- # Create a range expression (or Series).
624
- #
625
- # This can be used in a `select`, `with_column`, etc. Be sure that the resulting
626
- # range size is equal to the length of the DataFrame you are collecting.
627
- #
628
- # @param start [Integer, Expr, Series]
629
- # Lower bound of range.
630
- # @param stop [Integer, Expr, Series]
631
- # Upper bound of range.
632
- # @param step [Integer]
633
- # Step size of the range.
634
- # @param eager [Boolean]
635
- # If eager evaluation is `true`, a Series is returned instead of an Expr.
636
- # @param dtype [Symbol]
637
- # Apply an explicit integer dtype to the resulting expression (default is `Int64`).
638
- #
639
- # @return [Expr, Series]
640
- #
641
- # @example
642
- # Polars.arange(0, 3, eager: true)
643
- # # =>
644
- # # shape: (3,)
645
- # # Series: 'arange' [i64]
646
- # # [
647
- # # 0
648
- # # 1
649
- # # 2
650
- # # ]
651
def int_range(start, stop, step: 1, eager: false, dtype: nil)
  lower = Utils.parse_as_expression(start)
  upper = Utils.parse_as_expression(stop)

  # Fall back to Int64 when no dtype is given; symbols are passed on as strings.
  dtype ||= Int64
  dtype = dtype.to_s if dtype.is_a?(Symbol)

  range_expr = Utils.wrap_expr(RbExpr.int_range(lower, upper, step, dtype)).alias("arange")

  # Eager mode evaluates the expression right away and returns the Series.
  eager ? select(range_expr).to_series : range_expr
end
664
- alias_method :arange, :int_range
665
-
666
- # Find the indexes that would sort the columns.
667
- #
668
- # Argsort by multiple columns. The first column will be used for the ordering.
669
- # If there are duplicates in the first column, the second column will be used to
670
- # determine the ordering and so on.
671
- #
672
- # @param exprs [Object]
673
- # Columns used to determine the ordering.
674
- # @param reverse [Boolean]
675
- # Default is ascending.
676
- #
677
- # @return [Expr]
678
def arg_sort_by(exprs, reverse: false)
  exprs = [exprs] unless exprs.is_a?(::Array)

  # A single boolean applies to every sort key.
  reverse = Array.new(exprs.length, reverse) if reverse == true || reverse == false

  rb_exprs = Utils.selection_to_rbexpr_list(exprs)
  Utils.wrap_expr(RbExpr.arg_sort_by(rb_exprs, reverse))
end
688
- alias_method :argsort_by, :arg_sort_by
689
-
690
- # Create polars `Duration` from distinct time components.
691
- #
692
- # @return [Expr]
693
- #
694
- # @example
695
- # df = Polars::DataFrame.new(
696
- # {
697
- # "datetime" => [DateTime.new(2022, 1, 1), DateTime.new(2022, 1, 2)],
698
- # "add" => [1, 2]
699
- # }
700
- # )
701
- # df.select(
702
- # [
703
- # (Polars.col("datetime") + Polars.duration(weeks: "add")).alias("add_weeks"),
704
- # (Polars.col("datetime") + Polars.duration(days: "add")).alias("add_days"),
705
- # (Polars.col("datetime") + Polars.duration(seconds: "add")).alias("add_seconds"),
706
- # (Polars.col("datetime") + Polars.duration(milliseconds: "add")).alias(
707
- # "add_milliseconds"
708
- # ),
709
- # (Polars.col("datetime") + Polars.duration(hours: "add")).alias("add_hours")
710
- # ]
711
- # )
712
- # # =>
713
- # # shape: (2, 5)
714
- # # ┌─────────────────────┬─────────────────────┬─────────────────────┬─────────────────────────┬─────────────────────┐
715
- # # │ add_weeks ┆ add_days ┆ add_seconds ┆ add_milliseconds ┆ add_hours │
716
- # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
717
- # # │ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] │
718
- # # ╞═════════════════════╪═════════════════════╪═════════════════════╪═════════════════════════╪═════════════════════╡
719
- # # │ 2022-01-08 00:00:00 ┆ 2022-01-02 00:00:00 ┆ 2022-01-01 00:00:01 ┆ 2022-01-01 00:00:00.001 ┆ 2022-01-01 01:00:00 │
720
- # # │ 2022-01-16 00:00:00 ┆ 2022-01-04 00:00:00 ┆ 2022-01-02 00:00:02 ┆ 2022-01-02 00:00:00.002 ┆ 2022-01-02 02:00:00 │
721
- # # └─────────────────────┴─────────────────────┴─────────────────────┴─────────────────────────┴─────────────────────┘
722
def duration(
  weeks: nil,
  days: nil,
  hours: nil,
  minutes: nil,
  seconds: nil,
  milliseconds: nil,
  microseconds: nil,
  nanoseconds: nil,
  time_unit: "us"
)
  # Components left as nil stay nil; given ones are converted to their
  # underlying expression (strings are not treated as literals here).
  to_rbexpr = lambda do |component|
    component.nil? ? nil : Utils.expr_to_lit_or_expr(component, str_to_lit: false)._rbexpr
  end

  Utils.wrap_expr(
    _rb_duration(
      to_rbexpr.call(weeks),
      to_rbexpr.call(days),
      to_rbexpr.call(hours),
      to_rbexpr.call(minutes),
      to_rbexpr.call(seconds),
      to_rbexpr.call(milliseconds),
      to_rbexpr.call(microseconds),
      to_rbexpr.call(nanoseconds),
      time_unit
    )
  )
end
772
-
773
- # Horizontally concat Utf8 Series in linear time. Non-Utf8 columns are cast to Utf8.
774
- #
775
- # @param exprs [Object]
776
- # Columns to concat into a Utf8 Series.
777
- # @param sep [String]
778
- # String value that will be used to separate the values.
779
- #
780
- # @return [Expr]
781
- #
782
- # @example
783
- # df = Polars::DataFrame.new(
784
- # {
785
- # "a" => [1, 2, 3],
786
- # "b" => ["dogs", "cats", nil],
787
- # "c" => ["play", "swim", "walk"]
788
- # }
789
- # )
790
- # df.with_columns(
791
- # [
792
- # Polars.concat_str(
793
- # [
794
- # Polars.col("a") * 2,
795
- # Polars.col("b"),
796
- # Polars.col("c")
797
- # ],
798
- # sep: " "
799
- # ).alias("full_sentence")
800
- # ]
801
- # )
802
- # # =>
803
- # # shape: (3, 4)
804
- # # ┌─────┬──────┬──────┬───────────────┐
805
- # # │ a ┆ b ┆ c ┆ full_sentence │
806
- # # │ --- ┆ --- ┆ --- ┆ --- │
807
- # # │ i64 ┆ str ┆ str ┆ str │
808
- # # ╞═════╪══════╪══════╪═══════════════╡
809
- # # │ 1 ┆ dogs ┆ play ┆ 2 dogs play │
810
- # # │ 2 ┆ cats ┆ swim ┆ 4 cats swim │
811
- # # │ 3 ┆ null ┆ walk ┆ null │
812
- # # └─────┴──────┴──────┴───────────────┘
813
def concat_str(exprs, sep: "")
  # Convert the selection, then build the concatenation expression.
  rb_exprs = Utils.selection_to_rbexpr_list(exprs)
  Utils.wrap_expr(RbExpr.concat_str(rb_exprs, sep))
end
817
-
818
- # Format expressions as a string.
819
- #
820
- # @param fstring [String]
821
- # A string with placeholders.
822
- # For example: "hello_{}" or "{}_world"
823
- # @param args [Object]
824
- # Expression(s) that fill the placeholders
825
- #
826
- # @return [Expr]
827
- #
828
- # @example
829
- # df = Polars::DataFrame.new(
830
- # {
831
- # "a": ["a", "b", "c"],
832
- # "b": [1, 2, 3]
833
- # }
834
- # )
835
- # df.select(
836
- # [
837
- # Polars.format("foo_{}_bar_{}", Polars.col("a"), "b").alias("fmt")
838
- # ]
839
- # )
840
- # # =>
841
- # # shape: (3, 1)
842
- # # ┌─────────────┐
843
- # # │ fmt │
844
- # # │ --- │
845
- # # │ str │
846
- # # ╞═════════════╡
847
- # # │ foo_a_bar_1 │
848
- # # │ foo_b_bar_2 │
849
- # # │ foo_c_bar_3 │
850
- # # └─────────────┘
851
def format(fstring, *args)
  unless fstring.scan("{}").length == args.length
    raise ArgumentError, "number of placeholders should equal the number of arguments"
  end

  # Splitting on a captured group keeps each "{}" as its own piece.
  placeholders = args.each
  pieces = fstring.split(/(\{\})/)

  exprs = pieces.each_with_object([]) do |piece, collected|
    if piece == "{}"
      # Each placeholder consumes the next argument, in order.
      collected << Utils.expr_to_lit_or_expr(placeholders.next, str_to_lit: false)
    elsif piece.length > 0
      # Non-empty text between placeholders becomes a literal.
      collected << lit(piece)
    end
  end

  concat_str(exprs, sep: "")
end
870
-
871
- # Concat the arrays in a Series dtype List in linear time.
872
- #
873
- # @return [Expr]
874
def concat_list(exprs)
  # Convert the selection, then build the list concatenation expression.
  rb_exprs = Utils.selection_to_rbexpr_list(exprs)
  Utils.wrap_expr(RbExpr.concat_lst(rb_exprs))
end
878
-
879
- # Collect multiple LazyFrames at the same time.
880
- #
881
- # This runs all the computation graphs in parallel on Polars threadpool.
882
- #
883
- # @param lazy_frames [Array<LazyFrame>]
884
- # A list of LazyFrames to collect.
885
- # @param type_coercion [Boolean]
886
- # Do type coercion optimization.
887
- # @param predicate_pushdown [Boolean]
888
- # Do predicate pushdown optimization.
889
- # @param projection_pushdown [Boolean]
890
- # Do projection pushdown optimization.
891
- # @param simplify_expression [Boolean]
892
- # Run simplify expressions optimization.
893
- # @param string_cache [Boolean]
894
- # This argument is deprecated and will be ignored
895
- # @param no_optimization [Boolean]
896
- # Turn off optimizations.
897
- # @param slice_pushdown [Boolean]
898
- # Slice pushdown optimization.
899
- # @param common_subplan_elimination [Boolean]
900
- # Will try to cache branching subplans that occur on self-joins or unions.
901
- # @param allow_streaming [Boolean]
902
- # Run parts of the query in a streaming fashion (this is in an alpha state)
903
- #
904
- # @return [Array]
905
def collect_all(
  lazy_frames,
  type_coercion: true,
  predicate_pushdown: true,
  projection_pushdown: true,
  simplify_expression: true,
  string_cache: false,
  no_optimization: false,
  slice_pushdown: true,
  common_subplan_elimination: true,
  allow_streaming: false
)
  # `no_optimization` switches off these four optimizations in one go.
  if no_optimization
    predicate_pushdown = projection_pushdown = slice_pushdown = common_subplan_elimination = false
  end

  # Apply the optimization flags to every frame's underlying lazy frame.
  prepared = []
  lazy_frames.each do |lf|
    prepared.push(
      lf._ldf.optimization_toggle(
        type_coercion,
        predicate_pushdown,
        projection_pushdown,
        simplify_expression,
        slice_pushdown,
        common_subplan_elimination,
        allow_streaming,
        false
      )
    )
  end

  # Collect everything, then wrap each native result as a DataFrame.
  _collect_all(prepared).map { |rbdf| Utils.wrap_df(rbdf) }
end
947
-
948
- # Run polars expressions without a context.
949
- #
950
- # @return [DataFrame]
951
def select(exprs)
  # Evaluate the expressions against a DataFrame built from an empty array.
  empty_frame = DataFrame.new([])
  empty_frame.select(exprs)
end
954
-
955
- # Collect several columns into a Series of dtype Struct.
956
- #
957
- # @param exprs [Object]
958
- # Columns/Expressions to collect into a Struct
959
- # @param eager [Boolean]
960
- # Evaluate immediately
961
- #
962
- # @return [Object]
963
- #
964
- # @example
965
- # Polars::DataFrame.new(
966
- # {
967
- # "int" => [1, 2],
968
- # "str" => ["a", "b"],
969
- # "bool" => [true, nil],
970
- # "list" => [[1, 2], [3]],
971
- # }
972
- # ).select([Polars.struct(Polars.all).alias("my_struct")])
973
- # # =>
974
- # # shape: (2, 1)
975
- # # ┌─────────────────────┐
976
- # # │ my_struct │
977
- # # │ --- │
978
- # # │ struct[4] │
979
- # # ╞═════════════════════╡
980
- # # │ {1,"a",true,[1, 2]} │
981
- # # │ {2,"b",null,[3]} │
982
- # # └─────────────────────┘
983
- #
984
- # @example Only collect specific columns as a struct:
985
- # df = Polars::DataFrame.new(
986
- # {"a" => [1, 2, 3, 4], "b" => ["one", "two", "three", "four"], "c" => [9, 8, 7, 6]}
987
- # )
988
- # df.with_column(Polars.struct(Polars.col(["a", "b"])).alias("a_and_b"))
989
- # # =>
990
- # # shape: (4, 4)
991
- # # ┌─────┬───────┬─────┬─────────────┐
992
- # # │ a ┆ b ┆ c ┆ a_and_b │
993
- # # │ --- ┆ --- ┆ --- ┆ --- │
994
- # # │ i64 ┆ str ┆ i64 ┆ struct[2] │
995
- # # ╞═════╪═══════╪═════╪═════════════╡
996
- # # │ 1 ┆ one ┆ 9 ┆ {1,"one"} │
997
- # # │ 2 ┆ two ┆ 8 ┆ {2,"two"} │
998
- # # │ 3 ┆ three ┆ 7 ┆ {3,"three"} │
999
- # # │ 4 ┆ four ┆ 6 ┆ {4,"four"} │
1000
- # # └─────┴───────┴─────┴─────────────┘
1001
def struct(exprs, eager: false)
  # Eager mode evaluates the lazy struct expression immediately and hands back
  # the resulting Series. The explicit `return` is required: without it the
  # Series is discarded and the method falls through to return an Expr.
  return Polars.select(struct(exprs, eager: false)).to_series if eager

  rb_exprs = Utils.selection_to_rbexpr_list(exprs)
  Utils.wrap_expr(_as_struct(rb_exprs))
end
1008
-
1009
- # Repeat a single value n times.
1010
- #
1011
- # @param value [Object]
1012
- # Value to repeat.
1013
- # @param n [Integer]
1014
- # Repeat `n` times.
1015
- # @param eager [Boolean]
1016
- # Run eagerly and collect into a `Series`.
1017
- # @param name [String]
1018
- # Only used in `eager` mode. As expression, use `alias`.
1019
- #
1020
- # @return [Expr, Series]
1021
def repeat(value, n, dtype: nil, eager: false, name: nil)
  named = !name.nil?
  warn "the `name` argument is deprecated. Use the `alias` method instead." if named

  # Integer counts are promoted to a literal expression.
  n = lit(n) if n.is_a?(Integer)

  rb_value = Utils.parse_as_expression(value, str_as_lit: true)
  repeated = Utils.wrap_expr(RbExpr.repeat(rb_value, n._rbexpr, dtype))
  # The deprecated `name` is still applied as an alias when given.
  repeated = repeated.alias(name) if named

  # Eager mode evaluates the expression right away and returns the Series.
  eager ? select(repeated).to_series : repeated
end
1040
-
1041
- # Return indices where `condition` evaluates `true`.
1042
- #
1043
- # @param condition [Expr]
1044
- # Boolean expression to evaluate
1045
- # @param eager [Boolean]
1046
- # Whether to apply this function eagerly (as opposed to lazily).
1047
- #
1048
- # @return [Expr, Series]
1049
- #
1050
- # @example
1051
- # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4, 5]})
1052
- # df.select(
1053
- # [
1054
- # Polars.arg_where(Polars.col("a") % 2 == 0)
1055
- # ]
1056
- # ).to_series
1057
- # # =>
1058
- # # shape: (2,)
1059
- # # Series: 'a' [u32]
1060
- # # [
1061
- # # 1
1062
- # # 3
1063
- # # ]
1064
- def arg_where(condition, eager: false)
1065
- if eager
1066
- if !condition.is_a?(Series)
1067
- raise ArgumentError, "expected 'Series' in 'arg_where' if 'eager=True', got #{condition.class.name}"
1068
- end
1069
- condition.to_frame.select(arg_where(Polars.col(condition.name))).to_series
1070
- else
1071
- condition = Utils.expr_to_lit_or_expr(condition, str_to_lit: true)
1072
- Utils.wrap_expr(_arg_where(condition._rbexpr))
1073
- end
1074
- end
1075
-
1076
- # Folds the expressions from left to right, keeping the first non-null value.
1077
- #
1078
- # @param exprs [Object]
1079
- # Expressions to coalesce.
1080
- #
1081
- # @return [Expr]
1082
- #
1083
- # @example
1084
- # df = Polars::DataFrame.new(
1085
- # [
1086
- # [nil, 1.0, 1.0],
1087
- # [nil, 2.0, 2.0],
1088
- # [nil, nil, 3.0],
1089
- # [nil, nil, nil]
1090
- # ],
1091
- # columns: [["a", :f64], ["b", :f64], ["c", :f64]]
1092
- # )
1093
- # df.with_column(Polars.coalesce(["a", "b", "c", 99.9]).alias("d"))
1094
- # # =>
1095
- # # shape: (4, 4)
1096
- # # ┌──────┬──────┬──────┬──────┐
1097
- # # │ a ┆ b ┆ c ┆ d │
1098
- # # │ --- ┆ --- ┆ --- ┆ --- │
1099
- # # │ f64 ┆ f64 ┆ f64 ┆ f64 │
1100
- # # ╞══════╪══════╪══════╪══════╡
1101
- # # │ null ┆ 1.0 ┆ 1.0 ┆ 1.0 │
1102
- # # │ null ┆ 2.0 ┆ 2.0 ┆ 2.0 │
1103
- # # │ null ┆ null ┆ 3.0 ┆ 3.0 │
1104
- # # │ null ┆ null ┆ null ┆ 99.9 │
1105
- # # └──────┴──────┴──────┴──────┘
1106
- def coalesce(exprs, *more_exprs)
1107
- exprs = Utils.selection_to_rbexpr_list(exprs)
1108
- if more_exprs.any?
1109
- exprs.concat(Utils.selection_to_rbexpr_list(more_exprs))
1110
- end
1111
- Utils.wrap_expr(_coalesce_exprs(exprs))
1112
- end
1113
-
1114
- # Utility function that parses an epoch timestamp (or Unix time) to Polars Date(time).
1115
- #
1116
- # Depending on the `unit` provided, this function will return a different dtype:
1117
- # - unit: "d" returns pl.Date
1118
- # - unit: "s" returns pl.Datetime["us"] (pl.Datetime's default)
1119
- # - unit: "ms" returns pl.Datetime["ms"]
1120
- # - unit: "us" returns pl.Datetime["us"]
1121
- # - unit: "ns" returns pl.Datetime["ns"]
1122
- #
1123
- # @param column [Object]
1124
- # Series or expression to parse integers to pl.Datetime.
1125
- # @param unit [String]
1126
- # The unit of the timesteps since epoch time.
1127
- # @param eager [Boolean]
1128
- # If eager evaluation is `true`, a Series is returned instead of an Expr.
1129
- #
1130
- # @return [Object]
1131
- #
1132
- # @example
1133
- # df = Polars::DataFrame.new({"timestamp" => [1666683077, 1666683099]}).lazy
1134
- # df.select(Polars.from_epoch(Polars.col("timestamp"), unit: "s")).collect
1135
- # # =>
1136
- # # shape: (2, 1)
1137
- # # ┌─────────────────────┐
1138
- # # │ timestamp │
1139
- # # │ --- │
1140
- # # │ datetime[μs] │
1141
- # # ╞═════════════════════╡
1142
- # # │ 2022-10-25 07:31:17 │
1143
- # # │ 2022-10-25 07:31:39 │
1144
- # # └─────────────────────┘
1145
- def from_epoch(column, unit: "s", eager: false)
1146
- if Utils.strlike?(column)
1147
- column = col(column)
1148
- elsif !column.is_a?(Series) && !column.is_a?(Expr)
1149
- column = Series.new(column)
1150
- end
1151
-
1152
- if unit == "d"
1153
- expr = column.cast(Date)
1154
- elsif unit == "s"
1155
- expr = (column.cast(Int64) * 1_000_000).cast(Datetime.new("us"))
1156
- elsif Utils::DTYPE_TEMPORAL_UNITS.include?(unit)
1157
- expr = column.cast(Datetime.new(unit))
1158
- else
1159
- raise ArgumentError, "'unit' must be one of {{'ns', 'us', 'ms', 's', 'd'}}, got '#{unit}'."
1160
- end
1161
-
1162
- if eager
1163
- if !column.is_a?(Series)
1164
- raise ArgumentError, "expected Series or Array if eager: true, got #{column.class.name}"
1165
- else
1166
- column.to_frame.select(expr).to_series
1167
- end
1168
- else
1169
- expr
1170
- end
1171
- end
1172
-
1173
- # Start a "when, then, otherwise" expression.
1174
- #
1175
- # @return [When]
1176
- #
1177
- # @example
1178
- # df = Polars::DataFrame.new({"foo" => [1, 3, 4], "bar" => [3, 4, 0]})
1179
- # df.with_column(Polars.when(Polars.col("foo") > 2).then(Polars.lit(1)).otherwise(Polars.lit(-1)))
1180
- # # =>
1181
- # # shape: (3, 3)
1182
- # # ┌─────┬─────┬─────────┐
1183
- # # │ foo ┆ bar ┆ literal │
1184
- # # │ --- ┆ --- ┆ --- │
1185
- # # │ i64 ┆ i64 ┆ i32 │
1186
- # # ╞═════╪═════╪═════════╡
1187
- # # │ 1 ┆ 3 ┆ -1 │
1188
- # # │ 3 ┆ 4 ┆ 1 │
1189
- # # │ 4 ┆ 0 ┆ 1 │
1190
- # # └─────┴─────┴─────────┘
1191
- def when(expr)
1192
- expr = Utils.expr_to_lit_or_expr(expr)
1193
- pw = RbExpr.when(expr._rbexpr)
1194
- When.new(pw)
1195
- end
1196
- end
1197
- end