polars-df 0.7.0-x86_64-linux → 0.9.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +41 -0
  3. data/Cargo.lock +353 -237
  4. data/Cargo.toml +0 -3
  5. data/LICENSE-THIRD-PARTY.txt +1978 -1459
  6. data/LICENSE.txt +1 -1
  7. data/README.md +2 -2
  8. data/lib/polars/3.1/polars.so +0 -0
  9. data/lib/polars/3.2/polars.so +0 -0
  10. data/lib/polars/{3.0 → 3.3}/polars.so +0 -0
  11. data/lib/polars/array_expr.rb +449 -0
  12. data/lib/polars/array_name_space.rb +346 -0
  13. data/lib/polars/cat_expr.rb +24 -0
  14. data/lib/polars/cat_name_space.rb +75 -0
  15. data/lib/polars/config.rb +2 -2
  16. data/lib/polars/data_frame.rb +248 -108
  17. data/lib/polars/data_types.rb +195 -29
  18. data/lib/polars/date_time_expr.rb +41 -24
  19. data/lib/polars/date_time_name_space.rb +12 -12
  20. data/lib/polars/exceptions.rb +12 -1
  21. data/lib/polars/expr.rb +1080 -195
  22. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  23. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  24. data/lib/polars/functions/as_datatype.rb +248 -0
  25. data/lib/polars/functions/col.rb +47 -0
  26. data/lib/polars/functions/eager.rb +182 -0
  27. data/lib/polars/functions/lazy.rb +1280 -0
  28. data/lib/polars/functions/len.rb +49 -0
  29. data/lib/polars/functions/lit.rb +35 -0
  30. data/lib/polars/functions/random.rb +16 -0
  31. data/lib/polars/functions/range/date_range.rb +103 -0
  32. data/lib/polars/functions/range/int_range.rb +51 -0
  33. data/lib/polars/functions/repeat.rb +144 -0
  34. data/lib/polars/functions/whenthen.rb +27 -0
  35. data/lib/polars/functions.rb +29 -416
  36. data/lib/polars/group_by.rb +3 -3
  37. data/lib/polars/io.rb +21 -28
  38. data/lib/polars/lazy_frame.rb +390 -76
  39. data/lib/polars/list_expr.rb +152 -6
  40. data/lib/polars/list_name_space.rb +102 -0
  41. data/lib/polars/meta_expr.rb +175 -7
  42. data/lib/polars/series.rb +557 -59
  43. data/lib/polars/sql_context.rb +1 -1
  44. data/lib/polars/string_cache.rb +75 -0
  45. data/lib/polars/string_expr.rb +412 -96
  46. data/lib/polars/string_name_space.rb +4 -4
  47. data/lib/polars/struct_expr.rb +1 -1
  48. data/lib/polars/struct_name_space.rb +1 -1
  49. data/lib/polars/testing.rb +507 -0
  50. data/lib/polars/utils.rb +64 -20
  51. data/lib/polars/version.rb +1 -1
  52. data/lib/polars.rb +15 -2
  53. metadata +36 -7
  54. data/lib/polars/lazy_functions.rb +0 -1197
@@ -1,1197 +0,0 @@
1
- module Polars
2
- module LazyFunctions
3
- # Return an expression representing a column in a DataFrame.
4
- #
5
- # @return [Expr]
6
def col(name)
  # Inputs that must be treated as a list are normalised up front:
  # a Series becomes its values, a DataType subclass becomes a one-element list.
  name = name.to_a if name.is_a?(Series)
  name = [name] if name.is_a?(Class) && name < DataType

  # A DataType instance is handed to `_dtype_cols` as a one-element list.
  return Utils.wrap_expr(_dtype_cols([name])) if name.is_a?(DataType)

  # Anything that is not an Array is a single column reference
  # (symbols are stringified first).
  unless name.is_a?(::Array)
    single = name.is_a?(Symbol) ? name.to_s : name
    return Utils.wrap_expr(RbExpr.col(single))
  end

  # Only the first entry decides whether the list holds names or dtypes.
  head = name[0]
  if name.length == 0 || Utils.strlike?(head)
    names = name.map { |entry| entry.is_a?(Symbol) ? entry.to_s : entry }
    Utils.wrap_expr(RbExpr.cols(names))
  elsif Utils.is_polars_dtype(head)
    Utils.wrap_expr(_dtype_cols(name))
  else
    raise ArgumentError, "Expected list values to be all `str` or all `DataType`"
  end
end
31
-
32
- # Alias for an element being evaluated in an `eval` expression.
33
- #
34
- # @return [Expr]
35
- #
36
- # @example A horizontal rank computation by taking the elements of a list
37
- # df = Polars::DataFrame.new({"a" => [1, 8, 3], "b" => [4, 5, 2]})
38
- # df.with_column(
39
- # Polars.concat_list(["a", "b"]).list.eval(Polars.element.rank).alias("rank")
40
- # )
41
- # # =>
42
- # # shape: (3, 3)
43
- # # ┌─────┬─────┬────────────┐
44
- # # │ a ┆ b ┆ rank │
45
- # # │ --- ┆ --- ┆ --- │
46
- # # │ i64 ┆ i64 ┆ list[f64] │
47
- # # ╞═════╪═════╪════════════╡
48
- # # │ 1 ┆ 4 ┆ [1.0, 2.0] │
49
- # # │ 8 ┆ 5 ┆ [2.0, 1.0] │
50
- # # │ 3 ┆ 2 ┆ [2.0, 1.0] │
51
- # # └─────┴─────┴────────────┘
52
def element
  # The element is addressed as the column with the empty name.
  blank_name = ""
  col(blank_name)
end
55
-
56
- # Count the number of values in this column/context.
57
- #
58
- # @param column [String, Series, nil]
59
- # If dtype is:
60
- #
61
- # * `Series` : count the values in the series.
62
- # * `String` : count the values in this column.
63
- # * `nil` : count the number of values in this context.
64
- #
65
- # @return [Expr, Integer]
66
def count(column = nil)
  # No argument: return the context-level count expression.
  return Utils.wrap_expr(RbExpr.count) if column.nil?

  # A Series answers with its `len`; any other input is resolved through `col`.
  column.is_a?(Series) ? column.len : col(column).count
end
77
-
78
- # Aggregate to list.
79
- #
80
- # @return [Expr]
81
def to_list(name)
  # NOTE(review): the `element` example in this file uses `.list` as a
  # namespace accessor (`.list.eval(...)`) — confirm this still yields the
  # aggregated Expr that the doc comment promises.
  column_expr = col(name)
  column_expr.list
end
84
-
85
- # Get the standard deviation.
86
- #
87
- # @return [Object]
88
def std(column, ddof: 1)
  # A Series receives the call directly; anything else goes through `col`.
  target = column.is_a?(Series) ? column : col(column)
  target.std(ddof: ddof)
end
95
-
96
- # Get the variance.
97
- #
98
- # @return [Object]
99
def var(column, ddof: 1)
  # A Series receives the call directly; anything else goes through `col`.
  target = column.is_a?(Series) ? column : col(column)
  target.var(ddof: ddof)
end
106
-
107
- # Get the maximum value.
108
- #
109
- # @param column [Object]
110
- # Column(s) to be used in aggregation. Will lead to different behavior based on
111
- # the input:
112
- #
113
- # - [String, Series] -> aggregate the maximum value of that column.
114
- # - [Array<Expr>] -> aggregate the maximum value horizontally.
115
- #
116
- # @return [Expr, Object]
117
def max(column)
  # Series and string-like inputs each have a direct `max`.
  return column.max if column.is_a?(Series)
  return col(column).max if Utils.strlike?(column)

  # Everything else is converted to a list of expressions and combined
  # through `_max_exprs`.
  # TODO
  rb_exprs = Utils.selection_to_rbexpr_list(column)
  Utils.wrap_expr(_max_exprs(rb_exprs))
end
128
-
129
- # Get the minimum value.
130
- #
131
- # @param column [Object]
132
- # Column(s) to be used in aggregation. Will lead to different behavior based on
133
- # the input:
134
- #
135
- # - [String, Series] -> aggregate the minimum value of that column.
136
- # - [Array<Expr>] -> aggregate the minimum value horizontally.
137
- #
138
- # @return [Expr, Object]
139
def min(column)
  # Series and string-like inputs each have a direct `min`.
  return column.min if column.is_a?(Series)
  return col(column).min if Utils.strlike?(column)

  # Everything else is converted to a list of expressions and combined
  # through `_min_exprs`.
  # TODO
  rb_exprs = Utils.selection_to_rbexpr_list(column)
  Utils.wrap_expr(_min_exprs(rb_exprs))
end
150
-
151
- # Sum values in a column/Series, or horizontally across list of columns/expressions.
152
- #
153
- # @return [Object]
154
def sum(column)
  # Series: call `sum` on it directly.
  return column.sum if column.is_a?(Series)

  # String-like: sum of the named column.
  return col(column.to_s).sum if Utils.strlike?(column)

  # Array: horizontal sum across the listed columns/expressions.
  if column.is_a?(::Array)
    rb_exprs = Utils.selection_to_rbexpr_list(column)
    return Utils.wrap_expr(_sum_horizontal(rb_exprs))
  end

  # Fallback: left fold with `+`, seeded with a u32 zero, named "sum".
  seed = lit(0).cast(:u32)
  adder = ->(a, b) { a + b }
  fold(seed, adder, column).alias("sum")
end
166
-
167
- # Get the mean value.
168
- #
169
- # @return [Expr, Float]
170
def mean(column)
  # A Series receives the call directly; anything else goes through `col`.
  source = column.is_a?(Series) ? column : col(column)
  source.mean
end
177
-
178
- # Get the mean value.
179
- #
180
- # @return [Expr, Float]
181
def avg(column)
  # Pure synonym: forwards to `mean` unchanged.
  average = mean(column)
  average
end
184
-
185
- # Get the median value.
186
- #
187
- # @return [Object]
188
def median(column)
  # A Series receives the call directly; anything else goes through `col`.
  source = column.is_a?(Series) ? column : col(column)
  source.median
end
195
-
196
- # Count unique values.
197
- #
198
- # @return [Object]
199
def n_unique(column)
  # A Series receives the call directly; anything else goes through `col`.
  source = column.is_a?(Series) ? column : col(column)
  source.n_unique
end
206
-
207
- # Get the first value.
208
- #
209
- # @return [Object]
210
def first(column = nil)
  # No argument: return the context-level "first" expression.
  return Utils.wrap_expr(RbExpr.first) if column.nil?

  # Non-Series input is treated as a column reference.
  return col(column).first unless column.is_a?(Series)

  # An empty Series has no first value to hand back.
  unless column.len > 0
    raise IndexError, "The series is empty, so no first value can be returned."
  end

  column[0]
end
225
-
226
- # Get the last value.
227
- #
228
- # Depending on the input type this function does different things:
229
- #
230
- # - nil -> expression to take last column of a context.
231
- # - String -> syntactic sugar for `Polars.col(..).last`
232
- # - Series -> Take last value in `Series`
233
- #
234
- # @return [Object]
235
def last(column = nil)
  # No argument: return the context-level "last" expression.
  return Utils.wrap_expr(_last) if column.nil?

  # Non-Series input is treated as a column reference.
  return col(column).last unless column.is_a?(Series)

  # For a Series, index from the end, refusing an empty one.
  if column.len > 0
    column[-1]
  else
    raise IndexError, "The series is empty, so no last value can be returned"
  end
end
249
-
250
- # Get the first `n` rows.
251
- #
252
- # @param column [Object]
253
- # Column name or Series.
254
- # @param n [Integer]
255
- # Number of rows to return.
256
- #
257
- # @return [Object]
258
def head(column, n = 10)
  # A Series receives the call directly; anything else goes through `col`.
  receiver = column.is_a?(Series) ? column : col(column)
  receiver.head(n)
end
265
-
266
- # Get the last `n` rows.
267
- #
268
- # @param column [Object]
269
- # Column name or Series.
270
- # @param n [Integer]
271
- # Number of rows to return.
272
- #
273
- # @return [Object]
274
def tail(column, n = 10)
  # A Series receives the call directly; anything else goes through `col`.
  receiver = column.is_a?(Series) ? column : col(column)
  receiver.tail(n)
end
281
-
282
- # Return an expression representing a literal value.
283
- #
284
- # @return [Expr]
285
def lit(value, dtype: nil, allow_object: nil)
  if value.is_a?(::Time) || value.is_a?(::DateTime)
    # `dtype` is optional, so both attribute reads use safe navigation.
    # (The previous `dtype.&time_zone` invoked the `&` operator on `dtype`
    # instead of reading its time zone.)
    time_unit = dtype&.time_unit || "ns"
    time_zone = dtype&.time_zone

    # Build the literal from the integer timestamp, then cast it to a
    # Datetime of the requested unit.
    e = lit(Utils._datetime_to_pl_timestamp(value, time_unit)).cast(Datetime.new(time_unit))
    return time_zone ? e.dt.replace_time_zone(time_zone.to_s) : e
  elsif value.is_a?(::Date)
    # Dates go through the Time branch (midnight UTC) and are cast back to Date.
    return lit(::Time.utc(value.year, value.month, value.day)).cast(Date)
  elsif value.is_a?(Polars::Series)
    # A Series literal keeps its name as an alias unless the name is empty.
    name = value.name
    e = Utils.wrap_expr(RbExpr.lit(value._s, allow_object))
    return name == "" ? e : e.alias(name)
  elsif (defined?(Numo::NArray) && value.is_a?(Numo::NArray)) || value.is_a?(::Array)
    # Arrays (and Numo arrays, when Numo is loaded) are wrapped in an unnamed Series.
    return lit(Series.new("", value))
  elsif dtype
    return Utils.wrap_expr(RbExpr.lit(value, allow_object)).cast(dtype)
  end

  Utils.wrap_expr(RbExpr.lit(value, allow_object))
end
313
-
314
- # Cumulatively sum values in a column/Series, or horizontally across list of columns/expressions.
315
- #
316
- # @param column [Object]
317
- # Column(s) to be used in aggregation.
318
- #
319
- # @return [Object]
320
- #
321
- # @example
322
- # df = Polars::DataFrame.new(
323
- # {
324
- # "a" => [1, 2],
325
- # "b" => [3, 4],
326
- # "c" => [5, 6]
327
- # }
328
- # )
329
- # # =>
330
- # # shape: (2, 3)
331
- # # ┌─────┬─────┬─────┐
332
- # # │ a ┆ b ┆ c │
333
- # # │ --- ┆ --- ┆ --- │
334
- # # │ i64 ┆ i64 ┆ i64 │
335
- # # ╞═════╪═════╪═════╡
336
- # # │ 1 ┆ 3 ┆ 5 │
337
- # # │ 2 ┆ 4 ┆ 6 │
338
- # # └─────┴─────┴─────┘
339
- #
340
- # @example Cumulatively sum a column by name:
341
- # df.select(Polars.cumsum("a"))
342
- # # =>
343
- # # shape: (2, 1)
344
- # # ┌─────┐
345
- # # │ a │
346
- # # │ --- │
347
- # # │ i64 │
348
- # # ╞═════╡
349
- # # │ 1 │
350
- # # │ 3 │
351
- # # └─────┘
352
- #
353
- # @example Cumulatively sum a list of columns/expressions horizontally:
354
- # df.with_column(Polars.cumsum(["a", "c"]))
355
- # # =>
356
- # # shape: (2, 4)
357
- # # ┌─────┬─────┬─────┬───────────┐
358
- # # │ a ┆ b ┆ c ┆ cumsum │
359
- # # │ --- ┆ --- ┆ --- ┆ --- │
360
- # # │ i64 ┆ i64 ┆ i64 ┆ struct[2] │
361
- # # ╞═════╪═════╪═════╪═══════════╡
362
- # # │ 1 ┆ 3 ┆ 5 ┆ {1,6} │
363
- # # │ 2 ┆ 4 ┆ 6 ┆ {2,8} │
364
- # # └─────┴─────┴─────┴───────────┘
365
def cumsum(column)
  # Series and string-like inputs each have a direct `cumsum`.
  return column.cumsum if column.is_a?(Series)
  return col(column).cumsum if Utils.strlike?(column)

  # Otherwise: cumulative left fold with `+`, seeded with a u32 zero,
  # named "cumsum".
  seed = lit(0).cast(:u32)
  adder = ->(a, b) { a + b }
  cumfold(seed, adder, column).alias("cumsum")
end
374
-
375
- # Compute the spearman rank correlation between two columns.
376
- #
377
- # Missing data will be excluded from the computation.
378
- #
379
- # @param a [Object]
380
- # Column name or Expression.
381
- # @param b [Object]
382
- # Column name or Expression.
383
- # @param ddof [Integer]
384
- # Delta degrees of freedom
385
- # @param propagate_nans [Boolean]
386
- # If `true`, any `NaN` encountered will lead to `NaN` in the output.
387
- # Defaults to `false`, where `NaN` values are regarded as larger than any finite number
388
- # and thus lead to the highest rank.
389
- #
390
- # @return [Expr]
391
def spearman_rank_corr(a, b, ddof: 1, propagate_nans: false)
  # String-like operands are column names; resolve them to expressions.
  lhs, rhs = [a, b].map { |side| Utils.strlike?(side) ? col(side) : side }

  Utils.wrap_expr(
    RbExpr.spearman_rank_corr(lhs._rbexpr, rhs._rbexpr, ddof, propagate_nans)
  )
end
400
-
401
- # Compute the pearson's correlation between two columns.
402
- #
403
- # @param a [Object]
404
- # Column name or Expression.
405
- # @param b [Object]
406
- # Column name or Expression.
407
- # @param ddof [Integer]
408
- # Delta degrees of freedom
409
- #
410
- # @return [Expr]
411
def pearson_corr(a, b, ddof: 1)
  # String-like operands are column names; resolve them to expressions.
  lhs, rhs = [a, b].map { |side| Utils.strlike?(side) ? col(side) : side }

  Utils.wrap_expr(RbExpr.pearson_corr(lhs._rbexpr, rhs._rbexpr, ddof))
end
420
-
421
- # Compute the covariance between two columns/ expressions.
422
- #
423
- # @param a [Object]
424
- # Column name or Expression.
425
- # @param b [Object]
426
- # Column name or Expression.
427
- #
428
- # @return [Expr]
429
def cov(a, b)
  # String-like operands are column names; resolve them to expressions.
  lhs, rhs = [a, b].map { |side| Utils.strlike?(side) ? col(side) : side }

  Utils.wrap_expr(RbExpr.cov(lhs._rbexpr, rhs._rbexpr))
end
438
-
439
- # def map
440
- # end
441
-
442
- # def apply
443
- # end
444
-
445
- # Accumulate over multiple columns horizontally/row wise with a left fold.
446
- #
447
- # @return [Expr]
448
def fold(acc, f, exprs)
  # The accumulator may be a plain value; strings are treated as literals.
  seed = Utils.expr_to_lit_or_expr(acc, str_to_lit: true)

  # A lone Expr is promoted to a one-element list before conversion.
  inputs = exprs.is_a?(Expr) ? [exprs] : exprs
  rb_inputs = Utils.selection_to_rbexpr_list(inputs)

  Utils.wrap_expr(RbExpr.fold(seed._rbexpr, f, rb_inputs))
end
457
-
458
- # def reduce
459
- # end
460
-
461
- # Cumulatively accumulate over multiple columns horizontally/row wise with a left fold.
462
- #
463
- # Every cumulative result is added as a separate field in a Struct column.
464
- #
465
- # @param acc [Object]
466
- # Accumulator Expression. This is the value that will be initialized when the fold
467
- # starts. For a sum this could for instance be lit(0).
468
- # @param f [Object]
469
- # Function to apply over the accumulator and the value.
470
- # Fn(acc, value) -> new_value
471
- # @param exprs [Object]
472
- # Expressions to aggregate over. May also be a wildcard expression.
473
- # @param include_init [Boolean]
474
- # Include the initial accumulator state as struct field.
475
- #
476
- # @return [Object]
477
- #
478
- # @note
479
- # If you simply want the first encountered expression as accumulator,
480
- # consider using `cumreduce`.
481
def cumfold(acc, f, exprs, include_init: false)
  # The accumulator may be a plain value; strings are treated as literals.
  seed = Utils.expr_to_lit_or_expr(acc, str_to_lit: true)

  # A lone Expr is promoted to a one-element list before conversion.
  inputs = exprs.is_a?(Expr) ? [exprs] : exprs
  rb_inputs = Utils.selection_to_rbexpr_list(inputs)

  Utils.wrap_expr(RbExpr.cumfold(seed._rbexpr, f, rb_inputs, include_init))
end
490
-
491
- # def cumreduce
492
- # end
493
-
494
- # Evaluate columnwise or elementwise with a bitwise OR operation.
495
- #
496
- # @return [Expr]
497
def any(name)
  # A string-like argument names a single column.
  return col(name).any if Utils.strlike?(name)

  # Otherwise OR the inputs together (each cast to bool), starting from
  # `false`, and name the result "any".
  seed = lit(false)
  either = ->(a, b) { a.cast(:bool) | b.cast(:bool) }
  fold(seed, either, name).alias("any")
end
504
-
505
- # Exclude certain columns from a wildcard/regex selection.
506
- #
507
- # @param columns [Object]
508
- # Column(s) to exclude from selection
509
- # This can be:
510
- #
511
- # - a column name, or multiple column names
512
- # - a regular expression starting with `^` and ending with `$`
513
- # - a dtype or multiple dtypes
514
- #
515
- # @return [Object]
516
- #
517
- # @example
518
- # df = Polars::DataFrame.new(
519
- # {
520
- # "aa" => [1, 2, 3],
521
- # "ba" => ["a", "b", nil],
522
- # "cc" => [nil, 2.5, 1.5]
523
- # }
524
- # )
525
- # # =>
526
- # # shape: (3, 3)
527
- # # ┌─────┬──────┬──────┐
528
- # # │ aa ┆ ba ┆ cc │
529
- # # │ --- ┆ --- ┆ --- │
530
- # # │ i64 ┆ str ┆ f64 │
531
- # # ╞═════╪══════╪══════╡
532
- # # │ 1 ┆ a ┆ null │
533
- # # │ 2 ┆ b ┆ 2.5 │
534
- # # │ 3 ┆ null ┆ 1.5 │
535
- # # └─────┴──────┴──────┘
536
- #
537
- # @example Exclude by column name(s):
538
- # df.select(Polars.exclude("ba"))
539
- # # =>
540
- # # shape: (3, 2)
541
- # # ┌─────┬──────┐
542
- # # │ aa ┆ cc │
543
- # # │ --- ┆ --- │
544
- # # │ i64 ┆ f64 │
545
- # # ╞═════╪══════╡
546
- # # │ 1 ┆ null │
547
- # # │ 2 ┆ 2.5 │
548
- # # │ 3 ┆ 1.5 │
549
- # # └─────┴──────┘
550
- #
551
- # @example Exclude by regex, e.g. removing all columns whose names end with the letter "a":
552
- # df.select(Polars.exclude("^.*a$"))
553
- # # =>
554
- # # shape: (3, 1)
555
- # # ┌──────┐
556
- # # │ cc │
557
- # # │ --- │
558
- # # │ f64 │
559
- # # ╞══════╡
560
- # # │ null │
561
- # # │ 2.5 │
562
- # # │ 1.5 │
563
- # # └──────┘
564
def exclude(columns)
  # Start from the wildcard selection and remove the requested columns.
  everything = col("*")
  everything.exclude(columns)
end
567
-
568
- # Do one of two things.
569
- #
570
- # * function can do a columnwise or elementwise AND operation
571
- # * a wildcard column selection
572
- #
573
- # @param name [Object]
574
- # If given this function will apply a bitwise & on the columns.
575
- #
576
- # @return [Expr]
577
- #
578
- # @example Sum all columns
579
- # df = Polars::DataFrame.new(
580
- # {"a" => [1, 2, 3], "b" => ["hello", "foo", "bar"], "c" => [1, 1, 1]}
581
- # )
582
- # df.select(Polars.all.sum)
583
- # # =>
584
- # # shape: (1, 3)
585
- # # ┌─────┬──────┬─────┐
586
- # # │ a ┆ b ┆ c │
587
- # # │ --- ┆ --- ┆ --- │
588
- # # │ i64 ┆ str ┆ i64 │
589
- # # ╞═════╪══════╪═════╡
590
- # # │ 6 ┆ null ┆ 3 │
591
- # # └─────┴──────┴─────┘
592
def all(name = nil)
  # No argument: wildcard column selection.
  return col("*") if name.nil?

  # Only string-like names are handled; every other input raises `Todo`.
  raise Todo unless Utils.strlike?(name)

  col(name).all
end
601
-
602
- # Syntactic sugar for `Polars.col("foo").agg_groups`.
603
- #
604
- # @return [Object]
605
def groups(column)
  # Shorthand for calling `agg_groups` on the named column.
  column_expr = col(column)
  column_expr.agg_groups
end
608
-
609
- # Syntactic sugar for `Polars.col("foo").quantile(...)`.
610
- #
611
- # @param column [String]
612
- # Column name.
613
- # @param quantile [Float]
614
- # Quantile between 0.0 and 1.0.
615
- # @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
616
- # Interpolation method.
617
- #
618
- # @return [Expr]
619
def quantile(column, quantile, interpolation: "nearest")
  # Shorthand for calling `quantile` on the named column.
  column_expr = col(column)
  column_expr.quantile(quantile, interpolation: interpolation)
end
622
-
623
- # Create a range expression (or Series).
624
- #
625
- # This can be used in a `select`, `with_column`, etc. Be sure that the resulting
626
- # range size is equal to the length of the DataFrame you are collecting.
627
- #
628
- # @param start [Integer, Expr, Series]
629
- # Lower bound of range.
630
- # @param stop [Integer, Expr, Series]
631
- # Upper bound of range.
632
- # @param step [Integer]
633
- # Step size of the range.
634
- # @param eager [Boolean]
635
- # If eager evaluation is `true`, a Series is returned instead of an Expr.
636
- # @param dtype [Symbol]
637
- # Apply an explicit integer dtype to the resulting expression (default is `Int64`).
638
- #
639
- # @return [Expr, Series]
640
- #
641
- # @example
642
- # Polars.arange(0, 3, eager: true)
643
- # # =>
644
- # # shape: (3,)
645
- # # Series: 'arange' [i64]
646
- # # [
647
- # # 0
648
- # # 1
649
- # # 2
650
- # # ]
651
def int_range(start, stop, step: 1, eager: false, dtype: nil)
  lower = Utils.parse_as_expression(start)
  upper = Utils.parse_as_expression(stop)

  # Default dtype is Int64; a Symbol dtype is passed on as its string form.
  dtype ||= Int64
  dtype = dtype.to_s if dtype.is_a?(Symbol)

  # The expression is always named "arange".
  result = Utils.wrap_expr(RbExpr.int_range(lower, upper, step, dtype)).alias("arange")

  # Eager mode evaluates the expression through `select` and returns the Series.
  eager ? select(result).to_series : result
end
664
- alias_method :arange, :int_range
665
-
666
- # Find the indexes that would sort the columns.
667
- #
668
- # Argsort by multiple columns. The first column will be used for the ordering.
669
- # If there are duplicates in the first column, the second column will be used to
670
- # determine the ordering and so on.
671
- #
672
- # @param exprs [Object]
673
- # Columns used to determine the ordering.
674
- # @param reverse [Boolean]
675
- # Default is ascending.
676
- #
677
- # @return [Expr]
678
def arg_sort_by(exprs, reverse: false)
  # A single expression is promoted to a one-element list.
  exprs = [exprs] unless exprs.is_a?(::Array)

  # A lone boolean applies to every sort column.
  if reverse == true || reverse == false
    reverse = Array.new(exprs.length, reverse)
  end

  rb_exprs = Utils.selection_to_rbexpr_list(exprs)
  Utils.wrap_expr(RbExpr.arg_sort_by(rb_exprs, reverse))
end
688
- alias_method :argsort_by, :arg_sort_by
689
-
690
- # Create polars `Duration` from distinct time components.
691
- #
692
- # @return [Expr]
693
- #
694
- # @example
695
- # df = Polars::DataFrame.new(
696
- # {
697
- # "datetime" => [DateTime.new(2022, 1, 1), DateTime.new(2022, 1, 2)],
698
- # "add" => [1, 2]
699
- # }
700
- # )
701
- # df.select(
702
- # [
703
- # (Polars.col("datetime") + Polars.duration(weeks: "add")).alias("add_weeks"),
704
- # (Polars.col("datetime") + Polars.duration(days: "add")).alias("add_days"),
705
- # (Polars.col("datetime") + Polars.duration(seconds: "add")).alias("add_seconds"),
706
- # (Polars.col("datetime") + Polars.duration(milliseconds: "add")).alias(
707
- # "add_milliseconds"
708
- # ),
709
- # (Polars.col("datetime") + Polars.duration(hours: "add")).alias("add_hours")
710
- # ]
711
- # )
712
- # # =>
713
- # # shape: (2, 5)
714
- # # ┌─────────────────────┬─────────────────────┬─────────────────────┬─────────────────────────┬─────────────────────┐
715
- # # │ add_weeks ┆ add_days ┆ add_seconds ┆ add_milliseconds ┆ add_hours │
716
- # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
717
- # # │ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] │
718
- # # ╞═════════════════════╪═════════════════════╪═════════════════════╪═════════════════════════╪═════════════════════╡
719
- # # │ 2022-01-08 00:00:00 ┆ 2022-01-02 00:00:00 ┆ 2022-01-01 00:00:01 ┆ 2022-01-01 00:00:00.001 ┆ 2022-01-01 01:00:00 │
720
- # # │ 2022-01-16 00:00:00 ┆ 2022-01-04 00:00:00 ┆ 2022-01-02 00:00:02 ┆ 2022-01-02 00:00:00.002 ┆ 2022-01-02 02:00:00 │
721
- # # └─────────────────────┴─────────────────────┴─────────────────────┴─────────────────────────┴─────────────────────┘
722
def duration(
  weeks: nil,
  days: nil,
  hours: nil,
  minutes: nil,
  seconds: nil,
  milliseconds: nil,
  microseconds: nil,
  nanoseconds: nil,
  time_unit: "us"
)
  # Each supplied component is converted to its underlying `_rbexpr`
  # (strings are NOT turned into literals); omitted components stay nil.
  to_rbexpr = lambda do |component|
    component.nil? ? nil : Utils.expr_to_lit_or_expr(component, str_to_lit: false)._rbexpr
  end

  # Order matters: it is the positional order `_rb_duration` receives.
  components = [
    weeks, days, hours, minutes, seconds, milliseconds, microseconds, nanoseconds
  ].map(&to_rbexpr)

  Utils.wrap_expr(_rb_duration(*components, time_unit))
end
772
-
773
- # Horizontally concat Utf8 Series in linear time. Non-Utf8 columns are cast to Utf8.
774
- #
775
- # @param exprs [Object]
776
- # Columns to concat into a Utf8 Series.
777
- # @param sep [String]
778
- # String value that will be used to separate the values.
779
- #
780
- # @return [Expr]
781
- #
782
- # @example
783
- # df = Polars::DataFrame.new(
784
- # {
785
- # "a" => [1, 2, 3],
786
- # "b" => ["dogs", "cats", nil],
787
- # "c" => ["play", "swim", "walk"]
788
- # }
789
- # )
790
- # df.with_columns(
791
- # [
792
- # Polars.concat_str(
793
- # [
794
- # Polars.col("a") * 2,
795
- # Polars.col("b"),
796
- # Polars.col("c")
797
- # ],
798
- # sep: " "
799
- # ).alias("full_sentence")
800
- # ]
801
- # )
802
- # # =>
803
- # # shape: (3, 4)
804
- # # ┌─────┬──────┬──────┬───────────────┐
805
- # # │ a ┆ b ┆ c ┆ full_sentence │
806
- # # │ --- ┆ --- ┆ --- ┆ --- │
807
- # # │ i64 ┆ str ┆ str ┆ str │
808
- # # ╞═════╪══════╪══════╪═══════════════╡
809
- # # │ 1 ┆ dogs ┆ play ┆ 2 dogs play │
810
- # # │ 2 ┆ cats ┆ swim ┆ 4 cats swim │
811
- # # │ 3 ┆ null ┆ walk ┆ null │
812
- # # └─────┴──────┴──────┴───────────────┘
813
def concat_str(exprs, sep: "")
  # Convert the selection, then concatenate with the given separator.
  rb_exprs = Utils.selection_to_rbexpr_list(exprs)
  Utils.wrap_expr(RbExpr.concat_str(rb_exprs, sep))
end
817
-
818
- # Format expressions as a string.
819
- #
820
- # @param fstring [String]
821
- # A string with placeholders.
822
- # For example: "hello_{}" or "{}_world"
823
- # @param args [Object]
824
- # Expression(s) that fill the placeholders
825
- #
826
- # @return [Expr]
827
- #
828
- # @example
829
- # df = Polars::DataFrame.new(
830
- # {
831
- # "a": ["a", "b", "c"],
832
- # "b": [1, 2, 3]
833
- # }
834
- # )
835
- # df.select(
836
- # [
837
- # Polars.format("foo_{}_bar_{}", Polars.col("a"), "b").alias("fmt")
838
- # ]
839
- # )
840
- # # =>
841
- # # shape: (3, 1)
842
- # # ┌─────────────┐
843
- # # │ fmt │
844
- # # │ --- │
845
- # # │ str │
846
- # # ╞═════════════╡
847
- # # │ foo_a_bar_1 │
848
- # # │ foo_b_bar_2 │
849
- # # │ foo_c_bar_3 │
850
- # # └─────────────┘
851
def format(fstring, *args)
  # Every `{}` placeholder must have exactly one argument.
  unless fstring.scan("{}").length == args.length
    raise ArgumentError, "number of placeholders should equal the number of arguments"
  end

  # Splitting on a captured `{}` keeps the placeholders as their own
  # segments, so literal text and arguments can be interleaved in order.
  cursor = 0
  pieces = fstring.split(/(\{\})/).each_with_object([]) do |segment, acc|
    if segment == "{}"
      acc << Utils.expr_to_lit_or_expr(args[cursor], str_to_lit: false)
      cursor += 1
    elsif !segment.empty?
      acc << lit(segment)
    end
  end

  concat_str(pieces, sep: "")
end
870
-
871
- # Concat the arrays in a Series dtype List in linear time.
872
- #
873
- # @return [Expr]
874
def concat_list(exprs)
  # Convert the selection and hand it to the native `concat_lst`.
  rb_exprs = Utils.selection_to_rbexpr_list(exprs)
  Utils.wrap_expr(RbExpr.concat_lst(rb_exprs))
end
878
-
879
- # Collect multiple LazyFrames at the same time.
880
- #
881
- # This runs all the computation graphs in parallel on Polars threadpool.
882
- #
883
- # @param lazy_frames [Array<LazyFrame>]
884
- # A list of LazyFrames to collect.
885
- # @param type_coercion [Boolean]
886
- # Do type coercion optimization.
887
- # @param predicate_pushdown [Boolean]
888
- # Do predicate pushdown optimization.
889
- # @param projection_pushdown [Boolean]
890
- # Do projection pushdown optimization.
891
- # @param simplify_expression [Boolean]
892
- # Run simplify expressions optimization.
893
- # @param string_cache [Boolean]
894
- # This argument is deprecated and will be ignored
895
- # @param no_optimization [Boolean]
896
- # Turn off optimizations.
897
- # @param slice_pushdown [Boolean]
898
- # Slice pushdown optimization.
899
- # @param common_subplan_elimination [Boolean]
900
- # Will try to cache branching subplans that occur on self-joins or unions.
901
- # @param allow_streaming [Boolean]
902
- # Run parts of the query in a streaming fashion (this is in an alpha state)
903
- #
904
- # @return [Array]
905
def collect_all(
  lazy_frames,
  type_coercion: true,
  predicate_pushdown: true,
  projection_pushdown: true,
  simplify_expression: true,
  string_cache: false,
  no_optimization: false,
  slice_pushdown: true,
  common_subplan_elimination: true,
  allow_streaming: false
)
  # `string_cache` is accepted for compatibility but never read here.

  # `no_optimization` switches off these four toggles; `type_coercion` and
  # `simplify_expression` are passed through as given.
  if no_optimization
    predicate_pushdown = projection_pushdown = slice_pushdown = common_subplan_elimination = false
  end

  # Apply the toggles to every frame's underlying `_ldf`.
  prepared = lazy_frames.map do |lf|
    lf._ldf.optimization_toggle(
      type_coercion,
      predicate_pushdown,
      projection_pushdown,
      simplify_expression,
      slice_pushdown,
      common_subplan_elimination,
      allow_streaming,
      false
    )
  end

  # Collect them together and wrap each native frame as a DataFrame.
  _collect_all(prepared).map { |rbdf| Utils.wrap_df(rbdf) }
end
947
-
948
- # Run polars expressions without a context.
949
- #
950
- # @return [DataFrame]
951
def select(exprs)
  # Evaluate the expressions against a DataFrame built from an empty list.
  empty_frame = DataFrame.new([])
  empty_frame.select(exprs)
end
954
-
955
- # Collect several columns into a Series of dtype Struct.
956
- #
957
- # @param exprs [Object]
958
- # Columns/Expressions to collect into a Struct
959
- # @param eager [Boolean]
960
- # Evaluate immediately
961
- #
962
- # @return [Object]
963
- #
964
- # @example
965
- # Polars::DataFrame.new(
966
- # {
967
- # "int" => [1, 2],
968
- # "str" => ["a", "b"],
969
- # "bool" => [true, nil],
970
- # "list" => [[1, 2], [3]],
971
- # }
972
- # ).select([Polars.struct(Polars.all).alias("my_struct")])
973
- # # =>
974
- # # shape: (2, 1)
975
- # # ┌─────────────────────┐
976
- # # │ my_struct │
977
- # # │ --- │
978
- # # │ struct[4] │
979
- # # ╞═════════════════════╡
980
- # # │ {1,"a",true,[1, 2]} │
981
- # # │ {2,"b",null,[3]} │
982
- # # └─────────────────────┘
983
- #
984
- # @example Only collect specific columns as a struct:
985
- # df = Polars::DataFrame.new(
986
- # {"a" => [1, 2, 3, 4], "b" => ["one", "two", "three", "four"], "c" => [9, 8, 7, 6]}
987
- # )
988
- # df.with_column(Polars.struct(Polars.col(["a", "b"])).alias("a_and_b"))
989
- # # =>
990
- # # shape: (4, 4)
991
- # # ┌─────┬───────┬─────┬─────────────┐
992
- # # │ a ┆ b ┆ c ┆ a_and_b │
993
- # # │ --- ┆ --- ┆ --- ┆ --- │
994
- # # │ i64 ┆ str ┆ i64 ┆ struct[2] │
995
- # # ╞═════╪═══════╪═════╪═════════════╡
996
- # # │ 1 ┆ one ┆ 9 ┆ {1,"one"} │
997
- # # │ 2 ┆ two ┆ 8 ┆ {2,"two"} │
998
- # # │ 3 ┆ three ┆ 7 ┆ {3,"three"} │
999
- # # │ 4 ┆ four ┆ 6 ┆ {4,"four"} │
1000
- # # └─────┴───────┴─────┴─────────────┘
1001
def struct(exprs, eager: false)
  # Eager mode: build the lazy struct expression, evaluate it through
  # `Polars.select`, and return the resulting Series. The explicit `return`
  # is required; without it the Series was discarded and an Expr was
  # returned regardless of `eager`.
  if eager
    return Polars.select(struct(exprs, eager: false)).to_series
  end

  # Lazy mode: wrap the converted selection in a struct expression.
  rb_exprs = Utils.selection_to_rbexpr_list(exprs)
  Utils.wrap_expr(_as_struct(rb_exprs))
end
1008
-
1009
- # Repeat a single value n times.
1010
- #
1011
- # @param value [Object]
1012
- # Value to repeat.
1013
- # @param n [Integer]
1014
- # Repeat `n` times.
1015
- # @param eager [Boolean]
1016
- # Run eagerly and collect into a `Series`.
1017
- # @param name [String]
1018
- # Only used in `eager` mode. As expression, use `alias`.
1019
- #
1020
- # @return [Expr]
1021
def repeat(value, n, dtype: nil, eager: false, name: nil)
  # `name` still works but is deprecated in favour of `alias`.
  named = !name.nil?
  warn "the `name` argument is deprecated. Use the `alias` method instead." if named

  # An Integer count is lifted to a literal so `_rbexpr` can be read from it.
  n = lit(n) if n.is_a?(Integer)

  # Strings passed as `value` are treated as literals.
  value = Utils.parse_as_expression(value, str_as_lit: true)
  expr = Utils.wrap_expr(RbExpr.repeat(value, n._rbexpr, dtype))
  expr = expr.alias(name) if named

  # Eager mode evaluates the expression through `select` and returns the Series.
  eager ? select(expr).to_series : expr
end
1040
-
1041
- # Return indices where `condition` evaluates `true`.
1042
- #
1043
- # @param condition [Expr]
1044
- # Boolean expression to evaluate
1045
- # @param eager [Boolean]
1046
- # Whether to apply this function eagerly (as opposed to lazily).
1047
- #
1048
- # @return [Expr, Series]
1049
- #
1050
- # @example
1051
- # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4, 5]})
1052
- # df.select(
1053
- # [
1054
- # Polars.arg_where(Polars.col("a") % 2 == 0)
1055
- # ]
1056
- # ).to_series
1057
- # # =>
1058
- # # shape: (2,)
1059
- # # Series: 'a' [u32]
1060
- # # [
1061
- # # 1
1062
- # # 3
1063
- # # ]
1064
- def arg_where(condition, eager: false)
1065
- if eager
1066
- if !condition.is_a?(Series)
1067
- raise ArgumentError, "expected 'Series' in 'arg_where' if 'eager=True', got #{condition.class.name}"
1068
- end
1069
- condition.to_frame.select(arg_where(Polars.col(condition.name))).to_series
1070
- else
1071
- condition = Utils.expr_to_lit_or_expr(condition, str_to_lit: true)
1072
- Utils.wrap_expr(_arg_where(condition._rbexpr))
1073
- end
1074
- end
1075
-
1076
- # Folds the expressions from left to right, keeping the first non-null value.
1077
- #
1078
- # @param exprs [Object]
1079
- # Expressions to coalesce.
1080
- #
1081
- # @return [Expr]
1082
- #
1083
- # @example
1084
- # df = Polars::DataFrame.new(
1085
- # [
1086
- # [nil, 1.0, 1.0],
1087
- # [nil, 2.0, 2.0],
1088
- # [nil, nil, 3.0],
1089
- # [nil, nil, nil]
1090
- # ],
1091
- # columns: [["a", :f64], ["b", :f64], ["c", :f64]]
1092
- # )
1093
- # df.with_column(Polars.coalesce(["a", "b", "c", 99.9]).alias("d"))
1094
- # # =>
1095
- # # shape: (4, 4)
1096
- # # ┌──────┬──────┬──────┬──────┐
1097
- # # │ a ┆ b ┆ c ┆ d │
1098
- # # │ --- ┆ --- ┆ --- ┆ --- │
1099
- # # │ f64 ┆ f64 ┆ f64 ┆ f64 │
1100
- # # ╞══════╪══════╪══════╪══════╡
1101
- # # │ null ┆ 1.0 ┆ 1.0 ┆ 1.0 │
1102
- # # │ null ┆ 2.0 ┆ 2.0 ┆ 2.0 │
1103
- # # │ null ┆ null ┆ 3.0 ┆ 3.0 │
1104
- # # │ null ┆ null ┆ null ┆ 99.9 │
1105
- # # └──────┴──────┴──────┴──────┘
1106
- def coalesce(exprs, *more_exprs)
1107
- exprs = Utils.selection_to_rbexpr_list(exprs)
1108
- if more_exprs.any?
1109
- exprs.concat(Utils.selection_to_rbexpr_list(more_exprs))
1110
- end
1111
- Utils.wrap_expr(_coalesce_exprs(exprs))
1112
- end
1113
-
1114
- # Utility function that parses an epoch timestamp (or Unix time) to Polars Date(time).
1115
- #
1116
- # Depending on the `unit` provided, this function will return a different dtype:
1117
- # - unit: "d" returns pl.Date
1118
- # - unit: "s" returns pl.Datetime["us"] (pl.Datetime's default)
1119
- # - unit: "ms" returns pl.Datetime["ms"]
1120
- # - unit: "us" returns pl.Datetime["us"]
1121
- # - unit: "ns" returns pl.Datetime["ns"]
1122
- #
1123
- # @param column [Object]
1124
- # Series or expression to parse integers to pl.Datetime.
1125
- # @param unit [String]
1126
- # The unit of the timesteps since epoch time.
1127
- # @param eager [Boolean]
1128
- # If eager evaluation is `true`, a Series is returned instead of an Expr.
1129
- #
1130
- # @return [Object]
1131
- #
1132
- # @example
1133
- # df = Polars::DataFrame.new({"timestamp" => [1666683077, 1666683099]}).lazy
1134
- # df.select(Polars.from_epoch(Polars.col("timestamp"), unit: "s")).collect
1135
- # # =>
1136
- # # shape: (2, 1)
1137
- # # ┌─────────────────────┐
1138
- # # │ timestamp │
1139
- # # │ --- │
1140
- # # │ datetime[μs] │
1141
- # # ╞═════════════════════╡
1142
- # # │ 2022-10-25 07:31:17 │
1143
- # # │ 2022-10-25 07:31:39 │
1144
- # # └─────────────────────┘
1145
- def from_epoch(column, unit: "s", eager: false)
1146
- if Utils.strlike?(column)
1147
- column = col(column)
1148
- elsif !column.is_a?(Series) && !column.is_a?(Expr)
1149
- column = Series.new(column)
1150
- end
1151
-
1152
- if unit == "d"
1153
- expr = column.cast(Date)
1154
- elsif unit == "s"
1155
- expr = (column.cast(Int64) * 1_000_000).cast(Datetime.new("us"))
1156
- elsif Utils::DTYPE_TEMPORAL_UNITS.include?(unit)
1157
- expr = column.cast(Datetime.new(unit))
1158
- else
1159
- raise ArgumentError, "'unit' must be one of {{'ns', 'us', 'ms', 's', 'd'}}, got '#{unit}'."
1160
- end
1161
-
1162
- if eager
1163
- if !column.is_a?(Series)
1164
- raise ArgumentError, "expected Series or Array if eager: true, got #{column.class.name}"
1165
- else
1166
- column.to_frame.select(expr).to_series
1167
- end
1168
- else
1169
- expr
1170
- end
1171
- end
1172
-
1173
- # Start a "when, then, otherwise" expression.
1174
- #
1175
- # @return [When]
1176
- #
1177
- # @example
1178
- # df = Polars::DataFrame.new({"foo" => [1, 3, 4], "bar" => [3, 4, 0]})
1179
- # df.with_column(Polars.when(Polars.col("foo") > 2).then(Polars.lit(1)).otherwise(Polars.lit(-1)))
1180
- # # =>
1181
- # # shape: (3, 3)
1182
- # # ┌─────┬─────┬─────────┐
1183
- # # │ foo ┆ bar ┆ literal │
1184
- # # │ --- ┆ --- ┆ --- │
1185
- # # │ i64 ┆ i64 ┆ i32 │
1186
- # # ╞═════╪═════╪═════════╡
1187
- # # │ 1 ┆ 3 ┆ -1 │
1188
- # # │ 3 ┆ 4 ┆ 1 │
1189
- # # │ 4 ┆ 0 ┆ 1 │
1190
- # # └─────┴─────┴─────────┘
1191
- def when(expr)
1192
- expr = Utils.expr_to_lit_or_expr(expr)
1193
- pw = RbExpr.when(expr._rbexpr)
1194
- When.new(pw)
1195
- end
1196
- end
1197
- end