polars-df 0.8.0-x86_64-darwin → 0.10.0-x86_64-darwin

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +42 -1
  3. data/Cargo.lock +159 -66
  4. data/Cargo.toml +0 -3
  5. data/LICENSE-THIRD-PARTY.txt +3112 -1613
  6. data/LICENSE.txt +1 -1
  7. data/README.md +3 -2
  8. data/lib/polars/3.1/polars.bundle +0 -0
  9. data/lib/polars/3.2/polars.bundle +0 -0
  10. data/lib/polars/3.3/polars.bundle +0 -0
  11. data/lib/polars/array_expr.rb +453 -0
  12. data/lib/polars/array_name_space.rb +346 -0
  13. data/lib/polars/batched_csv_reader.rb +4 -2
  14. data/lib/polars/cat_expr.rb +24 -0
  15. data/lib/polars/cat_name_space.rb +75 -0
  16. data/lib/polars/config.rb +2 -2
  17. data/lib/polars/data_frame.rb +306 -96
  18. data/lib/polars/data_types.rb +191 -28
  19. data/lib/polars/date_time_expr.rb +41 -18
  20. data/lib/polars/date_time_name_space.rb +9 -3
  21. data/lib/polars/exceptions.rb +12 -1
  22. data/lib/polars/expr.rb +898 -215
  23. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  24. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  25. data/lib/polars/functions/as_datatype.rb +248 -0
  26. data/lib/polars/functions/col.rb +47 -0
  27. data/lib/polars/functions/eager.rb +182 -0
  28. data/lib/polars/functions/lazy.rb +1280 -0
  29. data/lib/polars/functions/len.rb +49 -0
  30. data/lib/polars/functions/lit.rb +35 -0
  31. data/lib/polars/functions/random.rb +16 -0
  32. data/lib/polars/functions/range/date_range.rb +103 -0
  33. data/lib/polars/functions/range/int_range.rb +51 -0
  34. data/lib/polars/functions/repeat.rb +144 -0
  35. data/lib/polars/functions/whenthen.rb +96 -0
  36. data/lib/polars/functions.rb +29 -416
  37. data/lib/polars/group_by.rb +2 -2
  38. data/lib/polars/io.rb +36 -31
  39. data/lib/polars/lazy_frame.rb +405 -88
  40. data/lib/polars/list_expr.rb +158 -8
  41. data/lib/polars/list_name_space.rb +102 -0
  42. data/lib/polars/meta_expr.rb +175 -7
  43. data/lib/polars/series.rb +282 -41
  44. data/lib/polars/string_cache.rb +75 -0
  45. data/lib/polars/string_expr.rb +413 -96
  46. data/lib/polars/string_name_space.rb +4 -4
  47. data/lib/polars/testing.rb +507 -0
  48. data/lib/polars/utils.rb +106 -8
  49. data/lib/polars/version.rb +1 -1
  50. data/lib/polars/whenthen.rb +83 -0
  51. data/lib/polars.rb +16 -4
  52. metadata +34 -6
  53. data/lib/polars/lazy_functions.rb +0 -1181
  54. data/lib/polars/when.rb +0 -16
  55. data/lib/polars/when_then.rb +0 -19
@@ -0,0 +1,49 @@
1
+ module Polars
2
+ module Functions
3
+ # Return the number of rows in the context.
4
+ #
5
+ # This is similar to `COUNT(*)` in SQL.
6
+ #
7
+ # @return [Expr]
8
+ #
9
+ # @example
10
+ # df = Polars::DataFrame.new(
11
+ # {
12
+ # "a" => [1, 2, nil],
13
+ # "b" => [3, nil, nil],
14
+ # "c" => ["foo", "bar", "foo"]
15
+ # }
16
+ # )
17
+ # df.select(Polars.len)
18
+ # # =>
19
+ # # shape: (1, 1)
20
+ # # ┌─────┐
21
+ # # │ len │
22
+ # # │ --- │
23
+ # # │ u32 │
24
+ # # ╞═════╡
25
+ # # │ 3 │
26
+ # # └─────┘
27
+ #
28
+ # @example Generate an index column by using `len` in conjunction with `int_range`.
29
+ # df.select([
30
+ # Polars.int_range(Polars.len, dtype: Polars::UInt32).alias("index"),
31
+ # Polars.all
32
+ # ])
33
+ # # =>
34
+ # # shape: (3, 4)
35
+ # # ┌───────┬──────┬──────┬─────┐
36
+ # # │ index ┆ a ┆ b ┆ c │
37
+ # # │ --- ┆ --- ┆ --- ┆ --- │
38
+ # # │ u32 ┆ i64 ┆ i64 ┆ str │
39
+ # # ╞═══════╪══════╪══════╪═════╡
40
+ # # │ 0 ┆ 1 ┆ 3 ┆ foo │
41
+ # # │ 1 ┆ 2 ┆ null ┆ bar │
42
+ # # │ 2 ┆ null ┆ null ┆ foo │
43
+ # # └───────┴──────┴──────┴─────┘
44
# Build the row-count expression by wrapping the native `Plr.len` handle.
#
# @return [Expr]
def len
  native_len = Plr.len
  Utils.wrap_expr(native_len)
end
# `length` is kept as a synonym of `len`.
alias_method :length, :len
48
+ end
49
+ end
@@ -0,0 +1,35 @@
1
+ module Polars
2
+ module Functions
3
+ # Return an expression representing a literal value.
4
+ #
5
+ # @return [Expr]
6
# Return an expression representing a literal value.
#
# `Time`/`DateTime` values are lowered to an integer timestamp (via
# `Utils._datetime_to_pl_timestamp`) and cast back to `Datetime`; `Date`
# values go through a UTC-midnight `Time` and are cast to `Date`.
#
# @param value [Object]
#   Value to turn into a literal expression. A `Series` keeps its name as the
#   alias (unless the name is empty); an `Array` or `Numo::NArray` is first
#   converted to an unnamed `Series`.
# @param dtype [Object]
#   Optional data type. For `Time`/`DateTime` values only its `time_unit` and
#   `time_zone` are read (when it responds to them); for other scalar values
#   the literal is cast to it.
# @param allow_object [Boolean]
#   Passed through unchanged to `Plr.lit`.
#
# @return [Expr]
def lit(value, dtype: nil, allow_object: nil)
  if value.is_a?(::Time) || value.is_a?(::DateTime)
    # `dtype` may be nil or a bare data type class, neither of which carries a
    # unit or zone, hence the `respond_to?` guards. (The previous
    # `dtype.&time_zone` invoked the `&` method instead of safe navigation.)
    time_unit = (dtype.time_unit if dtype.respond_to?(:time_unit)) || "ns"
    time_zone = dtype.time_zone if dtype.respond_to?(:time_zone)

    e = lit(Utils._datetime_to_pl_timestamp(value, time_unit)).cast(Datetime.new(time_unit))
    return time_zone ? e.dt.replace_time_zone(time_zone.to_s) : e
  elsif value.is_a?(::Date)
    return lit(::Time.utc(value.year, value.month, value.day)).cast(Date)
  elsif value.is_a?(Polars::Series)
    name = value.name
    e = Utils.wrap_expr(Plr.lit(value._s, allow_object))
    # An unnamed Series stays unaliased.
    return name == "" ? e : e.alias(name)
  elsif (defined?(Numo::NArray) && value.is_a?(Numo::NArray)) || value.is_a?(::Array)
    return lit(Series.new("", value))
  elsif dtype
    return Utils.wrap_expr(Plr.lit(value, allow_object)).cast(dtype)
  end

  Utils.wrap_expr(Plr.lit(value, allow_object))
end
34
+ end
35
+ end
@@ -0,0 +1,16 @@
1
+ module Polars
2
+ module Functions
3
+ # Set the global random seed for Polars.
4
+ #
5
+ # This random seed is used to determine things such as shuffle ordering.
6
+ #
7
+ # @param seed [Integer]
8
+ # A non-negative integer < 2**64 used to seed the internal global
9
+ # random number generator.
10
+ #
11
+ # @return [nil]
12
# Forward `seed` to the native `Plr.set_random_seed` binding.
#
# @param seed [Integer]
#   Seed value handed to the native layer unchanged.
def set_random_seed(seed)
  Plr.set_random_seed(seed)
end
15
+ end
16
+ end
@@ -0,0 +1,103 @@
1
+ module Polars
2
+ module Functions
3
+ # Create a range of type `Datetime` (or `Date`).
4
+ #
5
+ # @param start [Object]
6
+ # Lower bound of the date range.
7
+ # @param stop [Object]
8
+ # Upper bound of the date range.
9
+ # @param interval [Object]
10
+ # Interval periods. It can be a polars duration string, such as `3d12h4m25s`
11
+ # representing 3 days, 12 hours, 4 minutes, and 25 seconds.
12
+ # @param lazy [Boolean]
13
+ # Return an expression.
14
+ # @param closed ["both", "left", "right", "none"]
15
+ # Define whether the temporal window interval is closed or not.
16
+ # @param name [String]
17
+ # Name of the output Series.
18
+ # @param time_unit [nil, "ns", "us", "ms"]
19
+ # Set the time unit.
20
+ # @param time_zone [String]
21
+ # Optional timezone
22
+ #
23
+ # @return [Object]
24
+ #
25
+ # @note
26
+ # If both `low` and `high` are passed as date types (not datetime), and the
27
+ # interval granularity is no finer than 1d, the returned range is also of
28
+ # type date. All other permutations return a datetime Series.
29
+ #
30
+ # @example Using polars duration string to specify the interval
31
+ # Polars.date_range(Date.new(2022, 1, 1), Date.new(2022, 3, 1), "1mo", name: "drange")
32
+ # # =>
33
+ # # shape: (3,)
34
+ # # Series: 'drange' [date]
35
+ # # [
36
+ # # 2022-01-01
37
+ # # 2022-02-01
38
+ # # 2022-03-01
39
+ # # ]
40
+ #
41
+ # @example Using `timedelta` object to specify the interval:
42
+ # Polars.date_range(
43
+ # DateTime.new(1985, 1, 1),
44
+ # DateTime.new(1985, 1, 10),
45
+ # "1d12h",
46
+ # time_unit: "ms"
47
+ # )
48
+ # # =>
49
+ # # shape: (7,)
50
+ # # Series: '' [datetime[ms]]
51
+ # # [
52
+ # # 1985-01-01 00:00:00
53
+ # # 1985-01-02 12:00:00
54
+ # # 1985-01-04 00:00:00
55
+ # # 1985-01-05 12:00:00
56
+ # # 1985-01-07 00:00:00
57
+ # # 1985-01-08 12:00:00
58
+ # # 1985-01-10 00:00:00
59
+ # # ]
60
# Build a date/datetime range between `start` and `stop`.
#
# The interval is stringified and stripped of spaces before being handed to
# `Plr.date_range`. When no `time_unit` is given, "ns" is chosen if the
# interval string contains "ns", otherwise "us". The result is aliased to
# `name.to_s`; unless `lazy` is set it is evaluated through `select` and
# returned via `to_series`.
#
# @raise [Todo] when `interval` is an `ActiveSupport::Duration`.
def date_range(
  start,
  stop,
  interval,
  lazy: false,
  closed: "both",
  name: nil,
  time_unit: nil,
  time_zone: nil
)
  # ActiveSupport durations are not supported yet.
  raise Todo if defined?(ActiveSupport::Duration) && interval.is_a?(ActiveSupport::Duration)

  interval = interval.to_s.gsub(" ", "")
  time_unit = (interval.include?("ns") ? "ns" : "us") if time_unit.nil?

  lower = Utils.parse_as_expression(start)
  upper = Utils.parse_as_expression(stop)

  range_expr =
    Utils.wrap_expr(Plr.date_range(lower, upper, interval, closed, time_unit, time_zone))
      .alias(name.to_s)

  lazy ? range_expr : select(range_expr).to_series
end
102
+ end
103
+ end
@@ -0,0 +1,51 @@
1
+ module Polars
2
+ module Functions
3
+ # Create a range expression (or Series).
4
+ #
5
+ # This can be used in a `select`, `with_column`, etc. Be sure that the resulting
6
+ # range size is equal to the length of the DataFrame you are collecting.
7
+ #
8
+ # @param start [Integer, Expr, Series]
9
+ # Lower bound of range.
10
+ # @param stop [Integer, Expr, Series]
11
+ # Upper bound of range.
12
+ # @param step [Integer]
13
+ # Step size of the range.
14
+ # @param eager [Boolean]
15
+ # If eager evaluation is `true`, a Series is returned instead of an Expr.
16
+ # @param dtype [Symbol]
17
+ # Apply an explicit integer dtype to the resulting expression (default is `Int64`).
18
+ #
19
+ # @return [Expr, Series]
20
+ #
21
+ # @example
22
+ # Polars.arange(0, 3, eager: true)
23
+ # # =>
24
+ # # shape: (3,)
25
+ # # Series: 'arange' [i64]
26
+ # # [
27
+ # # 0
28
+ # # 1
29
+ # # 2
30
+ # # ]
31
# Build an integer range expression, aliased "arange".
#
# With a single positional argument the range runs from 0 up to that value.
# `dtype` falls back to `Int64`; a Symbol dtype is converted to a String
# before being passed to `Plr.int_range`. With `eager` set, the expression is
# evaluated through `select` and returned via `to_series`.
def int_range(start, stop = nil, step: 1, eager: false, dtype: nil)
  # Single-argument form: treat the argument as the upper bound.
  start, stop = 0, start if stop.nil?

  lower = Utils.parse_as_expression(start)
  upper = Utils.parse_as_expression(stop)

  dtype ||= Int64
  dtype = dtype.to_s if dtype.is_a?(Symbol)

  range_expr = Utils.wrap_expr(Plr.int_range(lower, upper, step, dtype)).alias("arange")

  eager ? select(range_expr).to_series : range_expr
end
# `arange` is kept as a synonym of `int_range`.
alias_method :arange, :int_range
50
+ end
51
+ end
@@ -0,0 +1,144 @@
1
+ module Polars
2
+ module Functions
3
+ # Repeat a single value n times.
4
+ #
5
+ # @param value [Object]
6
+ # Value to repeat.
7
+ # @param n [Integer]
8
+ # Repeat `n` times.
9
+ # @param eager [Boolean]
10
+ # Run eagerly and collect into a `Series`.
11
+ # @param name [String]
12
+ # Only used in `eager` mode. As expression, use `alias`.
13
+ #
14
+ # @return [Object]
15
+ #
16
+ # @example Construct a column with a repeated value in a lazy context.
17
+ # Polars.select(Polars.repeat("z", 3)).to_series
18
+ # # =>
19
+ # # shape: (3,)
20
+ # # Series: 'repeat' [str]
21
+ # # [
22
+ # # "z"
23
+ # # "z"
24
+ # # "z"
25
+ # # ]
26
+ #
27
+ # @example Generate a Series directly by setting `eager: true`.
28
+ # Polars.repeat(3, 3, dtype: Polars::Int8, eager: true)
29
+ # # =>
30
+ # # shape: (3,)
31
+ # # Series: 'repeat' [i8]
32
+ # # [
33
+ # # 3
34
+ # # 3
35
+ # # 3
36
+ # # ]
37
# Repeat `value` `n` times.
#
# An Integer `n` is turned into a literal expression first; any other `n`
# must expose `_rbexpr`. Passing `name` emits a deprecation warning and
# aliases the result. With `eager` set, the expression is evaluated through
# `select` and returned via `to_series`.
def repeat(value, n, dtype: nil, eager: false, name: nil)
  named = !name.nil?
  warn "the `name` argument is deprecated. Use the `alias` method instead." if named

  n = lit(n) if n.is_a?(Integer)

  value_rbexpr = Utils.parse_as_expression(value, str_as_lit: true)
  repeated = Utils.wrap_expr(Plr.repeat(value_rbexpr, n._rbexpr, dtype))
  repeated = repeated.alias(name) if named

  eager ? select(repeated).to_series : repeated
end
56
+
57
+ # Construct a column of length `n` filled with ones.
58
+ #
59
+ # This is syntactic sugar for the `repeat` function.
60
+ #
61
+ # @param n [Integer]
62
+ # Length of the resulting column.
63
+ # @param dtype [Object]
64
+ # Data type of the resulting column. Defaults to Float64.
65
+ # @param eager [Boolean]
66
+ # Evaluate immediately and return a `Series`. If set to `false`,
67
+ # return an expression instead.
68
+ #
69
+ # @return [Object]
70
+ #
71
+ # @example
72
+ # Polars.ones(3, dtype: Polars::Int8, eager: true)
73
+ # # =>
74
+ # # shape: (3,)
75
+ # # Series: 'ones' [i8]
76
+ # # [
77
+ # # 1
78
+ # # 1
79
+ # # 1
80
+ # # ]
81
# Construct a column of length `n` filled with ones, aliased "ones".
#
# @param n [Integer]
#   Length of the resulting column.
# @param dtype [Object]
#   Data type of the resulting column. Defaults to `Float64` when omitted.
# @param eager [Boolean]
#   Forwarded to `repeat`.
#
# @return [Object]
#
# @raise [TypeError] when no fill value can be derived for `dtype`.
def ones(n, dtype: nil, eager: true)
  # Without this default a nil dtype reached `nil.integer?` in the helper and
  # raised NoMethodError, contradicting the documented Float64 default.
  dtype ||= Float64

  one = _one_or_zero_by_dtype(1, dtype)
  raise TypeError, "invalid dtype for `ones`; found #{dtype}" if one.nil?

  repeat(one, n, dtype: dtype, eager: eager).alias("ones")
end
89
+
90
+ # Construct a column of length `n` filled with zeros.
91
+ #
92
+ # This is syntactic sugar for the `repeat` function.
93
+ #
94
+ # @param n [Integer]
95
+ # Length of the resulting column.
96
+ # @param dtype [Object]
97
+ # Data type of the resulting column. Defaults to Float64.
98
+ # @param eager [Boolean]
99
+ # Evaluate immediately and return a `Series`. If set to `false`,
100
+ # return an expression instead.
101
+ #
102
+ # @return [Object]
103
+ #
104
+ # @example
105
+ # Polars.zeros(3, dtype: Polars::Int8, eager: true)
106
+ # # =>
107
+ # # shape: (3,)
108
+ # # Series: 'zeros' [i8]
109
+ # # [
110
+ # # 0
111
+ # # 0
112
+ # # 0
113
+ # # ]
114
# Construct a column of length `n` filled with zeros, aliased "zeros".
#
# @param n [Integer]
#   Length of the resulting column.
# @param dtype [Object]
#   Data type of the resulting column. Defaults to `Float64` when omitted.
# @param eager [Boolean]
#   Forwarded to `repeat`.
#
# @return [Object]
#
# @raise [TypeError] when no fill value can be derived for `dtype`.
def zeros(n, dtype: nil, eager: true)
  # Without this default a nil dtype reached `nil.integer?` in the helper and
  # raised NoMethodError, contradicting the documented Float64 default.
  dtype ||= Float64

  zero = _one_or_zero_by_dtype(0, dtype)
  raise TypeError, "invalid dtype for `zeros`; found #{dtype}" if zero.nil?

  repeat(zero, n, dtype: dtype, eager: eager).alias("zeros")
end
122
+
123
+ private
124
+
125
# Convert the integer `value` (0 or 1 from `zeros`/`ones`) into a fill value
# matching `dtype`.
#
# @param value [Integer]
#   The integer to convert.
# @param dtype [Object]
#   Data type; must respond to `integer?` and `float?`.
#
# @return [Object, nil]
#   The converted value, or nil when `dtype` matches none of the branches.
def _one_or_zero_by_dtype(value, dtype)
  if dtype.integer?
    value
  elsif dtype.float?
    value.to_f
  elsif dtype == Boolean
    value != 0
  elsif dtype == Utf8
    value.to_s
  elsif dtype == Decimal
    # Ruby has no `Decimal()` conversion method (that call raised
    # NoMethodError); BigDecimal is the stdlib decimal type. Required here so
    # the branch does not depend on load order elsewhere.
    require "bigdecimal"
    BigDecimal(value.to_s)
  elsif [List, Array].include?(dtype)
    # Types without a `width` are filled with a single inner element.
    arr_width = dtype.respond_to?(:width) ? dtype.width : 1
    [_one_or_zero_by_dtype(value, dtype.inner)] * arr_width
  end
end
143
+ end
144
+ end
@@ -0,0 +1,96 @@
1
+ module Polars
2
+ module Functions
3
+ # Start a "when, then, otherwise" expression.
4
+ #
5
+ # @return [When]
6
+ #
7
+ # @example Below we add a column with the value 1, where column "foo" > 2 and the value -1 where it isn't.
8
+ # df = Polars::DataFrame.new({"foo" => [1, 3, 4], "bar" => [3, 4, 0]})
9
+ # df.with_column(Polars.when(Polars.col("foo") > 2).then(Polars.lit(1)).otherwise(Polars.lit(-1)))
10
+ # # =>
11
+ # # shape: (3, 3)
12
+ # # ┌─────┬─────┬─────────┐
13
+ # # │ foo ┆ bar ┆ literal │
14
+ # # │ --- ┆ --- ┆ --- │
15
+ # # │ i64 ┆ i64 ┆ i32 │
16
+ # # ╞═════╪═════╪═════════╡
17
+ # # │ 1 ┆ 3 ┆ -1 │
18
+ # # │ 3 ┆ 4 ┆ 1 │
19
+ # # │ 4 ┆ 0 ┆ 1 │
20
+ # # └─────┴─────┴─────────┘
21
+ #
22
+ # @example Or with multiple when-then operations chained:
23
+ # df.with_columns(
24
+ # Polars.when(Polars.col("foo") > 2)
25
+ # .then(1)
26
+ # .when(Polars.col("bar") > 2)
27
+ # .then(4)
28
+ # .otherwise(-1)
29
+ # .alias("val")
30
+ # )
31
+ # # =>
32
+ # # shape: (3, 3)
33
+ # # ┌─────┬─────┬─────┐
34
+ # # │ foo ┆ bar ┆ val │
35
+ # # │ --- ┆ --- ┆ --- │
36
+ # # │ i64 ┆ i64 ┆ i32 │
37
+ # # ╞═════╪═════╪═════╡
38
+ # # │ 1 ┆ 3 ┆ 4 │
39
+ # # │ 3 ┆ 4 ┆ 1 │
40
+ # # │ 4 ┆ 0 ┆ 1 │
41
+ # # └─────┴─────┴─────┘
42
+ #
43
+ # @example The `otherwise` at the end is optional. If left out, any rows where none of the `when` expressions evaluate to `true` are set to `null`:
44
+ # df.with_columns(Polars.when(Polars.col("foo") > 2).then(1).alias("val"))
45
+ # # =>
46
+ # # shape: (3, 3)
47
+ # # ┌─────┬─────┬──────┐
48
+ # # │ foo ┆ bar ┆ val │
49
+ # # │ --- ┆ --- ┆ --- │
50
+ # # │ i64 ┆ i64 ┆ i32 │
51
+ # # ╞═════╪═════╪══════╡
52
+ # # │ 1 ┆ 3 ┆ null │
53
+ # # │ 3 ┆ 4 ┆ 1 │
54
+ # # │ 4 ┆ 0 ┆ 1 │
55
+ # # └─────┴─────┴──────┘
56
+ #
57
+ # @example Pass multiple predicates, each of which must be met:
58
+ # df.with_columns(
59
+ # val: Polars.when(
60
+ # Polars.col("bar") > 0,
61
+ # Polars.col("foo") % 2 != 0
62
+ # )
63
+ # .then(99)
64
+ # .otherwise(-1)
65
+ # )
66
+ # # =>
67
+ # # shape: (3, 3)
68
+ # # ┌─────┬─────┬─────┐
69
+ # # │ foo ┆ bar ┆ val │
70
+ # # │ --- ┆ --- ┆ --- │
71
+ # # │ i64 ┆ i64 ┆ i32 │
72
+ # # ╞═════╪═════╪═════╡
73
+ # # │ 1 ┆ 3 ┆ 99 │
74
+ # # │ 3 ┆ 4 ┆ 99 │
75
+ # # │ 4 ┆ 0 ┆ -1 │
76
+ # # └─────┴─────┴─────┘
77
+ #
78
+ # @example Pass conditions as keyword arguments:
79
+ # df.with_columns(val: Polars.when(foo: 4, bar: 0).then(99).otherwise(-1))
80
+ # # =>
81
+ # # shape: (3, 3)
82
+ # # ┌─────┬─────┬─────┐
83
+ # # │ foo ┆ bar ┆ val │
84
+ # # │ --- ┆ --- ┆ --- │
85
+ # # │ i64 ┆ i64 ┆ i32 │
86
+ # # ╞═════╪═════╪═════╡
87
+ # # │ 1 ┆ 3 ┆ -1 │
88
+ # # │ 3 ┆ 4 ┆ -1 │
89
+ # # │ 4 ┆ 0 ┆ 99 │
90
+ # # └─────┴─────┴─────┘
91
# Start a "when, then, otherwise" chain.
#
# Positional predicates and keyword constraints are combined by
# `Utils.parse_when_inputs`; the result is passed to `Plr.when` and wrapped
# in a `When`.
#
# @return [When]
def when(*predicates, **constraints)
  When.new(Plr.when(Utils.parse_when_inputs(*predicates, **constraints)))
end
95
+ end
96
+ end