polars-df 0.7.0-x86_64-darwin → 0.9.0-x86_64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +41 -0
- data/Cargo.lock +353 -237
- data/Cargo.toml +0 -3
- data/LICENSE-THIRD-PARTY.txt +4014 -3495
- data/LICENSE.txt +1 -1
- data/README.md +2 -2
- data/lib/polars/3.1/polars.bundle +0 -0
- data/lib/polars/3.2/polars.bundle +0 -0
- data/lib/polars/{3.0 → 3.3}/polars.bundle +0 -0
- data/lib/polars/array_expr.rb +449 -0
- data/lib/polars/array_name_space.rb +346 -0
- data/lib/polars/cat_expr.rb +24 -0
- data/lib/polars/cat_name_space.rb +75 -0
- data/lib/polars/config.rb +2 -2
- data/lib/polars/data_frame.rb +248 -108
- data/lib/polars/data_types.rb +195 -29
- data/lib/polars/date_time_expr.rb +41 -24
- data/lib/polars/date_time_name_space.rb +12 -12
- data/lib/polars/exceptions.rb +12 -1
- data/lib/polars/expr.rb +1080 -195
- data/lib/polars/functions/aggregation/horizontal.rb +246 -0
- data/lib/polars/functions/aggregation/vertical.rb +282 -0
- data/lib/polars/functions/as_datatype.rb +248 -0
- data/lib/polars/functions/col.rb +47 -0
- data/lib/polars/functions/eager.rb +182 -0
- data/lib/polars/functions/lazy.rb +1280 -0
- data/lib/polars/functions/len.rb +49 -0
- data/lib/polars/functions/lit.rb +35 -0
- data/lib/polars/functions/random.rb +16 -0
- data/lib/polars/functions/range/date_range.rb +103 -0
- data/lib/polars/functions/range/int_range.rb +51 -0
- data/lib/polars/functions/repeat.rb +144 -0
- data/lib/polars/functions/whenthen.rb +27 -0
- data/lib/polars/functions.rb +29 -416
- data/lib/polars/group_by.rb +3 -3
- data/lib/polars/io.rb +21 -28
- data/lib/polars/lazy_frame.rb +390 -76
- data/lib/polars/list_expr.rb +152 -6
- data/lib/polars/list_name_space.rb +102 -0
- data/lib/polars/meta_expr.rb +175 -7
- data/lib/polars/series.rb +557 -59
- data/lib/polars/sql_context.rb +1 -1
- data/lib/polars/string_cache.rb +75 -0
- data/lib/polars/string_expr.rb +412 -96
- data/lib/polars/string_name_space.rb +4 -4
- data/lib/polars/struct_expr.rb +1 -1
- data/lib/polars/struct_name_space.rb +1 -1
- data/lib/polars/testing.rb +507 -0
- data/lib/polars/utils.rb +64 -20
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +15 -2
- metadata +36 -7
- data/lib/polars/lazy_functions.rb +0 -1197
@@ -0,0 +1,49 @@
|
|
1
|
+
module Polars
  module Functions
    # Build an expression that yields the number of rows in the context.
    #
    # Comparable to `COUNT(*)` in SQL. Also available as `length`.
    #
    # @return [Expr]
    #
    # @example
    #   df = Polars::DataFrame.new(
    #     {
    #       "a" => [1, 2, nil],
    #       "b" => [3, nil, nil],
    #       "c" => ["foo", "bar", "foo"]
    #     }
    #   )
    #   df.select(Polars.len)
    #   # =>
    #   # shape: (1, 1)
    #   # ┌─────┐
    #   # │ len │
    #   # │ --- │
    #   # │ u32 │
    #   # ╞═════╡
    #   # │ 3   │
    #   # └─────┘
    #
    # @example Generate an index column by using `len` in conjunction with `int_range`.
    #   df.select([
    #     Polars.int_range(Polars.len, dtype: Polars::UInt32).alias("index"),
    #     Polars.all
    #   ])
    #   # =>
    #   # shape: (3, 4)
    #   # ┌───────┬──────┬──────┬─────┐
    #   # │ index ┆ a    ┆ b    ┆ c   │
    #   # │ ---   ┆ ---  ┆ ---  ┆ --- │
    #   # │ u32   ┆ i64  ┆ i64  ┆ str │
    #   # ╞═══════╪══════╪══════╪═════╡
    #   # │ 0     ┆ 1    ┆ 3    ┆ foo │
    #   # │ 1     ┆ 2    ┆ null ┆ bar │
    #   # │ 2     ┆ null ┆ null ┆ foo │
    #   # └───────┴──────┴──────┴─────┘
    def len
      row_count = Plr.len
      Utils.wrap_expr(row_count)
    end
    alias length len
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Polars
  module Functions
    # Return an expression representing a literal value.
    #
    # @param value [Object]
    #   Value to wrap. `Time`/`DateTime`, `Date`, `Polars::Series`, `Array` and
    #   (when that library is loaded) `Numo::NArray` values get dedicated
    #   handling; any other value is passed to `Plr.lit` as is.
    # @param dtype [Object]
    #   Optional data type. For `Time`/`DateTime` values its `time_unit` and
    #   `time_zone` are read; for plain values the literal is cast to it.
    #   It is not consulted for `Date`, `Series` or array values.
    # @param allow_object [Boolean]
    #   Forwarded unchanged to `Plr.lit`.
    #
    # @return [Expr]
    def lit(value, dtype: nil, allow_object: nil)
      if value.is_a?(::Time) || value.is_a?(::DateTime)
        time_unit = dtype&.time_unit || "ns"
        # Safe navigation: `dtype` is nil unless the caller supplied one.
        time_zone = dtype&.time_zone
        e = lit(Utils._datetime_to_pl_timestamp(value, time_unit)).cast(Datetime.new(time_unit))
        return time_zone ? e.dt.replace_time_zone(time_zone.to_s) : e
      elsif value.is_a?(::Date)
        # Go through a UTC midnight timestamp, then narrow to a date.
        return lit(::Time.utc(value.year, value.month, value.day)).cast(Date)
      elsif value.is_a?(Polars::Series)
        name = value.name
        e = Utils.wrap_expr(Plr.lit(value._s, allow_object))
        # An unnamed Series keeps the default literal name.
        return name == "" ? e : e.alias(name)
      elsif (defined?(Numo::NArray) && value.is_a?(Numo::NArray)) || value.is_a?(::Array)
        return lit(Series.new("", value))
      elsif dtype
        return Utils.wrap_expr(Plr.lit(value, allow_object)).cast(dtype)
      end

      Utils.wrap_expr(Plr.lit(value, allow_object))
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module Polars
  module Functions
    # Seed the global random number generator used by Polars.
    #
    # The seed influences operations that rely on randomness, such as the
    # ordering produced by a shuffle.
    #
    # @param seed [Integer]
    #   Non-negative integer smaller than 2**64.
    #
    # @return [nil]
    def set_random_seed(seed)
      Plr.set_random_seed(seed)
    end
  end
end
|
@@ -0,0 +1,103 @@
|
|
1
|
+
module Polars
  module Functions
    # Create a range of type `Datetime` (or `Date`).
    #
    # @param start [Object]
    #   Lower bound of the date range.
    # @param stop [Object]
    #   Upper bound of the date range.
    # @param interval [Object]
    #   Step between values. Converted with `to_s` and stripped of spaces, so a
    #   polars duration string such as `3d12h4m25s` (3 days, 12 hours,
    #   4 minutes and 25 seconds) is expected. `ActiveSupport::Duration`
    #   values are not supported yet and raise `Todo`.
    # @param lazy [Boolean]
    #   Return an expression instead of a Series.
    # @param closed ["both", "left", "right", "none"]
    #   Define whether the temporal window interval is closed or not.
    # @param name [String]
    #   Name of the output Series.
    # @param time_unit [nil, "ns", "us", "ms"]
    #   Set the time unit. When nil, `"ns"` is used if the interval mentions
    #   nanoseconds and `"us"` otherwise.
    # @param time_zone [String]
    #   Optional timezone
    #
    # @return [Object]
    #
    # @note
    #   If both `start` and `stop` are passed as date types (not datetime), and the
    #   interval granularity is no finer than 1d, the returned range is also of
    #   type date. All other permutations return a datetime Series.
    #
    # @example Using polars duration string to specify the interval
    #   Polars.date_range(Date.new(2022, 1, 1), Date.new(2022, 3, 1), "1mo", name: "drange")
    #   # =>
    #   # shape: (3,)
    #   # Series: 'drange' [date]
    #   # [
    #   #         2022-01-01
    #   #         2022-02-01
    #   #         2022-03-01
    #   # ]
    #
    # @example Combining several units in one interval string
    #   Polars.date_range(
    #     DateTime.new(1985, 1, 1),
    #     DateTime.new(1985, 1, 10),
    #     "1d12h",
    #     time_unit: "ms"
    #   )
    #   # =>
    #   # shape: (7,)
    #   # Series: '' [datetime[ms]]
    #   # [
    #   #         1985-01-01 00:00:00
    #   #         1985-01-02 12:00:00
    #   #         1985-01-04 00:00:00
    #   #         1985-01-05 12:00:00
    #   #         1985-01-07 00:00:00
    #   #         1985-01-08 12:00:00
    #   #         1985-01-10 00:00:00
    #   # ]
    def date_range(
      start,
      stop,
      interval,
      lazy: false,
      closed: "both",
      name: nil,
      time_unit: nil,
      time_zone: nil
    )
      # ActiveSupport durations are not handled yet.
      raise Todo if defined?(ActiveSupport::Duration) && interval.is_a?(ActiveSupport::Duration)

      # Compact the duration string ("1d 12h" -> "1d12h").
      interval = interval.to_s.delete(" ")

      # Without an explicit unit, use nanoseconds only when the interval asks for them.
      time_unit = interval.include?("ns") ? "ns" : "us" if time_unit.nil?

      bounds = [start, stop].map { |bound| Utils.parse_as_expression(bound) }
      range_expr = Utils.wrap_expr(
        Plr.date_range(*bounds, interval, closed, time_unit, time_zone)
      ).alias(name.to_s)

      lazy ? range_expr : select(range_expr).to_series
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module Polars
  module Functions
    # Create a range expression (or Series).
    #
    # This can be used in a `select`, `with_column`, etc. Be sure that the resulting
    # range size is equal to the length of the DataFrame you are collecting.
    # Also available as `arange`.
    #
    # @param start [Integer, Expr, Series]
    #   Lower bound of range. When `stop` is omitted this value is used as the
    #   upper bound and the range starts at 0.
    # @param stop [Integer, Expr, Series]
    #   Upper bound of range.
    # @param step [Integer]
    #   Step size of the range.
    # @param eager [Boolean]
    #   If `true`, a Series is returned instead of an Expr.
    # @param dtype [Symbol]
    #   Apply an explicit integer dtype to the resulting expression (default is `Int64`).
    #
    # @return [Expr, Series]
    #
    # @example
    #   Polars.arange(0, 3, eager: true)
    #   # =>
    #   # shape: (3,)
    #   # Series: 'arange' [i64]
    #   # [
    #   #         0
    #   #         1
    #   #         2
    #   # ]
    def int_range(start, stop = nil, step: 1, eager: false, dtype: nil)
      # A lone positional argument is treated as `stop`, with `start` set to 0.
      start, stop = 0, start if stop.nil?

      lower = Utils.parse_as_expression(start)
      upper = Utils.parse_as_expression(stop)
      int_dtype = dtype || Int64
      int_dtype = int_dtype.to_s if int_dtype.is_a?(Symbol)

      range_expr = Utils.wrap_expr(Plr.int_range(lower, upper, step, int_dtype)).alias("arange")
      eager ? select(range_expr).to_series : range_expr
    end
    alias arange int_range
  end
end
|
@@ -0,0 +1,144 @@
|
|
1
|
+
module Polars
  module Functions
    # Repeat a single value n times.
    #
    # @param value [Object]
    #   Value to repeat. Strings are treated as literals, not column names.
    # @param n [Integer]
    #   Repeat `n` times. An Integer is wrapped with `lit`; any other object
    #   must respond to `_rbexpr`.
    # @param dtype [Object]
    #   Data type handed to `Plr.repeat`; `nil` leaves it unspecified.
    # @param eager [Boolean]
    #   Run eagerly and collect into a `Series`.
    # @param name [String]
    #   Deprecated. When given, a warning is printed and the result is
    #   aliased to this name. Use `alias` on the result instead.
    #
    # @return [Object]
    #
    # @example Construct a column with a repeated value in a lazy context.
    #   Polars.select(Polars.repeat("z", 3)).to_series
    #   # =>
    #   # shape: (3,)
    #   # Series: 'repeat' [str]
    #   # [
    #   #         "z"
    #   #         "z"
    #   #         "z"
    #   # ]
    #
    # @example Generate a Series directly by setting `eager: true`.
    #   Polars.repeat(3, 3, dtype: Polars::Int8, eager: true)
    #   # =>
    #   # shape: (3,)
    #   # Series: 'repeat' [i8]
    #   # [
    #   #         3
    #   #         3
    #   #         3
    #   # ]
    def repeat(value, n, dtype: nil, eager: false, name: nil)
      unless name.nil?
        warn "the `name` argument is deprecated. Use the `alias` method instead."
      end

      n = lit(n) if n.is_a?(Integer)

      value = Utils.parse_as_expression(value, str_as_lit: true)
      expr = Utils.wrap_expr(Plr.repeat(value, n._rbexpr, dtype))
      expr = expr.alias(name) unless name.nil?

      eager ? select(expr).to_series : expr
    end

    # Construct a column of length `n` filled with ones.
    #
    # This is syntactic sugar for the `repeat` function.
    #
    # @param n [Integer]
    #   Length of the resulting column.
    # @param dtype [Object]
    #   Data type of the resulting column. Defaults to Float64.
    # @param eager [Boolean]
    #   Evaluate immediately and return a `Series`. If set to `false`,
    #   return an expression instead.
    #
    # @return [Object]
    #
    # @raise [TypeError]
    #   If no "one" value can be derived for `dtype`.
    #
    # @example
    #   Polars.ones(3, dtype: Polars::Int8, eager: true)
    #   # =>
    #   # shape: (3,)
    #   # Series: 'ones' [i8]
    #   # [
    #   #         1
    #   #         1
    #   #         1
    #   # ]
    def ones(n, dtype: nil, eager: true)
      # Apply the documented default; a nil dtype cannot be inspected below.
      dtype ||= Float64
      one = _one_or_zero_by_dtype(1, dtype)
      raise TypeError, "invalid dtype for `ones`; found #{dtype}" if one.nil?

      repeat(one, n, dtype: dtype, eager: eager).alias("ones")
    end

    # Construct a column of length `n` filled with zeros.
    #
    # This is syntactic sugar for the `repeat` function.
    #
    # @param n [Integer]
    #   Length of the resulting column.
    # @param dtype [Object]
    #   Data type of the resulting column. Defaults to Float64.
    # @param eager [Boolean]
    #   Evaluate immediately and return a `Series`. If set to `false`,
    #   return an expression instead.
    #
    # @return [Object]
    #
    # @raise [TypeError]
    #   If no "zero" value can be derived for `dtype`.
    #
    # @example
    #   Polars.zeros(3, dtype: Polars::Int8, eager: true)
    #   # =>
    #   # shape: (3,)
    #   # Series: 'zeros' [i8]
    #   # [
    #   #         0
    #   #         0
    #   #         0
    #   # ]
    def zeros(n, dtype: nil, eager: true)
      # Apply the documented default; a nil dtype cannot be inspected below.
      dtype ||= Float64
      zero = _one_or_zero_by_dtype(0, dtype)
      raise TypeError, "invalid dtype for `zeros`; found #{dtype}" if zero.nil?

      repeat(zero, n, dtype: dtype, eager: eager).alias("zeros")
    end

    private

    # Convert the Integer `value` (0 or 1 in practice) into the matching
    # literal for `dtype`. Returns nil when the dtype is not supported, which
    # callers turn into a TypeError. Note that the Boolean zero is `false`,
    # so callers must test the result with `nil?` rather than truthiness.
    def _one_or_zero_by_dtype(value, dtype)
      if dtype.integer?
        value
      elsif dtype.float?
        value.to_f
      elsif dtype == Boolean
        value != 0
      elsif dtype == Utf8
        value.to_s
      elsif dtype == Decimal
        # `Decimal` here is the Polars data type, not a conversion method;
        # the Ruby value is a BigDecimal. Loaded lazily because only this
        # branch needs it.
        require "bigdecimal"
        BigDecimal(value.to_s)
      elsif [List, Array].include?(dtype)
        # Nested types: build the inner value and repeat it to the fixed width
        # when the dtype exposes one.
        arr_width = dtype.respond_to?(:width) ? dtype.width : 1
        [_one_or_zero_by_dtype(value, dtype.inner)] * arr_width
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Polars
  module Functions
    # Begin a "when, then, otherwise" conditional expression.
    #
    # @param expr [Object]
    #   Condition. It is first converted with `Utils.expr_to_lit_or_expr`.
    #
    # @return [When]
    #
    # @example
    #   df = Polars::DataFrame.new({"foo" => [1, 3, 4], "bar" => [3, 4, 0]})
    #   df.with_column(Polars.when(Polars.col("foo") > 2).then(Polars.lit(1)).otherwise(Polars.lit(-1)))
    #   # =>
    #   # shape: (3, 3)
    #   # ┌─────┬─────┬─────────┐
    #   # │ foo ┆ bar ┆ literal │
    #   # │ --- ┆ --- ┆ ---     │
    #   # │ i64 ┆ i64 ┆ i32     │
    #   # ╞═════╪═════╪═════════╡
    #   # │ 1   ┆ 3   ┆ -1      │
    #   # │ 3   ┆ 4   ┆ 1       │
    #   # │ 4   ┆ 0   ┆ 1       │
    #   # └─────┴─────┴─────────┘
    def when(expr)
      predicate = Utils.expr_to_lit_or_expr(expr)
      When.new(Plr.when(predicate._rbexpr))
    end
  end
end
|