polars-df 0.20.0-x86_64-darwin → 0.21.1-x86_64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -0
- data/Cargo.lock +192 -186
- data/LICENSE-THIRD-PARTY.txt +1431 -1810
- data/LICENSE.txt +1 -1
- data/lib/polars/3.2/polars.bundle +0 -0
- data/lib/polars/3.3/polars.bundle +0 -0
- data/lib/polars/3.4/polars.bundle +0 -0
- data/lib/polars/array_expr.rb +382 -3
- data/lib/polars/array_name_space.rb +281 -0
- data/lib/polars/binary_expr.rb +67 -0
- data/lib/polars/binary_name_space.rb +43 -0
- data/lib/polars/cat_expr.rb +224 -0
- data/lib/polars/cat_name_space.rb +130 -32
- data/lib/polars/catalog/unity/catalog_info.rb +20 -0
- data/lib/polars/catalog/unity/column_info.rb +31 -0
- data/lib/polars/catalog/unity/namespace_info.rb +21 -0
- data/lib/polars/catalog/unity/table_info.rb +50 -0
- data/lib/polars/catalog.rb +448 -0
- data/lib/polars/config.rb +2 -2
- data/lib/polars/convert.rb +12 -2
- data/lib/polars/data_frame.rb +834 -48
- data/lib/polars/data_type_expr.rb +52 -0
- data/lib/polars/data_types.rb +61 -5
- data/lib/polars/date_time_expr.rb +251 -0
- data/lib/polars/date_time_name_space.rb +299 -0
- data/lib/polars/exceptions.rb +7 -2
- data/lib/polars/expr.rb +1247 -211
- data/lib/polars/functions/col.rb +6 -5
- data/lib/polars/functions/datatype.rb +21 -0
- data/lib/polars/functions/lazy.rb +127 -15
- data/lib/polars/functions/repeat.rb +4 -0
- data/lib/polars/io/csv.rb +19 -1
- data/lib/polars/io/json.rb +16 -0
- data/lib/polars/io/ndjson.rb +13 -0
- data/lib/polars/io/parquet.rb +70 -66
- data/lib/polars/io/scan_options.rb +47 -0
- data/lib/polars/lazy_frame.rb +1099 -95
- data/lib/polars/list_expr.rb +400 -11
- data/lib/polars/list_name_space.rb +321 -5
- data/lib/polars/meta_expr.rb +71 -22
- data/lib/polars/name_expr.rb +36 -0
- data/lib/polars/scan_cast_options.rb +64 -0
- data/lib/polars/schema.rb +84 -3
- data/lib/polars/selector.rb +210 -0
- data/lib/polars/selectors.rb +932 -203
- data/lib/polars/series.rb +1083 -63
- data/lib/polars/string_expr.rb +435 -9
- data/lib/polars/string_name_space.rb +729 -45
- data/lib/polars/struct_expr.rb +103 -0
- data/lib/polars/struct_name_space.rb +19 -1
- data/lib/polars/utils/parse.rb +40 -0
- data/lib/polars/utils/various.rb +18 -1
- data/lib/polars/utils.rb +9 -1
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +10 -0
- metadata +12 -2
data/lib/polars/schema.rb
CHANGED
@@ -1,29 +1,110 @@
|
|
1
1
|
module Polars
  class Schema
    # Ordered mapping of column names to their data type.
    #
    # @param schema [Object]
    #   The schema definition given by column names and their associated
    #   Polars data type. Accepts a mapping or an enumerable of arrays.
    # @param check_dtypes [Boolean]
    #   When false, every entry is stored exactly as given, without validation.
    #
    # @raise [TypeError]
    #   If a Polars data type class is given that cannot be instantiated
    #   without arguments (it must be passed as a fully-specified instance).
    def initialize(schema = nil, check_dtypes: true)
      input = schema || {}
      @schema = {}
      input.each do |name, tp|
        if !check_dtypes
          @schema[name] = tp
        elsif Utils.is_polars_dtype(tp)
          @schema[name] = _check_dtype(tp)
        else
          # anything that is not a Polars dtype goes through the public setter
          self[name] = tp
        end
      end
    end

    # Returns the data type of the column.
    #
    # @return [Object]
    def [](key)
      @schema[key]
    end

    # Sets the data type of the column.
    #
    # @return [Object]
    def []=(name, dtype)
      # TODO check dtype
      @schema[name] = dtype
    end

    # Get the column names of the schema.
    #
    # @return [Array]
    #
    # @example
    #   s = Polars::Schema.new({"x" => Polars::Float64.new, "y" => Polars::Datetime.new(time_zone: "UTC")})
    #   s.names
    #   # => ["x", "y"]
    def names
      @schema.keys
    end

    # Get the data types of the schema.
    #
    # @return [Array]
    #
    # @example
    #   s = Polars::Schema.new({"x" => Polars::UInt8.new, "y" => Polars::List.new(Polars::UInt8)})
    #   s.dtypes
    #   # => [Polars::UInt8, Polars::List(Polars::UInt8)]
    def dtypes
      @schema.values
    end

    # Get the number of schema entries.
    #
    # @return [Integer]
    #
    # @example
    #   s = Polars::Schema.new({"x" => Polars::Int32.new, "y" => Polars::List.new(Polars::String)})
    #   s.length
    #   # => 2
    def length
      @schema.length
    end

    # Returns a string representing the Schema.
    #
    # @return [String]
    def to_s
      "#{self.class.name}(#{@schema})"
    end
    alias_method :inspect, :to_s

    # @private
    def include?(name)
      @schema.include?(name)
    end

    # @private
    def to_h
      @schema.to_h
    end

    private

    # Normalizes a Polars data type to an instance.
    #
    # Instances are returned unchanged. A data type class is instantiated
    # with no arguments, unless it cannot be (nested, decimal, or a
    # constructor with required arguments), in which case a TypeError is raised.
    def _check_dtype(tp)
      return tp if tp.is_a?(DataType)

      # note: if nested/decimal, or has signature params, this implies required args
      if tp.nested? || tp.decimal? || _required_init_args(tp)
        msg = "dtypes must be fully-specified, got: #{tp.inspect}"
        raise TypeError, msg
      end
      tp.new
    end

    # Whether instantiating the class requires at least one argument.
    #
    # `Class#new` always reports an arity of -1 regardless of the constructor,
    # so the signature of `initialize` is inspected instead.
    def _required_init_args(tp)
      arity = tp.instance_method(:initialize).arity
      arity > 0 || arity < -1
    end
  end
end
|
@@ -0,0 +1,210 @@
|
|
1
|
+
module Polars
  # Base column selector expression/proxy.
  class Selector < Expr
    # @private
    attr_accessor :_rbselector

    # @private
    def self._from_rbselector(rbselector)
      slf = new
      slf._rbselector = rbselector
      slf._rbexpr = RbExpr.new_selector(rbselector)
      slf
    end

    # Returns a string representing the Selector.
    #
    # @return [String]
    def inspect
      as_expr.to_s
    end

    # @private
    def self._by_dtype(dtypes)
      concrete_dtypes = dtypes.map do |dt|
        # only Polars data types are supported so far
        raise Todo unless Utils.is_polars_dtype(dt)

        dt
      end

      _from_rbselector(RbSelector.by_dtype(concrete_dtypes))
    end

    # @private
    def self._by_name(names, strict:)
      _from_rbselector(RbSelector.by_name(names, strict))
    end

    # Invert the selector.
    #
    # @return [Selector]
    def ~
      Selectors.all - self
    end

    # AND.
    #
    # A column expression is first converted to a by-name selector. When
    # `other` is a selector the result is the intersection of both selectors;
    # otherwise the operation is applied to the materialized expression.
    #
    # @return [Selector]
    def &(other)
      other = _column_to_selector(other)
      if Utils.is_selector(other)
        Selector._from_rbselector(
          _rbselector.intersect(other._rbselector)
        )
      else
        as_expr & other
      end
    end

    # OR.
    #
    # A column expression is first converted to a by-name selector. When
    # `other` is a selector the result is the union of both selectors;
    # otherwise the operation is applied to the materialized expression.
    #
    # @return [Selector]
    def |(other)
      other = _column_to_selector(other)
      if Utils.is_selector(other)
        Selector._from_rbselector(
          _rbselector.union(other._rbselector)
        )
      else
        as_expr | other
      end
    end

    # Difference.
    #
    # When `other` is a selector the result is the difference of both
    # selectors; otherwise the operation is applied to the materialized
    # expression.
    #
    # @return [Selector]
    def -(other)
      if Utils.is_selector(other)
        Selector._from_rbselector(
          _rbselector.difference(other._rbselector)
        )
      else
        as_expr - other
      end
    end

    # XOR.
    #
    # A column expression is first converted to a by-name selector. When
    # `other` is a selector the result is the exclusive-or of both selectors;
    # otherwise the operation is applied to the materialized expression.
    #
    # @return [Selector]
    def ^(other)
      other = _column_to_selector(other)
      if Utils.is_selector(other)
        Selector._from_rbselector(
          _rbselector.exclusive_or(other._rbselector)
        )
      else
        as_expr ^ other
      end
    end

    # Exclude columns from a multi-column expression.
    #
    # Only works after a wildcard or regex column selection, and you cannot provide
    # both string column names *and* dtypes (you may prefer to use selectors instead).
    #
    # @param columns [Object]
    #   The name or datatype of the column(s) to exclude. Accepts regular expression
    #   input. Regular expressions should start with `^` and end with `$`.
    # @param more_columns [Array]
    #   Additional names or datatypes of columns to exclude, specified as positional
    #   arguments.
    #
    # @return [Selector]
    #
    # @raise [TypeError]
    #   If an item is neither a string nor a Polars data type, or if both
    #   names and data types are given.
    def exclude(columns, *more_columns)
      exclude_cols = []
      exclude_dtypes = []
      ((columns.is_a?(::Array) ? columns : [columns]) + more_columns).each do |item|
        if item.is_a?(::String)
          exclude_cols << item
        elsif Utils.is_polars_dtype(item)
          exclude_dtypes << item
        else
          msg = (
            "invalid input for `exclude`" +
            "\n\nExpected one or more `str` or `DataType`; found #{item.inspect} instead."
          )
          raise TypeError, msg
        end
      end

      if exclude_cols.any? && exclude_dtypes.any?
        msg = "cannot exclude by both column name and dtype; use a selector instead"
        raise TypeError, msg
      elsif exclude_dtypes.any?
        self - Selectors.by_dtype(exclude_dtypes)
      else
        self - Selectors.by_name(exclude_cols, require_all: false)
      end
    end

    # Materialize the `selector` as a normal expression.
    #
    # This ensures that the operators `|`, `&`, `~` and `-`
    # are applied on the data and not on the selector sets.
    #
    # @return [Expr]
    #
    # @example Inverting the boolean selector will choose the non-boolean columns:
    #   df = Polars::DataFrame.new(
    #     {
    #       "colx" => ["aa", "bb", "cc"],
    #       "coly" => [true, false, true],
    #       "colz" => [1, 2, 3]
    #     }
    #   )
    #   df.select(~Polars.cs.boolean)
    #   # =>
    #   # shape: (3, 2)
    #   # ┌──────┬──────┐
    #   # │ colx ┆ colz │
    #   # │ ---  ┆ ---  │
    #   # │ str  ┆ i64  │
    #   # ╞══════╪══════╡
    #   # │ aa   ┆ 1    │
    #   # │ bb   ┆ 2    │
    #   # │ cc   ┆ 3    │
    #   # └──────┴──────┘
    #
    # @example To invert the *values* in the selected boolean columns, we need to materialize the selector as a standard expression instead:
    #   df.select(~Polars.cs.boolean.as_expr)
    #   # =>
    #   # shape: (3, 1)
    #   # ┌───────┐
    #   # │ coly  │
    #   # │ ---   │
    #   # │ bool  │
    #   # ╞═══════╡
    #   # │ false │
    #   # │ true  │
    #   # │ false │
    #   # └───────┘
    def as_expr
      Expr._from_rbexpr(_rbexpr)
    end

    private

    # Converts a column expression into a by-name selector so that set
    # operators can combine it with this selector; any other operand is
    # returned unchanged.
    def _column_to_selector(other)
      return other unless Utils.is_column(other)

      # the array form mirrors how `exclude` calls `Selectors.by_name`
      Selectors.by_name([other.meta.output_name])
    end
  end
end
|