polars-df 0.10.0-x86_64-linux-musl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +3 -0
- data/CHANGELOG.md +175 -0
- data/Cargo.lock +2536 -0
- data/Cargo.toml +6 -0
- data/LICENSE-THIRD-PARTY.txt +38726 -0
- data/LICENSE.txt +20 -0
- data/README.md +437 -0
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/array_expr.rb +537 -0
- data/lib/polars/array_name_space.rb +423 -0
- data/lib/polars/batched_csv_reader.rb +98 -0
- data/lib/polars/binary_expr.rb +77 -0
- data/lib/polars/binary_name_space.rb +66 -0
- data/lib/polars/cat_expr.rb +72 -0
- data/lib/polars/cat_name_space.rb +125 -0
- data/lib/polars/config.rb +530 -0
- data/lib/polars/convert.rb +93 -0
- data/lib/polars/data_frame.rb +5418 -0
- data/lib/polars/data_types.rb +466 -0
- data/lib/polars/date_time_expr.rb +1444 -0
- data/lib/polars/date_time_name_space.rb +1484 -0
- data/lib/polars/dynamic_group_by.rb +52 -0
- data/lib/polars/exceptions.rb +31 -0
- data/lib/polars/expr.rb +6105 -0
- data/lib/polars/expr_dispatch.rb +22 -0
- data/lib/polars/functions/aggregation/horizontal.rb +246 -0
- data/lib/polars/functions/aggregation/vertical.rb +282 -0
- data/lib/polars/functions/as_datatype.rb +248 -0
- data/lib/polars/functions/col.rb +47 -0
- data/lib/polars/functions/eager.rb +182 -0
- data/lib/polars/functions/lazy.rb +1280 -0
- data/lib/polars/functions/len.rb +49 -0
- data/lib/polars/functions/lit.rb +35 -0
- data/lib/polars/functions/random.rb +16 -0
- data/lib/polars/functions/range/date_range.rb +103 -0
- data/lib/polars/functions/range/int_range.rb +51 -0
- data/lib/polars/functions/repeat.rb +144 -0
- data/lib/polars/functions/whenthen.rb +96 -0
- data/lib/polars/functions.rb +57 -0
- data/lib/polars/group_by.rb +548 -0
- data/lib/polars/io.rb +890 -0
- data/lib/polars/lazy_frame.rb +2833 -0
- data/lib/polars/lazy_group_by.rb +84 -0
- data/lib/polars/list_expr.rb +791 -0
- data/lib/polars/list_name_space.rb +445 -0
- data/lib/polars/meta_expr.rb +222 -0
- data/lib/polars/name_expr.rb +198 -0
- data/lib/polars/plot.rb +109 -0
- data/lib/polars/rolling_group_by.rb +37 -0
- data/lib/polars/series.rb +4527 -0
- data/lib/polars/slice.rb +104 -0
- data/lib/polars/sql_context.rb +194 -0
- data/lib/polars/string_cache.rb +75 -0
- data/lib/polars/string_expr.rb +1519 -0
- data/lib/polars/string_name_space.rb +810 -0
- data/lib/polars/struct_expr.rb +98 -0
- data/lib/polars/struct_name_space.rb +96 -0
- data/lib/polars/testing.rb +507 -0
- data/lib/polars/utils.rb +422 -0
- data/lib/polars/version.rb +4 -0
- data/lib/polars/whenthen.rb +83 -0
- data/lib/polars-df.rb +1 -0
- data/lib/polars.rb +72 -0
- metadata +125 -0
@@ -0,0 +1,98 @@
|
|
1
|
+
module Polars
|
2
|
+
# Expression namespace exposing struct-specific operations.
class StructExpr
  # @private
  attr_accessor :_rbexpr

  # @private
  def initialize(expr)
    @_rbexpr = expr._rbexpr
  end

  # Look up a single field of the struct, either by name or by position.
  #
  # @param item [String, Integer]
  #   Field name, or index of the field.
  #
  # @return [Expr]
  #
  # @raise [ArgumentError]
  #   If `item` is neither a String nor an Integer.
  def [](item)
    case item
    when ::String
      field(item)
    when Integer
      Utils.wrap_expr(_rbexpr.struct_field_by_index(item))
    else
      raise ArgumentError, "expected type Integer or String, got #{item.class.name}"
    end
  end

  # Select one field of the struct by name, as a new expression.
  #
  # @param name [String]
  #   Name of the field
  #
  # @return [Expr]
  #
  # @example
  #   df = (
  #     Polars::DataFrame.new(
  #       {
  #         "int" => [1, 2],
  #         "str" => ["a", "b"],
  #         "bool" => [true, nil],
  #         "list" => [[1, 2], [3]]
  #       }
  #     )
  #     .to_struct("my_struct")
  #     .to_frame
  #   )
  #   df.select(Polars.col("my_struct").struct.field("str"))
  #   # =>
  #   # shape: (2, 1)
  #   # ┌─────┐
  #   # │ str │
  #   # │ --- │
  #   # │ str │
  #   # ╞═════╡
  #   # │ a   │
  #   # │ b   │
  #   # └─────┘
  def field(name)
    selected = _rbexpr.struct_field_by_name(name)
    Utils.wrap_expr(selected)
  end

  # Give the struct's fields new names.
  #
  # @param names [Array]
  #   New names in the order of the struct's fields
  #
  # @return [Expr]
  #
  # @example
  #   df = (
  #     Polars::DataFrame.new(
  #       {
  #         "int" => [1, 2],
  #         "str" => ["a", "b"],
  #         "bool" => [true, nil],
  #         "list" => [[1, 2], [3]]
  #       }
  #     )
  #     .to_struct("my_struct")
  #     .to_frame
  #   )
  #   df = df.with_column(
  #     Polars.col("my_struct").struct.rename_fields(["INT", "STR", "BOOL", "LIST"])
  #   )
  #   df.select(Polars.col("my_struct").struct.field("INT"))
  #   # =>
  #   # shape: (2, 1)
  #   # ┌─────┐
  #   # │ INT │
  #   # │ --- │
  #   # │ i64 │
  #   # ╞═════╡
  #   # │ 1   │
  #   # │ 2   │
  #   # └─────┘
  def rename_fields(names)
    renamed = _rbexpr.struct_rename_fields(names)
    Utils.wrap_expr(renamed)
  end
end
|
98
|
+
end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
module Polars
|
2
|
+
# Struct-specific operations available on a Series via `Series#struct`.
class StructNameSpace
  include ExprDispatch

  self._accessor = "struct"

  # @private
  def initialize(series)
    self._s = series._s
  end

  # Look up a single field of the struct, either by position or by name.
  #
  # An Integer is translated to the field name at that position in {#fields}.
  #
  # @param item [Integer, String]
  #   Index or name of the field.
  #
  # @return [Series]
  #
  # @raise [ArgumentError]
  #   If `item` is neither an Integer nor a String.
  def [](item)
    case item
    when Integer
      field(fields[item])
    when ::String
      field(item)
    else
      raise ArgumentError, "expected type Integer or String, got #{item.class.name}"
    end
  end

  # Turn this Struct Series into a DataFrame.
  #
  # @return [DataFrame]
  def to_frame
    frame = _s.struct_to_frame
    Utils.wrap_df(frame)
  end

  # List the names of the struct's fields.
  #
  # @return [Array]
  #   Empty when there is no underlying series.
  def fields
    return [] if _s.nil?

    _s.struct_fields
  end

  # Select one field of the struct by name, as a new Series.
  #
  # @param name [String]
  #   Name of the field
  #
  # @return [Series]
  def field(name)
    super
  end

  # Give the struct's fields new names.
  #
  # @param names [Array]
  #   New names in the order of the struct's fields
  #
  # @return [Series]
  def rename_fields(names)
    super
  end

  # Describe the struct as a name/dtype schema.
  #
  # @return [Object]
  #   An empty Hash when there is no underlying series.
  def schema
    return {} if _s.nil?

    _s.dtype.to_schema
  end

  # Expand this struct Series into a DataFrame with one column per field.
  #
  # @return [DataFrame]
  #
  # @example
  #   s = Polars::Series.new([{"a" => 1, "b" => 2}, {"a" => 3, "b" => 4}])
  #   s.struct.unnest
  #   # =>
  #   # shape: (2, 2)
  #   # ┌─────┬─────┐
  #   # │ a   ┆ b   │
  #   # │ --- ┆ --- │
  #   # │ i64 ┆ i64 │
  #   # ╞═════╪═════╡
  #   # │ 1   ┆ 2   │
  #   # │ 3   ┆ 4   │
  #   # └─────┴─────┘
  def unnest
    frame = _s.struct_unnest
    Utils.wrap_df(frame)
  end
end
|
96
|
+
end
|
@@ -0,0 +1,507 @@
|
|
1
|
+
module Polars
|
2
|
+
module Testing
|
3
|
+
# Check that two frames are equal, raising a descriptive `AssertionError`
# when they are not.
#
# The schema is compared first, then the row count, then every column of
# `left` against the column with the same name in `right`.
# Intended for use in unit tests.
#
# @param left [Object]
#   The first DataFrame or LazyFrame to compare.
# @param right [Object]
#   The second DataFrame or LazyFrame to compare.
# @param check_row_order [Boolean]
#   Require row order to match. When `false`, both frames are sorted
#   before their values are compared.
# @param check_column_order [Boolean]
#   Require column order to match.
# @param check_dtype [Boolean]
#   Require data types to match.
# @param check_exact [Boolean]
#   Require float values to match exactly. If set to `false`, values are considered
#   equal when within tolerance of each other (see `rtol` and `atol`).
#   Only affects columns with a Float data type.
# @param rtol [Float]
#   Relative tolerance for inexact checking. Fraction of values in `right`.
# @param atol [Float]
#   Absolute tolerance for inexact checking.
# @param categorical_as_str [Boolean]
#   Cast categorical columns to string before comparing. Enabling this helps
#   compare columns that do not share the same string cache.
#
# @return [Object]
#   The return value carries no meaning; failure is signalled by raising.
def assert_frame_equal(
  left,
  right,
  check_row_order: true,
  check_column_order: true,
  check_dtype: true,
  check_exact: false,
  rtol: 1e-5,
  atol: 1e-8,
  categorical_as_str: false
)
  lazy = _assert_correct_input_type(left, right)
  objects = lazy ? "LazyFrames" : "DataFrames"

  _assert_frame_schema_equal(
    left,
    right,
    check_column_order: check_column_order,
    check_dtype: check_dtype,
    objects: objects,
  )

  # Lazy inputs are materialized before any row-level comparison
  left, right = left.collect, right.collect if lazy

  if left.height != right.height
    raise_assertion_error(
      objects, "number of rows does not match", left.height, right.height
    )
  end

  left, right = _sort_dataframes(left, right) unless check_row_order

  left.columns.each do |column_name|
    left_column = left.get_column(column_name)
    right_column = right.get_column(column_name)

    begin
      _assert_series_values_equal(
        left_column,
        right_column,
        check_exact: check_exact,
        rtol: rtol,
        atol: atol,
        categorical_as_str: categorical_as_str
      )
    rescue AssertionError
      # Re-raise with the column name so the failure is easy to locate
      raise_assertion_error(
        objects,
        "value mismatch for column #{column_name.inspect}",
        left_column.to_a,
        right_column.to_a
      )
    end
  end
end
|
88
|
+
|
89
|
+
# Check that two frames are **not** equal.
#
# Runs {#assert_frame_equal} with the same options and succeeds only when
# that check raises an `AssertionError`.
# Intended for use in unit tests.
#
# @param left [Object]
#   The first DataFrame or LazyFrame to compare.
# @param right [Object]
#   The second DataFrame or LazyFrame to compare.
# @param check_row_order [Boolean]
#   Require row order to match.
# @param check_column_order [Boolean]
#   Require column order to match.
# @param check_dtype [Boolean]
#   Require data types to match.
# @param check_exact [Boolean]
#   Require float values to match exactly. If set to `false`, values are considered
#   equal when within tolerance of each other (see `rtol` and `atol`).
#   Only affects columns with a Float data type.
# @param rtol [Float]
#   Relative tolerance for inexact checking. Fraction of values in `right`.
# @param atol [Float]
#   Absolute tolerance for inexact checking.
# @param categorical_as_str [Boolean]
#   Cast categorical columns to string before comparing. Enabling this helps
#   compare columns that do not share the same string cache.
#
# @return [nil]
#
# @raise [AssertionError]
#   If the frames compare equal.
def assert_frame_not_equal(
  left,
  right,
  check_row_order: true,
  check_column_order: true,
  check_dtype: true,
  check_exact: false,
  rtol: 1e-5,
  atol: 1e-8,
  categorical_as_str: false
)
  frames_differ =
    begin
      assert_frame_equal(
        left,
        right,
        check_column_order: check_column_order,
        check_row_order: check_row_order,
        check_dtype: check_dtype,
        check_exact: check_exact,
        rtol: rtol,
        atol: atol,
        categorical_as_str: categorical_as_str
      )
      false
    rescue AssertionError
      true
    end

  return if frames_differ

  raise AssertionError, "frames are equal"
end
|
146
|
+
|
147
|
+
# Check that two Series are equal, raising a descriptive `AssertionError`
# when they are not.
#
# Checks run in this order: input types, length, name (optional), dtype
# (optional), and finally the values themselves.
# Intended for use in unit tests.
#
# @param left [Object]
#   The first Series to compare.
# @param right [Object]
#   The second Series to compare.
# @param check_dtype [Boolean]
#   Require data types to match.
# @param check_names [Boolean]
#   Require names to match.
# @param check_exact [Boolean]
#   Require float values to match exactly. If set to `false`, values are considered
#   equal when within tolerance of each other (see `rtol` and `atol`).
#   Only affects columns with a Float data type.
# @param rtol [Float]
#   Relative tolerance for inexact checking, given as a fraction of the values in
#   `right`.
# @param atol [Float]
#   Absolute tolerance for inexact checking.
# @param categorical_as_str [Boolean]
#   Cast categorical columns to string before comparing. Enabling this helps
#   compare columns that do not share the same string cache.
#
# @return [Object]
#   The return value carries no meaning; failure is signalled by raising.
def assert_series_equal(
  left,
  right,
  check_dtype: true,
  check_names: true,
  check_exact: false,
  rtol: 1e-5,
  atol: 1e-8,
  categorical_as_str: false
)
  unless left.is_a?(Series) && right.is_a?(Series)
    raise_assertion_error(
      "inputs",
      "unexpected input types",
      left.class.name,
      right.class.name
    )
  end

  raise_assertion_error("Series", "length mismatch", left.len, right.len) if left.len != right.len

  names_differ = check_names && left.name != right.name
  raise_assertion_error("Series", "name mismatch", left.name, right.name) if names_differ

  dtypes_differ = check_dtype && left.dtype != right.dtype
  raise_assertion_error("Series", "dtype mismatch", left.dtype, right.dtype) if dtypes_differ

  _assert_series_values_equal(
    left,
    right,
    check_exact: check_exact,
    rtol: rtol,
    atol: atol,
    categorical_as_str: categorical_as_str
  )
end
|
214
|
+
|
215
|
+
# Check that two Series are **not** equal.
#
# Runs {#assert_series_equal} with the same options and succeeds only when
# that check raises an `AssertionError`.
# Intended for use in unit tests.
#
# @param left [Object]
#   The first Series to compare.
# @param right [Object]
#   The second Series to compare.
# @param check_dtype [Boolean]
#   Require data types to match.
# @param check_names [Boolean]
#   Require names to match.
# @param check_exact [Boolean]
#   Require float values to match exactly. If set to `false`, values are considered
#   equal when within tolerance of each other (see `rtol` and `atol`).
#   Only affects columns with a Float data type.
# @param rtol [Float]
#   Relative tolerance for inexact checking, given as a fraction of the values in
#   `right`.
# @param atol [Float]
#   Absolute tolerance for inexact checking.
# @param categorical_as_str [Boolean]
#   Cast categorical columns to string before comparing. Enabling this helps
#   compare columns that do not share the same string cache.
#
# @return [nil]
#
# @raise [AssertionError]
#   If the Series compare equal.
def assert_series_not_equal(
  left,
  right,
  check_dtype: true,
  check_names: true,
  check_exact: false,
  rtol: 1e-5,
  atol: 1e-8,
  categorical_as_str: false
)
  series_differ =
    begin
      assert_series_equal(
        left,
        right,
        check_dtype: check_dtype,
        check_names: check_names,
        check_exact: check_exact,
        rtol: rtol,
        atol: atol,
        categorical_as_str: categorical_as_str
      )
      false
    rescue AssertionError
      true
    end

  return if series_differ

  raise AssertionError, "Series are equal"
end
|
269
|
+
|
270
|
+
private
|
271
|
+
|
272
|
+
# Verify that both inputs are frames of the same kind and report which kind.
#
# @param left [Object]
#   First input to check.
# @param right [Object]
#   Second input to check.
#
# @return [Boolean]
#   `false` when both inputs are DataFrames, `true` when both are LazyFrames.
#
# @raise [AssertionError]
#   Through `raise_assertion_error`, for any other combination, including
#   one DataFrame mixed with one LazyFrame.
def _assert_correct_input_type(left, right)
  return false if left.is_a?(DataFrame) && right.is_a?(DataFrame)

  # Both sides must be lazy: assert_frame_equal calls `collect` on *both*
  # frames when this returns true.
  return true if left.is_a?(LazyFrame) && right.is_a?(LazyFrame)

  raise_assertion_error(
    "inputs",
    "unexpected input types",
    left.class.name,
    right.class.name
  )
end
|
286
|
+
|
287
|
+
# Compare the schemas (column names, column order, dtypes) of two frames.
#
# @param left [Object]
#   First frame; must respond to `schema` with a Hash-like name => dtype mapping.
# @param right [Object]
#   Second frame; same requirement as `left`.
# @param check_dtype [Boolean]
#   Require the dtype of every column to match.
# @param check_column_order [Boolean]
#   Require the columns to appear in the same order.
# @param objects [String]
#   Plural label used in error messages ("DataFrames" or "LazyFrames").
#
# @return [nil]
#
# @raise [AssertionError]
#   If column names differ, or (depending on the flags) if the column order
#   or the dtypes differ.
def _assert_frame_schema_equal(
  left,
  right,
  check_dtype:,
  check_column_order:,
  objects:
)
  left_schema, right_schema = left.schema, right.schema
  left_columns, right_columns = left_schema.keys, right_schema.keys

  # Fast path for equal frames. Hash equality ignores insertion order, so
  # the key order is compared explicitly to keep column-order checks alive.
  return if left_schema == right_schema && left_columns == right_columns

  # Special error message for when column names do not match. Set
  # differences are used so a pure ordering difference does not land here.
  left_not_right = left_columns - right_columns
  right_not_left = right_columns - left_columns
  # "DataFrames" -> "DataFrame": each message talks about a single frame
  singular = objects.delete_suffix("s")

  unless left_not_right.empty?
    msg = "columns #{left_not_right.inspect} in left #{singular}, but not in right"
    raise AssertionError, msg
  end

  unless right_not_left.empty?
    msg = "columns #{right_not_left.inspect} in right #{singular}, but not in left"
    raise AssertionError, msg
  end

  if check_column_order && left_columns != right_columns
    detail = "columns are not in the same order"
    raise_assertion_error(objects, detail, left_columns, right_columns)
  end

  if check_dtype
    left_schema_dict, right_schema_dict = left_schema.to_h, right_schema.to_h
    # Names are known to match at this point, so an order-insensitive Hash
    # comparison differs only when some dtype differs.
    if left_schema_dict != right_schema_dict
      detail = "dtypes do not match"
      raise_assertion_error(objects, detail, left_schema_dict, right_schema_dict)
    end
  end

  nil
end
|
329
|
+
|
330
|
+
# Sort both frames by every column of `left`, so rows can be compared
# independently of their original order.
#
# @param left [Object]
#   First frame; its column list is used as the sort key for both frames.
# @param right [Object]
#   Second frame.
#
# @return [Array]
#   The sorted `left` and `right` frames, in that order.
#
# @raise [InvalidAssert]
#   If sorting either frame raises.
def _sort_dataframes(left, right)
  sort_keys = left.columns

  begin
    sorted_left = left.sort(sort_keys)
    sorted_right = right.sort(sort_keys)
    [sorted_left, sorted_right]
  rescue
    raise InvalidAssert, "cannot set `check_row_order: false` on frame with unsortable columns"
  end
end
|
341
|
+
|
342
|
+
# Compare the values of two Series, raising an `AssertionError` on mismatch.
#
# Values are first compared exactly. Nested (list/struct) float data is
# delegated to `_assert_series_nested_values_equal`; flat float data falls
# back to a tolerance-based comparison unless `check_exact` is set.
#
# @param left [Object]
#   First Series.
# @param right [Object]
#   Second Series.
# @param check_exact [Boolean]
#   Require exact equality, even for floats.
# @param rtol [Float]
#   Relative tolerance forwarded to the inexact comparison.
# @param atol [Float]
#   Absolute tolerance forwarded to the inexact comparison.
# @param categorical_as_str [Boolean]
#   Cast Categorical inputs to String before comparing.
#
# @return [Object]
#   The return value carries no meaning; failure is signalled by raising.
#
# @raise [AssertionError]
#   If the Series cannot be compared or their values differ.
def _assert_series_values_equal(
  left,
  right,
  check_exact:,
  rtol:,
  atol:,
  categorical_as_str:
)
  if categorical_as_str
    left = left.cast(String) if left.dtype == Categorical
    right = right.cast(String) if right.dtype == Categorical
  end

  # Determine unequal elements
  begin
    unequal = left.ne_missing(right)
  rescue
    raise_assertion_error(
      "Series",
      "incompatible data types",
      left.dtype,
      right.dtype
    )
  end

  # Check nested dtypes in separate function
  if _comparing_nested_floats(left.dtype, right.dtype)
    begin
      # The helper declares `left` and `right` as positional parameters, so
      # the filtered Series are passed positionally rather than as keywords.
      _assert_series_nested_values_equal(
        left.filter(unequal),
        right.filter(unequal),
        check_exact: check_exact,
        rtol: rtol,
        atol: atol,
        categorical_as_str: categorical_as_str
      )
    rescue AssertionError
      raise_assertion_error(
        "Series",
        "nested value mismatch",
        left.to_a,
        right.to_a
      )
    end
    return
  end

  # If no differences found during exact checking, we're done
  return unless unequal.any

  # Only do inexact checking for float types
  if check_exact || !left.dtype.float? || !right.dtype.float?
    raise_assertion_error(
      "Series", "exact value mismatch", left.to_a, right.to_a
    )
  end

  _assert_series_null_values_match(left, right)
  _assert_series_nan_values_match(left, right)
  # NOTE(review): `_assert_series_values_within_tolerance` is not defined in
  # the part of this file that is visible here - confirm it exists elsewhere.
  _assert_series_values_within_tolerance(
    left,
    right,
    unequal,
    rtol: rtol,
    atol: atol
  )
end
|
416
|
+
|
417
|
+
# Compare two Series holding nested data, element by element.
#
# When both dtypes are list-like, the Series are walked pairwise and each
# pair of elements is compared; otherwise both are unnested as structs and
# the resulting fields are compared pairwise.
#
# @param left [Object]
#   First Series.
# @param right [Object]
#   Second Series.
# @param check_exact [Boolean]
#   Forwarded to `_assert_series_values_equal`.
# @param rtol [Float]
#   Forwarded to `_assert_series_values_equal`.
# @param atol [Float]
#   Forwarded to `_assert_series_values_equal`.
# @param categorical_as_str [Boolean]
#   Forwarded to `_assert_series_values_equal`.
#
# @return [Object]
#   The return value carries no meaning; failure is signalled by raising.
def _assert_series_nested_values_equal(
  left,
  right,
  check_exact:,
  rtol:,
  atol:,
  categorical_as_str:
)
  options = {
    check_exact: check_exact,
    rtol: rtol,
    atol: atol,
    categorical_as_str: categorical_as_str
  }

  unless _comparing_lists(left.dtype, right.dtype)
    # unnest structs as series and compare
    left_fields = left.struct.unnest
    right_fields = right.struct.unnest
    return left_fields.zip(right_fields) do |s1, s2|
      _assert_series_values_equal(s1, s2, **options)
    end
  end

  # compare nested lists element-wise
  left.zip(right) do |s1, s2|
    # A missing element on either side can never equal a present one
    raise_assertion_error("Series", "nested value mismatch", s1, s2) if s1.nil? || s2.nil?

    _assert_series_values_equal(s1, s2, **options)
  end
end
|
457
|
+
|
458
|
+
# Ensure nulls occur at the same positions in both Series.
#
# @param left [Object]
#   First Series.
# @param right [Object]
#   Second Series.
#
# @return [nil]
#
# @raise [AssertionError]
#   If any position is null in one Series but not in the other.
def _assert_series_null_values_match(left, right)
  mismatch_mask = left.is_null != right.is_null
  return unless mismatch_mask.any

  raise_assertion_error(
    "Series", "null value mismatch", left.to_a, right.to_a
  )
end
|
466
|
+
|
467
|
+
# Ensure NaNs occur at the same positions in both Series.
#
# The check is skipped unless `_comparing_floats` reports that both dtypes
# are floats.
#
# @param left [Object]
#   First Series.
# @param right [Object]
#   Second Series.
#
# @return [nil]
#
# @raise [AssertionError]
#   If any position is NaN in one Series but not in the other.
def _assert_series_nan_values_match(left, right)
  return unless _comparing_floats(left.dtype, right.dtype)

  mismatch_mask = left.is_nan != right.is_nan
  return unless mismatch_mask.any

  raise_assertion_error(
    "Series",
    "nan value mismatch",
    left.to_a,
    right.to_a
  )
end
|
481
|
+
|
482
|
+
# Report whether both dtypes are float types.
#
# Uses the `float?` predicate, the same one the rest of this file calls on
# dtypes (`left.dtype.float?`, `left.float?`).
#
# @param left [Object]
#   First dtype.
# @param right [Object]
#   Second dtype.
#
# @return [Boolean]
def _comparing_floats(left, right)
  left.float? && right.float?
end
|
485
|
+
|
486
|
+
# Report whether both dtypes are list-like (`List` or `Array`).
#
# The dtype is the receiver of `==`, as in `_comparing_structs`, so
# equality defined by the dtype decides the match rather than the class
# constants' own identity comparison.
#
# @param left [Object]
#   First dtype.
# @param right [Object]
#   Second dtype.
#
# @return [Boolean]
def _comparing_lists(left, right)
  [left, right].all? { |dtype| dtype == List || dtype == Array }
end
|
489
|
+
|
490
|
+
# Report whether both dtypes compare equal to `Struct`.
#
# @param left [Object]
#   First dtype.
# @param right [Object]
#   Second dtype.
#
# @return [Boolean]
def _comparing_structs(left, right)
  [left, right].all? { |dtype| dtype == Struct }
end
|
493
|
+
|
494
|
+
# Report whether both dtypes are nested (both list-like or both structs)
# and both answer true to `float?`.
#
# @param left [Object]
#   First dtype.
# @param right [Object]
#   Second dtype.
#
# @return [Boolean]
def _comparing_nested_floats(left, right)
  both_nested = _comparing_lists(left, right) || _comparing_structs(left, right)
  return false unless both_nested

  left.float? && right.float?
end
|
501
|
+
|
502
|
+
# Raise an `AssertionError` describing how two compared objects differ.
#
# @param objects [String]
#   Label for what was compared (for example "Series" or "DataFrames").
# @param detail [String]
#   Short description of the mismatch.
# @param left [Object]
#   Left-hand value, interpolated into the message.
# @param right [Object]
#   Right-hand value, interpolated into the message.
#
# @raise [AssertionError]
#   Always.
def raise_assertion_error(objects, detail, left, right)
  lines = [
    "#{objects} are different (#{detail})",
    "[left]: #{left}",
    "[right]: #{right}"
  ]
  raise AssertionError, lines.join("\n")
end
|
506
|
+
end
|
507
|
+
end
|