polars-df 0.13.0-aarch64-linux-musl
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +3 -0
- data/CHANGELOG.md +208 -0
- data/Cargo.lock +2556 -0
- data/Cargo.toml +6 -0
- data/LICENSE-THIRD-PARTY.txt +39059 -0
- data/LICENSE.txt +20 -0
- data/README.md +437 -0
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/array_expr.rb +537 -0
- data/lib/polars/array_name_space.rb +423 -0
- data/lib/polars/batched_csv_reader.rb +104 -0
- data/lib/polars/binary_expr.rb +77 -0
- data/lib/polars/binary_name_space.rb +66 -0
- data/lib/polars/cat_expr.rb +36 -0
- data/lib/polars/cat_name_space.rb +88 -0
- data/lib/polars/config.rb +530 -0
- data/lib/polars/convert.rb +98 -0
- data/lib/polars/data_frame.rb +5191 -0
- data/lib/polars/data_types.rb +466 -0
- data/lib/polars/date_time_expr.rb +1397 -0
- data/lib/polars/date_time_name_space.rb +1287 -0
- data/lib/polars/dynamic_group_by.rb +52 -0
- data/lib/polars/exceptions.rb +38 -0
- data/lib/polars/expr.rb +7256 -0
- data/lib/polars/expr_dispatch.rb +22 -0
- data/lib/polars/functions/aggregation/horizontal.rb +246 -0
- data/lib/polars/functions/aggregation/vertical.rb +282 -0
- data/lib/polars/functions/as_datatype.rb +271 -0
- data/lib/polars/functions/col.rb +47 -0
- data/lib/polars/functions/eager.rb +182 -0
- data/lib/polars/functions/lazy.rb +1329 -0
- data/lib/polars/functions/len.rb +49 -0
- data/lib/polars/functions/lit.rb +35 -0
- data/lib/polars/functions/random.rb +16 -0
- data/lib/polars/functions/range/date_range.rb +136 -0
- data/lib/polars/functions/range/datetime_range.rb +149 -0
- data/lib/polars/functions/range/int_range.rb +51 -0
- data/lib/polars/functions/range/time_range.rb +141 -0
- data/lib/polars/functions/repeat.rb +144 -0
- data/lib/polars/functions/whenthen.rb +96 -0
- data/lib/polars/functions.rb +57 -0
- data/lib/polars/group_by.rb +613 -0
- data/lib/polars/io/avro.rb +24 -0
- data/lib/polars/io/csv.rb +696 -0
- data/lib/polars/io/database.rb +73 -0
- data/lib/polars/io/ipc.rb +275 -0
- data/lib/polars/io/json.rb +29 -0
- data/lib/polars/io/ndjson.rb +80 -0
- data/lib/polars/io/parquet.rb +233 -0
- data/lib/polars/lazy_frame.rb +2708 -0
- data/lib/polars/lazy_group_by.rb +181 -0
- data/lib/polars/list_expr.rb +791 -0
- data/lib/polars/list_name_space.rb +449 -0
- data/lib/polars/meta_expr.rb +222 -0
- data/lib/polars/name_expr.rb +198 -0
- data/lib/polars/plot.rb +109 -0
- data/lib/polars/rolling_group_by.rb +35 -0
- data/lib/polars/series.rb +4444 -0
- data/lib/polars/slice.rb +104 -0
- data/lib/polars/sql_context.rb +194 -0
- data/lib/polars/string_cache.rb +75 -0
- data/lib/polars/string_expr.rb +1495 -0
- data/lib/polars/string_name_space.rb +811 -0
- data/lib/polars/struct_expr.rb +98 -0
- data/lib/polars/struct_name_space.rb +96 -0
- data/lib/polars/testing.rb +507 -0
- data/lib/polars/utils/constants.rb +9 -0
- data/lib/polars/utils/convert.rb +97 -0
- data/lib/polars/utils/parse.rb +89 -0
- data/lib/polars/utils/various.rb +76 -0
- data/lib/polars/utils/wrap.rb +19 -0
- data/lib/polars/utils.rb +130 -0
- data/lib/polars/version.rb +4 -0
- data/lib/polars/whenthen.rb +83 -0
- data/lib/polars-df.rb +1 -0
- data/lib/polars.rb +91 -0
- metadata +138 -0
@@ -0,0 +1,98 @@
|
|
1
|
+
module Polars
  # Namespace for struct related expressions.
  class StructExpr
    # @private
    attr_accessor :_rbexpr

    # @private
    def initialize(expr)
      @_rbexpr = expr._rbexpr
    end

    # Retrieve one of the fields of this `Struct` as a new expression.
    #
    # @param item [String, Integer]
    #   Name of the field, or its index.
    #
    # @return [Expr]
    #
    # @raise [ArgumentError]
    #   If `item` is neither a String nor an Integer.
    def [](item)
      case item
      when ::String
        field(item)
      when Integer
        Utils.wrap_expr(_rbexpr.struct_field_by_index(item))
      else
        raise ArgumentError, "expected type Integer or String, got #{item.class.name}"
      end
    end

    # Retrieve one of the fields of this `Struct` as a new expression.
    #
    # @param name [String]
    #   Name of the field
    #
    # @return [Expr]
    #
    # @example
    #   df = (
    #     Polars::DataFrame.new(
    #       {
    #         "int" => [1, 2],
    #         "str" => ["a", "b"],
    #         "bool" => [true, nil],
    #         "list" => [[1, 2], [3]]
    #       }
    #     )
    #     .to_struct("my_struct")
    #     .to_frame
    #   )
    #   df.select(Polars.col("my_struct").struct.field("str"))
    #   # =>
    #   # shape: (2, 1)
    #   # ┌─────┐
    #   # │ str │
    #   # │ --- │
    #   # │ str │
    #   # ╞═════╡
    #   # │ a   │
    #   # │ b   │
    #   # └─────┘
    def field(name)
      selected = _rbexpr.struct_field_by_name(name)
      Utils.wrap_expr(selected)
    end

    # Rename the fields of the struct.
    #
    # @param names [Array]
    #   New names in the order of the struct's fields
    #
    # @return [Expr]
    #
    # @example
    #   df = (
    #     Polars::DataFrame.new(
    #       {
    #         "int" => [1, 2],
    #         "str" => ["a", "b"],
    #         "bool" => [true, nil],
    #         "list" => [[1, 2], [3]]
    #       }
    #     )
    #     .to_struct("my_struct")
    #     .to_frame
    #   )
    #   df = df.with_column(
    #     Polars.col("my_struct").struct.rename_fields(["INT", "STR", "BOOL", "LIST"])
    #   )
    #   df.select(Polars.col("my_struct").struct.field("INT"))
    #   # =>
    #   # shape: (2, 1)
    #   # ┌─────┐
    #   # │ INT │
    #   # │ --- │
    #   # │ i64 │
    #   # ╞═════╡
    #   # │ 1   │
    #   # │ 2   │
    #   # └─────┘
    def rename_fields(names)
      renamed = _rbexpr.struct_rename_fields(names)
      Utils.wrap_expr(renamed)
    end
  end
end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
module Polars
  # Series.struct namespace.
  class StructNameSpace
    include ExprDispatch

    self._accessor = "struct"

    # @private
    def initialize(series)
      self._s = series._s
    end

    # Retrieve one of the fields of this `Struct` as a new Series.
    #
    # @param item [Integer, String]
    #   Position of the field (looked up in {#fields}), or its name.
    #
    # @return [Series]
    #
    # @raise [ArgumentError]
    #   If `item` is neither an Integer nor a String.
    def [](item)
      case item
      when Integer
        field(fields[item])
      when ::String
        field(item)
      else
        raise ArgumentError, "expected type Integer or String, got #{item.class.name}"
      end
    end

    # Convert this Struct Series to a DataFrame.
    #
    # @return [DataFrame]
    def to_frame
      frame = _s.struct_to_frame
      Utils.wrap_df(frame)
    end

    # Get the names of the fields.
    #
    # An empty array is returned when there is no underlying series.
    #
    # @return [Array]
    def fields
      return [] if _s.nil?

      _s.struct_fields
    end

    # Retrieve one of the fields of this `Struct` as a new Series.
    #
    # The call is forwarded unchanged through `super`.
    #
    # @param name [String]
    #   Name of the field
    #
    # @return [Series]
    def field(name)
      super
    end

    # Rename the fields of the struct.
    #
    # The call is forwarded unchanged through `super`.
    #
    # @param names [Array]
    #   New names in the order of the struct's fields
    #
    # @return [Series]
    def rename_fields(names)
      super
    end

    # Get the struct definition as a name/dtype schema dict.
    #
    # An empty hash is returned when there is no underlying series.
    #
    # @return [Object]
    def schema
      _s.nil? ? {} : _s.dtype.to_schema
    end

    # Convert this struct Series to a DataFrame with a separate column for each field.
    #
    # @return [DataFrame]
    #
    # @example
    #   s = Polars::Series.new([{"a" => 1, "b" => 2}, {"a" => 3, "b" => 4}])
    #   s.struct.unnest
    #   # =>
    #   # shape: (2, 2)
    #   # ┌─────┬─────┐
    #   # │ a   ┆ b   │
    #   # │ --- ┆ --- │
    #   # │ i64 ┆ i64 │
    #   # ╞═════╪═════╡
    #   # │ 1   ┆ 2   │
    #   # │ 3   ┆ 4   │
    #   # └─────┴─────┘
    def unnest
      frame = _s.struct_unnest
      Utils.wrap_df(frame)
    end
  end
end
|
@@ -0,0 +1,507 @@
|
|
1
|
+
module Polars
|
2
|
+
module Testing
|
3
|
+
# Assert that the left and right frame are equal.
|
4
|
+
#
|
5
|
+
# Raises a detailed `AssertionError` if the frames differ.
|
6
|
+
# This function is intended for use in unit tests.
|
7
|
+
#
|
8
|
+
# @param left [Object]
|
9
|
+
# The first DataFrame or LazyFrame to compare.
|
10
|
+
# @param right [Object]
|
11
|
+
# The second DataFrame or LazyFrame to compare.
|
12
|
+
# @param check_row_order [Boolean]
|
13
|
+
# Require row order to match.
|
14
|
+
# @param check_column_order [Boolean]
|
15
|
+
# Require column order to match.
|
16
|
+
# @param check_dtype [Boolean]
|
17
|
+
# Require data types to match.
|
18
|
+
# @param check_exact [Boolean]
|
19
|
+
# Require float values to match exactly. If set to `false`, values are considered
|
20
|
+
# equal when within tolerance of each other (see `rtol` and `atol`).
|
21
|
+
# Only affects columns with a Float data type.
|
22
|
+
# @param rtol [Float]
|
23
|
+
# Relative tolerance for inexact checking. Fraction of values in `right`.
|
24
|
+
# @param atol [Float]
|
25
|
+
# Absolute tolerance for inexact checking.
|
26
|
+
# @param categorical_as_str [Boolean]
|
27
|
+
# Cast categorical columns to string before comparing. Enabling this helps
|
28
|
+
# compare columns that do not share the same string cache.
|
29
|
+
#
|
30
|
+
# @return [nil]
|
31
|
+
def assert_frame_equal(
  left,
  right,
  check_row_order: true,
  check_column_order: true,
  check_dtype: true,
  check_exact: false,
  rtol: 1e-5,
  atol: 1e-8,
  categorical_as_str: false
)
  # Truthy when the inputs are handled as lazy frames; invalid inputs raise here.
  is_lazy = _assert_correct_input_type(left, right)
  label = is_lazy ? "LazyFrames" : "DataFrames"

  _assert_frame_schema_equal(
    left,
    right,
    check_column_order: check_column_order,
    check_dtype: check_dtype,
    objects: label
  )

  # Lazy inputs are materialized before any row/value comparison.
  if is_lazy
    left = left.collect
    right = right.collect
  end

  left_rows = left.height
  right_rows = right.height
  unless left_rows == right_rows
    raise_assertion_error(label, "number of rows does not match", left_rows, right_rows)
  end

  left, right = _sort_dataframes(left, right) unless check_row_order

  # Compare column by column (looked up by name) so a failure names the column.
  left.columns.each do |column|
    left_values = left.get_column(column)
    right_values = right.get_column(column)
    begin
      _assert_series_values_equal(
        left_values,
        right_values,
        check_exact: check_exact,
        rtol: rtol,
        atol: atol,
        categorical_as_str: categorical_as_str
      )
    rescue AssertionError
      raise_assertion_error(
        label,
        "value mismatch for column #{column.inspect}",
        left_values.to_a,
        right_values.to_a
      )
    end
  end
end
|
88
|
+
|
89
|
+
# Assert that the left and right frame are **not** equal.
|
90
|
+
#
|
91
|
+
# This function is intended for use in unit tests.
|
92
|
+
#
|
93
|
+
# @param left [Object]
|
94
|
+
# The first DataFrame or LazyFrame to compare.
|
95
|
+
# @param right [Object]
|
96
|
+
# The second DataFrame or LazyFrame to compare.
|
97
|
+
# @param check_row_order [Boolean]
|
98
|
+
# Require row order to match.
|
99
|
+
# @param check_column_order [Boolean]
|
100
|
+
# Require column order to match.
|
101
|
+
# @param check_dtype [Boolean]
|
102
|
+
# Require data types to match.
|
103
|
+
# @param check_exact [Boolean]
|
104
|
+
# Require float values to match exactly. If set to `false`, values are considered
|
105
|
+
# equal when within tolerance of each other (see `rtol` and `atol`).
|
106
|
+
# Only affects columns with a Float data type.
|
107
|
+
# @param rtol [Float]
|
108
|
+
# Relative tolerance for inexact checking. Fraction of values in `right`.
|
109
|
+
# @param atol [Float]
|
110
|
+
# Absolute tolerance for inexact checking.
|
111
|
+
# @param categorical_as_str [Boolean]
|
112
|
+
# Cast categorical columns to string before comparing. Enabling this helps
|
113
|
+
# compare columns that do not share the same string cache.
|
114
|
+
#
|
115
|
+
# @return [nil]
|
116
|
+
def assert_frame_not_equal(
  left,
  right,
  check_row_order: true,
  check_column_order: true,
  check_dtype: true,
  check_exact: false,
  rtol: 1e-5,
  atol: 1e-8,
  categorical_as_str: false
)
  assert_frame_equal(
    left,
    right,
    check_column_order: check_column_order,
    check_row_order: check_row_order,
    check_dtype: check_dtype,
    check_exact: check_exact,
    rtol: rtol,
    atol: atol,
    categorical_as_str: categorical_as_str
  )
rescue AssertionError
  # The equality assertion failed, which is exactly what this method wants.
  nil
else
  # No failure was raised, so the frames compared equal.
  raise AssertionError, "frames are equal"
end
|
146
|
+
|
147
|
+
# Assert that the left and right Series are equal.
|
148
|
+
#
|
149
|
+
# Raises a detailed `AssertionError` if the Series differ.
|
150
|
+
# This function is intended for use in unit tests.
|
151
|
+
#
|
152
|
+
# @param left [Object]
|
153
|
+
# The first Series to compare.
|
154
|
+
# @param right [Object]
|
155
|
+
# The second Series to compare.
|
156
|
+
# @param check_dtype [Boolean]
|
157
|
+
# Require data types to match.
|
158
|
+
# @param check_names [Boolean]
|
159
|
+
# Require names to match.
|
160
|
+
# @param check_exact [Boolean]
|
161
|
+
# Require float values to match exactly. If set to `false`, values are considered
|
162
|
+
# equal when within tolerance of each other (see `rtol` and `atol`).
|
163
|
+
# Only affects columns with a Float data type.
|
164
|
+
# @param rtol [Float]
|
165
|
+
# Relative tolerance for inexact checking, given as a fraction of the values in
|
166
|
+
# `right`.
|
167
|
+
# @param atol [Float]
|
168
|
+
# Absolute tolerance for inexact checking.
|
169
|
+
# @param categorical_as_str [Boolean]
|
170
|
+
# Cast categorical columns to string before comparing. Enabling this helps
|
171
|
+
# compare columns that do not share the same string cache.
|
172
|
+
#
|
173
|
+
# @return [nil]
|
174
|
+
def assert_series_equal(
  left,
  right,
  check_dtype: true,
  check_names: true,
  check_exact: false,
  rtol: 1e-5,
  atol: 1e-8,
  categorical_as_str: false
)
  # Both inputs must be Series before any attribute is inspected.
  unless left.is_a?(Series) && right.is_a?(Series)
    raise_assertion_error(
      "inputs",
      "unexpected input types",
      left.class.name,
      right.class.name
    )
  end

  # Cheap metadata checks run first: length, then name, then dtype.
  raise_assertion_error("Series", "length mismatch", left.len, right.len) if left.len != right.len

  if check_names && left.name != right.name
    raise_assertion_error("Series", "name mismatch", left.name, right.name)
  end

  if check_dtype && left.dtype != right.dtype
    raise_assertion_error("Series", "dtype mismatch", left.dtype, right.dtype)
  end

  _assert_series_values_equal(
    left,
    right,
    check_exact: check_exact,
    rtol: rtol,
    atol: atol,
    categorical_as_str: categorical_as_str
  )
end
|
214
|
+
|
215
|
+
# Assert that the left and right Series are **not** equal.
|
216
|
+
#
|
217
|
+
# This function is intended for use in unit tests.
|
218
|
+
#
|
219
|
+
# @param left [Object]
|
220
|
+
# The first Series to compare.
|
221
|
+
# @param right [Object]
|
222
|
+
# The second Series to compare.
|
223
|
+
# @param check_dtype [Boolean]
|
224
|
+
# Require data types to match.
|
225
|
+
# @param check_names [Boolean]
|
226
|
+
# Require names to match.
|
227
|
+
# @param check_exact [Boolean]
|
228
|
+
# Require float values to match exactly. If set to `false`, values are considered
|
229
|
+
# equal when within tolerance of each other (see `rtol` and `atol`).
|
230
|
+
# Only affects columns with a Float data type.
|
231
|
+
# @param rtol [Float]
|
232
|
+
# Relative tolerance for inexact checking, given as a fraction of the values in
|
233
|
+
# `right`.
|
234
|
+
# @param atol [Float]
|
235
|
+
# Absolute tolerance for inexact checking.
|
236
|
+
# @param categorical_as_str [Boolean]
|
237
|
+
# Cast categorical columns to string before comparing. Enabling this helps
|
238
|
+
# compare columns that do not share the same string cache.
|
239
|
+
#
|
240
|
+
# @return [nil]
|
241
|
+
def assert_series_not_equal(
  left,
  right,
  check_dtype: true,
  check_names: true,
  check_exact: false,
  rtol: 1e-5,
  atol: 1e-8,
  categorical_as_str: false
)
  assert_series_equal(
    left,
    right,
    check_dtype: check_dtype,
    check_names: check_names,
    check_exact: check_exact,
    rtol: rtol,
    atol: atol,
    categorical_as_str: categorical_as_str
  )
rescue AssertionError
  # The equality assertion failed, which is exactly what this method wants.
  nil
else
  # No failure was raised, so the Series compared equal.
  raise AssertionError, "Series are equal"
end
|
269
|
+
|
270
|
+
private
|
271
|
+
|
272
|
+
# Check that both inputs are frames of the same kind.
#
# @param left [Object]
#   The first input.
# @param right [Object]
#   The second input.
#
# @return [Boolean]
#   `false` when both inputs are a DataFrame, `true` when both are a LazyFrame.
#
# @raise [AssertionError]
#   Raised through `raise_assertion_error` for any other combination,
#   including a mix of DataFrame and LazyFrame.
def _assert_correct_input_type(left, right)
  return false if left.is_a?(DataFrame) && right.is_a?(DataFrame)

  # `assert_frame_equal` calls `collect` on *both* inputs when this returns
  # true, so both of them have to be lazy.
  return true if left.is_a?(LazyFrame) && right.is_a?(LazyFrame)

  raise_assertion_error(
    "inputs",
    "unexpected input types",
    left.class.name,
    right.class.name
  )
end
|
286
|
+
|
287
|
+
# Check that two frames have compatible schemas.
#
# @param left [Object]
#   First frame; must respond to `schema`.
# @param right [Object]
#   Second frame; must respond to `schema`.
# @param check_dtype [Boolean]
#   Require the data type of every column to match.
# @param check_column_order [Boolean]
#   Require the columns to appear in the same order.
# @param objects [String]
#   Plural label used in error messages (e.g. "DataFrames").
#
# @return [nil]
#
# @raise [AssertionError]
#   If the column names, column order (when checked) or dtypes (when checked) differ.
def _assert_frame_schema_equal(
  left,
  right,
  check_dtype:,
  check_column_order:,
  objects:
)
  left_schema, right_schema = left.schema, right.schema
  left_columns, right_columns = left_schema.keys, right_schema.keys

  # Fast path for equal frames. Hash equality ignores key order, so the
  # column order is compared explicitly as well.
  return if left_schema == right_schema && left_columns == right_columns

  # Special error message for when column names do not match. This is a set
  # comparison: a pure reordering is handled by the order check below.
  left_not_right = left_columns - right_columns
  right_not_left = right_columns - left_columns
  singular = objects.chomp("s")
  unless left_not_right.empty?
    raise AssertionError, "columns #{left_not_right.inspect} in left #{singular}, but not in right"
  end
  unless right_not_left.empty?
    raise AssertionError, "columns #{right_not_left.inspect} in right #{singular}, but not in left"
  end

  if check_column_order && left_columns != right_columns
    raise_assertion_error(objects, "columns are not in the same order", left_columns, right_columns)
  end

  if check_dtype
    left_schema_dict, right_schema_dict = left_schema.to_h, right_schema.to_h
    # Order-insensitive comparison: only the dtype per column name matters here.
    if left_schema_dict != right_schema_dict
      raise_assertion_error(objects, "dtypes do not match", left_schema_dict, right_schema_dict)
    end
  end

  nil
end
|
329
|
+
|
330
|
+
# Sort both frames by all columns of `left` so rows can be compared
# independently of their original order.
#
# @return [Array]
#   The sorted `left` and `right` frames, in that order.
#
# @raise [InvalidAssert]
#   If sorting either frame raises an error.
def _sort_dataframes(left, right)
  sort_keys = left.columns
  begin
    sorted_left = left.sort(sort_keys)
    sorted_right = right.sort(sort_keys)
    [sorted_left, sorted_right]
  rescue
    raise InvalidAssert, "cannot set `check_row_order: false` on frame with unsortable columns"
  end
end
|
341
|
+
|
342
|
+
# Compare the values of two Series.
#
# @param left [Object]
#   The first Series.
# @param right [Object]
#   The second Series.
# @param check_exact [Boolean]
#   When true, any unequal element is an error; otherwise float Series are
#   compared within tolerance.
# @param rtol [Float]
#   Relative tolerance forwarded to the tolerance check.
# @param atol [Float]
#   Absolute tolerance forwarded to the tolerance check.
# @param categorical_as_str [Boolean]
#   Cast Categorical Series to String before comparing.
#
# @return [nil]
#   NOTE(review): on the final path this returns whatever
#   `_assert_series_values_within_tolerance` returns; that helper is not
#   visible here - confirm.
#
# @raise [AssertionError]
#   Raised through `raise_assertion_error` when the values differ.
def _assert_series_values_equal(
  left,
  right,
  check_exact:,
  rtol:,
  atol:,
  categorical_as_str:
)
  if categorical_as_str
    left = left.cast(String) if left.dtype == Categorical
    right = right.cast(String) if right.dtype == Categorical
  end

  # Determine unequal elements
  begin
    unequal = left.ne_missing(right)
  rescue
    raise_assertion_error(
      "Series",
      "incompatible data types",
      left.dtype,
      right.dtype
    )
  end

  # Check nested dtypes in separate function
  if _comparing_nested_floats(left.dtype, right.dtype)
    begin
      # `left` and `right` are positional parameters of the nested helper;
      # passing them as keywords raises ArgumentError instead of comparing.
      _assert_series_nested_values_equal(
        left.filter(unequal),
        right.filter(unequal),
        check_exact: check_exact,
        rtol: rtol,
        atol: atol,
        categorical_as_str: categorical_as_str
      )
    rescue AssertionError
      raise_assertion_error(
        "Series",
        "nested value mismatch",
        left.to_a,
        right.to_a
      )
    else
      return
    end
  end

  # If no differences found during exact checking, we're done
  return unless unequal.any

  # Only do inexact checking for float types
  if check_exact || !left.dtype.float? || !right.dtype.float?
    raise_assertion_error(
      "Series", "exact value mismatch", left.to_a, right.to_a
    )
  end

  _assert_series_null_values_match(left, right)
  _assert_series_nan_values_match(left, right)
  _assert_series_values_within_tolerance(
    left,
    right,
    unequal,
    rtol: rtol,
    atol: atol
  )
end
|
416
|
+
|
417
|
+
# Compare two nested Series by comparing their inner values.
#
# When both dtypes are list-like the two Series are zipped and each pair of
# items is compared; otherwise both are unnested through `struct.unnest` and
# the results are zipped and compared pairwise.
#
# @raise [AssertionError]
#   Raised through `raise_assertion_error` when a list item is nil on either side.
def _assert_series_nested_values_equal(
  left,
  right,
  check_exact:,
  rtol:,
  atol:,
  categorical_as_str:
)
  # Same comparison settings are forwarded to every recursive call.
  options = {
    check_exact: check_exact,
    rtol: rtol,
    atol: atol,
    categorical_as_str: categorical_as_str
  }

  if _comparing_lists(left.dtype, right.dtype)
    # compare nested lists element-wise
    left.zip(right) do |left_item, right_item|
      if left_item.nil? || right_item.nil?
        raise_assertion_error("Series", "nested value mismatch", left_item, right_item)
      end

      _assert_series_values_equal(left_item, right_item, **options)
    end
  else
    # unnest structs as series and compare
    left_unnested = left.struct.unnest
    right_unnested = right.struct.unnest
    left_unnested.zip(right_unnested) do |left_field, right_field|
      _assert_series_values_equal(left_field, right_field, **options)
    end
  end
end
|
457
|
+
|
458
|
+
# Raise when the result of `is_null` differs between the two Series.
#
# @raise [AssertionError]
#   Raised through `raise_assertion_error` on a mismatch.
def _assert_series_null_values_match(left, right)
  mismatch = left.is_null != right.is_null
  return unless mismatch.any

  raise_assertion_error("Series", "null value mismatch", left.to_a, right.to_a)
end
|
466
|
+
|
467
|
+
# Raise when the result of `is_nan` differs between the two Series.
# Does nothing unless `_comparing_floats` accepts both dtypes.
#
# @raise [AssertionError]
#   Raised through `raise_assertion_error` on a mismatch.
def _assert_series_nan_values_match(left, right)
  return unless _comparing_floats(left.dtype, right.dtype)

  mismatch = left.is_nan != right.is_nan
  return unless mismatch.any

  raise_assertion_error("Series", "nan value mismatch", left.to_a, right.to_a)
end
|
481
|
+
|
482
|
+
# Whether both data types are float types.
#
# The arguments are dtypes (the caller passes `left.dtype` / `right.dtype`),
# so the `float?` predicate is used, as in the other dtype checks in this module.
#
# @param left [Object]
#   Data type of the first Series.
# @param right [Object]
#   Data type of the second Series.
#
# @return [Boolean]
def _comparing_floats(left, right)
  left.float? && right.float?
end
|
485
|
+
|
486
|
+
# Whether both data types are list-like, i.e. each equals `List` or `Array`.
#
# @return [Boolean]
def _comparing_lists(left, right)
  list_like = [List, Array]
  [left, right].all? { |dtype| list_like.include?(dtype) }
end
|
489
|
+
|
490
|
+
# Whether both data types compare equal to `Struct`.
#
# @return [Boolean]
def _comparing_structs(left, right)
  [left, right].all? { |dtype| dtype == Struct }
end
|
493
|
+
|
494
|
+
# Whether both data types are nested (both list-like or both struct) and
# both report `float?`.
#
# NOTE(review): `float?` is asked of the outer dtype itself, not of its inner
# type - confirm this is the intended check for nested floats.
#
# @return [Boolean]
def _comparing_nested_floats(left, right)
  nested = _comparing_lists(left, right) || _comparing_structs(left, right)
  return false unless nested

  left.float? && right.float?
end
|
501
|
+
|
502
|
+
# Raise an `AssertionError` describing how `left` and `right` differ.
#
# @param objects [String]
#   Label for what is being compared (e.g. "Series").
# @param detail [String]
#   Short description of the mismatch.
# @param left [Object]
#   Left-hand value, interpolated into the message.
# @param right [Object]
#   Right-hand value, interpolated into the message.
#
# @raise [AssertionError]
#   Always.
def raise_assertion_error(objects, detail, left, right)
  raise AssertionError, "#{objects} are different (#{detail})\n[left]: #{left}\n[right]: #{right}"
end
|
506
|
+
end
|
507
|
+
end
|