onekit 2.2.2__tar.gz → 3.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {onekit-2.2.2 → onekit-3.0.1}/PKG-INFO +1 -1
- {onekit-2.2.2 → onekit-3.0.1}/pyproject.toml +1 -1
- onekit-3.0.1/src/onekit/exception.py +142 -0
- {onekit-2.2.2 → onekit-3.0.1}/src/onekit/pandaskit.py +3 -3
- {onekit-2.2.2 → onekit-3.0.1}/src/onekit/pythonkit.py +18 -0
- {onekit-2.2.2 → onekit-3.0.1}/src/onekit/sparkkit.py +59 -92
- {onekit-2.2.2 → onekit-3.0.1}/LICENSE +0 -0
- {onekit-2.2.2 → onekit-3.0.1}/README.md +0 -0
- {onekit-2.2.2 → onekit-3.0.1}/src/onekit/__init__.py +0 -0
- {onekit-2.2.2 → onekit-3.0.1}/src/onekit/dekit.py +0 -0
- {onekit-2.2.2 → onekit-3.0.1}/src/onekit/mathkit.py +0 -0
- {onekit-2.2.2 → onekit-3.0.1}/src/onekit/numpykit.py +0 -0
- {onekit-2.2.2 → onekit-3.0.1}/src/onekit/optfunckit.py +0 -0
- {onekit-2.2.2 → onekit-3.0.1}/src/onekit/sklearnkit.py +0 -0
- {onekit-2.2.2 → onekit-3.0.1}/src/onekit/vizkit.py +0 -0
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import (
|
|
3
|
+
Any,
|
|
4
|
+
Iterable,
|
|
5
|
+
)
|
|
6
|
+
|
|
7
|
+
from pyspark.sql import DataFrame as SparkDF
|
|
8
|
+
|
|
9
|
+
from onekit import pythonkit as pk
|
|
10
|
+
|
|
11
|
+
__all__ = (
|
|
12
|
+
"ColumnNotFoundError",
|
|
13
|
+
"InvalidChoiceError",
|
|
14
|
+
"OnekitError",
|
|
15
|
+
"RowCountMismatchError",
|
|
16
|
+
"RowValueMismatchError",
|
|
17
|
+
"SchemaMismatchError",
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class OnekitError(Exception):
    """Root of the onekit exception hierarchy.

    The other exceptions defined in this module derive from this class, so a
    single ``except OnekitError`` clause handles any of them.
    """
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ColumnNotFoundError(OnekitError):
    """Exception for missing columns in dataframe.

    Parameters
    ----------
    missing_cols : Iterable[str]
        Names of the columns that were not found. The iterable is consumed
        once and stored as a list, so one-shot iterables such as generators
        are rendered by name in the message. A single string is treated as
        one column name rather than as an iterable of characters.

    Attributes
    ----------
    missing_cols : list of str
        The missing column names.
    message : str
        Human-readable description, also used as the exception text.

    See Also
    --------
    check_column_present : Validate column presence.
    has_column : Evaluate column presence.

    Examples
    --------
    >>> from onekit.exception import ColumnNotFoundError
    >>> error = ColumnNotFoundError(missing_cols=["a", "b", "c"])
    >>> error.message
    "following columns not found: ['a', 'b', 'c']"
    """

    def __init__(self, missing_cols: Iterable[str]):
        # Materialize up front: formatting a generator directly would put
        # "<generator object ...>" into the message instead of the names.
        if isinstance(missing_cols, str):
            missing_cols = [missing_cols]
        else:
            missing_cols = list(missing_cols)

        self.missing_cols = missing_cols
        self.message = f"following columns not found: {missing_cols}"
        super().__init__(self.message)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class InvalidChoiceError(OnekitError):
    """Exception for invalid choice error.

    Parameters
    ----------
    value : Any
        The rejected value.
    choices : Iterable[Any], optional
        The permitted values. When given, they are appended to the message.

    Attributes
    ----------
    value : Any
        The rejected value, as passed in.
    choices : Iterable[Any] or None
        The permitted values, as passed in.
    message : str
        Human-readable description, also used as the exception text.

    Notes
    -----
    The message is prefixed with the name of the variable that holds
    ``value`` in the code constructing the exception, as reported by
    ``pythonkit.parent_varname``. If no such variable can be determined
    (for instance when a literal is passed), the generic label ``value``
    is used so that building the exception never fails itself.

    Examples
    --------
    >>> from onekit.exception import InvalidChoiceError
    >>> x = 0
    >>> error = InvalidChoiceError(value=x, choices=[1, 2, 3])
    >>> error.message
    'x=0 invalid choice - choose from [1, 2, 3]'
    """

    def __init__(self, value: Any, choices: Iterable[Any] | None = None):
        self.value = value
        self.choices = choices

        # parent_varname inspects the frame two levels above itself, so it has
        # to be called directly from __init__ to see the constructor's caller.
        try:
            varname = pk.parent_varname(value)
        except (IndexError, AttributeError):
            # No local variable refers to `value` (or no such frame exists);
            # do not let the name lookup mask the error being reported.
            varname = "value"

        msg = f"{varname}={value} invalid choice"
        if choices is not None:
            msg += f" - choose from {choices}"
        self.message = msg
        super().__init__(self.message)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class RowCountMismatchError(OnekitError):
    """Raised when two row counts differ.

    Parameters
    ----------
    num_lft : int
        Row count of the left side.
    num_rgt : int
        Row count of the right side.

    Attributes
    ----------
    num_lft, num_rgt : int
        The counts as passed in.
    num_diff : int
        Absolute difference between the two counts.
    message : str
        Comma-separated summary of the three numbers, also used as the
        exception text.

    See Also
    --------
    assert_row_count_equal : Validate row counts.
    is_row_count_equal : Evaluate row counts.

    Examples
    --------
    >>> from onekit.exception import RowCountMismatchError
    >>> error = RowCountMismatchError(num_lft=10000, num_rgt=12000)
    >>> error.message
    'num_lft=10_000, num_rgt=12_000, num_diff=2_000'
    """

    def __init__(self, num_lft: int, num_rgt: int):
        self.num_lft, self.num_rgt = num_lft, num_rgt
        self.num_diff = abs(num_lft - num_rgt)

        lft_part = f"num_lft={pk.num_to_str(num_lft)}"
        rgt_part = f"num_rgt={pk.num_to_str(num_rgt)}"
        diff_part = f"num_diff={pk.num_to_str(self.num_diff)}"

        self.message = pk.concat_strings(", ", lft_part, rgt_part, diff_part)
        super().__init__(self.message)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class RowValueMismatchError(OnekitError):
    """Raised when the row values of two dataframes differ.

    Parameters
    ----------
    lft_rows : SparkDF
        Mismatching rows attributed to the left dataframe.
    rgt_rows : SparkDF
        Mismatching rows attributed to the right dataframe.
    num_lft : int
        Number of rows reported for ``lft_rows``.
    num_rgt : int
        Number of rows reported for ``rgt_rows``.

    Attributes
    ----------
    lft_rows, rgt_rows : SparkDF
        The dataframes as passed in.
    num_lft, num_rgt : int
        The counts as passed in.
    message : str
        Comma-separated summary of both counts, also used as the exception
        text.

    See Also
    --------
    assert_row_value_equal : Validate row values.
    is_row_value_equal : Evaluate row values.
    """

    def __init__(
        self,
        lft_rows: SparkDF,
        rgt_rows: SparkDF,
        num_lft: int,
        num_rgt: int,
    ):
        self.lft_rows, self.rgt_rows = lft_rows, rgt_rows
        self.num_lft, self.num_rgt = num_lft, num_rgt

        lft_part = f"num_lft={pk.num_to_str(num_lft)}"
        rgt_part = f"num_rgt={pk.num_to_str(num_rgt)}"

        self.message = pk.concat_strings(", ", lft_part, rgt_part)
        super().__init__(self.message)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
class SchemaMismatchError(OnekitError):
    """Raised when two schemas differ.

    Parameters
    ----------
    lft_schema : str
        String representation of the left schema.
    rgt_schema : str
        String representation of the right schema.

    Attributes
    ----------
    lft_schema, rgt_schema : str
        The schema strings as passed in.
    message : str
        A ``num_diff=<n>`` line followed by the output of
        ``pythonkit.highlight_string_differences``; ``n`` is the number of
        ``|`` characters on the second line of that output.

    See Also
    --------
    assert_schema_equal : Validate schemas.
    is_schema_equal : Evaluate schemas.
    """

    def __init__(self, lft_schema: str, rgt_schema: str):
        self.lft_schema, self.rgt_schema = lft_schema, rgt_schema

        diff_view = pk.highlight_string_differences(lft_schema, rgt_schema)
        # The second line of the highlight output carries the "|" markers.
        marker_line = diff_view.splitlines()[1]
        num_diff = marker_line.count("|")

        self.message = pk.concat_strings(os.linesep, f"{num_diff=}", diff_view)
        super().__init__(self.message)
|
|
@@ -209,7 +209,7 @@ def profile(df: PandasDF, /, *, q: list[int] | None = None) -> PandasDF:
|
|
|
209
209
|
q95 NaN 1.0 NaN
|
|
210
210
|
max NaN 1.0 NaN
|
|
211
211
|
"""
|
|
212
|
-
|
|
212
|
+
num_rows, _ = df.shape
|
|
213
213
|
quantiles = q or (5, 25, 50, 75, 95)
|
|
214
214
|
|
|
215
215
|
basic_info_df = pd.concat(
|
|
@@ -220,12 +220,12 @@ def profile(df: PandasDF, /, *, q: list[int] | None = None) -> PandasDF:
|
|
|
220
220
|
df.isnull()
|
|
221
221
|
.sum()
|
|
222
222
|
.to_frame("isnull")
|
|
223
|
-
.assign(isnull_pct=lambda df: 100 * df["isnull"] /
|
|
223
|
+
.assign(isnull_pct=lambda df: 100 * df["isnull"] / num_rows)
|
|
224
224
|
),
|
|
225
225
|
(
|
|
226
226
|
df.nunique()
|
|
227
227
|
.to_frame("unique")
|
|
228
|
-
.assign(unique_pct=lambda df: 100 * df["unique"] /
|
|
228
|
+
.assign(unique_pct=lambda df: 100 * df["unique"] / num_rows)
|
|
229
229
|
),
|
|
230
230
|
],
|
|
231
231
|
axis=1,
|
|
@@ -60,6 +60,7 @@ __all__ = (
|
|
|
60
60
|
"num_days",
|
|
61
61
|
"num_to_str",
|
|
62
62
|
"op",
|
|
63
|
+
"parent_varname",
|
|
63
64
|
"prompt_yes_no",
|
|
64
65
|
"reduce_sets",
|
|
65
66
|
"remove_punctuation",
|
|
@@ -958,6 +959,23 @@ def op(func: Callable, const: Any, /) -> Callable[[Any], Any]:
|
|
|
958
959
|
return inner
|
|
959
960
|
|
|
960
961
|
|
|
962
|
+
def parent_varname(x: Any, /) -> str:
    """Returns the name of the parent variable of :math:`x`.

    The lookup inspects the local variables of the frame two levels above
    this function, i.e. the code that called the function which in turn
    called ``parent_varname``, and returns the first name bound to the very
    same object as ``x``.

    Notes
    -----
    Matching is done by identity (``is``), not equality. If several local
    variables refer to the same object, the first one found in the frame's
    local namespace is returned.

    Raises
    ------
    IndexError
        If the parent frame is not available or none of its local variables
        refers to ``x``.

    Examples
    --------
    >>> from onekit import pythonkit as pk
    >>> my_var = "my_string_value"
    >>> def f(x) -> str:
    ...     return pk.parent_varname(x)
    ...
    >>> f(my_var)
    'my_var'
    """
    frame = inspect.currentframe()
    parent = None
    try:
        if frame is not None and frame.f_back is not None:
            parent = frame.f_back.f_back
        if parent is None:
            raise IndexError("parent_varname: parent frame is not available")

        for name, value in parent.f_locals.items():
            if value is x:
                return name

        raise IndexError(
            "parent_varname: no local variable in the parent frame refers to x"
        )
    finally:
        # Drop frame references explicitly to avoid reference cycles.
        del frame, parent
|
|
977
|
+
|
|
978
|
+
|
|
961
979
|
def prompt_yes_no(question: str, /, *, default: str | None = None) -> bool:
|
|
962
980
|
"""Prompt yes-no question.
|
|
963
981
|
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import datetime as dt
|
|
2
2
|
import functools
|
|
3
3
|
import math
|
|
4
|
-
import os
|
|
5
4
|
from typing import (
|
|
6
5
|
Any,
|
|
7
6
|
Iterable,
|
|
@@ -26,7 +25,7 @@ __all__ = (
|
|
|
26
25
|
"any_col",
|
|
27
26
|
"assert_dataframe_equal",
|
|
28
27
|
"assert_row_count_equal",
|
|
29
|
-
"
|
|
28
|
+
"assert_row_value_equal",
|
|
30
29
|
"assert_schema_equal",
|
|
31
30
|
"bool_to_int",
|
|
32
31
|
"bool_to_str",
|
|
@@ -38,7 +37,7 @@ __all__ = (
|
|
|
38
37
|
"has_column",
|
|
39
38
|
"is_dataframe_equal",
|
|
40
39
|
"is_row_count_equal",
|
|
41
|
-
"
|
|
40
|
+
"is_row_value_equal",
|
|
42
41
|
"is_schema_equal",
|
|
43
42
|
"join",
|
|
44
43
|
"peek",
|
|
@@ -55,54 +54,13 @@ __all__ = (
|
|
|
55
54
|
"with_weekday",
|
|
56
55
|
)
|
|
57
56
|
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
def __init__(self, missing_cols: list[str]):
|
|
67
|
-
self.missing_cols = missing_cols
|
|
68
|
-
self.message = f"following columns not found: {missing_cols}"
|
|
69
|
-
super().__init__(self.message)
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
class RowCountMismatchError(SparkkitError):
|
|
73
|
-
"""Exception if row counts mismatch."""
|
|
74
|
-
|
|
75
|
-
def __init__(self, n_lft: int, n_rgt: int):
|
|
76
|
-
n_diff = abs(n_lft - n_rgt)
|
|
77
|
-
self.n_lft = n_lft
|
|
78
|
-
self.n_rgt = n_rgt
|
|
79
|
-
self.n_diff = n_diff
|
|
80
|
-
self.message = f"{n_lft=:_}, {n_rgt=:_}, {n_diff=:_}"
|
|
81
|
-
super().__init__(self.message)
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
class RowMismatchError(SparkkitError):
|
|
85
|
-
"""Exception if rows mismatch."""
|
|
86
|
-
|
|
87
|
-
def __init__(self, lft_rows: SparkDF, rgt_rows: SparkDF, n_lft: int, n_rgt: int):
|
|
88
|
-
self.lft_rows = lft_rows
|
|
89
|
-
self.rgt_rows = rgt_rows
|
|
90
|
-
self.n_lft = n_lft
|
|
91
|
-
self.n_rgt = n_rgt
|
|
92
|
-
self.message = f"{n_lft=:_}, {n_rgt=:_}"
|
|
93
|
-
super().__init__(self.message)
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
class SchemaMismatchError(SparkkitError):
|
|
97
|
-
"""Exception if schemas mismatch."""
|
|
98
|
-
|
|
99
|
-
def __init__(self, lft_schema: str, rgt_schema: str):
|
|
100
|
-
self.lft_schema = lft_schema
|
|
101
|
-
self.rgt_schema = rgt_schema
|
|
102
|
-
msg = pk.highlight_string_differences(lft_schema, rgt_schema)
|
|
103
|
-
n_diff = sum(c == "|" for c in msg.splitlines()[1])
|
|
104
|
-
self.message = pk.concat_strings(os.linesep, f"{n_diff=}", msg)
|
|
105
|
-
super().__init__(self.message)
|
|
57
|
+
from onekit.exception import (
|
|
58
|
+
ColumnNotFoundError,
|
|
59
|
+
OnekitError,
|
|
60
|
+
RowCountMismatchError,
|
|
61
|
+
RowValueMismatchError,
|
|
62
|
+
SchemaMismatchError,
|
|
63
|
+
)
|
|
106
64
|
|
|
107
65
|
|
|
108
66
|
def add_prefix(df: SparkDF, prefix: str, subset: list[str] | None = None) -> SparkDF:
|
|
@@ -246,19 +204,24 @@ def assert_dataframe_equal(lft_df: SparkDF, rgt_df: SparkDF, /) -> None:
|
|
|
246
204
|
If schemas are not equal.
|
|
247
205
|
RowCountMismatchError
|
|
248
206
|
If row counts are not equal.
|
|
249
|
-
|
|
250
|
-
If
|
|
207
|
+
RowValueMismatchError
|
|
208
|
+
If row values are not equal.
|
|
251
209
|
|
|
252
210
|
See Also
|
|
253
211
|
--------
|
|
254
212
|
assert_schema_equal : Validate schemas.
|
|
255
213
|
assert_row_count_equal : Validate row counts.
|
|
256
|
-
|
|
214
|
+
assert_row_value_equal : Validate row values.
|
|
257
215
|
|
|
258
216
|
Examples
|
|
259
217
|
--------
|
|
260
218
|
>>> from pyspark.sql import Row, SparkSession
|
|
261
219
|
>>> from onekit import sparkkit as sk
|
|
220
|
+
>>> from onekit.exception import (
|
|
221
|
+
... SchemaMismatchError,
|
|
222
|
+
... RowCountMismatchError,
|
|
223
|
+
... RowValueMismatchError,
|
|
224
|
+
... )
|
|
262
225
|
>>> spark = SparkSession.builder.getOrCreate()
|
|
263
226
|
>>> lft_df = spark.createDataFrame([Row(x=1, y=2), Row(x=3, y=4)])
|
|
264
227
|
>>> rgt_df = spark.createDataFrame([Row(x=1, y=2), Row(x=3, y=4)])
|
|
@@ -269,10 +232,10 @@ def assert_dataframe_equal(lft_df: SparkDF, rgt_df: SparkDF, /) -> None:
|
|
|
269
232
|
>>> rgt_df = spark.createDataFrame([Row(z=1, y="a", x=9), Row(z=3, y="b", x=8)])
|
|
270
233
|
>>> try:
|
|
271
234
|
... sk.assert_dataframe_equal(lft_df, rgt_df)
|
|
272
|
-
... except
|
|
235
|
+
... except SchemaMismatchError as error:
|
|
273
236
|
... print(error)
|
|
274
237
|
...
|
|
275
|
-
|
|
238
|
+
num_diff=15
|
|
276
239
|
struct<x:bigint,y:bigint>
|
|
277
240
|
| ||| |||||||||||
|
|
278
241
|
struct<z:bigint,y:string,x:bigint>
|
|
@@ -281,23 +244,23 @@ def assert_dataframe_equal(lft_df: SparkDF, rgt_df: SparkDF, /) -> None:
|
|
|
281
244
|
>>> rgt_df = spark.createDataFrame([Row(x=3, y=4), Row(x=5, y=6)])
|
|
282
245
|
>>> try:
|
|
283
246
|
... sk.assert_dataframe_equal(lft_df, rgt_df)
|
|
284
|
-
... except
|
|
247
|
+
... except RowCountMismatchError as error:
|
|
285
248
|
... print(error)
|
|
286
249
|
...
|
|
287
|
-
|
|
250
|
+
num_lft=1, num_rgt=2, num_diff=1
|
|
288
251
|
|
|
289
252
|
>>> lft_df = spark.createDataFrame([Row(x=1, y=2), Row(x=3, y=4), Row(x=5, y=6)])
|
|
290
253
|
>>> rgt_df = spark.createDataFrame([Row(x=3, y=4), Row(x=5, y=9), Row(x=7, y=8)])
|
|
291
254
|
>>> try:
|
|
292
255
|
... sk.assert_dataframe_equal(lft_df, rgt_df)
|
|
293
|
-
... except
|
|
256
|
+
... except RowValueMismatchError as error:
|
|
294
257
|
... print(error)
|
|
295
258
|
...
|
|
296
|
-
|
|
259
|
+
num_lft=2, num_rgt=2
|
|
297
260
|
"""
|
|
298
261
|
assert_schema_equal(lft_df, rgt_df)
|
|
299
262
|
assert_row_count_equal(lft_df, rgt_df)
|
|
300
|
-
|
|
263
|
+
assert_row_value_equal(lft_df, rgt_df)
|
|
301
264
|
|
|
302
265
|
|
|
303
266
|
def assert_row_count_equal(lft_df: SparkDF, rgt_df: SparkDF, /) -> None:
|
|
@@ -316,6 +279,7 @@ def assert_row_count_equal(lft_df: SparkDF, rgt_df: SparkDF, /) -> None:
|
|
|
316
279
|
--------
|
|
317
280
|
>>> from pyspark.sql import Row, SparkSession
|
|
318
281
|
>>> from onekit import sparkkit as sk
|
|
282
|
+
>>> from onekit.exception import RowCountMismatchError
|
|
319
283
|
>>> spark = SparkSession.builder.getOrCreate()
|
|
320
284
|
>>> lft_df = spark.createDataFrame([Row(x=1, y=2), Row(x=3, y=4)])
|
|
321
285
|
>>> rgt_df = spark.createDataFrame([Row(x=1, y=2), Row(x=3, y=4)])
|
|
@@ -326,25 +290,25 @@ def assert_row_count_equal(lft_df: SparkDF, rgt_df: SparkDF, /) -> None:
|
|
|
326
290
|
>>> rgt_df = spark.createDataFrame([Row(x=1)])
|
|
327
291
|
>>> try:
|
|
328
292
|
... sk.assert_row_count_equal(lft_df, rgt_df)
|
|
329
|
-
... except
|
|
293
|
+
... except RowCountMismatchError as error:
|
|
330
294
|
... print(error)
|
|
331
295
|
...
|
|
332
|
-
|
|
296
|
+
num_lft=2, num_rgt=1, num_diff=1
|
|
333
297
|
"""
|
|
334
|
-
|
|
335
|
-
|
|
298
|
+
num_lft = lft_df.count()
|
|
299
|
+
num_rgt = rgt_df.count()
|
|
336
300
|
|
|
337
|
-
if
|
|
338
|
-
raise RowCountMismatchError(
|
|
301
|
+
if num_lft != num_rgt:
|
|
302
|
+
raise RowCountMismatchError(num_lft, num_rgt)
|
|
339
303
|
|
|
340
304
|
|
|
341
|
-
def
|
|
342
|
-
"""Validate
|
|
305
|
+
def assert_row_value_equal(lft_df: SparkDF, rgt_df: SparkDF, /) -> None:
|
|
306
|
+
"""Validate row values of both dataframes are equal.
|
|
343
307
|
|
|
344
308
|
Raises
|
|
345
309
|
------
|
|
346
|
-
|
|
347
|
-
If
|
|
310
|
+
RowValueMismatchError
|
|
311
|
+
If row values are not equal.
|
|
348
312
|
|
|
349
313
|
See Also
|
|
350
314
|
--------
|
|
@@ -354,31 +318,32 @@ def assert_row_equal(lft_df: SparkDF, rgt_df: SparkDF, /) -> None:
|
|
|
354
318
|
--------
|
|
355
319
|
>>> from pyspark.sql import Row, SparkSession
|
|
356
320
|
>>> from onekit import sparkkit as sk
|
|
321
|
+
>>> from onekit.exception import RowValueMismatchError
|
|
357
322
|
>>> spark = SparkSession.builder.getOrCreate()
|
|
358
323
|
>>> lft_df = spark.createDataFrame([Row(x=1, y=2), Row(x=3, y=4)])
|
|
359
324
|
>>> rgt_df = spark.createDataFrame([Row(x=1, y=2), Row(x=3, y=4)])
|
|
360
|
-
>>> sk.
|
|
325
|
+
>>> sk.assert_row_value_equal(lft_df, rgt_df) is None
|
|
361
326
|
True
|
|
362
327
|
|
|
363
328
|
>>> lft_df = spark.createDataFrame([Row(x=1, y=2), Row(x=3, y=4)])
|
|
364
329
|
>>> rgt_df = spark.createDataFrame([Row(x=3, y=4), Row(x=5, y=6), Row(x=7, y=8)])
|
|
365
330
|
>>> try:
|
|
366
|
-
... sk.
|
|
367
|
-
... except
|
|
331
|
+
... sk.assert_row_value_equal(lft_df, rgt_df)
|
|
332
|
+
... except RowValueMismatchError as error:
|
|
368
333
|
... print(error)
|
|
369
334
|
...
|
|
370
|
-
|
|
335
|
+
num_lft=1, num_rgt=2
|
|
371
336
|
"""
|
|
372
337
|
lft_rows = lft_df.subtract(rgt_df)
|
|
373
338
|
rgt_rows = rgt_df.subtract(lft_df)
|
|
374
339
|
|
|
375
|
-
|
|
376
|
-
|
|
340
|
+
num_lft = lft_rows.count()
|
|
341
|
+
num_rgt = rgt_rows.count()
|
|
377
342
|
|
|
378
|
-
is_equal = (
|
|
343
|
+
is_equal = (num_lft == 0) and (num_rgt == 0)
|
|
379
344
|
|
|
380
345
|
if not is_equal:
|
|
381
|
-
raise
|
|
346
|
+
raise RowValueMismatchError(lft_rows, rgt_rows, num_lft, num_rgt)
|
|
382
347
|
|
|
383
348
|
|
|
384
349
|
def assert_schema_equal(lft_df: SparkDF, rgt_df: SparkDF, /) -> None:
|
|
@@ -386,7 +351,7 @@ def assert_schema_equal(lft_df: SparkDF, rgt_df: SparkDF, /) -> None:
|
|
|
386
351
|
|
|
387
352
|
Raises
|
|
388
353
|
------
|
|
389
|
-
|
|
354
|
+
SchemaMismatchError
|
|
390
355
|
If schemas are not equal.
|
|
391
356
|
|
|
392
357
|
See Also
|
|
@@ -397,6 +362,7 @@ def assert_schema_equal(lft_df: SparkDF, rgt_df: SparkDF, /) -> None:
|
|
|
397
362
|
--------
|
|
398
363
|
>>> from pyspark.sql import Row, SparkSession
|
|
399
364
|
>>> from onekit import sparkkit as sk
|
|
365
|
+
>>> from onekit.exception import SchemaMismatchError
|
|
400
366
|
>>> spark = SparkSession.builder.getOrCreate()
|
|
401
367
|
>>> lft_df = spark.createDataFrame([Row(x=1, y=2), Row(x=3, y=4)])
|
|
402
368
|
>>> rgt_df = spark.createDataFrame([Row(x=1, y=2), Row(x=3, y=4)])
|
|
@@ -407,10 +373,10 @@ def assert_schema_equal(lft_df: SparkDF, rgt_df: SparkDF, /) -> None:
|
|
|
407
373
|
>>> rgt_df = spark.createDataFrame([Row(x=1), Row(x=3)])
|
|
408
374
|
>>> try:
|
|
409
375
|
... sk.assert_schema_equal(lft_df, rgt_df)
|
|
410
|
-
... except
|
|
376
|
+
... except SchemaMismatchError as error:
|
|
411
377
|
... print(error)
|
|
412
378
|
...
|
|
413
|
-
|
|
379
|
+
num_diff=10
|
|
414
380
|
struct<x:bigint,y:bigint>
|
|
415
381
|
||||||||||
|
|
416
382
|
struct<x:bigint>
|
|
@@ -509,13 +475,14 @@ def check_column_present(df: SparkDF, *cols: str | Iterable[str]) -> SparkDF:
|
|
|
509
475
|
|
|
510
476
|
Raises
|
|
511
477
|
------
|
|
512
|
-
|
|
478
|
+
ColumnNotFoundError
|
|
513
479
|
If columns are not found in dataframe.
|
|
514
480
|
|
|
515
481
|
Examples
|
|
516
482
|
--------
|
|
517
483
|
>>> from pyspark.sql import Row, SparkSession
|
|
518
484
|
>>> from onekit import sparkkit as sk
|
|
485
|
+
>>> from onekit.exception import ColumnNotFoundError
|
|
519
486
|
>>> spark = SparkSession.builder.getOrCreate()
|
|
520
487
|
>>> df = spark.createDataFrame([Row(x=1), Row(x=2), Row(x=3)])
|
|
521
488
|
>>> sk.check_column_present(df, "x").show()
|
|
@@ -530,7 +497,7 @@ def check_column_present(df: SparkDF, *cols: str | Iterable[str]) -> SparkDF:
|
|
|
530
497
|
|
|
531
498
|
>>> try:
|
|
532
499
|
... sk.check_column_present(df, "y").show()
|
|
533
|
-
... except
|
|
500
|
+
... except ColumnNotFoundError as error:
|
|
534
501
|
... print(error)
|
|
535
502
|
...
|
|
536
503
|
following columns not found: ['y']
|
|
@@ -773,7 +740,7 @@ def is_dataframe_equal(lft_df: SparkDF, rgt_df: SparkDF, /) -> bool:
|
|
|
773
740
|
--------
|
|
774
741
|
is_schema_equal : Evaluate schemas.
|
|
775
742
|
is_row_count_equal : Evaluate row counts.
|
|
776
|
-
|
|
743
|
+
is_row_value_equal : Evaluate row values.
|
|
777
744
|
|
|
778
745
|
Examples
|
|
779
746
|
--------
|
|
@@ -803,9 +770,9 @@ def is_dataframe_equal(lft_df: SparkDF, rgt_df: SparkDF, /) -> bool:
|
|
|
803
770
|
try:
|
|
804
771
|
assert_schema_equal(lft_df, rgt_df)
|
|
805
772
|
assert_row_count_equal(lft_df, rgt_df)
|
|
806
|
-
|
|
773
|
+
assert_row_value_equal(lft_df, rgt_df)
|
|
807
774
|
return True
|
|
808
|
-
except
|
|
775
|
+
except OnekitError:
|
|
809
776
|
return False
|
|
810
777
|
|
|
811
778
|
|
|
@@ -838,7 +805,7 @@ def is_row_count_equal(lft_df: SparkDF, rgt_df: SparkDF, /) -> bool:
|
|
|
838
805
|
return False
|
|
839
806
|
|
|
840
807
|
|
|
841
|
-
def
|
|
808
|
+
def is_row_value_equal(lft_df: SparkDF, rgt_df: SparkDF, /) -> bool:
|
|
842
809
|
"""Evaluate if rows of both dataframes are equal.
|
|
843
810
|
|
|
844
811
|
See Also
|
|
@@ -852,18 +819,18 @@ def is_row_equal(lft_df: SparkDF, rgt_df: SparkDF, /) -> bool:
|
|
|
852
819
|
>>> spark = SparkSession.builder.getOrCreate()
|
|
853
820
|
>>> lft_df = spark.createDataFrame([Row(x=1, y=2), Row(x=3, y=4)])
|
|
854
821
|
>>> rgt_df = spark.createDataFrame([Row(x=1, y=2), Row(x=3, y=4)])
|
|
855
|
-
>>> sk.
|
|
822
|
+
>>> sk.is_row_value_equal(lft_df, rgt_df)
|
|
856
823
|
True
|
|
857
824
|
|
|
858
825
|
>>> lft_df = spark.createDataFrame([Row(x=1, y=2), Row(x=3, y=4)])
|
|
859
826
|
>>> rgt_df = spark.createDataFrame([Row(x=3, y=4), Row(x=5, y=6), Row(x=7, y=8)])
|
|
860
|
-
>>> sk.
|
|
827
|
+
>>> sk.is_row_value_equal(lft_df, rgt_df)
|
|
861
828
|
False
|
|
862
829
|
"""
|
|
863
830
|
try:
|
|
864
|
-
|
|
831
|
+
assert_row_value_equal(lft_df, rgt_df)
|
|
865
832
|
return True
|
|
866
|
-
except
|
|
833
|
+
except RowValueMismatchError:
|
|
867
834
|
return False
|
|
868
835
|
|
|
869
836
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|