datacompy 0.11.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datacompy/__init__.py +28 -0
- datacompy/base.py +141 -0
- datacompy/core.py +957 -0
- datacompy/fugue.py +827 -0
- datacompy/polars.py +984 -0
- datacompy/py.typed +0 -0
- datacompy/spark.py +928 -0
- datacompy/templates/column_comparison.txt +7 -0
- datacompy/templates/column_summary.txt +7 -0
- datacompy/templates/fav_column_summary.txt +6 -0
- datacompy/templates/header.txt +6 -0
- datacompy/templates/row_summary.txt +14 -0
- datacompy-0.11.3.dist-info/LICENSE +202 -0
- datacompy-0.11.3.dist-info/METADATA +153 -0
- datacompy-0.11.3.dist-info/RECORD +17 -0
- datacompy-0.11.3.dist-info/WHEEL +5 -0
- datacompy-0.11.3.dist-info/top_level.txt +1 -0
datacompy/__init__.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2024 Capital One Services, LLC
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
__version__ = "0.11.3"
|
|
17
|
+
|
|
18
|
+
from datacompy.core import *
|
|
19
|
+
from datacompy.fugue import (
|
|
20
|
+
all_columns_match,
|
|
21
|
+
all_rows_overlap,
|
|
22
|
+
intersect_columns,
|
|
23
|
+
is_match,
|
|
24
|
+
report,
|
|
25
|
+
unq_columns,
|
|
26
|
+
)
|
|
27
|
+
from datacompy.polars import PolarsCompare
|
|
28
|
+
from datacompy.spark import NUMERIC_SPARK_TYPES, SparkCompare
|
datacompy/base.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2024 Capital One Services, LLC
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
"""
|
|
17
|
+
Base interface for comparing two DataFrames
|
|
18
|
+
|
|
19
|
+
Originally this package was meant to provide similar functionality to
|
|
20
|
+
PROC COMPARE in SAS - i.e. human-readable reporting on the difference between
|
|
21
|
+
two dataframes.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
import logging
|
|
25
|
+
from abc import ABC, abstractmethod
|
|
26
|
+
from typing import Any, Optional
|
|
27
|
+
|
|
28
|
+
from ordered_set import OrderedSet
|
|
29
|
+
|
|
30
|
+
LOG = logging.getLogger(__name__)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class BaseCompare(ABC):
    """Abstract interface for comparing two dataframes.

    This class only declares the contract: every method below except the
    ``df1``/``df2`` getters is abstract and must be provided by a concrete
    subclass.  The dataframes are typed as ``Any`` here, so nothing in this
    class ties the interface to a particular dataframe library.
    """

    @property
    def df1(self) -> Any:
        """Return the first dataframe, as stored on ``self._df1``."""
        return self._df1  # type: ignore

    @df1.setter
    @abstractmethod
    def df1(self, df1: Any) -> None:
        """Set the first dataframe.

        Implementations are expected to check that it is a dataframe and has
        the join columns.
        """

    @property
    def df2(self) -> Any:
        """Return the second dataframe, as stored on ``self._df2``."""
        return self._df2  # type: ignore

    @df2.setter
    @abstractmethod
    def df2(self, df2: Any) -> None:
        """Set the second dataframe.

        Implementations are expected to check that it is a dataframe and has
        the join columns.
        """

    @abstractmethod
    def _validate_dataframe(
        self, index: str, cast_column_names_lower: bool = True
    ) -> None:
        """Check that it is a dataframe and has the join columns.

        Parameters
        ----------
        index : str
            Identifies which dataframe to validate.
            NOTE(review): presumably ``"df1"`` or ``"df2"`` - confirm against
            the concrete subclasses.
        cast_column_names_lower : bool, optional
            Defaults to True.
            NOTE(review): presumably controls lower-casing of column names
            before validation - confirm against the concrete subclasses.
        """

    @abstractmethod
    def _compare(self, ignore_spaces: bool, ignore_case: bool) -> None:
        """Actually run the comparison.

        Implementations are expected to try ``df1.equals(df2)`` first so that
        truly equal dataframes can be identified, and to log out information
        about what is different between the two dataframes.

        The declared return type is ``None``: the outcome is not returned
        from this method.

        Parameters
        ----------
        ignore_spaces : bool
            NOTE(review): presumably ignores surrounding whitespace in string
            values - confirm against the concrete subclasses.
        ignore_case : bool
            NOTE(review): presumably ignores case in string values - confirm
            against the concrete subclasses.
        """

    @abstractmethod
    def df1_unq_columns(self) -> OrderedSet[str]:
        """Get columns that are unique to df1"""

    @abstractmethod
    def df2_unq_columns(self) -> OrderedSet[str]:
        """Get columns that are unique to df2"""

    @abstractmethod
    def intersect_columns(self) -> OrderedSet[str]:
        """Get columns that are shared between the two dataframes"""

    @abstractmethod
    def _dataframe_merge(self, ignore_spaces: bool) -> None:
        """Merge df1 to df2 on the join columns, to get df1 - df2, df2 - df1
        and df1 & df2

        If ``on_index`` is True, this will join on index values, otherwise it
        will join on the ``join_columns``.

        NOTE(review): ``on_index`` and ``join_columns`` are not defined by
        this base class; they are assumed to be attributes of the concrete
        subclass.
        """

    @abstractmethod
    def _intersect_compare(self, ignore_spaces: bool, ignore_case: bool) -> None:
        """Compare the two dataframes; details are left to the subclass.

        NOTE(review): presumably compares the columns shared by both
        dataframes on the rows they have in common - confirm against the
        concrete subclasses.
        """

    @abstractmethod
    def all_columns_match(self) -> bool:
        """Return a bool; the matching criteria are defined by the subclass.

        NOTE(review): presumably True when both dataframes have the same set
        of columns - confirm against the concrete subclasses.
        """

    @abstractmethod
    def all_rows_overlap(self) -> bool:
        """Return a bool; the overlap criteria are defined by the subclass.

        NOTE(review): presumably True when every row is present in both
        dataframes - confirm against the concrete subclasses.
        """

    @abstractmethod
    def count_matching_rows(self) -> int:
        """Return an int count; the counting rule is defined by the subclass.

        NOTE(review): presumably the number of rows whose compared values all
        match - confirm against the concrete subclasses.
        """

    @abstractmethod
    def intersect_rows_match(self) -> bool:
        """Return a bool; the matching criteria are defined by the subclass.

        NOTE(review): presumably True when the rows common to both
        dataframes match - confirm against the concrete subclasses.
        """

    @abstractmethod
    def matches(self, ignore_extra_columns: bool = False) -> bool:
        """Return a bool; the matching criteria are defined by the subclass.

        Parameters
        ----------
        ignore_extra_columns : bool, optional
            Defaults to False.
            NOTE(review): presumably lets columns present in only one
            dataframe be disregarded - confirm against the concrete
            subclasses.
        """

    @abstractmethod
    def subset(self) -> bool:
        """Return a bool; the subset criteria are defined by the subclass.

        NOTE(review): presumably True when one dataframe is a subset of the
        other - confirm direction and criteria against the subclasses.
        """

    @abstractmethod
    def sample_mismatch(
        self, column: str, sample_count: int = 10, for_display: bool = False
    ) -> Any:
        """Return a subclass-defined object for the given column.

        Parameters
        ----------
        column : str
            Name of the column of interest.
        sample_count : int, optional
            Defaults to 10.
            NOTE(review): presumably the maximum number of mismatched rows to
            sample - confirm against the concrete subclasses.
        for_display : bool, optional
            Defaults to False.
            NOTE(review): presumably switches to display-friendly column
            names - confirm against the concrete subclasses.
        """

    @abstractmethod
    def all_mismatch(self, ignore_matching_cols: bool = False) -> Any:
        """Return a subclass-defined object describing the mismatches.

        Parameters
        ----------
        ignore_matching_cols : bool, optional
            Defaults to False.
            NOTE(review): presumably omits columns that fully match from the
            result - confirm against the concrete subclasses.
        """

    @abstractmethod
    def report(
        self,
        sample_count: int = 10,
        column_count: int = 10,
        html_file: Optional[str] = None,
    ) -> str:
        """Return a string; its content is defined by the subclass.

        Parameters
        ----------
        sample_count : int, optional
            Defaults to 10.
            NOTE(review): presumably the number of sample rows included in
            the report - confirm against the concrete subclasses.
        column_count : int, optional
            Defaults to 10.
            NOTE(review): presumably the number of columns shown for sampled
            rows - confirm against the concrete subclasses.
        html_file : str, optional
            Defaults to None.
            NOTE(review): presumably a path to which an HTML copy of the
            report is written when given - confirm against the subclasses.
        """
|