datacompy 0.11.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datacompy/__init__.py ADDED
@@ -0,0 +1,28 @@
1
+ #
2
+ # Copyright 2024 Capital One Services, LLC
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ __version__ = "0.11.3"  # version string of this datacompy release
17
+
18
+ from datacompy.core import *
19
+ from datacompy.fugue import (
20
+ all_columns_match,
21
+ all_rows_overlap,
22
+ intersect_columns,
23
+ is_match,
24
+ report,
25
+ unq_columns,
26
+ )
27
+ from datacompy.polars import PolarsCompare
28
+ from datacompy.spark import NUMERIC_SPARK_TYPES, SparkCompare
datacompy/base.py ADDED
@@ -0,0 +1,141 @@
1
+ #
2
+ # Copyright 2024 Capital One Services, LLC
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """
17
+ Compare two Pandas DataFrames
18
+
19
+ Originally this package was meant to provide similar functionality to
20
+ PROC COMPARE in SAS - i.e. human-readable reporting on the difference between
21
+ two dataframes.
22
+ """
23
+
24
+ import logging
25
+ from abc import ABC, abstractmethod
26
+ from typing import Any, Optional
27
+
28
+ from ordered_set import OrderedSet
29
+
30
+ LOG = logging.getLogger(__name__)  # module-level logger, named after this module
31
+
32
+
33
class BaseCompare(ABC):
    """Abstract base class describing the interface of a dataframe comparison.

    Only the ``df1`` / ``df2`` getters are concrete: they return the private
    ``_df1`` / ``_df2`` attributes. The matching setters and every other
    member are abstract and have to be provided by a concrete subclass.
    """

    @property
    def df1(self) -> Any:
        """Return the first dataframe, as stored in ``self._df1``."""
        return self._df1  # type: ignore

    @df1.setter
    @abstractmethod
    def df1(self, df1: Any) -> None:
        """Set the first dataframe.

        Implementations check that the value is a dataframe and that it has
        the join columns.
        """

    @property
    def df2(self) -> Any:
        """Return the second dataframe, as stored in ``self._df2``."""
        return self._df2  # type: ignore

    @df2.setter
    @abstractmethod
    def df2(self, df2: Any) -> None:
        """Set the second dataframe.

        Implementations check that the value is a dataframe and that it has
        the join columns.
        """

    @abstractmethod
    def _validate_dataframe(
        self,
        index: str,
        cast_column_names_lower: bool = True,
    ) -> None:
        """Check that the dataframe is a dataframe and has the join columns.

        NOTE(review): ``index`` presumably selects which dataframe to check
        and ``cast_column_names_lower`` presumably lower-cases its column
        names -- confirm against the concrete subclasses.
        """

    @abstractmethod
    def _compare(self, ignore_spaces: bool, ignore_case: bool) -> None:
        """Run the actual comparison.

        Implementations are described as trying ``df1.equals(df2)`` first, so
        that true equality can be detected, and as logging information about
        what differs between the two dataframes.

        NOTE(review): earlier documentation of this hook stated that a
        boolean is returned, but the signature is annotated ``-> None`` --
        confirm which one the concrete subclasses follow.
        """

    @abstractmethod
    def df1_unq_columns(self) -> OrderedSet[str]:
        """Return the columns that exist only in df1."""

    @abstractmethod
    def df2_unq_columns(self) -> OrderedSet[str]:
        """Return the columns that exist only in df2."""

    @abstractmethod
    def intersect_columns(self) -> OrderedSet[str]:
        """Return the columns that both dataframes have in common."""

    @abstractmethod
    def _dataframe_merge(self, ignore_spaces: bool) -> None:
        """Merge df1 to df2 on the join columns.

        The merge yields df1 - df2, df2 - df1 and df1 & df2. When
        ``on_index`` is True the join uses the index values; otherwise it
        uses the ``join_columns``.
        """

    @abstractmethod
    def _intersect_compare(self, ignore_spaces: bool, ignore_case: bool) -> None:
        """Abstract hook; must be implemented by subclasses.

        NOTE(review): presumably compares the columns shared by both
        dataframes -- confirm against the concrete subclasses.
        """

    @abstractmethod
    def all_columns_match(self) -> bool:
        """Abstract; returns a bool.

        NOTE(review): presumably True when both dataframes have the same
        columns -- confirm.
        """

    @abstractmethod
    def all_rows_overlap(self) -> bool:
        """Abstract; returns a bool.

        NOTE(review): presumably True when every row is present in both
        dataframes -- confirm.
        """

    @abstractmethod
    def count_matching_rows(self) -> int:
        """Abstract; returns an int.

        NOTE(review): presumably the number of rows that match between the
        dataframes -- confirm.
        """

    @abstractmethod
    def intersect_rows_match(self) -> bool:
        """Abstract; returns a bool.

        NOTE(review): presumably True when the rows common to both
        dataframes are equal -- confirm.
        """

    @abstractmethod
    def matches(self, ignore_extra_columns: bool = False) -> bool:
        """Abstract; returns a bool.

        NOTE(review): presumably reports whether the dataframes match, with
        ``ignore_extra_columns`` relaxing the check for non-shared columns --
        confirm.
        """

    @abstractmethod
    def subset(self) -> bool:
        """Abstract; returns a bool.

        NOTE(review): presumably reports whether one dataframe is a subset
        of the other -- confirm the direction with the subclasses.
        """

    @abstractmethod
    def sample_mismatch(
        self,
        column: str,
        sample_count: int = 10,
        for_display: bool = False,
    ) -> Any:
        """Abstract; the return type is left to the subclass.

        NOTE(review): presumably returns up to ``sample_count`` mismatching
        rows for ``column`` -- confirm.
        """

    @abstractmethod
    def all_mismatch(self, ignore_matching_cols: bool = False) -> Any:
        """Abstract; the return type is left to the subclass.

        NOTE(review): presumably returns all mismatching rows -- confirm.
        """

    @abstractmethod
    def report(
        self,
        sample_count: int = 10,
        column_count: int = 10,
        html_file: Optional[str] = None,
    ) -> str:
        """Abstract; returns a string.

        NOTE(review): presumably builds the comparison report text and
        optionally writes it to ``html_file`` -- confirm.
        """