duckguard 2.3.0__py3-none-any.whl → 3.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +1 -1
- duckguard/anomaly/methods.py +47 -0
- duckguard/anomaly/ml_methods.py +146 -21
- duckguard/checks/__init__.py +26 -0
- duckguard/checks/conditional.py +796 -0
- duckguard/checks/distributional.py +524 -0
- duckguard/checks/multicolumn.py +726 -0
- duckguard/checks/query_based.py +643 -0
- duckguard/connectors/factory.py +30 -2
- duckguard/connectors/files.py +7 -3
- duckguard/core/column.py +372 -0
- duckguard/core/dataset.py +330 -0
- duckguard/core/result.py +5 -0
- duckguard/notifications/email.py +9 -0
- duckguard/notifications/notifiers.py +39 -1
- duckguard/profiler/distribution_analyzer.py +384 -0
- duckguard/profiler/outlier_detector.py +497 -0
- duckguard/profiler/pattern_matcher.py +301 -0
- duckguard/profiler/quality_scorer.py +445 -0
- duckguard/rules/executor.py +642 -0
- duckguard/rules/schema.py +31 -0
- {duckguard-2.3.0.dist-info → duckguard-3.0.1.dist-info}/METADATA +120 -1
- {duckguard-2.3.0.dist-info → duckguard-3.0.1.dist-info}/RECORD +26 -17
- {duckguard-2.3.0.dist-info → duckguard-3.0.1.dist-info}/WHEEL +0 -0
- {duckguard-2.3.0.dist-info → duckguard-3.0.1.dist-info}/entry_points.txt +0 -0
- {duckguard-2.3.0.dist-info → duckguard-3.0.1.dist-info}/licenses/LICENSE +0 -0
duckguard/connectors/factory.py
CHANGED
|
@@ -31,7 +31,7 @@ def register_connector(connector_class: type[Connector]) -> None:
|
|
|
31
31
|
|
|
32
32
|
|
|
33
33
|
def connect(
|
|
34
|
-
source: str,
|
|
34
|
+
source: Any,
|
|
35
35
|
*,
|
|
36
36
|
table: str | None = None,
|
|
37
37
|
schema: str | None = None,
|
|
@@ -46,7 +46,7 @@ def connect(
|
|
|
46
46
|
It automatically detects the source type and uses the appropriate connector.
|
|
47
47
|
|
|
48
48
|
Args:
|
|
49
|
-
source: Path to file, connection string, or URL
|
|
49
|
+
source: Path to file, connection string, URL, or DataFrame (pandas/polars/pyarrow)
|
|
50
50
|
table: Table name (for database connections)
|
|
51
51
|
schema: Schema name (for database connections)
|
|
52
52
|
database: Database name (for database connections)
|
|
@@ -60,6 +60,9 @@ def connect(
|
|
|
60
60
|
# Connect to a CSV file
|
|
61
61
|
orders = connect("data/orders.csv")
|
|
62
62
|
|
|
63
|
+
# Connect to a DataFrame
|
|
64
|
+
orders = connect(df)
|
|
65
|
+
|
|
63
66
|
# Connect to a Parquet file on S3
|
|
64
67
|
orders = connect("s3://bucket/orders.parquet")
|
|
65
68
|
|
|
@@ -72,6 +75,23 @@ def connect(
|
|
|
72
75
|
Raises:
|
|
73
76
|
ValueError: If no connector can handle the source
|
|
74
77
|
"""
|
|
78
|
+
# Handle DataFrame sources (pandas, polars, pyarrow)
|
|
79
|
+
if not isinstance(source, str):
|
|
80
|
+
# Check if it's a DataFrame-like object
|
|
81
|
+
if hasattr(source, '__dataframe__') or hasattr(source, 'to_pandas') or \
|
|
82
|
+
(hasattr(source, 'shape') and hasattr(source, 'columns')):
|
|
83
|
+
# Register DataFrame with engine
|
|
84
|
+
if engine is None:
|
|
85
|
+
engine = DuckGuardEngine.get_instance()
|
|
86
|
+
|
|
87
|
+
# Generate a unique name for the DataFrame
|
|
88
|
+
import hashlib
|
|
89
|
+
import time
|
|
90
|
+
df_name = f"df_{hashlib.md5(str(time.time()).encode()).hexdigest()[:8]}"
|
|
91
|
+
|
|
92
|
+
engine.register_dataframe(df_name, source)
|
|
93
|
+
return Dataset(source=df_name, engine=engine, name="dataframe")
|
|
94
|
+
|
|
75
95
|
config = ConnectionConfig(
|
|
76
96
|
source=source,
|
|
77
97
|
table=table,
|
|
@@ -99,6 +119,10 @@ def connect(
|
|
|
99
119
|
|
|
100
120
|
def _is_database_connection(source: str) -> bool:
|
|
101
121
|
"""Check if source is a database connection string."""
|
|
122
|
+
# Only handle string sources
|
|
123
|
+
if not isinstance(source, str):
|
|
124
|
+
return False
|
|
125
|
+
|
|
102
126
|
db_prefixes = (
|
|
103
127
|
"postgres://",
|
|
104
128
|
"postgresql://",
|
|
@@ -143,6 +167,10 @@ def _handle_database_connection(
|
|
|
143
167
|
engine: DuckGuardEngine | None,
|
|
144
168
|
) -> Dataset:
|
|
145
169
|
"""Handle database connection strings."""
|
|
170
|
+
# Validate source is a string
|
|
171
|
+
if not isinstance(source, str):
|
|
172
|
+
raise ValueError(f"Expected string source, got {type(source).__name__}")
|
|
173
|
+
|
|
146
174
|
source_lower = source.lower()
|
|
147
175
|
|
|
148
176
|
# PostgreSQL
|
duckguard/connectors/files.py
CHANGED
|
@@ -65,6 +65,10 @@ class FileConnector(Connector):
|
|
|
65
65
|
@classmethod
|
|
66
66
|
def can_handle(cls, source: str) -> bool:
|
|
67
67
|
"""Check if this connector can handle the source."""
|
|
68
|
+
# Only handle string paths
|
|
69
|
+
if not isinstance(source, str):
|
|
70
|
+
return False
|
|
71
|
+
|
|
68
72
|
# Check for file extensions
|
|
69
73
|
path = Path(source)
|
|
70
74
|
ext = path.suffix.lower()
|
|
@@ -99,7 +103,7 @@ class S3Connector(FileConnector):
|
|
|
99
103
|
@classmethod
|
|
100
104
|
def can_handle(cls, source: str) -> bool:
|
|
101
105
|
"""Check if this is an S3 path."""
|
|
102
|
-
return source.lower().startswith("s3://")
|
|
106
|
+
return isinstance(source, str) and source.lower().startswith("s3://")
|
|
103
107
|
|
|
104
108
|
@classmethod
|
|
105
109
|
def get_priority(cls) -> int:
|
|
@@ -113,7 +117,7 @@ class GCSConnector(FileConnector):
|
|
|
113
117
|
@classmethod
|
|
114
118
|
def can_handle(cls, source: str) -> bool:
|
|
115
119
|
"""Check if this is a GCS path."""
|
|
116
|
-
return source.lower().startswith(("gs://", "gcs://"))
|
|
120
|
+
return isinstance(source, str) and source.lower().startswith(("gs://", "gcs://"))
|
|
117
121
|
|
|
118
122
|
@classmethod
|
|
119
123
|
def get_priority(cls) -> int:
|
|
@@ -127,7 +131,7 @@ class AzureConnector(FileConnector):
|
|
|
127
131
|
@classmethod
|
|
128
132
|
def can_handle(cls, source: str) -> bool:
|
|
129
133
|
"""Check if this is an Azure path."""
|
|
130
|
-
return source.lower().startswith(("az://", "abfs://"))
|
|
134
|
+
return isinstance(source, str) and source.lower().startswith(("az://", "abfs://"))
|
|
131
135
|
|
|
132
136
|
@classmethod
|
|
133
137
|
def get_priority(cls) -> int:
|
duckguard/core/column.py
CHANGED
|
@@ -1002,6 +1002,378 @@ class Column:
|
|
|
1002
1002
|
rows = self._dataset.engine.fetch_all(sql)
|
|
1003
1003
|
return {row[0]: row[1] for row in rows}
|
|
1004
1004
|
|
|
1005
|
+
# =====================================================================
|
|
1006
|
+
# Conditional Validation Methods (DuckGuard 3.0)
|
|
1007
|
+
# =====================================================================
|
|
1008
|
+
|
|
1009
|
+
def not_null_when(
|
|
1010
|
+
self,
|
|
1011
|
+
condition: str,
|
|
1012
|
+
threshold: float = 1.0
|
|
1013
|
+
) -> ValidationResult:
|
|
1014
|
+
"""Check column is not null when condition is true.
|
|
1015
|
+
|
|
1016
|
+
This enables sophisticated conditional validation like:
|
|
1017
|
+
- "State must not be null when country = 'USA'"
|
|
1018
|
+
- "Phone is required when contact_method = 'phone'"
|
|
1019
|
+
|
|
1020
|
+
Args:
|
|
1021
|
+
condition: SQL WHERE clause condition (without WHERE keyword)
|
|
1022
|
+
threshold: Maximum allowed non-null rate (0.0 to 1.0, default 1.0)
|
|
1023
|
+
|
|
1024
|
+
Returns:
|
|
1025
|
+
ValidationResult with pass/fail status
|
|
1026
|
+
|
|
1027
|
+
Raises:
|
|
1028
|
+
ValidationError: If condition is invalid or contains forbidden SQL
|
|
1029
|
+
|
|
1030
|
+
Examples:
|
|
1031
|
+
>>> data = connect("customers.csv")
|
|
1032
|
+
>>> # State required for US customers
|
|
1033
|
+
>>> result = data.state.not_null_when("country = 'USA'")
|
|
1034
|
+
>>> assert result.passed
|
|
1035
|
+
|
|
1036
|
+
>>> # Email required for registered users
|
|
1037
|
+
>>> result = data.email.not_null_when("user_type = 'registered'")
|
|
1038
|
+
>>> assert result.passed
|
|
1039
|
+
|
|
1040
|
+
Security:
|
|
1041
|
+
Conditions are validated to prevent SQL injection. Only SELECT
|
|
1042
|
+
queries with WHERE clauses are allowed.
|
|
1043
|
+
"""
|
|
1044
|
+
from duckguard.checks.conditional import ConditionalCheckHandler
|
|
1045
|
+
|
|
1046
|
+
handler = ConditionalCheckHandler()
|
|
1047
|
+
return handler.execute_not_null_when(
|
|
1048
|
+
dataset=self._dataset,
|
|
1049
|
+
column=self._name,
|
|
1050
|
+
condition=condition,
|
|
1051
|
+
threshold=threshold
|
|
1052
|
+
)
|
|
1053
|
+
|
|
1054
|
+
def unique_when(
|
|
1055
|
+
self,
|
|
1056
|
+
condition: str,
|
|
1057
|
+
threshold: float = 1.0
|
|
1058
|
+
) -> ValidationResult:
|
|
1059
|
+
"""Check column is unique when condition is true.
|
|
1060
|
+
|
|
1061
|
+
Args:
|
|
1062
|
+
condition: SQL WHERE clause condition (without WHERE keyword)
|
|
1063
|
+
threshold: Minimum required uniqueness rate (0.0 to 1.0, default 1.0)
|
|
1064
|
+
|
|
1065
|
+
Returns:
|
|
1066
|
+
ValidationResult with pass/fail status
|
|
1067
|
+
|
|
1068
|
+
Examples:
|
|
1069
|
+
>>> # Order IDs must be unique for completed orders
|
|
1070
|
+
>>> result = data.order_id.unique_when("status = 'completed'")
|
|
1071
|
+
>>> assert result.passed
|
|
1072
|
+
|
|
1073
|
+
>>> # Transaction IDs unique for successful transactions
|
|
1074
|
+
>>> result = data.txn_id.unique_when("success = true")
|
|
1075
|
+
>>> assert result.passed
|
|
1076
|
+
"""
|
|
1077
|
+
from duckguard.checks.conditional import ConditionalCheckHandler
|
|
1078
|
+
|
|
1079
|
+
handler = ConditionalCheckHandler()
|
|
1080
|
+
return handler.execute_unique_when(
|
|
1081
|
+
dataset=self._dataset,
|
|
1082
|
+
column=self._name,
|
|
1083
|
+
condition=condition,
|
|
1084
|
+
threshold=threshold
|
|
1085
|
+
)
|
|
1086
|
+
|
|
1087
|
+
def between_when(
|
|
1088
|
+
self,
|
|
1089
|
+
min_val: float,
|
|
1090
|
+
max_val: float,
|
|
1091
|
+
condition: str,
|
|
1092
|
+
threshold: float = 1.0
|
|
1093
|
+
) -> ValidationResult:
|
|
1094
|
+
"""Check column is between min and max when condition is true.
|
|
1095
|
+
|
|
1096
|
+
Args:
|
|
1097
|
+
min_val: Minimum allowed value
|
|
1098
|
+
max_val: Maximum allowed value
|
|
1099
|
+
condition: SQL WHERE clause condition (without WHERE keyword)
|
|
1100
|
+
threshold: Maximum allowed failure rate (0.0 to 1.0, default 1.0)
|
|
1101
|
+
|
|
1102
|
+
Returns:
|
|
1103
|
+
ValidationResult with pass/fail status
|
|
1104
|
+
|
|
1105
|
+
Examples:
|
|
1106
|
+
>>> # Discount between 0-50% for standard customers
|
|
1107
|
+
>>> result = data.discount.between_when(
|
|
1108
|
+
... min_val=0,
|
|
1109
|
+
... max_val=50,
|
|
1110
|
+
... condition="customer_tier = 'standard'"
|
|
1111
|
+
... )
|
|
1112
|
+
>>> assert result.passed
|
|
1113
|
+
|
|
1114
|
+
>>> # Age between 18-65 for employees
|
|
1115
|
+
>>> result = data.age.between_when(18, 65, "type = 'employee'")
|
|
1116
|
+
>>> assert result.passed
|
|
1117
|
+
"""
|
|
1118
|
+
from duckguard.checks.conditional import ConditionalCheckHandler
|
|
1119
|
+
|
|
1120
|
+
handler = ConditionalCheckHandler()
|
|
1121
|
+
return handler.execute_between_when(
|
|
1122
|
+
dataset=self._dataset,
|
|
1123
|
+
column=self._name,
|
|
1124
|
+
min_value=min_val,
|
|
1125
|
+
max_value=max_val,
|
|
1126
|
+
condition=condition,
|
|
1127
|
+
threshold=threshold
|
|
1128
|
+
)
|
|
1129
|
+
|
|
1130
|
+
def isin_when(
|
|
1131
|
+
self,
|
|
1132
|
+
allowed_values: list[Any],
|
|
1133
|
+
condition: str,
|
|
1134
|
+
threshold: float = 1.0
|
|
1135
|
+
) -> ValidationResult:
|
|
1136
|
+
"""Check column is in allowed values when condition is true.
|
|
1137
|
+
|
|
1138
|
+
Args:
|
|
1139
|
+
allowed_values: List of allowed values
|
|
1140
|
+
condition: SQL WHERE clause condition (without WHERE keyword)
|
|
1141
|
+
threshold: Maximum allowed failure rate (0.0 to 1.0, default 1.0)
|
|
1142
|
+
|
|
1143
|
+
Returns:
|
|
1144
|
+
ValidationResult with pass/fail status
|
|
1145
|
+
|
|
1146
|
+
Examples:
|
|
1147
|
+
>>> # Status must be specific values for paid orders
|
|
1148
|
+
>>> result = data.status.isin_when(
|
|
1149
|
+
... allowed_values=['shipped', 'delivered'],
|
|
1150
|
+
... condition="payment_status = 'paid'"
|
|
1151
|
+
... )
|
|
1152
|
+
>>> assert result.passed
|
|
1153
|
+
|
|
1154
|
+
>>> # Category restricted for active products
|
|
1155
|
+
>>> result = data.category.isin_when(
|
|
1156
|
+
... ['A', 'B', 'C'],
|
|
1157
|
+
... "is_active = true"
|
|
1158
|
+
... )
|
|
1159
|
+
>>> assert result.passed
|
|
1160
|
+
"""
|
|
1161
|
+
from duckguard.checks.conditional import ConditionalCheckHandler
|
|
1162
|
+
|
|
1163
|
+
handler = ConditionalCheckHandler()
|
|
1164
|
+
return handler.execute_isin_when(
|
|
1165
|
+
dataset=self._dataset,
|
|
1166
|
+
column=self._name,
|
|
1167
|
+
allowed_values=allowed_values,
|
|
1168
|
+
condition=condition,
|
|
1169
|
+
threshold=threshold
|
|
1170
|
+
)
|
|
1171
|
+
|
|
1172
|
+
def matches_when(
|
|
1173
|
+
self,
|
|
1174
|
+
pattern: str,
|
|
1175
|
+
condition: str,
|
|
1176
|
+
threshold: float = 1.0
|
|
1177
|
+
) -> ValidationResult:
|
|
1178
|
+
"""Check column matches pattern when condition is true.
|
|
1179
|
+
|
|
1180
|
+
Args:
|
|
1181
|
+
pattern: Regular expression pattern to match
|
|
1182
|
+
condition: SQL WHERE clause condition (without WHERE keyword)
|
|
1183
|
+
threshold: Maximum allowed failure rate (0.0 to 1.0, default 1.0)
|
|
1184
|
+
|
|
1185
|
+
Returns:
|
|
1186
|
+
ValidationResult with pass/fail status
|
|
1187
|
+
|
|
1188
|
+
Examples:
|
|
1189
|
+
>>> # Email format required for email notifications
|
|
1190
|
+
>>> result = data.contact.matches_when(
|
|
1191
|
+
... pattern=r'^[\\w.-]+@[\\w.-]+\\.\\w+$',
|
|
1192
|
+
... condition="notification_type = 'email'"
|
|
1193
|
+
... )
|
|
1194
|
+
>>> assert result.passed
|
|
1195
|
+
|
|
1196
|
+
>>> # Phone format required for SMS
|
|
1197
|
+
>>> result = data.contact.matches_when(
|
|
1198
|
+
... pattern=r'^\\+?[0-9]{10,15}$',
|
|
1199
|
+
... condition="notification_type = 'sms'"
|
|
1200
|
+
... )
|
|
1201
|
+
>>> assert result.passed
|
|
1202
|
+
"""
|
|
1203
|
+
from duckguard.checks.conditional import ConditionalCheckHandler
|
|
1204
|
+
|
|
1205
|
+
handler = ConditionalCheckHandler()
|
|
1206
|
+
return handler.execute_pattern_when(
|
|
1207
|
+
dataset=self._dataset,
|
|
1208
|
+
column=self._name,
|
|
1209
|
+
pattern=pattern,
|
|
1210
|
+
condition=condition,
|
|
1211
|
+
threshold=threshold
|
|
1212
|
+
)
|
|
1213
|
+
|
|
1214
|
+
# =================================================================
|
|
1215
|
+
# Distributional Checks (DuckGuard 3.0)
|
|
1216
|
+
# =================================================================
|
|
1217
|
+
|
|
1218
|
+
def expect_distribution_normal(
|
|
1219
|
+
self,
|
|
1220
|
+
significance_level: float = 0.05
|
|
1221
|
+
) -> ValidationResult:
|
|
1222
|
+
"""Check if column data follows a normal distribution.
|
|
1223
|
+
|
|
1224
|
+
Uses Kolmogorov-Smirnov test comparing data to fitted normal distribution.
|
|
1225
|
+
|
|
1226
|
+
Args:
|
|
1227
|
+
significance_level: Significance level for test (default 0.05)
|
|
1228
|
+
|
|
1229
|
+
Returns:
|
|
1230
|
+
ValidationResult (passed if p-value > significance_level)
|
|
1231
|
+
|
|
1232
|
+
Examples:
|
|
1233
|
+
>>> # Test if temperature measurements are normally distributed
|
|
1234
|
+
>>> result = data.temperature.expect_distribution_normal()
|
|
1235
|
+
>>> assert result.passed
|
|
1236
|
+
|
|
1237
|
+
>>> # Use stricter significance level
|
|
1238
|
+
>>> result = data.measurement.expect_distribution_normal(
|
|
1239
|
+
... significance_level=0.01
|
|
1240
|
+
... )
|
|
1241
|
+
|
|
1242
|
+
Note:
|
|
1243
|
+
Requires scipy: pip install 'duckguard[statistics]'
|
|
1244
|
+
Requires minimum 30 samples for reliable results.
|
|
1245
|
+
"""
|
|
1246
|
+
from duckguard.checks.distributional import DistributionalCheckHandler
|
|
1247
|
+
|
|
1248
|
+
handler = DistributionalCheckHandler()
|
|
1249
|
+
return handler.execute_distribution_normal(
|
|
1250
|
+
dataset=self._dataset,
|
|
1251
|
+
column=self._name,
|
|
1252
|
+
significance_level=significance_level
|
|
1253
|
+
)
|
|
1254
|
+
|
|
1255
|
+
def expect_distribution_uniform(
|
|
1256
|
+
self,
|
|
1257
|
+
significance_level: float = 0.05
|
|
1258
|
+
) -> ValidationResult:
|
|
1259
|
+
"""Check if column data follows a uniform distribution.
|
|
1260
|
+
|
|
1261
|
+
Uses Kolmogorov-Smirnov test comparing data to uniform distribution.
|
|
1262
|
+
|
|
1263
|
+
Args:
|
|
1264
|
+
significance_level: Significance level for test (default 0.05)
|
|
1265
|
+
|
|
1266
|
+
Returns:
|
|
1267
|
+
ValidationResult (passed if p-value > significance_level)
|
|
1268
|
+
|
|
1269
|
+
Examples:
|
|
1270
|
+
>>> # Test if random numbers are uniformly distributed
|
|
1271
|
+
>>> result = data.random_value.expect_distribution_uniform()
|
|
1272
|
+
>>> assert result.passed
|
|
1273
|
+
|
|
1274
|
+
>>> # Test dice rolls for fairness
|
|
1275
|
+
>>> result = data.dice_roll.expect_distribution_uniform()
|
|
1276
|
+
|
|
1277
|
+
Note:
|
|
1278
|
+
Requires scipy: pip install 'duckguard[statistics]'
|
|
1279
|
+
Requires minimum 30 samples for reliable results.
|
|
1280
|
+
"""
|
|
1281
|
+
from duckguard.checks.distributional import DistributionalCheckHandler
|
|
1282
|
+
|
|
1283
|
+
handler = DistributionalCheckHandler()
|
|
1284
|
+
return handler.execute_distribution_uniform(
|
|
1285
|
+
dataset=self._dataset,
|
|
1286
|
+
column=self._name,
|
|
1287
|
+
significance_level=significance_level
|
|
1288
|
+
)
|
|
1289
|
+
|
|
1290
|
+
def expect_ks_test(
|
|
1291
|
+
self,
|
|
1292
|
+
distribution: str = "norm",
|
|
1293
|
+
significance_level: float = 0.05
|
|
1294
|
+
) -> ValidationResult:
|
|
1295
|
+
"""Perform Kolmogorov-Smirnov test for specified distribution.
|
|
1296
|
+
|
|
1297
|
+
Args:
|
|
1298
|
+
distribution: Distribution name ('norm', 'uniform', 'expon', etc.)
|
|
1299
|
+
significance_level: Significance level for test (default 0.05)
|
|
1300
|
+
|
|
1301
|
+
Returns:
|
|
1302
|
+
ValidationResult (passed if p-value > significance_level)
|
|
1303
|
+
|
|
1304
|
+
Examples:
|
|
1305
|
+
>>> # Test for normal distribution
|
|
1306
|
+
>>> result = data.values.expect_ks_test(distribution='norm')
|
|
1307
|
+
>>> assert result.passed
|
|
1308
|
+
|
|
1309
|
+
>>> # Test for exponential distribution
|
|
1310
|
+
>>> result = data.wait_times.expect_ks_test(
|
|
1311
|
+
... distribution='expon',
|
|
1312
|
+
... significance_level=0.01
|
|
1313
|
+
... )
|
|
1314
|
+
|
|
1315
|
+
Note:
|
|
1316
|
+
Requires scipy: pip install 'duckguard[statistics]'
|
|
1317
|
+
Supported distributions: norm, uniform, expon, gamma, beta, etc.
|
|
1318
|
+
"""
|
|
1319
|
+
from duckguard.checks.distributional import DistributionalCheckHandler
|
|
1320
|
+
|
|
1321
|
+
handler = DistributionalCheckHandler()
|
|
1322
|
+
return handler.execute_ks_test(
|
|
1323
|
+
dataset=self._dataset,
|
|
1324
|
+
column=self._name,
|
|
1325
|
+
distribution=distribution,
|
|
1326
|
+
significance_level=significance_level
|
|
1327
|
+
)
|
|
1328
|
+
|
|
1329
|
+
def expect_chi_square_test(
|
|
1330
|
+
self,
|
|
1331
|
+
expected_frequencies: dict | None = None,
|
|
1332
|
+
significance_level: float = 0.05
|
|
1333
|
+
) -> ValidationResult:
|
|
1334
|
+
"""Perform chi-square goodness-of-fit test for categorical data.
|
|
1335
|
+
|
|
1336
|
+
Tests if observed frequencies match expected frequencies.
|
|
1337
|
+
|
|
1338
|
+
Args:
|
|
1339
|
+
expected_frequencies: Dict mapping categories to expected frequencies
|
|
1340
|
+
If None, assumes uniform distribution
|
|
1341
|
+
significance_level: Significance level for test (default 0.05)
|
|
1342
|
+
|
|
1343
|
+
Returns:
|
|
1344
|
+
ValidationResult (passed if p-value > significance_level)
|
|
1345
|
+
|
|
1346
|
+
Examples:
|
|
1347
|
+
>>> # Test if dice is fair (uniform distribution)
|
|
1348
|
+
>>> result = data.dice_roll.expect_chi_square_test()
|
|
1349
|
+
>>> assert result.passed
|
|
1350
|
+
|
|
1351
|
+
>>> # Test with specific expected frequencies
|
|
1352
|
+
>>> expected = {1: 1/6, 2: 1/6, 3: 1/6, 4: 1/6, 5: 1/6, 6: 1/6}
|
|
1353
|
+
>>> result = data.dice_roll.expect_chi_square_test(
|
|
1354
|
+
... expected_frequencies=expected
|
|
1355
|
+
... )
|
|
1356
|
+
|
|
1357
|
+
>>> # Test categorical distribution
|
|
1358
|
+
>>> expected = {'A': 0.5, 'B': 0.3, 'C': 0.2}
|
|
1359
|
+
>>> result = data.category.expect_chi_square_test(
|
|
1360
|
+
... expected_frequencies=expected
|
|
1361
|
+
... )
|
|
1362
|
+
|
|
1363
|
+
Note:
|
|
1364
|
+
Requires scipy: pip install 'duckguard[statistics]'
|
|
1365
|
+
Requires minimum 30 samples for reliable results.
|
|
1366
|
+
"""
|
|
1367
|
+
from duckguard.checks.distributional import DistributionalCheckHandler
|
|
1368
|
+
|
|
1369
|
+
handler = DistributionalCheckHandler()
|
|
1370
|
+
return handler.execute_chi_square_test(
|
|
1371
|
+
dataset=self._dataset,
|
|
1372
|
+
column=self._name,
|
|
1373
|
+
expected_frequencies=expected_frequencies,
|
|
1374
|
+
significance_level=significance_level
|
|
1375
|
+
)
|
|
1376
|
+
|
|
1005
1377
|
def clear_cache(self) -> None:
|
|
1006
1378
|
"""Clear cached statistics."""
|
|
1007
1379
|
self._stats_cache = None
|