duckguard 2.3.0__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,7 +31,7 @@ def register_connector(connector_class: type[Connector]) -> None:
 
 
 def connect(
-    source: str,
+    source: Any,
     *,
     table: str | None = None,
     schema: str | None = None,
@@ -46,7 +46,7 @@ def connect(
     It automatically detects the source type and uses the appropriate connector.
 
     Args:
-        source: Path to file, connection string, or URL
+        source: Path to file, connection string, URL, or DataFrame (pandas/polars/pyarrow)
         table: Table name (for database connections)
         schema: Schema name (for database connections)
         database: Database name (for database connections)
@@ -60,6 +60,9 @@ def connect(
         # Connect to a CSV file
         orders = connect("data/orders.csv")
 
+        # Connect to a DataFrame
+        orders = connect(df)
+
         # Connect to a Parquet file on S3
         orders = connect("s3://bucket/orders.parquet")
 
@@ -72,6 +75,23 @@ def connect(
     Raises:
         ValueError: If no connector can handle the source
     """
+    # Handle DataFrame sources (pandas, polars, pyarrow)
+    if not isinstance(source, str):
+        # Check if it's a DataFrame-like object
+        if hasattr(source, '__dataframe__') or hasattr(source, 'to_pandas') or \
+           (hasattr(source, 'shape') and hasattr(source, 'columns')):
+            # Register DataFrame with engine
+            if engine is None:
+                engine = DuckGuardEngine.get_instance()
+
+            # Generate a unique name for the DataFrame
+            import hashlib
+            import time
+            df_name = f"df_{hashlib.md5(str(time.time()).encode()).hexdigest()[:8]}"
+
+            engine.register_dataframe(df_name, source)
+            return Dataset(source=df_name, engine=engine, name="dataframe")
+
     config = ConnectionConfig(
         source=source,
         table=table,
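With this change, connect() accepts an in-memory DataFrame directly instead of a path or connection string. A minimal usage sketch, assuming pandas is installed and that connect is importable from the top-level duckguard package (the exact import path is an assumption, not shown in this diff):

    import pandas as pd

    from duckguard import connect  # assumed top-level export

    # Any object exposing __dataframe__, to_pandas, or shape/columns is
    # treated as a DataFrame, registered with the DuckDB engine under a
    # generated name, and wrapped in a Dataset.
    df = pd.DataFrame({"order_id": [1, 2, 3], "amount": [10.0, 25.5, 7.2]})
    orders = connect(df)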
@@ -99,6 +119,10 @@ def connect(
 
 def _is_database_connection(source: str) -> bool:
     """Check if source is a database connection string."""
+    # Only handle string sources
+    if not isinstance(source, str):
+        return False
+
     db_prefixes = (
         "postgres://",
         "postgresql://",
@@ -143,6 +167,10 @@ def _handle_database_connection(
     engine: DuckGuardEngine | None,
 ) -> Dataset:
     """Handle database connection strings."""
+    # Validate source is a string
+    if not isinstance(source, str):
+        raise ValueError(f"Expected string source, got {type(source).__name__}")
+
     source_lower = source.lower()
 
     # PostgreSQL
@@ -65,6 +65,10 @@ class FileConnector(Connector):
     @classmethod
     def can_handle(cls, source: str) -> bool:
         """Check if this connector can handle the source."""
+        # Only handle string paths
+        if not isinstance(source, str):
+            return False
+
         # Check for file extensions
         path = Path(source)
         ext = path.suffix.lower()
@@ -99,7 +103,7 @@ class S3Connector(FileConnector):
     @classmethod
     def can_handle(cls, source: str) -> bool:
         """Check if this is an S3 path."""
-        return source.lower().startswith("s3://")
+        return isinstance(source, str) and source.lower().startswith("s3://")
 
     @classmethod
     def get_priority(cls) -> int:
@@ -113,7 +117,7 @@ class GCSConnector(FileConnector):
     @classmethod
     def can_handle(cls, source: str) -> bool:
         """Check if this is a GCS path."""
-        return source.lower().startswith(("gs://", "gcs://"))
+        return isinstance(source, str) and source.lower().startswith(("gs://", "gcs://"))
 
     @classmethod
     def get_priority(cls) -> int:
@@ -127,7 +131,7 @@ class AzureConnector(FileConnector):
     @classmethod
     def can_handle(cls, source: str) -> bool:
         """Check if this is an Azure path."""
-        return source.lower().startswith(("az://", "abfs://"))
+        return isinstance(source, str) and source.lower().startswith(("az://", "abfs://"))
 
     @classmethod
     def get_priority(cls) -> int:
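Each can_handle() change follows the same pattern: a connector asked about a non-string source now declines instead of raising, so DataFrame inputs fall through to the DataFrame branch in connect(). A toy sketch of that guard, using a hypothetical DemoFileConnector and an invented extension list rather than the package's real registry:

    from pathlib import Path

    class DemoFileConnector:
        """Mimics the isinstance guard added to FileConnector.can_handle()."""

        @classmethod
        def can_handle(cls, source) -> bool:
            # Non-string sources (e.g. DataFrames) are simply declined.
            if not isinstance(source, str):
                return False
            return Path(source).suffix.lower() in {".csv", ".parquet", ".json"}

    class FakeFrame:
        shape, columns = (3, 2), ["a", "b"]  # DataFrame-like attributes

    assert DemoFileConnector.can_handle("orders.csv") is True
    assert DemoFileConnector.can_handle(FakeFrame()) is False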
duckguard/core/column.py CHANGED
@@ -1002,6 +1002,378 @@ class Column:
         rows = self._dataset.engine.fetch_all(sql)
         return {row[0]: row[1] for row in rows}
 
+    # =====================================================================
+    # Conditional Validation Methods (DuckGuard 3.0)
+    # =====================================================================
+
+    def not_null_when(
+        self,
+        condition: str,
+        threshold: float = 1.0
+    ) -> ValidationResult:
+        """Check column is not null when condition is true.
+
+        This enables sophisticated conditional validation like:
+        - "State must not be null when country = 'USA'"
+        - "Phone is required when contact_method = 'phone'"
+
+        Args:
+            condition: SQL WHERE clause condition (without WHERE keyword)
+            threshold: Minimum required non-null rate (0.0 to 1.0, default 1.0)
+
+        Returns:
+            ValidationResult with pass/fail status
+
+        Raises:
+            ValidationError: If condition is invalid or contains forbidden SQL
+
+        Examples:
+            >>> data = connect("customers.csv")
+            >>> # State required for US customers
+            >>> result = data.state.not_null_when("country = 'USA'")
+            >>> assert result.passed
+
+            >>> # Email required for registered users
+            >>> result = data.email.not_null_when("user_type = 'registered'")
+            >>> assert result.passed
+
+        Security:
+            Conditions are validated to prevent SQL injection. Only SELECT
+            queries with WHERE clauses are allowed.
+        """
+        from duckguard.checks.conditional import ConditionalCheckHandler
+
+        handler = ConditionalCheckHandler()
+        return handler.execute_not_null_when(
+            dataset=self._dataset,
+            column=self._name,
+            condition=condition,
+            threshold=threshold
+        )
+
+    def unique_when(
+        self,
+        condition: str,
+        threshold: float = 1.0
+    ) -> ValidationResult:
+        """Check column is unique when condition is true.
+
+        Args:
+            condition: SQL WHERE clause condition (without WHERE keyword)
+            threshold: Minimum required uniqueness rate (0.0 to 1.0, default 1.0)
+
+        Returns:
+            ValidationResult with pass/fail status
+
+        Examples:
+            >>> # Order IDs must be unique for completed orders
+            >>> result = data.order_id.unique_when("status = 'completed'")
+            >>> assert result.passed
+
+            >>> # Transaction IDs unique for successful transactions
+            >>> result = data.txn_id.unique_when("success = true")
+            >>> assert result.passed
+        """
+        from duckguard.checks.conditional import ConditionalCheckHandler
+
+        handler = ConditionalCheckHandler()
+        return handler.execute_unique_when(
+            dataset=self._dataset,
+            column=self._name,
+            condition=condition,
+            threshold=threshold
+        )
+
+    def between_when(
+        self,
+        min_val: float,
+        max_val: float,
+        condition: str,
+        threshold: float = 1.0
+    ) -> ValidationResult:
+        """Check column is between min and max when condition is true.
+
+        Args:
+            min_val: Minimum allowed value
+            max_val: Maximum allowed value
+            condition: SQL WHERE clause condition (without WHERE keyword)
+            threshold: Minimum required pass rate (0.0 to 1.0, default 1.0)
+
+        Returns:
+            ValidationResult with pass/fail status
+
+        Examples:
+            >>> # Discount between 0-50% for standard customers
+            >>> result = data.discount.between_when(
+            ...     min_val=0,
+            ...     max_val=50,
+            ...     condition="customer_tier = 'standard'"
+            ... )
+            >>> assert result.passed
+
+            >>> # Age between 18-65 for employees
+            >>> result = data.age.between_when(18, 65, "type = 'employee'")
+            >>> assert result.passed
+        """
+        from duckguard.checks.conditional import ConditionalCheckHandler
+
+        handler = ConditionalCheckHandler()
+        return handler.execute_between_when(
+            dataset=self._dataset,
+            column=self._name,
+            min_value=min_val,
+            max_value=max_val,
+            condition=condition,
+            threshold=threshold
+        )
+
+    def isin_when(
+        self,
+        allowed_values: list[Any],
+        condition: str,
+        threshold: float = 1.0
+    ) -> ValidationResult:
+        """Check column is in allowed values when condition is true.
+
+        Args:
+            allowed_values: List of allowed values
+            condition: SQL WHERE clause condition (without WHERE keyword)
+            threshold: Minimum required pass rate (0.0 to 1.0, default 1.0)
+
+        Returns:
+            ValidationResult with pass/fail status
+
+        Examples:
+            >>> # Status must be specific values for paid orders
+            >>> result = data.status.isin_when(
+            ...     allowed_values=['shipped', 'delivered'],
+            ...     condition="payment_status = 'paid'"
+            ... )
+            >>> assert result.passed
+
+            >>> # Category restricted for active products
+            >>> result = data.category.isin_when(
+            ...     ['A', 'B', 'C'],
+            ...     "is_active = true"
+            ... )
+            >>> assert result.passed
+        """
+        from duckguard.checks.conditional import ConditionalCheckHandler
+
+        handler = ConditionalCheckHandler()
+        return handler.execute_isin_when(
+            dataset=self._dataset,
+            column=self._name,
+            allowed_values=allowed_values,
+            condition=condition,
+            threshold=threshold
+        )
+
+    def matches_when(
+        self,
+        pattern: str,
+        condition: str,
+        threshold: float = 1.0
+    ) -> ValidationResult:
+        """Check column matches pattern when condition is true.
+
+        Args:
+            pattern: Regular expression pattern to match
+            condition: SQL WHERE clause condition (without WHERE keyword)
+            threshold: Minimum required pass rate (0.0 to 1.0, default 1.0)
+
+        Returns:
+            ValidationResult with pass/fail status
+
+        Examples:
+            >>> # Email format required for email notifications
+            >>> result = data.contact.matches_when(
+            ...     pattern=r'^[\\w.-]+@[\\w.-]+\\.\\w+$',
+            ...     condition="notification_type = 'email'"
+            ... )
+            >>> assert result.passed
+
+            >>> # Phone format required for SMS
+            >>> result = data.contact.matches_when(
+            ...     pattern=r'^\\+?[0-9]{10,15}$',
+            ...     condition="notification_type = 'sms'"
+            ... )
+            >>> assert result.passed
+        """
+        from duckguard.checks.conditional import ConditionalCheckHandler
+
+        handler = ConditionalCheckHandler()
+        return handler.execute_pattern_when(
+            dataset=self._dataset,
+            column=self._name,
+            pattern=pattern,
+            condition=condition,
+            threshold=threshold
+        )
+
+    # =================================================================
+    # Distributional Checks (DuckGuard 3.0)
+    # =================================================================
+
+    def expect_distribution_normal(
+        self,
+        significance_level: float = 0.05
+    ) -> ValidationResult:
+        """Check if column data follows a normal distribution.
+
+        Uses Kolmogorov-Smirnov test comparing data to fitted normal distribution.
+
+        Args:
+            significance_level: Significance level for test (default 0.05)
+
+        Returns:
+            ValidationResult (passed if p-value > significance_level)
+
+        Examples:
+            >>> # Test if temperature measurements are normally distributed
+            >>> result = data.temperature.expect_distribution_normal()
+            >>> assert result.passed
+
+            >>> # Use stricter significance level
+            >>> result = data.measurement.expect_distribution_normal(
+            ...     significance_level=0.01
+            ... )
+
+        Note:
+            Requires scipy: pip install 'duckguard[statistics]'
+            Requires minimum 30 samples for reliable results.
+        """
+        from duckguard.checks.distributional import DistributionalCheckHandler
+
+        handler = DistributionalCheckHandler()
+        return handler.execute_distribution_normal(
+            dataset=self._dataset,
+            column=self._name,
+            significance_level=significance_level
+        )
+
+    def expect_distribution_uniform(
+        self,
+        significance_level: float = 0.05
+    ) -> ValidationResult:
+        """Check if column data follows a uniform distribution.
+
+        Uses Kolmogorov-Smirnov test comparing data to uniform distribution.
+
+        Args:
+            significance_level: Significance level for test (default 0.05)
+
+        Returns:
+            ValidationResult (passed if p-value > significance_level)
+
+        Examples:
+            >>> # Test if random numbers are uniformly distributed
+            >>> result = data.random_value.expect_distribution_uniform()
+            >>> assert result.passed
+
+            >>> # Test dice rolls for fairness
+            >>> result = data.dice_roll.expect_distribution_uniform()
+
+        Note:
+            Requires scipy: pip install 'duckguard[statistics]'
+            Requires minimum 30 samples for reliable results.
+        """
+        from duckguard.checks.distributional import DistributionalCheckHandler
+
+        handler = DistributionalCheckHandler()
+        return handler.execute_distribution_uniform(
+            dataset=self._dataset,
+            column=self._name,
+            significance_level=significance_level
+        )
+
+    def expect_ks_test(
+        self,
+        distribution: str = "norm",
+        significance_level: float = 0.05
+    ) -> ValidationResult:
+        """Perform Kolmogorov-Smirnov test for specified distribution.
+
+        Args:
+            distribution: Distribution name ('norm', 'uniform', 'expon', etc.)
+            significance_level: Significance level for test (default 0.05)
+
+        Returns:
+            ValidationResult (passed if p-value > significance_level)
+
+        Examples:
+            >>> # Test for normal distribution
+            >>> result = data.values.expect_ks_test(distribution='norm')
+            >>> assert result.passed
+
+            >>> # Test for exponential distribution
+            >>> result = data.wait_times.expect_ks_test(
+            ...     distribution='expon',
+            ...     significance_level=0.01
+            ... )
+
+        Note:
+            Requires scipy: pip install 'duckguard[statistics]'
+            Supported distributions: norm, uniform, expon, gamma, beta, etc.
+        """
+        from duckguard.checks.distributional import DistributionalCheckHandler
+
+        handler = DistributionalCheckHandler()
+        return handler.execute_ks_test(
+            dataset=self._dataset,
+            column=self._name,
+            distribution=distribution,
+            significance_level=significance_level
+        )
+
+    def expect_chi_square_test(
+        self,
+        expected_frequencies: dict | None = None,
+        significance_level: float = 0.05
+    ) -> ValidationResult:
+        """Perform chi-square goodness-of-fit test for categorical data.
+
+        Tests if observed frequencies match expected frequencies.
+
+        Args:
+            expected_frequencies: Dict mapping categories to expected proportions
+                (see examples). If None, a uniform distribution is assumed.
+            significance_level: Significance level for test (default 0.05)
+
+        Returns:
+            ValidationResult (passed if p-value > significance_level)
+
+        Examples:
+            >>> # Test if dice is fair (uniform distribution)
+            >>> result = data.dice_roll.expect_chi_square_test()
+            >>> assert result.passed
+
+            >>> # Test with specific expected frequencies
+            >>> expected = {1: 1/6, 2: 1/6, 3: 1/6, 4: 1/6, 5: 1/6, 6: 1/6}
+            >>> result = data.dice_roll.expect_chi_square_test(
+            ...     expected_frequencies=expected
+            ... )
+
+            >>> # Test categorical distribution
+            >>> expected = {'A': 0.5, 'B': 0.3, 'C': 0.2}
+            >>> result = data.category.expect_chi_square_test(
+            ...     expected_frequencies=expected
+            ... )
+
+        Note:
+            Requires scipy: pip install 'duckguard[statistics]'
+            Requires minimum 30 samples for reliable results.
+        """
+        from duckguard.checks.distributional import DistributionalCheckHandler
+
+        handler = DistributionalCheckHandler()
+        return handler.execute_chi_square_test(
+            dataset=self._dataset,
+            column=self._name,
+            expected_frequencies=expected_frequencies,
+            significance_level=significance_level
+        )
+
     def clear_cache(self) -> None:
         """Clear cached statistics."""
         self._stats_cache = None
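The column-level additions above split into two groups: conditional checks that apply a rule only to rows matching a SQL predicate, and distributional checks that run statistical tests over the whole column. A short usage sketch, assuming a local customers.csv with the columns used in the docstring examples (state, country, age, type) plus a hypothetical signup_delay column, and the optional scipy extra for the distributional part:

    from duckguard import connect  # assumed top-level export

    data = connect("customers.csv")

    # Conditional rule: state is mandatory only for US rows.
    print(data.state.not_null_when("country = 'USA'").passed)

    # Conditional range: employee ages must fall between 18 and 65.
    print(data.age.between_when(18, 65, "type = 'employee'").passed)

    # Distributional check: are signup delays roughly exponential?
    # Requires: pip install 'duckguard[statistics]'
    print(data.signup_delay.expect_ks_test(distribution="expon").passed)  # hypothetical column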