duckguard 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +110 -0
- duckguard/anomaly/__init__.py +34 -0
- duckguard/anomaly/detector.py +394 -0
- duckguard/anomaly/methods.py +432 -0
- duckguard/cli/__init__.py +5 -0
- duckguard/cli/main.py +706 -0
- duckguard/connectors/__init__.py +58 -0
- duckguard/connectors/base.py +80 -0
- duckguard/connectors/bigquery.py +171 -0
- duckguard/connectors/databricks.py +201 -0
- duckguard/connectors/factory.py +292 -0
- duckguard/connectors/files.py +135 -0
- duckguard/connectors/kafka.py +343 -0
- duckguard/connectors/mongodb.py +236 -0
- duckguard/connectors/mysql.py +121 -0
- duckguard/connectors/oracle.py +196 -0
- duckguard/connectors/postgres.py +99 -0
- duckguard/connectors/redshift.py +154 -0
- duckguard/connectors/snowflake.py +226 -0
- duckguard/connectors/sqlite.py +112 -0
- duckguard/connectors/sqlserver.py +242 -0
- duckguard/contracts/__init__.py +48 -0
- duckguard/contracts/diff.py +432 -0
- duckguard/contracts/generator.py +334 -0
- duckguard/contracts/loader.py +367 -0
- duckguard/contracts/schema.py +242 -0
- duckguard/contracts/validator.py +453 -0
- duckguard/core/__init__.py +8 -0
- duckguard/core/column.py +437 -0
- duckguard/core/dataset.py +284 -0
- duckguard/core/engine.py +261 -0
- duckguard/core/result.py +119 -0
- duckguard/core/scoring.py +508 -0
- duckguard/profiler/__init__.py +5 -0
- duckguard/profiler/auto_profile.py +350 -0
- duckguard/pytest_plugin/__init__.py +5 -0
- duckguard/pytest_plugin/plugin.py +161 -0
- duckguard/reporting/__init__.py +6 -0
- duckguard/reporting/console.py +88 -0
- duckguard/reporting/json_report.py +96 -0
- duckguard/rules/__init__.py +28 -0
- duckguard/rules/executor.py +616 -0
- duckguard/rules/generator.py +341 -0
- duckguard/rules/loader.py +483 -0
- duckguard/rules/schema.py +289 -0
- duckguard/semantic/__init__.py +31 -0
- duckguard/semantic/analyzer.py +270 -0
- duckguard/semantic/detector.py +459 -0
- duckguard/semantic/validators.py +354 -0
- duckguard/validators/__init__.py +7 -0
- duckguard-2.0.0.dist-info/METADATA +221 -0
- duckguard-2.0.0.dist-info/RECORD +55 -0
- duckguard-2.0.0.dist-info/WHEEL +4 -0
- duckguard-2.0.0.dist-info/entry_points.txt +5 -0
- duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
|
@@ -0,0 +1,459 @@
|
|
|
1
|
+
"""Semantic type detector for DuckGuard.
|
|
2
|
+
|
|
3
|
+
Automatically identifies the semantic meaning of data columns based on:
|
|
4
|
+
- Column names (e.g., "email", "phone_number")
|
|
5
|
+
- Data patterns (e.g., regex matching)
|
|
6
|
+
- Value distributions
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from enum import Enum
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class SemanticType(Enum):
    """Semantic types that can be detected.

    Each member's string value is the stable identifier reported by the
    detector (e.g. in ``SemanticTypeResult.metadata["all_candidates"]``).
    """

    # Identity types
    PRIMARY_KEY = "primary_key"
    FOREIGN_KEY = "foreign_key"
    UUID = "uuid"
    ID = "id"

    # Contact information
    EMAIL = "email"
    PHONE = "phone"
    URL = "url"
    IP_ADDRESS = "ip_address"

    # Personal information (PII)
    SSN = "ssn"
    CREDIT_CARD = "credit_card"
    PERSON_NAME = "person_name"
    ADDRESS = "address"

    # Location
    COUNTRY = "country"
    COUNTRY_CODE = "country_code"
    STATE = "state"
    CITY = "city"
    ZIPCODE = "zipcode"
    LATITUDE = "latitude"
    LONGITUDE = "longitude"

    # Date/Time
    DATE = "date"
    DATETIME = "datetime"
    TIME = "time"
    TIMESTAMP = "timestamp"
    YEAR = "year"
    MONTH = "month"
    DAY = "day"

    # Numeric
    CURRENCY = "currency"
    PERCENTAGE = "percentage"
    QUANTITY = "quantity"
    AGE = "age"
    COUNT = "count"

    # Categorical
    BOOLEAN = "boolean"
    ENUM = "enum"
    STATUS = "status"
    CATEGORY = "category"
    GENDER = "gender"

    # Text
    TEXT = "text"
    DESCRIPTION = "description"
    TITLE = "title"
    SLUG = "slug"
    CODE = "code"

    # Fallback returned when no detection rule fires
    UNKNOWN = "unknown"
@dataclass
class SemanticTypeResult:
    """Result of semantic type detection.

    Attributes:
        semantic_type: The detected semantic type (``SemanticType.UNKNOWN``
            when no rule matched)
        confidence: Confidence score (0-1)
        reasons: Human-readable reasons for the detection
        is_pii: Whether this is personally identifiable information
        suggested_validations: List of suggested validation rules
        metadata: Additional detection metadata; when produced by
            ``SemanticTypeDetector.detect`` it contains ``"column_name"``
            and ``"all_candidates"`` (type value -> score)
    """

    semantic_type: SemanticType
    confidence: float
    reasons: list[str] = field(default_factory=list)
    is_pii: bool = False
    suggested_validations: list[str] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)
# Column name patterns for detection.
#
# Each semantic type maps to regex patterns tried against the lowercased,
# hyphen-normalized column name. NOTE: the detector uses ``re.match``, so
# every pattern is anchored at the START of the name but not at the end
# unless it carries an explicit ``$`` (e.g. r"name" also matches
# "name_server"; short patterns like r"cc" match any "cc..." prefix).
NAME_PATTERNS: dict[SemanticType, list[str]] = {
    SemanticType.EMAIL: [
        r"e?mail", r"email_?addr(ess)?", r"user_?email", r"contact_?email"
    ],
    SemanticType.PHONE: [
        r"phone", r"tel(ephone)?", r"mobile", r"cell", r"fax",
        r"phone_?(num(ber)?)?", r"contact_?phone"
    ],
    SemanticType.URL: [
        r"url", r"link", r"href", r"website", r"web_?addr(ess)?", r"uri"
    ],
    SemanticType.UUID: [
        r"uuid", r"guid", r".*_uuid$", r".*_guid$"
    ],
    SemanticType.PRIMARY_KEY: [
        r"^id$", r".*_id$", r"pk", r"primary_?key"
    ],
    SemanticType.FOREIGN_KEY: [
        r"fk_.*", r".*_fk$", r"ref_.*", r".*_ref$"
    ],
    SemanticType.SSN: [
        r"ssn", r"social_?security", r"tax_?id", r"sin"
    ],
    SemanticType.CREDIT_CARD: [
        r"cc_?(num(ber)?)?", r"card_?(num(ber)?)?", r"credit_?card",
        r"pan", r"payment_?card"
    ],
    SemanticType.PERSON_NAME: [
        r"name", r"first_?name", r"last_?name", r"full_?name",
        r"given_?name", r"surname", r"family_?name"
    ],
    SemanticType.ADDRESS: [
        r"addr(ess)?", r"street", r"address_?line", r"street_?addr(ess)?"
    ],
    SemanticType.COUNTRY: [
        r"country", r"nation", r"country_?name"
    ],
    SemanticType.COUNTRY_CODE: [
        r"country_?code", r"iso_?country", r"cc"
    ],
    SemanticType.STATE: [
        r"state", r"province", r"region", r"state_?code"
    ],
    SemanticType.CITY: [
        r"city", r"town", r"municipality"
    ],
    SemanticType.ZIPCODE: [
        r"zip", r"zip_?code", r"postal", r"postal_?code", r"postcode"
    ],
    SemanticType.LATITUDE: [
        r"lat(itude)?", r"geo_?lat"
    ],
    SemanticType.LONGITUDE: [
        r"lon(g)?(itude)?", r"lng", r"geo_?lon(g)?"
    ],
    SemanticType.DATE: [
        r"date", r".*_date$", r".*_dt$", r"dob", r"birth_?date"
    ],
    SemanticType.DATETIME: [
        r"datetime", r".*_datetime$", r"timestamp"
    ],
    SemanticType.TIME: [
        r"^time$", r".*_time$"
    ],
    SemanticType.TIMESTAMP: [
        r"timestamp", r".*_ts$", r"created_?at", r"updated_?at",
        r"modified_?at", r"deleted_?at"
    ],
    SemanticType.YEAR: [
        r"year", r"yr"
    ],
    SemanticType.MONTH: [
        r"month", r"mo"
    ],
    SemanticType.CURRENCY: [
        r"amount", r"price", r"cost", r"total", r"subtotal",
        r"revenue", r"salary", r"fee", r"charge", r"balance",
        r"payment", r".*_amt$", r".*_amount$"
    ],
    SemanticType.PERCENTAGE: [
        r"percent(age)?", r"rate", r"ratio", r"pct", r".*_pct$"
    ],
    SemanticType.QUANTITY: [
        r"qty", r"quantity", r"count", r"num(ber)?", r".*_qty$"
    ],
    SemanticType.AGE: [
        r"age", r"years_?old"
    ],
    SemanticType.BOOLEAN: [
        r"is_.*", r"has_.*", r"can_.*", r"should_.*", r"enabled",
        r"disabled", r"active", r"flag", r".*_flag$"
    ],
    SemanticType.STATUS: [
        r"status", r"state", r"stage", r"phase"
    ],
    SemanticType.CATEGORY: [
        r"type", r"category", r"kind", r"class", r"group", r".*_type$"
    ],
    SemanticType.GENDER: [
        r"gender", r"sex"
    ],
    SemanticType.DESCRIPTION: [
        r"desc(ription)?", r"summary", r"notes?", r"comment", r"remarks?"
    ],
    SemanticType.TITLE: [
        r"title", r"subject", r"heading", r"headline"
    ],
    SemanticType.SLUG: [
        r"slug", r"permalink", r"url_?key"
    ],
    SemanticType.IP_ADDRESS: [
        r"ip", r"ip_?addr(ess)?", r"client_?ip", r"remote_?ip"
    ],
    SemanticType.CODE: [
        r"code", r".*_code$"
    ],
}
# Value patterns for detection.
#
# One regex per semantic type, applied (case-insensitively, via ``re.match``)
# to stringified sample values; a high match rate across the sample boosts
# that type's score. All patterns here are fully anchored with ^...$ except
# DATETIME, which deliberately allows trailing fractional seconds / offsets.
VALUE_PATTERNS: dict[SemanticType, str] = {
    SemanticType.EMAIL: r"^[\w\.\-\+]+@[\w\.\-]+\.[a-zA-Z]{2,}$",
    SemanticType.PHONE: r"^\+?[\d\s\-\(\)\.]{10,}$",
    SemanticType.URL: r"^https?://[\w\.\-]+(/[\w\.\-\?=&%/]*)?$",
    SemanticType.UUID: r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$",
    SemanticType.SSN: r"^\d{3}-\d{2}-\d{4}$",
    SemanticType.CREDIT_CARD: r"^\d{4}[\s\-]?\d{4}[\s\-]?\d{4}[\s\-]?\d{4}$",
    SemanticType.IP_ADDRESS: r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$",
    SemanticType.ZIPCODE: r"^\d{5}(-\d{4})?$",
    SemanticType.DATE: r"^\d{4}-\d{2}-\d{2}$",
    SemanticType.DATETIME: r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}",
    SemanticType.TIME: r"^\d{2}:\d{2}(:\d{2})?$",
    SemanticType.COUNTRY_CODE: r"^[A-Z]{2,3}$",
    SemanticType.SLUG: r"^[a-z0-9]+(?:-[a-z0-9]+)*$",
    SemanticType.LATITUDE: r"^-?([1-8]?\d(\.\d+)?|90(\.0+)?)$",
    SemanticType.LONGITUDE: r"^-?(1[0-7]\d(\.\d+)?|180(\.0+)?|\d{1,2}(\.\d+)?)$",
}
# PII types that should be flagged.
#
# Any detection resolving to one of these sets ``is_pii=True`` on the
# resulting SemanticTypeResult.
PII_TYPES = {
    SemanticType.EMAIL,
    SemanticType.PHONE,
    SemanticType.SSN,
    SemanticType.CREDIT_CARD,
    SemanticType.PERSON_NAME,
    SemanticType.ADDRESS,
}
# Suggested validations per type.
#
# Copied into ``SemanticTypeResult.suggested_validations`` for the winning
# type; entries are rule strings (presumably consumed by the rules engine
# elsewhere in the package — not interpreted in this module).
TYPE_VALIDATIONS: dict[SemanticType, list[str]] = {
    SemanticType.EMAIL: ["pattern: email", "unique"],
    SemanticType.PHONE: ["pattern: phone"],
    SemanticType.URL: ["pattern: url"],
    SemanticType.UUID: ["pattern: uuid", "unique"],
    SemanticType.PRIMARY_KEY: ["not_null", "unique"],
    SemanticType.FOREIGN_KEY: ["not_null"],
    SemanticType.SSN: ["pattern: ssn"],
    SemanticType.CREDIT_CARD: ["pattern: credit_card"],
    SemanticType.IP_ADDRESS: ["pattern: ip_address"],
    SemanticType.ZIPCODE: ["pattern: zipcode"],
    SemanticType.DATE: ["pattern: date_iso"],
    SemanticType.DATETIME: ["pattern: datetime_iso"],
    SemanticType.CURRENCY: ["non_negative"],
    SemanticType.PERCENTAGE: ["range: [0, 100]"],
    SemanticType.QUANTITY: ["non_negative"],
    SemanticType.AGE: ["range: [0, 150]"],
    SemanticType.LATITUDE: ["range: [-90, 90]"],
    SemanticType.LONGITUDE: ["range: [-180, 180]"],
    SemanticType.BOOLEAN: ["allowed_values: [true, false]"],
    SemanticType.COUNTRY_CODE: ["pattern: country_code"],
}
|
275
|
+
def detect_type(
    dataset_or_name,
    column_name: str | None = None,
    sample_values: list[Any] | None = None,
    unique_percent: float | None = None,
    null_percent: float | None = None,
) -> SemanticType:
    """Detect the semantic type of a column.

    Can be called two ways:
    1. detect_type(dataset, "column_name") - high-level API
    2. detect_type("column_name", sample_values=[...]) - low-level API

    Args:
        dataset_or_name: Either a Dataset object or column name string
        column_name: Column name (when first arg is Dataset)
        sample_values: Sample values from the column (low-level API)
        unique_percent: Percentage of unique values (low-level API)
        null_percent: Percentage of null values (low-level API)

    Returns:
        SemanticType enum value. When nothing is detected this is
        ``SemanticType.UNKNOWN`` — the function never returns None.
        (The previous ``SemanticType | None`` annotation was wrong:
        the detector always produces a SemanticType.)
    """
    detector = SemanticTypeDetector()

    # High-level API: first argument is Dataset-like (duck-typed on the
    # presence of a `.columns` attribute) and a column name was given.
    if hasattr(dataset_or_name, "columns") and column_name is not None:
        col = dataset_or_name[column_name]
        try:
            # Best-effort sampling; fall back to an empty sample if the
            # backend raises (name/statistics evidence still applies).
            sample = col.get_distinct_values(limit=100)
        except Exception:
            sample = []

        return detector.detect(
            column_name,
            sample,
            col.unique_percent,
            col.null_percent,
        ).semantic_type

    # Low-level API: first argument is the column name itself.
    return detector.detect(
        str(dataset_or_name),
        sample_values or [],
        unique_percent,
        null_percent,
    ).semantic_type
|
|
327
|
+
def detect_types_for_dataset(dataset) -> dict[str, SemanticType | None]:
    """Detect semantic types for all columns in a dataset.

    Args:
        dataset: Dataset to analyze

    Returns:
        Dict mapping column names to SemanticType (or None if unknown)
    """
    detector = SemanticTypeDetector()

    def _sample(column) -> list[Any]:
        # Best-effort sampling: an empty sample is fine, the detector can
        # still use name and statistics evidence.
        try:
            return column.get_distinct_values(limit=100)
        except Exception:
            return []

    detected: dict[str, SemanticType | None] = {}
    for name in dataset.columns:
        column = dataset[name]
        outcome = detector.detect(
            name,
            _sample(column),
            column.unique_percent,
            column.null_percent,
        )
        detected[name] = outcome.semantic_type

    return detected
+
|
|
357
|
+
class SemanticTypeDetector:
    """Detects semantic types for data columns.

    Accumulates a weighted score per candidate SemanticType from four kinds
    of evidence (column name, value patterns, uniqueness/null statistics,
    value-set shape) and reports the highest-scoring candidate.
    """

    def __init__(self):
        # Instances reference the module-level pattern tables so the tables
        # can be inspected or swapped on a per-instance basis.
        self.name_patterns = NAME_PATTERNS
        self.value_patterns = VALUE_PATTERNS

    def detect(
        self,
        column_name: str,
        sample_values: list[Any] | None = None,
        unique_percent: float | None = None,
        null_percent: float | None = None,
    ) -> SemanticTypeResult:
        """Detect semantic type for a column.

        Args:
            column_name: Name of the column.
            sample_values: Optional sample of values from the column.
            unique_percent: Optional percentage of unique values (0-100).
            null_percent: Optional percentage of null values (0-100).

        Returns:
            SemanticTypeResult; ``SemanticType.UNKNOWN`` with confidence 0.0
            when no evidence was found.
        """
        reasons: list[str] = []
        candidates: dict[SemanticType, float] = {}

        # 1. Column-name evidence (+0.4 per type, at most once per type).
        # re.match anchors only at the start, so patterns behave as
        # case-insensitive prefixes unless they end with $.
        name_lower = column_name.lower().replace("-", "_")
        for sem_type, patterns in self.name_patterns.items():
            for pattern in patterns:
                if re.match(pattern, name_lower, re.IGNORECASE):
                    candidates[sem_type] = candidates.get(sem_type, 0) + 0.4
                    reasons.append(f"Column name matches '{sem_type.value}' pattern")
                    break  # one matching pattern per type is enough

        # 2. Value-pattern evidence on at most 50 stringified non-null
        # samples: +0.5 for a strong (>=80%) match rate, +0.3 for a
        # moderate (>=50%) one.
        if sample_values:
            string_values = [str(v) for v in sample_values if v is not None]
            if string_values:
                for sem_type, pattern in self.value_patterns.items():
                    match_count = sum(
                        1 for v in string_values[:50]
                        if re.match(pattern, v, re.IGNORECASE)
                    )
                    match_rate = match_count / min(len(string_values), 50)

                    if match_rate >= 0.8:
                        candidates[sem_type] = candidates.get(sem_type, 0) + 0.5
                        reasons.append(
                            f"{match_rate:.0%} of values match {sem_type.value} pattern"
                        )
                    elif match_rate >= 0.5:
                        candidates[sem_type] = candidates.get(sem_type, 0) + 0.3
                        reasons.append(
                            f"{match_rate:.0%} of values match {sem_type.value} pattern"
                        )

        # 3. Uniqueness evidence: fully unique and null-free suggests a
        # primary key (+0.3 fresh, +0.2 on top of a name match).
        if unique_percent is not None:
            if unique_percent == 100 and null_percent == 0:
                if SemanticType.PRIMARY_KEY not in candidates:
                    candidates[SemanticType.PRIMARY_KEY] = 0.3
                else:
                    candidates[SemanticType.PRIMARY_KEY] += 0.2
                reasons.append("100% unique with no nulls suggests primary key")

        # 4. Low cardinality suggests an enum/categorical column (+0.3).
        if sample_values and unique_percent is not None:
            unique_count = len(set(sample_values))
            if unique_count <= 20 and unique_percent < 5:
                candidates[SemanticType.ENUM] = candidates.get(SemanticType.ENUM, 0) + 0.3
                reasons.append(f"Low cardinality ({unique_count} values) suggests enum")

        # 5. Boolean-like value sets (+0.5). BUGFIX: require a non-empty
        # set — previously an all-null (but non-empty) sample produced an
        # empty values_set, which is a subset of bool_values with len 0 <= 2,
        # so the column was wrongly scored as BOOLEAN.
        if sample_values:
            values_set = set(str(v).lower() for v in sample_values if v is not None)
            bool_values = {"true", "false", "yes", "no", "1", "0", "t", "f", "y", "n"}
            if values_set and values_set.issubset(bool_values) and len(values_set) <= 2:
                candidates[SemanticType.BOOLEAN] = candidates.get(SemanticType.BOOLEAN, 0) + 0.5
                reasons.append("Values are boolean-like")

        # No evidence at all: report UNKNOWN with zero confidence.
        if not candidates:
            return SemanticTypeResult(
                semantic_type=SemanticType.UNKNOWN,
                confidence=0.0,
                reasons=["No semantic type detected"],
            )

        # Highest score wins; ties resolve to the first-inserted candidate
        # (max() returns the first maximal key). Confidence is capped at 1.0.
        best_type = max(candidates, key=lambda t: candidates[t])
        confidence = min(candidates[best_type], 1.0)

        return SemanticTypeResult(
            semantic_type=best_type,
            confidence=confidence,
            reasons=reasons,
            is_pii=best_type in PII_TYPES,
            suggested_validations=TYPE_VALIDATIONS.get(best_type, []),
            metadata={
                "column_name": column_name,
                "all_candidates": {t.value: s for t, s in candidates.items()},
            },
        )