datawash-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datawash/__init__.py +9 -0
- datawash/adapters/__init__.py +12 -0
- datawash/adapters/base.py +66 -0
- datawash/adapters/csv_adapter.py +23 -0
- datawash/adapters/excel_adapter.py +36 -0
- datawash/adapters/json_adapter.py +21 -0
- datawash/adapters/parquet_adapter.py +34 -0
- datawash/cli/__init__.py +0 -0
- datawash/cli/formatters.py +110 -0
- datawash/cli/main.py +168 -0
- datawash/codegen/__init__.py +1 -0
- datawash/codegen/generator.py +72 -0
- datawash/core/__init__.py +1 -0
- datawash/core/cache.py +64 -0
- datawash/core/config.py +56 -0
- datawash/core/dtypes.py +24 -0
- datawash/core/exceptions.py +21 -0
- datawash/core/models.py +78 -0
- datawash/core/report.py +430 -0
- datawash/core/sampling.py +84 -0
- datawash/detectors/__init__.py +13 -0
- datawash/detectors/base.py +27 -0
- datawash/detectors/duplicate_detector.py +56 -0
- datawash/detectors/format_detector.py +130 -0
- datawash/detectors/missing_detector.py +78 -0
- datawash/detectors/outlier_detector.py +93 -0
- datawash/detectors/registry.py +64 -0
- datawash/detectors/similarity_detector.py +294 -0
- datawash/detectors/type_detector.py +100 -0
- datawash/profiler/__init__.py +1 -0
- datawash/profiler/engine.py +88 -0
- datawash/profiler/parallel.py +122 -0
- datawash/profiler/patterns.py +80 -0
- datawash/profiler/statistics.py +41 -0
- datawash/suggestors/__init__.py +1 -0
- datawash/suggestors/base.py +15 -0
- datawash/suggestors/engine.py +327 -0
- datawash/suggestors/prioritizer.py +23 -0
- datawash/transformers/__init__.py +13 -0
- datawash/transformers/base.py +27 -0
- datawash/transformers/categories.py +64 -0
- datawash/transformers/columns.py +72 -0
- datawash/transformers/duplicates.py +43 -0
- datawash/transformers/formats.py +95 -0
- datawash/transformers/missing.py +201 -0
- datawash/transformers/registry.py +30 -0
- datawash/transformers/types.py +95 -0
- datawash-0.2.0.dist-info/METADATA +353 -0
- datawash-0.2.0.dist-info/RECORD +53 -0
- datawash-0.2.0.dist-info/WHEEL +5 -0
- datawash-0.2.0.dist-info/entry_points.txt +2 -0
- datawash-0.2.0.dist-info/licenses/LICENSE +21 -0
- datawash-0.2.0.dist-info/top_level.txt +1 -0
datawash/detectors/similarity_detector.py
@@ -0,0 +1,294 @@
"""Similar column detector using MinHash + LSH.

Multi-stage similarity detection with O(n) average complexity:
1. N-gram blocking for column names
2. MinHash + LSH for column values
3. Size filtering to prune candidates
4. Exact verification only on candidates

Based on:
- VLDB 2016: "An Empirical Evaluation of Set Similarity Join Techniques"
- Broder 1997: "On the resemblance and containment of documents" (MinHash)
"""

from __future__ import annotations

from collections import Counter, defaultdict
from typing import Any

import pandas as pd

from datawash.core.models import DatasetProfile, Finding, Severity
from datawash.detectors.base import BaseDetector
from datawash.detectors.registry import register_detector


class SimilarityDetector(BaseDetector):
    """Detect similar columns using MinHash + LSH.

    Uses a multi-stage pipeline:
    - Stage 1: N-gram blocking for name similarity
    - Stage 2: MinHash signatures + LSH banding for value similarity
    - Stage 3: Exact verification only on candidate pairs
    """

    # Thresholds
    NAME_SIMILARITY_THRESHOLD = 0.7
    VALUE_OVERLAP_THRESHOLD = 0.8
    COMBINED_THRESHOLD = 0.6

    # Algorithm parameters
    NGRAM_SIZE = 2
    MIN_SHARED_NGRAMS = 2
    MINHASH_SIGNATURES = 100
    LSH_BANDS = 20
    MAX_UNIQUE_VALUES = 10000
    MAX_CANDIDATES = 5000  # Cap to avoid O(n^2) blowup on wide datasets

    def __init__(
        self, name_threshold: float = 0.7, value_threshold: float = 0.8
    ) -> None:
        self._name_threshold = name_threshold
        self._value_threshold = value_threshold

    @property
    def name(self) -> str:
        return "similarity"

    @property
    def description(self) -> str:
        return "Detects similar or potentially duplicate columns"

    def detect(
        self, df: pd.DataFrame, profile: DatasetProfile, config: Any = None
    ) -> list[Finding]:
        columns = list(df.columns)
        n_cols = len(columns)

        if n_cols < 2:
            return []

        # Precompute value sets (use unique() first to reduce work)
        column_value_sets: dict[int, set[str]] = {}
        for idx, col in enumerate(columns):
            unique_vals = df[col].dropna().unique()
            if len(unique_vals) > self.MAX_UNIQUE_VALUES:
                column_value_sets[idx] = set()  # Too many unique values (likely IDs)
            else:
                column_value_sets[idx] = set(str(v) for v in unique_vals)

        # Stage 1: Blocking
        name_candidates = self._ngram_blocking(columns)
        value_candidates = self._minhash_lsh_blocking(columns, column_value_sets)

        # Stage 2: Merge candidates (cap to avoid blowup)
        all_candidates = name_candidates | value_candidates
        if len(all_candidates) > self.MAX_CANDIDATES:
            all_candidates = set(list(all_candidates)[: self.MAX_CANDIDATES])

        # Stage 3: Verify only candidates
        findings: list[Finding] = []
        for i, j in all_candidates:
            finding = self._verify_pair(columns, i, j, column_value_sets)
            if finding:
                findings.append(finding)

        return findings

    # -----------------------------------------------------------------
    # STAGE 1a: N-GRAM BLOCKING FOR NAMES
    # -----------------------------------------------------------------

    def _ngram_blocking(self, columns: list[str]) -> set[tuple[int, int]]:
        """Find candidate pairs based on shared character n-grams.

        Complexity: O(n * k) where k = average column name length
        """
        ngram_index: dict[str, list[int]] = defaultdict(list)

        for idx, col in enumerate(columns):
            ngrams = self._get_ngrams(col.lower())
            for ng in ngrams:
                ngram_index[ng].append(idx)

        pair_counts: Counter[tuple[int, int]] = Counter()
        for indices in ngram_index.values():
            if len(indices) < 2:
                continue
            for i in range(len(indices)):
                for j in range(i + 1, len(indices)):
                    pair = (min(indices[i], indices[j]), max(indices[i], indices[j]))
                    pair_counts[pair] += 1

        return {
            pair
            for pair, count in pair_counts.items()
            if count >= self.MIN_SHARED_NGRAMS
        }

    def _get_ngrams(self, s: str) -> set[str]:
        """Extract character n-grams from string."""
        n = self.NGRAM_SIZE
        if len(s) < n:
            return {s}
        return {s[i : i + n] for i in range(len(s) - n + 1)}

    # -----------------------------------------------------------------
    # STAGE 1b + 2: MINHASH + LSH BLOCKING FOR VALUES
    # -----------------------------------------------------------------

    def _minhash_lsh_blocking(
        self,
        columns: list[str],
        column_value_sets: dict[int, set[str]],
    ) -> set[tuple[int, int]]:
        """Find candidate pairs using MinHash signatures + LSH banding.

        MinHash: O(n * v * m) where m = signature size
        LSH: O(n * b) where b = number of bands
        """
        n_cols = len(columns)

        # Generate MinHash signatures for each column
        signatures: list[list[int]] = []
        sizes: list[int] = []
        for idx in range(n_cols):
            val_set = column_value_sets.get(idx, set())
            signatures.append(self._minhash_signature(val_set))
            sizes.append(len(val_set))

        # LSH Banding: hash signature bands into buckets
        candidates: set[tuple[int, int]] = set()
        rows_per_band = self.MINHASH_SIGNATURES // self.LSH_BANDS

        # Size filter ratio: for Jaccard >= t, ratio must be <= (2-t)/t
        max_ratio = (2 - self._value_threshold) / self._value_threshold

        for band_idx in range(self.LSH_BANDS):
            buckets: dict[int, list[int]] = defaultdict(list)
            start = band_idx * rows_per_band
            end = start + rows_per_band

            for col_idx, sig in enumerate(signatures):
                band_hash = hash(tuple(sig[start:end]))
                buckets[band_hash].append(col_idx)

            for bucket_cols in buckets.values():
                if len(bucket_cols) < 2 or len(bucket_cols) > 50:
                    # Skip very large buckets (too many false positives)
                    continue
                for i in range(len(bucket_cols)):
                    for j in range(i + 1, len(bucket_cols)):
                        idx1, idx2 = bucket_cols[i], bucket_cols[j]
                        s1, s2 = sizes[idx1], sizes[idx2]
                        if s1 == 0 or s2 == 0:
                            continue
                        ratio = max(s1, s2) / min(s1, s2)
                        if ratio <= max_ratio:
                            candidates.add((min(idx1, idx2), max(idx1, idx2)))
                        if len(candidates) > self.MAX_CANDIDATES:
                            break
                    if len(candidates) > self.MAX_CANDIDATES:
                        break

        return candidates

    def _minhash_signature(self, values: set[str]) -> list[int]:
        """Generate MinHash signature for a set of values."""
        if not values:
            return [0] * self.MINHASH_SIGNATURES
        signature = []
        for seed in range(self.MINHASH_SIGNATURES):
            min_hash = min(hash((seed, v)) % (2**32) for v in values)
            signature.append(min_hash)
        return signature

    # -----------------------------------------------------------------
    # STAGE 3: VERIFICATION
    # -----------------------------------------------------------------

    def _verify_pair(
        self,
        columns: list[str],
        i: int,
        j: int,
        column_value_sets: dict[int, set[str]],
    ) -> Finding | None:
        """Verify if a candidate pair is actually similar."""
        col1, col2 = columns[i], columns[j]

        name_sim = self._normalized_levenshtein(col1.lower(), col2.lower())
        set1 = column_value_sets.get(i, set())
        set2 = column_value_sets.get(j, set())
        value_sim = self._jaccard_similarity(set1, set2)

        combined_score = 0.4 * name_sim + 0.6 * value_sim
        if combined_score < self.COMBINED_THRESHOLD:
            return None

        severity = Severity.MEDIUM if combined_score > 0.8 else Severity.LOW

        return Finding(
            detector=self.name,
            issue_type="similar_columns",
            severity=severity,
            columns=[col1, col2],
            details={
                "name_similarity": round(name_sim, 3),
                "value_similarity": round(value_sim, 3),
                "combined_score": round(combined_score, 3),
            },
            message=(
                f"Columns '{col1}' and '{col2}' appear similar "
                f"(name: {name_sim:.0%}, values: {value_sim:.0%})"
            ),
            confidence=combined_score,
        )

    # -----------------------------------------------------------------
    # HELPERS
    # -----------------------------------------------------------------

    def _normalized_levenshtein(self, s1: str, s2: str) -> float:
        """Calculate normalized Levenshtein similarity (0 to 1)."""
        if s1 == s2:
            return 1.0
        len1, len2 = len(s1), len(s2)
        max_len = max(len1, len2)
        if max_len == 0:
            return 1.0

        # Early termination if length difference too large
        if abs(len1 - len2) / max_len > (1 - self._name_threshold):
            return 0.0

        # Two-row Levenshtein
        if len1 > len2:
            s1, s2 = s2, s1
            len1, len2 = len2, len1

        previous_row = list(range(len2 + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row

        distance = previous_row[-1]
        return 1 - (distance / max_len)

    def _jaccard_similarity(self, set1: set[str], set2: set[str]) -> float:
        """Calculate exact Jaccard similarity."""
        if not set1 and not set2:
            return 1.0
        if not set1 or not set2:
            return 0.0
        intersection = len(set1 & set2)
        union = len(set1 | set2)
        return intersection / union if union > 0 else 0.0


register_detector(SimilarityDetector())
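As a quick orientation to the module above, here is a minimal usage sketch. It is not part of the package contents: the DataFrame is invented for illustration, and the required DatasetProfile is built with profile_dataset from datawash/profiler/engine.py, shown later in this diff. Note the LSH parameters: 100 signatures split into 20 bands of 5 rows means two columns whose value sets have Jaccard similarity s share at least one bucket with probability 1 - (1 - s^5)^20, so near-duplicate columns are very likely to survive blocking.

# Usage sketch (illustrative only; column names and values are invented).
import pandas as pd

from datawash.detectors.similarity_detector import SimilarityDetector
from datawash.profiler.engine import profile_dataset

df = pd.DataFrame(
    {
        "email": ["a@x.com", "b@y.com", "c@z.com"],
        "e_mail": ["a@x.com", "b@y.com", "c@z.com"],  # near-duplicate column
        "age": [31, 45, 28],
    }
)

detector = SimilarityDetector()
for finding in detector.detect(df, profile_dataset(df)):
    # e.g. "Columns 'email' and 'e_mail' appear similar (name: 83%, values: 100%)"
    print(finding.message)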
datawash/detectors/type_detector.py
@@ -0,0 +1,100 @@
"""Semantic type detection using pattern matching."""

from __future__ import annotations

import pandas as pd

from datawash.core.models import DatasetProfile, Finding, Severity
from datawash.detectors.base import BaseDetector
from datawash.detectors.registry import register_detector


class TypeDetector(BaseDetector):
    @property
    def name(self) -> str:
        return "types"

    @property
    def description(self) -> str:
        return "Detects semantic types and type mismatches"

    def detect(self, df: pd.DataFrame, profile: DatasetProfile) -> list[Finding]:
        findings: list[Finding] = []

        for col_name, col_profile in profile.columns.items():
            is_string = pd.api.types.is_string_dtype(df[col_name])

            # Flag numeric columns stored as strings
            if is_string:
                series = df[col_name].dropna()
                if series.empty:
                    continue
                # Sample for large columns to avoid expensive pd.to_numeric
                if len(series) > 1000:
                    sample = series.sample(1000, random_state=42)
                else:
                    sample = series
                numeric_count = pd.to_numeric(sample, errors="coerce").notna().sum()
                ratio = numeric_count / len(sample)
                if ratio > 0.8:
                    findings.append(
                        Finding(
                            detector=self.name,
                            issue_type="numeric_as_string",
                            severity=Severity.MEDIUM,
                            columns=[col_name],
                            details={"numeric_ratio": round(float(ratio), 3)},
                            message=(
                                f"Column '{col_name}' appears numeric "
                                f"but stored as string "
                                f"({ratio:.0%} parseable)"
                            ),
                            confidence=float(ratio),
                        )
                    )

            # Flag boolean-like columns
            if is_string:
                bool_values = {
                    "true",
                    "false",
                    "yes",
                    "no",
                    "y",
                    "n",
                    "1",
                    "0",
                    "t",
                    "f",
                    "on",
                    "off",
                }
                lowered_unique = set(
                    df[col_name].dropna().astype(str).str.strip().str.lower().unique()
                )
                if lowered_unique <= bool_values and len(lowered_unique) >= 2:
                    findings.append(
                        Finding(
                            detector=self.name,
                            issue_type="boolean_as_string",
                            severity=Severity.LOW,
                            columns=[col_name],
                            details={"values": sorted(lowered_unique)},
                            message=(
                                f"Column '{col_name}' contains "
                                f"boolean-like values "
                                f"stored as strings"
                            ),
                            confidence=0.95,
                        )
                    )

            # Report detected semantic types from patterns
            if col_profile.patterns:
                for pattern_name, pattern_info in col_profile.patterns.items():
                    col_profile.semantic_type = pattern_name

        return findings


register_detector(TypeDetector())
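For context on the two string-typed checks above, the sketch below shows inputs that would trigger each finding. It is not part of the package contents; the data is invented, and the DatasetProfile argument is again supplied by profile_dataset from this diff.

# Hypothetical example: numbers and booleans stored as strings.
import pandas as pd

from datawash.detectors.type_detector import TypeDetector
from datawash.profiler.engine import profile_dataset

df = pd.DataFrame(
    {
        "amount": ["10", "20.5", "30"],   # fully parseable -> numeric_as_string
        "active": ["yes", "no", "yes"],   # subset of boolean tokens -> boolean_as_string
    }
)

for finding in TypeDetector().detect(df, profile_dataset(df)):
    print(finding.issue_type, finding.columns)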
datawash/profiler/__init__.py
@@ -0,0 +1 @@
from .engine import profile_dataset as profile_dataset
datawash/profiler/engine.py
@@ -0,0 +1,88 @@
"""Main profiling orchestrator."""

from __future__ import annotations

import logging
import sys
from typing import Any

import pandas as pd

from datawash.core.models import ColumnProfile, DatasetProfile
from datawash.profiler.patterns import detect_column_patterns
from datawash.profiler.statistics import (
    compute_categorical_stats,
    compute_numeric_stats,
)

logger = logging.getLogger(__name__)


def profile_dataset(df: pd.DataFrame) -> DatasetProfile:
    """Generate a complete profile for a DataFrame.

    Args:
        df: The DataFrame to profile.

    Returns:
        DatasetProfile with column-level and dataset-level statistics.
    """
    logger.info("Profiling dataset: %d rows, %d columns", len(df), len(df.columns))

    columns: dict[str, ColumnProfile] = {}
    use_progress = len(df) > 10000 or len(df.columns) > 20
    if use_progress and sys.stderr.isatty():
        from rich.progress import Progress

        with Progress() as progress:
            task = progress.add_task("Profiling columns...", total=len(df.columns))
            for col_name in df.columns:
                columns[col_name] = _profile_column(df[col_name])
                progress.update(task, advance=1)
    else:
        for col_name in df.columns:
            columns[col_name] = _profile_column(df[col_name])

    return DatasetProfile(
        row_count=len(df),
        column_count=len(df.columns),
        memory_bytes=int(df.memory_usage(deep=True).sum()),
        columns=columns,
        duplicate_row_count=int(df.duplicated().sum()),
    )


def _profile_column(series: pd.Series) -> ColumnProfile:
    """Profile a single column."""
    name = str(series.name)
    null_count = int(series.isna().sum())
    total = len(series)
    unique_count = int(series.nunique())

    # Compute type-appropriate statistics
    stats: dict[str, Any] = {}
    # Boolean columns should use categorical stats (quantile fails on bool)
    if pd.api.types.is_bool_dtype(series):
        stats = compute_categorical_stats(series)
    elif pd.api.types.is_numeric_dtype(series):
        stats = compute_numeric_stats(series)
    else:
        stats = compute_categorical_stats(series)

    # Detect patterns
    patterns = detect_column_patterns(series)

    # Sample values (up to 5 non-null)
    sample_values = series.dropna().head(5).tolist()

    return ColumnProfile(
        name=name,
        dtype=str(series.dtype),
        null_count=null_count,
        null_ratio=round(null_count / total, 4) if total > 0 else 0.0,
        unique_count=unique_count,
        unique_ratio=round(unique_count / total, 4) if total > 0 else 0.0,
        sample_values=sample_values,
        statistics=stats,
        patterns=patterns,
    )
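A short, hypothetical example of calling the profiler above; the DataFrame is made up for illustration and only fields defined in this diff are read back.

# Profile a tiny frame and inspect per-column results (illustrative data).
import pandas as pd

from datawash.profiler.engine import profile_dataset

df = pd.DataFrame({"city": ["Oslo", "Lima", None], "count": [3, 7, 7]})
profile = profile_dataset(df)

print(profile.row_count, profile.column_count, profile.duplicate_row_count)
for name, col in profile.columns.items():
    print(name, col.dtype, col.null_ratio, col.unique_count)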
datawash/profiler/parallel.py
@@ -0,0 +1,122 @@
"""Parallel column profiling and detector execution."""

from __future__ import annotations

import logging
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Optional

import pandas as pd

from datawash.core.cache import ComputationCache
from datawash.core.models import ColumnProfile, DatasetProfile, Finding
from datawash.detectors.base import BaseDetector
from datawash.profiler.patterns import detect_column_patterns
from datawash.profiler.statistics import (
    compute_categorical_stats,
    compute_numeric_stats,
)

logger = logging.getLogger(__name__)

MAX_WORKERS = min(8, os.cpu_count() or 4)


def profile_dataset_parallel(
    df: pd.DataFrame,
    cache: Optional[ComputationCache] = None,
) -> DatasetProfile:
    """Profile all columns in parallel using ThreadPoolExecutor."""
    columns: dict[str, ColumnProfile] = {}

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_to_col = {
            executor.submit(_profile_column, df[col_name]): col_name
            for col_name in df.columns
        }
        for future in as_completed(future_to_col):
            col_name = future_to_col[future]
            try:
                columns[col_name] = future.result()
            except Exception:
                logger.exception("Failed to profile column %s", col_name)
                columns[col_name] = _empty_profile(col_name)

    return DatasetProfile(
        row_count=len(df),
        column_count=len(df.columns),
        memory_bytes=int(df.memory_usage(deep=True).sum()),
        columns=columns,
        duplicate_row_count=int(df.duplicated().sum()),
    )


def run_detectors_parallel(
    df: pd.DataFrame,
    profile: DatasetProfile,
    detectors: dict[str, BaseDetector],
) -> list[Finding]:
    """Run all detectors in parallel."""
    findings: list[Finding] = []

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_to_name = {
            executor.submit(detector.detect, df, profile): name
            for name, detector in detectors.items()
        }
        for future in as_completed(future_to_name):
            name = future_to_name[future]
            try:
                results = future.result()
                findings.extend(results)
            except Exception:
                logger.exception("Detector %s failed", name)

    return findings


def _profile_column(series: pd.Series) -> ColumnProfile:
    """Profile a single column (runs inside thread)."""
    name = str(series.name)
    null_count = int(series.isna().sum())
    total = len(series)
    unique_count = int(series.nunique())

    stats: dict[str, Any] = {}
    if pd.api.types.is_bool_dtype(series):
        stats = compute_categorical_stats(series)
    elif pd.api.types.is_numeric_dtype(series):
        stats = compute_numeric_stats(series)
    else:
        stats = compute_categorical_stats(series)

    patterns = detect_column_patterns(series)
    sample_values = series.dropna().head(5).tolist()

    return ColumnProfile(
        name=name,
        dtype=str(series.dtype),
        null_count=null_count,
        null_ratio=round(null_count / total, 4) if total > 0 else 0.0,
        unique_count=unique_count,
        unique_ratio=round(unique_count / total, 4) if total > 0 else 0.0,
        sample_values=sample_values,
        statistics=stats,
        patterns=patterns,
    )


def _empty_profile(col_name: str) -> ColumnProfile:
    """Fallback profile when profiling fails."""
    return ColumnProfile(
        name=col_name,
        dtype="unknown",
        null_count=0,
        null_ratio=0.0,
        unique_count=0,
        unique_ratio=0.0,
        sample_values=[],
        statistics={},
        patterns={},
    )
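The two parallel entry points above can be combined as sketched below. This is illustrative only: the data is invented, and the detector mapping is assembled by hand from the detectors shown in this diff, since no registry lookup helper appears in this section.

# Sketch: parallel profiling followed by parallel detector execution.
import pandas as pd

from datawash.detectors.similarity_detector import SimilarityDetector
from datawash.detectors.type_detector import TypeDetector
from datawash.profiler.parallel import profile_dataset_parallel, run_detectors_parallel

df = pd.DataFrame({"score": ["1", "2", "3"], "score_copy": ["1", "2", "3"]})
profile = profile_dataset_parallel(df)

detectors = {"similarity": SimilarityDetector(), "types": TypeDetector()}
for finding in run_detectors_parallel(df, profile, detectors):
    print(finding.detector, finding.message)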
datawash/profiler/patterns.py
@@ -0,0 +1,80 @@
"""Pattern detection for common data formats."""

from __future__ import annotations

import re
from typing import Any

import pandas as pd

PATTERNS: dict[str, re.Pattern[str]] = {
    "email": re.compile(r"^[\w.+-]+@[\w-]+\.[\w.-]+$"),
    "url": re.compile(r"^https?://[\w\-._~:/?#\[\]@!$&'()*+,;=]+$"),
    "phone": re.compile(r"^[\+]?[(]?[0-9]{1,4}[)]?[-\s./0-9]{6,15}$"),
    "ipv4": re.compile(r"^(?:\d{1,3}\.){3}\d{1,3}$"),
    "uuid": re.compile(
        r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", re.IGNORECASE
    ),
    "currency": re.compile("^[$\u20ac\u00a3\u00a5][\\s]?[\\d,]+\\.?\\d*$"),
    "zip_us": re.compile(r"^\d{5}(-\d{4})?$"),
}

DATE_FORMATS = [
    "%Y-%m-%d",
    "%m/%d/%Y",
    "%d/%m/%Y",
    "%Y/%m/%d",
    "%m-%d-%Y",
    "%d-%m-%Y",
    "%Y%m%d",
    "%b %d, %Y",
    "%B %d, %Y",
    "%d %b %Y",
    "%d %B %Y",
    "%Y-%m-%d %H:%M:%S",
    "%m/%d/%Y %H:%M:%S",
]


def detect_column_patterns(series: pd.Series) -> dict[str, Any]:
    """Detect patterns in a column's values.

    Returns dict with detected pattern name and match ratio.
    """
    clean = series.dropna().astype(str)
    if clean.empty:
        return {}

    sample = clean.head(1000)
    total = len(sample)
    results: dict[str, Any] = {}

    for name, pattern in PATTERNS.items():
        matches = sample.str.match(pattern).sum()
        ratio = matches / total
        if ratio > 0.5:
            results[name] = {
                "match_ratio": round(float(ratio), 3),
                "pattern": pattern.pattern,
            }

    # Date detection
    if not results:
        date_ratio = _detect_date_pattern(sample)
        if date_ratio and date_ratio["match_ratio"] > 0.5:
            results["date"] = date_ratio

    return results


def _detect_date_pattern(sample: pd.Series) -> dict[str, Any] | None:
    """Try parsing dates with common formats."""
    for fmt in DATE_FORMATS:
        try:
            parsed = pd.to_datetime(sample, format=fmt, errors="coerce")
            ratio = parsed.notna().sum() / len(sample)
            if ratio > 0.7:
                return {"match_ratio": round(float(ratio), 3), "format": fmt}
        except Exception:
            continue
    return None