datawash-0.2.0-py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. datawash/__init__.py +9 -0
  2. datawash/adapters/__init__.py +12 -0
  3. datawash/adapters/base.py +66 -0
  4. datawash/adapters/csv_adapter.py +23 -0
  5. datawash/adapters/excel_adapter.py +36 -0
  6. datawash/adapters/json_adapter.py +21 -0
  7. datawash/adapters/parquet_adapter.py +34 -0
  8. datawash/cli/__init__.py +0 -0
  9. datawash/cli/formatters.py +110 -0
  10. datawash/cli/main.py +168 -0
  11. datawash/codegen/__init__.py +1 -0
  12. datawash/codegen/generator.py +72 -0
  13. datawash/core/__init__.py +1 -0
  14. datawash/core/cache.py +64 -0
  15. datawash/core/config.py +56 -0
  16. datawash/core/dtypes.py +24 -0
  17. datawash/core/exceptions.py +21 -0
  18. datawash/core/models.py +78 -0
  19. datawash/core/report.py +430 -0
  20. datawash/core/sampling.py +84 -0
  21. datawash/detectors/__init__.py +13 -0
  22. datawash/detectors/base.py +27 -0
  23. datawash/detectors/duplicate_detector.py +56 -0
  24. datawash/detectors/format_detector.py +130 -0
  25. datawash/detectors/missing_detector.py +78 -0
  26. datawash/detectors/outlier_detector.py +93 -0
  27. datawash/detectors/registry.py +64 -0
  28. datawash/detectors/similarity_detector.py +294 -0
  29. datawash/detectors/type_detector.py +100 -0
  30. datawash/profiler/__init__.py +1 -0
  31. datawash/profiler/engine.py +88 -0
  32. datawash/profiler/parallel.py +122 -0
  33. datawash/profiler/patterns.py +80 -0
  34. datawash/profiler/statistics.py +41 -0
  35. datawash/suggestors/__init__.py +1 -0
  36. datawash/suggestors/base.py +15 -0
  37. datawash/suggestors/engine.py +327 -0
  38. datawash/suggestors/prioritizer.py +23 -0
  39. datawash/transformers/__init__.py +13 -0
  40. datawash/transformers/base.py +27 -0
  41. datawash/transformers/categories.py +64 -0
  42. datawash/transformers/columns.py +72 -0
  43. datawash/transformers/duplicates.py +43 -0
  44. datawash/transformers/formats.py +95 -0
  45. datawash/transformers/missing.py +201 -0
  46. datawash/transformers/registry.py +30 -0
  47. datawash/transformers/types.py +95 -0
  48. datawash-0.2.0.dist-info/METADATA +353 -0
  49. datawash-0.2.0.dist-info/RECORD +53 -0
  50. datawash-0.2.0.dist-info/WHEEL +5 -0
  51. datawash-0.2.0.dist-info/entry_points.txt +2 -0
  52. datawash-0.2.0.dist-info/licenses/LICENSE +21 -0
  53. datawash-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,294 @@
+ """Similar column detector using MinHash + LSH.
+
+ Multi-stage similarity detection with O(n) average complexity:
+ 1. N-gram blocking for column names
+ 2. MinHash + LSH for column values
+ 3. Size filtering to prune candidates
+ 4. Exact verification only on candidates
+
+ Based on:
+ - VLDB 2016: "An Empirical Evaluation of Set Similarity Join Techniques"
+ - Broder 1997: "On the resemblance and containment of documents" (MinHash)
+ """
+
+ from __future__ import annotations
+
+ from collections import Counter, defaultdict
+ from typing import Any
+
+ import pandas as pd
+
+ from datawash.core.models import DatasetProfile, Finding, Severity
+ from datawash.detectors.base import BaseDetector
+ from datawash.detectors.registry import register_detector
+
+
+ class SimilarityDetector(BaseDetector):
+     """Detect similar columns using MinHash + LSH.
+
+     Uses a multi-stage pipeline:
+     - Stage 1: N-gram blocking for name similarity
+     - Stage 2: MinHash signatures + LSH banding for value similarity
+     - Stage 3: Exact verification only on candidate pairs
+     """
+
+     # Thresholds
+     NAME_SIMILARITY_THRESHOLD = 0.7
+     VALUE_OVERLAP_THRESHOLD = 0.8
+     COMBINED_THRESHOLD = 0.6
+
+     # Algorithm parameters
+     NGRAM_SIZE = 2
+     MIN_SHARED_NGRAMS = 2
+     MINHASH_SIGNATURES = 100
+     LSH_BANDS = 20
+     MAX_UNIQUE_VALUES = 10000
+     MAX_CANDIDATES = 5000  # Cap to avoid O(n^2) blowup on wide datasets
+
+     def __init__(
+         self, name_threshold: float = 0.7, value_threshold: float = 0.8
+     ) -> None:
+         self._name_threshold = name_threshold
+         self._value_threshold = value_threshold
+
+     @property
+     def name(self) -> str:
+         return "similarity"
+
+     @property
+     def description(self) -> str:
+         return "Detects similar or potentially duplicate columns"
+
+     def detect(
+         self, df: pd.DataFrame, profile: DatasetProfile, config: Any = None
+     ) -> list[Finding]:
+         columns = list(df.columns)
+         n_cols = len(columns)
+
+         if n_cols < 2:
+             return []
+
+         # Precompute value sets (use unique() first to reduce work)
+         column_value_sets: dict[int, set[str]] = {}
+         for idx, col in enumerate(columns):
+             unique_vals = df[col].dropna().unique()
+             if len(unique_vals) > self.MAX_UNIQUE_VALUES:
+                 column_value_sets[idx] = set()  # Too many unique values (likely IDs)
+             else:
+                 column_value_sets[idx] = set(str(v) for v in unique_vals)
+
+         # Stage 1: Blocking
+         name_candidates = self._ngram_blocking(columns)
+         value_candidates = self._minhash_lsh_blocking(columns, column_value_sets)
+
+         # Stage 2: Merge candidates (cap to avoid blowup)
+         all_candidates = name_candidates | value_candidates
+         if len(all_candidates) > self.MAX_CANDIDATES:
+             all_candidates = set(list(all_candidates)[: self.MAX_CANDIDATES])
+
+         # Stage 3: Verify only candidates
+         findings: list[Finding] = []
+         for i, j in all_candidates:
+             finding = self._verify_pair(columns, i, j, column_value_sets)
+             if finding:
+                 findings.append(finding)
+
+         return findings
+
+     # -----------------------------------------------------------------
+     # STAGE 1a: N-GRAM BLOCKING FOR NAMES
+     # -----------------------------------------------------------------
+
+     def _ngram_blocking(self, columns: list[str]) -> set[tuple[int, int]]:
+         """Find candidate pairs based on shared character n-grams.
+
+         Complexity: O(n * k) where k = average column name length
+         """
+         ngram_index: dict[str, list[int]] = defaultdict(list)
+
+         for idx, col in enumerate(columns):
+             ngrams = self._get_ngrams(col.lower())
+             for ng in ngrams:
+                 ngram_index[ng].append(idx)
+
+         pair_counts: Counter[tuple[int, int]] = Counter()
+         for indices in ngram_index.values():
+             if len(indices) < 2:
+                 continue
+             for i in range(len(indices)):
+                 for j in range(i + 1, len(indices)):
+                     pair = (min(indices[i], indices[j]), max(indices[i], indices[j]))
+                     pair_counts[pair] += 1
+
+         return {
+             pair
+             for pair, count in pair_counts.items()
+             if count >= self.MIN_SHARED_NGRAMS
+         }
+
+     def _get_ngrams(self, s: str) -> set[str]:
+         """Extract character n-grams from string."""
+         n = self.NGRAM_SIZE
+         if len(s) < n:
+             return {s}
+         return {s[i : i + n] for i in range(len(s) - n + 1)}
+
+     # -----------------------------------------------------------------
+     # STAGE 1b + 2: MINHASH + LSH BLOCKING FOR VALUES
+     # -----------------------------------------------------------------
+
+     def _minhash_lsh_blocking(
+         self,
+         columns: list[str],
+         column_value_sets: dict[int, set[str]],
+     ) -> set[tuple[int, int]]:
+         """Find candidate pairs using MinHash signatures + LSH banding.
+
+         MinHash: O(n * v * m) where m = signature size
+         LSH: O(n * b) where b = number of bands
+         """
+         n_cols = len(columns)
+
+         # Generate MinHash signatures for each column
+         signatures: list[list[int]] = []
+         sizes: list[int] = []
+         for idx in range(n_cols):
+             val_set = column_value_sets.get(idx, set())
+             signatures.append(self._minhash_signature(val_set))
+             sizes.append(len(val_set))
+
+         # LSH Banding: hash signature bands into buckets
+         candidates: set[tuple[int, int]] = set()
+         rows_per_band = self.MINHASH_SIGNATURES // self.LSH_BANDS
+
+         # Size filter ratio: for Jaccard >= t, ratio must be <= (2-t)/t
+         max_ratio = (2 - self._value_threshold) / self._value_threshold
+
+         for band_idx in range(self.LSH_BANDS):
+             buckets: dict[int, list[int]] = defaultdict(list)
+             start = band_idx * rows_per_band
+             end = start + rows_per_band
+
+             for col_idx, sig in enumerate(signatures):
+                 band_hash = hash(tuple(sig[start:end]))
+                 buckets[band_hash].append(col_idx)
+
+             for bucket_cols in buckets.values():
+                 if len(bucket_cols) < 2 or len(bucket_cols) > 50:
+                     # Skip very large buckets (too many false positives)
+                     continue
+                 for i in range(len(bucket_cols)):
+                     for j in range(i + 1, len(bucket_cols)):
+                         idx1, idx2 = bucket_cols[i], bucket_cols[j]
+                         s1, s2 = sizes[idx1], sizes[idx2]
+                         if s1 == 0 or s2 == 0:
+                             continue
+                         ratio = max(s1, s2) / min(s1, s2)
+                         if ratio <= max_ratio:
+                             candidates.add((min(idx1, idx2), max(idx1, idx2)))
+                         if len(candidates) > self.MAX_CANDIDATES:
+                             break
+                     if len(candidates) > self.MAX_CANDIDATES:
+                         break
+
+         return candidates
+
+     def _minhash_signature(self, values: set[str]) -> list[int]:
+         """Generate MinHash signature for a set of values."""
+         if not values:
+             return [0] * self.MINHASH_SIGNATURES
+         signature = []
+         for seed in range(self.MINHASH_SIGNATURES):
+             min_hash = min(hash((seed, v)) % (2**32) for v in values)
+             signature.append(min_hash)
+         return signature
+
+     # -----------------------------------------------------------------
+     # STAGE 3: VERIFICATION
+     # -----------------------------------------------------------------
+
+     def _verify_pair(
+         self,
+         columns: list[str],
+         i: int,
+         j: int,
+         column_value_sets: dict[int, set[str]],
+     ) -> Finding | None:
+         """Verify if a candidate pair is actually similar."""
+         col1, col2 = columns[i], columns[j]
+
+         name_sim = self._normalized_levenshtein(col1.lower(), col2.lower())
+         set1 = column_value_sets.get(i, set())
+         set2 = column_value_sets.get(j, set())
+         value_sim = self._jaccard_similarity(set1, set2)
+
+         combined_score = 0.4 * name_sim + 0.6 * value_sim
+         if combined_score < self.COMBINED_THRESHOLD:
+             return None
+
+         severity = Severity.MEDIUM if combined_score > 0.8 else Severity.LOW
+
+         return Finding(
+             detector=self.name,
+             issue_type="similar_columns",
+             severity=severity,
+             columns=[col1, col2],
+             details={
+                 "name_similarity": round(name_sim, 3),
+                 "value_similarity": round(value_sim, 3),
+                 "combined_score": round(combined_score, 3),
+             },
+             message=(
+                 f"Columns '{col1}' and '{col2}' appear similar "
+                 f"(name: {name_sim:.0%}, values: {value_sim:.0%})"
+             ),
+             confidence=combined_score,
+         )
+
+     # -----------------------------------------------------------------
+     # HELPERS
+     # -----------------------------------------------------------------
+
+     def _normalized_levenshtein(self, s1: str, s2: str) -> float:
+         """Calculate normalized Levenshtein similarity (0 to 1)."""
+         if s1 == s2:
+             return 1.0
+         len1, len2 = len(s1), len(s2)
+         max_len = max(len1, len2)
+         if max_len == 0:
+             return 1.0
+
+         # Early termination if length difference too large
+         if abs(len1 - len2) / max_len > (1 - self._name_threshold):
+             return 0.0
+
+         # Two-row Levenshtein
+         if len1 > len2:
+             s1, s2 = s2, s1
+             len1, len2 = len2, len1
+
+         previous_row = list(range(len2 + 1))
+         for i, c1 in enumerate(s1):
+             current_row = [i + 1]
+             for j, c2 in enumerate(s2):
+                 insertions = previous_row[j + 1] + 1
+                 deletions = current_row[j] + 1
+                 substitutions = previous_row[j] + (c1 != c2)
+                 current_row.append(min(insertions, deletions, substitutions))
+             previous_row = current_row
+
+         distance = previous_row[-1]
+         return 1 - (distance / max_len)
+
+     def _jaccard_similarity(self, set1: set[str], set2: set[str]) -> float:
+         """Calculate exact Jaccard similarity."""
+         if not set1 and not set2:
+             return 1.0
+         if not set1 or not set2:
+             return 0.0
+         intersection = len(set1 & set2)
+         union = len(set1 | set2)
+         return intersection / union if union > 0 else 0.0
+
+
+ register_detector(SimilarityDetector())
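
A note on the LSH parameters above: with MINHASH_SIGNATURES = 100 split into LSH_BANDS = 20 bands of 5 rows each, two columns with Jaccard similarity s become an LSH candidate pair with probability 1 - (1 - s^5)^20. The short sketch below (illustrative only, not part of the package) evaluates that curve, showing why pairs near the 0.8 value threshold are almost always surfaced while dissimilar pairs are rarely compared.

# Illustrative sketch (not from the package): candidate probability under the
# banding scheme used by SimilarityDetector (b bands of r rows, b * r = 100).
def candidate_probability(s: float, bands: int = 20, rows: int = 5) -> float:
    # A band matches only if all r MinHash values agree (probability s**rows);
    # the pair becomes a candidate if at least one of the b bands matches.
    return 1 - (1 - s**rows) ** bands

for s in (0.3, 0.5, 0.7, 0.8, 0.9):
    print(f"Jaccard {s:.1f} -> candidate probability {candidate_probability(s):.3f}")

At s = 0.3 the probability is roughly 0.05, while at s = 0.8 it is above 0.999, which is the behavior the exact Jaccard verification in stage 3 relies on.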
@@ -0,0 +1,100 @@
+ """Semantic type detection using pattern matching."""
+
+ from __future__ import annotations
+
+ import pandas as pd
+
+ from datawash.core.models import DatasetProfile, Finding, Severity
+ from datawash.detectors.base import BaseDetector
+ from datawash.detectors.registry import register_detector
+
+
+ class TypeDetector(BaseDetector):
+     @property
+     def name(self) -> str:
+         return "types"
+
+     @property
+     def description(self) -> str:
+         return "Detects semantic types and type mismatches"
+
+     def detect(self, df: pd.DataFrame, profile: DatasetProfile) -> list[Finding]:
+         findings: list[Finding] = []
+
+         for col_name, col_profile in profile.columns.items():
+             is_string = pd.api.types.is_string_dtype(df[col_name])
+
+             # Flag numeric columns stored as strings
+             if is_string:
+                 series = df[col_name].dropna()
+                 if series.empty:
+                     continue
+                 # Sample for large columns to avoid expensive pd.to_numeric
+                 if len(series) > 1000:
+                     sample = series.sample(1000, random_state=42)
+                 else:
+                     sample = series
+                 numeric_count = pd.to_numeric(sample, errors="coerce").notna().sum()
+                 ratio = numeric_count / len(sample)
+                 if ratio > 0.8:
+                     findings.append(
+                         Finding(
+                             detector=self.name,
+                             issue_type="numeric_as_string",
+                             severity=Severity.MEDIUM,
+                             columns=[col_name],
+                             details={"numeric_ratio": round(float(ratio), 3)},
+                             message=(
+                                 f"Column '{col_name}' appears numeric "
+                                 f"but stored as string "
+                                 f"({ratio:.0%} parseable)"
+                             ),
+                             confidence=float(ratio),
+                         )
+                     )
+
+             # Flag boolean-like columns
+             if is_string:
+                 bool_values = {
+                     "true",
+                     "false",
+                     "yes",
+                     "no",
+                     "y",
+                     "n",
+                     "1",
+                     "0",
+                     "t",
+                     "f",
+                     "on",
+                     "off",
+                 }
+                 lowered_unique = set(
+                     df[col_name].dropna().astype(str).str.strip().str.lower().unique()
+                 )
+                 if lowered_unique <= bool_values and len(lowered_unique) >= 2:
+                     findings.append(
+                         Finding(
+                             detector=self.name,
+                             issue_type="boolean_as_string",
+                             severity=Severity.LOW,
+                             columns=[col_name],
+                             details={"values": sorted(lowered_unique)},
+                             message=(
+                                 f"Column '{col_name}' contains "
+                                 f"boolean-like values "
+                                 f"stored as strings"
+                             ),
+                             confidence=0.95,
+                         )
+                     )
+
+             # Report detected semantic types from patterns
+             if col_profile.patterns:
+                 for pattern_name, pattern_info in col_profile.patterns.items():
+                     col_profile.semantic_type = pattern_name
+
+         return findings
+
+
+ register_detector(TypeDetector())
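
The numeric-as-string check in TypeDetector reduces to coercing a sample of the column with pd.to_numeric and measuring how much parses. A minimal standalone sketch of that heuristic, with illustrative values rather than the package's API:

import pandas as pd

# Same heuristic as above, in isolation: coerce and count what survives.
s = pd.Series(["1", "2.5", "3", "n/a", "4"])
sample = s.dropna()
ratio = pd.to_numeric(sample, errors="coerce").notna().sum() / len(sample)
print(f"{ratio:.0%} parseable")  # 80%; the detector only flags ratios above 0.8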
@@ -0,0 +1 @@
+ from .engine import profile_dataset as profile_dataset
@@ -0,0 +1,88 @@
+ """Main profiling orchestrator."""
+
+ from __future__ import annotations
+
+ import logging
+ import sys
+ from typing import Any
+
+ import pandas as pd
+
+ from datawash.core.models import ColumnProfile, DatasetProfile
+ from datawash.profiler.patterns import detect_column_patterns
+ from datawash.profiler.statistics import (
+     compute_categorical_stats,
+     compute_numeric_stats,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ def profile_dataset(df: pd.DataFrame) -> DatasetProfile:
+     """Generate a complete profile for a DataFrame.
+
+     Args:
+         df: The DataFrame to profile.
+
+     Returns:
+         DatasetProfile with column-level and dataset-level statistics.
+     """
+     logger.info("Profiling dataset: %d rows, %d columns", len(df), len(df.columns))
+
+     columns: dict[str, ColumnProfile] = {}
+     use_progress = len(df) > 10000 or len(df.columns) > 20
+     if use_progress and sys.stderr.isatty():
+         from rich.progress import Progress
+
+         with Progress() as progress:
+             task = progress.add_task("Profiling columns...", total=len(df.columns))
+             for col_name in df.columns:
+                 columns[col_name] = _profile_column(df[col_name])
+                 progress.update(task, advance=1)
+     else:
+         for col_name in df.columns:
+             columns[col_name] = _profile_column(df[col_name])
+
+     return DatasetProfile(
+         row_count=len(df),
+         column_count=len(df.columns),
+         memory_bytes=int(df.memory_usage(deep=True).sum()),
+         columns=columns,
+         duplicate_row_count=int(df.duplicated().sum()),
+     )
+
+
+ def _profile_column(series: pd.Series) -> ColumnProfile:
+     """Profile a single column."""
+     name = str(series.name)
+     null_count = int(series.isna().sum())
+     total = len(series)
+     unique_count = int(series.nunique())
+
+     # Compute type-appropriate statistics
+     stats: dict[str, Any] = {}
+     # Boolean columns should use categorical stats (quantile fails on bool)
+     if pd.api.types.is_bool_dtype(series):
+         stats = compute_categorical_stats(series)
+     elif pd.api.types.is_numeric_dtype(series):
+         stats = compute_numeric_stats(series)
+     else:
+         stats = compute_categorical_stats(series)
+
+     # Detect patterns
+     patterns = detect_column_patterns(series)
+
+     # Sample values (up to 5 non-null)
+     sample_values = series.dropna().head(5).tolist()
+
+     return ColumnProfile(
+         name=name,
+         dtype=str(series.dtype),
+         null_count=null_count,
+         null_ratio=round(null_count / total, 4) if total > 0 else 0.0,
+         unique_count=unique_count,
+         unique_ratio=round(unique_count / total, 4) if total > 0 else 0.0,
+         sample_values=sample_values,
+         statistics=stats,
+         patterns=patterns,
+     )
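
Assuming the wheel is installed, calling the profiler shown above could look like the following sketch. Attribute names are taken from the ColumnProfile and DatasetProfile constructors used in this file; the sample data is invented for illustration.

import pandas as pd
from datawash.profiler import profile_dataset  # re-exported in datawash/profiler/__init__.py

df = pd.DataFrame(
    {"age": [34, 29, None, 41], "email": ["a@x.com", "b@y.org", "c@z.io", None]}
)
profile = profile_dataset(df)

print(profile.row_count, profile.column_count)   # 4 2
print(profile.columns["age"].null_ratio)         # 0.25
print(profile.columns["email"].patterns)         # e.g. {'email': {...}} if the regex matches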
@@ -0,0 +1,122 @@
+ """Parallel column profiling and detector execution."""
+
+ from __future__ import annotations
+
+ import logging
+ import os
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from typing import Any, Optional
+
+ import pandas as pd
+
+ from datawash.core.cache import ComputationCache
+ from datawash.core.models import ColumnProfile, DatasetProfile, Finding
+ from datawash.detectors.base import BaseDetector
+ from datawash.profiler.patterns import detect_column_patterns
+ from datawash.profiler.statistics import (
+     compute_categorical_stats,
+     compute_numeric_stats,
+ )
+
+ logger = logging.getLogger(__name__)
+
+ MAX_WORKERS = min(8, os.cpu_count() or 4)
+
+
+ def profile_dataset_parallel(
+     df: pd.DataFrame,
+     cache: Optional[ComputationCache] = None,
+ ) -> DatasetProfile:
+     """Profile all columns in parallel using ThreadPoolExecutor."""
+     columns: dict[str, ColumnProfile] = {}
+
+     with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+         future_to_col = {
+             executor.submit(_profile_column, df[col_name]): col_name
+             for col_name in df.columns
+         }
+         for future in as_completed(future_to_col):
+             col_name = future_to_col[future]
+             try:
+                 columns[col_name] = future.result()
+             except Exception:
+                 logger.exception("Failed to profile column %s", col_name)
+                 columns[col_name] = _empty_profile(col_name)
+
+     return DatasetProfile(
+         row_count=len(df),
+         column_count=len(df.columns),
+         memory_bytes=int(df.memory_usage(deep=True).sum()),
+         columns=columns,
+         duplicate_row_count=int(df.duplicated().sum()),
+     )
+
+
+ def run_detectors_parallel(
+     df: pd.DataFrame,
+     profile: DatasetProfile,
+     detectors: dict[str, BaseDetector],
+ ) -> list[Finding]:
+     """Run all detectors in parallel."""
+     findings: list[Finding] = []
+
+     with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+         future_to_name = {
+             executor.submit(detector.detect, df, profile): name
+             for name, detector in detectors.items()
+         }
+         for future in as_completed(future_to_name):
+             name = future_to_name[future]
+             try:
+                 results = future.result()
+                 findings.extend(results)
+             except Exception:
+                 logger.exception("Detector %s failed", name)
+
+     return findings
+
+
+ def _profile_column(series: pd.Series) -> ColumnProfile:
+     """Profile a single column (runs inside thread)."""
+     name = str(series.name)
+     null_count = int(series.isna().sum())
+     total = len(series)
+     unique_count = int(series.nunique())
+
+     stats: dict[str, Any] = {}
+     if pd.api.types.is_bool_dtype(series):
+         stats = compute_categorical_stats(series)
+     elif pd.api.types.is_numeric_dtype(series):
+         stats = compute_numeric_stats(series)
+     else:
+         stats = compute_categorical_stats(series)
+
+     patterns = detect_column_patterns(series)
+     sample_values = series.dropna().head(5).tolist()
+
+     return ColumnProfile(
+         name=name,
+         dtype=str(series.dtype),
+         null_count=null_count,
+         null_ratio=round(null_count / total, 4) if total > 0 else 0.0,
+         unique_count=unique_count,
+         unique_ratio=round(unique_count / total, 4) if total > 0 else 0.0,
+         sample_values=sample_values,
+         statistics=stats,
+         patterns=patterns,
+     )
+
+
+ def _empty_profile(col_name: str) -> ColumnProfile:
+     """Fallback profile when profiling fails."""
+     return ColumnProfile(
+         name=col_name,
+         dtype="unknown",
+         null_count=0,
+         null_ratio=0.0,
+         unique_count=0,
+         unique_ratio=0.0,
+         sample_values=[],
+         statistics={},
+         patterns={},
+     )
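
The parallel helpers take an explicit detector mapping, so a caller can wire them together directly. A hedged sketch of one way to call them; the input path and the choice of detectors are illustrative, not prescribed by the package:

import pandas as pd
from datawash.detectors.similarity_detector import SimilarityDetector
from datawash.detectors.type_detector import TypeDetector
from datawash.profiler.parallel import profile_dataset_parallel, run_detectors_parallel

df = pd.read_csv("data.csv")  # placeholder input
profile = profile_dataset_parallel(df)

# Each detector call is independent, and failures are isolated by the
# try/except around future.result() in run_detectors_parallel.
findings = run_detectors_parallel(
    df, profile, {"similarity": SimilarityDetector(), "types": TypeDetector()}
)
for finding in findings:
    print(finding.severity, finding.message)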
@@ -0,0 +1,80 @@
+ """Pattern detection for common data formats."""
+
+ from __future__ import annotations
+
+ import re
+ from typing import Any
+
+ import pandas as pd
+
+ PATTERNS: dict[str, re.Pattern[str]] = {
+     "email": re.compile(r"^[\w.+-]+@[\w-]+\.[\w.-]+$"),
+     "url": re.compile(r"^https?://[\w\-._~:/?#\[\]@!$&'()*+,;=]+$"),
+     "phone": re.compile(r"^[\+]?[(]?[0-9]{1,4}[)]?[-\s./0-9]{6,15}$"),
+     "ipv4": re.compile(r"^(?:\d{1,3}\.){3}\d{1,3}$"),
+     "uuid": re.compile(
+         r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", re.IGNORECASE
+     ),
+     "currency": re.compile("^[$\u20ac\u00a3\u00a5][\\s]?[\\d,]+\\.?\\d*$"),
+     "zip_us": re.compile(r"^\d{5}(-\d{4})?$"),
+ }
+
+ DATE_FORMATS = [
+     "%Y-%m-%d",
+     "%m/%d/%Y",
+     "%d/%m/%Y",
+     "%Y/%m/%d",
+     "%m-%d-%Y",
+     "%d-%m-%Y",
+     "%Y%m%d",
+     "%b %d, %Y",
+     "%B %d, %Y",
+     "%d %b %Y",
+     "%d %B %Y",
+     "%Y-%m-%d %H:%M:%S",
+     "%m/%d/%Y %H:%M:%S",
+ ]
+
+
+ def detect_column_patterns(series: pd.Series) -> dict[str, Any]:
+     """Detect patterns in a column's values.
+
+     Returns dict with detected pattern name and match ratio.
+     """
+     clean = series.dropna().astype(str)
+     if clean.empty:
+         return {}
+
+     sample = clean.head(1000)
+     total = len(sample)
+     results: dict[str, Any] = {}
+
+     for name, pattern in PATTERNS.items():
+         matches = sample.str.match(pattern).sum()
+         ratio = matches / total
+         if ratio > 0.5:
+             results[name] = {
+                 "match_ratio": round(float(ratio), 3),
+                 "pattern": pattern.pattern,
+             }
+
+     # Date detection
+     if not results:
+         date_ratio = _detect_date_pattern(sample)
+         if date_ratio and date_ratio["match_ratio"] > 0.5:
+             results["date"] = date_ratio
+
+     return results
+
+
+ def _detect_date_pattern(sample: pd.Series) -> dict[str, Any] | None:
+     """Try parsing dates with common formats."""
+     for fmt in DATE_FORMATS:
+         try:
+             parsed = pd.to_datetime(sample, format=fmt, errors="coerce")
+             ratio = parsed.notna().sum() / len(sample)
+             if ratio > 0.7:
+                 return {"match_ratio": round(float(ratio), 3), "format": fmt}
+         except Exception:
+             continue
+     return None
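
As a quick illustration of detect_column_patterns (a sketch with made-up values, not taken from the package's tests):

import pandas as pd
from datawash.profiler.patterns import detect_column_patterns

emails = pd.Series(["ana@example.com", "bo@test.org", "not-an-email", None])
print(detect_column_patterns(emails))
# Two of the three non-null values match the email regex, so the result is roughly
# {'email': {'match_ratio': 0.667, 'pattern': '^[\\w.+-]+@[\\w-]+\\.[\\w.-]+$'}}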