wherefore 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wherefore/__init__.py +0 -0
- wherefore/cli.py +632 -0
- wherefore/clustering/__init__.py +0 -0
- wherefore/clustering/cluster_mismatches.py +344 -0
- wherefore/clustering/signatures.py +446 -0
- wherefore/comparison/__init__.py +0 -0
- wherefore/comparison/diff_engine.py +229 -0
- wherefore/comparison/diff_result.py +151 -0
- wherefore/comparison/key_matching.py +149 -0
- wherefore/comparison/loaders.py +370 -0
- wherefore/reasoning/__init__.py +0 -0
- wherefore/reasoning/explain.py +207 -0
- wherefore/reasoning/prompts/cluster_explanation_v1.md +71 -0
- wherefore/reasoning/providers/__init__.py +0 -0
- wherefore/reasoning/providers/base.py +30 -0
- wherefore/reasoning/providers/claude.py +81 -0
- wherefore/reasoning/redaction.py +118 -0
- wherefore/reasoning/report.py +25 -0
- wherefore/synthetic/__init__.py +0 -0
- wherefore/synthetic/base_dataset.py +259 -0
- wherefore/synthetic/corruptors/__init__.py +0 -0
- wherefore/synthetic/corruptors/dedup_failure.py +68 -0
- wherefore/synthetic/corruptors/encoding_mismatch.py +84 -0
- wherefore/synthetic/corruptors/enum_drift.py +70 -0
- wherefore/synthetic/corruptors/float_precision.py +78 -0
- wherefore/synthetic/corruptors/key_mismatch.py +100 -0
- wherefore/synthetic/corruptors/null_type_coercion.py +84 -0
- wherefore/synthetic/corruptors/timezone_shift.py +73 -0
- wherefore/synthetic/corruptors/truncation.py +72 -0
- wherefore/synthetic/ground_truth.py +83 -0
- wherefore/taxonomy/__init__.py +0 -0
- wherefore/taxonomy/patterns/dedup_failure.yaml +52 -0
- wherefore/taxonomy/patterns/encoding_mismatch.yaml +45 -0
- wherefore/taxonomy/patterns/enum_drift.yaml +48 -0
- wherefore/taxonomy/patterns/float_precision.yaml +51 -0
- wherefore/taxonomy/patterns/key_mismatch.yaml +59 -0
- wherefore/taxonomy/patterns/null_type_coercion.yaml +52 -0
- wherefore/taxonomy/patterns/timezone_shift.yaml +50 -0
- wherefore/taxonomy/patterns/truncation.yaml +44 -0
- wherefore/taxonomy/registry.py +180 -0
- wherefore/taxonomy/schema.py +156 -0
- wherefore-0.1.0.dist-info/METADATA +447 -0
- wherefore-0.1.0.dist-info/RECORD +48 -0
- wherefore-0.1.0.dist-info/WHEEL +5 -0
- wherefore-0.1.0.dist-info/entry_points.txt +2 -0
- wherefore-0.1.0.dist-info/licenses/LICENSE +202 -0
- wherefore-0.1.0.dist-info/licenses/NOTICE +5 -0
- wherefore-0.1.0.dist-info/top_level.txt +1 -0
wherefore/__init__.py
ADDED
|
File without changes
|
wherefore/cli.py
ADDED
|
@@ -0,0 +1,632 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cli.py
|
|
3
|
+
|
|
4
|
+
The `wherefore compare` command. Wires together everything that's
|
|
5
|
+
real: loaders -> (exact or fuzzy key resolution) -> diff_engine ->
|
|
6
|
+
cluster_mismatches -> a Markdown report.
|
|
7
|
+
|
|
8
|
+
By default, the report shows statistical findings only -- zero
|
|
9
|
+
network calls, zero API cost, no key required. This is the default
|
|
10
|
+
specifically so anyone can clone the repo and try the tool for free
|
|
11
|
+
without needing an Anthropic API key (see README "Try it yourself").
|
|
12
|
+
|
|
13
|
+
Pass --explain to additionally call the real AI reasoning layer
|
|
14
|
+
(explain()) for each cluster and include its plain-English narrative
|
|
15
|
+
in the report ALONGSIDE the statistical detail -- not replacing it,
|
|
16
|
+
so a reader can see both the AI's causal claim and the raw evidence it
|
|
17
|
+
reasoned from side by side, rather than trusting the narrative blindly.
|
|
18
|
+
--explain requires ANTHROPIC_API_KEY to be set; this is checked up
|
|
19
|
+
front, before any diffing/clustering work, so a missing key fails fast
|
|
20
|
+
with a clear message instead of partway through a run.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import os
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
|
|
28
|
+
import typer
|
|
29
|
+
|
|
30
|
+
from wherefore.clustering.cluster_mismatches import (
|
|
31
|
+
DEFAULT_CONFIDENCE_THRESHOLD,
|
|
32
|
+
Cluster,
|
|
33
|
+
cluster_mismatches,
|
|
34
|
+
detect_row_presence_patterns,
|
|
35
|
+
)
|
|
36
|
+
from wherefore.comparison.diff_engine import compare as run_diff
|
|
37
|
+
from wherefore.comparison.key_matching import fuzzy_match_keys
|
|
38
|
+
from wherefore.comparison.loaders import load_file
|
|
39
|
+
from wherefore.reasoning.explain import ClusterExplanation, explain
|
|
40
|
+
from wherefore.taxonomy.registry import build_llm_taxonomy_menu
|
|
41
|
+
|
|
42
|
+
app = typer.Typer()


# This empty callback exists purely so Typer keeps `compare` as an
# explicit subcommand. Without it, Typer collapses a single registered
# @app.command() into the app's root invocation -- confirmed directly:
# `wherefore compare a.csv b.csv` failed with "unexpected extra
# argument" until this was added, because Typer treated `compare` as
# the literal first positional argument rather than a subcommand name.
# Remove this once a second subcommand is added (Typer stops
# collapsing once there are 2+ commands registered).
#
# NOTE(review): this module now registers two commands (`compare` and
# `compare-dir`), so by the reasoning above the callback may no longer be
# needed for subcommand mode. Its docstring is presumably what Typer shows
# as the app-level help text -- confirm before deleting it.
@app.callback()
def _force_subcommand_mode() -> None:
    """wherefore: explains why two datasets differ, not just that they do."""
|
|
56
|
+
|
|
57
|
+
# Minimum uniqueness ratio (distinct values / row count) a shared column must
# reach in BOTH files before _auto_detect_key will consider it as a join key.
MIN_KEY_UNIQUENESS = 0.95
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
from dataclasses import dataclass
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass
class ComparisonRunResult:
    """
    Structured output of running one source/target comparison, shared
    by `compare` (one pair) and `compare_dir` (many pairs) so the
    actual diff/cluster/explain logic lives in exactly one place.
    Render/print/write decisions stay with the caller -- this dataclass
    just carries what happened.
    """

    # Name of the column the two frames were joined on (explicit --key or
    # the auto-detected one).
    join_column: str
    # Whatever diff_engine.compare() returned; the renderers in this module
    # read its row counts and source_only_keys / target_only_keys.
    diff_result: object
    # Per-column mismatch clusters returned by cluster_mismatches().
    clusters: list
    # Clusters returned by detect_row_presence_patterns(); the renderers
    # index them by their `.side` attribute ("source_only" / "target_only").
    row_presence_clusters: list
    # AI explanations keyed by cluster column name; empty unless --explain
    # was requested and at least one explain() call succeeded.
    explanations: dict[str, ClusterExplanation]
    # Union of the redaction category names reported by explain() calls.
    redaction_categories: set[str]
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _run_comparison(
    source_df,
    target_df,
    key: str | None,
    fuzzy_keys: bool,
    confidence_threshold: float,
    explain_flag: bool,
    no_redact: bool,
) -> ComparisonRunResult:
    """
    Run the diff -> cluster -> (optional) explain pipeline for one pair.

    Shared by compare() and compare_dir() so key detection, fuzzy key
    resolution and the redaction-aware explain() loop exist only once.

    Raises ValueError when no join key can be auto-detected, or when the
    chosen key column is absent from either frame. Callers decide what
    that means: compare() aborts the run, compare_dir() skips the pair
    and carries on. A failing explain() call is NOT fatal -- it is
    reported as a warning on stderr and that cluster simply gets no
    explanation.
    """
    join_column = key if key else _auto_detect_key(source_df, target_df)
    if join_column is None:
        raise ValueError("Could not auto-detect a join key column. Pass one explicitly with --key.")

    for frame in (source_df, target_df):
        if join_column not in frame.columns:
            raise ValueError(f"Key column {join_column!r} not found in both files.")

    # The per-key fuzzy confidence is forwarded to the diff engine rather
    # than dropped. Nothing in this module reads it back yet: key_mismatch
    # itself is detected from the source-only/target-only rows by
    # detect_row_presence_patterns below. The confidence is kept for a
    # separate, still-deferred detector that would flag fuzzy matches
    # which only barely cleared the acceptance floor.
    confidence_by_key: dict[str, float] | None = None
    if fuzzy_keys:
        resolved = _apply_fuzzy_key_resolution(source_df, target_df, join_column)
        source_df, target_df, confidence_by_key = resolved

    diff_result = run_diff(
        source_df,
        target_df,
        join_columns=join_column,
        fuzzy_match_confidence=confidence_by_key,
    )
    clusters = cluster_mismatches(diff_result, confidence_threshold=confidence_threshold)
    row_presence_clusters = detect_row_presence_patterns(
        diff_result,
        source_df=source_df,
        target_df=target_df,
        confidence_threshold=confidence_threshold,
    )

    explanations: dict[str, ClusterExplanation] = {}
    redacted: set[str] = set()
    if explain_flag and clusters:
        menu = build_llm_taxonomy_menu()
        typer.echo(f"Calling Claude for {len(clusters)} cluster(s)...")
        should_redact = not no_redact
        for cluster in clusters:
            # Best-effort per cluster: one failed API call must not lose
            # the statistical results already computed for the others.
            try:
                narrative, found_categories = explain(cluster, menu, redact=should_redact)
                explanations[cluster.column] = narrative
                redacted.update(found_categories)
            except Exception as exc:
                typer.secho(
                    f"Warning: explain() failed for column {cluster.column!r}: {exc}",
                    fg=typer.colors.YELLOW,
                    err=True,
                )

    return ComparisonRunResult(
        join_column=join_column,
        diff_result=diff_result,
        clusters=clusters,
        row_presence_clusters=row_presence_clusters,
        explanations=explanations,
        redaction_categories=redacted,
    )
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
@app.command()
def compare(
    source: str = typer.Argument(..., help="Path to the source CSV/JSON file"),
    target: str = typer.Argument(..., help="Path to the target CSV/JSON file"),
    key: str = typer.Option(
        None, "--key", help="Join key column name. If omitted, wherefore tries to auto-detect one."
    ),
    fuzzy_keys: bool = typer.Option(
        False,
        "--fuzzy-keys",
        help="Allow approximate key matching when exact keys don't align (e.g. 'CUST-001' vs 'CUST001').",
    ),
    output: str = typer.Option("report.md", "--output", help="Path to write the Markdown report."),
    confidence_threshold: float = typer.Option(
        DEFAULT_CONFIDENCE_THRESHOLD,
        "--confidence-threshold",
        help="Minimum confidence (0-1) for a statistical signature to count as a pattern match.",
    ),
    explain_flag: bool = typer.Option(
        False,
        "--explain",
        help="Call the real Claude API to generate plain-English causal narratives for each "
        "cluster, in addition to the statistical detail. Requires ANTHROPIC_API_KEY to be "
        "set. Makes real network calls and incurs real API cost -- off by default.",
    ),
    no_redact: bool = typer.Option(
        False,
        "--no-redact",
        help="Disable automatic redaction of common sensitive patterns (emails, SSNs, credit "
        "card numbers, phone numbers) before sending values to the Claude API with --explain. "
        "Redaction is ON by default -- only disable this if you've already vetted your data.",
    ),
) -> None:
    """
    Compare two datasets and show what's different, grouped by pattern
    where a statistical signature matches a known failure mode.

    Example:
        wherefore compare old_export.csv new_export.csv --output report.md
        wherefore compare old_export.csv new_export.csv --explain
    """
    # NOTE: the docstring above is kept verbatim -- presumably Typer shows it
    # as this command's --help text, so it is user-facing output.
    #
    # Every failure path below prints a red message to stderr and exits with
    # code 1; on success the report is written to `output` and a summary is
    # printed to the terminal.

    # Fail fast on a missing API key, before any loading/diffing work.
    if explain_flag and not os.environ.get("ANTHROPIC_API_KEY"):
        typer.secho(
            "Error: --explain requires ANTHROPIC_API_KEY to be set in your environment.\n"
            'Run: export ANTHROPIC_API_KEY="sk-ant-..." before using --explain.',
            fg=typer.colors.RED,
            err=True,
        )
        raise typer.Exit(code=1)

    try:
        source_df = load_file(source)
        target_df = load_file(target)
    except (FileNotFoundError, ValueError, UnicodeDecodeError, RuntimeError, ImportError) as e:
        typer.secho(f"Error loading files: {e}", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1)

    try:
        result = _run_comparison(
            source_df, target_df, key, fuzzy_keys, confidence_threshold, explain_flag, no_redact
        )
    except ValueError as e:
        typer.secho(str(e), fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1)

    if result.redaction_categories:
        typer.secho(
            f"Redacted before sending to Claude: {', '.join(sorted(result.redaction_categories))} "
            f"-- pass --no-redact to disable this.",
            fg=typer.colors.YELLOW,
        )

    report = _render_report(
        source, target, result.join_column, result.diff_result, result.clusters, result.explanations,
        row_presence_clusters=result.row_presence_clusters,
    )
    try:
        # Explicit UTF-8: the report echoes raw data values, and the platform
        # default encoding (e.g. cp1252 on Windows) cannot represent every
        # character a dataset may contain.
        Path(output).write_text(report, encoding="utf-8")
    except OSError as e:
        # e.g. missing parent directory or no write permission -- report it
        # like every other failure here instead of dumping a traceback.
        typer.secho(f"Error writing report to {output!r}: {e}", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1)

    _print_summary(
        result.diff_result, result.clusters, output, result.explanations,
        row_presence_clusters=result.row_presence_clusters,
    )
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
@app.command(name="compare-dir")
def compare_dir(
    source_dir: str = typer.Argument(..., help="Directory of source files."),
    target_dir: str = typer.Argument(..., help="Directory of target files with matching filenames."),
    output_dir: str = typer.Option(
        "reports", "--output-dir", help="Directory to write one report per matched file pair."
    ),
    key: str = typer.Option(
        None, "--key", help="Join key column name, applied to every pair. If omitted, auto-detected per pair."
    ),
    fuzzy_keys: bool = typer.Option(False, "--fuzzy-keys", help="Allow approximate key matching, applied to every pair."),
    confidence_threshold: float = typer.Option(
        DEFAULT_CONFIDENCE_THRESHOLD, "--confidence-threshold", help="Minimum confidence for a pattern match."
    ),
    explain_flag: bool = typer.Option(
        False, "--explain", help="Call the real Claude API for every pair with mismatches. Requires ANTHROPIC_API_KEY."
    ),
    no_redact: bool = typer.Option(False, "--no-redact", help="Disable redaction for all pairs."),
) -> None:
    """
    Compare every matching file pair across two directories -- the
    real-world shape of a migration audit, where you're checking dozens
    of tables, not one. Files are matched by IDENTICAL FILENAME between
    source_dir and target_dir (e.g. source_dir/accounts.csv pairs with
    target_dir/accounts.csv) -- the same mental model as "same table,
    same name, different environment," and deliberately simple: no
    fuzzy filename matching, since guessing wrong at the FILE level
    (comparing the wrong two tables) is a much worse mistake than
    guessing wrong at the row-key level, which already has its own
    careful, opt-in fuzzy-matching path (--fuzzy-keys).

    Writes one report per pair into output_dir (named after the
    source file), plus a one-line summary per pair to the terminal,
    and a final tally. A failure on one pair (e.g. unrecognized file
    format, no detectable key) is reported and skipped -- it does NOT
    abort the whole batch, since the entire point of this command is
    surviving a large, messy real-world directory where a handful of
    files might not compare cleanly.

    Example:
        wherefore compare-dir old_exports/ new_exports/ --output-dir reports/
    """
    # NOTE: the docstring above is kept verbatim -- presumably Typer shows it
    # as this command's --help text, so it is user-facing output.
    from collections import Counter

    # Fail fast on a missing API key, before touching any file.
    if explain_flag and not os.environ.get("ANTHROPIC_API_KEY"):
        typer.secho(
            "Error: --explain requires ANTHROPIC_API_KEY to be set in your environment.\n"
            'Run: export ANTHROPIC_API_KEY="sk-ant-..." before using --explain.',
            fg=typer.colors.RED,
            err=True,
        )
        raise typer.Exit(code=1)

    source_path = Path(source_dir)
    target_path = Path(target_dir)
    if not source_path.is_dir():
        typer.secho(f"Error: {source_dir!r} is not a directory.", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1)
    if not target_path.is_dir():
        typer.secho(f"Error: {target_dir!r} is not a directory.", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1)

    pairs = _match_files_by_name(source_path, target_path)
    if not pairs:
        typer.secho(
            f"No matching filenames found between {source_dir!r} and {target_dir!r}.",
            fg=typer.colors.RED,
            err=True,
        )
        raise typer.Exit(code=1)

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Reports are normally named "<stem>_report.md". Two pairs can share a
    # stem (accounts.csv and accounts.json); for those the full filename is
    # used instead so one report does not silently overwrite the other.
    stem_counts = Counter(src.stem for src, _ in pairs)

    typer.echo(f"Found {len(pairs)} matching file pair(s). Comparing...")
    typer.echo()

    succeeded = 0
    failed = 0
    all_redaction_categories: set[str] = set()

    for source_file, target_file in pairs:
        pair_label = source_file.name
        try:
            source_df = load_file(str(source_file))
            target_df = load_file(str(target_file))
        except (FileNotFoundError, ValueError, UnicodeDecodeError, RuntimeError, ImportError) as e:
            typer.secho(f" [SKIPPED] {pair_label}: error loading files: {e}", fg=typer.colors.RED)
            failed += 1
            continue

        try:
            result = _run_comparison(
                source_df, target_df, key, fuzzy_keys, confidence_threshold, explain_flag, no_redact
            )
        except ValueError as e:
            typer.secho(f" [SKIPPED] {pair_label}: {e}", fg=typer.colors.RED)
            failed += 1
            continue

        # Recorded before the write: redaction describes what was already
        # sent to the API, regardless of whether the report lands on disk.
        all_redaction_categories.update(result.redaction_categories)

        report = _render_report(
            str(source_file), str(target_file), result.join_column,
            result.diff_result, result.clusters, result.explanations,
            row_presence_clusters=result.row_presence_clusters,
        )
        report_stem = source_file.stem if stem_counts[source_file.stem] == 1 else source_file.name
        report_path = output_path / f"{report_stem}_report.md"
        try:
            # Explicit UTF-8: reports echo raw data values that the platform
            # default encoding may not be able to represent.
            report_path.write_text(report, encoding="utf-8")
        except OSError as e:
            # Same skip-and-continue contract as load/compare failures.
            typer.secho(f" [SKIPPED] {pair_label}: error writing report: {e}", fg=typer.colors.RED)
            failed += 1
            continue

        total_findings = len(result.clusters) + len(result.row_presence_clusters)
        if total_findings == 0:
            typer.secho(f" [OK] {pair_label}: no mismatches", fg=typer.colors.GREEN)
        else:
            pattern_names = [p.pattern_id for c in result.clusters for p in c.candidate_patterns]
            pattern_names += [p.pattern_id for c in result.row_presence_clusters for p in c.candidate_patterns]
            pattern_summary = ", ".join(pattern_names) or "unrecognized pattern(s)"
            typer.secho(
                f" [DIFF] {pair_label}: {total_findings} finding(s) ({pattern_summary})",
                fg=typer.colors.CYAN,
            )
        succeeded += 1

    typer.echo()
    if all_redaction_categories:
        typer.secho(
            f"Redacted before sending to Claude (across all pairs): "
            f"{', '.join(sorted(all_redaction_categories))} -- pass --no-redact to disable this.",
            fg=typer.colors.YELLOW,
        )
    typer.secho(f"Done: {succeeded} compared, {failed} skipped. Reports written to {output_dir}/", fg=typer.colors.GREEN)
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def _match_files_by_name(source_dir: Path, target_dir: Path) -> list[tuple[Path, Path]]:
    """
    Pair up files that carry exactly the same filename in both directories.

    Only regular files directly inside each directory are considered.
    A file present on just one side is left out of the result without
    any error. Pairs come back ordered by filename so repeated runs
    produce the same output order.
    """

    def files_by_name(directory):
        # Index the directory's regular files by bare filename.
        index = {}
        for entry in directory.iterdir():
            if entry.is_file():
                index[entry.name] = entry
        return index

    from_source = files_by_name(source_dir)
    from_target = files_by_name(target_dir)

    shared_names = [name for name in from_source if name in from_target]
    shared_names.sort()
    return [(from_source[name], from_target[name]) for name in shared_names]
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
def _auto_detect_key(source_df, target_df) -> str | None:
    """
    Guess which shared column is the join key, or return None.

    A column qualifies when it exists in both frames and its ratio of
    distinct values to rows is at least MIN_KEY_UNIQUENESS on each side.
    Perfect uniqueness is deliberately not required: files with a few
    duplicated keys (the dedup_failure scenario) are exactly what this
    tool is meant to handle. Among qualifying columns, the first one
    whose name contains "id" or "key" (case-insensitive) wins;
    otherwise the first qualifying column in source order is returned.
    """

    def uniqueness(frame, column):
        # max(..., 1) keeps an empty frame from dividing by zero.
        return frame[column].nunique() / max(len(frame), 1)

    qualifying = []
    for column in source_df.columns:
        if column not in target_df.columns:
            continue
        ratios = (uniqueness(source_df, column), uniqueness(target_df, column))
        if min(ratios) >= MIN_KEY_UNIQUENESS:
            qualifying.append(column)

    if not qualifying:
        return None

    name_hints = ("id", "key")
    preferred = [
        column for column in qualifying if any(hint in column.lower() for hint in name_hints)
    ]
    chosen = preferred if preferred else qualifying
    return chosen[0]
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
def _apply_fuzzy_key_resolution(source_df, target_df, join_column):
    """
    Rewrite target key values to the source key they fuzzily match.

    Keys are compared as strings. Wherever fuzzy_match_keys() reports a
    matched pair, the target row's key is replaced by the source key so
    the later exact join lines the rows up. Keys without a match are
    left untouched and will surface as source-only/target-only rows; if
    any keys were ambiguous, a warning listing up to five of them is
    printed to stderr.

    Returns (source_df, rewritten copy of target_df,
    match_result.confidence_by_target_key). The source frame is returned
    as passed in and the caller's target frame is not mutated. The
    confidence mapping is handed back so the caller can forward it to
    the diff engine; per the original notes it is keyed by the target
    key as it was BEFORE the rewrite, which is the only point where a
    low-scoring accepted match is still distinguishable from an exact one.
    """
    source_keys = source_df[join_column].astype(str).tolist()
    target_keys = target_df[join_column].astype(str).tolist()
    match_result = fuzzy_match_keys(source_keys, target_keys)

    ambiguous = match_result.ambiguous_target_keys
    if ambiguous:
        preview = ambiguous[:5]
        suffix = " ..." if len(ambiguous) > 5 else ""
        typer.secho(
            f"Warning: {len(ambiguous)} key(s) had ambiguous "
            f"fuzzy matches and were not auto-resolved: {preview}{suffix}",
            fg=typer.colors.YELLOW,
            err=True,
        )

    def resolve(target_key):
        # Fall back to the key itself when no confident match exists.
        return match_result.matched_pairs.get(target_key, target_key)

    resolved_target = target_df.copy()
    resolved_target[join_column] = resolved_target[join_column].astype(str).map(resolve)
    return source_df, resolved_target, match_result.confidence_by_target_key
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
def _render_report(
    source_path,
    target_path,
    join_column,
    diff_result,
    clusters,
    explanations: dict[str, ClusterExplanation] | None = None,
    row_presence_clusters: list | None = None,
) -> str:
    """
    Build the Markdown comparison report and return it as one string.

    Sections, in order: a header with paths, join key and row counts; a
    note saying whether AI explanations are included; "Rows only in
    source" / "Rows only in target" (each only when such rows exist,
    capped at 20 listed keys); then either "No mismatches found" or one
    subsection per mismatched column.

    Args:
        source_path, target_path: Shown verbatim in the header.
        join_column: Name of the join key column, shown in the header.
        diff_result: Object exposing source_row_count, target_row_count,
            matched_row_count, source_only_keys and target_only_keys.
        clusters: Per-column clusters exposing column, mismatches,
            is_unrecognized and candidate_patterns.
        explanations: Optional AI explanations keyed by cluster column.
            When one has cited_rows, those replace the default sample of
            the first five mismatches for that column.
        row_presence_clusters: Optional clusters keyed here by `.side`
            ("source_only" / "target_only").
    """
    explanations = explanations or {}
    row_presence_by_side = {c.side: c for c in (row_presence_clusters or [])}

    lines = [
        "# wherefore comparison report",
        "",
        f"- Source: `{source_path}`",
        f"- Target: `{target_path}`",
        f"- Join key: `{join_column}`",
        f"- Source rows: {diff_result.source_row_count}",
        f"- Target rows: {diff_result.target_row_count}",
        f"- Matched rows: {diff_result.matched_row_count}",
        "",
    ]

    if explanations:
        lines += [
            "> **Note:** sections marked **AI explanation** below were generated",
            "> by calling the real Claude API (`--explain` was passed). Statistical",
            "> detail is shown alongside each one so you can verify the claim",
            "> against the actual evidence it was reasoned from.",
            "",
        ]
    else:
        lines += [
            "> **Note:** this report shows statistical findings only. Pass",
            "> `--explain` to additionally generate a plain-English causal",
            "> narrative for each cluster via the Claude API (requires",
            "> `ANTHROPIC_API_KEY` and makes real, billed API calls).",
            "",
        ]

    if diff_result.source_only_keys:
        lines += _render_row_presence_section(
            "source", diff_result.source_only_keys, row_presence_by_side.get("source_only")
        )
    if diff_result.target_only_keys:
        lines += _render_row_presence_section(
            "target", diff_result.target_only_keys, row_presence_by_side.get("target_only")
        )

    if not clusters:
        lines.append("## No mismatches found")
        lines.append("")
        lines.append("Every matched row compared identically across all columns.")
        return "\n".join(lines)

    lines.append(f"## Mismatches by column ({len(clusters)} column(s) affected)")
    lines.append("")

    for cluster in clusters:
        lines.append(f"### `{cluster.column}` -- {len(cluster.mismatches)} mismatched row(s)")
        lines.append("")

        explanation = explanations.get(cluster.column)
        if explanation is not None:
            lines.append(f"**AI explanation** (confidence: {explanation.confidence:.2f}):")
            lines.append("")
            lines.append(explanation.narrative)
            lines.append("")

        if cluster.is_unrecognized:
            lines.append("No known failure pattern's statistical signature matched this cluster.")
        else:
            lines.extend(_format_pattern_match(match) for match in cluster.candidate_patterns)
        lines.append("")

        lines.append("Example rows:")
        lines.append("")
        if explanation is not None and explanation.cited_rows:
            # Show exactly the rows the AI cited so its claim can be
            # checked against the evidence it actually used.
            for row in explanation.cited_rows:
                lines.append(f"- `{row.key}`: `{row.source_value}` -> `{row.target_value}` *(cited by AI)*")
        else:
            for m in cluster.mismatches[:5]:
                lines.append(f"- `{m.key}`: `{m.source_value}` -> `{m.target_value}`")
            if len(cluster.mismatches) > 5:
                lines.append(f"- ... and {len(cluster.mismatches) - 5} more")
        lines.append("")

    return "\n".join(lines)


def _format_pattern_match(match) -> str:
    """Format one candidate pattern as a Markdown bullet."""
    return (
        f"- Statistically matches **{match.pattern_id}** "
        f"(signature: `{match.signature_name}`, confidence: {match.confidence:.2f})"
    )


def _render_row_presence_section(side_label, keys, presence_cluster, preview_limit=20) -> list:
    """Render one "Rows only in <side>" section as a list of Markdown lines."""
    section = [f"## Rows only in {side_label} ({len(keys)})", ""]
    if presence_cluster and not presence_cluster.is_unrecognized:
        section.extend(_format_pattern_match(p) for p in presence_cluster.candidate_patterns)
        section.append("")
    section.extend(f"- {k}" for k in keys[:preview_limit])
    if len(keys) > preview_limit:
        section.append(f"- ... and {len(keys) - preview_limit} more")
    section.append("")
    return section
|
|
575
|
+
|
|
576
|
+
|
|
577
|
+
def _print_summary(
    diff_result,
    clusters,
    output_path,
    explanations: dict[str, ClusterExplanation] | None = None,
    row_presence_clusters: list | None = None,
) -> None:
    """
    Print the terminal summary for one finished comparison.

    Output order: row counts, source-only / target-only counts (each
    followed by any recognized row-presence pattern), one line per
    column cluster and candidate pattern, the AI narrative for a column
    when one exists, and finally where the full report was written.
    """
    explanations = explanations or {}
    presence_by_side = {c.side: c for c in (row_presence_clusters or [])}

    typer.echo(
        f"Compared {diff_result.source_row_count} source rows against "
        f"{diff_result.target_row_count} target rows."
    )
    typer.echo(f"Matched rows: {diff_result.matched_row_count}")

    for side in ("source", "target"):
        unmatched_keys = getattr(diff_result, f"{side}_only_keys")
        if unmatched_keys:
            typer.echo(f"Rows only in {side}: {len(unmatched_keys)}")
            _print_row_presence_match(presence_by_side.get(f"{side}_only"))

    if clusters:
        for cluster in clusters:
            if cluster.is_unrecognized:
                typer.echo(
                    f" {cluster.column}: {len(cluster.mismatches)} mismatches, pattern unrecognized"
                )
            else:
                # One line per candidate pattern that matched this column.
                for candidate in cluster.candidate_patterns:
                    typer.secho(
                        f" {cluster.column}: {len(cluster.mismatches)} mismatches, "
                        f"matches '{candidate.pattern_id}' (confidence {candidate.confidence:.2f})",
                        fg=typer.colors.CYAN,
                    )
            ai_explanation = explanations.get(cluster.column)
            if ai_explanation is not None:
                typer.secho(f" AI: {ai_explanation.narrative}", fg=typer.colors.MAGENTA)
    else:
        typer.secho("No column mismatches found.", fg=typer.colors.GREEN)

    typer.secho(f"\nFull report written to {output_path}", fg=typer.colors.GREEN)
|
|
619
|
+
|
|
620
|
+
|
|
621
|
+
def _print_row_presence_match(row_presence_cluster) -> None:
    """
    Print one cyan line per candidate pattern of a row-presence cluster.

    Prints nothing when the cluster is None or is flagged unrecognized.
    """
    recognized = row_presence_cluster is not None and not row_presence_cluster.is_unrecognized
    if recognized:
        for candidate in row_presence_cluster.candidate_patterns:
            typer.secho(
                f" matches '{candidate.pattern_id}' (confidence {candidate.confidence:.2f})",
                fg=typer.colors.CYAN,
            )
|
|
629
|
+
|
|
630
|
+
|
|
631
|
+
# Allow running this module directly as a script; invokes the Typer app
# defined above.
if __name__ == "__main__":
    app()
|
|
File without changes
|