wherefore 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. wherefore/__init__.py +0 -0
  2. wherefore/cli.py +632 -0
  3. wherefore/clustering/__init__.py +0 -0
  4. wherefore/clustering/cluster_mismatches.py +344 -0
  5. wherefore/clustering/signatures.py +446 -0
  6. wherefore/comparison/__init__.py +0 -0
  7. wherefore/comparison/diff_engine.py +229 -0
  8. wherefore/comparison/diff_result.py +151 -0
  9. wherefore/comparison/key_matching.py +149 -0
  10. wherefore/comparison/loaders.py +370 -0
  11. wherefore/reasoning/__init__.py +0 -0
  12. wherefore/reasoning/explain.py +207 -0
  13. wherefore/reasoning/prompts/cluster_explanation_v1.md +71 -0
  14. wherefore/reasoning/providers/__init__.py +0 -0
  15. wherefore/reasoning/providers/base.py +30 -0
  16. wherefore/reasoning/providers/claude.py +81 -0
  17. wherefore/reasoning/redaction.py +118 -0
  18. wherefore/reasoning/report.py +25 -0
  19. wherefore/synthetic/__init__.py +0 -0
  20. wherefore/synthetic/base_dataset.py +259 -0
  21. wherefore/synthetic/corruptors/__init__.py +0 -0
  22. wherefore/synthetic/corruptors/dedup_failure.py +68 -0
  23. wherefore/synthetic/corruptors/encoding_mismatch.py +84 -0
  24. wherefore/synthetic/corruptors/enum_drift.py +70 -0
  25. wherefore/synthetic/corruptors/float_precision.py +78 -0
  26. wherefore/synthetic/corruptors/key_mismatch.py +100 -0
  27. wherefore/synthetic/corruptors/null_type_coercion.py +84 -0
  28. wherefore/synthetic/corruptors/timezone_shift.py +73 -0
  29. wherefore/synthetic/corruptors/truncation.py +72 -0
  30. wherefore/synthetic/ground_truth.py +83 -0
  31. wherefore/taxonomy/__init__.py +0 -0
  32. wherefore/taxonomy/patterns/dedup_failure.yaml +52 -0
  33. wherefore/taxonomy/patterns/encoding_mismatch.yaml +45 -0
  34. wherefore/taxonomy/patterns/enum_drift.yaml +48 -0
  35. wherefore/taxonomy/patterns/float_precision.yaml +51 -0
  36. wherefore/taxonomy/patterns/key_mismatch.yaml +59 -0
  37. wherefore/taxonomy/patterns/null_type_coercion.yaml +52 -0
  38. wherefore/taxonomy/patterns/timezone_shift.yaml +50 -0
  39. wherefore/taxonomy/patterns/truncation.yaml +44 -0
  40. wherefore/taxonomy/registry.py +180 -0
  41. wherefore/taxonomy/schema.py +156 -0
  42. wherefore-0.1.0.dist-info/METADATA +447 -0
  43. wherefore-0.1.0.dist-info/RECORD +48 -0
  44. wherefore-0.1.0.dist-info/WHEEL +5 -0
  45. wherefore-0.1.0.dist-info/entry_points.txt +2 -0
  46. wherefore-0.1.0.dist-info/licenses/LICENSE +202 -0
  47. wherefore-0.1.0.dist-info/licenses/NOTICE +5 -0
  48. wherefore-0.1.0.dist-info/top_level.txt +1 -0
wherefore/__init__.py ADDED
File without changes
wherefore/cli.py ADDED
@@ -0,0 +1,632 @@
1
+ """
2
+ cli.py
3
+
4
+ The `wherefore compare` command. Wires together everything that's
5
+ real: loaders -> (exact or fuzzy key resolution) -> diff_engine ->
6
+ cluster_mismatches -> a Markdown report.
7
+
8
+ By default, the report shows statistical findings only -- zero
9
+ network calls, zero API cost, no key required. This is the default
10
+ specifically so anyone can clone the repo and try the tool for free
11
+ without needing an Anthropic API key (see README "Try it yourself").
12
+
13
+ Pass --explain to additionally call the real AI reasoning layer
14
+ (explain()) for each cluster and include its plain-English narrative
15
+ in the report ALONGSIDE the statistical detail -- not replacing it,
16
+ so a reader can see both the AI's causal claim and the raw evidence it
17
+ reasoned from side by side, rather than trusting the narrative blindly.
18
+ --explain requires ANTHROPIC_API_KEY to be set; this is checked up
19
+ front, before any diffing/clustering work, so a missing key fails fast
20
+ with a clear message instead of partway through a run.
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import os
26
+ from pathlib import Path
27
+
28
+ import typer
29
+
30
+ from wherefore.clustering.cluster_mismatches import (
31
+ DEFAULT_CONFIDENCE_THRESHOLD,
32
+ Cluster,
33
+ cluster_mismatches,
34
+ detect_row_presence_patterns,
35
+ )
36
+ from wherefore.comparison.diff_engine import compare as run_diff
37
+ from wherefore.comparison.key_matching import fuzzy_match_keys
38
+ from wherefore.comparison.loaders import load_file
39
+ from wherefore.reasoning.explain import ClusterExplanation, explain
40
+ from wherefore.taxonomy.registry import build_llm_taxonomy_menu
41
+
42
+ app = typer.Typer()
43
+
44
+
45
# This empty callback was added so Typer keeps `compare` as an
# explicit subcommand. Without it, Typer collapses a single registered
# @app.command() into the app's root invocation -- confirmed directly:
# `wherefore compare a.csv b.csv` failed with "unexpected extra
# argument" until this was added, because Typer treated `compare` as
# the literal first positional argument rather than a subcommand name.
#
# NOTE(review): the original plan was to remove this once a second
# subcommand was added (Typer stops collapsing once there are 2+
# commands registered). This module now registers two commands
# (`compare` and `compare-dir`), so the collapsing workaround is
# presumably no longer required. Before deleting it, confirm whether
# the docstring below is still wanted as the app-level help text --
# Typer presumably surfaces the callback's docstring in `--help`.
@app.callback()
def _force_subcommand_mode() -> None:
    """wherefore: explains why two datasets differ, not just that they do."""
56
+
57
# Minimum uniqueness ratio (distinct values / row count) a shared column
# must reach in BOTH files before _auto_detect_key() will treat it as a
# join-key candidate. Kept below 1.0 on purpose so a handful of duplicate
# keys does not defeat auto-detection.
MIN_KEY_UNIQUENESS = 0.95  # a candidate join key column must be at least this unique to be auto-selected
58
+
59
+
60
+ from dataclasses import dataclass
61
+
62
+
63
@dataclass
class ComparisonRunResult:
    """
    Structured output of running one source/target comparison, shared
    by `compare` (one pair) and `compare_dir` (many pairs) so the
    actual diff/cluster/explain logic lives in exactly one place.
    Render/print/write decisions stay with the caller -- this dataclass
    just carries what happened.
    """

    # Column the two frames were joined on: the explicit --key value, or
    # the auto-detected one when no key was passed.
    join_column: str
    # Whatever diff_engine.compare() returned; the report helpers read its
    # row counts and source_only_keys / target_only_keys.
    diff_result: object
    # Result of cluster_mismatches(); each item is read for .column,
    # .mismatches, .is_unrecognized and .candidate_patterns.
    clusters: list
    # Result of detect_row_presence_patterns(); each item is read for
    # .side, .is_unrecognized and .candidate_patterns.
    row_presence_clusters: list
    # Maps cluster.column -> ClusterExplanation. Empty unless --explain
    # was passed and explain() succeeded for that cluster.
    explanations: dict
    # Union of the redaction category names explain() reported across all
    # clusters; empty when nothing was redacted or --explain was off.
    redaction_categories: set[str]
79
+
80
+
81
def _run_comparison(
    source_df,
    target_df,
    key: str | None,
    fuzzy_keys: bool,
    confidence_threshold: float,
    explain_flag: bool,
    no_redact: bool,
) -> ComparisonRunResult:
    """
    Run the diff -> cluster -> (optional) explain pipeline for one pair.

    Shared by compare() and compare_dir() so key detection, fuzzy key
    resolution and the redaction-aware explain() loop exist only once.

    Args:
        source_df / target_df: the two loaded frames to compare.
        key: explicit join column; when falsy the column is auto-detected.
        fuzzy_keys: rewrite target keys to their fuzzy-matched source key
            before the exact join.
        confidence_threshold: passed to both clustering calls.
        explain_flag: call explain() once per column cluster.
        no_redact: when True, explain() is called with redact=False.

    Returns:
        A ComparisonRunResult carrying everything the caller needs to
        render, print or write.

    Raises:
        ValueError: no join key could be auto-detected, or the join key
            is not a column of both frames. Callers decide whether that
            aborts the run (compare) or only skips the pair (compare_dir).

    A failure inside explain() for one cluster is reported as a warning
    on stderr and does not stop the remaining clusters.
    """
    join_column = key or _auto_detect_key(source_df, target_df)
    if join_column is None:
        raise ValueError("Could not auto-detect a join key column. Pass one explicitly with --key.")

    present_in_both = join_column in source_df.columns and join_column in target_df.columns
    if not present_in_both:
        raise ValueError(f"Key column {join_column!r} not found in both files.")

    # The per-key fuzzy confidence is forwarded to the diff engine rather
    # than dropped. Nothing in this module reads it back yet: unresolved
    # key drift is detected from the source-only/target-only rows by
    # detect_row_presence_patterns() below, independently of this value.
    # It is kept for a planned detector that flags fuzzy matches which
    # only barely cleared the confidence floor.
    confidence_by_key: dict[str, float] | None = None
    if fuzzy_keys:
        source_df, target_df, confidence_by_key = _apply_fuzzy_key_resolution(
            source_df, target_df, join_column
        )

    diff_result = run_diff(
        source_df,
        target_df,
        join_columns=join_column,
        fuzzy_match_confidence=confidence_by_key,
    )
    column_clusters = cluster_mismatches(diff_result, confidence_threshold=confidence_threshold)
    presence_clusters = detect_row_presence_patterns(
        diff_result,
        source_df=source_df,
        target_df=target_df,
        confidence_threshold=confidence_threshold,
    )

    explanation_by_column: dict[str, ClusterExplanation] = {}
    redacted_categories: set[str] = set()
    if explain_flag and column_clusters:
        menu = build_llm_taxonomy_menu()
        typer.echo(f"Calling Claude for {len(column_clusters)} cluster(s)...")
        for cluster in column_clusters:
            # Deliberately best-effort: one failed API call must not lose
            # the explanations already gathered for the other clusters.
            try:
                explanation, categories = explain(cluster, menu, redact=not no_redact)
                explanation_by_column[cluster.column] = explanation
                redacted_categories.update(categories)
            except Exception as e:
                typer.secho(
                    f"Warning: explain() failed for column {cluster.column!r}: {e}",
                    fg=typer.colors.YELLOW,
                    err=True,
                )

    return ComparisonRunResult(
        join_column=join_column,
        diff_result=diff_result,
        clusters=column_clusters,
        row_presence_clusters=presence_clusters,
        explanations=explanation_by_column,
        redaction_categories=redacted_categories,
    )
157
+
158
+
159
@app.command()
def compare(
    source: str = typer.Argument(..., help="Path to the source CSV/JSON file"),
    target: str = typer.Argument(..., help="Path to the target CSV/JSON file"),
    key: str = typer.Option(
        None, "--key", help="Join key column name. If omitted, wherefore tries to auto-detect one."
    ),
    fuzzy_keys: bool = typer.Option(
        False,
        "--fuzzy-keys",
        help="Allow approximate key matching when exact keys don't align (e.g. 'CUST-001' vs 'CUST001').",
    ),
    output: str = typer.Option("report.md", "--output", help="Path to write the Markdown report."),
    confidence_threshold: float = typer.Option(
        DEFAULT_CONFIDENCE_THRESHOLD,
        "--confidence-threshold",
        help="Minimum confidence (0-1) for a statistical signature to count as a pattern match.",
    ),
    explain_flag: bool = typer.Option(
        False,
        "--explain",
        help="Call the real Claude API to generate plain-English causal narratives for each "
        "cluster, in addition to the statistical detail. Requires ANTHROPIC_API_KEY to be "
        "set. Makes real network calls and incurs real API cost -- off by default.",
    ),
    no_redact: bool = typer.Option(
        False,
        "--no-redact",
        help="Disable automatic redaction of common sensitive patterns (emails, SSNs, credit "
        "card numbers, phone numbers) before sending values to the Claude API with --explain. "
        "Redaction is ON by default -- only disable this if you've already vetted your data.",
    ),
) -> None:
    """
    Compare two datasets and show what's different, grouped by pattern
    where a statistical signature matches a known failure mode.

    Example:
        wherefore compare old_export.csv new_export.csv --output report.md
        wherefore compare old_export.csv new_export.csv --explain
    """
    # Checked before any loading/diffing so a missing key fails fast
    # instead of partway through a run.
    if explain_flag and not os.environ.get("ANTHROPIC_API_KEY"):
        typer.secho(
            "Error: --explain requires ANTHROPIC_API_KEY to be set in your environment.\n"
            'Run: export ANTHROPIC_API_KEY="sk-ant-..." before using --explain.',
            fg=typer.colors.RED,
            err=True,
        )
        raise typer.Exit(code=1)

    try:
        source_df = load_file(source)
        target_df = load_file(target)
    # OSError (rather than only FileNotFoundError) so that an unreadable
    # file or a directory passed by mistake is reported as a clean error
    # too. UnicodeDecodeError is already covered by ValueError; it stays
    # listed to keep the intent explicit.
    except (OSError, ValueError, UnicodeDecodeError, RuntimeError, ImportError) as e:
        typer.secho(f"Error loading files: {e}", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1)

    try:
        result = _run_comparison(
            source_df, target_df, key, fuzzy_keys, confidence_threshold, explain_flag, no_redact
        )
    except ValueError as e:
        # Raised when no join key can be detected or the key is missing
        # from one of the files.
        typer.secho(str(e), fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1)

    if result.redaction_categories:
        typer.secho(
            f"Redacted before sending to Claude: {', '.join(sorted(result.redaction_categories))} "
            f"-- pass --no-redact to disable this.",
            fg=typer.colors.YELLOW,
        )

    report = _render_report(
        source,
        target,
        result.join_column,
        result.diff_result,
        result.clusters,
        result.explanations,
        row_presence_clusters=result.row_presence_clusters,
    )
    # The report embeds raw data values, which may be non-ASCII. Writing
    # with the locale's default encoding can raise UnicodeEncodeError or
    # produce mojibake on non-UTF-8 systems, so the encoding is pinned.
    try:
        Path(output).write_text(report, encoding="utf-8")
    except OSError as e:
        # e.g. the parent directory does not exist or is not writable.
        typer.secho(f"Error writing report to {output!r}: {e}", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1)

    _print_summary(
        result.diff_result,
        result.clusters,
        output,
        result.explanations,
        row_presence_clusters=result.row_presence_clusters,
    )
241
+
242
+
243
@app.command(name="compare-dir")
def compare_dir(
    source_dir: str = typer.Argument(..., help="Directory of source files."),
    target_dir: str = typer.Argument(..., help="Directory of target files with matching filenames."),
    output_dir: str = typer.Option(
        "reports", "--output-dir", help="Directory to write one report per matched file pair."
    ),
    key: str = typer.Option(
        None, "--key", help="Join key column name, applied to every pair. If omitted, auto-detected per pair."
    ),
    fuzzy_keys: bool = typer.Option(False, "--fuzzy-keys", help="Allow approximate key matching, applied to every pair."),
    confidence_threshold: float = typer.Option(
        DEFAULT_CONFIDENCE_THRESHOLD, "--confidence-threshold", help="Minimum confidence for a pattern match."
    ),
    explain_flag: bool = typer.Option(
        False, "--explain", help="Call the real Claude API for every pair with mismatches. Requires ANTHROPIC_API_KEY."
    ),
    no_redact: bool = typer.Option(False, "--no-redact", help="Disable redaction for all pairs."),
) -> None:
    """
    Compare every matching file pair across two directories -- the
    real-world shape of a migration audit, where you're checking dozens
    of tables, not one. Files are matched by IDENTICAL FILENAME between
    source_dir and target_dir (e.g. source_dir/accounts.csv pairs with
    target_dir/accounts.csv) -- the same mental model as "same table,
    same name, different environment," and deliberately simple: no
    fuzzy filename matching, since guessing wrong at the FILE level
    (comparing the wrong two tables) is a much worse mistake than
    guessing wrong at the row-key level, which already has its own
    careful, opt-in fuzzy-matching path (--fuzzy-keys).

    Writes one report per pair into output_dir (named after the
    source file), plus a one-line summary per pair to the terminal,
    and a final tally. A failure on one pair (e.g. unrecognized file
    format, no detectable key) is reported and skipped -- it does NOT
    abort the whole batch, since the entire point of this command is
    surviving a large, messy real-world directory where a handful of
    files might not compare cleanly.

    Example:
        wherefore compare-dir old_exports/ new_exports/ --output-dir reports/
    """
    from collections import Counter

    # Checked up front so a missing key fails before any pair is processed.
    if explain_flag and not os.environ.get("ANTHROPIC_API_KEY"):
        typer.secho(
            "Error: --explain requires ANTHROPIC_API_KEY to be set in your environment.\n"
            'Run: export ANTHROPIC_API_KEY="sk-ant-..." before using --explain.',
            fg=typer.colors.RED,
            err=True,
        )
        raise typer.Exit(code=1)

    source_path = Path(source_dir)
    target_path = Path(target_dir)
    if not source_path.is_dir():
        typer.secho(f"Error: {source_dir!r} is not a directory.", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1)
    if not target_path.is_dir():
        typer.secho(f"Error: {target_dir!r} is not a directory.", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1)

    pairs = _match_files_by_name(source_path, target_path)
    if not pairs:
        typer.secho(
            f"No matching filenames found between {source_dir!r} and {target_dir!r}.",
            fg=typer.colors.RED,
            err=True,
        )
        raise typer.Exit(code=1)

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Report files are named after the source file's stem. Two pairs that
    # share a stem but differ in extension (accounts.csv / accounts.json)
    # would otherwise write to the same report and silently overwrite each
    # other, so only those colliding pairs fall back to the full filename.
    stem_counts = Counter(source_file.stem for source_file, _ in pairs)

    typer.echo(f"Found {len(pairs)} matching file pair(s). Comparing...")
    typer.echo()

    succeeded = 0
    failed = 0
    all_redaction_categories: set[str] = set()

    for source_file, target_file in pairs:
        pair_label = source_file.name
        try:
            source_df = load_file(str(source_file))
            target_df = load_file(str(target_file))
        # OSError (rather than only FileNotFoundError) so one unreadable
        # file is skipped instead of aborting the whole batch.
        except (OSError, ValueError, UnicodeDecodeError, RuntimeError, ImportError) as e:
            typer.secho(f" [SKIPPED] {pair_label}: error loading files: {e}", fg=typer.colors.RED)
            failed += 1
            continue

        try:
            result = _run_comparison(
                source_df, target_df, key, fuzzy_keys, confidence_threshold, explain_flag, no_redact
            )
        except ValueError as e:
            typer.secho(f" [SKIPPED] {pair_label}: {e}", fg=typer.colors.RED)
            failed += 1
            continue

        all_redaction_categories.update(result.redaction_categories)

        report = _render_report(
            str(source_file),
            str(target_file),
            result.join_column,
            result.diff_result,
            result.clusters,
            result.explanations,
            row_presence_clusters=result.row_presence_clusters,
        )
        report_stem = source_file.stem if stem_counts[source_file.stem] == 1 else source_file.name
        report_path = output_path / f"{report_stem}_report.md"
        try:
            # Pinned to UTF-8: the report embeds raw data values, and the
            # locale default encoding may not be able to represent them.
            report_path.write_text(report, encoding="utf-8")
        except OSError as e:
            typer.secho(f" [SKIPPED] {pair_label}: error writing report: {e}", fg=typer.colors.RED)
            failed += 1
            continue

        total_findings = len(result.clusters) + len(result.row_presence_clusters)
        if total_findings == 0:
            typer.secho(f" [OK] {pair_label}: no mismatches", fg=typer.colors.GREEN)
        else:
            pattern_names = [p.pattern_id for c in result.clusters for p in c.candidate_patterns]
            pattern_names += [p.pattern_id for c in result.row_presence_clusters for p in c.candidate_patterns]
            pattern_summary = ", ".join(pattern_names) or "unrecognized pattern(s)"
            typer.secho(
                f" [DIFF] {pair_label}: {total_findings} finding(s) ({pattern_summary})",
                fg=typer.colors.CYAN,
            )
        succeeded += 1

    typer.echo()
    if all_redaction_categories:
        typer.secho(
            f"Redacted before sending to Claude (across all pairs): "
            f"{', '.join(sorted(all_redaction_categories))} -- pass --no-redact to disable this.",
            fg=typer.colors.YELLOW,
        )
    typer.secho(f"Done: {succeeded} compared, {failed} skipped. Reports written to {output_dir}/", fg=typer.colors.GREEN)
372
+
373
+
374
def _match_files_by_name(source_dir: Path, target_dir: Path) -> list[tuple[Path, Path]]:
    """
    Pair up the regular files of two directories by exact filename.

    Only names that exist as a file on BOTH sides produce a pair. A file
    present on just one side is left out without any error, because a
    real migration can legitimately add or drop a table. Subdirectories
    are ignored and nothing is matched approximately.

    Returns:
        (source_file, target_file) tuples ordered by filename, so the
        output order is deterministic.
    """

    def _files_by_name(directory: Path) -> dict[str, Path]:
        # Index the directory's direct children that are regular files.
        listing: dict[str, Path] = {}
        for entry in directory.iterdir():
            if entry.is_file():
                listing[entry.name] = entry
        return listing

    from_source = _files_by_name(source_dir)
    from_target = _files_by_name(target_dir)

    shared_names = sorted(name for name in from_source if name in from_target)
    return [(from_source[name], from_target[name]) for name in shared_names]
388
+
389
+
390
def _auto_detect_key(source_df, target_df, min_uniqueness: float | None = None) -> str | None:
    """
    Pick a shared column that looks like an identifier in both frames.

    A column qualifies when it exists in both frames and its uniqueness
    ratio (distinct values / row count) reaches the threshold in both.
    Perfect uniqueness is deliberately not required: real data sometimes
    carries a few duplicate keys (the dedup_failure scenario this tool
    exists to catch), and demanding 100% would make auto-detection fail
    on exactly those files.

    When several columns qualify, the first one whose name contains
    "id" or "key" (case-insensitive) wins, since that is a strong naming
    convention; otherwise the first qualifying column in source column
    order is returned.

    Args:
        source_df: source frame.
        target_df: target frame.
        min_uniqueness: uniqueness ratio a column must reach in both
            frames. Defaults to MIN_KEY_UNIQUENESS when omitted.

    Returns:
        The chosen column label, or None when no shared column
        qualifies (including when either frame is empty). The label is
        returned as-is, so it is only a str when the frames use string
        column labels.
    """
    threshold = MIN_KEY_UNIQUENESS if min_uniqueness is None else min_uniqueness

    # max(..., 1) keeps the ratio defined for empty frames; an empty
    # frame then scores 0 for every column and nothing qualifies.
    source_rows = max(len(source_df), 1)
    target_rows = max(len(target_df), 1)

    candidates = [
        col
        for col in source_df.columns
        if col in target_df.columns
        and source_df[col].nunique() / source_rows >= threshold
        and target_df[col].nunique() / target_rows >= threshold
    ]
    if not candidates:
        return None

    for col in candidates:
        # str() because column labels are not guaranteed to be strings
        # (e.g. integer labels); calling .lower() on them directly would
        # raise AttributeError.
        name = str(col).lower()
        if "id" in name or "key" in name:
            return col
    return candidates[0]
416
+
417
+
418
def _apply_fuzzy_key_resolution(source_df, target_df, join_column):
    """
    Rewrite target key values to the source key they fuzzily match.

    Both key columns are stringified and handed to fuzzy_match_keys().
    Every target key with an entry in its matched_pairs is replaced by
    that entry so the diff engine's exact join lines the rows up. Keys
    without an entry are kept unchanged; they then surface as
    source-only / target-only rows, which is the honest result when a
    key could not be resolved confidently.

    Ambiguous target keys are reported as a warning on stderr (first
    five shown) and are not resolved automatically.

    Returns:
        (source_df, rewritten copy of target_df, confidence_by_target_key).
        The confidence mapping is keyed by the ORIGINAL, pre-rename
        target key. It is returned because this is the last point where
        a low-scoring accepted match is still distinguishable; after
        the rename it looks like a clean exact match downstream.

    NOTE(review): the target key column comes back as str while
    source_df is returned untouched -- confirm the loaders or the diff
    engine normalise key dtypes when the source keys are not strings.
    """
    match_result = fuzzy_match_keys(
        source_df[join_column].astype(str).tolist(),
        target_df[join_column].astype(str).tolist(),
    )

    if match_result.ambiguous_target_keys:
        preview = match_result.ambiguous_target_keys[:5]
        suffix = " ..." if len(match_result.ambiguous_target_keys) > 5 else ""
        typer.secho(
            f"Warning: {len(match_result.ambiguous_target_keys)} key(s) had ambiguous "
            f"fuzzy matches and were not auto-resolved: {preview}{suffix}",
            fg=typer.colors.YELLOW,
            err=True,
        )

    def _resolve(target_key):
        # Fall back to the key itself when no confident match exists.
        return match_result.matched_pairs.get(target_key, target_key)

    # Work on a copy so the caller's frame is never mutated.
    resolved_target = target_df.copy()
    stringified_keys = resolved_target[join_column].astype(str)
    resolved_target[join_column] = stringified_keys.map(_resolve)

    return source_df, resolved_target, match_result.confidence_by_target_key
455
+
456
+
457
def _render_report(
    source_path,
    target_path,
    join_column,
    diff_result,
    clusters,
    explanations: dict[str, ClusterExplanation] | None = None,
    row_presence_clusters: list | None = None,
) -> str:
    """
    Build the Markdown comparison report as a single string.

    Layout: header with paths/join key/row counts, a note saying whether
    AI explanations are included, optional "Rows only in source/target"
    sections, then one section per column cluster (or a "No mismatches
    found" section when there are none).

    Args:
        source_path / target_path: shown verbatim in the header.
        join_column: join key name shown in the header.
        diff_result: read for source_row_count, target_row_count,
            matched_row_count, source_only_keys and target_only_keys.
        clusters: column clusters; each is read for column, mismatches,
            is_unrecognized and candidate_patterns.
        explanations: optional mapping of cluster.column -> explanation
            (read for confidence, narrative and cited_rows).
        row_presence_clusters: optional row-presence findings; looked up
            by their .side value ("source_only" / "target_only").

    Returns:
        The report text, lines joined with "\\n".
    """
    explanations = explanations or {}
    row_presence_by_side = {c.side: c for c in (row_presence_clusters or [])}

    lines = [
        "# wherefore comparison report",
        "",
        f"- Source: `{source_path}`",
        f"- Target: `{target_path}`",
        f"- Join key: `{join_column}`",
        f"- Source rows: {diff_result.source_row_count}",
        f"- Target rows: {diff_result.target_row_count}",
        f"- Matched rows: {diff_result.matched_row_count}",
        "",
    ]

    if explanations:
        lines += [
            "> **Note:** sections marked **AI explanation** below were generated",
            "> by calling the real Claude API (`--explain` was passed). Statistical",
            "> detail is shown alongside each one so you can verify the claim",
            "> against the actual evidence it was reasoned from.",
            "",
        ]
    else:
        lines += [
            "> **Note:** this report shows statistical findings only. Pass",
            "> `--explain` to additionally generate a plain-English causal",
            "> narrative for each cluster via the Claude API (requires",
            "> `ANTHROPIC_API_KEY` and makes real, billed API calls).",
            "",
        ]

    lines += _row_presence_section_lines(
        "source", diff_result.source_only_keys, row_presence_by_side.get("source_only")
    )
    lines += _row_presence_section_lines(
        "target", diff_result.target_only_keys, row_presence_by_side.get("target_only")
    )

    if not clusters:
        lines += [
            "## No mismatches found",
            "",
            "Every matched row compared identically across all columns.",
        ]
        return "\n".join(lines)

    lines += [f"## Mismatches by column ({len(clusters)} column(s) affected)", ""]
    for cluster in clusters:
        lines += _cluster_section_lines(cluster, explanations.get(cluster.column))

    return "\n".join(lines)


def _pattern_match_line(match) -> str:
    """Format one statistical pattern match as a Markdown bullet."""
    return (
        f"- Statistically matches **{match.pattern_id}** "
        f"(signature: `{match.signature_name}`, confidence: {match.confidence:.2f})"
    )


def _row_presence_section_lines(side_label, keys, presence_match, max_listed: int = 20) -> list[str]:
    """Render one "Rows only in <side>" section; empty list when there are no such rows."""
    if not keys:
        return []

    section = [f"## Rows only in {side_label} ({len(keys)})", ""]
    if presence_match and not presence_match.is_unrecognized:
        section += [_pattern_match_line(p) for p in presence_match.candidate_patterns]
        section.append("")

    # Cap the listing so a large one-sided diff does not flood the report.
    section += [f"- {k}" for k in keys[:max_listed]]
    if len(keys) > max_listed:
        section.append(f"- ... and {len(keys) - max_listed} more")
    section.append("")
    return section


def _cluster_section_lines(cluster, explanation, max_examples: int = 5) -> list[str]:
    """Render one per-column mismatch section: AI narrative (if any), pattern matches, examples."""
    section = [f"### `{cluster.column}` -- {len(cluster.mismatches)} mismatched row(s)", ""]

    if explanation is not None:
        section += [
            f"**AI explanation** (confidence: {explanation.confidence:.2f}):",
            "",
            explanation.narrative,
            "",
        ]

    if cluster.is_unrecognized:
        section.append("No known failure pattern's statistical signature matched this cluster.")
    else:
        section += [_pattern_match_line(match) for match in cluster.candidate_patterns]
    section.append("")

    section += ["Example rows:", ""]
    if explanation is not None and explanation.cited_rows:
        # Prefer the rows the AI actually cited so the reader can check
        # the narrative against the same evidence.
        section += [
            f"- `{row.key}`: `{row.source_value}` -> `{row.target_value}` *(cited by AI)*"
            for row in explanation.cited_rows
        ]
    else:
        section += [
            f"- `{m.key}`: `{m.source_value}` -> `{m.target_value}`"
            for m in cluster.mismatches[:max_examples]
        ]
        if len(cluster.mismatches) > max_examples:
            section.append(f"- ... and {len(cluster.mismatches) - max_examples} more")
    section.append("")
    return section
575
+
576
+
577
def _print_summary(
    diff_result,
    clusters,
    output_path,
    explanations: dict[str, ClusterExplanation] | None = None,
    row_presence_clusters: list | None = None,
) -> None:
    """
    Echo a short terminal summary of one comparison.

    Prints the row counts, the source-only / target-only counts with any
    recognised row-presence pattern, one line per column cluster per
    matching pattern (or a single "pattern unrecognized" line), the AI
    narrative when one exists for that column, and finally where the
    full report was written.
    """
    narrative_by_column = explanations or {}
    presence_by_side = {}
    for presence in row_presence_clusters or []:
        presence_by_side[presence.side] = presence

    typer.echo(
        f"Compared {diff_result.source_row_count} source rows against "
        f"{diff_result.target_row_count} target rows."
    )
    typer.echo(f"Matched rows: {diff_result.matched_row_count}")

    if diff_result.source_only_keys:
        typer.echo(f"Rows only in source: {len(diff_result.source_only_keys)}")
        _print_row_presence_match(presence_by_side.get("source_only"))
    if diff_result.target_only_keys:
        typer.echo(f"Rows only in target: {len(diff_result.target_only_keys)}")
        _print_row_presence_match(presence_by_side.get("target_only"))

    if clusters:
        for cluster in clusters:
            if not cluster.is_unrecognized:
                # One line per candidate pattern the cluster matched.
                for match in cluster.candidate_patterns:
                    typer.secho(
                        f" {cluster.column}: {len(cluster.mismatches)} mismatches, "
                        f"matches '{match.pattern_id}' (confidence {match.confidence:.2f})",
                        fg=typer.colors.CYAN,
                    )
            else:
                typer.echo(
                    f" {cluster.column}: {len(cluster.mismatches)} mismatches, pattern unrecognized"
                )

            explanation = narrative_by_column.get(cluster.column)
            if explanation is not None:
                typer.secho(f" AI: {explanation.narrative}", fg=typer.colors.MAGENTA)
    else:
        typer.secho("No column mismatches found.", fg=typer.colors.GREEN)

    typer.secho(f"\nFull report written to {output_path}", fg=typer.colors.GREEN)
619
+
620
+
621
def _print_row_presence_match(row_presence_cluster) -> None:
    """
    Echo the patterns matched by a row-presence cluster, one per line.

    Prints nothing when there is no cluster for that side or when the
    cluster is flagged as unrecognized.
    """
    recognised = row_presence_cluster is not None and not row_presence_cluster.is_unrecognized
    if recognised:
        for match in row_presence_cluster.candidate_patterns:
            typer.secho(
                f" matches '{match.pattern_id}' (confidence {match.confidence:.2f})",
                fg=typer.colors.CYAN,
            )
629
+
630
+
631
# Run the Typer app when this module is executed directly as a script;
# importing the module does not invoke the CLI.
if __name__ == "__main__":
    app()
File without changes