cc-pushback 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cc_pushback-0.1.0 → cc_pushback-0.2.0}/PKG-INFO +9 -4
- {cc_pushback-0.1.0 → cc_pushback-0.2.0}/README.md +2 -0
- {cc_pushback-0.1.0 → cc_pushback-0.2.0}/cc_pushback/__init__.py +3 -4
- {cc_pushback-0.1.0 → cc_pushback-0.2.0}/cc_pushback/claude.py +3 -2
- cc_pushback-0.2.0/cc_pushback/cli.py +450 -0
- cc_pushback-0.2.0/cc_pushback/dashboard.py +338 -0
- {cc_pushback-0.1.0 → cc_pushback-0.2.0}/cc_pushback/detectors.py +51 -36
- cc_pushback-0.2.0/cc_pushback/enrich.py +306 -0
- cc_pushback-0.2.0/cc_pushback/evaluate.py +169 -0
- {cc_pushback-0.1.0 → cc_pushback-0.2.0}/cc_pushback/formats.py +6 -6
- cc_pushback-0.2.0/cc_pushback/golden_triage.json +345 -0
- cc_pushback-0.2.0/cc_pushback/migrate.py +124 -0
- cc_pushback-0.2.0/cc_pushback/models.py +10 -0
- cc_pushback-0.2.0/cc_pushback/refine.py +159 -0
- cc_pushback-0.2.0/cc_pushback/report.py +744 -0
- {cc_pushback-0.1.0 → cc_pushback-0.2.0}/cc_pushback/scan.py +1 -1
- cc_pushback-0.2.0/cc_pushback/serve.py +47 -0
- cc_pushback-0.2.0/cc_pushback/store.py +570 -0
- cc_pushback-0.2.0/cc_pushback/triage.py +355 -0
- {cc_pushback-0.1.0 → cc_pushback-0.2.0}/pyproject.toml +7 -4
- cc_pushback-0.1.0/cc_pushback/cli.py +0 -138
- cc_pushback-0.1.0/cc_pushback/context.py +0 -11
- cc_pushback-0.1.0/cc_pushback/markers.py +0 -25
- cc_pushback-0.1.0/cc_pushback/models.py +0 -15
- cc_pushback-0.1.0/cc_pushback/nav.py +0 -31
- cc_pushback-0.1.0/cc_pushback/report.py +0 -484
- cc_pushback-0.1.0/cc_pushback/serve.py +0 -60
- cc_pushback-0.1.0/cc_pushback/store.py +0 -34
- {cc_pushback-0.1.0 → cc_pushback-0.2.0}/LICENSE +0 -0
- {cc_pushback-0.1.0 → cc_pushback-0.2.0}/cc_pushback/__main__.py +0 -0
- {cc_pushback-0.1.0 → cc_pushback-0.2.0}/cc_pushback/py.typed +0 -0
- {cc_pushback-0.1.0 → cc_pushback-0.2.0}/cc_pushback/spec.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cc-pushback
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Learn your pushback style from past Claude Code feedback and code reviews, and replicate it with a language model.
|
|
5
5
|
Keywords:
|
|
6
6
|
Author: Yasyf Mohamedali
|
|
@@ -13,14 +13,17 @@ Classifier: Operating System :: OS Independent
|
|
|
13
13
|
Classifier: Programming Language :: Python :: 3
|
|
14
14
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
15
15
|
Classifier: Typing :: Typed
|
|
16
|
-
Requires-Dist: aiohttp>=3.10
|
|
17
16
|
Requires-Dist: anyio>=4.4
|
|
18
|
-
Requires-Dist: cc-transcript>=0
|
|
17
|
+
Requires-Dist: cc-transcript>=2.0,<3
|
|
19
18
|
Requires-Dist: click>=8
|
|
20
|
-
Requires-Dist:
|
|
19
|
+
Requires-Dist: fastapi>=0.115
|
|
20
|
+
Requires-Dist: pydantic>=2
|
|
21
|
+
Requires-Dist: spawnllm>=0.1.3
|
|
22
|
+
Requires-Dist: uvicorn>=0.30
|
|
21
23
|
Requires-Dist: pytest>=8.0 ; extra == 'dev'
|
|
22
24
|
Requires-Dist: ty>=0.0.44 ; extra == 'dev'
|
|
23
25
|
Requires-Dist: ruff>=0.8 ; extra == 'dev'
|
|
26
|
+
Requires-Dist: httpx>=0.27 ; extra == 'dev'
|
|
24
27
|
Requires-Python: >=3.13
|
|
25
28
|
Project-URL: Homepage, https://github.com/yasyf/cc-pushback
|
|
26
29
|
Project-URL: Documentation, https://yasyf.github.io/cc-pushback/
|
|
@@ -32,6 +35,8 @@ Description-Content-Type: text/markdown
|
|
|
32
35
|
|
|
33
36
|
# cc-pushback
|
|
34
37
|
|
|
38
|
+

|
|
39
|
+
|
|
35
40
|
[](https://pypi.org/project/cc-pushback/)
|
|
36
41
|
[](https://pypi.org/project/cc-pushback/)
|
|
37
42
|
[](https://yasyf.github.io/cc-pushback/)
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# cc-pushback
|
|
2
2
|
|
|
3
|
+

|
|
4
|
+
|
|
3
5
|
[](https://pypi.org/project/cc-pushback/)
|
|
4
6
|
[](https://pypi.org/project/cc-pushback/)
|
|
5
7
|
[](https://yasyf.github.io/cc-pushback/)
|
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
-
from cc_pushback.context import ContextSnapshot, ContextTurn, build_snapshot
|
|
6
5
|
from cc_pushback.detectors import Detector, detect
|
|
6
|
+
from cc_pushback.migrate import MigrationReport, migrate_corpus
|
|
7
7
|
from cc_pushback.models import DedupKey, FeedbackCandidate, SourceKind, dedup_key
|
|
8
8
|
from cc_pushback.scan import ScanReport, scan
|
|
9
9
|
from cc_pushback.spec import PUSHBACK_SPEC
|
|
@@ -15,14 +15,13 @@ from cc_pushback.store import FeedbackStore
|
|
|
15
15
|
# great-docs documents __all__ when present; keep it in sync with the re-exports above.
|
|
16
16
|
__all__ = [
|
|
17
17
|
"PUSHBACK_SPEC",
|
|
18
|
-
"ContextSnapshot",
|
|
19
|
-
"ContextTurn",
|
|
20
18
|
"Detector",
|
|
21
19
|
"FeedbackCandidate",
|
|
22
20
|
"FeedbackStore",
|
|
21
|
+
"MigrationReport",
|
|
23
22
|
"ScanReport",
|
|
24
|
-
"build_snapshot",
|
|
25
23
|
"dedup_key",
|
|
26
24
|
"detect",
|
|
25
|
+
"migrate_corpus",
|
|
27
26
|
"scan",
|
|
28
27
|
]
|
|
@@ -1,9 +1,10 @@
|
|
|
1
|
-
"""A thin shell-out to the ``claude`` CLI for a single headless completion.
|
|
1
|
+
"""A thin shell-out to the ``claude`` CLI for a single headless text completion.
|
|
2
2
|
|
|
3
3
|
Argv construction and envelope parsing come from the shared ``spawnllm`` library;
|
|
4
4
|
the spawn stays local (``anyio.run_process``). It uses the user's existing Claude
|
|
5
5
|
Code auth (no API key), so the package stays offline unless ``claude`` is
|
|
6
|
-
actually on the path.
|
|
6
|
+
actually on the path. The structured path lives in
|
|
7
|
+
:mod:`cc_transcript.judge` (``run_structured``/``structured_judge``).
|
|
7
8
|
"""
|
|
8
9
|
|
|
9
10
|
from __future__ import annotations
|
|
@@ -0,0 +1,450 @@
|
|
|
1
|
+
"""The ``cc-pushback`` command-line interface: scan, triage, audit, eval, and friends."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import dataclasses
|
|
6
|
+
import functools
|
|
7
|
+
import json
|
|
8
|
+
from collections.abc import Awaitable, Callable
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import TYPE_CHECKING
|
|
11
|
+
|
|
12
|
+
import anyio
|
|
13
|
+
import click
|
|
14
|
+
from cc_transcript import CLAUDE_PROJECTS_DIR
|
|
15
|
+
|
|
16
|
+
from cc_pushback.claude import claude_available
|
|
17
|
+
from cc_pushback.dashboard import build_app
|
|
18
|
+
from cc_pushback.evaluate import evaluate, flip_report
|
|
19
|
+
from cc_pushback.models import PUSHBACK_SOURCE_KINDS, SourceKind
|
|
20
|
+
from cc_pushback.report import Sample, build_summary, golden_label, project_label
|
|
21
|
+
from cc_pushback.scan import scan as run_scan
|
|
22
|
+
from cc_pushback.serve import serve
|
|
23
|
+
from cc_pushback.store import FeedbackStore
|
|
24
|
+
from cc_pushback.triage import PROMPT_VERSION
|
|
25
|
+
from cc_pushback.triage import audit as run_audit
|
|
26
|
+
from cc_pushback.triage import triage as run_triage
|
|
27
|
+
|
|
28
|
+
if TYPE_CHECKING:
|
|
29
|
+
from spawnllm import TModel
|
|
30
|
+
|
|
31
|
+
SOURCE_KINDS = [*PUSHBACK_SOURCE_KINDS]
|
|
32
|
+
TIERS = ["small", "medium", "large"]
|
|
33
|
+
PENDING_CAP = 1200
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def coro[**P, R](fn: Callable[P, Awaitable[R]]) -> Callable[P, R]:
|
|
37
|
+
"""Adapts an async command body into the sync callback Click expects."""
|
|
38
|
+
|
|
39
|
+
@functools.wraps(fn)
|
|
40
|
+
def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
|
|
41
|
+
return anyio.run(functools.partial(fn, *args, **kwargs))
|
|
42
|
+
|
|
43
|
+
return wrapper
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@click.group()
@click.version_option(package_name="cc-pushback")
def main() -> None:
    """Collect developer pushback signals from existing Claude Code transcripts."""
    # The group body is intentionally empty: it only anchors the subcommands
    # registered below via ``@main.command``. The docstring above doubles as the
    # top-level ``--help`` text, so edit it as user-facing copy.
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@main.command()
@click.option(
    "--transcripts",
    "transcripts",
    type=click.Path(exists=True, file_okay=False, path_type=Path),
    multiple=True,
    help="Transcript directories to scan. Defaults to ~/.claude/projects.",
)
@click.option("--full", is_flag=True, help="Re-scan every transcript, ignoring recorded mtimes.")
@click.option(
    "--db",
    default=None,
    type=click.Path(dir_okay=False, path_type=Path),
    help="Database path. Defaults to ~/.cc-pushback/feedback.db.",
)
@coro
async def scan(transcripts: tuple[Path, ...], full: bool, db: Path | None) -> None:
    """Scan transcripts for feedback, incrementally.

    Each transcript is parsed only when new or modified since the last scan, and
    every candidate is inserted with ``INSERT OR IGNORE`` keyed by a content
    digest, so re-running ``scan`` over unchanged inputs is a no-op. Recording a
    file and inserting its candidates commit in one transaction.
    """
    # No --transcripts given (empty tuple) falls back to the default projects directory.
    search_roots = transcripts or (CLAUDE_PROJECTS_DIR,)
    db_path = db or FeedbackStore.default_path()
    async with await FeedbackStore.open(db_path) as store:
        outcome = await run_scan(store, search_roots, full=full)
    click.echo(f"scanned {outcome.scanned} files, {outcome.inserted} new rows")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@main.command()
@click.option(
    "--db",
    default=None,
    type=click.Path(dir_okay=False, path_type=Path),
    help="Database path. Defaults to ~/.cc-pushback/feedback.db.",
)
@coro
async def stats(db: Path | None) -> None:
    """Print ingestion counts by source kind and triage coverage."""
    db_path = db or FeedbackStore.default_path()
    async with await FeedbackStore.open(db_path) as store:
        ingest = await store.stats()
        coverage = await store.triage_stats(prompt_version=PROMPT_VERSION)

    # Ingestion section: overall totals, then one line per source kind.
    click.echo(f"total: {ingest.total} files: {ingest.files}")
    for kind, count in ingest.by_source.items():
        click.echo(f" {kind}: {count}")

    # Triage section: the acceptance share is omitted when nothing has been
    # judged yet, which also avoids dividing by zero.
    if coverage.judged:
        share = f" ({coverage.accepted / coverage.judged:.0%})"
    else:
        share = ""
    click.echo(f"triaged: {coverage.judged}/{coverage.total} (v{PROMPT_VERSION}) accepted: {coverage.accepted}{share}")
    for category, count in coverage.by_category.items():
        click.echo(f" {category}: {count}")
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@main.command(name="list")
@click.option(
    "--source",
    "source",
    default=None,
    type=click.Choice(SOURCE_KINDS),
    help="Restrict to one source kind.",
)
@click.option("--limit", type=int, default=20, show_default=True, help="Maximum events to show.")
@click.option(
    "--db",
    default=None,
    type=click.Path(dir_okay=False, path_type=Path),
    help="Database path. Defaults to ~/.cc-pushback/feedback.db.",
)
@coro
async def list_(source: SourceKind | None, limit: int, db: Path | None) -> None:
    """List recent feedback events, newest first."""
    db_path = db or FeedbackStore.default_path()
    async with await FeedbackStore.open(db_path) as store:
        events = await store.recent(source_kind=source, limit=limit)
    for event in events:
        kind = event["source_kind"]
        when = event["occurred_at"]
        # Cap each event's text at 200 characters so one event stays on one line-ish.
        snippet = str(event["text"])[:200]
        click.echo(f"[{kind}] {when} {snippet}")
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
@main.command()
@click.option(
    "--model",
    "tier",
    type=click.Choice(TIERS),
    default="medium",
    show_default=True,
    help="Judge model tier.",
)
@click.option("--limit", type=int, default=None, help="Judge at most this many rows this pass.")
@click.option("--concurrency", type=int, default=8, show_default=True, help="Maximum concurrent claude subshells.")
@click.option(
    "--refresh-summary",
    "refresh_summary",
    is_flag=True,
    help="Also re-judge rows whose verdict was recorded at summary fidelity.",
)
@click.option(
    "--db",
    default=None,
    type=click.Path(dir_okay=False, path_type=Path),
    help="Database path. Defaults to ~/.cc-pushback/feedback.db.",
)
@coro
async def triage(tier: TModel, limit: int | None, concurrency: int, refresh_summary: bool, db: Path | None) -> None:
    """Judge every stored candidate lacking a verdict at the current prompt version.

    Incremental and idempotent: verdicts persist per row as soon as each call
    completes, failed rows stay pending and are retried on the next run, and
    re-running over a fully judged corpus is a no-op. With ``--refresh-summary``,
    rows judged at summary fidelity are re-judged; a full-fidelity verdict
    replaces the summary one once the row's window hydrates again.
    """
    from cc_transcript.judge import resolved_model

    from cc_pushback.triage import JUDGE

    # Fail fast before touching the database if the judge cannot run at all.
    if not claude_available():
        raise click.ClickException("the claude CLI is not on PATH")

    db_path = db or FeedbackStore.default_path()
    async with await FeedbackStore.open(db_path) as store:
        backlog = await store.unjudged(
            role=JUDGE,
            prompt_version=PROMPT_VERSION,
            model=resolved_model(tier),
            refresh_summary=refresh_summary,
        )
        pending = len(backlog)
        # The cap is checked against the whole backlog, before --limit is applied.
        if pending > PENDING_CAP:
            raise click.ClickException(f"{pending} pending rows exceeds the {PENDING_CAP} safety cap — wrong DB?")
        click.echo(f"pending: {pending} rows at prompt v{PROMPT_VERSION} ({resolved_model(tier)})")
        outcome = await run_triage(
            store,
            tier=tier,
            limit=limit,
            concurrency=concurrency,
            refresh_summary=refresh_summary,
        )
    click.echo(f"judged {outcome.judged} rows ({outcome.failed} failed), {outcome.pending} pending")
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
@main.command()
@click.option("--accepts", type=int, default=60, show_default=True, help="Audit budget for judge-accepted rows.")
@click.option("--rejects", type=int, default=60, show_default=True, help="Audit budget for judge-rejected rows.")
@click.option("--seed", type=int, default=1, show_default=True, help="Deterministic sampling seed (iteration number).")
@click.option(
    "--model",
    "tier",
    type=click.Choice(TIERS),
    default="large",
    show_default=True,
    help="Auditor model tier.",
)
@click.option("--concurrency", type=int, default=8, show_default=True, help="Maximum concurrent claude subshells.")
@click.option(
    "--db",
    default=None,
    type=click.Path(dir_okay=False, path_type=Path),
    help="Database path. Defaults to ~/.cc-pushback/feedback.db.",
)
@coro
async def audit(tier: TModel, accepts: int, rejects: int, seed: int, concurrency: int, db: Path | None) -> None:
    """Audit a seeded stratified sample of the current prompt version's verdicts.

    The auditor is a stronger model, blind to the judge's verdicts; its labels are
    keyed independently of the judge's prompt version, so they accumulate across
    iterations and re-auditing a sampled row costs nothing.
    """
    # Fail fast before touching the database if the auditor cannot run at all.
    if not claude_available():
        raise click.ClickException("the claude CLI is not on PATH")

    db_path = db or FeedbackStore.default_path()
    async with await FeedbackStore.open(db_path) as store:
        outcome = await run_audit(
            store,
            accepts=accepts,
            rejects=rejects,
            seed=seed,
            tier=tier,
            concurrency=concurrency,
        )
    click.echo(f"audited {outcome.judged} fresh rows ({outcome.failed} failed)")
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
@main.command(name="eval")
@click.option("--seed", type=int, default=1, show_default=True, help="The seed the audit ran with.")
@click.option("--accepts", type=int, default=60, show_default=True, help="The audit's accept budget.")
@click.option("--rejects", type=int, default=60, show_default=True, help="The audit's reject budget.")
@click.option("--compare-to", type=int, default=None, help="Earlier prompt version for flip analysis.")
@click.option("--json", "as_json", is_flag=True, help="Emit the full metrics as JSON.")
@click.option(
    "--db",
    type=click.Path(dir_okay=False, path_type=Path),
    default=None,
    help="Database path. Defaults to ~/.cc-pushback/feedback.db.",
)
@coro
async def eval_(seed: int, accepts: int, rejects: int, compare_to: int | None, as_json: bool, db: Path | None) -> None:
    """Compute the mechanical metrics for the current prompt version. No LLM calls.

    Recomputes everything from raw verdicts: the golden-set gate, audited precision
    and reject contamination over the reproduced uniform core, the cumulative-pool
    secondary estimates, per-kind tables, and (with ``--compare-to``) verdict flips
    against an earlier prompt version.
    """
    # NOTE: the docstring above is the command's --help text; keep it user-facing.
    async with await FeedbackStore.open(db or FeedbackStore.default_path()) as store:
        metrics = await evaluate(store, seed=seed, accepts=accepts, rejects=rejects)
        # Test against None rather than truthiness: a falsy-but-valid version
        # such as ``--compare-to 0`` must still trigger the flip analysis.
        flips = (
            await flip_report(store, from_version=compare_to, to_version=PROMPT_VERSION)
            if compare_to is not None
            else None
        )

    if as_json:
        # Start from the dataclass dump, then overlay:
        #   * "golden" with each failure's ``expected`` rendered via golden_label;
        #   * the derived rates (presumably properties that asdict() does not
        #     emit; confirm against the metrics class);
        #   * the optional flip report.
        golden_failures = [
            dataclasses.asdict(failure) | {"expected": golden_label(failure.expected)}
            for failure in metrics.golden.failures
        ]
        payload = dataclasses.asdict(metrics) | {
            "golden": dataclasses.asdict(metrics.golden) | {"failures": golden_failures},
            "precision": metrics.precision,
            "contamination": metrics.contamination,
            "contamination_upper": metrics.contamination_upper,
            "recall_hat": metrics.recall_hat,
            # Same None test as the text path, so the two outputs agree on
            # whether a flip report exists.
            "flips": dataclasses.asdict(flips) if flips is not None else None,
        }
        click.echo(json.dumps(payload, indent=2))
        return

    # Headline: triage coverage; the share is omitted when nothing is judged
    # (also avoids dividing by zero).
    share = f" ({metrics.accepted / metrics.judged:.0%})" if metrics.judged else ""
    click.echo(
        f"prompt v{metrics.prompt_version}: judged {metrics.judged}/{metrics.total}, accepted {metrics.accepted}{share}"
    )

    # Golden-set gate, followed by one line per failing golden example.
    click.echo(f"golden: {metrics.golden.passed}/{metrics.golden.total} (sha256 {metrics.golden.sha256[:12]})")
    for failure in metrics.golden.failures:
        why = f" — {failure.rationale}" if failure.rationale else ""
        click.echo(
            f" FAIL expected {golden_label(failure.expected)}, got {failure.category}{why}: {failure.text[:120]}"
        )

    # Core-sample estimates; each ratio is printed only when it is defined.
    core_a, core_r = metrics.core_accepts, metrics.core_rejects
    click.echo(
        f"precision (core): {core_a.hits}/{core_a.audited}"
        + (f" = {p:.3f}" if (p := metrics.precision) is not None else "")
    )
    upper = f" (95% upper {u:.3f})" if (u := metrics.contamination_upper) is not None else ""
    click.echo(
        f"contamination (core): {core_r.hits}/{core_r.audited}"
        + (f" = {c:.3f}{upper}" if (c := metrics.contamination) is not None else "")
    )
    if (recall := metrics.recall_hat) is not None:
        click.echo(f"recall_hat: {recall:.3f}")

    # Cumulative-pool counts and the per-source-kind acceptance table.
    pool_a, pool_r = metrics.pool_accepts, metrics.pool_rejects
    click.echo(f"pool: accepts {pool_a.hits}/{pool_a.audited}, rejects {pool_r.hits}/{pool_r.audited}")
    for kind, (judged, accepted) in sorted(metrics.by_kind.items()):
        click.echo(f" {kind}: {accepted}/{judged} accepted")

    # Judge/auditor disagreements, with both rationales side by side.
    click.echo(f"disagreements: {len(metrics.disagreements)}")
    for item in metrics.disagreements:
        click.echo(
            f" [{item.source_kind}] judge={item.judge_category} ({item.judge_rationale}) "
            f"auditor={item.auditor_category} ({item.auditor_rationale}): {item.text[:120]}"
        )

    # Optional flip analysis against the earlier prompt version.
    if flips is not None:
        rate = f" ({r:.0%})" if (r := flips.rate) is not None else ""
        click.echo(f"flips vs v{compare_to}: {len(flips.flips)}/{flips.common}{rate}")
        for flip in flips.flips:
            click.echo(f" {flip.from_category} -> {flip.to_category}: {flip.text[:120]}")
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
@main.command()
@click.option(
    "--model",
    "tier",
    type=click.Choice(TIERS),
    default="medium",
    show_default=True,
    help="Refiner model tier.",
)
@click.option("--limit", type=int, default=None, help="Refine at most this many events this pass.")
@click.option("--concurrency", type=int, default=8, show_default=True, help="Maximum concurrent claude subshells.")
@click.option(
    "--db",
    default=None,
    type=click.Path(dir_okay=False, path_type=Path),
    help="Database path. Defaults to ~/.cc-pushback/feedback.db.",
)
@coro
async def refine(tier: TModel, limit: int | None, concurrency: int, db: Path | None) -> None:
    """Refine every accepted pushback event into atomic training pairs.

    Incremental and idempotent: pairs commit per event as soon as each call
    completes, failed events stay pending and are retried on the next run, and
    re-running over a fully refined corpus is a no-op.
    """
    from cc_transcript.judge import resolved_model

    from cc_pushback.refine import PROMPT_VERSION as REFINE_VERSION
    from cc_pushback.refine import refine as run_refine

    # Fail fast before touching the database if the refiner cannot run at all.
    if not claude_available():
        raise click.ClickException("the claude CLI is not on PATH")

    db_path = db or FeedbackStore.default_path()
    async with await FeedbackStore.open(db_path) as store:
        backlog = await store.unrefined(prompt_version=REFINE_VERSION, model=resolved_model(tier))
        pending = len(backlog)
        # The cap is checked against the whole backlog, before --limit is applied.
        if pending > PENDING_CAP:
            raise click.ClickException(f"{pending} pending events exceeds the {PENDING_CAP} safety cap — wrong DB?")
        click.echo(f"pending: {pending} events at refine v{REFINE_VERSION} ({resolved_model(tier)})")
        outcome = await run_refine(store, tier=tier, limit=limit, concurrency=concurrency)
    click.echo(
        f"refined {outcome.refined} events into {outcome.pairs} pairs ({outcome.failed} failed), {outcome.pending} pending"
    )
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
@main.command()
@click.option(
    "--model",
    "tier",
    type=click.Choice(TIERS),
    default="medium",
    show_default=True,
    help="Linking model tier.",
)
@click.option("--limit", type=int, default=None, help="Enrich at most this many pairs this pass.")
@click.option("--concurrency", type=int, default=8, show_default=True, help="Maximum concurrent claude subshells.")
@click.option(
    "--db",
    default=None,
    type=click.Path(dir_okay=False, path_type=Path),
    help="Database path. Defaults to ~/.cc-pushback/feedback.db.",
)
@coro
async def enrich(tier: TModel, limit: int | None, concurrency: int, db: Path | None) -> None:
    """Ground every refined pair in the code it complains about.

    Harvests candidate incorrect edits and their later corrections (from the
    session, or from git history) around each pair's pushback anchor, then has an
    LLM pick the one edit the complaint faults, copied verbatim. Expired
    transcripts and editless windows persist free ``no_code`` rows with no LLM
    call. Incremental and idempotent: evidence persists per pair as soon as each
    row resolves, failed pairs stay pending and are retried on the next run, and
    a refine re-run resurfaces its new pairs here automatically.
    """
    from cc_transcript.evidence import EXTRACTOR_VERSION
    from cc_transcript.judge import resolved_model

    from cc_pushback.enrich import ENRICH_VERSION
    from cc_pushback.enrich import enrich as run_enrich

    # Fail fast before touching the database if the linking model cannot run.
    if not claude_available():
        raise click.ClickException("the claude CLI is not on PATH")

    db_path = db or FeedbackStore.default_path()
    async with await FeedbackStore.open(db_path) as store:
        backlog = await store.unenriched(
            enrich_version=ENRICH_VERSION,
            enrich_model=resolved_model(tier),
            extractor_version=EXTRACTOR_VERSION,
        )
        pending = len(backlog)
        # The cap is checked against the whole backlog, before --limit is applied.
        if pending > PENDING_CAP:
            raise click.ClickException(f"{pending} pending pairs exceeds the {PENDING_CAP} safety cap — wrong DB?")
        click.echo(f"pending: {pending} pairs at enrich v{ENRICH_VERSION} ({resolved_model(tier)})")
        outcome = await run_enrich(store, tier=tier, limit=limit, concurrency=concurrency)
    click.echo(
        f"enriched {outcome.enriched} pairs ({outcome.code} code, {outcome.no_code} no_code, "
        f"{outcome.git} git-sourced, {outcome.failed} failed), {outcome.pending} pending"
    )
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
@main.command(name="migrate-corpus")
@click.option(
    "--db",
    default=None,
    type=click.Path(dir_okay=False, path_type=Path),
    help="Database path. Defaults to ~/.cc-pushback/feedback.db.",
)
@coro
async def migrate_corpus_(db: Path | None) -> None:
    """Convert a pre-2.0 corpus in place to the cc-transcript 2.0 shapes.

    One-time and idempotent: legacy ``context_json`` snapshots become
    ``cc-transcript.context/1`` documents (previews only, summary fidelity,
    ``origin='migrated'``), the ``event_uuid`` and ``triage.fidelity`` columns
    are added, and rows already in the new schema are skipped.
    """
    # Imported here rather than at module top, like the other command-local imports.
    from cc_pushback.migrate import migrate_corpus

    db_path = db or FeedbackStore.default_path()
    async with await FeedbackStore.open(db_path) as store:
        outcome = await migrate_corpus(store)
    click.echo(f"migrated {outcome.migrated} rows ({outcome.skipped} already current)")
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
@main.command()
@click.option("--jsonl", is_flag=True, help="Emit full pairs as JSON lines for fine-tuning export.")
@click.option(
    "--db",
    default=None,
    type=click.Path(dir_okay=False, path_type=Path),
    help="Database path. Defaults to ~/.cc-pushback/feedback.db.",
)
@coro
async def pairs(jsonl: bool, db: Path | None) -> None:
    """Print the refined training pairs — the pipeline's deliverable.

    Each pair is one atomic complaint: a faithful re-synthesis of what Claude did,
    the verbatim user excerpt, and the distilled one-sentence complaint.
    """
    db_path = db or FeedbackStore.default_path()
    async with await FeedbackStore.open(db_path) as store:
        records = await store.pairs()

    if jsonl:
        # Export mode: one JSON object per line, with a "project" key added from
        # the row's origin path (a missing/None path is passed as "").
        for record in records:
            project = project_label(str(record["origin_path"] or ""))
            click.echo(json.dumps(record | {"project": project}))
        return

    # Human-readable mode: one truncated summary line per pair.
    for record in records:
        category = record["category"]
        action = str(record["action"])[:80]
        complaint = str(record["complaint"])[:100]
        click.echo(f"[{category}] {action} -> {complaint}")
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
@main.command(name="view-samples")
@click.option(
    "--db",
    default=None,
    type=click.Path(dir_okay=False, path_type=Path),
    help="Database path. Defaults to ~/.cc-pushback/feedback.db.",
)
@click.option(
    "--llm/--no-llm",
    default=True,
    show_default=True,
    help="Summarize with the claude CLI when it is on PATH, else use heuristics.",
)
@click.option("--model", default="claude-sonnet-4-6", show_default=True, help="Model for the claude CLI summary.")
@click.option("--port", type=int, default=0, show_default=True, help="Port to serve on; 0 picks a free one.")
@click.option("--open", "open_", is_flag=True, help="Open the dashboard in a browser once serving.")
@coro
async def view_samples(db: Path | None, llm: bool, model: str, port: int, open_: bool) -> None:
    """Serve the training-pairs dashboard: refined pairs and their full lineage.

    Opens an interactive dashboard listing the refined pairs (the pipeline's
    deliverable) and every candidate behind them, with a detail pane that walks one
    candidate's lineage — detector hit, judge verdicts across versions, the auditor's
    agreement, the refiner's atomic split, and the golden gate. It is served over a
    transient HTTP server whose URL is printed; press Ctrl-C to stop. The corpus
    narrative is written by the ``claude`` CLI when ``--llm`` is set and ``claude`` is
    installed, falling back to deterministic heuristics.
    """
    db_path = db or FeedbackStore.default_path()
    # The store is handed to the app, so everything, including serving, stays
    # inside the ``async with`` and the store remains open while requests arrive.
    async with await FeedbackStore.open(db_path) as store:
        candidate_rows = await store.candidates()
        samples = list(map(Sample.from_row, candidate_rows))
        summary = await build_summary(samples, use_llm=llm, model=model)
        app = build_app(store, summary=summary)
        await serve(app, port=port, open_browser=open_)
|