haid 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- haid-0.0.1/PKG-INFO +144 -0
- haid-0.0.1/README.md +122 -0
- haid-0.0.1/pyproject.toml +39 -0
- haid-0.0.1/setup.cfg +4 -0
- haid-0.0.1/src/haid/__init__.py +9 -0
- haid-0.0.1/src/haid/__main__.py +4 -0
- haid-0.0.1/src/haid/bridge/__init__.py +172 -0
- haid-0.0.1/src/haid/bridge/reconstruct.py +222 -0
- haid-0.0.1/src/haid/bridge/usage.py +71 -0
- haid-0.0.1/src/haid/cli.py +612 -0
- haid-0.0.1/src/haid/data/anchor_diffs/U00.diff +378 -0
- haid-0.0.1/src/haid/data/anchor_diffs/U01.diff +317 -0
- haid-0.0.1/src/haid/data/anchor_diffs/U07.diff +218 -0
- haid-0.0.1/src/haid/data/anchor_diffs/U10.diff +129 -0
- haid-0.0.1/src/haid/data/anchor_diffs/U11.diff +352 -0
- haid-0.0.1/src/haid/data/anchor_diffs/U13.diff +135 -0
- haid-0.0.1/src/haid/data/anchor_diffs/U16.diff +152 -0
- haid-0.0.1/src/haid/data/anchor_diffs/U18.diff +254 -0
- haid-0.0.1/src/haid/data/anchor_diffs/U19.diff +403 -0
- haid-0.0.1/src/haid/data/anchor_diffs/U22.diff +144 -0
- haid-0.0.1/src/haid/data/anchor_diffs/U24.diff +337 -0
- haid-0.0.1/src/haid/data/anchor_diffs/U29.diff +43 -0
- haid-0.0.1/src/haid/data/anchor_diffs/U37.diff +38 -0
- haid-0.0.1/src/haid/data/anchor_diffs/U39.diff +94 -0
- haid-0.0.1/src/haid/data/anchor_diffs/U40.diff +339 -0
- haid-0.0.1/src/haid/data/anchor_diffs/U43.diff +51 -0
- haid-0.0.1/src/haid/data/anchor_diffs/U46.diff +159 -0
- haid-0.0.1/src/haid/data/anchor_diffs/U48.diff +290 -0
- haid-0.0.1/src/haid/data/anchor_diffs/U50.diff +323 -0
- haid-0.0.1/src/haid/data/cleanliness_anchors.json +282 -0
- haid-0.0.1/src/haid/data/difficulty_anchors.json +53 -0
- haid-0.0.1/src/haid/data/metric_baselines.json +184 -0
- haid-0.0.1/src/haid/data/treatments.json +356 -0
- haid-0.0.1/src/haid/diffio.py +139 -0
- haid-0.0.1/src/haid/episodes/__init__.py +110 -0
- haid-0.0.1/src/haid/episodes/grouping.py +112 -0
- haid-0.0.1/src/haid/episodes/model.py +77 -0
- haid-0.0.1/src/haid/episodes/score.py +188 -0
- haid-0.0.1/src/haid/episodes/segment.py +163 -0
- haid-0.0.1/src/haid/episodes/summarize.py +64 -0
- haid-0.0.1/src/haid/filekind.py +100 -0
- haid-0.0.1/src/haid/graph/__init__.py +19 -0
- haid-0.0.1/src/haid/graph/bash_read.py +229 -0
- haid-0.0.1/src/haid/graph/bash_write.py +201 -0
- haid-0.0.1/src/haid/graph/build.py +248 -0
- haid-0.0.1/src/haid/graph/model.py +130 -0
- haid-0.0.1/src/haid/graph/signature.py +49 -0
- haid-0.0.1/src/haid/intent/__init__.py +90 -0
- haid-0.0.1/src/haid/intent/classify.py +132 -0
- haid-0.0.1/src/haid/intent/messages.py +110 -0
- haid-0.0.1/src/haid/intent/taxonomy.py +100 -0
- haid-0.0.1/src/haid/metrics/__init__.py +68 -0
- haid-0.0.1/src/haid/metrics/base.py +112 -0
- haid-0.0.1/src/haid/metrics/baseline.py +64 -0
- haid-0.0.1/src/haid/metrics/json_out.py +171 -0
- haid-0.0.1/src/haid/metrics/rereads.py +136 -0
- haid-0.0.1/src/haid/metrics/retouched.py +75 -0
- haid-0.0.1/src/haid/metrics/retries.py +108 -0
- haid-0.0.1/src/haid/metrics/unused_context.py +68 -0
- haid-0.0.1/src/haid/metrics/view.py +114 -0
- haid-0.0.1/src/haid/report/__init__.py +21 -0
- haid-0.0.1/src/haid/report/benchmark.py +114 -0
- haid-0.0.1/src/haid/report/compose.py +419 -0
- haid-0.0.1/src/haid/report/treatments.py +107 -0
- haid-0.0.1/src/haid/scoring/__init__.py +13 -0
- haid-0.0.1/src/haid/scoring/anchors.py +70 -0
- haid-0.0.1/src/haid/scoring/compare.py +272 -0
- haid-0.0.1/src/haid/scoring/cost.py +230 -0
- haid-0.0.1/src/haid/scoring/placement.py +80 -0
- haid-0.0.1/src/haid/scoring/value.py +233 -0
- haid-0.0.1/src/haid/scoring/volume.py +84 -0
- haid-0.0.1/src/haid/session/__init__.py +28 -0
- haid-0.0.1/src/haid/session/cache.py +105 -0
- haid-0.0.1/src/haid/session/discover.py +56 -0
- haid-0.0.1/src/haid/session/forest.py +192 -0
- haid-0.0.1/src/haid/session/loader.py +96 -0
- haid-0.0.1/src/haid/session/overflow.py +81 -0
- haid-0.0.1/src/haid/session/parse.py +74 -0
- haid-0.0.1/src/haid/session/records.py +153 -0
- haid-0.0.1/src/haid/session/subagents.py +72 -0
- haid-0.0.1/src/haid/why/__init__.py +64 -0
- haid-0.0.1/src/haid/why/anchors.py +69 -0
- haid-0.0.1/src/haid/why/investigate.py +144 -0
- haid-0.0.1/src/haid/why/prompts.py +181 -0
- haid-0.0.1/src/haid/window.py +71 -0
- haid-0.0.1/src/haid.egg-info/PKG-INFO +144 -0
- haid-0.0.1/src/haid.egg-info/SOURCES.txt +88 -0
- haid-0.0.1/src/haid.egg-info/dependency_links.txt +1 -0
- haid-0.0.1/src/haid.egg-info/entry_points.txt +2 -0
- haid-0.0.1/src/haid.egg-info/top_level.txt +1 -0
haid-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: haid
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: How Am I Doing — local-only self-audit & coaching for Claude Code sessions
|
|
5
|
+
Author-email: dv-hart <jhart@datavine.us>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/dv-hart/haid
|
|
8
|
+
Project-URL: Repository, https://github.com/dv-hart/haid
|
|
9
|
+
Project-URL: Issues, https://github.com/dv-hart/haid/issues
|
|
10
|
+
Keywords: claude-code,self-audit,coaching,llm,agent,telemetry
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
# How Am I Doing (HAID)
|
|
24
|
+
|
|
25
|
+
*A self-audit and coaching layer for Claude Code sessions.*
|
|
26
|
+
|
|
27
|
+
HAID reads your own Claude Code session transcripts, builds a graph of what
|
|
28
|
+
happened, and produces annotated, **coaching-oriented** reports. The aim is less
|
|
29
|
+
"here is your bill" and more "here is where you and the agent diverged, why, and
|
|
30
|
+
what to change."
|
|
31
|
+
|
|
32
|
+
**Nothing leaves your machine** unless you explicitly choose to submit aggregate
|
|
33
|
+
metrics.
|
|
34
|
+
|
|
35
|
+
> Status: **Phase 1 complete; the full scoring stack now runs on real sessions** — the
|
|
36
|
+
> deterministic pipeline runs end to end on real transcripts (163 tests, stdlib-only, no model
|
|
37
|
+
> in the loop):
|
|
38
|
+
> - **Session parsing** (`src/haid/session/`) — forest-aware JSONL parsing: dedup,
|
|
39
|
+
> branch/rewind classification, subagent stitching, overflow resolution, SQLite cache.
|
|
40
|
+
> - **Session graph** (`src/haid/graph/`) — L0 spine + L1 action/IO graph
|
|
41
|
+
> (reads/produces/edits from `structuredPatch`), signatures, per-timeline scoping.
|
|
42
|
+
> - **Waste metrics** (`src/haid/metrics/`) — `rereads`, `retries`, `retouched`,
|
|
43
|
+
> `unused_context`: one rule each, run at **session and window scope**, as benchmarkable
|
|
44
|
+
> token-rates placed against a per-scope baseline.
|
|
45
|
+
> - **Analysis window** (`src/haid/window.py`) — the multi-session unit metrics run over
|
|
46
|
+
> (a project over a timeframe, default 30 days).
|
|
47
|
+
> - **Scoring** (`src/haid/scoring/`) — the relative achievement/cost value scorer
|
|
48
|
+
> (difficulty + cleanliness placement, volume, normalized-token cost, value combiner),
|
|
49
|
+
> calibration-validated. Built ahead of the earlier phases.
|
|
50
|
+
>
|
|
51
|
+
> - **`haid metrics`** (`src/haid/metrics/{json_out,view}.py` + CLI) — the measured substrate:
|
|
52
|
+
> four waste metrics at **session and window scope**, each placed against a per-scope
|
|
53
|
+
> baseline, as a Markdown inspection view + a JSON hand-off to the later "why" passes.
|
|
54
|
+
> - **Bridge** (`src/haid/bridge/`) — reconstructs an analysis window's net code diff from the
|
|
55
|
+
> **transcript alone** (replay, no git) plus its normalized-token cost, so `haid bridge` and
|
|
56
|
+
> `haid value --project/--session` now run the **full scoring stack on real sessions**
|
|
57
|
+
> (previously the scorer only ran on supplied/calibration diffs).
|
|
58
|
+
>
|
|
59
|
+
> All validated on real transcripts (163 tests). The user-facing **report and visualization are
|
|
60
|
+
> the final product**, composing this substrate with the Phase-2/3 why-analysis and the value
|
|
61
|
+
> score. Next: the diagnosis router, episode segmentation (Phase 2), and the visualization
|
|
62
|
+
> (Phase 1.5). See [plans/roadmap.md](plans/roadmap.md) and [plans/phase1-build.md](plans/phase1-build.md).
|
|
63
|
+
|
|
64
|
+
## What this is not
|
|
65
|
+
|
|
66
|
+
Not another token counter. Raw usage accounting is already well covered
|
|
67
|
+
([ccusage](https://github.com/ryoppippi/ccusage) and similar). The entire value
|
|
68
|
+
lives one layer up, in **diagnosis and coaching** — telling you not what you
|
|
69
|
+
spent but how to get better. A tool that confidently misdiagnoses is worse than
|
|
70
|
+
nothing, because people act on it, so trustworthiness of the diagnosis is the
|
|
71
|
+
central design constraint throughout. See
|
|
72
|
+
[docs/trust-discipline.md](docs/trust-discipline.md).
|
|
73
|
+
|
|
74
|
+
## The one big idea: the session graph
|
|
75
|
+
|
|
76
|
+
Underneath everything is one data structure: a graph of the session(s). Turns
|
|
77
|
+
and tool-calls are nodes; edges capture *responds-to*, *reads*, and *produces*
|
|
78
|
+
relationships. The two headline features are just two operations on this one
|
|
79
|
+
graph:
|
|
80
|
+
|
|
81
|
+
- **"Why did you do X?"** → a backwards traversal from X to its trigger.
|
|
82
|
+
- **"Where did the tokens go?"** → a weighting over the same nodes.
|
|
83
|
+
|
|
84
|
+
Build the graph once; get both views from it. Design in
|
|
85
|
+
[docs/session-graph-design.md](docs/session-graph-design.md).
|
|
86
|
+
|
|
87
|
+
## Two orthogonal analysis passes
|
|
88
|
+
|
|
89
|
+
1. **User-anchored pass** — catches *misalignment*. Works backwards from user
|
|
90
|
+
messages; **corrections are ground truth** ("no, I meant…", "that's wrong").
|
|
91
|
+
2. **Signature-scanning pass** — catches *silent inefficiency*. Scans for
|
|
92
|
+
objective, reasoning-free waste signatures (redundant re-reads, retry loops,
|
|
93
|
+
re-touched lines, unused context).
|
|
94
|
+
|
|
95
|
+
The two are orthogonal: one finds where the agent did the *wrong thing*, the
|
|
96
|
+
other where it did the *right thing wastefully*. See
|
|
97
|
+
[docs/architecture.md](docs/architecture.md).
|
|
98
|
+
|
|
99
|
+
## Documentation map
|
|
100
|
+
|
|
101
|
+
| Doc | What's in it |
|
|
102
|
+
|-----|--------------|
|
|
103
|
+
| [docs/vision.md](docs/vision.md) | The full concept, goals, and the canonical test case |
|
|
104
|
+
| [docs/architecture.md](docs/architecture.md) | The two-pass method and how the pieces fit |
|
|
105
|
+
| [docs/session-graph-design.md](docs/session-graph-design.md) | Node/edge taxonomy, episodes, the two core operations |
|
|
106
|
+
| [docs/detectors.md](docs/detectors.md) | Detector catalog + waste metrics as graph queries |
|
|
107
|
+
| [docs/intent-taxonomy.md](docs/intent-taxonomy.md) | Two-axis message classification + purpose timeline + drift |
|
|
108
|
+
| [docs/scoring-rubric.md](docs/scoring-rubric.md) | Achievement vs. cost — the **relative** value verdict (revised; see ladder/playbook) |
|
|
109
|
+
| [docs/difficulty-ladder.md](docs/difficulty-ladder.md) | The validated difficulty scorer (reference ladder + placement) |
|
|
110
|
+
| [docs/cleanliness-ladder.md](docs/cleanliness-ladder.md) | The cleanliness/parsimony scorer (reference ladder + placement) |
|
|
111
|
+
| [docs/axis-calibration-playbook.md](docs/axis-calibration-playbook.md) | Self-contained recipe to calibrate a new scoring axis (worked example: cleanliness; originality calibrated then dropped) |
|
|
112
|
+
| [docs/calibration-pilot-1.md](docs/calibration-pilot-1.md) | Pilot report: why mined review-signals don't validate difficulty |
|
|
113
|
+
| [docs/visualization.md](docs/visualization.md) | The time-layered bus diagram (left-in/right-out, bundled) |
|
|
114
|
+
| [docs/claude-code-data-format.md](docs/claude-code-data-format.md) | **Verified** Claude Code on-disk data reference |
|
|
115
|
+
| [docs/data-inventory.md](docs/data-inventory.md) | Field catalog from 38 real sessions: what's auto-taggable vs. inferred |
|
|
116
|
+
| [docs/data-structure-report.md](docs/data-structure-report.md) | Real annotated records → the graph they produce (Tier 1 & Tier 2 walkthrough) |
|
|
117
|
+
| [docs/trust-discipline.md](docs/trust-discipline.md) | Cite-or-unknown, hedging, no-traceable-origin |
|
|
118
|
+
| [docs/tooling-landscape.md](docs/tooling-landscape.md) | Existing tools and what to build on |
|
|
119
|
+
| [docs/decisions/](docs/decisions/) | Architecture Decision Records (ADRs) |
|
|
120
|
+
| [plans/roadmap.md](plans/roadmap.md) | Phased delivery plan |
|
|
121
|
+
| [plans/mvp.md](plans/mvp.md) | The minimum thing that tests the core risk |
|
|
122
|
+
| [plans/phase1-build.md](plans/phase1-build.md) | The concrete Phase-1 build sequence + progress |
|
|
123
|
+
| [plans/open-questions.md](plans/open-questions.md) | Decisions to make / behaviors to verify early |
|
|
124
|
+
|
|
125
|
+
## Repository layout
|
|
126
|
+
|
|
127
|
+
```
|
|
128
|
+
HAID/
|
|
129
|
+
├── README.md # you are here
|
|
130
|
+
├── docs/ # design & reference documentation
|
|
131
|
+
│ └── decisions/ # ADRs
|
|
132
|
+
├── plans/ # roadmap, MVP spec, open questions
|
|
133
|
+
├── research/ # raw research reports (inputs to the docs)
|
|
134
|
+
├── calibration/ # the scoring-axis calibration harness (experiment code)
|
|
135
|
+
├── src/haid/ # implementation
|
|
136
|
+
│ ├── session/ # Phase-1 parse: forest model, subagents, overflow, cache
|
|
137
|
+
│ ├── graph/ # L0 spine + L1 IO graph (incl. Bash read/write parsing)
|
|
138
|
+
│ ├── metrics/ # the four waste metrics + baseline + `haid metrics` emitter
|
|
139
|
+
│ ├── window.py # the multi-session analysis window
|
|
140
|
+
│ ├── scoring/ # relative value scorer (difficulty/cleanliness/volume/cost/value)
|
|
141
|
+
│ └── bridge/ # transcript→(diff, usage) reconstruction (the bridge)
|
|
142
|
+
├── tests/ # session/ graph/ metrics/ scoring/ bridge/ suites (+ fixtures/)
|
|
143
|
+
└── scripts/ # build_metric_baselines.py + CLI helpers
|
|
144
|
+
```
|
haid-0.0.1/README.md
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# How Am I Doing (HAID)
|
|
2
|
+
|
|
3
|
+
*A self-audit and coaching layer for Claude Code sessions.*
|
|
4
|
+
|
|
5
|
+
HAID reads your own Claude Code session transcripts, builds a graph of what
|
|
6
|
+
happened, and produces annotated, **coaching-oriented** reports. The aim is less
|
|
7
|
+
"here is your bill" and more "here is where you and the agent diverged, why, and
|
|
8
|
+
what to change."
|
|
9
|
+
|
|
10
|
+
**Nothing leaves your machine** unless you explicitly choose to submit aggregate
|
|
11
|
+
metrics.
|
|
12
|
+
|
|
13
|
+
> Status: **Phase 1 complete; the full scoring stack now runs on real sessions** — the
|
|
14
|
+
> deterministic pipeline runs end to end on real transcripts (163 tests, stdlib-only, no model
|
|
15
|
+
> in the loop):
|
|
16
|
+
> - **Session parsing** (`src/haid/session/`) — forest-aware JSONL parsing: dedup,
|
|
17
|
+
> branch/rewind classification, subagent stitching, overflow resolution, SQLite cache.
|
|
18
|
+
> - **Session graph** (`src/haid/graph/`) — L0 spine + L1 action/IO graph
|
|
19
|
+
> (reads/produces/edits from `structuredPatch`), signatures, per-timeline scoping.
|
|
20
|
+
> - **Waste metrics** (`src/haid/metrics/`) — `rereads`, `retries`, `retouched`,
|
|
21
|
+
> `unused_context`: one rule each, run at **session and window scope**, as benchmarkable
|
|
22
|
+
> token-rates placed against a per-scope baseline.
|
|
23
|
+
> - **Analysis window** (`src/haid/window.py`) — the multi-session unit metrics run over
|
|
24
|
+
> (a project over a timeframe, default 30 days).
|
|
25
|
+
> - **Scoring** (`src/haid/scoring/`) — the relative achievement/cost value scorer
|
|
26
|
+
> (difficulty + cleanliness placement, volume, normalized-token cost, value combiner),
|
|
27
|
+
> calibration-validated. Built ahead of the earlier phases.
|
|
28
|
+
>
|
|
29
|
+
> - **`haid metrics`** (`src/haid/metrics/{json_out,view}.py` + CLI) — the measured substrate:
|
|
30
|
+
> four waste metrics at **session and window scope**, each placed against a per-scope
|
|
31
|
+
> baseline, as a Markdown inspection view + a JSON hand-off to the later "why" passes.
|
|
32
|
+
> - **Bridge** (`src/haid/bridge/`) — reconstructs an analysis window's net code diff from the
|
|
33
|
+
> **transcript alone** (replay, no git) plus its normalized-token cost, so `haid bridge` and
|
|
34
|
+
> `haid value --project/--session` now run the **full scoring stack on real sessions**
|
|
35
|
+
> (previously the scorer only ran on supplied/calibration diffs).
|
|
36
|
+
>
|
|
37
|
+
> All validated on real transcripts (163 tests). The user-facing **report and visualization are
|
|
38
|
+
> the final product**, composing this substrate with the Phase-2/3 why-analysis and the value
|
|
39
|
+
> score. Next: the diagnosis router, episode segmentation (Phase 2), and the visualization
|
|
40
|
+
> (Phase 1.5). See [plans/roadmap.md](plans/roadmap.md) and [plans/phase1-build.md](plans/phase1-build.md).
|
|
41
|
+
|
|
42
|
+
## What this is not
|
|
43
|
+
|
|
44
|
+
Not another token counter. Raw usage accounting is already well covered
|
|
45
|
+
([ccusage](https://github.com/ryoppippi/ccusage) and similar). The entire value
|
|
46
|
+
lives one layer up, in **diagnosis and coaching** — telling you not what you
|
|
47
|
+
spent but how to get better. A tool that confidently misdiagnoses is worse than
|
|
48
|
+
nothing, because people act on it, so trustworthiness of the diagnosis is the
|
|
49
|
+
central design constraint throughout. See
|
|
50
|
+
[docs/trust-discipline.md](docs/trust-discipline.md).
|
|
51
|
+
|
|
52
|
+
## The one big idea: the session graph
|
|
53
|
+
|
|
54
|
+
Underneath everything is one data structure: a graph of the session(s). Turns
|
|
55
|
+
and tool-calls are nodes; edges capture *responds-to*, *reads*, and *produces*
|
|
56
|
+
relationships. The two headline features are just two operations on this one
|
|
57
|
+
graph:
|
|
58
|
+
|
|
59
|
+
- **"Why did you do X?"** → a backwards traversal from X to its trigger.
|
|
60
|
+
- **"Where did the tokens go?"** → a weighting over the same nodes.
|
|
61
|
+
|
|
62
|
+
Build the graph once; get both views from it. Design in
|
|
63
|
+
[docs/session-graph-design.md](docs/session-graph-design.md).
|
|
64
|
+
|
|
65
|
+
## Two orthogonal analysis passes
|
|
66
|
+
|
|
67
|
+
1. **User-anchored pass** — catches *misalignment*. Works backwards from user
|
|
68
|
+
messages; **corrections are ground truth** ("no, I meant…", "that's wrong").
|
|
69
|
+
2. **Signature-scanning pass** — catches *silent inefficiency*. Scans for
|
|
70
|
+
objective, reasoning-free waste signatures (redundant re-reads, retry loops,
|
|
71
|
+
re-touched lines, unused context).
|
|
72
|
+
|
|
73
|
+
The two are orthogonal: one finds where the agent did the *wrong thing*, the
|
|
74
|
+
other where it did the *right thing wastefully*. See
|
|
75
|
+
[docs/architecture.md](docs/architecture.md).
|
|
76
|
+
|
|
77
|
+
## Documentation map
|
|
78
|
+
|
|
79
|
+
| Doc | What's in it |
|
|
80
|
+
|-----|--------------|
|
|
81
|
+
| [docs/vision.md](docs/vision.md) | The full concept, goals, and the canonical test case |
|
|
82
|
+
| [docs/architecture.md](docs/architecture.md) | The two-pass method and how the pieces fit |
|
|
83
|
+
| [docs/session-graph-design.md](docs/session-graph-design.md) | Node/edge taxonomy, episodes, the two core operations |
|
|
84
|
+
| [docs/detectors.md](docs/detectors.md) | Detector catalog + waste metrics as graph queries |
|
|
85
|
+
| [docs/intent-taxonomy.md](docs/intent-taxonomy.md) | Two-axis message classification + purpose timeline + drift |
|
|
86
|
+
| [docs/scoring-rubric.md](docs/scoring-rubric.md) | Achievement vs. cost — the **relative** value verdict (revised; see ladder/playbook) |
|
|
87
|
+
| [docs/difficulty-ladder.md](docs/difficulty-ladder.md) | The validated difficulty scorer (reference ladder + placement) |
|
|
88
|
+
| [docs/cleanliness-ladder.md](docs/cleanliness-ladder.md) | The cleanliness/parsimony scorer (reference ladder + placement) |
|
|
89
|
+
| [docs/axis-calibration-playbook.md](docs/axis-calibration-playbook.md) | Self-contained recipe to calibrate a new scoring axis (worked example: cleanliness; originality calibrated then dropped) |
|
|
90
|
+
| [docs/calibration-pilot-1.md](docs/calibration-pilot-1.md) | Pilot report: why mined review-signals don't validate difficulty |
|
|
91
|
+
| [docs/visualization.md](docs/visualization.md) | The time-layered bus diagram (left-in/right-out, bundled) |
|
|
92
|
+
| [docs/claude-code-data-format.md](docs/claude-code-data-format.md) | **Verified** Claude Code on-disk data reference |
|
|
93
|
+
| [docs/data-inventory.md](docs/data-inventory.md) | Field catalog from 38 real sessions: what's auto-taggable vs. inferred |
|
|
94
|
+
| [docs/data-structure-report.md](docs/data-structure-report.md) | Real annotated records → the graph they produce (Tier 1 & Tier 2 walkthrough) |
|
|
95
|
+
| [docs/trust-discipline.md](docs/trust-discipline.md) | Cite-or-unknown, hedging, no-traceable-origin |
|
|
96
|
+
| [docs/tooling-landscape.md](docs/tooling-landscape.md) | Existing tools and what to build on |
|
|
97
|
+
| [docs/decisions/](docs/decisions/) | Architecture Decision Records (ADRs) |
|
|
98
|
+
| [plans/roadmap.md](plans/roadmap.md) | Phased delivery plan |
|
|
99
|
+
| [plans/mvp.md](plans/mvp.md) | The minimum thing that tests the core risk |
|
|
100
|
+
| [plans/phase1-build.md](plans/phase1-build.md) | The concrete Phase-1 build sequence + progress |
|
|
101
|
+
| [plans/open-questions.md](plans/open-questions.md) | Decisions to make / behaviors to verify early |
|
|
102
|
+
|
|
103
|
+
## Repository layout
|
|
104
|
+
|
|
105
|
+
```
|
|
106
|
+
HAID/
|
|
107
|
+
├── README.md # you are here
|
|
108
|
+
├── docs/ # design & reference documentation
|
|
109
|
+
│ └── decisions/ # ADRs
|
|
110
|
+
├── plans/ # roadmap, MVP spec, open questions
|
|
111
|
+
├── research/ # raw research reports (inputs to the docs)
|
|
112
|
+
├── calibration/ # the scoring-axis calibration harness (experiment code)
|
|
113
|
+
├── src/haid/ # implementation
|
|
114
|
+
│ ├── session/ # Phase-1 parse: forest model, subagents, overflow, cache
|
|
115
|
+
│ ├── graph/ # L0 spine + L1 IO graph (incl. Bash read/write parsing)
|
|
116
|
+
│ ├── metrics/ # the four waste metrics + baseline + `haid metrics` emitter
|
|
117
|
+
│ ├── window.py # the multi-session analysis window
|
|
118
|
+
│ ├── scoring/ # relative value scorer (difficulty/cleanliness/volume/cost/value)
|
|
119
|
+
│ └── bridge/ # transcript→(diff, usage) reconstruction (the bridge)
|
|
120
|
+
├── tests/ # session/ graph/ metrics/ scoring/ bridge/ suites (+ fixtures/)
|
|
121
|
+
└── scripts/ # build_metric_baselines.py + CLI helpers
|
|
122
|
+
```
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "haid"
|
|
7
|
+
version = "0.0.1"
|
|
8
|
+
description = "How Am I Doing — local-only self-audit & coaching for Claude Code sessions"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "dv-hart", email = "jhart@datavine.us" }]
|
|
13
|
+
keywords = ["claude-code", "self-audit", "coaching", "llm", "agent", "telemetry"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.10",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Programming Language :: Python :: 3.13",
|
|
23
|
+
"Topic :: Software Development :: Quality Assurance",
|
|
24
|
+
]
|
|
25
|
+
dependencies = [] # stdlib only — model judgment is delegated to the host agent (see scoring/compare.py)
|
|
26
|
+
|
|
27
|
+
[project.urls]
|
|
28
|
+
Homepage = "https://github.com/dv-hart/haid"
|
|
29
|
+
Repository = "https://github.com/dv-hart/haid"
|
|
30
|
+
Issues = "https://github.com/dv-hart/haid/issues"
|
|
31
|
+
|
|
32
|
+
[project.scripts]
|
|
33
|
+
haid = "haid.cli:main"
|
|
34
|
+
|
|
35
|
+
[tool.setuptools.packages.find]
|
|
36
|
+
where = ["src"]
|
|
37
|
+
|
|
38
|
+
[tool.setuptools.package-data]
|
|
39
|
+
haid = ["data/*.json", "data/anchor_diffs/*.diff"]
|
haid-0.0.1/setup.cfg
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""HAID — "How Am I Doing": local-only self-audit & coaching for Claude Code sessions.
|
|
2
|
+
|
|
3
|
+
This package is the product code (stdlib only). The scoring subpackage places a session
|
|
4
|
+
diff against fixed reference ladders to produce relative achievement scores; the model
|
|
5
|
+
judgment those placements need is delegated to the host agent (Claude Code subagents),
|
|
6
|
+
never an in-process API call — see haid.scoring.compare.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
__version__ = "0.0.1"
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""The bridge: window → (diff, usage) — the join between the real-session pipeline and the
|
|
2
|
+
scoring stack.
|
|
3
|
+
|
|
4
|
+
The scorer (volume / difficulty / cleanliness / value) was built and validated against
|
|
5
|
+
calibration diffs; the session pipeline (session → graph → metrics) ingests real transcripts.
|
|
6
|
+
This package connects them: given an analysis window it produces the two inputs the scorer
|
|
7
|
+
needs — a reconstructed unified **diff** and a normalized-token **cost** — so `haid value` runs
|
|
8
|
+
on real work.
|
|
9
|
+
|
|
10
|
+
Design (recorded in the project notes, decided after measuring the gap):
|
|
11
|
+
- **Replay-primary, no git.** The diff is reconstructed from the transcript (see
|
|
12
|
+
`reconstruct`). The bash-write-to-source gap was measured at ~0–1% on real projects; what
|
|
13
|
+
little it misses is detected and FLAGGED, never silently dropped.
|
|
14
|
+
- **Grain-agnostic core.** `window_inputs` slices the whole window; the same engine will slice
|
|
15
|
+
by episode once episodes exist (Phase 2 — episode↔PR alignment is explicitly TBD, not v1).
|
|
16
|
+
|
|
17
|
+
Stdlib only; no model.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import re
|
|
23
|
+
from dataclasses import dataclass, field
|
|
24
|
+
|
|
25
|
+
from .reconstruct import FileRecon, ReconResult, reconstruct
|
|
26
|
+
from .usage import extract_cost
|
|
27
|
+
|
|
28
|
+
__all__ = ["BridgeResult", "window_inputs", "episode_inputs", "reconstruct", "extract_cost",
|
|
29
|
+
"FileRecon", "ReconResult"]
|
|
30
|
+
|
|
31
|
+
_ABS = re.compile(r"^(?:/|[A-Za-z]:[\\/]|\\\\)") # posix root, drive letter, or UNC
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _is_external(file_id: str) -> bool:
|
|
35
|
+
"""A file id that isn't repo-relative — temp files, other repos, /etc — is not part of the
|
|
36
|
+
project work product and must not enter the scored diff. (build.py makes ids repo-relative
|
|
37
|
+
only when the path is under the session cwd; everything else stays absolute.)"""
|
|
38
|
+
return bool(_ABS.match(file_id))
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class BridgeResult:
    """The two scorer inputs (diff + cost) plus the per-file detail and caveats behind them."""

    diff: str  # reconstructed unified diff (scorer input)
    cost: object  # cost.CostResult (scorer denominator)
    files: list = field(default_factory=list)  # per-file FileRecon (kept for inspection)
    caveats: list = field(default_factory=list)  # honesty surface — no silent gaps

    def summary(self) -> str:
        """Render a short multi-line, human-readable summary.

        Line one counts the files whose `changed` flag is set and the files whose
        `complete` flag is not; line two is whatever `self.cost.summary()` returns; any
        caveats follow under a "caveats:" heading, one per line.
        """
        n_changed = sum(1 for entry in self.files if entry.changed)
        n_incomplete = sum(1 for entry in self.files if not entry.complete)
        headline = (f"bridge: {n_changed} changed file(s) reconstructed, "
                    f"{n_incomplete} flagged incomplete")
        parts = [headline, self.cost.summary()]
        if self.caveats:
            parts.append("caveats:")
            for caveat in self.caveats:
                parts.append(f" {caveat}")
        return "\n".join(parts)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def window_inputs(view, sessions) -> BridgeResult:
    """Produce the scorer inputs (diff, cost) covering an entire analysis window.

    `view` is a metrics.WindowView; its `active_stream` yields the active-branch tool calls
    in order. `sessions` are the loaded Session objects, used for token usage and for the
    edit content attached to each tool result.

    Only write calls that name a target file are considered. Writes whose target id is
    absolute (outside the project tree) are counted and reported as a caveat rather than
    entering the diff; subagent writes are likewise counted and reported, not folded in.
    """
    from ..graph.model import is_write

    results_by_call = _tur_index(sessions)
    in_tree_writes = []
    external_count = 0
    for _session_id, call in view.active_stream:
        if not is_write(call):
            continue
        target = call.target_file_id
        if not target:
            continue
        if _is_external(target):
            external_count += 1
        else:
            result = results_by_call.get(call.id, {})
            in_tree_writes.append(
                (target, call.tool, result, call.write_op, call.write_content,
                 call.derived_write))

    recon = reconstruct(in_tree_writes, baselines=_baselines(sessions))
    recon.excluded_external = external_count

    # Start from the reconstruction's own caveats, then add the bridge-level ones.
    notes = [*recon.caveats]
    if external_count:
        notes.append(f"{external_count} write(s) to files outside the project tree "
                     "(temp / other repos) excluded from the diff")
    subagent_total = _subagent_write_count(sessions)
    if subagent_total:
        notes.append(f"{subagent_total} subagent file-write call(s) are not yet folded into "
                     "the diff (subagent edit stitching is deferred)")

    return BridgeResult(diff=recon.diff, cost=extract_cost(sessions),
                        files=recon.files, caveats=notes)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def episode_inputs(episode_sessions) -> BridgeResult:
    """Produce the scorer inputs (diff, cost) for a single episode.

    An episode is a subset of *whole sessions* (grain decision 2026-06-08), so no separate
    slicing engine is needed: a view is built over just those sessions and handed to
    `window_inputs`. Two properties follow from restricting the session set:
    - **episode-relative diff baseline**: `_baselines` picks the earliest captured
      `originalFile` among these sessions only, i.e. each file's state as it ENTERED the
      episode (after any earlier episode touched it), so the diff is the episode's own delta;
    - **clean cost**: `extract_cost` sums these sessions' per-context-window costs, with no
      entangled sub-session token split (the reason the session is the atomic floor).
    """
    from ..window import build_view

    return window_inputs(build_view(episode_sessions), episode_sessions)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _tur_index(sessions) -> dict:
|
|
116
|
+
"""tool_use id -> toolUseResult dict, across main + subagent records of every session.
|
|
117
|
+
|
|
118
|
+
Pairing key is the tool_use_id inside the result's tool_result block (verified on real
|
|
119
|
+
data — there is no top-level sourceToolUseID)."""
|
|
120
|
+
out: dict[str, dict] = {}
|
|
121
|
+
for s in sessions:
|
|
122
|
+
recs = list(s.parse.records) + [r for sa in s.subagents for r in sa.parse.records]
|
|
123
|
+
for r in recs:
|
|
124
|
+
tur = r.raw.get("toolUseResult")
|
|
125
|
+
if not isinstance(tur, dict):
|
|
126
|
+
continue
|
|
127
|
+
c = r.content
|
|
128
|
+
if not isinstance(c, list):
|
|
129
|
+
continue
|
|
130
|
+
for b in c:
|
|
131
|
+
if isinstance(b, dict) and b.get("type") == "tool_result" and b.get("tool_use_id"):
|
|
132
|
+
out[b["tool_use_id"]] = tur
|
|
133
|
+
break
|
|
134
|
+
return out
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _baselines(sessions) -> dict:
    """Map file_id to the file's content as it ENTERED the window.

    The value is the first captured `originalFile` seen for that file when sessions are
    visited in order of their earliest main-record timestamp, scanning each session's main
    records and then its subagent records (any branch).

    Claude Code omits originalFile on some edits (e.g. large files), so the first edit in
    the active stream may lack it even though an earlier touch captured it. Taking the
    earliest capture window-wide gives buffer-mode reconstruction a correct seed; files
    with no capture at all fall back to hunks mode in reconstruct().
    """
    from ..graph.build import _file_id

    def first_timestamp(session):
        # Sessions with no timestamped record sort first via the "" default.
        stamps = (rec.timestamp for rec in session.parse.records if rec.timestamp)
        return min(stamps, default="")

    seeds: dict[str, str] = {}
    for session in sorted(sessions, key=first_timestamp):
        # The first non-empty cwd on the main records is used for the whole session,
        # subagent records included.
        cwd = next(
            (rec.raw.get("cwd") for rec in session.parse.records if rec.raw.get("cwd")),
            None,
        )
        records = list(session.parse.records)
        for agent in session.subagents:
            records.extend(agent.parse.records)
        for rec in records:
            result = rec.raw.get("toolUseResult")
            if not isinstance(result, dict) or result.get("originalFile") is None:
                continue
            path = result.get("filePath") or (result.get("file") or {}).get("filePath")
            file_id = _file_id(path, cwd)
            # First capture wins; later captures reflect already-edited content.
            if file_id and file_id not in seeds:
                seeds[file_id] = result["originalFile"]
    return seeds
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _subagent_write_count(sessions) -> int:
    """Count subagent write calls that target a non-external file id.

    A graph is built from each subagent's records; a tool call is counted when it is a
    write, names a target file id, and that id is not absolute.
    """
    from ..graph.model import is_write
    from ..graph.build import build_graph

    total = 0
    for session in sessions:
        for agent in session.subagents:
            graph = build_graph(agent.parse.records)
            for call in graph.toolcalls.values():
                if (is_write(call) and call.target_file_id
                        and not _is_external(call.target_file_id)):
                    total += 1
    return total
|