@vyuhlabs/dxkit 2.12.0 → 2.13.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/CHANGELOG.md +90 -0
  2. package/README.md +246 -287
  3. package/dist/allowlist/hint.d.ts +1 -1
  4. package/dist/allowlist/hint.d.ts.map +1 -1
  5. package/dist/allowlist/hint.js +6 -3
  6. package/dist/allowlist/hint.js.map +1 -1
  7. package/dist/baseline/check.d.ts +7 -0
  8. package/dist/baseline/check.d.ts.map +1 -1
  9. package/dist/baseline/check.js +3 -1
  10. package/dist/baseline/check.js.map +1 -1
  11. package/dist/cli.d.ts.map +1 -1
  12. package/dist/cli.js +101 -14
  13. package/dist/cli.js.map +1 -1
  14. package/dist/dashboard/graph-tab.d.ts.map +1 -1
  15. package/dist/dashboard/graph-tab.js +6 -3
  16. package/dist/dashboard/graph-tab.js.map +1 -1
  17. package/dist/doctor.d.ts.map +1 -1
  18. package/dist/doctor.js +13 -12
  19. package/dist/doctor.js.map +1 -1
  20. package/dist/generator.d.ts.map +1 -1
  21. package/dist/generator.js +8 -2
  22. package/dist/generator.js.map +1 -1
  23. package/dist/issue-cli.d.ts +1 -1
  24. package/dist/issue-cli.js +1 -1
  25. package/dist/loop/demo.d.ts +12 -0
  26. package/dist/loop/demo.d.ts.map +1 -0
  27. package/dist/loop/demo.js +331 -0
  28. package/dist/loop/demo.js.map +1 -0
  29. package/dist/loop/doctor.d.ts +37 -0
  30. package/dist/loop/doctor.d.ts.map +1 -0
  31. package/dist/loop/doctor.js +320 -0
  32. package/dist/loop/doctor.js.map +1 -0
  33. package/dist/loop/ledger-cli.d.ts +7 -0
  34. package/dist/loop/ledger-cli.d.ts.map +1 -0
  35. package/dist/loop/ledger-cli.js +95 -0
  36. package/dist/loop/ledger-cli.js.map +1 -0
  37. package/dist/loop/ledger.d.ts +95 -0
  38. package/dist/loop/ledger.d.ts.map +1 -0
  39. package/dist/loop/ledger.js +201 -0
  40. package/dist/loop/ledger.js.map +1 -0
  41. package/dist/loop/policy.d.ts +35 -0
  42. package/dist/loop/policy.d.ts.map +1 -0
  43. package/dist/loop/policy.js +151 -0
  44. package/dist/loop/policy.js.map +1 -0
  45. package/dist/loop/scaffold.d.ts +28 -0
  46. package/dist/loop/scaffold.d.ts.map +1 -0
  47. package/dist/loop/scaffold.js +224 -0
  48. package/dist/loop/scaffold.js.map +1 -0
  49. package/dist/loop/stop-gate.d.ts +71 -0
  50. package/dist/loop/stop-gate.d.ts.map +1 -0
  51. package/dist/loop/stop-gate.js +295 -0
  52. package/dist/loop/stop-gate.js.map +1 -0
  53. package/dist/self-invocation.d.ts +77 -0
  54. package/dist/self-invocation.d.ts.map +1 -0
  55. package/dist/self-invocation.js +157 -0
  56. package/dist/self-invocation.js.map +1 -0
  57. package/dist/ship-installers.d.ts.map +1 -1
  58. package/dist/ship-installers.js +8 -0
  59. package/dist/ship-installers.js.map +1 -1
  60. package/dist/types.d.ts +4 -0
  61. package/dist/types.d.ts.map +1 -1
  62. package/dist/update.d.ts.map +1 -1
  63. package/dist/update.js +22 -5
  64. package/dist/update.js.map +1 -1
  65. package/dist/upgrade.d.ts +3 -3
  66. package/dist/upgrade.d.ts.map +1 -1
  67. package/dist/upgrade.js +5 -4
  68. package/dist/upgrade.js.map +1 -1
  69. package/package.json +6 -4
  70. package/templates/.claude/skills/dxkit-config/SKILL.md +17 -0
  71. package/templates/.claude/skills/dxkit-init/SKILL.md +1 -0
  72. package/templates/.claude/skills/dxkit-learn/SKILL.md +17 -0
  73. package/templates/.claude/skills/dxkit-loop/SKILL.md +114 -0
  74. package/templates/.claude/skills/dxkit-onboard/SKILL.md +2 -0
  75. package/templates/.claude/skills/dxkit-update/SKILL.md +3 -0
package/README.md CHANGED
@@ -1,146 +1,272 @@
1
1
  # dxkit
2
2
 
3
- **AI writes the code. dxkit helps ship it clean.**
3
+ **A deterministic Stop-gate for autonomous coding loops.**
4
4
 
5
- _Deterministic guardrails for any codebase. Brownfield-friendly by default._
5
+ Coding agents keep editing until they decide to stop. Tests and linters catch
6
+ broken code, but they do not know whether the agent made the repo worse than
7
+ the baseline. So loops can quietly ship new secrets, untested paths, and other
8
+ detector-backed regressions, then report success.
6
9
 
7
- dxkit scores your codebase deterministically, baselines today's findings, and gates every push against net-new regressions. It ships conversational skills that walk agents (and humans) through fixes. Existing tech debt stays grandfathered. Nothing runs on an LLM. Everything runs locally.
10
+ In our loop benchmark, vanilla Claude Code-style loops stopped with net-new
11
+ debt in **11 of 16 runs**. A prompt that told the agent to self-check still
12
+ escaped **9 of 16**. With dxkit's Stop-gate, we observed **0 of 16** escapes:
13
+ when the loop tried to stop dirty, dxkit blocked, handed back the exact net-new
14
+ finding, and the agent repaired before stopping clean.
8
15
 
9
16
  <p align="center">
10
- <img src=".github/assets/guardrail-demo.gif" width="760" alt="A git push blocked by the dxkit pre-push guardrail: 2 net-new regressions block the push while 644 pre-existing findings stay grandfathered." />
17
+ <img src=".github/assets/loop-stop-gate-demo.gif" width="820" alt="dxkit's Stop-gate blocks a coding-agent loop on a net-new critical dependency vulnerability, the agent bumps the version, and the gate goes clean." />
11
18
  </p>
19
+ <p align="center"><sub>Recorded from a real run on a synthetic repo, shortened for readability. Blocked and repaired inside the same warm loop.</sub></p>
20
+
21
+ dxkit does not reinvent detection. It runs trusted open source scanners
22
+ (gitleaks, Semgrep, OSV, npm audit, and more), and it can ingest results from
23
+ Snyk and CodeQL. What it adds is the piece those tools were not built for: a
24
+ deterministic check, on every stop, of whether this change introduced a new
25
+ finding compared with a baseline.
12
26
 
13
27
  ```bash
14
- npm init @vyuhlabs/dxkit
28
+ npm init @vyuhlabs/dxkit -- --claude-loop --yes # install dxkit + register the Claude Code Stop hook
29
+ npx vyuh-dxkit baseline create # grandfather today's findings
30
+ npx vyuh-dxkit loop doctor # verify the gate is wired
15
31
  ```
16
32
 
33
+ The gate runs locally with no model: same input, same verdict, in seconds.
34
+ Existing debt stays grandfathered; only net-new regressions block. Want to
35
+ watch the flow first, on a sandbox dxkit creates? See the
36
+ [walkthrough](#see-it-without-touching-your-repo).
37
+
38
+ [Read the benchmark](docs/benchmarks.md) · [Try it on your repo](#try-it-on-your-repo)
39
+
17
40
  <p>
18
- <a href="https://www.npmjs.com/package/@vyuhlabs/dxkit">
19
- <img alt="npm version" src="https://img.shields.io/npm/v/@vyuhlabs/dxkit">
20
- </a>
21
- <img alt="license" src="https://img.shields.io/github/license/vyuh-labs/dxkit">
22
- <img alt="deterministic" src="https://img.shields.io/badge/scoring-deterministic-blue">
23
- <img alt="brownfield" src="https://img.shields.io/badge/brownfield-baseline%20guardrails-orange">
24
- <img alt="local-first" src="https://img.shields.io/badge/local-first-green">
41
+ <a href="https://www.npmjs.com/package/@vyuhlabs/dxkit"><img alt="npm" src="https://img.shields.io/npm/v/@vyuhlabs/dxkit"></a>
42
+ <img alt="license: MIT" src="https://img.shields.io/badge/license-MIT-green">
43
+ <img alt="deterministic gate" src="https://img.shields.io/badge/gate-deterministic-blue">
44
+ <img alt="local-first" src="https://img.shields.io/badge/local--first-success">
25
45
  </p>
26
46
 
27
47
  ---
28
48
 
29
- ## The problem
30
-
31
- Codebases drift downward in slow ways that tests do not catch.
32
-
33
- A typical Friday. Your team ships a fix. CI passes. Review approves. Two weeks later, an auditor finds a new hardcoded secret in the diff, three new untested branches, and a previously-clean file that grew to 800 lines with three TODOs sprinkled in. None of it failed a test, because no test covered those things.
34
-
35
- Now multiply this by every AI agent your team uses. Agents write more code than humans can review. Some of it is fine. Some of it is slop that looks fine but quietly degrades the codebase.
49
+ ## The problem: loops do not know when they made things worse
36
50
 
37
- The conventional fix is "block any new finding via static analysis." That fails on real codebases for a predictable reason:
51
+ An autonomous loop runs until the agent decides it is done. The only checks in
52
+ that loop today are tests and linters, and those catch broken code, not
53
+ regressed code. There is no notion of "worse than the baseline." So an agent
54
+ can add a feature, leave a new untested path or a hardcoded credential behind,
55
+ run the tests, see green, and declare success.
38
56
 
39
- - Block every finding, and your 5-year-old repo lights up with hundreds of pre-existing issues. The team disables the gate within a week.
40
- - Block no findings, and the gate is theater. Nothing changes.
57
+ In our benchmark this happened in most vanilla runs, and telling the agent to
58
+ check its own work only helped a little.
41
59
 
42
- You need an objective gate that only fires on what is actually new. That is the gap dxkit fills.
60
+ ## What dxkit does
43
61
 
44
- ---
45
-
46
- ## How dxkit solves it
47
-
48
- Three ideas working together.
62
+ 1. **Baseline today's debt.** `baseline create` records every current finding,
63
+ so pre-existing issues are grandfathered and never block.
64
+ 2. **Run a deterministic Stop-gate on every stop.** A Claude Code Stop hook
65
+ re-runs the guardrail against that baseline. Same input gives the same
66
+ verdict, in seconds, offline, with no model in the loop.
67
+ 3. **Feed net-new findings back to the agent.** If the change introduced a
68
+ finding, the gate blocks the stop and hands the agent the exact finding to
69
+ fix: do not refresh the baseline, do not touch unrelated debt, fix what this
70
+ branch introduced. The loop stops only when clean.
49
71
 
50
- ### 1. Capture today's state as a baseline
72
+ ## Who this is for
51
73
 
52
- Before dxkit blocks anything, it snapshots every existing finding in your repo and fingerprints them. The fingerprints survive renames, line shifts from formatter runs, and small unrelated edits. Cross-tool overlaps (gitleaks and semgrep flagging the same line) collapse to one finding.
74
+ Use dxkit if you let coding agents:
53
75
 
54
- From this moment forward, the gate only fires on net-new regressions. Your existing debt is grandfathered. The team fixes old issues at their own pace. The gate stays useful because it stays reasonable.
76
+ - run unattended or semi-attended,
77
+ - fix CI or review comments in loops,
78
+ - touch brownfield repos that already carry debt,
79
+ - or work where "new debt" matters more than "all debt."
55
80
 
56
- Three modes for the baseline file:
81
+ ## Built on tools you already trust
57
82
 
58
- - `committed-full`: rich entries committed to git. Default for private repos.
59
- - `committed-sanitized`: stripped to fingerprint plus kind. For compliance-conscious teams.
60
- - `ref-based`: no committed file at all. Prior side recomputed from a git ref via `git worktree add`. Default for public repos. Zero disclosure surface.
83
+ dxkit is an orchestration and enforcement layer, not another scanner. It runs
84
+ established open source tools and treats their output as one stream. Which tools
85
+ run depends on the languages in your repo — dxkit covers **8 ecosystems**
86
+ (TypeScript / JavaScript, Python, Go, Rust, C# / .NET, Java, Kotlin, Ruby).
61
87
 
62
- ### 2. Score the codebase deterministically
88
+ Universal, on every repo:
63
89
 
64
- dxkit produces a 0 to 100 score across six dimensions: Security, Code Quality, Tests, Documentation, Maintainability, Developer Experience.
90
+ - secrets: gitleaks
91
+ - code patterns: Semgrep
92
+ - dependency advisories: OSV.dev
93
+ - size, duplication, and the code graph: cloc, jscpd, graphify
65
94
 
66
- The score has four properties:
95
+ Per language, dxkit adds that ecosystem's own linter and audit tool — for
96
+ example npm audit + ESLint (JS / TS), pip-audit + ruff (Python), govulncheck +
97
+ golangci-lint (Go), cargo-audit + clippy (Rust), `dotnet list --vulnerable`
98
+ (C#), osv-scanner + PMD (Java), osv-scanner + detekt (Kotlin), and
99
+ bundler-audit + RuboCop (Ruby). The full per-language matrix is in **Per-pack
100
+ capabilities** below.
67
101
 
68
- | Property | What it means |
69
- | --------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
70
- | **Deterministic** | Same code yields the same score every time. No LLM in the grading path. Reproducible across machines, runs, and CI. Auditable. |
71
- | **Comparable** | Two codebases of similar quality produce similar scores. Surface tricks do not move the needle. Adding empty comments does not improve Documentation if the code is not actually documented. |
72
- | **Severity-weighted** | A critical security finding moves the score far more than a TODO comment. Penalties are anchored to real-world impact via CVSS for security and ratio thresholds for tests, coverage, file size, and other dimensions. |
73
- | **Actionable** | Every deduction names the file, the line, and the recommended fix. Output is structured JSON. Agents and humans read the same thing. The "what to do next" lives in the score itself. |
102
+ For deep interprocedural analysis, it ingests findings from **Snyk Code** and
103
+ **CodeQL** (or any SARIF file), fingerprints them the same way as native
104
+ findings, and runs them through the same baseline and gate. You keep the
105
+ detectors you already have. dxkit makes their findings enforceable inside CI
106
+ and inside the agent loop.
74
107
 
75
- ### 3. Fix findings at reduced token cost
108
+ | Layer | Examples | Job |
109
+ | --------- | ------------------------------------------------------ | ------------------------------------------------------- |
110
+ | Detection | gitleaks, Semgrep, OSV, npm audit, Snyk, CodeQL, SARIF | Find issues |
111
+ | dxkit | baseline, fingerprint matcher, Stop-gate, loop ledger | Decide whether this change introduced something net-new |
112
+ | Agent | Claude Code or another coding loop | Repair the exact finding and try to stop again |
76
113
 
77
- Detection is only half the job. dxkit builds a deterministic code graph of the repo (its symbols, call edges, and clustered modules), so fixing is cheap too. A coding agent works from that structure ("what calls this? what breaks if I change it?") instead of re-reading whole files, and every finding in a detailed report already carries its blast radius: the files that depend on it. The `dxkit-action` skill runs the fix, re-scores, and confirms the gate clears. Same result, far fewer tokens.
114
+ ## Try it on your repo
78
115
 
79
- ### What you get from the combination
116
+ The Stop hook runs dxkit on every stop, so install dxkit into the repo. This
117
+ one command adds it as a devDependency and registers the hook additively — your
118
+ existing `.claude` settings are preserved:
80
119
 
81
- A score on its own is a number. A baseline on its own grandfathers the past. Together they produce an objective stop signal you can trust.
82
-
83
- ```text
84
- Today: 16/100 E 644 findings, all baselined
85
- Next PR: 16/100 E 644 persisted, 0 new. Gate passes.
86
- Bad PR: 14/100 E 644 persisted, 2 new high-severity. Gate blocks.
120
+ ```bash
121
+ npm init @vyuhlabs/dxkit -- --claude-loop --yes
122
+ npx vyuh-dxkit baseline create # grandfather today's findings
123
+ npx vyuh-dxkit loop doctor # verify the gate is wired safely and dxkit resolves
124
+ # then run Claude Code as you normally would. The Stop-gate fires on every stop.
125
+ npx vyuh-dxkit loop ledger summarize # afterwards: blocked vs allowed, repaired-after-block
87
126
  ```
88
127
 
89
- The score does not lie. The baseline keeps it useful on real codebases. The combination works the same for humans, AI agents, and CI runners. That is the part that scales. And once the gate fires, the code graph makes acting on it cheap: agents fix from the structure rather than reading file after file.
128
+ When the agent tries to stop, dxkit runs the net-new gate against the baseline.
129
+ Existing findings are grandfathered; only findings this change introduced block.
90
130
 
91
- ---
131
+ ## See it without touching your repo
92
132
 
93
- ## 60-second demo
133
+ Want the flow first, on a sandbox dxkit creates?
94
134
 
95
- ```text
96
- $ npm init @vyuhlabs/dxkit
97
- ✓ Created: 14 files
98
- ✓ Git hooks: installed 1 file(s)
99
- .githooks/pre-push
100
- ✓ Devcontainer: installed 3 file(s)
101
- ✓ CI guardrails workflow: installed 1 file(s)
102
- .github/workflows/dxkit-guardrails.yml
103
- ✓ Done! Claude Code now has full project context.
104
- → Next: run `vyuh-dxkit baseline create` to capture today's state.
105
-
106
- $ npx vyuh-dxkit baseline create
107
- → Baseline mode=committed-full (auto: visibility not detectable via gh; defaulting to private posture)
108
- ✓ Wrote .dxkit/baselines/main.json — 644 findings, salt: deterministic (208.9s)
109
-
110
- $ npx vyuh-dxkit guardrail check
111
- ## Guardrail: PASSED
112
- No changes from baseline (644 pairs checked).
135
+ ```bash
136
+ npx -y @vyuhlabs/dxkit@latest demo loop-guardrail
113
137
  ```
114
138
 
115
- Later, an innocent-looking PR slips in a regression. The pre-push hook fires:
139
+ This runs the **real** gate on a temporary fixture repo: baseline → introduce a
140
+ net-new secret → BLOCK → repair → CLEAN, then it tears the fixture down. No API
141
+ key and no Claude Code, and your own repo is never touched. It needs gitleaks
142
+ installed and takes about 20 seconds; without gitleaks it shows a clearly
143
+ labelled illustration instead. (It does a one-time `npx` download, so it is not
144
+ fully offline — the gate itself is.)
145
+
146
+ ### Presets: what blocks the loop
116
147
 
117
148
  ```text
118
- $ git push
119
- [hook] vyuh-dxkit guardrail check
120
- ## Guardrail: BLOCKED
121
- 2 new regressions found.
122
-
123
- | Status | Kind | Severity | Location | Reason |
124
- |---|---|---|---|---|
125
- | added | secret | high | src/config/secrets.ts:42 | gitleaks/aws-access-key |
126
- | added | code | medium | src/handlers/exec.ts:17 | semgrep/eval-use |
127
-
128
- 644 pre-existing findings persisted. Only the new changes blocked you.
129
- Fix or allowlist with `npx vyuh-dxkit allowlist add ...`
149
+ security-only (default) secrets and critical or high vulnerabilities. Bounded, must-fix, cheap to gate.
150
+ full-debt (opt-in) also gates test gaps and maintainability regressions. Repairs can be expensive.
130
151
  ```
131
152
 
132
- The 644 pre-existing findings sit quietly. The 2 net-new ones stop the push.
133
-
134
- ---
153
+ The default is `security-only`. The headline escape-rate benchmark used
154
+ `full-debt` (it gated both the secret trap and the test-gap trap); the default
155
+ install starts narrower so a first run does not trap users in expensive
156
+ test-generation loops. Switch with
157
+ `npm init @vyuhlabs/dxkit -- --claude-loop --loop-preset full-debt`.
135
158
 
136
- ## Features
137
-
138
- ### Eight first-class language packs
139
-
140
- TypeScript / JavaScript, Python, Go, Rust, C# / .NET, Java, Kotlin, Ruby. Each pack ships per-ecosystem analyzers: semgrep rulesets, dep-vuln scanners, license tools, lint adapters. Polyglot repos get unified reports without configuration.
159
+ ## Give the agent a map, not just a gate
160
+
161
+ The Stop-gate controls what a loop is allowed to ship. The code graph controls
162
+ how the agent does the work in between. When dxkit scaffolds a repo it builds a
163
+ code graph and installs skills that drive real development off it, so the agent
164
+ orients by querying structure instead of grepping and re-reading whole files.
165
+
166
+ - **Build a feature** (`dxkit-feature` skill): query the graph for where the
167
+ feature plugs in, what patterns already exist, and what the change will
168
+ touch, then implement against those patterns and run the analyzers on the
169
+ result before it stops.
170
+ - **Fix a finding** (`dxkit-action` skill): take a flagged finding, pull its
171
+ callers, callees, and blast radius from the graph, repair it, and confirm the
172
+ change did not introduce something net-new.
173
+
174
+ The agent gets callers, callees, and blast radius up front as a budget-bounded
175
+ slice, not a pile of file reads. It is the same graph, the same baseline, and
176
+ the same identity contract the gate already uses.
177
+
178
+ What the benchmarks actually show is predictable spend, not guaranteed cheaper
179
+ spend. On a large repo the median was roughly tied, the worst-case session used
180
+ about **57% fewer tokens**, and the variance was **roughly halved**. On a small
181
+ repo the overhead was about zero. The graph caps the expensive tail. It does
182
+ not promise a lower average, and it does not make the agent write better code on
183
+ its own.
184
+
185
+ This is a different axis from detection. Snyk, SonarQube, and CodeQL tell you
186
+ what is wrong. They do not give the agent a map of the code or bound how much it
187
+ spends finding its way around. dxkit does both: the gate bounds what the loop
188
+ ships, the graph bounds how the loop works.
189
+
190
+ ## The numbers
191
+
192
+ Three independent benchmark results, one theme: dxkit makes agent work more
193
+ predictable.
194
+
195
+ | Layer | What it bounds | Observed result |
196
+ | -------------------------- | ------------------------------------ | ---------------------------------------------------------------------------------------------------------------- |
197
+ | **Stop-gate** | unsafe final state | vanilla loops escaped **11/16** times, prompt-only checklist escaped **9/16**, dxkit escaped **0/16** |
198
+ | **Deterministic identity** | false "net-new" findings under churn | **100% catch / 0% false-block** on seeded gate tests; **0 false net-new** on tested line shifts and renames |
199
+ | **Graph context** | large-repo exploration tails | median roughly tied, but large-repo mean tokens **30% lower**, worst case **57% lower**, variance roughly halved |
200
+
201
+ > **Benchmark caveats:** the loop-safety study uses controlled synthetic tasks
202
+ > plus real-repo validation, detector-backed findings, and Sonnet runs. It is
203
+ > not a CVE corpus, not a claim of better detection, and not a guarantee that
204
+ > dxkit catches every possible bug. The claim is narrower: for findings the
205
+ > detector observes, dxkit gives the loop a deterministic net-new stop decision.
206
+
207
+ Full methodology, reproducibility notes, artifact status, and caveats are in
208
+ **[docs/benchmarks.md](docs/benchmarks.md)**.
209
+
210
+ ## What dxkit is, and is not
211
+
212
+ **It is a deterministic verification layer.** It baselines today's findings,
213
+ fingerprints them across churn, and blocks only net-new regressions.
214
+
215
+ **It is not a scanner replacement.** It runs and ingests scanners (gitleaks,
216
+ Semgrep, CodeQL, Snyk, SARIF) and makes their findings enforceable. It does not
217
+ claim to find more bugs than they do.
218
+
219
+ **It is not an LLM judge.** No model decides whether the gate passes. The model
220
+ can repair findings. The gate itself is deterministic, and the prompt does not
221
+ grow as the baseline grows.
222
+
223
+ **It is not a guarantee of safe code.** It blocks detector-backed net-new
224
+ findings it can observe. You still need tests, review, scanners, and judgment.
225
+
226
+ ## Why not just Snyk, SonarQube, or CodeQL?
227
+
228
+ Use them. dxkit can ingest their findings. The difference is tempo and control,
229
+ not detection. Cloud scanners are strong detection engines, and they usually
230
+ run on a CI or PR cadence. A coding-agent loop needs a local stop decision
231
+ every time the agent tries to declare done.
232
+
233
+ | Loop Stop-gate need | dxkit | Cloud or CI scanners |
234
+ | ----------------------------------------------------------- | ----- | -------------------------------------- |
235
+ | Runs locally on every stop, in seconds | yes | usually CI or cloud cadence |
236
+ | Can run without network or auth | yes | usually requires network or auth |
237
+ | Grandfathers existing debt | yes | tool-dependent |
238
+ | Feeds the exact block reason back to the warm agent session | yes | usually a human-facing dashboard or PR |
239
+
240
+ The goal is not to replace scanners. It is to make their findings enforceable
241
+ at the speed of the agent loop.
242
+
243
+ ## Beyond loops
244
+
245
+ The same deterministic core powers the rest of dxkit: pre-push and CI
246
+ guardrails, brownfield baselines, durable finding identity, SARIF, CodeQL, and
247
+ Snyk ingest, a six-dimension health report, code-graph context, and a set of
248
+ Claude Code skills. See **[the docs](docs/README.md)**.
249
+
250
+ ## Languages
251
+
252
+ dxkit covers 8 ecosystems. Detection is automatic from your manifests and
253
+ source; each language brings its own native linter, dependency-audit tool, and
254
+ coverage parser, layered on the universal scanners (gitleaks, Semgrep, OSV,
255
+ cloc, jscpd, graphify).
256
+
257
+ | Language | Detected by | Native linter + audit |
258
+ | ----------------------- | --------------------------- | ----------------------------------------- |
259
+ | TypeScript / JavaScript | `package.json` | ESLint, npm audit |
260
+ | Python | `pyproject.toml`, `*.py` | ruff, pip-audit |
261
+ | Go | `go.mod` | golangci-lint, govulncheck |
262
+ | Rust | `Cargo.toml` | clippy, cargo-audit |
263
+ | C# / .NET | `*.csproj`, `*.sln` | dotnet-format, `dotnet list --vulnerable` |
264
+ | Java | `pom.xml`, `src/main/java/` | PMD, osv-scanner |
265
+ | Kotlin | `*.gradle{.kts,}`, `*.kt` | detekt, osv-scanner |
266
+ | Ruby | `Gemfile`, `*.rb` | RuboCop, bundler-audit |
141
267
 
142
268
  <details>
143
- <summary><strong>Per-pack capabilities</strong> (click to expand)</summary>
269
+ <summary><strong>Per-pack capabilities</strong> — coverage import, import-graph, severity tiers (click to expand)</summary>
144
270
 
145
271
  | Language | Detection | Coverage import | Import-graph | Native tools | Lint severity tiers | Vuln severity tiers |
146
272
  | -------- | ------------------------------------- | ------------------- | -------------------------------------------- | ----------------------------------- | ---------------------- | --------------------------------------------- |
@@ -165,203 +291,36 @@ so it does not inflate the Code Quality score.
165
291
 
166
292
  </details>
167
293
 
168
- ### The matcher
169
-
170
- Multi-axis fingerprints (location, domain, content, semantic) pair findings across runs even when files were renamed, lines shifted, tools changed versions, or the branch was force-pushed. When location fails, the matcher falls back to git-aware diff lookup, then content hash, then identity-only multiset match. Every pair carries a confidence score and a reason chain.
171
-
172
- ### Per-finding suppression
173
-
174
- Five typed categories: `false-positive`, `test-fixture`, `mitigated-externally`, `accepted-risk`, `deferred`. Each entry requires a reason. Categories that fade over time require an expiry.
175
-
176
- Two surfaces:
177
-
178
- - Inline annotations: `// dxkit-allow:test-fixture reason="example placeholder"`
179
- - File-level: `.dxkit/allowlist.json`, audited via `vyuh-dxkit allowlist audit`
180
-
181
- Orphaned annotations become their own findings. The TypeScript `@ts-expect-error` model applied to suppressions. Prevents the graveyard of stale allowlist entries.
182
-
183
- ### AI-agent integration
184
-
185
- dxkit ships a suite of Claude Code skills under `.claude/skills/dxkit-*`. They wrap the CLI in conversational flows:
186
-
187
- | Skill | What it does |
188
- | --------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------- |
189
- | `dxkit-onboard` | Walks a customer through the full first-install journey |
190
- | `dxkit-reports` | Runs analyzers and explains the output |
191
- | `dxkit-action` | Reads a report, prioritizes findings, plans and runs fixes, re-verifies |
192
- | `dxkit-ingest` | Brings external SAST findings (Snyk Code, CodeQL, SARIF) into dxkit |
193
- | `dxkit-fix` | Repairs a broken install from doctor output |
194
- | `dxkit-allowlist` | Manages the suppression lifecycle: audit, remove, prune, export to Snyk |
195
- | `dxkit-test` | Writes the missing tests to close gaps + raise the Tests score |
196
- | `dxkit-pr` | Opens a PR with a diff-grounded body + dxkit signals + reviewer checklist |
197
- | `dxkit-feature`, `dxkit-docs`, `dxkit-hooks`, `dxkit-config`, `dxkit-learn`, `dxkit-update`, `dxkit-init` | Focused flows |
198
-
199
- `AGENTS.md` (the open standard read by Codex, Cursor, Aider, and others) also ships in every install. The skill flows are Claude Code-specific today; the AGENTS.md context is portable.
200
-
201
- Why this matters for AI workflows: when an agent fixes a bug, you need an objective signal that says "yes, fixed cleanly" or "fix introduced four new regressions." dxkit's deterministic score plus baseline guardrail produces that signal. The agent reads the same JSON envelope a human reads, runs the verify step itself, and stops when clean.
202
-
203
- ### Code-graph context: fix at reduced token cost
204
-
205
- dxkit builds a deterministic code graph of your repo (its symbols, call edges, and clustered modules) using graphify (the `graphifyy` Python package). What matters is what an agent does with it. Instead of discovering structure by grepping around and reading whole files, the agent gets just the relevant slice:
206
-
207
- - **`vyuh-dxkit context <query>`** (and an opt-in PreToolUse hook) hand an agent a slim structural map: the relevant symbols, where they live, and what calls them. It navigates by the graph instead of re-reading files, which is the same work at a fraction of the tokens.
208
- - **`--graph-context`** writes each finding's module and blast radius (which files call into it) straight into the detailed report, so the `dxkit-action` fix skill can plan the change, and know which callers to re-test, without rediscovering structure first.
209
- - **`vyuh-dxkit explore`** and a dashboard graph tab let humans ask the same graph what the repo does, where a feature lives, and which files are load-bearing.
210
-
211
- This is an additive, fail-open layer. When the graph is missing, or a language's call edges can't be resolved, every command behaves exactly as it did before. It's reliable on TypeScript, Python, and Go. Where the call graph can't be resolved (C#), blast radius is suppressed rather than faked, so a "no callers" reading is never mistaken for "safe to change."
212
-
213
- ### Connect findings and PRs to the people who know the code
214
-
215
- A finding or a PR is more actionable when you know who to ask. dxkit grounds that in an **active-owner model** — recency-weighted git history, scoped to who is still active, with bots and departed contributors filtered, the change author excluded, and a bus-factor signal.
216
-
217
- - **`vyuh-dxkit reviewers`** suggests reviewers for a change, ranked by active ownership of the touched files and blended with `CODEOWNERS` — a better signal than a platform's naive last-touch suggestion. The `dxkit-pr` skill folds it into the PR body.
218
- - **`--attribute`** adds a "who to ask" column to a detailed report: a pre-existing finding is traced to its current owner (an inactive author is routed to whoever owns the file now). It's opt-in and historical — a net-new finding is introduced by your own change.
219
-
220
- Output is names + GitHub @handles, never raw emails — the @handle is both privacy-safe and @-mentionable.
221
-
222
- ### Deep SAST: interprocedural findings from any engine
223
-
224
- dxkit's bundled SAST (community semgrep) is intraprocedural — it can't follow tainted data across function boundaries, so it misses the path-traversal / information-exposure / SSRF / injection class that an interprocedural engine like Snyk Code or CodeQL catches. dxkit doesn't try to re-detect that class; it **ingests** it and makes it first-class.
225
-
226
- - **`vyuh-dxkit ingest --from-snyk`** brings in your Snyk Code findings and works on every Snyk plan: it reads the REST API quota-free where you have it (Enterprise), and on Free/Team plans automatically falls back to `snyk code test` (one test per run). **`--sarif <file>`** ingests SARIF from any engine; **`--codeql`** runs CodeQL on demand (open-source / GitHub Advanced Security).
227
- - Ingested findings enter the same pipeline as native ones: fingerprinted and deduped, written to the baseline, enforced by the guardrail, and graph-linked under `--graph-context` so the `dxkit-action` fix loop sees blast radius + callers — context the source engine's own autofix doesn't have.
228
- - The findings live in a committed `.dxkit/external/` snapshot, so the engine token is needed only at ingest time (ideally one on-demand CI job) — every developer and CI run reads the snapshot without it.
229
-
230
- dxkit isn't competing with the detection engine — it's the governance + agentic-fix layer on top of whichever one you can run. The `dxkit-ingest` skill walks through setup and picks the engine license-aware (your own Snyk for private repos; CodeQL for open source / GHAS).
231
-
232
- ### Reproducible environments
233
-
234
- Per-stack devcontainer with only the languages your project uses. Scanner toolchain auto-installed. Install scripts for AI agent CLIs (auth stays user-owned). Codespaces prebuilds wire via `vyuh-dxkit setup-prebuild` so cold-start drops from ~7 minutes to ~30 seconds.
235
-
236
- ### Public-repo safe baselines
294
+ ## Reproduce the deterministic tier
237
295
 
238
- The `ref-based` mode commits no baseline file. The guardrail check recomputes the prior side at check time from a git ref via `git worktree add`. Zero disclosure surface. File paths, package names, and advisory IDs all stay out of git. Auto-picked for public repos via `gh repo view --json visibility`.
239
-
240
- ---
241
-
242
- ## Quickstart
243
-
244
- ```bash
245
- # Canonical first install
246
- npm init @vyuhlabs/dxkit
247
-
248
- # Capture today's state
249
- npx vyuh-dxkit baseline create
250
-
251
- # Verify the install
252
- npx vyuh-dxkit doctor
253
-
254
- # Commit and ship
255
- git add . && git commit -m "chore: enable dxkit" && git push
256
-
257
- # Optional but recommended
258
- npx vyuh-dxkit setup-branch-protection # mark guardrail as required CI check
259
- npx vyuh-dxkit setup-prebuild # Codespaces prebuild
260
- ```
261
-
262
- À la carte if you only want specific pieces:
296
+ The deterministic results — the net-new gate decision and the finding-identity
297
+ matcher — reproduce offline, so you do not have to trust our numbers. This is
298
+ separate from the agentic benchmark, which requires running real agent sessions.
299
+ The harnesses live in `benchmarks/`:
263
300
 
264
301
  ```bash
265
- npx vyuh-dxkit init --with-dxkit-agents # just the dxkit-* Claude skills + AGENTS.md
266
- npx vyuh-dxkit init --with-hooks # just the pre-push hook
267
- npx vyuh-dxkit init --with-precommit-hook # add pre-commit (slow on large repos)
268
- npx vyuh-dxkit init --with-devcontainer # just the per-stack devcontainer
269
- npx vyuh-dxkit init --with-ci # just the PR-gate workflow
302
+ node benchmarks/bench-guardrail.mjs config.json # block/allow on seeded findings
303
+ node benchmarks/bench-netnew-isolation.mjs config.json # net-new isolation under churn
304
+ node benchmarks/bench-matcher.mjs config.json # false net-new on line shifts + renames
270
305
  ```
271
306
 
272
- ---
307
+ See `benchmarks/README.md` to point them at a repo, and find the full methodology,
308
+ caveats, and artifact status in **[docs/benchmarks.md](docs/benchmarks.md)**.
273
309
 
274
- ## What dxkit analyzes
275
-
276
- | Dimension | Tools | What it catches |
277
- | -------------------- | --------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------- |
278
- | Security | gitleaks, semgrep, osv-scanner, npm-audit, pip-audit, govulncheck, cargo-audit, dotnet vulnerable, bundle-audit | Secrets, dep vulnerabilities, insecure patterns, TLS bypass |
279
- | Code Quality | cloc, jscpd, graphify, lint adapters | File size, duplication, complexity, hygiene markers |
280
- | Tests | coverage adapters per pack, test-file detector | Missing tests, degraded tests, coverage gaps |
281
- | Documentation | doc-comment ratio, README presence | Inline doc coverage, project-level docs |
282
- | Maintainability | graphify call-graph metrics | God files, dead imports, cohesion, communities |
283
- | Developer Experience | git hook detection, CI workflow detection, manifest presence | Pre-push hooks, CI quality gates, environment reproducibility |
284
-
285
- Each analyzer reports raw findings. dxkit aggregates, deduplicates across tools, and scores deterministically.
286
-
287
- ---
288
-
289
- ## Brownfield vs greenfield
290
-
291
- | | Greenfield (day 1) | Brownfield (years of debt) |
292
- | ---------------- | -------------------------------------- | ------------------------------------------------- |
293
- | Baseline | Near-zero on capture | Captures today's debt as floor |
294
- | Behavior | Every regression matters from commit 1 | Existing debt grandfathered; net-new blocks |
295
- | Cleanup pressure | Stay clean, easily | Improve incrementally; no required cleanup sprint |
296
-
297
- The status taxonomy that drives gate decisions:
298
-
299
- | Status | Meaning | Default |
300
- | ------------------- | ----------------------------------------- | ---------- |
301
- | `added` | Net-new finding introduced by this change | **blocks** |
302
- | `relocated` | Same finding, moved (line drift, rename) | passes |
303
- | `persisted` | Same finding, same place. Pre-existing. | passes |
304
- | `removed` / `fixed` | Was there, now gone | passes |
305
- | `tooling_drift` | New because scanner version changed | warns |
306
- | `config_drift` | New because dxkit config changed | warns |
307
- | `uncertain` | Below confidence threshold | warns |
308
-
309
- Customize via [`.dxkit/policy.json`](docs/configuration/policy.md).
310
-
311
- ---
310
+ ## Credits
312
311
 
313
- ## Safety and trust
314
-
315
- - **Local-first.** Every scan runs on the developer's machine. Nothing leaves the repo. No telemetry. No phone-home.
316
- - **No LLM in the grading path.** Scores come from deterministic analyzers and arithmetic. Reproducible. Auditable. The only way to improve a score is to write better code.
317
- - **Sigstore provenance.** Every npm release is signed via OIDC from GitHub Actions. Verify with `npm audit signatures`.
318
- - **Open source.** MIT licensed. Inspect every score derivation.
319
-
320
- ---
321
-
322
- ## Real-world validation
323
-
324
- dxkit ships against pinned production codebases across all eight language packs. Every release runs a cross-stack walkthrough on a polyglot reference repo (TypeScript + Python) and a .NET reference repo before tagging. The cross-stack regression suite is part of CI.
325
-
326
- Recent ship validation (`@vyuhlabs/dxkit@2.6.0`, 2026-05-23):
327
-
328
- - 1904 tests across 110 files
329
- - License findings dropped 73% on a 600-source-file polyglot codebase after the 2.6 baseline polish
330
- - New `ref-based` mode verified end-to-end on both reference stacks
331
-
332
- ---
333
-
334
- ## Documentation
335
-
336
- **Start here**:
337
-
338
- - [Getting started](docs/getting-started.md): full walkthrough from install to first guardrail check
339
- - [CHANGELOG](CHANGELOG.md): release notes. Latest is [2.6.0](https://github.com/vyuh-labs/dxkit/releases/tag/v2.6.0)
340
-
341
- **Depth**:
342
-
343
- - [Why dxkit](docs/why-dxkit.md): rationale, comparison vs SonarQube/Snyk/Semgrep/etc., open methodology
344
- - [Architecture](docs/ARCHITECTURE.md): data flow, the git-aware matcher, fingerprint axes
345
- - [Scoring methodology](docs/SCORING.md): how each dimension is computed, citations
346
- - [Roadmap](docs/roadmap.md): shipped vs planned
347
-
348
- **Reference**:
349
-
350
- - [Command reference](docs/README.md): every subcommand at a glance
351
- - [`baseline`](docs/commands/baseline.md): capture, show, modes
352
- - [`guardrail`](docs/commands/guardrail.md): check, classify, render
353
- - [`allowlist`](docs/commands/allowlist.md): per-finding suppression
354
- - [`.dxkit/policy.json`](docs/configuration/policy.md): tune what blocks vs warns
355
- - [Reporting issues](docs/commands/issue.md): `vyuh-dxkit issue --type=...`
356
-
357
- ---
358
-
359
- ## Contributing
360
-
361
- See [CONTRIBUTING.md](CONTRIBUTING.md). The project follows architectural rules in [CLAUDE.md](CLAUDE.md). Adding a new language pack, a new finding kind, or a new scoring dimension each have one-page recipes.
362
-
363
- ---
312
+ dxkit stands on excellent open source tools. It orchestrates them; it does not
313
+ replace them. Thank you to the maintainers of
314
+ [graphify](https://github.com/safishamsi/graphify) (the code graph),
315
+ [gitleaks](https://github.com/gitleaks/gitleaks),
316
+ [Semgrep](https://github.com/semgrep/semgrep),
317
+ [OSV-Scanner](https://github.com/google/osv-scanner),
318
+ [jscpd](https://github.com/kucherenko/jscpd), and
319
+ [cloc](https://github.com/AlDanial/cloc). Each tool is installed separately and
320
+ keeps its own license.
364
321
 
365
- ## License
322
+ ## Contributing and roadmap
366
323
 
367
- MIT. See [LICENSE](LICENSE).
324
+ - Contributing guide: [CONTRIBUTING.md](CONTRIBUTING.md)
325
+ - Roadmap: [docs/roadmap.md](docs/roadmap.md)
326
+ - License: MIT
@@ -11,7 +11,7 @@
11
11
  * 2. **Inline example** — the exact annotation comment to paste
12
12
  * when the finding has a stable single-line attachment point
13
13
  * and the chosen category is inline-compatible.
14
- * 3. **CLI command** — the exact `npx vyuh-dxkit allowlist add`
14
+ * 3. **CLI command** — the exact `vyuh-dxkit allowlist add`
15
15
  * invocation that handles the mutation without the developer
16
16
  * typing annotation syntax.
17
17
  *
@@ -1 +1 @@
1
- {"version":3,"file":"hint.d.ts","sourceRoot":"","sources":["../../src/allowlist/hint.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkCG;AAIH,OAAO,KAAK,EAAE,aAAa,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AAGxE,OAAO,EAKL,KAAK,iBAAiB,EACvB,MAAM,cAAc,CAAC;AAStB,MAAM,WAAW,SAAS;IACxB,yDAAyD;IACzD,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAC7B;;oBAEgB;IAChB,QAAQ,CAAC,oBAAoB,EAAE,SAAS,iBAAiB,EAAE,CAAC;IAC5D;;;mCAG+B;IAC/B,QAAQ,CAAC,aAAa,CAAC,EAAE,MAAM,CAAC;IAChC,4DAA4D;IAC5D,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B;;;4CAGwC;IACxC,QAAQ,CAAC,aAAa,EAAE,OAAO,CAAC;IAChC;2DACuD;IACvD,QAAQ,CAAC,aAAa,CAAC,EAAE,MAAM,CAAC;CACjC;AAED;;;;;;;;;;GAUG;AACH,wBAAgB,eAAe,CAAC,KAAK,EAAE,aAAa,EAAE,QAAQ,CAAC,EAAE,eAAe,GAAG,SAAS,CAyB3F;AAED;;;;;GAKG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,aAAa,CAAC,MAAM,CAAC,GAAG,MAAM,CAsFlE"}
1
+ {"version":3,"file":"hint.d.ts","sourceRoot":"","sources":["../../src/allowlist/hint.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkCG;AAIH,OAAO,KAAK,EAAE,aAAa,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AAGxE,OAAO,EAKL,KAAK,iBAAiB,EACvB,MAAM,cAAc,CAAC;AAUtB,MAAM,WAAW,SAAS;IACxB,yDAAyD;IACzD,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAC7B;;oBAEgB;IAChB,QAAQ,CAAC,oBAAoB,EAAE,SAAS,iBAAiB,EAAE,CAAC;IAC5D;;;mCAG+B;IAC/B,QAAQ,CAAC,aAAa,CAAC,EAAE,MAAM,CAAC;IAChC,4DAA4D;IAC5D,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B;;;4CAGwC;IACxC,QAAQ,CAAC,aAAa,EAAE,OAAO,CAAC;IAChC;2DACuD;IACvD,QAAQ,CAAC,aAAa,CAAC,EAAE,MAAM,CAAC;CACjC;AAED;;;;;;;;;;GAUG;AACH,wBAAgB,eAAe,CAAC,KAAK,EAAE,aAAa,EAAE,QAAQ,CAAC,EAAE,eAAe,GAAG,SAAS,CAyB3F;AAED;;;;;GAKG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,aAAa,CAAC,MAAM,CAAC,GAAG,MAAM,CAwFlE"}