harness-scorecard 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- harness_scorecard-1.0.0/LICENSE +21 -0
- harness_scorecard-1.0.0/PKG-INFO +106 -0
- harness_scorecard-1.0.0/README.md +83 -0
- harness_scorecard-1.0.0/pyproject.toml +97 -0
- harness_scorecard-1.0.0/src/harness_scorecard/__init__.py +5 -0
- harness_scorecard-1.0.0/src/harness_scorecard/checks/__init__.py +36 -0
- harness_scorecard-1.0.0/src/harness_scorecard/checks/base.py +149 -0
- harness_scorecard-1.0.0/src/harness_scorecard/checks/destructive_git.py +148 -0
- harness_scorecard-1.0.0/src/harness_scorecard/checks/egress.py +71 -0
- harness_scorecard-1.0.0/src/harness_scorecard/checks/observability.py +65 -0
- harness_scorecard-1.0.0/src/harness_scorecard/checks/provenance.py +52 -0
- harness_scorecard-1.0.0/src/harness_scorecard/checks/recovery.py +64 -0
- harness_scorecard-1.0.0/src/harness_scorecard/checks/secret_protection.py +146 -0
- harness_scorecard-1.0.0/src/harness_scorecard/checks/self_protection.py +129 -0
- harness_scorecard-1.0.0/src/harness_scorecard/checks/subagent_isolation.py +86 -0
- harness_scorecard-1.0.0/src/harness_scorecard/checks/tool_surface.py +99 -0
- harness_scorecard-1.0.0/src/harness_scorecard/checks/verification.py +52 -0
- harness_scorecard-1.0.0/src/harness_scorecard/checks_codex/__init__.py +39 -0
- harness_scorecard-1.0.0/src/harness_scorecard/checks_codex/destructive_git.py +107 -0
- harness_scorecard-1.0.0/src/harness_scorecard/checks_codex/egress.py +94 -0
- harness_scorecard-1.0.0/src/harness_scorecard/checks_codex/observability.py +59 -0
- harness_scorecard-1.0.0/src/harness_scorecard/checks_codex/provenance.py +70 -0
- harness_scorecard-1.0.0/src/harness_scorecard/checks_codex/recovery.py +53 -0
- harness_scorecard-1.0.0/src/harness_scorecard/checks_codex/secret_protection.py +100 -0
- harness_scorecard-1.0.0/src/harness_scorecard/checks_codex/self_protection.py +116 -0
- harness_scorecard-1.0.0/src/harness_scorecard/checks_codex/subagent_isolation.py +70 -0
- harness_scorecard-1.0.0/src/harness_scorecard/checks_codex/tool_surface.py +74 -0
- harness_scorecard-1.0.0/src/harness_scorecard/checks_codex/verification.py +67 -0
- harness_scorecard-1.0.0/src/harness_scorecard/cli.py +113 -0
- harness_scorecard-1.0.0/src/harness_scorecard/discovery.py +177 -0
- harness_scorecard-1.0.0/src/harness_scorecard/discovery_codex.py +251 -0
- harness_scorecard-1.0.0/src/harness_scorecard/dispatch.py +60 -0
- harness_scorecard-1.0.0/src/harness_scorecard/htmlreport.py +152 -0
- harness_scorecard-1.0.0/src/harness_scorecard/models.py +137 -0
- harness_scorecard-1.0.0/src/harness_scorecard/parsing.py +134 -0
- harness_scorecard-1.0.0/src/harness_scorecard/py.typed +0 -0
- harness_scorecard-1.0.0/src/harness_scorecard/redaction.py +52 -0
- harness_scorecard-1.0.0/src/harness_scorecard/report.py +114 -0
- harness_scorecard-1.0.0/src/harness_scorecard/sarif.py +160 -0
- harness_scorecard-1.0.0/src/harness_scorecard/scoring.py +99 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Saagar Patel
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: harness-scorecard
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: A read-only linter and A-F maturity grader for coding-agent harnesses (Claude Code, Codex).
|
|
5
|
+
Keywords: claude-code,codex,coding-agent,harness,linter,sarif,security,static-analysis,supply-chain
|
|
6
|
+
Author: Saagar Patel
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: Environment :: Console
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Topic :: Security
|
|
16
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
17
|
+
Classifier: Typing :: Typed
|
|
18
|
+
Requires-Python: >=3.12
|
|
19
|
+
Project-URL: Homepage, https://github.com/saagpatel/harness-scorecard
|
|
20
|
+
Project-URL: Repository, https://github.com/saagpatel/harness-scorecard
|
|
21
|
+
Project-URL: Issues, https://github.com/saagpatel/harness-scorecard/issues
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
|
|
24
|
+
# Harness Scorecard
|
|
25
|
+
|
|
26
|
+
A read-only linter and **A–F maturity grader for coding-agent harnesses**. Point it at a
|
|
27
|
+
Claude Code or Codex setup — Claude Code's hooks, `permissions`, `rules/*.md`, agents, and
|
|
28
|
+
`CLAUDE.md`, or Codex's `config.toml` (sandbox, approval policy, trust levels), `hooks.json`,
|
|
29
|
+
and `AGENTS.md` — and it returns a graded scorecard: the overall maturity grade, the specific
|
|
30
|
+
gaps, and the guards that are missing, each with rationale. The harness type is auto-detected.
|
|
31
|
+
|
|
32
|
+
"Harness engineering" became a named discipline in 2026 and everyone is assembling harnesses
|
|
33
|
+
with no way to tell if theirs is any good. The rubric is the product: every check traces to a
|
|
34
|
+
**documented red-team failure mode**, not generic advice.
|
|
35
|
+
|
|
36
|
+
## What makes the grade real
|
|
37
|
+
|
|
38
|
+
Most config "linters" credit a harness for declaring a rule. This one models the *effective*
|
|
39
|
+
enforcement floor. The headline example:
|
|
40
|
+
|
|
41
|
+
> `autoMode.hard_deny` is **inert** when `permissions.defaultMode == "bypassPermissions"`.
|
|
42
|
+
|
|
43
|
+
A naive scorer reads a rich `hard_deny` block and awards an A. Harness Scorecard reads the
|
|
44
|
+
mode, discounts the inert block, and grades against what actually fires — `permissions.deny`
|
|
45
|
+
globs plus the PreToolUse hooks. See [`docs/rubric.md`](docs/rubric.md) for the full model,
|
|
46
|
+
including **capability gates** that cap the grade when a critical hole is present (you can't
|
|
47
|
+
score an A with readable credentials, no matter how many cheap checks pass).
|
|
48
|
+
|
|
49
|
+
## Usage
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
# Grade a harness directory (e.g. your ~/.claude)
|
|
53
|
+
harness-scorecard scan ~/.claude
|
|
54
|
+
|
|
55
|
+
# JSON for tooling, plus a self-contained HTML scorecard
|
|
56
|
+
harness-scorecard scan ~/.claude --format json --html scorecard.html
|
|
57
|
+
|
|
58
|
+
# SARIF 2.1.0 for CI / GitHub code scanning, failing the run below grade C
|
|
59
|
+
harness-scorecard scan ~/.claude --sarif harness.sarif --min-grade C
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
`--min-grade {A,B,C,D,F}` sets the bar (default `B`). Exit codes: `0` meets the bar ·
|
|
63
|
+
`1` below the bar · `2` no harness found.
|
|
64
|
+
|
|
65
|
+
## GitHub Action
|
|
66
|
+
|
|
67
|
+
Grade your harness in CI and upload the findings to code scanning:
|
|
68
|
+
|
|
69
|
+
```yaml
|
|
70
|
+
- uses: saagpatel/harness-scorecard@v1
|
|
71
|
+
with:
|
|
72
|
+
path: .claude
|
|
73
|
+
min-grade: B
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
The action writes SARIF and uploads it (requires `security-events: write`) **even when the grade
|
|
77
|
+
fails the build**, so findings always reach code scanning. A complete workflow — permissions,
|
|
78
|
+
weekly scheduling, SARIF upload — is in [`examples/github-workflow.yml`](examples/github-workflow.yml).
|
|
79
|
+
|
|
80
|
+
## Guarantees
|
|
81
|
+
|
|
82
|
+
- **Read-only.** It never writes to the harness it audits.
|
|
83
|
+
- **Privacy-preserving.** All output redacts secrets, tokens, emails, and absolute home
|
|
84
|
+
paths. Nothing leaves the machine.
|
|
85
|
+
- **Dependency-free runtime.** The scorer ships stdlib-only — a tool that grades
|
|
86
|
+
supply-chain hygiene should carry the smallest surface itself.
|
|
87
|
+
|
|
88
|
+
## Scope (v1)
|
|
89
|
+
|
|
90
|
+
Implements **all ten rubric dimensions** end-to-end for **both Claude Code and Codex**: secret
|
|
91
|
+
protection, egress/exfiltration control, tool-surface & inbound-injection defense,
|
|
92
|
+
destructive-action & git safety, harness self-protection & integrity, verification gates,
|
|
93
|
+
subagent isolation & governance, recovery/rollback safety, memory/provenance hygiene, and
|
|
94
|
+
observability/audit trail (the critical gated trio is D1/D4/D5). Each harness has its own
|
|
95
|
+
adapter and check suite over the shared scoring engine; the bypass-aware effective floor maps
|
|
96
|
+
to Codex's `sandbox_mode = "danger-full-access"` + `approval_policy = "never"` just as it does
|
|
97
|
+
to Claude Code's `bypassPermissions`. The rubric is versioned and emitted in every report.
|
|
98
|
+
|
|
99
|
+
## Development
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
uv sync --frozen # install dev tooling from the lockfile
|
|
103
|
+
uv run --no-sync python -m unittest discover -s tests # tests (stdlib runner, zero extra deps)
|
|
104
|
+
uv run --no-sync ruff check src/ tests/ # lint
|
|
105
|
+
uv run --no-sync ty check src/ # type check
|
|
106
|
+
```
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# Harness Scorecard
|
|
2
|
+
|
|
3
|
+
A read-only linter and **A–F maturity grader for coding-agent harnesses**. Point it at a
|
|
4
|
+
Claude Code or Codex setup — Claude Code's hooks, `permissions`, `rules/*.md`, agents, and
|
|
5
|
+
`CLAUDE.md`, or Codex's `config.toml` (sandbox, approval policy, trust levels), `hooks.json`,
|
|
6
|
+
and `AGENTS.md` — and it returns a graded scorecard: the overall maturity grade, the specific
|
|
7
|
+
gaps, and the guards that are missing, each with rationale. The harness type is auto-detected.
|
|
8
|
+
|
|
9
|
+
"Harness engineering" became a named discipline in 2026 and everyone is assembling harnesses
|
|
10
|
+
with no way to tell if theirs is any good. The rubric is the product: every check traces to a
|
|
11
|
+
**documented red-team failure mode**, not generic advice.
|
|
12
|
+
|
|
13
|
+
## What makes the grade real
|
|
14
|
+
|
|
15
|
+
Most config "linters" credit a harness for declaring a rule. This one models the *effective*
|
|
16
|
+
enforcement floor. The headline example:
|
|
17
|
+
|
|
18
|
+
> `autoMode.hard_deny` is **inert** when `permissions.defaultMode == "bypassPermissions"`.
|
|
19
|
+
|
|
20
|
+
A naive scorer reads a rich `hard_deny` block and awards an A. Harness Scorecard reads the
|
|
21
|
+
mode, discounts the inert block, and grades against what actually fires — `permissions.deny`
|
|
22
|
+
globs plus the PreToolUse hooks. See [`docs/rubric.md`](docs/rubric.md) for the full model,
|
|
23
|
+
including **capability gates** that cap the grade when a critical hole is present (you can't
|
|
24
|
+
score an A with readable credentials, no matter how many cheap checks pass).
|
|
25
|
+
|
|
26
|
+
## Usage
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
# Grade a harness directory (e.g. your ~/.claude)
|
|
30
|
+
harness-scorecard scan ~/.claude
|
|
31
|
+
|
|
32
|
+
# JSON for tooling, plus a self-contained HTML scorecard
|
|
33
|
+
harness-scorecard scan ~/.claude --format json --html scorecard.html
|
|
34
|
+
|
|
35
|
+
# SARIF 2.1.0 for CI / GitHub code scanning, failing the run below grade C
|
|
36
|
+
harness-scorecard scan ~/.claude --sarif harness.sarif --min-grade C
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
`--min-grade {A,B,C,D,F}` sets the bar (default `B`). Exit codes: `0` meets the bar ·
|
|
40
|
+
`1` below the bar · `2` no harness found.
|
|
41
|
+
|
|
42
|
+
## GitHub Action
|
|
43
|
+
|
|
44
|
+
Grade your harness in CI and upload the findings to code scanning:
|
|
45
|
+
|
|
46
|
+
```yaml
|
|
47
|
+
- uses: saagpatel/harness-scorecard@v1
|
|
48
|
+
with:
|
|
49
|
+
path: .claude
|
|
50
|
+
min-grade: B
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
The action writes SARIF and uploads it (requires `security-events: write`) **even when the grade
|
|
54
|
+
fails the build**, so findings always reach code scanning. A complete workflow — permissions,
|
|
55
|
+
weekly scheduling, SARIF upload — is in [`examples/github-workflow.yml`](examples/github-workflow.yml).
|
|
56
|
+
|
|
57
|
+
## Guarantees
|
|
58
|
+
|
|
59
|
+
- **Read-only.** It never writes to the harness it audits.
|
|
60
|
+
- **Privacy-preserving.** All output redacts secrets, tokens, emails, and absolute home
|
|
61
|
+
paths. Nothing leaves the machine.
|
|
62
|
+
- **Dependency-free runtime.** The scorer ships stdlib-only — a tool that grades
|
|
63
|
+
supply-chain hygiene should carry the smallest surface itself.
|
|
64
|
+
|
|
65
|
+
## Scope (v1)
|
|
66
|
+
|
|
67
|
+
Implements **all ten rubric dimensions** end-to-end for **both Claude Code and Codex**: secret
|
|
68
|
+
protection, egress/exfiltration control, tool-surface & inbound-injection defense,
|
|
69
|
+
destructive-action & git safety, harness self-protection & integrity, verification gates,
|
|
70
|
+
subagent isolation & governance, recovery/rollback safety, memory/provenance hygiene, and
|
|
71
|
+
observability/audit trail (the critical gated trio is D1/D4/D5). Each harness has its own
|
|
72
|
+
adapter and check suite over the shared scoring engine; the bypass-aware effective floor maps
|
|
73
|
+
to Codex's `sandbox_mode = "danger-full-access"` + `approval_policy = "never"` just as it does
|
|
74
|
+
to Claude Code's `bypassPermissions`. The rubric is versioned and emitted in every report.
|
|
75
|
+
|
|
76
|
+
## Development
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
uv sync --frozen # install dev tooling from the lockfile
|
|
80
|
+
uv run --no-sync python -m unittest discover -s tests # tests (stdlib runner, zero extra deps)
|
|
81
|
+
uv run --no-sync ruff check src/ tests/ # lint
|
|
82
|
+
uv run --no-sync ty check src/ # type check
|
|
83
|
+
```
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "harness-scorecard"
|
|
3
|
+
version = "1.0.0"
|
|
4
|
+
description = "A read-only linter and A-F maturity grader for coding-agent harnesses (Claude Code, Codex)."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = "MIT"
|
|
7
|
+
license-files = ["LICENSE"]
|
|
8
|
+
authors = [{ name = "Saagar Patel" }]
|
|
9
|
+
requires-python = ">=3.12"
|
|
10
|
+
keywords = [
|
|
11
|
+
"claude-code",
|
|
12
|
+
"codex",
|
|
13
|
+
"coding-agent",
|
|
14
|
+
"harness",
|
|
15
|
+
"linter",
|
|
16
|
+
"sarif",
|
|
17
|
+
"security",
|
|
18
|
+
"static-analysis",
|
|
19
|
+
"supply-chain",
|
|
20
|
+
]
|
|
21
|
+
classifiers = [
|
|
22
|
+
"Development Status :: 4 - Beta",
|
|
23
|
+
"Environment :: Console",
|
|
24
|
+
"Intended Audience :: Developers",
|
|
25
|
+
"Operating System :: OS Independent",
|
|
26
|
+
"Programming Language :: Python :: 3",
|
|
27
|
+
"Programming Language :: Python :: 3.12",
|
|
28
|
+
"Topic :: Security",
|
|
29
|
+
"Topic :: Software Development :: Quality Assurance",
|
|
30
|
+
"Typing :: Typed",
|
|
31
|
+
]
|
|
32
|
+
# Runtime is intentionally dependency-free: stdlib only.
|
|
33
|
+
# A scorer that grades supply-chain hygiene should carry the smallest
|
|
34
|
+
# possible dependency surface itself. Dev tooling lives in [dependency-groups].
|
|
35
|
+
dependencies = []
|
|
36
|
+
|
|
37
|
+
[project.urls]
|
|
38
|
+
Homepage = "https://github.com/saagpatel/harness-scorecard"
|
|
39
|
+
Repository = "https://github.com/saagpatel/harness-scorecard"
|
|
40
|
+
Issues = "https://github.com/saagpatel/harness-scorecard/issues"
|
|
41
|
+
|
|
42
|
+
[project.scripts]
|
|
43
|
+
harness-scorecard = "harness_scorecard.cli:main"
|
|
44
|
+
|
|
45
|
+
[build-system]
|
|
46
|
+
requires = ["uv_build>=0.11.13,<0.12.0"]
|
|
47
|
+
build-backend = "uv_build"
|
|
48
|
+
|
|
49
|
+
# Dev tooling is declared here but installed via an approval-gated `uv add`.
|
|
50
|
+
# Tests run on the stdlib (`unittest`) until pytest is approved; the suite is
|
|
51
|
+
# written as unittest.TestCase classes so pytest discovers it unchanged.
|
|
52
|
+
[dependency-groups]
|
|
53
|
+
dev = [{ include-group = "lint" }, { include-group = "test" }]
|
|
54
|
+
lint = ["ruff", "ty"]
|
|
55
|
+
test = ["pytest", "pytest-cov"]
|
|
56
|
+
|
|
57
|
+
[tool.ruff]
|
|
58
|
+
line-length = 100
|
|
59
|
+
target-version = "py312"
|
|
60
|
+
|
|
61
|
+
[tool.ruff.lint]
|
|
62
|
+
select = ["ALL"]
|
|
63
|
+
ignore = [
|
|
64
|
+
"D", # docstring style — handled case-by-case
|
|
65
|
+
"COM812", # trailing comma — conflicts with the formatter
|
|
66
|
+
"ISC001", # implicit string concat — conflicts with the formatter
|
|
67
|
+
"ANN401", # Any is legitimate at the raw-JSON parsing boundary
|
|
68
|
+
"PLR0913", # keyword-only config params (effective_block) read clearly
|
|
69
|
+
"TC001", # avoid TYPE_CHECKING-block ceremony in small modules
|
|
70
|
+
"TC002",
|
|
71
|
+
"TC003",
|
|
72
|
+
]
|
|
73
|
+
|
|
74
|
+
[tool.ruff.lint.per-file-ignores]
|
|
75
|
+
# Tests use unittest.TestCase deliberately (stdlib-runnable, zero extra deps).
|
|
76
|
+
"tests/**" = [
|
|
77
|
+
"S101", # asserts are the point of tests
|
|
78
|
+
"PLR2004", # magic values in assertions are fine
|
|
79
|
+
"ANN", # test signatures need no annotations
|
|
80
|
+
"SLF001", # tests may touch private members
|
|
81
|
+
"PT009", # unittest-style assertEqual is intentional here
|
|
82
|
+
"PT027", # unittest-style assertRaises is intentional here
|
|
83
|
+
"INP001", # tests/ is intentionally not an importable package
|
|
84
|
+
"N802", # setUp / tearDown are the unittest API
|
|
85
|
+
"S108", # fixture configs use placeholder /tmp roots that are never written
|
|
86
|
+
]
|
|
87
|
+
"src/harness_scorecard/cli.py" = ["T201"] # print is the CLI's output channel
|
|
88
|
+
|
|
89
|
+
[tool.pytest.ini_options]
|
|
90
|
+
testpaths = ["tests"]
|
|
91
|
+
addopts = ["--import-mode=importlib"]
|
|
92
|
+
|
|
93
|
+
[tool.ty.environment]
|
|
94
|
+
python-version = "3.12"
|
|
95
|
+
|
|
96
|
+
[tool.ty.terminal]
|
|
97
|
+
error-on-warning = true
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""Check registry: assembles every dimension's checks into one ordered list."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from harness_scorecard.checks import (
|
|
6
|
+
destructive_git,
|
|
7
|
+
egress,
|
|
8
|
+
observability,
|
|
9
|
+
provenance,
|
|
10
|
+
recovery,
|
|
11
|
+
secret_protection,
|
|
12
|
+
self_protection,
|
|
13
|
+
subagent_isolation,
|
|
14
|
+
tool_surface,
|
|
15
|
+
verification,
|
|
16
|
+
)
|
|
17
|
+
from harness_scorecard.checks.base import DIMENSIONS, Check, Dimension
|
|
18
|
+
|
|
19
|
+
# Registry of every check, flattened module by module. The module order below is the
# rubric's dimension order; a new dimension module is appended to the tuple when it lands.
ALL_CHECKS: list[Check] = [
    check
    for module in (
        secret_protection,
        egress,
        tool_surface,
        destructive_git,
        self_protection,
        verification,
        subagent_isolation,
        recovery,
        provenance,
        observability,
    )
    for check in module.CHECKS
]
|
|
32
|
+
|
|
33
|
+
# Dimension ids that have at least one registered check in this version, de-duplicated in
# first-seen order (used to report coverage honestly).
IMPLEMENTED_DIMENSION_IDS: list[str] = list({check.dimension: None for check in ALL_CHECKS})
|
|
35
|
+
|
|
36
|
+
__all__ = ["ALL_CHECKS", "DIMENSIONS", "IMPLEMENTED_DIMENSION_IDS", "Check", "Dimension"]
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""Check abstraction, the dimension catalog, and the effective-enforcement helper.
|
|
2
|
+
|
|
3
|
+
A :class:`Check` pairs immutable rubric metadata (id, weight, gate status) with an
|
|
4
|
+
``evaluate`` function that inspects a :class:`HarnessConfig` and returns a status. The
|
|
5
|
+
effective-floor helper (:func:`effective_block`) centralizes the bypass-aware rule so every
|
|
6
|
+
destructive-action check enforces it identically.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from collections.abc import Callable, Iterable, Sequence
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
|
|
14
|
+
from harness_scorecard.discovery import HarnessConfig
|
|
15
|
+
from harness_scorecard.models import (
|
|
16
|
+
CheckResult,
|
|
17
|
+
Detectability,
|
|
18
|
+
Grade,
|
|
19
|
+
Severity,
|
|
20
|
+
Status,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass(frozen=True, slots=True)
|
|
25
|
+
class Dimension:
|
|
26
|
+
id: str
|
|
27
|
+
name: str
|
|
28
|
+
weight: int
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# The complete rubric catalog (all ten dimensions), keyed by dimension id. Scoring runs
# over whichever dimensions have registered checks; the rest are reported as
# specced-but-pending.
DIMENSIONS: dict[str, Dimension] = {
    dimension.id: dimension
    for dimension in (
        Dimension("D1", "Secret protection & credential isolation", 5),
        Dimension("D2", "Egress / exfiltration control", 4),
        Dimension("D3", "Tool-surface & inbound-injection defense", 4),
        Dimension("D4", "Destructive-action & git safety", 5),
        Dimension("D5", "Harness self-protection & integrity", 5),
        Dimension("D6", "Verification gates", 3),
        Dimension("D7", "Subagent isolation & governance", 3),
        Dimension("D8", "Recovery / rollback safety", 2),
        Dimension("D9", "Memory / provenance hygiene", 2),
        Dimension("D10", "Observability / audit trail", 2),
    )
}
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass(slots=True)
|
|
48
|
+
class CheckOutcome:
|
|
49
|
+
"""The mutable result of evaluating a check: status + human-readable rationale."""
|
|
50
|
+
|
|
51
|
+
status: Status
|
|
52
|
+
message: str
|
|
53
|
+
evidence: list[str] = field(default_factory=list)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def passed(message: str, evidence: Iterable[str] = ()) -> CheckOutcome:
    """Build a ``Status.PASS`` outcome, copying ``evidence`` into a new list."""
    return CheckOutcome(status=Status.PASS, message=message, evidence=list(evidence))
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def partial(message: str, evidence: Iterable[str] = ()) -> CheckOutcome:
    """Build a ``Status.PARTIAL`` outcome, copying ``evidence`` into a new list."""
    return CheckOutcome(status=Status.PARTIAL, message=message, evidence=list(evidence))
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def failed(message: str, evidence: Iterable[str] = ()) -> CheckOutcome:
    """Build a ``Status.FAIL`` outcome, copying ``evidence`` into a new list."""
    return CheckOutcome(status=Status.FAIL, message=message, evidence=list(evidence))
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def not_applicable(message: str, evidence: Iterable[str] = ()) -> CheckOutcome:
    """Build a ``Status.NOT_APPLICABLE`` outcome for a check that does not apply here.

    Such checks are excluded from the dimension denominator.
    """
    return CheckOutcome(status=Status.NOT_APPLICABLE, message=message, evidence=list(evidence))
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@dataclass(frozen=True, slots=True)
|
|
74
|
+
class Check[ConfigT]:
|
|
75
|
+
"""A single rubric check: metadata plus an evaluation function.
|
|
76
|
+
|
|
77
|
+
Generic over the harness config it inspects (``HarnessConfig`` for Claude Code,
|
|
78
|
+
``CodexConfig`` for Codex) so the rubric metadata, ``run()``, and the dimension catalog
|
|
79
|
+
are shared across adapters while each check only sees the config shape it understands.
|
|
80
|
+
"""
|
|
81
|
+
|
|
82
|
+
id: str
|
|
83
|
+
dimension: str
|
|
84
|
+
title: str
|
|
85
|
+
weight: int
|
|
86
|
+
evaluate: Callable[[ConfigT], CheckOutcome]
|
|
87
|
+
severity: Severity = Severity.MEDIUM
|
|
88
|
+
detectability: Detectability = Detectability.STATIC
|
|
89
|
+
is_gate: bool = False
|
|
90
|
+
gate_cap: Grade | None = None
|
|
91
|
+
remediation: str = ""
|
|
92
|
+
|
|
93
|
+
def run(self, config: ConfigT) -> CheckResult:
|
|
94
|
+
outcome = self.evaluate(config)
|
|
95
|
+
return CheckResult(
|
|
96
|
+
id=self.id,
|
|
97
|
+
dimension=self.dimension,
|
|
98
|
+
title=self.title,
|
|
99
|
+
status=outcome.status,
|
|
100
|
+
weight=self.weight,
|
|
101
|
+
message=outcome.message,
|
|
102
|
+
severity=self.severity,
|
|
103
|
+
detectability=self.detectability,
|
|
104
|
+
is_gate=self.is_gate,
|
|
105
|
+
gate_cap=self.gate_cap,
|
|
106
|
+
remediation=self.remediation,
|
|
107
|
+
evidence=outcome.evidence,
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
@dataclass(slots=True)
|
|
112
|
+
class EffectiveFloor:
|
|
113
|
+
"""Whether a protection is present in the *effective* enforcement floor (rubric §3)."""
|
|
114
|
+
|
|
115
|
+
blocked: bool
|
|
116
|
+
sources: list[str]
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def hard_deny_covers(config: HarnessConfig, token_groups: Sequence[Sequence[str]]) -> bool:
    """Report whether an *effective* ``hard_deny`` rule matches one of ``token_groups``.

    Each group is an AND-set of substrings: a rule matches a group when it contains every
    token of that group. Groups are OR-ed, so a single matching (rule, group) pair is enough.
    Matching is case-insensitive on both the rules and the tokens.

    Args:
        config: Harness config; ``hard_deny_effective`` and ``hard_deny`` are read.
        token_groups: Alternative AND-groups of substrings to look for.

    Returns:
        ``False`` when ``hard_deny`` is not effective or when no rule contains all tokens of
        some non-empty group; ``True`` otherwise.
    """
    if not config.hard_deny_effective:
        return False
    # An empty group would satisfy ``all()`` vacuously and credit a guard that names
    # nothing, so only non-empty groups take part in matching. Tokens are lowercased to
    # agree with the lowercased rules.
    groups = [[token.lower() for token in group] for group in token_groups if group]
    rules = [rule.lower() for rule in config.hard_deny]
    return any(all(token in rule for token in group) for group in groups for rule in rules)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def effective_block(
    config: HarnessConfig,
    *,
    hooks: Sequence[str] = (),
    deny_needles: Sequence[str] = (),
    hard_deny_tokens: Sequence[Sequence[str]] = (),
    event: str = "PreToolUse",
    matcher: str | None = "Bash",
) -> EffectiveFloor:
    """Decide whether the effective enforcement floor blocks an action.

    A guard counts as present when any of these holds: one of ``hooks`` is registered for
    ``event``/``matcher``; a ``permissions.deny`` entry matches one of ``deny_needles``; or an
    effective (non-bypass) ``hard_deny`` rule matches ``hard_deny_tokens``. A ``hard_deny``
    rule under bypass mode contributes nothing.

    Returns:
        An ``EffectiveFloor`` whose ``sources`` lists every guard found (``hook:<name>``,
        ``permissions.deny``, ``hard_deny``) and whose ``blocked`` is true when that list is
        non-empty.
    """
    found: list[str] = []
    # One entry per hook name that the config reports as registered.
    for hook_name in hooks:
        if config.has_hook(event, hook_name, matcher):
            found.append(f"hook:{hook_name}")
    # A single deny match is enough; it is recorded once regardless of how many needles hit.
    for needle in deny_needles:
        if config.deny_matches(needle):
            found.append("permissions.deny")
            break
    if hard_deny_covers(config, hard_deny_tokens):
        found.append("hard_deny")
    return EffectiveFloor(blocked=len(found) > 0, sources=found)
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"""D4 - Destructive-action & git safety.
|
|
2
|
+
|
|
3
|
+
Every block check here resolves against the *effective* enforcement floor, so a guard that
|
|
4
|
+
exists only in an inert ``hard_deny`` block under bypass mode scores as absent.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from harness_scorecard.checks.base import (
|
|
10
|
+
Check,
|
|
11
|
+
CheckOutcome,
|
|
12
|
+
effective_block,
|
|
13
|
+
failed,
|
|
14
|
+
partial,
|
|
15
|
+
passed,
|
|
16
|
+
)
|
|
17
|
+
from harness_scorecard.discovery import HarnessConfig
|
|
18
|
+
from harness_scorecard.models import Detectability, Grade, Severity
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _bypass_note(config: HarnessConfig) -> list[str]:
|
|
22
|
+
if config.is_bypass:
|
|
23
|
+
return ["defaultMode=bypassPermissions: autoMode.hard_deny is INERT"]
|
|
24
|
+
return []
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _check_push_to_protected_branch(config: HarnessConfig) -> CheckOutcome:
    """Check that a push to main/master is blocked by the effective floor.

    Passes when ``effective_block`` finds a git-safety style hook, a matching
    ``permissions.deny`` entry, or an effective ``hard_deny`` rule; fails otherwise, with a
    bypass note in the evidence when the harness runs in bypass mode.
    """
    floor = effective_block(
        config,
        hooks=("git-safety", "git-guard", "protect-branch"),
        deny_needles=("push origin main", "push origin master", "git push"),
        hard_deny_tokens=(("push", "main"), ("push", "master")),
    )
    if not floor.blocked:
        # _bypass_note returns a fresh list on every call, so extending it is safe.
        evidence = _bypass_note(config)
        evidence.append("no git-safety hook or deny entry found")
        return failed(
            "Push to main/master is not blocked by any effective guard.",
            evidence=evidence,
        )
    return passed(
        "Push to a protected branch is blocked by the effective floor.",
        evidence=floor.sources,
    )
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _check_catastrophic_deletion(config: HarnessConfig) -> CheckOutcome:
    """Check that catastrophic ``rm -rf`` deletion is blocked by the effective floor.

    Passes when ``effective_block`` finds a dangerous-command hook, a ``permissions.deny``
    entry for a root/home ``rm -rf``, or an effective ``hard_deny`` rule mentioning
    ``rm -rf``; fails otherwise.
    """
    floor = effective_block(
        config,
        hooks=("block-dangerous-cmds", "defer-destructive", "dangerous"),
        deny_needles=("rm -rf /", "rm -rf ~", "rm -rf /*"),
        hard_deny_tokens=(("rm -rf",),),
    )
    if not floor.blocked:
        return failed(
            "No effective guard against catastrophic rm -rf deletion.",
            evidence=_bypass_note(config),
        )
    return passed("Catastrophic deletion is blocked by the effective floor.", floor.sources)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _check_destructive_db(config: HarnessConfig) -> CheckOutcome:
    """Check that destructive database operations are guarded by the effective floor.

    No ``permissions.deny`` needles are used here; only a db-guard style hook or an
    effective ``hard_deny`` rule can satisfy the check.
    """
    floor = effective_block(
        config,
        hooks=("db-guard", "database-guard"),
        deny_needles=(),
        hard_deny_tokens=(("destructive", "db"), ("database",), ("db", "host")),
    )
    if not floor.blocked:
        return failed(
            "No effective guard against destructive DB operations on non-local hosts.",
            evidence=_bypass_note(config),
        )
    return passed("Destructive DB operations are guarded.", floor.sources)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _check_dependency_install_gate(config: HarnessConfig) -> CheckOutcome:
    """Check for a PreToolUse Bash hook that gates dependency installs.

    Either a ``confirm-token`` or a ``lockfile-freeze`` hook satisfies the check.
    """
    gate_hooks = ("confirm-token", "lockfile-freeze")
    if any(config.has_hook("PreToolUse", name, matcher="Bash") for name in gate_hooks):
        return passed("Dependency installs require a confirm-token / lockfile freeze.")
    return failed("No gate on dependency installs; unvetted packages can be pulled in.")
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _check_force_push_policy(config: HarnessConfig) -> CheckOutcome:
    """Check for a force-push / history-rewrite guard.

    Passes when a PreToolUse Bash ``git-safety`` hook is registered. Otherwise returns a
    partial result when any entry of ``config.rule_files`` contains ``force`` or
    ``git-safety`` (case-insensitive), because that is advisory only. Fails when neither
    is found.
    """
    if config.has_hook("PreToolUse", "git-safety", matcher="Bash"):
        return passed("A git-safety hook covers force-push / history-rewrite.")
    for rule in config.rule_files:
        lowered = rule.lower()
        if "force" in lowered or "git-safety" in lowered:
            return partial(
                "Force-push policy is documented in rules/ but not enforced by a hook.",
                evidence=["advisory only"],
            )
    return failed("No force-push / history-rewrite guard or documented policy.")
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
CHECKS: list[Check] = [
    # Gate check: a failure here caps the overall grade at C.
    Check(
        id="HS-D4-01",
        dimension="D4",
        title="Push to protected branch effectively blocked",
        evaluate=_check_push_to_protected_branch,
        weight=5,
        severity=Severity.CRITICAL,
        is_gate=True,
        gate_cap=Grade.C,
        remediation=(
            "Block push to main/master via a PreToolUse Bash hook or a deny entry "
            "(not hard_deny alone under bypass)."
        ),
    ),
    # Shallow-depth rm -rf protection.
    Check(
        id="HS-D4-02",
        dimension="D4",
        title="Catastrophic deletion blocked",
        evaluate=_check_catastrophic_deletion,
        weight=4,
        severity=Severity.HIGH,
        remediation="Add a dangerous-command hook and deny rm -rf at shallow depth.",
    ),
    # Destructive database operations against non-local hosts.
    Check(
        id="HS-D4-03",
        dimension="D4",
        title="Destructive DB ops on non-local hosts blocked",
        evaluate=_check_destructive_db,
        weight=4,
        severity=Severity.HIGH,
        remediation=(
            "Add a PreToolUse Bash db-guard hook that blocks destructive ops on non-local hosts."
        ),
    ),
    # Dependency installs must pass a confirm-token or lockfile-freeze gate.
    Check(
        id="HS-D4-04",
        dimension="D4",
        title="Dependency-install / lockfile gate",
        evaluate=_check_dependency_install_gate,
        weight=3,
        severity=Severity.MEDIUM,
        remediation="Require a confirm-token for *-add/install, or add a lockfile-freeze guard.",
    ),
    # Only partially detectable statically: documentation alone earns a partial result.
    Check(
        id="HS-D4-05",
        dimension="D4",
        title="Force-push / history-rewrite policy",
        evaluate=_check_force_push_policy,
        weight=3,
        severity=Severity.MEDIUM,
        detectability=Detectability.PARTIAL,
        remediation="Enforce a no-force-push policy via the git-safety hook, not docs alone.",
    ),
]
|