nemesis-eval 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. nemesis_eval-0.2.0/.github/CODEOWNERS +6 -0
  2. nemesis_eval-0.2.0/.github/ISSUE_TEMPLATE/bug_report.yml +45 -0
  3. nemesis_eval-0.2.0/.github/ISSUE_TEMPLATE/config.yml +5 -0
  4. nemesis_eval-0.2.0/.github/ISSUE_TEMPLATE/feature_request.yml +28 -0
  5. nemesis_eval-0.2.0/.github/dependabot.yml +22 -0
  6. nemesis_eval-0.2.0/.github/pull_request_template.md +23 -0
  7. nemesis_eval-0.2.0/.github/workflows/ci.yml +35 -0
  8. nemesis_eval-0.2.0/.github/workflows/codeql.yml +35 -0
  9. nemesis_eval-0.2.0/.github/workflows/dependency-review.yml +23 -0
  10. nemesis_eval-0.2.0/.github/workflows/nemesis-check.yml +26 -0
  11. nemesis_eval-0.2.0/.github/workflows/release-testpypi.yml +73 -0
  12. nemesis_eval-0.2.0/.github/workflows/release.yml +84 -0
  13. nemesis_eval-0.2.0/.github/workflows/scorecard.yml +47 -0
  14. nemesis_eval-0.2.0/.gitignore +39 -0
  15. nemesis_eval-0.2.0/.pre-commit-config.yaml +11 -0
  16. nemesis_eval-0.2.0/AGENTS.md +48 -0
  17. nemesis_eval-0.2.0/BUILD_SPEC.md +299 -0
  18. nemesis_eval-0.2.0/CHANGELOG.md +42 -0
  19. nemesis_eval-0.2.0/CLAUDE.md +183 -0
  20. nemesis_eval-0.2.0/CODE_OF_CONDUCT.md +58 -0
  21. nemesis_eval-0.2.0/CONTRIBUTING.md +52 -0
  22. nemesis_eval-0.2.0/LICENSE +21 -0
  23. nemesis_eval-0.2.0/PKG-INFO +294 -0
  24. nemesis_eval-0.2.0/README.md +260 -0
  25. nemesis_eval-0.2.0/SECURITY.md +26 -0
  26. nemesis_eval-0.2.0/action.yml +98 -0
  27. nemesis_eval-0.2.0/data/failure_modes.yaml +139 -0
  28. nemesis_eval-0.2.0/docs/banner.png +0 -0
  29. nemesis_eval-0.2.0/pyproject.toml +66 -0
  30. nemesis_eval-0.2.0/src/nemesis/__init__.py +1 -0
  31. nemesis_eval-0.2.0/src/nemesis/__main__.py +185 -0
  32. nemesis_eval-0.2.0/src/nemesis/catalog.py +57 -0
  33. nemesis_eval-0.2.0/src/nemesis/collect.py +103 -0
  34. nemesis_eval-0.2.0/src/nemesis/detectors/__init__.py +53 -0
  35. nemesis_eval-0.2.0/src/nemesis/detectors/agent_output_not_tied_to_exact_repo_state.py +36 -0
  36. nemesis_eval-0.2.0/src/nemesis/detectors/artifact_presence_not_verified.py +40 -0
  37. nemesis_eval-0.2.0/src/nemesis/detectors/base.py +85 -0
  38. nemesis_eval-0.2.0/src/nemesis/detectors/branch_cleanup_not_verified.py +45 -0
  39. nemesis_eval-0.2.0/src/nemesis/detectors/declared_success_too_early.py +52 -0
  40. nemesis_eval-0.2.0/src/nemesis/detectors/dirty_worktree_after_closeout.py +45 -0
  41. nemesis_eval-0.2.0/src/nemesis/detectors/github_merge_treated_as_full_success.py +47 -0
  42. nemesis_eval-0.2.0/src/nemesis/detectors/hot_file_conflict_risk.py +39 -0
  43. nemesis_eval-0.2.0/src/nemesis/detectors/incomplete_implementation_prompts.py +36 -0
  44. nemesis_eval-0.2.0/src/nemesis/detectors/local_status_ignored_before_next_phase.py +36 -0
  45. nemesis_eval-0.2.0/src/nemesis/detectors/missing_root_doctrine_updates.py +39 -0
  46. nemesis_eval-0.2.0/src/nemesis/detectors/old_session_folders_leaking_files.py +37 -0
  47. nemesis_eval-0.2.0/src/nemesis/detectors/patch_vs_new_build_confusion.py +39 -0
  48. nemesis_eval-0.2.0/src/nemesis/detectors/repo_drift_after_merge.py +36 -0
  49. nemesis_eval-0.2.0/src/nemesis/detectors/skill_bloat.py +37 -0
  50. nemesis_eval-0.2.0/src/nemesis/detectors/source_of_truth_ambiguity_across_tools.py +39 -0
  51. nemesis_eval-0.2.0/src/nemesis/detectors/stale_local_checkout_treated_as_current.py +36 -0
  52. nemesis_eval-0.2.0/src/nemesis/detectors/testing_without_source_verification.py +37 -0
  53. nemesis_eval-0.2.0/src/nemesis/detectors/unsafe_audit_probing_language_in_prompts.py +36 -0
  54. nemesis_eval-0.2.0/src/nemesis/detectors/untracked_files_appearing_unexpectedly.py +39 -0
  55. nemesis_eval-0.2.0/src/nemesis/detectors/workflow_drift_across_tools.py +36 -0
  56. nemesis_eval-0.2.0/src/nemesis/eval.py +123 -0
  57. nemesis_eval-0.2.0/src/nemesis/models.py +30 -0
  58. nemesis_eval-0.2.0/src/nemesis/py.typed +0 -0
  59. nemesis_eval-0.2.0/src/nemesis/report.py +112 -0
  60. nemesis_eval-0.2.0/src/nemesis/test_agent.py +263 -0
  61. nemesis_eval-0.2.0/tests/detectors/__init__.py +0 -0
  62. nemesis_eval-0.2.0/tests/detectors/test_declared_success_too_early.py +122 -0
  63. nemesis_eval-0.2.0/tests/detectors/test_detector_contracts.py +124 -0
  64. nemesis_eval-0.2.0/tests/test_agent.py +101 -0
  65. nemesis_eval-0.2.0/tests/test_catalog.py +84 -0
  66. nemesis_eval-0.2.0/tests/test_collect.py +144 -0
  67. nemesis_eval-0.2.0/tests/test_eval.py +79 -0
  68. nemesis_eval-0.2.0/tests/test_report.py +59 -0
  69. nemesis_eval-0.2.0/tests/test_smoke.py +8 -0
@@ -0,0 +1,6 @@
1
+ # Code owners for nemesis-eval.
2
+ # These owners are automatically requested for review on any pull request.
3
+ # Combined with branch protection (require review from Code Owners), this
4
+ # ensures no change reaches main without the maintainer's review.
5
+
6
+ * @LueBangs-coder
@@ -0,0 +1,45 @@
1
+ name: Bug report
2
+ description: Report a problem with Nemesis (a false detection, a crash, wrong output).
3
+ title: "[Bug]: "
4
+ labels: ["bug"]
5
+ body:
6
+ - type: markdown
7
+ attributes:
8
+ value: |
9
+ Thanks for taking the time to file a bug. Please fill out the sections below so it can be reproduced.
10
+ - type: textarea
11
+ id: what-happened
12
+ attributes:
13
+ label: What happened?
14
+ description: A clear description of the bug, including any error output.
15
+ placeholder: Tell us what you saw...
16
+ validations:
17
+ required: true
18
+ - type: textarea
19
+ id: expected
20
+ attributes:
21
+ label: What did you expect to happen?
22
+ validations:
23
+ required: true
24
+ - type: textarea
25
+ id: repro
26
+ attributes:
27
+ label: Steps to reproduce
28
+ description: The exact command(s) you ran, e.g. `nemesis check --repo . --claimed-success`.
29
+ render: shell
30
+ validations:
31
+ required: true
32
+ - type: input
33
+ id: version
34
+ attributes:
35
+ label: Nemesis version
36
+ description: From `pip show nemesis-eval` (Version), or the release tag / commit SHA.
37
+ validations:
38
+ required: false
39
+ - type: input
40
+ id: python
41
+ attributes:
42
+ label: Python version
43
+ placeholder: "3.13.6"
44
+ validations:
45
+ required: false
@@ -0,0 +1,5 @@
1
+ blank_issues_enabled: true
2
+ contact_links:
3
+ - name: Security vulnerability
4
+ url: https://github.com/LueBangs-coder/nemesis-eval/blob/main/SECURITY.md
5
+ about: Please report security issues privately by following the security policy, not as a public issue.
@@ -0,0 +1,28 @@
1
+ name: Feature request
2
+ description: Suggest a new detector, capability, or improvement.
3
+ title: "[Feature]: "
4
+ labels: ["enhancement"]
5
+ body:
6
+ - type: markdown
7
+ attributes:
8
+ value: |
9
+ New detectors should trace back to a real, observed agent failure mode. The more concrete the example, the better.
10
+ - type: textarea
11
+ id: problem
12
+ attributes:
13
+ label: What problem would this solve?
14
+ description: Is this related to a real agent failure you've observed? Describe it.
15
+ validations:
16
+ required: true
17
+ - type: textarea
18
+ id: proposal
19
+ attributes:
20
+ label: Proposed solution
21
+ validations:
22
+ required: true
23
+ - type: textarea
24
+ id: alternatives
25
+ attributes:
26
+ label: Alternatives considered
27
+ validations:
28
+ required: false
@@ -0,0 +1,22 @@
1
+ version: 2
2
+ updates:
3
+ # Keep GitHub Actions (workflows + the composite action.yml) up to date.
4
+ # Dependabot bumps the pinned commit SHAs and keeps the # vX version comment.
5
+ - package-ecosystem: "github-actions"
6
+ directory: "/"
7
+ schedule:
8
+ interval: "weekly"
9
+ commit-message:
10
+ prefix: "ci"
11
+ groups:
12
+ actions:
13
+ patterns:
14
+ - "*"
15
+
16
+ # Keep Python dependencies (pyproject.toml) up to date.
17
+ - package-ecosystem: "pip"
18
+ directory: "/"
19
+ schedule:
20
+ interval: "weekly"
21
+ commit-message:
22
+ prefix: "deps"
@@ -0,0 +1,23 @@
1
+ ## Summary
2
+
3
+ <!-- What does this PR change, and why? -->
4
+
5
+ ## Type of change
6
+
7
+ - [ ] Bug fix
8
+ - [ ] New detector / feature
9
+ - [ ] Documentation
10
+ - [ ] CI / tooling
11
+ - [ ] Refactor (no behavior change)
12
+
13
+ ## Checklist
14
+
15
+ - [ ] `pytest` passes locally
16
+ - [ ] `pre-commit run --all-files` passes (ruff + black)
17
+ - [ ] New or changed behavior is covered by tests
18
+ - [ ] Docs / README / CHANGELOG updated if needed
19
+ - [ ] Any new detector traces back to a real failure mode in `data/failure_modes.yaml`
20
+
21
+ ## Related issues
22
+
23
+ <!-- e.g. Closes #123 -->
@@ -0,0 +1,35 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ fail-fast: false
14
+ matrix:
15
+ python-version: ["3.11", "3.12", "3.13"]
16
+
17
+ steps:
18
+ - name: Check out the repository
19
+ uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
20
+
21
+ - name: Set up Python ${{ matrix.python-version }}
22
+ uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
23
+ with:
24
+ python-version: ${{ matrix.python-version }}
25
+
26
+ - name: Install the package with dev dependencies
27
+ run: |
28
+ python -m pip install --upgrade pip
29
+ pip install -e ".[dev]"
30
+
31
+ - name: Run the test suite
32
+ run: pytest -q
33
+
34
+ - name: Run linters and formatters
35
+ run: pre-commit run --all-files
@@ -0,0 +1,35 @@
1
+ name: CodeQL
2
+
3
+ # Static analysis (security + quality) for the Python codebase. Results appear
4
+ # under the repository's Security -> Code scanning alerts.
5
+
6
+ on:
7
+ push:
8
+ branches: [main]
9
+ pull_request:
10
+ branches: [main]
11
+ schedule:
12
+ - cron: "27 4 * * 1" # weekly, Monday 04:27 UTC
13
+
14
+ permissions:
15
+ contents: read
16
+
17
+ jobs:
18
+ analyze:
19
+ name: Analyze (python)
20
+ runs-on: ubuntu-latest
21
+ permissions:
22
+ security-events: write # upload CodeQL results
23
+ contents: read
24
+ steps:
25
+ - name: Check out the repository
26
+ uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
27
+
28
+ - name: Initialize CodeQL
29
+ uses: github/codeql-action/init@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2
30
+ with:
31
+ languages: python
32
+ queries: security-and-quality
33
+
34
+ - name: Perform CodeQL analysis
35
+ uses: github/codeql-action/analyze@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2
@@ -0,0 +1,23 @@
1
+ name: Dependency review
2
+
3
+ # Fails a pull request that introduces a dependency with a known vulnerability
4
+ # or (once `allow-licenses`/`deny-licenses` is configured) an incompatible license. Runs on the diff only — fast and PR-scoped.
5
+
6
+ on:
7
+ pull_request:
8
+ branches: [main]
9
+
10
+ permissions:
11
+ contents: read
12
+
13
+ jobs:
14
+ dependency-review:
15
+ runs-on: ubuntu-latest
16
+ steps:
17
+ - name: Check out the repository
18
+ uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
19
+
20
+ - name: Review dependency changes
21
+ uses: actions/dependency-review-action@a1d282b36b6f3519aa1f3fc636f609c47dddb294 # v4
22
+ with:
23
+ fail-on-severity: moderate
@@ -0,0 +1,26 @@
1
+ name: Nemesis self-check
2
+
3
+ # Dogfooding: run the Nemesis action against this repository on every change.
4
+ # It also serves as the integration test for action.yml itself.
5
+
6
+ on:
7
+ push:
8
+ branches: [main]
9
+ pull_request:
10
+ branches: [main]
11
+
12
+ permissions:
13
+ contents: read
14
+
15
+ jobs:
16
+ nemesis:
17
+ runs-on: ubuntu-latest
18
+ steps:
19
+ - name: Check out the repository
20
+ uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
21
+
22
+ - name: Run Nemesis against this repo
23
+ uses: ./
24
+ with:
25
+ repo: "."
26
+ fail-on-detect: "true"
@@ -0,0 +1,73 @@
1
+ name: Rehearse release on TestPyPI
2
+
3
+ # A dry run of the real release against TestPyPI, so the production publish is
4
+ # never the first time the pipeline runs. Triggers on a pre-release (e.g. a
5
+ # tag like v0.2.0rc1 published as a GitHub pre-release) or manually.
6
+ #
7
+ # Same security model as release.yml: Trusted Publishing (OIDC), no token, no
8
+ # secret. Configure a TestPyPI trusted publisher (test.pypi.org -> Publishing)
9
+ # pointing at:
10
+ # owner: LueBangs-coder repo: nemesis-eval
11
+ # workflow: release-testpypi.yml environment: testpypi
12
+
13
+ on:
14
+ release:
15
+ types: [prereleased]
16
+ workflow_dispatch:
17
+
18
+ concurrency:
19
+ group: release-testpypi-${{ github.ref }}
20
+ cancel-in-progress: false
21
+
22
+ permissions:
23
+ contents: read
24
+
25
+ jobs:
26
+ build:
27
+ name: Build & verify distributions
28
+ runs-on: ubuntu-latest
29
+ steps:
30
+ - name: Check out the repository
31
+ uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
32
+
33
+ - name: Set up Python
34
+ uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
35
+ with:
36
+ python-version: "3.x"
37
+
38
+ - name: Build sdist and wheel
39
+ run: |
40
+ python -m pip install --upgrade pip build twine
41
+ python -m build
42
+
43
+ - name: Verify metadata renders
44
+ run: python -m twine check dist/*
45
+
46
+ - name: Upload distributions as a build artifact
47
+ uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
48
+ with:
49
+ name: dist
50
+ path: dist/
51
+
52
+ publish-testpypi:
53
+ name: Publish to TestPyPI
54
+ needs: build
55
+ runs-on: ubuntu-latest
56
+ environment:
57
+ name: testpypi
58
+ url: https://test.pypi.org/p/nemesis-eval
59
+ permissions:
60
+ id-token: write # OIDC token for Trusted Publishing — the only elevated grant.
61
+ steps:
62
+ - name: Download the built distributions
63
+ uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
64
+ with:
65
+ name: dist
66
+ path: dist/
67
+
68
+ - name: Publish to TestPyPI via Trusted Publishing
69
+ uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b # release/v1
70
+ with:
71
+ repository-url: https://test.pypi.org/legacy/
72
+ # TestPyPI keeps old versions; don't fail a rehearsal on a re-run.
73
+ skip-existing: true
@@ -0,0 +1,84 @@
1
+ name: Release to PyPI
2
+
3
+ # Publishes nemesis-eval to PyPI when a GitHub Release is published.
4
+ #
5
+ # Security model: PyPI Trusted Publishing (OIDC). There is NO API token and
6
+ # NO secret stored in this repository. The publish job mints a short-lived
7
+ # identity for a single run via OpenID Connect. Configure the trusted
8
+ # publisher once on PyPI (Your account -> Publishing) pointing at:
9
+ # owner: LueBangs-coder repo: nemesis-eval
10
+ # workflow: release.yml environment: pypi
11
+
12
+ on:
13
+ release:
14
+ types: [published]
15
+
16
+ # Never run two publishes for the same ref at once.
17
+ concurrency:
18
+ group: release-${{ github.ref }}
19
+ cancel-in-progress: false
20
+
21
+ permissions:
22
+ contents: read
23
+
24
+ jobs:
25
+ build:
26
+ name: Build & verify distributions
27
+ runs-on: ubuntu-latest
28
+ steps:
29
+ - name: Check out the repository
30
+ uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
31
+
32
+ - name: Set up Python
33
+ uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
34
+ with:
35
+ python-version: "3.x"
36
+
37
+ - name: Build sdist and wheel
38
+ run: |
39
+ python -m pip install --upgrade pip build twine
40
+ python -m build
41
+
42
+ - name: Verify metadata renders on PyPI
43
+ run: python -m twine check dist/*
44
+
45
+ - name: Confirm the build version matches the release tag
46
+ env:
47
+ RELEASE_TAG: ${{ github.event.release.tag_name }}
48
+ run: |
49
+ set -euo pipefail
50
+ wheel="$(ls dist/*.whl)"
51
+ # nemesis_eval-0.2.0-py3-none-any.whl -> 0.2.0
52
+ build_version="$(basename "$wheel" | cut -d- -f2)"
53
+ tag_version="${RELEASE_TAG#v}"
54
+ echo "build=$build_version tag=$tag_version"
55
+ if [ "$build_version" != "$tag_version" ]; then
56
+ echo "::error::Release tag ($tag_version) does not match built version ($build_version)."
57
+ exit 1
58
+ fi
59
+
60
+ - name: Upload distributions as a build artifact
61
+ uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
62
+ with:
63
+ name: dist
64
+ path: dist/
65
+
66
+ publish:
67
+ name: Publish to PyPI
68
+ needs: build
69
+ runs-on: ubuntu-latest
70
+ # The trusted publisher on PyPI is bound to this environment name.
71
+ environment:
72
+ name: pypi
73
+ url: https://pypi.org/p/nemesis-eval
74
+ permissions:
75
+ id-token: write # OIDC token for Trusted Publishing — the only elevated grant.
76
+ steps:
77
+ - name: Download the built distributions
78
+ uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
79
+ with:
80
+ name: dist
81
+ path: dist/
82
+
83
+ - name: Publish to PyPI via Trusted Publishing
84
+ uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b # release/v1
@@ -0,0 +1,47 @@
1
+ name: OpenSSF Scorecard
2
+
3
+ # Scores the repository's security posture (branch protection, pinned deps,
4
+ # token permissions, etc.) and uploads results to Security -> Code scanning.
5
+ # Enables the public Scorecard badge.
6
+
7
+ on:
8
+ branch_protection_rule:
9
+ schedule:
10
+ - cron: "23 5 * * 1" # weekly, Monday 05:23 UTC
11
+ push:
12
+ branches: [main]
13
+
14
+ permissions:
15
+ contents: read
16
+
17
+ jobs:
18
+ analysis:
19
+ name: Scorecard analysis
20
+ runs-on: ubuntu-latest
21
+ permissions:
22
+ security-events: write # upload the SARIF results
23
+ id-token: write # publish results for the public badge
24
+ steps:
25
+ - name: Check out the repository
26
+ uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
27
+ with:
28
+ persist-credentials: false
29
+
30
+ - name: Run analysis
31
+ uses: ossf/scorecard-action@4eaacf0543bb3f2c246792bd56e8cdeffafb205a # v2.4.3
32
+ with:
33
+ results_file: results.sarif
34
+ results_format: sarif
35
+ publish_results: true
36
+
37
+ - name: Upload artifact
38
+ uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
39
+ with:
40
+ name: SARIF file
41
+ path: results.sarif
42
+ retention-days: 5
43
+
44
+ - name: Upload to code scanning
45
+ uses: github/codeql-action/upload-sarif@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2
46
+ with:
47
+ sarif_file: results.sarif
@@ -0,0 +1,39 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Distribution / packaging
7
+ build/
8
+ dist/
9
+ *.egg-info/
10
+ *.egg
11
+
12
+ # Virtual environments
13
+ .venv/
14
+ venv/
15
+ env/
16
+ ENV/
17
+
18
+ # Test / coverage artifacts
19
+ .pytest_cache/
20
+ .coverage
21
+ htmlcov/
22
+ .tox/
23
+
24
+ # Editor / IDE
25
+ .vscode/
26
+ .idea/
27
+ *.swp
28
+ *.swo
29
+
30
+ # OS metadata
31
+ .DS_Store
32
+ Thumbs.db
33
+ desktop.ini
34
+
35
+ # Claude Code per-project state (local-only)
36
+ .claude/
37
+
38
+ # Internal operational log — local-only, not part of the public product
39
+ WORKLOG.md
@@ -0,0 +1,11 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ rev: v0.6.9
4
+ hooks:
5
+ - id: ruff
6
+ args: [--fix]
7
+
8
+ - repo: https://github.com/psf/black
9
+ rev: 24.10.0
10
+ hooks:
11
+ - id: black
@@ -0,0 +1,48 @@
1
+ # AGENTS.md — Nemesis
2
+
3
+ Operating contract for Codex (and any Codex-family coding agent) working in the Nemesis repo.
4
+
5
+ **Project:** Nemesis — Python evaluation harness for agentic failure modes.
6
+ **Reference spec:** `BUILD_SPEC.md`.
7
+ **Parent project:** Pantheon (private operating context).
8
+
9
+ ---
10
+
11
+ ## Canonical doctrine
12
+
13
+ The full doctrine, operating rules, stop conditions, and current-phase definition live in `CLAUDE.md`. **That file is the single source of truth for how any agent operates in this repo.** Codex must apply the same doctrine as Claude Code — no agent gets a softer contract.
14
+
15
+ Read `CLAUDE.md` end to end before doing anything in this repo. Then read `BUILD_SPEC.md`.
16
+
17
+ This file (`AGENTS.md`) exists because Codex looks for it by convention. Its job is to point Codex at `CLAUDE.md` and add the Codex-specific notes that don't apply to Claude Code.
18
+
19
+ ---
20
+
21
+ ## Why this file exists at all
22
+
23
+ In Pantheon doctrine, failure mode 14 — *missing root doctrine updates* — and failure mode 20 — *workflow drift across tools* — both reduce to the same root cause: agents on shared substrate operating from different rule sets.
24
+
25
+ The fix is centralized doctrine in shared root files. `CLAUDE.md` and `AGENTS.md` are paired files. They must not drift.
26
+
27
+ If either file changes, the other must be updated in the same commit, or the change is incomplete.
28
+
29
+ ---
30
+
31
+ ## Codex-specific notes
32
+
33
+ These supplement `CLAUDE.md`. They do not override it.
34
+
35
+ - **Codex's tool surface differs from Claude Code's.** Claude Code's `Bash`/`PreToolUse` hook layer is where Terminus enforces boundaries. Codex's sandbox and tool-call surface are where Ananke does the equivalent work. The guardian model is the same; the implementation path is not.
36
+ - **Codex sandbox sessions are ephemeral by default.** State that should persist across sessions must be written to the repo, not to the Codex session. This is the operational shape of failure mode 5 (*old session folders leaking files*) — the fix is the same as for Claude Code: explicit archive or cleanup before phase closeout.
37
+ - **Codex's commit behavior:** all commits authored by Codex must be reviewed against the canonical checkout before declaring success. Codex's session report alone is not proof. See `CLAUDE.md` operating rule 7.
38
+ - **Identity stamping (Pantheon's `argus-identity-migration`):** when work moves from Codex to Claude Code or vice versa, the handoff prompt must include: branch, HEAD, upstream parity, worktree status, and a brief stake-in-the-ground description of what is and is not done. Anything weaker is an incomplete handoff and is an instance of failure mode 11.
39
+
40
+ ---
41
+
42
+ ## When Codex finishes a phase
43
+
44
+ Follow the same closeout as Claude Code (see "When the phase passes" in `CLAUDE.md`). The phase closes in `CLAUDE.md` *and* in this file. Identity stamping is mandatory on phase boundaries.
45
+
46
+ ---
47
+
48
+ *Built in the Pantheon's shadow.*