agent-dispatch 0.4.0__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/.github/workflows/ci.yml +4 -0
  2. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/.github/workflows/publish.yml +5 -1
  3. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/CHANGELOG.md +64 -1
  4. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/PKG-INFO +15 -7
  5. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/README.md +14 -6
  6. agent_dispatch-0.5.0/SECURITY.md +77 -0
  7. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/agents.example.yaml +1 -0
  8. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/pyproject.toml +23 -1
  9. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/src/agent_dispatch/__init__.py +1 -1
  10. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/src/agent_dispatch/cache.py +15 -1
  11. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/src/agent_dispatch/cli.py +7 -4
  12. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/src/agent_dispatch/config.py +12 -2
  13. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/src/agent_dispatch/jobs.py +92 -3
  14. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/src/agent_dispatch/models.py +1 -0
  15. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/src/agent_dispatch/runner.py +45 -2
  16. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/src/agent_dispatch/server.py +131 -18
  17. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/tests/test_cache.py +33 -2
  18. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/tests/test_cli.py +1 -0
  19. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/tests/test_config.py +12 -0
  20. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/tests/test_jobs.py +134 -1
  21. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/tests/test_models.py +5 -4
  22. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/tests/test_runner.py +56 -0
  23. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/tests/test_server.py +126 -7
  24. agent_dispatch-0.4.0/SECURITY.md +0 -22
  25. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/.github/dependabot.yml +0 -0
  26. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/.gitignore +0 -0
  27. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/LICENSE +0 -0
  28. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/assets/mascot.png +0 -0
  29. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/tests/__init__.py +0 -0
  30. {agent_dispatch-0.4.0 → agent_dispatch-0.5.0}/tests/conftest.py +0 -0
@@ -6,6 +6,10 @@ on:
6
6
  pull_request:
7
7
  branches: [main]
8
8
 
9
+ # Least privilege: CI only needs to read the repo.
10
+ permissions:
11
+ contents: read
12
+
9
13
  jobs:
10
14
  test:
11
15
  runs-on: ubuntu-latest
@@ -4,12 +4,16 @@ on:
4
4
  release:
5
5
  types: [published]
6
6
 
7
+ # Default to no privileges; the publish job opts into exactly what it needs.
8
+ permissions: {}
9
+
7
10
  jobs:
8
11
  publish:
9
12
  runs-on: ubuntu-latest
10
13
  environment: pypi
11
14
  permissions:
12
- id-token: write
15
+ id-token: write # OIDC token for PyPI Trusted Publisher
16
+ contents: read # checkout the tagged source
13
17
  steps:
14
18
  - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
15
19
 
@@ -7,6 +7,68 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.5.0] - 2026-06-01
11
+
12
+ Security-hardening release. A multi-agent audit of the codebase surfaced
13
+ several issues; the confirmed ones are fixed here, plus job cancellation,
14
+ cache bounding, and stale-job recovery.
15
+
16
+ ### Security
17
+ - **Path traversal in async jobs (fixed).** `dispatch_status`, `dispatch_wait`,
18
+ and `fetch_result` accept a caller-supplied `job_id`/`ref` that flowed
19
+ straight into `JobStore`'s file-path construction. A crafted value such as
20
+ `../../secret` could read any Job-shaped `.json` file outside the jobs
21
+ directory. Job ids are now validated against `^[0-9a-f]{32}$` at the tool
22
+ boundary (`_validate_ref`), in `JobStore.get`, and in `JobStore._path`
23
+ (defense in depth). Malformed ids are rejected without touching the
24
+ filesystem. New helper `jobs.is_valid_job_id`.
25
+ - **Argument/flag injection via structured CLI fields (fixed).** A
26
+ `session_id` (caller-controlled in `dispatch_session`) — or a misconfigured
27
+ `model`, `permission_mode`, or tool name — that started with `-` was placed
28
+ in the argument position after a flag (e.g. `--resume <session_id>`) and the
29
+ `claude` CLI parsed it as a *new* flag, allowing options like
30
+ `--permission-mode bypassPermissions` to be smuggled in. `_build_command`
31
+ now rejects any such value via `_reject_flaglike` (raising
32
+ `runner.ArgInjectionError`); `dispatch`/`dispatch_stream` surface it as a
33
+ clean failed result, never spawning a subprocess.
34
+ - **Tightened file permissions.** Job files are written `0o600` and the jobs
35
+ directory is created `0o700` (they hold full task/context/result payloads
36
+ that may contain secrets). `save_config` now writes `agents.yaml` `0o600`
37
+ and its parent directory `0o700`. All `chmod`s are best-effort (skipped on
38
+ platforms without POSIX modes).
39
+
40
+ ### Added
41
+ - `dispatch_cancel(job_id)` MCP tool — cancel a *pending* async job before it
42
+ starts. Running jobs are left to finish (their subprocess can't be safely
43
+ interrupted); the tool reports an `outcome` of `cancelled`, `running`,
44
+ `already_terminal`, or `not_found`. Makes the previously-unreachable
45
+ `cancelled` job status real. Backed by `JobStore.cancel`, and the
46
+ cancel/start race is closed by `mark_running` refusing a cancelled job.
47
+ - Cache size bound — `CacheSettings.max_size` (default 1000) caps the
48
+ in-memory dispatch cache, evicting the oldest entry first (FIFO by insertion
49
+ time; read access does not refresh, since the timestamp also drives TTL),
50
+ preventing unbounded memory growth from many unique requests. `cache_stats`
51
+ now reports `max_size` and `evictions`.
52
+ - Stale-job recovery — on startup the server marks jobs abandoned in
53
+ `running` (older than 1h, e.g. from a crashed prior run) as `failed` so
54
+ callers don't poll them forever (`JobStore.recover_stale`).
55
+
56
+ ### Changed
57
+ - Input bounds hardened across MCP tools: `dispatch_jobs(limit)` clamped to
58
+ `[1, 1000]`; `dispatch_gc(max_age_days)` rejects non-finite values;
59
+ `summary_chars` (in `dispatch` and per-item `dispatch_parallel`) clamped to
60
+ `[0, 100000]`; `dispatch_parallel` rejects more than
61
+ `max(100, max_concurrency * 20)` items to bound subprocess fan-out.
62
+ - Async job worker now logs lifecycle transitions (running / finished) with
63
+ the job id for easier production debugging.
64
+ - Type hints filled in (`_ref_payload`, `_run_job`, `_run_one`).
65
+ - Lint surface expanded — ruff now enforces bugbear (`B`), bandit security
66
+ (`S`), import order (`I`), and pyupgrade (`UP`) in addition to the defaults,
67
+ with documented ignores for the trusted `claude` subprocess calls.
68
+ - `SECURITY.md` rewritten: accurate supported-versions table and an expanded
69
+ threat model (bypassPermissions, on-disk job files, env inheritance,
70
+ best-effort recursion depth, argument-injection mitigation).
71
+
10
72
  ## [0.4.0] - 2026-05-15
11
73
 
12
74
  ### Added
@@ -152,7 +214,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
152
214
  - Dependabot for `pip` + `github-actions`, GitHub Actions pinned to
153
215
  commit SHAs for supply-chain integrity.
154
216
 
155
- [Unreleased]: https://github.com/ginkida/agent-dispatch/compare/v0.4.0...HEAD
217
+ [Unreleased]: https://github.com/ginkida/agent-dispatch/compare/v0.5.0...HEAD
218
+ [0.5.0]: https://github.com/ginkida/agent-dispatch/compare/v0.4.0...v0.5.0
156
219
  [0.4.0]: https://github.com/ginkida/agent-dispatch/compare/v0.3.0...v0.4.0
157
220
  [0.3.0]: https://github.com/ginkida/agent-dispatch/compare/v0.2.2...v0.3.0
158
221
  [0.2.2]: https://github.com/ginkida/agent-dispatch/compare/v0.2.1...v0.2.2
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agent-dispatch
3
- Version: 0.4.0
3
+ Version: 0.5.0
4
4
  Summary: MCP server that lets Claude Code agents delegate tasks to agents in other project directories
5
5
  Project-URL: Homepage, https://github.com/ginkida/agent-dispatch
6
6
  Project-URL: Repository, https://github.com/ginkida/agent-dispatch
@@ -339,7 +339,7 @@ fetch_result(ref="8f3a...e1", max_chars=2000) -> truncated, plus {"truncated":
339
339
 
340
340
  Refs reuse the same storage as `dispatch_async` jobs (under `~/.config/agent-dispatch/jobs/`), so any `job_id` returned by `dispatch_async` is also a valid `ref` for `fetch_result`. `parsed_result` (when `response_format="json"` is set) is small and is always inlined directly in the ref response — no second fetch needed.
341
341
 
342
- ### Async dispatch — `dispatch_async`, `dispatch_status`, `dispatch_wait`, `dispatch_jobs`, `dispatch_gc`
342
+ ### Async dispatch — `dispatch_async`, `dispatch_status`, `dispatch_wait`, `dispatch_cancel`, `dispatch_jobs`, `dispatch_gc`
343
343
 
344
344
  When a dispatched task is going to take a while, you don't want to block your own tool slot for minutes. Async dispatch returns a `job_id` immediately and lets you check back when you're ready.
345
345
 
@@ -360,9 +360,11 @@ dispatch_wait(job_id="8f3a...e1", timeout_seconds=120)
360
360
  -> {"id": "...", "status": "running", "timed_out_waiting": true}
361
361
  ```
362
362
 
363
+ `dispatch_cancel(job_id)` cancels a job that is still **pending** (before its subprocess starts) — a running job is left to finish, since its `claude` subprocess can't be safely interrupted. The response carries an `outcome` of `cancelled`, `running`, `already_terminal`, or `not_found`.
364
+
363
365
  `dispatch_jobs(status?)` lists recent jobs as summaries (filter by `pending` / `running` / `done` / `failed` / `cancelled`). `dispatch_gc(max_age_days=7)` purges terminal jobs older than the threshold — pending and running jobs are never deleted.
364
366
 
365
- Job state persists to disk at `~/.config/agent-dispatch/jobs/` (override with `AGENT_DISPATCH_JOBS_DIR`). One JSON file per job, atomic writes — safe to read or `ls` while jobs are in flight.
367
+ Job state persists to disk at `~/.config/agent-dispatch/jobs/` (override with `AGENT_DISPATCH_JOBS_DIR`). One JSON file per job, written owner-only (`0o600`) with atomic writes — safe to read or `ls` while jobs are in flight. Caller-supplied `job_id`s are validated as 32-char hex before any file access (no path traversal). On startup the server marks jobs abandoned in `running` by a prior crashed instance as `failed`.
366
368
 
367
369
  | When to use async | When to use `dispatch` |
368
370
  |-------------------|------------------------|
@@ -418,10 +420,11 @@ settings:
418
420
  # - Read
419
421
  # - Edit
420
422
  max_dispatch_depth: 3 # recursion protection
421
- max_concurrency: 5 # max parallel claude -p processes
423
+ max_concurrency: 5 # max parallel claude -p processes (per dispatch path)
422
424
  cache:
423
425
  enabled: true
424
426
  ttl: 300 # seconds
427
+ max_size: 1000 # max cached entries; oldest evicted first (FIFO)
425
428
  ```
426
429
 
427
430
  Config is reloaded on every tool call — add agents without restarting.
@@ -459,11 +462,16 @@ agent-dispatch MCP server
459
462
 
460
463
  ## Safety
461
464
 
462
- - **Recursion protection** — `AGENT_DISPATCH_DEPTH` env var tracks nesting. Default limit: 3.
465
+ - **Recursion protection** — `AGENT_DISPATCH_DEPTH` env var tracks nesting. Default limit: 3. Best-effort across the subprocess boundary (see [SECURITY.md](SECURITY.md)).
466
+ - **Argument-injection guard** — structured CLI fields (`session_id`, `model`, `permission_mode`, tool names) that start with `-` are rejected so they can't smuggle extra `claude` flags.
467
+ - **Path-traversal guard** — caller-supplied `job_id`/`ref` values are validated as 32-char hex before any filesystem access.
468
+ - **Owner-only state** — job files (`0o600`) and `agents.yaml` (`0o600`) are written for the owner only; their directories are `0o700`.
463
469
  - **Cost control** — `max_budget_usd` per agent or globally.
464
- - **Concurrency** — `max_concurrency` (default: 5) limits parallel `claude -p` processes.
470
+ - **Concurrency** — `max_concurrency` (default: 5) caps parallel `claude -p` processes. Note: the sync and async dispatch paths use separate semaphores, so the worst-case total is `2 × max_concurrency`.
465
471
  - **Timeout** — per-agent or global (default: 300s). Orphaned processes are cleaned up.
466
- - **Caching** — identical `(agent, task, context)` requests return cached results. Only successes are cached. Sessions and dialogues are never cached.
472
+ - **Caching** — identical `(agent, task, context, caller, goal, response_format)` requests return cached results, bounded by `cache.max_size` (oldest entry evicted first). Only successes are cached. Sessions and dialogues are never cached.
473
+
474
+ See [SECURITY.md](SECURITY.md) for the full threat model (including the `bypassPermissions` escalation risk and on-disk job files).
467
475
 
468
476
  ## CLI
469
477
 
@@ -309,7 +309,7 @@ fetch_result(ref="8f3a...e1", max_chars=2000) -> truncated, plus {"truncated":
309
309
 
310
310
  Refs reuse the same storage as `dispatch_async` jobs (under `~/.config/agent-dispatch/jobs/`), so any `job_id` returned by `dispatch_async` is also a valid `ref` for `fetch_result`. `parsed_result` (when `response_format="json"` is set) is small and is always inlined directly in the ref response — no second fetch needed.
311
311
 
312
- ### Async dispatch — `dispatch_async`, `dispatch_status`, `dispatch_wait`, `dispatch_jobs`, `dispatch_gc`
312
+ ### Async dispatch — `dispatch_async`, `dispatch_status`, `dispatch_wait`, `dispatch_cancel`, `dispatch_jobs`, `dispatch_gc`
313
313
 
314
314
  When a dispatched task is going to take a while, you don't want to block your own tool slot for minutes. Async dispatch returns a `job_id` immediately and lets you check back when you're ready.
315
315
 
@@ -330,9 +330,11 @@ dispatch_wait(job_id="8f3a...e1", timeout_seconds=120)
330
330
  -> {"id": "...", "status": "running", "timed_out_waiting": true}
331
331
  ```
332
332
 
333
+ `dispatch_cancel(job_id)` cancels a job that is still **pending** (before its subprocess starts) — a running job is left to finish, since its `claude` subprocess can't be safely interrupted. The response carries an `outcome` of `cancelled`, `running`, `already_terminal`, or `not_found`.
334
+
333
335
  `dispatch_jobs(status?)` lists recent jobs as summaries (filter by `pending` / `running` / `done` / `failed` / `cancelled`). `dispatch_gc(max_age_days=7)` purges terminal jobs older than the threshold — pending and running jobs are never deleted.
334
336
 
335
- Job state persists to disk at `~/.config/agent-dispatch/jobs/` (override with `AGENT_DISPATCH_JOBS_DIR`). One JSON file per job, atomic writes — safe to read or `ls` while jobs are in flight.
337
+ Job state persists to disk at `~/.config/agent-dispatch/jobs/` (override with `AGENT_DISPATCH_JOBS_DIR`). One JSON file per job, written owner-only (`0o600`) with atomic writes — safe to read or `ls` while jobs are in flight. Caller-supplied `job_id`s are validated as 32-char hex before any file access (no path traversal). On startup the server marks jobs abandoned in `running` by a prior crashed instance as `failed`.
336
338
 
337
339
  | When to use async | When to use `dispatch` |
338
340
  |-------------------|------------------------|
@@ -388,10 +390,11 @@ settings:
388
390
  # - Read
389
391
  # - Edit
390
392
  max_dispatch_depth: 3 # recursion protection
391
- max_concurrency: 5 # max parallel claude -p processes
393
+ max_concurrency: 5 # max parallel claude -p processes (per dispatch path)
392
394
  cache:
393
395
  enabled: true
394
396
  ttl: 300 # seconds
397
+ max_size: 1000 # max cached entries; oldest evicted first (FIFO)
395
398
  ```
396
399
 
397
400
  Config is reloaded on every tool call — add agents without restarting.
@@ -429,11 +432,16 @@ agent-dispatch MCP server
429
432
 
430
433
  ## Safety
431
434
 
432
- - **Recursion protection** — `AGENT_DISPATCH_DEPTH` env var tracks nesting. Default limit: 3.
435
+ - **Recursion protection** — `AGENT_DISPATCH_DEPTH` env var tracks nesting. Default limit: 3. Best-effort across the subprocess boundary (see [SECURITY.md](SECURITY.md)).
436
+ - **Argument-injection guard** — structured CLI fields (`session_id`, `model`, `permission_mode`, tool names) that start with `-` are rejected so they can't smuggle extra `claude` flags.
437
+ - **Path-traversal guard** — caller-supplied `job_id`/`ref` values are validated as 32-char hex before any filesystem access.
438
+ - **Owner-only state** — job files (`0o600`) and `agents.yaml` (`0o600`) are written for the owner only; their directories are `0o700`.
433
439
  - **Cost control** — `max_budget_usd` per agent or globally.
434
- - **Concurrency** — `max_concurrency` (default: 5) limits parallel `claude -p` processes.
440
+ - **Concurrency** — `max_concurrency` (default: 5) caps parallel `claude -p` processes. Note: the sync and async dispatch paths use separate semaphores, so the worst-case total is `2 × max_concurrency`.
435
441
  - **Timeout** — per-agent or global (default: 300s). Orphaned processes are cleaned up.
436
- - **Caching** — identical `(agent, task, context)` requests return cached results. Only successes are cached. Sessions and dialogues are never cached.
442
+ - **Caching** — identical `(agent, task, context, caller, goal, response_format)` requests return cached results, bounded by `cache.max_size` (oldest entry evicted first). Only successes are cached. Sessions and dialogues are never cached.
443
+
444
+ See [SECURITY.md](SECURITY.md) for the full threat model (including the `bypassPermissions` escalation risk and on-disk job files).
437
445
 
438
446
  ## CLI
439
447
 
@@ -0,0 +1,77 @@
1
+ # Security Policy
2
+
3
+ ## Reporting a Vulnerability
4
+
5
+ If you discover a security vulnerability, please report it via [GitHub Security Advisories](https://github.com/ginkida/agent-dispatch/security/advisories/new).
6
+
7
+ **Do not** open a public issue for security vulnerabilities.
8
+
9
+ ## Supported Versions
10
+
11
+ | Version | Supported |
12
+ |---------|-----------|
13
+ | 0.5.x | Yes |
14
+ | 0.4.x | Yes |
15
+ | ≤ 0.3.x | No |
16
+
17
+ ## Threat Model
18
+
19
+ `agent-dispatch` runs `claude -p` subprocesses in configured directories on
20
+ behalf of a calling Claude Code agent. The MCP caller and the agent
21
+ configurations are part of the same trust domain as the user running the
22
+ server — this is a developer tool, not a multi-tenant service. With that in
23
+ mind, the security-relevant areas are:
24
+
25
+ ### Subprocess execution
26
+ - Tasks/context strings are passed as **argument-list** elements to
27
+ `subprocess.run`/`Popen` (never `shell=True`), so there is no shell
28
+ injection.
29
+ - **Argument injection is guarded.** Structured fields placed next to a CLI
30
+ flag (`session_id` → `--resume`, `model` → `--model`, `permission_mode`,
31
+ and tool names) are rejected if they start with `-`, which the `claude`
32
+ CLI would otherwise parse as a *separate* flag. See
33
+ `runner._reject_flaglike` / `runner.ArgInjectionError`.
34
+
35
+ ### Permission escalation (`bypassPermissions`)
36
+ - Setting `permission_mode: bypassPermissions` (or a permissive
37
+ `default_permission_mode`) disables Claude Code's permission prompts for
38
+ that agent — it can use any tool without confirmation. Only enable it for
39
+ agents whose project directories you trust. Prefer `allowed_tools` /
40
+ `disallowed_tools` for least privilege.
41
+ - A dispatched agent running with broad permissions can, in principle, start
42
+ its own `claude`/dispatch chain. Recursion depth (`AGENT_DISPATCH_DEPTH`,
43
+ bounded by `max_dispatch_depth`) is **best-effort**: it crosses the process
44
+ boundary via an environment variable, so a deliberately hostile agent that
45
+ clears its environment can reset the counter. It protects against accidental
46
+ A→B→A loops, not against an adversarial agent.
47
+
48
+ ### On-disk state
49
+ - Async/`return_ref` job records persist to
50
+ `~/.config/agent-dispatch/jobs/<job_id>.json` (override with
51
+ `AGENT_DISPATCH_JOBS_DIR`). They contain the full task, context, and result,
52
+ which may include sensitive output. Files are written `0o600` and the
53
+ directory `0o700` (owner-only). Call `dispatch_gc()` periodically to purge
54
+ old results.
55
+ - `agents.yaml` is written `0o600`. It records project paths and permission
56
+ settings.
57
+ - `job_id`s are unauthenticated 32-char hex UUIDs — anyone who can call the
58
+ MCP tools and knows a `job_id` can read its result. Don't relay `job_id`s
59
+ over untrusted channels. Caller-supplied `job_id`/`ref` values are validated
60
+ (`^[0-9a-f]{32}$`) before any filesystem access, blocking path traversal.
61
+
62
+ ### Environment & directories
63
+ - The dispatched subprocess inherits the **full parent environment**
64
+ (`os.environ.copy()`) — necessary for `claude` to find its credentials.
65
+ Keep secrets you don't want dispatched agents to see out of the shell that
66
+ launches the server.
67
+ - Agent directories are resolved to absolute paths via `Path.resolve()` and
68
+ must exist at registration time.
69
+
70
+ ### Cost
71
+ - `max_budget_usd` (per agent or as a default) caps spend per dispatch.
72
+
73
+ ## Reproducibility & CI
74
+
75
+ Third-party GitHub Actions are pinned to commit SHAs; workflows run with
76
+ least-privilege `permissions`. Releases publish to PyPI via OIDC Trusted
77
+ Publishing (no long-lived tokens).
@@ -45,3 +45,4 @@ settings:
45
45
  cache:
46
46
  enabled: true
47
47
  ttl: 300 # seconds; identical (agent, task, context) requests are cached
48
+ max_size: 1000 # max cached entries; oldest is evicted first (FIFO)
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "agent-dispatch"
3
- version = "0.4.0"
3
+ version = "0.5.0"
4
4
  description = "MCP server that lets Claude Code agents delegate tasks to agents in other project directories"
5
5
  readme = "README.md"
6
6
  license = "MIT"
@@ -47,6 +47,28 @@ asyncio_mode = "auto"
47
47
  target-version = "py310"
48
48
  line-length = 100
49
49
 
50
+ [tool.ruff.lint]
51
+ select = [
52
+ "E", "W", # pycodestyle
53
+ "F", # pyflakes
54
+ "B", # flake8-bugbear (likely bugs)
55
+ "I", # isort (import order)
56
+ "UP", # pyupgrade (modern syntax)
57
+ "S", # flake8-bandit (security)
58
+ ]
59
+ ignore = [
60
+ # The dispatch family shells out to the trusted `claude` CLI with argument
61
+ # lists (never shell=True); see runner._build_command and the arg-injection
62
+ # guard (_reject_flaglike). Partial path is intentional — `claude` is
63
+ # resolved from PATH.
64
+ "S603", # subprocess call with possibly-untrusted input
65
+ "S607", # starting a process with a partial executable path
66
+ ]
67
+
68
+ [tool.ruff.lint.per-file-ignores]
69
+ # Tests legitimately assert and use throwaway /tmp paths.
70
+ "tests/**" = ["S101", "S108"]
71
+
50
72
  [project.optional-dependencies]
51
73
  dev = [
52
74
  "pytest>=8.0",
@@ -1,3 +1,3 @@
1
1
  """agent-dispatch: Delegate tasks between Claude Code agents across projects."""
2
2
 
3
- __version__ = "0.4.0"
3
+ __version__ = "0.5.0"
@@ -23,12 +23,14 @@ class DispatchCache:
23
23
  requests with different framing would collide and return the wrong response.
24
24
  """
25
25
 
26
- def __init__(self, ttl: int = 300) -> None:
26
+ def __init__(self, ttl: int = 300, max_size: int = 1000) -> None:
27
27
  self._ttl = ttl
28
+ self._max_size = max_size
28
29
  self._store: dict[str, tuple[float, DispatchResult]] = {}
29
30
  self._lock = threading.Lock()
30
31
  self._hits = 0
31
32
  self._misses = 0
33
+ self._evictions = 0
32
34
 
33
35
  @staticmethod
34
36
  def _make_key(
@@ -89,6 +91,15 @@ class DispatchCache:
89
91
  return # don't cache failures
90
92
  key = self._make_key(agent, task, context, caller, goal, response_format)
91
93
  with self._lock:
94
+ # Bound memory: when at capacity and inserting a new key, evict the
95
+ # oldest entry by insertion time (FIFO). We intentionally do NOT
96
+ # refresh timestamps on read — the timestamp also drives TTL expiry,
97
+ # so touching it on access would turn TTL into idle-time. Refreshing
98
+ # an existing key never triggers eviction.
99
+ if key not in self._store and len(self._store) >= self._max_size:
100
+ oldest = min(self._store, key=lambda k: self._store[k][0])
101
+ del self._store[oldest]
102
+ self._evictions += 1
92
103
  self._store[key] = (time.monotonic(), result)
93
104
 
94
105
  def clear(self) -> int:
@@ -97,6 +108,7 @@ class DispatchCache:
97
108
  self._store.clear()
98
109
  self._hits = 0
99
110
  self._misses = 0
111
+ self._evictions = 0
100
112
  return count
101
113
 
102
114
  def evict_expired(self) -> int:
@@ -112,8 +124,10 @@ class DispatchCache:
112
124
  total = self._hits + self._misses
113
125
  return {
114
126
  "size": len(self._store),
127
+ "max_size": self._max_size,
115
128
  "hits": self._hits,
116
129
  "misses": self._misses,
130
+ "evictions": self._evictions,
117
131
  "hit_rate": round(self._hits / total, 3) if total else 0.0,
118
132
  "ttl": self._ttl,
119
133
  }
@@ -25,13 +25,13 @@ def _load_or_exit() -> DispatchConfig:
25
25
  f"Error: config at {config_path()} has an invalid schema:", fg="red"
26
26
  ))
27
27
  click.echo(str(e))
28
- raise SystemExit(1)
28
+ raise SystemExit(1) from None
29
29
  except yaml.YAMLError as e:
30
30
  click.echo(click.style(
31
31
  f"Error: config at {config_path()} is not valid YAML:", fg="red"
32
32
  ))
33
33
  click.echo(str(e))
34
- raise SystemExit(1)
34
+ raise SystemExit(1) from None
35
35
 
36
36
 
37
37
  @click.group()
@@ -94,7 +94,10 @@ def init() -> None:
94
94
  @cli.command()
95
95
  @click.argument("name")
96
96
  @click.argument("directory", type=click.Path(exists=True, file_okay=False, resolve_path=True))
97
- @click.option("-d", "--description", default=None, help="Agent description. Auto-generated if omitted.")
97
+ @click.option(
98
+ "-d", "--description", default=None,
99
+ help="Agent description. Auto-generated if omitted.",
100
+ )
98
101
  @click.option("--timeout", default=300, help="Timeout in seconds (default: 300).")
99
102
  @click.option("--model", default=None, help="Model override for this agent.")
100
103
  @click.option("--max-budget", default=None, type=float, help="Max cost in USD per dispatch.")
@@ -126,7 +129,7 @@ def add(
126
129
  validate_agent_name(name)
127
130
  except ValueError as e:
128
131
  click.echo(f"Error: {e}")
129
- raise SystemExit(1)
132
+ raise SystemExit(1) from None
130
133
 
131
134
  config = _load_or_exit()
132
135
  dir_path = Path(directory).resolve()
@@ -33,15 +33,25 @@ def load_config(path: Path | None = None) -> DispatchConfig:
33
33
  return DispatchConfig.model_validate(raw)
34
34
 
35
35
 
36
+ def _chmod_quiet(path: Path, mode: int) -> None:
37
+ """Best-effort chmod. Silently ignores platforms/filesystems without it."""
38
+ try:
39
+ os.chmod(path, mode)
40
+ except OSError as e: # pragma: no cover - platform dependent
41
+ logger.debug("chmod %s to %o failed: %s", path, mode, e)
42
+
43
+
36
44
  def save_config(config: DispatchConfig, path: Path | None = None) -> None:
37
- """Save config to YAML file."""
45
+ """Save config to YAML file (owner-only perms — it records project paths)."""
38
46
  p = path or config_path()
39
- p.parent.mkdir(parents=True, exist_ok=True)
47
+ p.parent.mkdir(parents=True, exist_ok=True, mode=0o700)
48
+ _chmod_quiet(p.parent, 0o700)
40
49
  data = config.model_dump(mode="json", exclude_none=True)
41
50
  p.write_text(
42
51
  yaml.dump(data, default_flow_style=False, allow_unicode=True, sort_keys=False),
43
52
  encoding="utf-8",
44
53
  )
54
+ _chmod_quiet(p, 0o600)
45
55
 
46
56
 
47
57
  def _collect_mcp_servers(directory: Path) -> list[str]:
@@ -9,6 +9,7 @@ from __future__ import annotations
9
9
 
10
10
  import logging
11
11
  import os
12
+ import re
12
13
  import threading
13
14
  import time
14
15
  import uuid
@@ -24,6 +25,24 @@ logger = logging.getLogger(__name__)
24
25
  JobStatus = Literal["pending", "running", "done", "failed", "cancelled"]
25
26
  _TERMINAL_STATUSES: frozenset[str] = frozenset({"done", "failed", "cancelled"})
26
27
 
28
+ # Job IDs are uuid4().hex — 32 lowercase hex chars. Anything else (notably
29
+ # values containing "/" or "..") is rejected so a caller-supplied ref/job_id
30
+ # can never escape the jobs directory via path traversal.
31
+ _JOB_ID_RE = re.compile(r"^[0-9a-f]{32}$")
32
+
33
+
34
+ def is_valid_job_id(job_id: str) -> bool:
35
+ """Return True if *job_id* is a well-formed uuid4 hex string."""
36
+ return isinstance(job_id, str) and bool(_JOB_ID_RE.match(job_id))
37
+
38
+
39
+ def _chmod_quiet(path: Path, mode: int) -> None:
40
+ """Best-effort chmod. Silently ignores platforms/filesystems without it."""
41
+ try:
42
+ os.chmod(path, mode)
43
+ except OSError as e: # pragma: no cover - platform dependent
44
+ logger.debug("chmod %s to %o failed: %s", path, mode, e)
45
+
27
46
 
28
47
  class Job(BaseModel):
29
48
  """Persistent record of an async dispatch."""
@@ -50,16 +69,24 @@ class JobStore:
50
69
 
51
70
  def __init__(self, directory: Path):
52
71
  self.directory = Path(directory).expanduser()
53
- self.directory.mkdir(parents=True, exist_ok=True)
72
+ # Owner-only (0o700): job files hold full task/context/result payloads
73
+ # that may contain secrets — keep them off other local users' radar.
74
+ self.directory.mkdir(parents=True, exist_ok=True, mode=0o700)
75
+ _chmod_quiet(self.directory, 0o700)
54
76
  self._lock = threading.RLock()
55
77
 
56
78
  def _path(self, job_id: str) -> Path:
79
+ # Defense-in-depth: validate before building any path so a crafted
80
+ # job_id ("../../etc/foo") can never resolve outside self.directory.
81
+ if not is_valid_job_id(job_id):
82
+ raise ValueError(f"Invalid job_id: {job_id!r}")
57
83
  return self.directory / f"{job_id}.json"
58
84
 
59
85
  def _write(self, job: Job) -> None:
60
86
  path = self._path(job.id)
61
87
  tmp = path.with_suffix(".tmp")
62
88
  tmp.write_text(job.model_dump_json(indent=2, exclude_none=True), encoding="utf-8")
89
+ _chmod_quiet(tmp, 0o600) # owner-only before it becomes visible
63
90
  os.replace(tmp, path)
64
91
 
65
92
  def create(
@@ -118,7 +145,12 @@ class JobStore:
118
145
  return job
119
146
 
120
147
  def get(self, job_id: str) -> Job | None:
121
- """Read a job by id. Returns None if not found or unreadable."""
148
+ """Read a job by id. Returns None if not found, invalid, or unreadable."""
149
+ if not is_valid_job_id(job_id):
150
+ # Malformed/hostile id (e.g. path traversal attempt) — treat as
151
+ # "not found" without touching the filesystem.
152
+ logger.debug("Rejecting malformed job_id: %r", job_id)
153
+ return None
122
154
  path = self._path(job_id)
123
155
  if not path.exists():
124
156
  return None
@@ -143,16 +175,73 @@ class JobStore:
143
175
  return jobs
144
176
 
145
177
  def mark_running(self, job_id: str) -> Job | None:
146
- """Mark job as running. Returns updated job or None if not found."""
178
+ """Mark a pending job as running.
179
+
180
+ Returns the updated job, or None if the job is missing OR has already
181
+ been cancelled. Refusing to run a cancelled job closes the race with
182
+ ``cancel()``: both take ``self._lock``, so whichever wins, the worker
183
+ either sees ``cancelled`` (and skips) or sets ``running`` first (and
184
+ cancel then refuses).
185
+ """
147
186
  with self._lock:
148
187
  job = self.get(job_id)
149
188
  if job is None:
150
189
  return None
190
+ if job.status == "cancelled":
191
+ return None
151
192
  job.status = "running"
152
193
  job.started_at = time.time()
153
194
  self._write(job)
154
195
  return job
155
196
 
197
+ def cancel(self, job_id: str) -> tuple[Job | None, str]:
198
+ """Attempt to cancel a job.
199
+
200
+ Only *pending* jobs can be cancelled — a running job's subprocess is
201
+ already in flight and is left to finish. Returns ``(job, outcome)``
202
+ where outcome is one of: ``cancelled`` (was pending, now cancelled),
203
+ ``running`` (in flight, untouched), ``already_terminal`` (done/failed/
204
+ already cancelled), or ``not_found``.
205
+ """
206
+ with self._lock:
207
+ job = self.get(job_id)
208
+ if job is None:
209
+ return None, "not_found"
210
+ if job.is_terminal():
211
+ return job, "already_terminal"
212
+ if job.status == "running":
213
+ return job, "running"
214
+ # pending -> cancelled
215
+ job.status = "cancelled"
216
+ job.completed_at = time.time()
217
+ job.error = "Cancelled before execution"
218
+ self._write(job)
219
+ return job, "cancelled"
220
+
221
+ def recover_stale(self, stale_threshold_seconds: float = 3600) -> int:
222
+ """Mark jobs stuck in 'running' beyond the threshold as failed.
223
+
224
+ Async workers are daemon threads — if the server dies mid-dispatch the
225
+ job file is left in ``running`` forever. Call this on startup to flip
226
+ such orphans to ``failed`` so callers don't poll them indefinitely.
227
+ Returns the count recovered.
228
+ """
229
+ now = time.time()
230
+ recovered = 0
231
+ with self._lock:
232
+ for job in self.list(status="running"):
233
+ age = now - (job.started_at or job.created_at)
234
+ if age > stale_threshold_seconds:
235
+ # Count only jobs we actually transitioned (fail() returns
236
+ # None for a missing/malformed id, e.g. a planted file).
237
+ if self.fail(
238
+ job.id,
239
+ f"Abandoned in 'running' for {age:.0f}s — likely a "
240
+ "server restart. Re-dispatch if still needed.",
241
+ ) is not None:
242
+ recovered += 1
243
+ return recovered
244
+
156
245
  def finish(
157
246
  self,
158
247
  job_id: str,
@@ -55,6 +55,7 @@ class CacheSettings(BaseModel):
55
55
 
56
56
  enabled: bool = True
57
57
  ttl: int = Field(default=300, ge=0) # seconds; 0 effectively disables
58
+ max_size: int = Field(default=1000, ge=1) # entries before oldest-first eviction
58
59
 
59
60
  @field_validator("ttl", mode="after")
60
61
  @classmethod