jobd 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jobd-0.1.0/.env.example +14 -0
- jobd-0.1.0/.github/workflows/ci.yml +41 -0
- jobd-0.1.0/.gitignore +21 -0
- jobd-0.1.0/CHANGELOG.md +77 -0
- jobd-0.1.0/Dockerfile +32 -0
- jobd-0.1.0/LICENSE +21 -0
- jobd-0.1.0/PKG-INFO +225 -0
- jobd-0.1.0/README.md +173 -0
- jobd-0.1.0/SECURITY.md +47 -0
- jobd-0.1.0/config/classifier.yaml +34 -0
- jobd-0.1.0/config/profiles.yaml +57 -0
- jobd-0.1.0/config/projects.yaml +26 -0
- jobd-0.1.0/docker-compose.yml +30 -0
- jobd-0.1.0/docs/assets/demo.svg +1 -0
- jobd-0.1.0/docs/plans/projects-yaml.md +597 -0
- jobd-0.1.0/docs/preemption.md +158 -0
- jobd-0.1.0/docs/runbook.md +94 -0
- jobd-0.1.0/docs/security.md +35 -0
- jobd-0.1.0/examples/claude-code-hooks/README.md +41 -0
- jobd-0.1.0/examples/claude-code-hooks/jobd-block-gpu.sh +207 -0
- jobd-0.1.0/examples/claude-code-hooks/jobd-nudge.sh +88 -0
- jobd-0.1.0/examples/claude-code-hooks/test-jobd-nudge.sh +149 -0
- jobd-0.1.0/pyproject.toml +81 -0
- jobd-0.1.0/scripts/cron.example +13 -0
- jobd-0.1.0/scripts/install-worker.sh +160 -0
- jobd-0.1.0/scripts/jobd-broker.service +33 -0
- jobd-0.1.0/scripts/run-live-test.sh +45 -0
- jobd-0.1.0/src/job_cli/__init__.py +0 -0
- jobd-0.1.0/src/job_cli/cli.py +955 -0
- jobd-0.1.0/src/jobd/__init__.py +10 -0
- jobd-0.1.0/src/jobd/app.py +1951 -0
- jobd-0.1.0/src/jobd/auth.py +142 -0
- jobd-0.1.0/src/jobd/cgroup_walk.py +117 -0
- jobd-0.1.0/src/jobd/classifier.py +31 -0
- jobd-0.1.0/src/jobd/client.py +225 -0
- jobd-0.1.0/src/jobd/config.py +170 -0
- jobd-0.1.0/src/jobd/ctest_eta.py +120 -0
- jobd-0.1.0/src/jobd/db.py +158 -0
- jobd-0.1.0/src/jobd/estimator.py +242 -0
- jobd-0.1.0/src/jobd/gpu_holder_probe.py +185 -0
- jobd-0.1.0/src/jobd/main.py +57 -0
- jobd-0.1.0/src/jobd/matcher.py +297 -0
- jobd-0.1.0/src/jobd/mcp/README.md +48 -0
- jobd-0.1.0/src/jobd/mcp/__init__.py +1 -0
- jobd-0.1.0/src/jobd/mcp/errors.py +63 -0
- jobd-0.1.0/src/jobd/mcp/schemas.py +116 -0
- jobd-0.1.0/src/jobd/mcp/server.py +194 -0
- jobd-0.1.0/src/jobd/mcp/tools.py +253 -0
- jobd-0.1.0/src/jobd/mcp/translate.py +133 -0
- jobd-0.1.0/src/jobd/models.py +297 -0
- jobd-0.1.0/src/jobd/subreaper.py +194 -0
- jobd-0.1.0/tests/__init__.py +0 -0
- jobd-0.1.0/tests/conftest.py +71 -0
- jobd-0.1.0/tests/fixtures/classifier_cases.yaml +11 -0
- jobd-0.1.0/tests/integration/__init__.py +0 -0
- jobd-0.1.0/tests/integration/test_cancel_latency.py +143 -0
- jobd-0.1.0/tests/integration/test_hook_forward.py +213 -0
- jobd-0.1.0/tests/integration/test_project_defaults_live.py +85 -0
- jobd-0.1.0/tests/mcp/__init__.py +0 -0
- jobd-0.1.0/tests/mcp/test_cli_parity.py +20 -0
- jobd-0.1.0/tests/mcp/test_client.py +180 -0
- jobd-0.1.0/tests/mcp/test_errors.py +37 -0
- jobd-0.1.0/tests/mcp/test_live.py +143 -0
- jobd-0.1.0/tests/mcp/test_tools.py +519 -0
- jobd-0.1.0/tests/mcp/test_translate.py +231 -0
- jobd-0.1.0/tests/mcp/test_wait_loop.py +142 -0
- jobd-0.1.0/tests/mcp/walkthrough.md +51 -0
- jobd-0.1.0/tests/test_api.py +2851 -0
- jobd-0.1.0/tests/test_app_auth_wiring.py +44 -0
- jobd-0.1.0/tests/test_auth.py +181 -0
- jobd-0.1.0/tests/test_capabilities.py +338 -0
- jobd-0.1.0/tests/test_classifier.py +77 -0
- jobd-0.1.0/tests/test_cli.py +770 -0
- jobd-0.1.0/tests/test_cli_audit.py +233 -0
- jobd-0.1.0/tests/test_cli_graph.py +334 -0
- jobd-0.1.0/tests/test_client_auth_header.py +51 -0
- jobd-0.1.0/tests/test_config.py +61 -0
- jobd-0.1.0/tests/test_db.py +95 -0
- jobd-0.1.0/tests/test_deploy_lint.py +87 -0
- jobd-0.1.0/tests/test_e2e.py +66 -0
- jobd-0.1.0/tests/test_estimator.py +365 -0
- jobd-0.1.0/tests/test_eta_in_jobinfo.py +309 -0
- jobd-0.1.0/tests/test_explain.py +176 -0
- jobd-0.1.0/tests/test_matcher.py +457 -0
- jobd-0.1.0/tests/test_models.py +153 -0
- jobd-0.1.0/tests/test_preemption_handler.py +144 -0
- jobd-0.1.0/tests/test_projects_yaml.py +129 -0
- jobd-0.1.0/tests/test_resolution_order.py +253 -0
- jobd-0.1.0/tests/test_submit_dry_run.py +107 -0
- jobd-0.1.0/tests/test_worker.py +1608 -0
- jobd-0.1.0/tests/test_worker_post_event.py +85 -0
- jobd-0.1.0/tests/unit/__init__.py +0 -0
- jobd-0.1.0/tests/unit/conftest.py +25 -0
- jobd-0.1.0/tests/unit/test_audit_acceptance.py +102 -0
- jobd-0.1.0/tests/unit/test_ctest_eta.py +177 -0
- jobd-0.1.0/tests/unit/test_dispatch_skip.py +246 -0
- jobd-0.1.0/tests/unit/test_emit_event.py +48 -0
- jobd-0.1.0/tests/unit/test_events_endpoint.py +191 -0
- jobd-0.1.0/tests/unit/test_gpu_holder_probe.py +200 -0
- jobd-0.1.0/tests/unit/test_lifecycle_events.py +109 -0
- jobd-0.1.0/tests/unit/test_main_entry.py +51 -0
- jobd-0.1.0/tests/unit/test_post_events.py +97 -0
- jobd-0.1.0/tests/unit/test_scheduling_timeout.py +214 -0
- jobd-0.1.0/tests/unit/test_stale_worker_sweep.py +237 -0
- jobd-0.1.0/tests/unit/test_subreaper.py +183 -0
- jobd-0.1.0/tests/unit/test_subreaper_real_execution.py +131 -0
- jobd-0.1.0/tests/unit/test_warning_events.py +133 -0
- jobd-0.1.0/tests/unit/test_worker_lifecycle_events.py +137 -0
- jobd-0.1.0/tests/unit/test_worker_reaper_import.py +71 -0
- jobd-0.1.0/uv.lock +1660 -0
- jobd-0.1.0/worker/capabilities.py +217 -0
- jobd-0.1.0/worker/job-worker.laptop.service +28 -0
- jobd-0.1.0/worker/job-worker.service +27 -0
- jobd-0.1.0/worker/job_worker.py +1037 -0
jobd-0.1.0/.env.example
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# jobd broker configuration — copy to `.env` and fill in.
|
|
2
|
+
# cp .env.example .env
|
|
3
|
+
# docker-compose.yml reads this file; JOBD_API_TOKEN is required.
|
|
4
|
+
|
|
5
|
+
# Shared secret for broker<->worker<->client auth. Generate a strong one:
|
|
6
|
+
# openssl rand -hex 32
|
|
7
|
+
# Every worker systemd unit and every CLI/MCP wrapper must use the SAME value.
|
|
8
|
+
JOBD_API_TOKEN=
|
|
9
|
+
|
|
10
|
+
# Address uvicorn binds to. Loopback by default. Set to a Tailscale CGNAT IP
|
|
11
|
+
# (100.64.0.0/10) to expose the broker to other tailnet machines. NEVER bind to
|
|
12
|
+
# a public/LAN address — the access-control model assumes a trusted tailnet.
|
|
13
|
+
# See docs/security.md.
|
|
14
|
+
JOBD_HOST=127.0.0.1
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
concurrency:
|
|
10
|
+
group: ci-${{ github.ref }}
|
|
11
|
+
cancel-in-progress: true
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
test:
|
|
15
|
+
runs-on: ubuntu-latest
|
|
16
|
+
strategy:
|
|
17
|
+
fail-fast: false
|
|
18
|
+
matrix:
|
|
19
|
+
python-version: ["3.11", "3.12", "3.13"]
|
|
20
|
+
steps:
|
|
21
|
+
- uses: actions/checkout@v4
|
|
22
|
+
|
|
23
|
+
- name: Install uv
|
|
24
|
+
uses: astral-sh/setup-uv@v5
|
|
25
|
+
with:
|
|
26
|
+
enable-cache: true
|
|
27
|
+
|
|
28
|
+
- name: Install dependencies (from lockfile)
|
|
29
|
+
run: uv sync --frozen --python ${{ matrix.python-version }} --extra dev --extra worker --extra mcp
|
|
30
|
+
|
|
31
|
+
- name: Lint (ruff)
|
|
32
|
+
run: uv run ruff check .
|
|
33
|
+
|
|
34
|
+
- name: Tests (excluding live broker)
|
|
35
|
+
run: uv run pytest -m "not live" -q
|
|
36
|
+
|
|
37
|
+
# Informational: the codebase is mid-typing-adoption, so mypy is reported
|
|
38
|
+
# but does not gate the build. Tighten to a hard gate once it's clean.
|
|
39
|
+
- name: Types (mypy, non-blocking)
|
|
40
|
+
run: uv run mypy src/jobd src/job_cli
|
|
41
|
+
continue-on-error: true
|
jobd-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.pyc
|
|
3
|
+
.pytest_cache/
|
|
4
|
+
.mypy_cache/
|
|
5
|
+
.ruff_cache/
|
|
6
|
+
*.egg-info/
|
|
7
|
+
dist/
|
|
8
|
+
build/
|
|
9
|
+
.venv/
|
|
10
|
+
venv/
|
|
11
|
+
logs/
|
|
12
|
+
jobd.db
|
|
13
|
+
jobd.db-journal
|
|
14
|
+
jobd.db-wal
|
|
15
|
+
jobd.db-shm
|
|
16
|
+
.coverage
|
|
17
|
+
coverage.xml
|
|
18
|
+
htmlcov/
|
|
19
|
+
.env
|
|
20
|
+
.env.*
|
|
21
|
+
!.env.example
|
jobd-0.1.0/CHANGELOG.md
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to jobd. Format roughly follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
|
|
4
|
+
|
|
5
|
+
## [0.1.0] — 2026-05-31
|
|
6
|
+
|
|
7
|
+
Initial public release. The bullets below summarize the capabilities built during
|
|
8
|
+
pre-release development; parenthetical short hashes reference that internal history
|
|
9
|
+
and are not present in this repository's squashed tree.
|
|
10
|
+
|
|
11
|
+
### Added — Auto-preempt protocol
|
|
12
|
+
|
|
13
|
+
- **#8 path A — auto-preempt on queue-age** (`6db2712`): broker sweeper auto-preempts running jobs when their host has a higher-priority candidate waiting and queue age crosses threshold.
|
|
14
|
+
- **#28/#29 manual preempt-blockers + warnings-only filter** (`b06e78d`): `--no-preempt` submit flag and warnings-only `job list --warnings` filter.
|
|
15
|
+
- **#32 `events.jsonl` auto-preempt event** (`e719842`): broker emits a structured event each time a sweep fires, for observability.
|
|
16
|
+
- **#34 `/preempt` accepts ASSIGNED-but-not-started state** (`22a3791`): pinned by test.
|
|
17
|
+
- **#35 `depends_on` cascade on PREEMPTED parent** (`26e56ac`): pinned by test.
|
|
18
|
+
- **#8 path B — checkpoint protocol** (`e7593b5`): SIGTERM grace window with `JOBD_CHECKPOINT_GRACE_S` knob; terminal `preempted` state with `RESUME_FROM` operator-driven resume contract.
|
|
19
|
+
- **`JOBD_CHECKPOINT_DIR` env var** (`f0cb754`): exposed to every job, pointing at a per-job directory `${JOBD_WORKER_CHECKPOINT_ROOT:-$XDG_DATA_HOME/jobd/checkpoints}/<job_id>/` (default `~/.local/share/jobd/checkpoints/<job_id>/`). Workloads write durable preempt-time checkpoints here; broker does not sweep contents.
|
|
20
|
+
- **`JOBD_WORKER_CHECKPOINT_ROOT` operator override** (`71ae23b`, fix-forward `bf195bc`): worker-level env var redirects checkpoint root (e.g., to a faster filesystem); `os.path.expanduser` applied.
|
|
21
|
+
|
|
22
|
+
### Added — Resource-aware admission & GPU matching
|
|
23
|
+
|
|
24
|
+
- **#42 heartbeat-aware GPU matcher** (`a3509c9`): matcher consults live worker heartbeats (`unregistered_vram_gb`) and drops saturated hosts; 2 GB VRAM floor on `--gpu`.
|
|
25
|
+
- **#42 multi-tenant per-host co-routing** (`f9e6152`): `JOBD_WORKER_MAX_CONCURRENT_JOBS` enables threaded per-host dispatch with per-job thread tracking.
|
|
26
|
+
- **`unregistered_vram_gb` in `/workers`** (`93795b0`): exposed for matcher + observability.
|
|
27
|
+
- **#41 resource-aware admission at dispatch** (`53ffc09`): NVML probe → admission gate → `/jobs/{id}/refuse-admission` re-queue path; tier-tag inference; bypass marker.
|
|
28
|
+
- **#41d `vram_gb` on `JobSubmit`** (`ed154f5`): CLI `--vram-required` flag now persists end-to-end.
|
|
29
|
+
- **#43 unsatisfiable-placement preflight** (`bf15ffa`): submit-time check rejects jobs whose `needs:` tags or VRAM ask cannot be satisfied by any registered worker.
|
|
30
|
+
- **CUDA VRAM tier tags** (`860fdf6`): `cuda-32gb`, `cuda-12gb`, etc. so `needs:cuda-32gb` routes only to matching hosts.
|
|
31
|
+
- **`cuda-8gb` tier** (`a8ed96d`): added so RTX 2080-class GPUs advertise a discoverable tier.
|
|
32
|
+
|
|
33
|
+
### Added — Scheduling, watchdogs, MCP
|
|
34
|
+
|
|
35
|
+
- **Per-job idle-output watchdog + max-wall timeout** (`5f074c0`): broker-side wall enforcement; idle-output watchdog terminates silent-hung jobs.
|
|
36
|
+
- **`--max-wall` and `--idle-timeout` flags on `job submit`** (`f105724`).
|
|
37
|
+
- **Scheduler-awareness warnings** (`0ef4497`): single-slot stall + queue-age-blocked-by-load surface as `warnings:` on `/jobs/{id}`.
|
|
38
|
+
- **Per-job time estimation v1** (`1642d5d`): `eta_*` fields on `Job` and `--eta` flag on `job list`.
|
|
39
|
+
- **Default-on ETA banner on `job submit`**: `--eta/--no-eta` defaults on; prints `Estimated wall p50 X, p90 Y (n=N prior runs)` or `ColdStart` line to stderr after submit. Closes BACKLOG "Auto-surface ETA on submit" Part 1.
|
|
40
|
+
- **ETA Part 2 — ctest-aware sub-job parsing**: opt-in via `JOBD_CTEST_PARSE=1`. New module `src/jobd/ctest_eta.py` parses `<cwd>/build*/Testing/Temporary/CTestCostData.txt`, filters tests by the `ctest -R <regex>` arg, and sums avg-cost values. Broker reports `eta_basis="ctest-cost-K=<n>"` ahead of history-based prediction; CLI banner renders `Estimated wall ~Xs (ctest cost-data, k=K tests)`. Falls through to history when env unset, regex misses, or cost file absent. Closes BACKLOG "Auto-surface ETA on submit" Part 2.
|
|
41
|
+
- **First-byte smoke watchdog — pieces 2+3**: worker-side, in `worker/job_worker.py`. Piece 3 (pre-dispatch launcher-existence check) verifies `cmd[0]` exists and is executable when it looks like a path (`/`, `./`, `../`); on miss, POSTs `/complete` with `termination_reason="launcher_missing"` instead of exec-ing into a silent exit-127. Piece 2 (first-output watchdog) adds env var `JOBD_WORKER_FIRST_OUTPUT_TIMEOUT_S`: fires once if no stdout byte lands within N seconds of job start, disarms permanently on first byte. Both surface as `final_state="failed"` for depends_on cascades. Closes BACKLOG "First-byte smoke" pieces 2+3; piece 1 (push-on-terminal-failure) deferred.
|
|
42
|
+
- **Per-project defaults block in `projects.yaml`** (`34c2a30`): keys at submit time inherit from project block when omitted.
|
|
43
|
+
- **`fast_path` field honored in `JobSubmit`** (`557a0e8`).
|
|
44
|
+
- **#51 `submitted_via` marker** (`636969c`): `JobSubmit.submitted_via: Literal["cli", "mcp"]` round-trips through translation layer; structural test pinned.
|
|
45
|
+
|
|
46
|
+
### Added — Worker management & deploy
|
|
47
|
+
|
|
48
|
+
- **DELETE `/workers/{host}` for purging stale registrations** (`ea04d6a`).
|
|
49
|
+
- **`job delete-worker` + `jobd_worker_delete` MCP tool** (`ccdf35c`): exposes the DELETE endpoint via CLI and MCP.
|
|
50
|
+
- **`jobd-broker.service` systemd unit** (`5334a5c`, #52): with tailscale IP wait.
|
|
51
|
+
- **`job-worker.laptop.service` variant** (`c4591b0`): for full-repo-checkout hosts (vs. the server standalone install).
|
|
52
|
+
- **Nightly live integration test cron wrapper** (`ef24309`): runs `tests/mcp/test_live.py` against the live broker.
|
|
53
|
+
- **Audit instrumentation pass** (`07aa2cf`): coverage inventory + gap report against the existing observability surface.
|
|
54
|
+
|
|
55
|
+
### Fixed
|
|
56
|
+
|
|
57
|
+
- **`jobd --help` / `--version` no longer crash with SQLite OperationalError**: entry point now parses argv before `build_app()`, so `--help`/`--version` short-circuit cleanly without touching the database. Help text documents `JOBD_CONFIG_DIR` / `JOBD_DB_URL` / `JOBD_PORT` / `JOBD_LOGS_DIR`.
|
|
58
|
+
- **`#51` install-worker no longer writes static `tags:` to `worker.yaml`** (`5c01882`): tags now come from runtime probe.
|
|
59
|
+
- **`73eaa46` cancel via `systemctl kill` on named scope** rather than `Popen.pid`: ensures the entire scope tree dies, not just the tracked PID.
|
|
60
|
+
- **`59561aa` ASSIGNED → RUNNING transition** so cancel reports SIGTERM correctly.
|
|
61
|
+
- **`969abc3` MCP `log_tail` and `depends_on` field-name alignment** (mcp-v1 field-test fallout).
|
|
62
|
+
- **`353a4fb`** clear pyright noise in `_print_resolved`/`_row`.
|
|
63
|
+
|
|
64
|
+
### Documentation
|
|
65
|
+
|
|
66
|
+
- **Checkpoint directory contract** in `docs/preemption.md` (`8b5e339`, fix-forward `7ebffca`, final-review polish `2eec63b`): canonical surface for workload authors; covers env vars, default root, mode-0700 + cross-user-resume caveat.
|
|
67
|
+
- **Auto-preempt default-flip design spec** (`37e2dc4`) + amendment (`31e843d`).
|
|
68
|
+
- **Auto-preempt jobd-side implementation plan** (`915970f`).
|
|
69
|
+
- **Backlog reconciliation** (`ddca65e`, `d34f3b0`, `6d0fbaf`, `b257cca`, `354e043`, `ab8f7f7`, `7eb1203`, `394269c`, `b8de199`): Open vs Done sweep after the 2026-04-27..04-29 ship train; field-test backlog items filed.
|
|
70
|
+
- **Projects.yaml defaults — implementation blueprint** (`c98cf26`).
|
|
71
|
+
- **CHANGELOG.md created** (`86b4880`); this entry backfills the gap from `mcp-v1` to current tip.
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## [mcp-v1] — 2026-04-26
|
|
76
|
+
|
|
77
|
+
Translation-layer/MCP shipping point. Earlier history is not catalogued here; see `git log mcp-v1` for detail.
|
jobd-0.1.0/Dockerfile
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# Pinned to a specific Debian release for reproducibility. For stricter supply-
|
|
2
|
+
# chain guarantees, pin by digest instead: FROM python:3.11-slim-bookworm@sha256:<digest>
|
|
3
|
+
FROM python:3.11-slim-bookworm
|
|
4
|
+
|
|
5
|
+
WORKDIR /app
|
|
6
|
+
ENV PYTHONUNBUFFERED=1 \
|
|
7
|
+
PIP_NO_CACHE_DIR=1 \
|
|
8
|
+
JOBD_CONFIG_DIR=/app/config \
|
|
9
|
+
JOBD_DB_URL=sqlite:////app/data/jobd.db \
|
|
10
|
+
JOBD_LOGS_DIR=/app/logs \
|
|
11
|
+
JOBD_PORT=8765
|
|
12
|
+
|
|
13
|
+
COPY pyproject.toml ./
|
|
14
|
+
COPY src ./src
|
|
15
|
+
RUN pip install -U pip && pip install .
|
|
16
|
+
|
|
17
|
+
# Run as an unprivileged user; the broker never needs root. Data/logs must be
|
|
18
|
+
# writable by that user (the SQLite DB lives under /app/data).
|
|
19
|
+
RUN useradd --create-home --uid 10001 jobd \
|
|
20
|
+
&& mkdir -p /app/data /app/logs \
|
|
21
|
+
&& chown -R jobd:jobd /app
|
|
22
|
+
USER jobd
|
|
23
|
+
|
|
24
|
+
EXPOSE 8765
|
|
25
|
+
|
|
26
|
+
# /health is behind the bearer-token auth dependency, and the slim image has no
|
|
27
|
+
# curl, so the healthcheck is a plain TCP connect: a listening socket means the
|
|
28
|
+
# broker is up.
|
|
29
|
+
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
|
|
30
|
+
CMD python -c "import os,socket; socket.create_connection(('127.0.0.1', int(os.environ.get('JOBD_PORT','8765'))), timeout=3).close()"
|
|
31
|
+
|
|
32
|
+
CMD ["jobd"]
|
jobd-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Jaret Arnold
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
jobd-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: jobd
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Self-hostable GPU-aware job broker for your own machines, with native MCP/agent integration
|
|
5
|
+
Project-URL: Homepage, https://github.com/musharna/jobd
|
|
6
|
+
Project-URL: Repository, https://github.com/musharna/jobd
|
|
7
|
+
Project-URL: Issues, https://github.com/musharna/jobd/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/musharna/jobd/blob/main/CHANGELOG.md
|
|
9
|
+
Author: Jaret Arnold
|
|
10
|
+
License: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: gpu,homelab,job-queue,mcp,scheduler,self-hosted
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Environment :: Console
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: Intended Audience :: Science/Research
|
|
17
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
18
|
+
Classifier: Operating System :: MacOS
|
|
19
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
20
|
+
Classifier: Programming Language :: Python :: 3
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering
|
|
25
|
+
Classifier: Topic :: System :: Distributed Computing
|
|
26
|
+
Requires-Python: >=3.11
|
|
27
|
+
Requires-Dist: fastapi>=0.110
|
|
28
|
+
Requires-Dist: httpx>=0.26
|
|
29
|
+
Requires-Dist: pydantic>=2.6
|
|
30
|
+
Requires-Dist: pyyaml>=6.0
|
|
31
|
+
Requires-Dist: sqlalchemy>=2.0
|
|
32
|
+
Requires-Dist: sse-starlette>=2.0
|
|
33
|
+
Requires-Dist: starlette>=1.0.1
|
|
34
|
+
Requires-Dist: typer>=0.9
|
|
35
|
+
Requires-Dist: uvicorn[standard]>=0.27
|
|
36
|
+
Provides-Extra: dev
|
|
37
|
+
Requires-Dist: mypy>=1.8; extra == 'dev'
|
|
38
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
39
|
+
Requires-Dist: pytest-cov>=4.1; extra == 'dev'
|
|
40
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
41
|
+
Requires-Dist: respx>=0.20; extra == 'dev'
|
|
42
|
+
Requires-Dist: ruff>=0.3; extra == 'dev'
|
|
43
|
+
Provides-Extra: mcp
|
|
44
|
+
Requires-Dist: httpx>=0.27; extra == 'mcp'
|
|
45
|
+
Requires-Dist: mcp>=1.0; extra == 'mcp'
|
|
46
|
+
Provides-Extra: worker
|
|
47
|
+
Requires-Dist: httpx>=0.26; extra == 'worker'
|
|
48
|
+
Requires-Dist: nvidia-ml-py>=12; extra == 'worker'
|
|
49
|
+
Requires-Dist: psutil>=5.9; extra == 'worker'
|
|
50
|
+
Requires-Dist: pyyaml>=6.0; extra == 'worker'
|
|
51
|
+
Description-Content-Type: text/markdown
|
|
52
|
+
|
|
53
|
+
<div align="center">
|
|
54
|
+
|
|
55
|
+
# jobd
|
|
56
|
+
|
|
57
|
+
[](https://github.com/musharna/jobd/actions/workflows/ci.yml)
|
|
58
|
+
[](https://pypi.org/project/jobd/)
|
|
59
|
+

|
|
60
|
+
[](LICENSE)
|
|
61
|
+
|
|
62
|
+
**A self-hostable, GPU-aware job broker for your own machines — with native MCP/agent integration.**
|
|
63
|
+
|
|
64
|
+
> Like [task-spooler](https://manpages.ubuntu.com/manpages/noble/man1/tsp.1.html), but across more than one machine — and VRAM-aware.
|
|
65
|
+
|
|
66
|
+
</div>
|
|
67
|
+
|
|
68
|
+
<p align="center">
|
|
69
|
+
<img src="https://raw.githubusercontent.com/musharna/jobd/main/docs/assets/demo.svg" alt="jobd in action: submit a GPU job, watch it route to a worker with free VRAM and stream back, then inspect the full lifecycle" width="100%">
|
|
70
|
+
</p>
|
|
71
|
+
|
|
72
|
+
You have a couple of boxes with GPUs — a workstation, a server, maybe a laptop — wired together over [Tailscale](https://tailscale.com/) or a LAN. You want to fire off training runs, data pipelines, and long batch jobs from anywhere, have them land on whichever machine actually has the VRAM free, survive across sessions, and get preempted cleanly when something more important shows up. You don't have a cloud, a Kubernetes cluster, or a Slurm install, and you don't want one.
|
|
73
|
+
|
|
74
|
+
jobd is that missing piece: a small broker that turns a handful of personal machines into a single queue. Think _SkyPilot / Modal, for people without a cloud_ — except the fleet is the hardware you already own, and an LLM agent can drive it directly.
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
# from any machine on your tailnet:
|
|
78
|
+
job submit --project myproj --gpu --vram-required 16 --wait -- python train.py
|
|
79
|
+
# → routed to whichever worker has ≥16 GB VRAM free, streamed back to your terminal
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Why it exists
|
|
83
|
+
|
|
84
|
+
Most schedulers assume a datacenter. The lightweight ones that don't (a bare `nohup`, a tmux session, an ssh-and-pray script) give you nothing: no queue, no VRAM-aware routing, no preemption, no record of what ran where. jobd fills the gap between "ssh in and run it" and "stand up Slurm":
|
|
85
|
+
|
|
86
|
+
- **VRAM-fit routing.** The broker matches each job against live worker capacity (free VRAM / RAM / CPUs, capability tags, arch/OS) and dispatches to a worker that actually fits — instead of you guessing which box is free.
|
|
87
|
+
- **Preempt + checkpoint.** A higher-priority job can preempt a running one: the worker sends `SIGTERM`, the workload gets a grace window to checkpoint, then `SIGKILL`. A preempted job reaches a terminal `preempted` state with a durable checkpoint to resume from — it isn't silently re-run. (See [docs/preemption.md](docs/preemption.md).)
|
|
88
|
+
- **Survives sessions.** Submit, close your laptop, check back tomorrow. Jobs live in the broker, not your shell.
|
|
89
|
+
- **Agent-native.** Ships a first-class [MCP](https://modelcontextprotocol.io/) server so an LLM agent (Claude Code, etc.) can submit, monitor, and babysit jobs as tool calls — the thing most schedulers bolt on as an afterthought, if at all.
|
|
90
|
+
- **Yours.** One broker process you run on a machine you own. No accounts, no egress, no per-GPU-hour billing. Tailnet-bound by default.
|
|
91
|
+
|
|
92
|
+
## Why not just use…?
|
|
93
|
+
|
|
94
|
+
| Tool | What it gives you | Why jobd instead |
|
|
95
|
+
| ------------------------------------------------------------------------------ | ----------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------- |
|
|
96
|
+
| **`nohup` / `tmux` / ssh-and-pray** | Runs a command on one box | No queue, no VRAM-aware routing, no preemption, no record of what ran where |
|
|
97
|
+
| **[task-spooler](https://manpages.ubuntu.com/manpages/noble/man1/tsp.1.html)** | A real job queue — on a single machine | jobd queues across _all_ your machines and routes by live VRAM/CPU fit |
|
|
98
|
+
| **Slurm** | Datacenter-grade scheduling | Heavy to stand up and operate for 2–3 personal boxes; jobd is one process + a poller per host |
|
|
99
|
+
| **SkyPilot / Modal / dstack** | Provision and run on clouds (SkyPilot also on-prem via SSH) | jobd targets hardware you _already own_, with no cloud/K8s assumptions and a much smaller footprint |
|
|
100
|
+
| **Ray** | A distributed-compute framework | jobd is a job _queue_, not a programming model — submit any command, no code changes, GPU-fit routing built in |
|
|
101
|
+
|
|
102
|
+
Closest in spirit are task-spooler (single-node) and on-prem SkyPilot (heavier, cloud-shaped). jobd's niche is the 2–3-GPU homelab: multi-machine VRAM-fit routing + preempt/checkpoint + a native agent interface, with nothing to stand up.
|
|
103
|
+
|
|
104
|
+
## Architecture
|
|
105
|
+
|
|
106
|
+
```mermaid
|
|
107
|
+
flowchart TD
|
|
108
|
+
CLI["job CLI"]:::client --> B
|
|
109
|
+
MCP["jobd-mcp<br/>MCP tools"]:::client --> B
|
|
110
|
+
API["HTTP · SSE"]:::client --> B
|
|
111
|
+
B["<b>jobd broker</b> — FastAPI<br/>queue · matcher · priorities · SQLite"]:::broker
|
|
112
|
+
B <-->|poll · dispatch| WA["worker A<br/>24 GB GPU"]:::worker
|
|
113
|
+
B <-->|poll · dispatch| WB["worker B<br/>8 GB GPU"]:::worker
|
|
114
|
+
B <-->|poll · dispatch| WC["worker C<br/>CPU-only"]:::worker
|
|
115
|
+
classDef client fill:#1f2937,stroke:#4b5563,color:#e5e7eb;
|
|
116
|
+
classDef broker fill:#0e7490,stroke:#155e75,color:#ecfeff;
|
|
117
|
+
classDef worker fill:#14532d,stroke:#166534,color:#dcfce7;
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Workers **poll** the broker (pull model — no inbound connection to a worker); the broker matches each job against live capacity and hands it back on the poll. One broker process, one poller per host.
|
|
121
|
+
|
|
122
|
+
- **Broker** — a FastAPI + SQLite service. Holds the queue, runs the matcher, resolves per-project priorities and defaults, exposes a small HTTP API and an SSE stream. Single source of truth.
|
|
123
|
+
- **Workers** — lightweight polling agents, one per host. Each advertises live capacity via heartbeat, claims jobs it can run, executes them (`shell=False`, no shell-injection surface), streams logs back, and honors preemption signals.
|
|
124
|
+
- **Clients** — the `job` CLI, the `jobd-mcp` MCP server, or anything that speaks the HTTP API.
|
|
125
|
+
|
|
126
|
+
## Install
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
pip install jobd # broker + CLI
|
|
130
|
+
pip install "jobd[mcp]" # adds the MCP server
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
Requires Python ≥ 3.11. The `pip` package is the **broker + CLI + MCP server**; the worker is intentionally not bundled (it has host-specific system deps). Workers run from a clone of this repo (`worker/job_worker.py`) — `scripts/install-worker.sh` sets one up under `~/jobd-worker` with its own venv. (Packaging the worker as `jobd-worker` is planned for a later release.)
|
|
134
|
+
|
|
135
|
+
## Quickstart (single host)
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
# 1. start the broker (binds 127.0.0.1:8765 by default)
|
|
139
|
+
JOBD_ALLOW_NO_AUTH=1 jobd # no-auth is fine for a loopback-only broker
|
|
140
|
+
|
|
141
|
+
# 2. in another shell (from a clone of this repo), start a worker pointed at it
|
|
142
|
+
pip install httpx psutil pyyaml nvidia-ml-py # worker deps (nvidia-ml-py only needed for GPU hosts)
|
|
143
|
+
JOBD_URL=http://127.0.0.1:8765 JOBD_WORKER_HOST=local \
|
|
144
|
+
python worker/job_worker.py
|
|
145
|
+
|
|
146
|
+
# 3. submit a job and wait for it
|
|
147
|
+
job submit --project demo --wait -- echo hello
|
|
148
|
+
job list
|
|
149
|
+
job logs <id>
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
For a real multi-host deployment (Docker broker + systemd workers, Tailscale binding, shared auth token), see **[docs/security.md](docs/security.md)** and the templates in `docker-compose.yml`, `scripts/`, and `worker/`. Day-2 operations (health, draining a worker, upgrades, token rotation, backups) are in **[docs/runbook.md](docs/runbook.md)**.
|
|
153
|
+
|
|
154
|
+
## Supported platforms
|
|
155
|
+
|
|
156
|
+
Python 3.11+ everywhere.
|
|
157
|
+
|
|
158
|
+
| Component | Linux | macOS | Windows |
|
|
159
|
+
| -------------------------------------- | ------- | ----------- | -------------------- |
|
|
160
|
+
| **Broker** (`jobd`) | ✅ | ✅ | ✅ (WSL recommended) |
|
|
161
|
+
| **CLI** (`job`) / **MCP** (`jobd-mcp`) | ✅ | ✅ | ✅ |
|
|
162
|
+
| **Worker** (`job_worker.py`) | ✅ full | ⚠️ degraded | ⚠️ degraded |
|
|
163
|
+
|
|
164
|
+
The **worker** runs best on Linux with a systemd user instance: memory caps, process reaping, and preemption use `systemd-run --user` scopes and cgroups. On non-systemd hosts the worker still executes jobs, but silently drops those guarantees — fine for a single trusted box, not for hard resource isolation. GPU features need NVIDIA + `nvidia-ml-py`. The broker, CLI, and MCP server are pure-Python and portable.
|
|
165
|
+
|
|
166
|
+
## CLI
|
|
167
|
+
|
|
168
|
+
```
|
|
169
|
+
job submit -p PROJ [--gpu] [--vram-required N] [--needs TAG]... [--wait] -- CMD...
|
|
170
|
+
job list [--state STATE] [--project P] # queue + recent jobs
|
|
171
|
+
job status ID [--watch] # one job, optionally live
|
|
172
|
+
job logs ID [-n BYTES] # tail captured output
|
|
173
|
+
job wait ID # block until terminal
|
|
174
|
+
job cancel ID / job preempt ID # stop a job
|
|
175
|
+
job workers # fleet snapshot + health
|
|
176
|
+
job projects list | set NAME PRI | nudge NAME DELTA
|
|
177
|
+
job audit [--project P] [--since 24h] # event history
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
`job submit --explain` dry-runs the resolution (priority, profile, project defaults, host pin) and prints the effective config without enqueuing anything.
|
|
181
|
+
|
|
182
|
+
## MCP / agent integration
|
|
183
|
+
|
|
184
|
+
jobd ships an MCP server (`jobd-mcp`) exposing the queue as nine tools — `jobd_submit`, `jobd_status`, `jobd_logs`, `jobd_list`, `jobd_cancel`, `jobd_preempt`, `jobd_workers`, `jobd_job_get`, `jobd_worker_delete`. Point your MCP client at it:
|
|
185
|
+
|
|
186
|
+
```json
|
|
187
|
+
{
|
|
188
|
+
"mcpServers": {
|
|
189
|
+
"jobd": {
|
|
190
|
+
"command": "jobd-mcp",
|
|
191
|
+
"env": {
|
|
192
|
+
"JOBD_URL": "http://127.0.0.1:8765",
|
|
193
|
+
"JOBD_API_TOKEN": "<your-token>"
|
|
194
|
+
}
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
`JOBD_API_TOKEN` must match the broker's token, or every call returns 401. Omit it only when the broker runs with `JOBD_ALLOW_NO_AUTH=1`.
|
|
201
|
+
|
|
202
|
+
Now an agent can "run this overnight," check on it next session, and route GPU work through the broker instead of colliding on a shared card. The `examples/claude-code-hooks/` directory has optional [Claude Code](https://docs.claude.com/en/docs/claude-code) hooks that _nudge_ (or hard-block) an agent toward submitting heavy commands through jobd — including a VRAM-aware GPU guard with `# NO_GPU` / `# CONCURRENT_OK` / `# VRAM=NGB` override markers.
|
|
203
|
+
|
|
204
|
+
## Configuration
|
|
205
|
+
|
|
206
|
+
Three optional YAML files under `JOBD_CONFIG_DIR` (defaults shipped in `config/`):
|
|
207
|
+
|
|
208
|
+
- **`projects.yaml`** — per-project base priority and submit defaults (preemptibility, wall/idle timeouts, host pins, capability requirements). See [docs/plans/projects-yaml.md](docs/plans/projects-yaml.md) for the full resolution model.
|
|
209
|
+
- **`profiles.yaml`** — named resource bundles (`--profile gpu-train-large`) the matcher uses to size a job.
|
|
210
|
+
- **`classifier.yaml`** — rules that auto-suggest a profile from the command string.
|
|
211
|
+
|
|
212
|
+
All three are optional; with none present, every job runs at the global default priority.
|
|
213
|
+
|
|
214
|
+
## Security
|
|
215
|
+
|
|
216
|
+
The broker has **no authentication beyond a shared bearer token**, so it is meant to run on a trusted network (loopback or a Tailscale tailnet), never on a public interface. Two stacked controls:
|
|
217
|
+
|
|
218
|
+
1. **Interface binding** — `JOBD_HOST` must be `127.0.0.1` or a Tailscale CGNAT address (`100.64.0.0/10`), never `0.0.0.0`. A CI lint (`tests/test_deploy_lint.py`) enforces this on the Docker deployment.
|
|
219
|
+
2. **Bearer token** — set `JOBD_API_TOKEN` (≥32 random bytes) on every broker/worker/CLI/MCP host. The broker refuses to start without it unless you explicitly set `JOBD_ALLOW_NO_AUTH=1`. **`JOBD_ALLOW_NO_AUTH=1` is for a loopback-only broker (`JOBD_HOST=127.0.0.1`) — for local dev/tests.** Combined with a non-loopback `JOBD_HOST` it exposes an unauthenticated RCE endpoint to your whole tailnet; the broker logs a startup warning if you do this. Don't.
|
|
220
|
+
|
|
221
|
+
Full threat model, env-var reference, and token rotation: **[docs/security.md](docs/security.md)**.
|
|
222
|
+
|
|
223
|
+
## License
|
|
224
|
+
|
|
225
|
+
MIT — see [LICENSE](LICENSE).
|
jobd-0.1.0/README.md
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
3
|
+
# jobd
|
|
4
|
+
|
|
5
|
+
[CI](https://github.com/musharna/jobd/actions/workflows/ci.yml)
|
|
6
|
+
[PyPI](https://pypi.org/project/jobd/)
|
|
7
|
+

|
|
8
|
+
[License: MIT](LICENSE)
|
|
9
|
+
|
|
10
|
+
**A self-hostable, GPU-aware job broker for your own machines — with native MCP/agent integration.**
|
|
11
|
+
|
|
12
|
+
> Like [task-spooler](https://manpages.ubuntu.com/manpages/noble/man1/tsp.1.html), but across more than one machine — and VRAM-aware.
|
|
13
|
+
|
|
14
|
+
</div>
|
|
15
|
+
|
|
16
|
+
<p align="center">
|
|
17
|
+
<img src="https://raw.githubusercontent.com/musharna/jobd/main/docs/assets/demo.svg" alt="jobd in action: submit a GPU job, watch it route to a worker with free VRAM and stream back, then inspect the full lifecycle" width="100%">
|
|
18
|
+
</p>
|
|
19
|
+
|
|
20
|
+
You have a couple of boxes with GPUs — a workstation, a server, maybe a laptop — wired together over [Tailscale](https://tailscale.com/) or a LAN. You want to fire off training runs, data pipelines, and long batch jobs from anywhere, have them land on whichever machine actually has the VRAM free, survive across sessions, and get preempted cleanly when something more important shows up. You don't have a cloud, a Kubernetes cluster, or a Slurm install, and you don't want one.
|
|
21
|
+
|
|
22
|
+
jobd is that missing piece: a small broker that turns a handful of personal machines into a single queue. Think _SkyPilot / Modal, for people without a cloud_ — except the fleet is the hardware you already own, and an LLM agent can drive it directly.
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
# from any machine on your tailnet:
|
|
26
|
+
job submit --project myproj --gpu --vram-required 16 --wait -- python train.py
|
|
27
|
+
# → routed to whichever worker has ≥16 GB VRAM free, streamed back to your terminal
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Why it exists
|
|
31
|
+
|
|
32
|
+
Most schedulers assume a datacenter. The lightweight ones that don't (a bare `nohup`, a tmux session, an ssh-and-pray script) give you nothing: no queue, no VRAM-aware routing, no preemption, no record of what ran where. jobd fills the gap between "ssh in and run it" and "stand up Slurm":
|
|
33
|
+
|
|
34
|
+
- **VRAM-fit routing.** The broker matches each job against live worker capacity (free VRAM / RAM / CPUs, capability tags, arch/OS) and dispatches to a worker that actually fits — instead of you guessing which box is free.
|
|
35
|
+
- **Preempt + checkpoint.** A higher-priority job can preempt a running one: the worker sends `SIGTERM`, the workload gets a grace window to checkpoint, then `SIGKILL`. A preempted job reaches a terminal `preempted` state with a durable checkpoint to resume from — it isn't silently re-run. (See [docs/preemption.md](docs/preemption.md).)
|
|
36
|
+
- **Survives sessions.** Submit, close your laptop, check back tomorrow. Jobs live in the broker, not your shell.
|
|
37
|
+
- **Agent-native.** Ships a first-class [MCP](https://modelcontextprotocol.io/) server so an LLM agent (Claude Code, etc.) can submit, monitor, and babysit jobs as tool calls — the thing most schedulers bolt on as an afterthought, if at all.
|
|
38
|
+
- **Yours.** One broker process you run on a machine you own. No accounts, no egress, no per-GPU-hour billing. Tailnet-bound by default.
|
|
39
|
+
|
|
40
|
+
## Why not just use…?
|
|
41
|
+
|
|
42
|
+
| Tool | What it gives you | Why jobd instead |
|
|
43
|
+
| ------------------------------------------------------------------------------ | ----------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------- |
|
|
44
|
+
| **`nohup` / `tmux` / ssh-and-pray** | Runs a command on one box | No queue, no VRAM-aware routing, no preemption, no record of what ran where |
|
|
45
|
+
| **[task-spooler](https://manpages.ubuntu.com/manpages/noble/man1/tsp.1.html)** | A real job queue — on a single machine | jobd queues across _all_ your machines and routes by live VRAM/CPU fit |
|
|
46
|
+
| **Slurm** | Datacenter-grade scheduling | Heavy to stand up and operate for 2–3 personal boxes; jobd is one process + a poller per host |
|
|
47
|
+
| **SkyPilot / Modal / dstack** | Provision and run on clouds (SkyPilot also on-prem via SSH) | jobd targets hardware you _already own_, with no cloud/K8s assumptions and a much smaller footprint |
|
|
48
|
+
| **Ray** | A distributed-compute framework | jobd is a job _queue_, not a programming model — submit any command, no code changes, GPU-fit routing built in |
|
|
49
|
+
|
|
50
|
+
Closest in spirit are task-spooler (single-node) and on-prem SkyPilot (heavier, cloud-shaped). jobd's niche is the 2–3-GPU homelab: multi-machine VRAM-fit routing + preempt/checkpoint + a native agent interface, with nothing to stand up.
|
|
51
|
+
|
|
52
|
+
## Architecture
|
|
53
|
+
|
|
54
|
+
```mermaid
|
|
55
|
+
flowchart TD
|
|
56
|
+
CLI["job CLI"]:::client --> B
|
|
57
|
+
MCP["jobd-mcp<br/>MCP tools"]:::client --> B
|
|
58
|
+
API["HTTP · SSE"]:::client --> B
|
|
59
|
+
B["<b>jobd broker</b> — FastAPI<br/>queue · matcher · priorities · SQLite"]:::broker
|
|
60
|
+
B <-->|poll · dispatch| WA["worker A<br/>24 GB GPU"]:::worker
|
|
61
|
+
B <-->|poll · dispatch| WB["worker B<br/>8 GB GPU"]:::worker
|
|
62
|
+
B <-->|poll · dispatch| WC["worker C<br/>CPU-only"]:::worker
|
|
63
|
+
classDef client fill:#1f2937,stroke:#4b5563,color:#e5e7eb;
|
|
64
|
+
classDef broker fill:#0e7490,stroke:#155e75,color:#ecfeff;
|
|
65
|
+
classDef worker fill:#14532d,stroke:#166534,color:#dcfce7;
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Workers **poll** the broker (pull model — no inbound connection to a worker); the broker matches each job against live capacity and hands it back on the poll. One broker process, one poller per host.
|
|
69
|
+
|
|
70
|
+
- **Broker** — a FastAPI + SQLite service. Holds the queue, runs the matcher, resolves per-project priorities and defaults, exposes a small HTTP API and an SSE stream. Single source of truth.
|
|
71
|
+
- **Workers** — lightweight polling agents, one per host. Each advertises live capacity via heartbeat, claims jobs it can run, executes them (`shell=False`, no shell-injection surface), streams logs back, and honors preemption signals.
|
|
72
|
+
- **Clients** — the `job` CLI, the `jobd-mcp` MCP server, or anything that speaks the HTTP API.
|
|
73
|
+
|
|
74
|
+
## Install
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
pip install jobd # broker + CLI
|
|
78
|
+
pip install "jobd[mcp]" # adds the MCP server
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Requires Python ≥ 3.11. The `pip` package is the **broker + CLI + MCP server**; the worker is intentionally not bundled (it has host-specific system deps). Workers run from a clone of this repo (`worker/job_worker.py`) — `scripts/install-worker.sh` sets one up under `~/jobd-worker` with its own venv. (Packaging the worker as `jobd-worker` is planned for a later release.)
|
|
82
|
+
|
|
83
|
+
## Quickstart (single host)
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
# 1. start the broker (binds 127.0.0.1:8765 by default)
|
|
87
|
+
JOBD_ALLOW_NO_AUTH=1 jobd # no-auth is fine for a loopback-only broker
|
|
88
|
+
|
|
89
|
+
# 2. in another shell (from a clone of this repo), start a worker pointed at it
|
|
90
|
+
pip install httpx psutil pyyaml pynvml # worker deps (pynvml only needed for GPU hosts)
|
|
91
|
+
JOBD_URL=http://127.0.0.1:8765 JOBD_WORKER_HOST=local \
|
|
92
|
+
python worker/job_worker.py
|
|
93
|
+
|
|
94
|
+
# 3. submit a job and wait for it
|
|
95
|
+
job submit --project demo --wait -- echo hello
|
|
96
|
+
job list
|
|
97
|
+
job logs <id>
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
For a real multi-host deployment (Docker broker + systemd workers, Tailscale binding, shared auth token), see **[docs/security.md](docs/security.md)** and the templates in `docker-compose.yml`, `scripts/`, and `worker/`. Day-2 operations (health, draining a worker, upgrades, token rotation, backups) are in **[docs/runbook.md](docs/runbook.md)**.
|
|
101
|
+
|
|
102
|
+
## Supported platforms
|
|
103
|
+
|
|
104
|
+
Python 3.11+ everywhere.
|
|
105
|
+
|
|
106
|
+
| Component | Linux | macOS | Windows |
|
|
107
|
+
| -------------------------------------- | ------- | ----------- | -------------------- |
|
|
108
|
+
| **Broker** (`jobd`) | ✅ | ✅ | ✅ (WSL recommended) |
|
|
109
|
+
| **CLI** (`job`) / **MCP** (`jobd-mcp`) | ✅ | ✅ | ✅ |
|
|
110
|
+
| **Worker** (`job_worker.py`) | ✅ full | ⚠️ degraded | ⚠️ degraded |
|
|
111
|
+
|
|
112
|
+
The **worker** runs best on Linux with a systemd user instance: memory caps, process reaping, and preemption use `systemd-run --user` scopes and cgroups. On non-systemd hosts the worker still executes jobs, but silently drops those guarantees — fine for a single trusted box, not for hard resource isolation. GPU features need NVIDIA + `nvidia-ml-py`. The broker, CLI, and MCP server are pure-Python and portable.
|
|
113
|
+
|
|
114
|
+
## CLI
|
|
115
|
+
|
|
116
|
+
```
|
|
117
|
+
job submit -p PROJ [--gpu] [--vram-required N] [--needs TAG]... [--wait] -- CMD...
|
|
118
|
+
job list [--state STATE] [--project P] # queue + recent jobs
|
|
119
|
+
job status ID [--watch] # one job, optionally live
|
|
120
|
+
job logs ID [-n BYTES] # tail captured output
|
|
121
|
+
job wait ID # block until terminal
|
|
122
|
+
job cancel ID / job preempt ID # stop a job
|
|
123
|
+
job workers # fleet snapshot + health
|
|
124
|
+
job projects list | set NAME PRI | nudge NAME DELTA
|
|
125
|
+
job audit [--project P] [--since 24h] # event history
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
`job submit --explain` dry-runs the resolution (priority, profile, project defaults, host pin) and prints the effective config without enqueuing anything.
|
|
129
|
+
|
|
130
|
+
## MCP / agent integration
|
|
131
|
+
|
|
132
|
+
jobd ships an MCP server (`jobd-mcp`) exposing the queue as nine tools — `jobd_submit`, `jobd_status`, `jobd_logs`, `jobd_list`, `jobd_cancel`, `jobd_preempt`, `jobd_workers`, `jobd_job_get`, `jobd_worker_delete`. Point your MCP client at it:
|
|
133
|
+
|
|
134
|
+
```json
|
|
135
|
+
{
|
|
136
|
+
"mcpServers": {
|
|
137
|
+
"jobd": {
|
|
138
|
+
"command": "jobd-mcp",
|
|
139
|
+
"env": {
|
|
140
|
+
"JOBD_URL": "http://127.0.0.1:8765",
|
|
141
|
+
"JOBD_API_TOKEN": "<your-token>"
|
|
142
|
+
}
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
`JOBD_API_TOKEN` must match the broker's token, or every call returns 401. Omit it only when the broker runs with `JOBD_ALLOW_NO_AUTH=1`.
|
|
149
|
+
|
|
150
|
+
Now an agent can "run this overnight," check on it next session, and route GPU work through the broker instead of colliding on a shared card. The `examples/claude-code-hooks/` directory has optional [Claude Code](https://docs.claude.com/en/docs/claude-code) hooks that _nudge_ (or hard-block) an agent toward submitting heavy commands through jobd — including a VRAM-aware GPU guard with `# NO_GPU` / `# CONCURRENT_OK` / `# VRAM=NGB` override markers.
|
|
151
|
+
|
|
152
|
+
## Configuration
|
|
153
|
+
|
|
154
|
+
Three optional YAML files under `JOBD_CONFIG_DIR` (defaults shipped in `config/`):
|
|
155
|
+
|
|
156
|
+
- **`projects.yaml`** — per-project base priority and submit defaults (preemptibility, wall/idle timeouts, host pins, capability requirements). See [docs/plans/projects-yaml.md](docs/plans/projects-yaml.md) for the full resolution model.
|
|
157
|
+
- **`profiles.yaml`** — named resource bundles (`--profile gpu-train-large`) the matcher uses to size a job.
|
|
158
|
+
- **`classifier.yaml`** — rules that auto-suggest a profile from the command string.
|
|
159
|
+
|
|
160
|
+
All three are optional; with none present, every job runs at the global default priority.
|
|
161
|
+
|
|
162
|
+
## Security
|
|
163
|
+
|
|
164
|
+
The broker has **no authentication beyond a shared bearer token**, so it is meant to run on a trusted network (loopback or a Tailscale tailnet), never on a public interface. Two stacked controls:
|
|
165
|
+
|
|
166
|
+
1. **Interface binding** — `JOBD_HOST` must be `127.0.0.1` or a Tailscale CGNAT address (`100.64.0.0/10`), never `0.0.0.0`. A CI lint (`tests/test_deploy_lint.py`) enforces this on the Docker deployment.
|
|
167
|
+
2. **Bearer token** — set `JOBD_API_TOKEN` (≥32 random bytes) on every broker/worker/CLI/MCP host. The broker refuses to start without it unless you explicitly set `JOBD_ALLOW_NO_AUTH=1`. **`JOBD_ALLOW_NO_AUTH=1` is for a loopback-only broker (`JOBD_HOST=127.0.0.1`) — for local dev/tests.** Combined with a non-loopback `JOBD_HOST` it exposes an unauthenticated RCE endpoint to your whole tailnet; the broker logs a startup warning if you do this. Don't.
|
|
168
|
+
|
|
169
|
+
Full threat model, env-var reference, and token rotation: **[docs/security.md](docs/security.md)**.
|
|
170
|
+
|
|
171
|
+
## License
|
|
172
|
+
|
|
173
|
+
MIT — see [LICENSE](LICENSE).
|