jd-worker 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jd_worker-1.0.0/PKG-INFO +56 -0
- jd_worker-1.0.0/README.md +37 -0
- jd_worker-1.0.0/jd/__init__.py +14 -0
- jd_worker-1.0.0/jd/files.py +236 -0
- jd_worker-1.0.0/jd/paths.py +58 -0
- jd_worker-1.0.0/jd/worker.py +621 -0
- jd_worker-1.0.0/jd_worker.egg-info/PKG-INFO +56 -0
- jd_worker-1.0.0/jd_worker.egg-info/SOURCES.txt +12 -0
- jd_worker-1.0.0/jd_worker.egg-info/dependency_links.txt +1 -0
- jd_worker-1.0.0/jd_worker.egg-info/entry_points.txt +2 -0
- jd_worker-1.0.0/jd_worker.egg-info/requires.txt +2 -0
- jd_worker-1.0.0/jd_worker.egg-info/top_level.txt +1 -0
- jd_worker-1.0.0/pyproject.toml +35 -0
- jd_worker-1.0.0/setup.cfg +4 -0
jd_worker-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: jd-worker
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Worker CLI for the JobDistributor distributed job system
|
|
5
|
+
License: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/NWSL-UCF/job-distributor
|
|
7
|
+
Project-URL: Documentation, https://github.com/NWSL-UCF/job-distributor/blob/main/docs/jd-worker.md
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/NWSL-UCF/job-distributor/issues
|
|
9
|
+
Keywords: distributed,jobs,worker,hpc,ml
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Topic :: System :: Distributed Computing
|
|
15
|
+
Requires-Python: >=3.8
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
Requires-Dist: requests
|
|
18
|
+
Requires-Dist: psutil
|
|
19
|
+
|
|
20
|
+
# Worker client (`jd-worker`)
|
|
21
|
+
|
|
22
|
+
Workers run **`jd_worker`**, which requests jobs from the job server, executes your **entry script** with parameters as CLI flags, sends heartbeats, and reports **DONE** / **ABORTED**.
|
|
23
|
+
|
|
24
|
+
Full behaviour, environment variables, local paths (`~/jd_data/…`), and library helpers (`jd_upload`, checkpoints, `jd_job_dir`, …) are documented in **[`docs/jd-worker.md`](https://github.com/NWSL-UCF/job-distributor/blob/main/docs/jd-worker.md)**.
|
|
25
|
+
|
|
26
|
+
---
|
|
27
|
+
|
|
28
|
+
## Install
|
|
29
|
+
|
|
30
|
+
From the repo root (editable):
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
cd job-distributor/client
|
|
34
|
+
python3 -m venv venv
|
|
35
|
+
source venv/bin/activate # Windows: venv\Scripts\activate
|
|
36
|
+
pip install -e .
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Or install from GitHub (see `docs/jd-worker.md` for branch / subdirectory).
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## Run
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
jd_worker expId=<experiment_id> entry_script=<your_script.py>
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Optional: `server=`, `port=`, `machine_type=`, `once=true`, etc. — see `jd_worker help` or **`docs/jd-worker.md`**.
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Example workload
|
|
54
|
+
|
|
55
|
+
For an end-to-end ML-style example (MNIST tuning), see
|
|
56
|
+
[MNIST-parameter-tuning](https://github.com/NWSL-UCF/MNIST-parameter-tuning). Point **`entry_script`** at that repo’s training script and align **`expId`** with the server.
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# Worker client (`jd-worker`)
|
|
2
|
+
|
|
3
|
+
Workers run **`jd_worker`**, which requests jobs from the job server, executes your **entry script** with parameters as CLI flags, sends heartbeats, and reports **DONE** / **ABORTED**.
|
|
4
|
+
|
|
5
|
+
Full behaviour, environment variables, local paths (`~/jd_data/…`), and library helpers (`jd_upload`, checkpoints, `jd_job_dir`, …) are documented in **[`docs/jd-worker.md`](https://github.com/NWSL-UCF/job-distributor/blob/main/docs/jd-worker.md)**.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Install
|
|
10
|
+
|
|
11
|
+
From the repo root (editable):
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
cd job-distributor/client
|
|
15
|
+
python3 -m venv venv
|
|
16
|
+
source venv/bin/activate # Windows: venv\Scripts\activate
|
|
17
|
+
pip install -e .
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
Or install from GitHub (see `docs/jd-worker.md` for branch / subdirectory).
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Run
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
jd_worker expId=<experiment_id> entry_script=<your_script.py>
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Optional: `server=`, `port=`, `machine_type=`, `once=true`, etc. — see `jd_worker help` or **`docs/jd-worker.md`**.
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## Example workload
|
|
35
|
+
|
|
36
|
+
For an end-to-end ML-style example (MNIST tuning), see
|
|
37
|
+
[MNIST-parameter-tuning](https://github.com/NWSL-UCF/MNIST-parameter-tuning). Point **`entry_script`** at that repo’s training script and align **`expId`** with the server.
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""jd — Job Distributor client package."""
|
|
2
|
+
__version__ = "1.0.0"
|
|
3
|
+
|
|
4
|
+
from jd.files import jd_get_last_checkpoint, jd_update_checkpoint, jd_upload
|
|
5
|
+
from jd.paths import jd_exp_dir, jd_job_dir, jd_worker_workspace
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"jd_upload",
|
|
9
|
+
"jd_update_checkpoint",
|
|
10
|
+
"jd_get_last_checkpoint",
|
|
11
|
+
"jd_job_dir",
|
|
12
|
+
"jd_worker_workspace",
|
|
13
|
+
"jd_exp_dir",
|
|
14
|
+
]
|
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
"""
|
|
2
|
+
jd.files — worker-side file/checkpoint helpers
|
|
3
|
+
================================================
|
|
4
|
+
These functions are designed to be called **from inside your entry script**
|
|
5
|
+
while a job is running. jd_worker automatically injects the required
|
|
6
|
+
context (server URL, job ID, experiment ID) as environment variables, so
|
|
7
|
+
you normally do not need to pass those arguments explicitly.
|
|
8
|
+
|
|
9
|
+
Quick-start
|
|
10
|
+
-----------
|
|
11
|
+
from jd import jd_upload, jd_job_dir, jd_update_checkpoint, jd_get_last_checkpoint
|
|
12
|
+
|
|
13
|
+
out = jd_job_dir() / "metrics.csv"
|
|
14
|
+
# … write to out …
|
|
15
|
+
jd_upload(str(out))
|
|
16
|
+
|
|
17
|
+
jd_update_checkpoint({"epoch": 5, "state_dict": model.state_dict()})
|
|
18
|
+
ckpt = jd_get_last_checkpoint()
|
|
19
|
+
if ckpt:
|
|
20
|
+
model.load_state_dict(ckpt["state_dict"])
|
|
21
|
+
|
|
22
|
+
Environment variables (set automatically by jd_worker)
|
|
23
|
+
-------------------------------------------------------
|
|
24
|
+
JD_SERVER — job server base URL, e.g. http://10.0.0.1:8000
|
|
25
|
+
JD_JOB_ID — integer job ID assigned by the server
|
|
26
|
+
JD_EXP_ID — experiment identifier
|
|
27
|
+
JD_WORKER_JOB_DIR — absolute …/<parent>/jd_data/<expId>/<job_id>/
|
|
28
|
+
(same as ``--base_path``); prefer ``jd_job_dir()``
|
|
29
|
+
JD_WORKER_WORKSPACE_ROOT — absolute ``<parent>/jd_data`` (same as ``jd_worker_workspace()``)
|
|
30
|
+
|
|
31
|
+
You can override server/job_id via the upload/checkpoint function keyword arguments.
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
import io
|
|
35
|
+
import logging
|
|
36
|
+
import os
|
|
37
|
+
import pickle
|
|
38
|
+
|
|
39
|
+
import requests
|
|
40
|
+
|
|
41
|
+
logger = logging.getLogger(__name__)
|
|
42
|
+
|
|
43
|
+
_MAX_BYTES = 100 * 1024 * 1024 # 100 MB
|
|
44
|
+
_TIMEOUT = 180 # seconds for upload/download HTTP calls
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# ── Internal helpers ──────────────────────────────────────────────────────────
|
|
48
|
+
|
|
49
|
+
def _ctx(job_id, server):
|
|
50
|
+
"""Resolve job_id and server from kwargs → env vars → error."""
|
|
51
|
+
if job_id is None:
|
|
52
|
+
raw = os.environ.get("JD_JOB_ID", "")
|
|
53
|
+
if not raw:
|
|
54
|
+
raise RuntimeError(
|
|
55
|
+
"job_id not provided and JD_JOB_ID environment variable is not set. "
|
|
56
|
+
"Make sure you are calling this function from inside an entry script "
|
|
57
|
+
"launched by jd_worker."
|
|
58
|
+
)
|
|
59
|
+
job_id = int(raw)
|
|
60
|
+
if server is None:
|
|
61
|
+
server = os.environ.get("JD_SERVER", "")
|
|
62
|
+
if not server:
|
|
63
|
+
raise RuntimeError(
|
|
64
|
+
"server not provided and JD_SERVER environment variable is not set."
|
|
65
|
+
)
|
|
66
|
+
return job_id, server.rstrip("/")
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _auth_headers() -> dict:
|
|
70
|
+
"""Return Authorization header if JD_WORKER_TOKEN is set (Hub mode)."""
|
|
71
|
+
token = os.environ.get("JD_WORKER_TOKEN", "").strip()
|
|
72
|
+
if token:
|
|
73
|
+
return {"Authorization": f"Bearer {token}"}
|
|
74
|
+
return {}
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _check_size(data: bytes, label: str) -> None:
    """Reject a payload that is larger than ``_MAX_BYTES``.

    Parameters
    ----------
    data : bytes
        The payload about to be uploaded.
    label : str
        Human-readable name of the payload, used at the start of the error
        message.

    Raises
    ------
    ValueError
        If ``len(data)`` exceeds ``_MAX_BYTES``.
    """
    size = len(data)
    if size <= _MAX_BYTES:
        return
    mb = size / (1024 ** 2)
    raise ValueError(f"{label} is {mb:.1f} MB which exceeds the 100 MB limit.")
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# ── Public API ────────────────────────────────────────────────────────────────
|
|
86
|
+
|
|
87
|
+
def jd_upload(file_path: str, job_id: int = None, server: str = None) -> dict:
    """
    Upload a result file (≤ 100 MB) to the server.

    The file is stored in the experiment's job directory as::

        result_v{N}_{timestamp}.<original_ext>

    where N auto-increments across calls so every upload is preserved.

    Parameters
    ----------
    file_path : str
        Local path to the file you want to upload.
    job_id : int, optional
        Defaults to the JD_JOB_ID environment variable.
    server : str, optional
        Job server base URL.  Defaults to JD_SERVER environment variable.

    Returns
    -------
    dict
        The server's JSON reply, e.g.
        ``{"success": True, "filename": "result_v0_…", "version": 0, "size_bytes": …}``

    Raises
    ------
    RuntimeError
        If the job ID or server cannot be resolved (see ``_ctx``).
    FileNotFoundError
        If *file_path* is not an existing regular file.
    ValueError
        If the file is larger than the upload limit.
    requests.HTTPError
        If the server answers with an error status.
    """
    job_id, server = _ctx(job_id, server)

    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    original_name = os.path.basename(file_path)

    # Check the on-disk size first so an oversized file is rejected without
    # first being pulled into memory in its entirety.
    size_on_disk = os.path.getsize(file_path)
    if size_on_disk > _MAX_BYTES:
        mb = size_on_disk / (1024 ** 2)
        raise ValueError(
            f"File '{original_name}' is {mb:.1f} MB which exceeds the 100 MB limit."
        )

    with open(file_path, "rb") as fh:
        data = fh.read()

    # Re-check the bytes actually read: the file may have grown since the stat.
    _check_size(data, f"File '{original_name}'")

    logger.info(f"[jd_upload] Uploading '{original_name}' ({len(data)/(1024**2):.2f} MB) …")

    resp = requests.post(
        f"{server}/upload",
        data={"job_id": str(job_id)},
        files={"file": (original_name, io.BytesIO(data), "application/octet-stream")},
        headers=_auth_headers(),
        timeout=_TIMEOUT,
    )
    resp.raise_for_status()
    result = resp.json()
    logger.info(f"[jd_upload] Saved as '{result.get('filename')}' (version {result.get('version')})")
    return result
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def jd_update_checkpoint(obj, job_id: int = None, server: str = None) -> dict:
    """
    Pickle *obj* and upload it to the server as a new checkpoint version.

    The checkpoint is stored in the experiment's job directory as::

        checkpoint_v{N}_{timestamp}.pt

    Every call produces a new version, so earlier checkpoints are kept.
    The payload is a standard Python pickle (highest protocol), so any
    picklable object can be checkpointed.

    Parameters
    ----------
    obj : any
        The Python object to checkpoint (e.g. a dict holding
        ``model.state_dict()`` and optimizer state).
    job_id : int, optional
        Defaults to the JD_JOB_ID environment variable.
    server : str, optional
        Job server base URL.  Defaults to the JD_SERVER environment variable.

    Returns
    -------
    dict
        The server's JSON reply, e.g.
        ``{"success": True, "filename": "checkpoint_v0_…", "version": 0, …}``
    """
    job_id, server = _ctx(job_id, server)

    logger.info("[jd_update_checkpoint] Serialising checkpoint …")
    payload = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
    _check_size(payload, "Checkpoint")

    size_mb = len(payload) / (1024 ** 2)
    logger.info(f"[jd_update_checkpoint] Uploading {size_mb:.2f} MB …")

    form_fields = {"job_id": str(job_id)}
    attachment = {
        "checkpoint": ("checkpoint.pkl", io.BytesIO(payload), "application/octet-stream"),
    }
    response = requests.post(
        f"{server}/checkpoint",
        data=form_fields,
        files=attachment,
        headers=_auth_headers(),
        timeout=_TIMEOUT,
    )
    response.raise_for_status()

    result = response.json()
    logger.info(f"[jd_update_checkpoint] Saved as '{result.get('filename')}' (version {result.get('version')})")
    return result
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def jd_get_last_checkpoint(job_id: int = None, server: str = None):
    """
    Download the highest-versioned checkpoint for this job and return it as
    a Python object — **nothing is written to disk**.

    The server sends the raw pickle bytes; this function deserialises them
    directly from an in-memory buffer so your script can resume immediately.

    .. warning::
       The payload is loaded with ``pickle``, which can execute arbitrary
       code while deserialising.  Only point this at a job server you trust.

    Parameters
    ----------
    job_id : int, optional
        Defaults to the JD_JOB_ID environment variable.
    server : str, optional
        Job server base URL.  Defaults to the JD_SERVER environment variable.

    Returns
    -------
    object
        The Python object that was passed to ``jd_update_checkpoint``, or
        ``None`` if the server answers 404 (no checkpoint for this job yet).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status other than 404.

    Examples
    --------
    ::

        ckpt = jd_get_last_checkpoint()
        if ckpt is not None:
            model.load_state_dict(ckpt["state_dict"])
            start_epoch = ckpt["epoch"] + 1
    """
    job_id, server = _ctx(job_id, server)

    logger.info(f"[jd_get_last_checkpoint] Fetching latest checkpoint for job {job_id} …")

    # With stream=True the connection stays checked out until the body is
    # consumed or the response is closed; the with-block releases it on every
    # path, including the 404 early return and raise_for_status().
    with requests.get(
        f"{server}/checkpoint/latest",
        params={"job_id": job_id},
        headers=_auth_headers(),
        timeout=_TIMEOUT,
        stream=True,
    ) as resp:
        if resp.status_code == 404:
            logger.info("[jd_get_last_checkpoint] No checkpoint found.")
            return None

        resp.raise_for_status()

        # Read directly into a BytesIO buffer — no temp file, stays in memory
        buf = io.BytesIO()
        for chunk in resp.iter_content(chunk_size=1024 * 256):
            buf.write(chunk)

    logger.info(f"[jd_get_last_checkpoint] Received {buf.tell()/(1024**2):.2f} MB. Deserialising …")
    buf.seek(0)
    # SECURITY: pickle.load on bytes received over the network trusts the
    # server completely — a malicious or compromised server can run code here.
    obj = pickle.load(buf)
    logger.info("[jd_get_last_checkpoint] Checkpoint ready.")
    return obj
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Paths injected by jd_worker for entry scripts
|
|
3
|
+
==============================================
|
|
4
|
+
Layout on the worker::
|
|
5
|
+
|
|
6
|
+
<parent>/jd_data/<expId>/<job_id>/ ← save job outputs here (jd_job_dir)
|
|
7
|
+
|
|
8
|
+
``parent`` is ``JD_WORKSPACE_PATH`` if set, otherwise your home directory.
|
|
9
|
+
``jd_worker_workspace()`` returns the resolved ``…/jd_data`` directory.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import os
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def jd_job_dir() -> Path:
    """
    Return the absolute directory for **this job's** local files.

    The location is taken from the ``JD_WORKER_JOB_DIR`` environment
    variable (the same value the entry script receives as ``--base_path``),
    with ``~`` expanded and the path resolved.  Use it for CSVs, on-disk
    checkpoints, temp files, etc.

    Raises ``RuntimeError`` if ``JD_WORKER_JOB_DIR`` is unset or blank.
    """
    job_dir = os.environ.get("JD_WORKER_JOB_DIR", "").strip()
    if job_dir:
        return Path(job_dir).expanduser().resolve()
    raise RuntimeError(
        "JD_WORKER_JOB_DIR is not set. Run your script via jd_worker "
        "(or set JD_WORKER_JOB_DIR to your job sandbox directory)."
    )
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def jd_worker_workspace() -> Path:
    """
    Return the absolute **jd_data root** (``<parent>/jd_data``).

    Resolution order:

    1. ``JD_WORKER_WORKSPACE_ROOT``, when set and non-blank.
    2. Otherwise two levels above ``JD_WORKER_JOB_DIR``
       (``…/jd_data/<expId>/<job_id>`` → ``…/jd_data``).

    Raises ``RuntimeError`` when neither variable is available.
    """
    root = os.environ.get("JD_WORKER_WORKSPACE_ROOT", "").strip()
    if root:
        return Path(root).expanduser().resolve()

    job_dir = os.environ.get("JD_WORKER_JOB_DIR", "").strip()
    if not job_dir:
        raise RuntimeError(
            "Cannot resolve worker workspace: set JD_WORKER_WORKSPACE_ROOT or "
            "run inside jd_worker so JD_WORKER_JOB_DIR is set."
        )
    resolved_job_dir = Path(job_dir).expanduser().resolve()
    exp_dir = resolved_job_dir.parent
    return exp_dir.parent
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def jd_exp_dir() -> Path:
    """Return the experiment directory, i.e. the parent of ``jd_job_dir()``."""
    job_dir = jd_job_dir()
    return job_dir.parent
|
|
@@ -0,0 +1,621 @@
|
|
|
1
|
+
"""
|
|
2
|
+
jd_worker — Job Distributor Worker CLI
|
|
3
|
+
=======================================
|
|
4
|
+
Requests jobs from a jd server, runs the entry script with the job's
|
|
5
|
+
parameters as CLI flags, sends a heartbeat ping every 57 seconds, and
|
|
6
|
+
reports DONE or ABORTED when the script finishes.
|
|
7
|
+
|
|
8
|
+
Usage
|
|
9
|
+
-----
|
|
10
|
+
jd_worker expId=<id> entry_script=<script.py> [options]
|
|
11
|
+
|
|
12
|
+
Required
|
|
13
|
+
--------
|
|
14
|
+
expId=<id> Experiment identifier (must match the server).
|
|
15
|
+
entry_script=<path> Python script to run for each job.
|
|
16
|
+
|
|
17
|
+
Optional
|
|
18
|
+
--------
|
|
19
|
+
(no workspace CLI) Local data lives under ``<parent>/jd_data/<expId>/<job_id>/``.
|
|
20
|
+
``parent`` is ``JD_WORKSPACE_PATH`` if set, otherwise
|
|
21
|
+
``~`` (your home directory). So the default root is
|
|
22
|
+
``~/jd_data/``. Passed to the entry script as
|
|
23
|
+
``--base_path``, ``JD_WORKER_JOB_DIR``, and
|
|
24
|
+
``JD_WORKER_WORKSPACE_ROOT`` (= ``.../jd_data``).
|
|
25
|
+
|
|
26
|
+
Note: ``JD_WORKSPACE_PATH`` here is **worker-only**
|
|
27
|
+
(parent of ``jd_data``). The job server uses the same
|
|
28
|
+
variable name for its own layout — avoid exporting one
|
|
29
|
+
value for both in the same shell.
|
|
30
|
+
server=<url> Job server base URL or host (default: http://localhost,
|
|
31
|
+
env: JD_SERVER). If you omit ``http://`` or ``https://``,
|
|
32
|
+
``http://`` is assumed.
|
|
33
|
+
port=<N> Port if not included in server URL
|
|
34
|
+
(default: 5000, env: JD_PORT)
|
|
35
|
+
log_dir=<path> If set, logs go under <log_dir>/<expId>/; otherwise
|
|
36
|
+
under <jd_data>/<expId>/jd_worker_logs/
|
|
37
|
+
(env: JD_LOG_DIR)
|
|
38
|
+
machine_type=<type> Label for this machine in the dashboard
|
|
39
|
+
(default: worker, env: JD_MACHINE_TYPE)
|
|
40
|
+
process_id=<N> Numeric ID when running multiple workers on the
|
|
41
|
+
same machine (default: 0)
|
|
42
|
+
once=true Exit after completing a single job instead of
|
|
43
|
+
looping until no jobs remain.
|
|
44
|
+
|
|
45
|
+
Examples
|
|
46
|
+
--------
|
|
47
|
+
jd_worker expId=mnist_tune entry_script=train.py
|
|
48
|
+
|
|
49
|
+
# Optional: put jd_data under /scratch/jd_data (parent=/scratch)
|
|
50
|
+
JD_WORKSPACE_PATH=/scratch jd_worker expId=mnist_tune entry_script=train.py \\
|
|
51
|
+
server=http://10.0.0.5 port=8000 \\
|
|
52
|
+
machine_type=gpu_node
|
|
53
|
+
|
|
54
|
+
# Run exactly one job:
|
|
55
|
+
jd_worker expId=mnist_tune entry_script=train.py once=true
|
|
56
|
+
|
|
57
|
+
Install
|
|
58
|
+
-------
|
|
59
|
+
pip install -e ./client # from the repo root
|
|
60
|
+
# then `jd_worker` is available in whatever env is active
|
|
61
|
+
"""
|
|
62
|
+
|
|
63
|
+
import logging
|
|
64
|
+
import os
|
|
65
|
+
import platform
|
|
66
|
+
import random
|
|
67
|
+
import signal
|
|
68
|
+
import socket
|
|
69
|
+
import subprocess
|
|
70
|
+
import sys
|
|
71
|
+
import threading
|
|
72
|
+
import time
|
|
73
|
+
from urllib.parse import urlparse, urlunparse
|
|
74
|
+
|
|
75
|
+
import psutil
|
|
76
|
+
import requests
|
|
77
|
+
|
|
78
|
+
from jd import __version__
|
|
79
|
+
|
|
80
|
+
IS_WINDOWS = platform.system() == "Windows"
|
|
81
|
+
PING_INTERVAL = 57 # seconds — intentionally not 60 to avoid racing the idle timeout
|
|
82
|
+
|
|
83
|
+
# Fixed subdirectory under JD_WORKSPACE_PATH (or home): …/jd_data/<expId>/<job_id>/
|
|
84
|
+
_WORKER_JD_DATA_DIRNAME = "jd_data"
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
# ── Argument parsing ─────────────────────────────────────────────────────────
|
|
88
|
+
|
|
89
|
+
def _parse_kv(argv: list) -> dict:
|
|
90
|
+
"""Parse key=value positional arguments into a plain dict."""
|
|
91
|
+
cfg = {}
|
|
92
|
+
for arg in argv:
|
|
93
|
+
if '=' in arg:
|
|
94
|
+
k, v = arg.split('=', 1)
|
|
95
|
+
cfg[k.strip()] = v.strip()
|
|
96
|
+
else:
|
|
97
|
+
cfg[arg.strip()] = 'true'
|
|
98
|
+
return cfg
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _normalize_server_base_url(server_raw: str, port_raw: str) -> str:
|
|
102
|
+
"""
|
|
103
|
+
Build a base URL that requests/lib can open.
|
|
104
|
+
|
|
105
|
+
Host-only values like ``localhost`` must become ``http://localhost:<port>``.
|
|
106
|
+
Without a scheme, Python requests raises "No connection adapters were found".
|
|
107
|
+
"""
|
|
108
|
+
s = (server_raw or "").strip().rstrip("/")
|
|
109
|
+
if not s:
|
|
110
|
+
s = "http://localhost"
|
|
111
|
+
if not urlparse(s).scheme:
|
|
112
|
+
s = "http://" + s.lstrip("/")
|
|
113
|
+
parsed = urlparse(s)
|
|
114
|
+
if parsed.port is not None:
|
|
115
|
+
return s
|
|
116
|
+
host = parsed.hostname
|
|
117
|
+
if not host:
|
|
118
|
+
host = "localhost"
|
|
119
|
+
netloc = f"{host}:{port_raw}"
|
|
120
|
+
return urlunparse(
|
|
121
|
+
(parsed.scheme, netloc, parsed.path or "", "", "", "")
|
|
122
|
+
).rstrip("/")
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _resolve(cfg: dict) -> dict:
    """Combine CLI ``key=value`` settings with environment variables and defaults.

    For each setting the precedence is: CLI value (whenever the key is
    present, even if empty) → non-empty environment variable → default.
    As a side effect the worker's ``jd_data`` directory is created if it
    does not exist yet.

    Returns a dict with the keys ``exp_id``, ``entry_script``, ``base_url``,
    ``workspace_path``, ``log_dir_override``, ``machine_type``,
    ``process_id``, ``once``, ``hub_url`` and ``api_key``.
    """

    def lookup(cli_key, env_key=None, default=None):
        # CLI wins outright; the environment only counts when non-empty.
        if cli_key in cfg:
            return cfg[cli_key]
        env_val = os.environ.get(env_key, '') if env_key else ''
        return env_val if env_val else default

    server_raw = lookup('server', 'JD_SERVER', 'http://localhost')
    port_raw = str(lookup('port', 'JD_PORT', '5000')).strip()
    base_url = _normalize_server_base_url(server_raw, port_raw)

    # …/<parent>/jd_data/<expId>/<job_id>/ — parent comes from the env or ~
    parent = os.environ.get("JD_WORKSPACE_PATH", "").strip() or os.path.expanduser("~")
    parent = os.path.abspath(os.path.expanduser(parent))
    workspace_path = os.path.join(parent, _WORKER_JD_DATA_DIRNAME)
    os.makedirs(workspace_path, exist_ok=True)

    # Log directory override: CLI first, then JD_LOG_DIR, else no override.
    if 'log_dir' in cfg:
        log_override = os.path.expanduser(cfg['log_dir'].strip())
    else:
        env_log_dir = os.environ.get('JD_LOG_DIR', '').strip()
        log_override = os.path.expanduser(env_log_dir) if env_log_dir else None

    resolved = {}
    resolved['exp_id'] = lookup('expId', 'JD_EXP_ID', None)
    resolved['entry_script'] = lookup('entry_script', 'JD_ENTRY_SCRIPT', None)
    resolved['base_url'] = base_url
    resolved['workspace_path'] = workspace_path
    resolved['log_dir_override'] = log_override
    resolved['machine_type'] = lookup('machine_type', 'JD_MACHINE_TYPE', 'worker')
    resolved['process_id'] = lookup('process_id', None, '0')
    resolved['once'] = lookup('once', 'JD_ONCE', 'false').lower() == 'true'
    # Hub authentication (optional)
    resolved['hub_url'] = lookup('hub_url', 'JD_HUB_URL', '').strip().rstrip('/')
    resolved['api_key'] = lookup('api_key', 'JD_API_KEY', '').strip()
    return resolved
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
# ── Hub authentication ────────────────────────────────────────────────────────
|
|
172
|
+
|
|
173
|
+
def _hub_get_worker_token(hub_url: str, api_key: str,
|
|
174
|
+
exp_id: str, logger: logging.Logger) -> tuple[str, str] | None:
|
|
175
|
+
"""
|
|
176
|
+
Obtain a worker JWT and server URL from the Hub.
|
|
177
|
+
|
|
178
|
+
Returns (worker_token, server_url) on success, None on failure.
|
|
179
|
+
The returned server_url overrides any CLI-supplied server address.
|
|
180
|
+
"""
|
|
181
|
+
endpoint = f"{hub_url}/api/worker/token"
|
|
182
|
+
try:
|
|
183
|
+
r = requests.post(
|
|
184
|
+
endpoint,
|
|
185
|
+
json={"experiment_name": exp_id},
|
|
186
|
+
headers={"Authorization": f"Bearer {api_key}"},
|
|
187
|
+
timeout=30,
|
|
188
|
+
)
|
|
189
|
+
if r.status_code == 200:
|
|
190
|
+
data = r.json()
|
|
191
|
+
token = data.get("worker_token", "")
|
|
192
|
+
server_url = data.get("server_url", "")
|
|
193
|
+
expires_at = data.get("expires_at", "unknown")
|
|
194
|
+
logger.info(f"Worker token obtained from Hub (expires: {expires_at})")
|
|
195
|
+
return token, server_url
|
|
196
|
+
logger.error(f"Hub token request failed: HTTP {r.status_code} — {r.text[:300]}")
|
|
197
|
+
return None
|
|
198
|
+
except Exception as exc:
|
|
199
|
+
logger.error(f"Hub connection error: {exc}")
|
|
200
|
+
return None
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
# ── Logging ───────────────────────────────────────────────────────────────────
|
|
204
|
+
|
|
205
|
+
def _setup_logger(log_dir: str, runner_id: str) -> logging.Logger:
|
|
206
|
+
os.makedirs(log_dir, exist_ok=True)
|
|
207
|
+
log_path = os.path.join(log_dir, f"jd_worker_{runner_id}.log")
|
|
208
|
+
|
|
209
|
+
logger = logging.getLogger(f'jd_worker.{runner_id}')
|
|
210
|
+
logger.setLevel(logging.INFO)
|
|
211
|
+
|
|
212
|
+
fmt = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
|
|
213
|
+
fh = logging.FileHandler(log_path, encoding='utf-8')
|
|
214
|
+
fh.setFormatter(fmt)
|
|
215
|
+
sh = logging.StreamHandler(sys.stdout)
|
|
216
|
+
sh.setFormatter(fmt)
|
|
217
|
+
|
|
218
|
+
logger.addHandler(fh)
|
|
219
|
+
logger.addHandler(sh)
|
|
220
|
+
logger.propagate = False
|
|
221
|
+
return logger
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
# ── System metrics ────────────────────────────────────────────────────────────
|
|
225
|
+
|
|
226
|
+
def _collect_metrics(machine_type: str, logger: logging.Logger) -> dict:
|
|
227
|
+
"""Collect a single snapshot of system metrics."""
|
|
228
|
+
try:
|
|
229
|
+
cpu_pct = psutil.cpu_percent(interval=0.1)
|
|
230
|
+
cpu_phys = psutil.cpu_count(logical=False) or 1
|
|
231
|
+
cpu_logi = psutil.cpu_count(logical=True) or 1
|
|
232
|
+
|
|
233
|
+
try:
|
|
234
|
+
freq = psutil.cpu_freq()
|
|
235
|
+
freq_mhz = freq.current if freq else 0
|
|
236
|
+
except (AttributeError, RuntimeError):
|
|
237
|
+
freq_mhz = 0
|
|
238
|
+
|
|
239
|
+
mem = psutil.virtual_memory()
|
|
240
|
+
ram_total_gb = mem.total / (1024 ** 3)
|
|
241
|
+
ram_avail_gb = mem.available / (1024 ** 3)
|
|
242
|
+
|
|
243
|
+
if not IS_WINDOWS:
|
|
244
|
+
try:
|
|
245
|
+
la = os.getloadavg()
|
|
246
|
+
load1, load5, load15 = la[0], la[1], la[2]
|
|
247
|
+
except (AttributeError, OSError):
|
|
248
|
+
load1 = load5 = load15 = 0.0
|
|
249
|
+
load_per_cpu = load1 / cpu_logi if cpu_logi > 0 else 0.0
|
|
250
|
+
idle_slots = max(0, cpu_logi - load1)
|
|
251
|
+
else:
|
|
252
|
+
load1 = load5 = load15 = cpu_pct / 100.0 * cpu_logi
|
|
253
|
+
load_per_cpu = cpu_pct / 100.0
|
|
254
|
+
idle_slots = max(0, cpu_logi * (1 - cpu_pct / 100.0))
|
|
255
|
+
|
|
256
|
+
try:
|
|
257
|
+
d0 = psutil.disk_io_counters()
|
|
258
|
+
time.sleep(0.1)
|
|
259
|
+
d1 = psutil.disk_io_counters()
|
|
260
|
+
if d0 and d1:
|
|
261
|
+
dt = 0.1
|
|
262
|
+
ops = (d1.read_count - d0.read_count +
|
|
263
|
+
d1.write_count - d0.write_count) / dt
|
|
264
|
+
bps = (d1.read_bytes - d0.read_bytes +
|
|
265
|
+
d1.write_bytes - d0.write_bytes) / dt
|
|
266
|
+
disk_io_util = max(min(100.0, (ops / 100_000) * 100),
|
|
267
|
+
min(100.0, (bps / 1e9) * 100))
|
|
268
|
+
else:
|
|
269
|
+
disk_io_util = 0.0
|
|
270
|
+
except (AttributeError, RuntimeError, TypeError):
|
|
271
|
+
disk_io_util = 0.0
|
|
272
|
+
|
|
273
|
+
return {
|
|
274
|
+
"cpu_util": round(cpu_pct, 1),
|
|
275
|
+
"ram_util": round(mem.percent, 1),
|
|
276
|
+
"ram_available": round(ram_avail_gb, 15),
|
|
277
|
+
"ram_total": round(ram_total_gb, 1),
|
|
278
|
+
"worker_type": machine_type,
|
|
279
|
+
"idle_slots": int(round(idle_slots)),
|
|
280
|
+
"load_1min": round(load1, 10),
|
|
281
|
+
"load_5min": round(load5, 10),
|
|
282
|
+
"load_15min": round(load15, 10),
|
|
283
|
+
"load_per_cpu": round(load_per_cpu, 13),
|
|
284
|
+
"disk_io_util": round(disk_io_util, 2),
|
|
285
|
+
"cpu_cores": cpu_phys,
|
|
286
|
+
"cpu_threads": cpu_logi,
|
|
287
|
+
"cpu_freq_mhz": int(round(freq_mhz)) if freq_mhz > 0 else 0,
|
|
288
|
+
}
|
|
289
|
+
except Exception as exc:
|
|
290
|
+
logger.error(f"Metrics collection error: {exc}")
|
|
291
|
+
return {k: 0 for k in (
|
|
292
|
+
"cpu_util", "ram_util", "ram_available", "ram_total",
|
|
293
|
+
"idle_slots", "load_1min", "load_5min", "load_15min",
|
|
294
|
+
"load_per_cpu", "disk_io_util", "cpu_cores", "cpu_threads",
|
|
295
|
+
"cpu_freq_mhz",
|
|
296
|
+
)} | {"worker_type": machine_type}
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def _averaged_metrics(machine_type: str, logger: logging.Logger,
                      samples: int = 5, interval: float = 3.0) -> dict:
    """Collect `samples` snapshots and return their numeric averages.

    Snapshots come from ``_collect_metrics`` and are taken *interval* seconds
    apart (no sleep after the last one).  ``cpu_cores`` and ``cpu_threads``
    are copied from the first snapshot; every other numeric field is
    averaged and rounded to the same precision a single snapshot uses.

    Raises ``ValueError`` if *samples* is less than 1 (previously this
    surfaced as an ``IndexError`` after the function had already logged).
    """
    if samples < 1:
        raise ValueError(f"samples must be at least 1, got {samples}")

    logger.info(f"Collecting system metrics ({samples} samples × {interval}s)…")
    snapshots = []
    for i in range(samples):
        logger.info(f"  Sample {i+1}/{samples}")
        snapshots.append(_collect_metrics(machine_type, logger))
        if i < samples - 1:
            time.sleep(interval)

    # Decimal places per averaged field; ``None`` means "round to an int".
    precision = {
        "cpu_util": 1,
        "ram_util": 1,
        "ram_available": 15,
        "ram_total": 1,
        "idle_slots": None,
        "load_1min": 10,
        "load_5min": 10,
        "load_15min": 10,
        "load_per_cpu": 13,
        "disk_io_util": 2,
        "cpu_freq_mhz": None,
    }
    result = {"worker_type": machine_type,
              "cpu_cores": snapshots[0]["cpu_cores"],
              "cpu_threads": snapshots[0]["cpu_threads"]}
    for key, digits in precision.items():
        avg = sum(s[key] for s in snapshots) / len(snapshots)
        result[key] = int(round(avg)) if digits is None else round(avg, digits)
    logger.info("System metrics ready.")
    return result
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
# ── Server communication ──────────────────────────────────────────────────────
|
|
338
|
+
|
|
339
|
+
def _auth_headers(worker_token: str) -> dict:
|
|
340
|
+
"""Return Authorization header dict if worker_token is set, else empty."""
|
|
341
|
+
if worker_token:
|
|
342
|
+
return {"Authorization": f"Bearer {worker_token}"}
|
|
343
|
+
return {}
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def _request_job(url: str, runner_id: str, metrics: dict,
                 logger: logging.Logger, worker_token: str = ""):
    """Ask the job server for the next job.

    POSTs the runner id and the collected system metrics to ``url`` with a
    30-second timeout.

    Args:
        url: Endpoint to POST the job request to.
        runner_id: Identifier sent as ``requested_by``.
        metrics: Metrics dict sent as ``system_metrics``.
        logger: Accepted for signature parity with the other server helpers;
            not used by this function.
        worker_token: Optional bearer token; forwarded to ``_auth_headers``.

    Returns:
        A ``(job, reason)`` pair:
        * HTTP 200 -> ``(decoded JSON body, None)``
        * HTTP 404 -> ``(None, 'no_jobs')``
        * any other status -> ``(None, "HTTP <code>: <first 200 chars of body>")``
        * any exception (network error, undecodable JSON, ...) ->
          ``(None, str(exception))``
    """
    payload = {"requested_by": runner_id, "system_metrics": metrics}
    try:
        response = requests.post(
            url,
            json=payload,
            headers=_auth_headers(worker_token),
            timeout=30,
        )
        code = response.status_code
        if code == 200:
            outcome = (response.json(), None)
        elif code == 404:
            # The caller treats this exact reason string as "queue is empty".
            outcome = (None, 'no_jobs')
        else:
            outcome = (None, f"HTTP {code}: {response.text[:200]}")
    except Exception as exc:
        # Best-effort: every failure is reported to the caller as a reason.
        outcome = (None, str(exc))
    return outcome
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
def _update_status(url: str, job_id: int, status: str,
                   message: str, logger: logging.Logger,
                   worker_token: str = "") -> None:
    """Report a job's status to the server and log the outcome.

    POSTs ``job_id``, ``status`` and ``message`` as JSON to ``url`` with a
    30-second timeout. This function never raises: a non-200 reply is logged
    as a warning and any exception is logged as an error.

    Args:
        url: Status-update endpoint.
        job_id: Identifier of the job being reported.
        status: New status value to send.
        message: Free-text detail sent alongside the status.
        logger: Logger that receives the success/failure record.
        worker_token: Optional bearer token; forwarded to ``_auth_headers``.
    """
    body = {"job_id": job_id, "status": status, "message": message}
    try:
        reply = requests.post(
            url,
            json=body,
            headers=_auth_headers(worker_token),
            timeout=30,
        )
        if reply.status_code != 200:
            logger.warning(f"Status update failed: HTTP {reply.status_code}")
        else:
            logger.info(f"Job {job_id} → {status}")
    except Exception as exc:
        logger.error(f"Status update error: {exc}")
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def _ping_loop(url: str, job_id: int,
               stop_event: threading.Event,
               logger: logging.Logger,
               worker_token: str = "") -> None:
    """Heartbeat loop meant to run on a background thread.

    Waits ``PING_INTERVAL`` seconds on ``stop_event``; if the event is set
    during (or before) the wait the function returns, otherwise it POSTs
    ``{"id": job_id}`` to ``url`` (10-second timeout) and waits again.
    Ping failures are only logged, so the loop keeps running until stopped.

    Args:
        url: Ping endpoint.
        job_id: Identifier of the job being kept alive.
        stop_event: Set by the owner to end the loop.
        logger: Logger that receives one record per ping attempt.
        worker_token: Optional bearer token; forwarded to ``_auth_headers``.
    """
    beat = {"id": job_id}
    while True:
        # wait() doubles as the sleep between pings and as the stop check.
        if stop_event.wait(PING_INTERVAL):
            return
        try:
            reply = requests.post(
                url,
                json=beat,
                headers=_auth_headers(worker_token),
                timeout=10,
            )
            if reply.status_code != 200:
                logger.warning(f"Ping HTTP {reply.status_code} (job {job_id})")
            else:
                logger.info(f"Ping OK (job {job_id})")
        except Exception as exc:
            logger.warning(f"Ping error (job {job_id}): {exc}")
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
# ── Main entry point ──────────────────────────────────────────────────────────
|
|
399
|
+
|
|
400
|
+
def main() -> None:
    """Command-line entry point: fetch jobs from the server and run them.

    Flow:
      1. Parse ``key=value`` arguments (or print the module help and exit 0).
      2. Validate ``expId`` / ``entry_script``; exit 1 with messages on error.
      3. Build a runner id, set up logging, optionally authenticate via Hub.
      4. Loop: collect metrics, request a job, run the entry script in a
         subprocess while a heartbeat thread pings the server, then report
         ``DONE`` (exit code 0) or ``ABORTED`` (anything else).

    The loop ends when the server answers "no jobs", when ``once`` is set,
    or when an unexpected exception occurs. SIGINT/SIGTERM terminate the
    running child process (group) and exit with status 0.
    """
    argv = sys.argv[1:]

    # Show help
    if not argv or any(a in argv for a in ('help', '-h', '--help')):
        print(__doc__)
        sys.exit(0)

    kv = _parse_kv(argv)
    cfg = _resolve(kv)

    # Validate required arguments
    errors = []
    if not cfg['exp_id']:
        errors.append("expId is required")
    if not cfg['entry_script']:
        errors.append("entry_script is required")
    elif not os.path.isfile(cfg['entry_script']):
        errors.append(f"entry_script '{cfg['entry_script']}' not found")
    if errors:
        for e in errors:
            print(f"Error: {e}")
        print("Run `jd_worker help` for usage.")
        sys.exit(1)

    # Build a unique runner ID visible in the dashboard
    username = os.getenv('USER') or os.getenv('USERNAME') or 'user'
    random.seed(int(time.time() * 1000))
    suffix = random.randint(10000, 99999)
    runner_id = (f"{username}@{socket.gethostname()}"
                 f"({cfg['machine_type']})_{cfg['process_id']}_{suffix}")

    if cfg['log_dir_override'] is not None:
        log_dir = os.path.join(cfg['log_dir_override'], cfg['exp_id'])
    else:
        log_dir = os.path.join(cfg['workspace_path'], cfg['exp_id'], 'jd_worker_logs')
    logger = _setup_logger(log_dir, runner_id)

    # ── Hub authentication (optional) ────────────────────────────────────────
    worker_token = ""
    if cfg['hub_url'] and cfg['api_key']:
        logger.info(f"Hub mode: authenticating via {cfg['hub_url']}")
        result = _hub_get_worker_token(
            cfg['hub_url'], cfg['api_key'], cfg['exp_id'], logger
        )
        if result is None:
            logger.error("Failed to obtain worker token from Hub. Exiting.")
            sys.exit(1)
        worker_token, hub_server_url = result
        if hub_server_url:
            # The Hub may redirect this worker to a specific job server.
            cfg['base_url'] = hub_server_url
            logger.info(f"Server URL from Hub: {hub_server_url}")
    elif cfg['hub_url'] or cfg['api_key']:
        logger.warning(
            "Both hub_url (JD_HUB_URL) and api_key (JD_API_KEY) must be set "
            "for Hub authentication. Running in standalone mode."
        )

    urls = {
        'request': f"{cfg['base_url']}/request_job",
        'update': f"{cfg['base_url']}/update_job_status",
        'ping': f"{cfg['base_url']}/ping",
    }

    logger.info(f"jd_worker v{__version__} | runner: {runner_id}")
    logger.info(f"Server: {cfg['base_url']}")
    logger.info(f"Entry script: {cfg['entry_script']}")
    logger.info(f"Hub mode: {'enabled' if worker_token else 'disabled'}")
    logger.info(f"Local jd_data root: {cfg['workspace_path']} "
                f"(each job: …/jd_data/<expId>/<job_id>/)")
    logger.info(f"Ping interval: {PING_INTERVAL}s")
    if cfg['once']:
        logger.info("Mode: single job (once=true)")

    # Track the current child process so SIGINT/SIGTERM can clean it up.
    # A one-element list lets the nested handler see later reassignments.
    _proc: list = [None]

    def _shutdown(signum=None, frame=None):
        """Signal handler: terminate the running child (if any) and exit 0."""
        p = _proc[0]
        if p and p.poll() is None:
            logger.info(f"Killing subprocess PID {p.pid}…")
            try:
                if IS_WINDOWS:
                    p.send_signal(signal.CTRL_BREAK_EVENT)
                else:
                    # The child leads its own session/process group, so this
                    # also reaches any processes it spawned.
                    os.killpg(os.getpgid(p.pid), signal.SIGTERM)
            except Exception as exc:
                # Best-effort: the child may already be gone. Record it
                # instead of swallowing silently, but still shut down.
                logger.warning(f"Could not signal subprocess: {exc}")
        logger.info("jd_worker shut down.")
        sys.exit(0)

    signal.signal(signal.SIGINT, _shutdown)
    signal.signal(signal.SIGTERM, _shutdown)

    # ── Main job loop ────────────────────────────────────────────────────────
    while True:
        job_id = None
        stop_ping = None  # set once the heartbeat thread exists for this job
        try:
            metrics = _averaged_metrics(cfg['machine_type'], logger)
            job, reason = _request_job(urls['request'], runner_id, metrics, logger,
                                       worker_token=worker_token)

            if job is None:
                if reason == 'no_jobs':
                    logger.info("No more jobs available. Exiting.")
                    break
                logger.error(f"Job request failed: {reason}. Retrying in 10 s…")
                time.sleep(10)
                continue

            job_id = job['job_id']
            params = job['parameters']
            logger.info(f"Job {job_id} received | params: {params}")

            # Local sandbox for this job — keep worker-side I/O under this directory
            job_root = os.path.abspath(os.path.join(
                cfg['workspace_path'], cfg['exp_id'], str(job_id)))
            os.makedirs(job_root, exist_ok=True)

            # Build subprocess command
            # Uses the *current* Python interpreter so venv/conda is respected
            cmd = [sys.executable, cfg['entry_script']]
            for k, v in params.items():
                cmd.extend([f"--{k}", str(v)])
            cmd.extend(['--base_path', job_root])
            logger.info(f"Job workspace: {job_root}")
            logger.info(f"Command: {' '.join(cmd)}")

            # Start heartbeat ping thread
            stop_ping = threading.Event()
            pinger = threading.Thread(
                target=_ping_loop,
                args=(urls['ping'], job_id, stop_ping, logger, worker_token),
                daemon=True,
            )
            pinger.start()

            # Build child environment: inherit everything + inject JD_ context
            # so jd_upload / jd_update_checkpoint / jd_get_last_checkpoint work
            # inside the entry script without requiring explicit arguments.
            child_env = os.environ.copy()
            child_env["JD_JOB_ID"] = str(job_id)
            child_env["JD_SERVER"] = cfg["base_url"]
            child_env["JD_EXP_ID"] = cfg["exp_id"]
            child_env["JD_WORKER_JOB_DIR"] = job_root
            child_env["JD_WORKER_WORKSPACE_ROOT"] = cfg["workspace_path"]
            if worker_token:
                child_env["JD_WORKER_TOKEN"] = worker_token

            # Launch the entry script in its own process group so that
            # _shutdown can signal the whole group.
            popen_kw = dict(stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                            text=True, env=child_env)
            if IS_WINDOWS:
                popen_kw['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
            else:
                # start_new_session=True performs setsid() in the child.
                # It replaces preexec_fn=os.setsid, which is not safe once
                # other threads (the heartbeat) are running in this process.
                popen_kw['start_new_session'] = True
            _proc[0] = subprocess.Popen(cmd, **popen_kw)

            stdout, stderr = _proc[0].communicate()
            rc = _proc[0].returncode

            # Stop heartbeat
            stop_ping.set()
            pinger.join(timeout=5)

            if rc == 0:
                logger.info(f"Job {job_id} finished successfully.")
                _update_status(
                    urls['update'], job_id, 'DONE',
                    f"Completed successfully on {runner_id}.",
                    logger, worker_token=worker_token,
                )
            else:
                logger.error(f"Job {job_id} failed — exit code {rc}")
                if stderr.strip():
                    logger.error(f"STDERR (last 1000 chars):\n{stderr.strip()[-1000:]}")

                abort_msg = (
                    f"Job failed on {runner_id}. "
                    f"Exit code {rc}."
                )
                # Only forward the output tail when it looks like an error
                # report; otherwise point the user at the worker logs.
                snippet = (stderr.strip() or stdout.strip())[-500:]
                if snippet and any(kw in snippet.lower()
                                   for kw in ('error', 'exception', 'traceback', 'failed')):
                    abort_msg += f" Last output: {snippet}"
                else:
                    abort_msg += " Check worker logs for details."

                # A negative return code is the signal number; -9 is SIGKILL.
                if rc == -9:
                    abort_msg += " (Process killed — possible OOM or time limit.)"

                _update_status(urls['update'], job_id, 'ABORTED', abort_msg, logger,
                               worker_token=worker_token)

            _proc[0] = None

        except Exception as exc:
            logger.exception(f"Unexpected error: {exc}")
            # Make sure no further heartbeats are sent for a job that is
            # about to be reported as aborted.
            if stop_ping is not None:
                stop_ping.set()
            if job_id is not None:
                _update_status(
                    urls['update'], job_id, 'ABORTED',
                    f"Unexpected exception on {runner_id}: {exc}",
                    logger, worker_token=worker_token,
                )
            break

        if cfg['once']:
            logger.info("once=true — exiting after one job.")
            break

        time.sleep(3)  # brief pause before requesting the next job
|
|
618
|
+
|
|
619
|
+
|
|
620
|
+
# Run the worker CLI when this module is executed directly as a script.
if __name__ == '__main__':
    main()
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: jd-worker
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Worker CLI for the JobDistributor distributed job system
|
|
5
|
+
License: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/NWSL-UCF/job-distributor
|
|
7
|
+
Project-URL: Documentation, https://github.com/NWSL-UCF/job-distributor/blob/main/docs/jd-worker.md
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/NWSL-UCF/job-distributor/issues
|
|
9
|
+
Keywords: distributed,jobs,worker,hpc,ml
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Topic :: System :: Distributed Computing
|
|
15
|
+
Requires-Python: >=3.8
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
Requires-Dist: requests
|
|
18
|
+
Requires-Dist: psutil
|
|
19
|
+
|
|
20
|
+
# Worker client (`jd-worker`)
|
|
21
|
+
|
|
22
|
+
Workers run **`jd_worker`**, which requests jobs from the job server, executes your **entry script** with parameters as CLI flags, sends heartbeats, and reports **DONE** / **ABORTED**.
|
|
23
|
+
|
|
24
|
+
Full behaviour, environment variables, local paths (`~/jd_data/…`), and library helpers (`jd_upload`, checkpoints, `jd_job_dir`, …) are documented in **[`docs/jd-worker.md`](../docs/jd-worker.md)**.
|
|
25
|
+
|
|
26
|
+
---
|
|
27
|
+
|
|
28
|
+
## Install
|
|
29
|
+
|
|
30
|
+
From the repo root (editable):
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
cd job-distributor/client
|
|
34
|
+
python3 -m venv venv
|
|
35
|
+
source venv/bin/activate # Windows: venv\Scripts\activate
|
|
36
|
+
pip install -e .
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Or install from GitHub (see `docs/jd-worker.md` for branch / subdirectory).
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## Run
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
jd_worker expId=<experiment_id> entry_script=<your_script.py>
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Optional: `server=`, `port=`, `machine_type=`, `once=true`, etc. — see `jd_worker help` or **`docs/jd-worker.md`**.
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Example workload
|
|
54
|
+
|
|
55
|
+
For an end-to-end ML-style example (MNIST tuning), see
|
|
56
|
+
[MNIST-parameter-tuning](https://github.com/NWSL-UCF/MNIST-parameter-tuning). Point **`entry_script`** at that repo’s training script and align **`expId`** with the server.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
jd/__init__.py
|
|
4
|
+
jd/files.py
|
|
5
|
+
jd/paths.py
|
|
6
|
+
jd/worker.py
|
|
7
|
+
jd_worker.egg-info/PKG-INFO
|
|
8
|
+
jd_worker.egg-info/SOURCES.txt
|
|
9
|
+
jd_worker.egg-info/dependency_links.txt
|
|
10
|
+
jd_worker.egg-info/entry_points.txt
|
|
11
|
+
jd_worker.egg-info/requires.txt
|
|
12
|
+
jd_worker.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
jd
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "jd-worker"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Worker CLI for the JobDistributor distributed job system"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
keywords = ["distributed", "jobs", "worker", "hpc", "ml"]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Programming Language :: Python :: 3",
|
|
15
|
+
"License :: OSI Approved :: MIT License",
|
|
16
|
+
"Operating System :: OS Independent",
|
|
17
|
+
"Intended Audience :: Science/Research",
|
|
18
|
+
"Topic :: System :: Distributed Computing",
|
|
19
|
+
]
|
|
20
|
+
dependencies = [
|
|
21
|
+
"requests",
|
|
22
|
+
"psutil",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
[project.urls]
|
|
26
|
+
Homepage = "https://github.com/NWSL-UCF/job-distributor"
|
|
27
|
+
Documentation = "https://github.com/NWSL-UCF/job-distributor/blob/main/docs/jd-worker.md"
|
|
28
|
+
"Bug Tracker" = "https://github.com/NWSL-UCF/job-distributor/issues"
|
|
29
|
+
|
|
30
|
+
[project.scripts]
|
|
31
|
+
jd_worker = "jd.worker:main"
|
|
32
|
+
|
|
33
|
+
[tool.setuptools.packages.find]
|
|
34
|
+
where = ["."]
|
|
35
|
+
include = ["jd*"]
|