jd-worker 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
jd/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ """jd — Job Distributor client package."""
2
+ __version__ = "1.0.0"
3
+
4
+ from jd.files import jd_get_last_checkpoint, jd_update_checkpoint, jd_upload
5
+ from jd.paths import jd_exp_dir, jd_job_dir, jd_worker_workspace
6
+
7
+ __all__ = [
8
+ "jd_upload",
9
+ "jd_update_checkpoint",
10
+ "jd_get_last_checkpoint",
11
+ "jd_job_dir",
12
+ "jd_worker_workspace",
13
+ "jd_exp_dir",
14
+ ]
jd/files.py ADDED
@@ -0,0 +1,236 @@
1
+ """
2
+ jd.files — worker-side file/checkpoint helpers
3
+ ================================================
4
+ These functions are designed to be called **from inside your entry script**
5
+ while a job is running. jd_worker automatically injects the required
6
+ context (server URL, job ID, experiment ID) as environment variables, so
7
+ you normally do not need to pass those arguments explicitly.
8
+
9
+ Quick-start
10
+ -----------
11
+ from jd import jd_upload, jd_job_dir, jd_update_checkpoint, jd_get_last_checkpoint
12
+
13
+ out = jd_job_dir() / "metrics.csv"
14
+ # … write to out …
15
+ jd_upload(str(out))
16
+
17
+ jd_update_checkpoint({"epoch": 5, "state_dict": model.state_dict()})
18
+ ckpt = jd_get_last_checkpoint()
19
+ if ckpt:
20
+ model.load_state_dict(ckpt["state_dict"])
21
+
22
+ Environment variables (set automatically by jd_worker)
23
+ -------------------------------------------------------
24
+ JD_SERVER — job server base URL, e.g. http://10.0.0.1:8000
25
+ JD_JOB_ID — integer job ID assigned by the server
26
+ JD_EXP_ID — experiment identifier
27
+ JD_WORKER_JOB_DIR — absolute …/<parent>/jd_data/<expId>/<job_id>/
28
+ (same as ``--base_path``); prefer ``jd_job_dir()``
29
+ JD_WORKER_WORKSPACE_ROOT — absolute ``<parent>/jd_data`` (same as ``jd_worker_workspace()``)
30
+
31
+ You can override server/job_id via the upload/checkpoint function keyword arguments.
32
+ """
33
+
34
+ import io
35
+ import logging
36
+ import os
37
+ import pickle
38
+
39
+ import requests
40
+
41
+ logger = logging.getLogger(__name__)
42
+
43
+ _MAX_BYTES = 100 * 1024 * 1024 # 100 MB
44
+ _TIMEOUT = 180 # seconds for upload/download HTTP calls
45
+
46
+
47
+ # ── Internal helpers ──────────────────────────────────────────────────────────
48
+
49
+ def _ctx(job_id, server):
50
+ """Resolve job_id and server from kwargs → env vars → error."""
51
+ if job_id is None:
52
+ raw = os.environ.get("JD_JOB_ID", "")
53
+ if not raw:
54
+ raise RuntimeError(
55
+ "job_id not provided and JD_JOB_ID environment variable is not set. "
56
+ "Make sure you are calling this function from inside an entry script "
57
+ "launched by jd_worker."
58
+ )
59
+ job_id = int(raw)
60
+ if server is None:
61
+ server = os.environ.get("JD_SERVER", "")
62
+ if not server:
63
+ raise RuntimeError(
64
+ "server not provided and JD_SERVER environment variable is not set."
65
+ )
66
+ return job_id, server.rstrip("/")
67
+
68
+
69
+ def _auth_headers() -> dict:
70
+ """Return Authorization header if JD_WORKER_TOKEN is set (Hub mode)."""
71
+ token = os.environ.get("JD_WORKER_TOKEN", "").strip()
72
+ if token:
73
+ return {"Authorization": f"Bearer {token}"}
74
+ return {}
75
+
76
+
77
+ def _check_size(data: bytes, label: str) -> None:
78
+ if len(data) > _MAX_BYTES:
79
+ mb = len(data) / (1024 ** 2)
80
+ raise ValueError(
81
+ f"{label} is {mb:.1f} MB which exceeds the 100 MB limit."
82
+ )
83
+
84
+
85
+ # ── Public API ────────────────────────────────────────────────────────────────
86
+
87
def jd_upload(file_path: str, job_id: int = None, server: str = None) -> dict:
    """
    Upload a result file (≤ 100 MB) to the server.

    The file is stored in the experiment's job directory as::

        result_v{N}_{timestamp}.<original_ext>

    where N auto-increments across calls so every upload is preserved.

    Parameters
    ----------
    file_path : str
        Local path to the file you want to upload.
    job_id : int, optional
        Defaults to the JD_JOB_ID environment variable.
    server : str, optional
        Job server base URL. Defaults to JD_SERVER environment variable.

    Returns
    -------
    dict
        ``{"success": True, "filename": "result_v0_…", "version": 0, "size_bytes": …}``

    Raises
    ------
    RuntimeError
        If job_id / server cannot be resolved.
    FileNotFoundError
        If ``file_path`` is not an existing regular file.
    ValueError
        If the file is larger than the upload limit.
    requests.HTTPError
        If the server answers with an error status.
    """
    job_id, server = _ctx(job_id, server)

    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    original_name = os.path.basename(file_path)
    label = f"File '{original_name}'"

    # Check the on-disk size first so an oversized (possibly multi-GB) file is
    # rejected without ever being read into memory.
    size_on_disk = os.path.getsize(file_path)
    if size_on_disk > _MAX_BYTES:
        mb = size_on_disk / (1024 ** 2)
        raise ValueError(
            f"{label} is {mb:.1f} MB which exceeds the 100 MB limit."
        )

    with open(file_path, "rb") as fh:
        data = fh.read()

    # Re-validate the bytes actually read: the file may have grown between
    # the stat call and the read.
    _check_size(data, label)

    logger.info(f"[jd_upload] Uploading '{original_name}' ({len(data)/(1024**2):.2f} MB) …")

    resp = requests.post(
        f"{server}/upload",
        data={"job_id": str(job_id)},
        files={"file": (original_name, io.BytesIO(data), "application/octet-stream")},
        headers=_auth_headers(),
        timeout=_TIMEOUT,
    )
    resp.raise_for_status()
    result = resp.json()
    logger.info(f"[jd_upload] Saved as '{result.get('filename')}' (version {result.get('version')})")
    return result
135
+
136
+
137
def jd_update_checkpoint(obj, job_id: int = None, server: str = None) -> dict:
    """
    Pickle *obj* and send it to the server as a new checkpoint version.

    According to the server contract described here, checkpoints are kept
    in the experiment's job directory as::

        checkpoint_v{N}_{timestamp}.pt

    and every call adds a version rather than replacing an earlier one.
    The payload is plain Python pickle (highest protocol), so it can hold
    PyTorch state-dicts as well as arbitrary picklable objects.

    Parameters
    ----------
    obj : any
        Object to checkpoint, e.g. a dict with ``model.state_dict()`` and
        optimizer state.
    job_id : int, optional
        Defaults to the JD_JOB_ID environment variable.
    server : str, optional
        Defaults to the JD_SERVER environment variable.

    Returns
    -------
    dict
        ``{"success": True, "filename": "checkpoint_v0_…", "version": 0, …}``
    """
    job_id, server = _ctx(job_id, server)

    logger.info("[jd_update_checkpoint] Serialising checkpoint …")
    payload = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
    _check_size(payload, "Checkpoint")

    logger.info(f"[jd_update_checkpoint] Uploading {len(payload)/(1024**2):.2f} MB …")
    endpoint = f"{server}/checkpoint"
    form_fields = {"job_id": str(job_id)}
    attachment = {
        "checkpoint": ("checkpoint.pkl", io.BytesIO(payload), "application/octet-stream"),
    }
    response = requests.post(
        endpoint,
        data=form_fields,
        files=attachment,
        headers=_auth_headers(),
        timeout=_TIMEOUT,
    )
    response.raise_for_status()

    reply = response.json()
    logger.info(f"[jd_update_checkpoint] Saved as '{reply.get('filename')}' (version {reply.get('version')})")
    return reply
180
+
181
+
182
def jd_get_last_checkpoint(job_id: int = None, server: str = None):
    """
    Download the highest-versioned checkpoint for this job and return it as
    a Python object — **nothing is written to disk**.

    The server sends the raw pickle bytes; this function deserialises them
    directly from an in-memory buffer so your script can resume immediately.

    .. warning::
        The payload is unpickled. Unpickling can execute arbitrary code, so
        only point this at a job server you trust.

    Parameters
    ----------
    job_id : int, optional
        Defaults to the JD_JOB_ID environment variable.
    server : str, optional
        Defaults to the JD_SERVER environment variable.

    Returns
    -------
    object
        The Python object that was passed to ``jd_update_checkpoint``, or
        ``None`` if no checkpoint exists yet for this job (HTTP 404).

    Raises
    ------
    RuntimeError
        If job_id / server cannot be resolved.
    requests.HTTPError
        If the server answers with an error status other than 404.

    Examples
    --------
    ::

        ckpt = jd_get_last_checkpoint()
        if ckpt is not None:
            model.load_state_dict(ckpt["state_dict"])
            start_epoch = ckpt["epoch"] + 1
    """
    job_id, server = _ctx(job_id, server)

    logger.info(f"[jd_get_last_checkpoint] Fetching latest checkpoint for job {job_id} …")
    # A streamed response holds its connection until it is closed; the
    # ``with`` block releases it on every path (404, HTTP error, success).
    with requests.get(
        f"{server}/checkpoint/latest",
        params={"job_id": job_id},
        headers=_auth_headers(),
        timeout=_TIMEOUT,
        stream=True,
    ) as resp:
        if resp.status_code == 404:
            logger.info("[jd_get_last_checkpoint] No checkpoint found.")
            return None

        resp.raise_for_status()

        # Read directly into a BytesIO buffer — no temp file, stays in memory
        buf = io.BytesIO()
        for chunk in resp.iter_content(chunk_size=1024 * 256):
            buf.write(chunk)

    logger.info(f"[jd_get_last_checkpoint] Received {buf.tell()/(1024**2):.2f} MB. Deserialising …")
    buf.seek(0)
    # SECURITY: pickle.load on bytes received over the network runs whatever
    # the payload encodes. Kept for compatibility with jd_update_checkpoint;
    # safe only when the server (and the transport to it) is trusted.
    obj = pickle.load(buf)
    logger.info("[jd_get_last_checkpoint] Checkpoint ready.")
    return obj
jd/paths.py ADDED
@@ -0,0 +1,58 @@
1
+ """
2
+ Paths injected by jd_worker for entry scripts
3
+ ==============================================
4
+ Layout on the worker::
5
+
6
+ <parent>/jd_data/<expId>/<job_id>/ ← save job outputs here (jd_job_dir)
7
+
8
+ ``parent`` is ``JD_WORKSPACE_PATH`` if set, otherwise your home directory.
9
+ ``jd_worker_workspace()`` returns the resolved ``…/jd_data`` directory.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import os
15
+ from pathlib import Path
16
+
17
+
18
def jd_job_dir() -> Path:
    """
    Return the absolute local directory reserved for **this job**.

    The value comes from the ``JD_WORKER_JOB_DIR`` environment variable
    (whitespace-trimmed, ``~`` expanded, resolved to an absolute path).
    It is the place for CSVs, on-disk checkpoints, temp files, etc.

    Raises
    ------
    RuntimeError
        If ``JD_WORKER_JOB_DIR`` is unset or blank.
    """
    configured = os.environ.get("JD_WORKER_JOB_DIR", "").strip()
    if configured:
        return Path(configured).expanduser().resolve()
    raise RuntimeError(
        "JD_WORKER_JOB_DIR is not set. Run your script via jd_worker "
        "(or set JD_WORKER_JOB_DIR to your job sandbox directory)."
    )
32
+
33
+
34
def jd_worker_workspace() -> Path:
    """
    Return the absolute **jd_data root** directory.

    Resolution order:

    1. ``JD_WORKER_WORKSPACE_ROOT`` when it is set and non-blank.
    2. Otherwise the grandparent of ``JD_WORKER_JOB_DIR`` — with the layout
       ``…/jd_data/<expId>/<job_id>`` that is the ``jd_data`` directory.

    Raises
    ------
    RuntimeError
        If neither variable is available.
    """
    explicit_root = os.environ.get("JD_WORKER_WORKSPACE_ROOT", "").strip()
    if explicit_root:
        return Path(explicit_root).expanduser().resolve()

    job_dir_text = os.environ.get("JD_WORKER_JOB_DIR", "").strip()
    if not job_dir_text:
        raise RuntimeError(
            "Cannot resolve worker workspace: set JD_WORKER_WORKSPACE_ROOT or "
            "run inside jd_worker so JD_WORKER_JOB_DIR is set."
        )

    # <root>/<expId>/<job_id> → climb two levels to reach <root>.
    job_dir = Path(job_dir_text).expanduser().resolve()
    return job_dir.parent.parent
54
+
55
+
56
def jd_exp_dir() -> Path:
    """
    Return the experiment-level directory ``<parent>/jd_data/<expId>/``.

    This is simply the parent of ``jd_job_dir()``, so it raises the same
    ``RuntimeError`` when ``JD_WORKER_JOB_DIR`` is not set.
    """
    job_directory = jd_job_dir()
    return job_directory.parent
jd/worker.py ADDED
@@ -0,0 +1,621 @@
1
+ """
2
+ jd_worker — Job Distributor Worker CLI
3
+ =======================================
4
+ Requests jobs from a jd server, runs the entry script with the job's
5
+ parameters as CLI flags, sends a heartbeat ping every 57 seconds, and
6
+ reports DONE or ABORTED when the script finishes.
7
+
8
+ Usage
9
+ -----
10
+ jd_worker expId=<id> entry_script=<script.py> [options]
11
+
12
+ Required
13
+ --------
14
+ expId=<id> Experiment identifier (must match the server).
15
+ entry_script=<path> Python script to run for each job.
16
+
17
+ Optional
18
+ --------
19
+ (no workspace CLI) Local data lives under ``<parent>/jd_data/<expId>/<job_id>/``.
20
+ ``parent`` is ``JD_WORKSPACE_PATH`` if set, otherwise
21
+ ``~`` (your home directory). So the default root is
22
+ ``~/jd_data/``. Passed to the entry script as
23
+ ``--base_path``, ``JD_WORKER_JOB_DIR``, and
24
+ ``JD_WORKER_WORKSPACE_ROOT`` (= ``.../jd_data``).
25
+
26
+ Note: ``JD_WORKSPACE_PATH`` here is **worker-only**
27
+ (parent of ``jd_data``). The job server uses the same
28
+ variable name for its own layout — avoid exporting one
29
+ value for both in the same shell.
30
+ server=<url> Job server base URL or host (default: http://localhost,
31
+ env: JD_SERVER). If you omit ``http://`` or ``https://``,
32
+ ``http://`` is assumed.
33
+ port=<N> Port if not included in server URL
34
+ (default: 5000, env: JD_PORT)
35
+ log_dir=<path> If set, logs go under <log_dir>/<expId>/; otherwise
36
+ under <jd_data>/<expId>/jd_worker_logs/
37
+ (env: JD_LOG_DIR)
38
+ machine_type=<type> Label for this machine in the dashboard
39
+ (default: worker, env: JD_MACHINE_TYPE)
40
+ process_id=<N> Numeric ID when running multiple workers on the
41
+ same machine (default: 0)
42
+ once=true Exit after completing a single job instead of
43
+ looping until no jobs remain.
44
+
45
+ Examples
46
+ --------
47
+ jd_worker expId=mnist_tune entry_script=train.py
48
+
49
+ # Optional: put jd_data under /scratch/jd_data (parent=/scratch)
50
+ JD_WORKSPACE_PATH=/scratch jd_worker expId=mnist_tune entry_script=train.py \\
51
+ server=http://10.0.0.5 port=8000 \\
52
+ machine_type=gpu_node
53
+
54
+ # Run exactly one job:
55
+ jd_worker expId=mnist_tune entry_script=train.py once=true
56
+
57
+ Install
58
+ -------
59
+ pip install -e ./client # from the repo root
60
+ # then `jd_worker` is available in whatever env is active
61
+ """
62
+
63
+ import logging
64
+ import os
65
+ import platform
66
+ import random
67
+ import signal
68
+ import socket
69
+ import subprocess
70
+ import sys
71
+ import threading
72
+ import time
73
+ from urllib.parse import urlparse, urlunparse
74
+
75
+ import psutil
76
+ import requests
77
+
78
+ from jd import __version__
79
+
80
+ IS_WINDOWS = platform.system() == "Windows"
81
+ PING_INTERVAL = 57 # seconds — intentionally not 60 to avoid racing the idle timeout
82
+
83
+ # Fixed subdirectory under JD_WORKSPACE_PATH (or home): …/jd_data/<expId>/<job_id>/
84
+ _WORKER_JD_DATA_DIRNAME = "jd_data"
85
+
86
+
87
+ # ── Argument parsing ─────────────────────────────────────────────────────────
88
+
89
+ def _parse_kv(argv: list) -> dict:
90
+ """Parse key=value positional arguments into a plain dict."""
91
+ cfg = {}
92
+ for arg in argv:
93
+ if '=' in arg:
94
+ k, v = arg.split('=', 1)
95
+ cfg[k.strip()] = v.strip()
96
+ else:
97
+ cfg[arg.strip()] = 'true'
98
+ return cfg
99
+
100
+
101
+ def _normalize_server_base_url(server_raw: str, port_raw: str) -> str:
102
+ """
103
+ Build a base URL that requests/lib can open.
104
+
105
+ Host-only values like ``localhost`` must become ``http://localhost:<port>``.
106
+ Without a scheme, Python requests raises "No connection adapters were found".
107
+ """
108
+ s = (server_raw or "").strip().rstrip("/")
109
+ if not s:
110
+ s = "http://localhost"
111
+ if not urlparse(s).scheme:
112
+ s = "http://" + s.lstrip("/")
113
+ parsed = urlparse(s)
114
+ if parsed.port is not None:
115
+ return s
116
+ host = parsed.hostname
117
+ if not host:
118
+ host = "localhost"
119
+ netloc = f"{host}:{port_raw}"
120
+ return urlunparse(
121
+ (parsed.scheme, netloc, parsed.path or "", "", "", "")
122
+ ).rstrip("/")
123
+
124
+
125
def _resolve(cfg: dict) -> dict:
    """
    Combine CLI ``key=value`` settings, environment variables and defaults.

    For each option the CLI value wins, then a non-empty environment
    variable, then the built-in default. As a side effect the local
    ``<parent>/jd_data`` directory is created if it does not exist, where
    ``parent`` is ``JD_WORKSPACE_PATH`` or the user's home directory.

    Returns
    -------
    dict
        Keys: ``exp_id``, ``entry_script``, ``base_url``, ``workspace_path``,
        ``log_dir_override``, ``machine_type``, ``process_id``, ``once``
        (bool), ``hub_url``, ``api_key``.
    """

    def _lookup(cli_key, env_key=None, default=None):
        # CLI beats environment beats default; an empty env value counts as unset.
        if cli_key in cfg:
            return cfg[cli_key]
        from_env = os.environ.get(env_key, '') if env_key else ''
        return from_env if from_env else default

    # Server address: host/URL plus a fallback port.
    server_text = _lookup('server', 'JD_SERVER', 'http://localhost')
    port_text = str(_lookup('port', 'JD_PORT', '5000')).strip()
    server_base = _normalize_server_base_url(server_text, port_text)

    # …/<parent>/jd_data/<expId>/<job_id>/ — parent from env or ~
    parent_dir = os.environ.get("JD_WORKSPACE_PATH", "").strip() or os.path.expanduser("~")
    parent_dir = os.path.abspath(os.path.expanduser(parent_dir))
    jd_data_root = os.path.join(parent_dir, _WORKER_JD_DATA_DIRNAME)
    os.makedirs(jd_data_root, exist_ok=True)

    # Optional log directory override: CLI first, then JD_LOG_DIR.
    if 'log_dir' in cfg:
        log_dir_override = os.path.expanduser(cfg['log_dir'].strip())
    else:
        env_log_dir = os.environ.get('JD_LOG_DIR', '').strip()
        log_dir_override = os.path.expanduser(env_log_dir) if env_log_dir else None

    resolved = {}
    resolved['exp_id'] = _lookup('expId', 'JD_EXP_ID', None)
    resolved['entry_script'] = _lookup('entry_script', 'JD_ENTRY_SCRIPT', None)
    resolved['base_url'] = server_base
    resolved['workspace_path'] = jd_data_root
    resolved['log_dir_override'] = log_dir_override
    resolved['machine_type'] = _lookup('machine_type', 'JD_MACHINE_TYPE', 'worker')
    resolved['process_id'] = _lookup('process_id', None, '0')
    resolved['once'] = _lookup('once', 'JD_ONCE', 'false').lower() == 'true'
    # Hub authentication (optional)
    resolved['hub_url'] = _lookup('hub_url', 'JD_HUB_URL', '').strip().rstrip('/')
    resolved['api_key'] = _lookup('api_key', 'JD_API_KEY', '').strip()
    return resolved
169
+
170
+
171
+ # ── Hub authentication ────────────────────────────────────────────────────────
172
+
173
def _hub_get_worker_token(hub_url: str, api_key: str,
                          exp_id: str, logger: logging.Logger) -> "tuple[str, str] | None":
    """
    Obtain a worker JWT and server URL from the Hub.

    POSTs ``{"experiment_name": exp_id}`` to ``<hub_url>/api/worker/token``
    with ``api_key`` as a Bearer token.

    Returns
    -------
    tuple[str, str] | None
        ``(worker_token, server_url)`` on success, ``None`` on failure.
        The returned ``server_url`` (possibly empty) overrides any
        CLI-supplied server address in the caller.

    Notes
    -----
    The return annotation is a string on purpose: ``tuple[str, str] | None``
    evaluated at import time fails on Python 3.8/3.9, which the package
    metadata declares as supported.
    """
    endpoint = f"{hub_url}/api/worker/token"
    try:
        r = requests.post(
            endpoint,
            json={"experiment_name": exp_id},
            headers={"Authorization": f"Bearer {api_key}"},
            timeout=30,
        )
        if r.status_code == 200:
            data = r.json()
            token = data.get("worker_token", "")
            server_url = data.get("server_url", "")
            expires_at = data.get("expires_at", "unknown")
            if not token:
                # A 200 without a token would otherwise let the worker run
                # unauthenticated although Hub mode was requested.
                logger.error("Hub token request failed: response contained no worker_token")
                return None
            logger.info(f"Worker token obtained from Hub (expires: {expires_at})")
            return token, server_url
        logger.error(f"Hub token request failed: HTTP {r.status_code} — {r.text[:300]}")
        return None
    except Exception as exc:
        # Boundary handler: network errors, invalid JSON and unexpected
        # payload shapes all mean "no token"; the caller decides to exit.
        logger.error(f"Hub connection error: {exc}")
        return None
201
+
202
+
203
+ # ── Logging ───────────────────────────────────────────────────────────────────
204
+
205
+ def _setup_logger(log_dir: str, runner_id: str) -> logging.Logger:
206
+ os.makedirs(log_dir, exist_ok=True)
207
+ log_path = os.path.join(log_dir, f"jd_worker_{runner_id}.log")
208
+
209
+ logger = logging.getLogger(f'jd_worker.{runner_id}')
210
+ logger.setLevel(logging.INFO)
211
+
212
+ fmt = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
213
+ fh = logging.FileHandler(log_path, encoding='utf-8')
214
+ fh.setFormatter(fmt)
215
+ sh = logging.StreamHandler(sys.stdout)
216
+ sh.setFormatter(fmt)
217
+
218
+ logger.addHandler(fh)
219
+ logger.addHandler(sh)
220
+ logger.propagate = False
221
+ return logger
222
+
223
+
224
+ # ── System metrics ────────────────────────────────────────────────────────────
225
+
226
def _collect_metrics(machine_type: str, logger: logging.Logger) -> dict:
    """
    Collect a single snapshot of system metrics.

    Parameters
    ----------
    machine_type : str
        Label echoed back as ``worker_type``.
    logger : logging.Logger
        Receives an error record if collection fails.

    Returns
    -------
    dict
        CPU, RAM, load and disk-I/O figures plus ``worker_type``. If anything
        unexpected fails, every numeric field is reported as ``0`` instead
        of raising.

    Notes
    -----
    The call blocks for roughly 0.2 s (CPU sampling plus the disk counter
    interval).
    """
    try:
        cpu_pct = psutil.cpu_percent(interval=0.1)
        cpu_phys = psutil.cpu_count(logical=False) or 1
        cpu_logi = psutil.cpu_count(logical=True) or 1

        try:
            freq = psutil.cpu_freq()
            freq_mhz = freq.current if freq else 0
        except (AttributeError, RuntimeError):
            freq_mhz = 0

        mem = psutil.virtual_memory()
        ram_total_gb = mem.total / (1024 ** 3)
        ram_avail_gb = mem.available / (1024 ** 3)

        if not IS_WINDOWS:
            try:
                la = os.getloadavg()
                load1, load5, load15 = la[0], la[1], la[2]
            except (AttributeError, OSError):
                load1 = load5 = load15 = 0.0
            load_per_cpu = load1 / cpu_logi if cpu_logi > 0 else 0.0
            idle_slots = max(0, cpu_logi - load1)
        else:
            # No load average on Windows: approximate it from CPU utilisation.
            load1 = load5 = load15 = cpu_pct / 100.0 * cpu_logi
            load_per_cpu = cpu_pct / 100.0
            idle_slots = max(0, cpu_logi * (1 - cpu_pct / 100.0))

        try:
            started = time.monotonic()
            d0 = psutil.disk_io_counters()
            time.sleep(0.1)
            d1 = psutil.disk_io_counters()
            if d0 and d1:
                # Divide by the time that really elapsed; sleep() may overshoot
                # its nominal 0.1 s, which would inflate the rates.
                dt = (time.monotonic() - started) or 0.1
                ops = (d1.read_count - d0.read_count +
                       d1.write_count - d0.write_count) / dt
                bps = (d1.read_bytes - d0.read_bytes +
                       d1.write_bytes - d0.write_bytes) / dt
                # Heuristic utilisation: 100% at 100k ops/s or 1 GB/s,
                # whichever is higher.
                disk_io_util = max(min(100.0, (ops / 100_000) * 100),
                                   min(100.0, (bps / 1e9) * 100))
            else:
                disk_io_util = 0.0
        except (AttributeError, RuntimeError, TypeError):
            disk_io_util = 0.0

        return {
            "cpu_util": round(cpu_pct, 1),
            "ram_util": round(mem.percent, 1),
            "ram_available": round(ram_avail_gb, 15),
            "ram_total": round(ram_total_gb, 1),
            "worker_type": machine_type,
            "idle_slots": int(round(idle_slots)),
            "load_1min": round(load1, 10),
            "load_5min": round(load5, 10),
            "load_15min": round(load15, 10),
            "load_per_cpu": round(load_per_cpu, 13),
            "disk_io_util": round(disk_io_util, 2),
            "cpu_cores": cpu_phys,
            "cpu_threads": cpu_logi,
            "cpu_freq_mhz": int(round(freq_mhz)) if freq_mhz > 0 else 0,
        }
    except Exception as exc:
        logger.error(f"Metrics collection error: {exc}")
        # Built without the dict "|" operator: that needs Python 3.9, and on
        # 3.8 it made this fallback itself raise TypeError.
        fallback = {k: 0 for k in (
            "cpu_util", "ram_util", "ram_available", "ram_total",
            "idle_slots", "load_1min", "load_5min", "load_15min",
            "load_per_cpu", "disk_io_util", "cpu_cores", "cpu_threads",
            "cpu_freq_mhz",
        )}
        fallback["worker_type"] = machine_type
        return fallback
297
+
298
+
299
def _averaged_metrics(machine_type: str, logger: logging.Logger,
                      samples: int = 5, interval: float = 3.0) -> dict:
    """
    Take several metric snapshots and average their numeric fields.

    ``samples`` snapshots are collected with ``interval`` seconds of sleep
    between consecutive ones (none after the last). ``cpu_cores`` and
    ``cpu_threads`` are copied from the first snapshot; every other numeric
    field is the arithmetic mean, rounded with the same precision a single
    snapshot uses.
    """
    logger.info(f"Collecting system metrics ({samples} samples × {interval}s)…")

    snapshots = []
    for sample_no in range(1, samples + 1):
        logger.info(f" Sample {sample_no}/{samples}")
        snapshots.append(_collect_metrics(machine_type, logger))
        if sample_no != samples:
            time.sleep(interval)

    # Field → decimal places; None means "round to the nearest integer".
    precision = {
        "cpu_util": 1,
        "ram_util": 1,
        "ram_available": 15,
        "ram_total": 1,
        "idle_slots": None,
        "load_1min": 10,
        "load_5min": 10,
        "load_15min": 10,
        "load_per_cpu": 13,
        "disk_io_util": 2,
        "cpu_freq_mhz": None,
    }

    first = snapshots[0]
    averaged = {
        "worker_type": machine_type,
        "cpu_cores": first["cpu_cores"],
        "cpu_threads": first["cpu_threads"],
    }
    for field, places in precision.items():
        mean = sum([snap[field] for snap in snapshots]) / len(snapshots)
        averaged[field] = int(round(mean)) if places is None else round(mean, places)

    logger.info("System metrics ready.")
    return averaged
335
+
336
+
337
+ # ── Server communication ──────────────────────────────────────────────────────
338
+
339
+ def _auth_headers(worker_token: str) -> dict:
340
+ """Return Authorization header dict if worker_token is set, else empty."""
341
+ if worker_token:
342
+ return {"Authorization": f"Bearer {worker_token}"}
343
+ return {}
344
+
345
+
346
def _request_job(url: str, runner_id: str, metrics: dict,
                 logger: logging.Logger, worker_token: str = ""):
    """
    Ask the server for the next job.

    Returns
    -------
    tuple
        ``(job_dict, None)`` on HTTP 200; ``(None, 'no_jobs')`` on HTTP 404;
        ``(None, "<description>")`` for any other status or any exception.
        This function never raises. ``logger`` is accepted but not used.
    """
    payload = {"requested_by": runner_id, "system_metrics": metrics}
    try:
        response = requests.post(
            url,
            json=payload,
            headers=_auth_headers(worker_token),
            timeout=30,
        )
        status = response.status_code
        if status == 200:
            return response.json(), None
        if status == 404:
            return None, 'no_jobs'
        return None, f"HTTP {status}: {response.text[:200]}"
    except Exception as exc:
        return None, str(exc)
361
+
362
+
363
def _update_status(url: str, job_id: int, status: str,
                   message: str, logger: logging.Logger,
                   worker_token: str = "") -> None:
    """
    Report a job's final status to the server (best effort).

    The outcome is only logged: info on HTTP 200, warning on any other
    status, error if the request itself raises. Nothing is raised to the
    caller.
    """
    report = {"job_id": job_id, "status": status, "message": message}
    try:
        response = requests.post(
            url,
            json=report,
            headers=_auth_headers(worker_token),
            timeout=30,
        )
        if response.status_code != 200:
            logger.warning(f"Status update failed: HTTP {response.status_code}")
        else:
            logger.info(f"Job {job_id} → {status}")
    except Exception as exc:
        logger.error(f"Status update error: {exc}")
378
+
379
+
380
def _ping_loop(url: str, job_id: int,
               stop_event: threading.Event,
               logger: logging.Logger,
               worker_token: str = "") -> None:
    """
    Heartbeat thread body: ping the server until ``stop_event`` is set.

    Waits ``PING_INTERVAL`` seconds on the event before each ping, so the
    first ping happens one interval after start and setting the event ends
    the loop promptly. Ping failures are logged and never raised.
    """
    while True:
        # wait() returns True as soon as the event is set → stop pinging.
        if stop_event.wait(PING_INTERVAL):
            return
        try:
            response = requests.post(
                url,
                json={"id": job_id},
                headers=_auth_headers(worker_token),
                timeout=10,
            )
            if response.status_code != 200:
                logger.warning(f"Ping HTTP {response.status_code} (job {job_id})")
            else:
                logger.info(f"Ping OK (job {job_id})")
        except Exception as exc:
            logger.warning(f"Ping error (job {job_id}): {exc}")
396
+
397
+
398
+ # ── Main entry point ──────────────────────────────────────────────────────────
399
+
400
def _abort_message(runner_id: str, rc: int, stdout: str, stderr: str) -> str:
    """Compose the ABORTED status message sent to the server for a failed job."""
    message = f"Job failed on {runner_id}. Exit code {rc}."
    # Prefer stderr; only quote the tail when it actually looks like an error.
    snippet = (stderr.strip() or stdout.strip())[-500:]
    if snippet and any(kw in snippet.lower()
                       for kw in ('error', 'exception', 'traceback', 'failed')):
        message += f" Last output: {snippet}"
    else:
        message += " Check worker logs for details."
    if rc == -9:
        message += " (Process killed — possible OOM or time limit.)"
    return message


def main() -> None:
    """
    Entry point of the ``jd_worker`` CLI.

    Parses ``key=value`` arguments, optionally authenticates against the
    Hub, then loops: collect system metrics, request a job, run the entry
    script as a subprocess with the job parameters as ``--key value`` flags
    (plus ``--base_path``), keep a heartbeat thread alive while it runs, and
    report DONE or ABORTED. The loop ends when the server has no more jobs,
    after one job when ``once=true``, or after an unexpected exception.

    Exits with status 0 for help or a signal-triggered shutdown, and status
    1 for invalid arguments or a failed Hub authentication.
    """
    argv = sys.argv[1:]

    # Show help
    if not argv or any(a in argv for a in ('help', '-h', '--help')):
        print(__doc__)
        sys.exit(0)

    kv = _parse_kv(argv)
    cfg = _resolve(kv)

    # Validate required arguments
    errors = []
    if not cfg['exp_id']:
        errors.append("expId is required")
    if not cfg['entry_script']:
        errors.append("entry_script is required")
    elif not os.path.isfile(cfg['entry_script']):
        errors.append(f"entry_script '{cfg['entry_script']}' not found")
    if errors:
        for e in errors:
            print(f"Error: {e}")
        print("Run `jd_worker help` for usage.")
        sys.exit(1)

    # Build a unique runner ID visible in the dashboard.
    # The random module seeds itself from OS entropy; reseeding it from the
    # millisecond clock (as before) made workers started at the same instant
    # draw the same suffix.
    username = os.getenv('USER') or os.getenv('USERNAME') or 'user'
    suffix = random.randint(10000, 99999)
    runner_id = (f"{username}@{socket.gethostname()}"
                 f"({cfg['machine_type']})_{cfg['process_id']}_{suffix}")

    if cfg['log_dir_override'] is not None:
        log_dir = os.path.join(cfg['log_dir_override'], cfg['exp_id'])
    else:
        log_dir = os.path.join(cfg['workspace_path'], cfg['exp_id'], 'jd_worker_logs')
    logger = _setup_logger(log_dir, runner_id)

    # ── Hub authentication (optional) ────────────────────────────────────────
    worker_token = ""
    if cfg['hub_url'] and cfg['api_key']:
        logger.info(f"Hub mode: authenticating via {cfg['hub_url']}")
        result = _hub_get_worker_token(
            cfg['hub_url'], cfg['api_key'], cfg['exp_id'], logger
        )
        if result is None:
            logger.error("Failed to obtain worker token from Hub. Exiting.")
            sys.exit(1)
        worker_token, hub_server_url = result
        if hub_server_url:
            # Endpoints are built as f"{base_url}/…"; strip a trailing slash
            # so a Hub answer like "http://host:8000/" does not yield "//ping".
            cfg['base_url'] = hub_server_url.rstrip('/')
            logger.info(f"Server URL from Hub: {hub_server_url}")
    elif cfg['hub_url'] or cfg['api_key']:
        logger.warning(
            "Both hub_url (JD_HUB_URL) and api_key (JD_API_KEY) must be set "
            "for Hub authentication. Running in standalone mode."
        )

    urls = {
        'request': f"{cfg['base_url']}/request_job",
        'update': f"{cfg['base_url']}/update_job_status",
        'ping': f"{cfg['base_url']}/ping",
    }

    logger.info(f"jd_worker v{__version__} | runner: {runner_id}")
    logger.info(f"Server: {cfg['base_url']}")
    logger.info(f"Entry script: {cfg['entry_script']}")
    logger.info(f"Hub mode: {'enabled' if worker_token else 'disabled'}")
    logger.info(f"Local jd_data root: {cfg['workspace_path']} "
                f"(each job: …/jd_data/<expId>/<job_id>/)")
    logger.info(f"Ping interval: {PING_INTERVAL}s")
    if cfg['once']:
        logger.info("Mode: single job (once=true)")

    # Track the current child process so SIGINT/SIGTERM can clean it up
    _proc: list = [None]

    def _shutdown(signum=None, frame=None):
        p = _proc[0]
        if p and p.poll() is None:
            logger.info(f"Killing subprocess PID {p.pid}…")
            try:
                if IS_WINDOWS:
                    p.send_signal(signal.CTRL_BREAK_EVENT)
                else:
                    # The child runs in its own session (os.setsid), so signal
                    # the whole process group to reach its descendants too.
                    os.killpg(os.getpgid(p.pid), signal.SIGTERM)
            except Exception as exc:
                # Best effort (the child may have exited meanwhile), but leave
                # a trace instead of swallowing the error silently.
                logger.warning(f"Could not signal subprocess PID {p.pid}: {exc}")
        logger.info("jd_worker shut down.")
        sys.exit(0)

    signal.signal(signal.SIGINT, _shutdown)
    signal.signal(signal.SIGTERM, _shutdown)

    # ── Main job loop ────────────────────────────────────────────────────────
    while True:
        job_id = None
        try:
            metrics = _averaged_metrics(cfg['machine_type'], logger)
            job, reason = _request_job(urls['request'], runner_id, metrics, logger,
                                       worker_token=worker_token)

            if job is None:
                if reason == 'no_jobs':
                    logger.info("No more jobs available. Exiting.")
                    break
                logger.error(f"Job request failed: {reason}. Retrying in 10 s…")
                time.sleep(10)
                continue

            job_id = job['job_id']
            params = job['parameters']
            logger.info(f"Job {job_id} received | params: {params}")

            # Local sandbox for this job — keep worker-side I/O under this directory
            job_root = os.path.abspath(os.path.join(
                cfg['workspace_path'], cfg['exp_id'], str(job_id)))
            os.makedirs(job_root, exist_ok=True)

            # Build subprocess command
            # Uses the *current* Python interpreter so venv/conda is respected
            cmd = [sys.executable, cfg['entry_script']]
            for k, v in params.items():
                cmd.extend([f"--{k}", str(v)])
            cmd.extend(['--base_path', job_root])
            logger.info(f"Job workspace: {job_root}")
            logger.info(f"Command: {' '.join(cmd)}")

            # Start heartbeat ping thread
            stop_ping = threading.Event()
            pinger = threading.Thread(
                target=_ping_loop,
                args=(urls['ping'], job_id, stop_ping, logger, worker_token),
                daemon=True,
            )
            pinger.start()

            try:
                # Build child environment: inherit everything + inject JD_ context
                # so jd_upload / jd_update_checkpoint / jd_get_last_checkpoint work
                # inside the entry script without requiring explicit arguments.
                child_env = os.environ.copy()
                child_env["JD_JOB_ID"] = str(job_id)
                child_env["JD_SERVER"] = cfg["base_url"]
                child_env["JD_EXP_ID"] = cfg["exp_id"]
                child_env["JD_WORKER_JOB_DIR"] = job_root
                child_env["JD_WORKER_WORKSPACE_ROOT"] = cfg["workspace_path"]
                if worker_token:
                    child_env["JD_WORKER_TOKEN"] = worker_token

                # Launch the entry script in its own process group so a
                # shutdown signal can be forwarded to the whole group.
                popen_kw = dict(stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                text=True, env=child_env)
                if IS_WINDOWS:
                    _proc[0] = subprocess.Popen(
                        cmd,
                        creationflags=subprocess.CREATE_NEW_PROCESS_GROUP,
                        **popen_kw,
                    )
                else:
                    _proc[0] = subprocess.Popen(
                        cmd,
                        preexec_fn=os.setsid,
                        **popen_kw,
                    )

                stdout, stderr = _proc[0].communicate()
                rc = _proc[0].returncode
            finally:
                # Always stop the heartbeat and forget the child, even when
                # launching or waiting raised; otherwise pings would continue
                # for a job that is about to be reported ABORTED.
                stop_ping.set()
                pinger.join(timeout=5)
                _proc[0] = None

            if rc == 0:
                logger.info(f"Job {job_id} finished successfully.")
                _update_status(
                    urls['update'], job_id, 'DONE',
                    f"Completed successfully on {runner_id}.",
                    logger, worker_token=worker_token,
                )
            else:
                logger.error(f"Job {job_id} failed — exit code {rc}")
                if stderr.strip():
                    logger.error(f"STDERR (last 1000 chars):\n{stderr.strip()[-1000:]}")

                abort_msg = _abort_message(runner_id, rc, stdout, stderr)
                _update_status(urls['update'], job_id, 'ABORTED', abort_msg, logger,
                               worker_token=worker_token)

        except Exception as exc:
            logger.exception(f"Unexpected error: {exc}")
            if job_id is not None:
                _update_status(
                    urls['update'], job_id, 'ABORTED',
                    f"Unexpected exception on {runner_id}: {exc}",
                    logger, worker_token=worker_token,
                )
            break

        if cfg['once']:
            logger.info("once=true — exiting after one job.")
            break

        time.sleep(3)  # brief pause before requesting the next job
618
+
619
+
620
+ if __name__ == '__main__':
621
+ main()
@@ -0,0 +1,56 @@
1
+ Metadata-Version: 2.4
2
+ Name: jd-worker
3
+ Version: 1.0.0
4
+ Summary: Worker CLI for the JobDistributor distributed job system
5
+ License: MIT
6
+ Project-URL: Homepage, https://github.com/NWSL-UCF/job-distributor
7
+ Project-URL: Documentation, https://github.com/NWSL-UCF/job-distributor/blob/main/docs/jd-worker.md
8
+ Project-URL: Bug Tracker, https://github.com/NWSL-UCF/job-distributor/issues
9
+ Keywords: distributed,jobs,worker,hpc,ml
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: Topic :: System :: Distributed Computing
15
+ Requires-Python: >=3.8
16
+ Description-Content-Type: text/markdown
17
+ Requires-Dist: requests
18
+ Requires-Dist: psutil
19
+
20
+ # Worker client (`jd-worker`)
21
+
22
+ Workers run **`jd_worker`**, which requests jobs from the job server, executes your **entry script** with parameters as CLI flags, sends heartbeats, and reports **DONE** / **ABORTED**.
23
+
24
+ Full behaviour, environment variables, local paths (`~/jd_data/…`), and library helpers (`jd_upload`, checkpoints, `jd_job_dir`, …) are documented in **[`docs/jd-worker.md`](https://github.com/NWSL-UCF/job-distributor/blob/main/docs/jd-worker.md)**.
25
+
26
+ ---
27
+
28
+ ## Install
29
+
30
+ From a local clone of the repository (editable install from the `client/` directory):
31
+
32
+ ```bash
33
+ cd job-distributor/client
34
+ python3 -m venv venv
35
+ source venv/bin/activate # Windows: venv\Scripts\activate
36
+ pip install -e .
37
+ ```
38
+
39
+ Or install from GitHub (see `docs/jd-worker.md` for branch / subdirectory).
40
+
41
+ ---
42
+
43
+ ## Run
44
+
45
+ ```bash
46
+ jd_worker expId=<experiment_id> entry_script=<your_script.py>
47
+ ```
48
+
49
+ Optional: `server=`, `port=`, `machine_type=`, `once=true`, etc. — see `jd_worker help` or **`docs/jd-worker.md`**.
50
+
51
+ ---
52
+
53
+ ## Example workload
54
+
55
+ For an end-to-end ML-style example (MNIST tuning), see
56
+ [MNIST-parameter-tuning](https://github.com/NWSL-UCF/MNIST-parameter-tuning). Point **`entry_script`** at that repo’s training script and make sure the **`expId`** you pass to `jd_worker` matches the experiment ID on the job server.
@@ -0,0 +1,9 @@
1
+ jd/__init__.py,sha256=HvsPfgH7X5ao9MpgJ5brs52CeHF7hgDv9sPmmd-kd9E,363
2
+ jd/files.py,sha256=-66sf4fbTZugtsAinBI5YyE_ctK74k7SdAieI8zISsE,8151
3
+ jd/paths.py,sha256=-FVQYEuSYamYtbMdBq9azQS0bqYz49AKBeCg5ZO3tSU,1918
4
+ jd/worker.py,sha256=QPfMEmixVynTfTaU9OMf42EWc1Hl2q4MADxyKCCR5GU,25166
5
+ jd_worker-1.0.0.dist-info/METADATA,sha256=Xam6aG0K-Gt6e3BTpHY_D_vq-tLaq9AHqTaWuH8dDAA,1925
6
+ jd_worker-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
7
+ jd_worker-1.0.0.dist-info/entry_points.txt,sha256=4yBzE58Z9WQN--Ri1vKEKuqvd25QBgVe-XCea-TLm9Q,45
8
+ jd_worker-1.0.0.dist-info/top_level.txt,sha256=2Yt96k5nsj0T_z76Ppc_ss__sm5nD6LRi6bmXiQFrik,3
9
+ jd_worker-1.0.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ jd_worker = jd.worker:main
@@ -0,0 +1 @@
1
+ jd