earthkit-workflows 0.4.5__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -27,6 +27,7 @@ JobId = str
27
27
 
28
28
  @dataclass
29
29
  class JobProgress:
30
+ started: bool
30
31
  completed: bool
31
32
  pct: (
32
33
  str | None
@@ -35,19 +36,20 @@ class JobProgress:
35
36
 
36
37
  @classmethod
37
38
  def failed(cls, failure: str) -> Self:
38
- return cls(True, None, failure)
39
+ return cls(True, True, None, failure)
39
40
 
40
41
  @classmethod
41
42
  def progressed(cls, pct: float) -> Self:
42
43
  progress = "{:.2%}".format(pct)[:-1]
43
- return cls(False, progress, None)
44
+ return cls(True, False, progress, None)
44
45
 
45
46
  @classmethod
46
47
  def succeeded(cls) -> Self:
47
- return cls(True, None, None)
48
+ return cls(True, True, None, None)
48
49
 
49
50
 
50
- JobProgressStarted = JobProgress(False, "0.00", None)
51
+ JobProgressStarted = JobProgress(True, False, "0.00", None)
52
+ JobProgressEnqueued = JobProgress(False, False, None, None)
51
53
 
52
54
 
53
55
  @dataclass
@@ -42,7 +42,15 @@ class PackagesEnv(AbstractContextManager):
42
42
  logger.debug(
43
43
  f"installing {len(packages)} packages: {','.join(packages[:3])}{',...' if len(packages) > 3 else ''}"
44
44
  )
45
- install_command = ["uv", "pip", "install", "--prefix", self.td.name]
45
+ install_command = [
46
+ "uv",
47
+ "pip",
48
+ "install",
49
+ "--prefix",
50
+ self.td.name,
51
+ "--prerelease",
52
+ "allow",
53
+ ]
46
54
  if os.environ.get("VENV_OFFLINE", "") == "YES":
47
55
  install_command += ["--offline"]
48
56
  if cache_dir := os.environ.get("VENV_CACHE", ""):
@@ -15,14 +15,17 @@ from cascade.gateway.server import serve
15
15
 
16
16
 
17
17
  def main(
18
- url: str, log_base: str | None = None, troika_config: str | None = None
18
+ url: str,
19
+ log_base: str | None = None,
20
+ troika_config: str | None = None,
21
+ max_jobs: int | None = None,
19
22
  ) -> None:
20
23
  if log_base:
21
24
  log_path = f"{log_base}/gateway.txt"
22
25
  logging.config.dictConfig(logging_config_filehandler(log_path))
23
26
  else:
24
27
  logging.config.dictConfig(logging_config)
25
- serve(url, log_base, troika_config)
28
+ serve(url, log_base, troika_config, max_jobs)
26
29
 
27
30
 
28
31
  if __name__ == "__main__":
cascade/gateway/api.py CHANGED
@@ -62,8 +62,9 @@ class JobProgressRequest(CascadeGatewayAPI):
62
62
 
63
63
 
64
64
  class JobProgressResponse(CascadeGatewayAPI):
65
- progresses: dict[JobId, JobProgress]
65
+ progresses: dict[JobId, JobProgress | None]
66
66
  datasets: dict[JobId, list[DatasetId]]
67
+ queue_length: int
67
68
  error: str | None # top level error
68
69
 
69
70
 
cascade/gateway/router.py CHANGED
@@ -15,6 +15,7 @@ import os
15
15
  import stat
16
16
  import subprocess
17
17
  import uuid
18
+ from collections import OrderedDict
18
19
  from dataclasses import dataclass
19
20
  from typing import Iterable
20
21
 
@@ -22,7 +23,12 @@ import orjson
22
23
  import zmq
23
24
 
24
25
  import cascade.executor.platform as platform
25
- from cascade.controller.report import JobId, JobProgress, JobProgressStarted
26
+ from cascade.controller.report import (
27
+ JobId,
28
+ JobProgress,
29
+ JobProgressEnqueued,
30
+ JobProgressStarted,
31
+ )
26
32
  from cascade.executor.comms import get_context
27
33
  from cascade.gateway.api import JobSpec, TroikaSpec
28
34
  from cascade.low.core import DatasetId
@@ -202,16 +208,29 @@ def _spawn_subprocess(
202
208
 
203
209
  class JobRouter:
204
210
  def __init__(
205
- self, poller: zmq.Poller, log_base: str | None, troika_config: str | None
211
+ self,
212
+ poller: zmq.Poller,
213
+ log_base: str | None,
214
+ troika_config: str | None,
215
+ max_jobs: int | None,
206
216
  ):
207
217
  self.poller = poller
208
218
  self.jobs: dict[str, Job] = {}
219
+ self.active_jobs = 0
220
+ self.max_jobs = max_jobs
221
+ self.jobs_queue: OrderedDict[JobId, JobSpec] = OrderedDict()
209
222
  self.procs: dict[str, subprocess.Popen] = {}
210
223
  self.log_base = log_base
211
224
  self.troika_config = troika_config
212
225
 
213
- def spawn_job(self, job_spec: JobSpec) -> JobId:
214
- job_id = next_uuid(self.jobs.keys(), lambda: str(uuid.uuid4()))
226
+ def maybe_spawn(self) -> None:
227
+ if not self.jobs_queue:
228
+ return
229
+ if self.max_jobs is not None and self.active_jobs >= self.max_jobs:
230
+ logger.debug(f"already running {self.active_jobs}, no spawn")
231
+ return
232
+
233
+ job_id, job_spec = self.jobs_queue.popitem(False)
215
234
  base_addr = f"tcp://{platform.get_bindabble_self()}"
216
235
  socket = get_context().socket(zmq.PULL)
217
236
  port = socket.bind_to_random_port(base_addr)
@@ -222,18 +241,37 @@ class JobRouter:
222
241
  self.procs[job_id] = _spawn_subprocess(
223
242
  job_spec, full_addr, job_id, self.log_base, self.troika_config
224
243
  )
244
+ self.active_jobs += 1
245
+ return job_id
246
+
247
+ def enqueue_job(self, job_spec: JobSpec) -> JobId:
248
+ job_id = next_uuid(
249
+ set(self.jobs.keys()).union(self.jobs_queue.keys()),
250
+ lambda: str(uuid.uuid4()),
251
+ )
252
+ self.jobs_queue[job_id] = job_spec
253
+ self.maybe_spawn()
225
254
  return job_id
226
255
 
227
256
  def progress_of(
228
257
  self, job_ids: Iterable[JobId]
229
- ) -> tuple[dict[JobId, JobProgress], dict[JobId, list[DatasetId]]]:
258
+ ) -> tuple[dict[JobId, JobProgress], dict[JobId, list[DatasetId]], int]:
230
259
  if not job_ids:
231
- job_ids = self.jobs.keys()
232
- progresses = {job_id: self.jobs[job_id].progress for job_id in job_ids}
260
+ job_ids = set(self.jobs.keys()).union(self.jobs_queue.keys())
261
+ progresses = {}
262
+ for job_id in job_ids:
263
+ if job_id in self.jobs:
264
+ progresses[job_id] = self.jobs[job_id].progress
265
+ elif job_id in self.jobs_queue:
266
+ progresses[job_id] = JobProgressEnqueued
267
+ else:
268
+ progresses[job_id] = None
233
269
  datasets = {
234
- job_id: list(self.jobs[job_id].results.keys()) for job_id in job_ids
270
+ job_id: list(self.jobs[job_id].results.keys())
271
+ for job_id in job_ids
272
+ if job_id in self.jobs
235
273
  }
236
- return progresses, datasets
274
+ return progresses, datasets, len(self.jobs_queue)
237
275
 
238
276
  def get_result(self, job_id: JobId, dataset_id: DatasetId) -> bytes:
239
277
  return self.jobs[job_id].results[dataset_id]
@@ -246,6 +284,8 @@ class JobRouter:
246
284
  job = self.jobs[job_id]
247
285
  if progress.completed:
248
286
  self.poller.unregister(job.socket)
287
+ self.active_jobs -= 1
288
+ self.maybe_spawn()
249
289
  if progress.failure is not None and job.progress.failure is None:
250
290
  job.progress = progress
251
291
  elif job.last_seen >= timestamp or job.progress.failure is not None:
cascade/gateway/server.py CHANGED
@@ -31,16 +31,19 @@ def handle_fe(socket: zmq.Socket, jobs: JobRouter) -> bool:
31
31
  rv: api.CascadeGatewayAPI
32
32
  if isinstance(m, api.SubmitJobRequest):
33
33
  try:
34
- job_id = jobs.spawn_job(m.job)
34
+ job_id = jobs.enqueue_job(m.job)
35
35
  rv = api.SubmitJobResponse(job_id=job_id, error=None)
36
36
  except Exception as e:
37
37
  logger.exception(f"failed to spawn a job: {m}")
38
38
  rv = api.SubmitJobResponse(job_id=None, error=repr(e))
39
39
  elif isinstance(m, api.JobProgressRequest):
40
40
  try:
41
- progresses, datasets = jobs.progress_of(m.job_ids)
41
+ progresses, datasets, queue_length = jobs.progress_of(m.job_ids)
42
42
  rv = api.JobProgressResponse(
43
- progresses=progresses, datasets=datasets, error=None
43
+ progresses=progresses,
44
+ datasets=datasets,
45
+ error=None,
46
+ queue_length=queue_length,
44
47
  )
45
48
  except Exception as e:
46
49
  logger.exception(f"failed to get progress of: {m}")
@@ -80,7 +83,10 @@ def handle_controller(socket: zmq.Socket, jobs: JobRouter) -> None:
80
83
 
81
84
 
82
85
  def serve(
83
- url: str, log_base: str | None = None, troika_config: str | None = None
86
+ url: str,
87
+ log_base: str | None = None,
88
+ troika_config: str | None = None,
89
+ max_jobs: int | None = None,
84
90
  ) -> None:
85
91
  ctx = get_context()
86
92
  poller = zmq.Poller()
@@ -88,7 +94,7 @@ def serve(
88
94
  fe = ctx.socket(zmq.REP)
89
95
  fe.bind(url)
90
96
  poller.register(fe, flags=zmq.POLLIN)
91
- jobs = JobRouter(poller, log_base, troika_config)
97
+ jobs = JobRouter(poller, log_base, troika_config, max_jobs)
92
98
 
93
99
  logger.debug("entering recv loop")
94
100
  is_break = False
@@ -1,2 +1,2 @@
1
1
  # Do not change! Do not track in version control!
2
- __version__ = "0.4.5"
2
+ __version__ = "0.4.7"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: earthkit-workflows
3
- Version: 0.4.5
3
+ Version: 0.4.7
4
4
  Summary: Earthkit Workflows is a Python library for declaring earthkit task DAGs, as well as scheduling and executing them on heterogeneous computing systems.
5
5
  Author-email: "European Centre for Medium-Range Weather Forecasts (ECMWF)" <software.support@ecmwf.int>
6
6
  License-Expression: Apache-2.0
@@ -16,7 +16,7 @@ cascade/controller/act.py,sha256=WHIsk4H-Bbyl_DABX2VWhyKy_cNnp12x1nilatPCL8I,298
16
16
  cascade/controller/core.py,sha256=NqvZ5g5GNphwOpzdXbCI0_fxIzzmO97_n2xZKswK72Q,3589
17
17
  cascade/controller/impl.py,sha256=9jdTikYO8OkaNIfzatyr3Mhai5EfEhaeii9GaF9cQw4,3526
18
18
  cascade/controller/notify.py,sha256=5eSPKcxqrv9kHy7St-iIm1NttsyzcvwLhZI5dvr4cEY,5881
19
- cascade/controller/report.py,sha256=FD-MAWZq6pwSw2CP2m4OUBw4hzrX46vKE_FZO5NpjDU,3670
19
+ cascade/controller/report.py,sha256=rKGYmq4nIiDqKuP_C7YSwpEAUOPdjILlDcbKkdUt30s,3772
20
20
  cascade/executor/bridge.py,sha256=WDE-GM2Bv7nUk1-nV-otMGuaRYw1-Vmd7PWploXBp6Y,8267
21
21
  cascade/executor/comms.py,sha256=-9qrKwva6WXkHRQtzSnLFy5gB3bOWuxYJP5fL6Uavw8,8736
22
22
  cascade/executor/config.py,sha256=8azy_sXdvDGO0zTNqA0pdtkXsyihM4FQ4U1W_3Dhua0,1571
@@ -28,14 +28,14 @@ cascade/executor/serde.py,sha256=z6klTOZqW_BVGrbIRNz4FN0_XTfRiKBRQuvgsQIuyAo,282
28
28
  cascade/executor/runner/__init__.py,sha256=30BM80ZyA7w3IrGiKKLSFuhRehbR2Mm99OJ8q5PJ63c,1547
29
29
  cascade/executor/runner/entrypoint.py,sha256=WyxOFGAYDQD_fXsM4H9_6xBrnAmQrCTUnljfcW6-BoM,7918
30
30
  cascade/executor/runner/memory.py,sha256=jkAV9T7-imciVcGvkV7OhRfosEpOQJU1OME7z-4ztAs,6371
31
- cascade/executor/runner/packages.py,sha256=OZjEOvKy8LQ2uguGZU1L7TVYz1415JOUGySRfU_D_sc,2513
31
+ cascade/executor/runner/packages.py,sha256=zOMCKRX34j6vSIK741KV7kTFRgy3bZz_xeLrldslaL4,2633
32
32
  cascade/executor/runner/runner.py,sha256=zqpkvxdWLbwyUFaUbZmSj0KQEBNRpmF8gwVotiaamhc,4870
33
33
  cascade/gateway/__init__.py,sha256=1EzMKdLFXEucj0YWOlyVqLx4suOntitwM03T_rRubIk,829
34
- cascade/gateway/__main__.py,sha256=F_wft7ja5ckM0SqeXsy_u2j-Ch6OTlpbTTlYtDkvGMI,917
35
- cascade/gateway/api.py,sha256=-Vuo9fDqFNFIofcHZ79UB1rTWnQR3D9Pna2CjqdyHaE,3021
34
+ cascade/gateway/__main__.py,sha256=kmfklSeA7v5ie75SBHOql-eHuY6x4eTHlItMYqCQ1Pg,969
35
+ cascade/gateway/api.py,sha256=vPYfiuEjBeddFnCPZpr4_9ovuhGdZ3_migzKTUtvF98,3050
36
36
  cascade/gateway/client.py,sha256=1p4Tvrf-BH0LQHOES5rY1z3JNIfmXcqWG2kYl4rpcE0,4061
37
- cascade/gateway/router.py,sha256=RcDniyPOZnu6_HuMUrQjZ4P-PoUbUVezvYXG_ryBLUg,10399
38
- cascade/gateway/server.py,sha256=vb3z0TfoMvSHqczhmYgzeXGVcw2M9yGpyW0t6d57Oag,3827
37
+ cascade/gateway/router.py,sha256=9oTkqssb3dHF24TIaAn_7oQoNfm4qkOvriufbOJxnyE,11582
38
+ cascade/gateway/server.py,sha256=BfUKpU2nCEB_zI4BdZU_9zHYHX1WoQaLARCTxMSP0Nk,3971
39
39
  cascade/low/__init__.py,sha256=5cw2taOGITK_gFbICftzK2YLdEAnLUY5OzblFzdHss4,769
40
40
  cascade/low/builders.py,sha256=_u5X8G_EF00hFt8Anv9AXo6yPf1O8MHDmqs2kKmREl0,7073
41
41
  cascade/low/core.py,sha256=_3x4ka_pmCgZbfwFeyhq8S4M6wmh0s24VRCLhk5yQFM,6444
@@ -59,7 +59,7 @@ cascade/shm/disk.py,sha256=Fdl_pKOseaXroRp01OwqWVsdI-sSmiFizIFCdxBuMWM,2653
59
59
  cascade/shm/func.py,sha256=ZWikgnSLCmbSoW2LDRJwtjxdwTxkR00OUHAsIRQ-ChE,638
60
60
  cascade/shm/server.py,sha256=LnnNX0F6QJt5V_JLfmC3ZMHGNL5WpLY44wpB_pYDr7Y,5042
61
61
  earthkit/workflows/__init__.py,sha256=-p4anEn0YQbYWM2tbXb0Vc3wq4-m6kFhcNEgAVu5Jis,1948
62
- earthkit/workflows/_version.py,sha256=E5wa5mKarJfqqG7Ah0-faFraDaomimplznAnXRNZfLw,72
62
+ earthkit/workflows/_version.py,sha256=Vm-kUKLcx3Zcn5dopRQ_3Wdf6JGosDY_tCioAs7HFG0,72
63
63
  earthkit/workflows/decorators.py,sha256=DM4QAtQ2glUUcDecwPkXcdlu4dio7MvgpcdmU5LYvD8,937
64
64
  earthkit/workflows/fluent.py,sha256=IN_sqwr7W8wbwP7wTOklgnjVe34IUCmv1ku-DWVTCJc,30179
65
65
  earthkit/workflows/mark.py,sha256=PdsXmRfhw1SyyJ74mzFPsLRqMCdlYv556fFX4bqlh9Y,1319
@@ -89,8 +89,8 @@ earthkit/workflows/graph/split.py,sha256=t-Sji5eZb01QO1szqmDNTodDDALqdo-0R0x1ESs
89
89
  earthkit/workflows/graph/transform.py,sha256=BZ8n7ePUnuGgoHkMqZC3SLzifu4oq6q6t6vka0khFtg,3842
90
90
  earthkit/workflows/graph/visit.py,sha256=MP-aFSqOl7aqJY2i7QTgY4epqb6yM7_lK3ofvOqfahw,1755
91
91
  earthkit/workflows/plugins/__init__.py,sha256=nhMAC0eMLxoJamjqB5Ns0OWy0OuxEJ_YvaDFGEQITls,129
92
- earthkit_workflows-0.4.5.dist-info/licenses/LICENSE,sha256=73MJ7twXMKnWwmzmrMiFwUeY7c6JTvxphVggeUq9Sq4,11381
93
- earthkit_workflows-0.4.5.dist-info/METADATA,sha256=zR79j7-OUPmbz7_K8oXuYCfYSZJlptNepUxPS6j-CZs,1571
94
- earthkit_workflows-0.4.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
95
- earthkit_workflows-0.4.5.dist-info/top_level.txt,sha256=oNrH3Km3hK5kDkTOiM-8G8OQglvZcy-gUKy7rlooWXs,17
96
- earthkit_workflows-0.4.5.dist-info/RECORD,,
92
+ earthkit_workflows-0.4.7.dist-info/licenses/LICENSE,sha256=73MJ7twXMKnWwmzmrMiFwUeY7c6JTvxphVggeUq9Sq4,11381
93
+ earthkit_workflows-0.4.7.dist-info/METADATA,sha256=8hfeDm0i94yYA1_CJRFcGrfWE1r7MCzb-HyzvIc5tPs,1571
94
+ earthkit_workflows-0.4.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
95
+ earthkit_workflows-0.4.7.dist-info/top_level.txt,sha256=oNrH3Km3hK5kDkTOiM-8G8OQglvZcy-gUKy7rlooWXs,17
96
+ earthkit_workflows-0.4.7.dist-info/RECORD,,