matensemble 0.3.7__tar.gz → 0.3.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. {matensemble-0.3.7 → matensemble-0.3.9}/PKG-INFO +1 -1
  2. {matensemble-0.3.7 → matensemble-0.3.9}/pyproject.toml +1 -1
  3. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/chore.py +9 -0
  4. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/ensemble.py +3 -0
  5. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/fluxlet.py +4 -7
  6. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/manager.py +31 -9
  7. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/strategy.py +12 -2
  8. {matensemble-0.3.7 → matensemble-0.3.9}/LICENSE +0 -0
  9. {matensemble-0.3.7 → matensemble-0.3.9}/README.md +0 -0
  10. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/.python-version +0 -0
  11. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/README.md +0 -0
  12. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/TODO.md +0 -0
  13. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/__init__.py +0 -0
  14. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dash/assets/index-1X2cLUgt.js +0 -0
  15. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dash/assets/index-DRkGfWlx.css +0 -0
  16. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dash/index.html +0 -0
  17. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dash/vite.svg +0 -0
  18. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/__init__.py +0 -0
  19. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/driver.py +0 -0
  20. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/postprocessors/__init__.py +0 -0
  21. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/postprocessors/bispectrum_calculator.py +0 -0
  22. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/postprocessors/compute_diffraction.py +0 -0
  23. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/postprocessors/compute_order_from_pairs.py +0 -0
  24. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/postprocessors/compute_twist.py +0 -0
  25. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/postprocessors/correlations.py +0 -0
  26. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/postprocessors/ovito_calculators.py +0 -0
  27. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/task_lib/AnalysisSubprocess.py +0 -0
  28. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/task_lib/AnalysysDescriptor.py +0 -0
  29. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/task_lib/MDSubprocess.py +0 -0
  30. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/task_lib/__init__.py +0 -0
  31. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/task_lib/analysis_registry.py +0 -0
  32. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/utils/__init__.py +0 -0
  33. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/utils/lammps_init.py +0 -0
  34. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/utils/preprocessors.py +0 -0
  35. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/utils/stat.py +0 -0
  36. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/utils/stress_rotate_z_theta.py +0 -0
  37. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/logger.py +0 -0
  38. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/model.py +0 -0
  39. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/pipeline.py +0 -0
  40. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/redis/__init__.py +0 -0
  41. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/redis/service.py +0 -0
  42. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/redis/test.py +0 -0
  43. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/runtime_worker.py +0 -0
  44. {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matensemble
3
- Version: 0.3.7
3
+ Version: 0.3.9
4
4
  Summary: An adaptive and highly asynchronous ensemble simulation workflow manager MatEnsemble (https://github.com/Q-CAD/MatEnsemble) built jointly on top of the hierarchical graph based scheduler FLUX and concurrent-futures infrastructure of python
5
5
  Author: Soumendu Bagchi, Kaleb Duchesneau
6
6
  Author-email: Soumendu Bagchi <soumendubagchi@gmail.com>, Kaleb Duchesneau <kalebduchesneau@gmail.com>
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "matensemble"
3
- version = "0.3.7"
3
+ version = "0.3.9"
4
4
  description = "An adaptive and highly asynchronous ensemble simulation workflow manager MatEnsemble (https://github.com/Q-CAD/MatEnsemble) built jointly on top of the hierarchical graph based scheduler FLUX and concurrent-futures infrastructure of python"
5
5
  readme = "README.md"
6
6
  license-files = ["LICENSE"]
@@ -53,6 +53,7 @@ class Chore:
53
53
  deps: tuple[str, ...] = (),
54
54
  args: tuple = (),
55
55
  kwargs: dict | None = None,
56
+ nnodes: int | None = None,
56
57
  ) -> None:
57
58
  """
58
59
  The constructor for a :obj:`Chore`
@@ -84,6 +85,12 @@ class Chore:
84
85
  The arguments to give the function if type is PYTHON
85
86
  kwargs : dict
86
87
  The key-word arguments to give the function if flavor is PYTHON
88
+ nnodes : int, optional
89
+ When set, this chore will be scheduled via ``per_resource`` and will
90
+ occupy *nnodes* whole nodes (all cores and all GPUs on each node).
91
+ The manager uses this to compute the true resource footprint instead
92
+ of ``num_tasks * cores_per_task`` / ``num_tasks * gpus_per_task``.
93
+ Leave as ``None`` for normal ``from_command`` chores.
87
94
  """
88
95
 
89
96
  self.id = id
@@ -100,6 +107,7 @@ class Chore:
100
107
  self.deps = deps
101
108
  self.args = args
102
109
  self.kwargs = {} if kwargs is None else kwargs
110
+ self.nnodes = nnodes
103
111
 
104
112
  def graph(self) -> nx.DiGraph:
105
113
  return nx.DiGraph()
@@ -121,6 +129,7 @@ class Chore:
121
129
  "deps": list(self.deps),
122
130
  "args": _json_safe(self.args),
123
131
  "kwargs": _json_safe(self.kwargs),
132
+ "nnodes": self.nnodes,
124
133
  }
125
134
 
126
135
  def _write_metadata(self) -> None:
@@ -7,6 +7,7 @@ from networkx import write_network_text
7
7
  from matensemble.manager import FluxManager
8
8
  from matensemble.chore import Chore, ChoreType
9
9
  from matensemble.model import Resources
10
+
10
11
  # from matensemble.dynopro.driver import online_dynamics
11
12
 
12
13
 
@@ -73,6 +74,7 @@ class EnsembleDynamicsRunner:
73
74
  num_tasks=self.tasks_per_job,
74
75
  cores_per_task=self.cores_per_task,
75
76
  gpus_per_task=self.gpus_per_task,
77
+ mpi=True,
76
78
  )
77
79
  workdir = outdir / chore_id
78
80
 
@@ -85,6 +87,7 @@ class EnsembleDynamicsRunner:
85
87
  chore_type=ChoreType.EXECUTABLE,
86
88
  resources=resources,
87
89
  workdir=workdir,
90
+ nnodes=self.nnodes,
88
91
  )
89
92
  )
90
93
 
@@ -26,7 +26,7 @@ class Fluxlet:
26
26
  handle: flux.Flux,
27
27
  ) -> None:
28
28
  self.handle = handle
29
- self.gpus_per_node = self.get_gpus_per_node()
29
+ self.num_nodes, self.gpus_per_node = self.get_gpus_per_node()
30
30
 
31
31
  def get_gpus_per_node(self) -> tuple[int, int]:
32
32
  """
@@ -50,7 +50,6 @@ class Fluxlet:
50
50
  chore: Chore,
51
51
  set_cpu_affinity: bool | None = None,
52
52
  set_gpu_affinity: bool | None = None,
53
- nnodes: int | None = None,
54
53
  dynopro: bool | None = None,
55
54
  ) -> flux.job.FluxExecutorFuture:
56
55
  """
@@ -81,11 +80,11 @@ class Fluxlet:
81
80
  if dynopro:
82
81
  jobspec = flux.job.JobspecV1.per_resource(
83
82
  chore.command,
84
- ncores=chore.resources.num_tasks,
85
- nnodes=nnodes,
83
+ nnodes=chore.nnodes,
86
84
  gpus_per_node=self.gpus_per_node,
87
- per_resource_type="core",
85
+ per_resource_type="node",
88
86
  per_resource_count=1,
87
+ exclusive=True, # dynopro needs whole nodes
89
88
  )
90
89
 
91
90
  chore.workdir.mkdir(parents=True, exist_ok=True)
@@ -106,8 +105,6 @@ class Fluxlet:
106
105
  base_env["SLURM_GPUS_PER_NODE"] = str(self.gpus_per_node)
107
106
  jobspec.environment = base_env
108
107
 
109
- fut = executor.submit(jobspec)
110
-
111
108
  fut = executor.submit(jobspec)
112
109
  fut.chore_id = chore.id
113
110
  fut.chore_obj = chore
@@ -272,6 +272,26 @@ class FluxManager:
272
272
  gpus_per_node = total_gpus // nnodes
273
273
  return nnodes, cores_per_node, gpus_per_node
274
274
 
275
+ def _chore_resource_footprint(self, chore: Chore) -> tuple[int, int]:
276
+ """
277
+ Return ``(needed_cores, needed_gpus)`` for a chore.
278
+
279
+ For whole-node (dynopro) chores — those with ``chore.nnodes`` set — the
280
+ footprint is ``nnodes * cores_per_node`` and ``nnodes * gpus_per_node``
281
+ because ``per_resource`` allocates entire nodes and every core and GPU on
282
+ them becomes unavailable. For ordinary chores the footprint is the
283
+ familiar ``num_tasks * cores_per_task`` / ``num_tasks * gpus_per_task``.
284
+ """
285
+
286
+ if getattr(chore, "nnodes", None) is not None:
287
+ needed_cores = chore.nnodes * self._cores_per_node
288
+ needed_gpus = chore.nnodes * self._gpus_per_node
289
+ else:
290
+ needed_cores = chore.resources.num_tasks * chore.resources.cores_per_task
291
+ needed_gpus = chore.resources.num_tasks * chore.resources.gpus_per_task
292
+
293
+ return needed_cores, needed_gpus
294
+
275
295
  def _chore_fits_allocation(self, chore: Chore) -> bool:
276
296
  """
277
297
  Checks whether the given chore is too big to be submitted
@@ -282,8 +302,7 @@ class FluxManager:
282
302
  The :obj:`Chore` to check if it will fit in the allocation
283
303
  """
284
304
 
285
- needed_cores = chore.resources.num_tasks * chore.resources.cores_per_task
286
- needed_gpus = chore.resources.num_tasks * chore.resources.gpus_per_task
305
+ needed_cores, needed_gpus = self._chore_resource_footprint(chore)
287
306
 
288
307
  total_cores = self._nnodes_on_allocation * self._cores_per_node
289
308
  total_gpus = self._nnodes_on_allocation * self._gpus_per_node
@@ -334,8 +353,7 @@ class FluxManager:
334
353
  Checks to see if there are enough resources to submit the given :obj:`Chore`
335
354
  """
336
355
 
337
- needed_cores = chore.resources.num_tasks * chore.resources.cores_per_task
338
- needed_gpus = chore.resources.num_tasks * chore.resources.gpus_per_task
356
+ needed_cores, needed_gpus = self._chore_resource_footprint(chore)
339
357
  return self._free_cores >= needed_cores and self._free_gpus >= needed_gpus
340
358
 
341
359
  def _has_failed(self, chore_id: str) -> bool:
@@ -399,7 +417,9 @@ class FluxManager:
399
417
 
400
418
  self._fail_dependents(dep_id)
401
419
 
402
- def _submit_one(self, chore_id: str, buffer_time: float) -> None:
420
+ def _submit_one(
421
+ self, chore_id: str, buffer_time: float, dynopro: bool = False
422
+ ) -> None:
403
423
  """
404
424
  Submits a :obj:`Chore` and does the book-keeping for all the queues and resources
405
425
  count
@@ -418,7 +438,7 @@ class FluxManager:
418
438
  chore,
419
439
  set_cpu_affinity=self._set_cpu_affinity,
420
440
  set_gpu_affinity=self._set_gpu_affinity,
421
- nnodes=None,
441
+ dynopro=dynopro,
422
442
  )
423
443
  except Exception as e:
424
444
  self._logger.exception("CHORE SUBMIT FAILED: chore=%s", chore_id)
@@ -441,7 +461,9 @@ class FluxManager:
441
461
  self._free_gpus -= chore.resources.num_tasks * chore.resources.gpus_per_task
442
462
  time.sleep(buffer_time)
443
463
 
444
- def _submit_until_ooresources(self, buffer_time: float) -> bool:
464
+ def _submit_until_ooresources(
465
+ self, buffer_time: float, dynopro: bool = False
466
+ ) -> bool:
445
467
  """
446
468
  Submit as many chores as possible until out-of-resources
447
469
 
@@ -459,7 +481,7 @@ class FluxManager:
459
481
  chore = self._chores_by_id[chore_id]
460
482
 
461
483
  if self._can_submit_now(chore):
462
- self._submit_one(chore_id, buffer_time)
484
+ self._submit_one(chore_id, buffer_time, dynopro=dynopro)
463
485
  submitted_any = True
464
486
  else:
465
487
  deferred.append(chore_id)
@@ -636,7 +658,7 @@ class FluxManager:
636
658
  )
637
659
  while not done:
638
660
  self._check_resources()
639
- self._submit_until_ooresources(buffer_time=buffer_time)
661
+ self._submit_until_ooresources(buffer_time=buffer_time, dynopro=dynopro)
640
662
  proc_strat.process_futures(buffer_time=buffer_time)
641
663
 
642
664
  done = (
@@ -11,6 +11,7 @@ from pathlib import Path
11
11
 
12
12
  from abc import ABC, abstractmethod
13
13
 
14
+ from matensemble import dynopro
14
15
  from matensemble.model import OutputReference
15
16
 
16
17
 
@@ -127,7 +128,10 @@ class AdaptiveStrategy(FutureProcessingStrategy):
127
128
  self.manager._blocked.discard(dep_id)
128
129
 
129
130
  # adaptively submit another chore
130
- self.manager._submit_until_ooresources(buffer_time=buffer_time)
131
+ self.manager._submit_until_ooresources(
132
+ buffer_time=buffer_time,
133
+ dynopro=getattr(self.manager, "_dynopro", False),
134
+ )
131
135
 
132
136
  if self.manager._write_restart_freq and (
133
137
  len(self.manager._completed_chores) % self.manager._write_restart_freq
@@ -304,7 +308,7 @@ class UserStrategy(FutureProcessingStrategy):
304
308
  self.manager._blocked.discard(dep_id)
305
309
 
306
310
  # --- Processing the chore and spawning the new one ---
307
- if self.proc_chore == chore_name:
311
+ if chore_name == self.proc_chore:
308
312
  try:
309
313
  # Trust boundary: result.pickle is written by matensemble.runtime_worker
310
314
  # in this workflow's chore workdir only—do not load pickles from
@@ -339,6 +343,12 @@ class UserStrategy(FutureProcessingStrategy):
339
343
  f"bolo_match={chore_name} | due the following Exception ->\n{e}"
340
344
  )
341
345
 
346
+ # adaptively submit another chore
347
+ self.manager._submit_until_ooresources(
348
+ buffer_time=buffer_time,
349
+ dynopro=getattr(self.manager, "_dynopro", False),
350
+ )
351
+
342
352
  if self.manager._write_restart_freq and (
343
353
  len(self.manager._completed_chores) % self.manager._write_restart_freq
344
354
  == 0
File without changes
File without changes