matensemble 0.3.7__tar.gz → 0.3.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. {matensemble-0.3.7 → matensemble-0.3.8}/PKG-INFO +1 -1
  2. {matensemble-0.3.7 → matensemble-0.3.8}/pyproject.toml +1 -1
  3. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/chore.py +9 -0
  4. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/fluxlet.py +3 -5
  5. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/manager.py +31 -8
  6. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/strategy.py +11 -1
  7. {matensemble-0.3.7 → matensemble-0.3.8}/LICENSE +0 -0
  8. {matensemble-0.3.7 → matensemble-0.3.8}/README.md +0 -0
  9. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/.python-version +0 -0
  10. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/README.md +0 -0
  11. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/TODO.md +0 -0
  12. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/__init__.py +0 -0
  13. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dash/assets/index-1X2cLUgt.js +0 -0
  14. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dash/assets/index-DRkGfWlx.css +0 -0
  15. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dash/index.html +0 -0
  16. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dash/vite.svg +0 -0
  17. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/__init__.py +0 -0
  18. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/driver.py +0 -0
  19. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/ensemble.py +0 -0
  20. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/postprocessors/__init__.py +0 -0
  21. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/postprocessors/bispectrum_calculator.py +0 -0
  22. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/postprocessors/compute_diffraction.py +0 -0
  23. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/postprocessors/compute_order_from_pairs.py +0 -0
  24. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/postprocessors/compute_twist.py +0 -0
  25. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/postprocessors/correlations.py +0 -0
  26. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/postprocessors/ovito_calculators.py +0 -0
  27. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/task_lib/AnalysisSubprocess.py +0 -0
  28. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/task_lib/AnalysysDescriptor.py +0 -0
  29. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/task_lib/MDSubprocess.py +0 -0
  30. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/task_lib/__init__.py +0 -0
  31. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/task_lib/analysis_registry.py +0 -0
  32. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/utils/__init__.py +0 -0
  33. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/utils/lammps_init.py +0 -0
  34. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/utils/preprocessors.py +0 -0
  35. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/utils/stat.py +0 -0
  36. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/utils/stress_rotate_z_theta.py +0 -0
  37. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/logger.py +0 -0
  38. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/model.py +0 -0
  39. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/pipeline.py +0 -0
  40. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/redis/__init__.py +0 -0
  41. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/redis/service.py +0 -0
  42. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/redis/test.py +0 -0
  43. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/runtime_worker.py +0 -0
  44. {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matensemble
3
- Version: 0.3.7
3
+ Version: 0.3.8
4
4
  Summary: An adaptive and highly asynchronous ensemble simulation workflow manager MatEnsemble (https://github.com/Q-CAD/MatEnsemble) built jointly on top of the hierarchical graph based scheduler FLUX and concurrent-futures infrastructure of python
5
5
  Author: Soumendu Bagchi, Kaleb Duchesneau
6
6
  Author-email: Soumendu Bagchi <soumendubagchi@gmail.com>, Kaleb Duchesneau <kalebduchesneau@gmail.com>
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "matensemble"
3
- version = "0.3.7"
3
+ version = "0.3.8"
4
4
  description = "An adaptive and highly asynchronous ensemble simulation workflow manager MatEnsemble (https://github.com/Q-CAD/MatEnsemble) built jointly on top of the hierarchical graph based scheduler FLUX and concurrent-futures infrastructure of python"
5
5
  readme = "README.md"
6
6
  license-files = ["LICENSE"]
@@ -53,6 +53,7 @@ class Chore:
53
53
  deps: tuple[str, ...] = (),
54
54
  args: tuple = (),
55
55
  kwargs: dict | None = None,
56
+ nnodes: int | None = None,
56
57
  ) -> None:
57
58
  """
58
59
  The constructor for a :obj:`Chore`
@@ -84,6 +85,12 @@ class Chore:
84
85
  The arguments to give the function if type is PYTHON
85
86
  kwargs : dict
86
87
  The keyword arguments to give the function if flavor is PYTHON
88
+ nnodes : int, optional
89
+ When set, this chore will be scheduled via ``per_resource`` and will
90
+ occupy *nnodes* whole nodes (all cores and all GPUs on each node).
91
+ The manager uses this to compute the true resource footprint instead
92
+ of ``num_tasks * cores_per_task`` / ``num_tasks * gpus_per_task``.
93
+ Leave as ``None`` for normal ``from_command`` chores.
87
94
  """
88
95
 
89
96
  self.id = id
@@ -100,6 +107,7 @@ class Chore:
100
107
  self.deps = deps
101
108
  self.args = args
102
109
  self.kwargs = {} if kwargs is None else kwargs
110
+ self.nnodes = nnodes
103
111
 
104
112
  def graph(self) -> nx.DiGraph:
105
113
  return nx.DiGraph()
@@ -121,6 +129,7 @@ class Chore:
121
129
  "deps": list(self.deps),
122
130
  "args": _json_safe(self.args),
123
131
  "kwargs": _json_safe(self.kwargs),
132
+ "nnodes": self.nnodes,
124
133
  }
125
134
 
126
135
  def _write_metadata(self) -> None:
@@ -26,7 +26,7 @@ class Fluxlet:
26
26
  handle: flux.Flux,
27
27
  ) -> None:
28
28
  self.handle = handle
29
- self.gpus_per_node = self.get_gpus_per_node()
29
+ self.num_nodes, self.gpus_per_node = self.get_gpus_per_node()
30
30
 
31
31
  def get_gpus_per_node(self) -> tuple[int, int]:
32
32
  """
@@ -81,11 +81,11 @@ class Fluxlet:
81
81
  if dynopro:
82
82
  jobspec = flux.job.JobspecV1.per_resource(
83
83
  chore.command,
84
- ncores=chore.resources.num_tasks,
85
84
  nnodes=nnodes,
86
85
  gpus_per_node=self.gpus_per_node,
87
- per_resource_type="core",
86
+ per_resource_type="node",
88
87
  per_resource_count=1,
88
+ exclusive=True, # dynopro needs whole nodes
89
89
  )
90
90
 
91
91
  chore.workdir.mkdir(parents=True, exist_ok=True)
@@ -106,8 +106,6 @@ class Fluxlet:
106
106
  base_env["SLURM_GPUS_PER_NODE"] = str(self.gpus_per_node)
107
107
  jobspec.environment = base_env
108
108
 
109
- fut = executor.submit(jobspec)
110
-
111
109
  fut = executor.submit(jobspec)
112
110
  fut.chore_id = chore.id
113
111
  fut.chore_obj = chore
@@ -272,6 +272,26 @@ class FluxManager:
272
272
  gpus_per_node = total_gpus // nnodes
273
273
  return nnodes, cores_per_node, gpus_per_node
274
274
 
275
+ def _chore_resource_footprint(self, chore: Chore) -> tuple[int, int]:
276
+ """
277
+ Return ``(needed_cores, needed_gpus)`` for a chore.
278
+
279
+ For whole-node (dynopro) chores — those with ``chore.nnodes`` set — the
280
+ footprint is ``nnodes * cores_per_node`` and ``nnodes * gpus_per_node``
281
+ because ``per_resource`` allocates entire nodes and every core and GPU on
282
+ them becomes unavailable. For ordinary chores the footprint is the
283
+ familiar ``num_tasks * cores_per_task`` / ``num_tasks * gpus_per_task``.
284
+ """
285
+
286
+ if getattr(chore, "nnodes", None) is not None:
287
+ needed_cores = chore.nnodes * self._cores_per_node
288
+ needed_gpus = chore.nnodes * self._gpus_per_node
289
+ else:
290
+ needed_cores = chore.resources.num_tasks * chore.resources.cores_per_task
291
+ needed_gpus = chore.resources.num_tasks * chore.resources.gpus_per_task
292
+
293
+ return needed_cores, needed_gpus
294
+
275
295
  def _chore_fits_allocation(self, chore: Chore) -> bool:
276
296
  """
277
297
  Checks whether the given chore is too big to be submitted
@@ -282,8 +302,7 @@ class FluxManager:
282
302
  The :obj:`Chore` to check if it will fit in the allocation
283
303
  """
284
304
 
285
- needed_cores = chore.resources.num_tasks * chore.resources.cores_per_task
286
- needed_gpus = chore.resources.num_tasks * chore.resources.gpus_per_task
305
+ needed_cores, needed_gpus = self._chore_resource_footprint(chore)
287
306
 
288
307
  total_cores = self._nnodes_on_allocation * self._cores_per_node
289
308
  total_gpus = self._nnodes_on_allocation * self._gpus_per_node
@@ -334,8 +353,7 @@ class FluxManager:
334
353
  Checks to see if there are enough resources to submit the given :obj:`Chore`
335
354
  """
336
355
 
337
- needed_cores = chore.resources.num_tasks * chore.resources.cores_per_task
338
- needed_gpus = chore.resources.num_tasks * chore.resources.gpus_per_task
356
+ needed_cores, needed_gpus = self._chore_resource_footprint(chore)
339
357
  return self._free_cores >= needed_cores and self._free_gpus >= needed_gpus
340
358
 
341
359
  def _has_failed(self, chore_id: str) -> bool:
@@ -399,7 +417,9 @@ class FluxManager:
399
417
 
400
418
  self._fail_dependents(dep_id)
401
419
 
402
- def _submit_one(self, chore_id: str, buffer_time: float) -> None:
420
+ def _submit_one(
421
+ self, chore_id: str, buffer_time: float, dynopro: bool = False
422
+ ) -> None:
403
423
  """
404
424
  Submits a :obj:`Chore` and does the book-keeping for all the queues and resource
404
424
  counts
@@ -419,6 +439,7 @@ class FluxManager:
419
439
  set_cpu_affinity=self._set_cpu_affinity,
420
440
  set_gpu_affinity=self._set_gpu_affinity,
421
441
  nnodes=None,
442
+ dynopro=dynopro,
422
443
  )
423
444
  except Exception as e:
424
445
  self._logger.exception("CHORE SUBMIT FAILED: chore=%s", chore_id)
@@ -441,7 +462,9 @@ class FluxManager:
441
462
  self._free_gpus -= chore.resources.num_tasks * chore.resources.gpus_per_task
442
463
  time.sleep(buffer_time)
443
464
 
444
- def _submit_until_ooresources(self, buffer_time: float) -> bool:
465
+ def _submit_until_ooresources(
466
+ self, buffer_time: float, dynopro: bool = False
467
+ ) -> bool:
445
468
  """
446
469
  Submit as many chores as possible until out-of-resources
447
470
 
@@ -459,7 +482,7 @@ class FluxManager:
459
482
  chore = self._chores_by_id[chore_id]
460
483
 
461
484
  if self._can_submit_now(chore):
462
- self._submit_one(chore_id, buffer_time)
485
+ self._submit_one(chore_id, buffer_time, dynopro=dynopro)
463
486
  submitted_any = True
464
487
  else:
465
488
  deferred.append(chore_id)
@@ -636,7 +659,7 @@ class FluxManager:
636
659
  )
637
660
  while not done:
638
661
  self._check_resources()
639
- self._submit_until_ooresources(buffer_time=buffer_time)
662
+ self._submit_until_ooresources(buffer_time=buffer_time, dynopro=dynopro)
640
663
  proc_strat.process_futures(buffer_time=buffer_time)
641
664
 
642
665
  done = (
@@ -11,6 +11,7 @@ from pathlib import Path
11
11
 
12
12
  from abc import ABC, abstractmethod
13
13
 
14
+ from matensemble import dynopro
14
15
  from matensemble.model import OutputReference
15
16
 
16
17
 
@@ -127,7 +128,10 @@ class AdaptiveStrategy(FutureProcessingStrategy):
127
128
  self.manager._blocked.discard(dep_id)
128
129
 
129
130
  # adaptively submit another chore
130
- self.manager._submit_until_ooresources(buffer_time=buffer_time)
131
+ self.manager._submit_until_ooresources(
132
+ buffer_time=buffer_time,
133
+ dynopro=getattr(self.manager, "_dynopro", False),
134
+ )
131
135
 
132
136
  if self.manager._write_restart_freq and (
133
137
  len(self.manager._completed_chores) % self.manager._write_restart_freq
@@ -339,6 +343,12 @@ class UserStrategy(FutureProcessingStrategy):
339
343
  f"bolo_match={chore_name} | due the following Exception ->\n{e}"
340
344
  )
341
345
 
346
+ # adaptively submit another chore
347
+ self.manager._submit_until_ooresources(
348
+ buffer_time=buffer_time,
349
+ dynopro=getattr(self.manager, "_dynopro", False),
350
+ )
351
+
342
352
  if self.manager._write_restart_freq and (
343
353
  len(self.manager._completed_chores) % self.manager._write_restart_freq
344
354
  == 0
File without changes
File without changes