dpdispatcher 0.5.6__tar.gz → 0.5.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dpdispatcher might be problematic.

Files changed (232)
  1. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/.github/workflows/pyright.yml +2 -0
  2. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/.pre-commit-config.yaml +2 -2
  3. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/PKG-INFO +14 -5
  4. dpdispatcher-0.5.8/README.md +33 -0
  5. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/batch.md +10 -2
  6. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/__init__.py +2 -0
  7. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/_version.py +2 -2
  8. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/base_context.py +0 -3
  9. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/distributed_shell.py +6 -7
  10. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/dp_cloud_server.py +3 -1
  11. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/dp_cloud_server_context.py +0 -3
  12. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/dpcloudserver/client.py +1 -1
  13. dpdispatcher-0.5.8/dpdispatcher/fugaku.py +94 -0
  14. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/hdfs_context.py +0 -3
  15. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/lazy_local_context.py +0 -4
  16. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/local_context.py +0 -4
  17. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/lsf.py +12 -2
  18. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/machine.py +18 -2
  19. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/pbs.py +14 -2
  20. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/shell.py +14 -3
  21. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/slurm.py +69 -16
  22. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/ssh_context.py +21 -17
  23. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/submission.py +158 -41
  24. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher.egg-info/PKG-INFO +14 -5
  25. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher.egg-info/SOURCES.txt +4 -0
  26. dpdispatcher-0.5.8/tests/jsons/machine_fugaku.json +24 -0
  27. dpdispatcher-0.5.8/tests/jsons/machine_local_fugaku.json +18 -0
  28. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_class_resources.py +3 -0
  29. dpdispatcher-0.5.8/tests/test_run_submission.py +213 -0
  30. dpdispatcher-0.5.6/tests/test_run_submission.py → dpdispatcher-0.5.8/tests/test_run_submission_ratio_unfinished.py +16 -10
  31. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_ssh_context.py +48 -0
  32. dpdispatcher-0.5.6/README.md +0 -24
  33. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/.github/workflows/ci-docker.yml +0 -0
  34. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/.github/workflows/machines.yml +0 -0
  35. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/.github/workflows/mirror_gitee.yml +0 -0
  36. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/.github/workflows/publish_conda.yml +0 -0
  37. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/.github/workflows/release.yml +0 -0
  38. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/.github/workflows/test.yml +0 -0
  39. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/.gitignore +0 -0
  40. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/CONTRIBUTING.md +0 -0
  41. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/Dockerfile +0 -0
  42. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/LICENSE +0 -0
  43. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/LICENSE +0 -0
  44. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/README.md +0 -0
  45. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/pbs/docker-compose.yml +0 -0
  46. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/pbs/start-pbs.sh +0 -0
  47. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/pbs.sh +0 -0
  48. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/slurm/docker-compose.yml +0 -0
  49. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/slurm/register_cluster.sh +0 -0
  50. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/slurm/start-slurm.sh +0 -0
  51. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/slurm.sh +0 -0
  52. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/ssh/docker-compose.yml +0 -0
  53. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/ssh/start-ssh.sh +0 -0
  54. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/ssh.sh +0 -0
  55. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/ssh_rsync.sh +0 -0
  56. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/codecov.yml +0 -0
  57. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/conda/conda_build_config.yaml +0 -0
  58. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/conda/meta.yaml +0 -0
  59. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/.gitignore +0 -0
  60. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/Makefile +0 -0
  61. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/conf.py +0 -0
  62. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/context.md +0 -0
  63. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/credits.rst +0 -0
  64. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/dpdispatcher_on_yarn.md +0 -0
  65. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/examples/expanse.md +0 -0
  66. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/examples/g16.md +0 -0
  67. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/examples/shell.md +0 -0
  68. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/getting-started.md +0 -0
  69. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/index.rst +0 -0
  70. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/install.md +0 -0
  71. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/machine.rst +0 -0
  72. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/make.bat +0 -0
  73. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/requirements.txt +0 -0
  74. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/resources.rst +0 -0
  75. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/task.rst +0 -0
  76. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/JobStatus.py +0 -0
  77. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/arginfo.py +0 -0
  78. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/dpcloudserver/__init__.py +0 -0
  79. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/dpcloudserver/config.py +0 -0
  80. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/dpcloudserver/retcode.py +0 -0
  81. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/dpcloudserver/temp_test.py +0 -0
  82. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/dpcloudserver/zip_file.py +0 -0
  83. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/dpdisp.py +0 -0
  84. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/hdfs_cli.py +0 -0
  85. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/utils.py +0 -0
  86. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher.egg-info/dependency_links.txt +0 -0
  87. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher.egg-info/entry_points.txt +0 -0
  88. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher.egg-info/requires.txt +0 -0
  89. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher.egg-info/top_level.txt +0 -0
  90. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/examples/machine/expanse.json +0 -0
  91. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/examples/machine/lazy_local.json +0 -0
  92. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/examples/machine/mandu.json +0 -0
  93. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/examples/resources/expanse_cpu.json +0 -0
  94. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/examples/resources/mandu.json +0 -0
  95. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/examples/task/deepmd-kit.json +0 -0
  96. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/examples/task/g16.json +0 -0
  97. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/pyproject.toml +0 -0
  98. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/scripts/script_gen_dargs_docs.py +0 -0
  99. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/scripts/script_gen_dargs_json.py +0 -0
  100. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/setup.cfg +0 -0
  101. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/.gitignore +0 -0
  102. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/__init__.py +0 -0
  103. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/batch.json +0 -0
  104. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/context.py +0 -0
  105. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/debug_test_class_submission_init.py +0 -0
  106. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/devel_test_ali_ehpc.py +0 -0
  107. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/devel_test_dp_cloud_server.py +0 -0
  108. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/devel_test_lazy_ali_ehpc.py +0 -0
  109. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/devel_test_lsf.py +0 -0
  110. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/devel_test_shell.py +0 -0
  111. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/devel_test_slurm.py +0 -0
  112. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/devel_test_ssh_ali_ehpc.py +0 -0
  113. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/graph.pb +0 -0
  114. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/job.json +0 -0
  115. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine.json +0 -0
  116. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine_ali_ehpc.json +0 -0
  117. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine_center.json +0 -0
  118. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine_diffenert.json +0 -0
  119. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine_dp_cloud_server.json +0 -0
  120. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine_if_cuda_multi_devices.json +0 -0
  121. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine_lazy_local_lsf.json +0 -0
  122. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine_lazy_local_slurm.json +0 -0
  123. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine_lazylocal_shell.json +0 -0
  124. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine_local_shell.json +0 -0
  125. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine_lsf.json +0 -0
  126. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine_slurm.json +0 -0
  127. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine_yarn.json +0 -0
  128. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/resources.json +0 -0
  129. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/submission.json +0 -0
  130. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/task.json +0 -0
  131. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/lsf/context.py +0 -0
  132. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/lsf/test_dispatcher.py +0 -0
  133. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/lsf/test_lsf_local.py +0 -0
  134. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/old/test_dispatcher_utils.py +0 -0
  135. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/old/test_lazy_local_context.py +0 -0
  136. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/old/test_local_context.py +0 -0
  137. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/old/test_local_session.py +0 -0
  138. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/old/test_ssh_context.py +0 -0
  139. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/pbs/context.py +0 -0
  140. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/pbs/test_dispatcher.py +0 -0
  141. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/pbs/test_pbs_local.py +0 -0
  142. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/sample_class.py +0 -0
  143. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/script_gen_json.py +0 -0
  144. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/shell/context.py +0 -0
  145. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/shell/test_dispatcher.py +0 -0
  146. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/shell/test_shell_local.py +0 -0
  147. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/shell/test_shell_ssh.py +0 -0
  148. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/slurm/context.py +0 -0
  149. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/slurm/test_dispatcher.py +0 -0
  150. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/slurm/test_dispatcher_lazy_local.py +0 -0
  151. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/slurm/test_slurm_lazy_local.py +0 -0
  152. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/slurm/test_slurm_local.py +0 -0
  153. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/slurm/test_slurm_ssh.py +0 -0
  154. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/slurm_test.env +0 -0
  155. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_argcheck.py +0 -0
  156. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_class_job.py +0 -0
  157. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_class_machine.py +0 -0
  158. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_class_machine_dispatch.py +0 -0
  159. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_class_submission.py +0 -0
  160. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_class_submission_init.py +0 -0
  161. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_class_task.py +0 -0
  162. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_context_dir/0_md/bct-1/conf.lmp +0 -0
  163. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_context_dir/0_md/bct-1/input.lammps +0 -0
  164. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_context_dir/0_md/bct-1/some_dir/some_file +0 -0
  165. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_context_dir/0_md/bct-2/conf.lmp +0 -0
  166. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_context_dir/0_md/bct-2/input.lammps +0 -0
  167. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_context_dir/0_md/bct-3/conf.lmp +0 -0
  168. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_context_dir/0_md/bct-3/input.lammps +0 -0
  169. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_context_dir/0_md/bct-4/conf.lmp +0 -0
  170. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_context_dir/0_md/bct-4/input.lammps +0 -0
  171. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_context_dir/0_md/dir with space/file with space +0 -0
  172. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_context_dir/0_md/graph.pb +0 -0
  173. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_context_dir/0_md/some_dir/some_file +0 -0
  174. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_group_size.py +0 -0
  175. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_hdfs_context.py +0 -0
  176. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_hdfs_dir/0_md/bct-1/conf.lmp +0 -0
  177. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_hdfs_dir/0_md/bct-1/input.lammps +0 -0
  178. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_hdfs_dir/0_md/bct-2/conf.lmp +0 -0
  179. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_hdfs_dir/0_md/bct-2/input.lammps +0 -0
  180. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_hdfs_dir/0_md/bct-3/conf.lmp +0 -0
  181. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_hdfs_dir/0_md/bct-3/input.lammps +0 -0
  182. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_hdfs_dir/0_md/bct-4/conf.lmp +0 -0
  183. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_hdfs_dir/0_md/bct-4/input.lammps +0 -0
  184. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_hdfs_dir/0_md/graph.pb +0 -0
  185. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_if_cuda_multi_devices/test_dir/test.txt +0 -0
  186. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_import_classes.py +0 -0
  187. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_lazy_local_context.py +0 -0
  188. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_local_context.py +0 -0
  189. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_lsf_dir/0_md/bct-1/conf.lmp +0 -0
  190. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_lsf_dir/0_md/bct-1/input.lammps +0 -0
  191. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_lsf_dir/0_md/bct-2/conf.lmp +0 -0
  192. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_lsf_dir/0_md/bct-2/input.lammps +0 -0
  193. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_lsf_dir/0_md/bct-3/conf.lmp +0 -0
  194. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_lsf_dir/0_md/bct-3/input.lammps +0 -0
  195. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_lsf_dir/0_md/bct-4/conf.lmp +0 -0
  196. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_lsf_dir/0_md/bct-4/input.lammps +0 -0
  197. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_lsf_dir/0_md/graph.pb +0 -0
  198. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_lsf_dir/0_md/submission.json +0 -0
  199. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_lsf_script_generation.py +0 -0
  200. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_pbs_dir/0_md/bct-1/conf.lmp +0 -0
  201. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_pbs_dir/0_md/bct-1/input.lammps +0 -0
  202. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_pbs_dir/0_md/bct-2/conf.lmp +0 -0
  203. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_pbs_dir/0_md/bct-2/input.lammps +0 -0
  204. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_pbs_dir/0_md/bct-3/conf.lmp +0 -0
  205. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_pbs_dir/0_md/bct-3/input.lammps +0 -0
  206. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_pbs_dir/0_md/bct-4/conf.lmp +0 -0
  207. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_pbs_dir/0_md/bct-4/input.lammps +0 -0
  208. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_pbs_dir/0_md/graph.pb +0 -0
  209. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_retry.py +0 -0
  210. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_shell_cuda_multi_devices.py +0 -0
  211. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_shell_trival.py +0 -0
  212. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_shell_trival_dir/fail_dir/mock_fail_task.txt +0 -0
  213. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_shell_trival_dir/parent_dir/dir with space/example.txt +0 -0
  214. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_shell_trival_dir/parent_dir/dir1/example.txt +0 -0
  215. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_shell_trival_dir/parent_dir/dir2/example.txt +0 -0
  216. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_shell_trival_dir/parent_dir/dir3/example.txt +0 -0
  217. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_shell_trival_dir/parent_dir/dir4/example.txt +0 -0
  218. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_shell_trival_dir/parent_dir/graph.pb +0 -0
  219. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_shell_trival_dir/recover_dir/mock_recover_task.txt +0 -0
  220. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_slurm_dir/0_md/bct-1/conf.lmp +0 -0
  221. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_slurm_dir/0_md/bct-1/input.lammps +0 -0
  222. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_slurm_dir/0_md/bct-2/conf.lmp +0 -0
  223. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_slurm_dir/0_md/bct-2/input.lammps +0 -0
  224. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_slurm_dir/0_md/bct-3/conf.lmp +0 -0
  225. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_slurm_dir/0_md/bct-3/input.lammps +0 -0
  226. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_slurm_dir/0_md/bct-4/conf.lmp +0 -0
  227. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_slurm_dir/0_md/bct-4/input.lammps +0 -0
  228. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_slurm_dir/0_md/d3c842c5b9476e48f7145b370cd330372b9293e1.json +0 -0
  229. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_slurm_dir/0_md/graph.pb +0 -0
  230. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_slurm_dir/0_md/submission.json +0 -0
  231. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_slurm_script_generation.py +0 -0
  232. {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_work_path/.gitkeep +0 -0

.github/workflows/pyright.yml
@@ -13,3 +13,5 @@ jobs:
  python-version: '3.11'
  - run: pip install -e .[cloudserver]
  - uses: jakebailey/pyright-action@v1
+ with:
+ version: 1.1.308

.pre-commit-config.yaml
@@ -22,7 +22,7 @@ repos:
  - id: black-jupyter
  - repo: https://github.com/charliermarsh/ruff-pre-commit
  # Ruff version.
- rev: v0.0.260
+ rev: v0.0.275
  hooks:
  - id: ruff
  args: ["--fix"]
@@ -34,6 +34,6 @@ repos:
  args: ["--write"]
  # Python inside docs
  - repo: https://github.com/asottile/blacken-docs
- rev: 1.13.0
+ rev: 1.14.0
  hooks:
  - id: blacken-docs

PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: dpdispatcher
- Version: 0.5.6
+ Version: 0.5.8
  Summary: Generate HPC scheduler systems jobs input scripts, submit these scripts to HPC systems, and poke until they finish
  Author: DeepModeling
  License: GNU LESSER GENERAL PUBLIC LICENSE
@@ -191,15 +191,20 @@ License-File: LICENSE

  # DPDispatcher

- DPDispatcher is a python package used to generate HPC(High Performance Computing) scheduler systems (Slurm/PBS/LSF/dpcloudserver) jobs input scripts and submit these scripts to HPC systems and poke until they finish.
+ [![conda-forge](https://img.shields.io/conda/dn/conda-forge/dpdispatcher?color=red&label=conda-forge&logo=conda-forge)](https://anaconda.org/conda-forge/dpdispatcher)
+ [![pip install](https://img.shields.io/pypi/dm/dpdispatcher?label=pip%20install&logo=pypi)](https://pypi.org/project/dpdispatcher)
+ [![docker pull](https://img.shields.io/docker/pulls/dptechnology/dpdispatcher?logo=docker)](https://hub.docker.com/r/dptechnology/dpdispatcher)
+ [![Documentation Status](https://readthedocs.org/projects/dpdispatcher/badge/)](https://dpdispatcher.readthedocs.io/)
+
+ DPDispatcher is a Python package used to generate HPC (High-Performance Computing) scheduler systems (Slurm/PBS/LSF/Bohrium) jobs input scripts, submit them to HPC systems, and poke until they finish.

- DPDispatcher will monitor (poke) until these jobs finish and download the results files (if these jobs is running on remote systems connected by SSH).
+ DPDispatcher will monitor (poke) until these jobs finish and download the results files (if these jobs are running on remote systems connected by SSH).

  For more information, check the [documentation](https://dpdispatcher.readthedocs.io/).

  ## Installation

- DPDispatcher can installed by `pip`:
+ DPDispatcher can be installed by `pip`:

  ```bash
  pip install dpdispatcher
@@ -211,5 +216,9 @@ See [Getting Started](https://dpdispatcher.readthedocs.io/en/latest/getting-started.html) for usage.

  ## Contributing

- DPDispatcher is maintained by Deep Modeling's developers and welcome other people.
+ DPDispatcher is maintained by Deep Modeling's developers and welcomes other people.
  See [Contributing Guide](CONTRIBUTING.md) to become a contributor! 🤓
+
+ ## References
+
+ DPDispatcher is derivated from the [DP-GEN](https://github.com/deepmodeling/dpgen) package. To mention DPDispatcher in a scholarly publication, please read Section 3.3 in the [DP-GEN paper](https://doi.org/10.1016/j.cpc.2020.107206).

README.md
@@ -0,0 +1,33 @@
+ # DPDispatcher
+
+ [![conda-forge](https://img.shields.io/conda/dn/conda-forge/dpdispatcher?color=red&label=conda-forge&logo=conda-forge)](https://anaconda.org/conda-forge/dpdispatcher)
+ [![pip install](https://img.shields.io/pypi/dm/dpdispatcher?label=pip%20install&logo=pypi)](https://pypi.org/project/dpdispatcher)
+ [![docker pull](https://img.shields.io/docker/pulls/dptechnology/dpdispatcher?logo=docker)](https://hub.docker.com/r/dptechnology/dpdispatcher)
+ [![Documentation Status](https://readthedocs.org/projects/dpdispatcher/badge/)](https://dpdispatcher.readthedocs.io/)
+
+ DPDispatcher is a Python package used to generate HPC (High-Performance Computing) scheduler systems (Slurm/PBS/LSF/Bohrium) jobs input scripts, submit them to HPC systems, and poke until they finish.
+
+ DPDispatcher will monitor (poke) until these jobs finish and download the results files (if these jobs are running on remote systems connected by SSH).
+
+ For more information, check the [documentation](https://dpdispatcher.readthedocs.io/).
+
+ ## Installation
+
+ DPDispatcher can be installed by `pip`:
+
+ ```bash
+ pip install dpdispatcher
+ ```
+
+ ## Usage
+
+ See [Getting Started](https://dpdispatcher.readthedocs.io/en/latest/getting-started.html) for usage.
+
+ ## Contributing
+
+ DPDispatcher is maintained by Deep Modeling's developers and welcomes other people.
+ See [Contributing Guide](CONTRIBUTING.md) to become a contributor! 🤓
+
+ ## References
+
+ DPDispatcher is derivated from the [DP-GEN](https://github.com/deepmodeling/dpgen) package. To mention DPDispatcher in a scholarly publication, please read Section 3.3 in the [DP-GEN paper](https://doi.org/10.1016/j.cpc.2020.107206).
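
The README above stops at installation. As a quick orientation (not part of the diff itself), a minimal sketch of the submit-and-poke workflow the description refers to might look like the following, using the `Machine`, `Resources`, `Task`, and `Submission` classes exported by `dpdispatcher/__init__.py`; the batch type, paths, resource numbers, and command are illustrative placeholders, not values from this release.

```python
# Minimal sketch of the workflow described above, NOT taken from this diff.
# Batch type, paths, resource numbers, and the command are illustrative placeholders.
from dpdispatcher import Machine, Resources, Submission, Task

# Where and how jobs run: a local Shell batch is assumed here for simplicity.
machine = Machine.load_from_dict(
    {
        "batch_type": "Shell",
        "context_type": "LocalContext",
        "local_root": "./",
        "remote_root": "/tmp/dpdispatcher_work",
    }
)

# Per-job resources; the values here are placeholders.
resources = Resources.load_from_dict(
    {
        "number_node": 1,
        "cpu_per_node": 4,
        "gpu_per_node": 0,
        "queue_name": "",
        "group_size": 1,
    }
)

# One task = one command run inside one working directory.
task = Task(
    command="echo hello",
    task_work_path="./",
    forward_files=[],
    backward_files=[],
)

# Generate the job script, submit it, and poke until it finishes.
submission = Submission(
    work_base="./",
    machine=machine,
    resources=resources,
    task_list=[task],
)
submission.run_submission()
```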

doc/batch.md
@@ -21,9 +21,9 @@ To avoid running multiple jobs at the same time, one could set {dargs:argument}`
  One needs to make sure slurm has been setup in the remote server and the related environment is activated.

  When `SlurmJobArray` is used, dpdispatcher submits Slurm jobs with [job arrays](https://slurm.schedmd.com/job_array.html).
- In this way, a dpdispatcher {class}`task <dpdispatcher.submission.Task>` maps to a Slurm job and a dpdispatcher {class}`job <dpdispatcher.submission.Job>` maps to a Slurm job array.
+ In this way, several dpdispatcher {class}`task <dpdispatcher.submission.Task>`s map to a Slurm job and a dpdispatcher {class}`job <dpdispatcher.submission.Job>` maps to a Slurm job array.
  Millions of Slurm jobs can be submitted quickly and Slurm can execute all Slurm jobs at the same time.
- One can use {dargs:argument}`group_size <resources/group_size>` to control how many Slurm jobs are contained in a Slurm job array.
+ One can use {dargs:argument}`group_size <resources/group_size>` and {dargs:argument}`slurm_job_size <resources[SlurmJobArray]/kwargs/slurm_job_size>` to control how many Slurm jobs are contained in a Slurm job array.

  ## OpenPBS or PBSPro

@@ -62,3 +62,11 @@ Read Bohrium documentation for details.

  `DistributedShell` is used to submit yarn jobs.
  Read [Support DPDispatcher on Yarn](dpdispatcher_on_yarn.md) for details.
+
+ ## Fugaku
+
+ {dargs:argument}`batch_type <resources/batch_type>`: `Fugaku`
+
+ [Fujitsu cloud service](https://doc.cloud.global.fujitsu.com/lib/common/jp/hpc-user-manual/) is a job scheduling system used by Fujitsu's HPCs such as Fugaku, ITO and K computer. It should be noted that although the same job scheduling system is used, there are some differences in the details, Fagaku class cannot be directly used for other HPCs.
+
+ Read Fujitsu cloud service documentation for details.
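
The new Fugaku section above only names the batch type. As an illustration only (the contents of the new `tests/jsons/machine_fugaku.json` are not shown in this diff), a machine/resources pair for the `Fugaku` batch might look like the sketch below; the field names follow what `Fugaku.gen_script_header` in the new `dpdispatcher/fugaku.py` reads (`queue_name` becomes `rscgrp`, plus `number_node` and `cpu_per_node`), while the host, paths, queue, and core count are assumed placeholders.

```python
# Hypothetical example only: field names follow the new dpdispatcher/fugaku.py
# (Fugaku.gen_script_header), but the host, paths, queue, and core count are
# assumed placeholders, not values taken from this release.
from dpdispatcher import Machine, Resources

machine = Machine.load_from_dict(
    {
        "batch_type": "Fugaku",
        "context_type": "SSHContext",  # assumption: submitting over SSH
        "local_root": "./",
        "remote_root": "/home/user/dpdispatcher_work",
        "remote_profile": {
            "hostname": "login.example.jp",  # placeholder login node
            "username": "user",
            # authentication details (key file or password) omitted in this sketch
        },
    }
)

resources = Resources.load_from_dict(
    {
        "number_node": 1,       # rendered as '#PJM -L "node=1"'
        "cpu_per_node": 48,     # rendered as '#PJM --mpi "max-proc-per-node=48"'
        "gpu_per_node": 0,
        "queue_name": "small",  # rendered as '#PJM -L "rscgrp=small"'
        "group_size": 1,
    }
)
```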

dpdispatcher/__init__.py
@@ -43,6 +43,7 @@ except ImportError:
  from .distributed_shell import DistributedShell
  from .dp_cloud_server import DpCloudServer, Lebesgue
  from .dp_cloud_server_context import DpCloudServerContext, LebesgueContext
+ from .fugaku import Fugaku
  from .hdfs_context import HDFSContext
  from .lazy_local_context import LazyLocalContext
  from .local_context import LocalContext
@@ -85,6 +86,7 @@ __all__ = [
  "PBS",
  "Shell",
  "Slurm",
+ "Fugaku",
  "SSHContext",
  "Submission",
  "Task",

dpdispatcher/_version.py
@@ -1,4 +1,4 @@
  # file generated by setuptools_scm
  # don't change, don't track in version control
- __version__ = version = '0.5.6'
- __version_tuple__ = version_tuple = (0, 5, 6)
+ __version__ = version = '0.5.8'
+ __version_tuple__ = version_tuple = (0, 5, 8)

dpdispatcher/base_context.py
@@ -70,9 +70,6 @@ class BaseContext(metaclass=ABCMeta):
  def read_file(self, fname):
  raise NotImplementedError("abstract method")

- def kill(self, proc):
- raise NotImplementedError("abstract method")
-
  def check_finish(self, proc):
  raise NotImplementedError("abstract method")


dpdispatcher/distributed_shell.py
@@ -136,17 +136,16 @@ class DistributedShell(Machine):

  resources = job.resources
  submit_command = (
- "hadoop jar %s/hadoop-yarn-applications-distributedshell-*.jar "
+ "hadoop jar {}/hadoop-yarn-applications-distributedshell-*.jar "
  "org.apache.hadoop.yarn.applications.distributedshell.Client "
- "-jar %s/hadoop-yarn-applications-distributedshell-*.jar "
- '-queue %s -appname "distributedshell_dpgen_%s" '
+ "-jar {}/hadoop-yarn-applications-distributedshell-*.jar "
+ '-queue {} -appname "distributedshell_dpgen_{}" '
  "-shell_env YARN_CONTAINER_RUNTIME_TYPE=docker "
- "-shell_env YARN_CONTAINER_RUNTIME_DOCKER_IMAGE=%s "
+ "-shell_env YARN_CONTAINER_RUNTIME_DOCKER_IMAGE={} "
  "-shell_env ENV_DOCKER_CONTAINER_SHM_SIZE='600m' "
  "-master_memory 1024 -master_vcores 2 -num_containers 1 "
- "-container_resources memory-mb=%s,vcores=%s "
- "-shell_script /tmp/%s"
- % (
+ "-container_resources memory-mb={},vcores={} "
+ "-shell_script /tmp/{}".format(
  resources.kwargs.get("yarn_path", ""),
  resources.kwargs.get("yarn_path", ""),
  resources.queue_name,

dpdispatcher/dp_cloud_server.py
@@ -106,7 +106,9 @@ class Bohrium(Machine):

  input_data = self.input_data.copy()

- input_data["job_resources"] = job_resources
+ if not input_data.get("job_resources"):
+ input_data["job_resources"] = []
+ input_data["job_resources"].append(job_resources)
  input_data["command"] = f"bash {job.script_file_name}"
  if not input_data.get("backward_files"):
  input_data["backward_files"] = self._gen_backward_files_list(job)

dpdispatcher/dp_cloud_server_context.py
@@ -270,9 +270,6 @@ class BohriumContext(BaseContext):
  # retcode = cmd_pipes['stdout'].channel.recv_exit_status()
  # return retcode, cmd_pipes['stdout'], cmd_pipes['stderr']

- def kill(self, cmd_pipes):
- pass
-
  @classmethod
  def machine_subfields(cls) -> List[Argument]:
  """Generate the machine subfields.

dpdispatcher/dpcloudserver/client.py
@@ -198,7 +198,7 @@ class Client:
  ):
  post_data = {
  "job_type": job_type,
- "oss_path": [oss_path],
+ "oss_path": oss_path,
  }
  if program_id is not None:
  post_data["project_id"] = program_id

dpdispatcher/fugaku.py
@@ -0,0 +1,94 @@
+ import shlex
+
+ from dpdispatcher import dlog
+ from dpdispatcher.JobStatus import JobStatus
+ from dpdispatcher.machine import Machine
+
+ fugaku_script_header_template = """\
+ {queue_name_line}
+ {fugaku_node_number_line}
+ {fugaku_ntasks_per_node_line}
+ """
+
+
+ class Fugaku(Machine):
+ def gen_script(self, job):
+ fugaku_script = super().gen_script(job)
+ return fugaku_script
+
+ def gen_script_header(self, job):
+ resources = job.resources
+ fugaku_script_header_dict = {}
+ fugaku_script_header_dict[
+ "fugaku_node_number_line"
+ ] = f'#PJM -L "node={resources.number_node}" '
+ fugaku_script_header_dict[
+ "fugaku_ntasks_per_node_line"
+ ] = '#PJM --mpi "max-proc-per-node={cpu_per_node}"'.format(
+ cpu_per_node=resources.cpu_per_node
+ )
+ fugaku_script_header_dict[
+ "queue_name_line"
+ ] = f'#PJM -L "rscgrp={resources.queue_name}"'
+ fugaku_script_header = fugaku_script_header_template.format(
+ **fugaku_script_header_dict
+ )
+ return fugaku_script_header
+
+ def do_submit(self, job):
+ script_file_name = job.script_file_name
+ script_str = self.gen_script(job)
+ job_id_name = job.job_hash + "_job_id"
+ # script_str = self.sub_script(job_dirs, cmd, args=args, resources=resources, outlog=outlog, errlog=errlog)
+ self.context.write_file(fname=script_file_name, write_str=script_str)
+ # self.context.write_file(fname=os.path.join(self.context.submission.work_base, script_file_name), write_str=script_str)
+ # script_file_dir = os.path.join(self.context.submission.work_base)
+ script_file_dir = self.context.remote_root
+ # stdin, stdout, stderr = self.context.block_checkcall('cd %s && %s %s' % (self.context.remote_root, 'pjsub', script_file_name))
+
+ stdin, stdout, stderr = self.context.block_checkcall(
+ "cd {} && {} {}".format(
+ shlex.quote(script_file_dir), "pjsub", shlex.quote(script_file_name)
+ )
+ )
+ subret = stdout.readlines()
+ job_id = subret[0].split()[5]
+ self.context.write_file(job_id_name, job_id)
+ return job_id
+
+ def default_resources(self, resources):
+ pass
+
+ def check_status(self, job):
+ job_id = job.job_id
+ if job_id == "":
+ return JobStatus.unsubmitted
+ ret, stdin, stdout, stderr = self.context.block_call("pjstat " + job_id)
+ err_str = stderr.read().decode("utf-8")
+ try:
+ status_line = stdout.read().decode("utf-8").split("\n")[-2]
+ # pjstat only retrun 0 if the job is not waiting or running
+ except Exception:
+ ret, stdin, stdout, stderr = self.context.block_call("pjstat -H " + job_id)
+ status_line = stdout.read().decode("utf-8").split("\n")[-2]
+ status_word = status_line.split()[3]
+ if status_word in ["EXT", "CCL", "ERR"]:
+ if self.check_finish_tag(job):
+ dlog.info(f"job: {job.job_hash} {job.job_id} finished")
+ return JobStatus.finished
+ else:
+ return JobStatus.terminated
+ else:
+ return JobStatus.unknown
+ status_word = status_line.split()[3]
+ # dlog.info (status_word)
+ if status_word in ["QUE", "HLD", "RNA", "SPD"]:
+ return JobStatus.waiting
+ elif status_word in ["RUN", "RNE"]:
+ return JobStatus.running
+ else:
+ return JobStatus.unknown
+
+ def check_finish_tag(self, job):
+ job_tag_finished = job.job_hash + "_job_tag_finished"
+ return self.context.check_file_exists(job_tag_finished)

dpdispatcher/hdfs_context.py
@@ -247,6 +247,3 @@ class HDFSContext(BaseContext):

  def read_file(self, fname):
  return HDFS.read_hdfs_file(os.path.join(self.remote_root, fname))
-
- def kill(self, job_id):
- pass

dpdispatcher/lazy_local_context.py
@@ -1,5 +1,4 @@
  import os
- import signal
  import subprocess as sp

  from dpdispatcher.base_context import BaseContext
@@ -167,9 +166,6 @@ class LazyLocalContext(BaseContext):
  )
  return proc

- def kill(self, job_id):
- os.kill(job_id, signal.SIGTERM)
-
  def check_finish(self, proc):
  return proc.poll() is not None


dpdispatcher/local_context.py
@@ -1,7 +1,6 @@
  import hashlib
  import os
  import shutil
- import signal
  import subprocess as sp
  from glob import glob
  from subprocess import TimeoutExpired
@@ -291,9 +290,6 @@ class LocalContext(BaseContext):
  )
  return proc

- def kill(self, job_id):
- os.kill(job_id, signal.SIGTERM)
-
  def check_finish(self, proc):
  return proc.poll() is not None


dpdispatcher/lsf.py
@@ -83,8 +83,7 @@ class LSF(Machine):

  try:
  stdin, stdout, stderr = self.context.block_checkcall(
- "cd %s && %s %s"
- % (
+ "cd {} && {} {}".format(
  shlex.quote(self.context.remote_root),
  "bsub < ",
  shlex.quote(script_file_name),
@@ -211,3 +210,14 @@ class LSF(Machine):
  doc="Extra arguments.",
  )
  ]
+
+ def kill(self, job):
+ """Kill the job.
+
+ Parameters
+ ----------
+ job : Job
+ job
+ """
+ job_id = job.job_id
+ ret, stdin, stdout, stderr = self.context.block_call("bkill " + str(job_id))

dpdispatcher/machine.py
@@ -377,8 +377,12 @@ class Machine(metaclass=ABCMeta):
  machine_args = [
  Argument("batch_type", str, optional=False, doc=doc_batch_type),
  # TODO: add default to local_root and remote_root after refactor the code
- Argument("local_root", [str, None], optional=False, doc=doc_local_root),
- Argument("remote_root", [str, None], optional=True, doc=doc_remote_root),
+ Argument(
+ "local_root", [str, type(None)], optional=False, doc=doc_local_root
+ ),
+ Argument(
+ "remote_root", [str, type(None)], optional=True, doc=doc_remote_root
+ ),
  Argument(
  "clean_asynchronously",
  bool,
@@ -439,3 +443,15 @@ class Machine(metaclass=ABCMeta):
  "kwargs", dict, optional=True, doc="This field is empty for this batch."
  )
  ]
+
+ def kill(self, job):
+ """Kill the job.
+
+ If not implemented, pass and let the user manually kill it.
+
+ Parameters
+ ----------
+ job : Job
+ job
+ """
+ dlog.warning("Job %s should be manually killed" % job.job_id)

dpdispatcher/pbs.py
@@ -46,8 +46,9 @@ class PBS(Machine):
  script_file_dir = self.context.remote_root
  # stdin, stdout, stderr = self.context.block_checkcall('cd %s && %s %s' % (self.context.remote_root, 'qsub', script_file_name))
  stdin, stdout, stderr = self.context.block_checkcall(
- "cd %s && %s %s"
- % (shlex.quote(script_file_dir), "qsub", shlex.quote(script_file_name))
+ "cd {} && {} {}".format(
+ shlex.quote(script_file_dir), "qsub", shlex.quote(script_file_name)
+ )
  )
  subret = stdout.readlines()
  job_id = subret[0].split()[0]
@@ -94,6 +95,17 @@ class PBS(Machine):
  job_tag_finished = job.job_hash + "_job_tag_finished"
  return self.context.check_file_exists(job_tag_finished)

+ def kill(self, job):
+ """Kill the job.
+
+ Parameters
+ ----------
+ job : Job
+ job
+ """
+ job_id = job.job_id
+ ret, stdin, stdout, stderr = self.context.block_call("qdel " + str(job_id))
+

  class Torque(PBS):
  def check_status(self, job):

dpdispatcher/shell.py
@@ -25,8 +25,7 @@ class Shell(Machine):
  output_name = job.job_hash + ".out"
  self.context.write_file(fname=script_file_name, write_str=script_str)
  ret, stdin, stdout, stderr = self.context.block_call(
- "cd %s && { nohup bash %s 1>>%s 2>>%s & } && echo $!"
- % (
+ "cd {} && {{ nohup bash {} 1>>{} 2>>{} & }} && echo $!".format(
  shlex.quote(self.context.remote_root),
  script_file_name,
  output_name,
@@ -66,7 +65,7 @@ class Shell(Machine):

  # mark defunct process as terminated
  ret, stdin, stdout, stderr = self.context.block_call(
- f"if ps -p {job_id} > /dev/null && ! (ps -p {job_id} | grep defunct >/dev/null) ; then echo 1; fi"
+ f"if ps -p {job_id} > /dev/null && ! (ps -o command -p {job_id} | grep defunct >/dev/null) ; then echo 1; fi"
  )
  if ret != 0:
  err_str = stderr.read().decode("utf-8")
@@ -101,3 +100,15 @@ class Shell(Machine):
  job_tag_finished = job.job_hash + "_job_tag_finished"
  # print('job finished: ',job.job_id, job_tag_finished)
  return self.context.check_file_exists(job_tag_finished)
+
+ def kill(self, job):
+ """Kill the job.
+
+ Parameters
+ ----------
+ job : Job
+ job
+ """
+ job_id = job.job_id
+ # 9 means exit, cannot be blocked
+ ret, stdin, stdout, stderr = self.context.block_call("kill -9 " + str(job_id))

dpdispatcher/slurm.py
@@ -1,3 +1,4 @@
+ import math
  import pathlib
  import shlex
  from typing import List
@@ -45,9 +46,12 @@ class Slurm(Machine):
  )
  else:
  script_header_dict["slurm_number_gpu_line"] = custom_gpu_line
- script_header_dict[
- "slurm_partition_line"
- ] = f"#SBATCH --partition {resources.queue_name}"
+ if resources.queue_name != "":
+ script_header_dict[
+ "slurm_partition_line"
+ ] = f"#SBATCH --partition {resources.queue_name}"
+ else:
+ script_header_dict["slurm_partition_line"] = ""
  slurm_script_header = slurm_script_header_template.format(**script_header_dict)
  return slurm_script_header

@@ -60,8 +64,7 @@ class Slurm(Machine):
  self.context.write_file(fname=script_file_name, write_str=script_str)
  # self.context.write_file(fname=os.path.join(self.context.submission.work_base, script_file_name), write_str=script_str)
  ret, stdin, stdout, stderr = self.context.block_call(
- "cd %s && %s %s"
- % (
+ "cd {} && {} {}".format(
  shlex.quote(self.context.remote_root),
  "sbatch",
  shlex.quote(script_file_name),
@@ -78,7 +81,12 @@ class Slurm(Machine):
  "Get error code %d in submitting through ssh with job: %s . message: %s"
  % (ret, job.job_hash, err_str)
  )
- elif "Job violates accounting/QOS policy" in err_str:
+ elif (
+ "Job violates accounting/QOS policy" in err_str
+ # the number of jobs exceeds DEFAULT_MAX_JOB_COUNT (by default 10000)
+ or "Slurm temporarily unable to accept job, sleeping and retrying"
+ in err_str
+ ):
  # job number exceeds, skip the submitting
  return ""
  raise RuntimeError(
@@ -115,6 +123,7 @@ class Slurm(Machine):
  elif (
  "Socket timed out on send/recv operation" in err_str
  or "Unable to contact slurm controller" in err_str
+ or "Invalid user for SlurmUser" in err_str
  ):
  # retry 3 times
  raise RetrySignal(
@@ -194,30 +203,47 @@ class Slurm(Machine):
  )
  ]

+ def kill(self, job):
+ """Kill the job.
+
+ Parameters
+ ----------
+ job : Job
+ job
+ """
+ job_id = job.job_id
+ # -Q Do not report an error if the specified job is already completed.
+ ret, stdin, stdout, stderr = self.context.block_call(
+ "scancel -Q " + str(job_id)
+ )
+ # we do not need to stop here if scancel failed; just continue
+

  class SlurmJobArray(Slurm):
  """Slurm with job array enabled for multiple tasks in a job."""

  def gen_script_header(self, job):
+ slurm_job_size = job.resources.kwargs.get("slurm_job_size", 1)
  if job.fail_count > 0:
  # resubmit jobs, check if some of tasks have been finished
- job_array = []
+ job_array = set()
  for ii, task in enumerate(job.job_task_list):
  task_tag_finished = (
  pathlib.PurePath(task.task_work_path)
  / (task.task_hash + "_task_tag_finished")
  ).as_posix()
  if not self.context.check_file_exists(task_tag_finished):
- job_array.append(ii)
+ job_array.add(ii // slurm_job_size)
  return super().gen_script_header(job) + "\n#SBATCH --array=%s" % (
  ",".join(map(str, job_array))
  )
  return super().gen_script_header(job) + "\n#SBATCH --array=0-%d" % (
- len(job.job_task_list) - 1
+ math.ceil(len(job.job_task_list) / slurm_job_size) - 1
  )

  def gen_script_command(self, job):
  resources = job.resources
+ slurm_job_size = resources.kwargs.get("slurm_job_size", 1)
  # SLURM_ARRAY_TASK_ID: 0 ~ n_jobs-1
  script_command = "case $SLURM_ARRAY_TASK_ID in\n"
  for ii, task in enumerate(job.job_task_list):
@@ -243,10 +269,16 @@ class SlurmJobArray(Slurm):
  task_tag_finished=task_tag_finished,
  log_err_part=log_err_part,
  )
- script_command += f"{ii})\n"
+ if ii % slurm_job_size == 0:
+ script_command += f"{ii // slurm_job_size})\n"
  script_command += single_script_command
  script_command += self.gen_script_wait(resources=resources)
- script_command += "\n;;\n"
+ script_command += "\n"
+ if (
+ ii % slurm_job_size == slurm_job_size - 1
+ or ii == len(job.job_task_list) - 1
+ ):
+ script_command += ";;\n"
  script_command += "*)\nexit 1\n;;\nesac\n"
  return script_command

@@ -337,9 +369,30 @@ class SlurmJobArray(Slurm):
  def check_finish_tag(self, job):
  results = []
  for task in job.job_task_list:
- task_tag_finished = (
- pathlib.PurePath(task.task_work_path)
- / (task.task_hash + "_task_tag_finished")
- ).as_posix()
- results.append(self.context.check_file_exists(task_tag_finished))
+ task.get_task_state(self.context)
+ results.append(task.task_state == JobStatus.finished)
  return all(results)
+
+ @classmethod
+ def resources_subfields(cls) -> List[Argument]:
+ """Generate the resources subfields.
+
+ Returns
+ -------
+ list[Argument]
+ resources subfields
+ """
+ doc_slurm_job_size = "Number of tasks in a Slurm job"
+ arg = super().resources_subfields()[0]
+ arg.extend_subfields(
+ [
+ Argument(
+ "slurm_job_size",
+ int,
+ optional=True,
+ default=1,
+ doc=doc_slurm_job_size,
+ ),
+ ]
+ )
+ return [arg]
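
Tying the slurm.py changes back to the doc/batch.md note above: `SlurmJobArray` now reads an optional `slurm_job_size` kwarg (default 1) that packs several dpdispatcher tasks into one Slurm array element. A hedged sketch of a resources dictionary using it follows; all values are illustrative.

```python
# Illustrative only: with batch_type "SlurmJobArray", group_size tasks form one
# dpdispatcher job, and slurm_job_size of them share one Slurm array element.
from dpdispatcher import Resources

resources = Resources.load_from_dict(
    {
        "number_node": 1,
        "cpu_per_node": 8,
        "gpu_per_node": 0,
        "queue_name": "normal",            # placeholder partition
        "group_size": 100,                 # 100 tasks per dpdispatcher job
        "kwargs": {"slurm_job_size": 10},  # 10 tasks per Slurm array element
    }
)
# For a job holding 100 tasks, the generated header becomes "#SBATCH --array=0-9"
# (ceil(100 / 10) - 1 = 9) instead of "--array=0-99" with the default size of 1.
```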