dpdispatcher 0.5.6__tar.gz → 0.5.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/.github/workflows/pyright.yml +2 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/.pre-commit-config.yaml +2 -2
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/PKG-INFO +14 -5
- dpdispatcher-0.5.8/README.md +33 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/batch.md +10 -2
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/__init__.py +2 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/_version.py +2 -2
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/base_context.py +0 -3
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/distributed_shell.py +6 -7
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/dp_cloud_server.py +3 -1
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/dp_cloud_server_context.py +0 -3
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/dpcloudserver/client.py +1 -1
- dpdispatcher-0.5.8/dpdispatcher/fugaku.py +94 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/hdfs_context.py +0 -3
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/lazy_local_context.py +0 -4
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/local_context.py +0 -4
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/lsf.py +12 -2
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/machine.py +18 -2
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/pbs.py +14 -2
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/shell.py +14 -3
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/slurm.py +69 -16
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/ssh_context.py +21 -17
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/submission.py +158 -41
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher.egg-info/PKG-INFO +14 -5
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher.egg-info/SOURCES.txt +4 -0
- dpdispatcher-0.5.8/tests/jsons/machine_fugaku.json +24 -0
- dpdispatcher-0.5.8/tests/jsons/machine_local_fugaku.json +18 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_class_resources.py +3 -0
- dpdispatcher-0.5.8/tests/test_run_submission.py +213 -0
- dpdispatcher-0.5.6/tests/test_run_submission.py → dpdispatcher-0.5.8/tests/test_run_submission_ratio_unfinished.py +16 -10
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_ssh_context.py +48 -0
- dpdispatcher-0.5.6/README.md +0 -24
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/.github/workflows/ci-docker.yml +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/.github/workflows/machines.yml +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/.github/workflows/mirror_gitee.yml +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/.github/workflows/publish_conda.yml +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/.github/workflows/release.yml +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/.github/workflows/test.yml +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/.gitignore +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/CONTRIBUTING.md +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/Dockerfile +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/LICENSE +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/LICENSE +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/README.md +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/pbs/docker-compose.yml +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/pbs/start-pbs.sh +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/pbs.sh +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/slurm/docker-compose.yml +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/slurm/register_cluster.sh +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/slurm/start-slurm.sh +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/slurm.sh +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/ssh/docker-compose.yml +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/ssh/start-ssh.sh +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/ssh.sh +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/ci/ssh_rsync.sh +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/codecov.yml +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/conda/conda_build_config.yaml +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/conda/meta.yaml +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/.gitignore +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/Makefile +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/conf.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/context.md +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/credits.rst +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/dpdispatcher_on_yarn.md +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/examples/expanse.md +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/examples/g16.md +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/examples/shell.md +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/getting-started.md +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/index.rst +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/install.md +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/machine.rst +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/make.bat +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/requirements.txt +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/resources.rst +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/task.rst +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/JobStatus.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/arginfo.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/dpcloudserver/__init__.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/dpcloudserver/config.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/dpcloudserver/retcode.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/dpcloudserver/temp_test.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/dpcloudserver/zip_file.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/dpdisp.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/hdfs_cli.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/utils.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher.egg-info/dependency_links.txt +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher.egg-info/entry_points.txt +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher.egg-info/requires.txt +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher.egg-info/top_level.txt +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/examples/machine/expanse.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/examples/machine/lazy_local.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/examples/machine/mandu.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/examples/resources/expanse_cpu.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/examples/resources/mandu.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/examples/task/deepmd-kit.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/examples/task/g16.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/pyproject.toml +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/scripts/script_gen_dargs_docs.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/scripts/script_gen_dargs_json.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/setup.cfg +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/.gitignore +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/__init__.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/batch.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/context.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/debug_test_class_submission_init.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/devel_test_ali_ehpc.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/devel_test_dp_cloud_server.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/devel_test_lazy_ali_ehpc.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/devel_test_lsf.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/devel_test_shell.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/devel_test_slurm.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/devel_test_ssh_ali_ehpc.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/graph.pb +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/job.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine_ali_ehpc.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine_center.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine_diffenert.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine_dp_cloud_server.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine_if_cuda_multi_devices.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine_lazy_local_lsf.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine_lazy_local_slurm.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine_lazylocal_shell.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine_local_shell.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine_lsf.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine_slurm.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/machine_yarn.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/resources.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/submission.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/jsons/task.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/lsf/context.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/lsf/test_dispatcher.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/lsf/test_lsf_local.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/old/test_dispatcher_utils.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/old/test_lazy_local_context.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/old/test_local_context.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/old/test_local_session.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/old/test_ssh_context.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/pbs/context.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/pbs/test_dispatcher.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/pbs/test_pbs_local.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/sample_class.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/script_gen_json.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/shell/context.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/shell/test_dispatcher.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/shell/test_shell_local.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/shell/test_shell_ssh.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/slurm/context.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/slurm/test_dispatcher.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/slurm/test_dispatcher_lazy_local.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/slurm/test_slurm_lazy_local.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/slurm/test_slurm_local.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/slurm/test_slurm_ssh.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/slurm_test.env +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_argcheck.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_class_job.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_class_machine.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_class_machine_dispatch.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_class_submission.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_class_submission_init.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_class_task.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_context_dir/0_md/bct-1/conf.lmp +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_context_dir/0_md/bct-1/input.lammps +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_context_dir/0_md/bct-1/some_dir/some_file +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_context_dir/0_md/bct-2/conf.lmp +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_context_dir/0_md/bct-2/input.lammps +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_context_dir/0_md/bct-3/conf.lmp +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_context_dir/0_md/bct-3/input.lammps +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_context_dir/0_md/bct-4/conf.lmp +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_context_dir/0_md/bct-4/input.lammps +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_context_dir/0_md/dir with space/file with space +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_context_dir/0_md/graph.pb +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_context_dir/0_md/some_dir/some_file +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_group_size.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_hdfs_context.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_hdfs_dir/0_md/bct-1/conf.lmp +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_hdfs_dir/0_md/bct-1/input.lammps +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_hdfs_dir/0_md/bct-2/conf.lmp +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_hdfs_dir/0_md/bct-2/input.lammps +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_hdfs_dir/0_md/bct-3/conf.lmp +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_hdfs_dir/0_md/bct-3/input.lammps +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_hdfs_dir/0_md/bct-4/conf.lmp +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_hdfs_dir/0_md/bct-4/input.lammps +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_hdfs_dir/0_md/graph.pb +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_if_cuda_multi_devices/test_dir/test.txt +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_import_classes.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_lazy_local_context.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_local_context.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_lsf_dir/0_md/bct-1/conf.lmp +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_lsf_dir/0_md/bct-1/input.lammps +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_lsf_dir/0_md/bct-2/conf.lmp +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_lsf_dir/0_md/bct-2/input.lammps +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_lsf_dir/0_md/bct-3/conf.lmp +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_lsf_dir/0_md/bct-3/input.lammps +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_lsf_dir/0_md/bct-4/conf.lmp +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_lsf_dir/0_md/bct-4/input.lammps +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_lsf_dir/0_md/graph.pb +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_lsf_dir/0_md/submission.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_lsf_script_generation.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_pbs_dir/0_md/bct-1/conf.lmp +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_pbs_dir/0_md/bct-1/input.lammps +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_pbs_dir/0_md/bct-2/conf.lmp +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_pbs_dir/0_md/bct-2/input.lammps +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_pbs_dir/0_md/bct-3/conf.lmp +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_pbs_dir/0_md/bct-3/input.lammps +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_pbs_dir/0_md/bct-4/conf.lmp +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_pbs_dir/0_md/bct-4/input.lammps +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_pbs_dir/0_md/graph.pb +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_retry.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_shell_cuda_multi_devices.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_shell_trival.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_shell_trival_dir/fail_dir/mock_fail_task.txt +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_shell_trival_dir/parent_dir/dir with space/example.txt +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_shell_trival_dir/parent_dir/dir1/example.txt +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_shell_trival_dir/parent_dir/dir2/example.txt +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_shell_trival_dir/parent_dir/dir3/example.txt +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_shell_trival_dir/parent_dir/dir4/example.txt +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_shell_trival_dir/parent_dir/graph.pb +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_shell_trival_dir/recover_dir/mock_recover_task.txt +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_slurm_dir/0_md/bct-1/conf.lmp +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_slurm_dir/0_md/bct-1/input.lammps +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_slurm_dir/0_md/bct-2/conf.lmp +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_slurm_dir/0_md/bct-2/input.lammps +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_slurm_dir/0_md/bct-3/conf.lmp +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_slurm_dir/0_md/bct-3/input.lammps +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_slurm_dir/0_md/bct-4/conf.lmp +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_slurm_dir/0_md/bct-4/input.lammps +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_slurm_dir/0_md/d3c842c5b9476e48f7145b370cd330372b9293e1.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_slurm_dir/0_md/graph.pb +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_slurm_dir/0_md/submission.json +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_slurm_script_generation.py +0 -0
- {dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/tests/test_work_path/.gitkeep +0 -0

{dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/.pre-commit-config.yaml
@@ -22,7 +22,7 @@ repos:
       - id: black-jupyter
   - repo: https://github.com/charliermarsh/ruff-pre-commit
     # Ruff version.
-    rev: v0.0.
+    rev: v0.0.275
     hooks:
       - id: ruff
         args: ["--fix"]
@@ -34,6 +34,6 @@ repos:
         args: ["--write"]
   # Python inside docs
   - repo: https://github.com/asottile/blacken-docs
-    rev: 1.
+    rev: 1.14.0
    hooks:
      - id: blacken-docs

{dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dpdispatcher
-Version: 0.5.
+Version: 0.5.8
 Summary: Generate HPC scheduler systems jobs input scripts, submit these scripts to HPC systems, and poke until they finish
 Author: DeepModeling
 License: GNU LESSER GENERAL PUBLIC LICENSE
@@ -191,15 +191,20 @@ License-File: LICENSE
 
 # DPDispatcher
 
-
+[](https://anaconda.org/conda-forge/dpdispatcher)
+[](https://pypi.org/project/dpdispatcher)
+[](https://hub.docker.com/r/dptechnology/dpdispatcher)
+[](https://dpdispatcher.readthedocs.io/)
+
+DPDispatcher is a Python package used to generate HPC (High-Performance Computing) scheduler systems (Slurm/PBS/LSF/Bohrium) jobs input scripts, submit them to HPC systems, and poke until they finish.
 
-DPDispatcher will monitor (poke) until these jobs finish and download the results files (if these jobs
+DPDispatcher will monitor (poke) until these jobs finish and download the results files (if these jobs are running on remote systems connected by SSH).
 
 For more information, check the [documentation](https://dpdispatcher.readthedocs.io/).
 
 ## Installation
 
-DPDispatcher can installed by `pip`:
+DPDispatcher can be installed by `pip`:
 
 ```bash
 pip install dpdispatcher
@@ -211,5 +216,9 @@ See [Getting Started](https://dpdispatcher.readthedocs.io/en/latest/getting-started.html) for usage.
 
 ## Contributing
 
-DPDispatcher is maintained by Deep Modeling's developers and
+DPDispatcher is maintained by Deep Modeling's developers and welcomes other people.
 See [Contributing Guide](CONTRIBUTING.md) to become a contributor! 🤓
+
+## References
+
+DPDispatcher is derivated from the [DP-GEN](https://github.com/deepmodeling/dpgen) package. To mention DPDispatcher in a scholarly publication, please read Section 3.3 in the [DP-GEN paper](https://doi.org/10.1016/j.cpc.2020.107206).

dpdispatcher-0.5.8/README.md
@@ -0,0 +1,33 @@
+# DPDispatcher
+
+[](https://anaconda.org/conda-forge/dpdispatcher)
+[](https://pypi.org/project/dpdispatcher)
+[](https://hub.docker.com/r/dptechnology/dpdispatcher)
+[](https://dpdispatcher.readthedocs.io/)
+
+DPDispatcher is a Python package used to generate HPC (High-Performance Computing) scheduler systems (Slurm/PBS/LSF/Bohrium) jobs input scripts, submit them to HPC systems, and poke until they finish.
+
+DPDispatcher will monitor (poke) until these jobs finish and download the results files (if these jobs are running on remote systems connected by SSH).
+
+For more information, check the [documentation](https://dpdispatcher.readthedocs.io/).
+
+## Installation
+
+DPDispatcher can be installed by `pip`:
+
+```bash
+pip install dpdispatcher
+```
+
+## Usage
+
+See [Getting Started](https://dpdispatcher.readthedocs.io/en/latest/getting-started.html) for usage.
+
+## Contributing
+
+DPDispatcher is maintained by Deep Modeling's developers and welcomes other people.
+See [Contributing Guide](CONTRIBUTING.md) to become a contributor! 🤓
+
+## References
+
+DPDispatcher is derivated from the [DP-GEN](https://github.com/deepmodeling/dpgen) package. To mention DPDispatcher in a scholarly publication, please read Section 3.3 in the [DP-GEN paper](https://doi.org/10.1016/j.cpc.2020.107206).

{dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/doc/batch.md
@@ -21,9 +21,9 @@ To avoid running multiple jobs at the same time, one could set {dargs:argument}`
 One needs to make sure slurm has been setup in the remote server and the related environment is activated.
 
 When `SlurmJobArray` is used, dpdispatcher submits Slurm jobs with [job arrays](https://slurm.schedmd.com/job_array.html).
-In this way,
+In this way, several dpdispatcher {class}`task <dpdispatcher.submission.Task>`s map to a Slurm job and a dpdispatcher {class}`job <dpdispatcher.submission.Job>` maps to a Slurm job array.
 Millions of Slurm jobs can be submitted quickly and Slurm can execute all Slurm jobs at the same time.
-One can use {dargs:argument}`group_size <resources/group_size>` to control how many Slurm jobs are contained in a Slurm job array.
+One can use {dargs:argument}`group_size <resources/group_size>` and {dargs:argument}`slurm_job_size <resources[SlurmJobArray]/kwargs/slurm_job_size>` to control how many Slurm jobs are contained in a Slurm job array.
 
 ## OpenPBS or PBSPro
 
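As an aside, here is a minimal sketch (not part of the package; the task count and sizes are invented) of how the new `slurm_job_size` kwarg maps tasks onto Slurm array indices, mirroring the `ii // slurm_job_size` grouping introduced in `dpdispatcher/slurm.py` further down in this diff:

```python
import math

# Illustrative numbers only: group_size tasks go into one dpdispatcher job,
# and slurm_job_size tasks share one Slurm job-array element.
n_tasks_in_job = 10  # e.g. group_size
slurm_job_size = 3   # new resources kwarg for SlurmJobArray

# Mirrors SlurmJobArray.gen_script_header: array indices run 0..ceil(n/size)-1
print(f"#SBATCH --array=0-{math.ceil(n_tasks_in_job / slurm_job_size) - 1}")

# Mirrors SlurmJobArray.gen_script_command: task ii runs in element ii // slurm_job_size
for ii in range(n_tasks_in_job):
    print(f"task {ii} -> SLURM_ARRAY_TASK_ID {ii // slurm_job_size}")
```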
@@ -62,3 +62,11 @@ Read Bohrium documentation for details.
 
 `DistributedShell` is used to submit yarn jobs.
 Read [Support DPDispatcher on Yarn](dpdispatcher_on_yarn.md) for details.
+
+## Fugaku
+
+{dargs:argument}`batch_type <resources/batch_type>`: `Fugaku`
+
+[Fujitsu cloud service](https://doc.cloud.global.fujitsu.com/lib/common/jp/hpc-user-manual/) is a job scheduling system used by Fujitsu's HPCs such as Fugaku, ITO and K computer. It should be noted that although the same job scheduling system is used, there are some differences in the details, Fagaku class cannot be directly used for other HPCs.
+
+Read Fujitsu cloud service documentation for details.

{dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/__init__.py
@@ -43,6 +43,7 @@ except ImportError:
 from .distributed_shell import DistributedShell
 from .dp_cloud_server import DpCloudServer, Lebesgue
 from .dp_cloud_server_context import DpCloudServerContext, LebesgueContext
+from .fugaku import Fugaku
 from .hdfs_context import HDFSContext
 from .lazy_local_context import LazyLocalContext
 from .local_context import LocalContext
@@ -85,6 +86,7 @@ __all__ = [
     "PBS",
     "Shell",
     "Slurm",
+    "Fugaku",
     "SSHContext",
     "Submission",
     "Task",

{dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/base_context.py
@@ -70,9 +70,6 @@ class BaseContext(metaclass=ABCMeta):
     def read_file(self, fname):
         raise NotImplementedError("abstract method")
 
-    def kill(self, proc):
-        raise NotImplementedError("abstract method")
-
     def check_finish(self, proc):
         raise NotImplementedError("abstract method")
 

{dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/distributed_shell.py
@@ -136,17 +136,16 @@ class DistributedShell(Machine):
 
         resources = job.resources
         submit_command = (
-            "hadoop jar
+            "hadoop jar {}/hadoop-yarn-applications-distributedshell-*.jar "
             "org.apache.hadoop.yarn.applications.distributedshell.Client "
-            "-jar
-            '-queue
+            "-jar {}/hadoop-yarn-applications-distributedshell-*.jar "
+            '-queue {} -appname "distributedshell_dpgen_{}" '
             "-shell_env YARN_CONTAINER_RUNTIME_TYPE=docker "
-            "-shell_env YARN_CONTAINER_RUNTIME_DOCKER_IMAGE
+            "-shell_env YARN_CONTAINER_RUNTIME_DOCKER_IMAGE={} "
             "-shell_env ENV_DOCKER_CONTAINER_SHM_SIZE='600m' "
             "-master_memory 1024 -master_vcores 2 -num_containers 1 "
-            "-container_resources memory-mb
-            "-shell_script /tmp
-            % (
+            "-container_resources memory-mb={},vcores={} "
+            "-shell_script /tmp/{}".format(
                 resources.kwargs.get("yarn_path", ""),
                 resources.kwargs.get("yarn_path", ""),
                 resources.queue_name,

{dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/dp_cloud_server.py
@@ -106,7 +106,9 @@ class Bohrium(Machine):
 
         input_data = self.input_data.copy()
 
-        input_data
+        if not input_data.get("job_resources"):
+            input_data["job_resources"] = []
+        input_data["job_resources"].append(job_resources)
         input_data["command"] = f"bash {job.script_file_name}"
         if not input_data.get("backward_files"):
             input_data["backward_files"] = self._gen_backward_files_list(job)

{dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/dp_cloud_server_context.py
@@ -270,9 +270,6 @@ class BohriumContext(BaseContext):
         # retcode = cmd_pipes['stdout'].channel.recv_exit_status()
         # return retcode, cmd_pipes['stdout'], cmd_pipes['stderr']
 
-    def kill(self, cmd_pipes):
-        pass
-
     @classmethod
     def machine_subfields(cls) -> List[Argument]:
         """Generate the machine subfields.

dpdispatcher-0.5.8/dpdispatcher/fugaku.py
@@ -0,0 +1,94 @@
+import shlex
+
+from dpdispatcher import dlog
+from dpdispatcher.JobStatus import JobStatus
+from dpdispatcher.machine import Machine
+
+fugaku_script_header_template = """\
+{queue_name_line}
+{fugaku_node_number_line}
+{fugaku_ntasks_per_node_line}
+"""
+
+
+class Fugaku(Machine):
+    def gen_script(self, job):
+        fugaku_script = super().gen_script(job)
+        return fugaku_script
+
+    def gen_script_header(self, job):
+        resources = job.resources
+        fugaku_script_header_dict = {}
+        fugaku_script_header_dict[
+            "fugaku_node_number_line"
+        ] = f'#PJM -L "node={resources.number_node}" '
+        fugaku_script_header_dict[
+            "fugaku_ntasks_per_node_line"
+        ] = '#PJM --mpi "max-proc-per-node={cpu_per_node}"'.format(
+            cpu_per_node=resources.cpu_per_node
+        )
+        fugaku_script_header_dict[
+            "queue_name_line"
+        ] = f'#PJM -L "rscgrp={resources.queue_name}"'
+        fugaku_script_header = fugaku_script_header_template.format(
+            **fugaku_script_header_dict
+        )
+        return fugaku_script_header
+
+    def do_submit(self, job):
+        script_file_name = job.script_file_name
+        script_str = self.gen_script(job)
+        job_id_name = job.job_hash + "_job_id"
+        # script_str = self.sub_script(job_dirs, cmd, args=args, resources=resources, outlog=outlog, errlog=errlog)
+        self.context.write_file(fname=script_file_name, write_str=script_str)
+        # self.context.write_file(fname=os.path.join(self.context.submission.work_base, script_file_name), write_str=script_str)
+        # script_file_dir = os.path.join(self.context.submission.work_base)
+        script_file_dir = self.context.remote_root
+        # stdin, stdout, stderr = self.context.block_checkcall('cd %s && %s %s' % (self.context.remote_root, 'pjsub', script_file_name))
+
+        stdin, stdout, stderr = self.context.block_checkcall(
+            "cd {} && {} {}".format(
+                shlex.quote(script_file_dir), "pjsub", shlex.quote(script_file_name)
+            )
+        )
+        subret = stdout.readlines()
+        job_id = subret[0].split()[5]
+        self.context.write_file(job_id_name, job_id)
+        return job_id
+
+    def default_resources(self, resources):
+        pass
+
+    def check_status(self, job):
+        job_id = job.job_id
+        if job_id == "":
+            return JobStatus.unsubmitted
+        ret, stdin, stdout, stderr = self.context.block_call("pjstat " + job_id)
+        err_str = stderr.read().decode("utf-8")
+        try:
+            status_line = stdout.read().decode("utf-8").split("\n")[-2]
+            # pjstat only retrun 0 if the job is not waiting or running
+        except Exception:
+            ret, stdin, stdout, stderr = self.context.block_call("pjstat -H " + job_id)
+            status_line = stdout.read().decode("utf-8").split("\n")[-2]
+            status_word = status_line.split()[3]
+            if status_word in ["EXT", "CCL", "ERR"]:
+                if self.check_finish_tag(job):
+                    dlog.info(f"job: {job.job_hash} {job.job_id} finished")
+                    return JobStatus.finished
+                else:
+                    return JobStatus.terminated
+            else:
+                return JobStatus.unknown
+        status_word = status_line.split()[3]
+        # dlog.info (status_word)
+        if status_word in ["QUE", "HLD", "RNA", "SPD"]:
+            return JobStatus.waiting
+        elif status_word in ["RUN", "RNE"]:
+            return JobStatus.running
+        else:
+            return JobStatus.unknown
+
+    def check_finish_tag(self, job):
+        job_tag_finished = job.job_hash + "_job_tag_finished"
+        return self.context.check_file_exists(job_tag_finished)
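For orientation, a small sketch (not from the package; the resource values are invented) of the `#PJM` header that `Fugaku.gen_script_header` renders from the template above:

```python
# Invented resource values, used only to show the rendered header.
resources = {"queue_name": "small", "number_node": 1, "cpu_per_node": 48}

header = "\n".join(
    [
        '#PJM -L "rscgrp={queue_name}"'.format(**resources),
        '#PJM -L "node={number_node}" '.format(**resources),
        '#PJM --mpi "max-proc-per-node={cpu_per_node}"'.format(**resources),
    ]
)
print(header)
# #PJM -L "rscgrp=small"
# #PJM -L "node=1"
# #PJM --mpi "max-proc-per-node=48"
```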

{dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/lazy_local_context.py
@@ -1,5 +1,4 @@
 import os
-import signal
 import subprocess as sp
 
 from dpdispatcher.base_context import BaseContext
@@ -167,9 +166,6 @@ class LazyLocalContext(BaseContext):
         )
         return proc
 
-    def kill(self, job_id):
-        os.kill(job_id, signal.SIGTERM)
-
     def check_finish(self, proc):
         return proc.poll() is not None
 

{dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/local_context.py
@@ -1,7 +1,6 @@
 import hashlib
 import os
 import shutil
-import signal
 import subprocess as sp
 from glob import glob
 from subprocess import TimeoutExpired
@@ -291,9 +290,6 @@ class LocalContext(BaseContext):
         )
         return proc
 
-    def kill(self, job_id):
-        os.kill(job_id, signal.SIGTERM)
-
     def check_finish(self, proc):
         return proc.poll() is not None
 

{dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/lsf.py
@@ -83,8 +83,7 @@ class LSF(Machine):
 
         try:
             stdin, stdout, stderr = self.context.block_checkcall(
-                "cd
-                % (
+                "cd {} && {} {}".format(
                     shlex.quote(self.context.remote_root),
                     "bsub < ",
                     shlex.quote(script_file_name),
@@ -211,3 +210,14 @@
                 doc="Extra arguments.",
             )
         ]
+
+    def kill(self, job):
+        """Kill the job.
+
+        Parameters
+        ----------
+        job : Job
+            job
+        """
+        job_id = job.job_id
+        ret, stdin, stdout, stderr = self.context.block_call("bkill " + str(job_id))

{dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/machine.py
@@ -377,8 +377,12 @@ class Machine(metaclass=ABCMeta):
         machine_args = [
             Argument("batch_type", str, optional=False, doc=doc_batch_type),
             # TODO: add default to local_root and remote_root after refactor the code
-            Argument(
-
+            Argument(
+                "local_root", [str, type(None)], optional=False, doc=doc_local_root
+            ),
+            Argument(
+                "remote_root", [str, type(None)], optional=True, doc=doc_remote_root
+            ),
             Argument(
                 "clean_asynchronously",
                 bool,
@@ -439,3 +443,15 @@
                 "kwargs", dict, optional=True, doc="This field is empty for this batch."
             )
         ]
+
+    def kill(self, job):
+        """Kill the job.
+
+        If not implemented, pass and let the user manually kill it.
+
+        Parameters
+        ----------
+        job : Job
+            job
+        """
+        dlog.warning("Job %s should be manually killed" % job.job_id)

{dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/pbs.py
@@ -46,8 +46,9 @@ class PBS(Machine):
         script_file_dir = self.context.remote_root
         # stdin, stdout, stderr = self.context.block_checkcall('cd %s && %s %s' % (self.context.remote_root, 'qsub', script_file_name))
         stdin, stdout, stderr = self.context.block_checkcall(
-            "cd
-
+            "cd {} && {} {}".format(
+                shlex.quote(script_file_dir), "qsub", shlex.quote(script_file_name)
+            )
         )
         subret = stdout.readlines()
         job_id = subret[0].split()[0]
@@ -94,6 +95,17 @@
         job_tag_finished = job.job_hash + "_job_tag_finished"
         return self.context.check_file_exists(job_tag_finished)
 
+    def kill(self, job):
+        """Kill the job.
+
+        Parameters
+        ----------
+        job : Job
+            job
+        """
+        job_id = job.job_id
+        ret, stdin, stdout, stderr = self.context.block_call("qdel " + str(job_id))
+
 
 class Torque(PBS):
     def check_status(self, job):

{dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/shell.py
@@ -25,8 +25,7 @@ class Shell(Machine):
         output_name = job.job_hash + ".out"
         self.context.write_file(fname=script_file_name, write_str=script_str)
         ret, stdin, stdout, stderr = self.context.block_call(
-            "cd
-            % (
+            "cd {} && {{ nohup bash {} 1>>{} 2>>{} & }} && echo $!".format(
                 shlex.quote(self.context.remote_root),
                 script_file_name,
                 output_name,
@@ -66,7 +65,7 @@
 
         # mark defunct process as terminated
         ret, stdin, stdout, stderr = self.context.block_call(
-            f"if ps -p {job_id} > /dev/null && ! (ps -p {job_id} | grep defunct >/dev/null) ; then echo 1; fi"
+            f"if ps -p {job_id} > /dev/null && ! (ps -o command -p {job_id} | grep defunct >/dev/null) ; then echo 1; fi"
         )
         if ret != 0:
             err_str = stderr.read().decode("utf-8")
@@ -101,3 +100,15 @@
         job_tag_finished = job.job_hash + "_job_tag_finished"
         # print('job finished: ',job.job_id, job_tag_finished)
         return self.context.check_file_exists(job_tag_finished)
+
+    def kill(self, job):
+        """Kill the job.
+
+        Parameters
+        ----------
+        job : Job
+            job
+        """
+        job_id = job.job_id
+        # 9 means exit, cannot be blocked
+        ret, stdin, stdout, stderr = self.context.block_call("kill -9 " + str(job_id))

{dpdispatcher-0.5.6 → dpdispatcher-0.5.8}/dpdispatcher/slurm.py
@@ -1,3 +1,4 @@
+import math
 import pathlib
 import shlex
 from typing import List
@@ -45,9 +46,12 @@ class Slurm(Machine):
             )
         else:
             script_header_dict["slurm_number_gpu_line"] = custom_gpu_line
-
-
-
+        if resources.queue_name != "":
+            script_header_dict[
+                "slurm_partition_line"
+            ] = f"#SBATCH --partition {resources.queue_name}"
+        else:
+            script_header_dict["slurm_partition_line"] = ""
         slurm_script_header = slurm_script_header_template.format(**script_header_dict)
         return slurm_script_header
 
@@ -60,8 +64,7 @@
         self.context.write_file(fname=script_file_name, write_str=script_str)
         # self.context.write_file(fname=os.path.join(self.context.submission.work_base, script_file_name), write_str=script_str)
         ret, stdin, stdout, stderr = self.context.block_call(
-            "cd
-            % (
+            "cd {} && {} {}".format(
                 shlex.quote(self.context.remote_root),
                 "sbatch",
                 shlex.quote(script_file_name),
@@ -78,7 +81,12 @@
                     "Get error code %d in submitting through ssh with job: %s . message: %s"
                     % (ret, job.job_hash, err_str)
                 )
-            elif
+            elif (
+                "Job violates accounting/QOS policy" in err_str
+                # the number of jobs exceeds DEFAULT_MAX_JOB_COUNT (by default 10000)
+                or "Slurm temporarily unable to accept job, sleeping and retrying"
+                in err_str
+            ):
                 # job number exceeds, skip the submitting
                 return ""
             raise RuntimeError(
@@ -115,6 +123,7 @@
             elif (
                 "Socket timed out on send/recv operation" in err_str
                 or "Unable to contact slurm controller" in err_str
+                or "Invalid user for SlurmUser" in err_str
             ):
                 # retry 3 times
                 raise RetrySignal(
@@ -194,30 +203,47 @@
             )
         ]
 
+    def kill(self, job):
+        """Kill the job.
+
+        Parameters
+        ----------
+        job : Job
+            job
+        """
+        job_id = job.job_id
+        # -Q Do not report an error if the specified job is already completed.
+        ret, stdin, stdout, stderr = self.context.block_call(
+            "scancel -Q " + str(job_id)
+        )
+        # we do not need to stop here if scancel failed; just continue
+
 
 class SlurmJobArray(Slurm):
     """Slurm with job array enabled for multiple tasks in a job."""
 
     def gen_script_header(self, job):
+        slurm_job_size = job.resources.kwargs.get("slurm_job_size", 1)
         if job.fail_count > 0:
             # resubmit jobs, check if some of tasks have been finished
-            job_array =
+            job_array = set()
             for ii, task in enumerate(job.job_task_list):
                 task_tag_finished = (
                     pathlib.PurePath(task.task_work_path)
                     / (task.task_hash + "_task_tag_finished")
                 ).as_posix()
                 if not self.context.check_file_exists(task_tag_finished):
-                    job_array.
+                    job_array.add(ii // slurm_job_size)
             return super().gen_script_header(job) + "\n#SBATCH --array=%s" % (
                 ",".join(map(str, job_array))
            )
         return super().gen_script_header(job) + "\n#SBATCH --array=0-%d" % (
-            len(job.job_task_list) - 1
+            math.ceil(len(job.job_task_list) / slurm_job_size) - 1
         )
 
     def gen_script_command(self, job):
         resources = job.resources
+        slurm_job_size = resources.kwargs.get("slurm_job_size", 1)
         # SLURM_ARRAY_TASK_ID: 0 ~ n_jobs-1
         script_command = "case $SLURM_ARRAY_TASK_ID in\n"
         for ii, task in enumerate(job.job_task_list):
@@ -243,10 +269,16 @@ class SlurmJobArray(Slurm):
                 task_tag_finished=task_tag_finished,
                 log_err_part=log_err_part,
             )
-
+            if ii % slurm_job_size == 0:
+                script_command += f"{ii // slurm_job_size})\n"
             script_command += single_script_command
             script_command += self.gen_script_wait(resources=resources)
-            script_command += "\n
+            script_command += "\n"
+            if (
+                ii % slurm_job_size == slurm_job_size - 1
+                or ii == len(job.job_task_list) - 1
+            ):
+                script_command += ";;\n"
         script_command += "*)\nexit 1\n;;\nesac\n"
         return script_command
 
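To visualize the effect of the grouping above, here is a toy sketch (not from the package; the task commands are placeholders) of the case block that `gen_script_command` now emits when `slurm_job_size=2`:

```python
slurm_job_size = 2
task_commands = ["bash task_0.sub", "bash task_1.sub", "bash task_2.sub"]  # placeholders

script = "case $SLURM_ARRAY_TASK_ID in\n"
for ii, cmd in enumerate(task_commands):
    if ii % slurm_job_size == 0:
        script += f"{ii // slurm_job_size})\n"  # open a new array-element branch
    script += cmd + "\n"
    if ii % slurm_job_size == slurm_job_size - 1 or ii == len(task_commands) - 1:
        script += ";;\n"  # close the branch
script += "*)\nexit 1\n;;\nesac\n"
print(script)
# tasks 0 and 1 run under SLURM_ARRAY_TASK_ID 0, task 2 under SLURM_ARRAY_TASK_ID 1
```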

@@ -337,9 +369,30 @@
     def check_finish_tag(self, job):
         results = []
         for task in job.job_task_list:
-            task_tag_finished = (
-                pathlib.PurePath(task.task_work_path)
-                / (task.task_hash + "_task_tag_finished")
-            ).as_posix()
-            results.append(self.context.check_file_exists(task_tag_finished))
+            task.get_task_state(self.context)
+            results.append(task.task_state == JobStatus.finished)
         return all(results)
+
+    @classmethod
+    def resources_subfields(cls) -> List[Argument]:
+        """Generate the resources subfields.
+
+        Returns
+        -------
+        list[Argument]
+            resources subfields
+        """
+        doc_slurm_job_size = "Number of tasks in a Slurm job"
+        arg = super().resources_subfields()[0]
+        arg.extend_subfields(
+            [
+                Argument(
+                    "slurm_job_size",
+                    int,
+                    optional=True,
+                    default=1,
+                    doc=doc_slurm_job_size,
+                ),
+            ]
+        )
+        return [arg]