dpdispatcher 0.5.8__tar.gz → 0.5.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dpdispatcher might be problematic. Click here for more details.
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/.pre-commit-config.yaml +4 -4
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/Dockerfile +1 -1
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/PKG-INFO +8 -1
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/README.md +6 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/doc/batch.md +6 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/doc/context.md +8 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/doc/install.md +6 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/__init__.py +4 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/_version.py +2 -2
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/dp_cloud_server.py +7 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/dp_cloud_server_context.py +10 -7
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/dpcloudserver/client.py +22 -9
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/fugaku.py +1 -3
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/hdfs_cli.py +4 -12
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/hdfs_context.py +1 -4
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/lsf.py +2 -6
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/machine.py +1 -3
- dpdispatcher-0.5.10/dpdispatcher/openapi.py +198 -0
- dpdispatcher-0.5.10/dpdispatcher/openapi_context.py +259 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/pbs.py +4 -12
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/slurm.py +2 -6
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/submission.py +9 -19
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher.egg-info/PKG-INFO +8 -1
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher.egg-info/SOURCES.txt +3 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher.egg-info/requires.txt +6 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/pyproject.toml +2 -1
- dpdispatcher-0.5.10/tests/jsons/machine_openapi.json +17 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/.github/workflows/ci-docker.yml +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/.github/workflows/machines.yml +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/.github/workflows/mirror_gitee.yml +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/.github/workflows/publish_conda.yml +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/.github/workflows/pyright.yml +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/.github/workflows/release.yml +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/.github/workflows/test.yml +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/.gitignore +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/CONTRIBUTING.md +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/LICENSE +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/ci/LICENSE +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/ci/README.md +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/ci/pbs/docker-compose.yml +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/ci/pbs/start-pbs.sh +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/ci/pbs.sh +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/ci/slurm/docker-compose.yml +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/ci/slurm/register_cluster.sh +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/ci/slurm/start-slurm.sh +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/ci/slurm.sh +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/ci/ssh/docker-compose.yml +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/ci/ssh/start-ssh.sh +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/ci/ssh.sh +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/ci/ssh_rsync.sh +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/codecov.yml +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/conda/conda_build_config.yaml +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/conda/meta.yaml +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/doc/.gitignore +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/doc/Makefile +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/doc/conf.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/doc/credits.rst +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/doc/dpdispatcher_on_yarn.md +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/doc/examples/expanse.md +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/doc/examples/g16.md +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/doc/examples/shell.md +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/doc/getting-started.md +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/doc/index.rst +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/doc/machine.rst +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/doc/make.bat +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/doc/requirements.txt +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/doc/resources.rst +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/doc/task.rst +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/JobStatus.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/arginfo.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/base_context.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/distributed_shell.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/dpcloudserver/__init__.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/dpcloudserver/config.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/dpcloudserver/retcode.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/dpcloudserver/temp_test.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/dpcloudserver/zip_file.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/dpdisp.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/lazy_local_context.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/local_context.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/shell.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/ssh_context.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher/utils.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher.egg-info/dependency_links.txt +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher.egg-info/entry_points.txt +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/dpdispatcher.egg-info/top_level.txt +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/examples/machine/expanse.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/examples/machine/lazy_local.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/examples/machine/mandu.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/examples/resources/expanse_cpu.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/examples/resources/mandu.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/examples/task/deepmd-kit.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/examples/task/g16.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/scripts/script_gen_dargs_docs.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/scripts/script_gen_dargs_json.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/setup.cfg +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/.gitignore +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/__init__.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/batch.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/context.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/debug_test_class_submission_init.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/devel_test_ali_ehpc.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/devel_test_dp_cloud_server.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/devel_test_lazy_ali_ehpc.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/devel_test_lsf.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/devel_test_shell.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/devel_test_slurm.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/devel_test_ssh_ali_ehpc.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/graph.pb +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/jsons/job.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/jsons/machine.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/jsons/machine_ali_ehpc.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/jsons/machine_center.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/jsons/machine_diffenert.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/jsons/machine_dp_cloud_server.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/jsons/machine_fugaku.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/jsons/machine_if_cuda_multi_devices.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/jsons/machine_lazy_local_lsf.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/jsons/machine_lazy_local_slurm.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/jsons/machine_lazylocal_shell.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/jsons/machine_local_fugaku.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/jsons/machine_local_shell.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/jsons/machine_lsf.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/jsons/machine_slurm.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/jsons/machine_yarn.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/jsons/resources.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/jsons/submission.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/jsons/task.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/lsf/context.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/lsf/test_dispatcher.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/lsf/test_lsf_local.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/old/test_dispatcher_utils.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/old/test_lazy_local_context.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/old/test_local_context.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/old/test_local_session.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/old/test_ssh_context.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/pbs/context.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/pbs/test_dispatcher.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/pbs/test_pbs_local.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/sample_class.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/script_gen_json.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/shell/context.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/shell/test_dispatcher.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/shell/test_shell_local.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/shell/test_shell_ssh.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/slurm/context.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/slurm/test_dispatcher.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/slurm/test_dispatcher_lazy_local.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/slurm/test_slurm_lazy_local.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/slurm/test_slurm_local.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/slurm/test_slurm_ssh.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/slurm_test.env +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_argcheck.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_class_job.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_class_machine.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_class_machine_dispatch.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_class_resources.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_class_submission.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_class_submission_init.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_class_task.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_context_dir/0_md/bct-1/conf.lmp +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_context_dir/0_md/bct-1/input.lammps +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_context_dir/0_md/bct-1/some_dir/some_file +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_context_dir/0_md/bct-2/conf.lmp +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_context_dir/0_md/bct-2/input.lammps +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_context_dir/0_md/bct-3/conf.lmp +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_context_dir/0_md/bct-3/input.lammps +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_context_dir/0_md/bct-4/conf.lmp +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_context_dir/0_md/bct-4/input.lammps +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_context_dir/0_md/dir with space/file with space +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_context_dir/0_md/graph.pb +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_context_dir/0_md/some_dir/some_file +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_group_size.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_hdfs_context.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_hdfs_dir/0_md/bct-1/conf.lmp +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_hdfs_dir/0_md/bct-1/input.lammps +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_hdfs_dir/0_md/bct-2/conf.lmp +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_hdfs_dir/0_md/bct-2/input.lammps +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_hdfs_dir/0_md/bct-3/conf.lmp +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_hdfs_dir/0_md/bct-3/input.lammps +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_hdfs_dir/0_md/bct-4/conf.lmp +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_hdfs_dir/0_md/bct-4/input.lammps +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_hdfs_dir/0_md/graph.pb +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_if_cuda_multi_devices/test_dir/test.txt +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_import_classes.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_lazy_local_context.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_local_context.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_lsf_dir/0_md/bct-1/conf.lmp +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_lsf_dir/0_md/bct-1/input.lammps +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_lsf_dir/0_md/bct-2/conf.lmp +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_lsf_dir/0_md/bct-2/input.lammps +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_lsf_dir/0_md/bct-3/conf.lmp +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_lsf_dir/0_md/bct-3/input.lammps +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_lsf_dir/0_md/bct-4/conf.lmp +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_lsf_dir/0_md/bct-4/input.lammps +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_lsf_dir/0_md/graph.pb +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_lsf_dir/0_md/submission.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_lsf_script_generation.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_pbs_dir/0_md/bct-1/conf.lmp +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_pbs_dir/0_md/bct-1/input.lammps +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_pbs_dir/0_md/bct-2/conf.lmp +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_pbs_dir/0_md/bct-2/input.lammps +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_pbs_dir/0_md/bct-3/conf.lmp +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_pbs_dir/0_md/bct-3/input.lammps +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_pbs_dir/0_md/bct-4/conf.lmp +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_pbs_dir/0_md/bct-4/input.lammps +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_pbs_dir/0_md/graph.pb +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_retry.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_run_submission.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_run_submission_ratio_unfinished.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_shell_cuda_multi_devices.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_shell_trival.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_shell_trival_dir/fail_dir/mock_fail_task.txt +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_shell_trival_dir/parent_dir/dir with space/example.txt +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_shell_trival_dir/parent_dir/dir1/example.txt +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_shell_trival_dir/parent_dir/dir2/example.txt +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_shell_trival_dir/parent_dir/dir3/example.txt +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_shell_trival_dir/parent_dir/dir4/example.txt +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_shell_trival_dir/parent_dir/graph.pb +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_shell_trival_dir/recover_dir/mock_recover_task.txt +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_slurm_dir/0_md/bct-1/conf.lmp +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_slurm_dir/0_md/bct-1/input.lammps +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_slurm_dir/0_md/bct-2/conf.lmp +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_slurm_dir/0_md/bct-2/input.lammps +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_slurm_dir/0_md/bct-3/conf.lmp +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_slurm_dir/0_md/bct-3/input.lammps +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_slurm_dir/0_md/bct-4/conf.lmp +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_slurm_dir/0_md/bct-4/input.lammps +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_slurm_dir/0_md/d3c842c5b9476e48f7145b370cd330372b9293e1.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_slurm_dir/0_md/graph.pb +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_slurm_dir/0_md/submission.json +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_slurm_script_generation.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_ssh_context.py +0 -0
- {dpdispatcher-0.5.8 → dpdispatcher-0.5.10}/tests/test_work_path/.gitkeep +0 -0
|
@@ -17,12 +17,12 @@ repos:
|
|
|
17
17
|
- id: check-toml
|
|
18
18
|
# Python
|
|
19
19
|
- repo: https://github.com/psf/black
|
|
20
|
-
rev: 23.
|
|
20
|
+
rev: 23.7.0
|
|
21
21
|
hooks:
|
|
22
22
|
- id: black-jupyter
|
|
23
|
-
- repo: https://github.com/
|
|
23
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
24
24
|
# Ruff version.
|
|
25
|
-
rev: v0.0.
|
|
25
|
+
rev: v0.0.278
|
|
26
26
|
hooks:
|
|
27
27
|
- id: ruff
|
|
28
28
|
args: ["--fix"]
|
|
@@ -34,6 +34,6 @@ repos:
|
|
|
34
34
|
args: ["--write"]
|
|
35
35
|
# Python inside docs
|
|
36
36
|
- repo: https://github.com/asottile/blacken-docs
|
|
37
|
-
rev: 1.
|
|
37
|
+
rev: 1.15.0
|
|
38
38
|
hooks:
|
|
39
39
|
- id: blacken-docs
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: dpdispatcher
|
|
3
|
-
Version: 0.5.
|
|
3
|
+
Version: 0.5.10
|
|
4
4
|
Summary: Generate HPC scheduler systems jobs input scripts, submit these scripts to HPC systems, and poke until they finish
|
|
5
5
|
Author: DeepModeling
|
|
6
6
|
License: GNU LESSER GENERAL PUBLIC LICENSE
|
|
@@ -186,6 +186,7 @@ Requires-Python: >=3.7
|
|
|
186
186
|
Description-Content-Type: text/markdown
|
|
187
187
|
Provides-Extra: docs
|
|
188
188
|
Provides-Extra: cloudserver
|
|
189
|
+
Provides-Extra: bohrium
|
|
189
190
|
Provides-Extra: test
|
|
190
191
|
License-File: LICENSE
|
|
191
192
|
|
|
@@ -210,6 +211,12 @@ DPDispatcher can be installed by `pip`:
|
|
|
210
211
|
pip install dpdispatcher
|
|
211
212
|
```
|
|
212
213
|
|
|
214
|
+
To add [Bohrium](https://bohrium.dp.tech/) support, execute
|
|
215
|
+
|
|
216
|
+
```bash
|
|
217
|
+
pip install dpdispatcher[bohrium]
|
|
218
|
+
```
|
|
219
|
+
|
|
213
220
|
## Usage
|
|
214
221
|
|
|
215
222
|
See [Getting Started](https://dpdispatcher.readthedocs.io/en/latest/getting-started.html) for usage.
|
|
@@ -19,6 +19,12 @@ DPDispatcher can be installed by `pip`:
|
|
|
19
19
|
pip install dpdispatcher
|
|
20
20
|
```
|
|
21
21
|
|
|
22
|
+
To add [Bohrium](https://bohrium.dp.tech/) support, execute
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install dpdispatcher[bohrium]
|
|
26
|
+
```
|
|
27
|
+
|
|
22
28
|
## Usage
|
|
23
29
|
|
|
24
30
|
See [Getting Started](https://dpdispatcher.readthedocs.io/en/latest/getting-started.html) for usage.
|
|
@@ -70,3 +70,9 @@ Read [Support DPDispatcher on Yarn](dpdispatcher_on_yarn.md) for details.
|
|
|
70
70
|
[Fujitsu cloud service](https://doc.cloud.global.fujitsu.com/lib/common/jp/hpc-user-manual/) is a job scheduling system used by Fujitsu's HPCs such as Fugaku, ITO and K computer. It should be noted that although the same job scheduling system is used, there are some differences in the details, so the Fugaku class cannot be directly used for other HPCs.
|
|
71
71
|
|
|
72
72
|
Read Fujitsu cloud service documentation for details.
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
## OpenAPI
|
|
76
|
+
|
|
77
|
+
{dargs:argument}`batch_type <resources/batch_type>`: `OpenAPI`
|
|
78
|
+
OpenAPI is a new way to submit jobs to Bohrium. It uses an [AccessKey](https://bohrium.dp.tech/personal/setting) instead of a username and password. Read the Bohrium documentation for details.
|
|
@@ -42,3 +42,11 @@ To use Bohrium, one needs to provide necessary parameters in {dargs:argument}`re
|
|
|
42
42
|
|
|
43
43
|
The Hadoop Distributed File System (HDFS) is a distributed file system.
|
|
44
44
|
Read [Support DPDispatcher on Yarn](dpdispatcher_on_yarn.md) for details.
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
## OpenAPI
|
|
48
|
+
|
|
49
|
+
{dargs:argument}`context_type <machine/context_type>`: `OpenAPI`
|
|
50
|
+
|
|
51
|
+
OpenAPI is a new way to submit jobs to Bohrium. It uses an [AccessKey](https://bohrium.dp.tech/personal/setting) instead of a username and password. Read the Bohrium documentation for details.
|
|
52
|
+
To use OpenAPI, one needs to provide necessary parameters in {dargs:argument}`remote_profile <machine[OpenAPIContext]/remote_profile>`.
|
|
@@ -49,6 +49,8 @@ from .lazy_local_context import LazyLocalContext
|
|
|
49
49
|
from .local_context import LocalContext
|
|
50
50
|
from .lsf import LSF
|
|
51
51
|
from .machine import Machine
|
|
52
|
+
from .openapi import OpenAPI
|
|
53
|
+
from .openapi_context import OpenAPIContext
|
|
52
54
|
from .pbs import PBS, Torque
|
|
53
55
|
from .shell import Shell
|
|
54
56
|
from .slurm import Slurm
|
|
@@ -77,6 +79,8 @@ __all__ = [
|
|
|
77
79
|
"__version__",
|
|
78
80
|
"DistributedShell",
|
|
79
81
|
"DpCloudServer",
|
|
82
|
+
"OpenAPI",
|
|
83
|
+
"OpenAPIContext",
|
|
80
84
|
"DpCloudServerContext",
|
|
81
85
|
"HDFSContext",
|
|
82
86
|
"LazyLocalContext",
|
|
@@ -31,6 +31,13 @@ class Bohrium(Machine):
|
|
|
31
31
|
phone = context.remote_profile.get("phone", None)
|
|
32
32
|
username = context.remote_profile.get("username", None)
|
|
33
33
|
password = context.remote_profile.get("password", None)
|
|
34
|
+
|
|
35
|
+
ticket = os.environ.get("BOHR_TICKET", None)
|
|
36
|
+
if ticket:
|
|
37
|
+
self.api = Client(ticket=ticket)
|
|
38
|
+
self.group_id = None
|
|
39
|
+
return
|
|
40
|
+
|
|
34
41
|
if email is None and username is not None:
|
|
35
42
|
raise DeprecationWarning(
|
|
36
43
|
"username is no longer support in current version, "
|
|
@@ -21,7 +21,7 @@ DP_CLOUD_SERVER_HOME_DIR = os.path.join(
|
|
|
21
21
|
os.path.expanduser("~"), ".dpdispatcher/", "dp_cloud_server/"
|
|
22
22
|
)
|
|
23
23
|
ENDPOINT = "http://oss-cn-shenzhen.aliyuncs.com"
|
|
24
|
-
BUCKET_NAME = "dpcloudserver"
|
|
24
|
+
BUCKET_NAME = os.environ.get("BUCKET_NAME", "dpcloudserver")
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
class BohriumContext(BaseContext):
|
|
@@ -39,9 +39,16 @@ class BohriumContext(BaseContext):
|
|
|
39
39
|
self.init_remote_root = remote_root
|
|
40
40
|
self.temp_local_root = os.path.abspath(local_root)
|
|
41
41
|
self.remote_profile = remote_profile
|
|
42
|
+
ticket = os.environ.get("BOHR_TICKET", None)
|
|
42
43
|
email = remote_profile.get("email", None)
|
|
43
44
|
phone = remote_profile.get("phone", None)
|
|
44
45
|
password = remote_profile.get("password")
|
|
46
|
+
os.makedirs(DP_CLOUD_SERVER_HOME_DIR, exist_ok=True)
|
|
47
|
+
|
|
48
|
+
if ticket is not None:
|
|
49
|
+
self.api = Client(ticket=ticket)
|
|
50
|
+
return
|
|
51
|
+
|
|
45
52
|
if email is None and phone is None:
|
|
46
53
|
raise ValueError(
|
|
47
54
|
"can not find email/phone number in remote_profile, please check your machine file."
|
|
@@ -57,8 +64,6 @@ class BohriumContext(BaseContext):
|
|
|
57
64
|
|
|
58
65
|
self.api = Client(account, password)
|
|
59
66
|
|
|
60
|
-
os.makedirs(DP_CLOUD_SERVER_HOME_DIR, exist_ok=True)
|
|
61
|
-
|
|
62
67
|
@classmethod
|
|
63
68
|
def load_from_dict(cls, context_dict):
|
|
64
69
|
local_root = context_dict["local_root"]
|
|
@@ -256,9 +261,7 @@ class BohriumContext(BaseContext):
|
|
|
256
261
|
return os.path.isfile(os.path.join(DP_CLOUD_SERVER_HOME_DIR, fname))
|
|
257
262
|
|
|
258
263
|
def clean(self):
|
|
259
|
-
submission_file_name = "{submission_hash}.json"
|
|
260
|
-
submission_hash=self.submission.submission_hash
|
|
261
|
-
)
|
|
264
|
+
submission_file_name = f"{self.submission.submission_hash}.json"
|
|
262
265
|
submission_json = os.path.join(DP_CLOUD_SERVER_HOME_DIR, submission_file_name)
|
|
263
266
|
os.remove(submission_json)
|
|
264
267
|
return True
|
|
@@ -288,7 +291,7 @@ class BohriumContext(BaseContext):
|
|
|
288
291
|
dict,
|
|
289
292
|
[
|
|
290
293
|
Argument("email", str, optional=True, doc="Email"),
|
|
291
|
-
Argument("password", str, optional=
|
|
294
|
+
Argument("password", str, optional=True, doc="Password"),
|
|
292
295
|
Argument(
|
|
293
296
|
"program_id",
|
|
294
297
|
int,
|
|
@@ -25,7 +25,9 @@ class RequestInfoException(Exception):
|
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
class Client:
|
|
28
|
-
def __init__(
|
|
28
|
+
def __init__(
|
|
29
|
+
self, email=None, password=None, debug=False, ticket=None, base_url=API_HOST
|
|
30
|
+
):
|
|
29
31
|
self.debug = debug
|
|
30
32
|
self.debug = os.getenv("LBG_CLI_DEBUG_PRINT", debug)
|
|
31
33
|
self.config = {}
|
|
@@ -35,6 +37,7 @@ class Client:
|
|
|
35
37
|
self.config["password"] = password
|
|
36
38
|
self.base_url = base_url
|
|
37
39
|
self.last_log_offset = 0
|
|
40
|
+
self.ticket = ticket
|
|
38
41
|
|
|
39
42
|
def post(self, url, data=None, header=None, params=None, retry=5):
|
|
40
43
|
return self._req(
|
|
@@ -51,19 +54,26 @@ class Client:
|
|
|
51
54
|
header = {}
|
|
52
55
|
if not self.token:
|
|
53
56
|
self.refresh_token()
|
|
57
|
+
self.ticket = os.environ.get("BOHR_TICKET", "")
|
|
54
58
|
header["Authorization"] = f"jwt {self.token}"
|
|
59
|
+
header["Brm-Ticket"] = self.ticket
|
|
55
60
|
resp_code = None
|
|
56
61
|
err = None
|
|
57
62
|
for i in range(retry):
|
|
58
63
|
resp = None
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
64
|
+
try:
|
|
65
|
+
if method == "GET":
|
|
66
|
+
resp = requests.get(url, params=params, headers=header)
|
|
67
|
+
else:
|
|
68
|
+
if self.debug:
|
|
69
|
+
print(data)
|
|
70
|
+
resp = requests.post(url, json=data, params=params, headers=header)
|
|
71
|
+
except Exception as e:
|
|
72
|
+
dlog.error(f"request({i}) error {e}", i, stack_info=ENABLE_STACK)
|
|
73
|
+
err = e
|
|
74
|
+
time.sleep(1 * i)
|
|
75
|
+
continue
|
|
76
|
+
|
|
67
77
|
resp_code = resp.status_code
|
|
68
78
|
if not resp.ok:
|
|
69
79
|
if self.debug:
|
|
@@ -96,6 +106,9 @@ class Client:
|
|
|
96
106
|
self.user_id = resp["user_id"]
|
|
97
107
|
|
|
98
108
|
def refresh_token(self, retry=3):
|
|
109
|
+
self.ticket = os.environ.get("BOHR_TICKET", "")
|
|
110
|
+
if self.ticket:
|
|
111
|
+
return
|
|
99
112
|
url = "/account/login"
|
|
100
113
|
post_data = {"email": self.config["email"], "password": self.config["password"]}
|
|
101
114
|
resp_code = None
|
|
@@ -24,9 +24,7 @@ class Fugaku(Machine):
|
|
|
24
24
|
] = f'#PJM -L "node={resources.number_node}" '
|
|
25
25
|
fugaku_script_header_dict[
|
|
26
26
|
"fugaku_ntasks_per_node_line"
|
|
27
|
-
] = '#PJM --mpi "max-proc-per-node={cpu_per_node}"'
|
|
28
|
-
cpu_per_node=resources.cpu_per_node
|
|
29
|
-
)
|
|
27
|
+
] = f'#PJM --mpi "max-proc-per-node={resources.cpu_per_node}"'
|
|
30
28
|
fugaku_script_header_dict[
|
|
31
29
|
"queue_name_line"
|
|
32
30
|
] = f'#PJM -L "rscgrp={resources.queue_name}"'
|
|
@@ -90,9 +90,7 @@ class HDFS:
|
|
|
90
90
|
raise RuntimeError(
|
|
91
91
|
"try to access local_path[{}] " "but failed".format(local_path)
|
|
92
92
|
)
|
|
93
|
-
cmd = "hadoop fs -copyFromLocal -f {
|
|
94
|
-
local=local_path, remote=to_uri
|
|
95
|
-
)
|
|
93
|
+
cmd = f"hadoop fs -copyFromLocal -f {local_path} {to_uri}"
|
|
96
94
|
try:
|
|
97
95
|
ret, out, err = run_cmd_with_all_output(cmd)
|
|
98
96
|
if ret == 0:
|
|
@@ -106,9 +104,7 @@ class HDFS:
|
|
|
106
104
|
)
|
|
107
105
|
except Exception as e:
|
|
108
106
|
raise RuntimeError(
|
|
109
|
-
"Cannot copy local[{}] to remote[{}] with cmd[{}]"
|
|
110
|
-
local_path, to_uri, cmd
|
|
111
|
-
)
|
|
107
|
+
f"Cannot copy local[{local_path}] to remote[{to_uri}] with cmd[{cmd}]"
|
|
112
108
|
) from e
|
|
113
109
|
|
|
114
110
|
@staticmethod
|
|
@@ -118,9 +114,7 @@ class HDFS:
|
|
|
118
114
|
remote = from_uri
|
|
119
115
|
elif isinstance(from_uri, list) or isinstance(from_uri, tuple):
|
|
120
116
|
remote = " ".join(from_uri)
|
|
121
|
-
cmd = "hadoop fs -copyToLocal {remote} {
|
|
122
|
-
remote=remote, local=local_path
|
|
123
|
-
)
|
|
117
|
+
cmd = f"hadoop fs -copyToLocal {remote} {local_path}"
|
|
124
118
|
|
|
125
119
|
try:
|
|
126
120
|
ret, out, err = run_cmd_with_all_output(cmd)
|
|
@@ -135,9 +129,7 @@ class HDFS:
|
|
|
135
129
|
)
|
|
136
130
|
except Exception as e:
|
|
137
131
|
raise RuntimeError(
|
|
138
|
-
"Cannot copy remote[{}] to local[{}] with cmd[{}]"
|
|
139
|
-
from_uri, local_path, cmd
|
|
140
|
-
)
|
|
132
|
+
f"Cannot copy remote[{from_uri}] to local[{local_path}] with cmd[{cmd}]"
|
|
141
133
|
) from e
|
|
142
134
|
|
|
143
135
|
@staticmethod
|
|
@@ -137,10 +137,7 @@ class HDFSContext(BaseContext):
|
|
|
137
137
|
if os.path.exists(gz_dir):
|
|
138
138
|
shutil.rmtree(gz_dir, ignore_errors=True)
|
|
139
139
|
os.mkdir(os.path.join(self.local_root, "tmp"))
|
|
140
|
-
rfile_tgz = "{}/{}_*_download.tar.gz"
|
|
141
|
-
self.remote_root,
|
|
142
|
-
submission.submission_hash,
|
|
143
|
-
)
|
|
140
|
+
rfile_tgz = f"{self.remote_root}/{submission.submission_hash}_*_download.tar.gz"
|
|
144
141
|
lfile_tgz = "%s/tmp/" % (self.local_root)
|
|
145
142
|
HDFS.copy_to_local(rfile_tgz, lfile_tgz)
|
|
146
143
|
|
|
@@ -31,12 +31,8 @@ class LSF(Machine):
|
|
|
31
31
|
"lsf_nodes_line": "#BSUB -n {number_cores}".format(
|
|
32
32
|
number_cores=resources.number_node * resources.cpu_per_node
|
|
33
33
|
),
|
|
34
|
-
"lsf_ptile_line": "#BSUB -R 'span[ptile={cpu_per_node}]'"
|
|
35
|
-
|
|
36
|
-
),
|
|
37
|
-
"lsf_partition_line": "#BSUB -q {queue_name}".format(
|
|
38
|
-
queue_name=resources.queue_name
|
|
39
|
-
),
|
|
34
|
+
"lsf_ptile_line": f"#BSUB -R 'span[ptile={resources.cpu_per_node}]'",
|
|
35
|
+
"lsf_partition_line": f"#BSUB -q {resources.queue_name}",
|
|
40
36
|
}
|
|
41
37
|
gpu_usage_flag = resources.kwargs.get("gpu_usage", False)
|
|
42
38
|
gpu_new_syntax_flag = resources.kwargs.get("gpu_new_syntax", False)
|
|
@@ -208,9 +208,7 @@ class Machine(metaclass=ABCMeta):
|
|
|
208
208
|
|
|
209
209
|
def check_if_recover(self, submission):
|
|
210
210
|
submission_hash = submission.submission_hash
|
|
211
|
-
submission_file_name = "{submission_hash}.json"
|
|
212
|
-
submission_hash=submission_hash
|
|
213
|
-
)
|
|
211
|
+
submission_file_name = f"{submission_hash}.json"
|
|
214
212
|
if_recover = self.context.check_file_exists(submission_file_name)
|
|
215
213
|
return if_recover
|
|
216
214
|
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import shutil
|
|
3
|
+
import time
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
from bohriumsdk.client import Client
|
|
7
|
+
from bohriumsdk.job import Job
|
|
8
|
+
from bohriumsdk.storage import Storage
|
|
9
|
+
from bohriumsdk.util import Util
|
|
10
|
+
except ModuleNotFoundError:
|
|
11
|
+
found_bohriumsdk = False
|
|
12
|
+
else:
|
|
13
|
+
found_bohriumsdk = True
|
|
14
|
+
|
|
15
|
+
from dpdispatcher import dlog
|
|
16
|
+
from dpdispatcher.JobStatus import JobStatus
|
|
17
|
+
from dpdispatcher.machine import Machine
|
|
18
|
+
|
|
19
|
+
shell_script_header_template = """
|
|
20
|
+
#!/bin/bash -l
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class OpenAPI(Machine):
    """Machine implementation that drives jobs through the ``bohriumsdk`` clients.

    Options are read from ``context.remote_profile``. Job submission, status
    queries, log retrieval and result downloads go through the SDK ``Job``
    and ``Storage`` objects created in ``__init__``.
    """

    def __init__(self, context):
        """Store *context* and build the SDK client objects.

        Parameters
        ----------
        context
            Object providing ``remote_profile`` (copied here) and, for later
            calls, ``local_root``, ``write_local_file`` and
            ``check_file_exists``.

        Raises
        ------
        ModuleNotFoundError
            If ``bohriumsdk`` could not be imported when this module loaded.
        """
        if not found_bohriumsdk:
            raise ModuleNotFoundError(
                "bohriumsdk not installed. Install dpdispatcher with `pip install dpdispatcher[bohrium]`"
            )
        self.context = context
        # Work on a copy so later lookups do not depend on caller-side edits.
        self.remote_profile = context.remote_profile.copy()

        self.grouped = self.remote_profile.get("grouped", True)
        self.client = Client()
        self.job = Job(client=self.client)
        self.storage = Storage(client=self.client)
        self.group_id = None

    def gen_script(self, job):
        """Return the job script text produced by the parent class."""
        return super().gen_script(job)

    def gen_script_header(self, job):
        """Return the fixed script header; *job* is not consulted."""
        return shell_script_header_template

    def gen_local_script(self, job):
        """Generate the script for *job* and write it through the context.

        Returns
        -------
        The script file name (``job.script_file_name``).
        """
        script_str = self.gen_script(job)
        script_file_name = job.script_file_name
        self.context.write_local_file(fname=script_file_name, write_str=script_str)
        return script_file_name

    def _gen_backward_files_list(self, job):
        """Return the de-duplicated backward-file paths of every task in *job*.

        Each entry is ``task.task_work_path`` joined with one of that task's
        ``backward_files``. Duplicates are dropped while keeping first-seen
        order, so the resulting list is reproducible between runs.
        ``job.backward_common_files`` is not added here.
        """
        candidate_paths = (
            os.path.join(task.task_work_path, b_f)
            for task in job.job_task_list
            for b_f in task.backward_files
        )
        # dict keys are unique and keep insertion order.
        return list(dict.fromkeys(candidate_paths))

    def do_submit(self, job):
        """Write the job script locally and register the job via the SDK.

        Sets ``job.job_id`` from the ``"jobId"`` entry of the SDK response
        (``0`` when absent) and marks the job as ``JobStatus.waiting``.

        Returns
        -------
        The value stored in ``job.job_id``.
        """
        self.gen_local_script(job)

        project_id = self.remote_profile.get("project_id", 0)
        # The log file of the first task is the one reported to the service.
        first_task = job.job_task_list[0]

        openapi_params = {
            "oss_path": job.upload_path,
            "input_file_type": 3,
            "input_file_method": 1,
            "job_type": "container",
            "job_name": self.remote_profile.get("job_name", "DP-GEN"),
            "project_id": project_id,
            "scass_type": self.remote_profile.get("machine_type", ""),
            "cmd": f"bash {job.script_file_name}",
            "log_files": os.path.join(first_task.task_work_path, first_task.outlog),
            "out_files": self._gen_backward_files_list(job),
            "platform": self.remote_profile.get("platform", "ali"),
            "image_address": self.remote_profile.get("image_address", ""),
            "job_id": job.job_id,
        }

        data = self.job.insert(**openapi_params)

        job.job_id = data.get("jobId", 0)  # type: ignore
        job.job_state = JobStatus.waiting
        return job.job_id

    def _get_job_detail(self, job_id, group_id):
        """Fetch the detail record of *job_id* from the SDK.

        *group_id* is only used to build the error message.

        Raises
        ------
        AssertionError
            If ``self.job.detail`` returns ``None``.
        """
        check_return = self.job.detail(job_id)
        if check_return is None:
            # Raised explicitly rather than with ``assert`` so the check still
            # runs under ``python -O``; the exception type is unchanged.
            raise AssertionError(
                "Failed to retrieve tasks information. To resubmit this job, please "
                "try again, if this problem still exists please delete the submission "
                "file and try again.\nYou can check submission.submission_hash in the "
                f'previous log or type `grep -rl "{job_id}:job_group_id:{group_id}" '
                "~/.dpdispatcher/dp_cloud_server/` to find corresponding file. "
                "You can try with command:\n "
                f'rm $(grep -rl "{job_id}:job_group_id:{group_id}" ~/.dpdispatcher/dp_cloud_server/)'
            )
        return check_return

    def check_status(self, job):
        """Query the remote state of *job* and map it to a ``JobStatus``.

        When the job is finished its results are downloaded via
        ``_download_job``. When ``remote_profile["output_log"]`` is truthy
        the remote log is printed for finished and running jobs.

        Returns
        -------
        ``JobStatus.unsubmitted`` if ``job.job_id`` is the empty string,
        otherwise the mapped status.

        Raises
        ------
        RuntimeError
            If the detail record has no ``"status"`` entry, even after one
            retry 60 seconds later.
        """
        if job.job_id == "":
            return JobStatus.unsubmitted
        job_id = job.job_id
        group_id = job.jgid if hasattr(job, "jgid") else None

        check_return = self._get_job_detail(job_id, group_id)
        try:
            dp_job_status = check_return["status"]  # type: ignore
        except LookupError:
            # LookupError covers KeyError (mapping without "status") as well
            # as the IndexError that was caught before.
            dlog.error(
                f"cannot find job information in bohrium for job {job.job_id}. check_return:{check_return}; retry one more time after 60 seconds"
            )
            time.sleep(60)
            retry_return = self._get_job_detail(job_id, group_id)
            try:
                dp_job_status = retry_return["status"]  # type: ignore
            except LookupError as e:
                raise RuntimeError(
                    f"cannot find job information in bohrium for job {job.job_id} {check_return} {retry_return}"
                ) from e

        job_state = self.map_dp_job_state(dp_job_status)
        show_log = self.remote_profile.get("output_log")
        if job_state == JobStatus.finished:
            # Only fetch the remote log when it is actually going to be shown.
            if show_log:
                print(self.job.log(job_id), end="")
            self._download_job(job)
        elif show_log and job_state == JobStatus.running:
            print(self.job.log(job_id), end="")
        return job_state

    def _download_job(self, job):
        """Download and unpack the result archive of *job* into ``local_root``.

        Uses the URL of the first entry in the detail record's
        ``jobFiles.outFiles``; returns without doing anything when that URL
        is empty. After unpacking, the archive is moved into
        ``<local_root>/backup``; a failure of that move is logged, not raised.
        """
        data = self.job.detail(job.job_id)
        job_url = data["jobFiles"]["outFiles"][0]["url"]  # type: ignore
        if not job_url:
            return

        local_root = self.context.local_root
        target_result_zip = os.path.join(local_root, job.job_hash + "_back.zip")
        self.storage.download_from_url(job_url, target_result_zip)
        Util.unzip_file(target_result_zip, out_dir=local_root)
        try:
            backup_dir = os.path.join(local_root, "backup")
            os.makedirs(backup_dir, exist_ok=True)
            shutil.move(
                target_result_zip,
                os.path.join(backup_dir, os.path.basename(target_result_zip)),
            )
        except (OSError, shutil.Error) as e:
            dlog.exception("unable to backup file, " + str(e))

    def check_finish_tag(self, job):
        """Return whether the ``<job_hash>_job_tag_finished`` file exists.

        The lookup is delegated to ``self.context.check_file_exists``.
        """
        job_tag_finished = job.job_hash + "_job_tag_finished"
        # Placeholders are required: logging formats ``msg % args`` lazily and
        # extra arguments without them make the record fail to format.
        dlog.info("check if job finished: %s %s", job.job_id, job_tag_finished)
        return self.context.check_file_exists(job_tag_finished)

    def check_if_recover(self, submission):
        """Always return ``False``; *submission* is not inspected."""
        return False

    @staticmethod
    def map_dp_job_state(status):
        """Translate a status code from the service into a ``JobStatus``.

        A value that already is a ``JobStatus`` is returned unchanged. Codes
        missing from the table are logged as errors and mapped to
        ``JobStatus.unknown``.
        """
        if isinstance(status, JobStatus):
            return status
        map_dict = {
            -1: JobStatus.terminated,
            0: JobStatus.waiting,
            1: JobStatus.running,
            2: JobStatus.finished,
            3: JobStatus.waiting,
            4: JobStatus.running,
            5: JobStatus.terminated,
            6: JobStatus.running,
            9: JobStatus.waiting,
        }
        if status not in map_dict:
            dlog.error(f"unknown job status {status}")
            return JobStatus.unknown
        return map_dict[status]
|
|
195
|
+
|
|
196
|
+
# def check_finish_tag(self, job):
|
|
197
|
+
# job_tag_finished = job.job_hash + '_job_tag_finished'
|
|
198
|
+
# return self.context.check_file_exists(job_tag_finished)
|