oh-my-batch 0.1.0.dev0__py3-none-any.whl → 0.1.0.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oh_my_batch/job.py +35 -43
- {oh_my_batch-0.1.0.dev0.dist-info → oh_my_batch-0.1.0.dev2.dist-info}/METADATA +3 -3
- {oh_my_batch-0.1.0.dev0.dist-info → oh_my_batch-0.1.0.dev2.dist-info}/RECORD +6 -6
- {oh_my_batch-0.1.0.dev0.dist-info → oh_my_batch-0.1.0.dev2.dist-info}/LICENSE +0 -0
- {oh_my_batch-0.1.0.dev0.dist-info → oh_my_batch-0.1.0.dev2.dist-info}/WHEEL +0 -0
- {oh_my_batch-0.1.0.dev0.dist-info → oh_my_batch-0.1.0.dev2.dist-info}/entry_points.txt +0 -0
    
        oh_my_batch/job.py
    CHANGED
    
    | @@ -12,29 +12,22 @@ from .util import expand_globs, shell_run, parse_csv | |
| 12 12 |  | 
| 13 13 | 
             
            logger = logging.getLogger(__name__)
         | 
| 14 14 |  | 
| 15 | 
            +
            class JobState:
         | 
| 16 | 
            +
                NULL = 0
         | 
| 17 | 
            +
                PENDING = 1
         | 
| 18 | 
            +
                RUNNING = 2
         | 
| 19 | 
            +
                CANCELLED = 3
         | 
| 20 | 
            +
                COMPLETED = 4
         | 
| 21 | 
            +
                FAILED = 5
         | 
| 22 | 
            +
                UNKNOWN = 6
         | 
| 15 23 |  | 
| 16 | 
            -
             | 
| 17 | 
            -
                 | 
| 18 | 
            -
             | 
| 19 | 
            -
             | 
| 20 | 
            -
                 | 
| 21 | 
            -
             | 
| 22 | 
            -
                     | 
| 23 | 
            -
                    obj.terminal = terminal
         | 
| 24 | 
            -
                    obj.status_name = status_name
         | 
| 25 | 
            -
                    return obj
         | 
| 26 | 
            -
             | 
| 27 | 
            -
                value: int  # type: ignore
         | 
| 28 | 
            -
                terminal: bool
         | 
| 29 | 
            -
                status_name: str
         | 
| 30 | 
            -
             | 
| 31 | 
            -
                NULL = (0, True, "NULL")
         | 
| 32 | 
            -
                PENDING = (1, False, "PENDING")
         | 
| 33 | 
            -
                RUNNING = (2, False, "RUNNING")
         | 
| 34 | 
            -
                CANCELLED = (3, True, "CANCELLED")
         | 
| 35 | 
            -
                COMPLETED = (4, True, "COMPLETED")
         | 
| 36 | 
            -
                FAILED = (5, True, "FAILED")
         | 
| 37 | 
            -
                UNKNOWN = (6, False, "UNKNOWN")
         | 
| 24 | 
            +
                @classmethod
         | 
| 25 | 
            +
                def is_terminal(cls, state: int):
         | 
| 26 | 
            +
                    return state in (JobState.NULL, JobState.COMPLETED, JobState.FAILED, JobState.CANCELLED)
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                @classmethod
         | 
| 29 | 
            +
                def is_success(cls, state: int):
         | 
| 30 | 
            +
                    return state == JobState.COMPLETED
         | 
| 38 31 |  | 
| 39 32 |  | 
| 40 33 | 
             
            def new_job(script: str):
         | 
| @@ -69,9 +62,6 @@ class BaseJobManager: | |
| 69 62 | 
             
                    scripts = set(os.path.normpath(s) for s in expand_globs(script))
         | 
| 70 63 | 
             
                    logger.info('Scripts to submit: %s', scripts)
         | 
| 71 64 |  | 
| 72 | 
            -
                    if recovery and recover_scripts != scripts:
         | 
| 73 | 
            -
                        raise ValueError('Scripts to submit are different from scripts in recovery file')
         | 
| 74 | 
            -
             | 
| 75 65 | 
             
                    for script_file in scripts:
         | 
| 76 66 | 
             
                        if script_file not in recover_scripts:
         | 
| 77 67 | 
             
                            jobs.append(new_job(script_file))
         | 
| @@ -87,7 +77,7 @@ class BaseJobManager: | |
| 87 77 | 
             
                            break
         | 
| 88 78 |  | 
| 89 79 | 
             
                        # stop if all jobs are terminal and there is no job to be submitted
         | 
| 90 | 
            -
                        if (all(j['state'] | 
| 80 | 
            +
                        if (all(JobState.is_terminal(j['state']) for j in jobs) and
         | 
| 91 81 | 
             
                                not any(should_submit(j, max_tries) for j in jobs)):
         | 
| 92 82 | 
             
                            break
         | 
| 93 83 |  | 
| @@ -108,19 +98,21 @@ class Slurm(BaseJobManager): | |
| 108 98 |  | 
| 109 99 | 
             
                def _update_jobs(self, jobs: List[dict], max_tries: int, submit_opts: str):
         | 
| 110 100 | 
             
                    # query job status
         | 
| 111 | 
            -
                    job_ids =  | 
| 112 | 
            -
                     | 
| 113 | 
            -
             | 
| 114 | 
            -
             | 
| 115 | 
            -
             | 
| 116 | 
            -
             | 
| 117 | 
            -
             | 
| 118 | 
            -
             | 
| 119 | 
            -
             | 
| 120 | 
            -
             | 
| 121 | 
            -
             | 
| 122 | 
            -
             | 
| 123 | 
            -
             | 
| 101 | 
            +
                    job_ids = [j['id'] for j in jobs if j['id']]
         | 
| 102 | 
            +
                    if job_ids:
         | 
| 103 | 
            +
                        query_cmd = f'{self._sacct_bin} -X -P --format=JobID,JobName,State -j {",".join(job_ids)}'
         | 
| 104 | 
            +
                        user = os.environ.get('USER')
         | 
| 105 | 
            +
                        if user:
         | 
| 106 | 
            +
                            query_cmd += f' -u {user}'
         | 
| 107 | 
            +
             | 
| 108 | 
            +
                        cp = shell_run(query_cmd)
         | 
| 109 | 
            +
                        if cp.returncode != 0:
         | 
| 110 | 
            +
                            logger.error('Failed to query job status: %s', cp.stderr.decode('utf-8'))
         | 
| 111 | 
            +
                            return jobs
         | 
| 112 | 
            +
                        logger.info('Job status: %s', cp.stdout.decode('utf-8'))
         | 
| 113 | 
            +
                        new_state = parse_csv(cp.stdout.decode('utf-8'))
         | 
| 114 | 
            +
                    else:
         | 
| 115 | 
            +
                        new_state = []
         | 
| 124 116 |  | 
| 125 117 | 
             
                    for job in jobs:
         | 
| 126 118 | 
             
                        for row in new_state:
         | 
| @@ -130,8 +122,8 @@ class Slurm(BaseJobManager): | |
| 130 122 | 
             
                                    logger.warning('Unknown job %s state: %s',row['JobID'], row['State'])
         | 
| 131 123 | 
             
                                break
         | 
| 132 124 | 
             
                        else:
         | 
| 133 | 
            -
                            job[' | 
| 134 | 
            -
             | 
| 125 | 
            +
                            if job['id']:
         | 
| 126 | 
            +
                                logger.error('Job %s not found in sacct output', job['id'])
         | 
| 135 127 |  | 
| 136 128 | 
             
                    # check if there are jobs to be (re)submitted
         | 
| 137 129 | 
             
                    for job in jobs:
         | 
| @@ -171,8 +163,8 @@ class Slurm(BaseJobManager): | |
| 171 163 |  | 
| 172 164 |  | 
| 173 165 | 
             
            def should_submit(job: dict, max_tries: int):
         | 
| 174 | 
            -
                state:  | 
| 175 | 
            -
                if not state | 
| 166 | 
            +
                state: int = job['state']
         | 
| 167 | 
            +
                if not JobState.is_terminal(state):
         | 
| 176 168 | 
             
                    return False
         | 
| 177 169 | 
             
                if job['tries'] >= max_tries:
         | 
| 178 170 | 
             
                    return False
         | 
| @@ -1,6 +1,6 @@ | |
| 1 1 | 
             
            Metadata-Version: 2.1
         | 
| 2 2 | 
             
            Name: oh-my-batch
         | 
| 3 | 
            -
            Version: 0.1.0. | 
| 3 | 
            +
            Version: 0.1.0.dev2
         | 
| 4 4 | 
             
            Summary: 
         | 
| 5 5 | 
             
            License: GPL
         | 
| 6 6 | 
             
            Author: weihong.xu
         | 
| @@ -118,13 +118,13 @@ You can use `omb job` to track the state of the jobs. | |
| 118 118 | 
             
            ```bash
         | 
| 119 119 |  | 
| 120 120 | 
             
            omb job slurm \
         | 
| 121 | 
            -
                submit tmp/*.slurm --max_tries 3 --wait --recovery lammps. | 
| 121 | 
            +
                submit tmp/*.slurm --max_tries 3 --wait --recovery lammps-jobs.json 
         | 
| 122 122 | 
             
            ```
         | 
| 123 123 |  | 
| 124 124 | 
             
            The above command will submit the batch scripts to the job scheduler,
         | 
| 125 125 | 
             
            and wait for the jobs to finish. If a job fails, it will be retried at most 3 times.
         | 
| 126 126 |  | 
| 127 | 
            -
            The `--recovery` option will save the job information to `lammps. | 
| 127 | 
            +
            The `--recovery` option will save the job information to the `lammps-jobs.json` file;
         | 
| 128 128 | 
             
            if `omb job` is interrupted, you can run the exact same command to recover the job status, 
         | 
| 129 129 | 
             
            so that you don't need to resubmit the jobs that are already submitted.
         | 
| 130 130 |  | 
| @@ -5,10 +5,10 @@ oh_my_batch/assets/functions.sh,sha256=eORxFefV-XrWbG-2I6u-c8uf1XxOQ31LaeVHBumwz | |
| 5 5 | 
             
            oh_my_batch/batch.py,sha256=e73N-xwxMvgxnWwFMp33PQD1Dy-T-ATjANlwtPRHPQM,3016
         | 
| 6 6 | 
             
            oh_my_batch/cli.py,sha256=G_JxqX0Zbx_EbcDxXbYjJ_4O-EOhmkF1lcMWgQ5ZPqo,375
         | 
| 7 7 | 
             
            oh_my_batch/combo.py,sha256=AHFD5CLoczqtjcfl2Rb4A2ucoQU40-cWtDOYjtP-yY4,7680
         | 
| 8 | 
            -
            oh_my_batch/job.py,sha256= | 
| 8 | 
            +
            oh_my_batch/job.py,sha256=_fETBYpuSd_hNHKnXSwYcSU3OXtU7PO-P2QMfhE-Wfs,5788
         | 
| 9 9 | 
             
            oh_my_batch/util.py,sha256=H8B4zVNH5xRp-NG_uypgvtmz2YSpXy_6LK5ROv6SYrc,2116
         | 
| 10 | 
            -
            oh_my_batch-0.1.0. | 
| 11 | 
            -
            oh_my_batch-0.1.0. | 
| 12 | 
            -
            oh_my_batch-0.1.0. | 
| 13 | 
            -
            oh_my_batch-0.1.0. | 
| 14 | 
            -
            oh_my_batch-0.1.0. | 
| 10 | 
            +
            oh_my_batch-0.1.0.dev2.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
         | 
| 11 | 
            +
            oh_my_batch-0.1.0.dev2.dist-info/METADATA,sha256=Novvp5-MhKR9J0Q6Al833vA6GjT7EVRtDv4ADmnluxk,4456
         | 
| 12 | 
            +
            oh_my_batch-0.1.0.dev2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
         | 
| 13 | 
            +
            oh_my_batch-0.1.0.dev2.dist-info/entry_points.txt,sha256=ZY2GutSoNjjSyJ4qO2pTeseKUFgoTYdvmgkuZZkwi68,77
         | 
| 14 | 
            +
            oh_my_batch-0.1.0.dev2.dist-info/RECORD,,
         | 
| 
            File without changes
         | 
| 
            File without changes
         | 
| 
            File without changes
         |