PyPI - hpc-runner - Versions diffs - 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl - Mend

hpc-runner 0.1.1py3-none-any.whl → 0.2.1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

hpc_runner/_version.py +2 -2
hpc_runner/cli/cancel.py +1 -1
hpc_runner/cli/config.py +2 -2
hpc_runner/cli/main.py +17 -13
hpc_runner/cli/monitor.py +30 -0
hpc_runner/cli/run.py +223 -67
hpc_runner/cli/status.py +6 -5
hpc_runner/core/__init__.py +30 -0
hpc_runner/core/descriptors.py +87 -33
hpc_runner/core/exceptions.py +9 -0
hpc_runner/core/job.py +272 -93
hpc_runner/core/job_info.py +104 -0
hpc_runner/core/result.py +4 -0
hpc_runner/schedulers/base.py +148 -30
hpc_runner/schedulers/detection.py +22 -4
hpc_runner/schedulers/local/scheduler.py +119 -2
hpc_runner/schedulers/sge/args.py +161 -94
hpc_runner/schedulers/sge/parser.py +106 -13
hpc_runner/schedulers/sge/scheduler.py +727 -171
hpc_runner/schedulers/sge/templates/batch.sh.j2 +82 -0
hpc_runner/schedulers/sge/templates/interactive.sh.j2 +78 -0
hpc_runner/tui/__init__.py +5 -0
hpc_runner/tui/app.py +436 -0
hpc_runner/tui/components/__init__.py +17 -0
hpc_runner/tui/components/detail_panel.py +187 -0
hpc_runner/tui/components/filter_bar.py +174 -0
hpc_runner/tui/components/filter_popup.py +345 -0
hpc_runner/tui/components/job_table.py +260 -0
hpc_runner/tui/providers/__init__.py +5 -0
hpc_runner/tui/providers/jobs.py +197 -0
hpc_runner/tui/screens/__init__.py +7 -0
hpc_runner/tui/screens/confirm.py +67 -0
hpc_runner/tui/screens/job_details.py +210 -0
hpc_runner/tui/screens/log_viewer.py +170 -0
hpc_runner/tui/snapshot.py +153 -0
hpc_runner/tui/styles/monitor.tcss +567 -0
hpc_runner-0.2.1.dist-info/METADATA +285 -0
hpc_runner-0.2.1.dist-info/RECORD +56 -0
hpc_runner/schedulers/sge/templates/job.sh.j2 +0 -39
hpc_runner-0.1.1.dist-info/METADATA +0 -46
hpc_runner-0.1.1.dist-info/RECORD +0 -38
{hpc_runner-0.1.1.dist-info → hpc_runner-0.2.1.dist-info}/WHEEL +0 -0
{hpc_runner-0.1.1.dist-info → hpc_runner-0.2.1.dist-info}/entry_points.txt +0 -0

hpc_runner/schedulers/sge/scheduler.py CHANGED Viewed

@@ -5,26 +5,55 @@ from __future__ import annotations
 import os
 import subprocess
 import tempfile
+import uuid
+from datetime import datetime
 from pathlib import Path
 from typing import TYPE_CHECKING
 from hpc_runner.core.config import get_config
+def get_script_dir() -> Path:
+    """Get directory for temporary job scripts.
+    Uses HPC_SCRIPT_DIR environment variable if set, otherwise
+    defaults to ~/.cache/hpc-runner/scripts/.
+    Returns:
+        Path to script directory (created if needed).
+    """
+    if env_dir := os.environ.get("HPC_SCRIPT_DIR"):
+        script_dir = Path(env_dir)
+    else:
+        script_dir = Path.home() / ".cache" / "hpc-runner" / "scripts"
+    script_dir.mkdir(parents=True, exist_ok=True)
+    return script_dir
+from hpc_runner.core.job_info import JobInfo
 from hpc_runner.core.result import ArrayJobResult, JobResult, JobStatus
 from hpc_runner.schedulers.base import BaseScheduler
 from hpc_runner.schedulers.sge.args import (
+    SGEArrayArg,
     SGECpuArg,
     SGECwdArg,
     SGEErrorArg,
+    SGEHoldArg,
+    SGEInheritEnvArg,
     SGEJobNameArg,
-    SGEJoinOutputArg,
     SGEMemArg,
+    SGEMergeOutputArg,
     SGEOutputArg,
+    SGEPriorityArg,
     SGEQueueArg,
+    SGEShellArg,
     SGETimeArg,
 )
 from hpc_runner.schedulers.sge.parser import (
     parse_qacct_output,
     parse_qstat_plain,
+    parse_qstat_xml,
     parse_qsub_output,
     state_to_status,
 )
@@ -40,66 +69,316 @@ class SGEScheduler(BaseScheduler):
     name = "sge"
-    # Descriptor-based argument definitions
-    cpu_arg = SGECpuArg()
-    mem_arg = SGEMemArg()
-    time_arg = SGETimeArg()
-    queue_arg = SGEQueueArg()
-    job_name_arg = SGEJobNameArg()
-    stdout_arg = SGEOutputArg()
-    stderr_arg = SGEErrorArg()
-    join_output_arg = SGEJoinOutputArg()
-    cwd_arg = SGECwdArg()
     def __init__(self) -> None:
-        # Load scheduler-specific config
+        """Initialize SGE scheduler with config-driven settings."""
         config = get_config()
         sge_config = config.get_scheduler_config("sge")
+        # Extract config values (also stored as attributes for testing/introspection)
         self.pe_name = sge_config.get("parallel_environment", "smp")
         self.mem_resource = sge_config.get("memory_resource", "mem_free")
         self.time_resource = sge_config.get("time_resource", "h_rt")
-        self.merge_output_default = sge_config.get("merge_output", True)
-    def submit(self, job: "Job", interactive: bool = False) -> JobResult:
-        """Submit a job to SGE."""
+        # Module handling config
+        self.purge_modules = sge_config.get("purge_modules", False)
+        self.silent_modules = sge_config.get("silent_modules", False)
+        self.module_init_script = sge_config.get("module_init_script", "")
+        # Environment handling config
+        self.expand_makeflags = sge_config.get("expand_makeflags", True)
+        self.unset_vars = sge_config.get("unset_vars", [])
+        # Build the argument renderer registry
+        # Maps Job attribute names -> SGE argument renderer instances
+        # Note: 'nodes' and 'tasks' are NOT mapped - they're Slurm/MPI concepts.
+        # If a job has these set, they'll be silently ignored by SGE.
+        self.ARG_RENDERERS = {
+            # Basic attributes
+            "shell": SGEShellArg(),
+            "use_cwd": SGECwdArg(),
+            "inherit_env": SGEInheritEnvArg(),
+            "name": SGEJobNameArg(),
+            "queue": SGEQueueArg(),
+            "priority": SGEPriorityArg(),
+            "stdout": SGEOutputArg(),
+            "stderr": SGEErrorArg(),
+            # Resource attributes (config-driven)
+            "cpu": SGECpuArg(pe_name=self.pe_name),
+            "mem": SGEMemArg(resource_name=self.mem_resource),
+            "time": SGETimeArg(resource_name=self.time_resource),
+        }
+        # Keep references for special-case rendering
+        self._array_arg = SGEArrayArg()
+        self._hold_arg = SGEHoldArg()
+        self._merge_output_arg = SGEMergeOutputArg()
+    # =========================================================================
+    # Script Generation
+    # =========================================================================
+    def generate_script(
+        self,
+        job: "Job",
+        array_range: str | None = None,
+        keep_script: bool = False,
+        script_path: str | None = None,
+    ) -> str:
+        """Generate qsub script using template.
+        Args:
+            job: Job to generate script for.
+            array_range: Array job range string (e.g., "1-100").
+            keep_script: If True, script won't self-delete after execution.
+            script_path: Path where script will be written (for self-deletion).
+        """
+        directives = self._build_directives(job, array_range)
+        return render_template(
+            "sge/templates/batch.sh.j2",
+            job=job,
+            scheduler=self,
+            directives=directives,
+            script_path=script_path,
+            keep_script=keep_script,
+        )
+    def _build_directives(self, job: "Job", array_range: str | None = None) -> list[str]:
+        """Build complete list of #$ directives for the job.
+        Uses the rendering protocol from BaseScheduler, then adds
+        special cases that aren't simple attribute mappings.
+        """
+        directives: list[str] = []
+        # 1. Render standard attributes via protocol
+        directives.extend(self.render_directives(job))
+        # 2. Handle output merging (derived from stderr being None)
+        if job.merge_output:
+            if d := self._merge_output_arg.to_directive(True):
+                directives.append(d)
+        # 3. Array job range
+        if array_range:
+            if d := self._array_arg.to_directive(array_range):
+                directives.append(d)
+        # 4. Dependencies
+        dep_str = self._build_dependency_string(job)
+        if dep_str:
+            if d := self._hold_arg.to_directive(dep_str):
+                directives.append(d)
+        # 5. Custom resources (ResourceSet)
+        for resource in job.resources:
+            directives.append(f"#$ -l {resource.name}={resource.value}")
+        # 6. Raw passthrough arguments
+        for arg in job.raw_args + job.sge_args:
+            if arg.startswith("-"):
+                directives.append(f"#$ {arg}")
+            else:
+                directives.append(f"#$ -{arg}")
+        return directives
+    def _build_dependency_string(self, job: "Job") -> str | None:
+        """Build SGE dependency string from job dependencies."""
+        # String-based dependency from CLI
+        if job.dependency:
+            if ":" in job.dependency:
+                return job.dependency.split(":", 1)[1]
+            return job.dependency
+        # Programmatic dependencies from Job.after()
+        if job.dependencies:
+            return ",".join(dep.job_id for dep in job.dependencies)
+        return None
+    # =========================================================================
+    # Command Building
+    # =========================================================================
+    def build_submit_command(self, job: "Job") -> list[str]:
+        """Build qsub command line."""
+        cmd = ["qsub"]
+        cmd.extend(self.render_args(job))
+        cmd.extend(job.raw_args)
+        cmd.extend(job.sge_args)
+        return cmd
+    def build_interactive_command(self, job: "Job") -> list[str]:
+        """Build qrsh command for interactive jobs.
+        Note: qrsh supports a subset of qsub options. Notably:
+        - Does NOT support: -S (shell), -o/-e (output), -j (join), -N (name)
+        - Does support: -V, -pe, -l, -q, -cwd
+        """
+        import shlex
+        cmd = ["qrsh"]
+        # Only include qrsh-compatible options
+        QRSH_COMPATIBLE = {"inherit_env", "use_cwd", "cpu", "mem", "time", "queue"}
+        for attr_name, value in job.iter_attributes():
+            if attr_name not in QRSH_COMPATIBLE:
+                continue
+            renderer = self.ARG_RENDERERS.get(attr_name)
+            if renderer:
+                cmd.extend(renderer.to_args(value))
+        cmd.extend(job.raw_args)
+        cmd.extend(job.sge_args)
+        # Add the command - split it back into parts for proper argument handling
+        # This preserves quoting: "bash -c 'echo hello'" -> ['bash', '-c', 'echo hello']
+        cmd.extend(shlex.split(job.command))
+        return cmd
+    # =========================================================================
+    # Job Submission
+    # =========================================================================
+    def submit(
+        self, job: "Job", interactive: bool = False, keep_script: bool = False
+    ) -> JobResult:
+        """Submit a job to SGE.
+        Args:
+            job: Job to submit.
+            interactive: If True, run interactively via qrsh.
+            keep_script: If True, don't delete the job script after submission.
+                         Useful for debugging.
+        """
         if interactive:
-            return self._submit_interactive(job)
-        return self._submit_batch(job)
+            return self._submit_interactive(job, keep_script=keep_script)
+        return self._submit_batch(job, keep_script=keep_script)
-    def _submit_batch(self, job: "Job") -> JobResult:
+    def _submit_batch(self, job: "Job", keep_script: bool = False) -> JobResult:
         """Submit via qsub."""
-        script = self.generate_script(job)
+        # Determine script path first (needed for self-deletion in template)
+        script_dir = get_script_dir()
+        script_name = f"hpc_batch_{uuid.uuid4().hex[:8]}.sh"
+        script_path = script_dir / script_name
+        # Generate script with cleanup instruction
+        script = self.generate_script(
+            job, keep_script=keep_script, script_path=str(script_path)
+        )
-        with tempfile.NamedTemporaryFile(
-            mode="w", suffix=".sh", delete=False, prefix="hpc_"
-        ) as f:
-            f.write(script)
-            script_path = f.name
+        script_path.write_text(script)
+        script_path.chmod(0o755)
+        if keep_script:
+            import sys
+            print(f"Script saved: {script_path}", file=sys.stderr)
         try:
-            cmd = ["qsub", script_path]
-            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+            result = subprocess.run(
+                ["qsub", str(script_path)],
+                capture_output=True,
+                text=True,
+                errors="replace",
+                check=True,
+            )
             job_id = parse_qsub_output(result.stdout)
             if job_id is None:
-                raise RuntimeError(f"Failed to parse job ID from qsub output: {result.stdout}")
+                raise RuntimeError(f"Failed to parse job ID: {result.stdout}")
             return JobResult(job_id=job_id, scheduler=self, job=job)
         finally:
-            Path(script_path).unlink(missing_ok=True)
+            # Clean up locally after qsub (script is copied to spool)
+            # The script inside the job will also self-delete unless keep_script
+            if not keep_script:
+                script_path.unlink(missing_ok=True)
+    def _submit_interactive(self, job: "Job", keep_script: bool = False) -> JobResult:
+        """Submit via qrsh for interactive execution.
-    def _submit_interactive(self, job: "Job") -> JobResult:
-        """Submit via qrsh for interactive execution."""
-        cmd = self.build_interactive_command(job)
+        Creates a wrapper script with full environment setup (modules, venv, etc.)
+        and executes it via qrsh. The script self-deletes after execution unless
+        keep_script is True.
+        Note: Script is written to ~/.cache/hpc-runner/scripts/ (shared filesystem)
+        rather than /tmp (which is node-local).
+        """
+        # Generate unique script path in shared script directory
+        script_dir = get_script_dir()
+        script_name = f"hpc_interactive_{uuid.uuid4().hex[:8]}.sh"
+        script_path = script_dir / script_name
+        # Generate wrapper script with the actual path (for self-deletion)
+        script = self._generate_interactive_script(
+            job, str(script_path), keep_script=keep_script
+        )
+        # Write script to shared filesystem
+        script_path.write_text(script)
+        script_path.chmod(0o755)
+        if keep_script:
+            # Print script path for debugging
+            import sys
+            print(f"Script saved: {script_path}", file=sys.stderr)
+        # Build qrsh command with script path
+        cmd = self._build_qrsh_command(job, str(script_path))
+        # Run and capture exit code
         result = subprocess.run(cmd, check=False)
-        # For interactive jobs, we don't have a real job ID
-        return JobResult(job_id="interactive", scheduler=self, job=job)
+        # Clean up if script still exists and we're not keeping it
+        if not keep_script:
+            script_path.unlink(missing_ok=True)
+        return JobResult(
+            job_id="interactive",
+            scheduler=self,
+            job=job,
+            _exit_code=result.returncode,
+        )
+    def _generate_interactive_script(
+        self, job: "Job", script_path: str, keep_script: bool = False
+    ) -> str:
+        """Generate wrapper script for interactive jobs."""
+        return render_template(
+            "sge/templates/interactive.sh.j2",
+            job=job,
+            scheduler=self,
+            script_path=script_path,
+            keep_script=keep_script,
+        )
+    def _build_qrsh_command(self, job: "Job", script_path: str) -> list[str]:
+        """Build qrsh command to run wrapper script."""
+        cmd = ["qrsh"]
+        # Only include qrsh-compatible options
+        QRSH_COMPATIBLE = {"inherit_env", "use_cwd", "cpu", "mem", "time", "queue"}
+        for attr_name, value in job.iter_attributes():
+            if attr_name not in QRSH_COMPATIBLE:
+                continue
+            renderer = self.ARG_RENDERERS.get(attr_name)
+            if renderer:
+                cmd.extend(renderer.to_args(value))
+        cmd.extend(job.raw_args)
+        cmd.extend(job.sge_args)
+        # Execute the wrapper script
+        cmd.append(script_path)
+        return cmd
     def submit_array(self, array: "JobArray") -> ArrayJobResult:
         """Submit array job."""
-        job = array.job
-        script = self.generate_script(job, array_range=array.range_str)
+        script = self.generate_script(array.job, array_range=array.range_str)
         with tempfile.NamedTemporaryFile(
             mode="w", suffix=".sh", delete=False, prefix="hpc_"
@@ -108,17 +387,25 @@ class SGEScheduler(BaseScheduler):
             script_path = f.name
         try:
-            cmd = ["qsub", script_path]
-            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+            result = subprocess.run(
+                ["qsub", script_path],
+                capture_output=True,
+                text=True,
+                check=True,
+            )
             job_id = parse_qsub_output(result.stdout)
             if job_id is None:
-                raise RuntimeError(f"Failed to parse job ID from qsub output: {result.stdout}")
+                raise RuntimeError(f"Failed to parse job ID: {result.stdout}")
             return ArrayJobResult(base_job_id=job_id, scheduler=self, array=array)
         finally:
             Path(script_path).unlink(missing_ok=True)
+    # =========================================================================
+    # Job Management
+    # =========================================================================
     def cancel(self, job_id: str) -> bool:
         """Cancel a job via qdel."""
         try:
@@ -128,7 +415,7 @@ class SGEScheduler(BaseScheduler):
             return False
     def get_status(self, job_id: str) -> JobStatus:
-        """Get job status via qstat."""
+        """Get job status via qstat/qacct."""
         # Try qstat first (running/pending jobs)
         try:
             result = subprocess.run(
@@ -137,30 +424,17 @@ class SGEScheduler(BaseScheduler):
                 text=True,
             )
             if result.returncode == 0:
-                # Job exists, check state from regular qstat
-                result2 = subprocess.run(
-                    ["qstat"],
-                    capture_output=True,
-                    text=True,
-                )
+                result2 = subprocess.run(["qstat"], capture_output=True, text=True)
                 if result2.returncode == 0:
                     jobs = parse_qstat_plain(result2.stdout)
-                    # Handle array job task IDs (e.g., 12345.1)
                     base_id = job_id.split(".")[0]
                     if base_id in jobs:
-                        state = jobs[base_id].get("state", "")
-                        return state_to_status(state)
-                    # Check if full ID matches
-                    if job_id in jobs:
-                        state = jobs[job_id].get("state", "")
-                        return state_to_status(state)
-                # Job exists but not in qstat output - likely running
+                        return state_to_status(jobs[base_id].get("state", ""))
                 return JobStatus.RUNNING
         except subprocess.CalledProcessError:
             pass
-        # Job not in qstat, check qacct for completed jobs
+        # Check qacct for completed jobs
         try:
             result = subprocess.run(
                 ["qacct", "-j", job_id],
@@ -169,11 +443,9 @@ class SGEScheduler(BaseScheduler):
             )
             if result.returncode == 0:
                 info = parse_qacct_output(result.stdout)
-                exit_status = info.get("exit_status", "")
-                if exit_status == "0":
+                if info.get("exit_status") == "0":
                     return JobStatus.COMPLETED
-                else:
-                    return JobStatus.FAILED
+                return JobStatus.FAILED
         except subprocess.CalledProcessError:
             pass
@@ -196,130 +468,414 @@ class SGEScheduler(BaseScheduler):
             pass
         return None
-    def get_output_path(self, job_id: str, stream: str) -> Path | None:
-        """Determine output path.
-        SGE uses patterns that need to be resolved.
+    # =========================================================================
+    # TUI Monitor API
+    # =========================================================================
+    def list_active_jobs(
+        self,
+        user: str | None = None,
+        status: set[JobStatus] | None = None,
+        queue: str | None = None,
+    ) -> list[JobInfo]:
+        """List active SGE jobs using qstat -xml.
+        Args:
+            user: Filter by username. None = all users.
+            status: Filter by status set. None = all active statuses.
+            queue: Filter by queue name. None = all queues.
+        Returns:
+            List of JobInfo for matching active jobs.
         """
-        # This is tricky with SGE as paths can use $JOB_ID, etc.
-        # For now, return None and let user check
-        return None
-    def generate_script(self, job: "Job", array_range: str | None = None) -> str:
-        """Generate qsub script using template."""
-        directives = self._build_directives(job, array_range)
-        return render_template(
-            "sge/templates/job.sh.j2",
-            job=job,
-            scheduler=self,
-            directives=directives,
-        )
-    def _build_directives(self, job: "Job", array_range: str | None = None) -> list[str]:
-        """Build #$ directives."""
-        directives: list[str] = []
-        # Shell
-        directives.append("#$ -S /bin/bash")
-        # Use current working directory
-        if job.workdir is None:
-            directives.append("#$ -cwd")
+        # Build qstat command
+        cmd = ["qstat", "-xml"]
+        if user:
+            cmd.extend(["-u", user])
+        else:
+            # Show all users' jobs
+            cmd.extend(["-u", "*"])
-        # Job name
-        if job.name:
-            directives.append(f"#$ -N {job.name}")
+        try:
+            result = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                errors="replace",
+                check=True,
+            )
+        except subprocess.CalledProcessError:
+            # qstat failed - likely no jobs or scheduler not available
+            return []
+        except FileNotFoundError:
+            # qstat not found
+            return []
+        # Parse XML output
+        parsed_jobs = parse_qstat_xml(result.stdout)
+        # Convert to JobInfo and apply filters
+        jobs: list[JobInfo] = []
+        for job_id, job_data in parsed_jobs.items():
+            # Convert state to JobStatus
+            state_str = job_data.get("state", "")
+            job_status = state_to_status(state_str)
+            # Apply status filter
+            if status is not None and job_status not in status:
+                continue
+            # Apply queue filter
+            job_queue = job_data.get("queue")
+            if queue is not None and job_queue != queue:
+                continue
+            # Build JobInfo
+            job_info = JobInfo(
+                job_id=job_id,
+                name=job_data.get("name", job_id),
+                user=job_data.get("user", "unknown"),
+                status=job_status,
+                queue=job_queue,
+                cpu=job_data.get("slots"),
+                node=job_data.get("node"),
+            )
-        # CPU/slots via parallel environment
-        if job.cpu:
-            directives.append(f"#$ -pe {self.pe_name} {job.cpu}")
+            # Add timing info if available
+            if "submit_time" in job_data:
+                job_info.submit_time = datetime.fromtimestamp(job_data["submit_time"])
+            if "start_time" in job_data:
+                job_info.start_time = datetime.fromtimestamp(job_data["start_time"])
+                # Calculate runtime for running jobs
+                if job_info.status == JobStatus.RUNNING:
+                    job_info.runtime = datetime.now() - job_info.start_time
+            # Array task ID
+            if "array_task_id" in job_data:
+                try:
+                    job_info.array_task_id = int(job_data["array_task_id"])
+                except ValueError:
+                    pass  # Could be a range like "1-10"
+            jobs.append(job_info)
+        return jobs
+    def list_completed_jobs(
+        self,
+        user: str | None = None,
+        since: datetime | None = None,
+        until: datetime | None = None,
+        exit_code: int | None = None,
+        queue: str | None = None,
+        limit: int = 100,
+    ) -> list[JobInfo]:
+        """List completed SGE jobs from qacct.
+        TODO: Implement using qacct.
+        """
+        raise NotImplementedError("SGE list_completed_jobs() not yet implemented")
-        # Memory
-        if job.mem:
-            directives.append(f"#$ -l {self.mem_resource}={job.mem}")
+    def has_accounting(self) -> bool:
+        """Check if SGE accounting is available."""
+        return True
-        # Time
-        if job.time:
-            directives.append(f"#$ -l {self.time_resource}={job.time}")
+    def get_job_details(self, job_id: str) -> tuple[JobInfo, dict[str, object]]:
+        """Get detailed information for an SGE job using qstat -j -xml.
-        # Queue
-        if job.queue:
-            directives.append(f"#$ -q {job.queue}")
+        Parses the full job details including output paths, resources, etc.
-        # Output handling - merge by default
-        if job.merge_output:
-            directives.append("#$ -j y")
-            if job.stdout:
-                directives.append(f"#$ -o {job.stdout}")
+        Returns:
+            Tuple of (JobInfo, extra_details dict).
+            The extra_details dict contains resources, pe_name, pe_range,
+            cwd, script_file, dependencies, project, department.
+        """
+        cmd = ["qstat", "-j", job_id, "-xml"]
+        try:
+            result = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                errors="replace",
+                check=True,
+            )
+            output = result.stdout
+        except subprocess.CalledProcessError as exc:
+            output = exc.stdout or exc.stderr or ""
+            if not output:
+                raise ValueError(f"Job {job_id} not found")
+        except FileNotFoundError:
+            raise RuntimeError("qstat not found")
+        # Parse XML output
+        job_data = self._parse_qstat_j_xml(output)
+        if not job_data and output:
+            raise ValueError(f"Job {job_id} not found")
+        # Separate extra details from JobInfo fields
+        extra_details: dict[str, object] = {}
+        for key in (
+            "resources",
+            "pe_name",
+            "pe_range",
+            "cwd",
+            "script_file",
+            "dependencies",
+            "project",
+            "department",
+            "job_args",
+            "command",
+        ):
+            if key in job_data:
+                extra_details[key] = job_data[key]
+        # Get basic info from qstat -xml first
+        basic_jobs = self.list_active_jobs()
+        basic_info = next((j for j in basic_jobs if j.job_id == job_id), None)
+        if basic_info:
+            # Merge detailed info with basic info
+            if job_data.get("stdout_path"):
+                basic_info.stdout_path = job_data["stdout_path"]
+            if job_data.get("stderr_path"):
+                basic_info.stderr_path = job_data["stderr_path"]
+            if job_data.get("node"):
+                basic_info.node = job_data["node"]
+            # Always use timing from detailed qstat -j output (more reliable)
+            if job_data.get("submit_time"):
+                basic_info.submit_time = datetime.fromtimestamp(job_data["submit_time"])
+            if job_data.get("start_time"):
+                basic_info.start_time = datetime.fromtimestamp(job_data["start_time"])
+                # Calculate runtime if running
+                if basic_info.status == JobStatus.RUNNING:
+                    basic_info.runtime = datetime.now() - basic_info.start_time
+            return basic_info, extra_details
         else:
-            if job.stdout:
-                directives.append(f"#$ -o {job.stdout}")
-            if job.stderr:
-                directives.append(f"#$ -e {job.stderr}")
-        # Array job
-        if array_range:
-            directives.append(f"#$ -t {array_range}")
-        # Resources (GRES-style)
-        for resource in job.resources:
-            directives.append(f"#$ -l {resource.name}={resource.value}")
-        # Dependencies
-        if job.dependencies:
-            dep_ids = ",".join(dep.job_id for dep in job.dependencies)
-            # SGE uses -hold_jid for dependencies
-            directives.append(f"#$ -hold_jid {dep_ids}")
-        # Raw args
-        for arg in job.raw_args + job.sge_args:
-            if arg.startswith("-"):
-                directives.append(f"#$ {arg}")
+            # Build from scratch using qstat -j data
+            job_info = JobInfo(
+                job_id=job_id,
+                name=job_data.get("name", job_id),
+                user=job_data.get("user", "unknown"),
+                status=job_data.get("status", JobStatus.UNKNOWN),
+                queue=job_data.get("queue"),
+                stdout_path=job_data.get("stdout_path"),
+                stderr_path=job_data.get("stderr_path"),
+                node=job_data.get("node"),
+            )
+            # Add timing info
+            if job_data.get("submit_time"):
+                job_info.submit_time = datetime.fromtimestamp(job_data["submit_time"])
+            if job_data.get("start_time"):
+                job_info.start_time = datetime.fromtimestamp(job_data["start_time"])
+                if job_info.status == JobStatus.RUNNING:
+                    job_info.runtime = datetime.now() - job_info.start_time
+            return job_info, extra_details
+    def _parse_qstat_j_xml(self, xml_output: str) -> dict[str, object]:
+        """Parse qstat -j -xml output to extract job details.
+        Returns a dict with:
+        - Basic: name, user, stdout_path, stderr_path
+        - Resources: dict of resource_name -> value
+        - PE: pe_name, pe_range
+        - Paths: cwd, script_file
+        - Dependencies: list of job IDs
+        - Other: project, department
+        """
+        import xml.etree.ElementTree as ET
+        data: dict[str, object] = {}
+        root = self._parse_xml_root(xml_output)
+        if root is None:
+            return data
+        self._strip_xml_namespaces(root)
+        # Find job info element
+        job_info = root.find(".//JB_job_number/..")
+        if job_info is None:
+            # Try alternative structure
+            job_info = root.find(".//djob_info/element")
+        if job_info is None:
+            return data
+        # Extract basic fields
+        name_elem = job_info.find(".//JB_job_name")
+        if name_elem is not None and name_elem.text:
+            data["name"] = name_elem.text
+        owner_elem = job_info.find(".//JB_owner")
+        if owner_elem is not None and owner_elem.text:
+            data["user"] = owner_elem.text
+        # Project and department
+        project_elem = job_info.find(".//JB_project")
+        if project_elem is not None and project_elem.text:
+            data["project"] = project_elem.text
+        dept_elem = job_info.find(".//JB_department")
+        if dept_elem is not None and dept_elem.text:
+            data["department"] = dept_elem.text
+        # Get cwd for resolving relative paths
+        cwd: Path | None = None
+        cwd_elem = job_info.find(".//JB_cwd")
+        if cwd_elem is not None and cwd_elem.text:
+            cwd = Path(cwd_elem.text)
+            data["cwd"] = str(cwd)
+        # Script file
+        script_elem = job_info.find(".//JB_script_file")
+        if script_elem is not None and script_elem.text:
+            data["script_file"] = script_elem.text
+        # Job arguments/command
+        job_args: list[str] = []
+        for arg_elem in job_info.findall(".//JB_job_args//ST_name"):
+            if arg_elem.text:
+                job_args.append(arg_elem.text)
+        if job_args:
+            data["job_args"] = job_args
+        # Submission time
+        submit_text = job_info.findtext(".//JB_submission_time")
+        if submit_text:
+            try:
+                data["submit_time"] = int(submit_text)
+            except ValueError:
+                pass
+        # Start time (for running jobs) - in JB_ja_tasks/ulong_sublist/JAT_start_time
+        task_start_text = job_info.findtext(
+            ".//JB_ja_tasks/ulong_sublist/JAT_start_time"
+        )
+        if task_start_text:
+            try:
+                data["start_time"] = int(task_start_text)
+            except ValueError:
+                pass
+        # Also check direct JAT_start_time (alternative structure)
+        if "start_time" not in data:
+            start_text = job_info.findtext(".//JAT_start_time")
+            if start_text:
+                try:
+                    data["start_time"] = int(start_text)
+                except ValueError:
+                    pass
+        # For interactive jobs (qrsh), get command from QRSH_COMMAND env var
+        for env_elem in job_info.findall(".//JB_env_list/job_sublist"):
+            var_elem = env_elem.find("VA_variable")
+            val_elem = env_elem.find("VA_value")
+            if var_elem is not None and var_elem.text == "QRSH_COMMAND":
+                if val_elem is not None and val_elem.text:
+                    data["command"] = self._normalize_qrsh_command(val_elem.text)
+                break
+        # stdout path - look for PN_path in stdout_path_list
+        stdout_path_elem = job_info.find(".//JB_stdout_path_list//PN_path")
+        if stdout_path_elem is not None and stdout_path_elem.text:
+            stdout_path = Path(stdout_path_elem.text)
+            # Resolve relative paths against cwd
+            if not stdout_path.is_absolute() and cwd:
+                stdout_path = cwd / stdout_path
+            data["stdout_path"] = stdout_path
+        # stderr path
+        stderr_path_elem = job_info.find(".//JB_stderr_path_list//PN_path")
+        if stderr_path_elem is not None and stderr_path_elem.text:
+            stderr_path = Path(stderr_path_elem.text)
+            if not stderr_path.is_absolute() and cwd:
+                stderr_path = cwd / stderr_path
+            data["stderr_path"] = stderr_path
+        # Check merge flag
+        merge_elem = job_info.find(".//JB_merge_stderr")
+        if merge_elem is not None and merge_elem.text:
+            if merge_elem.text.lower() in ("true", "1", "y"):
+                data["merge"] = True
+        # If merge is enabled and we have stdout but no stderr, use stdout for both
+        if data.get("merge") and data.get("stdout_path") and not data.get("stderr_path"):
+            data["stderr_path"] = data["stdout_path"]
+        # Parse hard resource list
+        resources: dict[str, str] = {}
+        for qstat_elem in job_info.findall(".//JB_hard_resource_list/qstat_l_requests"):
+            res_name_elem = qstat_elem.find("CE_name")
+            res_val_elem = qstat_elem.find("CE_stringval")
+            if res_name_elem is not None and res_name_elem.text:
+                res_name = res_name_elem.text
+                res_val = res_val_elem.text if res_val_elem is not None else ""
+                resources[res_name] = res_val or ""
+        # Also check soft resources
+        for qstat_elem in job_info.findall(".//JB_soft_resource_list/qstat_l_requests"):
+            res_name_elem = qstat_elem.find("CE_name")
+            res_val_elem = qstat_elem.find("CE_stringval")
+            if res_name_elem is not None and res_name_elem.text:
+                res_name = res_name_elem.text
+                res_val = res_val_elem.text if res_val_elem is not None else ""
+                resources[f"{res_name} (soft)"] = res_val or ""
+        if resources:
+            data["resources"] = resources
+        # Parallel environment
+        pe_elem = job_info.find(".//JB_pe")
+        if pe_elem is not None and pe_elem.text:
+            data["pe_name"] = pe_elem.text
+        # PE range (min-max slots)
+        pe_range_min = job_info.find(".//JB_pe_range//RN_min")
+        pe_range_max = job_info.find(".//JB_pe_range//RN_max")
+        if pe_range_min is not None and pe_range_max is not None:
+            min_val = pe_range_min.text or "1"
+            max_val = pe_range_max.text or "1"
+            if min_val == max_val:
+                data["pe_range"] = min_val
             else:
-                directives.append(f"#$ -{arg}")
+                data["pe_range"] = f"{min_val}-{max_val}"
-        return directives
-    def build_submit_command(self, job: "Job") -> list[str]:
-        """Build qsub command line."""
-        cmd = ["qsub"]
-        if job.name:
-            cmd.extend(["-N", job.name])
-        if job.cpu:
-            cmd.extend(["-pe", self.pe_name, str(job.cpu)])
-        if job.mem:
-            cmd.extend(["-l", f"{self.mem_resource}={job.mem}"])
-        if job.time:
-            cmd.extend(["-l", f"{self.time_resource}={job.time}"])
-        if job.queue:
-            cmd.extend(["-q", job.queue])
-        cmd.extend(job.raw_args)
-        cmd.extend(job.sge_args)
+        # Dependencies (predecessor jobs)
+        dependencies: list[str] = []
+        for dep_elem in job_info.findall(".//JB_jid_predecessor_list//JRE_job_number"):
+            if dep_elem.text:
+                dependencies.append(dep_elem.text)
+        if dependencies:
+            data["dependencies"] = dependencies
-        return cmd
+        return data
-    def build_interactive_command(self, job: "Job") -> list[str]:
-        """Build qrsh command for interactive jobs."""
-        cmd = ["qrsh"]
+    def _strip_xml_namespaces(self, root: "ET.Element") -> None:
+        """Strip namespaces so ElementTree can match tag names directly."""
+        import xml.etree.ElementTree as ET
-        if job.cpu:
-            cmd.extend(["-pe", self.pe_name, str(job.cpu)])
-        if job.mem:
-            cmd.extend(["-l", f"{self.mem_resource}={job.mem}"])
-        if job.time:
-            cmd.extend(["-l", f"{self.time_resource}={job.time}"])
-        if job.queue:
-            cmd.extend(["-q", job.queue])
+        for elem in root.iter():
+            if isinstance(elem.tag, str) and "}" in elem.tag:
+                elem.tag = elem.tag.split("}", 1)[1]
-        cmd.extend(job.raw_args)
-        cmd.extend(job.sge_args)
+    def _normalize_qrsh_command(self, value: str) -> str:
+        """Normalize QRSH_COMMAND by replacing non-ASCII separators with spaces."""
+        cleaned = "".join(ch if 32 <= ord(ch) < 127 else " " for ch in value)
+        return " ".join(cleaned.split())
-        # Add the command
-        if isinstance(job.command, str):
-            cmd.append(job.command)
-        else:
-            cmd.extend(job.command)
+    def _parse_xml_root(self, xml_output: str) -> "ET.Element | None":
+        """Parse XML output, tolerating leading/trailing non-XML noise."""
+        import xml.etree.ElementTree as ET
-        return cmd
+        try:
+            return ET.fromstring(xml_output)
+        except ET.ParseError:
+            pass
+        start = xml_output.find("<")
+        end = xml_output.rfind(">")
+        if start == -1 or end == -1 or end <= start:
+            return None
+        try:
+            return ET.fromstring(xml_output[start : end + 1])
+        except ET.ParseError:
+            return None

hpc-runner 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl

hpc-runner 0.1.1py3-none-any.whl → 0.2.1py3-none-any.whl