researchloop 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. researchloop/__init__.py +1 -0
  2. researchloop/__main__.py +3 -0
  3. researchloop/cli.py +1138 -0
  4. researchloop/clusters/__init__.py +4 -0
  5. researchloop/clusters/monitor.py +199 -0
  6. researchloop/clusters/ssh.py +183 -0
  7. researchloop/comms/__init__.py +0 -0
  8. researchloop/comms/base.py +34 -0
  9. researchloop/comms/conversation.py +465 -0
  10. researchloop/comms/ntfy.py +95 -0
  11. researchloop/comms/router.py +71 -0
  12. researchloop/comms/slack.py +188 -0
  13. researchloop/core/__init__.py +0 -0
  14. researchloop/core/auth.py +78 -0
  15. researchloop/core/config.py +328 -0
  16. researchloop/core/credentials.py +38 -0
  17. researchloop/core/models.py +119 -0
  18. researchloop/core/orchestrator.py +910 -0
  19. researchloop/dashboard/__init__.py +0 -0
  20. researchloop/dashboard/app.py +15 -0
  21. researchloop/dashboard/auth.py +60 -0
  22. researchloop/dashboard/routes.py +912 -0
  23. researchloop/dashboard/templates/base.html +84 -0
  24. researchloop/dashboard/templates/login.html +12 -0
  25. researchloop/dashboard/templates/loop_detail.html +58 -0
  26. researchloop/dashboard/templates/loops.html +61 -0
  27. researchloop/dashboard/templates/setup.html +14 -0
  28. researchloop/dashboard/templates/sprint_detail.html +109 -0
  29. researchloop/dashboard/templates/sprints.html +48 -0
  30. researchloop/dashboard/templates/studies.html +18 -0
  31. researchloop/dashboard/templates/study_detail.html +64 -0
  32. researchloop/db/__init__.py +5 -0
  33. researchloop/db/database.py +86 -0
  34. researchloop/db/migrations.py +172 -0
  35. researchloop/db/queries.py +351 -0
  36. researchloop/runner/__init__.py +1 -0
  37. researchloop/runner/claude.py +169 -0
  38. researchloop/runner/job_templates/sge.sh.j2 +319 -0
  39. researchloop/runner/job_templates/slurm.sh.j2 +336 -0
  40. researchloop/runner/main.py +156 -0
  41. researchloop/runner/pipeline.py +272 -0
  42. researchloop/runner/templates/fix_issues.md.j2 +11 -0
  43. researchloop/runner/templates/idea_generator.md.j2 +16 -0
  44. researchloop/runner/templates/red_team.md.j2 +15 -0
  45. researchloop/runner/templates/report.md.j2 +31 -0
  46. researchloop/runner/templates/research_sprint.md.j2 +51 -0
  47. researchloop/runner/templates/summarizer.md.j2 +7 -0
  48. researchloop/runner/upload.py +153 -0
  49. researchloop/schedulers/__init__.py +11 -0
  50. researchloop/schedulers/base.py +43 -0
  51. researchloop/schedulers/local.py +188 -0
  52. researchloop/schedulers/sge.py +163 -0
  53. researchloop/schedulers/slurm.py +179 -0
  54. researchloop/sprints/__init__.py +0 -0
  55. researchloop/sprints/auto_loop.py +458 -0
  56. researchloop/sprints/manager.py +750 -0
  57. researchloop/studies/__init__.py +0 -0
  58. researchloop/studies/manager.py +102 -0
  59. researchloop-0.1.0.dist-info/METADATA +596 -0
  60. researchloop-0.1.0.dist-info/RECORD +63 -0
  61. researchloop-0.1.0.dist-info/WHEEL +4 -0
  62. researchloop-0.1.0.dist-info/entry_points.txt +3 -0
  63. researchloop-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,4 @@
1
+ from researchloop.clusters.monitor import JobMonitor
2
+ from researchloop.clusters.ssh import SSHConnection, SSHManager
3
+
4
+ __all__ = ["SSHConnection", "SSHManager", "JobMonitor"]
@@ -0,0 +1,199 @@
1
+ """Job monitoring - polls active jobs via SSH and updates the database."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import json
7
+ import logging
8
+ from datetime import datetime, timezone
9
+ from typing import Any
10
+
11
+ from researchloop.clusters.ssh import SSHManager
12
+ from researchloop.db import queries
13
+ from researchloop.schedulers.base import BaseScheduler
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ # If a job's heartbeat is older than this many seconds AND the job is not
18
+ # visible in the scheduler queue, consider it abandoned.
19
+ _HEARTBEAT_STALE_SECONDS = 5 * 60 # 5 minutes
20
+
21
+
22
class JobMonitor:
    """Monitors submitted jobs by periodically polling their status.

    The monitor resolves each active sprint to its study, looks up the
    scheduler and cluster configuration by the study's cluster name, asks the
    scheduler for the job status over SSH, and writes terminal states
    (``completed`` / ``failed``) back to the database.
    """

    def __init__(
        self,
        ssh_manager: SSHManager,
        db: Any,
        schedulers: dict[str, BaseScheduler],
        config: Any = None,
    ) -> None:
        """Store collaborators; no polling starts until ``start_polling``.

        Args:
            ssh_manager: Source of SSH connections to the clusters.
            db: Database handle passed through to ``researchloop.db.queries``.
            schedulers: Scheduler instances keyed by cluster name.
            config: Object exposing ``clusters``, an iterable of entries with
                ``name``, ``host``, ``port``, ``user`` and ``key_path``
                attributes. Without it no job can be checked.
        """
        self.ssh_manager = ssh_manager
        self.db = db
        self.schedulers = schedulers
        self.config = config
        self._polling_task: asyncio.Task[None] | None = None
        self._stop_event = asyncio.Event()

    # ------------------------------------------------------------------
    # Single-job check
    # ------------------------------------------------------------------

    async def check_job(self, sprint: dict[str, Any]) -> str:
        """Check the status of the job for *sprint*.

        Returns the scheduler-normalised status string:
        ``pending``, ``running``, ``completed``, ``failed``, or ``unknown``.

        ``unknown`` is also returned when the sprint has no ``job_id``, when
        its study, scheduler or cluster configuration cannot be resolved, or
        when the SSH/scheduler call raises.
        """
        job_id: str | None = sprint.get("job_id")
        if not job_id:
            logger.warning("Sprint %s has no job_id", sprint["id"])
            return "unknown"

        # Resolve scheduler for this sprint's study/cluster.
        study = await queries.get_study(self.db, sprint["study_name"])
        if study is None:
            return "unknown"

        cluster_name = study["cluster"]
        scheduler = self.schedulers.get(cluster_name)
        if scheduler is None:
            logger.debug("No scheduler for cluster %r", cluster_name)
            return "unknown"

        # Resolve cluster config for SSH connection (first entry whose name
        # matches the study's cluster).
        cluster_cfg = None
        if self.config:
            cluster_cfg = next(
                (c for c in self.config.clusters if c.name == cluster_name),
                None,
            )

        if cluster_cfg is None:
            logger.debug("No cluster config for %r", cluster_name)
            return "unknown"

        try:
            ssh = await self.ssh_manager.get_connection(
                {
                    "host": cluster_cfg.host,
                    "port": cluster_cfg.port,
                    "user": cluster_cfg.user,
                    "key_path": cluster_cfg.key_path,
                }
            )
            status = await scheduler.status(ssh, job_id)
        except Exception:
            # Deliberate best-effort: any connection or scheduler failure is
            # reported as "unknown" so one bad cluster cannot stop polling.
            logger.debug("SSH check failed for job %s", job_id, exc_info=True)
            return "unknown"

        logger.info("Sprint %s (job %s) status: %s", sprint["id"], job_id, status)
        return status

    # ------------------------------------------------------------------
    # Poll all active jobs
    # ------------------------------------------------------------------

    @staticmethod
    def _parse_heartbeat(metadata_json: Any) -> datetime | None:
        """Extract a timezone-aware ``last_heartbeat`` from sprint metadata.

        Returns ``None`` when the metadata is missing, is not valid JSON, is
        not a JSON object, has no ``last_heartbeat`` entry, or the entry is
        not an ISO-8601 string. Timestamps without an offset are treated as
        UTC.
        """
        if not metadata_json:
            return None
        try:
            metadata = json.loads(metadata_json)
        except (json.JSONDecodeError, TypeError):
            return None
        if not isinstance(metadata, dict):
            return None

        raw = metadata.get("last_heartbeat")
        if raw is None:
            return None
        try:
            heartbeat = datetime.fromisoformat(raw)
        except (TypeError, ValueError):
            # A corrupt heartbeat must not abort polling of other sprints.
            return None

        if heartbeat.tzinfo is None:
            heartbeat = heartbeat.replace(tzinfo=timezone.utc)
        return heartbeat

    async def poll_active_jobs(self) -> None:
        """Check every active sprint, update the DB, and detect abandoned jobs.

        A sprint whose status is ``unknown`` and whose recorded heartbeat is
        older than ``_HEARTBEAT_STALE_SECONDS`` is marked ``failed``. Only
        terminal states are written to the database.
        """
        sprints = await queries.get_active_sprints(self.db)
        if not sprints:
            logger.debug("No active sprints to poll.")
            return

        logger.info("Polling %d active sprint(s)...", len(sprints))
        now = datetime.now(timezone.utc)

        for sprint in sprints:
            sprint_id: str = sprint["id"]
            try:
                status = await self.check_job(sprint)
            except Exception:
                logger.exception("Error checking status for sprint %s", sprint_id)
                status = "unknown"

            # --- Abandoned-job detection ---
            if status == "unknown":
                heartbeat = self._parse_heartbeat(sprint.get("metadata_json"))
                if heartbeat is not None:
                    stale_seconds = (now - heartbeat).total_seconds()
                    if stale_seconds > _HEARTBEAT_STALE_SECONDS:
                        logger.warning(
                            "Sprint %s appears abandoned: heartbeat %.0fs ago "
                            "and job not in scheduler queue. Marking as failed.",
                            sprint_id,
                            stale_seconds,
                        )
                        status = "failed"

            # Persist terminal states only; pending/running/unknown leave the
            # stored row untouched.
            if status in ("completed", "failed"):
                try:
                    await queries.update_sprint(
                        self.db,
                        sprint_id,
                        status=status,
                        completed_at=datetime.now(timezone.utc).isoformat(),
                    )
                except Exception:
                    logger.exception(
                        "Failed to update DB status for sprint %s", sprint_id
                    )

    # ------------------------------------------------------------------
    # Background polling loop
    # ------------------------------------------------------------------

    async def start_polling(self, interval: int = 120) -> None:
        """Start a background task that polls active jobs every *interval* seconds.

        Does nothing (besides logging a warning) if a polling task is already
        running.
        """
        if self._polling_task is not None and not self._polling_task.done():
            logger.warning("Polling is already running.")
            return

        self._stop_event.clear()
        self._polling_task = asyncio.create_task(
            self._poll_loop(interval), name="job-monitor-poll"
        )
        logger.info("Job monitor polling started (interval=%ds).", interval)

    async def _poll_loop(self, interval: int) -> None:
        """Internal loop executed by the background task."""
        while not self._stop_event.is_set():
            try:
                await self.poll_active_jobs()
            except Exception:
                logger.exception("Unhandled error during job polling")

            # Wait for the interval, but break early if stop is requested.
            try:
                await asyncio.wait_for(self._stop_event.wait(), timeout=interval)
                # If we reach here, stop was requested.
                break
            except asyncio.TimeoutError:
                # Normal timeout - continue polling.
                pass

    async def stop_polling(self) -> None:
        """Stop the background polling task and wait for it to finish."""
        self._stop_event.set()
        if self._polling_task is not None:
            try:
                await self._polling_task
            except asyncio.CancelledError:
                pass
            self._polling_task = None
        logger.info("Job monitor polling stopped.")
@@ -0,0 +1,183 @@
1
+ """SSH connection manager using asyncssh."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import logging
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ import asyncssh
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class SSHConnection:
    """Manages a single SSH connection to a remote host.

    Usable directly (``await conn.connect()`` / ``await conn.close()``) or as
    an async context manager.
    """

    def __init__(
        self,
        host: str,
        port: int,
        user: str,
        key_path: str,
        known_hosts: str | None = None,
    ) -> None:
        """Record connection parameters; no network activity happens here.

        Args:
            host: Remote host name or address.
            port: Remote SSH port.
            user: Login user name.
            key_path: Path of the private key offered for authentication.
            known_hosts: Value forwarded to asyncssh's ``known_hosts`` option.
                ``None`` disables host key verification.
        """
        self.host = host
        self.port = port
        self.user = user
        self.key_path = key_path
        self.known_hosts = known_hosts
        self._conn: asyncssh.SSHClientConnection | None = None

    async def connect(self) -> SSHConnection:
        """Establish the SSH connection and return self.

        If a connection is already held it is closed first so that the old
        transport is not leaked.
        """
        if self._conn is not None:
            await self.close()

        if self.known_hosts is None:
            # SECURITY: passing known_hosts=None turns off host key checking,
            # which leaves the session open to man-in-the-middle attacks.
            # Behaviour is kept for compatibility but is no longer silent.
            logger.warning(
                "Host key verification is disabled for %s@%s:%d "
                "(no known_hosts configured)",
                self.user,
                self.host,
                self.port,
            )

        connect_kwargs: dict[str, Any] = {
            "host": self.host,
            "port": self.port,
            "username": self.user,
            "client_keys": [self.key_path],
            "agent_path": None,  # Don't use SSH agent; we have explicit keys.
            "known_hosts": self.known_hosts,
        }

        logger.info("Connecting to %s@%s:%d", self.user, self.host, self.port)
        self._conn = await asyncssh.connect(**connect_kwargs)
        logger.info("Connected to %s@%s:%d", self.user, self.host, self.port)
        return self

    @property
    def connection(self) -> asyncssh.SSHClientConnection:
        """The underlying asyncssh connection.

        Raises:
            RuntimeError: If ``connect()`` has not been called (or the
                connection has since been closed).
        """
        if self._conn is None:
            raise RuntimeError(
                "SSH connection is not established. Call connect() first."
            )
        return self._conn

    async def run(self, command: str, timeout: float = 30) -> tuple[str, str, int]:
        """Run a command over SSH.

        Args:
            command: Command line to execute on the remote host.
            timeout: Seconds to wait for the command to finish.

        Returns:
            A tuple of (stdout, stderr, exit_code). ``exit_code`` is ``-1``
            when the remote side reported no exit status.

        Raises:
            asyncio.TimeoutError: If the command does not finish in time.
            RuntimeError: If the connection is not established.
        """
        logger.debug("Running command on %s: %s", self.host, command)
        try:
            result = await asyncio.wait_for(
                self.connection.run(command, check=False),
                timeout=timeout,
            )
        except asyncio.TimeoutError:
            logger.error(
                "Command timed out after %.1fs on %s: %s",
                timeout,
                self.host,
                command,
            )
            raise

        stdout = str(result.stdout or "")
        stderr = str(result.stderr or "")
        exit_code = result.exit_status if result.exit_status is not None else -1

        logger.debug("Command on %s finished with exit_code=%d", self.host, exit_code)
        return stdout, stderr, exit_code

    async def upload_file(self, local_path: str, remote_path: str) -> None:
        """Upload a local file to the remote host via SFTP."""
        logger.info("Uploading %s -> %s:%s", local_path, self.host, remote_path)
        async with self.connection.start_sftp_client() as sftp:
            await sftp.put(local_path, remote_path)
        logger.info("Upload complete: %s -> %s:%s", local_path, self.host, remote_path)

    async def download_file(self, remote_path: str, local_path: str) -> None:
        """Download a file from the remote host via SFTP."""
        logger.info("Downloading %s:%s -> %s", self.host, remote_path, local_path)
        # Ensure local parent directory exists.
        Path(local_path).parent.mkdir(parents=True, exist_ok=True)
        async with self.connection.start_sftp_client() as sftp:
            await sftp.get(remote_path, local_path)
        logger.info(
            "Download complete: %s:%s -> %s", self.host, remote_path, local_path
        )

    async def close(self) -> None:
        """Close the SSH connection; a no-op when not connected."""
        conn = self._conn
        if conn is None:
            return
        # Drop the reference before shutting down so this object is left in
        # the "not connected" state even if the shutdown itself raises.
        self._conn = None
        conn.close()
        await conn.wait_closed()
        logger.info(
            "Closed connection to %s@%s:%d", self.user, self.host, self.port
        )

    # --- Context manager support ---

    async def __aenter__(self) -> SSHConnection:
        return await self.connect()

    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        await self.close()
124
+
125
+
126
class SSHManager:
    """Manages a pool of SSH connections keyed by cluster configuration."""

    def __init__(self) -> None:
        """Create an empty pool guarded by a single asyncio lock."""
        self._connections: dict[str, SSHConnection] = {}
        self._lock = asyncio.Lock()

    @staticmethod
    def _config_key(cluster_config: dict[str, Any]) -> str:
        """Derive the pool key ``user@host:port`` from cluster configuration."""
        host = cluster_config["host"]
        port = cluster_config["port"]
        user = cluster_config["user"]
        return f"{user}@{host}:{port}"

    @staticmethod
    def _is_alive(conn: SSHConnection | None) -> bool:
        """Return True if *conn* holds a transport that is not known to be closed.

        Uses the transport's ``is_closed()`` when it offers one; transports
        without that method are assumed to be alive.
        """
        if conn is None:
            return False
        transport = conn._conn
        if transport is None:
            return False
        is_closed = getattr(transport, "is_closed", None)
        if callable(is_closed):
            return not is_closed()
        return True

    async def get_connection(self, cluster_config: dict[str, Any]) -> SSHConnection:
        """Return an existing connection or create a new one.

        ``cluster_config`` must contain at minimum::

            {
                "host": str,
                "port": int,
                "user": str,
                "key_path": str,
                "known_hosts": str | None,  # optional
            }

        A pooled connection whose transport has been closed (for example
        dropped by the remote end) is discarded and replaced rather than
        reused.
        """
        key = self._config_key(cluster_config)

        async with self._lock:
            existing = self._connections.get(key)
            if self._is_alive(existing):
                logger.debug("Reusing existing SSH connection for %s", key)
                return existing

            if existing is not None:
                # Stale entry: evict it and release whatever it still holds.
                self._connections.pop(key, None)
                try:
                    await existing.close()
                except Exception:
                    logger.debug(
                        "Error closing stale SSH connection %s", key, exc_info=True
                    )

            # Create a fresh connection.
            conn = SSHConnection(
                host=cluster_config["host"],
                port=cluster_config["port"],
                user=cluster_config["user"],
                key_path=cluster_config["key_path"],
                known_hosts=cluster_config.get("known_hosts"),
            )
            await conn.connect()
            self._connections[key] = conn
            return conn

    async def close_all(self) -> None:
        """Close every managed SSH connection and empty the pool.

        Errors from individual connections are logged and do not prevent the
        remaining connections from being closed.
        """
        async with self._lock:
            for key, conn in self._connections.items():
                try:
                    await conn.close()
                except Exception:
                    logger.exception("Error closing SSH connection %s", key)
            self._connections.clear()
            logger.info("All SSH connections closed.")
File without changes
@@ -0,0 +1,34 @@
1
+ """Abstract notification interface for researchloop."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+
7
+
8
class BaseNotifier(ABC):
    """Contract that each notification backend has to fulfil.

    Subclasses provide one coroutine per sprint lifecycle event; the class
    cannot be instantiated until all three are overridden.
    """

    @abstractmethod
    async def notify_sprint_started(
        self,
        sprint_id: str,
        study_name: str,
        idea: str,
    ) -> None:
        """Announce that a sprint was submitted to a cluster."""

    @abstractmethod
    async def notify_sprint_completed(
        self,
        sprint_id: str,
        study_name: str,
        summary: str,
        pdf_path: str | None = None,
    ) -> None:
        """Announce that a sprint finished successfully."""

    @abstractmethod
    async def notify_sprint_failed(
        self,
        sprint_id: str,
        study_name: str,
        error: str,
    ) -> None:
        """Announce that a sprint failed."""