omneval-devloop 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- devloop/__init__.py +27 -0
- devloop/cluster.py +79 -0
- devloop/dev_loop.py +395 -0
- devloop/dev_loop_logic.py +66 -0
- devloop/github_ops.py +167 -0
- devloop/k8s_jobs.py +367 -0
- devloop/projects.py +121 -0
- devloop/schedules.py +82 -0
- devloop/shared.py +244 -0
- devloop/summarization.py +69 -0
- devloop/summarize_activities.py +130 -0
- devloop/webhook.py +105 -0
- devloop/worker.py +124 -0
- devloop/workflows.py +25 -0
- omneval_devloop-0.0.1.dist-info/METADATA +11 -0
- omneval_devloop-0.0.1.dist-info/RECORD +18 -0
- omneval_devloop-0.0.1.dist-info/WHEEL +4 -0
- omneval_devloop-0.0.1.dist-info/licenses/LICENSE +201 -0
devloop/github_ops.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""GitHub REST activities for the Dev Loop (issues #22, #23).
|
|
2
|
+
|
|
3
|
+
Network access is via ``httpx`` against the GitHub REST API. Each enrolled
|
|
4
|
+
project carries its own scoped GitHub token (``github_token_secret`` in the
|
|
5
|
+
registry); the token is resolved per project from that Secret at call time, so
|
|
6
|
+
different orgs/owners use different credentials.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
import os
|
|
13
|
+
import re
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
from temporalio import activity
|
|
18
|
+
|
|
19
|
+
from . import cluster
|
|
20
|
+
from .projects import ProjectConfig, get_project, parse_github_repo
|
|
21
|
+
from .shared import OpenAgentPRsInput, PostCommentsInput
|
|
22
|
+
|
|
23
|
+
# Module logger, named after this module.
log = logging.getLogger(__name__)

# Base URL every GitHub REST call is made against; the ``GITHUB_API``
# environment variable overrides the public endpoint.
GITHUB_API = os.environ.get("GITHUB_API", "https://api.github.com")
|
|
26
|
+
|
|
27
|
+
# Agent issue branches are named ``agent/issue-<N>[-slug]`` (see entrypoint.py).
_AGENT_BRANCH = re.compile(r"^agent/issue-(\d+)")


def agent_pr_issue_numbers(pulls: list[dict[str, Any]]) -> list[int]:
    """Issue numbers that already have an open agent PR.

    Pure helper (no network) so it is unit-testable. Reads each PR's head branch
    and matches the ``agent/issue-<N>`` convention the execute phase pushes. Used
    by the Dev Loop planner to skip issues whose work is already up for human
    review — under the PR-review merge model an issue stays *open* until its PR
    is merged, so without this filter the planner would re-surface it every round.

    Args:
        pulls: Pull-request objects; only ``pr["head"]["ref"]`` is read.

    Returns:
        The matching issue numbers, de-duplicated and sorted ascending. PRs with
        a missing/null ``head`` or a missing/non-string ``ref`` are ignored.
    """
    nums: set[int] = set()
    for pr in pulls:
        head = pr.get("head") or {}
        ref = head.get("ref")
        # A null or non-string ref would make ``re.match`` raise TypeError;
        # such a PR cannot be an agent branch, so skip it.
        if not isinstance(ref, str):
            continue
        m = _AGENT_BRANCH.match(ref)
        if m:
            nums.add(int(m.group(1)))
    return sorted(nums)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# --------------------------------------------------------------------------- #
|
|
50
|
+
# HTTP helpers
|
|
51
|
+
# --------------------------------------------------------------------------- #
|
|
52
|
+
def _headers(token: str) -> dict[str, str]:
    """Build the headers sent with every GitHub REST request.

    Args:
        token: GitHub token placed in the ``Authorization`` header as a bearer
            credential.

    Returns:
        A new dict with the ``Authorization``, ``Accept`` and
        ``X-GitHub-Api-Version`` headers.
    """
    headers: dict[str, str] = {"Authorization": f"Bearer {token}"}
    headers["Accept"] = "application/vnd.github+json"
    headers["X-GitHub-Api-Version"] = "2022-11-28"
    return headers
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _client(cfg: ProjectConfig):
    """Create an ``httpx.Client`` for the GitHub API, authenticated for ``cfg``.

    The token is read at call time from the ``GITHUB_TOKEN`` key of the Secret
    named by ``cfg.github_token_secret``, so each project uses its own
    credential. Requests are sent relative to ``GITHUB_API`` with a 30 second
    timeout. The caller owns the returned client and is expected to close it
    (the activities in this module use it as a context manager).
    """
    # Imported here rather than at module level, so importing this module does
    # not itself require httpx.
    import httpx

    secret_name = cfg.github_token_secret
    token = cluster.read_secret_value(secret_name, "GITHUB_TOKEN")
    auth_headers = _headers(token)
    return httpx.Client(base_url=GITHUB_API, headers=auth_headers, timeout=30.0)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# --------------------------------------------------------------------------- #
|
|
68
|
+
# Activity inputs
|
|
69
|
+
# --------------------------------------------------------------------------- #
|
|
70
|
+
@dataclass
class NewIssue:
    """One issue to be filed: its title and its body text."""

    title: str
    body: str


@dataclass
class FileIssuesInput:
    """Input of the ``file_issues`` activity."""

    # Registry id of the project whose repository receives the issues.
    project_id: str
    # Issues to create, in order; every instance gets its own empty list by default.
    issues: list[NewIssue] = field(default_factory=list)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# --------------------------------------------------------------------------- #
|
|
83
|
+
# Activities
|
|
84
|
+
# --------------------------------------------------------------------------- #
|
|
85
|
+
@activity.defn
async def post_pr_comments(inp: PostCommentsInput) -> None:
    """Post the reviewer's findings to a PR.

    Called by the Dev Loop after the review Agent Execution Job returns its
    ``review`` payload. Two things are posted:

    * a PR-level summary comment — required; any HTTP failure propagates so
      the activity fails;
    * line-anchored inline review comments, bundled into one ``COMMENT`` review
      pinned to the PR's current head commit — best-effort; a failure is logged
      and swallowed.

    The inline step must not fail the activity: by then the summary comment
    already exists, so a retry of the activity would post the summary again.

    Raises:
        KeyError: if ``inp.project_id`` is not in the project registry.
        httpx.HTTPError: if the summary comment cannot be posted.
    """
    # Local import, matching ``_client``; only needed for the exception type.
    import httpx

    cfg = get_project(inp.project_id)
    repo = parse_github_repo(cfg.github_url)
    # NOTE(review): the httpx client is synchronous, so each request blocks this
    # async activity while it is in flight — confirm that is acceptable for the
    # worker's event loop.
    with _client(cfg) as c:
        # PR-level summary comment (PR comments go through the issues endpoint).
        c.post(
            f"/repos/{repo}/issues/{inp.pr_number}/comments",
            json={"body": f"### Agent review\n\n{inp.summary}"},
        ).raise_for_status()

        if not inp.inline_comments:
            return

        # Inline review comments (best-effort; needs the head commit SHA).
        comments = [
            {"path": ic.file, "line": ic.line, "side": "RIGHT", "body": ic.body}
            for ic in inp.inline_comments
        ]
        try:
            pr = c.get(f"/repos/{repo}/pulls/{inp.pr_number}")
            pr.raise_for_status()
            commit_id = pr.json()["head"]["sha"]
            c.post(
                f"/repos/{repo}/pulls/{inp.pr_number}/reviews",
                json={"commit_id": commit_id, "event": "COMMENT", "comments": comments},
            ).raise_for_status()
        except httpx.HTTPError as exc:
            # Covers both error status codes and transport failures. Swallowed
            # on purpose: the summary is already posted, see docstring.
            log.warning(
                "could not post %d inline comment(s) to %s#%d: %s",
                len(comments),
                repo,
                inp.pr_number,
                exc,
            )
            return
        log.info(
            "posted %d inline comment(s) to %s#%d",
            len(inp.inline_comments),
            repo,
            inp.pr_number,
        )
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
@activity.defn
async def file_issues(inp: FileIssuesInput) -> list[int]:
    """File new agent-ready issues in the project's repository.

    The seam for the forthcoming QA Validator agent, which files follow-up
    issues for problems it finds; not yet wired into any workflow.

    Each issue in ``inp.issues`` is created in order and labelled with the
    project's ``agent_label``. The first HTTP error aborts the loop and
    propagates; issues created before it stay filed.

    Returns:
        The GitHub issue numbers of the created issues, in creation order.
    """
    project = get_project(inp.project_id)
    repo = parse_github_repo(project.github_url)
    endpoint = f"/repos/{repo}/issues"
    created: list[int] = []
    with _client(project) as gh:
        for item in inp.issues:
            payload = {
                "title": item.title,
                "body": item.body,
                "labels": [project.agent_label],
            }
            resp = gh.post(endpoint, json=payload)
            resp.raise_for_status()
            created.append(resp.json()["number"])
    log.info("filed %d new issue(s) in %s: %s", len(created), repo, created)
    return created
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
@activity.defn
async def open_agent_pr_issue_numbers(inp: OpenAgentPRsInput) -> list[int]:
    """Return issue numbers that already have an open agent PR (head branch
    ``agent/issue-<N>``). The Dev Loop planner uses this to drop issues whose
    work is awaiting human review on a PR, so they aren't re-planned each round.

    All open pull requests of the project's repository are listed page by page
    and handed to :func:`agent_pr_issue_numbers`.

    Returns:
        Sorted, de-duplicated issue numbers.

    Raises:
        KeyError: if ``inp.project_id`` is not in the project registry.
        httpx.HTTPStatusError: if GitHub answers a page request with an error.
    """
    cfg = get_project(inp.project_id)
    repo = parse_github_repo(cfg.github_url)
    per_page = 100
    pulls: list[dict[str, Any]] = []
    with _client(cfg) as c:
        page = 1
        while True:
            resp = c.get(
                f"/repos/{repo}/pulls",
                params={"state": "open", "per_page": per_page, "page": page},
            )
            resp.raise_for_status()
            batch = resp.json()
            pulls.extend(batch)
            # A page shorter than ``per_page`` (including an empty one) is the
            # last page, so stop here instead of requesting one more empty page.
            if len(batch) < per_page:
                break
            page += 1
    numbers = agent_pr_issue_numbers(pulls)
    log.info("issues with open agent PRs in %s: %s", repo, numbers)
    return numbers
|
devloop/k8s_jobs.py
ADDED
|
@@ -0,0 +1,367 @@
|
|
|
1
|
+
"""Kubernetes Job dispatch for Agent Execution Jobs (issue #18).
|
|
2
|
+
|
|
3
|
+
A single Temporal activity, ``dispatch_agent_job``, renders a ``batch/v1`` Job
|
|
4
|
+
from a Project Registry entry, creates it in the ``agents`` namespace, polls it
|
|
5
|
+
to a terminal state, and reads the output ConfigMap the Job writes.
|
|
6
|
+
|
|
7
|
+
Design notes
|
|
8
|
+
------------
|
|
9
|
+
* The kubernetes client is reached through the ``cluster`` module
|
|
10
|
+
(``cluster.batch()`` / ``cluster.core()`` and its ConfigMap helpers) so unit
|
|
11
|
+
tests can monkeypatch one seam without a cluster.
|
|
12
|
+
* A failed Job (or a Job whose output ConfigMap reports ``failed``) raises an
|
|
13
|
+
exception so Temporal's retry policy (max 3) re-runs the activity. Each
|
|
14
|
+
attempt gets a fresh Job name (``…-a<attempt>``).
|
|
15
|
+
* If the Job reports ``awaiting_human`` (a mid-run blocking question, issue
|
|
16
|
+
#21) the activity returns that result *without* deleting the Job — it stays
|
|
17
|
+
Running, polling its input ConfigMap for the answer. The workflow then calls
|
|
18
|
+
:func:`answer_agent_job` and :func:`await_agent_job`.
|
|
19
|
+
* ``cleanup_agent_job`` deletes the Job and ConfigMap after a retention window.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import asyncio
|
|
25
|
+
import json
|
|
26
|
+
import logging
|
|
27
|
+
import os
|
|
28
|
+
|
|
29
|
+
from temporalio import activity
|
|
30
|
+
from temporalio.exceptions import ApplicationError
|
|
31
|
+
|
|
32
|
+
from . import cluster
|
|
33
|
+
from .cluster import NAMESPACE
|
|
34
|
+
from .shared import (
|
|
35
|
+
KEY_HUMAN_ANSWER,
|
|
36
|
+
KEY_RESULT,
|
|
37
|
+
AgentJobResult,
|
|
38
|
+
AnswerInput,
|
|
39
|
+
AwaitInput,
|
|
40
|
+
DispatchInput,
|
|
41
|
+
JobStatus,
|
|
42
|
+
)
|
|
43
|
+
from .projects import get_project
|
|
44
|
+
|
|
45
|
+
# Module logger, named after this module.
log = logging.getLogger(__name__)

# Values rendered into every Agent Execution Job. Each one is read once, at
# import time, from the worker's environment and falls back to the default
# shown here.
SERVICE_ACCOUNT = os.environ.get("AGENT_JOB_SERVICE_ACCOUNT", "agent-job")
TEMPORAL_HOST = os.environ.get("TEMPORAL_HOST", "localhost:7233")
OMNEVAL_OTLP_ENDPOINT = os.environ.get(
    "OMNEVAL_OTLP_ENDPOINT",
    "http://omneval-ingest.omneval.svc.cluster.local:8000",
)
OPENAI_BASE_URL = os.environ.get("AGENT_OPENAI_BASE_URL", "http://192.168.68.104/v1")
AGENT_BASE_IMAGE = os.environ.get(
    "AGENT_BASE_IMAGE",
    "ghcr.io/omneval/devloop-agent-base:latest",
)

# Defaults; overridable via DispatchInput for tests.
DEFAULT_CPU = os.environ.get("AGENT_JOB_CPU", "2")
DEFAULT_MEMORY = os.environ.get("AGENT_JOB_MEMORY", "4Gi")
# Seconds; parsed as an int at import time.
JOB_ACTIVE_DEADLINE = int(os.environ.get("AGENT_JOB_ACTIVE_DEADLINE", "7200"))
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# --------------------------------------------------------------------------- #
|
|
64
|
+
# Job spec rendering
|
|
65
|
+
# --------------------------------------------------------------------------- #
|
|
66
|
+
def job_name_for(d: DispatchInput, attempt: int, discriminator: str = "") -> str:
    """Build a Job name. Dev-loop jobs disambiguate by issue number; jobs with
    no issue (Alert Response diagnosis) use ``discriminator`` (a per-workflow
    hash) so two concurrent/rapid alerts don't collide on the same Job name and
    read each other's stale output ConfigMap.

    The name has the shape ``agent-<project>-<phase>[-<issue>|-<discriminator>]-a<attempt>``,
    lower-cased with underscores turned into hyphens, and is at most 60
    characters long. When it would be longer, the ``agent-<project>-<phase>``
    head is shortened — never the tail — so the parts that make the name
    unique (issue number / discriminator and attempt) always survive. Names
    that already fit are returned unchanged.
    """
    max_len = 60
    spec = d.task_spec

    def normalize(text: str) -> str:
        return text.replace("_", "-").lower()

    head = normalize(f"agent-{spec.project_id}-{spec.phase}")
    tail = ""
    if d.issue_number:
        tail = f"-{d.issue_number}"
    elif discriminator:
        tail = f"-{discriminator}"
    tail = normalize(f"{tail}-a{attempt}")

    budget = max(max_len - len(tail), 0)
    if len(head) > budget:
        # Cut the head only, and drop any hyphen left dangling at the cut so
        # it does not double up with the hyphen that starts the tail.
        head = head[:budget].rstrip("-")
    return head + tail
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _resolve_job_refs(d: DispatchInput):
    """Resolve (image, omneval_secret, github_url, default_branch, github_token_secret).

    Each value is the matching ``*_override`` on ``d`` when that is set,
    otherwise the field of the project's registry entry. When the project is
    not in the registry (alert-response style jobs), the fallbacks are
    ``AGENT_BASE_IMAGE`` for the image, ``"main"`` for the branch and the empty
    string for the rest. ``default_branch`` has no override. An empty
    ``github_token_secret`` is allowed — jobs such as diagnosis need no GitHub
    access.

    Raises:
        ValueError: if no omneval ingest secret could be resolved.
    """
    try:
        project = get_project(d.project_id)
    except KeyError:
        project = None

    # Registry-side fallbacks, used wherever the dispatch input has no override.
    if project:
        fallback_image = project.agent_image
        fallback_omneval = project.omneval_ingest_secret
        fallback_url = project.github_url
        default_branch = project.default_branch
        fallback_token = project.github_token_secret
    else:
        fallback_image = AGENT_BASE_IMAGE
        fallback_omneval = ""
        fallback_url = ""
        default_branch = "main"
        fallback_token = ""

    image = d.image_override or fallback_image
    omneval_secret = d.omneval_secret_override or fallback_omneval
    github_url = d.github_url_override or fallback_url
    github_token_secret = d.github_token_secret_override or fallback_token

    if omneval_secret:
        return image, omneval_secret, github_url, default_branch, github_token_secret
    raise ValueError(
        f"no omneval ingest secret for project {d.project_id!r} "
        "(set omneval_secret_override or add a registry entry)"
    )
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def render_job(d: DispatchInput, job_name: str) -> dict:
    """Render the ``batch/v1`` Job manifest for an Agent Execution Job.

    The manifest describes a single ``agent`` container that runs the agent
    entrypoint with an environment assembled from the dispatch input, the
    project registry and the worker's own environment. The Job is never
    retried by Kubernetes (``backoffLimit`` 0) and its output ConfigMap is
    expected under the same name as the Job (``OUTPUT_CONFIGMAP``).

    Args:
        d: Dispatch request (project id, task spec, overrides, retention).
        job_name: Name for the Job.

    Returns:
        The manifest as a plain dict.

    Raises:
        ValueError: if no omneval ingest secret can be resolved for the job.
    """
    refs = _resolve_job_refs(d)
    image, omneval_secret, github_url, default_branch, github_token_secret = refs
    spec = d.task_spec

    def plain(name: str, value: str) -> dict:
        # Env entry carrying a literal value.
        return {"name": name, "value": value}

    def from_secret(name: str, secret: str, key: str) -> dict:
        # Env entry whose value comes from ``key`` of Secret ``secret``.
        return {"name": name, "valueFrom": {"secretKeyRef": {"name": secret, "key": key}}}

    otlp_endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT", OMNEVAL_OTLP_ENDPOINT)
    # If the worker has an explicit override use it; otherwise default to
    # omneval X-API-Key substitution referencing the secret env var.
    otlp_headers = os.environ.get(
        "OTEL_EXPORTER_OTLP_HEADERS", "x-api-key=$(OMNEVAL_API_KEY)"
    )

    # omneval ingest is X-API-Key auth, NOT bearer; the omneval project is
    # resolved server-side from the key, so no project_id is sent.
    env = [
        plain("TASK_SPEC", spec.to_env_value()),
        plain("PROJECT_ID", d.project_id),
        plain("GITHUB_URL", github_url),
        plain("DEFAULT_BRANCH", default_branch),
        plain("TEMPORAL_HOST", TEMPORAL_HOST),
        plain("OUTPUT_CONFIGMAP", job_name),
        plain("OPENAI_BASE_URL", OPENAI_BASE_URL),
        # OTLP / omneval tracing
        plain("OTEL_EXPORTER_OTLP_PROTOCOL", "http/protobuf"),
        plain("OTEL_EXPORTER_OTLP_ENDPOINT", otlp_endpoint),
        # Listed before OTEL_EXPORTER_OTLP_HEADERS, which references it.
        from_secret("OMNEVAL_API_KEY", omneval_secret, "api-key"),
        plain("OTEL_EXPORTER_OTLP_HEADERS", otlp_headers),
        # OTEL_SERVICE_NAME is set to the phase so spans are tagged per-phase;
        # the worker's own OTEL_SERVICE_NAME is intentionally NOT inherited here.
        plain("OTEL_SERVICE_NAME", spec.phase),
    ]

    # Pass through LLM connection env and stub flag from the worker process so
    # the agent entrypoint can reach the DGX model endpoint and take the stub
    # fast-path when AGENT_STUB=1. Only include vars that are actually set —
    # skip missing ones rather than forwarding empty strings.
    passthrough = ("AGENT_MODEL", "AGENT_LLM_BASE_URL", "AGENT_LLM_API_KEY", "AGENT_STUB")
    env.extend(plain(var, os.environ[var]) for var in passthrough if os.environ.get(var))

    # Reviewer the merge phase tags on the PR it opens (assignee + @-mention).
    # Sourced from the registry; absent for non-registry (alert-response) jobs.
    try:
        reviewer = get_project(d.project_id).pr_reviewer
    except KeyError:
        reviewer = ""
    if reviewer:
        env.append(plain("PR_REVIEWER", reviewer))

    # Per-project GitHub token (scoped to that owner/org). Omitted for jobs that
    # need no GitHub access, e.g. Alert Response diagnosis.
    # ``GITHUB_TOKEN`` is used for git push/clone; ``GH_TOKEN`` is consumed by
    # the ``gh`` CLI (``gh issue list``) inside the agent sandbox.
    if github_token_secret:
        env.extend(
            from_secret(var, github_token_secret, "GITHUB_TOKEN")
            for var in ("GITHUB_TOKEN", "GH_TOKEN")
        )

    container = {
        "name": "agent",
        "image": image,
        "command": ["python", "/usr/local/bin/agent-entrypoint.py"],
        "env": env,
        "resources": {
            "requests": {"cpu": DEFAULT_CPU, "memory": DEFAULT_MEMORY},
            "limits": {"memory": DEFAULT_MEMORY},
        },
    }
    pod_template = {
        "metadata": {"labels": {"agents.homelab/project": d.project_id}},
        "spec": {
            "restartPolicy": "Never",
            "serviceAccountName": d.service_account_override or SERVICE_ACCOUNT,
            "containers": [container],
        },
    }
    metadata = {
        "name": job_name,
        "namespace": NAMESPACE,
        "labels": {
            "app.kubernetes.io/managed-by": "orchestration-worker",
            "agents.homelab/project": d.project_id,
            "agents.homelab/phase": spec.phase,
        },
    }
    return {
        "apiVersion": "batch/v1",
        "kind": "Job",
        "metadata": metadata,
        "spec": {
            "backoffLimit": 0,  # Temporal owns retries, not the Job controller
            "activeDeadlineSeconds": JOB_ACTIVE_DEADLINE,
            "ttlSecondsAfterFinished": int(d.retention_seconds),
            "template": pod_template,
        },
    }
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
# --------------------------------------------------------------------------- #
|
|
241
|
+
# ConfigMap read/write helpers
|
|
242
|
+
# --------------------------------------------------------------------------- #
|
|
243
|
+
def _read_output(job_name: str) -> dict | None:
    """Read the result payload from the Job's output ConfigMap.

    The ConfigMap has the same name as the Job; the payload is the JSON
    document stored under ``KEY_RESULT``.

    Returns:
        The decoded payload, or ``None`` when the ConfigMap has no data or the
        result key is missing or empty (i.e. nothing has been written yet).
    """
    data = cluster.read_configmap_data(job_name)
    raw = data.get(KEY_RESULT) if data else None
    return json.loads(raw) if raw else None
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def _job_terminal(job) -> str | None:
    """Classify a batch/v1 Job as ``'complete'``, ``'failed'`` or not terminal.

    Accepts either a client object (status read from attributes) or a plain
    dict (status read from keys). A truthy ``succeeded`` count wins over a
    truthy ``failed`` count; ``None`` is returned when neither is set.
    """

    def pick(source, name):
        # Look ``name`` up as a key on dicts and as an attribute otherwise.
        if isinstance(source, dict):
            return source.get(name)
        return getattr(source, name, None)

    if isinstance(job, dict):
        status = job.get("status", {})
    else:
        status = job.status

    if pick(status, "succeeded"):
        return "complete"
    if pick(status, "failed"):
        return "failed"
    return None
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
# --------------------------------------------------------------------------- #
|
|
275
|
+
# Polling
|
|
276
|
+
# --------------------------------------------------------------------------- #
|
|
277
|
+
async def _poll_to_terminal(
    batch, job_name: str, poll_interval: float
) -> AgentJobResult:
    """Poll a running Job until terminal or until it asks a human a question.

    Every round first checks the output ConfigMap for an ``awaiting_human``
    payload, then the Job status; if neither settles it, sleeps
    ``poll_interval`` seconds and tries again.

    Returns:
        The result built from the output payload — either the
        ``awaiting_human`` payload, or the payload of a completed Job (a bare
        ``complete`` status when the Job wrote no output).

    Raises:
        ApplicationError: type ``AgentJobFailed`` when the Job failed; the
            message carries the payload's ``error`` if there is one.
    """
    awaiting_human = JobStatus.AWAITING_HUMAN.value
    while True:
        # A parked Job is still Running, so the question is only visible in
        # the output ConfigMap — look there before asking for the Job status.
        output = _read_output(job_name)
        if output and output.get("status") == awaiting_human:
            log.info("job %s is awaiting a human reply", job_name)
            return AgentJobResult.from_payload(output, job_name)

        state = _job_terminal(batch.read_namespaced_job_status(job_name, NAMESPACE))
        if state == "failed":
            output = _read_output(job_name) or {}
            err = output.get("error", f"Job {job_name} failed without output")
            raise ApplicationError(f"agent job failed: {err}", type="AgentJobFailed")
        if state == "complete":
            # Re-read: the final payload may have landed after the check above.
            output = _read_output(job_name) or {"status": JobStatus.COMPLETE.value}
            return AgentJobResult.from_payload(output, job_name)

        await asyncio.sleep(poll_interval)
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
# --------------------------------------------------------------------------- #
|
|
301
|
+
# Activities
|
|
302
|
+
# --------------------------------------------------------------------------- #
|
|
303
|
+
@activity.defn
async def dispatch_agent_job(d: DispatchInput) -> AgentJobResult:
    """Render + create an Agent Execution Job, then poll it to completion.

    The Job name embeds the activity attempt number, so each attempt targets
    its own Job. If a Job with that name already exists (HTTP 409 on create),
    the activity attaches to it and polls it instead of failing.

    Returns:
        The Job's result, or its ``awaiting_human`` result when the Job parks
        on a question.

    Raises:
        ApiException: for any create failure other than 409.
        ApplicationError: when the Job fails (raised while polling).
    """
    info = activity.info()
    attempt = info.attempt

    # Jobs without an issue number (Alert Response diagnosis) share a name across
    # workflows; disambiguate by a hash of the workflow id so concurrent alerts
    # get distinct Jobs/ConfigMaps. Stable across retries (same workflow run).
    discriminator = ""
    if not d.issue_number:
        import hashlib

        digest = hashlib.sha1(info.workflow_id.encode()).hexdigest()
        discriminator = digest[:8]

    job_name = job_name_for(d, attempt, discriminator)
    manifest = render_job(d, job_name)

    batch = cluster.batch()

    from kubernetes.client.exceptions import ApiException

    try:
        batch.create_namespaced_job(NAMESPACE, manifest)
    except ApiException as exc:
        if getattr(exc, "status", None) != 409:  # already exists (retry attach)
            raise
        log.info("job %s already exists, attaching", job_name)
    else:
        log.info("created job %s (attempt %d)", job_name, attempt)

    return await _poll_to_terminal(batch, job_name, d.poll_interval_seconds)
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
@activity.defn
async def answer_agent_job(inp: AnswerInput) -> None:
    """Deliver a human's reply to a parked Job.

    The answer is patched into the ConfigMap named after the Job, under
    ``KEY_HUMAN_ANSWER``, so the Job can pick it up and resume.
    """
    reply = {KEY_HUMAN_ANSWER: inp.answer}
    cluster.patch_configmap_data(inp.job_name, reply)
    log.info("answered job %s", inp.job_name)
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
@activity.defn
async def await_agent_job(inp: AwaitInput) -> AgentJobResult:
    """Resume polling a Job that was previously parked on a human question.

    Returns and raises exactly as the shared polling loop does: the Job's
    result (or another ``awaiting_human`` result), or ``ApplicationError``
    when the Job fails.
    """
    batch = cluster.batch()
    result = await _poll_to_terminal(batch, inp.job_name, inp.poll_interval_seconds)
    return result
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
@activity.defn
async def cleanup_agent_job(job_name: str) -> None:
    """Delete the Job and its output ConfigMap (best-effort).

    Both deletions are always attempted, Job first. An ``ApiException`` never
    propagates: a 404 (already gone) is ignored silently, anything else is
    logged as a warning.
    """
    from kubernetes.client import V1DeleteOptions
    from kubernetes.client.exceptions import ApiException

    batch, core = cluster.batch(), cluster.core()

    def delete_job():
        # Background propagation: the API call returns without waiting for
        # the Job's dependents to be removed.
        options = V1DeleteOptions(propagation_policy="Background")
        batch.delete_namespaced_job(job_name, NAMESPACE, body=options)

    def delete_configmap():
        core.delete_namespaced_config_map(job_name, NAMESPACE)

    for step in (delete_job, delete_configmap):
        try:
            step()
        except ApiException as exc:
            if getattr(exc, "status", None) == 404:
                continue
            log.warning("cleanup error for %s: %s", job_name, exc)
|
devloop/projects.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""Project registry loader for the Orchestration Worker.
|
|
2
|
+
|
|
3
|
+
Reads agents/projects.yaml and surfaces typed ProjectConfig objects.
|
|
4
|
+
No dynamic reload — restart the worker to pick up registry changes.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
import yaml
|
|
15
|
+
|
|
16
|
+
# Module logger, named after this module.
logger = logging.getLogger(__name__)

# Keys every project entry in projects.yaml must carry; an entry missing any of
# them is rejected when the registry is loaded.
_REQUIRED_FIELDS = (
    "id",
    "github_url",
    "default_branch",
    "agent_image",
    "agent_label",
    "discord_channel",
    "omneval_ingest_secret",
    "github_token_secret",
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass(frozen=True)
class ProjectConfig:
    """One immutable project entry of the project registry."""

    id: str
    github_url: str
    default_branch: str
    agent_image: str
    agent_label: str
    discord_channel: str
    omneval_ingest_secret: str
    # Name of the Secret (agents ns, key "GITHUB_TOKEN") that holds this
    # project's scoped GitHub token. It is per-project so that every org/owner
    # has its own credential — the worker resolves it per project and the Agent
    # Execution Job mounts it.
    github_token_secret: str
    # GitHub login to tag for review (assignee + @-mention) on the PR the merge
    # phase opens. Optional; when empty the PR is opened untagged.
    pr_reviewer: str = ""
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def load_projects(path: str | Path) -> list[ProjectConfig]:
    """Parse projects.yaml and return a list of ProjectConfig.

    The file is read as UTF-8 and must contain a mapping whose ``projects`` key
    holds a list of project mappings. An empty file, a missing ``projects``
    key and ``projects: null`` all yield an empty list.

    Args:
        path: Location of the registry file.

    Returns:
        One ``ProjectConfig`` per entry, in file order.

    Raises:
        ValueError: if the document or an entry has the wrong shape, or if any
            project entry is missing required fields.
        OSError: if the file cannot be read.
    """
    raw = Path(path).read_text(encoding="utf-8")
    data = yaml.safe_load(raw)
    # ``safe_load`` returns None for an empty document; treat it as "no projects".
    if data is None:
        data = {}
    if not isinstance(data, dict):
        raise ValueError(
            f"Project registry must be a mapping with a 'projects' key, got {type(data).__name__}"
        )
    # ``projects:`` with no value parses to None.
    entries = data.get("projects") or []
    if not isinstance(entries, list):
        raise ValueError(
            f"'projects' must be a list of project entries, got {type(entries).__name__}"
        )

    configs: list[ProjectConfig] = []
    for entry in entries:
        if not isinstance(entry, dict):
            raise ValueError(f"Project entry must be a mapping, got {entry!r}")
        missing = [f for f in _REQUIRED_FIELDS if f not in entry]
        if missing:
            raise ValueError(
                f"Project entry missing required fields: {missing!r} in {entry!r}"
            )
        configs.append(
            ProjectConfig(
                id=entry["id"],
                github_url=entry["github_url"],
                default_branch=entry["default_branch"],
                agent_image=entry["agent_image"],
                agent_label=entry["agent_label"],
                discord_channel=entry["discord_channel"],
                omneval_ingest_secret=entry["omneval_ingest_secret"],
                github_token_secret=entry["github_token_secret"],
                pr_reviewer=entry.get("pr_reviewer", ""),
            )
        )
    logger.info(
        "loaded %d project%s: %s",
        len(configs),
        "" if len(configs) == 1 else "s",
        ", ".join(c.id for c in configs),
    )
    return configs
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# ---------------------------------------------------------------------------
|
|
86
|
+
# Process-wide registry.
|
|
87
|
+
#
|
|
88
|
+
# The worker calls install_registry() once at startup; activities then resolve
|
|
89
|
+
# project configs by id without re-reading the file (no dynamic reload — a
|
|
90
|
+
# worker restart is required to pick up registry changes).
|
|
91
|
+
# ---------------------------------------------------------------------------
|
|
92
|
+
|
|
93
|
+
# Process-wide registry: project id -> config. Empty until install_registry()
# has run.
_REGISTRY: dict[str, ProjectConfig] = {}


def install_registry(path: str | Path) -> list[ProjectConfig]:
    """Load projects.yaml and publish its entries process-wide.

    Whatever the registry held before is discarded. If two entries share an
    id, the later one wins.

    Returns:
        The loaded configs, in file order.
    """
    configs = load_projects(path)
    _REGISTRY.clear()
    for cfg in configs:
        _REGISTRY[cfg.id] = cfg
    return configs
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def get_project(project_id: str) -> ProjectConfig:
    """Look up the registered ProjectConfig for ``project_id``.

    Raises:
        KeyError: if the project is not in the registry; the message lists the
            ids that are registered.
    """
    if project_id in _REGISTRY:
        return _REGISTRY[project_id]
    known = sorted(_REGISTRY)
    raise KeyError(f"project {project_id!r} not in registry (known: {known})")
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def parse_github_repo(github_url: str) -> str:
    """Return ``owner/repo`` from a GitHub URL (used for gh CLI / REST calls).

    Accepts HTTPS URLs (``https://github.com/owner/repo``), ``ssh://`` URLs and
    scp-style SSH remotes (``git@github.com:owner/repo.git``). Surrounding
    whitespace, a trailing ``/`` and a trailing ``.git`` are ignored.

    Raises:
        IndexError: if the URL has fewer than two path components.
    """
    slug = github_url.strip().rstrip("/").removesuffix(".git")
    # scp-style remotes separate host and path with ":" instead of "/";
    # treating both as separators makes the last two parts owner and repo.
    parts = slug.replace(":", "/").split("/")
    return f"{parts[-2]}/{parts[-1]}"
|