observability-agent 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- observability_agent-0.1.3/PKG-INFO +6 -0
- observability_agent-0.1.3/observability_agent.egg-info/PKG-INFO +6 -0
- observability_agent-0.1.3/observability_agent.egg-info/SOURCES.txt +8 -0
- observability_agent-0.1.3/observability_agent.egg-info/dependency_links.txt +1 -0
- observability_agent-0.1.3/observability_agent.egg-info/entry_points.txt +2 -0
- observability_agent-0.1.3/observability_agent.egg-info/requires.txt +1 -0
- observability_agent-0.1.3/observability_agent.egg-info/top_level.txt +1 -0
- observability_agent-0.1.3/pyproject.toml +16 -0
- observability_agent-0.1.3/run_agent.py +367 -0
- observability_agent-0.1.3/setup.cfg +4 -0
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
pyproject.toml
|
|
2
|
+
run_agent.py
|
|
3
|
+
observability_agent.egg-info/PKG-INFO
|
|
4
|
+
observability_agent.egg-info/SOURCES.txt
|
|
5
|
+
observability_agent.egg-info/dependency_links.txt
|
|
6
|
+
observability_agent.egg-info/entry_points.txt
|
|
7
|
+
observability_agent.egg-info/requires.txt
|
|
8
|
+
observability_agent.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
httpx>=0.27.0
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
run_agent
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Build configuration: the distribution is built with setuptools through
# its PEP 517 backend.
[build-system]
requires = ["setuptools>=61", "wheel"]
build-backend = "setuptools.build_meta"

# Core package metadata; httpx is the only runtime dependency.
[project]
name = "observability-agent"
version = "0.1.3"
description = "Opentrons observability relay agent"
requires-python = ">=3.10"
dependencies = ["httpx>=0.27.0"]

# Console entry point: the `observability-agent` command calls main() in run_agent.py.
[project.scripts]
observability-agent = "run_agent:main"

# The distribution ships a single top-level module rather than a package directory.
[tool.setuptools]
py-modules = ["run_agent"]
|
|
@@ -0,0 +1,367 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Local relay agent: polls Opentrons robot(s) on the lab network and POSTs telemetry to the cloud.
|
|
4
|
+
Supports HTTP and HTTPS per robot (e.g. 198.51.100.73 and 203.0.113.198 over HTTPS, localhost over HTTP).
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
python run_agent.py --lab-id=LAB_ID --agent-token=TOKEN --backend-url=https://your-api.com
|
|
8
|
+
python run_agent.py --config=agent_config.json
|
|
9
|
+
|
|
10
|
+
Robot addresses for production come from the cloud app (Fleet Manager): the agent calls
|
|
11
|
+
GET /api/agent/robot-poll-targets. Use --local-robots (or use_local_robots in JSON) only
|
|
12
|
+
for development without the cloud UI.
|
|
13
|
+
|
|
14
|
+
Example agent_config.json (production — no robots section):
|
|
15
|
+
{
|
|
16
|
+
"lab_id": "abc123",
|
|
17
|
+
"agent_token": "your-token",
|
|
18
|
+
"backend_url": "https://your-api.com",
|
|
19
|
+
"robot_poll_interval_seconds": 5
|
|
20
|
+
}
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import argparse
|
|
26
|
+
import json
|
|
27
|
+
import logging
|
|
28
|
+
import os
|
|
29
|
+
import sys
|
|
30
|
+
import time
|
|
31
|
+
from pathlib import Path
|
|
32
|
+
|
|
33
|
+
import httpx
|
|
34
|
+
|
|
35
|
+
# Fallback poll targets for local-robot mode when no explicit robot list is
# supplied: two robots reached over HTTPS and a localhost instance over HTTP.
DEFAULT_ROBOTS = [
    dict(ip="198.51.100.73", scheme="https", port=31950),
    dict(ip="203.0.113.198", scheme="https", port=31950),
    dict(ip="localhost", scheme="http", port=31950),
]

# Default HTTP timeouts, in seconds, for robot requests and backend requests.
ROBOT_TIMEOUT = 10.0
BACKEND_TIMEOUT = 30.0

# Lower and upper bound, in seconds, of the retry backoff used by the main loop.
MIN_BACKOFF, MAX_BACKOFF = 5.0, 60.0

# Seconds between refreshes of the robot list from the cloud
# (only relevant when not running with --local-robots).
TARGETS_REFRESH_SECONDS = 30.0

# Root logging setup: INFO level, timestamped "[LEVEL] message" lines.
_LOG_SETTINGS = {
    "level": logging.INFO,
    "format": "%(asctime)s [%(levelname)s] %(message)s",
    "datefmt": "%Y-%m-%d %H:%M:%S",
}
logging.basicConfig(**_LOG_SETTINGS)

# Logger shared by every function in this module.
log = logging.getLogger("agent")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _url(ip: str, path: str, scheme: str = "http", port: int = 31950) -> str:
|
|
58
|
+
path = path.strip("/")
|
|
59
|
+
return f"{scheme}://{ip}:{port}/{path}"
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def fetch_robot_telemetry(
    ip: str,
    scheme: str = "http",
    port: int = 31950,
    timeout: float = ROBOT_TIMEOUT,
) -> dict | None:
    """Query one robot's ``health``, ``runs`` and ``logs`` endpoints.

    Args:
        ip: Robot host name or IP address.
        scheme: URL scheme used for all three requests.
        port: TCP port of the robot HTTP API.
        timeout: Timeout handed to the ``httpx.Client``.

    Returns:
        A dict with keys ``ip``, ``health``, ``runs``, ``logs`` and ``serial``.
        A section stays ``None`` when its endpoint did not answer 200.
        Returns ``None`` when any request raises (the error is logged as a
        warning), even if earlier requests succeeded.
    """
    # NOTE(review): the health fields and the log text are taken from the
    # *response headers*, not from the response body — confirm that the robot
    # API really exposes them as headers.
    health_fields = ("name", "date", "logs", "serial_number", "status", "health_data")
    request_headers = {"Content-Type": "application/json", "Opentrons-Version": "*"}
    telemetry = {"ip": ip, "health": None, "runs": None, "logs": None, "serial": None}

    try:
        with httpx.Client(timeout=timeout) as client:
            health_resp = client.get(_url(ip, "health", scheme, port), headers=request_headers)
            if health_resp.status_code == 200:
                telemetry["health"] = {
                    field: health_resp.headers.get(field) for field in health_fields
                }
                telemetry["serial"] = telemetry["health"]["serial_number"]

            runs_resp = client.get(_url(ip, "runs", scheme, port), headers=request_headers)
            if runs_resp.status_code == 200:
                try:
                    telemetry["runs"] = runs_resp.json()
                except Exception:
                    # A 200 with an undecodable body is reported as "no runs".
                    telemetry["runs"] = {}

            logs_resp = client.get(_url(ip, "logs", scheme, port), headers=request_headers)
            if logs_resp.status_code == 200:
                telemetry["logs"] = logs_resp.headers.get("logs") or ""
    except Exception as exc:
        log.warning("Robot %s (%s): %s", ip, scheme, exc)
        return None

    return telemetry
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def build_telemetry_payload(robots_config: list, timeout: float = ROBOT_TIMEOUT) -> list:
    """Poll every configured robot and build the ``robots`` list for the POST body.

    Args:
        robots_config: Poll targets. Each entry is either a bare host/IP string
            (polled over HTTP on port 31950) or a dict with ``ip`` and optional
            ``scheme`` / ``port`` keys.
        timeout: Per-robot timeout forwarded to ``fetch_robot_telemetry``.

    Returns:
        One dict per robot that answered, with keys ``ip``, ``robot_id``,
        ``serial``, ``health``, ``runs`` and ``logs``. Robots that could not be
        reached and malformed entries are left out.
    """
    payload_robots = []
    for entry in robots_config:
        if isinstance(entry, str):
            ip, scheme, port = entry.strip(), "http", 31950
        elif isinstance(entry, dict):
            raw_ip = entry.get("ip")
            ip = raw_ip.strip() if isinstance(raw_ip, str) else ""
            # Normalise scheme and port the same way fetch_robot_poll_targets
            # does, so one bad value cannot abort the whole polling cycle.
            scheme = str(entry.get("scheme") or "http").lower()
            if scheme not in ("http", "https"):
                scheme = "http"
            try:
                port = int(entry.get("port") or 31950)
            except (TypeError, ValueError):
                port = 31950
        else:
            # Neither a host string nor a target dict: skip it instead of
            # raising AttributeError on .get().
            log.warning("Ignoring robot entry of unsupported type: %r", entry)
            continue

        if not ip:
            continue

        data = fetch_robot_telemetry(ip, scheme=scheme, port=port, timeout=timeout)
        if data is None:
            continue

        serial = data.get("serial")
        payload_robots.append({
            "ip": ip,
            "robot_id": serial,  # the serial doubles as the robot identifier
            "serial": serial,
            "health": data.get("health"),
            "runs": data.get("runs"),
            "logs": data.get("logs"),
        })
    return payload_robots
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def fetch_robot_poll_targets(
    backend_url: str,
    agent_token: str,
    timeout: float = BACKEND_TIMEOUT,
) -> list[dict] | None:
    """Download the list of robots to poll from the cloud backend.

    Sends ``GET <backend_url>/api/agent/robot-poll-targets`` with a bearer
    token and normalises each entry of the response's ``robots`` list to
    ``{"ip", "scheme", "port"}``.

    Returns:
        The normalised list (possibly empty), or ``None`` when the request
        fails, the status is not 200, or ``robots`` is not a list.
    """
    endpoint = f"{backend_url.rstrip('/')}/api/agent/robot-poll-targets"
    request_headers = {
        "Authorization": f"Bearer {agent_token}",
        "Accept": "application/json",
    }
    try:
        with httpx.Client(timeout=timeout) as client:
            response = client.get(endpoint, headers=request_headers)
            if response.status_code != 200:
                log.error("GET robot-poll-targets %s: %s", response.status_code, response.text[:200])
                return None

            listed = response.json().get("robots")
            if not isinstance(listed, list):
                return None

            targets: list[dict] = []
            for entry in listed:
                if not isinstance(entry, dict):
                    continue
                host = (entry.get("ip") or "").strip()
                if not host:
                    continue
                # Unknown schemes and unparsable ports fall back to the defaults.
                proto = (entry.get("scheme") or "http").lower()
                proto = proto if proto in ("http", "https") else "http"
                try:
                    port_no = int(entry.get("port") or 31950)
                except (TypeError, ValueError):
                    port_no = 31950
                targets.append({"ip": host, "scheme": proto, "port": port_no})
            return targets
    except Exception as exc:
        log.error("GET robot-poll-targets failed: %s", exc)
        return None
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def post_telemetry(
    backend_url: str,
    agent_token: str,
    lab_id: str,
    robots: list,
    timeout: float = BACKEND_TIMEOUT,
) -> bool:
    """POST one telemetry batch to ``<backend_url>/api/agent/telemetry``.

    Args:
        backend_url: Base URL of the cloud backend (a trailing slash is tolerated).
        agent_token: Bearer token sent in the ``Authorization`` header.
        lab_id: Lab identifier placed in the request body.
        robots: Per-robot telemetry dicts placed in the request body.
        timeout: Timeout handed to the ``httpx.Client``.

    Returns:
        ``True`` when the backend answers 200 or 201; ``False`` on any other
        status or when the request raises (both cases are logged as errors).
    """
    url = f"{backend_url.rstrip('/')}/api/agent/telemetry"
    headers = {
        "Authorization": f"Bearer {agent_token}",
        "Content-Type": "application/json",
    }
    body = {"lab_id": lab_id, "robots": robots}
    try:
        with httpx.Client(timeout=timeout) as client:
            r = client.post(url, json=body, headers=headers)
            if r.status_code in (200, 201):
                return True
            # Two placeholders for two arguments. The previous format string had
            # a third "%s", which made logging fail to format the record, so the
            # status and response body were never actually logged.
            log.error("Backend %s: %s", r.status_code, r.text[:200])
            return False
    except Exception as e:
        log.error("Backend POST failed: %s", e)
        return False
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def load_config(path: str) -> dict:
    """Read the agent's JSON config file and fill in missing settings.

    ``robots`` is forced to a list (anything else, including a missing key,
    becomes ``[]``). Keys absent from the file get defaults: the poll interval
    is 5, ``backend_url`` / ``lab_id`` / ``agent_token`` come from the
    ``BACKEND_URL`` / ``LAB_ID`` / ``AGENT_TOKEN`` environment variables (empty
    string when unset), and ``use_local_robots`` is ``False``.

    Raises:
        ValueError: If the file's top-level JSON value is not an object.
    """
    with open(path, encoding="utf-8") as handle:
        config = json.load(handle)
    if not isinstance(config, dict):
        raise ValueError("Config must be a JSON object")

    # A missing, null or non-list "robots" value all collapse to an empty list.
    if not isinstance(config.get("robots"), list):
        config["robots"] = []

    fallbacks = (
        ("robot_poll_interval_seconds", 5),
        ("backend_url", os.environ.get("BACKEND_URL", "")),
        ("lab_id", os.environ.get("LAB_ID", "")),
        ("agent_token", os.environ.get("AGENT_TOKEN", "")),
        ("use_local_robots", False),
    )
    for key, fallback in fallbacks:
        config.setdefault(key, fallback)
    return config
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def _env_use_local_robots() -> bool:
|
|
216
|
+
v = os.environ.get("AGENT_USE_LOCAL_ROBOTS", "").strip().lower()
|
|
217
|
+
return v in ("1", "true", "yes")
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _config_use_local_robots(cfg: dict) -> bool:
|
|
221
|
+
v = cfg.get("use_local_robots")
|
|
222
|
+
if isinstance(v, bool):
|
|
223
|
+
return v
|
|
224
|
+
if isinstance(v, str):
|
|
225
|
+
return v.strip().lower() in ("1", "true", "yes")
|
|
226
|
+
return False
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def main() -> int:
    """Run the relay agent: poll robots and forward telemetry until interrupted.

    Settings come from the command line and environment, or from a JSON config
    file when ``--config`` is given (config values take precedence over the
    corresponding CLI/environment values). In local-robot mode the robot list
    comes from the config or ``--robot-ips`` (falling back to
    ``DEFAULT_ROBOTS``); otherwise it is fetched from the cloud and refreshed
    every ``TARGETS_REFRESH_SECONDS``.

    Returns:
        0 after a clean stop via Ctrl-C; 1 when the configuration cannot be
        loaded, a required setting is missing, or the poll interval is invalid.
    """
    ap = argparse.ArgumentParser(description="Opentrons observability relay agent")
    ap.add_argument("--lab-id", default=os.environ.get("LAB_ID"), help="Lab ID")
    ap.add_argument("--agent-token", default=os.environ.get("AGENT_TOKEN"), help="Agent token")
    ap.add_argument("--backend-url", default=os.environ.get("BACKEND_URL"), help="Cloud backend URL")
    ap.add_argument("--robot-ips", help="With --local-robots: comma-separated robot IPs")
    ap.add_argument("--config", help="Path to agent_config.json")
    ap.add_argument("--interval", type=float, default=5, help="Poll interval in seconds")
    ap.add_argument("--https-ips", help="With --local-robots: comma-separated IPs to use HTTPS")
    ap.add_argument(
        "--local-robots",
        action="store_true",
        help="Use robots from config/--robot-ips instead of the cloud (dev only; production uses Fleet Manager)",
    )
    args = ap.parse_args()

    # Local-robot mode can be switched on by the CLI flag, the environment,
    # or (below) the config file.
    use_local = bool(args.local_robots) or _env_use_local_robots()

    if args.config:
        # A missing/unreadable file, invalid JSON, or a non-numeric interval is
        # reported as a normal startup error instead of an uncaught traceback.
        try:
            config = load_config(args.config)
            interval = float(config.get("robot_poll_interval_seconds", args.interval))
        except (OSError, TypeError, ValueError) as e:
            log.error("Could not load config %s: %s", args.config, e)
            return 1
        lab_id = config.get("lab_id") or args.lab_id
        agent_token = config.get("agent_token") or args.agent_token
        backend_url = config.get("backend_url") or args.backend_url
        use_local = use_local or _config_use_local_robots(config)
        if use_local:
            robots_config = list(config.get("robots") or []) or list(DEFAULT_ROBOTS)
        else:
            robots_config = []
    else:
        lab_id = args.lab_id
        agent_token = args.agent_token
        backend_url = args.backend_url
        interval = args.interval
        if not use_local:
            robots_config = []
        elif args.robot_ips:
            ips = [s.strip() for s in args.robot_ips.split(",") if s.strip()]
            if args.https_ips:
                https_ips = {s.strip() for s in args.https_ips.split(",") if s.strip()}
            else:
                https_ips = {"198.51.100.73", "203.0.113.198"}
            robots_config = [
                {"ip": ip, "scheme": "https" if ip in https_ips else "http", "port": 31950}
                for ip in ips
            ]
        else:
            robots_config = list(DEFAULT_ROBOTS)

    if not lab_id or not agent_token or not backend_url:
        log.error("Provide --lab-id, --agent-token, and --backend-url (or set LAB_ID, AGENT_TOKEN, BACKEND_URL)")
        return 1

    # time.sleep() rejects negative, NaN and infinite values; fail early with a
    # clear message rather than crashing inside the loop. (NaN fails both
    # comparisons, so it is rejected too.)
    if not 0 <= interval < float("inf"):
        log.error("Poll interval must be a finite, non-negative number of seconds (got %r)", interval)
        return 1

    if use_local:
        log.info(
            "Lab %s; backend %s; LOCAL robots %s; interval %.1fs",
            lab_id,
            backend_url,
            [r.get("ip") if isinstance(r, dict) else r for r in robots_config],
            interval,
        )
    else:
        log.info(
            "Lab %s; backend %s; robot list from cloud (GET /api/agent/robot-poll-targets); interval %.1fs",
            lab_id,
            backend_url,
            interval,
        )

    backoff = MIN_BACKOFF
    cached_cloud_robots: list = []
    last_targets_fetch = None  # monotonic timestamp of the last refresh attempt
    have_cloud_targets_response = False

    # The KeyboardInterrupt handler wraps the whole loop, including the sleeps
    # between cycles and during backoff, so Ctrl-C always stops cleanly.
    try:
        while True:
            try:
                if use_local:
                    active_robots = robots_config
                else:
                    # Monotonic clock: the refresh schedule must not be
                    # disturbed by wall-clock adjustments.
                    now = time.monotonic()
                    if last_targets_fetch is None or now - last_targets_fetch >= TARGETS_REFRESH_SECONDS:
                        fetched = fetch_robot_poll_targets(backend_url, agent_token)
                        last_targets_fetch = now
                        if fetched is not None:
                            cached_cloud_robots = fetched
                            have_cloud_targets_response = True
                        elif not cached_cloud_robots:
                            log.warning(
                                "Could not load robot list from cloud yet; retry in %.0fs",
                                TARGETS_REFRESH_SECONDS,
                            )
                    # On a failed refresh, keep polling the last known list.
                    active_robots = cached_cloud_robots

                    if not active_robots:
                        if have_cloud_targets_response:
                            log.warning(
                                "No robot addresses in the cloud for this lab. Add them in Fleet Manager (web app)."
                            )
                        else:
                            log.warning(
                                "Waiting for robot list from the cloud API (GET /api/agent/robot-poll-targets)."
                            )
                        time.sleep(interval)
                        continue

                robots_payload = build_telemetry_payload(active_robots)
                if not robots_payload:
                    log.warning("No robot data collected this cycle")
                elif post_telemetry(backend_url, agent_token, lab_id, robots_payload):
                    log.info("POST ok (%d robot(s))", len(robots_payload))
                    backoff = MIN_BACKOFF
                else:
                    log.warning("POST failed; retry in %.0fs", backoff)
                    time.sleep(backoff)
                    backoff = min(backoff * 2, MAX_BACKOFF)
                    continue
            except Exception as e:
                # Any unexpected failure in a cycle is logged with its
                # traceback and retried after the current backoff.
                log.exception("Cycle error: %s", e)
                time.sleep(backoff)
                backoff = min(backoff * 2, MAX_BACKOFF)
                continue
            time.sleep(interval)
    except KeyboardInterrupt:
        log.info("Stopping")
    return 0
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    exit_code = main()
    sys.exit(exit_code)
|