osiris-agent 0.3.76__tar.gz → 0.3.78__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: osiris_agent
3
- Version: 0.3.76
3
+ Version: 0.3.78
4
4
  Summary: OSIRIS agent for ROS2/Humble
5
5
  Home-page: https://github.com/nicolaselielll/osiris_agent
6
6
  Author: Nicolas Tuomaala
@@ -1,6 +1,6 @@
1
1
  """osiris_agent package initializer."""
2
2
 
3
- __version__ = '0.3.76'
3
+ __version__ = '0.3.78'
4
4
 
5
5
  __all__ = [
6
6
  "agent_node",
@@ -0,0 +1,867 @@
1
+ import asyncio
2
+ import os
3
+ import random
4
+ import threading
5
+ import time
6
+ from collections import deque
7
+
8
+ import psutil
9
+ import rclpy
10
+ import websockets
11
+ import json
12
+
13
+ from rcl_interfaces.srv import GetParameters, ListParameters
14
+ from rclpy.node import Node
15
+ from rclpy.parameter import parameter_value_to_python
16
+ from rclpy.qos import QoSProfile
17
+ from rosidl_runtime_py import message_to_ordereddict
18
+ from rosidl_runtime_py.utilities import get_message
19
+
20
+ from osiris_agent import __version__ as AGENT_VERSION
21
+ from .ros2_control_collector import Ros2ControlCollector
22
+ from .tf_tree_collector import TfTreeCollector
23
+
24
+ # ──────────────────────────────────────────────
25
+ # Constants
26
+ # ──────────────────────────────────────────────
27
# Polling / batching defaults (the first three are also declared as ROS parameters).
GRAPH_CHECK_INTERVAL = 2.0          # s between graph polls
TOPIC_BATCH_SIZE = 10               # max topics deep-scanned (enriched) per tick
TELEMETRY_INTERVAL = 1.0            # s between telemetry samples
SERVICE_SCAN_INTERVAL = 30.0        # s between service graph scans
PARAMETER_REFRESH_INTERVAL = 60.0   # s between retries for nodes with no params yet

# Hard cap on the number of gateway-requested topic subscriptions.
MAX_SUBSCRIPTIONS = 100

# WebSocket reconnect backoff bounds, in seconds.
RECONNECT_INITIAL_DELAY = 1
RECONNECT_MAX_DELAY = 30

# Service-name prefixes hidden from graph output (internal ROS2 plumbing).
_SUPPRESSED_SERVICE_PREFIXES = ('/ros2cli_daemon',)
38
+
39
+
40
+ class WebBridge(Node):
41
+
42
def __init__(self):
    """Create the ``osiris_node`` bridge and start its WebSocket client thread.

    Reads ``OSIRIS_AUTH_TOKEN`` (required) and ``OSIRIS_WS_URL`` (optional)
    from the environment, declares the tunable ROS parameters, initialises
    all caches, builds the collectors, registers the graph-check timer and
    finally launches the daemon thread that runs the WebSocket client.

    Raises:
        ValueError: if ``OSIRIS_AUTH_TOKEN`` is unset or empty.
    """
    # Local import: only needed here, keeps the file-level imports untouched.
    from urllib.parse import quote

    super().__init__('osiris_node')

    auth_token = os.environ.get('OSIRIS_AUTH_TOKEN')
    if not auth_token:
        raise ValueError("OSIRIS_AUTH_TOKEN environment variable must be set")

    # Declare tunable parameters
    self.declare_parameter('graph_check_interval', GRAPH_CHECK_INTERVAL)
    self.declare_parameter('topic_batch_size', TOPIC_BATCH_SIZE)
    self.declare_parameter('telemetry_interval', TELEMETRY_INTERVAL)
    self.declare_parameter('tf_tree_enabled', False)

    # Set OSIRIS_WS_URL to point the agent at a different gateway.
    base_url = os.environ.get('OSIRIS_WS_URL', 'wss://osiris-gateway.fly.dev')
    # Percent-encode the token so characters such as '&', '+', '#' or '='
    # cannot corrupt the query string.
    self.ws_url = f'{base_url}?robot=true&token={quote(auth_token, safe="")}'

    self.ws = None
    self.loop = None
    self._send_queue: asyncio.Queue | None = None

    # ── Topic subscriptions (gateway-requested) ──────────────────────────
    self._topic_subs: dict[str, rclpy.subscription.Subscription] = {}
    self._topic_subs_lock = threading.Lock()
    self._topic_last_timestamp: dict[str, float] = {}
    self._topic_rate_history: dict[str, deque] = {}
    self._rate_history_depth = 8

    # ── Existence caches ─────────────────────────────────────────────────
    self._active_nodes: set[str] = set()
    self._active_topics: set[str] = set()
    self._active_services: dict[str, str] = {}  # service name → type string
    self._active_actions: set[str] = set()

    # ── Count sentinels (cheap change detection) ─────────────────────────
    self._topic_counts: dict[str, tuple[int, int]] = {}  # topic → (pub_n, sub_n)

    # ── Relation caches (populated by Tier-2 enrichment) ─────────────────
    self._topic_relations: dict[str, dict] = {}

    # ── Enrichment pending queues ────────────────────────────────────────
    self._pending_topic_enrichment: set[str] = set()

    # ── Parameters (lazy-loaded, async) ──────────────────────────────────
    self._node_parameter_cache: dict[str, dict] = {}
    self._pending_param_fetches: set[str] = set()

    # ── Snapshot & dirty-flag ────────────────────────────────────────────
    self._last_sent_nodes: dict | None = None
    self._last_sent_topics: dict | None = None
    self._last_sent_actions: dict | None = None
    self._last_sent_services: dict | None = None
    self._graph_dirty = False

    # ── Service scan throttle ────────────────────────────────────────────
    self._last_service_scan: float = 0.0
    self._service_rescan_ticks: int = 0

    # ── Initial scan synchronization ─────────────────────────────────────
    self._initial_scan_complete = threading.Event()
    self._first_graph_check_done = False

    # ── Telemetry ────────────────────────────────────────────────────────
    self._telemetry_enabled = True
    self._last_disk_io = None
    self._last_net_io = None
    self._last_io_time: float | None = None
    self._cpu_history: deque = deque(maxlen=900)  # 15 min at 1 Hz
    psutil.cpu_percent(interval=None)  # prime — first call always returns 0.0

    # ── Collectors ───────────────────────────────────────────────────────
    self._ros2_control = Ros2ControlCollector(
        node=self,
        event_callback=self._on_ros2_control_event,
        logger=self.get_logger(),
    )
    _tf_tree_enabled = self.get_parameter('tf_tree_enabled').get_parameter_value().bool_value
    # The TF collector is optional; every user of self._tf_tree checks for None.
    self._tf_tree = TfTreeCollector(
        node=self,
        event_callback=self._on_tf_tree_event,
        logger=self.get_logger(),
    ) if _tf_tree_enabled else None

    # ── Timers ───────────────────────────────────────────────────────────
    _graph_interval = self.get_parameter('graph_check_interval').get_parameter_value().double_value
    self._topic_batch_size = self.get_parameter('topic_batch_size').get_parameter_value().integer_value
    self.create_timer(_graph_interval, self._check_graph_changes)
    # NOTE: _collect_telemetry and _refresh_empty_param_caches timers are
    # NOT wired yet — held for bisect step 4e (enable polling).

    # ── WebSocket thread ─────────────────────────────────────────────────
    threading.Thread(target=self._run_ws_client, daemon=True).start()

    self.get_logger().info(
        f"🚀 Osiris agent v{AGENT_VERSION} — bisect 4c: state hydration + collectors (inert)"
    )
138
+
139
+ # ──────────────────────────────────────────────
140
+ # WebSocket client
141
+ # ──────────────────────────────────────────────
142
+
143
def _run_ws_client(self):
    """Thread target: own an asyncio loop and run the reconnecting client on it.

    Creates a fresh event loop for this thread, creates the outgoing
    message queue, and blocks until ``_client_loop_with_reconnect``
    returns. The loop is always closed afterwards and ``self.loop`` is
    cleared so other threads stop scheduling work onto a dead loop.
    """
    self.loop = asyncio.new_event_loop()
    asyncio.set_event_loop(self.loop)
    self._send_queue = asyncio.Queue()
    try:
        self.loop.run_until_complete(self._client_loop_with_reconnect())
    finally:
        # Clear the attribute first: producers gate on `self.loop` being truthy.
        finished_loop, self.loop = self.loop, None
        finished_loop.close()
148
+
149
async def _client_loop_with_reconnect(self):
    """Run ``_client_loop`` repeatedly with exponential backoff while ROS is up.

    Each iteration runs one gateway session. Whether the session ends by
    returning or by raising, the loop pauses before reconnecting so a
    rejecting gateway is never hammered. The delay doubles up to
    ``RECONNECT_MAX_DELAY`` (plus up to 1 s of jitter) and is reset to
    ``RECONNECT_INITIAL_DELAY`` once a session has lasted at least
    ``RECONNECT_MAX_DELAY`` seconds, so one bad period does not slow down
    every later reconnect.
    """
    delay = RECONNECT_INITIAL_DELAY
    while self.context.ok():
        started = time.monotonic()
        failure = None
        try:
            await self._client_loop()
        except Exception as e:
            failure = e

        if not self.context.ok():
            break  # shutting down: no log, no sleep

        # A long-lived session counts as a healthy connection.
        if time.monotonic() - started >= RECONNECT_MAX_DELAY:
            delay = RECONNECT_INITIAL_DELAY

        if failure is not None:
            self.get_logger().warning(
                f"WebSocket error: {failure}; retrying in {delay:.1f}s"
            )
        await asyncio.sleep(delay)
        delay = min(delay * 2, RECONNECT_MAX_DELAY) + random.uniform(0, 1)
161
+
162
async def _client_loop(self):
    """Run a single gateway session: connect, authenticate, send, receive.

    Returns normally when authentication fails or the receive loop ends.
    Exceptions from connecting or from the session propagate to the
    caller. On every exit the sender task is cancelled and ``self.ws`` is
    cleared.
    """
    send_task = None
    self.get_logger().info('Connecting to gateway...')
    try:
        async with websockets.connect(self.ws_url) as ws:
            # The first frame is read and parsed as the auth verdict.
            try:
                auth_msg = await ws.recv()
                auth_data = json.loads(auth_msg)
            except Exception:
                self.get_logger().error('Failed to receive auth response from gateway')
                return

            # Guard against valid JSON that is not an object (list, string, ...).
            auth_ok = isinstance(auth_data, dict) and auth_data.get('type') == 'auth_success'
            if not auth_ok:
                if isinstance(auth_data, dict) and auth_data:
                    error_msg = auth_data.get('message', 'unknown')
                else:
                    error_msg = 'no response'
                self.get_logger().error(f'Authentication failed: {error_msg}')
                return

            self.get_logger().info('Connected and authenticated to gateway')
            self.ws = ws
            send_task = asyncio.create_task(self._send_loop(ws))

            await self._send_initial_state()
            await self._receive_loop(ws)
    finally:
        if send_task and not send_task.done():
            send_task.cancel()
            try:
                await send_task
            except (asyncio.CancelledError, Exception):
                pass
        # self.ws is only set after successful auth, so this logs real disconnects only.
        if self.ws is not None:
            self.get_logger().warning('Disconnected from gateway')
        self.ws = None
195
+
196
async def _send_loop(self, ws):
    """Forward queued messages to *ws* forever.

    Runs until cancelled or until a send fails; a failed send is logged
    and the exception is re-raised, ending the task.
    """
    while True:
        outgoing = await self._send_queue.get()
        try:
            await ws.send(outgoing)
        except Exception as exc:
            self.get_logger().error(f"WS send failed: {exc}")
            raise
204
+
205
async def _receive_loop(self, ws):
    """Dispatch incoming gateway messages until *ws* closes or ROS shuts down.

    Handled ``type`` values: ``subscribe`` / ``unsubscribe`` (with a
    ``topic``), ``start_telemetry`` / ``stop_telemetry`` and ``error``.
    Frames that are not JSON objects are ignored. A failure while handling
    one message is logged and does not end the session.
    """
    async for raw in ws:
        if not self.context.ok():
            break
        try:
            data = json.loads(raw)
        except (ValueError, TypeError):
            # ValueError covers JSONDecodeError and undecodable binary frames.
            continue
        if not isinstance(data, dict):
            continue  # valid JSON, but not an object: nothing to dispatch

        msg_type = data.get('type')
        try:
            if msg_type == 'subscribe':
                topic = data.get('topic')
                if topic:
                    self._subscribe_to_topic(topic)
            elif msg_type == 'unsubscribe':
                topic = data.get('topic')
                if topic:
                    self._unsubscribe_from_topic(topic)
            elif msg_type == 'start_telemetry':
                self._telemetry_enabled = True
            elif msg_type == 'stop_telemetry':
                self._telemetry_enabled = False
            elif msg_type == 'error':
                self.get_logger().warning(f"Gateway error: {data.get('message', '')}")
        except Exception as e:
            # One bad request must not tear down the whole connection.
            self.get_logger().error(f"Failed to handle '{msg_type}' message: {e}")
228
+
229
async def _send_initial_state(self):
    """Queue the agent version, a full ``initial_state`` and the subscription list.

    Waits up to 15 s (in a worker thread, so the event loop stays
    responsive) for the first graph scan to finish, then snapshots the
    graph, records it as the last-sent baseline and enqueues it together
    with telemetry and collector snapshots.
    """
    # Wait for the first _check_graph_changes tick to populate all caches.
    scan_done = await asyncio.to_thread(self._initial_scan_complete.wait, 15.0)
    if not scan_done:
        # Previously silent: the gateway would receive a partial graph with no hint why.
        self.get_logger().warning(
            'Initial graph scan did not complete within 15s; sending partial initial_state'
        )

    # Reset delta caches so _flush_graph_snapshots treats everything as
    # "unsent" after this reconnect.
    self._last_sent_nodes = None
    self._last_sent_topics = None
    self._last_sent_actions = None
    self._last_sent_services = None
    self._graph_dirty = True

    nodes = self._get_nodes_with_relations()
    topics = self._get_topics_with_relations()
    actions = self._get_actions_with_relations()
    services = self._get_services_with_relations()

    # Shallow copies: later flushes compare fresh snapshots against these.
    self._last_sent_nodes = nodes.copy()
    self._last_sent_topics = topics.copy()
    self._last_sent_actions = actions.copy()
    self._last_sent_services = services.copy()

    await self._send_queue.put(json.dumps({
        'type': 'agent_version',
        'version': AGENT_VERSION,
    }))

    await self._send_queue.put(json.dumps({
        'type': 'initial_state',
        'timestamp': time.time(),
        'data': {
            'nodes': nodes,
            'topics': topics,
            'actions': actions,
            'services': services,
            'telemetry': self._get_telemetry_snapshot(),
            'controllers': self._ros2_control.get_controllers_snapshot(),
            'hardware': self._ros2_control.get_hardware_snapshot(),
            'tf_tree': self._tf_tree.get_snapshot() if self._tf_tree is not None else None,
        },
    }))

    await self._send_bridge_subscriptions()

    self.get_logger().info(
        f"Sent initial_state: {len(nodes)} nodes, {len(topics)} topics, "
        f"{len(actions)} actions, {len(services)} services"
    )
277
+
278
async def _send_bridge_subscriptions(self):
    """Enqueue a ``bridge_subscriptions`` message listing the subscribed topics."""
    # Copy the names under the lock; build and send the message outside it.
    with self._topic_subs_lock:
        active_topics = [*self._topic_subs.keys()]
    message = {
        'type': 'bridge_subscriptions',
        'subscriptions': active_topics,
        'timestamp': time.time(),
    }
    await self._send_queue.put(json.dumps(message))
286
+
287
+ # ──────────────────────────────────────────────
288
+ # Tier-1: cheap existence detection
289
+ # ──────────────────────────────────────────────
290
+
291
def _check_graph_changes(self):
    """Timer callback: run the one-shot initial graph scan.

    On the first call it lists nodes, topics and services, derives the
    action names from ``…/_action/status`` topics, fills the existence
    caches, runs the full topic enrichment, starts parameter fetches for
    every node, polls the collectors and finally sets
    ``_initial_scan_complete``. Every later call returns immediately.
    """
    if self._first_graph_check_done:
        return  # BISECT R1: skip all polling after first tick

    action_suffix = '/_action/status'

    # perf_counter is monotonic, so the logged durations cannot go negative.
    _t0 = time.perf_counter()
    node_pairs = list(self.get_node_names_and_namespaces())
    topic_type_list = self.get_topic_names_and_types()
    _t1 = time.perf_counter()

    current_nodes = {self._node_full_name(n, ns) for n, ns in node_pairs}
    current_topics = {t for t, _ in topic_type_list}
    # Strip only the trailing suffix; str.replace would also remove an
    # occurrence in the middle of the name.
    current_actions = {
        t.removesuffix(action_suffix)
        for t in current_topics
        if t.endswith(action_suffix)
    }
    self.get_logger().info(
        f"[poll] node+topic: {(_t1-_t0)*1000:.1f}ms "
        f"({len(current_nodes)} nodes, {len(current_topics)} topics, {len(current_actions)} actions)"
    )

    _ts0 = time.perf_counter()
    service_type_list = self.get_service_names_and_types()
    _ts1 = time.perf_counter()
    current_services = {
        s: types[0] if types else 'unknown'
        for s, types in service_type_list
        if not s.startswith(_SUPPRESSED_SERVICE_PREFIXES)
    }
    self.get_logger().info(
        f"[poll] service_scan: {(_ts1-_ts0)*1000:.1f}ms ({len(current_services)} services)"
    )

    self._first_graph_check_done = True
    self._active_nodes = current_nodes
    self._active_topics = current_topics
    self._active_services = current_services
    self._active_actions = current_actions
    try:
        _te0 = time.perf_counter()
        self._do_full_initial_enrichment(topic_type_list, node_pairs)
        _te1 = time.perf_counter()
        for fqn in current_nodes:
            self._fetch_node_parameters_async(fqn)
        self._ros2_control.poll()
        if self._tf_tree is not None:
            self._tf_tree.poll(force=True)
    finally:
        # This scan never runs again, so release waiters even on failure;
        # otherwise every (re)connect would stall for the full wait timeout.
        self._initial_scan_complete.set()
    self.get_logger().info(
        f"[poll] first tick complete: {len(current_nodes)} nodes, {len(current_topics)} topics, "
        f"{len(current_services)} services, {len(current_actions)} actions — "
        f"node+topic={(_t1-_t0)*1000:.1f}ms enrichment={(_te1-_te0)*1000:.1f}ms"
    )
343
+
344
+ # ──────────────────────────────────────────────
345
+ # Initial full enrichment (called once on first tick)
346
+ # ──────────────────────────────────────────────
347
+
348
def _do_full_initial_enrichment(self, topic_type_list, node_pairs):
    """Populate ``_topic_relations`` and ``_topic_counts`` for every active topic.

    Args:
        topic_type_list: iterable of ``(topic_name, [type, ...])`` pairs.
        node_pairs: accepted for interface compatibility; not used here.

    Topics whose endpoint query fails are skipped (logged at debug level).
    A topic with a missing or empty type list is recorded as ``'unknown'``.
    """
    topic_type_map = dict(topic_type_list)
    self._pending_topic_enrichment.clear()
    for topic in self._active_topics:
        try:
            pub_infos = self.get_publishers_info_by_topic(topic)
            sub_infos = self.get_subscriptions_info_by_topic(topic)
        except Exception as e:
            self.get_logger().debug(f"Enrichment failed for {topic}: {e}")
            continue
        publishers = {self._node_full_name(p.node_name, p.node_namespace) for p in pub_infos}
        subscribers = {self._node_full_name(s.node_name, s.node_namespace) for s in sub_infos}
        # `.get(topic, ['unknown'])[0]` raised IndexError on an empty type list.
        types = topic_type_map.get(topic)
        self._topic_relations[topic] = {
            'publishers': publishers,
            'subscribers': subscribers,
            'publisher_infos': pub_infos,
            'subscriber_infos': sub_infos,
            'type': types[0] if types else 'unknown',
        }
        self._topic_counts[topic] = (len(pub_infos), len(sub_infos))
367
+
368
+ # ──────────────────────────────────────────────
369
+ # Tier-2: batched relation enrichment (inert with R1 gate)
370
+ # ──────────────────────────────────────────────
371
+
372
def _enrich_pending_relations(self, topic_type_list=None):
    """Deep-scan up to ``_topic_batch_size`` pending topics and emit subscriber events.

    Args:
        topic_type_list: optional iterable of ``(topic, [type, ...])``
            pairs; when ``None`` the graph is queried for it.

    For each topic in the batch that is still active, the publisher and
    subscriber endpoints are re-read and cached. If the topic already had
    a cached relation, a ``topic_event`` is sent for every subscriber that
    appeared (``subscribed``) or disappeared (``unsubscribed``).
    """
    if not self._pending_topic_enrichment:
        return

    _pending_before = len(self._pending_topic_enrichment)
    _t0 = time.time()
    batch = set(list(self._pending_topic_enrichment)[:self._topic_batch_size])
    self._pending_topic_enrichment -= batch
    self.get_logger().info(
        f"[enrich] batch={len(batch)}, pending_before={_pending_before}, "
        f"remaining={len(self._pending_topic_enrichment)}"
    )

    if topic_type_list is not None:
        topic_type_map = dict(topic_type_list)
    else:
        topic_type_map = dict(self.get_topic_names_and_types())

    for topic in batch:
        if topic not in self._active_topics:
            continue  # topic vanished while it was waiting in the queue
        try:
            pub_infos = self.get_publishers_info_by_topic(topic)
            sub_infos = self.get_subscriptions_info_by_topic(topic)
        except Exception as e:
            self.get_logger().debug(f"Enrichment failed for {topic}: {e}")
            continue

        publishers = {self._node_full_name(p.node_name, p.node_namespace) for p in pub_infos}
        subscribers = {self._node_full_name(s.node_name, s.node_namespace) for s in sub_infos}
        old = self._topic_relations.get(topic)
        # `.get(topic, ['unknown'])[0]` raised IndexError on an empty type list.
        types = topic_type_map.get(topic)
        new_rel = {
            'publishers': publishers,
            'subscribers': subscribers,
            'publisher_infos': pub_infos,
            'subscriber_infos': sub_infos,
            'type': types[0] if types else 'unknown',
        }
        self._topic_relations[topic] = new_rel
        self._topic_counts[topic] = (len(pub_infos), len(sub_infos))

        # Events are only sent when there is a previous relation to diff against.
        if old is not None:
            old_subs = old['subscribers']
            for fqn in subscribers - old_subs:
                self._send_event_and_update({
                    'type': 'topic_event', 'topic': topic, 'node': fqn,
                    'event': 'subscribed', 'timestamp': time.time(),
                })
            for fqn in old_subs - subscribers:
                self._send_event_and_update({
                    'type': 'topic_event', 'topic': topic, 'node': fqn,
                    'event': 'unsubscribed', 'timestamp': time.time(),
                })

    self.get_logger().info(f"[enrich] done in {(time.time()-_t0)*1000:.1f}ms")
427
+
428
+ # ──────────────────────────────────────────────
429
+ # Graph snapshot builders
430
+ # ──────────────────────────────────────────────
431
+
432
def _get_nodes_with_relations(self) -> dict:
    """Build the per-node snapshot from the cached topic relations.

    Returns:
        A dict keyed by fully-qualified node name. Each value holds
        ``publishes`` / ``subscribes`` (lists of ``{'topic', 'qos'}``),
        ``actions`` (names derived from ``…/_action/status`` topics the
        node publishes), ``services`` (always empty here) and
        ``parameters`` (the cached parameter dict, or ``{}``).
        Endpoints belonging to nodes that are not in ``_active_nodes``
        are ignored.
    """
    action_suffix = '/_action/status'
    result = {
        fqn: {
            'publishes': [],
            'subscribes': [],
            'actions': [],
            'services': [],
            'parameters': self._node_parameter_cache.get(fqn, {}),
        }
        for fqn in self._active_nodes
    }

    for topic, rel in self._topic_relations.items():
        # Strip only the trailing suffix; str.replace would also remove an
        # occurrence in the middle of the name.
        is_action = topic.endswith(action_suffix) and bool(rel['publishers'])
        action = topic.removesuffix(action_suffix) if is_action else None

        for p in rel.get('publisher_infos', []):
            fqn = self._node_full_name(p.node_name, p.node_namespace)
            entry = result.get(fqn)
            if entry is None:
                continue
            entry['publishes'].append({
                'topic': topic,
                'qos': self._qos_to_dict(p.qos_profile),
            })
            if is_action and action not in entry['actions']:
                entry['actions'].append(action)

        for s in rel.get('subscriber_infos', []):
            fqn = self._node_full_name(s.node_name, s.node_namespace)
            entry = result.get(fqn)
            if entry is not None:
                entry['subscribes'].append({
                    'topic': topic,
                    'qos': self._qos_to_dict(s.qos_profile),
                })

    return result
470
+
471
def _get_topics_with_relations(self) -> dict:
    """Build the per-topic snapshot: type plus publisher/subscriber endpoints.

    Each endpoint is rendered as ``{'node': <full name>, 'qos': <dict|None>}``.
    """

    def _endpoint(info) -> dict:
        return {
            'node': self._node_full_name(info.node_name, info.node_namespace),
            'qos': self._qos_to_dict(info.qos_profile),
        }

    snapshot = {}
    for name, relation in self._topic_relations.items():
        entry = {'type': relation.get('type', 'unknown')}
        entry['publishers'] = [_endpoint(pub) for pub in relation.get('publisher_infos', [])]
        entry['subscribers'] = [_endpoint(sub) for sub in relation.get('subscriber_infos', [])]
        snapshot[name] = entry
    return snapshot
492
+
493
def _get_actions_with_relations(self) -> dict:
    """Build the action snapshot from ``…/_action/status`` topics.

    Returns:
        ``{action_name: {'providers': [node, ...]}}`` for every cached
        status topic that has at least one publisher. Providers are the
        publishers of that status topic.
    """
    action_suffix = '/_action/status'
    result = {}
    for topic, rel in self._topic_relations.items():
        if not (topic.endswith(action_suffix) and rel['publishers']):
            continue
        # Strip only the trailing suffix; str.replace would also remove an
        # occurrence in the middle of the name.
        action = topic.removesuffix(action_suffix)
        result[action] = {
            'providers': [
                self._node_full_name(p.node_name, p.node_namespace)
                for p in rel.get('publisher_infos', [])
            ],
        }
    return result
504
+
505
def _get_services_with_relations(self) -> dict:
    """Build the service snapshot: ``{name: {'type': ..., 'providers': []}}``.

    ``providers`` is always an empty list here.
    """
    snapshot = {}
    for service_name, service_type in self._active_services.items():
        snapshot[service_name] = {'type': service_type, 'providers': []}
    return snapshot
510
+
511
+ # ──────────────────────────────────────────────
512
+ # Delta-send: flush graph snapshots after each tick
513
+ # ──────────────────────────────────────────────
514
+
515
def _flush_graph_snapshots(self):
    """Send any graph section whose snapshot differs from the last one sent.

    Does nothing unless the graph is marked dirty and a connection and
    event loop are available. Sections are checked in the order nodes,
    topics, actions, services; each changed section is remembered as the
    new baseline and enqueued as its own message.
    """
    if not (self._graph_dirty and self.ws and self.loop):
        return
    self._graph_dirty = False
    self.get_logger().debug("[flush] graph dirty, checking snapshots")

    sections = (
        ('nodes', self._get_nodes_with_relations),
        ('topics', self._get_topics_with_relations),
        ('actions', self._get_actions_with_relations),
        ('services', self._get_services_with_relations),
    )
    for kind, build_snapshot in sections:
        current = build_snapshot()
        baseline_attr = f'_last_sent_{kind}'
        if current == getattr(self, baseline_attr):
            continue
        self.get_logger().info(f"[flush] {kind} changed ({len(current)} {kind})")
        setattr(self, baseline_attr, current.copy())
        self._enqueue({
            'type': kind, 'data': current, 'timestamp': time.time(),
        })
552
+
553
+ # ──────────────────────────────────────────────
554
+ # Topic subscriptions (gateway-requested)
555
+ # ──────────────────────────────────────────────
556
+
557
def _subscribe_to_topic(self, topic_name: str):
    """Create a ROS subscription for *topic_name* on behalf of the gateway.

    Silently ignores empty or non-string names and topics that are
    already subscribed. Logs and returns when the ``MAX_SUBSCRIPTIONS``
    cap is reached, when the topic is not in the graph, or when the
    message type cannot be resolved or the subscription cannot be
    created. On success the updated subscription list is sent.
    """
    if not topic_name or not isinstance(topic_name, str):
        return
    with self._topic_subs_lock:
        if topic_name in self._topic_subs:
            return
        if len(self._topic_subs) >= MAX_SUBSCRIPTIONS:
            self.get_logger().error(
                f"Subscription limit ({MAX_SUBSCRIPTIONS}) reached; "
                f"cannot subscribe to {topic_name}"
            )
            return

    types = dict(self.get_topic_names_and_types()).get(topic_name)
    if not types:
        self.get_logger().warning(f"Topic not found: {topic_name}")
        return

    # The topic name and its type come from outside this process; a type
    # that cannot be resolved must not raise into the receive loop.
    try:
        msg_class = get_message(types[0])
        sub = self.create_subscription(
            msg_class,
            topic_name,
            # Bind the name as a default so the callback keeps its own topic.
            lambda msg, t=topic_name: self._on_topic_msg(msg, t),
            QoSProfile(depth=10),
        )
    except Exception as e:
        self.get_logger().error(f"Failed to subscribe to {topic_name}: {e}")
        return

    with self._topic_subs_lock:
        self._topic_subs[topic_name] = sub

    self.get_logger().info(f"Subscribed to {topic_name}")
    if self.loop:
        asyncio.run_coroutine_threadsafe(
            self._send_bridge_subscriptions(), self.loop
        )
590
+
591
def _unsubscribe_from_topic(self, topic_name: str):
    """Destroy the subscription for *topic_name*, if any, and report the new list.

    Also drops the topic's rate-tracking state so a later re-subscribe
    starts with a clean rate estimate and no stale entries accumulate.
    Unknown topics are ignored.
    """
    with self._topic_subs_lock:
        sub = self._topic_subs.pop(topic_name, None)
    if sub is None:
        return

    self.destroy_subscription(sub)
    # Without this, the first message after a re-subscribe would record the
    # whole unsubscribed gap as one inter-message delta.
    self._topic_last_timestamp.pop(topic_name, None)
    self._topic_rate_history.pop(topic_name, None)

    self.get_logger().info(f"Unsubscribed from {topic_name}")
    if self.loop:
        asyncio.run_coroutine_threadsafe(
            self._send_bridge_subscriptions(), self.loop
        )
601
+
602
def _on_topic_msg(self, msg, topic_name: str):
    """Subscription callback: forward *msg* to the gateway with a rate estimate.

    Dropped when there is no connection or event loop. The rate is the
    number of stored inter-arrival gaps divided by their sum (at most
    ``_rate_history_depth`` gaps are kept per topic); it is ``None`` until
    at least one gap has been recorded.
    """
    if not (self.ws and self.loop):
        return

    ts = time.time()
    previous_ts = self._topic_last_timestamp.get(topic_name)
    if previous_ts is not None:
        gap = ts - previous_ts
        if gap > 0:
            self._topic_rate_history.setdefault(
                topic_name, deque(maxlen=self._rate_history_depth)
            ).append(gap)
    self._topic_last_timestamp[topic_name] = ts

    gaps = self._topic_rate_history.get(topic_name)
    rate = None
    if gaps:
        window = sum(gaps)
        if window > 0:
            rate = len(gaps) / window

    serialized = json.dumps({
        'type': 'topic_data',
        'topic': topic_name,
        'data': message_to_ordereddict(msg),
        'rate_hz': rate,
        'timestamp': ts,
    })
    asyncio.run_coroutine_threadsafe(self._send_queue.put(serialized), self.loop)
634
+
635
+ # ──────────────────────────────────────────────
636
+ # Parameters (async, lazy-loaded)
637
+ # ──────────────────────────────────────────────
638
+
639
def _refresh_empty_param_caches(self):
    """Re-request parameters for every active node whose cache entry is missing or empty."""
    for node_name in self._active_nodes:
        if self._node_parameter_cache.get(node_name):
            continue  # already has cached parameters
        self._fetch_node_parameters_async(node_name)
644
+
645
def _fetch_node_parameters_async(self, fqn: str):
    """Fetch parameters for *fqn* without blocking the executor.

    Issues ``list_parameters`` and then ``get_parameters`` as async
    service calls; the result is stored in ``_node_parameter_cache`` from
    the completion callbacks, after which the graph is marked dirty and
    flushed. Does nothing when a fetch for *fqn* is already in flight or
    the node's ``list_parameters`` service is not ready.

    Every exit path removes *fqn* from ``_pending_param_fetches``, so a
    failed attempt never blocks a later retry.
    """
    if fqn in self._pending_param_fetches:
        return

    list_client = self.create_client(ListParameters, f"{fqn}/list_parameters")
    if not list_client.service_is_ready():
        self.destroy_client(list_client)
        return

    self._pending_param_fetches.add(fqn)
    req = ListParameters.Request()
    req.depth = 10
    future = list_client.call_async(req)

    def _on_list(fut):
        self.destroy_client(list_client)
        # result() re-raises a failed call; treat that like "no response".
        try:
            response = fut.result()
        except Exception as e:
            self.get_logger().debug(f"[params] list_parameters failed for {fqn}: {e}")
            response = None
        if response is None or not response.result.names:
            self._pending_param_fetches.discard(fqn)
            return

        param_names = list(response.result.names)
        get_client = self.create_client(GetParameters, f"{fqn}/get_parameters")
        if not get_client.service_is_ready():
            # Calling a service that is not ready would leave the future,
            # and therefore the pending marker, unresolved.
            self.destroy_client(get_client)
            self._pending_param_fetches.discard(fqn)
            return
        get_req = GetParameters.Request()
        get_req.names = param_names
        get_future = get_client.call_async(get_req)

        def _on_get(gfut):
            self.destroy_client(get_client)
            self._pending_param_fetches.discard(fqn)
            try:
                get_resp = gfut.result()
            except Exception as e:
                self.get_logger().debug(f"[params] get_parameters failed for {fqn}: {e}")
                return
            if get_resp is None:
                return
            params = {}
            for name, value in zip(param_names, get_resp.values):
                try:
                    params[name] = parameter_value_to_python(value)
                except Exception:
                    # Best effort: skip values that cannot be converted.
                    pass
            self._node_parameter_cache[fqn] = params
            self._graph_dirty = True
            self.get_logger().debug(f"[params] cached {len(params)} params for {fqn}")
            self._flush_graph_snapshots()

        get_future.add_done_callback(_on_get)

    future.add_done_callback(_on_list)
697
+
698
+ # ──────────────────────────────────────────────
699
+ # Telemetry
700
+ # ──────────────────────────────────────────────
701
+
702
def _collect_telemetry(self):
    """Enqueue one ``telemetry`` message if telemetry is on and a connection exists."""
    if not (self._telemetry_enabled and self.ws and self.loop):
        return
    sample = {
        'type': 'telemetry',
        'data': self._get_telemetry_snapshot(),
        'timestamp': time.time(),
    }
    self._enqueue(sample)
710
+
711
def _get_telemetry_snapshot(self) -> dict:
    """Sample host CPU, RAM, disk, network and temperature via psutil.

    Side effects: appends the CPU reading to ``_cpu_history`` and updates
    ``_last_disk_io``, ``_last_net_io`` and ``_last_io_time``, which the
    next call uses to turn byte counters into rates.

    The ``load1`` / ``load5`` / ``load15`` fields are means over the last
    60 / 300 / 900 stored CPU readings. I/O rates are in MiB/s and are 0.0
    on the first call or when a counter cannot be read. ``cpu_c`` is
    ``None`` when no known temperature sensor is found.
    """
    mib = 1024 * 1024

    cpu_now = round(psutil.cpu_percent(interval=None), 1)
    self._cpu_history.append(cpu_now)

    def _mean_of_last(count: int) -> float | None:
        tail = list(self._cpu_history)[-count:]
        if not tail:
            return None
        return round(sum(tail) / len(tail), 1)

    def _rate(current_bytes, previous_bytes, elapsed) -> float:
        # Clamped at zero so a counter reset never yields a negative rate.
        return round(max(0.0, (current_bytes - previous_bytes) / elapsed / mib), 2)

    load1 = _mean_of_last(60)
    load5 = _mean_of_last(300)
    load15 = _mean_of_last(900)

    vm = psutil.virtual_memory()
    ram_percent = vm.percent

    now = time.time()
    disk_usage = psutil.disk_usage('/')

    # Disk throughput since the previous sample.
    disk_read_mbps = 0.0
    disk_write_mbps = 0.0
    try:
        disk_io = psutil.disk_io_counters()
        if self._last_disk_io is not None and self._last_io_time is not None:
            elapsed = now - self._last_io_time
            if elapsed > 0:
                disk_read_mbps = _rate(disk_io.read_bytes, self._last_disk_io.read_bytes, elapsed)
                disk_write_mbps = _rate(disk_io.write_bytes, self._last_disk_io.write_bytes, elapsed)
        self._last_disk_io = disk_io
    except Exception:
        pass

    # Network throughput since the previous sample.
    net_tx_mbps = 0.0
    net_rx_mbps = 0.0
    try:
        net_io = psutil.net_io_counters()
        if self._last_net_io is not None and self._last_io_time is not None:
            elapsed = now - self._last_io_time
            if elapsed > 0:
                net_tx_mbps = _rate(net_io.bytes_sent, self._last_net_io.bytes_sent, elapsed)
                net_rx_mbps = _rate(net_io.bytes_recv, self._last_net_io.bytes_recv, elapsed)
        self._last_net_io = net_io
    except Exception:
        pass

    self._last_io_time = now

    # First matching sensor group wins; the first entry of that group is reported.
    cpu_c = None
    try:
        sensors = psutil.sensors_temperatures()
        for sensor_name in ('coretemp', 'cpu-thermal', 'acpitz', 'k10temp', 'cpu_thermal'):
            readings = sensors.get(sensor_name)
            if readings:
                cpu_c = round(readings[0].current, 1)
                break
    except Exception:
        pass

    cpu_section = {
        'now': cpu_now,
        'load1': load1,
        'load5': load5,
        'load15': load15,
    }
    ram_section = {
        'percent': round(ram_percent, 1),
        'used_mb': round(vm.used / (1024 * 1024), 1),
        'total_mb': round(vm.total / (1024 * 1024), 1),
    }
    disk_section = {
        'percent': round(disk_usage.percent, 1),
        'used_gb': round(disk_usage.used / (1024 ** 3), 2),
        'total_gb': round(disk_usage.total / (1024 ** 3), 2),
        'read_mbps': disk_read_mbps,
        'write_mbps': disk_write_mbps,
    }
    return {
        'cpu': cpu_section,
        'ram': ram_section,
        'disk': disk_section,
        'net': {
            'tx_mbps': net_tx_mbps,
            'rx_mbps': net_rx_mbps,
        },
        'temp': {
            'cpu_c': cpu_c,
        },
    }
794
+
795
+ # ──────────────────────────────────────────────
796
+ # Helpers
797
+ # ──────────────────────────────────────────────
798
+
799
+ @staticmethod
800
+ def _node_full_name(name: str, namespace: str) -> str:
801
+ ns = namespace if namespace.endswith('/') else namespace + '/'
802
+ return ns + name
803
+
804
+ @staticmethod
805
+ def _qos_to_dict(qos) -> dict | None:
806
+ if not qos:
807
+ return None
808
+ return {
809
+ 'reliability': qos.reliability.name if hasattr(qos.reliability, 'name') else str(qos.reliability),
810
+ 'durability': qos.durability.name if hasattr(qos.durability, 'name') else str(qos.durability),
811
+ 'history': qos.history.name if hasattr(qos.history, 'name') else str(qos.history),
812
+ 'depth': qos.depth,
813
+ 'liveliness': qos.liveliness.name if hasattr(qos.liveliness, 'name') else str(qos.liveliness),
814
+ }
815
+
816
def _send_event_and_update(self, event: dict, log: str = ''):
    """Optionally log, enqueue *event* for sending, and mark the graph dirty.

    Args:
        event: payload handed to ``_enqueue``; skipped when falsy.
        log: optional debug-level log line; skipped when empty.

    NOTE(review): the dirty flag is set here even when *event* is falsy —
    confirm that matches the intended nesting upstream.
    """
    if log:
        self.get_logger().debug(log)
    if event:
        self._enqueue(event)
    self._graph_dirty = True
823
+
824
def _enqueue(self, payload: dict):
    """Serialize *payload* to JSON and schedule it onto the send queue.

    The put is handed to the WebSocket thread's event loop with
    ``run_coroutine_threadsafe``. The payload is dropped when there is no
    connection or no event loop.
    """
    if not (self.ws and self.loop):
        return
    put_coro = self._send_queue.put(json.dumps(payload))
    asyncio.run_coroutine_threadsafe(put_coro, self.loop)
831
+
832
+ # ──────────────────────────────────────────────
833
+ # Collector event handlers
834
+ # ──────────────────────────────────────────────
835
+
836
def _on_ros2_control_event(self, event: dict):
    """Collector callback: pass a ros2_control *event* straight to the send queue."""
    self._enqueue(event)
838
+
839
def _on_tf_tree_event(self, event: dict):
    """Collector callback: pass a TF-tree *event* straight to the send queue."""
    self._enqueue(event)
841
+
842
+ # ──────────────────────────────────────────────
843
+ # Cleanup
844
+ # ──────────────────────────────────────────────
845
+
846
def destroy_node(self):
    """Destroy the collectors, then the node itself.

    The base-class teardown runs even if a collector's ``destroy()``
    raises; the collector's exception still propagates afterwards.
    """
    try:
        self._ros2_control.destroy()
        if self._tf_tree is not None:
            self._tf_tree.destroy()
    finally:
        super().destroy_node()
851
+
852
+
853
def main(args=None):
    """Entry point: initialise rclpy, spin a ``WebBridge`` node, then clean up.

    Args:
        args: command-line arguments forwarded to ``rclpy.init``.

    ``KeyboardInterrupt`` and external shutdown end the spin quietly. Any
    error raised while constructing the node (for example a missing
    ``OSIRIS_AUTH_TOKEN``) still propagates, but only after rclpy has been
    shut down.
    """
    rclpy.init(args=args)
    node = None
    try:
        # Construct inside the try so a failing constructor still reaches `finally`.
        node = WebBridge()
        rclpy.spin(node)
    except (KeyboardInterrupt, rclpy.executors.ExternalShutdownException):
        pass
    finally:
        if node is not None:
            node.destroy_node()
        if rclpy.ok():
            rclpy.shutdown()


if __name__ == '__main__':
    main()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: osiris_agent
3
- Version: 0.3.76
3
+ Version: 0.3.78
4
4
  Summary: OSIRIS agent for ROS2/Humble
5
5
  Home-page: https://github.com/nicolaselielll/osiris_agent
6
6
  Author: Nicolas Tuomaala
@@ -6,7 +6,7 @@ long_description = (HERE / "README.md").read_text(encoding="utf-8")
6
6
 
7
7
  setup(
8
8
  name='osiris_agent',
9
- version='0.3.76',
9
+ version='0.3.78',
10
10
  description='OSIRIS agent for ROS2/Humble',
11
11
  long_description=long_description,
12
12
  long_description_content_type="text/markdown",
@@ -1,268 +0,0 @@
1
- import asyncio
2
- import os
3
- import random
4
- import threading
5
- import time
6
-
7
- import rclpy
8
- import websockets
9
- import json
10
-
11
- from rclpy.node import Node
12
-
13
- from osiris_agent import __version__ as AGENT_VERSION
14
-
15
- # ──────────────────────────────────────────────
16
- # Constants
17
- # ──────────────────────────────────────────────
18
# Seconds between graph polls.
GRAPH_CHECK_INTERVAL = 2.0
# Maximum number of topics enriched per tick.
TOPIC_BATCH_SIZE = 10
# Reconnect back-off bounds, in seconds.
RECONNECT_INITIAL_DELAY = 1
RECONNECT_MAX_DELAY = 30

# Service-name prefixes hidden from graph output (internal ROS2 plumbing).
_SUPPRESSED_SERVICE_PREFIXES = ('/ros2cli_daemon',)
25
-
26
-
27
class WebBridge(Node):
    """ROS 2 node that connects to the OSIRIS gateway over a WebSocket.

    Construction reads the gateway credentials from the environment,
    registers a periodic graph-check timer, and starts a daemon thread that
    owns an asyncio event loop running the WebSocket client with automatic
    reconnection.
    """

    def __init__(self):
        """Set up parameters, caches, the graph timer and the WS thread.

        Raises:
            ValueError: if ``OSIRIS_AUTH_TOKEN`` is unset or empty.
        """
        super().__init__('osiris_node')

        auth_token = os.environ.get('OSIRIS_AUTH_TOKEN')
        if not auth_token:
            raise ValueError("OSIRIS_AUTH_TOKEN environment variable must be set")

        base_url = os.environ.get('OSIRIS_WS_URL', 'wss://osiris-gateway.fly.dev')
        self.ws_url = f'{base_url}?robot=true&token={auth_token}'

        # Declare tunable parameters
        self.declare_parameter('graph_check_interval', GRAPH_CHECK_INTERVAL)
        self.declare_parameter('topic_batch_size', TOPIC_BATCH_SIZE)

        self.ws = None
        self.loop = None
        self._send_queue: asyncio.Queue | None = None
        # Set by _client_loop once the gateway accepts the connection; the
        # reconnect loop uses it to reset its back-off after a good session.
        self._session_authenticated = False

        # ── Existence caches (set of fully-qualified names) ───────────────────
        self._active_nodes: set[str] = set()
        self._active_topics: set[str] = set()
        self._active_services: dict[str, str] = {}
        self._active_actions: set[str] = set()

        # ── Relation caches (populated by enrichment) ─────────────────────────
        self._topic_relations: dict[str, dict] = {}

        # ── First-tick gate ───────────────────────────────────────────────────
        self._first_graph_check_done = False

        # ── Timers ────────────────────────────────────────────────────────────
        _graph_interval = self.get_parameter('graph_check_interval').get_parameter_value().double_value
        self._topic_batch_size = self.get_parameter('topic_batch_size').get_parameter_value().integer_value
        self.create_timer(_graph_interval, self._check_graph_changes)

        threading.Thread(target=self._run_ws_client, daemon=True).start()

        self.get_logger().info(
            f"🚀 Osiris agent v{AGENT_VERSION} — bisect 4a: first-tick scan enabled"
        )

    # ──────────────────────────────────────────────
    # WebSocket client
    # ──────────────────────────────────────────────

    def _run_ws_client(self):
        """Thread target: create an event loop and run the client until done."""
        self.loop = asyncio.new_event_loop()
        asyncio.set_event_loop(self.loop)
        self._send_queue = asyncio.Queue()
        self.loop.run_until_complete(self._client_loop_with_reconnect())

    async def _client_loop_with_reconnect(self):
        """Run gateway sessions back to back with exponential back-off.

        Every ended session, whether it raised or returned normally, is
        followed by a sleep, so a rejected login or a clean close cannot
        turn into a tight reconnect loop. The delay doubles up to
        ``RECONNECT_MAX_DELAY`` (plus up to one second of jitter) and is
        reset to ``RECONNECT_INITIAL_DELAY`` after any session that
        authenticated successfully.
        """
        delay = RECONNECT_INITIAL_DELAY
        while self.context.ok():
            self._session_authenticated = False
            error = None
            try:
                await self._client_loop()
            except Exception as exc:
                # Keep a reference: the ``as`` name is unbound after the block.
                error = exc

            if not self.context.ok():
                break

            if self._session_authenticated:
                # The previous session worked, so start the back-off over.
                delay = RECONNECT_INITIAL_DELAY

            if error is not None:
                self.get_logger().warning(
                    f"WebSocket error: {error}; retrying in {delay:.1f}s"
                )
            else:
                self.get_logger().info(
                    f"Gateway session ended; reconnecting in {delay:.1f}s"
                )

            await asyncio.sleep(delay)
            delay = min(delay * 2, RECONNECT_MAX_DELAY) + random.uniform(0, 1)

    async def _client_loop(self):
        """Run a single gateway session: connect, authenticate, send, receive.

        Returns early (after logging an error) when the auth reply cannot
        be read or is not an ``auth_success`` message. The send task and
        ``self.ws`` are always cleaned up on exit.
        """
        send_task = None
        self.get_logger().info('Connecting to gateway...')
        try:
            async with websockets.connect(self.ws_url) as ws:
                try:
                    auth_msg = await ws.recv()
                    auth_data = json.loads(auth_msg)
                except Exception:
                    self.get_logger().error('Failed to receive auth response from gateway')
                    return

                # A JSON reply that is not an object (list, string, ...) is
                # treated as a failed login instead of raising on ``.get``.
                reply = auth_data if isinstance(auth_data, dict) else {}
                if reply.get('type') != 'auth_success':
                    error_msg = reply.get('message', 'unknown') if reply else 'no response'
                    self.get_logger().error(f'Authentication failed: {error_msg}')
                    return

                self.get_logger().info('Connected and authenticated to gateway')
                self.ws = ws
                self._session_authenticated = True
                send_task = asyncio.create_task(self._send_loop(ws))

                await self._send_initial_state()
                await self._receive_loop(ws)
        finally:
            if send_task and not send_task.done():
                send_task.cancel()
                try:
                    await send_task
                except (asyncio.CancelledError, Exception):
                    pass
            if self.ws is not None:
                self.get_logger().warning('Disconnected from gateway')
            self.ws = None

    async def _send_loop(self, ws):
        """Forward queued messages to ``ws`` forever; re-raise on send failure."""
        while True:
            msg = await self._send_queue.get()
            try:
                await ws.send(msg)
            except Exception as e:
                self.get_logger().error(f"WS send failed: {e}")
                raise

    async def _receive_loop(self, ws):
        """Drain incoming frames (ignored) until close or context shutdown."""
        async for _ in ws:
            if not self.context.ok():
                break

    async def _send_initial_state(self):
        """Queue the agent version followed by an empty ``initial_state``."""
        await self._send_queue.put(json.dumps({
            'type': 'agent_version',
            'version': AGENT_VERSION,
        }))

        await self._send_queue.put(json.dumps({
            'type': 'initial_state',
            'timestamp': time.time(),
            'data': {
                'nodes': {},
                'topics': {},
                'actions': {},
                'services': {},
                'telemetry': None,
                'controllers': None,
                'hardware': None,
                'tf_tree': None,
            },
        }))
        self.get_logger().info("Sent initial_state (empty — bisect mode)")

    # ──────────────────────────────────────────────
    # Tier-1: cheap existence detection
    # ──────────────────────────────────────────────

    def _check_graph_changes(self):
        """Timer callback: scan the graph once, then do nothing on later ticks.

        The first call snapshots nodes, topics, services and actions into
        the existence caches and runs the full topic enrichment. Durations
        are measured with ``time.perf_counter`` so wall-clock adjustments
        cannot skew the logged timings.
        """
        if self._first_graph_check_done:
            return  # BISECT R1: skip all polling after first tick

        action_suffix = '/_action/status'

        t_start = time.perf_counter()
        node_pairs = list(self.get_node_names_and_namespaces())
        topic_type_list = self.get_topic_names_and_types()
        graph_ms = (time.perf_counter() - t_start) * 1000

        current_nodes = {self._node_full_name(n, ns) for n, ns in node_pairs}
        current_topics = {t for t, _ in topic_type_list}
        # Strip only the trailing marker; str.replace would also remove an
        # identical fragment occurring earlier in the action name.
        current_actions = {
            t.removesuffix(action_suffix)
            for t in current_topics
            if t.endswith(action_suffix)
        }
        self.get_logger().info(
            f"[poll] node+topic: {graph_ms:.1f}ms "
            f"({len(current_nodes)} nodes, {len(current_topics)} topics, {len(current_actions)} actions)"
        )

        t_services = time.perf_counter()
        service_type_list = self.get_service_names_and_types()
        services_ms = (time.perf_counter() - t_services) * 1000
        current_services = {
            s: types[0] if types else 'unknown'
            for s, types in service_type_list
            if not s.startswith(_SUPPRESSED_SERVICE_PREFIXES)
        }
        self.get_logger().info(
            f"[poll] service_scan: {services_ms:.1f}ms ({len(current_services)} services)"
        )

        self._first_graph_check_done = True
        self._active_nodes = current_nodes
        self._active_topics = current_topics
        self._active_services = current_services
        self._active_actions = current_actions

        t_enrich = time.perf_counter()
        self._do_full_initial_enrichment(topic_type_list, node_pairs)
        enrich_ms = (time.perf_counter() - t_enrich) * 1000
        self.get_logger().info(
            f"[poll] first tick complete: {len(current_nodes)} nodes, {len(current_topics)} topics, "
            f"{len(current_services)} services, {len(current_actions)} actions — "
            f"node+topic={graph_ms:.1f}ms enrichment={enrich_ms:.1f}ms"
        )

    # ──────────────────────────────────────────────
    # Initial full enrichment (called once on first tick)
    # ──────────────────────────────────────────────

    def _do_full_initial_enrichment(self, topic_type_list, node_pairs):
        """Fill ``_topic_relations`` for every topic in ``_active_topics``.

        Args:
            topic_type_list: ``(topic, [types])`` pairs from the graph query.
            node_pairs: accepted for interface compatibility; not used here.

        Topics whose endpoint lookup raises are skipped, with a debug log
        entry. A topic with a missing or empty type list is recorded with
        type ``'unknown'``.
        """
        topic_type_map = dict(topic_type_list)
        for topic in self._active_topics:
            try:
                pub_infos = self.get_publishers_info_by_topic(topic)
                sub_infos = self.get_subscriptions_info_by_topic(topic)
            except Exception as exc:
                # Best effort: one bad topic must not abort the whole scan.
                self.get_logger().debug(f"Skipping enrichment for {topic}: {exc}")
                continue
            publishers = {self._node_full_name(p.node_name, p.node_namespace) for p in pub_infos}
            subscribers = {self._node_full_name(s.node_name, s.node_namespace) for s in sub_infos}
            # ``or`` also covers an empty type list, which would fail on [0].
            topic_types = topic_type_map.get(topic) or ['unknown']
            self._topic_relations[topic] = {
                'publishers': publishers,
                'subscribers': subscribers,
                'publisher_infos': pub_infos,
                'subscriber_infos': sub_infos,
                'type': topic_types[0],
            }

    # ──────────────────────────────────────────────
    # Helpers
    # ──────────────────────────────────────────────

    @staticmethod
    def _node_full_name(name: str, namespace: str) -> str:
        """Join ``namespace`` and ``name`` with exactly one separating slash."""
        ns = namespace if namespace.endswith('/') else namespace + '/'
        return ns + name

    # ──────────────────────────────────────────────
    # Cleanup
    # ──────────────────────────────────────────────

    def destroy_node(self):
        """Destroy the node; currently only delegates to the base class."""
        super().destroy_node()
252
-
253
-
254
def main(args=None):
    """Initialise rclpy, spin a ``WebBridge`` node, and always clean up.

    The node is constructed inside the ``try`` block so that a failure in
    ``WebBridge()`` (for example a missing auth token) still reaches
    ``rclpy.shutdown()``. The exception itself propagates to the caller
    after cleanup. ``KeyboardInterrupt`` and ``ExternalShutdownException``
    are treated as a normal exit.

    Args:
        args: command-line arguments forwarded to ``rclpy.init``.
    """
    rclpy.init(args=args)
    node = None
    try:
        node = WebBridge()
        rclpy.spin(node)
    except (KeyboardInterrupt, rclpy.executors.ExternalShutdownException):
        pass
    finally:
        # node stays None when the constructor itself raised.
        if node is not None:
            node.destroy_node()
        if rclpy.ok():
            rclpy.shutdown()


if __name__ == '__main__':
    main()
File without changes
File without changes
File without changes