remoteRF-server-testing 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. remoteRF_server/__init__.py +0 -0
  2. remoteRF_server/common/__init__.py +0 -0
  3. remoteRF_server/common/grpc/__init__.py +1 -0
  4. remoteRF_server/common/grpc/grpc_host_pb2.py +63 -0
  5. remoteRF_server/common/grpc/grpc_host_pb2_grpc.py +97 -0
  6. remoteRF_server/common/grpc/grpc_pb2.py +59 -0
  7. remoteRF_server/common/grpc/grpc_pb2_grpc.py +97 -0
  8. remoteRF_server/common/idl/__init__.py +1 -0
  9. remoteRF_server/common/idl/device_schema.py +39 -0
  10. remoteRF_server/common/idl/pluto_schema.py +174 -0
  11. remoteRF_server/common/idl/schema.py +358 -0
  12. remoteRF_server/common/utils/__init__.py +6 -0
  13. remoteRF_server/common/utils/ansi_codes.py +120 -0
  14. remoteRF_server/common/utils/api_token.py +21 -0
  15. remoteRF_server/common/utils/db_connection.py +35 -0
  16. remoteRF_server/common/utils/db_location.py +24 -0
  17. remoteRF_server/common/utils/list_string.py +5 -0
  18. remoteRF_server/common/utils/process_arg.py +80 -0
  19. remoteRF_server/drivers/__init__.py +0 -0
  20. remoteRF_server/drivers/adalm_pluto/__init__.py +0 -0
  21. remoteRF_server/drivers/adalm_pluto/pluto_remote_server.py +105 -0
  22. remoteRF_server/host/__init__.py +0 -0
  23. remoteRF_server/host/host_auth_token.py +292 -0
  24. remoteRF_server/host/host_directory_store.py +142 -0
  25. remoteRF_server/host/host_tunnel_server.py +1388 -0
  26. remoteRF_server/server/__init__.py +0 -0
  27. remoteRF_server/server/acc_perms.py +317 -0
  28. remoteRF_server/server/cert_provider.py +184 -0
  29. remoteRF_server/server/device_manager.py +688 -0
  30. remoteRF_server/server/grpc_server.py +1023 -0
  31. remoteRF_server/server/reservation.py +811 -0
  32. remoteRF_server/server/rpc_manager.py +104 -0
  33. remoteRF_server/server/user_group_cli.py +723 -0
  34. remoteRF_server/server/user_group_handler.py +1120 -0
  35. remoteRF_server/serverrf_cli.py +1377 -0
  36. remoteRF_server/tools/__init__.py +191 -0
  37. remoteRF_server/tools/gen_certs.py +274 -0
  38. remoteRF_server/tools/gist_status.py +139 -0
  39. remoteRF_server/tools/gist_status_testing.py +67 -0
  40. remoterf_server_testing-0.0.0.dist-info/METADATA +612 -0
  41. remoterf_server_testing-0.0.0.dist-info/RECORD +44 -0
  42. remoterf_server_testing-0.0.0.dist-info/WHEEL +5 -0
  43. remoterf_server_testing-0.0.0.dist-info/entry_points.txt +2 -0
  44. remoterf_server_testing-0.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1388 @@
1
+ # src/remoteRF_host/host/host_tunnel_server.py
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+ import threading
7
+ import queue
8
+ import time
9
+ import secrets
10
+ import traceback
11
+ import hashlib
12
+ import re
13
+ import uuid
14
+
15
+ from dataclasses import dataclass
16
+ from typing import Dict, Optional, Tuple, Any, List, Set, Iterator
17
+
18
+ from concurrent.futures import Future
19
+ from concurrent import futures
20
+
21
+ import grpc
22
+
23
+ from ..common.utils.process_arg import map_arg
24
+ from ..common.grpc import grpc_pb2 as generic_pb2
25
+ from ..common.grpc import grpc_host_pb2_grpc as host_tunnel_pb2_grpc
26
+ from ..common.grpc import grpc_host_pb2 as host_tunnel_pb2
27
+
28
+ from .host_directory_store import (
29
+ EnvStore,
30
+ DeviceIdConflictError,
31
+ now_ms,
32
+ sanitize_env_key,
33
+ csv_split,
34
+ cfg_dir_from_file,
35
+ )
36
+
37
+ from .host_auth_token import is_host_token_valid, list_hosts
38
+
39
+ # =============================================================================
40
+ # Hardcoded logging mode (NO env vars)
41
+ #
42
+ # Pick ONE:
43
+ # LOG_MODE = "OFF" -> show nothing
44
+ # LOG_MODE = "WARN" -> show only WARN + ERROR
45
+ # LOG_MODE = "ALL" -> show everything (INFO + WARN + ERROR)
46
+ # =============================================================================
47
+
48
# Active verbosity; one of "OFF", "WARN" or "ALL" (see the banner comment above).
LOG_MODE = "OFF"

# Held while a log line is written so output from different threads does not interleave.
_LOG_LOCK = threading.Lock()

# Accepted host_id syntax: 1..64 characters out of letters, digits, '_', '.' and '-'.
HOST_ID_RE = re.compile(r"^[A-Za-z0-9_.-]{1,64}$")  # adjust if you want
51
+
52
def _grpc_err_summary(e: grpc.RpcError) -> str:
    """Render a gRPC error as ``"<STATUS_NAME>: <details>"``.

    Both accessors on *e* are called defensively: if ``code()`` fails or
    returns a falsy value the status name is ``"UNKNOWN"``, and if
    ``details()`` fails or returns a falsy value the details are empty.
    Surrounding whitespace is stripped from the final string.
    """
    status_name = "UNKNOWN"
    try:
        status = e.code()
        if status:
            status_name = status.name
    except Exception:
        status_name = "UNKNOWN"

    try:
        text = e.details()
        if not text:
            text = ""
    except Exception:
        text = ""

    summary = f"{status_name}: {text}"
    return summary.strip()
63
+
64
def _host_status(host_id: str) -> str:
    """Return the provisioning status recorded for *host_id*, normalised.

    Scans the ``(host_id, status)`` pairs produced by ``list_hosts()`` and
    returns the first match, stripped and lower-cased. Returns ``""`` when the
    host is not listed, when its status is empty, or when the lookup raises
    (this helper is best-effort and never propagates errors).
    """
    try:
        for known_id, status in list_hosts():
            if known_id != host_id:
                continue
            normalised = (status or "").strip()
            return normalised.lower()
    except Exception:
        # Best-effort: any lookup failure is reported as "no status".
        return ""
    return ""
72
+
73
def _auth_fail_code_and_details(host_id: str) -> tuple[str, str]:
    """Choose the status-code name and message for a failed host authentication.

    Returns:
        ``(code_name, details)`` where ``code_name`` is ``"UNAUTHENTICATED"``
        for an unknown host or a bad token on an approved host, and
        ``"PERMISSION_DENIED"`` for a known host whose status is not
        ``"approved"``.
    """
    status = _host_status(host_id)

    if status == "approved":
        # Host exists and is approved, so the token itself must be wrong.
        return ("UNAUTHENTICATED", f"Invalid host_token for host_id={host_id!r}.")

    if status:
        return ("PERMISSION_DENIED", f"host_id={host_id!r} not approved (status={status!r}).")

    return ("UNAUTHENTICATED", f"Unknown host_id={host_id!r}. Ask admin to provision a host token.")
80
+
81
class HostIdConflictError(RuntimeError):
    """Raised when a host_id is claimed while another live session still holds it.

    Attributes:
        host_id: The contested identifier.
        age_ms: Milliseconds since the existing session's last heartbeat.
        replace_after_ms: Heartbeat age after which a takeover is permitted.
    """

    def __init__(self, host_id: str, *, age_ms: int, replace_after_ms: int) -> None:
        message = (
            f"host_id '{host_id}' already in use (last heartbeat {age_ms}ms ago). "
            f"Try again after {replace_after_ms}ms or fix the conflicting host config."
        )
        super().__init__(message)
        self.host_id = host_id
        self.age_ms = age_ms
        self.replace_after_ms = replace_after_ms
90
+
91
def _validate_host_id(raw: str) -> str:
    """Return *raw* stripped of surrounding whitespace if it is a usable host_id.

    Raises:
        ValueError: if the value is empty, equals the placeholder
            ``"unknown-host"``, or does not match ``HOST_ID_RE``.
    """
    candidate = (raw or "").strip()

    if candidate == "":
        raise ValueError("Missing host_id (set HOST_ID in host.env; do not auto-generate).")

    if candidate == "unknown-host":
        raise ValueError("host_id cannot be 'unknown-host' (set HOST_ID in host.env).")

    if HOST_ID_RE.fullmatch(candidate) is None:
        problem = (
            "Invalid host_id. Allowed: 1..64 chars of [A-Za-z0-9_.-]. "
            f"Got: {candidate!r}"
        )
        raise ValueError(problem)

    return candidate
103
+
104
def _ts() -> str:
    """Return the current local time formatted as ``HH:MM:SS.mmm``."""
    now = time.time()
    whole_seconds = int(now)
    millis = int((now - whole_seconds) * 1000)
    clock = time.strftime("%H:%M:%S", time.localtime(whole_seconds))
    return f"{clock}.{millis:03d}"
110
+
111
def _should_log(level: str) -> bool:
    """Tell whether a message at *level* passes the module-wide ``LOG_MODE``.

    Both values are compared case-insensitively. ``"OFF"`` suppresses
    everything, ``"ALL"`` admits INFO/WARN/ERROR, and ``"WARN"`` (as well as
    any unrecognised mode) admits only WARN/ERROR.
    """
    requested = (level or "").upper()
    mode = (LOG_MODE or "").upper()

    if mode == "OFF":
        return False

    if mode == "ALL":
        admitted = ("INFO", "WARN", "ERROR")
    else:
        # Covers "WARN" and the fallback for unknown modes.
        admitted = ("WARN", "ERROR")
    return requested in admitted
121
+
122
def _log(level: str, msg: str) -> None:
    """Write one timestamped log line to stderr if *level* is currently enabled.

    The line carries the time, the level padded to five columns, the calling
    thread's name and a fixed component tag. The write happens under
    ``_LOG_LOCK``.
    """
    if _should_log(level):
        thread_name = threading.current_thread().name
        with _LOG_LOCK:
            line = f"{_ts()} {level:<5} [{thread_name}] hostrf.host_tunnel_server: {msg}"
            print(line, file=sys.stderr, flush=True)
128
+
129
def info(msg: str) -> None:
    """Emit *msg* through ``_log`` at INFO level."""
    level = "INFO"
    _log(level, msg)
131
+
132
def warn(msg: str) -> None:
    """Emit *msg* through ``_log`` at WARN level."""
    level = "WARN"
    _log(level, msg)
134
+
135
def error(msg: str) -> None:
    """Emit *msg* through ``_log`` at ERROR level."""
    level = "ERROR"
    _log(level, msg)
137
+
138
def exception(msg: str) -> None:
    """Log *msg* at ERROR level, followed by the traceback of the active exception.

    Intended to be called from inside an ``except`` block. The traceback is
    limited to 80 frames and written to stderr.

    Nothing is written when ERROR messages are filtered out by ``LOG_MODE``.
    Previously the traceback was printed unconditionally, so ``"OFF"`` mode
    still produced output. The traceback is written while holding
    ``_LOG_LOCK`` so it does not interleave with lines from other threads.
    """
    if not _should_log("ERROR"):
        return
    _log("ERROR", msg)
    # _LOG_LOCK is a plain (non-reentrant) lock; _log has already released it here.
    with _LOG_LOCK:
        traceback.print_exc(limit=80, file=sys.stderr)
141
+
142
+ # Helpers
143
+
144
def host_key(host_id: str) -> str:
    """Return a short, stable key for *host_id*.

    The key is the first 16 hex characters of the SHA-256 digest of the UTF-8
    encoded id; a falsy id is hashed as the empty string.
    """
    digest = hashlib.sha256()
    digest.update((host_id or "").encode("utf-8"))
    return digest.hexdigest()[:16]  # short stable key
147
+
148
def _safe_thread_tag(s: str) -> str:
    """Turn *s* into a tag restricted to ``[A-Za-z0-9_.-]``, at most 64 characters.

    Each run of other characters is replaced by a single underscore. A falsy
    input is treated as ``"peer"``.
    """
    source = s if s else "peer"
    cleaned = re.sub(r"[^A-Za-z0-9_.-]+", "_", source)
    return cleaned[:64]
151
+
152
def _copy_argument_map(dst_map: Any, src_map: Any) -> None:
    """Copy every entry of *src_map* into *dst_map* using ``CopyFrom``.

    Keys are converted with ``str``. ``dst_map[key]`` is expected to yield an
    object with a ``CopyFrom`` method. A ``None`` source is a no-op.

    If direct iteration or indexing of *src_map* fails, the copy is retried
    over ``dict(src_map).items()``.
    """
    if src_map is None:
        return

    try:
        for key in src_map:
            target = dst_map[str(key)]
            target.CopyFrom(src_map[key])
    except Exception:
        # Fallback path for sources that cannot be walked key-by-key.
        as_dict = dict(src_map)
        for key, value in as_dict.items():
            dst_map[str(key)].CopyFrom(value)
161
+
162
+ # Types
163
+
164
@dataclass(frozen=True)
class DeviceRoute:
    """Immutable routing entry: which host serves a device, and under which local id."""

    # Identifier of the host that announced the device.
    host_id: str
    # The device's id as known on that host.
    host_local_id: int
168
+
169
+
170
@dataclass
class HostStatus:
    """Mutable liveness record for one host."""

    host_id: str
    # True while the host is considered connected.
    online: bool = False
    # Millisecond timestamp of the last status change or heartbeat; 0 = never seen.
    last_seen_ms: int = 0
175
+
176
+
177
@dataclass
class DeviceStatus:
    """Mutable liveness record for one device and the host that owns it."""

    device_id: str
    host_id: str
    # The device's id as known on its host.
    host_local_id: int
    # True while the owning host is considered connected.
    online: bool = False
    # Millisecond timestamp of the last status change or heartbeat; 0 = never seen.
    last_seen_ms: int = 0
184
+
185
+ # Session
186
+
187
class HostSession:
    """State kept for one connected host.

    Attributes:
        host_id: Identifier of the host.
        peer: Peer description supplied by the caller (``"<?>"`` by default).
        session_uuid: Short identifier used in log lines; generated if not given.
        out_q: Bounded queue of outbound frames. ``close()`` enqueues ``None``
            as a sentinel (its consumer is not visible in this block).
        inflight: ``req_id -> Future`` for requests awaiting a response.
        inflight_lock: Guards ``inflight``.
        last_heartbeat_ms: Initialised to the creation time.
        alive: Cleared by ``close()``.
        outbound_max: Capacity of ``out_q``.
    """

    def __init__(self, host_id: str, *, peer: str = "<?>", session_uuid: str = "", outbound_max: int = 2048) -> None:
        capacity = int(outbound_max)

        self.host_id = host_id
        self.peer = peer
        self.session_uuid = session_uuid if session_uuid else uuid.uuid4().hex[:10]

        self.out_q: "queue.Queue[Optional[host_tunnel_pb2.HostFrame]]" = queue.Queue(maxsize=capacity)
        self.inflight: Dict[str, Future] = {}
        self.inflight_lock = threading.Lock()
        self.last_heartbeat_ms = now_ms()
        self.alive = True
        self.outbound_max = capacity

        info(
            f"[session] created host_id={self.host_id!r} sess={self.session_uuid} "
            f"peer={self.peer} outbound_max={self.outbound_max}"
        )

    def send(self, frame: host_tunnel_pb2.HostFrame, *, timeout: float = 2.0) -> bool:
        """Queue *frame* for delivery to the host.

        Waits up to *timeout* seconds for room in the outbound queue.

        Returns:
            ``True`` if the frame was queued; ``False`` if the session is
            closed, the queue stayed full, or any other error occurred.
        """
        frame_type = frame.WhichOneof("msg")

        if not self.alive:
            warn(f"[session] send refused (dead session) host_id={self.host_id!r} sess={self.session_uuid} type={frame_type}")
            return False

        queued = False
        try:
            self.out_q.put(frame, timeout=timeout)
            info(f"[session] queued outbound host_id={self.host_id!r} sess={self.session_uuid} type={frame_type}")
            queued = True
        except queue.Full:
            warn(
                f"[session] outbound queue FULL host_id={self.host_id!r} sess={self.session_uuid} "
                f"max={self.outbound_max} drop type={frame_type}"
            )
        except Exception:
            exception(f"[session] send exception host_id={self.host_id!r} sess={self.session_uuid} type={frame_type}")
        return queued

    def close(self) -> None:
        """Mark the session dead and fail every request still awaiting a response.

        Does nothing if the session is already closed. A ``None`` sentinel is
        offered to the outbound queue without blocking; failure to enqueue it
        is logged, not raised.
        """
        if not self.alive:
            return

        info(f"[session] closing host_id={self.host_id!r} sess={self.session_uuid} peer={self.peer}")
        self.alive = False

        # NOTE(review): if the queue is full the sentinel is dropped; confirm the
        # consumer also watches `alive`.
        try:
            self.out_q.put_nowait(None)
        except Exception:
            exception(f"[session] close: failed to enqueue sentinel host_id={self.host_id!r} sess={self.session_uuid}")

        with self.inflight_lock:
            pending = list(self.inflight.items())
            for req_id, waiter in pending:
                if waiter.done():
                    continue
                waiter.set_exception(RuntimeError(f"Host '{self.host_id}' disconnected (req_id={req_id})"))
            self.inflight.clear()

        info(f"[session] closed host_id={self.host_id!r} sess={self.session_uuid}")
242
+
243
+ # Registry
244
+
245
+ class HostTunnelRegistry:
246
+
247
def __init__(self, *, active_cache_ttl_ms: int = 250, dir_cache_ttl_ms: int = 1000) -> None:
    """Create an empty registry and populate it from the persisted directory.

    Args:
        active_cache_ttl_ms: Lifetime of the cached set of live host ids.
        dir_cache_ttl_ms: Default lifetime of the cached device directory.
    """
    # Re-entrant so that locked helpers can be called from locked callers.
    self._lock = threading.RLock()

    # Live sessions: host_id -> session.
    self._hosts: Dict[str, HostSession] = {}

    # Directory state.
    self._routes: Dict[str, DeviceRoute] = {}  # device_id -> route(host_id, local_id)
    self._host_devices: Dict[str, Set[str]] = {}  # host_id -> set(device_id)
    self._device_infos: Dict[str, host_tunnel_pb2.DeviceInfo] = {}  # device_id -> DeviceInfo

    # Liveness records.
    self._host_status: Dict[str, HostStatus] = {}
    self._device_status: Dict[str, DeviceStatus] = {}

    # Persistence: two env-style stores inside the config directory.
    cfg_dir = cfg_dir_from_file(__file__)
    cfg_dir.mkdir(parents=True, exist_ok=True)
    self._env_lists = EnvStore(cfg_dir / "host_directory.env")
    self._env_meta = EnvStore(cfg_dir / "host_directory_meta.env")

    # Caches; a timestamp of 0 means "invalid".
    self._active_hosts_cache: Set[str] = set()
    self._active_hosts_cache_ts_ms: int = 0
    self._active_hosts_cache_ttl_ms: int = int(active_cache_ttl_ms)

    self._dir_cache: Dict[str, Tuple[str, Any, bool]] = {}
    self._dir_cache_ts_ms: int = 0
    self._dir_cache_ttl_ms: int = int(dir_cache_ttl_ms)

    info(f"[registry] init cfg_dir={cfg_dir}")
    self._load_persisted()
279
+
280
+ # caching helpers
281
+
282
def _invalidate_caches_locked(self) -> None:
    """Force both caches to be rebuilt on next use by zeroing their timestamps.

    The ``_locked`` suffix indicates the caller is expected to hold ``self._lock``.
    """
    self._active_hosts_cache_ts_ms, self._dir_cache_ts_ms = 0, 0
285
+
286
def _refresh_active_hosts_cache_locked(self, now: int) -> None:
    """Rebuild the set of live host ids unless the cached set is still fresh.

    The cache is fresh when its timestamp is non-zero and younger than the
    configured TTL relative to *now* (milliseconds). The ``_locked`` suffix
    indicates the caller is expected to hold ``self._lock``.
    """
    stamp = self._active_hosts_cache_ts_ms
    if stamp and (now - stamp) < self._active_hosts_cache_ttl_ms:
        return

    live = set()
    for host_id, session in self._hosts.items():
        if session is not None and session.alive:
            live.add(host_id)

    self._active_hosts_cache = live
    self._active_hosts_cache_ts_ms = now
291
+
292
+ # persistence load
293
+
294
def _load_persisted(self) -> None:
    """Rebuild the in-memory directory from the two persisted env stores.

    All directory and status maps are cleared and refilled. Every host and
    device is loaded as offline with ``last_seen_ms=0``. Devices are gathered
    from ``KNOWN_DEVICES`` and from every ``HOST_*_DEVICES`` list; their
    metadata comes from ``DEVICE_<key>_*`` entries in the meta store.

    Persisted device ids that are not canonical decimal strings are skipped
    with a warning. The check is restricted to ASCII digits: ``str.isdigit()``
    alone also accepts characters such as ``"²"``, which ``int()`` rejects, so
    a corrupt entry could previously raise ``ValueError`` and abort
    construction of the registry.
    """
    kv_lists = self._env_lists.read_kv()
    kv_meta = self._env_meta.read_kv()

    known_hosts = csv_split(kv_lists.get("KNOWN_HOSTS", ""))
    known_devices = csv_split(kv_lists.get("KNOWN_DEVICES", ""))

    info(f"[registry] load persisted: known_hosts={len(known_hosts)} known_devices={len(known_devices)}")

    def _is_canonical_gid(text: str) -> bool:
        # ASCII decimal digits, and no leading zero unless the value is exactly "0".
        if not (text.isascii() and text.isdigit()):
            return False
        return text == "0" or not text.startswith("0")

    def _meta(device_key: str, field: str, default: str = "") -> str:
        # Read one DEVICE_<key>_<FIELD> value, treating missing/empty as *default*.
        return (kv_meta.get(f"DEVICE_{device_key}_{field}", "") or default).strip()

    with self._lock:
        self._routes.clear()
        self._host_devices.clear()
        self._device_infos.clear()
        self._host_status.clear()
        self._device_status.clear()

        # Seed known hosts as offline.
        for hid in known_hosts:
            if hid:
                self._host_status[hid] = HostStatus(host_id=hid, online=False, last_seen_ms=0)
                self._host_devices.setdefault(hid, set())

        devices_from_host_lists: Set[str] = set()
        for key, value in kv_lists.items():
            if key.startswith("HOST_") and key.endswith("_DEVICES"):
                devices_from_host_lists.update(csv_split(value))

        all_devices: List[str] = sorted(set(known_devices) | devices_from_host_lists)

        for device_id in all_devices:
            device_id = (device_id or "").strip()
            if not device_id:
                continue

            # STRICT MODE: ignore any persisted non-numeric / non-canonical IDs
            if not _is_canonical_gid(device_id):
                warn(f"[registry] skipping persisted non-numeric/non-canonical device_id={device_id!r}")
                continue

            dk = sanitize_env_key(device_id)

            host_id = _meta(dk, "HOST")
            try:
                local_id = int(_meta(dk, "LOCAL_ID", "0"))
            except Exception:
                # An unparsable local id falls back to 0 rather than failing the load.
                local_id = 0

            label = _meta(dk, "LABEL")
            serial = _meta(dk, "SERIAL")
            kind = _meta(dk, "KIND")

            # A route is only created when the owning host is recorded.
            if host_id:
                if host_id not in self._host_status:
                    self._host_status[host_id] = HostStatus(host_id=host_id, online=False, last_seen_ms=0)
                self._host_devices.setdefault(host_id, set()).add(device_id)
                self._routes[device_id] = DeviceRoute(host_id=host_id, host_local_id=int(local_id))

            self._device_infos[device_id] = host_tunnel_pb2.DeviceInfo(
                device_id=str(device_id),
                local_id=int(local_id),
                label=str(label),
                serial=str(serial),
                kind=str(kind),
            )

            self._device_status[device_id] = DeviceStatus(
                device_id=str(device_id),
                host_id=str(host_id),
                host_local_id=int(local_id),
                online=False,
                last_seen_ms=0,
            )

        self._invalidate_caches_locked()

    info(
        f"[registry] persisted directory loaded: routes={len(self._routes)} "
        f"hosts={len(self._host_status)} devices={len(self._device_status)}"
    )
374
+
375
+ # persistence write helpers
376
+
377
def _persist_host_meta(self, host_id: str, *, host_name: str = "", platform: str = "", version: str = "") -> None:
    """Write host metadata to the meta store under keys derived from ``host_key(host_id)``.

    The ``HOST_<key>_ID`` entry is only written if absent. Name, platform and
    version are upserted, each only when a non-empty value is supplied.
    """
    hk = host_key(host_id)
    self._env_meta.set_kv_if_absent(f"HOST_{hk}_ID", host_id)

    optional_fields = (
        ("NAME", host_name),
        ("PLATFORM", platform),
        ("VERSION", version),
    )
    for suffix, value in optional_fields:
        if value:
            self._env_meta.upsert_kv(f"HOST_{hk}_{suffix}", value)
386
+
387
def persist_host_meta_from_hello(self, hello: Any) -> None:
    """Persist ``host_id`` and ``version`` taken from a hello message.

    Missing attributes are read as empty strings. Nothing is persisted when
    the host id is empty. Any failure is logged and swallowed.
    """
    try:
        host_id = str(getattr(hello, "host_id", "") or "").strip()
        version = str(getattr(hello, "version", "") or "").strip()
        if not host_id:
            return
        self._persist_host_meta(host_id, version=version)
        info(f"[registry] persisted host meta from hello: host_id={host_id!r} version={version!r}")
    except Exception:
        exception("[registry] persist_host_meta_from_hello failed")
396
+
397
def _persist_host(self, host_id: str) -> None:
    """Add *host_id* to the persisted ``KNOWN_HOSTS`` list and log it."""
    store = self._env_lists
    store.append_to_csv_list("KNOWN_HOSTS", host_id)
    info(f"[registry] persisted host list: host_id={host_id!r}")
400
+
401
def _persist_device(self, host_id: str, device_id: str, local_id: int, device_info: Any) -> None:
    """Persist one device: list membership, owning host, local id and descriptive fields.

    Args:
        host_id: Host that owns the device.
        device_id: Global device id.
        local_id: The device's id on its host.
        device_info: Object whose ``label``, ``serial`` and ``kind``
            attributes (if present and non-empty) are stored.
    """
    lists, meta = self._env_lists, self._env_meta

    lists.append_to_csv_list("KNOWN_DEVICES", device_id)
    hk = host_key(host_id)
    meta.set_kv_if_absent(f"HOST_{hk}_ID", host_id)
    lists.append_to_csv_list(f"HOST_{hk}_DEVICES", device_id)

    dk = sanitize_env_key(device_id)
    meta.upsert_kv(f"DEVICE_{dk}_HOST", host_id)
    meta.upsert_kv(f"DEVICE_{dk}_LOCAL_ID", str(int(local_id)))

    label = str(getattr(device_info, "label", "") or "").strip()
    serial = str(getattr(device_info, "serial", "") or "").strip()
    kind = str(getattr(device_info, "kind", "") or "").strip()

    # Descriptive fields are only written when they carry a value.
    for suffix, value in (("LABEL", label), ("SERIAL", serial), ("KIND", kind)):
        if value:
            meta.upsert_kv(f"DEVICE_{dk}_{suffix}", value)

    info(
        f"[registry] persisted device: host_id={host_id!r} device_id={device_id!r} local_id={int(local_id)} "
        f"label={label!r} serial={serial!r} kind={kind!r}"
    )
426
+
427
+ # directory/status mutators
428
+
429
def register_host(self, host_id: str, session: HostSession, *, replace_if_stale_ms: int = 15_000) -> None:
    """Install *session* as the live session for *host_id* and mark the host online.

    If another live session already holds the id, it is replaced (and closed)
    only when its last heartbeat is at least *replace_if_stale_ms* old.

    Raises:
        HostIdConflictError: if the existing live session's heartbeat is
            more recent than *replace_if_stale_ms*.
    """
    info(
        f"[registry] register_host: host_id={host_id!r} incoming_sess={session.session_uuid} "
        f"incoming_peer={session.peer}"
    )

    with self._lock:
        previous = self._hosts.get(host_id)
        if previous is not None and previous.alive:
            last_beat = int(getattr(previous, "last_heartbeat_ms", 0) or 0)
            age_ms = int(now_ms() - last_beat)

            # Shared tail of both log messages below.
            parties = (
                f"incoming_sess={session.session_uuid} incoming_peer={session.peer} "
                f"existing_sess={getattr(previous,'session_uuid','?')} existing_peer={getattr(previous,'peer','?')}"
            )

            if age_ms < replace_if_stale_ms:
                warn(f"[registry] HostIdConflict host_id={host_id!r} age_ms={age_ms} {parties}")
                raise HostIdConflictError(host_id, age_ms=age_ms, replace_after_ms=replace_if_stale_ms)

            warn(f"[registry] replacing STALE host_id={host_id!r} age_ms={age_ms} {parties}")
            previous.close()

        self._hosts[host_id] = session

        status = self._host_status.get(host_id)
        if not status:
            status = HostStatus(host_id=host_id)
        status.online = True
        status.last_seen_ms = now_ms()
        self._host_status[host_id] = status

        self._host_devices.setdefault(host_id, set())
        self._persist_host(host_id)
        self._invalidate_caches_locked()

    info(f"[registry] register_host done: host_id={host_id!r} online=True incoming_sess={session.session_uuid}")
467
+
468
def _mark_host_offline_locked(self, host_id: str) -> None:
    """Remove and close the host's session, then mark host and devices offline.

    Shared by ``drop_host`` and ``drop_host_if_match``. The caller must hold
    ``self._lock``. A single timestamp is used for the host and all of its
    devices so that one drop is recorded with one time.
    """
    sess = self._hosts.pop(host_id, None)
    if sess is not None:
        sess.close()

    stamp = now_ms()

    hs = self._host_status.get(host_id) or HostStatus(host_id=host_id)
    hs.online = False
    hs.last_seen_ms = stamp
    self._host_status[host_id] = hs

    for did in self._host_devices.get(host_id, set()):
        ds = self._device_status.get(did)
        if ds is not None:
            ds.online = False
            ds.last_seen_ms = stamp

    self._invalidate_caches_locked()


def drop_host(self, host_id: str) -> None:
    """Disconnect *host_id* unconditionally.

    Closes its session (if any) and marks the host and all of its devices
    offline. Unknown hosts get an offline status record.
    """
    info(f"[registry] drop_host: host_id={host_id!r}")
    with self._lock:
        self._mark_host_offline_locked(host_id)

    info(f"[registry] drop_host done: host_id={host_id!r} online=False")


def drop_host_if_match(self, host_id: str, session: HostSession) -> bool:
    """Disconnect *host_id* only if *session* is still its registered session.

    The identity check stops a stream that has already been replaced from
    tearing down its successor.

    Returns:
        ``True`` if the host was dropped, ``False`` if another session (or
        none) is registered under *host_id*.
    """
    with self._lock:
        if self._hosts.get(host_id) is not session:
            return False
        self._mark_host_offline_locked(host_id)

    info(f"[registry] drop_host_if_match done: host_id={host_id!r} online=False")
    return True
515
+
516
def heartbeat(self, host_id: str, unix_ms: int = 0) -> None:
    """Record a heartbeat: mark *host_id* and all of its devices online.

    Args:
        host_id: Host the heartbeat belongs to.
        unix_ms: Timestamp to record; when 0 the current time is used.
    """
    stamp = int(unix_ms) or now_ms()

    with self._lock:
        status = self._host_status.get(host_id)
        if not status:
            status = HostStatus(host_id=host_id)
        status.online = True
        status.last_seen_ms = stamp
        self._host_status[host_id] = status

        for device_id in self._host_devices.get(host_id, set()):
            device = self._device_status.get(device_id)
            if device is None:
                continue
            device.online = True
            device.last_seen_ms = stamp

        self._invalidate_caches_locked()

    info(f"[registry] heartbeat: host_id={host_id!r} unix_ms={stamp}")
533
+
534
def announce_devices(self, host_id: str, devices: list) -> None:
    """Register the devices announced by *host_id* and mark them online.

    Validation (STRICT MODE) happens before any state is touched:

    - ``device_id`` must be a canonical decimal string made of ASCII digits
      (e.g. ``"10"``); ``"010"`` and ``"pluto:..."`` are rejected.
    - The same id may not appear twice in one announce.
    - Entries with an empty id are skipped with a warning.

    The conflict check and the update run inside a single critical section.
    They previously acquired the lock separately, which let another host
    claim a device between the check and the apply step.

    Args:
        host_id: Announcing host.
        devices: Objects exposing ``device_id``, ``local_id``, ``label``,
            ``serial`` and ``kind`` attributes.

    Raises:
        ValueError: for a non-numeric, non-canonical or duplicated id.
        DeviceIdConflictError: if an id is already routed to another host.
    """
    info(f"[registry] announce_devices: host_id={host_id!r} n={len(devices)}")

    pairs: List[Tuple[Any, str]] = []
    seen: Set[str] = set()
    skipped_empty = 0

    for d in devices:
        raw = str(getattr(d, "device_id", "") or "").strip()
        if not raw:
            skipped_empty += 1
            continue

        # isascii() guards against digits such as "²" that isdigit() accepts
        # but that are not decimal numbers.
        if not (raw.isascii() and raw.isdigit()):
            raise ValueError(
                f"Host device_id must be a numeric gid string (e.g. '10'). "
                f"Got {raw!r} from host_id={host_id!r}."
            )

        # Canonical form of an ASCII digit string: no leading zeros ("0" stays "0").
        canon = raw.lstrip("0") or "0"
        if raw != canon:
            raise ValueError(
                f"Host device_id must be canonical decimal with no leading zeros. "
                f"Got {raw!r}; use {canon!r} (host_id={host_id!r})."
            )

        if canon in seen:
            raise ValueError(
                f"Duplicate device_id {canon!r} in device announce from host_id={host_id!r}."
            )
        seen.add(canon)
        pairs.append((d, canon))

    now = now_ms()
    with self._lock:
        # Conflict check first (no partial apply); same lock scope as the apply.
        for _, device_id in pairs:
            existing = self._routes.get(device_id)
            if existing is not None and existing.host_id != host_id:
                warn(
                    f"[registry] device_id conflict: device_id={device_id!r} "
                    f"existing_host={existing.host_id!r} new_host={host_id!r}"
                )
                raise DeviceIdConflictError(
                    device_id=device_id,
                    existing_host=existing.host_id,
                    new_host=host_id,
                )

        self._host_devices.setdefault(host_id, set())

        # One warning per skipped entry, emitted only once validation has passed.
        for _ in range(skipped_empty):
            warn(f"[registry] announce_devices: skipping device with empty device_id host_id={host_id!r}")

        for d, device_id in pairs:
            try:
                local_id = int(getattr(d, "local_id", 0))
            except Exception:
                local_id = 0

            label = str(getattr(d, "label", "") or "")
            serial = str(getattr(d, "serial", "") or "")
            kind = str(getattr(d, "kind", "") or "")

            info(
                f"[registry] device: host_id={host_id!r} local_id={local_id} "
                f"device_id={device_id!r} label={label!r} serial={serial!r} kind={kind!r}"
            )

            self._routes[device_id] = DeviceRoute(host_id=host_id, host_local_id=local_id)
            self._host_devices[host_id].add(device_id)

            ds = self._device_status.get(device_id) or DeviceStatus(
                device_id=device_id,
                host_id=host_id,
                host_local_id=local_id,
            )
            ds.host_id = host_id
            ds.host_local_id = local_id
            ds.online = True
            ds.last_seen_ms = now
            self._device_status[device_id] = ds

            try:
                tmp = host_tunnel_pb2.DeviceInfo()
                tmp.CopyFrom(d)
                # ensure stored info reflects canonical device_id key
                tmp.device_id = str(device_id)
                self._device_infos[device_id] = tmp
            except Exception:
                # *d* could not be copied as a DeviceInfo; rebuild one from its fields.
                self._device_infos[device_id] = host_tunnel_pb2.DeviceInfo(
                    device_id=str(device_id),
                    local_id=int(local_id),
                    label=label,
                    serial=serial,
                    kind=kind,
                )

            self._persist_device(host_id, device_id, local_id, d)

        self._invalidate_caches_locked()

    info(f"[registry] announce_devices done: host_id={host_id!r} routes={len(self._routes)}")
642
+
643
+ # FAST PATHS (use per-RPC)
644
+
645
def is_host_device(self, device_id: str) -> bool:
    """Return True if *device_id* (whitespace-stripped) has a route in the registry."""
    key = (device_id or "").strip()
    if key == "":
        return False
    with self._lock:
        known = key in self._routes
    return known
651
+
652
def is_host_device_active(self, device_id: str) -> bool:
    """Return True if *device_id* is routed to a host that currently has a live session.

    Liveness comes from the cached set of active hosts, which is refreshed
    here when stale. An empty or unknown id yields False.
    """
    key = (device_id or "").strip()
    if key == "":
        return False

    current_ms = now_ms()
    with self._lock:
        route = self._routes.get(key)
        if route is not None:
            self._refresh_active_hosts_cache_locked(current_ms)
            return route.host_id in self._active_hosts_cache
        return False
663
+
664
def get_host_for_device(self, device_id: str) -> Optional[str]:
    """Return the host_id that *device_id* is routed to, or None if there is no route."""
    key = (device_id or "").strip()
    if key == "":
        return None
    with self._lock:
        route = self._routes.get(key)
        if route:
            return route.host_id
        return None
671
+
672
+ # UI/diagnostics snapshots
673
+
674
def device_directory_cached(self, *, ttl_ms: Optional[int] = None) -> Dict[str, Tuple[str, Any, bool]]:
    """Return a snapshot of every routed device.

    Args:
        ttl_ms: Maximum age of a cached snapshot that may be reused;
            defaults to the registry's directory-cache TTL.

    Returns:
        ``device_id -> (host_id, device_info, is_active)``. The mapping is a
        fresh dict on every call; ``device_info`` is a copy of the stored
        ``DeviceInfo`` taken when the snapshot was built (or ``None`` if no
        info is stored).
    """
    current_ms = now_ms()
    max_age = int(self._dir_cache_ttl_ms if ttl_ms is None else ttl_ms)

    with self._lock:
        cached_at = self._dir_cache_ts_ms
        if cached_at and (current_ms - cached_at) < max_age:
            return dict(self._dir_cache)

        self._refresh_active_hosts_cache_locked(current_ms)

        snapshot: Dict[str, Tuple[str, Any, bool]] = {}
        for device_id, route in self._routes.items():
            stored = self._device_infos.get(device_id)
            exposed = stored
            if stored is not None:
                clone = host_tunnel_pb2.DeviceInfo()
                try:
                    clone.CopyFrom(stored)
                except Exception:
                    # Copy failed: hand out the stored object itself.
                    exposed = stored
                else:
                    exposed = clone

            active = route.host_id in self._active_hosts_cache
            snapshot[device_id] = (route.host_id, exposed, active)

        self._dir_cache = snapshot
        self._dir_cache_ts_ms = current_ms
        return dict(snapshot)
702
+
703
+ # misc accessors
704
+
705
def list_routes(self) -> Dict[str, Tuple[str, int]]:
    """Return ``device_id -> (host_id, host_local_id)`` for every known route."""
    with self._lock:
        table: Dict[str, Tuple[str, int]] = {}
        for device_id, route in self._routes.items():
            table[device_id] = (route.host_id, route.host_local_id)
        return table
708
+
709
def list_hosts(self) -> Dict[str, HostStatus]:
    """Return a shallow copy of the ``host_id -> HostStatus`` table.

    The dict is new; the ``HostStatus`` values are the registry's own objects.
    """
    with self._lock:
        snapshot = dict(self._host_status)
    return snapshot
712
+
713
def list_devices(self) -> Dict[str, DeviceStatus]:
    """Return a shallow copy of the ``device_id -> DeviceStatus`` table.

    The dict is new; the ``DeviceStatus`` values are the registry's own objects.
    """
    with self._lock:
        snapshot = dict(self._device_status)
    return snapshot
716
+
717
+ # forwarding
718
+
719
def _get_session_and_route(self, device_id: str) -> Tuple[HostSession, DeviceRoute]:
    """Look up the route for *device_id* and the live session of its host.

    Returns:
        ``(session, route)``.

    Raises:
        KeyError: if no route exists for the device.
        RuntimeError: if the owning host has no session or its session is
            not alive.
    """
    with self._lock:
        route = self._routes.get(str(device_id))
        if route is None:
            raise KeyError(f"Unknown device_id={device_id}")

        session = self._hosts.get(route.host_id)
        connected = session is not None and session.alive
        if not connected:
            raise RuntimeError(f"Host '{route.host_id}' not connected for device {device_id}")

        return session, route
730
+
731
def forward_request(
    self,
    *,
    device_id: str,
    request: "generic_pb2.GenericRPCRequest",
    timeout_sec: float,
    deadline_unix_ms: int = 0,
    cancel_on_timeout: bool = False,
) -> "generic_pb2.GenericRPCResponse":
    """Send *request* to the host that owns *device_id* and wait for its reply.

    Failures are reported in-band: the returned response carries
    ``results["Ok"] = False`` and a message in ``results["Error"]``. That
    covers route/session lookup failure, a full or closed outbound queue,
    timeout, host disconnect and an error reported by the host.

    Args:
        device_id: Global device id (canonical decimal string).
        request: The RPC to forward.
        timeout_sec: Maximum time to wait for the host's response.
        deadline_unix_ms: Optional absolute deadline passed through to the host.
        cancel_on_timeout: When True, a ``Cancel`` frame for the request is
            sent to the host if the wait times out.

    Returns:
        The host's response on success, otherwise a synthesized failure
        response.

    Changes relative to the previous implementation:
        - A timeout now yields a descriptive ``Error`` text;
          ``str(TimeoutError())`` is empty, so callers used to get ``""``.
        - The in-flight entry is removed in a ``finally`` block, so it no
          longer leaks if building or sending the frame raises.
        - ``Cancel`` is only sent for real timeouts, not for other failures
          such as a disconnect, where the session is already closed.
        - Durations are measured with the monotonic clock.
    """

    def _failure(message: str) -> "generic_pb2.GenericRPCResponse":
        # Build the in-band error response used by every failure path.
        failed = generic_pb2.GenericRPCResponse()
        failed.results["Ok"].CopyFrom(map_arg(False))
        failed.results["Error"].CopyFrom(map_arg(message))
        return failed

    fn = str(getattr(request, "function_name", "") or "")
    info(f"[forward] device_id={device_id!r} fn={fn!r} timeout_sec={timeout_sec}")

    try:
        sess, route = self._get_session_and_route(str(device_id))
    except Exception as e:
        exception(f"[forward] route/session lookup failed device_id={device_id!r} err={e!r}")
        return _failure(f"Route/session lookup failed: {e}")

    req_id = secrets.token_hex(12)
    fut: Future = Future()

    # Register before sending so a fast response always finds its future.
    with sess.inflight_lock:
        sess.inflight[req_id] = fut

    try:
        try:
            gid_u32 = int(str(device_id))  # strict mode: device_id is canonical gid string
        except Exception:
            gid_u32 = 0

        rpc_req = host_tunnel_pb2.RpcRequest(
            req_id=req_id,
            global_device_id=int(gid_u32),
            local_device_id=int(route.host_local_id),
            device_id=str(device_id),
            deadline_unix_ms=int(deadline_unix_ms or 0),
        )
        rpc_req.request.CopyFrom(request)

        frame = host_tunnel_pb2.HostFrame(rpc_request=rpc_req)

        info(f"[forward] -> host_id={sess.host_id!r} req_id={req_id} local_id={route.host_local_id}")

        if not sess.send(frame, timeout=2.0):
            return _failure(f"Host '{sess.host_id}' outbound queue full / not writable.")

        t0 = time.monotonic()
        try:
            resp: host_tunnel_pb2.RpcResponse = fut.result(timeout=timeout_sec)
        except Exception as e:
            dt_ms = int((time.monotonic() - t0) * 1000)
            warn(f"[forward] wait failed req_id={req_id} dt_ms={dt_ms} err={e!r}")

            timed_out = isinstance(e, futures.TimeoutError)

            if timed_out and cancel_on_timeout:
                try:
                    ok = sess.send(host_tunnel_pb2.HostFrame(cancel=host_tunnel_pb2.Cancel(req_id=req_id)))
                    info(f"[forward] sent cancel req_id={req_id} ok={ok}")
                except Exception:
                    exception(f"[forward] failed sending cancel req_id={req_id}")

            if timed_out:
                message = (
                    f"Timed out after {timeout_sec}s waiting for host "
                    f"'{sess.host_id}' (req_id={req_id})."
                )
            else:
                # Never hand an empty error text to the caller.
                message = str(e) or repr(e)
            return _failure(message)

        dt_ms = int((time.monotonic() - t0) * 1000)
        info(f"[forward] <- resp req_id={req_id} ok={bool(resp.ok)} dt_ms={dt_ms} err={str(resp.error or '')!r}")

        if not resp.ok:
            return _failure(resp.error or "Remote host error.")

        return resp.response
    finally:
        # Always forget the request, whatever path was taken above.
        with sess.inflight_lock:
            sess.inflight.pop(req_id, None)
820
+
821
def forward_request_by_host_device(
    self,
    *,
    host_id: str,
    device_id: str,
    request: "generic_pb2.GenericRPCRequest",
    timeout_sec: float,
    deadline_unix_ms: int = 0,
    cancel_on_timeout: bool = False,
) -> "generic_pb2.GenericRPCResponse":
    """Forward *request* to *device_id*, first verifying it is routed to *host_id*.

    If the device is unknown or belongs to another host, a response with
    ``results["Ok"] = False`` and an explanatory ``results["Error"]`` is
    returned. Otherwise the call is delegated to ``forward_request``.
    """
    with self._lock:
        route = self._routes.get(str(device_id))

        problem = None
        if route is None:
            warn(f"[forward] unknown device_id={device_id!r}")
            problem = f"Unknown device_id={device_id}"
        elif route.host_id != host_id:
            warn(f"[forward] device_id={device_id!r} not on host_id={host_id!r} (actual={route.host_id!r})")
            problem = f"device_id={device_id} is not on host_id={host_id}"

        if problem is not None:
            rejected = generic_pb2.GenericRPCResponse()
            rejected.results["Ok"].CopyFrom(map_arg(False))
            rejected.results["Error"].CopyFrom(map_arg(problem))
            return rejected

    # The lock is released before the (potentially slow) forward.
    return self.forward_request(
        device_id=device_id,
        request=request,
        timeout_sec=timeout_sec,
        deadline_unix_ms=deadline_unix_ms,
        cancel_on_timeout=cancel_on_timeout,
    )
854
+
855
+ # Convenience forwarders
856
+
857
def handle_host_device(
    registry: HostTunnelRegistry,
    *,
    host_id: str,
    device_id: str,
    function_name: str,
    args: Any,
    timeout_sec: float = 10.0,
) -> Dict[str, Any]:
    """Build a generic RPC request from *function_name*/*args* and forward it.

    The request is sent through ``registry.forward_request_by_host_device``
    with cancel-on-timeout enabled. The ``results`` map of the response is
    returned as a plain ``dict``.
    """
    info(f"[handle_host_device] host_id={host_id!r} device_id={device_id!r} fn={function_name!r}")

    rpc_request = generic_pb2.GenericRPCRequest(function_name=str(function_name))
    _copy_argument_map(rpc_request.args, args)

    forward_kwargs = {
        "host_id": host_id,
        "device_id": device_id,
        "request": rpc_request,
        "timeout_sec": timeout_sec,
        "cancel_on_timeout": True,
    }
    reply = registry.forward_request_by_host_device(**forward_kwargs)
    return dict(reply.results)
878
+
879
def handle_host_device_request(
    registry: HostTunnelRegistry,
    *,
    host_id: str,
    device_id: str,
    request: "generic_pb2.GenericRPCRequest",
    timeout_sec: float = 10.0,
) -> "generic_pb2.GenericRPCResponse":
    """Forward an already-built *request* to a device on a specific host.

    Logs the target and the request's function name, then delegates to
    ``registry.forward_request_by_host_device`` with cancel-on-timeout
    enabled and returns its response unchanged.
    """
    banner = (
        f"[handle_host_device_request] host_id={host_id!r} device_id={device_id!r} "
        f"fn={str(getattr(request,'function_name','') or '')!r}"
    )
    info(banner)

    reply = registry.forward_request_by_host_device(
        host_id=host_id,
        device_id=device_id,
        request=request,
        timeout_sec=timeout_sec,
        cancel_on_timeout=True,
    )
    return reply
898
+
899
+ # Servicer
900
+
901
class HostTunnelServicer(host_tunnel_pb2_grpc.HostTunnelServicer):
    """gRPC servicer that terminates host tunnel streams.

    Each ``Connect`` call is one bidirectional stream from a host. Frames
    received from the host are applied to the shared registry; frames queued
    on the host's session are streamed back to it.
    """

    def __init__(self, registry: HostTunnelRegistry) -> None:
        self.registry = registry

    def Connect(self, request_iterator: Iterator[host_tunnel_pb2.HostFrame], context):
        """Serve one host stream: consume inbound frames, yield outbound frames.

        A daemon thread drains ``request_iterator`` (hello, device_announce,
        rpc_response, meta_response, heartbeat, cancel). This generator is the
        outbound pump: it yields whatever the session's ``out_q`` produces
        until the stream is stopped.

        Fatal protocol errors (invalid host id, failed auth, host/device id
        conflicts) are recorded by the inbound thread in ``fatal`` and the
        stream is then aborted with the matching gRPC status code.
        """
        try:
            peer = context.peer()
        except Exception:
            peer = "unknown-peer"

        peer_tag = _safe_thread_tag(peer)
        stream_uuid = uuid.uuid4().hex[:10]
        info(f"[Connect] new stream peer={peer} stream_sess={stream_uuid}")

        host_id: Optional[str] = None
        session: Optional[HostSession] = None
        stop = threading.Event()
        fatal: Dict[str, str] = {}  # {"code": "...", "details": "..."}

        # IMPORTANT: only drop the host if THIS stream successfully registered it,
        # and only if the registry still points at THIS session.
        registered = False

        cleanup_lock = threading.Lock()
        cleanup_done = False

        # Fatal codes that map onto a specific gRPC status; anything else
        # is reported as UNKNOWN.
        status_for_code = {
            "ALREADY_EXISTS": grpc.StatusCode.ALREADY_EXISTS,
            "INVALID_ARGUMENT": grpc.StatusCode.INVALID_ARGUMENT,
            "UNAUTHENTICATED": grpc.StatusCode.UNAUTHENTICATED,
            "PERMISSION_DENIED": grpc.StatusCode.PERMISSION_DENIED,
        }

        def _cleanup(reason: str, *, detail: str = "") -> None:
            """Tear the stream down once; later calls are no-ops."""
            nonlocal cleanup_done
            with cleanup_lock:
                if cleanup_done:
                    return
                cleanup_done = True

            # stop pumps first
            try:
                stop.set()
            except Exception:
                pass

            hid = host_id or "?"
            det = (detail or "").strip()
            if det:
                warn(f"[host] offline host_id={hid!r} reason={reason} detail={det}")
            else:
                warn(f"[host] offline host_id={hid!r} reason={reason}")

            # If we never registered, do NOT drop: it could be a conflict with a real live host.
            try:
                if registered and host_id is not None and session is not None:
                    self.registry.drop_host_if_match(host_id, session)
            except Exception:
                # Best effort: keep tearing down, but leave a trace.
                exception(f"[host] drop_host_if_match failed host_id={hid!r}")

            # close session object (no-op if already closed)
            try:
                if session is not None:
                    session.close()
            except Exception:
                exception(f"[host] session close failed host_id={hid!r}")

        def _on_rpc_done():
            try:
                _cleanup("grpc-stream-done")
            except Exception:
                pass

        try:
            context.add_callback(_on_rpc_done)
        except Exception:
            pass

        def _fail(code: str, details: str) -> None:
            """Record a fatal error and stop the outbound pump."""
            # One update call publishes both fields together; stop is set
            # only afterwards, so the pump always sees a complete record.
            fatal.update(code=code, details=details)
            stop.set()

        def inbound_loop():
            nonlocal host_id, session, registered
            info(f"[inbound] start peer={peer} stream_sess={stream_uuid}")

            try:
                for frame in request_iterator:
                    which = frame.WhichOneof("msg")
                    if which is None:
                        warn(f"[inbound] frame with no msg peer={peer} stream_sess={stream_uuid}")
                        continue

                    info(f"[inbound] IN type={which} peer={peer} stream_sess={stream_uuid} host_id={host_id or '?'}")

                    if which == "hello":
                        # Guard: only accept one HELLO per stream.
                        if session is not None or host_id is not None:
                            warn(f"[inbound] duplicate HELLO ignored host_id={host_id!r} peer={peer} stream_sess={stream_uuid}")
                            continue

                        hello = frame.hello
                        raw_id = str(getattr(hello, "host_id", "") or "")
                        raw_tok = str(getattr(hello, "host_token", "") or "")

                        try:
                            host_id = _validate_host_id(raw_id)
                        except ValueError as e:
                            error(f"[inbound] invalid hello.host_id peer={peer} stream_sess={stream_uuid} err={e}")
                            _fail("INVALID_ARGUMENT", str(e))
                            return

                        # Auth gate: MUST pass before we register the host_id.
                        if not is_host_token_valid(host_id, raw_tok, require_status="approved"):
                            code, details = _auth_fail_code_and_details(host_id)
                            error(
                                f"[inbound] auth failed host_id={host_id!r} peer={peer} stream_sess={stream_uuid} "
                                f"code={code} details={details}"
                            )
                            _fail(code, details)
                            return

                        ver = str(getattr(hello, "version", "") or "")
                        info(f"[inbound] HELLO host_id={host_id!r} version={ver!r} peer={peer} stream_sess={stream_uuid}")

                        # ONE session per stream
                        session = HostSession(host_id, peer=peer, session_uuid=stream_uuid)

                        # ONE register per stream
                        try:
                            self.registry.register_host(host_id, session, replace_if_stale_ms=15_000)
                            registered = True
                        except HostIdConflictError as e:
                            error(f"[inbound] HostIdConflictError host_id={host_id!r} stream_sess={stream_uuid} err={e}")
                            # Publish the fatal record before closing the
                            # session, so the pump sees it whichever way
                            # it leaves its loop.
                            fatal.update(code="ALREADY_EXISTS", details=str(e))
                            try:
                                session.close()
                            except Exception:
                                pass
                            stop.set()
                            return

                        self.registry.persist_host_meta_from_hello(hello)

                        # Ask host for metadata immediately on join
                        try:
                            mrid = secrets.token_hex(12)
                            ok = session.send(
                                host_tunnel_pb2.HostFrame(
                                    meta_request=host_tunnel_pb2.MetaRequest(
                                        req_id=mrid,
                                        include_platform=True,
                                        include_env=True,
                                        include_devices=True,
                                    )
                                )
                            )
                            info(
                                f"[inbound] sent MetaRequest req_id={mrid} ok={ok} "
                                f"host_id={host_id!r} stream_sess={stream_uuid}"
                            )
                        except Exception:
                            exception("[inbound] FAILED sending MetaRequest")

                        # heartbeat ack
                        try:
                            ok = session.send(
                                host_tunnel_pb2.HostFrame(
                                    heartbeat=host_tunnel_pb2.Heartbeat(unix_ms=now_ms())
                                )
                            )
                            info(
                                f"[inbound] sent heartbeat ack ok={ok} "
                                f"host_id={host_id!r} stream_sess={stream_uuid}"
                            )
                        except Exception:
                            exception("[inbound] FAILED sending heartbeat ack")

                    elif which == "device_announce":
                        if session is None or host_id is None:
                            warn(f"[inbound] device_announce before hello/session: ignoring peer={peer} stream_sess={stream_uuid}")
                            continue

                        ann = frame.device_announce
                        n = len(getattr(ann, "devices", []))
                        info(
                            f"[inbound] DEVICE_ANNOUNCE host_id={host_id!r} n={n} stream_sess={stream_uuid} "
                            f"unix_ms={int(getattr(ann,'unix_ms',0) or 0)} full_snapshot={bool(getattr(ann,'full_snapshot',False))}"
                        )

                        for d in list(ann.devices):
                            info(
                                f"[inbound] device local_id={int(getattr(d,'local_id',0))} "
                                f"label={str(getattr(d,'label','') or '')!r} "
                                f"device_id={str(getattr(d,'device_id','') or '')!r} "
                                f"serial={str(getattr(d,'serial','') or '')!r} "
                                f"kind={str(getattr(d,'kind','') or '')!r}"
                            )

                        try:
                            self.registry.announce_devices(host_id, list(ann.devices))
                            info(f"[inbound] registry announce applied host_id={host_id!r} stream_sess={stream_uuid}")
                        except ValueError as e:
                            error(
                                f"[inbound] INVALID_ARGUMENT device announce host_id={host_id!r} "
                                f"stream_sess={stream_uuid} err={e}"
                            )
                            _fail("INVALID_ARGUMENT", str(e))
                            return
                        except DeviceIdConflictError as e:
                            error(f"[inbound] DeviceIdConflictError host_id={host_id!r} stream_sess={stream_uuid} err={e}")
                            _fail("ALREADY_EXISTS", str(e))
                            return
                        except Exception:
                            exception(f"[inbound] announce_devices crashed host_id={host_id!r} stream_sess={stream_uuid}")

                        # heartbeat ack after announce
                        try:
                            ok = session.send(
                                host_tunnel_pb2.HostFrame(
                                    heartbeat=host_tunnel_pb2.Heartbeat(unix_ms=now_ms())
                                )
                            )
                            info(
                                f"[inbound] sent heartbeat ack after announce ok={ok} "
                                f"host_id={host_id!r} stream_sess={stream_uuid}"
                            )
                        except Exception:
                            exception("[inbound] FAILED sending heartbeat ack after announce")

                    elif which == "rpc_response":
                        if session is None:
                            warn(f"[inbound] rpc_response before session: ignoring peer={peer} stream_sess={stream_uuid}")
                            continue

                        resp = frame.rpc_response
                        rid = str(getattr(resp, "req_id", "") or "")
                        ok = bool(getattr(resp, "ok", False))
                        err_s = str(getattr(resp, "error", "") or "")
                        info(f"[inbound] RPC_RESPONSE req_id={rid} ok={ok} error={err_s!r} stream_sess={stream_uuid}")

                        with session.inflight_lock:
                            fut = session.inflight.get(rid)

                        if fut is not None and not fut.done():
                            fut.set_result(resp)
                            info(f"[inbound] delivered rpc_response to waiter req_id={rid} stream_sess={stream_uuid}")
                        else:
                            warn(f"[inbound] rpc_response has no inflight waiter req_id={rid} stream_sess={stream_uuid}")

                    elif which == "meta_response":
                        # Device list is carried in MetaResponse.meta.devices.
                        if host_id is None or session is None:
                            warn(f"[inbound] meta_response before hello/session: ignoring peer={peer} stream_sess={stream_uuid}")
                            continue

                        mr = frame.meta_response
                        ok = bool(getattr(mr, "ok", False))
                        err_s = str(getattr(mr, "error", "") or "")
                        info(f"[inbound] META_RESPONSE ok={ok} error={err_s!r} stream_sess={stream_uuid}")

                        if ok:
                            try:
                                meta = getattr(mr, "meta", None)
                                devs = list(getattr(meta, "devices", []) or [])
                            except Exception:
                                devs = []

                            if devs:
                                info(f"[inbound] META_RESPONSE applying {len(devs)} devices to registry host_id={host_id!r}")
                                try:
                                    self.registry.announce_devices(host_id, devs)
                                    self.registry.heartbeat(host_id, now_ms())
                                except ValueError as e:
                                    error(f"[inbound] META_RESPONSE INVALID_ARGUMENT host_id={host_id!r} err={e}")
                                    _fail("INVALID_ARGUMENT", str(e))
                                    return
                                except DeviceIdConflictError as e:
                                    error(f"[inbound] META_RESPONSE DeviceIdConflictError host_id={host_id!r} err={e}")
                                    _fail("ALREADY_EXISTS", str(e))
                                    return
                                except Exception:
                                    exception("[inbound] META_RESPONSE announce_devices failed")
                            else:
                                info("[inbound] META_RESPONSE had no meta.devices (or empty)")

                    elif which == "heartbeat":
                        if session is not None and host_id is not None:
                            # A missing/zero timestamp falls back to the server clock.
                            ms = int(getattr(frame.heartbeat, "unix_ms", 0) or 0) or now_ms()
                            session.last_heartbeat_ms = ms
                            self.registry.heartbeat(host_id, ms)
                            info(f"[inbound] HEARTBEAT host_id={host_id!r} unix_ms={ms} stream_sess={stream_uuid}")
                        else:
                            warn(
                                f"[inbound] heartbeat before hello/session: ignoring peer={peer} "
                                f"stream_sess={stream_uuid} host_id={(host_id or '?')!r}"
                            )

                    elif which == "cancel":
                        try:
                            rid = str(getattr(frame.cancel, "req_id", "") or "")
                            warn(f"[inbound] CANCEL received from host?? req_id={rid} peer={peer} stream_sess={stream_uuid}")
                        except Exception:
                            exception("[inbound] cancel frame parse failed")

                    else:
                        warn(f"[inbound] unknown frame type={which} peer={peer} stream_sess={stream_uuid}")

            except grpc.RpcError as e:
                warn(
                    f"[inbound] stream ended peer={peer} stream_sess={stream_uuid} "
                    f"host_id={host_id or '?'} grpc={_grpc_err_summary(e)}"
                )
            except Exception:
                exception(f"[inbound] loop crashed peer={peer} stream_sess={stream_uuid} host_id={host_id or '?'}")
            finally:
                info(f"[inbound] exiting peer={peer} stream_sess={stream_uuid} host_id={host_id or '?'}")
                if fatal:
                    # This thread usually reaches cleanup first, so give the
                    # offline log the real reason rather than "inbound-exit".
                    _cleanup(f"fatal:{fatal.get('code', '')}", detail=fatal.get("details", ""))
                else:
                    _cleanup("inbound-exit")

        t = threading.Thread(target=inbound_loop, daemon=True, name=f"inbound_{peer_tag}")
        t.start()

        info(f"[Connect] outbound pump starting peer={peer} stream_sess={stream_uuid}")

        printed_waiting = False
        try:
            while not stop.is_set():
                if not context.is_active():
                    _cleanup("context-inactive")
                    break

                if session is None:
                    if not printed_waiting:
                        info(f"[Connect] waiting for HELLO peer={peer} stream_sess={stream_uuid}")
                        printed_waiting = True
                    time.sleep(0.05)
                    continue

                try:
                    out = session.out_q.get(timeout=0.5)
                except queue.Empty:
                    continue
                except Exception:
                    exception(
                        f"[Connect] outbound queue get failed peer={peer} stream_sess={stream_uuid} "
                        f"host_id={host_id or '?'}"
                    )
                    continue

                if out is None:
                    info(f"[Connect] outbound sentinel -> end stream peer={peer} stream_sess={stream_uuid} host_id={host_id or '?'}")
                    _cleanup("outbound-sentinel")
                    break

                out_type = out.WhichOneof("msg")
                info(f"[Connect] OUT type={out_type} peer={peer} stream_sess={stream_uuid} host_id={host_id or '?'}")
                yield out

            # The inbound thread sets `stop` right after recording a fatal
            # error, so the loop above has normally exited before it could
            # notice. Checking here guarantees the host receives the real
            # status code instead of a clean end of stream.
            if fatal:
                code = fatal.get("code", "")
                details = fatal.get("details", "fatal")
                error(f"[Connect] aborting stream peer={peer} stream_sess={stream_uuid} code={code} details={details}")
                _cleanup(f"fatal:{code}", detail=details)
                # abort() raises to terminate the RPC; `finally` still runs.
                context.abort(status_for_code.get(code, grpc.StatusCode.UNKNOWN), details)

        except grpc.RpcError as e:
            warn(
                f"[Connect] outbound ended peer={peer} stream_sess={stream_uuid} "
                f"host_id={host_id or '?'} grpc={_grpc_err_summary(e)}"
            )
            _cleanup("outbound-grpc", detail=_grpc_err_summary(e))
        except GeneratorExit:
            _cleanup("generator-exit")
            raise
        finally:
            _cleanup("connect-finally")
            info(f"[Connect] stream ended peer={peer} stream_sess={stream_uuid} host_id={host_id or '?'}")
1290
+
1291
+ # Credentials + server start
1292
+
1293
def build_server_credentials(
    *,
    private_key_pem: bytes,
    certificate_chain_pem: bytes,
    client_ca_pem: Optional[bytes] = None,
    require_client_auth: bool = True,
) -> grpc.ServerCredentials:
    """Create TLS server credentials from PEM-encoded key material.

    When *client_ca_pem* is supplied it is used as the root for verifying
    client certificates and *require_client_auth* is passed through. When it
    is missing or empty, plain server-side TLS credentials are returned and
    *require_client_auth* is not applied.
    """
    info(f"[creds] build_server_credentials client_ca={'yes' if bool(client_ca_pem) else 'no'} require_client_auth={require_client_auth}")

    key_cert_pairs = ((private_key_pem, certificate_chain_pem),)

    if not client_ca_pem:
        return grpc.ssl_server_credentials(key_cert_pairs)

    return grpc.ssl_server_credentials(
        key_cert_pairs,
        root_certificates=client_ca_pem,
        require_client_auth=require_client_auth,
    )
1308
+
1309
+
1310
def start_host_tunnel_server(
    *,
    host: str,
    port: int,
    server_credentials: grpc.ServerCredentials,
    registry: Optional[HostTunnelRegistry] = None,
    max_workers: int = 32,
) -> Tuple[grpc.Server, threading.Thread, HostTunnelRegistry]:
    """Create the host tunnel gRPC server and run it on a daemon thread.

    Args:
        host: Interface/address to bind.
        port: TCP port to bind.
        server_credentials: TLS credentials for the secure port.
        registry: Registry to serve. When given it also becomes the
            process-wide singleton; otherwise the singleton is used and
            created on demand.
        max_workers: Size of the thread pool handling RPCs.

    Returns:
        ``(server, thread, registry)`` where *thread* runs ``server.start()``
        followed by ``wait_for_termination()``.

    Raises:
        RuntimeError: If the secure port cannot be bound, or no registry
            is available.
    """
    if registry is not None:
        reg = set_tunnel_registry(registry)
    else:
        reg = get_tunnel_registry(create=True)
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if reg is None:
        raise RuntimeError("host tunnel registry is not available")

    options = [
        ("grpc.max_send_message_length", 100 * 1024 * 1024),
        ("grpc.max_receive_message_length", 100 * 1024 * 1024),
        ("grpc.keepalive_time_ms", 30_000),
        ("grpc.keepalive_timeout_ms", 10_000),
        ("grpc.http2.max_pings_without_data", 0),
        ("grpc.keepalive_permit_without_calls", 1),
    ]

    bind_addr = f"{host}:{int(port)}"
    info(f"[server] creating grpc.server max_workers={max_workers} bind={bind_addr}")

    server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=max_workers),
        options=options,
    )

    host_tunnel_pb2_grpc.add_HostTunnelServicer_to_server(HostTunnelServicer(reg), server)

    try:
        added = server.add_secure_port(bind_addr, server_credentials)
        info(f"[server] add_secure_port addr={bind_addr} -> {added}")
        # grpcio versions that do not raise on a bind failure report it by
        # returning port 0; treat that as an error rather than starting a
        # server that listens on nothing.
        if not added:
            raise RuntimeError(f"add_secure_port could not bind addr={bind_addr}")
    except Exception:
        exception(f"[server] add_secure_port failed addr={bind_addr}")
        raise

    def run():
        try:
            info(f"[server] starting on {bind_addr}")
            server.start()
            info(f"[server] started on {bind_addr} (waiting for termination)")
            server.wait_for_termination()
            info("[server] wait_for_termination returned")
        except Exception:
            exception("[server] run() crashed")
            raise

    th = threading.Thread(target=run, daemon=True, name="host_tunnel_server")
    th.start()
    return server, th, reg
1364
+
1365
+ # Singleton registry (process-wide)
1366
+
1367
# Held while the singleton below is created or replaced.
_TUNNEL_REGISTRY_LOCK = threading.Lock()
# Process-wide registry instance; stays None until it is created or set.
TUNNEL_REGISTRY: Optional["HostTunnelRegistry"] = None
1369
+
1370
def get_tunnel_registry(*, create: bool = True) -> Optional["HostTunnelRegistry"]:
    """Return the process-wide registry, optionally creating it.

    With ``create=False`` the current value is returned as-is, which is
    ``None`` when no registry has been created or set yet.
    """
    global TUNNEL_REGISTRY

    current = TUNNEL_REGISTRY
    # Fast path: already present, or the caller does not want one created.
    if current is not None or not create:
        return current

    with _TUNNEL_REGISTRY_LOCK:
        # Re-check under the lock: another thread may have created it
        # between the unlocked read above and acquiring the lock.
        if TUNNEL_REGISTRY is None:
            TUNNEL_REGISTRY = HostTunnelRegistry()
        return TUNNEL_REGISTRY
1382
+
1383
def set_tunnel_registry(reg: "HostTunnelRegistry") -> "HostTunnelRegistry":
    """Install *reg* as the process-wide registry and hand it back.

    Any previously stored instance is replaced unconditionally.
    """
    global TUNNEL_REGISTRY
    with _TUNNEL_REGISTRY_LOCK:
        TUNNEL_REGISTRY = reg
        return reg