empathy-framework 4.4.0__py3-none-any.whl → 4.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,849 @@
1
+ """Cross-Session Agent Communication Protocol.
2
+
3
+ This module enables agents across different Claude Code sessions to communicate
4
+ and coordinate via Redis-backed short-term memory.
5
+
6
+ Features:
7
+ - Session discovery and announcement
8
+ - Priority-based conflict resolution
9
+ - Task queue coordination
10
+ - Shared state management
11
+ - Agent-to-agent signaling
12
+
13
+ Requires Redis (not available in mock mode).
14
+
15
+ Copyright 2025-2026 Smart AI Memory, LLC
16
+ Licensed under Fair Source 0.9
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import json
22
+ import os
23
+ import secrets
24
+ import threading
25
+ import time
26
+ from collections.abc import Callable
27
+ from dataclasses import dataclass, field
28
+ from datetime import datetime, timedelta
29
+ from enum import Enum
30
+ from typing import Any
31
+
32
+ import structlog
33
+
34
+ from .short_term import (
35
+ AccessTier,
36
+ AgentCredentials,
37
+ RedisShortTermMemory,
38
+ )
39
+
40
+ logger = structlog.get_logger(__name__)
41
+
42
+
43
# === Constants ===

# Pub/sub channel carrying "session_joined" / "session_left" events.
CHANNEL_SESSIONS = "empathy:sessions"
# Redis hash: agent ID -> JSON-encoded session info.
KEY_ACTIVE_AGENTS = "empathy:active_agents"
# Key whose existence marks that a background service is running.
KEY_SERVICE_LOCK = "empathy:service_lock"
# Key holding the ISO timestamp of the service's most recent lock refresh.
KEY_SERVICE_HEARTBEAT = "empathy:service_heartbeat"

# Seconds between heartbeat writes of a session.
HEARTBEAT_INTERVAL_SECONDS = 30
# A session whose last heartbeat is older than this many seconds is stale.
STALE_THRESHOLD_SECONDS = 90
# Time-to-live, in seconds, applied to the service lock key.
SERVICE_LOCK_TTL_SECONDS = 60
53
+
54
+
55
class SessionType(Enum):
    """Kind of participant taking part in cross-session coordination."""

    # Interactive Claude Code session
    CLAUDE = "claude"
    # Background service/daemon
    SERVICE = "service"
    # Task worker agent
    WORKER = "worker"
61
+
62
+
63
class ConflictStrategy(Enum):
    """How a conflict between two agents over a resource is decided."""

    # Higher access tier wins
    PRIORITY_BASED = "priority"
    # First to write wins
    FIRST_WRITE_WINS = "first_write"
    # Last to write wins
    LAST_WRITE_WINS = "last_write"
69
+
70
+
71
@dataclass
class SessionInfo:
    """Snapshot describing one active session.

    Instances round-trip through plain dictionaries (``to_dict`` /
    ``from_dict``) so they can be stored as JSON.
    """

    agent_id: str
    session_type: SessionType
    access_tier: AccessTier
    capabilities: list[str]
    started_at: datetime
    last_heartbeat: datetime
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-serialisable dictionary describing this session.

        Enum members are replaced by their ``.value`` and datetimes by
        their ISO-8601 string.
        """
        payload: dict[str, Any] = {}
        payload["agent_id"] = self.agent_id
        payload["session_type"] = self.session_type.value
        payload["access_tier"] = self.access_tier.value
        payload["capabilities"] = self.capabilities
        payload["started_at"] = self.started_at.isoformat()
        payload["last_heartbeat"] = self.last_heartbeat.isoformat()
        payload["metadata"] = self.metadata
        return payload

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> SessionInfo:
        """Build a ``SessionInfo`` from a dictionary produced by ``to_dict``.

        ``capabilities`` and ``metadata`` are optional and default to an
        empty list / dict; every other key is required.

        Raises:
            KeyError: If a required key is missing.
            ValueError: If an enum value or ISO timestamp cannot be parsed.
        """
        kind = SessionType(data["session_type"])
        tier = AccessTier(data["access_tier"])
        started = datetime.fromisoformat(data["started_at"])
        heartbeat = datetime.fromisoformat(data["last_heartbeat"])
        return cls(
            agent_id=data["agent_id"],
            session_type=kind,
            access_tier=tier,
            capabilities=data.get("capabilities", []),
            started_at=started,
            last_heartbeat=heartbeat,
            metadata=data.get("metadata", {}),
        )

    @property
    def is_stale(self) -> bool:
        """Whether the last heartbeat is older than ``STALE_THRESHOLD_SECONDS``."""
        age = datetime.now() - self.last_heartbeat
        return age > timedelta(seconds=STALE_THRESHOLD_SECONDS)
113
+
114
+
115
@dataclass
class ConflictResult:
    """Outcome of resolving a conflict over a single resource."""

    # Agent ID that keeps the resource.
    winner_agent_id: str
    # Agent ID that yields the resource.
    loser_agent_id: str
    # Key of the contested resource.
    resource_key: str
    # Strategy that produced this outcome.
    strategy_used: ConflictStrategy
    # Human-readable explanation of the decision.
    reason: str
124
+
125
+
126
def generate_agent_id(session_type: SessionType) -> str:
    """Build a unique agent ID for a session of the given type.

    Format: {session_type}_{YYYYMMDDHHMMSS}_{random_suffix}
    Example: claude_20260120143005_a1b2c3

    The suffix is 3 random bytes rendered as 6 hex characters.
    """
    random_part = secrets.token_hex(3)
    return f"{session_type.value}_{datetime.now():%Y%m%d%H%M%S}_{random_part}"
135
+
136
+
137
class CrossSessionCoordinator:
    """Coordinator for cross-session agent communication.

    Manages session discovery, conflict resolution, and advisory locking
    between agents running in different Claude Code sessions, through the
    Redis client held by the supplied ``RedisShortTermMemory``.

    Requires Redis - not available in mock mode.
    """

    # TTL (seconds) of the lock taken by first-write-wins conflict resolution.
    _FIRST_WRITE_LOCK_TTL_SECONDS = 300

    def __init__(
        self,
        memory: RedisShortTermMemory,
        session_type: SessionType = SessionType.CLAUDE,
        access_tier: AccessTier = AccessTier.CONTRIBUTOR,
        capabilities: list[str] | None = None,
        auto_announce: bool = True,
    ):
        """Initialize cross-session coordinator.

        Args:
            memory: RedisShortTermMemory instance (must not be mock)
            session_type: Type of this session
            access_tier: Access tier for this session
            capabilities: List of capabilities this session supports
            auto_announce: Whether to announce presence (and start the
                heartbeat thread) on init

        Raises:
            ValueError: If ``memory`` is in mock mode.
        """
        if memory.use_mock:
            raise ValueError(
                "Cross-session communication requires Redis. "
                "Mock mode does not support cross-session features."
            )

        self._memory = memory
        self._session_type = session_type
        self._access_tier = access_tier
        self._capabilities = capabilities or ["stash", "retrieve", "queue", "signal"]

        # Generate unique agent ID
        self._agent_id = generate_agent_id(session_type)
        self._credentials = AgentCredentials(
            agent_id=self._agent_id,
            tier=access_tier,
        )

        # Session info
        now = datetime.now()
        self._session_info = SessionInfo(
            agent_id=self._agent_id,
            session_type=session_type,
            access_tier=access_tier,
            capabilities=self._capabilities,
            started_at=now,
            last_heartbeat=now,
        )

        # Heartbeat thread
        self._heartbeat_thread: threading.Thread | None = None
        self._heartbeat_stop = threading.Event()

        # Event handlers
        self._on_session_joined: list[Callable[[SessionInfo], None]] = []
        self._on_session_left: list[Callable[[str], None]] = []

        # Auto-announce if requested
        if auto_announce:
            self.announce()
            self.start_heartbeat()

        logger.info(
            "cross_session_coordinator_initialized",
            agent_id=self._agent_id,
            session_type=session_type.value,
            access_tier=access_tier.name,
        )

    @property
    def agent_id(self) -> str:
        """Get this session's agent ID."""
        return self._agent_id

    @property
    def credentials(self) -> AgentCredentials:
        """Get this session's credentials."""
        return self._credentials

    @property
    def session_info(self) -> SessionInfo:
        """Get this session's info."""
        return self._session_info

    # === Internal helpers ===

    @staticmethod
    def _decode(value: Any) -> Any:
        """Return *value* decoded to ``str`` when it is ``bytes``, else unchanged."""
        return value.decode() if isinstance(value, bytes) else value

    @staticmethod
    def _lock_key(resource_key: str) -> str:
        """Return the Redis key used to lock *resource_key*."""
        return f"empathy:lock:{resource_key}"

    # === Session Discovery ===

    def announce(self) -> None:
        """Announce this session's presence to other sessions.

        Writes this session's info into the active-agents hash and publishes
        a ``session_joined`` event. Does nothing when there is no client.
        """
        client = self._memory._client
        if client is None:
            return

        session_dict = self._session_info.to_dict()

        # Update active agents registry
        client.hset(KEY_ACTIVE_AGENTS, self._agent_id, json.dumps(session_dict))

        # Publish announcement to sessions channel
        announcement = {
            "event": "session_joined",
            "session": session_dict,
        }
        client.publish(CHANNEL_SESSIONS, json.dumps(announcement))

        logger.info(
            "session_announced",
            agent_id=self._agent_id,
            session_type=self._session_type.value,
        )

    def depart(self) -> None:
        """Announce this session's departure.

        Stops the heartbeat thread, removes this session from the
        active-agents hash and publishes a ``session_left`` event.
        """
        self.stop_heartbeat()

        client = self._memory._client
        if client is None:
            return

        # Remove from active agents registry
        client.hdel(KEY_ACTIVE_AGENTS, self._agent_id)

        # Publish departure to sessions channel
        departure = {
            "event": "session_left",
            "agent_id": self._agent_id,
        }
        client.publish(CHANNEL_SESSIONS, json.dumps(departure))

        logger.info("session_departed", agent_id=self._agent_id)

    def get_active_sessions(self) -> list[SessionInfo]:
        """Get all active sessions.

        Stale entries found along the way are deleted from the registry;
        entries that cannot be parsed are logged and skipped.

        Returns:
            List of SessionInfo for all active sessions
        """
        client = self._memory._client
        if client is None:
            return []

        sessions: list[SessionInfo] = []

        for agent_id, session_data in client.hgetall(KEY_ACTIVE_AGENTS).items():
            try:
                # Decode bytes if necessary
                agent_id = self._decode(agent_id)
                info = SessionInfo.from_dict(json.loads(self._decode(session_data)))
            except (json.JSONDecodeError, KeyError, ValueError) as e:
                logger.warning(
                    "invalid_session_data",
                    agent_id=agent_id,
                    error=str(e),
                )
                continue

            if info.is_stale:
                # Clean up stale session
                client.hdel(KEY_ACTIVE_AGENTS, agent_id)
                logger.debug("cleaned_stale_session", agent_id=agent_id)
                continue

            sessions.append(info)

        return sessions

    def get_session(self, agent_id: str) -> SessionInfo | None:
        """Get info for a specific session.

        Args:
            agent_id: Agent ID to look up

        Returns:
            SessionInfo if found and not stale, None otherwise
        """
        client = self._memory._client
        if client is None:
            return None

        session_data = client.hget(KEY_ACTIVE_AGENTS, agent_id)
        if session_data is None:
            return None

        try:
            info = SessionInfo.from_dict(json.loads(self._decode(session_data)))
        except (json.JSONDecodeError, KeyError, ValueError):
            return None
        return None if info.is_stale else info

    # === Heartbeat ===

    def start_heartbeat(self) -> None:
        """Start the heartbeat thread (no-op if one is already registered)."""
        if self._heartbeat_thread is not None:
            return

        self._heartbeat_stop.clear()
        self._heartbeat_thread = threading.Thread(
            target=self._heartbeat_loop,
            daemon=True,
            name=f"heartbeat-{self._agent_id}",
        )
        self._heartbeat_thread.start()
        logger.debug("heartbeat_started", agent_id=self._agent_id)

    def stop_heartbeat(self) -> None:
        """Stop the heartbeat thread, waiting up to 5 seconds for it to exit."""
        if self._heartbeat_thread is None:
            return

        self._heartbeat_stop.set()
        self._heartbeat_thread.join(timeout=5)
        self._heartbeat_thread = None
        logger.debug("heartbeat_stopped", agent_id=self._agent_id)

    def _heartbeat_loop(self) -> None:
        """Heartbeat loop - runs in background thread.

        A failed heartbeat is logged and the loop keeps running, so one
        transient Redis error does not end the thread and leave the session
        to go stale.
        """
        while not self._heartbeat_stop.wait(HEARTBEAT_INTERVAL_SECONDS):
            try:
                self._send_heartbeat()
            except Exception as e:  # thread boundary: log and retry next tick
                logger.exception(
                    "heartbeat_error",
                    agent_id=self._agent_id,
                    error=str(e),
                )

    def _send_heartbeat(self) -> None:
        """Send a heartbeat update.

        Refreshes ``last_heartbeat`` and rewrites this session's entry in
        the active-agents hash.
        """
        client = self._memory._client
        if client is None:
            return

        # Update last heartbeat
        self._session_info.last_heartbeat = datetime.now()
        session_data = json.dumps(self._session_info.to_dict())
        client.hset(KEY_ACTIVE_AGENTS, self._agent_id, session_data)

    # === Conflict Resolution ===

    def resolve_conflict(
        self,
        resource_key: str,
        other_agent_id: str,
        strategy: ConflictStrategy = ConflictStrategy.PRIORITY_BASED,
    ) -> ConflictResult:
        """Resolve a conflict between this session and another.

        Args:
            resource_key: Key of the contested resource
            other_agent_id: Agent ID of the other party
            strategy: Strategy to use for resolution

        Returns:
            ConflictResult with winner and loser
        """
        other_session = self.get_session(other_agent_id)

        if strategy == ConflictStrategy.PRIORITY_BASED:
            return self._resolve_by_priority(resource_key, other_session)
        if strategy == ConflictStrategy.FIRST_WRITE_WINS:
            return self._resolve_first_write(resource_key, other_session)
        # LAST_WRITE_WINS
        return self._resolve_last_write(resource_key, other_session)

    def _resolve_by_priority(
        self,
        resource_key: str,
        other_session: SessionInfo | None,
    ) -> ConflictResult:
        """Resolve conflict using priority (access tier).

        An unknown other session is treated as tier 0 with ID ``"unknown"``.
        On equal tiers the other session wins only if it started strictly
        earlier than this one.
        """
        my_tier = self._access_tier.value
        if other_session is not None:
            other_tier = other_session.access_tier.value
            other_id = other_session.agent_id
            other_tier_name = other_session.access_tier.name
        else:
            other_tier = 0
            other_id = "unknown"
            other_tier_name = "N/A"

        if my_tier > other_tier:
            winner, loser = self._agent_id, other_id
            reason = f"Higher tier ({self._access_tier.name} > {other_tier_name})"
        elif my_tier < other_tier:
            winner, loser = other_id, self._agent_id
            reason = f"Higher tier ({other_tier_name} > {self._access_tier.name})"
        else:
            # Equal tier - use timestamp (first write wins)
            reason = "Equal tier, earlier session wins"
            other_is_earlier = (
                other_session is not None
                and other_session.started_at < self._session_info.started_at
            )
            if other_is_earlier:
                winner, loser = other_id, self._agent_id
            else:
                winner, loser = self._agent_id, other_id

        return ConflictResult(
            winner_agent_id=winner,
            loser_agent_id=loser,
            resource_key=resource_key,
            strategy_used=ConflictStrategy.PRIORITY_BASED,
            reason=reason,
        )

    def _resolve_first_write(
        self,
        resource_key: str,
        other_session: SessionInfo | None,
    ) -> ConflictResult:
        """Resolve conflict using first-write-wins.

        Tries to take the resource lock. If that succeeds this session
        wins; otherwise the current lock holder wins.
        """
        other_id = other_session.agent_id if other_session is not None else "unknown"

        client = self._memory._client
        if client is None:
            return ConflictResult(
                winner_agent_id=self._agent_id,
                loser_agent_id=other_id,
                resource_key=resource_key,
                strategy_used=ConflictStrategy.FIRST_WRITE_WINS,
                reason="No Redis connection - local wins",
            )

        lock_key = self._lock_key(resource_key)
        # Single SET NX EX so the lock can never exist without a TTL
        # (a separate SETNX + EXPIRE leaves a permanent lock if the process
        # dies between the two calls).
        acquired = client.set(
            lock_key,
            self._agent_id,
            nx=True,
            ex=self._FIRST_WRITE_LOCK_TTL_SECONDS,
        )

        if acquired:
            winner, loser = self._agent_id, other_id
            reason = "First to acquire lock"
        else:
            current_owner = self._decode(client.get(lock_key))
            reason = "Lock already held"
            if current_owner == self._agent_id:
                # This session already owns the lock: it wins against the
                # other party rather than being reported as its own loser.
                winner, loser = self._agent_id, other_id
            else:
                winner = current_owner or "unknown"
                loser = self._agent_id

        return ConflictResult(
            winner_agent_id=winner,
            loser_agent_id=loser,
            resource_key=resource_key,
            strategy_used=ConflictStrategy.FIRST_WRITE_WINS,
            reason=reason,
        )

    def _resolve_last_write(
        self,
        resource_key: str,
        other_session: SessionInfo | None,
    ) -> ConflictResult:
        """Resolve conflict using last-write-wins (current writer wins)."""
        other_id = other_session.agent_id if other_session is not None else "unknown"
        return ConflictResult(
            winner_agent_id=self._agent_id,
            loser_agent_id=other_id,
            resource_key=resource_key,
            strategy_used=ConflictStrategy.LAST_WRITE_WINS,
            reason="Last write wins - current writer takes precedence",
        )

    # === Distributed Locking ===

    def acquire_lock(
        self,
        resource_key: str,
        timeout_seconds: int = 300,
    ) -> bool:
        """Acquire a distributed lock on a resource.

        Args:
            resource_key: Key of the resource to lock
            timeout_seconds: Lock timeout in seconds

        Returns:
            True if lock acquired, False otherwise
        """
        client = self._memory._client
        if client is None:
            return False

        # Atomic "set if absent, with expiry" - see _resolve_first_write.
        acquired = bool(
            client.set(
                self._lock_key(resource_key),
                self._agent_id,
                nx=True,
                ex=timeout_seconds,
            )
        )

        if acquired:
            logger.debug(
                "lock_acquired",
                resource_key=resource_key,
                agent_id=self._agent_id,
            )

        return acquired

    def release_lock(self, resource_key: str) -> bool:
        """Release a distributed lock.

        Args:
            resource_key: Key of the resource to unlock

        Returns:
            True if lock released, False if not owner
        """
        client = self._memory._client
        if client is None:
            return False

        lock_key = self._lock_key(resource_key)
        current_owner = self._decode(client.get(lock_key))

        # NOTE(review): GET followed by DELETE is not atomic; if the lock
        # expires and is re-acquired by another agent between the two calls,
        # that agent's lock is deleted. A server-side compare-and-delete
        # script would close the gap.
        if not current_owner or current_owner != self._agent_id:
            return False

        client.delete(lock_key)
        logger.debug(
            "lock_released",
            resource_key=resource_key,
            agent_id=self._agent_id,
        )
        return True

    def check_lock(self, resource_key: str) -> str | None:
        """Check who holds a lock on a resource.

        Args:
            resource_key: Key of the resource

        Returns:
            Agent ID of lock holder, or None if unlocked
        """
        client = self._memory._client
        if client is None:
            return None

        owner = client.get(self._lock_key(resource_key))
        if not owner:
            return None
        return self._decode(owner)

    # === Event Handlers ===

    def on_session_joined(self, handler: Callable[[SessionInfo], None]) -> None:
        """Register handler for when a session joins.

        Args:
            handler: Callback receiving SessionInfo of joining session
        """
        self._on_session_joined.append(handler)

    def on_session_left(self, handler: Callable[[str], None]) -> None:
        """Register handler for when a session leaves.

        Args:
            handler: Callback receiving agent_id of departing session
        """
        self._on_session_left.append(handler)

    def subscribe_to_sessions(self) -> None:
        """Subscribe to session events (join/leave).

        Dispatches each event to the registered handlers. Malformed events
        are logged and skipped. The pub/sub connection is closed when the
        loop ends, including when a handler raises.

        Note: This blocks and should be called in a separate thread.
        """
        client = self._memory._client
        if client is None:
            return

        pubsub = client.pubsub()
        try:
            pubsub.subscribe(CHANNEL_SESSIONS)

            for message in pubsub.listen():
                # Skip subscribe confirmations and other control messages.
                if message["type"] != "message":
                    continue

                try:
                    event = json.loads(self._decode(message["data"]))
                    event_name = event.get("event")

                    if event_name == "session_joined":
                        session_info = SessionInfo.from_dict(event["session"])
                        for handler in self._on_session_joined:
                            handler(session_info)
                    elif event_name == "session_left":
                        agent_id = event["agent_id"]
                        for handler in self._on_session_left:
                            handler(agent_id)
                except (json.JSONDecodeError, KeyError, ValueError) as e:
                    logger.warning("invalid_session_event", error=str(e))
        finally:
            # Release the dedicated pub/sub connection.
            pubsub.close()

    # === Cleanup ===

    def close(self) -> None:
        """Clean up and depart."""
        self.depart()
635
+
636
+
637
+ # === Background Service ===
638
+
639
+
640
class BackgroundService:
    """Background service daemon for cross-session coordination.

    This service runs persistently to:
    - Maintain registry of active sessions
    - Aggregate results from completed tasks
    - Clean up stale session data
    - Coordinate conflict resolution
    - Promote patterns to long-term memory (when ready)

    Only one service is meant to run at a time; this is enforced with a
    Redis lock key that the service loop keeps refreshing.
    """

    def __init__(
        self,
        memory: RedisShortTermMemory,
        auto_start_on_connect: bool = True,
    ):
        """Initialize background service.

        Args:
            memory: RedisShortTermMemory instance
            auto_start_on_connect: Start automatically when first session connects

        Raises:
            ValueError: If ``memory`` is in mock mode.
        """
        if memory.use_mock:
            raise ValueError(
                "Background service requires Redis. "
                "Mock mode does not support cross-session features."
            )

        self._memory = memory
        self._auto_start = auto_start_on_connect
        self._coordinator: CrossSessionCoordinator | None = None
        self._running = False
        self._service_thread: threading.Thread | None = None
        self._stop_event = threading.Event()

        logger.info("background_service_initialized")

    @property
    def is_running(self) -> bool:
        """Check if service is running."""
        return self._running

    def start(self) -> bool:
        """Start the background service.

        Returns:
            True if started, False if already running or couldn't acquire lock
        """
        if self._running:
            logger.warning("service_already_running")
            return False

        # Try to acquire service lock (only one service can run)
        if not self._acquire_service_lock():
            logger.warning("service_lock_held_by_another")
            return False

        # Create coordinator for service
        try:
            self._coordinator = CrossSessionCoordinator(
                memory=self._memory,
                session_type=SessionType.SERVICE,
                access_tier=AccessTier.STEWARD,
                capabilities=["coordinate", "aggregate", "cleanup", "promote"],
                auto_announce=True,
            )
        except Exception:
            # Don't block other services until the TTL expires when this one
            # failed before it ever started running.
            self._release_service_lock()
            raise

        # Start service loop
        self._running = True
        self._stop_event.clear()
        self._service_thread = threading.Thread(
            target=self._service_loop,
            daemon=True,
            name="empathy-service",
        )
        self._service_thread.start()

        logger.info(
            "background_service_started",
            agent_id=self._coordinator.agent_id,
        )
        return True

    def stop(self) -> None:
        """Stop the background service.

        Signals the service loop to exit, waits up to 10 seconds for its
        thread, closes the coordinator and releases the service lock.
        Does nothing when the service is not running.
        """
        if not self._running:
            return

        self._stop_event.set()

        if self._service_thread:
            self._service_thread.join(timeout=10)
            self._service_thread = None

        if self._coordinator:
            self._coordinator.close()
            self._coordinator = None

        self._release_service_lock()
        self._running = False

        logger.info("background_service_stopped")

    def _acquire_service_lock(self) -> bool:
        """Try to acquire the service lock.

        Returns:
            True if this process now holds the lock, False otherwise
            (including when there is no Redis client).
        """
        client = self._memory._client
        if client is None:
            return False

        # One atomic SET NX EX: the lock is created together with its TTL,
        # so a crash cannot leave a lock that never expires (which a
        # separate SETNX followed by EXPIRE could).
        acquired = client.set(
            KEY_SERVICE_LOCK,
            os.getpid(),
            nx=True,
            ex=SERVICE_LOCK_TTL_SECONDS,
        )
        return bool(acquired)

    def _release_service_lock(self) -> None:
        """Release the service lock."""
        client = self._memory._client
        if client:
            client.delete(KEY_SERVICE_LOCK)

    def _refresh_service_lock(self) -> None:
        """Refresh the service lock TTL and record a service heartbeat."""
        client = self._memory._client
        if client:
            client.expire(KEY_SERVICE_LOCK, SERVICE_LOCK_TTL_SECONDS)
            client.set(KEY_SERVICE_HEARTBEAT, datetime.now().isoformat())

    def _service_loop(self) -> None:
        """Main service loop.

        Wakes every 10 seconds until stopped. Each tick refreshes the
        service lock; stale-session cleanup runs once more than 60 seconds
        have passed since the previous cleanup. Errors are logged and the
        loop continues.
        """
        cleanup_interval = 60  # Clean up stale sessions every 60 seconds
        last_cleanup = time.time()

        while not self._stop_event.wait(10):  # Check every 10 seconds
            try:
                # Refresh service lock
                self._refresh_service_lock()

                # Periodic cleanup
                if time.time() - last_cleanup > cleanup_interval:
                    self._cleanup_stale_sessions()
                    last_cleanup = time.time()

            except Exception as e:
                logger.exception("service_loop_error", error=str(e))

    def _cleanup_stale_sessions(self) -> None:
        """Clean up stale session data."""
        if not self._coordinator:
            return

        # Get all sessions (this already cleans stale ones)
        sessions = self._coordinator.get_active_sessions()
        logger.debug(
            "cleanup_completed",
            active_sessions=len(sessions),
        )

    def get_status(self) -> dict[str, Any]:
        """Get service status.

        Returns:
            Dict with ``running``, ``agent_id`` and ``active_sessions``;
            when a coordinator exists, also ``sessions`` (one dict per
            active session).
        """
        coordinator = self._coordinator
        status: dict[str, Any] = {
            "running": self._running,
            "agent_id": coordinator.agent_id if coordinator else None,
            "active_sessions": 0,
        }

        if coordinator:
            sessions = coordinator.get_active_sessions()
            status["active_sessions"] = len(sessions)
            status["sessions"] = [s.to_dict() for s in sessions]

        return status
815
+
816
+
817
+ # === Convenience Functions ===
818
+
819
+
820
def check_redis_cross_session_support(memory: RedisShortTermMemory) -> bool:
    """Report whether *memory* can be used for cross-session communication.

    Args:
        memory: RedisShortTermMemory instance

    Returns:
        True if the memory is not in mock mode and has a Redis client,
        False otherwise
    """
    if memory.use_mock:
        return False
    return memory._client is not None
830
+
831
+
832
def get_or_start_service(memory: RedisShortTermMemory) -> BackgroundService | None:
    """Create a background service on *memory* and try to start it.

    Args:
        memory: RedisShortTermMemory instance

    Returns:
        The newly started BackgroundService, or None when cross-session
        support is unavailable or ``start()`` returned False (for example
        because the service lock is already held elsewhere)
    """
    supported = check_redis_cross_session_support(memory)
    if not supported:
        return None

    candidate = BackgroundService(memory)
    started = candidate.start()
    # A False result means the service is already running elsewhere.
    return candidate if started else None