mcp-hangar 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160) hide show
  1. mcp_hangar/__init__.py +139 -0
  2. mcp_hangar/application/__init__.py +1 -0
  3. mcp_hangar/application/commands/__init__.py +67 -0
  4. mcp_hangar/application/commands/auth_commands.py +118 -0
  5. mcp_hangar/application/commands/auth_handlers.py +296 -0
  6. mcp_hangar/application/commands/commands.py +59 -0
  7. mcp_hangar/application/commands/handlers.py +189 -0
  8. mcp_hangar/application/discovery/__init__.py +21 -0
  9. mcp_hangar/application/discovery/discovery_metrics.py +283 -0
  10. mcp_hangar/application/discovery/discovery_orchestrator.py +497 -0
  11. mcp_hangar/application/discovery/lifecycle_manager.py +315 -0
  12. mcp_hangar/application/discovery/security_validator.py +414 -0
  13. mcp_hangar/application/event_handlers/__init__.py +50 -0
  14. mcp_hangar/application/event_handlers/alert_handler.py +191 -0
  15. mcp_hangar/application/event_handlers/audit_handler.py +203 -0
  16. mcp_hangar/application/event_handlers/knowledge_base_handler.py +120 -0
  17. mcp_hangar/application/event_handlers/logging_handler.py +69 -0
  18. mcp_hangar/application/event_handlers/metrics_handler.py +152 -0
  19. mcp_hangar/application/event_handlers/persistent_audit_store.py +217 -0
  20. mcp_hangar/application/event_handlers/security_handler.py +604 -0
  21. mcp_hangar/application/mcp/tooling.py +158 -0
  22. mcp_hangar/application/ports/__init__.py +9 -0
  23. mcp_hangar/application/ports/observability.py +237 -0
  24. mcp_hangar/application/queries/__init__.py +52 -0
  25. mcp_hangar/application/queries/auth_handlers.py +237 -0
  26. mcp_hangar/application/queries/auth_queries.py +118 -0
  27. mcp_hangar/application/queries/handlers.py +227 -0
  28. mcp_hangar/application/read_models/__init__.py +11 -0
  29. mcp_hangar/application/read_models/provider_views.py +139 -0
  30. mcp_hangar/application/sagas/__init__.py +11 -0
  31. mcp_hangar/application/sagas/group_rebalance_saga.py +137 -0
  32. mcp_hangar/application/sagas/provider_failover_saga.py +266 -0
  33. mcp_hangar/application/sagas/provider_recovery_saga.py +172 -0
  34. mcp_hangar/application/services/__init__.py +9 -0
  35. mcp_hangar/application/services/provider_service.py +208 -0
  36. mcp_hangar/application/services/traced_provider_service.py +211 -0
  37. mcp_hangar/bootstrap/runtime.py +328 -0
  38. mcp_hangar/context.py +178 -0
  39. mcp_hangar/domain/__init__.py +117 -0
  40. mcp_hangar/domain/contracts/__init__.py +57 -0
  41. mcp_hangar/domain/contracts/authentication.py +225 -0
  42. mcp_hangar/domain/contracts/authorization.py +229 -0
  43. mcp_hangar/domain/contracts/event_store.py +178 -0
  44. mcp_hangar/domain/contracts/metrics_publisher.py +59 -0
  45. mcp_hangar/domain/contracts/persistence.py +383 -0
  46. mcp_hangar/domain/contracts/provider_runtime.py +146 -0
  47. mcp_hangar/domain/discovery/__init__.py +20 -0
  48. mcp_hangar/domain/discovery/conflict_resolver.py +267 -0
  49. mcp_hangar/domain/discovery/discovered_provider.py +185 -0
  50. mcp_hangar/domain/discovery/discovery_service.py +412 -0
  51. mcp_hangar/domain/discovery/discovery_source.py +192 -0
  52. mcp_hangar/domain/events.py +433 -0
  53. mcp_hangar/domain/exceptions.py +525 -0
  54. mcp_hangar/domain/model/__init__.py +70 -0
  55. mcp_hangar/domain/model/aggregate.py +58 -0
  56. mcp_hangar/domain/model/circuit_breaker.py +152 -0
  57. mcp_hangar/domain/model/event_sourced_api_key.py +413 -0
  58. mcp_hangar/domain/model/event_sourced_provider.py +423 -0
  59. mcp_hangar/domain/model/event_sourced_role_assignment.py +268 -0
  60. mcp_hangar/domain/model/health_tracker.py +183 -0
  61. mcp_hangar/domain/model/load_balancer.py +185 -0
  62. mcp_hangar/domain/model/provider.py +810 -0
  63. mcp_hangar/domain/model/provider_group.py +656 -0
  64. mcp_hangar/domain/model/tool_catalog.py +105 -0
  65. mcp_hangar/domain/policies/__init__.py +19 -0
  66. mcp_hangar/domain/policies/provider_health.py +187 -0
  67. mcp_hangar/domain/repository.py +249 -0
  68. mcp_hangar/domain/security/__init__.py +85 -0
  69. mcp_hangar/domain/security/input_validator.py +710 -0
  70. mcp_hangar/domain/security/rate_limiter.py +387 -0
  71. mcp_hangar/domain/security/roles.py +237 -0
  72. mcp_hangar/domain/security/sanitizer.py +387 -0
  73. mcp_hangar/domain/security/secrets.py +501 -0
  74. mcp_hangar/domain/services/__init__.py +20 -0
  75. mcp_hangar/domain/services/audit_service.py +376 -0
  76. mcp_hangar/domain/services/image_builder.py +328 -0
  77. mcp_hangar/domain/services/provider_launcher.py +1046 -0
  78. mcp_hangar/domain/value_objects.py +1138 -0
  79. mcp_hangar/errors.py +818 -0
  80. mcp_hangar/fastmcp_server.py +1105 -0
  81. mcp_hangar/gc.py +134 -0
  82. mcp_hangar/infrastructure/__init__.py +79 -0
  83. mcp_hangar/infrastructure/async_executor.py +133 -0
  84. mcp_hangar/infrastructure/auth/__init__.py +37 -0
  85. mcp_hangar/infrastructure/auth/api_key_authenticator.py +388 -0
  86. mcp_hangar/infrastructure/auth/event_sourced_store.py +567 -0
  87. mcp_hangar/infrastructure/auth/jwt_authenticator.py +360 -0
  88. mcp_hangar/infrastructure/auth/middleware.py +340 -0
  89. mcp_hangar/infrastructure/auth/opa_authorizer.py +243 -0
  90. mcp_hangar/infrastructure/auth/postgres_store.py +659 -0
  91. mcp_hangar/infrastructure/auth/projections.py +366 -0
  92. mcp_hangar/infrastructure/auth/rate_limiter.py +311 -0
  93. mcp_hangar/infrastructure/auth/rbac_authorizer.py +323 -0
  94. mcp_hangar/infrastructure/auth/sqlite_store.py +624 -0
  95. mcp_hangar/infrastructure/command_bus.py +112 -0
  96. mcp_hangar/infrastructure/discovery/__init__.py +110 -0
  97. mcp_hangar/infrastructure/discovery/docker_source.py +289 -0
  98. mcp_hangar/infrastructure/discovery/entrypoint_source.py +249 -0
  99. mcp_hangar/infrastructure/discovery/filesystem_source.py +383 -0
  100. mcp_hangar/infrastructure/discovery/kubernetes_source.py +247 -0
  101. mcp_hangar/infrastructure/event_bus.py +260 -0
  102. mcp_hangar/infrastructure/event_sourced_repository.py +443 -0
  103. mcp_hangar/infrastructure/event_store.py +396 -0
  104. mcp_hangar/infrastructure/knowledge_base/__init__.py +259 -0
  105. mcp_hangar/infrastructure/knowledge_base/contracts.py +202 -0
  106. mcp_hangar/infrastructure/knowledge_base/memory.py +177 -0
  107. mcp_hangar/infrastructure/knowledge_base/postgres.py +545 -0
  108. mcp_hangar/infrastructure/knowledge_base/sqlite.py +513 -0
  109. mcp_hangar/infrastructure/metrics_publisher.py +36 -0
  110. mcp_hangar/infrastructure/observability/__init__.py +10 -0
  111. mcp_hangar/infrastructure/observability/langfuse_adapter.py +534 -0
  112. mcp_hangar/infrastructure/persistence/__init__.py +33 -0
  113. mcp_hangar/infrastructure/persistence/audit_repository.py +371 -0
  114. mcp_hangar/infrastructure/persistence/config_repository.py +398 -0
  115. mcp_hangar/infrastructure/persistence/database.py +333 -0
  116. mcp_hangar/infrastructure/persistence/database_common.py +330 -0
  117. mcp_hangar/infrastructure/persistence/event_serializer.py +280 -0
  118. mcp_hangar/infrastructure/persistence/event_upcaster.py +166 -0
  119. mcp_hangar/infrastructure/persistence/in_memory_event_store.py +150 -0
  120. mcp_hangar/infrastructure/persistence/recovery_service.py +312 -0
  121. mcp_hangar/infrastructure/persistence/sqlite_event_store.py +386 -0
  122. mcp_hangar/infrastructure/persistence/unit_of_work.py +409 -0
  123. mcp_hangar/infrastructure/persistence/upcasters/README.md +13 -0
  124. mcp_hangar/infrastructure/persistence/upcasters/__init__.py +7 -0
  125. mcp_hangar/infrastructure/query_bus.py +153 -0
  126. mcp_hangar/infrastructure/saga_manager.py +401 -0
  127. mcp_hangar/logging_config.py +209 -0
  128. mcp_hangar/metrics.py +1007 -0
  129. mcp_hangar/models.py +31 -0
  130. mcp_hangar/observability/__init__.py +54 -0
  131. mcp_hangar/observability/health.py +487 -0
  132. mcp_hangar/observability/metrics.py +319 -0
  133. mcp_hangar/observability/tracing.py +433 -0
  134. mcp_hangar/progress.py +542 -0
  135. mcp_hangar/retry.py +613 -0
  136. mcp_hangar/server/__init__.py +120 -0
  137. mcp_hangar/server/__main__.py +6 -0
  138. mcp_hangar/server/auth_bootstrap.py +340 -0
  139. mcp_hangar/server/auth_cli.py +335 -0
  140. mcp_hangar/server/auth_config.py +305 -0
  141. mcp_hangar/server/bootstrap.py +735 -0
  142. mcp_hangar/server/cli.py +161 -0
  143. mcp_hangar/server/config.py +224 -0
  144. mcp_hangar/server/context.py +215 -0
  145. mcp_hangar/server/http_auth_middleware.py +165 -0
  146. mcp_hangar/server/lifecycle.py +467 -0
  147. mcp_hangar/server/state.py +117 -0
  148. mcp_hangar/server/tools/__init__.py +16 -0
  149. mcp_hangar/server/tools/discovery.py +186 -0
  150. mcp_hangar/server/tools/groups.py +75 -0
  151. mcp_hangar/server/tools/health.py +301 -0
  152. mcp_hangar/server/tools/provider.py +939 -0
  153. mcp_hangar/server/tools/registry.py +320 -0
  154. mcp_hangar/server/validation.py +113 -0
  155. mcp_hangar/stdio_client.py +229 -0
  156. mcp_hangar-0.2.0.dist-info/METADATA +347 -0
  157. mcp_hangar-0.2.0.dist-info/RECORD +160 -0
  158. mcp_hangar-0.2.0.dist-info/WHEEL +4 -0
  159. mcp_hangar-0.2.0.dist-info/entry_points.txt +2 -0
  160. mcp_hangar-0.2.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,656 @@
1
+ """Provider Group Aggregate - manages a group of providers with load balancing.
2
+
3
+ A ProviderGroup is an aggregate root that manages multiple Provider instances
4
+ as a single logical unit with automatic load balancing and failover.
5
+ """
6
+
7
+ from dataclasses import dataclass
8
+ import threading
9
+ import time
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ from ...logging_config import get_logger
13
+ from ..events import DomainEvent
14
+ from ..value_objects import GroupId, GroupState, LoadBalancerStrategy, MemberPriority, MemberWeight, ProviderState
15
+ from .aggregate import AggregateRoot
16
+ from .circuit_breaker import CircuitBreaker, CircuitBreakerConfig
17
+ from .load_balancer import LoadBalancer
18
+ from .provider import Provider
19
+
20
+ logger = get_logger(__name__)
21
+
22
+
23
+ # --- Group-specific Domain Events ---
24
+
25
+
26
@dataclass
class GroupCreated(DomainEvent):
    """Domain event recorded when a provider group is created.

    Attributes:
        group_id: Identifier of the newly created group.
        strategy: Load balancing strategy, as the strategy's string value.
        min_healthy: Minimum number of healthy members configured for the group.
    """

    group_id: str
    strategy: str
    min_healthy: int

    def __post_init__(self) -> None:
        # The dataclass-generated __init__ never calls the base initializer,
        # so invoke DomainEvent's __init__ explicitly once fields are set.
        super().__init__()
36
+
37
+
38
@dataclass
class GroupMemberAdded(DomainEvent):
    """Domain event recorded when a member joins a group.

    Attributes:
        group_id: Identifier of the group that gained the member.
        member_id: Identifier of the added member (its provider ID as a string).
        weight: Load balancing weight assigned to the member.
        priority: Selection priority assigned to the member.
    """

    group_id: str
    member_id: str
    weight: int
    priority: int

    def __post_init__(self) -> None:
        # The dataclass-generated __init__ never calls the base initializer,
        # so invoke DomainEvent's __init__ explicitly once fields are set.
        super().__init__()
49
+
50
+
51
@dataclass
class GroupMemberRemoved(DomainEvent):
    """Domain event recorded when a member leaves a group.

    Attributes:
        group_id: Identifier of the group that lost the member.
        member_id: Identifier of the removed member.
    """

    group_id: str
    member_id: str

    def __post_init__(self) -> None:
        # The dataclass-generated __init__ never calls the base initializer,
        # so invoke DomainEvent's __init__ explicitly once fields are set.
        super().__init__()
60
+
61
+
62
@dataclass
class GroupMemberHealthChanged(DomainEvent):
    """Domain event recorded when a member enters or leaves rotation.

    Attributes:
        group_id: Identifier of the group owning the member.
        member_id: Identifier of the affected member.
        in_rotation: New rotation status (True = now accepting traffic).
        reason: Short machine-readable cause for the change, e.g.
            ``"started"`` or ``"rebalance"``; empty when not supplied.
    """

    group_id: str
    member_id: str
    in_rotation: bool
    reason: str = ""

    def __post_init__(self) -> None:
        # The dataclass-generated __init__ never calls the base initializer,
        # so invoke DomainEvent's __init__ explicitly once fields are set.
        super().__init__()
73
+
74
+
75
@dataclass
class GroupStateChanged(DomainEvent):
    """Domain event recorded when the group moves from one state to another.

    Attributes:
        group_id: Identifier of the group whose state changed.
        old_state: String value of the state before the transition.
        new_state: String value of the state after the transition.
        healthy_count: Number of members in rotation at transition time.
        total_count: Total number of members at transition time.
    """

    group_id: str
    old_state: str
    new_state: str
    healthy_count: int
    total_count: int

    def __post_init__(self) -> None:
        # The dataclass-generated __init__ never calls the base initializer,
        # so invoke DomainEvent's __init__ explicitly once fields are set.
        super().__init__()
87
+
88
+
89
@dataclass
class GroupCircuitOpened(DomainEvent):
    """Domain event recorded when the group's circuit breaker opens.

    Attributes:
        group_id: Identifier of the group whose circuit opened.
        failure_count: Failure count reported by the circuit breaker when it opened.
    """

    group_id: str
    failure_count: int

    def __post_init__(self) -> None:
        # The dataclass-generated __init__ never calls the base initializer,
        # so invoke DomainEvent's __init__ explicitly once fields are set.
        super().__init__()
98
+
99
+
100
@dataclass
class GroupCircuitClosed(DomainEvent):
    """Domain event recorded when the group's circuit breaker closes.

    Attributes:
        group_id: Identifier of the group whose circuit closed.
    """

    group_id: str

    def __post_init__(self) -> None:
        # The dataclass-generated __init__ never calls the base initializer,
        # so invoke DomainEvent's __init__ explicitly once fields are set.
        super().__init__()
108
+
109
+
110
+ # --- Group Member ---
111
+
112
+
113
@dataclass
class GroupMember:
    """Bookkeeping record for a single provider inside a provider group.

    Attributes:
        provider: The wrapped Provider instance.
        weight: Load balancing weight (defaults to 1).
        priority: Selection priority (defaults to 1).
        in_rotation: Whether the member is currently accepting traffic.
        consecutive_failures: Count of failures since the last success.
        consecutive_successes: Count of successes since the last failure.
        last_selected_at: Timestamp of the most recent selection; 0.0 until
            the member has been selected at least once.
    """

    provider: Provider
    weight: int = 1
    priority: int = 1
    in_rotation: bool = False  # True while the member is accepting traffic
    consecutive_failures: int = 0
    consecutive_successes: int = 0
    last_selected_at: float = 0.0

    @property
    def id(self) -> str:
        """Return the wrapped provider's ID coerced to ``str``."""
        provider_id = self.provider.id
        return str(provider_id)
130
+
131
+
132
+ # --- Provider Group Aggregate ---
133
+
134
+
135
class ProviderGroup(AggregateRoot):
    """
    Aggregate root for a group of load-balanced providers.

    Responsibilities:
    - Manage member lifecycle
    - Load balancing decisions
    - Group-level health tracking
    - Circuit breaker for the entire group

    Thread-safety:
    - Public methods that read or mutate members/state hold a re-entrant lock
    - Private helpers (``_``-prefixed) are called with that lock already held
    """

    def __init__(
        self,
        group_id: str,
        strategy: LoadBalancerStrategy = LoadBalancerStrategy.ROUND_ROBIN,
        min_healthy: int = 1,
        auto_start: bool = True,
        unhealthy_threshold: int = 2,
        healthy_threshold: int = 1,
        circuit_failure_threshold: int = 10,
        circuit_reset_timeout_s: float = 60.0,
        description: Optional[str] = None,
    ):
        """
        Initialize a provider group.

        Args:
            group_id: Unique identifier for the group
            strategy: Load balancing strategy
            min_healthy: Minimum healthy members for HEALTHY state (clamped to >= 1)
            auto_start: Automatically start members when added
            unhealthy_threshold: Failures before removing from rotation (clamped to >= 1)
            healthy_threshold: Successes before adding back to rotation (clamped to >= 1)
            circuit_failure_threshold: Failures before circuit opens
            circuit_reset_timeout_s: Time before circuit resets
            description: Human-readable description
        """
        super().__init__()

        # Identity
        self._id = GroupId(group_id)
        self._description = description

        # Configuration (thresholds below 1 make no sense, so clamp them)
        self._strategy = strategy
        self._min_healthy = max(1, min_healthy)
        self._auto_start = auto_start
        self._unhealthy_threshold = max(1, unhealthy_threshold)
        self._healthy_threshold = max(1, healthy_threshold)

        # State
        self._state = GroupState.INACTIVE
        self._members: Dict[str, GroupMember] = {}
        self._load_balancer = LoadBalancer(strategy)

        # Circuit breaker (extracted for SRP)
        self._circuit_breaker = CircuitBreaker(
            CircuitBreakerConfig(
                failure_threshold=circuit_failure_threshold,
                reset_timeout_s=circuit_reset_timeout_s,
            )
        )

        # Threading: re-entrant because locked methods call locked properties
        self._lock = threading.RLock()

        # Report the effective (clamped) min_healthy so the event matches the
        # configuration the group actually enforces.
        self._record_event(
            GroupCreated(
                group_id=group_id,
                strategy=strategy.value,
                min_healthy=self._min_healthy,
            )
        )

    # --- Properties ---

    @property
    def id(self) -> str:
        """Get group ID."""
        return self._id.value

    @property
    def description(self) -> Optional[str]:
        """Get group description."""
        return self._description

    @property
    def state(self) -> GroupState:
        """Get current group state."""
        with self._lock:
            return self._state

    @property
    def strategy(self) -> LoadBalancerStrategy:
        """Get load balancing strategy."""
        return self._strategy

    @property
    def healthy_count(self) -> int:
        """Number of members currently in rotation."""
        with self._lock:
            return sum(1 for m in self._members.values() if m.in_rotation)

    @property
    def total_count(self) -> int:
        """Total number of members in the group."""
        with self._lock:
            return len(self._members)

    @property
    def is_available(self) -> bool:
        """Can the group accept requests?

        True only when the circuit is not open, the current state allows
        requests, and at least one member is in rotation.
        """
        with self._lock:
            return not self._circuit_breaker.is_open and self._state.can_accept_requests and self.healthy_count >= 1

    @property
    def circuit_open(self) -> bool:
        """Is the circuit breaker open?"""
        return self._circuit_breaker.is_open

    @property
    def members(self) -> List[GroupMember]:
        """Get a snapshot list of all members."""
        with self._lock:
            return list(self._members.values())

    # --- Member Management ---

    def add_member(
        self,
        provider: Provider,
        weight: int = 1,
        priority: int = 1,
    ) -> None:
        """
        Add a provider to the group.

        Args:
            provider: Provider instance to add
            weight: Load balancing weight (higher = more traffic)
            priority: Priority for priority-based selection (lower = higher priority)

        Raises:
            ValueError: If member already exists in group
        """
        with self._lock:
            # Members are keyed by the provider ID as a string
            member_id = str(provider.id)

            if member_id in self._members:
                raise ValueError(f"Member {member_id} already in group {self.id}")

            # Validate through the value objects, then use the validated values
            # everywhere (member, event, log) so they can never disagree.
            member_weight = MemberWeight(weight).value
            member_priority = MemberPriority(priority).value

            member = GroupMember(
                provider=provider,
                weight=member_weight,
                priority=member_priority,
            )
            self._members[member_id] = member

            self._record_event(
                GroupMemberAdded(
                    group_id=self.id,
                    member_id=member_id,
                    weight=member_weight,
                    priority=member_priority,
                )
            )

            logger.info(
                f"Added member {member_id} to group {self.id} (weight={member_weight}, priority={member_priority})"
            )

            # Auto-start if configured
            if self._auto_start:
                self._try_start_member(member)

    def remove_member(self, member_id: str) -> bool:
        """
        Remove a provider from the group.

        Args:
            member_id: ID of the member to remove

        Returns:
            True if member was removed, False if not found
        """
        with self._lock:
            member = self._members.pop(member_id, None)
            if member is None:
                return False

            member.in_rotation = False
            self._update_state()
            self._record_event(
                GroupMemberRemoved(
                    group_id=self.id,
                    member_id=member_id,
                )
            )
            logger.info(f"Removed member {member_id} from group {self.id}")
            return True

    def get_member(self, member_id: str) -> Optional[GroupMember]:
        """Get a member by ID, or None if it is not in the group."""
        with self._lock:
            return self._members.get(member_id)

    def _try_start_member(self, member: GroupMember) -> bool:
        """
        Try to start a member and add to rotation if successful.

        If starting raises, the member is taken out of rotation. When that
        actually changes its rotation status, a health-changed event is
        recorded and the group state is recomputed so it cannot go stale.

        Returns:
            True if member started and added to rotation
        """
        was_in_rotation = member.in_rotation
        try:
            member.provider.ensure_ready()
            if member.provider.state == ProviderState.READY:
                member.in_rotation = True
                member.consecutive_failures = 0
                member.consecutive_successes = 1
                self._update_state()
                self._record_event(
                    GroupMemberHealthChanged(
                        group_id=self.id,
                        member_id=member.id,
                        in_rotation=True,
                        reason="started",
                    )
                )
                logger.info(f"Member {member.id} started and added to rotation")
                return True
        except Exception as e:
            # Best-effort start: a failing provider must not break the group.
            logger.warning(f"Failed to start member {member.id}: {e}")
            member.in_rotation = False
            if was_in_rotation:
                self._record_event(
                    GroupMemberHealthChanged(
                        group_id=self.id,
                        member_id=member.id,
                        in_rotation=False,
                        reason="start_failed",
                    )
                )
                self._update_state()
        return False

    # --- Load Balancing ---

    def select_member(self) -> Optional[Provider]:
        """
        Select a member for the next request using load balancer.

        Returns:
            Selected provider or None if the circuit breaker rejects the
            request or no member is in rotation
        """
        with self._lock:
            if not self._circuit_breaker.allow_request():
                return None

            self._check_circuit_recovery()

            available = [m for m in self._members.values() if m.in_rotation]
            if not available:
                return None

            selected = self._load_balancer.select(available)
            if selected:
                selected.last_selected_at = time.time()
                return selected.provider

            return None

    def _check_circuit_recovery(self) -> None:
        """Check if circuit just recovered and emit event.

        DEGRADED is only assigned while the circuit is open, so a DEGRADED
        state with a non-open circuit means the breaker has recovered.
        """
        if not self._circuit_breaker.is_open and self._state == GroupState.DEGRADED:
            self._record_event(GroupCircuitClosed(group_id=self.id))
            logger.info(f"Circuit breaker closed for group {self.id}")
            self._update_state()

    # --- Health Reporting ---

    def report_success(self, member_id: str) -> None:
        """
        Report successful invocation for a member.

        Unknown member IDs are ignored.

        Args:
            member_id: ID of the member that succeeded
        """
        with self._lock:
            member = self._members.get(member_id)
            if not member:
                return

            member.consecutive_failures = 0
            member.consecutive_successes += 1
            self._maybe_add_to_rotation(member, member_id)

    def _maybe_add_to_rotation(self, member: GroupMember, member_id: str) -> None:
        """Add member back to rotation if healthy threshold reached."""
        if member.in_rotation:
            return
        if member.provider.state != ProviderState.READY:
            return
        if member.consecutive_successes < self._healthy_threshold:
            return

        member.in_rotation = True
        self._record_event(
            GroupMemberHealthChanged(
                group_id=self.id,
                member_id=member_id,
                in_rotation=True,
                reason="healthy_threshold_reached",
            )
        )
        self._update_state()
        logger.info(f"Member {member_id} added back to rotation")

    def report_failure(self, member_id: str) -> None:
        """
        Report failed invocation for a member.

        Unknown member IDs are ignored.

        Args:
            member_id: ID of the member that failed
        """
        with self._lock:
            member = self._members.get(member_id)
            if not member:
                return

            member.consecutive_failures += 1
            member.consecutive_successes = 0

            self._maybe_remove_from_rotation(member, member_id)
            self._maybe_open_circuit()
            self._update_state()

    def _maybe_remove_from_rotation(self, member: GroupMember, member_id: str) -> None:
        """Remove member from rotation if unhealthy threshold reached."""
        if member.consecutive_failures < self._unhealthy_threshold:
            return
        if not member.in_rotation:
            return

        member.in_rotation = False
        self._record_event(
            GroupMemberHealthChanged(
                group_id=self.id,
                member_id=member_id,
                in_rotation=False,
                reason="unhealthy_threshold_reached",
            )
        )
        logger.info(f"Member {member_id} removed from rotation after {member.consecutive_failures} failures")

    def _maybe_open_circuit(self) -> None:
        """Record a failure on the circuit breaker and emit an event if it opened."""
        circuit_just_opened = self._circuit_breaker.record_failure()
        if not circuit_just_opened:
            return

        self._record_event(
            GroupCircuitOpened(
                group_id=self.id,
                failure_count=self._circuit_breaker.failure_count,
            )
        )
        logger.warning(
            f"Circuit breaker opened for group {self.id} after {self._circuit_breaker.failure_count} failures"
        )

    # --- State Management ---

    def _update_state(self) -> None:
        """Update group state based on member health.

        Precedence: open circuit -> DEGRADED; no member in rotation ->
        INACTIVE; fewer than ``min_healthy`` in rotation -> PARTIAL;
        otherwise HEALTHY. An event is recorded only on an actual change.
        """
        old_state = self._state
        healthy = self.healthy_count
        total = len(self._members)

        if self._circuit_breaker.is_open:
            new_state = GroupState.DEGRADED
        elif healthy == 0:
            new_state = GroupState.INACTIVE
        elif healthy < self._min_healthy:
            new_state = GroupState.PARTIAL
        else:
            new_state = GroupState.HEALTHY

        if new_state != old_state:
            self._state = new_state
            self._record_event(
                GroupStateChanged(
                    group_id=self.id,
                    old_state=old_state.value,
                    new_state=new_state.value,
                    healthy_count=healthy,
                    total_count=total,
                )
            )
            logger.info(f"Group {self.id} state: {old_state.value} -> {new_state.value} (healthy={healthy}/{total})")

    def rebalance(self) -> None:
        """
        Manually trigger rebalancing.

        Re-evaluates health of all members and updates rotation: READY
        members are put in rotation, all others are taken out. The load
        balancer and circuit breaker are then reset.
        """
        with self._lock:
            for member in self._members.values():
                should_rotate = member.provider.state == ProviderState.READY
                if should_rotate == member.in_rotation:
                    continue  # Rotation status already matches provider state

                member.in_rotation = should_rotate
                if should_rotate:
                    member.consecutive_failures = 0
                self._record_event(
                    GroupMemberHealthChanged(
                        group_id=self.id,
                        member_id=member.id,
                        in_rotation=should_rotate,
                        reason="rebalance",
                    )
                )

            # Reset load balancer state
            self._load_balancer.reset()

            # Reset circuit breaker, announcing the close only if it was open
            was_open = self._circuit_breaker.is_open
            self._circuit_breaker.reset()
            if was_open:
                self._record_event(GroupCircuitClosed(group_id=self.id))

            self._update_state()
            logger.info(f"Group {self.id} rebalanced: {self.healthy_count} healthy")

    # --- Lifecycle ---

    def start_all(self) -> int:
        """
        Start all members.

        Returns:
            Number of members successfully started
        """
        with self._lock:
            return sum(1 for member in self._members.values() if self._try_start_member(member))

    def stop_all(self) -> None:
        """Stop all members.

        Every member is taken out of rotation, even if shutting down its
        provider fails; shutdown errors are logged and do not stop the loop.
        """
        with self._lock:
            for member in self._members.values():
                # Leave rotation first so a failing shutdown cannot leave a
                # half-stopped provider routable.
                member.in_rotation = False
                try:
                    member.provider.shutdown()
                except Exception as e:
                    logger.warning(f"Failed to stop member {member.id}: {e}")
            self._update_state()

    def shutdown(self) -> None:
        """Shutdown the group and all members."""
        self.stop_all()
        logger.info(f"Group {self.id} shutdown complete")

    # --- Tools Access ---

    def _first_serving_member(self) -> Optional[GroupMember]:
        """Return the first member that is in rotation with a READY provider, if any."""
        for member in self._members.values():
            if member.in_rotation and member.provider.state == ProviderState.READY:
                return member
        return None

    def get_tools(self) -> List[Any]:
        """
        Get tools from a healthy member.

        Returns tools from the first healthy member, as all members
        should have the same tools. Returns an empty list if none qualifies.
        """
        with self._lock:
            member = self._first_serving_member()
            if member is None:
                return []
            return list(member.provider.tools)

    def get_tool_names(self) -> List[str]:
        """Get list of tool names from a healthy member (empty if none qualifies)."""
        with self._lock:
            member = self._first_serving_member()
            if member is None:
                return []
            return member.provider.get_tool_names()

    # --- Serialization ---

    def to_status_dict(self) -> Dict[str, Any]:
        """Get status as dictionary, including a per-member summary."""
        with self._lock:
            return {
                "group_id": self.id,
                "description": self._description,
                "state": self._state.value,
                "strategy": self._strategy.value,
                "min_healthy": self._min_healthy,
                "healthy_count": self.healthy_count,
                "total_members": len(self._members),
                "is_available": self.is_available,
                "circuit_open": self._circuit_breaker.is_open,
                "members": [
                    {
                        "id": m.id,
                        "state": m.provider.state.value,
                        "in_rotation": m.in_rotation,
                        "weight": m.weight,
                        "priority": m.priority,
                        "consecutive_failures": m.consecutive_failures,
                    }
                    for m in self._members.values()
                ],
            }