mcp-hangar 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcp_hangar/__init__.py +139 -0
- mcp_hangar/application/__init__.py +1 -0
- mcp_hangar/application/commands/__init__.py +67 -0
- mcp_hangar/application/commands/auth_commands.py +118 -0
- mcp_hangar/application/commands/auth_handlers.py +296 -0
- mcp_hangar/application/commands/commands.py +59 -0
- mcp_hangar/application/commands/handlers.py +189 -0
- mcp_hangar/application/discovery/__init__.py +21 -0
- mcp_hangar/application/discovery/discovery_metrics.py +283 -0
- mcp_hangar/application/discovery/discovery_orchestrator.py +497 -0
- mcp_hangar/application/discovery/lifecycle_manager.py +315 -0
- mcp_hangar/application/discovery/security_validator.py +414 -0
- mcp_hangar/application/event_handlers/__init__.py +50 -0
- mcp_hangar/application/event_handlers/alert_handler.py +191 -0
- mcp_hangar/application/event_handlers/audit_handler.py +203 -0
- mcp_hangar/application/event_handlers/knowledge_base_handler.py +120 -0
- mcp_hangar/application/event_handlers/logging_handler.py +69 -0
- mcp_hangar/application/event_handlers/metrics_handler.py +152 -0
- mcp_hangar/application/event_handlers/persistent_audit_store.py +217 -0
- mcp_hangar/application/event_handlers/security_handler.py +604 -0
- mcp_hangar/application/mcp/tooling.py +158 -0
- mcp_hangar/application/ports/__init__.py +9 -0
- mcp_hangar/application/ports/observability.py +237 -0
- mcp_hangar/application/queries/__init__.py +52 -0
- mcp_hangar/application/queries/auth_handlers.py +237 -0
- mcp_hangar/application/queries/auth_queries.py +118 -0
- mcp_hangar/application/queries/handlers.py +227 -0
- mcp_hangar/application/read_models/__init__.py +11 -0
- mcp_hangar/application/read_models/provider_views.py +139 -0
- mcp_hangar/application/sagas/__init__.py +11 -0
- mcp_hangar/application/sagas/group_rebalance_saga.py +137 -0
- mcp_hangar/application/sagas/provider_failover_saga.py +266 -0
- mcp_hangar/application/sagas/provider_recovery_saga.py +172 -0
- mcp_hangar/application/services/__init__.py +9 -0
- mcp_hangar/application/services/provider_service.py +208 -0
- mcp_hangar/application/services/traced_provider_service.py +211 -0
- mcp_hangar/bootstrap/runtime.py +328 -0
- mcp_hangar/context.py +178 -0
- mcp_hangar/domain/__init__.py +117 -0
- mcp_hangar/domain/contracts/__init__.py +57 -0
- mcp_hangar/domain/contracts/authentication.py +225 -0
- mcp_hangar/domain/contracts/authorization.py +229 -0
- mcp_hangar/domain/contracts/event_store.py +178 -0
- mcp_hangar/domain/contracts/metrics_publisher.py +59 -0
- mcp_hangar/domain/contracts/persistence.py +383 -0
- mcp_hangar/domain/contracts/provider_runtime.py +146 -0
- mcp_hangar/domain/discovery/__init__.py +20 -0
- mcp_hangar/domain/discovery/conflict_resolver.py +267 -0
- mcp_hangar/domain/discovery/discovered_provider.py +185 -0
- mcp_hangar/domain/discovery/discovery_service.py +412 -0
- mcp_hangar/domain/discovery/discovery_source.py +192 -0
- mcp_hangar/domain/events.py +433 -0
- mcp_hangar/domain/exceptions.py +525 -0
- mcp_hangar/domain/model/__init__.py +70 -0
- mcp_hangar/domain/model/aggregate.py +58 -0
- mcp_hangar/domain/model/circuit_breaker.py +152 -0
- mcp_hangar/domain/model/event_sourced_api_key.py +413 -0
- mcp_hangar/domain/model/event_sourced_provider.py +423 -0
- mcp_hangar/domain/model/event_sourced_role_assignment.py +268 -0
- mcp_hangar/domain/model/health_tracker.py +183 -0
- mcp_hangar/domain/model/load_balancer.py +185 -0
- mcp_hangar/domain/model/provider.py +810 -0
- mcp_hangar/domain/model/provider_group.py +656 -0
- mcp_hangar/domain/model/tool_catalog.py +105 -0
- mcp_hangar/domain/policies/__init__.py +19 -0
- mcp_hangar/domain/policies/provider_health.py +187 -0
- mcp_hangar/domain/repository.py +249 -0
- mcp_hangar/domain/security/__init__.py +85 -0
- mcp_hangar/domain/security/input_validator.py +710 -0
- mcp_hangar/domain/security/rate_limiter.py +387 -0
- mcp_hangar/domain/security/roles.py +237 -0
- mcp_hangar/domain/security/sanitizer.py +387 -0
- mcp_hangar/domain/security/secrets.py +501 -0
- mcp_hangar/domain/services/__init__.py +20 -0
- mcp_hangar/domain/services/audit_service.py +376 -0
- mcp_hangar/domain/services/image_builder.py +328 -0
- mcp_hangar/domain/services/provider_launcher.py +1046 -0
- mcp_hangar/domain/value_objects.py +1138 -0
- mcp_hangar/errors.py +818 -0
- mcp_hangar/fastmcp_server.py +1105 -0
- mcp_hangar/gc.py +134 -0
- mcp_hangar/infrastructure/__init__.py +79 -0
- mcp_hangar/infrastructure/async_executor.py +133 -0
- mcp_hangar/infrastructure/auth/__init__.py +37 -0
- mcp_hangar/infrastructure/auth/api_key_authenticator.py +388 -0
- mcp_hangar/infrastructure/auth/event_sourced_store.py +567 -0
- mcp_hangar/infrastructure/auth/jwt_authenticator.py +360 -0
- mcp_hangar/infrastructure/auth/middleware.py +340 -0
- mcp_hangar/infrastructure/auth/opa_authorizer.py +243 -0
- mcp_hangar/infrastructure/auth/postgres_store.py +659 -0
- mcp_hangar/infrastructure/auth/projections.py +366 -0
- mcp_hangar/infrastructure/auth/rate_limiter.py +311 -0
- mcp_hangar/infrastructure/auth/rbac_authorizer.py +323 -0
- mcp_hangar/infrastructure/auth/sqlite_store.py +624 -0
- mcp_hangar/infrastructure/command_bus.py +112 -0
- mcp_hangar/infrastructure/discovery/__init__.py +110 -0
- mcp_hangar/infrastructure/discovery/docker_source.py +289 -0
- mcp_hangar/infrastructure/discovery/entrypoint_source.py +249 -0
- mcp_hangar/infrastructure/discovery/filesystem_source.py +383 -0
- mcp_hangar/infrastructure/discovery/kubernetes_source.py +247 -0
- mcp_hangar/infrastructure/event_bus.py +260 -0
- mcp_hangar/infrastructure/event_sourced_repository.py +443 -0
- mcp_hangar/infrastructure/event_store.py +396 -0
- mcp_hangar/infrastructure/knowledge_base/__init__.py +259 -0
- mcp_hangar/infrastructure/knowledge_base/contracts.py +202 -0
- mcp_hangar/infrastructure/knowledge_base/memory.py +177 -0
- mcp_hangar/infrastructure/knowledge_base/postgres.py +545 -0
- mcp_hangar/infrastructure/knowledge_base/sqlite.py +513 -0
- mcp_hangar/infrastructure/metrics_publisher.py +36 -0
- mcp_hangar/infrastructure/observability/__init__.py +10 -0
- mcp_hangar/infrastructure/observability/langfuse_adapter.py +534 -0
- mcp_hangar/infrastructure/persistence/__init__.py +33 -0
- mcp_hangar/infrastructure/persistence/audit_repository.py +371 -0
- mcp_hangar/infrastructure/persistence/config_repository.py +398 -0
- mcp_hangar/infrastructure/persistence/database.py +333 -0
- mcp_hangar/infrastructure/persistence/database_common.py +330 -0
- mcp_hangar/infrastructure/persistence/event_serializer.py +280 -0
- mcp_hangar/infrastructure/persistence/event_upcaster.py +166 -0
- mcp_hangar/infrastructure/persistence/in_memory_event_store.py +150 -0
- mcp_hangar/infrastructure/persistence/recovery_service.py +312 -0
- mcp_hangar/infrastructure/persistence/sqlite_event_store.py +386 -0
- mcp_hangar/infrastructure/persistence/unit_of_work.py +409 -0
- mcp_hangar/infrastructure/persistence/upcasters/README.md +13 -0
- mcp_hangar/infrastructure/persistence/upcasters/__init__.py +7 -0
- mcp_hangar/infrastructure/query_bus.py +153 -0
- mcp_hangar/infrastructure/saga_manager.py +401 -0
- mcp_hangar/logging_config.py +209 -0
- mcp_hangar/metrics.py +1007 -0
- mcp_hangar/models.py +31 -0
- mcp_hangar/observability/__init__.py +54 -0
- mcp_hangar/observability/health.py +487 -0
- mcp_hangar/observability/metrics.py +319 -0
- mcp_hangar/observability/tracing.py +433 -0
- mcp_hangar/progress.py +542 -0
- mcp_hangar/retry.py +613 -0
- mcp_hangar/server/__init__.py +120 -0
- mcp_hangar/server/__main__.py +6 -0
- mcp_hangar/server/auth_bootstrap.py +340 -0
- mcp_hangar/server/auth_cli.py +335 -0
- mcp_hangar/server/auth_config.py +305 -0
- mcp_hangar/server/bootstrap.py +735 -0
- mcp_hangar/server/cli.py +161 -0
- mcp_hangar/server/config.py +224 -0
- mcp_hangar/server/context.py +215 -0
- mcp_hangar/server/http_auth_middleware.py +165 -0
- mcp_hangar/server/lifecycle.py +467 -0
- mcp_hangar/server/state.py +117 -0
- mcp_hangar/server/tools/__init__.py +16 -0
- mcp_hangar/server/tools/discovery.py +186 -0
- mcp_hangar/server/tools/groups.py +75 -0
- mcp_hangar/server/tools/health.py +301 -0
- mcp_hangar/server/tools/provider.py +939 -0
- mcp_hangar/server/tools/registry.py +320 -0
- mcp_hangar/server/validation.py +113 -0
- mcp_hangar/stdio_client.py +229 -0
- mcp_hangar-0.2.0.dist-info/METADATA +347 -0
- mcp_hangar-0.2.0.dist-info/RECORD +160 -0
- mcp_hangar-0.2.0.dist-info/WHEEL +4 -0
- mcp_hangar-0.2.0.dist-info/entry_points.txt +2 -0
- mcp_hangar-0.2.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,656 @@
|
|
|
1
|
+
"""Provider Group Aggregate - manages a group of providers with load balancing.
|
|
2
|
+
|
|
3
|
+
A ProviderGroup is an aggregate root that manages multiple Provider instances
|
|
4
|
+
as a single logical unit with automatic load balancing and failover.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
import threading
|
|
9
|
+
import time
|
|
10
|
+
from typing import Any, Dict, List, Optional
|
|
11
|
+
|
|
12
|
+
from ...logging_config import get_logger
|
|
13
|
+
from ..events import DomainEvent
|
|
14
|
+
from ..value_objects import GroupId, GroupState, LoadBalancerStrategy, MemberPriority, MemberWeight, ProviderState
|
|
15
|
+
from .aggregate import AggregateRoot
|
|
16
|
+
from .circuit_breaker import CircuitBreaker, CircuitBreakerConfig
|
|
17
|
+
from .load_balancer import LoadBalancer
|
|
18
|
+
from .provider import Provider
|
|
19
|
+
|
|
20
|
+
logger = get_logger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# --- Group-specific Domain Events ---
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class GroupCreated(DomainEvent):
    """Domain event recorded when a provider group is constructed."""

    # Identifier of the newly created group.
    group_id: str
    # Load balancing strategy, stored as the strategy's string value.
    strategy: str
    # Number of in-rotation members the group needs to count as healthy.
    min_healthy: int

    def __post_init__(self) -> None:
        # The dataclass-generated __init__ does not call the base class
        # initializer, so it is invoked here (presumably to populate the
        # shared event metadata -- confirm against DomainEvent).
        super().__init__()
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class GroupMemberAdded(DomainEvent):
    """Domain event recorded when a provider joins a group."""

    # Identifier of the group that gained the member.
    group_id: str
    # Provider ID of the added member.
    member_id: str
    # Load balancing weight assigned to the member.
    weight: int
    # Selection priority assigned to the member.
    priority: int

    def __post_init__(self) -> None:
        # The dataclass-generated __init__ does not call the base class
        # initializer, so it is invoked here (presumably to populate the
        # shared event metadata -- confirm against DomainEvent).
        super().__init__()
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class GroupMemberRemoved(DomainEvent):
    """Domain event recorded when a provider leaves a group."""

    # Identifier of the group that lost the member.
    group_id: str
    # Provider ID of the removed member.
    member_id: str

    def __post_init__(self) -> None:
        # The dataclass-generated __init__ does not call the base class
        # initializer, so it is invoked here (presumably to populate the
        # shared event metadata -- confirm against DomainEvent).
        super().__init__()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@dataclass
class GroupMemberHealthChanged(DomainEvent):
    """Domain event recorded when a member enters or leaves rotation."""

    # Identifier of the group the member belongs to.
    group_id: str
    # Provider ID of the affected member.
    member_id: str
    # True when the member now receives traffic, False when it was pulled.
    in_rotation: bool
    # Short machine-readable cause, e.g. "started", "rebalance",
    # "healthy_threshold_reached" or "unhealthy_threshold_reached".
    reason: str = ""

    def __post_init__(self) -> None:
        # The dataclass-generated __init__ does not call the base class
        # initializer, so it is invoked here (presumably to populate the
        # shared event metadata -- confirm against DomainEvent).
        super().__init__()
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@dataclass
class GroupStateChanged(DomainEvent):
    """Domain event recorded when the group moves to a different state."""

    # Identifier of the group whose state changed.
    group_id: str
    # String value of the state the group left.
    old_state: str
    # String value of the state the group entered.
    new_state: str
    # Members in rotation at the time of the transition.
    healthy_count: int
    # All members of the group at the time of the transition.
    total_count: int

    def __post_init__(self) -> None:
        # The dataclass-generated __init__ does not call the base class
        # initializer, so it is invoked here (presumably to populate the
        # shared event metadata -- confirm against DomainEvent).
        super().__init__()
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@dataclass
class GroupCircuitOpened(DomainEvent):
    """Domain event recorded when the group's circuit breaker opens."""

    # Identifier of the group whose circuit opened.
    group_id: str
    # Failure count reported by the circuit breaker when it opened.
    failure_count: int

    def __post_init__(self) -> None:
        # The dataclass-generated __init__ does not call the base class
        # initializer, so it is invoked here (presumably to populate the
        # shared event metadata -- confirm against DomainEvent).
        super().__init__()
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
@dataclass
class GroupCircuitClosed(DomainEvent):
    """Domain event recorded when the group's circuit breaker closes again."""

    # Identifier of the group whose circuit closed.
    group_id: str

    def __post_init__(self) -> None:
        # The dataclass-generated __init__ does not call the base class
        # initializer, so it is invoked here (presumably to populate the
        # shared event metadata -- confirm against DomainEvent).
        super().__init__()
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
# --- Group Member ---
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
@dataclass
class GroupMember:
    """Bookkeeping record for one provider inside a provider group."""

    # The wrapped provider instance.
    provider: Provider
    # Load balancing weight.
    weight: int = 1
    # Selection priority.
    priority: int = 1
    # Whether the member is currently accepting traffic.
    in_rotation: bool = False
    # Failures reported since the last success.
    consecutive_failures: int = 0
    # Successes reported since the last failure.
    consecutive_successes: int = 0
    # time.time() timestamp of the most recent selection; 0.0 if never picked.
    last_selected_at: float = 0.0

    @property
    def id(self) -> str:
        """Return the wrapped provider's ID, coerced to ``str``."""
        provider_id = self.provider.id
        return str(provider_id)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
# --- Provider Group Aggregate ---
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class ProviderGroup(AggregateRoot):
    """Aggregate root for a group of load-balanced providers.

    Responsibilities:
    - Manage member lifecycle
    - Load balancing decisions
    - Group-level health tracking
    - Circuit breaker for the entire group

    Thread-safety:
    - Public methods that read or mutate member/state data take the internal
      re-entrant lock first
    - ``_``-prefixed helpers do not lock; they are only called while the
      lock is already held
    """

    def __init__(
        self,
        group_id: str,
        strategy: LoadBalancerStrategy = LoadBalancerStrategy.ROUND_ROBIN,
        min_healthy: int = 1,
        auto_start: bool = True,
        unhealthy_threshold: int = 2,
        healthy_threshold: int = 1,
        circuit_failure_threshold: int = 10,
        circuit_reset_timeout_s: float = 60.0,
        description: Optional[str] = None,
    ):
        """Initialize a provider group and record a ``GroupCreated`` event.

        Args:
            group_id: Unique identifier for the group
            strategy: Load balancing strategy
            min_healthy: Minimum healthy members for HEALTHY state
                (values below 1 are raised to 1)
            auto_start: Automatically start members when added
            unhealthy_threshold: Failures before removing from rotation
                (values below 1 are raised to 1)
            healthy_threshold: Successes before adding back to rotation
                (values below 1 are raised to 1)
            circuit_failure_threshold: Failures before circuit opens
            circuit_reset_timeout_s: Time before circuit resets
            description: Human-readable description
        """
        super().__init__()

        # Identity
        self._id = GroupId(group_id)
        self._description = description

        # Configuration (thresholds are clamped to at least 1)
        self._strategy = strategy
        self._min_healthy = max(1, min_healthy)
        self._auto_start = auto_start
        self._unhealthy_threshold = max(1, unhealthy_threshold)
        self._healthy_threshold = max(1, healthy_threshold)

        # State
        self._state = GroupState.INACTIVE
        self._members: Dict[str, GroupMember] = {}
        self._load_balancer = LoadBalancer(strategy)

        # Circuit breaker (extracted for SRP)
        self._circuit_breaker = CircuitBreaker(
            CircuitBreakerConfig(
                failure_threshold=circuit_failure_threshold,
                reset_timeout_s=circuit_reset_timeout_s,
            )
        )

        # Threading: re-entrant so locked methods can call locked properties.
        self._lock = threading.RLock()

        # Report the effective (stored) values so the event always matches
        # the aggregate, even when min_healthy was clamped.
        self._record_event(
            GroupCreated(
                group_id=self.id,
                strategy=strategy.value,
                min_healthy=self._min_healthy,
            )
        )

    # --- Properties ---

    @property
    def id(self) -> str:
        """Get group ID."""
        return self._id.value

    @property
    def description(self) -> Optional[str]:
        """Get group description."""
        return self._description

    @property
    def state(self) -> GroupState:
        """Get current group state."""
        with self._lock:
            return self._state

    @property
    def strategy(self) -> LoadBalancerStrategy:
        """Get load balancing strategy."""
        return self._strategy

    @property
    def healthy_count(self) -> int:
        """Number of members currently in rotation."""
        with self._lock:
            return sum(1 for m in self._members.values() if m.in_rotation)

    @property
    def total_count(self) -> int:
        """Total number of members in the group."""
        with self._lock:
            return len(self._members)

    @property
    def is_available(self) -> bool:
        """Can the group accept requests?

        True only when the circuit is not open, the current state allows
        requests, and at least one member is in rotation.
        """
        with self._lock:
            return (
                not self._circuit_breaker.is_open
                and self._state.can_accept_requests
                and self.healthy_count >= 1
            )

    @property
    def circuit_open(self) -> bool:
        """Is the circuit breaker open?"""
        return self._circuit_breaker.is_open

    @property
    def members(self) -> List[GroupMember]:
        """Get a snapshot list of all members."""
        with self._lock:
            return list(self._members.values())

    # --- Member Management ---

    def add_member(
        self,
        provider: Provider,
        weight: int = 1,
        priority: int = 1,
    ) -> None:
        """Add a provider to the group.

        Records a ``GroupMemberAdded`` event and, when auto-start is enabled,
        immediately tries to start the member and put it into rotation.

        Args:
            provider: Provider instance to add
            weight: Load balancing weight (higher = more traffic)
            priority: Priority for priority-based selection (lower = higher priority)

        Raises:
            ValueError: If member already exists in group

        Note:
            ``MemberWeight`` / ``MemberPriority`` may raise for invalid
            values -- exact exception type not visible here.
        """
        with self._lock:
            # Get member ID as string for dictionary key
            member_id = str(provider.id)

            if member_id in self._members:
                raise ValueError(f"Member {member_id} already in group {self.id}")

            # Validate weight and priority; everything below uses the
            # validated values so member, event and log always agree.
            weight_value = MemberWeight(weight).value
            priority_value = MemberPriority(priority).value

            member = GroupMember(
                provider=provider,
                weight=weight_value,
                priority=priority_value,
            )
            self._members[member_id] = member

            self._record_event(
                GroupMemberAdded(
                    group_id=self.id,
                    member_id=member_id,
                    weight=weight_value,
                    priority=priority_value,
                )
            )

            logger.info(
                f"Added member {member_id} to group {self.id} (weight={weight_value}, priority={priority_value})"
            )

            # Auto-start if configured
            if self._auto_start:
                self._try_start_member(member)

    def remove_member(self, member_id: str) -> bool:
        """Remove a provider from the group.

        Args:
            member_id: ID of the member to remove

        Returns:
            True if member was removed, False if not found
        """
        with self._lock:
            member = self._members.pop(member_id, None)
            if member is None:
                return False

            member.in_rotation = False
            self._update_state()
            self._record_event(
                GroupMemberRemoved(
                    group_id=self.id,
                    member_id=member_id,
                )
            )
            logger.info(f"Removed member {member_id} from group {self.id}")
            return True

    def get_member(self, member_id: str) -> Optional[GroupMember]:
        """Get a member by ID, or None if it is not in the group."""
        with self._lock:
            return self._members.get(member_id)

    def _try_start_member(self, member: GroupMember) -> bool:
        """Try to start a member and add to rotation if successful.

        On failure the member is taken out of rotation. If it was in
        rotation before the attempt, a ``GroupMemberHealthChanged`` event is
        recorded and the group state is refreshed so they cannot go stale.

        Returns:
            True if member started and added to rotation
        """
        was_in_rotation = member.in_rotation
        try:
            member.provider.ensure_ready()
            if member.provider.state == ProviderState.READY:
                member.in_rotation = True
                member.consecutive_failures = 0
                member.consecutive_successes = 1
                self._update_state()
                self._record_event(
                    GroupMemberHealthChanged(
                        group_id=self.id,
                        member_id=member.id,
                        in_rotation=True,
                        reason="started",
                    )
                )
                logger.info(f"Member {member.id} started and added to rotation")
                return True
        except Exception as e:
            # Best effort: a member that cannot start must not fail the caller.
            logger.warning(f"Failed to start member {member.id}: {e}")
            member.in_rotation = False
            if was_in_rotation:
                self._record_event(
                    GroupMemberHealthChanged(
                        group_id=self.id,
                        member_id=member.id,
                        in_rotation=False,
                        reason="start_failed",
                    )
                )
                self._update_state()
        return False

    # --- Load Balancing ---

    def select_member(self) -> Optional[Provider]:
        """Select a member for the next request using load balancer.

        Returns:
            Selected provider, or None if the circuit breaker rejects the
            request or no member is in rotation
        """
        with self._lock:
            if not self._circuit_breaker.allow_request():
                return None

            self._check_circuit_recovery()

            available = [m for m in self._members.values() if m.in_rotation]
            if not available:
                return None

            selected = self._load_balancer.select(available)
            if selected:
                selected.last_selected_at = time.time()
                return selected.provider

            return None

    def _check_circuit_recovery(self) -> None:
        """Check if circuit just recovered and emit event.

        A DEGRADED state with a non-open circuit means the breaker closed
        since the state was last computed.
        """
        if not self._circuit_breaker.is_open and self._state == GroupState.DEGRADED:
            self._record_event(GroupCircuitClosed(group_id=self.id))
            logger.info(f"Circuit breaker closed for group {self.id}")
            self._update_state()

    # --- Health Reporting ---

    def report_success(self, member_id: str) -> None:
        """Report successful invocation for a member.

        Unknown member IDs are ignored.

        Args:
            member_id: ID of the member that succeeded
        """
        with self._lock:
            member = self._members.get(member_id)
            if not member:
                return

            member.consecutive_failures = 0
            member.consecutive_successes += 1
            self._maybe_add_to_rotation(member, member_id)

    def _maybe_add_to_rotation(self, member: GroupMember, member_id: str) -> None:
        """Add member back to rotation if healthy threshold reached."""
        if member.in_rotation:
            return
        if member.provider.state != ProviderState.READY:
            return
        if member.consecutive_successes < self._healthy_threshold:
            return

        member.in_rotation = True
        self._record_event(
            GroupMemberHealthChanged(
                group_id=self.id,
                member_id=member_id,
                in_rotation=True,
                reason="healthy_threshold_reached",
            )
        )
        self._update_state()
        logger.info(f"Member {member_id} added back to rotation")

    def report_failure(self, member_id: str) -> None:
        """Report failed invocation for a member.

        Unknown member IDs are ignored. Every reported failure also counts
        against the group-level circuit breaker.

        Args:
            member_id: ID of the member that failed
        """
        with self._lock:
            member = self._members.get(member_id)
            if not member:
                return

            member.consecutive_failures += 1
            member.consecutive_successes = 0

            self._maybe_remove_from_rotation(member, member_id)
            self._maybe_open_circuit()
            self._update_state()

    def _maybe_remove_from_rotation(self, member: GroupMember, member_id: str) -> None:
        """Remove member from rotation if unhealthy threshold reached."""
        if member.consecutive_failures < self._unhealthy_threshold:
            return
        if not member.in_rotation:
            return

        member.in_rotation = False
        self._record_event(
            GroupMemberHealthChanged(
                group_id=self.id,
                member_id=member_id,
                in_rotation=False,
                reason="unhealthy_threshold_reached",
            )
        )
        logger.info(f"Member {member_id} removed from rotation after {member.consecutive_failures} failures")

    def _maybe_open_circuit(self) -> None:
        """Record a failure on the circuit breaker; emit an event if it opened."""
        circuit_just_opened = self._circuit_breaker.record_failure()
        if not circuit_just_opened:
            return

        self._record_event(
            GroupCircuitOpened(
                group_id=self.id,
                failure_count=self._circuit_breaker.failure_count,
            )
        )
        logger.warning(
            f"Circuit breaker opened for group {self.id} after {self._circuit_breaker.failure_count} failures"
        )

    # --- State Management ---

    def _update_state(self) -> None:
        """Update group state based on member health.

        Mapping, first match wins:
        - circuit breaker open            -> DEGRADED
        - no member in rotation           -> INACTIVE
        - fewer in rotation than required -> PARTIAL
        - otherwise                       -> HEALTHY

        A ``GroupStateChanged`` event is recorded only on an actual change.
        """
        old_state = self._state
        healthy = self.healthy_count
        total = len(self._members)

        if self._circuit_breaker.is_open:
            new_state = GroupState.DEGRADED
        elif healthy == 0:
            new_state = GroupState.INACTIVE
        elif healthy < self._min_healthy:
            new_state = GroupState.PARTIAL
        else:
            new_state = GroupState.HEALTHY

        if new_state == old_state:
            return

        self._state = new_state
        self._record_event(
            GroupStateChanged(
                group_id=self.id,
                old_state=old_state.value,
                new_state=new_state.value,
                healthy_count=healthy,
                total_count=total,
            )
        )
        logger.info(f"Group {self.id} state: {old_state.value} -> {new_state.value} (healthy={healthy}/{total})")

    def rebalance(self) -> None:
        """Manually trigger rebalancing.

        Re-evaluates health of all members and updates rotation: READY
        members are put into rotation, all others are taken out. The load
        balancer and circuit breaker are then reset and the state refreshed.
        """
        with self._lock:
            for member in self._members.values():
                should_rotate = member.provider.state == ProviderState.READY
                if should_rotate == member.in_rotation:
                    continue  # already in the right place, no event needed

                member.in_rotation = should_rotate
                if should_rotate:
                    member.consecutive_failures = 0
                self._record_event(
                    GroupMemberHealthChanged(
                        group_id=self.id,
                        member_id=member.id,
                        in_rotation=should_rotate,
                        reason="rebalance",
                    )
                )

            # Reset load balancer state
            self._load_balancer.reset()

            # Reset circuit breaker; only announce a close if it was open
            was_open = self._circuit_breaker.is_open
            self._circuit_breaker.reset()
            if was_open:
                self._record_event(GroupCircuitClosed(group_id=self.id))

            self._update_state()
            logger.info(f"Group {self.id} rebalanced: {self.healthy_count} healthy")

    # --- Lifecycle ---

    def start_all(self) -> int:
        """Start all members.

        Returns:
            Number of members successfully started
        """
        with self._lock:
            return sum(1 for member in self._members.values() if self._try_start_member(member))

    def stop_all(self) -> None:
        """Stop all members and take every one of them out of rotation.

        Shutdown errors are logged and do not abort the loop.
        """
        with self._lock:
            for member in self._members.values():
                # Leave rotation first so a failing shutdown can never leave
                # a half-stopped member eligible for selection.
                member.in_rotation = False
                try:
                    member.provider.shutdown()
                except Exception as e:
                    logger.warning(f"Failed to stop member {member.id}: {e}")
            self._update_state()

    def shutdown(self) -> None:
        """Shutdown the group and all members."""
        self.stop_all()
        logger.info(f"Group {self.id} shutdown complete")

    # --- Tools Access ---

    def _first_ready_member(self) -> Optional[GroupMember]:
        """Return the first member that is in rotation and READY, if any."""
        for member in self._members.values():
            if member.in_rotation and member.provider.state == ProviderState.READY:
                return member
        return None

    def get_tools(self) -> List[Any]:
        """Get tools from a healthy member.

        Returns tools from the first healthy member, as all members
        should have the same tools. Returns an empty list when no member
        is both in rotation and READY.
        """
        with self._lock:
            member = self._first_ready_member()
            if member is None:
                return []
            return list(member.provider.tools)

    def get_tool_names(self) -> List[str]:
        """Get list of tool names from a healthy member (empty if none)."""
        with self._lock:
            member = self._first_ready_member()
            if member is None:
                return []
            return member.provider.get_tool_names()

    # --- Serialization ---

    def to_status_dict(self) -> Dict[str, Any]:
        """Get a snapshot of group and per-member status as a dictionary."""
        with self._lock:
            return {
                "group_id": self.id,
                "description": self._description,
                "state": self._state.value,
                "strategy": self._strategy.value,
                "min_healthy": self._min_healthy,
                "healthy_count": self.healthy_count,
                "total_members": len(self._members),
                "is_available": self.is_available,
                "circuit_open": self._circuit_breaker.is_open,
                "members": [
                    {
                        "id": m.id,
                        "state": m.provider.state.value,
                        "in_rotation": m.in_rotation,
                        "weight": m.weight,
                        "priority": m.priority,
                        "consecutive_failures": m.consecutive_failures,
                    }
                    for m in self._members.values()
                ],
            }
|