marin-iris 0.99__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- iris/__init__.py +2 -0
- iris/_build_info.py +3 -0
- iris/actor/__init__.py +35 -0
- iris/actor/client.py +223 -0
- iris/actor/pool.py +281 -0
- iris/actor/resolver.py +108 -0
- iris/actor/server.py +355 -0
- iris/chaos.py +98 -0
- iris/cli/__init__.py +12 -0
- iris/cli/actor.py +69 -0
- iris/cli/bug_report.py +528 -0
- iris/cli/build.py +493 -0
- iris/cli/cluster.py +1142 -0
- iris/cli/job.py +1261 -0
- iris/cli/main.py +486 -0
- iris/cli/process_status.py +194 -0
- iris/cli/query.py +82 -0
- iris/cli/rpc.py +327 -0
- iris/cli/task.py +70 -0
- iris/cli/token_store.py +125 -0
- iris/client/__init__.py +49 -0
- iris/client/client.py +1081 -0
- iris/client/resolver.py +102 -0
- iris/client/worker_pool.py +595 -0
- iris/cluster/__init__.py +2 -0
- iris/cluster/bundle.py +185 -0
- iris/cluster/client/__init__.py +22 -0
- iris/cluster/client/bundle.py +213 -0
- iris/cluster/client/job_info.py +167 -0
- iris/cluster/client/protocol.py +108 -0
- iris/cluster/client/remote_client.py +501 -0
- iris/cluster/config.py +1331 -0
- iris/cluster/constraints.py +1169 -0
- iris/cluster/controller/__init__.py +2 -0
- iris/cluster/controller/actor_proxy.py +104 -0
- iris/cluster/controller/auth.py +424 -0
- iris/cluster/controller/autoscaler/__init__.py +6 -0
- iris/cluster/controller/autoscaler/models.py +75 -0
- iris/cluster/controller/autoscaler/operations.py +176 -0
- iris/cluster/controller/autoscaler/planning.py +135 -0
- iris/cluster/controller/autoscaler/recovery.py +136 -0
- iris/cluster/controller/autoscaler/routing.py +597 -0
- iris/cluster/controller/autoscaler/runtime.py +641 -0
- iris/cluster/controller/autoscaler/scaling_group.py +1340 -0
- iris/cluster/controller/autoscaler/status.py +169 -0
- iris/cluster/controller/autoscaler/worker_registry.py +175 -0
- iris/cluster/controller/budget.py +222 -0
- iris/cluster/controller/checkpoint.py +421 -0
- iris/cluster/controller/codec.py +117 -0
- iris/cluster/controller/controller.py +2671 -0
- iris/cluster/controller/dashboard.py +801 -0
- iris/cluster/controller/db.py +993 -0
- iris/cluster/controller/endpoint_proxy.py +288 -0
- iris/cluster/controller/main.py +358 -0
- iris/cluster/controller/migrations/0001_init.py +10 -0
- iris/cluster/controller/migrations/0002_read_indexes.py +8 -0
- iris/cluster/controller/migrations/0003_normalize_scaling_groups.py +39 -0
- iris/cluster/controller/migrations/0004_api_keys.py +38 -0
- iris/cluster/controller/migrations/0004_worker_indexes.py +15 -0
- iris/cluster/controller/migrations/0005_task_profiles.py +32 -0
- iris/cluster/controller/migrations/0006_jwt_signing_key.py +16 -0
- iris/cluster/controller/migrations/0007_perf_indexes.py +19 -0
- iris/cluster/controller/migrations/0008_jobs_name.py +38 -0
- iris/cluster/controller/migrations/0009_query_indexes.py +24 -0
- iris/cluster/controller/migrations/0010_dashboard_indexes.py +42 -0
- iris/cluster/controller/migrations/0010_purge_orphaned_endpoints.py +15 -0
- iris/cluster/controller/migrations/0011_direct_provider.py +33 -0
- iris/cluster/controller/migrations/0012_container_name.py +16 -0
- iris/cluster/controller/migrations/0012_separate_auth_db.py +53 -0
- iris/cluster/controller/migrations/0013_has_reservation.py +46 -0
- iris/cluster/controller/migrations/0014_profile_kind.py +36 -0
- iris/cluster/controller/migrations/0015_drop_redundant_index.py +11 -0
- iris/cluster/controller/migrations/0016_worker_scheduling_fields.py +57 -0
- iris/cluster/controller/migrations/0017_job_scheduling_fields.py +72 -0
- iris/cluster/controller/migrations/0018_task_assignment_fields.py +30 -0
- iris/cluster/controller/migrations/0019_worker_fk_cascade.py +73 -0
- iris/cluster/controller/migrations/0020_perf_indices_and_profiles_fk.py +53 -0
- iris/cluster/controller/migrations/0021_budgets.py +38 -0
- iris/cluster/controller/migrations/0022_workers_slice_and_group.py +28 -0
- iris/cluster/controller/migrations/0023_separate_profiles_db.py +63 -0
- iris/cluster/controller/migrations/0024_normalize_resource_usage.py +47 -0
- iris/cluster/controller/migrations/0024_task_resource_history.py +25 -0
- iris/cluster/controller/migrations/0025_normalize_resource_snapshots.py +105 -0
- iris/cluster/controller/migrations/0026_normalize_worker_metadata.py +95 -0
- iris/cluster/controller/migrations/0027_normalize_job_resources.py +101 -0
- iris/cluster/controller/migrations/0028_job_config_table.py +273 -0
- iris/cluster/controller/migrations/0029_drop_task_resource_usage_columns.py +30 -0
- iris/cluster/controller/migrations/0030_backfill_worker_region.py +57 -0
- iris/cluster/controller/migrations/0030_job_submit_argv.py +13 -0
- iris/cluster/controller/migrations/0031_auto_vacuum_incremental.py +24 -0
- iris/cluster/controller/migrations/0032_backfill_attempt_finished_at.py +52 -0
- iris/cluster/controller/migrations/0033_worker_task_history_fk_cascade.py +60 -0
- iris/cluster/controller/migrations/0034_task_summaries_covering_index.py +24 -0
- iris/cluster/controller/migrations/0035_drop_dead_logs_table.py +17 -0
- iris/cluster/controller/migrations/0036_reconcile_reservation_holder_attempt_ids.py +39 -0
- iris/cluster/controller/migrations/0037_drop_txn_log_and_txn_actions.py +19 -0
- iris/cluster/controller/migrations/0037_user_budget_default.py +18 -0
- iris/cluster/controller/migrations/0038_finalize_orphan_attempts.py +103 -0
- iris/cluster/controller/migrations/0039_requeue_split_coscheduled_jobs.py +214 -0
- iris/cluster/controller/migrations/0040_drop_resource_history_tables.py +35 -0
- iris/cluster/controller/migrations/0041_drop_worker_task_history.py +23 -0
- iris/cluster/controller/migrations/0042_drop_workers_dormant_columns.py +31 -0
- iris/cluster/controller/migrations/0043_drop_workers_committed_columns.py +33 -0
- iris/cluster/controller/migrations/0044_drop_dispatch_queue.py +45 -0
- iris/cluster/controller/migrations/0045_index_task_attempts_live_workerbound.py +40 -0
- iris/cluster/controller/migrations/0046_drop_slices_last_active_ms.py +22 -0
- iris/cluster/controller/provider.py +55 -0
- iris/cluster/controller/query.py +80 -0
- iris/cluster/controller/scheduler.py +940 -0
- iris/cluster/controller/schema.py +1710 -0
- iris/cluster/controller/service.py +2629 -0
- iris/cluster/controller/stores.py +2205 -0
- iris/cluster/controller/transitions.py +2764 -0
- iris/cluster/controller/vm_lifecycle.py +452 -0
- iris/cluster/controller/worker_health.py +199 -0
- iris/cluster/controller/worker_provider.py +289 -0
- iris/cluster/dashboard_common.py +181 -0
- iris/cluster/endpoints.py +187 -0
- iris/cluster/log_store_helpers.py +46 -0
- iris/cluster/process_status.py +105 -0
- iris/cluster/providers/__init__.py +30 -0
- iris/cluster/providers/_worker_base.py +116 -0
- iris/cluster/providers/factory.py +105 -0
- iris/cluster/providers/gcp/__init__.py +11 -0
- iris/cluster/providers/gcp/bootstrap.py +496 -0
- iris/cluster/providers/gcp/controller.py +378 -0
- iris/cluster/providers/gcp/fake.py +560 -0
- iris/cluster/providers/gcp/handles.py +492 -0
- iris/cluster/providers/gcp/local.py +171 -0
- iris/cluster/providers/gcp/service.py +948 -0
- iris/cluster/providers/gcp/ssh.py +158 -0
- iris/cluster/providers/gcp/workers.py +1029 -0
- iris/cluster/providers/k8s/__init__.py +4 -0
- iris/cluster/providers/k8s/bundle_fetch.py +84 -0
- iris/cluster/providers/k8s/constants.py +12 -0
- iris/cluster/providers/k8s/controller.py +919 -0
- iris/cluster/providers/k8s/fake.py +830 -0
- iris/cluster/providers/k8s/service.py +782 -0
- iris/cluster/providers/k8s/tasks.py +1680 -0
- iris/cluster/providers/k8s/types.py +146 -0
- iris/cluster/providers/local/__init__.py +2 -0
- iris/cluster/providers/local/cluster.py +338 -0
- iris/cluster/providers/manual/__init__.py +2 -0
- iris/cluster/providers/manual/provider.py +547 -0
- iris/cluster/providers/protocols.py +140 -0
- iris/cluster/providers/remote_exec.py +426 -0
- iris/cluster/providers/types.py +432 -0
- iris/cluster/redaction.py +93 -0
- iris/cluster/runtime/__init__.py +39 -0
- iris/cluster/runtime/docker.py +1182 -0
- iris/cluster/runtime/entrypoint.py +122 -0
- iris/cluster/runtime/env.py +134 -0
- iris/cluster/runtime/process.py +713 -0
- iris/cluster/runtime/profile.py +290 -0
- iris/cluster/runtime/types.py +385 -0
- iris/cluster/service_mode.py +10 -0
- iris/cluster/types.py +842 -0
- iris/cluster/worker/__init__.py +4 -0
- iris/cluster/worker/dashboard.py +61 -0
- iris/cluster/worker/env_probe.py +651 -0
- iris/cluster/worker/main.py +95 -0
- iris/cluster/worker/port_allocator.py +50 -0
- iris/cluster/worker/service.py +171 -0
- iris/cluster/worker/stats.py +151 -0
- iris/cluster/worker/task_attempt.py +1011 -0
- iris/cluster/worker/tpu_health.py +26 -0
- iris/cluster/worker/worker.py +1107 -0
- iris/cluster/worker/worker_types.py +70 -0
- iris/dev_tpu.py +87 -0
- iris/env_resources.py +174 -0
- iris/examples/coreweave-ci.yaml +92 -0
- iris/examples/coreweave-rno2a.yaml +99 -0
- iris/examples/coreweave-usw09b.yaml +98 -0
- iris/examples/coreweave.yaml +116 -0
- iris/examples/local-auth-gcp.yaml +35 -0
- iris/examples/local-auth-static.yaml +36 -0
- iris/examples/local.yaml +29 -0
- iris/examples/marin-dev.yaml +145 -0
- iris/examples/marin.yaml +223 -0
- iris/examples/smoke-gcp.yaml +71 -0
- iris/examples/test.yaml +165 -0
- iris/examples/tpu-demo.ipynb +461 -0
- iris/logging.py +12 -0
- iris/managed_thread.py +370 -0
- iris/rpc/__init__.py +12 -0
- iris/rpc/actor.proto +118 -0
- iris/rpc/actor_connect.py +513 -0
- iris/rpc/actor_pb2.py +70 -0
- iris/rpc/actor_pb2.pyi +134 -0
- iris/rpc/async_adapter.py +75 -0
- iris/rpc/auth.py +397 -0
- iris/rpc/codecs.py +62 -0
- iris/rpc/compression.py +23 -0
- iris/rpc/config.proto +534 -0
- iris/rpc/config_pb2.py +173 -0
- iris/rpc/config_pb2.pyi +581 -0
- iris/rpc/controller.proto +670 -0
- iris/rpc/controller_connect.py +2400 -0
- iris/rpc/controller_pb2.py +202 -0
- iris/rpc/controller_pb2.pyi +705 -0
- iris/rpc/errors.proto +28 -0
- iris/rpc/errors.py +301 -0
- iris/rpc/errors_pb2.py +38 -0
- iris/rpc/errors_pb2.pyi +19 -0
- iris/rpc/interceptors.py +190 -0
- iris/rpc/iris_logging.proto +46 -0
- iris/rpc/iris_logging_pb2.py +40 -0
- iris/rpc/iris_logging_pb2.pyi +39 -0
- iris/rpc/job.proto +621 -0
- iris/rpc/job_pb2.py +177 -0
- iris/rpc/job_pb2.pyi +768 -0
- iris/rpc/logging_pb2.py +9 -0
- iris/rpc/proto_utils.py +130 -0
- iris/rpc/query.proto +36 -0
- iris/rpc/query_pb2.py +41 -0
- iris/rpc/query_pb2.pyi +29 -0
- iris/rpc/stats.proto +70 -0
- iris/rpc/stats.py +289 -0
- iris/rpc/stats_connect.py +123 -0
- iris/rpc/stats_pb2.py +46 -0
- iris/rpc/stats_pb2.pyi +72 -0
- iris/rpc/stats_service.py +29 -0
- iris/rpc/time.proto +47 -0
- iris/rpc/time_pb2.py +39 -0
- iris/rpc/time_pb2.pyi +17 -0
- iris/rpc/vm.proto +189 -0
- iris/rpc/vm_pb2.py +89 -0
- iris/rpc/vm_pb2.pyi +288 -0
- iris/rpc/worker.proto +124 -0
- iris/rpc/worker_connect.py +709 -0
- iris/rpc/worker_pb2.py +73 -0
- iris/rpc/worker_pb2.pyi +109 -0
- iris/runtime/__init__.py +2 -0
- iris/runtime/jax_init.py +170 -0
- iris/test_util.py +65 -0
- iris/time_proto.py +28 -0
- iris/version.py +47 -0
- marin_iris-0.99.dist-info/METADATA +30 -0
- marin_iris-0.99.dist-info/RECORD +241 -0
- marin_iris-0.99.dist-info/WHEEL +4 -0
- marin_iris-0.99.dist-info/entry_points.txt +3 -0
iris/__init__.py
ADDED
iris/_build_info.py
ADDED
iris/actor/__init__.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# Copyright The Marin Authors
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
"""Actor system for distributed RPC.
|
|
5
|
+
|
|
6
|
+
For ClusterResolver (namespace-aware controller-based resolution),
|
|
7
|
+
see iris.client.resolver.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from iris.actor.client import ActorClient
|
|
11
|
+
from iris.actor.pool import ActorPool, BroadcastFuture, CallResult
|
|
12
|
+
from iris.actor.resolver import (
|
|
13
|
+
ACTOR_ENDPOINT_HEADER,
|
|
14
|
+
FixedResolver,
|
|
15
|
+
ProxyResolver,
|
|
16
|
+
ResolvedEndpoint,
|
|
17
|
+
Resolver,
|
|
18
|
+
ResolveResult,
|
|
19
|
+
)
|
|
20
|
+
from iris.actor.server import ActorId, ActorServer
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"ACTOR_ENDPOINT_HEADER",
|
|
24
|
+
"ActorClient",
|
|
25
|
+
"ActorId",
|
|
26
|
+
"ActorPool",
|
|
27
|
+
"ActorServer",
|
|
28
|
+
"BroadcastFuture",
|
|
29
|
+
"CallResult",
|
|
30
|
+
"FixedResolver",
|
|
31
|
+
"ProxyResolver",
|
|
32
|
+
"ResolveResult",
|
|
33
|
+
"ResolvedEndpoint",
|
|
34
|
+
"Resolver",
|
|
35
|
+
]
|
iris/actor/client.py
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
# Copyright The Marin Authors
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
"""Actor client for making RPC calls to actor servers.
|
|
5
|
+
|
|
6
|
+
The ActorClient provides transparent actor discovery and invocation with
|
|
7
|
+
automatic retry logic. Both resolution failures (e.g., actor not yet
|
|
8
|
+
registered) and transient RPC errors are retried up to ``max_call_attempts``
|
|
9
|
+
with exponential backoff.
|
|
10
|
+
|
|
11
|
+
Example:
|
|
12
|
+
resolver = ClusterResolver("http://controller:8080")
|
|
13
|
+
client = ActorClient(resolver, "my-actor")
|
|
14
|
+
result = client.some_method(arg1, arg2) # Retries until actor found
|
|
15
|
+
|
|
16
|
+
Custom backoff behavior:
|
|
17
|
+
client = ActorClient(
|
|
18
|
+
resolver, "my-actor",
|
|
19
|
+
backoff=ExponentialBackoff(initial=0.2, maximum=5.0),
|
|
20
|
+
max_call_attempts=3,
|
|
21
|
+
)
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
import logging
|
|
25
|
+
import time
|
|
26
|
+
from typing import Any
|
|
27
|
+
|
|
28
|
+
import cloudpickle
|
|
29
|
+
from connectrpc.code import Code
|
|
30
|
+
from connectrpc.errors import ConnectError
|
|
31
|
+
from rigging.timing import ExponentialBackoff
|
|
32
|
+
|
|
33
|
+
from iris.actor.resolver import Resolver
|
|
34
|
+
from iris.rpc import actor_pb2
|
|
35
|
+
from iris.rpc.actor_connect import ActorServiceClientSync
|
|
36
|
+
from iris.rpc.errors import call_with_retry
|
|
37
|
+
|
|
38
|
+
logger = logging.getLogger(__name__)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def unwrap_actor_response(resp: actor_pb2.ActorResponse) -> Any:
    """Unwrap an ActorResponse, raising the embedded exception on error.

    Args:
        resp: Response message returned by an actor RPC.

    Returns:
        The object deserialized from ``resp.serialized_value`` when the
        response has no ``error`` field set.

    Raises:
        BaseException: The exception deserialized from
            ``resp.error.serialized_exception`` when one is attached.
        RuntimeError: When the response carries an error but no raisable
            serialized exception; the message is
            ``"<error_type>: <message>"`` taken from the error fields.
    """
    # SECURITY NOTE(review): cloudpickle.loads runs arbitrary code embedded in
    # the payload, so responses must only come from trusted actor servers.
    if not resp.HasField("error"):
        return cloudpickle.loads(resp.serialized_value)

    error = resp.error
    if error.serialized_exception:
        exc = cloudpickle.loads(error.serialized_exception)
        raisable = isinstance(exc, BaseException) or (
            isinstance(exc, type) and issubclass(exc, BaseException)
        )
        if raisable:
            raise exc
        # A payload that is not an exception cannot be raised; fall through to
        # the descriptive RuntimeError instead of an opaque TypeError.
    raise RuntimeError(f"{error.error_type}: {error.message}")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class ActorClient:
    """Actor client with resolver-based discovery.

    By default the client waits forever, i.e. there's no timeout in httpx.
    Specify ``call_timeout`` to apply a timeout to individual RPC calls.

    Any non-dunder attribute that is not defined on the client is treated as
    the name of a remote actor method and returns a callable that invokes it.
    """

    def __init__(
        self,
        resolver: Resolver,
        name: str,
        call_timeout: float | None = None,
        max_call_attempts: int = 10,
        backoff: ExponentialBackoff | None = None,
    ):
        """Initialize the actor client.

        Args:
            resolver: Resolver instance for endpoint discovery
            name: Name of the actor to invoke
            call_timeout: Timeout in seconds for individual RPC calls.
                None (default) means no timeout.
            max_call_attempts: Maximum number of RPC call attempts (including
                resolution failures) before giving up.
            backoff: Exponential backoff configuration for retries between
                attempts. None (default) builds
                ``ExponentialBackoff(initial=0.5, maximum=10.0, factor=2.0, jitter=0.25)``.
        """
        # Build the default per client instead of evaluating it once in the
        # signature, so separate clients never share one backoff object.
        if backoff is None:
            backoff = ExponentialBackoff(initial=0.5, maximum=10.0, factor=2.0, jitter=0.25)

        self._resolver = resolver
        self._name = name
        self._call_timeout = call_timeout
        self._max_call_attempts = max_call_attempts
        self._backoff = backoff

        # Lazily created by rpc_client(); reset by _clear_connection().
        self._rpc_client: ActorServiceClientSync | None = None
        self._rpc_headers: dict[str, str] = {}

    def rpc_client(self) -> ActorServiceClientSync:
        """Resolve actor name to an RPC client (single attempt).

        Resolution is attempted once. On failure (empty endpoints or RPC error),
        the exception propagates to the caller. The outer ``call_with_retry`` in
        ``_RpcMethod.__call__`` is responsible for retrying.

        Returns:
            ActorServiceClientSync connected to the resolved endpoint.

        Raises:
            ConnectError(UNAVAILABLE): If no endpoints are found for the actor.
        """
        if self._rpc_client is not None:
            return self._rpc_client

        logger.info("Resolving name %s via %s", self._name, self._resolver)
        result = self._resolver.resolve(self._name)

        if result.is_empty:
            raise ConnectError(
                Code.UNAVAILABLE,
                f"No endpoints found for actor '{self._name}'",
            )

        logger.info(
            "Resolved actor '%s' to %d endpoint(s)",
            self._name,
            len(result.endpoints),
        )
        endpoint = result.first()
        logger.info("First endpoint: url=%s, actor_id=%s", endpoint.url, endpoint.actor_id)
        # The endpoint's metadata is sent as request headers on every call.
        self._rpc_headers = dict(endpoint.metadata)
        self._rpc_client = ActorServiceClientSync(
            address=endpoint.url,
            timeout_ms=None if self._call_timeout is None else int(self._call_timeout * 1000),
            accept_compression=[],
        )
        return self._rpc_client

    def _clear_connection(self, _exc: Exception) -> None:
        """Drop the cached client and headers so the next call re-resolves."""
        self._rpc_client = None
        self._rpc_headers = {}

    def start_operation(self, method_name: str, *args: Any, **kwargs: Any) -> str:
        """Start a long-running operation. Returns the operation ID."""
        call = actor_pb2.ActorCall(
            method_name=method_name,
            actor_name=self._name,
            serialized_args=cloudpickle.dumps(args),
            serialized_kwargs=cloudpickle.dumps(kwargs),
        )

        def do_call():
            client = self.rpc_client()
            return client.start_operation(call, headers=self._rpc_headers)

        op = call_with_retry(
            f"{self._name}.start_operation({method_name})",
            do_call,
            on_retry=self._clear_connection,
            max_attempts=self._max_call_attempts,
            backoff=self._backoff,
        )
        return op.operation_id

    def poll_operation_status(self, operation_id: str) -> actor_pb2.Operation:
        """Single-shot poll of a long-running operation's state."""
        req = actor_pb2.OperationId(operation_id=operation_id)

        def do_call():
            return self.rpc_client().get_operation(req, headers=self._rpc_headers)

        return call_with_retry(
            f"{self._name}.poll_operation_status({operation_id[:8]})",
            do_call,
            on_retry=self._clear_connection,
            max_attempts=self._max_call_attempts,
            backoff=self._backoff,
        )

    def get_operation(
        self,
        operation_id: str,
        poll_backoff: ExponentialBackoff | None = None,
    ) -> actor_pb2.Operation:
        """Poll a long-running operation until it completes, using exponential backoff.

        Args:
            operation_id: ID returned by ``start_operation``.
            poll_backoff: Backoff controlling the sleep between polls. None
                (default) builds
                ``ExponentialBackoff(initial=0.1, maximum=10.0, factor=2.0, jitter=0.25)``.

        Returns:
            The first polled Operation whose state is not ``RUNNING``.
        """
        if poll_backoff is None:
            poll_backoff = ExponentialBackoff(initial=0.1, maximum=10.0, factor=2.0, jitter=0.25)
        while True:
            op = self.poll_operation_status(operation_id)
            if op.state != actor_pb2.Operation.RUNNING:
                return op
            time.sleep(poll_backoff.next_interval())

    def cancel_operation(self, operation_id: str) -> actor_pb2.Operation:
        """Cancel a long-running operation."""
        req = actor_pb2.OperationId(operation_id=operation_id)

        def do_call():
            return self.rpc_client().cancel_operation(req, headers=self._rpc_headers)

        return call_with_retry(
            f"{self._name}.cancel_operation({operation_id[:8]})",
            do_call,
            on_retry=self._clear_connection,
            max_attempts=self._max_call_attempts,
            backoff=self._backoff,
        )

    def __getattr__(self, method_name: str) -> "_RpcMethod":
        """Return a callable that invokes ``method_name`` on the remote actor.

        Raises:
            AttributeError: For dunder names. Protocol probes such as
                ``__setstate__`` or ``__deepcopy__`` (issued by pickle and
                copy) must see a missing attribute rather than an RPC proxy.
        """
        if method_name.startswith("__") and method_name.endswith("__"):
            raise AttributeError(f"{type(self).__name__!r} object has no attribute {method_name!r}")
        return _RpcMethod(self, method_name)
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
class _RpcMethod:
    """Callable bound to a single actor method name on an ``ActorClient``."""

    def __init__(self, client: ActorClient, method_name: str):
        self._method_name = method_name
        self._client = client

    def __call__(self, *args: Any, **kwargs: Any) -> Any:
        """Invoke the remote method with retries and return its unwrapped result."""
        owner = self._client
        # The request is serialized once and reused across retry attempts.
        request = actor_pb2.ActorCall(
            method_name=self._method_name,
            actor_name=owner._name,
            serialized_args=cloudpickle.dumps(args),
            serialized_kwargs=cloudpickle.dumps(kwargs),
        )

        def attempt():
            # rpc_client() may (re)resolve and refresh the owner's headers, so
            # it must run before the headers are read.
            rpc = owner.rpc_client()
            response = rpc.call(request, headers=owner._rpc_headers)
            return unwrap_actor_response(response)

        label = f"{owner._name}.{self._method_name}"
        return call_with_retry(
            label,
            attempt,
            on_retry=owner._clear_connection,
            max_attempts=owner._max_call_attempts,
            backoff=owner._backoff,
        )
|
iris/actor/pool.py
ADDED
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
# Copyright The Marin Authors
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
"""Actor pool for load-balanced and broadcast RPC calls."""
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
import threading
|
|
8
|
+
import time
|
|
9
|
+
from collections.abc import Callable, Iterator
|
|
10
|
+
from concurrent.futures import Future, ThreadPoolExecutor, as_completed
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from typing import Any, Generic, TypeVar
|
|
13
|
+
|
|
14
|
+
import cloudpickle
|
|
15
|
+
from rigging.timing import ExponentialBackoff
|
|
16
|
+
|
|
17
|
+
from iris.actor.client import unwrap_actor_response
|
|
18
|
+
from iris.actor.resolver import ResolvedEndpoint, Resolver, ResolveResult
|
|
19
|
+
from iris.rpc import actor_pb2
|
|
20
|
+
from iris.rpc.actor_connect import ActorServiceClientSync
|
|
21
|
+
from iris.rpc.errors import call_with_retry
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
T = TypeVar("T")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class CallResult:
    """Outcome of one endpoint's call within a broadcast."""

    # Endpoint the call was sent to.
    endpoint: ResolvedEndpoint
    # Value produced by the call; defaults to None.
    value: Any | None = None
    # Exception captured for the call; None means no failure was recorded.
    exception: BaseException | None = None

    @property
    def success(self) -> bool:
        """True when no exception was recorded for this call."""
        no_error = self.exception is None
        return no_error
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class BroadcastFuture(Generic[T]):
    """Future representing results from a broadcast call to multiple endpoints."""

    def __init__(self, futures: list[tuple[ResolvedEndpoint, Future]]):
        """Wrap ``(endpoint, future)`` pairs, one per endpoint that was called."""
        self._futures = futures

    @staticmethod
    def _collect(endpoint: ResolvedEndpoint, future: Future, timeout: float | None = None) -> CallResult:
        """Turn one future into a CallResult, capturing any raised exception."""
        try:
            value = future.result(timeout=timeout)
        except Exception as e:
            return CallResult(endpoint=endpoint, exception=e)
        return CallResult(endpoint=endpoint, value=value)

    def wait_all(self, timeout: float | None = None) -> list[CallResult]:
        """Wait for every call and return one result per endpoint, in submit order.

        Args:
            timeout: Upper bound in seconds on the total wait across all
                futures. None waits indefinitely.

        Returns:
            One CallResult per endpoint. A call that failed, or that had not
            finished before the deadline, carries the exception instead of a
            value.
        """
        # One shared deadline: applying ``timeout`` to each future in turn
        # would let the total wait grow to len(futures) * timeout.
        deadline = None if timeout is None else time.monotonic() + timeout
        results = []
        for endpoint, future in self._futures:
            remaining = None if deadline is None else max(0.0, deadline - time.monotonic())
            results.append(self._collect(endpoint, future, remaining))
        return results

    def wait_any(self, timeout: float | None = None) -> CallResult:
        """Return the result of the first call to finish, successful or not.

        Args:
            timeout: Seconds to wait for the first completion. None waits
                indefinitely.

        Raises:
            TimeoutError: If there are no futures to wait on, or (raised by
                ``concurrent.futures.as_completed``) if nothing completes
                within ``timeout``.
        """
        endpoint_map = {id(f): ep for ep, f in self._futures}
        for future in as_completed([f for _, f in self._futures], timeout=timeout):
            return self._collect(endpoint_map[id(future)], future)
        raise TimeoutError("No results within timeout")

    def as_completed(self, timeout: float | None = None) -> Iterator[CallResult]:
        """Iterate over results as they complete.

        Args:
            timeout: Passed to ``concurrent.futures.as_completed``; None waits
                indefinitely.
        """
        endpoint_map = {id(f): ep for ep, f in self._futures}
        for future in as_completed([f for _, f in self._futures], timeout=timeout):
            yield self._collect(endpoint_map[id(future)], future)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class ActorPool(Generic[T]):
    """Pool of actors for load-balanced and broadcast calls.

    Resolves a pool of endpoints for an actor name and provides methods to
    distribute calls across them (round-robin) or broadcast to all endpoints.

    Example:
        >>> pool = ActorPool(resolver, "inference")
        >>> result = pool.call().predict(data)  # Round-robin to one endpoint
        >>> broadcast = pool.broadcast().reload_model()  # Send to all endpoints
        >>> results = broadcast.wait_all()
    """

    def __init__(
        self,
        resolver: Resolver,
        name: str,
        timeout: float = 30.0,
        max_call_attempts: int = 5,
        backoff: ExponentialBackoff | None = None,
        resolve_ttl: float = 5.0,
    ):
        """Initialize actor pool.

        Args:
            resolver: Resolver to discover endpoints
            name: Actor name to resolve
            timeout: RPC timeout in seconds
            max_call_attempts: Maximum number of RPC call attempts before giving up.
            backoff: Exponential backoff configuration for call retries. None
                (default) builds
                ``ExponentialBackoff(initial=0.1, maximum=10.0, factor=2.0, jitter=0.25)``.
            resolve_ttl: Seconds to cache resolve results before re-querying the resolver
        """
        # Build the default per pool instead of evaluating it once in the
        # signature, so separate pools never share one backoff object.
        if backoff is None:
            backoff = ExponentialBackoff(initial=0.1, maximum=10.0, factor=2.0, jitter=0.25)

        self._resolver = resolver
        self._name = name
        self._timeout = timeout
        self._max_call_attempts = max_call_attempts
        self._backoff = backoff
        self._resolve_ttl = resolve_ttl
        self._endpoint_index = 0
        self._cached_result: ResolveResult | None = None
        self._last_resolve_time: float = 0.0
        # Guards the round-robin index, the resolve cache and the client cache.
        self._lock = threading.Lock()
        self._executor = ThreadPoolExecutor(max_workers=32)
        self._clients: dict[str, ActorServiceClientSync] = {}

    def _get_client(self, endpoint: ResolvedEndpoint) -> ActorServiceClientSync:
        """Return a cached client for the endpoint, creating one if needed."""
        url = endpoint.url
        with self._lock:
            client = self._clients.get(url)
            if client is not None:
                return client
            client = ActorServiceClientSync(
                address=url,
                timeout_ms=int(self._timeout * 1000),
                accept_compression=[],
            )
            self._clients[url] = client
            return client

    def _resolve(self) -> ResolveResult:
        """Return the endpoints for the actor, using the cache while it is fresh.

        Only non-empty results are cached, so an empty resolution is retried
        on the next call.
        """
        now = time.monotonic()
        with self._lock:
            if self._cached_result is not None and (now - self._last_resolve_time) < self._resolve_ttl:
                return self._cached_result

        # The resolver is queried outside the lock.
        result = self._resolver.resolve(self._name)
        if result.endpoints:
            with self._lock:
                self._cached_result = result
                self._last_resolve_time = time.monotonic()
        return result

    def _invalidate_resolve_cache(self) -> None:
        """Force the next _resolve() call to re-query the resolver."""
        with self._lock:
            self._last_resolve_time = 0.0
            self._cached_result = None

    def _evict_client(self, url: str) -> None:
        """Remove and close a cached client so it is recreated on next use."""
        with self._lock:
            client = self._clients.pop(url, None)
        if client is not None:
            try:
                client.close()
            except Exception:
                # Best effort: the client is being discarded anyway.
                logger.debug("Error closing evicted client for %s", url, exc_info=True)

    def _get_next_endpoint(self) -> ResolvedEndpoint:
        """Get the next endpoint in round-robin order.

        Thread-safe: uses a lock to protect the endpoint index.

        Raises:
            RuntimeError: If the resolver returned no endpoints.
        """
        endpoints = self._resolve().endpoints
        with self._lock:
            if not endpoints:
                raise RuntimeError(f"No endpoints for '{self._name}'")
            endpoint = endpoints[self._endpoint_index % len(endpoints)]
            self._endpoint_index += 1
            return endpoint

    def shutdown(self) -> None:
        """Wait for the executor to finish, then close every cached client."""
        self._executor.shutdown(wait=True)
        with self._lock:
            clients = list(self._clients.values())
            self._clients.clear()
        for client in clients:
            try:
                client.close()
            except Exception:
                # Best effort: keep closing the remaining clients.
                logger.debug("Error closing client during shutdown", exc_info=True)

    def __enter__(self) -> "ActorPool[T]":
        return self

    def __exit__(self, *args) -> None:
        self.shutdown()

    @property
    def size(self) -> int:
        """Number of endpoints currently resolved for the actor."""
        return len(self._resolve().endpoints)

    @property
    def endpoints(self) -> list[ResolvedEndpoint]:
        """Copy of the currently resolved endpoint list."""
        return list(self._resolve().endpoints)

    def _call_endpoint(
        self,
        endpoint: ResolvedEndpoint,
        method_name: str,
        args: tuple,
        kwargs: dict,
    ) -> Any:
        """Invoke ``method_name`` on one endpoint (single attempt, no retry).

        Returns:
            The unwrapped result of the call.

        Raises:
            Whatever the RPC client or ``unwrap_actor_response`` raises.
        """
        client = self._get_client(endpoint)

        call = actor_pb2.ActorCall(
            method_name=method_name,
            actor_name=self._name,
            serialized_args=cloudpickle.dumps(args),
            serialized_kwargs=cloudpickle.dumps(kwargs),
        )

        # Send the endpoint's metadata as request headers, as ActorClient does
        # for its resolved endpoint.
        # NOTE(review): presumably needed for proxy-resolved endpoints, whose
        # routing header travels in the metadata - confirm against ProxyResolver.
        resp = client.call(call, headers=dict(endpoint.metadata))
        return unwrap_actor_response(resp)

    def call(self) -> "_PoolCallProxy[T]":
        """Return a proxy whose attribute calls go to one endpoint, round-robin."""
        return _PoolCallProxy(self)

    def broadcast(self) -> "_PoolBroadcastProxy[T]":
        """Return a proxy whose attribute calls are sent to every endpoint."""
        return _PoolBroadcastProxy(self)
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
class _PoolCallProxy(Generic[T]):
    """Proxy that turns attribute access into a retried, round-robin actor call."""

    def __init__(self, pool: ActorPool[T]):
        self._pool = pool

    def __getattr__(self, method_name: str) -> Callable[..., Any]:
        """Return a function that calls ``method_name`` on the next endpoint."""

        def call(*args, **kwargs):
            pool = self._pool
            # URL used by the most recent attempt, so a failure can evict
            # exactly that cached client.
            attempted_url: str | None = None

            def attempt():
                nonlocal attempted_url
                target = pool._get_next_endpoint()
                attempted_url = target.url
                return pool._call_endpoint(target, method_name, args, kwargs)

            def reset_after_failure(_exc):
                pool._invalidate_resolve_cache()
                if attempted_url is None:
                    return
                pool._evict_client(attempted_url)

            label = f"{pool._name}.{method_name}"
            return call_with_retry(
                label,
                attempt,
                on_retry=reset_after_failure,
                max_attempts=pool._max_call_attempts,
                backoff=pool._backoff,
            )

        return call
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
class _PoolBroadcastProxy(Generic[T]):
    """Attribute-style dispatcher that fans a call out to every endpoint.

    Looking up any missing attribute returns a callable; invoking it submits
    one ``_call_endpoint`` task per resolved endpoint to the pool's executor
    and wraps the resulting futures in a ``BroadcastFuture``.
    """

    def __init__(self, pool: ActorPool[T]):
        self._pool = pool

    def __getattr__(self, method_name: str) -> Callable[..., BroadcastFuture]:
        def broadcast(*args, **kwargs) -> BroadcastFuture:
            pool = self._pool
            resolved = pool._resolve()
            # Pair each endpoint with its future so results can be attributed
            # to the endpoint that produced them.
            pending = [
                (
                    target,
                    pool._executor.submit(
                        pool._call_endpoint,
                        target,
                        method_name,
                        args,
                        kwargs,
                    ),
                )
                for target in resolved.endpoints
            ]
            return BroadcastFuture(pending)

        return broadcast
|
iris/actor/resolver.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# Copyright The Marin Authors
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
"""Core types and resolver implementations for the actor system."""
|
|
5
|
+
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from typing import Protocol
|
|
8
|
+
|
|
9
|
+
# Header used by ActorProxy to route requests to the correct actor endpoint.
|
|
10
|
+
# Shared constant between ProxyResolver (client-side) and ActorProxy (server-side).
|
|
11
|
+
ACTOR_ENDPOINT_HEADER = "x-iris-actor-endpoint"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class ResolvedEndpoint:
    """One concrete address at which an actor can be reached."""

    # Base URL that requests for this endpoint are sent to.
    url: str
    # Identifier assigned to this endpoint by the resolver that produced it.
    actor_id: str
    # Free-form string pairs attached by the resolver; each instance gets
    # its own empty dict by default.
    metadata: dict[str, str] = field(default_factory=dict)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class ResolveResult:
    """Outcome of looking up an actor name: the name plus its endpoints."""

    # Actor name that was resolved.
    name: str
    # Endpoints found for the name; empty when nothing matched.
    endpoints: list[ResolvedEndpoint] = field(default_factory=list)

    @property
    def is_empty(self) -> bool:
        """True when the lookup produced no endpoints."""
        return not self.endpoints

    def first(self) -> ResolvedEndpoint:
        """Return the first endpoint in the result.

        Returns:
            The endpoint at index 0.

        Raises:
            ValueError: If the result holds no endpoints.
        """
        if self.endpoints:
            return self.endpoints[0]
        raise ValueError(f"No endpoints for '{self.name}'")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class Resolver(Protocol):
    """Structural interface for anything that maps an actor name to endpoints.

    Known implementations:
    - FixedResolver: static name-to-URL mapping
    - ProxyResolver: routes every call through the controller's actor proxy
    - ClusterResolver: resolves via cluster controller (lives in iris.client)
    """

    def resolve(self, name: str) -> ResolveResult: ...
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class FixedResolver:
    """Resolver backed by a fixed, caller-supplied table of endpoints.

    Intended for tests or deployments where the URLs are known up front.
    Names are looked up exactly as given; no namespace prefix is applied.
    """

    def __init__(self, endpoints: dict[str, str | list[str]]):
        """Store the name-to-URL table, normalizing each value to a list.

        Args:
            endpoints: Mapping from actor name to a single URL or a list of
                URLs. Lists are copied, so later changes to the caller's
                list do not affect this resolver.
        """
        self._endpoints: dict[str, list[str]] = {
            name: [urls] if isinstance(urls, str) else list(urls)
            for name, urls in endpoints.items()
        }

    def resolve(self, name: str) -> ResolveResult:
        """Return the configured endpoints for ``name`` (empty if unknown).

        Each endpoint gets a synthetic id of the form ``fixed-<name>-<index>``.
        """
        resolved = [
            ResolvedEndpoint(url=url, actor_id=f"fixed-{name}-{index}")
            for index, url in enumerate(self._endpoints.get(name, []))
        ]
        return ResolveResult(name=name, endpoints=resolved)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class ProxyResolver:
    """Resolver that sends every actor call via the controller's actor proxy.

    Rather than returning the actor's own address, ``resolve`` always returns
    a single endpoint pointing at the controller URL. The actor name travels
    in the endpoint metadata under ``ACTOR_ENDPOINT_HEADER``, which the proxy
    uses to find the actual actor endpoint.

    Callers pass the full actor name as registered in the endpoint registry
    (e.g. ``/user/job/coordinator/actor-0``).

    Args:
        controller_url: Controller URL (e.g., ``http://localhost:8080``)
    """

    def __init__(self, controller_url: str):
        # Trailing slashes are stripped from the stored URL.
        self._controller_url = controller_url.rstrip("/")

    def resolve(self, name: str) -> ResolveResult:
        """Return one controller-addressed endpoint tagged with ``name``."""
        proxied = ResolvedEndpoint(
            url=self._controller_url,
            actor_id=f"proxy-{name}",
            metadata={ACTOR_ENDPOINT_HEADER: name},
        )
        return ResolveResult(name=name, endpoints=[proxied])
|