marin-iris 0.99__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (241)
  1. iris/__init__.py +2 -0
  2. iris/_build_info.py +3 -0
  3. iris/actor/__init__.py +35 -0
  4. iris/actor/client.py +223 -0
  5. iris/actor/pool.py +281 -0
  6. iris/actor/resolver.py +108 -0
  7. iris/actor/server.py +355 -0
  8. iris/chaos.py +98 -0
  9. iris/cli/__init__.py +12 -0
  10. iris/cli/actor.py +69 -0
  11. iris/cli/bug_report.py +528 -0
  12. iris/cli/build.py +493 -0
  13. iris/cli/cluster.py +1142 -0
  14. iris/cli/job.py +1261 -0
  15. iris/cli/main.py +486 -0
  16. iris/cli/process_status.py +194 -0
  17. iris/cli/query.py +82 -0
  18. iris/cli/rpc.py +327 -0
  19. iris/cli/task.py +70 -0
  20. iris/cli/token_store.py +125 -0
  21. iris/client/__init__.py +49 -0
  22. iris/client/client.py +1081 -0
  23. iris/client/resolver.py +102 -0
  24. iris/client/worker_pool.py +595 -0
  25. iris/cluster/__init__.py +2 -0
  26. iris/cluster/bundle.py +185 -0
  27. iris/cluster/client/__init__.py +22 -0
  28. iris/cluster/client/bundle.py +213 -0
  29. iris/cluster/client/job_info.py +167 -0
  30. iris/cluster/client/protocol.py +108 -0
  31. iris/cluster/client/remote_client.py +501 -0
  32. iris/cluster/config.py +1331 -0
  33. iris/cluster/constraints.py +1169 -0
  34. iris/cluster/controller/__init__.py +2 -0
  35. iris/cluster/controller/actor_proxy.py +104 -0
  36. iris/cluster/controller/auth.py +424 -0
  37. iris/cluster/controller/autoscaler/__init__.py +6 -0
  38. iris/cluster/controller/autoscaler/models.py +75 -0
  39. iris/cluster/controller/autoscaler/operations.py +176 -0
  40. iris/cluster/controller/autoscaler/planning.py +135 -0
  41. iris/cluster/controller/autoscaler/recovery.py +136 -0
  42. iris/cluster/controller/autoscaler/routing.py +597 -0
  43. iris/cluster/controller/autoscaler/runtime.py +641 -0
  44. iris/cluster/controller/autoscaler/scaling_group.py +1340 -0
  45. iris/cluster/controller/autoscaler/status.py +169 -0
  46. iris/cluster/controller/autoscaler/worker_registry.py +175 -0
  47. iris/cluster/controller/budget.py +222 -0
  48. iris/cluster/controller/checkpoint.py +421 -0
  49. iris/cluster/controller/codec.py +117 -0
  50. iris/cluster/controller/controller.py +2671 -0
  51. iris/cluster/controller/dashboard.py +801 -0
  52. iris/cluster/controller/db.py +993 -0
  53. iris/cluster/controller/endpoint_proxy.py +288 -0
  54. iris/cluster/controller/main.py +358 -0
  55. iris/cluster/controller/migrations/0001_init.py +10 -0
  56. iris/cluster/controller/migrations/0002_read_indexes.py +8 -0
  57. iris/cluster/controller/migrations/0003_normalize_scaling_groups.py +39 -0
  58. iris/cluster/controller/migrations/0004_api_keys.py +38 -0
  59. iris/cluster/controller/migrations/0004_worker_indexes.py +15 -0
  60. iris/cluster/controller/migrations/0005_task_profiles.py +32 -0
  61. iris/cluster/controller/migrations/0006_jwt_signing_key.py +16 -0
  62. iris/cluster/controller/migrations/0007_perf_indexes.py +19 -0
  63. iris/cluster/controller/migrations/0008_jobs_name.py +38 -0
  64. iris/cluster/controller/migrations/0009_query_indexes.py +24 -0
  65. iris/cluster/controller/migrations/0010_dashboard_indexes.py +42 -0
  66. iris/cluster/controller/migrations/0010_purge_orphaned_endpoints.py +15 -0
  67. iris/cluster/controller/migrations/0011_direct_provider.py +33 -0
  68. iris/cluster/controller/migrations/0012_container_name.py +16 -0
  69. iris/cluster/controller/migrations/0012_separate_auth_db.py +53 -0
  70. iris/cluster/controller/migrations/0013_has_reservation.py +46 -0
  71. iris/cluster/controller/migrations/0014_profile_kind.py +36 -0
  72. iris/cluster/controller/migrations/0015_drop_redundant_index.py +11 -0
  73. iris/cluster/controller/migrations/0016_worker_scheduling_fields.py +57 -0
  74. iris/cluster/controller/migrations/0017_job_scheduling_fields.py +72 -0
  75. iris/cluster/controller/migrations/0018_task_assignment_fields.py +30 -0
  76. iris/cluster/controller/migrations/0019_worker_fk_cascade.py +73 -0
  77. iris/cluster/controller/migrations/0020_perf_indices_and_profiles_fk.py +53 -0
  78. iris/cluster/controller/migrations/0021_budgets.py +38 -0
  79. iris/cluster/controller/migrations/0022_workers_slice_and_group.py +28 -0
  80. iris/cluster/controller/migrations/0023_separate_profiles_db.py +63 -0
  81. iris/cluster/controller/migrations/0024_normalize_resource_usage.py +47 -0
  82. iris/cluster/controller/migrations/0024_task_resource_history.py +25 -0
  83. iris/cluster/controller/migrations/0025_normalize_resource_snapshots.py +105 -0
  84. iris/cluster/controller/migrations/0026_normalize_worker_metadata.py +95 -0
  85. iris/cluster/controller/migrations/0027_normalize_job_resources.py +101 -0
  86. iris/cluster/controller/migrations/0028_job_config_table.py +273 -0
  87. iris/cluster/controller/migrations/0029_drop_task_resource_usage_columns.py +30 -0
  88. iris/cluster/controller/migrations/0030_backfill_worker_region.py +57 -0
  89. iris/cluster/controller/migrations/0030_job_submit_argv.py +13 -0
  90. iris/cluster/controller/migrations/0031_auto_vacuum_incremental.py +24 -0
  91. iris/cluster/controller/migrations/0032_backfill_attempt_finished_at.py +52 -0
  92. iris/cluster/controller/migrations/0033_worker_task_history_fk_cascade.py +60 -0
  93. iris/cluster/controller/migrations/0034_task_summaries_covering_index.py +24 -0
  94. iris/cluster/controller/migrations/0035_drop_dead_logs_table.py +17 -0
  95. iris/cluster/controller/migrations/0036_reconcile_reservation_holder_attempt_ids.py +39 -0
  96. iris/cluster/controller/migrations/0037_drop_txn_log_and_txn_actions.py +19 -0
  97. iris/cluster/controller/migrations/0037_user_budget_default.py +18 -0
  98. iris/cluster/controller/migrations/0038_finalize_orphan_attempts.py +103 -0
  99. iris/cluster/controller/migrations/0039_requeue_split_coscheduled_jobs.py +214 -0
  100. iris/cluster/controller/migrations/0040_drop_resource_history_tables.py +35 -0
  101. iris/cluster/controller/migrations/0041_drop_worker_task_history.py +23 -0
  102. iris/cluster/controller/migrations/0042_drop_workers_dormant_columns.py +31 -0
  103. iris/cluster/controller/migrations/0043_drop_workers_committed_columns.py +33 -0
  104. iris/cluster/controller/migrations/0044_drop_dispatch_queue.py +45 -0
  105. iris/cluster/controller/migrations/0045_index_task_attempts_live_workerbound.py +40 -0
  106. iris/cluster/controller/migrations/0046_drop_slices_last_active_ms.py +22 -0
  107. iris/cluster/controller/provider.py +55 -0
  108. iris/cluster/controller/query.py +80 -0
  109. iris/cluster/controller/scheduler.py +940 -0
  110. iris/cluster/controller/schema.py +1710 -0
  111. iris/cluster/controller/service.py +2629 -0
  112. iris/cluster/controller/stores.py +2205 -0
  113. iris/cluster/controller/transitions.py +2764 -0
  114. iris/cluster/controller/vm_lifecycle.py +452 -0
  115. iris/cluster/controller/worker_health.py +199 -0
  116. iris/cluster/controller/worker_provider.py +289 -0
  117. iris/cluster/dashboard_common.py +181 -0
  118. iris/cluster/endpoints.py +187 -0
  119. iris/cluster/log_store_helpers.py +46 -0
  120. iris/cluster/process_status.py +105 -0
  121. iris/cluster/providers/__init__.py +30 -0
  122. iris/cluster/providers/_worker_base.py +116 -0
  123. iris/cluster/providers/factory.py +105 -0
  124. iris/cluster/providers/gcp/__init__.py +11 -0
  125. iris/cluster/providers/gcp/bootstrap.py +496 -0
  126. iris/cluster/providers/gcp/controller.py +378 -0
  127. iris/cluster/providers/gcp/fake.py +560 -0
  128. iris/cluster/providers/gcp/handles.py +492 -0
  129. iris/cluster/providers/gcp/local.py +171 -0
  130. iris/cluster/providers/gcp/service.py +948 -0
  131. iris/cluster/providers/gcp/ssh.py +158 -0
  132. iris/cluster/providers/gcp/workers.py +1029 -0
  133. iris/cluster/providers/k8s/__init__.py +4 -0
  134. iris/cluster/providers/k8s/bundle_fetch.py +84 -0
  135. iris/cluster/providers/k8s/constants.py +12 -0
  136. iris/cluster/providers/k8s/controller.py +919 -0
  137. iris/cluster/providers/k8s/fake.py +830 -0
  138. iris/cluster/providers/k8s/service.py +782 -0
  139. iris/cluster/providers/k8s/tasks.py +1680 -0
  140. iris/cluster/providers/k8s/types.py +146 -0
  141. iris/cluster/providers/local/__init__.py +2 -0
  142. iris/cluster/providers/local/cluster.py +338 -0
  143. iris/cluster/providers/manual/__init__.py +2 -0
  144. iris/cluster/providers/manual/provider.py +547 -0
  145. iris/cluster/providers/protocols.py +140 -0
  146. iris/cluster/providers/remote_exec.py +426 -0
  147. iris/cluster/providers/types.py +432 -0
  148. iris/cluster/redaction.py +93 -0
  149. iris/cluster/runtime/__init__.py +39 -0
  150. iris/cluster/runtime/docker.py +1182 -0
  151. iris/cluster/runtime/entrypoint.py +122 -0
  152. iris/cluster/runtime/env.py +134 -0
  153. iris/cluster/runtime/process.py +713 -0
  154. iris/cluster/runtime/profile.py +290 -0
  155. iris/cluster/runtime/types.py +385 -0
  156. iris/cluster/service_mode.py +10 -0
  157. iris/cluster/types.py +842 -0
  158. iris/cluster/worker/__init__.py +4 -0
  159. iris/cluster/worker/dashboard.py +61 -0
  160. iris/cluster/worker/env_probe.py +651 -0
  161. iris/cluster/worker/main.py +95 -0
  162. iris/cluster/worker/port_allocator.py +50 -0
  163. iris/cluster/worker/service.py +171 -0
  164. iris/cluster/worker/stats.py +151 -0
  165. iris/cluster/worker/task_attempt.py +1011 -0
  166. iris/cluster/worker/tpu_health.py +26 -0
  167. iris/cluster/worker/worker.py +1107 -0
  168. iris/cluster/worker/worker_types.py +70 -0
  169. iris/dev_tpu.py +87 -0
  170. iris/env_resources.py +174 -0
  171. iris/examples/coreweave-ci.yaml +92 -0
  172. iris/examples/coreweave-rno2a.yaml +99 -0
  173. iris/examples/coreweave-usw09b.yaml +98 -0
  174. iris/examples/coreweave.yaml +116 -0
  175. iris/examples/local-auth-gcp.yaml +35 -0
  176. iris/examples/local-auth-static.yaml +36 -0
  177. iris/examples/local.yaml +29 -0
  178. iris/examples/marin-dev.yaml +145 -0
  179. iris/examples/marin.yaml +223 -0
  180. iris/examples/smoke-gcp.yaml +71 -0
  181. iris/examples/test.yaml +165 -0
  182. iris/examples/tpu-demo.ipynb +461 -0
  183. iris/logging.py +12 -0
  184. iris/managed_thread.py +370 -0
  185. iris/rpc/__init__.py +12 -0
  186. iris/rpc/actor.proto +118 -0
  187. iris/rpc/actor_connect.py +513 -0
  188. iris/rpc/actor_pb2.py +70 -0
  189. iris/rpc/actor_pb2.pyi +134 -0
  190. iris/rpc/async_adapter.py +75 -0
  191. iris/rpc/auth.py +397 -0
  192. iris/rpc/codecs.py +62 -0
  193. iris/rpc/compression.py +23 -0
  194. iris/rpc/config.proto +534 -0
  195. iris/rpc/config_pb2.py +173 -0
  196. iris/rpc/config_pb2.pyi +581 -0
  197. iris/rpc/controller.proto +670 -0
  198. iris/rpc/controller_connect.py +2400 -0
  199. iris/rpc/controller_pb2.py +202 -0
  200. iris/rpc/controller_pb2.pyi +705 -0
  201. iris/rpc/errors.proto +28 -0
  202. iris/rpc/errors.py +301 -0
  203. iris/rpc/errors_pb2.py +38 -0
  204. iris/rpc/errors_pb2.pyi +19 -0
  205. iris/rpc/interceptors.py +190 -0
  206. iris/rpc/iris_logging.proto +46 -0
  207. iris/rpc/iris_logging_pb2.py +40 -0
  208. iris/rpc/iris_logging_pb2.pyi +39 -0
  209. iris/rpc/job.proto +621 -0
  210. iris/rpc/job_pb2.py +177 -0
  211. iris/rpc/job_pb2.pyi +768 -0
  212. iris/rpc/logging_pb2.py +9 -0
  213. iris/rpc/proto_utils.py +130 -0
  214. iris/rpc/query.proto +36 -0
  215. iris/rpc/query_pb2.py +41 -0
  216. iris/rpc/query_pb2.pyi +29 -0
  217. iris/rpc/stats.proto +70 -0
  218. iris/rpc/stats.py +289 -0
  219. iris/rpc/stats_connect.py +123 -0
  220. iris/rpc/stats_pb2.py +46 -0
  221. iris/rpc/stats_pb2.pyi +72 -0
  222. iris/rpc/stats_service.py +29 -0
  223. iris/rpc/time.proto +47 -0
  224. iris/rpc/time_pb2.py +39 -0
  225. iris/rpc/time_pb2.pyi +17 -0
  226. iris/rpc/vm.proto +189 -0
  227. iris/rpc/vm_pb2.py +89 -0
  228. iris/rpc/vm_pb2.pyi +288 -0
  229. iris/rpc/worker.proto +124 -0
  230. iris/rpc/worker_connect.py +709 -0
  231. iris/rpc/worker_pb2.py +73 -0
  232. iris/rpc/worker_pb2.pyi +109 -0
  233. iris/runtime/__init__.py +2 -0
  234. iris/runtime/jax_init.py +170 -0
  235. iris/test_util.py +65 -0
  236. iris/time_proto.py +28 -0
  237. iris/version.py +47 -0
  238. marin_iris-0.99.dist-info/METADATA +30 -0
  239. marin_iris-0.99.dist-info/RECORD +241 -0
  240. marin_iris-0.99.dist-info/WHEEL +4 -0
  241. marin_iris-0.99.dist-info/entry_points.txt +3 -0
iris/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ # Copyright The Marin Authors
2
+ # SPDX-License-Identifier: Apache-2.0
iris/_build_info.py ADDED
@@ -0,0 +1,3 @@
1
+ # Auto-generated by scripts/python_libs_package.py during wheel builds.
2
+
3
+ BUILD_DATE = "2026-05-10"
iris/actor/__init__.py ADDED
@@ -0,0 +1,35 @@
1
+ # Copyright The Marin Authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Actor system for distributed RPC.
5
+
6
+ For ClusterResolver (namespace-aware controller-based resolution),
7
+ see iris.client.resolver.
8
+ """
9
+
10
+ from iris.actor.client import ActorClient
11
+ from iris.actor.pool import ActorPool, BroadcastFuture, CallResult
12
+ from iris.actor.resolver import (
13
+ ACTOR_ENDPOINT_HEADER,
14
+ FixedResolver,
15
+ ProxyResolver,
16
+ ResolvedEndpoint,
17
+ Resolver,
18
+ ResolveResult,
19
+ )
20
+ from iris.actor.server import ActorId, ActorServer
21
+
22
+ __all__ = [
23
+ "ACTOR_ENDPOINT_HEADER",
24
+ "ActorClient",
25
+ "ActorId",
26
+ "ActorPool",
27
+ "ActorServer",
28
+ "BroadcastFuture",
29
+ "CallResult",
30
+ "FixedResolver",
31
+ "ProxyResolver",
32
+ "ResolveResult",
33
+ "ResolvedEndpoint",
34
+ "Resolver",
35
+ ]
iris/actor/client.py ADDED
@@ -0,0 +1,223 @@
1
+ # Copyright The Marin Authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Actor client for making RPC calls to actor servers.
5
+
6
+ The ActorClient provides transparent actor discovery and invocation with
7
+ automatic retry logic. Both resolution failures (e.g., actor not yet
8
+ registered) and transient RPC errors are retried up to ``max_call_attempts``
9
+ with exponential backoff.
10
+
11
+ Example:
12
+ resolver = ClusterResolver("http://controller:8080")
13
+ client = ActorClient(resolver, "my-actor")
14
+ result = client.some_method(arg1, arg2) # Retries until actor found
15
+
16
+ Custom backoff behavior:
17
+ client = ActorClient(
18
+ resolver, "my-actor",
19
+ backoff=ExponentialBackoff(initial=0.2, maximum=5.0),
20
+ max_call_attempts=3,
21
+ )
22
+ """
23
+
24
+ import logging
25
+ import time
26
+ from typing import Any
27
+
28
+ import cloudpickle
29
+ from connectrpc.code import Code
30
+ from connectrpc.errors import ConnectError
31
+ from rigging.timing import ExponentialBackoff
32
+
33
+ from iris.actor.resolver import Resolver
34
+ from iris.rpc import actor_pb2
35
+ from iris.rpc.actor_connect import ActorServiceClientSync
36
+ from iris.rpc.errors import call_with_retry
37
+
38
+ logger = logging.getLogger(__name__)
39
+
40
+
41
def unwrap_actor_response(resp: actor_pb2.ActorResponse) -> Any:
    """Decode an ``ActorResponse`` into the remote call's return value.

    Args:
        resp: Response message returned by the actor service.

    Returns:
        The object unpickled from ``resp.serialized_value`` when the response
        carries no ``error`` field.

    Raises:
        Exception: Whatever object was pickled into
            ``resp.error.serialized_exception``, when that field is non-empty.
        RuntimeError: When an error is present without a serialized
            exception; the message is ``"<error_type>: <message>"``.
    """
    # NOTE(review): cloudpickle.loads executes arbitrary code while
    # unpickling; this presumably relies on the actor server being trusted.
    if not resp.HasField("error"):
        return cloudpickle.loads(resp.serialized_value)

    failure = resp.error
    pickled_exc = failure.serialized_exception
    if pickled_exc:
        raise cloudpickle.loads(pickled_exc)
    raise RuntimeError(f"{failure.error_type}: {failure.message}")
48
+
49
+
50
class ActorClient:
    """Actor client with resolver-based discovery.

    Attribute access for any name that is not defined on the class returns a
    callable that performs the RPC of that name on the remote actor, so
    ``client.some_method(x)`` invokes ``some_method`` remotely.

    By default the client waits forever, i.e. there's no timeout in httpx.
    Specify ``call_timeout`` to apply a timeout to individual RPC calls.
    """

    def __init__(
        self,
        resolver: Resolver,
        name: str,
        call_timeout: float | None = None,
        max_call_attempts: int = 10,
        backoff: ExponentialBackoff | None = None,
    ):
        """Initialize the actor client.

        Args:
            resolver: Resolver instance for endpoint discovery
            name: Name of the actor to invoke
            call_timeout: Timeout in seconds for individual RPC calls.
                None (default) means no timeout.
            max_call_attempts: Maximum number of RPC call attempts (including
                resolution failures) before giving up.
            backoff: Exponential backoff configuration for retries between
                attempts. ``None`` (default) creates a fresh
                ``ExponentialBackoff(initial=0.5, maximum=10.0, factor=2.0,
                jitter=0.25)`` for this client.
        """
        # A default built in the signature is evaluated once and would be
        # shared by every client in the process; build it per instance
        # instead, as get_operation() already does for its poll backoff.
        if backoff is None:
            backoff = ExponentialBackoff(initial=0.5, maximum=10.0, factor=2.0, jitter=0.25)

        self._resolver = resolver
        self._name = name
        self._call_timeout = call_timeout
        self._max_call_attempts = max_call_attempts
        self._backoff = backoff

        # Lazily populated by rpc_client(); reset by _clear_connection().
        self._rpc_client: ActorServiceClientSync | None = None
        self._rpc_headers: dict[str, str] = {}

    def rpc_client(self) -> ActorServiceClientSync:
        """Resolve actor name to an RPC client (single attempt).

        Resolution is attempted once. On failure (empty endpoints or RPC error),
        the exception propagates to the caller. The outer ``call_with_retry`` in
        ``_RpcMethod.__call__`` is responsible for retrying.

        Returns:
            ActorServiceClientSync connected to the resolved endpoint. The
            client is cached until ``_clear_connection`` is called.

        Raises:
            ConnectError(UNAVAILABLE): If no endpoints are found for the actor.
        """
        if self._rpc_client is not None:
            return self._rpc_client

        logger.info("Resolving name %s via %s", self._name, self._resolver)
        result = self._resolver.resolve(self._name)

        if result.is_empty:
            raise ConnectError(
                Code.UNAVAILABLE,
                f"No endpoints found for actor '{self._name}'",
            )

        logger.info(
            "Resolved actor '%s' to %d endpoint(s)",
            self._name,
            len(result.endpoints),
        )
        endpoint = result.first()
        logger.info("First endpoint: url=%s, actor_id=%s", endpoint.url, endpoint.actor_id)
        # Endpoint metadata is sent as request headers on every call.
        self._rpc_headers = dict(endpoint.metadata)
        self._rpc_client = ActorServiceClientSync(
            address=endpoint.url,
            timeout_ms=None if self._call_timeout is None else int(self._call_timeout * 1000),
            accept_compression=[],
        )
        return self._rpc_client

    def _clear_connection(self, _exc: Exception) -> None:
        """Drop the cached client and headers so the next call re-resolves."""
        self._rpc_client = None
        self._rpc_headers = {}

    def _retry(self, description: str, do_call: Any) -> Any:
        """Run ``do_call`` through ``call_with_retry`` with this client's settings."""
        return call_with_retry(
            description,
            do_call,
            on_retry=self._clear_connection,
            max_attempts=self._max_call_attempts,
            backoff=self._backoff,
        )

    def start_operation(self, method_name: str, *args: Any, **kwargs: Any) -> str:
        """Start a long-running operation. Returns the operation ID."""
        call = actor_pb2.ActorCall(
            method_name=method_name,
            actor_name=self._name,
            serialized_args=cloudpickle.dumps(args),
            serialized_kwargs=cloudpickle.dumps(kwargs),
        )

        def do_call():
            # rpc_client() must run first: it refreshes self._rpc_headers.
            client = self.rpc_client()
            return client.start_operation(call, headers=self._rpc_headers)

        op = self._retry(f"{self._name}.start_operation({method_name})", do_call)
        return op.operation_id

    def poll_operation_status(self, operation_id: str) -> actor_pb2.Operation:
        """Single-shot poll of a long-running operation's state."""
        req = actor_pb2.OperationId(operation_id=operation_id)

        def do_call():
            client = self.rpc_client()
            return client.get_operation(req, headers=self._rpc_headers)

        return self._retry(f"{self._name}.poll_operation_status({operation_id[:8]})", do_call)

    def get_operation(
        self,
        operation_id: str,
        poll_backoff: ExponentialBackoff | None = None,
    ) -> actor_pb2.Operation:
        """Poll a long-running operation until it completes, using exponential backoff.

        Args:
            operation_id: ID returned by ``start_operation``.
            poll_backoff: Backoff controlling the sleep between polls. ``None``
                creates ``ExponentialBackoff(initial=0.1, maximum=10.0,
                factor=2.0, jitter=0.25)``.

        Returns:
            The first polled ``Operation`` whose state is not ``RUNNING``.
        """
        if poll_backoff is None:
            poll_backoff = ExponentialBackoff(initial=0.1, maximum=10.0, factor=2.0, jitter=0.25)
        while True:
            op = self.poll_operation_status(operation_id)
            if op.state != actor_pb2.Operation.RUNNING:
                return op
            time.sleep(poll_backoff.next_interval())

    def cancel_operation(self, operation_id: str) -> actor_pb2.Operation:
        """Cancel a long-running operation."""
        req = actor_pb2.OperationId(operation_id=operation_id)

        def do_call():
            client = self.rpc_client()
            return client.cancel_operation(req, headers=self._rpc_headers)

        return self._retry(f"{self._name}.cancel_operation({operation_id[:8]})", do_call)

    def __getattr__(self, method_name: str) -> "_RpcMethod":
        """Return a callable that invokes ``method_name`` on the remote actor.

        Raises:
            AttributeError: For dunder names (``__x__``). Those are protocol
                probes from copy/pickle/inspection machinery, not remote
                methods, and must look absent rather than resolve to an RPC.
        """
        if method_name.startswith("__") and method_name.endswith("__"):
            raise AttributeError(f"{type(self).__name__!r} object has no attribute {method_name!r}")
        return _RpcMethod(self, method_name)
197
+
198
+
199
class _RpcMethod:
    """Callable bound to a single remote method name on an ``ActorClient``."""

    def __init__(self, client: ActorClient, method_name: str):
        self._client = client
        self._method_name = method_name

    def __call__(self, *args: Any, **kwargs: Any) -> Any:
        """Invoke the remote method and return its unwrapped result.

        Positional and keyword arguments are pickled with cloudpickle into an
        ``ActorCall``. The call runs through ``call_with_retry`` using the
        owning client's attempt limit and backoff; the client's cached
        connection is cleared on each retry.
        """
        owner = self._client
        request = actor_pb2.ActorCall(
            method_name=self._method_name,
            actor_name=owner._name,
            serialized_args=cloudpickle.dumps(args),
            serialized_kwargs=cloudpickle.dumps(kwargs),
        )

        def invoke_once():
            # Resolve first: rpc_client() refreshes owner._rpc_headers.
            stub = owner.rpc_client()
            reply = stub.call(request, headers=owner._rpc_headers)
            return unwrap_actor_response(reply)

        description = f"{owner._name}.{self._method_name}"
        return call_with_retry(
            description,
            invoke_once,
            on_retry=owner._clear_connection,
            max_attempts=owner._max_call_attempts,
            backoff=owner._backoff,
        )
iris/actor/pool.py ADDED
@@ -0,0 +1,281 @@
1
+ # Copyright The Marin Authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Actor pool for load-balanced and broadcast RPC calls."""
5
+
6
+ import logging
7
+ import threading
8
+ import time
9
+ from collections.abc import Callable, Iterator
10
+ from concurrent.futures import Future, ThreadPoolExecutor, as_completed
11
+ from dataclasses import dataclass
12
+ from typing import Any, Generic, TypeVar
13
+
14
+ import cloudpickle
15
+ from rigging.timing import ExponentialBackoff
16
+
17
+ from iris.actor.client import unwrap_actor_response
18
+ from iris.actor.resolver import ResolvedEndpoint, Resolver, ResolveResult
19
+ from iris.rpc import actor_pb2
20
+ from iris.rpc.actor_connect import ActorServiceClientSync
21
+ from iris.rpc.errors import call_with_retry
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+ T = TypeVar("T")
26
+
27
+
28
@dataclass
class CallResult:
    """Outcome of one endpoint's call within a broadcast.

    Exactly one of ``value`` / ``exception`` is populated by
    ``BroadcastFuture``; note a successful call may legitimately return
    ``None`` as its value.
    """

    # Endpoint the call was sent to.
    endpoint: ResolvedEndpoint
    # Return value of the call, when it succeeded.
    value: Any | None = None
    # Exception raised by the call, when it failed.
    exception: BaseException | None = None

    @property
    def success(self) -> bool:
        """Whether the call finished without a recorded exception."""
        failed = self.exception is not None
        return not failed
39
+
40
+
41
class BroadcastFuture(Generic[T]):
    """Future representing results from a broadcast call to multiple endpoints."""

    def __init__(self, futures: list[tuple[ResolvedEndpoint, Future]]):
        """Wrap ``(endpoint, future)`` pairs, one per endpoint called."""
        self._futures = futures

    @staticmethod
    def _collect(endpoint: ResolvedEndpoint, future: Future, timeout: float | None = None) -> CallResult:
        """Turn a future's outcome (value or raised exception) into a ``CallResult``."""
        try:
            return CallResult(endpoint=endpoint, value=future.result(timeout=timeout))
        except Exception as e:
            return CallResult(endpoint=endpoint, exception=e)

    def wait_all(self, timeout: float | None = None) -> list[CallResult]:
        """Wait for every call and return one result per endpoint, in submission order.

        Args:
            timeout: Total number of seconds to wait across all futures, or
                ``None`` to wait indefinitely. A future that has not finished
                when the budget runs out is reported as a failed
                ``CallResult`` carrying the timeout exception.

        Returns:
            A list of ``CallResult``; failures are captured, never raised.
        """
        # One shared deadline: applying ``timeout`` to each future in turn
        # would let the total wait grow to len(futures) * timeout.
        deadline = None if timeout is None else time.monotonic() + timeout
        results = []
        for endpoint, future in self._futures:
            remaining = None if deadline is None else max(0.0, deadline - time.monotonic())
            results.append(self._collect(endpoint, future, remaining))
        return results

    def wait_any(self, timeout: float | None = None) -> CallResult:
        """Return the result of whichever call completes first.

        Args:
            timeout: Seconds to wait for the first completion, or ``None``.

        Raises:
            TimeoutError: If there are no futures to wait on; a timeout
                raised by ``concurrent.futures.as_completed`` also propagates.
        """
        endpoint_map = {id(f): ep for ep, f in self._futures}
        for future in as_completed([f for _, f in self._futures], timeout=timeout):
            return self._collect(endpoint_map[id(future)], future)
        raise TimeoutError("No results within timeout")

    def as_completed(self, timeout: float | None = None) -> Iterator[CallResult]:
        """Iterate over results as they complete."""
        # Futures are kept alive by self._futures, so id() is a stable key.
        endpoint_map = {id(f): ep for ep, f in self._futures}
        for future in as_completed([f for _, f in self._futures], timeout=timeout):
            yield self._collect(endpoint_map[id(future)], future)
78
+
79
+
80
class ActorPool(Generic[T]):
    """Pool of actors for load-balanced and broadcast calls.

    Resolves a pool of endpoints for an actor name and provides methods to
    distribute calls across them (round-robin) or broadcast to all endpoints.

    Example:
        >>> pool = ActorPool(resolver, "inference")
        >>> result = pool.call().predict(data)  # Round-robin to one endpoint
        >>> broadcast = pool.broadcast().reload_model()  # Send to all endpoints
        >>> results = broadcast.wait_all()
    """

    def __init__(
        self,
        resolver: Resolver,
        name: str,
        timeout: float = 30.0,
        max_call_attempts: int = 5,
        backoff: ExponentialBackoff | None = None,
        resolve_ttl: float = 5.0,
    ):
        """Initialize actor pool.

        Args:
            resolver: Resolver to discover endpoints
            name: Actor name to resolve
            timeout: RPC timeout in seconds
            max_call_attempts: Maximum number of RPC call attempts before giving up.
            backoff: Exponential backoff configuration for call retries.
                ``None`` (default) creates a fresh
                ``ExponentialBackoff(initial=0.1, maximum=10.0, factor=2.0,
                jitter=0.25)`` for this pool.
            resolve_ttl: Seconds to cache resolve results before re-querying the resolver
        """
        # Build the default per pool; a default constructed in the signature
        # is evaluated once and shared by every pool in the process.
        if backoff is None:
            backoff = ExponentialBackoff(initial=0.1, maximum=10.0, factor=2.0, jitter=0.25)

        self._resolver = resolver
        self._name = name
        self._timeout = timeout
        self._max_call_attempts = max_call_attempts
        self._backoff = backoff
        self._resolve_ttl = resolve_ttl
        self._endpoint_index = 0
        self._cached_result: ResolveResult | None = None
        self._last_resolve_time: float = 0.0
        # Guards the round-robin index, the resolve cache and the client cache.
        self._lock = threading.Lock()
        self._executor = ThreadPoolExecutor(max_workers=32)
        # RPC clients keyed by endpoint URL.
        self._clients: dict[str, ActorServiceClientSync] = {}

    def _get_client(self, endpoint: ResolvedEndpoint) -> ActorServiceClientSync:
        """Return a cached client for the endpoint, creating one if needed."""
        url = endpoint.url
        with self._lock:
            client = self._clients.get(url)
            if client is not None:
                return client
            client = ActorServiceClientSync(
                address=url,
                timeout_ms=int(self._timeout * 1000),
                accept_compression=[],
            )
            self._clients[url] = client
            return client

    def _resolve(self) -> ResolveResult:
        """Return the endpoints for this pool's actor, using the TTL cache.

        The resolver is queried outside the lock. Only non-empty results are
        cached, so an empty resolution is retried on the next call.
        """
        now = time.monotonic()
        with self._lock:
            if self._cached_result is not None and (now - self._last_resolve_time) < self._resolve_ttl:
                return self._cached_result

        result = self._resolver.resolve(self._name)
        if result.endpoints:
            with self._lock:
                self._cached_result = result
                self._last_resolve_time = time.monotonic()
        return result

    def _invalidate_resolve_cache(self) -> None:
        """Force the next _resolve() call to re-query the resolver."""
        with self._lock:
            self._last_resolve_time = 0.0
            self._cached_result = None

    def _evict_client(self, url: str) -> None:
        """Remove and close a cached client so it is recreated on next use."""
        with self._lock:
            client = self._clients.pop(url, None)
        if client is not None:
            try:
                client.close()
            except Exception:
                # Best-effort close: the client is already out of the cache.
                logger.debug("Error closing evicted client for %s", url, exc_info=True)

    def _get_next_endpoint(self) -> ResolvedEndpoint:
        """Get the next endpoint in round-robin order.

        Thread-safe: uses a lock to protect the endpoint index.

        Raises:
            RuntimeError: If resolution returned no endpoints.
        """
        endpoints = self._resolve().endpoints
        with self._lock:
            if not endpoints:
                raise RuntimeError(f"No endpoints for '{self._name}'")
            endpoint = endpoints[self._endpoint_index % len(endpoints)]
            self._endpoint_index += 1
            return endpoint

    def shutdown(self) -> None:
        """Wait for submitted broadcast calls to finish, then close all cached clients."""
        self._executor.shutdown(wait=True)
        with self._lock:
            clients = list(self._clients.values())
            self._clients.clear()
        for client in clients:
            try:
                client.close()
            except Exception:
                logger.debug("Error closing client during shutdown", exc_info=True)

    def __enter__(self) -> "ActorPool[T]":
        return self

    def __exit__(self, *args) -> None:
        self.shutdown()

    @property
    def size(self) -> int:
        """Number of currently resolved endpoints."""
        return len(self._resolve().endpoints)

    @property
    def endpoints(self) -> list[ResolvedEndpoint]:
        """Copy of the currently resolved endpoint list."""
        return list(self._resolve().endpoints)

    def _call_endpoint(
        self,
        endpoint: ResolvedEndpoint,
        method_name: str,
        args: tuple,
        kwargs: dict,
    ) -> Any:
        """Perform a single RPC against ``endpoint`` and unwrap the response.

        Args:
            endpoint: Target endpoint; its metadata is sent as request headers.
            method_name: Name of the remote method to invoke.
            args: Positional arguments, pickled with cloudpickle.
            kwargs: Keyword arguments, pickled with cloudpickle.

        Returns:
            The value produced by ``unwrap_actor_response``.
        """
        client = self._get_client(endpoint)

        call = actor_pb2.ActorCall(
            method_name=method_name,
            actor_name=self._name,
            serialized_args=cloudpickle.dumps(args),
            serialized_kwargs=cloudpickle.dumps(kwargs),
        )

        # Forward endpoint metadata as headers, as ActorClient does. Without
        # it a ProxyResolver endpoint never carries its routing header.
        resp = client.call(call, headers=dict(endpoint.metadata))
        return unwrap_actor_response(resp)

    def call(self) -> "_PoolCallProxy[T]":
        """Return a proxy whose method calls go to one endpoint, round-robin."""
        return _PoolCallProxy(self)

    def broadcast(self) -> "_PoolBroadcastProxy[T]":
        """Return a proxy whose method calls are sent to every resolved endpoint."""
        return _PoolBroadcastProxy(self)
231
+
232
+
233
class _PoolCallProxy(Generic[T]):
    """Attribute proxy that sends each method call to one pool endpoint."""

    def __init__(self, pool: ActorPool[T]):
        self._pool = pool

    def __getattr__(self, method_name: str) -> Callable[..., Any]:
        """Build a callable that invokes ``method_name`` with retries.

        Every attempt picks the next endpoint in round-robin order. Before a
        retry the pool's resolve cache is invalidated and the client for the
        endpoint that was just tried is evicted.
        """

        def call(*args, **kwargs):
            # URL used by the most recent attempt; None until one has started.
            attempted_url: str | None = None

            def do_call():
                nonlocal attempted_url
                target = self._pool._get_next_endpoint()
                attempted_url = target.url
                return self._pool._call_endpoint(target, method_name, args, kwargs)

            def on_retry(_exc):
                self._pool._invalidate_resolve_cache()
                if attempted_url is None:
                    return
                self._pool._evict_client(attempted_url)

            description = f"{self._pool._name}.{method_name}"
            return call_with_retry(
                description,
                do_call,
                on_retry=on_retry,
                max_attempts=self._pool._max_call_attempts,
                backoff=self._pool._backoff,
            )

        return call
260
+
261
+
262
class _PoolBroadcastProxy(Generic[T]):
    """Attribute proxy that fans each method call out to all pool endpoints."""

    def __init__(self, pool: ActorPool[T]):
        self._pool = pool

    def __getattr__(self, method_name: str) -> Callable[..., BroadcastFuture]:
        """Build a callable that submits ``method_name`` to every endpoint.

        Calls are submitted to the pool's executor in endpoint order and are
        not wrapped in ``call_with_retry``; failures surface through the
        returned ``BroadcastFuture``.
        """

        def broadcast(*args, **kwargs) -> BroadcastFuture:
            targets = self._pool._resolve().endpoints
            pending = [
                (
                    target,
                    self._pool._executor.submit(
                        self._pool._call_endpoint,
                        target,
                        method_name,
                        args,
                        kwargs,
                    ),
                )
                for target in targets
            ]
            return BroadcastFuture(pending)

        return broadcast
iris/actor/resolver.py ADDED
@@ -0,0 +1,108 @@
1
+ # Copyright The Marin Authors
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Core types and resolver implementations for the actor system."""
5
+
6
+ from dataclasses import dataclass, field
7
+ from typing import Protocol
8
+
9
+ # Header used by ActorProxy to route requests to the correct actor endpoint.
10
+ # Shared constant between ProxyResolver (client-side) and ActorProxy (server-side).
11
+ ACTOR_ENDPOINT_HEADER = "x-iris-actor-endpoint"
12
+
13
+
14
@dataclass
class ResolvedEndpoint:
    """One concrete address at which an actor can be reached."""

    # Address RPC clients connect to.
    url: str
    # Identifier of the actor instance behind this endpoint.
    actor_id: str
    # Extra key/value pairs attached to the endpoint; a fresh dict per instance.
    metadata: dict[str, str] = field(default_factory=dict)
21
+
22
+
23
@dataclass
class ResolveResult:
    """Endpoints that an actor name resolved to (possibly none)."""

    # Actor name that was resolved.
    name: str
    # Resolved endpoints, in the order the resolver produced them.
    endpoints: list[ResolvedEndpoint] = field(default_factory=list)

    @property
    def is_empty(self) -> bool:
        """Whether resolution produced zero endpoints."""
        count = len(self.endpoints)
        return count == 0

    def first(self) -> ResolvedEndpoint:
        """Return the endpoint at index 0.

        Returns:
            The first resolved endpoint

        Raises:
            ValueError: If no endpoints are available
        """
        if self.endpoints:
            return self.endpoints[0]
        raise ValueError(f"No endpoints for '{self.name}'")
46
+
47
+
48
class Resolver(Protocol):
    """Structural interface for anything that maps an actor name to endpoints.

    Implementations:
    - FixedResolver: Static endpoint mapping
    - ProxyResolver: Routes every call through the controller's actor proxy
    - ClusterResolver: Resolves via cluster controller (lives in iris.client)
    """

    def resolve(self, name: str) -> ResolveResult:
        """Return the endpoints currently known for ``name``."""
        ...
57
+
58
+
59
class FixedResolver:
    """Resolver backed by a fixed, in-memory name-to-URL table.

    Used for testing or when endpoints are known ahead of time.
    Does not use namespace prefixing since endpoints are static.
    """

    def __init__(self, endpoints: dict[str, str | list[str]]):
        """Store the mapping, normalising every value to a list of URLs.

        A bare string becomes a one-element list; any other value is copied
        into a new list.
        """
        self._endpoints: dict[str, list[str]] = {
            actor: [target] if isinstance(target, str) else list(target) for actor, target in endpoints.items()
        }

    def resolve(self, name: str) -> ResolveResult:
        """Return one endpoint per configured URL; empty for unknown names.

        Each endpoint gets the synthetic actor id ``fixed-<name>-<index>``.
        """
        resolved = []
        for position, address in enumerate(self._endpoints.get(name, [])):
            resolved.append(ResolvedEndpoint(url=address, actor_id=f"fixed-{name}-{position}"))
        return ResolveResult(name=name, endpoints=resolved)
79
+
80
+
81
class ProxyResolver:
    """Resolver that sends all actor traffic through the controller's actor proxy.

    Rather than the actor's own address, every name resolves to the controller
    URL. The actor name travels in the endpoint metadata under
    ``ACTOR_ENDPOINT_HEADER`` (``X-Iris-Actor-Endpoint``), which the proxy
    uses to find the real endpoint.

    The caller passes the full actor name as registered in the endpoint registry
    (e.g. ``/user/job/coordinator/actor-0``).

    Args:
        controller_url: Controller URL (e.g., ``http://localhost:8080``)
    """

    def __init__(self, controller_url: str):
        # Trailing slashes are stripped from the stored controller URL.
        trimmed = controller_url.rstrip("/")
        self._controller_url = trimmed

    def resolve(self, name: str) -> ResolveResult:
        """Return a single endpoint at the controller URL tagged with ``name``."""
        via_proxy = ResolvedEndpoint(
            url=self._controller_url,
            actor_id=f"proxy-{name}",
            metadata={ACTOR_ENDPOINT_HEADER: name},
        )
        return ResolveResult(name=name, endpoints=[via_proxy])