indexify 0.3.30__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff shows the changes between two package versions that have been publicly released to a supported registry, as they appear in that registry. It is provided for informational purposes only.
Files changed (74)
  1. indexify/cli/__init__.py +18 -0
  2. indexify/cli/build_image.py +51 -0
  3. indexify/cli/deploy.py +57 -0
  4. indexify/cli/executor.py +205 -0
  5. indexify/executor/{grpc/channel_manager.py → channel_manager.py} +17 -11
  6. indexify/executor/executor.py +57 -311
  7. indexify/executor/function_allowlist.py +59 -0
  8. indexify/executor/function_executor/function_executor.py +12 -6
  9. indexify/executor/function_executor/invocation_state_client.py +25 -3
  10. indexify/executor/function_executor/server/function_executor_server_factory.py +3 -3
  11. indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +22 -11
  12. indexify/executor/function_executor_controller/__init__.py +13 -0
  13. indexify/executor/function_executor_controller/completed_task_metrics.py +82 -0
  14. indexify/executor/function_executor_controller/create_function_executor.py +154 -0
  15. indexify/executor/function_executor_controller/debug_event_loop.py +37 -0
  16. indexify/executor/function_executor_controller/destroy_function_executor.py +28 -0
  17. indexify/executor/function_executor_controller/downloads.py +199 -0
  18. indexify/executor/function_executor_controller/events.py +172 -0
  19. indexify/executor/function_executor_controller/function_executor_controller.py +759 -0
  20. indexify/executor/function_executor_controller/loggers.py +57 -0
  21. indexify/executor/function_executor_controller/message_validators.py +65 -0
  22. indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +68 -0
  23. indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} +1 -3
  24. indexify/executor/function_executor_controller/metrics/function_executor_controller.py +60 -0
  25. indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} +9 -3
  26. indexify/executor/function_executor_controller/metrics/upload_task_output.py +39 -0
  27. indexify/executor/function_executor_controller/prepare_task.py +38 -0
  28. indexify/executor/function_executor_controller/run_task.py +201 -0
  29. indexify/executor/function_executor_controller/task_info.py +33 -0
  30. indexify/executor/function_executor_controller/task_output.py +122 -0
  31. indexify/executor/function_executor_controller/upload_task_output.py +234 -0
  32. indexify/executor/host_resources/host_resources.py +20 -25
  33. indexify/executor/{grpc/metrics → metrics}/channel_manager.py +1 -1
  34. indexify/executor/metrics/executor.py +0 -47
  35. indexify/executor/{grpc/metrics → metrics}/state_reconciler.py +1 -1
  36. indexify/executor/{grpc/metrics → metrics}/state_reporter.py +1 -1
  37. indexify/executor/monitoring/health_checker/generic_health_checker.py +6 -59
  38. indexify/executor/monitoring/health_checker/health_checker.py +0 -11
  39. indexify/executor/{grpc/state_reconciler.py → state_reconciler.py} +139 -141
  40. indexify/executor/state_reporter.py +364 -0
  41. indexify/proto/executor_api.proto +67 -59
  42. indexify/proto/executor_api_pb2.py +52 -52
  43. indexify/proto/executor_api_pb2.pyi +125 -104
  44. indexify/proto/executor_api_pb2_grpc.py +0 -47
  45. {indexify-0.3.30.dist-info → indexify-0.4.2.dist-info}/METADATA +1 -3
  46. indexify-0.4.2.dist-info/RECORD +68 -0
  47. indexify-0.4.2.dist-info/entry_points.txt +3 -0
  48. indexify/cli/cli.py +0 -267
  49. indexify/executor/api_objects.py +0 -92
  50. indexify/executor/downloader.py +0 -417
  51. indexify/executor/executor_flavor.py +0 -7
  52. indexify/executor/function_executor/function_executor_state.py +0 -107
  53. indexify/executor/function_executor/function_executor_states_container.py +0 -93
  54. indexify/executor/function_executor/function_executor_status.py +0 -95
  55. indexify/executor/function_executor/metrics/function_executor_state.py +0 -46
  56. indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -10
  57. indexify/executor/function_executor/single_task_runner.py +0 -345
  58. indexify/executor/function_executor/task_input.py +0 -21
  59. indexify/executor/function_executor/task_output.py +0 -105
  60. indexify/executor/grpc/function_executor_controller.py +0 -418
  61. indexify/executor/grpc/metrics/task_controller.py +0 -8
  62. indexify/executor/grpc/state_reporter.py +0 -314
  63. indexify/executor/grpc/task_controller.py +0 -508
  64. indexify/executor/metrics/task_fetcher.py +0 -21
  65. indexify/executor/metrics/task_reporter.py +0 -53
  66. indexify/executor/metrics/task_runner.py +0 -52
  67. indexify/executor/monitoring/function_allowlist.py +0 -25
  68. indexify/executor/runtime_probes.py +0 -68
  69. indexify/executor/task_fetcher.py +0 -96
  70. indexify/executor/task_reporter.py +0 -459
  71. indexify/executor/task_runner.py +0 -177
  72. indexify-0.3.30.dist-info/RECORD +0 -68
  73. indexify-0.3.30.dist-info/entry_points.txt +0 -3
  74. {indexify-0.3.30.dist-info → indexify-0.4.2.dist-info}/WHEEL +0 -0
indexify/executor/function_executor/function_executor_status.py (deleted)
@@ -1,95 +0,0 @@
- from enum import Enum
-
-
- class FunctionExecutorStatus(Enum):
-     """Status of a Function Executor.
-
-     Each status lists transitions allowed to it.
-     """
-
-     # DESTROYED -> STARTING_UP
-     STARTING_UP = "Starting Up"
-     # STARTING_UP -> STARTUP_FAILED_CUSTOMER_ERROR
-     STARTUP_FAILED_CUSTOMER_ERROR = "Startup Failed (Customer Error)"
-     # STARTING_UP -> STARTUP_FAILED_PLATFORM_ERROR
-     STARTUP_FAILED_PLATFORM_ERROR = "Startup Failed (Platform Error)"
-     # STARTING_UP -> IDLE
-     # RUNNING_TASK -> IDLE
-     IDLE = "Idle"
-     # IDLE -> RUNNING_TASK
-     RUNNING_TASK = "Running Task"
-     # IDLE -> UNHEALTHY
-     # RUNNING_TASK -> UNHEALTHY
-     UNHEALTHY = "Unhealthy"
-     # STARTUP_FAILED_CUSTOMER_ERROR -> DESTROYING
-     # STARTUP_FAILED_PLATFORM_ERROR -> DESTROYING
-     # RUNNING_TASK -> DESTROYING
-     # UNHEALTHY -> DESTROYING
-     # IDLE -> DESTROYING
-     DESTROYING = "Destroying"
-     # DESTROYED (initial status)
-     # DESTROYING -> DESTROYED
-     DESTROYED = "Destroyed"
-     # Any state -> SHUTDOWN
-     SHUTDOWN = "Shutdown" # Permanent stop state
-
-
- # TODO: After removing HTTP code simplify state transitions by not allowing to
- # startup an FE after it was destroyed. grpc protocol treats FEs as ephimeral and never revives them.
- def is_status_change_allowed(
-     current_status: FunctionExecutorStatus, new_status: FunctionExecutorStatus
- ) -> bool:
-     """Returns True if the transition is allowed."""
-     allowed_transitions = {
-         FunctionExecutorStatus.DESTROYED: [
-             FunctionExecutorStatus.DESTROYED,
-             FunctionExecutorStatus.STARTING_UP,
-             FunctionExecutorStatus.SHUTDOWN,
-         ],
-         FunctionExecutorStatus.STARTING_UP: [
-             FunctionExecutorStatus.STARTING_UP,
-             FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR,
-             FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR,
-             FunctionExecutorStatus.IDLE,
-             FunctionExecutorStatus.SHUTDOWN,
-         ],
-         FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR: [
-             FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR,
-             FunctionExecutorStatus.DESTROYING,
-             FunctionExecutorStatus.SHUTDOWN,
-         ],
-         FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR: [
-             FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR,
-             FunctionExecutorStatus.DESTROYING,
-             FunctionExecutorStatus.SHUTDOWN,
-         ],
-         FunctionExecutorStatus.IDLE: [
-             FunctionExecutorStatus.IDLE,
-             FunctionExecutorStatus.RUNNING_TASK,
-             FunctionExecutorStatus.UNHEALTHY,
-             FunctionExecutorStatus.DESTROYING,
-             FunctionExecutorStatus.SHUTDOWN,
-         ],
-         FunctionExecutorStatus.RUNNING_TASK: [
-             FunctionExecutorStatus.RUNNING_TASK,
-             FunctionExecutorStatus.DESTROYING,
-             FunctionExecutorStatus.IDLE,
-             FunctionExecutorStatus.UNHEALTHY,
-             FunctionExecutorStatus.SHUTDOWN,
-         ],
-         FunctionExecutorStatus.UNHEALTHY: [
-             FunctionExecutorStatus.UNHEALTHY,
-             FunctionExecutorStatus.DESTROYING,
-             FunctionExecutorStatus.SHUTDOWN,
-         ],
-         FunctionExecutorStatus.DESTROYING: [
-             FunctionExecutorStatus.DESTROYING,
-             FunctionExecutorStatus.DESTROYED,
-             FunctionExecutorStatus.SHUTDOWN,
-         ],
-         FunctionExecutorStatus.SHUTDOWN: [
-             FunctionExecutorStatus.SHUTDOWN
-         ], # No transitions allowed from SHUTDOWN
-     }
-
-     return new_status in allowed_transitions.get(current_status, [])
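The deleted module above is the 0.3.x Function Executor state machine: the comments on each enum member list the transitions into it, and `is_status_change_allowed` enforces them. A minimal sketch of how a caller would use the helper against a 0.3.x install (the guard below is illustrative; the real call sites lived in files such as function_executor_state.py, which this diff also removes):

```python
# Only meaningful against indexify 0.3.x; the module no longer exists in 0.4.2.
from indexify.executor.function_executor.function_executor_status import (
    FunctionExecutorStatus,
    is_status_change_allowed,
)

# Hypothetical guard around a status update, showing the removed transition
# table: DESTROYED -> STARTING_UP is legal, IDLE -> DESTROYED is not.
current = FunctionExecutorStatus.DESTROYED
desired = FunctionExecutorStatus.STARTING_UP

if is_status_change_allowed(current, desired):
    current = desired  # e.g. what FunctionExecutorState.set_status() did in 0.3.x
else:
    raise RuntimeError(f"Illegal transition {current} -> {desired}")

assert not is_status_change_allowed(
    FunctionExecutorStatus.IDLE, FunctionExecutorStatus.DESTROYED
)
```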
indexify/executor/function_executor/metrics/function_executor_state.py (deleted)
@@ -1,46 +0,0 @@
- import prometheus_client
-
- from ..function_executor_status import FunctionExecutorStatus
-
- # This file contains all metrics used by FunctionExecutorState.
-
- metric_function_executor_state_not_locked_errors: prometheus_client.Counter = (
-     prometheus_client.Counter(
-         "function_executor_state_not_locked_errors",
-         "Number of times a Function Executor state was used without acquiring its lock",
-     )
- )
-
- # Function Executors count with a particular status.
- metric_function_executors_with_status: prometheus_client.Gauge = (
-     prometheus_client.Gauge(
-         "function_executors_with_status",
-         "Number of Function Executors with a particular status",
-         ["status"],
-     )
- )
- metric_function_executors_with_status.labels(
-     status=FunctionExecutorStatus.STARTING_UP.name
- )
- metric_function_executors_with_status.labels(
-     status=FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR.name
- )
- metric_function_executors_with_status.labels(
-     status=FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR.name
- )
- metric_function_executors_with_status.labels(status=FunctionExecutorStatus.IDLE.name)
- metric_function_executors_with_status.labels(
-     status=FunctionExecutorStatus.RUNNING_TASK.name
- )
- metric_function_executors_with_status.labels(
-     status=FunctionExecutorStatus.UNHEALTHY.name
- )
- metric_function_executors_with_status.labels(
-     status=FunctionExecutorStatus.DESTROYING.name
- )
- metric_function_executors_with_status.labels(
-     status=FunctionExecutorStatus.DESTROYED.name
- )
- metric_function_executors_with_status.labels(
-     status=FunctionExecutorStatus.SHUTDOWN.name
- )
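The deleted metrics module pre-registers every `status` label on the gauge so each time series is exported at 0 before any Function Executor actually reaches that status. A self-contained sketch of the same prometheus_client pattern (the `demo_*` metric and label values are made up for illustration and are not part of the package):

```python
import prometheus_client

# Pre-registering label values makes each labeled series appear at 0 right away,
# so dashboards and alerts see no gaps before the first increment.
demo_gauge = prometheus_client.Gauge(
    "demo_workers_with_status",
    "Number of demo workers in a particular status",
    ["status"],
)
for status in ("idle", "running", "unhealthy"):
    demo_gauge.labels(status=status)  # creates the series at value 0

demo_gauge.labels(status="idle").inc()     # a worker starts out idle
demo_gauge.labels(status="idle").dec()     # ...then leaves the idle status
demo_gauge.labels(status="running").inc()  # ...and starts running a task
```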
indexify/executor/function_executor/metrics/function_executor_state_container.py (deleted)
@@ -1,10 +0,0 @@
- import prometheus_client
-
- # This file contains all metrics used by FunctionExecutorStatesContainer.
-
- metric_function_executor_states_count: prometheus_client.Gauge = (
-     prometheus_client.Gauge(
-         "function_executor_states_count",
-         "Number of existing Function Executor states",
-     )
- )
indexify/executor/function_executor/single_task_runner.py (deleted)
@@ -1,345 +0,0 @@
- from collections.abc import Awaitable, Callable
- from math import ceil
- from typing import Any, Optional
-
- import grpc
- from tensorlake.function_executor.proto.function_executor_pb2 import (
-     InitializeRequest,
-     RunTaskRequest,
-     RunTaskResponse,
- )
- from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
-     FunctionExecutorStub,
- )
- from tensorlake.function_executor.proto.message_validator import MessageValidator
-
- from ..api_objects import Task
- from .function_executor import CustomerError, FunctionExecutor
- from .function_executor_state import FunctionExecutorState
- from .function_executor_status import FunctionExecutorStatus
- from .health_checker import HealthChecker, HealthCheckResult
- from .metrics.single_task_runner import (
-     metric_function_executor_run_task_rpc_errors,
-     metric_function_executor_run_task_rpc_latency,
-     metric_function_executor_run_task_rpcs,
- )
- from .server.function_executor_server_factory import (
-     FunctionExecutorServerConfiguration,
-     FunctionExecutorServerFactory,
- )
- from .task_input import TaskInput
- from .task_output import TaskMetrics, TaskOutput
-
-
- class SingleTaskRunner:
-     def __init__(
-         self,
-         executor_id: str,
-         function_executor_state: FunctionExecutorState,
-         task_input: TaskInput,
-         function_executor_server_factory: FunctionExecutorServerFactory,
-         base_url: str,
-         config_path: Optional[str],
-         logger: Any,
-     ):
-         self._executor_id: str = executor_id
-         self._function_executor_state: FunctionExecutorState = function_executor_state
-         self._task_input: TaskInput = task_input
-         self._function_executor_server_factory: FunctionExecutorServerFactory = (
-             function_executor_server_factory
-         )
-         self._base_url: str = base_url
-         self._config_path: Optional[str] = config_path
-         self._logger = logger.bind(module=__name__)
-
-     async def run(self) -> TaskOutput:
-         """Runs the task in the Function Executor.
-
-         The FunctionExecutorState must be locked by the caller.
-         The lock is released during actual task run in the server.
-         The lock is relocked on return.
-
-         Raises an exception if an error occured.
-
-         On enter the Function Executor status is either IDLE, UNHEALTHY or DESTROYED.
-         On return the Function Executor status is either IDLE, UNHEALTHY or DESTROYED.
-         """
-         self._function_executor_state.check_locked()
-
-         if self._function_executor_state.status not in [
-             FunctionExecutorStatus.IDLE,
-             FunctionExecutorStatus.UNHEALTHY,
-             FunctionExecutorStatus.DESTROYED,
-         ]:
-             self._logger.error(
-                 "Function Executor is not in oneof [IDLE, UNHEALTHY, DESTROYED] state, cannot run the task",
-                 status=self._function_executor_state.status,
-             )
-             raise RuntimeError(
-                 f"Unexpected Function Executor state {self._function_executor_state.status}"
-             )
-
-         # If Function Executor became unhealthy while was idle then destroy it.
-         # It'll be recreated below.
-         await self._destroy_existing_function_executor_if_unhealthy()
-
-         # Create Function Executor if it doesn't exist yet.
-         if self._function_executor_state.status == FunctionExecutorStatus.DESTROYED:
-             try:
-                 await self._create_function_executor()
-             except CustomerError as e:
-                 return TaskOutput(
-                     task_id=self._task_input.task.id,
-                     namespace=self._task_input.task.namespace,
-                     graph_name=self._task_input.task.compute_graph,
-                     function_name=self._task_input.task.compute_fn,
-                     graph_version=self._task_input.task.graph_version,
-                     graph_invocation_id=self._task_input.task.invocation_id,
-                     stderr=str(e),
-                     success=False,
-                     output_payload_uri_prefix=self._task_input.task.output_payload_uri_prefix,
-                 )
-
-         try:
-             return await self._run()
-         finally:
-             # If Function Executor became unhealthy while running the task then destroy it.
-             # The periodic health checker might not notice this as it does only periodic checks.
-             await self._destroy_existing_function_executor_if_unhealthy()
-
-             if self._function_executor_state.status not in [
-                 FunctionExecutorStatus.IDLE,
-                 FunctionExecutorStatus.UNHEALTHY,
-                 FunctionExecutorStatus.DESTROYED,
-             ]:
-                 self._logger.error(
-                     "Function Executor status is not oneof [IDLE, UNHEALTHY, DESTROYED] after running the task, resetting the state to mitigate a possible bug",
-                     status=self._function_executor_state.status,
-                 )
-                 if self._function_executor_state.function_executor is None:
-                     await self._function_executor_state.set_status(
-                         FunctionExecutorStatus.DESTROYED
-                     )
-                 else:
-                     await self._function_executor_state.set_status(
-                         FunctionExecutorStatus.UNHEALTHY
-                     )
-
-     async def _create_function_executor(self) -> None:
-         await self._function_executor_state.set_status(
-             FunctionExecutorStatus.STARTING_UP
-         )
-         self._function_executor_state.function_executor = FunctionExecutor(
-             server_factory=self._function_executor_server_factory, logger=self._logger
-         )
-         task: Task = self._task_input.task
-         config: FunctionExecutorServerConfiguration = (
-             FunctionExecutorServerConfiguration(
-                 executor_id=self._executor_id,
-                 function_executor_id=self._function_executor_state.id,
-                 namespace=task.namespace,
-                 graph_name=task.compute_graph,
-                 graph_version=task.graph_version,
-                 function_name=task.compute_fn,
-                 image_uri=task.image_uri,
-                 secret_names=task.secret_names or [],
-                 cpu_ms_per_sec=(
-                     None
-                     if task.resources.cpus is None
-                     else ceil(task.resources.cpus * 1000)
-                 ),
-                 memory_bytes=(
-                     None
-                     if task.resources.memory_mb is None
-                     else task.resources.memory_mb * 1024 * 1024
-                 ),
-                 disk_bytes=(
-                     None
-                     if task.resources.ephemeral_disk_mb is None
-                     else task.resources.ephemeral_disk_mb * 1024 * 1024
-                 ),
-                 gpu_count=0 if task.resources.gpu is None else task.resources.gpu.count,
-             )
-         )
-         initialize_request: InitializeRequest = InitializeRequest(
-             namespace=self._task_input.task.namespace,
-             graph_name=self._task_input.task.compute_graph,
-             graph_version=self._task_input.task.graph_version,
-             function_name=self._task_input.task.compute_fn,
-             graph=self._task_input.graph,
-         )
-
-         try:
-             await self._function_executor_state.function_executor.initialize(
-                 config=config,
-                 initialize_request=initialize_request,
-                 base_url=self._base_url,
-                 config_path=self._config_path,
-             )
-         except CustomerError:
-             # We have to follow the valid state transition sequence.
-             await self._function_executor_state.set_status(
-                 FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR
-             )
-             await self._function_executor_state.destroy_function_executor()
-             raise
-         except Exception:
-             # We have to follow the valid state transition sequence.
-             await self._function_executor_state.set_status(
-                 FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR
-             )
-             await self._function_executor_state.destroy_function_executor()
-             raise
-
-         await self._function_executor_state.set_status(FunctionExecutorStatus.IDLE)
-
-     async def _run(self) -> TaskOutput:
-         request: RunTaskRequest = RunTaskRequest(
-             namespace=self._task_input.task.namespace,
-             graph_name=self._task_input.task.compute_graph,
-             graph_version=self._task_input.task.graph_version,
-             function_name=self._task_input.task.compute_fn,
-             graph_invocation_id=self._task_input.task.invocation_id,
-             task_id=self._task_input.task.id,
-             function_input=self._task_input.input,
-         )
-         if self._task_input.init_value is not None:
-             request.function_init_value.CopyFrom(self._task_input.init_value)
-         channel: grpc.aio.Channel = (
-             self._function_executor_state.function_executor.channel()
-         )
-
-         async with _RunningTaskContextManager(
-             invocation_id=self._task_input.task.invocation_id,
-             task_id=self._task_input.task.id,
-             health_check_failed_callback=self._health_check_failed_callback,
-             function_executor_state=self._function_executor_state,
-         ):
-             with (
-                 metric_function_executor_run_task_rpc_errors.count_exceptions(),
-                 metric_function_executor_run_task_rpc_latency.time(),
-             ):
-                 metric_function_executor_run_task_rpcs.inc()
-                 # If this RPC failed due to customer code crashing the server we won't be
-                 # able to detect this. We'll treat this as our own error for now and thus
-                 # let the AioRpcError to be raised here.
-                 response: RunTaskResponse = await FunctionExecutorStub(
-                     channel
-                 ).run_task(request)
-                 return _task_output(task=self._task_input.task, response=response)
-
-     async def _health_check_failed_callback(self, result: HealthCheckResult):
-         # Function Executor destroy due to the periodic health check failure ensures that
-         # a running task RPC stuck in unhealthy Function Executor fails immidiately.
-         async with self._function_executor_state.lock:
-             if (
-                 self._function_executor_state.status
-                 != FunctionExecutorStatus.RUNNING_TASK
-             ):
-                 # Protection in case the callback gets delivered after we finished running the task.
-                 return
-
-             await self._function_executor_state.set_status(
-                 FunctionExecutorStatus.UNHEALTHY
-             )
-             await self._destroy_function_executor_on_failed_health_check(result.reason)
-
-     async def _destroy_existing_function_executor_if_unhealthy(self):
-         self._function_executor_state.check_locked()
-         if self._function_executor_state.status == FunctionExecutorStatus.IDLE:
-             result: HealthCheckResult = (
-                 await self._function_executor_state.function_executor.health_checker().check()
-             )
-             if not result.is_healthy:
-                 await self._function_executor_state.set_status(
-                     FunctionExecutorStatus.UNHEALTHY
-                 )
-
-         if self._function_executor_state.status == FunctionExecutorStatus.UNHEALTHY:
-             await self._destroy_function_executor_on_failed_health_check(result.reason)
-
-     async def _destroy_function_executor_on_failed_health_check(self, reason: str):
-         self._function_executor_state.check_locked()
-         self._logger.error(
-             "Function Executor health check failed, destroying Function Executor",
-             health_check_fail_reason=reason,
-         )
-         await self._function_executor_state.destroy_function_executor()
-
-
- class _RunningTaskContextManager:
-     """Performs all the actions required before and after running a task."""
-
-     def __init__(
-         self,
-         invocation_id: str,
-         task_id: str,
-         health_check_failed_callback: Callable[[], Awaitable[None]],
-         function_executor_state: FunctionExecutorState,
-     ):
-         self._invocation_id: str = invocation_id
-         self._task_id: str = task_id
-         self._health_check_failed_callback: Callable[[], Awaitable[None]] = (
-             health_check_failed_callback
-         )
-         self._state: FunctionExecutorState = function_executor_state
-
-     async def __aenter__(self):
-         await self._state.set_status(FunctionExecutorStatus.RUNNING_TASK)
-         self._state.function_executor.invocation_state_client().add_task_to_invocation_id_entry(
-             task_id=self._task_id,
-             invocation_id=self._invocation_id,
-         )
-         self._state.function_executor.health_checker().start(
-             self._health_check_failed_callback
-         )
-         # Unlock the state so other tasks can act depending on it.
-         self._state.lock.release()
-         return self
-
-     async def __aexit__(self, exc_type, exc_val, exc_tb):
-         await self._state.lock.acquire()
-         # Health check callback could destroy the FunctionExecutor and set status to UNHEALTHY
-         if self._state.status == FunctionExecutorStatus.RUNNING_TASK:
-             await self._state.set_status(FunctionExecutorStatus.IDLE)
-         self._state.function_executor.invocation_state_client().remove_task_to_invocation_id_entry(
-             task_id=self._task_id
-         )
-         self._state.function_executor.health_checker().stop()
-
-
- def _task_output(task: Task, response: RunTaskResponse) -> TaskOutput:
-     response_validator = MessageValidator(response)
-     response_validator.required_field("stdout")
-     response_validator.required_field("stderr")
-     response_validator.required_field("is_reducer")
-     response_validator.required_field("success")
-
-     metrics = TaskMetrics(counters={}, timers={})
-     if response.HasField("metrics"):
-         # Can be None if e.g. function failed.
-         metrics.counters = dict(response.metrics.counters)
-         metrics.timers = dict(response.metrics.timers)
-
-     output = TaskOutput(
-         task_id=task.id,
-         namespace=task.namespace,
-         graph_name=task.compute_graph,
-         function_name=task.compute_fn,
-         graph_version=task.graph_version,
-         graph_invocation_id=task.invocation_id,
-         stdout=response.stdout,
-         stderr=response.stderr,
-         reducer=response.is_reducer,
-         success=response.success,
-         metrics=metrics,
-         output_payload_uri_prefix=task.output_payload_uri_prefix,
-     )
-
-     if response.HasField("function_output"):
-         output.function_output = response.function_output
-         output.output_encoding = response.function_output.output_encoding
-     if response.HasField("router_output"):
-         output.router_output = response.router_output
-
-     return output
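SingleTaskRunner (removed in 0.4.2 along with its caller, task_runner.py) ran exactly one task against a FunctionExecutorState: the caller had to hold the state lock, run() created the Function Executor if needed, released the lock for the duration of the run_task RPC, and re-acquired it before returning. A rough, hypothetical sketch of that calling convention, since the actual 0.3.x caller is not shown in this diff:

```python
# Illustrative only; the real driver lived in the deleted task_runner.py and
# the names of this helper and its parameters are made up for the example.
async def run_one_task(state, task_input, factory, executor_id, base_url, logger):
    runner = SingleTaskRunner(
        executor_id=executor_id,
        function_executor_state=state,
        task_input=task_input,
        function_executor_server_factory=factory,
        base_url=base_url,
        config_path=None,
        logger=logger,
    )
    # run() requires the caller to hold the state lock; it releases the lock
    # while the run_task RPC is in flight and re-acquires it before returning.
    async with state.lock:
        task_output = await runner.run()
    return task_output
```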
indexify/executor/function_executor/task_input.py (deleted)
@@ -1,21 +0,0 @@
- from typing import Optional
-
- from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
-
- from ..api_objects import Task
-
-
- class TaskInput:
-     """Task with all the resources required to run it."""
-
-     def __init__(
-         self,
-         task: Task,
-         graph: SerializedObject,
-         input: SerializedObject,
-         init_value: Optional[SerializedObject],
-     ):
-         self.task: Task = task
-         self.graph: SerializedObject = graph
-         self.input: SerializedObject = input
-         self.init_value: Optional[SerializedObject] = init_value
indexify/executor/function_executor/task_output.py (deleted)
@@ -1,105 +0,0 @@
- from typing import Dict, Optional
-
- from tensorlake.function_executor.proto.function_executor_pb2 import (
-     FunctionOutput,
-     RouterOutput,
- )
-
-
- class TaskMetrics:
-     """Metrics for a task."""
-
-     def __init__(self, counters: Dict[str, int], timers: Dict[str, float]):
-         self.counters = counters
-         self.timers = timers
-
-
- class TaskOutput:
-     """Result of running a task."""
-
-     def __init__(
-         self,
-         task_id: str,
-         namespace: str,
-         graph_name: str,
-         function_name: str,
-         graph_version: str,
-         graph_invocation_id: str,
-         output_payload_uri_prefix: Optional[str],
-         output_encoding: Optional[str] = None,
-         function_output: Optional[FunctionOutput] = None,
-         router_output: Optional[RouterOutput] = None,
-         stdout: Optional[str] = None,
-         stderr: Optional[str] = None,
-         reducer: bool = False,
-         success: bool = False,
-         is_internal_error: bool = False,
-         metrics: Optional[TaskMetrics] = None,
-     ):
-         self.task_id = task_id
-         self.namespace = namespace
-         self.graph_name = graph_name
-         self.function_name = function_name
-         self.graph_version = graph_version
-         self.graph_invocation_id = graph_invocation_id
-         self.function_output = function_output
-         self.router_output = router_output
-         self.stdout = stdout
-         self.stderr = stderr
-         self.reducer = reducer
-         self.success = success
-         self.is_internal_error = is_internal_error
-         self.metrics = metrics
-         self.output_encoding = output_encoding
-         self.output_payload_uri_prefix = output_payload_uri_prefix
-
-     @classmethod
-     def internal_error(
-         cls,
-         task_id: str,
-         namespace: str,
-         graph_name: str,
-         function_name: str,
-         graph_version: str,
-         graph_invocation_id: str,
-         output_payload_uri_prefix: Optional[str],
-     ) -> "TaskOutput":
-         """Creates a TaskOutput for an internal error."""
-         # We are not sharing internal error messages with the customer.
-         return TaskOutput(
-             task_id=task_id,
-             namespace=namespace,
-             graph_name=graph_name,
-             function_name=function_name,
-             graph_version=graph_version,
-             graph_invocation_id=graph_invocation_id,
-             stderr="Platform failed to execute the function.",
-             is_internal_error=True,
-             output_payload_uri_prefix=output_payload_uri_prefix,
-         )
-
-     @classmethod
-     def function_timeout(
-         cls,
-         task_id: str,
-         namespace: str,
-         graph_name: str,
-         function_name: str,
-         graph_version: str,
-         graph_invocation_id: str,
-         timeout_sec: float,
-         output_payload_uri_prefix: Optional[str],
-     ) -> "TaskOutput":
-         """Creates a TaskOutput for an function timeout error."""
-         # Task stdout, stderr is not available.
-         return TaskOutput(
-             task_id=task_id,
-             namespace=namespace,
-             graph_name=graph_name,
-             function_name=function_name,
-             graph_version=graph_version,
-             graph_invocation_id=graph_invocation_id,
-             stderr=f"Function or router exceeded its configured timeout of {timeout_sec:.3f} sec.",
-             is_internal_error=False,
-             output_payload_uri_prefix=output_payload_uri_prefix,
-         )
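The two classmethods on the deleted TaskOutput distinguish platform failures (internal errors whose details are hidden from the customer) from function timeouts (reported with a customer-visible stderr message). A small illustration of the 0.3.x class above, with made-up argument values:

```python
# Argument values are fabricated for the example; the class itself is the
# deleted indexify/executor/function_executor/task_output.py shown above.
platform_failure = TaskOutput.internal_error(
    task_id="task-1",
    namespace="default",
    graph_name="my_graph",
    function_name="my_fn",
    graph_version="1",
    graph_invocation_id="inv-1",
    output_payload_uri_prefix=None,
)
assert platform_failure.is_internal_error and not platform_failure.success

timeout = TaskOutput.function_timeout(
    task_id="task-2",
    namespace="default",
    graph_name="my_graph",
    function_name="my_fn",
    graph_version="1",
    graph_invocation_id="inv-2",
    timeout_sec=30.0,
    output_payload_uri_prefix=None,
)
assert not timeout.is_internal_error and "timeout" in timeout.stderr
```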