indexify 0.3.17__tar.gz → 0.3.19__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. {indexify-0.3.17 → indexify-0.3.19}/PKG-INFO +1 -1
  2. {indexify-0.3.17 → indexify-0.3.19}/pyproject.toml +3 -1
  3. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/cli/cli.py +21 -18
  4. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/api_objects.py +12 -0
  5. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/downloader.py +4 -1
  6. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/executor.py +65 -28
  7. indexify-0.3.19/src/indexify/executor/executor_flavor.py +7 -0
  8. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/function_executor/function_executor.py +24 -11
  9. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/function_executor/function_executor_state.py +9 -1
  10. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/function_executor/function_executor_states_container.py +3 -1
  11. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/function_executor/function_executor_status.py +2 -0
  12. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/function_executor/health_checker.py +20 -2
  13. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +6 -0
  14. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/function_executor/single_task_runner.py +15 -11
  15. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/function_executor/task_output.py +35 -2
  16. indexify-0.3.19/src/indexify/executor/grpc/channel_manager.py +160 -0
  17. indexify-0.3.19/src/indexify/executor/grpc/completed_tasks_container.py +26 -0
  18. indexify-0.3.19/src/indexify/executor/grpc/function_executor_controller.py +421 -0
  19. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/grpc/state_reconciler.py +33 -38
  20. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/grpc/state_reporter.py +100 -39
  21. indexify-0.3.19/src/indexify/executor/grpc/task_controller.py +449 -0
  22. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/metrics/task_reporter.py +14 -0
  23. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/task_fetcher.py +8 -3
  24. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/task_reporter.py +112 -4
  25. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/task_runner.py +1 -0
  26. indexify-0.3.17/src/indexify/proto/task_scheduler.proto → indexify-0.3.19/src/indexify/proto/executor_api.proto +86 -11
  27. indexify-0.3.19/src/indexify/proto/executor_api_pb2.py +80 -0
  28. indexify-0.3.17/src/indexify/proto/task_scheduler_pb2.pyi → indexify-0.3.19/src/indexify/proto/executor_api_pb2.pyi +162 -7
  29. indexify-0.3.19/src/indexify/proto/executor_api_pb2_grpc.py +227 -0
  30. indexify-0.3.17/src/indexify/executor/grpc/channel_creator.py +0 -53
  31. indexify-0.3.17/src/indexify/proto/task_scheduler_pb2.py +0 -64
  32. indexify-0.3.17/src/indexify/proto/task_scheduler_pb2_grpc.py +0 -170
  33. {indexify-0.3.17 → indexify-0.3.19}/README.md +0 -0
  34. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/README.md +0 -0
  35. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/function_executor/invocation_state_client.py +0 -0
  36. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/function_executor/metrics/function_executor.py +0 -0
  37. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/function_executor/metrics/function_executor_state.py +0 -0
  38. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -0
  39. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/function_executor/metrics/health_checker.py +0 -0
  40. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/function_executor/metrics/invocation_state_client.py +0 -0
  41. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/function_executor/metrics/single_task_runner.py +0 -0
  42. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/function_executor/server/client_configuration.py +0 -0
  43. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/function_executor/server/function_executor_server.py +0 -0
  44. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/function_executor/server/function_executor_server_factory.py +0 -0
  45. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/function_executor/server/subprocess_function_executor_server.py +0 -0
  46. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/function_executor/task_input.py +0 -0
  47. /indexify-0.3.17/src/indexify/executor/grpc/metrics/channel_creator.py → /indexify-0.3.19/src/indexify/executor/grpc/metrics/channel_manager.py +0 -0
  48. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/grpc/metrics/state_reporter.py +0 -0
  49. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/metrics/downloader.py +0 -0
  50. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/metrics/executor.py +0 -0
  51. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/metrics/task_fetcher.py +0 -0
  52. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/metrics/task_runner.py +0 -0
  53. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/monitoring/function_allowlist.py +0 -0
  54. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/monitoring/handler.py +0 -0
  55. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/monitoring/health_check_handler.py +0 -0
  56. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/monitoring/health_checker/generic_health_checker.py +0 -0
  57. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/monitoring/health_checker/health_checker.py +0 -0
  58. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/monitoring/metrics.py +0 -0
  59. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/monitoring/prometheus_metrics_handler.py +0 -0
  60. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/monitoring/server.py +0 -0
  61. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/monitoring/startup_probe_handler.py +0 -0
  62. {indexify-0.3.17 → indexify-0.3.19}/src/indexify/executor/runtime_probes.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: indexify
3
- Version: 0.3.17
3
+ Version: 0.3.19
4
4
  Summary: Open Source Indexify components and helper tools
5
5
  Home-page: https://github.com/tensorlakeai/indexify
6
6
  License: Apache 2.0
@@ -1,7 +1,7 @@
1
1
  [tool.poetry]
2
2
  name = "indexify"
3
3
  # Incremented if any of the components provided in this package are updated.
4
- version = "0.3.17"
4
+ version = "0.3.19"
5
5
  description = "Open Source Indexify components and helper tools"
6
6
  authors = ["Tensorlake Inc. <support@tensorlake.ai>"]
7
7
  license = "Apache 2.0"
@@ -24,6 +24,8 @@ aiohttp = "^3.11.0"
24
24
  prometheus-client = "^0.21.1"
25
25
  # Adds function-executor binary and utils lib.
26
26
  tensorlake = ">=0.1"
27
+ # Uncomment the next line to use local tensorlake package (only for development!)
28
+ # tensorlake = { path = "../tensorlake", develop = true }
27
29
  # pydantic is provided by tensorlake
28
30
  # httpx-sse is provided by tensorlake
29
31
  # grpcio is provided by tensorlake
@@ -13,7 +13,7 @@ import sys
13
13
  from importlib.metadata import version
14
14
  from pathlib import Path
15
15
  from socket import gethostname
16
- from typing import Annotated, List, Optional, Tuple
16
+ from typing import Annotated, Dict, List, Optional, Tuple
17
17
 
18
18
  import nanoid
19
19
  import prometheus_client
@@ -26,6 +26,7 @@ from tensorlake.functions_sdk.image import Image
26
26
 
27
27
  from indexify.executor.api_objects import FunctionURI
28
28
  from indexify.executor.executor import Executor
29
+ from indexify.executor.executor_flavor import ExecutorFlavor
29
30
  from indexify.executor.function_executor.server.subprocess_function_executor_server_factory import (
30
31
  SubprocessFunctionExecutorServerFactory,
31
32
  )
@@ -77,6 +78,7 @@ def build_image(
77
78
  )
78
79
  def executor(
79
80
  server_addr: str = "localhost:8900",
81
+ grpc_server_addr: str = "localhost:8901",
80
82
  dev: Annotated[
81
83
  bool, typer.Option("--dev", "-d", help="Run the executor in development mode")
82
84
  ] = False,
@@ -119,17 +121,6 @@ def executor(
119
121
  help="Port where to run Executor Monitoring server",
120
122
  ),
121
123
  ] = 7000,
122
- # TODO: Figure out mTLS for gRPC.
123
- grpc_server_addr: Annotated[
124
- Optional[str],
125
- typer.Option(
126
- "--grpc-server-addr",
127
- help=(
128
- "(exprimental) Address of server gRPC API to connect to, e.g. 'localhost:8901'.\n"
129
- "Enables gRPC state reporter that will periodically report the state of the Function Executors to Server\n"
130
- ),
131
- ),
132
- ] = None,
133
124
  enable_grpc_state_reconciler: Annotated[
134
125
  bool,
135
126
  typer.Option(
@@ -140,6 +131,15 @@ def executor(
140
131
  ),
141
132
  ),
142
133
  ] = False,
134
+ labels: Annotated[
135
+ List[str],
136
+ typer.Option(
137
+ "--label",
138
+ "-l",
139
+ help="Executor key-value label to be sent to the Server. "
140
+ "Specified as <key>=<value>",
141
+ ),
142
+ ] = [],
143
143
  ):
144
144
  if dev:
145
145
  configure_development_mode_logging()
@@ -157,10 +157,10 @@ def executor(
157
157
  "--executor-id should be at least 10 characters long and only include characters _-[0-9][a-z][A-Z]"
158
158
  )
159
159
 
160
- if enable_grpc_state_reconciler and grpc_server_addr is None:
161
- raise typer.BadParameter(
162
- "--grpc-server-addr must be set when --enable-grpc-state-reconciler is set"
163
- )
160
+ kv_labels: Dict[str, str] = {}
161
+ for label in labels:
162
+ key, value = label.split("=")
163
+ kv_labels[key] = value
164
164
 
165
165
  executor_version = version("indexify")
166
166
  logger = structlog.get_logger(module=__name__, executor_id=executor_id)
@@ -169,15 +169,16 @@ def executor(
169
169
  "starting executor",
170
170
  hostname=gethostname(),
171
171
  server_addr=server_addr,
172
+ grpc_server_addr=grpc_server_addr,
172
173
  config_path=config_path,
173
174
  executor_version=executor_version,
175
+ labels=kv_labels,
174
176
  executor_cache=executor_cache,
175
177
  ports=ports,
176
178
  functions=function_uris,
177
179
  dev_mode=dev,
178
180
  monitoring_server_host=monitoring_server_host,
179
181
  monitoring_server_port=monitoring_server_port,
180
- grpc_server_addr=grpc_server_addr,
181
182
  enable_grpc_state_reconciler=enable_grpc_state_reconciler,
182
183
  )
183
184
 
@@ -205,7 +206,9 @@ def executor(
205
206
  Executor(
206
207
  id=executor_id,
207
208
  development_mode=dev,
209
+ flavor=ExecutorFlavor.OSS,
208
210
  version=executor_version,
211
+ labels=kv_labels,
209
212
  health_checker=GenericHealthChecker(),
210
213
  code_path=executor_cache,
211
214
  function_allowlist=_parse_function_uris(function_uris),
@@ -214,10 +217,10 @@ def executor(
214
217
  server_ports=range(ports[0], ports[1]),
215
218
  ),
216
219
  server_addr=server_addr,
220
+ grpc_server_addr=grpc_server_addr,
217
221
  config_path=config_path,
218
222
  monitoring_server_host=monitoring_server_host,
219
223
  monitoring_server_port=monitoring_server_port,
220
- grpc_server_addr=grpc_server_addr,
221
224
  enable_grpc_state_reconciler=enable_grpc_state_reconciler,
222
225
  ).run()
223
226
 
@@ -49,5 +49,17 @@ class TaskResult(BaseModel):
49
49
  reducer: bool = False
50
50
 
51
51
 
52
+ class DataPayload(BaseModel):
53
+ path: str
54
+ size: int
55
+ sha256_hash: str
56
+
57
+
58
+ class IngestFnOutputsResponse(BaseModel):
59
+ data_payloads: List[DataPayload]
60
+ stdout: Optional[DataPayload] = None
61
+ stderr: Optional[DataPayload] = None
62
+
63
+
52
64
  TASK_OUTCOME_SUCCESS = "success"
53
65
  TASK_OUTCOME_FAILURE = "failure"
@@ -241,7 +241,10 @@ class Downloader:
241
241
  def serialized_object_from_http_response(response: httpx.Response) -> SerializedObject:
242
242
  # We're hardcoding the content type currently used by Python SDK. It might change in the future.
243
243
  # There's no other way for now to determine if the response is a bytes or string.
244
- if response.headers["content-type"] == "application/octet-stream":
244
+ if response.headers["content-type"] in [
245
+ "application/octet-stream",
246
+ "application/pickle",
247
+ ]:
245
248
  return SerializedObject(
246
249
  bytes=response.content, content_type=response.headers["content-type"]
247
250
  )
@@ -9,17 +9,18 @@ import structlog
9
9
  from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
10
10
  from tensorlake.utils.logging import suppress as suppress_logging
11
11
 
12
- from indexify.proto.task_scheduler_pb2 import ExecutorStatus
12
+ from indexify.proto.executor_api_pb2 import ExecutorStatus
13
13
 
14
14
  from .api_objects import FunctionURI, Task
15
15
  from .downloader import Downloader
16
+ from .executor_flavor import ExecutorFlavor
16
17
  from .function_executor.function_executor_states_container import (
17
18
  FunctionExecutorStatesContainer,
18
19
  )
19
20
  from .function_executor.server.function_executor_server_factory import (
20
21
  FunctionExecutorServerFactory,
21
22
  )
22
- from .grpc.channel_creator import ChannelCreator
23
+ from .grpc.channel_manager import ChannelManager
23
24
  from .grpc.state_reconciler import ExecutorStateReconciler
24
25
  from .grpc.state_reporter import ExecutorStateReporter
25
26
  from .metrics.executor import (
@@ -55,16 +56,18 @@ class Executor:
55
56
  self,
56
57
  id: str,
57
58
  development_mode: bool,
59
+ flavor: ExecutorFlavor,
58
60
  version: str,
61
+ labels: Dict[str, str],
59
62
  code_path: Path,
60
63
  health_checker: HealthChecker,
61
64
  function_allowlist: Optional[List[FunctionURI]],
62
65
  function_executor_server_factory: FunctionExecutorServerFactory,
63
66
  server_addr: str,
67
+ grpc_server_addr: str,
64
68
  config_path: Optional[str],
65
69
  monitoring_server_host: str,
66
70
  monitoring_server_port: int,
67
- grpc_server_addr: Optional[str],
68
71
  enable_grpc_state_reconciler: bool,
69
72
  ):
70
73
  self._logger = structlog.get_logger(module=__name__)
@@ -94,36 +97,40 @@ class Executor:
94
97
  self._downloader = Downloader(
95
98
  code_path=code_path, base_url=self._base_url, config_path=config_path
96
99
  )
100
+ self._function_allowlist: Optional[List[FunctionURI]] = function_allowlist
101
+ self._function_executor_server_factory = function_executor_server_factory
102
+ self._channel_manager = ChannelManager(
103
+ server_address=grpc_server_addr,
104
+ config_path=config_path,
105
+ logger=self._logger,
106
+ )
107
+ self._state_reporter = ExecutorStateReporter(
108
+ executor_id=id,
109
+ flavor=flavor,
110
+ version=version,
111
+ labels=labels,
112
+ development_mode=development_mode,
113
+ function_allowlist=self._function_allowlist,
114
+ function_executor_states=self._function_executor_states,
115
+ channel_manager=self._channel_manager,
116
+ logger=self._logger,
117
+ )
118
+ self._state_reporter.update_executor_status(
119
+ ExecutorStatus.EXECUTOR_STATUS_STARTING_UP
120
+ )
97
121
  self._task_reporter = TaskReporter(
98
122
  base_url=self._base_url,
99
123
  executor_id=id,
100
124
  config_path=config_path,
125
+ channel_manager=self._channel_manager,
101
126
  )
102
- self._function_allowlist: Optional[List[FunctionURI]] = function_allowlist
103
- self._function_executor_server_factory = function_executor_server_factory
104
127
 
105
- # HTTP mode services
128
+ # HTTP mode task runner
106
129
  self._task_runner: Optional[TaskRunner] = None
107
130
  self._task_fetcher: Optional[TaskFetcher] = None
108
- # gRPC mode services
109
- self._channel_creator: Optional[ChannelCreator] = None
110
- self._state_reporter: Optional[ExecutorStateReporter] = None
131
+ # gRPC mode state reconciler that runs tasks
111
132
  self._state_reconciler: Optional[ExecutorStateReconciler] = None
112
133
 
113
- if grpc_server_addr is not None:
114
- self._channel_creator = ChannelCreator(grpc_server_addr, self._logger)
115
- self._state_reporter = ExecutorStateReporter(
116
- executor_id=id,
117
- development_mode=development_mode,
118
- function_allowlist=self._function_allowlist,
119
- function_executor_states=self._function_executor_states,
120
- channel_creator=self._channel_creator,
121
- logger=self._logger,
122
- )
123
- self._state_reporter.update_executor_status(
124
- ExecutorStatus.EXECUTOR_STATUS_STARTING_UP
125
- )
126
-
127
134
  if enable_grpc_state_reconciler:
128
135
  self._state_reconciler = ExecutorStateReconciler(
129
136
  executor_id=id,
@@ -133,7 +140,8 @@ class Executor:
133
140
  config_path=config_path,
134
141
  downloader=self._downloader,
135
142
  task_reporter=self._task_reporter,
136
- channel_creator=self._channel_creator,
143
+ channel_manager=self._channel_manager,
144
+ state_reporter=self._state_reporter,
137
145
  logger=self._logger,
138
146
  )
139
147
  else:
@@ -147,6 +155,7 @@ class Executor:
147
155
  self._task_fetcher = TaskFetcher(
148
156
  executor_id=id,
149
157
  executor_version=version,
158
+ labels=labels,
150
159
  function_allowlist=function_allowlist,
151
160
  protocol=protocol,
152
161
  indexify_server_addr=self._server_addr,
@@ -159,8 +168,8 @@ class Executor:
159
168
  "version": version,
160
169
  "code_path": str(code_path),
161
170
  "server_addr": server_addr,
162
- "config_path": str(config_path),
163
171
  "grpc_server_addr": str(grpc_server_addr),
172
+ "config_path": str(config_path),
164
173
  "enable_grpc_state_reconciler": str(enable_grpc_state_reconciler),
165
174
  "hostname": gethostname(),
166
175
  }
@@ -244,6 +253,9 @@ class Executor:
244
253
  )
245
254
  logger.error("task execution failed", exc_info=e)
246
255
 
256
+ if output.metrics is not None:
257
+ self.log_function_metrics(output)
258
+
247
259
  with (
248
260
  metric_tasks_reporting_outcome.track_inprogress(),
249
261
  metric_task_outcome_report_latency.time(),
@@ -253,6 +265,28 @@ class Executor:
253
265
 
254
266
  metric_task_completion_latency.observe(time.monotonic() - start_time)
255
267
 
268
+ def log_function_metrics(self, output: TaskOutput):
269
+ for counter_name, counter_value in output.metrics.counters.items():
270
+ self._logger.info(
271
+ f"function_metric",
272
+ counter_name=counter_name,
273
+ counter_value=counter_value,
274
+ invocation_id=output.graph_invocation_id,
275
+ function_name=output.function_name,
276
+ graph_name=output.graph_name,
277
+ namespace=output.namespace,
278
+ )
279
+ for timer_name, timer_value in output.metrics.timers.items():
280
+ self._logger.info(
281
+ f"function_metric",
282
+ timer_name=timer_name,
283
+ timer_value=timer_value,
284
+ invocation_id=output.graph_invocation_id,
285
+ function_name=output.function_name,
286
+ graph_name=output.graph_name,
287
+ namespace=output.namespace,
288
+ )
289
+
256
290
  async def _run_task_and_get_output(self, task: Task, logger: Any) -> TaskOutput:
257
291
  graph: SerializedObject = await self._downloader.download_graph(
258
292
  namespace=task.namespace,
@@ -326,7 +360,9 @@ class Executor:
326
360
  ).inc()
327
361
 
328
362
  async def _shutdown(self, loop):
329
- self._logger.info("shutting down")
363
+ self._logger.info(
364
+ "shutting down, all Executor logs are suppressed, no task outcomes will be reported to Server from this point"
365
+ )
330
366
  if self._state_reporter is not None:
331
367
  self._state_reporter.update_executor_status(
332
368
  ExecutorStatus.EXECUTOR_STATUS_STOPPING
@@ -339,12 +375,13 @@ class Executor:
339
375
 
340
376
  self._is_shutdown = True
341
377
  await self._monitoring_server.shutdown()
378
+ await self._task_reporter.shutdown()
342
379
 
343
380
  if self._task_runner is not None:
344
381
  await self._task_runner.shutdown()
345
382
 
346
- if self._channel_creator is not None:
347
- await self._channel_creator.shutdown()
383
+ if self._channel_manager is not None:
384
+ await self._channel_manager.shutdown()
348
385
  if self._state_reporter is not None:
349
386
  await self._state_reporter.shutdown()
350
387
  if self._state_reconciler is not None:
@@ -0,0 +1,7 @@
1
+ from enum import Enum
2
+
3
+
4
+ class ExecutorFlavor(Enum):
5
+ UNKNOWN = "unknown"
6
+ OSS = "oss"
7
+ PLATFORM = "platform"
@@ -88,6 +88,7 @@ class FunctionExecutor:
88
88
  initialize_request: InitializeRequest,
89
89
  base_url: str,
90
90
  config_path: Optional[str],
91
+ customer_code_timeout_sec: Optional[float] = None,
91
92
  ):
92
93
  """Creates and initializes a FunctionExecutorServer and all resources associated with it.
93
94
 
@@ -103,7 +104,9 @@ class FunctionExecutor:
103
104
  await self._establish_channel()
104
105
  stub: FunctionExecutorStub = FunctionExecutorStub(self._channel)
105
106
  await _collect_server_info(stub)
106
- await _initialize_server(stub, initialize_request)
107
+ await _initialize_server(
108
+ stub, initialize_request, customer_code_timeout_sec
109
+ )
107
110
  await self._create_invocation_state_client(
108
111
  stub=stub,
109
112
  base_url=base_url,
@@ -293,18 +296,28 @@ async def _collect_server_info(stub: FunctionExecutorStub) -> None:
293
296
 
294
297
 
295
298
  async def _initialize_server(
296
- stub: FunctionExecutorStub, initialize_request: InitializeRequest
299
+ stub: FunctionExecutorStub,
300
+ initialize_request: InitializeRequest,
301
+ customer_code_timeout_sec: Optional[float],
297
302
  ) -> None:
298
303
  with (
299
304
  metric_initialize_rpc_errors.count_exceptions(),
300
305
  metric_initialize_rpc_latency.time(),
301
306
  ):
302
- initialize_response: InitializeResponse = await stub.initialize(
303
- initialize_request
304
- )
305
- if initialize_response.success:
306
- return
307
- if initialize_response.HasField("customer_error"):
308
- raise CustomerError(initialize_response.customer_error)
309
- else:
310
- raise Exception("initialize RPC failed at function executor server")
307
+ try:
308
+ initialize_response: InitializeResponse = await stub.initialize(
309
+ initialize_request,
310
+ timeout=customer_code_timeout_sec,
311
+ )
312
+ if initialize_response.success:
313
+ return
314
+ if initialize_response.HasField("customer_error"):
315
+ raise CustomerError(initialize_response.customer_error)
316
+ else:
317
+ raise Exception("initialize RPC failed at function executor server")
318
+ except grpc.aio.AioRpcError as e:
319
+ if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
320
+ raise CustomerError(
321
+ f"Customer code timeout {customer_code_timeout_sec} sec expired"
322
+ ) from e
323
+ raise
@@ -25,6 +25,7 @@ class FunctionExecutorState:
25
25
  graph_version: str,
26
26
  function_name: str,
27
27
  image_uri: Optional[str],
28
+ secret_names: List[str],
28
29
  logger: Any,
29
30
  ):
30
31
  # Read only fields.
@@ -33,6 +34,7 @@ class FunctionExecutorState:
33
34
  self.graph_name: str = graph_name
34
35
  self.function_name: str = function_name
35
36
  self.image_uri: Optional[str] = image_uri
37
+ self.secret_names: List[str] = secret_names
36
38
  self._logger: Any = logger.bind(
37
39
  module=__name__,
38
40
  function_executor_id=id,
@@ -47,6 +49,7 @@ class FunctionExecutorState:
47
49
  # TODO: Move graph_version to immutable fields once we migrate to gRPC State Reconciler.
48
50
  self.graph_version: str = graph_version
49
51
  self.status: FunctionExecutorStatus = FunctionExecutorStatus.DESTROYED
52
+ self.status_message: str = ""
50
53
  self.status_change_notifier: asyncio.Condition = asyncio.Condition(
51
54
  lock=self.lock
52
55
  )
@@ -62,7 +65,9 @@ class FunctionExecutorState:
62
65
  while self.status not in allowlist:
63
66
  await self.status_change_notifier.wait()
64
67
 
65
- async def set_status(self, new_status: FunctionExecutorStatus) -> None:
68
+ async def set_status(
69
+ self, new_status: FunctionExecutorStatus, status_message: str = ""
70
+ ) -> None:
66
71
  """Sets the status of the Function Executor.
67
72
 
68
73
  The caller must hold the lock.
@@ -70,6 +75,7 @@ class FunctionExecutorState:
70
75
  """
71
76
  self.check_locked()
72
77
  if is_status_change_allowed(self.status, new_status):
78
+ # If status didn't change then still log it for visibility.
73
79
  self._logger.info(
74
80
  "function executor status changed",
75
81
  old_status=self.status.name,
@@ -78,12 +84,14 @@ class FunctionExecutorState:
78
84
  metric_function_executors_with_status.labels(status=self.status.name).dec()
79
85
  metric_function_executors_with_status.labels(status=new_status.name).inc()
80
86
  self.status = new_status
87
+ self.status_message = status_message
81
88
  self.status_change_notifier.notify_all()
82
89
  else:
83
90
  raise ValueError(
84
91
  f"Invalid status change from {self.status} to {new_status}"
85
92
  )
86
93
 
94
+ # TODO: Delete this method once HTTP protocol is removed as it's used only there.
87
95
  async def destroy_function_executor(self) -> None:
88
96
  """Destroys the Function Executor if it exists.
89
97
 
@@ -1,5 +1,5 @@
1
1
  import asyncio
2
- from typing import Any, AsyncGenerator, Dict, Optional
2
+ from typing import Any, AsyncGenerator, Dict, List, Optional
3
3
 
4
4
  from .function_executor_state import FunctionExecutorState
5
5
  from .function_executor_status import FunctionExecutorStatus
@@ -26,6 +26,7 @@ class FunctionExecutorStatesContainer:
26
26
  graph_version: str,
27
27
  function_name: str,
28
28
  image_uri: Optional[str],
29
+ secret_names: List[str],
29
30
  ) -> FunctionExecutorState:
30
31
  """Get or create a function executor state with the given ID.
31
32
 
@@ -45,6 +46,7 @@ class FunctionExecutorStatesContainer:
45
46
  graph_version=graph_version,
46
47
  function_name=function_name,
47
48
  image_uri=image_uri,
49
+ secret_names=secret_names,
48
50
  logger=self._logger,
49
51
  )
50
52
  self._states[id] = state
@@ -23,6 +23,7 @@ class FunctionExecutorStatus(Enum):
23
23
  UNHEALTHY = "Unhealthy"
24
24
  # STARTUP_FAILED_CUSTOMER_ERROR -> DESTROYING
25
25
  # STARTUP_FAILED_PLATFORM_ERROR -> DESTROYING
26
+ # RUNNING_TASK -> DESTROYING
26
27
  # UNHEALTHY -> DESTROYING
27
28
  # IDLE -> DESTROYING
28
29
  DESTROYING = "Destroying"
@@ -69,6 +70,7 @@ def is_status_change_allowed(
69
70
  ],
70
71
  FunctionExecutorStatus.RUNNING_TASK: [
71
72
  FunctionExecutorStatus.RUNNING_TASK,
73
+ FunctionExecutorStatus.DESTROYING,
72
74
  FunctionExecutorStatus.IDLE,
73
75
  FunctionExecutorStatus.UNHEALTHY,
74
76
  FunctionExecutorStatus.SHUTDOWN,
@@ -70,8 +70,10 @@ class HealthChecker:
70
70
  # code is not involved when TCP connections are established to FE. Problems reestablishing
71
71
  # the TCP connection are usually due to the FE process crashing and its gRPC server socket
72
72
  # not being available anymore or due to prolonged local networking failures on Executor.
73
- channel_connectivity = self._channel.get_state()
74
- if channel_connectivity == grpc.ChannelConnectivity.TRANSIENT_FAILURE:
73
+ if (
74
+ _channel_state(self._channel, self._logger)
75
+ == grpc.ChannelConnectivity.TRANSIENT_FAILURE
76
+ ):
75
77
  return HealthCheckResult(
76
78
  is_healthy=False,
77
79
  reason="Channel is in TRANSIENT_FAILURE state, assuming Function Executor crashed.",
@@ -126,3 +128,19 @@ class HealthChecker:
126
128
 
127
129
  asyncio.create_task(self._health_check_failed_callback(result))
128
130
  self._health_check_loop_task = None
131
+
132
+
133
+ def _channel_state(channel: grpc.aio.Channel, logger: Any) -> grpc.ChannelConnectivity:
134
+ """Get channel connectivity state and suppresses all exceptions.
135
+
136
+ Suppressing the exceptions is important because the channel connectivity state is an experimental
137
+ feature. On error falls back to READY state which assumes that the channel is okay.
138
+ """
139
+ try:
140
+ return channel.get_state()
141
+ except Exception as e:
142
+ logger.error(
143
+ "Failed getting channel state, falling back to default READY state",
144
+ exc_info=e,
145
+ )
146
+ return grpc.ChannelConnectivity.READY
@@ -25,6 +25,12 @@ class SubprocessFunctionExecutorServerFactory(FunctionExecutorServerFactory):
25
25
  logger = logger.bind(module=__name__)
26
26
  port: Optional[int] = None
27
27
 
28
+ if len(config.secret_names) > 0:
29
+ logger.warning(
30
+ "Subprocess Function Executor does not support secrets. Please supply secrets as environment variables.",
31
+ secret_names=config.secret_names,
32
+ )
33
+
28
34
  try:
29
35
  port = self._allocate_port()
30
36
  args = [
@@ -10,6 +10,7 @@ from tensorlake.function_executor.proto.function_executor_pb2 import (
10
10
  from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
11
11
  FunctionExecutorStub,
12
12
  )
13
+ from tensorlake.function_executor.proto.message_validator import MessageValidator
13
14
 
14
15
  from ..api_objects import Task
15
16
  from .function_executor import CustomerError, FunctionExecutor
@@ -26,7 +27,7 @@ from .server.function_executor_server_factory import (
26
27
  FunctionExecutorServerFactory,
27
28
  )
28
29
  from .task_input import TaskInput
29
- from .task_output import TaskOutput
30
+ from .task_output import TaskMetrics, TaskOutput
30
31
 
31
32
 
32
33
  class SingleTaskRunner:
@@ -286,16 +287,17 @@ class _RunningTaskContextManager:
286
287
 
287
288
 
288
289
  def _task_output(task: Task, response: RunTaskResponse) -> TaskOutput:
289
- required_fields = [
290
- "stdout",
291
- "stderr",
292
- "is_reducer",
293
- "success",
294
- ]
295
-
296
- for field in required_fields:
297
- if not response.HasField(field):
298
- raise ValueError(f"Response is missing required field: {field}")
290
+ response_validator = MessageValidator(response)
291
+ response_validator.required_field("stdout")
292
+ response_validator.required_field("stderr")
293
+ response_validator.required_field("is_reducer")
294
+ response_validator.required_field("success")
295
+
296
+ metrics = TaskMetrics(counters={}, timers={})
297
+ if response.HasField("metrics"):
298
+ # Can be None if e.g. function failed.
299
+ metrics.counters = dict(response.metrics.counters)
300
+ metrics.timers = dict(response.metrics.timers)
299
301
 
300
302
  output = TaskOutput(
301
303
  task_id=task.id,
@@ -308,10 +310,12 @@ def _task_output(task: Task, response: RunTaskResponse) -> TaskOutput:
308
310
  stderr=response.stderr,
309
311
  reducer=response.is_reducer,
310
312
  success=response.success,
313
+ metrics=metrics,
311
314
  )
312
315
 
313
316
  if response.HasField("function_output"):
314
317
  output.function_output = response.function_output
318
+ output.output_encoding = response.function_output.output_encoding
315
319
  if response.HasField("router_output"):
316
320
  output.router_output = response.router_output
317
321