indexify 0.2.42__py3-none-any.whl → 0.2.43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
indexify/cli.py CHANGED
@@ -22,7 +22,7 @@ from rich.panel import Panel
 from rich.text import Text
 from rich.theme import Theme
 
-from indexify.executor.agent import ExtractorAgent
+from indexify.executor.executor import Executor
 from indexify.function_executor.function_executor_service import (
     FunctionExecutorService,
 )
@@ -221,7 +221,7 @@ def executor(
         shutil.rmtree(executor_cache)
     Path(executor_cache).mkdir(parents=True, exist_ok=True)
 
-    agent = ExtractorAgent(
+    executor = Executor(
         id,
         server_addr=server_addr,
        config_path=config_path,
@@ -232,7 +232,7 @@ def executor(
    )
 
    try:
-        asyncio.get_event_loop().run_until_complete(agent.run())
+        asyncio.get_event_loop().run_until_complete(executor.run())
    except asyncio.CancelledError:
        logger.info("graceful shutdown")
 
indexify/executor/executor.py ADDED
@@ -0,0 +1,155 @@
+import asyncio
+import signal
+from pathlib import Path
+from typing import Any, Optional
+
+import structlog
+
+from indexify.function_executor.proto.function_executor_pb2 import (
+    SerializedObject,
+)
+
+from .api_objects import Task
+from .downloader import DownloadedInputs, Downloader
+from .function_executor.process_function_executor_factory import (
+    ProcessFunctionExecutorFactory,
+)
+from .function_worker import (
+    FunctionWorker,
+    FunctionWorkerInput,
+    FunctionWorkerOutput,
+)
+from .task_fetcher import TaskFetcher
+from .task_reporter import TaskReporter
+
+
+class Executor:
+    def __init__(
+        self,
+        executor_id: str,
+        code_path: Path,
+        server_addr: str = "localhost:8900",
+        development_mode: bool = False,
+        config_path: Optional[str] = None,
+        name_alias: Optional[str] = None,
+        image_hash: Optional[str] = None,
+    ):
+        self._logger = structlog.get_logger(module=__name__)
+        self._should_run = True
+        self._config_path = config_path
+        protocol: str = "http"
+        if config_path:
+            self._logger.info("running the extractor with TLS enabled")
+            protocol = "https"
+
+        self._function_worker = FunctionWorker(
+            function_executor_factory=ProcessFunctionExecutorFactory(
+                indexify_server_address=server_addr,
+                development_mode=development_mode,
+                config_path=config_path,
+            )
+        )
+        self._server_addr = server_addr
+        self._base_url = f"{protocol}://{self._server_addr}"
+        self._code_path = code_path
+        self._downloader = Downloader(
+            code_path=code_path, base_url=self._base_url, config_path=config_path
+        )
+        self._task_fetcher = TaskFetcher(
+            protocol=protocol,
+            indexify_server_addr=self._server_addr,
+            executor_id=executor_id,
+            name_alias=name_alias,
+            image_hash=image_hash,
+            config_path=config_path,
+        )
+        self._task_reporter = TaskReporter(
+            base_url=self._base_url,
+            executor_id=executor_id,
+            config_path=self._config_path,
+        )
+
+    async def run(self):
+        asyncio.get_event_loop().add_signal_handler(
+            signal.SIGINT, self.shutdown, asyncio.get_event_loop()
+        )
+        asyncio.get_event_loop().add_signal_handler(
+            signal.SIGTERM, self.shutdown, asyncio.get_event_loop()
+        )
+
+        while self._should_run:
+            try:
+                async for task in self._task_fetcher.run():
+                    asyncio.create_task(self._run_task(task))
+            except Exception as e:
+                self._logger.error(
+                    "failed fetching tasks, retrying in 5 seconds", exc_info=e
+                )
+                await asyncio.sleep(5)
+
+    async def _run_task(self, task: Task) -> None:
+        """Runs the supplied task.
+
+        Doesn't raise any Exceptions. All errors are reported to the server."""
+        logger = self._task_logger(task)
+        output: Optional[FunctionWorkerOutput] = None
+
+        try:
+            graph: SerializedObject = await self._downloader.download_graph(task)
+            input: DownloadedInputs = await self._downloader.download_inputs(task)
+            output = await self._function_worker.run(
+                input=FunctionWorkerInput(
+                    task=task,
+                    graph=graph,
+                    function_input=input,
+                )
+            )
+            logger.info("task_execution_finished", success=output.success)
+        except Exception as e:
+            logger.error("failed running the task", exc_info=e)
+
+        await self._report_task_outcome(task=task, output=output, logger=logger)
+
+    async def _report_task_outcome(
+        self, task: Task, output: Optional[FunctionWorkerOutput], logger: Any
+    ) -> None:
+        """Reports the task with the given output to the server.
+
+        None output means that the task execution didn't finish due to an internal error.
+        Doesn't raise any exceptions."""
+        reporting_retries: int = 0
+
+        while True:
+            logger = logger.bind(retries=reporting_retries)
+            try:
+                await self._task_reporter.report(
+                    task=task, output=output, logger=logger
+                )
+                break
+            except Exception as e:
+                logger.error(
+                    "failed_to_report_task",
+                    exc_info=e,
+                )
+                reporting_retries += 1
+                await asyncio.sleep(5)
+
+    async def _shutdown(self, loop):
+        self._logger.info("shutting_down")
+        self._should_run = False
+        await self._function_worker.shutdown()
+        for task in asyncio.all_tasks(loop):
+            task.cancel()
+
+    def shutdown(self, loop):
+        loop.create_task(self._shutdown(loop))
+
+    def _task_logger(self, task: Task) -> Any:
+        return self._logger.bind(
+            namespace=task.namespace,
+            graph=task.compute_graph,
+            graph_version=task.graph_version,
+            invocation_id=task.invocation_id,
+            function_name=task.compute_fn,
+            task_id=task.id,
+        )
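For orientation, here is a minimal sketch of how the new Executor class is driven, mirroring the cli.py change above. The executor id and code path are placeholder values for illustration, not taken from the package:

```python
# Minimal sketch: wiring up the new Executor the same way cli.py does.
import asyncio
from pathlib import Path

from indexify.executor.executor import Executor

executor = Executor(
    "executor-1",  # placeholder executor id
    code_path=Path("~/.indexify/code").expanduser(),  # placeholder cache path
    server_addr="localhost:8900",  # default from Executor.__init__
)

try:
    # cli.py drives the executor's fetch/run loop on the current event loop.
    asyncio.get_event_loop().run_until_complete(executor.run())
except asyncio.CancelledError:
    pass  # tasks are cancelled by Executor.shutdown() on SIGINT/SIGTERM
```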
indexify/executor/function_worker.py CHANGED
@@ -31,13 +31,11 @@ class FunctionWorkerInput:
     def __init__(
         self,
         task: Task,
-        graph: Optional[SerializedObject] = None,
-        function_input: Optional[DownloadedInputs] = None,
+        graph: SerializedObject,
+        function_input: DownloadedInputs,
     ):
         self.task = task
-        # Must not be None when running the task.
         self.graph = graph
-        # Must not be None when running the task.
         self.function_input = function_input
 
 
indexify/executor/task_fetcher.py CHANGED
@@ -2,7 +2,6 @@ import json
 from importlib.metadata import version
 from typing import AsyncGenerator, Optional
 
-import httpx
 import structlog
 from httpx_sse import aconnect_sse
 
@@ -66,7 +65,7 @@ class TaskFetcher:
         except Exception as e:
             await event_source.response.aread()
             raise Exception(
-                "Failed to register at server. "
+                "failed to register at server. "
                 f"Response code: {event_source.response.status_code}. "
                 f"Response text: '{event_source.response.text}'."
             ) from e
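As the imports above show, TaskFetcher streams tasks from the server over server-sent events via httpx-sse's aconnect_sse. A minimal sketch of that pattern; the endpoint path and payload handling are illustrative, not taken from this package:

```python
# Sketch of the SSE streaming pattern TaskFetcher builds on.
import httpx
from httpx_sse import aconnect_sse


async def stream_events(base_url: str):
    async with httpx.AsyncClient() as client:
        # aconnect_sse wraps an HTTP request and exposes the response
        # as an async stream of server-sent events.
        async with aconnect_sse(client, "GET", f"{base_url}/events") as event_source:
            async for sse in event_source.aiter_sse():
                yield sse.data  # each event's data field, as a string
```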
indexify/executor/task_reporter.py CHANGED
@@ -1,16 +1,16 @@
-from typing import Optional
+import asyncio
+from typing import Any, List, Optional, Tuple
 
 import nanoid
-import structlog
 from httpx import Timeout
-from pydantic import BaseModel
 
 from indexify.common_util import get_httpx_client
-from indexify.executor.api_objects import RouterOutput as ApiRouterOutput
-from indexify.executor.api_objects import TaskResult
-from indexify.executor.task_store import CompletedTask
+from indexify.executor.api_objects import RouterOutput, Task, TaskResult
+from indexify.function_executor.proto.function_executor_pb2 import (
+    FunctionOutput,
+)
 
-logger = structlog.get_logger(__name__)
+from .function_worker import FunctionWorkerOutput
 
 
 # https://github.com/psf/requests/issues/1081#issuecomment-428504128
@@ -23,13 +23,16 @@ FORCE_MULTIPART = ForceMultipartDict()
 UTF_8_CONTENT_TYPE = "application/octet-stream"
 
 
-class ReportingData(BaseModel):
-    output_count: int = 0
-    output_total_bytes: int = 0
-    stdout_count: int = 0
-    stdout_total_bytes: int = 0
-    stderr_count: int = 0
-    stderr_total_bytes: int = 0
+class TaskOutputSummary:
+    def __init__(self):
+        self.output_count: int = 0
+        self.output_total_bytes: int = 0
+        self.router_output_count: int = 0
+        self.stdout_count: int = 0
+        self.stdout_total_bytes: int = 0
+        self.stderr_count: int = 0
+        self.stderr_total_bytes: int = 0
+        self.total_bytes: int = 0
 
 
 class TaskReporter:
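The ForceMultipartDict class referenced in the hunk header above is not shown in this diff. A plausible sketch of the trick it implements, per the linked requests issue: a dict that stays truthy even when empty, so passing it as files= forces a multipart/form-data request body even when there are no real files to attach:

```python
# Guess at ForceMultipartDict's shape; not shown in this diff.
class ForceMultipartDict(dict):
    def __bool__(self) -> bool:
        return True  # an empty dict still counts as "has files"


FORCE_MULTIPART = ForceMultipartDict()
```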
@@ -38,89 +41,39 @@ class TaskReporter:
     ):
         self._base_url = base_url
         self._executor_id = executor_id
-        self._client = get_httpx_client(config_path)
-
-    def report_task_outcome(self, completed_task: CompletedTask):
-        report = ReportingData()
-        fn_outputs = []
-
-        if completed_task.function_output:
-            for output in completed_task.function_output.outputs or []:
-                payload = output.bytes if output.HasField("bytes") else output.string
-                fn_outputs.append(
-                    (
-                        "node_outputs",
-                        (nanoid.generate(), payload, output.content_type),
-                    )
-                )
-                report.output_count += 1
-                report.output_total_bytes += len(payload)
-
-        if completed_task.stdout:
-            fn_outputs.append(
-                (
-                    "stdout",
-                    (
-                        nanoid.generate(),
-                        completed_task.stdout.encode(),
-                        UTF_8_CONTENT_TYPE,
-                    ),
-                )
-            )
-            report.stdout_count += 1
-            report.stdout_total_bytes += len(completed_task.stdout)
-
-        if completed_task.stderr:
-            fn_outputs.append(
-                (
-                    "stderr",
-                    (
-                        nanoid.generate(),
-                        completed_task.stderr.encode(),
-                        UTF_8_CONTENT_TYPE,
-                    ),
-                )
-            )
-            report.stderr_count += 1
-            report.stderr_total_bytes += len(completed_task.stderr)
-
-        router_output = (
-            ApiRouterOutput(edges=completed_task.router_output.edges)
-            if completed_task.router_output
-            else None
-        )
+        # Use thread-safe sync client due to issues with async client.
+        # Async client attempts to use connections it already closed.
+        # See e.g. https://github.com/encode/httpx/issues/2337.
+        # Creating a new async client for each request fixes this but it
+        # results in not reusing established TCP connections to server.
+        self._client = get_httpx_client(config_path, make_async=False)
+
+    async def report(
+        self, task: Task, output: Optional[FunctionWorkerOutput], logger: Any
+    ):
+        """Reports result of the supplied task.
 
-        task_result = TaskResult(
-            router_output=router_output,
-            outcome=completed_task.task_outcome,
-            namespace=completed_task.task.namespace,
-            compute_graph=completed_task.task.compute_graph,
-            compute_fn=completed_task.task.compute_fn,
-            invocation_id=completed_task.task.invocation_id,
-            executor_id=self._executor_id,
-            task_id=completed_task.task.id,
-            reducer=completed_task.reducer,
+        If FunctionWorkerOutput is None this means that the task didn't finish and failed with internal error.
+        """
+        logger = logger.bind(module=__name__)
+        task_result, output_files, output_summary = self._process_task_output(
+            task, output
         )
         task_result_data = task_result.model_dump_json(exclude_none=True)
 
-        total_bytes = (
-            report.output_total_bytes
-            + report.stdout_total_bytes
-            + report.stderr_total_bytes
-        )
-
         logger.info(
             "reporting task outcome",
-            task_id=completed_task.task.id,
-            retries=completed_task.reporting_retries,
-            total_bytes=total_bytes,
-            total_files=report.output_count + report.stdout_count + report.stderr_count,
-            output_files=report.output_count,
-            output_bytes=total_bytes,
-            stdout_bytes=report.stdout_total_bytes,
-            stderr_bytes=report.stderr_total_bytes,
+            total_bytes=output_summary.total_bytes,
+            total_files=output_summary.output_count
+            + output_summary.stdout_count
+            + output_summary.stderr_count,
+            output_files=output_summary.output_count,
+            output_bytes=output_summary.total_bytes,
+            router_output_count=output_summary.router_output_count,
+            stdout_bytes=output_summary.stdout_total_bytes,
+            stderr_bytes=output_summary.stderr_total_bytes,
         )
-        #
+
         kwargs = {
             "data": {"task_result": task_result_data},
             # Use httpx default timeout of 5s for all timeout types.
@@ -129,27 +82,134 @@ class TaskReporter:
                 5.0,
                 read=5.0 * 60,
             ),
+            "files": output_files if len(output_files) > 0 else FORCE_MULTIPART,
         }
-        if fn_outputs and len(fn_outputs) > 0:
-            kwargs["files"] = fn_outputs
-        else:
-            kwargs["files"] = FORCE_MULTIPART
-
-        response = self._client.post(
-            url=f"{self._base_url}/internal/ingest_files",
-            **kwargs,
+        # Run in a separate thread to not block the main event loop.
+        response = await asyncio.to_thread(
+            self._client.post, url=f"{self._base_url}/internal/ingest_files", **kwargs
         )
 
        try:
            response.raise_for_status()
        except Exception as e:
            # Caller catches and logs the exception.
-            # Log response details here for easier debugging.
-            logger.error(
-                "failed to report task outcome",
-                task_id=completed_task.task.id,
-                retries=completed_task.reporting_retries,
-                status_code=response.status_code,
-                response_text=response.text,
+            raise Exception(
+                "failed to report task outcome. "
+                f"Response code: {response.status_code}. "
+                f"Response text: '{response.text}'."
+            ) from e
+
+    def _process_task_output(
+        self, task: Task, output: Optional[FunctionWorkerOutput]
+    ) -> Tuple[TaskResult, List[Any], TaskOutputSummary]:
+        task_result = TaskResult(
+            outcome="failure",
+            namespace=task.namespace,
+            compute_graph=task.compute_graph,
+            compute_fn=task.compute_fn,
+            invocation_id=task.invocation_id,
+            executor_id=self._executor_id,
+            task_id=task.id,
+        )
+        output_files: List[Any] = []
+        summary: TaskOutputSummary = TaskOutputSummary()
+        if output is None:
+            return task_result, output_files, summary
+
+        task_result.outcome = "success" if output.success else "failure"
+        task_result.reducer = output.reducer
+
+        _process_function_output(
+            function_output=output.function_output,
+            output_files=output_files,
+            summary=summary,
+        )
+        _process_router_output(
+            router_output=output.router_output, task_result=task_result, summary=summary
+        )
+        _process_stdout(
+            stdout=output.stdout, output_files=output_files, summary=summary
+        )
+        _process_stderr(
+            stderr=output.stderr, output_files=output_files, summary=summary
+        )
+
+        summary.total_bytes = (
+            summary.output_total_bytes
+            + summary.stdout_total_bytes
+            + summary.stderr_total_bytes
+        )
+
+        return task_result, output_files, summary
+
+
+def _process_function_output(
+    function_output: Optional[FunctionOutput],
+    output_files: List[Any],
+    summary: TaskOutputSummary,
+) -> None:
+    if function_output is None:
+        return
+
+    for output in function_output.outputs or []:
+        payload = output.bytes if output.HasField("bytes") else output.string
+        output_files.append(
+            (
+                "node_outputs",
+                (nanoid.generate(), payload, output.content_type),
             )
-            raise e
+        )
+        summary.output_count += 1
+        summary.output_total_bytes += len(payload)
+
+
+def _process_router_output(
+    router_output: Optional[RouterOutput],
+    task_result: TaskResult,
+    summary: TaskOutputSummary,
+) -> None:
+    if router_output is None:
+        return
+
+    task_result.router_output = RouterOutput(edges=router_output.edges)
+    summary.router_output_count += 1
+
+
+def _process_stdout(
+    stdout: Optional[str], output_files: List[Any], summary: TaskOutputSummary
+) -> None:
+    if stdout is None:
+        return
+
+    output_files.append(
+        (
+            "stdout",
+            (
+                nanoid.generate(),
+                stdout.encode(),
+                UTF_8_CONTENT_TYPE,
+            ),
+        )
+    )
+    summary.stdout_count += 1
+    summary.stdout_total_bytes += len(stdout)
+
+
+def _process_stderr(
+    stderr: Optional[str], output_files: List[Any], summary: TaskOutputSummary
+) -> None:
+    if stderr is None:
+        return
+
+    output_files.append(
+        (
+            "stderr",
+            (
+                nanoid.generate(),
+                stderr.encode(),
+                UTF_8_CONTENT_TYPE,
+            ),
+        )
+    )
+    summary.stderr_count += 1
+    summary.stderr_total_bytes += len(stderr)
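The comment in the new TaskReporter constructor explains the choice of a synchronous httpx client, and report() then wraps each blocking post in asyncio.to_thread. A standalone sketch of the same pattern, with a placeholder URL:

```python
# Minimal sketch: keep a thread-safe *sync* httpx client and hop to a
# worker thread for each call, so the event loop is never blocked while
# established TCP connections are still reused.
import asyncio

import httpx

client = httpx.Client()  # sync client, safe to share across threads


async def post_report(payload: dict) -> httpx.Response:
    # asyncio.to_thread runs the blocking call in the default thread pool.
    return await asyncio.to_thread(
        client.post, "http://localhost:8900/internal/ingest_files", json=payload
    )
```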
indexify/functions_sdk/graph.py CHANGED
@@ -37,7 +37,6 @@ from .indexify_functions import (
     IndexifyRouter,
     RouterCallResult,
 )
-from .local_cache import CacheAwareFunctionWrapper
 from .object_serializer import get_serializer
 
 RouterFn = Annotated[
@@ -86,7 +85,6 @@ class Graph:
 
         # Storage for local execution
         self._results: Dict[str, Dict[str, List[IndexifyData]]] = {}
-        self._cache = CacheAwareFunctionWrapper("./indexify_local_runner_cache")
         self._accumulator_values: Dict[str, IndexifyData] = {}
         self._local_graph_ctx: Optional[GraphInvocationContext] = None
 
{indexify-0.2.42.dist-info → indexify-0.2.43.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: indexify
-Version: 0.2.42
+Version: 0.2.43
 Summary: Python Client for Indexify
 Home-page: https://github.com/tensorlakeai/indexify
 License: Apache 2.0
@@ -25,7 +25,7 @@ Requires-Dist: pydantic (==2.10.2)
 Requires-Dist: pyyaml (>=6,<7)
 Requires-Dist: rich (>=13.9.2,<14.0.0)
 Requires-Dist: structlog (>=24.4.0,<25.0.0)
-Requires-Dist: typer (>=0.13.0,<0.14.0)
+Requires-Dist: typer (>=0.12,<0.13)
 Project-URL: Repository, https://github.com/tensorlakeai/indexify
 Description-Content-Type: text/markdown
 
{indexify-0.2.42.dist-info → indexify-0.2.43.dist-info}/RECORD RENAMED
@@ -1,24 +1,22 @@
 indexify/__init__.py,sha256=P0mvM8sbkeS2CjYzRYyzb42CnXGhyJXdz4FdmTBMSWM,697
-indexify/cli.py,sha256=a6KPM7uWE3kcbREP1IkkY3Z30vJ-9B_70r6j9-1msSk,10385
+indexify/cli.py,sha256=Y8mijSECm_s-ZLNcba-PIFrnAjB7KX7TVdw0q6KVD2I,10382
 indexify/common_util.py,sha256=LKS6yZ3yv8nF2J-KzisGIjqjTvCn7tLFifQJLT4gHRg,3529
 indexify/data_loaders/__init__.py,sha256=Y5NEuseTcYAICRiweYw5wBQ2m2YplbsY21I7df-rdi4,1339
 indexify/data_loaders/local_directory_loader.py,sha256=fCrgj5drnW71ZUdDDvcB1-VJjIs1w6Q8sEW0HSGSAiA,1247
 indexify/data_loaders/url_loader.py,sha256=32SERljcq1Xsi4RdLz2dgyk2TER5pQPTtXl3gUzwHbY,1533
 indexify/error.py,sha256=qAWr8R6AxPkjsxHSzXTc8zqYnNO_AjOqqYEPsQvF1Zs,238
-indexify/executor/agent.py,sha256=vPbGUN2E9N6fa7WDCpzUD4CzNwSnDMQ0_vs1d5R61GU,10902
 indexify/executor/api_objects.py,sha256=vp7aEjvfWL2j4nYDV0xLgLVZlGsbStMEu0nzrgJVq1A,741
 indexify/executor/downloader.py,sha256=oeiSGxIIn7uBTKT6hzyPUa-AlOUlqy7WQkinm7qpV0I,6442
-indexify/executor/executor_tasks.py,sha256=vxXK2pCSqu5_HaEPKj2VF9k8vgA1mUD8v4bx0TXGR-I,1570
+indexify/executor/executor.py,sha256=i9TnF899scLkUWCpomnDh_dwUDqtXgiPWvnwsF7DUEs,5272
 indexify/executor/function_executor/function_executor.py,sha256=0W6vCT-DBVPUMPoOXLVRVHZjVGGUeTDV_NWb-XLP2gM,1218
 indexify/executor/function_executor/function_executor_factory.py,sha256=23BU_i47Tz6xd9fAwsKH29lm261bZXfzYGRhSqeFtwY,900
 indexify/executor/function_executor/function_executor_map.py,sha256=HXeCjkRusumJisU74Iy6M4e13k8jeHk4Yq8PyODSsUc,3922
 indexify/executor/function_executor/process_function_executor.py,sha256=CdqFtvlvS_VIzVdbE3Wnzj_Kqfzhzf4X05avHIielEE,2150
 indexify/executor/function_executor/process_function_executor_factory.py,sha256=ognYRd5l10kVD2KAShfkd8vg-JElYctG77Pcx77-grM,3850
-indexify/executor/function_worker.py,sha256=JcVrDZPj6XE0c-SDyXMlS5ZMfr5FuByZxG2XoACHsoU,9803
+indexify/executor/function_worker.py,sha256=YfNcDvwlX5dvKF-4x6XkvjX4E0S-5WKZkmsLFaesapg,9669
 indexify/executor/runtime_probes.py,sha256=bo6Dq6AGZpJH099j0DHtVSDEH80tv3j9MXf3VXSx_p8,2182
-indexify/executor/task_fetcher.py,sha256=481Ta7UAlytDpf99wNGzcqaCpB7NGJitb1nn_rAE9Do,3017
-indexify/executor/task_reporter.py,sha256=jThbGHte8nBikBIiSFgjRUQ6o3wE3Ms2zyz1NaZWIUQ,5113
-indexify/executor/task_store.py,sha256=Gpsr7S6UuLhGiB4Vh-9A0-iU4Qgji5CYEzjwKWeETl4,4430
+indexify/executor/task_fetcher.py,sha256=qBuVxgjzjkTi_1_hyQqtBxetBxdL7TGMgI4FBwyTANE,3004
+indexify/executor/task_reporter.py,sha256=soCB31JNi0IZDgd0VRTm90X1Prd3_tHMGPqxccgGLKc,6869
 indexify/function_executor/function_executor_service.py,sha256=EwwHTPX-AVqbSl-BONWWy3Z6gowXnYj5tigGoFs1u0s,3694
 indexify/function_executor/handlers/run_function/function_inputs_loader.py,sha256=x2Lrzb6CsEWDWW5qG9etS7TyV7KVYfuFKEbzKVhYizo,1553
 indexify/function_executor/handlers/run_function/handler.py,sha256=eklicxgyExd9ScdDahQO9LaLNXPNBPAAI0KBt3Wb9-w,5636
@@ -33,12 +31,11 @@ indexify/function_executor/proto/function_executor_pb2_grpc.py,sha256=OIErF2wYWa
 indexify/function_executor/proto/message_validator.py,sha256=OKXPYgy5L9c-spnV9Zjv7PA_yxwzvykfhbYylYx8cwQ,1456
 indexify/function_executor/server.py,sha256=YEeKHR6qjNJyLFb8KNjcCYClBgVzdJNmKwdwmyJ-ybE,1136
 indexify/functions_sdk/data_objects.py,sha256=ZJ7B9b5OI7aieCWJFx18pV6tqScssIFtmth84i7nKTg,623
-indexify/functions_sdk/graph.py,sha256=WNaS5yGNKNLa55gH3LPIlp4fH9VpAsb7VAVkELozrZ0,13413
+indexify/functions_sdk/graph.py,sha256=cyaOHpyUVkUq5hyS85ojf6GN_tBGXQbwWlFQnktj1q0,13281
 indexify/functions_sdk/graph_definition.py,sha256=yQ7c7jpA8RB8Yt81D1i5R3LPT8fmLvN8bLuEHvS4P_o,1775
 indexify/functions_sdk/graph_validation.py,sha256=mN2Fcp91GIwFZEQP6z_qGqt4LkLM70SnI7AWBi4CmKQ,2509
 indexify/functions_sdk/image.py,sha256=wVDs5O5yshjSSZ1d-keyMp9zUrck2TazJlkVq94jMZk,1965
 indexify/functions_sdk/indexify_functions.py,sha256=USuEeZnbQTd6uUjv-ZAwdBrbR8gGrTD78oXjyW6DBbw,13376
-indexify/functions_sdk/local_cache.py,sha256=cNWF67zbhbTJe3g86hyLBy3Rqzs6dNvp2SjLazGZWvw,1348
 indexify/functions_sdk/object_serializer.py,sha256=R58ALsl2Lb87ii6km4D6hBBsqRs_CHNISxhUICE2d9o,1931
 indexify/functions_sdk/pipeline.py,sha256=KmxZE8eBFAQ4bbEcYURXXR26HSyoAT3O6iu9H38-OXE,974
 indexify/http_client.py,sha256=XvIi7z7h7PZomMKlL8WEhFIWAAPzFe2hrLQUz4QC_Zc,15392
@@ -46,8 +43,8 @@ indexify/logging.py,sha256=_TLnWT0RzABvExwTixjdpuxoSNUev4zeNV0K1VlEUmA,1099
 indexify/remote_graph.py,sha256=OzBucU4buR5UdTViNJvh1RfnOTmYA34Uel5qXnRsQsA,5006
 indexify/remote_pipeline.py,sha256=oqx57rSPszNS3DToXO_nf-CKqkCZWptm1u_p3orV_gQ,790
 indexify/settings.py,sha256=Ny59mzYI4gbXoK8hjx66a_men6ndbd1J1zCTcKOoyzg,50
-indexify-0.2.42.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-indexify-0.2.42.dist-info/METADATA,sha256=FuvJ1XvFCLmUCXltSb_9kPGGi-99T_T_T7x3br78GDA,6271
-indexify-0.2.42.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-indexify-0.2.42.dist-info/entry_points.txt,sha256=Pih7WV-XMpAzI5dEvROcpLr-ybVhd9Y-AtuzBKUdcDs,49
-indexify-0.2.42.dist-info/RECORD,,
+indexify-0.2.43.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+indexify-0.2.43.dist-info/METADATA,sha256=zMNCsHsP14BeoCqph91AkqHd5RBGMrczAL9Qdx9gknw,6267
+indexify-0.2.43.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+indexify-0.2.43.dist-info/entry_points.txt,sha256=Pih7WV-XMpAzI5dEvROcpLr-ybVhd9Y-AtuzBKUdcDs,49
+indexify-0.2.43.dist-info/RECORD,,
indexify/executor/agent.py DELETED
@@ -1,262 +0,0 @@
-import asyncio
-from pathlib import Path
-from typing import Dict, List, Optional
-
-import structlog
-
-from .downloader import Downloader
-from .executor_tasks import DownloadGraphTask, DownloadInputsTask, RunTask
-from .function_executor.process_function_executor_factory import (
-    ProcessFunctionExecutorFactory,
-)
-from .function_worker import (
-    FunctionWorker,
-    FunctionWorkerInput,
-    FunctionWorkerOutput,
-)
-from .task_fetcher import TaskFetcher
-from .task_reporter import TaskReporter
-from .task_store import CompletedTask, TaskStore
-
-logger = structlog.get_logger(module=__name__)
-
-
-class ExtractorAgent:
-    def __init__(
-        self,
-        executor_id: str,
-        code_path: Path,
-        server_addr: str = "localhost:8900",
-        development_mode: bool = False,
-        config_path: Optional[str] = None,
-        name_alias: Optional[str] = None,
-        image_hash: Optional[str] = None,
-    ):
-        self._config_path = config_path
-        protocol: str = "http"
-        if config_path:
-            logger.info("running the extractor with TLS enabled")
-            protocol = "https"
-
-        self._task_store: TaskStore = TaskStore()
-        self._function_worker = FunctionWorker(
-            function_executor_factory=ProcessFunctionExecutorFactory(
-                indexify_server_address=server_addr,
-                development_mode=development_mode,
-                config_path=config_path,
-            )
-        )
-        self._has_registered = False
-        self._server_addr = server_addr
-        self._base_url = f"{protocol}://{self._server_addr}"
-        self._code_path = code_path
-        self._downloader = Downloader(
-            code_path=code_path, base_url=self._base_url, config_path=config_path
-        )
-        self._task_fetcher = TaskFetcher(
-            protocol=protocol,
-            indexify_server_addr=self._server_addr,
-            executor_id=executor_id,
-            name_alias=name_alias,
-            image_hash=image_hash,
-            config_path=config_path,
-        )
-        self._task_reporter = TaskReporter(
-            base_url=self._base_url,
-            executor_id=executor_id,
-            config_path=self._config_path,
-        )
-
-    async def task_completion_reporter(self):
-        logger.info("starting task completion reporter")
-        # We should copy only the keys and not the values
-        while True:
-            outcomes = await self._task_store.task_outcomes()
-            for task_outcome in outcomes:
-                logger.info(
-                    "reporting_task_outcome",
-                    task_id=task_outcome.task.id,
-                    fn_name=task_outcome.task.compute_fn,
-                    num_outputs=(
-                        len(task_outcome.function_output.outputs)
-                        if task_outcome.function_output is not None
-                        else 0
-                    ),
-                    router_output=task_outcome.router_output,
-                    outcome=task_outcome.task_outcome,
-                    retries=task_outcome.reporting_retries,
-                )
-
-                try:
-                    # Send task outcome to the server
-                    self._task_reporter.report_task_outcome(completed_task=task_outcome)
-                except Exception as e:
-                    # The connection was dropped in the middle of the reporting, process, retry
-                    logger.error(
-                        "failed_to_report_task",
-                        task_id=task_outcome.task.id,
-                        exc_info=e,
-                        retries=task_outcome.reporting_retries,
-                    )
-                    task_outcome.reporting_retries += 1
-                    await asyncio.sleep(5)
-                    continue
-
-                self._task_store.mark_reported(task_id=task_outcome.task.id)
-
-    async def task_launcher(self):
-        async_tasks: List[asyncio.Task] = [
-            asyncio.create_task(
-                self._task_store.get_runnable_tasks(), name="get_runnable_tasks"
-            )
-        ]
-
-        while True:
-            done, pending = await asyncio.wait(
-                async_tasks, return_when=asyncio.FIRST_COMPLETED
-            )
-
-            async_tasks: List[asyncio.Task] = list(pending)
-            for async_task in done:
-                if async_task.get_name() == "get_runnable_tasks":
-                    if async_task.exception():
-                        logger.error(
-                            "task_launcher_error, failed to get runnable tasks",
-                            exc_info=async_task.exception(),
-                        )
-                        continue
-                    result: Dict[str, Task] = await async_task
-                    task: Task
-                    for _, task in result.items():
-                        async_tasks.append(
-                            DownloadGraphTask(
-                                function_worker_input=FunctionWorkerInput(task=task),
-                                downloader=self._downloader,
-                            )
-                        )
-                    async_tasks.append(
-                        asyncio.create_task(
-                            self._task_store.get_runnable_tasks(),
-                            name="get_runnable_tasks",
-                        )
-                    )
-                elif async_task.get_name() == "download_graph":
-                    if async_task.exception():
-                        logger.error(
-                            "task_launcher_error, failed to download graph",
-                            exc_info=async_task.exception(),
-                        )
-                        completed_task = CompletedTask(
-                            task=async_task.function_worker_input.task,
-                            task_outcome="failure",
-                        )
-                        self._task_store.complete(outcome=completed_task)
-                        continue
-                    async_task: DownloadGraphTask
-                    function_worker_input: FunctionWorkerInput = (
-                        async_task.function_worker_input
-                    )
-                    function_worker_input.graph = await async_task
-                    async_tasks.append(
-                        DownloadInputsTask(
-                            function_worker_input=function_worker_input,
-                            downloader=self._downloader,
-                        )
-                    )
-                elif async_task.get_name() == "download_inputs":
-                    if async_task.exception():
-                        logger.error(
-                            "task_launcher_error, failed to download inputs",
-                            exc_info=async_task.exception(),
-                        )
-                        completed_task = CompletedTask(
-                            task=async_task.function_worker_input.task,
-                            task_outcome="failure",
-                        )
-                        self._task_store.complete(outcome=completed_task)
-                        continue
-                    async_task: DownloadInputsTask
-                    function_worker_input: FunctionWorkerInput = (
-                        async_task.function_worker_input
-                    )
-                    function_worker_input.function_input = await async_task
-                    async_tasks.append(
-                        RunTask(
-                            function_worker=self._function_worker,
-                            function_worker_input=function_worker_input,
-                        )
-                    )
-                elif async_task.get_name() == "run_task":
-                    if async_task.exception():
-                        completed_task = CompletedTask(
-                            task=async_task.function_worker_input.task,
-                            task_outcome="failure",
-                            stderr=str(async_task.exception()),
-                        )
-                        self._task_store.complete(outcome=completed_task)
-                        continue
-                    async_task: RunTask
-                    try:
-                        outputs: FunctionWorkerOutput = await async_task
-                        if not outputs.success:
-                            task_outcome = "failure"
-                        else:
-                            task_outcome = "success"
-
-                        completed_task = CompletedTask(
-                            task=async_task.function_worker_input.task,
-                            task_outcome=task_outcome,
-                            function_output=outputs.function_output,
-                            router_output=outputs.router_output,
-                            stdout=outputs.stdout,
-                            stderr=outputs.stderr,
-                            reducer=outputs.reducer,
-                        )
-                        self._task_store.complete(outcome=completed_task)
-                    except Exception as e:
-                        logger.error(
-                            "failed to execute task",
-                            task_id=async_task.function_worker_input.task.id,
-                            exc_info=e,
-                        )
-                        completed_task = CompletedTask(
-                            task=async_task.function_worker_input.task,
-                            task_outcome="failure",
-                        )
-                        self._task_store.complete(outcome=completed_task)
-                        continue
-
-    async def _main_loop(self):
-        """Fetches incoming tasks from the server and starts their processing."""
-        self._should_run = True
-        while self._should_run:
-            try:
-                async for task in self._task_fetcher.run():
-                    self._task_store.add_tasks([task])
-            except Exception as e:
-                logger.error("failed fetching tasks, retrying in 5 seconds", exc_info=e)
-                await asyncio.sleep(5)
-                continue
-
-    async def run(self):
-        import signal
-
-        asyncio.get_event_loop().add_signal_handler(
-            signal.SIGINT, self.shutdown, asyncio.get_event_loop()
-        )
-        asyncio.get_event_loop().add_signal_handler(
-            signal.SIGTERM, self.shutdown, asyncio.get_event_loop()
-        )
-        asyncio.create_task(self.task_launcher())
-        asyncio.create_task(self.task_completion_reporter())
-        await self._main_loop()
-
-    async def _shutdown(self, loop):
-        logger.info("shutting_down")
-        self._should_run = False
-        await self._function_worker.shutdown()
-        for task in asyncio.all_tasks(loop):
-            task.cancel()
-
-    def shutdown(self, loop):
-        loop.create_task(self._shutdown(loop))
1
- import asyncio
2
-
3
- from pydantic import BaseModel
4
-
5
- from .api_objects import Task
6
- from .downloader import Downloader
7
- from .function_worker import FunctionWorker, FunctionWorkerInput
8
-
9
-
10
- class DownloadGraphTask(asyncio.Task):
11
- def __init__(
12
- self,
13
- *,
14
- function_worker_input: FunctionWorkerInput,
15
- downloader: Downloader,
16
- **kwargs,
17
- ):
18
- kwargs["name"] = "download_graph"
19
- kwargs["loop"] = asyncio.get_event_loop()
20
- super().__init__(
21
- downloader.download_graph(function_worker_input.task),
22
- **kwargs,
23
- )
24
- self.function_worker_input = function_worker_input
25
-
26
-
27
- class DownloadInputsTask(asyncio.Task):
28
- def __init__(
29
- self,
30
- *,
31
- function_worker_input: FunctionWorkerInput,
32
- downloader: Downloader,
33
- **kwargs,
34
- ):
35
- kwargs["name"] = "download_inputs"
36
- kwargs["loop"] = asyncio.get_event_loop()
37
- super().__init__(
38
- downloader.download_inputs(function_worker_input.task),
39
- **kwargs,
40
- )
41
- self.function_worker_input = function_worker_input
42
-
43
-
44
- class RunTask(asyncio.Task):
45
- def __init__(
46
- self,
47
- *,
48
- function_worker: FunctionWorker,
49
- function_worker_input: FunctionWorkerInput,
50
- **kwargs,
51
- ):
52
- kwargs["name"] = "run_task"
53
- kwargs["loop"] = asyncio.get_event_loop()
54
- super().__init__(
55
- function_worker.run(function_worker_input),
56
- **kwargs,
57
- )
58
- self.function_worker_input = function_worker_input
indexify/executor/task_store.py DELETED
@@ -1,132 +0,0 @@
-import asyncio
-from typing import Dict, List, Literal, Optional
-
-import structlog
-
-from indexify.function_executor.proto.function_executor_pb2 import (
-    FunctionOutput,
-    RouterOutput,
-)
-
-from .api_objects import Task
-
-logger = structlog.get_logger(module=__name__)
-
-
-class CompletedTask:
-    def __init__(
-        self,
-        task: Task,
-        task_outcome: Literal["success", "failure"],
-        function_output: Optional[FunctionOutput] = None,
-        router_output: Optional[RouterOutput] = None,
-        stdout: Optional[str] = None,
-        stderr: Optional[str] = None,
-        reducer: bool = False,
-        reporting_retries: int = 0,
-    ):
-        self.task = task
-        self.task_outcome = task_outcome
-        self.function_output = function_output
-        self.router_output = router_output
-        self.stdout = stdout
-        self.stderr = stderr
-        self.reducer = reducer
-        self.reporting_retries = reporting_retries
-
-
-class TaskStore:
-    def __init__(self) -> None:
-        self._tasks: Dict[str, Task] = {}
-        self._running_tasks: Dict[str, Task] = {}
-        self._finished: Dict[str, CompletedTask] = {}
-        self._retries: Dict[str, int] = {}
-        self._new_task_event = asyncio.Event()
-        self._finished_task_event = asyncio.Event()
-
-    def get_task(self, id) -> Task:
-        return self._tasks[id]
-
-    def add_tasks(self, tasks: List[Task]):
-        task: Task
-        for task in tasks:
-            if (
-                (task.id in self._tasks)
-                or (task.id in self._running_tasks)
-                or (task.id in self._finished)
-            ):
-                continue
-            logger.info(
-                "added task",
-                task_id=task.id,
-                namespace=task.namespace,
-                graph=task.compute_graph,
-                fn=task.compute_fn,
-            )
-            self._tasks[task.id] = task
-            self._new_task_event.set()
-
-    async def get_runnable_tasks(self) -> Dict[str, Task]:
-        while True:
-            runnable_tasks = set(self._tasks) - set(self._running_tasks)
-            runnable_tasks = set(runnable_tasks) - set(self._finished)
-            if len(runnable_tasks) == 0:
-                await self._new_task_event.wait()
-                self._new_task_event.clear()
-            else:
-                break
-        out = {}
-        for task_id in runnable_tasks:
-            out[task_id] = self._tasks[task_id]
-            self._running_tasks[task_id] = self._tasks[task_id]
-        return out
-
-    def complete(self, outcome: CompletedTask):
-        self._retries.pop(outcome.task.id, None)
-        self._finished[outcome.task.id] = outcome
-        if outcome.task.id in self._running_tasks:
-            self._running_tasks.pop(outcome.task.id)
-        self._finished_task_event.set()
-
-    def retriable_failure(self, task_id: str):
-        self._running_tasks.pop(task_id)
-        if task_id not in self._retries:
-            self._retries[task_id] = 0
-        self._retries[task_id] += 1
-        if self._retries[task_id] > 3:
-            self._retries.pop(task_id)
-            self.complete(
-                outcome=CompletedTask(
-                    task_id=task_id, task_outcome="failed", outputs=[]
-                )
-            )
-        else:
-            self._new_task_event.set()
-
-    def mark_reported(self, task_id: str):
-        self._tasks.pop(task_id)
-        self._finished.pop(task_id)
-        logger.info("removed task", task_id=task_id)
-
-    def report_failed(self, task_id: str):
-        if self._finished[task_id].task_outcome != "Failed":
-            # An error occurred while reporting the task, mark it as failed
-            # and try reporting again.
-            self._finished[task_id].task_outcome = "Failed"
-        else:
-            # If a task is already marked as failed, remove it from the queue.
-            # The only possible error at this point is task not present at
-            # the coordinator.
-            self._tasks.pop(task_id)
-
-    def num_pending_tasks(self) -> int:
-        return len(self._tasks) + len(self._running_tasks)
-
-    async def task_outcomes(self) -> List[CompletedTask]:
-        while True:
-            if len(self._finished) == 0:
-                await self._finished_task_event.wait()
-                self._finished_task_event.clear()
-            else:
-                break
-        return self._finished.copy().values()
indexify/functions_sdk/local_cache.py DELETED
@@ -1,46 +0,0 @@
-import os
-from hashlib import sha256
-from typing import List, Optional
-
-
-class CacheAwareFunctionWrapper:
-    def __init__(self, cache_dir: str):
-        self._cache_dir = cache_dir
-        if not os.path.exists(cache_dir):
-            os.makedirs(cache_dir)
-
-    def _get_key(self, input: bytes) -> str:
-        h = sha256()
-        h.update(input)
-        return h.hexdigest()
-
-    def get(self, graph: str, node_name: str, input: bytes) -> Optional[List[bytes]]:
-        key = self._get_key(input)
-        dir_path = os.path.join(self._cache_dir, graph, node_name, key)
-        if not os.path.exists(dir_path):
-            return None
-
-        files = os.listdir(dir_path)
-        outputs = []
-        for file in files:
-            with open(os.path.join(dir_path, file), "rb") as f:
-                return f.read()
-
-        return outputs
-
-    def set(
-        self,
-        graph: str,
-        node_name: str,
-        input: bytes,
-        output: List[bytes],
-    ):
-        key = self._get_key(input)
-        dir_path = os.path.join(self._cache_dir, graph, node_name, key)
-        if not os.path.exists(dir_path):
-            os.makedirs(dir_path)
-
-        for i, output_item in enumerate(output):
-            file_path = os.path.join(dir_path, f"{i}.cbor")
-            with open(file_path, "wb") as f:
-                f.write(output_item)