indexify 0.0.43__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. indexify/__init__.py +15 -14
  2. indexify/base_client.py +48 -21
  3. indexify/cli.py +247 -0
  4. indexify/client.py +18 -790
  5. indexify/error.py +3 -30
  6. indexify/executor/agent.py +364 -0
  7. indexify/executor/api_objects.py +43 -0
  8. indexify/executor/downloader.py +124 -0
  9. indexify/executor/executor_tasks.py +72 -0
  10. indexify/executor/function_worker.py +177 -0
  11. indexify/executor/indexify_executor.py +32 -0
  12. indexify/executor/runtime_probes.py +48 -0
  13. indexify/executor/task_reporter.py +110 -0
  14. indexify/executor/task_store.py +113 -0
  15. indexify/foo +72 -0
  16. indexify/functions_sdk/data_objects.py +37 -0
  17. indexify/functions_sdk/graph.py +281 -0
  18. indexify/functions_sdk/graph_validation.py +66 -0
  19. indexify/functions_sdk/image.py +34 -0
  20. indexify/functions_sdk/indexify_functions.py +188 -0
  21. indexify/functions_sdk/local_cache.py +46 -0
  22. indexify/functions_sdk/object_serializer.py +60 -0
  23. indexify/local_client.py +183 -0
  24. indexify/remote_client.py +319 -0
  25. indexify-0.2.1.dist-info/METADATA +151 -0
  26. indexify-0.2.1.dist-info/RECORD +33 -0
  27. indexify-0.2.1.dist-info/entry_points.txt +3 -0
  28. indexify/exceptions.py +0 -3
  29. indexify/extraction_policy.py +0 -75
  30. indexify/extractor_sdk/__init__.py +0 -14
  31. indexify/extractor_sdk/data.py +0 -100
  32. indexify/extractor_sdk/extractor.py +0 -225
  33. indexify/extractor_sdk/utils.py +0 -102
  34. indexify/extractors/__init__.py +0 -0
  35. indexify/extractors/embedding.py +0 -55
  36. indexify/extractors/pdf_parser.py +0 -93
  37. indexify/graph.py +0 -133
  38. indexify/local_runner.py +0 -128
  39. indexify/runner.py +0 -22
  40. indexify/utils.py +0 -7
  41. indexify-0.0.43.dist-info/METADATA +0 -66
  42. indexify-0.0.43.dist-info/RECORD +0 -25
  43. {indexify-0.0.43.dist-info → indexify-0.2.1.dist-info}/LICENSE.txt +0 -0
  44. {indexify-0.0.43.dist-info → indexify-0.2.1.dist-info}/WHEEL +0 -0
indexify/error.py CHANGED
@@ -1,30 +1,3 @@
- class Error(Exception):
-     status: str
-     message: str
-
-     def __init__(self, status: str, message: str):
-         self.status = status
-         self.message = message
-
-     @staticmethod
-     def from_tonic_error_string(url: str, error: str) -> "Error":
-         data = error.split(", ")
-
-         message = data[1].split(": ", 1)[1]
-         if message.startswith('"') and message.endswith('"'):
-             message = message[1:-1]
-
-         status = "GeneralError"
-         if "extraction_graph" in url:
-             status = "ExtractionGraphError"
-         elif "search" in url:
-             status = "SearchError"
-
-         error = Error(status, message)
-         return error
-
-     def __str__(self):
-         return f"{self.status} | {self.message.capitalize()}"
-
-     def __repr__(self):
-         return f"Error(status={self.status!r}, message={self.message!r})"
+ class ApiException(Exception):
+     def __init__(self, message: str) -> None:
+         super().__init__(message)
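The replacement exception is just a message-carrying subclass of Exception, dropping the old status/message split and the tonic-string parsing. A minimal usage sketch, assuming only what the new error.py shows (the calling function and its failure check are hypothetical):

    from indexify.error import ApiException

    def create_namespace(name: str) -> None:
        # Hypothetical caller: translate a failed server call into the SDK's exception type.
        ok = False  # stand-in for the result of a real HTTP request
        if not ok:
            raise ApiException(f"failed to create namespace: {name}")

    try:
        create_namespace("demo")
    except ApiException as e:
        print(e)  # prints exactly the message passed to the constructor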
indexify/executor/agent.py ADDED
@@ -0,0 +1,364 @@
+ import asyncio
+ import json
+ import ssl
+ from concurrent.futures.process import BrokenProcessPool
+ from typing import Dict, List, Optional
+
+ import httpx
+ import yaml
+ from httpx_sse import aconnect_sse
+ from pydantic import BaseModel
+ from rich.console import Console
+ from rich.panel import Panel
+ from rich.text import Text
+ from rich.theme import Theme
+
+ from indexify.functions_sdk.data_objects import (
+     FunctionWorkerOutput,
+     IndexifyData,
+     RouterOutput,
+ )
+
+ from .api_objects import ExecutorMetadata, Task
+ from .downloader import DownloadedInputs, Downloader
+ from .executor_tasks import DownloadGraphTask, DownloadInputTask, ExtractTask
+ from .function_worker import FunctionWorker
+ from .runtime_probes import ProbeInfo, RuntimeProbes
+ from .task_reporter import TaskReporter
+ from .task_store import CompletedTask, TaskStore
+
+ custom_theme = Theme(
+     {
+         "info": "cyan",
+         "warning": "yellow",
+         "error": "red",
+         "success": "green",
+     }
+ )
+
+ console = Console(theme=custom_theme)
+
+
+ class FunctionInput(BaseModel):
+     task_id: str
+     namespace: str
+     compute_graph: str
+     function: str
+     input: IndexifyData
+     init_value: Optional[IndexifyData] = None
+
+
+ class ExtractorAgent:
+     def __init__(
+         self,
+         executor_id: str,
+         num_workers,
+         code_path: str,
+         function_worker: FunctionWorker,
+         server_addr: str = "localhost:8900",
+         config_path: Optional[str] = None,
+     ):
+         self.num_workers = num_workers
+         self._use_tls = False
+         if config_path:
+             with open(config_path, "r") as f:
+                 config = yaml.safe_load(f)
+                 self._config = config
+             if config.get("use_tls", False):
+                 console.print(
+                     "Running the extractor with TLS enabled", style="cyan bold"
+                 )
+                 self._use_tls = True
+                 tls_config = config["tls_config"]
+                 self._ssl_context = ssl.create_default_context(
+                     ssl.Purpose.SERVER_AUTH, cafile=tls_config["ca_bundle_path"]
+                 )
+                 self._ssl_context.load_cert_chain(
+                     certfile=tls_config["cert_path"], keyfile=tls_config["key_path"]
+                 )
+                 self._protocol = "wss"
+                 self._tls_config = tls_config
+             else:
+                 self._ssl_context = None
+                 self._protocol = "ws"
+         else:
+             self._ssl_context = None
+             self._protocol = "http"
+             self._config = {}
+
+         self._task_store: TaskStore = TaskStore()
+         self._executor_id = executor_id
+         self._function_worker = function_worker
+         self._has_registered = False
+         self._server_addr = server_addr
+         self._base_url = f"{self._protocol}://{self._server_addr}"
+         self._code_path = code_path
+         self._downloader = Downloader(code_path=code_path, base_url=self._base_url)
+         self._max_queued_tasks = 10
+         self._task_reporter = TaskReporter(
+             base_url=self._base_url, executor_id=self._executor_id
+         )
+         self._probe = RuntimeProbes()
+
+     async def task_completion_reporter(self):
+         console.print(Text("Starting task completion reporter", style="bold cyan"))
+         # We should copy only the keys and not the values
+         url = f"{self._protocol}://{self._server_addr}/write_content"
+         while True:
+             outcomes = await self._task_store.task_outcomes()
+             for task_outcome in outcomes:
+                 outcome = task_outcome.task_outcome
+                 style_outcome = (
+                     f"[bold red] {outcome} [/]"
+                     if "fail" in outcome
+                     else f"[bold green] {outcome} [/]"
+                 )
+                 console.print(
+                     Panel(
+                         f"Reporting outcome of task {task_outcome.task.id}\n"
+                         f"Outcome: {style_outcome}\n"
+                         f"Outputs: {len(task_outcome.outputs or [])} Router Output: {task_outcome.router_output}",
+                         title="Task Completion",
+                         border_style="info",
+                     )
+                 )
+
+                 try:
+                     # Send task outcome to the server
+                     self._task_reporter.report_task_outcome(completed_task=task_outcome)
+                 except Exception as e:
+                     # The connection was dropped in the middle of the reporting, process, retry
+                     console.print(
+                         Panel(
+                             f"Failed to report task {task_outcome.task.id}\n"
+                             f"Exception: {e}\nRetrying...",
+                             title="Reporting Error",
+                             border_style="error",
+                         )
+                     )
+                     await asyncio.sleep(5)
+                     continue
+
+                 self._task_store.mark_reported(task_id=task_outcome.task.id)
+
+     async def task_launcher(self):
+         async_tasks: List[asyncio.Task] = []
+         fn_queue: List[FunctionInput] = []
+         async_tasks.append(
+             asyncio.create_task(
+                 self._task_store.get_runnable_tasks(), name="get_runnable_tasks"
+             )
+         )
+         while True:
+             fn: FunctionInput
+             for fn in fn_queue:
+                 task: Task = self._task_store.get_task(fn.task_id)
+                 async_tasks.append(
+                     ExtractTask(
+                         function_worker=self._function_worker,
+                         task=task,
+                         input=fn.input,
+                         code_path=f"{self._code_path}/{task.namespace}/{task.compute_graph}.{task.graph_version}",
+                         init_value=fn.init_value,
+                     )
+                 )
+
+             fn_queue = []
+             done, pending = await asyncio.wait(
+                 async_tasks, return_when=asyncio.FIRST_COMPLETED
+             )
+
+             async_tasks: List[asyncio.Task] = list(pending)
+             for async_task in done:
+                 if async_task.get_name() == "get_runnable_tasks":
+                     if async_task.exception():
+                         console.print(
+                             Text("Task Launcher Error: ", style="red bold")
+                             + Text(
+                                 f"Failed to get runnable tasks: {async_task.exception()}",
+                                 style="red",
+                             )
+                         )
+                         continue
+                     result: Dict[str, Task] = await async_task
+                     task: Task
+                     for _, task in result.items():
+                         async_tasks.append(
+                             DownloadGraphTask(task=task, downloader=self._downloader)
+                         )
+                     async_tasks.append(
+                         asyncio.create_task(
+                             self._task_store.get_runnable_tasks(),
+                             name="get_runnable_tasks",
+                         )
+                     )
+                 elif async_task.get_name() == "download_graph":
+                     if async_task.exception():
+                         console.print(
+                             Text(
+                                 f"Failed to download graph for task {async_task.task.id}\n",
+                                 style="red bold",
+                             )
+                             + Text(f"Exception: {async_task.exception()}", style="red")
+                         )
+                         completed_task = CompletedTask(
+                             task=async_task.task,
+                             outputs=[],
+                             task_outcome="failure",
+                         )
+                         self._task_store.complete(outcome=completed_task)
+                         continue
+                     async_tasks.append(
+                         DownloadInputTask(
+                             task=async_task.task, downloader=self._downloader
+                         )
+                     )
+                 elif async_task.get_name() == "download_input":
+                     if async_task.exception():
+                         console.print(
+                             Text(
+                                 f"Failed to download input for task {async_task.task.id}\n",
+                                 style="red bold",
+                             )
+                             + Text(f"Exception: {async_task.exception()}", style="red")
+                         )
+                         completed_task = CompletedTask(
+                             task=async_task.task,
+                             outputs=[],
+                             task_outcome="failure",
+                         )
+                         self._task_store.complete(outcome=completed_task)
+                         continue
+                     downloaded_inputs: DownloadedInputs = await async_task
+                     task: Task = async_task.task
+                     fn_queue.append(
+                         FunctionInput(
+                             task_id=task.id,
+                             namespace=task.namespace,
+                             compute_graph=task.compute_graph,
+                             function=task.compute_fn,
+                             input=downloaded_inputs.input,
+                             init_value=downloaded_inputs.init_value,
+                         )
+                     )
+                 elif async_task.get_name() == "run_function":
+                     if async_task.exception():
+                         completed_task = CompletedTask(
+                             task=async_task.task,
+                             task_outcome="failure",
+                             outputs=[],
+                             errors=str(async_task.exception()),
+                         )
+                         self._task_store.complete(outcome=completed_task)
+                         continue
+                     async_task: ExtractTask
+                     try:
+                         outputs: FunctionWorkerOutput = await async_task
+                         if not outputs.success:
+                             task_outcome = "failure"
+                         else:
+                             task_outcome = "success"
+
+                         completed_task = CompletedTask(
+                             task=async_task.task,
+                             task_outcome=task_outcome,
+                             outputs=outputs.fn_outputs,
+                             router_output=outputs.router_output,
+                             errors=outputs.exception,
+                             stdout=outputs.stdout,
+                             stderr=outputs.stderr,
+                             reducer=outputs.reducer,
+                         )
+                         self._task_store.complete(outcome=completed_task)
+                     except BrokenProcessPool:
+                         self._task_store.retriable_failure(async_task.task.id)
+                         continue
+                     except Exception as e:
+                         console.print(
+                             Text(
+                                 f"Failed to execute task {async_task.task.id}\n",
+                                 style="red bold",
+                             )
+                             + Text(f"Exception: {e}", style="red")
+                         )
+                         completed_task = CompletedTask(
+                             task=async_task.task,
+                             task_outcome="failure",
+                             outputs=[],
+                         )
+                         self._task_store.complete(outcome=completed_task)
+                         continue
+
+     async def run(self):
+         import signal
+
+         asyncio.get_event_loop().add_signal_handler(
+             signal.SIGINT, self.shutdown, asyncio.get_event_loop()
+         )
+         asyncio.create_task(self.task_launcher())
+         asyncio.create_task(self.task_completion_reporter())
+         self._should_run = True
+         while self._should_run:
+             self._protocol = "http"
+             url = f"{self._protocol}://{self._server_addr}/internal/executors/{self._executor_id}/tasks"
+
+             def to_sentence_case(snake_str):
+                 words = snake_str.split("_")
+                 return words[0].capitalize() + "" + " ".join(words[1:])
+
+             runtime_probe: ProbeInfo = self._probe.probe()
+             data = ExecutorMetadata(
+                 id=self._executor_id,
+                 addr="",
+                 image_name=runtime_probe.image_name,
+                 labels=runtime_probe.labels,
+             ).model_dump()
+
+             panel_content = "\n".join(
+                 [f"{to_sentence_case(key)}: {value}" for key, value in data.items()]
+             )
+             console.print(
+                 Panel(
+                     panel_content,
+                     title="attempting to Register Executor",
+                     border_style="cyan",
+                 )
+             )
+
+             try:
+                 async with httpx.AsyncClient() as client:
+                     async with aconnect_sse(
+                         client,
+                         "POST",
+                         url,
+                         json=data,
+                         headers={"Content-Type": "application/json"},
+                     ) as event_source:
+                         console.print(
+                             Text("executor registered successfully", style="bold green")
+                         )
+                         async for sse in event_source.aiter_sse():
+                             data = json.loads(sse.data)
+                             tasks = []
+                             for task_dict in data:
+                                 tasks.append(
+                                     Task.model_validate(task_dict, strict=False)
+                                 )
+                             self._task_store.add_tasks(tasks)
+             except Exception as e:
+                 console.print(
+                     Text("registration Error: ", style="red bold")
+                     + Text(f"failed to register: {e}", style="red")
+                 )
+                 await asyncio.sleep(5)
+                 continue
+
+     async def _shutdown(self, loop):
+         console.print(Text("shutting down agent...", style="bold yellow"))
+         self._should_run = False
+         for task in asyncio.all_tasks(loop):
+             task.cancel()
+
+     def shutdown(self, loop):
+         self._function_worker.shutdown()
+         loop.create_task(self._shutdown(loop))
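The agent wires the task store, downloader, function worker, and reporter together: run() registers the executor with the server over SSE and streams tasks into the store, while task_launcher() and task_completion_reporter() drain it. A rough sketch of how it might be driven, assuming FunctionWorker accepts a workers keyword (its constructor is not shown in this section; the real entry point presumably lives in indexify/cli.py, which is also added in this release):

    import asyncio

    from indexify.executor.agent import ExtractorAgent
    from indexify.executor.function_worker import FunctionWorker

    def main() -> None:
        # Assumed FunctionWorker constructor signature; check function_worker.py in this release.
        worker = FunctionWorker(workers=1)
        agent = ExtractorAgent(
            executor_id="executor-1",
            num_workers=1,
            code_path="/tmp/indexify/code",
            function_worker=worker,
            server_addr="localhost:8900",
        )
        # run() installs its own SIGINT handler and loops until shutdown() is called.
        asyncio.get_event_loop().run_until_complete(agent.run())

    if __name__ == "__main__":
        main()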
indexify/executor/api_objects.py ADDED
@@ -0,0 +1,43 @@
+ from typing import Any, Dict, List, Optional
+
+ from pydantic import BaseModel, Json
+
+ from indexify.functions_sdk.data_objects import IndexifyData
+
+
+ class Task(BaseModel):
+     id: str
+     namespace: str
+     compute_graph: str
+     compute_fn: str
+     invocation_id: str
+     input_key: str
+     reducer_output_id: Optional[str] = None
+     graph_version: int
+
+
+ class ExecutorMetadata(BaseModel):
+     id: str
+     addr: str
+     image_name: str
+     labels: Dict[str, Any]
+
+
+ class RouterOutput(BaseModel):
+     edges: List[str]
+
+
+ class FnOutput(BaseModel):
+     payload: Json
+
+
+ class TaskResult(BaseModel):
+     router_output: Optional[RouterOutput] = None
+     outcome: str
+     namespace: str
+     compute_graph: str
+     compute_fn: str
+     invocation_id: str
+     executor_id: str
+     task_id: str
+     reducer: bool = False
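These are the Pydantic v2 models the agent uses when it validates SSE payloads and serializes its registration metadata. A small sketch of that usage; the field values below are made up, only the model shapes come from this file:

    import json

    from indexify.executor.api_objects import ExecutorMetadata, Task

    # Illustrative SSE payload for a single task; the shape follows the Task model above.
    raw = json.loads(
        '{"id": "t-1", "namespace": "default", "compute_graph": "graph", '
        '"compute_fn": "fn", "invocation_id": "inv-1", "input_key": "inv-1", '
        '"graph_version": 1}'
    )
    task = Task.model_validate(raw, strict=False)

    # Registration metadata is dumped to a plain dict before being POSTed to the server.
    meta = ExecutorMetadata(id="executor-1", addr="", image_name="indexify-executor", labels={})
    print(task.compute_graph, meta.model_dump())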
indexify/executor/downloader.py ADDED
@@ -0,0 +1,124 @@
+ import os
+ from typing import Optional
+
+ import httpx
+ from pydantic import BaseModel
+ from rich.console import Console
+ from rich.panel import Panel
+ from rich.theme import Theme
+
+ from indexify.functions_sdk.data_objects import IndexifyData
+ from indexify.functions_sdk.object_serializer import MsgPackSerializer
+
+ from .api_objects import Task
+
+ custom_theme = Theme(
+     {
+         "info": "cyan",
+         "warning": "yellow",
+         "error": "red",
+     }
+ )
+
+ console = Console(theme=custom_theme)
+
+
+ class DownloadedInputs(BaseModel):
+     input: IndexifyData
+     init_value: Optional[IndexifyData] = None
+
+
+ class Downloader:
+     def __init__(self, code_path: str, base_url: str):
+         self.code_path = code_path
+         self.base_url = base_url
+
+     async def download_graph(self, namespace: str, name: str, version: int) -> str:
+         path = os.path.join(self.code_path, namespace, f"{name}.{version}")
+         if os.path.exists(path):
+             return path
+
+         console.print(
+             Panel(
+                 f"Downloading graph: {name}\nPath: {path}",
+                 title="downloader",
+                 border_style="cyan",
+             )
+         )
+
+         response = httpx.get(
+             f"{self.base_url}/internal/namespaces/{namespace}/compute_graphs/{name}/code"
+         )
+         try:
+             response.raise_for_status()
+         except httpx.HTTPStatusError as e:
+             console.print(
+                 Panel(
+                     f"Failed to download graph: {name}\nError: {response.text}",
+                     title="downloader error",
+                     border_style="error",
+                 )
+             )
+             raise
+
+         os.makedirs(os.path.dirname(path), exist_ok=True)
+         with open(path, "wb") as f:
+             f.write(response.content)
+         return path
+
+     async def download_input(self, task: Task) -> IndexifyData:
+         input_id = task.input_key.split("|")[-1]
+         if task.invocation_id == input_id:
+             url = f"{self.base_url}/namespaces/{task.namespace}/compute_graphs/{task.compute_graph}/invocations/{task.invocation_id}/payload"
+         else:
+             url = f"{self.base_url}/internal/fn_outputs/{task.input_key}"
+
+         reducer_url = None
+         if task.reducer_output_id:
+             reducer_url = f"{self.base_url}/namespaces/{task.namespace}/compute_graphs/{task.compute_graph}/invocations/{task.invocation_id}/fn/{task.compute_fn}/{task.reducer_output_id}"
+
+         console.print(
+             Panel(
+                 f"downloading input\nURL: {url} \n reducer input URL: {reducer_url}",
+                 title="downloader",
+                 border_style="cyan",
+             )
+         )
+
+         response = httpx.get(url)
+         try:
+             response.raise_for_status()
+         except httpx.HTTPStatusError as e:
+             console.print(
+                 Panel(
+                     f"failed to download input: {task.input_key}\nError: {response.text}",
+                     title="downloader error",
+                     border_style="error",
+                 )
+             )
+             raise
+
+         if task.invocation_id == input_id:
+             return DownloadedInputs(
+                 input=IndexifyData(payload=response.content, id=input_id)
+             )
+
+         init_value = None
+         if reducer_url:
+             init_value = httpx.get(reducer_url)
+             try:
+                 init_value.raise_for_status()
+             except httpx.HTTPStatusError as e:
+                 console.print(
+                     Panel(
+                         f"failed to download reducer output: {task.reducer_output_id}\nError: {init_value.text}",
+                         title="downloader error",
+                         border_style="error",
+                     )
+                 )
+                 raise
+             init_value = MsgPackSerializer.deserialize(init_value.content)
+
+         return DownloadedInputs(
+             input=MsgPackSerializer.deserialize(response.content), init_value=init_value
+         )
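The downloader caches graph code under code_path and fetches task inputs (plus an optional reducer init value) from the server endpoints shown above. A minimal sketch of calling it directly; the code path and server address are placeholders, not values taken from this diff:

    from indexify.executor.api_objects import Task
    from indexify.executor.downloader import Downloader

    async def fetch(task: Task) -> None:
        downloader = Downloader(code_path="/tmp/indexify/code", base_url="http://localhost:8900")
        # Downloads are skipped if the graph code is already cached at code_path.
        graph_path = await downloader.download_graph(
            task.namespace, task.compute_graph, task.graph_version
        )
        inputs = await downloader.download_input(task)  # returns a DownloadedInputs model
        print(graph_path, inputs.init_value is None)

    # Given a Task streamed from the server, asyncio.run(fetch(task)) would drive this.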
indexify/executor/executor_tasks.py ADDED
@@ -0,0 +1,72 @@
+ import asyncio
+ from typing import Optional
+
+ from indexify.functions_sdk.data_objects import IndexifyData
+
+ from .api_objects import Task
+ from .downloader import Downloader
+ from .function_worker import FunctionWorker
+
+
+ class DownloadGraphTask(asyncio.Task):
+     def __init__(
+         self,
+         *,
+         task: Task,
+         downloader: Downloader,
+         **kwargs,
+     ):
+         kwargs["name"] = "download_graph"
+         kwargs["loop"] = asyncio.get_event_loop()
+         super().__init__(
+             downloader.download_graph(
+                 task.namespace, task.compute_graph, task.graph_version
+             ),
+             **kwargs,
+         )
+         self.task = task
+
+
+ class DownloadInputTask(asyncio.Task):
+     def __init__(
+         self,
+         *,
+         task: Task,
+         downloader: Downloader,
+         **kwargs,
+     ):
+         kwargs["name"] = "download_input"
+         kwargs["loop"] = asyncio.get_event_loop()
+         super().__init__(
+             downloader.download_input(task),
+             **kwargs,
+         )
+         self.task = task
+
+
+ class ExtractTask(asyncio.Task):
+     def __init__(
+         self,
+         *,
+         function_worker: FunctionWorker,
+         task: Task,
+         input: IndexifyData,
+         init_value: Optional[IndexifyData] = None,
+         code_path: str,
+         **kwargs,
+     ):
+         kwargs["name"] = "run_function"
+         kwargs["loop"] = asyncio.get_event_loop()
+         super().__init__(
+             function_worker.async_submit(
+                 namespace=task.namespace,
+                 graph_name=task.compute_graph,
+                 fn_name=task.compute_fn,
+                 input=input,
+                 init_value=init_value,
+                 code_path=code_path,
+                 version=task.graph_version,
+             ),
+             **kwargs,
+         )
+         self.task = task
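These asyncio.Task subclasses are thin wrappers that tag a coroutine with a name and carry the originating Task, which is what task_launcher() keys its dispatch on via get_name(). A small sketch under assumed placeholder paths and addresses; it must run inside an event loop, since the constructors grab the current loop and schedule the coroutine immediately:

    from indexify.executor.api_objects import Task
    from indexify.executor.downloader import Downloader
    from indexify.executor.executor_tasks import DownloadGraphTask

    async def schedule_download(task: Task) -> str:
        downloader = Downloader(code_path="/tmp/indexify/code", base_url="http://localhost:8900")
        download = DownloadGraphTask(task=task, downloader=downloader)
        # The launcher dispatches on the name set in the constructor and reads the
        # originating Task back off the finished asyncio.Task.
        assert download.get_name() == "download_graph" and download.task is task
        return await download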