indexify 0.0.43__py3-none-any.whl → 0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. indexify/__init__.py +13 -14
  2. indexify/base_client.py +48 -21
  3. indexify/cli.py +235 -0
  4. indexify/client.py +18 -790
  5. indexify/error.py +3 -30
  6. indexify/executor/agent.py +362 -0
  7. indexify/executor/api_objects.py +43 -0
  8. indexify/executor/downloader.py +124 -0
  9. indexify/executor/executor_tasks.py +72 -0
  10. indexify/executor/function_worker.py +177 -0
  11. indexify/executor/indexify_executor.py +32 -0
  12. indexify/executor/task_reporter.py +110 -0
  13. indexify/executor/task_store.py +113 -0
  14. indexify/foo +72 -0
  15. indexify/functions_sdk/data_objects.py +37 -0
  16. indexify/functions_sdk/graph.py +276 -0
  17. indexify/functions_sdk/graph_validation.py +69 -0
  18. indexify/functions_sdk/image.py +26 -0
  19. indexify/functions_sdk/indexify_functions.py +192 -0
  20. indexify/functions_sdk/local_cache.py +46 -0
  21. indexify/functions_sdk/object_serializer.py +61 -0
  22. indexify/local_client.py +183 -0
  23. indexify/remote_client.py +319 -0
  24. indexify-0.2.dist-info/METADATA +151 -0
  25. indexify-0.2.dist-info/RECORD +32 -0
  26. indexify-0.2.dist-info/entry_points.txt +3 -0
  27. indexify/exceptions.py +0 -3
  28. indexify/extraction_policy.py +0 -75
  29. indexify/extractor_sdk/__init__.py +0 -14
  30. indexify/extractor_sdk/data.py +0 -100
  31. indexify/extractor_sdk/extractor.py +0 -225
  32. indexify/extractor_sdk/utils.py +0 -102
  33. indexify/extractors/__init__.py +0 -0
  34. indexify/extractors/embedding.py +0 -55
  35. indexify/extractors/pdf_parser.py +0 -93
  36. indexify/graph.py +0 -133
  37. indexify/local_runner.py +0 -128
  38. indexify/runner.py +0 -22
  39. indexify/utils.py +0 -7
  40. indexify-0.0.43.dist-info/METADATA +0 -66
  41. indexify-0.0.43.dist-info/RECORD +0 -25
  42. {indexify-0.0.43.dist-info → indexify-0.2.dist-info}/LICENSE.txt +0 -0
  43. {indexify-0.0.43.dist-info → indexify-0.2.dist-info}/WHEEL +0 -0
indexify/error.py CHANGED
@@ -1,30 +1,3 @@
1
- class Error(Exception):
2
- status: str
3
- message: str
4
-
5
- def __init__(self, status: str, message: str):
6
- self.status = status
7
- self.message = message
8
-
9
- @staticmethod
10
- def from_tonic_error_string(url: str, error: str) -> "Error":
11
- data = error.split(", ")
12
-
13
- message = data[1].split(": ", 1)[1]
14
- if message.startswith('"') and message.endswith('"'):
15
- message = message[1:-1]
16
-
17
- status = "GeneralError"
18
- if "extraction_graph" in url:
19
- status = "ExtractionGraphError"
20
- elif "search" in url:
21
- status = "SearchError"
22
-
23
- error = Error(status, message)
24
- return error
25
-
26
- def __str__(self):
27
- return f"{self.status} | {self.message.capitalize()}"
28
-
29
- def __repr__(self):
30
- return f"Error(status={self.status!r}, message={self.message!r})"
1
class ApiException(Exception):
    """Raised when an Indexify API call fails.

    Wraps the server/client error text in a standard exception so callers
    can catch one type for all API failures.
    """

    def __init__(self, message: str) -> None:
        super().__init__(message)
@@ -0,0 +1,362 @@
1
+ import asyncio
2
+ import json
3
+ import ssl
4
+ import traceback
5
+ from concurrent.futures.process import BrokenProcessPool
6
+ from typing import Dict, List, Optional
7
+
8
+ import httpx
9
+ import yaml
10
+ from httpx_sse import aconnect_sse
11
+ from pydantic import BaseModel
12
+ from rich.console import Console
13
+ from rich.panel import Panel
14
+ from rich.text import Text
15
+ from rich.theme import Theme
16
+
17
+ from indexify.functions_sdk.data_objects import (
18
+ FunctionWorkerOutput,
19
+ IndexifyData,
20
+ RouterOutput,
21
+ )
22
+
23
+ from .api_objects import ExecutorMetadata, Task
24
+ from .downloader import DownloadedInputs, Downloader
25
+ from .executor_tasks import DownloadGraphTask, DownloadInputTask, ExtractTask
26
+ from .function_worker import FunctionWorker
27
+ from .task_reporter import TaskReporter
28
+ from .task_store import CompletedTask, TaskStore
29
+
30
# Rich console theme mapping semantic style names ("info", "error", ...) to
# colors; these names are referenced as border/text styles throughout the
# agent's console output.
custom_theme = Theme(
    {
        "info": "cyan",
        "warning": "yellow",
        "error": "red",
        "success": "green",
    }
)

# Module-level console shared by all status output in this module.
console = Console(theme=custom_theme)
40
+
41
+
42
class FunctionInput(BaseModel):
    """A fully-downloaded unit of work, queued for submission to the worker.

    Built by the task launcher once a task's input (and optional reducer
    init value) has been downloaded.
    """

    # Server-assigned task id; used to look the Task back up in the store.
    task_id: str
    namespace: str
    compute_graph: str
    function: str
    # Downloaded input payload for the function.
    input: IndexifyData
    # Accumulator value for reducer functions; None for ordinary functions.
    init_value: Optional[IndexifyData] = None
+
50
+
51
class ExtractorAgent:
    """Executor-side agent that pulls tasks from an Indexify server and runs them.

    Lifecycle (see ``run``): register with the server and stream task
    assignments over SSE into the ``TaskStore``; two long-lived coroutines
    then do the work — ``task_launcher`` downloads graph code and inputs and
    submits functions to the ``FunctionWorker``, while
    ``task_completion_reporter`` pushes finished outcomes back to the server.
    """

    def __init__(
        self,
        executor_id: str,
        num_workers,
        code_path: str,
        function_worker: FunctionWorker,
        server_addr: str = "localhost:8900",
        config_path: Optional[str] = None,
    ):
        # executor_id: identity reported to the server on registration.
        # num_workers: stored but not referenced elsewhere in this class.
        # code_path: local directory where downloaded graph code is cached.
        # config_path: optional YAML config; may enable TLS via `use_tls`.
        self.num_workers = num_workers
        self._use_tls = False
        if config_path:
            with open(config_path, "r") as f:
                config = yaml.safe_load(f)
            self._config = config
            if config.get("use_tls", False):
                console.print(
                    "Running the extractor with TLS enabled", style="cyan bold"
                )
                self._use_tls = True
                tls_config = config["tls_config"]
                # Mutual TLS: verify the server against the configured CA
                # bundle and present our own client certificate.
                self._ssl_context = ssl.create_default_context(
                    ssl.Purpose.SERVER_AUTH, cafile=tls_config["ca_bundle_path"]
                )
                self._ssl_context.load_cert_chain(
                    certfile=tls_config["cert_path"], keyfile=tls_config["key_path"]
                )
                self._protocol = "wss"
                self._tls_config = tls_config
            else:
                self._ssl_context = None
                # NOTE(review): "ws" (websocket) scheme, but the derived
                # _base_url is consumed by httpx in Downloader/TaskReporter —
                # confirm this is intentional; run() later forces "http".
                self._protocol = "ws"
        else:
            self._ssl_context = None
            self._protocol = "http"
            self._config = {}

        self._task_store: TaskStore = TaskStore()
        self._executor_id = executor_id
        self._function_worker = function_worker
        self._has_registered = False
        self._server_addr = server_addr
        self._base_url = f"{self._protocol}://{self._server_addr}"
        self._code_path = code_path
        self._downloader = Downloader(code_path=code_path, base_url=self._base_url)
        # NOTE(review): declared but never enforced in this class.
        self._max_queued_tasks = 10
        self._task_reporter = TaskReporter(
            base_url=self._base_url, executor_id=self._executor_id
        )

    async def task_completion_reporter(self):
        """Forever: drain completed task outcomes from the store and report them.

        A failed report is retried after 5 seconds; an outcome is only marked
        reported once ``report_task_outcome`` returns without raising.
        """
        console.print(Text("Starting task completion reporter", style="bold cyan"))
        # We should copy only the keys and not the values
        # NOTE(review): `url` is computed here but never used in this method.
        url = f"{self._protocol}://{self._server_addr}/write_content"
        while True:
            outcomes = await self._task_store.task_outcomes()
            for task_outcome in outcomes:
                outcome = task_outcome.task_outcome
                # Red for outcomes containing "fail", green otherwise.
                style_outcome = (
                    f"[bold red] {outcome} [/]"
                    if "fail" in outcome
                    else f"[bold green] {outcome} [/]"
                )
                console.print(
                    Panel(
                        f"Reporting outcome of task {task_outcome.task.id}\n"
                        f"Outcome: {style_outcome}\n"
                        f"Outputs: {len(task_outcome.outputs or [])} Router Output: {task_outcome.router_output}",
                        title="Task Completion",
                        border_style="info",
                    )
                )

                try:
                    # Send task outcome to the server
                    self._task_reporter.report_task_outcome(completed_task=task_outcome)
                except Exception as e:
                    # The connection was dropped in the middle of the reporting, process, retry
                    console.print(
                        Panel(
                            f"Failed to report task {task_outcome.task.id}\n"
                            f"Exception: {e}\nRetrying...",
                            title="Reporting Error",
                            border_style="error",
                        )
                    )
                    await asyncio.sleep(5)
                    continue

                self._task_store.mark_reported(task_id=task_outcome.task.id)

    async def task_launcher(self):
        """Drive each task through its pipeline: download graph -> download
        input -> run function -> record outcome.

        Implemented as a single ``asyncio.wait`` loop over heterogeneous
        tasks, dispatching on ``Task.get_name()`` ("get_runnable_tasks",
        "download_graph", "download_input", "run_function"). Any stage
        failure records a "failure" outcome in the task store.
        """
        async_tasks: List[asyncio.Task] = []
        fn_queue: List[FunctionInput] = []
        async_tasks.append(
            asyncio.create_task(
                self._task_store.get_runnable_tasks(), name="get_runnable_tasks"
            )
        )
        while True:
            # Submit everything queued in the previous iteration to the worker.
            fn: FunctionInput
            for fn in fn_queue:
                task: Task = self._task_store.get_task(fn.task_id)
                async_tasks.append(
                    ExtractTask(
                        function_worker=self._function_worker,
                        task=task,
                        input=fn.input,
                        code_path=f"{self._code_path}/{task.namespace}/{task.compute_graph}.{task.graph_version}",
                        init_value=fn.init_value,
                    )
                )

            fn_queue = []
            done, pending = await asyncio.wait(
                async_tasks, return_when=asyncio.FIRST_COMPLETED
            )

            # Carry unfinished tasks into the next iteration.
            async_tasks: List[asyncio.Task] = list(pending)
            for async_task in done:
                if async_task.get_name() == "get_runnable_tasks":
                    if async_task.exception():
                        console.print(
                            Text("Task Launcher Error: ", style="red bold")
                            + Text(
                                f"Failed to get runnable tasks: {async_task.exception()}",
                                style="red",
                            )
                        )
                        continue
                    result: Dict[str, Task] = await async_task
                    task: Task
                    for _, task in result.items():
                        # Stage 1: fetch the graph code for each new task.
                        async_tasks.append(
                            DownloadGraphTask(task=task, downloader=self._downloader)
                        )
                    # Re-arm the long poll for more runnable tasks.
                    async_tasks.append(
                        asyncio.create_task(
                            self._task_store.get_runnable_tasks(),
                            name="get_runnable_tasks",
                        )
                    )
                elif async_task.get_name() == "download_graph":
                    if async_task.exception():
                        console.print(
                            Text(
                                f"Failed to download graph for task {async_task.task.id}\n",
                                style="red bold",
                            )
                            + Text(f"Exception: {async_task.exception()}", style="red")
                        )
                        completed_task = CompletedTask(
                            task=async_task.task,
                            outputs=[],
                            task_outcome="failure",
                        )
                        self._task_store.complete(outcome=completed_task)
                        continue
                    # Stage 2: graph code is on disk; fetch the task's input.
                    async_tasks.append(
                        DownloadInputTask(
                            task=async_task.task, downloader=self._downloader
                        )
                    )
                elif async_task.get_name() == "download_input":
                    if async_task.exception():
                        console.print(
                            Text(
                                f"Failed to download input for task {async_task.task.id}\n",
                                style="red bold",
                            )
                            + Text(f"Exception: {async_task.exception()}", style="red")
                        )
                        completed_task = CompletedTask(
                            task=async_task.task,
                            outputs=[],
                            task_outcome="failure",
                        )
                        self._task_store.complete(outcome=completed_task)
                        continue
                    # Stage 3: queue the function for execution on the next
                    # loop iteration.
                    downloaded_inputs: DownloadedInputs = await async_task
                    task: Task = async_task.task
                    fn_queue.append(
                        FunctionInput(
                            task_id=task.id,
                            namespace=task.namespace,
                            compute_graph=task.compute_graph,
                            function=task.compute_fn,
                            input=downloaded_inputs.input,
                            init_value=downloaded_inputs.init_value,
                        )
                    )
                elif async_task.get_name() == "run_function":
                    if async_task.exception():
                        completed_task = CompletedTask(
                            task=async_task.task,
                            task_outcome="failure",
                            outputs=[],
                            errors=str(async_task.exception()),
                        )
                        self._task_store.complete(outcome=completed_task)
                        continue
                    async_task: ExtractTask
                    try:
                        outputs: FunctionWorkerOutput = await async_task
                        if not outputs.success:
                            task_outcome = "failure"
                        else:
                            task_outcome = "success"

                        completed_task = CompletedTask(
                            task=async_task.task,
                            task_outcome=task_outcome,
                            outputs=outputs.fn_outputs,
                            router_output=outputs.router_output,
                            errors=outputs.exception,
                            stdout=outputs.stdout,
                            stderr=outputs.stderr,
                            reducer=outputs.reducer,
                        )
                        self._task_store.complete(outcome=completed_task)
                    except BrokenProcessPool:
                        # The worker's process pool died; the task itself may
                        # be fine, so mark it retriable instead of failed.
                        self._task_store.retriable_failure(async_task.task.id)
                        continue
                    except Exception as e:
                        console.print(
                            Text(
                                f"Failed to execute task {async_task.task.id}\n",
                                style="red bold",
                            )
                            + Text(f"Exception: {e}", style="red")
                        )
                        completed_task = CompletedTask(
                            task=async_task.task,
                            task_outcome="failure",
                            outputs=[],
                        )
                        self._task_store.complete(outcome=completed_task)
                        continue

    async def run(self):
        """Main entry point: start the worker coroutines, then register with
        the server and consume the SSE task stream, re-registering after any
        failure with a 5-second backoff.
        """
        import signal

        # Graceful shutdown on Ctrl-C.
        asyncio.get_event_loop().add_signal_handler(
            signal.SIGINT, self.shutdown, asyncio.get_event_loop()
        )
        asyncio.create_task(self.task_launcher())
        asyncio.create_task(self.task_completion_reporter())
        self._should_run = True
        while self._should_run:
            # NOTE(review): registration always uses plain HTTP, even when a
            # TLS config was loaded in __init__ — confirm this is intended.
            self._protocol = "http"
            url = f"{self._protocol}://{self._server_addr}/internal/executors/{self._executor_id}/tasks"

            # Pretty-print a snake_case key for the registration panel.
            # NOTE(review): the `+ ""` adds nothing, so multi-word keys render
            # with no space after the first word (e.g. "Runnername").
            def to_sentence_case(snake_str):
                words = snake_str.split("_")
                return words[0].capitalize() + "" + " ".join(words[1:])

            data = ExecutorMetadata(
                id=self._executor_id,
                address="",
                runner_name="extractor",
                labels={},
            ).model_dump()

            panel_content = "\n".join(
                [f"{to_sentence_case(key)}: {value}" for key, value in data.items()]
            )
            console.print(
                Panel(
                    panel_content,
                    title="Attempting to Register Executor",
                    border_style="cyan",
                )
            )

            try:
                async with httpx.AsyncClient() as client:
                    # Register by POSTing executor metadata; the server keeps
                    # the connection open and streams task batches as SSE.
                    async with aconnect_sse(
                        client,
                        "POST",
                        url,
                        json=data,
                        headers={"Content-Type": "application/json"},
                    ) as event_source:
                        console.print(
                            Text("Executor registered successfully", style="bold green")
                        )
                        async for sse in event_source.aiter_sse():
                            data = json.loads(sse.data)
                            tasks = []
                            for task_dict in data:
                                tasks.append(
                                    Task.model_validate(task_dict, strict=False)
                                )
                            self._task_store.add_tasks(tasks)
            except Exception as e:
                console.print(
                    Text("Registration Error: ", style="red bold")
                    + Text(f"Failed to register: {e}", style="red")
                )
                await asyncio.sleep(5)
                continue

    async def _shutdown(self, loop):
        """Stop the registration loop and cancel all tasks on the loop."""
        console.print(Text("Shutting down agent...", style="bold yellow"))
        self._should_run = False
        for task in asyncio.all_tasks(loop):
            task.cancel()

    def shutdown(self, loop):
        """Signal-handler entry point: stop the worker, then schedule shutdown."""
        self._function_worker.shutdown()
        loop.create_task(self._shutdown(loop))
@@ -0,0 +1,43 @@
1
+ from typing import Any, Dict, List, Optional
2
+
3
+ from pydantic import BaseModel, Json
4
+
5
+ from indexify.functions_sdk.data_objects import IndexifyData
6
+
7
+
8
class Task(BaseModel):
    """A unit of work assigned to this executor by the Indexify server."""

    id: str
    namespace: str
    compute_graph: str
    # Name of the function within the graph to execute.
    compute_fn: str
    invocation_id: str
    # Key identifying the input payload; its last "|"-separated segment is
    # compared against invocation_id to detect raw invocation payloads.
    input_key: str
    # Set when this task is a reducer step with an existing accumulator.
    reducer_output_id: Optional[str] = None
    graph_version: int


class ExecutorMetadata(BaseModel):
    """Executor self-description POSTed to the server at registration."""

    id: str
    address: str
    runner_name: str
    labels: Dict[str, Any]


class RouterOutput(BaseModel):
    """Names of the graph edges a router function selected."""

    edges: List[str]


class FnOutput(BaseModel):
    """A single function output, JSON-encoded on the wire."""

    payload: Json


class TaskResult(BaseModel):
    """Outcome report for a finished task, sent back to the server."""

    router_output: Optional[RouterOutput] = None
    # e.g. "success" / "failure" (see agent outcome handling).
    outcome: str
    namespace: str
    compute_graph: str
    compute_fn: str
    invocation_id: str
    executor_id: str
    task_id: str
    # True when the function was a reducer step.
    reducer: bool = False
@@ -0,0 +1,124 @@
1
+ import os
2
+ from typing import Optional
3
+
4
+ import httpx
5
+ from pydantic import BaseModel
6
+ from rich.console import Console
7
+ from rich.panel import Panel
8
+ from rich.theme import Theme
9
+
10
+ from indexify.functions_sdk.data_objects import IndexifyData
11
+ from indexify.functions_sdk.object_serializer import MsgPackSerializer
12
+
13
+ from .api_objects import Task
14
+
15
# Rich console theme for downloader status output; "error" is used as a
# panel border style below.
custom_theme = Theme(
    {
        "info": "cyan",
        "warning": "yellow",
        "error": "red",
    }
)

# Module-level console shared by all downloader output.
console = Console(theme=custom_theme)
24
+
25
+
26
class DownloadedInputs(BaseModel):
    """Result of downloading a task's inputs.

    ``init_value`` is only set for reducer tasks that have a prior
    accumulator output to resume from.
    """

    input: IndexifyData
    init_value: Optional[IndexifyData] = None
29
+
30
+
31
class Downloader:
    """Fetches compute-graph code and function inputs from the Indexify server.

    Graph code is cached on disk under ``code_path`` (keyed by namespace,
    graph name and version); inputs are fetched per task and MsgPack-decoded,
    except raw invocation payloads which are passed through as bytes.
    """

    def __init__(self, code_path: str, base_url: str):
        self.code_path = code_path
        self.base_url = base_url

    async def download_graph(self, namespace: str, name: str, version: int) -> str:
        """Download a compute graph's code blob, or reuse the cached copy.

        Returns the local filesystem path holding the code.
        Raises ``httpx.HTTPStatusError`` if the server rejects the request.
        """
        path = os.path.join(self.code_path, namespace, f"{name}.{version}")
        # Cache hit: a (name, version) pair identifies the code, so reuse it.
        if os.path.exists(path):
            return path

        console.print(
            Panel(
                f"Downloading graph: {name}\nPath: {path}",
                title="downloader",
                border_style="cyan",
            )
        )

        response = httpx.get(
            f"{self.base_url}/internal/namespaces/{namespace}/compute_graphs/{name}/code"
        )
        try:
            response.raise_for_status()
        except httpx.HTTPStatusError:
            # Surface the server's error text before propagating.
            console.print(
                Panel(
                    f"Failed to download graph: {name}\nError: {response.text}",
                    title="downloader error",
                    border_style="error",
                )
            )
            raise

        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "wb") as f:
            f.write(response.content)
        return path

    async def download_input(self, task: Task) -> DownloadedInputs:
        """Download the input payload (and reducer init value, if any) for *task*.

        Fix: the return type was annotated ``IndexifyData`` but every return
        statement produces a ``DownloadedInputs`` wrapper.
        Raises ``httpx.HTTPStatusError`` on any failed download.
        """
        input_id = task.input_key.split("|")[-1]
        # The graph's entry task consumes the raw invocation payload; every
        # other task consumes a predecessor function's serialized output.
        if task.invocation_id == input_id:
            url = f"{self.base_url}/namespaces/{task.namespace}/compute_graphs/{task.compute_graph}/invocations/{task.invocation_id}/payload"
        else:
            url = f"{self.base_url}/internal/fn_outputs/{task.input_key}"

        reducer_url = None
        if task.reducer_output_id:
            reducer_url = f"{self.base_url}/namespaces/{task.namespace}/compute_graphs/{task.compute_graph}/invocations/{task.invocation_id}/fn/{task.compute_fn}/{task.reducer_output_id}"

        console.print(
            Panel(
                f"downloading input\nURL: {url} \n reducer input URL: {reducer_url}",
                title="downloader",
                border_style="cyan",
            )
        )

        response = httpx.get(url)
        try:
            response.raise_for_status()
        except httpx.HTTPStatusError:
            console.print(
                Panel(
                    f"failed to download input: {task.input_key}\nError: {response.text}",
                    title="downloader error",
                    border_style="error",
                )
            )
            raise

        # Invocation payloads are stored raw (no MsgPack envelope).
        if task.invocation_id == input_id:
            return DownloadedInputs(
                input=IndexifyData(payload=response.content, id=input_id)
            )

        init_value = None
        if reducer_url:
            # Fix: use a dedicated name for the HTTP response instead of
            # reusing `init_value` for both the response and the decoded data.
            reducer_response = httpx.get(reducer_url)
            try:
                reducer_response.raise_for_status()
            except httpx.HTTPStatusError:
                console.print(
                    Panel(
                        f"failed to download reducer output: {task.reducer_output_id}\nError: {reducer_response.text}",
                        title="downloader error",
                        border_style="error",
                    )
                )
                raise
            init_value = MsgPackSerializer.deserialize(reducer_response.content)

        return DownloadedInputs(
            input=MsgPackSerializer.deserialize(response.content), init_value=init_value
        )
@@ -0,0 +1,72 @@
1
+ import asyncio
2
+ from typing import Optional
3
+
4
+ from indexify.functions_sdk.data_objects import IndexifyData
5
+
6
+ from .api_objects import Task
7
+ from .downloader import Downloader
8
+ from .function_worker import FunctionWorker
9
+
10
+
11
class DownloadGraphTask(asyncio.Task):
    """``asyncio.Task`` that downloads a server task's compute-graph code.

    Named "download_graph" so the launcher can dispatch on ``get_name()``;
    the originating server task is kept on ``self.task`` for later stages.
    """

    def __init__(
        self,
        *,
        task: Task,
        downloader: Downloader,
        **kwargs,
    ):
        kwargs["name"] = "download_graph"
        # NOTE(review): relies on being constructed while the target event
        # loop is current; asyncio.get_running_loop() may be safer — confirm
        # call sites.
        kwargs["loop"] = asyncio.get_event_loop()
        super().__init__(
            downloader.download_graph(
                task.namespace, task.compute_graph, task.graph_version
            ),
            **kwargs,
        )
        self.task = task
28
+
29
+
30
class DownloadInputTask(asyncio.Task):
    """``asyncio.Task`` that downloads a server task's input payload.

    Named "download_input" for the launcher's ``get_name()`` dispatch; the
    originating server task is kept on ``self.task``.
    """

    def __init__(
        self,
        *,
        task: Task,
        downloader: Downloader,
        **kwargs,
    ):
        kwargs["name"] = "download_input"
        # NOTE(review): see DownloadGraphTask — explicit loop from
        # asyncio.get_event_loop(); confirm construction always happens on
        # the running loop.
        kwargs["loop"] = asyncio.get_event_loop()
        super().__init__(
            downloader.download_input(task),
            **kwargs,
        )
        self.task = task
45
+
46
+
47
class ExtractTask(asyncio.Task):
    """``asyncio.Task`` that runs a function via the ``FunctionWorker``.

    Named "run_function" for the launcher's ``get_name()`` dispatch; the
    originating server task is kept on ``self.task``.
    """

    def __init__(
        self,
        *,
        function_worker: FunctionWorker,
        task: Task,
        input: IndexifyData,
        init_value: Optional[IndexifyData] = None,
        code_path: str,
        **kwargs,
    ):
        kwargs["name"] = "run_function"
        # NOTE(review): see DownloadGraphTask — explicit loop from
        # asyncio.get_event_loop(); confirm construction always happens on
        # the running loop.
        kwargs["loop"] = asyncio.get_event_loop()
        super().__init__(
            function_worker.async_submit(
                namespace=task.namespace,
                graph_name=task.compute_graph,
                fn_name=task.compute_fn,
                input=input,
                init_value=init_value,
                code_path=code_path,
                version=task.graph_version,
            ),
            **kwargs,
        )
        self.task = task