indexify 0.0.43__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/__init__.py +15 -14
- indexify/base_client.py +48 -21
- indexify/cli.py +247 -0
- indexify/client.py +18 -790
- indexify/error.py +3 -30
- indexify/executor/agent.py +364 -0
- indexify/executor/api_objects.py +43 -0
- indexify/executor/downloader.py +124 -0
- indexify/executor/executor_tasks.py +72 -0
- indexify/executor/function_worker.py +177 -0
- indexify/executor/indexify_executor.py +32 -0
- indexify/executor/runtime_probes.py +48 -0
- indexify/executor/task_reporter.py +110 -0
- indexify/executor/task_store.py +113 -0
- indexify/foo +72 -0
- indexify/functions_sdk/data_objects.py +37 -0
- indexify/functions_sdk/graph.py +281 -0
- indexify/functions_sdk/graph_validation.py +66 -0
- indexify/functions_sdk/image.py +34 -0
- indexify/functions_sdk/indexify_functions.py +188 -0
- indexify/functions_sdk/local_cache.py +46 -0
- indexify/functions_sdk/object_serializer.py +60 -0
- indexify/local_client.py +183 -0
- indexify/remote_client.py +319 -0
- indexify-0.2.1.dist-info/METADATA +151 -0
- indexify-0.2.1.dist-info/RECORD +33 -0
- indexify-0.2.1.dist-info/entry_points.txt +3 -0
- indexify/exceptions.py +0 -3
- indexify/extraction_policy.py +0 -75
- indexify/extractor_sdk/__init__.py +0 -14
- indexify/extractor_sdk/data.py +0 -100
- indexify/extractor_sdk/extractor.py +0 -225
- indexify/extractor_sdk/utils.py +0 -102
- indexify/extractors/__init__.py +0 -0
- indexify/extractors/embedding.py +0 -55
- indexify/extractors/pdf_parser.py +0 -93
- indexify/graph.py +0 -133
- indexify/local_runner.py +0 -128
- indexify/runner.py +0 -22
- indexify/utils.py +0 -7
- indexify-0.0.43.dist-info/METADATA +0 -66
- indexify-0.0.43.dist-info/RECORD +0 -25
- {indexify-0.0.43.dist-info → indexify-0.2.1.dist-info}/LICENSE.txt +0 -0
- {indexify-0.0.43.dist-info → indexify-0.2.1.dist-info}/WHEEL +0 -0
indexify/local_client.py
ADDED
@@ -0,0 +1,183 @@
|
|
1
|
+
from collections import defaultdict
|
2
|
+
from queue import deque
|
3
|
+
from typing import Any, Dict, List, Optional, Type, Union
|
4
|
+
|
5
|
+
from nanoid import generate
|
6
|
+
from pydantic import BaseModel, Json
|
7
|
+
from rich import print
|
8
|
+
|
9
|
+
from indexify.base_client import IndexifyClient
|
10
|
+
from indexify.functions_sdk.data_objects import (
|
11
|
+
File,
|
12
|
+
IndexifyData,
|
13
|
+
RouterOutput,
|
14
|
+
)
|
15
|
+
from indexify.functions_sdk.graph import Graph
|
16
|
+
from indexify.functions_sdk.local_cache import CacheAwareFunctionWrapper
|
17
|
+
from indexify.functions_sdk.object_serializer import get_serializer
|
18
|
+
|
19
|
+
|
20
|
+
# Container pairing an identifier with per-key lists of IndexifyData outputs.
# NOTE(review): presumably the id is an invocation id and the keys are
# function names -- nothing in this module instantiates the class, so confirm
# against callers before relying on that.
class ContentTree(BaseModel):
    # Identifier of the tree.
    id: str
    # Output records grouped by key.
    outputs: Dict[str, List[IndexifyData]]
|
24
|
+
|
25
|
+
|
26
|
+
class LocalClient(IndexifyClient):
    """Client that executes registered graphs in the current process.

    Graphs are kept in memory by name, every invocation's outputs are stored
    in memory by invocation id, and function results are memoised through a
    ``CacheAwareFunctionWrapper`` rooted at ``cache_dir``.
    """

    def __init__(self, cache_dir: str = "./indexify_local_runner_cache"):
        """Create a local client.

        :param cache_dir: Directory handed to the function-output cache.
        """
        self._cache_dir = cache_dir
        self._graphs: Dict[str, Graph] = {}
        # invocation id -> function name -> outputs of that function
        self._results: Dict[str, Dict[str, List[IndexifyData]]] = {}
        self._cache = CacheAwareFunctionWrapper(self._cache_dir)
        # function name -> current accumulator value
        self._accumulators: Dict[str, IndexifyData] = {}

    def register_compute_graph(self, graph: Graph):
        """Remember ``graph`` under its name so it can be invoked later."""
        self._graphs[graph.name] = graph

    def run_from_serialized_code(self, code: bytes, **kwargs):
        """Deserialize a graph from ``code`` and run it with ``kwargs``.

        :return: The invocation id produced by :meth:`run`.
        """
        g = Graph.deserialize(graph=code)
        # The invocation id used to be dropped here, leaving callers with no
        # handle to fetch outputs.
        return self.run(g, **kwargs)

    def run(self, g: Graph, **kwargs):
        """Run ``g`` from its start node with ``kwargs`` as the input payload.

        :return: The generated invocation id; outputs are stored under it and
            can be read back with :meth:`graph_outputs`.
        """
        start_serializer = get_serializer(
            g.get_function(g._start_node).indexify_function.payload_encoder
        )
        invocation_input = IndexifyData(
            id=generate(), payload=start_serializer.serialize(kwargs)
        )
        print(f"[bold] Invoking {g._start_node}[/bold]")
        outputs = defaultdict(list)
        # Seed every accumulator with the initial value declared on the graph.
        for fn_name, initial_value in g.get_accumulators().items():
            fn_serializer = get_serializer(
                g.get_function(fn_name).indexify_function.payload_encoder
            )
            # NOTE(review): the initial value is passed through deserialize()
            # before being stored as a payload; kept as in the original --
            # confirm this is the intended direction.
            self._accumulators[fn_name] = IndexifyData(
                payload=fn_serializer.deserialize(initial_value)
            )
        self._results[invocation_input.id] = outputs
        self._run(g, invocation_input, outputs)
        return invocation_input.id

    def _run(
        self,
        g: Graph,
        initial_input: IndexifyData,
        outputs: Dict[str, List[IndexifyData]],
    ):
        """Breadth-first execution of ``g`` starting from ``initial_input``.

        Every function's outputs are appended to ``outputs`` under the
        function name and forwarded to each outgoing edge. Outgoing edges that
        are routers are replaced by the nodes the router selects.
        """
        queue = deque([(g._start_node, initial_input)])
        while queue:
            node_name, node_input = queue.popleft()
            serializer = get_serializer(
                g.get_function(node_name).indexify_function.payload_encoder
            )
            input_bytes = serializer.serialize(node_input)
            cached_output_bytes: Optional[bytes] = self._cache.get(
                g.name, node_name, input_bytes
            )
            is_accumulator = self._accumulators.get(node_name, None) is not None

            if cached_output_bytes is not None:
                function_outputs: List[IndexifyData] = list(
                    serializer.deserialize_list(cached_output_bytes)
                )
                # Report the number of outputs, not the byte length of the
                # cached blob.
                print(
                    f"ran {node_name}: num outputs: {len(function_outputs)} (cache hit)"
                )
            else:
                function_outputs = g.invoke_fn_ser(
                    node_name, node_input, self._accumulators.get(node_name, None)
                )
                print(f"ran {node_name}: num outputs: {len(function_outputs)}")

            if is_accumulator:
                # The last output becomes the new accumulator value; keep the
                # previous one when the function produced nothing.
                if function_outputs:
                    self._accumulators[node_name] = function_outputs[-1].model_copy()
                # Only the latest accumulator result is kept for this node.
                outputs[node_name] = []
            outputs.setdefault(node_name, []).extend(function_outputs)

            if cached_output_bytes is None:
                # NOTE(review): the serialized list is wrapped in a one-element
                # list exactly as before; the cache's expected value type is
                # not visible from this module -- confirm against local_cache.
                function_outputs_bytes: List[bytes] = [
                    serializer.serialize_list(function_outputs)
                ]
                self._cache.set(
                    g.name,
                    node_name,
                    input_bytes,
                    function_outputs_bytes,
                )

            if is_accumulator and queue:
                # More inputs are still pending; do not fan out until the
                # queue has drained.
                print(
                    f"accumulator not none for {node_name}, continuing, len queue: {len(queue)}"
                )
                continue

            # Snapshot the adjacency list: g.edges belongs to the graph and
            # must survive this invocation unchanged. Mutating it in place
            # dropped routers for later runs and skipped elements mid-loop.
            declared_edges = list(g.edges.get(node_name, []))
            out_edges = [edge for edge in declared_edges if edge not in g.routers]
            for router in declared_edges:
                if router not in g.routers:
                    continue
                for output in function_outputs:
                    decision = self._route(g, router, output)
                    if decision is None:
                        # The router selected nothing for this output.
                        continue
                    for dynamic_edge in decision.edges:
                        if dynamic_edge in g.nodes:
                            print(
                                f"[bold]dynamic router returned node: {dynamic_edge}[/bold]"
                            )
                            # NOTE(review): as before, a node selected for
                            # several outputs is appended once per output and
                            # then receives every output -- confirm intended.
                            out_edges.append(dynamic_edge)

            for out_edge in out_edges:
                for output in function_outputs:
                    queue.append((out_edge, output))

    def _route(
        self, g: Graph, node_name: str, input: IndexifyData
    ) -> Optional[RouterOutput]:
        """Ask the router ``node_name`` of ``g`` where ``input`` should go."""
        return g.invoke_router(node_name, input)

    def graphs(self) -> List[str]:
        """Return the names of all registered graphs."""
        return list(self._graphs.keys())

    def namespaces(self) -> str:
        """Return the single fixed namespace name, ``"local"``."""
        return "local"

    def create_namespace(self, namespace: str):
        """No-op: the local client has no namespace storage."""
        pass

    def rerun_graph(self, graph: str):
        """Delegate to the base client's ``rerun_graph``."""
        return super().rerun_graph(graph)

    def invoke_graph_with_object(
        self, graph: str, block_until_done: bool = False, **kwargs
    ) -> str:
        """Run the registered graph named ``graph`` with ``kwargs`` as input.

        ``block_until_done`` is accepted for interface compatibility and is
        not consulted; :meth:`run` always executes to completion.

        :raises KeyError: If no graph with that name is registered.
        :return: The invocation id.
        """
        registered: Graph = self._graphs[graph]
        return self.run(registered, **kwargs)

    def invoke_graph_with_file(
        self,
        graph: str,
        path: str,
        metadata: Optional[Dict[str, Json]] = None,
        block_until_done: bool = False,
    ) -> str:
        """Run the registered graph named ``graph`` on the file at ``path``.

        The file is read fully into memory and passed to the start node as the
        ``file`` keyword argument (a dumped ``File`` model).

        :raises KeyError: If no graph with that name is registered.
        :return: The invocation id.
        """
        registered = self._graphs[graph]
        with open(path, "rb") as f:
            data = f.read()
        file = File(data=data, metadata=metadata).model_dump()
        return self.run(registered, file=file)

    def graph_outputs(
        self,
        graph: str,
        invocation_id: str,
        fn_name: str,
    ) -> Union[Dict[str, List[Any]], List[Any]]:
        """Return the deserialized outputs of ``fn_name`` for an invocation.

        Payloads that deserialize to a dict are validated into the function's
        output model when that model is a pydantic ``BaseModel`` subclass;
        everything else is returned as deserialized.

        :raises ValueError: If the invocation id is unknown, or the function
            produced no outputs for it.
        """
        if invocation_id not in self._results:
            raise ValueError(f"no results found for graph {graph}")
        if fn_name not in self._results[invocation_id]:
            raise ValueError(f"no results found for fn {fn_name} on graph {graph}")

        fn = self._graphs[graph].get_function(fn_name)
        fn_model = fn.get_output_model()
        serializer = get_serializer(fn.indexify_function.payload_encoder)
        # issubclass() raises TypeError for non-class annotations such as
        # typing generics, so establish that the model is a class first.
        is_pydantic_model = isinstance(fn_model, type) and issubclass(
            fn_model, BaseModel
        )

        results = []
        for result in self._results[invocation_id][fn_name]:
            payload = serializer.deserialize(result.payload)
            if is_pydantic_model and isinstance(payload, dict):
                payload = fn_model.model_validate(payload)
            results.append(payload)
        return results
|
@@ -0,0 +1,319 @@
|
|
1
|
+
import json
|
2
|
+
import os
|
3
|
+
from typing import Any, Dict, List, Optional
|
4
|
+
|
5
|
+
import cloudpickle
|
6
|
+
import httpx
|
7
|
+
import msgpack
|
8
|
+
import yaml
|
9
|
+
from httpx_sse import connect_sse
|
10
|
+
from pydantic import BaseModel, Json
|
11
|
+
from rich import print
|
12
|
+
|
13
|
+
from indexify.base_client import IndexifyClient
|
14
|
+
from indexify.error import ApiException
|
15
|
+
from indexify.functions_sdk.data_objects import IndexifyData
|
16
|
+
from indexify.functions_sdk.graph import ComputeGraphMetadata, Graph
|
17
|
+
from indexify.settings import DEFAULT_SERVICE_URL, DEFAULT_SERVICE_URL_HTTPS
|
18
|
+
|
19
|
+
|
20
|
+
class InvocationEventPayload(BaseModel):
    # Body of one event received on the invocation event stream.
    invocation_id: str
    fn_name: str
    task_id: str
    # Not required on every event.
    executor_id: Optional[str] = None
    # Compared against "Failure" by the client when a task completes.
    outcome: Optional[str] = None
|
26
|
+
|
27
|
+
|
28
|
+
class InvocationEvent(BaseModel):
    # One event from the invocation stream: its name plus the decoded body.
    event_name: str
    payload: InvocationEventPayload
|
31
|
+
|
32
|
+
|
33
|
+
class GraphOutputMetadata(BaseModel):
    # Reference to a single stored output.
    # Output id, used to download the output itself.
    id: str
    # Name of the function that produced the output.
    compute_fn: str
|
36
|
+
|
37
|
+
|
38
|
+
class GraphOutputs(BaseModel):
    # Response body of the invocation "outputs" endpoint.
    outputs: List[GraphOutputMetadata]
|
40
|
+
|
41
|
+
|
42
|
+
class RemoteClient(IndexifyClient):
    """HTTP client for an Indexify service."""

    def __init__(
        self,
        service_url: str = DEFAULT_SERVICE_URL,
        config_path: Optional[str] = None,
        namespace: str = "default",
        **kwargs,
    ):
        """Create a client.

        :param service_url: Base URL of the service. Overridden by the
            ``INDEXIFY_URL`` environment variable when that is set.
        :param config_path: Optional YAML file. When it sets ``use_tls``, the
            ``tls_config`` section (``cert_path``, ``key_path``, optional
            ``ca_bundle_path``) configures the HTTP client for TLS.
        :param namespace: Namespace used for all namespaced endpoints.
        :param kwargs: ``timeout`` is applied to every request. ``http2``,
            ``cert`` and ``verify`` are forwarded to ``httpx.Client``.
        """
        if os.environ.get("INDEXIFY_URL"):
            print("Using INDEXIFY_URL environment variable to connect to Indexify")
            service_url = os.environ["INDEXIFY_URL"]

        self.service_url = service_url

        # Collect the transport options first and build the HTTP client once.
        # These keyword arguments used to be dropped, which left with_mtls()
        # returning a client without any certificate configured.
        client_options: Dict[str, Any] = {
            name: kwargs[name] for name in ("http2", "cert", "verify") if name in kwargs
        }
        if config_path:
            with open(config_path, "r") as file:
                # safe_load returns None for an empty document.
                config = yaml.safe_load(file) or {}

            if config.get("use_tls", False):
                tls_config = config["tls_config"]
                client_options.update(
                    http2=True,
                    cert=(tls_config["cert_path"], tls_config["key_path"]),
                    verify=tls_config.get("ca_bundle_path", True),
                )
        self._client = httpx.Client(**client_options)

        self.namespace: str = namespace
        self.compute_graphs: List[Graph] = []
        self.labels: dict = {}
        self._service_url = service_url
        # None (the default) is passed straight to httpx for each request.
        self._timeout = kwargs.get("timeout")
        self._graphs: Dict[str, Graph] = {}

    def _request(self, method: str, **kwargs) -> httpx.Response:
        """Send a request and turn failures into ``ApiException``.

        :raises ApiException: On connection failure or a 4xx/5xx response.
        """
        try:
            response = self._client.request(method, timeout=self._timeout, **kwargs)
        except httpx.ConnectError as err:
            message = (
                f"Make sure the server is running and accesible at {self._service_url}"
            )
            # Built with a single positional message, like the other
            # ApiException calls in this method.
            ex = ApiException(f"ConnectionError: {message}")
            print(ex)
            raise ex from err

        status_code = response.status_code
        if 400 <= status_code < 500:
            raise ApiException(
                "status code: " + str(status_code) + " request args: " + str(kwargs)
            )
        if 500 <= status_code < 600:
            raise ApiException(response.text)
        return response

    @classmethod
    def with_mtls(
        cls,
        cert_path: str,
        key_path: str,
        ca_bundle_path: Optional[str] = None,
        service_url: str = DEFAULT_SERVICE_URL_HTTPS,
        *args,
        **kwargs,
    ) -> "RemoteClient":
        """
        Create a client with mutual TLS authentication. Also enables HTTP/2.
        NOTE: mTLS must be enabled on the Indexify service for this to work.

        :param cert_path: Path to the client certificate. Resolution handled by httpx.
        :param key_path: Path to the client key. Resolution handled by httpx.
        :param ca_bundle_path: Optional CA bundle used to verify the server;
            default verification is used when omitted.
        :param service_url: Base URL of the service.
        :param args: Further positional arguments for the client constructor
            (they follow ``service_url``).
        :param kwargs: Further keyword arguments for the client constructor.
        :return: A client with mTLS authentication
        :raises ValueError: If the certificate or the key path is missing.

        Example usage:
        ```
        client = RemoteClient.with_mtls(
            cert_path="/path/to/cert.pem",
            key_path="/path/to/key.pem",
        )
        ```
        """
        if not (cert_path and key_path):
            raise ValueError("Both cert and key must be provided for mTLS")

        # service_url goes first so extra positionals continue after it
        # instead of colliding with it; cls keeps subclasses intact.
        return cls(
            service_url,
            *args,
            http2=True,
            cert=(cert_path, key_path),
            verify=ca_bundle_path if ca_bundle_path else True,
            **kwargs,
        )

    def _get(self, endpoint: str, **kwargs) -> httpx.Response:
        """GET ``endpoint`` relative to the service URL."""
        return self._request("GET", url=f"{self._service_url}/{endpoint}", **kwargs)

    def _post(self, endpoint: str, **kwargs) -> httpx.Response:
        """POST to ``endpoint`` relative to the service URL."""
        return self._request("POST", url=f"{self._service_url}/{endpoint}", **kwargs)

    def _put(self, endpoint: str, **kwargs) -> httpx.Response:
        """PUT to ``endpoint`` relative to the service URL."""
        return self._request("PUT", url=f"{self._service_url}/{endpoint}", **kwargs)

    def _delete(self, endpoint: str, **kwargs) -> httpx.Response:
        """DELETE ``endpoint`` relative to the service URL."""
        return self._request("DELETE", url=f"{self._service_url}/{endpoint}", **kwargs)

    def _close(self):
        """Close the underlying HTTP client."""
        self._client.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # This class defines _close(), not close(); calling close() here
        # failed when leaving a ``with`` block.
        self._close()

    def register_compute_graph(self, graph: Graph):
        """Upload ``graph`` (definition plus serialized code) to the service.

        The server's response body is printed. The graph is remembered locally
        only after the upload succeeded.
        """
        graph_metadata = graph.definition()
        serialized_code = graph.serialize()
        response = self._post(
            f"namespaces/{self.namespace}/compute_graphs",
            files={"code": serialized_code},
            data={"compute_graph": graph_metadata.model_dump_json(exclude_none=True)},
        )
        print(response.content.decode("utf-8"))
        response.raise_for_status()
        self._graphs[graph.name] = graph

    def graphs(self) -> List[str]:
        """Return the ``graphs`` field of the service's ``graphs`` endpoint."""
        response = self._get("graphs")
        return response.json()["graphs"]

    def graph(self, name: str) -> ComputeGraphMetadata:
        """Fetch the metadata of the graph called ``name``."""
        response = self._get(f"namespaces/{self.namespace}/compute_graphs/{name}")
        return ComputeGraphMetadata(**response.json())

    def load_graph(self, name: str) -> Graph:
        """Download and deserialize the code of the graph called ``name``."""
        response = self._get(
            f"internal/namespaces/{self.namespace}/compute_graphs/{name}/code"
        )
        return Graph.deserialize(response.content)

    def namespaces(self) -> List[str]:
        """Return the names of all namespaces known to the service."""
        response = self._get("namespaces")
        return [item["name"] for item in response.json()["namespaces"]]

    def create_namespace(self, namespace: str):
        """Ask the service to create ``namespace``."""
        self._post("namespaces", json={"namespace": namespace})

    def logs(
        self, invocation_id: str, cg_name: str, fn_name: str, file: str
    ) -> Optional[str]:
        """Fetch one log file of a function run.

        :param file: Name of the log to fetch; this class requests
            ``"stdout"`` and ``"stderr"``.
        :return: The log text, or ``None`` when the request failed with an
            ``ApiException`` (the failure is printed).
        """
        try:
            response = self._get(
                f"namespaces/{self.namespace}/compute_graphs/{cg_name}/invocations/{invocation_id}/fn/{fn_name}/logs/{file}"
            )
            response.raise_for_status()
            return response.content.decode("utf-8")
        except ApiException as e:
            print(f"failed to fetch logs: {e}")
            return None

    def rerun_graph(self, graph: str):
        """Ask the service to rerun the graph called ``graph``."""
        self._post(f"namespaces/{self.namespace}/compute_graphs/{graph}/rerun")

    def invoke_graph_with_object(
        self, graph: str, block_until_done: bool = False, **kwargs
    ) -> str:
        """Invoke ``graph`` with ``kwargs`` and follow its event stream.

        Every event is printed. When a task completes with outcome
        ``"Failure"`` its stdout/stderr logs are fetched and printed too.

        :param block_until_done: Sent to the server as ``block_until_finish``.
        :return: The invocation id carried by the ``InvocationFinished`` event.
        :raises Exception: If the stream ends without that event.
        """
        ser_input = cloudpickle.dumps(kwargs)
        params = {"block_until_finish": block_until_done}
        # Stream over the client's own connection so the TLS settings and
        # timeout configured on this instance apply; a fresh default
        # httpx.Client ignored both.
        # NOTE(review): the body is a cloudpickle payload sent with an
        # "application/cbor" content type -- kept as is, confirm with server.
        with connect_sse(
            self._client,
            "POST",
            f"{self.service_url}/namespaces/{self.namespace}/compute_graphs/{graph}/invoke_object",
            headers={"Content-Type": "application/cbor"},
            data=ser_input,
            params=params,
            timeout=self._timeout,
        ) as event_source:
            for sse in event_source.iter_sse():
                obj = json.loads(sse.data)
                for k, v in obj.items():
                    if k == "InvocationFinished":
                        return v["id"]
                    event_payload = InvocationEventPayload.model_validate(v)
                    event = InvocationEvent(event_name=k, payload=event_payload)
                    if (
                        event.event_name == "TaskCompleted"
                        and event.payload.outcome == "Failure"
                    ):
                        stdout = self.logs(
                            event.payload.invocation_id,
                            graph,
                            event.payload.fn_name,
                            "stdout",
                        )
                        stderr = self.logs(
                            event.payload.invocation_id,
                            graph,
                            event.payload.fn_name,
                            "stderr",
                        )
                        if stdout:
                            print(f"[bold red]stdout[/bold red]: \n {stdout}")
                        if stderr:
                            print(f"[bold red]stderr[/bold red]: \n {stderr}")
                    print(
                        f"[bold green]{event.event_name}[/bold green]: {event.payload}"
                    )
        raise Exception("invocation ID not returned")

    def _download_output(
        self,
        namespace: str,
        graph: str,
        invocation_id: str,
        fn_name: str,
        output_id: str,
    ) -> IndexifyData:
        """Download one output and decode it from msgpack into IndexifyData."""
        response = self._get(
            f"namespaces/{namespace}/compute_graphs/{graph}/invocations/{invocation_id}/fn/{fn_name}/{output_id}",
        )
        response.raise_for_status()
        data_dict = msgpack.unpackb(response.content)
        return IndexifyData.model_validate(data_dict)

    def graph_outputs(
        self,
        graph: str,
        invocation_id: str,
        fn_name: Optional[str],
    ) -> List[Any]:
        """
        Returns the outputs that the function `fn_name` produced for an invocation.
        graph: str: The name of the graph
        invocation_id: str: The ID of the invocation
        fn_name: Optional[str]: The name of the function whose outputs are returned.
            Only outputs whose producing function equals this value are
            downloaded, so passing None yields an empty list.
        return: List[Any]: The deserialized outputs, in the order the service lists them.

        The graph code is downloaded from the service on first use when the
        graph was not registered through this client.
        """
        if graph not in self._graphs:
            self._graphs[graph] = self.load_graph(graph)
        response = self._get(
            f"namespaces/{self.namespace}/compute_graphs/{graph}/invocations/{invocation_id}/outputs",
        )
        response.raise_for_status()
        graph_outputs = GraphOutputs(**response.json())
        outputs = []
        for output_meta in graph_outputs.outputs:
            if output_meta.compute_fn != fn_name:
                continue
            indexify_data = self._download_output(
                self.namespace, graph, invocation_id, fn_name, output_meta.id
            )
            outputs.append(
                self._graphs[graph].deserialize_fn_output(fn_name, indexify_data)
            )
        return outputs

    def invoke_graph_with_file(
        self, graph: str, path: str, metadata: Optional[Dict[str, Json]] = None
    ) -> str:
        """
        Placeholder for invoking a graph with an input file.
        NOTE: not implemented for the remote client yet -- the call does
        nothing and returns None despite the annotated return type.
        graph: str: The name of the graph to invoke
        path: str: The path to the file to be ingested
        """
        pass
|
@@ -0,0 +1,151 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: indexify
|
3
|
+
Version: 0.2.1
|
4
|
+
Summary: Python Client for Indexify
|
5
|
+
Home-page: https://github.com/tensorlakeai/indexify
|
6
|
+
License: Apache 2.0
|
7
|
+
Author: Tensorlake Inc.
|
8
|
+
Author-email: support@tensorlake.ai
|
9
|
+
Requires-Python: >=3.9,<4.0
|
10
|
+
Classifier: License :: Other/Proprietary License
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
16
|
+
Requires-Dist: cloudpickle (>=3,<4)
|
17
|
+
Requires-Dist: docker (>=7.1.0,<8.0.0)
|
18
|
+
Requires-Dist: httpx-sse (>=0.4.0,<0.5.0)
|
19
|
+
Requires-Dist: httpx[http2] (>=0,<1)
|
20
|
+
Requires-Dist: msgpack (>=1.1.0,<2.0.0)
|
21
|
+
Requires-Dist: nanoid (>=2.0.0,<3.0.0)
|
22
|
+
Requires-Dist: pydantic (>=2.9.2,<3.0.0)
|
23
|
+
Requires-Dist: pyyaml (>=6,<7)
|
24
|
+
Requires-Dist: rich (>=13,<14)
|
25
|
+
Requires-Dist: typer (>=0.12.5,<0.13.0)
|
26
|
+
Project-URL: Repository, https://github.com/tensorlakeai/indexify
|
27
|
+
Description-Content-Type: text/markdown
|
28
|
+
|
29
|
+
# Indexify Python SDK
|
30
|
+
|
31
|
+
[PyPI version](https://badge.fury.io/py/indexify)
|
32
|
+
[Discord](https://discord.gg/VXkY7zVmTD)
|
33
|
+
|
34
|
+
This is the Python SDK to build real-time continuously running unstructured data processing pipelines with Indexify.
|
35
|
+
|
36
|
+
Start by writing and testing your pipelines locally using your data, then deploy them into the Indexify service to process data in real-time at scale.
|
37
|
+
|
38
|
+
## Installation
|
39
|
+
|
40
|
+
```shell
|
41
|
+
pip install indexify
|
42
|
+
```
|
43
|
+
|
44
|
+
## Examples
|
45
|
+
**[PDF Document Extraction](./examples/pdf_document_extraction/workflow.py)**
|
46
|
+
1. Extracts text, tables and images from an ingested PDF file
|
47
|
+
2. Indexes the text using MiniLM-L6-v2, the images with CLIP
|
48
|
+
3. Writes the results into a vector database.
|
49
|
+
|
50
|
+
**[Youtube Transcription Summarizer](./examples/video_summarization/workflow.py)**
|
51
|
+
1. Downloads Youtube Video
|
52
|
+
2. Extracts audio from the video and transcribes using `Faster Whisper`
|
53
|
+
3. Uses Llama 3.1 backed by `Llama.cpp` to understand and classify the nature of the video.
|
54
|
+
4. Routes the transcription dynamically to one of the transcription summarizers to retain specific summarization attributes.
|
55
|
+
5. Finally the entire transcription is embedded and stored in a vector database for retrieval.
|
56
|
+
|
57
|
+
## Quick Start
|
58
|
+
1. Write data processing functions in Python and use Pydantic objects for returning complex data types from functions
|
59
|
+
2. Connect functions using a graph interface. Indexify automatically stores function outputs and passes them along to downstream functions.
|
60
|
+
3. If a function returns a list, the downstream functions will be called with each item in the list in **parallel**.
|
61
|
+
4. The input of the first function becomes the input to the HTTP endpoint of the Graph.
|
62
|
+
|
63
|
+
## Functional Features
|
64
|
+
1. There is **NO** limit to the volume of data being ingested, since we use blob stores for storing metadata and objects.
|
65
|
+
2. The server can handle 10s of 1000s of files being ingested into the graphs in parallel.
|
66
|
+
3. The scheduler reacts under 8 microseconds to ingestion events, so it's suitable for workflows that need to run in real time.
|
67
|
+
4. Batch ingestion is handled gracefully by batching ingested data and scheduling for high throughput in production settings.
|
68
|
+
|
69
|
+
```python
|
70
|
+
from pydantic import BaseModel
|
71
|
+
from indexify import indexify_function
|
72
|
+
from typing import Dict, Any, Optional, List
|
73
|
+
|
74
|
+
# Define function inputs and outputs
|
75
|
+
class Document(BaseModel):
|
76
|
+
text: str
|
77
|
+
metadata: Dict[str, Any]
|
78
|
+
|
79
|
+
class TextChunk(BaseModel):
|
80
|
+
text: str
|
81
|
+
metadata: Dict[str, Any]
|
82
|
+
embedding: Optional[List[float]] = None
|
83
|
+
|
84
|
+
|
85
|
+
# Decorate a function which is going to be part of your data processing graph
|
86
|
+
@indexify_function()
|
87
|
+
def split_text(doc: Document) -> List[TextChunk]:
|
88
|
+
midpoint = len(doc.text) // 2
|
89
|
+
first_half = TextChunk(text=doc.text[:midpoint], metadata=doc.metadata)
|
90
|
+
second_half = TextChunk(text=doc.text[midpoint:], metadata=doc.metadata)
|
91
|
+
return [first_half, second_half]
|
92
|
+
|
93
|
+
# Any requirements specified is automatically installed in production clusters
|
94
|
+
@indexify_function(requirements=["langchain_text_splitter"])
|
95
|
+
def compute_embedding(chunk: TextChunk) -> TextChunk:
|
96
|
+
chunk.embedding = [0.1, 0.2, 0.3]
|
97
|
+
return chunk
|
98
|
+
|
99
|
+
# You can constrain functions to run on specific executors
|
100
|
+
@indexify_function(executor_runtime_name="postgres-driver-image")
|
101
|
+
def write_to_db(chunk: TextChunk):
|
102
|
+
# Write to your favorite vector database
|
103
|
+
...
|
104
|
+
|
105
|
+
## Create a graph
|
106
|
+
from indexify import Graph
|
107
|
+
|
108
|
+
g = Graph(name="my_graph", start_node=split_text)
|
109
|
+
g.add_edge(split_text, compute_embedding)
|
110
|
+
g.add_edge(compute_embedding, write_to_db)
|
111
|
+
```
|
112
|
+
|
113
|
+
## Graph Execution
|
114
|
+
Every time the Graph is invoked, Indexify will provide an `Invocation Id` which can be used to know about the status of the processing and any outputs from the Graph.
|
115
|
+
|
116
|
+
## Run the Graph Locally
|
117
|
+
```python
|
118
|
+
from indexify import IndexifyClient
|
119
|
+
|
120
|
+
client = IndexifyClient(local=True)
|
121
|
+
client.register_graph(g)
|
122
|
+
invocation_id = client.invoke_graph_with_object(g.name, Document(text="Hello, world!", metadata={"source": "test"}))
|
123
|
+
graph_outputs = client.graph_outputs(g.name, invocation_id)
|
124
|
+
```
|
125
|
+
|
126
|
+
## Deploy the Graph to Indexify Server for Production
|
127
|
+
> Work In Progress - The version of the server that works with Python-based graphs hasn't been released yet. It will be released shortly. Join Discord for development updates.
|
128
|
+
```python
|
129
|
+
from indexify import IndexifyClient
|
130
|
+
|
131
|
+
client = IndexifyClient(service_url="http://localhost:8900")
|
132
|
+
client.register_graph(g)
|
133
|
+
```
|
134
|
+
|
135
|
+
#### Ingestion into the Service
|
136
|
+
Extraction Graphs continuously run on the Indexify Service like any other web service. Indexify Server runs the extraction graphs in parallel and in real-time when new data is ingested into the service.
|
137
|
+
|
138
|
+
```python
|
139
|
+
output_id = client.invoke_graph_with_object(g.name, Document(text="Hello, world!", metadata={"source": "test"}))
|
140
|
+
```
|
141
|
+
|
142
|
+
#### Retrieve Graph Outputs for a given ingestion object
|
143
|
+
```python
|
144
|
+
graph_outputs = client.graph_outputs(g.name, output_id)
|
145
|
+
```
|
146
|
+
|
147
|
+
#### Retrieve All Graph Inputs
|
148
|
+
```python
|
149
|
+
graph_inputs = client.graph_inputs(g.name)
|
150
|
+
```
|
151
|
+
|
@@ -0,0 +1,33 @@
|
|
1
|
+
indexify/__init__.py,sha256=yJ3K_TyocH2EpFr6kEbKaOqfk8dA33fpoCB_QXV-rko,549
|
2
|
+
indexify/base_client.py,sha256=Si1XnZ6X_mFvkYCnS6qx6axFsBpkrGiorqmKohFwvLQ,3324
|
3
|
+
indexify/cli.py,sha256=kLKruCRNlo1xdezxYQoN6c9EpGWAeNiSs7kAjYfxCao,7311
|
4
|
+
indexify/client.py,sha256=6cwCxBky6IJYu4caq0E6SMWIxf3nn5SX795moHfS4Cw,501
|
5
|
+
indexify/data_loaders/__init__.py,sha256=Y5NEuseTcYAICRiweYw5wBQ2m2YplbsY21I7df-rdi4,1339
|
6
|
+
indexify/data_loaders/local_directory_loader.py,sha256=fCrgj5drnW71ZUdDDvcB1-VJjIs1w6Q8sEW0HSGSAiA,1247
|
7
|
+
indexify/data_loaders/url_loader.py,sha256=32SERljcq1Xsi4RdLz2dgyk2TER5pQPTtXl3gUzwHbY,1533
|
8
|
+
indexify/error.py,sha256=vjd5SPPNFIEW35GorSIodsqvm9RKHQm9kdp8t9gv-WM,111
|
9
|
+
indexify/executor/agent.py,sha256=I08CiOWeJ_mz8OHr9_iJfp07Ma1VMQirZ2MsDp8lDZw,14723
|
10
|
+
indexify/executor/api_objects.py,sha256=SysjlGYu4JtYdqfexZHHN1IW4TtaDdFUF3hYZ5mpUJU,810
|
11
|
+
indexify/executor/downloader.py,sha256=0MPiKw0AWs3Z7ReC9l2z-3515yqq85ghPzdh485dnuw,3998
|
12
|
+
indexify/executor/executor_tasks.py,sha256=gAZ2pvza1YwGlaR1o_tJW4SXtdCgK7sLJgp4W7rOjR0,1834
|
13
|
+
indexify/executor/function_worker.py,sha256=83ih8TjAJHtrH6LqqDoSfzAj4zB7ZZPR2Voq0RMZ1T8,5410
|
14
|
+
indexify/executor/indexify_executor.py,sha256=2Ut_VX-Su_lm4b4aEROyRJ3gXx-uFHA-V7EN0sWiARE,771
|
15
|
+
indexify/executor/runtime_probes.py,sha256=tvi8KCaQTVJqcyBJ4-jzEUAnQ01ZbMmjCxV2KJ96_PI,1449
|
16
|
+
indexify/executor/task_reporter.py,sha256=gnnse0v6rjjni8lNzeb-ZYq6iF2DgafKoT7dcGUZhQ4,3716
|
17
|
+
indexify/executor/task_store.py,sha256=q8s2gImsFffWeXQR0mk1Xlo1Aj_2GfclNPjQ2EA_YBo,3984
|
18
|
+
indexify/foo,sha256=e385Ws-u8zx-LOq3tdfTa-siK9pMaccdAE8_0rrp_k4,5165
|
19
|
+
indexify/functions_sdk/data_objects.py,sha256=2LqAWJ_S2Xkp4OQTmhd3InVIrBs7juV41udnSQFMMfM,840
|
20
|
+
indexify/functions_sdk/graph.py,sha256=qy9zVbf26oTLW0d0jt991Hd8MP6N0F1CEBLL9VbwLBA,9975
|
21
|
+
indexify/functions_sdk/graph_validation.py,sha256=y-f0ZNiGYl_fjPA7v9OJWtoUMPELgtVR_ifpgqZ0IoY,2465
|
22
|
+
indexify/functions_sdk/image.py,sha256=euuz2QTZQoS-JmwnPmWJ8lfIgKzrSEsfkUc2qU26xjM,679
|
23
|
+
indexify/functions_sdk/indexify_functions.py,sha256=xxgvnw0MQ_csIksunIdero8be0PR4mfwgoHp3UlkMZU,5851
|
24
|
+
indexify/functions_sdk/local_cache.py,sha256=cNWF67zbhbTJe3g86hyLBy3Rqzs6dNvp2SjLazGZWvw,1348
|
25
|
+
indexify/functions_sdk/object_serializer.py,sha256=Zz4GobW3ZamBBtFDF76QxU3TP6oJNdWnhsfKd0OUFoc,1660
|
26
|
+
indexify/local_client.py,sha256=9wPYHjG516ZX6q5sPj5Pnn9C-Fi0sghYiLaNMe7LPPk,7220
|
27
|
+
indexify/remote_client.py,sha256=oKgTqLbIxQVDqkMjQmNCOOEIM156UeYMC1jDWWSqBAQ,12297
|
28
|
+
indexify/settings.py,sha256=LSaWZ0ADIVmUv6o6dHWRC3-Ry5uLbCw2sBSg1e_U7UM,99
|
29
|
+
indexify-0.2.1.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
30
|
+
indexify-0.2.1.dist-info/METADATA,sha256=ZwqwbjyxS8vszZH5L_X_1zNJM9OyAr-B0ezxpUDeLJo,6129
|
31
|
+
indexify-0.2.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
32
|
+
indexify-0.2.1.dist-info/entry_points.txt,sha256=Pih7WV-XMpAzI5dEvROcpLr-ybVhd9Y-AtuzBKUdcDs,49
|
33
|
+
indexify-0.2.1.dist-info/RECORD,,
|