indexify 0.0.42__py3-none-any.whl → 0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. indexify/__init__.py +13 -14
  2. indexify/base_client.py +48 -21
  3. indexify/cli.py +235 -0
  4. indexify/client.py +18 -790
  5. indexify/error.py +3 -30
  6. indexify/executor/agent.py +362 -0
  7. indexify/executor/api_objects.py +43 -0
  8. indexify/executor/downloader.py +124 -0
  9. indexify/executor/executor_tasks.py +72 -0
  10. indexify/executor/function_worker.py +177 -0
  11. indexify/executor/indexify_executor.py +32 -0
  12. indexify/executor/task_reporter.py +110 -0
  13. indexify/executor/task_store.py +113 -0
  14. indexify/foo +72 -0
  15. indexify/functions_sdk/data_objects.py +37 -0
  16. indexify/functions_sdk/graph.py +276 -0
  17. indexify/functions_sdk/graph_validation.py +69 -0
  18. indexify/functions_sdk/image.py +26 -0
  19. indexify/functions_sdk/indexify_functions.py +192 -0
  20. indexify/functions_sdk/local_cache.py +46 -0
  21. indexify/functions_sdk/object_serializer.py +61 -0
  22. indexify/local_client.py +183 -0
  23. indexify/remote_client.py +319 -0
  24. indexify-0.2.dist-info/METADATA +151 -0
  25. indexify-0.2.dist-info/RECORD +32 -0
  26. indexify-0.2.dist-info/entry_points.txt +3 -0
  27. indexify/exceptions.py +0 -3
  28. indexify/extraction_policy.py +0 -75
  29. indexify/extractor_sdk/__init__.py +0 -14
  30. indexify/extractor_sdk/data.py +0 -100
  31. indexify/extractor_sdk/extractor.py +0 -223
  32. indexify/extractor_sdk/utils.py +0 -102
  33. indexify/extractors/__init__.py +0 -0
  34. indexify/extractors/embedding.py +0 -55
  35. indexify/extractors/pdf_parser.py +0 -93
  36. indexify/graph.py +0 -133
  37. indexify/local_runner.py +0 -128
  38. indexify/runner.py +0 -22
  39. indexify/utils.py +0 -7
  40. indexify-0.0.42.dist-info/METADATA +0 -66
  41. indexify-0.0.42.dist-info/RECORD +0 -25
  42. {indexify-0.0.42.dist-info → indexify-0.2.dist-info}/LICENSE.txt +0 -0
  43. {indexify-0.0.42.dist-info → indexify-0.2.dist-info}/WHEEL +0 -0
@@ -0,0 +1,183 @@
1
+ from collections import defaultdict
2
+ from queue import deque
3
+ from typing import Any, Dict, List, Optional, Type, Union
4
+
5
+ from nanoid import generate
6
+ from pydantic import BaseModel, Json
7
+ from rich import print
8
+
9
+ from indexify.base_client import IndexifyClient
10
+ from indexify.functions_sdk.data_objects import (
11
+ File,
12
+ IndexifyData,
13
+ RouterOutput,
14
+ )
15
+ from indexify.functions_sdk.graph import Graph
16
+ from indexify.functions_sdk.local_cache import CacheAwareFunctionWrapper
17
+ from indexify.functions_sdk.object_serializer import get_serializer
18
+
19
+
20
+ # Holds the outputs of a single invocation, grouped by key (presumably the function name; verify against callers)
21
class ContentTree(BaseModel):
    """Pydantic model pairing an identifier with grouped ``IndexifyData`` outputs."""

    # Identifier of this tree.
    id: str
    # Output records grouped under string keys
    # (presumably function names -- TODO confirm against callers).
    outputs: Dict[str, List[IndexifyData]]
24
+
25
+
26
class LocalClient(IndexifyClient):
    """Client that executes compute graphs synchronously in the current process.

    Outputs of every invocation are kept in memory (``self._results``) and
    function results are looked up in / written to a
    ``CacheAwareFunctionWrapper`` rooted at ``cache_dir``.
    """

    def __init__(self, cache_dir: str = "./indexify_local_runner_cache"):
        """Create a local client.

        :param cache_dir: Directory handed to ``CacheAwareFunctionWrapper``.
        """
        self._cache_dir = cache_dir
        self._graphs: Dict[str, Graph] = {}
        # invocation id -> function name -> outputs produced by that function
        self._results: Dict[str, Dict[str, List[IndexifyData]]] = {}
        self._cache = CacheAwareFunctionWrapper(self._cache_dir)
        # function name -> current accumulator value for that function
        self._accumulators: Dict[str, IndexifyData] = {}

    def register_compute_graph(self, graph: Graph):
        """Remember ``graph`` under its name so it can be invoked by name."""
        self._graphs[graph.name] = graph

    def run_from_serialized_code(self, code: bytes, **kwargs):
        """Deserialize a graph from ``code`` and run it with ``kwargs``.

        :return: The invocation id produced by :meth:`run`.
        """
        g = Graph.deserialize(graph=code)
        return self.run(g, **kwargs)

    def run(self, g: Graph, **kwargs):
        """Run ``g`` from its start node, passing ``kwargs`` as the input.

        :return: The generated invocation id; use it with
            :meth:`graph_outputs` to read results.
        """
        serializer = get_serializer(
            g.get_function(g._start_node).indexify_function.payload_encoder
        )
        initial_input = IndexifyData(
            id=generate(), payload=serializer.serialize(kwargs)
        )
        print(f"[bold] Invoking {g._start_node}[/bold]")
        outputs = defaultdict(list)
        # Seed the accumulators declared by the graph before execution starts.
        for fn_name, initial_value in g.get_accumulators().items():
            serializer = get_serializer(
                g.get_function(fn_name).indexify_function.payload_encoder
            )
            self._accumulators[fn_name] = IndexifyData(
                payload=serializer.deserialize(initial_value)
            )
        self._results[initial_input.id] = outputs
        self._run(g, initial_input, outputs)
        return initial_input.id

    def _run(
        self,
        g: Graph,
        initial_input: IndexifyData,
        outputs: Dict[str, List[IndexifyData]],
    ):
        """Breadth-first execution of ``g`` starting at its start node.

        Every (node, input) pair is taken from a FIFO queue, executed (or
        served from the cache), and its outputs are fanned out to the node's
        outgoing edges. Results are collected into ``outputs`` per node name.
        """
        queue = deque([(g._start_node, initial_input)])
        while queue:
            node_name, node_input = queue.popleft()
            serializer = get_serializer(
                g.get_function(node_name).indexify_function.payload_encoder
            )
            input_bytes = serializer.serialize(node_input)
            cached_output_bytes: Optional[bytes] = self._cache.get(
                g.name, node_name, input_bytes
            )
            has_accumulator = self._accumulators.get(node_name, None) is not None
            if cached_output_bytes is not None:
                print(
                    f"ran {node_name}: num outputs: {len(cached_output_bytes)} (cache hit)"
                )
                function_outputs: List[IndexifyData] = list(
                    serializer.deserialize_list(cached_output_bytes)
                )
            else:
                function_outputs = g.invoke_fn_ser(
                    node_name, node_input, self._accumulators.get(node_name, None)
                )
                print(f"ran {node_name}: num outputs: {len(function_outputs)}")

            if has_accumulator:
                # An accumulator node keeps only its latest value: the last
                # output becomes the new accumulator and replaces old outputs.
                self._accumulators[node_name] = function_outputs[-1].model_copy()
                outputs[node_name] = []
            outputs.setdefault(node_name, []).extend(function_outputs)

            if cached_output_bytes is None:
                function_outputs_bytes: List[bytes] = [
                    serializer.serialize_list(function_outputs)
                ]
                self._cache.set(
                    g.name,
                    node_name,
                    input_bytes,
                    function_outputs_bytes,
                )

            # Hold back fan-out from an accumulator until the queue drains, so
            # downstream nodes only see the final accumulated value.
            if self._accumulators.get(node_name, None) is not None and queue:
                print(
                    f"accumulator not none for {node_name}, continuing, len queue: {len(queue)}"
                )
                continue

            for out_edge in self._resolve_out_edges(g, node_name, function_outputs):
                for output in function_outputs:
                    queue.append((out_edge, output))

    def _resolve_out_edges(
        self, g: Graph, node_name: str, function_outputs: List[IndexifyData]
    ) -> List[str]:
        """Return the nodes to run after ``node_name`` without mutating ``g``.

        Plain edges are kept as-is; router edges are replaced by the nodes the
        router selects for each output. A fresh list is built so the graph's
        own edge list is never modified (modifying it while iterating skipped
        entries and leaked dynamic edges into later runs).
        """
        static_edges: List[str] = []
        dynamic_edges: List[str] = []
        for edge in g.edges.get(node_name, []):
            if edge not in g.routers:
                static_edges.append(edge)
                continue
            for output in function_outputs:
                router_output = self._route(g, edge, output)
                if not router_output:
                    # Router chose nothing for this output.
                    continue
                for dynamic_edge in router_output.edges:
                    if dynamic_edge in g.nodes:
                        print(
                            f"[bold]dynamic router returned node: {dynamic_edge}[/bold]"
                        )
                        dynamic_edges.append(dynamic_edge)
        return static_edges + dynamic_edges

    def _route(
        self, g: Graph, node_name: str, input: IndexifyData
    ) -> Optional[RouterOutput]:
        """Invoke the router ``node_name`` of ``g`` on ``input``."""
        return g.invoke_router(node_name, input)

    def graphs(self) -> List[str]:
        """Return the names of all registered graphs."""
        return list(self._graphs.keys())

    def namespaces(self) -> str:
        """Return the fixed namespace name ``"local"``."""
        return "local"

    def create_namespace(self, namespace: str):
        """No-op: this client does nothing when asked to create a namespace."""
        pass

    def rerun_graph(self, graph: str):
        """Delegate to the base class implementation."""
        return super().rerun_graph(graph)

    def invoke_graph_with_object(
        self, graph: str, block_until_done: bool = False, **kwargs
    ) -> str:
        """Run the registered graph named ``graph`` with ``kwargs`` as input.

        ``block_until_done`` is accepted for interface compatibility; local
        execution is always synchronous.

        :raises KeyError: If no graph with that name has been registered.
        :return: The invocation id.
        """
        registered: Graph = self._graphs[graph]
        return self.run(registered, **kwargs)

    def invoke_graph_with_file(
        self,
        graph: str,
        path: str,
        metadata: Optional[Dict[str, Json]] = None,
        block_until_done: bool = False,
    ) -> str:
        """Run the registered graph named ``graph`` on the file at ``path``.

        The file is read fully into memory, wrapped in a ``File`` object and
        passed to the graph as the ``file`` keyword argument.

        :raises KeyError: If no graph with that name has been registered.
        :return: The invocation id.
        """
        registered = self._graphs[graph]
        with open(path, "rb") as f:
            data = f.read()
        file = File(data=data, metadata=metadata).model_dump()
        return self.run(registered, file=file)

    def graph_outputs(
        self,
        graph: str,
        invocation_id: str,
        fn_name: str,
    ) -> Union[Dict[str, List[Any]], List[Any]]:
        """Return the deserialized outputs of ``fn_name`` for one invocation.

        Payloads that deserialize to a dict are validated into the function's
        output model when that model is a pydantic ``BaseModel``; everything
        else is returned as deserialized.

        :raises ValueError: If the invocation id is unknown or the function
            produced no outputs for it.
        """
        if invocation_id not in self._results:
            raise ValueError(f"no results found for graph {graph}")
        if fn_name not in self._results[invocation_id]:
            raise ValueError(f"no results found for fn {fn_name} on graph {graph}")
        fn = self._graphs[graph].get_function(fn_name)
        fn_model = fn.get_output_model()
        serializer = get_serializer(fn.indexify_function.payload_encoder)
        results = []
        for result in self._results[invocation_id][fn_name]:
            payload = serializer.deserialize(result.payload)
            if issubclass(fn_model, BaseModel) and isinstance(payload, dict):
                payload = fn_model.model_validate(payload)
            results.append(payload)
        return results
@@ -0,0 +1,319 @@
1
+ import json
2
+ import os
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ import cloudpickle
6
+ import httpx
7
+ import msgpack
8
+ import yaml
9
+ from httpx_sse import connect_sse
10
+ from pydantic import BaseModel, Json
11
+ from rich import print
12
+
13
+ from indexify.base_client import IndexifyClient
14
+ from indexify.error import ApiException
15
+ from indexify.functions_sdk.data_objects import IndexifyData
16
+ from indexify.functions_sdk.graph import ComputeGraphMetadata, Graph
17
+ from indexify.settings import DEFAULT_SERVICE_URL, DEFAULT_SERVICE_URL_HTTPS
18
+
19
+
20
class InvocationEventPayload(BaseModel):
    """Body of a single invocation event received over the SSE stream."""

    # Invocation the event belongs to.
    invocation_id: str
    # Name of the graph function the event refers to.
    fn_name: str
    task_id: str
    executor_id: Optional[str] = None
    # Compared against "Failure" by the client to decide whether to fetch logs.
    outcome: Optional[str] = None
26
+
27
+
28
class InvocationEvent(BaseModel):
    """An invocation event: its name plus the validated payload."""

    # Key under which the payload arrived in the SSE message (e.g. "TaskCompleted").
    event_name: str
    payload: InvocationEventPayload
31
+
32
+
33
class GraphOutputMetadata(BaseModel):
    """Reference to one stored output of a graph invocation."""

    # Output identifier used to download the actual data.
    id: str
    # Name of the function that produced the output.
    compute_fn: str
36
+
37
+
38
class GraphOutputs(BaseModel):
    """Parsed response of the invocation ``outputs`` endpoint."""

    # One entry per stored output of the invocation.
    outputs: List[GraphOutputMetadata]
40
+
41
+
42
class RemoteClient(IndexifyClient):
    """HTTP client for an Indexify server."""

    def __init__(
        self,
        service_url: str = DEFAULT_SERVICE_URL,
        config_path: Optional[str] = None,
        namespace: str = "default",
        **kwargs,
    ):
        """Create a client.

        :param service_url: Base URL of the server. Overridden by the
            ``INDEXIFY_URL`` environment variable when that is set.
        :param config_path: Optional YAML file; when it sets ``use_tls`` the
            HTTP client is built from its ``tls_config`` section
            (``cert_path``, ``key_path``, optional ``ca_bundle_path``).
        :param namespace: Namespace used for all namespace-scoped endpoints.
        :param kwargs: ``timeout`` is applied to every request; ``http2``,
            ``cert`` and ``verify`` are forwarded to ``httpx.Client`` (this is
            what :meth:`with_mtls` relies on). Other keys are ignored.
        """
        env_url = os.environ.get("INDEXIFY_URL")
        if env_url:
            print("Using INDEXIFY_URL environment variable to connect to Indexify")
            service_url = env_url

        self.service_url = service_url
        # Forward the transport options instead of silently dropping them.
        client_options = {
            key: kwargs[key] for key in ("http2", "cert", "verify") if key in kwargs
        }
        self._client = httpx.Client(**client_options)
        if config_path:
            with open(config_path, "r") as file:
                # An empty YAML file loads as None; treat it as "no settings".
                config = yaml.safe_load(file) or {}

            if config.get("use_tls", False):
                tls_config = config["tls_config"]
                # Release the client created above before replacing it.
                self._client.close()
                self._client = httpx.Client(
                    http2=True,
                    cert=(tls_config["cert_path"], tls_config["key_path"]),
                    verify=tls_config.get("ca_bundle_path", True),
                )

        self.namespace: str = namespace
        self.compute_graphs: List[Graph] = []
        self.labels: dict = {}
        self._service_url = service_url
        self._timeout = kwargs.get("timeout")
        # Graphs registered through, or lazily loaded by, this client.
        self._graphs: Dict[str, Graph] = {}

    def _request(self, method: str, **kwargs) -> httpx.Response:
        """Send a request and translate failures into ``ApiException``.

        :raises ApiException: On a 4xx/5xx response or when the server cannot
            be reached.
        """
        try:
            response = self._client.request(method, timeout=self._timeout, **kwargs)
        except httpx.ConnectError as err:
            message = (
                f"Make sure the server is running and accesible at {self._service_url}"
            )
            # Built with a single positional message, like the other
            # ApiException calls in this method.
            ex = ApiException(f"ConnectionError: {message}")
            print(ex)
            raise ex from err
        status_code = response.status_code
        if 400 <= status_code < 500:
            raise ApiException(
                "status code: " + str(status_code) + " request args: " + str(kwargs)
            )
        if 500 <= status_code < 600:
            raise ApiException(response.text)
        return response

    @classmethod
    def with_mtls(
        cls,
        cert_path: str,
        key_path: str,
        ca_bundle_path: Optional[str] = None,
        service_url: str = DEFAULT_SERVICE_URL_HTTPS,
        *args,
        **kwargs,
    ) -> "RemoteClient":
        """
        Create a client with mutual TLS authentication and HTTP/2 enabled.
        NOTE: mTLS must be enabled on the Indexify service for this to work.

        :param cert_path: Path to the client certificate. Resolution handled by httpx.
        :param key_path: Path to the client key. Resolution handled by httpx.
        :param ca_bundle_path: Optional CA bundle used to verify the server;
            default verification is used when omitted.
        :param service_url: Base URL of the server.
        :param args: Extra positional arguments for the client constructor
        :param kwargs: Extra keyword arguments for the client constructor
        :raises ValueError: If the certificate or the key path is missing.
        :return: A client with mTLS authentication

        Example usage:
        ```
        from indexify.remote_client import RemoteClient

        client = RemoteClient.with_mtls(
            cert_path="/path/to/cert.pem",
            key_path="/path/to/key.pem",
        )
        ```
        """
        if not (cert_path and key_path):
            raise ValueError("Both cert and key must be provided for mTLS")

        client_certs = (cert_path, key_path)
        verify_option = ca_bundle_path if ca_bundle_path else True
        return cls(
            *args,
            **kwargs,
            service_url=service_url,
            http2=True,
            cert=client_certs,
            verify=verify_option,
        )

    def _get(self, endpoint: str, **kwargs) -> httpx.Response:
        """GET ``endpoint`` relative to the service URL."""
        return self._request("GET", url=f"{self._service_url}/{endpoint}", **kwargs)

    def _post(self, endpoint: str, **kwargs) -> httpx.Response:
        """POST to ``endpoint`` relative to the service URL."""
        return self._request("POST", url=f"{self._service_url}/{endpoint}", **kwargs)

    def _put(self, endpoint: str, **kwargs) -> httpx.Response:
        """PUT to ``endpoint`` relative to the service URL."""
        return self._request("PUT", url=f"{self._service_url}/{endpoint}", **kwargs)

    def _delete(self, endpoint: str, **kwargs) -> httpx.Response:
        """DELETE ``endpoint`` relative to the service URL."""
        return self._request("DELETE", url=f"{self._service_url}/{endpoint}", **kwargs)

    def _close(self):
        """Close the underlying HTTP client."""
        self._client.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # This class defines ``_close`` (not ``close``); call the method that exists.
        self._close()

    def register_compute_graph(self, graph: Graph):
        """Upload ``graph`` (metadata plus serialized code) to the server.

        The server's response body is printed; on success the graph is also
        remembered locally so its outputs can be deserialized later.
        """
        graph_metadata = graph.definition()
        serialized_code = graph.serialize()
        response = self._post(
            f"namespaces/{self.namespace}/compute_graphs",
            files={"code": serialized_code},
            data={"compute_graph": graph_metadata.model_dump_json(exclude_none=True)},
        )
        print(response.content.decode("utf-8"))
        response.raise_for_status()
        self._graphs[graph.name] = graph

    def graphs(self) -> List[str]:
        """Return the ``graphs`` list reported by the server."""
        # NOTE(review): unlike the other graph endpoints this one is not
        # namespace-scoped -- confirm against the server API.
        response = self._get("graphs")
        return response.json()["graphs"]

    def graph(self, name: str) -> ComputeGraphMetadata:
        """Fetch the metadata of the compute graph ``name``."""
        response = self._get(f"namespaces/{self.namespace}/compute_graphs/{name}")
        return ComputeGraphMetadata(**response.json())

    def load_graph(self, name: str) -> Graph:
        """Download and deserialize the code of the compute graph ``name``."""
        response = self._get(
            f"internal/namespaces/{self.namespace}/compute_graphs/{name}/code"
        )
        return Graph.deserialize(response.content)

    def namespaces(self) -> List[str]:
        """Return the names of all namespaces known to the server."""
        response = self._get("namespaces")
        return [item["name"] for item in response.json()["namespaces"]]

    def create_namespace(self, namespace: str):
        """Ask the server to create ``namespace``."""
        self._post("namespaces", json={"namespace": namespace})

    def logs(
        self, invocation_id: str, cg_name: str, fn_name: str, file: str
    ) -> Optional[str]:
        """Fetch a log file of one function run.

        :param file: Log file name; this client requests ``"stdout"`` and
            ``"stderr"``.
        :return: The decoded log text, or ``None`` if the request failed with
            an ``ApiException`` (the failure is printed, not raised).
        """
        try:
            response = self._get(
                f"namespaces/{self.namespace}/compute_graphs/{cg_name}/invocations/{invocation_id}/fn/{fn_name}/logs/{file}"
            )
            response.raise_for_status()
            return response.content.decode("utf-8")
        except ApiException as e:
            print(f"failed to fetch logs: {e}")
            return None

    def rerun_graph(self, graph: str):
        """Ask the server to rerun the compute graph ``graph``."""
        self._post(f"namespaces/{self.namespace}/compute_graphs/{graph}/rerun")

    def invoke_graph_with_object(
        self, graph: str, block_until_done: bool = False, **kwargs
    ) -> str:
        """Invoke ``graph`` with ``kwargs`` as input and follow its event stream.

        The keyword arguments are serialized with cloudpickle and posted to
        the ``invoke_object`` endpoint. Server-sent events are printed as
        they arrive; for a failed task its stdout/stderr logs are printed too.

        :param block_until_done: Sent to the server as ``block_until_finish``.
        :return: The invocation id carried by the ``InvocationFinished`` event.
        :raises Exception: If the stream ends without that event.
        """
        ser_input = cloudpickle.dumps(kwargs)
        params = {"block_until_finish": block_until_done}
        # Use the configured client so TLS/HTTP2 settings also apply here.
        # NOTE(review): the payload is cloudpickle but is labelled
        # application/cbor -- confirm what the server expects.
        with connect_sse(
            self._client,
            "POST",
            f"{self.service_url}/namespaces/{self.namespace}/compute_graphs/{graph}/invoke_object",
            headers={"Content-Type": "application/cbor"},
            content=ser_input,  # raw bytes belong in ``content``, not ``data``
            params=params,
        ) as event_source:
            for sse in event_source.iter_sse():
                obj = json.loads(sse.data)
                for k, v in obj.items():
                    if k == "InvocationFinished":
                        return v["id"]
                    event_payload = InvocationEventPayload.model_validate(v)
                    event = InvocationEvent(event_name=k, payload=event_payload)
                    if (
                        event.event_name == "TaskCompleted"
                        and event.payload.outcome == "Failure"
                    ):
                        stdout = self.logs(
                            event.payload.invocation_id,
                            graph,
                            event.payload.fn_name,
                            "stdout",
                        )
                        stderr = self.logs(
                            event.payload.invocation_id,
                            graph,
                            event.payload.fn_name,
                            "stderr",
                        )
                        if stdout:
                            print(f"[bold red]stdout[/bold red]: \n {stdout}")
                        if stderr:
                            print(f"[bold red]stderr[/bold red]: \n {stderr}")
                    print(
                        f"[bold green]{event.event_name}[/bold green]: {event.payload}"
                    )
        raise Exception("invocation ID not returned")

    def _download_output(
        self,
        namespace: str,
        graph: str,
        invocation_id: str,
        fn_name: str,
        output_id: str,
    ) -> IndexifyData:
        """Download one stored output and decode it from msgpack."""
        response = self._get(
            f"namespaces/{namespace}/compute_graphs/{graph}/invocations/{invocation_id}/fn/{fn_name}/{output_id}",
        )
        response.raise_for_status()
        data_dict = msgpack.unpackb(response.content)
        return IndexifyData.model_validate(data_dict)

    def graph_outputs(
        self,
        graph: str,
        invocation_id: str,
        fn_name: Optional[str],
    ) -> List[Any]:
        """
        Returns the outputs produced by one function of a graph invocation.

        The graph's code is loaded from the server on first use so that the
        downloaded outputs can be deserialized.

        graph: str: The name of the graph
        invocation_id: str: The ID of the invocation
        fn_name: Optional[str]: The name of the function whose outputs are returned.
            Only outputs whose ``compute_fn`` equals this value are included, so
            passing None yields an empty list.
        return: List[Any]: The deserialized outputs; empty if none match.
        """
        if graph not in self._graphs:
            self._graphs[graph] = self.load_graph(graph)
        response = self._get(
            f"namespaces/{self.namespace}/compute_graphs/{graph}/invocations/{invocation_id}/outputs",
        )
        response.raise_for_status()
        graph_outputs = GraphOutputs(**response.json())
        outputs = []
        for output_metadata in graph_outputs.outputs:
            if output_metadata.compute_fn != fn_name:
                continue
            indexify_data = self._download_output(
                self.namespace, graph, invocation_id, fn_name, output_metadata.id
            )
            outputs.append(
                self._graphs[graph].deserialize_fn_output(fn_name, indexify_data)
            )
        return outputs

    def invoke_graph_with_file(
        self,
        graph: str,
        path: str,
        metadata: Optional[Dict[str, Json]] = None,
        block_until_done: bool = False,
    ) -> str:
        """
        Intended to invoke a graph with an input file.

        NOTE: not implemented yet -- the body does nothing and returns None.

        graph: str: The name of the graph to invoke
        path: str: The path to the file to be ingested
        metadata: Optional metadata to attach to the file
        block_until_done: bool: Accepted for parity with the local client; unused
        return: str: The ID of the ingested object (once implemented)
        """
        pass
@@ -0,0 +1,151 @@
1
+ Metadata-Version: 2.1
2
+ Name: indexify
3
+ Version: 0.2
4
+ Summary: Python Client for Indexify
5
+ Home-page: https://github.com/tensorlakeai/indexify
6
+ License: Apache 2.0
7
+ Author: Tensorlake Inc.
8
+ Author-email: support@tensorlake.ai
9
+ Requires-Python: >=3.9,<4.0
10
+ Classifier: License :: Other/Proprietary License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Requires-Dist: cloudpickle (>=3,<4)
17
+ Requires-Dist: docker (>=7.1.0,<8.0.0)
18
+ Requires-Dist: httpx-sse (>=0.4.0,<0.5.0)
19
+ Requires-Dist: httpx[http2] (>=0,<1)
20
+ Requires-Dist: msgpack (>=1.1.0,<2.0.0)
21
+ Requires-Dist: nanoid (>=2.0.0,<3.0.0)
22
+ Requires-Dist: pydantic (>=2.9.2,<3.0.0)
23
+ Requires-Dist: pyyaml (>=6,<7)
24
+ Requires-Dist: rich (>=13,<14)
25
+ Requires-Dist: typer (>=0.12.5,<0.13.0)
26
+ Project-URL: Repository, https://github.com/tensorlakeai/indexify
27
+ Description-Content-Type: text/markdown
28
+
29
+ # Indexify Python SDK
30
+
31
+ [![PyPI version](https://badge.fury.io/py/indexify.svg)](https://badge.fury.io/py/indexify)
32
+ [![Discord](https://dcbadge.vercel.app/api/server/VXkY7zVmTD?style=flat&compact=true)](https://discord.gg/VXkY7zVmTD)
33
+
34
+ This is the Python SDK to build real-time continuously running unstructured data processing pipelines with Indexify.
35
+
36
+ Start by writing and testing your pipelines locally using your data, then deploy them into the Indexify service to process data in real-time at scale.
37
+
38
+ ## Installation
39
+
40
+ ```shell
41
+ pip install indexify
42
+ ```
43
+
44
+ ## Examples
45
+ **[PDF Document Extraction](./examples/pdf_document_extraction/workflow.py)**
46
+ 1. Extracts text, tables and images from an ingested PDF file
47
+ 2. Indexes the text using MiniLM-L6-v2, the images with CLIP
48
+ 3. Writes the results into a vector database.
49
+
50
+ **[Youtube Transcription Summarizer](./examples/video_summarization/workflow.py)**
51
+ 1. Downloads Youtube Video
52
+ 2. Extracts audio from the video and transcribes using `Faster Whisper`
53
+ 3. Uses Llama 3.1 backed by `Llama.cpp` to understand and classify the nature of the video.
54
+ 4. Routes the transcription dynamically to one of the transcription summarizers to retain specific summarization attributes.
55
+ 5. Finally the entire transcription is embedded and stored in a vector database for retrieval.
56
+
57
+ ## Quick Start
58
+ 1. Write data processing functions in Python and use Pydantic objects for returning complex data types from functions
59
+ 2. Connect functions using a graph interface. Indexify automatically stores function outputs and passes them along to downstream functions.
60
+ 3. If a function returns a list, the downstream functions will be called with each item in the list in **parallel**.
61
+ 4. The input of the first function becomes the input to the HTTP endpoint of the Graph.
62
+
63
+ ## Functional Features
64
+ 1. There is **NO** limit to volume of data being ingested since we use blob stores for storing metadata and objects
65
+ 2. The server can handle 10s of 1000s of files being ingested into the graphs in parallel.
66
+ 3. The scheduler reacts to ingestion events in under 8 microseconds, so it's suitable for workflows that need to run in real time.
67
+ 4. Batch ingestion is handled gracefully by batching ingested data and scheduling for high throughput in production settings.
68
+
69
+ ```python
70
+ from pydantic import BaseModel
71
+ from indexify import indexify_function
72
+ from typing import Dict, Any, Optional, List
73
+
74
+ # Define function inputs and outputs
75
+ class Document(BaseModel):
76
+ text: str
77
+ metadata: Dict[str, Any]
78
+
79
+ class TextChunk(BaseModel):
80
+ text: str
81
+ metadata: Dict[str, Any]
82
+ embedding: Optional[List[float]] = None
83
+
84
+
85
+ # Decorate a function which is going to be part of your data processing graph
86
+ @indexify_function()
87
+ def split_text(doc: Document) -> List[TextChunk]:
88
+ midpoint = len(doc.text) // 2
89
+ first_half = TextChunk(text=doc.text[:midpoint], metadata=doc.metadata)
90
+ second_half = TextChunk(text=doc.text[midpoint:], metadata=doc.metadata)
91
+ return [first_half, second_half]
92
+
93
+ # Any requirements specified are automatically installed in production clusters
94
+ @indexify_function(requirements=["langchain_text_splitter"])
95
+ def compute_embedding(chunk: TextChunk) -> TextChunk:
96
+ chunk.embedding = [0.1, 0.2, 0.3]
97
+ return chunk
98
+
99
+ # You can constrain functions to run on specific executors
100
+ @indexify_function(executor_runtime_name="postgres-driver-image")
101
+ def write_to_db(chunk: TextChunk):
102
+ # Write to your favorite vector database
103
+ ...
104
+
105
+ ## Create a graph
106
+ from indexify import Graph
107
+
108
+ g = Graph(name="my_graph", start_node=split_text)
109
+ g.add_edge(split_text, compute_embedding)
110
+ g.add_edge(compute_embedding, write_to_db)
111
+ ```
112
+
113
+ ## Graph Execution
114
+ Every time the Graph is invoked, Indexify will provide an `Invocation Id` which can be used to know about the status of the processing and any outputs from the Graph.
115
+
116
+ ## Run the Graph Locally
117
+ ```python
118
+ from indexify import IndexifyClient
119
+
120
+ client = IndexifyClient(local=True)
121
+ client.register_graph(g)
122
+ invocation_id = client.invoke_graph_with_object(g.name, Document(text="Hello, world!", metadata={"source": "test"}))
123
+ graph_outputs = client.graph_outputs(g.name, invocation_id)
124
+ ```
125
+
126
+ ## Deploy the Graph to Indexify Server for Production
127
+ > Work In Progress - The version of the server that works with Python-based graphs hasn't been released yet. It will be released shortly. Join Discord for development updates.
128
+ ```python
129
+ from indexify import IndexifyClient
130
+
131
+ client = IndexifyClient(service_url="http://localhost:8900")
132
+ client.register_graph(g)
133
+ ```
134
+
135
+ #### Ingestion into the Service
136
+ Extraction Graphs continuously run on the Indexify Service like any other web service. Indexify Server runs the extraction graphs in parallel and in real-time when new data is ingested into the service.
137
+
138
+ ```python
139
+ output_id = client.invoke_graph_with_object(g.name, Document(text="Hello, world!", metadata={"source": "test"}))
140
+ ```
141
+
142
+ #### Retrieve Graph Outputs for a given ingestion object
143
+ ```python
144
+ graph_outputs = client.graph_outputs(g.name, output_id)
145
+ ```
146
+
147
+ #### Retrieve All Graph Inputs
148
+ ```python
149
+ graph_inputs = client.graph_inputs(g.name)
150
+ ```
151
+
@@ -0,0 +1,32 @@
1
+ indexify/__init__.py,sha256=fD9E-i9gawBNwbMd8ZIfiNsxUtL1S5VBCiyvKU5Yslc,497
2
+ indexify/base_client.py,sha256=Si1XnZ6X_mFvkYCnS6qx6axFsBpkrGiorqmKohFwvLQ,3324
3
+ indexify/cli.py,sha256=XZYU9iMVHqNzecMU6LdAyfa_JkHb0Xx35akaW319tTI,7091
4
+ indexify/client.py,sha256=sopeSA9hc7QHIHdpGq5rehSkr0tiPvdsLSJsaVzw3kY,491
5
+ indexify/data_loaders/__init__.py,sha256=Y5NEuseTcYAICRiweYw5wBQ2m2YplbsY21I7df-rdi4,1339
6
+ indexify/data_loaders/local_directory_loader.py,sha256=fCrgj5drnW71ZUdDDvcB1-VJjIs1w6Q8sEW0HSGSAiA,1247
7
+ indexify/data_loaders/url_loader.py,sha256=32SERljcq1Xsi4RdLz2dgyk2TER5pQPTtXl3gUzwHbY,1533
8
+ indexify/error.py,sha256=vjd5SPPNFIEW35GorSIodsqvm9RKHQm9kdp8t9gv-WM,111
9
+ indexify/executor/agent.py,sha256=RxpmnZ4ne-2BCv6Lv0kkkBJX3Dz71bruvaFC8V3O1D0,14563
10
+ indexify/executor/api_objects.py,sha256=nvRmKYngjrv8sc_GoYM1MSLdFMYB6hoq7XK4LMx2U-s,814
11
+ indexify/executor/downloader.py,sha256=0MPiKw0AWs3Z7ReC9l2z-3515yqq85ghPzdh485dnuw,3998
12
+ indexify/executor/executor_tasks.py,sha256=gAZ2pvza1YwGlaR1o_tJW4SXtdCgK7sLJgp4W7rOjR0,1834
13
+ indexify/executor/function_worker.py,sha256=83ih8TjAJHtrH6LqqDoSfzAj4zB7ZZPR2Voq0RMZ1T8,5410
14
+ indexify/executor/indexify_executor.py,sha256=2Ut_VX-Su_lm4b4aEROyRJ3gXx-uFHA-V7EN0sWiARE,771
15
+ indexify/executor/task_reporter.py,sha256=gnnse0v6rjjni8lNzeb-ZYq6iF2DgafKoT7dcGUZhQ4,3716
16
+ indexify/executor/task_store.py,sha256=q8s2gImsFffWeXQR0mk1Xlo1Aj_2GfclNPjQ2EA_YBo,3984
17
+ indexify/foo,sha256=e385Ws-u8zx-LOq3tdfTa-siK9pMaccdAE8_0rrp_k4,5165
18
+ indexify/functions_sdk/data_objects.py,sha256=2LqAWJ_S2Xkp4OQTmhd3InVIrBs7juV41udnSQFMMfM,840
19
+ indexify/functions_sdk/graph.py,sha256=ilImZsALzOQLwioPYqva0_FxHtrn3FscCmtVniPJecU,9764
20
+ indexify/functions_sdk/graph_validation.py,sha256=UgP_iMlUsH3jO0aBkQOwfxGQV1_x2qx9_iZvSxvuJBo,2419
21
+ indexify/functions_sdk/image.py,sha256=0A8xTjLv0jITRu0u06iCFplS_cu47Zu3fUCXq9Rc7zI,525
22
+ indexify/functions_sdk/indexify_functions.py,sha256=sJclmxvOAiAPUONbcwmUwywsUEzMvMA6gIE6zaURr1s,5955
23
+ indexify/functions_sdk/local_cache.py,sha256=cNWF67zbhbTJe3g86hyLBy3Rqzs6dNvp2SjLazGZWvw,1348
24
+ indexify/functions_sdk/object_serializer.py,sha256=HPH0Ym6-Los8lg_RHxmerrAh5fSLBHDeTZikom0eXxk,1689
25
+ indexify/local_client.py,sha256=9wPYHjG516ZX6q5sPj5Pnn9C-Fi0sghYiLaNMe7LPPk,7220
26
+ indexify/remote_client.py,sha256=oKgTqLbIxQVDqkMjQmNCOOEIM156UeYMC1jDWWSqBAQ,12297
27
+ indexify/settings.py,sha256=LSaWZ0ADIVmUv6o6dHWRC3-Ry5uLbCw2sBSg1e_U7UM,99
28
+ indexify-0.2.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
29
+ indexify-0.2.dist-info/METADATA,sha256=GuigbP7vmakmIXhhijeYAiCbhvaFZX-Np2gZAcHYwrc,6127
30
+ indexify-0.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
31
+ indexify-0.2.dist-info/entry_points.txt,sha256=Pih7WV-XMpAzI5dEvROcpLr-ybVhd9Y-AtuzBKUdcDs,49
32
+ indexify-0.2.dist-info/RECORD,,
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ indexify-cli=indexify.cli:app
3
+