indexify 0.0.43__tar.gz → 0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify-0.2/PKG-INFO +151 -0
- indexify-0.2/README.md +122 -0
- indexify-0.2/indexify/__init__.py +21 -0
- {indexify-0.0.43 → indexify-0.2}/indexify/base_client.py +48 -21
- indexify-0.2/indexify/cli.py +235 -0
- indexify-0.2/indexify/client.py +18 -0
- indexify-0.2/indexify/executor/agent.py +362 -0
- indexify-0.2/indexify/executor/api_objects.py +43 -0
- indexify-0.2/indexify/executor/downloader.py +124 -0
- indexify-0.2/indexify/executor/executor_tasks.py +72 -0
- indexify-0.2/indexify/executor/function_worker.py +177 -0
- indexify-0.2/indexify/executor/indexify_executor.py +32 -0
- indexify-0.2/indexify/executor/task_reporter.py +110 -0
- indexify-0.2/indexify/executor/task_store.py +113 -0
- indexify-0.2/indexify/foo +72 -0
- indexify-0.2/indexify/functions_sdk/data_objects.py +37 -0
- indexify-0.2/indexify/functions_sdk/graph.py +276 -0
- indexify-0.2/indexify/functions_sdk/graph_validation.py +69 -0
- indexify-0.2/indexify/functions_sdk/image.py +26 -0
- indexify-0.2/indexify/functions_sdk/indexify_functions.py +192 -0
- indexify-0.2/indexify/functions_sdk/local_cache.py +46 -0
- indexify-0.2/indexify/functions_sdk/object_serializer.py +61 -0
- indexify-0.2/indexify/local_client.py +183 -0
- indexify-0.2/indexify/remote_client.py +319 -0
- {indexify-0.0.43 → indexify-0.2}/pyproject.toml +12 -3
- indexify-0.0.43/PKG-INFO +0 -66
- indexify-0.0.43/README.md +0 -43
- indexify-0.0.43/indexify/__init__.py +0 -22
- indexify-0.0.43/indexify/client.py +0 -790
- indexify-0.0.43/indexify/error.py +0 -30
- indexify-0.0.43/indexify/extraction_policy.py +0 -75
- indexify-0.0.43/indexify/extractor_sdk/__init__.py +0 -14
- indexify-0.0.43/indexify/extractor_sdk/data.py +0 -100
- indexify-0.0.43/indexify/extractor_sdk/extractor.py +0 -225
- indexify-0.0.43/indexify/extractor_sdk/utils.py +0 -102
- indexify-0.0.43/indexify/extractors/__init__.py +0 -0
- indexify-0.0.43/indexify/extractors/embedding.py +0 -55
- indexify-0.0.43/indexify/extractors/pdf_parser.py +0 -93
- indexify-0.0.43/indexify/graph.py +0 -133
- indexify-0.0.43/indexify/local_runner.py +0 -128
- indexify-0.0.43/indexify/runner.py +0 -22
- indexify-0.0.43/indexify/utils.py +0 -7
- {indexify-0.0.43 → indexify-0.2}/LICENSE.txt +0 -0
- {indexify-0.0.43 → indexify-0.2}/indexify/data_loaders/__init__.py +0 -0
- {indexify-0.0.43 → indexify-0.2}/indexify/data_loaders/local_directory_loader.py +0 -0
- {indexify-0.0.43 → indexify-0.2}/indexify/data_loaders/url_loader.py +0 -0
- /indexify-0.0.43/indexify/exceptions.py → /indexify-0.2/indexify/error.py +0 -0
- {indexify-0.0.43 → indexify-0.2}/indexify/settings.py +0 -0
indexify-0.2/PKG-INFO
ADDED
@@ -0,0 +1,151 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: indexify
|
3
|
+
Version: 0.2
|
4
|
+
Summary: Python Client for Indexify
|
5
|
+
Home-page: https://github.com/tensorlakeai/indexify
|
6
|
+
License: Apache 2.0
|
7
|
+
Author: Tensorlake Inc.
|
8
|
+
Author-email: support@tensorlake.ai
|
9
|
+
Requires-Python: >=3.9,<4.0
|
10
|
+
Classifier: License :: Other/Proprietary License
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
16
|
+
Requires-Dist: cloudpickle (>=3,<4)
|
17
|
+
Requires-Dist: docker (>=7.1.0,<8.0.0)
|
18
|
+
Requires-Dist: httpx-sse (>=0.4.0,<0.5.0)
|
19
|
+
Requires-Dist: httpx[http2] (>=0,<1)
|
20
|
+
Requires-Dist: msgpack (>=1.1.0,<2.0.0)
|
21
|
+
Requires-Dist: nanoid (>=2.0.0,<3.0.0)
|
22
|
+
Requires-Dist: pydantic (>=2.9.2,<3.0.0)
|
23
|
+
Requires-Dist: pyyaml (>=6,<7)
|
24
|
+
Requires-Dist: rich (>=13,<14)
|
25
|
+
Requires-Dist: typer (>=0.12.5,<0.13.0)
|
26
|
+
Project-URL: Repository, https://github.com/tensorlakeai/indexify
|
27
|
+
Description-Content-Type: text/markdown
|
28
|
+
|
29
|
+
# Indexify Python SDK
|
30
|
+
|
31
|
+
[PyPI version](https://badge.fury.io/py/indexify)
|
32
|
+
[Discord](https://discord.gg/VXkY7zVmTD)
|
33
|
+
|
34
|
+
This is the Python SDK to build real-time continuously running unstructured data processing pipelines with Indexify.
|
35
|
+
|
36
|
+
Start by writing and testing your pipelines locally using your data, then deploy them into the Indexify service to process data in real-time at scale.
|
37
|
+
|
38
|
+
## Installation
|
39
|
+
|
40
|
+
```shell
|
41
|
+
pip install indexify
|
42
|
+
```
|
43
|
+
|
44
|
+
## Examples
|
45
|
+
**[PDF Document Extraction](./examples/pdf_document_extraction/workflow.py)**
|
46
|
+
1. Extracts text, tables and images from an ingested PDF file
|
47
|
+
2. Indexes the text using MiniLM-L6-v2, the images with CLIP
|
48
|
+
3. Writes the results into a vector database.
|
49
|
+
|
50
|
+
**[Youtube Transcription Summarizer](./examples/video_summarization/workflow.py)**
|
51
|
+
1. Downloads Youtube Video
|
52
|
+
2. Extracts audio from the video and transcribes using `Faster Whisper`
|
53
|
+
3. Uses Llama 3.1 backed by `Llama.cpp` to understand and classify the nature of the video.
|
54
|
+
4. Routes the transcription dynamically to one of the transcription summarizers to retain specific summarization attributes.
|
55
|
+
5. Finally the entire transcription is embedded and stored in a vector database for retrieval.
|
56
|
+
|
57
|
+
## Quick Start
|
58
|
+
1. Write data processing functions in Python and use Pydantic objects for returning complex data types from functions
|
59
|
+
2. Connect functions using a graph interface. Indexify automatically stores function outputs and passes them along to downstream functions.
|
60
|
+
3. If a function returns a list, the downstream functions will be called with each item in the list in **parallel**.
|
61
|
+
4. The input of the first function becomes the input to the HTTP endpoint of the Graph.
|
62
|
+
|
63
|
+
## Functional Features
|
64
|
+
1. There is **NO** limit to the volume of data being ingested, since we use blob stores for storing metadata and objects.
|
65
|
+
2. The server can handle 10s of 1000s of files being ingested into the graphs in parallel.
|
66
|
+
3. The scheduler reacts to ingestion events in under 8 microseconds, so it's suitable for workflows that need to run in real time.
|
67
|
+
4. Batch ingestion is handled gracefully by batching ingested data and scheduling for high throughput in production settings.
|
68
|
+
|
69
|
+
```python
|
70
|
+
from pydantic import BaseModel
|
71
|
+
from indexify import indexify_function
|
72
|
+
from typing import Dict, Any, Optional, List
|
73
|
+
|
74
|
+
# Define function inputs and outputs
|
75
|
+
class Document(BaseModel):
|
76
|
+
text: str
|
77
|
+
metadata: Dict[str, Any]
|
78
|
+
|
79
|
+
class TextChunk(BaseModel):
|
80
|
+
text: str
|
81
|
+
metadata: Dict[str, Any]
|
82
|
+
embedding: Optional[List[float]] = None
|
83
|
+
|
84
|
+
|
85
|
+
# Decorate a function which is going to be part of your data processing graph
|
86
|
+
@indexify_function()
|
87
|
+
def split_text(doc: Document) -> List[TextChunk]:
|
88
|
+
midpoint = len(doc.text) // 2
|
89
|
+
first_half = TextChunk(text=doc.text[:midpoint], metadata=doc.metadata)
|
90
|
+
second_half = TextChunk(text=doc.text[midpoint:], metadata=doc.metadata)
|
91
|
+
return [first_half, second_half]
|
92
|
+
|
93
|
+
# Any requirements specified are automatically installed in production clusters
|
94
|
+
@indexify_function(requirements=["langchain_text_splitter"])
|
95
|
+
def compute_embedding(chunk: TextChunk) -> TextChunk:
|
96
|
+
chunk.embedding = [0.1, 0.2, 0.3]
|
97
|
+
return chunk
|
98
|
+
|
99
|
+
# You can constrain functions to run on specific executors
|
100
|
+
@indexify_function(executor_runtime_name="postgres-driver-image")
|
101
|
+
def write_to_db(chunk: TextChunk):
|
102
|
+
# Write to your favorite vector database
|
103
|
+
...
|
104
|
+
|
105
|
+
## Create a graph
|
106
|
+
from indexify import Graph
|
107
|
+
|
108
|
+
g = Graph(name="my_graph", start_node=split_text)
|
109
|
+
g.add_edge(split_text, compute_embedding)
|
110
|
+
g.add_edge(compute_embedding, write_to_db)
|
111
|
+
```
|
112
|
+
|
113
|
+
## Graph Execution
|
114
|
+
Every time the Graph is invoked, Indexify will provide an `Invocation Id` which can be used to know about the status of the processing and any outputs from the Graph.
|
115
|
+
|
116
|
+
## Run the Graph Locally
|
117
|
+
```python
|
118
|
+
from indexify import IndexifyClient
|
119
|
+
|
120
|
+
client = IndexifyClient(local=True)
|
121
|
+
client.register_graph(g)
|
122
|
+
invocation_id = client.invoke_graph_with_object(g.name, Document(text="Hello, world!", metadata={"source": "test"}))
|
123
|
+
graph_outputs = client.graph_outputs(g.name, invocation_id)
|
124
|
+
```
|
125
|
+
|
126
|
+
## Deploy the Graph to Indexify Server for Production
|
127
|
+
> Work In Progress - The version of the server that works with Python-based graphs hasn't been released yet. It will be released shortly. Join Discord for development updates.
|
128
|
+
```python
|
129
|
+
from indexify import IndexifyClient
|
130
|
+
|
131
|
+
client = IndexifyClient(service_url="http://localhost:8900")
|
132
|
+
client.register_graph(g)
|
133
|
+
```
|
134
|
+
|
135
|
+
#### Ingestion into the Service
|
136
|
+
Extraction Graphs continuously run on the Indexify Service like any other web service. Indexify Server runs the extraction graphs in parallel and in real-time when new data is ingested into the service.
|
137
|
+
|
138
|
+
```python
|
139
|
+
output_id = client.invoke_graph_with_object(g.name, Document(text="Hello, world!", metadata={"source": "test"}))
|
140
|
+
```
|
141
|
+
|
142
|
+
#### Retrieve Graph Outputs for a given ingestion object
|
143
|
+
```python
|
144
|
+
graph_outputs = client.graph_outputs(g.name, output_id)
|
145
|
+
```
|
146
|
+
|
147
|
+
#### Retrieve All Graph Inputs
|
148
|
+
```python
|
149
|
+
graph_inputs = client.graph_inputs(g.name)
|
150
|
+
```
|
151
|
+
|
indexify-0.2/README.md
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
# Indexify Python SDK
|
2
|
+
|
3
|
+
[PyPI version](https://badge.fury.io/py/indexify)
|
4
|
+
[Discord](https://discord.gg/VXkY7zVmTD)
|
5
|
+
|
6
|
+
This is the Python SDK to build real-time continuously running unstructured data processing pipelines with Indexify.
|
7
|
+
|
8
|
+
Start by writing and testing your pipelines locally using your data, then deploy them into the Indexify service to process data in real-time at scale.
|
9
|
+
|
10
|
+
## Installation
|
11
|
+
|
12
|
+
```shell
|
13
|
+
pip install indexify
|
14
|
+
```
|
15
|
+
|
16
|
+
## Examples
|
17
|
+
**[PDF Document Extraction](./examples/pdf_document_extraction/workflow.py)**
|
18
|
+
1. Extracts text, tables and images from an ingested PDF file
|
19
|
+
2. Indexes the text using MiniLM-L6-v2, the images with CLIP
|
20
|
+
3. Writes the results into a vector database.
|
21
|
+
|
22
|
+
**[Youtube Transcription Summarizer](./examples/video_summarization/workflow.py)**
|
23
|
+
1. Downloads Youtube Video
|
24
|
+
2. Extracts audio from the video and transcribes using `Faster Whisper`
|
25
|
+
3. Uses Llama 3.1 backed by `Llama.cpp` to understand and classify the nature of the video.
|
26
|
+
4. Routes the transcription dynamically to one of the transcription summarizers to retain specific summarization attributes.
|
27
|
+
5. Finally the entire transcription is embedded and stored in a vector database for retrieval.
|
28
|
+
|
29
|
+
## Quick Start
|
30
|
+
1. Write data processing functions in Python and use Pydantic objects for returning complex data types from functions
|
31
|
+
2. Connect functions using a graph interface. Indexify automatically stores function outputs and passes them along to downstream functions.
|
32
|
+
3. If a function returns a list, the downstream functions will be called with each item in the list in **parallel**.
|
33
|
+
4. The input of the first function becomes the input to the HTTP endpoint of the Graph.
|
34
|
+
|
35
|
+
## Functional Features
|
36
|
+
1. There is **NO** limit to the volume of data being ingested, since we use blob stores for storing metadata and objects.
|
37
|
+
2. The server can handle 10s of 1000s of files being ingested into the graphs in parallel.
|
38
|
+
3. The scheduler reacts to ingestion events in under 8 microseconds, so it's suitable for workflows that need to run in real time.
|
39
|
+
4. Batch ingestion is handled gracefully by batching ingested data and scheduling for high throughput in production settings.
|
40
|
+
|
41
|
+
```python
|
42
|
+
from pydantic import BaseModel
|
43
|
+
from indexify import indexify_function
|
44
|
+
from typing import Dict, Any, Optional, List
|
45
|
+
|
46
|
+
# Define function inputs and outputs
|
47
|
+
class Document(BaseModel):
|
48
|
+
text: str
|
49
|
+
metadata: Dict[str, Any]
|
50
|
+
|
51
|
+
class TextChunk(BaseModel):
|
52
|
+
text: str
|
53
|
+
metadata: Dict[str, Any]
|
54
|
+
embedding: Optional[List[float]] = None
|
55
|
+
|
56
|
+
|
57
|
+
# Decorate a function which is going to be part of your data processing graph
|
58
|
+
@indexify_function()
|
59
|
+
def split_text(doc: Document) -> List[TextChunk]:
|
60
|
+
midpoint = len(doc.text) // 2
|
61
|
+
first_half = TextChunk(text=doc.text[:midpoint], metadata=doc.metadata)
|
62
|
+
second_half = TextChunk(text=doc.text[midpoint:], metadata=doc.metadata)
|
63
|
+
return [first_half, second_half]
|
64
|
+
|
65
|
+
# Any requirements specified are automatically installed in production clusters
|
66
|
+
@indexify_function(requirements=["langchain_text_splitter"])
|
67
|
+
def compute_embedding(chunk: TextChunk) -> TextChunk:
|
68
|
+
chunk.embedding = [0.1, 0.2, 0.3]
|
69
|
+
return chunk
|
70
|
+
|
71
|
+
# You can constrain functions to run on specific executors
|
72
|
+
@indexify_function(executor_runtime_name="postgres-driver-image")
|
73
|
+
def write_to_db(chunk: TextChunk):
|
74
|
+
# Write to your favorite vector database
|
75
|
+
...
|
76
|
+
|
77
|
+
## Create a graph
|
78
|
+
from indexify import Graph
|
79
|
+
|
80
|
+
g = Graph(name="my_graph", start_node=split_text)
|
81
|
+
g.add_edge(split_text, compute_embedding)
|
82
|
+
g.add_edge(compute_embedding, write_to_db)
|
83
|
+
```
|
84
|
+
|
85
|
+
## Graph Execution
|
86
|
+
Every time the Graph is invoked, Indexify will provide an `Invocation Id` which can be used to know about the status of the processing and any outputs from the Graph.
|
87
|
+
|
88
|
+
## Run the Graph Locally
|
89
|
+
```python
|
90
|
+
from indexify import IndexifyClient
|
91
|
+
|
92
|
+
client = IndexifyClient(local=True)
|
93
|
+
client.register_graph(g)
|
94
|
+
invocation_id = client.invoke_graph_with_object(g.name, Document(text="Hello, world!", metadata={"source": "test"}))
|
95
|
+
graph_outputs = client.graph_outputs(g.name, invocation_id)
|
96
|
+
```
|
97
|
+
|
98
|
+
## Deploy the Graph to Indexify Server for Production
|
99
|
+
> Work In Progress - The version of the server that works with Python-based graphs hasn't been released yet. It will be released shortly. Join Discord for development updates.
|
100
|
+
```python
|
101
|
+
from indexify import IndexifyClient
|
102
|
+
|
103
|
+
client = IndexifyClient(service_url="http://localhost:8900")
|
104
|
+
client.register_graph(g)
|
105
|
+
```
|
106
|
+
|
107
|
+
#### Ingestion into the Service
|
108
|
+
Extraction Graphs continuously run on the Indexify Service like any other web service. Indexify Server runs the extraction graphs in parallel and in real-time when new data is ingested into the service.
|
109
|
+
|
110
|
+
```python
|
111
|
+
output_id = client.invoke_graph_with_object(g.name, Document(text="Hello, world!", metadata={"source": "test"}))
|
112
|
+
```
|
113
|
+
|
114
|
+
#### Retrieve Graph Outputs for a given ingestion object
|
115
|
+
```python
|
116
|
+
graph_outputs = client.graph_outputs(g.name, output_id)
|
117
|
+
```
|
118
|
+
|
119
|
+
#### Retrieve All Graph Inputs
|
120
|
+
```python
|
121
|
+
graph_inputs = client.graph_inputs(g.name)
|
122
|
+
```
|
@@ -0,0 +1,21 @@
|
|
1
|
+
from . import data_loaders
|
2
|
+
from .client import create_client
|
3
|
+
from .functions_sdk.graph import Graph
|
4
|
+
from .functions_sdk.indexify_functions import (
|
5
|
+
indexify_function,
|
6
|
+
indexify_router,
|
7
|
+
)
|
8
|
+
from .local_client import LocalClient
|
9
|
+
from .remote_client import RemoteClient
|
10
|
+
from .settings import DEFAULT_SERVICE_URL
|
11
|
+
|
12
|
+
__all__ = [
|
13
|
+
"data_loaders",
|
14
|
+
"Graph",
|
15
|
+
"indexify_function",
|
16
|
+
"indexify_router",
|
17
|
+
"DEFAULT_SERVICE_URL",
|
18
|
+
"RemoteClient",
|
19
|
+
"LocalClient",
|
20
|
+
"create_client",
|
21
|
+
]
|
@@ -1,41 +1,67 @@
|
|
1
1
|
from abc import ABC, abstractmethod
|
2
2
|
from typing import Any, Dict, List, Optional, Union
|
3
3
|
|
4
|
-
from
|
4
|
+
from pydantic import Json
|
5
5
|
|
6
|
+
from indexify.functions_sdk.graph import Graph
|
6
7
|
|
7
|
-
|
8
|
+
|
9
|
+
class IndexifyClient(ABC):
|
8
10
|
|
9
11
|
### Operational APIs
|
10
12
|
@abstractmethod
|
11
|
-
def
|
13
|
+
def register_compute_graph(self, graph: Graph):
|
14
|
+
"""
|
15
|
+
Register a compute graph.
|
16
|
+
graph: Graph: The graph to be registered
|
17
|
+
"""
|
12
18
|
pass
|
13
19
|
|
14
20
|
@abstractmethod
|
15
|
-
def graphs(self) -> str:
|
21
|
+
def graphs(self) -> List[str]:
|
22
|
+
"""
|
23
|
+
Get the graphs.
|
24
|
+
return: List[str]: The graphs
|
25
|
+
"""
|
16
26
|
pass
|
17
27
|
|
18
28
|
@abstractmethod
|
19
|
-
def namespaces(self) -> str:
|
29
|
+
def namespaces(self) -> List[str]:
|
30
|
+
"""
|
31
|
+
Get the namespaces.
|
32
|
+
return: List[str]: The namespaces
|
33
|
+
"""
|
20
34
|
pass
|
21
35
|
|
22
36
|
@abstractmethod
|
23
37
|
def create_namespace(self, namespace: str):
|
38
|
+
"""
|
39
|
+
Create a namespace.
|
40
|
+
namespace: str: The name of the namespace to be created
|
41
|
+
"""
|
24
42
|
pass
|
25
43
|
|
26
44
|
### Ingestion APIs
|
27
45
|
@abstractmethod
|
28
|
-
def invoke_graph_with_object(
|
46
|
+
def invoke_graph_with_object(
|
47
|
+
self, graph: str, block_until_done: bool = False, **kwargs
|
48
|
+
) -> str:
|
29
49
|
"""
|
30
50
|
Invokes a graph with an input object.
|
31
51
|
graph: str: The name of the graph to invoke
|
32
|
-
|
52
|
+
kwargs: Any: Named arguments to be passed to the graph. Example: url="https://www.google.com", web_page_text="Hello world!"
|
33
53
|
return: str: The ID of the ingested object
|
34
54
|
"""
|
35
55
|
pass
|
36
56
|
|
37
57
|
@abstractmethod
|
38
|
-
def invoke_graph_with_file(
|
58
|
+
def invoke_graph_with_file(
|
59
|
+
self,
|
60
|
+
graph: str,
|
61
|
+
path: str,
|
62
|
+
metadata: Optional[Dict[str, Json]] = None,
|
63
|
+
block_until_done: bool = False,
|
64
|
+
) -> str:
|
39
65
|
"""
|
40
66
|
Invokes a graph with an input file. The file's mimetype is appropriately detected.
|
41
67
|
graph: str: The name of the graph to invoke
|
@@ -44,10 +70,21 @@ class BaseClient(ABC):
|
|
44
70
|
"""
|
45
71
|
pass
|
46
72
|
|
73
|
+
@abstractmethod
|
74
|
+
def rerun_graph(self, graph: str):
|
75
|
+
"""
|
76
|
+
Rerun a graph.
|
77
|
+
graph: str: The name of the graph to rerun
|
78
|
+
"""
|
79
|
+
pass
|
80
|
+
|
47
81
|
### Retrieval APIs
|
48
82
|
@abstractmethod
|
49
|
-
def
|
50
|
-
self,
|
83
|
+
def graph_outputs(
|
84
|
+
self,
|
85
|
+
graph: str,
|
86
|
+
invocation_id: str,
|
87
|
+
fn_name: Optional[str],
|
51
88
|
) -> Union[Dict[str, List[Any]], List[Any]]:
|
52
89
|
"""
|
53
90
|
Returns the extracted objects by a graph for an ingested object. If the extractor name is provided, only the objects extracted by that extractor are returned.
|
@@ -55,17 +92,7 @@ class BaseClient(ABC):
|
|
55
92
|
graph: str: The name of the graph
|
56
93
|
ingested_object_id: str: The ID of the ingested object
|
57
94
|
extractor_name: Optional[str]: The name of the extractor whose output is to be returned if provided
|
95
|
+
block_until_done: bool = True: If True, the method will block until the extraction is done. If False, the method will return immediately.
|
58
96
|
return: Union[Dict[str, List[Any]], List[Any]]: The extracted objects. If the extractor name is provided, the output is a list of extracted objects by the extractor. If the extractor name is not provided, the output is a dictionary with the extractor name as the key and the extracted objects as the value. If no objects are found, an empty list is returned.
|
59
97
|
"""
|
60
98
|
pass
|
61
|
-
|
62
|
-
@abstractmethod
|
63
|
-
def features(
|
64
|
-
self, object_id: str, graph: Optional[str]
|
65
|
-
) -> Union[Dict[str, List[Feature]], List[Feature]]:
|
66
|
-
"""
|
67
|
-
Returns the features of an object.
|
68
|
-
object_id: str: The ID of the object
|
69
|
-
return: List[Feature]: The features associated with the object that were extracted. If a graph name is provided, only the features extracted by that graph are returned.
|
70
|
-
"""
|
71
|
-
pass
|
@@ -0,0 +1,235 @@
|
|
1
|
+
import asyncio
|
2
|
+
import io
|
3
|
+
import os
|
4
|
+
import shutil
|
5
|
+
import signal
|
6
|
+
import subprocess
|
7
|
+
import sys
|
8
|
+
import threading
|
9
|
+
import time
|
10
|
+
from typing import Annotated, List, Optional
|
11
|
+
|
12
|
+
import docker
|
13
|
+
import nanoid
|
14
|
+
import typer
|
15
|
+
from rich.console import Console
|
16
|
+
from rich.panel import Panel
|
17
|
+
from rich.text import Text
|
18
|
+
from rich.theme import Theme
|
19
|
+
|
20
|
+
from indexify.executor.agent import ExtractorAgent
|
21
|
+
from indexify.executor.function_worker import FunctionWorker
|
22
|
+
from indexify.functions_sdk.image import Image
|
23
|
+
|
24
|
+
custom_theme = Theme(
|
25
|
+
{
|
26
|
+
"info": "cyan",
|
27
|
+
"warning": "yellow",
|
28
|
+
"error": "red",
|
29
|
+
"highlight": "magenta",
|
30
|
+
}
|
31
|
+
)
|
32
|
+
|
33
|
+
console = Console(theme=custom_theme)
|
34
|
+
|
35
|
+
app = typer.Typer(pretty_exceptions_enable=False, no_args_is_help=True)
|
36
|
+
|
37
|
+
|
38
|
+
@app.command(
    help="Run server and executor in dev mode (Not recommended for production.)"
)
def server_dev_mode():
    """Run the Indexify server binary and a local executor side by side.

    If ``~/.indexify/indexify-server`` does not exist, the installer script
    served at https://getindexify.ai is fetched with ``curl`` and run through
    the shell, and the resulting ``indexify-server`` binary is moved from the
    current directory into ``~/.indexify``.

    Both child processes have their combined stdout/stderr echoed to this
    process's stdout. SIGINT/SIGTERM terminate the children (escalating to
    ``kill`` after 5 seconds) and exit with status 0. The function returns
    once every child has exited on its own.
    """
    indexify_server_path = os.path.expanduser("~/.indexify/indexify-server")
    if not os.path.exists(indexify_server_path):
        print("indexify-server not found. Downloading...")
        try:
            # SECURITY NOTE: the downloaded script is executed by the shell
            # as-is; this trusts the remote host completely. ``-f`` makes curl
            # fail on HTTP errors so an error page is never handed to the
            # shell as if it were the installer.
            download_command = subprocess.check_output(
                ["curl", "-f", "-s", "https://getindexify.ai"],
                universal_newlines=True,
            )
            subprocess.run(download_command, shell=True, check=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers the case where curl itself cannot be launched.
            print(f"failed to download indexify-server: {e}")
            sys.exit(1)
        try:
            os.makedirs(os.path.dirname(indexify_server_path), exist_ok=True)
            shutil.move("indexify-server", indexify_server_path)
        except Exception as e:
            print(f"failed to move indexify-server to {indexify_server_path}: {e}")
            sys.exit(1)
    print("starting indexify server and executor in dev mode...")
    print("press Ctrl+C to stop the server and executor.")
    print(f"server binary path: {indexify_server_path}")

    # Keep each command as an argv list: splitting a command string on
    # whitespace would break a server path that contains spaces.
    commands = [[indexify_server_path], ["indexify-cli", "executor"]]

    processes = []
    stop_event = threading.Event()

    def handle_output(process):
        # Relay the child's merged stdout/stderr line by line until EOF.
        for line in iter(process.stdout.readline, ""):
            sys.stdout.write(line)
            sys.stdout.flush()

    def terminate_processes():
        # Idempotent: the signal handler and the ``finally`` clause below can
        # both reach this on Ctrl+C.
        if stop_event.is_set():
            return
        print("Terminating processes...")
        stop_event.set()
        for process in processes:
            if process.poll() is None:
                try:
                    process.terminate()
                    process.wait(timeout=5)
                except subprocess.TimeoutExpired:
                    print(f"Force killing process {process.pid}")
                    process.kill()

    def signal_handler(sig, frame):
        print("\nCtrl+C pressed. Shutting down...")
        terminate_processes()
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        # Spawning happens inside the try so that a failure to launch the
        # second command still tears down the first one.
        for argv in commands:
            process = subprocess.Popen(
                argv,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                bufsize=1,
                universal_newlines=True,
                # Same effect as preexec_fn=os.setsid on POSIX, but safe to
                # use once the output-relay threads are running.
                start_new_session=True,
            )
            processes.append(process)

            thread = threading.Thread(target=handle_output, args=(process,))
            thread.daemon = True
            thread.start()

        while True:
            time.sleep(1)
            if all(process.poll() is not None for process in processes):
                print("All processes have finished.")
                break
    except KeyboardInterrupt:
        signal_handler(None, None)
    finally:
        terminate_processes()

    print("Script execution completed.")
|
119
|
+
|
120
|
+
|
121
|
+
@app.command(help="Build image for function names")
def build_image(workflow_file_path: str, func_names: List[str]):
    """Build a container image for each named function in a workflow file.

    The workflow file is executed in a fresh globals dictionary, and every
    top-level name that matches an entry of ``func_names`` is handed to
    ``_create_image_for_func``.

    Args:
        workflow_file_path: Path of the Python file defining the functions.
        func_names: Names of the top-level objects to build images for.

    Raises:
        Exception: If ``workflow_file_path`` does not exist.
    """
    globals_dict = {}

    # Only the read is guarded, so a FileNotFoundError raised by the workflow
    # code itself is not misreported as a missing workflow file.
    try:
        with open(workflow_file_path) as workflow_file:
            workflow_source = workflow_file.read()
    except FileNotFoundError as e:
        raise Exception(
            f"Could not find workflow file to execute at: `{workflow_file_path}`"
        ) from e

    # NOTE(review): this runs arbitrary code from the given file; only point
    # the command at workflow files you trust.
    exec(workflow_source, globals_dict)

    requested = set(func_names)
    found_funcs = []
    for name, obj in globals_dict.items():
        if name in requested:
            found_funcs.append(name)
            _create_image_for_func(func_name=name, func_obj=obj)

    console.print(
        Text("Processed functions: ", style="cyan"),
        Text(f"{found_funcs}", style="green"),
    )

    # Surface requested names the workflow file never defined instead of
    # dropping them silently.
    missing = [name for name in func_names if name not in found_funcs]
    if missing:
        console.print(
            Text("Functions not found in workflow file: ", style="yellow"),
            Text(f"{missing}", style="yellow bold"),
        )
|
144
|
+
|
145
|
+
|
146
|
+
@app.command(help="Joins the extractors to the coordinator server")
def executor(
    server_addr: str = "localhost:8900",
    workers: Annotated[
        int, typer.Option(help="number of worker processes for extraction")
    ] = 1,
    config_path: Optional[str] = typer.Option(
        None, help="Path to the TLS configuration file"
    ),
    executor_cache: Optional[str] = typer.Option(
        "~/.indexify/executor_cache", help="Path to the executor cache directory"
    ),
):
    """Start an executor agent and run it until its coroutine completes.

    A fresh executor ID is generated with ``nanoid``, the configuration is
    printed, and the cache directory is wiped and recreated before the agent
    is constructed.

    Args:
        server_addr: Address passed to the agent as ``server_addr``.
        workers: Worker count given to both ``FunctionWorker`` and the agent.
        config_path: Passed to the agent as ``config_path``.
        executor_cache: Directory used as the agent's ``code_path``. Any
            existing directory at this path is deleted first.
    """
    from pathlib import Path

    executor_id = nanoid.generate()

    # The summary shows the cache path exactly as given, before expansion.
    summary = "\n".join(
        [
            f"Number of workers: {workers}",
            f"Config path: {config_path}",
            f"Server address: {server_addr}",
            f"Executor ID: {executor_id}",
            f"Executor cache: {executor_cache}",
        ]
    )
    console.print(Panel(summary, title="Agent Configuration", border_style="info"))

    function_worker = FunctionWorker(workers=workers)

    # Start from an empty cache directory on every launch.
    cache_dir = Path(executor_cache).expanduser().absolute()
    if os.path.exists(cache_dir):
        shutil.rmtree(cache_dir)
    cache_dir.mkdir(parents=True, exist_ok=True)

    agent = ExtractorAgent(
        executor_id,
        num_workers=workers,
        function_worker=function_worker,
        server_addr=server_addr,
        config_path=config_path,
        code_path=cache_dir,
    )

    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(agent.run())
    except asyncio.CancelledError as ex:
        console.print(Text(f"Exiting gracefully: {ex}", style="bold yellow"))
|
193
|
+
|
194
|
+
|
195
|
+
def _create_image_for_func(func_name, func_obj):
    """Announce the build for ``func_name`` and build ``func_obj.image``."""
    announcement = (
        Text("Creating container for ", style="cyan"),
        Text(f"`{func_name}`", style="cyan bold"),
    )
    console.print(*announcement)
    _build_image(image=func_obj.image, func_name=func_name)
|
201
|
+
|
202
|
+
|
203
|
+
def _build_image(image: Image, func_name: Optional[str] = None):
    """Build a Docker image from an ``Image`` definition.

    A Dockerfile is generated in memory from ``image._base_image`` plus one
    ``RUN`` line per entry of ``image._run_strs``. It is printed to the
    console and then built, tagged ``{image._image_name}:{image._tag}``.

    Args:
        image: Image definition supplying the base image, run commands, name
            and tag.
        func_name: Accepted for call-site symmetry; not used by the build.

    Exits the process with status -1 if the Docker daemon cannot be reached.
    """
    try:
        client = docker.from_env()
        client.ping()
    except Exception as e:
        console.print(
            Text("Unable to connect with docker: ", style="red bold"),
            Text(f"{e}", style="red"),
        )
        sys.exit(-1)

    # Same text the original triple-quoted template produced.
    docker_file_str = f"\nFROM {image._base_image}\n\nWORKDIR /app\n\n"
    docker_file_str += "\n".join("RUN " + cmd for cmd in image._run_strs)

    console.print("Creating image using Dockerfile contents:", style="cyan bold")
    console.print(f"{docker_file_str}", style="magenta")

    # Reuse the client that was already verified with ping() above.
    client.images.build(
        fileobj=io.BytesIO(docker_file_str.encode()),
        tag=f"{image._image_name}:{image._tag}",
        rm=True,
    )
|
@@ -0,0 +1,18 @@
|
|
1
|
+
from typing import Optional
|
2
|
+
|
3
|
+
from .base_client import IndexifyClient
|
4
|
+
from .local_client import LocalClient
|
5
|
+
from .remote_client import RemoteClient
|
6
|
+
from .settings import DEFAULT_SERVICE_URL
|
7
|
+
|
8
|
+
|
9
|
+
def create_client(
    service_url: str = DEFAULT_SERVICE_URL,
    config_path: Optional[str] = None,
    local: bool = False,
    *args,
    **kwargs,
) -> IndexifyClient:
    """Return a ``LocalClient`` or a ``RemoteClient`` depending on ``local``.

    Args:
        service_url: Forwarded to ``RemoteClient`` when ``local`` is false.
        config_path: Forwarded to ``RemoteClient`` when ``local`` is false.
        local: When true, a ``LocalClient`` is returned and every other
            argument is ignored.
        *args: Accepted but not forwarded to either client.
        **kwargs: Extra keyword arguments forwarded to ``RemoteClient``.
    """
    if not local:
        return RemoteClient(
            config_path=config_path, service_url=service_url, **kwargs
        )
    return LocalClient()
|