indexify 0.0.42__tar.gz → 0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. indexify-0.2/PKG-INFO +151 -0
  2. indexify-0.2/README.md +122 -0
  3. indexify-0.2/indexify/__init__.py +21 -0
  4. {indexify-0.0.42 → indexify-0.2}/indexify/base_client.py +48 -21
  5. indexify-0.2/indexify/cli.py +235 -0
  6. indexify-0.2/indexify/client.py +18 -0
  7. indexify-0.2/indexify/executor/agent.py +362 -0
  8. indexify-0.2/indexify/executor/api_objects.py +43 -0
  9. indexify-0.2/indexify/executor/downloader.py +124 -0
  10. indexify-0.2/indexify/executor/executor_tasks.py +72 -0
  11. indexify-0.2/indexify/executor/function_worker.py +177 -0
  12. indexify-0.2/indexify/executor/indexify_executor.py +32 -0
  13. indexify-0.2/indexify/executor/task_reporter.py +110 -0
  14. indexify-0.2/indexify/executor/task_store.py +113 -0
  15. indexify-0.2/indexify/foo +72 -0
  16. indexify-0.2/indexify/functions_sdk/data_objects.py +37 -0
  17. indexify-0.2/indexify/functions_sdk/graph.py +276 -0
  18. indexify-0.2/indexify/functions_sdk/graph_validation.py +69 -0
  19. indexify-0.2/indexify/functions_sdk/image.py +26 -0
  20. indexify-0.2/indexify/functions_sdk/indexify_functions.py +192 -0
  21. indexify-0.2/indexify/functions_sdk/local_cache.py +46 -0
  22. indexify-0.2/indexify/functions_sdk/object_serializer.py +61 -0
  23. indexify-0.2/indexify/local_client.py +183 -0
  24. indexify-0.2/indexify/remote_client.py +319 -0
  25. {indexify-0.0.42 → indexify-0.2}/pyproject.toml +12 -3
  26. indexify-0.0.42/PKG-INFO +0 -66
  27. indexify-0.0.42/README.md +0 -43
  28. indexify-0.0.42/indexify/__init__.py +0 -22
  29. indexify-0.0.42/indexify/client.py +0 -790
  30. indexify-0.0.42/indexify/error.py +0 -30
  31. indexify-0.0.42/indexify/extraction_policy.py +0 -75
  32. indexify-0.0.42/indexify/extractor_sdk/__init__.py +0 -14
  33. indexify-0.0.42/indexify/extractor_sdk/data.py +0 -100
  34. indexify-0.0.42/indexify/extractor_sdk/extractor.py +0 -223
  35. indexify-0.0.42/indexify/extractor_sdk/utils.py +0 -102
  36. indexify-0.0.42/indexify/extractors/__init__.py +0 -0
  37. indexify-0.0.42/indexify/extractors/embedding.py +0 -55
  38. indexify-0.0.42/indexify/extractors/pdf_parser.py +0 -93
  39. indexify-0.0.42/indexify/graph.py +0 -133
  40. indexify-0.0.42/indexify/local_runner.py +0 -128
  41. indexify-0.0.42/indexify/runner.py +0 -22
  42. indexify-0.0.42/indexify/utils.py +0 -7
  43. {indexify-0.0.42 → indexify-0.2}/LICENSE.txt +0 -0
  44. {indexify-0.0.42 → indexify-0.2}/indexify/data_loaders/__init__.py +0 -0
  45. {indexify-0.0.42 → indexify-0.2}/indexify/data_loaders/local_directory_loader.py +0 -0
  46. {indexify-0.0.42 → indexify-0.2}/indexify/data_loaders/url_loader.py +0 -0
  47. /indexify-0.0.42/indexify/exceptions.py → /indexify-0.2/indexify/error.py +0 -0
  48. {indexify-0.0.42 → indexify-0.2}/indexify/settings.py +0 -0
indexify-0.2/PKG-INFO ADDED
@@ -0,0 +1,151 @@
1
+ Metadata-Version: 2.1
2
+ Name: indexify
3
+ Version: 0.2
4
+ Summary: Python Client for Indexify
5
+ Home-page: https://github.com/tensorlakeai/indexify
6
+ License: Apache 2.0
7
+ Author: Tensorlake Inc.
8
+ Author-email: support@tensorlake.ai
9
+ Requires-Python: >=3.9,<4.0
10
+ Classifier: License :: Other/Proprietary License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Requires-Dist: cloudpickle (>=3,<4)
17
+ Requires-Dist: docker (>=7.1.0,<8.0.0)
18
+ Requires-Dist: httpx-sse (>=0.4.0,<0.5.0)
19
+ Requires-Dist: httpx[http2] (>=0,<1)
20
+ Requires-Dist: msgpack (>=1.1.0,<2.0.0)
21
+ Requires-Dist: nanoid (>=2.0.0,<3.0.0)
22
+ Requires-Dist: pydantic (>=2.9.2,<3.0.0)
23
+ Requires-Dist: pyyaml (>=6,<7)
24
+ Requires-Dist: rich (>=13,<14)
25
+ Requires-Dist: typer (>=0.12.5,<0.13.0)
26
+ Project-URL: Repository, https://github.com/tensorlakeai/indexify
27
+ Description-Content-Type: text/markdown
28
+
29
+ # Indexify Python SDK
30
+
31
+ [![PyPI version](https://badge.fury.io/py/indexify.svg)](https://badge.fury.io/py/indexify)
32
+ [![Discord](https://dcbadge.vercel.app/api/server/VXkY7zVmTD?style=flat&compact=true)](https://discord.gg/VXkY7zVmTD)
33
+
34
+ This is the Python SDK to build real-time continuously running unstructured data processing pipelines with Indexify.
35
+
36
+ Start by writing and testing your pipelines locally using your data, then deploy them into the Indexify service to process data in real-time at scale.
37
+
38
+ ## Installation
39
+
40
+ ```shell
41
+ pip install indexify
42
+ ```
43
+
44
+ ## Examples
45
+ **[PDF Document Extraction](./examples/pdf_document_extraction/workflow.py)**
46
+ 1. Extracts text, tables and images from an ingested PDF file
47
+ 2. Indexes the text using MiniLM-L6-v2, the images with CLIP
48
+ 3. Writes the results into a vector database.
49
+
50
+ **[Youtube Transcription Summarizer](./examples/video_summarization/workflow.py)**
51
+ 1. Downloads Youtube Video
52
+ 2. Extracts audio from the video and transcribes using `Faster Whisper`
53
+ 3. Uses Llama 3.1 backed by `Llama.cpp` to understand and classify the nature of the video.
54
+ 4. Routes the transcription dynamically to one of the transcription summarizers to retain specific summarization attributes.
55
+ 5. Finally the entire transcription is embedded and stored in a vector database for retrieval.
56
+
57
+ ## Quick Start
58
+ 1. Write data processing functions in Python and use Pydantic objects for returning complex data types from functions
59
+ 2. Connect functions using a graph interface. Indexify automatically stores function outputs and passes them along to downstream functions.
60
+ 3. If a function returns a list, the downstream functions will be called with each item in the list in **parallel**.
61
+ 4. The input of the first function becomes the input to the HTTP endpoint of the Graph.
62
+
63
+ ## Functional Features
64
+ 1. There is **NO** limit to volume of data being ingested since we use blob stores for storing metadata and objects
65
+ 2. The server can handle 10s of 1000s of files being ingested into the graphs in parallel.
66
+ 3. The scheduler reacts to ingestion events in under 8 microseconds, so it's suitable for workflows that need to run in real time.
67
+ 4. Batch ingestion is handled gracefully by batching ingested data and scheduling for high throughput in production settings.
68
+
69
+ ```python
70
+ from pydantic import BaseModel
71
+ from indexify import indexify_function
72
+ from typing import Dict, Any, Optional, List
73
+
74
+ # Define function inputs and outputs
75
+ class Document(BaseModel):
76
+ text: str
77
+ metadata: Dict[str, Any]
78
+
79
+ class TextChunk(BaseModel):
80
+ text: str
81
+ metadata: Dict[str, Any]
82
+ embedding: Optional[List[float]] = None
83
+
84
+
85
+ # Decorate a function which is going to be part of your data processing graph
86
+ @indexify_function()
87
+ def split_text(doc: Document) -> List[TextChunk]:
88
+ midpoint = len(doc.text) // 2
89
+ first_half = TextChunk(text=doc.text[:midpoint], metadata=doc.metadata)
90
+ second_half = TextChunk(text=doc.text[midpoint:], metadata=doc.metadata)
91
+ return [first_half, second_half]
92
+
93
+ # Any requirements specified are automatically installed in production clusters
94
+ @indexify_function(requirements=["langchain_text_splitter"])
95
+ def compute_embedding(chunk: TextChunk) -> TextChunk:
96
+ chunk.embedding = [0.1, 0.2, 0.3]
97
+ return chunk
98
+
99
+ # You can constrain functions to run on specific executors
100
+ @indexify_function(executor_runtime_name="postgres-driver-image")
101
+ def write_to_db(chunk: TextChunk):
102
+ # Write to your favorite vector database
103
+ ...
104
+
105
+ ## Create a graph
106
+ from indexify import Graph
107
+
108
+ g = Graph(name="my_graph", start_node=split_text)
109
+ g.add_edge(split_text, compute_embedding)
110
+ g.add_edge(compute_embedding, write_to_db)
111
+ ```
112
+
113
+ ## Graph Execution
114
+ Every time the Graph is invoked, Indexify will provide an `Invocation Id` which can be used to know about the status of the processing and any outputs from the Graph.
115
+
116
+ ## Run the Graph Locally
117
+ ```python
118
+ from indexify import IndexifyClient
119
+
120
+ client = IndexifyClient(local=True)
121
+ client.register_graph(g)
122
+ invocation_id = client.invoke_graph_with_object(g.name, Document(text="Hello, world!", metadata={"source": "test"}))
123
+ graph_outputs = client.graph_outputs(g.name, invocation_id)
124
+ ```
125
+
126
+ ## Deploy the Graph to Indexify Server for Production
127
+ > Work In Progress - The version of the server that works with Python-based graphs hasn't been released yet. It will be released shortly. Join Discord for development updates.
128
+ ```python
129
+ from indexify import IndexifyClient
130
+
131
+ client = IndexifyClient(service_url="http://localhost:8900")
132
+ client.register_graph(g)
133
+ ```
134
+
135
+ #### Ingestion into the Service
136
+ Extraction Graphs continuously run on the Indexify Service like any other web service. Indexify Server runs the extraction graphs in parallel and in real-time when new data is ingested into the service.
137
+
138
+ ```python
139
+ output_id = client.invoke_graph_with_object(g.name, Document(text="Hello, world!", metadata={"source": "test"}))
140
+ ```
141
+
142
+ #### Retrieve Graph Outputs for a given ingestion object
143
+ ```python
144
+ graph_outputs = client.graph_outputs(g.name, output_id)
145
+ ```
146
+
147
+ #### Retrieve All Graph Inputs
148
+ ```python
149
+ graph_inputs = client.graph_inputs(g.name)
150
+ ```
151
+
indexify-0.2/README.md ADDED
@@ -0,0 +1,122 @@
1
+ # Indexify Python SDK
2
+
3
+ [![PyPI version](https://badge.fury.io/py/indexify.svg)](https://badge.fury.io/py/indexify)
4
+ [![Discord](https://dcbadge.vercel.app/api/server/VXkY7zVmTD?style=flat&compact=true)](https://discord.gg/VXkY7zVmTD)
5
+
6
+ This is the Python SDK to build real-time continuously running unstructured data processing pipelines with Indexify.
7
+
8
+ Start by writing and testing your pipelines locally using your data, then deploy them into the Indexify service to process data in real-time at scale.
9
+
10
+ ## Installation
11
+
12
+ ```shell
13
+ pip install indexify
14
+ ```
15
+
16
+ ## Examples
17
+ **[PDF Document Extraction](./examples/pdf_document_extraction/workflow.py)**
18
+ 1. Extracts text, tables and images from an ingested PDF file
19
+ 2. Indexes the text using MiniLM-L6-v2, the images with CLIP
20
+ 3. Writes the results into a vector database.
21
+
22
+ **[Youtube Transcription Summarizer](./examples/video_summarization/workflow.py)**
23
+ 1. Downloads Youtube Video
24
+ 2. Extracts audio from the video and transcribes using `Faster Whisper`
25
+ 3. Uses Llama 3.1 backed by `Llama.cpp` to understand and classify the nature of the video.
26
+ 4. Routes the transcription dynamically to one of the transcription summarizers to retain specific summarization attributes.
27
+ 5. Finally the entire transcription is embedded and stored in a vector database for retrieval.
28
+
29
+ ## Quick Start
30
+ 1. Write data processing functions in Python and use Pydantic objects for returning complex data types from functions
31
+ 2. Connect functions using a graph interface. Indexify automatically stores function outputs and passes them along to downstream functions.
32
+ 3. If a function returns a list, the downstream functions will be called with each item in the list in **parallel**.
33
+ 4. The input of the first function becomes the input to the HTTP endpoint of the Graph.
34
+
35
+ ## Functional Features
36
+ 1. There is **NO** limit to volume of data being ingested since we use blob stores for storing metadata and objects
37
+ 2. The server can handle 10s of 1000s of files being ingested into the graphs in parallel.
38
+ 3. The scheduler reacts to ingestion events in under 8 microseconds, so it's suitable for workflows that need to run in real time.
39
+ 4. Batch ingestion is handled gracefully by batching ingested data and scheduling for high throughput in production settings.
40
+
41
+ ```python
42
+ from pydantic import BaseModel
43
+ from indexify import indexify_function
44
+ from typing import Dict, Any, Optional, List
45
+
46
+ # Define function inputs and outputs
47
+ class Document(BaseModel):
48
+ text: str
49
+ metadata: Dict[str, Any]
50
+
51
+ class TextChunk(BaseModel):
52
+ text: str
53
+ metadata: Dict[str, Any]
54
+ embedding: Optional[List[float]] = None
55
+
56
+
57
+ # Decorate a function which is going to be part of your data processing graph
58
+ @indexify_function()
59
+ def split_text(doc: Document) -> List[TextChunk]:
60
+ midpoint = len(doc.text) // 2
61
+ first_half = TextChunk(text=doc.text[:midpoint], metadata=doc.metadata)
62
+ second_half = TextChunk(text=doc.text[midpoint:], metadata=doc.metadata)
63
+ return [first_half, second_half]
64
+
65
+ # Any requirements specified are automatically installed in production clusters
66
+ @indexify_function(requirements=["langchain_text_splitter"])
67
+ def compute_embedding(chunk: TextChunk) -> TextChunk:
68
+ chunk.embedding = [0.1, 0.2, 0.3]
69
+ return chunk
70
+
71
+ # You can constrain functions to run on specific executors
72
+ @indexify_function(executor_runtime_name="postgres-driver-image")
73
+ def write_to_db(chunk: TextChunk):
74
+ # Write to your favorite vector database
75
+ ...
76
+
77
+ ## Create a graph
78
+ from indexify import Graph
79
+
80
+ g = Graph(name="my_graph", start_node=split_text)
81
+ g.add_edge(split_text, compute_embedding)
82
+ g.add_edge(compute_embedding, write_to_db)
83
+ ```
84
+
85
+ ## Graph Execution
86
+ Every time the Graph is invoked, Indexify will provide an `Invocation Id` which can be used to know about the status of the processing and any outputs from the Graph.
87
+
88
+ ## Run the Graph Locally
89
+ ```python
90
+ from indexify import IndexifyClient
91
+
92
+ client = IndexifyClient(local=True)
93
+ client.register_graph(g)
94
+ invocation_id = client.invoke_graph_with_object(g.name, Document(text="Hello, world!", metadata={"source": "test"}))
95
+ graph_outputs = client.graph_outputs(g.name, invocation_id)
96
+ ```
97
+
98
+ ## Deploy the Graph to Indexify Server for Production
99
+ > Work In Progress - The version of the server that works with Python-based graphs hasn't been released yet. It will be released shortly. Join Discord for development updates.
100
+ ```python
101
+ from indexify import IndexifyClient
102
+
103
+ client = IndexifyClient(service_url="http://localhost:8900")
104
+ client.register_graph(g)
105
+ ```
106
+
107
+ #### Ingestion into the Service
108
+ Extraction Graphs continuously run on the Indexify Service like any other web service. Indexify Server runs the extraction graphs in parallel and in real-time when new data is ingested into the service.
109
+
110
+ ```python
111
+ output_id = client.invoke_graph_with_object(g.name, Document(text="Hello, world!", metadata={"source": "test"}))
112
+ ```
113
+
114
+ #### Retrieve Graph Outputs for a given ingestion object
115
+ ```python
116
+ graph_outputs = client.graph_outputs(g.name, output_id)
117
+ ```
118
+
119
+ #### Retrieve All Graph Inputs
120
+ ```python
121
+ graph_inputs = client.graph_inputs(g.name)
122
+ ```
@@ -0,0 +1,21 @@
1
+ from . import data_loaders
2
+ from .client import create_client
3
+ from .functions_sdk.graph import Graph
4
+ from .functions_sdk.indexify_functions import (
5
+ indexify_function,
6
+ indexify_router,
7
+ )
8
+ from .local_client import LocalClient
9
+ from .remote_client import RemoteClient
10
+ from .settings import DEFAULT_SERVICE_URL
11
+
12
+ __all__ = [
13
+ "data_loaders",
14
+ "Graph",
15
+ "indexify_function",
16
+ "indexify_router",
17
+ "DEFAULT_SERVICE_URL",
18
+ "RemoteClient",
19
+ "LocalClient",
20
+ "create_client",
21
+ ]
@@ -1,41 +1,67 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from typing import Any, Dict, List, Optional, Union
3
3
 
4
- from .extractor_sdk import Feature, Graph
4
+ from pydantic import Json
5
5
 
6
+ from indexify.functions_sdk.graph import Graph
6
7
 
7
- class BaseClient(ABC):
8
+
9
+ class IndexifyClient(ABC):
8
10
 
9
11
  ### Operational APIs
10
12
  @abstractmethod
11
- def register_extraction_graph(self, graph: Graph):
13
+ def register_compute_graph(self, graph: Graph):
14
+ """
15
+ Register a compute graph.
16
+ graph: Graph: The graph to be registered
17
+ """
12
18
  pass
13
19
 
14
20
  @abstractmethod
15
- def graphs(self) -> str:
21
+ def graphs(self) -> List[str]:
22
+ """
23
+ Get the graphs.
24
+ return: List[str]: The graphs
25
+ """
16
26
  pass
17
27
 
18
28
  @abstractmethod
19
- def namespaces(self) -> str:
29
+ def namespaces(self) -> List[str]:
30
+ """
31
+ Get the namespaces.
32
+ return: List[str]: The namespaces
33
+ """
20
34
  pass
21
35
 
22
36
  @abstractmethod
23
37
  def create_namespace(self, namespace: str):
38
+ """
39
+ Create a namespace.
40
+ namespace: str: The name of the namespace to be created
41
+ """
24
42
  pass
25
43
 
26
44
  ### Ingestion APIs
27
45
  @abstractmethod
28
- def invoke_graph_with_object(self, graph: str, object: Any) -> str:
46
+ def invoke_graph_with_object(
47
+ self, graph: str, block_until_done: bool = False, **kwargs
48
+ ) -> str:
29
49
  """
30
50
  Invokes a graph with an input object.
31
51
  graph: str: The name of the graph to invoke
32
- object: Any: The input object to the graph. It should be JSON serializable
52
+ kwargs: Any: Named arguments to be passed to the graph. Example: url="https://www.google.com", web_page_text="Hello world!"
33
53
  return: str: The ID of the ingested object
34
54
  """
35
55
  pass
36
56
 
37
57
  @abstractmethod
38
- def invoke_graph_with_file(self, graph: str, path: str) -> str:
58
+ def invoke_graph_with_file(
59
+ self,
60
+ graph: str,
61
+ path: str,
62
+ metadata: Optional[Dict[str, Json]] = None,
63
+ block_until_done: bool = False,
64
+ ) -> str:
39
65
  """
40
66
  Invokes a graph with an input file. The file's mimetype is appropriately detected.
41
67
  graph: str: The name of the graph to invoke
@@ -44,10 +70,21 @@ class BaseClient(ABC):
44
70
  """
45
71
  pass
46
72
 
73
+ @abstractmethod
74
+ def rerun_graph(self, graph: str):
75
+ """
76
+ Rerun a graph.
77
+ graph: str: The name of the graph to rerun
78
+ """
79
+ pass
80
+
47
81
  ### Retrieval APIs
48
82
  @abstractmethod
49
- def extracted_objects(
50
- self, graph: str, ingested_object_id: str, extractor_name: Optional[str]
83
+ def graph_outputs(
84
+ self,
85
+ graph: str,
86
+ invocation_id: str,
87
+ fn_name: Optional[str],
51
88
  ) -> Union[Dict[str, List[Any]], List[Any]]:
52
89
  """
53
90
  Returns the extracted objects by a graph for an ingested object. If the extractor name is provided, only the objects extracted by that extractor are returned.
@@ -55,17 +92,7 @@ class BaseClient(ABC):
55
92
  graph: str: The name of the graph
56
93
  ingested_object_id: str: The ID of the ingested object
57
94
  extractor_name: Optional[str]: The name of the extractor whose output is to be returned if provided
95
+ block_until_done: bool = True: If True, the method will block until the extraction is done. If False, the method will return immediately.
58
96
  return: Union[Dict[str, List[Any]], List[Any]]: The extracted objects. If the extractor name is provided, the output is a list of extracted objects by the extractor. If the extractor name is not provided, the output is a dictionary with the extractor name as the key and the extracted objects as the value. If no objects are found, an empty list is returned.
59
97
  """
60
98
  pass
61
-
62
- @abstractmethod
63
- def features(
64
- self, object_id: str, graph: Optional[str]
65
- ) -> Union[Dict[str, List[Feature]], List[Feature]]:
66
- """
67
- Returns the features of an object.
68
- object_id: str: The ID of the object
69
- return: List[Feature]: The features associated with the object that were extracted. If a graph name is provided, only the features extracted by that graph are returned.
70
- """
71
- pass
@@ -0,0 +1,235 @@
1
+ import asyncio
2
+ import io
3
+ import os
4
+ import shutil
5
+ import signal
6
+ import subprocess
7
+ import sys
8
+ import threading
9
+ import time
10
+ from typing import Annotated, List, Optional
11
+
12
+ import docker
13
+ import nanoid
14
+ import typer
15
+ from rich.console import Console
16
+ from rich.panel import Panel
17
+ from rich.text import Text
18
+ from rich.theme import Theme
19
+
20
+ from indexify.executor.agent import ExtractorAgent
21
+ from indexify.executor.function_worker import FunctionWorker
22
+ from indexify.functions_sdk.image import Image
23
+
24
+ custom_theme = Theme(
25
+ {
26
+ "info": "cyan",
27
+ "warning": "yellow",
28
+ "error": "red",
29
+ "highlight": "magenta",
30
+ }
31
+ )
32
+
33
+ console = Console(theme=custom_theme)
34
+
35
+ app = typer.Typer(pretty_exceptions_enable=False, no_args_is_help=True)
36
+
37
+
38
@app.command(
    help="Run server and executor in dev mode (Not recommended for production.)"
)
def server_dev_mode():
    """Run the Indexify server binary and an executor side by side.

    If ``~/.indexify/indexify-server`` does not exist, the install script is
    fetched from https://getindexify.ai, executed, and the resulting
    ``indexify-server`` file in the current directory is moved into place.
    Both child processes are then started, their combined stdout/stderr is
    streamed to this process's stdout, and they are terminated on
    SIGINT/SIGTERM or once both have exited on their own.

    Exits the interpreter with status 1 if the download or the move fails.
    """
    indexify_server_path = os.path.expanduser("~/.indexify/indexify-server")
    if not os.path.exists(indexify_server_path):
        print("indexify-server not found. Downloading...")
        try:
            # NOTE(security): this downloads a script and runs it through the
            # shell unverified. Acceptable for a dev-mode convenience command,
            # but it trusts whatever the remote host serves.
            download_command = subprocess.check_output(
                ["curl", "-s", "https://getindexify.ai"], universal_newlines=True
            )
            subprocess.run(download_command, shell=True, check=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers `curl` not being installed (FileNotFoundError).
            print(f"failed to download indexify-server: {e}")
            sys.exit(1)
        try:
            os.makedirs(os.path.dirname(indexify_server_path), exist_ok=True)
            shutil.move("indexify-server", indexify_server_path)
        except Exception as e:
            print(f"failed to move indexify-server to {indexify_server_path}: {e}")
            sys.exit(1)
    print("starting indexify server and executor in dev mode...")
    print("press Ctrl+C to stop the server and executor.")
    print(f"server binary path: {indexify_server_path}")
    # Argument vectors rather than strings to be split: the expanded home
    # directory may contain whitespace, which str.split() would break apart.
    commands = [[indexify_server_path], ["indexify-cli", "executor"]]

    processes = []

    def handle_output(process):
        # Relay the child's merged stdout/stderr line by line until EOF.
        for line in iter(process.stdout.readline, ""):
            sys.stdout.write(line)
            sys.stdout.flush()

    def terminate_processes():
        print("Terminating processes...")
        for process in processes:
            if process.poll() is None:
                try:
                    process.terminate()
                    process.wait(timeout=5)
                except subprocess.TimeoutExpired:
                    print(f"Force killing process {process.pid}")
                    process.kill()

    def signal_handler(sig, frame):
        print("\nCtrl+C pressed. Shutting down...")
        terminate_processes()
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    for cmd in commands:
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            bufsize=1,
            universal_newlines=True,
            # Same effect as preexec_fn=os.setsid, but safe to use once the
            # output-relay threads below have been started.
            start_new_session=os.name != "nt",
        )
        processes.append(process)

        thread = threading.Thread(target=handle_output, args=(process,))
        thread.daemon = True
        thread.start()

    try:
        while True:
            time.sleep(1)
            if all(process.poll() is not None for process in processes):
                print("All processes have finished.")
                break
    except KeyboardInterrupt:
        signal_handler(None, None)
    finally:
        terminate_processes()

    print("Script execution completed.")
119
+
120
+
121
@app.command(help="Build image for function names")
def build_image(workflow_file_path: str, func_names: List[str]):
    """Build a container image for each named function of a workflow file.

    The workflow file is executed with ``exec`` into a fresh globals dict;
    every resulting global whose name appears in ``func_names`` is passed to
    ``_create_image_for_func``. The names that were found are printed at the
    end.

    NOTE(review): ``exec`` runs the workflow file with the privileges of this
    process; only use this command on files you trust.

    Args:
        workflow_file_path: Path of the Python workflow file to execute.
        func_names: Names of module-level objects to build images for. A name
            listed more than once is built once.

    Raises:
        Exception: If ``workflow_file_path`` cannot be found.
    """
    try:
        # Context manager so the file handle is closed deterministically.
        with open(workflow_file_path) as workflow_file:
            workflow_source = workflow_file.read()
    except FileNotFoundError as e:
        raise Exception(
            f"Could not find workflow file to execute at: `{workflow_file_path}`"
        ) from e

    # Executed outside the try block so that a FileNotFoundError raised by the
    # workflow code itself is not misreported as a missing workflow file.
    globals_dict = {}
    exec(workflow_source, globals_dict)

    requested = set(func_names)
    found_funcs = []
    for name, obj in globals_dict.items():
        if name in requested:
            found_funcs.append(name)
            _create_image_for_func(func_name=name, func_obj=obj)

    console.print(
        Text("Processed functions: ", style="cyan"),
        Text(f"{found_funcs}", style="green"),
    )
144
+
145
+
146
@app.command(help="Joins the extractors to the coordinator server")
def executor(
    server_addr: str = "localhost:8900",
    workers: Annotated[
        int, typer.Option(help="number of worker processes for extraction")
    ] = 1,
    config_path: Optional[str] = typer.Option(
        None, help="Path to the TLS configuration file"
    ),
    executor_cache: Optional[str] = typer.Option(
        "~/.indexify/executor_cache", help="Path to the executor cache directory"
    ),
):
    """Start an executor agent and run it until its coroutine completes.

    Generates a fresh executor ID, prints the effective configuration,
    recreates the cache directory from scratch, and then drives
    ``ExtractorAgent.run()`` on the event loop.

    Args:
        server_addr: Address of the server, passed through to the agent.
        workers: Worker count, given to both ``FunctionWorker`` and the agent.
        config_path: Optional path to the TLS configuration file.
        executor_cache: Cache directory; any existing content is deleted.
    """
    from pathlib import Path

    # Named executor_id rather than `id` to avoid shadowing the builtin.
    executor_id = nanoid.generate()
    console.print(
        Panel(
            f"Number of workers: {workers}\n"
            f"Config path: {config_path}\n"
            f"Server address: {server_addr}\n"
            f"Executor ID: {executor_id}\n"
            f"Executor cache: {executor_cache}",
            title="Agent Configuration",
            border_style="info",
        )
    )

    function_worker = FunctionWorker(workers=workers)

    # Start from an empty cache directory on every launch.
    executor_cache = Path(executor_cache).expanduser().absolute()
    if os.path.exists(executor_cache):
        shutil.rmtree(executor_cache)
    executor_cache.mkdir(parents=True, exist_ok=True)

    agent = ExtractorAgent(
        executor_id,
        num_workers=workers,
        function_worker=function_worker,
        server_addr=server_addr,
        config_path=config_path,
        code_path=executor_cache,
    )

    try:
        asyncio.get_event_loop().run_until_complete(agent.run())
    except asyncio.CancelledError as ex:
        console.print(Text(f"Exiting gracefully: {ex}", style="bold yellow"))
193
+
194
+
195
def _create_image_for_func(func_name, func_obj):
    """Announce the build on the console, then build ``func_obj.image``.

    Args:
        func_name: Name of the function, used in the console message and
            forwarded to ``_build_image``.
        func_obj: Object whose ``image`` attribute describes the image.
    """
    announcement = Text("Creating container for ", style="cyan")
    target = Text(f"`{func_name}`", style="cyan bold")
    console.print(announcement, target)
    _build_image(image=func_obj.image, func_name=func_name)
201
+
202
+
203
def _build_image(image: Image, func_name: Optional[str] = None):
    """Render a Dockerfile from ``image`` and build it with the local Docker daemon.

    The Dockerfile starts from ``image._base_image``, sets ``/app`` as the
    working directory, and appends one ``RUN`` instruction per entry of
    ``image._run_strs``. The result is tagged
    ``{image._image_name}:{image._tag}``.

    Exits the interpreter with status -1 if the Docker daemon cannot be
    reached.

    Args:
        image: Image description to build.
        func_name: Currently unused; kept for interface compatibility.
    """
    try:
        client = docker.from_env()
        client.ping()
    except Exception as e:
        console.print(
            Text("Unable to connect with docker: ", style="red bold"),
            Text(f"{e}", style="red"),
        )
        sys.exit(-1)

    docker_file_str_template = """
FROM {base_image}

WORKDIR /app

"""

    docker_file_str = docker_file_str_template.format(base_image=image._base_image)

    run_strs = ["RUN " + i for i in image._run_strs]

    docker_file_str += "\n".join(run_strs)

    console.print("Creating image using Dockerfile contents:", style="cyan bold")
    console.print(f"{docker_file_str}", style="magenta")

    # Reuse the client whose connectivity was just verified instead of
    # creating a second one.
    client.images.build(
        fileobj=io.BytesIO(docker_file_str.encode()),
        tag=f"{image._image_name}:{image._tag}",
        rm=True,
    )
@@ -0,0 +1,18 @@
1
+ from typing import Optional
2
+
3
+ from .base_client import IndexifyClient
4
+ from .local_client import LocalClient
5
+ from .remote_client import RemoteClient
6
+ from .settings import DEFAULT_SERVICE_URL
7
+
8
+
9
def create_client(
    service_url: str = DEFAULT_SERVICE_URL,
    config_path: Optional[str] = None,
    local: bool = False,
    *args,
    **kwargs,
) -> IndexifyClient:
    """Return a client for local execution or for a remote Indexify service.

    Args:
        service_url: URL handed to ``RemoteClient``; ignored when ``local``.
        config_path: Config path handed to ``RemoteClient``; ignored when
            ``local``.
        local: When true, a ``LocalClient`` is returned.
        *args: Accepted but not forwarded to either client.
        **kwargs: Forwarded to ``RemoteClient`` only.

    Returns:
        A ``LocalClient`` if ``local`` is true, otherwise a ``RemoteClient``.
    """
    if not local:
        return RemoteClient(
            config_path=config_path, service_url=service_url, **kwargs
        )
    return LocalClient()