indexify 0.0.20.tar.gz → 0.0.22.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {indexify-0.0.20 → indexify-0.0.22}/PKG-INFO +1 -1
- {indexify-0.0.20 → indexify-0.0.22}/indexify/__init__.py +3 -2
- {indexify-0.0.20 → indexify-0.0.22}/indexify/client.py +151 -137
- indexify-0.0.22/indexify/error.py +30 -0
- indexify-0.0.22/indexify/extraction_policy.py +68 -0
- {indexify-0.0.20 → indexify-0.0.22}/indexify/extractor.py +6 -1
- indexify-0.0.22/indexify/settings.py +2 -0
- {indexify-0.0.20 → indexify-0.0.22}/pyproject.toml +1 -1
- indexify-0.0.20/indexify/extraction_policy.py +0 -28
- indexify-0.0.20/indexify/settings.py +0 -2
- {indexify-0.0.20 → indexify-0.0.22}/LICENSE.txt +0 -0
- {indexify-0.0.20 → indexify-0.0.22}/README.md +0 -0
- {indexify-0.0.20 → indexify-0.0.22}/indexify/data_containers.py +0 -0
- {indexify-0.0.20 → indexify-0.0.22}/indexify/exceptions.py +0 -0
- {indexify-0.0.20 → indexify-0.0.22}/indexify/index.py +0 -0
- {indexify-0.0.20 → indexify-0.0.22}/indexify/utils.py +0 -0
{indexify-0.0.20 → indexify-0.0.22}/indexify/__init__.py

```diff
@@ -1,6 +1,6 @@
 from .index import Index
 from .client import IndexifyClient
-from .extraction_policy import ExtractionPolicy
+from .extraction_policy import ExtractionPolicy, ExtractionGraphBuilder, ExtractionGraph
 from .client import IndexifyClient, Document
 from .settings import DEFAULT_SERVICE_URL
 
@@ -8,6 +8,7 @@ __all__ = [
     "Index",
     "Document",
     "IndexifyClient",
-    "ExtractionPolicy",
+    "ExtractionGraph",
+    "ExtractionGraphBuilder" "ExtractionPolicy",
     "DEFAULT_SERVICE_URL",
 ]
```
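Because adjacent string literals concatenate in Python, the last added `__all__` entry is one fused name rather than two. A quick sketch of the consequence, using the names from the diff above:

```python
# "ExtractionGraphBuilder" "ExtractionPolicy" fuses at compile time into a
# single string, so __all__ exports neither name individually.
__all__ = [
    "Index",
    "Document",
    "IndexifyClient",
    "ExtractionGraph",
    "ExtractionGraphBuilder" "ExtractionPolicy",
    "DEFAULT_SERVICE_URL",
]
print("ExtractionGraphBuilderExtractionPolicy" in __all__)  # True
print("ExtractionPolicy" in __all__)                        # False
```

Direct imports such as `from indexify import ExtractionPolicy` still work, since the symbols themselves are imported at the top of the module; only wildcard imports are affected.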
{indexify-0.0.20 → indexify-0.0.22}/indexify/client.py

```diff
@@ -6,19 +6,20 @@ import json
 from collections import namedtuple
 from .settings import DEFAULT_SERVICE_URL
 from .extractor import Extractor
-from .extraction_policy import ExtractionPolicy
+from .extraction_policy import ExtractionPolicy, ExtractionGraph
 from .index import Index
 from .utils import json_set_default
+from .error import Error
 from .data_containers import TextChunk
 from indexify.exceptions import ApiException
 from dataclasses import dataclass
-
 from typing import List, Optional, Union, Dict
 
 Document = namedtuple("Document", ["text", "labels", "id"])
 
 SQLQueryRow = namedtuple("SQLQueryRow", ["content_id", "data"])
 
+
 @dataclass
 class SqlQueryResult:
     result: List[Dict]
@@ -45,22 +46,22 @@ class IndexifyClient:
 
     def __init__(
         self,
-        service_url: str = DEFAULT_SERVICE_URL,
+        service_url: str = DEFAULT_SERVICE_URL, # switch this to DEFAULT_SERVICE_URL_HTTPS for TLS
         namespace: str = "default",
         config_path: Optional[str] = None,
         *args,
         **kwargs,
     ):
         if config_path:
-            with open(config_path,
+            with open(config_path, "r") as file:
                 config = yaml.safe_load(file)
-
-            if config.get(
-                tls_config = config[
+
+            if config.get("use_tls", False):
+                tls_config = config["tls_config"]
                 self._client = httpx.Client(
                     http2=True,
-                    cert=(tls_config[
-                    verify=tls_config.get(
+                    cert=(tls_config["cert_path"], tls_config["key_path"]),
+                    verify=tls_config.get("ca_bundle_path", True),
                 )
             else:
                 self._client = httpx.Client(*args, **kwargs)
```
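The config file layout expected by the new TLS branch can be read off the keys above. A minimal sketch, assuming those key names and placeholder paths; `DEFAULT_SERVICE_URL_HTTPS`, mentioned in the new comment, presumably comes from the rewritten settings.py (whose contents are not shown in this diff):

```python
from indexify import IndexifyClient

# indexify.yaml (hypothetical file name) would contain:
#   use_tls: true
#   tls_config:
#     cert_path: /path/to/client.crt
#     key_path: /path/to/client.key
#     ca_bundle_path: /path/to/ca.crt   # optional; verify defaults to True
client = IndexifyClient(config_path="indexify.yaml")
```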
```diff
@@ -68,17 +69,13 @@ class IndexifyClient:
             self._client = httpx.Client(*args, **kwargs)
 
         self.namespace: str = namespace
-        self.
+        self.extraction_graphs: List[ExtractionGraph] = []
         self.labels: dict = {}
         self._service_url = service_url
+        self._timeout = kwargs.get("timeout")
 
         # get namespace data
-        response = self.get(f"namespaces/{self.namespace}")
-        response.raise_for_status()
-        resp_json = response.json()
-        # initialize extraction_policies
-        for eb in resp_json["namespace"]["extraction_policies"]:
-            self.extraction_policies.append(ExtractionPolicy.from_dict(eb))
+        self.extraction_graphs = self.get_extraction_graphs()
 
     @classmethod
     def with_mtls(
@@ -128,12 +125,18 @@ class IndexifyClient:
         return client
 
     def _request(self, method: str, **kwargs) -> httpx.Response:
-        response = self._client.request(method,timeout=None, **kwargs)
         try:
-            response.raise_for_status()
-        except httpx.HTTPStatusError as exc:
-            raise ApiException(exc.response.text)
-
+            response = self._client.request(method, timeout=self._timeout, **kwargs)
+            status_code = str(response.status_code)
+            if status_code.startswith("4") or status_code.startswith("5"):
+                error = Error.from_tonic_error_string(str(response.url), response.text)
+                self.__print_additional_error_context(error)
+                raise error
+        except httpx.ConnectError:
+            message = f"Make sure the server is running and accesible at {self._service_url}"
+            error = Error(status="ConnectionError", message=message)
+            print(error)
+            raise error
         return response
 
     def get(self, endpoint: str, **kwargs) -> httpx.Response:
```
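`_request` now converts 4xx/5xx responses and connection failures into the new `Error` exception instead of surfacing raw httpx errors, so callers catch a single type. A sketch; note that any request-issuing call can raise it, including the constructor, which now fetches the extraction graphs:

```python
from indexify import IndexifyClient
from indexify.error import Error

try:
    client = IndexifyClient()  # fetches extraction graphs, so this may raise
    client.get("namespaces/default")
except Error as e:
    # e.status is e.g. "ConnectionError" or "GeneralError"; e.message is the
    # server-provided text. str(e) renders as "<status> | <Message>".
    print(e.status, "->", e.message)
```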
````diff
@@ -188,7 +191,7 @@ class IndexifyClient:
         ```
         """
         return self._request("PUT", url=f"{self._service_url}/{endpoint}", **kwargs)
-
+
     def delete(self, endpoint: str, **kwargs) -> httpx.Response:
         """
         Make a DELETE request to the Indexify service.
@@ -243,9 +246,9 @@ class IndexifyClient:
     def create_namespace(
         self,
         namespace: str,
-
+        extraction_graphs: list = [],
         labels: dict = {},
-        service_url: str = DEFAULT_SERVICE_URL
+        service_url: str = DEFAULT_SERVICE_URL,
     ) -> "IndexifyClient":
         """
         Create a new namespace.
@@ -253,16 +256,16 @@ class IndexifyClient:
         Returns:
             IndexifyClient: a new client with the given namespace
         """
-
-        for bd in
-            if isinstance(bd,
-
+        extraction_graphs = []
+        for bd in extraction_graphs:
+            if isinstance(bd, extraction_graphs):
+                extraction_graphs.append(bd.to_dict())
             else:
-
+                extraction_graphs.append(bd)
 
         req = {
             "name": namespace,
-            "
+            "extraction_graphs": extraction_graphs,
             "labels": labels,
         }
 
````
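As released, the body above reassigns the `extraction_graphs` parameter to `[]` before iterating over it, so the loop never runs and whatever the caller passes is silently dropped (had it run, `isinstance(bd, extraction_graphs)` would also raise `TypeError`, since isinstance's second argument must be a type, not a list). A usage sketch under that behavior, with `client` being an existing `IndexifyClient` and the namespace name a placeholder:

```python
# The namespace is created, but with an empty extraction_graphs list
# regardless of what is passed in, per the loop above.
ns_client = client.create_namespace(
    namespace="research",   # placeholder namespace name
    labels={"team": "ml"},
)
```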
```diff
@@ -289,7 +292,6 @@ class IndexifyClient:
             List[Index]: list of indexes in the current namespace
         """
         response = self.get(f"namespaces/{self.namespace}/indexes")
-        response.raise_for_status()
         return response.json()["indexes"]
 
     def extractors(self) -> List[Extractor]:
@@ -306,69 +308,36 @@ class IndexifyClient:
             extractors.append(Extractor.from_dict(ed))
         return extractors
 
-    def get_extraction_policies(
+    def get_extraction_graphs(self) -> List[ExtractionGraph]:
         """
         Retrieve and update the list of extraction policies for the current namespace.
         """
         response = self.get(f"namespaces/{self.namespace}")
-        response.raise_for_status()
+        json = response.json()
 
-        self.
-        for
-            self.extraction_policies.append(ExtractionPolicy.from_dict(eb))
-        return self.extraction_policies
+        self.extraction_graphs = []
+        for graph in json["namespace"]["extraction_graphs"]:
+            self.extraction_graphs.append(ExtractionGraph.from_dict(graph))
 
-    def add_extraction_policy(
-        self,
-        extractor: str,
-        name: str,
-        input_params: dict = {},
-        labels_eq: str = None,
-        content_source="ingestion",
-    ) -> dict:
-        """Add a new extraction policy.
-
-        Args:
-            - extractor (str): Name of the extractor
-            - name (str): Name for this instance
-            - input_params (dict): Dictionary containing extractor input params
-            - filter (Filter): Optional filter for this extractor
-
-        Returns:
-            dict: response payload
-
-        Examples:
-            >>> repo.add_extraction_policy("EfficientNet", "efficientnet")
-
-            >>> repo.add_extraction_policy("MiniLML6", "minilm")
+        return self.extraction_graphs
 
+    def create_extraction_graph(self, extraction_graph: ExtractionGraph):
         """
-        req = {
-            "extractor": extractor,
-            "name": name,
-            "input_params": input_params,
-            "filters_eq": labels_eq,
-            "content_source": content_source,
-        }
-        if req["filters_eq"] == None:
-            del req["filters_eq"]
+        Create a new extraction graph.
 
+        Args:
+            - extraction_graph (ExtractionGraph): the extraction graph to create
+        """
+        req = extraction_graph.to_dict()
+        req["namespace"] = self.namespace
         request_body = json.dumps(req, default=json_set_default)
         response = self.post(
-            f"namespaces/{self.namespace}/
+            f"namespaces/{self.namespace}/extraction_graphs",
             data=request_body,
             headers={"Content-Type": "application/json"},
        )
-
-        # update self.extractor_bindings
-        self.get_extraction_policies()
-
-        try:
-            response.raise_for_status()
-        except httpx.HTTPStatusError as exc:
-            raise ApiException(exc.response.text)
         return
-
+
     def get_content_metadata(self, content_id: str) -> dict:
         """
         Get metadata for a specific content ID in a given index.
```
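`add_extraction_policy` is gone: policies are now grouped into a named `ExtractionGraph` and registered in one call. A minimal sketch; the extractor name is a placeholder (use a name reported by `client.extractors()`):

```python
from indexify import ExtractionGraph, ExtractionPolicy

graph = ExtractionGraph(
    id=None,                          # to_dict() drops None-valued fields
    name="summaries",
    extraction_policies=[
        ExtractionPolicy(
            extractor="my-extractor",  # placeholder extractor name
            name="summarize",
            content_source="ingestion",
        )
    ],
)
client.create_extraction_graph(graph)
```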
```diff
@@ -377,52 +346,32 @@ class IndexifyClient:
             - content_id (str): content id to query
         """
         response = self.get(f"namespaces/{self.namespace}/content/{content_id}")
-        response.raise_for_status()
         return response.json()
-
-    def
-        self,
-        content_id: str = None,
-    ):
-        """
-        Get list of content from current namespace.
-
-        Args:
-            - parent_id (str): Optional filter for parent id
-            - labels_eq (str): Optional filter for labels
-        """
-        params = {"parent_id": content_id}
-
-        response = self.get(f"namespaces/{self.namespace}/content", params=params)
-        response.raise_for_status()
-        return [
-            self._add_content_url(content)
-            for content in response.json()["content_list"]
-        ]
-
-    def download_content(self, id:str) -> bytes:
+
+    def download_content(self, id: str) -> bytes:
         """
         Download content from id. Return bytes
-
+
         Args:
             - id (str): id of content to download
         """
         response = self.get(f"namespaces/{self.namespace}/content/{id}/download")
-        try:
-            response.raise_for_status()
-            return response.content
-        except httpx.HTTPStatusError as exc:
-            raise ApiException(exc.response.text)
+        return response.content
 
     def add_documents(
-        self,
-
+        self,
+        extraction_graphs: Union[str, List[str]],
+        documents: Union[Document, str, List[Union[Document, str]]],
+        doc_id=None,
+    ) -> Union[str, List[str]]:
         """
         Add documents to current namespace.
 
         Args:
             - documents (Union[Document, str, List[Union[Document, str]]]): this can be a list of strings, list of Documents or a mix of both
         """
+        if isinstance(extraction_graphs, str):
+            extraction_graphs = [extraction_graphs]
         if isinstance(documents, Document):
             documents = [documents]
         elif isinstance(documents, str):
@@ -433,7 +382,9 @@ class IndexifyClient:
             if isinstance(item, Document):
                 new_documents.append(item)
             elif isinstance(item, str):
-                new_documents.append(
+                new_documents.append(
+                    Document(item, {}, id=None)
+                )  # don't pass in id for a string content because doesn't make sense to have same content id for all strings
             else:
                 raise ValueError(
                     "List items must be either Document instances or strings."
@@ -444,13 +395,21 @@ class IndexifyClient:
                 "Invalid type for documents. Expected Document, str, or list of these."
             )
 
-        req = {
+        req = {
+            "documents": [doc._asdict() for doc in documents],
+            "extraction_graph_names": extraction_graphs,
+        }
         response = self.post(
             f"namespaces/{self.namespace}/add_texts",
             json=req,
             headers={"Content-Type": "application/json"},
         )
         response.raise_for_status()
+        response_json = response.json()
+        content_ids = response_json["content_ids"]
+        if len(documents) == 1 and len(content_ids) == 1:
+            return content_ids[0]
+        return content_ids
 
     def delete_documents(self, document_ids: List[str]) -> None:
         """
```
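`add_documents` now takes the target graph name(s) first and returns the server-assigned content id(s), collapsing to a single id when a single document goes in. A sketch reusing the `summaries` graph from the earlier example:

```python
# One string in -> one content id back, per the return logic above.
content_id = client.add_documents("summaries", "Indexify 0.0.22 release notes.")

# Lists in -> a list of ids back.
ids = client.add_documents(["summaries"], ["first doc", "second doc"])
```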
```diff
@@ -465,7 +424,6 @@ class IndexifyClient:
             json=req,
             headers={"Content-Type": "application/json"},
         )
-        response.raise_for_status()
 
     def update_content(self, document_id: str, path: str) -> None:
         """
@@ -475,8 +433,9 @@ class IndexifyClient:
             - path (str): relative path to the file to be uploaded
         """
         with open(path, "rb") as f:
-            response = self.put(
-
+            response = self.put(
+                f"namespaces/{self.namespace}/content/{document_id}", files={"file": f}
+            )
 
     def get_structured_data(self, content_id: str) -> dict:
         """
@@ -485,11 +444,14 @@ class IndexifyClient:
         Args:
             - content_id (str): content id to query
         """
-        response = self.get(
-
-
+        response = self.get(
+            f"namespaces/{self.namespace}/content/{content_id}/metadata"
+        )
+        return response.json().get("metadata", [])
 
-    def search_index(
+    def search_index(
+        self, name: str, query: str, top_k: int, filters: List[str] = []
+    ) -> list[TextChunk]:
         """
         Search index in the current namespace.
 
@@ -505,10 +467,9 @@ class IndexifyClient:
             json=req,
             headers={"Content-Type": "application/json"},
         )
-        response.raise_for_status()
         return response.json()["results"]
 
-    def upload_file(self, path: str, id=None, labels: dict = {}) -> str:
+    def upload_file(self, extraction_graphs: Union[str, List[str]], path: str, id=None, labels: dict = {}) -> str:
         """
         Upload a file.
 
@@ -516,9 +477,11 @@ class IndexifyClient:
             - path (str): relative path to the file to be uploaded
             - labels (dict): labels to be associated with the file
         """
-
+        if isinstance(extraction_graphs, str):
+            extraction_graphs = [extraction_graphs]
+        params = {"extraction_graph_names": extraction_graphs}
         if id is not None:
-            params[
+            params["id"] = id
         with open(path, "rb") as f:
             response = self.post(
                 f"namespaces/{self.namespace}/upload_file",
@@ -526,7 +489,6 @@ class IndexifyClient:
                 data=labels,
                 params=params,
             )
-        response.raise_for_status()
         response_json = response.json()
         return response_json["content_id"]
 
```
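`upload_file` follows the same pattern: the graph name(s) come first and the new content id comes back. A sketch with a placeholder local path:

```python
content_id = client.upload_file(
    "summaries",                 # graph created earlier
    "report.pdf",                # placeholder local file path
    labels={"source": "upload"},
)
```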
```diff
@@ -535,20 +497,47 @@ class IndexifyClient:
         List all schemas in the current namespace.
         """
         response = self.get(f"namespaces/{self.namespace}/schemas")
-        response.raise_for_status()
         return response.json()
-
-    def get_content_tree(self, content_id:str):
+
+    def get_content_tree(self, content_id: str):
         """
         Get content tree for a given content id
 
         Args:
             - content_id (str): id of content
         """
-        response = self.get(
-
+        response = self.get(
+            f"namespaces/{self.namespace}/content/{content_id}/content-tree"
+        )
         return response.json()
+
+    def get_extracted_content(self, content_id: str, level: int = 0):
+        """
+        Get list of child for a given content id and their content up to the specified level.
+
+        Args:
+            - content_id (str): id of content
+            - level (int): depth of content retrieval (default: 0)
+        """
+        content_tree = self.get_content_tree(content_id)
+        child_list = []
+
+        def traverse_content(parent_id, current_level):
+            if current_level > level:
+                return
+
+            for item in content_tree['content_tree_metadata']:
+                if item['parent_id'] == parent_id:
+                    child_id = item['id']
+                    content = self.download_content(child_id)
+                    child_list.append({'id': child_id, 'content': content})
+
+                    traverse_content(child_id, current_level + 1)
+
+        traverse_content(content_id, 0)
 
+        return child_list
+
     def sql_query(self, query: str):
         """
         Execute a SQL query.
```
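In `get_extracted_content`, recursion stops once `current_level` exceeds `level`, so the default `level=0` returns only the direct children of `content_id`, each paired with its downloaded bytes. A sketch:

```python
# Direct children only (level=0 is the default).
for child in client.get_extracted_content(content_id):
    print(child["id"], len(child["content"]), "bytes")

# Raise level to include grandchildren as well.
deeper = client.get_extracted_content(content_id, level=1)
```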
```diff
@@ -562,24 +551,38 @@ class IndexifyClient:
             json=req,
             headers={"Content-Type": "application/json"},
         )
-        response.raise_for_status()
         result = response.json()
         rows = []
         for row in result["rows"]:
             data = row["data"]
             rows.append(data)
         return SqlQueryResult(result=rows)
-
-    def ingest_remote_file(
-
+
+    def ingest_remote_file(
+        self, extraction_graphs: Union[str, List[str]], url: str, mime_type: str, labels: Dict[str, str], id=None
+    ):
+        if isinstance(extraction_graphs, str):
+            extraction_graphs = [extraction_graphs]
+        req = {"url": url, "mime_type": mime_type, "labels": labels, "id": id, "extraction_graph_names": extraction_graphs}
         response = self.post(
             f"namespaces/{self.namespace}/ingest_remote_file",
             json=req,
             headers={"Content-Type": "application/json"},
         )
-        response.raise_for_status()
         return response.json()
 
+    def wait_for_extraction(self, content_id: str):
+        """
+        Wait for extraction to complete for a given content id
+
+        Args:
+            - content_id (str): id of content
+        """
+        response = self.get(
+            f"namespaces/{self.namespace}/content/{content_id}/wait"
+        )
+        response.raise_for_status()
+
     def generate_unique_hex_id(self):
         """
         Generate a unique hexadecimal identifier
```
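`ingest_remote_file` now routes the fetched file through named graphs, and the new `wait_for_extraction` blocks on the server's wait endpoint. A sketch; the URL is a placeholder, and the `content_id` key in the response is an assumption based on `upload_file`'s response shape:

```python
result = client.ingest_remote_file(
    "summaries",                        # graph name(s)
    "https://example.com/report.pdf",   # placeholder remote URL
    "application/pdf",
    labels={},
)
client.wait_for_extraction(result["content_id"])  # assumed response key
```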
```diff
@@ -588,18 +591,29 @@ class IndexifyClient:
             str: a unique hexadecimal string
         """
         return uuid.uuid4().hex[:16]
-
+
     def generate_hash_from_string(self, input_string: str):
         """
         Generate a hash for the given string and return it as a hexadecimal string.
-
+
         Args:
             input_string (str): The input string to hash.
-
+
         Returns:
             str: The hexadecimal hash of the input string.
         """
         hash_object = hashlib.sha256(input_string.encode())
         return hash_object.hexdigest()[:16]
 
+    def __print_additional_error_context(self, error: Error):
+        print(error)
+
+        if error.status == "ExtractionGraphError":
+            graphs = [eg.name for eg in self.extraction_graphs]
+            extractors = [ext.name for ext in self.extractors()]
+            print(f"Available extraction graphs: {graphs}")
+            print(f"Available extractors: {extractors}")
 
+        if error.status == "SearchError":
+            indexes = [index["name"] for index in self.indexes()]
+            print(f"Available indexes: {indexes}")
```
indexify-0.0.22/indexify/error.py

```diff
@@ -0,0 +1,30 @@
+class Error(Exception):
+    status: str
+    message: str
+
+    def __init__(self, status: str, message: str):
+        self.status = status
+        self.message = message
+
+    @staticmethod
+    def from_tonic_error_string(url: str, error: str) -> "Error":
+        data = error.split(", ")
+
+        message = data[1].split(": ", 1)[1]
+        if message.startswith('"') and message.endswith('"'):
+            message = message[1:-1]
+
+        status = "GeneralError"
+        if "extraction_graph" in url:
+            status = "ExtractionGraphError"
+        elif "search" in url:
+            status = "SearchError"
+
+        error = Error(status, message)
+        return error
+
+    def __str__(self):
+        return f"{self.status} | {self.message.capitalize()}"
+
+    def __repr__(self):
+        return f"Error(status={self.status!r}, message={self.message!r})"
```
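`from_tonic_error_string` expects a tonic/gRPC-style body of the form `status: <code>, message: "<text>"` and classifies the error from the request URL. A worked example with an illustrative URL and error string:

```python
from indexify.error import Error

err = Error.from_tonic_error_string(
    "http://localhost:8900/namespaces/default/indexes/search",  # illustrative URL
    'status: NotFound, message: "index not found"',
)
print(err.status)  # SearchError, because "search" appears in the URL
print(err)         # SearchError | Index not found
```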
indexify-0.0.22/indexify/extraction_policy.py

```diff
@@ -0,0 +1,68 @@
+from dataclasses import dataclass, asdict
+from typing import Optional, List
+
+
+@dataclass
+class ExtractionPolicy:
+    extractor: str
+    name: str
+    content_source: str
+    input_params: Optional[dict] = None
+    id: Optional[str] = None
+    labels_eq: Optional[str] = None
+
+    def __repr__(self) -> str:
+        return f"ExtractionPolicy(name={self.name} extractor={self.extractor})"
+
+    def __str__(self) -> str:
+        return self.__repr__()
+
+    def to_dict(self) -> dict:
+        filtered_dict = {k: v for k, v in asdict(self).items() if v is not None}
+        return filtered_dict
+
+    @classmethod
+    def from_dict(cls, json: dict):
+        if "filters_eq" in json:
+            json["labels_eq"] = json.pop("filters_eq")
+        json["id"] = json.get("id", None)
+        return ExtractionPolicy(**json)
+
+
+@dataclass
+class ExtractionGraph:
+    id: str
+    name: str
+    extraction_policies: List[ExtractionPolicy]
+
+    @classmethod
+    def from_dict(cls, json: dict):
+        json["id"] = json.get("id", None)
+        if "namespace" in json.keys():
+            json.pop("namespace")
+        return ExtractionGraph(**json)
+
+    @staticmethod
+    def from_yaml(spec: str):
+        import yaml
+
+        return ExtractionGraph.from_dict(yaml.load(spec, Loader=yaml.FullLoader))
+
+    def to_dict(self) -> dict:
+        filtered_dict = {k: v for k, v in asdict(self).items() if v is not None}
+        return filtered_dict
+
+
+class ExtractionGraphBuilder:
+    def __init__(self, name: str):
+        self.name = name
+        self.extraction_policies = []
+
+    def policy(self, policy: ExtractionPolicy) -> "ExtractionGraphBuilder":
+        self.extraction_policies.append(policy)
+        return self
+
+    def build(self):
+        return ExtractionGraph(
+            id=self.id, name=self.name, extraction_policies=self.extraction_policies
+        )
```
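The builder collects policies fluently, but note that `build()` reads `self.id`, which `__init__` never assigns, so as released it raises `AttributeError` unless the caller sets `builder.id` first. `ExtractionGraph.from_yaml` sidesteps that by tolerating a missing `id`. A sketch; the extractor name is a placeholder:

```python
from indexify.extraction_policy import ExtractionGraph

spec = """
name: summaries
extraction_policies:
  - extractor: my-extractor
    name: summarize
    content_source: ingestion
"""
graph = ExtractionGraph.from_yaml(spec)
print(graph.name)               # summaries
print(graph.to_dict())          # id was None, so to_dict() drops it
```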
{indexify-0.0.20 → indexify-0.0.22}/indexify/extractor.py

```diff
@@ -17,7 +17,12 @@ class ExtractorSchema:
 
 class Extractor:
     def __init__(
-        self,
+        self,
+        name: str,
+        description: str,
+        input_params: dict,
+        outputs: ExtractorSchema,
+        input_mime_types: list[str],
     ):
         self.name = name
         self.description = description
```
indexify-0.0.20/indexify/extraction_policy.py

```diff
@@ -1,28 +0,0 @@
-from dataclasses import dataclass, asdict
-from typing import Optional
-
-
-@dataclass
-class ExtractionPolicy:
-    extractor: str
-    name: str
-    content_source: str
-    input_params: dict
-    id: Optional[str] = None
-    labels_eq: Optional[str] = None
-
-    def __repr__(self) -> str:
-        return f"ExtractionPolicy(name={self.name} extractor={self.extractor})"
-
-    def __str__(self) -> str:
-        return self.__repr__()
-
-    def to_dict(self) -> dict:
-        filtered_dict = {k: v for k, v in asdict(self).items() if v is not None}
-        return filtered_dict
-
-    @classmethod
-    def from_dict(cls, json: dict):
-        if "filters_eq" in json:
-            json["labels_eq"] = json.pop("filters_eq")
-        return ExtractionPolicy(**json)
```