indexify 0.0.28.tar.gz → 0.0.31.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {indexify-0.0.28 → indexify-0.0.31}/PKG-INFO +3 -3
- {indexify-0.0.28 → indexify-0.0.31}/README.md +2 -2
- {indexify-0.0.28 → indexify-0.0.31}/indexify/__init__.py +2 -0
- {indexify-0.0.28 → indexify-0.0.31}/indexify/client.py +114 -52
- indexify-0.0.31/indexify/data_containers.py +37 -0
- {indexify-0.0.28 → indexify-0.0.31}/pyproject.toml +1 -1
- indexify-0.0.28/indexify/data_containers.py +0 -18
- {indexify-0.0.28 → indexify-0.0.31}/LICENSE.txt +0 -0
- {indexify-0.0.28 → indexify-0.0.31}/indexify/error.py +0 -0
- {indexify-0.0.28 → indexify-0.0.31}/indexify/exceptions.py +0 -0
- {indexify-0.0.28 → indexify-0.0.31}/indexify/extraction_policy.py +0 -0
- {indexify-0.0.28 → indexify-0.0.31}/indexify/extractor.py +0 -0
- {indexify-0.0.28 → indexify-0.0.31}/indexify/index.py +0 -0
- {indexify-0.0.28 → indexify-0.0.31}/indexify/settings.py +0 -0
- {indexify-0.0.28 → indexify-0.0.31}/indexify/utils.py +0 -0
```diff
--- indexify-0.0.28/PKG-INFO
+++ indexify-0.0.31/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: indexify
-Version: 0.0.28
+Version: 0.0.31
 Summary: Python Client for Indexify
 Home-page: https://github.com/tensorlakeai/indexify
 License: Apache 2.0
@@ -35,8 +35,8 @@ pip install indexify
 
 ## Usage
 
-See the [getting started](https://getindexify.com/getting_started/) guide for examples of how to use the client.
-Look at the [examples](examples) directory for more examples.
+See the [getting started](https://docs.getindexify.com/getting_started/) guide for examples of how to use the client.
+Look at the [examples](https://github.com/tensorlakeai/indexify/tree/main/examples) directory for more examples.
 
 ## Development
 
```
```diff
--- indexify-0.0.28/README.md
+++ indexify-0.0.31/README.md
@@ -15,8 +15,8 @@ pip install indexify
 
 ## Usage
 
-See the [getting started](https://getindexify.com/getting_started/) guide for examples of how to use the client.
-Look at the [examples](examples) directory for more examples.
+See the [getting started](https://docs.getindexify.com/getting_started/) guide for examples of how to use the client.
+Look at the [examples](https://github.com/tensorlakeai/indexify/tree/main/examples) directory for more examples.
 
 ## Development
 
```
```diff
--- indexify-0.0.28/indexify/__init__.py
+++ indexify-0.0.31/indexify/__init__.py
@@ -2,10 +2,12 @@ from .index import Index
 from .client import IndexifyClient
 from .extraction_policy import ExtractionGraph
 from .client import IndexifyClient, Document, generate_hash_from_string, generate_unique_hex_id
+from .data_containers import Content
 from .settings import DEFAULT_SERVICE_URL
 
 __all__ = [
     "Index",
+    "Content",
     "Document",
     "IndexifyClient",
     "ExtractionGraph",
```
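With `Content` now re-exported from the package root, records returned by the new `list_content` API (added to `client.py` further down in this diff) can be typed directly. A minimal usage sketch (the graph name is a placeholder, and a running Indexify server at the default service URL is assumed):

```python
from typing import List

from indexify import IndexifyClient, Content

client = IndexifyClient()  # connects to DEFAULT_SERVICE_URL

items: List[Content] = client.list_content(extraction_graph="my-graph")
for item in items:
    print(item.id, item.mime_type, item.extraction_policy)
```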
```diff
--- indexify-0.0.28/indexify/client.py
+++ indexify-0.0.31/indexify/client.py
@@ -10,7 +10,7 @@ from .extraction_policy import ExtractionPolicy, ExtractionGraph
 from .index import Index
 from .utils import json_set_default
 from .error import Error
-from .data_containers import TextChunk
+from .data_containers import TextChunk, Content
 from indexify.exceptions import ApiException
 from dataclasses import dataclass
 from typing import List, Optional, Union, Dict
```
```diff
@@ -153,7 +153,11 @@ class IndexifyClient:
         try:
             response = self._client.request(method, timeout=self._timeout, **kwargs)
             status_code = str(response.status_code)
-            if status_code.startswith("4")
+            if status_code.startswith("4"):
+                raise ApiException(
+                    "status code: " + status_code + " request args: " + str(kwargs)
+                )
+            if status_code.startswith("5"):
                 raise ApiException(response.text)
             # error = Error.from_tonic_error_string(str(response.url), response.text)
             # self.__print_additional_error_context(error)
```
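With this change, 4xx responses raise an `ApiException` whose message carries the status code and the request arguments, while 5xx responses keep raising with the raw response body. A sketch of what calling code sees (the content id is a placeholder):

```python
from indexify import IndexifyClient
from indexify.exceptions import ApiException

client = IndexifyClient()
try:
    client.get_content_metadata("no-such-id")  # placeholder id
except ApiException as e:
    # For a 4xx the message now looks like:
    #   status code: 404 request args: {...}
    print(e)
```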
```diff
@@ -340,11 +344,11 @@ class IndexifyClient:
         """
         Retrieve and update the list of extraction policies for the current namespace.
         """
-        response = self.get(f"namespaces/{self.namespace}")
+        response = self.get(f"namespaces/{self.namespace}/extraction_graphs")
         json = response.json()
 
         self.extraction_graphs = []
-        for graph in json["
+        for graph in json["extraction_graphs"]:
             self.extraction_graphs.append(ExtractionGraph.from_dict(graph))
 
         return self.extraction_graphs
```
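The graph list now comes from a dedicated `/extraction_graphs` route rather than the namespace object. A rough sketch of the new response shape using `httpx` (the URL reflects this client's default service address, and the `name` field on each graph object is an assumption):

```python
import httpx

# DEFAULT_SERVICE_URL is http://localhost:8900 in this client.
resp = httpx.get("http://localhost:8900/namespaces/default/extraction_graphs")
resp.raise_for_status()
for graph in resp.json()["extraction_graphs"]:
    print(graph.get("name"))
```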
```diff
@@ -366,6 +370,28 @@ class IndexifyClient:
         )
         return
 
+    def link_extraction_graphs(
+        self, source_graph: str, content_source: str, linked_graph: str
+    ):
+        """
+        Link an extraction graph to another extraction graph.
+
+        Args:
+            - source_graph (str): source extraction graph
+            - content_source (str): content source in source graph
+            - linked_graph (str): target extraction graph
+        """
+        req = {
+            "content_source": content_source,
+            "linked_graph_name": linked_graph,
+        }
+        response = self.post(
+            f"namespaces/{self.namespace}/extraction_graphs/{source_graph}/links",
+            json=req,
+            headers={"Content-Type": "application/json"},
+        )
+        return
+
     def get_content_metadata(self, content_id: str) -> dict:
         """
         Get metadata for a specific content ID in a given index.
```
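`link_extraction_graphs` is new in this release: it routes the output of one graph's policy into another graph via the `/links` route. A usage sketch (all graph and policy names are placeholders):

```python
from indexify import IndexifyClient

client = IndexifyClient()
# Feed whatever the "chunker" policy of ingest-pdfs produces into the
# embed-chunks graph as its input.
client.link_extraction_graphs(
    source_graph="ingest-pdfs",
    content_source="chunker",
    linked_graph="embed-chunks",
)
```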
```diff
@@ -373,17 +399,17 @@ class IndexifyClient:
         Args:
             - content_id (str): content id to query
         """
-        response = self.get(f"namespaces/{self.namespace}/content/{content_id}")
+        response = self.get(f"namespaces/{self.namespace}/content/{content_id}/metadata")
         return response.json()
 
-    def download_content(self,
+    def download_content(self, content_id: str) -> bytes:
         """
         Download content from id. Return bytes
 
         Args:
-            -
+            - content_id (str): id of content to download
         """
-        response = self.get(f"namespaces/{self.namespace}/content/{
+        response = self.get(f"namespaces/{self.namespace}/content/{content_id}/download")
         return response.content
 
     def add_documents(
```
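Both lookups move to more specific routes (`/content/{id}/metadata` and `/content/{id}/download`), and `download_content` now takes an explicit `content_id`. A sketch (the id is a placeholder for one returned by an earlier ingest call):

```python
from indexify import IndexifyClient

client = IndexifyClient()
content_id = "content-123"  # placeholder

meta = client.get_content_metadata(content_id)  # GET .../content/{id}/metadata
data = client.download_content(content_id)      # GET .../content/{id}/download

with open("downloaded.bin", "wb") as f:
    f.write(data)
```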
```diff
@@ -422,21 +448,21 @@ class IndexifyClient:
             raise TypeError(
                 "Invalid type for documents. Expected Document, str, or list of these."
             )
-        [15 lines of the previous implementation not preserved in this view]
+        for document in documents:
+            document.labels["mime_type"] = "text/plain"
+        content_ids = []
+        if isinstance(extraction_graphs, str):
+            extraction_graphs = [extraction_graphs]
+        for extraction_graph in extraction_graphs:
+            for document in documents:
+                response = self.post(
+                    f"namespaces/{self.namespace}/extraction_graphs/{extraction_graph}/extract",
+                    files={"file": document.text},
+                    data={"labels": json.dumps(document.labels)},
+                )
+                response_json = response.json()
+                content_id = response_json["content_id"]
+                content_ids.append(content_id)
         return content_ids
 
     def delete_documents(self, document_ids: List[str]) -> None:
```
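`add_documents` now posts each document to every named graph's `/extract` route, stamping a `text/plain` mime label on the way, and collects one content id per upload. A sketch (graph names are placeholders; the keyword names follow the body above, and `Document(text=..., labels=...)` is assumed from how the loop reads those attributes):

```python
from indexify import IndexifyClient, Document

client = IndexifyClient()
ids = client.add_documents(
    extraction_graphs=["graph-a", "graph-b"],
    documents=[
        Document(text="first sample", labels={"source": "demo"}),
        Document(text="second sample", labels={"source": "demo"}),
    ],
)
# Two graphs x two documents -> four content ids.
print(ids)
```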
```diff
@@ -504,14 +530,47 @@ class IndexifyClient:
         - top_k (int): top k nearest neighbors to be returned
         - filters (List[str]): list of filters to apply
         """
-        req = {"
+        req = {"query": query, "k": top_k, "filters": filters}
         response = self.post(
-            f"namespaces/{self.namespace}/search",
+            f"namespaces/{self.namespace}/indexes/{name}/search",
             json=req,
             headers={"Content-Type": "application/json"},
         )
         return response.json()["results"]
 
+    def list_content(
+        self,
+        extraction_graph: str,
+        extraction_policy: str = "",
+        labels_filter: List[str] = [],
+        start_id: str = "",
+        limit: int = 10,
+    ) -> List[Content]:
+        """
+        List content in the current namespace.
+
+        Args:
+            - extraction_graph (str): extraction graph name
+            - start_index (str): start index for pagination
+            - limit (int): number of items to return
+        """
+        params = {"graph": extraction_graph, "start_id": start_id, "limit": limit}
+        if extraction_policy:
+            params["source"] = extraction_policy
+        else:
+            params["source"] = "ingestion"
+        if len(labels_filter) > 0:
+            params["labels_filter"] = labels_filter
+        response = self.get(
+            f"namespaces/{self.namespace}/content",
+            params=params,
+        )
+        content_list = response.json()["content_list"]
+        content = []
+        for item in content_list:
+            content.append(Content.from_dict(item))
+        return content
+
     def upload_file(
         self,
         extraction_graphs: Union[str, List[str]],
```
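Two retrieval changes land together here: vector search is now scoped to a named index (`/indexes/{name}/search`), and the new `list_content` pages through a graph's content, defaulting to raw ingested items (`source=ingestion`) unless an extraction policy is named. A paging sketch (graph and policy names are placeholders):

```python
from indexify import IndexifyClient

client = IndexifyClient()

# First page of raw ingested content for a graph.
page = client.list_content(extraction_graph="ingest-pdfs", limit=10)
for content in page:
    print(content.id, content.mime_type)

# Outputs of a single policy, resuming after the last id seen.
if page:
    more = client.list_content(
        extraction_graph="ingest-pdfs",
        extraction_policy="chunker",
        start_id=page[-1].id,
    )
```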
```diff
@@ -528,18 +587,20 @@ class IndexifyClient:
         """
         if isinstance(extraction_graphs, str):
             extraction_graphs = [extraction_graphs]
-        params = {
+        params = {}
         if id is not None:
             params["id"] = id
         with open(path, "rb") as f:
-            [6 lines of the previous implementation not preserved in this view]
+            for extraction_graph in extraction_graphs:
+                response = self.post(
+                    f"namespaces/{self.namespace}/extraction_graphs/{extraction_graph}/extract",
+                    files={"file": f},
+                    data={"labels": json.dumps(labels)},
+                    params=params,
+                )
         response_json = response.json()
-        [1 line not preserved in this view]
+        content_id = response_json["content_id"]
+        return content_id
 
     def list_schemas(self) -> List[str]:
         """
```
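`upload_file` mirrors the `add_documents` change: the opened file is posted to each target graph's `/extract` route. Note that, as written, only the content id from the last upload is returned. A sketch (the path and graph names are placeholders; argument order past the first parameter follows the signature fragments visible above and is partly an assumption):

```python
from indexify import IndexifyClient

client = IndexifyClient()
content_id = client.upload_file(
    ["graph-a", "graph-b"],  # extraction_graphs: str or list
    "report.pdf",            # path of the file to ingest
    labels={"source": "demo"},
)
print(content_id)  # id from the last graph's upload only
```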
```diff
@@ -548,35 +609,32 @@ class IndexifyClient:
         response = self.get(f"namespaces/{self.namespace}/schemas")
         return response.json()
 
-    def
+    def get_extracted_content(
+        self, ingested_content_id: str, graph_name: str, extractor_name: str, blocking=False
+    ):
         """
-        Get
+        Get list of child for a given content id and their content up to the specified level.
 
         Args:
-            [1 line not preserved in this view]
+            - ingested_content_id (str): id of content
+            - graph_name (str): name of extraction graph
+            - extractor_name (str): name of extractor
+            - blocking (bool): wait for extraction to complete before returning (default: False)
         """
+        if blocking:
+            self.wait_for_extraction(ingested_content_id)
         response = self.get(
-            f"namespaces/{self.namespace}/
+            f"namespaces/{self.namespace}/extraction_graphs/{graph_name}/extraction_policies/{extractor_name}/content/{ingested_content_id}"
         )
-
-
-    def get_extracted_content(self, content_id: str, graph_name: str, policy_name: str):
-        """
-        Get list of child for a given content id and their content up to the specified level.
-
-        Args:
-            - content_id (str): id of content
-            - level (int): depth of content retrieval (default: 0)
-        """
-        content_tree = self.get_content_tree(content_id)
+        content_tree = response.json()
         child_list = []
         for item in content_tree["content_tree_metadata"]:
             if (
                 graph_name in item["extraction_graph_names"]
-                and item["source"] == policy_name
+                and item["source"] == extractor_name
             ):
                 content = self.download_content(item["id"])
-                child_list.append({"id": item["id"], "content": content})
+                child_list.append({"id": item["id"], "mime_type": item["mime_type"], "content": content})
 
         return child_list
 
```
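The two earlier helpers collapse into a single `get_extracted_content` that queries the new policy-scoped route directly and can block until extraction finishes; each returned child now also carries its mime type. A sketch (ids and names are placeholders):

```python
from indexify import IndexifyClient

client = IndexifyClient()
children = client.get_extracted_content(
    ingested_content_id="content-123",
    graph_name="ingest-pdfs",
    extractor_name="chunker",
    blocking=True,  # wait for extraction before reading results
)
for child in children:
    print(child["id"], child["mime_type"], len(child["content"]))
```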
```diff
@@ -634,9 +692,13 @@ class IndexifyClient:
         """
         if type(content_ids) == str:
             content_ids = [content_ids]
-        print(
+        print(
+            "Waiting for extraction to complete for content id: ", ",".join(content_ids)
+        )
         for content_id in content_ids:
-            response = self.get(
+            response = self.get(
+                f"namespaces/{self.namespace}/content/{content_id}/wait"
+            )
             print("Extraction completed for content id: ", content_id)
             response.raise_for_status()
 
```
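`wait_for_extraction` now blocks on a dedicated `/content/{id}/wait` route, one request per id. A sketch (the id is a placeholder):

```python
from indexify import IndexifyClient

client = IndexifyClient()
# Accepts one id or a list of ids; returns once the server reports
# extraction finished for each.
client.wait_for_extraction("content-123")
```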
```diff
--- /dev/null
+++ indexify-0.0.31/indexify/data_containers.py
@@ -0,0 +1,37 @@
+from enum import Enum
+from typing import List
+from dataclasses import dataclass, field
+
+@dataclass
+class Content:
+    id: str
+    parent_id: str
+    labels: dict[str, any]
+    extraction_graph_names: List[str]
+    extraction_policy: str
+    mime_type: str
+
+    @classmethod
+    def from_dict(cls, json: dict):
+        return Content(
+            id=json["id"],
+            parent_id=json["parent_id"],
+            labels=json["labels"],
+            extraction_graph_names=json["extraction_graph_names"],
+            extraction_policy=json["source"],
+            mime_type=json["mime_type"],
+        )
+
+@dataclass
+class TextChunk:
+    text: str
+    metadata: dict[str, any] = field(default_factory=dict)
+    score: float = 0.0
+
+    def to_dict(self):
+        return {"text": self.text, "metadata": self.metadata}
+
+
+@dataclass
+class SearchResult:
+    results: List[TextChunk]
```
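Note the one field rename in `Content.from_dict`: the server payload's `source` key becomes the `extraction_policy` attribute. A sketch of the expected payload shape (all values are placeholders):

```python
from indexify.data_containers import Content

payload = {
    "id": "content-123",
    "parent_id": "",
    "labels": {"mime_type": "text/plain"},
    "extraction_graph_names": ["ingest-pdfs"],
    "source": "ingestion",  # mapped to Content.extraction_policy
    "mime_type": "text/plain",
}
content = Content.from_dict(payload)
assert content.extraction_policy == "ingestion"
```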
```diff
--- indexify-0.0.28/pyproject.toml
+++ indexify-0.0.31/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "indexify"
-version = "0.0.28"
+version = "0.0.31"
 description = "Python Client for Indexify"
 authors = ["Diptanu Gon Choudhury <diptanuc@gmail.com>", "Lucas Jackson <lucas@tensorlake.ai>", "Vijay Parthasarathy <vijay2win@gmail.com>"]
 license = "Apache 2.0"
```
```diff
--- indexify-0.0.28/indexify/data_containers.py
+++ /dev/null
@@ -1,18 +0,0 @@
-from enum import Enum
-from typing import List
-from dataclasses import dataclass, field
-
-
-@dataclass
-class TextChunk:
-    text: str
-    metadata: dict[str, any] = field(default_factory=dict)
-    score: float = 0.0
-
-    def to_dict(self):
-        return {"text": self.text, "metadata": self.metadata}
-
-
-@dataclass
-class SearchResult:
-    results: List[TextChunk]
```
|