indexify 0.0.19-py3-none-any.whl → 0.0.21-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
indexify/__init__.py CHANGED
@@ -1,6 +1,6 @@
 from .index import Index
 from .client import IndexifyClient
-from .extraction_policy import ExtractionPolicy
+from .extraction_policy import ExtractionPolicy, ExtractionGraphBuilder, ExtractionGraph
 from .client import IndexifyClient, Document
 from .settings import DEFAULT_SERVICE_URL
 
@@ -8,6 +8,7 @@ __all__ = [
     "Index",
     "Document",
     "IndexifyClient",
-    "ExtractionPolicy",
+    "ExtractionGraph",
+    "ExtractionGraphBuilder" "ExtractionPolicy",
     "DEFAULT_SERVICE_URL",
 ]
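Note that the last two added literals are adjacent strings (the comma after `"ExtractionGraphBuilder"` is missing), so Python concatenates them into a single `__all__` entry, `"ExtractionGraphBuilderExtractionPolicy"`; the classes themselves remain importable by name. A minimal sketch of the imports 0.0.21 exposes:

```python
# New public names in indexify 0.0.21, alongside the existing ones.
from indexify import (
    ExtractionGraph,
    ExtractionGraphBuilder,
    ExtractionPolicy,
    IndexifyClient,
)
```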
indexify/client.py CHANGED
@@ -6,7 +6,7 @@ import json
 from collections import namedtuple
 from .settings import DEFAULT_SERVICE_URL
 from .extractor import Extractor
-from .extraction_policy import ExtractionPolicy
+from .extraction_policy import ExtractionPolicy, ExtractionGraph
 from .index import Index
 from .utils import json_set_default
 from .data_containers import TextChunk
@@ -19,6 +19,7 @@ Document = namedtuple("Document", ["text", "labels", "id"])
 
 SQLQueryRow = namedtuple("SQLQueryRow", ["content_id", "data"])
 
+
 @dataclass
 class SqlQueryResult:
     result: List[Dict]
@@ -45,22 +46,22 @@ class IndexifyClient:
 
     def __init__(
         self,
-        service_url: str = DEFAULT_SERVICE_URL, # switch this to DEFAULT_SERVICE_URL_HTTPS for TLS
+        service_url: str = DEFAULT_SERVICE_URL,  # switch this to DEFAULT_SERVICE_URL_HTTPS for TLS
         namespace: str = "default",
         config_path: Optional[str] = None,
         *args,
         **kwargs,
     ):
         if config_path:
-            with open(config_path, 'r') as file:
+            with open(config_path, "r") as file:
                 config = yaml.safe_load(file)
-
-            if config.get('use_tls', False):
-                tls_config = config['tls_config']
+
+            if config.get("use_tls", False):
+                tls_config = config["tls_config"]
                 self._client = httpx.Client(
                     http2=True,
-                    cert=(tls_config['cert_path'], tls_config['key_path']),
-                    verify=tls_config.get('ca_bundle_path', True)
+                    cert=(tls_config["cert_path"], tls_config["key_path"]),
+                    verify=tls_config.get("ca_bundle_path", True),
                 )
             else:
                 self._client = httpx.Client(*args, **kwargs)
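When `config_path` is given, the constructor now loads a YAML file and, if `use_tls` is set, builds an HTTP/2 client with a client certificate. A hedged sketch of a config matching the keys read above; all paths and the URL are placeholders:

```python
import textwrap

from indexify import IndexifyClient

# Hypothetical config file; the key names (use_tls, tls_config.cert_path,
# tls_config.key_path, tls_config.ca_bundle_path) come from the hunk above.
with open("indexify.yaml", "w") as f:
    f.write(textwrap.dedent("""\
        use_tls: true
        tls_config:
          cert_path: client.crt      # client certificate
          key_path: client.key       # client private key
          ca_bundle_path: ca.crt     # optional; omitted -> default verification
        """))

client = IndexifyClient(
    service_url="https://localhost:8900",  # assumes a TLS-enabled server
    config_path="indexify.yaml",
)
```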
@@ -68,17 +69,18 @@ class IndexifyClient:
             self._client = httpx.Client(*args, **kwargs)
 
         self.namespace: str = namespace
-        self.extraction_policies: List[ExtractionPolicy] = []
+        self.extraction_graphs: List[ExtractionGraph] = []
         self.labels: dict = {}
         self._service_url = service_url
+        self._timeout = kwargs.get("timeout")
 
         # get namespace data
         response = self.get(f"namespaces/{self.namespace}")
         response.raise_for_status()
         resp_json = response.json()
         # initialize extraction_policies
-        for eb in resp_json["namespace"]["extraction_policies"]:
-            self.extraction_policies.append(ExtractionPolicy.from_dict(eb))
+        for eb in resp_json["namespace"]["extraction_graphs"]:
+            self.extraction_graphs.append(ExtractionGraph.from_dict(eb))
 
     @classmethod
     def with_mtls(
@@ -128,7 +130,7 @@ class IndexifyClient:
         return client
 
     def _request(self, method: str, **kwargs) -> httpx.Response:
-        response = self._client.request(method,timeout=None, **kwargs)
+        response = self._client.request(method, timeout=self._timeout, **kwargs)
         try:
             response.raise_for_status()
         except httpx.HTTPStatusError as exc:
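`_request` previously hard-coded `timeout=None` (no timeout at all); it now reuses whatever `timeout` was passed to the constructor, which `__init__` captures via `kwargs.get("timeout")`. A sketch of the resulting behavior, assuming a server on localhost:

```python
from indexify import IndexifyClient

# The timeout kwarg reaches httpx.Client(*args, **kwargs) and is also
# replayed on every request via self._timeout.
client = IndexifyClient(timeout=30.0)

# With no explicit timeout, kwargs.get("timeout") yields None, which httpx
# treats as "never time out" rather than its usual 5-second default.
client_no_timeout = IndexifyClient()
```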
@@ -188,7 +190,7 @@ class IndexifyClient:
         ```
         """
         return self._request("PUT", url=f"{self._service_url}/{endpoint}", **kwargs)
-
+
     def delete(self, endpoint: str, **kwargs) -> httpx.Response:
         """
         Make a DELETE request to the Indexify service.
@@ -243,9 +245,9 @@ class IndexifyClient:
     def create_namespace(
         self,
         namespace: str,
-        extraction_policies: list = [],
+        extraction_graphs: list = [],
         labels: dict = {},
-        service_url: str = DEFAULT_SERVICE_URL
+        service_url: str = DEFAULT_SERVICE_URL,
     ) -> "IndexifyClient":
         """
         Create a new namespace.
@@ -253,16 +255,16 @@ class IndexifyClient:
         Returns:
             IndexifyClient: a new client with the given namespace
         """
-        extraction_policies = []
-        for bd in extraction_policies:
-            if isinstance(bd, ExtractionPolicy):
-                extraction_policies.append(bd.to_dict())
+        extraction_graphs = []
+        for bd in extraction_graphs:
+            if isinstance(bd, extraction_graphs):
+                extraction_graphs.append(bd.to_dict())
             else:
-                extraction_policies.append(bd)
+                extraction_graphs.append(bd)
 
         req = {
             "name": namespace,
-            "extraction_policies": extraction_policies,
+            "extraction_graphs": extraction_graphs,
             "labels": labels,
         }
 
@@ -318,57 +320,24 @@ class IndexifyClient:
             self.extraction_policies.append(ExtractionPolicy.from_dict(eb))
         return self.extraction_policies
 
-    def add_extraction_policy(
-        self,
-        extractor: str,
-        name: str,
-        input_params: dict = {},
-        labels_eq: str = None,
-        content_source="ingestion",
-    ) -> dict:
-        """Add a new extraction policy.
+    def create_extraction_graph(self, extraction_graph: ExtractionGraph):
+        """
+        Create a new extraction graph.
 
         Args:
-        - extractor (str): Name of the extractor
-        - name (str): Name for this instance
-        - input_params (dict): Dictionary containing extractor input params
-        - filter (Filter): Optional filter for this extractor
-
-        Returns:
-        dict: response payload
-
-        Examples:
-        >>> repo.add_extraction_policy("EfficientNet", "efficientnet")
-
-        >>> repo.add_extraction_policy("MiniLML6", "minilm")
-
+        - extraction_graph (ExtractionGraph): the extraction graph to create
         """
-        req = {
-            "extractor": extractor,
-            "name": name,
-            "input_params": input_params,
-            "filters_eq": labels_eq,
-            "content_source": content_source,
-        }
-        if req["filters_eq"] == None:
-            del req["filters_eq"]
-
+        req = extraction_graph.to_dict()
+        req["namespace"] = self.namespace
         request_body = json.dumps(req, default=json_set_default)
         response = self.post(
-            f"namespaces/{self.namespace}/extraction_policies",
+            f"namespaces/{self.namespace}/extraction_graphs",
             data=request_body,
             headers={"Content-Type": "application/json"},
         )
-
-        # update self.extractor_bindings
-        self.get_extraction_policies()
-
-        try:
-            response.raise_for_status()
-        except httpx.HTTPStatusError as exc:
-            raise ApiException(exc.response.text)
+        response.raise_for_status()
         return
-
+
     def get_content_metadata(self, content_id: str) -> dict:
         """
         Get metadata for a specific content ID in a given index.
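The per-policy `add_extraction_policy` API is gone; 0.0.21 submits a whole `ExtractionGraph` to the new `extraction_graphs` endpoint instead. A hedged usage sketch; the graph name and extractor reference are placeholders:

```python
from indexify import ExtractionGraph, IndexifyClient

# Hypothetical graph spec; the field names match ExtractionGraph.from_yaml /
# from_dict in extraction_policy.py below.
graph_spec = """
name: summarizer
extraction_policies:
  - extractor: example/some-extractor
    name: policy-1
"""

client = IndexifyClient()
graph = ExtractionGraph.from_yaml(graph_spec)
client.create_extraction_graph(graph)
```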
@@ -399,11 +368,11 @@ class IndexifyClient:
             self._add_content_url(content)
             for content in response.json()["content_list"]
         ]
-
-    def download_content(self, id:str) -> bytes:
+
+    def download_content(self, id: str) -> bytes:
         """
         Download content from id. Return bytes
-
+
         Args:
         - id (str): id of content to download
         """
@@ -415,7 +384,10 @@ class IndexifyClient:
             raise ApiException(exc.response.text)
 
     def add_documents(
-        self, documents: Union[Document, str, List[Union[Document, str]]], doc_id=None
+        self,
+        extraction_graphs: Union[str, List[str]],
+        documents: Union[Document, str, List[Union[Document, str]]],
+        doc_id=None,
     ) -> None:
         """
         Add documents to current namespace.
@@ -423,6 +395,8 @@ class IndexifyClient:
         Args:
         - documents (Union[Document, str, List[Union[Document, str]]]): this can be a list of strings, list of Documents or a mix of both
         """
+        if isinstance(extraction_graphs, str):
+            extraction_graphs = [extraction_graphs]
         if isinstance(documents, Document):
             documents = [documents]
         elif isinstance(documents, str):
@@ -433,7 +407,9 @@ class IndexifyClient:
                 if isinstance(item, Document):
                     new_documents.append(item)
                 elif isinstance(item, str):
-                    new_documents.append(Document(item, {}, id=None)) # don't pass in id for a string content because doesn't make sense to have same content id for all strings
+                    new_documents.append(
+                        Document(item, {}, id=None)
+                    )  # don't pass in id for a string content because doesn't make sense to have same content id for all strings
                 else:
                     raise ValueError(
                         "List items must be either Document instances or strings."
@@ -444,7 +420,10 @@ class IndexifyClient:
                 "Invalid type for documents. Expected Document, str, or list of these."
             )
 
-        req = {"documents": [doc._asdict() for doc in documents]}
+        req = {
+            "documents": [doc._asdict() for doc in documents],
+            "extraction_graph_names": extraction_graphs,
+        }
         response = self.post(
             f"namespaces/{self.namespace}/add_texts",
             json=req,
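Ingestion is now tied to named graphs: `add_documents` takes the graph name(s) as its first argument and sends them as `extraction_graph_names`, so 0.0.19-style calls need updating. A sketch with a placeholder graph name:

```python
from indexify import Document, IndexifyClient

client = IndexifyClient()

# A bare string or a list of strings/Documents is accepted, as before.
client.add_documents(
    "summarizer",
    ["first document", Document("second document", {"lang": "en"}, id=None)],
)
```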
@@ -475,7 +454,9 @@ class IndexifyClient:
         - path (str): relative path to the file to be uploaded
         """
         with open(path, "rb") as f:
-            response = self.put(f"namespaces/{self.namespace}/content/{document_id}", files={"file": f})
+            response = self.put(
+                f"namespaces/{self.namespace}/content/{document_id}", files={"file": f}
+            )
         response.raise_for_status()
 
     def get_structured_data(self, content_id: str) -> dict:
@@ -485,11 +466,15 @@ class IndexifyClient:
         Args:
         - content_id (str): content id to query
         """
-        response = self.get(f"namespaces/{self.namespace}/content/{content_id}/metadata")
+        response = self.get(
+            f"namespaces/{self.namespace}/content/{content_id}/metadata"
+        )
         response.raise_for_status()
-        return response.json().get("metadata",[])
+        return response.json().get("metadata", [])
 
-    def search_index(self, name: str, query: str, top_k: int, filters: List[str] = []) -> list[TextChunk]:
+    def search_index(
+        self, name: str, query: str, top_k: int, filters: List[str] = []
+    ) -> list[TextChunk]:
         """
         Search index in the current namespace.
 
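`search_index` is only reformatted here, not changed in behavior. For reference, a sketch of a call against a placeholder index name:

```python
from indexify import IndexifyClient

client = IndexifyClient()

# Returns the "results" list from the search response.
chunks = client.search_index("my-index", "what changed in 0.0.21?", top_k=3)
```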
@@ -508,7 +493,7 @@ class IndexifyClient:
         response.raise_for_status()
         return response.json()["results"]
 
-    def upload_file(self, path: str, id=None, labels: dict = {}) -> str:
+    def upload_file(self, extraction_graphs: Union[str, List[str]], path: str, id=None, labels: dict = {}) -> str:
         """
         Upload a file.
 
@@ -516,9 +501,11 @@ class IndexifyClient:
         - path (str): relative path to the file to be uploaded
         - labels (dict): labels to be associated with the file
         """
-        params={}
+        if isinstance(extraction_graphs, str):
+            extraction_graphs = [extraction_graphs]
+        params = {"extraction_graph_names": extraction_graphs}
        if id is not None:
-            params['id'] = id
+            params["id"] = id
         with open(path, "rb") as f:
             response = self.post(
                 f"namespaces/{self.namespace}/upload_file",
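`upload_file` follows the same pattern: the graph name(s) come first and are always sent as the `extraction_graph_names` query parameter (previously `params` started out empty). A sketch with placeholder arguments:

```python
from indexify import IndexifyClient

client = IndexifyClient()

# Graph name and path are placeholders; returns the content id.
content_id = client.upload_file("summarizer", "docs/report.pdf")
```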
@@ -537,18 +524,20 @@ class IndexifyClient:
         response = self.get(f"namespaces/{self.namespace}/schemas")
         response.raise_for_status()
         return response.json()
-
-    def get_content_tree(self, content_id:str):
+
+    def get_content_tree(self, content_id: str):
         """
         Get content tree for a given content id
 
         Args:
         - content_id (str): id of content
         """
-        response = self.get(f"namespaces/{self.namespace}/content/{content_id}/content-tree")
+        response = self.get(
+            f"namespaces/{self.namespace}/content/{content_id}/content-tree"
+        )
         response.raise_for_status()
         return response.json()
-
+
     def sql_query(self, query: str):
         """
         Execute a SQL query.
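`sql_query` is unchanged apart from whitespace; per the context lines in the next hunk, it collects each returned row's `data` into a `SqlQueryResult`. A sketch with a placeholder table name:

```python
from indexify import IndexifyClient

client = IndexifyClient()

# result.result holds one dict per returned row.
result = client.sql_query("select * from ingestion;")
for row in result.result:
    print(row)
```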
@@ -569,9 +558,13 @@ class IndexifyClient:
             data = row["data"]
             rows.append(data)
         return SqlQueryResult(result=rows)
-
-    def ingest_remote_file(self, url: str, mime_type: str, labels: Dict[str, str], id=None):
-        req = {"url": url, "mime_type": mime_type, "labels": labels, "id": id}
+
+    def ingest_remote_file(
+        self, extraction_graphs: Union[str, List[str]], url: str, mime_type: str, labels: Dict[str, str], id=None
+    ):
+        if isinstance(extraction_graphs, str):
+            extraction_graphs = [extraction_graphs]
+        req = {"url": url, "mime_type": mime_type, "labels": labels, "id": id, "extraction_graph_names": extraction_graphs}
         response = self.post(
             f"namespaces/{self.namespace}/ingest_remote_file",
             json=req,
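`ingest_remote_file` likewise takes the graph name(s) first and adds `extraction_graph_names` to the request body. A sketch with placeholder values:

```python
from indexify import IndexifyClient

client = IndexifyClient()

client.ingest_remote_file(
    "summarizer",                       # placeholder graph name
    "https://example.com/report.pdf",   # placeholder URL
    "application/pdf",
    {"source": "example"},
)
```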
@@ -579,7 +572,7 @@ class IndexifyClient:
         )
         response.raise_for_status()
         return response.json()
-
+
     def generate_unique_hex_id(self):
         """
         Generate a unique hexadecimal identifier
@@ -588,18 +581,16 @@ class IndexifyClient:
             str: a unique hexadecimal string
         """
         return uuid.uuid4().hex[:16]
-
+
     def generate_hash_from_string(self, input_string: str):
         """
         Generate a hash for the given string and return it as a hexadecimal string.
-
+
         Args:
             input_string (str): The input string to hash.
-
+
         Returns:
             str: The hexadecimal hash of the input string.
         """
         hash_object = hashlib.sha256(input_string.encode())
         return hash_object.hexdigest()[:16]
-
-
indexify/extraction_policy.py CHANGED
@@ -1,5 +1,5 @@
 from dataclasses import dataclass, asdict
-from typing import Optional
+from typing import Optional, List
 
 
 @dataclass
@@ -7,7 +7,7 @@ class ExtractionPolicy:
     extractor: str
     name: str
     content_source: str
-    input_params: dict
+    input_params: Optional[dict] = None
     id: Optional[str] = None
     labels_eq: Optional[str] = None
 
@@ -25,4 +25,44 @@ class ExtractionPolicy:
     def from_dict(cls, json: dict):
         if "filters_eq" in json:
             json["labels_eq"] = json.pop("filters_eq")
+        json["id"] = json.get("id", None)
         return ExtractionPolicy(**json)
+
+
+@dataclass
+class ExtractionGraph:
+    id: str
+    name: str
+    extraction_policies: List[ExtractionPolicy]
+
+    @classmethod
+    def from_dict(cls, json: dict):
+        json["id"] = json.get("id", None)
+        if "namespace" in json.keys():
+            json.pop("namespace")
+        return ExtractionGraph(**json)
+
+    @staticmethod
+    def from_yaml(spec: str):
+        import yaml
+
+        return ExtractionGraph.from_dict(yaml.load(spec, Loader=yaml.FullLoader))
+
+    def to_dict(self) -> dict:
+        filtered_dict = {k: v for k, v in asdict(self).items() if v is not None}
+        return filtered_dict
+
+
+class ExtractionGraphBuilder:
+    def __init__(self, name: str):
+        self.name = name
+        self.extraction_policies = []
+
+    def policy(self, policy: ExtractionPolicy) -> "ExtractionGraphBuilder":
+        self.extraction_policies.append(policy)
+        return self
+
+    def build(self):
+        return ExtractionGraph(
+            id=self.id, name=self.name, extraction_policies=self.extraction_policies
+        )
indexify/extractor.py CHANGED
@@ -17,7 +17,12 @@ class ExtractorSchema:
 
 class Extractor:
     def __init__(
-        self, name: str, description: str, input_params: dict, outputs: ExtractorSchema, input_mime_types: list[str]
+        self,
+        name: str,
+        description: str,
+        input_params: dict,
+        outputs: ExtractorSchema,
+        input_mime_types: list[str],
     ):
         self.name = name
         self.description = description
indexify/settings.py CHANGED
@@ -1,2 +1,2 @@
 DEFAULT_SERVICE_URL = "http://localhost:8900"
-DEFAULT_SERVICE_URL_HTTPS = "https://localhost:8900"
\ No newline at end of file
+DEFAULT_SERVICE_URL_HTTPS = "https://localhost:8900"
{indexify-0.0.19.dist-info → indexify-0.0.21.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: indexify
-Version: 0.0.19
+Version: 0.0.21
 Summary: Python Client for Indexify
 Home-page: https://github.com/tensorlakeai/indexify
 License: Apache 2.0
@@ -13,6 +13,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: httpx[http2] (>=0.26,<0.27)
+Requires-Dist: pyyaml (>=6.0.1,<7.0.0)
 Project-URL: Repository, https://github.com/tensorlakeai/indexify
 Description-Content-Type: text/markdown
 
indexify-0.0.21.dist-info/RECORD ADDED
@@ -0,0 +1,13 @@
+indexify/__init__.py,sha256=hhDqRvJo4gCW1eqVgFblxKiBzArCFfo2eFGOBsQkDOc,401
+indexify/client.py,sha256=s2Xflh75574WvNp0lbG6PGtK2Dy3CMfME5MDK1iDgR4,19334
+indexify/data_containers.py,sha256=r1wxJPtsmXbyKvb17fqxm-dPjKz51oZ62f8A8Zxls1c,361
+indexify/exceptions.py,sha256=vjd5SPPNFIEW35GorSIodsqvm9RKHQm9kdp8t9gv-WM,111
+indexify/extraction_policy.py,sha256=dIyQK3N-QOpQ0BPjiZ_635o8A5ITNxaz1syQ_FPaE0k,1851
+indexify/extractor.py,sha256=sWFLlXHgEfWlmiKAXN6ytUt_uG7th-XGNHqz-TG39gs,1216
+indexify/index.py,sha256=RvxYhJXEth-GKvqzlMiz5PuN1eIbZk84pt20piA1Gsw,504
+indexify/settings.py,sha256=LSaWZ0ADIVmUv6o6dHWRC3-Ry5uLbCw2sBSg1e_U7UM,99
+indexify/utils.py,sha256=rDN2lrsAs9noJEIjfx6ukmC2SAIyrlUt7QU-kaBjujM,125
+indexify-0.0.21.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+indexify-0.0.21.dist-info/METADATA,sha256=Rb_7fwsIiJKuJaLnmJp7Cw4exYLhHcdx48OfBcFzaO4,1753
+indexify-0.0.21.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+indexify-0.0.21.dist-info/RECORD,,
indexify-0.0.19.dist-info/RECORD DELETED
@@ -1,13 +0,0 @@
-indexify/__init__.py,sha256=Sz6zkAIHsPOi0rG5RM7dVkXGDa0fO2uurD6vS4Qo15E,312
-indexify/client.py,sha256=x2-Yqa59x20K4-5V7Agh35jOGqRIBGZrAoQYKXjuq0A,19480
-indexify/data_containers.py,sha256=r1wxJPtsmXbyKvb17fqxm-dPjKz51oZ62f8A8Zxls1c,361
-indexify/exceptions.py,sha256=vjd5SPPNFIEW35GorSIodsqvm9RKHQm9kdp8t9gv-WM,111
-indexify/extraction_policy.py,sha256=vKVHT8jSjzhUaKqWpewOGkYojMBplvGdSm9zoSN9Pcg,750
-indexify/extractor.py,sha256=KMcP9xopHJRBzeSxalztGGTBvOzVKRFEsJynV-hLRSc,1175
-indexify/index.py,sha256=RvxYhJXEth-GKvqzlMiz5PuN1eIbZk84pt20piA1Gsw,504
-indexify/settings.py,sha256=UXUd6hYlDALPPjUCFvFkvUmsm7HwXAluWowCjZWoxjY,98
-indexify/utils.py,sha256=rDN2lrsAs9noJEIjfx6ukmC2SAIyrlUt7QU-kaBjujM,125
-indexify-0.0.19.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-indexify-0.0.19.dist-info/METADATA,sha256=reizFOmSBBTh3n4wMVcxqeOdg7APpnBmpcxr32jiwJg,1714
-indexify-0.0.19.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-indexify-0.0.19.dist-info/RECORD,,