indexify 0.0.34__tar.gz → 0.0.36__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {indexify-0.0.34 → indexify-0.0.36}/PKG-INFO +2 -1
- indexify-0.0.36/indexify/__init__.py +30 -0
- {indexify-0.0.34 → indexify-0.0.36}/indexify/client.py +63 -27
- indexify-0.0.36/indexify/data.py +83 -0
- indexify-0.0.36/indexify/data_loaders/__init__.py +55 -0
- indexify-0.0.36/indexify/data_loaders/local_directory_loader.py +27 -0
- {indexify-0.0.34 → indexify-0.0.36}/indexify/extraction_policy.py +1 -1
- indexify-0.0.36/indexify/extractor.py +120 -0
- indexify-0.0.36/indexify/extractor_utils.py +108 -0
- indexify-0.0.36/indexify/graph.py +49 -0
- indexify-0.0.36/indexify/local_runner.py +53 -0
- {indexify-0.0.34 → indexify-0.0.36}/pyproject.toml +2 -1
- indexify-0.0.34/indexify/__init__.py +0 -18
- indexify-0.0.34/indexify/data_containers.py +0 -37
- indexify-0.0.34/indexify/extractor.py +0 -47
- indexify-0.0.34/indexify/index.py +0 -17
- {indexify-0.0.34 → indexify-0.0.36}/LICENSE.txt +0 -0
- {indexify-0.0.34 → indexify-0.0.36}/README.md +0 -0
- {indexify-0.0.34 → indexify-0.0.36}/indexify/error.py +0 -0
- {indexify-0.0.34 → indexify-0.0.36}/indexify/exceptions.py +0 -0
- {indexify-0.0.34 → indexify-0.0.36}/indexify/settings.py +0 -0
- {indexify-0.0.34 → indexify-0.0.36}/indexify/utils.py +0 -0
{indexify-0.0.34 → indexify-0.0.36}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: indexify
-Version: 0.0.34
+Version: 0.0.36
 Summary: Python Client for Indexify
 Home-page: https://github.com/tensorlakeai/indexify
 License: Apache 2.0

@@ -14,6 +14,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: httpx[http2] (>=0.26,<0.27)
+Requires-Dist: pydantic (>=2.8,<3.0)
 Requires-Dist: pyyaml (>=6.0.1,<7.0.0)
 Project-URL: Repository, https://github.com/tensorlakeai/indexify
 Description-Content-Type: text/markdown
indexify-0.0.36/indexify/__init__.py (new file)

@@ -0,0 +1,30 @@
+from .client import IndexifyClient
+from .extraction_policy import ExtractionGraph
+from .client import (
+    IndexifyClient,
+    Document,
+    generate_hash_from_string,
+    generate_unique_hex_id,
+)
+from .data import ContentMetadata, Content, Feature
+from .extractor import Extractor, extractor, EmbeddingSchema
+from .settings import DEFAULT_SERVICE_URL
+from . import data_loaders
+
+__all__ = [
+    "ContentMetadata",
+    "Content",
+    "data_loaders",
+    "Feature",
+    "Extractor",
+    "extractor",
+    "EmbeddingSchema",
+    "extractor",
+    "Document",
+    "IndexifyClient",
+    "ExtractionGraph",
+    "ExtractionGraphBuilder" "ExtractionPolicy",
+    "DEFAULT_SERVICE_URL",
+    "generate_hash_from_string",
+    "generate_unique_hex_id",
+]
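Compared with the 0.0.34 module (removed at the bottom of this diff), the package root now re-exports the pydantic data model, the extractor decorator, and the data_loaders subpackage. A minimal sketch of the import surface this `__all__` promises; nothing beyond the names listed above is assumed:

```python
# Sketch: imports follow the 0.0.36 __all__ shown above.
from indexify import (
    IndexifyClient,
    ExtractionGraph,
    Content,
    ContentMetadata,
    Feature,
    Extractor,
    extractor,
    EmbeddingSchema,
    data_loaders,
    DEFAULT_SERVICE_URL,
)
```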
{indexify-0.0.34 → indexify-0.0.36}/indexify/client.py

@@ -6,11 +6,11 @@ import json
 from collections import namedtuple
 from .settings import DEFAULT_SERVICE_URL, DEFAULT_SERVICE_URL_HTTPS
 from .extractor import Extractor
-from .extraction_policy import
-from .index import Index
+from .extraction_policy import ExtractionGraph
 from .utils import json_set_default
 from .error import Error
-from .
+from .data import Content, ContentMetadata
+from .data_loaders import DataLoader
 from indexify.exceptions import ApiException
 from dataclasses import dataclass
 from typing import List, Optional, Union, Dict
@@ -316,7 +316,7 @@ class IndexifyClient:
             "content_url": f"{self._service_url}/namespaces/{self.namespace}/content/{content['id']}/download",
         }

-    def indexes(self) ->
+    def indexes(self) -> dict:
         """
         Get the indexes of the current namespace.

@@ -399,8 +399,10 @@ class IndexifyClient:
         Args:
         - content_id (str): content id to query
         """
-        response = self.get(
-
+        response = self.get(
+            f"namespaces/{self.namespace}/content/{content_id}/metadata"
+        )
+        return response.json()["content_metadata"]

     def download_content(self, content_id: str) -> bytes:
         """
@@ -409,7 +411,9 @@
         Args:
         - content_id (str): id of content to download
         """
-        response = self.get(
+        response = self.get(
+            f"namespaces/{self.namespace}/content/{content_id}/download"
+        )
         return response.content

     def add_documents(
@@ -520,7 +524,7 @@

     def search_index(
         self, name: str, query: str, top_k: int, filters: List[str] = []
-    ) ->
+    ) -> dict:
         """
         Search index in the current namespace.

@@ -573,34 +577,59 @@

     def upload_file(
         self,
-
-        path: str,
+        extraction_graph: str,
+        path: str,
+        file_bytes:bytes=None,
         id=None,
         labels: dict = {},
     ) -> str:
         """
-        Upload a file.
+        Upload a file from a path or the bytes.

         Args:
-        -
+        - extraction_graph (str): name of the extraction graph to use for extraction
+        - path (Union[str, bytes]): relative path to the file to be uploaded, or the bytes of the file
         - labels (dict): labels to be associated with the file
         """
-        if isinstance(extraction_graphs, str):
-            extraction_graphs = [extraction_graphs]
         params = {}
         if id is not None:
            params["id"] = id
-
-
+
+        if file_bytes == None:
+            with open(path, "rb") as f:
                 response = self.post(
                     f"namespaces/{self.namespace}/extraction_graphs/{extraction_graph}/extract",
                     files={"file": f},
                     data={"labels": json.dumps(labels)},
                     params=params,
                 )
-
-
-
+        else:
+            response = self.post(
+                f"namespaces/{self.namespace}/extraction_graphs/{extraction_graph}/extract",
+                files={"file": (path, file_bytes)},
+                data={"labels": json.dumps(labels)},
+                params=params,
+            )
+            file_content = path
+
+        response_json = response.json()
+        content_id = response_json["content_id"]
+        return content_id
+
+    def ingest_from_loader(self, loader: DataLoader, extraction_graph: str) -> List[str]:
+        """
+        Loads content using the loader, uploads them to Indexify and returns the content ids.
+        loader: DataLoader: The DataLoader object to use for loading content
+        extraction_graph: str: The name of the extraction graph to use for extraction
+        """
+        content_ids = []
+        files = loader.load()
+        for file_metadata in files:
+            labels={"file_name": file_metadata.path}
+            print(labels)
+            content_id = self.upload_file(extraction_graph, file_metadata.path, file_metadata.read_all_bytes(), labels=labels)
+            content_ids.append(content_id)
+        return content_ids

     def list_schemas(self) -> List[str]:
         """
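upload_file now leads with the extraction graph name and accepts raw bytes as an alternative to a filesystem path, and the new ingest_from_loader bridges a DataLoader to it. A rough usage sketch against these signatures; the graph name and paths are illustrative, and the no-argument client constructor is an assumption (it is not shown in this diff):

```python
# Hedged sketch: assumes a reachable Indexify server and an existing
# extraction graph named "wikipedia" (hypothetical name).
from indexify import IndexifyClient

client = IndexifyClient()  # constructor args not shown in this diff

# Path-based upload: the client opens and streams the file itself.
cid = client.upload_file("wikipedia", "docs/report.pdf", labels={"team": "research"})

# Bytes-based upload: `path` only supplies the file name sent to the server.
raw = open("docs/report.pdf", "rb").read()
cid = client.upload_file("wikipedia", "report.pdf", file_bytes=raw)
```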
@@ -610,7 +639,11 @@ class IndexifyClient:
         return response.json()

     def get_extracted_content(
-        self,
+        self,
+        ingested_content_id: str,
+        graph_name: str,
+        policy_name: str,
+        blocking=False,
     ):
         """
         Get list of child for a given content id and their content up to the specified level.
@@ -631,10 +664,16 @@
         for item in content_tree["content_tree_metadata"]:
             if (
                 graph_name in item["extraction_graph_names"]
-                and item["source"] == policy_name
+                and item["source"] == policy_name
             ):
                 content = self.download_content(item["id"])
-                child_list.append(
+                child_list.append(
+                    {
+                        "id": item["id"],
+                        "mime_type": item["mime_type"],
+                        "content": content,
+                    }
+                )

         return child_list

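Callers now select extracted children by the ingested content id plus graph and policy name, and each returned entry is the dict assembled above (id, mime type, raw bytes). A usage sketch with hypothetical ids and names:

```python
# Sketch: the content id, graph, and policy names are hypothetical;
# the default-constructed client follows the sketch above.
from indexify import IndexifyClient

client = IndexifyClient()
children = client.get_extracted_content(
    ingested_content_id="1234abcd",
    graph_name="wikipedia",
    policy_name="chunker",
)
for child in children:
    print(child["id"], child["mime_type"], len(child["content"]))
```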
@@ -660,23 +699,20 @@

     def ingest_remote_file(
         self,
-
+        extraction_graph: str,
         url: str,
         mime_type: str,
         labels: Dict[str, str],
         id=None,
     ):
-        if isinstance(extraction_graphs, str):
-            extraction_graphs = [extraction_graphs]
         req = {
             "url": url,
             "mime_type": mime_type,
             "labels": labels,
             "id": id,
-            "extraction_graph_names": extraction_graphs,
         }
         response = self.post(
-            f"namespaces/{self.namespace}/
+            f"namespaces/{self.namespace}/extraction_graphs/{extraction_graph}/extract_remote",
             json=req,
             headers={"Content-Type": "application/json"},
         )
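Remote ingestion mirrors upload_file: a single graph name routed through the URL replaces the old extraction_graph_names list in the request body. A sketch with an illustrative URL and labels:

```python
# Sketch: graph name, URL, mime type, and labels are illustrative.
from indexify import IndexifyClient

client = IndexifyClient()
client.ingest_remote_file(
    "wikipedia",
    "https://example.com/report.pdf",
    "application/pdf",
    {"source": "example"},
)
```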
indexify-0.0.36/indexify/data.py (new file)

@@ -0,0 +1,83 @@
+from typing import Any, List, Optional, Literal, Dict
+from pydantic import BaseModel, Json, Field
+import json
+
+
+class Feature(BaseModel):
+    feature_type: Literal["embedding", "metadata"]
+    name: str
+    value: Json
+    comment: Optional[Json] = Field(default=None)
+
+    @classmethod
+    def embedding(cls, values: List[float], name: str = "embedding", distance="cosine"):
+        return cls(
+            feature_type="embedding",
+            name=name,
+            value={values: values, distance: distance},
+            comment=None,
+        )
+
+    @classmethod
+    def metadata(cls, value: Json, comment: Json = None, name: str = "metadata"):
+        value = json.dumps(value)
+        comment = json.dumps(comment) if comment is not None else None
+        return cls(feature_type="metadata", name=name, value=value)
+
+
+class Content(BaseModel):
+    id: str
+    content_type: Optional[str]
+    data: bytes
+    features: List[Feature] = []
+
+    @classmethod
+    def from_text(
+        cls,
+        text: str,
+        features: List[Feature] = [],
+    ):
+        return Content(
+            id="none-for-now",
+            content_type="text/plain",
+            data=bytes(text, "utf-8"),
+            features=features,
+        )
+
+    @classmethod
+    def from_json(cls, json_data: Json, features: List[Feature] = []):
+        return cls(
+            content_type="application/json",
+            data=bytes(json.dumps(json_data), "utf-8"),
+            features=features,
+        )
+
+    @classmethod
+    def from_file(cls, path: str):
+        import mimetypes
+
+        m, _ = mimetypes.guess_type(path)
+        with open(path, "rb") as f:
+            return cls(content_type=m, data=f.read())
+
+
+class ContentMetadata(BaseModel):
+    id: str
+    parent_id: str
+    labels: Dict[str, Any]
+    extraction_graph_names: List[str]
+    extraction_policy: str
+    mime_type: str
+    extracted_metadata: Dict[str, Any] = {}
+
+    @classmethod
+    def from_dict(cls, json: Dict):
+        return cls(
+            id=json["id"],
+            parent_id=json["parent_id"],
+            labels=json["labels"],
+            extraction_graph_names=json["extraction_graph_names"],
+            extraction_policy=json["source"],
+            mime_type=json["mime_type"],
+            extracted_metadata=json["extracted_metadata"],
+        )
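The dataclass-based containers from 0.0.34 are replaced by pydantic models, and Content gains constructors for text, JSON, and files. A small sketch using only the constructors defined above; the text and label values are illustrative:

```python
# Sketch: builds a text Content with an attached metadata Feature.
from indexify import Content, Feature

doc = Content.from_text(
    "hello indexify",
    features=[Feature.metadata({"lang": "en"}, name="language")],
)
print(doc.content_type, len(doc.data), doc.features[0].name)
# -> text/plain 14 language
```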
indexify-0.0.36/indexify/data_loaders/__init__.py (new file)

@@ -0,0 +1,55 @@
+from pydantic import BaseModel
+
+from abc import ABC, abstractmethod
+from typing import List
+import os
+import mimetypes
+import hashlib
+
+class FileMetadata(BaseModel):
+    path: str
+    file_size: int
+    mime_type: str
+    md5_hash: str
+    created_at: int
+    updated_at: int
+
+    @classmethod
+    def from_path(cls, path: str):
+        file_size = os.path.getsize(path)
+        mime_type = mimetypes.guess_type(path)[0]
+
+        # Compute MD5 hash
+        hash_md5 = hashlib.md5()
+        with open(path, "rb") as f:
+            for chunk in iter(lambda: f.read(4096), b""):
+                hash_md5.update(chunk)
+        md5_hash = hash_md5.hexdigest()
+
+        created_at = int(os.path.getctime(path))
+        updated_at = int(os.path.getmtime(path))
+
+        return cls(
+            path=path,
+            file_size=file_size,
+            mime_type=str(mime_type),
+            md5_hash=md5_hash,
+            created_at=created_at,
+            updated_at=updated_at,
+        )
+
+    def read_all_bytes(self) -> bytes:
+        with open(self.path, "rb") as f:
+            return f.read()
+
+
+class DataLoader(ABC):
+    @abstractmethod
+    def load(self) -> List[FileMetadata]:
+        pass
+
+    @abstractmethod
+    def state(self) -> dict:
+        pass
+
+from .local_directory_loader import LocalDirectoryLoader
indexify-0.0.36/indexify/data_loaders/local_directory_loader.py (new file)

@@ -0,0 +1,27 @@
+from . import DataLoader, FileMetadata
+from typing import List, Optional
+import os
+
+
+class LocalDirectoryLoader(DataLoader):
+    def __init__(self, directory: str, file_extensions: Optional[List[str]] = None, state: dict ={}):
+        self.directory = directory
+        self.file_extensions = file_extensions
+        self.processed_files = set(state.get("processed_files", []))
+
+    def load(self) -> List[FileMetadata]:
+        file_metadata_list = []
+        for root, _, files in os.walk(self.directory):
+            for file in files:
+                if self.file_extensions is None or any(
+                    file.endswith(ext) for ext in self.file_extensions
+                ):
+                    file_path = os.path.join(root, file)
+                    if file_path not in self.processed_files:
+                        file_metadata_list.append(FileMetadata.from_path(file_path))
+                        self.processed_files.add(file_path)
+
+        return file_metadata_list
+
+    def state(self) -> dict:
+        return {"processed_files": list(self.processed_files)}
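LocalDirectoryLoader walks a directory and tracks what it has already seen, and state() exposes that set so a caller can persist it and resume later. A sketch with illustrative paths and extensions:

```python
# Sketch: directory and extensions are illustrative.
from indexify.data_loaders import LocalDirectoryLoader

loader = LocalDirectoryLoader("./docs", file_extensions=[".pdf", ".txt"])
for fm in loader.load():
    print(fm.path, fm.file_size, fm.md5_hash)

checkpoint = loader.state()  # {"processed_files": [...]}
# A fresh loader seeded with the checkpoint skips files already processed.
resumed = LocalDirectoryLoader("./docs", file_extensions=[".pdf", ".txt"], state=checkpoint)
```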
indexify-0.0.36/indexify/extractor.py (new file)

@@ -0,0 +1,120 @@
+from typing import Union, Optional, List, Type, Tuple, Callable, get_type_hints, Dict
+import inspect
+from pydantic import BaseModel
+from abc import ABC, abstractmethod
+from .data import Content, Feature
+import json
+
+class EmbeddingSchema(BaseModel):
+    dimension: int
+
+class Extractor(ABC):
+    name: str = ""
+
+    version: str = "0.0.0"
+
+    system_dependencies: List[str] = []
+
+    python_dependencies: List[str] = []
+
+    description: str = ""
+
+    input_mime_types = ["text/plain"]
+
+    def extract(
+        self, content: Content, params: Type[BaseModel] = None
+    ) -> List[Union[Feature, Content]]:
+        """
+        Extracts information from the content. Returns a list of features to add
+        to the content.
+        It can also return a list of Content objects, which will be added to storage
+        and any extraction policies defined will be applied to them.
+        """
+        pass
+
+    def extract_batch(
+        self, content_list: List[Content], params: List[Type[BaseModel]] = None
+    ) -> List[List[Union[Feature, Content]]]:
+        """
+        Extracts information from the content. Returns a list of features to add
+        to the content.
+        It can also return a list of Content objects, which will be added to storage
+        and any extraction policies defined will be applied to them.
+        """
+        pass
+
+    @classmethod
+    @abstractmethod
+    def sample_input(cls) -> Tuple[Content, Type[BaseModel]]:
+        pass
+
+    @classmethod
+    @abstractmethod
+    def embedding_schemas(cls) -> Dict[str, EmbeddingSchema]:
+        raise NotImplementedError
+
+    def describe(self) -> Dict:
+        embedding_schemas = {}
+        try:
+            embedding_schemas = self.embedding_schemas()
+        except NotImplementedError:
+            pass
+
+        json_schema = (
+            self._param_cls.model_json_schema() if self._param_cls is not None else None
+        )
+
+        return {
+            "name": self.name,
+            "version": self.version,
+            "description": self.description,
+            "system_dependencies": self.system_dependencies,
+            "python_dependencies": self.python_dependencies,
+            "input_mime_types": self.input_mime_types,
+            "embedding_schemas": embedding_schemas,
+            "input_params": json.dumps(json_schema),
+        }
+
+def extractor(
+    name: Optional[str] = None,
+    description: Optional[str] = "",
+    version: Optional[str] = "",
+    python_dependencies: Optional[List[str]] = None,
+    system_dependencies: Optional[List[str]] = None,
+    input_mime_types: Optional[List[str]] = None,
+    embedding_schemas: Optional[Dict[str, EmbeddingSchema]] = None,
+    sample_content: Optional[Callable] = None,
+):
+    args = locals()
+    del args["sample_content"]
+
+    def construct(fn):
+        def wrapper():
+            hint = get_type_hints(fn).get("params", dict)
+
+            if not args.get("name"):
+                args["name"] = (
+                    f"{inspect.getmodule(inspect.stack()[1][0]).__name__}:{fn.__name__}"
+                )
+
+            class DecoratedFn(Extractor):
+                @classmethod
+                def extract(cls, content: Content, params: hint) -> List[Content]:  # type: ignore
+                    # TODO we can force all the functions to take in a parms object
+                    # or check if someone adds a params
+                    if params is None:
+                        return fn(content)
+                    else:
+                        return fn(content, params)
+
+                def sample_input(self) -> Content:
+                    return sample_content() if sample_content else self.sample_text()
+
+            for key, val in args.items():
+                setattr(DecoratedFn, key, val)
+
+            return DecoratedFn
+
+        return wrapper
+
+    return construct
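The @extractor decorator wraps a plain function in a generated Extractor subclass: the decorated name becomes a factory, calling it returns the class, and the class's extract dispatches back to the function (passing params only when one was supplied). A sketch against the code above; the upper-casing body is illustrative:

```python
# Sketch: the transformation itself is illustrative.
from typing import List
from indexify import Content, extractor

@extractor(description="upper-cases text content")
def shout(content: Content) -> List[Content]:
    text = content.data.decode("utf-8")
    return [Content.from_text(text.upper())]

# The decorator yields a factory; calling it produces the Extractor subclass.
ShoutExtractor = shout()
results = ShoutExtractor.extract(Content.from_text("hi"), None)  # [Content(data=b"HI", ...)]
```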
indexify-0.0.36/indexify/extractor_utils.py (new file)

@@ -0,0 +1,108 @@
+import os
+import httpx
+from typing import List
+from .data import Content, Feature
+
+class SampleExtractorData:
+
+    def _download_file(self, url, filename):
+        if os.path.exists(filename):
+            # file exists skip
+            return
+        try:
+            with httpx.get(url, stream=True) as r:
+                r.raise_for_status()  # Raises an HTTPError if the response status code is 4XX/5XX
+                with open(filename, "wb") as f:
+                    for chunk in r.iter_content(chunk_size=8192):
+                        f.write(chunk)
+        except httpx.exceptions.RequestException as e:
+            print(f"Error downloading the file: {e}")
+
+    def sample_mp3(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.mp3"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/sample-000009.mp3",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="audio/mpeg", data=f.read(), features=features)
+
+    def sample_mp4(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.mp4"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/sample.mp4",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="video/mp4", data=f.read(), features=features)
+
+    def sample_jpg(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.jpg"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/people-standing.jpg",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="image/jpg", data=f.read(), features=features)
+
+    def sample_invoice_jpg(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.jpg"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/invoice-example.jpg",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="image/jpg", data=f.read(), features=features)
+
+    def sample_invoice_pdf(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.pdf"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/invoice-example.pdf",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="application/pdf", data=f.read(), features=features)
+
+    def sample_image_based_pdf(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.pdf"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/image-based.pdf",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="application/pdf", data=f.read(), features=features)
+
+    def sample_scientific_pdf(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.pdf"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/scientific-paper-example.pdf",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="application/pdf", data=f.read(), features=features)
+
+    def sample_presentation(self, features: List[Feature] = []) -> Content:
+        file_name = "test.pptx"
+        self._download_file(
+            "https://raw.githubusercontent.com/tensorlakeai/indexify/main/docs/docs/files/test.pptx",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(
+            content_type="application/vnd.openxmlformats-officedocument.presentationml.presentation",
+            data=f.read(),
+            features=features,
+        )
+
+    def sample_text(self, features: List[Feature] = []) -> Content:
+        article = """New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 2010 marriage license application, according to court documents. Prosecutors said the marriages were part of an immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages. Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted. The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces up to four years in prison. Her next court appearance is scheduled for May 18."""
+        return Content(content_type="text/plain", data=article, features=features)
+
+    def sample_html(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.html"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/sample.html",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="text/html", data=f.read(), features=features)
indexify-0.0.36/indexify/graph.py (new file)

@@ -0,0 +1,49 @@
+from indexify import Content, extractor
+from indexify.extractor import Extractor
+
+from collections import defaultdict
+from typing import Any, Callable, Dict, List, Optional
+
+
+@extractor(description="id function")
+def _id(content: Content) -> List[Content]:
+    return [content]
+
+class Graph:
+    def __init__(self, name: str):
+        # TODO check for cycles
+        self.name = name
+
+        self.nodes: Dict[str, Callable] = {}
+        self.params: Dict[str, Any] = {}
+
+        self.edges: Dict[str, List[(str, str)]] = defaultdict(list)
+
+        self.results: Dict[str, Any] = defaultdict(list)  # TODO should the Any be Content?
+
+        self.nodes["start"] = _id
+        self.nodes["end"] = _id
+
+        self._topo_counter = defaultdict(int)
+
+        self._start_node = None
+
+    def node(self, name: str, closure: Extractor, params: Any = None) -> None:
+        if name in self.nodes:
+            raise Exception(f"Cannot insert node, node with name: `{name}` already exists")
+
+        self.nodes[name] = closure
+        self.params[name] = params
+
+        # assign each node a rank of 1 to init the graph
+        self._topo_counter[name] = 1
+
+    def edge(self, from_node: str, to_node: str, prefilter_predicates: Optional[str] = None) -> None:
+        self.edges[from_node].append((to_node, prefilter_predicates))
+
+        self._topo_counter[to_node] += 1
+
+    def _assign_start_node(self):
+        # this method should be called before a graph can be run
+        nodes = sorted(self._topo_counter.items(), key=lambda x: x[1])
+        self._start_node = nodes[0][0]
indexify-0.0.36/indexify/local_runner.py (new file)

@@ -0,0 +1,53 @@
+from indexify import Content
+
+from collections import defaultdict
+from typing import Any, Callable, Dict, Optional
+
+class LocalRunner:
+    def __init__(self):
+        self.results: Dict[str, Any] = defaultdict(list)  # TODO should the Any be Content?
+
+    def run(self, g, content: Content):
+        g._assign_start_node()
+        return self._run(g, content=content, node_name=g._start_node)
+
+    def _run(self, g, content: Content, node_name: str):
+        extractor_construct: Callable = g.nodes[node_name]
+        params = g.params.get(node_name, None)
+
+        res = extractor_construct().extract(content=content, params=params)
+
+        self.results[node_name].extend(res)
+
+        for out_edge, pre_filter_predicate in g.edges[node_name]:
+            # TODO there are no reductions yet, each recursion finishes it's path and returns
+            for r in res:
+                if self._prefilter_content(content=r, prefilter_predicate=pre_filter_predicate):
+                    continue
+
+                self._run(g, content=r, node_name=out_edge)
+
+    def _prefilter_content(self, content: Content, prefilter_predicate: Optional[str]) -> bool:
+        if prefilter_predicate is None:
+            return False
+
+        atoms = prefilter_predicate.split('and')
+        if len(atoms) == 0 or len(atoms) == 1:
+            return False
+
+        # TODO For now only support `and` and `=` and `string values`
+        bools = []
+        for feature in content.features:
+            if feature.feature_type == 'metadata':
+                values = feature.value
+
+                print(f'{prefilter_predicate, atoms}')
+                for atom in atoms:
+                    l, r = atom.split('=')
+                    if l in values:
+                        bools.append(values[l] == r)
+
+        return all(bools)
+
+    def get_result(self, node_name: str) -> Content:
+        return self.results[node_name]
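Together, Graph and LocalRunner give an in-process way to chain extractors: nodes hold extractor factories, edges carry optional prefilter predicates, and the runner walks the graph depth-first from whichever node the topology counter ranks lowest. An end-to-end sketch reusing the decorator shown earlier; node names and the splitting logic are illustrative:

```python
# Sketch: chains a line-splitting extractor into the built-in "end" node.
from typing import List
from indexify import Content, extractor
from indexify.graph import Graph
from indexify.local_runner import LocalRunner

@extractor(description="split text into one Content per line")
def split_lines(content: Content) -> List[Content]:
    text = content.data.decode("utf-8")
    return [Content.from_text(line) for line in text.splitlines()]

g = Graph("demo")
g.node("split", split_lines)   # registers the factory and its topo rank
g.edge("split", "end")         # no prefilter predicate

runner = LocalRunner()
runner.run(g, Content.from_text("first\nsecond"))
print(runner.get_result("split"))  # two Content objects
```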
{indexify-0.0.34 → indexify-0.0.36}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "indexify"
-version = "0.0.34"
+version = "0.0.36"
 description = "Python Client for Indexify"
 authors = ["Diptanu Gon Choudhury <diptanuc@gmail.com>", "Lucas Jackson <lucas@tensorlake.ai>", "Vijay Parthasarathy <vijay2win@gmail.com>"]
 license = "Apache 2.0"

@@ -12,6 +12,7 @@ repository = "https://github.com/tensorlakeai/indexify"
 python = "^3.9"
 httpx = { version = "^0.26", extras = ["http2"] }
 pyyaml = "^6.0.1"
+pydantic = "^2.8"

 [tool.poetry.dev-dependencies]
 black = "^22.3.0"
indexify-0.0.34/indexify/__init__.py (removed)

@@ -1,18 +0,0 @@
-from .index import Index
-from .client import IndexifyClient
-from .extraction_policy import ExtractionGraph
-from .client import IndexifyClient, Document, generate_hash_from_string, generate_unique_hex_id
-from .data_containers import Content
-from .settings import DEFAULT_SERVICE_URL
-
-__all__ = [
-    "Index",
-    "Content",
-    "Document",
-    "IndexifyClient",
-    "ExtractionGraph",
-    "ExtractionGraphBuilder" "ExtractionPolicy",
-    "DEFAULT_SERVICE_URL",
-    "generate_hash_from_string",
-    "generate_unique_hex_id",
-]
indexify-0.0.34/indexify/data_containers.py (removed)

@@ -1,37 +0,0 @@
-from enum import Enum
-from typing import List
-from dataclasses import dataclass, field
-
-@dataclass
-class Content:
-    id: str
-    parent_id: str
-    labels: dict[str, any]
-    extraction_graph_names: List[str]
-    extraction_policy: str
-    mime_type: str
-
-    @classmethod
-    def from_dict(cls, json: dict):
-        return Content(
-            id=json["id"],
-            parent_id=json["parent_id"],
-            labels=json["labels"],
-            extraction_graph_names=json["extraction_graph_names"],
-            extraction_policy=json["source"],
-            mime_type=json["mime_type"],
-        )
-
-@dataclass
-class TextChunk:
-    text: str
-    metadata: dict[str, any] = field(default_factory=dict)
-    score: float = 0.0
-
-    def to_dict(self):
-        return {"text": self.text, "metadata": self.metadata}
-
-
-@dataclass
-class SearchResult:
-    results: List[TextChunk]
indexify-0.0.34/indexify/extractor.py (removed)

@@ -1,47 +0,0 @@
-from dataclasses import dataclass
-from typing import Union
-
-from .settings import DEFAULT_SERVICE_URL
-
-
-@dataclass
-class EmbeddingSchema:
-    distance: str
-    dim: int
-
-
-@dataclass
-class ExtractorSchema:
-    outputs: dict[str, Union[EmbeddingSchema, dict]]
-
-
-class Extractor:
-    def __init__(
-        self,
-        name: str,
-        description: str,
-        input_params: dict,
-        outputs: ExtractorSchema,
-        input_mime_types: list[str],
-    ):
-        self.name = name
-        self.description = description
-        self.input_params = input_params
-        self.outputs = outputs
-        self.input_mime_types = input_mime_types
-
-    @classmethod
-    def from_dict(cls, data):
-        return Extractor(
-            name=data["name"],
-            description=data["description"],
-            input_params=data["input_params"],
-            input_mime_types=data["input_mime_types"],
-            outputs=data["outputs"],
-        )
-
-    def __repr__(self) -> str:
-        return f"Extractor(name={self.name}, description={self.description}, input_params={self.input_params}, input_mime_types={self.input_mime_types}, outputs={self.outputs})"
-
-    def __str__(self) -> str:
-        return self.__repr__()
indexify-0.0.34/indexify/index.py (removed)

@@ -1,17 +0,0 @@
-import httpx
-
-from .data_containers import TextChunk
-
-
-class Index:
-    def __init__(self, service_url, index):
-        self._service_url = service_url
-        self._index = index
-
-    def search(self, query: str, top_k: int) -> list[TextChunk]:
-        req = {"index": self._index, "query": query, "k": top_k}
-        response = httpx.post(
-            f"{self._service_url}/indexes/{self._index}/search", json=req
-        )
-        response.raise_for_status()
-        return response.json()["results"]
Files without changes: LICENSE.txt, README.md, indexify/error.py, indexify/exceptions.py, indexify/settings.py, indexify/utils.py.