indexify 0.0.37__py3-none-any.whl → 0.0.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/__init__.py +2 -10
- indexify/base_client.py +67 -0
- indexify/client.py +24 -17
- indexify/data_loaders/__init__.py +8 -5
- indexify/data_loaders/local_directory_loader.py +10 -1
- indexify/data_loaders/url_loader.py +51 -0
- indexify/extractor_sdk/__init__.py +14 -0
- indexify/{data.py → extractor_sdk/data.py} +29 -4
- indexify/extractor_sdk/extractor.py +231 -0
- indexify/{extractor_utils.py → extractor_sdk/utils.py} +2 -2
- indexify/graph.py +17 -74
- indexify/local_runner.py +90 -30
- indexify/run_graph.py +122 -0
- indexify/runner.py +22 -0
- {indexify-0.0.37.dist-info → indexify-0.0.39.dist-info}/METADATA +3 -3
- indexify-0.0.39.dist-info/RECORD +23 -0
- indexify/extractor.py +0 -122
- indexify-0.0.37.dist-info/RECORD +0 -18
- {indexify-0.0.37.dist-info → indexify-0.0.39.dist-info}/LICENSE.txt +0 -0
- {indexify-0.0.37.dist-info → indexify-0.0.39.dist-info}/WHEEL +0 -0
indexify/__init__.py
CHANGED
@@ -6,24 +6,16 @@ from .client import (
     generate_hash_from_string,
     generate_unique_hex_id,
 )
-from .
-from .extractor import Extractor, extractor, EmbeddingSchema
+from . import extractor_sdk
 from .settings import DEFAULT_SERVICE_URL
 from . import data_loaders
 
 __all__ = [
-    "ContentMetadata",
-    "Content",
     "data_loaders",
-    "Feature",
-    "Extractor",
-    "extractor",
-    "EmbeddingSchema",
-    "extractor",
     "Document",
+    "extractor_sdk",
     "IndexifyClient",
     "ExtractionGraph",
-    "ExtractionGraphBuilder" "ExtractionPolicy",
    "DEFAULT_SERVICE_URL",
    "generate_hash_from_string",
    "generate_unique_hex_id",
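The net effect is that extractor symbols move from the package root into the extractor_sdk subpackage. A hedged sketch of the new import surface, assuming indexify 0.0.39 is installed:

# Under 0.0.39, extractor symbols come from extractor_sdk rather than
# the top-level package as they did in 0.0.37.
from indexify import IndexifyClient, extractor_sdk
from indexify.extractor_sdk import Content, Extractor, Feature, extractor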
indexify/base_client.py
ADDED
@@ -0,0 +1,67 @@
+from abc import ABC, abstractmethod
+from .extractor_sdk import Graph, Feature
+from typing import Any, List, Optional, Union, Dict
+
+class BaseClient(ABC):
+
+    ### Operational APIs
+    @abstractmethod
+    def register_extraction_graph(self, graph: Graph):
+        pass
+
+    @abstractmethod
+    def graphs(self) -> str:
+        pass
+
+    @abstractmethod
+    def namespaces(self) -> str:
+        pass
+
+    @abstractmethod
+    def create_namespace(self, namespace: str):
+        pass
+
+    ### Ingestion APIs
+    @abstractmethod
+    def invoke_graph_with_object(self, graph: str, object: Any) -> str:
+        """
+        Invokes a graph with an input object.
+        graph: str: The name of the graph to invoke
+        object: Any: The input object to the graph. It should be JSON serializable
+        return: str: The ID of the ingested object
+        """
+        pass
+
+    @abstractmethod
+    def invoke_graph_with_file(self, graph: str, path: str) -> str:
+        """
+        Invokes a graph with an input file. The file's mimetype is appropriately detected.
+        graph: str: The name of the graph to invoke
+        path: str: The path to the file to be ingested
+        return: str: The ID of the ingested object
+        """
+        pass
+
+
+    ### Retrieval APIs
+    @abstractmethod
+    def extracted_objects(self, graph: str, ingested_object_id: str, extractor_name: Optional[str]) -> Union[Dict[str, List[Any]], List[Any]]:
+        """
+        Returns the extracted objects by a graph for an ingested object. If the extractor name is provided, only the objects extracted by that extractor are returned.
+        If the extractor name is not provided, all the extracted objects are returned for the input object.
+        graph: str: The name of the graph
+        ingested_object_id: str: The ID of the ingested object
+        extractor_name: Optional[str]: The name of the extractor whose output is to be returned if provided
+        return: Union[Dict[str, List[Any]], List[Any]]: The extracted objects. If the extractor name is provided, the output is a list of extracted objects by the extractor. If the extractor name is not provided, the output is a dictionary with the extractor name as the key and the extracted objects as the value. If no objects are found, an empty list is returned.
+        """
+        pass
+
+    @abstractmethod
+    def features(self, object_id: str, graph: Optional[str]) -> Union[Dict[str, List[Feature]], List[Feature]]:
+        """
+        Returns the features of an object.
+        object_id: str: The ID of the object
+        return: List[Feature]: The features associated with the object that were extracted. If a graph name is provided, only the features extracted by that graph are returned.
+        """
+        pass
+
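For context on how this contract might be satisfied, here is a hedged sketch of a toy in-memory subclass. It is illustrative only; note that the extractor_sdk __init__ added in this release does not re-export Graph, so the module's own `from .extractor_sdk import Graph` line is suspect, and this sketch assumes that import resolves.

from typing import Any, Dict
from indexify.base_client import BaseClient

class InMemoryClient(BaseClient):
    """Toy client: stores ingested objects in a dict, does no extraction."""

    def __init__(self):
        self._objects: Dict[str, Any] = {}

    def register_extraction_graph(self, graph):
        pass  # a real client would ship the graph to the server

    def graphs(self) -> str:
        return ""

    def namespaces(self) -> str:
        return "default"

    def create_namespace(self, namespace: str):
        pass  # single-namespace toy

    def invoke_graph_with_object(self, graph: str, object: Any) -> str:
        object_id = str(len(self._objects))
        self._objects[object_id] = object
        return object_id

    def invoke_graph_with_file(self, graph: str, path: str) -> str:
        with open(path, "rb") as f:
            return self.invoke_graph_with_object(graph, f.read())

    def extracted_objects(self, graph, ingested_object_id, extractor_name=None):
        return []  # no extraction happens in this sketch

    def features(self, object_id, graph=None):
        return []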
indexify/client.py
CHANGED
@@ -5,11 +5,11 @@ import hashlib
 import json
 from collections import namedtuple
 from .settings import DEFAULT_SERVICE_URL, DEFAULT_SERVICE_URL_HTTPS
-from .extractor import
+from .extractor_sdk.extractor import ExtractorMetadata
 from .extraction_policy import ExtractionGraph
 from .utils import json_set_default
 from .error import Error
-from .data import
+from .extractor_sdk.data import ContentMetadata
 from .data_loaders import DataLoader
 from indexify.exceptions import ApiException
 from dataclasses import dataclass
@@ -326,7 +326,7 @@ class IndexifyClient:
         response = self.get(f"namespaces/{self.namespace}/indexes")
         return response.json()["indexes"]
 
-    def extractors(self) -> List[
+    def extractors(self) -> List[ExtractorMetadata]:
         """
         Get a list of all extractors.
 
@@ -337,7 +337,8 @@ class IndexifyClient:
         extractors_dict = response.json()["extractors"]
         extractors = []
         for ed in extractors_dict:
-
+            print(ed)
+            extractors.append(ExtractorMetadata.model_validate(ed))
         return extractors
 
     def get_extraction_graphs(self) -> List[ExtractionGraph]:
@@ -578,8 +579,8 @@ class IndexifyClient:
     def upload_file(
         self,
         extraction_graph: str,
-        path: str,
-        file_bytes:bytes=None,
+        path: str,
+        file_bytes: bytes = None,
         id=None,
         labels: dict = {},
     ) -> str:
@@ -605,18 +606,20 @@ class IndexifyClient:
             )
         else:
             response = self.post(
-
-
-
-
+                f"namespaces/{self.namespace}/extraction_graphs/{extraction_graph}/extract",
+                files={"file": (path, file_bytes)},
+                data={"labels": json.dumps(labels)},
+                params=params,
             )
             file_content = path
-
+
         response_json = response.json()
         content_id = response_json["content_id"]
         return content_id
-
-    def ingest_from_loader(
+
+    def ingest_from_loader(
+        self, loader: DataLoader, extraction_graph: str
+    ) -> List[str]:
         """
         Loads content using the loader, uploads them to Indexify and returns the content ids.
         loader: DataLoader: The DataLoader object to use for loading content
@@ -625,9 +628,13 @@ class IndexifyClient:
         content_ids = []
         files = loader.load()
         for file_metadata in files:
-            labels={"file_name": file_metadata.path}
-
-
+            labels = {"file_name": file_metadata.path}
+            content_id = self.upload_file(
+                extraction_graph,
+                file_metadata.path,
+                loader.read_all_bytes(file_metadata),
+                labels=labels,
+            )
             content_ids.append(content_id)
         return content_ids
 
@@ -702,7 +709,7 @@ class IndexifyClient:
         extraction_graph: str,
         url: str,
         mime_type: str,
-        labels: Dict[str, str],
+        labels: Dict[str, str] = {},
         id=None,
     ):
         req = {
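ingest_from_loader now routes file bytes through the loader's read_all_bytes rather than through the metadata object. A hedged usage sketch, assuming a reachable Indexify server at the default service URL and an existing extraction graph named "pdf-graph":

from indexify import IndexifyClient
from indexify.data_loaders import LocalDirectoryLoader

client = IndexifyClient()  # defaults to DEFAULT_SERVICE_URL
loader = LocalDirectoryLoader("/tmp/docs", file_extensions=[".pdf"])
content_ids = client.ingest_from_loader(loader, "pdf-graph")
print(content_ids)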
indexify/data_loaders/__init__.py
CHANGED
@@ -6,6 +6,7 @@ import os
 import mimetypes
 import hashlib
 
+
 class FileMetadata(BaseModel):
     path: str
     file_size: int
@@ -38,18 +39,20 @@ class FileMetadata(BaseModel):
             updated_at=updated_at,
         )
 
-    def read_all_bytes(self) -> bytes:
-        with open(self.path, "rb") as f:
-            return f.read()
-
 
 class DataLoader(ABC):
     @abstractmethod
     def load(self) -> List[FileMetadata]:
         pass
 
+    @abstractmethod
+    def read_all_bytes(self, file_metadata: FileMetadata) -> bytes:
+        pass
+
     @abstractmethod
     def state(self) -> dict:
         pass
 
-
+
+from .local_directory_loader import LocalDirectoryLoader
+from .url_loader import UrlLoader
indexify/data_loaders/local_directory_loader.py
CHANGED
@@ -4,7 +4,12 @@ import os
 
 
 class LocalDirectoryLoader(DataLoader):
-    def __init__(
+    def __init__(
+        self,
+        directory: str,
+        file_extensions: Optional[List[str]] = None,
+        state: dict = {},
+    ):
         self.directory = directory
         self.file_extensions = file_extensions
         self.processed_files = set(state.get("processed_files", []))
@@ -23,5 +28,9 @@ class LocalDirectoryLoader(DataLoader):
 
         return file_metadata_list
 
+    def read_all_bytes(self, file: FileMetadata) -> bytes:
+        with open(file.path, "rb") as f:
+            return f.read()
+
     def state(self) -> dict:
         return {"processed_files": list(self.processed_files)}
indexify/data_loaders/url_loader.py
ADDED
@@ -0,0 +1,51 @@
+from . import DataLoader, FileMetadata
+from typing import List
+import httpx
+import hashlib
+import email.utils
+
+
+def convert_date_to_epoch(date_str: str) -> int:
+    """
+    Convert a date string from URL header to Unix epoch time.
+
+    Args:
+        date_str (str): The date string from the URL header.
+
+    Returns:
+        int: The Unix epoch time.
+    """
+    if not date_str:
+        return 0
+    parsed_date = email.utils.parsedate_to_datetime(date_str)
+    return int(parsed_date.timestamp())
+
+
+class UrlLoader(DataLoader):
+    def __init__(self, urls: List[str], state: dict = {}):
+        self.urls = urls
+
+    def load(self) -> List[FileMetadata]:
+        file_metadata_list = []
+        for url in self.urls:
+            response = httpx.head(url, follow_redirects=True)
+            file_metadata_list.append(
+                FileMetadata(
+                    path=url,
+                    file_size=response.headers.get("content-length", 0),
+                    mime_type=response.headers.get("content-type"),
+                    md5_hash="",
+                    created_at=convert_date_to_epoch(response.headers.get("date")),
+                    updated_at=convert_date_to_epoch(
+                        response.headers.get("last-modified")
+                    ),
+                )
+            )
+        return file_metadata_list
+
+    def read_all_bytes(self, file: FileMetadata) -> bytes:
+        response = httpx.get(file.path, follow_redirects=True)
+        return response.content
+
+    def state(self) -> dict:
+        return {}
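A hedged usage sketch of the new UrlLoader; the URLs are placeholders. load() issues HEAD requests to build metadata, and read_all_bytes() fetches the body:

from indexify.data_loaders import UrlLoader

loader = UrlLoader(["https://example.com/a.pdf", "https://example.com/b.pdf"])
for meta in loader.load():
    data = loader.read_all_bytes(meta)
    print(meta.path, meta.mime_type, len(data))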
indexify/extractor_sdk/__init__.py
ADDED
@@ -0,0 +1,14 @@
+from .data import ContentMetadata, Content, Feature
+from .extractor import Extractor, extractor, EmbeddingSchema, ExtractorMetadata
+from .utils import SampleExtractorData
+
+__all__ = [
+    "ContentMetadata",
+    "Content",
+    "Feature",
+    "Extractor",
+    "extractor",
+    "EmbeddingSchema",
+    "ExtractorMetadata",
+    "SampleExtractorData",
+]
indexify/{data.py → extractor_sdk/data.py}
RENAMED
@@ -1,6 +1,26 @@
-from typing import
+from typing import (
+    Any,
+    List,
+    Optional,
+    Literal,
+    Dict,
+    Type,
+    cast,
+    Mapping,
+)
 from pydantic import BaseModel, Json, Field
 import json
+from typing_extensions import Annotated, Doc
+
+
+class BaseData(BaseModel):
+    meta: Mapping[str, Type[BaseModel]] = {}
+
+    def get_features(self) -> List[Type[BaseModel]]:
+        return self.meta
+
+    def get_feature(self, name: str) -> Optional[Type[BaseModel]]:
+        return self.meta.get(name)
 
 
 class Feature(BaseModel):
@@ -14,7 +34,7 @@ class Feature(BaseModel):
         return cls(
             feature_type="embedding",
             name=name,
-            value={values: values, distance: distance},
+            value=json.dumps({"values": values, "distance": distance}),
             comment=None,
         )
 
@@ -26,7 +46,7 @@ class Feature(BaseModel):
 
 
 class Content(BaseModel):
-    id: str
+    id: Optional[str] = (None,)
     content_type: Optional[str]
     data: bytes
     features: List[Feature] = []
@@ -38,7 +58,7 @@ class Content(BaseModel):
         features: List[Feature] = [],
     ):
         return Content(
-            id=
+            id=None,
            content_type="text/plain",
            data=bytes(text, "utf-8"),
            features=features,
@@ -81,3 +101,8 @@ class ContentMetadata(BaseModel):
             mime_type=json["mime_type"],
             extracted_metadata=json["extracted_metadata"],
         )
+
+
+class PDFFile(BaseData):
+    data: bytes
+    mime_type: str
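BaseData is the new envelope that pairs a typed payload with extracted features kept in meta; PDFFile above follows the same pattern. A small illustrative subclass (TextChunk is hypothetical, not part of the package):

from indexify.extractor_sdk.data import BaseData

class TextChunk(BaseData):  # hypothetical payload type
    text: str
    page: int

chunk = TextChunk(text="hello world", page=1)
print(chunk.get_features())            # the meta mapping, empty by default
print(chunk.get_feature("embedding"))  # None until an extractor attaches one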
indexify/extractor_sdk/extractor.py
ADDED
@@ -0,0 +1,231 @@
+from typing import Union, Optional, List, Type, Tuple, Callable, get_type_hints, Dict
+import inspect
+from pydantic import BaseModel, Field
+from abc import ABC, abstractmethod
+from .data import BaseData, Content, Feature
+import json
+import os
+import requests
+
+
+class EmbeddingSchema(BaseModel):
+    dim: int
+    distance: str = "cosine"
+
+class ExtractorMetadata(BaseModel):
+    name: str
+    version: str
+    description: str
+    input_mime_types: List[str]
+    system_dependencies: List[str]
+    python_dependencies: List[str]
+    input_mime_types: List[str]
+    embedding_schemas: Dict[str, EmbeddingSchema]
+    # Make this a dynamic model since its a json schema
+    input_params: Optional[Dict]
+    # for backward compatibility
+    metadata_schemas: Optional[Dict]
+
+
+class Extractor(ABC):
+    name: str = ""
+
+    version: str = "0.0.0"
+
+    system_dependencies: List[str] = []
+
+    python_dependencies: List[str] = []
+
+    description: str = ""
+
+    input_mime_types = ["text/plain"]
+
+    embeddings: Dict[str, EmbeddingSchema] = {}
+
+    @abstractmethod
+    def extract(
+        self, input: Type[BaseModel], params: Type[BaseModel] = None
+    ) -> List[Union[Feature, Type[BaseModel]]]:
+        """
+        Extracts information from the content. Returns a list of features to add
+        to the content.
+        It can also return a list of Content objects, which will be added to storage
+        and any extraction policies defined will be applied to them.
+        """
+        pass
+
+    @classmethod
+    @abstractmethod
+    def sample_input(cls) -> Tuple[Content, Type[BaseModel]]:
+        pass
+
+    def describe(self) -> ExtractorMetadata:
+        embedding_schemas = {}
+        try:
+            embedding_schemas = self.embedding_schemas
+        except NotImplementedError:
+            pass
+
+        json_schema = (
+            self._param_cls.model_json_schema() if self._param_cls is not None else None
+        )
+        return ExtractorMetadata(
+            name=self.name,
+            version=self.version,
+            description=self.description,
+            system_dependencies=self.system_dependencies,
+            python_dependencies=self.python_dependencies,
+            input_mime_types=self.input_mime_types,
+            embedding_schemas=embedding_schemas,
+            input_params=json.dumps(json_schema),
+        )
+
+    def _download_file(self, url, filename):
+        if os.path.exists(filename):
+            # file exists skip
+            return
+        try:
+            with requests.get(url, stream=True) as r:
+                r.raise_for_status()  # Raises an HTTPError if the response status code is 4XX/5XX
+                with open(filename, "wb") as f:
+                    for chunk in r.iter_content(chunk_size=8192):
+                        f.write(chunk)
+        except requests.exceptions.RequestException as e:
+            print(f"Error downloading the file: {e}")
+
+    def sample_mp3(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.mp3"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/sample-000009.mp3",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="audio/mpeg", data=f.read(), features=features)
+
+    def sample_mp4(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.mp4"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/sample.mp4",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="video/mp4", data=f.read(), features=features)
+
+    def sample_jpg(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.jpg"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/people-standing.jpg",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="image/jpg", data=f.read(), features=features)
+
+    def sample_invoice_jpg(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.jpg"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/invoice-example.jpg",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="image/jpg", data=f.read(), features=features)
+
+    def sample_invoice_pdf(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.pdf"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/invoice-example.pdf",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="application/pdf", data=f.read(), features=features)
+
+    def sample_image_based_pdf(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.pdf"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/image-based.pdf",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="application/pdf", data=f.read(), features=features)
+
+    def sample_scientific_pdf(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.pdf"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/scientific-paper-example.pdf",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="application/pdf", data=f.read(), features=features)
+
+    def sample_presentation(self, features: List[Feature] = []) -> Content:
+        file_name = "test.pptx"
+        self._download_file(
+            "https://raw.githubusercontent.com/tensorlakeai/indexify/main/docs/docs/files/test.pptx",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(
+            content_type="application/vnd.openxmlformats-officedocument.presentationml.presentation",
+            data=f.read(),
+            features=features,
+        )
+
+    def sample_text(self, features: List[Feature] = []) -> Content:
+        article = """New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 2010 marriage license application, according to court documents. Prosecutors said the marriages were part of an immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages. Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted. The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces up to four years in prison. Her next court appearance is scheduled for May 18."""
+        return Content(content_type="text/plain", data=article, features=features)
+
+    def sample_html(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.html"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/sample.html",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="text/html", data=f.read(), features=features)
+
+
+def extractor(
+    name: Optional[str] = None,
+    description: Optional[str] = "",
+    version: Optional[str] = "",
+    python_dependencies: Optional[List[str]] = None,
+    system_dependencies: Optional[List[str]] = None,
+    input_mime_types: Optional[List[str]] = None,
+    embedding_schemas: Optional[Dict[str, EmbeddingSchema]] = None,
+    sample_content: Optional[Callable] = None,
+):
+    args = locals()
+    del args["sample_content"]
+
+    def construct(fn):
+        def wrapper():
+            hint = get_type_hints(fn).get("params", dict)
+
+            if not args.get("name"):
+                args[
+                    "name"
+                ] = f"{inspect.getmodule(inspect.stack()[1][0]).__name__}:{fn.__name__}"
+
+            class DecoratedFn(Extractor):
+                @classmethod
+                def extract(cls, input: Type[BaseData], params: Type[BaseModel]=None) -> List[Union[Type[BaseModel], Type[Feature]]]:  # type: ignore
+                    # TODO we can force all the functions to take in a parms object
+                    # or check if someone adds a params
+                    if params is None:
+                        return fn(input)
+                    else:
+                        return fn(input, params)
+
+                def sample_input(self) -> Content:
+                    return sample_content() if sample_content else self.sample_text()
+
+            for key, val in args.items():
+                setattr(DecoratedFn, key, val)
+
+            return DecoratedFn
+
+        wrapper._extractor_name = fn.__name__
+        wrapper.name = fn.__name__
+
+        return wrapper
+
+    return construct
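A hedged sketch of a function-style extractor built with the decorator above; `chunker` and its chunk size are illustrative. Calling the decorated wrapper materializes the Extractor subclass, whose extract classmethod forwards to the wrapped function:

from typing import List
from indexify.extractor_sdk import Content, extractor

@extractor(description="split text into fixed-size chunks")
def chunker(content: Content) -> List[Content]:
    # toy chunking: 16-character windows over the decoded text
    text = content.data.decode("utf-8")
    return [
        Content(id=None, content_type="text/plain",
                data=text[i:i + 16].encode(), features=[])
        for i in range(0, len(text), 16)
    ]

ChunkerExtractor = chunker()  # wrapper() returns the DecoratedFn class
chunks = ChunkerExtractor.extract(
    input=Content(id=None, content_type="text/plain",
                  data=b"some text to split into pieces", features=[])
)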
indexify/{extractor_utils.py → extractor_sdk/utils.py}
RENAMED
@@ -3,8 +3,8 @@ import httpx
 from typing import List
 from .data import Content, Feature
 
-class SampleExtractorData:
 
+class SampleExtractorData:
     def _download_file(self, url, filename):
         if os.path.exists(filename):
             # file exists skip
@@ -105,4 +105,4 @@ class SampleExtractorData:
             file_name,
         )
         f = open(file_name, "rb")
-        return Content(content_type="text/html", data=f.read(), features=features)
+        return Content(content_type="text/html", data=f.read(), features=features)
indexify/graph.py
CHANGED
@@ -1,80 +1,23 @@
-from
-from indexify.extractor import Extractor
+from .extractor_sdk import extractor, Extractor
 
-from
-from
+from typing import Type, Union
+from pydantic import BaseModel
 
-import
+from .run_graph import RunGraph
+from .local_runner import LocalRunner
 
 
-
-
-
+def Graph(
+    name: str,
+    input: Type[BaseModel],
+    start_node: Union[extractor, Extractor],
+    run_local: bool,
+) -> RunGraph:
 
-
-
-
-
+    if run_local:
+        runner = LocalRunner()
+    else:
+        raise NotImplementedError("Remote runner not supported yet")
 
-
-
-
-        self.edges: Dict[str, List[(str, str)]] = defaultdict(list)
-
-        self.results: Dict[str, Any] = defaultdict(list)  # TODO should the Any be Content?
-
-        self.nodes["start"] = _id
-        self.nodes["end"] = _id
-
-        self._topo_counter = defaultdict(int)
-
-        self._start_node = None
-
-    def _node(self, extractor: Extractor, params: Any = None) -> Self:
-        name = extractor._extractor_name
-
-        # if you've already inserted a node just ignore the new insertion.
-        if name in self.nodes:
-            return
-
-        self.nodes[name] = extractor
-        self.params[name] = extractor.__dict__.get('params', None)
-
-        # assign each node a rank of 1 to init the graph
-        self._topo_counter[name] = 1
-
-        return self
-
-    def step(self,
-        from_node: extractor,
-        to_node: extractor,
-        prefilter_predicates: Optional[str] = None
-    ) -> Self:
-
-        self._node(from_node)
-        self._node(to_node)
-
-        from_node_name = from_node._extractor_name
-        to_node_name = to_node._extractor_name
-
-        self.edges[from_node_name].append((to_node_name, prefilter_predicates))
-
-        self._topo_counter[to_node_name] += 1
-
-        return self
-
-    """
-    Connect nodes as a fan out from one `from_node` to multiple `to_nodes` and respective `prefilter_predicates`.
-    Note: The user has to match the sizes of the lists to make sure they line up otherwise a None is used as a default.
-    """
-    def steps(self, from_node: extractor, to_nodes: List[extractor], prefilter_predicates: List[str] = []) -> Self:
-        print(f'{to_nodes}, {prefilter_predicates}, {prefilter_predicates}')
-        for t_n, p in itertools.zip_longest(to_nodes, prefilter_predicates, fillvalue=None):
-            self.step(from_node=from_node, to_node=t_n, prefilter_predicates=p)
-
-        return self
-
-    def _assign_start_node(self):
-        # this method should be called before a graph can be run
-        nodes = sorted(self._topo_counter.items(), key=lambda x: x[1])
-        self._start_node = nodes[0][0]
+    graph = RunGraph(name=name, input=input, start_node=start_node, runner=runner)
+    return graph
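Since graph.py now reduces to a factory, constructing a graph is a one-liner once a start node exists. A hedged construction sketch; `passthrough` and `WfInput` are illustrative:

from typing import List
from pydantic import BaseModel
from indexify.extractor_sdk import Content, extractor
from indexify.graph import Graph

@extractor(description="identity")
def passthrough(content: Content) -> List[Content]:
    return [content]

class WfInput(BaseModel):  # illustrative input model
    text: str

g = Graph(name="demo", input=WfInput, start_node=passthrough, run_local=True)
# run_local=False raises NotImplementedError("Remote runner not supported yet")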
indexify/local_runner.py
CHANGED
@@ -1,65 +1,125 @@
-
+import hashlib
+import os
+import pickle
+import shutil
+from pathlib import Path
+
+from indexify.extractor_sdk.data import BaseData, Feature
+from indexify.extractor_sdk.extractor import extractor, Extractor
 
 from collections import defaultdict
-from typing import Any, Callable, Dict, Optional
+from typing import Any, Callable, Dict, Optional, Union
+
+from indexify.run_graph import RunGraph
+from indexify.runner import Runner
 
-import json
 
-class LocalRunner:
+class LocalRunner(Runner):
     def __init__(self):
-        self.results: Dict[str, Any] = defaultdict(
+        self.results: Dict[str, Any] = defaultdict(
+            list
+        )  # TODO should the Any be Content?
+
+    def run(self, g, wf_input: BaseData):
+        return self._run(g, _input=wf_input, node_name=g._start_node)
 
-
-
-
+    # graph is getting some files which are files, some lables and the MIME type of the bytes
+    # those bytes have to be a python type
+
+    # _input needs to be serializable into python object (ie json for ex) and Feature
+    def _run(self, g: RunGraph, _input: BaseData, node_name: str):
+        print(f"---- Starting node {node_name}")
+        print(f'node_name {node_name}')
 
-    def _run(self, g, content: Content, node_name: str):
         extractor_construct: Callable = g.nodes[node_name]
         params = g.params.get(node_name, None)
 
-
+        # NOTE: User should clear cache for nodes they would like to re-rerun
+        input_hash = hashlib.sha256(str(_input).encode()).hexdigest()
+        memo_output = self.get_from_memo(node_name, input_hash)
+        if memo_output is None:
+            print("=== FYI Writing output to cache")
+            res = extractor_construct().extract(input=_input, params=params)
+            self.put_into_memo(node_name, input_hash, pickle.dumps(res))
+        else:
+            print("=== Reading output from cache")
+            res = pickle.loads(memo_output)
+
+        if not isinstance(res, list):
+            res = [res]
 
-
+        res_data = [i for i in res if not isinstance(i, Feature)]
+        res_features = [i for i in res if isinstance(i, Feature)]
 
-        self.results[node_name].extend(
+        self.results[node_name].extend(res_data)
+
+        for f in res_features:
+            _input.meta[f.name] = f.value
+
+        # this assume that if an extractor emits features then the next edge will always process
+        # the edges
+        data_to_process = res_data
+        if len(res_features) > 0:
+            data_to_process.append(_input)
 
         for out_edge, pre_filter_predicate in g.edges[node_name]:
             # TODO there are no reductions yet, each recursion finishes it's path and returns
-            for r in
+            for r in data_to_process:
                 if self._prefilter_content(content=r, prefilter_predicate=pre_filter_predicate):
                     continue
 
-                self._run(g,
+                self._run(g, _input=r, node_name=out_edge)
 
     """
     Returns True if content should be filtered
     """
-    def _prefilter_content(self, content:
+    def _prefilter_content(self, content: BaseData, prefilter_predicate: Optional[str]) -> bool:
        if prefilter_predicate is None:
            return False
 
-        atoms = prefilter_predicate.split(
+        atoms = prefilter_predicate.split("and")
        if len(atoms) == 0:
            return False
 
        # TODO For now only support `and` and `=` and `string values`
        bools = []
-
-
-
+        metadata = content.get_features()['metadata']
+        for atom in atoms:
+            l, r = atom.split('=')
+            if l in metadata:
+                bools.append(metadata[l] != r)
 
-
+        return all(bools)
 
-
-
-
-        print(f'predicates[l], r: {predicates[l], r}')
-        bools.append(predicates[l] != r)
+    def get_result(self, node: Union[extractor, Extractor]) -> Any:
+        node_name = node.name
+        return self.results[node_name]
 
-
+    def deleted_from_memo(self, node_name):
+        path_prefix = f"./indexify_local_runner_cache/{node_name}"
 
-
+        if os.path.exists(path_prefix) and os.path.isdir(path_prefix):
+            shutil.rmtree(path_prefix)
 
-    def
-
-
+    def get_from_memo(self, node_name, input_hash):
+        path_prefix = f"./indexify_local_runner_cache/{node_name}"
+        file_name = f"{input_hash}"
+        file_path = f"{path_prefix}/{file_name}"
+
+        if not os.path.exists(file_path):
+            return None
+
+        with open(file_path, 'rb') as f:
+            return f.read()
+
+    def put_into_memo(self, node_name, input_hash, output):
+        path_prefix = f"./indexify_local_runner_cache/{node_name}"
+        file_name = f"{input_hash}"
+        file_path = f"{path_prefix}/{file_name}"
+
+        os.makedirs(path_prefix, exist_ok=True)
+
+        Path(file_path).touch()
+
+        with open(file_path, 'wb') as f:
+            return f.write(output)
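The memo cache is plain files: pickled node output stored at ./indexify_local_runner_cache/<node_name>/<sha256 of the stringified input>. A hedged sketch of inspecting and clearing it by hand ("my_node" is a placeholder):

import os
import shutil

cache_root = "./indexify_local_runner_cache"
if os.path.isdir(cache_root):
    for node_name in os.listdir(cache_root):
        count = len(os.listdir(os.path.join(cache_root, node_name)))
        print(node_name, count, "cached result(s)")

# roughly what LocalRunner.deleted_from_memo("my_node") does:
shutil.rmtree(os.path.join(cache_root, "my_node"), ignore_errors=True)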
indexify/run_graph.py
ADDED
@@ -0,0 +1,122 @@
+import json
+
+from .extractor_sdk import Content, extractor, Extractor
+
+from collections import defaultdict
+from typing import Any, Dict, List, Optional, Type, Union
+from pydantic import BaseModel
+
+import itertools
+
+from .runner import Runner
+
+@extractor(description="id function")
+def _id(content: Content) -> List[Content]:
+    return [content]
+
+
+class RunGraph:
+    def __init__(self, name: str, input: Type[BaseModel], start_node: extractor, runner: Runner):
+        # TODO check for cycles
+        self.name = name
+
+        self.nodes: Dict[str, Union[extractor, Extractor]] = {}
+        self.params: Dict[str, Any] = {}
+
+        self.edges: Dict[str, List[(str, str)]] = defaultdict(list)
+
+        self.nodes["start"] = _id
+        self.nodes["end"] = _id
+
+        self._topo_counter = defaultdict(int)
+
+        self._start_node = None
+        self._input = input
+
+        self.runner = runner
+
+    def _node(self, extractor: Union[extractor, Extractor], params: Any = None) -> 'RunGraph':
+        name = extractor.name
+
+        # if you've already inserted a node just ignore the new insertion.
+        if name in self.nodes:
+            return
+
+        self.nodes[name] = extractor
+        self.params[name] = extractor.__dict__.get("params", None)
+
+        # assign each node a rank of 1 to init the graph
+        self._topo_counter[name] = 1
+
+        return self
+
+    def add_edge(
+        self,
+        from_node: extractor,
+        to_node: extractor,
+        prefilter_predicates: Optional[str] = None,
+    ) -> 'RunGraph':
+
+        self._node(from_node)
+        self._node(to_node)
+
+        from_node_name = from_node.name
+        to_node_name = to_node.name
+
+        self.edges[from_node_name].append((to_node_name, prefilter_predicates))
+
+        self._topo_counter[to_node_name] += 1
+
+        return self
+
+    """
+    Connect nodes as a fan out from one `from_node` to multiple `to_nodes` and respective `prefilter_predicates`.
+    Note: The user has to match the sizes of the lists to make sure they line up otherwise a None is used as a default.
+    """
+
+    def steps(
+        self,
+        from_node: extractor,
+        to_nodes: List[extractor],
+        prefilter_predicates: List[str] = [],
+    ) -> 'RunGraph':
+        print(f"{to_nodes}, {prefilter_predicates}, {prefilter_predicates}")
+        for t_n, p in itertools.zip_longest(
+            to_nodes, prefilter_predicates, fillvalue=None
+        ):
+            self.step(from_node=from_node, to_node=t_n, prefilter_predicates=p)
+
+        return self
+
+    def add_param(self, node: extractor, params: Dict[str, Any]):
+        try:
+            # check if the params can be serialized since the server needs this
+            json.dumps(params)
+        except Exception:
+            raise Exception(f"For node {node.name}, cannot serialize params as json.")
+
+        self.params[node.name] = params
+
+    def run(self, wf_input, local):
+        self._assign_start_node()
+        # self.runner = LocalRunner()
+        self.runner.run(self, wf_input=wf_input)
+        pass
+
+    def clear_cache_for_node(self, node: Union[extractor, Extractor]):
+        if node.name not in self.nodes.keys():
+            raise Exception(f"Node with name {node.name} not found in graph")
+
+        self.runner.deleted_from_memo(node.name)
+
+    def clear_cache_for_all_nodes(self):
+        for node_name in self.nodes:
+            self.runner.deleted_from_memo(node_name=node_name)
+
+    def get_result(self, node: Union[extractor, Extractor]) -> Any:
+        return self.runner.results[node.name]
+
+    def _assign_start_node(self):
+        # this method should be called before a graph can be run
+        nodes = sorted(self._topo_counter.items(), key=lambda x: x[1])
+        self._start_node = nodes[0][0]
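Putting run_graph, the runner, and the extractor decorator together, an end-to-end local run looks roughly like the sketch below; node names and input are illustrative. add_edge registers both nodes, which lets the topology counter pick the start node. (Note that steps still calls self.step, which was renamed to add_edge in this release, so fan-out wiring likely fails; the sketch sticks to add_edge.)

from typing import List
from pydantic import BaseModel
from indexify.extractor_sdk import Content, extractor
from indexify.graph import Graph

@extractor(description="uppercase")
def upper(content: Content) -> List[Content]:
    return [Content(id=None, content_type="text/plain",
                    data=content.data.upper(), features=[])]

@extractor(description="byte length")
def length(content: Content) -> List[Content]:
    return [Content(id=None, content_type="text/plain",
                    data=str(len(content.data)).encode(), features=[])]

class WfInput(BaseModel):  # illustrative; the graph's declared input model
    text: str

g = Graph(name="demo", input=WfInput, start_node=upper, run_local=True)
g.add_edge(from_node=upper, to_node=length)
g.run(wf_input=Content(id=None, content_type="text/plain",
                       data=b"hello", features=[]), local=True)
print(g.get_result(length))  # results collected by the LocalRunner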
indexify/runner.py
ADDED
@@ -0,0 +1,22 @@
+from abc import ABC
+
+from indexify.extractor_sdk.data import BaseData
+from indexify.extractor_sdk.extractor import extractor, Extractor
+
+from typing import Any, Union
+
+class Runner(ABC):
+    def run(self, g, wf_input: BaseData):
+        raise NotImplementedError()
+
+    def get_result(self, node: Union[extractor, Extractor]) -> Any:
+        raise NotImplementedError()
+
+    def deleted_from_memo(self, node_name):
+        raise NotImplementedError()
+
+    def get_from_memo(self, node_name, input_hash):
+        raise NotImplementedError()
+
+    def put_into_memo(self, node_name, input_hash, output):
+        raise NotImplementedError()
{indexify-0.0.37.dist-info → indexify-0.0.39.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: indexify
-Version: 0.0.37
+Version: 0.0.39
 Summary: Python Client for Indexify
 Home-page: https://github.com/tensorlakeai/indexify
 License: Apache 2.0
@@ -13,9 +13,9 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
-Requires-Dist: httpx[http2] (>=0
+Requires-Dist: httpx[http2] (>=0,<1)
 Requires-Dist: pydantic (>=2.8,<3.0)
-Requires-Dist: pyyaml (>=6
+Requires-Dist: pyyaml (>=6,<7)
 Project-URL: Repository, https://github.com/tensorlakeai/indexify
 Description-Content-Type: text/markdown
indexify-0.0.39.dist-info/RECORD
ADDED
@@ -0,0 +1,23 @@
+indexify/__init__.py,sha256=ZDpPkRz4hBo6eqArhVBxqIscLSiD20q5rOHPYyOTloE,503
+indexify/base_client.py,sha256=Db-BNYQ6yNmOIXPaQN8W5qjTYvfFvPzoxC9206YRc-U,2755
+indexify/client.py,sha256=FPCO2DN6RstKLasmNrPxRhzBXDgM14tbc3eDDxl8J_A,25998
+indexify/data_loaders/__init__.py,sha256=TmOJLgKC5gM7_1n7zxYiuza3fOilIiYYupxBGd31PfA,1339
+indexify/data_loaders/local_directory_loader.py,sha256=0X_FgLS5unisJSij8LICv1htp8IdW09LbTIJ2wvVJg4,1246
+indexify/data_loaders/url_loader.py,sha256=shjw6dYBlaxA_PzP6qCB9TTtbPiY4h6FV7uopDbRQCc,1546
+indexify/error.py,sha256=3umTeYb0ugtUyehV1ibfvaeACxAONPyWPc-1HRN4d1M,856
+indexify/exceptions.py,sha256=vjd5SPPNFIEW35GorSIodsqvm9RKHQm9kdp8t9gv-WM,111
+indexify/extraction_policy.py,sha256=awNDqwCz0tr4jTQmGf7s8_s6vcEuxMb0xynEl7b7iPI,2076
+indexify/extractor_sdk/__init__.py,sha256=T512UtvFPUXEXlnT9HHHLHPcEau1Acoac_ksByuo7jA,348
+indexify/extractor_sdk/data.py,sha256=632fY4S_F_aYPLtOl_7dZnSAyMvVZY8ujSSIWJ9k104,2781
+indexify/extractor_sdk/extractor.py,sha256=CtlRn8JC8vGn9fm4QameA47x9T1l_cRpkJMUYYpetco,10457
+indexify/extractor_sdk/utils.py,sha256=_j8WflgOM0Qkf2NjhK2p1xXuwq4drLxO0mgKVPEHhlw,6594
+indexify/graph.py,sha256=fVZeGIcSqO3p8dGIQOdbuFYQ-8QaTQ7Jr37OefA2Phk,549
+indexify/local_runner.py,sha256=Ri-Wpw2qgnQ4I3fRR9qdXXRDASuZnu4-VR2xECG9gnY,4346
+indexify/run_graph.py,sha256=gw3IEf8-myVaHUV7g6LPt8-uSMIVr7S0Zs62aT7UB90,3757
+indexify/runner.py,sha256=M_3_GWYyPpb4lR5KFTpW8OAgp-fm9kYd_5xEqmiCBU4,637
+indexify/settings.py,sha256=LSaWZ0ADIVmUv6o6dHWRC3-Ry5uLbCw2sBSg1e_U7UM,99
+indexify/utils.py,sha256=rDN2lrsAs9noJEIjfx6ukmC2SAIyrlUt7QU-kaBjujM,125
+indexify-0.0.39.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+indexify-0.0.39.dist-info/METADATA,sha256=EvEM7lkuDP1YJsh0wskXIBMQxivHYPKfPNERLV0eaa0,1877
+indexify-0.0.39.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+indexify-0.0.39.dist-info/RECORD,,
indexify/extractor.py
DELETED
@@ -1,122 +0,0 @@
-from typing import Union, Optional, List, Type, Tuple, Callable, get_type_hints, Dict
-import inspect
-from pydantic import BaseModel
-from abc import ABC, abstractmethod
-from .data import Content, Feature
-import json
-
-class EmbeddingSchema(BaseModel):
-    dimension: int
-
-class Extractor(ABC):
-    name: str = ""
-
-    version: str = "0.0.0"
-
-    system_dependencies: List[str] = []
-
-    python_dependencies: List[str] = []
-
-    description: str = ""
-
-    input_mime_types = ["text/plain"]
-
-    def extract(
-        self, input: Type[BaseModel], params: Type[BaseModel] = None
-    ) -> List[Union[Feature, Type[BaseModel]]]:
-        """
-        Extracts information from the content. Returns a list of features to add
-        to the content.
-        It can also return a list of Content objects, which will be added to storage
-        and any extraction policies defined will be applied to them.
-        """
-        pass
-
-    def extract_batch(
-        self, input_list: List[Type[BaseModel]], params: List[Type[BaseModel]] = None
-    ) -> List[List[Union[Feature, Type[BaseModel]]]]:
-        """
-        Extracts information from the content. Returns a list of features to add
-        to the content.
-        It can also return a list of Content objects, which will be added to storage
-        and any extraction policies defined will be applied to them.
-        """
-        pass
-
-    @classmethod
-    @abstractmethod
-    def sample_input(cls) -> Tuple[Content, Type[BaseModel]]:
-        pass
-
-    @classmethod
-    @abstractmethod
-    def embedding_schemas(cls) -> Dict[str, EmbeddingSchema]:
-        raise NotImplementedError
-
-    def describe(self) -> Dict:
-        embedding_schemas = {}
-        try:
-            embedding_schemas = self.embedding_schemas()
-        except NotImplementedError:
-            pass
-
-        json_schema = (
-            self._param_cls.model_json_schema() if self._param_cls is not None else None
-        )
-
-        return {
-            "name": self.name,
-            "version": self.version,
-            "description": self.description,
-            "system_dependencies": self.system_dependencies,
-            "python_dependencies": self.python_dependencies,
-            "input_mime_types": self.input_mime_types,
-            "embedding_schemas": embedding_schemas,
-            "input_params": json.dumps(json_schema),
-        }
-
-def extractor(
-    name: Optional[str] = None,
-    description: Optional[str] = "",
-    version: Optional[str] = "",
-    python_dependencies: Optional[List[str]] = None,
-    system_dependencies: Optional[List[str]] = None,
-    input_mime_types: Optional[List[str]] = None,
-    embedding_schemas: Optional[Dict[str, EmbeddingSchema]] = None,
-    sample_content: Optional[Callable] = None,
-):
-    args = locals()
-    del args["sample_content"]
-
-    def construct(fn):
-        def wrapper():
-            hint = get_type_hints(fn).get("params", dict)
-
-            if not args.get("name"):
-                args["name"] = (
-                    f"{inspect.getmodule(inspect.stack()[1][0]).__name__}:{fn.__name__}"
-                )
-
-            class DecoratedFn(Extractor):
-                @classmethod
-                def extract(cls, input: Type[BaseModel], params: Type[BaseModel]=None) -> List[Content]:  # type: ignore
-                    # TODO we can force all the functions to take in a parms object
-                    # or check if someone adds a params
-                    if params is None:
-                        return fn(input)
-                    else:
-                        return fn(input, params)
-
-                def sample_input(self) -> Content:
-                    return sample_content() if sample_content else self.sample_text()
-
-            for key, val in args.items():
-                setattr(DecoratedFn, key, val)
-
-            return DecoratedFn
-
-        wrapper._extractor_name = fn.__name__
-
-        return wrapper
-
-    return construct
indexify-0.0.37.dist-info/RECORD
DELETED
@@ -1,18 +0,0 @@
-indexify/__init__.py,sha256=W58FqmnKHIx-gHKTBDQa1QI49Gi8f1rw90yDg31jwgQ,743
-indexify/client.py,sha256=faGiWAtdXkL4Vmx6xr0iHJLIBwhS2XZbQ6ld_7sMsBc,25874
-indexify/data.py,sha256=91We7J2QAKBOTu1yF3ApTl4yl4C-nDL2WSXhBdekLWg,2334
-indexify/data_loaders/__init__.py,sha256=EiYemxCP4zRfDWnDKiX6-SFwXVmv1TSdcXHBQRbE_Uw,1309
-indexify/data_loaders/local_directory_loader.py,sha256=kF7VwkuOJFBrhKrR7IOOdZ4TDAItw_CyUOfcuej1CKI,1080
-indexify/error.py,sha256=3umTeYb0ugtUyehV1ibfvaeACxAONPyWPc-1HRN4d1M,856
-indexify/exceptions.py,sha256=vjd5SPPNFIEW35GorSIodsqvm9RKHQm9kdp8t9gv-WM,111
-indexify/extraction_policy.py,sha256=awNDqwCz0tr4jTQmGf7s8_s6vcEuxMb0xynEl7b7iPI,2076
-indexify/extractor.py,sha256=HnLot4DQv7aVI3FwFNH83LzKjq7DlSR1-wmpcVC89tE,3930
-indexify/extractor_utils.py,sha256=68V5vZB9GYx648dyyVKAia0M4pG_R31QPqUQz3ZZ1FQ,6593
-indexify/graph.py,sha256=hUGTpaI3ale54sQ90u5P3-RJCwsSlEJg1V1R0rmCZE0,2576
-indexify/local_runner.py,sha256=VV4Ff_ctibw0ZL4u1wVA7drRx4zLTgNmT_qLX3Cq2SY,2167
-indexify/settings.py,sha256=LSaWZ0ADIVmUv6o6dHWRC3-Ry5uLbCw2sBSg1e_U7UM,99
-indexify/utils.py,sha256=rDN2lrsAs9noJEIjfx6ukmC2SAIyrlUt7QU-kaBjujM,125
-indexify-0.0.37.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-indexify-0.0.37.dist-info/METADATA,sha256=_3uThIPuUiPQ9BBVoqoEEo5Prqp_LHx59jHrZ2CpSgk,1891
-indexify-0.0.37.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-indexify-0.0.37.dist-info/RECORD,,
{indexify-0.0.37.dist-info → indexify-0.0.39.dist-info}/LICENSE.txt
File without changes
{indexify-0.0.37.dist-info → indexify-0.0.39.dist-info}/WHEEL
File without changes