PyPI - indexify - Versions diffs - 0.0.39__py3-none-any.whl → 0.0.40__py3-none-any.whl - Mend

indexify 0.0.39__py3-none-any.whl → 0.0.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

indexify/__init__.py +2 -0
indexify/data_loaders/url_loader.py +0 -1
indexify/extractor_sdk/data.py +1 -1
indexify/extractor_sdk/extractor.py +7 -27
indexify/extractors/__init__.py +0 -0
indexify/extractors/embedding.py +53 -0
indexify/graph.py +122 -16
indexify/local_runner.py +2 -2
{indexify-0.0.39.dist-info → indexify-0.0.40.dist-info}/METADATA +2 -1
{indexify-0.0.39.dist-info → indexify-0.0.40.dist-info}/RECORD +12 -11
indexify/run_graph.py +0 -122
{indexify-0.0.39.dist-info → indexify-0.0.40.dist-info}/LICENSE.txt +0 -0
{indexify-0.0.39.dist-info → indexify-0.0.40.dist-info}/WHEEL +0 -0

indexify/__init__.py CHANGED Viewed

@@ -9,9 +9,11 @@ from .client import (
 from . import extractor_sdk
 from .settings import DEFAULT_SERVICE_URL
 from . import data_loaders
+from .graph import Graph
 __all__ = [
     "data_loaders",
+    "Graph",
     "Document",
     "extractor_sdk",
     "IndexifyClient",

indexify/data_loaders/url_loader.py CHANGED Viewed

@@ -1,7 +1,6 @@
 from . import DataLoader, FileMetadata
 from typing import List
 import httpx
-import hashlib
 import email.utils

indexify/extractor_sdk/data.py CHANGED Viewed

@@ -103,6 +103,6 @@ class ContentMetadata(BaseModel):
         )
-class PDFFile(BaseData):
+class File(BaseData):
     data: bytes
     mime_type: str

indexify/extractor_sdk/extractor.py CHANGED Viewed

@@ -10,7 +10,8 @@ import requests
 class EmbeddingSchema(BaseModel):
     dim: int
-    distance: str = "cosine"
+    distance: Optional[str] = "cosine"
+    database_url: Optional[str] = None
 class ExtractorMetadata(BaseModel):
     name: str
@@ -40,8 +41,8 @@ class Extractor(ABC):
     input_mime_types = ["text/plain"]
-    embeddings: Dict[str, EmbeddingSchema] = {}
+    embedding_indexes: Dict[str, EmbeddingSchema] = {}
     @abstractmethod
     def extract(
         self, input: Type[BaseModel], params: Type[BaseModel] = None
@@ -55,31 +56,9 @@ class Extractor(ABC):
         pass
     @classmethod
-    @abstractmethod
     def sample_input(cls) -> Tuple[Content, Type[BaseModel]]:
         pass
-    def describe(self) -> ExtractorMetadata:
-        embedding_schemas = {}
-        try:
-            embedding_schemas = self.embedding_schemas
-        except NotImplementedError:
-            pass
-        json_schema = (
-            self._param_cls.model_json_schema() if self._param_cls is not None else None
-        )
-        return ExtractorMetadata(
-            name=self.name,
-            version=self.version,
-            description=self.description,
-            system_dependencies=self.system_dependencies,
-            python_dependencies=self.python_dependencies,
-            input_mime_types=self.input_mime_types,
-            embedding_schemas=embedding_schemas,
-            input_params=json.dumps(json_schema),
-        )
     def _download_file(self, url, filename):
         if os.path.exists(filename):
             # file exists skip
@@ -190,7 +169,7 @@ def extractor(
     python_dependencies: Optional[List[str]] = None,
     system_dependencies: Optional[List[str]] = None,
     input_mime_types: Optional[List[str]] = None,
-    embedding_schemas: Optional[Dict[str, EmbeddingSchema]] = None,
+    embedding_indexes: Optional[Dict[str, EmbeddingSchema]] = None,
     sample_content: Optional[Callable] = None,
 ):
     args = locals()
@@ -198,7 +177,7 @@ def extractor(
     def construct(fn):
         def wrapper():
-            hint = get_type_hints(fn).get("params", dict)
+            description = fn.__doc__ or args.get("description", "")
             if not args.get("name"):
                 args[
@@ -220,6 +199,7 @@ def extractor(
             for key, val in args.items():
                 setattr(DecoratedFn, key, val)
+            DecoratedFn.description = description
             return DecoratedFn

indexify/extractors/__init__.py ADDED Viewed

File without changes

indexify/extractors/embedding.py ADDED Viewed

@@ -0,0 +1,53 @@
+from typing import List
+from indexify.extractor_sdk.data import Feature
+import torch
+import torch.nn.functional as F
+from transformers import AutoModel, AutoTokenizer
+from indexify.extractor_sdk.extractor import Extractor , Feature
+class SentenceTransformersEmbedding:
+    def __init__(self, model_name) -> None:
+        self._model_name = model_name
+        self._tokenizer = AutoTokenizer.from_pretrained(
+            f"sentence-transformers/{model_name}"
+        )
+        self._model = AutoModel.from_pretrained(
+            f"sentence-transformers/{model_name}", torchscript=True
+        )
+        self._model.eval()
+    def embed_batch(self, inputs: List[str]) -> List[List[float]]:
+        result = self._embed(inputs)
+        return result.tolist()
+    def embed(self, query: str) -> List[float]:
+        result = self._embed([query])
+        return result[0].tolist()
+    def _embed(self, inputs: List[str]) -> torch.Tensor:
+        encoded_input = self._tokenizer(
+            inputs, padding=True, truncation=True, return_tensors="pt"
+        )
+        sentence_embeddings = self._model(**encoded_input)
+        return F.normalize(sentence_embeddings, p=2, dim=1)
+class BasicSentenceTransformerModels(Extractor):
+    def __init__(self, model: str):
+        super().__init__()
+        self.model = SentenceTransformersEmbedding(model)
+    def extract(self, input: str) -> List[Feature]:
+        embeddings = self.model.embed(input)
+        return [Feature.embedding(values=embeddings)]
+class BasicHFTransformerEmbeddingModels(Extractor):
+        def __init__(self, model: str):
+            super().__init__()
+            self._model = AutoModel.from_pretrained(model, trust_remote_code=True)
+        def extract(self, input: str) -> List[Feature]:
+            embeddings = self.model.embed_query(input)
+            return [Feature.embedding(values=embeddings)]

indexify/graph.py CHANGED Viewed

@@ -1,23 +1,129 @@
-from .extractor_sdk import extractor, Extractor
+import json
+import itertools
+from collections import defaultdict
+from typing import Any, Dict, List, Optional, Type, Union
-from typing import Type, Union
+import cloudpickle
 from pydantic import BaseModel
-from .run_graph import RunGraph
-from .local_runner import LocalRunner
+from .extractor_sdk import Content, extractor, Extractor
+from .runner import Runner
+@extractor(description="id function")
+def _id(content: Content) -> List[Content]:
+    return [content]
-def Graph(
-    name: str,
-    input: Type[BaseModel],
-    start_node: Union[extractor, Extractor],
-    run_local: bool,
-) -> RunGraph:
-    if run_local:
-        runner = LocalRunner()
-    else:
-        raise NotImplementedError("Remote runner not supported yet")
+def load_graph(graph: bytes) -> 'Graph':
+    return cloudpickle.loads(graph)
-    graph = RunGraph(name=name, input=input, start_node=start_node, runner=runner)
-    return graph
+class Graph:
+    def __init__(self, name: str, input: Type[BaseModel], start_node: extractor, runner: Runner):
+        # TODO check for cycles
+        self.name = name
+        self.nodes: Dict[str, Union[extractor, Extractor]] = {}
+        self.params: Dict[str, Any] = {}
+        self.edges: Dict[str, List[(str, str)]] = defaultdict(list)
+        self.nodes["start"] = _id
+        self.nodes["end"] = _id
+        self._topo_counter = defaultdict(int)
+        self._start_node = None
+        self._input = input
+        self.runner = runner
+    def get_extractor(self, name: str) -> Extractor:
+        return self.nodes[name]
+    def _node(self, extractor: Extractor, params: Any = None) -> 'Graph':
+        name = extractor.name
+        # if you've already inserted a node just ignore the new insertion.
+        if name in self.nodes:
+            return
+        self.nodes[name] = extractor
+        self.params[name] = extractor.__dict__.get("params", None)
+        # assign each node a rank of 1 to init the graph
+        self._topo_counter[name] = 1
+        return self
+    def serialize(self):
+        return cloudpickle.dumps(self)
+    def add_edge(
+        self,
+        from_node: Type[Extractor],
+        to_node: Type[Extractor],
+        prefilter_predicates: Optional[str] = None,
+    ) -> 'Graph':
+        self._node(from_node)
+        self._node(to_node)
+        from_node_name = from_node.name
+        to_node_name = to_node.name
+        self.edges[from_node_name].append((to_node_name, prefilter_predicates))
+        self._topo_counter[to_node_name] += 1
+        return self
+    """
+    Connect nodes as a fan out from one `from_node` to multiple `to_nodes` and respective `prefilter_predicates`.
+    Note: The user has to match the sizes of the lists to make sure they line up otherwise a None is used as a default.
+    """
+    def steps(
+        self,
+        from_node: extractor,
+        to_nodes: List[extractor],
+        prefilter_predicates: List[str] = [],
+    ) -> 'Graph':
+        print(f"{to_nodes}, {prefilter_predicates}, {prefilter_predicates}")
+        for t_n, p in itertools.zip_longest(
+            to_nodes, prefilter_predicates, fillvalue=None
+        ):
+            self.step(from_node=from_node, to_node=t_n, prefilter_predicates=p)
+        return self
+    def add_param(self, node: extractor, params: Dict[str, Any]):
+        try:
+            # check if the params can be serialized since the server needs this
+            json.dumps(params)
+        except Exception:
+            raise Exception(f"For node {node.name}, cannot serialize params as json.")
+        self.params[node.name] = params
+    def run(self, wf_input, local):
+        self._assign_start_node()
+        self.runner.run(self, wf_input=wf_input)
+        pass
+    def clear_cache_for_node(self, node: Union[extractor, Extractor]):
+        if node.name not in self.nodes.keys():
+            raise Exception(f"Node with name {node.name} not found in graph")
+        self.runner.deleted_from_memo(node.name)
+    def clear_cache_for_all_nodes(self):
+        for node_name in self.nodes:
+            self.runner.deleted_from_memo(node_name=node_name)
+    def get_result(self, node: Union[extractor, Extractor]) -> Any:
+        return self.runner.results[node.name]
+    def _assign_start_node(self):
+        # this method should be called before a graph can be run
+        nodes = sorted(self._topo_counter.items(), key=lambda x: x[1])
+        self._start_node = nodes[0][0]

indexify/local_runner.py CHANGED Viewed

@@ -10,7 +10,7 @@ from indexify.extractor_sdk.extractor import extractor, Extractor
 from collections import defaultdict
 from typing import Any, Callable, Dict, Optional, Union
-from indexify.run_graph import RunGraph
+from indexify.graph import Graph
 from indexify.runner import Runner
@@ -27,7 +27,7 @@ class LocalRunner(Runner):
     # those bytes have to be a python type
     # _input needs to be serializable into python object (ie json for ex) and Feature
-    def _run(self, g: RunGraph, _input: BaseData, node_name: str):
+    def _run(self, g: Graph, _input: BaseData, node_name: str):
         print(f"---- Starting node {node_name}")
         print(f'node_name {node_name}')

{indexify-0.0.39.dist-info → indexify-0.0.40.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: indexify
-Version: 0.0.39
+Version: 0.0.40
 Summary: Python Client for Indexify
 Home-page: https://github.com/tensorlakeai/indexify
 License: Apache 2.0
@@ -13,6 +13,7 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Requires-Dist: cloudpickle (>=3,<4)
 Requires-Dist: httpx[http2] (>=0,<1)
 Requires-Dist: pydantic (>=2.8,<3.0)
 Requires-Dist: pyyaml (>=6,<7)

{indexify-0.0.39.dist-info → indexify-0.0.40.dist-info}/RECORD RENAMED Viewed

@@ -1,23 +1,24 @@
-indexify/__init__.py,sha256=ZDpPkRz4hBo6eqArhVBxqIscLSiD20q5rOHPYyOTloE,503
+indexify/__init__.py,sha256=0kUYM2FAVfYp0ZCx_3uQD5HbmDLDdBvEBtwHZrGQKaA,541
 indexify/base_client.py,sha256=Db-BNYQ6yNmOIXPaQN8W5qjTYvfFvPzoxC9206YRc-U,2755
 indexify/client.py,sha256=FPCO2DN6RstKLasmNrPxRhzBXDgM14tbc3eDDxl8J_A,25998
 indexify/data_loaders/__init__.py,sha256=TmOJLgKC5gM7_1n7zxYiuza3fOilIiYYupxBGd31PfA,1339
 indexify/data_loaders/local_directory_loader.py,sha256=0X_FgLS5unisJSij8LICv1htp8IdW09LbTIJ2wvVJg4,1246
-indexify/data_loaders/url_loader.py,sha256=shjw6dYBlaxA_PzP6qCB9TTtbPiY4h6FV7uopDbRQCc,1546
+indexify/data_loaders/url_loader.py,sha256=1q-uxFHsf5g5u49omzXHfP_zrzMwj-eFs7_1ugdr58g,1531
 indexify/error.py,sha256=3umTeYb0ugtUyehV1ibfvaeACxAONPyWPc-1HRN4d1M,856
 indexify/exceptions.py,sha256=vjd5SPPNFIEW35GorSIodsqvm9RKHQm9kdp8t9gv-WM,111
 indexify/extraction_policy.py,sha256=awNDqwCz0tr4jTQmGf7s8_s6vcEuxMb0xynEl7b7iPI,2076
 indexify/extractor_sdk/__init__.py,sha256=T512UtvFPUXEXlnT9HHHLHPcEau1Acoac_ksByuo7jA,348
-indexify/extractor_sdk/data.py,sha256=632fY4S_F_aYPLtOl_7dZnSAyMvVZY8ujSSIWJ9k104,2781
-indexify/extractor_sdk/extractor.py,sha256=CtlRn8JC8vGn9fm4QameA47x9T1l_cRpkJMUYYpetco,10457
+indexify/extractor_sdk/data.py,sha256=DvNdq8w5XT4cyOR_wjWwyr32FdAfJ5297Hy89TqZcBI,2778
+indexify/extractor_sdk/extractor.py,sha256=D7QshIoYzZaeAJKQlYilSzUeLNpp2innE5RVtEoa06s,9820
 indexify/extractor_sdk/utils.py,sha256=_j8WflgOM0Qkf2NjhK2p1xXuwq4drLxO0mgKVPEHhlw,6594
-indexify/graph.py,sha256=fVZeGIcSqO3p8dGIQOdbuFYQ-8QaTQ7Jr37OefA2Phk,549
-indexify/local_runner.py,sha256=Ri-Wpw2qgnQ4I3fRR9qdXXRDASuZnu4-VR2xECG9gnY,4346
-indexify/run_graph.py,sha256=gw3IEf8-myVaHUV7g6LPt8-uSMIVr7S0Zs62aT7UB90,3757
+indexify/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+indexify/extractors/embedding.py,sha256=LlE2Ti0AJULHqar9a7VbvEnMX8VJ1m88-vFE2n_55M0,1898
+indexify/graph.py,sha256=0pIGOBltNIk9HMfPf3iSwk_kTAzKJSbEGZCcr5PJBgg,3951
+indexify/local_runner.py,sha256=04nYTuPzco0yqzrqAqjbrxNDC_AxLuxJmD7-76VLdUQ,4336
 indexify/runner.py,sha256=M_3_GWYyPpb4lR5KFTpW8OAgp-fm9kYd_5xEqmiCBU4,637
 indexify/settings.py,sha256=LSaWZ0ADIVmUv6o6dHWRC3-Ry5uLbCw2sBSg1e_U7UM,99
 indexify/utils.py,sha256=rDN2lrsAs9noJEIjfx6ukmC2SAIyrlUt7QU-kaBjujM,125
-indexify-0.0.39.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-indexify-0.0.39.dist-info/METADATA,sha256=EvEM7lkuDP1YJsh0wskXIBMQxivHYPKfPNERLV0eaa0,1877
-indexify-0.0.39.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-indexify-0.0.39.dist-info/RECORD,,
+indexify-0.0.40.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+indexify-0.0.40.dist-info/METADATA,sha256=YxPEZNNIPhedRKwTmOT555lEVAbokNgO37qbEu_OYXE,1913
+indexify-0.0.40.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+indexify-0.0.40.dist-info/RECORD,,

indexify/run_graph.py DELETED Viewed

@@ -1,122 +0,0 @@
-import json
-from .extractor_sdk import Content, extractor, Extractor
-from collections import defaultdict
-from typing import Any, Dict, List, Optional, Type, Union
-from pydantic import BaseModel
-import itertools
-from .runner import Runner
-@extractor(description="id function")
-def _id(content: Content) -> List[Content]:
-    return [content]
-class RunGraph:
-    def __init__(self, name: str, input: Type[BaseModel], start_node: extractor, runner: Runner):
-        # TODO check for cycles
-        self.name = name
-        self.nodes: Dict[str, Union[extractor, Extractor]] = {}
-        self.params: Dict[str, Any] = {}
-        self.edges: Dict[str, List[(str, str)]] = defaultdict(list)
-        self.nodes["start"] = _id
-        self.nodes["end"] = _id
-        self._topo_counter = defaultdict(int)
-        self._start_node = None
-        self._input = input
-        self.runner = runner
-    def _node(self, extractor: Union[extractor, Extractor], params: Any = None) -> 'RunGraph':
-        name = extractor.name
-        # if you've already inserted a node just ignore the new insertion.
-        if name in self.nodes:
-            return
-        self.nodes[name] = extractor
-        self.params[name] = extractor.__dict__.get("params", None)
-        # assign each node a rank of 1 to init the graph
-        self._topo_counter[name] = 1
-        return self
-    def add_edge(
-        self,
-        from_node: extractor,
-        to_node: extractor,
-        prefilter_predicates: Optional[str] = None,
-    ) -> 'RunGraph':
-        self._node(from_node)
-        self._node(to_node)
-        from_node_name = from_node.name
-        to_node_name = to_node.name
-        self.edges[from_node_name].append((to_node_name, prefilter_predicates))
-        self._topo_counter[to_node_name] += 1
-        return self
-    """
-    Connect nodes as a fan out from one `from_node` to multiple `to_nodes` and respective `prefilter_predicates`.
-    Note: The user has to match the sizes of the lists to make sure they line up otherwise a None is used as a default.
-    """
-    def steps(
-        self,
-        from_node: extractor,
-        to_nodes: List[extractor],
-        prefilter_predicates: List[str] = [],
-    ) -> 'RunGraph':
-        print(f"{to_nodes}, {prefilter_predicates}, {prefilter_predicates}")
-        for t_n, p in itertools.zip_longest(
-            to_nodes, prefilter_predicates, fillvalue=None
-        ):
-            self.step(from_node=from_node, to_node=t_n, prefilter_predicates=p)
-        return self
-    def add_param(self, node: extractor, params: Dict[str, Any]):
-        try:
-            # check if the params can be serialized since the server needs this
-            json.dumps(params)
-        except Exception:
-            raise Exception(f"For node {node.name}, cannot serialize params as json.")
-        self.params[node.name] = params
-    def run(self, wf_input, local):
-        self._assign_start_node()
-        # self.runner = LocalRunner()
-        self.runner.run(self, wf_input=wf_input)
-        pass
-    def clear_cache_for_node(self, node: Union[extractor, Extractor]):
-        if node.name not in self.nodes.keys():
-            raise Exception(f"Node with name {node.name} not found in graph")
-        self.runner.deleted_from_memo(node.name)
-    def clear_cache_for_all_nodes(self):
-        for node_name in self.nodes:
-            self.runner.deleted_from_memo(node_name=node_name)
-    def get_result(self, node: Union[extractor, Extractor]) -> Any:
-        return self.runner.results[node.name]
-    def _assign_start_node(self):
-        # this method should be called before a graph can be run
-        nodes = sorted(self._topo_counter.items(), key=lambda x: x[1])
-        self._start_node = nodes[0][0]

{indexify-0.0.39.dist-info → indexify-0.0.40.dist-info}/LICENSE.txt RENAMED Viewed

File without changes

{indexify-0.0.39.dist-info → indexify-0.0.40.dist-info}/WHEEL RENAMED Viewed

File without changes

indexify 0.0.39__py3-none-any.whl → 0.0.40__py3-none-any.whl

indexify 0.0.39__py3-none-any.whl → 0.0.40__py3-none-any.whl