indexify 0.0.39__py3-none-any.whl → 0.0.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/__init__.py +5 -5
- indexify/base_client.py +11 -7
- indexify/client.py +15 -12
- indexify/data_loaders/__init__.py +5 -5
- indexify/data_loaders/local_directory_loader.py +3 -2
- indexify/data_loaders/url_loader.py +4 -3
- indexify/extraction_policy.py +2 -2
- indexify/extractor_sdk/__init__.py +2 -2
- indexify/extractor_sdk/data.py +4 -12
- indexify/extractor_sdk/extractor.py +23 -31
- indexify/extractor_sdk/utils.py +9 -15
- indexify/extractors/__init__.py +0 -0
- indexify/extractors/embedding.py +55 -0
- indexify/extractors/pdf_parser.py +95 -0
- indexify/graph.py +126 -16
- indexify/local_runner.py +17 -14
- indexify/runner.py +2 -2
- {indexify-0.0.39.dist-info → indexify-0.0.41.dist-info}/METADATA +2 -1
- indexify-0.0.41.dist-info/RECORD +25 -0
- indexify/run_graph.py +0 -122
- indexify-0.0.39.dist-info/RECORD +0 -23
- {indexify-0.0.39.dist-info → indexify-0.0.41.dist-info}/LICENSE.txt +0 -0
- {indexify-0.0.39.dist-info → indexify-0.0.41.dist-info}/WHEEL +0 -0
indexify/__init__.py
CHANGED
@@ -1,17 +1,17 @@
-from .
-from .extraction_policy import ExtractionGraph
+from . import data_loaders, extractor_sdk
 from .client import (
-    IndexifyClient,
     Document,
+    IndexifyClient,
     generate_hash_from_string,
     generate_unique_hex_id,
 )
-from . import
+from .extraction_policy import ExtractionGraph
+from .graph import Graph
 from .settings import DEFAULT_SERVICE_URL
-from . import data_loaders

 __all__ = [
     "data_loaders",
+    "Graph",
     "Document",
     "extractor_sdk",
     "IndexifyClient",
indexify/base_client.py
CHANGED
@@ -1,6 +1,8 @@
 from abc import ABC, abstractmethod
-from
-
+from typing import Any, Dict, List, Optional, Union
+
+from .extractor_sdk import Feature, Graph
+

 class BaseClient(ABC):

@@ -23,7 +25,7 @@ class BaseClient(ABC):

     ### Ingestion APIs
     @abstractmethod
-    def invoke_graph_with_object(self, graph:str, object: Any) -> str:
+    def invoke_graph_with_object(self, graph: str, object: Any) -> str:
         """
         Invokes a graph with an input object.
         graph: str: The name of the graph to invoke
@@ -42,10 +44,11 @@ class BaseClient(ABC):
         """
         pass

-
     ### Retrieval APIs
     @abstractmethod
-    def extracted_objects(
+    def extracted_objects(
+        self, graph: str, ingested_object_id: str, extractor_name: Optional[str]
+    ) -> Union[Dict[str, List[Any]], List[Any]]:
         """
         Returns the extracted objects by a graph for an ingested object. If the extractor name is provided, only the objects extracted by that extractor are returned.
         If the extractor name is not provided, all the extracted objects are returned for the input object.
@@ -57,11 +60,12 @@ class BaseClient(ABC):
         pass

     @abstractmethod
-    def features(
+    def features(
+        self, object_id: str, graph: Optional[str]
+    ) -> Union[Dict[str, List[Feature]], List[Feature]]:
         """
         Returns the features of an object.
         object_id: str: The ID of the object
         return: List[Feature]: The features associated with the object that were extracted. If a graph name is provided, only the features extracted by that graph are returned.
         """
         pass
-
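For orientation, a hypothetical in-memory subclass that satisfies the three methods visible in these hunks. `InMemoryClient` and its storage scheme are purely illustrative, and the real `BaseClient` may declare additional abstract methods that are not shown in this diff.

```python
from typing import Any, Dict, List, Optional, Union

from indexify.base_client import BaseClient
from indexify.extractor_sdk import Feature


class InMemoryClient(BaseClient):
    """Hypothetical client used only to illustrate the abstract surface above."""

    def __init__(self):
        self._objects: Dict[str, Any] = {}

    def invoke_graph_with_object(self, graph: str, object: Any) -> str:
        # Store the ingested object under a synthetic id and return that id.
        object_id = f"{graph}:{len(self._objects)}"
        self._objects[object_id] = object
        return object_id

    def extracted_objects(
        self, graph: str, ingested_object_id: str, extractor_name: Optional[str]
    ) -> Union[Dict[str, List[Any]], List[Any]]:
        # A real client would return extractor outputs; this sketch echoes the input.
        return [self._objects.get(ingested_object_id)]

    def features(
        self, object_id: str, graph: Optional[str]
    ) -> Union[Dict[str, List[Feature]], List[Feature]]:
        # No features are computed in this sketch.
        return []
```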
indexify/client.py
CHANGED
@@ -1,20 +1,23 @@
-import yaml
-import httpx
-import uuid
 import hashlib
 import json
+import logging
+import uuid
 from collections import namedtuple
-from
-from
-
-
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Union
+
+import httpx
+import yaml
+
+from indexify.exceptions import ApiException
+
+from .data_loaders import DataLoader
 from .error import Error
+from .extraction_policy import ExtractionGraph
 from .extractor_sdk.data import ContentMetadata
-from .
-from
-from
-from typing import List, Optional, Union, Dict
-import logging
+from .extractor_sdk.extractor import ExtractorMetadata
+from .settings import DEFAULT_SERVICE_URL, DEFAULT_SERVICE_URL_HTTPS
+from .utils import json_set_default

 Document = namedtuple("Document", ["text", "labels", "id"])

indexify/data_loaders/__init__.py
CHANGED
@@ -1,10 +1,10 @@
-
-
+import hashlib
+import mimetypes
+import os
 from abc import ABC, abstractmethod
 from typing import List
-
-import
-import hashlib
+
+from pydantic import BaseModel


 class FileMetadata(BaseModel):
indexify/extractor_sdk/__init__.py
CHANGED
@@ -1,5 +1,5 @@
-from .data import
-from .extractor import
+from .data import Content, ContentMetadata, Feature
+from .extractor import EmbeddingSchema, Extractor, ExtractorMetadata, extractor
 from .utils import SampleExtractorData

 __all__ = [
indexify/extractor_sdk/data.py
CHANGED
@@ -1,15 +1,7 @@
-from typing import (
-    Any,
-    List,
-    Optional,
-    Literal,
-    Dict,
-    Type,
-    cast,
-    Mapping,
-)
-from pydantic import BaseModel, Json, Field
 import json
+from typing import Any, Dict, List, Literal, Mapping, Optional, Type, cast
+
+from pydantic import BaseModel, Field, Json
 from typing_extensions import Annotated, Doc


@@ -103,6 +95,6 @@ class ContentMetadata(BaseModel):
     )


-class
+class File(BaseData):
     data: bytes
     mime_type: str
indexify/extractor_sdk/extractor.py
CHANGED
@@ -1,16 +1,29 @@
-from typing import Union, Optional, List, Type, Tuple, Callable, get_type_hints, Dict
 import inspect
-from pydantic import BaseModel, Field
-from abc import ABC, abstractmethod
-from .data import BaseData, Content, Feature
 import json
 import os
+from abc import ABC, abstractmethod
+from typing import (
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Tuple,
+    Type,
+    Union,
+    get_type_hints,
+)
+
 import requests
+from pydantic import BaseModel, Field
+
+from .data import BaseData, Content, Feature


 class EmbeddingSchema(BaseModel):
     dim: int
-    distance: str = "cosine"
+    distance: Optional[str] = "cosine"
+    database_url: Optional[str] = None
+

 class ExtractorMetadata(BaseModel):
     name: str
@@ -40,7 +53,7 @@ class Extractor(ABC):

     input_mime_types = ["text/plain"]

-
+    embedding_indexes: Dict[str, EmbeddingSchema] = {}

     @abstractmethod
     def extract(
@@ -55,31 +68,9 @@ class Extractor(ABC):
         pass

     @classmethod
-    @abstractmethod
     def sample_input(cls) -> Tuple[Content, Type[BaseModel]]:
         pass

-    def describe(self) -> ExtractorMetadata:
-        embedding_schemas = {}
-        try:
-            embedding_schemas = self.embedding_schemas
-        except NotImplementedError:
-            pass
-
-        json_schema = (
-            self._param_cls.model_json_schema() if self._param_cls is not None else None
-        )
-        return ExtractorMetadata(
-            name=self.name,
-            version=self.version,
-            description=self.description,
-            system_dependencies=self.system_dependencies,
-            python_dependencies=self.python_dependencies,
-            input_mime_types=self.input_mime_types,
-            embedding_schemas=embedding_schemas,
-            input_params=json.dumps(json_schema),
-        )
-
     def _download_file(self, url, filename):
         if os.path.exists(filename):
             # file exists skip
@@ -190,7 +181,7 @@ def extractor(
     python_dependencies: Optional[List[str]] = None,
     system_dependencies: Optional[List[str]] = None,
     input_mime_types: Optional[List[str]] = None,
-
+    embedding_indexes: Optional[Dict[str, EmbeddingSchema]] = None,
     sample_content: Optional[Callable] = None,
 ):
     args = locals()
@@ -198,7 +189,7 @@ def extractor(

     def construct(fn):
         def wrapper():
-
+            description = fn.__doc__ or args.get("description", "")

             if not args.get("name"):
                 args[
@@ -207,7 +198,7 @@ def extractor(

             class DecoratedFn(Extractor):
                 @classmethod
-                def extract(cls, input: Type[BaseData], params: Type[BaseModel]=None) -> List[Union[Type[BaseModel], Type[Feature]]]: # type: ignore
+                def extract(cls, input: Type[BaseData], params: Type[BaseModel] = None) -> List[Union[Type[BaseModel], Type[Feature]]]: # type: ignore
                     # TODO we can force all the functions to take in a parms object
                     # or check if someone adds a params
                     if params is None:
@@ -220,6 +211,7 @@ def extractor(

             for key, val in args.items():
                 setattr(DecoratedFn, key, val)
+            DecoratedFn.description = description

             return DecoratedFn

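The decorator gains an `embedding_indexes` argument mirroring the new `Extractor.embedding_indexes` class attribute. Below is a minimal sketch of a function-based extractor in the `@extractor(description=...)` / `def fn(content: Content) -> List[Content]` style that appears elsewhere in this diff; the chunking function, its name, and the `"chunks"` index name are illustrative, and how the server consumes `embedding_indexes` is an assumption.

```python
from typing import List

from indexify.extractor_sdk import Content, extractor
from indexify.extractor_sdk.extractor import EmbeddingSchema


@extractor(
    description="splits text into fixed-size chunks",
    # Assumption: embedding_indexes declares vector indexes for this extractor's
    # outputs, using the new EmbeddingSchema fields (dim, distance, database_url).
    embedding_indexes={"chunks": EmbeddingSchema(dim=384, distance="cosine")},
)
def chunk_text(content: Content) -> List[Content]:
    # Hypothetical helper: break the raw bytes into 512-character chunks.
    text = content.data.decode("utf-8")
    return [
        Content(
            content_type="text/plain",
            data=text[i : i + 512].encode("utf-8"),
            features=[],
        )
        for i in range(0, len(text), 512)
    ]
```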
indexify/extractor_sdk/utils.py
CHANGED
@@ -1,20 +1,17 @@
 import os
-import httpx
 from typing import List
+
+import httpx
+
 from .data import Content, Feature


 class SampleExtractorData:
-    def _download_file(self, url
-        if os.path.exists(filename):
-            # file exists skip
-            return
+    def _download_file(self, url):
         try:
-
-
-
-            for chunk in r.iter_content(chunk_size=8192):
-                f.write(chunk)
+            resp = httpx.get(url, follow_redirects=True)
+            resp.raise_for_status()
+            return resp.content
         except httpx.exceptions.RequestException as e:
             print(f"Error downloading the file: {e}")

@@ -55,13 +52,10 @@ class SampleExtractorData:
         return Content(content_type="image/jpg", data=f.read(), features=features)

     def sample_invoice_pdf(self, features: List[Feature] = []) -> Content:
-
-        self._download_file(
+        data = self._download_file(
             "https://extractor-files.diptanu-6d5.workers.dev/invoice-example.pdf",
-            file_name,
         )
-
-        return Content(content_type="application/pdf", data=f.read(), features=features)
+        return Content(content_type="application/pdf", data=data, features=features)

     def sample_image_based_pdf(self, features: List[Feature] = []) -> Content:
         file_name = "sample.pdf"
indexify/extractors/__init__.py
File without changes
indexify/extractors/embedding.py
ADDED
@@ -0,0 +1,55 @@
+from typing import List
+
+import torch
+import torch.nn.functional as F
+from transformers import AutoModel, AutoTokenizer
+
+from indexify.extractor_sdk.data import Feature
+from indexify.extractor_sdk.extractor import Extractor, Feature
+
+
+class SentenceTransformersEmbedding:
+    def __init__(self, model_name) -> None:
+        self._model_name = model_name
+        self._tokenizer = AutoTokenizer.from_pretrained(
+            f"sentence-transformers/{model_name}"
+        )
+        self._model = AutoModel.from_pretrained(
+            f"sentence-transformers/{model_name}", torchscript=True
+        )
+        self._model.eval()
+
+    def embed_batch(self, inputs: List[str]) -> List[List[float]]:
+        result = self._embed(inputs)
+        return result.tolist()
+
+    def embed(self, query: str) -> List[float]:
+        result = self._embed([query])
+        return result[0].tolist()
+
+    def _embed(self, inputs: List[str]) -> torch.Tensor:
+        encoded_input = self._tokenizer(
+            inputs, padding=True, truncation=True, return_tensors="pt"
+        )
+        sentence_embeddings = self._model(**encoded_input)
+        return F.normalize(sentence_embeddings, p=2, dim=1)
+
+
+class BasicSentenceTransformerModels(Extractor):
+    def __init__(self, model: str):
+        super().__init__()
+        self.model = SentenceTransformersEmbedding(model)
+
+    def extract(self, input: str) -> List[Feature]:
+        embeddings = self.model.embed(input)
+        return [Feature.embedding(values=embeddings)]
+
+
+class BasicHFTransformerEmbeddingModels(Extractor):
+    def __init__(self, model: str):
+        super().__init__()
+        self._model = AutoModel.from_pretrained(model, trust_remote_code=True)
+
+    def extract(self, input: str) -> List[Feature]:
+        embeddings = self.model.embed_query(input)
+        return [Feature.embedding(values=embeddings)]
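A hedged usage sketch for the new module: the checkpoint name is only an example, weights are downloaded from the Hugging Face hub on first use, and torch/transformers must be installed. Whether the packaged forward pass returns tensors of exactly this shape is assumed from the code above.

```python
from indexify.extractors.embedding import SentenceTransformersEmbedding

# "all-MiniLM-L6-v2" is an illustrative sentence-transformers checkpoint.
st = SentenceTransformersEmbedding(model_name="all-MiniLM-L6-v2")

query_vector = st.embed("what does indexify 0.0.41 add?")      # List[float]
batch_vectors = st.embed_batch(["first text", "second text"])  # List[List[float]]
print(len(query_vector), len(batch_vectors))
```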
indexify/extractors/pdf_parser.py
ADDED
@@ -0,0 +1,95 @@
+import tempfile
+from enum import Enum
+from typing import List, Optional
+
+import deepdoctection as dd
+from pydantic import BaseModel
+
+
+class PageFragmentType(str, Enum):
+    TEXT = "text"
+    FIGURE = "figure"
+    TABLE = "table"
+
+
+class Image(BaseModel):
+    data: bytes
+    mime_type: str
+
+
+class TableEncoding(str, Enum):
+    CSV = "csv"
+    HTML = "html"
+
+
+class Table(BaseModel):
+    data: str
+    encoding: TableEncoding
+
+
+class PageFragment(BaseModel):
+    fragment_type: PageFragmentType
+    text: Optional[str] = None
+    image: Optional[Image] = None
+    table: Optional[Table] = None
+    reading_order: Optional[int] = None
+
+
+class Page(BaseModel):
+    number: int
+    fragments: List[PageFragment]
+
+
+class PDFParser:
+    def __init__(self, data: bytes, language: Optional[str] = "en"):
+        self._data = data
+
+    def parse(self) -> List[Page]:
+        analyzer = dd.get_dd_analyzer()
+        parsed_pages = []
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as f:
+            f.write(self._data)
+            f.flush()
+            df = analyzer.analyze(path=f.name)
+            df.reset_state()
+            for page in df:
+                parsed_pages.append(page)
+        outputs: List[Page] = []
+        for parsed_page in parsed_pages:
+            page_num = parsed_page.page_number
+            fragments = []
+            for layout in parsed_page.layouts:
+                if layout.category_name in ["text", "title"]:
+                    fragments.append(
+                        PageFragment(
+                            fragment_type=PageFragmentType.TEXT,
+                            text=layout.text,
+                            reading_order=layout.reading_order,
+                        )
+                    )
+            figures = parsed_page.get_annotation(category_names=dd.LayoutType.FIGURE)
+            print(len(figures))
+            for figure in figures:
+                image_bytes = dd.viz_handler.encode(figure.viz())
+                fragments.append(
+                    PageFragment(
+                        fragment_type=PageFragmentType.FIGURE,
+                        image=Image(data=image_bytes, mime_type="image/png"),
+                        reading_order=figure.reading_order,
+                    )
+                )
+
+            tables = parsed_page.get_annotation(category_names=dd.LayoutType.TABLE)
+            print(len(tables))
+            for table in tables:
+                fragments.append(
+                    PageFragment(
+                        fragment_type=PageFragmentType.TABLE,
+                        table=Table(data=table.html, encoding=TableEncoding.HTML),
+                        reading_order=table.reading_order,
+                    )
+                )
+
+            outputs.append(Page(number=page_num, fragments=fragments))
+
+        return outputs
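A hypothetical usage sketch for the new parser: it requires the optional deepdoctection dependency, and `"invoice.pdf"` is a placeholder path, not a file shipped with the package.

```python
from indexify.extractors.pdf_parser import PageFragmentType, PDFParser

with open("invoice.pdf", "rb") as f:
    parser = PDFParser(f.read())

for page in parser.parse():
    for fragment in page.fragments:
        # Text fragments carry the extracted text; figures and tables carry
        # an Image or Table payload instead.
        if fragment.fragment_type == PageFragmentType.TEXT:
            print(page.number, fragment.text)
```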
indexify/graph.py
CHANGED
@@ -1,23 +1,133 @@
-
+import itertools
+import json
+from collections import defaultdict
+from typing import Any, Dict, List, Optional, Type, Union

-
+import cloudpickle
 from pydantic import BaseModel

-from .
-from .
+from .extractor_sdk import Content, Extractor, extractor
+from .runner import Runner


-
-
-
-    start_node: Union[extractor, Extractor],
-    run_local: bool,
-) -> RunGraph:
+@extractor(description="id function")
+def _id(content: Content) -> List[Content]:
+    return [content]

-    if run_local:
-        runner = LocalRunner()
-    else:
-        raise NotImplementedError("Remote runner not supported yet")

-
-    return graph
+def load_graph(graph: bytes) -> "Graph":
+    return cloudpickle.loads(graph)
+
+
+class Graph:
+    def __init__(
+        self, name: str, input: Type[BaseModel], start_node: extractor, runner: Runner
+    ):
+        # TODO check for cycles
+        self.name = name
+
+        self.nodes: Dict[str, Union[extractor, Extractor]] = {}
+        self.params: Dict[str, Any] = {}
+
+        self.edges: Dict[str, List[(str, str)]] = defaultdict(list)
+
+        self.nodes["start"] = _id
+        self.nodes["end"] = _id
+
+        self._topo_counter = defaultdict(int)
+
+        self._start_node = None
+        self._input = input
+
+        self.runner = runner
+
+    def get_extractor(self, name: str) -> Extractor:
+        return self.nodes[name]
+
+    def _node(self, extractor: Extractor, params: Any = None) -> "Graph":
+        name = extractor.name
+
+        # if you've already inserted a node just ignore the new insertion.
+        if name in self.nodes:
+            return
+
+        self.nodes[name] = extractor
+        self.params[name] = extractor.__dict__.get("params", None)
+
+        # assign each node a rank of 1 to init the graph
+        self._topo_counter[name] = 1
+
+        return self
+
+    def serialize(self):
+        return cloudpickle.dumps(self)
+
+    def add_edge(
+        self,
+        from_node: Type[Extractor],
+        to_node: Type[Extractor],
+        prefilter_predicates: Optional[str] = None,
+    ) -> "Graph":
+
+        self._node(from_node)
+        self._node(to_node)
+
+        from_node_name = from_node.name
+        to_node_name = to_node.name
+
+        self.edges[from_node_name].append((to_node_name, prefilter_predicates))
+
+        self._topo_counter[to_node_name] += 1
+
+        return self
+
+    """
+    Connect nodes as a fan out from one `from_node` to multiple `to_nodes` and respective `prefilter_predicates`.
+    Note: The user has to match the sizes of the lists to make sure they line up otherwise a None is used as a default.
+    """
+
+    def steps(
+        self,
+        from_node: extractor,
+        to_nodes: List[extractor],
+        prefilter_predicates: List[str] = [],
+    ) -> "Graph":
+        print(f"{to_nodes}, {prefilter_predicates}, {prefilter_predicates}")
+        for t_n, p in itertools.zip_longest(
+            to_nodes, prefilter_predicates, fillvalue=None
+        ):
+            self.step(from_node=from_node, to_node=t_n, prefilter_predicates=p)
+
+        return self
+
+    def add_param(self, node: extractor, params: Dict[str, Any]):
+        try:
+            # check if the params can be serialized since the server needs this
+            json.dumps(params)
+        except Exception:
+            raise Exception(f"For node {node.name}, cannot serialize params as json.")
+
+        self.params[node.name] = params
+
+    def run(self, wf_input, local):
+        self._assign_start_node()
+        self.runner.run(self, wf_input=wf_input)
+        pass
+
+    def clear_cache_for_node(self, node: Union[extractor, Extractor]):
+        if node.name not in self.nodes.keys():
+            raise Exception(f"Node with name {node.name} not found in graph")
+
+        self.runner.deleted_from_memo(node.name)
+
+    def clear_cache_for_all_nodes(self):
+        for node_name in self.nodes:
+            self.runner.deleted_from_memo(node_name=node_name)
+
+    def get_result(self, node: Union[extractor, Extractor]) -> Any:
+        return self.runner.results[node.name]
+
+    def _assign_start_node(self):
+        # this method should be called before a graph can be run
+        nodes = sorted(self._topo_counter.items(), key=lambda x: x[1])
+        self._start_node = nodes[0][0]
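Putting the new pieces together: a hedged sketch of building and running a graph locally, using only constructors and methods visible in this diff (`Graph(name, input, start_node, runner)`, `add_edge`, `run`, `serialize`/`load_graph`, and `LocalRunner` from `indexify.local_runner`). The two extractors, the input model, and passing a `Content` as the workflow input are illustrative assumptions, not documented behaviour.

```python
from typing import List

from pydantic import BaseModel

from indexify.extractor_sdk import Content, extractor
from indexify.graph import Graph, load_graph
from indexify.local_runner import LocalRunner


class TextDocument(BaseModel):
    # Hypothetical input model; the Graph constructor only asks for a BaseModel type.
    text: str


@extractor(description="uppercases the text")
def shout(content: Content) -> List[Content]:
    return [Content(content_type="text/plain", data=content.data.upper(), features=[])]


@extractor(description="reverses the text")
def reverse(content: Content) -> List[Content]:
    return [Content(content_type="text/plain", data=content.data[::-1], features=[])]


runner = LocalRunner()
g = Graph("demo-graph", input=TextDocument, start_node=shout, runner=runner)
g.add_edge(shout, reverse)

# Graphs can be round-tripped with cloudpickle (the new dependency in 0.0.41).
restored = load_graph(g.serialize())

# Assumption: the workflow input is handed to the runner as-is.
g.run(
    wf_input=Content(content_type="text/plain", data=b"hello indexify", features=[]),
    local=True,
)
```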
indexify/local_runner.py
CHANGED
@@ -2,15 +2,13 @@ import hashlib
 import os
 import pickle
 import shutil
-from pathlib import Path
-
-from indexify.extractor_sdk.data import BaseData, Feature
-from indexify.extractor_sdk.extractor import extractor, Extractor
-
 from collections import defaultdict
+from pathlib import Path
 from typing import Any, Callable, Dict, Optional, Union

-from indexify.
+from indexify.extractor_sdk.data import BaseData, Feature
+from indexify.extractor_sdk.extractor import Extractor, extractor
+from indexify.graph import Graph
 from indexify.runner import Runner


@@ -27,9 +25,9 @@ class LocalRunner(Runner):
     # those bytes have to be a python type

     # _input needs to be serializable into python object (ie json for ex) and Feature
-    def _run(self, g:
+    def _run(self, g: Graph, _input: BaseData, node_name: str):
         print(f"---- Starting node {node_name}")
-        print(f
+        print(f"node_name {node_name}")

         extractor_construct: Callable = g.nodes[node_name]
         params = g.params.get(node_name, None)
@@ -65,7 +63,9 @@ class LocalRunner(Runner):
         for out_edge, pre_filter_predicate in g.edges[node_name]:
             # TODO there are no reductions yet, each recursion finishes it's path and returns
             for r in data_to_process:
-                if self._prefilter_content(
+                if self._prefilter_content(
+                    content=r, prefilter_predicate=pre_filter_predicate
+                ):
                     continue

                 self._run(g, _input=r, node_name=out_edge)
@@ -73,7 +73,10 @@ class LocalRunner(Runner):
     """
     Returns True if content should be filtered
     """
-
+
+    def _prefilter_content(
+        self, content: BaseData, prefilter_predicate: Optional[str]
+    ) -> bool:
         if prefilter_predicate is None:
             return False

@@ -83,9 +86,9 @@ class LocalRunner(Runner):

         # TODO For now only support `and` and `=` and `string values`
         bools = []
-        metadata = content.get_features()[
+        metadata = content.get_features()["metadata"]
         for atom in atoms:
-            l, r = atom.split(
+            l, r = atom.split("=")
             if l in metadata:
                 bools.append(metadata[l] != r)

@@ -109,7 +112,7 @@ class LocalRunner(Runner):
         if not os.path.exists(file_path):
             return None

-        with open(file_path,
+        with open(file_path, "rb") as f:
             return f.read()

     def put_into_memo(self, node_name, input_hash, output):
@@ -121,5 +124,5 @@ class LocalRunner(Runner):

         Path(file_path).touch()

-        with open(file_path,
+        with open(file_path, "wb") as f:
             return f.write(output)
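The prefilter hunks show the predicate format the local runner understands: `and`-joined `key=value` atoms matched against the `"metadata"` entry of a content's features. The sketch below re-implements that check standalone to make the behaviour concrete; how the runner splits the predicate into atoms and how it combines the per-atom results is not visible in these hunks, so the `split(" and ")` and `any(...)` steps are assumptions, not the package's exact logic.

```python
from typing import Dict, Optional


def should_filter(metadata: Dict[str, str], prefilter_predicate: Optional[str]) -> bool:
    """Return True when a piece of content should be skipped for an edge."""
    if prefilter_predicate is None:
        return False
    # Assumption: atoms are joined with "and"; only `=` over string values is supported.
    atoms = prefilter_predicate.split(" and ")
    bools = []
    for atom in atoms:
        l, r = atom.split("=")
        if l in metadata:
            bools.append(metadata[l] != r)
    # Assumption: any mismatching atom filters the content out.
    return any(bools)


print(should_filter({"lang": "en"}, "lang=en"))  # False -> content continues downstream
print(should_filter({"lang": "fr"}, "lang=en"))  # True  -> content is filtered
```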
indexify/runner.py
CHANGED
@@ -1,9 +1,9 @@
 from abc import ABC
+from typing import Any, Union

 from indexify.extractor_sdk.data import BaseData
-from indexify.extractor_sdk.extractor import
+from indexify.extractor_sdk.extractor import Extractor, extractor

-from typing import Any, Union

 class Runner(ABC):
     def run(self, g, wf_input: BaseData):
{indexify-0.0.39.dist-info → indexify-0.0.41.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: indexify
-Version: 0.0.39
+Version: 0.0.41
 Summary: Python Client for Indexify
 Home-page: https://github.com/tensorlakeai/indexify
 License: Apache 2.0
@@ -13,6 +13,7 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Requires-Dist: cloudpickle (>=3,<4)
 Requires-Dist: httpx[http2] (>=0,<1)
 Requires-Dist: pydantic (>=2.8,<3.0)
 Requires-Dist: pyyaml (>=6,<7)
indexify-0.0.41.dist-info/RECORD
ADDED
@@ -0,0 +1,25 @@
+indexify/__init__.py,sha256=e4s2395B3gEGrZk2u5OZO2RtrXYFYUTItaM3mtlusBE,493
+indexify/base_client.py,sha256=HwT2KJNq8j-KiPVA9RJm-yearSjxifRjXTcP1zUVeo8,2784
+indexify/client.py,sha256=p4WDmYR94DjU0EqosuCKNGjbfh11qUID6TxDhTK6Uk4,26001
+indexify/data_loaders/__init__.py,sha256=Y5NEuseTcYAICRiweYw5wBQ2m2YplbsY21I7df-rdi4,1339
+indexify/data_loaders/local_directory_loader.py,sha256=fCrgj5drnW71ZUdDDvcB1-VJjIs1w6Q8sEW0HSGSAiA,1247
+indexify/data_loaders/url_loader.py,sha256=32SERljcq1Xsi4RdLz2dgyk2TER5pQPTtXl3gUzwHbY,1533
+indexify/error.py,sha256=3umTeYb0ugtUyehV1ibfvaeACxAONPyWPc-1HRN4d1M,856
+indexify/exceptions.py,sha256=vjd5SPPNFIEW35GorSIodsqvm9RKHQm9kdp8t9gv-WM,111
+indexify/extraction_policy.py,sha256=927BBtZBDPsLMm01uQDPCZnj3Pwmjh6L6QLHb4ShQKk,2076
+indexify/extractor_sdk/__init__.py,sha256=DOL-wJvIspWPqjFRBpmhMbnsMZC2JY-NtNwQGiE6IqU,348
+indexify/extractor_sdk/data.py,sha256=JpX9WdTpiuK72wn6QYhtqj5p5JiJu4waBrK-Hi7lNsA,2742
+indexify/extractor_sdk/extractor.py,sha256=IEZvr1Qe-dVmTgAeJFAhEyHUW20n4uTEeEassH3C5j4,9858
+indexify/extractor_sdk/utils.py,sha256=bW_D2eMWTzcAYZ8Lv7LUKGgOD0cyW77E6gNO3y7iNNA,6234
+indexify/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+indexify/extractors/embedding.py,sha256=Be6X4odSHbkAEm2myxB04RN-Mvb2bFk8uWXxUpY-Z6E,1859
+indexify/extractors/pdf_parser.py,sha256=XN-_b_W7CrpkTeWYs4H6hkK_mx-k4N2o1RSAVkQhr8Q,2842
+indexify/graph.py,sha256=UdvrpNc-SdD3U27Ee9aTMMYcSOUz__WQWc31oFHV4yQ,3963
+indexify/local_runner.py,sha256=uuMJbnT4qYMSySxsB3lEC7FSjYnJFh5eNZ00zu5gLNw,4387
+indexify/runner.py,sha256=VVmLGF1kAmEuE461Hs0QJFnSvVWtUzYhhQfB1KptYPU,637
+indexify/settings.py,sha256=LSaWZ0ADIVmUv6o6dHWRC3-Ry5uLbCw2sBSg1e_U7UM,99
+indexify/utils.py,sha256=rDN2lrsAs9noJEIjfx6ukmC2SAIyrlUt7QU-kaBjujM,125
+indexify-0.0.41.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+indexify-0.0.41.dist-info/METADATA,sha256=yTEubUvxQgDUcXrf6rxzvITsW6BDBjzG2LXxyl9A-O0,1913
+indexify-0.0.41.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+indexify-0.0.41.dist-info/RECORD,,
indexify/run_graph.py
DELETED
@@ -1,122 +0,0 @@
-import json
-
-from .extractor_sdk import Content, extractor, Extractor
-
-from collections import defaultdict
-from typing import Any, Dict, List, Optional, Type, Union
-from pydantic import BaseModel
-
-import itertools
-
-from .runner import Runner
-
-@extractor(description="id function")
-def _id(content: Content) -> List[Content]:
-    return [content]
-
-
-class RunGraph:
-    def __init__(self, name: str, input: Type[BaseModel], start_node: extractor, runner: Runner):
-        # TODO check for cycles
-        self.name = name
-
-        self.nodes: Dict[str, Union[extractor, Extractor]] = {}
-        self.params: Dict[str, Any] = {}
-
-        self.edges: Dict[str, List[(str, str)]] = defaultdict(list)
-
-        self.nodes["start"] = _id
-        self.nodes["end"] = _id
-
-        self._topo_counter = defaultdict(int)
-
-        self._start_node = None
-        self._input = input
-
-        self.runner = runner
-
-    def _node(self, extractor: Union[extractor, Extractor], params: Any = None) -> 'RunGraph':
-        name = extractor.name
-
-        # if you've already inserted a node just ignore the new insertion.
-        if name in self.nodes:
-            return
-
-        self.nodes[name] = extractor
-        self.params[name] = extractor.__dict__.get("params", None)
-
-        # assign each node a rank of 1 to init the graph
-        self._topo_counter[name] = 1
-
-        return self
-
-    def add_edge(
-        self,
-        from_node: extractor,
-        to_node: extractor,
-        prefilter_predicates: Optional[str] = None,
-    ) -> 'RunGraph':
-
-        self._node(from_node)
-        self._node(to_node)
-
-        from_node_name = from_node.name
-        to_node_name = to_node.name
-
-        self.edges[from_node_name].append((to_node_name, prefilter_predicates))
-
-        self._topo_counter[to_node_name] += 1
-
-        return self
-
-    """
-    Connect nodes as a fan out from one `from_node` to multiple `to_nodes` and respective `prefilter_predicates`.
-    Note: The user has to match the sizes of the lists to make sure they line up otherwise a None is used as a default.
-    """
-
-    def steps(
-        self,
-        from_node: extractor,
-        to_nodes: List[extractor],
-        prefilter_predicates: List[str] = [],
-    ) -> 'RunGraph':
-        print(f"{to_nodes}, {prefilter_predicates}, {prefilter_predicates}")
-        for t_n, p in itertools.zip_longest(
-            to_nodes, prefilter_predicates, fillvalue=None
-        ):
-            self.step(from_node=from_node, to_node=t_n, prefilter_predicates=p)
-
-        return self
-
-    def add_param(self, node: extractor, params: Dict[str, Any]):
-        try:
-            # check if the params can be serialized since the server needs this
-            json.dumps(params)
-        except Exception:
-            raise Exception(f"For node {node.name}, cannot serialize params as json.")
-
-        self.params[node.name] = params
-
-    def run(self, wf_input, local):
-        self._assign_start_node()
-        # self.runner = LocalRunner()
-        self.runner.run(self, wf_input=wf_input)
-        pass
-
-    def clear_cache_for_node(self, node: Union[extractor, Extractor]):
-        if node.name not in self.nodes.keys():
-            raise Exception(f"Node with name {node.name} not found in graph")
-
-        self.runner.deleted_from_memo(node.name)
-
-    def clear_cache_for_all_nodes(self):
-        for node_name in self.nodes:
-            self.runner.deleted_from_memo(node_name=node_name)
-
-    def get_result(self, node: Union[extractor, Extractor]) -> Any:
-        return self.runner.results[node.name]
-
-    def _assign_start_node(self):
-        # this method should be called before a graph can be run
-        nodes = sorted(self._topo_counter.items(), key=lambda x: x[1])
-        self._start_node = nodes[0][0]
indexify-0.0.39.dist-info/RECORD
DELETED
@@ -1,23 +0,0 @@
-indexify/__init__.py,sha256=ZDpPkRz4hBo6eqArhVBxqIscLSiD20q5rOHPYyOTloE,503
-indexify/base_client.py,sha256=Db-BNYQ6yNmOIXPaQN8W5qjTYvfFvPzoxC9206YRc-U,2755
-indexify/client.py,sha256=FPCO2DN6RstKLasmNrPxRhzBXDgM14tbc3eDDxl8J_A,25998
-indexify/data_loaders/__init__.py,sha256=TmOJLgKC5gM7_1n7zxYiuza3fOilIiYYupxBGd31PfA,1339
-indexify/data_loaders/local_directory_loader.py,sha256=0X_FgLS5unisJSij8LICv1htp8IdW09LbTIJ2wvVJg4,1246
-indexify/data_loaders/url_loader.py,sha256=shjw6dYBlaxA_PzP6qCB9TTtbPiY4h6FV7uopDbRQCc,1546
-indexify/error.py,sha256=3umTeYb0ugtUyehV1ibfvaeACxAONPyWPc-1HRN4d1M,856
-indexify/exceptions.py,sha256=vjd5SPPNFIEW35GorSIodsqvm9RKHQm9kdp8t9gv-WM,111
-indexify/extraction_policy.py,sha256=awNDqwCz0tr4jTQmGf7s8_s6vcEuxMb0xynEl7b7iPI,2076
-indexify/extractor_sdk/__init__.py,sha256=T512UtvFPUXEXlnT9HHHLHPcEau1Acoac_ksByuo7jA,348
-indexify/extractor_sdk/data.py,sha256=632fY4S_F_aYPLtOl_7dZnSAyMvVZY8ujSSIWJ9k104,2781
-indexify/extractor_sdk/extractor.py,sha256=CtlRn8JC8vGn9fm4QameA47x9T1l_cRpkJMUYYpetco,10457
-indexify/extractor_sdk/utils.py,sha256=_j8WflgOM0Qkf2NjhK2p1xXuwq4drLxO0mgKVPEHhlw,6594
-indexify/graph.py,sha256=fVZeGIcSqO3p8dGIQOdbuFYQ-8QaTQ7Jr37OefA2Phk,549
-indexify/local_runner.py,sha256=Ri-Wpw2qgnQ4I3fRR9qdXXRDASuZnu4-VR2xECG9gnY,4346
-indexify/run_graph.py,sha256=gw3IEf8-myVaHUV7g6LPt8-uSMIVr7S0Zs62aT7UB90,3757
-indexify/runner.py,sha256=M_3_GWYyPpb4lR5KFTpW8OAgp-fm9kYd_5xEqmiCBU4,637
-indexify/settings.py,sha256=LSaWZ0ADIVmUv6o6dHWRC3-Ry5uLbCw2sBSg1e_U7UM,99
-indexify/utils.py,sha256=rDN2lrsAs9noJEIjfx6ukmC2SAIyrlUt7QU-kaBjujM,125
-indexify-0.0.39.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-indexify-0.0.39.dist-info/METADATA,sha256=EvEM7lkuDP1YJsh0wskXIBMQxivHYPKfPNERLV0eaa0,1877
-indexify-0.0.39.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-indexify-0.0.39.dist-info/RECORD,,
{indexify-0.0.39.dist-info → indexify-0.0.41.dist-info}/LICENSE.txt
File without changes

{indexify-0.0.39.dist-info → indexify-0.0.41.dist-info}/WHEEL
File without changes