indexify 0.0.40.tar.gz → 0.0.42.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {indexify-0.0.40 → indexify-0.0.42}/PKG-INFO +1 -1
- {indexify-0.0.40 → indexify-0.0.42}/indexify/__init__.py +4 -6
- {indexify-0.0.40 → indexify-0.0.42}/indexify/base_client.py +11 -7
- {indexify-0.0.40 → indexify-0.0.42}/indexify/client.py +15 -12
- {indexify-0.0.40 → indexify-0.0.42}/indexify/data_loaders/local_directory_loader.py +3 -2
- {indexify-0.0.40 → indexify-0.0.42}/indexify/data_loaders/url_loader.py +4 -2
- {indexify-0.0.40 → indexify-0.0.42}/indexify/extraction_policy.py +2 -2
- {indexify-0.0.40 → indexify-0.0.42}/indexify/extractor_sdk/__init__.py +2 -2
- {indexify-0.0.40 → indexify-0.0.42}/indexify/extractor_sdk/data.py +3 -11
- {indexify-0.0.40 → indexify-0.0.42}/indexify/extractor_sdk/extractor.py +18 -6
- {indexify-0.0.40 → indexify-0.0.42}/indexify/extractor_sdk/utils.py +9 -15
- {indexify-0.0.40 → indexify-0.0.42}/indexify/extractors/embedding.py +15 -13
- indexify-0.0.42/indexify/extractors/pdf_parser.py +93 -0
- {indexify-0.0.40 → indexify-0.0.42}/indexify/graph.py +12 -8
- {indexify-0.0.40 → indexify-0.0.42}/indexify/local_runner.py +15 -12
- {indexify-0.0.40 → indexify-0.0.42}/indexify/runner.py +2 -2
- {indexify-0.0.40 → indexify-0.0.42}/pyproject.toml +1 -1
- {indexify-0.0.40 → indexify-0.0.42}/LICENSE.txt +0 -0
- {indexify-0.0.40 → indexify-0.0.42}/README.md +0 -0
- {indexify-0.0.40 → indexify-0.0.42}/indexify/data_loaders/__init__.py +5 -5
- {indexify-0.0.40 → indexify-0.0.42}/indexify/error.py +0 -0
- {indexify-0.0.40 → indexify-0.0.42}/indexify/exceptions.py +0 -0
- {indexify-0.0.40 → indexify-0.0.42}/indexify/extractors/__init__.py +0 -0
- {indexify-0.0.40 → indexify-0.0.42}/indexify/settings.py +0 -0
- {indexify-0.0.40 → indexify-0.0.42}/indexify/utils.py +0 -0
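In summary: the bulk of this release is mechanical formatting — import blocks regrouped and alphabetized, long signatures reflowed, quote styles normalized — plus one new module (`indexify/extractors/pdf_parser.py`), a `SampleExtractorData._download_file` helper that now returns bytes instead of writing to disk, and a filled-in `BasicHFTransformerEmbeddingModels`. Note that the diff renderer clipped several changed lines mid-line on the removed side; those lines are reproduced below exactly as rendered.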
indexify/__init__.py:

```diff
@@ -1,15 +1,13 @@
-from .
-from .extraction_policy import ExtractionGraph
+from . import data_loaders, extractor_sdk
 from .client import (
-    IndexifyClient,
     Document,
+    IndexifyClient,
     generate_hash_from_string,
     generate_unique_hex_id,
 )
-from . import
-from .settings import DEFAULT_SERVICE_URL
-from . import data_loaders
+from .extraction_policy import ExtractionGraph
 from .graph import Graph
+from .settings import DEFAULT_SERVICE_URL
 
 __all__ = [
     "data_loaders",
```
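Aside from ordering, the package's re-exported surface is unchanged. A quick smoke test of the names above, assuming `indexify==0.0.42` is installed:

```python
# Every name re-exported by indexify/__init__.py should import cleanly.
from indexify import (
    Document,
    ExtractionGraph,
    Graph,
    IndexifyClient,
    data_loaders,
    extractor_sdk,
)

# Document is namedtuple("Document", ["text", "labels", "id"]) per the client.py hunk below.
doc = Document(text="hello", labels={"lang": "en"}, id="doc-1")
print(doc.text, doc.labels)
```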
indexify/base_client.py:

```diff
@@ -1,6 +1,8 @@
 from abc import ABC, abstractmethod
-from
-
+from typing import Any, Dict, List, Optional, Union
+
+from .extractor_sdk import Feature, Graph
+
 
 class BaseClient(ABC):
 
@@ -23,7 +25,7 @@ class BaseClient(ABC):
 
     ### Ingestion APIs
     @abstractmethod
-    def invoke_graph_with_object(self, graph:str, object: Any) -> str:
+    def invoke_graph_with_object(self, graph: str, object: Any) -> str:
         """
         Invokes a graph with an input object.
         graph: str: The name of the graph to invoke
@@ -42,10 +44,11 @@ class BaseClient(ABC):
         """
         pass
 
-
     ### Retrieval APIs
     @abstractmethod
-    def extracted_objects(
+    def extracted_objects(
+        self, graph: str, ingested_object_id: str, extractor_name: Optional[str]
+    ) -> Union[Dict[str, List[Any]], List[Any]]:
         """
         Returns the extracted objects by a graph for an ingested object. If the extractor name is provided, only the objects extracted by that extractor are returned.
         If the extractor name is not provided, all the extracted objects are returned for the input object.
@@ -57,11 +60,12 @@ class BaseClient(ABC):
         pass
 
     @abstractmethod
-    def features(
+    def features(
+        self, object_id: str, graph: Optional[str]
+    ) -> Union[Dict[str, List[Feature]], List[Feature]]:
         """
         Returns the features of an object.
         object_id: str: The ID of the object
         return: List[Feature]: The features associated with the object that were extracted. If a graph name is provided, only the features extracted by that graph are returned.
         """
         pass
-
```
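Both retrieval signatures are reflowed, not changed. A hypothetical stub subclass, purely to illustrate the abstract contract (it ignores any abstract methods outside these hunks):

```python
from typing import Any, Dict, List, Optional, Union

from indexify.base_client import BaseClient


class StubClient(BaseClient):
    """Placeholder bodies only; a real client would talk to the Indexify server."""

    def invoke_graph_with_object(self, graph: str, object: Any) -> str:
        return "invocation-id"  # hypothetical ID

    def extracted_objects(
        self, graph: str, ingested_object_id: str, extractor_name: Optional[str]
    ) -> Union[Dict[str, List[Any]], List[Any]]:
        # Flat list when one extractor is named, dict keyed by extractor otherwise.
        return [] if extractor_name else {}

    def features(self, object_id: str, graph: Optional[str]):
        # Per the docstring: all features, or only those extracted by `graph`.
        return [] if graph else {}
```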
indexify/client.py:

```diff
@@ -1,20 +1,23 @@
-import yaml
-import httpx
-import uuid
 import hashlib
 import json
+import logging
+import uuid
 from collections import namedtuple
-from
-from
-
-
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Union
+
+import httpx
+import yaml
+
+from indexify.exceptions import ApiException
+
+from .data_loaders import DataLoader
 from .error import Error
+from .extraction_policy import ExtractionGraph
 from .extractor_sdk.data import ContentMetadata
-from .
-from
-from
-from typing import List, Optional, Union, Dict
-import logging
+from .extractor_sdk.extractor import ExtractorMetadata
+from .settings import DEFAULT_SERVICE_URL, DEFAULT_SERVICE_URL_HTTPS
+from .utils import json_set_default
 
 Document = namedtuple("Document", ["text", "labels", "id"])
 
```
indexify/extractor_sdk/__init__.py:

```diff
@@ -1,5 +1,5 @@
-from .data import
-from .extractor import
+from .data import Content, ContentMetadata, Feature
+from .extractor import EmbeddingSchema, Extractor, ExtractorMetadata, extractor
 from .utils import SampleExtractorData
 
 __all__ = [
```
indexify/extractor_sdk/data.py:

```diff
@@ -1,15 +1,7 @@
-from typing import (
-    Any,
-    List,
-    Optional,
-    Literal,
-    Dict,
-    Type,
-    cast,
-    Mapping,
-)
-from pydantic import BaseModel, Json, Field
 import json
+from typing import Any, Dict, List, Literal, Mapping, Optional, Type, cast
+
+from pydantic import BaseModel, Field, Json
 from typing_extensions import Annotated, Doc
 
 
```
indexify/extractor_sdk/extractor.py:

```diff
@@ -1,11 +1,22 @@
-from typing import Union, Optional, List, Type, Tuple, Callable, get_type_hints, Dict
 import inspect
-from pydantic import BaseModel, Field
-from abc import ABC, abstractmethod
-from .data import BaseData, Content, Feature
 import json
 import os
+from abc import ABC, abstractmethod
+from typing import (
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Tuple,
+    Type,
+    Union,
+    get_type_hints,
+)
+
 import requests
+from pydantic import BaseModel, Field
+
+from .data import BaseData, Content, Feature
 
 
 class EmbeddingSchema(BaseModel):
@@ -13,6 +24,7 @@ class EmbeddingSchema(BaseModel):
     distance: Optional[str] = "cosine"
     database_url: Optional[str] = None
 
+
 class ExtractorMetadata(BaseModel):
     name: str
     version: str
@@ -42,7 +54,7 @@ class Extractor(ABC):
     input_mime_types = ["text/plain"]
 
     embedding_indexes: Dict[str, EmbeddingSchema] = {}
-
+
     @abstractmethod
     def extract(
         self, input: Type[BaseModel], params: Type[BaseModel] = None
@@ -186,7 +198,7 @@ def extractor(
 
     class DecoratedFn(Extractor):
         @classmethod
-        def extract(cls, input: Type[BaseData], params: Type[BaseModel]=None) -> List[Union[Type[BaseModel], Type[Feature]]]:  # type: ignore
+        def extract(cls, input: Type[BaseData], params: Type[BaseModel] = None) -> List[Union[Type[BaseModel], Type[Feature]]]:  # type: ignore
             # TODO we can force all the functions to take in a parms object
             # or check if someone adds a params
             if params is None:
```
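For reference, the `@extractor` decorator whose generated `extract` is reformatted above is exercised in graph.py further down; a minimal sketch of that usage pattern:

```python
from typing import List

from indexify.extractor_sdk import Content, extractor


# Mirrors the `_id` function registered in indexify/graph.py below: the decorator
# wraps the function in an Extractor subclass (DecoratedFn) whose extract() calls it.
@extractor(description="identity function")
def passthrough(content: Content) -> List[Content]:
    return [content]
```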
indexify/extractor_sdk/utils.py:

```diff
@@ -1,20 +1,17 @@
 import os
-import httpx
 from typing import List
+
+import httpx
+
 from .data import Content, Feature
 
 
 class SampleExtractorData:
-    def _download_file(self, url
-        if os.path.exists(filename):
-            # file exists skip
-            return
+    def _download_file(self, url):
         try:
-
-
-
-            for chunk in r.iter_content(chunk_size=8192):
-                f.write(chunk)
+            resp = httpx.get(url, follow_redirects=True)
+            resp.raise_for_status()
+            return resp.content
         except httpx.exceptions.RequestException as e:
             print(f"Error downloading the file: {e}")
 
@@ -55,13 +52,10 @@ class SampleExtractorData:
         return Content(content_type="image/jpg", data=f.read(), features=features)
 
     def sample_invoice_pdf(self, features: List[Feature] = []) -> Content:
-
-        self._download_file(
+        data = self._download_file(
             "https://extractor-files.diptanu-6d5.workers.dev/invoice-example.pdf",
-            file_name,
         )
-
-        return Content(content_type="application/pdf", data=f.read(), features=features)
+        return Content(content_type="application/pdf", data=data, features=features)
 
     def sample_image_based_pdf(self, features: List[Feature] = []) -> Content:
         file_name = "sample.pdf"
```
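`_download_file` previously streamed the response to a local file (skipping the download when the file already existed); it now fetches with httpx and returns the raw bytes, which `sample_invoice_pdf` threads straight into `Content`. One caveat worth hedging: current httpx releases expose `httpx.RequestError` and have no `exceptions` submodule, so the `except httpx.exceptions.RequestException` clause may only work on older httpx versions. Expected usage after the change:

```python
from indexify.extractor_sdk import SampleExtractorData

# After this change the helper downloads into memory on every call (no on-disk
# cache); Content.data carries the PDF bytes directly. Requires network access.
sample = SampleExtractorData()
invoice = sample.sample_invoice_pdf()
print(invoice.content_type, len(invoice.data))  # "application/pdf" and the byte count
```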
indexify/extractors/embedding.py:

```diff
@@ -1,10 +1,12 @@
 from typing import List
 
-from indexify.extractor_sdk.data import Feature
 import torch
 import torch.nn.functional as F
 from transformers import AutoModel, AutoTokenizer
-
+
+from indexify.extractor_sdk.data import Feature
+from indexify.extractor_sdk.extractor import Extractor, Feature
+
 
 class SentenceTransformersEmbedding:
     def __init__(self, model_name) -> None:
@@ -31,9 +33,9 @@ class SentenceTransformersEmbedding:
         )
         sentence_embeddings = self._model(**encoded_input)
         return F.normalize(sentence_embeddings, p=2, dim=1)
-
-class BasicSentenceTransformerModels(Extractor):
 
+
+class BasicSentenceTransformerModels(Extractor):
     def __init__(self, model: str):
         super().__init__()
         self.model = SentenceTransformersEmbedding(model)
@@ -41,13 +43,13 @@ class BasicSentenceTransformerModels(Extractor):
     def extract(self, input: str) -> List[Feature]:
         embeddings = self.model.embed(input)
         return [Feature.embedding(values=embeddings)]
-
+
+
 class BasicHFTransformerEmbeddingModels(Extractor):
-
-
-
-
-
-
-
-        return [Feature.embedding(values=embeddings)]
+    def __init__(self, model: str):
+        super().__init__()
+        self._model = AutoModel.from_pretrained(model, trust_remote_code=True)
+
+    def extract(self, input: str) -> List[Feature]:
+        embeddings = self.model.embed_query(input)
+        return [Feature.embedding(values=embeddings)]
```
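A usage sketch for the sentence-transformer extractor (the model name is an example; weights are fetched on first use). Two loose ends are visible in the new code as diffed: `BasicHFTransformerEmbeddingModels.__init__` stores the model as `self._model` while its `extract` reads `self.model.embed_query(...)`, and `Feature` is imported twice, with the second import shadowing the first.

```python
from indexify.extractors.embedding import BasicSentenceTransformerModels

# Requires torch and transformers; downloads the checkpoint on first run.
embedder = BasicSentenceTransformerModels(model="sentence-transformers/all-MiniLM-L6-v2")
features = embedder.extract("indexify builds extraction pipelines")
print(features[0])  # a Feature.embedding(...) wrapping the normalized vector
```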
indexify/extractors/pdf_parser.py (new file):

```diff
@@ -0,0 +1,93 @@
+import tempfile
+from enum import Enum
+from typing import List, Optional
+
+from pydantic import BaseModel
+
+
+class PageFragmentType(str, Enum):
+    TEXT = "text"
+    FIGURE = "figure"
+    TABLE = "table"
+
+
+class Image(BaseModel):
+    data: bytes
+    mime_type: str
+
+
+class TableEncoding(str, Enum):
+    CSV = "csv"
+    HTML = "html"
+
+
+class Table(BaseModel):
+    data: str
+    encoding: TableEncoding
+
+
+class PageFragment(BaseModel):
+    fragment_type: PageFragmentType
+    text: Optional[str] = None
+    image: Optional[Image] = None
+    table: Optional[Table] = None
+    reading_order: Optional[int] = None
+
+
+class Page(BaseModel):
+    number: int
+    fragments: List[PageFragment]
+
+
+class PDFParser:
+    def __init__(self, data: bytes, language: Optional[str] = "en"):
+        self._data = data
+
+    def parse(self) -> List[Page]:
+        import deepdoctection as dd
+        analyzer = dd.get_dd_analyzer()
+        parsed_pages = []
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as f:
+            f.write(self._data)
+            f.flush()
+            df = analyzer.analyze(path=f.name)
+            df.reset_state()
+            for page in df:
+                parsed_pages.append(page)
+        outputs: List[Page] = []
+        for parsed_page in parsed_pages:
+            page_num = parsed_page.page_number
+            fragments = []
+            for layout in parsed_page.layouts:
+                if layout.category_name in ["text", "title"]:
+                    fragments.append(
+                        PageFragment(
+                            fragment_type=PageFragmentType.TEXT,
+                            text=layout.text,
+                            reading_order=layout.reading_order,
+                        )
+                    )
+            figures = parsed_page.get_annotation(category_names=dd.LayoutType.FIGURE)
+            for figure in figures:
+                image_bytes = dd.viz_handler.encode(figure.viz())
+                fragments.append(
+                    PageFragment(
+                        fragment_type=PageFragmentType.FIGURE,
+                        image=Image(data=image_bytes, mime_type="image/png"),
+                        reading_order=figure.reading_order,
+                    )
+                )
+
+            tables = parsed_page.get_annotation(category_names=dd.LayoutType.TABLE)
+            for table in tables:
+                fragments.append(
+                    PageFragment(
+                        fragment_type=PageFragmentType.TABLE,
+                        table=Table(data=table.html, encoding=TableEncoding.HTML),
+                        reading_order=table.reading_order,
+                    )
+                )
+
+            outputs.append(Page(number=page_num, fragments=fragments))
+
+        return outputs
```
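A usage sketch for the new parser, assuming the optional `deepdoctection` dependency (and its models) is installed; it is imported lazily inside `parse()`, so it is only needed at call time. Note also that the `language` constructor argument is accepted but not yet used (only `self._data` is stored).

```python
from indexify.extractors.pdf_parser import PDFParser, PageFragmentType

# Parse a local PDF from raw bytes (path is an example).
with open("invoice-example.pdf", "rb") as f:
    pages = PDFParser(f.read()).parse()

for page in pages:
    for fragment in page.fragments:
        if fragment.fragment_type == PageFragmentType.TEXT:
            print(page.number, fragment.reading_order, (fragment.text or "")[:60])
```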
indexify/graph.py:

```diff
@@ -1,24 +1,28 @@
-import json
 import itertools
+import json
 from collections import defaultdict
 from typing import Any, Dict, List, Optional, Type, Union
 
 import cloudpickle
 from pydantic import BaseModel
 
-from .extractor_sdk import Content,
+from .extractor_sdk import Content, Extractor, extractor
 from .runner import Runner
 
+
 @extractor(description="id function")
 def _id(content: Content) -> List[Content]:
     return [content]
 
 
-def load_graph(graph: bytes) ->
+def load_graph(graph: bytes) -> "Graph":
     return cloudpickle.loads(graph)
 
+
 class Graph:
-    def __init__(
+    def __init__(
+        self, name: str, input: Type[BaseModel], start_node: extractor, runner: Runner
+    ):
         # TODO check for cycles
         self.name = name
 
@@ -40,7 +44,7 @@ class Graph:
     def get_extractor(self, name: str) -> Extractor:
         return self.nodes[name]
 
-    def _node(self, extractor: Extractor, params: Any = None) ->
+    def _node(self, extractor: Extractor, params: Any = None) -> "Graph":
         name = extractor.name
 
         # if you've already inserted a node just ignore the new insertion.
@@ -54,7 +58,7 @@ class Graph:
         self._topo_counter[name] = 1
 
         return self
-
+
     def serialize(self):
         return cloudpickle.dumps(self)
 
@@ -63,7 +67,7 @@
         from_node: Type[Extractor],
         to_node: Type[Extractor],
         prefilter_predicates: Optional[str] = None,
-    ) ->
+    ) -> "Graph":
 
         self._node(from_node)
         self._node(to_node)
@@ -87,7 +91,7 @@ class Graph:
         from_node: extractor,
         to_nodes: List[extractor],
         prefilter_predicates: List[str] = [],
-    ) ->
+    ) -> "Graph":
         print(f"{to_nodes}, {prefilter_predicates}, {prefilter_predicates}")
         for t_n, p in itertools.zip_longest(
             to_nodes, prefilter_predicates, fillvalue=None
```
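The quoted `"Graph"` return annotations are forward references to the class under definition. `serialize()` and `load_graph()` are a cloudpickle round trip; a small sketch assuming an already-constructed graph:

```python
from indexify.graph import Graph, load_graph


def roundtrip(g: Graph) -> Graph:
    # Graph.serialize() is cloudpickle.dumps(self); load_graph() is cloudpickle.loads().
    blob: bytes = g.serialize()
    restored = load_graph(blob)
    assert restored.name == g.name
    return restored
```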
indexify/local_runner.py:

```diff
@@ -2,14 +2,12 @@ import hashlib
 import os
 import pickle
 import shutil
-from pathlib import Path
-
-from indexify.extractor_sdk.data import BaseData, Feature
-from indexify.extractor_sdk.extractor import extractor, Extractor
-
 from collections import defaultdict
+from pathlib import Path
 from typing import Any, Callable, Dict, Optional, Union
 
+from indexify.extractor_sdk.data import BaseData, Feature
+from indexify.extractor_sdk.extractor import Extractor, extractor
 from indexify.graph import Graph
 from indexify.runner import Runner
 
@@ -29,7 +27,7 @@ class LocalRunner(Runner):
    # _input needs to be serializable into python object (ie json for ex) and Feature
    def _run(self, g: Graph, _input: BaseData, node_name: str):
        print(f"---- Starting node {node_name}")
-       print(f
+       print(f"node_name {node_name}")
 
        extractor_construct: Callable = g.nodes[node_name]
        params = g.params.get(node_name, None)
@@ -65,7 +63,9 @@
        for out_edge, pre_filter_predicate in g.edges[node_name]:
            # TODO there are no reductions yet, each recursion finishes it's path and returns
            for r in data_to_process:
-               if self._prefilter_content(
+               if self._prefilter_content(
+                   content=r, prefilter_predicate=pre_filter_predicate
+               ):
                    continue
 
                self._run(g, _input=r, node_name=out_edge)
@@ -73,7 +73,10 @@
    """
    Returns True if content should be filtered
    """
-
+
+   def _prefilter_content(
+       self, content: BaseData, prefilter_predicate: Optional[str]
+   ) -> bool:
        if prefilter_predicate is None:
            return False
 
@@ -83,9 +86,9 @@
 
        # TODO For now only support `and` and `=` and `string values`
        bools = []
-       metadata = content.get_features()[
+       metadata = content.get_features()["metadata"]
        for atom in atoms:
-           l, r = atom.split(
+           l, r = atom.split("=")
            if l in metadata:
                bools.append(metadata[l] != r)
 
@@ -109,7 +112,7 @@
        if not os.path.exists(file_path):
            return None
 
-       with open(file_path,
+       with open(file_path, "rb") as f:
            return f.read()
 
    def put_into_memo(self, node_name, input_hash, output):
@@ -121,5 +124,5 @@
 
        Path(file_path).touch()
 
-       with open(file_path,
+       with open(file_path, "wb") as f:
            return f.write(output)
```
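On the prefilter logic: per the TODO, predicates are `key=value` atoms joined by `and`, compared against the content's `metadata` features; how the collected booleans are combined happens outside the visible hunks. A standalone sketch of just the visible behavior (the `and`-splitting and `.strip()` are assumptions):

```python
from typing import List, Optional


def prefilter_bools(metadata: dict, prefilter_predicate: Optional[str]) -> List[bool]:
    # Mirrors the loop in LocalRunner._prefilter_content: one bool per known key,
    # True when the metadata value differs from the predicate's value.
    if prefilter_predicate is None:
        return []
    bools = []
    for atom in prefilter_predicate.split("and"):
        l, r = atom.strip().split("=")
        if l in metadata:
            bools.append(metadata[l] != r)
    return bools


print(prefilter_bools({"lang": "en"}, "lang=en"))  # [False]
```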
indexify/runner.py:

```diff
@@ -1,9 +1,9 @@
 from abc import ABC
+from typing import Any, Union
 
 from indexify.extractor_sdk.data import BaseData
-from indexify.extractor_sdk.extractor import
+from indexify.extractor_sdk.extractor import Extractor, extractor
 
-from typing import Any, Union
 
 class Runner(ABC):
     def run(self, g, wf_input: BaseData):
```
pyproject.toml:

```diff
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "indexify"
-version = "0.0.40"
+version = "0.0.42"
 description = "Python Client for Indexify"
 authors = ["Diptanu Gon Choudhury <diptanuc@gmail.com>", "Lucas Jackson <lucas@tensorlake.ai>", "Vijay Parthasarathy <vijay2win@gmail.com>"]
 license = "Apache 2.0"
```
indexify/data_loaders/__init__.py:

```diff
@@ -1,10 +1,10 @@
-
-
+import hashlib
+import mimetypes
+import os
 from abc import ABC, abstractmethod
 from typing import List
-
-import
-import hashlib
+
+from pydantic import BaseModel
 
 
 class FileMetadata(BaseModel):
```