indexify 0.0.39__py3-none-any.whl → 0.0.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/__init__.py +5 -5
- indexify/base_client.py +11 -7
- indexify/client.py +15 -12
- indexify/data_loaders/__init__.py +5 -5
- indexify/data_loaders/local_directory_loader.py +3 -2
- indexify/data_loaders/url_loader.py +4 -3
- indexify/extraction_policy.py +2 -2
- indexify/extractor_sdk/__init__.py +2 -2
- indexify/extractor_sdk/data.py +4 -12
- indexify/extractor_sdk/extractor.py +23 -31
- indexify/extractor_sdk/utils.py +9 -15
- indexify/extractors/__init__.py +0 -0
- indexify/extractors/embedding.py +55 -0
- indexify/extractors/pdf_parser.py +95 -0
- indexify/graph.py +126 -16
- indexify/local_runner.py +17 -14
- indexify/runner.py +2 -2
- {indexify-0.0.39.dist-info → indexify-0.0.41.dist-info}/METADATA +2 -1
- indexify-0.0.41.dist-info/RECORD +25 -0
- indexify/run_graph.py +0 -122
- indexify-0.0.39.dist-info/RECORD +0 -23
- {indexify-0.0.39.dist-info → indexify-0.0.41.dist-info}/LICENSE.txt +0 -0
- {indexify-0.0.39.dist-info → indexify-0.0.41.dist-info}/WHEEL +0 -0
indexify/__init__.py
CHANGED
@@ -1,17 +1,17 @@
-from .
-from .extraction_policy import ExtractionGraph
+from . import data_loaders, extractor_sdk
 from .client import (
-    IndexifyClient,
     Document,
+    IndexifyClient,
     generate_hash_from_string,
     generate_unique_hex_id,
 )
-from . import
+from .extraction_policy import ExtractionGraph
+from .graph import Graph
 from .settings import DEFAULT_SERVICE_URL
-from . import data_loaders

 __all__ = [
     "data_loaders",
+    "Graph",
     "Document",
     "extractor_sdk",
     "IndexifyClient",
indexify/base_client.py
CHANGED
@@ -1,6 +1,8 @@
 from abc import ABC, abstractmethod
-from
-
+from typing import Any, Dict, List, Optional, Union
+
+from .extractor_sdk import Feature, Graph
+

 class BaseClient(ABC):

@@ -23,7 +25,7 @@ class BaseClient(ABC):

     ### Ingestion APIs
     @abstractmethod
-    def invoke_graph_with_object(self, graph:str, object: Any) -> str:
+    def invoke_graph_with_object(self, graph: str, object: Any) -> str:
         """
         Invokes a graph with an input object.
         graph: str: The name of the graph to invoke
@@ -42,10 +44,11 @@ class BaseClient(ABC):
         """
         pass

-
     ### Retrieval APIs
     @abstractmethod
-    def extracted_objects(
+    def extracted_objects(
+        self, graph: str, ingested_object_id: str, extractor_name: Optional[str]
+    ) -> Union[Dict[str, List[Any]], List[Any]]:
         """
         Returns the extracted objects by a graph for an ingested object. If the extractor name is provided, only the objects extracted by that extractor are returned.
         If the extractor name is not provided, all the extracted objects are returned for the input object.
@@ -57,11 +60,12 @@ class BaseClient(ABC):
         pass

     @abstractmethod
-    def features(
+    def features(
+        self, object_id: str, graph: Optional[str]
+    ) -> Union[Dict[str, List[Feature]], List[Feature]]:
         """
         Returns the features of an object.
         object_id: str: The ID of the object
         return: List[Feature]: The features associated with the object that were extracted. If a graph name is provided, only the features extracted by that graph are returned.
         """
         pass
-
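For orientation, a hypothetical in-memory subclass that satisfies the three methods visible in these hunks. `InMemoryClient` and its storage scheme are purely illustrative, and the real `BaseClient` may declare additional abstract methods that are not shown in this diff.

```python
from typing import Any, Dict, List, Optional, Union

from indexify.base_client import BaseClient
from indexify.extractor_sdk import Feature


class InMemoryClient(BaseClient):
    """Hypothetical client used only to illustrate the abstract surface above."""

    def __init__(self):
        self._objects: Dict[str, Any] = {}

    def invoke_graph_with_object(self, graph: str, object: Any) -> str:
        # Store the ingested object under a synthetic id and return that id.
        object_id = f"{graph}:{len(self._objects)}"
        self._objects[object_id] = object
        return object_id

    def extracted_objects(
        self, graph: str, ingested_object_id: str, extractor_name: Optional[str]
    ) -> Union[Dict[str, List[Any]], List[Any]]:
        # A real client would return extractor outputs; this sketch echoes the input.
        return [self._objects.get(ingested_object_id)]

    def features(
        self, object_id: str, graph: Optional[str]
    ) -> Union[Dict[str, List[Feature]], List[Feature]]:
        # No features are computed in this sketch.
        return []
```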
indexify/client.py
CHANGED
@@ -1,20 +1,23 @@
-import yaml
-import httpx
-import uuid
 import hashlib
 import json
+import logging
+import uuid
 from collections import namedtuple
-from
-from
-
-
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Union
+
+import httpx
+import yaml
+
+from indexify.exceptions import ApiException
+
+from .data_loaders import DataLoader
 from .error import Error
+from .extraction_policy import ExtractionGraph
 from .extractor_sdk.data import ContentMetadata
-from .
-from
-from
-from typing import List, Optional, Union, Dict
-import logging
+from .extractor_sdk.extractor import ExtractorMetadata
+from .settings import DEFAULT_SERVICE_URL, DEFAULT_SERVICE_URL_HTTPS
+from .utils import json_set_default

 Document = namedtuple("Document", ["text", "labels", "id"])

indexify/data_loaders/__init__.py
CHANGED
@@ -1,10 +1,10 @@
-
-
+import hashlib
+import mimetypes
+import os
 from abc import ABC, abstractmethod
 from typing import List
-
-import
-import hashlib
+
+from pydantic import BaseModel


 class FileMetadata(BaseModel):
indexify/extractor_sdk/__init__.py
CHANGED
@@ -1,5 +1,5 @@
-from .data import
-from .extractor import
+from .data import Content, ContentMetadata, Feature
+from .extractor import EmbeddingSchema, Extractor, ExtractorMetadata, extractor
 from .utils import SampleExtractorData

 __all__ = [
indexify/extractor_sdk/data.py
CHANGED
@@ -1,15 +1,7 @@
-from typing import (
-    Any,
-    List,
-    Optional,
-    Literal,
-    Dict,
-    Type,
-    cast,
-    Mapping,
-)
-from pydantic import BaseModel, Json, Field
 import json
+from typing import Any, Dict, List, Literal, Mapping, Optional, Type, cast
+
+from pydantic import BaseModel, Field, Json
 from typing_extensions import Annotated, Doc


@@ -103,6 +95,6 @@ class ContentMetadata(BaseModel):
     )


-class
+class File(BaseData):
     data: bytes
     mime_type: str
indexify/extractor_sdk/extractor.py
CHANGED
@@ -1,16 +1,29 @@
-from typing import Union, Optional, List, Type, Tuple, Callable, get_type_hints, Dict
 import inspect
-from pydantic import BaseModel, Field
-from abc import ABC, abstractmethod
-from .data import BaseData, Content, Feature
 import json
 import os
+from abc import ABC, abstractmethod
+from typing import (
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Tuple,
+    Type,
+    Union,
+    get_type_hints,
+)
+
 import requests
+from pydantic import BaseModel, Field
+
+from .data import BaseData, Content, Feature


 class EmbeddingSchema(BaseModel):
     dim: int
-    distance: str = "cosine"
+    distance: Optional[str] = "cosine"
+    database_url: Optional[str] = None
+

 class ExtractorMetadata(BaseModel):
     name: str
@@ -40,7 +53,7 @@ class Extractor(ABC):

     input_mime_types = ["text/plain"]

-
+    embedding_indexes: Dict[str, EmbeddingSchema] = {}

     @abstractmethod
     def extract(
@@ -55,31 +68,9 @@ class Extractor(ABC):
         pass

     @classmethod
-    @abstractmethod
     def sample_input(cls) -> Tuple[Content, Type[BaseModel]]:
         pass

-    def describe(self) -> ExtractorMetadata:
-        embedding_schemas = {}
-        try:
-            embedding_schemas = self.embedding_schemas
-        except NotImplementedError:
-            pass
-
-        json_schema = (
-            self._param_cls.model_json_schema() if self._param_cls is not None else None
-        )
-        return ExtractorMetadata(
-            name=self.name,
-            version=self.version,
-            description=self.description,
-            system_dependencies=self.system_dependencies,
-            python_dependencies=self.python_dependencies,
-            input_mime_types=self.input_mime_types,
-            embedding_schemas=embedding_schemas,
-            input_params=json.dumps(json_schema),
-        )
-
     def _download_file(self, url, filename):
         if os.path.exists(filename):
             # file exists skip
@@ -190,7 +181,7 @@ def extractor(
     python_dependencies: Optional[List[str]] = None,
     system_dependencies: Optional[List[str]] = None,
     input_mime_types: Optional[List[str]] = None,
-
+    embedding_indexes: Optional[Dict[str, EmbeddingSchema]] = None,
     sample_content: Optional[Callable] = None,
 ):
     args = locals()
@@ -198,7 +189,7 @@ def extractor(

     def construct(fn):
         def wrapper():
-
+            description = fn.__doc__ or args.get("description", "")

             if not args.get("name"):
                 args[
@@ -207,7 +198,7 @@ def extractor(

             class DecoratedFn(Extractor):
                 @classmethod
-                def extract(cls, input: Type[BaseData], params: Type[BaseModel]=None) -> List[Union[Type[BaseModel], Type[Feature]]]: # type: ignore
+                def extract(cls, input: Type[BaseData], params: Type[BaseModel] = None) -> List[Union[Type[BaseModel], Type[Feature]]]: # type: ignore
                     # TODO we can force all the functions to take in a parms object
                     # or check if someone adds a params
                     if params is None:
@@ -220,6 +211,7 @@ def extractor(

             for key, val in args.items():
                 setattr(DecoratedFn, key, val)
+            DecoratedFn.description = description

             return DecoratedFn

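The decorator gains an `embedding_indexes` argument mirroring the new `Extractor.embedding_indexes` class attribute. Below is a minimal sketch of a function-based extractor in the `@extractor(description=...)` / `def fn(content: Content) -> List[Content]` style that appears elsewhere in this diff; the chunking function, its name, and the `"chunks"` index name are illustrative, and how the server consumes `embedding_indexes` is an assumption.

```python
from typing import List

from indexify.extractor_sdk import Content, extractor
from indexify.extractor_sdk.extractor import EmbeddingSchema


@extractor(
    description="splits text into fixed-size chunks",
    # Assumption: embedding_indexes declares vector indexes for this extractor's
    # outputs, using the new EmbeddingSchema fields (dim, distance, database_url).
    embedding_indexes={"chunks": EmbeddingSchema(dim=384, distance="cosine")},
)
def chunk_text(content: Content) -> List[Content]:
    # Hypothetical helper: break the raw bytes into 512-character chunks.
    text = content.data.decode("utf-8")
    return [
        Content(
            content_type="text/plain",
            data=text[i : i + 512].encode("utf-8"),
            features=[],
        )
        for i in range(0, len(text), 512)
    ]
```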
indexify/extractor_sdk/utils.py
CHANGED
@@ -1,20 +1,17 @@
 import os
-import httpx
 from typing import List
+
+import httpx
+
 from .data import Content, Feature


 class SampleExtractorData:
-    def _download_file(self, url
-        if os.path.exists(filename):
-            # file exists skip
-            return
+    def _download_file(self, url):
         try:
-
-
-
-            for chunk in r.iter_content(chunk_size=8192):
-                f.write(chunk)
+            resp = httpx.get(url, follow_redirects=True)
+            resp.raise_for_status()
+            return resp.content
         except httpx.exceptions.RequestException as e:
             print(f"Error downloading the file: {e}")

@@ -55,13 +52,10 @@ class SampleExtractorData:
         return Content(content_type="image/jpg", data=f.read(), features=features)

     def sample_invoice_pdf(self, features: List[Feature] = []) -> Content:
-
-        self._download_file(
+        data = self._download_file(
             "https://extractor-files.diptanu-6d5.workers.dev/invoice-example.pdf",
-            file_name,
         )
-
-        return Content(content_type="application/pdf", data=f.read(), features=features)
+        return Content(content_type="application/pdf", data=data, features=features)

     def sample_image_based_pdf(self, features: List[Feature] = []) -> Content:
         file_name = "sample.pdf"
indexify/extractors/__init__.py
File without changes
indexify/extractors/embedding.py
ADDED
@@ -0,0 +1,55 @@
+from typing import List
+
+import torch
+import torch.nn.functional as F
+from transformers import AutoModel, AutoTokenizer
+
+from indexify.extractor_sdk.data import Feature
+from indexify.extractor_sdk.extractor import Extractor, Feature
+
+
+class SentenceTransformersEmbedding:
+    def __init__(self, model_name) -> None:
+        self._model_name = model_name
+        self._tokenizer = AutoTokenizer.from_pretrained(
+            f"sentence-transformers/{model_name}"
+        )
+        self._model = AutoModel.from_pretrained(
+            f"sentence-transformers/{model_name}", torchscript=True
+        )
+        self._model.eval()
+
+    def embed_batch(self, inputs: List[str]) -> List[List[float]]:
+        result = self._embed(inputs)
+        return result.tolist()
+
+    def embed(self, query: str) -> List[float]:
+        result = self._embed([query])
+        return result[0].tolist()
+
+    def _embed(self, inputs: List[str]) -> torch.Tensor:
+        encoded_input = self._tokenizer(
+            inputs, padding=True, truncation=True, return_tensors="pt"
+        )
+        sentence_embeddings = self._model(**encoded_input)
+        return F.normalize(sentence_embeddings, p=2, dim=1)
+
+
+class BasicSentenceTransformerModels(Extractor):
+    def __init__(self, model: str):
+        super().__init__()
+        self.model = SentenceTransformersEmbedding(model)
+
+    def extract(self, input: str) -> List[Feature]:
+        embeddings = self.model.embed(input)
+        return [Feature.embedding(values=embeddings)]
+
+
+class BasicHFTransformerEmbeddingModels(Extractor):
+    def __init__(self, model: str):
+        super().__init__()
+        self._model = AutoModel.from_pretrained(model, trust_remote_code=True)
+
+    def extract(self, input: str) -> List[Feature]:
+        embeddings = self.model.embed_query(input)
+        return [Feature.embedding(values=embeddings)]
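A hedged usage sketch for the new module: the checkpoint name is only an example, weights are downloaded from the Hugging Face hub on first use, and torch/transformers must be installed. Whether the packaged forward pass returns tensors of exactly this shape is assumed from the code above.

```python
from indexify.extractors.embedding import SentenceTransformersEmbedding

# "all-MiniLM-L6-v2" is an illustrative sentence-transformers checkpoint.
st = SentenceTransformersEmbedding(model_name="all-MiniLM-L6-v2")

query_vector = st.embed("what does indexify 0.0.41 add?")      # List[float]
batch_vectors = st.embed_batch(["first text", "second text"])  # List[List[float]]
print(len(query_vector), len(batch_vectors))
```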
indexify/extractors/pdf_parser.py
ADDED
@@ -0,0 +1,95 @@
+import tempfile
+from enum import Enum
+from typing import List, Optional
+
+import deepdoctection as dd
+from pydantic import BaseModel
+
+
+class PageFragmentType(str, Enum):
+    TEXT = "text"
+    FIGURE = "figure"
+    TABLE = "table"
+
+
+class Image(BaseModel):
+    data: bytes
+    mime_type: str
+
+
+class TableEncoding(str, Enum):
+    CSV = "csv"
+    HTML = "html"
+
+
+class Table(BaseModel):
+    data: str
+    encoding: TableEncoding
+
+
+class PageFragment(BaseModel):
+    fragment_type: PageFragmentType
+    text: Optional[str] = None
+    image: Optional[Image] = None
+    table: Optional[Table] = None
+    reading_order: Optional[int] = None
+
+
+class Page(BaseModel):
+    number: int
+    fragments: List[PageFragment]
+
+
+class PDFParser:
+    def __init__(self, data: bytes, language: Optional[str] = "en"):
+        self._data = data
+
+    def parse(self) -> List[Page]:
+        analyzer = dd.get_dd_analyzer()
+        parsed_pages = []
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as f:
+            f.write(self._data)
+            f.flush()
+            df = analyzer.analyze(path=f.name)
+            df.reset_state()
+            for page in df:
+                parsed_pages.append(page)
+        outputs: List[Page] = []
+        for parsed_page in parsed_pages:
+            page_num = parsed_page.page_number
+            fragments = []
+            for layout in parsed_page.layouts:
+                if layout.category_name in ["text", "title"]:
+                    fragments.append(
+                        PageFragment(
+                            fragment_type=PageFragmentType.TEXT,
+                            text=layout.text,
+                            reading_order=layout.reading_order,
+                        )
+                    )
+            figures = parsed_page.get_annotation(category_names=dd.LayoutType.FIGURE)
+            print(len(figures))
+            for figure in figures:
+                image_bytes = dd.viz_handler.encode(figure.viz())
+                fragments.append(
+                    PageFragment(
+                        fragment_type=PageFragmentType.FIGURE,
+                        image=Image(data=image_bytes, mime_type="image/png"),
+                        reading_order=figure.reading_order,
+                    )
+                )
+
+            tables = parsed_page.get_annotation(category_names=dd.LayoutType.TABLE)
+            print(len(tables))
+            for table in tables:
+                fragments.append(
+                    PageFragment(
+                        fragment_type=PageFragmentType.TABLE,
+                        table=Table(data=table.html, encoding=TableEncoding.HTML),
+                        reading_order=table.reading_order,
+                    )
+                )
+
+            outputs.append(Page(number=page_num, fragments=fragments))
+
+        return outputs
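A hypothetical usage sketch for the new parser: it requires the optional deepdoctection dependency, and `"invoice.pdf"` is a placeholder path, not a file shipped with the package.

```python
from indexify.extractors.pdf_parser import PageFragmentType, PDFParser

with open("invoice.pdf", "rb") as f:
    parser = PDFParser(f.read())

for page in parser.parse():
    for fragment in page.fragments:
        # Text fragments carry the extracted text; figures and tables carry
        # an Image or Table payload instead.
        if fragment.fragment_type == PageFragmentType.TEXT:
            print(page.number, fragment.text)
```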
indexify/graph.py
CHANGED
@@ -1,23 +1,133 @@
-
+import itertools
+import json
+from collections import defaultdict
+from typing import Any, Dict, List, Optional, Type, Union

-
+import cloudpickle
 from pydantic import BaseModel

-from .
-from .
+from .extractor_sdk import Content, Extractor, extractor
+from .runner import Runner


-
-
-
-    start_node: Union[extractor, Extractor],
-    run_local: bool,
-) -> RunGraph:
+@extractor(description="id function")
+def _id(content: Content) -> List[Content]:
+    return [content]

-    if run_local:
-        runner = LocalRunner()
-    else:
-        raise NotImplementedError("Remote runner not supported yet")

-
-    return graph
+def load_graph(graph: bytes) -> "Graph":
+    return cloudpickle.loads(graph)
+
+
+class Graph:
+    def __init__(
+        self, name: str, input: Type[BaseModel], start_node: extractor, runner: Runner
+    ):
+        # TODO check for cycles
+        self.name = name
+
+        self.nodes: Dict[str, Union[extractor, Extractor]] = {}
+        self.params: Dict[str, Any] = {}
+
+        self.edges: Dict[str, List[(str, str)]] = defaultdict(list)
+
+        self.nodes["start"] = _id
+        self.nodes["end"] = _id
+
+        self._topo_counter = defaultdict(int)
+
+        self._start_node = None
+        self._input = input
+
+        self.runner = runner
+
+    def get_extractor(self, name: str) -> Extractor:
+        return self.nodes[name]
+
+    def _node(self, extractor: Extractor, params: Any = None) -> "Graph":
+        name = extractor.name
+
+        # if you've already inserted a node just ignore the new insertion.
+        if name in self.nodes:
+            return
+
+        self.nodes[name] = extractor
+        self.params[name] = extractor.__dict__.get("params", None)
+
+        # assign each node a rank of 1 to init the graph
+        self._topo_counter[name] = 1
+
+        return self
+
+    def serialize(self):
+        return cloudpickle.dumps(self)
+
+    def add_edge(
+        self,
+        from_node: Type[Extractor],
+        to_node: Type[Extractor],
+        prefilter_predicates: Optional[str] = None,
+    ) -> "Graph":
+
+        self._node(from_node)
+        self._node(to_node)
+
+        from_node_name = from_node.name
+        to_node_name = to_node.name
+
+        self.edges[from_node_name].append((to_node_name, prefilter_predicates))
+
+        self._topo_counter[to_node_name] += 1
+
+        return self
+
+    """
+    Connect nodes as a fan out from one `from_node` to multiple `to_nodes` and respective `prefilter_predicates`.
+    Note: The user has to match the sizes of the lists to make sure they line up otherwise a None is used as a default.
+    """
+
+    def steps(
+        self,
+        from_node: extractor,
+        to_nodes: List[extractor],
+        prefilter_predicates: List[str] = [],
+    ) -> "Graph":
+        print(f"{to_nodes}, {prefilter_predicates}, {prefilter_predicates}")
+        for t_n, p in itertools.zip_longest(
+            to_nodes, prefilter_predicates, fillvalue=None
+        ):
+            self.step(from_node=from_node, to_node=t_n, prefilter_predicates=p)
+
+        return self
+
+    def add_param(self, node: extractor, params: Dict[str, Any]):
+        try:
+            # check if the params can be serialized since the server needs this
+            json.dumps(params)
+        except Exception:
+            raise Exception(f"For node {node.name}, cannot serialize params as json.")
+
+        self.params[node.name] = params
+
+    def run(self, wf_input, local):
+        self._assign_start_node()
+        self.runner.run(self, wf_input=wf_input)
+        pass
+
+    def clear_cache_for_node(self, node: Union[extractor, Extractor]):
+        if node.name not in self.nodes.keys():
+            raise Exception(f"Node with name {node.name} not found in graph")
+
+        self.runner.deleted_from_memo(node.name)
+
+    def clear_cache_for_all_nodes(self):
+        for node_name in self.nodes:
+            self.runner.deleted_from_memo(node_name=node_name)
+
+    def get_result(self, node: Union[extractor, Extractor]) -> Any:
+        return self.runner.results[node.name]
+
+    def _assign_start_node(self):
+        # this method should be called before a graph can be run
+        nodes = sorted(self._topo_counter.items(), key=lambda x: x[1])
+        self._start_node = nodes[0][0]
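Putting the new pieces together: a hedged sketch of building and running a graph locally, using only constructors and methods visible in this diff (`Graph(name, input, start_node, runner)`, `add_edge`, `run`, `serialize`/`load_graph`, and `LocalRunner` from `indexify.local_runner`). The two extractors, the input model, and passing a `Content` as the workflow input are illustrative assumptions, not documented behaviour.

```python
from typing import List

from pydantic import BaseModel

from indexify.extractor_sdk import Content, extractor
from indexify.graph import Graph, load_graph
from indexify.local_runner import LocalRunner


class TextDocument(BaseModel):
    # Hypothetical input model; the Graph constructor only asks for a BaseModel type.
    text: str


@extractor(description="uppercases the text")
def shout(content: Content) -> List[Content]:
    return [Content(content_type="text/plain", data=content.data.upper(), features=[])]


@extractor(description="reverses the text")
def reverse(content: Content) -> List[Content]:
    return [Content(content_type="text/plain", data=content.data[::-1], features=[])]


runner = LocalRunner()
g = Graph("demo-graph", input=TextDocument, start_node=shout, runner=runner)
g.add_edge(shout, reverse)

# Graphs can be round-tripped with cloudpickle (the new dependency in 0.0.41).
restored = load_graph(g.serialize())

# Assumption: the workflow input is handed to the runner as-is.
g.run(
    wf_input=Content(content_type="text/plain", data=b"hello indexify", features=[]),
    local=True,
)
```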
indexify/local_runner.py
CHANGED
@@ -2,15 +2,13 @@ import hashlib
 import os
 import pickle
 import shutil
-from pathlib import Path
-
-from indexify.extractor_sdk.data import BaseData, Feature
-from indexify.extractor_sdk.extractor import extractor, Extractor
-
 from collections import defaultdict
+from pathlib import Path
 from typing import Any, Callable, Dict, Optional, Union

-from indexify.
+from indexify.extractor_sdk.data import BaseData, Feature
+from indexify.extractor_sdk.extractor import Extractor, extractor
+from indexify.graph import Graph
 from indexify.runner import Runner


@@ -27,9 +25,9 @@ class LocalRunner(Runner):
     # those bytes have to be a python type

     # _input needs to be serializable into python object (ie json for ex) and Feature
-    def _run(self, g:
+    def _run(self, g: Graph, _input: BaseData, node_name: str):
         print(f"---- Starting node {node_name}")
-        print(f
+        print(f"node_name {node_name}")

         extractor_construct: Callable = g.nodes[node_name]
         params = g.params.get(node_name, None)
@@ -65,7 +63,9 @@ class LocalRunner(Runner):
         for out_edge, pre_filter_predicate in g.edges[node_name]:
             # TODO there are no reductions yet, each recursion finishes it's path and returns
             for r in data_to_process:
-                if self._prefilter_content(
+                if self._prefilter_content(
+                    content=r, prefilter_predicate=pre_filter_predicate
+                ):
                     continue

                 self._run(g, _input=r, node_name=out_edge)
@@ -73,7 +73,10 @@ class LocalRunner(Runner):
     """
     Returns True if content should be filtered
     """
-
+
+    def _prefilter_content(
+        self, content: BaseData, prefilter_predicate: Optional[str]
+    ) -> bool:
         if prefilter_predicate is None:
             return False

@@ -83,9 +86,9 @@ class LocalRunner(Runner):

         # TODO For now only support `and` and `=` and `string values`
         bools = []
-        metadata = content.get_features()[
+        metadata = content.get_features()["metadata"]
         for atom in atoms:
-            l, r = atom.split(
+            l, r = atom.split("=")
             if l in metadata:
                 bools.append(metadata[l] != r)

@@ -109,7 +112,7 @@ class LocalRunner(Runner):
         if not os.path.exists(file_path):
             return None

-        with open(file_path,
+        with open(file_path, "rb") as f:
             return f.read()

     def put_into_memo(self, node_name, input_hash, output):
@@ -121,5 +124,5 @@ class LocalRunner(Runner):

         Path(file_path).touch()

-        with open(file_path,
+        with open(file_path, "wb") as f:
             return f.write(output)
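The prefilter hunks show the predicate format the local runner understands: `and`-joined `key=value` atoms matched against the `"metadata"` entry of a content's features. The sketch below re-implements that check standalone to make the behaviour concrete; how the runner splits the predicate into atoms and how it combines the per-atom results is not visible in these hunks, so the `split(" and ")` and `any(...)` steps are assumptions, not the package's exact logic.

```python
from typing import Dict, Optional


def should_filter(metadata: Dict[str, str], prefilter_predicate: Optional[str]) -> bool:
    """Return True when a piece of content should be skipped for an edge."""
    if prefilter_predicate is None:
        return False
    # Assumption: atoms are joined with "and"; only `=` over string values is supported.
    atoms = prefilter_predicate.split(" and ")
    bools = []
    for atom in atoms:
        l, r = atom.split("=")
        if l in metadata:
            bools.append(metadata[l] != r)
    # Assumption: any mismatching atom filters the content out.
    return any(bools)


print(should_filter({"lang": "en"}, "lang=en"))  # False -> content continues downstream
print(should_filter({"lang": "fr"}, "lang=en"))  # True  -> content is filtered
```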
indexify/runner.py
CHANGED
@@ -1,9 +1,9 @@
 from abc import ABC
+from typing import Any, Union

 from indexify.extractor_sdk.data import BaseData
-from indexify.extractor_sdk.extractor import
+from indexify.extractor_sdk.extractor import Extractor, extractor

-from typing import Any, Union

 class Runner(ABC):
     def run(self, g, wf_input: BaseData):
{indexify-0.0.39.dist-info → indexify-0.0.41.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: indexify
-Version: 0.0.39
+Version: 0.0.41
 Summary: Python Client for Indexify
 Home-page: https://github.com/tensorlakeai/indexify
 License: Apache 2.0
@@ -13,6 +13,7 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Requires-Dist: cloudpickle (>=3,<4)
 Requires-Dist: httpx[http2] (>=0,<1)
 Requires-Dist: pydantic (>=2.8,<3.0)
 Requires-Dist: pyyaml (>=6,<7)
indexify-0.0.41.dist-info/RECORD
ADDED
@@ -0,0 +1,25 @@
+indexify/__init__.py,sha256=e4s2395B3gEGrZk2u5OZO2RtrXYFYUTItaM3mtlusBE,493
+indexify/base_client.py,sha256=HwT2KJNq8j-KiPVA9RJm-yearSjxifRjXTcP1zUVeo8,2784
+indexify/client.py,sha256=p4WDmYR94DjU0EqosuCKNGjbfh11qUID6TxDhTK6Uk4,26001
+indexify/data_loaders/__init__.py,sha256=Y5NEuseTcYAICRiweYw5wBQ2m2YplbsY21I7df-rdi4,1339
+indexify/data_loaders/local_directory_loader.py,sha256=fCrgj5drnW71ZUdDDvcB1-VJjIs1w6Q8sEW0HSGSAiA,1247
+indexify/data_loaders/url_loader.py,sha256=32SERljcq1Xsi4RdLz2dgyk2TER5pQPTtXl3gUzwHbY,1533
+indexify/error.py,sha256=3umTeYb0ugtUyehV1ibfvaeACxAONPyWPc-1HRN4d1M,856
+indexify/exceptions.py,sha256=vjd5SPPNFIEW35GorSIodsqvm9RKHQm9kdp8t9gv-WM,111
+indexify/extraction_policy.py,sha256=927BBtZBDPsLMm01uQDPCZnj3Pwmjh6L6QLHb4ShQKk,2076
+indexify/extractor_sdk/__init__.py,sha256=DOL-wJvIspWPqjFRBpmhMbnsMZC2JY-NtNwQGiE6IqU,348
+indexify/extractor_sdk/data.py,sha256=JpX9WdTpiuK72wn6QYhtqj5p5JiJu4waBrK-Hi7lNsA,2742
+indexify/extractor_sdk/extractor.py,sha256=IEZvr1Qe-dVmTgAeJFAhEyHUW20n4uTEeEassH3C5j4,9858
+indexify/extractor_sdk/utils.py,sha256=bW_D2eMWTzcAYZ8Lv7LUKGgOD0cyW77E6gNO3y7iNNA,6234
+indexify/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+indexify/extractors/embedding.py,sha256=Be6X4odSHbkAEm2myxB04RN-Mvb2bFk8uWXxUpY-Z6E,1859
+indexify/extractors/pdf_parser.py,sha256=XN-_b_W7CrpkTeWYs4H6hkK_mx-k4N2o1RSAVkQhr8Q,2842
+indexify/graph.py,sha256=UdvrpNc-SdD3U27Ee9aTMMYcSOUz__WQWc31oFHV4yQ,3963
+indexify/local_runner.py,sha256=uuMJbnT4qYMSySxsB3lEC7FSjYnJFh5eNZ00zu5gLNw,4387
+indexify/runner.py,sha256=VVmLGF1kAmEuE461Hs0QJFnSvVWtUzYhhQfB1KptYPU,637
+indexify/settings.py,sha256=LSaWZ0ADIVmUv6o6dHWRC3-Ry5uLbCw2sBSg1e_U7UM,99
+indexify/utils.py,sha256=rDN2lrsAs9noJEIjfx6ukmC2SAIyrlUt7QU-kaBjujM,125
+indexify-0.0.41.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+indexify-0.0.41.dist-info/METADATA,sha256=yTEubUvxQgDUcXrf6rxzvITsW6BDBjzG2LXxyl9A-O0,1913
+indexify-0.0.41.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+indexify-0.0.41.dist-info/RECORD,,
indexify/run_graph.py
DELETED
@@ -1,122 +0,0 @@
-import json
-
-from .extractor_sdk import Content, extractor, Extractor
-
-from collections import defaultdict
-from typing import Any, Dict, List, Optional, Type, Union
-from pydantic import BaseModel
-
-import itertools
-
-from .runner import Runner
-
-@extractor(description="id function")
-def _id(content: Content) -> List[Content]:
-    return [content]
-
-
-class RunGraph:
-    def __init__(self, name: str, input: Type[BaseModel], start_node: extractor, runner: Runner):
-        # TODO check for cycles
-        self.name = name
-
-        self.nodes: Dict[str, Union[extractor, Extractor]] = {}
-        self.params: Dict[str, Any] = {}
-
-        self.edges: Dict[str, List[(str, str)]] = defaultdict(list)
-
-        self.nodes["start"] = _id
-        self.nodes["end"] = _id
-
-        self._topo_counter = defaultdict(int)
-
-        self._start_node = None
-        self._input = input
-
-        self.runner = runner
-
-    def _node(self, extractor: Union[extractor, Extractor], params: Any = None) -> 'RunGraph':
-        name = extractor.name
-
-        # if you've already inserted a node just ignore the new insertion.
-        if name in self.nodes:
-            return
-
-        self.nodes[name] = extractor
-        self.params[name] = extractor.__dict__.get("params", None)
-
-        # assign each node a rank of 1 to init the graph
-        self._topo_counter[name] = 1
-
-        return self
-
-    def add_edge(
-        self,
-        from_node: extractor,
-        to_node: extractor,
-        prefilter_predicates: Optional[str] = None,
-    ) -> 'RunGraph':
-
-        self._node(from_node)
-        self._node(to_node)
-
-        from_node_name = from_node.name
-        to_node_name = to_node.name
-
-        self.edges[from_node_name].append((to_node_name, prefilter_predicates))
-
-        self._topo_counter[to_node_name] += 1
-
-        return self
-
-    """
-    Connect nodes as a fan out from one `from_node` to multiple `to_nodes` and respective `prefilter_predicates`.
-    Note: The user has to match the sizes of the lists to make sure they line up otherwise a None is used as a default.
-    """
-
-    def steps(
-        self,
-        from_node: extractor,
-        to_nodes: List[extractor],
-        prefilter_predicates: List[str] = [],
-    ) -> 'RunGraph':
-        print(f"{to_nodes}, {prefilter_predicates}, {prefilter_predicates}")
-        for t_n, p in itertools.zip_longest(
-            to_nodes, prefilter_predicates, fillvalue=None
-        ):
-            self.step(from_node=from_node, to_node=t_n, prefilter_predicates=p)
-
-        return self
-
-    def add_param(self, node: extractor, params: Dict[str, Any]):
-        try:
-            # check if the params can be serialized since the server needs this
-            json.dumps(params)
-        except Exception:
-            raise Exception(f"For node {node.name}, cannot serialize params as json.")
-
-        self.params[node.name] = params
-
-    def run(self, wf_input, local):
-        self._assign_start_node()
-        # self.runner = LocalRunner()
-        self.runner.run(self, wf_input=wf_input)
-        pass
-
-    def clear_cache_for_node(self, node: Union[extractor, Extractor]):
-        if node.name not in self.nodes.keys():
-            raise Exception(f"Node with name {node.name} not found in graph")
-
-        self.runner.deleted_from_memo(node.name)
-
-    def clear_cache_for_all_nodes(self):
-        for node_name in self.nodes:
-            self.runner.deleted_from_memo(node_name=node_name)
-
-    def get_result(self, node: Union[extractor, Extractor]) -> Any:
-        return self.runner.results[node.name]
-
-    def _assign_start_node(self):
-        # this method should be called before a graph can be run
-        nodes = sorted(self._topo_counter.items(), key=lambda x: x[1])
-        self._start_node = nodes[0][0]
indexify-0.0.39.dist-info/RECORD
DELETED
@@ -1,23 +0,0 @@
-indexify/__init__.py,sha256=ZDpPkRz4hBo6eqArhVBxqIscLSiD20q5rOHPYyOTloE,503
-indexify/base_client.py,sha256=Db-BNYQ6yNmOIXPaQN8W5qjTYvfFvPzoxC9206YRc-U,2755
-indexify/client.py,sha256=FPCO2DN6RstKLasmNrPxRhzBXDgM14tbc3eDDxl8J_A,25998
-indexify/data_loaders/__init__.py,sha256=TmOJLgKC5gM7_1n7zxYiuza3fOilIiYYupxBGd31PfA,1339
-indexify/data_loaders/local_directory_loader.py,sha256=0X_FgLS5unisJSij8LICv1htp8IdW09LbTIJ2wvVJg4,1246
-indexify/data_loaders/url_loader.py,sha256=shjw6dYBlaxA_PzP6qCB9TTtbPiY4h6FV7uopDbRQCc,1546
-indexify/error.py,sha256=3umTeYb0ugtUyehV1ibfvaeACxAONPyWPc-1HRN4d1M,856
-indexify/exceptions.py,sha256=vjd5SPPNFIEW35GorSIodsqvm9RKHQm9kdp8t9gv-WM,111
-indexify/extraction_policy.py,sha256=awNDqwCz0tr4jTQmGf7s8_s6vcEuxMb0xynEl7b7iPI,2076
-indexify/extractor_sdk/__init__.py,sha256=T512UtvFPUXEXlnT9HHHLHPcEau1Acoac_ksByuo7jA,348
-indexify/extractor_sdk/data.py,sha256=632fY4S_F_aYPLtOl_7dZnSAyMvVZY8ujSSIWJ9k104,2781
-indexify/extractor_sdk/extractor.py,sha256=CtlRn8JC8vGn9fm4QameA47x9T1l_cRpkJMUYYpetco,10457
-indexify/extractor_sdk/utils.py,sha256=_j8WflgOM0Qkf2NjhK2p1xXuwq4drLxO0mgKVPEHhlw,6594
-indexify/graph.py,sha256=fVZeGIcSqO3p8dGIQOdbuFYQ-8QaTQ7Jr37OefA2Phk,549
-indexify/local_runner.py,sha256=Ri-Wpw2qgnQ4I3fRR9qdXXRDASuZnu4-VR2xECG9gnY,4346
-indexify/run_graph.py,sha256=gw3IEf8-myVaHUV7g6LPt8-uSMIVr7S0Zs62aT7UB90,3757
-indexify/runner.py,sha256=M_3_GWYyPpb4lR5KFTpW8OAgp-fm9kYd_5xEqmiCBU4,637
-indexify/settings.py,sha256=LSaWZ0ADIVmUv6o6dHWRC3-Ry5uLbCw2sBSg1e_U7UM,99
-indexify/utils.py,sha256=rDN2lrsAs9noJEIjfx6ukmC2SAIyrlUt7QU-kaBjujM,125
-indexify-0.0.39.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-indexify-0.0.39.dist-info/METADATA,sha256=EvEM7lkuDP1YJsh0wskXIBMQxivHYPKfPNERLV0eaa0,1877
-indexify-0.0.39.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-indexify-0.0.39.dist-info/RECORD,,
{indexify-0.0.39.dist-info → indexify-0.0.41.dist-info}/LICENSE.txt
File without changes

{indexify-0.0.39.dist-info → indexify-0.0.41.dist-info}/WHEEL
File without changes