indexify 0.0.37__py3-none-any.whl → 0.0.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/__init__.py +2 -10
- indexify/base_client.py +67 -0
- indexify/client.py +24 -17
- indexify/data_loaders/__init__.py +8 -5
- indexify/data_loaders/local_directory_loader.py +10 -1
- indexify/data_loaders/url_loader.py +51 -0
- indexify/extractor_sdk/__init__.py +14 -0
- indexify/{data.py → extractor_sdk/data.py} +29 -4
- indexify/extractor_sdk/extractor.py +231 -0
- indexify/{extractor_utils.py → extractor_sdk/utils.py} +2 -2
- indexify/graph.py +17 -74
- indexify/local_runner.py +90 -30
- indexify/run_graph.py +122 -0
- indexify/runner.py +22 -0
- {indexify-0.0.37.dist-info → indexify-0.0.39.dist-info}/METADATA +3 -3
- indexify-0.0.39.dist-info/RECORD +23 -0
- indexify/extractor.py +0 -122
- indexify-0.0.37.dist-info/RECORD +0 -18
- {indexify-0.0.37.dist-info → indexify-0.0.39.dist-info}/LICENSE.txt +0 -0
- {indexify-0.0.37.dist-info → indexify-0.0.39.dist-info}/WHEEL +0 -0
indexify/__init__.py
CHANGED
@@ -6,24 +6,16 @@ from .client import (
     generate_hash_from_string,
     generate_unique_hex_id,
 )
-from .
-from .extractor import Extractor, extractor, EmbeddingSchema
+from . import extractor_sdk
 from .settings import DEFAULT_SERVICE_URL
 from . import data_loaders
 
 __all__ = [
-    "ContentMetadata",
-    "Content",
     "data_loaders",
-    "Feature",
-    "Extractor",
-    "extractor",
-    "EmbeddingSchema",
-    "extractor",
     "Document",
+    "extractor_sdk",
     "IndexifyClient",
     "ExtractionGraph",
-    "ExtractionGraphBuilder" "ExtractionPolicy",
    "DEFAULT_SERVICE_URL",
    "generate_hash_from_string",
    "generate_unique_hex_id",
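The net effect is that extractor symbols move from the package root into the extractor_sdk subpackage. A hedged sketch of the new import surface, assuming indexify 0.0.39 is installed:

# Under 0.0.39, extractor symbols come from extractor_sdk rather than
# the top-level package as they did in 0.0.37.
from indexify import IndexifyClient, extractor_sdk
from indexify.extractor_sdk import Content, Extractor, Feature, extractor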
indexify/base_client.py
ADDED
@@ -0,0 +1,67 @@
+from abc import ABC, abstractmethod
+from .extractor_sdk import Graph, Feature
+from typing import Any, List, Optional, Union, Dict
+
+class BaseClient(ABC):
+
+    ### Operational APIs
+    @abstractmethod
+    def register_extraction_graph(self, graph: Graph):
+        pass
+
+    @abstractmethod
+    def graphs(self) -> str:
+        pass
+
+    @abstractmethod
+    def namespaces(self) -> str:
+        pass
+
+    @abstractmethod
+    def create_namespace(self, namespace: str):
+        pass
+
+    ### Ingestion APIs
+    @abstractmethod
+    def invoke_graph_with_object(self, graph: str, object: Any) -> str:
+        """
+        Invokes a graph with an input object.
+        graph: str: The name of the graph to invoke
+        object: Any: The input object to the graph. It should be JSON serializable
+        return: str: The ID of the ingested object
+        """
+        pass
+
+    @abstractmethod
+    def invoke_graph_with_file(self, graph: str, path: str) -> str:
+        """
+        Invokes a graph with an input file. The file's mimetype is appropriately detected.
+        graph: str: The name of the graph to invoke
+        path: str: The path to the file to be ingested
+        return: str: The ID of the ingested object
+        """
+        pass
+
+
+    ### Retrieval APIs
+    @abstractmethod
+    def extracted_objects(self, graph: str, ingested_object_id: str, extractor_name: Optional[str]) -> Union[Dict[str, List[Any]], List[Any]]:
+        """
+        Returns the extracted objects by a graph for an ingested object. If the extractor name is provided, only the objects extracted by that extractor are returned.
+        If the extractor name is not provided, all the extracted objects are returned for the input object.
+        graph: str: The name of the graph
+        ingested_object_id: str: The ID of the ingested object
+        extractor_name: Optional[str]: The name of the extractor whose output is to be returned if provided
+        return: Union[Dict[str, List[Any]], List[Any]]: The extracted objects. If the extractor name is provided, the output is a list of extracted objects by the extractor. If the extractor name is not provided, the output is a dictionary with the extractor name as the key and the extracted objects as the value. If no objects are found, an empty list is returned.
+        """
+        pass
+
+    @abstractmethod
+    def features(self, object_id: str, graph: Optional[str]) -> Union[Dict[str, List[Feature]], List[Feature]]:
+        """
+        Returns the features of an object.
+        object_id: str: The ID of the object
+        return: List[Feature]: The features associated with the object that were extracted. If a graph name is provided, only the features extracted by that graph are returned.
+        """
+        pass
+
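For context on how this contract might be satisfied, here is a hedged sketch of a toy in-memory subclass. It is illustrative only; note that the extractor_sdk __init__ added in this release does not re-export Graph, so the module's own `from .extractor_sdk import Graph` line is suspect, and this sketch assumes that import resolves.

from typing import Any, Dict
from indexify.base_client import BaseClient

class InMemoryClient(BaseClient):
    """Toy client: stores ingested objects in a dict, does no extraction."""

    def __init__(self):
        self._objects: Dict[str, Any] = {}

    def register_extraction_graph(self, graph):
        pass  # a real client would ship the graph to the server

    def graphs(self) -> str:
        return ""

    def namespaces(self) -> str:
        return "default"

    def create_namespace(self, namespace: str):
        pass  # single-namespace toy

    def invoke_graph_with_object(self, graph: str, object: Any) -> str:
        object_id = str(len(self._objects))
        self._objects[object_id] = object
        return object_id

    def invoke_graph_with_file(self, graph: str, path: str) -> str:
        with open(path, "rb") as f:
            return self.invoke_graph_with_object(graph, f.read())

    def extracted_objects(self, graph, ingested_object_id, extractor_name=None):
        return []  # no extraction happens in this sketch

    def features(self, object_id, graph=None):
        return []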
indexify/client.py
CHANGED
@@ -5,11 +5,11 @@ import hashlib
 import json
 from collections import namedtuple
 from .settings import DEFAULT_SERVICE_URL, DEFAULT_SERVICE_URL_HTTPS
-from .extractor import
+from .extractor_sdk.extractor import ExtractorMetadata
 from .extraction_policy import ExtractionGraph
 from .utils import json_set_default
 from .error import Error
-from .data import
+from .extractor_sdk.data import ContentMetadata
 from .data_loaders import DataLoader
 from indexify.exceptions import ApiException
 from dataclasses import dataclass
@@ -326,7 +326,7 @@ class IndexifyClient:
         response = self.get(f"namespaces/{self.namespace}/indexes")
         return response.json()["indexes"]
 
-    def extractors(self) -> List[
+    def extractors(self) -> List[ExtractorMetadata]:
         """
         Get a list of all extractors.
 
@@ -337,7 +337,8 @@ class IndexifyClient:
         extractors_dict = response.json()["extractors"]
         extractors = []
         for ed in extractors_dict:
-
+            print(ed)
+            extractors.append(ExtractorMetadata.model_validate(ed))
         return extractors
 
     def get_extraction_graphs(self) -> List[ExtractionGraph]:
@@ -578,8 +579,8 @@ class IndexifyClient:
     def upload_file(
         self,
         extraction_graph: str,
-        path: str,
-        file_bytes:bytes=None,
+        path: str,
+        file_bytes: bytes = None,
         id=None,
         labels: dict = {},
     ) -> str:
@@ -605,18 +606,20 @@ class IndexifyClient:
             )
         else:
             response = self.post(
-
-
-
-
+                f"namespaces/{self.namespace}/extraction_graphs/{extraction_graph}/extract",
+                files={"file": (path, file_bytes)},
+                data={"labels": json.dumps(labels)},
+                params=params,
             )
             file_content = path
-
+
         response_json = response.json()
         content_id = response_json["content_id"]
         return content_id
-
-    def ingest_from_loader(
+
+    def ingest_from_loader(
+        self, loader: DataLoader, extraction_graph: str
+    ) -> List[str]:
         """
         Loads content using the loader, uploads them to Indexify and returns the content ids.
         loader: DataLoader: The DataLoader object to use for loading content
@@ -625,9 +628,13 @@ class IndexifyClient:
         content_ids = []
         files = loader.load()
         for file_metadata in files:
-            labels={"file_name": file_metadata.path}
-
-
+            labels = {"file_name": file_metadata.path}
+            content_id = self.upload_file(
+                extraction_graph,
+                file_metadata.path,
+                loader.read_all_bytes(file_metadata),
+                labels=labels,
+            )
             content_ids.append(content_id)
         return content_ids
 
@@ -702,7 +709,7 @@ class IndexifyClient:
         extraction_graph: str,
         url: str,
         mime_type: str,
-        labels: Dict[str, str],
+        labels: Dict[str, str] = {},
         id=None,
     ):
         req = {
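ingest_from_loader now routes file bytes through the loader's read_all_bytes rather than through the metadata object. A hedged usage sketch, assuming a reachable Indexify server at the default service URL and an existing extraction graph named "pdf-graph":

from indexify import IndexifyClient
from indexify.data_loaders import LocalDirectoryLoader

client = IndexifyClient()  # defaults to DEFAULT_SERVICE_URL
loader = LocalDirectoryLoader("/tmp/docs", file_extensions=[".pdf"])
content_ids = client.ingest_from_loader(loader, "pdf-graph")
print(content_ids)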
indexify/data_loaders/__init__.py
CHANGED
@@ -6,6 +6,7 @@ import os
 import mimetypes
 import hashlib
 
+
 class FileMetadata(BaseModel):
     path: str
     file_size: int
@@ -38,18 +39,20 @@ class FileMetadata(BaseModel):
             updated_at=updated_at,
         )
 
-    def read_all_bytes(self) -> bytes:
-        with open(self.path, "rb") as f:
-            return f.read()
-
 
 class DataLoader(ABC):
     @abstractmethod
     def load(self) -> List[FileMetadata]:
         pass
 
+    @abstractmethod
+    def read_all_bytes(self, file_metadata: FileMetadata) -> bytes:
+        pass
+
     @abstractmethod
     def state(self) -> dict:
         pass
 
-
+
+from .local_directory_loader import LocalDirectoryLoader
+from .url_loader import UrlLoader
indexify/data_loaders/local_directory_loader.py
CHANGED
@@ -4,7 +4,12 @@ import os
 
 
 class LocalDirectoryLoader(DataLoader):
-    def __init__(
+    def __init__(
+        self,
+        directory: str,
+        file_extensions: Optional[List[str]] = None,
+        state: dict = {},
+    ):
         self.directory = directory
         self.file_extensions = file_extensions
         self.processed_files = set(state.get("processed_files", []))
@@ -23,5 +28,9 @@ class LocalDirectoryLoader(DataLoader):
 
         return file_metadata_list
 
+    def read_all_bytes(self, file: FileMetadata) -> bytes:
+        with open(file.path, "rb") as f:
+            return f.read()
+
     def state(self) -> dict:
         return {"processed_files": list(self.processed_files)}
indexify/data_loaders/url_loader.py
ADDED
@@ -0,0 +1,51 @@
+from . import DataLoader, FileMetadata
+from typing import List
+import httpx
+import hashlib
+import email.utils
+
+
+def convert_date_to_epoch(date_str: str) -> int:
+    """
+    Convert a date string from URL header to Unix epoch time.
+
+    Args:
+        date_str (str): The date string from the URL header.
+
+    Returns:
+        int: The Unix epoch time.
+    """
+    if not date_str:
+        return 0
+    parsed_date = email.utils.parsedate_to_datetime(date_str)
+    return int(parsed_date.timestamp())
+
+
+class UrlLoader(DataLoader):
+    def __init__(self, urls: List[str], state: dict = {}):
+        self.urls = urls
+
+    def load(self) -> List[FileMetadata]:
+        file_metadata_list = []
+        for url in self.urls:
+            response = httpx.head(url, follow_redirects=True)
+            file_metadata_list.append(
+                FileMetadata(
+                    path=url,
+                    file_size=response.headers.get("content-length", 0),
+                    mime_type=response.headers.get("content-type"),
+                    md5_hash="",
+                    created_at=convert_date_to_epoch(response.headers.get("date")),
+                    updated_at=convert_date_to_epoch(
+                        response.headers.get("last-modified")
+                    ),
+                )
+            )
+        return file_metadata_list
+
+    def read_all_bytes(self, file: FileMetadata) -> bytes:
+        response = httpx.get(file.path, follow_redirects=True)
+        return response.content
+
+    def state(self) -> dict:
+        return {}
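A hedged usage sketch of the new UrlLoader; the URLs are placeholders. load() issues HEAD requests to build metadata, and read_all_bytes() fetches the body:

from indexify.data_loaders import UrlLoader

loader = UrlLoader(["https://example.com/a.pdf", "https://example.com/b.pdf"])
for meta in loader.load():
    data = loader.read_all_bytes(meta)
    print(meta.path, meta.mime_type, len(data))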
indexify/extractor_sdk/__init__.py
ADDED
@@ -0,0 +1,14 @@
+from .data import ContentMetadata, Content, Feature
+from .extractor import Extractor, extractor, EmbeddingSchema, ExtractorMetadata
+from .utils import SampleExtractorData
+
+__all__ = [
+    "ContentMetadata",
+    "Content",
+    "Feature",
+    "Extractor",
+    "extractor",
+    "EmbeddingSchema",
+    "ExtractorMetadata",
+    "SampleExtractorData",
+]
indexify/{data.py → extractor_sdk/data.py}
RENAMED
@@ -1,6 +1,26 @@
-from typing import
+from typing import (
+    Any,
+    List,
+    Optional,
+    Literal,
+    Dict,
+    Type,
+    cast,
+    Mapping,
+)
 from pydantic import BaseModel, Json, Field
 import json
+from typing_extensions import Annotated, Doc
+
+
+class BaseData(BaseModel):
+    meta: Mapping[str, Type[BaseModel]] = {}
+
+    def get_features(self) -> List[Type[BaseModel]]:
+        return self.meta
+
+    def get_feature(self, name: str) -> Optional[Type[BaseModel]]:
+        return self.meta.get(name)
 
 
 class Feature(BaseModel):
@@ -14,7 +34,7 @@ class Feature(BaseModel):
         return cls(
             feature_type="embedding",
             name=name,
-            value={values: values, distance: distance},
+            value=json.dumps({"values": values, "distance": distance}),
             comment=None,
         )
 
@@ -26,7 +46,7 @@ class Feature(BaseModel):
 
 
 class Content(BaseModel):
-    id: str
+    id: Optional[str] = (None,)
     content_type: Optional[str]
     data: bytes
     features: List[Feature] = []
@@ -38,7 +58,7 @@ class Content(BaseModel):
         features: List[Feature] = [],
     ):
         return Content(
-            id=
+            id=None,
            content_type="text/plain",
            data=bytes(text, "utf-8"),
            features=features,
@@ -81,3 +101,8 @@ class ContentMetadata(BaseModel):
             mime_type=json["mime_type"],
             extracted_metadata=json["extracted_metadata"],
         )
+
+
+class PDFFile(BaseData):
+    data: bytes
+    mime_type: str
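BaseData is the new envelope that pairs a typed payload with extracted features kept in meta; PDFFile above follows the same pattern. A small illustrative subclass (TextChunk is hypothetical, not part of the package):

from indexify.extractor_sdk.data import BaseData

class TextChunk(BaseData):  # hypothetical payload type
    text: str
    page: int

chunk = TextChunk(text="hello world", page=1)
print(chunk.get_features())            # the meta mapping, empty by default
print(chunk.get_feature("embedding"))  # None until an extractor attaches one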
indexify/extractor_sdk/extractor.py
ADDED
@@ -0,0 +1,231 @@
+from typing import Union, Optional, List, Type, Tuple, Callable, get_type_hints, Dict
+import inspect
+from pydantic import BaseModel, Field
+from abc import ABC, abstractmethod
+from .data import BaseData, Content, Feature
+import json
+import os
+import requests
+
+
+class EmbeddingSchema(BaseModel):
+    dim: int
+    distance: str = "cosine"
+
+class ExtractorMetadata(BaseModel):
+    name: str
+    version: str
+    description: str
+    input_mime_types: List[str]
+    system_dependencies: List[str]
+    python_dependencies: List[str]
+    input_mime_types: List[str]
+    embedding_schemas: Dict[str, EmbeddingSchema]
+    # Make this a dynamic model since its a json schema
+    input_params: Optional[Dict]
+    # for backward compatibility
+    metadata_schemas: Optional[Dict]
+
+
+class Extractor(ABC):
+    name: str = ""
+
+    version: str = "0.0.0"
+
+    system_dependencies: List[str] = []
+
+    python_dependencies: List[str] = []
+
+    description: str = ""
+
+    input_mime_types = ["text/plain"]
+
+    embeddings: Dict[str, EmbeddingSchema] = {}
+
+    @abstractmethod
+    def extract(
+        self, input: Type[BaseModel], params: Type[BaseModel] = None
+    ) -> List[Union[Feature, Type[BaseModel]]]:
+        """
+        Extracts information from the content. Returns a list of features to add
+        to the content.
+        It can also return a list of Content objects, which will be added to storage
+        and any extraction policies defined will be applied to them.
+        """
+        pass
+
+    @classmethod
+    @abstractmethod
+    def sample_input(cls) -> Tuple[Content, Type[BaseModel]]:
+        pass
+
+    def describe(self) -> ExtractorMetadata:
+        embedding_schemas = {}
+        try:
+            embedding_schemas = self.embedding_schemas
+        except NotImplementedError:
+            pass
+
+        json_schema = (
+            self._param_cls.model_json_schema() if self._param_cls is not None else None
+        )
+        return ExtractorMetadata(
+            name=self.name,
+            version=self.version,
+            description=self.description,
+            system_dependencies=self.system_dependencies,
+            python_dependencies=self.python_dependencies,
+            input_mime_types=self.input_mime_types,
+            embedding_schemas=embedding_schemas,
+            input_params=json.dumps(json_schema),
+        )
+
+    def _download_file(self, url, filename):
+        if os.path.exists(filename):
+            # file exists skip
+            return
+        try:
+            with requests.get(url, stream=True) as r:
+                r.raise_for_status()  # Raises an HTTPError if the response status code is 4XX/5XX
+                with open(filename, "wb") as f:
+                    for chunk in r.iter_content(chunk_size=8192):
+                        f.write(chunk)
+        except requests.exceptions.RequestException as e:
+            print(f"Error downloading the file: {e}")
+
+    def sample_mp3(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.mp3"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/sample-000009.mp3",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="audio/mpeg", data=f.read(), features=features)
+
+    def sample_mp4(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.mp4"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/sample.mp4",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="video/mp4", data=f.read(), features=features)
+
+    def sample_jpg(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.jpg"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/people-standing.jpg",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="image/jpg", data=f.read(), features=features)
+
+    def sample_invoice_jpg(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.jpg"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/invoice-example.jpg",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="image/jpg", data=f.read(), features=features)
+
+    def sample_invoice_pdf(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.pdf"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/invoice-example.pdf",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="application/pdf", data=f.read(), features=features)
+
+    def sample_image_based_pdf(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.pdf"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/image-based.pdf",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="application/pdf", data=f.read(), features=features)
+
+    def sample_scientific_pdf(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.pdf"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/scientific-paper-example.pdf",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="application/pdf", data=f.read(), features=features)
+
+    def sample_presentation(self, features: List[Feature] = []) -> Content:
+        file_name = "test.pptx"
+        self._download_file(
+            "https://raw.githubusercontent.com/tensorlakeai/indexify/main/docs/docs/files/test.pptx",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(
+            content_type="application/vnd.openxmlformats-officedocument.presentationml.presentation",
+            data=f.read(),
+            features=features,
+        )
+
+    def sample_text(self, features: List[Feature] = []) -> Content:
+        article = """New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 2010 marriage license application, according to court documents. Prosecutors said the marriages were part of an immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages. Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted. The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces up to four years in prison. Her next court appearance is scheduled for May 18."""
+        return Content(content_type="text/plain", data=article, features=features)
+
+    def sample_html(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.html"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/sample.html",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="text/html", data=f.read(), features=features)
+
+
+def extractor(
+    name: Optional[str] = None,
+    description: Optional[str] = "",
+    version: Optional[str] = "",
+    python_dependencies: Optional[List[str]] = None,
+    system_dependencies: Optional[List[str]] = None,
+    input_mime_types: Optional[List[str]] = None,
+    embedding_schemas: Optional[Dict[str, EmbeddingSchema]] = None,
+    sample_content: Optional[Callable] = None,
+):
+    args = locals()
+    del args["sample_content"]
+
+    def construct(fn):
+        def wrapper():
+            hint = get_type_hints(fn).get("params", dict)
+
+            if not args.get("name"):
+                args[
+                    "name"
+                ] = f"{inspect.getmodule(inspect.stack()[1][0]).__name__}:{fn.__name__}"
+
+            class DecoratedFn(Extractor):
+                @classmethod
+                def extract(cls, input: Type[BaseData], params: Type[BaseModel]=None) -> List[Union[Type[BaseModel], Type[Feature]]]:  # type: ignore
+                    # TODO we can force all the functions to take in a parms object
+                    # or check if someone adds a params
+                    if params is None:
+                        return fn(input)
+                    else:
+                        return fn(input, params)
+
+                def sample_input(self) -> Content:
+                    return sample_content() if sample_content else self.sample_text()
+
+            for key, val in args.items():
+                setattr(DecoratedFn, key, val)
+
+            return DecoratedFn
+
+        wrapper._extractor_name = fn.__name__
+        wrapper.name = fn.__name__
+
+        return wrapper
+
+    return construct
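A hedged sketch of a function-style extractor built with the decorator above; `chunker` and its chunk size are illustrative. Calling the decorated wrapper materializes the Extractor subclass, whose extract classmethod forwards to the wrapped function:

from typing import List
from indexify.extractor_sdk import Content, extractor

@extractor(description="split text into fixed-size chunks")
def chunker(content: Content) -> List[Content]:
    # toy chunking: 16-character windows over the decoded text
    text = content.data.decode("utf-8")
    return [
        Content(id=None, content_type="text/plain",
                data=text[i:i + 16].encode(), features=[])
        for i in range(0, len(text), 16)
    ]

ChunkerExtractor = chunker()  # wrapper() returns the DecoratedFn class
chunks = ChunkerExtractor.extract(
    input=Content(id=None, content_type="text/plain",
                  data=b"some text to split into pieces", features=[])
)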
indexify/{extractor_utils.py → extractor_sdk/utils.py}
RENAMED
@@ -3,8 +3,8 @@ import httpx
 from typing import List
 from .data import Content, Feature
 
-class SampleExtractorData:
 
+class SampleExtractorData:
     def _download_file(self, url, filename):
         if os.path.exists(filename):
             # file exists skip
@@ -105,4 +105,4 @@ class SampleExtractorData:
             file_name,
         )
         f = open(file_name, "rb")
-        return Content(content_type="text/html", data=f.read(), features=features)
+        return Content(content_type="text/html", data=f.read(), features=features)
indexify/graph.py
CHANGED
@@ -1,80 +1,23 @@
-from
-from indexify.extractor import Extractor
+from .extractor_sdk import extractor, Extractor
 
-from
-from
+from typing import Type, Union
+from pydantic import BaseModel
 
-import
+from .run_graph import RunGraph
+from .local_runner import LocalRunner
 
 
-
-
-
+def Graph(
+    name: str,
+    input: Type[BaseModel],
+    start_node: Union[extractor, Extractor],
+    run_local: bool,
+) -> RunGraph:
 
-
-
-
-
+    if run_local:
+        runner = LocalRunner()
+    else:
+        raise NotImplementedError("Remote runner not supported yet")
 
-
-
-
-        self.edges: Dict[str, List[(str, str)]] = defaultdict(list)
-
-        self.results: Dict[str, Any] = defaultdict(list)  # TODO should the Any be Content?
-
-        self.nodes["start"] = _id
-        self.nodes["end"] = _id
-
-        self._topo_counter = defaultdict(int)
-
-        self._start_node = None
-
-    def _node(self, extractor: Extractor, params: Any = None) -> Self:
-        name = extractor._extractor_name
-
-        # if you've already inserted a node just ignore the new insertion.
-        if name in self.nodes:
-            return
-
-        self.nodes[name] = extractor
-        self.params[name] = extractor.__dict__.get('params', None)
-
-        # assign each node a rank of 1 to init the graph
-        self._topo_counter[name] = 1
-
-        return self
-
-    def step(self,
-        from_node: extractor,
-        to_node: extractor,
-        prefilter_predicates: Optional[str] = None
-    ) -> Self:
-
-        self._node(from_node)
-        self._node(to_node)
-
-        from_node_name = from_node._extractor_name
-        to_node_name = to_node._extractor_name
-
-        self.edges[from_node_name].append((to_node_name, prefilter_predicates))
-
-        self._topo_counter[to_node_name] += 1
-
-        return self
-
-    """
-    Connect nodes as a fan out from one `from_node` to multiple `to_nodes` and respective `prefilter_predicates`.
-    Note: The user has to match the sizes of the lists to make sure they line up otherwise a None is used as a default.
-    """
-    def steps(self, from_node: extractor, to_nodes: List[extractor], prefilter_predicates: List[str] = []) -> Self:
-        print(f'{to_nodes}, {prefilter_predicates}, {prefilter_predicates}')
-        for t_n, p in itertools.zip_longest(to_nodes, prefilter_predicates, fillvalue=None):
-            self.step(from_node=from_node, to_node=t_n, prefilter_predicates=p)
-
-        return self
-
-    def _assign_start_node(self):
-        # this method should be called before a graph can be run
-        nodes = sorted(self._topo_counter.items(), key=lambda x: x[1])
-        self._start_node = nodes[0][0]
+    graph = RunGraph(name=name, input=input, start_node=start_node, runner=runner)
+    return graph
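Since graph.py now reduces to a factory, constructing a graph is a one-liner once a start node exists. A hedged construction sketch; `passthrough` and `WfInput` are illustrative:

from typing import List
from pydantic import BaseModel
from indexify.extractor_sdk import Content, extractor
from indexify.graph import Graph

@extractor(description="identity")
def passthrough(content: Content) -> List[Content]:
    return [content]

class WfInput(BaseModel):  # illustrative input model
    text: str

g = Graph(name="demo", input=WfInput, start_node=passthrough, run_local=True)
# run_local=False raises NotImplementedError("Remote runner not supported yet")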
indexify/local_runner.py
CHANGED
@@ -1,65 +1,125 @@
-
+import hashlib
+import os
+import pickle
+import shutil
+from pathlib import Path
+
+from indexify.extractor_sdk.data import BaseData, Feature
+from indexify.extractor_sdk.extractor import extractor, Extractor
 
 from collections import defaultdict
-from typing import Any, Callable, Dict, Optional
+from typing import Any, Callable, Dict, Optional, Union
+
+from indexify.run_graph import RunGraph
+from indexify.runner import Runner
 
-import json
 
-class LocalRunner:
+class LocalRunner(Runner):
     def __init__(self):
-        self.results: Dict[str, Any] = defaultdict(
+        self.results: Dict[str, Any] = defaultdict(
+            list
+        )  # TODO should the Any be Content?
+
+    def run(self, g, wf_input: BaseData):
+        return self._run(g, _input=wf_input, node_name=g._start_node)
 
-
-
-
+    # graph is getting some files which are files, some lables and the MIME type of the bytes
+    # those bytes have to be a python type
+
+    # _input needs to be serializable into python object (ie json for ex) and Feature
+    def _run(self, g: RunGraph, _input: BaseData, node_name: str):
+        print(f"---- Starting node {node_name}")
+        print(f'node_name {node_name}')
 
-    def _run(self, g, content: Content, node_name: str):
         extractor_construct: Callable = g.nodes[node_name]
         params = g.params.get(node_name, None)
 
-
+        # NOTE: User should clear cache for nodes they would like to re-rerun
+        input_hash = hashlib.sha256(str(_input).encode()).hexdigest()
+        memo_output = self.get_from_memo(node_name, input_hash)
+        if memo_output is None:
+            print("=== FYI Writing output to cache")
+            res = extractor_construct().extract(input=_input, params=params)
+            self.put_into_memo(node_name, input_hash, pickle.dumps(res))
+        else:
+            print("=== Reading output from cache")
+            res = pickle.loads(memo_output)
+
+        if not isinstance(res, list):
+            res = [res]
 
-
+        res_data = [i for i in res if not isinstance(i, Feature)]
+        res_features = [i for i in res if isinstance(i, Feature)]
 
-        self.results[node_name].extend(
+        self.results[node_name].extend(res_data)
+
+        for f in res_features:
+            _input.meta[f.name] = f.value
+
+        # this assume that if an extractor emits features then the next edge will always process
+        # the edges
+        data_to_process = res_data
+        if len(res_features) > 0:
+            data_to_process.append(_input)
 
         for out_edge, pre_filter_predicate in g.edges[node_name]:
             # TODO there are no reductions yet, each recursion finishes it's path and returns
-            for r in
+            for r in data_to_process:
                 if self._prefilter_content(content=r, prefilter_predicate=pre_filter_predicate):
                     continue
 
-                self._run(g,
+                self._run(g, _input=r, node_name=out_edge)
 
     """
     Returns True if content should be filtered
     """
-    def _prefilter_content(self, content:
+    def _prefilter_content(self, content: BaseData, prefilter_predicate: Optional[str]) -> bool:
        if prefilter_predicate is None:
            return False
 
-        atoms = prefilter_predicate.split(
+        atoms = prefilter_predicate.split("and")
        if len(atoms) == 0:
            return False
 
        # TODO For now only support `and` and `=` and `string values`
        bools = []
-
-
-
+        metadata = content.get_features()['metadata']
+        for atom in atoms:
+            l, r = atom.split('=')
+            if l in metadata:
+                bools.append(metadata[l] != r)
 
-
+        return all(bools)
 
-
-
-
-        print(f'predicates[l], r: {predicates[l], r}')
-        bools.append(predicates[l] != r)
+    def get_result(self, node: Union[extractor, Extractor]) -> Any:
+        node_name = node.name
+        return self.results[node_name]
 
-
+    def deleted_from_memo(self, node_name):
+        path_prefix = f"./indexify_local_runner_cache/{node_name}"
 
-
+        if os.path.exists(path_prefix) and os.path.isdir(path_prefix):
+            shutil.rmtree(path_prefix)
 
-    def
-
-
+    def get_from_memo(self, node_name, input_hash):
+        path_prefix = f"./indexify_local_runner_cache/{node_name}"
+        file_name = f"{input_hash}"
+        file_path = f"{path_prefix}/{file_name}"
+
+        if not os.path.exists(file_path):
+            return None
+
+        with open(file_path, 'rb') as f:
+            return f.read()
+
+    def put_into_memo(self, node_name, input_hash, output):
+        path_prefix = f"./indexify_local_runner_cache/{node_name}"
+        file_name = f"{input_hash}"
+        file_path = f"{path_prefix}/{file_name}"
+
+        os.makedirs(path_prefix, exist_ok=True)
+
+        Path(file_path).touch()
+
+        with open(file_path, 'wb') as f:
+            return f.write(output)
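The memo cache is plain files: pickled node output stored at ./indexify_local_runner_cache/<node_name>/<sha256 of the stringified input>. A hedged sketch of inspecting and clearing it by hand ("my_node" is a placeholder):

import os
import shutil

cache_root = "./indexify_local_runner_cache"
if os.path.isdir(cache_root):
    for node_name in os.listdir(cache_root):
        count = len(os.listdir(os.path.join(cache_root, node_name)))
        print(node_name, count, "cached result(s)")

# roughly what LocalRunner.deleted_from_memo("my_node") does:
shutil.rmtree(os.path.join(cache_root, "my_node"), ignore_errors=True)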
indexify/run_graph.py
ADDED
@@ -0,0 +1,122 @@
+import json
+
+from .extractor_sdk import Content, extractor, Extractor
+
+from collections import defaultdict
+from typing import Any, Dict, List, Optional, Type, Union
+from pydantic import BaseModel
+
+import itertools
+
+from .runner import Runner
+
+@extractor(description="id function")
+def _id(content: Content) -> List[Content]:
+    return [content]
+
+
+class RunGraph:
+    def __init__(self, name: str, input: Type[BaseModel], start_node: extractor, runner: Runner):
+        # TODO check for cycles
+        self.name = name
+
+        self.nodes: Dict[str, Union[extractor, Extractor]] = {}
+        self.params: Dict[str, Any] = {}
+
+        self.edges: Dict[str, List[(str, str)]] = defaultdict(list)
+
+        self.nodes["start"] = _id
+        self.nodes["end"] = _id
+
+        self._topo_counter = defaultdict(int)
+
+        self._start_node = None
+        self._input = input
+
+        self.runner = runner
+
+    def _node(self, extractor: Union[extractor, Extractor], params: Any = None) -> 'RunGraph':
+        name = extractor.name
+
+        # if you've already inserted a node just ignore the new insertion.
+        if name in self.nodes:
+            return
+
+        self.nodes[name] = extractor
+        self.params[name] = extractor.__dict__.get("params", None)
+
+        # assign each node a rank of 1 to init the graph
+        self._topo_counter[name] = 1
+
+        return self
+
+    def add_edge(
+        self,
+        from_node: extractor,
+        to_node: extractor,
+        prefilter_predicates: Optional[str] = None,
+    ) -> 'RunGraph':
+
+        self._node(from_node)
+        self._node(to_node)
+
+        from_node_name = from_node.name
+        to_node_name = to_node.name
+
+        self.edges[from_node_name].append((to_node_name, prefilter_predicates))
+
+        self._topo_counter[to_node_name] += 1
+
+        return self
+
+    """
+    Connect nodes as a fan out from one `from_node` to multiple `to_nodes` and respective `prefilter_predicates`.
+    Note: The user has to match the sizes of the lists to make sure they line up otherwise a None is used as a default.
+    """
+
+    def steps(
+        self,
+        from_node: extractor,
+        to_nodes: List[extractor],
+        prefilter_predicates: List[str] = [],
+    ) -> 'RunGraph':
+        print(f"{to_nodes}, {prefilter_predicates}, {prefilter_predicates}")
+        for t_n, p in itertools.zip_longest(
+            to_nodes, prefilter_predicates, fillvalue=None
+        ):
+            self.step(from_node=from_node, to_node=t_n, prefilter_predicates=p)
+
+        return self
+
+    def add_param(self, node: extractor, params: Dict[str, Any]):
+        try:
+            # check if the params can be serialized since the server needs this
+            json.dumps(params)
+        except Exception:
+            raise Exception(f"For node {node.name}, cannot serialize params as json.")
+
+        self.params[node.name] = params
+
+    def run(self, wf_input, local):
+        self._assign_start_node()
+        # self.runner = LocalRunner()
+        self.runner.run(self, wf_input=wf_input)
+        pass
+
+    def clear_cache_for_node(self, node: Union[extractor, Extractor]):
+        if node.name not in self.nodes.keys():
+            raise Exception(f"Node with name {node.name} not found in graph")
+
+        self.runner.deleted_from_memo(node.name)
+
+    def clear_cache_for_all_nodes(self):
+        for node_name in self.nodes:
+            self.runner.deleted_from_memo(node_name=node_name)
+
+    def get_result(self, node: Union[extractor, Extractor]) -> Any:
+        return self.runner.results[node.name]
+
+    def _assign_start_node(self):
+        # this method should be called before a graph can be run
+        nodes = sorted(self._topo_counter.items(), key=lambda x: x[1])
+        self._start_node = nodes[0][0]
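Putting run_graph, the runner, and the extractor decorator together, an end-to-end local run looks roughly like the sketch below; node names and input are illustrative. add_edge registers both nodes, which lets the topology counter pick the start node. (Note that steps still calls self.step, which was renamed to add_edge in this release, so fan-out wiring likely fails; the sketch sticks to add_edge.)

from typing import List
from pydantic import BaseModel
from indexify.extractor_sdk import Content, extractor
from indexify.graph import Graph

@extractor(description="uppercase")
def upper(content: Content) -> List[Content]:
    return [Content(id=None, content_type="text/plain",
                    data=content.data.upper(), features=[])]

@extractor(description="byte length")
def length(content: Content) -> List[Content]:
    return [Content(id=None, content_type="text/plain",
                    data=str(len(content.data)).encode(), features=[])]

class WfInput(BaseModel):  # illustrative; the graph's declared input model
    text: str

g = Graph(name="demo", input=WfInput, start_node=upper, run_local=True)
g.add_edge(from_node=upper, to_node=length)
g.run(wf_input=Content(id=None, content_type="text/plain",
                       data=b"hello", features=[]), local=True)
print(g.get_result(length))  # results collected by the LocalRunner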
indexify/runner.py
ADDED
@@ -0,0 +1,22 @@
+from abc import ABC
+
+from indexify.extractor_sdk.data import BaseData
+from indexify.extractor_sdk.extractor import extractor, Extractor
+
+from typing import Any, Union
+
+class Runner(ABC):
+    def run(self, g, wf_input: BaseData):
+        raise NotImplementedError()
+
+    def get_result(self, node: Union[extractor, Extractor]) -> Any:
+        raise NotImplementedError()
+
+    def deleted_from_memo(self, node_name):
+        raise NotImplementedError()
+
+    def get_from_memo(self, node_name, input_hash):
+        raise NotImplementedError()
+
+    def put_into_memo(self, node_name, input_hash, output):
+        raise NotImplementedError()
{indexify-0.0.37.dist-info → indexify-0.0.39.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: indexify
-Version: 0.0.37
+Version: 0.0.39
 Summary: Python Client for Indexify
 Home-page: https://github.com/tensorlakeai/indexify
 License: Apache 2.0
@@ -13,9 +13,9 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
-Requires-Dist: httpx[http2] (>=0
+Requires-Dist: httpx[http2] (>=0,<1)
 Requires-Dist: pydantic (>=2.8,<3.0)
-Requires-Dist: pyyaml (>=6
+Requires-Dist: pyyaml (>=6,<7)
 Project-URL: Repository, https://github.com/tensorlakeai/indexify
 Description-Content-Type: text/markdown
indexify-0.0.39.dist-info/RECORD
ADDED
@@ -0,0 +1,23 @@
+indexify/__init__.py,sha256=ZDpPkRz4hBo6eqArhVBxqIscLSiD20q5rOHPYyOTloE,503
+indexify/base_client.py,sha256=Db-BNYQ6yNmOIXPaQN8W5qjTYvfFvPzoxC9206YRc-U,2755
+indexify/client.py,sha256=FPCO2DN6RstKLasmNrPxRhzBXDgM14tbc3eDDxl8J_A,25998
+indexify/data_loaders/__init__.py,sha256=TmOJLgKC5gM7_1n7zxYiuza3fOilIiYYupxBGd31PfA,1339
+indexify/data_loaders/local_directory_loader.py,sha256=0X_FgLS5unisJSij8LICv1htp8IdW09LbTIJ2wvVJg4,1246
+indexify/data_loaders/url_loader.py,sha256=shjw6dYBlaxA_PzP6qCB9TTtbPiY4h6FV7uopDbRQCc,1546
+indexify/error.py,sha256=3umTeYb0ugtUyehV1ibfvaeACxAONPyWPc-1HRN4d1M,856
+indexify/exceptions.py,sha256=vjd5SPPNFIEW35GorSIodsqvm9RKHQm9kdp8t9gv-WM,111
+indexify/extraction_policy.py,sha256=awNDqwCz0tr4jTQmGf7s8_s6vcEuxMb0xynEl7b7iPI,2076
+indexify/extractor_sdk/__init__.py,sha256=T512UtvFPUXEXlnT9HHHLHPcEau1Acoac_ksByuo7jA,348
+indexify/extractor_sdk/data.py,sha256=632fY4S_F_aYPLtOl_7dZnSAyMvVZY8ujSSIWJ9k104,2781
+indexify/extractor_sdk/extractor.py,sha256=CtlRn8JC8vGn9fm4QameA47x9T1l_cRpkJMUYYpetco,10457
+indexify/extractor_sdk/utils.py,sha256=_j8WflgOM0Qkf2NjhK2p1xXuwq4drLxO0mgKVPEHhlw,6594
+indexify/graph.py,sha256=fVZeGIcSqO3p8dGIQOdbuFYQ-8QaTQ7Jr37OefA2Phk,549
+indexify/local_runner.py,sha256=Ri-Wpw2qgnQ4I3fRR9qdXXRDASuZnu4-VR2xECG9gnY,4346
+indexify/run_graph.py,sha256=gw3IEf8-myVaHUV7g6LPt8-uSMIVr7S0Zs62aT7UB90,3757
+indexify/runner.py,sha256=M_3_GWYyPpb4lR5KFTpW8OAgp-fm9kYd_5xEqmiCBU4,637
+indexify/settings.py,sha256=LSaWZ0ADIVmUv6o6dHWRC3-Ry5uLbCw2sBSg1e_U7UM,99
+indexify/utils.py,sha256=rDN2lrsAs9noJEIjfx6ukmC2SAIyrlUt7QU-kaBjujM,125
+indexify-0.0.39.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+indexify-0.0.39.dist-info/METADATA,sha256=EvEM7lkuDP1YJsh0wskXIBMQxivHYPKfPNERLV0eaa0,1877
+indexify-0.0.39.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+indexify-0.0.39.dist-info/RECORD,,
indexify/extractor.py
DELETED
@@ -1,122 +0,0 @@
-from typing import Union, Optional, List, Type, Tuple, Callable, get_type_hints, Dict
-import inspect
-from pydantic import BaseModel
-from abc import ABC, abstractmethod
-from .data import Content, Feature
-import json
-
-class EmbeddingSchema(BaseModel):
-    dimension: int
-
-class Extractor(ABC):
-    name: str = ""
-
-    version: str = "0.0.0"
-
-    system_dependencies: List[str] = []
-
-    python_dependencies: List[str] = []
-
-    description: str = ""
-
-    input_mime_types = ["text/plain"]
-
-    def extract(
-        self, input: Type[BaseModel], params: Type[BaseModel] = None
-    ) -> List[Union[Feature, Type[BaseModel]]]:
-        """
-        Extracts information from the content. Returns a list of features to add
-        to the content.
-        It can also return a list of Content objects, which will be added to storage
-        and any extraction policies defined will be applied to them.
-        """
-        pass
-
-    def extract_batch(
-        self, input_list: List[Type[BaseModel]], params: List[Type[BaseModel]] = None
-    ) -> List[List[Union[Feature, Type[BaseModel]]]]:
-        """
-        Extracts information from the content. Returns a list of features to add
-        to the content.
-        It can also return a list of Content objects, which will be added to storage
-        and any extraction policies defined will be applied to them.
-        """
-        pass
-
-    @classmethod
-    @abstractmethod
-    def sample_input(cls) -> Tuple[Content, Type[BaseModel]]:
-        pass
-
-    @classmethod
-    @abstractmethod
-    def embedding_schemas(cls) -> Dict[str, EmbeddingSchema]:
-        raise NotImplementedError
-
-    def describe(self) -> Dict:
-        embedding_schemas = {}
-        try:
-            embedding_schemas = self.embedding_schemas()
-        except NotImplementedError:
-            pass
-
-        json_schema = (
-            self._param_cls.model_json_schema() if self._param_cls is not None else None
-        )
-
-        return {
-            "name": self.name,
-            "version": self.version,
-            "description": self.description,
-            "system_dependencies": self.system_dependencies,
-            "python_dependencies": self.python_dependencies,
-            "input_mime_types": self.input_mime_types,
-            "embedding_schemas": embedding_schemas,
-            "input_params": json.dumps(json_schema),
-        }
-
-def extractor(
-    name: Optional[str] = None,
-    description: Optional[str] = "",
-    version: Optional[str] = "",
-    python_dependencies: Optional[List[str]] = None,
-    system_dependencies: Optional[List[str]] = None,
-    input_mime_types: Optional[List[str]] = None,
-    embedding_schemas: Optional[Dict[str, EmbeddingSchema]] = None,
-    sample_content: Optional[Callable] = None,
-):
-    args = locals()
-    del args["sample_content"]
-
-    def construct(fn):
-        def wrapper():
-            hint = get_type_hints(fn).get("params", dict)
-
-            if not args.get("name"):
-                args["name"] = (
-                    f"{inspect.getmodule(inspect.stack()[1][0]).__name__}:{fn.__name__}"
-                )
-
-            class DecoratedFn(Extractor):
-                @classmethod
-                def extract(cls, input: Type[BaseModel], params: Type[BaseModel]=None) -> List[Content]:  # type: ignore
-                    # TODO we can force all the functions to take in a parms object
-                    # or check if someone adds a params
-                    if params is None:
-                        return fn(input)
-                    else:
-                        return fn(input, params)
-
-                def sample_input(self) -> Content:
-                    return sample_content() if sample_content else self.sample_text()
-
-            for key, val in args.items():
-                setattr(DecoratedFn, key, val)
-
-            return DecoratedFn
-
-        wrapper._extractor_name = fn.__name__
-
-        return wrapper
-
-    return construct
indexify-0.0.37.dist-info/RECORD
DELETED
@@ -1,18 +0,0 @@
-indexify/__init__.py,sha256=W58FqmnKHIx-gHKTBDQa1QI49Gi8f1rw90yDg31jwgQ,743
-indexify/client.py,sha256=faGiWAtdXkL4Vmx6xr0iHJLIBwhS2XZbQ6ld_7sMsBc,25874
-indexify/data.py,sha256=91We7J2QAKBOTu1yF3ApTl4yl4C-nDL2WSXhBdekLWg,2334
-indexify/data_loaders/__init__.py,sha256=EiYemxCP4zRfDWnDKiX6-SFwXVmv1TSdcXHBQRbE_Uw,1309
-indexify/data_loaders/local_directory_loader.py,sha256=kF7VwkuOJFBrhKrR7IOOdZ4TDAItw_CyUOfcuej1CKI,1080
-indexify/error.py,sha256=3umTeYb0ugtUyehV1ibfvaeACxAONPyWPc-1HRN4d1M,856
-indexify/exceptions.py,sha256=vjd5SPPNFIEW35GorSIodsqvm9RKHQm9kdp8t9gv-WM,111
-indexify/extraction_policy.py,sha256=awNDqwCz0tr4jTQmGf7s8_s6vcEuxMb0xynEl7b7iPI,2076
-indexify/extractor.py,sha256=HnLot4DQv7aVI3FwFNH83LzKjq7DlSR1-wmpcVC89tE,3930
-indexify/extractor_utils.py,sha256=68V5vZB9GYx648dyyVKAia0M4pG_R31QPqUQz3ZZ1FQ,6593
-indexify/graph.py,sha256=hUGTpaI3ale54sQ90u5P3-RJCwsSlEJg1V1R0rmCZE0,2576
-indexify/local_runner.py,sha256=VV4Ff_ctibw0ZL4u1wVA7drRx4zLTgNmT_qLX3Cq2SY,2167
-indexify/settings.py,sha256=LSaWZ0ADIVmUv6o6dHWRC3-Ry5uLbCw2sBSg1e_U7UM,99
-indexify/utils.py,sha256=rDN2lrsAs9noJEIjfx6ukmC2SAIyrlUt7QU-kaBjujM,125
-indexify-0.0.37.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-indexify-0.0.37.dist-info/METADATA,sha256=_3uThIPuUiPQ9BBVoqoEEo5Prqp_LHx59jHrZ2CpSgk,1891
-indexify-0.0.37.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-indexify-0.0.37.dist-info/RECORD,,
{indexify-0.0.37.dist-info → indexify-0.0.39.dist-info}/LICENSE.txt
File without changes
{indexify-0.0.37.dist-info → indexify-0.0.39.dist-info}/WHEEL
File without changes