indexify 0.0.39__tar.gz → 0.0.40__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {indexify-0.0.39 → indexify-0.0.40}/PKG-INFO +2 -1
- {indexify-0.0.39 → indexify-0.0.40}/indexify/__init__.py +2 -0
- {indexify-0.0.39 → indexify-0.0.40}/indexify/data_loaders/url_loader.py +0 -1
- {indexify-0.0.39 → indexify-0.0.40}/indexify/extractor_sdk/data.py +1 -1
- {indexify-0.0.39 → indexify-0.0.40}/indexify/extractor_sdk/extractor.py +7 -27
- indexify-0.0.40/indexify/extractors/__init__.py +0 -0
- indexify-0.0.40/indexify/extractors/embedding.py +53 -0
- indexify-0.0.39/indexify/run_graph.py → indexify-0.0.40/indexify/graph.py +19 -12
- {indexify-0.0.39 → indexify-0.0.40}/indexify/local_runner.py +2 -2
- {indexify-0.0.39 → indexify-0.0.40}/pyproject.toml +2 -1
- indexify-0.0.39/indexify/graph.py +0 -23
- {indexify-0.0.39 → indexify-0.0.40}/LICENSE.txt +0 -0
- {indexify-0.0.39 → indexify-0.0.40}/README.md +0 -0
- {indexify-0.0.39 → indexify-0.0.40}/indexify/base_client.py +0 -0
- {indexify-0.0.39 → indexify-0.0.40}/indexify/client.py +0 -0
- {indexify-0.0.39 → indexify-0.0.40}/indexify/data_loaders/__init__.py +0 -0
- {indexify-0.0.39 → indexify-0.0.40}/indexify/data_loaders/local_directory_loader.py +0 -0
- {indexify-0.0.39 → indexify-0.0.40}/indexify/error.py +0 -0
- {indexify-0.0.39 → indexify-0.0.40}/indexify/exceptions.py +0 -0
- {indexify-0.0.39 → indexify-0.0.40}/indexify/extraction_policy.py +0 -0
- {indexify-0.0.39 → indexify-0.0.40}/indexify/extractor_sdk/__init__.py +0 -0
- {indexify-0.0.39 → indexify-0.0.40}/indexify/extractor_sdk/utils.py +0 -0
- {indexify-0.0.39 → indexify-0.0.40}/indexify/runner.py +0 -0
- {indexify-0.0.39 → indexify-0.0.40}/indexify/settings.py +0 -0
- {indexify-0.0.39 → indexify-0.0.40}/indexify/utils.py +0 -0
{indexify-0.0.39 → indexify-0.0.40}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: indexify
-Version: 0.0.39
+Version: 0.0.40
 Summary: Python Client for Indexify
 Home-page: https://github.com/tensorlakeai/indexify
 License: Apache 2.0
@@ -13,6 +13,7 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Requires-Dist: cloudpickle (>=3,<4)
 Requires-Dist: httpx[http2] (>=0,<1)
 Requires-Dist: pydantic (>=2.8,<3.0)
 Requires-Dist: pyyaml (>=6,<7)
```
{indexify-0.0.39 → indexify-0.0.40}/indexify/__init__.py

```diff
@@ -9,9 +9,11 @@ from .client import (
 from . import extractor_sdk
 from .settings import DEFAULT_SERVICE_URL
 from . import data_loaders
+from .graph import Graph
 
 __all__ = [
     "data_loaders",
+    "Graph",
     "Document",
     "extractor_sdk",
     "IndexifyClient",
```
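With this re-export, the new `Graph` class is importable straight from the package root rather than from `indexify.graph`. A quick sketch:

```python
# Graph is re-exported at the package root as of 0.0.40
from indexify import Graph
```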
{indexify-0.0.39 → indexify-0.0.40}/indexify/extractor_sdk/extractor.py

```diff
@@ -10,7 +10,8 @@ import requests
 
 class EmbeddingSchema(BaseModel):
     dim: int
-    distance: str = "cosine"
+    distance: Optional[str] = "cosine"
+    database_url: Optional[str] = None
 
 class ExtractorMetadata(BaseModel):
     name: str
@@ -40,8 +41,8 @@ class Extractor(ABC):
 
     input_mime_types = ["text/plain"]
 
-
-
+    embedding_indexes: Dict[str, EmbeddingSchema] = {}
+
     @abstractmethod
     def extract(
         self, input: Type[BaseModel], params: Type[BaseModel] = None
@@ -55,31 +56,9 @@ class Extractor(ABC):
         pass
 
     @classmethod
-    @abstractmethod
     def sample_input(cls) -> Tuple[Content, Type[BaseModel]]:
         pass
 
-    def describe(self) -> ExtractorMetadata:
-        embedding_schemas = {}
-        try:
-            embedding_schemas = self.embedding_schemas
-        except NotImplementedError:
-            pass
-
-        json_schema = (
-            self._param_cls.model_json_schema() if self._param_cls is not None else None
-        )
-        return ExtractorMetadata(
-            name=self.name,
-            version=self.version,
-            description=self.description,
-            system_dependencies=self.system_dependencies,
-            python_dependencies=self.python_dependencies,
-            input_mime_types=self.input_mime_types,
-            embedding_schemas=embedding_schemas,
-            input_params=json.dumps(json_schema),
-        )
-
     def _download_file(self, url, filename):
         if os.path.exists(filename):
             # file exists skip
@@ -190,7 +169,7 @@ def extractor(
     python_dependencies: Optional[List[str]] = None,
     system_dependencies: Optional[List[str]] = None,
     input_mime_types: Optional[List[str]] = None,
-
+    embedding_indexes: Optional[Dict[str, EmbeddingSchema]] = None,
     sample_content: Optional[Callable] = None,
 ):
     args = locals()
@@ -198,7 +177,7 @@ def extractor(
 
     def construct(fn):
         def wrapper():
-
+            description = fn.__doc__ or args.get("description", "")
 
             if not args.get("name"):
                 args[
@@ -220,6 +199,7 @@ def extractor(
 
         for key, val in args.items():
             setattr(DecoratedFn, key, val)
+        DecoratedFn.description = description
 
         return DecoratedFn
 
```
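Taken together, the `EmbeddingSchema` and `@extractor` changes above let an extractor declare its embedding indexes up front. A minimal sketch of how they might combine; the index name, dimension, database URL, and extractor body are illustrative placeholders, not values from this diff:

```python
from typing import List

from indexify.extractor_sdk import Content, extractor
from indexify.extractor_sdk.extractor import EmbeddingSchema

@extractor(
    description="embeds text chunks",
    embedding_indexes={
        # "chunks" and the URL below are hypothetical examples
        "chunks": EmbeddingSchema(dim=384, database_url="postgres://localhost/indexify"),
    },
)
def embed_chunks(content: Content) -> List[Content]:
    # a real extractor would attach embedding features here
    return [content]
```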
indexify-0.0.40/indexify/extractors/__init__.py (new, empty: no changes to show)
indexify-0.0.40/indexify/extractors/embedding.py (new file)

```diff
@@ -0,0 +1,53 @@
+from typing import List
+
+from indexify.extractor_sdk.data import Feature
+import torch
+import torch.nn.functional as F
+from transformers import AutoModel, AutoTokenizer
+from indexify.extractor_sdk.extractor import Extractor, Feature
+
+class SentenceTransformersEmbedding:
+    def __init__(self, model_name) -> None:
+        self._model_name = model_name
+        self._tokenizer = AutoTokenizer.from_pretrained(
+            f"sentence-transformers/{model_name}"
+        )
+        self._model = AutoModel.from_pretrained(
+            f"sentence-transformers/{model_name}", torchscript=True
+        )
+        self._model.eval()
+
+    def embed_batch(self, inputs: List[str]) -> List[List[float]]:
+        result = self._embed(inputs)
+        return result.tolist()
+
+    def embed(self, query: str) -> List[float]:
+        result = self._embed([query])
+        return result[0].tolist()
+
+    def _embed(self, inputs: List[str]) -> torch.Tensor:
+        encoded_input = self._tokenizer(
+            inputs, padding=True, truncation=True, return_tensors="pt"
+        )
+        sentence_embeddings = self._model(**encoded_input)
+        return F.normalize(sentence_embeddings, p=2, dim=1)
+
+class BasicSentenceTransformerModels(Extractor):
+
+    def __init__(self, model: str):
+        super().__init__()
+        self.model = SentenceTransformersEmbedding(model)
+
+    def extract(self, input: str) -> List[Feature]:
+        embeddings = self.model.embed(input)
+        return [Feature.embedding(values=embeddings)]
+
+class BasicHFTransformerEmbeddingModels(Extractor):
+
+    def __init__(self, model: str):
+        super().__init__()
+        self._model = AutoModel.from_pretrained(model, trust_remote_code=True)
+
+    def extract(self, input: str) -> List[Feature]:
+        embeddings = self.model.embed_query(input)
+        return [Feature.embedding(values=embeddings)]
```
indexify-0.0.39/indexify/run_graph.py → indexify-0.0.40/indexify/graph.py

```diff
@@ -1,13 +1,12 @@
 import json
-
-from .extractor_sdk import Content, extractor, Extractor
-
+import itertools
 from collections import defaultdict
 from typing import Any, Dict, List, Optional, Type, Union
-from pydantic import BaseModel
 
-import itertools
+import cloudpickle
+from pydantic import BaseModel
 
+from .extractor_sdk import Content, extractor, Extractor
 from .runner import Runner
 
 @extractor(description="id function")
@@ -15,7 +14,10 @@ def _id(content: Content) -> List[Content]:
     return [content]
 
 
-class RunGraph:
+def load_graph(graph: bytes) -> 'Graph':
+    return cloudpickle.loads(graph)
+
+class Graph:
     def __init__(self, name: str, input: Type[BaseModel], start_node: extractor, runner: Runner):
         # TODO check for cycles
         self.name = name
@@ -35,7 +37,10 @@ class RunGraph:
 
         self.runner = runner
 
-    def _node(self, extractor: Extractor, params: Any = None) -> 'RunGraph':
+    def get_extractor(self, name: str) -> Extractor:
+        return self.nodes[name]
+
+    def _node(self, extractor: Extractor, params: Any = None) -> 'Graph':
         name = extractor.name
 
         # if you've already inserted a node just ignore the new insertion.
@@ -49,13 +54,16 @@ class RunGraph:
            self._topo_counter[name] = 1
 
        return self
+
+    def serialize(self):
+        return cloudpickle.dumps(self)
 
     def add_edge(
        self,
-        from_node: extractor,
-        to_node: extractor,
+        from_node: Type[Extractor],
+        to_node: Type[Extractor],
        prefilter_predicates: Optional[str] = None,
-    ) -> 'RunGraph':
+    ) -> 'Graph':
 
        self._node(from_node)
        self._node(to_node)
@@ -79,7 +87,7 @@ class RunGraph:
         from_node: extractor,
         to_nodes: List[extractor],
         prefilter_predicates: List[str] = [],
-    ) -> 'RunGraph':
+    ) -> 'Graph':
         print(f"{to_nodes}, {prefilter_predicates}, {prefilter_predicates}")
         for t_n, p in itertools.zip_longest(
             to_nodes, prefilter_predicates, fillvalue=None
@@ -99,7 +107,6 @@ class RunGraph:
 
     def run(self, wf_input, local):
         self._assign_start_node()
-        # self.runner = LocalRunner()
         self.runner.run(self, wf_input=wf_input)
         pass
 
```
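The cloudpickle additions give a graph a byte-level round trip: `serialize()` on an instance, module-level `load_graph()` to restore it. A sketch assuming the constructor shown above; the input model and pass-through extractor are placeholders:

```python
from typing import List

from pydantic import BaseModel

from indexify.extractor_sdk import Content, extractor
from indexify.graph import Graph, load_graph
from indexify.local_runner import LocalRunner

class TextInput(BaseModel):  # placeholder input model
    text: str

@extractor(description="pass-through node")  # placeholder extractor
def passthrough(content: Content) -> List[Content]:
    return [content]

g = Graph("demo", input=TextInput, start_node=passthrough, runner=LocalRunner())
payload: bytes = g.serialize()   # cloudpickle.dumps(self) under the hood
restored = load_graph(payload)   # cloudpickle.loads(graph)
```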
{indexify-0.0.39 → indexify-0.0.40}/indexify/local_runner.py

```diff
@@ -10,7 +10,7 @@ from indexify.extractor_sdk.extractor import extractor, Extractor
 from collections import defaultdict
 from typing import Any, Callable, Dict, Optional, Union
 
-from indexify.run_graph import RunGraph
+from indexify.graph import Graph
 from indexify.runner import Runner
 
 
@@ -27,7 +27,7 @@ class LocalRunner(Runner):
     # those bytes have to be a python type
 
     # _input needs to be serializable into python object (ie json for ex) and Feature
-    def _run(self, g: RunGraph, _input: BaseData, node_name: str):
+    def _run(self, g: Graph, _input: BaseData, node_name: str):
         print(f"---- Starting node {node_name}")
         print(f'node_name {node_name}')
 
```
{indexify-0.0.39 → indexify-0.0.40}/pyproject.toml

```diff
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "indexify"
-version = "0.0.39"
+version = "0.0.40"
 description = "Python Client for Indexify"
 authors = ["Diptanu Gon Choudhury <diptanuc@gmail.com>", "Lucas Jackson <lucas@tensorlake.ai>", "Vijay Parthasarathy <vijay2win@gmail.com>"]
 license = "Apache 2.0"
@@ -13,6 +13,7 @@ python = "^3.9"
 httpx = { version = "^0", extras = ["http2"] }
 pyyaml = "^6"
 pydantic = "^2.8"
+cloudpickle = "^3"
 
 [tool.poetry.dev-dependencies]
 black = "^22.3.0"
```
indexify-0.0.39/indexify/graph.py (deleted)

```diff
@@ -1,23 +0,0 @@
-from .extractor_sdk import extractor, Extractor
-
-from typing import Type, Union
-from pydantic import BaseModel
-
-from .run_graph import RunGraph
-from .local_runner import LocalRunner
-
-
-def Graph(
-    name: str,
-    input: Type[BaseModel],
-    start_node: Union[extractor, Extractor],
-    run_local: bool,
-) -> RunGraph:
-
-    if run_local:
-        runner = LocalRunner()
-    else:
-        raise NotImplementedError("Remote runner not supported yet")
-
-    graph = RunGraph(name=name, input=input, start_node=start_node, runner=runner)
-    return graph
```
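With this factory gone, 0.0.40 callers construct the runner themselves and pass it to the `Graph` class from `indexify.graph` (re-exported at the package root). A migration sketch, reusing the placeholder `TextInput` and `passthrough` from the sketch above:

```python
from indexify import Graph
from indexify.local_runner import LocalRunner

# 0.0.39: g = Graph("demo", input=TextInput, start_node=passthrough, run_local=True)
# 0.0.40: the run_local flag is replaced by an explicit runner argument
g = Graph("demo", input=TextInput, start_node=passthrough, runner=LocalRunner())
g.run(wf_input=TextInput(text="hello"), local=True)  # signature per Graph.run above
```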
All remaining files listed above are unchanged between 0.0.39 and 0.0.40.