indexify 0.0.43__py3-none-any.whl → 0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/__init__.py +13 -14
- indexify/base_client.py +48 -21
- indexify/cli.py +235 -0
- indexify/client.py +18 -790
- indexify/error.py +3 -30
- indexify/executor/agent.py +362 -0
- indexify/executor/api_objects.py +43 -0
- indexify/executor/downloader.py +124 -0
- indexify/executor/executor_tasks.py +72 -0
- indexify/executor/function_worker.py +177 -0
- indexify/executor/indexify_executor.py +32 -0
- indexify/executor/task_reporter.py +110 -0
- indexify/executor/task_store.py +113 -0
- indexify/foo +72 -0
- indexify/functions_sdk/data_objects.py +37 -0
- indexify/functions_sdk/graph.py +276 -0
- indexify/functions_sdk/graph_validation.py +69 -0
- indexify/functions_sdk/image.py +26 -0
- indexify/functions_sdk/indexify_functions.py +192 -0
- indexify/functions_sdk/local_cache.py +46 -0
- indexify/functions_sdk/object_serializer.py +61 -0
- indexify/local_client.py +183 -0
- indexify/remote_client.py +319 -0
- indexify-0.2.dist-info/METADATA +151 -0
- indexify-0.2.dist-info/RECORD +32 -0
- indexify-0.2.dist-info/entry_points.txt +3 -0
- indexify/exceptions.py +0 -3
- indexify/extraction_policy.py +0 -75
- indexify/extractor_sdk/__init__.py +0 -14
- indexify/extractor_sdk/data.py +0 -100
- indexify/extractor_sdk/extractor.py +0 -225
- indexify/extractor_sdk/utils.py +0 -102
- indexify/extractors/__init__.py +0 -0
- indexify/extractors/embedding.py +0 -55
- indexify/extractors/pdf_parser.py +0 -93
- indexify/graph.py +0 -133
- indexify/local_runner.py +0 -128
- indexify/runner.py +0 -22
- indexify/utils.py +0 -7
- indexify-0.0.43.dist-info/METADATA +0 -66
- indexify-0.0.43.dist-info/RECORD +0 -25
- {indexify-0.0.43.dist-info → indexify-0.2.dist-info}/LICENSE.txt +0 -0
- {indexify-0.0.43.dist-info → indexify-0.2.dist-info}/WHEEL +0 -0
indexify/graph.py
DELETED
@@ -1,133 +0,0 @@
|
|
1
|
-
import itertools
|
2
|
-
import json
|
3
|
-
from collections import defaultdict
|
4
|
-
from typing import Any, Dict, List, Optional, Type, Union
|
5
|
-
|
6
|
-
import cloudpickle
|
7
|
-
from pydantic import BaseModel
|
8
|
-
|
9
|
-
from .extractor_sdk import Content, Extractor, extractor
|
10
|
-
from .runner import Runner
|
11
|
-
|
12
|
-
|
13
|
-
# Identity extractor: wraps the incoming content in a single-element list and
# passes it through unchanged.
@extractor(description="id function")
def _id(content: Content) -> List[Content]:
    passthrough = [content]
    return passthrough
|
16
|
-
|
17
|
-
|
18
|
-
def load_graph(graph: bytes) -> "Graph":
    """Deserialize a graph from ``graph`` using ``cloudpickle.loads``.

    NOTE(review): unpickling can execute arbitrary code, so only pass bytes
    that come from a trusted source.
    """
    restored = cloudpickle.loads(graph)
    return restored
|
20
|
-
|
21
|
-
|
22
|
-
class Graph:
    """A directed graph of extractors that is executed by a ``Runner``.

    Nodes are registered implicitly when edges are added and are keyed by the
    extractor's ``name``. Every edge stores the destination node name together
    with an optional prefilter predicate string.
    """

    def __init__(
        self, name: str, input: Type[BaseModel], start_node: extractor, runner: Runner
    ):
        """Create a graph that only contains the built-in start/end nodes.

        Args:
            name: Name of the graph.
            input: Pydantic model class describing the workflow input.
            start_node: Accepted but not stored; the start node is chosen by
                ``_assign_start_node`` when the graph is run.
            runner: Runner used by ``run`` and by the cache/result helpers.
        """
        # TODO check for cycles
        self.name = name

        self.nodes: Dict[str, Union[extractor, Extractor]] = {}
        self.params: Dict[str, Any] = {}

        # from-node name -> list of (to-node name, prefilter predicate or None)
        self.edges: Dict[str, List[tuple]] = defaultdict(list)

        self.nodes["start"] = _id
        self.nodes["end"] = _id

        # node name -> 1 + number of incoming edges; used to pick the start node
        self._topo_counter = defaultdict(int)

        self._start_node = None
        self._input = input

        self.runner = runner

    def get_extractor(self, name: str) -> Extractor:
        """Return the node registered under ``name``.

        Raises:
            KeyError: If no node with that name exists.
        """
        return self.nodes[name]

    def _node(self, extractor: Extractor, params: Any = None) -> "Graph":
        """Register ``extractor`` as a node keyed by its ``name``.

        Registering a name that already exists is a no-op. ``params`` is
        accepted but not used; the stored params come from the extractor's own
        ``params`` instance attribute when it has one, otherwise ``None``.

        Returns:
            This graph, so calls can be chained.
        """
        name = extractor.name

        # If the node was already inserted, ignore the new insertion. Return
        # self (not None) so the result matches the annotation on every path.
        if name in self.nodes:
            return self

        self.nodes[name] = extractor
        self.params[name] = extractor.__dict__.get("params", None)

        # assign each node a rank of 1 to init the graph
        self._topo_counter[name] = 1

        return self

    def serialize(self):
        """Return this graph pickled with ``cloudpickle.dumps``."""
        return cloudpickle.dumps(self)

    def add_edge(
        self,
        from_node: Type[Extractor],
        to_node: Type[Extractor],
        prefilter_predicates: Optional[str] = None,
    ) -> "Graph":
        """Connect ``from_node`` to ``to_node``, registering both if needed.

        Args:
            from_node: Source node; must expose a ``name`` attribute.
            to_node: Destination node; must expose a ``name`` attribute.
            prefilter_predicates: Optional predicate string stored on the edge.

        Returns:
            This graph, so calls can be chained.
        """
        self._node(from_node)
        self._node(to_node)

        from_node_name = from_node.name
        to_node_name = to_node.name

        self.edges[from_node_name].append((to_node_name, prefilter_predicates))

        # One more incoming edge for the destination node.
        self._topo_counter[to_node_name] += 1

        return self

    def steps(
        self,
        from_node: extractor,
        to_nodes: List[extractor],
        prefilter_predicates: Optional[List[str]] = None,
    ) -> "Graph":
        """Connect nodes as a fan out from one ``from_node`` to multiple ``to_nodes``.

        ``prefilter_predicates`` is matched to ``to_nodes`` by position. When
        it is shorter than ``to_nodes``, the remaining edges get ``None`` as
        their predicate.

        Args:
            from_node: Source node shared by every created edge.
            to_nodes: Destination nodes, one edge is added per entry.
            prefilter_predicates: Optional per-destination predicate strings.

        Returns:
            This graph, so calls can be chained.

        Raises:
            ValueError: If more predicates than destination nodes are given.
        """
        predicates = [] if prefilter_predicates is None else list(prefilter_predicates)
        if len(predicates) > len(to_nodes):
            raise ValueError(
                f"Got {len(predicates)} prefilter predicates for "
                f"{len(to_nodes)} destination nodes."
            )

        for t_n, p in itertools.zip_longest(to_nodes, predicates, fillvalue=None):
            self.add_edge(from_node=from_node, to_node=t_n, prefilter_predicates=p)

        return self

    def add_param(self, node: extractor, params: Dict[str, Any]):
        """Store ``params`` for ``node`` after checking they are JSON-serializable.

        Raises:
            Exception: If ``json.dumps(params)`` fails; the original error is
                attached as the cause.
        """
        try:
            # check if the params can be serialized since the server needs this
            json.dumps(params)
        except Exception as err:
            raise Exception(
                f"For node {node.name}, cannot serialize params as json."
            ) from err

        self.params[node.name] = params

    def run(self, wf_input, local):
        """Pick the start node and hand the graph to the runner.

        Args:
            wf_input: Workflow input forwarded to ``runner.run``.
            local: Currently unused.
        """
        self._assign_start_node()
        self.runner.run(self, wf_input=wf_input)

    def clear_cache_for_node(self, node: Union[extractor, Extractor]):
        """Ask the runner to drop memoized output for ``node``.

        Raises:
            Exception: If ``node.name`` is not a node of this graph.
        """
        if node.name not in self.nodes:
            raise Exception(f"Node with name {node.name} not found in graph")

        self.runner.deleted_from_memo(node.name)

    def clear_cache_for_all_nodes(self):
        """Ask the runner to drop memoized output for every node of the graph."""
        for node_name in self.nodes:
            self.runner.deleted_from_memo(node_name=node_name)

    def get_result(self, node: Union[extractor, Extractor]) -> Any:
        """Return the runner's stored results for ``node``."""
        return self.runner.results[node.name]

    def _assign_start_node(self):
        """Select the node with the lowest rank as the start node.

        This method should be called before a graph can be run. Ties are
        resolved in favour of the node that was registered first.

        Raises:
            IndexError: If no nodes have been added to the graph.
        """
        if not self._topo_counter:
            raise IndexError(
                "Cannot pick a start node: no nodes have been added to the graph."
            )

        # min() returns the first minimal item, matching a stable sort's head.
        lowest = min(self._topo_counter.items(), key=lambda item: item[1])
        self._start_node = lowest[0]
|
indexify/local_runner.py
DELETED
@@ -1,128 +0,0 @@
|
|
1
|
-
import hashlib
|
2
|
-
import os
|
3
|
-
import pickle
|
4
|
-
import shutil
|
5
|
-
from collections import defaultdict
|
6
|
-
from pathlib import Path
|
7
|
-
from typing import Any, Callable, Dict, Optional, Union
|
8
|
-
|
9
|
-
from indexify.extractor_sdk.data import BaseData, Feature
|
10
|
-
from indexify.extractor_sdk.extractor import Extractor, extractor
|
11
|
-
from indexify.graph import Graph
|
12
|
-
from indexify.runner import Runner
|
13
|
-
|
14
|
-
|
15
|
-
class LocalRunner(Runner):
    """Runner that executes a graph in-process and memoizes node outputs on disk.

    Each node's output is pickled to
    ``./indexify_local_runner_cache/<node_name>/<input_hash>``, a path that is
    relative to the current working directory.
    """

    def __init__(self):
        """Start with an empty ``node name -> list of outputs`` mapping."""
        # TODO should the Any be Content?
        self.results: Dict[str, Any] = defaultdict(list)

    def run(self, g, wf_input: BaseData):
        """Run graph ``g`` on ``wf_input`` beginning at ``g._start_node``."""
        return self._run(g, _input=wf_input, node_name=g._start_node)

    # graph is getting some files which are files, some labels and the MIME type of the bytes
    # those bytes have to be a python type

    # _input needs to be serializable into python object (ie json for ex) and Feature
    def _run(self, g: Graph, _input: BaseData, node_name: str):
        """Execute ``node_name`` on ``_input`` and recurse along its out-edges.

        The node output is read from the on-disk memo when an entry exists for
        the hash of ``str(_input)``; otherwise the extractor is run and its
        pickled output is written to the memo. Non-``Feature`` outputs are
        appended to ``self.results[node_name]``; ``Feature`` outputs are copied
        into ``_input.meta``.

        Args:
            g: Graph providing ``nodes``, ``params`` and ``edges``.
            _input: Input handed to the node's extractor.
            node_name: Name of the node to execute.
        """
        print(f"---- Starting node {node_name}")
        print(f"node_name {node_name}")

        extractor_construct: Callable = g.nodes[node_name]
        params = g.params.get(node_name, None)

        # NOTE: User should clear cache for nodes they would like to re-rerun
        # (the cache key is built from str(_input) only, params are not part of it)
        input_hash = hashlib.sha256(str(_input).encode()).hexdigest()
        memo_output = self.get_from_memo(node_name, input_hash)
        if memo_output is None:
            print("=== FYI Writing output to cache")
            res = extractor_construct().extract(input=_input, params=params)
            self.put_into_memo(node_name, input_hash, pickle.dumps(res))
        else:
            print("=== Reading output from cache")
            res = pickle.loads(memo_output)

        if not isinstance(res, list):
            res = [res]

        res_data = [i for i in res if not isinstance(i, Feature)]
        res_features = [i for i in res if isinstance(i, Feature)]

        self.results[node_name].extend(res_data)

        for f in res_features:
            _input.meta[f.name] = f.value

        # this assume that if an extractor emits features then the next edge will always process
        # the edges
        data_to_process = res_data
        if res_features:
            data_to_process.append(_input)

        for out_edge, pre_filter_predicate in g.edges[node_name]:
            # TODO there are no reductions yet, each recursion finishes it's path and returns
            for r in data_to_process:
                if self._prefilter_content(
                    content=r, prefilter_predicate=pre_filter_predicate
                ):
                    continue

                self._run(g, _input=r, node_name=out_edge)

    def _prefilter_content(
        self, content: BaseData, prefilter_predicate: Optional[str]
    ) -> bool:
        """Return True if ``content`` should be filtered (skipped).

        The predicate is a list of ``key=value`` atoms joined by the word
        ``and``. Content passes only when every atom matches its metadata, so
        it is filtered as soon as one key is missing or holds another value.
        A ``None`` or blank predicate never filters.

        Args:
            content: Object whose ``get_features()["metadata"]`` is inspected.
            prefilter_predicate: Predicate string, or ``None`` for no filter.

        Raises:
            ValueError: If an atom does not contain ``=``.
        """
        # Local import: the surrounding module does not import ``re``.
        import re

        if prefilter_predicate is None or not prefilter_predicate.strip():
            return False

        # TODO For now only support `and` and `=` and `string values`
        metadata = content.get_features()["metadata"]

        # Split on the standalone word "and" so keys/values that merely contain
        # those letters (e.g. "brand") stay intact.
        for atom in re.split(r"\s+and\s+", prefilter_predicate.strip()):
            key, separator, expected = atom.partition("=")
            if not separator:
                raise ValueError(f"Invalid prefilter predicate atom: {atom!r}")

            key = key.strip()
            expected = expected.strip()
            if key not in metadata or metadata[key] != expected:
                return True

        return False

    def get_result(self, node: Union[extractor, Extractor]) -> Any:
        """Return the outputs collected so far for ``node``."""
        node_name = node.name
        return self.results[node_name]

    def _memo_dir(self, node_name):
        """Return the cache directory path used for ``node_name``."""
        return f"./indexify_local_runner_cache/{node_name}"

    def deleted_from_memo(self, node_name):
        """Remove the cache directory of ``node_name`` if it exists."""
        path_prefix = self._memo_dir(node_name)

        if os.path.isdir(path_prefix):
            shutil.rmtree(path_prefix)

    def get_from_memo(self, node_name, input_hash):
        """Return the cached bytes for ``(node_name, input_hash)`` or ``None``."""
        file_path = f"{self._memo_dir(node_name)}/{input_hash}"

        try:
            with open(file_path, "rb") as f:
                return f.read()
        except FileNotFoundError:
            return None

    def put_into_memo(self, node_name, input_hash, output):
        """Write ``output`` bytes to the cache and return the number written."""
        path_prefix = self._memo_dir(node_name)
        file_path = f"{path_prefix}/{input_hash}"

        os.makedirs(path_prefix, exist_ok=True)

        # Opening with "wb" creates the file, so no separate touch is needed.
        with open(file_path, "wb") as f:
            return f.write(output)
|
indexify/runner.py
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
from abc import ABC
|
2
|
-
from typing import Any, Union
|
3
|
-
|
4
|
-
from indexify.extractor_sdk.data import BaseData
|
5
|
-
from indexify.extractor_sdk.extractor import Extractor, extractor
|
6
|
-
|
7
|
-
|
8
|
-
class Runner(ABC):
    """Base interface for graph runners.

    Every method raises ``NotImplementedError``; subclasses are expected to
    override the ones they support.
    """

    def run(self, g, wf_input: BaseData):
        """Run graph ``g`` on the workflow input ``wf_input``."""
        raise NotImplementedError

    def get_result(self, node: Union[extractor, Extractor]) -> Any:
        """Return the result associated with ``node``."""
        raise NotImplementedError

    def deleted_from_memo(self, node_name):
        """Delete memoized data for the node called ``node_name``."""
        raise NotImplementedError

    def get_from_memo(self, node_name, input_hash):
        """Fetch the memoized entry for ``node_name`` and ``input_hash``."""
        raise NotImplementedError

    def put_into_memo(self, node_name, input_hash, output):
        """Memoize ``output`` under ``node_name`` and ``input_hash``."""
        raise NotImplementedError
|
indexify/utils.py
DELETED
indexify-0.0.43.dist-info/METADATA
DELETED
@@ -1,66 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.1
|
2
|
-
Name: indexify
|
3
|
-
Version: 0.0.43
|
4
|
-
Summary: Python Client for Indexify
|
5
|
-
Home-page: https://github.com/tensorlakeai/indexify
|
6
|
-
License: Apache 2.0
|
7
|
-
Author: Diptanu Gon Choudhury
|
8
|
-
Author-email: diptanuc@gmail.com
|
9
|
-
Requires-Python: >=3.9,<4.0
|
10
|
-
Classifier: License :: Other/Proprietary License
|
11
|
-
Classifier: Programming Language :: Python :: 3
|
12
|
-
Classifier: Programming Language :: Python :: 3.9
|
13
|
-
Classifier: Programming Language :: Python :: 3.10
|
14
|
-
Classifier: Programming Language :: Python :: 3.11
|
15
|
-
Classifier: Programming Language :: Python :: 3.12
|
16
|
-
Requires-Dist: cloudpickle (>=3,<4)
|
17
|
-
Requires-Dist: httpx[http2] (>=0,<1)
|
18
|
-
Requires-Dist: pydantic (>=2.8,<3.0)
|
19
|
-
Requires-Dist: pyyaml (>=6,<7)
|
20
|
-
Project-URL: Repository, https://github.com/tensorlakeai/indexify
|
21
|
-
Description-Content-Type: text/markdown
|
22
|
-
|
23
|
-
# Indexify Python Client
|
24
|
-
|
25
|
-
|
26
|
-
[![PyPI version](https://badge.fury.io/py/indexify.svg)](https://badge.fury.io/py/indexify)
|
27
|
-
|
28
|
-
## Installation
|
29
|
-
|
30
|
-
This is the Python client for interacting with the Indexify service.
|
31
|
-
|
32
|
-
To install it, simply run:
|
33
|
-
|
34
|
-
```shell
|
35
|
-
pip install indexify
|
36
|
-
```
|
37
|
-
|
38
|
-
## Usage
|
39
|
-
|
40
|
-
See the [getting started](https://docs.getindexify.com/getting_started/) guide for examples of how to use the client.
|
41
|
-
Look at the [examples](https://github.com/tensorlakeai/indexify/tree/main/examples) directory for more examples.
|
42
|
-
|
43
|
-
## Development
|
44
|
-
|
45
|
-
To install the client from this repository for development:
|
46
|
-
|
47
|
-
```shell
|
48
|
-
cd "path to this repository"
|
49
|
-
pip install -e .
|
50
|
-
```
|
51
|
-
|
52
|
-
Install and run the `poetry` package manager:
|
53
|
-
|
54
|
-
```shell
|
55
|
-
pip install poetry
|
56
|
-
poetry install
|
57
|
-
```
|
58
|
-
|
59
|
-
More information at [https://python-poetry.org/docs/](https://python-poetry.org/docs/).
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
### Environment Variables
|
64
|
-
|
65
|
-
IndexifyClient uses httpx under the hood, so there are many environment variables that can be used to configure the client. More information on supported environment variables can be found [here](https://www.python-httpx.org/environment_variables/).
|
66
|
-
|
indexify-0.0.43.dist-info/RECORD
DELETED
@@ -1,25 +0,0 @@
|
|
1
|
-
indexify/__init__.py,sha256=e4s2395B3gEGrZk2u5OZO2RtrXYFYUTItaM3mtlusBE,493
|
2
|
-
indexify/base_client.py,sha256=HwT2KJNq8j-KiPVA9RJm-yearSjxifRjXTcP1zUVeo8,2784
|
3
|
-
indexify/client.py,sha256=p4WDmYR94DjU0EqosuCKNGjbfh11qUID6TxDhTK6Uk4,26001
|
4
|
-
indexify/data_loaders/__init__.py,sha256=Y5NEuseTcYAICRiweYw5wBQ2m2YplbsY21I7df-rdi4,1339
|
5
|
-
indexify/data_loaders/local_directory_loader.py,sha256=fCrgj5drnW71ZUdDDvcB1-VJjIs1w6Q8sEW0HSGSAiA,1247
|
6
|
-
indexify/data_loaders/url_loader.py,sha256=32SERljcq1Xsi4RdLz2dgyk2TER5pQPTtXl3gUzwHbY,1533
|
7
|
-
indexify/error.py,sha256=3umTeYb0ugtUyehV1ibfvaeACxAONPyWPc-1HRN4d1M,856
|
8
|
-
indexify/exceptions.py,sha256=vjd5SPPNFIEW35GorSIodsqvm9RKHQm9kdp8t9gv-WM,111
|
9
|
-
indexify/extraction_policy.py,sha256=927BBtZBDPsLMm01uQDPCZnj3Pwmjh6L6QLHb4ShQKk,2076
|
10
|
-
indexify/extractor_sdk/__init__.py,sha256=DOL-wJvIspWPqjFRBpmhMbnsMZC2JY-NtNwQGiE6IqU,348
|
11
|
-
indexify/extractor_sdk/data.py,sha256=JpX9WdTpiuK72wn6QYhtqj5p5JiJu4waBrK-Hi7lNsA,2742
|
12
|
-
indexify/extractor_sdk/extractor.py,sha256=1SFYXW_vCZt7WdsBfHlfxpuZv6inrsyqRpLCiKvdmX0,9896
|
13
|
-
indexify/extractor_sdk/utils.py,sha256=bW_D2eMWTzcAYZ8Lv7LUKGgOD0cyW77E6gNO3y7iNNA,6234
|
14
|
-
indexify/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
|
-
indexify/extractors/embedding.py,sha256=Be6X4odSHbkAEm2myxB04RN-Mvb2bFk8uWXxUpY-Z6E,1859
|
16
|
-
indexify/extractors/pdf_parser.py,sha256=DwHXVbdy-3SDPOo1U_tdp6g5NWs0Tde5h0GoMGTi1WA,2787
|
17
|
-
indexify/graph.py,sha256=UdvrpNc-SdD3U27Ee9aTMMYcSOUz__WQWc31oFHV4yQ,3963
|
18
|
-
indexify/local_runner.py,sha256=uuMJbnT4qYMSySxsB3lEC7FSjYnJFh5eNZ00zu5gLNw,4387
|
19
|
-
indexify/runner.py,sha256=VVmLGF1kAmEuE461Hs0QJFnSvVWtUzYhhQfB1KptYPU,637
|
20
|
-
indexify/settings.py,sha256=LSaWZ0ADIVmUv6o6dHWRC3-Ry5uLbCw2sBSg1e_U7UM,99
|
21
|
-
indexify/utils.py,sha256=rDN2lrsAs9noJEIjfx6ukmC2SAIyrlUt7QU-kaBjujM,125
|
22
|
-
indexify-0.0.43.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
23
|
-
indexify-0.0.43.dist-info/METADATA,sha256=FHsxTsqQwFL7_ezp2EUjjMTj7fP1Oyma4G_AlTQDq7w,1913
|
24
|
-
indexify-0.0.43.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
25
|
-
indexify-0.0.43.dist-info/RECORD,,
|
File without changes
|
File without changes
|