indexify 0.0.42__py3-none-any.whl → 0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. indexify/__init__.py +13 -14
  2. indexify/base_client.py +48 -21
  3. indexify/cli.py +235 -0
  4. indexify/client.py +18 -790
  5. indexify/error.py +3 -30
  6. indexify/executor/agent.py +362 -0
  7. indexify/executor/api_objects.py +43 -0
  8. indexify/executor/downloader.py +124 -0
  9. indexify/executor/executor_tasks.py +72 -0
  10. indexify/executor/function_worker.py +177 -0
  11. indexify/executor/indexify_executor.py +32 -0
  12. indexify/executor/task_reporter.py +110 -0
  13. indexify/executor/task_store.py +113 -0
  14. indexify/foo +72 -0
  15. indexify/functions_sdk/data_objects.py +37 -0
  16. indexify/functions_sdk/graph.py +276 -0
  17. indexify/functions_sdk/graph_validation.py +69 -0
  18. indexify/functions_sdk/image.py +26 -0
  19. indexify/functions_sdk/indexify_functions.py +192 -0
  20. indexify/functions_sdk/local_cache.py +46 -0
  21. indexify/functions_sdk/object_serializer.py +61 -0
  22. indexify/local_client.py +183 -0
  23. indexify/remote_client.py +319 -0
  24. indexify-0.2.dist-info/METADATA +151 -0
  25. indexify-0.2.dist-info/RECORD +32 -0
  26. indexify-0.2.dist-info/entry_points.txt +3 -0
  27. indexify/exceptions.py +0 -3
  28. indexify/extraction_policy.py +0 -75
  29. indexify/extractor_sdk/__init__.py +0 -14
  30. indexify/extractor_sdk/data.py +0 -100
  31. indexify/extractor_sdk/extractor.py +0 -223
  32. indexify/extractor_sdk/utils.py +0 -102
  33. indexify/extractors/__init__.py +0 -0
  34. indexify/extractors/embedding.py +0 -55
  35. indexify/extractors/pdf_parser.py +0 -93
  36. indexify/graph.py +0 -133
  37. indexify/local_runner.py +0 -128
  38. indexify/runner.py +0 -22
  39. indexify/utils.py +0 -7
  40. indexify-0.0.42.dist-info/METADATA +0 -66
  41. indexify-0.0.42.dist-info/RECORD +0 -25
  42. {indexify-0.0.42.dist-info → indexify-0.2.dist-info}/LICENSE.txt +0 -0
  43. {indexify-0.0.42.dist-info → indexify-0.2.dist-info}/WHEEL +0 -0
indexify/graph.py DELETED
@@ -1,133 +0,0 @@
1
- import itertools
2
- import json
3
- from collections import defaultdict
4
- from typing import Any, Dict, List, Optional, Type, Union
5
-
6
- import cloudpickle
7
- from pydantic import BaseModel
8
-
9
- from .extractor_sdk import Content, Extractor, extractor
10
- from .runner import Runner
11
-
12
-
13
- @extractor(description="id function")
14
- def _id(content: Content) -> List[Content]:
15
- return [content]
16
-
17
-
18
- def load_graph(graph: bytes) -> "Graph":
19
- return cloudpickle.loads(graph)
20
-
21
-
22
- class Graph:
23
- def __init__(
24
- self, name: str, input: Type[BaseModel], start_node: extractor, runner: Runner
25
- ):
26
- # TODO check for cycles
27
- self.name = name
28
-
29
- self.nodes: Dict[str, Union[extractor, Extractor]] = {}
30
- self.params: Dict[str, Any] = {}
31
-
32
- self.edges: Dict[str, List[(str, str)]] = defaultdict(list)
33
-
34
- self.nodes["start"] = _id
35
- self.nodes["end"] = _id
36
-
37
- self._topo_counter = defaultdict(int)
38
-
39
- self._start_node = None
40
- self._input = input
41
-
42
- self.runner = runner
43
-
44
- def get_extractor(self, name: str) -> Extractor:
45
- return self.nodes[name]
46
-
47
- def _node(self, extractor: Extractor, params: Any = None) -> "Graph":
48
- name = extractor.name
49
-
50
- # if you've already inserted a node just ignore the new insertion.
51
- if name in self.nodes:
52
- return
53
-
54
- self.nodes[name] = extractor
55
- self.params[name] = extractor.__dict__.get("params", None)
56
-
57
- # assign each node a rank of 1 to init the graph
58
- self._topo_counter[name] = 1
59
-
60
- return self
61
-
62
- def serialize(self):
63
- return cloudpickle.dumps(self)
64
-
65
- def add_edge(
66
- self,
67
- from_node: Type[Extractor],
68
- to_node: Type[Extractor],
69
- prefilter_predicates: Optional[str] = None,
70
- ) -> "Graph":
71
-
72
- self._node(from_node)
73
- self._node(to_node)
74
-
75
- from_node_name = from_node.name
76
- to_node_name = to_node.name
77
-
78
- self.edges[from_node_name].append((to_node_name, prefilter_predicates))
79
-
80
- self._topo_counter[to_node_name] += 1
81
-
82
- return self
83
-
84
- """
85
- Connect nodes as a fan out from one `from_node` to multiple `to_nodes` and respective `prefilter_predicates`.
86
- Note: The user has to match the sizes of the lists to make sure they line up otherwise a None is used as a default.
87
- """
88
-
89
- def steps(
90
- self,
91
- from_node: extractor,
92
- to_nodes: List[extractor],
93
- prefilter_predicates: List[str] = [],
94
- ) -> "Graph":
95
- print(f"{to_nodes}, {prefilter_predicates}, {prefilter_predicates}")
96
- for t_n, p in itertools.zip_longest(
97
- to_nodes, prefilter_predicates, fillvalue=None
98
- ):
99
- self.step(from_node=from_node, to_node=t_n, prefilter_predicates=p)
100
-
101
- return self
102
-
103
- def add_param(self, node: extractor, params: Dict[str, Any]):
104
- try:
105
- # check if the params can be serialized since the server needs this
106
- json.dumps(params)
107
- except Exception:
108
- raise Exception(f"For node {node.name}, cannot serialize params as json.")
109
-
110
- self.params[node.name] = params
111
-
112
- def run(self, wf_input, local):
113
- self._assign_start_node()
114
- self.runner.run(self, wf_input=wf_input)
115
- pass
116
-
117
- def clear_cache_for_node(self, node: Union[extractor, Extractor]):
118
- if node.name not in self.nodes.keys():
119
- raise Exception(f"Node with name {node.name} not found in graph")
120
-
121
- self.runner.deleted_from_memo(node.name)
122
-
123
- def clear_cache_for_all_nodes(self):
124
- for node_name in self.nodes:
125
- self.runner.deleted_from_memo(node_name=node_name)
126
-
127
- def get_result(self, node: Union[extractor, Extractor]) -> Any:
128
- return self.runner.results[node.name]
129
-
130
- def _assign_start_node(self):
131
- # this method should be called before a graph can be run
132
- nodes = sorted(self._topo_counter.items(), key=lambda x: x[1])
133
- self._start_node = nodes[0][0]
indexify/local_runner.py DELETED
@@ -1,128 +0,0 @@
1
- import hashlib
2
- import os
3
- import pickle
4
- import shutil
5
- from collections import defaultdict
6
- from pathlib import Path
7
- from typing import Any, Callable, Dict, Optional, Union
8
-
9
- from indexify.extractor_sdk.data import BaseData, Feature
10
- from indexify.extractor_sdk.extractor import Extractor, extractor
11
- from indexify.graph import Graph
12
- from indexify.runner import Runner
13
-
14
-
15
- class LocalRunner(Runner):
16
- def __init__(self):
17
- self.results: Dict[str, Any] = defaultdict(
18
- list
19
- ) # TODO should the Any be Content?
20
-
21
- def run(self, g, wf_input: BaseData):
22
- return self._run(g, _input=wf_input, node_name=g._start_node)
23
-
24
- # graph is getting some files which are files, some lables and the MIME type of the bytes
25
- # those bytes have to be a python type
26
-
27
- # _input needs to be serializable into python object (ie json for ex) and Feature
28
- def _run(self, g: Graph, _input: BaseData, node_name: str):
29
- print(f"---- Starting node {node_name}")
30
- print(f"node_name {node_name}")
31
-
32
- extractor_construct: Callable = g.nodes[node_name]
33
- params = g.params.get(node_name, None)
34
-
35
- # NOTE: User should clear cache for nodes they would like to re-rerun
36
- input_hash = hashlib.sha256(str(_input).encode()).hexdigest()
37
- memo_output = self.get_from_memo(node_name, input_hash)
38
- if memo_output is None:
39
- print("=== FYI Writing output to cache")
40
- res = extractor_construct().extract(input=_input, params=params)
41
- self.put_into_memo(node_name, input_hash, pickle.dumps(res))
42
- else:
43
- print("=== Reading output from cache")
44
- res = pickle.loads(memo_output)
45
-
46
- if not isinstance(res, list):
47
- res = [res]
48
-
49
- res_data = [i for i in res if not isinstance(i, Feature)]
50
- res_features = [i for i in res if isinstance(i, Feature)]
51
-
52
- self.results[node_name].extend(res_data)
53
-
54
- for f in res_features:
55
- _input.meta[f.name] = f.value
56
-
57
- # this assume that if an extractor emits features then the next edge will always process
58
- # the edges
59
- data_to_process = res_data
60
- if len(res_features) > 0:
61
- data_to_process.append(_input)
62
-
63
- for out_edge, pre_filter_predicate in g.edges[node_name]:
64
- # TODO there are no reductions yet, each recursion finishes it's path and returns
65
- for r in data_to_process:
66
- if self._prefilter_content(
67
- content=r, prefilter_predicate=pre_filter_predicate
68
- ):
69
- continue
70
-
71
- self._run(g, _input=r, node_name=out_edge)
72
-
73
- """
74
- Returns True if content should be filtered
75
- """
76
-
77
- def _prefilter_content(
78
- self, content: BaseData, prefilter_predicate: Optional[str]
79
- ) -> bool:
80
- if prefilter_predicate is None:
81
- return False
82
-
83
- atoms = prefilter_predicate.split("and")
84
- if len(atoms) == 0:
85
- return False
86
-
87
- # TODO For now only support `and` and `=` and `string values`
88
- bools = []
89
- metadata = content.get_features()["metadata"]
90
- for atom in atoms:
91
- l, r = atom.split("=")
92
- if l in metadata:
93
- bools.append(metadata[l] != r)
94
-
95
- return all(bools)
96
-
97
- def get_result(self, node: Union[extractor, Extractor]) -> Any:
98
- node_name = node.name
99
- return self.results[node_name]
100
-
101
- def deleted_from_memo(self, node_name):
102
- path_prefix = f"./indexify_local_runner_cache/{node_name}"
103
-
104
- if os.path.exists(path_prefix) and os.path.isdir(path_prefix):
105
- shutil.rmtree(path_prefix)
106
-
107
- def get_from_memo(self, node_name, input_hash):
108
- path_prefix = f"./indexify_local_runner_cache/{node_name}"
109
- file_name = f"{input_hash}"
110
- file_path = f"{path_prefix}/{file_name}"
111
-
112
- if not os.path.exists(file_path):
113
- return None
114
-
115
- with open(file_path, "rb") as f:
116
- return f.read()
117
-
118
- def put_into_memo(self, node_name, input_hash, output):
119
- path_prefix = f"./indexify_local_runner_cache/{node_name}"
120
- file_name = f"{input_hash}"
121
- file_path = f"{path_prefix}/{file_name}"
122
-
123
- os.makedirs(path_prefix, exist_ok=True)
124
-
125
- Path(file_path).touch()
126
-
127
- with open(file_path, "wb") as f:
128
- return f.write(output)
indexify/runner.py DELETED
@@ -1,22 +0,0 @@
1
- from abc import ABC
2
- from typing import Any, Union
3
-
4
- from indexify.extractor_sdk.data import BaseData
5
- from indexify.extractor_sdk.extractor import Extractor, extractor
6
-
7
-
8
- class Runner(ABC):
9
- def run(self, g, wf_input: BaseData):
10
- raise NotImplementedError()
11
-
12
- def get_result(self, node: Union[extractor, Extractor]) -> Any:
13
- raise NotImplementedError()
14
-
15
- def deleted_from_memo(self, node_name):
16
- raise NotImplementedError()
17
-
18
- def get_from_memo(self, node_name, input_hash):
19
- raise NotImplementedError()
20
-
21
- def put_into_memo(self, node_name, input_hash, output):
22
- raise NotImplementedError()
indexify/utils.py DELETED
@@ -1,7 +0,0 @@
1
- from enum import Enum
2
-
3
-
4
- def json_set_default(obj):
5
- if isinstance(obj, set):
6
- return list(obj)
7
- raise TypeError
@@ -1,66 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: indexify
3
- Version: 0.0.42
4
- Summary: Python Client for Indexify
5
- Home-page: https://github.com/tensorlakeai/indexify
6
- License: Apache 2.0
7
- Author: Diptanu Gon Choudhury
8
- Author-email: diptanuc@gmail.com
9
- Requires-Python: >=3.9,<4.0
10
- Classifier: License :: Other/Proprietary License
11
- Classifier: Programming Language :: Python :: 3
12
- Classifier: Programming Language :: Python :: 3.9
13
- Classifier: Programming Language :: Python :: 3.10
14
- Classifier: Programming Language :: Python :: 3.11
15
- Classifier: Programming Language :: Python :: 3.12
16
- Requires-Dist: cloudpickle (>=3,<4)
17
- Requires-Dist: httpx[http2] (>=0,<1)
18
- Requires-Dist: pydantic (>=2.8,<3.0)
19
- Requires-Dist: pyyaml (>=6,<7)
20
- Project-URL: Repository, https://github.com/tensorlakeai/indexify
21
- Description-Content-Type: text/markdown
22
-
23
- # Indexify Python Client
24
-
25
-
26
- [![PyPI version](https://badge.fury.io/py/indexify.svg)](https://badge.fury.io/py/indexify)
27
-
28
- ## Installation
29
-
30
- This is the Python client for interacting with the Indexify service.
31
-
32
- To install it, simply run:
33
-
34
- ```shell
35
- pip install indexify
36
- ```
37
-
38
- ## Usage
39
-
40
- See the [getting started](https://docs.getindexify.com/getting_started/) guide for examples of how to use the client.
41
- Look at the [examples](https://github.com/tensorlakeai/indexify/tree/main/examples) directory for more examples.
42
-
43
- ## Development
44
-
45
- To install the client from this repository for development:
46
-
47
- ```shell
48
- cd "path to this repository"
49
- pip install -e .
50
- ```
51
-
52
- Install and run the `poetry` package manager:
53
-
54
- ```shell
55
- pip install poetry
56
- poetry install
57
- ```
58
-
59
- More information at [https://python-poetry.org/docs/](https://python-poetry.org/docs/).
60
-
61
-
62
-
63
- ### Environment Variables
64
-
65
- IndexifyClient uses httpx under the hood, so there are many environment variables that can be used to configure the client. More information on supported environment variables can be found [here](https://www.python-httpx.org/environment_variables/).
66
-
@@ -1,25 +0,0 @@
1
- indexify/__init__.py,sha256=e4s2395B3gEGrZk2u5OZO2RtrXYFYUTItaM3mtlusBE,493
2
- indexify/base_client.py,sha256=HwT2KJNq8j-KiPVA9RJm-yearSjxifRjXTcP1zUVeo8,2784
3
- indexify/client.py,sha256=p4WDmYR94DjU0EqosuCKNGjbfh11qUID6TxDhTK6Uk4,26001
4
- indexify/data_loaders/__init__.py,sha256=Y5NEuseTcYAICRiweYw5wBQ2m2YplbsY21I7df-rdi4,1339
5
- indexify/data_loaders/local_directory_loader.py,sha256=fCrgj5drnW71ZUdDDvcB1-VJjIs1w6Q8sEW0HSGSAiA,1247
6
- indexify/data_loaders/url_loader.py,sha256=32SERljcq1Xsi4RdLz2dgyk2TER5pQPTtXl3gUzwHbY,1533
7
- indexify/error.py,sha256=3umTeYb0ugtUyehV1ibfvaeACxAONPyWPc-1HRN4d1M,856
8
- indexify/exceptions.py,sha256=vjd5SPPNFIEW35GorSIodsqvm9RKHQm9kdp8t9gv-WM,111
9
- indexify/extraction_policy.py,sha256=927BBtZBDPsLMm01uQDPCZnj3Pwmjh6L6QLHb4ShQKk,2076
10
- indexify/extractor_sdk/__init__.py,sha256=DOL-wJvIspWPqjFRBpmhMbnsMZC2JY-NtNwQGiE6IqU,348
11
- indexify/extractor_sdk/data.py,sha256=JpX9WdTpiuK72wn6QYhtqj5p5JiJu4waBrK-Hi7lNsA,2742
12
- indexify/extractor_sdk/extractor.py,sha256=IEZvr1Qe-dVmTgAeJFAhEyHUW20n4uTEeEassH3C5j4,9858
13
- indexify/extractor_sdk/utils.py,sha256=bW_D2eMWTzcAYZ8Lv7LUKGgOD0cyW77E6gNO3y7iNNA,6234
14
- indexify/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
- indexify/extractors/embedding.py,sha256=Be6X4odSHbkAEm2myxB04RN-Mvb2bFk8uWXxUpY-Z6E,1859
16
- indexify/extractors/pdf_parser.py,sha256=DwHXVbdy-3SDPOo1U_tdp6g5NWs0Tde5h0GoMGTi1WA,2787
17
- indexify/graph.py,sha256=UdvrpNc-SdD3U27Ee9aTMMYcSOUz__WQWc31oFHV4yQ,3963
18
- indexify/local_runner.py,sha256=uuMJbnT4qYMSySxsB3lEC7FSjYnJFh5eNZ00zu5gLNw,4387
19
- indexify/runner.py,sha256=VVmLGF1kAmEuE461Hs0QJFnSvVWtUzYhhQfB1KptYPU,637
20
- indexify/settings.py,sha256=LSaWZ0ADIVmUv6o6dHWRC3-Ry5uLbCw2sBSg1e_U7UM,99
21
- indexify/utils.py,sha256=rDN2lrsAs9noJEIjfx6ukmC2SAIyrlUt7QU-kaBjujM,125
22
- indexify-0.0.42.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
23
- indexify-0.0.42.dist-info/METADATA,sha256=kA0oqiewM71YLHKk-cXX-nR7ERJDkC9rVZqjF2LX_zI,1913
24
- indexify-0.0.42.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
25
- indexify-0.0.42.dist-info/RECORD,,