indexify 0.0.39.tar.gz → 0.0.40.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {indexify-0.0.39 → indexify-0.0.40}/PKG-INFO +2 -1
  2. {indexify-0.0.39 → indexify-0.0.40}/indexify/__init__.py +2 -0
  3. {indexify-0.0.39 → indexify-0.0.40}/indexify/data_loaders/url_loader.py +0 -1
  4. {indexify-0.0.39 → indexify-0.0.40}/indexify/extractor_sdk/data.py +1 -1
  5. {indexify-0.0.39 → indexify-0.0.40}/indexify/extractor_sdk/extractor.py +7 -27
  6. indexify-0.0.40/indexify/extractors/__init__.py +0 -0
  7. indexify-0.0.40/indexify/extractors/embedding.py +53 -0
  8. indexify-0.0.39/indexify/run_graph.py → indexify-0.0.40/indexify/graph.py +19 -12
  9. {indexify-0.0.39 → indexify-0.0.40}/indexify/local_runner.py +2 -2
  10. {indexify-0.0.39 → indexify-0.0.40}/pyproject.toml +2 -1
  11. indexify-0.0.39/indexify/graph.py +0 -23
  12. {indexify-0.0.39 → indexify-0.0.40}/LICENSE.txt +0 -0
  13. {indexify-0.0.39 → indexify-0.0.40}/README.md +0 -0
  14. {indexify-0.0.39 → indexify-0.0.40}/indexify/base_client.py +0 -0
  15. {indexify-0.0.39 → indexify-0.0.40}/indexify/client.py +0 -0
  16. {indexify-0.0.39 → indexify-0.0.40}/indexify/data_loaders/__init__.py +0 -0
  17. {indexify-0.0.39 → indexify-0.0.40}/indexify/data_loaders/local_directory_loader.py +0 -0
  18. {indexify-0.0.39 → indexify-0.0.40}/indexify/error.py +0 -0
  19. {indexify-0.0.39 → indexify-0.0.40}/indexify/exceptions.py +0 -0
  20. {indexify-0.0.39 → indexify-0.0.40}/indexify/extraction_policy.py +0 -0
  21. {indexify-0.0.39 → indexify-0.0.40}/indexify/extractor_sdk/__init__.py +0 -0
  22. {indexify-0.0.39 → indexify-0.0.40}/indexify/extractor_sdk/utils.py +0 -0
  23. {indexify-0.0.39 → indexify-0.0.40}/indexify/runner.py +0 -0
  24. {indexify-0.0.39 → indexify-0.0.40}/indexify/settings.py +0 -0
  25. {indexify-0.0.39 → indexify-0.0.40}/indexify/utils.py +0 -0

{indexify-0.0.39 → indexify-0.0.40}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: indexify
-Version: 0.0.39
+Version: 0.0.40
 Summary: Python Client for Indexify
 Home-page: https://github.com/tensorlakeai/indexify
 License: Apache 2.0
@@ -13,6 +13,7 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Requires-Dist: cloudpickle (>=3,<4)
 Requires-Dist: httpx[http2] (>=0,<1)
 Requires-Dist: pydantic (>=2.8,<3.0)
 Requires-Dist: pyyaml (>=6,<7)

{indexify-0.0.39 → indexify-0.0.40}/indexify/__init__.py

@@ -9,9 +9,11 @@ from .client import (
 from . import extractor_sdk
 from .settings import DEFAULT_SERVICE_URL
 from . import data_loaders
+from .graph import Graph
 
 __all__ = [
     "data_loaders",
+    "Graph",
     "Document",
     "extractor_sdk",
     "IndexifyClient",

{indexify-0.0.39 → indexify-0.0.40}/indexify/data_loaders/url_loader.py

@@ -1,7 +1,6 @@
 from . import DataLoader, FileMetadata
 from typing import List
 import httpx
-import hashlib
 import email.utils
 
 

{indexify-0.0.39 → indexify-0.0.40}/indexify/extractor_sdk/data.py

@@ -103,6 +103,6 @@ class ContentMetadata(BaseModel):
     )
 
 
-class PDFFile(BaseData):
+class File(BaseData):
     data: bytes
     mime_type: str
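
The rename from PDFFile to File generalizes the payload type beyond PDFs. A minimal sketch, assuming BaseData adds no required fields of its own:

from indexify.extractor_sdk.data import File  # was PDFFile in 0.0.39

pdf = File(data=b"%PDF-1.7 ...", mime_type="application/pdf")
png = File(data=b"\x89PNG ...", mime_type="image/png")  # no longer PDF-specific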

{indexify-0.0.39 → indexify-0.0.40}/indexify/extractor_sdk/extractor.py

@@ -10,7 +10,8 @@ import requests
 
 class EmbeddingSchema(BaseModel):
     dim: int
-    distance: str = "cosine"
+    distance: Optional[str] = "cosine"
+    database_url: Optional[str] = None
 
 class ExtractorMetadata(BaseModel):
     name: str
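
Both widened fields are optional, so existing call sites keep validating. A short sketch of the new schema; the Postgres URL is a made-up example:

from indexify.extractor_sdk.extractor import EmbeddingSchema

minimal = EmbeddingSchema(dim=384)  # 0.0.39-style construction still works

routed = EmbeddingSchema(           # 0.0.40 can also point an index at a database
    dim=384,
    distance="cosine",
    database_url="postgresql://localhost:5432/indexify",  # hypothetical URL
)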

@@ -40,8 +41,8 @@ class Extractor(ABC):
 
     input_mime_types = ["text/plain"]
 
-    embeddings: Dict[str, EmbeddingSchema] = {}
-
+    embedding_indexes: Dict[str, EmbeddingSchema] = {}
+
     @abstractmethod
     def extract(
         self, input: Type[BaseModel], params: Type[BaseModel] = None
@@ -55,31 +56,9 @@ class Extractor(ABC):
         pass
 
     @classmethod
-    @abstractmethod
     def sample_input(cls) -> Tuple[Content, Type[BaseModel]]:
         pass
 
-    def describe(self) -> ExtractorMetadata:
-        embedding_schemas = {}
-        try:
-            embedding_schemas = self.embedding_schemas
-        except NotImplementedError:
-            pass
-
-        json_schema = (
-            self._param_cls.model_json_schema() if self._param_cls is not None else None
-        )
-        return ExtractorMetadata(
-            name=self.name,
-            version=self.version,
-            description=self.description,
-            system_dependencies=self.system_dependencies,
-            python_dependencies=self.python_dependencies,
-            input_mime_types=self.input_mime_types,
-            embedding_schemas=embedding_schemas,
-            input_params=json.dumps(json_schema),
-        )
-
     def _download_file(self, url, filename):
         if os.path.exists(filename):
             # file exists skip
@@ -190,7 +169,7 @@ def extractor(
     python_dependencies: Optional[List[str]] = None,
     system_dependencies: Optional[List[str]] = None,
     input_mime_types: Optional[List[str]] = None,
-    embedding_schemas: Optional[Dict[str, EmbeddingSchema]] = None,
+    embedding_indexes: Optional[Dict[str, EmbeddingSchema]] = None,
     sample_content: Optional[Callable] = None,
 ):
     args = locals()
@@ -198,7 +177,7 @@ def extractor(
 
     def construct(fn):
         def wrapper():
-            hint = get_type_hints(fn).get("params", dict)
+            description = fn.__doc__ or args.get("description", "")
 
             if not args.get("name"):
                 args[
@@ -220,6 +199,7 @@ def extractor(
 
             for key, val in args.items():
                 setattr(DecoratedFn, key, val)
+            DecoratedFn.description = description
 
             return DecoratedFn
 
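
Net effect of the wrapper changes: the decorator no longer inspects a params type hint, and a decorated function's docstring now feeds its description. A hedged sketch of the intended behavior:

from typing import List

from indexify.extractor_sdk import Content, extractor


@extractor()  # no description argument supplied
def summarize(content: Content) -> List[Content]:
    """Summarizes a document."""  # per the diff, fn.__doc__ becomes the description
    return [content]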

indexify-0.0.40/indexify/extractors/__init__.py (new, empty file)

indexify-0.0.40/indexify/extractors/embedding.py (new)

@@ -0,0 +1,53 @@
+from typing import List
+
+from indexify.extractor_sdk.data import Feature
+import torch
+import torch.nn.functional as F
+from transformers import AutoModel, AutoTokenizer
+from indexify.extractor_sdk.extractor import Extractor , Feature
+
+class SentenceTransformersEmbedding:
+    def __init__(self, model_name) -> None:
+        self._model_name = model_name
+        self._tokenizer = AutoTokenizer.from_pretrained(
+            f"sentence-transformers/{model_name}"
+        )
+        self._model = AutoModel.from_pretrained(
+            f"sentence-transformers/{model_name}", torchscript=True
+        )
+        self._model.eval()
+
+    def embed_batch(self, inputs: List[str]) -> List[List[float]]:
+        result = self._embed(inputs)
+        return result.tolist()
+
+    def embed(self, query: str) -> List[float]:
+        result = self._embed([query])
+        return result[0].tolist()
+
+    def _embed(self, inputs: List[str]) -> torch.Tensor:
+        encoded_input = self._tokenizer(
+            inputs, padding=True, truncation=True, return_tensors="pt"
+        )
+        sentence_embeddings = self._model(**encoded_input)
+        return F.normalize(sentence_embeddings, p=2, dim=1)
+
+class BasicSentenceTransformerModels(Extractor):
+
+    def __init__(self, model: str):
+        super().__init__()
+        self.model = SentenceTransformersEmbedding(model)
+
+    def extract(self, input: str) -> List[Feature]:
+        embeddings = self.model.embed(input)
+        return [Feature.embedding(values=embeddings)]
+
+class BasicHFTransformerEmbeddingModels(Extractor):
+
+    def __init__(self, model: str):
+        super().__init__()
+        self._model = AutoModel.from_pretrained(model, trust_remote_code=True)
+
+    def extract(self, input: str) -> List[Feature]:
+        embeddings = self.model.embed_query(input)
+        return [Feature.embedding(values=embeddings)]
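
A usage sketch of the new module's intended API surface. Note that torch and transformers are imported here but are not declared in pyproject.toml, so they must be installed separately; the model name is only an example:

from indexify.extractors.embedding import SentenceTransformersEmbedding

# Fetches sentence-transformers/all-MiniLM-L6-v2 from the Hugging Face Hub
# on first use.
st = SentenceTransformersEmbedding("all-MiniLM-L6-v2")

single = st.embed("hello world")            # List[float]
batch = st.embed_batch(["hello", "world"])  # List[List[float]]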

indexify-0.0.39/indexify/run_graph.py → indexify-0.0.40/indexify/graph.py

@@ -1,13 +1,12 @@
 import json
-
-from .extractor_sdk import Content, extractor, Extractor
-
+import itertools
 from collections import defaultdict
 from typing import Any, Dict, List, Optional, Type, Union
-from pydantic import BaseModel
 
-import itertools
+import cloudpickle
+from pydantic import BaseModel
 
+from .extractor_sdk import Content, extractor, Extractor
 from .runner import Runner
 
 @extractor(description="id function")
@@ -15,7 +14,10 @@ def _id(content: Content) -> List[Content]:
     return [content]
 
 
-class RunGraph:
+def load_graph(graph: bytes) -> 'Graph':
+    return cloudpickle.loads(graph)
+
+class Graph:
     def __init__(self, name: str, input: Type[BaseModel], start_node: extractor, runner: Runner):
         # TODO check for cycles
         self.name = name
@@ -35,7 +37,10 @@ class RunGraph:
 
         self.runner = runner
 
-    def _node(self, extractor: Union[extractor, Extractor], params: Any = None) -> 'RunGraph':
+    def get_extractor(self, name: str) -> Extractor:
+        return self.nodes[name]
+
+    def _node(self, extractor: Extractor, params: Any = None) -> 'Graph':
         name = extractor.name
 
         # if you've already inserted a node just ignore the new insertion.
@@ -49,13 +54,16 @@ class RunGraph:
             self._topo_counter[name] = 1
 
         return self
+
+    def serialize(self):
+        return cloudpickle.dumps(self)
 
     def add_edge(
         self,
-        from_node: extractor,
-        to_node: extractor,
+        from_node: Type[Extractor],
+        to_node: Type[Extractor],
         prefilter_predicates: Optional[str] = None,
-    ) -> 'RunGraph':
+    ) -> 'Graph':
 
         self._node(from_node)
         self._node(to_node)
@@ -79,7 +87,7 @@ class RunGraph:
         from_node: extractor,
         to_nodes: List[extractor],
         prefilter_predicates: List[str] = [],
-    ) -> 'RunGraph':
+    ) -> 'Graph':
         print(f"{to_nodes}, {prefilter_predicates}, {prefilter_predicates}")
         for t_n, p in itertools.zip_longest(
             to_nodes, prefilter_predicates, fillvalue=None
@@ -99,7 +107,6 @@ class RunGraph:
 
     def run(self, wf_input, local):
         self._assign_start_node()
-        # self.runner = LocalRunner()
         self.runner.run(self, wf_input=wf_input)
         pass
 
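
serialize() and the module-level load_graph() form a cloudpickle round trip. A minimal sketch, reusing the kind of hypothetical graph shown earlier:

from typing import List

from pydantic import BaseModel

from indexify.extractor_sdk import Content, extractor
from indexify.graph import Graph, load_graph
from indexify.local_runner import LocalRunner


class Doc(BaseModel):  # hypothetical input model
    text: str


@extractor(description="id function")
def ident(content: Content) -> List[Content]:
    return [content]


g = Graph(name="roundtrip", input=Doc, start_node=ident, runner=LocalRunner())
blob = g.serialize()         # cloudpickle.dumps(self)
restored = load_graph(blob)  # cloudpickle.loads(blob)
assert restored.name == "roundtrip"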

{indexify-0.0.39 → indexify-0.0.40}/indexify/local_runner.py

@@ -10,7 +10,7 @@ from indexify.extractor_sdk.extractor import extractor, Extractor
 from collections import defaultdict
 from typing import Any, Callable, Dict, Optional, Union
 
-from indexify.run_graph import RunGraph
+from indexify.graph import Graph
 from indexify.runner import Runner
 
 
@@ -27,7 +27,7 @@ class LocalRunner(Runner):
     # those bytes have to be a python type
 
     # _input needs to be serializable into python object (ie json for ex) and Feature
-    def _run(self, g: RunGraph, _input: BaseData, node_name: str):
+    def _run(self, g: Graph, _input: BaseData, node_name: str):
         print(f"---- Starting node {node_name}")
         print(f'node_name {node_name}')
 

{indexify-0.0.39 → indexify-0.0.40}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "indexify"
-version = "0.0.39"
+version = "0.0.40"
 description = "Python Client for Indexify"
 authors = ["Diptanu Gon Choudhury <diptanuc@gmail.com>", "Lucas Jackson <lucas@tensorlake.ai>", "Vijay Parthasarathy <vijay2win@gmail.com>"]
 license = "Apache 2.0"
@@ -13,6 +13,7 @@ python = "^3.9"
 httpx = { version = "^0", extras = ["http2"] }
 pyyaml = "^6"
 pydantic = "^2.8"
+cloudpickle = "^3"
 
 [tool.poetry.dev-dependencies]
 black = "^22.3.0"
@@ -1,23 +0,0 @@
1
- from .extractor_sdk import extractor, Extractor
2
-
3
- from typing import Type, Union
4
- from pydantic import BaseModel
5
-
6
- from .run_graph import RunGraph
7
- from .local_runner import LocalRunner
8
-
9
-
10
- def Graph(
11
- name: str,
12
- input: Type[BaseModel],
13
- start_node: Union[extractor, Extractor],
14
- run_local: bool,
15
- ) -> RunGraph:
16
-
17
- if run_local:
18
- runner = LocalRunner()
19
- else:
20
- raise NotImplementedError("Remote runner not supported yet")
21
-
22
- graph = RunGraph(name=name, input=input, start_node=start_node, runner=runner)
23
- return graph