indexify 0.0.36__tar.gz → 0.0.37__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: indexify
3
- Version: 0.0.36
3
+ Version: 0.0.37
4
4
  Summary: Python Client for Indexify
5
5
  Home-page: https://github.com/tensorlakeai/indexify
6
6
  License: Apache 2.0
@@ -549,7 +549,7 @@ class IndexifyClient:
549
549
  labels_filter: List[str] = [],
550
550
  start_id: str = "",
551
551
  limit: int = 10,
552
- ) -> List[Content]:
552
+ ) -> List[ContentMetadata]:
553
553
  """
554
554
  List content in the current namespace.
555
555
 
@@ -572,7 +572,7 @@ class IndexifyClient:
572
572
  content_list = response.json()["content_list"]
573
573
  content = []
574
574
  for item in content_list:
575
- content.append(Content.from_dict(item))
575
+ content.append(ContentMetadata.from_dict(item))
576
576
  return content
577
577
 
578
578
  def upload_file(
@@ -58,7 +58,7 @@ class Content(BaseModel):
58
58
 
59
59
  m, _ = mimetypes.guess_type(path)
60
60
  with open(path, "rb") as f:
61
- return cls(content_type=m, data=f.read())
61
+ return cls(id="none-for-now", content_type=m, data=f.read())
62
62
 
63
63
 
64
64
  class ContentMetadata(BaseModel):
@@ -22,8 +22,8 @@ class Extractor(ABC):
22
22
  input_mime_types = ["text/plain"]
23
23
 
24
24
  def extract(
25
- self, content: Content, params: Type[BaseModel] = None
26
- ) -> List[Union[Feature, Content]]:
25
+ self, input: Type[BaseModel], params: Type[BaseModel] = None
26
+ ) -> List[Union[Feature, Type[BaseModel]]]:
27
27
  """
28
28
  Extracts information from the content. Returns a list of features to add
29
29
  to the content.
@@ -33,8 +33,8 @@ class Extractor(ABC):
33
33
  pass
34
34
 
35
35
  def extract_batch(
36
- self, content_list: List[Content], params: List[Type[BaseModel]] = None
37
- ) -> List[List[Union[Feature, Content]]]:
36
+ self, input_list: List[Type[BaseModel]], params: List[Type[BaseModel]] = None
37
+ ) -> List[List[Union[Feature, Type[BaseModel]]]]:
38
38
  """
39
39
  Extracts information from the content. Returns a list of features to add
40
40
  to the content.
@@ -99,13 +99,13 @@ def extractor(
99
99
 
100
100
  class DecoratedFn(Extractor):
101
101
  @classmethod
102
- def extract(cls, content: Content, params: hint) -> List[Content]: # type: ignore
102
+ def extract(cls, input: Type[BaseModel], params: Type[BaseModel]=None) -> List[Content]: # type: ignore
103
103
  # TODO we can force all the functions to take in a parms object
104
104
  # or check if someone adds a params
105
105
  if params is None:
106
- return fn(content)
106
+ return fn(input)
107
107
  else:
108
- return fn(content, params)
108
+ return fn(input, params)
109
109
 
110
110
  def sample_input(self) -> Content:
111
111
  return sample_content() if sample_content else self.sample_text()
@@ -115,6 +115,8 @@ def extractor(
115
115
 
116
116
  return DecoratedFn
117
117
 
118
+ wrapper._extractor_name = fn.__name__
119
+
118
120
  return wrapper
119
121
 
120
122
  return construct
@@ -0,0 +1,80 @@
1
+ from indexify import Content, extractor
2
+ from indexify.extractor import Extractor
3
+
4
+ from collections import defaultdict
5
+ from typing import Any, Callable, Dict, List, Optional, Self
6
+
7
+ import itertools
8
+
9
+
10
+ @extractor(description="id function")
11
+ def _id(content: Content) -> List[Content]:
12
+ return [content]
13
+
14
+ class Graph:
15
+ def __init__(self, name: str):
16
+ # TODO check for cycles
17
+ self.name = name
18
+
19
+ self.nodes: Dict[str, Callable] = {}
20
+ self.params: Dict[str, Any] = {}
21
+
22
+ self.edges: Dict[str, List[(str, str)]] = defaultdict(list)
23
+
24
+ self.results: Dict[str, Any] = defaultdict(list) # TODO should the Any be Content?
25
+
26
+ self.nodes["start"] = _id
27
+ self.nodes["end"] = _id
28
+
29
+ self._topo_counter = defaultdict(int)
30
+
31
+ self._start_node = None
32
+
33
+ def _node(self, extractor: Extractor, params: Any = None) -> Self:
34
+ name = extractor._extractor_name
35
+
36
+ # if you've already inserted a node just ignore the new insertion.
37
+ if name in self.nodes:
38
+ return
39
+
40
+ self.nodes[name] = extractor
41
+ self.params[name] = extractor.__dict__.get('params', None)
42
+
43
+ # assign each node a rank of 1 to init the graph
44
+ self._topo_counter[name] = 1
45
+
46
+ return self
47
+
48
+ def step(self,
49
+ from_node: extractor,
50
+ to_node: extractor,
51
+ prefilter_predicates: Optional[str] = None
52
+ ) -> Self:
53
+
54
+ self._node(from_node)
55
+ self._node(to_node)
56
+
57
+ from_node_name = from_node._extractor_name
58
+ to_node_name = to_node._extractor_name
59
+
60
+ self.edges[from_node_name].append((to_node_name, prefilter_predicates))
61
+
62
+ self._topo_counter[to_node_name] += 1
63
+
64
+ return self
65
+
66
+ """
67
+ Connect nodes as a fan out from one `from_node` to multiple `to_nodes` and respective `prefilter_predicates`.
68
+ Note: The user has to match the sizes of the lists to make sure they line up otherwise a None is used as a default.
69
+ """
70
+ def steps(self, from_node: extractor, to_nodes: List[extractor], prefilter_predicates: List[str] = []) -> Self:
71
+ print(f'{to_nodes}, {prefilter_predicates}, {prefilter_predicates}')
72
+ for t_n, p in itertools.zip_longest(to_nodes, prefilter_predicates, fillvalue=None):
73
+ self.step(from_node=from_node, to_node=t_n, prefilter_predicates=p)
74
+
75
+ return self
76
+
77
+ def _assign_start_node(self):
78
+ # this method should be called before a graph can be run
79
+ nodes = sorted(self._topo_counter.items(), key=lambda x: x[1])
80
+ self._start_node = nodes[0][0]
@@ -1,8 +1,10 @@
1
- from indexify import Content
1
+ from indexify import Content, Extractor
2
2
 
3
3
  from collections import defaultdict
4
4
  from typing import Any, Callable, Dict, Optional
5
5
 
6
+ import json
7
+
6
8
  class LocalRunner:
7
9
  def __init__(self):
8
10
  self.results: Dict[str, Any] = defaultdict(list) # TODO should the Any be Content?
@@ -15,6 +17,8 @@ class LocalRunner:
15
17
  extractor_construct: Callable = g.nodes[node_name]
16
18
  params = g.params.get(node_name, None)
17
19
 
20
+ print(f"----Starting {node_name}")
21
+
18
22
  res = extractor_construct().extract(content=content, params=params)
19
23
 
20
24
  self.results[node_name].extend(res)
@@ -27,27 +31,35 @@ class LocalRunner:
27
31
 
28
32
  self._run(g, content=r, node_name=out_edge)
29
33
 
34
+ """
35
+ Returns True if content should be filtered
36
+ """
30
37
  def _prefilter_content(self, content: Content, prefilter_predicate: Optional[str]) -> bool:
31
38
  if prefilter_predicate is None:
32
39
  return False
33
40
 
34
41
  atoms = prefilter_predicate.split('and')
35
- if len(atoms) == 0 or len(atoms) == 1:
42
+ if len(atoms) == 0:
36
43
  return False
37
44
 
38
45
  # TODO For now only support `and` and `=` and `string values`
39
46
  bools = []
40
47
  for feature in content.features:
41
48
  if feature.feature_type == 'metadata':
42
- values = feature.value
49
+ predicates = json.loads(feature.value)
50
+
51
+ print(f"predicates {predicates}")
43
52
 
44
- print(f'{prefilter_predicate, atoms}')
45
53
  for atom in atoms:
46
54
  l, r = atom.split('=')
47
- if l in values:
48
- bools.append(values[l] == r)
55
+ if l in predicates:
56
+ print(f'predicates[l], r: {predicates[l], r}')
57
+ bools.append(predicates[l] != r)
58
+
59
+ print(bools)
49
60
 
50
61
  return all(bools)
51
62
 
52
- def get_result(self, node_name: str) -> Content:
63
+ def get_result(self, node: Extractor) -> Content:
64
+ node_name = node._extractor_name
53
65
  return self.results[node_name]
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "indexify"
3
- version = "0.0.36"
3
+ version = "0.0.37"
4
4
  description = "Python Client for Indexify"
5
5
  authors = ["Diptanu Gon Choudhury <diptanuc@gmail.com>", "Lucas Jackson <lucas@tensorlake.ai>", "Vijay Parthasarathy <vijay2win@gmail.com>"]
6
6
  license = "Apache 2.0"
@@ -1,49 +0,0 @@
1
- from indexify import Content, extractor
2
- from indexify.extractor import Extractor
3
-
4
- from collections import defaultdict
5
- from typing import Any, Callable, Dict, List, Optional
6
-
7
-
8
- @extractor(description="id function")
9
- def _id(content: Content) -> List[Content]:
10
- return [content]
11
-
12
- class Graph:
13
- def __init__(self, name: str):
14
- # TODO check for cycles
15
- self.name = name
16
-
17
- self.nodes: Dict[str, Callable] = {}
18
- self.params: Dict[str, Any] = {}
19
-
20
- self.edges: Dict[str, List[(str, str)]] = defaultdict(list)
21
-
22
- self.results: Dict[str, Any] = defaultdict(list) # TODO should the Any be Content?
23
-
24
- self.nodes["start"] = _id
25
- self.nodes["end"] = _id
26
-
27
- self._topo_counter = defaultdict(int)
28
-
29
- self._start_node = None
30
-
31
- def node(self, name: str, closure: Extractor, params: Any = None) -> None:
32
- if name in self.nodes:
33
- raise Exception(f"Cannot insert node, node with name: `{name}` already exists")
34
-
35
- self.nodes[name] = closure
36
- self.params[name] = params
37
-
38
- # assign each node a rank of 1 to init the graph
39
- self._topo_counter[name] = 1
40
-
41
- def edge(self, from_node: str, to_node: str, prefilter_predicates: Optional[str] = None) -> None:
42
- self.edges[from_node].append((to_node, prefilter_predicates))
43
-
44
- self._topo_counter[to_node] += 1
45
-
46
- def _assign_start_node(self):
47
- # this method should be called before a graph can be run
48
- nodes = sorted(self._topo_counter.items(), key=lambda x: x[1])
49
- self._start_node = nodes[0][0]
File without changes
File without changes
File without changes
File without changes