indexify 0.0.39__py3-none-any.whl → 0.0.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
indexify/__init__.py CHANGED
@@ -1,17 +1,17 @@
-from .client import IndexifyClient
-from .extraction_policy import ExtractionGraph
+from . import data_loaders, extractor_sdk
 from .client import (
-    IndexifyClient,
     Document,
+    IndexifyClient,
     generate_hash_from_string,
     generate_unique_hex_id,
 )
-from . import extractor_sdk
+from .extraction_policy import ExtractionGraph
+from .graph import Graph
 from .settings import DEFAULT_SERVICE_URL
-from . import data_loaders

 __all__ = [
     "data_loaders",
+    "Graph",
     "Document",
     "extractor_sdk",
     "IndexifyClient",
indexify/base_client.py CHANGED
@@ -1,6 +1,8 @@
 from abc import ABC, abstractmethod
-from .extractor_sdk import Graph, Feature
-from typing import Any, List, Optional, Union, Dict
+from typing import Any, Dict, List, Optional, Union
+
+from .extractor_sdk import Feature, Graph
+

 class BaseClient(ABC):

@@ -23,7 +25,7 @@ class BaseClient(ABC):

     ### Ingestion APIs
     @abstractmethod
-    def invoke_graph_with_object(self, graph:str, object: Any) -> str:
+    def invoke_graph_with_object(self, graph: str, object: Any) -> str:
         """
         Invokes a graph with an input object.
         graph: str: The name of the graph to invoke
@@ -42,10 +44,11 @@ class BaseClient(ABC):
         """
         pass

-
     ### Retrieval APIs
     @abstractmethod
-    def extracted_objects(self, graph: str, ingested_object_id: str, extractor_name: Optional[str]) -> Union[Dict[str, List[Any]], List[Any]]:
+    def extracted_objects(
+        self, graph: str, ingested_object_id: str, extractor_name: Optional[str]
+    ) -> Union[Dict[str, List[Any]], List[Any]]:
         """
         Returns the extracted objects by a graph for an ingested object. If the extractor name is provided, only the objects extracted by that extractor are returned.
         If the extractor name is not provided, all the extracted objects are returned for the input object.
@@ -57,11 +60,12 @@ class BaseClient(ABC):
         pass

     @abstractmethod
-    def features(self, object_id: str, graph: Optional[str]) -> Union[Dict[str, List[Feature]], List[Feature]]:
+    def features(
+        self, object_id: str, graph: Optional[str]
+    ) -> Union[Dict[str, List[Feature]], List[Feature]]:
         """
         Returns the features of an object.
         object_id: str: The ID of the object
         return: List[Feature]: The features associated with the object that were extracted. If a graph name is provided, only the features extracted by that graph are returned.
         """
         pass
-
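The reformatted signatures spell out the client surface: one ingestion entry point (invoke_graph_with_object) and two retrieval calls (extracted_objects and features). Below is a minimal in-memory sketch of a concrete subclass, assuming these are the only abstract methods (the full base class is not shown in this diff); the real IndexifyClient in client.py implements the same surface against the HTTP API.

from typing import Any, Dict, List, Optional, Union

from indexify.base_client import BaseClient
from indexify.extractor_sdk import Feature


class InMemoryClient(BaseClient):
    """Hypothetical test double; stores ingested objects in a dict."""

    def __init__(self):
        self._objects: Dict[str, Any] = {}

    def invoke_graph_with_object(self, graph: str, object: Any) -> str:
        object_id = f"{graph}-{len(self._objects)}"
        self._objects[object_id] = object
        return object_id

    def extracted_objects(
        self, graph: str, ingested_object_id: str, extractor_name: Optional[str]
    ) -> Union[Dict[str, List[Any]], List[Any]]:
        # No extraction happens here; just echo the ingested object back.
        return [self._objects[ingested_object_id]]

    def features(
        self, object_id: str, graph: Optional[str]
    ) -> Union[Dict[str, List[Feature]], List[Feature]]:
        return []  # nothing was extracted, so no features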
indexify/client.py CHANGED
@@ -1,20 +1,23 @@
-import yaml
-import httpx
-import uuid
 import hashlib
 import json
+import logging
+import uuid
 from collections import namedtuple
-from .settings import DEFAULT_SERVICE_URL, DEFAULT_SERVICE_URL_HTTPS
-from .extractor_sdk.extractor import ExtractorMetadata
-from .extraction_policy import ExtractionGraph
-from .utils import json_set_default
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Union
+
+import httpx
+import yaml
+
+from indexify.exceptions import ApiException
+
+from .data_loaders import DataLoader
 from .error import Error
+from .extraction_policy import ExtractionGraph
 from .extractor_sdk.data import ContentMetadata
-from .data_loaders import DataLoader
-from indexify.exceptions import ApiException
-from dataclasses import dataclass
-from typing import List, Optional, Union, Dict
-import logging
+from .extractor_sdk.extractor import ExtractorMetadata
+from .settings import DEFAULT_SERVICE_URL, DEFAULT_SERVICE_URL_HTTPS
+from .utils import json_set_default

 Document = namedtuple("Document", ["text", "labels", "id"])

@@ -1,10 +1,10 @@
-from pydantic import BaseModel
-
+import hashlib
+import mimetypes
+import os
 from abc import ABC, abstractmethod
 from typing import List
-import os
-import mimetypes
-import hashlib
+
+from pydantic import BaseModel


 class FileMetadata(BaseModel):
@@ -1,6 +1,7 @@
-from . import DataLoader, FileMetadata
-from typing import List, Optional
 import os
+from typing import List, Optional
+
+from . import DataLoader, FileMetadata


 class LocalDirectoryLoader(DataLoader):
@@ -1,8 +1,9 @@
-from . import DataLoader, FileMetadata
+import email.utils
 from typing import List
+
 import httpx
-import hashlib
-import email.utils
+
+from . import DataLoader, FileMetadata


 def convert_date_to_epoch(date_str: str) -> int:
@@ -1,5 +1,5 @@
-from dataclasses import dataclass, asdict
-from typing import Optional, List
+from dataclasses import asdict, dataclass
+from typing import List, Optional


 @dataclass
@@ -1,5 +1,5 @@
-from .data import ContentMetadata, Content, Feature
-from .extractor import Extractor, extractor, EmbeddingSchema, ExtractorMetadata
+from .data import Content, ContentMetadata, Feature
+from .extractor import EmbeddingSchema, Extractor, ExtractorMetadata, extractor
 from .utils import SampleExtractorData

 __all__ = [
@@ -1,15 +1,7 @@
-from typing import (
-    Any,
-    List,
-    Optional,
-    Literal,
-    Dict,
-    Type,
-    cast,
-    Mapping,
-)
-from pydantic import BaseModel, Json, Field
 import json
+from typing import Any, Dict, List, Literal, Mapping, Optional, Type, cast
+
+from pydantic import BaseModel, Field, Json
 from typing_extensions import Annotated, Doc


@@ -103,6 +95,6 @@ class ContentMetadata(BaseModel):
     )


-class PDFFile(BaseData):
+class File(BaseData):
     data: bytes
     mime_type: str
@@ -1,16 +1,29 @@
-from typing import Union, Optional, List, Type, Tuple, Callable, get_type_hints, Dict
 import inspect
-from pydantic import BaseModel, Field
-from abc import ABC, abstractmethod
-from .data import BaseData, Content, Feature
 import json
 import os
+from abc import ABC, abstractmethod
+from typing import (
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Tuple,
+    Type,
+    Union,
+    get_type_hints,
+)
+
 import requests
+from pydantic import BaseModel, Field
+
+from .data import BaseData, Content, Feature


 class EmbeddingSchema(BaseModel):
     dim: int
-    distance: str = "cosine"
+    distance: Optional[str] = "cosine"
+    database_url: Optional[str] = None
+

 class ExtractorMetadata(BaseModel):
     name: str
@@ -40,7 +53,7 @@ class Extractor(ABC):

     input_mime_types = ["text/plain"]

-    embeddings: Dict[str, EmbeddingSchema] = {}
+    embedding_indexes: Dict[str, EmbeddingSchema] = {}

     @abstractmethod
     def extract(
@@ -55,31 +68,9 @@
         pass

     @classmethod
-    @abstractmethod
     def sample_input(cls) -> Tuple[Content, Type[BaseModel]]:
         pass

-    def describe(self) -> ExtractorMetadata:
-        embedding_schemas = {}
-        try:
-            embedding_schemas = self.embedding_schemas
-        except NotImplementedError:
-            pass
-
-        json_schema = (
-            self._param_cls.model_json_schema() if self._param_cls is not None else None
-        )
-        return ExtractorMetadata(
-            name=self.name,
-            version=self.version,
-            description=self.description,
-            system_dependencies=self.system_dependencies,
-            python_dependencies=self.python_dependencies,
-            input_mime_types=self.input_mime_types,
-            embedding_schemas=embedding_schemas,
-            input_params=json.dumps(json_schema),
-        )
-
     def _download_file(self, url, filename):
         if os.path.exists(filename):
             # file exists skip
@@ -190,7 +181,7 @@ def extractor(
     python_dependencies: Optional[List[str]] = None,
     system_dependencies: Optional[List[str]] = None,
     input_mime_types: Optional[List[str]] = None,
-    embedding_schemas: Optional[Dict[str, EmbeddingSchema]] = None,
+    embedding_indexes: Optional[Dict[str, EmbeddingSchema]] = None,
     sample_content: Optional[Callable] = None,
 ):
     args = locals()
@@ -198,7 +189,7 @@ def extractor(

     def construct(fn):
         def wrapper():
-            hint = get_type_hints(fn).get("params", dict)
+            description = fn.__doc__ or args.get("description", "")

             if not args.get("name"):
                 args[
@@ -207,7 +198,7 @@ def extractor(

             class DecoratedFn(Extractor):
                 @classmethod
-                def extract(cls, input: Type[BaseData], params: Type[BaseModel]=None) -> List[Union[Type[BaseModel], Type[Feature]]]:  # type: ignore
+                def extract(cls, input: Type[BaseData], params: Type[BaseModel] = None) -> List[Union[Type[BaseModel], Type[Feature]]]:  # type: ignore
                     # TODO we can force all the functions to take in a parms object
                     # or check if someone adds a params
                     if params is None:
@@ -220,6 +211,7 @@ def extractor(

             for key, val in args.items():
                 setattr(DecoratedFn, key, val)
+            DecoratedFn.description = description

             return DecoratedFn

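Three related changes land in extractor.py: embedding_schemas is renamed to embedding_indexes on both the Extractor base class and the @extractor decorator, EmbeddingSchema gains an optional database_url, and the decorator now derives the description from the wrapped function's docstring. A hedged sketch of a decorated extractor under the new names; the extractor name and index name are invented:

from typing import List

from indexify.extractor_sdk import Content, EmbeddingSchema, extractor


@extractor(
    name="example/echo",  # hypothetical name; a default is derived when omitted
    embedding_indexes={"chunks": EmbeddingSchema(dim=384, distance="cosine")},
)
def echo(content: Content) -> List[Content]:
    """Passes content through unchanged; this docstring becomes the description."""
    return [content]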
@@ -1,20 +1,17 @@
 import os
-import httpx
 from typing import List
+
+import httpx
+
 from .data import Content, Feature


 class SampleExtractorData:
-    def _download_file(self, url, filename):
-        if os.path.exists(filename):
-            # file exists skip
-            return
+    def _download_file(self, url):
         try:
-            with httpx.get(url, stream=True) as r:
-                r.raise_for_status()  # Raises an HTTPError if the response status code is 4XX/5XX
-                with open(filename, "wb") as f:
-                    for chunk in r.iter_content(chunk_size=8192):
-                        f.write(chunk)
+            resp = httpx.get(url, follow_redirects=True)
+            resp.raise_for_status()
+            return resp.content
         except httpx.exceptions.RequestException as e:
             print(f"Error downloading the file: {e}")

@@ -55,13 +52,10 @@ class SampleExtractorData:
         return Content(content_type="image/jpg", data=f.read(), features=features)

     def sample_invoice_pdf(self, features: List[Feature] = []) -> Content:
-        file_name = "sample.pdf"
-        self._download_file(
+        data = self._download_file(
             "https://extractor-files.diptanu-6d5.workers.dev/invoice-example.pdf",
-            file_name,
         )
-        f = open(file_name, "rb")
-        return Content(content_type="application/pdf", data=f.read(), features=features)
+        return Content(content_type="application/pdf", data=data, features=features)

     def sample_image_based_pdf(self, features: List[Feature] = []) -> Content:
         file_name = "sample.pdf"
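SampleExtractorData._download_file no longer writes a file to disk; it returns the response bytes (following redirects), and sample_invoice_pdf() builds the Content from those bytes directly. A hedged usage sketch, assuming network access and that Content exposes content_type and data as fields:

from indexify.extractor_sdk import SampleExtractorData

content = SampleExtractorData().sample_invoice_pdf()
print(content.content_type)  # "application/pdf"
print(len(content.data))     # size of the downloaded invoice-example.pdf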
File without changes
@@ -0,0 +1,55 @@
+from typing import List
+
+import torch
+import torch.nn.functional as F
+from transformers import AutoModel, AutoTokenizer
+
+from indexify.extractor_sdk.data import Feature
+from indexify.extractor_sdk.extractor import Extractor, Feature
+
+
+class SentenceTransformersEmbedding:
+    def __init__(self, model_name) -> None:
+        self._model_name = model_name
+        self._tokenizer = AutoTokenizer.from_pretrained(
+            f"sentence-transformers/{model_name}"
+        )
+        self._model = AutoModel.from_pretrained(
+            f"sentence-transformers/{model_name}", torchscript=True
+        )
+        self._model.eval()
+
+    def embed_batch(self, inputs: List[str]) -> List[List[float]]:
+        result = self._embed(inputs)
+        return result.tolist()
+
+    def embed(self, query: str) -> List[float]:
+        result = self._embed([query])
+        return result[0].tolist()
+
+    def _embed(self, inputs: List[str]) -> torch.Tensor:
+        encoded_input = self._tokenizer(
+            inputs, padding=True, truncation=True, return_tensors="pt"
+        )
+        sentence_embeddings = self._model(**encoded_input)
+        return F.normalize(sentence_embeddings, p=2, dim=1)
+
+
+class BasicSentenceTransformerModels(Extractor):
+    def __init__(self, model: str):
+        super().__init__()
+        self.model = SentenceTransformersEmbedding(model)
+
+    def extract(self, input: str) -> List[Feature]:
+        embeddings = self.model.embed(input)
+        return [Feature.embedding(values=embeddings)]
+
+
+class BasicHFTransformerEmbeddingModels(Extractor):
+    def __init__(self, model: str):
+        super().__init__()
+        self._model = AutoModel.from_pretrained(model, trust_remote_code=True)
+
+    def extract(self, input: str) -> List[Feature]:
+        embeddings = self.model.embed_query(input)
+        return [Feature.embedding(values=embeddings)]
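The hunk above adds indexify/extractors/embedding.py (the path comes from the RECORD at the end of this diff), which wraps sentence-transformers checkpoints behind the Extractor interface. A hedged sketch of calling the embedding helper directly; the checkpoint name is illustrative and torch plus transformers must be installed:

from indexify.extractors.embedding import SentenceTransformersEmbedding

model = SentenceTransformersEmbedding(model_name="all-MiniLM-L6-v2")
vector = model.embed("indexify turns documents into embeddings")  # List[float]
batch = model.embed_batch(["first text", "second text"])          # List[List[float]]
print(len(vector), len(batch))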
@@ -0,0 +1,95 @@
+import tempfile
+from enum import Enum
+from typing import List, Optional
+
+import deepdoctection as dd
+from pydantic import BaseModel
+
+
+class PageFragmentType(str, Enum):
+    TEXT = "text"
+    FIGURE = "figure"
+    TABLE = "table"
+
+
+class Image(BaseModel):
+    data: bytes
+    mime_type: str
+
+
+class TableEncoding(str, Enum):
+    CSV = "csv"
+    HTML = "html"
+
+
+class Table(BaseModel):
+    data: str
+    encoding: TableEncoding
+
+
+class PageFragment(BaseModel):
+    fragment_type: PageFragmentType
+    text: Optional[str] = None
+    image: Optional[Image] = None
+    table: Optional[Table] = None
+    reading_order: Optional[int] = None
+
+
+class Page(BaseModel):
+    number: int
+    fragments: List[PageFragment]
+
+
+class PDFParser:
+    def __init__(self, data: bytes, language: Optional[str] = "en"):
+        self._data = data
+
+    def parse(self) -> List[Page]:
+        analyzer = dd.get_dd_analyzer()
+        parsed_pages = []
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as f:
+            f.write(self._data)
+            f.flush()
+            df = analyzer.analyze(path=f.name)
+            df.reset_state()
+            for page in df:
+                parsed_pages.append(page)
+        outputs: List[Page] = []
+        for parsed_page in parsed_pages:
+            page_num = parsed_page.page_number
+            fragments = []
+            for layout in parsed_page.layouts:
+                if layout.category_name in ["text", "title"]:
+                    fragments.append(
+                        PageFragment(
+                            fragment_type=PageFragmentType.TEXT,
+                            text=layout.text,
+                            reading_order=layout.reading_order,
+                        )
+                    )
+            figures = parsed_page.get_annotation(category_names=dd.LayoutType.FIGURE)
+            print(len(figures))
+            for figure in figures:
+                image_bytes = dd.viz_handler.encode(figure.viz())
+                fragments.append(
+                    PageFragment(
+                        fragment_type=PageFragmentType.FIGURE,
+                        image=Image(data=image_bytes, mime_type="image/png"),
+                        reading_order=figure.reading_order,
+                    )
+                )
+
+            tables = parsed_page.get_annotation(category_names=dd.LayoutType.TABLE)
+            print(len(tables))
+            for table in tables:
+                fragments.append(
+                    PageFragment(
+                        fragment_type=PageFragmentType.TABLE,
+                        table=Table(data=table.html, encoding=TableEncoding.HTML),
+                        reading_order=table.reading_order,
+                    )
+                )
+
+            outputs.append(Page(number=page_num, fragments=fragments))
+
+        return outputs
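The second new module, indexify/extractors/pdf_parser.py, uses deepdoctection to split a PDF into typed fragments (text, figures, tables) grouped by page. A hedged sketch of driving it, assuming deepdoctection and its model dependencies are installed and a local PDF is available; the file path is illustrative:

from indexify.extractors.pdf_parser import PageFragmentType, PDFParser

with open("invoice-example.pdf", "rb") as f:  # any local PDF
    parser = PDFParser(f.read())

for page in parser.parse():
    texts = [fr.text for fr in page.fragments if fr.fragment_type == PageFragmentType.TEXT]
    print(f"page {page.number}: {len(page.fragments)} fragments, {len(texts)} text blocks")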
indexify/graph.py CHANGED
@@ -1,23 +1,133 @@
-from .extractor_sdk import extractor, Extractor
+import itertools
+import json
+from collections import defaultdict
+from typing import Any, Dict, List, Optional, Type, Union

-from typing import Type, Union
+import cloudpickle
 from pydantic import BaseModel

-from .run_graph import RunGraph
-from .local_runner import LocalRunner
+from .extractor_sdk import Content, Extractor, extractor
+from .runner import Runner


-def Graph(
-    name: str,
-    input: Type[BaseModel],
-    start_node: Union[extractor, Extractor],
-    run_local: bool,
-) -> RunGraph:
+@extractor(description="id function")
+def _id(content: Content) -> List[Content]:
+    return [content]

-    if run_local:
-        runner = LocalRunner()
-    else:
-        raise NotImplementedError("Remote runner not supported yet")

-    graph = RunGraph(name=name, input=input, start_node=start_node, runner=runner)
-    return graph
+def load_graph(graph: bytes) -> "Graph":
+    return cloudpickle.loads(graph)
+
+
+class Graph:
+    def __init__(
+        self, name: str, input: Type[BaseModel], start_node: extractor, runner: Runner
+    ):
+        # TODO check for cycles
+        self.name = name
+
+        self.nodes: Dict[str, Union[extractor, Extractor]] = {}
+        self.params: Dict[str, Any] = {}
+
+        self.edges: Dict[str, List[(str, str)]] = defaultdict(list)
+
+        self.nodes["start"] = _id
+        self.nodes["end"] = _id
+
+        self._topo_counter = defaultdict(int)
+
+        self._start_node = None
+        self._input = input
+
+        self.runner = runner
+
+    def get_extractor(self, name: str) -> Extractor:
+        return self.nodes[name]
+
+    def _node(self, extractor: Extractor, params: Any = None) -> "Graph":
+        name = extractor.name
+
+        # if you've already inserted a node just ignore the new insertion.
+        if name in self.nodes:
+            return
+
+        self.nodes[name] = extractor
+        self.params[name] = extractor.__dict__.get("params", None)
+
+        # assign each node a rank of 1 to init the graph
+        self._topo_counter[name] = 1
+
+        return self
+
+    def serialize(self):
+        return cloudpickle.dumps(self)
+
+    def add_edge(
+        self,
+        from_node: Type[Extractor],
+        to_node: Type[Extractor],
+        prefilter_predicates: Optional[str] = None,
+    ) -> "Graph":
+
+        self._node(from_node)
+        self._node(to_node)
+
+        from_node_name = from_node.name
+        to_node_name = to_node.name
+
+        self.edges[from_node_name].append((to_node_name, prefilter_predicates))
+
+        self._topo_counter[to_node_name] += 1
+
+        return self
+
+    """
+    Connect nodes as a fan out from one `from_node` to multiple `to_nodes` and respective `prefilter_predicates`.
+    Note: The user has to match the sizes of the lists to make sure they line up otherwise a None is used as a default.
+    """
+
+    def steps(
+        self,
+        from_node: extractor,
+        to_nodes: List[extractor],
+        prefilter_predicates: List[str] = [],
+    ) -> "Graph":
+        print(f"{to_nodes}, {prefilter_predicates}, {prefilter_predicates}")
+        for t_n, p in itertools.zip_longest(
+            to_nodes, prefilter_predicates, fillvalue=None
+        ):
+            self.step(from_node=from_node, to_node=t_n, prefilter_predicates=p)
+
+        return self
+
+    def add_param(self, node: extractor, params: Dict[str, Any]):
+        try:
+            # check if the params can be serialized since the server needs this
+            json.dumps(params)
+        except Exception:
+            raise Exception(f"For node {node.name}, cannot serialize params as json.")
+
+        self.params[node.name] = params
+
+    def run(self, wf_input, local):
+        self._assign_start_node()
+        self.runner.run(self, wf_input=wf_input)
+        pass
+
+    def clear_cache_for_node(self, node: Union[extractor, Extractor]):
+        if node.name not in self.nodes.keys():
+            raise Exception(f"Node with name {node.name} not found in graph")
+
+        self.runner.deleted_from_memo(node.name)
+
+    def clear_cache_for_all_nodes(self):
+        for node_name in self.nodes:
+            self.runner.deleted_from_memo(node_name=node_name)
+
+    def get_result(self, node: Union[extractor, Extractor]) -> Any:
+        return self.runner.results[node.name]
+
+    def _assign_start_node(self):
+        # this method should be called before a graph can be run
+        nodes = sorted(self._topo_counter.items(), key=lambda x: x[1])
+        self._start_node = nodes[0][0]
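graph.py now holds a real Graph class (the old Graph factory function and run_graph.py, deleted further down, are gone): nodes are @extractor-decorated functions or Extractor subclasses, edges are wired with add_edge, execution goes through a Runner, and serialize()/load_graph round-trip the graph with cloudpickle, which is what the new cloudpickle dependency in METADATA below supports. A hedged construction sketch; the extractor bodies and the input model are invented:

from typing import List

from pydantic import BaseModel

from indexify.extractor_sdk import Content, extractor
from indexify.graph import Graph, load_graph
from indexify.local_runner import LocalRunner


class TextInput(BaseModel):
    text: str


@extractor(description="id function")
def passthrough(content: Content) -> List[Content]:
    return [content]


@extractor(description="fan-out copy")
def duplicate(content: Content) -> List[Content]:
    return [content, content]


g = Graph(name="demo", input=TextInput, start_node=passthrough, runner=LocalRunner())
g.add_edge(passthrough, duplicate)

blob = g.serialize()        # cloudpickle bytes, suitable for shipping to a server
restored = load_graph(blob)
print(restored.name, list(restored.nodes))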
indexify/local_runner.py CHANGED
@@ -2,15 +2,13 @@ import hashlib
 import os
 import pickle
 import shutil
-from pathlib import Path
-
-from indexify.extractor_sdk.data import BaseData, Feature
-from indexify.extractor_sdk.extractor import extractor, Extractor
-
 from collections import defaultdict
+from pathlib import Path
 from typing import Any, Callable, Dict, Optional, Union

-from indexify.run_graph import RunGraph
+from indexify.extractor_sdk.data import BaseData, Feature
+from indexify.extractor_sdk.extractor import Extractor, extractor
+from indexify.graph import Graph
 from indexify.runner import Runner


@@ -27,9 +25,9 @@ class LocalRunner(Runner):
     # those bytes have to be a python type

     # _input needs to be serializable into python object (ie json for ex) and Feature
-    def _run(self, g: RunGraph, _input: BaseData, node_name: str):
+    def _run(self, g: Graph, _input: BaseData, node_name: str):
         print(f"---- Starting node {node_name}")
-        print(f'node_name {node_name}')
+        print(f"node_name {node_name}")

         extractor_construct: Callable = g.nodes[node_name]
         params = g.params.get(node_name, None)
@@ -65,7 +63,9 @@ class LocalRunner(Runner):
         for out_edge, pre_filter_predicate in g.edges[node_name]:
             # TODO there are no reductions yet, each recursion finishes it's path and returns
             for r in data_to_process:
-                if self._prefilter_content(content=r, prefilter_predicate=pre_filter_predicate):
+                if self._prefilter_content(
+                    content=r, prefilter_predicate=pre_filter_predicate
+                ):
                     continue

                 self._run(g, _input=r, node_name=out_edge)
@@ -73,7 +73,10 @@ class LocalRunner(Runner):
     """
     Returns True if content should be filtered
     """
-    def _prefilter_content(self, content: BaseData, prefilter_predicate: Optional[str]) -> bool:
+
+    def _prefilter_content(
+        self, content: BaseData, prefilter_predicate: Optional[str]
+    ) -> bool:
         if prefilter_predicate is None:
             return False

@@ -83,9 +86,9 @@ class LocalRunner(Runner):

         # TODO For now only support `and` and `=` and `string values`
         bools = []
-        metadata = content.get_features()['metadata']
+        metadata = content.get_features()["metadata"]
         for atom in atoms:
-            l, r = atom.split('=')
+            l, r = atom.split("=")
             if l in metadata:
                 bools.append(metadata[l] != r)

@@ -109,7 +112,7 @@ class LocalRunner(Runner):
         if not os.path.exists(file_path):
             return None

-        with open(file_path, 'rb') as f:
+        with open(file_path, "rb") as f:
             return f.read()

     def put_into_memo(self, node_name, input_hash, output):
@@ -121,5 +124,5 @@ class LocalRunner(Runner):

         Path(file_path).touch()

-        with open(file_path, 'wb') as f:
+        with open(file_path, "wb") as f:
             return f.write(output)
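For reference, the prefilter predicates consumed above are plain strings: per the TODO they only support string-equality atoms joined with "and", each atom is split on "=" and compared against the content's "metadata" feature, and content that fails the check is skipped on that edge. A rough, self-contained approximation of those semantics; the exact atom splitting and how the per-atom booleans combine are not visible in this hunk, and the keys and values are invented:

# Illustrative only: mimics the `key=value [and key=value ...]` predicate shape.
predicate = "language=en and kind=invoice"        # hypothetical edge predicate
metadata = {"language": "en", "kind": "receipt"}  # hypothetical content metadata

atoms = [atom.strip() for atom in predicate.split(" and ")]
pairs = [atom.split("=") for atom in atoms]
should_filter = any(metadata.get(k) != v for k, v in pairs)
print(should_filter)  # True -> this content would be skipped on that edge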
indexify/runner.py CHANGED
@@ -1,9 +1,9 @@
 from abc import ABC
+from typing import Any, Union

 from indexify.extractor_sdk.data import BaseData
-from indexify.extractor_sdk.extractor import extractor, Extractor
+from indexify.extractor_sdk.extractor import Extractor, extractor

-from typing import Any, Union

 class Runner(ABC):
     def run(self, g, wf_input: BaseData):
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: indexify
-Version: 0.0.39
+Version: 0.0.41
 Summary: Python Client for Indexify
 Home-page: https://github.com/tensorlakeai/indexify
 License: Apache 2.0
@@ -13,6 +13,7 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Requires-Dist: cloudpickle (>=3,<4)
 Requires-Dist: httpx[http2] (>=0,<1)
 Requires-Dist: pydantic (>=2.8,<3.0)
 Requires-Dist: pyyaml (>=6,<7)
@@ -0,0 +1,25 @@
+indexify/__init__.py,sha256=e4s2395B3gEGrZk2u5OZO2RtrXYFYUTItaM3mtlusBE,493
+indexify/base_client.py,sha256=HwT2KJNq8j-KiPVA9RJm-yearSjxifRjXTcP1zUVeo8,2784
+indexify/client.py,sha256=p4WDmYR94DjU0EqosuCKNGjbfh11qUID6TxDhTK6Uk4,26001
+indexify/data_loaders/__init__.py,sha256=Y5NEuseTcYAICRiweYw5wBQ2m2YplbsY21I7df-rdi4,1339
+indexify/data_loaders/local_directory_loader.py,sha256=fCrgj5drnW71ZUdDDvcB1-VJjIs1w6Q8sEW0HSGSAiA,1247
+indexify/data_loaders/url_loader.py,sha256=32SERljcq1Xsi4RdLz2dgyk2TER5pQPTtXl3gUzwHbY,1533
+indexify/error.py,sha256=3umTeYb0ugtUyehV1ibfvaeACxAONPyWPc-1HRN4d1M,856
+indexify/exceptions.py,sha256=vjd5SPPNFIEW35GorSIodsqvm9RKHQm9kdp8t9gv-WM,111
+indexify/extraction_policy.py,sha256=927BBtZBDPsLMm01uQDPCZnj3Pwmjh6L6QLHb4ShQKk,2076
+indexify/extractor_sdk/__init__.py,sha256=DOL-wJvIspWPqjFRBpmhMbnsMZC2JY-NtNwQGiE6IqU,348
+indexify/extractor_sdk/data.py,sha256=JpX9WdTpiuK72wn6QYhtqj5p5JiJu4waBrK-Hi7lNsA,2742
+indexify/extractor_sdk/extractor.py,sha256=IEZvr1Qe-dVmTgAeJFAhEyHUW20n4uTEeEassH3C5j4,9858
+indexify/extractor_sdk/utils.py,sha256=bW_D2eMWTzcAYZ8Lv7LUKGgOD0cyW77E6gNO3y7iNNA,6234
+indexify/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+indexify/extractors/embedding.py,sha256=Be6X4odSHbkAEm2myxB04RN-Mvb2bFk8uWXxUpY-Z6E,1859
+indexify/extractors/pdf_parser.py,sha256=XN-_b_W7CrpkTeWYs4H6hkK_mx-k4N2o1RSAVkQhr8Q,2842
+indexify/graph.py,sha256=UdvrpNc-SdD3U27Ee9aTMMYcSOUz__WQWc31oFHV4yQ,3963
+indexify/local_runner.py,sha256=uuMJbnT4qYMSySxsB3lEC7FSjYnJFh5eNZ00zu5gLNw,4387
+indexify/runner.py,sha256=VVmLGF1kAmEuE461Hs0QJFnSvVWtUzYhhQfB1KptYPU,637
+indexify/settings.py,sha256=LSaWZ0ADIVmUv6o6dHWRC3-Ry5uLbCw2sBSg1e_U7UM,99
+indexify/utils.py,sha256=rDN2lrsAs9noJEIjfx6ukmC2SAIyrlUt7QU-kaBjujM,125
+indexify-0.0.41.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+indexify-0.0.41.dist-info/METADATA,sha256=yTEubUvxQgDUcXrf6rxzvITsW6BDBjzG2LXxyl9A-O0,1913
+indexify-0.0.41.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+indexify-0.0.41.dist-info/RECORD,,
indexify/run_graph.py DELETED
@@ -1,122 +0,0 @@
-import json
-
-from .extractor_sdk import Content, extractor, Extractor
-
-from collections import defaultdict
-from typing import Any, Dict, List, Optional, Type, Union
-from pydantic import BaseModel
-
-import itertools
-
-from .runner import Runner
-
-@extractor(description="id function")
-def _id(content: Content) -> List[Content]:
-    return [content]
-
-
-class RunGraph:
-    def __init__(self, name: str, input: Type[BaseModel], start_node: extractor, runner: Runner):
-        # TODO check for cycles
-        self.name = name
-
-        self.nodes: Dict[str, Union[extractor, Extractor]] = {}
-        self.params: Dict[str, Any] = {}
-
-        self.edges: Dict[str, List[(str, str)]] = defaultdict(list)
-
-        self.nodes["start"] = _id
-        self.nodes["end"] = _id
-
-        self._topo_counter = defaultdict(int)
-
-        self._start_node = None
-        self._input = input
-
-        self.runner = runner
-
-    def _node(self, extractor: Union[extractor, Extractor], params: Any = None) -> 'RunGraph':
-        name = extractor.name
-
-        # if you've already inserted a node just ignore the new insertion.
-        if name in self.nodes:
-            return
-
-        self.nodes[name] = extractor
-        self.params[name] = extractor.__dict__.get("params", None)
-
-        # assign each node a rank of 1 to init the graph
-        self._topo_counter[name] = 1
-
-        return self
-
-    def add_edge(
-        self,
-        from_node: extractor,
-        to_node: extractor,
-        prefilter_predicates: Optional[str] = None,
-    ) -> 'RunGraph':
-
-        self._node(from_node)
-        self._node(to_node)
-
-        from_node_name = from_node.name
-        to_node_name = to_node.name
-
-        self.edges[from_node_name].append((to_node_name, prefilter_predicates))
-
-        self._topo_counter[to_node_name] += 1
-
-        return self
-
-    """
-    Connect nodes as a fan out from one `from_node` to multiple `to_nodes` and respective `prefilter_predicates`.
-    Note: The user has to match the sizes of the lists to make sure they line up otherwise a None is used as a default.
-    """
-
-    def steps(
-        self,
-        from_node: extractor,
-        to_nodes: List[extractor],
-        prefilter_predicates: List[str] = [],
-    ) -> 'RunGraph':
-        print(f"{to_nodes}, {prefilter_predicates}, {prefilter_predicates}")
-        for t_n, p in itertools.zip_longest(
-            to_nodes, prefilter_predicates, fillvalue=None
-        ):
-            self.step(from_node=from_node, to_node=t_n, prefilter_predicates=p)
-
-        return self
-
-    def add_param(self, node: extractor, params: Dict[str, Any]):
-        try:
-            # check if the params can be serialized since the server needs this
-            json.dumps(params)
-        except Exception:
-            raise Exception(f"For node {node.name}, cannot serialize params as json.")
-
-        self.params[node.name] = params
-
-    def run(self, wf_input, local):
-        self._assign_start_node()
-        # self.runner = LocalRunner()
-        self.runner.run(self, wf_input=wf_input)
-        pass
-
-    def clear_cache_for_node(self, node: Union[extractor, Extractor]):
-        if node.name not in self.nodes.keys():
-            raise Exception(f"Node with name {node.name} not found in graph")
-
-        self.runner.deleted_from_memo(node.name)
-
-    def clear_cache_for_all_nodes(self):
-        for node_name in self.nodes:
-            self.runner.deleted_from_memo(node_name=node_name)
-
-    def get_result(self, node: Union[extractor, Extractor]) -> Any:
-        return self.runner.results[node.name]
-
-    def _assign_start_node(self):
-        # this method should be called before a graph can be run
-        nodes = sorted(self._topo_counter.items(), key=lambda x: x[1])
-        self._start_node = nodes[0][0]
@@ -1,23 +0,0 @@
-indexify/__init__.py,sha256=ZDpPkRz4hBo6eqArhVBxqIscLSiD20q5rOHPYyOTloE,503
-indexify/base_client.py,sha256=Db-BNYQ6yNmOIXPaQN8W5qjTYvfFvPzoxC9206YRc-U,2755
-indexify/client.py,sha256=FPCO2DN6RstKLasmNrPxRhzBXDgM14tbc3eDDxl8J_A,25998
-indexify/data_loaders/__init__.py,sha256=TmOJLgKC5gM7_1n7zxYiuza3fOilIiYYupxBGd31PfA,1339
-indexify/data_loaders/local_directory_loader.py,sha256=0X_FgLS5unisJSij8LICv1htp8IdW09LbTIJ2wvVJg4,1246
-indexify/data_loaders/url_loader.py,sha256=shjw6dYBlaxA_PzP6qCB9TTtbPiY4h6FV7uopDbRQCc,1546
-indexify/error.py,sha256=3umTeYb0ugtUyehV1ibfvaeACxAONPyWPc-1HRN4d1M,856
-indexify/exceptions.py,sha256=vjd5SPPNFIEW35GorSIodsqvm9RKHQm9kdp8t9gv-WM,111
-indexify/extraction_policy.py,sha256=awNDqwCz0tr4jTQmGf7s8_s6vcEuxMb0xynEl7b7iPI,2076
-indexify/extractor_sdk/__init__.py,sha256=T512UtvFPUXEXlnT9HHHLHPcEau1Acoac_ksByuo7jA,348
-indexify/extractor_sdk/data.py,sha256=632fY4S_F_aYPLtOl_7dZnSAyMvVZY8ujSSIWJ9k104,2781
-indexify/extractor_sdk/extractor.py,sha256=CtlRn8JC8vGn9fm4QameA47x9T1l_cRpkJMUYYpetco,10457
-indexify/extractor_sdk/utils.py,sha256=_j8WflgOM0Qkf2NjhK2p1xXuwq4drLxO0mgKVPEHhlw,6594
-indexify/graph.py,sha256=fVZeGIcSqO3p8dGIQOdbuFYQ-8QaTQ7Jr37OefA2Phk,549
-indexify/local_runner.py,sha256=Ri-Wpw2qgnQ4I3fRR9qdXXRDASuZnu4-VR2xECG9gnY,4346
-indexify/run_graph.py,sha256=gw3IEf8-myVaHUV7g6LPt8-uSMIVr7S0Zs62aT7UB90,3757
-indexify/runner.py,sha256=M_3_GWYyPpb4lR5KFTpW8OAgp-fm9kYd_5xEqmiCBU4,637
-indexify/settings.py,sha256=LSaWZ0ADIVmUv6o6dHWRC3-Ry5uLbCw2sBSg1e_U7UM,99
-indexify/utils.py,sha256=rDN2lrsAs9noJEIjfx6ukmC2SAIyrlUt7QU-kaBjujM,125
-indexify-0.0.39.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-indexify-0.0.39.dist-info/METADATA,sha256=EvEM7lkuDP1YJsh0wskXIBMQxivHYPKfPNERLV0eaa0,1877
-indexify-0.0.39.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-indexify-0.0.39.dist-info/RECORD,,