indexify 0.0.39__tar.gz → 0.0.41__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. {indexify-0.0.39 → indexify-0.0.41}/PKG-INFO +2 -1
  2. {indexify-0.0.39 → indexify-0.0.41}/indexify/__init__.py +5 -5
  3. {indexify-0.0.39 → indexify-0.0.41}/indexify/base_client.py +11 -7
  4. {indexify-0.0.39 → indexify-0.0.41}/indexify/client.py +15 -12
  5. {indexify-0.0.39 → indexify-0.0.41}/indexify/data_loaders/local_directory_loader.py +3 -2
  6. {indexify-0.0.39 → indexify-0.0.41}/indexify/data_loaders/url_loader.py +4 -3
  7. {indexify-0.0.39 → indexify-0.0.41}/indexify/extraction_policy.py +2 -2
  8. {indexify-0.0.39 → indexify-0.0.41}/indexify/extractor_sdk/__init__.py +2 -2
  9. {indexify-0.0.39 → indexify-0.0.41}/indexify/extractor_sdk/data.py +4 -12
  10. {indexify-0.0.39 → indexify-0.0.41}/indexify/extractor_sdk/extractor.py +23 -31
  11. {indexify-0.0.39 → indexify-0.0.41}/indexify/extractor_sdk/utils.py +9 -15
  12. indexify-0.0.41/indexify/extractors/__init__.py +0 -0
  13. indexify-0.0.41/indexify/extractors/embedding.py +55 -0
  14. indexify-0.0.41/indexify/extractors/pdf_parser.py +95 -0
  15. indexify-0.0.39/indexify/run_graph.py → indexify-0.0.41/indexify/graph.py +24 -13
  16. {indexify-0.0.39 → indexify-0.0.41}/indexify/local_runner.py +17 -14
  17. {indexify-0.0.39 → indexify-0.0.41}/indexify/runner.py +2 -2
  18. {indexify-0.0.39 → indexify-0.0.41}/pyproject.toml +2 -1
  19. indexify-0.0.39/indexify/graph.py +0 -23
  20. {indexify-0.0.39 → indexify-0.0.41}/LICENSE.txt +0 -0
  21. {indexify-0.0.39 → indexify-0.0.41}/README.md +0 -0
  22. {indexify-0.0.39 → indexify-0.0.41}/indexify/data_loaders/__init__.py +5 -5
  23. {indexify-0.0.39 → indexify-0.0.41}/indexify/error.py +0 -0
  24. {indexify-0.0.39 → indexify-0.0.41}/indexify/exceptions.py +0 -0
  25. {indexify-0.0.39 → indexify-0.0.41}/indexify/settings.py +0 -0
  26. {indexify-0.0.39 → indexify-0.0.41}/indexify/utils.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: indexify
- Version: 0.0.39
+ Version: 0.0.41
  Summary: Python Client for Indexify
  Home-page: https://github.com/tensorlakeai/indexify
  License: Apache 2.0
@@ -13,6 +13,7 @@ Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
+ Requires-Dist: cloudpickle (>=3,<4)
  Requires-Dist: httpx[http2] (>=0,<1)
  Requires-Dist: pydantic (>=2.8,<3.0)
  Requires-Dist: pyyaml (>=6,<7)
indexify/__init__.py
@@ -1,17 +1,17 @@
- from .client import IndexifyClient
- from .extraction_policy import ExtractionGraph
+ from . import data_loaders, extractor_sdk
  from .client import (
-     IndexifyClient,
      Document,
+     IndexifyClient,
      generate_hash_from_string,
      generate_unique_hex_id,
  )
- from . import extractor_sdk
+ from .extraction_policy import ExtractionGraph
+ from .graph import Graph
  from .settings import DEFAULT_SERVICE_URL
- from . import data_loaders

  __all__ = [
      "data_loaders",
+     "Graph",
      "Document",
      "extractor_sdk",
      "IndexifyClient",
indexify/base_client.py
@@ -1,6 +1,8 @@
  from abc import ABC, abstractmethod
- from .extractor_sdk import Graph, Feature
- from typing import Any, List, Optional, Union, Dict
+ from typing import Any, Dict, List, Optional, Union
+
+ from .extractor_sdk import Feature, Graph
+

  class BaseClient(ABC):

@@ -23,7 +25,7 @@ class BaseClient(ABC):

      ### Ingestion APIs
      @abstractmethod
-     def invoke_graph_with_object(self, graph:str, object: Any) -> str:
+     def invoke_graph_with_object(self, graph: str, object: Any) -> str:
          """
          Invokes a graph with an input object.
          graph: str: The name of the graph to invoke
@@ -42,10 +44,11 @@
          """
          pass

-
      ### Retrieval APIs
      @abstractmethod
-     def extracted_objects(self, graph: str, ingested_object_id: str, extractor_name: Optional[str]) -> Union[Dict[str, List[Any]], List[Any]]:
+     def extracted_objects(
+         self, graph: str, ingested_object_id: str, extractor_name: Optional[str]
+     ) -> Union[Dict[str, List[Any]], List[Any]]:
          """
          Returns the extracted objects by a graph for an ingested object. If the extractor name is provided, only the objects extracted by that extractor are returned.
          If the extractor name is not provided, all the extracted objects are returned for the input object.
@@ -57,11 +60,12 @@
          pass

      @abstractmethod
-     def features(self, object_id: str, graph: Optional[str]) -> Union[Dict[str, List[Feature]], List[Feature]]:
+     def features(
+         self, object_id: str, graph: Optional[str]
+     ) -> Union[Dict[str, List[Feature]], List[Feature]]:
          """
          Returns the features of an object.
          object_id: str: The ID of the object
          return: List[Feature]: The features associated with the object that were extracted. If a graph name is provided, only the features extracted by that graph are returned.
          """
          pass
-
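
Read together, these abstract methods describe the intended client flow: ingest an object into a graph, then query what was extracted. A hedged sketch against some concrete BaseClient implementation; the client value and graph name are hypothetical, not taken from this diff:

    client = ...  # any concrete BaseClient subclass
    invocation_id = client.invoke_graph_with_object("my_graph", {"text": "hello"})
    outputs = client.extracted_objects("my_graph", invocation_id, extractor_name=None)
    feats = client.features(invocation_id, "my_graph")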
indexify/client.py
@@ -1,20 +1,23 @@
- import yaml
- import httpx
- import uuid
  import hashlib
  import json
+ import logging
+ import uuid
  from collections import namedtuple
- from .settings import DEFAULT_SERVICE_URL, DEFAULT_SERVICE_URL_HTTPS
- from .extractor_sdk.extractor import ExtractorMetadata
- from .extraction_policy import ExtractionGraph
- from .utils import json_set_default
+ from dataclasses import dataclass
+ from typing import Dict, List, Optional, Union
+
+ import httpx
+ import yaml
+
+ from indexify.exceptions import ApiException
+
+ from .data_loaders import DataLoader
  from .error import Error
+ from .extraction_policy import ExtractionGraph
  from .extractor_sdk.data import ContentMetadata
- from .data_loaders import DataLoader
- from indexify.exceptions import ApiException
- from dataclasses import dataclass
- from typing import List, Optional, Union, Dict
- import logging
+ from .extractor_sdk.extractor import ExtractorMetadata
+ from .settings import DEFAULT_SERVICE_URL, DEFAULT_SERVICE_URL_HTTPS
+ from .utils import json_set_default

  Document = namedtuple("Document", ["text", "labels", "id"])

indexify/data_loaders/local_directory_loader.py
@@ -1,6 +1,7 @@
- from . import DataLoader, FileMetadata
- from typing import List, Optional
  import os
+ from typing import List, Optional
+
+ from . import DataLoader, FileMetadata


  class LocalDirectoryLoader(DataLoader):
indexify/data_loaders/url_loader.py
@@ -1,8 +1,9 @@
- from . import DataLoader, FileMetadata
+ import email.utils
  from typing import List
+
  import httpx
- import hashlib
- import email.utils
+
+ from . import DataLoader, FileMetadata


  def convert_date_to_epoch(date_str: str) -> int:
indexify/extraction_policy.py
@@ -1,5 +1,5 @@
- from dataclasses import dataclass, asdict
- from typing import Optional, List
+ from dataclasses import asdict, dataclass
+ from typing import List, Optional


  @dataclass
indexify/extractor_sdk/__init__.py
@@ -1,5 +1,5 @@
- from .data import ContentMetadata, Content, Feature
- from .extractor import Extractor, extractor, EmbeddingSchema, ExtractorMetadata
+ from .data import Content, ContentMetadata, Feature
+ from .extractor import EmbeddingSchema, Extractor, ExtractorMetadata, extractor
  from .utils import SampleExtractorData

  __all__ = [
indexify/extractor_sdk/data.py
@@ -1,15 +1,7 @@
- from typing import (
-     Any,
-     List,
-     Optional,
-     Literal,
-     Dict,
-     Type,
-     cast,
-     Mapping,
- )
- from pydantic import BaseModel, Json, Field
  import json
+ from typing import Any, Dict, List, Literal, Mapping, Optional, Type, cast
+
+ from pydantic import BaseModel, Field, Json
  from typing_extensions import Annotated, Doc


@@ -103,6 +95,6 @@ class ContentMetadata(BaseModel):
          )


- class PDFFile(BaseData):
+ class File(BaseData):
      data: bytes
      mime_type: str
indexify/extractor_sdk/extractor.py
@@ -1,16 +1,29 @@
- from typing import Union, Optional, List, Type, Tuple, Callable, get_type_hints, Dict
  import inspect
- from pydantic import BaseModel, Field
- from abc import ABC, abstractmethod
- from .data import BaseData, Content, Feature
  import json
  import os
+ from abc import ABC, abstractmethod
+ from typing import (
+     Callable,
+     Dict,
+     List,
+     Optional,
+     Tuple,
+     Type,
+     Union,
+     get_type_hints,
+ )
+
  import requests
+ from pydantic import BaseModel, Field
+
+ from .data import BaseData, Content, Feature


  class EmbeddingSchema(BaseModel):
      dim: int
-     distance: str = "cosine"
+     distance: Optional[str] = "cosine"
+     database_url: Optional[str] = None
+

  class ExtractorMetadata(BaseModel):
      name: str
@@ -40,7 +53,7 @@ class Extractor(ABC):

      input_mime_types = ["text/plain"]

-     embeddings: Dict[str, EmbeddingSchema] = {}
+     embedding_indexes: Dict[str, EmbeddingSchema] = {}

      @abstractmethod
      def extract(
@@ -55,31 +68,9 @@
          pass

      @classmethod
-     @abstractmethod
      def sample_input(cls) -> Tuple[Content, Type[BaseModel]]:
          pass

-     def describe(self) -> ExtractorMetadata:
-         embedding_schemas = {}
-         try:
-             embedding_schemas = self.embedding_schemas
-         except NotImplementedError:
-             pass
-
-         json_schema = (
-             self._param_cls.model_json_schema() if self._param_cls is not None else None
-         )
-         return ExtractorMetadata(
-             name=self.name,
-             version=self.version,
-             description=self.description,
-             system_dependencies=self.system_dependencies,
-             python_dependencies=self.python_dependencies,
-             input_mime_types=self.input_mime_types,
-             embedding_schemas=embedding_schemas,
-             input_params=json.dumps(json_schema),
-         )
-
      def _download_file(self, url, filename):
          if os.path.exists(filename):
              # file exists skip
@@ -190,7 +181,7 @@ def extractor(
      python_dependencies: Optional[List[str]] = None,
      system_dependencies: Optional[List[str]] = None,
      input_mime_types: Optional[List[str]] = None,
-     embedding_schemas: Optional[Dict[str, EmbeddingSchema]] = None,
+     embedding_indexes: Optional[Dict[str, EmbeddingSchema]] = None,
      sample_content: Optional[Callable] = None,
  ):
      args = locals()
@@ -198,7 +189,7 @@ def extractor(

      def construct(fn):
          def wrapper():
-             hint = get_type_hints(fn).get("params", dict)
+             description = fn.__doc__ or args.get("description", "")

              if not args.get("name"):
                  args[
@@ -207,7 +198,7 @@ def extractor(

              class DecoratedFn(Extractor):
                  @classmethod
-                 def extract(cls, input: Type[BaseData], params: Type[BaseModel]=None) -> List[Union[Type[BaseModel], Type[Feature]]]: # type: ignore
+                 def extract(cls, input: Type[BaseData], params: Type[BaseModel] = None) -> List[Union[Type[BaseModel], Type[Feature]]]: # type: ignore
                      # TODO we can force all the functions to take in a parms object
                      # or check if someone adds a params
                      if params is None:
@@ -220,6 +211,7 @@ def extractor(

              for key, val in args.items():
                  setattr(DecoratedFn, key, val)
+             DecoratedFn.description = description

              return DecoratedFn
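
Net effect of the extractor.py hunks: EmbeddingSchema gains an optional database_url, the per-extractor mapping is renamed to embedding_indexes (on the Extractor class and in the @extractor decorator), and the decorator now takes the wrapped function's docstring as its description. A minimal sketch of a decorated extractor under the new signatures; the function name, index name, and dimension are illustrative assumptions, not from the diff:

    from typing import List

    from indexify.extractor_sdk import Content, EmbeddingSchema, extractor


    @extractor(
        input_mime_types=["text/plain"],
        embedding_indexes={"chunks": EmbeddingSchema(dim=384)},  # distance defaults to "cosine"
    )
    def chunk_text(content: Content) -> List[Content]:
        """Splits text into chunks."""  # docstring becomes the description in 0.0.41
        return [content]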
 
indexify/extractor_sdk/utils.py
@@ -1,20 +1,17 @@
  import os
- import httpx
  from typing import List
+
+ import httpx
+
  from .data import Content, Feature


  class SampleExtractorData:
-     def _download_file(self, url, filename):
-         if os.path.exists(filename):
-             # file exists skip
-             return
+     def _download_file(self, url):
          try:
-             with httpx.get(url, stream=True) as r:
-                 r.raise_for_status() # Raises an HTTPError if the response status code is 4XX/5XX
-                 with open(filename, "wb") as f:
-                     for chunk in r.iter_content(chunk_size=8192):
-                         f.write(chunk)
+             resp = httpx.get(url, follow_redirects=True)
+             resp.raise_for_status()
+             return resp.content
          except httpx.exceptions.RequestException as e:
              print(f"Error downloading the file: {e}")

@@ -55,13 +52,10 @@ class SampleExtractorData:
          return Content(content_type="image/jpg", data=f.read(), features=features)

      def sample_invoice_pdf(self, features: List[Feature] = []) -> Content:
-         file_name = "sample.pdf"
-         self._download_file(
+         data = self._download_file(
              "https://extractor-files.diptanu-6d5.workers.dev/invoice-example.pdf",
-             file_name,
          )
-         f = open(file_name, "rb")
-         return Content(content_type="application/pdf", data=f.read(), features=features)
+         return Content(content_type="application/pdf", data=data, features=features)

      def sample_image_based_pdf(self, features: List[Feature] = []) -> Content:
          file_name = "sample.pdf"
File without changes
indexify/extractors/embedding.py (new file)
@@ -0,0 +1,55 @@
+ from typing import List
+
+ import torch
+ import torch.nn.functional as F
+ from transformers import AutoModel, AutoTokenizer
+
+ from indexify.extractor_sdk.data import Feature
+ from indexify.extractor_sdk.extractor import Extractor, Feature
+
+
+ class SentenceTransformersEmbedding:
+     def __init__(self, model_name) -> None:
+         self._model_name = model_name
+         self._tokenizer = AutoTokenizer.from_pretrained(
+             f"sentence-transformers/{model_name}"
+         )
+         self._model = AutoModel.from_pretrained(
+             f"sentence-transformers/{model_name}", torchscript=True
+         )
+         self._model.eval()
+
+     def embed_batch(self, inputs: List[str]) -> List[List[float]]:
+         result = self._embed(inputs)
+         return result.tolist()
+
+     def embed(self, query: str) -> List[float]:
+         result = self._embed([query])
+         return result[0].tolist()
+
+     def _embed(self, inputs: List[str]) -> torch.Tensor:
+         encoded_input = self._tokenizer(
+             inputs, padding=True, truncation=True, return_tensors="pt"
+         )
+         sentence_embeddings = self._model(**encoded_input)
+         return F.normalize(sentence_embeddings, p=2, dim=1)
+
+
+ class BasicSentenceTransformerModels(Extractor):
+     def __init__(self, model: str):
+         super().__init__()
+         self.model = SentenceTransformersEmbedding(model)
+
+     def extract(self, input: str) -> List[Feature]:
+         embeddings = self.model.embed(input)
+         return [Feature.embedding(values=embeddings)]
+
+
+ class BasicHFTransformerEmbeddingModels(Extractor):
+     def __init__(self, model: str):
+         super().__init__()
+         self._model = AutoModel.from_pretrained(model, trust_remote_code=True)
+
+     def extract(self, input: str) -> List[Feature]:
+         embeddings = self.model.embed_query(input)
+         return [Feature.embedding(values=embeddings)]
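
A short usage sketch for the new embedding helpers, showing the intended call pattern only; the model name is an assumption (any sentence-transformers checkpoint), and torch, transformers, and the model weights must be available:

    st = SentenceTransformersEmbedding(model_name="all-MiniLM-L6-v2")  # hypothetical model choice
    vector = st.embed("a single sentence")         # List[float]
    vectors = st.embed_batch(["first", "second"])  # List[List[float]]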
indexify/extractors/pdf_parser.py (new file)
@@ -0,0 +1,95 @@
+ import tempfile
+ from enum import Enum
+ from typing import List, Optional
+
+ import deepdoctection as dd
+ from pydantic import BaseModel
+
+
+ class PageFragmentType(str, Enum):
+     TEXT = "text"
+     FIGURE = "figure"
+     TABLE = "table"
+
+
+ class Image(BaseModel):
+     data: bytes
+     mime_type: str
+
+
+ class TableEncoding(str, Enum):
+     CSV = "csv"
+     HTML = "html"
+
+
+ class Table(BaseModel):
+     data: str
+     encoding: TableEncoding
+
+
+ class PageFragment(BaseModel):
+     fragment_type: PageFragmentType
+     text: Optional[str] = None
+     image: Optional[Image] = None
+     table: Optional[Table] = None
+     reading_order: Optional[int] = None
+
+
+ class Page(BaseModel):
+     number: int
+     fragments: List[PageFragment]
+
+
+ class PDFParser:
+     def __init__(self, data: bytes, language: Optional[str] = "en"):
+         self._data = data
+
+     def parse(self) -> List[Page]:
+         analyzer = dd.get_dd_analyzer()
+         parsed_pages = []
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as f:
+             f.write(self._data)
+             f.flush()
+             df = analyzer.analyze(path=f.name)
+             df.reset_state()
+             for page in df:
+                 parsed_pages.append(page)
+         outputs: List[Page] = []
+         for parsed_page in parsed_pages:
+             page_num = parsed_page.page_number
+             fragments = []
+             for layout in parsed_page.layouts:
+                 if layout.category_name in ["text", "title"]:
+                     fragments.append(
+                         PageFragment(
+                             fragment_type=PageFragmentType.TEXT,
+                             text=layout.text,
+                             reading_order=layout.reading_order,
+                         )
+                     )
+             figures = parsed_page.get_annotation(category_names=dd.LayoutType.FIGURE)
+             print(len(figures))
+             for figure in figures:
+                 image_bytes = dd.viz_handler.encode(figure.viz())
+                 fragments.append(
+                     PageFragment(
+                         fragment_type=PageFragmentType.FIGURE,
+                         image=Image(data=image_bytes, mime_type="image/png"),
+                         reading_order=figure.reading_order,
+                     )
+                 )
+
+             tables = parsed_page.get_annotation(category_names=dd.LayoutType.TABLE)
+             print(len(tables))
+             for table in tables:
+                 fragments.append(
+                     PageFragment(
+                         fragment_type=PageFragmentType.TABLE,
+                         table=Table(data=table.html, encoding=TableEncoding.HTML),
+                         reading_order=table.reading_order,
+                     )
+                 )
+
+             outputs.append(Page(number=page_num, fragments=fragments))
+
+         return outputs
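
A hedged sketch of driving the new parser; the input path is hypothetical, and deepdoctection plus its detection models must be installed for get_dd_analyzer() to work:

    with open("example.pdf", "rb") as f:  # hypothetical input file
        pages = PDFParser(f.read()).parse()

    for page in pages:
        for fragment in page.fragments:
            if fragment.fragment_type == PageFragmentType.TEXT:
                print(page.number, fragment.text)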
indexify/run_graph.py → indexify/graph.py
@@ -1,22 +1,28 @@
+ import itertools
  import json
-
- from .extractor_sdk import Content, extractor, Extractor
-
  from collections import defaultdict
  from typing import Any, Dict, List, Optional, Type, Union
- from pydantic import BaseModel

- import itertools
+ import cloudpickle
+ from pydantic import BaseModel

+ from .extractor_sdk import Content, Extractor, extractor
  from .runner import Runner

+
  @extractor(description="id function")
  def _id(content: Content) -> List[Content]:
      return [content]


- class RunGraph:
-     def __init__(self, name: str, input: Type[BaseModel], start_node: extractor, runner: Runner):
+ def load_graph(graph: bytes) -> "Graph":
+     return cloudpickle.loads(graph)
+
+
+ class Graph:
+     def __init__(
+         self, name: str, input: Type[BaseModel], start_node: extractor, runner: Runner
+     ):
          # TODO check for cycles
          self.name = name

@@ -35,7 +41,10 @@ class RunGraph:

          self.runner = runner

-     def _node(self, extractor: Union[extractor, Extractor], params: Any = None) -> 'RunGraph':
+     def get_extractor(self, name: str) -> Extractor:
+         return self.nodes[name]
+
+     def _node(self, extractor: Extractor, params: Any = None) -> "Graph":
          name = extractor.name

          # if you've already inserted a node just ignore the new insertion.
@@ -50,12 +59,15 @@

          return self

+     def serialize(self):
+         return cloudpickle.dumps(self)
+
      def add_edge(
          self,
-         from_node: extractor,
-         to_node: extractor,
+         from_node: Type[Extractor],
+         to_node: Type[Extractor],
          prefilter_predicates: Optional[str] = None,
-     ) -> 'RunGraph':
+     ) -> "Graph":

          self._node(from_node)
          self._node(to_node)
@@ -79,7 +91,7 @@
          from_node: extractor,
          to_nodes: List[extractor],
          prefilter_predicates: List[str] = [],
-     ) -> 'RunGraph':
+     ) -> "Graph":
          print(f"{to_nodes}, {prefilter_predicates}, {prefilter_predicates}")
          for t_n, p in itertools.zip_longest(
              to_nodes, prefilter_predicates, fillvalue=None
@@ -99,7 +111,6 @@

      def run(self, wf_input, local):
          self._assign_start_node()
-         # self.runner = LocalRunner()
          self.runner.run(self, wf_input=wf_input)
          pass
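
The rename from RunGraph to Graph arrives with cloudpickle round-tripping: serialize() turns the whole graph into bytes and load_graph() restores it. A minimal sketch under stated assumptions (the graph name and extractor are made up, Content is assumed to be an acceptable input model, and LocalRunner is assumed to construct with no arguments):

    from typing import List

    from indexify.extractor_sdk import Content, extractor
    from indexify.graph import Graph, load_graph
    from indexify.local_runner import LocalRunner


    @extractor(description="pass-through")
    def passthrough(content: Content) -> List[Content]:
        return [content]


    g = Graph("demo", input=Content, start_node=passthrough, runner=LocalRunner())
    blob = g.serialize()         # cloudpickle bytes, can be shipped to another process
    restored = load_graph(blob)  # rebuilds the Graph, nodes and edges included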
 
indexify/local_runner.py
@@ -2,15 +2,13 @@ import hashlib
  import os
  import pickle
  import shutil
- from pathlib import Path
-
- from indexify.extractor_sdk.data import BaseData, Feature
- from indexify.extractor_sdk.extractor import extractor, Extractor
-
-
  from collections import defaultdict
+ from pathlib import Path
  from typing import Any, Callable, Dict, Optional, Union

- from indexify.run_graph import RunGraph
+ from indexify.extractor_sdk.data import BaseData, Feature
+ from indexify.extractor_sdk.extractor import Extractor, extractor
+ from indexify.graph import Graph
  from indexify.runner import Runner


@@ -27,9 +25,9 @@ class LocalRunner(Runner):
      # those bytes have to be a python type

      # _input needs to be serializable into python object (ie json for ex) and Feature
-     def _run(self, g: RunGraph, _input: BaseData, node_name: str):
+     def _run(self, g: Graph, _input: BaseData, node_name: str):
          print(f"---- Starting node {node_name}")
-         print(f'node_name {node_name}')
+         print(f"node_name {node_name}")

          extractor_construct: Callable = g.nodes[node_name]
          params = g.params.get(node_name, None)
@@ -65,7 +63,9 @@
          for out_edge, pre_filter_predicate in g.edges[node_name]:
              # TODO there are no reductions yet, each recursion finishes it's path and returns
              for r in data_to_process:
-                 if self._prefilter_content(content=r, prefilter_predicate=pre_filter_predicate):
+                 if self._prefilter_content(
+                     content=r, prefilter_predicate=pre_filter_predicate
+                 ):
                      continue

                  self._run(g, _input=r, node_name=out_edge)
@@ -73,7 +73,10 @@
      """
      Returns True if content should be filtered
      """
-     def _prefilter_content(self, content: BaseData, prefilter_predicate: Optional[str]) -> bool:
+
+     def _prefilter_content(
+         self, content: BaseData, prefilter_predicate: Optional[str]
+     ) -> bool:
          if prefilter_predicate is None:
              return False

@@ -83,9 +86,9 @@

          # TODO For now only support `and` and `=` and `string values`
          bools = []
-         metadata = content.get_features()['metadata']
+         metadata = content.get_features()["metadata"]
          for atom in atoms:
-             l, r = atom.split('=')
+             l, r = atom.split("=")
              if l in metadata:
                  bools.append(metadata[l] != r)

@@ -109,7 +112,7 @@
          if not os.path.exists(file_path):
              return None

-         with open(file_path, 'rb') as f:
+         with open(file_path, "rb") as f:
              return f.read()

      def put_into_memo(self, node_name, input_hash, output):
@@ -121,5 +124,5 @@

          Path(file_path).touch()

-         with open(file_path, 'wb') as f:
+         with open(file_path, "wb") as f:
              return f.write(output)
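
These hunks also make the prefilter path easier to follow: per the TODO comment, a predicate is a string of key=value atoms combined with "and", checked against content.get_features()["metadata"], and content for which _prefilter_content returns True is skipped (the continue above) rather than passed along the edge. A hedged example of attaching one when wiring a graph; the node names are hypothetical:

    g.add_edge(parse_pdf, embed_text, prefilter_predicates="language=en")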
indexify/runner.py
@@ -1,9 +1,9 @@
  from abc import ABC
+ from typing import Any, Union

  from indexify.extractor_sdk.data import BaseData
- from indexify.extractor_sdk.extractor import extractor, Extractor
+ from indexify.extractor_sdk.extractor import Extractor, extractor

- from typing import Any, Union

  class Runner(ABC):
      def run(self, g, wf_input: BaseData):
pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "indexify"
- version = "0.0.39"
+ version = "0.0.41"
  description = "Python Client for Indexify"
  authors = ["Diptanu Gon Choudhury <diptanuc@gmail.com>", "Lucas Jackson <lucas@tensorlake.ai>", "Vijay Parthasarathy <vijay2win@gmail.com>"]
  license = "Apache 2.0"
@@ -13,6 +13,7 @@ python = "^3.9"
  httpx = { version = "^0", extras = ["http2"] }
  pyyaml = "^6"
  pydantic = "^2.8"
+ cloudpickle = "^3"

  [tool.poetry.dev-dependencies]
  black = "^22.3.0"
indexify-0.0.39/indexify/graph.py (removed)
@@ -1,23 +0,0 @@
- from .extractor_sdk import extractor, Extractor
-
- from typing import Type, Union
- from pydantic import BaseModel
-
- from .run_graph import RunGraph
- from .local_runner import LocalRunner
-
-
- def Graph(
-     name: str,
-     input: Type[BaseModel],
-     start_node: Union[extractor, Extractor],
-     run_local: bool,
- ) -> RunGraph:
-
-     if run_local:
-         runner = LocalRunner()
-     else:
-         raise NotImplementedError("Remote runner not supported yet")
-
-     graph = RunGraph(name=name, input=input, start_node=start_node, runner=runner)
-     return graph
File without changes
File without changes
indexify/data_loaders/__init__.py
@@ -1,10 +1,10 @@
- from pydantic import BaseModel
-
+ import hashlib
+ import mimetypes
+ import os
  from abc import ABC, abstractmethod
  from typing import List
- import os
- import mimetypes
- import hashlib
+
+ from pydantic import BaseModel


  class FileMetadata(BaseModel):
File without changes
File without changes