indexify 0.0.40.tar.gz → 0.0.42.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {indexify-0.0.40 → indexify-0.0.42}/PKG-INFO +1 -1
- {indexify-0.0.40 → indexify-0.0.42}/indexify/__init__.py +4 -6
- {indexify-0.0.40 → indexify-0.0.42}/indexify/base_client.py +11 -7
- {indexify-0.0.40 → indexify-0.0.42}/indexify/client.py +15 -12
- {indexify-0.0.40 → indexify-0.0.42}/indexify/data_loaders/local_directory_loader.py +3 -2
- {indexify-0.0.40 → indexify-0.0.42}/indexify/data_loaders/url_loader.py +4 -2
- {indexify-0.0.40 → indexify-0.0.42}/indexify/extraction_policy.py +2 -2
- {indexify-0.0.40 → indexify-0.0.42}/indexify/extractor_sdk/__init__.py +2 -2
- {indexify-0.0.40 → indexify-0.0.42}/indexify/extractor_sdk/data.py +3 -11
- {indexify-0.0.40 → indexify-0.0.42}/indexify/extractor_sdk/extractor.py +18 -6
- {indexify-0.0.40 → indexify-0.0.42}/indexify/extractor_sdk/utils.py +9 -15
- {indexify-0.0.40 → indexify-0.0.42}/indexify/extractors/embedding.py +15 -13
- indexify-0.0.42/indexify/extractors/pdf_parser.py +93 -0
- {indexify-0.0.40 → indexify-0.0.42}/indexify/graph.py +12 -8
- {indexify-0.0.40 → indexify-0.0.42}/indexify/local_runner.py +15 -12
- {indexify-0.0.40 → indexify-0.0.42}/indexify/runner.py +2 -2
- {indexify-0.0.40 → indexify-0.0.42}/pyproject.toml +1 -1
- {indexify-0.0.40 → indexify-0.0.42}/LICENSE.txt +0 -0
- {indexify-0.0.40 → indexify-0.0.42}/README.md +0 -0
- {indexify-0.0.40 → indexify-0.0.42}/indexify/data_loaders/__init__.py +5 -5
- {indexify-0.0.40 → indexify-0.0.42}/indexify/error.py +0 -0
- {indexify-0.0.40 → indexify-0.0.42}/indexify/exceptions.py +0 -0
- {indexify-0.0.40 → indexify-0.0.42}/indexify/extractors/__init__.py +0 -0
- {indexify-0.0.40 → indexify-0.0.42}/indexify/settings.py +0 -0
- {indexify-0.0.40 → indexify-0.0.42}/indexify/utils.py +0 -0
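In summary: the bulk of this release is mechanical formatting — import blocks regrouped and alphabetized, long signatures reflowed, quote styles normalized — plus one new module (`indexify/extractors/pdf_parser.py`), a `SampleExtractorData._download_file` helper that now returns bytes instead of writing to disk, and a filled-in `BasicHFTransformerEmbeddingModels`. Note that the diff renderer clipped several changed lines mid-line on the removed side; those lines are reproduced below exactly as rendered.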
indexify/__init__.py:

```diff
@@ -1,15 +1,13 @@
-from .
-from .extraction_policy import ExtractionGraph
+from . import data_loaders, extractor_sdk
 from .client import (
-    IndexifyClient,
     Document,
+    IndexifyClient,
     generate_hash_from_string,
     generate_unique_hex_id,
 )
-from . import
-from .settings import DEFAULT_SERVICE_URL
-from . import data_loaders
+from .extraction_policy import ExtractionGraph
 from .graph import Graph
+from .settings import DEFAULT_SERVICE_URL
 
 __all__ = [
     "data_loaders",
```
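Aside from ordering, the package's re-exported surface is unchanged. A quick smoke test of the names above, assuming `indexify==0.0.42` is installed:

```python
# Every name re-exported by indexify/__init__.py should import cleanly.
from indexify import (
    Document,
    ExtractionGraph,
    Graph,
    IndexifyClient,
    data_loaders,
    extractor_sdk,
)

# Document is namedtuple("Document", ["text", "labels", "id"]) per the client.py hunk below.
doc = Document(text="hello", labels={"lang": "en"}, id="doc-1")
print(doc.text, doc.labels)
```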
indexify/base_client.py:

```diff
@@ -1,6 +1,8 @@
 from abc import ABC, abstractmethod
-from
-
+from typing import Any, Dict, List, Optional, Union
+
+from .extractor_sdk import Feature, Graph
+
 
 class BaseClient(ABC):
 
@@ -23,7 +25,7 @@ class BaseClient(ABC):
 
     ### Ingestion APIs
     @abstractmethod
-    def invoke_graph_with_object(self, graph:str, object: Any) -> str:
+    def invoke_graph_with_object(self, graph: str, object: Any) -> str:
         """
         Invokes a graph with an input object.
         graph: str: The name of the graph to invoke
@@ -42,10 +44,11 @@ class BaseClient(ABC):
         """
         pass
 
-
     ### Retrieval APIs
     @abstractmethod
-    def extracted_objects(
+    def extracted_objects(
+        self, graph: str, ingested_object_id: str, extractor_name: Optional[str]
+    ) -> Union[Dict[str, List[Any]], List[Any]]:
         """
         Returns the extracted objects by a graph for an ingested object. If the extractor name is provided, only the objects extracted by that extractor are returned.
         If the extractor name is not provided, all the extracted objects are returned for the input object.
@@ -57,11 +60,12 @@ class BaseClient(ABC):
         pass
 
     @abstractmethod
-    def features(
+    def features(
+        self, object_id: str, graph: Optional[str]
+    ) -> Union[Dict[str, List[Feature]], List[Feature]]:
         """
         Returns the features of an object.
         object_id: str: The ID of the object
         return: List[Feature]: The features associated with the object that were extracted. If a graph name is provided, only the features extracted by that graph are returned.
         """
         pass
-
```
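Both retrieval signatures are reflowed, not changed. A hypothetical stub subclass, purely to illustrate the abstract contract (it ignores any abstract methods outside these hunks):

```python
from typing import Any, Dict, List, Optional, Union

from indexify.base_client import BaseClient


class StubClient(BaseClient):
    """Placeholder bodies only; a real client would talk to the Indexify server."""

    def invoke_graph_with_object(self, graph: str, object: Any) -> str:
        return "invocation-id"  # hypothetical ID

    def extracted_objects(
        self, graph: str, ingested_object_id: str, extractor_name: Optional[str]
    ) -> Union[Dict[str, List[Any]], List[Any]]:
        # Flat list when one extractor is named, dict keyed by extractor otherwise.
        return [] if extractor_name else {}

    def features(self, object_id: str, graph: Optional[str]):
        # Per the docstring: all features, or only those extracted by `graph`.
        return [] if graph else {}
```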
indexify/client.py:

```diff
@@ -1,20 +1,23 @@
-import yaml
-import httpx
-import uuid
 import hashlib
 import json
+import logging
+import uuid
 from collections import namedtuple
-from
-from
-
-
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Union
+
+import httpx
+import yaml
+
+from indexify.exceptions import ApiException
+
+from .data_loaders import DataLoader
 from .error import Error
+from .extraction_policy import ExtractionGraph
 from .extractor_sdk.data import ContentMetadata
-from .
-from
-from
-from typing import List, Optional, Union, Dict
-import logging
+from .extractor_sdk.extractor import ExtractorMetadata
+from .settings import DEFAULT_SERVICE_URL, DEFAULT_SERVICE_URL_HTTPS
+from .utils import json_set_default
 
 Document = namedtuple("Document", ["text", "labels", "id"])
 
```
indexify/extractor_sdk/__init__.py:

```diff
@@ -1,5 +1,5 @@
-from .data import
-from .extractor import
+from .data import Content, ContentMetadata, Feature
+from .extractor import EmbeddingSchema, Extractor, ExtractorMetadata, extractor
 from .utils import SampleExtractorData
 
 __all__ = [
```
indexify/extractor_sdk/data.py:

```diff
@@ -1,15 +1,7 @@
-from typing import (
-    Any,
-    List,
-    Optional,
-    Literal,
-    Dict,
-    Type,
-    cast,
-    Mapping,
-)
-from pydantic import BaseModel, Json, Field
 import json
+from typing import Any, Dict, List, Literal, Mapping, Optional, Type, cast
+
+from pydantic import BaseModel, Field, Json
 from typing_extensions import Annotated, Doc
 
 
```
indexify/extractor_sdk/extractor.py:

```diff
@@ -1,11 +1,22 @@
-from typing import Union, Optional, List, Type, Tuple, Callable, get_type_hints, Dict
 import inspect
-from pydantic import BaseModel, Field
-from abc import ABC, abstractmethod
-from .data import BaseData, Content, Feature
 import json
 import os
+from abc import ABC, abstractmethod
+from typing import (
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Tuple,
+    Type,
+    Union,
+    get_type_hints,
+)
+
 import requests
+from pydantic import BaseModel, Field
+
+from .data import BaseData, Content, Feature
 
 
 class EmbeddingSchema(BaseModel):
@@ -13,6 +24,7 @@ class EmbeddingSchema(BaseModel):
     distance: Optional[str] = "cosine"
     database_url: Optional[str] = None
 
+
 class ExtractorMetadata(BaseModel):
     name: str
     version: str
@@ -42,7 +54,7 @@ class Extractor(ABC):
     input_mime_types = ["text/plain"]
 
     embedding_indexes: Dict[str, EmbeddingSchema] = {}
-
+
     @abstractmethod
     def extract(
         self, input: Type[BaseModel], params: Type[BaseModel] = None
@@ -186,7 +198,7 @@ def extractor(
 
     class DecoratedFn(Extractor):
         @classmethod
-        def extract(cls, input: Type[BaseData], params: Type[BaseModel]=None) -> List[Union[Type[BaseModel], Type[Feature]]]:  # type: ignore
+        def extract(cls, input: Type[BaseData], params: Type[BaseModel] = None) -> List[Union[Type[BaseModel], Type[Feature]]]:  # type: ignore
             # TODO we can force all the functions to take in a parms object
             # or check if someone adds a params
             if params is None:
```
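For reference, the `@extractor` decorator whose generated `extract` is reformatted above is exercised in graph.py further down; a minimal sketch of that usage pattern:

```python
from typing import List

from indexify.extractor_sdk import Content, extractor


# Mirrors the `_id` function registered in indexify/graph.py below: the decorator
# wraps the function in an Extractor subclass (DecoratedFn) whose extract() calls it.
@extractor(description="identity function")
def passthrough(content: Content) -> List[Content]:
    return [content]
```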
indexify/extractor_sdk/utils.py:

```diff
@@ -1,20 +1,17 @@
 import os
-import httpx
 from typing import List
+
+import httpx
+
 from .data import Content, Feature
 
 
 class SampleExtractorData:
-    def _download_file(self, url
-        if os.path.exists(filename):
-            # file exists skip
-            return
+    def _download_file(self, url):
         try:
-
-
-
-            for chunk in r.iter_content(chunk_size=8192):
-                f.write(chunk)
+            resp = httpx.get(url, follow_redirects=True)
+            resp.raise_for_status()
+            return resp.content
         except httpx.exceptions.RequestException as e:
             print(f"Error downloading the file: {e}")
 
@@ -55,13 +52,10 @@ class SampleExtractorData:
         return Content(content_type="image/jpg", data=f.read(), features=features)
 
     def sample_invoice_pdf(self, features: List[Feature] = []) -> Content:
-
-        self._download_file(
+        data = self._download_file(
             "https://extractor-files.diptanu-6d5.workers.dev/invoice-example.pdf",
-            file_name,
         )
-
-        return Content(content_type="application/pdf", data=f.read(), features=features)
+        return Content(content_type="application/pdf", data=data, features=features)
 
     def sample_image_based_pdf(self, features: List[Feature] = []) -> Content:
         file_name = "sample.pdf"
```
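`_download_file` previously streamed the response to a local file (skipping the download when the file already existed); it now fetches with httpx and returns the raw bytes, which `sample_invoice_pdf` threads straight into `Content`. One caveat worth hedging: current httpx releases expose `httpx.RequestError` and have no `exceptions` submodule, so the `except httpx.exceptions.RequestException` clause may only work on older httpx versions. Expected usage after the change:

```python
from indexify.extractor_sdk import SampleExtractorData

# After this change the helper downloads into memory on every call (no on-disk
# cache); Content.data carries the PDF bytes directly. Requires network access.
sample = SampleExtractorData()
invoice = sample.sample_invoice_pdf()
print(invoice.content_type, len(invoice.data))  # "application/pdf" and the byte count
```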
indexify/extractors/embedding.py:

```diff
@@ -1,10 +1,12 @@
 from typing import List
 
-from indexify.extractor_sdk.data import Feature
 import torch
 import torch.nn.functional as F
 from transformers import AutoModel, AutoTokenizer
-
+
+from indexify.extractor_sdk.data import Feature
+from indexify.extractor_sdk.extractor import Extractor, Feature
+
 
 class SentenceTransformersEmbedding:
     def __init__(self, model_name) -> None:
@@ -31,9 +33,9 @@ class SentenceTransformersEmbedding:
         )
         sentence_embeddings = self._model(**encoded_input)
         return F.normalize(sentence_embeddings, p=2, dim=1)
-
-class BasicSentenceTransformerModels(Extractor):
 
+
+class BasicSentenceTransformerModels(Extractor):
     def __init__(self, model: str):
         super().__init__()
         self.model = SentenceTransformersEmbedding(model)
@@ -41,13 +43,13 @@ class BasicSentenceTransformerModels(Extractor):
     def extract(self, input: str) -> List[Feature]:
         embeddings = self.model.embed(input)
         return [Feature.embedding(values=embeddings)]
-
+
+
 class BasicHFTransformerEmbeddingModels(Extractor):
-
-
-
-
-
-
-
-        return [Feature.embedding(values=embeddings)]
+    def __init__(self, model: str):
+        super().__init__()
+        self._model = AutoModel.from_pretrained(model, trust_remote_code=True)
+
+    def extract(self, input: str) -> List[Feature]:
+        embeddings = self.model.embed_query(input)
+        return [Feature.embedding(values=embeddings)]
```
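A usage sketch for the sentence-transformer extractor (the model name is an example; weights are fetched on first use). Two loose ends are visible in the new code as diffed: `BasicHFTransformerEmbeddingModels.__init__` stores the model as `self._model` while its `extract` reads `self.model.embed_query(...)`, and `Feature` is imported twice, with the second import shadowing the first.

```python
from indexify.extractors.embedding import BasicSentenceTransformerModels

# Requires torch and transformers; downloads the checkpoint on first run.
embedder = BasicSentenceTransformerModels(model="sentence-transformers/all-MiniLM-L6-v2")
features = embedder.extract("indexify builds extraction pipelines")
print(features[0])  # a Feature.embedding(...) wrapping the normalized vector
```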
indexify/extractors/pdf_parser.py (new file):

```diff
@@ -0,0 +1,93 @@
+import tempfile
+from enum import Enum
+from typing import List, Optional
+
+from pydantic import BaseModel
+
+
+class PageFragmentType(str, Enum):
+    TEXT = "text"
+    FIGURE = "figure"
+    TABLE = "table"
+
+
+class Image(BaseModel):
+    data: bytes
+    mime_type: str
+
+
+class TableEncoding(str, Enum):
+    CSV = "csv"
+    HTML = "html"
+
+
+class Table(BaseModel):
+    data: str
+    encoding: TableEncoding
+
+
+class PageFragment(BaseModel):
+    fragment_type: PageFragmentType
+    text: Optional[str] = None
+    image: Optional[Image] = None
+    table: Optional[Table] = None
+    reading_order: Optional[int] = None
+
+
+class Page(BaseModel):
+    number: int
+    fragments: List[PageFragment]
+
+
+class PDFParser:
+    def __init__(self, data: bytes, language: Optional[str] = "en"):
+        self._data = data
+
+    def parse(self) -> List[Page]:
+        import deepdoctection as dd
+        analyzer = dd.get_dd_analyzer()
+        parsed_pages = []
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as f:
+            f.write(self._data)
+            f.flush()
+            df = analyzer.analyze(path=f.name)
+            df.reset_state()
+            for page in df:
+                parsed_pages.append(page)
+        outputs: List[Page] = []
+        for parsed_page in parsed_pages:
+            page_num = parsed_page.page_number
+            fragments = []
+            for layout in parsed_page.layouts:
+                if layout.category_name in ["text", "title"]:
+                    fragments.append(
+                        PageFragment(
+                            fragment_type=PageFragmentType.TEXT,
+                            text=layout.text,
+                            reading_order=layout.reading_order,
+                        )
+                    )
+            figures = parsed_page.get_annotation(category_names=dd.LayoutType.FIGURE)
+            for figure in figures:
+                image_bytes = dd.viz_handler.encode(figure.viz())
+                fragments.append(
+                    PageFragment(
+                        fragment_type=PageFragmentType.FIGURE,
+                        image=Image(data=image_bytes, mime_type="image/png"),
+                        reading_order=figure.reading_order,
+                    )
+                )
+
+            tables = parsed_page.get_annotation(category_names=dd.LayoutType.TABLE)
+            for table in tables:
+                fragments.append(
+                    PageFragment(
+                        fragment_type=PageFragmentType.TABLE,
+                        table=Table(data=table.html, encoding=TableEncoding.HTML),
+                        reading_order=table.reading_order,
+                    )
+                )
+
+            outputs.append(Page(number=page_num, fragments=fragments))
+
+        return outputs
```
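A usage sketch for the new parser, assuming the optional `deepdoctection` dependency (and its models) is installed; it is imported lazily inside `parse()`, so it is only needed at call time. Note also that the `language` constructor argument is accepted but not yet used (only `self._data` is stored).

```python
from indexify.extractors.pdf_parser import PDFParser, PageFragmentType

# Parse a local PDF from raw bytes (path is an example).
with open("invoice-example.pdf", "rb") as f:
    pages = PDFParser(f.read()).parse()

for page in pages:
    for fragment in page.fragments:
        if fragment.fragment_type == PageFragmentType.TEXT:
            print(page.number, fragment.reading_order, (fragment.text or "")[:60])
```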
indexify/graph.py:

```diff
@@ -1,24 +1,28 @@
-import json
 import itertools
+import json
 from collections import defaultdict
 from typing import Any, Dict, List, Optional, Type, Union
 
 import cloudpickle
 from pydantic import BaseModel
 
-from .extractor_sdk import Content,
+from .extractor_sdk import Content, Extractor, extractor
 from .runner import Runner
 
+
 @extractor(description="id function")
 def _id(content: Content) -> List[Content]:
     return [content]
 
 
-def load_graph(graph: bytes) ->
+def load_graph(graph: bytes) -> "Graph":
     return cloudpickle.loads(graph)
 
+
 class Graph:
-    def __init__(
+    def __init__(
+        self, name: str, input: Type[BaseModel], start_node: extractor, runner: Runner
+    ):
         # TODO check for cycles
         self.name = name
 
@@ -40,7 +44,7 @@ class Graph:
     def get_extractor(self, name: str) -> Extractor:
         return self.nodes[name]
 
-    def _node(self, extractor: Extractor, params: Any = None) ->
+    def _node(self, extractor: Extractor, params: Any = None) -> "Graph":
         name = extractor.name
 
         # if you've already inserted a node just ignore the new insertion.
@@ -54,7 +58,7 @@ class Graph:
         self._topo_counter[name] = 1
 
         return self
-
+
     def serialize(self):
         return cloudpickle.dumps(self)
 
@@ -63,7 +67,7 @@
         from_node: Type[Extractor],
         to_node: Type[Extractor],
         prefilter_predicates: Optional[str] = None,
-    ) ->
+    ) -> "Graph":
 
         self._node(from_node)
         self._node(to_node)
@@ -87,7 +91,7 @@ class Graph:
         from_node: extractor,
         to_nodes: List[extractor],
         prefilter_predicates: List[str] = [],
-    ) ->
+    ) -> "Graph":
         print(f"{to_nodes}, {prefilter_predicates}, {prefilter_predicates}")
         for t_n, p in itertools.zip_longest(
             to_nodes, prefilter_predicates, fillvalue=None
```
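The quoted `"Graph"` return annotations are forward references to the class under definition. `serialize()` and `load_graph()` are a cloudpickle round trip; a small sketch assuming an already-constructed graph:

```python
from indexify.graph import Graph, load_graph


def roundtrip(g: Graph) -> Graph:
    # Graph.serialize() is cloudpickle.dumps(self); load_graph() is cloudpickle.loads().
    blob: bytes = g.serialize()
    restored = load_graph(blob)
    assert restored.name == g.name
    return restored
```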
indexify/local_runner.py:

```diff
@@ -2,14 +2,12 @@ import hashlib
 import os
 import pickle
 import shutil
-from pathlib import Path
-
-from indexify.extractor_sdk.data import BaseData, Feature
-from indexify.extractor_sdk.extractor import extractor, Extractor
-
 from collections import defaultdict
+from pathlib import Path
 from typing import Any, Callable, Dict, Optional, Union
 
+from indexify.extractor_sdk.data import BaseData, Feature
+from indexify.extractor_sdk.extractor import Extractor, extractor
 from indexify.graph import Graph
 from indexify.runner import Runner
 
@@ -29,7 +27,7 @@ class LocalRunner(Runner):
    # _input needs to be serializable into python object (ie json for ex) and Feature
    def _run(self, g: Graph, _input: BaseData, node_name: str):
        print(f"---- Starting node {node_name}")
-       print(f
+       print(f"node_name {node_name}")
 
        extractor_construct: Callable = g.nodes[node_name]
        params = g.params.get(node_name, None)
@@ -65,7 +63,9 @@
        for out_edge, pre_filter_predicate in g.edges[node_name]:
            # TODO there are no reductions yet, each recursion finishes it's path and returns
            for r in data_to_process:
-               if self._prefilter_content(
+               if self._prefilter_content(
+                   content=r, prefilter_predicate=pre_filter_predicate
+               ):
                    continue
 
                self._run(g, _input=r, node_name=out_edge)
@@ -73,7 +73,10 @@
    """
    Returns True if content should be filtered
    """
-
+
+   def _prefilter_content(
+       self, content: BaseData, prefilter_predicate: Optional[str]
+   ) -> bool:
        if prefilter_predicate is None:
            return False
 
@@ -83,9 +86,9 @@
 
        # TODO For now only support `and` and `=` and `string values`
        bools = []
-       metadata = content.get_features()[
+       metadata = content.get_features()["metadata"]
        for atom in atoms:
-           l, r = atom.split(
+           l, r = atom.split("=")
            if l in metadata:
                bools.append(metadata[l] != r)
 
@@ -109,7 +112,7 @@
        if not os.path.exists(file_path):
            return None
 
-       with open(file_path,
+       with open(file_path, "rb") as f:
            return f.read()
 
    def put_into_memo(self, node_name, input_hash, output):
@@ -121,5 +124,5 @@
 
        Path(file_path).touch()
 
-       with open(file_path,
+       with open(file_path, "wb") as f:
            return f.write(output)
```
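On the prefilter logic: per the TODO, predicates are `key=value` atoms joined by `and`, compared against the content's `metadata` features; how the collected booleans are combined happens outside the visible hunks. A standalone sketch of just the visible behavior (the `and`-splitting and `.strip()` are assumptions):

```python
from typing import List, Optional


def prefilter_bools(metadata: dict, prefilter_predicate: Optional[str]) -> List[bool]:
    # Mirrors the loop in LocalRunner._prefilter_content: one bool per known key,
    # True when the metadata value differs from the predicate's value.
    if prefilter_predicate is None:
        return []
    bools = []
    for atom in prefilter_predicate.split("and"):
        l, r = atom.strip().split("=")
        if l in metadata:
            bools.append(metadata[l] != r)
    return bools


print(prefilter_bools({"lang": "en"}, "lang=en"))  # [False]
```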
indexify/runner.py:

```diff
@@ -1,9 +1,9 @@
 from abc import ABC
+from typing import Any, Union
 
 from indexify.extractor_sdk.data import BaseData
-from indexify.extractor_sdk.extractor import
+from indexify.extractor_sdk.extractor import Extractor, extractor
 
-from typing import Any, Union
 
 class Runner(ABC):
     def run(self, g, wf_input: BaseData):
```
pyproject.toml:

```diff
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "indexify"
-version = "0.0.40"
+version = "0.0.42"
 description = "Python Client for Indexify"
 authors = ["Diptanu Gon Choudhury <diptanuc@gmail.com>", "Lucas Jackson <lucas@tensorlake.ai>", "Vijay Parthasarathy <vijay2win@gmail.com>"]
 license = "Apache 2.0"
```
indexify/data_loaders/__init__.py:

```diff
@@ -1,10 +1,10 @@
-
-
+import hashlib
+import mimetypes
+import os
 from abc import ABC, abstractmethod
 from typing import List
-
-import
-import hashlib
+
+from pydantic import BaseModel
 
 
 class FileMetadata(BaseModel):
```