indexify 0.0.42__py3-none-any.whl → 0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/__init__.py +13 -14
- indexify/base_client.py +48 -21
- indexify/cli.py +235 -0
- indexify/client.py +18 -790
- indexify/error.py +3 -30
- indexify/executor/agent.py +362 -0
- indexify/executor/api_objects.py +43 -0
- indexify/executor/downloader.py +124 -0
- indexify/executor/executor_tasks.py +72 -0
- indexify/executor/function_worker.py +177 -0
- indexify/executor/indexify_executor.py +32 -0
- indexify/executor/task_reporter.py +110 -0
- indexify/executor/task_store.py +113 -0
- indexify/foo +72 -0
- indexify/functions_sdk/data_objects.py +37 -0
- indexify/functions_sdk/graph.py +276 -0
- indexify/functions_sdk/graph_validation.py +69 -0
- indexify/functions_sdk/image.py +26 -0
- indexify/functions_sdk/indexify_functions.py +192 -0
- indexify/functions_sdk/local_cache.py +46 -0
- indexify/functions_sdk/object_serializer.py +61 -0
- indexify/local_client.py +183 -0
- indexify/remote_client.py +319 -0
- indexify-0.2.dist-info/METADATA +151 -0
- indexify-0.2.dist-info/RECORD +32 -0
- indexify-0.2.dist-info/entry_points.txt +3 -0
- indexify/exceptions.py +0 -3
- indexify/extraction_policy.py +0 -75
- indexify/extractor_sdk/__init__.py +0 -14
- indexify/extractor_sdk/data.py +0 -100
- indexify/extractor_sdk/extractor.py +0 -223
- indexify/extractor_sdk/utils.py +0 -102
- indexify/extractors/__init__.py +0 -0
- indexify/extractors/embedding.py +0 -55
- indexify/extractors/pdf_parser.py +0 -93
- indexify/graph.py +0 -133
- indexify/local_runner.py +0 -128
- indexify/runner.py +0 -22
- indexify/utils.py +0 -7
- indexify-0.0.42.dist-info/METADATA +0 -66
- indexify-0.0.42.dist-info/RECORD +0 -25
- {indexify-0.0.42.dist-info → indexify-0.2.dist-info}/LICENSE.txt +0 -0
- {indexify-0.0.42.dist-info → indexify-0.2.dist-info}/WHEEL +0 -0
indexify/exceptions.py
DELETED
indexify/extraction_policy.py
DELETED
@@ -1,75 +0,0 @@
|
|
1
|
-
from dataclasses import asdict, dataclass
|
2
|
-
from typing import List, Optional
|
3
|
-
|
4
|
-
|
5
|
-
@dataclass
class ExtractionPolicy:
    """A named policy that pairs an extractor with a content source.

    Attributes are plain data; ``input_params``, ``id`` and ``labels_eq`` are
    optional and default to ``None``.
    """

    extractor: str
    name: str
    content_source: str
    input_params: Optional[dict] = None
    id: Optional[str] = None
    labels_eq: Optional[str] = None

    def __repr__(self) -> str:
        return f"ExtractionPolicy(name={self.name} extractor={self.extractor})"

    def __str__(self) -> str:
        return self.__repr__()

    def to_dict(self) -> dict:
        """Return the policy as a dict, leaving out fields whose value is None."""
        return {k: v for k, v in asdict(self).items() if v is not None}

    @classmethod
    def from_dict(cls, json: dict):
        """Build a policy from a mapping of field names to values.

        The key ``filters_eq`` is accepted as an alias for ``labels_eq`` and a
        missing ``id`` defaults to ``None``. The input mapping is copied first,
        so the caller's dict is left untouched.

        Raises:
            TypeError: if a required field is missing or an unknown key is
                present (raised by the dataclass constructor).
        """
        # Work on a shallow copy: the renames below must not leak to the caller.
        fields = dict(json)
        if "filters_eq" in fields:
            fields["labels_eq"] = fields.pop("filters_eq")
        fields.setdefault("id", None)
        return cls(**fields)
|
30
|
-
|
31
|
-
|
32
|
-
@dataclass
class ExtractionGraph:
    """A named collection of extraction policies."""

    id: str
    name: str
    description: str
    extraction_policies: List[ExtractionPolicy]

    @classmethod
    def from_dict(cls, json: dict):
        """Build a graph from a mapping of field names to values.

        A ``namespace`` key is ignored; missing ``id`` and ``description``
        default to ``None``. The input mapping is not modified.

        Raises:
            TypeError: if a required field is missing or an unknown key is
                present (raised by the dataclass constructor).
        """
        # Filter into a new dict rather than popping from the caller's one.
        fields = {key: value for key, value in json.items() if key != "namespace"}
        fields.setdefault("id", None)
        fields.setdefault("description", None)
        return cls(**fields)

    @staticmethod
    def from_yaml(spec: str):
        """Parse a YAML document and build a graph from the resulting mapping."""
        import yaml

        # NOTE(review): yaml.FullLoader can construct more than plain data
        # types. If specs may come from untrusted sources, prefer
        # yaml.safe_load; left unchanged here to preserve accepted inputs.
        return ExtractionGraph.from_dict(yaml.load(spec, Loader=yaml.FullLoader))

    @staticmethod
    def from_yaml_file(path: str):
        """Read the file at ``path`` and build a graph from its YAML content."""
        with open(path, "r") as f:
            return ExtractionGraph.from_yaml(f.read())

    def to_dict(self) -> dict:
        """Return the graph as a dict, leaving out top-level None values."""
        return {k: v for k, v in asdict(self).items() if v is not None}
|
61
|
-
|
62
|
-
|
63
|
-
class ExtractionGraphBuilder:
    """Fluent builder that accumulates policies and produces an ExtractionGraph."""

    def __init__(self, name: str, description: Optional[str] = None):
        """Start a builder for a graph called ``name``.

        Args:
            name: name given to the built graph.
            description: optional description given to the built graph.
        """
        self.name = name
        self.description = description
        # build() reads self.id; initialise it so build() cannot fail with
        # AttributeError. Callers may assign an id before building.
        self.id = None
        self.extraction_policies = []

    def policy(self, policy: ExtractionPolicy) -> "ExtractionGraphBuilder":
        """Append ``policy`` and return the builder so calls can be chained."""
        self.extraction_policies.append(policy)
        return self

    def build(self):
        """Return an ExtractionGraph holding the policies added so far."""
        # ExtractionGraph declares id and description without defaults, so
        # both must be passed explicitly.
        return ExtractionGraph(
            id=self.id,
            name=self.name,
            description=self.description,
            extraction_policies=self.extraction_policies,
        )
|
@@ -1,14 +0,0 @@
|
|
1
|
-
from .data import Content, ContentMetadata, Feature
|
2
|
-
from .extractor import EmbeddingSchema, Extractor, ExtractorMetadata, extractor
|
3
|
-
from .utils import SampleExtractorData
|
4
|
-
|
5
|
-
__all__ = [
|
6
|
-
"ContentMetadata",
|
7
|
-
"Content",
|
8
|
-
"Feature",
|
9
|
-
"Extractor",
|
10
|
-
"extractor",
|
11
|
-
"EmbeddingSchema",
|
12
|
-
"ExtractorMetadata",
|
13
|
-
"SampleExtractorData",
|
14
|
-
]
|
indexify/extractor_sdk/data.py
DELETED
@@ -1,100 +0,0 @@
|
|
1
|
-
import json
|
2
|
-
from typing import Any, Dict, List, Literal, Mapping, Optional, Type, cast
|
3
|
-
|
4
|
-
from pydantic import BaseModel, Field, Json
|
5
|
-
from typing_extensions import Annotated, Doc
|
6
|
-
|
7
|
-
|
8
|
-
class BaseData(BaseModel):
    """Base model carrying a name-to-feature mapping in ``meta``."""

    meta: Mapping[str, Type[BaseModel]] = {}

    def get_features(self) -> Mapping[str, Type[BaseModel]]:
        """Return the whole ``meta`` mapping (not a list)."""
        return self.meta

    def get_feature(self, name: str) -> Optional[Type[BaseModel]]:
        """Return the entry stored under ``name``, or None when absent."""
        return self.meta.get(name)
|
16
|
-
|
17
|
-
|
18
|
-
class Feature(BaseModel):
    """A named feature whose value (and optional comment) is JSON-encoded."""

    feature_type: Literal["embedding", "metadata"]
    name: str
    value: Json
    comment: Optional[Json] = Field(default=None)

    @classmethod
    def embedding(cls, values: List[float], name: str = "embedding", distance="cosine"):
        """Create an embedding feature from ``values`` and a distance name."""
        return cls(
            feature_type="embedding",
            name=name,
            value=json.dumps({"values": values, "distance": distance}),
            comment=None,
        )

    @classmethod
    def metadata(cls, value: Json, comment: Json = None, name: str = "metadata"):
        """Create a metadata feature.

        Both ``value`` and ``comment`` (when given) are serialized with
        ``json.dumps`` before being handed to the model.
        """
        value = json.dumps(value)
        comment = json.dumps(comment) if comment is not None else None
        # The serialized comment was previously computed but never passed on.
        return cls(feature_type="metadata", name=name, value=value, comment=comment)
|
38
|
-
|
39
|
-
|
40
|
-
class Content(BaseModel):
    """A blob of bytes with an optional id, a content type and features."""

    # Was ``(None,)``: the trailing comma made the default a one-element tuple.
    id: Optional[str] = None
    content_type: Optional[str]
    data: bytes
    features: List[Feature] = []

    @classmethod
    def from_text(
        cls,
        text: str,
        features: Optional[List[Feature]] = None,
    ):
        """Create ``text/plain`` content from ``text`` encoded as UTF-8."""
        return cls(
            id=None,
            content_type="text/plain",
            data=bytes(text, "utf-8"),
            features=[] if features is None else features,
        )

    @classmethod
    def from_json(cls, json_data: Json, features: Optional[List[Feature]] = None):
        """Create ``application/json`` content from ``json_data``."""
        return cls(
            content_type="application/json",
            data=bytes(json.dumps(json_data), "utf-8"),
            features=[] if features is None else features,
        )

    @classmethod
    def from_file(cls, path: str):
        """Create content from the file at ``path``.

        The content type is guessed from the file name and may be None.
        """
        import mimetypes

        m, _ = mimetypes.guess_type(path)
        with open(path, "rb") as f:
            return cls(id="none-for-now", content_type=m, data=f.read())
|
74
|
-
|
75
|
-
|
76
|
-
class ContentMetadata(BaseModel):
    """Metadata record describing a piece of content."""

    id: str
    parent_id: str
    labels: Dict[str, Any]
    extraction_graph_names: List[str]
    extraction_policy: str
    mime_type: str
    extracted_metadata: Dict[str, Any] = {}

    @classmethod
    def from_dict(cls, json: Dict):
        """Build an instance from ``json``.

        Every key listed below must be present (a missing one raises
        KeyError). ``extraction_policy`` is read from the ``"source"`` key.
        """
        # model field -> key looked up in the incoming dict
        key_for_field = {
            "id": "id",
            "parent_id": "parent_id",
            "labels": "labels",
            "extraction_graph_names": "extraction_graph_names",
            "extraction_policy": "source",
            "mime_type": "mime_type",
            "extracted_metadata": "extracted_metadata",
        }
        kwargs = {field: json[key] for field, key in key_for_field.items()}
        return cls(**kwargs)
|
96
|
-
|
97
|
-
|
98
|
-
class File(BaseData):
    """BaseData subclass holding raw bytes together with their MIME type."""

    # Raw payload.
    data: bytes
    # MIME type string for the payload.
    mime_type: str
|
@@ -1,223 +0,0 @@
|
|
1
|
-
import inspect
|
2
|
-
import json
|
3
|
-
import os
|
4
|
-
from abc import ABC, abstractmethod
|
5
|
-
from typing import (
|
6
|
-
Callable,
|
7
|
-
Dict,
|
8
|
-
List,
|
9
|
-
Optional,
|
10
|
-
Tuple,
|
11
|
-
Type,
|
12
|
-
Union,
|
13
|
-
get_type_hints,
|
14
|
-
)
|
15
|
-
|
16
|
-
import requests
|
17
|
-
from pydantic import BaseModel, Field
|
18
|
-
|
19
|
-
from .data import BaseData, Content, Feature
|
20
|
-
|
21
|
-
|
22
|
-
class EmbeddingSchema(BaseModel):
    """Schema for an embedding: its dimension, distance name and optional database URL."""

    # Number of dimensions; required.
    dim: int
    # Distance name; defaults to "cosine".
    distance: Optional[str] = "cosine"
    # Optional database URL; no default target.
    database_url: Optional[str] = None
|
26
|
-
|
27
|
-
|
28
|
-
class ExtractorMetadata(BaseModel):
    """Descriptive metadata about an extractor."""

    name: str
    version: str
    description: str
    input_mime_types: List[str]
    system_dependencies: List[str]
    python_dependencies: List[str]
    # ``input_mime_types`` used to be declared a second time here; the
    # duplicate was redundant and has been removed (field order is unchanged).
    embedding_schemas: Dict[str, EmbeddingSchema]
    # Make this a dynamic model since it's a JSON schema
    input_params: Optional[Dict]
    # for backward compatibility
    metadata_schemas: Optional[Dict]
|
41
|
-
|
42
|
-
|
43
|
-
class Extractor(ABC):
    """Abstract base class for extractors.

    Subclasses implement ``extract``. The ``sample_*`` helpers download a
    sample file into the current working directory (once) and wrap its bytes
    in a ``Content``.
    """

    name: str = ""

    version: str = "0.0.0"

    system_dependencies: List[str] = []

    python_dependencies: List[str] = []

    description: str = ""

    input_mime_types = ["text/plain"]

    embedding_indexes: Dict[str, EmbeddingSchema] = {}

    @abstractmethod
    def extract(
        self, input: Type[BaseModel], params: Type[BaseModel] = None
    ) -> List[Union[Feature, Type[BaseModel]]]:
        """
        Extracts information from the content. Returns a list of features to add
        to the content.
        It can also return a list of Content objects, which will be added to storage
        and any extraction policies defined will be applied to them.
        """
        pass

    @classmethod
    def sample_input(cls) -> Tuple[Content, Type[BaseModel]]:
        pass

    def _download_file(self, url, filename):
        """Download ``url`` to ``filename`` unless that file already exists.

        Request errors are printed rather than raised (best effort). The
        download is written to a ``.part`` file and renamed on success so an
        interrupted transfer is never mistaken for a cached file.
        """
        if os.path.exists(filename):
            # file exists skip
            return
        partial = f"{filename}.part"
        try:
            with requests.get(url, stream=True) as r:
                r.raise_for_status()  # Raises an HTTPError if the response status code is 4XX/5XX
                with open(partial, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            os.replace(partial, filename)
        except requests.exceptions.RequestException as e:
            print(f"Error downloading the file: {e}")
            if os.path.exists(partial):
                os.remove(partial)

    def _sample_from_url(self, url, file_name, content_type, features) -> Content:
        """Fetch ``url`` into ``file_name`` (cached) and wrap its bytes in a Content."""
        self._download_file(url, file_name)
        # Context manager so the handle is closed; it used to be leaked.
        with open(file_name, "rb") as f:
            data = f.read()
        return Content(
            content_type=content_type,
            data=data,
            features=[] if features is None else features,
        )

    def sample_mp3(self, features: Optional[List[Feature]] = None) -> Content:
        """Return the sample MP3 as ``audio/mpeg`` content."""
        return self._sample_from_url(
            "https://extractor-files.diptanu-6d5.workers.dev/sample-000009.mp3",
            "sample.mp3",
            "audio/mpeg",
            features,
        )

    def sample_mp4(self, features: Optional[List[Feature]] = None) -> Content:
        """Return the sample MP4 as ``video/mp4`` content."""
        return self._sample_from_url(
            "https://extractor-files.diptanu-6d5.workers.dev/sample.mp4",
            "sample.mp4",
            "video/mp4",
            features,
        )

    # The JPG and PDF samples below used to share the cache names
    # "sample.jpg" / "sample.pdf", so whichever was downloaded first was
    # returned for all of them. Each sample now has its own file name.

    def sample_jpg(self, features: Optional[List[Feature]] = None) -> Content:
        """Return the sample photo as ``image/jpg`` content."""
        return self._sample_from_url(
            "https://extractor-files.diptanu-6d5.workers.dev/people-standing.jpg",
            "people-standing.jpg",
            "image/jpg",
            features,
        )

    def sample_invoice_jpg(self, features: Optional[List[Feature]] = None) -> Content:
        """Return the sample invoice image as ``image/jpg`` content."""
        return self._sample_from_url(
            "https://extractor-files.diptanu-6d5.workers.dev/invoice-example.jpg",
            "invoice-example.jpg",
            "image/jpg",
            features,
        )

    def sample_invoice_pdf(self, features: Optional[List[Feature]] = None) -> Content:
        """Return the sample invoice PDF as ``application/pdf`` content."""
        return self._sample_from_url(
            "https://extractor-files.diptanu-6d5.workers.dev/invoice-example.pdf",
            "invoice-example.pdf",
            "application/pdf",
            features,
        )

    def sample_image_based_pdf(self, features: Optional[List[Feature]] = None) -> Content:
        """Return the image-based sample PDF as ``application/pdf`` content."""
        return self._sample_from_url(
            "https://extractor-files.diptanu-6d5.workers.dev/image-based.pdf",
            "image-based.pdf",
            "application/pdf",
            features,
        )

    def sample_scientific_pdf(self, features: Optional[List[Feature]] = None) -> Content:
        """Return the scientific-paper sample PDF as ``application/pdf`` content."""
        return self._sample_from_url(
            "https://extractor-files.diptanu-6d5.workers.dev/scientific-paper-example.pdf",
            "scientific-paper-example.pdf",
            "application/pdf",
            features,
        )

    def sample_presentation(self, features: Optional[List[Feature]] = None) -> Content:
        """Return the sample PPTX presentation."""
        return self._sample_from_url(
            "https://raw.githubusercontent.com/tensorlakeai/indexify/main/docs/docs/files/test.pptx",
            "test.pptx",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            features,
        )

    def sample_text(self, features: Optional[List[Feature]] = None) -> Content:
        """Return a fixed news article as ``text/plain`` content."""
        article = """New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 2010 marriage license application, according to court documents. Prosecutors said the marriages were part of an immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages. Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted. The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s Investigation Division. 
Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces up to four years in prison. Her next court appearance is scheduled for May 18."""
        # Content.data is declared as bytes, so encode explicitly instead of
        # relying on the model to coerce a str.
        return Content(
            content_type="text/plain",
            data=article.encode("utf-8"),
            features=[] if features is None else features,
        )

    def sample_html(self, features: Optional[List[Feature]] = None) -> Content:
        """Return the sample HTML page as ``text/html`` content."""
        return self._sample_from_url(
            "https://extractor-files.diptanu-6d5.workers.dev/sample.html",
            "sample.html",
            "text/html",
            features,
        )
|
175
|
-
|
176
|
-
|
177
|
-
def extractor(
    name: Optional[str] = None,
    description: Optional[str] = "",
    version: Optional[str] = "",
    python_dependencies: Optional[List[str]] = None,
    system_dependencies: Optional[List[str]] = None,
    input_mime_types: Optional[List[str]] = None,
    embedding_indexes: Optional[Dict[str, EmbeddingSchema]] = None,
    sample_content: Optional[Callable] = None,
):
    """Decorator factory that turns a plain function into an Extractor class.

    Decorating ``fn`` yields a zero-argument callable; calling it builds and
    returns an ``Extractor`` subclass whose ``extract`` forwards to ``fn`` and
    whose class attributes are set from the options given here.
    ``sample_content``, when provided, is called to produce the sample input.
    """
    # Every option except sample_content becomes a class attribute on the
    # generated class. The dict is shared by all calls of the wrapper.
    args = {
        "name": name,
        "description": description,
        "version": version,
        "python_dependencies": python_dependencies,
        "system_dependencies": system_dependencies,
        "input_mime_types": input_mime_types,
        "embedding_indexes": embedding_indexes,
    }

    def construct(fn):
        def wrapper():
            # The function's docstring wins over the description option.
            description = fn.__doc__ or args.get("description", "")

            if not args.get("name"):
                # Frame 1 is whoever called wrapper(); its module name is
                # used as the prefix of the generated extractor name.
                calling_frame = inspect.stack()[1][0]
                calling_module = inspect.getmodule(calling_frame)
                args["name"] = f"{calling_module.__name__}:{fn.__name__}"

            class DecoratedFn(Extractor):
                @classmethod
                def extract(cls, input: Type[BaseData], params: Type[BaseModel] = None) -> List[Union[Type[BaseModel], Type[Feature]]]:  # type: ignore
                    # TODO we can force all the functions to take in a params object
                    # or check if someone adds a params
                    if params is not None:
                        return fn(input, params)
                    return fn(input)

                def sample_input(self) -> Content:
                    if sample_content:
                        return sample_content()
                    return self.sample_text()

            for attr_name, attr_value in args.items():
                setattr(DecoratedFn, attr_name, attr_value)
            DecoratedFn.description = description

            return DecoratedFn

        wrapper._extractor_name = fn.__name__
        wrapper.name = fn.__name__

        return wrapper

    return construct
|
indexify/extractor_sdk/utils.py
DELETED
@@ -1,102 +0,0 @@
|
|
1
|
-
import os
|
2
|
-
from typing import List, Optional
|
3
|
-
|
4
|
-
import httpx
|
5
|
-
|
6
|
-
from .data import Content, Feature
|
7
|
-
|
8
|
-
|
9
|
-
class SampleExtractorData:
    """Provider of sample ``Content`` objects fetched over HTTP.

    Each ``sample_*`` method downloads its file into memory and wraps the
    bytes in a ``Content``; nothing is written to disk.
    """

    def _download_file(self, url):
        """Return the body of ``url`` as bytes.

        Raises:
            httpx.HTTPError: on a transport failure or a 4XX/5XX response.
                The error is printed and then re-raised, since there is no
                usable payload to return.
        """
        try:
            resp = httpx.get(url, follow_redirects=True)
            resp.raise_for_status()
            return resp.content
        # httpx has no ``exceptions.RequestException``; HTTPError is the base
        # class of both request errors and status errors.
        except httpx.HTTPError as e:
            print(f"Error downloading the file: {e}")
            raise

    def _sample(self, url, content_type, features) -> Content:
        """Download ``url`` and wrap the bytes in a Content of ``content_type``."""
        return Content(
            content_type=content_type,
            data=self._download_file(url),
            features=[] if features is None else features,
        )

    # The methods below used to call ``_download_file(url, file_name)`` (one
    # argument too many) and then open a local file that was never written.

    def sample_mp3(self, features: Optional[List[Feature]] = None) -> Content:
        """Return the sample MP3 as ``audio/mpeg`` content."""
        return self._sample(
            "https://extractor-files.diptanu-6d5.workers.dev/sample-000009.mp3",
            "audio/mpeg",
            features,
        )

    def sample_mp4(self, features: Optional[List[Feature]] = None) -> Content:
        """Return the sample MP4 as ``video/mp4`` content."""
        return self._sample(
            "https://extractor-files.diptanu-6d5.workers.dev/sample.mp4",
            "video/mp4",
            features,
        )

    def sample_jpg(self, features: Optional[List[Feature]] = None) -> Content:
        """Return the sample photo as ``image/jpg`` content."""
        return self._sample(
            "https://extractor-files.diptanu-6d5.workers.dev/people-standing.jpg",
            "image/jpg",
            features,
        )

    def sample_invoice_jpg(self, features: Optional[List[Feature]] = None) -> Content:
        """Return the sample invoice image as ``image/jpg`` content."""
        return self._sample(
            "https://extractor-files.diptanu-6d5.workers.dev/invoice-example.jpg",
            "image/jpg",
            features,
        )

    def sample_invoice_pdf(self, features: Optional[List[Feature]] = None) -> Content:
        """Return the sample invoice PDF as ``application/pdf`` content."""
        return self._sample(
            "https://extractor-files.diptanu-6d5.workers.dev/invoice-example.pdf",
            "application/pdf",
            features,
        )

    def sample_image_based_pdf(self, features: Optional[List[Feature]] = None) -> Content:
        """Return the image-based sample PDF as ``application/pdf`` content."""
        return self._sample(
            "https://extractor-files.diptanu-6d5.workers.dev/image-based.pdf",
            "application/pdf",
            features,
        )

    def sample_scientific_pdf(self, features: Optional[List[Feature]] = None) -> Content:
        """Return the scientific-paper sample PDF as ``application/pdf`` content."""
        return self._sample(
            "https://extractor-files.diptanu-6d5.workers.dev/scientific-paper-example.pdf",
            "application/pdf",
            features,
        )

    def sample_presentation(self, features: Optional[List[Feature]] = None) -> Content:
        """Return the sample PPTX presentation."""
        return self._sample(
            "https://raw.githubusercontent.com/tensorlakeai/indexify/main/docs/docs/files/test.pptx",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            features,
        )

    def sample_text(self, features: Optional[List[Feature]] = None) -> Content:
        """Return a fixed news article as ``text/plain`` content."""
        article = """New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 2010 marriage license application, according to court documents. Prosecutors said the marriages were part of an immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages. Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted. The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s Investigation Division. 
Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces up to four years in prison. Her next court appearance is scheduled for May 18."""
        # Content.data is declared as bytes, so encode explicitly.
        return Content(
            content_type="text/plain",
            data=article.encode("utf-8"),
            features=[] if features is None else features,
        )

    def sample_html(self, features: Optional[List[Feature]] = None) -> Content:
        """Return the sample HTML page as ``text/html`` content."""
        return self._sample(
            "https://extractor-files.diptanu-6d5.workers.dev/sample.html",
            "text/html",
            features,
        )
|
indexify/extractors/__init__.py
DELETED
File without changes
|
indexify/extractors/embedding.py
DELETED
@@ -1,55 +0,0 @@
|
|
1
|
-
from typing import List
|
2
|
-
|
3
|
-
import torch
|
4
|
-
import torch.nn.functional as F
|
5
|
-
from transformers import AutoModel, AutoTokenizer
|
6
|
-
|
7
|
-
from indexify.extractor_sdk.data import Feature
|
8
|
-
from indexify.extractor_sdk.extractor import Extractor, Feature
|
9
|
-
|
10
|
-
|
11
|
-
class SentenceTransformersEmbedding:
    """Wrapper around a ``sentence-transformers/<model_name>`` tokenizer and model."""

    def __init__(self, model_name) -> None:
        """Load the tokenizer and model and switch the model to eval mode."""
        self._model_name = model_name
        hub_id = f"sentence-transformers/{model_name}"
        self._tokenizer = AutoTokenizer.from_pretrained(hub_id)
        self._model = AutoModel.from_pretrained(hub_id, torchscript=True)
        self._model.eval()

    def embed_batch(self, inputs: List[str]) -> List[List[float]]:
        """Embed each string in ``inputs`` and return the results as nested lists."""
        return self._embed(inputs).tolist()

    def embed(self, query: str) -> List[float]:
        """Embed a single string and return its vector as a list."""
        batch = self._embed([query])
        return batch[0].tolist()

    def _embed(self, inputs: List[str]) -> torch.Tensor:
        """Tokenize ``inputs``, run the model and L2-normalize along dim 1."""
        tokens = self._tokenizer(
            inputs, padding=True, truncation=True, return_tensors="pt"
        )
        # NOTE(review): the model output is passed to F.normalize as-is, with
        # no pooling step — confirm the model really returns one tensor here.
        model_output = self._model(**tokens)
        return F.normalize(model_output, p=2, dim=1)
|
36
|
-
|
37
|
-
|
38
|
-
class BasicSentenceTransformerModels(Extractor):
    """Extractor that embeds text with a SentenceTransformersEmbedding model."""

    def __init__(self, model: str):
        """Create the embedding model identified by ``model``."""
        super().__init__()
        self.model = SentenceTransformersEmbedding(model)

    def extract(self, input: str) -> List[Feature]:
        """Return a one-element list holding the embedding feature for ``input``."""
        vector = self.model.embed(input)
        embedding_feature = Feature.embedding(values=vector)
        return [embedding_feature]
|
46
|
-
|
47
|
-
|
48
|
-
class BasicHFTransformerEmbeddingModels(Extractor):
    """Extractor that embeds text with a Hugging Face model loaded by name."""

    def __init__(self, model: str):
        """Load ``model`` via ``AutoModel.from_pretrained`` with remote code enabled."""
        super().__init__()
        self._model = AutoModel.from_pretrained(model, trust_remote_code=True)

    def extract(self, input: str) -> List[Feature]:
        """Return a one-element list holding the embedding feature for ``input``."""
        # The model is stored as ``_model``; reading ``self.model`` here used
        # to raise AttributeError.
        # NOTE(review): assumes the loaded model exposes ``embed_query`` — confirm.
        embeddings = self._model.embed_query(input)
        return [Feature.embedding(values=embeddings)]
|
@@ -1,93 +0,0 @@
|
|
1
|
-
import tempfile
|
2
|
-
from enum import Enum
|
3
|
-
from typing import List, Optional
|
4
|
-
|
5
|
-
from pydantic import BaseModel
|
6
|
-
|
7
|
-
|
8
|
-
class PageFragmentType(str, Enum):
    """Kinds of page fragment; each member is also its string value."""

    TEXT = "text"
    FIGURE = "figure"
    TABLE = "table"
|
12
|
-
|
13
|
-
|
14
|
-
class Image(BaseModel):
    """Image bytes together with their MIME type."""

    # Image payload bytes.
    data: bytes
    # MIME type string for the payload.
    mime_type: str
|
17
|
-
|
18
|
-
|
19
|
-
class TableEncoding(str, Enum):
    """Encodings a table's data can be in; each member is also its string value."""

    CSV = "csv"
    HTML = "html"
|
22
|
-
|
23
|
-
|
24
|
-
class Table(BaseModel):
    """Table content stored as text plus the encoding of that text."""

    # Table content as a string.
    data: str
    # Encoding that ``data`` is in.
    encoding: TableEncoding
|
27
|
-
|
28
|
-
|
29
|
-
class PageFragment(BaseModel):
    """One fragment of a page: a type tag plus optional text, image or table."""

    # Which kind of fragment this is.
    fragment_type: PageFragmentType
    # Optional payloads; each defaults to None.
    text: Optional[str] = None
    image: Optional[Image] = None
    table: Optional[Table] = None
    # Optional reading-order index.
    reading_order: Optional[int] = None
|
35
|
-
|
36
|
-
|
37
|
-
class Page(BaseModel):
    """A page: its number and the fragments found on it."""

    # Page number.
    number: int
    # Fragments collected for this page.
    fragments: List[PageFragment]
|
40
|
-
|
41
|
-
|
42
|
-
class PDFParser:
    """Parse PDF bytes into pages of text, figure and table fragments."""

    def __init__(self, data: bytes, language: Optional[str] = "en"):
        """Keep the PDF bytes for a later ``parse`` call.

        Args:
            data: raw bytes of the PDF document.
            language: accepted for interface compatibility; not used by
                ``parse``.
        """
        self._data = data

    def parse(self) -> List[Page]:
        """Analyze the PDF with deepdoctection and return one Page per page.

        The bytes are written to a temporary ``.pdf`` file because the
        analyzer is given a path. That file is removed before returning,
        including when analysis raises.
        """
        import os

        import deepdoctection as dd

        analyzer = dd.get_dd_analyzer()
        # delete=False so the file survives the ``with`` block; it is removed
        # in the ``finally`` clause below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as f:
            f.write(self._data)
            f.flush()
            pdf_path = f.name

        outputs: List[Page] = []
        try:
            df = analyzer.analyze(path=pdf_path)
            df.reset_state()
            parsed_pages = list(df)

            for parsed_page in parsed_pages:
                page_num = parsed_page.page_number
                fragments = []

                # Text fragments come from "text" and "title" layouts.
                for layout in parsed_page.layouts:
                    if layout.category_name in ["text", "title"]:
                        fragments.append(
                            PageFragment(
                                fragment_type=PageFragmentType.TEXT,
                                text=layout.text,
                                reading_order=layout.reading_order,
                            )
                        )

                figures = parsed_page.get_annotation(category_names=dd.LayoutType.FIGURE)
                for figure in figures:
                    image_bytes = dd.viz_handler.encode(figure.viz())
                    fragments.append(
                        PageFragment(
                            fragment_type=PageFragmentType.FIGURE,
                            image=Image(data=image_bytes, mime_type="image/png"),
                            reading_order=figure.reading_order,
                        )
                    )

                tables = parsed_page.get_annotation(category_names=dd.LayoutType.TABLE)
                for table in tables:
                    fragments.append(
                        PageFragment(
                            fragment_type=PageFragmentType.TABLE,
                            table=Table(data=table.html, encoding=TableEncoding.HTML),
                            reading_order=table.reading_order,
                        )
                    )

                outputs.append(Page(number=page_num, fragments=fragments))
        finally:
            # The temporary file was previously never cleaned up.
            os.unlink(pdf_path)

        return outputs
|