indexify 0.0.42__py3-none-any.whl → 0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. indexify/__init__.py +13 -14
  2. indexify/base_client.py +48 -21
  3. indexify/cli.py +235 -0
  4. indexify/client.py +18 -790
  5. indexify/error.py +3 -30
  6. indexify/executor/agent.py +362 -0
  7. indexify/executor/api_objects.py +43 -0
  8. indexify/executor/downloader.py +124 -0
  9. indexify/executor/executor_tasks.py +72 -0
  10. indexify/executor/function_worker.py +177 -0
  11. indexify/executor/indexify_executor.py +32 -0
  12. indexify/executor/task_reporter.py +110 -0
  13. indexify/executor/task_store.py +113 -0
  14. indexify/foo +72 -0
  15. indexify/functions_sdk/data_objects.py +37 -0
  16. indexify/functions_sdk/graph.py +276 -0
  17. indexify/functions_sdk/graph_validation.py +69 -0
  18. indexify/functions_sdk/image.py +26 -0
  19. indexify/functions_sdk/indexify_functions.py +192 -0
  20. indexify/functions_sdk/local_cache.py +46 -0
  21. indexify/functions_sdk/object_serializer.py +61 -0
  22. indexify/local_client.py +183 -0
  23. indexify/remote_client.py +319 -0
  24. indexify-0.2.dist-info/METADATA +151 -0
  25. indexify-0.2.dist-info/RECORD +32 -0
  26. indexify-0.2.dist-info/entry_points.txt +3 -0
  27. indexify/exceptions.py +0 -3
  28. indexify/extraction_policy.py +0 -75
  29. indexify/extractor_sdk/__init__.py +0 -14
  30. indexify/extractor_sdk/data.py +0 -100
  31. indexify/extractor_sdk/extractor.py +0 -223
  32. indexify/extractor_sdk/utils.py +0 -102
  33. indexify/extractors/__init__.py +0 -0
  34. indexify/extractors/embedding.py +0 -55
  35. indexify/extractors/pdf_parser.py +0 -93
  36. indexify/graph.py +0 -133
  37. indexify/local_runner.py +0 -128
  38. indexify/runner.py +0 -22
  39. indexify/utils.py +0 -7
  40. indexify-0.0.42.dist-info/METADATA +0 -66
  41. indexify-0.0.42.dist-info/RECORD +0 -25
  42. {indexify-0.0.42.dist-info → indexify-0.2.dist-info}/LICENSE.txt +0 -0
  43. {indexify-0.0.42.dist-info → indexify-0.2.dist-info}/WHEEL +0 -0
indexify/exceptions.py DELETED
@@ -1,3 +0,0 @@
1
- class ApiException(Exception):
2
- def __init__(self, message: str) -> None:
3
- super().__init__(message)
indexify/extraction_policy.py DELETED
@@ -1,75 +0,0 @@
1
- from dataclasses import asdict, dataclass
2
- from typing import List, Optional
3
-
4
-
5
- @dataclass
6
- class ExtractionPolicy:
7
- extractor: str
8
- name: str
9
- content_source: str
10
- input_params: Optional[dict] = None
11
- id: Optional[str] = None
12
- labels_eq: Optional[str] = None
13
-
14
- def __repr__(self) -> str:
15
- return f"ExtractionPolicy(name={self.name} extractor={self.extractor})"
16
-
17
- def __str__(self) -> str:
18
- return self.__repr__()
19
-
20
- def to_dict(self) -> dict:
21
- filtered_dict = {k: v for k, v in asdict(self).items() if v is not None}
22
- return filtered_dict
23
-
24
- @classmethod
25
- def from_dict(cls, json: dict):
26
- if "filters_eq" in json:
27
- json["labels_eq"] = json.pop("filters_eq")
28
- json["id"] = json.get("id", None)
29
- return ExtractionPolicy(**json)
30
-
31
-
32
- @dataclass
33
- class ExtractionGraph:
34
- id: str
35
- name: str
36
- description: str
37
- extraction_policies: List[ExtractionPolicy]
38
-
39
- @classmethod
40
- def from_dict(cls, json: dict):
41
- json["id"] = json.get("id", None)
42
- json["description"] = json.get("description", None)
43
- if "namespace" in json.keys():
44
- json.pop("namespace")
45
- return ExtractionGraph(**json)
46
-
47
- @staticmethod
48
- def from_yaml(spec: str):
49
- import yaml
50
-
51
- return ExtractionGraph.from_dict(yaml.load(spec, Loader=yaml.FullLoader))
52
-
53
- @staticmethod
54
- def from_yaml_file(path: str):
55
- with open(path, "r") as f:
56
- return ExtractionGraph.from_yaml(f.read())
57
-
58
- def to_dict(self) -> dict:
59
- filtered_dict = {k: v for k, v in asdict(self).items() if v is not None}
60
- return filtered_dict
61
-
62
-
63
- class ExtractionGraphBuilder:
64
- def __init__(self, name: str):
65
- self.name = name
66
- self.extraction_policies = []
67
-
68
- def policy(self, policy: ExtractionPolicy) -> "ExtractionGraphBuilder":
69
- self.extraction_policies.append(policy)
70
- return self
71
-
72
- def build(self):
73
- return ExtractionGraph(
74
- id=self.id, name=self.name, extraction_policies=self.extraction_policies
75
- )
indexify/extractor_sdk/__init__.py DELETED
@@ -1,14 +0,0 @@
1
- from .data import Content, ContentMetadata, Feature
2
- from .extractor import EmbeddingSchema, Extractor, ExtractorMetadata, extractor
3
- from .utils import SampleExtractorData
4
-
5
- __all__ = [
6
- "ContentMetadata",
7
- "Content",
8
- "Feature",
9
- "Extractor",
10
- "extractor",
11
- "EmbeddingSchema",
12
- "ExtractorMetadata",
13
- "SampleExtractorData",
14
- ]
indexify/extractor_sdk/data.py DELETED
@@ -1,100 +0,0 @@
1
- import json
2
- from typing import Any, Dict, List, Literal, Mapping, Optional, Type, cast
3
-
4
- from pydantic import BaseModel, Field, Json
5
- from typing_extensions import Annotated, Doc
6
-
7
-
8
- class BaseData(BaseModel):
9
- meta: Mapping[str, Type[BaseModel]] = {}
10
-
11
- def get_features(self) -> List[Type[BaseModel]]:
12
- return self.meta
13
-
14
- def get_feature(self, name: str) -> Optional[Type[BaseModel]]:
15
- return self.meta.get(name)
16
-
17
-
18
- class Feature(BaseModel):
19
- feature_type: Literal["embedding", "metadata"]
20
- name: str
21
- value: Json
22
- comment: Optional[Json] = Field(default=None)
23
-
24
- @classmethod
25
- def embedding(cls, values: List[float], name: str = "embedding", distance="cosine"):
26
- return cls(
27
- feature_type="embedding",
28
- name=name,
29
- value=json.dumps({"values": values, "distance": distance}),
30
- comment=None,
31
- )
32
-
33
- @classmethod
34
- def metadata(cls, value: Json, comment: Json = None, name: str = "metadata"):
35
- value = json.dumps(value)
36
- comment = json.dumps(comment) if comment is not None else None
37
- return cls(feature_type="metadata", name=name, value=value)
38
-
39
-
40
- class Content(BaseModel):
41
- id: Optional[str] = (None,)
42
- content_type: Optional[str]
43
- data: bytes
44
- features: List[Feature] = []
45
-
46
- @classmethod
47
- def from_text(
48
- cls,
49
- text: str,
50
- features: List[Feature] = [],
51
- ):
52
- return Content(
53
- id=None,
54
- content_type="text/plain",
55
- data=bytes(text, "utf-8"),
56
- features=features,
57
- )
58
-
59
- @classmethod
60
- def from_json(cls, json_data: Json, features: List[Feature] = []):
61
- return cls(
62
- content_type="application/json",
63
- data=bytes(json.dumps(json_data), "utf-8"),
64
- features=features,
65
- )
66
-
67
- @classmethod
68
- def from_file(cls, path: str):
69
- import mimetypes
70
-
71
- m, _ = mimetypes.guess_type(path)
72
- with open(path, "rb") as f:
73
- return cls(id="none-for-now", content_type=m, data=f.read())
74
-
75
-
76
- class ContentMetadata(BaseModel):
77
- id: str
78
- parent_id: str
79
- labels: Dict[str, Any]
80
- extraction_graph_names: List[str]
81
- extraction_policy: str
82
- mime_type: str
83
- extracted_metadata: Dict[str, Any] = {}
84
-
85
- @classmethod
86
- def from_dict(cls, json: Dict):
87
- return cls(
88
- id=json["id"],
89
- parent_id=json["parent_id"],
90
- labels=json["labels"],
91
- extraction_graph_names=json["extraction_graph_names"],
92
- extraction_policy=json["source"],
93
- mime_type=json["mime_type"],
94
- extracted_metadata=json["extracted_metadata"],
95
- )
96
-
97
-
98
- class File(BaseData):
99
- data: bytes
100
- mime_type: str
indexify/extractor_sdk/extractor.py DELETED
@@ -1,223 +0,0 @@
1
- import inspect
2
- import json
3
- import os
4
- from abc import ABC, abstractmethod
5
- from typing import (
6
- Callable,
7
- Dict,
8
- List,
9
- Optional,
10
- Tuple,
11
- Type,
12
- Union,
13
- get_type_hints,
14
- )
15
-
16
- import requests
17
- from pydantic import BaseModel, Field
18
-
19
- from .data import BaseData, Content, Feature
20
-
21
-
22
- class EmbeddingSchema(BaseModel):
23
- dim: int
24
- distance: Optional[str] = "cosine"
25
- database_url: Optional[str] = None
26
-
27
-
28
- class ExtractorMetadata(BaseModel):
29
- name: str
30
- version: str
31
- description: str
32
- input_mime_types: List[str]
33
- system_dependencies: List[str]
34
- python_dependencies: List[str]
35
- input_mime_types: List[str]
36
- embedding_schemas: Dict[str, EmbeddingSchema]
37
- # Make this a dynamic model since its a json schema
38
- input_params: Optional[Dict]
39
- # for backward compatibility
40
- metadata_schemas: Optional[Dict]
41
-
42
-
43
- class Extractor(ABC):
44
- name: str = ""
45
-
46
- version: str = "0.0.0"
47
-
48
- system_dependencies: List[str] = []
49
-
50
- python_dependencies: List[str] = []
51
-
52
- description: str = ""
53
-
54
- input_mime_types = ["text/plain"]
55
-
56
- embedding_indexes: Dict[str, EmbeddingSchema] = {}
57
-
58
- @abstractmethod
59
- def extract(
60
- self, input: Type[BaseModel], params: Type[BaseModel] = None
61
- ) -> List[Union[Feature, Type[BaseModel]]]:
62
- """
63
- Extracts information from the content. Returns a list of features to add
64
- to the content.
65
- It can also return a list of Content objects, which will be added to storage
66
- and any extraction policies defined will be applied to them.
67
- """
68
- pass
69
-
70
- @classmethod
71
- def sample_input(cls) -> Tuple[Content, Type[BaseModel]]:
72
- pass
73
-
74
- def _download_file(self, url, filename):
75
- if os.path.exists(filename):
76
- # file exists skip
77
- return
78
- try:
79
- with requests.get(url, stream=True) as r:
80
- r.raise_for_status() # Raises an HTTPError if the response status code is 4XX/5XX
81
- with open(filename, "wb") as f:
82
- for chunk in r.iter_content(chunk_size=8192):
83
- f.write(chunk)
84
- except requests.exceptions.RequestException as e:
85
- print(f"Error downloading the file: {e}")
86
-
87
- def sample_mp3(self, features: List[Feature] = []) -> Content:
88
- file_name = "sample.mp3"
89
- self._download_file(
90
- "https://extractor-files.diptanu-6d5.workers.dev/sample-000009.mp3",
91
- file_name,
92
- )
93
- f = open(file_name, "rb")
94
- return Content(content_type="audio/mpeg", data=f.read(), features=features)
95
-
96
- def sample_mp4(self, features: List[Feature] = []) -> Content:
97
- file_name = "sample.mp4"
98
- self._download_file(
99
- "https://extractor-files.diptanu-6d5.workers.dev/sample.mp4",
100
- file_name,
101
- )
102
- f = open(file_name, "rb")
103
- return Content(content_type="video/mp4", data=f.read(), features=features)
104
-
105
- def sample_jpg(self, features: List[Feature] = []) -> Content:
106
- file_name = "sample.jpg"
107
- self._download_file(
108
- "https://extractor-files.diptanu-6d5.workers.dev/people-standing.jpg",
109
- file_name,
110
- )
111
- f = open(file_name, "rb")
112
- return Content(content_type="image/jpg", data=f.read(), features=features)
113
-
114
- def sample_invoice_jpg(self, features: List[Feature] = []) -> Content:
115
- file_name = "sample.jpg"
116
- self._download_file(
117
- "https://extractor-files.diptanu-6d5.workers.dev/invoice-example.jpg",
118
- file_name,
119
- )
120
- f = open(file_name, "rb")
121
- return Content(content_type="image/jpg", data=f.read(), features=features)
122
-
123
- def sample_invoice_pdf(self, features: List[Feature] = []) -> Content:
124
- file_name = "sample.pdf"
125
- self._download_file(
126
- "https://extractor-files.diptanu-6d5.workers.dev/invoice-example.pdf",
127
- file_name,
128
- )
129
- f = open(file_name, "rb")
130
- return Content(content_type="application/pdf", data=f.read(), features=features)
131
-
132
- def sample_image_based_pdf(self, features: List[Feature] = []) -> Content:
133
- file_name = "sample.pdf"
134
- self._download_file(
135
- "https://extractor-files.diptanu-6d5.workers.dev/image-based.pdf",
136
- file_name,
137
- )
138
- f = open(file_name, "rb")
139
- return Content(content_type="application/pdf", data=f.read(), features=features)
140
-
141
- def sample_scientific_pdf(self, features: List[Feature] = []) -> Content:
142
- file_name = "sample.pdf"
143
- self._download_file(
144
- "https://extractor-files.diptanu-6d5.workers.dev/scientific-paper-example.pdf",
145
- file_name,
146
- )
147
- f = open(file_name, "rb")
148
- return Content(content_type="application/pdf", data=f.read(), features=features)
149
-
150
- def sample_presentation(self, features: List[Feature] = []) -> Content:
151
- file_name = "test.pptx"
152
- self._download_file(
153
- "https://raw.githubusercontent.com/tensorlakeai/indexify/main/docs/docs/files/test.pptx",
154
- file_name,
155
- )
156
- f = open(file_name, "rb")
157
- return Content(
158
- content_type="application/vnd.openxmlformats-officedocument.presentationml.presentation",
159
- data=f.read(),
160
- features=features,
161
- )
162
-
163
- def sample_text(self, features: List[Feature] = []) -> Content:
164
- article = """New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 2010 marriage license application, according to court documents. Prosecutors said the marriages were part of an immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages. Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted. The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s Investigation Division. 
Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces up to four years in prison. Her next court appearance is scheduled for May 18."""
165
- return Content(content_type="text/plain", data=article, features=features)
166
-
167
- def sample_html(self, features: List[Feature] = []) -> Content:
168
- file_name = "sample.html"
169
- self._download_file(
170
- "https://extractor-files.diptanu-6d5.workers.dev/sample.html",
171
- file_name,
172
- )
173
- f = open(file_name, "rb")
174
- return Content(content_type="text/html", data=f.read(), features=features)
175
-
176
-
177
- def extractor(
178
- name: Optional[str] = None,
179
- description: Optional[str] = "",
180
- version: Optional[str] = "",
181
- python_dependencies: Optional[List[str]] = None,
182
- system_dependencies: Optional[List[str]] = None,
183
- input_mime_types: Optional[List[str]] = None,
184
- embedding_indexes: Optional[Dict[str, EmbeddingSchema]] = None,
185
- sample_content: Optional[Callable] = None,
186
- ):
187
- args = locals()
188
- del args["sample_content"]
189
-
190
- def construct(fn):
191
- def wrapper():
192
- description = fn.__doc__ or args.get("description", "")
193
-
194
- if not args.get("name"):
195
- args[
196
- "name"
197
- ] = f"{inspect.getmodule(inspect.stack()[1][0]).__name__}:{fn.__name__}"
198
-
199
- class DecoratedFn(Extractor):
200
- @classmethod
201
- def extract(cls, input: Type[BaseData], params: Type[BaseModel] = None) -> List[Union[Type[BaseModel], Type[Feature]]]: # type: ignore
202
- # TODO we can force all the functions to take in a parms object
203
- # or check if someone adds a params
204
- if params is None:
205
- return fn(input)
206
- else:
207
- return fn(input, params)
208
-
209
- def sample_input(self) -> Content:
210
- return sample_content() if sample_content else self.sample_text()
211
-
212
- for key, val in args.items():
213
- setattr(DecoratedFn, key, val)
214
- DecoratedFn.description = description
215
-
216
- return DecoratedFn
217
-
218
- wrapper._extractor_name = fn.__name__
219
- wrapper.name = fn.__name__
220
-
221
- return wrapper
222
-
223
- return construct
indexify/extractor_sdk/utils.py DELETED
@@ -1,102 +0,0 @@
1
- import os
2
- from typing import List
3
-
4
- import httpx
5
-
6
- from .data import Content, Feature
7
-
8
-
9
- class SampleExtractorData:
10
- def _download_file(self, url):
11
- try:
12
- resp = httpx.get(url, follow_redirects=True)
13
- resp.raise_for_status()
14
- return resp.content
15
- except httpx.exceptions.RequestException as e:
16
- print(f"Error downloading the file: {e}")
17
-
18
- def sample_mp3(self, features: List[Feature] = []) -> Content:
19
- file_name = "sample.mp3"
20
- self._download_file(
21
- "https://extractor-files.diptanu-6d5.workers.dev/sample-000009.mp3",
22
- file_name,
23
- )
24
- f = open(file_name, "rb")
25
- return Content(content_type="audio/mpeg", data=f.read(), features=features)
26
-
27
- def sample_mp4(self, features: List[Feature] = []) -> Content:
28
- file_name = "sample.mp4"
29
- self._download_file(
30
- "https://extractor-files.diptanu-6d5.workers.dev/sample.mp4",
31
- file_name,
32
- )
33
- f = open(file_name, "rb")
34
- return Content(content_type="video/mp4", data=f.read(), features=features)
35
-
36
- def sample_jpg(self, features: List[Feature] = []) -> Content:
37
- file_name = "sample.jpg"
38
- self._download_file(
39
- "https://extractor-files.diptanu-6d5.workers.dev/people-standing.jpg",
40
- file_name,
41
- )
42
- f = open(file_name, "rb")
43
- return Content(content_type="image/jpg", data=f.read(), features=features)
44
-
45
- def sample_invoice_jpg(self, features: List[Feature] = []) -> Content:
46
- file_name = "sample.jpg"
47
- self._download_file(
48
- "https://extractor-files.diptanu-6d5.workers.dev/invoice-example.jpg",
49
- file_name,
50
- )
51
- f = open(file_name, "rb")
52
- return Content(content_type="image/jpg", data=f.read(), features=features)
53
-
54
- def sample_invoice_pdf(self, features: List[Feature] = []) -> Content:
55
- data = self._download_file(
56
- "https://extractor-files.diptanu-6d5.workers.dev/invoice-example.pdf",
57
- )
58
- return Content(content_type="application/pdf", data=data, features=features)
59
-
60
- def sample_image_based_pdf(self, features: List[Feature] = []) -> Content:
61
- file_name = "sample.pdf"
62
- self._download_file(
63
- "https://extractor-files.diptanu-6d5.workers.dev/image-based.pdf",
64
- file_name,
65
- )
66
- f = open(file_name, "rb")
67
- return Content(content_type="application/pdf", data=f.read(), features=features)
68
-
69
- def sample_scientific_pdf(self, features: List[Feature] = []) -> Content:
70
- file_name = "sample.pdf"
71
- self._download_file(
72
- "https://extractor-files.diptanu-6d5.workers.dev/scientific-paper-example.pdf",
73
- file_name,
74
- )
75
- f = open(file_name, "rb")
76
- return Content(content_type="application/pdf", data=f.read(), features=features)
77
-
78
- def sample_presentation(self, features: List[Feature] = []) -> Content:
79
- file_name = "test.pptx"
80
- self._download_file(
81
- "https://raw.githubusercontent.com/tensorlakeai/indexify/main/docs/docs/files/test.pptx",
82
- file_name,
83
- )
84
- f = open(file_name, "rb")
85
- return Content(
86
- content_type="application/vnd.openxmlformats-officedocument.presentationml.presentation",
87
- data=f.read(),
88
- features=features,
89
- )
90
-
91
- def sample_text(self, features: List[Feature] = []) -> Content:
92
- article = """New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 2010 marriage license application, according to court documents. Prosecutors said the marriages were part of an immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages. Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted. The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s Investigation Division. 
Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces up to four years in prison. Her next court appearance is scheduled for May 18."""
93
- return Content(content_type="text/plain", data=article, features=features)
94
-
95
- def sample_html(self, features: List[Feature] = []) -> Content:
96
- file_name = "sample.html"
97
- self._download_file(
98
- "https://extractor-files.diptanu-6d5.workers.dev/sample.html",
99
- file_name,
100
- )
101
- f = open(file_name, "rb")
102
- return Content(content_type="text/html", data=f.read(), features=features)
indexify/extractors/__init__.py DELETED
File without changes
indexify/extractors/embedding.py DELETED
@@ -1,55 +0,0 @@
1
- from typing import List
2
-
3
- import torch
4
- import torch.nn.functional as F
5
- from transformers import AutoModel, AutoTokenizer
6
-
7
- from indexify.extractor_sdk.data import Feature
8
- from indexify.extractor_sdk.extractor import Extractor, Feature
9
-
10
-
11
- class SentenceTransformersEmbedding:
12
- def __init__(self, model_name) -> None:
13
- self._model_name = model_name
14
- self._tokenizer = AutoTokenizer.from_pretrained(
15
- f"sentence-transformers/{model_name}"
16
- )
17
- self._model = AutoModel.from_pretrained(
18
- f"sentence-transformers/{model_name}", torchscript=True
19
- )
20
- self._model.eval()
21
-
22
- def embed_batch(self, inputs: List[str]) -> List[List[float]]:
23
- result = self._embed(inputs)
24
- return result.tolist()
25
-
26
- def embed(self, query: str) -> List[float]:
27
- result = self._embed([query])
28
- return result[0].tolist()
29
-
30
- def _embed(self, inputs: List[str]) -> torch.Tensor:
31
- encoded_input = self._tokenizer(
32
- inputs, padding=True, truncation=True, return_tensors="pt"
33
- )
34
- sentence_embeddings = self._model(**encoded_input)
35
- return F.normalize(sentence_embeddings, p=2, dim=1)
36
-
37
-
38
- class BasicSentenceTransformerModels(Extractor):
39
- def __init__(self, model: str):
40
- super().__init__()
41
- self.model = SentenceTransformersEmbedding(model)
42
-
43
- def extract(self, input: str) -> List[Feature]:
44
- embeddings = self.model.embed(input)
45
- return [Feature.embedding(values=embeddings)]
46
-
47
-
48
- class BasicHFTransformerEmbeddingModels(Extractor):
49
- def __init__(self, model: str):
50
- super().__init__()
51
- self._model = AutoModel.from_pretrained(model, trust_remote_code=True)
52
-
53
- def extract(self, input: str) -> List[Feature]:
54
- embeddings = self.model.embed_query(input)
55
- return [Feature.embedding(values=embeddings)]
indexify/extractors/pdf_parser.py DELETED
@@ -1,93 +0,0 @@
1
- import tempfile
2
- from enum import Enum
3
- from typing import List, Optional
4
-
5
- from pydantic import BaseModel
6
-
7
-
8
- class PageFragmentType(str, Enum):
9
- TEXT = "text"
10
- FIGURE = "figure"
11
- TABLE = "table"
12
-
13
-
14
- class Image(BaseModel):
15
- data: bytes
16
- mime_type: str
17
-
18
-
19
- class TableEncoding(str, Enum):
20
- CSV = "csv"
21
- HTML = "html"
22
-
23
-
24
- class Table(BaseModel):
25
- data: str
26
- encoding: TableEncoding
27
-
28
-
29
- class PageFragment(BaseModel):
30
- fragment_type: PageFragmentType
31
- text: Optional[str] = None
32
- image: Optional[Image] = None
33
- table: Optional[Table] = None
34
- reading_order: Optional[int] = None
35
-
36
-
37
- class Page(BaseModel):
38
- number: int
39
- fragments: List[PageFragment]
40
-
41
-
42
- class PDFParser:
43
- def __init__(self, data: bytes, language: Optional[str] = "en"):
44
- self._data = data
45
-
46
- def parse(self) -> List[Page]:
47
- import deepdoctection as dd
48
- analyzer = dd.get_dd_analyzer()
49
- parsed_pages = []
50
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as f:
51
- f.write(self._data)
52
- f.flush()
53
- df = analyzer.analyze(path=f.name)
54
- df.reset_state()
55
- for page in df:
56
- parsed_pages.append(page)
57
- outputs: List[Page] = []
58
- for parsed_page in parsed_pages:
59
- page_num = parsed_page.page_number
60
- fragments = []
61
- for layout in parsed_page.layouts:
62
- if layout.category_name in ["text", "title"]:
63
- fragments.append(
64
- PageFragment(
65
- fragment_type=PageFragmentType.TEXT,
66
- text=layout.text,
67
- reading_order=layout.reading_order,
68
- )
69
- )
70
- figures = parsed_page.get_annotation(category_names=dd.LayoutType.FIGURE)
71
- for figure in figures:
72
- image_bytes = dd.viz_handler.encode(figure.viz())
73
- fragments.append(
74
- PageFragment(
75
- fragment_type=PageFragmentType.FIGURE,
76
- image=Image(data=image_bytes, mime_type="image/png"),
77
- reading_order=figure.reading_order,
78
- )
79
- )
80
-
81
- tables = parsed_page.get_annotation(category_names=dd.LayoutType.TABLE)
82
- for table in tables:
83
- fragments.append(
84
- PageFragment(
85
- fragment_type=PageFragmentType.TABLE,
86
- table=Table(data=table.html, encoding=TableEncoding.HTML),
87
- reading_order=table.reading_order,
88
- )
89
- )
90
-
91
- outputs.append(Page(number=page_num, fragments=fragments))
92
-
93
- return outputs