unstructured-ingest 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/integration/connectors/{databricks_tests → databricks}/test_volumes_native.py +75 -19
- test/integration/connectors/sql/test_postgres.py +6 -2
- test/integration/connectors/sql/test_singlestore.py +6 -2
- test/integration/connectors/sql/test_snowflake.py +6 -2
- test/integration/connectors/sql/test_sqlite.py +6 -2
- test/integration/connectors/test_milvus.py +13 -0
- test/integration/connectors/test_onedrive.py +6 -0
- test/integration/connectors/test_redis.py +119 -0
- test/integration/connectors/test_vectara.py +270 -0
- test/integration/embedders/test_bedrock.py +28 -0
- test/integration/embedders/test_octoai.py +14 -0
- test/integration/embedders/test_openai.py +13 -0
- test/integration/embedders/test_togetherai.py +10 -0
- test/integration/partitioners/test_partitioner.py +2 -2
- test/unit/embed/test_octoai.py +8 -1
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/bedrock.py +39 -11
- unstructured_ingest/embed/interfaces.py +5 -0
- unstructured_ingest/embed/octoai.py +44 -3
- unstructured_ingest/embed/openai.py +37 -1
- unstructured_ingest/embed/togetherai.py +28 -1
- unstructured_ingest/embed/voyageai.py +33 -1
- unstructured_ingest/v2/errors.py +18 -0
- unstructured_ingest/v2/interfaces/file_data.py +11 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +7 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +2 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +0 -1
- unstructured_ingest/v2/processes/connectors/couchbase.py +2 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +2 -2
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +5 -2
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +14 -3
- unstructured_ingest/v2/processes/connectors/milvus.py +15 -6
- unstructured_ingest/v2/processes/connectors/mongodb.py +3 -4
- unstructured_ingest/v2/processes/connectors/neo4j.py +2 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +79 -25
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +0 -1
- unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +5 -0
- unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
- unstructured_ingest/v2/unstructured_api.py +25 -2
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/METADATA +20 -16
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/RECORD +52 -48
- test/integration/connectors/test_kafka.py +0 -304
- /test/integration/connectors/{databricks_tests → databricks}/__init__.py +0 -0
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import json
|
|
3
|
+
import uuid
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Dict, Mapping, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import Field, Secret
|
|
10
|
+
|
|
11
|
+
from unstructured_ingest.error import DestinationConnectionError
|
|
12
|
+
from unstructured_ingest.utils.data_prep import flatten_dict
|
|
13
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
14
|
+
from unstructured_ingest.v2.interfaces import (
|
|
15
|
+
AccessConfig,
|
|
16
|
+
ConnectionConfig,
|
|
17
|
+
FileData,
|
|
18
|
+
Uploader,
|
|
19
|
+
UploaderConfig,
|
|
20
|
+
UploadStager,
|
|
21
|
+
UploadStagerConfig,
|
|
22
|
+
)
|
|
23
|
+
from unstructured_ingest.v2.logger import logger
|
|
24
|
+
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
|
|
25
|
+
|
|
26
|
+
# Root of the Vectara v2 REST API; all request endpoints are joined onto this.
BASE_URL = "https://api.vectara.io/v2"

# Registry key under which this destination connector is registered.
CONNECTOR_TYPE = "vectara"
31
|
+
class VectaraAccessConfig(AccessConfig):
    """OAuth2 client-credentials pair used to obtain a JWT from the Vectara auth endpoint."""

    # OAuth2 client id for the client_credentials grant
    oauth_client_id: str = Field(description="Client ID")
    # OAuth2 client secret for the client_credentials grant
    oauth_secret: str = Field(description="Client Secret")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class VectaraConnectionConfig(ConnectionConfig):
    """Connection settings for the Vectara destination."""

    # Secret-wrapped OAuth2 credentials
    access_config: Secret[VectaraAccessConfig]
    # Vectara customer id; substituted into token_url to form the Cognito token endpoint
    customer_id: str
    # Either corpus_name or corpus_key identifies the target corpus; when only the
    # name is given, the key is resolved during precheck (see VectaraUploader)
    corpus_name: Optional[str] = None
    corpus_key: Optional[str] = None
    # OAuth2 token endpoint template; "{}" is filled with customer_id
    token_url: str = "https://vectara-prod-{}.auth.us-west-2.amazoncognito.com/oauth2/token"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class VectaraUploadStagerConfig(UploadStagerConfig):
    # No stager-specific options; present for registry/config symmetry.
    pass
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class VectaraUploadStager(UploadStager):
    """Stage partitioned elements into the document payload shape Vectara expects."""

    upload_stager_config: VectaraUploadStagerConfig = field(
        default_factory=lambda: VectaraUploadStagerConfig()
    )

    @staticmethod
    def conform_dict(data: dict) -> dict:
        """
        Prepares dictionary in the format that Vectara requires.
        See more detail in https://docs.vectara.com/docs/rest-api/create-corpus-document

        Select which meta-data fields to include and optionally map them to a new format.
        remove the "metadata-" prefix from the keys
        """
        # Whitelist of flattened metadata keys and their Vectara-facing names.
        metadata_map = {
            "page_number": "page_number",
            "data_source-url": "url",
            "filename": "filename",
            "filetype": "filetype",
            "last_modified": "last_modified",
            "element_id": "element_id",
        }
        flattened = flatten_dict(data, separator="-", flatten_lists=True)
        deprefixed = {key.replace("metadata-", ""): value for key, value in flattened.items()}
        return {
            metadata_map[key]: value for key, value in deprefixed.items() if key in metadata_map
        }

    def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
        """Convert a partitioned-elements JSON file into a single Vectara document payload."""
        with input_file.open() as in_f:
            elements_contents = json.load(in_f)

        logger.info(
            f"Extending {len(elements_contents)} json elements from content in {input_file}"
        )

        # Each element becomes one document part; the rest of the element (minus
        # its text) is conformed into part-level metadata.
        document_parts = [
            {
                "text": element.pop("text", None),
                "metadata": self.conform_dict(data=element),
            }
            for element in elements_contents
        ]
        conformed_elements = [
            {
                "id": str(uuid.uuid4()),
                "type": "core",
                "metadata": {
                    "title": file_data.identifier,
                },
                "document_parts": document_parts,
            }
        ]

        with output_file.open("w") as out_f:
            json.dump(conformed_elements, out_f, indent=2)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class VectaraUploaderConfig(UploaderConfig):
    # No uploader-specific options; present for registry/config symmetry.
    pass
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@dataclass
class VectaraUploader(Uploader):
    """Destination uploader that indexes staged documents into a Vectara corpus.

    Authenticates via OAuth2 client credentials, caches the resulting JWT until
    shortly before expiry, and talks to the Vectara v2 REST API via httpx.
    """

    connector_type: str = CONNECTOR_TYPE
    upload_config: VectaraUploaderConfig
    connection_config: VectaraConnectionConfig
    # Cached OAuth2 bearer token and its absolute expiry (epoch seconds);
    # both are set together by the token getters.
    _jwt_token: Optional[str] = field(init=False, default=None)
    _jwt_token_expires_ts: Optional[float] = field(init=False, default=None)

    def is_async(self) -> bool:
        # Uploads are performed through run_data_async.
        return True

    def precheck(self) -> None:
        """Fail fast if the connection or corpus configuration is invalid."""
        try:
            self._check_connection_and_corpora()
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}")

    @property
    async def jwt_token_async(self) -> str:
        # Refresh when missing or within 60 seconds of expiry.
        if not self._jwt_token or self._jwt_token_expires_ts - datetime.now().timestamp() <= 60:
            self._jwt_token = await self._get_jwt_token_async()
        return self._jwt_token

    @property
    def jwt_token(self) -> str:
        # Synchronous counterpart of jwt_token_async (used by precheck).
        if not self._jwt_token or self._jwt_token_expires_ts - datetime.now().timestamp() <= 60:
            self._jwt_token = self._get_jwt_token()
        return self._jwt_token

    def _token_request_parts(self) -> tuple[str, dict, dict]:
        """Build the (endpoint, headers, form-data) for the OAuth2 token request."""
        token_endpoint = self.connection_config.token_url.format(self.connection_config.customer_id)
        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
        }
        access_config = self.connection_config.access_config.get_secret_value()
        data = {
            "grant_type": "client_credentials",
            "client_id": access_config.oauth_client_id,
            "client_secret": access_config.oauth_secret,
        }
        return token_endpoint, headers, data

    # Get Oauth2 JWT token
    @requires_dependencies(["httpx"], extras="vectara")
    async def _get_jwt_token_async(self) -> str:
        """Connect to the server and get a JWT token."""
        import httpx

        token_endpoint, headers, data = self._token_request_parts()

        async with httpx.AsyncClient() as client:
            response = await client.post(token_endpoint, headers=headers, data=data)
            response.raise_for_status()
            response_json = response.json()

        request_time = datetime.now().timestamp()
        self._jwt_token_expires_ts = request_time + response_json.get("expires_in")

        return response_json.get("access_token")

    # Get Oauth2 JWT token
    @requires_dependencies(["httpx"], extras="vectara")
    def _get_jwt_token(self) -> str:
        """Connect to the server and get a JWT token."""
        import httpx

        token_endpoint, headers, data = self._token_request_parts()

        with httpx.Client() as client:
            response = client.post(token_endpoint, headers=headers, data=data)
            response.raise_for_status()
            response_json = response.json()

        request_time = datetime.now().timestamp()
        self._jwt_token_expires_ts = request_time + response_json.get("expires_in")

        return response_json.get("access_token")

    @DestinationConnectionError.wrap
    def _check_connection_and_corpora(self) -> None:
        """
        Check the connection for Vectara and validate corpus exists.
        - If more than one corpus with the same name exists - raise error
        - If exactly one corpus exists with this name - use it.
        - If does not exist - raise error.
        """
        # Get token if not already set
        self.jwt_token

        # BUGFIX: _request returns the parsed JSON body (a dict). The previous
        # `_, list_corpora_response = self._request(...)` tuple-unpacking
        # iterated the dict's KEYS, binding a string instead of the response.
        list_corpora_response = self._request(
            http_method="GET",
            endpoint="corpora",
        )

        if self.connection_config.corpus_name:
            possible_corpora_keys_names_map = {
                corpus.get("key"): corpus.get("name")
                for corpus in list_corpora_response.get("corpora")
                if corpus.get("name") == self.connection_config.corpus_name
            }

            if len(possible_corpora_keys_names_map) > 1:
                raise ValueError(
                    f"Multiple Corpus exist with name {self.connection_config.corpus_name} in dest."
                )
            if len(possible_corpora_keys_names_map) == 1:
                if not self.connection_config.corpus_key:
                    # Resolve the key from the (unique) matching corpus name.
                    self.connection_config.corpus_key = list(
                        possible_corpora_keys_names_map.keys()
                    )[0]
                elif (
                    self.connection_config.corpus_key
                    != list(possible_corpora_keys_names_map.keys())[0]
                ):
                    raise ValueError("Corpus key does not match provided corpus name.")
            else:
                raise ValueError(
                    f"No Corpora exist with name {self.connection_config.corpus_name} in dest."
                )

    @requires_dependencies(["httpx"], extras="vectara")
    async def _async_request(
        self,
        endpoint: str,
        http_method: str = "POST",
        params: Optional[Mapping[str, Any]] = None,
        data: Optional[Mapping[str, Any]] = None,
    ) -> dict:
        """Issue an authenticated async request to BASE_URL/endpoint; return parsed JSON."""
        import httpx

        url = f"{BASE_URL}/{endpoint}"

        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {await self.jwt_token_async}",
            "X-source": "unstructured",
        }

        async with httpx.AsyncClient() as client:
            response = await client.request(
                method=http_method, url=url, headers=headers, params=params, json=data
            )
            response.raise_for_status()
            return response.json()

    @requires_dependencies(["httpx"], extras="vectara")
    def _request(
        self,
        endpoint: str,
        http_method: str = "POST",
        params: Optional[Mapping[str, Any]] = None,
        data: Optional[Mapping[str, Any]] = None,
    ) -> dict:
        """Issue an authenticated sync request to BASE_URL/endpoint; return parsed JSON."""
        import httpx

        url = f"{BASE_URL}/{endpoint}"

        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {self.jwt_token}",
            "X-source": "unstructured",
        }

        with httpx.Client() as client:
            response = client.request(
                method=http_method, url=url, headers=headers, params=params, json=data
            )
            response.raise_for_status()
            return response.json()

    async def _delete_doc(self, doc_id: str) -> dict:
        """
        Delete a document from the Vectara corpus.
        """
        return await self._async_request(
            endpoint=f"corpora/{self.connection_config.corpus_key}/documents/{doc_id}",
            http_method="DELETE",
        )

    async def _index_document(self, document: Dict[str, Any]) -> None:
        """
        Index a document (by uploading it to the Vectara corpus) from the document dictionary
        """
        logger.debug(
            f"Indexing document {document['id']} to corpus key {self.connection_config.corpus_key}"
        )

        try:
            result = await self._async_request(
                endpoint=f"corpora/{self.connection_config.corpus_key}/documents", data=document
            )
        except Exception as e:
            # Best-effort: log and skip this document rather than failing the batch.
            logger.error(f"exception {e} while indexing document {document['id']}")
            return

        if (
            "messages" in result
            and result["messages"]
            and (
                "ALREADY_EXISTS" in result["messages"]
                or (
                    "CONFLICT: Indexing doesn't support updating documents."
                    in result["messages"][0]
                )
            )
        ):
            # Vectara indexing cannot update in place: delete, then re-create.
            logger.info(f"document {document['id']} already exists, re-indexing")
            await self._delete_doc(document["id"])
            await self._async_request(
                endpoint=f"corpora/{self.connection_config.corpus_key}/documents", data=document
            )
            return

        logger.info(f"indexing document {document['id']} succeeded")

    async def run_data_async(
        self,
        data: list[dict],
        file_data: FileData,
        **kwargs: Any,
    ) -> None:
        """Upload all staged documents for a file concurrently."""
        logger.info(f"inserting / updating {len(data)} documents to Vectara ")
        await asyncio.gather(*(self._index_document(vdoc) for vdoc in data))
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
# Registry entry wiring the Vectara destination: connection, uploader, and stager configs.
vectara_destination_entry = DestinationRegistryEntry(
    connection_config=VectaraConnectionConfig,
    uploader=VectaraUploader,
    uploader_config=VectaraUploaderConfig,
    upload_stager=VectaraUploadStager,
    upload_stager_config=VectaraUploadStagerConfig,
)
|
|
@@ -2,6 +2,7 @@ from dataclasses import fields
|
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
from typing import TYPE_CHECKING, Optional
|
|
4
4
|
|
|
5
|
+
from unstructured_ingest.v2.errors import ProviderError, UserError
|
|
5
6
|
from unstructured_ingest.v2.logger import logger
|
|
6
7
|
|
|
7
8
|
if TYPE_CHECKING:
|
|
@@ -51,6 +52,22 @@ def create_partition_request(filename: Path, parameters_dict: dict) -> "Partitio
|
|
|
51
52
|
return PartitionRequest(partition_parameters=partition_params)
|
|
52
53
|
|
|
53
54
|
|
|
55
|
+
def handle_error(e: Exception):
    """Translate errors from the Unstructured API client into UserError/ProviderError.

    4xx SDK errors become UserError, 5xx become ProviderError; anything else is
    logged and re-raised unchanged.
    """
    from unstructured_client.models.errors.sdkerror import SDKError

    if not isinstance(e, SDKError):
        logger.error(f"Uncaught Error calling API: {e}")
        raise e

    logger.error(f"Error calling Unstructured API: {e}")
    status = e.status_code
    if 400 <= status < 500:
        raise UserError(e.body)
    if status >= 500:
        raise ProviderError(e.body)
    raise e
|
|
69
|
+
|
|
70
|
+
|
|
54
71
|
async def call_api_async(
|
|
55
72
|
server_url: Optional[str], api_key: Optional[str], filename: Path, api_parameters: dict
|
|
56
73
|
) -> list[dict]:
|
|
@@ -71,7 +88,10 @@ async def call_api_async(
|
|
|
71
88
|
api_key_auth=api_key,
|
|
72
89
|
)
|
|
73
90
|
partition_request = create_partition_request(filename=filename, parameters_dict=api_parameters)
|
|
74
|
-
|
|
91
|
+
try:
|
|
92
|
+
res = await client.general.partition_async(request=partition_request)
|
|
93
|
+
except Exception as e:
|
|
94
|
+
handle_error(e)
|
|
75
95
|
|
|
76
96
|
return res.elements or []
|
|
77
97
|
|
|
@@ -96,6 +116,9 @@ def call_api(
|
|
|
96
116
|
api_key_auth=api_key,
|
|
97
117
|
)
|
|
98
118
|
partition_request = create_partition_request(filename=filename, parameters_dict=api_parameters)
|
|
99
|
-
|
|
119
|
+
try:
|
|
120
|
+
res = client.general.partition(request=partition_request)
|
|
121
|
+
except Exception as e:
|
|
122
|
+
handle_error(e)
|
|
100
123
|
|
|
101
124
|
return res.elements or []
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: unstructured-ingest
|
|
3
|
-
Version: 0.3.10
|
|
3
|
+
Version: 0.3.12
|
|
4
4
|
Summary: A library that prepares raw documents for downstream ML tasks.
|
|
5
5
|
Home-page: https://github.com/Unstructured-IO/unstructured-ingest
|
|
6
6
|
Author: Unstructured Technologies
|
|
@@ -22,14 +22,14 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
22
22
|
Requires-Python: >=3.9.0,<3.13
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
24
|
License-File: LICENSE.md
|
|
25
|
-
Requires-Dist: opentelemetry-sdk
|
|
26
|
-
Requires-Dist: python-dateutil
|
|
27
|
-
Requires-Dist: click
|
|
28
25
|
Requires-Dist: ndjson
|
|
26
|
+
Requires-Dist: python-dateutil
|
|
29
27
|
Requires-Dist: pydantic>=2.7
|
|
30
28
|
Requires-Dist: pandas
|
|
31
|
-
Requires-Dist: tqdm
|
|
32
29
|
Requires-Dist: dataclasses-json
|
|
30
|
+
Requires-Dist: tqdm
|
|
31
|
+
Requires-Dist: click
|
|
32
|
+
Requires-Dist: opentelemetry-sdk
|
|
33
33
|
Provides-Extra: airtable
|
|
34
34
|
Requires-Dist: pyairtable; extra == "airtable"
|
|
35
35
|
Provides-Extra: astradb
|
|
@@ -42,11 +42,11 @@ Requires-Dist: azure-search-documents; extra == "azure-ai-search"
|
|
|
42
42
|
Provides-Extra: bedrock
|
|
43
43
|
Requires-Dist: boto3; extra == "bedrock"
|
|
44
44
|
Provides-Extra: biomed
|
|
45
|
-
Requires-Dist: bs4; extra == "biomed"
|
|
46
45
|
Requires-Dist: requests; extra == "biomed"
|
|
46
|
+
Requires-Dist: bs4; extra == "biomed"
|
|
47
47
|
Provides-Extra: box
|
|
48
|
-
Requires-Dist: fsspec; extra == "box"
|
|
49
48
|
Requires-Dist: boxfs; extra == "box"
|
|
49
|
+
Requires-Dist: fsspec; extra == "box"
|
|
50
50
|
Provides-Extra: chroma
|
|
51
51
|
Requires-Dist: chromadb; extra == "chroma"
|
|
52
52
|
Provides-Extra: clarifai
|
|
@@ -90,9 +90,9 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
|
|
|
90
90
|
Provides-Extra: epub
|
|
91
91
|
Requires-Dist: unstructured[epub]; extra == "epub"
|
|
92
92
|
Provides-Extra: gcs
|
|
93
|
+
Requires-Dist: gcsfs; extra == "gcs"
|
|
93
94
|
Requires-Dist: bs4; extra == "gcs"
|
|
94
95
|
Requires-Dist: fsspec; extra == "gcs"
|
|
95
|
-
Requires-Dist: gcsfs; extra == "gcs"
|
|
96
96
|
Provides-Extra: github
|
|
97
97
|
Requires-Dist: pygithub>1.58.0; extra == "github"
|
|
98
98
|
Requires-Dist: requests; extra == "github"
|
|
@@ -101,8 +101,8 @@ Requires-Dist: python-gitlab; extra == "gitlab"
|
|
|
101
101
|
Provides-Extra: google-drive
|
|
102
102
|
Requires-Dist: google-api-python-client; extra == "google-drive"
|
|
103
103
|
Provides-Extra: hubspot
|
|
104
|
-
Requires-Dist: hubspot-api-client; extra == "hubspot"
|
|
105
104
|
Requires-Dist: urllib3; extra == "hubspot"
|
|
105
|
+
Requires-Dist: hubspot-api-client; extra == "hubspot"
|
|
106
106
|
Provides-Extra: jira
|
|
107
107
|
Requires-Dist: atlassian-python-api; extra == "jira"
|
|
108
108
|
Provides-Extra: kafka
|
|
@@ -120,20 +120,20 @@ Requires-Dist: pymongo; extra == "mongodb"
|
|
|
120
120
|
Provides-Extra: msg
|
|
121
121
|
Requires-Dist: unstructured[msg]; extra == "msg"
|
|
122
122
|
Provides-Extra: neo4j
|
|
123
|
-
Requires-Dist: cymple; extra == "neo4j"
|
|
124
123
|
Requires-Dist: neo4j; extra == "neo4j"
|
|
125
124
|
Requires-Dist: networkx; extra == "neo4j"
|
|
125
|
+
Requires-Dist: cymple; extra == "neo4j"
|
|
126
126
|
Provides-Extra: notion
|
|
127
|
-
Requires-Dist: htmlBuilder; extra == "notion"
|
|
128
127
|
Requires-Dist: backoff; extra == "notion"
|
|
129
|
-
Requires-Dist: notion-client; extra == "notion"
|
|
128
|
+
Requires-Dist: htmlBuilder; extra == "notion"
|
|
130
129
|
Requires-Dist: httpx; extra == "notion"
|
|
130
|
+
Requires-Dist: notion-client; extra == "notion"
|
|
131
131
|
Provides-Extra: odt
|
|
132
132
|
Requires-Dist: unstructured[odt]; extra == "odt"
|
|
133
133
|
Provides-Extra: onedrive
|
|
134
|
-
Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
|
|
135
134
|
Requires-Dist: bs4; extra == "onedrive"
|
|
136
135
|
Requires-Dist: msal; extra == "onedrive"
|
|
136
|
+
Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
|
|
137
137
|
Provides-Extra: openai
|
|
138
138
|
Requires-Dist: openai; extra == "openai"
|
|
139
139
|
Requires-Dist: tiktoken; extra == "openai"
|
|
@@ -142,8 +142,8 @@ Requires-Dist: opensearch-py; extra == "opensearch"
|
|
|
142
142
|
Provides-Extra: org
|
|
143
143
|
Requires-Dist: unstructured[org]; extra == "org"
|
|
144
144
|
Provides-Extra: outlook
|
|
145
|
-
Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
|
|
146
145
|
Requires-Dist: msal; extra == "outlook"
|
|
146
|
+
Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
|
|
147
147
|
Provides-Extra: pdf
|
|
148
148
|
Requires-Dist: unstructured[pdf]; extra == "pdf"
|
|
149
149
|
Provides-Extra: pinecone
|
|
@@ -158,6 +158,8 @@ Provides-Extra: qdrant
|
|
|
158
158
|
Requires-Dist: qdrant-client; extra == "qdrant"
|
|
159
159
|
Provides-Extra: reddit
|
|
160
160
|
Requires-Dist: praw; extra == "reddit"
|
|
161
|
+
Provides-Extra: redis
|
|
162
|
+
Requires-Dist: redis; extra == "redis"
|
|
161
163
|
Provides-Extra: remote
|
|
162
164
|
Requires-Dist: unstructured-client>=0.26.1; extra == "remote"
|
|
163
165
|
Provides-Extra: rst
|
|
@@ -170,11 +172,11 @@ Requires-Dist: fsspec; extra == "s3"
|
|
|
170
172
|
Provides-Extra: salesforce
|
|
171
173
|
Requires-Dist: simple-salesforce; extra == "salesforce"
|
|
172
174
|
Provides-Extra: sftp
|
|
173
|
-
Requires-Dist: fsspec; extra == "sftp"
|
|
174
175
|
Requires-Dist: paramiko; extra == "sftp"
|
|
176
|
+
Requires-Dist: fsspec; extra == "sftp"
|
|
175
177
|
Provides-Extra: sharepoint
|
|
176
|
-
Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
|
|
177
178
|
Requires-Dist: msal; extra == "sharepoint"
|
|
179
|
+
Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
|
|
178
180
|
Provides-Extra: singlestore
|
|
179
181
|
Requires-Dist: singlestoredb; extra == "singlestore"
|
|
180
182
|
Provides-Extra: slack
|
|
@@ -187,6 +189,8 @@ Requires-Dist: together; extra == "togetherai"
|
|
|
187
189
|
Provides-Extra: tsv
|
|
188
190
|
Requires-Dist: unstructured[tsv]; extra == "tsv"
|
|
189
191
|
Provides-Extra: vectara
|
|
192
|
+
Requires-Dist: httpx; extra == "vectara"
|
|
193
|
+
Requires-Dist: aiofiles; extra == "vectara"
|
|
190
194
|
Requires-Dist: requests; extra == "vectara"
|
|
191
195
|
Provides-Extra: weaviate
|
|
192
196
|
Requires-Dist: weaviate-client; extra == "weaviate"
|