unstructured-ingest 0.3.11__py3-none-any.whl → 0.3.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/integration/connectors/test_milvus.py +13 -0
- test/integration/connectors/test_onedrive.py +6 -0
- test/integration/connectors/test_redis.py +119 -0
- test/integration/connectors/test_vectara.py +270 -0
- test/integration/embedders/test_bedrock.py +28 -0
- test/integration/embedders/test_octoai.py +14 -0
- test/integration/embedders/test_openai.py +13 -0
- test/integration/embedders/test_togetherai.py +10 -0
- test/integration/partitioners/test_partitioner.py +2 -2
- test/unit/embed/test_octoai.py +8 -1
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/bedrock.py +39 -11
- unstructured_ingest/embed/interfaces.py +5 -0
- unstructured_ingest/embed/octoai.py +44 -3
- unstructured_ingest/embed/openai.py +37 -1
- unstructured_ingest/embed/togetherai.py +28 -1
- unstructured_ingest/embed/voyageai.py +33 -1
- unstructured_ingest/v2/errors.py +18 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +7 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +0 -1
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +5 -2
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +14 -3
- unstructured_ingest/v2/processes/connectors/milvus.py +15 -6
- unstructured_ingest/v2/processes/connectors/neo4j.py +2 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +79 -25
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +0 -1
- unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
- unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
- unstructured_ingest/v2/unstructured_api.py +25 -2
- {unstructured_ingest-0.3.11.dist-info → unstructured_ingest-0.3.12.dist-info}/METADATA +23 -19
- {unstructured_ingest-0.3.11.dist-info → unstructured_ingest-0.3.12.dist-info}/RECORD +35 -31
- test/integration/connectors/test_kafka.py +0 -304
- {unstructured_ingest-0.3.11.dist-info → unstructured_ingest-0.3.12.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.11.dist-info → unstructured_ingest-0.3.12.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.11.dist-info → unstructured_ingest-0.3.12.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.11.dist-info → unstructured_ingest-0.3.12.dist-info}/top_level.txt +0 -0
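Taken together, the headline changes are two new destination connectors (Redis and Vectara), a shared error taxonomy in unstructured_ingest/v2/errors.py, and provider-error wrapping in the embedders. As a minimal sketch of how the new Redis destination is wired up, based on the test code reproduced below (the connection URI is a placeholder, and batch_size=10 simply mirrors the tests):

from unstructured_ingest.v2.processes.connectors.redisdb import (
    RedisAccessConfig,
    RedisConnectionConfig,
    RedisUploader,
    RedisUploaderConfig,
)

# Placeholder endpoint; the tests below use an Azure Cache for Redis host over TLS.
uploader = RedisUploader(
    connection_config=RedisConnectionConfig(
        access_config=RedisAccessConfig(uri="rediss://:<password>@<host>:6380/0"),
    ),
    upload_config=RedisUploaderConfig(batch_size=10),
)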
test/integration/connectors/test_milvus.py
CHANGED
@@ -174,6 +174,19 @@ def test_precheck_fails_on_nonexistent_collection(collection: str):
         uploader.precheck()
 
 
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+def test_precheck_fails_on_nonexisting_db(collection: str):
+    uploader = MilvusUploader(
+        connection_config=MilvusConnectionConfig(uri=DB_URI),
+        upload_config=MilvusUploaderConfig(db_name="nonexisting_db", collection_name=collection),
+    )
+    with pytest.raises(
+        DestinationConnectionError,
+        match="database not found",
+    ):
+        uploader.precheck()
+
+
 @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
 def test_milvus_stager(
     request: TopRequest,
test/integration/connectors/test_onedrive.py
CHANGED
@@ -20,6 +20,9 @@ from unstructured_ingest.v2.processes.connectors.onedrive import (
 
 
 @pytest.fixture
+@pytest.mark.xfail(
+    reason="Issues with test setup on the provider side."
+)  # TODO: remove line when issues are addressed
 def onedrive_test_folder() -> str:
     """
     Pytest fixture that creates a test folder in OneDrive and deletes it after test run.
@@ -66,6 +69,9 @@ def get_connection_config():
 
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
 @requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
+@pytest.mark.xfail(
+    reason="Issues with test setup on the provider side."
+)  # TODO: remove line when issues are addressed
 def test_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
     """
     Integration test for the OneDrive destination connector.
test/integration/connectors/test_redis.py
ADDED
@@ -0,0 +1,119 @@
+import asyncio
+import json
+import os
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+import pytest
+from redis import exceptions as redis_exceptions
+from redis.asyncio import Redis, from_url
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG
+from test.integration.utils import requires_env
+from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.redisdb import (
+    CONNECTOR_TYPE as REDIS_CONNECTOR_TYPE,
+)
+from unstructured_ingest.v2.processes.connectors.redisdb import (
+    RedisAccessConfig,
+    RedisConnectionConfig,
+    RedisUploader,
+    RedisUploaderConfig,
+)
+
+
+async def delete_record(client: Redis, element_id: str) -> None:
+    await client.delete(element_id)
+
+
+async def validate_upload(client: Redis, first_element: dict):
+    element_id = first_element["element_id"]
+    expected_text = first_element["text"]
+    expected_embeddings = first_element["embeddings"]
+    async with client.pipeline(transaction=True) as pipe:
+        try:
+            response = await pipe.json().get(element_id, "$").execute()
+            response = response[0][0]
+        except redis_exceptions.ResponseError:
+            response = await pipe.get(element_id).execute()
+            response = json.loads(response[0])
+
+    embedding_similarity = np.linalg.norm(
+        np.array(response["embeddings"]) - np.array(expected_embeddings)
+    )
+
+    assert response is not None
+    assert response["element_id"] == element_id
+    assert response["text"] == expected_text
+    assert embedding_similarity < 1e-10
+
+
+async def redis_destination_test(
+    upload_file: Path,
+    tmp_path: Path,
+    connection_kwargs: dict,
+    uri: Optional[str] = None,
+    password: Optional[str] = None,
+):
+    uploader = RedisUploader(
+        connection_config=RedisConnectionConfig(
+            **connection_kwargs, access_config=RedisAccessConfig(uri=uri, password=password)
+        ),
+        upload_config=RedisUploaderConfig(batch_size=10),
+    )
+
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=REDIS_CONNECTOR_TYPE,
+        identifier="mock-file-data",
+    )
+    with upload_file.open() as upload_fp:
+        elements = json.load(upload_fp)
+    first_element = elements[0]
+
+    try:
+        if uploader.is_async():
+            await uploader.run_data_async(data=elements, file_data=file_data)
+
+        if uri:
+            async with from_url(uri) as client:
+                await validate_upload(client=client, first_element=first_element)
+        else:
+            async with Redis(**connection_kwargs, password=password) as client:
+                await validate_upload(client=client, first_element=first_element)
+    except Exception as e:
+        raise e
+    finally:
+        if uri:
+            async with from_url(uri) as client:
+                tasks = [delete_record(client, element["element_id"]) for element in elements]
+                await asyncio.gather(*tasks)
+        else:
+            async with Redis(**connection_kwargs, password=password) as client:
+                tasks = [delete_record(client, element["element_id"]) for element in elements]
+                await asyncio.gather(*tasks)
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG)
+@requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
+async def test_redis_destination_azure_with_password(upload_file: Path, tmp_path: Path):
+    connection_kwargs = {
+        "host": "utic-dashboard-dev.redis.cache.windows.net",
+        "port": 6380,
+        "db": 0,
+        "ssl": True,
+    }
+    redis_pw = os.environ["AZURE_REDIS_INGEST_TEST_PASSWORD"]
+    await redis_destination_test(upload_file, tmp_path, connection_kwargs, password=redis_pw)
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG, "redis")
+@requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
+async def test_redis_destination_azure_with_uri(upload_file: Path, tmp_path: Path):
+    connection_kwargs = {}
+    redis_pw = os.environ["AZURE_REDIS_INGEST_TEST_PASSWORD"]
+    uri = f"rediss://:{redis_pw}@utic-dashboard-dev.redis.cache.windows.net:6380/0"
+    await redis_destination_test(upload_file, tmp_path, connection_kwargs, uri=uri)
test/integration/connectors/test_vectara.py
ADDED
@@ -0,0 +1,270 @@
+import json
+import os
+import time
+from pathlib import Path
+from typing import Generator
+from uuid import uuid4
+
+import pytest
+import requests
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG
+from test.integration.utils import requires_env
+from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connectors.vectara import (
+    CONNECTOR_TYPE as VECTARA_CONNECTOR_TYPE,
+)
+from unstructured_ingest.v2.processes.connectors.vectara import (
+    VectaraAccessConfig,
+    VectaraConnectionConfig,
+    VectaraUploader,
+    VectaraUploaderConfig,
+    VectaraUploadStager,
+    VectaraUploadStagerConfig,
+)
+
+
+def validate_upload(response: dict, expected_data: dict):
+    element_id = expected_data["element_id"]
+    expected_text = expected_data["text"]
+    filename = expected_data["metadata"]["filename"]
+    filetype = expected_data["metadata"]["filetype"]
+    page_number = expected_data["metadata"]["page_number"]
+
+    response = response["search_results"][0]
+
+    assert response is not None
+    assert response["text"] == expected_text
+    assert response["part_metadata"]["element_id"] == element_id
+    assert response["part_metadata"]["filename"] == filename
+    assert response["part_metadata"]["filetype"] == filetype
+    assert response["part_metadata"]["page_number"] == page_number
+
+
+@requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
+def _get_jwt_token():
+    """Connect to the server and get a JWT token."""
+    customer_id = os.environ["VECTARA_CUSTOMER_ID"]
+    token_endpoint = (
+        f"https://vectara-prod-{customer_id}.auth.us-west-2.amazoncognito.com/oauth2/token"
+    )
+    headers = {
+        "Content-Type": "application/x-www-form-urlencoded",
+    }
+    data = {
+        "grant_type": "client_credentials",
+        "client_id": os.environ["VECTARA_OAUTH_CLIENT_ID"],
+        "client_secret": os.environ["VECTARA_OAUTH_SECRET"],
+    }
+
+    response = requests.post(token_endpoint, headers=headers, data=data)
+    response.raise_for_status()
+    response_json = response.json()
+
+    return response_json.get("access_token")
+
+
+def query_data(corpus_key: str, element_id: str) -> dict:
+
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/query"
+
+    # the query below requires the corpus to have filter attributes for element_id
+
+    data = json.dumps(
+        {
+            "query": "string",
+            "search": {
+                "metadata_filter": f"part.element_id = '{element_id}'",
+                "lexical_interpolation": 1,
+                "limit": 10,
+            },
+        }
+    )
+
+    jwt_token = _get_jwt_token()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {jwt_token}",
+        "X-source": "unstructured",
+    }
+
+    response = requests.post(url, headers=headers, data=data)
+    response.raise_for_status()
+    response_json = response.json()
+
+    return response_json
+
+
+def create_corpora(corpus_key: str, corpus_name: str) -> None:
+    url = "https://api.vectara.io/v2/corpora"
+    data = json.dumps({"key": corpus_key, "name": corpus_name, "description": "integration test"})
+    jwt_token = _get_jwt_token()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {jwt_token}",
+        "X-source": "unstructured",
+    }
+
+    response = requests.post(url, headers=headers, data=data)
+    response.raise_for_status()
+
+
+def replace_filter_attributes(corpus_key: str) -> None:
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/replace_filter_attributes"
+    data = json.dumps(
+        {
+            "filter_attributes": [
+                {"name": "element_id", "level": "part", "indexed": True, "type": "text"}
+            ]
+        }
+    )
+    jwt_token = _get_jwt_token()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {jwt_token}",
+        "X-source": "unstructured",
+    }
+
+    response = requests.post(url, headers=headers, data=data)
+    response.raise_for_status()
+
+
+def delete_corpora(corpus_key: str) -> None:
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}"
+
+    jwt_token = _get_jwt_token()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {jwt_token}",
+        "X-source": "unstructured",
+    }
+
+    response = requests.delete(url, headers=headers)
+    response.raise_for_status()
+
+
+def list_corpora() -> list:
+    url = "https://api.vectara.io/v2/corpora?limit=100"
+    jwt_token = _get_jwt_token()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {jwt_token}",
+        "X-source": "unstructured",
+    }
+    response = requests.get(url, headers=headers)
+    response.raise_for_status()
+    response_json = response.json()
+    if response_json.get("corpora"):
+        return [item["key"] for item in response_json.get("corpora")]
+    else:
+        return []
+
+
+def wait_for_ready(corpus_key: str, timeout=60, interval=2) -> None:
+    def is_ready_status():
+        corpora_list = list_corpora()
+        return corpus_key in corpora_list
+
+    start = time.time()
+    is_ready = is_ready_status()
+    while not is_ready and time.time() - start < timeout:
+        time.sleep(interval)
+        is_ready = is_ready_status()
+    if not is_ready:
+        raise TimeoutError("time out waiting for corpus to be ready")
+
+
+def wait_for_delete(corpus_key: str, timeout=60, interval=2) -> None:
+    start = time.time()
+    while time.time() - start < timeout:
+        corpora_list = list_corpora()
+        if corpus_key not in corpora_list:
+            return
+        time.sleep(interval)
+
+    raise TimeoutError("time out waiting for corpus to delete")
+
+
+@pytest.fixture
+def corpora_util() -> Generator[str, None, None]:
+    random_id = str(uuid4()).split("-")[0]
+    corpus_key = f"ingest-test-{random_id}"
+    corpus_name = "ingest-test"
+    logger.info(f"Creating corpus with key: {corpus_key}")
+    try:
+        create_corpora(corpus_key, corpus_name)
+        replace_filter_attributes(corpus_key)
+        wait_for_ready(corpus_key=corpus_key)
+        yield corpus_key
+    except Exception as e:
+        logger.error(f"failed to create corpus {corpus_key}: {e}")
+    finally:
+        logger.info(f"deleting corpus: {corpus_key}")
+        delete_corpora(corpus_key)
+        wait_for_delete(corpus_key=corpus_key)
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(VECTARA_CONNECTOR_TYPE, DESTINATION_TAG, "vectara")
+@requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
+async def test_vectara_destination(
+    upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=10
+):
+    corpus_key = corpora_util
+    connection_kwargs = {
+        "customer_id": os.environ["VECTARA_CUSTOMER_ID"],
+        "corpus_key": corpus_key,
+    }
+
+    oauth_client_id = os.environ["VECTARA_OAUTH_CLIENT_ID"]
+    oauth_secret = os.environ["VECTARA_OAUTH_SECRET"]
+
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=VECTARA_CONNECTOR_TYPE,
+        identifier="mock-file-data",
+    )
+
+    stager_config = VectaraUploadStagerConfig(batch_size=10)
+    stager = VectaraUploadStager(upload_stager_config=stager_config)
+    new_upload_file = stager.run(
+        elements_filepath=upload_file,
+        output_dir=tmp_path,
+        output_filename=upload_file.name,
+        file_data=file_data,
+    )
+
+    uploader = VectaraUploader(
+        connection_config=VectaraConnectionConfig(
+            **connection_kwargs,
+            access_config=VectaraAccessConfig(
+                oauth_client_id=oauth_client_id, oauth_secret=oauth_secret
+            ),
+        ),
+        upload_config=VectaraUploaderConfig(),
+    )
+
+    with new_upload_file.open() as new_upload_fp:
+        elements_stager = json.load(new_upload_fp)
+
+    if uploader.is_async():
+        await uploader.run_data_async(data=elements_stager, file_data=file_data)
+
+    with upload_file.open() as upload_fp:
+        elements = json.load(upload_fp)
+    first_element = elements[0]
+
+    for i in range(retries):
+        response = query_data(corpus_key, first_element["element_id"])
+        if not response["search_results"]:
+            time.sleep(interval)
+        else:
+            break
+
+    validate_upload(response=response, expected_data=first_element)
test/integration/embedders/test_bedrock.py
CHANGED
@@ -2,9 +2,12 @@ import json
 import os
 from pathlib import Path
 
+import pytest
+
 from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
 from test.integration.utils import requires_env
 from unstructured_ingest.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder
+from unstructured_ingest.v2.errors import UserAuthError, UserError
 from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
 
 
@@ -47,3 +50,28 @@ def test_raw_bedrock_embedder(embedder_file: Path):
         expected_dimensions=(1536,),
         expected_is_unit_vector=False,
     )
+
+
+def test_raw_bedrock_embedder_invalid_credentials(embedder_file: Path):
+    embedder = BedrockEmbeddingEncoder(
+        config=BedrockEmbeddingConfig(
+            aws_access_key_id="no_key",
+            aws_secret_access_key="no_secret",
+        )
+    )
+    with pytest.raises(UserAuthError):
+        embedder.get_exemplary_embedding()
+
+
+@requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
+def test_raw_bedrock_embedder_invalid_model(embedder_file: Path):
+    aws_credentials = get_aws_credentials()
+    embedder = BedrockEmbeddingEncoder(
+        config=BedrockEmbeddingConfig(
+            aws_access_key_id=aws_credentials["aws_access_key_id"],
+            aws_secret_access_key=aws_credentials["aws_secret_access_key"],
+            model_name="invalid_model",
+        )
+    )
+    with pytest.raises(UserError):
+        embedder.get_exemplary_embedding()
test/integration/embedders/test_octoai.py
CHANGED
@@ -2,9 +2,12 @@ import json
 import os
 from pathlib import Path
 
+import pytest
+
 from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
 from test.integration.utils import requires_env
 from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
+from unstructured_ingest.v2.errors import UserAuthError
 from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
 
 API_KEY = "OCTOAI_API_KEY"
@@ -39,3 +42,14 @@ def test_raw_octoai_embedder(embedder_file: Path):
     validate_raw_embedder(
         embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1024,)
     )
+
+
+@pytest.mark.skip(reason="Unexpected connection error at the moment")
+def test_raw_octoai_embedder_invalid_credentials():
+    embedder = OctoAIEmbeddingEncoder(
+        config=OctoAiEmbeddingConfig(
+            api_key="fake_api_key",
+        )
+    )
+    with pytest.raises(UserAuthError):
+        embedder.get_exemplary_embedding()
test/integration/embedders/test_openai.py
CHANGED
@@ -2,9 +2,12 @@ import json
 import os
 from pathlib import Path
 
+import pytest
+
 from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
 from test.integration.utils import requires_env
 from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
+from unstructured_ingest.v2.errors import UserAuthError
 from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
 
 API_KEY = "OPENAI_API_KEY"
@@ -39,3 +42,13 @@ def test_raw_openai_embedder(embedder_file: Path):
     validate_raw_embedder(
         embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1536,)
     )
+
+
+def test_raw_openai_embedder_invalid_credentials():
+    embedder = OpenAIEmbeddingEncoder(
+        config=OpenAIEmbeddingConfig(
+            api_key="fake_api_key",
+        )
+    )
+    with pytest.raises(UserAuthError):
+        embedder.get_exemplary_embedding()
test/integration/embedders/test_togetherai.py
CHANGED
@@ -2,12 +2,15 @@ import json
 import os
 from pathlib import Path
 
+import pytest
+
 from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
 from test.integration.utils import requires_env
 from unstructured_ingest.embed.togetherai import (
     TogetherAIEmbeddingConfig,
     TogetherAIEmbeddingEncoder,
 )
+from unstructured_ingest.v2.errors import UserAuthError
 from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
 
 API_KEY = "TOGETHERAI_API_KEY"
@@ -41,3 +44,10 @@ def test_raw_togetherai_embedder(embedder_file: Path):
         expected_dimensions=(768,),
         expected_is_unit_vector=False,
     )
+
+
+def test_raw_togetherai_embedder_invalid_credentials():
+    embedder = TogetherAIEmbeddingEncoder(config=TogetherAIEmbeddingConfig(api_key="fake_api_key"))
+
+    with pytest.raises(UserAuthError):
+        embedder.get_exemplary_embedding()
test/integration/partitioners/test_partitioner.py
CHANGED
@@ -3,9 +3,9 @@ import os
 from pathlib import Path
 
 import pytest
-from unstructured_client.models.errors.sdkerror import SDKError
 
 from test.integration.utils import requires_env
+from unstructured_ingest.v2.errors import UserError
 from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig
 
 int_test_dir = Path(__file__).parent
@@ -71,5 +71,5 @@ async def test_partitioner_api_fast_error(partition_file: Path):
         strategy="fast", partition_by_api=True, api_key=api_key, partition_endpoint=api_url
     )
     partitioner = Partitioner(config=partitioner_config)
-    with pytest.raises(SDKError):
+    with pytest.raises(UserError):
         await partitioner.run_async(filename=partition_file)
test/unit/embed/test_octoai.py
CHANGED
@@ -4,7 +4,14 @@ from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbedd
 def test_embed_documents_does_not_break_element_to_dict(mocker):
     # Mocked client with the desired behavior for embed_documents
     mock_client = mocker.MagicMock()
-
+    mock_data = []
+    for i in range(2):
+        data = mocker.MagicMock()
+        data.embedding = [1, 2]
+        mock_data.append(data)
+    mock_response = mocker.MagicMock()
+    mock_response.data = mock_data
+    mock_client.embeddings.create.return_value = mock_response
 
     # Mock get_client to return our mock_client
     mocker.patch.object(OctoAiEmbeddingConfig, "get_client", return_value=mock_client)
unstructured_ingest/__version__.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.3.11"  # pragma: no cover
+__version__ = "0.3.12"  # pragma: no cover
unstructured_ingest/embed/bedrock.py
CHANGED
@@ -6,7 +6,9 @@ from typing import TYPE_CHECKING
 from pydantic import Field, SecretStr
 
 from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.logger import logger
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.errors import ProviderError, RateLimitError, UserAuthError, UserError
 
 if TYPE_CHECKING:
     from botocore.client import BaseClient
@@ -44,6 +46,32 @@ class BedrockEmbeddingConfig(EmbeddingConfig):
 class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
     config: BedrockEmbeddingConfig
 
+    def wrap_error(self, e: Exception) -> Exception:
+        from botocore.exceptions import ClientError
+
+        if isinstance(e, ClientError):
+            # https://docs.aws.amazon.com/awssupport/latest/APIReference/CommonErrors.html
+            http_response = e.response
+            meta = http_response["ResponseMetadata"]
+            http_response_code = meta["HTTPStatusCode"]
+            error_code = http_response["Error"]["Code"]
+            if http_response_code == 400:
+                if error_code == "ValidationError":
+                    return UserError(http_response["Error"])
+                elif error_code == "ThrottlingException":
+                    return RateLimitError(http_response["Error"])
+                elif error_code == "NotAuthorized" or error_code == "AccessDeniedException":
+                    return UserAuthError(http_response["Error"])
+            if http_response_code == 403:
+                return UserAuthError(http_response["Error"])
+            if 400 <= http_response_code < 500:
+                return UserError(http_response["Error"])
+            if http_response_code >= 500:
+                return ProviderError(http_response["Error"])
+
+        logger.error(f"unhandled exception from bedrock: {e}", exc_info=True)
+        return e
+
     def embed_query(self, query: str) -> list[float]:
         """Call out to Bedrock embedding endpoint."""
         # replace newlines, which can negatively affect performance.
@@ -61,25 +89,25 @@ class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
             input_body["inputText"] = text
         body = json.dumps(input_body)
 
+        bedrock_client = self.config.get_client()
+        # invoke bedrock API
         try:
-            bedrock_client = self.config.get_client()
-            # invoke bedrock API
             response = bedrock_client.invoke_model(
                 body=body,
                 modelId=self.config.embed_model_name,
                 accept="application/json",
                 contentType="application/json",
             )
-
-            # format output based on provider
-            response_body = json.loads(response.get("body").read())
-            if provider == "cohere":
-                return response_body.get("embeddings")[0]
-            else:
-                # includes common provider == "amazon"
-                return response_body.get("embedding")
         except Exception as e:
-            raise
+            raise self.wrap_error(e=e)
+
+        # format output based on provider
+        response_body = json.loads(response.get("body").read())
+        if provider == "cohere":
+            return response_body.get("embeddings")[0]
+        else:
+            # includes common provider == "amazon"
+            return response_body.get("embedding")
 
     def embed_documents(self, elements: list[dict]) -> list[dict]:
         embeddings = [self.embed_query(query=e.get("text", "")) for e in elements]
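The practical effect of wrap_error is that callers of the Bedrock encoder now see the shared v2 error types instead of raw botocore exceptions. A minimal sketch of the behavior the new tests assert, using deliberately invalid credentials:

from unstructured_ingest.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder
from unstructured_ingest.v2.errors import UserAuthError

embedder = BedrockEmbeddingEncoder(
    config=BedrockEmbeddingConfig(aws_access_key_id="no_key", aws_secret_access_key="no_secret")
)
try:
    embedder.get_exemplary_embedding()
except UserAuthError as e:
    # The bad-credential ClientError from botocore is re-raised as UserAuthError.
    print(f"auth failure surfaced as: {e}")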
unstructured_ingest/embed/interfaces.py
CHANGED
@@ -17,6 +17,11 @@ class BaseEmbeddingEncoder(ABC):
         """Initializes the embedding encoder class. Should also validate the instance
         is properly configured: e.g., embed a single a element"""
 
+    def wrap_error(self, e: Exception) -> Exception:
+        """Handle errors from the embedding service. Should raise a more informative error
+        if possible"""
+        return e
+
     @property
     def num_of_dimensions(self) -> tuple[int, ...]:
         exemplary_embedding = self.get_exemplary_embedding()
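The base-class hook returns the exception unchanged by default, so encoders that do not override it keep their old behavior; subclasses such as the Bedrock encoder above translate provider failures into the shared taxonomy and re-raise via raise self.wrap_error(e=e). A minimal sketch of the override pattern; HypotheticalEncoder and its status-code mapping are illustrative only and not part of this release:

from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder
from unstructured_ingest.v2.errors import ProviderError, UserAuthError


class HypotheticalEncoder(BaseEmbeddingEncoder):
    # Abstract embedding methods omitted; this class only illustrates the hook.
    def wrap_error(self, e: Exception) -> Exception:
        # Map HTTP-style provider errors onto the shared taxonomy, falling back
        # to the original exception when no mapping applies.
        status = getattr(e, "status_code", None)
        if status in (401, 403):
            return UserAuthError(e)
        if status is not None and status >= 500:
            return ProviderError(e)
        return e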
|