unstructured-ingest 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in that registry.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/integration/connectors/{databricks_tests → databricks}/test_volumes_native.py +75 -19
- test/integration/connectors/sql/test_postgres.py +6 -2
- test/integration/connectors/sql/test_singlestore.py +6 -2
- test/integration/connectors/sql/test_snowflake.py +6 -2
- test/integration/connectors/sql/test_sqlite.py +6 -2
- test/integration/connectors/test_milvus.py +13 -0
- test/integration/connectors/test_onedrive.py +6 -0
- test/integration/connectors/test_redis.py +119 -0
- test/integration/connectors/test_vectara.py +270 -0
- test/integration/embedders/test_bedrock.py +28 -0
- test/integration/embedders/test_octoai.py +14 -0
- test/integration/embedders/test_openai.py +13 -0
- test/integration/embedders/test_togetherai.py +10 -0
- test/integration/partitioners/test_partitioner.py +2 -2
- test/unit/embed/test_octoai.py +8 -1
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/bedrock.py +39 -11
- unstructured_ingest/embed/interfaces.py +5 -0
- unstructured_ingest/embed/octoai.py +44 -3
- unstructured_ingest/embed/openai.py +37 -1
- unstructured_ingest/embed/togetherai.py +28 -1
- unstructured_ingest/embed/voyageai.py +33 -1
- unstructured_ingest/v2/errors.py +18 -0
- unstructured_ingest/v2/interfaces/file_data.py +11 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +7 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +2 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +0 -1
- unstructured_ingest/v2/processes/connectors/couchbase.py +2 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +2 -2
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +5 -2
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +14 -3
- unstructured_ingest/v2/processes/connectors/milvus.py +15 -6
- unstructured_ingest/v2/processes/connectors/mongodb.py +3 -4
- unstructured_ingest/v2/processes/connectors/neo4j.py +2 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +79 -25
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +0 -1
- unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +5 -0
- unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
- unstructured_ingest/v2/unstructured_api.py +25 -2
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/METADATA +20 -16
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/RECORD +52 -48
- test/integration/connectors/test_kafka.py +0 -304
- /test/integration/connectors/{databricks_tests → databricks}/__init__.py +0 -0
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/top_level.txt +0 -0
test/integration/connectors/databricks/test_volumes_native.py

@@ -1,10 +1,10 @@
 import json
 import os
-import tempfile
 import uuid
 from contextlib import contextmanager
 from dataclasses import dataclass
 from pathlib import Path
+from unittest import mock

 import pytest
 from databricks.sdk import WorkspaceClient
@@ -31,11 +31,15 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes_native impor


 @dataclass
-class EnvData:
+class BaseEnvData:
     host: str
+    catalog: str
+
+
+@dataclass
+class BasicAuthEnvData(BaseEnvData):
     client_id: str
     client_secret: str
-    catalog: str

     def get_connection_config(self) -> DatabricksNativeVolumesConnectionConfig:
         return DatabricksNativeVolumesConnectionConfig(
@@ -47,8 +51,21 @@ class EnvData:
         )


-
-
+@dataclass
+class PATEnvData(BaseEnvData):
+    token: str
+
+    def get_connection_config(self) -> DatabricksNativeVolumesConnectionConfig:
+        return DatabricksNativeVolumesConnectionConfig(
+            host=self.host,
+            access_config=DatabricksNativeVolumesAccessConfig(
+                token=self.token,
+            ),
+        )
+
+
+def get_basic_auth_env_data() -> BasicAuthEnvData:
+    return BasicAuthEnvData(
         host=os.environ["DATABRICKS_HOST"],
         client_id=os.environ["DATABRICKS_CLIENT_ID"],
         client_secret=os.environ["DATABRICKS_CLIENT_SECRET"],
@@ -56,23 +73,30 @@ def get_env_data() -> EnvData:
     )


+def get_pat_env_data() -> PATEnvData:
+    return PATEnvData(
+        host=os.environ["DATABRICKS_HOST"],
+        catalog=os.environ["DATABRICKS_CATALOG"],
+        token=os.environ["DATABRICKS_PAT"],
+    )
+
+
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
 @requires_env(
     "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
 )
-async def test_volumes_native_source():
-    env_data =
-
-
-
-
-
-
-
-
-        download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tempdir_path)
+async def test_volumes_native_source(tmp_path: Path):
+    env_data = get_basic_auth_env_data()
+    with mock.patch.dict(os.environ, clear=True):
+        indexer_config = DatabricksNativeVolumesIndexerConfig(
+            recursive=True,
+            volume="test-platform",
+            volume_path="databricks-volumes-test-input",
+            catalog=env_data.catalog,
+        )
+        connection_config = env_data.get_connection_config()
+        download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tmp_path)
         indexer = DatabricksNativeVolumesIndexer(
             connection_config=connection_config, index_config=indexer_config
         )
@@ -89,12 +113,44 @@ async def test_volumes_native_source():
         )


+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("DATABRICKS_HOST", "DATABRICKS_PAT", "DATABRICKS_CATALOG")
+async def test_volumes_native_source_pat(tmp_path: Path):
+    env_data = get_pat_env_data()
+    with mock.patch.dict(os.environ, clear=True):
+        indexer_config = DatabricksNativeVolumesIndexerConfig(
+            recursive=True,
+            volume="test-platform",
+            volume_path="databricks-volumes-test-input",
+            catalog=env_data.catalog,
+        )
+        connection_config = env_data.get_connection_config()
+        download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tmp_path)
+        indexer = DatabricksNativeVolumesIndexer(
+            connection_config=connection_config, index_config=indexer_config
+        )
+        downloader = DatabricksNativeVolumesDownloader(
+            connection_config=connection_config, download_config=download_config
+        )
+        await source_connector_validation(
+            indexer=indexer,
+            downloader=downloader,
+            configs=SourceValidationConfigs(
+                test_id="databricks_volumes_native_pat",
+                expected_num_files=1,
+            ),
+        )
+
+
 def _get_volume_path(catalog: str, volume: str, volume_path: str):
     return f"/Volumes/{catalog}/default/{volume}/{volume_path}"


 @contextmanager
-def databricks_destination_context(
+def databricks_destination_context(
+    env_data: BasicAuthEnvData, volume: str, volume_path
+) -> WorkspaceClient:
     client = WorkspaceClient(
         host=env_data.host, client_id=env_data.client_id, client_secret=env_data.client_secret
     )
@@ -137,7 +193,7 @@ def validate_upload(client: WorkspaceClient, catalog: str, volume: str, volume_p
     "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
 )
 async def test_volumes_native_destination(upload_file: Path):
-    env_data =
+    env_data = get_basic_auth_env_data()
     volume_path = f"databricks-volumes-test-output-{uuid.uuid4()}"
     file_data = FileData(
         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
test/integration/connectors/sql/test_postgres.py

@@ -15,7 +15,7 @@ from test.integration.connectors.utils.validation.source import (
     SourceValidationConfigs,
     source_connector_validation,
 )
-from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.sql.postgres import (
     CONNECTOR_TYPE,
     PostgresAccessConfig,
@@ -119,7 +119,11 @@ def validate_destination(
 async def test_postgres_destination(upload_file: Path, temp_dir: Path):
     # the postgres destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
-    mock_file_data = FileData(
+    mock_file_data = FileData(
+        identifier="mock file data",
+        connector_type=CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
+    )
     with docker_compose_context(
         docker_compose_path=env_setup_path / "sql" / "postgres" / "destination"
     ):
test/integration/connectors/sql/test_singlestore.py

@@ -15,7 +15,7 @@ from test.integration.connectors.utils.validation.source import (
     SourceValidationConfigs,
     source_connector_validation,
 )
-from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.sql.singlestore import (
     CONNECTOR_TYPE,
     SingleStoreAccessConfig,
@@ -103,7 +103,11 @@ def validate_destination(
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
 async def test_singlestore_destination(upload_file: Path, temp_dir: Path):
-    mock_file_data = FileData(
+    mock_file_data = FileData(
+        identifier="mock file data",
+        connector_type=CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
+    )
     with docker_compose_context(
         docker_compose_path=env_setup_path / "sql" / "singlestore" / "destination"
     ):
test/integration/connectors/sql/test_snowflake.py

@@ -17,7 +17,7 @@ from test.integration.connectors.utils.validation.source import (
     source_connector_validation,
 )
 from test.integration.utils import requires_env
-from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.sql.snowflake import (
     CONNECTOR_TYPE,
     SnowflakeAccessConfig,
@@ -170,7 +170,11 @@ async def test_snowflake_destination(
 ):
     # the postgres destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
-    mock_file_data = FileData(
+    mock_file_data = FileData(
+        identifier="mock file data",
+        connector_type=CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
+    )
     init_db_destination()
     stager = SnowflakeUploadStager()
     staged_path = stager.run(
test/integration/connectors/sql/test_sqlite.py

@@ -15,7 +15,7 @@ from test.integration.connectors.utils.validation.source import (
     SourceValidationConfigs,
     source_connector_validation,
 )
-from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
     CONNECTOR_TYPE,
     SQLiteConnectionConfig,
@@ -116,7 +116,11 @@ async def test_sqlite_destination(
 ):
     # the sqlite destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
-    mock_file_data = FileData(
+    mock_file_data = FileData(
+        identifier="mock file data",
+        connector_type=CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
+    )
     stager = SQLiteUploadStager()
     staged_path = stager.run(
         elements_filepath=upload_file,
test/integration/connectors/test_milvus.py

@@ -174,6 +174,19 @@ def test_precheck_fails_on_nonexistent_collection(collection: str):
     uploader.precheck()


+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+def test_precheck_fails_on_nonexisting_db(collection: str):
+    uploader = MilvusUploader(
+        connection_config=MilvusConnectionConfig(uri=DB_URI),
+        upload_config=MilvusUploaderConfig(db_name="nonexisting_db", collection_name=collection),
+    )
+    with pytest.raises(
+        DestinationConnectionError,
+        match="database not found",
+    ):
+        uploader.precheck()
+
+
 @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
 def test_milvus_stager(
     request: TopRequest,
test/integration/connectors/test_onedrive.py

@@ -20,6 +20,9 @@ from unstructured_ingest.v2.processes.connectors.onedrive import (


 @pytest.fixture
+@pytest.mark.xfail(
+    reason="Issues with test setup on the provider side."
+)  # TODO: remove line when issues are addressed
 def onedrive_test_folder() -> str:
     """
     Pytest fixture that creates a test folder in OneDrive and deletes it after test run.
@@ -66,6 +69,9 @@ def get_connection_config():

 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
 @requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
+@pytest.mark.xfail(
+    reason="Issues with test setup on the provider side."
+)  # TODO: remove line when issues are addressed
 def test_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
     """
     Integration test for the OneDrive destination connector.
test/integration/connectors/test_redis.py

@@ -0,0 +1,119 @@
+import asyncio
+import json
+import os
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+import pytest
+from redis import exceptions as redis_exceptions
+from redis.asyncio import Redis, from_url
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG
+from test.integration.utils import requires_env
+from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.redisdb import (
+    CONNECTOR_TYPE as REDIS_CONNECTOR_TYPE,
+)
+from unstructured_ingest.v2.processes.connectors.redisdb import (
+    RedisAccessConfig,
+    RedisConnectionConfig,
+    RedisUploader,
+    RedisUploaderConfig,
+)
+
+
+async def delete_record(client: Redis, element_id: str) -> None:
+    await client.delete(element_id)
+
+
+async def validate_upload(client: Redis, first_element: dict):
+    element_id = first_element["element_id"]
+    expected_text = first_element["text"]
+    expected_embeddings = first_element["embeddings"]
+    async with client.pipeline(transaction=True) as pipe:
+        try:
+            response = await pipe.json().get(element_id, "$").execute()
+            response = response[0][0]
+        except redis_exceptions.ResponseError:
+            response = await pipe.get(element_id).execute()
+            response = json.loads(response[0])
+
+    embedding_similarity = np.linalg.norm(
+        np.array(response["embeddings"]) - np.array(expected_embeddings)
+    )
+
+    assert response is not None
+    assert response["element_id"] == element_id
+    assert response["text"] == expected_text
+    assert embedding_similarity < 1e-10
+
+
+async def redis_destination_test(
+    upload_file: Path,
+    tmp_path: Path,
+    connection_kwargs: dict,
+    uri: Optional[str] = None,
+    password: Optional[str] = None,
+):
+    uploader = RedisUploader(
+        connection_config=RedisConnectionConfig(
+            **connection_kwargs, access_config=RedisAccessConfig(uri=uri, password=password)
+        ),
+        upload_config=RedisUploaderConfig(batch_size=10),
+    )
+
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=REDIS_CONNECTOR_TYPE,
+        identifier="mock-file-data",
+    )
+    with upload_file.open() as upload_fp:
+        elements = json.load(upload_fp)
+        first_element = elements[0]
+
+    try:
+        if uploader.is_async():
+            await uploader.run_data_async(data=elements, file_data=file_data)
+
+        if uri:
+            async with from_url(uri) as client:
+                await validate_upload(client=client, first_element=first_element)
+        else:
+            async with Redis(**connection_kwargs, password=password) as client:
+                await validate_upload(client=client, first_element=first_element)
+    except Exception as e:
+        raise e
+    finally:
+        if uri:
+            async with from_url(uri) as client:
+                tasks = [delete_record(client, element["element_id"]) for element in elements]
+                await asyncio.gather(*tasks)
+        else:
+            async with Redis(**connection_kwargs, password=password) as client:
+                tasks = [delete_record(client, element["element_id"]) for element in elements]
+                await asyncio.gather(*tasks)
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG)
+@requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
+async def test_redis_destination_azure_with_password(upload_file: Path, tmp_path: Path):
+    connection_kwargs = {
+        "host": "utic-dashboard-dev.redis.cache.windows.net",
+        "port": 6380,
+        "db": 0,
+        "ssl": True,
+    }
+    redis_pw = os.environ["AZURE_REDIS_INGEST_TEST_PASSWORD"]
+    await redis_destination_test(upload_file, tmp_path, connection_kwargs, password=redis_pw)
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG, "redis")
+@requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
+async def test_redis_destination_azure_with_uri(upload_file: Path, tmp_path: Path):
+    connection_kwargs = {}
+    redis_pw = os.environ["AZURE_REDIS_INGEST_TEST_PASSWORD"]
+    uri = f"rediss://:{redis_pw}@utic-dashboard-dev.redis.cache.windows.net:6380/0"
+    await redis_destination_test(upload_file, tmp_path, connection_kwargs, uri=uri)
test/integration/connectors/test_vectara.py

@@ -0,0 +1,270 @@
+import json
+import os
+import time
+from pathlib import Path
+from typing import Generator
+from uuid import uuid4
+
+import pytest
+import requests
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG
+from test.integration.utils import requires_env
+from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connectors.vectara import (
+    CONNECTOR_TYPE as VECTARA_CONNECTOR_TYPE,
+)
+from unstructured_ingest.v2.processes.connectors.vectara import (
+    VectaraAccessConfig,
+    VectaraConnectionConfig,
+    VectaraUploader,
+    VectaraUploaderConfig,
+    VectaraUploadStager,
+    VectaraUploadStagerConfig,
+)
+
+
+def validate_upload(response: dict, expected_data: dict):
+    element_id = expected_data["element_id"]
+    expected_text = expected_data["text"]
+    filename = expected_data["metadata"]["filename"]
+    filetype = expected_data["metadata"]["filetype"]
+    page_number = expected_data["metadata"]["page_number"]
+
+    response = response["search_results"][0]
+
+    assert response is not None
+    assert response["text"] == expected_text
+    assert response["part_metadata"]["element_id"] == element_id
+    assert response["part_metadata"]["filename"] == filename
+    assert response["part_metadata"]["filetype"] == filetype
+    assert response["part_metadata"]["page_number"] == page_number
+
+
+@requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
+def _get_jwt_token():
+    """Connect to the server and get a JWT token."""
+    customer_id = os.environ["VECTARA_CUSTOMER_ID"]
+    token_endpoint = (
+        f"https://vectara-prod-{customer_id}.auth.us-west-2.amazoncognito.com/oauth2/token"
+    )
+    headers = {
+        "Content-Type": "application/x-www-form-urlencoded",
+    }
+    data = {
+        "grant_type": "client_credentials",
+        "client_id": os.environ["VECTARA_OAUTH_CLIENT_ID"],
+        "client_secret": os.environ["VECTARA_OAUTH_SECRET"],
+    }
+
+    response = requests.post(token_endpoint, headers=headers, data=data)
+    response.raise_for_status()
+    response_json = response.json()
+
+    return response_json.get("access_token")
+
+
+def query_data(corpus_key: str, element_id: str) -> dict:
+
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/query"
+
+    # the query below requires the corpus to have filter attributes for element_id
+
+    data = json.dumps(
+        {
+            "query": "string",
+            "search": {
+                "metadata_filter": f"part.element_id = '{element_id}'",
+                "lexical_interpolation": 1,
+                "limit": 10,
+            },
+        }
+    )
+
+    jwt_token = _get_jwt_token()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {jwt_token}",
+        "X-source": "unstructured",
+    }
+
+    response = requests.post(url, headers=headers, data=data)
+    response.raise_for_status()
+    response_json = response.json()
+
+    return response_json
+
+
+def create_corpora(corpus_key: str, corpus_name: str) -> None:
+    url = "https://api.vectara.io/v2/corpora"
+    data = json.dumps({"key": corpus_key, "name": corpus_name, "description": "integration test"})
+    jwt_token = _get_jwt_token()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {jwt_token}",
+        "X-source": "unstructured",
+    }
+
+    response = requests.post(url, headers=headers, data=data)
+    response.raise_for_status()
+
+
+def replace_filter_attributes(corpus_key: str) -> None:
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/replace_filter_attributes"
+    data = json.dumps(
+        {
+            "filter_attributes": [
+                {"name": "element_id", "level": "part", "indexed": True, "type": "text"}
+            ]
+        }
+    )
+    jwt_token = _get_jwt_token()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {jwt_token}",
+        "X-source": "unstructured",
+    }
+
+    response = requests.post(url, headers=headers, data=data)
+    response.raise_for_status()
+
+
+def delete_corpora(corpus_key: str) -> None:
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}"
+
+    jwt_token = _get_jwt_token()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {jwt_token}",
+        "X-source": "unstructured",
+    }
+
+    response = requests.delete(url, headers=headers)
+    response.raise_for_status()
+
+
+def list_corpora() -> list:
+    url = "https://api.vectara.io/v2/corpora?limit=100"
+    jwt_token = _get_jwt_token()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {jwt_token}",
+        "X-source": "unstructured",
+    }
+    response = requests.get(url, headers=headers)
+    response.raise_for_status()
+    response_json = response.json()
+    if response_json.get("corpora"):
+        return [item["key"] for item in response_json.get("corpora")]
+    else:
+        return []
+
+
+def wait_for_ready(corpus_key: str, timeout=60, interval=2) -> None:
+    def is_ready_status():
+        corpora_list = list_corpora()
+        return corpus_key in corpora_list
+
+    start = time.time()
+    is_ready = is_ready_status()
+    while not is_ready and time.time() - start < timeout:
+        time.sleep(interval)
+        is_ready = is_ready_status()
+    if not is_ready:
+        raise TimeoutError("time out waiting for corpus to be ready")
+
+
+def wait_for_delete(corpus_key: str, timeout=60, interval=2) -> None:
+    start = time.time()
+    while time.time() - start < timeout:
+        corpora_list = list_corpora()
+        if corpus_key not in corpora_list:
+            return
+        time.sleep(interval)
+
+    raise TimeoutError("time out waiting for corpus to delete")
+
+
+@pytest.fixture
+def corpora_util() -> Generator[str, None, None]:
+    random_id = str(uuid4()).split("-")[0]
+    corpus_key = f"ingest-test-{random_id}"
+    corpus_name = "ingest-test"
+    logger.info(f"Creating corpus with key: {corpus_key}")
+    try:
+        create_corpora(corpus_key, corpus_name)
+        replace_filter_attributes(corpus_key)
+        wait_for_ready(corpus_key=corpus_key)
+        yield corpus_key
+    except Exception as e:
+        logger.error(f"failed to create corpus {corpus_key}: {e}")
+    finally:
+        logger.info(f"deleting corpus: {corpus_key}")
+        delete_corpora(corpus_key)
+        wait_for_delete(corpus_key=corpus_key)
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(VECTARA_CONNECTOR_TYPE, DESTINATION_TAG, "vectara")
+@requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
+async def test_vectara_destination(
+    upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=10
+):
+    corpus_key = corpora_util
+    connection_kwargs = {
+        "customer_id": os.environ["VECTARA_CUSTOMER_ID"],
+        "corpus_key": corpus_key,
+    }
+
+    oauth_client_id = os.environ["VECTARA_OAUTH_CLIENT_ID"]
+    oauth_secret = os.environ["VECTARA_OAUTH_SECRET"]
+
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=VECTARA_CONNECTOR_TYPE,
+        identifier="mock-file-data",
+    )
+
+    stager_config = VectaraUploadStagerConfig(batch_size=10)
+    stager = VectaraUploadStager(upload_stager_config=stager_config)
+    new_upload_file = stager.run(
+        elements_filepath=upload_file,
+        output_dir=tmp_path,
+        output_filename=upload_file.name,
+        file_data=file_data,
+    )
+
+    uploader = VectaraUploader(
+        connection_config=VectaraConnectionConfig(
+            **connection_kwargs,
+            access_config=VectaraAccessConfig(
+                oauth_client_id=oauth_client_id, oauth_secret=oauth_secret
+            ),
+        ),
+        upload_config=VectaraUploaderConfig(),
+    )
+
+    with new_upload_file.open() as new_upload_fp:
+        elements_stager = json.load(new_upload_fp)
+
+    if uploader.is_async():
+        await uploader.run_data_async(data=elements_stager, file_data=file_data)
+
+    with upload_file.open() as upload_fp:
+        elements = json.load(upload_fp)
+        first_element = elements[0]
+
+    for i in range(retries):
+        response = query_data(corpus_key, first_element["element_id"])
+        if not response["search_results"]:
+            time.sleep(interval)
+        else:
+            break
+
+    validate_upload(response=response, expected_data=first_element)