unstructured-ingest 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (53)
  1. test/integration/connectors/{databricks_tests → databricks}/test_volumes_native.py +75 -19
  2. test/integration/connectors/sql/test_postgres.py +6 -2
  3. test/integration/connectors/sql/test_singlestore.py +6 -2
  4. test/integration/connectors/sql/test_snowflake.py +6 -2
  5. test/integration/connectors/sql/test_sqlite.py +6 -2
  6. test/integration/connectors/test_milvus.py +13 -0
  7. test/integration/connectors/test_onedrive.py +6 -0
  8. test/integration/connectors/test_redis.py +119 -0
  9. test/integration/connectors/test_vectara.py +270 -0
  10. test/integration/embedders/test_bedrock.py +28 -0
  11. test/integration/embedders/test_octoai.py +14 -0
  12. test/integration/embedders/test_openai.py +13 -0
  13. test/integration/embedders/test_togetherai.py +10 -0
  14. test/integration/partitioners/test_partitioner.py +2 -2
  15. test/unit/embed/test_octoai.py +8 -1
  16. unstructured_ingest/__version__.py +1 -1
  17. unstructured_ingest/embed/bedrock.py +39 -11
  18. unstructured_ingest/embed/interfaces.py +5 -0
  19. unstructured_ingest/embed/octoai.py +44 -3
  20. unstructured_ingest/embed/openai.py +37 -1
  21. unstructured_ingest/embed/togetherai.py +28 -1
  22. unstructured_ingest/embed/voyageai.py +33 -1
  23. unstructured_ingest/v2/errors.py +18 -0
  24. unstructured_ingest/v2/interfaces/file_data.py +11 -1
  25. unstructured_ingest/v2/processes/connectors/__init__.py +7 -0
  26. unstructured_ingest/v2/processes/connectors/astradb.py +2 -0
  27. unstructured_ingest/v2/processes/connectors/chroma.py +0 -1
  28. unstructured_ingest/v2/processes/connectors/couchbase.py +2 -0
  29. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -0
  30. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +2 -2
  31. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +2 -2
  32. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +2 -2
  33. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +2 -2
  34. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +1 -1
  35. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +5 -2
  36. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +14 -3
  37. unstructured_ingest/v2/processes/connectors/milvus.py +15 -6
  38. unstructured_ingest/v2/processes/connectors/mongodb.py +3 -4
  39. unstructured_ingest/v2/processes/connectors/neo4j.py +2 -0
  40. unstructured_ingest/v2/processes/connectors/onedrive.py +79 -25
  41. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +0 -1
  42. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  43. unstructured_ingest/v2/processes/connectors/sql/sql.py +5 -0
  44. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  45. unstructured_ingest/v2/unstructured_api.py +25 -2
  46. {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/METADATA +20 -16
  47. {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/RECORD +52 -48
  48. test/integration/connectors/test_kafka.py +0 -304
  49. /test/integration/connectors/{databricks_tests → databricks}/__init__.py +0 -0
  50. {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/LICENSE.md +0 -0
  51. {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/WHEEL +0 -0
  52. {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/entry_points.txt +0 -0
  53. {unstructured_ingest-0.3.10.dist-info → unstructured_ingest-0.3.12.dist-info}/top_level.txt +0 -0

test/integration/connectors/databricks/test_volumes_native.py

@@ -1,10 +1,10 @@
 import json
 import os
-import tempfile
 import uuid
 from contextlib import contextmanager
 from dataclasses import dataclass
 from pathlib import Path
+from unittest import mock

 import pytest
 from databricks.sdk import WorkspaceClient
@@ -31,11 +31,15 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes_native impor


 @dataclass
-class EnvData:
+class BaseEnvData:
     host: str
+    catalog: str
+
+
+@dataclass
+class BasicAuthEnvData(BaseEnvData):
     client_id: str
     client_secret: str
-    catalog: str

     def get_connection_config(self) -> DatabricksNativeVolumesConnectionConfig:
         return DatabricksNativeVolumesConnectionConfig(
@@ -47,8 +51,21 @@ class EnvData:
     )


-def get_env_data() -> EnvData:
-    return EnvData(
+@dataclass
+class PATEnvData(BaseEnvData):
+    token: str
+
+    def get_connection_config(self) -> DatabricksNativeVolumesConnectionConfig:
+        return DatabricksNativeVolumesConnectionConfig(
+            host=self.host,
+            access_config=DatabricksNativeVolumesAccessConfig(
+                token=self.token,
+            ),
+        )
+
+
+def get_basic_auth_env_data() -> BasicAuthEnvData:
+    return BasicAuthEnvData(
         host=os.environ["DATABRICKS_HOST"],
         client_id=os.environ["DATABRICKS_CLIENT_ID"],
         client_secret=os.environ["DATABRICKS_CLIENT_SECRET"],
@@ -56,23 +73,30 @@ def get_env_data() -> EnvData:
     )


+def get_pat_env_data() -> PATEnvData:
+    return PATEnvData(
+        host=os.environ["DATABRICKS_HOST"],
+        catalog=os.environ["DATABRICKS_CATALOG"],
+        token=os.environ["DATABRICKS_PAT"],
+    )
+
+
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
 @requires_env(
     "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
 )
-async def test_volumes_native_source():
-    env_data = get_env_data()
-    indexer_config = DatabricksNativeVolumesIndexerConfig(
-        recursive=True,
-        volume="test-platform",
-        volume_path="databricks-volumes-test-input",
-        catalog=env_data.catalog,
-    )
-    connection_config = env_data.get_connection_config()
-    with tempfile.TemporaryDirectory() as tempdir:
-        tempdir_path = Path(tempdir)
-        download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tempdir_path)
+async def test_volumes_native_source(tmp_path: Path):
+    env_data = get_basic_auth_env_data()
+    with mock.patch.dict(os.environ, clear=True):
+        indexer_config = DatabricksNativeVolumesIndexerConfig(
+            recursive=True,
+            volume="test-platform",
+            volume_path="databricks-volumes-test-input",
+            catalog=env_data.catalog,
+        )
+        connection_config = env_data.get_connection_config()
+        download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tmp_path)
         indexer = DatabricksNativeVolumesIndexer(
             connection_config=connection_config, index_config=indexer_config
         )
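
The reworked source test builds all connector configuration inside mock.patch.dict(os.environ, clear=True), which empties the process environment for the duration of the block and restores it afterwards, so nothing is picked up implicitly from ambient Databricks variables. A minimal sketch of that stdlib pattern, with a placeholder variable name:

    import os
    from unittest import mock

    os.environ["EXAMPLE_SECRET"] = "from-the-outer-environment"  # placeholder value

    with mock.patch.dict(os.environ, clear=True):
        # Inside the block the environment is empty, so any code that silently
        # reads os.environ fails fast instead of reusing ambient credentials.
        assert "EXAMPLE_SECRET" not in os.environ

    # patch.dict restores the original environment once the block exits.
    assert os.environ["EXAMPLE_SECRET"] == "from-the-outer-environment"
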
@@ -89,12 +113,44 @@ async def test_volumes_native_source():
     )


+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("DATABRICKS_HOST", "DATABRICKS_PAT", "DATABRICKS_CATALOG")
+async def test_volumes_native_source_pat(tmp_path: Path):
+    env_data = get_pat_env_data()
+    with mock.patch.dict(os.environ, clear=True):
+        indexer_config = DatabricksNativeVolumesIndexerConfig(
+            recursive=True,
+            volume="test-platform",
+            volume_path="databricks-volumes-test-input",
+            catalog=env_data.catalog,
+        )
+        connection_config = env_data.get_connection_config()
+        download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tmp_path)
+        indexer = DatabricksNativeVolumesIndexer(
+            connection_config=connection_config, index_config=indexer_config
+        )
+        downloader = DatabricksNativeVolumesDownloader(
+            connection_config=connection_config, download_config=download_config
+        )
+        await source_connector_validation(
+            indexer=indexer,
+            downloader=downloader,
+            configs=SourceValidationConfigs(
+                test_id="databricks_volumes_native_pat",
+                expected_num_files=1,
+            ),
+        )
+
+
 def _get_volume_path(catalog: str, volume: str, volume_path: str):
     return f"/Volumes/{catalog}/default/{volume}/{volume_path}"


 @contextmanager
-def databricks_destination_context(env_data: EnvData, volume: str, volume_path) -> WorkspaceClient:
+def databricks_destination_context(
+    env_data: BasicAuthEnvData, volume: str, volume_path
+) -> WorkspaceClient:
     client = WorkspaceClient(
         host=env_data.host, client_id=env_data.client_id, client_secret=env_data.client_secret
     )
@@ -137,7 +193,7 @@ def validate_upload(client: WorkspaceClient, catalog: str, volume: str, volume_p
     "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
 )
 async def test_volumes_native_destination(upload_file: Path):
-    env_data = get_env_data()
+    env_data = get_basic_auth_env_data()
     volume_path = f"databricks-volumes-test-output-{uuid.uuid4()}"
     file_data = FileData(
         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
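
With EnvData split into BasicAuthEnvData and PATEnvData, the same connection-config type now covers both OAuth client credentials and personal access tokens. A hedged sketch of constructing each variant directly; the host and credential values are placeholders, and the client-credential keyword names are inferred from BasicAuthEnvData's fields rather than shown in this diff:

    from unstructured_ingest.v2.processes.connectors.databricks.volumes_native import (
        DatabricksNativeVolumesAccessConfig,
        DatabricksNativeVolumesConnectionConfig,
    )

    # PAT-based config, mirroring PATEnvData.get_connection_config() above.
    pat_config = DatabricksNativeVolumesConnectionConfig(
        host="https://example.cloud.databricks.com",  # placeholder workspace host
        access_config=DatabricksNativeVolumesAccessConfig(token="example-pat"),
    )

    # Client-credential config; field names assumed to match BasicAuthEnvData.
    oauth_config = DatabricksNativeVolumesConnectionConfig(
        host="https://example.cloud.databricks.com",
        access_config=DatabricksNativeVolumesAccessConfig(
            client_id="example-client-id",
            client_secret="example-client-secret",
        ),
    )
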

test/integration/connectors/sql/test_postgres.py

@@ -15,7 +15,7 @@ from test.integration.connectors.utils.validation.source import (
     SourceValidationConfigs,
     source_connector_validation,
 )
-from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.sql.postgres import (
     CONNECTOR_TYPE,
     PostgresAccessConfig,
@@ -119,7 +119,11 @@ def validate_destination(
 async def test_postgres_destination(upload_file: Path, temp_dir: Path):
     # the postgres destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
-    mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
+    mock_file_data = FileData(
+        identifier="mock file data",
+        connector_type=CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
+    )
     with docker_compose_context(
         docker_compose_path=env_setup_path / "sql" / "postgres" / "destination"
     ):
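
The same change recurs in the singlestore, snowflake, and sqlite tests below: the mock FileData now carries explicit SourceIdentifiers rather than only an identifier and connector type, in line with the file_data.py interface change listed above. A minimal sketch of the pattern on its own, with placeholder values:

    from pathlib import Path

    from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
    from unstructured_ingest.v2.processes.connectors.sql.postgres import CONNECTOR_TYPE

    upload_file = Path("example-elements.json")  # placeholder elements file

    mock_file_data = FileData(
        identifier="mock file data",
        connector_type=CONNECTOR_TYPE,
        source_identifiers=SourceIdentifiers(
            filename=upload_file.name,
            fullpath=upload_file.name,
        ),
    )
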

test/integration/connectors/sql/test_singlestore.py

@@ -15,7 +15,7 @@ from test.integration.connectors.utils.validation.source import (
     SourceValidationConfigs,
     source_connector_validation,
 )
-from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.sql.singlestore import (
     CONNECTOR_TYPE,
     SingleStoreAccessConfig,
@@ -103,7 +103,11 @@ def validate_destination(
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
 async def test_singlestore_destination(upload_file: Path, temp_dir: Path):
-    mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
+    mock_file_data = FileData(
+        identifier="mock file data",
+        connector_type=CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
+    )
     with docker_compose_context(
         docker_compose_path=env_setup_path / "sql" / "singlestore" / "destination"
     ):

test/integration/connectors/sql/test_snowflake.py

@@ -17,7 +17,7 @@ from test.integration.connectors.utils.validation.source import (
     source_connector_validation,
 )
 from test.integration.utils import requires_env
-from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.sql.snowflake import (
     CONNECTOR_TYPE,
     SnowflakeAccessConfig,
@@ -170,7 +170,11 @@ async def test_snowflake_destination(
 ):
     # the postgres destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
-    mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
+    mock_file_data = FileData(
+        identifier="mock file data",
+        connector_type=CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
+    )
     init_db_destination()
     stager = SnowflakeUploadStager()
     staged_path = stager.run(

test/integration/connectors/sql/test_sqlite.py

@@ -15,7 +15,7 @@ from test.integration.connectors.utils.validation.source import (
     SourceValidationConfigs,
     source_connector_validation,
 )
-from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
     CONNECTOR_TYPE,
     SQLiteConnectionConfig,
@@ -116,7 +116,11 @@ async def test_sqlite_destination(
 ):
     # the sqlite destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
-    mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
+    mock_file_data = FileData(
+        identifier="mock file data",
+        connector_type=CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
+    )
     stager = SQLiteUploadStager()
     staged_path = stager.run(
         elements_filepath=upload_file,

test/integration/connectors/test_milvus.py

@@ -174,6 +174,19 @@ def test_precheck_fails_on_nonexistent_collection(collection: str):
         uploader.precheck()


+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+def test_precheck_fails_on_nonexisting_db(collection: str):
+    uploader = MilvusUploader(
+        connection_config=MilvusConnectionConfig(uri=DB_URI),
+        upload_config=MilvusUploaderConfig(db_name="nonexisting_db", collection_name=collection),
+    )
+    with pytest.raises(
+        DestinationConnectionError,
+        match="database not found",
+    ):
+        uploader.precheck()
+
+
 @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
 def test_milvus_stager(
     request: TopRequest,
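
The added test checks that precheck() now fails fast when the configured Milvus database does not exist. A hedged sketch of the same precheck from application code; the URI and names are placeholders, and the import path and error type are assumptions based on the names used in the test:

    from unstructured_ingest.v2.processes.connectors.milvus import (
        MilvusConnectionConfig,
        MilvusUploader,
        MilvusUploaderConfig,
    )

    uploader = MilvusUploader(
        connection_config=MilvusConnectionConfig(uri="http://localhost:19530"),  # placeholder URI
        upload_config=MilvusUploaderConfig(db_name="my_db", collection_name="my_collection"),
    )

    try:
        # Verifies connectivity and that the database and collection exist
        # before any documents are written.
        uploader.precheck()
    except Exception as e:  # the test expects DestinationConnectionError("database not found")
        print(f"Milvus destination precheck failed: {e}")
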

test/integration/connectors/test_onedrive.py

@@ -20,6 +20,9 @@ from unstructured_ingest.v2.processes.connectors.onedrive import (


 @pytest.fixture
+@pytest.mark.xfail(
+    reason="Issues with test setup on the provider side."
+)  # TODO: remove line when issues are addressed
 def onedrive_test_folder() -> str:
     """
     Pytest fixture that creates a test folder in OneDrive and deletes it after test run.
@@ -66,6 +69,9 @@ def get_connection_config():

 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
 @requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
+@pytest.mark.xfail(
+    reason="Issues with test setup on the provider side."
+)  # TODO: remove line when issues are addressed
 def test_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
     """
     Integration test for the OneDrive destination connector.
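
Both OneDrive changes add pytest.mark.xfail with a provider-side reason. The pytest docs note that marks applied to fixtures have no effect, so the copy on the onedrive_test_folder fixture likely changes nothing; the conventional placement is on the test function itself. A small sketch with a generic test name as an illustration:

    import pytest

    @pytest.mark.xfail(reason="Issues with test setup on the provider side.")
    def test_example_against_flaky_provider():
        # An xfail test still runs; a failure is reported as "xfailed" rather
        # than breaking the suite, and an unexpected pass is reported as "xpassed".
        ...
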

test/integration/connectors/test_redis.py

@@ -0,0 +1,119 @@
+import asyncio
+import json
+import os
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+import pytest
+from redis import exceptions as redis_exceptions
+from redis.asyncio import Redis, from_url
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG
+from test.integration.utils import requires_env
+from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.redisdb import (
+    CONNECTOR_TYPE as REDIS_CONNECTOR_TYPE,
+)
+from unstructured_ingest.v2.processes.connectors.redisdb import (
+    RedisAccessConfig,
+    RedisConnectionConfig,
+    RedisUploader,
+    RedisUploaderConfig,
+)
+
+
+async def delete_record(client: Redis, element_id: str) -> None:
+    await client.delete(element_id)
+
+
+async def validate_upload(client: Redis, first_element: dict):
+    element_id = first_element["element_id"]
+    expected_text = first_element["text"]
+    expected_embeddings = first_element["embeddings"]
+    async with client.pipeline(transaction=True) as pipe:
+        try:
+            response = await pipe.json().get(element_id, "$").execute()
+            response = response[0][0]
+        except redis_exceptions.ResponseError:
+            response = await pipe.get(element_id).execute()
+            response = json.loads(response[0])
+
+    embedding_similarity = np.linalg.norm(
+        np.array(response["embeddings"]) - np.array(expected_embeddings)
+    )
+
+    assert response is not None
+    assert response["element_id"] == element_id
+    assert response["text"] == expected_text
+    assert embedding_similarity < 1e-10
+
+
+async def redis_destination_test(
+    upload_file: Path,
+    tmp_path: Path,
+    connection_kwargs: dict,
+    uri: Optional[str] = None,
+    password: Optional[str] = None,
+):
+    uploader = RedisUploader(
+        connection_config=RedisConnectionConfig(
+            **connection_kwargs, access_config=RedisAccessConfig(uri=uri, password=password)
+        ),
+        upload_config=RedisUploaderConfig(batch_size=10),
+    )
+
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=REDIS_CONNECTOR_TYPE,
+        identifier="mock-file-data",
+    )
+    with upload_file.open() as upload_fp:
+        elements = json.load(upload_fp)
+        first_element = elements[0]
+
+    try:
+        if uploader.is_async():
+            await uploader.run_data_async(data=elements, file_data=file_data)
+
+        if uri:
+            async with from_url(uri) as client:
+                await validate_upload(client=client, first_element=first_element)
+        else:
+            async with Redis(**connection_kwargs, password=password) as client:
+                await validate_upload(client=client, first_element=first_element)
+    except Exception as e:
+        raise e
+    finally:
+        if uri:
+            async with from_url(uri) as client:
+                tasks = [delete_record(client, element["element_id"]) for element in elements]
+                await asyncio.gather(*tasks)
+        else:
+            async with Redis(**connection_kwargs, password=password) as client:
+                tasks = [delete_record(client, element["element_id"]) for element in elements]
+                await asyncio.gather(*tasks)
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG)
+@requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
+async def test_redis_destination_azure_with_password(upload_file: Path, tmp_path: Path):
+    connection_kwargs = {
+        "host": "utic-dashboard-dev.redis.cache.windows.net",
+        "port": 6380,
+        "db": 0,
+        "ssl": True,
+    }
+    redis_pw = os.environ["AZURE_REDIS_INGEST_TEST_PASSWORD"]
+    await redis_destination_test(upload_file, tmp_path, connection_kwargs, password=redis_pw)
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG, "redis")
+@requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
+async def test_redis_destination_azure_with_uri(upload_file: Path, tmp_path: Path):
+    connection_kwargs = {}
+    redis_pw = os.environ["AZURE_REDIS_INGEST_TEST_PASSWORD"]
+    uri = f"rediss://:{redis_pw}@utic-dashboard-dev.redis.cache.windows.net:6380/0"
+    await redis_destination_test(upload_file, tmp_path, connection_kwargs, uri=uri)
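
The new redisdb destination connector (shipped in unstructured_ingest/v2/processes/connectors/redisdb.py above) is exercised here against an Azure-hosted Redis. A hedged sketch of writing a few elements to a local, unauthenticated Redis instead; the host, element values, and identifiers are placeholders, while the class names and methods mirror the test above:

    import asyncio

    from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
    from unstructured_ingest.v2.processes.connectors.redisdb import (
        CONNECTOR_TYPE,
        RedisAccessConfig,
        RedisConnectionConfig,
        RedisUploader,
        RedisUploaderConfig,
    )

    # Placeholder: a local Redis with no password and a single fake element.
    uploader = RedisUploader(
        connection_config=RedisConnectionConfig(
            host="localhost",
            port=6379,
            db=0,
            ssl=False,
            access_config=RedisAccessConfig(uri=None, password=None),
        ),
        upload_config=RedisUploaderConfig(batch_size=10),
    )

    file_data = FileData(
        identifier="local-smoke-test",
        connector_type=CONNECTOR_TYPE,
        source_identifiers=SourceIdentifiers(fullpath="example.json", filename="example.json"),
    )

    elements = [{"element_id": "example-1", "text": "hello", "embeddings": [0.0, 1.0]}]

    async def main():
        # The uploader writes each element under its element_id key.
        if uploader.is_async():
            await uploader.run_data_async(data=elements, file_data=file_data)

    asyncio.run(main())
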

test/integration/connectors/test_vectara.py

@@ -0,0 +1,270 @@
+import json
+import os
+import time
+from pathlib import Path
+from typing import Generator
+from uuid import uuid4
+
+import pytest
+import requests
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG
+from test.integration.utils import requires_env
+from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connectors.vectara import (
+    CONNECTOR_TYPE as VECTARA_CONNECTOR_TYPE,
+)
+from unstructured_ingest.v2.processes.connectors.vectara import (
+    VectaraAccessConfig,
+    VectaraConnectionConfig,
+    VectaraUploader,
+    VectaraUploaderConfig,
+    VectaraUploadStager,
+    VectaraUploadStagerConfig,
+)
+
+
+def validate_upload(response: dict, expected_data: dict):
+    element_id = expected_data["element_id"]
+    expected_text = expected_data["text"]
+    filename = expected_data["metadata"]["filename"]
+    filetype = expected_data["metadata"]["filetype"]
+    page_number = expected_data["metadata"]["page_number"]
+
+    response = response["search_results"][0]
+
+    assert response is not None
+    assert response["text"] == expected_text
+    assert response["part_metadata"]["element_id"] == element_id
+    assert response["part_metadata"]["filename"] == filename
+    assert response["part_metadata"]["filetype"] == filetype
+    assert response["part_metadata"]["page_number"] == page_number
+
+
+@requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
+def _get_jwt_token():
+    """Connect to the server and get a JWT token."""
+    customer_id = os.environ["VECTARA_CUSTOMER_ID"]
+    token_endpoint = (
+        f"https://vectara-prod-{customer_id}.auth.us-west-2.amazoncognito.com/oauth2/token"
+    )
+    headers = {
+        "Content-Type": "application/x-www-form-urlencoded",
+    }
+    data = {
+        "grant_type": "client_credentials",
+        "client_id": os.environ["VECTARA_OAUTH_CLIENT_ID"],
+        "client_secret": os.environ["VECTARA_OAUTH_SECRET"],
+    }
+
+    response = requests.post(token_endpoint, headers=headers, data=data)
+    response.raise_for_status()
+    response_json = response.json()
+
+    return response_json.get("access_token")
+
+
+def query_data(corpus_key: str, element_id: str) -> dict:
+
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/query"
+
+    # the query below requires the corpus to have filter attributes for element_id
+
+    data = json.dumps(
+        {
+            "query": "string",
+            "search": {
+                "metadata_filter": f"part.element_id = '{element_id}'",
+                "lexical_interpolation": 1,
+                "limit": 10,
+            },
+        }
+    )
+
+    jwt_token = _get_jwt_token()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {jwt_token}",
+        "X-source": "unstructured",
+    }
+
+    response = requests.post(url, headers=headers, data=data)
+    response.raise_for_status()
+    response_json = response.json()
+
+    return response_json
+
+
+def create_corpora(corpus_key: str, corpus_name: str) -> None:
+    url = "https://api.vectara.io/v2/corpora"
+    data = json.dumps({"key": corpus_key, "name": corpus_name, "description": "integration test"})
+    jwt_token = _get_jwt_token()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {jwt_token}",
+        "X-source": "unstructured",
+    }
+
+    response = requests.post(url, headers=headers, data=data)
+    response.raise_for_status()
+
+
+def replace_filter_attributes(corpus_key: str) -> None:
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/replace_filter_attributes"
+    data = json.dumps(
+        {
+            "filter_attributes": [
+                {"name": "element_id", "level": "part", "indexed": True, "type": "text"}
+            ]
+        }
+    )
+    jwt_token = _get_jwt_token()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {jwt_token}",
+        "X-source": "unstructured",
+    }
+
+    response = requests.post(url, headers=headers, data=data)
+    response.raise_for_status()
+
+
+def delete_corpora(corpus_key: str) -> None:
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}"
+
+    jwt_token = _get_jwt_token()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {jwt_token}",
+        "X-source": "unstructured",
+    }
+
+    response = requests.delete(url, headers=headers)
+    response.raise_for_status()
+
+
+def list_corpora() -> list:
+    url = "https://api.vectara.io/v2/corpora?limit=100"
+    jwt_token = _get_jwt_token()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {jwt_token}",
+        "X-source": "unstructured",
+    }
+    response = requests.get(url, headers=headers)
+    response.raise_for_status()
+    response_json = response.json()
+    if response_json.get("corpora"):
+        return [item["key"] for item in response_json.get("corpora")]
+    else:
+        return []
+
+
+def wait_for_ready(corpus_key: str, timeout=60, interval=2) -> None:
+    def is_ready_status():
+        corpora_list = list_corpora()
+        return corpus_key in corpora_list
+
+    start = time.time()
+    is_ready = is_ready_status()
+    while not is_ready and time.time() - start < timeout:
+        time.sleep(interval)
+        is_ready = is_ready_status()
+    if not is_ready:
+        raise TimeoutError("time out waiting for corpus to be ready")
+
+
+def wait_for_delete(corpus_key: str, timeout=60, interval=2) -> None:
+    start = time.time()
+    while time.time() - start < timeout:
+        corpora_list = list_corpora()
+        if corpus_key not in corpora_list:
+            return
+        time.sleep(interval)
+
+    raise TimeoutError("time out waiting for corpus to delete")
+
+
+@pytest.fixture
+def corpora_util() -> Generator[str, None, None]:
+    random_id = str(uuid4()).split("-")[0]
+    corpus_key = f"ingest-test-{random_id}"
+    corpus_name = "ingest-test"
+    logger.info(f"Creating corpus with key: {corpus_key}")
+    try:
+        create_corpora(corpus_key, corpus_name)
+        replace_filter_attributes(corpus_key)
+        wait_for_ready(corpus_key=corpus_key)
+        yield corpus_key
+    except Exception as e:
+        logger.error(f"failed to create corpus {corpus_key}: {e}")
+    finally:
+        logger.info(f"deleting corpus: {corpus_key}")
+        delete_corpora(corpus_key)
+        wait_for_delete(corpus_key=corpus_key)
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(VECTARA_CONNECTOR_TYPE, DESTINATION_TAG, "vectara")
+@requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
+async def test_vectara_destination(
+    upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=10
+):
+    corpus_key = corpora_util
+    connection_kwargs = {
+        "customer_id": os.environ["VECTARA_CUSTOMER_ID"],
+        "corpus_key": corpus_key,
+    }
+
+    oauth_client_id = os.environ["VECTARA_OAUTH_CLIENT_ID"]
+    oauth_secret = os.environ["VECTARA_OAUTH_SECRET"]
+
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=VECTARA_CONNECTOR_TYPE,
+        identifier="mock-file-data",
+    )
+
+    stager_config = VectaraUploadStagerConfig(batch_size=10)
+    stager = VectaraUploadStager(upload_stager_config=stager_config)
+    new_upload_file = stager.run(
+        elements_filepath=upload_file,
+        output_dir=tmp_path,
+        output_filename=upload_file.name,
+        file_data=file_data,
+    )
+
+    uploader = VectaraUploader(
+        connection_config=VectaraConnectionConfig(
+            **connection_kwargs,
+            access_config=VectaraAccessConfig(
+                oauth_client_id=oauth_client_id, oauth_secret=oauth_secret
+            ),
+        ),
+        upload_config=VectaraUploaderConfig(),
+    )
+
+    with new_upload_file.open() as new_upload_fp:
+        elements_stager = json.load(new_upload_fp)
+
+    if uploader.is_async():
+        await uploader.run_data_async(data=elements_stager, file_data=file_data)
+
+    with upload_file.open() as upload_fp:
+        elements = json.load(upload_fp)
+        first_element = elements[0]
+
+    for i in range(retries):
+        response = query_data(corpus_key, first_element["element_id"])
+        if not response["search_results"]:
+            time.sleep(interval)
+        else:
+            break
+
+    validate_upload(response=response, expected_data=first_element)
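
The tail of the Vectara test polls the corpus until the indexed element becomes queryable, since indexing is eventually consistent. A small hedged sketch of that wait-until pattern as a standalone helper; the helper name, timings, and the commented wiring are illustrative, not part of the package:

    import time
    from typing import Callable

    def wait_until(predicate: Callable[[], bool], retries: int = 30, interval: float = 10.0) -> bool:
        """Call `predicate` until it returns True or the retry budget is spent."""
        for _ in range(retries):
            if predicate():
                return True
            time.sleep(interval)
        return False

    # Hypothetical wiring against the helpers defined in the test module:
    # found = wait_until(lambda: bool(query_data(corpus_key, element_id)["search_results"]))
    # assert found, "element never became searchable"
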