unstructured-ingest 0.2.2__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic; see the registry's advisory page for more details.

Files changed (93)
  1. test/integration/connectors/elasticsearch/__init__.py +0 -0
  2. test/integration/connectors/elasticsearch/conftest.py +34 -0
  3. test/integration/connectors/elasticsearch/test_elasticsearch.py +308 -0
  4. test/integration/connectors/elasticsearch/test_opensearch.py +302 -0
  5. test/integration/connectors/sql/test_postgres.py +10 -4
  6. test/integration/connectors/sql/test_singlestore.py +8 -4
  7. test/integration/connectors/sql/test_snowflake.py +10 -6
  8. test/integration/connectors/sql/test_sqlite.py +4 -4
  9. test/integration/connectors/test_astradb.py +156 -0
  10. test/integration/connectors/test_azure_cog_search.py +233 -0
  11. test/integration/connectors/test_delta_table.py +46 -0
  12. test/integration/connectors/test_kafka.py +150 -16
  13. test/integration/connectors/test_lancedb.py +209 -0
  14. test/integration/connectors/test_milvus.py +141 -0
  15. test/integration/connectors/test_pinecone.py +213 -0
  16. test/integration/connectors/test_s3.py +23 -0
  17. test/integration/connectors/utils/docker.py +81 -15
  18. test/integration/connectors/utils/validation.py +10 -0
  19. test/integration/connectors/weaviate/__init__.py +0 -0
  20. test/integration/connectors/weaviate/conftest.py +15 -0
  21. test/integration/connectors/weaviate/test_local.py +131 -0
  22. test/unit/v2/__init__.py +0 -0
  23. test/unit/v2/chunkers/__init__.py +0 -0
  24. test/unit/v2/chunkers/test_chunkers.py +49 -0
  25. test/unit/v2/connectors/__init__.py +0 -0
  26. test/unit/v2/embedders/__init__.py +0 -0
  27. test/unit/v2/embedders/test_bedrock.py +36 -0
  28. test/unit/v2/embedders/test_huggingface.py +48 -0
  29. test/unit/v2/embedders/test_mixedbread.py +37 -0
  30. test/unit/v2/embedders/test_octoai.py +35 -0
  31. test/unit/v2/embedders/test_openai.py +35 -0
  32. test/unit/v2/embedders/test_togetherai.py +37 -0
  33. test/unit/v2/embedders/test_vertexai.py +37 -0
  34. test/unit/v2/embedders/test_voyageai.py +38 -0
  35. test/unit/v2/partitioners/__init__.py +0 -0
  36. test/unit/v2/partitioners/test_partitioner.py +63 -0
  37. test/unit/v2/utils/__init__.py +0 -0
  38. test/unit/v2/utils/data_generator.py +32 -0
  39. unstructured_ingest/__version__.py +1 -1
  40. unstructured_ingest/cli/cmds/__init__.py +2 -2
  41. unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  42. unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  43. unstructured_ingest/pipeline/reformat/embedding.py +1 -1
  44. unstructured_ingest/runner/writers/__init__.py +2 -2
  45. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  46. unstructured_ingest/utils/data_prep.py +9 -1
  47. unstructured_ingest/v2/constants.py +2 -0
  48. unstructured_ingest/v2/processes/connectors/__init__.py +7 -20
  49. unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
  50. unstructured_ingest/v2/processes/connectors/astradb.py +35 -23
  51. unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +116 -35
  52. unstructured_ingest/v2/processes/connectors/confluence.py +2 -2
  53. unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
  54. unstructured_ingest/v2/processes/connectors/delta_table.py +37 -9
  55. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  56. unstructured_ingest/v2/processes/connectors/{elasticsearch.py → elasticsearch/elasticsearch.py} +93 -46
  57. unstructured_ingest/v2/processes/connectors/{opensearch.py → elasticsearch/opensearch.py} +1 -1
  58. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +27 -0
  59. unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
  60. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +6 -2
  61. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +38 -2
  62. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +84 -23
  63. unstructured_ingest/v2/processes/connectors/kafka/local.py +32 -4
  64. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +17 -0
  65. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  66. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  67. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  68. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +161 -0
  69. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  70. unstructured_ingest/v2/processes/connectors/milvus.py +72 -27
  71. unstructured_ingest/v2/processes/connectors/onedrive.py +2 -3
  72. unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
  73. unstructured_ingest/v2/processes/connectors/pinecone.py +101 -13
  74. unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
  75. unstructured_ingest/v2/processes/connectors/slack.py +2 -2
  76. unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
  77. unstructured_ingest/v2/processes/connectors/sql/sql.py +97 -26
  78. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  79. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +164 -0
  80. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  81. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  82. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +289 -0
  83. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/METADATA +20 -19
  84. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/RECORD +91 -50
  85. unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
  86. unstructured_ingest/v2/processes/connectors/weaviate.py +0 -242
  87. /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
  88. /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
  89. /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
  90. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/LICENSE.md +0 -0
  91. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/WHEEL +0 -0
  92. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/entry_points.txt +0 -0
  93. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/top_level.txt +0 -0
@@ -86,7 +86,7 @@ async def test_snowflake_source():
86
86
  image="localstack/snowflake",
87
87
  environment={"LOCALSTACK_AUTH_TOKEN": token, "EXTRA_CORS_ALLOWED_ORIGINS": "*"},
88
88
  ports={4566: 4566, 443: 443},
89
- healthcheck_timeout=30,
89
+ healthcheck_retries=30,
90
90
  ):
91
91
  seed_data()
92
92
  with tempfile.TemporaryDirectory() as tmpdir:
@@ -156,7 +156,7 @@ async def test_snowflake_destination(upload_file: Path):
156
156
  image="localstack/snowflake",
157
157
  environment={"LOCALSTACK_AUTH_TOKEN": token, "EXTRA_CORS_ALLOWED_ORIGINS": "*"},
158
158
  ports={4566: 4566, 443: 443},
159
- healthcheck_timeout=30,
159
+ healthcheck_retries=30,
160
160
  ):
161
161
  init_db_destination()
162
162
  with tempfile.TemporaryDirectory() as tmpdir:
@@ -192,10 +192,8 @@ async def test_snowflake_destination(upload_file: Path):
192
192
  host=connect_params["host"],
193
193
  )
194
194
  )
195
- if uploader.is_async():
196
- await uploader.run_async(path=staged_path, file_data=mock_file_data)
197
- else:
198
- uploader.run(path=staged_path, file_data=mock_file_data)
195
+
196
+ uploader.run(path=staged_path, file_data=mock_file_data)
199
197
 
200
198
  staged_df = pd.read_json(staged_path, orient="records", lines=True)
201
199
  expected_num_elements = len(staged_df)
@@ -203,3 +201,9 @@ async def test_snowflake_destination(upload_file: Path):
203
201
  connect_params=connect_params,
204
202
  expected_num_elements=expected_num_elements,
205
203
  )
204
+
205
+ uploader.run(path=staged_path, file_data=mock_file_data)
206
+ validate_destination(
207
+ connect_params=connect_params,
208
+ expected_num_elements=expected_num_elements,
209
+ )
@@ -138,10 +138,10 @@ async def test_sqlite_destination(upload_file: Path):
138
138
  uploader = SQLiteUploader(
139
139
  connection_config=SQLiteConnectionConfig(database_path=db_path)
140
140
  )
141
- if uploader.is_async():
142
- await uploader.run_async(path=staged_path, file_data=mock_file_data)
143
- else:
144
- uploader.run(path=staged_path, file_data=mock_file_data)
141
+ uploader.run(path=staged_path, file_data=mock_file_data)
145
142
 
146
143
  staged_df = pd.read_json(staged_path, orient="records", lines=True)
147
144
  validate_destination(db_path=db_path, expected_num_elements=len(staged_df))
145
+
146
+ uploader.run(path=staged_path, file_data=mock_file_data)
147
+ validate_destination(db_path=db_path, expected_num_elements=len(staged_df))
@@ -0,0 +1,156 @@
1
+ import json
2
+ import os
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from uuid import uuid4
6
+
7
+ import pytest
8
+ from astrapy import Collection
9
+ from astrapy import DataAPIClient as AstraDBClient
10
+
11
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
12
+ from test.integration.utils import requires_env
13
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
14
+ from unstructured_ingest.v2.processes.connectors.astradb import (
15
+ CONNECTOR_TYPE,
16
+ AstraDBAccessConfig,
17
+ AstraDBConnectionConfig,
18
+ AstraDBIndexer,
19
+ AstraDBIndexerConfig,
20
+ AstraDBUploader,
21
+ AstraDBUploaderConfig,
22
+ AstraDBUploadStager,
23
+ DestinationConnectionError,
24
+ SourceConnectionError,
25
+ )
26
+
27
+ EXISTENT_COLLECTION_NAME = "ingest_test_src"
28
+ NONEXISTENT_COLLECTION_NAME = "nonexistant"
29
+
30
+
31
+ @pytest.fixture
32
+ def connection_config() -> AstraDBConnectionConfig:
33
+ return AstraDBConnectionConfig(
34
+ access_config=AstraDBAccessConfig(
35
+ token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
36
+ api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
37
+ )
38
+ )
39
+
40
+
41
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, DESTINATION_TAG)
42
+ @requires_env("ASTRA_DB_APPLICATION_TOKEN", "ASTRA_DB_API_ENDPOINT")
43
+ def test_precheck_succeeds(connection_config: AstraDBConnectionConfig):
44
+ indexer = AstraDBIndexer(
45
+ connection_config=connection_config,
46
+ index_config=AstraDBIndexerConfig(collection_name=EXISTENT_COLLECTION_NAME),
47
+ )
48
+ uploader = AstraDBUploader(
49
+ connection_config=connection_config,
50
+ upload_config=AstraDBUploaderConfig(collection_name=EXISTENT_COLLECTION_NAME),
51
+ )
52
+ indexer.precheck()
53
+ uploader.precheck()
54
+
55
+
56
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, DESTINATION_TAG)
57
+ @requires_env("ASTRA_DB_APPLICATION_TOKEN", "ASTRA_DB_API_ENDPOINT")
58
+ def test_precheck_fails(connection_config: AstraDBConnectionConfig):
59
+ indexer = AstraDBIndexer(
60
+ connection_config=connection_config,
61
+ index_config=AstraDBIndexerConfig(collection_name=NONEXISTENT_COLLECTION_NAME),
62
+ )
63
+ uploader = AstraDBUploader(
64
+ connection_config=connection_config,
65
+ upload_config=AstraDBUploaderConfig(collection_name=NONEXISTENT_COLLECTION_NAME),
66
+ )
67
+ with pytest.raises(expected_exception=SourceConnectionError):
68
+ indexer.precheck()
69
+ with pytest.raises(expected_exception=DestinationConnectionError):
70
+ uploader.precheck()
71
+
72
+
73
+ @dataclass(frozen=True)
74
+ class EnvData:
75
+ api_endpoint: str
76
+ token: str
77
+
78
+
79
+ def get_env_data() -> EnvData:
80
+ return EnvData(
81
+ api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
82
+ token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
83
+ )
84
+
85
+
86
+ @pytest.fixture
87
+ def collection(upload_file: Path) -> Collection:
88
+ random_id = str(uuid4())[:8]
89
+ collection_name = f"utic_test_{random_id}"
90
+ with upload_file.open("r") as upload_fp:
91
+ upload_data = json.load(upload_fp)
92
+ first_content = upload_data[0]
93
+ embeddings = first_content["embeddings"]
94
+ embedding_dimension = len(embeddings)
95
+ my_client = AstraDBClient()
96
+ env_data = get_env_data()
97
+ astra_db = my_client.get_database(
98
+ api_endpoint=env_data.api_endpoint,
99
+ token=env_data.token,
100
+ )
101
+ collection = astra_db.create_collection(collection_name, dimension=embedding_dimension)
102
+ try:
103
+ yield collection
104
+ finally:
105
+ astra_db.drop_collection(collection)
106
+
107
+
108
+ @pytest.mark.asyncio
109
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
110
+ @requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
111
+ async def test_azure_ai_search_destination(
112
+ upload_file: Path,
113
+ collection: Collection,
114
+ tmp_path: Path,
115
+ ):
116
+ file_data = FileData(
117
+ source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
118
+ connector_type=CONNECTOR_TYPE,
119
+ identifier="mock file data",
120
+ )
121
+ stager = AstraDBUploadStager()
122
+ env_data = get_env_data()
123
+ uploader = AstraDBUploader(
124
+ connection_config=AstraDBConnectionConfig(
125
+ access_config=AstraDBAccessConfig(
126
+ api_endpoint=env_data.api_endpoint, token=env_data.token
127
+ ),
128
+ ),
129
+ upload_config=AstraDBUploaderConfig(collection_name=collection.name),
130
+ )
131
+ staged_filepath = stager.run(
132
+ elements_filepath=upload_file,
133
+ file_data=file_data,
134
+ output_dir=tmp_path,
135
+ output_filename=upload_file.name,
136
+ )
137
+ uploader.precheck()
138
+ uploader.run(path=staged_filepath, file_data=file_data)
139
+
140
+ # Run validation
141
+ with staged_filepath.open() as f:
142
+ staged_elements = json.load(f)
143
+ expected_count = len(staged_elements)
144
+ current_count = collection.count_documents(filter={}, upper_bound=expected_count * 2)
145
+ assert current_count == expected_count, (
146
+ f"Expected count ({expected_count}) doesn't match how "
147
+ f"much came back from collection: {current_count}"
148
+ )
149
+
150
+ # Rerun and make sure the same documents get updated
151
+ uploader.run(path=staged_filepath, file_data=file_data)
152
+ current_count = collection.count_documents(filter={}, upper_bound=expected_count * 2)
153
+ assert current_count == expected_count, (
154
+ f"Expected count ({expected_count}) doesn't match how "
155
+ f"much came back from collection: {current_count}"
156
+ )
@@ -0,0 +1,233 @@
1
+ import json
2
+ import os
3
+ import time
4
+ from pathlib import Path
5
+ from uuid import uuid4
6
+
7
+ import pytest
8
+ from azure.core.credentials import AzureKeyCredential
9
+ from azure.search.documents import SearchClient
10
+ from azure.search.documents.indexes import SearchIndexClient
11
+ from azure.search.documents.indexes.models import (
12
+ ComplexField,
13
+ CorsOptions,
14
+ HnswAlgorithmConfiguration,
15
+ HnswParameters,
16
+ SearchField,
17
+ SearchFieldDataType,
18
+ SearchIndex,
19
+ SimpleField,
20
+ VectorSearch,
21
+ VectorSearchAlgorithmMetric,
22
+ VectorSearchProfile,
23
+ )
24
+
25
+ from test.integration.connectors.utils.constants import (
26
+ DESTINATION_TAG,
27
+ )
28
+ from test.integration.utils import requires_env
29
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
30
+ from unstructured_ingest.v2.processes.connectors.azure_ai_search import (
31
+ CONNECTOR_TYPE,
32
+ RECORD_ID_LABEL,
33
+ AzureAISearchAccessConfig,
34
+ AzureAISearchConnectionConfig,
35
+ AzureAISearchUploader,
36
+ AzureAISearchUploaderConfig,
37
+ AzureAISearchUploadStager,
38
+ AzureAISearchUploadStagerConfig,
39
+ )
40
+
41
+ repo_path = Path(__file__).parent.resolve()
42
+
43
+ API_KEY = "AZURE_SEARCH_API_KEY"
44
+ ENDPOINT = "https://ingest-test-azure-cognitive-search.search.windows.net"
45
+
46
+
47
+ def get_api_key() -> str:
48
+ key = os.environ[API_KEY]
49
+ return key
50
+
51
+
52
+ def get_fields() -> list:
53
+ data_source_fields = [
54
+ SimpleField(name="url", type=SearchFieldDataType.String),
55
+ SimpleField(name="version", type=SearchFieldDataType.String),
56
+ SimpleField(name="date_created", type=SearchFieldDataType.DateTimeOffset),
57
+ SimpleField(name="date_modified", type=SearchFieldDataType.DateTimeOffset),
58
+ SimpleField(name="date_processed", type=SearchFieldDataType.DateTimeOffset),
59
+ SimpleField(name="permissions_data", type=SearchFieldDataType.String),
60
+ SimpleField(name="record_locator", type=SearchFieldDataType.String),
61
+ ]
62
+ coordinates_fields = [
63
+ SimpleField(name="system", type=SearchFieldDataType.String),
64
+ SimpleField(name="layout_width", type=SearchFieldDataType.Double),
65
+ SimpleField(name="layout_height", type=SearchFieldDataType.Double),
66
+ SimpleField(name="points", type=SearchFieldDataType.String),
67
+ ]
68
+ metadata_fields = [
69
+ SimpleField(name="orig_elements", type=SearchFieldDataType.String),
70
+ SimpleField(name="category_depth", type=SearchFieldDataType.Int32),
71
+ SimpleField(name="parent_id", type=SearchFieldDataType.String),
72
+ SimpleField(name="attached_to_filename", type=SearchFieldDataType.String),
73
+ SimpleField(name="filetype", type=SearchFieldDataType.String),
74
+ SimpleField(name="last_modified", type=SearchFieldDataType.DateTimeOffset),
75
+ SimpleField(name="is_continuation", type=SearchFieldDataType.Boolean),
76
+ SimpleField(name="file_directory", type=SearchFieldDataType.String),
77
+ SimpleField(name="filename", type=SearchFieldDataType.String),
78
+ ComplexField(name="data_source", fields=data_source_fields),
79
+ ComplexField(name="coordinates", fields=coordinates_fields),
80
+ SimpleField(
81
+ name="languages", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
82
+ ),
83
+ SimpleField(name="page_number", type=SearchFieldDataType.String),
84
+ SimpleField(name="links", type=SearchFieldDataType.Collection(SearchFieldDataType.String)),
85
+ SimpleField(name="page_name", type=SearchFieldDataType.String),
86
+ SimpleField(name="url", type=SearchFieldDataType.String),
87
+ SimpleField(
88
+ name="link_urls", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
89
+ ),
90
+ SimpleField(
91
+ name="link_texts", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
92
+ ),
93
+ SimpleField(
94
+ name="sent_from", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
95
+ ),
96
+ SimpleField(
97
+ name="sent_to", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
98
+ ),
99
+ SimpleField(name="subject", type=SearchFieldDataType.String),
100
+ SimpleField(name="section", type=SearchFieldDataType.String),
101
+ SimpleField(name="header_footer_type", type=SearchFieldDataType.String),
102
+ SimpleField(
103
+ name="emphasized_text_contents",
104
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
105
+ ),
106
+ SimpleField(
107
+ name="emphasized_text_tags",
108
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
109
+ ),
110
+ SimpleField(name="text_as_html", type=SearchFieldDataType.String),
111
+ SimpleField(name="regex_metadata", type=SearchFieldDataType.String),
112
+ SimpleField(name="detection_class_prob", type=SearchFieldDataType.Double),
113
+ ]
114
+ fields = [
115
+ SimpleField(name="id", type=SearchFieldDataType.String, key=True),
116
+ SimpleField(name=RECORD_ID_LABEL, type=SearchFieldDataType.String, filterable=True),
117
+ SimpleField(name="element_id", type=SearchFieldDataType.String),
118
+ SimpleField(name="text", type=SearchFieldDataType.String),
119
+ SimpleField(name="type", type=SearchFieldDataType.String),
120
+ ComplexField(name="metadata", fields=metadata_fields),
121
+ SearchField(
122
+ name="embeddings",
123
+ type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
124
+ vector_search_dimensions=384,
125
+ vector_search_profile_name="embeddings-config-profile",
126
+ ),
127
+ ]
128
+ return fields
129
+
130
+
131
+ def get_vector_search() -> VectorSearch:
132
+ return VectorSearch(
133
+ algorithms=[
134
+ HnswAlgorithmConfiguration(
135
+ name="hnsw-config",
136
+ parameters=HnswParameters(
137
+ metric=VectorSearchAlgorithmMetric.COSINE,
138
+ ),
139
+ )
140
+ ],
141
+ profiles=[
142
+ VectorSearchProfile(
143
+ name="embeddings-config-profile", algorithm_configuration_name="hnsw-config"
144
+ )
145
+ ],
146
+ )
147
+
148
+
149
+ def get_search_index_client() -> SearchIndexClient:
150
+ api_key = get_api_key()
151
+ return SearchIndexClient(ENDPOINT, AzureKeyCredential(api_key))
152
+
153
+
154
+ @pytest.fixture
155
+ def index() -> str:
156
+ random_id = str(uuid4())[:8]
157
+ index_name = f"utic-test-{random_id}"
158
+ client = get_search_index_client()
159
+ index = SearchIndex(
160
+ name=index_name,
161
+ fields=get_fields(),
162
+ vector_search=get_vector_search(),
163
+ cors_options=CorsOptions(allowed_origins=["*"], max_age_in_seconds=60),
164
+ )
165
+ print(f"creating index: {index_name}")
166
+ client.create_index(index=index)
167
+ try:
168
+ yield index_name
169
+ finally:
170
+ print(f"deleting index: {index_name}")
171
+ client.delete_index(index)
172
+
173
+
174
+ def validate_count(
175
+ search_client: SearchClient, expected_count: int, retries: int = 10, interval: int = 1
176
+ ) -> None:
177
+ index_count = search_client.get_document_count()
178
+ if index_count == expected_count:
179
+ return
180
+ tries = 0
181
+ while tries < retries:
182
+ time.sleep(interval)
183
+ index_count = search_client.get_document_count()
184
+ if index_count == expected_count:
185
+ break
186
+ assert index_count == expected_count, (
187
+ f"Expected count ({expected_count}) doesn't match how "
188
+ f"much came back from index: {index_count}"
189
+ )
190
+
191
+
192
+ @pytest.mark.asyncio
193
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
194
+ @requires_env("AZURE_SEARCH_API_KEY")
195
+ async def test_azure_ai_search_destination(
196
+ upload_file: Path,
197
+ index: str,
198
+ tmp_path: Path,
199
+ ):
200
+ file_data = FileData(
201
+ source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
202
+ connector_type=CONNECTOR_TYPE,
203
+ identifier="mock file data",
204
+ )
205
+ stager = AzureAISearchUploadStager(upload_stager_config=AzureAISearchUploadStagerConfig())
206
+
207
+ uploader = AzureAISearchUploader(
208
+ connection_config=AzureAISearchConnectionConfig(
209
+ access_config=AzureAISearchAccessConfig(key=get_api_key()),
210
+ endpoint=ENDPOINT,
211
+ index=index,
212
+ ),
213
+ upload_config=AzureAISearchUploaderConfig(),
214
+ )
215
+ staged_filepath = stager.run(
216
+ elements_filepath=upload_file,
217
+ file_data=file_data,
218
+ output_dir=tmp_path,
219
+ output_filename=upload_file.name,
220
+ )
221
+ uploader.precheck()
222
+ uploader.run(path=staged_filepath, file_data=file_data)
223
+
224
+ # Run validation
225
+ with staged_filepath.open() as f:
226
+ staged_elements = json.load(f)
227
+ expected_count = len(staged_elements)
228
+ search_client: SearchClient = uploader.connection_config.get_search_client()
229
+ validate_count(search_client=search_client, expected_count=expected_count)
230
+
231
+ # Rerun and make sure the same documents get updated
232
+ uploader.run(path=staged_filepath, file_data=file_data)
233
+ validate_count(search_client=search_client, expected_count=expected_count)
@@ -136,3 +136,49 @@ async def test_delta_table_destination_s3(upload_file: Path, temp_dir: Path):
136
136
  secret=aws_credentials["AWS_SECRET_ACCESS_KEY"],
137
137
  )
138
138
  s3fs.rm(path=destination_path, recursive=True)
139
+
140
+
141
+ @pytest.mark.asyncio
142
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
143
+ @requires_env("S3_INGEST_TEST_ACCESS_KEY", "S3_INGEST_TEST_SECRET_KEY")
144
+ async def test_delta_table_destination_s3_bad_creds(upload_file: Path, temp_dir: Path):
145
+ aws_credentials = {
146
+ "AWS_ACCESS_KEY_ID": "bad key",
147
+ "AWS_SECRET_ACCESS_KEY": "bad secret",
148
+ "AWS_REGION": "us-east-2",
149
+ }
150
+ s3_bucket = "s3://utic-platform-test-destination"
151
+ destination_path = f"{s3_bucket}/destination/test"
152
+ connection_config = DeltaTableConnectionConfig(
153
+ access_config=DeltaTableAccessConfig(
154
+ aws_access_key_id=aws_credentials["AWS_ACCESS_KEY_ID"],
155
+ aws_secret_access_key=aws_credentials["AWS_SECRET_ACCESS_KEY"],
156
+ ),
157
+ aws_region=aws_credentials["AWS_REGION"],
158
+ table_uri=destination_path,
159
+ )
160
+ stager_config = DeltaTableUploadStagerConfig()
161
+ stager = DeltaTableUploadStager(upload_stager_config=stager_config)
162
+ new_upload_file = stager.run(
163
+ elements_filepath=upload_file,
164
+ output_dir=temp_dir,
165
+ output_filename=upload_file.name,
166
+ )
167
+
168
+ upload_config = DeltaTableUploaderConfig()
169
+ uploader = DeltaTableUploader(connection_config=connection_config, upload_config=upload_config)
170
+ file_data = FileData(
171
+ source_identifiers=SourceIdentifiers(
172
+ fullpath=upload_file.name, filename=new_upload_file.name
173
+ ),
174
+ connector_type=CONNECTOR_TYPE,
175
+ identifier="mock file data",
176
+ )
177
+
178
+ with pytest.raises(Exception) as excinfo:
179
+ if uploader.is_async():
180
+ await uploader.run_async(path=new_upload_file, file_data=file_data)
181
+ else:
182
+ uploader.run(path=new_upload_file, file_data=file_data)
183
+
184
+ assert "403 Forbidden" in str(excinfo.value), f"Exception message did not match: {str(excinfo)}"