unstructured-ingest 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +34 -0
- test/integration/connectors/elasticsearch/test_elasticsearch.py +308 -0
- test/integration/connectors/elasticsearch/test_opensearch.py +302 -0
- test/integration/connectors/sql/test_postgres.py +10 -4
- test/integration/connectors/sql/test_singlestore.py +8 -4
- test/integration/connectors/sql/test_snowflake.py +10 -6
- test/integration/connectors/sql/test_sqlite.py +4 -4
- test/integration/connectors/test_astradb.py +50 -3
- test/integration/connectors/test_delta_table.py +46 -0
- test/integration/connectors/test_kafka.py +40 -6
- test/integration/connectors/test_lancedb.py +210 -0
- test/integration/connectors/test_milvus.py +141 -0
- test/integration/connectors/test_mongodb.py +332 -0
- test/integration/connectors/test_pinecone.py +53 -1
- test/integration/connectors/utils/docker.py +81 -15
- test/integration/connectors/utils/validation.py +10 -0
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +15 -0
- test/integration/connectors/weaviate/test_local.py +131 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/pipeline/reformat/embedding.py +1 -1
- unstructured_ingest/utils/data_prep.py +9 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -16
- unstructured_ingest/v2/processes/connectors/astradb.py +2 -2
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +4 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +20 -4
- unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/v2/processes/connectors/{elasticsearch.py → elasticsearch/elasticsearch.py} +92 -46
- unstructured_ingest/v2/processes/connectors/{opensearch.py → elasticsearch/opensearch.py} +1 -1
- unstructured_ingest/v2/processes/connectors/google_drive.py +1 -1
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +6 -0
- unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +161 -0
- unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/v2/processes/connectors/milvus.py +72 -27
- unstructured_ingest/v2/processes/connectors/mongodb.py +122 -111
- unstructured_ingest/v2/processes/connectors/pinecone.py +24 -7
- unstructured_ingest/v2/processes/connectors/sql/sql.py +97 -26
- unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +25 -0
- unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +164 -0
- unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +299 -0
- {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/METADATA +19 -19
- {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/RECORD +54 -33
- unstructured_ingest/v2/processes/connectors/weaviate.py +0 -242
- /test/integration/connectors/{test_azure_cog_search.py → test_azure_ai_search.py} +0 -0
- {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/top_level.txt +0 -0
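Most of the churn in this release is a reorganization of single-module connectors into subpackages: elasticsearch.py and opensearch.py move under connectors/elasticsearch/, the flat weaviate.py is replaced by a weaviate/ package with cloud, embedded, and local variants, and a new lancedb/ package adds per-backend modules. A hypothetical import sketch implied by the new file layout above (what each package __init__.py re-exports is not shown in this diff, and the optional client libraries must be installed for these modules to import):

# Hypothetical sketch: module paths implied by the new file layout, not taken from package docs.
import unstructured_ingest.v2.processes.connectors.elasticsearch.elasticsearch  # was connectors/elasticsearch.py
import unstructured_ingest.v2.processes.connectors.elasticsearch.opensearch  # was connectors/opensearch.py
import unstructured_ingest.v2.processes.connectors.weaviate.cloud  # replaces the flat connectors/weaviate.py
import unstructured_ingest.v2.processes.connectors.lancedb.lancedb  # new connector package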
test/integration/connectors/test_mongodb.py (new file)

@@ -0,0 +1,332 @@
+import json
+import os
+import time
+import uuid
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Generator
+
+import pytest
+from pydantic import BaseModel, SecretStr
+from pymongo.collection import Collection
+from pymongo.database import Database
+from pymongo.mongo_client import MongoClient
+from pymongo.operations import SearchIndexModel
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
+from test.integration.connectors.utils.validation import (
+    ValidationConfigs,
+    source_connector_validation,
+)
+from test.integration.utils import requires_env
+from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.mongodb import (
+    CONNECTOR_TYPE,
+    MongoDBAccessConfig,
+    MongoDBConnectionConfig,
+    MongoDBDownloader,
+    MongoDBDownloaderConfig,
+    MongoDBIndexer,
+    MongoDBIndexerConfig,
+    MongoDBUploader,
+    MongoDBUploaderConfig,
+)
+
+SOURCE_COLLECTION = "sample-mongodb-data"
+
+
+class EnvData(BaseModel):
+    uri: SecretStr
+    database: str
+
+
+def get_env_data() -> EnvData:
+    uri = os.getenv("MONGODB_URI")
+    assert uri
+    database = os.getenv("MONGODB_DATABASE")
+    assert database
+    return EnvData(uri=uri, database=database)
+
+
+@contextmanager
+def get_client() -> Generator[MongoClient, None, None]:
+    uri = get_env_data().uri.get_secret_value()
+    with MongoClient(uri) as client:
+        assert client.admin.command("ping")
+        yield client
+
+
+def wait_for_collection(
+    database: Database, collection_name: str, retries: int = 10, interval: int = 1
+):
+    collections = database.list_collection_names()
+    attempts = 0
+    while collection_name not in collections and attempts < retries:
+        attempts += 1
+        print(
+            "Waiting for collection {} to be recognized: {}".format(
+                collection_name, ", ".join(collections)
+            )
+        )
+        time.sleep(interval)
+        collections = database.list_collection_names()
+    if collection_name not in collection_name:
+        raise TimeoutError(f"Collection {collection_name} was not recognized")
+
+
+def get_search_index_status(collection: Collection, index_name: str) -> str:
+    search_indexes = collection.list_search_indexes(name=index_name)
+    search_index = list(search_indexes)[0]
+    return search_index["status"]
+
+
+def wait_for_search_index(
+    collection: Collection, index_name: str, retries: int = 60, interval: int = 1
+):
+    current_status = get_search_index_status(collection, index_name)
+    attempts = 0
+    while current_status != "READY" and attempts < retries:
+        attempts += 1
+        print(f"attempt {attempts}: waiting for search index to be READY: {current_status}")
+        time.sleep(interval)
+        current_status = get_search_index_status(collection, index_name)
+
+    if current_status != "READY":
+        raise TimeoutError("search index never detected as READY")
+
+
+@pytest.fixture
+def destination_collection() -> Collection:
+    env_data = get_env_data()
+    collection_name = f"utic-test-output-{uuid.uuid4()}"
+    with get_client() as client:
+        database = client[env_data.database]
+        print(f"creating collection in database {database}: {collection_name}")
+        collection = database.create_collection(name=collection_name)
+        search_index_name = "embeddings"
+        collection.create_search_index(
+            model=SearchIndexModel(
+                name=search_index_name,
+                definition={
+                    "mappings": {
+                        "dynamic": True,
+                        "fields": {
+                            "embeddings": [
+                                {"type": "knnVector", "dimensions": 384, "similarity": "euclidean"}
+                            ]
+                        },
+                    }
+                },
+            )
+        )
+        collection.create_index("record_id")
+        wait_for_collection(database=database, collection_name=collection_name)
+        wait_for_search_index(collection=collection, index_name=search_index_name)
+        try:
+            yield collection
+        finally:
+            print(f"deleting collection: {collection_name}")
+            collection.drop()
+
+
+def validate_collection_count(
+    collection: Collection, expected_records: int, retries: int = 10, interval: int = 1
+) -> None:
+    count = collection.count_documents(filter={})
+    attempt = 0
+    while count != expected_records and attempt < retries:
+        attempt += 1
+        print(f"attempt {attempt} to get count of collection {count} to match {expected_records}")
+        time.sleep(interval)
+        count = collection.count_documents(filter={})
+    assert (
+        count == expected_records
+    ), f"expected count ({expected_records}) does not match how many records were found: {count}"
+
+
+def validate_collection_vector(
+    collection: Collection, embedding: list[float], text: str, retries: int = 30, interval: int = 1
+) -> None:
+    pipeline = [
+        {
+            "$vectorSearch": {
+                "index": "embeddings",
+                "path": "embeddings",
+                "queryVector": embedding,
+                "numCandidates": 150,
+                "limit": 10,
+            },
+        },
+        {"$project": {"_id": 0, "text": 1, "score": {"$meta": "vectorSearchScore"}}},
+    ]
+    attempts = 0
+    results = list(collection.aggregate(pipeline=pipeline))
+    while not results and attempts < retries:
+        attempts += 1
+        print(f"attempt {attempts}, waiting for valid results: {results}")
+        time.sleep(interval)
+        results = list(collection.aggregate(pipeline=pipeline))
+    if not results:
+        raise TimeoutError("Timed out waiting for valid results")
+    print(f"found results on attempt {attempts}")
+    top_result = results[0]
+    assert top_result["score"] == 1.0, "score detected should be 1: {}".format(top_result["score"])
+    assert top_result["text"] == text, "text detected should be {}, found: {}".format(
+        text, top_result["text"]
+    )
+    for r in results[1:]:
+        assert r["score"] < 1.0, "score detected should be less than 1: {}".format(r["score"])
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("MONGODB_URI", "MONGODB_DATABASE")
+async def test_mongodb_source(temp_dir: Path):
+    env_data = get_env_data()
+    indexer_config = MongoDBIndexerConfig(database=env_data.database, collection=SOURCE_COLLECTION)
+    download_config = MongoDBDownloaderConfig(download_dir=temp_dir)
+    connection_config = MongoDBConnectionConfig(
+        access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
+    )
+    indexer = MongoDBIndexer(connection_config=connection_config, index_config=indexer_config)
+    downloader = MongoDBDownloader(
+        connection_config=connection_config, download_config=download_config
+    )
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=ValidationConfigs(
+            test_id=CONNECTOR_TYPE, expected_num_files=4, validate_downloaded_files=True
+        ),
+    )
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+def test_mongodb_indexer_precheck_fail_no_host():
+    indexer_config = MongoDBIndexerConfig(
+        database="non-existent-database", collection="non-existent-database"
+    )
+    connection_config = MongoDBConnectionConfig(
+        access_config=MongoDBAccessConfig(uri="mongodb+srv://ingest-test.hgaig.mongodb"),
+    )
+    indexer = MongoDBIndexer(connection_config=connection_config, index_config=indexer_config)
+    with pytest.raises(SourceConnectionError):
+        indexer.precheck()
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("MONGODB_URI", "MONGODB_DATABASE")
+def test_mongodb_indexer_precheck_fail_no_database():
+    env_data = get_env_data()
+    indexer_config = MongoDBIndexerConfig(
+        database="non-existent-database", collection=SOURCE_COLLECTION
+    )
+    connection_config = MongoDBConnectionConfig(
+        access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
+    )
+    indexer = MongoDBIndexer(connection_config=connection_config, index_config=indexer_config)
+    with pytest.raises(SourceConnectionError):
+        indexer.precheck()
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("MONGODB_URI", "MONGODB_DATABASE")
+def test_mongodb_indexer_precheck_fail_no_collection():
+    env_data = get_env_data()
+    indexer_config = MongoDBIndexerConfig(
+        database=env_data.database, collection="non-existent-collection"
+    )
+    connection_config = MongoDBConnectionConfig(
+        access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
+    )
+    indexer = MongoDBIndexer(connection_config=connection_config, index_config=indexer_config)
+    with pytest.raises(SourceConnectionError):
+        indexer.precheck()
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@requires_env("MONGODB_URI", "MONGODB_DATABASE")
+async def test_mongodb_destination(
+    upload_file: Path,
+    destination_collection: Collection,
+    tmp_path: Path,
+):
+    env_data = get_env_data()
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=CONNECTOR_TYPE,
+        identifier="mongodb_mock_id",
+    )
+    connection_config = MongoDBConnectionConfig(
+        access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
+    )
+
+    upload_config = MongoDBUploaderConfig(
+        database=env_data.database,
+        collection=destination_collection.name,
+    )
+    uploader = MongoDBUploader(connection_config=connection_config, upload_config=upload_config)
+    uploader.precheck()
+    uploader.run(path=upload_file, file_data=file_data)
+
+    with upload_file.open() as f:
+        staged_elements = json.load(f)
+    expected_records = len(staged_elements)
+    validate_collection_count(collection=destination_collection, expected_records=expected_records)
+    first_element = staged_elements[0]
+    validate_collection_vector(
+        collection=destination_collection,
+        embedding=first_element["embeddings"],
+        text=first_element["text"],
+    )
+
+    uploader.run(path=upload_file, file_data=file_data)
+    validate_collection_count(collection=destination_collection, expected_records=expected_records)
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+def test_mongodb_uploader_precheck_fail_no_host():
+    upload_config = MongoDBUploaderConfig(
+        database="database",
+        collection="collection",
+    )
+    connection_config = MongoDBConnectionConfig(
+        access_config=MongoDBAccessConfig(uri="mongodb+srv://ingest-test.hgaig.mongodb"),
+    )
+    uploader = MongoDBUploader(connection_config=connection_config, upload_config=upload_config)
+    with pytest.raises(DestinationConnectionError):
+        uploader.precheck()
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@requires_env("MONGODB_URI", "MONGODB_DATABASE")
+def test_mongodb_uploader_precheck_fail_no_database():
+    env_data = get_env_data()
+    upload_config = MongoDBUploaderConfig(
+        database="database",
+        collection="collection",
+    )
+    connection_config = MongoDBConnectionConfig(
+        access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
+    )
+    uploader = MongoDBUploader(connection_config=connection_config, upload_config=upload_config)
+    with pytest.raises(DestinationConnectionError):
+        uploader.precheck()
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@requires_env("MONGODB_URI", "MONGODB_DATABASE")
+def test_mongodb_uploader_precheck_fail_no_collection():
+    env_data = get_env_data()
+    upload_config = MongoDBUploaderConfig(
+        database=env_data.database,
+        collection="collection",
+    )
+    connection_config = MongoDBConnectionConfig(
+        access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
+    )
+    uploader = MongoDBUploader(connection_config=connection_config, upload_config=upload_config)
+    with pytest.raises(DestinationConnectionError):
+        uploader.precheck()
test/integration/connectors/test_pinecone.py

@@ -1,7 +1,9 @@
 import json
 import os
+import re
 import time
 from pathlib import Path
+from typing import Generator
 from uuid import uuid4

 import pytest
@@ -12,6 +14,7 @@ from test.integration.connectors.utils.constants import (
     DESTINATION_TAG,
 )
 from test.integration.utils import requires_env
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connectors.pinecone import (
@@ -24,6 +27,12 @@ from unstructured_ingest.v2.processes.connectors.pinecone import (
     PineconeUploadStagerConfig,
 )

+METADATA_BYTES_LIMIT = (
+    40960  # 40KB https://docs.pinecone.io/reference/quotas-and-limits#hard-limits
+)
+VECTOR_DIMENSION = 384
+SPEC = {"serverless": {"cloud": "aws", "region": "us-east-1"}}
+ALLOWED_METADATA_FIELD = "text"
 API_KEY = "PINECONE_API_KEY"


@@ -62,7 +71,7 @@ def wait_for_ready(client: Pinecone, index_name: str, timeout=60, interval=1) ->


 @pytest.fixture
-def pinecone_index() -> str:
+def pinecone_index() -> Generator[str, None, None]:
     pinecone = Pinecone(api_key=get_api_key())
     random_id = str(uuid4()).split("-")[0]
     index_name = f"ingest-test-{random_id}"
@@ -159,3 +168,46 @@ async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp
     validate_pinecone_index(
         index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
     )
+
+
+@requires_env(API_KEY)
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+def test_large_metadata(pinecone_index: str, tmp_path: Path, upload_file: Path):
+    stager = PineconeUploadStager()
+    uploader = PineconeUploader(
+        connection_config=PineconeConnectionConfig(
+            access_config=PineconeAccessConfig(api_key=get_api_key()),
+            index_name=pinecone_index,
+        ),
+        upload_config=PineconeUploaderConfig(),
+    )
+    large_metadata_upload_file = tmp_path / "mock-upload-file.pdf.json"
+    large_metadata = {ALLOWED_METADATA_FIELD: "0" * 2 * METADATA_BYTES_LIMIT}
+
+    with open(upload_file) as file:
+        elements = json.load(file)
+
+    with open(large_metadata_upload_file, "w") as file:
+        mock_element = elements[0]
+        mock_element["metadata"] = large_metadata
+        json.dump([mock_element], file)
+
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(
+            fullpath=large_metadata_upload_file.name, filename=large_metadata_upload_file.name
+        ),
+        connector_type=CONNECTOR_TYPE,
+        identifier="mock-file-data",
+    )
+    staged_file = stager.run(
+        file_data, large_metadata_upload_file, tmp_path, large_metadata_upload_file.name
+    )
+    try:
+        uploader.run(staged_file, file_data)
+    except DestinationConnectionError as e:
+        error_line = r"Metadata size is \d+ bytes, which exceeds the limit of \d+ bytes per vector"
+        if re.search(re.compile(error_line), str(e)) is None:
+            raise e
+        raise pytest.fail("Upload request failed due to metadata exceeding limits.")
+
+    validate_pinecone_index(pinecone_index, 1, interval=5)
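The new test_large_metadata exercise targets Pinecone's hard limit of 40960 bytes (40 KB) of metadata per vector: it stages a mock element whose "text" metadata field alone is 2 x 40960 = 81920 bytes and fails the test if the upload is rejected for exceeding the metadata limit, then checks that exactly one vector landed in the index. A small, hypothetical check of the same arithmetic, not part of the diff:

# Hypothetical sketch: approximate serialized metadata size against the 40 KB limit above.
import json

METADATA_BYTES_LIMIT = 40960  # mirrors the constant added in test_pinecone.py
oversized = {"text": "0" * 2 * METADATA_BYTES_LIMIT}
assert len(json.dumps(oversized).encode("utf-8")) > METADATA_BYTES_LIMIT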
test/integration/connectors/utils/docker.py

@@ -1,9 +1,43 @@
 import time
 from contextlib import contextmanager
-from typing import Optional
+from typing import Optional, Union

 import docker
 from docker.models.containers import Container
+from pydantic import BaseModel, Field, field_serializer
+
+
+class HealthCheck(BaseModel):
+    test: Union[str, list[str]]
+    interval: int = Field(
+        gt=0, default=30, description="The time to wait between checks in seconds."
+    )
+    timeout: int = Field(
+        gt=0, default=30, description="The time to wait before considering the check to have hung."
+    )
+    retries: int = Field(
+        gt=0,
+        default=3,
+        description="The number of consecutive failures needed "
+        "to consider a container as unhealthy.",
+    )
+    start_period: int = Field(
+        gt=0,
+        default=0,
+        description="Start period for the container to initialize before starting health-retries countdown in seconds.",  # noqa: E501
+    )
+
+    @field_serializer("interval")
+    def serialize_interval(self, interval: int) -> int:
+        return int(interval * 10e8)
+
+    @field_serializer("timeout")
+    def serialize_timeout(self, timeout: int) -> int:
+        return int(timeout * 10e8)
+
+    @field_serializer("start_period")
+    def serialize_start_period(self, start_period: int) -> int:
+        return int(start_period * 10e8)


 def get_container(
@@ -12,7 +46,7 @@ def get_container(
     ports: dict,
     environment: Optional[dict] = None,
     volumes: Optional[dict] = None,
-    healthcheck: Optional[
+    healthcheck: Optional[HealthCheck] = None,
 ) -> Container:
     run_kwargs = {
         "image": image,
@@ -24,25 +58,49 @@ def get_container(
     if volumes:
         run_kwargs["volumes"] = volumes
     if healthcheck:
-        run_kwargs["healthcheck"] = healthcheck
+        run_kwargs["healthcheck"] = healthcheck.model_dump()
     container: Container = docker_client.containers.run(**run_kwargs)
     return container


-def
-
+def get_healthcheck(container: Container) -> Optional[HealthCheck]:
+    healthcheck_config = container.attrs.get("Config", {}).get("Healthcheck", None)
+    if not healthcheck_config:
+        return None
+    healthcheck_data = {
+        "test": healthcheck_config["Test"],
+    }
+    if interval := healthcheck_config.get("Interval"):
+        healthcheck_data["interval"] = interval / 10e8
+    if start_period := healthcheck_config.get("StartPeriod"):
+        healthcheck_data["start_period"] = start_period / 10e8
+    if retries := healthcheck_config.get("Retries"):
+        healthcheck_data["retries"] = retries
+    return HealthCheck.model_validate(healthcheck_data)


-def healthcheck_wait(
+def healthcheck_wait(
+    container: Container, retries: int = 30, interval: int = 1, start_period: Optional[int] = None
+) -> None:
+    if start_period:
+        time.sleep(start_period)
     health = container.health
-
-    while health != "healthy" and
-
+    tries = 0
+    while health != "healthy" and tries < retries:
+        tries += 1
+        logs = container.attrs.get("State", {}).get("Health", {}).get("Log")
+        latest_log = logs[-1] if logs else None
+        print(
+            f"attempt {tries} - waiting for docker container "
+            f"to be healthy: {health} latest log: {latest_log}"
+        )
+        time.sleep(interval)
         container.reload()
         health = container.health
     if health != "healthy":
-
-
+        logs = container.attrs.get("State", {}).get("Health", {}).get("Log")
+        latest_log = logs[-1] if logs else None
+        raise TimeoutError(f"Docker container never came up healthy: {latest_log}")


 @contextmanager
@@ -51,11 +109,13 @@ def container_context(
     ports: dict,
     environment: Optional[dict] = None,
     volumes: Optional[dict] = None,
-    healthcheck: Optional[
-
+    healthcheck: Optional[HealthCheck] = None,
+    healthcheck_retries: int = 30,
     docker_client: Optional[docker.DockerClient] = None,
 ):
     docker_client = docker_client or docker.from_env()
+    print(f"pulling image {image}")
+    docker_client.images.pull(image)
     container: Optional[Container] = None
     try:
         container = get_container(
@@ -66,8 +126,14 @@ def container_context(
             volumes=volumes,
             healthcheck=healthcheck,
         )
-        if
-
+        if healthcheck_data := get_healthcheck(container):
+            # Mirror whatever healthcheck config set on container
+            healthcheck_wait(
+                container=container,
+                retries=healthcheck_retries,
+                start_period=healthcheck_data.start_period,
+                interval=healthcheck_data.interval,
+            )
         yield container
     except AssertionError as e:
         if container:
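The HealthCheck model replaces the previously untyped healthcheck argument: its duration fields are declared in seconds, and the field serializers multiply by 10e8 (one billion) on model_dump() because Docker's healthcheck API expresses Interval, Timeout, and StartPeriod in nanoseconds; get_healthcheck performs the inverse conversion when reading a container's configured healthcheck back out of container.attrs, so container_context can wait with the same cadence. A hypothetical usage sketch of the new helpers (the image, ports, and environment values are illustrative, not taken from this diff):

# Hypothetical usage sketch; image/ports/environment values are illustrative only.
from test.integration.connectors.utils.docker import HealthCheck, container_context

healthcheck = HealthCheck(
    test=["CMD-SHELL", "pg_isready -U postgres"],
    interval=1,  # seconds here; serialized to nanoseconds via model_dump()
    timeout=5,
    retries=10,
)
with container_context(
    image="postgres:16",
    ports={"5432/tcp": 5432},
    environment={"POSTGRES_PASSWORD": "password"},
    healthcheck=healthcheck,
) as container:
    # container_context has already waited for Docker to report the container healthy
    print(container.status)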
test/integration/connectors/utils/validation.py

@@ -240,6 +240,10 @@ def update_fixtures(
     # Rewrite the current file data
     if save_filedata:
         file_data_output_path = output_dir / "file_data"
+        print(
+            f"Writing {len(all_file_data)} file data to "
+            f"saved fixture location {file_data_output_path}"
+        )
         file_data_output_path.mkdir(parents=True, exist_ok=True)
         for file_data in all_file_data:
             file_data_path = file_data_output_path / f"{file_data.identifier}.json"
@@ -256,6 +260,10 @@ def update_fixtures(
     # If applicable, save raw downloads
     if save_downloads:
         raw_download_output_path = output_dir / "downloads"
+        print(
+            f"Writing {len(download_files)} downloaded files to "
+            f"saved fixture location {raw_download_output_path}"
+        )
         shutil.copytree(download_dir, raw_download_output_path)


@@ -328,6 +336,7 @@ async def source_connector_validation(
         postdownload_file_data = replace(resp["file_data"])
         all_postdownload_file_data.append(postdownload_file_data)
     if not overwrite_fixtures:
+        print("Running validation")
         run_all_validations(
             configs=configs,
             predownload_file_data=all_predownload_file_data,
@@ -336,6 +345,7 @@ async def source_connector_validation(
             test_output_dir=test_output_dir,
         )
     else:
+        print("Running fixtures update")
         update_fixtures(
             output_dir=test_output_dir,
             download_dir=download_dir,
File without changes

test/integration/connectors/weaviate/conftest.py (new file)

@@ -0,0 +1,15 @@
+import json
+from pathlib import Path
+
+import pytest
+
+
+@pytest.fixture
+def collections_schema_config() -> dict:
+    int_test_dir = Path(__file__).parent
+    assets_dir = int_test_dir / "assets"
+    config_file = assets_dir / "elements.json"
+    assert config_file.exists()
+    assert config_file.is_file()
+    with config_file.open() as config_data:
+        return json.load(config_data)