unstructured-ingest 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: the registry flags this version of unstructured-ingest as possibly problematic.

Files changed (55)
  1. test/integration/connectors/elasticsearch/__init__.py +0 -0
  2. test/integration/connectors/elasticsearch/conftest.py +34 -0
  3. test/integration/connectors/elasticsearch/test_elasticsearch.py +308 -0
  4. test/integration/connectors/elasticsearch/test_opensearch.py +302 -0
  5. test/integration/connectors/sql/test_postgres.py +10 -4
  6. test/integration/connectors/sql/test_singlestore.py +8 -4
  7. test/integration/connectors/sql/test_snowflake.py +10 -6
  8. test/integration/connectors/sql/test_sqlite.py +4 -4
  9. test/integration/connectors/test_astradb.py +50 -3
  10. test/integration/connectors/test_delta_table.py +46 -0
  11. test/integration/connectors/test_kafka.py +40 -6
  12. test/integration/connectors/test_lancedb.py +210 -0
  13. test/integration/connectors/test_milvus.py +141 -0
  14. test/integration/connectors/test_mongodb.py +332 -0
  15. test/integration/connectors/test_pinecone.py +53 -1
  16. test/integration/connectors/utils/docker.py +81 -15
  17. test/integration/connectors/utils/validation.py +10 -0
  18. test/integration/connectors/weaviate/__init__.py +0 -0
  19. test/integration/connectors/weaviate/conftest.py +15 -0
  20. test/integration/connectors/weaviate/test_local.py +131 -0
  21. unstructured_ingest/__version__.py +1 -1
  22. unstructured_ingest/pipeline/reformat/embedding.py +1 -1
  23. unstructured_ingest/utils/data_prep.py +9 -1
  24. unstructured_ingest/v2/processes/connectors/__init__.py +3 -16
  25. unstructured_ingest/v2/processes/connectors/astradb.py +2 -2
  26. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +4 -0
  27. unstructured_ingest/v2/processes/connectors/delta_table.py +20 -4
  28. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  29. unstructured_ingest/v2/processes/connectors/{elasticsearch.py → elasticsearch/elasticsearch.py} +92 -46
  30. unstructured_ingest/v2/processes/connectors/{opensearch.py → elasticsearch/opensearch.py} +1 -1
  31. unstructured_ingest/v2/processes/connectors/google_drive.py +1 -1
  32. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +6 -0
  33. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +17 -0
  34. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  35. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  36. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  37. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +161 -0
  38. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  39. unstructured_ingest/v2/processes/connectors/milvus.py +72 -27
  40. unstructured_ingest/v2/processes/connectors/mongodb.py +122 -111
  41. unstructured_ingest/v2/processes/connectors/pinecone.py +24 -7
  42. unstructured_ingest/v2/processes/connectors/sql/sql.py +97 -26
  43. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +25 -0
  44. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +164 -0
  45. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  46. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  47. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +299 -0
  48. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/METADATA +19 -19
  49. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/RECORD +54 -33
  50. unstructured_ingest/v2/processes/connectors/weaviate.py +0 -242
  51. /test/integration/connectors/{test_azure_cog_search.py → test_azure_ai_search.py} +0 -0
  52. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/LICENSE.md +0 -0
  53. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/WHEEL +0 -0
  54. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/entry_points.txt +0 -0
  55. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/top_level.txt +0 -0
test/integration/connectors/test_mongodb.py
@@ -0,0 +1,332 @@
+ import json
+ import os
+ import time
+ import uuid
+ from contextlib import contextmanager
+ from pathlib import Path
+ from typing import Generator
+
+ import pytest
+ from pydantic import BaseModel, SecretStr
+ from pymongo.collection import Collection
+ from pymongo.database import Database
+ from pymongo.mongo_client import MongoClient
+ from pymongo.operations import SearchIndexModel
+
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
+ from test.integration.connectors.utils.validation import (
+     ValidationConfigs,
+     source_connector_validation,
+ )
+ from test.integration.utils import requires_env
+ from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+ from unstructured_ingest.v2.processes.connectors.mongodb import (
+     CONNECTOR_TYPE,
+     MongoDBAccessConfig,
+     MongoDBConnectionConfig,
+     MongoDBDownloader,
+     MongoDBDownloaderConfig,
+     MongoDBIndexer,
+     MongoDBIndexerConfig,
+     MongoDBUploader,
+     MongoDBUploaderConfig,
+ )
+
+ SOURCE_COLLECTION = "sample-mongodb-data"
+
+
+ class EnvData(BaseModel):
+     uri: SecretStr
+     database: str
+
+
+ def get_env_data() -> EnvData:
+     uri = os.getenv("MONGODB_URI")
+     assert uri
+     database = os.getenv("MONGODB_DATABASE")
+     assert database
+     return EnvData(uri=uri, database=database)
+
+
+ @contextmanager
+ def get_client() -> Generator[MongoClient, None, None]:
+     uri = get_env_data().uri.get_secret_value()
+     with MongoClient(uri) as client:
+         assert client.admin.command("ping")
+         yield client
+
+
+ def wait_for_collection(
+     database: Database, collection_name: str, retries: int = 10, interval: int = 1
+ ):
+     collections = database.list_collection_names()
+     attempts = 0
+     while collection_name not in collections and attempts < retries:
+         attempts += 1
+         print(
+             "Waiting for collection {} to be recognized: {}".format(
+                 collection_name, ", ".join(collections)
+             )
+         )
+         time.sleep(interval)
+         collections = database.list_collection_names()
+     if collection_name not in collections:
+         raise TimeoutError(f"Collection {collection_name} was not recognized")
+
+
+ def get_search_index_status(collection: Collection, index_name: str) -> str:
+     search_indexes = collection.list_search_indexes(name=index_name)
+     search_index = list(search_indexes)[0]
+     return search_index["status"]
+
+
+ def wait_for_search_index(
+     collection: Collection, index_name: str, retries: int = 60, interval: int = 1
+ ):
+     current_status = get_search_index_status(collection, index_name)
+     attempts = 0
+     while current_status != "READY" and attempts < retries:
+         attempts += 1
+         print(f"attempt {attempts}: waiting for search index to be READY: {current_status}")
+         time.sleep(interval)
+         current_status = get_search_index_status(collection, index_name)
+
+     if current_status != "READY":
+         raise TimeoutError("search index never detected as READY")
+
+
+ @pytest.fixture
+ def destination_collection() -> Generator[Collection, None, None]:
+     env_data = get_env_data()
+     collection_name = f"utic-test-output-{uuid.uuid4()}"
+     with get_client() as client:
+         database = client[env_data.database]
+         print(f"creating collection in database {database}: {collection_name}")
+         collection = database.create_collection(name=collection_name)
+         search_index_name = "embeddings"
+         collection.create_search_index(
+             model=SearchIndexModel(
+                 name=search_index_name,
+                 definition={
+                     "mappings": {
+                         "dynamic": True,
+                         "fields": {
+                             "embeddings": [
+                                 {"type": "knnVector", "dimensions": 384, "similarity": "euclidean"}
+                             ]
+                         },
+                     }
+                 },
+             )
+         )
+         collection.create_index("record_id")
+         wait_for_collection(database=database, collection_name=collection_name)
+         wait_for_search_index(collection=collection, index_name=search_index_name)
+         try:
+             yield collection
+         finally:
+             print(f"deleting collection: {collection_name}")
+             collection.drop()
+
+
+ def validate_collection_count(
+     collection: Collection, expected_records: int, retries: int = 10, interval: int = 1
+ ) -> None:
+     count = collection.count_documents(filter={})
+     attempt = 0
+     while count != expected_records and attempt < retries:
+         attempt += 1
+         print(f"attempt {attempt} to get count of collection {count} to match {expected_records}")
+         time.sleep(interval)
+         count = collection.count_documents(filter={})
+     assert (
+         count == expected_records
+     ), f"expected count ({expected_records}) does not match how many records were found: {count}"
+
+
+ def validate_collection_vector(
+     collection: Collection, embedding: list[float], text: str, retries: int = 30, interval: int = 1
+ ) -> None:
+     pipeline = [
+         {
+             "$vectorSearch": {
+                 "index": "embeddings",
+                 "path": "embeddings",
+                 "queryVector": embedding,
+                 "numCandidates": 150,
+                 "limit": 10,
+             },
+         },
+         {"$project": {"_id": 0, "text": 1, "score": {"$meta": "vectorSearchScore"}}},
+     ]
+     attempts = 0
+     results = list(collection.aggregate(pipeline=pipeline))
+     while not results and attempts < retries:
+         attempts += 1
+         print(f"attempt {attempts}, waiting for valid results: {results}")
+         time.sleep(interval)
+         results = list(collection.aggregate(pipeline=pipeline))
+     if not results:
+         raise TimeoutError("Timed out waiting for valid results")
+     print(f"found results on attempt {attempts}")
+     top_result = results[0]
+     assert top_result["score"] == 1.0, "score detected should be 1: {}".format(top_result["score"])
+     assert top_result["text"] == text, "text detected should be {}, found: {}".format(
+         text, top_result["text"]
+     )
+     for r in results[1:]:
+         assert r["score"] < 1.0, "score detected should be less than 1: {}".format(r["score"])
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+ @requires_env("MONGODB_URI", "MONGODB_DATABASE")
+ async def test_mongodb_source(temp_dir: Path):
+     env_data = get_env_data()
+     indexer_config = MongoDBIndexerConfig(database=env_data.database, collection=SOURCE_COLLECTION)
+     download_config = MongoDBDownloaderConfig(download_dir=temp_dir)
+     connection_config = MongoDBConnectionConfig(
+         access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
+     )
+     indexer = MongoDBIndexer(connection_config=connection_config, index_config=indexer_config)
+     downloader = MongoDBDownloader(
+         connection_config=connection_config, download_config=download_config
+     )
+     await source_connector_validation(
+         indexer=indexer,
+         downloader=downloader,
+         configs=ValidationConfigs(
+             test_id=CONNECTOR_TYPE, expected_num_files=4, validate_downloaded_files=True
+         ),
+     )
+
+
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+ def test_mongodb_indexer_precheck_fail_no_host():
+     indexer_config = MongoDBIndexerConfig(
+         database="non-existent-database", collection="non-existent-database"
+     )
+     connection_config = MongoDBConnectionConfig(
+         access_config=MongoDBAccessConfig(uri="mongodb+srv://ingest-test.hgaig.mongodb"),
+     )
+     indexer = MongoDBIndexer(connection_config=connection_config, index_config=indexer_config)
+     with pytest.raises(SourceConnectionError):
+         indexer.precheck()
+
+
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+ @requires_env("MONGODB_URI", "MONGODB_DATABASE")
+ def test_mongodb_indexer_precheck_fail_no_database():
+     env_data = get_env_data()
+     indexer_config = MongoDBIndexerConfig(
+         database="non-existent-database", collection=SOURCE_COLLECTION
+     )
+     connection_config = MongoDBConnectionConfig(
+         access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
+     )
+     indexer = MongoDBIndexer(connection_config=connection_config, index_config=indexer_config)
+     with pytest.raises(SourceConnectionError):
+         indexer.precheck()
+
+
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+ @requires_env("MONGODB_URI", "MONGODB_DATABASE")
+ def test_mongodb_indexer_precheck_fail_no_collection():
+     env_data = get_env_data()
+     indexer_config = MongoDBIndexerConfig(
+         database=env_data.database, collection="non-existent-collection"
+     )
+     connection_config = MongoDBConnectionConfig(
+         access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
+     )
+     indexer = MongoDBIndexer(connection_config=connection_config, index_config=indexer_config)
+     with pytest.raises(SourceConnectionError):
+         indexer.precheck()
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ @requires_env("MONGODB_URI", "MONGODB_DATABASE")
+ async def test_mongodb_destination(
+     upload_file: Path,
+     destination_collection: Collection,
+     tmp_path: Path,
+ ):
+     env_data = get_env_data()
+     file_data = FileData(
+         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+         connector_type=CONNECTOR_TYPE,
+         identifier="mongodb_mock_id",
+     )
+     connection_config = MongoDBConnectionConfig(
+         access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
+     )
+
+     upload_config = MongoDBUploaderConfig(
+         database=env_data.database,
+         collection=destination_collection.name,
+     )
+     uploader = MongoDBUploader(connection_config=connection_config, upload_config=upload_config)
+     uploader.precheck()
+     uploader.run(path=upload_file, file_data=file_data)
+
+     with upload_file.open() as f:
+         staged_elements = json.load(f)
+     expected_records = len(staged_elements)
+     validate_collection_count(collection=destination_collection, expected_records=expected_records)
+     first_element = staged_elements[0]
+     validate_collection_vector(
+         collection=destination_collection,
+         embedding=first_element["embeddings"],
+         text=first_element["text"],
+     )
+
+     uploader.run(path=upload_file, file_data=file_data)
+     validate_collection_count(collection=destination_collection, expected_records=expected_records)
+
+
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ def test_mongodb_uploader_precheck_fail_no_host():
+     upload_config = MongoDBUploaderConfig(
+         database="database",
+         collection="collection",
+     )
+     connection_config = MongoDBConnectionConfig(
+         access_config=MongoDBAccessConfig(uri="mongodb+srv://ingest-test.hgaig.mongodb"),
+     )
+     uploader = MongoDBUploader(connection_config=connection_config, upload_config=upload_config)
+     with pytest.raises(DestinationConnectionError):
+         uploader.precheck()
+
+
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ @requires_env("MONGODB_URI", "MONGODB_DATABASE")
+ def test_mongodb_uploader_precheck_fail_no_database():
+     env_data = get_env_data()
+     upload_config = MongoDBUploaderConfig(
+         database="database",
+         collection="collection",
+     )
+     connection_config = MongoDBConnectionConfig(
+         access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
+     )
+     uploader = MongoDBUploader(connection_config=connection_config, upload_config=upload_config)
+     with pytest.raises(DestinationConnectionError):
+         uploader.precheck()
+
+
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ @requires_env("MONGODB_URI", "MONGODB_DATABASE")
+ def test_mongodb_uploader_precheck_fail_no_collection():
+     env_data = get_env_data()
+     upload_config = MongoDBUploaderConfig(
+         database=env_data.database,
+         collection="collection",
+     )
+     connection_config = MongoDBConnectionConfig(
+         access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
+     )
+     uploader = MongoDBUploader(connection_config=connection_config, upload_config=upload_config)
+     with pytest.raises(DestinationConnectionError):
+         uploader.precheck()
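
The new test file repeats the same poll-with-retries loop four times (collection visibility, search-index readiness, document counts, vector-search results). A hypothetical helper along these lines could capture the shared pattern; it is a sketch, not part of the release:

    import time
    from typing import Callable, TypeVar

    T = TypeVar("T")


    def poll_until(
        get_value: Callable[[], T],
        is_ready: Callable[[T], bool],
        describe: str,
        retries: int = 10,
        interval: int = 1,
    ) -> T:
        # Re-evaluate get_value() until is_ready(value) holds or retries run out,
        # mirroring wait_for_collection / wait_for_search_index above.
        value = get_value()
        attempts = 0
        while not is_ready(value) and attempts < retries:
            attempts += 1
            print(f"attempt {attempts}: waiting for {describe}: {value!r}")
            time.sleep(interval)
            value = get_value()
        if not is_ready(value):
            raise TimeoutError(f"timed out waiting for {describe}")
        return value

With such a helper, wait_for_search_index would reduce to poll_until(lambda: get_search_index_status(collection, index_name), lambda status: status == "READY", "search index READY", retries=60).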
test/integration/connectors/test_pinecone.py
@@ -1,7 +1,9 @@
  import json
  import os
+ import re
  import time
  from pathlib import Path
+ from typing import Generator
  from uuid import uuid4

  import pytest
@@ -12,6 +14,7 @@ from test.integration.connectors.utils.constants import (
      DESTINATION_TAG,
  )
  from test.integration.utils import requires_env
+ from unstructured_ingest.error import DestinationConnectionError
  from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.processes.connectors.pinecone import (
@@ -24,6 +27,12 @@ from unstructured_ingest.v2.processes.connectors.pinecone import (
      PineconeUploadStagerConfig,
  )

+ METADATA_BYTES_LIMIT = (
+     40960  # 40KB https://docs.pinecone.io/reference/quotas-and-limits#hard-limits
+ )
+ VECTOR_DIMENSION = 384
+ SPEC = {"serverless": {"cloud": "aws", "region": "us-east-1"}}
+ ALLOWED_METADATA_FIELD = "text"
  API_KEY = "PINECONE_API_KEY"


@@ -62,7 +71,7 @@ def wait_for_ready(client: Pinecone, index_name: str, timeout=60, interval=1) ->


  @pytest.fixture
- def pinecone_index() -> str:
+ def pinecone_index() -> Generator[str, None, None]:
      pinecone = Pinecone(api_key=get_api_key())
      random_id = str(uuid4()).split("-")[0]
      index_name = f"ingest-test-{random_id}"
@@ -159,3 +168,46 @@ async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp
      validate_pinecone_index(
          index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
      )
+
+
+ @requires_env(API_KEY)
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ def test_large_metadata(pinecone_index: str, tmp_path: Path, upload_file: Path):
+     stager = PineconeUploadStager()
+     uploader = PineconeUploader(
+         connection_config=PineconeConnectionConfig(
+             access_config=PineconeAccessConfig(api_key=get_api_key()),
+             index_name=pinecone_index,
+         ),
+         upload_config=PineconeUploaderConfig(),
+     )
+     large_metadata_upload_file = tmp_path / "mock-upload-file.pdf.json"
+     large_metadata = {ALLOWED_METADATA_FIELD: "0" * 2 * METADATA_BYTES_LIMIT}
+
+     with open(upload_file) as file:
+         elements = json.load(file)
+
+     with open(large_metadata_upload_file, "w") as file:
+         mock_element = elements[0]
+         mock_element["metadata"] = large_metadata
+         json.dump([mock_element], file)
+
+     file_data = FileData(
+         source_identifiers=SourceIdentifiers(
+             fullpath=large_metadata_upload_file.name, filename=large_metadata_upload_file.name
+         ),
+         connector_type=CONNECTOR_TYPE,
+         identifier="mock-file-data",
+     )
+     staged_file = stager.run(
+         file_data, large_metadata_upload_file, tmp_path, large_metadata_upload_file.name
+     )
+     try:
+         uploader.run(staged_file, file_data)
+     except DestinationConnectionError as e:
+         error_line = r"Metadata size is \d+ bytes, which exceeds the limit of \d+ bytes per vector"
+         if re.search(re.compile(error_line), str(e)) is None:
+             raise e
+         pytest.fail("Upload request failed due to metadata exceeding limits.")
+
+     validate_pinecone_index(pinecone_index, 1, interval=5)
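
The large-metadata test above builds a metadata payload twice the size of Pinecone's 40,960-byte per-vector hard limit and treats the quoted "Metadata size is N bytes" error as a test failure; in other words, the connector is expected to keep each vector's metadata under the limit itself. A minimal sketch of the size check this implies, assuming JSON serialization comparable to what the API measures (the helper is illustrative, not the connector's actual code):

    import json

    METADATA_BYTES_LIMIT = 40960  # Pinecone per-vector hard limit


    def metadata_within_limit(metadata: dict) -> bool:
        # Approximate the serialized size the Pinecone API would count.
        return len(json.dumps(metadata).encode("utf-8")) <= METADATA_BYTES_LIMIT


    # The mock element from the test is roughly double the limit:
    assert not metadata_within_limit({"text": "0" * 2 * METADATA_BYTES_LIMIT})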
test/integration/connectors/utils/docker.py
@@ -1,9 +1,43 @@
  import time
  from contextlib import contextmanager
- from typing import Optional
+ from typing import Optional, Union

  import docker
  from docker.models.containers import Container
+ from pydantic import BaseModel, Field, field_serializer
+
+
+ class HealthCheck(BaseModel):
+     test: Union[str, list[str]]
+     interval: int = Field(
+         gt=0, default=30, description="The time to wait between checks in seconds."
+     )
+     timeout: int = Field(
+         gt=0, default=30, description="The time to wait before considering the check to have hung."
+     )
+     retries: int = Field(
+         gt=0,
+         default=3,
+         description="The number of consecutive failures needed "
+         "to consider a container as unhealthy.",
+     )
+     start_period: int = Field(
+         gt=0,
+         default=0,
+         description="Start period for the container to initialize before starting health-retries countdown in seconds.",  # noqa: E501
+     )
+
+     @field_serializer("interval")
+     def serialize_interval(self, interval: int) -> int:
+         return int(interval * 10e8)
+
+     @field_serializer("timeout")
+     def serialize_timeout(self, timeout: int) -> int:
+         return int(timeout * 10e8)
+
+     @field_serializer("start_period")
+     def serialize_start_period(self, start_period: int) -> int:
+         return int(start_period * 10e8)


  def get_container(
@@ -12,7 +46,7 @@ def get_container(
      ports: dict,
      environment: Optional[dict] = None,
      volumes: Optional[dict] = None,
-     healthcheck: Optional[dict] = None,
+     healthcheck: Optional[HealthCheck] = None,
  ) -> Container:
      run_kwargs = {
          "image": image,
@@ -24,25 +58,49 @@ def get_container(
      if volumes:
          run_kwargs["volumes"] = volumes
      if healthcheck:
-         run_kwargs["healthcheck"] = healthcheck
+         run_kwargs["healthcheck"] = healthcheck.model_dump()
      container: Container = docker_client.containers.run(**run_kwargs)
      return container


- def has_healthcheck(container: Container) -> bool:
-     return container.attrs.get("Config", {}).get("Healthcheck", None) is not None
+ def get_healthcheck(container: Container) -> Optional[HealthCheck]:
+     healthcheck_config = container.attrs.get("Config", {}).get("Healthcheck", None)
+     if not healthcheck_config:
+         return None
+     healthcheck_data = {
+         "test": healthcheck_config["Test"],
+     }
+     if interval := healthcheck_config.get("Interval"):
+         healthcheck_data["interval"] = interval / 10e8
+     if start_period := healthcheck_config.get("StartPeriod"):
+         healthcheck_data["start_period"] = start_period / 10e8
+     if retries := healthcheck_config.get("Retries"):
+         healthcheck_data["retries"] = retries
+     return HealthCheck.model_validate(healthcheck_data)


- def healthcheck_wait(container: Container, timeout: int = 10) -> None:
+ def healthcheck_wait(
+     container: Container, retries: int = 30, interval: int = 1, start_period: Optional[int] = None
+ ) -> None:
+     if start_period:
+         time.sleep(start_period)
      health = container.health
-     start = time.time()
-     while health != "healthy" and time.time() - start < timeout:
-         time.sleep(1)
+     tries = 0
+     while health != "healthy" and tries < retries:
+         tries += 1
+         logs = container.attrs.get("State", {}).get("Health", {}).get("Log")
+         latest_log = logs[-1] if logs else None
+         print(
+             f"attempt {tries} - waiting for docker container "
+             f"to be healthy: {health} latest log: {latest_log}"
+         )
+         time.sleep(interval)
          container.reload()
          health = container.health
      if health != "healthy":
-         health_dict = container.attrs.get("State", {}).get("Health", {})
-         raise TimeoutError(f"Docker container never came up healthy: {health_dict}")
+         logs = container.attrs.get("State", {}).get("Health", {}).get("Log")
+         latest_log = logs[-1] if logs else None
+         raise TimeoutError(f"Docker container never came up healthy: {latest_log}")


  @contextmanager
@@ -51,11 +109,13 @@ def container_context(
      ports: dict,
      environment: Optional[dict] = None,
      volumes: Optional[dict] = None,
-     healthcheck: Optional[dict] = None,
-     healthcheck_timeout: int = 10,
+     healthcheck: Optional[HealthCheck] = None,
+     healthcheck_retries: int = 30,
      docker_client: Optional[docker.DockerClient] = None,
  ):
      docker_client = docker_client or docker.from_env()
+     print(f"pulling image {image}")
+     docker_client.images.pull(image)
      container: Optional[Container] = None
      try:
          container = get_container(
@@ -66,8 +126,14 @@ def container_context(
              volumes=volumes,
              healthcheck=healthcheck,
          )
-         if has_healthcheck(container):
-             healthcheck_wait(container=container, timeout=healthcheck_timeout)
+         if healthcheck_data := get_healthcheck(container):
+             # Mirror whatever healthcheck config is set on the container
+             healthcheck_wait(
+                 container=container,
+                 retries=healthcheck_retries,
+                 start_period=healthcheck_data.start_period,
+                 interval=healthcheck_data.interval,
+             )
          yield container
      except AssertionError as e:
          if container:
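
The field serializers on HealthCheck multiply seconds by 10e8 (i.e. 1e9, nanoseconds per second) because the Docker Engine API expresses healthcheck durations in nanoseconds, and get_healthcheck divides by the same factor when reading the config back off a container. A sketch of how a connector test might use the new API; the image, port, and probe command are placeholders, not taken from the release:

    from test.integration.connectors.utils.docker import HealthCheck, container_context

    healthcheck = HealthCheck(
        test=["CMD-SHELL", "curl -f http://localhost:9200 || exit 1"],  # placeholder probe
        interval=5,  # seconds; serialized to nanoseconds for the Docker API
        timeout=10,
        retries=5,
    )

    with container_context(
        image="docker.elastic.co/elasticsearch/elasticsearch:8.15.0",  # illustrative image
        ports={"9200/tcp": 9200},
        healthcheck=healthcheck,
    ) as container:
        # container_context has already polled container.health until "healthy"
        print(container.status)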
test/integration/connectors/utils/validation.py
@@ -240,6 +240,10 @@ def update_fixtures(
      # Rewrite the current file data
      if save_filedata:
          file_data_output_path = output_dir / "file_data"
+         print(
+             f"Writing {len(all_file_data)} file data to "
+             f"saved fixture location {file_data_output_path}"
+         )
          file_data_output_path.mkdir(parents=True, exist_ok=True)
          for file_data in all_file_data:
              file_data_path = file_data_output_path / f"{file_data.identifier}.json"
@@ -256,6 +260,10 @@ def update_fixtures(
      # If applicable, save raw downloads
      if save_downloads:
          raw_download_output_path = output_dir / "downloads"
+         print(
+             f"Writing {len(download_files)} downloaded files to "
+             f"saved fixture location {raw_download_output_path}"
+         )
          shutil.copytree(download_dir, raw_download_output_path)


@@ -328,6 +336,7 @@ async def source_connector_validation(
          postdownload_file_data = replace(resp["file_data"])
          all_postdownload_file_data.append(postdownload_file_data)
      if not overwrite_fixtures:
+         print("Running validation")
          run_all_validations(
              configs=configs,
              predownload_file_data=all_predownload_file_data,
@@ -336,6 +345,7 @@ async def source_connector_validation(
              test_output_dir=test_output_dir,
          )
      else:
+         print("Running fixtures update")
          update_fixtures(
              output_dir=test_output_dir,
              download_dir=download_dir,
test/integration/connectors/weaviate/__init__.py — file without changes (new empty file)
test/integration/connectors/weaviate/conftest.py
@@ -0,0 +1,15 @@
+ import json
+ from pathlib import Path
+
+ import pytest
+
+
+ @pytest.fixture
+ def collections_schema_config() -> dict:
+     int_test_dir = Path(__file__).parent
+     assets_dir = int_test_dir / "assets"
+     config_file = assets_dir / "elements.json"
+     assert config_file.exists()
+     assert config_file.is_file()
+     with config_file.open() as config_data:
+         return json.load(config_data)
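
Tests in the new weaviate suite can consume this fixture through normal pytest injection. A hypothetical example (the test name and assertion are illustrative):

    def test_schema_config_loads(collections_schema_config: dict):
        # The fixture has already asserted that the asset file exists
        # and parsed it as JSON.
        assert collections_schema_config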