unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest has been flagged as potentially problematic.

Files changed (87)
  1. test/integration/chunkers/test_chunkers.py +0 -11
  2. test/integration/connectors/conftest.py +11 -1
  3. test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
  4. test/integration/connectors/duckdb/conftest.py +14 -0
  5. test/integration/connectors/duckdb/test_duckdb.py +51 -44
  6. test/integration/connectors/duckdb/test_motherduck.py +37 -48
  7. test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
  8. test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
  9. test/integration/connectors/sql/test_postgres.py +103 -92
  10. test/integration/connectors/sql/test_singlestore.py +112 -100
  11. test/integration/connectors/sql/test_snowflake.py +142 -117
  12. test/integration/connectors/sql/test_sqlite.py +87 -76
  13. test/integration/connectors/test_astradb.py +62 -1
  14. test/integration/connectors/test_azure_ai_search.py +25 -3
  15. test/integration/connectors/test_chroma.py +120 -0
  16. test/integration/connectors/test_confluence.py +4 -4
  17. test/integration/connectors/test_delta_table.py +1 -0
  18. test/integration/connectors/test_kafka.py +6 -6
  19. test/integration/connectors/test_milvus.py +21 -0
  20. test/integration/connectors/test_mongodb.py +7 -4
  21. test/integration/connectors/test_neo4j.py +236 -0
  22. test/integration/connectors/test_pinecone.py +25 -1
  23. test/integration/connectors/test_qdrant.py +25 -2
  24. test/integration/connectors/test_s3.py +9 -6
  25. test/integration/connectors/utils/docker.py +6 -0
  26. test/integration/connectors/utils/validation/__init__.py +0 -0
  27. test/integration/connectors/utils/validation/destination.py +88 -0
  28. test/integration/connectors/utils/validation/equality.py +75 -0
  29. test/integration/connectors/utils/{validation.py → validation/source.py} +42 -98
  30. test/integration/connectors/utils/validation/utils.py +36 -0
  31. unstructured_ingest/__version__.py +1 -1
  32. unstructured_ingest/utils/chunking.py +11 -0
  33. unstructured_ingest/utils/data_prep.py +36 -0
  34. unstructured_ingest/v2/interfaces/__init__.py +3 -1
  35. unstructured_ingest/v2/interfaces/file_data.py +58 -14
  36. unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
  37. unstructured_ingest/v2/interfaces/uploader.py +11 -2
  38. unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
  39. unstructured_ingest/v2/pipeline/steps/download.py +5 -4
  40. unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
  41. unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
  42. unstructured_ingest/v2/pipeline/steps/index.py +4 -4
  43. unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
  44. unstructured_ingest/v2/pipeline/steps/stage.py +5 -3
  45. unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
  46. unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
  47. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  48. unstructured_ingest/v2/processes/connectors/astradb.py +43 -63
  49. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
  50. unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
  51. unstructured_ingest/v2/processes/connectors/couchbase.py +92 -93
  52. unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
  53. unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
  54. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
  55. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
  56. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +46 -75
  57. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
  58. unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
  59. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
  60. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
  61. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
  62. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
  63. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
  64. unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
  65. unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
  66. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
  67. unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
  68. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
  69. unstructured_ingest/v2/processes/connectors/local.py +13 -2
  70. unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
  71. unstructured_ingest/v2/processes/connectors/mongodb.py +99 -108
  72. unstructured_ingest/v2/processes/connectors/neo4j.py +383 -0
  73. unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
  74. unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
  75. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
  76. unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
  77. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
  78. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
  79. unstructured_ingest/v2/processes/connectors/sql/sql.py +72 -66
  80. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
  81. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
  82. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +20 -15
  83. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +87 -79
  84. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
  85. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
  86. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
  87. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0
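
The change that recurs across most connector tests below is the split of test/integration/connectors/utils/validation.py into a validation/ package (entries 26–30): ValidationConfigs becomes SourceValidationConfigs in validation/source.py, and a new validation/destination.py adds StagerValidationConfigs and stager_validation. A minimal sketch of how the tests call these helpers, assembled from the hunks below; the test_id value and the wrapper functions here are illustrative, not part of the package:

from pathlib import Path

from test.integration.connectors.utils.validation.destination import (
    StagerValidationConfigs,
    stager_validation,
)
from test.integration.connectors.utils.validation.source import (
    SourceValidationConfigs,
    source_connector_validation,
)


async def validate_source(indexer, downloader) -> None:
    # Source tests: ValidationConfigs was renamed to SourceValidationConfigs.
    await source_connector_validation(
        indexer=indexer,
        downloader=downloader,
        configs=SourceValidationConfigs(test_id="example", expected_num_files=5),
    )


def validate_stager(stager, upload_file: Path, tmp_path: Path) -> None:
    # Destination tests gain a shared stager check, parametrized over JSON
    # and NDJSON fixtures in the hunks below.
    stager_validation(
        configs=StagerValidationConfigs(test_id="example", expected_count=22),
        input_file=upload_file,
        stager=stager,
        tmp_dir=tmp_path,
    )
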
test/integration/connectors/test_chroma.py (new file)
@@ -0,0 +1,120 @@
+import json
+from pathlib import Path
+
+import chromadb
+import pytest
+from _pytest.fixtures import TopRequest
+
+from test.integration.connectors.utils.constants import (
+    DESTINATION_TAG,
+)
+from test.integration.connectors.utils.docker import HealthCheck, container_context
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.chroma import (
+    CONNECTOR_TYPE,
+    ChromaConnectionConfig,
+    ChromaUploader,
+    ChromaUploaderConfig,
+    ChromaUploadStager,
+    ChromaUploadStagerConfig,
+)
+
+
+@pytest.fixture
+def chroma_instance():
+    with container_context(
+        image="chromadb/chroma:latest",
+        ports={8000: 8000},
+        name="chroma_int_test",
+        healthcheck=HealthCheck(
+            interval=5,
+            timeout=10,
+            retries=3,
+            test="timeout 10s bash -c ':> /dev/tcp/127.0.0.1/8000' || exit 1",
+        ),
+    ) as ctx:
+        yield ctx
+
+
+def validate_collection(collection_name: str, num_embeddings: int):
+    print(f"Checking contents of Chroma collection: {collection_name}")
+
+    chroma_client = chromadb.HttpClient(
+        host="localhost",
+        port="8000",
+        tenant="default_tenant",
+        database="default_database",
+    )
+
+    collection = chroma_client.get_or_create_collection(name=collection_name)
+
+    number_of_embeddings = collection.count()
+    expected_embeddings = num_embeddings
+    print(
+        f"# of embeddings in collection vs expected: {number_of_embeddings}/{expected_embeddings}"
+    )
+
+    assert number_of_embeddings == expected_embeddings, (
+        f"Number of rows in generated table ({number_of_embeddings}) "
+        f"doesn't match expected value: {expected_embeddings}"
+    )
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+def test_chroma_destination(
+    upload_file: Path,
+    chroma_instance,
+    tmp_path: Path,
+):
+    collection_name = "test_collection"
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=CONNECTOR_TYPE,
+        identifier="mock file data",
+    )
+    stager = ChromaUploadStager(upload_stager_config=ChromaUploadStagerConfig())
+
+    uploader = ChromaUploader(
+        connection_config=ChromaConnectionConfig(
+            host="localhost",
+            port=8000,
+            tenant="default_tenant",
+            database="default_database",
+        ),
+        upload_config=ChromaUploaderConfig(collection_name=collection_name),
+    )
+    staged_filepath = stager.run(
+        elements_filepath=upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=upload_file.name,
+    )
+    uploader.precheck()
+    uploader.run(path=staged_filepath, file_data=file_data)
+
+    # Run validation
+    with staged_filepath.open() as f:
+        staged_elements = json.load(f)
+    expected_count = len(staged_elements)
+    validate_collection(collection_name=collection_name, num_embeddings=expected_count)
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "stager")
+def test_chroma_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = ChromaUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )

test/integration/connectors/test_confluence.py
@@ -5,8 +5,8 @@ import pytest
 from test.integration.connectors.utils.constants import (
     SOURCE_TAG,
 )
-from test.integration.connectors.utils.validation import (
-    ValidationConfigs,
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
     source_connector_validation,
 )
 from test.integration.utils import requires_env
@@ -60,7 +60,7 @@ async def test_confluence_source(temp_dir):
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=ValidationConfigs(
+        configs=SourceValidationConfigs(
            test_id="confluence",
            expected_num_files=11,
            validate_downloaded_files=True,
@@ -107,7 +107,7 @@ async def test_confluence_source_large(temp_dir):
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=ValidationConfigs(
+        configs=SourceValidationConfigs(
             test_id="confluence_large", expected_num_files=250, validate_file_data=False
         ),
     )

test/integration/connectors/test_delta_table.py
@@ -114,6 +114,7 @@ async def test_delta_table_destination_s3(upload_file: Path, temp_dir: Path):
     )
 
     try:
+        uploader.precheck()
         if uploader.is_async():
             await uploader.run_async(path=new_upload_file, file_data=file_data)
         else:

test/integration/connectors/test_kafka.py
@@ -14,8 +14,8 @@ from test.integration.connectors.utils.constants import (
     env_setup_path,
 )
 from test.integration.connectors.utils.docker_compose import docker_compose_context
-from test.integration.connectors.utils.validation import (
-    ValidationConfigs,
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
     source_connector_validation,
 )
 from test.integration.utils import requires_env
@@ -121,8 +121,8 @@ async def test_kafka_source_local(kafka_seed_topic: str):
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=ValidationConfigs(
-            test_id="kafka", expected_num_files=5, validate_downloaded_files=True
+        configs=SourceValidationConfigs(
+            test_id="kafka-local", expected_num_files=5, validate_downloaded_files=True
         ),
     )
 
@@ -203,8 +203,8 @@ async def test_kafka_source_cloud(kafka_seed_topic_cloud: int):
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=ValidationConfigs(
-            test_id="kafka",
+        configs=SourceValidationConfigs(
+            test_id="kafka-cloud",
             exclude_fields_extend=["connector_type"],
             expected_num_files=expected_messages,
             validate_downloaded_files=True,

test/integration/connectors/test_milvus.py
@@ -4,6 +4,7 @@ from pathlib import Path
 
 import docker
 import pytest
+from _pytest.fixtures import TopRequest
 from pymilvus import (
     CollectionSchema,
     DataType,
@@ -15,6 +16,10 @@ from pymilvus.milvus_client import IndexParams
 from test.integration.connectors.utils.constants import DESTINATION_TAG, env_setup_path
 from test.integration.connectors.utils.docker import healthcheck_wait
 from test.integration.connectors.utils.docker_compose import docker_compose_context
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.milvus import (
@@ -167,3 +172,19 @@ def test_precheck_fails_on_nonexistent_collection(collection: str):
         match=f"Collection '{NONEXISTENT_COLLECTION_NAME}' does not exist",
     ):
         uploader.precheck()
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_milvus_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = MilvusUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )

test/integration/connectors/test_mongodb.py
@@ -14,8 +14,8 @@ from pymongo.mongo_client import MongoClient
 from pymongo.operations import SearchIndexModel
 
 from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
-from test.integration.connectors.utils.validation import (
-    ValidationConfigs,
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
     source_connector_validation,
 )
 from test.integration.utils import requires_env
@@ -196,8 +196,11 @@ async def test_mongodb_source(temp_dir: Path):
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=ValidationConfigs(
-            test_id=CONNECTOR_TYPE, expected_num_files=4, validate_downloaded_files=True
+        configs=SourceValidationConfigs(
+            test_id=CONNECTOR_TYPE,
+            expected_num_files=4,
+            validate_downloaded_files=True,
+            expected_number_indexed_file_data=1,
         ),
     )
 
test/integration/connectors/test_neo4j.py (new file)
@@ -0,0 +1,236 @@
+import json
+import time
+import uuid
+from datetime import datetime
+from pathlib import Path
+
+import pytest
+from neo4j import AsyncGraphDatabase, Driver, GraphDatabase
+from neo4j.exceptions import ServiceUnavailable
+from pytest_check import check
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG
+from test.integration.connectors.utils.docker import container_context
+from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.chunking import elements_from_base64_gzipped_json
+from unstructured_ingest.v2.interfaces.file_data import (
+    FileData,
+    FileDataSourceMetadata,
+    SourceIdentifiers,
+)
+from unstructured_ingest.v2.processes.connectors.neo4j import (
+    CONNECTOR_TYPE,
+    Label,
+    Neo4jAccessConfig,
+    Neo4jConnectionConfig,
+    Neo4jUploader,
+    Neo4jUploaderConfig,
+    Neo4jUploadStager,
+    Relationship,
+)
+
+USERNAME = "neo4j"
+PASSWORD = "password"
+URI = "neo4j://localhost:7687"
+DATABASE = "neo4j"
+
+EXPECTED_DOCUMENT_COUNT = 1
+
+
+# NOTE: Precheck tests are read-only so we utilize the same container for all tests.
+# If new tests require clean neo4j container, this fixture's scope should be adjusted.
+@pytest.fixture(autouse=True, scope="module")
+def _neo4j_server():
+    with container_context(
+        image="neo4j:latest", environment={"NEO4J_AUTH": "neo4j/password"}, ports={"7687": "7687"}
+    ):
+        driver = GraphDatabase.driver(uri=URI, auth=(USERNAME, PASSWORD))
+        wait_for_connection(driver)
+        driver.close()
+        yield
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(DESTINATION_TAG, CONNECTOR_TYPE)
+async def test_neo4j_destination(upload_file: Path, tmp_path: Path):
+    stager = Neo4jUploadStager()
+    uploader = Neo4jUploader(
+        connection_config=Neo4jConnectionConfig(
+            access_config=Neo4jAccessConfig(password=PASSWORD),  # type: ignore
+            username=USERNAME,
+            uri=URI,
+            database=DATABASE,
+        ),
+        upload_config=Neo4jUploaderConfig(),
+    )
+    file_data = FileData(
+        identifier="mock-file-data",
+        connector_type="neo4j",
+        source_identifiers=SourceIdentifiers(
+            filename=upload_file.name,
+            fullpath=upload_file.name,
+        ),
+        metadata=FileDataSourceMetadata(
+            date_created=str(datetime(2022, 1, 1).timestamp()),
+            date_modified=str(datetime(2022, 1, 2).timestamp()),
+        ),
+    )
+    staged_filepath = stager.run(
+        upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=upload_file.name,
+    )
+
+    await uploader.run_async(staged_filepath, file_data)
+    await validate_uploaded_graph(upload_file)
+
+    modified_upload_file = tmp_path / f"modified-{upload_file.name}"
+    with open(upload_file) as file:
+        elements = json.load(file)
+    for element in elements:
+        element["element_id"] = str(uuid.uuid4())
+
+    with open(modified_upload_file, "w") as file:
+        json.dump(elements, file, indent=4)
+
+    staged_filepath = stager.run(
+        modified_upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=modified_upload_file.name,
+    )
+    await uploader.run_async(staged_filepath, file_data)
+    await validate_uploaded_graph(modified_upload_file)
+
+
+@pytest.mark.tags(DESTINATION_TAG, CONNECTOR_TYPE)
+class TestPrecheck:
+    @pytest.fixture
+    def configured_uploader(self) -> Neo4jUploader:
+        return Neo4jUploader(
+            connection_config=Neo4jConnectionConfig(
+                access_config=Neo4jAccessConfig(password=PASSWORD),  # type: ignore
+                username=USERNAME,
+                uri=URI,
+                database=DATABASE,
+            ),
+            upload_config=Neo4jUploaderConfig(),
+        )
+
+    def test_succeeds(self, configured_uploader: Neo4jUploader):
+        configured_uploader.precheck()
+
+    def test_fails_on_invalid_password(self, configured_uploader: Neo4jUploader):
+        configured_uploader.connection_config.access_config.get_secret_value().password = (
+            "invalid-password"
+        )
+        with pytest.raises(
+            DestinationConnectionError,
+            match="{code: Neo.ClientError.Security.Unauthorized}",
+        ):
+            configured_uploader.precheck()
+
+    def test_fails_on_invalid_username(self, configured_uploader: Neo4jUploader):
+        configured_uploader.connection_config.username = "invalid-username"
+        with pytest.raises(
+            DestinationConnectionError, match="{code: Neo.ClientError.Security.Unauthorized}"
+        ):
+            configured_uploader.precheck()
+
+    @pytest.mark.parametrize(
+        ("uri", "expected_error_msg"),
+        [
+            ("neo4j://localhst:7687", "Cannot resolve address"),
+            ("neo4j://localhost:7777", "Unable to retrieve routing information"),
+        ],
+    )
+    def test_fails_on_invalid_uri(
+        self, configured_uploader: Neo4jUploader, uri: str, expected_error_msg: str
+    ):
+        configured_uploader.connection_config.uri = uri
+        with pytest.raises(DestinationConnectionError, match=expected_error_msg):
+            configured_uploader.precheck()
+
+    def test_fails_on_invalid_database(self, configured_uploader: Neo4jUploader):
+        configured_uploader.connection_config.database = "invalid-database"
+        with pytest.raises(
+            DestinationConnectionError, match="{code: Neo.ClientError.Database.DatabaseNotFound}"
+        ):
+            configured_uploader.precheck()
+
+
+def wait_for_connection(driver: Driver, retries: int = 10, delay_seconds: int = 2):
+    attempts = 0
+    while attempts < retries:
+        try:
+            driver.verify_connectivity()
+            return
+        except ServiceUnavailable:
+            time.sleep(delay_seconds)
+            attempts += 1
+
+    pytest.fail("Failed to connect with Neo4j server.")
+
+
+async def validate_uploaded_graph(upload_file: Path):
+    with open(upload_file) as file:
+        elements = json.load(file)
+
+    for element in elements:
+        if "orig_elements" in element["metadata"]:
+            element["metadata"]["orig_elements"] = elements_from_base64_gzipped_json(
+                element["metadata"]["orig_elements"]
+            )
+        else:
+            element["metadata"]["orig_elements"] = []
+
+    expected_chunks_count = len(elements)
+    expected_element_count = len(
+        {
+            origin_element["element_id"]
+            for chunk in elements
+            for origin_element in chunk["metadata"]["orig_elements"]
+        }
+    )
+    expected_nodes_count = expected_chunks_count + expected_element_count + EXPECTED_DOCUMENT_COUNT
+
+    driver = AsyncGraphDatabase.driver(uri=URI, auth=(USERNAME, PASSWORD))
+    try:
+        nodes_count = len((await driver.execute_query("MATCH (n) RETURN n"))[0])
+        chunk_nodes_count = len(
+            (await driver.execute_query(f"MATCH (n: {Label.CHUNK}) RETURN n"))[0]
+        )
+        document_nodes_count = len(
+            (await driver.execute_query(f"MATCH (n: {Label.DOCUMENT}) RETURN n"))[0]
+        )
+        element_nodes_count = len(
+            (await driver.execute_query(f"MATCH (n: {Label.UNSTRUCTURED_ELEMENT}) RETURN n"))[0]
+        )
+        with check:
+            assert nodes_count == expected_nodes_count
+        with check:
+            assert document_nodes_count == EXPECTED_DOCUMENT_COUNT
+        with check:
+            assert chunk_nodes_count == expected_chunks_count
+        with check:
+            assert element_nodes_count == expected_element_count
+
+        records, _, _ = await driver.execute_query(
+            f"MATCH ()-[r:{Relationship.PART_OF_DOCUMENT}]->(:{Label.DOCUMENT}) RETURN r"
+        )
+        part_of_document_count = len(records)
+
+        records, _, _ = await driver.execute_query(
+            f"MATCH (:{Label.CHUNK})-[r:{Relationship.NEXT_CHUNK}]->(:{Label.CHUNK}) RETURN r"
+        )
+        next_chunk_count = len(records)
+
+        if not check.any_failures():
+            with check:
+                assert part_of_document_count == expected_chunks_count + expected_element_count
+            with check:
+                assert next_chunk_count == expected_chunks_count - 1
+
+    finally:
+        await driver.close()
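
The validate_uploaded_graph helper above derives its expected node count directly from the staged elements: one node per chunk, one per distinct original element recovered from metadata["orig_elements"], plus a single document node. A worked example of that arithmetic with hypothetical counts (the 10/22 figures are illustrative, not from the test fixtures):

# Hypothetical: 10 chunks built from 22 distinct original elements.
expected_chunks_count = 10
expected_element_count = 22
EXPECTED_DOCUMENT_COUNT = 1

expected_nodes_count = (
    expected_chunks_count + expected_element_count + EXPECTED_DOCUMENT_COUNT
)  # 33 nodes in total

# Edge counts follow the same shape: every chunk and element node is
# PART_OF_DOCUMENT (10 + 22 = 32 edges), and chunks form a NEXT_CHUNK
# chain with one fewer edge than there are chunks (10 - 1 = 9 edges).
assert expected_nodes_count == 33
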
test/integration/connectors/test_pinecone.py
@@ -8,12 +8,17 @@ from typing import Generator
 from uuid import uuid4
 
 import pytest
+from _pytest.fixtures import TopRequest
 from pinecone import Pinecone, ServerlessSpec
 from pinecone.core.openapi.shared.exceptions import NotFoundException
 
 from test.integration.connectors.utils.constants import (
     DESTINATION_TAG,
 )
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
 from test.integration.utils import requires_env
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
@@ -251,7 +256,10 @@ def test_large_metadata(pinecone_index: str, tmp_path: Path, upload_file: Path):
         identifier="mock-file-data",
     )
     staged_file = stager.run(
-        file_data, large_metadata_upload_file, tmp_path, large_metadata_upload_file.name
+        elements_filepath=large_metadata_upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=large_metadata_upload_file.name,
     )
     try:
         uploader.run(staged_file, file_data)
@@ -262,3 +270,19 @@ def test_large_metadata(pinecone_index: str, tmp_path: Path, upload_file: Path):
             raise pytest.fail("Upload request failed due to metadata exceeding limits.")
 
     validate_pinecone_index(pinecone_index, 1, interval=5)
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_pinecone_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = PineconeUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )

test/integration/connectors/test_qdrant.py
@@ -6,10 +6,15 @@ from pathlib import Path
 from typing import AsyncGenerator
 
 import pytest
+from _pytest.fixtures import TopRequest
 from qdrant_client import AsyncQdrantClient
 
 from test.integration.connectors.utils.constants import DESTINATION_TAG
 from test.integration.connectors.utils.docker import container_context
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
 from test.integration.utils import requires_env
 from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.qdrant.cloud import (
@@ -138,7 +143,7 @@ async def test_qdrant_destination_server(upload_file: Path, tmp_path: Path, dock
         output_dir=tmp_path,
         output_filename=upload_file.name,
     )
-
+    uploader.precheck()
     if uploader.is_async():
         await uploader.run_async(path=staged_upload_file, file_data=file_data)
     else:
@@ -183,10 +188,28 @@ async def test_qdrant_destination_cloud(upload_file: Path, tmp_path: Path):
         output_dir=tmp_path,
         output_filename=upload_file.name,
     )
-
+    uploader.precheck()
     if uploader.is_async():
         await uploader.run_async(path=staged_upload_file, file_data=file_data)
     else:
         uploader.run(path=staged_upload_file, file_data=file_data)
     async with qdrant_client(connection_kwargs) as client:
         await validate_upload(client=client, upload_file=upload_file)
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_qdrant_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = LocalQdrantUploadStager(
+        upload_stager_config=LocalQdrantUploadStagerConfig(),
+    )
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=LOCAL_CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )

test/integration/connectors/test_s3.py
@@ -11,8 +11,8 @@ from test.integration.connectors.utils.constants import (
     env_setup_path,
 )
 from test.integration.connectors.utils.docker_compose import docker_compose_context
-from test.integration.connectors.utils.validation import (
-    ValidationConfigs,
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
     source_connector_validation,
 )
 from test.integration.utils import requires_env
@@ -62,7 +62,7 @@ async def test_s3_source(anon_connection_config: S3ConnectionConfig):
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=ValidationConfigs(
+        configs=SourceValidationConfigs(
             test_id="s3",
             predownload_file_data_check=validate_predownload_file_data,
             postdownload_file_data_check=validate_postdownload_file_data,
@@ -85,7 +85,7 @@ async def test_s3_source_special_char(anon_connection_config: S3ConnectionConfig
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=ValidationConfigs(
+        configs=SourceValidationConfigs(
             test_id="s3-specialchar",
             predownload_file_data_check=validate_predownload_file_data,
             postdownload_file_data_check=validate_postdownload_file_data,
@@ -121,7 +121,7 @@ async def test_s3_minio_source(anon_connection_config: S3ConnectionConfig):
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=ValidationConfigs(
+        configs=SourceValidationConfigs(
             test_id="s3-minio",
             predownload_file_data_check=validate_predownload_file_data,
             postdownload_file_data_check=validate_postdownload_file_data,
@@ -165,11 +165,14 @@ async def test_s3_destination(upload_file: Path):
         identifier="mock file data",
     )
     try:
+        uploader.precheck()
         if uploader.is_async():
             await uploader.run_async(path=upload_file, file_data=file_data)
         else:
             uploader.run(path=upload_file, file_data=file_data)
-        uploaded_files = s3fs.ls(path=destination_path)
+        uploaded_files = [
+            Path(file) for file in s3fs.ls(path=destination_path) if Path(file).name != "_empty"
+        ]
         assert len(uploaded_files) == 1
     finally:
         s3fs.rm(path=destination_path, recursive=True)
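
The delta table, qdrant, and s3 hunks above all add the same step: calling uploader.precheck() before the upload, so connection and permission failures surface before any data is written. A minimal sketch of the pattern the destination tests converge on; uploader, upload_file, and file_data are placeholders for whichever connector is under test:

async def run_destination(uploader, upload_file, file_data) -> None:
    uploader.precheck()  # fail fast on bad credentials or config
    if uploader.is_async():
        await uploader.run_async(path=upload_file, file_data=file_data)
    else:
        uploader.run(path=upload_file, file_data=file_data)
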
test/integration/connectors/utils/docker.py
@@ -44,6 +44,7 @@ def get_container(
     docker_client: docker.DockerClient,
     image: str,
     ports: dict,
+    name: Optional[str] = "connector_test",
     environment: Optional[dict] = None,
     volumes: Optional[dict] = None,
     healthcheck: Optional[HealthCheck] = None,
@@ -59,6 +60,8 @@
         run_kwargs["volumes"] = volumes
     if healthcheck:
         run_kwargs["healthcheck"] = healthcheck.model_dump()
+    if name:
+        run_kwargs["name"] = name
     container: Container = docker_client.containers.run(**run_kwargs)
     return container
 
@@ -112,6 +115,7 @@
     healthcheck: Optional[HealthCheck] = None,
     healthcheck_retries: int = 30,
     docker_client: Optional[docker.DockerClient] = None,
+    name: Optional[str] = "connector_test",
 ):
     docker_client = docker_client or docker.from_env()
     print(f"pulling image {image}")
@@ -125,6 +129,7 @@
         environment=environment,
         volumes=volumes,
         healthcheck=healthcheck,
+        name=name,
     )
     if healthcheck_data := get_healthcheck(container):
         # Mirror whatever healthcheck config set on container
@@ -143,3 +148,4 @@
     finally:
         if container:
             container.kill()
+            container.remove()
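
With these changes, container_context names its containers (defaulting to "connector_test") and removes them after kill, so stale containers no longer collide on test reruns. A usage sketch mirroring the chroma fixture earlier in this diff; the image, port, and healthcheck values are taken from that test:

from test.integration.connectors.utils.docker import HealthCheck, container_context

with container_context(
    image="chromadb/chroma:latest",
    ports={8000: 8000},
    name="chroma_int_test",  # new parameter: the container is named, then removed on exit
    healthcheck=HealthCheck(
        interval=5,
        timeout=10,
        retries=3,
        test="timeout 10s bash -c ':> /dev/tcp/127.0.0.1/8000' || exit 1",
    ),
) as ctx:
    ...  # run assertions against the live container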