unstructured-ingest 0.3.13__py3-none-any.whl → 0.3.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (38)
  1. test/integration/connectors/databricks/test_volumes_native.py +10 -6
  2. test/integration/connectors/discord/test_discord.py +4 -4
  3. test/integration/connectors/duckdb/test_duckdb.py +3 -2
  4. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  5. test/integration/connectors/elasticsearch/test_elasticsearch.py +8 -7
  6. test/integration/connectors/elasticsearch/test_opensearch.py +8 -7
  7. test/integration/connectors/sql/test_postgres.py +9 -3
  8. test/integration/connectors/sql/test_singlestore.py +9 -3
  9. test/integration/connectors/sql/test_snowflake.py +9 -3
  10. test/integration/connectors/sql/test_sqlite.py +9 -3
  11. test/integration/connectors/test_astradb.py +25 -9
  12. test/integration/connectors/test_azure_ai_search.py +3 -4
  13. test/integration/connectors/test_chroma.py +4 -6
  14. test/integration/connectors/test_confluence.py +3 -5
  15. test/integration/connectors/test_delta_table.py +4 -6
  16. test/integration/connectors/test_lancedb.py +3 -3
  17. test/integration/connectors/test_milvus.py +10 -5
  18. test/integration/connectors/test_mongodb.py +9 -9
  19. test/integration/connectors/test_neo4j.py +16 -8
  20. test/integration/connectors/test_notion.py +7 -0
  21. test/integration/connectors/test_onedrive.py +2 -4
  22. test/integration/connectors/test_pinecone.py +5 -6
  23. test/integration/connectors/test_qdrant.py +5 -4
  24. test/integration/connectors/test_redis.py +3 -3
  25. test/integration/connectors/test_s3.py +7 -6
  26. test/integration/connectors/test_vectara.py +2 -2
  27. test/integration/connectors/utils/constants.py +6 -0
  28. test/integration/connectors/utils/docker.py +2 -2
  29. test/integration/connectors/weaviate/test_cloud.py +5 -0
  30. test/integration/connectors/weaviate/test_local.py +2 -2
  31. unstructured_ingest/__version__.py +1 -1
  32. unstructured_ingest/v2/processes/connectors/neo4j.py +12 -12
  33. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.14.dist-info}/METADATA +18 -18
  34. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.14.dist-info}/RECORD +38 -38
  35. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.14.dist-info}/LICENSE.md +0 -0
  36. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.14.dist-info}/WHEEL +0 -0
  37. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.14.dist-info}/entry_points.txt +0 -0
  38. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.14.dist-info}/top_level.txt +0 -0
@@ -11,7 +11,7 @@ import pytest_asyncio
11
11
  from lancedb import AsyncConnection
12
12
  from upath import UPath
13
13
 
14
- from test.integration.connectors.utils.constants import DESTINATION_TAG
14
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
15
15
  from unstructured_ingest.v2.constants import RECORD_ID_LABEL
16
16
  from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
17
17
  from unstructured_ingest.v2.processes.connectors.lancedb.aws import (
@@ -106,7 +106,7 @@ async def connection_with_uri(request, tmp_path: Path):
106
106
 
107
107
 
108
108
  @pytest.mark.asyncio
109
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
109
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
110
110
  @pytest.mark.parametrize("connection_with_uri", ["local", "s3", "gcs", "az"], indirect=True)
111
111
  async def test_lancedb_destination(
112
112
  upload_file: Path,
@@ -164,7 +164,7 @@ async def test_lancedb_destination(
164
164
 
165
165
 
166
166
  class TestPrecheck:
167
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
167
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
168
168
  @pytest.mark.parametrize("connection_with_uri", ["local", "s3", "gcs", "az"], indirect=True)
169
169
  def test_succeeds(
170
170
  self,
@@ -13,7 +13,11 @@ from pymilvus import (
13
13
  )
14
14
  from pymilvus.milvus_client import IndexParams
15
15
 
16
- from test.integration.connectors.utils.constants import DESTINATION_TAG, env_setup_path
16
+ from test.integration.connectors.utils.constants import (
17
+ DESTINATION_TAG,
18
+ VECTOR_DB_TAG,
19
+ env_setup_path,
20
+ )
17
21
  from test.integration.connectors.utils.docker import healthcheck_wait
18
22
  from test.integration.connectors.utils.docker_compose import docker_compose_context
19
23
  from test.integration.connectors.utils.validation.destination import (
@@ -112,7 +116,7 @@ def validate_count(
112
116
 
113
117
 
114
118
  @pytest.mark.asyncio
115
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
119
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
116
120
  async def test_milvus_destination(
117
121
  upload_file: Path,
118
122
  collection: str,
@@ -150,7 +154,7 @@ async def test_milvus_destination(
150
154
  validate_count(client=client, expected_count=expected_count)
151
155
 
152
156
 
153
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
157
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
154
158
  def test_precheck_succeeds(collection: str):
155
159
  uploader = MilvusUploader(
156
160
  connection_config=MilvusConnectionConfig(uri=DB_URI),
@@ -159,7 +163,7 @@ def test_precheck_succeeds(collection: str):
159
163
  uploader.precheck()
160
164
 
161
165
 
162
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
166
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
163
167
  def test_precheck_fails_on_nonexistent_collection(collection: str):
164
168
  uploader = MilvusUploader(
165
169
  connection_config=MilvusConnectionConfig(uri=DB_URI),
@@ -174,7 +178,7 @@ def test_precheck_fails_on_nonexistent_collection(collection: str):
174
178
  uploader.precheck()
175
179
 
176
180
 
177
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
181
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
178
182
  def test_precheck_fails_on_nonexisting_db(collection: str):
179
183
  uploader = MilvusUploader(
180
184
  connection_config=MilvusConnectionConfig(uri=DB_URI),
@@ -187,6 +191,7 @@ def test_precheck_fails_on_nonexisting_db(collection: str):
187
191
  uploader.precheck()
188
192
 
189
193
 
194
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
190
195
  @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
191
196
  def test_milvus_stager(
192
197
  request: TopRequest,
@@ -13,7 +13,7 @@ from pymongo.database import Database
13
13
  from pymongo.mongo_client import MongoClient
14
14
  from pymongo.operations import SearchIndexModel
15
15
 
16
- from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
16
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, NOSQL_TAG, SOURCE_TAG
17
17
  from test.integration.connectors.utils.validation.source import (
18
18
  SourceValidationConfigs,
19
19
  source_connector_validation,
@@ -180,7 +180,7 @@ def validate_collection_vector(
180
180
 
181
181
 
182
182
  @pytest.mark.asyncio
183
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
183
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
184
184
  @requires_env("MONGODB_URI", "MONGODB_DATABASE")
185
185
  async def test_mongodb_source(temp_dir: Path):
186
186
  env_data = get_env_data()
@@ -205,7 +205,7 @@ async def test_mongodb_source(temp_dir: Path):
205
205
  )
206
206
 
207
207
 
208
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
208
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
209
209
  def test_mongodb_indexer_precheck_fail_no_host():
210
210
  indexer_config = MongoDBIndexerConfig(
211
211
  database="non-existent-database", collection="non-existent-database"
@@ -218,7 +218,7 @@ def test_mongodb_indexer_precheck_fail_no_host():
218
218
  indexer.precheck()
219
219
 
220
220
 
221
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
221
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
222
222
  @requires_env("MONGODB_URI", "MONGODB_DATABASE")
223
223
  def test_mongodb_indexer_precheck_fail_no_database():
224
224
  env_data = get_env_data()
@@ -233,7 +233,7 @@ def test_mongodb_indexer_precheck_fail_no_database():
233
233
  indexer.precheck()
234
234
 
235
235
 
236
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
236
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
237
237
  @requires_env("MONGODB_URI", "MONGODB_DATABASE")
238
238
  def test_mongodb_indexer_precheck_fail_no_collection():
239
239
  env_data = get_env_data()
@@ -249,7 +249,7 @@ def test_mongodb_indexer_precheck_fail_no_collection():
249
249
 
250
250
 
251
251
  @pytest.mark.asyncio
252
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
252
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
253
253
  @requires_env("MONGODB_URI", "MONGODB_DATABASE")
254
254
  async def test_mongodb_destination(
255
255
  upload_file: Path,
@@ -289,7 +289,7 @@ async def test_mongodb_destination(
289
289
  validate_collection_count(collection=destination_collection, expected_records=expected_records)
290
290
 
291
291
 
292
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
292
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
293
293
  def test_mongodb_uploader_precheck_fail_no_host():
294
294
  upload_config = MongoDBUploaderConfig(
295
295
  database="database",
@@ -303,7 +303,7 @@ def test_mongodb_uploader_precheck_fail_no_host():
303
303
  uploader.precheck()
304
304
 
305
305
 
306
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
306
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
307
307
  @requires_env("MONGODB_URI", "MONGODB_DATABASE")
308
308
  def test_mongodb_uploader_precheck_fail_no_database():
309
309
  env_data = get_env_data()
@@ -319,7 +319,7 @@ def test_mongodb_uploader_precheck_fail_no_database():
319
319
  uploader.precheck()
320
320
 
321
321
 
322
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
322
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
323
323
  @requires_env("MONGODB_URI", "MONGODB_DATABASE")
324
324
  def test_mongodb_uploader_precheck_fail_no_collection():
325
325
  env_data = get_env_data()
@@ -9,7 +9,7 @@ from neo4j import AsyncGraphDatabase, Driver, GraphDatabase
9
9
  from neo4j.exceptions import ServiceUnavailable
10
10
  from pytest_check import check
11
11
 
12
- from test.integration.connectors.utils.constants import DESTINATION_TAG
12
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, GRAPH_DB_TAG
13
13
  from test.integration.connectors.utils.docker import container_context
14
14
  from unstructured_ingest.error import DestinationConnectionError
15
15
  from unstructured_ingest.utils.chunking import elements_from_base64_gzipped_json
@@ -51,7 +51,7 @@ def _neo4j_server():
51
51
 
52
52
 
53
53
  @pytest.mark.asyncio
54
- @pytest.mark.tags(DESTINATION_TAG, CONNECTOR_TYPE)
54
+ @pytest.mark.tags(DESTINATION_TAG, CONNECTOR_TYPE, GRAPH_DB_TAG)
55
55
  async def test_neo4j_destination(upload_file: Path, tmp_path: Path):
56
56
  stager = Neo4jUploadStager()
57
57
  uploader = Neo4jUploader(
@@ -104,7 +104,7 @@ async def test_neo4j_destination(upload_file: Path, tmp_path: Path):
104
104
  await validate_uploaded_graph(modified_upload_file)
105
105
 
106
106
 
107
- @pytest.mark.tags(DESTINATION_TAG, CONNECTOR_TYPE)
107
+ @pytest.mark.tags(DESTINATION_TAG, CONNECTOR_TYPE, GRAPH_DB_TAG)
108
108
  class TestPrecheck:
109
109
  @pytest.fixture
110
110
  def configured_uploader(self) -> Neo4jUploader:
@@ -199,13 +199,15 @@ async def validate_uploaded_graph(upload_file: Path):
199
199
  try:
200
200
  nodes_count = len((await driver.execute_query("MATCH (n) RETURN n"))[0])
201
201
  chunk_nodes_count = len(
202
- (await driver.execute_query(f"MATCH (n: {Label.CHUNK}) RETURN n"))[0]
202
+ (await driver.execute_query(f"MATCH (n: {Label.CHUNK.value}) RETURN n"))[0]
203
203
  )
204
204
  document_nodes_count = len(
205
- (await driver.execute_query(f"MATCH (n: {Label.DOCUMENT}) RETURN n"))[0]
205
+ (await driver.execute_query(f"MATCH (n: {Label.DOCUMENT.value}) RETURN n"))[0]
206
206
  )
207
207
  element_nodes_count = len(
208
- (await driver.execute_query(f"MATCH (n: {Label.UNSTRUCTURED_ELEMENT}) RETURN n"))[0]
208
+ (await driver.execute_query(f"MATCH (n: {Label.UNSTRUCTURED_ELEMENT.value}) RETURN n"))[
209
+ 0
210
+ ]
209
211
  )
210
212
  with check:
211
213
  assert nodes_count == expected_nodes_count
@@ -217,12 +219,18 @@ async def validate_uploaded_graph(upload_file: Path):
217
219
  assert element_nodes_count == expected_element_count
218
220
 
219
221
  records, _, _ = await driver.execute_query(
220
- f"MATCH ()-[r:{Relationship.PART_OF_DOCUMENT}]->(:{Label.DOCUMENT}) RETURN r"
222
+ f"""
223
+ MATCH ()-[r:{Relationship.PART_OF_DOCUMENT.value}]->(:{Label.DOCUMENT.value})
224
+ RETURN r
225
+ """
221
226
  )
222
227
  part_of_document_count = len(records)
223
228
 
224
229
  records, _, _ = await driver.execute_query(
225
- f"MATCH (:{Label.CHUNK})-[r:{Relationship.NEXT_CHUNK}]->(:{Label.CHUNK}) RETURN r"
230
+ f"""
231
+ MATCH (:{Label.CHUNK.value})-[r:{Relationship.NEXT_CHUNK.value}]->(:{Label.CHUNK.value})
232
+ RETURN r
233
+ """
226
234
  )
227
235
  next_chunk_count = len(records)
228
236
 
@@ -1,5 +1,8 @@
1
1
  import os
2
2
 
3
+ import pytest
4
+
5
+ from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
3
6
  from test.integration.connectors.utils.validation.source import (
4
7
  SourceValidationConfigs,
5
8
  get_all_file_data,
@@ -8,6 +11,7 @@ from test.integration.connectors.utils.validation.source import (
8
11
  )
9
12
  from unstructured_ingest.v2.interfaces import Downloader, Indexer
10
13
  from unstructured_ingest.v2.processes.connectors.notion.connector import (
14
+ CONNECTOR_TYPE,
11
15
  NotionAccessConfig,
12
16
  NotionConnectionConfig,
13
17
  NotionDownloader,
@@ -17,6 +21,7 @@ from unstructured_ingest.v2.processes.connectors.notion.connector import (
17
21
  )
18
22
 
19
23
 
24
+ @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
20
25
  def test_notion_source_database(temp_dir):
21
26
  # Retrieve environment variables
22
27
  notion_api_key = os.environ["NOTION_API_KEY"]
@@ -55,6 +60,7 @@ def test_notion_source_database(temp_dir):
55
60
  )
56
61
 
57
62
 
63
+ @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
58
64
  def test_notion_source_page(temp_dir):
59
65
  # Retrieve environment variables
60
66
  notion_api_key = os.environ["NOTION_API_KEY"]
@@ -93,6 +99,7 @@ def test_notion_source_page(temp_dir):
93
99
  )
94
100
 
95
101
 
102
+ @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
96
103
  def source_connector_validation(
97
104
  indexer: Indexer,
98
105
  downloader: Downloader,
@@ -5,9 +5,7 @@ from pathlib import Path
5
5
  import pytest
6
6
  from office365.graph_client import GraphClient
7
7
 
8
- from test.integration.connectors.utils.constants import (
9
- DESTINATION_TAG,
10
- )
8
+ from test.integration.connectors.utils.constants import BLOB_STORAGE_TAG, DESTINATION_TAG
11
9
  from test.integration.utils import requires_env
12
10
  from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
13
11
  from unstructured_ingest.v2.processes.connectors.onedrive import (
@@ -67,7 +65,7 @@ def get_connection_config():
67
65
  return connection_config
68
66
 
69
67
 
70
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
68
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, BLOB_STORAGE_TAG)
71
69
  @requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
72
70
  @pytest.mark.xfail(
73
71
  reason="Issues with test setup on the provider side."
@@ -12,9 +12,7 @@ from _pytest.fixtures import TopRequest
12
12
  from pinecone import Pinecone, ServerlessSpec
13
13
  from pinecone.core.openapi.shared.exceptions import NotFoundException
14
14
 
15
- from test.integration.connectors.utils.constants import (
16
- DESTINATION_TAG,
17
- )
15
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
18
16
  from test.integration.connectors.utils.validation.destination import (
19
17
  StagerValidationConfigs,
20
18
  stager_validation,
@@ -133,7 +131,7 @@ def validate_pinecone_index(
133
131
 
134
132
  @requires_env(API_KEY)
135
133
  @pytest.mark.asyncio
136
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
134
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
137
135
  async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp_dir: Path):
138
136
  file_data = FileData(
139
137
  source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
@@ -176,7 +174,7 @@ async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp
176
174
 
177
175
  @requires_env(API_KEY)
178
176
  @pytest.mark.asyncio
179
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
177
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
180
178
  @pytest.mark.skip(reason="TODO: get this to work")
181
179
  async def test_pinecone_destination_large_index(
182
180
  pinecone_index: str, upload_file: Path, temp_dir: Path
@@ -227,7 +225,7 @@ async def test_pinecone_destination_large_index(
227
225
 
228
226
 
229
227
  @requires_env(API_KEY)
230
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
228
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
231
229
  def test_large_metadata(pinecone_index: str, tmp_path: Path, upload_file: Path):
232
230
  stager = PineconeUploadStager()
233
231
  uploader = PineconeUploader(
@@ -272,6 +270,7 @@ def test_large_metadata(pinecone_index: str, tmp_path: Path, upload_file: Path):
272
270
  validate_pinecone_index(pinecone_index, 1, interval=5)
273
271
 
274
272
 
273
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
275
274
  @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
276
275
  def test_pinecone_stager(
277
276
  request: TopRequest,
@@ -9,7 +9,7 @@ import pytest
9
9
  from _pytest.fixtures import TopRequest
10
10
  from qdrant_client import AsyncQdrantClient
11
11
 
12
- from test.integration.connectors.utils.constants import DESTINATION_TAG
12
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
13
13
  from test.integration.connectors.utils.docker import container_context
14
14
  from test.integration.connectors.utils.validation.destination import (
15
15
  StagerValidationConfigs,
@@ -75,7 +75,7 @@ async def validate_upload(client: AsyncQdrantClient, upload_file: Path):
75
75
 
76
76
 
77
77
  @pytest.mark.asyncio
78
- @pytest.mark.tags(LOCAL_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant")
78
+ @pytest.mark.tags(LOCAL_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant", VECTOR_DB_TAG)
79
79
  async def test_qdrant_destination_local(upload_file: Path, tmp_path: Path):
80
80
  connection_kwargs = {"path": str(tmp_path / "qdrant")}
81
81
  async with qdrant_client(connection_kwargs) as client:
@@ -117,7 +117,7 @@ def docker_context():
117
117
 
118
118
 
119
119
  @pytest.mark.asyncio
120
- @pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant")
120
+ @pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant", VECTOR_DB_TAG)
121
121
  async def test_qdrant_destination_server(upload_file: Path, tmp_path: Path, docker_context):
122
122
  connection_kwargs = {"location": "http://localhost:6333"}
123
123
  async with qdrant_client(connection_kwargs) as client:
@@ -153,7 +153,7 @@ async def test_qdrant_destination_server(upload_file: Path, tmp_path: Path, dock
153
153
 
154
154
 
155
155
  @pytest.mark.asyncio
156
- @pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant")
156
+ @pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant", VECTOR_DB_TAG)
157
157
  @requires_env("QDRANT_API_KEY", "QDRANT_SERVER_URL")
158
158
  async def test_qdrant_destination_cloud(upload_file: Path, tmp_path: Path):
159
159
  server_url = os.environ["QDRANT_SERVER_URL"]
@@ -197,6 +197,7 @@ async def test_qdrant_destination_cloud(upload_file: Path, tmp_path: Path):
197
197
  await validate_upload(client=client, upload_file=upload_file)
198
198
 
199
199
 
200
+ @pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant", VECTOR_DB_TAG)
200
201
  @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
201
202
  def test_qdrant_stager(
202
203
  request: TopRequest,
@@ -9,7 +9,7 @@ import pytest
9
9
  from redis import exceptions as redis_exceptions
10
10
  from redis.asyncio import Redis, from_url
11
11
 
12
- from test.integration.connectors.utils.constants import DESTINATION_TAG
12
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, NOSQL_TAG
13
13
  from test.integration.utils import requires_env
14
14
  from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
15
15
  from unstructured_ingest.v2.processes.connectors.redisdb import (
@@ -96,7 +96,7 @@ async def redis_destination_test(
96
96
 
97
97
 
98
98
  @pytest.mark.asyncio
99
- @pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG)
99
+ @pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
100
100
  @requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
101
101
  async def test_redis_destination_azure_with_password(upload_file: Path, tmp_path: Path):
102
102
  connection_kwargs = {
@@ -110,7 +110,7 @@ async def test_redis_destination_azure_with_password(upload_file: Path, tmp_path
110
110
 
111
111
 
112
112
  @pytest.mark.asyncio
113
- @pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG, "redis")
113
+ @pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG, "redis", NOSQL_TAG)
114
114
  @requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
115
115
  async def test_redis_destination_azure_with_uri(upload_file: Path, tmp_path: Path):
116
116
  connection_kwargs = {}
@@ -6,6 +6,7 @@ from pathlib import Path
6
6
  import pytest
7
7
 
8
8
  from test.integration.connectors.utils.constants import (
9
+ BLOB_STORAGE_TAG,
9
10
  DESTINATION_TAG,
10
11
  SOURCE_TAG,
11
12
  env_setup_path,
@@ -47,7 +48,7 @@ def anon_connection_config() -> S3ConnectionConfig:
47
48
 
48
49
 
49
50
  @pytest.mark.asyncio
50
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
51
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
51
52
  async def test_s3_source(anon_connection_config: S3ConnectionConfig):
52
53
  indexer_config = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/small-pdf-set/")
53
54
  with tempfile.TemporaryDirectory() as tempdir:
@@ -70,7 +71,7 @@ async def test_s3_source(anon_connection_config: S3ConnectionConfig):
70
71
 
71
72
 
72
73
  @pytest.mark.asyncio
73
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
74
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
74
75
  async def test_s3_source_special_char(anon_connection_config: S3ConnectionConfig):
75
76
  indexer_config = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/special-characters/")
76
77
  with tempfile.TemporaryDirectory() as tempdir:
@@ -92,7 +93,7 @@ async def test_s3_source_special_char(anon_connection_config: S3ConnectionConfig
92
93
  )
93
94
 
94
95
 
95
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
96
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
96
97
  def test_s3_source_no_access(anon_connection_config: S3ConnectionConfig):
97
98
  indexer_config = S3IndexerConfig(remote_url="s3://utic-ingest-test-fixtures/destination/")
98
99
  indexer = S3Indexer(connection_config=anon_connection_config, index_config=indexer_config)
@@ -100,7 +101,7 @@ def test_s3_source_no_access(anon_connection_config: S3ConnectionConfig):
100
101
  indexer.precheck()
101
102
 
102
103
 
103
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
104
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
104
105
  def test_s3_source_no_bucket(anon_connection_config: S3ConnectionConfig):
105
106
  indexer_config = S3IndexerConfig(remote_url="s3://fake-bucket")
106
107
  indexer = S3Indexer(connection_config=anon_connection_config, index_config=indexer_config)
@@ -109,7 +110,7 @@ def test_s3_source_no_bucket(anon_connection_config: S3ConnectionConfig):
109
110
 
110
111
 
111
112
  @pytest.mark.asyncio
112
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "minio")
113
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "minio", BLOB_STORAGE_TAG)
113
114
  async def test_s3_minio_source(anon_connection_config: S3ConnectionConfig):
114
115
  anon_connection_config.endpoint_url = "http://localhost:9000"
115
116
  indexer_config = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/")
@@ -149,7 +150,7 @@ def get_aws_credentials() -> dict:
149
150
 
150
151
 
151
152
  @pytest.mark.asyncio
152
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
153
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, BLOB_STORAGE_TAG)
153
154
  @requires_env("S3_INGEST_TEST_ACCESS_KEY", "S3_INGEST_TEST_SECRET_KEY")
154
155
  async def test_s3_destination(upload_file: Path):
155
156
  aws_credentials = get_aws_credentials()
@@ -8,7 +8,7 @@ from uuid import uuid4
8
8
  import pytest
9
9
  import requests
10
10
 
11
- from test.integration.connectors.utils.constants import DESTINATION_TAG
11
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, NOSQL_TAG
12
12
  from test.integration.utils import requires_env
13
13
  from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
14
14
  from unstructured_ingest.v2.logger import logger
@@ -211,7 +211,7 @@ def corpora_util() -> Generator[str, None, None]:
211
211
 
212
212
 
213
213
  @pytest.mark.asyncio
214
- @pytest.mark.tags(VECTARA_CONNECTOR_TYPE, DESTINATION_TAG, "vectara")
214
+ @pytest.mark.tags(VECTARA_CONNECTOR_TYPE, DESTINATION_TAG, "vectara", NOSQL_TAG)
215
215
  @requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
216
216
  async def test_vectara_destination(
217
217
  upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=10
@@ -2,6 +2,12 @@ from pathlib import Path
2
2
 
3
3
  SOURCE_TAG = "source"
4
4
  DESTINATION_TAG = "destination"
5
+ BLOB_STORAGE_TAG = "blob_storage"
6
+ SQL_TAG = "sql"
7
+ NOSQL_TAG = "nosql"
8
+ VECTOR_DB_TAG = "vector_db"
9
+ GRAPH_DB_TAG = "graph_db"
10
+ UNCATEGORIZED_TAG = "uncategorized"
5
11
 
6
12
  env_setup_path = Path(__file__).parents[1] / "env_setup"
7
13
  expected_results_path = Path(__file__).parents[1] / "expected_results"
@@ -44,7 +44,7 @@ def get_container(
44
44
  docker_client: docker.DockerClient,
45
45
  image: str,
46
46
  ports: dict,
47
- name: Optional[str] = "connector_test",
47
+ name: Optional[str] = None,
48
48
  environment: Optional[dict] = None,
49
49
  volumes: Optional[dict] = None,
50
50
  healthcheck: Optional[HealthCheck] = None,
@@ -115,7 +115,7 @@ def container_context(
115
115
  healthcheck: Optional[HealthCheck] = None,
116
116
  healthcheck_retries: int = 30,
117
117
  docker_client: Optional[docker.DockerClient] = None,
118
- name: Optional[str] = "connector_test",
118
+ name: Optional[str] = None,
119
119
  ):
120
120
  docker_client = docker_client or docker.from_env()
121
121
  print(f"pulling image {image}")
@@ -1,12 +1,15 @@
1
1
  import pytest
2
2
  from pydantic import ValidationError
3
3
 
4
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
4
5
  from unstructured_ingest.v2.processes.connectors.weaviate.cloud import (
6
+ CONNECTOR_TYPE,
5
7
  CloudWeaviateAccessConfig,
6
8
  CloudWeaviateConnectionConfig,
7
9
  )
8
10
 
9
11
 
12
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
10
13
  def test_weaviate_failing_connection_config():
11
14
  with pytest.raises(ValidationError):
12
15
  CloudWeaviateConnectionConfig(
@@ -16,6 +19,7 @@ def test_weaviate_failing_connection_config():
16
19
  )
17
20
 
18
21
 
22
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
19
23
  def test_weaviate_connection_config_happy_path():
20
24
  CloudWeaviateConnectionConfig(
21
25
  access_config=CloudWeaviateAccessConfig(
@@ -25,6 +29,7 @@ def test_weaviate_connection_config_happy_path():
25
29
  )
26
30
 
27
31
 
32
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
28
33
  def test_weaviate_connection_config_anonymous():
29
34
  CloudWeaviateConnectionConfig(
30
35
  access_config=CloudWeaviateAccessConfig(api_key="my key", password="password"),
@@ -7,7 +7,7 @@ import requests
7
7
  import weaviate
8
8
  from weaviate.client import WeaviateClient
9
9
 
10
- from test.integration.connectors.utils.constants import DESTINATION_TAG
10
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
11
11
  from test.integration.connectors.utils.docker import container_context
12
12
  from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
13
13
  from unstructured_ingest.v2.processes.connectors.weaviate.local import (
@@ -74,7 +74,7 @@ def run_uploader_and_validate(
74
74
 
75
75
 
76
76
  @pytest.mark.asyncio
77
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
77
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
78
78
  def test_weaviate_local_destination(upload_file: Path, collection: str, tmp_path: Path):
79
79
  file_data = FileData(
80
80
  source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
@@ -1 +1 @@
1
- __version__ = "0.3.13" # pragma: no cover
1
+ __version__ = "0.3.14" # pragma: no cover
@@ -105,7 +105,7 @@ class Neo4jUploadStager(UploadStager):
105
105
  output_filepath.parent.mkdir(parents=True, exist_ok=True)
106
106
 
107
107
  with open(output_filepath, "w") as file:
108
- json.dump(_GraphData.from_nx(nx_graph).model_dump(), file, indent=4)
108
+ file.write(_GraphData.from_nx(nx_graph).model_dump_json())
109
109
 
110
110
  return output_filepath
111
111
 
@@ -196,7 +196,7 @@ class _GraphData(BaseModel):
196
196
 
197
197
 
198
198
  class _Node(BaseModel):
199
- model_config = ConfigDict(use_enum_values=True)
199
+ model_config = ConfigDict()
200
200
 
201
201
  id_: str = Field(default_factory=lambda: str(uuid.uuid4()))
202
202
  labels: list[Label] = Field(default_factory=list)
@@ -207,20 +207,20 @@ class _Node(BaseModel):
207
207
 
208
208
 
209
209
  class _Edge(BaseModel):
210
- model_config = ConfigDict(use_enum_values=True)
210
+ model_config = ConfigDict()
211
211
 
212
212
  source_id: str
213
213
  destination_id: str
214
214
  relationship: Relationship
215
215
 
216
216
 
217
- class Label(str, Enum):
217
+ class Label(Enum):
218
218
  UNSTRUCTURED_ELEMENT = "UnstructuredElement"
219
219
  CHUNK = "Chunk"
220
220
  DOCUMENT = "Document"
221
221
 
222
222
 
223
- class Relationship(str, Enum):
223
+ class Relationship(Enum):
224
224
  PART_OF_DOCUMENT = "PART_OF_DOCUMENT"
225
225
  PART_OF_CHUNK = "PART_OF_CHUNK"
226
226
  NEXT_CHUNK = "NEXT_CHUNK"
@@ -263,14 +263,14 @@ class Neo4jUploader(Uploader):
263
263
  async def _create_uniqueness_constraints(self, client: AsyncDriver) -> None:
264
264
  for label in Label:
265
265
  logger.info(
266
- f"Adding id uniqueness constraint for nodes labeled '{label}'"
266
+ f"Adding id uniqueness constraint for nodes labeled '{label.value}'"
267
267
  " if it does not already exist."
268
268
  )
269
- constraint_name = f"{label.lower()}_id"
269
+ constraint_name = f"{label.value.lower()}_id"
270
270
  await client.execute_query(
271
271
  f"""
272
272
  CREATE CONSTRAINT {constraint_name} IF NOT EXISTS
273
- FOR (n: {label}) REQUIRE n.id IS UNIQUE
273
+ FOR (n: {label.value}) REQUIRE n.id IS UNIQUE
274
274
  """
275
275
  )
276
276
 
@@ -278,8 +278,8 @@ class Neo4jUploader(Uploader):
278
278
  logger.info(f"Deleting old data for the record '{file_data.identifier}' (if present).")
279
279
  _, summary, _ = await client.execute_query(
280
280
  f"""
281
- MATCH (n: {Label.DOCUMENT} {{id: $identifier}})
282
- MATCH (n)--(m: {Label.CHUNK}|{Label.UNSTRUCTURED_ELEMENT})
281
+ MATCH (n: {Label.DOCUMENT.value} {{id: $identifier}})
282
+ MATCH (n)--(m: {Label.CHUNK.value}|{Label.UNSTRUCTURED_ELEMENT.value})
283
283
  DETACH DELETE m""",
284
284
  identifier=file_data.identifier,
285
285
  )
@@ -349,7 +349,7 @@ class Neo4jUploader(Uploader):
349
349
 
350
350
  @staticmethod
351
351
  def _create_nodes_query(nodes: list[_Node], labels: tuple[Label, ...]) -> tuple[str, dict]:
352
- labels_string = ", ".join(labels)
352
+ labels_string = ", ".join([label.value for label in labels])
353
353
  logger.info(f"Preparing MERGE query for {len(nodes)} nodes labeled '{labels_string}'.")
354
354
  query_string = f"""
355
355
  UNWIND $nodes AS node
@@ -366,7 +366,7 @@ class Neo4jUploader(Uploader):
366
366
  UNWIND $edges AS edge
367
367
  MATCH (u {{id: edge.source}})
368
368
  MATCH (v {{id: edge.destination}})
369
- MERGE (u)-[:{relationship}]->(v)
369
+ MERGE (u)-[:{relationship.value}]->(v)
370
370
  """
371
371
  parameters = {
372
372
  "edges": [