unstructured-ingest 0.3.13__py3-none-any.whl → 0.3.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (47)
  1. test/integration/connectors/databricks/test_volumes_native.py +10 -6
  2. test/integration/connectors/discord/test_discord.py +4 -4
  3. test/integration/connectors/duckdb/test_duckdb.py +3 -2
  4. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  5. test/integration/connectors/elasticsearch/test_elasticsearch.py +8 -7
  6. test/integration/connectors/elasticsearch/test_opensearch.py +8 -7
  7. test/integration/connectors/sql/test_databricks_delta_tables.py +142 -0
  8. test/integration/connectors/sql/test_postgres.py +9 -3
  9. test/integration/connectors/sql/test_singlestore.py +9 -3
  10. test/integration/connectors/sql/test_snowflake.py +9 -3
  11. test/integration/connectors/sql/test_sqlite.py +9 -3
  12. test/integration/connectors/test_astradb.py +25 -9
  13. test/integration/connectors/test_azure_ai_search.py +3 -4
  14. test/integration/connectors/test_chroma.py +4 -6
  15. test/integration/connectors/test_confluence.py +3 -5
  16. test/integration/connectors/test_delta_table.py +4 -6
  17. test/integration/connectors/test_lancedb.py +3 -3
  18. test/integration/connectors/test_milvus.py +10 -5
  19. test/integration/connectors/test_mongodb.py +9 -9
  20. test/integration/connectors/test_neo4j.py +16 -8
  21. test/integration/connectors/test_notion.py +7 -0
  22. test/integration/connectors/test_onedrive.py +2 -4
  23. test/integration/connectors/test_pinecone.py +73 -8
  24. test/integration/connectors/test_qdrant.py +5 -4
  25. test/integration/connectors/test_redis.py +3 -3
  26. test/integration/connectors/test_s3.py +7 -6
  27. test/integration/connectors/test_vectara.py +2 -2
  28. test/integration/connectors/utils/constants.py +6 -0
  29. test/integration/connectors/utils/docker.py +2 -2
  30. test/integration/connectors/weaviate/test_cloud.py +5 -0
  31. test/integration/connectors/weaviate/test_local.py +2 -2
  32. unstructured_ingest/__version__.py +1 -1
  33. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  34. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +6 -0
  35. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +6 -3
  36. unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +106 -0
  37. unstructured_ingest/v2/processes/connectors/neo4j.py +12 -12
  38. unstructured_ingest/v2/processes/connectors/pinecone.py +18 -11
  39. unstructured_ingest/v2/processes/connectors/sql/__init__.py +6 -0
  40. unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +213 -0
  41. unstructured_ingest/v2/processes/connectors/sql/sql.py +26 -9
  42. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/METADATA +20 -18
  43. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/RECORD +47 -44
  44. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/LICENSE.md +0 -0
  45. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/WHEEL +0 -0
  46. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/entry_points.txt +0 -0
  47. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/top_level.txt +0 -0
@@ -23,9 +23,7 @@ from azure.search.documents.indexes.models import (
23
23
  VectorSearchProfile,
24
24
  )
25
25
 
26
- from test.integration.connectors.utils.constants import (
27
- DESTINATION_TAG,
28
- )
26
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
29
27
  from test.integration.connectors.utils.validation.destination import (
30
28
  StagerValidationConfigs,
31
29
  stager_validation,
@@ -195,7 +193,7 @@ def validate_count(
195
193
 
196
194
 
197
195
  @pytest.mark.asyncio
198
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
196
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
199
197
  @requires_env("AZURE_SEARCH_API_KEY")
200
198
  async def test_azure_ai_search_destination(
201
199
  upload_file: Path,
@@ -239,6 +237,7 @@ async def test_azure_ai_search_destination(
239
237
  validate_count(search_client=search_client, expected_count=expected_count)
240
238
 
241
239
 
240
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
242
241
  @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
243
242
  def test_azure_ai_search_stager(
244
243
  request: TopRequest,
@@ -5,9 +5,7 @@ import chromadb
5
5
  import pytest
6
6
  from _pytest.fixtures import TopRequest
7
7
 
8
- from test.integration.connectors.utils.constants import (
9
- DESTINATION_TAG,
10
- )
8
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
11
9
  from test.integration.connectors.utils.docker import HealthCheck, container_context
12
10
  from test.integration.connectors.utils.validation.destination import (
13
11
  StagerValidationConfigs,
@@ -27,7 +25,7 @@ from unstructured_ingest.v2.processes.connectors.chroma import (
27
25
  @pytest.fixture
28
26
  def chroma_instance():
29
27
  with container_context(
30
- image="chromadb/chroma:latest",
28
+ image="chromadb/chroma:0.6.2",
31
29
  ports={8000: 8000},
32
30
  name="chroma_int_test",
33
31
  healthcheck=HealthCheck(
@@ -64,7 +62,7 @@ def validate_collection(collection_name: str, num_embeddings: int):
64
62
  )
65
63
 
66
64
 
67
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
65
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
68
66
  def test_chroma_destination(
69
67
  upload_file: Path,
70
68
  chroma_instance,
@@ -104,7 +102,7 @@ def test_chroma_destination(
104
102
 
105
103
 
106
104
  @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
107
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "stager")
105
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "stager", VECTOR_DB_TAG)
108
106
  def test_chroma_stager(
109
107
  request: TopRequest,
110
108
  upload_file_str: str,
@@ -2,9 +2,7 @@ import os
2
2
 
3
3
  import pytest
4
4
 
5
- from test.integration.connectors.utils.constants import (
6
- SOURCE_TAG,
7
- )
5
+ from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
8
6
  from test.integration.connectors.utils.validation.source import (
9
7
  SourceValidationConfigs,
10
8
  source_connector_validation,
@@ -22,7 +20,7 @@ from unstructured_ingest.v2.processes.connectors.confluence import (
22
20
 
23
21
 
24
22
  @pytest.mark.asyncio
25
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
23
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
26
24
  @requires_env("CONFLUENCE_USER_EMAIL", "CONFLUENCE_API_TOKEN")
27
25
  async def test_confluence_source(temp_dir):
28
26
  # Retrieve environment variables
@@ -69,7 +67,7 @@ async def test_confluence_source(temp_dir):
69
67
 
70
68
 
71
69
  @pytest.mark.asyncio
72
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
70
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
73
71
  @requires_env("CONFLUENCE_USER_EMAIL", "CONFLUENCE_API_TOKEN")
74
72
  async def test_confluence_source_large(temp_dir):
75
73
  # Retrieve environment variables
@@ -6,9 +6,7 @@ import pytest
6
6
  from deltalake import DeltaTable
7
7
  from fsspec import get_filesystem_class
8
8
 
9
- from test.integration.connectors.utils.constants import (
10
- DESTINATION_TAG,
11
- )
9
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
12
10
  from test.integration.utils import requires_env
13
11
  from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
14
12
  from unstructured_ingest.v2.processes.connectors.delta_table import (
@@ -25,7 +23,7 @@ multiprocessing.set_start_method("spawn")
25
23
 
26
24
 
27
25
  @pytest.mark.asyncio
28
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
26
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
29
27
  async def test_delta_table_destination_local(upload_file: Path, temp_dir: Path):
30
28
  destination_path = str(temp_dir)
31
29
  connection_config = DeltaTableConnectionConfig(
@@ -81,7 +79,7 @@ def get_aws_credentials() -> dict:
81
79
 
82
80
 
83
81
  @pytest.mark.asyncio
84
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
82
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
85
83
  @requires_env("S3_INGEST_TEST_ACCESS_KEY", "S3_INGEST_TEST_SECRET_KEY")
86
84
  async def test_delta_table_destination_s3(upload_file: Path, temp_dir: Path):
87
85
  aws_credentials = get_aws_credentials()
@@ -140,7 +138,7 @@ async def test_delta_table_destination_s3(upload_file: Path, temp_dir: Path):
140
138
 
141
139
 
142
140
  @pytest.mark.asyncio
143
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
141
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
144
142
  @requires_env("S3_INGEST_TEST_ACCESS_KEY", "S3_INGEST_TEST_SECRET_KEY")
145
143
  async def test_delta_table_destination_s3_bad_creds(upload_file: Path, temp_dir: Path):
146
144
  aws_credentials = {
@@ -11,7 +11,7 @@ import pytest_asyncio
11
11
  from lancedb import AsyncConnection
12
12
  from upath import UPath
13
13
 
14
- from test.integration.connectors.utils.constants import DESTINATION_TAG
14
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
15
15
  from unstructured_ingest.v2.constants import RECORD_ID_LABEL
16
16
  from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
17
17
  from unstructured_ingest.v2.processes.connectors.lancedb.aws import (
@@ -106,7 +106,7 @@ async def connection_with_uri(request, tmp_path: Path):
106
106
 
107
107
 
108
108
  @pytest.mark.asyncio
109
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
109
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
110
110
  @pytest.mark.parametrize("connection_with_uri", ["local", "s3", "gcs", "az"], indirect=True)
111
111
  async def test_lancedb_destination(
112
112
  upload_file: Path,
@@ -164,7 +164,7 @@ async def test_lancedb_destination(
164
164
 
165
165
 
166
166
  class TestPrecheck:
167
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
167
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
168
168
  @pytest.mark.parametrize("connection_with_uri", ["local", "s3", "gcs", "az"], indirect=True)
169
169
  def test_succeeds(
170
170
  self,
@@ -13,7 +13,11 @@ from pymilvus import (
13
13
  )
14
14
  from pymilvus.milvus_client import IndexParams
15
15
 
16
- from test.integration.connectors.utils.constants import DESTINATION_TAG, env_setup_path
16
+ from test.integration.connectors.utils.constants import (
17
+ DESTINATION_TAG,
18
+ VECTOR_DB_TAG,
19
+ env_setup_path,
20
+ )
17
21
  from test.integration.connectors.utils.docker import healthcheck_wait
18
22
  from test.integration.connectors.utils.docker_compose import docker_compose_context
19
23
  from test.integration.connectors.utils.validation.destination import (
@@ -112,7 +116,7 @@ def validate_count(
112
116
 
113
117
 
114
118
  @pytest.mark.asyncio
115
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
119
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
116
120
  async def test_milvus_destination(
117
121
  upload_file: Path,
118
122
  collection: str,
@@ -150,7 +154,7 @@ async def test_milvus_destination(
150
154
  validate_count(client=client, expected_count=expected_count)
151
155
 
152
156
 
153
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
157
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
154
158
  def test_precheck_succeeds(collection: str):
155
159
  uploader = MilvusUploader(
156
160
  connection_config=MilvusConnectionConfig(uri=DB_URI),
@@ -159,7 +163,7 @@ def test_precheck_succeeds(collection: str):
159
163
  uploader.precheck()
160
164
 
161
165
 
162
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
166
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
163
167
  def test_precheck_fails_on_nonexistent_collection(collection: str):
164
168
  uploader = MilvusUploader(
165
169
  connection_config=MilvusConnectionConfig(uri=DB_URI),
@@ -174,7 +178,7 @@ def test_precheck_fails_on_nonexistent_collection(collection: str):
174
178
  uploader.precheck()
175
179
 
176
180
 
177
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
181
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
178
182
  def test_precheck_fails_on_nonexisting_db(collection: str):
179
183
  uploader = MilvusUploader(
180
184
  connection_config=MilvusConnectionConfig(uri=DB_URI),
@@ -187,6 +191,7 @@ def test_precheck_fails_on_nonexisting_db(collection: str):
187
191
  uploader.precheck()
188
192
 
189
193
 
194
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
190
195
  @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
191
196
  def test_milvus_stager(
192
197
  request: TopRequest,
@@ -13,7 +13,7 @@ from pymongo.database import Database
13
13
  from pymongo.mongo_client import MongoClient
14
14
  from pymongo.operations import SearchIndexModel
15
15
 
16
- from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
16
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, NOSQL_TAG, SOURCE_TAG
17
17
  from test.integration.connectors.utils.validation.source import (
18
18
  SourceValidationConfigs,
19
19
  source_connector_validation,
@@ -180,7 +180,7 @@ def validate_collection_vector(
180
180
 
181
181
 
182
182
  @pytest.mark.asyncio
183
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
183
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
184
184
  @requires_env("MONGODB_URI", "MONGODB_DATABASE")
185
185
  async def test_mongodb_source(temp_dir: Path):
186
186
  env_data = get_env_data()
@@ -205,7 +205,7 @@ async def test_mongodb_source(temp_dir: Path):
205
205
  )
206
206
 
207
207
 
208
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
208
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
209
209
  def test_mongodb_indexer_precheck_fail_no_host():
210
210
  indexer_config = MongoDBIndexerConfig(
211
211
  database="non-existent-database", collection="non-existent-database"
@@ -218,7 +218,7 @@ def test_mongodb_indexer_precheck_fail_no_host():
218
218
  indexer.precheck()
219
219
 
220
220
 
221
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
221
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
222
222
  @requires_env("MONGODB_URI", "MONGODB_DATABASE")
223
223
  def test_mongodb_indexer_precheck_fail_no_database():
224
224
  env_data = get_env_data()
@@ -233,7 +233,7 @@ def test_mongodb_indexer_precheck_fail_no_database():
233
233
  indexer.precheck()
234
234
 
235
235
 
236
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
236
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
237
237
  @requires_env("MONGODB_URI", "MONGODB_DATABASE")
238
238
  def test_mongodb_indexer_precheck_fail_no_collection():
239
239
  env_data = get_env_data()
@@ -249,7 +249,7 @@ def test_mongodb_indexer_precheck_fail_no_collection():
249
249
 
250
250
 
251
251
  @pytest.mark.asyncio
252
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
252
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
253
253
  @requires_env("MONGODB_URI", "MONGODB_DATABASE")
254
254
  async def test_mongodb_destination(
255
255
  upload_file: Path,
@@ -289,7 +289,7 @@ async def test_mongodb_destination(
289
289
  validate_collection_count(collection=destination_collection, expected_records=expected_records)
290
290
 
291
291
 
292
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
292
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
293
293
  def test_mongodb_uploader_precheck_fail_no_host():
294
294
  upload_config = MongoDBUploaderConfig(
295
295
  database="database",
@@ -303,7 +303,7 @@ def test_mongodb_uploader_precheck_fail_no_host():
303
303
  uploader.precheck()
304
304
 
305
305
 
306
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
306
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
307
307
  @requires_env("MONGODB_URI", "MONGODB_DATABASE")
308
308
  def test_mongodb_uploader_precheck_fail_no_database():
309
309
  env_data = get_env_data()
@@ -319,7 +319,7 @@ def test_mongodb_uploader_precheck_fail_no_database():
319
319
  uploader.precheck()
320
320
 
321
321
 
322
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
322
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
323
323
  @requires_env("MONGODB_URI", "MONGODB_DATABASE")
324
324
  def test_mongodb_uploader_precheck_fail_no_collection():
325
325
  env_data = get_env_data()
@@ -9,7 +9,7 @@ from neo4j import AsyncGraphDatabase, Driver, GraphDatabase
9
9
  from neo4j.exceptions import ServiceUnavailable
10
10
  from pytest_check import check
11
11
 
12
- from test.integration.connectors.utils.constants import DESTINATION_TAG
12
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, GRAPH_DB_TAG
13
13
  from test.integration.connectors.utils.docker import container_context
14
14
  from unstructured_ingest.error import DestinationConnectionError
15
15
  from unstructured_ingest.utils.chunking import elements_from_base64_gzipped_json
@@ -51,7 +51,7 @@ def _neo4j_server():
51
51
 
52
52
 
53
53
  @pytest.mark.asyncio
54
- @pytest.mark.tags(DESTINATION_TAG, CONNECTOR_TYPE)
54
+ @pytest.mark.tags(DESTINATION_TAG, CONNECTOR_TYPE, GRAPH_DB_TAG)
55
55
  async def test_neo4j_destination(upload_file: Path, tmp_path: Path):
56
56
  stager = Neo4jUploadStager()
57
57
  uploader = Neo4jUploader(
@@ -104,7 +104,7 @@ async def test_neo4j_destination(upload_file: Path, tmp_path: Path):
104
104
  await validate_uploaded_graph(modified_upload_file)
105
105
 
106
106
 
107
- @pytest.mark.tags(DESTINATION_TAG, CONNECTOR_TYPE)
107
+ @pytest.mark.tags(DESTINATION_TAG, CONNECTOR_TYPE, GRAPH_DB_TAG)
108
108
  class TestPrecheck:
109
109
  @pytest.fixture
110
110
  def configured_uploader(self) -> Neo4jUploader:
@@ -199,13 +199,15 @@ async def validate_uploaded_graph(upload_file: Path):
199
199
  try:
200
200
  nodes_count = len((await driver.execute_query("MATCH (n) RETURN n"))[0])
201
201
  chunk_nodes_count = len(
202
- (await driver.execute_query(f"MATCH (n: {Label.CHUNK}) RETURN n"))[0]
202
+ (await driver.execute_query(f"MATCH (n: {Label.CHUNK.value}) RETURN n"))[0]
203
203
  )
204
204
  document_nodes_count = len(
205
- (await driver.execute_query(f"MATCH (n: {Label.DOCUMENT}) RETURN n"))[0]
205
+ (await driver.execute_query(f"MATCH (n: {Label.DOCUMENT.value}) RETURN n"))[0]
206
206
  )
207
207
  element_nodes_count = len(
208
- (await driver.execute_query(f"MATCH (n: {Label.UNSTRUCTURED_ELEMENT}) RETURN n"))[0]
208
+ (await driver.execute_query(f"MATCH (n: {Label.UNSTRUCTURED_ELEMENT.value}) RETURN n"))[
209
+ 0
210
+ ]
209
211
  )
210
212
  with check:
211
213
  assert nodes_count == expected_nodes_count
@@ -217,12 +219,18 @@ async def validate_uploaded_graph(upload_file: Path):
217
219
  assert element_nodes_count == expected_element_count
218
220
 
219
221
  records, _, _ = await driver.execute_query(
220
- f"MATCH ()-[r:{Relationship.PART_OF_DOCUMENT}]->(:{Label.DOCUMENT}) RETURN r"
222
+ f"""
223
+ MATCH ()-[r:{Relationship.PART_OF_DOCUMENT.value}]->(:{Label.DOCUMENT.value})
224
+ RETURN r
225
+ """
221
226
  )
222
227
  part_of_document_count = len(records)
223
228
 
224
229
  records, _, _ = await driver.execute_query(
225
- f"MATCH (:{Label.CHUNK})-[r:{Relationship.NEXT_CHUNK}]->(:{Label.CHUNK}) RETURN r"
230
+ f"""
231
+ MATCH (:{Label.CHUNK.value})-[r:{Relationship.NEXT_CHUNK.value}]->(:{Label.CHUNK.value})
232
+ RETURN r
233
+ """
226
234
  )
227
235
  next_chunk_count = len(records)
228
236
 
@@ -1,5 +1,8 @@
1
1
  import os
2
2
 
3
+ import pytest
4
+
5
+ from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
3
6
  from test.integration.connectors.utils.validation.source import (
4
7
  SourceValidationConfigs,
5
8
  get_all_file_data,
@@ -8,6 +11,7 @@ from test.integration.connectors.utils.validation.source import (
8
11
  )
9
12
  from unstructured_ingest.v2.interfaces import Downloader, Indexer
10
13
  from unstructured_ingest.v2.processes.connectors.notion.connector import (
14
+ CONNECTOR_TYPE,
11
15
  NotionAccessConfig,
12
16
  NotionConnectionConfig,
13
17
  NotionDownloader,
@@ -17,6 +21,7 @@ from unstructured_ingest.v2.processes.connectors.notion.connector import (
17
21
  )
18
22
 
19
23
 
24
+ @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
20
25
  def test_notion_source_database(temp_dir):
21
26
  # Retrieve environment variables
22
27
  notion_api_key = os.environ["NOTION_API_KEY"]
@@ -55,6 +60,7 @@ def test_notion_source_database(temp_dir):
55
60
  )
56
61
 
57
62
 
63
+ @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
58
64
  def test_notion_source_page(temp_dir):
59
65
  # Retrieve environment variables
60
66
  notion_api_key = os.environ["NOTION_API_KEY"]
@@ -93,6 +99,7 @@ def test_notion_source_page(temp_dir):
93
99
  )
94
100
 
95
101
 
102
+ @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
96
103
  def source_connector_validation(
97
104
  indexer: Indexer,
98
105
  downloader: Downloader,
@@ -5,9 +5,7 @@ from pathlib import Path
5
5
  import pytest
6
6
  from office365.graph_client import GraphClient
7
7
 
8
- from test.integration.connectors.utils.constants import (
9
- DESTINATION_TAG,
10
- )
8
+ from test.integration.connectors.utils.constants import BLOB_STORAGE_TAG, DESTINATION_TAG
11
9
  from test.integration.utils import requires_env
12
10
  from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
13
11
  from unstructured_ingest.v2.processes.connectors.onedrive import (
@@ -67,7 +65,7 @@ def get_connection_config():
67
65
  return connection_config
68
66
 
69
67
 
70
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
68
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, BLOB_STORAGE_TAG)
71
69
  @requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
72
70
  @pytest.mark.xfail(
73
71
  reason="Issues with test setup on the provider side."
@@ -12,9 +12,7 @@ from _pytest.fixtures import TopRequest
12
12
  from pinecone import Pinecone, ServerlessSpec
13
13
  from pinecone.core.openapi.shared.exceptions import NotFoundException
14
14
 
15
- from test.integration.connectors.utils.constants import (
16
- DESTINATION_TAG,
17
- )
15
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
18
16
  from test.integration.connectors.utils.validation.destination import (
19
17
  StagerValidationConfigs,
20
18
  stager_validation,
@@ -109,11 +107,15 @@ def pinecone_index() -> Generator[str, None, None]:
109
107
 
110
108
 
111
109
  def validate_pinecone_index(
112
- index_name: str, expected_num_of_vectors: int, retries=30, interval=1
110
+ index_name: str,
111
+ expected_num_of_vectors: int,
112
+ retries=30,
113
+ interval=1,
114
+ namespace: str = "default",
113
115
  ) -> None:
114
116
  # Because there's a delay for the index to catch up to the recent writes, add in a retry
115
117
  pinecone = Pinecone(api_key=get_api_key())
116
- index = pinecone.Index(name=index_name)
118
+ index = pinecone.Index(name=index_name, namespace=namespace)
117
119
  vector_count = -1
118
120
  for i in range(retries):
119
121
  index_stats = index.describe_index_stats()
@@ -133,13 +135,15 @@ def validate_pinecone_index(
133
135
 
134
136
  @requires_env(API_KEY)
135
137
  @pytest.mark.asyncio
136
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
138
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
137
139
  async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp_dir: Path):
140
+
138
141
  file_data = FileData(
139
142
  source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
140
143
  connector_type=CONNECTOR_TYPE,
141
144
  identifier="pinecone_mock_id",
142
145
  )
146
+
143
147
  connection_config = PineconeConnectionConfig(
144
148
  index_name=pinecone_index,
145
149
  access_config=PineconeAccessConfig(api_key=get_api_key()),
@@ -176,7 +180,7 @@ async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp
176
180
 
177
181
  @requires_env(API_KEY)
178
182
  @pytest.mark.asyncio
179
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
183
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
180
184
  @pytest.mark.skip(reason="TODO: get this to work")
181
185
  async def test_pinecone_destination_large_index(
182
186
  pinecone_index: str, upload_file: Path, temp_dir: Path
@@ -227,7 +231,67 @@ async def test_pinecone_destination_large_index(
227
231
 
228
232
 
229
233
  @requires_env(API_KEY)
230
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
234
+ @pytest.mark.asyncio
235
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
236
+ async def test_pinecone_destination_namespace(
237
+ pinecone_index: str, upload_file: Path, temp_dir: Path
238
+ ):
239
+ """
240
+ tests namespace functionality of destination connector.
241
+ """
242
+
243
+ # creates a file data structure.
244
+ file_data = FileData(
245
+ source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
246
+ connector_type=CONNECTOR_TYPE,
247
+ identifier="pinecone_mock_id",
248
+ )
249
+
250
+ connection_config = PineconeConnectionConfig(
251
+ index_name=pinecone_index,
252
+ access_config=PineconeAccessConfig(api_key=get_api_key()),
253
+ )
254
+
255
+ stager_config = PineconeUploadStagerConfig()
256
+
257
+ stager = PineconeUploadStager(upload_stager_config=stager_config)
258
+ new_upload_file = stager.run(
259
+ elements_filepath=upload_file,
260
+ output_dir=temp_dir,
261
+ output_filename=upload_file.name,
262
+ file_data=file_data,
263
+ )
264
+
265
+ # here add namespace definition
266
+ upload_config = PineconeUploaderConfig()
267
+ namespace_test_name = "user-1"
268
+ upload_config.namespace = namespace_test_name
269
+ uploader = PineconeUploader(connection_config=connection_config, upload_config=upload_config)
270
+ uploader.precheck()
271
+
272
+ uploader.run(path=new_upload_file, file_data=file_data)
273
+ with new_upload_file.open() as f:
274
+ staged_content = json.load(f)
275
+ expected_num_of_vectors = len(staged_content)
276
+ logger.info("validating first upload")
277
+ validate_pinecone_index(
278
+ index_name=pinecone_index,
279
+ expected_num_of_vectors=expected_num_of_vectors,
280
+ namespace=namespace_test_name,
281
+ )
282
+
283
+ # Rerun uploader and make sure no duplicates exist
284
+ uploader.run(path=new_upload_file, file_data=file_data)
285
+ logger.info("validating second upload")
286
+ validate_pinecone_index(
287
+ index_name=pinecone_index,
288
+ expected_num_of_vectors=expected_num_of_vectors,
289
+ namespace=namespace_test_name,
290
+ )
291
+
292
+
293
+ @requires_env(API_KEY)
294
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
231
295
  def test_large_metadata(pinecone_index: str, tmp_path: Path, upload_file: Path):
232
296
  stager = PineconeUploadStager()
233
297
  uploader = PineconeUploader(
@@ -272,6 +336,7 @@ def test_large_metadata(pinecone_index: str, tmp_path: Path, upload_file: Path):
272
336
  validate_pinecone_index(pinecone_index, 1, interval=5)
273
337
 
274
338
 
339
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
275
340
  @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
276
341
  def test_pinecone_stager(
277
342
  request: TopRequest,
@@ -9,7 +9,7 @@ import pytest
9
9
  from _pytest.fixtures import TopRequest
10
10
  from qdrant_client import AsyncQdrantClient
11
11
 
12
- from test.integration.connectors.utils.constants import DESTINATION_TAG
12
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
13
13
  from test.integration.connectors.utils.docker import container_context
14
14
  from test.integration.connectors.utils.validation.destination import (
15
15
  StagerValidationConfigs,
@@ -75,7 +75,7 @@ async def validate_upload(client: AsyncQdrantClient, upload_file: Path):
75
75
 
76
76
 
77
77
  @pytest.mark.asyncio
78
- @pytest.mark.tags(LOCAL_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant")
78
+ @pytest.mark.tags(LOCAL_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant", VECTOR_DB_TAG)
79
79
  async def test_qdrant_destination_local(upload_file: Path, tmp_path: Path):
80
80
  connection_kwargs = {"path": str(tmp_path / "qdrant")}
81
81
  async with qdrant_client(connection_kwargs) as client:
@@ -117,7 +117,7 @@ def docker_context():
117
117
 
118
118
 
119
119
  @pytest.mark.asyncio
120
- @pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant")
120
+ @pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant", VECTOR_DB_TAG)
121
121
  async def test_qdrant_destination_server(upload_file: Path, tmp_path: Path, docker_context):
122
122
  connection_kwargs = {"location": "http://localhost:6333"}
123
123
  async with qdrant_client(connection_kwargs) as client:
@@ -153,7 +153,7 @@ async def test_qdrant_destination_server(upload_file: Path, tmp_path: Path, dock
153
153
 
154
154
 
155
155
  @pytest.mark.asyncio
156
- @pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant")
156
+ @pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant", VECTOR_DB_TAG)
157
157
  @requires_env("QDRANT_API_KEY", "QDRANT_SERVER_URL")
158
158
  async def test_qdrant_destination_cloud(upload_file: Path, tmp_path: Path):
159
159
  server_url = os.environ["QDRANT_SERVER_URL"]
@@ -197,6 +197,7 @@ async def test_qdrant_destination_cloud(upload_file: Path, tmp_path: Path):
197
197
  await validate_upload(client=client, upload_file=upload_file)
198
198
 
199
199
 
200
+ @pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant", VECTOR_DB_TAG)
200
201
  @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
201
202
  def test_qdrant_stager(
202
203
  request: TopRequest,
@@ -9,7 +9,7 @@ import pytest
9
9
  from redis import exceptions as redis_exceptions
10
10
  from redis.asyncio import Redis, from_url
11
11
 
12
- from test.integration.connectors.utils.constants import DESTINATION_TAG
12
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, NOSQL_TAG
13
13
  from test.integration.utils import requires_env
14
14
  from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
15
15
  from unstructured_ingest.v2.processes.connectors.redisdb import (
@@ -96,7 +96,7 @@ async def redis_destination_test(
96
96
 
97
97
 
98
98
  @pytest.mark.asyncio
99
- @pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG)
99
+ @pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
100
100
  @requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
101
101
  async def test_redis_destination_azure_with_password(upload_file: Path, tmp_path: Path):
102
102
  connection_kwargs = {
@@ -110,7 +110,7 @@ async def test_redis_destination_azure_with_password(upload_file: Path, tmp_path
110
110
 
111
111
 
112
112
  @pytest.mark.asyncio
113
- @pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG, "redis")
113
+ @pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG, "redis", NOSQL_TAG)
114
114
  @requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
115
115
  async def test_redis_destination_azure_with_uri(upload_file: Path, tmp_path: Path):
116
116
  connection_kwargs = {}