unstructured_ingest-0.3.13-py3-none-any.whl → unstructured_ingest-0.3.15-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/integration/connectors/databricks/test_volumes_native.py +10 -6
- test/integration/connectors/discord/test_discord.py +4 -4
- test/integration/connectors/duckdb/test_duckdb.py +3 -2
- test/integration/connectors/duckdb/test_motherduck.py +2 -2
- test/integration/connectors/elasticsearch/test_elasticsearch.py +8 -7
- test/integration/connectors/elasticsearch/test_opensearch.py +8 -7
- test/integration/connectors/sql/test_databricks_delta_tables.py +142 -0
- test/integration/connectors/sql/test_postgres.py +9 -3
- test/integration/connectors/sql/test_singlestore.py +9 -3
- test/integration/connectors/sql/test_snowflake.py +9 -3
- test/integration/connectors/sql/test_sqlite.py +9 -3
- test/integration/connectors/test_astradb.py +25 -9
- test/integration/connectors/test_azure_ai_search.py +3 -4
- test/integration/connectors/test_chroma.py +4 -6
- test/integration/connectors/test_confluence.py +3 -5
- test/integration/connectors/test_delta_table.py +4 -6
- test/integration/connectors/test_lancedb.py +3 -3
- test/integration/connectors/test_milvus.py +10 -5
- test/integration/connectors/test_mongodb.py +9 -9
- test/integration/connectors/test_neo4j.py +16 -8
- test/integration/connectors/test_notion.py +7 -0
- test/integration/connectors/test_onedrive.py +2 -4
- test/integration/connectors/test_pinecone.py +73 -8
- test/integration/connectors/test_qdrant.py +5 -4
- test/integration/connectors/test_redis.py +3 -3
- test/integration/connectors/test_s3.py +7 -6
- test/integration/connectors/test_vectara.py +2 -2
- test/integration/connectors/utils/constants.py +6 -0
- test/integration/connectors/utils/docker.py +2 -2
- test/integration/connectors/weaviate/test_cloud.py +5 -0
- test/integration/connectors/weaviate/test_local.py +2 -2
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +6 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +6 -3
- unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +106 -0
- unstructured_ingest/v2/processes/connectors/neo4j.py +12 -12
- unstructured_ingest/v2/processes/connectors/pinecone.py +18 -11
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +6 -0
- unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +213 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +26 -9
- {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/METADATA +20 -18
- {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/RECORD +47 -44
- {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/top_level.txt +0 -0
|
@@ -23,9 +23,7 @@ from azure.search.documents.indexes.models import (
|
|
|
23
23
|
VectorSearchProfile,
|
|
24
24
|
)
|
|
25
25
|
|
|
26
|
-
from test.integration.connectors.utils.constants import (
|
|
27
|
-
DESTINATION_TAG,
|
|
28
|
-
)
|
|
26
|
+
from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
|
|
29
27
|
from test.integration.connectors.utils.validation.destination import (
|
|
30
28
|
StagerValidationConfigs,
|
|
31
29
|
stager_validation,
|
|
@@ -195,7 +193,7 @@ def validate_count(
|
|
|
195
193
|
|
|
196
194
|
|
|
197
195
|
@pytest.mark.asyncio
|
|
198
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
|
|
196
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
|
|
199
197
|
@requires_env("AZURE_SEARCH_API_KEY")
|
|
200
198
|
async def test_azure_ai_search_destination(
|
|
201
199
|
upload_file: Path,
|
|
@@ -239,6 +237,7 @@ async def test_azure_ai_search_destination(
|
|
|
239
237
|
validate_count(search_client=search_client, expected_count=expected_count)
|
|
240
238
|
|
|
241
239
|
|
|
240
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
|
|
242
241
|
@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
|
|
243
242
|
def test_azure_ai_search_stager(
|
|
244
243
|
request: TopRequest,
|
|
@@ -5,9 +5,7 @@ import chromadb
|
|
|
5
5
|
import pytest
|
|
6
6
|
from _pytest.fixtures import TopRequest
|
|
7
7
|
|
|
8
|
-
from test.integration.connectors.utils.constants import (
|
|
9
|
-
DESTINATION_TAG,
|
|
10
|
-
)
|
|
8
|
+
from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
|
|
11
9
|
from test.integration.connectors.utils.docker import HealthCheck, container_context
|
|
12
10
|
from test.integration.connectors.utils.validation.destination import (
|
|
13
11
|
StagerValidationConfigs,
|
|
@@ -27,7 +25,7 @@ from unstructured_ingest.v2.processes.connectors.chroma import (
|
|
|
27
25
|
@pytest.fixture
|
|
28
26
|
def chroma_instance():
|
|
29
27
|
with container_context(
|
|
30
|
-
image="chromadb/chroma:
|
|
28
|
+
image="chromadb/chroma:0.6.2",
|
|
31
29
|
ports={8000: 8000},
|
|
32
30
|
name="chroma_int_test",
|
|
33
31
|
healthcheck=HealthCheck(
|
|
@@ -64,7 +62,7 @@ def validate_collection(collection_name: str, num_embeddings: int):
|
|
|
64
62
|
)
|
|
65
63
|
|
|
66
64
|
|
|
67
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
|
|
65
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
|
|
68
66
|
def test_chroma_destination(
|
|
69
67
|
upload_file: Path,
|
|
70
68
|
chroma_instance,
|
|
@@ -104,7 +102,7 @@ def test_chroma_destination(
|
|
|
104
102
|
|
|
105
103
|
|
|
106
104
|
@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
|
|
107
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "stager")
|
|
105
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "stager", VECTOR_DB_TAG)
|
|
108
106
|
def test_chroma_stager(
|
|
109
107
|
request: TopRequest,
|
|
110
108
|
upload_file_str: str,
|
|
@@ -2,9 +2,7 @@ import os
|
|
|
2
2
|
|
|
3
3
|
import pytest
|
|
4
4
|
|
|
5
|
-
from test.integration.connectors.utils.constants import (
|
|
6
|
-
SOURCE_TAG,
|
|
7
|
-
)
|
|
5
|
+
from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
|
|
8
6
|
from test.integration.connectors.utils.validation.source import (
|
|
9
7
|
SourceValidationConfigs,
|
|
10
8
|
source_connector_validation,
|
|
@@ -22,7 +20,7 @@ from unstructured_ingest.v2.processes.connectors.confluence import (
|
|
|
22
20
|
|
|
23
21
|
|
|
24
22
|
@pytest.mark.asyncio
|
|
25
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
|
|
23
|
+
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
|
|
26
24
|
@requires_env("CONFLUENCE_USER_EMAIL", "CONFLUENCE_API_TOKEN")
|
|
27
25
|
async def test_confluence_source(temp_dir):
|
|
28
26
|
# Retrieve environment variables
|
|
@@ -69,7 +67,7 @@ async def test_confluence_source(temp_dir):
|
|
|
69
67
|
|
|
70
68
|
|
|
71
69
|
@pytest.mark.asyncio
|
|
72
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
|
|
70
|
+
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
|
|
73
71
|
@requires_env("CONFLUENCE_USER_EMAIL", "CONFLUENCE_API_TOKEN")
|
|
74
72
|
async def test_confluence_source_large(temp_dir):
|
|
75
73
|
# Retrieve environment variables
|
|
@@ -6,9 +6,7 @@ import pytest
|
|
|
6
6
|
from deltalake import DeltaTable
|
|
7
7
|
from fsspec import get_filesystem_class
|
|
8
8
|
|
|
9
|
-
from test.integration.connectors.utils.constants import (
|
|
10
|
-
DESTINATION_TAG,
|
|
11
|
-
)
|
|
9
|
+
from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
|
|
12
10
|
from test.integration.utils import requires_env
|
|
13
11
|
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
14
12
|
from unstructured_ingest.v2.processes.connectors.delta_table import (
|
|
@@ -25,7 +23,7 @@ multiprocessing.set_start_method("spawn")
|
|
|
25
23
|
|
|
26
24
|
|
|
27
25
|
@pytest.mark.asyncio
|
|
28
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
|
|
26
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
|
|
29
27
|
async def test_delta_table_destination_local(upload_file: Path, temp_dir: Path):
|
|
30
28
|
destination_path = str(temp_dir)
|
|
31
29
|
connection_config = DeltaTableConnectionConfig(
|
|
@@ -81,7 +79,7 @@ def get_aws_credentials() -> dict:
|
|
|
81
79
|
|
|
82
80
|
|
|
83
81
|
@pytest.mark.asyncio
|
|
84
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
|
|
82
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
|
|
85
83
|
@requires_env("S3_INGEST_TEST_ACCESS_KEY", "S3_INGEST_TEST_SECRET_KEY")
|
|
86
84
|
async def test_delta_table_destination_s3(upload_file: Path, temp_dir: Path):
|
|
87
85
|
aws_credentials = get_aws_credentials()
|
|
@@ -140,7 +138,7 @@ async def test_delta_table_destination_s3(upload_file: Path, temp_dir: Path):
|
|
|
140
138
|
|
|
141
139
|
|
|
142
140
|
@pytest.mark.asyncio
|
|
143
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
|
|
141
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
|
|
144
142
|
@requires_env("S3_INGEST_TEST_ACCESS_KEY", "S3_INGEST_TEST_SECRET_KEY")
|
|
145
143
|
async def test_delta_table_destination_s3_bad_creds(upload_file: Path, temp_dir: Path):
|
|
146
144
|
aws_credentials = {
|
|
@@ -11,7 +11,7 @@ import pytest_asyncio
|
|
|
11
11
|
from lancedb import AsyncConnection
|
|
12
12
|
from upath import UPath
|
|
13
13
|
|
|
14
|
-
from test.integration.connectors.utils.constants import DESTINATION_TAG
|
|
14
|
+
from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
|
|
15
15
|
from unstructured_ingest.v2.constants import RECORD_ID_LABEL
|
|
16
16
|
from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
|
|
17
17
|
from unstructured_ingest.v2.processes.connectors.lancedb.aws import (
|
|
@@ -106,7 +106,7 @@ async def connection_with_uri(request, tmp_path: Path):
|
|
|
106
106
|
|
|
107
107
|
|
|
108
108
|
@pytest.mark.asyncio
|
|
109
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
|
|
109
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
|
|
110
110
|
@pytest.mark.parametrize("connection_with_uri", ["local", "s3", "gcs", "az"], indirect=True)
|
|
111
111
|
async def test_lancedb_destination(
|
|
112
112
|
upload_file: Path,
|
|
@@ -164,7 +164,7 @@ async def test_lancedb_destination(
|
|
|
164
164
|
|
|
165
165
|
|
|
166
166
|
class TestPrecheck:
|
|
167
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
|
|
167
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
|
|
168
168
|
@pytest.mark.parametrize("connection_with_uri", ["local", "s3", "gcs", "az"], indirect=True)
|
|
169
169
|
def test_succeeds(
|
|
170
170
|
self,
|
|
@@ -13,7 +13,11 @@ from pymilvus import (
|
|
|
13
13
|
)
|
|
14
14
|
from pymilvus.milvus_client import IndexParams
|
|
15
15
|
|
|
16
|
-
from test.integration.connectors.utils.constants import
|
|
16
|
+
from test.integration.connectors.utils.constants import (
|
|
17
|
+
DESTINATION_TAG,
|
|
18
|
+
VECTOR_DB_TAG,
|
|
19
|
+
env_setup_path,
|
|
20
|
+
)
|
|
17
21
|
from test.integration.connectors.utils.docker import healthcheck_wait
|
|
18
22
|
from test.integration.connectors.utils.docker_compose import docker_compose_context
|
|
19
23
|
from test.integration.connectors.utils.validation.destination import (
|
|
@@ -112,7 +116,7 @@ def validate_count(
|
|
|
112
116
|
|
|
113
117
|
|
|
114
118
|
@pytest.mark.asyncio
|
|
115
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
|
|
119
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
|
|
116
120
|
async def test_milvus_destination(
|
|
117
121
|
upload_file: Path,
|
|
118
122
|
collection: str,
|
|
@@ -150,7 +154,7 @@ async def test_milvus_destination(
|
|
|
150
154
|
validate_count(client=client, expected_count=expected_count)
|
|
151
155
|
|
|
152
156
|
|
|
153
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
|
|
157
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
|
|
154
158
|
def test_precheck_succeeds(collection: str):
|
|
155
159
|
uploader = MilvusUploader(
|
|
156
160
|
connection_config=MilvusConnectionConfig(uri=DB_URI),
|
|
@@ -159,7 +163,7 @@ def test_precheck_succeeds(collection: str):
|
|
|
159
163
|
uploader.precheck()
|
|
160
164
|
|
|
161
165
|
|
|
162
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
|
|
166
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
|
|
163
167
|
def test_precheck_fails_on_nonexistent_collection(collection: str):
|
|
164
168
|
uploader = MilvusUploader(
|
|
165
169
|
connection_config=MilvusConnectionConfig(uri=DB_URI),
|
|
@@ -174,7 +178,7 @@ def test_precheck_fails_on_nonexistent_collection(collection: str):
|
|
|
174
178
|
uploader.precheck()
|
|
175
179
|
|
|
176
180
|
|
|
177
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
|
|
181
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
|
|
178
182
|
def test_precheck_fails_on_nonexisting_db(collection: str):
|
|
179
183
|
uploader = MilvusUploader(
|
|
180
184
|
connection_config=MilvusConnectionConfig(uri=DB_URI),
|
|
@@ -187,6 +191,7 @@ def test_precheck_fails_on_nonexisting_db(collection: str):
|
|
|
187
191
|
uploader.precheck()
|
|
188
192
|
|
|
189
193
|
|
|
194
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
|
|
190
195
|
@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
|
|
191
196
|
def test_milvus_stager(
|
|
192
197
|
request: TopRequest,
|
|
@@ -13,7 +13,7 @@ from pymongo.database import Database
|
|
|
13
13
|
from pymongo.mongo_client import MongoClient
|
|
14
14
|
from pymongo.operations import SearchIndexModel
|
|
15
15
|
|
|
16
|
-
from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
|
|
16
|
+
from test.integration.connectors.utils.constants import DESTINATION_TAG, NOSQL_TAG, SOURCE_TAG
|
|
17
17
|
from test.integration.connectors.utils.validation.source import (
|
|
18
18
|
SourceValidationConfigs,
|
|
19
19
|
source_connector_validation,
|
|
@@ -180,7 +180,7 @@ def validate_collection_vector(
|
|
|
180
180
|
|
|
181
181
|
|
|
182
182
|
@pytest.mark.asyncio
|
|
183
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
|
|
183
|
+
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
|
|
184
184
|
@requires_env("MONGODB_URI", "MONGODB_DATABASE")
|
|
185
185
|
async def test_mongodb_source(temp_dir: Path):
|
|
186
186
|
env_data = get_env_data()
|
|
@@ -205,7 +205,7 @@ async def test_mongodb_source(temp_dir: Path):
|
|
|
205
205
|
)
|
|
206
206
|
|
|
207
207
|
|
|
208
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
|
|
208
|
+
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
|
|
209
209
|
def test_mongodb_indexer_precheck_fail_no_host():
|
|
210
210
|
indexer_config = MongoDBIndexerConfig(
|
|
211
211
|
database="non-existent-database", collection="non-existent-database"
|
|
@@ -218,7 +218,7 @@ def test_mongodb_indexer_precheck_fail_no_host():
|
|
|
218
218
|
indexer.precheck()
|
|
219
219
|
|
|
220
220
|
|
|
221
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
|
|
221
|
+
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
|
|
222
222
|
@requires_env("MONGODB_URI", "MONGODB_DATABASE")
|
|
223
223
|
def test_mongodb_indexer_precheck_fail_no_database():
|
|
224
224
|
env_data = get_env_data()
|
|
@@ -233,7 +233,7 @@ def test_mongodb_indexer_precheck_fail_no_database():
|
|
|
233
233
|
indexer.precheck()
|
|
234
234
|
|
|
235
235
|
|
|
236
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
|
|
236
|
+
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
|
|
237
237
|
@requires_env("MONGODB_URI", "MONGODB_DATABASE")
|
|
238
238
|
def test_mongodb_indexer_precheck_fail_no_collection():
|
|
239
239
|
env_data = get_env_data()
|
|
@@ -249,7 +249,7 @@ def test_mongodb_indexer_precheck_fail_no_collection():
|
|
|
249
249
|
|
|
250
250
|
|
|
251
251
|
@pytest.mark.asyncio
|
|
252
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
|
|
252
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
|
|
253
253
|
@requires_env("MONGODB_URI", "MONGODB_DATABASE")
|
|
254
254
|
async def test_mongodb_destination(
|
|
255
255
|
upload_file: Path,
|
|
@@ -289,7 +289,7 @@ async def test_mongodb_destination(
|
|
|
289
289
|
validate_collection_count(collection=destination_collection, expected_records=expected_records)
|
|
290
290
|
|
|
291
291
|
|
|
292
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
|
|
292
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
|
|
293
293
|
def test_mongodb_uploader_precheck_fail_no_host():
|
|
294
294
|
upload_config = MongoDBUploaderConfig(
|
|
295
295
|
database="database",
|
|
@@ -303,7 +303,7 @@ def test_mongodb_uploader_precheck_fail_no_host():
|
|
|
303
303
|
uploader.precheck()
|
|
304
304
|
|
|
305
305
|
|
|
306
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
|
|
306
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
|
|
307
307
|
@requires_env("MONGODB_URI", "MONGODB_DATABASE")
|
|
308
308
|
def test_mongodb_uploader_precheck_fail_no_database():
|
|
309
309
|
env_data = get_env_data()
|
|
@@ -319,7 +319,7 @@ def test_mongodb_uploader_precheck_fail_no_database():
|
|
|
319
319
|
uploader.precheck()
|
|
320
320
|
|
|
321
321
|
|
|
322
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
|
|
322
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
|
|
323
323
|
@requires_env("MONGODB_URI", "MONGODB_DATABASE")
|
|
324
324
|
def test_mongodb_uploader_precheck_fail_no_collection():
|
|
325
325
|
env_data = get_env_data()
|
|
@@ -9,7 +9,7 @@ from neo4j import AsyncGraphDatabase, Driver, GraphDatabase
|
|
|
9
9
|
from neo4j.exceptions import ServiceUnavailable
|
|
10
10
|
from pytest_check import check
|
|
11
11
|
|
|
12
|
-
from test.integration.connectors.utils.constants import DESTINATION_TAG
|
|
12
|
+
from test.integration.connectors.utils.constants import DESTINATION_TAG, GRAPH_DB_TAG
|
|
13
13
|
from test.integration.connectors.utils.docker import container_context
|
|
14
14
|
from unstructured_ingest.error import DestinationConnectionError
|
|
15
15
|
from unstructured_ingest.utils.chunking import elements_from_base64_gzipped_json
|
|
@@ -51,7 +51,7 @@ def _neo4j_server():
|
|
|
51
51
|
|
|
52
52
|
|
|
53
53
|
@pytest.mark.asyncio
|
|
54
|
-
@pytest.mark.tags(DESTINATION_TAG, CONNECTOR_TYPE)
|
|
54
|
+
@pytest.mark.tags(DESTINATION_TAG, CONNECTOR_TYPE, GRAPH_DB_TAG)
|
|
55
55
|
async def test_neo4j_destination(upload_file: Path, tmp_path: Path):
|
|
56
56
|
stager = Neo4jUploadStager()
|
|
57
57
|
uploader = Neo4jUploader(
|
|
@@ -104,7 +104,7 @@ async def test_neo4j_destination(upload_file: Path, tmp_path: Path):
|
|
|
104
104
|
await validate_uploaded_graph(modified_upload_file)
|
|
105
105
|
|
|
106
106
|
|
|
107
|
-
@pytest.mark.tags(DESTINATION_TAG, CONNECTOR_TYPE)
|
|
107
|
+
@pytest.mark.tags(DESTINATION_TAG, CONNECTOR_TYPE, GRAPH_DB_TAG)
|
|
108
108
|
class TestPrecheck:
|
|
109
109
|
@pytest.fixture
|
|
110
110
|
def configured_uploader(self) -> Neo4jUploader:
|
|
@@ -199,13 +199,15 @@ async def validate_uploaded_graph(upload_file: Path):
|
|
|
199
199
|
try:
|
|
200
200
|
nodes_count = len((await driver.execute_query("MATCH (n) RETURN n"))[0])
|
|
201
201
|
chunk_nodes_count = len(
|
|
202
|
-
(await driver.execute_query(f"MATCH (n: {Label.CHUNK}) RETURN n"))[0]
|
|
202
|
+
(await driver.execute_query(f"MATCH (n: {Label.CHUNK.value}) RETURN n"))[0]
|
|
203
203
|
)
|
|
204
204
|
document_nodes_count = len(
|
|
205
|
-
(await driver.execute_query(f"MATCH (n: {Label.DOCUMENT}) RETURN n"))[0]
|
|
205
|
+
(await driver.execute_query(f"MATCH (n: {Label.DOCUMENT.value}) RETURN n"))[0]
|
|
206
206
|
)
|
|
207
207
|
element_nodes_count = len(
|
|
208
|
-
(await driver.execute_query(f"MATCH (n: {Label.UNSTRUCTURED_ELEMENT}) RETURN n"))[
|
|
208
|
+
(await driver.execute_query(f"MATCH (n: {Label.UNSTRUCTURED_ELEMENT.value}) RETURN n"))[
|
|
209
|
+
0
|
|
210
|
+
]
|
|
209
211
|
)
|
|
210
212
|
with check:
|
|
211
213
|
assert nodes_count == expected_nodes_count
|
|
@@ -217,12 +219,18 @@ async def validate_uploaded_graph(upload_file: Path):
|
|
|
217
219
|
assert element_nodes_count == expected_element_count
|
|
218
220
|
|
|
219
221
|
records, _, _ = await driver.execute_query(
|
|
220
|
-
f"
|
|
222
|
+
f"""
|
|
223
|
+
MATCH ()-[r:{Relationship.PART_OF_DOCUMENT.value}]->(:{Label.DOCUMENT.value})
|
|
224
|
+
RETURN r
|
|
225
|
+
"""
|
|
221
226
|
)
|
|
222
227
|
part_of_document_count = len(records)
|
|
223
228
|
|
|
224
229
|
records, _, _ = await driver.execute_query(
|
|
225
|
-
f"
|
|
230
|
+
f"""
|
|
231
|
+
MATCH (:{Label.CHUNK.value})-[r:{Relationship.NEXT_CHUNK.value}]->(:{Label.CHUNK.value})
|
|
232
|
+
RETURN r
|
|
233
|
+
"""
|
|
226
234
|
)
|
|
227
235
|
next_chunk_count = len(records)
|
|
228
236
|
|
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
import os
|
|
2
2
|
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
|
|
3
6
|
from test.integration.connectors.utils.validation.source import (
|
|
4
7
|
SourceValidationConfigs,
|
|
5
8
|
get_all_file_data,
|
|
@@ -8,6 +11,7 @@ from test.integration.connectors.utils.validation.source import (
|
|
|
8
11
|
)
|
|
9
12
|
from unstructured_ingest.v2.interfaces import Downloader, Indexer
|
|
10
13
|
from unstructured_ingest.v2.processes.connectors.notion.connector import (
|
|
14
|
+
CONNECTOR_TYPE,
|
|
11
15
|
NotionAccessConfig,
|
|
12
16
|
NotionConnectionConfig,
|
|
13
17
|
NotionDownloader,
|
|
@@ -17,6 +21,7 @@ from unstructured_ingest.v2.processes.connectors.notion.connector import (
|
|
|
17
21
|
)
|
|
18
22
|
|
|
19
23
|
|
|
24
|
+
@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
|
|
20
25
|
def test_notion_source_database(temp_dir):
|
|
21
26
|
# Retrieve environment variables
|
|
22
27
|
notion_api_key = os.environ["NOTION_API_KEY"]
|
|
@@ -55,6 +60,7 @@ def test_notion_source_database(temp_dir):
|
|
|
55
60
|
)
|
|
56
61
|
|
|
57
62
|
|
|
63
|
+
@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
|
|
58
64
|
def test_notion_source_page(temp_dir):
|
|
59
65
|
# Retrieve environment variables
|
|
60
66
|
notion_api_key = os.environ["NOTION_API_KEY"]
|
|
@@ -93,6 +99,7 @@ def test_notion_source_page(temp_dir):
|
|
|
93
99
|
)
|
|
94
100
|
|
|
95
101
|
|
|
102
|
+
@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
|
|
96
103
|
def source_connector_validation(
|
|
97
104
|
indexer: Indexer,
|
|
98
105
|
downloader: Downloader,
|
|
@@ -5,9 +5,7 @@ from pathlib import Path
|
|
|
5
5
|
import pytest
|
|
6
6
|
from office365.graph_client import GraphClient
|
|
7
7
|
|
|
8
|
-
from test.integration.connectors.utils.constants import (
|
|
9
|
-
DESTINATION_TAG,
|
|
10
|
-
)
|
|
8
|
+
from test.integration.connectors.utils.constants import BLOB_STORAGE_TAG, DESTINATION_TAG
|
|
11
9
|
from test.integration.utils import requires_env
|
|
12
10
|
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
13
11
|
from unstructured_ingest.v2.processes.connectors.onedrive import (
|
|
@@ -67,7 +65,7 @@ def get_connection_config():
|
|
|
67
65
|
return connection_config
|
|
68
66
|
|
|
69
67
|
|
|
70
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
|
|
68
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, BLOB_STORAGE_TAG)
|
|
71
69
|
@requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
|
|
72
70
|
@pytest.mark.xfail(
|
|
73
71
|
reason="Issues with test setup on the provider side."
|
|
@@ -12,9 +12,7 @@ from _pytest.fixtures import TopRequest
|
|
|
12
12
|
from pinecone import Pinecone, ServerlessSpec
|
|
13
13
|
from pinecone.core.openapi.shared.exceptions import NotFoundException
|
|
14
14
|
|
|
15
|
-
from test.integration.connectors.utils.constants import (
|
|
16
|
-
DESTINATION_TAG,
|
|
17
|
-
)
|
|
15
|
+
from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
|
|
18
16
|
from test.integration.connectors.utils.validation.destination import (
|
|
19
17
|
StagerValidationConfigs,
|
|
20
18
|
stager_validation,
|
|
@@ -109,11 +107,15 @@ def pinecone_index() -> Generator[str, None, None]:
|
|
|
109
107
|
|
|
110
108
|
|
|
111
109
|
def validate_pinecone_index(
|
|
112
|
-
index_name: str,
|
|
110
|
+
index_name: str,
|
|
111
|
+
expected_num_of_vectors: int,
|
|
112
|
+
retries=30,
|
|
113
|
+
interval=1,
|
|
114
|
+
namespace: str = "default",
|
|
113
115
|
) -> None:
|
|
114
116
|
# Because there's a delay for the index to catch up to the recent writes, add in a retry
|
|
115
117
|
pinecone = Pinecone(api_key=get_api_key())
|
|
116
|
-
index = pinecone.Index(name=index_name)
|
|
118
|
+
index = pinecone.Index(name=index_name, namespace=namespace)
|
|
117
119
|
vector_count = -1
|
|
118
120
|
for i in range(retries):
|
|
119
121
|
index_stats = index.describe_index_stats()
|
|
@@ -133,13 +135,15 @@ def validate_pinecone_index(
|
|
|
133
135
|
|
|
134
136
|
@requires_env(API_KEY)
|
|
135
137
|
@pytest.mark.asyncio
|
|
136
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
|
|
138
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
|
|
137
139
|
async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp_dir: Path):
|
|
140
|
+
|
|
138
141
|
file_data = FileData(
|
|
139
142
|
source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
|
|
140
143
|
connector_type=CONNECTOR_TYPE,
|
|
141
144
|
identifier="pinecone_mock_id",
|
|
142
145
|
)
|
|
146
|
+
|
|
143
147
|
connection_config = PineconeConnectionConfig(
|
|
144
148
|
index_name=pinecone_index,
|
|
145
149
|
access_config=PineconeAccessConfig(api_key=get_api_key()),
|
|
@@ -176,7 +180,7 @@ async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp
|
|
|
176
180
|
|
|
177
181
|
@requires_env(API_KEY)
|
|
178
182
|
@pytest.mark.asyncio
|
|
179
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
|
|
183
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
|
|
180
184
|
@pytest.mark.skip(reason="TODO: get this to work")
|
|
181
185
|
async def test_pinecone_destination_large_index(
|
|
182
186
|
pinecone_index: str, upload_file: Path, temp_dir: Path
|
|
@@ -227,7 +231,67 @@ async def test_pinecone_destination_large_index(
|
|
|
227
231
|
|
|
228
232
|
|
|
229
233
|
@requires_env(API_KEY)
|
|
230
|
-
@pytest.mark.
|
|
234
|
+
@pytest.mark.asyncio
|
|
235
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
|
|
236
|
+
async def test_pinecone_destination_namespace(
|
|
237
|
+
pinecone_index: str, upload_file: Path, temp_dir: Path
|
|
238
|
+
):
|
|
239
|
+
"""
|
|
240
|
+
tests namespace functionality of destination connector.
|
|
241
|
+
"""
|
|
242
|
+
|
|
243
|
+
# creates a file data structure.
|
|
244
|
+
file_data = FileData(
|
|
245
|
+
source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
|
|
246
|
+
connector_type=CONNECTOR_TYPE,
|
|
247
|
+
identifier="pinecone_mock_id",
|
|
248
|
+
)
|
|
249
|
+
|
|
250
|
+
connection_config = PineconeConnectionConfig(
|
|
251
|
+
index_name=pinecone_index,
|
|
252
|
+
access_config=PineconeAccessConfig(api_key=get_api_key()),
|
|
253
|
+
)
|
|
254
|
+
|
|
255
|
+
stager_config = PineconeUploadStagerConfig()
|
|
256
|
+
|
|
257
|
+
stager = PineconeUploadStager(upload_stager_config=stager_config)
|
|
258
|
+
new_upload_file = stager.run(
|
|
259
|
+
elements_filepath=upload_file,
|
|
260
|
+
output_dir=temp_dir,
|
|
261
|
+
output_filename=upload_file.name,
|
|
262
|
+
file_data=file_data,
|
|
263
|
+
)
|
|
264
|
+
|
|
265
|
+
# here add namespace defintion
|
|
266
|
+
upload_config = PineconeUploaderConfig()
|
|
267
|
+
namespace_test_name = "user-1"
|
|
268
|
+
upload_config.namespace = namespace_test_name
|
|
269
|
+
uploader = PineconeUploader(connection_config=connection_config, upload_config=upload_config)
|
|
270
|
+
uploader.precheck()
|
|
271
|
+
|
|
272
|
+
uploader.run(path=new_upload_file, file_data=file_data)
|
|
273
|
+
with new_upload_file.open() as f:
|
|
274
|
+
staged_content = json.load(f)
|
|
275
|
+
expected_num_of_vectors = len(staged_content)
|
|
276
|
+
logger.info("validating first upload")
|
|
277
|
+
validate_pinecone_index(
|
|
278
|
+
index_name=pinecone_index,
|
|
279
|
+
expected_num_of_vectors=expected_num_of_vectors,
|
|
280
|
+
namespace=namespace_test_name,
|
|
281
|
+
)
|
|
282
|
+
|
|
283
|
+
# Rerun uploader and make sure no duplicates exist
|
|
284
|
+
uploader.run(path=new_upload_file, file_data=file_data)
|
|
285
|
+
logger.info("validating second upload")
|
|
286
|
+
validate_pinecone_index(
|
|
287
|
+
index_name=pinecone_index,
|
|
288
|
+
expected_num_of_vectors=expected_num_of_vectors,
|
|
289
|
+
namespace=namespace_test_name,
|
|
290
|
+
)
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
@requires_env(API_KEY)
|
|
294
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
|
|
231
295
|
def test_large_metadata(pinecone_index: str, tmp_path: Path, upload_file: Path):
|
|
232
296
|
stager = PineconeUploadStager()
|
|
233
297
|
uploader = PineconeUploader(
|
|
@@ -272,6 +336,7 @@ def test_large_metadata(pinecone_index: str, tmp_path: Path, upload_file: Path):
|
|
|
272
336
|
validate_pinecone_index(pinecone_index, 1, interval=5)
|
|
273
337
|
|
|
274
338
|
|
|
339
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
|
|
275
340
|
@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
|
|
276
341
|
def test_pinecone_stager(
|
|
277
342
|
request: TopRequest,
|
|
@@ -9,7 +9,7 @@ import pytest
|
|
|
9
9
|
from _pytest.fixtures import TopRequest
|
|
10
10
|
from qdrant_client import AsyncQdrantClient
|
|
11
11
|
|
|
12
|
-
from test.integration.connectors.utils.constants import DESTINATION_TAG
|
|
12
|
+
from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
|
|
13
13
|
from test.integration.connectors.utils.docker import container_context
|
|
14
14
|
from test.integration.connectors.utils.validation.destination import (
|
|
15
15
|
StagerValidationConfigs,
|
|
@@ -75,7 +75,7 @@ async def validate_upload(client: AsyncQdrantClient, upload_file: Path):
|
|
|
75
75
|
|
|
76
76
|
|
|
77
77
|
@pytest.mark.asyncio
|
|
78
|
-
@pytest.mark.tags(LOCAL_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant")
|
|
78
|
+
@pytest.mark.tags(LOCAL_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant", VECTOR_DB_TAG)
|
|
79
79
|
async def test_qdrant_destination_local(upload_file: Path, tmp_path: Path):
|
|
80
80
|
connection_kwargs = {"path": str(tmp_path / "qdrant")}
|
|
81
81
|
async with qdrant_client(connection_kwargs) as client:
|
|
@@ -117,7 +117,7 @@ def docker_context():
|
|
|
117
117
|
|
|
118
118
|
|
|
119
119
|
@pytest.mark.asyncio
|
|
120
|
-
@pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant")
|
|
120
|
+
@pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant", VECTOR_DB_TAG)
|
|
121
121
|
async def test_qdrant_destination_server(upload_file: Path, tmp_path: Path, docker_context):
|
|
122
122
|
connection_kwargs = {"location": "http://localhost:6333"}
|
|
123
123
|
async with qdrant_client(connection_kwargs) as client:
|
|
@@ -153,7 +153,7 @@ async def test_qdrant_destination_server(upload_file: Path, tmp_path: Path, dock
|
|
|
153
153
|
|
|
154
154
|
|
|
155
155
|
@pytest.mark.asyncio
|
|
156
|
-
@pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant")
|
|
156
|
+
@pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant", VECTOR_DB_TAG)
|
|
157
157
|
@requires_env("QDRANT_API_KEY", "QDRANT_SERVER_URL")
|
|
158
158
|
async def test_qdrant_destination_cloud(upload_file: Path, tmp_path: Path):
|
|
159
159
|
server_url = os.environ["QDRANT_SERVER_URL"]
|
|
@@ -197,6 +197,7 @@ async def test_qdrant_destination_cloud(upload_file: Path, tmp_path: Path):
|
|
|
197
197
|
await validate_upload(client=client, upload_file=upload_file)
|
|
198
198
|
|
|
199
199
|
|
|
200
|
+
@pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant", VECTOR_DB_TAG)
|
|
200
201
|
@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
|
|
201
202
|
def test_qdrant_stager(
|
|
202
203
|
request: TopRequest,
|
|
@@ -9,7 +9,7 @@ import pytest
|
|
|
9
9
|
from redis import exceptions as redis_exceptions
|
|
10
10
|
from redis.asyncio import Redis, from_url
|
|
11
11
|
|
|
12
|
-
from test.integration.connectors.utils.constants import DESTINATION_TAG
|
|
12
|
+
from test.integration.connectors.utils.constants import DESTINATION_TAG, NOSQL_TAG
|
|
13
13
|
from test.integration.utils import requires_env
|
|
14
14
|
from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
|
|
15
15
|
from unstructured_ingest.v2.processes.connectors.redisdb import (
|
|
@@ -96,7 +96,7 @@ async def redis_destination_test(
|
|
|
96
96
|
|
|
97
97
|
|
|
98
98
|
@pytest.mark.asyncio
|
|
99
|
-
@pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG)
|
|
99
|
+
@pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
|
|
100
100
|
@requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
|
|
101
101
|
async def test_redis_destination_azure_with_password(upload_file: Path, tmp_path: Path):
|
|
102
102
|
connection_kwargs = {
|
|
@@ -110,7 +110,7 @@ async def test_redis_destination_azure_with_password(upload_file: Path, tmp_path
|
|
|
110
110
|
|
|
111
111
|
|
|
112
112
|
@pytest.mark.asyncio
|
|
113
|
-
@pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG, "redis")
|
|
113
|
+
@pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG, "redis", NOSQL_TAG)
|
|
114
114
|
@requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
|
|
115
115
|
async def test_redis_destination_azure_with_uri(upload_file: Path, tmp_path: Path):
|
|
116
116
|
connection_kwargs = {}
|