unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- test/integration/chunkers/test_chunkers.py +0 -11
- test/integration/connectors/conftest.py +11 -1
- test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +51 -44
- test/integration/connectors/duckdb/test_motherduck.py +37 -48
- test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
- test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
- test/integration/connectors/sql/test_postgres.py +103 -92
- test/integration/connectors/sql/test_singlestore.py +112 -100
- test/integration/connectors/sql/test_snowflake.py +142 -117
- test/integration/connectors/sql/test_sqlite.py +87 -76
- test/integration/connectors/test_astradb.py +62 -1
- test/integration/connectors/test_azure_ai_search.py +25 -3
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +4 -4
- test/integration/connectors/test_delta_table.py +1 -0
- test/integration/connectors/test_kafka.py +6 -6
- test/integration/connectors/test_milvus.py +21 -0
- test/integration/connectors/test_mongodb.py +7 -4
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_pinecone.py +25 -1
- test/integration/connectors/test_qdrant.py +25 -2
- test/integration/connectors/test_s3.py +9 -6
- test/integration/connectors/utils/docker.py +6 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +88 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/{validation.py → validation/source.py} +42 -98
- test/integration/connectors/utils/validation/utils.py +36 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/chunking.py +11 -0
- unstructured_ingest/utils/data_prep.py +36 -0
- unstructured_ingest/v2/interfaces/__init__.py +3 -1
- unstructured_ingest/v2/interfaces/file_data.py +58 -14
- unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
- unstructured_ingest/v2/interfaces/uploader.py +11 -2
- unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
- unstructured_ingest/v2/pipeline/steps/download.py +5 -4
- unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
- unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
- unstructured_ingest/v2/pipeline/steps/index.py +4 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +5 -3
- unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
- unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +43 -63
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
- unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
- unstructured_ingest/v2/processes/connectors/couchbase.py +92 -93
- unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +46 -75
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
- unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
- unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
- unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
- unstructured_ingest/v2/processes/connectors/local.py +13 -2
- unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
- unstructured_ingest/v2/processes/connectors/mongodb.py +99 -108
- unstructured_ingest/v2/processes/connectors/neo4j.py +383 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
- unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/sql.py +72 -66
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +20 -15
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +87 -79
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0
```diff
--- /dev/null
+++ b/test/integration/connectors/test_chroma.py
@@ -0,0 +1,120 @@
+import json
+from pathlib import Path
+
+import chromadb
+import pytest
+from _pytest.fixtures import TopRequest
+
+from test.integration.connectors.utils.constants import (
+    DESTINATION_TAG,
+)
+from test.integration.connectors.utils.docker import HealthCheck, container_context
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.chroma import (
+    CONNECTOR_TYPE,
+    ChromaConnectionConfig,
+    ChromaUploader,
+    ChromaUploaderConfig,
+    ChromaUploadStager,
+    ChromaUploadStagerConfig,
+)
+
+
+@pytest.fixture
+def chroma_instance():
+    with container_context(
+        image="chromadb/chroma:latest",
+        ports={8000: 8000},
+        name="chroma_int_test",
+        healthcheck=HealthCheck(
+            interval=5,
+            timeout=10,
+            retries=3,
+            test="timeout 10s bash -c ':> /dev/tcp/127.0.0.1/8000' || exit 1",
+        ),
+    ) as ctx:
+        yield ctx
+
+
+def validate_collection(collection_name: str, num_embeddings: int):
+    print(f"Checking contents of Chroma collection: {collection_name}")
+
+    chroma_client = chromadb.HttpClient(
+        host="localhost",
+        port="8000",
+        tenant="default_tenant",
+        database="default_database",
+    )
+
+    collection = chroma_client.get_or_create_collection(name=collection_name)
+
+    number_of_embeddings = collection.count()
+    expected_embeddings = num_embeddings
+    print(
+        f"# of embeddings in collection vs expected: {number_of_embeddings}/{expected_embeddings}"
+    )
+
+    assert number_of_embeddings == expected_embeddings, (
+        f"Number of rows in generated table ({number_of_embeddings}) "
+        f"doesn't match expected value: {expected_embeddings}"
+    )
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+def test_chroma_destination(
+    upload_file: Path,
+    chroma_instance,
+    tmp_path: Path,
+):
+    collection_name = "test_collection"
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=CONNECTOR_TYPE,
+        identifier="mock file data",
+    )
+    stager = ChromaUploadStager(upload_stager_config=ChromaUploadStagerConfig())
+
+    uploader = ChromaUploader(
+        connection_config=ChromaConnectionConfig(
+            host="localhost",
+            port=8000,
+            tenant="default_tenant",
+            database="default_database",
+        ),
+        upload_config=ChromaUploaderConfig(collection_name=collection_name),
+    )
+    staged_filepath = stager.run(
+        elements_filepath=upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=upload_file.name,
+    )
+    uploader.precheck()
+    uploader.run(path=staged_filepath, file_data=file_data)
+
+    # Run validation
+    with staged_filepath.open() as f:
+        staged_elements = json.load(f)
+    expected_count = len(staged_elements)
+    validate_collection(collection_name=collection_name, num_embeddings=expected_count)
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "stager")
+def test_chroma_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = ChromaUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )
```
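The `StagerValidationConfigs`/`stager_validation` pair used here (and in the milvus, pinecone, and qdrant tests below) lives in the new `test/integration/connectors/utils/validation/destination.py` (+88 lines), whose body this diff does not show. A minimal sketch of what such a helper plausibly does, inferred only from the call sites — the internals below are assumptions, not the released code:

```python
import json
from dataclasses import dataclass
from pathlib import Path


@dataclass
class StagerValidationConfigs:
    test_id: str
    expected_count: int


def stager_validation(configs: StagerValidationConfigs, input_file: Path, stager, tmp_dir: Path):
    # Stage the fixture file, then check that the record count survived
    # staging. (Sketch only: the real helper may also snapshot the output.)
    staged_path = stager.run(
        elements_filepath=input_file,
        file_data=None,  # real call sites construct a FileData; elided in this sketch
        output_dir=tmp_dir,
        output_filename=input_file.name,
    )
    with staged_path.open() as f:
        if staged_path.suffix == ".ndjson":
            records = [json.loads(line) for line in f if line.strip()]
        else:
            records = json.load(f)
    assert len(records) == configs.expected_count, configs.test_id
```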
```diff
--- a/test/integration/connectors/test_confluence.py
+++ b/test/integration/connectors/test_confluence.py
@@ -5,8 +5,8 @@ import pytest
 from test.integration.connectors.utils.constants import (
     SOURCE_TAG,
 )
-from test.integration.connectors.utils.validation import (
-    ValidationConfigs,
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
     source_connector_validation,
 )
 from test.integration.utils import requires_env
@@ -60,7 +60,7 @@ async def test_confluence_source(temp_dir):
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=ValidationConfigs(
+        configs=SourceValidationConfigs(
             test_id="confluence",
             expected_num_files=11,
             validate_downloaded_files=True,
@@ -107,7 +107,7 @@ async def test_confluence_source_large(temp_dir):
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=ValidationConfigs(
+        configs=SourceValidationConfigs(
             test_id="confluence_large", expected_num_files=250, validate_file_data=False
         ),
     )
```
```diff
--- a/test/integration/connectors/test_kafka.py
+++ b/test/integration/connectors/test_kafka.py
@@ -14,8 +14,8 @@ from test.integration.connectors.utils.constants import (
     env_setup_path,
 )
 from test.integration.connectors.utils.docker_compose import docker_compose_context
-from test.integration.connectors.utils.validation import (
-    ValidationConfigs,
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
     source_connector_validation,
 )
 from test.integration.utils import requires_env
@@ -121,8 +121,8 @@ async def test_kafka_source_local(kafka_seed_topic: str):
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=ValidationConfigs(
-            test_id="kafka", expected_num_files=5, validate_downloaded_files=True
+        configs=SourceValidationConfigs(
+            test_id="kafka-local", expected_num_files=5, validate_downloaded_files=True
         ),
     )
 
@@ -203,8 +203,8 @@ async def test_kafka_source_cloud(kafka_seed_topic_cloud: int):
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=ValidationConfigs(
-            test_id="kafka",
+        configs=SourceValidationConfigs(
+            test_id="kafka-cloud",
             exclude_fields_extend=["connector_type"],
             expected_num_files=expected_messages,
             validate_downloaded_files=True,
```
```diff
--- a/test/integration/connectors/test_milvus.py
+++ b/test/integration/connectors/test_milvus.py
@@ -4,6 +4,7 @@ from pathlib import Path
 
 import docker
 import pytest
+from _pytest.fixtures import TopRequest
 from pymilvus import (
     CollectionSchema,
     DataType,
@@ -15,6 +16,10 @@ from pymilvus.milvus_client import IndexParams
 from test.integration.connectors.utils.constants import DESTINATION_TAG, env_setup_path
 from test.integration.connectors.utils.docker import healthcheck_wait
 from test.integration.connectors.utils.docker_compose import docker_compose_context
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.milvus import (
@@ -167,3 +172,19 @@ def test_precheck_fails_on_nonexistent_collection(collection: str):
         match=f"Collection '{NONEXISTENT_COLLECTION_NAME}' does not exist",
     ):
         uploader.precheck()
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_milvus_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = MilvusUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )
```
```diff
--- a/test/integration/connectors/test_mongodb.py
+++ b/test/integration/connectors/test_mongodb.py
@@ -14,8 +14,8 @@ from pymongo.mongo_client import MongoClient
 from pymongo.operations import SearchIndexModel
 
 from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
-from test.integration.connectors.utils.validation import (
-    ValidationConfigs,
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
     source_connector_validation,
 )
 from test.integration.utils import requires_env
@@ -196,8 +196,11 @@ async def test_mongodb_source(temp_dir: Path):
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=ValidationConfigs(
-            test_id=CONNECTOR_TYPE,
+        configs=SourceValidationConfigs(
+            test_id=CONNECTOR_TYPE,
+            expected_num_files=4,
+            validate_downloaded_files=True,
+            expected_number_indexed_file_data=1,
         ),
     )
 
```
```diff
--- /dev/null
+++ b/test/integration/connectors/test_neo4j.py
@@ -0,0 +1,236 @@
+import json
+import time
+import uuid
+from datetime import datetime
+from pathlib import Path
+
+import pytest
+from neo4j import AsyncGraphDatabase, Driver, GraphDatabase
+from neo4j.exceptions import ServiceUnavailable
+from pytest_check import check
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG
+from test.integration.connectors.utils.docker import container_context
+from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.chunking import elements_from_base64_gzipped_json
+from unstructured_ingest.v2.interfaces.file_data import (
+    FileData,
+    FileDataSourceMetadata,
+    SourceIdentifiers,
+)
+from unstructured_ingest.v2.processes.connectors.neo4j import (
+    CONNECTOR_TYPE,
+    Label,
+    Neo4jAccessConfig,
+    Neo4jConnectionConfig,
+    Neo4jUploader,
+    Neo4jUploaderConfig,
+    Neo4jUploadStager,
+    Relationship,
+)
+
+USERNAME = "neo4j"
+PASSWORD = "password"
+URI = "neo4j://localhost:7687"
+DATABASE = "neo4j"
+
+EXPECTED_DOCUMENT_COUNT = 1
+
+
+# NOTE: Precheck tests are read-only so we utilize the same container for all tests.
+# If new tests require clean neo4j container, this fixture's scope should be adjusted.
+@pytest.fixture(autouse=True, scope="module")
+def _neo4j_server():
+    with container_context(
+        image="neo4j:latest", environment={"NEO4J_AUTH": "neo4j/password"}, ports={"7687": "7687"}
+    ):
+        driver = GraphDatabase.driver(uri=URI, auth=(USERNAME, PASSWORD))
+        wait_for_connection(driver)
+        driver.close()
+        yield
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(DESTINATION_TAG, CONNECTOR_TYPE)
+async def test_neo4j_destination(upload_file: Path, tmp_path: Path):
+    stager = Neo4jUploadStager()
+    uploader = Neo4jUploader(
+        connection_config=Neo4jConnectionConfig(
+            access_config=Neo4jAccessConfig(password=PASSWORD),  # type: ignore
+            username=USERNAME,
+            uri=URI,
+            database=DATABASE,
+        ),
+        upload_config=Neo4jUploaderConfig(),
+    )
+    file_data = FileData(
+        identifier="mock-file-data",
+        connector_type="neo4j",
+        source_identifiers=SourceIdentifiers(
+            filename=upload_file.name,
+            fullpath=upload_file.name,
+        ),
+        metadata=FileDataSourceMetadata(
+            date_created=str(datetime(2022, 1, 1).timestamp()),
+            date_modified=str(datetime(2022, 1, 2).timestamp()),
+        ),
+    )
+    staged_filepath = stager.run(
+        upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=upload_file.name,
+    )
+
+    await uploader.run_async(staged_filepath, file_data)
+    await validate_uploaded_graph(upload_file)
+
+    modified_upload_file = tmp_path / f"modified-{upload_file.name}"
+    with open(upload_file) as file:
+        elements = json.load(file)
+    for element in elements:
+        element["element_id"] = str(uuid.uuid4())
+
+    with open(modified_upload_file, "w") as file:
+        json.dump(elements, file, indent=4)
+
+    staged_filepath = stager.run(
+        modified_upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=modified_upload_file.name,
+    )
+    await uploader.run_async(staged_filepath, file_data)
+    await validate_uploaded_graph(modified_upload_file)
+
+
+@pytest.mark.tags(DESTINATION_TAG, CONNECTOR_TYPE)
+class TestPrecheck:
+    @pytest.fixture
+    def configured_uploader(self) -> Neo4jUploader:
+        return Neo4jUploader(
+            connection_config=Neo4jConnectionConfig(
+                access_config=Neo4jAccessConfig(password=PASSWORD),  # type: ignore
+                username=USERNAME,
+                uri=URI,
+                database=DATABASE,
+            ),
+            upload_config=Neo4jUploaderConfig(),
+        )
+
+    def test_succeeds(self, configured_uploader: Neo4jUploader):
+        configured_uploader.precheck()
+
+    def test_fails_on_invalid_password(self, configured_uploader: Neo4jUploader):
+        configured_uploader.connection_config.access_config.get_secret_value().password = (
+            "invalid-password"
+        )
+        with pytest.raises(
+            DestinationConnectionError,
+            match="{code: Neo.ClientError.Security.Unauthorized}",
+        ):
+            configured_uploader.precheck()
+
+    def test_fails_on_invalid_username(self, configured_uploader: Neo4jUploader):
+        configured_uploader.connection_config.username = "invalid-username"
+        with pytest.raises(
+            DestinationConnectionError, match="{code: Neo.ClientError.Security.Unauthorized}"
+        ):
+            configured_uploader.precheck()
+
+    @pytest.mark.parametrize(
+        ("uri", "expected_error_msg"),
+        [
+            ("neo4j://localhst:7687", "Cannot resolve address"),
+            ("neo4j://localhost:7777", "Unable to retrieve routing information"),
+        ],
+    )
+    def test_fails_on_invalid_uri(
+        self, configured_uploader: Neo4jUploader, uri: str, expected_error_msg: str
+    ):
+        configured_uploader.connection_config.uri = uri
+        with pytest.raises(DestinationConnectionError, match=expected_error_msg):
+            configured_uploader.precheck()
+
+    def test_fails_on_invalid_database(self, configured_uploader: Neo4jUploader):
+        configured_uploader.connection_config.database = "invalid-database"
+        with pytest.raises(
+            DestinationConnectionError, match="{code: Neo.ClientError.Database.DatabaseNotFound}"
+        ):
+            configured_uploader.precheck()
+
+
+def wait_for_connection(driver: Driver, retries: int = 10, delay_seconds: int = 2):
+    attempts = 0
+    while attempts < retries:
+        try:
+            driver.verify_connectivity()
+            return
+        except ServiceUnavailable:
+            time.sleep(delay_seconds)
+            attempts += 1
+
+    pytest.fail("Failed to connect with Neo4j server.")
+
+
+async def validate_uploaded_graph(upload_file: Path):
+    with open(upload_file) as file:
+        elements = json.load(file)
+
+    for element in elements:
+        if "orig_elements" in element["metadata"]:
+            element["metadata"]["orig_elements"] = elements_from_base64_gzipped_json(
+                element["metadata"]["orig_elements"]
+            )
+        else:
+            element["metadata"]["orig_elements"] = []
+
+    expected_chunks_count = len(elements)
+    expected_element_count = len(
+        {
+            origin_element["element_id"]
+            for chunk in elements
+            for origin_element in chunk["metadata"]["orig_elements"]
+        }
+    )
+    expected_nodes_count = expected_chunks_count + expected_element_count + EXPECTED_DOCUMENT_COUNT
+
+    driver = AsyncGraphDatabase.driver(uri=URI, auth=(USERNAME, PASSWORD))
+    try:
+        nodes_count = len((await driver.execute_query("MATCH (n) RETURN n"))[0])
+        chunk_nodes_count = len(
+            (await driver.execute_query(f"MATCH (n: {Label.CHUNK}) RETURN n"))[0]
+        )
+        document_nodes_count = len(
+            (await driver.execute_query(f"MATCH (n: {Label.DOCUMENT}) RETURN n"))[0]
+        )
+        element_nodes_count = len(
+            (await driver.execute_query(f"MATCH (n: {Label.UNSTRUCTURED_ELEMENT}) RETURN n"))[0]
+        )
+        with check:
+            assert nodes_count == expected_nodes_count
+        with check:
+            assert document_nodes_count == EXPECTED_DOCUMENT_COUNT
+        with check:
+            assert chunk_nodes_count == expected_chunks_count
+        with check:
+            assert element_nodes_count == expected_element_count
+
+        records, _, _ = await driver.execute_query(
+            f"MATCH ()-[r:{Relationship.PART_OF_DOCUMENT}]->(:{Label.DOCUMENT}) RETURN r"
+        )
+        part_of_document_count = len(records)
+
+        records, _, _ = await driver.execute_query(
+            f"MATCH (:{Label.CHUNK})-[r:{Relationship.NEXT_CHUNK}]->(:{Label.CHUNK}) RETURN r"
+        )
+        next_chunk_count = len(records)
+
+        if not check.any_failures():
+            with check:
+                assert part_of_document_count == expected_chunks_count + expected_element_count
+            with check:
+                assert next_chunk_count == expected_chunks_count - 1
+
+    finally:
+        await driver.close()
```
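`validate_uploaded_graph` expects one `DOCUMENT` node, one `CHUNK` node per chunk, and one `UNSTRUCTURED_ELEMENT` node per distinct original element (for instance, 10 chunks drawn from 22 distinct elements give 10 + 22 + 1 = 33 nodes). It decodes those original elements with `elements_from_base64_gzipped_json` from the new `unstructured_ingest/utils/chunking.py` (+11 lines), which this diff does not show. Unstructured chunks conventionally carry `metadata.orig_elements` as base64-encoded, gzip-compressed JSON, so a plausible reconstruction of the helper looks like this (a sketch, not the released code):

```python
import base64
import gzip
import json


def elements_from_base64_gzipped_json(raw: str) -> list[dict]:
    # metadata.orig_elements is assumed to hold base64(gzip(JSON array of
    # element dicts)); reverse both encodings to recover the elements.
    decompressed = gzip.decompress(base64.b64decode(raw))
    return json.loads(decompressed)
```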
```diff
--- a/test/integration/connectors/test_pinecone.py
+++ b/test/integration/connectors/test_pinecone.py
@@ -8,12 +8,17 @@ from typing import Generator
 from uuid import uuid4
 
 import pytest
+from _pytest.fixtures import TopRequest
 from pinecone import Pinecone, ServerlessSpec
 from pinecone.core.openapi.shared.exceptions import NotFoundException
 
 from test.integration.connectors.utils.constants import (
     DESTINATION_TAG,
 )
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
 from test.integration.utils import requires_env
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
@@ -251,7 +256,10 @@ def test_large_metadata(pinecone_index: str, tmp_path: Path, upload_file: Path):
     identifier="mock-file-data",
     )
     staged_file = stager.run(
-        …
+        elements_filepath=large_metadata_upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=large_metadata_upload_file.name,
     )
     try:
         uploader.run(staged_file, file_data)
@@ -262,3 +270,19 @@ def test_large_metadata(pinecone_index: str, tmp_path: Path, upload_file: Path):
         raise pytest.fail("Upload request failed due to metadata exceeding limits.")
 
     validate_pinecone_index(pinecone_index, 1, interval=5)
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_pinecone_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = PineconeUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )
```
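The new stager tests (chroma, milvus, and pinecone above, qdrant below) are parametrized over an `upload_file_ndjson` fixture alongside the plain JSON one, which lines up with the `upload_stager.py` (+70) and `data_prep.py` (+36) entries in the file list. Those helpers are not shown in this diff; a sketch of the format-agnostic element loading the tests imply (the function name is assumed, not taken from the release):

```python
import json
from pathlib import Path


def load_elements(path: Path) -> list[dict]:
    # .json fixtures hold one JSON array; .ndjson fixtures hold one JSON
    # object per line. Normalize both to a list of element dicts.
    if path.suffix == ".ndjson":
        with path.open() as f:
            return [json.loads(line) for line in f if line.strip()]
    with path.open() as f:
        return json.load(f)
```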
```diff
--- a/test/integration/connectors/test_qdrant.py
+++ b/test/integration/connectors/test_qdrant.py
@@ -6,10 +6,15 @@ from pathlib import Path
 from typing import AsyncGenerator
 
 import pytest
+from _pytest.fixtures import TopRequest
 from qdrant_client import AsyncQdrantClient
 
 from test.integration.connectors.utils.constants import DESTINATION_TAG
 from test.integration.connectors.utils.docker import container_context
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
 from test.integration.utils import requires_env
 from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.qdrant.cloud import (
@@ -138,7 +143,7 @@ async def test_qdrant_destination_server(upload_file: Path, tmp_path: Path, dock
         output_dir=tmp_path,
         output_filename=upload_file.name,
     )
-
+    uploader.precheck()
     if uploader.is_async():
         await uploader.run_async(path=staged_upload_file, file_data=file_data)
     else:
@@ -183,10 +188,28 @@ async def test_qdrant_destination_cloud(upload_file: Path, tmp_path: Path):
         output_dir=tmp_path,
         output_filename=upload_file.name,
     )
-
+    uploader.precheck()
     if uploader.is_async():
         await uploader.run_async(path=staged_upload_file, file_data=file_data)
     else:
         uploader.run(path=staged_upload_file, file_data=file_data)
     async with qdrant_client(connection_kwargs) as client:
         await validate_upload(client=client, upload_file=upload_file)
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_qdrant_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = LocalQdrantUploadStager(
+        upload_stager_config=LocalQdrantUploadStagerConfig(),
+    )
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=LOCAL_CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )
```
```diff
--- a/test/integration/connectors/test_s3.py
+++ b/test/integration/connectors/test_s3.py
@@ -11,8 +11,8 @@ from test.integration.connectors.utils.constants import (
     env_setup_path,
 )
 from test.integration.connectors.utils.docker_compose import docker_compose_context
-from test.integration.connectors.utils.validation import (
-    ValidationConfigs,
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
     source_connector_validation,
 )
 from test.integration.utils import requires_env
@@ -62,7 +62,7 @@ async def test_s3_source(anon_connection_config: S3ConnectionConfig):
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=ValidationConfigs(
+        configs=SourceValidationConfigs(
             test_id="s3",
             predownload_file_data_check=validate_predownload_file_data,
             postdownload_file_data_check=validate_postdownload_file_data,
@@ -85,7 +85,7 @@ async def test_s3_source_special_char(anon_connection_config: S3ConnectionConfig
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=ValidationConfigs(
+        configs=SourceValidationConfigs(
             test_id="s3-specialchar",
             predownload_file_data_check=validate_predownload_file_data,
             postdownload_file_data_check=validate_postdownload_file_data,
@@ -121,7 +121,7 @@ async def test_s3_minio_source(anon_connection_config: S3ConnectionConfig):
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=ValidationConfigs(
+        configs=SourceValidationConfigs(
             test_id="s3-minio",
             predownload_file_data_check=validate_predownload_file_data,
             postdownload_file_data_check=validate_postdownload_file_data,
@@ -165,11 +165,14 @@ async def test_s3_destination(upload_file: Path):
         identifier="mock file data",
     )
     try:
+        uploader.precheck()
         if uploader.is_async():
             await uploader.run_async(path=upload_file, file_data=file_data)
         else:
             uploader.run(path=upload_file, file_data=file_data)
-        uploaded_files = s3fs.ls(path=destination_path)
+        uploaded_files = [
+            Path(file) for file in s3fs.ls(path=destination_path) if Path(file).name != "_empty"
+        ]
         assert len(uploaded_files) == 1
     finally:
         s3fs.rm(path=destination_path, recursive=True)
```
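Two behavioral notes surface in the s3 hunks: `uploader.precheck()` now runs before the upload, and the uploaded-file count now ignores an object named `_empty`. The fsspec connector changes that presumably create that marker (`fsspec.py`, +33 -29) are not shown here; one plausible mechanism is a write-probe precheck along these lines (an assumption, not the released code):

```python
import s3fs


def precheck_write_access(fs: s3fs.S3FileSystem, destination_path: str) -> None:
    # Hypothetical sketch: touching a zero-byte "_empty" marker verifies
    # write access to the destination, which would explain why the test
    # filters that name out before asserting on the upload count.
    fs.touch(f"{destination_path}/_empty")
```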
```diff
--- a/test/integration/connectors/utils/docker.py
+++ b/test/integration/connectors/utils/docker.py
@@ -44,6 +44,7 @@ def get_container(
     docker_client: docker.DockerClient,
     image: str,
     ports: dict,
+    name: Optional[str] = "connector_test",
     environment: Optional[dict] = None,
     volumes: Optional[dict] = None,
     healthcheck: Optional[HealthCheck] = None,
@@ -59,6 +60,8 @@ def get_container(
         run_kwargs["volumes"] = volumes
     if healthcheck:
         run_kwargs["healthcheck"] = healthcheck.model_dump()
+    if name:
+        run_kwargs["name"] = name
     container: Container = docker_client.containers.run(**run_kwargs)
     return container
 
@@ -112,6 +115,7 @@ def container_context(
     healthcheck: Optional[HealthCheck] = None,
     healthcheck_retries: int = 30,
     docker_client: Optional[docker.DockerClient] = None,
+    name: Optional[str] = "connector_test",
 ):
     docker_client = docker_client or docker.from_env()
     print(f"pulling image {image}")
@@ -125,6 +129,7 @@ def container_context(
         environment=environment,
         volumes=volumes,
         healthcheck=healthcheck,
+        name=name,
     )
     if healthcheck_data := get_healthcheck(container):
         # Mirror whatever healthcheck config set on container
@@ -143,3 +148,4 @@ def container_context(
     finally:
         if container:
             container.kill()
+            container.remove()
```
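Taken together, the docker.py changes give every test container a stable name (defaulting to `connector_test`) and tear it down with `remove()` after `kill()`, so a finished run no longer leaves a stopped container behind; since Docker refuses to start two containers with the same name, a stale leftover would now fail fast rather than silently accumulate. Usage, borrowing the values from the chroma fixture above:

```python
from test.integration.connectors.utils.docker import HealthCheck, container_context

with container_context(
    image="chromadb/chroma:latest",
    ports={8000: 8000},
    name="chroma_int_test",  # stable name; duplicate names error out immediately
    healthcheck=HealthCheck(
        interval=5,
        timeout=10,
        retries=3,
        test="timeout 10s bash -c ':> /dev/tcp/127.0.0.1/8000' || exit 1",
    ),
) as container:
    ...  # exercise the connector against the running service
```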