unstructured-ingest 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- test/integration/connectors/test_astradb.py +109 -0
- test/integration/connectors/test_azure_cog_search.py +233 -0
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_kafka.py +167 -0
- test/integration/connectors/test_onedrive.py +112 -0
- test/integration/connectors/test_pinecone.py +161 -0
- test/integration/connectors/test_qdrant.py +137 -0
- test/integration/connectors/test_s3.py +23 -0
- test/integration/connectors/utils/docker.py +2 -1
- test/integration/connectors/utils/validation.py +73 -22
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cmds/__init__.py +2 -2
- unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
- unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
- unstructured_ingest/connector/kafka.py +0 -1
- unstructured_ingest/interfaces.py +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/processes/chunker.py +2 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +16 -5
- unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
- unstructured_ingest/v2/processes/connectors/astradb.py +33 -21
- unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +112 -35
- unstructured_ingest/v2/processes/connectors/confluence.py +195 -0
- unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +2 -4
- unstructured_ingest/v2/processes/connectors/delta_table.py +17 -5
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +28 -10
- unstructured_ingest/v2/processes/connectors/gitlab.py +267 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +118 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +251 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +165 -5
- unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +83 -12
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +168 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
- unstructured_ingest/v2/processes/connectors/slack.py +2 -2
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +3 -1
- unstructured_ingest/v2/processes/connectors/sql/sql.py +2 -4
- unstructured_ingest/v2/processes/partitioner.py +14 -3
- unstructured_ingest/v2/unstructured_api.py +24 -10
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/METADATA +17 -16
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/RECORD +77 -41
- unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
- /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
- /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
- /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/top_level.txt +0 -0
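Among the listed changes, the Azure Cognitive Search connector modules are renamed to Azure AI Search across the CLI commands, the v1 connector, the runner writers, and the v2 connectors. A minimal sketch of the import-path change implied by the listing above (module level only; the names exported inside the module are not shown in this diff):

    # 0.2.1 (old module path):
    # from unstructured_ingest.v2.processes.connectors import azure_cognitive_search
    # 0.3.0 (renamed module path, per the file listing):
    from unstructured_ingest.v2.processes.connectors import azure_ai_search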
test/integration/connectors/test_onedrive.py
@@ -0,0 +1,112 @@
+import os
+import uuid
+from pathlib import Path
+
+import pytest
+from office365.graph_client import GraphClient
+
+from test.integration.connectors.utils.constants import (
+    DESTINATION_TAG,
+)
+from test.integration.utils import requires_env
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.onedrive import (
+    CONNECTOR_TYPE,
+    OnedriveAccessConfig,
+    OnedriveConnectionConfig,
+    OnedriveUploader,
+    OnedriveUploaderConfig,
+)
+
+
+@pytest.fixture
+def onedrive_test_folder() -> str:
+    """
+    Pytest fixture that creates a test folder in OneDrive and deletes it after test run.
+    """
+    connection_config = get_connection_config()
+    user_pname = connection_config.user_pname
+
+    # Get the OneDrive client
+    client: GraphClient = connection_config.get_client()
+    drive = client.users[user_pname].drive
+
+    # Generate a unique test folder path
+    test_folder_path = f"utic-test-output-{uuid.uuid4()}"
+
+    # Create the test folder
+    root = drive.root
+    folder = root.create_folder(test_folder_path).execute_query()
+    print(f"created folder: {folder.name}")
+    try:
+        yield test_folder_path
+    finally:
+        # Teardown: delete the test folder and its contents
+        folder.delete_object().execute_query()
+        print(f"successfully deleted folder: {folder.name}")
+
+
+def get_connection_config():
+    """
+    Pytest fixture that provides the OnedriveConnectionConfig for tests.
+    """
+    client_id = os.getenv("MS_CLIENT_ID")
+    client_secret = os.getenv("MS_CLIENT_CRED")
+    tenant_id = os.getenv("MS_TENANT_ID")
+    user_pname = os.getenv("MS_USER_PNAME")
+
+    connection_config = OnedriveConnectionConfig(
+        client_id=client_id,
+        tenant=tenant_id,
+        user_pname=user_pname,
+        access_config=OnedriveAccessConfig(client_cred=client_secret),
+    )
+    return connection_config
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
+def test_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
+    """
+    Integration test for the OneDrive destination connector.
+
+    This test uploads a file to OneDrive and verifies that it exists.
+    """
+    connection_config = get_connection_config()
+    # Retrieve user principal name from the connection config
+    user_pname = connection_config.user_pname
+
+    # The test folder is provided by the fixture
+    destination_folder = onedrive_test_folder
+    destination_fullpath = f"{destination_folder}/{upload_file.name}"
+
+    # Configure the uploader with remote_url
+    upload_config = OnedriveUploaderConfig(remote_url=f"onedrive://{destination_folder}")
+
+    uploader = OnedriveUploader(
+        connection_config=connection_config,
+        upload_config=upload_config,
+    )
+
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(
+            fullpath=destination_fullpath,
+            filename=upload_file.name,
+        ),
+        connector_type=CONNECTOR_TYPE,
+        identifier="mock_file_data",
+    )
+    uploader.precheck()
+    uploader.run(path=upload_file, file_data=file_data)
+
+    # Verify that the file was uploaded
+    client = connection_config.get_client()
+    drive = client.users[user_pname].drive
+
+    uploaded_file = (
+        drive.root.get_by_path(destination_fullpath).select(["id", "name"]).get().execute_query()
+    )
+
+    # Check if the file exists
+    assert uploaded_file is not None
+    assert uploaded_file.name == upload_file.name
test/integration/connectors/test_pinecone.py
@@ -0,0 +1,161 @@
+import json
+import os
+import time
+from pathlib import Path
+from uuid import uuid4
+
+import pytest
+from pinecone import Pinecone, ServerlessSpec
+from pinecone.core.openapi.shared.exceptions import NotFoundException
+
+from test.integration.connectors.utils.constants import (
+    DESTINATION_TAG,
+)
+from test.integration.utils import requires_env
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connectors.pinecone import (
+    CONNECTOR_TYPE,
+    PineconeAccessConfig,
+    PineconeConnectionConfig,
+    PineconeUploader,
+    PineconeUploaderConfig,
+    PineconeUploadStager,
+    PineconeUploadStagerConfig,
+)
+
+API_KEY = "PINECONE_API_KEY"
+
+
+def get_api_key() -> str:
+    api_key = os.getenv(API_KEY, None)
+    assert api_key
+    return api_key
+
+
+def wait_for_delete(client: Pinecone, index_name: str, timeout=60, interval=1) -> None:
+    start = time.time()
+    while True and time.time() - start < timeout:
+        try:
+            description = client.describe_index(name=index_name)
+            logger.info(f"current index status: {description}")
+        except NotFoundException:
+            return
+        time.sleep(interval)
+
+    raise TimeoutError("time out waiting for index to delete")
+
+
+def wait_for_ready(client: Pinecone, index_name: str, timeout=60, interval=1) -> None:
+    def is_ready_status():
+        description = client.describe_index(name=index_name)
+        status = description["status"]
+        return status["ready"]
+
+    start = time.time()
+    is_ready = is_ready_status()
+    while not is_ready and time.time() - start < timeout:
+        time.sleep(interval)
+        is_ready = is_ready_status()
+    if not is_ready:
+        raise TimeoutError("time out waiting for index to be ready")
+
+
+@pytest.fixture
+def pinecone_index() -> str:
+    pinecone = Pinecone(api_key=get_api_key())
+    random_id = str(uuid4()).split("-")[0]
+    index_name = f"ingest-test-{random_id}"
+    assert len(index_name) < 45
+    logger.info(f"Creating index: {index_name}")
+    try:
+        pinecone.create_index(
+            name=index_name,
+            dimension=384,
+            metric="cosine",
+            spec=ServerlessSpec(
+                cloud="aws",
+                region="us-east-1",
+            ),
+            deletion_protection="disabled",
+        )
+        wait_for_ready(client=pinecone, index_name=index_name)
+        yield index_name
+    except Exception as e:
+        logger.error(f"failed to create index {index_name}: {e}")
+    finally:
+        try:
+            logger.info(f"deleting index: {index_name}")
+            pinecone.delete_index(name=index_name)
+            wait_for_delete(client=pinecone, index_name=index_name)
+        except NotFoundException:
+            return
+
+
+def validate_pinecone_index(
+    index_name: str, expected_num_of_vectors: int, retries=30, interval=1
+) -> None:
+    # Because there's a delay for the index to catch up to the recent writes, add in a retry
+    pinecone = Pinecone(api_key=get_api_key())
+    index = pinecone.Index(name=index_name)
+    vector_count = -1
+    for i in range(retries):
+        index_stats = index.describe_index_stats()
+        vector_count = index_stats["total_vector_count"]
+        if vector_count == expected_num_of_vectors:
+            logger.info(f"expected {expected_num_of_vectors} == vector count {vector_count}")
+            break
+        logger.info(
+            f"retry attempt {i}: expected {expected_num_of_vectors} != vector count {vector_count}"
+        )
+        time.sleep(interval)
+    assert vector_count == expected_num_of_vectors
+
+
+@requires_env(API_KEY)
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp_dir: Path):
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=CONNECTOR_TYPE,
+        identifier="pinecone_mock_id",
+    )
+    connection_config = PineconeConnectionConfig(
+        index_name=pinecone_index,
+        access_config=PineconeAccessConfig(api_key=get_api_key()),
+    )
+    stager_config = PineconeUploadStagerConfig()
+    stager = PineconeUploadStager(upload_stager_config=stager_config)
+    new_upload_file = stager.run(
+        elements_filepath=upload_file,
+        output_dir=temp_dir,
+        output_filename=upload_file.name,
+        file_data=file_data,
+    )
+
+    upload_config = PineconeUploaderConfig()
+    uploader = PineconeUploader(connection_config=connection_config, upload_config=upload_config)
+    uploader.precheck()
+
+    if uploader.is_async():
+        await uploader.run_async(path=new_upload_file, file_data=file_data)
+    else:
+        uploader.run(path=new_upload_file, file_data=file_data)
+    with new_upload_file.open() as f:
+        staged_content = json.load(f)
+    expected_num_of_vectors = len(staged_content)
+    logger.info("validating first upload")
+    validate_pinecone_index(
+        index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
+    )
+
+    # Rerun uploader and make sure no duplicates exist
+    if uploader.is_async():
+        await uploader.run_async(path=new_upload_file, file_data=file_data)
+    else:
+        uploader.run(path=new_upload_file, file_data=file_data)
+    logger.info("validating second upload")
+    validate_pinecone_index(
+        index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
+    )
test/integration/connectors/test_qdrant.py
@@ -0,0 +1,137 @@
+import json
+import uuid
+from contextlib import asynccontextmanager
+from pathlib import Path
+from typing import AsyncGenerator
+
+import pytest
+from qdrant_client import AsyncQdrantClient
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG
+from test.integration.connectors.utils.docker import container_context
+from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.qdrant.local import (
+    CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE,
+)
+from unstructured_ingest.v2.processes.connectors.qdrant.local import (
+    LocalQdrantConnectionConfig,
+    LocalQdrantUploader,
+    LocalQdrantUploaderConfig,
+    LocalQdrantUploadStager,
+    LocalQdrantUploadStagerConfig,
+)
+from unstructured_ingest.v2.processes.connectors.qdrant.server import (
+    CONNECTOR_TYPE as SERVER_CONNECTOR_TYPE,
+)
+from unstructured_ingest.v2.processes.connectors.qdrant.server import (
+    ServerQdrantConnectionConfig,
+    ServerQdrantUploader,
+    ServerQdrantUploaderConfig,
+    ServerQdrantUploadStager,
+    ServerQdrantUploadStagerConfig,
+)
+
+COLLECTION_NAME = f"test-coll-{uuid.uuid4().hex[:12]}"
+VECTORS_CONFIG = {"size": 384, "distance": "Cosine"}
+
+
+@asynccontextmanager
+async def qdrant_client(client_params: dict) -> AsyncGenerator[AsyncQdrantClient, None]:
+    client = AsyncQdrantClient(**client_params)
+    try:
+        yield client
+    finally:
+        await client.close()
+
+
+async def validate_upload(client: AsyncQdrantClient, upload_file: Path):
+    with upload_file.open() as upload_fp:
+        elements = json.load(upload_fp)
+    expected_point_count = len(elements)
+    first_element = elements[0]
+    expected_text = first_element["text"]
+    embeddings = first_element["embeddings"]
+    collection = await client.get_collection(COLLECTION_NAME)
+    assert collection.points_count == expected_point_count
+
+    response = await client.query_points(COLLECTION_NAME, query=embeddings, limit=1)
+    assert response.points[0].payload is not None
+    assert response.points[0].payload["text"] == expected_text
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(LOCAL_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant")
+async def test_qdrant_destination_local(upload_file: Path, tmp_path: Path):
+    connection_kwargs = {"path": str(tmp_path / "qdrant")}
+    async with qdrant_client(connection_kwargs) as client:
+        await client.create_collection(COLLECTION_NAME, vectors_config=VECTORS_CONFIG)
+    AsyncQdrantClient(**connection_kwargs)
+    stager = LocalQdrantUploadStager(
+        upload_stager_config=LocalQdrantUploadStagerConfig(),
+    )
+    uploader = LocalQdrantUploader(
+        connection_config=LocalQdrantConnectionConfig(**connection_kwargs),
+        upload_config=LocalQdrantUploaderConfig(collection_name=COLLECTION_NAME),
+    )
+
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=LOCAL_CONNECTOR_TYPE,
+        identifier="mock-file-data",
+    )
+
+    staged_upload_file = stager.run(
+        elements_filepath=upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=upload_file.name,
+    )
+
+    if uploader.is_async():
+        await uploader.run_async(path=staged_upload_file, file_data=file_data)
+    else:
+        uploader.run(path=upload_file, file_data=file_data)
+    async with qdrant_client(connection_kwargs) as client:
+        await validate_upload(client=client, upload_file=upload_file)
+
+
+@pytest.fixture
+def docker_context():
+    with container_context(image="qdrant/qdrant:latest", ports={"6333": "6333"}) as container:
+        yield container
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant")
+async def test_qdrant_destination_server(upload_file: Path, tmp_path: Path, docker_context):
+    connection_kwargs = {"location": "http://localhost:6333"}
+    async with qdrant_client(connection_kwargs) as client:
+        await client.create_collection(COLLECTION_NAME, vectors_config=VECTORS_CONFIG)
+    AsyncQdrantClient(**connection_kwargs)
+    stager = ServerQdrantUploadStager(
+        upload_stager_config=ServerQdrantUploadStagerConfig(),
+    )
+    uploader = ServerQdrantUploader(
+        connection_config=ServerQdrantConnectionConfig(**connection_kwargs),
+        upload_config=ServerQdrantUploaderConfig(collection_name=COLLECTION_NAME),
+    )
+
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=SERVER_CONNECTOR_TYPE,
+        identifier="mock-file-data",
+    )
+
+    staged_upload_file = stager.run(
+        elements_filepath=upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=upload_file.name,
+    )
+
+    if uploader.is_async():
+        await uploader.run_async(path=staged_upload_file, file_data=file_data)
+    else:
+        uploader.run(path=upload_file, file_data=file_data)
+    async with qdrant_client(connection_kwargs) as client:
+        await validate_upload(client=client, upload_file=upload_file)
test/integration/connectors/test_s3.py
@@ -71,6 +71,29 @@ async def test_s3_source(anon_connection_config: S3ConnectionConfig):
         )
 
 
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+async def test_s3_source_special_char(anon_connection_config: S3ConnectionConfig):
+    indexer_config = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/special-characters/")
+    with tempfile.TemporaryDirectory() as tempdir:
+        tempdir_path = Path(tempdir)
+        download_config = S3DownloaderConfig(download_dir=tempdir_path)
+        indexer = S3Indexer(connection_config=anon_connection_config, index_config=indexer_config)
+        downloader = S3Downloader(
+            connection_config=anon_connection_config, download_config=download_config
+        )
+        await source_connector_validation(
+            indexer=indexer,
+            downloader=downloader,
+            configs=ValidationConfigs(
+                test_id="s3-specialchar",
+                predownload_file_data_check=validate_predownload_file_data,
+                postdownload_file_data_check=validate_postdownload_file_data,
+                expected_num_files=1,
+            ),
+        )
+
+
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
 async def test_s3_source_no_access(anon_connection_config: S3ConnectionConfig):
test/integration/connectors/utils/docker.py
@@ -47,14 +47,15 @@ def healthcheck_wait(container: Container, timeout: int = 10) -> None:
 
 @contextmanager
 def container_context(
-    docker_client: docker.DockerClient,
     image: str,
     ports: dict,
     environment: Optional[dict] = None,
     volumes: Optional[dict] = None,
     healthcheck: Optional[dict] = None,
     healthcheck_timeout: int = 10,
+    docker_client: Optional[docker.DockerClient] = None,
 ):
+    docker_client = docker_client or docker.from_env()
     container: Optional[Container] = None
     try:
         container = get_container(
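With this change, docker_client becomes optional and the helper falls back to docker.from_env(). A minimal usage sketch, mirroring the call made by the new Qdrant server test above (the image, ports, and printed attribute are illustrative only):

    from test.integration.connectors.utils.docker import container_context

    # No docker_client argument needed any more; the helper builds one via docker.from_env()
    with container_context(image="qdrant/qdrant:latest", ports={"6333": "6333"}) as container:
        print(container.status)  # docker-py Container object yielded by the context manager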
test/integration/connectors/utils/validation.py
@@ -7,13 +7,14 @@ from pathlib import Path
 from typing import Callable, Optional
 
 import pandas as pd
+from bs4 import BeautifulSoup
 from deepdiff import DeepDiff
 
 from test.integration.connectors.utils.constants import expected_results_path
 from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
 
 
-def
+def json_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
     expected_df = pd.read_csv(expected_filepath)
     current_df = pd.read_csv(current_filepath)
     if expected_df.equals(current_df):
@@ -27,6 +28,42 @@ def pandas_df_equality_check(expected_filepath: Path, current_filepath: Path) ->
     return False
 
 
+def html_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+    with expected_filepath.open() as expected_f:
+        expected_soup = BeautifulSoup(expected_f, "html.parser")
+    with current_filepath.open() as current_f:
+        current_soup = BeautifulSoup(current_f, "html.parser")
+    return expected_soup.text == current_soup.text
+
+
+def txt_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+    with expected_filepath.open() as expected_f:
+        expected_text_lines = expected_f.readlines()
+    with current_filepath.open() as current_f:
+        current_text_lines = current_f.readlines()
+    if len(expected_text_lines) != len(current_text_lines):
+        print(
+            f"Lines in expected text file ({len(expected_text_lines)}) "
+            f"don't match current text file ({len(current_text_lines)})"
+        )
+        return False
+    expected_text = "\n".join(expected_text_lines)
+    current_text = "\n".join(current_text_lines)
+    if expected_text == current_text:
+        return True
+    print("txt content don't match:")
+    print(f"expected: {expected_text}")
+    print(f"current: {current_text}")
+    return False
+
+
+file_type_equality_check = {
+    ".json": json_equality_check,
+    ".html": html_equality_check,
+    ".txt": txt_equality_check,
+}
+
+
 @dataclass
 class ValidationConfigs:
     test_id: str
@@ -39,6 +76,7 @@ class ValidationConfigs:
     )
     exclude_fields_extend: list[str] = field(default_factory=list)
     validate_downloaded_files: bool = False
+    validate_file_data: bool = True
     downloaded_file_equality_check: Optional[Callable[[Path, Path], bool]] = None
 
     def get_exclude_fields(self) -> list[str]:
@@ -86,7 +124,7 @@
 
 def get_files(dir_path: Path) -> list[str]:
     return [
-        str(f).replace(str(dir_path), "").lstrip("/") for f in dir_path.
+        str(f).replace(str(dir_path), "").lstrip("/") for f in dir_path.rglob("*") if f.is_file()
     ]
 
 
@@ -122,6 +160,23 @@ def check_contents(
     assert not found_diff, f"Diffs found between files: {found_diff}"
 
 
+def detect_diff(
+    configs: ValidationConfigs, expected_filepath: Path, current_filepath: Path
+) -> bool:
+    if expected_filepath.suffix != current_filepath.suffix:
+        return True
+    if downloaded_file_equality_check := configs.downloaded_file_equality_check:
+        return not downloaded_file_equality_check(expected_filepath, current_filepath)
+    current_suffix = expected_filepath.suffix
+    if current_suffix in file_type_equality_check:
+        equality_check_callable = file_type_equality_check[current_suffix]
+        return not equality_check_callable(
+            expected_filepath=expected_filepath, current_filepath=current_filepath
+        )
+    # Fallback is using filecmp.cmp to compare the files
+    return not filecmp.cmp(expected_filepath, current_filepath, shallow=False)
+
+
 def check_raw_file_contents(
     expected_output_dir: Path,
     current_output_dir: Path,
@@ -133,15 +188,7 @@ def check_raw_file_contents(
     for current_file in current_files:
         current_file_path = current_output_dir / current_file
         expected_file_path = expected_output_dir / current_file
-        if
-            is_different = downloaded_file_equality_check(expected_file_path, current_file_path)
-        elif expected_file_path.suffix == ".csv" and current_file_path.suffix == ".csv":
-            is_different = not pandas_df_equality_check(
-                expected_filepath=expected_file_path, current_filepath=current_file_path
-            )
-        else:
-            is_different = not filecmp.cmp(expected_file_path, current_file_path, shallow=False)
-        if is_different:
+        if detect_diff(configs, expected_file_path, current_file_path):
             found_diff = True
             files.append(str(expected_file_path))
             print(f"diffs between files {expected_file_path} and {current_file_path}")
@@ -185,17 +232,19 @@ def update_fixtures(
     download_dir: Path,
     all_file_data: list[FileData],
     save_downloads: bool = False,
+    save_filedata: bool = True,
 ):
     # Delete current files
     shutil.rmtree(path=output_dir, ignore_errors=True)
     output_dir.mkdir(parents=True)
     # Rewrite the current file data
-
-
-
-
-
-
+    if save_filedata:
+        file_data_output_path = output_dir / "file_data"
+        file_data_output_path.mkdir(parents=True, exist_ok=True)
+        for file_data in all_file_data:
+            file_data_path = file_data_output_path / f"{file_data.identifier}.json"
+            with file_data_path.open(mode="w") as f:
+                json.dump(file_data.to_dict(), f, indent=2)
 
     # Record file structure of download directory
     download_files = get_files(dir_path=download_dir)
@@ -229,11 +278,12 @@ def run_all_validations(
         predownload_file_data=pre_data, postdownload_file_data=post_data
     )
     configs.run_download_dir_validation(download_dir=download_dir)
-
-
-
-
-
+    if configs.validate_file_data:
+        run_expected_results_validation(
+            expected_output_dir=test_output_dir / "file_data",
+            all_file_data=postdownload_file_data,
+            configs=configs,
+        )
     download_files = get_files(dir_path=download_dir)
     download_files.sort()
     run_directory_structure_validation(
@@ -291,4 +341,5 @@ async def source_connector_validation(
         download_dir=download_dir,
         all_file_data=all_postdownload_file_data,
         save_downloads=configs.validate_downloaded_files,
+        save_filedata=configs.validate_file_data,
     )
test/unit/v2/__init__.py
ADDED (file without changes)
test/unit/v2/chunkers/test_chunkers.py
@@ -0,0 +1,49 @@
+import random
+
+import faker
+import pytest
+
+from unstructured_ingest.v2.processes.chunker import Chunker, ChunkerConfig
+
+fake = faker.Faker()
+
+
+def generate_chunker_config_params() -> dict:
+    params = {}
+    random_val = random.random()
+    if random_val < 0.5:
+        params["chunking_strategy"] = fake.word() if random.random() < 0.5 else None
+    params["chunk_combine_text_under_n_chars"] = (
+        fake.random_int() if random.random() < 0.5 else None
+    )
+    params["chunk_include_orig_elements"] = fake.boolean() if random.random() < 0.5 else None
+    params["chunk_max_characters"] = fake.random_int()
+    params["chunk_multipage_sections"] = fake.boolean()
+    params["chunk_new_after_n_chars"] = fake.random_int() if random.random() < 0.5 else None
+    params["chunk_overlap"] = fake.random_int() if random.random() < 0.5 else None
+    params["chunk_overlap_all"] = fake.boolean() if random.random() < 0.5 else None
+    if random_val < 0.5:
+        params["chunk_by_api"] = True
+        params["chunking_endpoint"] = fake.url()
+        params["chunk_api_key"] = fake.password()
+    else:
+        params["chunk_by_api"] = False
+
+    return params
+
+
+@pytest.mark.parametrize(
+    "partition_config_params", [generate_chunker_config_params() for i in range(10)]
+)
+def test_chunker_config(partition_config_params: dict):
+    chunker_config = ChunkerConfig.model_validate(partition_config_params)
+    assert chunker_config
+
+
+@pytest.mark.parametrize(
+    "partition_config_params", [generate_chunker_config_params() for i in range(10)]
+)
+def test_chunker(partition_config_params: dict):
+    chunker_config = ChunkerConfig.model_validate(partition_config_params)
+    chunker = Chunker(config=chunker_config)
+    assert chunker
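The tests above validate randomly generated parameter sets; a minimal sketch of building the same objects with explicit values (the field values below are arbitrary examples, not defaults from the package, and as in the test any string passes validation for the strategy field):

    from unstructured_ingest.v2.processes.chunker import Chunker, ChunkerConfig

    config = ChunkerConfig.model_validate(
        {
            "chunking_strategy": "by_title",  # assumed strategy name
            "chunk_max_characters": 1500,
            "chunk_overlap": 100,
            "chunk_by_api": False,
        }
    )
    chunker = Chunker(config=config)
    assert chunker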