unstructured-ingest 0.7.1-py3-none-any.whl → 1.0.1-py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
Potentially problematic release: this version of unstructured-ingest may be problematic.
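For context, below is a minimal sketch of how such a comparison can be reproduced locally by listing the files added and removed between the two wheels. It assumes both wheel files have already been downloaded (for example with `pip download unstructured-ingest==0.7.1 --no-deps` and the same for 1.0.1); the local filenames below are assumptions, not part of this diff.

import zipfile

# Assumed local filenames; adjust to wherever the wheels were downloaded.
OLD_WHEEL = "unstructured_ingest-0.7.1-py3-none-any.whl"
NEW_WHEEL = "unstructured_ingest-1.0.1-py3-none-any.whl"

def wheel_members(path: str) -> set[str]:
    # A wheel is a zip archive, so its member list can be read directly.
    with zipfile.ZipFile(path) as wheel:
        return set(wheel.namelist())

old_files = wheel_members(OLD_WHEEL)
new_files = wheel_members(NEW_WHEEL)
print("removed:", *sorted(old_files - new_files), sep="\n  ")
print("added:", *sorted(new_files - old_files), sep="\n  ")

Comparing the member sets in this way reproduces the file-level summary shown in the list below; the per-file diffs that follow it show the line-level changes.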
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/embed/mixedbreadai.py +0 -1
- unstructured_ingest/interfaces/upload_stager.py +2 -2
- unstructured_ingest/interfaces/uploader.py +3 -3
- unstructured_ingest/logger.py +2 -93
- unstructured_ingest/main.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/processes/chunker.py +4 -0
- unstructured_ingest/processes/connectors/airtable.py +4 -2
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +10 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +2 -2
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/confluence.py +0 -1
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
- unstructured_ingest/processes/connectors/delta_table.py +1 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
- unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -2
- unstructured_ingest/processes/connectors/google_drive.py +0 -2
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
- unstructured_ingest/processes/connectors/kdbai.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +1 -2
- unstructured_ingest/processes/connectors/pinecone.py +0 -1
- unstructured_ingest/processes/connectors/redisdb.py +28 -24
- unstructured_ingest/processes/connectors/salesforce.py +1 -1
- unstructured_ingest/processes/connectors/slack.py +1 -2
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
- unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
- unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
- unstructured_ingest/processes/connectors/sql/sql.py +3 -4
- unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
- unstructured_ingest/processes/connectors/vectara.py +0 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
- unstructured_ingest/processes/embedder.py +2 -2
- unstructured_ingest/processes/filter.py +1 -1
- unstructured_ingest/processes/partitioner.py +4 -0
- unstructured_ingest/processes/utils/blob_storage.py +2 -2
- unstructured_ingest/unstructured_api.py +13 -8
- unstructured_ingest/utils/data_prep.py +8 -32
- unstructured_ingest/utils/string_and_date_utils.py +3 -3
- unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +54 -187
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
- examples/__init__.py +0 -0
- examples/airtable.py +0 -44
- examples/azure_cognitive_search.py +0 -55
- examples/chroma.py +0 -54
- examples/couchbase.py +0 -55
- examples/databricks_volumes_dest.py +0 -55
- examples/databricks_volumes_source.py +0 -53
- examples/delta_table.py +0 -45
- examples/discord_example.py +0 -36
- examples/elasticsearch.py +0 -49
- examples/google_drive.py +0 -45
- examples/kdbai.py +0 -54
- examples/local.py +0 -36
- examples/milvus.py +0 -44
- examples/mongodb.py +0 -53
- examples/opensearch.py +0 -50
- examples/pinecone.py +0 -57
- examples/s3.py +0 -38
- examples/salesforce.py +0 -44
- examples/sharepoint.py +0 -47
- examples/singlestore.py +0 -49
- examples/sql.py +0 -90
- examples/vectara.py +0 -54
- examples/weaviate.py +0 -44
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +0 -31
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +0 -38
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +0 -273
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +0 -90
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +0 -14
- test/integration/connectors/duckdb/test_duckdb.py +0 -90
- test/integration/connectors/duckdb/test_motherduck.py +0 -95
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +0 -34
- test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
- test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
- test/integration/connectors/sql/test_postgres.py +0 -201
- test/integration/connectors/sql/test_singlestore.py +0 -182
- test/integration/connectors/sql/test_snowflake.py +0 -244
- test/integration/connectors/sql/test_sqlite.py +0 -168
- test/integration/connectors/sql/test_vastdb.py +0 -34
- test/integration/connectors/test_astradb.py +0 -287
- test/integration/connectors/test_azure_ai_search.py +0 -254
- test/integration/connectors/test_chroma.py +0 -136
- test/integration/connectors/test_confluence.py +0 -111
- test/integration/connectors/test_delta_table.py +0 -183
- test/integration/connectors/test_dropbox.py +0 -151
- test/integration/connectors/test_github.py +0 -49
- test/integration/connectors/test_google_drive.py +0 -257
- test/integration/connectors/test_jira.py +0 -67
- test/integration/connectors/test_lancedb.py +0 -247
- test/integration/connectors/test_milvus.py +0 -208
- test/integration/connectors/test_mongodb.py +0 -335
- test/integration/connectors/test_neo4j.py +0 -244
- test/integration/connectors/test_notion.py +0 -152
- test/integration/connectors/test_onedrive.py +0 -163
- test/integration/connectors/test_pinecone.py +0 -387
- test/integration/connectors/test_qdrant.py +0 -216
- test/integration/connectors/test_redis.py +0 -143
- test/integration/connectors/test_s3.py +0 -184
- test/integration/connectors/test_sharepoint.py +0 -222
- test/integration/connectors/test_vectara.py +0 -282
- test/integration/connectors/test_zendesk.py +0 -120
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +0 -13
- test/integration/connectors/utils/docker.py +0 -151
- test/integration/connectors/utils/docker_compose.py +0 -59
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +0 -77
- test/integration/connectors/utils/validation/equality.py +0 -76
- test/integration/connectors/utils/validation/source.py +0 -331
- test/integration/connectors/utils/validation/utils.py +0 -36
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +0 -15
- test/integration/connectors/weaviate/test_cloud.py +0 -39
- test/integration/connectors/weaviate/test_local.py +0 -152
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +0 -13
- test/integration/embedders/test_azure_openai.py +0 -57
- test/integration/embedders/test_bedrock.py +0 -103
- test/integration/embedders/test_huggingface.py +0 -24
- test/integration/embedders/test_mixedbread.py +0 -71
- test/integration/embedders/test_octoai.py +0 -75
- test/integration/embedders/test_openai.py +0 -74
- test/integration/embedders/test_togetherai.py +0 -71
- test/integration/embedders/test_vertexai.py +0 -63
- test/integration/embedders/test_voyageai.py +0 -79
- test/integration/embedders/utils.py +0 -66
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +0 -76
- test/integration/utils.py +0 -15
- test/unit/__init__.py +0 -0
- test/unit/chunkers/__init__.py +0 -0
- test/unit/chunkers/test_chunkers.py +0 -49
- test/unit/connectors/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
- test/unit/connectors/motherduck/__init__.py +0 -0
- test/unit/connectors/motherduck/test_base.py +0 -73
- test/unit/connectors/sql/__init__.py +0 -0
- test/unit/connectors/sql/test_sql.py +0 -152
- test/unit/connectors/test_confluence.py +0 -71
- test/unit/connectors/test_jira.py +0 -401
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +0 -42
- test/unit/embed/test_octoai.py +0 -27
- test/unit/embed/test_openai.py +0 -28
- test/unit/embed/test_vertexai.py +0 -25
- test/unit/embed/test_voyageai.py +0 -24
- test/unit/embedders/__init__.py +0 -0
- test/unit/embedders/test_bedrock.py +0 -36
- test/unit/embedders/test_huggingface.py +0 -48
- test/unit/embedders/test_mixedbread.py +0 -37
- test/unit/embedders/test_octoai.py +0 -35
- test/unit/embedders/test_openai.py +0 -35
- test/unit/embedders/test_togetherai.py +0 -37
- test/unit/embedders/test_vertexai.py +0 -37
- test/unit/embedders/test_voyageai.py +0 -38
- test/unit/partitioners/__init__.py +0 -0
- test/unit/partitioners/test_partitioner.py +0 -63
- test/unit/test_error.py +0 -27
- test/unit/test_html.py +0 -112
- test/unit/test_interfaces.py +0 -26
- test/unit/test_logger.py +0 -78
- test/unit/test_utils.py +0 -220
- test/unit/utils/__init__.py +0 -0
- test/unit/utils/data_generator.py +0 -32
- unstructured_ingest-0.7.1.dist-info/METADATA +0 -383
- unstructured_ingest-0.7.1.dist-info/top_level.txt +0 -3
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
--- a/test/integration/connectors/test_pinecone.py
+++ /dev/null
@@ -1,387 +0,0 @@
-import json
-import math
-import os
-import re
-import time
-from pathlib import Path
-from typing import Generator
-from uuid import uuid4
-
-import pytest
-from _pytest.fixtures import TopRequest
-from pinecone import Pinecone, ServerlessSpec
-from pinecone.core.openapi.shared.exceptions import NotFoundException
-
-from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
-from test.integration.connectors.utils.validation.destination import (
-    StagerValidationConfigs,
-    stager_validation,
-)
-from test.integration.utils import requires_env
-from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
-from unstructured_ingest.error import DestinationConnectionError
-from unstructured_ingest.logger import logger
-from unstructured_ingest.processes.connectors.pinecone import (
-    CONNECTOR_TYPE,
-    MAX_QUERY_RESULTS,
-    PineconeAccessConfig,
-    PineconeConnectionConfig,
-    PineconeUploader,
-    PineconeUploaderConfig,
-    PineconeUploadStager,
-    PineconeUploadStagerConfig,
-)
-
-METADATA_BYTES_LIMIT = (
-    40960  # 40KB https://docs.pinecone.io/reference/quotas-and-limits#hard-limits
-)
-VECTOR_DIMENSION = 384
-SPEC = {"serverless": {"cloud": "aws", "region": "us-east-1"}}
-ALLOWED_METADATA_FIELD = "text"
-API_KEY = "PINECONE_API_KEY"
-
-
-def get_api_key() -> str:
-    api_key = os.getenv(API_KEY, None)
-    assert api_key
-    return api_key
-
-
-def wait_for_delete(client: Pinecone, index_name: str, timeout=60, interval=1) -> None:
-    start = time.time()
-    while True and time.time() - start < timeout:
-        try:
-            description = client.describe_index(name=index_name)
-            logger.info(f"current index status: {description}")
-        except NotFoundException:
-            return
-        time.sleep(interval)
-
-    raise TimeoutError("time out waiting for index to delete")
-
-
-def wait_for_ready(client: Pinecone, index_name: str, timeout=60, interval=1) -> None:
-    def is_ready_status():
-        description = client.describe_index(name=index_name)
-        status = description["status"]
-        return status["ready"]
-
-    start = time.time()
-    is_ready = is_ready_status()
-    while not is_ready and time.time() - start < timeout:
-        time.sleep(interval)
-        is_ready = is_ready_status()
-    if not is_ready:
-        raise TimeoutError("time out waiting for index to be ready")
-
-
-@pytest.fixture
-def pinecone_index() -> Generator[str, None, None]:
-    pinecone = Pinecone(api_key=get_api_key())
-    random_id = str(uuid4()).split("-")[0]
-    index_name = f"ingest-test-{random_id}"
-    assert len(index_name) < 45
-    logger.info(f"Creating index: {index_name}")
-    try:
-        pinecone.create_index(
-            name=index_name,
-            dimension=384,
-            metric="cosine",
-            spec=ServerlessSpec(
-                cloud="aws",
-                region="us-east-1",
-            ),
-            deletion_protection="disabled",
-        )
-        wait_for_ready(client=pinecone, index_name=index_name)
-        yield index_name
-    except Exception as e:
-        logger.error(f"failed to create index {index_name}: {e}")
-    finally:
-        try:
-            logger.info(f"deleting index: {index_name}")
-            pinecone.delete_index(name=index_name)
-            wait_for_delete(client=pinecone, index_name=index_name)
-        except NotFoundException:
-            return
-
-
-def validate_pinecone_index(
-    index_name: str,
-    expected_num_of_vectors: int,
-    retries=30,
-    interval=1,
-    namespace: str = "default",
-) -> None:
-    # Because there's a delay for the index to catch up to the recent writes, add in a retry
-    pinecone = Pinecone(api_key=get_api_key())
-    index = pinecone.Index(name=index_name, namespace=namespace)
-    vector_count = -1
-    for i in range(retries):
-        index_stats = index.describe_index_stats()
-        vector_count = index_stats["total_vector_count"]
-        if vector_count == expected_num_of_vectors:
-            logger.info(f"expected {expected_num_of_vectors} == vector count {vector_count}")
-            break
-        logger.info(
-            f"retry attempt {i}: expected {expected_num_of_vectors} != vector count {vector_count}"
-        )
-        time.sleep(interval)
-    assert vector_count == expected_num_of_vectors, (
-        f"vector count from index ({vector_count}) doesn't "
-        f"match expected number: {expected_num_of_vectors}"
-    )
-
-
-@requires_env(API_KEY)
-@pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
-async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp_dir: Path):
-
-    file_data = FileData(
-        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
-        connector_type=CONNECTOR_TYPE,
-        identifier="pinecone_mock_id",
-    )
-
-    connection_config = PineconeConnectionConfig(
-        index_name=pinecone_index,
-        access_config=PineconeAccessConfig(api_key=get_api_key()),
-    )
-    stager_config = PineconeUploadStagerConfig()
-    stager = PineconeUploadStager(upload_stager_config=stager_config)
-    new_upload_file = stager.run(
-        elements_filepath=upload_file,
-        output_dir=temp_dir,
-        output_filename=upload_file.name,
-        file_data=file_data,
-    )
-
-    upload_config = PineconeUploaderConfig()
-    uploader = PineconeUploader(connection_config=connection_config, upload_config=upload_config)
-    uploader.precheck()
-
-    uploader.run(path=new_upload_file, file_data=file_data)
-    with new_upload_file.open() as f:
-        staged_content = json.load(f)
-    expected_num_of_vectors = len(staged_content)
-    logger.info("validating first upload")
-    validate_pinecone_index(
-        index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
-    )
-
-    # Rerun uploader and make sure no duplicates exist
-    uploader.run(path=new_upload_file, file_data=file_data)
-    logger.info("validating second upload")
-    validate_pinecone_index(
-        index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
-    )
-
-
-@requires_env(API_KEY)
-@pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
-@pytest.mark.skip(reason="TODO: get this to work")
-async def test_pinecone_destination_large_index(
-    pinecone_index: str, upload_file: Path, temp_dir: Path
-):
-    new_file = temp_dir / "large_file.json"
-    with upload_file.open() as f:
-        upload_content = json.load(f)
-
-    min_entries = math.ceil((MAX_QUERY_RESULTS * 2) / len(upload_content))
-    new_content = (upload_content * min_entries)[: (2 * MAX_QUERY_RESULTS)]
-    print(f"Creating large index content with {len(new_content)} records")
-    with new_file.open("w") as f:
-        json.dump(new_content, f)
-
-    expected_num_of_vectors = len(new_content)
-    file_data = FileData(
-        source_identifiers=SourceIdentifiers(fullpath=new_file.name, filename=new_file.name),
-        connector_type=CONNECTOR_TYPE,
-        identifier="pinecone_mock_id",
-    )
-    connection_config = PineconeConnectionConfig(
-        index_name=pinecone_index,
-        access_config=PineconeAccessConfig(api_key=get_api_key()),
-    )
-    stager_config = PineconeUploadStagerConfig()
-    stager = PineconeUploadStager(upload_stager_config=stager_config)
-    new_upload_file = stager.run(
-        elements_filepath=new_file,
-        output_dir=temp_dir,
-        output_filename=new_file.name,
-        file_data=file_data,
-    )
-
-    upload_config = PineconeUploaderConfig()
-    uploader = PineconeUploader(connection_config=connection_config, upload_config=upload_config)
-    uploader.precheck()
-
-    uploader.run(path=new_upload_file, file_data=file_data)
-    validate_pinecone_index(
-        index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
-    )
-    # Rerun uploader and make sure no duplicates exist
-    uploader.run(path=new_upload_file, file_data=file_data)
-    logger.info("validating second upload")
-    validate_pinecone_index(
-        index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
-    )
-
-
-@requires_env(API_KEY)
-@pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
-async def test_pinecone_destination_namespace(
-    pinecone_index: str, upload_file: Path, temp_dir: Path
-):
-    """
-    tests namespace functionality of destination connector.
-    """
-
-    # creates a file data structure.
-    file_data = FileData(
-        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
-        connector_type=CONNECTOR_TYPE,
-        identifier="pinecone_mock_id",
-    )
-
-    connection_config = PineconeConnectionConfig(
-        index_name=pinecone_index,
-        access_config=PineconeAccessConfig(api_key=get_api_key()),
-    )
-
-    stager_config = PineconeUploadStagerConfig()
-
-    stager = PineconeUploadStager(upload_stager_config=stager_config)
-    new_upload_file = stager.run(
-        elements_filepath=upload_file,
-        output_dir=temp_dir,
-        output_filename=upload_file.name,
-        file_data=file_data,
-    )
-
-    # here add namespace defintion
-    upload_config = PineconeUploaderConfig()
-    namespace_test_name = "user-1"
-    upload_config.namespace = namespace_test_name
-    uploader = PineconeUploader(connection_config=connection_config, upload_config=upload_config)
-    uploader.precheck()
-
-    uploader.run(path=new_upload_file, file_data=file_data)
-    with new_upload_file.open() as f:
-        staged_content = json.load(f)
-    expected_num_of_vectors = len(staged_content)
-    logger.info("validating first upload")
-    validate_pinecone_index(
-        index_name=pinecone_index,
-        expected_num_of_vectors=expected_num_of_vectors,
-        namespace=namespace_test_name,
-    )
-
-    # Rerun uploader and make sure no duplicates exist
-    uploader.run(path=new_upload_file, file_data=file_data)
-    logger.info("validating second upload")
-    validate_pinecone_index(
-        index_name=pinecone_index,
-        expected_num_of_vectors=expected_num_of_vectors,
-        namespace=namespace_test_name,
-    )
-
-
-@requires_env(API_KEY)
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
-def test_large_metadata(pinecone_index: str, tmp_path: Path, upload_file: Path):
-    stager = PineconeUploadStager()
-    uploader = PineconeUploader(
-        connection_config=PineconeConnectionConfig(
-            access_config=PineconeAccessConfig(api_key=get_api_key()),
-            index_name=pinecone_index,
-        ),
-        upload_config=PineconeUploaderConfig(),
-    )
-    large_metadata_upload_file = tmp_path / "mock-upload-file.pdf.json"
-    large_metadata = {ALLOWED_METADATA_FIELD: "0" * 2 * METADATA_BYTES_LIMIT}
-
-    with open(upload_file) as file:
-        elements = json.load(file)
-
-    with open(large_metadata_upload_file, "w") as file:
-        mock_element = elements[0]
-        mock_element["metadata"] = large_metadata
-        json.dump([mock_element], file)
-
-    file_data = FileData(
-        source_identifiers=SourceIdentifiers(
-            fullpath=large_metadata_upload_file.name, filename=large_metadata_upload_file.name
-        ),
-        connector_type=CONNECTOR_TYPE,
-        identifier="mock-file-data",
-    )
-    staged_file = stager.run(
-        elements_filepath=large_metadata_upload_file,
-        file_data=file_data,
-        output_dir=tmp_path,
-        output_filename=large_metadata_upload_file.name,
-    )
-    try:
-        uploader.run(staged_file, file_data)
-    except DestinationConnectionError as e:
-        error_line = r"Metadata size is \d+ bytes, which exceeds the limit of \d+ bytes per vector"
-        if re.search(re.compile(error_line), str(e)) is None:
-            raise e
-        raise pytest.fail("Upload request failed due to metadata exceeding limits.")
-
-    validate_pinecone_index(pinecone_index, 1, interval=5)
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
-@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
-def test_pinecone_stager(
-    request: TopRequest,
-    upload_file_str: str,
-    tmp_path: Path,
-):
-    upload_file: Path = request.getfixturevalue(upload_file_str)
-    stager = PineconeUploadStager()
-    stager_validation(
-        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
-        input_file=upload_file,
-        stager=stager,
-        tmp_dir=tmp_path,
-    )
-
-
-@requires_env(API_KEY)
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
-def test_pinecone_create_destination(pinecone_index):
-    uploader = PineconeUploader(
-        connection_config=PineconeConnectionConfig(
-            access_config=PineconeAccessConfig(api_key=get_api_key())
-        ),
-        upload_config=PineconeUploaderConfig(),
-    )
-
-    random_id = str(uuid4()).split("-")[0]
-
-    index_name = f"test-create-destination-{random_id}"
-
-    assert not uploader.index_exists(index_name=index_name)
-
-    try:
-        uploader.create_destination(destination_name=index_name, vector_length=1536)
-    except Exception as e:
-        error_body = getattr(e, "body", None)
-        raise pytest.fail(f"failed to create destination: {e} {error_body}")
-
-    assert uploader.index_exists(index_name=index_name), "destination was not created successfully"
-
-    try:
-        pc = uploader.connection_config.get_client()
-        logger.info(f"deleting index for test create destination: {index_name}")
-        pc.delete_index(name=index_name)
-    except Exception as e:
-        raise pytest.fail(f"failed to cleanup / delete the destination: {e}")
-
-    assert not uploader.index_exists(index_name=index_name), "cleanup failed"
--- a/test/integration/connectors/test_qdrant.py
+++ /dev/null
@@ -1,216 +0,0 @@
-import json
-import os
-import uuid
-from contextlib import asynccontextmanager
-from pathlib import Path
-from typing import AsyncGenerator
-
-import pytest
-from _pytest.fixtures import TopRequest
-from qdrant_client import AsyncQdrantClient
-
-from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
-from test.integration.connectors.utils.docker import container_context
-from test.integration.connectors.utils.validation.destination import (
-    StagerValidationConfigs,
-    stager_validation,
-)
-from test.integration.utils import requires_env
-from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
-from unstructured_ingest.processes.connectors.qdrant.cloud import (
-    CloudQdrantAccessConfig,
-    CloudQdrantConnectionConfig,
-    CloudQdrantUploader,
-    CloudQdrantUploaderConfig,
-    CloudQdrantUploadStager,
-    CloudQdrantUploadStagerConfig,
-)
-from unstructured_ingest.processes.connectors.qdrant.local import (
-    CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE,
-)
-from unstructured_ingest.processes.connectors.qdrant.local import (
-    LocalQdrantConnectionConfig,
-    LocalQdrantUploader,
-    LocalQdrantUploaderConfig,
-    LocalQdrantUploadStager,
-    LocalQdrantUploadStagerConfig,
-)
-from unstructured_ingest.processes.connectors.qdrant.server import (
-    CONNECTOR_TYPE as SERVER_CONNECTOR_TYPE,
-)
-from unstructured_ingest.processes.connectors.qdrant.server import (
-    ServerQdrantConnectionConfig,
-    ServerQdrantUploader,
-    ServerQdrantUploaderConfig,
-    ServerQdrantUploadStager,
-    ServerQdrantUploadStagerConfig,
-)
-
-COLLECTION_NAME = f"test-coll-{uuid.uuid4().hex[:12]}"
-VECTORS_CONFIG = {"size": 384, "distance": "Cosine"}
-
-
-@asynccontextmanager
-async def qdrant_client(client_params: dict) -> AsyncGenerator[AsyncQdrantClient, None]:
-    client = AsyncQdrantClient(**client_params)
-    try:
-        yield client
-    finally:
-        await client.close()
-
-
-async def validate_upload(client: AsyncQdrantClient, upload_file: Path):
-    with upload_file.open() as upload_fp:
-        elements = json.load(upload_fp)
-    expected_point_count = len(elements)
-    first_element = elements[0]
-    expected_text = first_element["text"]
-    embeddings = first_element["embeddings"]
-    collection = await client.get_collection(COLLECTION_NAME)
-    assert collection.points_count == expected_point_count
-
-    response = await client.query_points(COLLECTION_NAME, query=embeddings, limit=1)
-    assert response.points[0].payload is not None
-    assert response.points[0].payload["text"] == expected_text
-
-
-@pytest.mark.asyncio
-@pytest.mark.tags(LOCAL_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant", VECTOR_DB_TAG)
-async def test_qdrant_destination_local(upload_file: Path, tmp_path: Path):
-    connection_kwargs = {"path": str(tmp_path / "qdrant")}
-    async with qdrant_client(connection_kwargs) as client:
-        await client.create_collection(COLLECTION_NAME, vectors_config=VECTORS_CONFIG)
-    AsyncQdrantClient(**connection_kwargs)
-    stager = LocalQdrantUploadStager(
-        upload_stager_config=LocalQdrantUploadStagerConfig(),
-    )
-    uploader = LocalQdrantUploader(
-        connection_config=LocalQdrantConnectionConfig(**connection_kwargs),
-        upload_config=LocalQdrantUploaderConfig(collection_name=COLLECTION_NAME),
-    )
-
-    file_data = FileData(
-        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
-        connector_type=LOCAL_CONNECTOR_TYPE,
-        identifier="mock-file-data",
-    )
-
-    staged_upload_file = stager.run(
-        elements_filepath=upload_file,
-        file_data=file_data,
-        output_dir=tmp_path,
-        output_filename=upload_file.name,
-    )
-
-    if uploader.is_async():
-        await uploader.run_async(path=staged_upload_file, file_data=file_data)
-    else:
-        uploader.run(path=upload_file, file_data=file_data)
-    async with qdrant_client(connection_kwargs) as client:
-        await validate_upload(client=client, upload_file=upload_file)
-
-
-@pytest.fixture
-def docker_context():
-    with container_context(image="qdrant/qdrant:latest", ports={"6333": "6333"}) as container:
-        yield container
-
-
-@pytest.mark.asyncio
-@pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant", VECTOR_DB_TAG)
-async def test_qdrant_destination_server(upload_file: Path, tmp_path: Path, docker_context):
-    connection_kwargs = {"location": "http://localhost:6333"}
-    async with qdrant_client(connection_kwargs) as client:
-        await client.create_collection(COLLECTION_NAME, vectors_config=VECTORS_CONFIG)
-    AsyncQdrantClient(**connection_kwargs)
-    stager = ServerQdrantUploadStager(
-        upload_stager_config=ServerQdrantUploadStagerConfig(),
-    )
-    uploader = ServerQdrantUploader(
-        connection_config=ServerQdrantConnectionConfig(**connection_kwargs),
-        upload_config=ServerQdrantUploaderConfig(collection_name=COLLECTION_NAME),
-    )
-
-    file_data = FileData(
-        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
-        connector_type=SERVER_CONNECTOR_TYPE,
-        identifier="mock-file-data",
-    )
-
-    staged_upload_file = stager.run(
-        elements_filepath=upload_file,
-        file_data=file_data,
-        output_dir=tmp_path,
-        output_filename=upload_file.name,
-    )
-    uploader.precheck()
-    if uploader.is_async():
-        await uploader.run_async(path=staged_upload_file, file_data=file_data)
-    else:
-        uploader.run(path=upload_file, file_data=file_data)
-    async with qdrant_client(connection_kwargs) as client:
-        await validate_upload(client=client, upload_file=upload_file)
-
-
-@pytest.mark.asyncio
-@pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant", VECTOR_DB_TAG)
-@requires_env("QDRANT_API_KEY", "QDRANT_SERVER_URL")
-async def test_qdrant_destination_cloud(upload_file: Path, tmp_path: Path):
-    server_url = os.environ["QDRANT_SERVER_URL"]
-    api_key = os.environ["QDRANT_API_KEY"]
-    connection_kwargs = {"location": server_url, "api_key": api_key}
-    async with qdrant_client(connection_kwargs) as client:
-        await client.create_collection(COLLECTION_NAME, vectors_config=VECTORS_CONFIG)
-    AsyncQdrantClient(**connection_kwargs)
-
-    stager = CloudQdrantUploadStager(
-        upload_stager_config=CloudQdrantUploadStagerConfig(),
-    )
-    uploader = CloudQdrantUploader(
-        connection_config=CloudQdrantConnectionConfig(
-            url=server_url,
-            access_config=CloudQdrantAccessConfig(
-                api_key=api_key,
-            ),
-        ),
-        upload_config=CloudQdrantUploaderConfig(collection_name=COLLECTION_NAME),
-    )
-
-    file_data = FileData(
-        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
-        connector_type=SERVER_CONNECTOR_TYPE,
-        identifier="mock-file-data",
-    )
-
-    staged_upload_file = stager.run(
-        elements_filepath=upload_file,
-        file_data=file_data,
-        output_dir=tmp_path,
-        output_filename=upload_file.name,
-    )
-    uploader.precheck()
-    if uploader.is_async():
-        await uploader.run_async(path=staged_upload_file, file_data=file_data)
-    else:
-        uploader.run(path=staged_upload_file, file_data=file_data)
-    async with qdrant_client(connection_kwargs) as client:
-        await validate_upload(client=client, upload_file=upload_file)
-
-
-@pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant", VECTOR_DB_TAG)
-@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
-def test_qdrant_stager(
-    request: TopRequest,
-    upload_file_str: str,
-    tmp_path: Path,
-):
-    upload_file: Path = request.getfixturevalue(upload_file_str)
-    stager = LocalQdrantUploadStager(
-        upload_stager_config=LocalQdrantUploadStagerConfig(),
-    )
-    stager_validation(
-        configs=StagerValidationConfigs(test_id=LOCAL_CONNECTOR_TYPE, expected_count=22),
-        input_file=upload_file,
-        stager=stager,
-        tmp_dir=tmp_path,
-    )