unstructured-ingest 0.7.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest may be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/embed/mixedbreadai.py +0 -1
- unstructured_ingest/interfaces/upload_stager.py +2 -2
- unstructured_ingest/interfaces/uploader.py +3 -3
- unstructured_ingest/logger.py +2 -93
- unstructured_ingest/main.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/processes/chunker.py +4 -0
- unstructured_ingest/processes/connectors/airtable.py +4 -2
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +10 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +2 -2
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/confluence.py +0 -1
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
- unstructured_ingest/processes/connectors/delta_table.py +1 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
- unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -2
- unstructured_ingest/processes/connectors/google_drive.py +0 -2
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
- unstructured_ingest/processes/connectors/kdbai.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +1 -2
- unstructured_ingest/processes/connectors/pinecone.py +0 -1
- unstructured_ingest/processes/connectors/redisdb.py +28 -24
- unstructured_ingest/processes/connectors/salesforce.py +1 -1
- unstructured_ingest/processes/connectors/slack.py +1 -2
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
- unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
- unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
- unstructured_ingest/processes/connectors/sql/sql.py +3 -4
- unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
- unstructured_ingest/processes/connectors/vectara.py +0 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
- unstructured_ingest/processes/embedder.py +2 -2
- unstructured_ingest/processes/filter.py +1 -1
- unstructured_ingest/processes/partitioner.py +4 -0
- unstructured_ingest/processes/utils/blob_storage.py +2 -2
- unstructured_ingest/unstructured_api.py +13 -8
- unstructured_ingest/utils/data_prep.py +8 -32
- unstructured_ingest/utils/string_and_date_utils.py +3 -3
- unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +54 -187
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
- examples/__init__.py +0 -0
- examples/airtable.py +0 -44
- examples/azure_cognitive_search.py +0 -55
- examples/chroma.py +0 -54
- examples/couchbase.py +0 -55
- examples/databricks_volumes_dest.py +0 -55
- examples/databricks_volumes_source.py +0 -53
- examples/delta_table.py +0 -45
- examples/discord_example.py +0 -36
- examples/elasticsearch.py +0 -49
- examples/google_drive.py +0 -45
- examples/kdbai.py +0 -54
- examples/local.py +0 -36
- examples/milvus.py +0 -44
- examples/mongodb.py +0 -53
- examples/opensearch.py +0 -50
- examples/pinecone.py +0 -57
- examples/s3.py +0 -38
- examples/salesforce.py +0 -44
- examples/sharepoint.py +0 -47
- examples/singlestore.py +0 -49
- examples/sql.py +0 -90
- examples/vectara.py +0 -54
- examples/weaviate.py +0 -44
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +0 -31
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +0 -38
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +0 -273
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +0 -90
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +0 -14
- test/integration/connectors/duckdb/test_duckdb.py +0 -90
- test/integration/connectors/duckdb/test_motherduck.py +0 -95
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +0 -34
- test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
- test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
- test/integration/connectors/sql/test_postgres.py +0 -201
- test/integration/connectors/sql/test_singlestore.py +0 -182
- test/integration/connectors/sql/test_snowflake.py +0 -244
- test/integration/connectors/sql/test_sqlite.py +0 -168
- test/integration/connectors/sql/test_vastdb.py +0 -34
- test/integration/connectors/test_astradb.py +0 -287
- test/integration/connectors/test_azure_ai_search.py +0 -254
- test/integration/connectors/test_chroma.py +0 -136
- test/integration/connectors/test_confluence.py +0 -111
- test/integration/connectors/test_delta_table.py +0 -183
- test/integration/connectors/test_dropbox.py +0 -151
- test/integration/connectors/test_github.py +0 -49
- test/integration/connectors/test_google_drive.py +0 -257
- test/integration/connectors/test_jira.py +0 -67
- test/integration/connectors/test_lancedb.py +0 -247
- test/integration/connectors/test_milvus.py +0 -208
- test/integration/connectors/test_mongodb.py +0 -335
- test/integration/connectors/test_neo4j.py +0 -244
- test/integration/connectors/test_notion.py +0 -152
- test/integration/connectors/test_onedrive.py +0 -163
- test/integration/connectors/test_pinecone.py +0 -387
- test/integration/connectors/test_qdrant.py +0 -216
- test/integration/connectors/test_redis.py +0 -143
- test/integration/connectors/test_s3.py +0 -184
- test/integration/connectors/test_sharepoint.py +0 -222
- test/integration/connectors/test_vectara.py +0 -282
- test/integration/connectors/test_zendesk.py +0 -120
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +0 -13
- test/integration/connectors/utils/docker.py +0 -151
- test/integration/connectors/utils/docker_compose.py +0 -59
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +0 -77
- test/integration/connectors/utils/validation/equality.py +0 -76
- test/integration/connectors/utils/validation/source.py +0 -331
- test/integration/connectors/utils/validation/utils.py +0 -36
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +0 -15
- test/integration/connectors/weaviate/test_cloud.py +0 -39
- test/integration/connectors/weaviate/test_local.py +0 -152
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +0 -13
- test/integration/embedders/test_azure_openai.py +0 -57
- test/integration/embedders/test_bedrock.py +0 -103
- test/integration/embedders/test_huggingface.py +0 -24
- test/integration/embedders/test_mixedbread.py +0 -71
- test/integration/embedders/test_octoai.py +0 -75
- test/integration/embedders/test_openai.py +0 -74
- test/integration/embedders/test_togetherai.py +0 -71
- test/integration/embedders/test_vertexai.py +0 -63
- test/integration/embedders/test_voyageai.py +0 -79
- test/integration/embedders/utils.py +0 -66
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +0 -76
- test/integration/utils.py +0 -15
- test/unit/__init__.py +0 -0
- test/unit/chunkers/__init__.py +0 -0
- test/unit/chunkers/test_chunkers.py +0 -49
- test/unit/connectors/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
- test/unit/connectors/motherduck/__init__.py +0 -0
- test/unit/connectors/motherduck/test_base.py +0 -73
- test/unit/connectors/sql/__init__.py +0 -0
- test/unit/connectors/sql/test_sql.py +0 -152
- test/unit/connectors/test_confluence.py +0 -71
- test/unit/connectors/test_jira.py +0 -401
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +0 -42
- test/unit/embed/test_octoai.py +0 -27
- test/unit/embed/test_openai.py +0 -28
- test/unit/embed/test_vertexai.py +0 -25
- test/unit/embed/test_voyageai.py +0 -24
- test/unit/embedders/__init__.py +0 -0
- test/unit/embedders/test_bedrock.py +0 -36
- test/unit/embedders/test_huggingface.py +0 -48
- test/unit/embedders/test_mixedbread.py +0 -37
- test/unit/embedders/test_octoai.py +0 -35
- test/unit/embedders/test_openai.py +0 -35
- test/unit/embedders/test_togetherai.py +0 -37
- test/unit/embedders/test_vertexai.py +0 -37
- test/unit/embedders/test_voyageai.py +0 -38
- test/unit/partitioners/__init__.py +0 -0
- test/unit/partitioners/test_partitioner.py +0 -63
- test/unit/test_error.py +0 -27
- test/unit/test_html.py +0 -112
- test/unit/test_interfaces.py +0 -26
- test/unit/test_logger.py +0 -78
- test/unit/test_utils.py +0 -220
- test/unit/utils/__init__.py +0 -0
- test/unit/utils/data_generator.py +0 -32
- unstructured_ingest-0.7.1.dist-info/METADATA +0 -383
- unstructured_ingest-0.7.1.dist-info/top_level.txt +0 -3
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
|
@@ -1,208 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import time
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
import docker
|
|
6
|
-
import pytest
|
|
7
|
-
from _pytest.fixtures import TopRequest
|
|
8
|
-
from pymilvus import (
|
|
9
|
-
CollectionSchema,
|
|
10
|
-
DataType,
|
|
11
|
-
FieldSchema,
|
|
12
|
-
MilvusClient,
|
|
13
|
-
)
|
|
14
|
-
from pymilvus.milvus_client import IndexParams
|
|
15
|
-
|
|
16
|
-
from test.integration.connectors.utils.constants import (
|
|
17
|
-
DESTINATION_TAG,
|
|
18
|
-
VECTOR_DB_TAG,
|
|
19
|
-
env_setup_path,
|
|
20
|
-
)
|
|
21
|
-
from test.integration.connectors.utils.docker import healthcheck_wait
|
|
22
|
-
from test.integration.connectors.utils.docker_compose import docker_compose_context
|
|
23
|
-
from test.integration.connectors.utils.validation.destination import (
|
|
24
|
-
StagerValidationConfigs,
|
|
25
|
-
stager_validation,
|
|
26
|
-
)
|
|
27
|
-
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
28
|
-
from unstructured_ingest.error import DestinationConnectionError
|
|
29
|
-
from unstructured_ingest.processes.connectors.milvus import (
|
|
30
|
-
CONNECTOR_TYPE,
|
|
31
|
-
MilvusConnectionConfig,
|
|
32
|
-
MilvusUploader,
|
|
33
|
-
MilvusUploaderConfig,
|
|
34
|
-
MilvusUploadStager,
|
|
35
|
-
)
|
|
36
|
-
|
|
37
|
-
# Shared fixtures for the Milvus destination connector tests: the database and
# collection names created by the module-scoped fixture, plus the local
# standalone Milvus endpoint started by docker-compose.
DB_NAME = "test_database"
EXISTENT_COLLECTION_NAME = "test_collection"
NONEXISTENT_COLLECTION_NAME = "nonexistent_collection"
DB_URI = "http://localhost:19530"
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def get_schema() -> CollectionSchema:
    """Build the Milvus collection schema used by these tests.

    The schema carries an auto-generated INT64 primary key, a VARCHAR
    record id, and a 384-dimension float vector field; dynamic fields
    are enabled so extra element metadata can be stored.
    """
    fields = [
        FieldSchema(
            name="id", dtype=DataType.INT64, description="primary field", is_primary=True, auto_id=True
        ),
        FieldSchema(name="record_id", dtype=DataType.VARCHAR, max_length=64),
        FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=384),
    ]
    return CollectionSchema(enable_dynamic_field=True, fields=fields)
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
def get_index_params() -> IndexParams:
    """Return index params: AUTOINDEX/COSINE on embeddings, Trie on record_id."""
    params = IndexParams()
    params.add_index(field_name="embeddings", index_type="AUTOINDEX", metric_type="COSINE")
    params.add_index(field_name="record_id", index_type="Trie")
    return params
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
# NOTE: Precheck tests are read-only so they don't interfere with destination test,
# using scope="module" we can limit number of times the docker-compose has to be run
@pytest.fixture(scope="module")
def collection():
    """Start Milvus via docker-compose and yield a ready collection name."""
    docker_client = docker.from_env()
    with docker_compose_context(docker_compose_path=env_setup_path / "milvus"):
        # Wait for the standalone container's healthcheck before connecting.
        healthcheck_wait(container=docker_client.containers.get("milvus-standalone"))
        milvus_client = MilvusClient(uri=DB_URI)
        try:
            # Create the test database and switch the client to it.
            database_resp = milvus_client._get_connection().create_database(db_name=DB_NAME)
            milvus_client.using_database(db_name=DB_NAME)
            print(f"Created database {DB_NAME}: {database_resp}")

            # Create the collection with the test schema and indexes.
            collection_resp = milvus_client.create_collection(
                collection_name=EXISTENT_COLLECTION_NAME,
                schema=get_schema(),
                index_params=get_index_params(),
            )
            print(f"Created collection {EXISTENT_COLLECTION_NAME}: {collection_resp}")
            yield EXISTENT_COLLECTION_NAME
        finally:
            milvus_client.close()
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
def get_count(client: MilvusClient, collection_name: str = "test_collection") -> int:
    """Return the number of rows in *collection_name* via a count(*) query.

    Generalized: the collection name was previously hard-coded; it now
    defaults to the same value so existing callers are unaffected.
    """
    count_field = "count(*)"
    resp = client.query(collection_name=collection_name, output_fields=[count_field])
    # Milvus returns a single row keyed by the requested output field.
    return resp[0][count_field]
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
def validate_count(
    client: MilvusClient, expected_count: int, retries: int = 10, interval: int = 1
) -> None:
    """Poll the collection until its row count equals *expected_count*.

    Sleeps *interval* seconds between polls, up to *retries* attempts,
    then asserts on the final count.
    """
    attempts = 0
    current_count = get_count(client=client)
    while current_count != expected_count and attempts < retries:
        time.sleep(interval)
        current_count = get_count(client=client)
        attempts += 1
    assert current_count == expected_count, (
        f"Expected count ({expected_count}) doesn't match how "
        f"much came back from collection: {current_count}"
    )
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
async def test_milvus_destination(
    upload_file: Path,
    collection: str,
    tmp_path: Path,
):
    """Stage and upload a file, validate the row count, then verify an
    idempotent rerun (same documents updated, not duplicated)."""
    file_data = FileData(
        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
        connector_type=CONNECTOR_TYPE,
        identifier="mock file data",
    )
    uploader = MilvusUploader(
        connection_config=MilvusConnectionConfig(uri=DB_URI),
        upload_config=MilvusUploaderConfig(collection_name=collection, db_name=DB_NAME),
    )
    staged_path = MilvusUploadStager().run(
        elements_filepath=upload_file,
        file_data=file_data,
        output_dir=tmp_path,
        output_filename=upload_file.name,
    )
    uploader.precheck()
    uploader.run(path=staged_path, file_data=file_data)

    # Run validation: the number of staged elements is the expected row count.
    with staged_path.open() as f:
        expected_count = len(json.load(f))
    with uploader.get_client() as client:
        validate_count(client=client, expected_count=expected_count)

    # Rerun and make sure the same documents get updated
    uploader.run(path=staged_path, file_data=file_data)
    with uploader.get_client() as client:
        validate_count(client=client, expected_count=expected_count)
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
def test_precheck_succeeds(collection: str):
    """Precheck passes against an existing database and collection."""
    milvus_uploader = MilvusUploader(
        connection_config=MilvusConnectionConfig(uri=DB_URI),
        upload_config=MilvusUploaderConfig(db_name=DB_NAME, collection_name=collection),
    )
    milvus_uploader.precheck()
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
def test_precheck_fails_on_nonexistent_collection(collection: str):
    """Precheck raises a clear error when the target collection is missing."""
    uploader = MilvusUploader(
        connection_config=MilvusConnectionConfig(uri=DB_URI),
        upload_config=MilvusUploaderConfig(
            db_name=DB_NAME, collection_name=NONEXISTENT_COLLECTION_NAME
        ),
    )
    expected_message = f"Collection '{NONEXISTENT_COLLECTION_NAME}' does not exist"
    with pytest.raises(DestinationConnectionError, match=expected_message):
        uploader.precheck()
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
def test_precheck_fails_on_nonexisting_db(collection: str):
    """Precheck raises when the configured database does not exist."""
    uploader = MilvusUploader(
        connection_config=MilvusConnectionConfig(uri=DB_URI),
        upload_config=MilvusUploaderConfig(db_name="nonexisting_db", collection_name=collection),
    )
    with pytest.raises(DestinationConnectionError, match="database not found"):
        uploader.precheck()
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
def test_milvus_stager(
    request: TopRequest,
    upload_file_str: str,
    tmp_path: Path,
):
    """Stager output is validated for both NDJSON and JSON inputs."""
    # Resolve the parametrized fixture name to the actual file fixture.
    input_file: Path = request.getfixturevalue(upload_file_str)
    stager_validation(
        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
        input_file=input_file,
        stager=MilvusUploadStager(),
        tmp_dir=tmp_path,
    )
|
|
@@ -1,335 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import os
|
|
3
|
-
import time
|
|
4
|
-
import uuid
|
|
5
|
-
from contextlib import contextmanager
|
|
6
|
-
from pathlib import Path
|
|
7
|
-
from typing import Generator
|
|
8
|
-
|
|
9
|
-
import pytest
|
|
10
|
-
from pydantic import BaseModel, SecretStr
|
|
11
|
-
from pymongo.collection import Collection
|
|
12
|
-
from pymongo.database import Database
|
|
13
|
-
from pymongo.mongo_client import MongoClient
|
|
14
|
-
from pymongo.operations import SearchIndexModel
|
|
15
|
-
|
|
16
|
-
from test.integration.connectors.utils.constants import DESTINATION_TAG, NOSQL_TAG, SOURCE_TAG
|
|
17
|
-
from test.integration.connectors.utils.validation.source import (
|
|
18
|
-
SourceValidationConfigs,
|
|
19
|
-
source_connector_validation,
|
|
20
|
-
)
|
|
21
|
-
from test.integration.utils import requires_env
|
|
22
|
-
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
23
|
-
from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
|
|
24
|
-
from unstructured_ingest.processes.connectors.mongodb import (
|
|
25
|
-
CONNECTOR_TYPE,
|
|
26
|
-
MongoDBAccessConfig,
|
|
27
|
-
MongoDBConnectionConfig,
|
|
28
|
-
MongoDBDownloader,
|
|
29
|
-
MongoDBDownloaderConfig,
|
|
30
|
-
MongoDBIndexer,
|
|
31
|
-
MongoDBIndexerConfig,
|
|
32
|
-
MongoDBUploader,
|
|
33
|
-
MongoDBUploaderConfig,
|
|
34
|
-
)
|
|
35
|
-
|
|
36
|
-
# Pre-seeded MongoDB collection the source-connector tests read from.
SOURCE_COLLECTION = "sample-mongodb-data"
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
class EnvData(BaseModel):
    """MongoDB connection settings pulled from the environment."""

    # Connection string; SecretStr keeps it out of reprs/logs.
    uri: SecretStr
    # Target database name.
    database: str
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
def get_env_data() -> EnvData:
    """Read MONGODB_URI and MONGODB_DATABASE from the environment.

    Both variables must be set (tests are skipped otherwise via requires_env).
    """
    uri = os.getenv("MONGODB_URI")
    assert uri
    database = os.getenv("MONGODB_DATABASE")
    assert database
    return EnvData(uri=uri, database=database)
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
@contextmanager
def get_client() -> Generator[MongoClient, None, None]:
    """Yield a MongoClient built from the env URI, verified with a ping."""
    with MongoClient(get_env_data().uri.get_secret_value()) as client:
        # Fail fast if the cluster is unreachable.
        assert client.admin.command("ping")
        yield client
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
def wait_for_collection(
    database: Database, collection_name: str, retries: int = 10, interval: int = 1
):
    """Poll *database* until *collection_name* appears in its collection list.

    Sleeps *interval* seconds between polls, up to *retries* attempts.

    Raises:
        TimeoutError: if the collection is still absent after all retries.
    """
    collections = database.list_collection_names()
    attempts = 0
    while collection_name not in collections and attempts < retries:
        attempts += 1
        print(
            "Waiting for collection {} to be recognized: {}".format(
                collection_name, ", ".join(collections)
            )
        )
        time.sleep(interval)
        collections = database.list_collection_names()
    # Bug fix: the final check previously tested `collection_name not in
    # collection_name` — a string always contains itself, so the timeout
    # could never be raised. Check membership in the collection list.
    if collection_name not in collections:
        raise TimeoutError(f"Collection {collection_name} was not recognized")
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
def get_search_index_status(collection: Collection, index_name: str) -> str:
    """Return the status string of the named Atlas search index."""
    indexes = list(collection.list_search_indexes(name=index_name))
    # Exactly one index is expected for the given name.
    return indexes[0]["status"]
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
def wait_for_search_index(
    collection: Collection, index_name: str, retries: int = 60, interval: int = 1
):
    """Poll until the named search index reports READY.

    Raises TimeoutError if the index is still not READY after *retries* polls.
    """
    attempts = 0
    current_status = get_search_index_status(collection, index_name)
    while current_status != "READY" and attempts < retries:
        attempts += 1
        print(f"attempt {attempts}: waiting for search index to be READY: {current_status}")
        time.sleep(interval)
        current_status = get_search_index_status(collection, index_name)

    if current_status != "READY":
        raise TimeoutError("search index never detected as READY")
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
@pytest.fixture
def destination_collection() -> Collection:
    """Create a uniquely named collection with a knnVector search index.

    Yields the ready collection and drops it afterwards.
    """
    env_data = get_env_data()
    collection_name = f"utic-test-output-{uuid.uuid4()}"
    with get_client() as client:
        database = client[env_data.database]
        print(f"creating collection in database {database}: {collection_name}")
        collection = database.create_collection(name=collection_name)
        search_index_name = "embeddings"
        # 384-dim euclidean knn index over the "embeddings" field.
        index_definition = {
            "mappings": {
                "dynamic": True,
                "fields": {
                    "embeddings": [
                        {"type": "knnVector", "dimensions": 384, "similarity": "euclidean"}
                    ]
                },
            }
        }
        collection.create_search_index(
            model=SearchIndexModel(name=search_index_name, definition=index_definition)
        )
        collection.create_index("record_id")
        # Block until both the collection and its search index are usable.
        wait_for_collection(database=database, collection_name=collection_name)
        wait_for_search_index(collection=collection, index_name=search_index_name)
        try:
            yield collection
        finally:
            print(f"deleting collection: {collection_name}")
            collection.drop()
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
def validate_collection_count(
    collection: Collection, expected_records: int, retries: int = 10, interval: int = 1
) -> None:
    """Poll the collection's document count until it equals *expected_records*.

    Asserts on the final count after at most *retries* polls.
    """
    attempt = 0
    count = collection.count_documents(filter={})
    while count != expected_records and attempt < retries:
        attempt += 1
        print(f"attempt {attempt} to get count of collection {count} to match {expected_records}")
        time.sleep(interval)
        count = collection.count_documents(filter={})
    assert (
        count == expected_records
    ), f"expected count ({expected_records}) does not match how many records were found: {count}"
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
def validate_collection_vector(
    collection: Collection, embedding: list[float], text: str, retries: int = 30, interval: int = 1
) -> None:
    """Vector-search for *embedding* and assert the top hit matches *text*.

    The top hit must score exactly 1.0 and every other hit strictly less.
    Raises TimeoutError if no results appear within *retries* polls.
    """
    pipeline = [
        {
            "$vectorSearch": {
                "index": "embeddings",
                "path": "embeddings",
                "queryVector": embedding,
                "numCandidates": 150,
                "limit": 10,
            },
        },
        {"$project": {"_id": 0, "text": 1, "score": {"$meta": "vectorSearchScore"}}},
    ]
    attempts = 0
    hits = list(collection.aggregate(pipeline=pipeline))
    while not hits and attempts < retries:
        attempts += 1
        print(f"attempt {attempts}, waiting for valid results: {hits}")
        time.sleep(interval)
        hits = list(collection.aggregate(pipeline=pipeline))
    if not hits:
        raise TimeoutError("Timed out waiting for valid results")
    print(f"found results on attempt {attempts}")
    best = hits[0]
    assert best["score"] == 1.0, "score detected should be 1: {}".format(best["score"])
    assert best["text"] == text, "text detected should be {}, found: {}".format(text, best["text"])
    for other in hits[1:]:
        assert other["score"] < 1.0, "score detected should be less than 1: {}".format(
            other["score"]
        )
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
@requires_env("MONGODB_URI", "MONGODB_DATABASE")
async def test_mongodb_source(temp_dir: Path):
    """Index and download the sample collection, then validate the output."""
    env_data = get_env_data()
    connection_config = MongoDBConnectionConfig(
        access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
    )
    indexer = MongoDBIndexer(
        connection_config=connection_config,
        index_config=MongoDBIndexerConfig(database=env_data.database, collection=SOURCE_COLLECTION),
    )
    downloader = MongoDBDownloader(
        connection_config=connection_config,
        download_config=MongoDBDownloaderConfig(download_dir=temp_dir),
    )
    await source_connector_validation(
        indexer=indexer,
        downloader=downloader,
        configs=SourceValidationConfigs(
            test_id=CONNECTOR_TYPE,
            expected_num_files=4,
            validate_downloaded_files=True,
            expected_number_indexed_file_data=1,
        ),
    )
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
def test_mongodb_indexer_precheck_fail_no_host():
    """Indexer precheck fails when the URI points at an unreachable host."""
    indexer = MongoDBIndexer(
        connection_config=MongoDBConnectionConfig(
            access_config=MongoDBAccessConfig(uri="mongodb+srv://ingest-test.hgaig.mongodb"),
        ),
        index_config=MongoDBIndexerConfig(
            database="non-existent-database", collection="non-existent-database"
        ),
    )
    with pytest.raises(SourceConnectionError):
        indexer.precheck()
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
@requires_env("MONGODB_URI", "MONGODB_DATABASE")
def test_mongodb_indexer_precheck_fail_no_database():
    """Indexer precheck fails when the database does not exist."""
    env_data = get_env_data()
    indexer = MongoDBIndexer(
        connection_config=MongoDBConnectionConfig(
            access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
        ),
        index_config=MongoDBIndexerConfig(
            database="non-existent-database", collection=SOURCE_COLLECTION
        ),
    )
    with pytest.raises(SourceConnectionError):
        indexer.precheck()
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
@requires_env("MONGODB_URI", "MONGODB_DATABASE")
def test_mongodb_indexer_precheck_fail_no_collection():
    """Indexer precheck fails when the collection does not exist."""
    env_data = get_env_data()
    indexer = MongoDBIndexer(
        connection_config=MongoDBConnectionConfig(
            access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
        ),
        index_config=MongoDBIndexerConfig(
            database=env_data.database, collection="non-existent-collection"
        ),
    )
    with pytest.raises(SourceConnectionError):
        indexer.precheck()
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
@requires_env("MONGODB_URI", "MONGODB_DATABASE")
async def test_mongodb_destination(
    upload_file: Path,
    destination_collection: Collection,
    tmp_path: Path,
):
    """Upload elements, verify count and vector search, then check an
    idempotent rerun (records replaced, not duplicated)."""
    env_data = get_env_data()
    file_data = FileData(
        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
        connector_type=CONNECTOR_TYPE,
        identifier="mongodb_mock_id",
    )
    uploader = MongoDBUploader(
        connection_config=MongoDBConnectionConfig(
            access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
        ),
        upload_config=MongoDBUploaderConfig(
            database=env_data.database,
            collection=destination_collection.name,
        ),
    )
    uploader.precheck()
    uploader.run(path=upload_file, file_data=file_data)

    with upload_file.open() as f:
        staged_elements = json.load(f)
    expected_records = len(staged_elements)
    validate_collection_count(collection=destination_collection, expected_records=expected_records)
    first_element = staged_elements[0]
    validate_collection_vector(
        collection=destination_collection,
        embedding=first_element["embeddings"],
        text=first_element["text"],
    )

    # Rerun the upload and confirm the record count is unchanged.
    uploader.run(path=upload_file, file_data=file_data)
    validate_collection_count(collection=destination_collection, expected_records=expected_records)
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
def test_mongodb_uploader_precheck_fail_no_host():
    """Uploader precheck fails when the URI points at an unreachable host."""
    uploader = MongoDBUploader(
        connection_config=MongoDBConnectionConfig(
            access_config=MongoDBAccessConfig(uri="mongodb+srv://ingest-test.hgaig.mongodb"),
        ),
        upload_config=MongoDBUploaderConfig(
            database="database",
            collection="collection",
        ),
    )
    with pytest.raises(DestinationConnectionError):
        uploader.precheck()
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
@requires_env("MONGODB_URI", "MONGODB_DATABASE")
def test_mongodb_uploader_precheck_fail_no_database():
    """Uploader precheck fails when the database does not exist."""
    env_data = get_env_data()
    uploader = MongoDBUploader(
        connection_config=MongoDBConnectionConfig(
            access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
        ),
        upload_config=MongoDBUploaderConfig(
            database="database",
            collection="collection",
        ),
    )
    with pytest.raises(DestinationConnectionError):
        uploader.precheck()
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
@requires_env("MONGODB_URI", "MONGODB_DATABASE")
def test_mongodb_uploader_precheck_fail_no_collection():
    """Uploader precheck fails when the collection does not exist."""
    env_data = get_env_data()
    uploader = MongoDBUploader(
        connection_config=MongoDBConnectionConfig(
            access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
        ),
        upload_config=MongoDBUploaderConfig(
            database=env_data.database,
            collection="collection",
        ),
    )
    with pytest.raises(DestinationConnectionError):
        uploader.precheck()