unstructured-ingest 0.3.7__py3-none-any.whl → 0.3.9__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to their registry. It is provided for informational purposes only.
Potentially problematic release: this version of unstructured-ingest has been flagged; details are available on the registry's advisory page.
- test/integration/chunkers/test_chunkers.py +0 -11
- test/integration/connectors/conftest.py +11 -1
- test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +51 -44
- test/integration/connectors/duckdb/test_motherduck.py +37 -48
- test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
- test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
- test/integration/connectors/sql/test_postgres.py +102 -91
- test/integration/connectors/sql/test_singlestore.py +111 -99
- test/integration/connectors/sql/test_snowflake.py +142 -117
- test/integration/connectors/sql/test_sqlite.py +86 -75
- test/integration/connectors/test_astradb.py +22 -1
- test/integration/connectors/test_azure_ai_search.py +25 -3
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +4 -4
- test/integration/connectors/test_delta_table.py +1 -0
- test/integration/connectors/test_kafka.py +4 -4
- test/integration/connectors/test_milvus.py +21 -0
- test/integration/connectors/test_mongodb.py +3 -3
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_pinecone.py +25 -1
- test/integration/connectors/test_qdrant.py +25 -2
- test/integration/connectors/test_s3.py +9 -6
- test/integration/connectors/utils/docker.py +6 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +88 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/{validation.py → validation/source.py} +15 -91
- test/integration/connectors/utils/validation/utils.py +36 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/chunking.py +11 -0
- unstructured_ingest/utils/data_prep.py +36 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
- unstructured_ingest/v2/interfaces/uploader.py +11 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +3 -1
- unstructured_ingest/v2/processes/connectors/astradb.py +8 -30
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
- unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
- unstructured_ingest/v2/processes/connectors/couchbase.py +42 -52
- unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +5 -30
- unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
- unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
- unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
- unstructured_ingest/v2/processes/connectors/local.py +13 -2
- unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
- unstructured_ingest/v2/processes/connectors/mongodb.py +4 -8
- unstructured_ingest/v2/processes/connectors/neo4j.py +381 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +23 -65
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
- unstructured_ingest/v2/processes/connectors/sql/sql.py +41 -40
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
- {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/METADATA +21 -17
- {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/RECORD +64 -56
- {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/top_level.txt +0 -0
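The headline changes: the integration-test validation helpers move from a single `validation.py` module into a `validation/` package split by direction (`source.py` for indexer/downloader checks, `destination.py` with a new `stager_validation` helper, plus `equality.py` and `utils.py`), a Neo4j destination connector lands (`neo4j.py`, +381 lines, with a 236-line integration test), and most destination connector tests gain a parametrized stager test. Hunks from the test-suite changes follow; removed lines whose content is not visible in this diff view appear as `-    …`.

As a sketch of the import migration that every affected test file repeats (the new module paths and names are taken verbatim from the hunks below; the 0.3.7 import is shown truncated because the removed line is not fully visible in this view):

    # 0.3.7: one flat module
    # from test.integration.connectors.utils.validation import (…)

    # 0.3.9: direction-specific modules
    from test.integration.connectors.utils.validation.destination import (
        StagerValidationConfigs,
        stager_validation,
    )
    from test.integration.connectors.utils.validation.source import (
        SourceValidationConfigs,
        source_connector_validation,
    )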
test/integration/connectors/sql/test_sqlite.py:

@@ -1,14 +1,18 @@
+import json
 import sqlite3
 import tempfile
-from contextlib import contextmanager
 from pathlib import Path
 
-import pandas as pd
 import pytest
+from _pytest.fixtures import TopRequest
 
 from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
-from test.integration.connectors.utils.validation import (
-    …
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
     source_connector_validation,
 )
 from unstructured_ingest.v2.interfaces import FileData

@@ -26,8 +30,8 @@ from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
 SEED_DATA_ROWS = 20
 
 
-@…
-def …
+@pytest.fixture
+def source_database_setup() -> Path:
     with tempfile.TemporaryDirectory() as tmpdir:
         db_path = Path(tmpdir) / "mock_database.db"
         db_init_path = env_setup_path / "sql" / "sqlite" / "source" / "sqlite-schema.sql"

@@ -49,49 +53,42 @@ def sqlite_download_setup() -> Path:
 
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
-async def test_sqlite_source():
-    …
-    ),
-    )
-
-
-@contextmanager
-def sqlite_upload_setup() -> Path:
+async def test_sqlite_source(source_database_setup: Path, temp_dir: Path):
+    connection_config = SQLiteConnectionConfig(database_path=source_database_setup)
+    indexer = SQLiteIndexer(
+        connection_config=connection_config,
+        index_config=SQLiteIndexerConfig(table_name="cars", id_column="car_id", batch_size=5),
+    )
+    downloader = SQLiteDownloader(
+        connection_config=connection_config,
+        download_config=SQLiteDownloaderConfig(fields=["car_id", "brand"], download_dir=temp_dir),
+    )
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=SourceValidationConfigs(
+            test_id="sqlite",
+            expected_num_files=SEED_DATA_ROWS,
+            expected_number_indexed_file_data=4,
+            validate_downloaded_files=True,
+        ),
+    )
+
+
+@pytest.fixture
+def destination_database_setup(temp_dir: Path) -> Path:
     # Provision the local file that sqlite points to to have the desired schema for the integration
     # tests and make sure the file and connection get cleaned up by using a context manager.
-    …
-    with …
-    …
-        yield db_path
+    db_path = temp_dir / "elements.db"
+    db_init_path = env_setup_path / "sql" / "sqlite" / "destination" / "sqlite-schema.sql"
+    assert db_init_path.exists()
+    assert db_init_path.is_file()
+    with sqlite3.connect(database=db_path) as sqlite_connection:
+        with db_init_path.open("r") as f:
+            query = f.read()
+        cursor = sqlite_connection.cursor()
+        cursor.executescript(query)
+    return db_path
 
 
 def validate_destination(db_path: Path, expected_num_elements: int):

@@ -114,34 +111,48 @@ def validate_destination(db_path: Path, expected_num_elements: int):
 
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
-async def test_sqlite_destination(
+async def test_sqlite_destination(
+    upload_file: Path, temp_dir: Path, destination_database_setup: Path
+):
     # the sqlite destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
     mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
-    …
+    stager = SQLiteUploadStager()
+    staged_path = stager.run(
+        elements_filepath=upload_file,
+        file_data=mock_file_data,
+        output_dir=temp_dir,
+        output_filename=upload_file.name,
+    )
+
+    # The stager should append the `.json` suffix to the output filename passed in.
+    assert staged_path.suffix == upload_file.suffix
+
+    uploader = SQLiteUploader(
+        connection_config=SQLiteConnectionConfig(database_path=destination_database_setup)
+    )
+    uploader.precheck()
+    uploader.run(path=staged_path, file_data=mock_file_data)
+
+    with staged_path.open("r") as f:
+        staged_data = json.load(f)
+    validate_destination(db_path=destination_database_setup, expected_num_elements=len(staged_data))
+
+    uploader.run(path=staged_path, file_data=mock_file_data)
+    validate_destination(db_path=destination_database_setup, expected_num_elements=len(staged_data))
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_sqlite_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = SQLiteUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )
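One pytest idiom recurs in all the new stager tests: the test is parametrized over fixture names and resolves the actual fixture at runtime with `request.getfixturevalue`, so a single body covers both the JSON and NDJSON upload files. A minimal self-contained sketch of the pattern, with illustrative stand-in fixtures rather than the repo's real `upload_file`/`upload_file_ndjson`:

    import pytest

    @pytest.fixture
    def upload_file(tmp_path):
        # Stand-in for the repo's JSON elements fixture.
        path = tmp_path / "elements.json"
        path.write_text("[]")
        return path

    @pytest.fixture
    def upload_file_ndjson(tmp_path):
        # Stand-in for the repo's newline-delimited JSON fixture.
        path = tmp_path / "elements.ndjson"
        path.write_text("")
        return path

    @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
    def test_stager(request, upload_file_str):
        # Resolve the fixture by name at runtime; parametrize can't take
        # fixtures directly, so the test receives the name and looks it up.
        upload_file = request.getfixturevalue(upload_file_str)
        assert upload_file.exists()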
test/integration/connectors/test_astradb.py:

@@ -5,10 +5,15 @@ from pathlib import Path
 from uuid import uuid4
 
 import pytest
+from _pytest.fixtures import TopRequest
 from astrapy import Collection
 from astrapy import DataAPIClient as AstraDBClient
 
 from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
 from test.integration.utils import requires_env
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.astradb import (

@@ -108,7 +113,7 @@ def collection(upload_file: Path) -> Collection:
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
 @requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
-async def …
+async def test_astra_search_destination(
     upload_file: Path,
     collection: Collection,
     tmp_path: Path,

@@ -154,3 +159,19 @@ async def test_azure_ai_search_destination(
         f"Expected count ({expected_count}) doesn't match how "
         f"much came back from collection: {current_count}"
     )
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_astra_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = AstraDBUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )
test/integration/connectors/test_azure_ai_search.py:

@@ -5,6 +5,7 @@ from pathlib import Path
 from uuid import uuid4
 
 import pytest
+from _pytest.fixtures import TopRequest
 from azure.core.credentials import AzureKeyCredential
 from azure.search.documents import SearchClient
 from azure.search.documents.indexes import SearchIndexClient

@@ -25,6 +26,10 @@ from azure.search.documents.indexes.models import (
 from test.integration.connectors.utils.constants import (
     DESTINATION_TAG,
 )
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
 from test.integration.utils import requires_env
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.azure_ai_search import (

@@ -225,9 +230,26 @@ async def test_azure_ai_search_destination(
     with staged_filepath.open() as f:
         staged_elements = json.load(f)
     expected_count = len(staged_elements)
-    …
-    …
+    with uploader.connection_config.get_search_client() as search_client:
+        validate_count(search_client=search_client, expected_count=expected_count)
 
     # Rerun and make sure the same documents get updated
     uploader.run(path=staged_filepath, file_data=file_data)
-    …
+    with uploader.connection_config.get_search_client() as search_client:
+        validate_count(search_client=search_client, expected_count=expected_count)
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_azure_ai_search_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = AzureAISearchUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )
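Beyond the added stager test, the substantive Azure AI Search change is that the search client is now acquired per check through `uploader.connection_config.get_search_client()` and used as a context manager, so its transport is closed after each count validation rather than held across the rerun. A hedged sketch of that shape, assuming `get_search_client()` yields an `azure.search.documents.SearchClient` (which supports the context-manager protocol and `get_document_count()`); the helper name here is illustrative:

    def assert_uploaded_count(connection_config, expected_count: int) -> None:
        # Open a fresh client, count the documents in the target index,
        # and close the client's transport on exit.
        with connection_config.get_search_client() as search_client:
            assert search_client.get_document_count() == expected_count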
test/integration/connectors/test_chroma.py (new file):

@@ -0,0 +1,120 @@
+import json
+from pathlib import Path
+
+import chromadb
+import pytest
+from _pytest.fixtures import TopRequest
+
+from test.integration.connectors.utils.constants import (
+    DESTINATION_TAG,
+)
+from test.integration.connectors.utils.docker import HealthCheck, container_context
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.chroma import (
+    CONNECTOR_TYPE,
+    ChromaConnectionConfig,
+    ChromaUploader,
+    ChromaUploaderConfig,
+    ChromaUploadStager,
+    ChromaUploadStagerConfig,
+)
+
+
+@pytest.fixture
+def chroma_instance():
+    with container_context(
+        image="chromadb/chroma:latest",
+        ports={8000: 8000},
+        name="chroma_int_test",
+        healthcheck=HealthCheck(
+            interval=5,
+            timeout=10,
+            retries=3,
+            test="timeout 10s bash -c ':> /dev/tcp/127.0.0.1/8000' || exit 1",
+        ),
+    ) as ctx:
+        yield ctx
+
+
+def validate_collection(collection_name: str, num_embeddings: int):
+    print(f"Checking contents of Chroma collection: {collection_name}")
+
+    chroma_client = chromadb.HttpClient(
+        host="localhost",
+        port="8000",
+        tenant="default_tenant",
+        database="default_database",
+    )
+
+    collection = chroma_client.get_or_create_collection(name=collection_name)
+
+    number_of_embeddings = collection.count()
+    expected_embeddings = num_embeddings
+    print(
+        f"# of embeddings in collection vs expected: {number_of_embeddings}/{expected_embeddings}"
+    )
+
+    assert number_of_embeddings == expected_embeddings, (
+        f"Number of rows in generated table ({number_of_embeddings}) "
+        f"doesn't match expected value: {expected_embeddings}"
+    )
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+def test_chroma_destination(
+    upload_file: Path,
+    chroma_instance,
+    tmp_path: Path,
+):
+    collection_name = "test_collection"
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=CONNECTOR_TYPE,
+        identifier="mock file data",
+    )
+    stager = ChromaUploadStager(upload_stager_config=ChromaUploadStagerConfig())
+
+    uploader = ChromaUploader(
+        connection_config=ChromaConnectionConfig(
+            host="localhost",
+            port=8000,
+            tenant="default_tenant",
+            database="default_database",
+        ),
+        upload_config=ChromaUploaderConfig(collection_name=collection_name),
+    )
+    staged_filepath = stager.run(
+        elements_filepath=upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=upload_file.name,
+    )
+    uploader.precheck()
+    uploader.run(path=staged_filepath, file_data=file_data)
+
+    # Run validation
+    with staged_filepath.open() as f:
+        staged_elements = json.load(f)
+    expected_count = len(staged_elements)
+    validate_collection(collection_name=collection_name, num_embeddings=expected_count)
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "stager")
+def test_chroma_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = ChromaUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )
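The new Chroma fixture health-checks the container with bash's `/dev/tcp` redirection (`timeout 10s bash -c ':> /dev/tcp/127.0.0.1/8000' || exit 1`), which passes as soon as anything accepts a TCP connection on port 8000 and needs no HTTP client inside the image. If a test ever needed the same probe from the host side, a Python equivalent might look like this (host and port mirror the fixture's values; the helper name is illustrative):

    import socket
    import time

    def wait_for_port(host: str, port: int, timeout: float = 10.0) -> None:
        # Poll until a TCP connect succeeds, mirroring ':> /dev/tcp/host/port'.
        deadline = time.monotonic() + timeout
        while True:
            try:
                with socket.create_connection((host, port), timeout=1.0):
                    return
            except OSError:
                if time.monotonic() >= deadline:
                    raise TimeoutError(f"{host}:{port} not accepting connections")
                time.sleep(0.5)

    wait_for_port("localhost", 8000)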
test/integration/connectors/test_confluence.py:

@@ -5,8 +5,8 @@ import pytest
 from test.integration.connectors.utils.constants import (
     SOURCE_TAG,
 )
-from test.integration.connectors.utils.validation import (
-    …
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
     source_connector_validation,
 )
 from test.integration.utils import requires_env

@@ -60,7 +60,7 @@ async def test_confluence_source(temp_dir):
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=…
+        configs=SourceValidationConfigs(
             test_id="confluence",
             expected_num_files=11,
             validate_downloaded_files=True,

@@ -107,7 +107,7 @@ async def test_confluence_source_large(temp_dir):
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=…
+        configs=SourceValidationConfigs(
             test_id="confluence_large", expected_num_files=250, validate_file_data=False
         ),
     )
test/integration/connectors/test_kafka.py:

@@ -14,8 +14,8 @@ from test.integration.connectors.utils.constants import (
     env_setup_path,
 )
 from test.integration.connectors.utils.docker_compose import docker_compose_context
-from test.integration.connectors.utils.validation import (
-    …
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
     source_connector_validation,
 )
 from test.integration.utils import requires_env

@@ -121,7 +121,7 @@ async def test_kafka_source_local(kafka_seed_topic: str):
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=…
+        configs=SourceValidationConfigs(
             test_id="kafka", expected_num_files=5, validate_downloaded_files=True
         ),
     )

@@ -203,7 +203,7 @@ async def test_kafka_source_cloud(kafka_seed_topic_cloud: int):
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=…
+        configs=SourceValidationConfigs(
             test_id="kafka",
             exclude_fields_extend=["connector_type"],
             expected_num_files=expected_messages,
test/integration/connectors/test_milvus.py:

@@ -4,6 +4,7 @@ from pathlib import Path
 
 import docker
 import pytest
+from _pytest.fixtures import TopRequest
 from pymilvus import (
     CollectionSchema,
     DataType,

@@ -15,6 +16,10 @@ from pymilvus.milvus_client import IndexParams
 from test.integration.connectors.utils.constants import DESTINATION_TAG, env_setup_path
 from test.integration.connectors.utils.docker import healthcheck_wait
 from test.integration.connectors.utils.docker_compose import docker_compose_context
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.milvus import (

@@ -167,3 +172,19 @@ def test_precheck_fails_on_nonexistent_collection(collection: str):
         match=f"Collection '{NONEXISTENT_COLLECTION_NAME}' does not exist",
     ):
         uploader.precheck()
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_milvus_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = MilvusUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )
test/integration/connectors/test_mongodb.py:

@@ -14,8 +14,8 @@ from pymongo.mongo_client import MongoClient
 from pymongo.operations import SearchIndexModel
 
 from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
-from test.integration.connectors.utils.validation import (
-    …
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
     source_connector_validation,
 )
 from test.integration.utils import requires_env

@@ -196,7 +196,7 @@ async def test_mongodb_source(temp_dir: Path):
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=…
+        configs=SourceValidationConfigs(
             test_id=CONNECTOR_TYPE, expected_num_files=4, validate_downloaded_files=True
         ),
     )