unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.9__py3-none-any.whl

This diff shows the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (64)
  1. test/integration/chunkers/test_chunkers.py +0 -11
  2. test/integration/connectors/conftest.py +11 -1
  3. test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
  4. test/integration/connectors/duckdb/conftest.py +14 -0
  5. test/integration/connectors/duckdb/test_duckdb.py +51 -44
  6. test/integration/connectors/duckdb/test_motherduck.py +37 -48
  7. test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
  8. test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
  9. test/integration/connectors/sql/test_postgres.py +102 -91
  10. test/integration/connectors/sql/test_singlestore.py +111 -99
  11. test/integration/connectors/sql/test_snowflake.py +142 -117
  12. test/integration/connectors/sql/test_sqlite.py +86 -75
  13. test/integration/connectors/test_astradb.py +22 -1
  14. test/integration/connectors/test_azure_ai_search.py +25 -3
  15. test/integration/connectors/test_chroma.py +120 -0
  16. test/integration/connectors/test_confluence.py +4 -4
  17. test/integration/connectors/test_delta_table.py +1 -0
  18. test/integration/connectors/test_kafka.py +4 -4
  19. test/integration/connectors/test_milvus.py +21 -0
  20. test/integration/connectors/test_mongodb.py +3 -3
  21. test/integration/connectors/test_neo4j.py +236 -0
  22. test/integration/connectors/test_pinecone.py +25 -1
  23. test/integration/connectors/test_qdrant.py +25 -2
  24. test/integration/connectors/test_s3.py +9 -6
  25. test/integration/connectors/utils/docker.py +6 -0
  26. test/integration/connectors/utils/validation/__init__.py +0 -0
  27. test/integration/connectors/utils/validation/destination.py +88 -0
  28. test/integration/connectors/utils/validation/equality.py +75 -0
  29. test/integration/connectors/utils/{validation.py → validation/source.py} +15 -91
  30. test/integration/connectors/utils/validation/utils.py +36 -0
  31. unstructured_ingest/__version__.py +1 -1
  32. unstructured_ingest/utils/chunking.py +11 -0
  33. unstructured_ingest/utils/data_prep.py +36 -0
  34. unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
  35. unstructured_ingest/v2/interfaces/uploader.py +11 -2
  36. unstructured_ingest/v2/pipeline/steps/stage.py +3 -1
  37. unstructured_ingest/v2/processes/connectors/astradb.py +8 -30
  38. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
  39. unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
  40. unstructured_ingest/v2/processes/connectors/couchbase.py +42 -52
  41. unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
  42. unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
  43. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
  44. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
  45. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +5 -30
  46. unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
  47. unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
  48. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
  49. unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
  50. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
  51. unstructured_ingest/v2/processes/connectors/local.py +13 -2
  52. unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
  53. unstructured_ingest/v2/processes/connectors/mongodb.py +4 -8
  54. unstructured_ingest/v2/processes/connectors/neo4j.py +381 -0
  55. unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
  56. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
  57. unstructured_ingest/v2/processes/connectors/sql/sql.py +41 -40
  58. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
  59. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/METADATA +18 -14
  60. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/RECORD +64 -56
  61. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/LICENSE.md +0 -0
  62. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/WHEEL +0 -0
  63. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/entry_points.txt +0 -0
  64. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/top_level.txt +0 -0
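
The most mechanical change running through the test suite is item 29 above: test/integration/connectors/utils/validation.py is now a validation/ package (source.py, destination.py, equality.py, utils.py), and ValidationConfigs has been renamed to SourceValidationConfigs. A before/after sketch of the import migration, with paths taken from the file list (the hunks below show the same change in context):

    # 0.3.8
    from test.integration.connectors.utils.validation import (
        ValidationConfigs,
        source_connector_validation,
    )

    # 0.3.9
    from test.integration.connectors.utils.validation.source import (
        SourceValidationConfigs,
        source_connector_validation,
    )
    from test.integration.connectors.utils.validation.destination import (
        StagerValidationConfigs,
        stager_validation,
    )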
test/integration/connectors/sql/test_sqlite.py

@@ -1,14 +1,18 @@
+import json
 import sqlite3
 import tempfile
-from contextlib import contextmanager
 from pathlib import Path
 
-import pandas as pd
 import pytest
+from _pytest.fixtures import TopRequest
 
 from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
-from test.integration.connectors.utils.validation import (
-    ValidationConfigs,
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
     source_connector_validation,
 )
 from unstructured_ingest.v2.interfaces import FileData
@@ -26,8 +30,8 @@ from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
 SEED_DATA_ROWS = 20
 
 
-@contextmanager
-def sqlite_download_setup() -> Path:
+@pytest.fixture
+def source_database_setup() -> Path:
     with tempfile.TemporaryDirectory() as tmpdir:
         db_path = Path(tmpdir) / "mock_database.db"
         db_init_path = env_setup_path / "sql" / "sqlite" / "source" / "sqlite-schema.sql"
@@ -49,49 +53,42 @@ def sqlite_download_setup() -> Path:
 
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
-async def test_sqlite_source():
-    with sqlite_download_setup() as db_path:
-        with tempfile.TemporaryDirectory() as tmpdir:
-            connection_config = SQLiteConnectionConfig(database_path=db_path)
-            indexer = SQLiteIndexer(
-                connection_config=connection_config,
-                index_config=SQLiteIndexerConfig(
-                    table_name="cars", id_column="car_id", batch_size=5
-                ),
-            )
-            downloader = SQLiteDownloader(
-                connection_config=connection_config,
-                download_config=SQLiteDownloaderConfig(
-                    fields=["car_id", "brand"], download_dir=Path(tmpdir)
-                ),
-            )
-            await source_connector_validation(
-                indexer=indexer,
-                downloader=downloader,
-                configs=ValidationConfigs(
-                    test_id="sqlite",
-                    expected_num_files=SEED_DATA_ROWS,
-                    expected_number_indexed_file_data=4,
-                    validate_downloaded_files=True,
-                ),
-            )
-
-
-@contextmanager
-def sqlite_upload_setup() -> Path:
+async def test_sqlite_source(source_database_setup: Path, temp_dir: Path):
+    connection_config = SQLiteConnectionConfig(database_path=source_database_setup)
+    indexer = SQLiteIndexer(
+        connection_config=connection_config,
+        index_config=SQLiteIndexerConfig(table_name="cars", id_column="car_id", batch_size=5),
+    )
+    downloader = SQLiteDownloader(
+        connection_config=connection_config,
+        download_config=SQLiteDownloaderConfig(fields=["car_id", "brand"], download_dir=temp_dir),
+    )
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=SourceValidationConfigs(
+            test_id="sqlite",
+            expected_num_files=SEED_DATA_ROWS,
+            expected_number_indexed_file_data=4,
+            validate_downloaded_files=True,
+        ),
+    )
+
+
+@pytest.fixture
+def destination_database_setup(temp_dir: Path) -> Path:
     # Provision the local file that sqlite points to to have the desired schema for the integration
     # tests and make sure the file and connection get cleaned up by using a context manager.
-    with tempfile.TemporaryDirectory() as tmpdir:
-        db_path = Path(tmpdir) / "elements.db"
-        db_init_path = env_setup_path / "sql" / "sqlite" / "destination" / "sqlite-schema.sql"
-        assert db_init_path.exists()
-        assert db_init_path.is_file()
-        with sqlite3.connect(database=db_path) as sqlite_connection:
-            with db_init_path.open("r") as f:
-                query = f.read()
-            cursor = sqlite_connection.cursor()
-            cursor.executescript(query)
-        yield db_path
+    db_path = temp_dir / "elements.db"
+    db_init_path = env_setup_path / "sql" / "sqlite" / "destination" / "sqlite-schema.sql"
+    assert db_init_path.exists()
+    assert db_init_path.is_file()
+    with sqlite3.connect(database=db_path) as sqlite_connection:
+        with db_init_path.open("r") as f:
+            query = f.read()
        cursor = sqlite_connection.cursor()
        cursor.executescript(query)
+    return db_path
 
 
 def validate_destination(db_path: Path, expected_num_elements: int):
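
Note the pattern in the two hunks above: setup helpers that were @contextmanager generators invoked inside each test body are now pytest fixtures, so pytest owns setup/teardown ordering and tests simply declare them as parameters. A minimal sketch of the equivalent conversion, with generic names rather than ones from this diff:

    import tempfile
    from pathlib import Path

    import pytest


    @pytest.fixture
    def database_path() -> Path:
        # Code before the yield is setup, code after it is teardown; this
        # replaces the explicit `with helper() as db_path:` block each test
        # previously had to open itself.
        with tempfile.TemporaryDirectory() as tmpdir:
            yield Path(tmpdir) / "mock_database.db"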
@@ -114,34 +111,48 @@ def validate_destination(db_path: Path, expected_num_elements: int):
 
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
-async def test_sqlite_destination(upload_file: Path):
+async def test_sqlite_destination(
+    upload_file: Path, temp_dir: Path, destination_database_setup: Path
+):
     # the sqlite destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
     mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
-    with sqlite_upload_setup() as db_path:
-        with tempfile.TemporaryDirectory() as tmpdir:
-            stager = SQLiteUploadStager()
-            stager_params = {
-                "elements_filepath": upload_file,
-                "file_data": mock_file_data,
-                "output_dir": Path(tmpdir),
-                "output_filename": "test_db",
-            }
-            if stager.is_async():
-                staged_path = await stager.run_async(**stager_params)
-            else:
-                staged_path = stager.run(**stager_params)
-
-            # The stager should append the `.json` suffix to the output filename passed in.
-            assert staged_path.name == "test_db.json"
-
-            uploader = SQLiteUploader(
-                connection_config=SQLiteConnectionConfig(database_path=db_path)
-            )
-            uploader.run(path=staged_path, file_data=mock_file_data)
-
-            staged_df = pd.read_json(staged_path, orient="records", lines=True)
-            validate_destination(db_path=db_path, expected_num_elements=len(staged_df))
-
-            uploader.run(path=staged_path, file_data=mock_file_data)
-            validate_destination(db_path=db_path, expected_num_elements=len(staged_df))
+    stager = SQLiteUploadStager()
+    staged_path = stager.run(
+        elements_filepath=upload_file,
+        file_data=mock_file_data,
+        output_dir=temp_dir,
+        output_filename=upload_file.name,
+    )
+
+    # The stager should append the `.json` suffix to the output filename passed in.
+    assert staged_path.suffix == upload_file.suffix
+
+    uploader = SQLiteUploader(
+        connection_config=SQLiteConnectionConfig(database_path=destination_database_setup)
+    )
+    uploader.precheck()
+    uploader.run(path=staged_path, file_data=mock_file_data)
+
+    with staged_path.open("r") as f:
+        staged_data = json.load(f)
+    validate_destination(db_path=destination_database_setup, expected_num_elements=len(staged_data))
+
+    uploader.run(path=staged_path, file_data=mock_file_data)
+    validate_destination(db_path=destination_database_setup, expected_num_elements=len(staged_data))
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_sqlite_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = SQLiteUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )
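
The new test_sqlite_stager is the template repeated for astra, azure, chroma, and milvus below: the test is parametrized over fixture names and resolves them at runtime with request.getfixturevalue, so one body runs against both the JSON and the NDJSON upload fixtures. The stager_validation helper itself lives in the new validation/destination.py (+88 lines) and is not shown in this diff. A self-contained illustration of the fixture-name parametrization, using hypothetical json_file/ndjson_file fixtures in place of upload_file/upload_file_ndjson:

    from pathlib import Path

    import pytest
    from _pytest.fixtures import TopRequest


    @pytest.fixture
    def json_file(tmp_path: Path) -> Path:
        path = tmp_path / "elements.json"
        path.write_text('[{"text": "hello"}]')
        return path


    @pytest.fixture
    def ndjson_file(tmp_path: Path) -> Path:
        path = tmp_path / "elements.ndjson"
        path.write_text('{"text": "hello"}\n')
        return path


    # Parametrizing over fixture *names* and resolving them at runtime lets a
    # single test body cover both input formats without duplicating the test.
    @pytest.mark.parametrize("file_fixture", ["json_file", "ndjson_file"])
    def test_both_formats(request: TopRequest, file_fixture: str):
        input_file: Path = request.getfixturevalue(file_fixture)
        assert input_file.exists()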
test/integration/connectors/test_astradb.py

@@ -5,10 +5,15 @@ from pathlib import Path
 from uuid import uuid4
 
 import pytest
+from _pytest.fixtures import TopRequest
 from astrapy import Collection
 from astrapy import DataAPIClient as AstraDBClient
 
 from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
 from test.integration.utils import requires_env
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.astradb import (
@@ -108,7 +113,7 @@ def collection(upload_file: Path) -> Collection:
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
 @requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
-async def test_azure_ai_search_destination(
+async def test_astra_search_destination(
     upload_file: Path,
     collection: Collection,
     tmp_path: Path,
@@ -154,3 +159,19 @@ async def test_azure_ai_search_destination(
         f"Expected count ({expected_count}) doesn't match how "
         f"much came back from collection: {current_count}"
     )
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_astra_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = AstraDBUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )
test/integration/connectors/test_azure_ai_search.py

@@ -5,6 +5,7 @@ from pathlib import Path
 from uuid import uuid4
 
 import pytest
+from _pytest.fixtures import TopRequest
 from azure.core.credentials import AzureKeyCredential
 from azure.search.documents import SearchClient
 from azure.search.documents.indexes import SearchIndexClient
@@ -25,6 +26,10 @@ from azure.search.documents.indexes.models import (
 from test.integration.connectors.utils.constants import (
     DESTINATION_TAG,
 )
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
 from test.integration.utils import requires_env
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.azure_ai_search import (
@@ -225,9 +230,26 @@ async def test_azure_ai_search_destination(
     with staged_filepath.open() as f:
         staged_elements = json.load(f)
     expected_count = len(staged_elements)
-    search_client: SearchClient = uploader.connection_config.get_search_client()
-    validate_count(search_client=search_client, expected_count=expected_count)
+    with uploader.connection_config.get_search_client() as search_client:
+        validate_count(search_client=search_client, expected_count=expected_count)
 
     # Rerun and make sure the same documents get updated
     uploader.run(path=staged_filepath, file_data=file_data)
-    validate_count(search_client=search_client, expected_count=expected_count)
+    with uploader.connection_config.get_search_client() as search_client:
+        validate_count(search_client=search_client, expected_count=expected_count)
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_azure_ai_search_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = AzureAISearchUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )
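
The functional change in the last azure hunk is that the SearchClient is no longer created once and reused; each validation acquires it under a with block, which closes the underlying HTTP transport on exit (the diff itself confirms the object returned by get_search_client() supports the context manager protocol). A minimal sketch of the same pattern against azure-search-documents directly, with placeholder endpoint, index, and key:

    from azure.core.credentials import AzureKeyCredential
    from azure.search.documents import SearchClient


    def count_documents() -> int:
        # Placeholder connection details, for illustration only.
        with SearchClient(
            endpoint="https://example.search.windows.net",
            index_name="my-index",
            credential=AzureKeyCredential("<api-key>"),
        ) as client:
            # The transport is open inside the block and closed on exit.
            return client.get_document_count()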
test/integration/connectors/test_chroma.py (new file)

@@ -0,0 +1,120 @@
+import json
+from pathlib import Path
+
+import chromadb
+import pytest
+from _pytest.fixtures import TopRequest
+
+from test.integration.connectors.utils.constants import (
+    DESTINATION_TAG,
+)
+from test.integration.connectors.utils.docker import HealthCheck, container_context
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.chroma import (
+    CONNECTOR_TYPE,
+    ChromaConnectionConfig,
+    ChromaUploader,
+    ChromaUploaderConfig,
+    ChromaUploadStager,
+    ChromaUploadStagerConfig,
+)
+
+
+@pytest.fixture
+def chroma_instance():
+    with container_context(
+        image="chromadb/chroma:latest",
+        ports={8000: 8000},
+        name="chroma_int_test",
+        healthcheck=HealthCheck(
+            interval=5,
+            timeout=10,
+            retries=3,
+            test="timeout 10s bash -c ':> /dev/tcp/127.0.0.1/8000' || exit 1",
+        ),
+    ) as ctx:
+        yield ctx
+
+
+def validate_collection(collection_name: str, num_embeddings: int):
+    print(f"Checking contents of Chroma collection: {collection_name}")
+
+    chroma_client = chromadb.HttpClient(
+        host="localhost",
+        port="8000",
+        tenant="default_tenant",
+        database="default_database",
+    )
+
+    collection = chroma_client.get_or_create_collection(name=collection_name)
+
+    number_of_embeddings = collection.count()
+    expected_embeddings = num_embeddings
+    print(
+        f"# of embeddings in collection vs expected: {number_of_embeddings}/{expected_embeddings}"
+    )
+
+    assert number_of_embeddings == expected_embeddings, (
+        f"Number of rows in generated table ({number_of_embeddings}) "
+        f"doesn't match expected value: {expected_embeddings}"
+    )
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+def test_chroma_destination(
+    upload_file: Path,
+    chroma_instance,
+    tmp_path: Path,
+):
+    collection_name = "test_collection"
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=CONNECTOR_TYPE,
+        identifier="mock file data",
+    )
+    stager = ChromaUploadStager(upload_stager_config=ChromaUploadStagerConfig())
+
+    uploader = ChromaUploader(
+        connection_config=ChromaConnectionConfig(
+            host="localhost",
+            port=8000,
+            tenant="default_tenant",
+            database="default_database",
+        ),
+        upload_config=ChromaUploaderConfig(collection_name=collection_name),
+    )
+    staged_filepath = stager.run(
+        elements_filepath=upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=upload_file.name,
+    )
+    uploader.precheck()
+    uploader.run(path=staged_filepath, file_data=file_data)
+
+    # Run validation
+    with staged_filepath.open() as f:
+        staged_elements = json.load(f)
+    expected_count = len(staged_elements)
+    validate_collection(collection_name=collection_name, num_embeddings=expected_count)
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "stager")
+def test_chroma_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = ChromaUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )
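
The chroma fixture is the first consumer of the new HealthCheck/container_context helpers (item 25, test/integration/connectors/utils/docker.py), whose implementation this diff does not include. Assuming they wrap docker-py, whose API expects healthcheck durations in nanoseconds, a sketch of the translation might look like:

    import docker


    # Hypothetical translation layer; the real helpers live in
    # test/integration/connectors/utils/docker.py and are not shown here.
    def run_with_healthcheck(
        image: str, name: str, ports: dict, test: str,
        interval: int, timeout: int, retries: int,
    ):
        client = docker.from_env()
        return client.containers.run(
            image=image,
            name=name,
            ports=ports,
            detach=True,
            healthcheck={
                "test": ["CMD-SHELL", test],
                # Docker's API expresses durations in nanoseconds.
                "interval": interval * 1_000_000_000,
                "timeout": timeout * 1_000_000_000,
                "retries": retries,
            },
        )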
test/integration/connectors/test_confluence.py

@@ -5,8 +5,8 @@ import pytest
 from test.integration.connectors.utils.constants import (
     SOURCE_TAG,
 )
-from test.integration.connectors.utils.validation import (
-    ValidationConfigs,
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
     source_connector_validation,
 )
 from test.integration.utils import requires_env
@@ -60,7 +60,7 @@ async def test_confluence_source(temp_dir):
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=ValidationConfigs(
+        configs=SourceValidationConfigs(
             test_id="confluence",
             expected_num_files=11,
             validate_downloaded_files=True,
@@ -107,7 +107,7 @@ async def test_confluence_source_large(temp_dir):
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=ValidationConfigs(
+        configs=SourceValidationConfigs(
             test_id="confluence_large", expected_num_files=250, validate_file_data=False
         ),
     )
test/integration/connectors/test_delta_table.py

@@ -114,6 +114,7 @@ async def test_delta_table_destination_s3(upload_file: Path, temp_dir: Path):
     )
 
     try:
+        uploader.precheck()
         if uploader.is_async():
            await uploader.run_async(path=new_upload_file, file_data=file_data)
        else:
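
uploader.precheck() now runs before the first upload here, as it already does in the sqlite and chroma tests above, matching the expanded uploader interface (item 35, unstructured_ingest/v2/interfaces/uploader.py). The interface itself is not shown in this diff; the contract visible from the milvus test below is that precheck validates the destination up front and raises DestinationConnectionError on failure. A hedged sketch of that contract:

    from unstructured_ingest.error import DestinationConnectionError


    class ExampleUploader:
        # Hypothetical connector, for illustration only.
        def precheck(self) -> None:
            # Fail fast, before any data is staged or uploaded.
            try:
                self._connect()  # stand-in for a real connectivity probe
            except Exception as e:
                raise DestinationConnectionError(f"failed to validate connection: {e}")

        def _connect(self) -> None:
            raise NotImplementedError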
test/integration/connectors/test_kafka.py

@@ -14,8 +14,8 @@ from test.integration.connectors.utils.constants import (
     env_setup_path,
 )
 from test.integration.connectors.utils.docker_compose import docker_compose_context
-from test.integration.connectors.utils.validation import (
-    ValidationConfigs,
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
     source_connector_validation,
 )
 from test.integration.utils import requires_env
@@ -121,7 +121,7 @@ async def test_kafka_source_local(kafka_seed_topic: str):
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=ValidationConfigs(
+        configs=SourceValidationConfigs(
             test_id="kafka", expected_num_files=5, validate_downloaded_files=True
         ),
     )
@@ -203,7 +203,7 @@ async def test_kafka_source_cloud(kafka_seed_topic_cloud: int):
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=ValidationConfigs(
+        configs=SourceValidationConfigs(
             test_id="kafka",
             exclude_fields_extend=["connector_type"],
             expected_num_files=expected_messages,
test/integration/connectors/test_milvus.py

@@ -4,6 +4,7 @@ from pathlib import Path
 
 import docker
 import pytest
+from _pytest.fixtures import TopRequest
 from pymilvus import (
     CollectionSchema,
     DataType,
@@ -15,6 +16,10 @@ from pymilvus.milvus_client import IndexParams
 from test.integration.connectors.utils.constants import DESTINATION_TAG, env_setup_path
 from test.integration.connectors.utils.docker import healthcheck_wait
 from test.integration.connectors.utils.docker_compose import docker_compose_context
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.milvus import (
@@ -167,3 +172,19 @@ def test_precheck_fails_on_nonexistent_collection(collection: str):
         match=f"Collection '{NONEXISTENT_COLLECTION_NAME}' does not exist",
     ):
         uploader.precheck()
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_milvus_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = MilvusUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )
test/integration/connectors/test_mongodb.py

@@ -14,8 +14,8 @@ from pymongo.mongo_client import MongoClient
 from pymongo.operations import SearchIndexModel
 
 from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
-from test.integration.connectors.utils.validation import (
-    ValidationConfigs,
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
     source_connector_validation,
 )
 from test.integration.utils import requires_env
@@ -196,7 +196,7 @@ async def test_mongodb_source(temp_dir: Path):
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=ValidationConfigs(
+        configs=SourceValidationConfigs(
             test_id=CONNECTOR_TYPE, expected_num_files=4, validate_downloaded_files=True
         ),
     )
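
One thread running through all of the stager tests above is the upload_file_ndjson fixture: stagers are now exercised against newline-delimited JSON as well as JSON arrays, which lines up with the new utils/data_prep.py helpers (item 33, +36 lines, not shown in this diff). A format-sniffing reader along these lines (a hypothetical helper, not the library's API) handles both shapes:

    import json
    from pathlib import Path


    def load_elements(path: Path) -> list[dict]:
        # A JSON array file starts with "["; treat anything else as NDJSON,
        # one JSON object per line.
        with path.open() as f:
            head = f.read(1)
            f.seek(0)
            if head == "[":
                return json.load(f)
            return [json.loads(line) for line in f if line.strip()]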