unstructured-ingest 0.3.13__py3-none-any.whl → 0.3.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic; see the registry's advisory page for more details.

Files changed (38)
  1. test/integration/connectors/databricks/test_volumes_native.py +10 -6
  2. test/integration/connectors/discord/test_discord.py +4 -4
  3. test/integration/connectors/duckdb/test_duckdb.py +3 -2
  4. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  5. test/integration/connectors/elasticsearch/test_elasticsearch.py +8 -7
  6. test/integration/connectors/elasticsearch/test_opensearch.py +8 -7
  7. test/integration/connectors/sql/test_postgres.py +9 -3
  8. test/integration/connectors/sql/test_singlestore.py +9 -3
  9. test/integration/connectors/sql/test_snowflake.py +9 -3
  10. test/integration/connectors/sql/test_sqlite.py +9 -3
  11. test/integration/connectors/test_astradb.py +25 -9
  12. test/integration/connectors/test_azure_ai_search.py +3 -4
  13. test/integration/connectors/test_chroma.py +4 -6
  14. test/integration/connectors/test_confluence.py +3 -5
  15. test/integration/connectors/test_delta_table.py +4 -6
  16. test/integration/connectors/test_lancedb.py +3 -3
  17. test/integration/connectors/test_milvus.py +10 -5
  18. test/integration/connectors/test_mongodb.py +9 -9
  19. test/integration/connectors/test_neo4j.py +16 -8
  20. test/integration/connectors/test_notion.py +7 -0
  21. test/integration/connectors/test_onedrive.py +2 -4
  22. test/integration/connectors/test_pinecone.py +5 -6
  23. test/integration/connectors/test_qdrant.py +5 -4
  24. test/integration/connectors/test_redis.py +3 -3
  25. test/integration/connectors/test_s3.py +7 -6
  26. test/integration/connectors/test_vectara.py +2 -2
  27. test/integration/connectors/utils/constants.py +6 -0
  28. test/integration/connectors/utils/docker.py +2 -2
  29. test/integration/connectors/weaviate/test_cloud.py +5 -0
  30. test/integration/connectors/weaviate/test_local.py +2 -2
  31. unstructured_ingest/__version__.py +1 -1
  32. unstructured_ingest/v2/processes/connectors/neo4j.py +12 -12
  33. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.14.dist-info}/METADATA +18 -18
  34. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.14.dist-info}/RECORD +38 -38
  35. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.14.dist-info}/LICENSE.md +0 -0
  36. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.14.dist-info}/WHEEL +0 -0
  37. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.14.dist-info}/entry_points.txt +0 -0
  38. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.14.dist-info}/top_level.txt +0 -0
@@ -10,7 +10,11 @@ import pytest
10
10
  from databricks.sdk import WorkspaceClient
11
11
  from databricks.sdk.errors.platform import NotFound
12
12
 
13
- from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
13
+ from test.integration.connectors.utils.constants import (
14
+ BLOB_STORAGE_TAG,
15
+ DESTINATION_TAG,
16
+ SOURCE_TAG,
17
+ )
14
18
  from test.integration.connectors.utils.validation.source import (
15
19
  SourceValidationConfigs,
16
20
  source_connector_validation,
@@ -83,7 +87,7 @@ def get_pat_env_data() -> PATEnvData:
83
87
 
84
88
 
85
89
  @pytest.mark.asyncio
86
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
90
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
87
91
  @requires_env(
88
92
  "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
89
93
  )
@@ -115,7 +119,7 @@ async def test_volumes_native_source(tmp_path: Path):
115
119
 
116
120
 
117
121
  @pytest.mark.asyncio
118
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
122
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
119
123
  @requires_env("DATABRICKS_HOST", "DATABRICKS_PAT", "DATABRICKS_CATALOG")
120
124
  async def test_volumes_native_source_pat(tmp_path: Path):
121
125
  env_data = get_pat_env_data()
@@ -144,7 +148,7 @@ async def test_volumes_native_source_pat(tmp_path: Path):
144
148
  )
145
149
 
146
150
 
147
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
151
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
148
152
  @requires_env("DATABRICKS_HOST", "DATABRICKS_PAT", "DATABRICKS_CATALOG")
149
153
  def test_volumes_native_source_pat_invalid_catalog():
150
154
  env_data = get_pat_env_data()
@@ -162,7 +166,7 @@ def test_volumes_native_source_pat_invalid_catalog():
162
166
  _ = list(indexer.run())
163
167
 
164
168
 
165
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
169
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
166
170
  @requires_env("DATABRICKS_HOST")
167
171
  def test_volumes_native_source_pat_invalid_pat():
168
172
  host = os.environ["DATABRICKS_HOST"]
@@ -231,7 +235,7 @@ def validate_upload(client: WorkspaceClient, catalog: str, volume: str, volume_p
231
235
 
232
236
 
233
237
  @pytest.mark.asyncio
234
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
238
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, BLOB_STORAGE_TAG)
235
239
  @requires_env(
236
240
  "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
237
241
  )
@@ -6,7 +6,7 @@ from typing import Optional
6
6
 
7
7
  import pytest
8
8
 
9
- from test.integration.connectors.utils.constants import SOURCE_TAG
9
+ from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
10
10
  from test.integration.connectors.utils.validation.source import (
11
11
  SourceValidationConfigs,
12
12
  source_connector_validation,
@@ -38,7 +38,7 @@ def get_env_data() -> EnvData:
38
38
 
39
39
 
40
40
  @pytest.mark.asyncio
41
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
41
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
42
42
  @requires_env("DISCORD_TOKEN", "DISCORD_CHANNELS")
43
43
  async def test_discord_source():
44
44
  env = get_env_data()
@@ -66,7 +66,7 @@ async def test_discord_source():
66
66
  )
67
67
 
68
68
 
69
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
69
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
70
70
  @requires_env("DISCORD_CHANNELS")
71
71
  def test_discord_source_precheck_fail_no_token():
72
72
  indexer_config = DiscordIndexerConfig(channels=get_env_data().channels)
@@ -77,7 +77,7 @@ def test_discord_source_precheck_fail_no_token():
77
77
  indexer.precheck()
78
78
 
79
79
 
80
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
80
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
81
81
  @requires_env("DISCORD_TOKEN")
82
82
  def test_discord_source_precheck_fail_no_channels():
83
83
  indexer_config = DiscordIndexerConfig(channels=[])
@@ -5,7 +5,7 @@ import duckdb
5
5
  import pytest
6
6
  from _pytest.fixtures import TopRequest
7
7
 
8
- from test.integration.connectors.utils.constants import DESTINATION_TAG
8
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
9
9
  from test.integration.connectors.utils.validation.destination import (
10
10
  StagerValidationConfigs,
11
11
  stager_validation,
@@ -46,7 +46,7 @@ def validate_duckdb_destination(db_path: Path, expected_num_elements: int):
46
46
  conn.close()
47
47
 
48
48
 
49
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "duckdb")
49
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "duckdb", SQL_TAG)
50
50
  def test_duckdb_destination(upload_file: Path, provisioned_db_file: Path, temp_dir: Path):
51
51
  file_data = FileData(
52
52
  source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
@@ -73,6 +73,7 @@ def test_duckdb_destination(upload_file: Path, provisioned_db_file: Path, temp_d
73
73
  validate_duckdb_destination(db_path=provisioned_db_file, expected_num_elements=len(data))
74
74
 
75
75
 
76
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "duckdb", SQL_TAG)
76
77
  @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
77
78
  def test_duckdb_stager(
78
79
  request: TopRequest,
@@ -7,7 +7,7 @@ import duckdb
7
7
  import pandas as pd
8
8
  import pytest
9
9
 
10
- from test.integration.connectors.utils.constants import DESTINATION_TAG
10
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
11
11
  from test.integration.utils import requires_env
12
12
  from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
13
13
  from unstructured_ingest.v2.processes.connectors.duckdb.motherduck import (
@@ -61,7 +61,7 @@ def validate_motherduck_destination(database: str, expected_num_elements: int, m
61
61
  conn.close()
62
62
 
63
63
 
64
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
64
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
65
65
  @requires_env("MOTHERDUCK_TOKEN")
66
66
  def test_motherduck_destination(
67
67
  md_token: str, upload_file: Path, provisioned_db: str, temp_dir: Path
@@ -15,7 +15,7 @@ from _pytest.fixtures import TopRequest
15
15
  from elasticsearch import Elasticsearch as ElasticsearchClient
16
16
  from elasticsearch.helpers import bulk
17
17
 
18
- from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
18
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, NOSQL_TAG
19
19
  from test.integration.connectors.utils.docker import HealthCheck, container_context
20
20
  from test.integration.connectors.utils.validation.source import (
21
21
  SourceValidationConfigs,
@@ -177,7 +177,7 @@ def destination_index(elasticsearch_elements_mapping: dict) -> str:
177
177
 
178
178
 
179
179
  @pytest.mark.asyncio
180
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
180
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
181
181
  async def test_elasticsearch_source(source_index: str, movies_dataframe: pd.DataFrame):
182
182
  indexer_config = ElasticsearchIndexerConfig(index_name=source_index)
183
183
  with tempfile.TemporaryDirectory() as tempdir:
@@ -207,7 +207,7 @@ async def test_elasticsearch_source(source_index: str, movies_dataframe: pd.Data
207
207
  )
208
208
 
209
209
 
210
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
210
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
211
211
  def test_elasticsearch_source_precheck_fail_no_cluster():
212
212
  indexer_config = ElasticsearchIndexerConfig(index_name="index")
213
213
 
@@ -221,7 +221,7 @@ def test_elasticsearch_source_precheck_fail_no_cluster():
221
221
  indexer.precheck()
222
222
 
223
223
 
224
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
224
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
225
225
  def test_elasticsearch_source_precheck_fail_no_index(source_index: str):
226
226
  indexer_config = ElasticsearchIndexerConfig(index_name="index")
227
227
 
@@ -236,7 +236,7 @@ def test_elasticsearch_source_precheck_fail_no_index(source_index: str):
236
236
 
237
237
 
238
238
  @pytest.mark.asyncio
239
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
239
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
240
240
  async def test_elasticsearch_destination(
241
241
  upload_file: Path,
242
242
  destination_index: str,
@@ -282,7 +282,7 @@ async def test_elasticsearch_destination(
282
282
  validate_count(client=client, expected_count=expected_count, index_name=destination_index)
283
283
 
284
284
 
285
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
285
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
286
286
  def test_elasticsearch_destination_precheck_fail():
287
287
  connection_config = ElasticsearchConnectionConfig(
288
288
  access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
@@ -297,7 +297,7 @@ def test_elasticsearch_destination_precheck_fail():
297
297
  uploader.precheck()
298
298
 
299
299
 
300
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
300
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
301
301
  def test_elasticsearch_destination_precheck_fail_no_index(destination_index: str):
302
302
  connection_config = ElasticsearchConnectionConfig(
303
303
  access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
@@ -312,6 +312,7 @@ def test_elasticsearch_destination_precheck_fail_no_index(destination_index: str
312
312
  uploader.precheck()
313
313
 
314
314
 
315
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
315
316
  @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
316
317
  def test_elasticsearch_stager(
317
318
  request: TopRequest,
@@ -10,7 +10,7 @@ import pytest
10
10
  from _pytest.fixtures import TopRequest
11
11
  from opensearchpy import Document, Keyword, OpenSearch, Text
12
12
 
13
- from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
13
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, NOSQL_TAG, SOURCE_TAG
14
14
  from test.integration.connectors.utils.docker import HealthCheck, container_context
15
15
  from test.integration.connectors.utils.validation.destination import (
16
16
  StagerValidationConfigs,
@@ -166,7 +166,7 @@ def destination_index(opensearch_elements_mapping: dict) -> str:
166
166
 
167
167
 
168
168
  @pytest.mark.asyncio
169
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
169
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
170
170
  async def test_opensearch_source(source_index: str, movies_dataframe: pd.DataFrame):
171
171
  indexer_config = OpenSearchIndexerConfig(index_name=source_index)
172
172
  with tempfile.TemporaryDirectory() as tempdir:
@@ -197,7 +197,7 @@ async def test_opensearch_source(source_index: str, movies_dataframe: pd.DataFra
197
197
  )
198
198
 
199
199
 
200
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
200
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
201
201
  def test_opensearch_source_precheck_fail_no_cluster():
202
202
  indexer_config = OpenSearchIndexerConfig(index_name="index")
203
203
 
@@ -212,7 +212,7 @@ def test_opensearch_source_precheck_fail_no_cluster():
212
212
  indexer.precheck()
213
213
 
214
214
 
215
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
215
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
216
216
  def test_opensearch_source_precheck_fail_no_index(source_index: str):
217
217
  indexer_config = OpenSearchIndexerConfig(index_name="index")
218
218
 
@@ -228,7 +228,7 @@ def test_opensearch_source_precheck_fail_no_index(source_index: str):
228
228
 
229
229
 
230
230
  @pytest.mark.asyncio
231
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
231
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
232
232
  async def test_opensearch_destination(
233
233
  upload_file: Path,
234
234
  destination_index: str,
@@ -275,7 +275,7 @@ async def test_opensearch_destination(
275
275
  validate_count(client=client, expected_count=expected_count, index_name=destination_index)
276
276
 
277
277
 
278
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
278
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
279
279
  def test_opensearch_destination_precheck_fail():
280
280
  connection_config = OpenSearchConnectionConfig(
281
281
  access_config=OpenSearchAccessConfig(password="admin"),
@@ -291,7 +291,7 @@ def test_opensearch_destination_precheck_fail():
291
291
  uploader.precheck()
292
292
 
293
293
 
294
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
294
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
295
295
  def test_opensearch_destination_precheck_fail_no_index(destination_index: str):
296
296
  connection_config = OpenSearchConnectionConfig(
297
297
  access_config=OpenSearchAccessConfig(password="admin"),
@@ -307,6 +307,7 @@ def test_opensearch_destination_precheck_fail_no_index(destination_index: str):
307
307
  uploader.precheck()
308
308
 
309
309
 
310
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
310
311
  @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
311
312
  def test_opensearch_stager(
312
313
  request: TopRequest,
@@ -5,7 +5,12 @@ import pytest
5
5
  from _pytest.fixtures import TopRequest
6
6
  from psycopg2 import connect
7
7
 
8
- from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
8
+ from test.integration.connectors.utils.constants import (
9
+ DESTINATION_TAG,
10
+ SOURCE_TAG,
11
+ SQL_TAG,
12
+ env_setup_path,
13
+ )
9
14
  from test.integration.connectors.utils.docker_compose import docker_compose_context
10
15
  from test.integration.connectors.utils.validation.destination import (
11
16
  StagerValidationConfigs,
@@ -51,7 +56,7 @@ def source_database_setup() -> str:
51
56
 
52
57
 
53
58
  @pytest.mark.asyncio
54
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
59
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, SQL_TAG)
55
60
  async def test_postgres_source(temp_dir: Path, source_database_setup: str):
56
61
  connect_params = {
57
62
  "host": "localhost",
@@ -115,7 +120,7 @@ def validate_destination(
115
120
 
116
121
 
117
122
  @pytest.mark.asyncio
118
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
123
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
119
124
  async def test_postgres_destination(upload_file: Path, temp_dir: Path):
120
125
  # the postgres destination connector doesn't leverage the file data but is required as an input,
121
126
  # mocking it with arbitrary values to meet the base requirements:
@@ -179,6 +184,7 @@ async def test_postgres_destination(upload_file: Path, temp_dir: Path):
179
184
  )
180
185
 
181
186
 
187
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
182
188
  @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
183
189
  def test_postgres_stager(
184
190
  request: TopRequest,
@@ -5,7 +5,12 @@ import pytest
5
5
  import singlestoredb as s2
6
6
  from _pytest.fixtures import TopRequest
7
7
 
8
- from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
8
+ from test.integration.connectors.utils.constants import (
9
+ DESTINATION_TAG,
10
+ SOURCE_TAG,
11
+ SQL_TAG,
12
+ env_setup_path,
13
+ )
9
14
  from test.integration.connectors.utils.docker_compose import docker_compose_context
10
15
  from test.integration.connectors.utils.validation.destination import (
11
16
  StagerValidationConfigs,
@@ -54,7 +59,7 @@ def source_database_setup() -> dict:
54
59
 
55
60
 
56
61
  @pytest.mark.asyncio
57
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
62
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, SQL_TAG)
58
63
  async def test_singlestore_source(temp_dir: Path, source_database_setup: dict):
59
64
 
60
65
  connection_config = SingleStoreConnectionConfig(
@@ -101,7 +106,7 @@ def validate_destination(
101
106
 
102
107
 
103
108
  @pytest.mark.asyncio
104
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
109
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
105
110
  async def test_singlestore_destination(upload_file: Path, temp_dir: Path):
106
111
  mock_file_data = FileData(
107
112
  identifier="mock file data",
@@ -160,6 +165,7 @@ async def test_singlestore_destination(upload_file: Path, temp_dir: Path):
160
165
  )
161
166
 
162
167
 
168
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
163
169
  @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
164
170
  def test_singlestore_stager(
165
171
  request: TopRequest,
@@ -6,7 +6,12 @@ import pytest
6
6
  import snowflake.connector as sf
7
7
  from _pytest.fixtures import TopRequest
8
8
 
9
- from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
9
+ from test.integration.connectors.utils.constants import (
10
+ DESTINATION_TAG,
11
+ SOURCE_TAG,
12
+ SQL_TAG,
13
+ env_setup_path,
14
+ )
10
15
  from test.integration.connectors.utils.docker import container_context
11
16
  from test.integration.connectors.utils.validation.destination import (
12
17
  StagerValidationConfigs,
@@ -109,7 +114,7 @@ def destination_database_setup() -> dict:
109
114
 
110
115
 
111
116
  @pytest.mark.asyncio
112
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
117
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, SQL_TAG)
113
118
  @requires_env("LOCALSTACK_AUTH_TOKEN")
114
119
  async def test_snowflake_source(temp_dir: Path, source_database_setup: dict):
115
120
  connection_config = SnowflakeConnectionConfig(
@@ -163,7 +168,7 @@ def validate_destination(
163
168
 
164
169
 
165
170
  @pytest.mark.asyncio
166
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
171
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
167
172
  @requires_env("LOCALSTACK_AUTH_TOKEN")
168
173
  async def test_snowflake_destination(
169
174
  upload_file: Path, temp_dir: Path, destination_database_setup: dict
@@ -222,6 +227,7 @@ async def test_snowflake_destination(
222
227
  )
223
228
 
224
229
 
230
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
225
231
  @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
226
232
  def test_snowflake_stager(
227
233
  request: TopRequest,
@@ -6,7 +6,12 @@ from pathlib import Path
6
6
  import pytest
7
7
  from _pytest.fixtures import TopRequest
8
8
 
9
- from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
9
+ from test.integration.connectors.utils.constants import (
10
+ DESTINATION_TAG,
11
+ SOURCE_TAG,
12
+ SQL_TAG,
13
+ env_setup_path,
14
+ )
10
15
  from test.integration.connectors.utils.validation.destination import (
11
16
  StagerValidationConfigs,
12
17
  stager_validation,
@@ -52,7 +57,7 @@ def source_database_setup() -> Path:
52
57
 
53
58
 
54
59
  @pytest.mark.asyncio
55
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
60
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, SQL_TAG)
56
61
  async def test_sqlite_source(source_database_setup: Path, temp_dir: Path):
57
62
  connection_config = SQLiteConnectionConfig(database_path=source_database_setup)
58
63
  indexer = SQLiteIndexer(
@@ -110,7 +115,7 @@ def validate_destination(db_path: Path, expected_num_elements: int):
110
115
 
111
116
 
112
117
  @pytest.mark.asyncio
113
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
118
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
114
119
  async def test_sqlite_destination(
115
120
  upload_file: Path, temp_dir: Path, destination_database_setup: Path
116
121
  ):
@@ -146,6 +151,7 @@ async def test_sqlite_destination(
146
151
  validate_destination(db_path=destination_database_setup, expected_num_elements=len(staged_data))
147
152
 
148
153
 
154
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
149
155
  @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
150
156
  def test_sqlite_stager(
151
157
  request: TopRequest,
@@ -9,7 +9,7 @@ from _pytest.fixtures import TopRequest
9
9
  from astrapy import Collection
10
10
  from astrapy import DataAPIClient as AstraDBClient
11
11
 
12
- from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
12
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, VECTOR_DB_TAG
13
13
  from test.integration.connectors.utils.validation.destination import (
14
14
  StagerValidationConfigs,
15
15
  stager_validation,
@@ -49,9 +49,9 @@ def connection_config() -> AstraDBConnectionConfig:
49
49
  )
50
50
 
51
51
 
52
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, DESTINATION_TAG)
52
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, VECTOR_DB_TAG)
53
53
  @requires_env("ASTRA_DB_APPLICATION_TOKEN", "ASTRA_DB_API_ENDPOINT")
54
- def test_precheck_succeeds(connection_config: AstraDBConnectionConfig):
54
+ def test_precheck_succeeds_indexer(connection_config: AstraDBConnectionConfig):
55
55
  indexer = AstraDBIndexer(
56
56
  connection_config=connection_config,
57
57
  index_config=AstraDBIndexerConfig(collection_name=EXISTENT_COLLECTION_NAME),
@@ -64,19 +64,34 @@ def test_precheck_succeeds(connection_config: AstraDBConnectionConfig):
64
64
  uploader.precheck()
65
65
 
66
66
 
67
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, DESTINATION_TAG)
67
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
68
68
  @requires_env("ASTRA_DB_APPLICATION_TOKEN", "ASTRA_DB_API_ENDPOINT")
69
- def test_precheck_fails(connection_config: AstraDBConnectionConfig):
69
+ def test_precheck_succeeds_uploader(connection_config: AstraDBConnectionConfig):
70
+ uploader = AstraDBUploader(
71
+ connection_config=connection_config,
72
+ upload_config=AstraDBUploaderConfig(collection_name=EXISTENT_COLLECTION_NAME),
73
+ )
74
+ uploader.precheck()
75
+
76
+
77
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, VECTOR_DB_TAG)
78
+ @requires_env("ASTRA_DB_APPLICATION_TOKEN", "ASTRA_DB_API_ENDPOINT")
79
+ def test_precheck_fails_indexer(connection_config: AstraDBConnectionConfig):
70
80
  indexer = AstraDBIndexer(
71
81
  connection_config=connection_config,
72
82
  index_config=AstraDBIndexerConfig(collection_name=NONEXISTENT_COLLECTION_NAME),
73
83
  )
84
+ with pytest.raises(expected_exception=SourceConnectionError):
85
+ indexer.precheck()
86
+
87
+
88
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
89
+ @requires_env("ASTRA_DB_APPLICATION_TOKEN", "ASTRA_DB_API_ENDPOINT")
90
+ def test_precheck_fails_uploader(connection_config: AstraDBConnectionConfig):
74
91
  uploader = AstraDBUploader(
75
92
  connection_config=connection_config,
76
93
  upload_config=AstraDBUploaderConfig(collection_name=NONEXISTENT_COLLECTION_NAME),
77
94
  )
78
- with pytest.raises(expected_exception=SourceConnectionError):
79
- indexer.precheck()
80
95
  with pytest.raises(expected_exception=DestinationConnectionError):
81
96
  uploader.precheck()
82
97
 
@@ -117,7 +132,7 @@ def collection(upload_file: Path) -> Collection:
117
132
 
118
133
 
119
134
  @pytest.mark.asyncio
120
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
135
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, VECTOR_DB_TAG)
121
136
  @requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
122
137
  async def test_astra_search_source(
123
138
  tmp_path: Path,
@@ -151,7 +166,7 @@ async def test_astra_search_source(
151
166
 
152
167
 
153
168
  @pytest.mark.asyncio
154
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
169
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
155
170
  @requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
156
171
  async def test_astra_search_destination(
157
172
  upload_file: Path,
@@ -201,6 +216,7 @@ async def test_astra_search_destination(
201
216
  )
202
217
 
203
218
 
219
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
204
220
  @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
205
221
  def test_astra_stager(
206
222
  request: TopRequest,
@@ -23,9 +23,7 @@ from azure.search.documents.indexes.models import (
23
23
  VectorSearchProfile,
24
24
  )
25
25
 
26
- from test.integration.connectors.utils.constants import (
27
- DESTINATION_TAG,
28
- )
26
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
29
27
  from test.integration.connectors.utils.validation.destination import (
30
28
  StagerValidationConfigs,
31
29
  stager_validation,
@@ -195,7 +193,7 @@ def validate_count(
195
193
 
196
194
 
197
195
  @pytest.mark.asyncio
198
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
196
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
199
197
  @requires_env("AZURE_SEARCH_API_KEY")
200
198
  async def test_azure_ai_search_destination(
201
199
  upload_file: Path,
@@ -239,6 +237,7 @@ async def test_azure_ai_search_destination(
239
237
  validate_count(search_client=search_client, expected_count=expected_count)
240
238
 
241
239
 
240
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
242
241
  @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
243
242
  def test_azure_ai_search_stager(
244
243
  request: TopRequest,
@@ -5,9 +5,7 @@ import chromadb
5
5
  import pytest
6
6
  from _pytest.fixtures import TopRequest
7
7
 
8
- from test.integration.connectors.utils.constants import (
9
- DESTINATION_TAG,
10
- )
8
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
11
9
  from test.integration.connectors.utils.docker import HealthCheck, container_context
12
10
  from test.integration.connectors.utils.validation.destination import (
13
11
  StagerValidationConfigs,
@@ -27,7 +25,7 @@ from unstructured_ingest.v2.processes.connectors.chroma import (
27
25
  @pytest.fixture
28
26
  def chroma_instance():
29
27
  with container_context(
30
- image="chromadb/chroma:latest",
28
+ image="chromadb/chroma:0.6.2",
31
29
  ports={8000: 8000},
32
30
  name="chroma_int_test",
33
31
  healthcheck=HealthCheck(
@@ -64,7 +62,7 @@ def validate_collection(collection_name: str, num_embeddings: int):
64
62
  )
65
63
 
66
64
 
67
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
65
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
68
66
  def test_chroma_destination(
69
67
  upload_file: Path,
70
68
  chroma_instance,
@@ -104,7 +102,7 @@ def test_chroma_destination(
104
102
 
105
103
 
106
104
  @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
107
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "stager")
105
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "stager", VECTOR_DB_TAG)
108
106
  def test_chroma_stager(
109
107
  request: TopRequest,
110
108
  upload_file_str: str,
@@ -2,9 +2,7 @@ import os
2
2
 
3
3
  import pytest
4
4
 
5
- from test.integration.connectors.utils.constants import (
6
- SOURCE_TAG,
7
- )
5
+ from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
8
6
  from test.integration.connectors.utils.validation.source import (
9
7
  SourceValidationConfigs,
10
8
  source_connector_validation,
@@ -22,7 +20,7 @@ from unstructured_ingest.v2.processes.connectors.confluence import (
22
20
 
23
21
 
24
22
  @pytest.mark.asyncio
25
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
23
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
26
24
  @requires_env("CONFLUENCE_USER_EMAIL", "CONFLUENCE_API_TOKEN")
27
25
  async def test_confluence_source(temp_dir):
28
26
  # Retrieve environment variables
@@ -69,7 +67,7 @@ async def test_confluence_source(temp_dir):
69
67
 
70
68
 
71
69
  @pytest.mark.asyncio
72
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
70
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
73
71
  @requires_env("CONFLUENCE_USER_EMAIL", "CONFLUENCE_API_TOKEN")
74
72
  async def test_confluence_source_large(temp_dir):
75
73
  # Retrieve environment variables
@@ -6,9 +6,7 @@ import pytest
6
6
  from deltalake import DeltaTable
7
7
  from fsspec import get_filesystem_class
8
8
 
9
- from test.integration.connectors.utils.constants import (
10
- DESTINATION_TAG,
11
- )
9
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
12
10
  from test.integration.utils import requires_env
13
11
  from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
14
12
  from unstructured_ingest.v2.processes.connectors.delta_table import (
@@ -25,7 +23,7 @@ multiprocessing.set_start_method("spawn")
25
23
 
26
24
 
27
25
  @pytest.mark.asyncio
28
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
26
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
29
27
  async def test_delta_table_destination_local(upload_file: Path, temp_dir: Path):
30
28
  destination_path = str(temp_dir)
31
29
  connection_config = DeltaTableConnectionConfig(
@@ -81,7 +79,7 @@ def get_aws_credentials() -> dict:
81
79
 
82
80
 
83
81
  @pytest.mark.asyncio
84
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
82
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
85
83
  @requires_env("S3_INGEST_TEST_ACCESS_KEY", "S3_INGEST_TEST_SECRET_KEY")
86
84
  async def test_delta_table_destination_s3(upload_file: Path, temp_dir: Path):
87
85
  aws_credentials = get_aws_credentials()
@@ -140,7 +138,7 @@ async def test_delta_table_destination_s3(upload_file: Path, temp_dir: Path):
140
138
 
141
139
 
142
140
  @pytest.mark.asyncio
143
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
141
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
144
142
  @requires_env("S3_INGEST_TEST_ACCESS_KEY", "S3_INGEST_TEST_SECRET_KEY")
145
143
  async def test_delta_table_destination_s3_bad_creds(upload_file: Path, temp_dir: Path):
146
144
  aws_credentials = {