unstructured-ingest 0.3.13__py3-none-any.whl → 0.3.15__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (47)
  1. test/integration/connectors/databricks/test_volumes_native.py +10 -6
  2. test/integration/connectors/discord/test_discord.py +4 -4
  3. test/integration/connectors/duckdb/test_duckdb.py +3 -2
  4. test/integration/connectors/duckdb/test_motherduck.py +2 -2
  5. test/integration/connectors/elasticsearch/test_elasticsearch.py +8 -7
  6. test/integration/connectors/elasticsearch/test_opensearch.py +8 -7
  7. test/integration/connectors/sql/test_databricks_delta_tables.py +142 -0
  8. test/integration/connectors/sql/test_postgres.py +9 -3
  9. test/integration/connectors/sql/test_singlestore.py +9 -3
  10. test/integration/connectors/sql/test_snowflake.py +9 -3
  11. test/integration/connectors/sql/test_sqlite.py +9 -3
  12. test/integration/connectors/test_astradb.py +25 -9
  13. test/integration/connectors/test_azure_ai_search.py +3 -4
  14. test/integration/connectors/test_chroma.py +4 -6
  15. test/integration/connectors/test_confluence.py +3 -5
  16. test/integration/connectors/test_delta_table.py +4 -6
  17. test/integration/connectors/test_lancedb.py +3 -3
  18. test/integration/connectors/test_milvus.py +10 -5
  19. test/integration/connectors/test_mongodb.py +9 -9
  20. test/integration/connectors/test_neo4j.py +16 -8
  21. test/integration/connectors/test_notion.py +7 -0
  22. test/integration/connectors/test_onedrive.py +2 -4
  23. test/integration/connectors/test_pinecone.py +73 -8
  24. test/integration/connectors/test_qdrant.py +5 -4
  25. test/integration/connectors/test_redis.py +3 -3
  26. test/integration/connectors/test_s3.py +7 -6
  27. test/integration/connectors/test_vectara.py +2 -2
  28. test/integration/connectors/utils/constants.py +6 -0
  29. test/integration/connectors/utils/docker.py +2 -2
  30. test/integration/connectors/weaviate/test_cloud.py +5 -0
  31. test/integration/connectors/weaviate/test_local.py +2 -2
  32. unstructured_ingest/__version__.py +1 -1
  33. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  34. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +6 -0
  35. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +6 -3
  36. unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +106 -0
  37. unstructured_ingest/v2/processes/connectors/neo4j.py +12 -12
  38. unstructured_ingest/v2/processes/connectors/pinecone.py +18 -11
  39. unstructured_ingest/v2/processes/connectors/sql/__init__.py +6 -0
  40. unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +213 -0
  41. unstructured_ingest/v2/processes/connectors/sql/sql.py +26 -9
  42. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/METADATA +20 -18
  43. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/RECORD +47 -44
  44. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/LICENSE.md +0 -0
  45. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/WHEEL +0 -0
  46. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/entry_points.txt +0 -0
  47. {unstructured_ingest-0.3.13.dist-info → unstructured_ingest-0.3.15.dist-info}/top_level.txt +0 -0
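
A note on the new tag imports that recur through the hunks below: they come from test/integration/connectors/utils/constants.py (entry 28 above, +6 -0), whose hunk is not shown in this section. A minimal sketch of the likely addition, assuming each tag is a plain string constant — the constant names are confirmed by the imports in the diffs below, but the string values (and the sixth constant, which never appears in this section) are guesses:

# test/integration/connectors/utils/constants.py -- sketch of the six added lines
BLOB_STORAGE_TAG = "blob_storage"  # value assumed
SQL_TAG = "sql"  # value assumed
NOSQL_TAG = "nosql"  # value assumed
VECTOR_DB_TAG = "vector_db"  # value assumed
UNCATEGORIZED_TAG = "uncategorized"  # value assumed
GRAPH_DB_TAG = "graph_db"  # pure guess for the sixth line; plausibly used by the neo4j tests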
test/integration/connectors/databricks/test_volumes_native.py

@@ -10,7 +10,11 @@ import pytest
 from databricks.sdk import WorkspaceClient
 from databricks.sdk.errors.platform import NotFound
 
-from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
+from test.integration.connectors.utils.constants import (
+    BLOB_STORAGE_TAG,
+    DESTINATION_TAG,
+    SOURCE_TAG,
+)
 from test.integration.connectors.utils.validation.source import (
     SourceValidationConfigs,
     source_connector_validation,
@@ -83,7 +87,7 @@ def get_pat_env_data() -> PATEnvData:
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
 @requires_env(
     "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
 )
@@ -115,7 +119,7 @@ async def test_volumes_native_source(tmp_path: Path):
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
 @requires_env("DATABRICKS_HOST", "DATABRICKS_PAT", "DATABRICKS_CATALOG")
 async def test_volumes_native_source_pat(tmp_path: Path):
     env_data = get_pat_env_data()
@@ -144,7 +148,7 @@ async def test_volumes_native_source_pat(tmp_path: Path):
     )
 
 
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
 @requires_env("DATABRICKS_HOST", "DATABRICKS_PAT", "DATABRICKS_CATALOG")
 def test_volumes_native_source_pat_invalid_catalog():
     env_data = get_pat_env_data()
@@ -162,7 +166,7 @@ def test_volumes_native_source_pat_invalid_catalog():
         _ = list(indexer.run())
 
 
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
 @requires_env("DATABRICKS_HOST")
 def test_volumes_native_source_pat_invalid_pat():
     host = os.environ["DATABRICKS_HOST"]
@@ -231,7 +235,7 @@ def validate_upload(client: WorkspaceClient, catalog: str, volume: str, volume_p
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, BLOB_STORAGE_TAG)
 @requires_env(
     "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
 )
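
Since @pytest.mark.tags takes its tags as positional arguments, selecting tests by tag presumably happens in a conftest.py hook rather than through pytest's built-in -m marker expressions. A hypothetical sketch of such wiring — none of it is taken from this diff, and the repo's actual hook and option names may differ:

# conftest.py -- hypothetical tag-filtering hook (not from this diff)
import pytest


def pytest_addoption(parser):
    parser.addoption(
        "--tags",
        action="store",
        default=None,
        help="comma-separated tags; only tests carrying all of them run",
    )


def pytest_collection_modifyitems(config, items):
    wanted = config.getoption("--tags")
    if not wanted:
        return
    wanted_tags = set(wanted.split(","))
    skip = pytest.mark.skip(reason=f"missing required tags: {wanted_tags}")
    for item in items:
        item_tags = set()
        for marker in item.iter_markers(name="tags"):
            item_tags.update(marker.args)  # positional args, e.g. SOURCE_TAG
        if not wanted_tags.issubset(item_tags):
            item.add_marker(skip)

Under that assumption, the blob-storage tests above could be selected with something like: pytest test/integration/connectors --tags blob_storage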
test/integration/connectors/discord/test_discord.py

@@ -6,7 +6,7 @@ from typing import Optional
 
 import pytest
 
-from test.integration.connectors.utils.constants import SOURCE_TAG
+from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
 from test.integration.connectors.utils.validation.source import (
     SourceValidationConfigs,
     source_connector_validation,
@@ -38,7 +38,7 @@ def get_env_data() -> EnvData:
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
 @requires_env("DISCORD_TOKEN", "DISCORD_CHANNELS")
 async def test_discord_source():
     env = get_env_data()
@@ -66,7 +66,7 @@ async def test_discord_source():
     )
 
 
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
 @requires_env("DISCORD_CHANNELS")
 def test_discord_source_precheck_fail_no_token():
     indexer_config = DiscordIndexerConfig(channels=get_env_data().channels)
@@ -77,7 +77,7 @@ def test_discord_source_precheck_fail_no_token():
         indexer.precheck()
 
 
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
 @requires_env("DISCORD_TOKEN")
 def test_discord_source_precheck_fail_no_channels():
     indexer_config = DiscordIndexerConfig(channels=[])
test/integration/connectors/duckdb/test_duckdb.py

@@ -5,7 +5,7 @@ import duckdb
 import pytest
 from _pytest.fixtures import TopRequest
 
-from test.integration.connectors.utils.constants import DESTINATION_TAG
+from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
 from test.integration.connectors.utils.validation.destination import (
     StagerValidationConfigs,
     stager_validation,
@@ -46,7 +46,7 @@ def validate_duckdb_destination(db_path: Path, expected_num_elements: int):
     conn.close()
 
 
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "duckdb")
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "duckdb", SQL_TAG)
 def test_duckdb_destination(upload_file: Path, provisioned_db_file: Path, temp_dir: Path):
     file_data = FileData(
         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
@@ -73,6 +73,7 @@ def test_duckdb_destination(upload_file: Path, provisioned_db_file: Path, temp_d
     validate_duckdb_destination(db_path=provisioned_db_file, expected_num_elements=len(data))
 
 
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "duckdb", SQL_TAG)
 @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
 def test_duckdb_stager(
     request: TopRequest,
test/integration/connectors/duckdb/test_motherduck.py

@@ -7,7 +7,7 @@ import duckdb
 import pandas as pd
 import pytest
 
-from test.integration.connectors.utils.constants import DESTINATION_TAG
+from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
 from test.integration.utils import requires_env
 from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.duckdb.motherduck import (
@@ -61,7 +61,7 @@ def validate_motherduck_destination(database: str, expected_num_elements: int, m
     conn.close()
 
 
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
 @requires_env("MOTHERDUCK_TOKEN")
 def test_motherduck_destination(
     md_token: str, upload_file: Path, provisioned_db: str, temp_dir: Path
test/integration/connectors/elasticsearch/test_elasticsearch.py

@@ -15,7 +15,7 @@ from _pytest.fixtures import TopRequest
 from elasticsearch import Elasticsearch as ElasticsearchClient
 from elasticsearch.helpers import bulk
 
-from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
+from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, NOSQL_TAG
 from test.integration.connectors.utils.docker import HealthCheck, container_context
 from test.integration.connectors.utils.validation.source import (
     SourceValidationConfigs,
@@ -177,7 +177,7 @@ def destination_index(elasticsearch_elements_mapping: dict) -> str:
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
 async def test_elasticsearch_source(source_index: str, movies_dataframe: pd.DataFrame):
     indexer_config = ElasticsearchIndexerConfig(index_name=source_index)
     with tempfile.TemporaryDirectory() as tempdir:
@@ -207,7 +207,7 @@ async def test_elasticsearch_source(source_index: str, movies_dataframe: pd.Data
     )
 
 
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
 def test_elasticsearch_source_precheck_fail_no_cluster():
     indexer_config = ElasticsearchIndexerConfig(index_name="index")
 
@@ -221,7 +221,7 @@ def test_elasticsearch_source_precheck_fail_no_cluster():
         indexer.precheck()
 
 
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
 def test_elasticsearch_source_precheck_fail_no_index(source_index: str):
     indexer_config = ElasticsearchIndexerConfig(index_name="index")
 
@@ -236,7 +236,7 @@ def test_elasticsearch_source_precheck_fail_no_index(source_index: str):
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
 async def test_elasticsearch_destination(
     upload_file: Path,
     destination_index: str,
@@ -282,7 +282,7 @@ async def test_elasticsearch_destination(
     validate_count(client=client, expected_count=expected_count, index_name=destination_index)
 
 
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
 def test_elasticsearch_destination_precheck_fail():
     connection_config = ElasticsearchConnectionConfig(
         access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
@@ -297,7 +297,7 @@ def test_elasticsearch_destination_precheck_fail():
         uploader.precheck()
 
 
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
 def test_elasticsearch_destination_precheck_fail_no_index(destination_index: str):
     connection_config = ElasticsearchConnectionConfig(
         access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
@@ -312,6 +312,7 @@ def test_elasticsearch_destination_precheck_fail_no_index(destination_index: str
         uploader.precheck()
 
 
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
 @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
 def test_elasticsearch_stager(
     request: TopRequest,
test/integration/connectors/elasticsearch/test_opensearch.py

@@ -10,7 +10,7 @@ import pytest
 from _pytest.fixtures import TopRequest
 from opensearchpy import Document, Keyword, OpenSearch, Text
 
-from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
+from test.integration.connectors.utils.constants import DESTINATION_TAG, NOSQL_TAG, SOURCE_TAG
 from test.integration.connectors.utils.docker import HealthCheck, container_context
 from test.integration.connectors.utils.validation.destination import (
     StagerValidationConfigs,
@@ -166,7 +166,7 @@ def destination_index(opensearch_elements_mapping: dict) -> str:
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
 async def test_opensearch_source(source_index: str, movies_dataframe: pd.DataFrame):
     indexer_config = OpenSearchIndexerConfig(index_name=source_index)
     with tempfile.TemporaryDirectory() as tempdir:
@@ -197,7 +197,7 @@ async def test_opensearch_source(source_index: str, movies_dataframe: pd.DataFra
     )
 
 
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
 def test_opensearch_source_precheck_fail_no_cluster():
     indexer_config = OpenSearchIndexerConfig(index_name="index")
 
@@ -212,7 +212,7 @@ def test_opensearch_source_precheck_fail_no_cluster():
         indexer.precheck()
 
 
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
 def test_opensearch_source_precheck_fail_no_index(source_index: str):
     indexer_config = OpenSearchIndexerConfig(index_name="index")
 
@@ -228,7 +228,7 @@ def test_opensearch_source_precheck_fail_no_index(source_index: str):
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
 async def test_opensearch_destination(
     upload_file: Path,
     destination_index: str,
@@ -275,7 +275,7 @@ async def test_opensearch_destination(
     validate_count(client=client, expected_count=expected_count, index_name=destination_index)
 
 
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
 def test_opensearch_destination_precheck_fail():
     connection_config = OpenSearchConnectionConfig(
         access_config=OpenSearchAccessConfig(password="admin"),
@@ -291,7 +291,7 @@ def test_opensearch_destination_precheck_fail():
         uploader.precheck()
 
 
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
 def test_opensearch_destination_precheck_fail_no_index(destination_index: str):
     connection_config = OpenSearchConnectionConfig(
         access_config=OpenSearchAccessConfig(password="admin"),
@@ -307,6 +307,7 @@ def test_opensearch_destination_precheck_fail_no_index(destination_index: str):
         uploader.precheck()
 
 
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
 @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
 def test_opensearch_stager(
     request: TopRequest,
test/integration/connectors/sql/test_databricks_delta_tables.py (new file)

@@ -0,0 +1,142 @@
+import json
+import os
+import time
+from contextlib import contextmanager
+from pathlib import Path
+from uuid import uuid4
+
+import pytest
+from databricks.sql import connect
+from databricks.sql.client import Connection as DeltaTableConnection
+from databricks.sql.client import Cursor as DeltaTableCursor
+from pydantic import BaseModel, SecretStr
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG, env_setup_path
+from test.integration.utils import requires_env
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
+    CONNECTOR_TYPE,
+    DatabrickDeltaTablesAccessConfig,
+    DatabrickDeltaTablesConnectionConfig,
+    DatabrickDeltaTablesUploader,
+    DatabrickDeltaTablesUploaderConfig,
+    DatabrickDeltaTablesUploadStager,
+)
+
+CATALOG = "utic-dev-tech-fixtures"
+
+
+class EnvData(BaseModel):
+    server_hostname: str
+    http_path: str
+    access_token: SecretStr
+
+
+def get_env_data() -> EnvData:
+    return EnvData(
+        server_hostname=os.environ["DATABRICKS_SERVER_HOSTNAME"],
+        http_path=os.environ["DATABRICKS_HTTP_PATH"],
+        access_token=os.environ["DATABRICKS_ACCESS_TOKEN"],
+    )
+
+
+def get_destination_schema(new_table_name: str) -> str:
+    p = Path(env_setup_path / "sql" / "databricks_delta_tables" / "destination" / "schema.sql")
+    with p.open() as f:
+        data_lines = f.readlines()
+    data_lines[0] = data_lines[0].replace("elements", new_table_name)
+    data = "".join([line.strip() for line in data_lines])
+    return data
+
+
+@contextmanager
+def get_connection() -> DeltaTableConnection:
+    env_data = get_env_data()
+    with connect(
+        server_hostname=env_data.server_hostname,
+        http_path=env_data.http_path,
+        access_token=env_data.access_token.get_secret_value(),
+    ) as connection:
+        yield connection
+
+
+@contextmanager
+def get_cursor() -> DeltaTableCursor:
+    with get_connection() as connection:
+        with connection.cursor() as cursor:
+            cursor.execute(f"USE CATALOG '{CATALOG}'")
+            yield cursor
+
+
+@pytest.fixture
+def destination_table() -> str:
+    random_id = str(uuid4())[:8]
+    table_name = f"elements_{random_id}"
+    destination_schema = get_destination_schema(new_table_name=table_name)
+    with get_cursor() as cursor:
+        logger.info(f"creating table: {table_name}")
+        cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
+        cursor.execute(destination_schema)
+
+    yield table_name
+    with get_cursor() as cursor:
+        logger.info(f"dropping table: {table_name}")
+        cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
+
+
+def validate_destination(expected_num_elements: int, table_name: str, retries=30, interval=1):
+    with get_cursor() as cursor:
+        for i in range(retries):
+            cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
+            count = cursor.fetchone()[0]
+            if count == expected_num_elements:
+                break
+            logger.info(f"retry attempt {i}: expected {expected_num_elements} != count {count}")
+            time.sleep(interval)
+        assert (
+            count == expected_num_elements
+        ), f"dest check failed: got {count}, expected {expected_num_elements}"
+
+
+@pytest.mark.asyncio
+@pytest.mark.skip("Resources take too long to spin up to run in CI")
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
+@requires_env("DATABRICKS_SERVER_HOSTNAME", "DATABRICKS_HTTP_PATH", "DATABRICKS_ACCESS_TOKEN")
+async def test_databricks_delta_tables_destination(
+    upload_file: Path, temp_dir: Path, destination_table: str
+):
+    env_data = get_env_data()
+    mock_file_data = FileData(
+        identifier="mock file data",
+        connector_type=CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
+    )
+    stager = DatabrickDeltaTablesUploadStager()
+    staged_path = stager.run(
+        elements_filepath=upload_file,
+        file_data=mock_file_data,
+        output_dir=temp_dir,
+        output_filename=upload_file.name,
+    )
+
+    assert staged_path.suffix == upload_file.suffix
+
+    uploader = DatabrickDeltaTablesUploader(
+        connection_config=DatabrickDeltaTablesConnectionConfig(
+            access_config=DatabrickDeltaTablesAccessConfig(
+                token=env_data.access_token.get_secret_value()
+            ),
+            http_path=env_data.http_path,
+            server_hostname=env_data.server_hostname,
+        ),
+        upload_config=DatabrickDeltaTablesUploaderConfig(
+            catalog=CATALOG, database="default", table_name=destination_table
+        ),
+    )
+    with staged_path.open("r") as f:
+        staged_data = json.load(f)
+    expected_num_elements = len(staged_data)
+    uploader.precheck()
+    uploader.run(path=staged_path, file_data=mock_file_data)
+    validate_destination(expected_num_elements=expected_num_elements, table_name=destination_table)
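
The fixtures in this new test are thin wrappers around databricks-sql-connector, which the new connector drives. A minimal standalone sketch of the same connection pattern, with placeholder hostname, HTTP path, and token values:

# Standalone sketch of the databricks.sql pattern the test fixtures wrap;
# all credential values below are placeholders, not taken from the diff.
from databricks.sql import connect

with connect(
    server_hostname="adb-1234567890123456.7.azuredatabricks.net",  # placeholder
    http_path="/sql/1.0/warehouses/abc123def456",  # placeholder
    access_token="dapi-placeholder-token",  # placeholder
) as connection:
    with connection.cursor() as cursor:
        cursor.execute("USE CATALOG 'utic-dev-tech-fixtures'")  # quoting as in the test above
        cursor.execute("SELECT COUNT(*) FROM elements")
        print(cursor.fetchone()[0])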
test/integration/connectors/sql/test_postgres.py

@@ -5,7 +5,12 @@ import pytest
 from _pytest.fixtures import TopRequest
 from psycopg2 import connect
 
-from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
+from test.integration.connectors.utils.constants import (
+    DESTINATION_TAG,
+    SOURCE_TAG,
+    SQL_TAG,
+    env_setup_path,
+)
 from test.integration.connectors.utils.docker_compose import docker_compose_context
 from test.integration.connectors.utils.validation.destination import (
     StagerValidationConfigs,
@@ -51,7 +56,7 @@ def source_database_setup() -> str:
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, SQL_TAG)
 async def test_postgres_source(temp_dir: Path, source_database_setup: str):
     connect_params = {
         "host": "localhost",
@@ -115,7 +120,7 @@ def validate_destination(
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
 async def test_postgres_destination(upload_file: Path, temp_dir: Path):
     # the postgres destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
@@ -179,6 +184,7 @@ async def test_postgres_destination(upload_file: Path, temp_dir: Path):
     )
 
 
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
 @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
 def test_postgres_stager(
     request: TopRequest,
test/integration/connectors/sql/test_singlestore.py

@@ -5,7 +5,12 @@ import pytest
 import singlestoredb as s2
 from _pytest.fixtures import TopRequest
 
-from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
+from test.integration.connectors.utils.constants import (
+    DESTINATION_TAG,
+    SOURCE_TAG,
+    SQL_TAG,
+    env_setup_path,
+)
 from test.integration.connectors.utils.docker_compose import docker_compose_context
 from test.integration.connectors.utils.validation.destination import (
     StagerValidationConfigs,
@@ -54,7 +59,7 @@ def source_database_setup() -> dict:
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, SQL_TAG)
 async def test_singlestore_source(temp_dir: Path, source_database_setup: dict):
 
     connection_config = SingleStoreConnectionConfig(
@@ -101,7 +106,7 @@ def validate_destination(
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
 async def test_singlestore_destination(upload_file: Path, temp_dir: Path):
     mock_file_data = FileData(
         identifier="mock file data",
@@ -160,6 +165,7 @@ async def test_singlestore_destination(upload_file: Path, temp_dir: Path):
     )
 
 
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
 @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
 def test_singlestore_stager(
     request: TopRequest,
test/integration/connectors/sql/test_snowflake.py

@@ -6,7 +6,12 @@ import pytest
 import snowflake.connector as sf
 from _pytest.fixtures import TopRequest
 
-from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
+from test.integration.connectors.utils.constants import (
+    DESTINATION_TAG,
+    SOURCE_TAG,
+    SQL_TAG,
+    env_setup_path,
+)
 from test.integration.connectors.utils.docker import container_context
 from test.integration.connectors.utils.validation.destination import (
     StagerValidationConfigs,
@@ -109,7 +114,7 @@ def destination_database_setup() -> dict:
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, SQL_TAG)
 @requires_env("LOCALSTACK_AUTH_TOKEN")
 async def test_snowflake_source(temp_dir: Path, source_database_setup: dict):
     connection_config = SnowflakeConnectionConfig(
@@ -163,7 +168,7 @@ def validate_destination(
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
 @requires_env("LOCALSTACK_AUTH_TOKEN")
 async def test_snowflake_destination(
     upload_file: Path, temp_dir: Path, destination_database_setup: dict
@@ -222,6 +227,7 @@ async def test_snowflake_destination(
     )
 
 
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
 @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
 def test_snowflake_stager(
     request: TopRequest,
test/integration/connectors/sql/test_sqlite.py

@@ -6,7 +6,12 @@ from pathlib import Path
 import pytest
 from _pytest.fixtures import TopRequest
 
-from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
+from test.integration.connectors.utils.constants import (
+    DESTINATION_TAG,
+    SOURCE_TAG,
+    SQL_TAG,
+    env_setup_path,
+)
 from test.integration.connectors.utils.validation.destination import (
     StagerValidationConfigs,
     stager_validation,
@@ -52,7 +57,7 @@ def source_database_setup() -> Path:
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, SQL_TAG)
 async def test_sqlite_source(source_database_setup: Path, temp_dir: Path):
     connection_config = SQLiteConnectionConfig(database_path=source_database_setup)
     indexer = SQLiteIndexer(
@@ -110,7 +115,7 @@ def validate_destination(db_path: Path, expected_num_elements: int):
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
 async def test_sqlite_destination(
     upload_file: Path, temp_dir: Path, destination_database_setup: Path
 ):
@@ -146,6 +151,7 @@ async def test_sqlite_destination(
     validate_destination(db_path=destination_database_setup, expected_num_elements=len(staged_data))
 
 
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
 @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
 def test_sqlite_stager(
     request: TopRequest,
test/integration/connectors/test_astradb.py

@@ -9,7 +9,7 @@ from _pytest.fixtures import TopRequest
 from astrapy import Collection
 from astrapy import DataAPIClient as AstraDBClient
 
-from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
+from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, VECTOR_DB_TAG
 from test.integration.connectors.utils.validation.destination import (
     StagerValidationConfigs,
     stager_validation,
@@ -49,9 +49,9 @@ def connection_config() -> AstraDBConnectionConfig:
     )
 
 
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, DESTINATION_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, VECTOR_DB_TAG)
 @requires_env("ASTRA_DB_APPLICATION_TOKEN", "ASTRA_DB_API_ENDPOINT")
-def test_precheck_succeeds(connection_config: AstraDBConnectionConfig):
+def test_precheck_succeeds_indexer(connection_config: AstraDBConnectionConfig):
     indexer = AstraDBIndexer(
         connection_config=connection_config,
         index_config=AstraDBIndexerConfig(collection_name=EXISTENT_COLLECTION_NAME),
@@ -64,19 +64,34 @@ def test_precheck_succeeds(connection_config: AstraDBConnectionConfig):
     uploader.precheck()
 
 
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, DESTINATION_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
 @requires_env("ASTRA_DB_APPLICATION_TOKEN", "ASTRA_DB_API_ENDPOINT")
-def test_precheck_fails(connection_config: AstraDBConnectionConfig):
+def test_precheck_succeeds_uploader(connection_config: AstraDBConnectionConfig):
+    uploader = AstraDBUploader(
+        connection_config=connection_config,
+        upload_config=AstraDBUploaderConfig(collection_name=EXISTENT_COLLECTION_NAME),
+    )
+    uploader.precheck()
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, VECTOR_DB_TAG)
+@requires_env("ASTRA_DB_APPLICATION_TOKEN", "ASTRA_DB_API_ENDPOINT")
+def test_precheck_fails_indexer(connection_config: AstraDBConnectionConfig):
     indexer = AstraDBIndexer(
         connection_config=connection_config,
         index_config=AstraDBIndexerConfig(collection_name=NONEXISTENT_COLLECTION_NAME),
     )
+    with pytest.raises(expected_exception=SourceConnectionError):
+        indexer.precheck()
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
+@requires_env("ASTRA_DB_APPLICATION_TOKEN", "ASTRA_DB_API_ENDPOINT")
+def test_precheck_fails_uploader(connection_config: AstraDBConnectionConfig):
     uploader = AstraDBUploader(
         connection_config=connection_config,
         upload_config=AstraDBUploaderConfig(collection_name=NONEXISTENT_COLLECTION_NAME),
     )
-    with pytest.raises(expected_exception=SourceConnectionError):
-        indexer.precheck()
     with pytest.raises(expected_exception=DestinationConnectionError):
         uploader.precheck()
 
@@ -117,7 +132,7 @@ def collection(upload_file: Path) -> Collection:
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, VECTOR_DB_TAG)
 @requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
 async def test_astra_search_source(
     tmp_path: Path,
@@ -151,7 +166,7 @@ async def test_astra_search_source(
 
 
 @pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
 @requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
 async def test_astra_search_destination(
     upload_file: Path,
@@ -201,6 +216,7 @@ async def test_astra_search_destination(
     )
 
 
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
 @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
 def test_astra_stager(
     request: TopRequest,