unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/embed/mixedbreadai.py +0 -1
- unstructured_ingest/interfaces/upload_stager.py +2 -2
- unstructured_ingest/interfaces/uploader.py +3 -3
- unstructured_ingest/main.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/processes/chunker.py +4 -0
- unstructured_ingest/processes/connectors/airtable.py +4 -2
- unstructured_ingest/processes/connectors/astradb.py +2 -2
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/confluence.py +0 -1
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
- unstructured_ingest/processes/connectors/delta_table.py +1 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
- unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -2
- unstructured_ingest/processes/connectors/google_drive.py +0 -2
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
- unstructured_ingest/processes/connectors/kdbai.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +1 -2
- unstructured_ingest/processes/connectors/pinecone.py +0 -1
- unstructured_ingest/processes/connectors/redisdb.py +28 -24
- unstructured_ingest/processes/connectors/salesforce.py +1 -1
- unstructured_ingest/processes/connectors/slack.py +1 -2
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
- unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
- unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
- unstructured_ingest/processes/connectors/sql/sql.py +3 -4
- unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
- unstructured_ingest/processes/connectors/vectara.py +0 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
- unstructured_ingest/processes/embedder.py +2 -2
- unstructured_ingest/processes/filter.py +1 -1
- unstructured_ingest/processes/partitioner.py +4 -0
- unstructured_ingest/processes/utils/blob_storage.py +2 -2
- unstructured_ingest/unstructured_api.py +13 -8
- unstructured_ingest/utils/data_prep.py +8 -32
- unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +50 -184
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
- examples/__init__.py +0 -0
- examples/airtable.py +0 -44
- examples/azure_cognitive_search.py +0 -55
- examples/chroma.py +0 -54
- examples/couchbase.py +0 -55
- examples/databricks_volumes_dest.py +0 -55
- examples/databricks_volumes_source.py +0 -53
- examples/delta_table.py +0 -45
- examples/discord_example.py +0 -36
- examples/elasticsearch.py +0 -49
- examples/google_drive.py +0 -45
- examples/kdbai.py +0 -54
- examples/local.py +0 -36
- examples/milvus.py +0 -44
- examples/mongodb.py +0 -53
- examples/opensearch.py +0 -50
- examples/pinecone.py +0 -57
- examples/s3.py +0 -38
- examples/salesforce.py +0 -44
- examples/sharepoint.py +0 -47
- examples/singlestore.py +0 -49
- examples/sql.py +0 -90
- examples/vectara.py +0 -54
- examples/weaviate.py +0 -44
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +0 -31
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +0 -38
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +0 -273
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +0 -90
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +0 -14
- test/integration/connectors/duckdb/test_duckdb.py +0 -90
- test/integration/connectors/duckdb/test_motherduck.py +0 -95
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +0 -34
- test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
- test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
- test/integration/connectors/sql/test_postgres.py +0 -201
- test/integration/connectors/sql/test_singlestore.py +0 -182
- test/integration/connectors/sql/test_snowflake.py +0 -244
- test/integration/connectors/sql/test_sqlite.py +0 -168
- test/integration/connectors/sql/test_vastdb.py +0 -34
- test/integration/connectors/test_astradb.py +0 -287
- test/integration/connectors/test_azure_ai_search.py +0 -254
- test/integration/connectors/test_chroma.py +0 -136
- test/integration/connectors/test_confluence.py +0 -111
- test/integration/connectors/test_delta_table.py +0 -183
- test/integration/connectors/test_dropbox.py +0 -151
- test/integration/connectors/test_github.py +0 -49
- test/integration/connectors/test_google_drive.py +0 -257
- test/integration/connectors/test_jira.py +0 -67
- test/integration/connectors/test_lancedb.py +0 -247
- test/integration/connectors/test_milvus.py +0 -208
- test/integration/connectors/test_mongodb.py +0 -335
- test/integration/connectors/test_neo4j.py +0 -244
- test/integration/connectors/test_notion.py +0 -152
- test/integration/connectors/test_onedrive.py +0 -163
- test/integration/connectors/test_pinecone.py +0 -387
- test/integration/connectors/test_qdrant.py +0 -216
- test/integration/connectors/test_redis.py +0 -143
- test/integration/connectors/test_s3.py +0 -184
- test/integration/connectors/test_sharepoint.py +0 -222
- test/integration/connectors/test_vectara.py +0 -282
- test/integration/connectors/test_zendesk.py +0 -120
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +0 -13
- test/integration/connectors/utils/docker.py +0 -151
- test/integration/connectors/utils/docker_compose.py +0 -59
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +0 -77
- test/integration/connectors/utils/validation/equality.py +0 -76
- test/integration/connectors/utils/validation/source.py +0 -331
- test/integration/connectors/utils/validation/utils.py +0 -36
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +0 -15
- test/integration/connectors/weaviate/test_cloud.py +0 -39
- test/integration/connectors/weaviate/test_local.py +0 -152
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +0 -13
- test/integration/embedders/test_azure_openai.py +0 -57
- test/integration/embedders/test_bedrock.py +0 -103
- test/integration/embedders/test_huggingface.py +0 -24
- test/integration/embedders/test_mixedbread.py +0 -71
- test/integration/embedders/test_octoai.py +0 -75
- test/integration/embedders/test_openai.py +0 -74
- test/integration/embedders/test_togetherai.py +0 -71
- test/integration/embedders/test_vertexai.py +0 -63
- test/integration/embedders/test_voyageai.py +0 -79
- test/integration/embedders/utils.py +0 -66
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +0 -76
- test/integration/utils.py +0 -15
- test/unit/__init__.py +0 -0
- test/unit/chunkers/__init__.py +0 -0
- test/unit/chunkers/test_chunkers.py +0 -49
- test/unit/connectors/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
- test/unit/connectors/motherduck/__init__.py +0 -0
- test/unit/connectors/motherduck/test_base.py +0 -73
- test/unit/connectors/sql/__init__.py +0 -0
- test/unit/connectors/sql/test_sql.py +0 -152
- test/unit/connectors/test_confluence.py +0 -71
- test/unit/connectors/test_jira.py +0 -401
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +0 -42
- test/unit/embed/test_octoai.py +0 -27
- test/unit/embed/test_openai.py +0 -28
- test/unit/embed/test_vertexai.py +0 -25
- test/unit/embed/test_voyageai.py +0 -24
- test/unit/embedders/__init__.py +0 -0
- test/unit/embedders/test_bedrock.py +0 -36
- test/unit/embedders/test_huggingface.py +0 -48
- test/unit/embedders/test_mixedbread.py +0 -37
- test/unit/embedders/test_octoai.py +0 -35
- test/unit/embedders/test_openai.py +0 -35
- test/unit/embedders/test_togetherai.py +0 -37
- test/unit/embedders/test_vertexai.py +0 -37
- test/unit/embedders/test_voyageai.py +0 -38
- test/unit/partitioners/__init__.py +0 -0
- test/unit/partitioners/test_partitioner.py +0 -63
- test/unit/test_error.py +0 -27
- test/unit/test_html.py +0 -112
- test/unit/test_interfaces.py +0 -26
- test/unit/test_utils.py +0 -220
- test/unit/utils/__init__.py +0 -0
- test/unit/utils/data_generator.py +0 -32
- unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
- unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
|
@@ -1,326 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import tempfile
|
|
3
|
-
import time
|
|
4
|
-
from contextlib import contextmanager
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
from typing import Generator
|
|
7
|
-
|
|
8
|
-
import pandas as pd
|
|
9
|
-
import pytest
|
|
10
|
-
from _pytest.fixtures import TopRequest
|
|
11
|
-
from opensearchpy import Document, Keyword, OpenSearch, Text
|
|
12
|
-
|
|
13
|
-
from test.integration.connectors.utils.constants import DESTINATION_TAG, NOSQL_TAG, SOURCE_TAG
|
|
14
|
-
from test.integration.connectors.utils.docker import HealthCheck, container_context
|
|
15
|
-
from test.integration.connectors.utils.validation.destination import (
|
|
16
|
-
StagerValidationConfigs,
|
|
17
|
-
stager_validation,
|
|
18
|
-
)
|
|
19
|
-
from test.integration.connectors.utils.validation.source import (
|
|
20
|
-
SourceValidationConfigs,
|
|
21
|
-
source_connector_validation,
|
|
22
|
-
)
|
|
23
|
-
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
24
|
-
from unstructured_ingest.error import (
|
|
25
|
-
DestinationConnectionError,
|
|
26
|
-
SourceConnectionError,
|
|
27
|
-
)
|
|
28
|
-
from unstructured_ingest.processes.connectors.elasticsearch.opensearch import (
|
|
29
|
-
CONNECTOR_TYPE,
|
|
30
|
-
OpenSearchAccessConfig,
|
|
31
|
-
OpenSearchConnectionConfig,
|
|
32
|
-
OpenSearchDownloader,
|
|
33
|
-
OpenSearchDownloaderConfig,
|
|
34
|
-
OpenSearchIndexer,
|
|
35
|
-
OpenSearchIndexerConfig,
|
|
36
|
-
OpenSearchUploader,
|
|
37
|
-
OpenSearchUploaderConfig,
|
|
38
|
-
OpenSearchUploadStager,
|
|
39
|
-
OpenSearchUploadStagerConfig,
|
|
40
|
-
)
|
|
41
|
-
|
|
42
|
-
SOURCE_INDEX_NAME = "movies"
|
|
43
|
-
DESTINATION_INDEX_NAME = "elements"
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
class Movie(Document):
|
|
47
|
-
title = Text(fields={"raw": Keyword()})
|
|
48
|
-
year = Text()
|
|
49
|
-
director = Text()
|
|
50
|
-
cast = Text()
|
|
51
|
-
genre = Text()
|
|
52
|
-
wiki_page = Text()
|
|
53
|
-
ethnicity = Text()
|
|
54
|
-
plot = Text()
|
|
55
|
-
|
|
56
|
-
class Index:
|
|
57
|
-
name = SOURCE_INDEX_NAME
|
|
58
|
-
|
|
59
|
-
def save(self, **kwargs):
|
|
60
|
-
return super(Movie, self).save(**kwargs)
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
@contextmanager
|
|
64
|
-
def get_client() -> Generator[OpenSearch, None, None]:
|
|
65
|
-
with OpenSearch(
|
|
66
|
-
hosts=[{"host": "localhost", "port": 9200}],
|
|
67
|
-
http_auth=("admin", "admin"),
|
|
68
|
-
use_ssl=True,
|
|
69
|
-
verify_certs=False,
|
|
70
|
-
ssl_show_warn=False,
|
|
71
|
-
) as client:
|
|
72
|
-
yield client
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
def get_index_count(client: OpenSearch, index_name: str) -> int:
|
|
76
|
-
count_resp = client.cat.count(index=index_name, params={"format": "json"})
|
|
77
|
-
return int(count_resp[0]["count"])
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
def wait_for_write(
|
|
81
|
-
client: OpenSearch, index_name: str, expected_count: int, timeout: int = 30, interval: int = 1
|
|
82
|
-
) -> None:
|
|
83
|
-
current_count = get_index_count(client, index_name)
|
|
84
|
-
start = time.time()
|
|
85
|
-
while time.time() - start < timeout:
|
|
86
|
-
print(f"waiting for current count ({current_count}) to match expected {expected_count}")
|
|
87
|
-
time.sleep(interval)
|
|
88
|
-
current_count = get_index_count(client, index_name)
|
|
89
|
-
if current_count == expected_count:
|
|
90
|
-
return
|
|
91
|
-
raise TimeoutError("Timed out while waiting for write to sync")
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
def validate_count(
|
|
95
|
-
client: OpenSearch, index_name: str, expected_count: int, retries: int = 10, interval: int = 1
|
|
96
|
-
) -> None:
|
|
97
|
-
current_count = get_index_count(client, index_name)
|
|
98
|
-
if current_count == expected_count:
|
|
99
|
-
return
|
|
100
|
-
tries = 0
|
|
101
|
-
while tries < retries:
|
|
102
|
-
print(
|
|
103
|
-
f"retrying validation to check if expected count "
|
|
104
|
-
f"{expected_count} will match current count {current_count}"
|
|
105
|
-
)
|
|
106
|
-
time.sleep(interval)
|
|
107
|
-
current_count = get_index_count(client, index_name)
|
|
108
|
-
if current_count == expected_count:
|
|
109
|
-
break
|
|
110
|
-
assert current_count == expected_count, (
|
|
111
|
-
f"Expected count ({expected_count}) doesn't match how "
|
|
112
|
-
f"much came back from index: {current_count}"
|
|
113
|
-
)
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
@pytest.fixture
|
|
117
|
-
def source_index(movies_dataframe: pd.DataFrame) -> str:
|
|
118
|
-
with container_context(
|
|
119
|
-
image="opensearchproject/opensearch:2.11.1",
|
|
120
|
-
ports={9200: 9200, 9600: 9600},
|
|
121
|
-
environment={"discovery.type": "single-node"},
|
|
122
|
-
healthcheck=HealthCheck(
|
|
123
|
-
test="curl --fail https://localhost:9200/_cat/health -ku 'admin:admin' >/dev/null || exit 1", # noqa: E501
|
|
124
|
-
interval=1,
|
|
125
|
-
),
|
|
126
|
-
):
|
|
127
|
-
with get_client() as client:
|
|
128
|
-
Movie.init(using=client)
|
|
129
|
-
for i, row in movies_dataframe.iterrows():
|
|
130
|
-
movie = Movie(
|
|
131
|
-
meta={"id": i},
|
|
132
|
-
title=row["Title"],
|
|
133
|
-
year=row["Release Year"],
|
|
134
|
-
director=row["Director"],
|
|
135
|
-
cast=row["Cast"],
|
|
136
|
-
genre=row["Genre"],
|
|
137
|
-
wiki_page=row["Wiki Page"],
|
|
138
|
-
ethnicity=row["Origin/Ethnicity"],
|
|
139
|
-
plot=row["Plot"],
|
|
140
|
-
)
|
|
141
|
-
movie.save(using=client)
|
|
142
|
-
wait_for_write(
|
|
143
|
-
client=client, index_name=SOURCE_INDEX_NAME, expected_count=len(movies_dataframe)
|
|
144
|
-
)
|
|
145
|
-
yield SOURCE_INDEX_NAME
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
@pytest.fixture
|
|
149
|
-
def destination_index(opensearch_elements_mapping: dict) -> str:
|
|
150
|
-
with container_context(
|
|
151
|
-
image="opensearchproject/opensearch:2.11.1",
|
|
152
|
-
ports={9200: 9200, 9600: 9600},
|
|
153
|
-
environment={"discovery.type": "single-node"},
|
|
154
|
-
healthcheck=HealthCheck(
|
|
155
|
-
test="curl --fail https://localhost:9200/_cat/health -ku 'admin:admin' >/dev/null || exit 1", # noqa: E501
|
|
156
|
-
interval=1,
|
|
157
|
-
),
|
|
158
|
-
):
|
|
159
|
-
with get_client() as client:
|
|
160
|
-
response = client.indices.create(
|
|
161
|
-
index=DESTINATION_INDEX_NAME, body=opensearch_elements_mapping
|
|
162
|
-
)
|
|
163
|
-
if not response["acknowledged"]:
|
|
164
|
-
raise RuntimeError(f"failed to create index: {response}")
|
|
165
|
-
yield DESTINATION_INDEX_NAME
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
@pytest.mark.asyncio
|
|
169
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
|
|
170
|
-
async def test_opensearch_source(source_index: str, movies_dataframe: pd.DataFrame):
|
|
171
|
-
indexer_config = OpenSearchIndexerConfig(index_name=source_index)
|
|
172
|
-
with tempfile.TemporaryDirectory() as tempdir:
|
|
173
|
-
tempdir_path = Path(tempdir)
|
|
174
|
-
connection_config = OpenSearchConnectionConfig(
|
|
175
|
-
access_config=OpenSearchAccessConfig(password="admin"),
|
|
176
|
-
username="admin",
|
|
177
|
-
hosts=["http://localhost:9200"],
|
|
178
|
-
use_ssl=True,
|
|
179
|
-
)
|
|
180
|
-
download_config = OpenSearchDownloaderConfig(download_dir=tempdir_path)
|
|
181
|
-
indexer = OpenSearchIndexer(
|
|
182
|
-
connection_config=connection_config, index_config=indexer_config
|
|
183
|
-
)
|
|
184
|
-
downloader = OpenSearchDownloader(
|
|
185
|
-
connection_config=connection_config, download_config=download_config
|
|
186
|
-
)
|
|
187
|
-
expected_num_files = len(movies_dataframe)
|
|
188
|
-
await source_connector_validation(
|
|
189
|
-
indexer=indexer,
|
|
190
|
-
downloader=downloader,
|
|
191
|
-
configs=SourceValidationConfigs(
|
|
192
|
-
test_id=CONNECTOR_TYPE,
|
|
193
|
-
expected_num_files=expected_num_files,
|
|
194
|
-
expected_number_indexed_file_data=1,
|
|
195
|
-
validate_downloaded_files=True,
|
|
196
|
-
),
|
|
197
|
-
)
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
|
|
201
|
-
def test_opensearch_source_precheck_fail_no_cluster():
|
|
202
|
-
indexer_config = OpenSearchIndexerConfig(index_name="index")
|
|
203
|
-
|
|
204
|
-
connection_config = OpenSearchConnectionConfig(
|
|
205
|
-
access_config=OpenSearchAccessConfig(password="admin"),
|
|
206
|
-
username="admin",
|
|
207
|
-
hosts=["http://localhost:9200"],
|
|
208
|
-
use_ssl=True,
|
|
209
|
-
)
|
|
210
|
-
indexer = OpenSearchIndexer(connection_config=connection_config, index_config=indexer_config)
|
|
211
|
-
with pytest.raises(SourceConnectionError):
|
|
212
|
-
indexer.precheck()
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
|
|
216
|
-
def test_opensearch_source_precheck_fail_no_index(source_index: str):
|
|
217
|
-
indexer_config = OpenSearchIndexerConfig(index_name="index")
|
|
218
|
-
|
|
219
|
-
connection_config = OpenSearchConnectionConfig(
|
|
220
|
-
access_config=OpenSearchAccessConfig(password="admin"),
|
|
221
|
-
username="admin",
|
|
222
|
-
hosts=["http://localhost:9200"],
|
|
223
|
-
use_ssl=True,
|
|
224
|
-
)
|
|
225
|
-
indexer = OpenSearchIndexer(connection_config=connection_config, index_config=indexer_config)
|
|
226
|
-
with pytest.raises(SourceConnectionError):
|
|
227
|
-
indexer.precheck()
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
@pytest.mark.asyncio
|
|
231
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
|
|
232
|
-
async def test_opensearch_destination(
|
|
233
|
-
upload_file: Path,
|
|
234
|
-
destination_index: str,
|
|
235
|
-
tmp_path: Path,
|
|
236
|
-
):
|
|
237
|
-
file_data = FileData(
|
|
238
|
-
source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
|
|
239
|
-
connector_type=CONNECTOR_TYPE,
|
|
240
|
-
identifier="mock file data",
|
|
241
|
-
)
|
|
242
|
-
connection_config = OpenSearchConnectionConfig(
|
|
243
|
-
access_config=OpenSearchAccessConfig(password="admin"),
|
|
244
|
-
username="admin",
|
|
245
|
-
hosts=["http://localhost:9200"],
|
|
246
|
-
use_ssl=True,
|
|
247
|
-
)
|
|
248
|
-
stager = OpenSearchUploadStager(
|
|
249
|
-
upload_stager_config=OpenSearchUploadStagerConfig(index_name=destination_index)
|
|
250
|
-
)
|
|
251
|
-
|
|
252
|
-
uploader = OpenSearchUploader(
|
|
253
|
-
connection_config=connection_config,
|
|
254
|
-
upload_config=OpenSearchUploaderConfig(index_name=destination_index),
|
|
255
|
-
)
|
|
256
|
-
staged_filepath = stager.run(
|
|
257
|
-
elements_filepath=upload_file,
|
|
258
|
-
file_data=file_data,
|
|
259
|
-
output_dir=tmp_path,
|
|
260
|
-
output_filename=upload_file.name,
|
|
261
|
-
)
|
|
262
|
-
uploader.precheck()
|
|
263
|
-
uploader.run(path=staged_filepath, file_data=file_data)
|
|
264
|
-
|
|
265
|
-
# Run validation
|
|
266
|
-
with staged_filepath.open() as f:
|
|
267
|
-
staged_elements = json.load(f)
|
|
268
|
-
expected_count = len(staged_elements)
|
|
269
|
-
with get_client() as client:
|
|
270
|
-
validate_count(client=client, expected_count=expected_count, index_name=destination_index)
|
|
271
|
-
|
|
272
|
-
# Rerun and make sure the same documents get updated
|
|
273
|
-
uploader.run(path=staged_filepath, file_data=file_data)
|
|
274
|
-
with get_client() as client:
|
|
275
|
-
validate_count(client=client, expected_count=expected_count, index_name=destination_index)
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
|
|
279
|
-
def test_opensearch_destination_precheck_fail():
|
|
280
|
-
connection_config = OpenSearchConnectionConfig(
|
|
281
|
-
access_config=OpenSearchAccessConfig(password="admin"),
|
|
282
|
-
username="admin",
|
|
283
|
-
hosts=["http://localhost:9200"],
|
|
284
|
-
use_ssl=True,
|
|
285
|
-
)
|
|
286
|
-
uploader = OpenSearchUploader(
|
|
287
|
-
connection_config=connection_config,
|
|
288
|
-
upload_config=OpenSearchUploaderConfig(index_name="index"),
|
|
289
|
-
)
|
|
290
|
-
with pytest.raises(DestinationConnectionError):
|
|
291
|
-
uploader.precheck()
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
|
|
295
|
-
def test_opensearch_destination_precheck_fail_no_index(destination_index: str):
|
|
296
|
-
connection_config = OpenSearchConnectionConfig(
|
|
297
|
-
access_config=OpenSearchAccessConfig(password="admin"),
|
|
298
|
-
username="admin",
|
|
299
|
-
hosts=["http://localhost:9200"],
|
|
300
|
-
use_ssl=True,
|
|
301
|
-
)
|
|
302
|
-
uploader = OpenSearchUploader(
|
|
303
|
-
connection_config=connection_config,
|
|
304
|
-
upload_config=OpenSearchUploaderConfig(index_name="index"),
|
|
305
|
-
)
|
|
306
|
-
with pytest.raises(DestinationConnectionError):
|
|
307
|
-
uploader.precheck()
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
|
|
311
|
-
@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
|
|
312
|
-
def test_opensearch_stager(
|
|
313
|
-
request: TopRequest,
|
|
314
|
-
upload_file_str: str,
|
|
315
|
-
tmp_path: Path,
|
|
316
|
-
):
|
|
317
|
-
upload_file: Path = request.getfixturevalue(upload_file_str)
|
|
318
|
-
stager = OpenSearchUploadStager(
|
|
319
|
-
upload_stager_config=OpenSearchUploadStagerConfig(index_name="mock_index")
|
|
320
|
-
)
|
|
321
|
-
stager_validation(
|
|
322
|
-
configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
|
|
323
|
-
input_file=upload_file,
|
|
324
|
-
stager=stager,
|
|
325
|
-
tmp_dir=tmp_path,
|
|
326
|
-
)
|
|
File without changes
|
|
@@ -1,170 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import os
|
|
3
|
-
import time
|
|
4
|
-
from contextlib import contextmanager
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
from uuid import uuid4
|
|
7
|
-
|
|
8
|
-
import pytest
|
|
9
|
-
from databricks.sql import connect
|
|
10
|
-
from databricks.sql.client import Connection as DeltaTableConnection
|
|
11
|
-
from databricks.sql.client import Cursor as DeltaTableCursor
|
|
12
|
-
from pydantic import BaseModel, Secret, SecretStr
|
|
13
|
-
from pytest_mock import MockerFixture
|
|
14
|
-
|
|
15
|
-
from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG, env_setup_path
|
|
16
|
-
from test.integration.utils import requires_env
|
|
17
|
-
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
18
|
-
from unstructured_ingest.logger import logger
|
|
19
|
-
from unstructured_ingest.processes.connectors.sql.databricks_delta_tables import (
|
|
20
|
-
CONNECTOR_TYPE,
|
|
21
|
-
DatabricksDeltaTablesAccessConfig,
|
|
22
|
-
DatabricksDeltaTablesConnectionConfig,
|
|
23
|
-
DatabricksDeltaTablesUploader,
|
|
24
|
-
DatabricksDeltaTablesUploaderConfig,
|
|
25
|
-
DatabricksDeltaTablesUploadStager,
|
|
26
|
-
)
|
|
27
|
-
|
|
28
|
-
CATALOG = "utic-dev-tech-fixtures"
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
class EnvData(BaseModel):
|
|
32
|
-
server_hostname: str
|
|
33
|
-
http_path: str
|
|
34
|
-
access_token: SecretStr
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
def get_env_data() -> EnvData:
|
|
38
|
-
return EnvData(
|
|
39
|
-
server_hostname=os.environ["DATABRICKS_SERVER_HOSTNAME"],
|
|
40
|
-
http_path=os.environ["DATABRICKS_HTTP_PATH"],
|
|
41
|
-
access_token=os.environ["DATABRICKS_ACCESS_TOKEN"],
|
|
42
|
-
)
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
def get_destination_schema(new_table_name: str) -> str:
|
|
46
|
-
p = Path(env_setup_path / "sql" / "databricks_delta_tables" / "destination" / "schema.sql")
|
|
47
|
-
with p.open() as f:
|
|
48
|
-
data_lines = f.readlines()
|
|
49
|
-
data_lines[0] = data_lines[0].replace("elements", new_table_name)
|
|
50
|
-
data = "".join([line.strip() for line in data_lines])
|
|
51
|
-
return data
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
@contextmanager
|
|
55
|
-
def get_connection() -> DeltaTableConnection:
|
|
56
|
-
env_data = get_env_data()
|
|
57
|
-
with connect(
|
|
58
|
-
server_hostname=env_data.server_hostname,
|
|
59
|
-
http_path=env_data.http_path,
|
|
60
|
-
access_token=env_data.access_token.get_secret_value(),
|
|
61
|
-
) as connection:
|
|
62
|
-
yield connection
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
@contextmanager
|
|
66
|
-
def get_cursor() -> DeltaTableCursor:
|
|
67
|
-
with get_connection() as connection:
|
|
68
|
-
with connection.cursor() as cursor:
|
|
69
|
-
cursor.execute(f"USE CATALOG '{CATALOG}'")
|
|
70
|
-
yield cursor
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
@pytest.fixture
|
|
74
|
-
def destination_table() -> str:
|
|
75
|
-
random_id = str(uuid4())[:8]
|
|
76
|
-
table_name = f"elements_{random_id}"
|
|
77
|
-
destination_schema = get_destination_schema(new_table_name=table_name)
|
|
78
|
-
with get_cursor() as cursor:
|
|
79
|
-
logger.info(f"creating table: {table_name}")
|
|
80
|
-
cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
|
|
81
|
-
cursor.execute(destination_schema)
|
|
82
|
-
|
|
83
|
-
yield table_name
|
|
84
|
-
with get_cursor() as cursor:
|
|
85
|
-
logger.info(f"dropping table: {table_name}")
|
|
86
|
-
cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
def validate_destination(expected_num_elements: int, table_name: str, retries=30, interval=1):
|
|
90
|
-
with get_cursor() as cursor:
|
|
91
|
-
for i in range(retries):
|
|
92
|
-
cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
|
|
93
|
-
count = cursor.fetchone()[0]
|
|
94
|
-
if count == expected_num_elements:
|
|
95
|
-
break
|
|
96
|
-
logger.info(f"retry attempt {i}: expected {expected_num_elements} != count {count}")
|
|
97
|
-
time.sleep(interval)
|
|
98
|
-
assert (
|
|
99
|
-
count == expected_num_elements
|
|
100
|
-
), f"dest check failed: got {count}, expected {expected_num_elements}"
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
@pytest.mark.asyncio
|
|
104
|
-
@pytest.mark.skip("Resources take too long to spin up to run in CI")
|
|
105
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
|
|
106
|
-
@requires_env("DATABRICKS_SERVER_HOSTNAME", "DATABRICKS_HTTP_PATH", "DATABRICKS_ACCESS_TOKEN")
|
|
107
|
-
async def test_databricks_delta_tables_destination(
|
|
108
|
-
upload_file: Path, temp_dir: Path, destination_table: str
|
|
109
|
-
):
|
|
110
|
-
env_data = get_env_data()
|
|
111
|
-
mock_file_data = FileData(
|
|
112
|
-
identifier="mock file data",
|
|
113
|
-
connector_type=CONNECTOR_TYPE,
|
|
114
|
-
source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
|
|
115
|
-
)
|
|
116
|
-
stager = DatabricksDeltaTablesUploadStager()
|
|
117
|
-
staged_path = stager.run(
|
|
118
|
-
elements_filepath=upload_file,
|
|
119
|
-
file_data=mock_file_data,
|
|
120
|
-
output_dir=temp_dir,
|
|
121
|
-
output_filename=upload_file.name,
|
|
122
|
-
)
|
|
123
|
-
|
|
124
|
-
assert staged_path.suffix == upload_file.suffix
|
|
125
|
-
|
|
126
|
-
uploader = DatabricksDeltaTablesUploader(
|
|
127
|
-
connection_config=DatabricksDeltaTablesConnectionConfig(
|
|
128
|
-
access_config=DatabricksDeltaTablesAccessConfig(
|
|
129
|
-
token=env_data.access_token.get_secret_value()
|
|
130
|
-
),
|
|
131
|
-
http_path=env_data.http_path,
|
|
132
|
-
server_hostname=env_data.server_hostname,
|
|
133
|
-
),
|
|
134
|
-
upload_config=DatabricksDeltaTablesUploaderConfig(
|
|
135
|
-
catalog=CATALOG, database="default", table_name=destination_table
|
|
136
|
-
),
|
|
137
|
-
)
|
|
138
|
-
with staged_path.open("r") as f:
|
|
139
|
-
staged_data = json.load(f)
|
|
140
|
-
expected_num_elements = len(staged_data)
|
|
141
|
-
uploader.precheck()
|
|
142
|
-
uploader.run(path=staged_path, file_data=mock_file_data)
|
|
143
|
-
validate_destination(expected_num_elements=expected_num_elements, table_name=destination_table)
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
def test_get_credentials_provider_with_client_id_and_secret(mocker: MockerFixture):
|
|
147
|
-
access_config = DatabricksDeltaTablesAccessConfig(
|
|
148
|
-
client_id="test_client_id", client_secret="test_client_secret"
|
|
149
|
-
)
|
|
150
|
-
connection_config = DatabricksDeltaTablesConnectionConfig(
|
|
151
|
-
access_config=Secret(access_config),
|
|
152
|
-
server_hostname="test_server_hostname",
|
|
153
|
-
http_path="test_http_path",
|
|
154
|
-
)
|
|
155
|
-
|
|
156
|
-
credentials_provider = connection_config.get_credentials_provider()
|
|
157
|
-
assert credentials_provider is not False
|
|
158
|
-
assert type(credentials_provider).__name__ == "function"
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
def test_get_credentials_provider_with_token(mocker: MockerFixture):
|
|
162
|
-
access_config = DatabricksDeltaTablesAccessConfig(token="test_token")
|
|
163
|
-
connection_config = DatabricksDeltaTablesConnectionConfig(
|
|
164
|
-
access_config=Secret(access_config),
|
|
165
|
-
server_hostname="test_server_hostname",
|
|
166
|
-
http_path="test_http_path",
|
|
167
|
-
)
|
|
168
|
-
|
|
169
|
-
credentials_provider = connection_config.get_credentials_provider()
|
|
170
|
-
assert credentials_provider is False
|
|
@@ -1,201 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
from pathlib import Path
|
|
3
|
-
|
|
4
|
-
import pytest
|
|
5
|
-
from _pytest.fixtures import TopRequest
|
|
6
|
-
from psycopg2 import connect
|
|
7
|
-
|
|
8
|
-
from test.integration.connectors.utils.constants import (
|
|
9
|
-
DESTINATION_TAG,
|
|
10
|
-
SOURCE_TAG,
|
|
11
|
-
SQL_TAG,
|
|
12
|
-
env_setup_path,
|
|
13
|
-
)
|
|
14
|
-
from test.integration.connectors.utils.docker_compose import docker_compose_context
|
|
15
|
-
from test.integration.connectors.utils.validation.destination import (
|
|
16
|
-
StagerValidationConfigs,
|
|
17
|
-
stager_validation,
|
|
18
|
-
)
|
|
19
|
-
from test.integration.connectors.utils.validation.source import (
|
|
20
|
-
SourceValidationConfigs,
|
|
21
|
-
source_connector_validation,
|
|
22
|
-
)
|
|
23
|
-
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
24
|
-
from unstructured_ingest.processes.connectors.sql.postgres import (
|
|
25
|
-
CONNECTOR_TYPE,
|
|
26
|
-
PostgresAccessConfig,
|
|
27
|
-
PostgresConnectionConfig,
|
|
28
|
-
PostgresDownloader,
|
|
29
|
-
PostgresDownloaderConfig,
|
|
30
|
-
PostgresIndexer,
|
|
31
|
-
PostgresIndexerConfig,
|
|
32
|
-
PostgresUploader,
|
|
33
|
-
PostgresUploadStager,
|
|
34
|
-
)
|
|
35
|
-
|
|
36
|
-
# Number of rows seeded into the source `cars` table; the source test asserts
# that exactly this many files are produced.
SEED_DATA_ROWS = 10
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
@pytest.fixture
def source_database_setup():
    """Spin up the dockerized source Postgres and seed the `cars` table.

    Yields the database name; the compose environment remains up for the
    duration of the test. The seeding connection is closed before yielding
    (the original leaked it), and the insert is parameterized rather than
    built with an f-string.
    """
    db_name = "test_db"
    with docker_compose_context(docker_compose_path=env_setup_path / "sql" / "postgres" / "source"):
        connection = connect(
            user="unstructured",
            password="test",
            dbname=db_name,
            host="localhost",
            port=5433,
        )
        try:
            with connection.cursor() as cursor:
                for i in range(SEED_DATA_ROWS):
                    cursor.execute(
                        "INSERT INTO cars (brand, price) VALUES (%s, %s)",
                        (f"brand_{i}", i),
                    )
            connection.commit()
        finally:
            # Ensure the seeding connection is released even if seeding fails.
            connection.close()
        yield db_name
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, SQL_TAG)
async def test_postgres_source(temp_dir: Path, source_database_setup: str):
    """End-to-end source validation: index and download the seeded `cars` rows."""
    db_params = {
        "host": "localhost",
        "port": 5433,
        "database": "test_db",
        "user": "unstructured",
        "password": "test",
    }
    config = PostgresConnectionConfig(
        host=db_params["host"],
        port=db_params["port"],
        database=db_params["database"],
        username=db_params["user"],
        access_config=PostgresAccessConfig(password=db_params["password"]),
    )
    await source_connector_validation(
        indexer=PostgresIndexer(
            connection_config=config,
            index_config=PostgresIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
        ),
        downloader=PostgresDownloader(
            connection_config=config,
            download_config=PostgresDownloaderConfig(
                fields=["car_id", "brand"], download_dir=temp_dir
            ),
        ),
        configs=SourceValidationConfigs(
            test_id="postgres",
            expected_num_files=SEED_DATA_ROWS,
            expected_number_indexed_file_data=2,
            validate_downloaded_files=True,
        ),
    )
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
def validate_destination(
    connect_params: dict,
    expected_num_elements: int,
    test_embedding: list[float],
    expected_text: str,
):
    """Validate the uploaded contents of the `elements` table.

    Checks:
      * the row count matches ``expected_num_elements``
      * a nearest-neighbor search on ``test_embedding`` (pgvector ``<->``)
        returns ``expected_text``, i.e. the text the embedding belongs to
    """
    with connect(**connect_params) as connection:
        cursor = connection.cursor()
        cursor.execute("select count(*) from elements;")
        count = cursor.fetchone()[0]
        assert (
            count == expected_num_elements
        ), f"dest check failed: got {count}, expected {expected_num_elements}"

        # Parameterized instead of interpolating the embedding into the SQL
        # string; pgvector accepts the list's string form as a vector literal.
        # (Also dropped an unused "SELECT embeddings ..." query whose result
        # was never fetched.)
        cursor.execute(
            "SELECT text FROM elements ORDER BY embeddings <-> %s LIMIT 1;",
            (str(test_embedding),),
        )
        res = cursor.fetchone()
        assert res[0] == expected_text
|
120
|
-
|
|
121
|
-
|
|
122
|
-
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
async def test_postgres_destination(upload_file: Path, temp_dir: Path):
    """Stage and upload elements to the dockerized destination Postgres.

    The uploader is run twice with the same staged data to confirm the write
    is idempotent (the second run must not duplicate rows).
    """
    # The postgres destination connector doesn't leverage the file data but it
    # is required as an input; mock it with arbitrary values to meet the base
    # requirements:
    mock_file_data = FileData(
        identifier="mock file data",
        connector_type=CONNECTOR_TYPE,
        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
    )
    with docker_compose_context(
        docker_compose_path=env_setup_path / "sql" / "postgres" / "destination"
    ):
        stager = PostgresUploadStager()
        staged_path = stager.run(
            elements_filepath=upload_file,
            file_data=mock_file_data,
            output_dir=temp_dir,
            output_filename=upload_file.name,
        )

        # The stager should preserve the input file's suffix on the staged
        # output (the old comment claimed `.json` was appended, contradicting
        # this assertion).
        assert staged_path.suffix == upload_file.suffix

        connect_params = {
            "host": "localhost",
            "port": 5433,
            "database": "elements",
            "user": "unstructured",
            "password": "test",
        }

        uploader = PostgresUploader(
            connection_config=PostgresConnectionConfig(
                host=connect_params["host"],
                port=connect_params["port"],
                database=connect_params["database"],
                username=connect_params["user"],
                access_config=PostgresAccessConfig(password=connect_params["password"]),
            )
        )
        uploader.precheck()
        uploader.run(path=staged_path, file_data=mock_file_data)

        with staged_path.open("r") as f:
            staged_data = json.load(f)

        sample_element = staged_data[0]
        expected_num_elements = len(staged_data)
        validate_destination(
            connect_params=connect_params,
            expected_num_elements=expected_num_elements,
            expected_text=sample_element["text"],
            test_embedding=sample_element["embeddings"],
        )

        # Rerun the upload with identical data: the row count and similarity
        # lookup must be unchanged.
        uploader.run(path=staged_path, file_data=mock_file_data)
        validate_destination(
            connect_params=connect_params,
            expected_num_elements=expected_num_elements,
            expected_text=sample_element["text"],
            test_embedding=sample_element["embeddings"],
        )
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
def test_postgres_stager(
    request: TopRequest,
    upload_file_str: str,
    tmp_path: Path,
):
    """Run stager validation against both the json and ndjson upload fixtures."""
    fixture_file: Path = request.getfixturevalue(upload_file_str)
    stager_validation(
        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
        input_file=fixture_file,
        stager=PostgresUploadStager(),
        tmp_dir=tmp_path,
    )
|