unstructured-ingest 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (51) hide show
  1. test/integration/connectors/elasticsearch/__init__.py +0 -0
  2. test/integration/connectors/elasticsearch/conftest.py +34 -0
  3. test/integration/connectors/elasticsearch/test_elasticsearch.py +308 -0
  4. test/integration/connectors/elasticsearch/test_opensearch.py +302 -0
  5. test/integration/connectors/sql/test_postgres.py +10 -4
  6. test/integration/connectors/sql/test_singlestore.py +8 -4
  7. test/integration/connectors/sql/test_snowflake.py +10 -6
  8. test/integration/connectors/sql/test_sqlite.py +4 -4
  9. test/integration/connectors/test_astradb.py +50 -3
  10. test/integration/connectors/test_delta_table.py +46 -0
  11. test/integration/connectors/test_kafka.py +40 -6
  12. test/integration/connectors/test_lancedb.py +209 -0
  13. test/integration/connectors/test_milvus.py +141 -0
  14. test/integration/connectors/test_pinecone.py +53 -1
  15. test/integration/connectors/utils/docker.py +81 -15
  16. test/integration/connectors/utils/validation.py +10 -0
  17. test/integration/connectors/weaviate/__init__.py +0 -0
  18. test/integration/connectors/weaviate/conftest.py +15 -0
  19. test/integration/connectors/weaviate/test_local.py +131 -0
  20. unstructured_ingest/__version__.py +1 -1
  21. unstructured_ingest/pipeline/reformat/embedding.py +1 -1
  22. unstructured_ingest/utils/data_prep.py +9 -1
  23. unstructured_ingest/v2/processes/connectors/__init__.py +3 -16
  24. unstructured_ingest/v2/processes/connectors/astradb.py +2 -2
  25. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +4 -0
  26. unstructured_ingest/v2/processes/connectors/delta_table.py +20 -4
  27. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  28. unstructured_ingest/v2/processes/connectors/{elasticsearch.py → elasticsearch/elasticsearch.py} +92 -46
  29. unstructured_ingest/v2/processes/connectors/{opensearch.py → elasticsearch/opensearch.py} +1 -1
  30. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +6 -0
  31. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +17 -0
  32. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  33. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  34. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  35. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +161 -0
  36. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  37. unstructured_ingest/v2/processes/connectors/milvus.py +72 -27
  38. unstructured_ingest/v2/processes/connectors/pinecone.py +24 -7
  39. unstructured_ingest/v2/processes/connectors/sql/sql.py +97 -26
  40. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  41. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +164 -0
  42. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  43. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  44. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +289 -0
  45. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.1.dist-info}/METADATA +15 -15
  46. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.1.dist-info}/RECORD +50 -30
  47. unstructured_ingest/v2/processes/connectors/weaviate.py +0 -242
  48. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.1.dist-info}/LICENSE.md +0 -0
  49. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.1.dist-info}/WHEEL +0 -0
  50. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.1.dist-info}/entry_points.txt +0 -0
  51. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.1.dist-info}/top_level.txt +0 -0
File without changes
@@ -0,0 +1,34 @@
1
+ import json
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+ import pytest
6
+
7
# Directory that contains this conftest module.
+ int_test_dir = Path(__file__).parent
8
# Sub-directory from which the fixtures below read their CSV/JSON asset files.
+ assets_dir = int_test_dir / "assets"
9
+
10
+
11
@pytest.fixture
def movies_dataframe() -> pd.DataFrame:
    """Load the sample movie-plots CSV from the assets directory.

    Rows with missing values are dropped and the index is reset before the
    frame is returned. The fixture asserts that the CSV exists and is a
    regular file before reading it.
    """
    csv_path = assets_dir / "wiki_movie_plots_small.csv"
    assert csv_path.exists()
    assert csv_path.is_file()
    raw_frame = pd.read_csv(csv_path)
    complete_rows = raw_frame.dropna()
    return complete_rows.reset_index()
17
+
18
+
19
@pytest.fixture
def opensearch_elements_mapping() -> dict:
    """Return the parsed contents of ``opensearch_elements_mappings.json``.

    The file is looked up in the assets directory; the fixture asserts that it
    exists and is a regular file before parsing it as JSON.
    """
    mapping_path = assets_dir / "opensearch_elements_mappings.json"
    assert mapping_path.exists()
    assert mapping_path.is_file()
    with mapping_path.open() as mapping_file:
        parsed_mapping = json.load(mapping_file)
    return parsed_mapping
26
+
27
+
28
@pytest.fixture
def elasticsearch_elements_mapping() -> dict:
    """Return the parsed contents of ``elasticsearch_elements_mappings.json``.

    The file is looked up in the assets directory; the fixture asserts that it
    exists and is a regular file before parsing it as JSON.
    """
    mapping_path = assets_dir / "elasticsearch_elements_mappings.json"
    assert mapping_path.exists()
    assert mapping_path.is_file()
    with mapping_path.open() as mapping_file:
        parsed_mapping = json.load(mapping_file)
    return parsed_mapping
@@ -0,0 +1,308 @@
1
+ # ruff: noqa: I001
2
+ import json
3
+ import tempfile
4
+ import time
5
+ from contextlib import contextmanager
6
+ from pathlib import Path
7
+ from typing import Generator
8
+
9
+ import pandas as pd
10
+ import pytest
11
+ from elasticsearch import Elasticsearch as ElasticsearchClient
12
+ from elasticsearch.helpers import bulk
13
+
14
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
15
+ from test.integration.connectors.utils.docker import HealthCheck, container_context
16
+ from test.integration.connectors.utils.validation import (
17
+ ValidationConfigs,
18
+ source_connector_validation,
19
+ )
20
+ from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
21
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
22
+ from unstructured_ingest.v2.processes.connectors.elasticsearch.elasticsearch import (
23
+ CONNECTOR_TYPE,
24
+ ElasticsearchAccessConfig,
25
+ ElasticsearchConnectionConfig,
26
+ ElasticsearchDownloader,
27
+ ElasticsearchDownloaderConfig,
28
+ ElasticsearchIndexer,
29
+ ElasticsearchIndexerConfig,
30
+ ElasticsearchUploader,
31
+ ElasticsearchUploaderConfig,
32
+ ElasticsearchUploadStager,
33
+ ElasticsearchUploadStagerConfig,
34
+ )
35
+
36
# Index that the source fixture seeds with movie rows.
+ SOURCE_INDEX_NAME = "movies"
37
# Index that the destination fixture creates for uploaded elements.
+ DESTINATION_INDEX_NAME = "elements"
38
# Credentials used for the test container environment and for client connections below.
+ ES_USERNAME = "elastic"
39
+ ES_PASSWORD = "elastic_password"
40
+
41
+
42
@contextmanager
def get_client() -> Generator[ElasticsearchClient, None, None]:
    """Yield an Elasticsearch client connected to ``http://localhost:9200``.

    The client authenticates with the module-level test credentials and is
    used as a context manager, so it is exited when the caller's ``with``
    block ends.
    """
    connection_kwargs = {
        "hosts": "http://localhost:9200",
        "basic_auth": (ES_USERNAME, ES_PASSWORD),
        "request_timeout": 30,
    }
    with ElasticsearchClient(**connection_kwargs) as es_client:
        yield es_client
48
+
49
+
50
def form_elasticsearch_doc_dict(i, csv_row):
    """Build one bulk-index action for a row of the movies CSV.

    Args:
        i: Value used as the document ``_id``.
        csv_row: Mapping-like row indexed by the CSV column names.

    Returns:
        A dict with ``_index``, ``_id`` and ``_source`` keys targeting the
        source index.
    """
    # Document field name -> CSV column it is read from.
    field_to_column = {
        "title": "Title",
        "ethnicity": "Origin/Ethnicity",
        "director": "Director",
        "cast": "Cast",
        "genre": "Genre",
        "plot": "Plot",
        "year": "Release Year",
        "wiki_page": "Wiki Page",
    }
    document_body = {field: csv_row[column] for field, column in field_to_column.items()}
    return {
        "_index": SOURCE_INDEX_NAME,
        "_id": i,
        "_source": document_body,
    }
65
+
66
+
67
def dataframe_to_upload_data(df: pd.DataFrame) -> list[dict]:
    """Convert every row of ``df`` into a bulk-index action dict.

    The row's index label is passed as the document id.
    """
    return [form_elasticsearch_doc_dict(row_label, row) for row_label, row in df.iterrows()]
72
+
73
+
74
def get_index_count(client: ElasticsearchClient, index_name: str) -> int:
    """Return the document count that the cat-count API reports for ``index_name``."""
    cat_rows = client.cat.count(index=index_name, format="json")
    first_row = cat_rows[0]
    return int(first_row["count"])
77
+
78
+
79
def validate_count(
    client: ElasticsearchClient,
    index_name: str,
    expected_count: int,
    retries: int = 10,
    interval: int = 1,
) -> None:
    """Assert that ``index_name`` holds ``expected_count`` documents.

    The count is polled up to ``retries`` additional times, sleeping
    ``interval`` seconds between polls, to give the index time to catch up.

    Args:
        client: Client used to query the index count.
        index_name: Index whose document count is checked.
        expected_count: Number of documents the index should contain.
        retries: Maximum number of re-checks after the initial one.
        interval: Seconds to sleep before each re-check.

    Raises:
        AssertionError: If the count still differs after all retries.
    """
    current_count = get_index_count(client, index_name)
    # Bounded loop: the previous ``while tries < retries`` never incremented
    # ``tries`` and therefore spun forever when the count never matched.
    for _ in range(retries):
        if current_count == expected_count:
            return
        print(
            f"retrying validation to check if expected count "
            f"{expected_count} will match current count {current_count}"
        )
        time.sleep(interval)
        current_count = get_index_count(client, index_name)
    assert current_count == expected_count, (
        f"Expected count ({expected_count}) doesn't match how "
        f"much came back from index: {current_count}"
    )
103
+
104
+
105
def seed_source_db(df: pd.DataFrame):
    """Create the source index and bulk-load it with the rows of ``df``.

    The index is refreshed after the bulk upload and the resulting document
    count is printed.
    """
    # Text fields differ only in their analyzer.
    english_text = {"type": "text", "analyzer": "english"}
    standard_text = {"type": "text", "analyzer": "standard"}
    index_mapping = {
        "properties": {
            "title": dict(english_text),
            "ethnicity": dict(standard_text),
            "director": dict(standard_text),
            "cast": dict(standard_text),
            "genre": dict(standard_text),
            "plot": dict(english_text),
            "year": {"type": "integer"},
            "wiki_page": {"type": "keyword"},
        },
    }
    # seed content
    with get_client() as es_client:
        es_client.indices.create(index=SOURCE_INDEX_NAME, mappings=index_mapping)
        bulk_actions = dataframe_to_upload_data(df=df)
        bulk(es_client, bulk_actions)
        es_client.indices.refresh(index=SOURCE_INDEX_NAME)
        seeded = get_index_count(es_client, SOURCE_INDEX_NAME)
        print(f"seeded {SOURCE_INDEX_NAME} index with {seeded} records")
126
+
127
+
128
@pytest.fixture
def source_index(movies_dataframe: pd.DataFrame) -> str:
    """Start an Elasticsearch container, seed the source index and yield its name.

    The index name is yielded while the container context is still open.
    """
    container_env = {
        "discovery.type": "single-node",
        "xpack.security.enabled": True,
        "ELASTIC_PASSWORD": ES_PASSWORD,
        "ELASTIC_USER": ES_USERNAME,
    }
    cluster_healthcheck = HealthCheck(
        test="curl --silent --fail -u ${ELASTIC_USER}:${ELASTIC_PASSWORD} localhost:9200/_cluster/health || exit 1",  # noqa: E501
        interval=1,
        start_period=5,
    )
    with container_context(
        image="docker.elastic.co/elasticsearch/elasticsearch:8.7.0",
        ports={9200: 9200, 9300: 9300},
        environment=container_env,
        healthcheck=cluster_healthcheck,
    ):
        seed_source_db(df=movies_dataframe)
        yield SOURCE_INDEX_NAME
147
+
148
+
149
@pytest.fixture
def destination_index(elasticsearch_elements_mapping: dict) -> str:
    """Start an Elasticsearch container, create the destination index and yield its name.

    Raises:
        RuntimeError: If the index-creation response is not acknowledged.
    """
    container_env = {
        "discovery.type": "single-node",
        "xpack.security.enabled": True,
        "ELASTIC_PASSWORD": ES_PASSWORD,
        "ELASTIC_USER": ES_USERNAME,
    }
    cluster_healthcheck = HealthCheck(
        test="curl --silent --fail -u ${ELASTIC_USER}:${ELASTIC_PASSWORD} localhost:9200/_cluster/health || exit 1",  # noqa: E501
        interval=1,
        start_period=5,
    )
    with container_context(
        image="docker.elastic.co/elasticsearch/elasticsearch:8.7.0",
        ports={9200: 9200, 9300: 9300},
        environment=container_env,
        healthcheck=cluster_healthcheck,
    ):
        with get_client() as es_client:
            create_response = es_client.indices.create(
                index=DESTINATION_INDEX_NAME, mappings=elasticsearch_elements_mapping
            )
            acknowledged = create_response["acknowledged"]
            if not acknowledged:
                raise RuntimeError(f"failed to create index: {create_response}")
        yield DESTINATION_INDEX_NAME
173
+
174
+
175
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
async def test_elasticsearch_source(source_index: str, movies_dataframe: pd.DataFrame):
    """Index and download from the seeded source index and validate the results.

    One file per dataframe row is expected, from a single indexed file-data
    record.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        connection_config = ElasticsearchConnectionConfig(
            access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
            username=ES_USERNAME,
            hosts=["http://localhost:9200"],
        )
        indexer = ElasticsearchIndexer(
            connection_config=connection_config,
            index_config=ElasticsearchIndexerConfig(index_name=source_index),
        )
        downloader = ElasticsearchDownloader(
            connection_config=connection_config,
            download_config=ElasticsearchDownloaderConfig(download_dir=Path(tempdir)),
        )
        validation_configs = ValidationConfigs(
            test_id=CONNECTOR_TYPE,
            expected_num_files=len(movies_dataframe),
            expected_number_indexed_file_data=1,
            validate_downloaded_files=True,
        )
        await source_connector_validation(
            indexer=indexer,
            downloader=downloader,
            configs=validation_configs,
        )
204
+
205
+
206
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
def test_elasticsearch_source_precheck_fail_no_cluster():
    """Indexer precheck must raise SourceConnectionError; no container fixture is requested."""
    indexer = ElasticsearchIndexer(
        connection_config=ElasticsearchConnectionConfig(
            access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
            username=ES_USERNAME,
            hosts=["http://localhost:9200"],
        ),
        index_config=ElasticsearchIndexerConfig(index_name="index"),
    )
    with pytest.raises(SourceConnectionError):
        indexer.precheck()
218
+
219
+
220
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
def test_elasticsearch_source_precheck_fail_no_index(source_index: str):
    """Indexer precheck must raise SourceConnectionError for the index name ``"index"``.

    The ``source_index`` fixture is requested but the indexer is pointed at a
    different index name.
    """
    indexer = ElasticsearchIndexer(
        connection_config=ElasticsearchConnectionConfig(
            access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
            username=ES_USERNAME,
            hosts=["http://localhost:9200"],
        ),
        index_config=ElasticsearchIndexerConfig(index_name="index"),
    )
    with pytest.raises(SourceConnectionError):
        indexer.precheck()
232
+
233
+
234
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
async def test_elasticsearch_destination(
    upload_file: Path,
    destination_index: str,
    tmp_path: Path,
):
    """Stage and upload a file, then check the destination index's document count.

    The upload is run a second time to confirm the count does not change.
    """
    upload_name = upload_file.name
    file_data = FileData(
        source_identifiers=SourceIdentifiers(fullpath=upload_name, filename=upload_name),
        connector_type=CONNECTOR_TYPE,
        identifier="mock file data",
    )
    stager = ElasticsearchUploadStager(
        upload_stager_config=ElasticsearchUploadStagerConfig(index_name=destination_index)
    )
    uploader = ElasticsearchUploader(
        connection_config=ElasticsearchConnectionConfig(
            access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
            username=ES_USERNAME,
            hosts=["http://localhost:9200"],
        ),
        upload_config=ElasticsearchUploaderConfig(index_name=destination_index),
    )

    staged_filepath = stager.run(
        elements_filepath=upload_file,
        file_data=file_data,
        output_dir=tmp_path,
        output_filename=upload_file.name,
    )
    uploader.precheck()
    uploader.run(path=staged_filepath, file_data=file_data)

    # Run validation
    with staged_filepath.open() as staged_file:
        expected_count = len(json.load(staged_file))
    with get_client() as es_client:
        validate_count(
            client=es_client, expected_count=expected_count, index_name=destination_index
        )

    # Rerun and make sure the same documents get updated
    uploader.run(path=staged_filepath, file_data=file_data)
    with get_client() as es_client:
        validate_count(
            client=es_client, expected_count=expected_count, index_name=destination_index
        )
279
+
280
+
281
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
def test_elasticsearch_destination_precheck_fail():
    """Uploader precheck must raise DestinationConnectionError; no container fixture is requested."""  # noqa: E501
    uploader = ElasticsearchUploader(
        connection_config=ElasticsearchConnectionConfig(
            access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
            username=ES_USERNAME,
            hosts=["http://localhost:9200"],
        ),
        upload_config=ElasticsearchUploaderConfig(index_name="index"),
    )
    with pytest.raises(DestinationConnectionError):
        uploader.precheck()
294
+
295
+
296
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
def test_elasticsearch_destination_precheck_fail_no_index(destination_index: str):
    """Uploader precheck must raise DestinationConnectionError for the index name ``"index"``.

    The ``destination_index`` fixture is requested but the uploader is pointed
    at a different index name.
    """
    uploader = ElasticsearchUploader(
        connection_config=ElasticsearchConnectionConfig(
            access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
            username=ES_USERNAME,
            hosts=["http://localhost:9200"],
        ),
        upload_config=ElasticsearchUploaderConfig(index_name="index"),
    )
    with pytest.raises(DestinationConnectionError):
        uploader.precheck()
@@ -0,0 +1,302 @@
1
+ import json
2
+ import tempfile
3
+ import time
4
+ from contextlib import contextmanager
5
+ from pathlib import Path
6
+ from typing import Generator
7
+
8
+ import pandas as pd
9
+ import pytest
10
+ from opensearchpy import Document, Keyword, OpenSearch, Text
11
+
12
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
13
+ from test.integration.connectors.utils.docker import HealthCheck, container_context
14
+ from test.integration.connectors.utils.validation import (
15
+ ValidationConfigs,
16
+ source_connector_validation,
17
+ )
18
+ from unstructured_ingest.error import (
19
+ DestinationConnectionError,
20
+ SourceConnectionError,
21
+ )
22
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
23
+ from unstructured_ingest.v2.processes.connectors.elasticsearch.opensearch import (
24
+ CONNECTOR_TYPE,
25
+ OpenSearchAccessConfig,
26
+ OpenSearchConnectionConfig,
27
+ OpenSearchDownloader,
28
+ OpenSearchDownloaderConfig,
29
+ OpenSearchIndexer,
30
+ OpenSearchIndexerConfig,
31
+ OpenSearchUploader,
32
+ OpenSearchUploaderConfig,
33
+ OpenSearchUploadStager,
34
+ OpenSearchUploadStagerConfig,
35
+ )
36
+
37
# Index that the source fixture populates with Movie documents.
+ SOURCE_INDEX_NAME = "movies"
38
# Index that the destination fixture creates for uploaded elements.
+ DESTINATION_INDEX_NAME = "elements"
39
+
40
+
41
class Movie(Document):
    """Document model for one movie record stored in the source index."""

    # The title also gets a ``raw`` keyword sub-field; all other fields are plain text.
    title = Text(fields={"raw": Keyword()})
    year = Text()
    director = Text()
    cast = Text()
    genre = Text()
    wiki_page = Text()
    ethnicity = Text()
    plot = Text()

    class Index:
        # Index that Movie documents are written to.
        name = SOURCE_INDEX_NAME

    def save(self, **kwargs):
        """Delegate to the parent ``save`` with the given keyword arguments."""
        return super().save(**kwargs)
56
+
57
+
58
@contextmanager
def get_client() -> Generator[OpenSearch, None, None]:
    """Yield an OpenSearch client connected to localhost:9200.

    The client uses SSL without certificate verification and the
    ``admin``/``admin`` credentials. It is used as a context manager, so it is
    exited when the caller's ``with`` block ends.
    """
    connection_kwargs = {
        "hosts": [{"host": "localhost", "port": 9200}],
        "http_auth": ("admin", "admin"),
        "use_ssl": True,
        "verify_certs": False,
        "ssl_show_warn": False,
    }
    with OpenSearch(**connection_kwargs) as os_client:
        yield os_client
68
+
69
+
70
def get_index_count(client: OpenSearch, index_name: str) -> int:
    """Return the document count that the cat-count API reports for ``index_name``."""
    cat_rows = client.cat.count(index=index_name, params={"format": "json"})
    first_row = cat_rows[0]
    return int(first_row["count"])
73
+
74
+
75
def wait_for_write(
    client: OpenSearch, index_name: str, expected_count: int, timeout: int = 30, interval: int = 1
) -> None:
    """Poll ``index_name`` until it reports ``expected_count`` documents.

    Each iteration sleeps ``interval`` seconds and then re-reads the count.

    Raises:
        TimeoutError: If ``timeout`` seconds elapse without the count matching.
    """
    observed_count = get_index_count(client, index_name)
    started_at = time.time()
    while True:
        if time.time() - started_at >= timeout:
            raise TimeoutError("Timed out while waiting for write to sync")
        print(f"waiting for current count ({observed_count}) to match expected {expected_count}")
        time.sleep(interval)
        observed_count = get_index_count(client, index_name)
        if observed_count == expected_count:
            return
87
+
88
+
89
def validate_count(
    client: OpenSearch, index_name: str, expected_count: int, retries: int = 10, interval: int = 1
) -> None:
    """Assert that ``index_name`` holds ``expected_count`` documents.

    The count is polled up to ``retries`` additional times, sleeping
    ``interval`` seconds between polls, to give the index time to catch up.

    Args:
        client: Client used to query the index count.
        index_name: Index whose document count is checked.
        expected_count: Number of documents the index should contain.
        retries: Maximum number of re-checks after the initial one.
        interval: Seconds to sleep before each re-check.

    Raises:
        AssertionError: If the count still differs after all retries.
    """
    current_count = get_index_count(client, index_name)
    # Bounded loop: the previous ``while tries < retries`` never incremented
    # ``tries`` and therefore spun forever when the count never matched.
    for _ in range(retries):
        if current_count == expected_count:
            return
        print(
            f"retrying validation to check if expected count "
            f"{expected_count} will match current count {current_count}"
        )
        time.sleep(interval)
        current_count = get_index_count(client, index_name)
    assert current_count == expected_count, (
        f"Expected count ({expected_count}) doesn't match how "
        f"much came back from index: {current_count}"
    )
109
+
110
+
111
@pytest.fixture
def source_index(movies_dataframe: pd.DataFrame) -> str:
    """Start an OpenSearch container, load the movies into it and yield the index name.

    Each dataframe row is saved as a ``Movie`` document keyed by its row
    label; the fixture then waits until the index reports one document per row.
    """
    cluster_healthcheck = HealthCheck(
        test="curl --fail https://localhost:9200/_cat/health -ku 'admin:admin' >/dev/null || exit 1",  # noqa: E501
        interval=1,
    )
    with container_context(
        image="opensearchproject/opensearch:2.11.1",
        ports={9200: 9200, 9600: 9600},
        environment={"discovery.type": "single-node"},
        healthcheck=cluster_healthcheck,
    ):
        with get_client() as os_client:
            Movie.init(using=os_client)
            for row_label, record in movies_dataframe.iterrows():
                document = Movie(
                    meta={"id": row_label},
                    title=record["Title"],
                    year=record["Release Year"],
                    director=record["Director"],
                    cast=record["Cast"],
                    genre=record["Genre"],
                    wiki_page=record["Wiki Page"],
                    ethnicity=record["Origin/Ethnicity"],
                    plot=record["Plot"],
                )
                document.save(using=os_client)
            wait_for_write(
                client=os_client,
                index_name=SOURCE_INDEX_NAME,
                expected_count=len(movies_dataframe),
            )
        yield SOURCE_INDEX_NAME
141
+
142
+
143
@pytest.fixture
def destination_index(opensearch_elements_mapping: dict) -> str:
    """Start an OpenSearch container, create the destination index and yield its name.

    Raises:
        RuntimeError: If the index-creation response is not acknowledged.
    """
    cluster_healthcheck = HealthCheck(
        test="curl --fail https://localhost:9200/_cat/health -ku 'admin:admin' >/dev/null || exit 1",  # noqa: E501
        interval=1,
    )
    with container_context(
        image="opensearchproject/opensearch:2.11.1",
        ports={9200: 9200, 9600: 9600},
        environment={"discovery.type": "single-node"},
        healthcheck=cluster_healthcheck,
    ):
        with get_client() as os_client:
            create_response = os_client.indices.create(
                index=DESTINATION_INDEX_NAME, body=opensearch_elements_mapping
            )
            acknowledged = create_response["acknowledged"]
            if not acknowledged:
                raise RuntimeError(f"failed to create index: {create_response}")
        yield DESTINATION_INDEX_NAME
161
+
162
+
163
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
async def test_opensearch_source(source_index: str, movies_dataframe: pd.DataFrame):
    """Index and download from the seeded source index and validate the results.

    One file per dataframe row is expected, from a single indexed file-data
    record.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        connection_config = OpenSearchConnectionConfig(
            access_config=OpenSearchAccessConfig(password="admin"),
            username="admin",
            hosts=["http://localhost:9200"],
            use_ssl=True,
        )
        indexer = OpenSearchIndexer(
            connection_config=connection_config,
            index_config=OpenSearchIndexerConfig(index_name=source_index),
        )
        downloader = OpenSearchDownloader(
            connection_config=connection_config,
            download_config=OpenSearchDownloaderConfig(download_dir=Path(tempdir)),
        )
        validation_configs = ValidationConfigs(
            test_id=CONNECTOR_TYPE,
            expected_num_files=len(movies_dataframe),
            expected_number_indexed_file_data=1,
            validate_downloaded_files=True,
        )
        await source_connector_validation(
            indexer=indexer,
            downloader=downloader,
            configs=validation_configs,
        )
193
+
194
+
195
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
def test_opensearch_source_precheck_fail_no_cluster():
    """Indexer precheck must raise SourceConnectionError; no container fixture is requested."""
    indexer = OpenSearchIndexer(
        connection_config=OpenSearchConnectionConfig(
            access_config=OpenSearchAccessConfig(password="admin"),
            username="admin",
            hosts=["http://localhost:9200"],
            use_ssl=True,
        ),
        index_config=OpenSearchIndexerConfig(index_name="index"),
    )
    with pytest.raises(SourceConnectionError):
        indexer.precheck()
208
+
209
+
210
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
def test_opensearch_source_precheck_fail_no_index(source_index: str):
    """Indexer precheck must raise SourceConnectionError for the index name ``"index"``.

    The ``source_index`` fixture is requested but the indexer is pointed at a
    different index name.
    """
    indexer = OpenSearchIndexer(
        connection_config=OpenSearchConnectionConfig(
            access_config=OpenSearchAccessConfig(password="admin"),
            username="admin",
            hosts=["http://localhost:9200"],
            use_ssl=True,
        ),
        index_config=OpenSearchIndexerConfig(index_name="index"),
    )
    with pytest.raises(SourceConnectionError):
        indexer.precheck()
223
+
224
+
225
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
async def test_opensearch_destination(
    upload_file: Path,
    destination_index: str,
    tmp_path: Path,
):
    """Stage and upload a file, then check the destination index's document count.

    The upload is run a second time to confirm the count does not change.
    """
    upload_name = upload_file.name
    file_data = FileData(
        source_identifiers=SourceIdentifiers(fullpath=upload_name, filename=upload_name),
        connector_type=CONNECTOR_TYPE,
        identifier="mock file data",
    )
    stager = OpenSearchUploadStager(
        upload_stager_config=OpenSearchUploadStagerConfig(index_name=destination_index)
    )
    uploader = OpenSearchUploader(
        connection_config=OpenSearchConnectionConfig(
            access_config=OpenSearchAccessConfig(password="admin"),
            username="admin",
            hosts=["http://localhost:9200"],
            use_ssl=True,
        ),
        upload_config=OpenSearchUploaderConfig(index_name=destination_index),
    )

    staged_filepath = stager.run(
        elements_filepath=upload_file,
        file_data=file_data,
        output_dir=tmp_path,
        output_filename=upload_file.name,
    )
    uploader.precheck()
    uploader.run(path=staged_filepath, file_data=file_data)

    # Run validation
    with staged_filepath.open() as staged_file:
        expected_count = len(json.load(staged_file))
    with get_client() as os_client:
        validate_count(
            client=os_client, expected_count=expected_count, index_name=destination_index
        )

    # Rerun and make sure the same documents get updated
    uploader.run(path=staged_filepath, file_data=file_data)
    with get_client() as os_client:
        validate_count(
            client=os_client, expected_count=expected_count, index_name=destination_index
        )
271
+
272
+
273
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
def test_opensearch_destination_precheck_fail():
    """Uploader precheck must raise DestinationConnectionError; no container fixture is requested."""  # noqa: E501
    uploader = OpenSearchUploader(
        connection_config=OpenSearchConnectionConfig(
            access_config=OpenSearchAccessConfig(password="admin"),
            username="admin",
            hosts=["http://localhost:9200"],
            use_ssl=True,
        ),
        upload_config=OpenSearchUploaderConfig(index_name="index"),
    )
    with pytest.raises(DestinationConnectionError):
        uploader.precheck()
287
+
288
+
289
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
def test_opensearch_destination_precheck_fail_no_index(destination_index: str):
    """Uploader precheck must raise DestinationConnectionError for the index name ``"index"``.

    The ``destination_index`` fixture is requested but the uploader is pointed
    at a different index name.
    """
    uploader = OpenSearchUploader(
        connection_config=OpenSearchConnectionConfig(
            access_config=OpenSearchAccessConfig(password="admin"),
            username="admin",
            hosts=["http://localhost:9200"],
            use_ssl=True,
        ),
        upload_config=OpenSearchUploaderConfig(index_name="index"),
    )
    with pytest.raises(DestinationConnectionError):
        uploader.precheck()
@@ -158,10 +158,8 @@ async def test_postgres_destination(upload_file: Path):
158
158
  access_config=PostgresAccessConfig(password=connect_params["password"]),
159
159
  )
160
160
  )
161
- if uploader.is_async():
162
- await uploader.run_async(path=staged_path, file_data=mock_file_data)
163
- else:
164
- uploader.run(path=staged_path, file_data=mock_file_data)
161
+
162
+ uploader.run(path=staged_path, file_data=mock_file_data)
165
163
 
166
164
  staged_df = pd.read_json(staged_path, orient="records", lines=True)
167
165
  sample_element = staged_df.iloc[0]
@@ -172,3 +170,11 @@ async def test_postgres_destination(upload_file: Path):
172
170
  expected_text=sample_element["text"],
173
171
  test_embedding=sample_element["embeddings"],
174
172
  )
173
+
174
+ uploader.run(path=staged_path, file_data=mock_file_data)
175
+ validate_destination(
176
+ connect_params=connect_params,
177
+ expected_num_elements=expected_num_elements,
178
+ expected_text=sample_element["text"],
179
+ test_embedding=sample_element["embeddings"],
180
+ )
@@ -143,10 +143,8 @@ async def test_singlestore_destination(upload_file: Path):
143
143
  table_name="elements",
144
144
  ),
145
145
  )
146
- if uploader.is_async():
147
- await uploader.run_async(path=staged_path, file_data=mock_file_data)
148
- else:
149
- uploader.run(path=staged_path, file_data=mock_file_data)
146
+
147
+ uploader.run(path=staged_path, file_data=mock_file_data)
150
148
 
151
149
  staged_df = pd.read_json(staged_path, orient="records", lines=True)
152
150
  expected_num_elements = len(staged_df)
@@ -154,3 +152,9 @@ async def test_singlestore_destination(upload_file: Path):
154
152
  connect_params=connect_params,
155
153
  expected_num_elements=expected_num_elements,
156
154
  )
155
+
156
+ uploader.run(path=staged_path, file_data=mock_file_data)
157
+ validate_destination(
158
+ connect_params=connect_params,
159
+ expected_num_elements=expected_num_elements,
160
+ )