unstructured-ingest 0.7.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (192)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/logger.py +2 -93
  7. unstructured_ingest/main.py +0 -0
  8. unstructured_ingest/pipeline/interfaces.py +1 -1
  9. unstructured_ingest/pipeline/pipeline.py +1 -1
  10. unstructured_ingest/processes/chunker.py +4 -0
  11. unstructured_ingest/processes/connectors/airtable.py +4 -2
  12. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +10 -0
  13. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  14. unstructured_ingest/processes/connectors/astradb.py +2 -2
  15. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  16. unstructured_ingest/processes/connectors/confluence.py +0 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  18. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  19. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  20. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  21. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  22. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  23. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  24. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  25. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  26. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  27. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  28. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  29. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  30. unstructured_ingest/processes/connectors/outlook.py +1 -2
  31. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  32. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  33. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  34. unstructured_ingest/processes/connectors/slack.py +1 -2
  35. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  37. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  38. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  39. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  40. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  41. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  42. unstructured_ingest/processes/connectors/vectara.py +0 -2
  43. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  44. unstructured_ingest/processes/embedder.py +2 -2
  45. unstructured_ingest/processes/filter.py +1 -1
  46. unstructured_ingest/processes/partitioner.py +4 -0
  47. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  48. unstructured_ingest/unstructured_api.py +13 -8
  49. unstructured_ingest/utils/data_prep.py +8 -32
  50. unstructured_ingest/utils/string_and_date_utils.py +3 -3
  51. unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
  52. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +54 -187
  53. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
  54. examples/__init__.py +0 -0
  55. examples/airtable.py +0 -44
  56. examples/azure_cognitive_search.py +0 -55
  57. examples/chroma.py +0 -54
  58. examples/couchbase.py +0 -55
  59. examples/databricks_volumes_dest.py +0 -55
  60. examples/databricks_volumes_source.py +0 -53
  61. examples/delta_table.py +0 -45
  62. examples/discord_example.py +0 -36
  63. examples/elasticsearch.py +0 -49
  64. examples/google_drive.py +0 -45
  65. examples/kdbai.py +0 -54
  66. examples/local.py +0 -36
  67. examples/milvus.py +0 -44
  68. examples/mongodb.py +0 -53
  69. examples/opensearch.py +0 -50
  70. examples/pinecone.py +0 -57
  71. examples/s3.py +0 -38
  72. examples/salesforce.py +0 -44
  73. examples/sharepoint.py +0 -47
  74. examples/singlestore.py +0 -49
  75. examples/sql.py +0 -90
  76. examples/vectara.py +0 -54
  77. examples/weaviate.py +0 -44
  78. test/__init__.py +0 -0
  79. test/integration/__init__.py +0 -0
  80. test/integration/chunkers/__init__.py +0 -0
  81. test/integration/chunkers/test_chunkers.py +0 -31
  82. test/integration/connectors/__init__.py +0 -0
  83. test/integration/connectors/conftest.py +0 -38
  84. test/integration/connectors/databricks/__init__.py +0 -0
  85. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  86. test/integration/connectors/discord/__init__.py +0 -0
  87. test/integration/connectors/discord/test_discord.py +0 -90
  88. test/integration/connectors/duckdb/__init__.py +0 -0
  89. test/integration/connectors/duckdb/conftest.py +0 -14
  90. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  91. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  92. test/integration/connectors/elasticsearch/__init__.py +0 -0
  93. test/integration/connectors/elasticsearch/conftest.py +0 -34
  94. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  95. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  96. test/integration/connectors/sql/__init__.py +0 -0
  97. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  98. test/integration/connectors/sql/test_postgres.py +0 -201
  99. test/integration/connectors/sql/test_singlestore.py +0 -182
  100. test/integration/connectors/sql/test_snowflake.py +0 -244
  101. test/integration/connectors/sql/test_sqlite.py +0 -168
  102. test/integration/connectors/sql/test_vastdb.py +0 -34
  103. test/integration/connectors/test_astradb.py +0 -287
  104. test/integration/connectors/test_azure_ai_search.py +0 -254
  105. test/integration/connectors/test_chroma.py +0 -136
  106. test/integration/connectors/test_confluence.py +0 -111
  107. test/integration/connectors/test_delta_table.py +0 -183
  108. test/integration/connectors/test_dropbox.py +0 -151
  109. test/integration/connectors/test_github.py +0 -49
  110. test/integration/connectors/test_google_drive.py +0 -257
  111. test/integration/connectors/test_jira.py +0 -67
  112. test/integration/connectors/test_lancedb.py +0 -247
  113. test/integration/connectors/test_milvus.py +0 -208
  114. test/integration/connectors/test_mongodb.py +0 -335
  115. test/integration/connectors/test_neo4j.py +0 -244
  116. test/integration/connectors/test_notion.py +0 -152
  117. test/integration/connectors/test_onedrive.py +0 -163
  118. test/integration/connectors/test_pinecone.py +0 -387
  119. test/integration/connectors/test_qdrant.py +0 -216
  120. test/integration/connectors/test_redis.py +0 -143
  121. test/integration/connectors/test_s3.py +0 -184
  122. test/integration/connectors/test_sharepoint.py +0 -222
  123. test/integration/connectors/test_vectara.py +0 -282
  124. test/integration/connectors/test_zendesk.py +0 -120
  125. test/integration/connectors/utils/__init__.py +0 -0
  126. test/integration/connectors/utils/constants.py +0 -13
  127. test/integration/connectors/utils/docker.py +0 -151
  128. test/integration/connectors/utils/docker_compose.py +0 -59
  129. test/integration/connectors/utils/validation/__init__.py +0 -0
  130. test/integration/connectors/utils/validation/destination.py +0 -77
  131. test/integration/connectors/utils/validation/equality.py +0 -76
  132. test/integration/connectors/utils/validation/source.py +0 -331
  133. test/integration/connectors/utils/validation/utils.py +0 -36
  134. test/integration/connectors/weaviate/__init__.py +0 -0
  135. test/integration/connectors/weaviate/conftest.py +0 -15
  136. test/integration/connectors/weaviate/test_cloud.py +0 -39
  137. test/integration/connectors/weaviate/test_local.py +0 -152
  138. test/integration/embedders/__init__.py +0 -0
  139. test/integration/embedders/conftest.py +0 -13
  140. test/integration/embedders/test_azure_openai.py +0 -57
  141. test/integration/embedders/test_bedrock.py +0 -103
  142. test/integration/embedders/test_huggingface.py +0 -24
  143. test/integration/embedders/test_mixedbread.py +0 -71
  144. test/integration/embedders/test_octoai.py +0 -75
  145. test/integration/embedders/test_openai.py +0 -74
  146. test/integration/embedders/test_togetherai.py +0 -71
  147. test/integration/embedders/test_vertexai.py +0 -63
  148. test/integration/embedders/test_voyageai.py +0 -79
  149. test/integration/embedders/utils.py +0 -66
  150. test/integration/partitioners/__init__.py +0 -0
  151. test/integration/partitioners/test_partitioner.py +0 -76
  152. test/integration/utils.py +0 -15
  153. test/unit/__init__.py +0 -0
  154. test/unit/chunkers/__init__.py +0 -0
  155. test/unit/chunkers/test_chunkers.py +0 -49
  156. test/unit/connectors/__init__.py +0 -0
  157. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  158. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  159. test/unit/connectors/motherduck/__init__.py +0 -0
  160. test/unit/connectors/motherduck/test_base.py +0 -73
  161. test/unit/connectors/sql/__init__.py +0 -0
  162. test/unit/connectors/sql/test_sql.py +0 -152
  163. test/unit/connectors/test_confluence.py +0 -71
  164. test/unit/connectors/test_jira.py +0 -401
  165. test/unit/embed/__init__.py +0 -0
  166. test/unit/embed/test_mixedbreadai.py +0 -42
  167. test/unit/embed/test_octoai.py +0 -27
  168. test/unit/embed/test_openai.py +0 -28
  169. test/unit/embed/test_vertexai.py +0 -25
  170. test/unit/embed/test_voyageai.py +0 -24
  171. test/unit/embedders/__init__.py +0 -0
  172. test/unit/embedders/test_bedrock.py +0 -36
  173. test/unit/embedders/test_huggingface.py +0 -48
  174. test/unit/embedders/test_mixedbread.py +0 -37
  175. test/unit/embedders/test_octoai.py +0 -35
  176. test/unit/embedders/test_openai.py +0 -35
  177. test/unit/embedders/test_togetherai.py +0 -37
  178. test/unit/embedders/test_vertexai.py +0 -37
  179. test/unit/embedders/test_voyageai.py +0 -38
  180. test/unit/partitioners/__init__.py +0 -0
  181. test/unit/partitioners/test_partitioner.py +0 -63
  182. test/unit/test_error.py +0 -27
  183. test/unit/test_html.py +0 -112
  184. test/unit/test_interfaces.py +0 -26
  185. test/unit/test_logger.py +0 -78
  186. test/unit/test_utils.py +0 -220
  187. test/unit/utils/__init__.py +0 -0
  188. test/unit/utils/data_generator.py +0 -32
  189. unstructured_ingest-0.7.1.dist-info/METADATA +0 -383
  190. unstructured_ingest-0.7.1.dist-info/top_level.txt +0 -3
  191. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
  192. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
--- a/test/integration/connectors/elasticsearch/test_elasticsearch.py
+++ /dev/null
@@ -1,331 +0,0 @@
-# ruff: noqa: I001
-import json
-import tempfile
-import time
-from contextlib import contextmanager
-from pathlib import Path
-from typing import Generator
-from test.integration.connectors.utils.validation.destination import (
-    StagerValidationConfigs,
-    stager_validation,
-)
-import pandas as pd
-import pytest
-from _pytest.fixtures import TopRequest
-from elasticsearch import Elasticsearch as ElasticsearchClient
-from elasticsearch.helpers import bulk
-
-from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, NOSQL_TAG
-from test.integration.connectors.utils.docker import HealthCheck, container_context
-from test.integration.connectors.utils.validation.source import (
-    SourceValidationConfigs,
-    source_connector_validation,
-)
-from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
-from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
-from unstructured_ingest.processes.connectors.elasticsearch.elasticsearch import (
-    CONNECTOR_TYPE,
-    ElasticsearchAccessConfig,
-    ElasticsearchConnectionConfig,
-    ElasticsearchDownloader,
-    ElasticsearchDownloaderConfig,
-    ElasticsearchIndexer,
-    ElasticsearchIndexerConfig,
-    ElasticsearchUploader,
-    ElasticsearchUploaderConfig,
-    ElasticsearchUploadStager,
-    ElasticsearchUploadStagerConfig,
-)
-
-SOURCE_INDEX_NAME = "movies"
-DESTINATION_INDEX_NAME = "elements"
-ES_USERNAME = "elastic"
-ES_PASSWORD = "elastic_password"
-
-
-@contextmanager
-def get_client() -> Generator[ElasticsearchClient, None, None]:
-    with ElasticsearchClient(
-        hosts="http://localhost:9200", basic_auth=(ES_USERNAME, ES_PASSWORD), request_timeout=30
-    ) as client:
-        yield client
-
-
-def form_elasticsearch_doc_dict(i, csv_row):
-    return {
-        "_index": SOURCE_INDEX_NAME,
-        "_id": i,
-        "_source": {
-            "title": csv_row["Title"],
-            "ethnicity": csv_row["Origin/Ethnicity"],
-            "director": csv_row["Director"],
-            "cast": csv_row["Cast"],
-            "genre": csv_row["Genre"],
-            "plot": csv_row["Plot"],
-            "year": csv_row["Release Year"],
-            "wiki_page": csv_row["Wiki Page"],
-        },
-    }
-
-
-def dataframe_to_upload_data(df: pd.DataFrame) -> list[dict]:
-    upload_data = []
-    for index, row in df.iterrows():
-        upload_data.append(form_elasticsearch_doc_dict(index, row))
-    return upload_data
-
-
-def get_index_count(client: ElasticsearchClient, index_name: str) -> int:
-    count_resp = client.cat.count(index=index_name, format="json")
-    return int(count_resp[0]["count"])
-
-
-def validate_count(
-    client: ElasticsearchClient,
-    index_name: str,
-    expected_count: int,
-    retries: int = 10,
-    interval: int = 1,
-) -> None:
-    current_count = get_index_count(client, index_name)
-    if current_count == expected_count:
-        return
-    tries = 0
-    while tries < retries:
-        print(
-            f"retrying validation to check if expected count "
-            f"{expected_count} will match current count {current_count}"
-        )
-        time.sleep(interval)
-        current_count = get_index_count(client, index_name)
-        if current_count == expected_count:
-            break
-    assert current_count == expected_count, (
-        f"Expected count ({expected_count}) doesn't match how "
-        f"much came back from index: {current_count}"
-    )
-
-
-def seed_source_db(df: pd.DataFrame):
-    mapping = {
-        "properties": {
-            "title": {"type": "text", "analyzer": "english"},
-            "ethnicity": {"type": "text", "analyzer": "standard"},
-            "director": {"type": "text", "analyzer": "standard"},
-            "cast": {"type": "text", "analyzer": "standard"},
-            "genre": {"type": "text", "analyzer": "standard"},
-            "plot": {"type": "text", "analyzer": "english"},
-            "year": {"type": "integer"},
-            "wiki_page": {"type": "keyword"},
-        },
-    }
-    # seed content
-    with get_client() as client:
-        client.indices.create(index=SOURCE_INDEX_NAME, mappings=mapping)
-        upload_data = dataframe_to_upload_data(df=df)
-        bulk(client, upload_data)
-        client.indices.refresh(index=SOURCE_INDEX_NAME)
-        count = get_index_count(client, SOURCE_INDEX_NAME)
-        print(f"seeded {SOURCE_INDEX_NAME} index with {count} records")
-
-
-@pytest.fixture
-def source_index(movies_dataframe: pd.DataFrame) -> str:
-    with container_context(
-        image="docker.elastic.co/elasticsearch/elasticsearch:8.7.0",
-        ports={9200: 9200, 9300: 9300},
-        environment={
-            "discovery.type": "single-node",
-            "xpack.security.enabled": True,
-            "ELASTIC_PASSWORD": ES_PASSWORD,
-            "ELASTIC_USER": ES_USERNAME,
-        },
-        healthcheck=HealthCheck(
-            test="curl --silent --fail -u ${ELASTIC_USER}:${ELASTIC_PASSWORD} localhost:9200/_cluster/health || exit 1", # noqa: E501
-            interval=1,
-            start_period=5,
-        ),
-    ):
-        seed_source_db(df=movies_dataframe)
-        yield SOURCE_INDEX_NAME
-
-
-@pytest.fixture
-def destination_index(elasticsearch_elements_mapping: dict) -> str:
-    with container_context(
-        image="docker.elastic.co/elasticsearch/elasticsearch:8.7.0",
-        ports={9200: 9200, 9300: 9300},
-        environment={
-            "discovery.type": "single-node",
-            "xpack.security.enabled": True,
-            "ELASTIC_PASSWORD": ES_PASSWORD,
-            "ELASTIC_USER": ES_USERNAME,
-        },
-        healthcheck=HealthCheck(
-            test="curl --silent --fail -u ${ELASTIC_USER}:${ELASTIC_PASSWORD} localhost:9200/_cluster/health || exit 1", # noqa: E501
-            interval=1,
-            start_period=5,
-        ),
-    ):
-        with get_client() as client:
-            response = client.indices.create(
-                index=DESTINATION_INDEX_NAME, mappings=elasticsearch_elements_mapping
-            )
-            if not response["acknowledged"]:
-                raise RuntimeError(f"failed to create index: {response}")
-        yield DESTINATION_INDEX_NAME
-
-
-@pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
-async def test_elasticsearch_source(source_index: str, movies_dataframe: pd.DataFrame):
-    indexer_config = ElasticsearchIndexerConfig(index_name=source_index)
-    with tempfile.TemporaryDirectory() as tempdir:
-        tempdir_path = Path(tempdir)
-        connection_config = ElasticsearchConnectionConfig(
-            access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
-            username=ES_USERNAME,
-            hosts=["http://localhost:9200"],
-        )
-        download_config = ElasticsearchDownloaderConfig(download_dir=tempdir_path)
-        indexer = ElasticsearchIndexer(
-            connection_config=connection_config, index_config=indexer_config
-        )
-        downloader = ElasticsearchDownloader(
-            connection_config=connection_config, download_config=download_config
-        )
-        expected_num_files = len(movies_dataframe)
-        await source_connector_validation(
-            indexer=indexer,
-            downloader=downloader,
-            configs=SourceValidationConfigs(
-                test_id=CONNECTOR_TYPE,
-                expected_num_files=expected_num_files,
-                expected_number_indexed_file_data=1,
-                validate_downloaded_files=True,
-            ),
-        )
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
-def test_elasticsearch_source_precheck_fail_no_cluster():
-    indexer_config = ElasticsearchIndexerConfig(index_name="index")
-
-    connection_config = ElasticsearchConnectionConfig(
-        access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
-        username=ES_USERNAME,
-        hosts=["http://localhost:9200"],
-    )
-    indexer = ElasticsearchIndexer(connection_config=connection_config, index_config=indexer_config)
-    with pytest.raises(SourceConnectionError):
-        indexer.precheck()
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
-def test_elasticsearch_source_precheck_fail_no_index(source_index: str):
-    indexer_config = ElasticsearchIndexerConfig(index_name="index")
-
-    connection_config = ElasticsearchConnectionConfig(
-        access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
-        username=ES_USERNAME,
-        hosts=["http://localhost:9200"],
-    )
-    indexer = ElasticsearchIndexer(connection_config=connection_config, index_config=indexer_config)
-    with pytest.raises(SourceConnectionError):
-        indexer.precheck()
-
-
-@pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
-async def test_elasticsearch_destination(
-    upload_file: Path,
-    destination_index: str,
-    tmp_path: Path,
-):
-    file_data = FileData(
-        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
-        connector_type=CONNECTOR_TYPE,
-        identifier="mock file data",
-    )
-    connection_config = ElasticsearchConnectionConfig(
-        access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
-        username=ES_USERNAME,
-        hosts=["http://localhost:9200"],
-    )
-    stager = ElasticsearchUploadStager(
-        upload_stager_config=ElasticsearchUploadStagerConfig(index_name=destination_index)
-    )
-
-    uploader = ElasticsearchUploader(
-        connection_config=connection_config,
-        upload_config=ElasticsearchUploaderConfig(index_name=destination_index),
-    )
-    staged_filepath = stager.run(
-        elements_filepath=upload_file,
-        file_data=file_data,
-        output_dir=tmp_path,
-        output_filename=upload_file.name,
-    )
-    uploader.precheck()
-    uploader.run(path=staged_filepath, file_data=file_data)
-
-    # Run validation
-    with staged_filepath.open() as f:
-        staged_elements = json.load(f)
-    expected_count = len(staged_elements)
-    with get_client() as client:
-        validate_count(client=client, expected_count=expected_count, index_name=destination_index)
-
-    # Rerun and make sure the same documents get updated
-    uploader.run(path=staged_filepath, file_data=file_data)
-    with get_client() as client:
-        validate_count(client=client, expected_count=expected_count, index_name=destination_index)
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
-def test_elasticsearch_destination_precheck_fail():
-    connection_config = ElasticsearchConnectionConfig(
-        access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
-        username=ES_USERNAME,
-        hosts=["http://localhost:9200"],
-    )
-    uploader = ElasticsearchUploader(
-        connection_config=connection_config,
-        upload_config=ElasticsearchUploaderConfig(index_name="index"),
-    )
-    with pytest.raises(DestinationConnectionError):
-        uploader.precheck()
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
-def test_elasticsearch_destination_precheck_fail_no_index(destination_index: str):
-    connection_config = ElasticsearchConnectionConfig(
-        access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
-        username=ES_USERNAME,
-        hosts=["http://localhost:9200"],
-    )
-    uploader = ElasticsearchUploader(
-        connection_config=connection_config,
-        upload_config=ElasticsearchUploaderConfig(index_name="index"),
-    )
-    with pytest.raises(DestinationConnectionError):
-        uploader.precheck()
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
-@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
-def test_elasticsearch_stager(
-    request: TopRequest,
-    upload_file_str: str,
-    tmp_path: Path,
-):
-    upload_file: Path = request.getfixturevalue(upload_file_str)
-    stager = ElasticsearchUploadStager(
-        upload_stager_config=ElasticsearchUploadStagerConfig(index_name="mock_index")
-    )
-    stager_validation(
-        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
-        input_file=upload_file,
-        stager=stager,
-        tmp_dir=tmp_path,
-    )
--- a/test/integration/connectors/elasticsearch/test_opensearch.py
+++ /dev/null
@@ -1,326 +0,0 @@
-import json
-import tempfile
-import time
-from contextlib import contextmanager
-from pathlib import Path
-from typing import Generator
-
-import pandas as pd
-import pytest
-from _pytest.fixtures import TopRequest
-from opensearchpy import Document, Keyword, OpenSearch, Text
-
-from test.integration.connectors.utils.constants import DESTINATION_TAG, NOSQL_TAG, SOURCE_TAG
-from test.integration.connectors.utils.docker import HealthCheck, container_context
-from test.integration.connectors.utils.validation.destination import (
-    StagerValidationConfigs,
-    stager_validation,
-)
-from test.integration.connectors.utils.validation.source import (
-    SourceValidationConfigs,
-    source_connector_validation,
-)
-from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
-from unstructured_ingest.error import (
-    DestinationConnectionError,
-    SourceConnectionError,
-)
-from unstructured_ingest.processes.connectors.elasticsearch.opensearch import (
-    CONNECTOR_TYPE,
-    OpenSearchAccessConfig,
-    OpenSearchConnectionConfig,
-    OpenSearchDownloader,
-    OpenSearchDownloaderConfig,
-    OpenSearchIndexer,
-    OpenSearchIndexerConfig,
-    OpenSearchUploader,
-    OpenSearchUploaderConfig,
-    OpenSearchUploadStager,
-    OpenSearchUploadStagerConfig,
-)
-
-SOURCE_INDEX_NAME = "movies"
-DESTINATION_INDEX_NAME = "elements"
-
-
-class Movie(Document):
-    title = Text(fields={"raw": Keyword()})
-    year = Text()
-    director = Text()
-    cast = Text()
-    genre = Text()
-    wiki_page = Text()
-    ethnicity = Text()
-    plot = Text()
-
-    class Index:
-        name = SOURCE_INDEX_NAME
-
-    def save(self, **kwargs):
-        return super(Movie, self).save(**kwargs)
-
-
-@contextmanager
-def get_client() -> Generator[OpenSearch, None, None]:
-    with OpenSearch(
-        hosts=[{"host": "localhost", "port": 9200}],
-        http_auth=("admin", "admin"),
-        use_ssl=True,
-        verify_certs=False,
-        ssl_show_warn=False,
-    ) as client:
-        yield client
-
-
-def get_index_count(client: OpenSearch, index_name: str) -> int:
-    count_resp = client.cat.count(index=index_name, params={"format": "json"})
-    return int(count_resp[0]["count"])
-
-
-def wait_for_write(
-    client: OpenSearch, index_name: str, expected_count: int, timeout: int = 30, interval: int = 1
-) -> None:
-    current_count = get_index_count(client, index_name)
-    start = time.time()
-    while time.time() - start < timeout:
-        print(f"waiting for current count ({current_count}) to match expected {expected_count}")
-        time.sleep(interval)
-        current_count = get_index_count(client, index_name)
-        if current_count == expected_count:
-            return
-    raise TimeoutError("Timed out while waiting for write to sync")
-
-
-def validate_count(
-    client: OpenSearch, index_name: str, expected_count: int, retries: int = 10, interval: int = 1
-) -> None:
-    current_count = get_index_count(client, index_name)
-    if current_count == expected_count:
-        return
-    tries = 0
-    while tries < retries:
-        print(
-            f"retrying validation to check if expected count "
-            f"{expected_count} will match current count {current_count}"
-        )
-        time.sleep(interval)
-        current_count = get_index_count(client, index_name)
-        if current_count == expected_count:
-            break
-    assert current_count == expected_count, (
-        f"Expected count ({expected_count}) doesn't match how "
-        f"much came back from index: {current_count}"
-    )
-
-
-@pytest.fixture
-def source_index(movies_dataframe: pd.DataFrame) -> str:
-    with container_context(
-        image="opensearchproject/opensearch:2.11.1",
-        ports={9200: 9200, 9600: 9600},
-        environment={"discovery.type": "single-node"},
-        healthcheck=HealthCheck(
-            test="curl --fail https://localhost:9200/_cat/health -ku 'admin:admin' >/dev/null || exit 1", # noqa: E501
-            interval=1,
-        ),
-    ):
-        with get_client() as client:
-            Movie.init(using=client)
-            for i, row in movies_dataframe.iterrows():
-                movie = Movie(
-                    meta={"id": i},
-                    title=row["Title"],
-                    year=row["Release Year"],
-                    director=row["Director"],
-                    cast=row["Cast"],
-                    genre=row["Genre"],
-                    wiki_page=row["Wiki Page"],
-                    ethnicity=row["Origin/Ethnicity"],
-                    plot=row["Plot"],
-                )
-                movie.save(using=client)
-            wait_for_write(
-                client=client, index_name=SOURCE_INDEX_NAME, expected_count=len(movies_dataframe)
-            )
-        yield SOURCE_INDEX_NAME
-
-
-@pytest.fixture
-def destination_index(opensearch_elements_mapping: dict) -> str:
-    with container_context(
-        image="opensearchproject/opensearch:2.11.1",
-        ports={9200: 9200, 9600: 9600},
-        environment={"discovery.type": "single-node"},
-        healthcheck=HealthCheck(
-            test="curl --fail https://localhost:9200/_cat/health -ku 'admin:admin' >/dev/null || exit 1", # noqa: E501
-            interval=1,
-        ),
-    ):
-        with get_client() as client:
-            response = client.indices.create(
-                index=DESTINATION_INDEX_NAME, body=opensearch_elements_mapping
-            )
-            if not response["acknowledged"]:
-                raise RuntimeError(f"failed to create index: {response}")
-        yield DESTINATION_INDEX_NAME
-
-
-@pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
-async def test_opensearch_source(source_index: str, movies_dataframe: pd.DataFrame):
-    indexer_config = OpenSearchIndexerConfig(index_name=source_index)
-    with tempfile.TemporaryDirectory() as tempdir:
-        tempdir_path = Path(tempdir)
-        connection_config = OpenSearchConnectionConfig(
-            access_config=OpenSearchAccessConfig(password="admin"),
-            username="admin",
-            hosts=["http://localhost:9200"],
-            use_ssl=True,
-        )
-        download_config = OpenSearchDownloaderConfig(download_dir=tempdir_path)
-        indexer = OpenSearchIndexer(
-            connection_config=connection_config, index_config=indexer_config
-        )
-        downloader = OpenSearchDownloader(
-            connection_config=connection_config, download_config=download_config
-        )
-        expected_num_files = len(movies_dataframe)
-        await source_connector_validation(
-            indexer=indexer,
-            downloader=downloader,
-            configs=SourceValidationConfigs(
-                test_id=CONNECTOR_TYPE,
-                expected_num_files=expected_num_files,
-                expected_number_indexed_file_data=1,
-                validate_downloaded_files=True,
-            ),
-        )
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
-def test_opensearch_source_precheck_fail_no_cluster():
-    indexer_config = OpenSearchIndexerConfig(index_name="index")
-
-    connection_config = OpenSearchConnectionConfig(
-        access_config=OpenSearchAccessConfig(password="admin"),
-        username="admin",
-        hosts=["http://localhost:9200"],
-        use_ssl=True,
-    )
-    indexer = OpenSearchIndexer(connection_config=connection_config, index_config=indexer_config)
-    with pytest.raises(SourceConnectionError):
-        indexer.precheck()
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
-def test_opensearch_source_precheck_fail_no_index(source_index: str):
-    indexer_config = OpenSearchIndexerConfig(index_name="index")
-
-    connection_config = OpenSearchConnectionConfig(
-        access_config=OpenSearchAccessConfig(password="admin"),
-        username="admin",
-        hosts=["http://localhost:9200"],
-        use_ssl=True,
-    )
-    indexer = OpenSearchIndexer(connection_config=connection_config, index_config=indexer_config)
-    with pytest.raises(SourceConnectionError):
-        indexer.precheck()
-
-
-@pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
-async def test_opensearch_destination(
-    upload_file: Path,
-    destination_index: str,
-    tmp_path: Path,
-):
-    file_data = FileData(
-        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
-        connector_type=CONNECTOR_TYPE,
-        identifier="mock file data",
-    )
-    connection_config = OpenSearchConnectionConfig(
-        access_config=OpenSearchAccessConfig(password="admin"),
-        username="admin",
-        hosts=["http://localhost:9200"],
-        use_ssl=True,
-    )
-    stager = OpenSearchUploadStager(
-        upload_stager_config=OpenSearchUploadStagerConfig(index_name=destination_index)
-    )
-
-    uploader = OpenSearchUploader(
-        connection_config=connection_config,
-        upload_config=OpenSearchUploaderConfig(index_name=destination_index),
-    )
-    staged_filepath = stager.run(
-        elements_filepath=upload_file,
-        file_data=file_data,
-        output_dir=tmp_path,
-        output_filename=upload_file.name,
-    )
-    uploader.precheck()
-    uploader.run(path=staged_filepath, file_data=file_data)
-
-    # Run validation
-    with staged_filepath.open() as f:
-        staged_elements = json.load(f)
-    expected_count = len(staged_elements)
-    with get_client() as client:
-        validate_count(client=client, expected_count=expected_count, index_name=destination_index)
-
-    # Rerun and make sure the same documents get updated
-    uploader.run(path=staged_filepath, file_data=file_data)
-    with get_client() as client:
-        validate_count(client=client, expected_count=expected_count, index_name=destination_index)
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
-def test_opensearch_destination_precheck_fail():
-    connection_config = OpenSearchConnectionConfig(
-        access_config=OpenSearchAccessConfig(password="admin"),
-        username="admin",
-        hosts=["http://localhost:9200"],
-        use_ssl=True,
-    )
-    uploader = OpenSearchUploader(
-        connection_config=connection_config,
-        upload_config=OpenSearchUploaderConfig(index_name="index"),
-    )
-    with pytest.raises(DestinationConnectionError):
-        uploader.precheck()
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
-def test_opensearch_destination_precheck_fail_no_index(destination_index: str):
-    connection_config = OpenSearchConnectionConfig(
-        access_config=OpenSearchAccessConfig(password="admin"),
-        username="admin",
-        hosts=["http://localhost:9200"],
-        use_ssl=True,
-    )
-    uploader = OpenSearchUploader(
-        connection_config=connection_config,
-        upload_config=OpenSearchUploaderConfig(index_name="index"),
-    )
-    with pytest.raises(DestinationConnectionError):
-        uploader.precheck()
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
-@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
-def test_opensearch_stager(
-    request: TopRequest,
-    upload_file_str: str,
-    tmp_path: Path,
-):
-    upload_file: Path = request.getfixturevalue(upload_file_str)
-    stager = OpenSearchUploadStager(
-        upload_stager_config=OpenSearchUploadStagerConfig(index_name="mock_index")
-    )
-    stager_validation(
-        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
-        input_file=upload_file,
-        stager=stager,
-        tmp_dir=tmp_path,
-    )
Remaining files (the relocated entry_points.txt and LICENSE.md entries above): no content changes.