unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (187)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/main.py +0 -0
  7. unstructured_ingest/pipeline/interfaces.py +1 -1
  8. unstructured_ingest/pipeline/pipeline.py +1 -1
  9. unstructured_ingest/processes/chunker.py +4 -0
  10. unstructured_ingest/processes/connectors/airtable.py +4 -2
  11. unstructured_ingest/processes/connectors/astradb.py +48 -34
  12. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  13. unstructured_ingest/processes/connectors/confluence.py +0 -1
  14. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  15. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  16. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  18. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  19. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  20. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  21. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  22. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  23. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  24. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  25. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  26. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  27. unstructured_ingest/processes/connectors/outlook.py +1 -2
  28. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  29. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  30. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  31. unstructured_ingest/processes/connectors/slack.py +1 -2
  32. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  33. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  34. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  35. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  37. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  38. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  39. unstructured_ingest/processes/connectors/vectara.py +0 -2
  40. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  41. unstructured_ingest/processes/embedder.py +2 -2
  42. unstructured_ingest/processes/filter.py +1 -1
  43. unstructured_ingest/processes/partitioner.py +4 -0
  44. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  45. unstructured_ingest/unstructured_api.py +13 -8
  46. unstructured_ingest/utils/data_prep.py +8 -32
  47. unstructured_ingest-1.0.2.dist-info/METADATA +226 -0
  48. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/RECORD +50 -184
  49. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/WHEEL +1 -2
  50. examples/__init__.py +0 -0
  51. examples/airtable.py +0 -44
  52. examples/azure_cognitive_search.py +0 -55
  53. examples/chroma.py +0 -54
  54. examples/couchbase.py +0 -55
  55. examples/databricks_volumes_dest.py +0 -55
  56. examples/databricks_volumes_source.py +0 -53
  57. examples/delta_table.py +0 -45
  58. examples/discord_example.py +0 -36
  59. examples/elasticsearch.py +0 -49
  60. examples/google_drive.py +0 -45
  61. examples/kdbai.py +0 -54
  62. examples/local.py +0 -36
  63. examples/milvus.py +0 -44
  64. examples/mongodb.py +0 -53
  65. examples/opensearch.py +0 -50
  66. examples/pinecone.py +0 -57
  67. examples/s3.py +0 -38
  68. examples/salesforce.py +0 -44
  69. examples/sharepoint.py +0 -47
  70. examples/singlestore.py +0 -49
  71. examples/sql.py +0 -90
  72. examples/vectara.py +0 -54
  73. examples/weaviate.py +0 -44
  74. test/__init__.py +0 -0
  75. test/integration/__init__.py +0 -0
  76. test/integration/chunkers/__init__.py +0 -0
  77. test/integration/chunkers/test_chunkers.py +0 -31
  78. test/integration/connectors/__init__.py +0 -0
  79. test/integration/connectors/conftest.py +0 -38
  80. test/integration/connectors/databricks/__init__.py +0 -0
  81. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  82. test/integration/connectors/discord/__init__.py +0 -0
  83. test/integration/connectors/discord/test_discord.py +0 -90
  84. test/integration/connectors/duckdb/__init__.py +0 -0
  85. test/integration/connectors/duckdb/conftest.py +0 -14
  86. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  87. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  88. test/integration/connectors/elasticsearch/__init__.py +0 -0
  89. test/integration/connectors/elasticsearch/conftest.py +0 -34
  90. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  91. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  92. test/integration/connectors/sql/__init__.py +0 -0
  93. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  94. test/integration/connectors/sql/test_postgres.py +0 -201
  95. test/integration/connectors/sql/test_singlestore.py +0 -182
  96. test/integration/connectors/sql/test_snowflake.py +0 -244
  97. test/integration/connectors/sql/test_sqlite.py +0 -168
  98. test/integration/connectors/sql/test_vastdb.py +0 -34
  99. test/integration/connectors/test_astradb.py +0 -287
  100. test/integration/connectors/test_azure_ai_search.py +0 -254
  101. test/integration/connectors/test_chroma.py +0 -136
  102. test/integration/connectors/test_confluence.py +0 -111
  103. test/integration/connectors/test_delta_table.py +0 -183
  104. test/integration/connectors/test_dropbox.py +0 -151
  105. test/integration/connectors/test_github.py +0 -49
  106. test/integration/connectors/test_google_drive.py +0 -257
  107. test/integration/connectors/test_jira.py +0 -67
  108. test/integration/connectors/test_lancedb.py +0 -247
  109. test/integration/connectors/test_milvus.py +0 -208
  110. test/integration/connectors/test_mongodb.py +0 -335
  111. test/integration/connectors/test_neo4j.py +0 -244
  112. test/integration/connectors/test_notion.py +0 -152
  113. test/integration/connectors/test_onedrive.py +0 -163
  114. test/integration/connectors/test_pinecone.py +0 -387
  115. test/integration/connectors/test_qdrant.py +0 -216
  116. test/integration/connectors/test_redis.py +0 -143
  117. test/integration/connectors/test_s3.py +0 -184
  118. test/integration/connectors/test_sharepoint.py +0 -222
  119. test/integration/connectors/test_vectara.py +0 -282
  120. test/integration/connectors/test_zendesk.py +0 -120
  121. test/integration/connectors/utils/__init__.py +0 -0
  122. test/integration/connectors/utils/constants.py +0 -13
  123. test/integration/connectors/utils/docker.py +0 -151
  124. test/integration/connectors/utils/docker_compose.py +0 -59
  125. test/integration/connectors/utils/validation/__init__.py +0 -0
  126. test/integration/connectors/utils/validation/destination.py +0 -77
  127. test/integration/connectors/utils/validation/equality.py +0 -76
  128. test/integration/connectors/utils/validation/source.py +0 -331
  129. test/integration/connectors/utils/validation/utils.py +0 -36
  130. test/integration/connectors/weaviate/__init__.py +0 -0
  131. test/integration/connectors/weaviate/conftest.py +0 -15
  132. test/integration/connectors/weaviate/test_cloud.py +0 -39
  133. test/integration/connectors/weaviate/test_local.py +0 -152
  134. test/integration/embedders/__init__.py +0 -0
  135. test/integration/embedders/conftest.py +0 -13
  136. test/integration/embedders/test_azure_openai.py +0 -57
  137. test/integration/embedders/test_bedrock.py +0 -103
  138. test/integration/embedders/test_huggingface.py +0 -24
  139. test/integration/embedders/test_mixedbread.py +0 -71
  140. test/integration/embedders/test_octoai.py +0 -75
  141. test/integration/embedders/test_openai.py +0 -74
  142. test/integration/embedders/test_togetherai.py +0 -71
  143. test/integration/embedders/test_vertexai.py +0 -63
  144. test/integration/embedders/test_voyageai.py +0 -79
  145. test/integration/embedders/utils.py +0 -66
  146. test/integration/partitioners/__init__.py +0 -0
  147. test/integration/partitioners/test_partitioner.py +0 -76
  148. test/integration/utils.py +0 -15
  149. test/unit/__init__.py +0 -0
  150. test/unit/chunkers/__init__.py +0 -0
  151. test/unit/chunkers/test_chunkers.py +0 -49
  152. test/unit/connectors/__init__.py +0 -0
  153. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  154. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  155. test/unit/connectors/motherduck/__init__.py +0 -0
  156. test/unit/connectors/motherduck/test_base.py +0 -73
  157. test/unit/connectors/sql/__init__.py +0 -0
  158. test/unit/connectors/sql/test_sql.py +0 -152
  159. test/unit/connectors/test_confluence.py +0 -71
  160. test/unit/connectors/test_jira.py +0 -401
  161. test/unit/embed/__init__.py +0 -0
  162. test/unit/embed/test_mixedbreadai.py +0 -42
  163. test/unit/embed/test_octoai.py +0 -27
  164. test/unit/embed/test_openai.py +0 -28
  165. test/unit/embed/test_vertexai.py +0 -25
  166. test/unit/embed/test_voyageai.py +0 -24
  167. test/unit/embedders/__init__.py +0 -0
  168. test/unit/embedders/test_bedrock.py +0 -36
  169. test/unit/embedders/test_huggingface.py +0 -48
  170. test/unit/embedders/test_mixedbread.py +0 -37
  171. test/unit/embedders/test_octoai.py +0 -35
  172. test/unit/embedders/test_openai.py +0 -35
  173. test/unit/embedders/test_togetherai.py +0 -37
  174. test/unit/embedders/test_vertexai.py +0 -37
  175. test/unit/embedders/test_voyageai.py +0 -38
  176. test/unit/partitioners/__init__.py +0 -0
  177. test/unit/partitioners/test_partitioner.py +0 -63
  178. test/unit/test_error.py +0 -27
  179. test/unit/test_html.py +0 -112
  180. test/unit/test_interfaces.py +0 -26
  181. test/unit/test_utils.py +0 -220
  182. test/unit/utils/__init__.py +0 -0
  183. test/unit/utils/data_generator.py +0 -32
  184. unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
  185. unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
  186. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/entry_points.txt +0 -0
  187. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info/licenses}/LICENSE.md +0 -0
@@ -1,282 +0,0 @@
1
- import json
2
- import os
3
- import time
4
- from functools import lru_cache
5
- from pathlib import Path
6
- from typing import Generator
7
- from uuid import uuid4
8
-
9
- import pytest
10
- import requests
11
-
12
- from test.integration.connectors.utils.constants import DESTINATION_TAG, NOSQL_TAG
13
- from test.integration.utils import requires_env
14
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
15
- from unstructured_ingest.logger import logger
16
- from unstructured_ingest.processes.connectors.vectara import (
17
- CONNECTOR_TYPE as VECTARA_CONNECTOR_TYPE,
18
- )
19
- from unstructured_ingest.processes.connectors.vectara import (
20
- VectaraAccessConfig,
21
- VectaraConnectionConfig,
22
- VectaraUploader,
23
- VectaraUploaderConfig,
24
- VectaraUploadStager,
25
- VectaraUploadStagerConfig,
26
- )
27
-
28
-
29
def validate_upload(document: dict, expected_data: dict):
    """Assert that the document fetched from Vectara matches the staged element."""
    logger.info(f"validating document: {document}")
    expected_meta = expected_data["metadata"]

    assert document is not None
    parts = document["parts"]
    assert parts
    lead_part = parts[0]
    assert lead_part["text"] == expected_data["text"]
    part_meta = lead_part["metadata"]
    assert part_meta
    assert part_meta["element_id"] == expected_data["element_id"]
    assert part_meta["filename"] == expected_meta["filename"]
    assert part_meta["filetype"] == expected_meta["filetype"]
    assert part_meta["page_number"] == expected_meta["page_number"]
48
-
49
-
50
@requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
@lru_cache()
def _get_jwt_token():
    """Connect to the server and get a JWT token."""
    customer_id = os.environ["VECTARA_CUSTOMER_ID"]
    # Cognito OAuth2 client-credentials flow against the customer's token endpoint.
    resp = requests.post(
        f"https://vectara-prod-{customer_id}.auth.us-west-2.amazoncognito.com/oauth2/token",
        headers={"Content-Type": "application/x-www-form-urlencoded"},
        data={
            "grant_type": "client_credentials",
            "client_id": os.environ["VECTARA_OAUTH_CLIENT_ID"],
            "client_secret": os.environ["VECTARA_OAUTH_SECRET"],
        },
    )
    resp.raise_for_status()
    return resp.json().get("access_token")
72
-
73
-
74
def list_documents(corpus_key: str) -> list[str]:
    """Return metadata for every document currently stored in the corpus."""
    # the query below requires the corpus to have filter attributes for element_id
    resp = requests.get(
        f"https://api.vectara.io/v2/corpora/{corpus_key}/documents",
        headers={
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {_get_jwt_token()}",
            "X-source": "unstructured",
        },
    )
    resp.raise_for_status()
    return resp.json().get("documents", [])
93
-
94
-
95
def fetch_document(corpus_key: str, documents_id: str) -> dict:
    """Fetch a single document (with its parts) from the corpus by id."""
    resp = requests.get(
        f"https://api.vectara.io/v2/corpora/{corpus_key}/documents/{documents_id}",
        headers={
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {_get_jwt_token()}",
            "X-source": "unstructured",
        },
    )
    resp.raise_for_status()
    return resp.json()
108
-
109
-
110
def create_corpora(corpus_key: str, corpus_name: str) -> None:
    """Create a throwaway Vectara corpus for the integration test."""
    payload = {"key": corpus_key, "name": corpus_name, "description": "integration test"}
    resp = requests.post(
        "https://api.vectara.io/v2/corpora",
        headers={
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {_get_jwt_token()}",
            "X-source": "unstructured",
        },
        data=json.dumps(payload),
    )
    resp.raise_for_status()
123
-
124
-
125
def replace_filter_attributes(corpus_key: str) -> None:
    """Register element_id as a part-level filter attribute on the corpus."""
    payload = {
        "filter_attributes": [
            {"name": "element_id", "level": "part", "indexed": True, "type": "text"}
        ]
    }
    resp = requests.post(
        f"https://api.vectara.io/v2/corpora/{corpus_key}/replace_filter_attributes",
        headers={
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {_get_jwt_token()}",
            "X-source": "unstructured",
        },
        data=json.dumps(payload),
    )
    resp.raise_for_status()
144
-
145
-
146
def delete_corpora(corpus_key: str) -> None:
    """Delete the test corpus."""
    resp = requests.delete(
        f"https://api.vectara.io/v2/corpora/{corpus_key}",
        headers={
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {_get_jwt_token()}",
            "X-source": "unstructured",
        },
    )
    resp.raise_for_status()
159
-
160
-
161
def get_metadata(corpus_key: str):
    """Return the corpus metadata; raises requests.HTTPError if the corpus is absent."""
    resp = requests.get(
        f"https://api.vectara.io/v2/corpora/{corpus_key}",
        headers={
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {_get_jwt_token()}",
            "X-source": "unstructured",
        },
    )
    resp.raise_for_status()
    return resp.json()
173
-
174
-
175
def wait_for_ready(corpus_key: str, timeout=60, interval=2) -> None:
    """Poll corpus metadata until it is reachable or *timeout* seconds elapse."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            get_metadata(corpus_key)
        except requests.HTTPError:
            time.sleep(interval)
        else:
            return
    raise TimeoutError("time out waiting for corpus to be ready")
184
-
185
-
186
def wait_for_delete(corpus_key: str, timeout=60, interval=2) -> None:
    """Poll until metadata lookups fail (corpus gone) or *timeout* seconds elapse."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            get_metadata(corpus_key)
        except requests.HTTPError:
            # The lookup failing is the success condition here.
            return
        time.sleep(interval)
    raise TimeoutError("time out waiting for corpus to delete")
195
-
196
-
197
@pytest.fixture
def corpora_util() -> Generator[str, None, None]:
    """Create a uniquely named test corpus, yield its key, and always clean up.

    Yields:
        The corpus key of the freshly created corpus.
    """
    random_id = str(uuid4()).split("-")[0]
    corpus_key = f"ingest-test-{random_id}"
    corpus_name = "ingest-test"
    logger.info(f"Creating corpus with key: {corpus_key}")
    try:
        create_corpora(corpus_key, corpus_name)
        replace_filter_attributes(corpus_key)
        wait_for_ready(corpus_key=corpus_key)
        yield corpus_key
    except Exception as e:
        logger.error(f"failed to create corpus {corpus_key}: {e}")
        # Bug fix: the original swallowed this exception, so the generator ended
        # without yielding and pytest reported a confusing fixture error instead
        # of the real failure. Re-raise so the root cause surfaces.
        raise
    finally:
        logger.info(f"deleting corpus: {corpus_key}")
        # Cleanup is best-effort: if setup failed, the corpus may never have
        # been created and deletion would raise, masking the original error.
        try:
            delete_corpora(corpus_key)
            wait_for_delete(corpus_key=corpus_key)
        except requests.HTTPError as cleanup_error:
            logger.error(f"failed to delete corpus {corpus_key}: {cleanup_error}")
214
-
215
-
216
def wait_for_doc_meta(corpus_key: str, timeout=60, interval=1) -> list[str]:
    """Poll until the corpus reports at least one document; return the metadata."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        docs = list_documents(corpus_key)
        if docs:
            return docs
        time.sleep(interval)
    raise TimeoutError("time out waiting for document to be ready")
226
-
227
-
228
@pytest.mark.asyncio
@pytest.mark.tags(VECTARA_CONNECTOR_TYPE, DESTINATION_TAG, "vectara", NOSQL_TAG)
@requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
async def test_vectara_destination(
    upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=1
):
    """Stage a local elements file, upload it to Vectara, and validate the
    document that lands in the corpus against the first staged element."""
    corpus_key = corpora_util

    file_data = FileData(
        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
        connector_type=VECTARA_CONNECTOR_TYPE,
        identifier="mock-file-data",
    )

    stager = VectaraUploadStager(upload_stager_config=VectaraUploadStagerConfig())
    staged_path = stager.run(
        elements_filepath=upload_file,
        output_dir=tmp_path,
        output_filename=upload_file.name,
        file_data=file_data,
    )

    uploader = VectaraUploader(
        connection_config=VectaraConnectionConfig(
            customer_id=os.environ["VECTARA_CUSTOMER_ID"],
            corpus_key=corpus_key,
            access_config=VectaraAccessConfig(
                oauth_client_id=os.environ["VECTARA_OAUTH_CLIENT_ID"],
                oauth_secret=os.environ["VECTARA_OAUTH_SECRET"],
            ),
        ),
        upload_config=VectaraUploaderConfig(),
    )

    with staged_path.open() as staged_fp:
        staged_elements = json.load(staged_fp)

    # NOTE(review): nothing is uploaded when the uploader is not async —
    # presumably VectaraUploader.is_async() is always True here; confirm.
    if uploader.is_async():
        await uploader.run_data_async(data=staged_elements, file_data=file_data)

    with upload_file.open() as upload_fp:
        original_elements = json.load(upload_fp)

    docs_meta = wait_for_doc_meta(corpus_key)
    assert len(docs_meta) == 1
    document = fetch_document(corpus_key=corpus_key, documents_id=docs_meta[0]["id"])
    validate_upload(document=document, expected_data=original_elements[0])
@@ -1,120 +0,0 @@
1
- import os
2
- from pathlib import Path
3
-
4
- import pytest
5
-
6
- from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
7
- from test.integration.connectors.utils.validation.source import (
8
- SourceValidationConfigs,
9
- source_connector_validation,
10
- )
11
- from test.integration.utils import requires_env
12
- from unstructured_ingest.errors_v2 import UserAuthError
13
- from unstructured_ingest.processes.connectors.zendesk.zendesk import (
14
- CONNECTOR_TYPE,
15
- ZendeskAccessConfig,
16
- ZendeskConnectionConfig,
17
- ZendeskDownloader,
18
- ZendeskDownloaderConfig,
19
- ZendeskIndexer,
20
- ZendeskIndexerConfig,
21
- )
22
-
23
- SUBDOMAIN = "unstructuredhelp"
24
- EMAIL = "test@unstructured.io"
25
-
26
-
27
@pytest.mark.asyncio
@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
@requires_env("ZENDESK_TOKEN")
async def test_zendesk_source_tickets(temp_dir: Path):
    """Index and download Zendesk tickets, then validate the connector output."""
    connection_config = ZendeskConnectionConfig(
        subdomain=SUBDOMAIN,
        email=EMAIL,
        access_config=ZendeskAccessConfig(api_token=os.environ["ZENDESK_TOKEN"]),
    )

    indexer = ZendeskIndexer(
        connection_config=connection_config,
        index_config=ZendeskIndexerConfig(item_type="tickets"),
        connector_type=CONNECTOR_TYPE,
    )
    downloader = ZendeskDownloader(
        connection_config=connection_config,
        download_config=ZendeskDownloaderConfig(download_dir=temp_dir),
        connector_type=CONNECTOR_TYPE,
    )

    # Run the shared source-connector validation harness.
    await source_connector_validation(
        indexer=indexer,
        downloader=downloader,
        configs=SourceValidationConfigs(
            test_id="zendesk-tickets",
            expected_num_files=8,
            validate_file_data=False,
            validate_downloaded_files=True,
        ),
    )
64
-
65
-
66
@pytest.mark.asyncio
@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
@requires_env("ZENDESK_TOKEN")
async def test_zendesk_source_articles(temp_dir):
    """Index and download Zendesk articles (with image extraction) and validate."""
    connection_config = ZendeskConnectionConfig(
        subdomain=SUBDOMAIN,
        email=EMAIL,
        access_config=ZendeskAccessConfig(api_token=os.environ["ZENDESK_TOKEN"]),
    )

    indexer = ZendeskIndexer(
        connection_config=connection_config,
        index_config=ZendeskIndexerConfig(item_type="articles"),
        connector_type=CONNECTOR_TYPE,
    )
    downloader = ZendeskDownloader(
        connection_config=connection_config,
        download_config=ZendeskDownloaderConfig(download_dir=temp_dir, extract_images=True),
        connector_type=CONNECTOR_TYPE,
    )

    # Run the shared source-connector validation harness.
    await source_connector_validation(
        indexer=indexer,
        downloader=downloader,
        configs=SourceValidationConfigs(
            test_id="zendesk-articles",
            expected_num_files=8,
            validate_file_data=True,
            validate_downloaded_files=True,
        ),
    )
103
-
104
-
105
@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
def test_zendesk_source_articles_fail(temp_dir):
    """precheck() must raise UserAuthError when the API token is invalid."""
    # NOTE(review): despite the "articles" name this indexes "tickets";
    # presumably auth fails identically for either item type — confirm intended.
    indexer = ZendeskIndexer(
        connection_config=ZendeskConnectionConfig(
            subdomain=SUBDOMAIN,
            email=EMAIL,
            access_config=ZendeskAccessConfig(api_token="FAKE_TOKEN"),
        ),
        index_config=ZendeskIndexerConfig(item_type="tickets"),
        connector_type=CONNECTOR_TYPE,
    )
    with pytest.raises(expected_exception=UserAuthError):
        indexer.precheck()
File without changes
@@ -1,13 +0,0 @@
1
from pathlib import Path

# Connector-direction tags.
SOURCE_TAG = "source"
DESTINATION_TAG = "destination"

# Storage-category tags used by pytest.mark.tags across connector tests.
BLOB_STORAGE_TAG = "blob_storage"
SQL_TAG = "sql"
NOSQL_TAG = "nosql"
VECTOR_DB_TAG = "vector_db"
GRAPH_DB_TAG = "graph_db"
UNCATEGORIZED_TAG = "uncategorized"

# Shared fixture directories, resolved relative to the connectors test package.
env_setup_path = Path(__file__).parents[1] / "env_setup"
expected_results_path = Path(__file__).parents[1] / "expected_results"
@@ -1,151 +0,0 @@
1
- import time
2
- from contextlib import contextmanager
3
- from typing import Optional, Union
4
-
5
- import docker
6
- from docker.models.containers import Container
7
- from pydantic import BaseModel, Field, field_serializer
8
-
9
-
10
class HealthCheck(BaseModel):
    """Container healthcheck settings, serialized for the Docker SDK.

    Docker's Engine API expects Interval/Timeout/StartPeriod in nanoseconds,
    hence the ``* 10e8`` (== 1e9) conversions in the serializers; fields are
    declared in seconds for readability.
    """

    test: Union[str, list[str]]
    interval: int = Field(
        gt=0, default=30, description="The time to wait between checks in seconds."
    )
    timeout: int = Field(
        gt=0, default=30, description="The time to wait before considering the check to have hung."
    )
    retries: int = Field(
        gt=0,
        default=3,
        description="The number of consecutive failures needed "
        "to consider a container as unhealthy.",
    )
    # Bug fix: was gt=0, which contradicts the default of 0 and rejected an
    # explicit start_period=0 (a valid "no grace period" Docker setting).
    start_period: int = Field(
        ge=0,
        default=0,
        description="Start period for the container to initialize before starting health-retries countdown in seconds.",  # noqa: E501
    )

    @field_serializer("interval")
    def serialize_interval(self, interval: int) -> int:
        # seconds -> nanoseconds for the Docker API
        return int(interval * 10e8)

    @field_serializer("timeout")
    def serialize_timeout(self, timeout: int) -> int:
        # seconds -> nanoseconds for the Docker API
        return int(timeout * 10e8)

    @field_serializer("start_period")
    def serialize_start_period(self, start_period: int) -> int:
        # seconds -> nanoseconds for the Docker API
        return int(start_period * 10e8)
41
-
42
-
43
def get_container(
    docker_client: docker.DockerClient,
    image: str,
    ports: dict,
    name: Optional[str] = None,
    environment: Optional[dict] = None,
    volumes: Optional[dict] = None,
    healthcheck: Optional[HealthCheck] = None,
) -> Container:
    """Start *image* detached with the given ports, passing only the optional
    kwargs that were actually provided."""
    run_kwargs = {"image": image, "detach": True, "ports": ports}
    optional_kwargs = {"environment": environment, "volumes": volumes, "name": name}
    run_kwargs.update({key: value for key, value in optional_kwargs.items() if value})
    if healthcheck:
        # The SDK wants the serialized (nanosecond) representation.
        run_kwargs["healthcheck"] = healthcheck.model_dump()
    started: Container = docker_client.containers.run(**run_kwargs)
    return started
67
-
68
-
69
def get_healthcheck(container: Container) -> Optional[HealthCheck]:
    """Rebuild a HealthCheck model from the config Docker reports on *container*.

    Returns None when the image defines no healthcheck. Durations come back
    from the API in nanoseconds and are converted to seconds here.
    """
    raw = container.attrs.get("Config", {}).get("Healthcheck", None)
    if not raw:
        return None
    data = {"test": raw["Test"]}
    if interval_ns := raw.get("Interval"):
        data["interval"] = interval_ns / 10e8
    if start_ns := raw.get("StartPeriod"):
        data["start_period"] = start_ns / 10e8
    if retry_count := raw.get("Retries"):
        data["retries"] = retry_count
    return HealthCheck.model_validate(data)
83
-
84
-
85
def healthcheck_wait(
    container: Container, retries: int = 30, interval: int = 1, start_period: Optional[int] = None
) -> None:
    """Block until *container* reports healthy; raise TimeoutError otherwise."""

    def _latest_log():
        # Most recent healthcheck log entry, if Docker has recorded any.
        entries = container.attrs.get("State", {}).get("Health", {}).get("Log")
        return entries[-1] if entries else None

    if start_period:
        time.sleep(start_period)
    status = container.health
    attempt = 0
    while status != "healthy" and attempt < retries:
        attempt += 1
        print(
            f"attempt {attempt} - waiting for docker container "
            f"to be healthy: {status} latest log: {_latest_log()}"
        )
        time.sleep(interval)
        container.reload()
        status = container.health
    if status != "healthy":
        raise TimeoutError(f"Docker container never came up healthy: {_latest_log()}")
107
-
108
-
109
@contextmanager
def container_context(
    image: str,
    ports: dict,
    environment: Optional[dict] = None,
    volumes: Optional[dict] = None,
    healthcheck: Optional[HealthCheck] = None,
    healthcheck_retries: int = 30,
    docker_client: Optional[docker.DockerClient] = None,
    name: Optional[str] = None,
):
    """Run *image* as a detached container for the duration of the context,
    waiting on its healthcheck (if any) and guaranteeing teardown on exit."""
    client = docker_client or docker.from_env()
    print(f"pulling image {image}")
    client.images.pull(image)
    container: Optional[Container] = None
    try:
        container = get_container(
            docker_client=client,
            image=image,
            ports=ports,
            environment=environment,
            volumes=volumes,
            healthcheck=healthcheck,
            name=name,
        )
        effective = get_healthcheck(container)
        if effective:
            # Mirror whatever healthcheck config set on container
            healthcheck_wait(
                container=container,
                retries=healthcheck_retries,
                start_period=effective.start_period,
                interval=effective.interval,
            )
        yield container
    except AssertionError as e:
        # Dump container logs so a failed test assertion is debuggable.
        if container:
            print(container.logs().decode("utf-8"))
        raise e
    finally:
        if container:
            container.kill()
            container.remove()
@@ -1,59 +0,0 @@
1
- import subprocess
2
- from contextlib import contextmanager
3
- from pathlib import Path
4
-
5
-
6
- def docker_compose_down(docker_compose_path: Path):
7
- cmd = f"docker compose -f {docker_compose_path.resolve()} down --remove-orphans -v --rmi all"
8
- print(f"Running command: {cmd}")
9
- final_resp = subprocess.run(
10
- cmd,
11
- shell=True,
12
- capture_output=True,
13
- )
14
- if final_resp.returncode != 0:
15
- print("STDOUT: {}".format(final_resp.stdout.decode("utf-8")))
16
- print("STDERR: {}".format(final_resp.stderr.decode("utf-8")))
17
-
18
-
19
- def run_cleanup(docker_compose_path: Path):
20
- docker_compose_down(docker_compose_path=docker_compose_path)
21
-
22
-
23
- @contextmanager
24
- def docker_compose_context(docker_compose_path: Path):
25
- # Dynamically run a specific docker compose file and make sure it gets cleanup by
26
- # by leveraging a context manager. Uses subprocess to map docker compose commands
27
- # to the underlying shell.
28
- assert docker_compose_path.exists()
29
- if docker_compose_path.is_dir():
30
- if (docker_compose_path / "docker-compose.yml").exists():
31
- docker_compose_path = docker_compose_path / "docker-compose.yml"
32
- elif (docker_compose_path / "docker-compose.yaml").exists():
33
- docker_compose_path = docker_compose_path / "docker-compose.yaml"
34
- assert docker_compose_path.is_file()
35
- resp = None
36
- try:
37
- cmd = f"docker compose -f {docker_compose_path.resolve()} up -d --wait"
38
- print(f"Running command: {cmd}")
39
- resp = subprocess.run(
40
- cmd,
41
- shell=True,
42
- capture_output=True,
43
- )
44
- # Return code from docker compose using --wait can be 1 even if no error
45
- yield
46
- except Exception as e:
47
- if resp:
48
- print("STDOUT: {}".format(resp.stdout.decode("utf-8")))
49
- print("STDERR: {}".format(resp.stderr.decode("utf-8")))
50
- cmd = f"docker compose -f {docker_compose_path.resolve()} logs"
51
- logs = subprocess.run(
52
- cmd,
53
- shell=True,
54
- capture_output=True,
55
- )
56
- print("DOCKER LOGS: {}".format(logs.stdout.decode("utf-8")))
57
- raise e
58
- finally:
59
- run_cleanup(docker_compose_path=docker_compose_path)
File without changes