unstructured-ingest 0.7.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/embed/mixedbreadai.py +0 -1
- unstructured_ingest/interfaces/upload_stager.py +2 -2
- unstructured_ingest/interfaces/uploader.py +3 -3
- unstructured_ingest/logger.py +2 -93
- unstructured_ingest/main.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/processes/chunker.py +4 -0
- unstructured_ingest/processes/connectors/airtable.py +4 -2
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +10 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +2 -2
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/confluence.py +0 -1
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
- unstructured_ingest/processes/connectors/delta_table.py +1 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
- unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -2
- unstructured_ingest/processes/connectors/google_drive.py +0 -2
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
- unstructured_ingest/processes/connectors/kdbai.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +1 -2
- unstructured_ingest/processes/connectors/pinecone.py +0 -1
- unstructured_ingest/processes/connectors/redisdb.py +28 -24
- unstructured_ingest/processes/connectors/salesforce.py +1 -1
- unstructured_ingest/processes/connectors/slack.py +1 -2
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
- unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
- unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
- unstructured_ingest/processes/connectors/sql/sql.py +3 -4
- unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
- unstructured_ingest/processes/connectors/vectara.py +0 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
- unstructured_ingest/processes/embedder.py +2 -2
- unstructured_ingest/processes/filter.py +1 -1
- unstructured_ingest/processes/partitioner.py +4 -0
- unstructured_ingest/processes/utils/blob_storage.py +2 -2
- unstructured_ingest/unstructured_api.py +13 -8
- unstructured_ingest/utils/data_prep.py +8 -32
- unstructured_ingest/utils/string_and_date_utils.py +3 -3
- unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +54 -187
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
- examples/__init__.py +0 -0
- examples/airtable.py +0 -44
- examples/azure_cognitive_search.py +0 -55
- examples/chroma.py +0 -54
- examples/couchbase.py +0 -55
- examples/databricks_volumes_dest.py +0 -55
- examples/databricks_volumes_source.py +0 -53
- examples/delta_table.py +0 -45
- examples/discord_example.py +0 -36
- examples/elasticsearch.py +0 -49
- examples/google_drive.py +0 -45
- examples/kdbai.py +0 -54
- examples/local.py +0 -36
- examples/milvus.py +0 -44
- examples/mongodb.py +0 -53
- examples/opensearch.py +0 -50
- examples/pinecone.py +0 -57
- examples/s3.py +0 -38
- examples/salesforce.py +0 -44
- examples/sharepoint.py +0 -47
- examples/singlestore.py +0 -49
- examples/sql.py +0 -90
- examples/vectara.py +0 -54
- examples/weaviate.py +0 -44
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +0 -31
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +0 -38
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +0 -273
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +0 -90
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +0 -14
- test/integration/connectors/duckdb/test_duckdb.py +0 -90
- test/integration/connectors/duckdb/test_motherduck.py +0 -95
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +0 -34
- test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
- test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
- test/integration/connectors/sql/test_postgres.py +0 -201
- test/integration/connectors/sql/test_singlestore.py +0 -182
- test/integration/connectors/sql/test_snowflake.py +0 -244
- test/integration/connectors/sql/test_sqlite.py +0 -168
- test/integration/connectors/sql/test_vastdb.py +0 -34
- test/integration/connectors/test_astradb.py +0 -287
- test/integration/connectors/test_azure_ai_search.py +0 -254
- test/integration/connectors/test_chroma.py +0 -136
- test/integration/connectors/test_confluence.py +0 -111
- test/integration/connectors/test_delta_table.py +0 -183
- test/integration/connectors/test_dropbox.py +0 -151
- test/integration/connectors/test_github.py +0 -49
- test/integration/connectors/test_google_drive.py +0 -257
- test/integration/connectors/test_jira.py +0 -67
- test/integration/connectors/test_lancedb.py +0 -247
- test/integration/connectors/test_milvus.py +0 -208
- test/integration/connectors/test_mongodb.py +0 -335
- test/integration/connectors/test_neo4j.py +0 -244
- test/integration/connectors/test_notion.py +0 -152
- test/integration/connectors/test_onedrive.py +0 -163
- test/integration/connectors/test_pinecone.py +0 -387
- test/integration/connectors/test_qdrant.py +0 -216
- test/integration/connectors/test_redis.py +0 -143
- test/integration/connectors/test_s3.py +0 -184
- test/integration/connectors/test_sharepoint.py +0 -222
- test/integration/connectors/test_vectara.py +0 -282
- test/integration/connectors/test_zendesk.py +0 -120
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +0 -13
- test/integration/connectors/utils/docker.py +0 -151
- test/integration/connectors/utils/docker_compose.py +0 -59
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +0 -77
- test/integration/connectors/utils/validation/equality.py +0 -76
- test/integration/connectors/utils/validation/source.py +0 -331
- test/integration/connectors/utils/validation/utils.py +0 -36
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +0 -15
- test/integration/connectors/weaviate/test_cloud.py +0 -39
- test/integration/connectors/weaviate/test_local.py +0 -152
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +0 -13
- test/integration/embedders/test_azure_openai.py +0 -57
- test/integration/embedders/test_bedrock.py +0 -103
- test/integration/embedders/test_huggingface.py +0 -24
- test/integration/embedders/test_mixedbread.py +0 -71
- test/integration/embedders/test_octoai.py +0 -75
- test/integration/embedders/test_openai.py +0 -74
- test/integration/embedders/test_togetherai.py +0 -71
- test/integration/embedders/test_vertexai.py +0 -63
- test/integration/embedders/test_voyageai.py +0 -79
- test/integration/embedders/utils.py +0 -66
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +0 -76
- test/integration/utils.py +0 -15
- test/unit/__init__.py +0 -0
- test/unit/chunkers/__init__.py +0 -0
- test/unit/chunkers/test_chunkers.py +0 -49
- test/unit/connectors/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
- test/unit/connectors/motherduck/__init__.py +0 -0
- test/unit/connectors/motherduck/test_base.py +0 -73
- test/unit/connectors/sql/__init__.py +0 -0
- test/unit/connectors/sql/test_sql.py +0 -152
- test/unit/connectors/test_confluence.py +0 -71
- test/unit/connectors/test_jira.py +0 -401
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +0 -42
- test/unit/embed/test_octoai.py +0 -27
- test/unit/embed/test_openai.py +0 -28
- test/unit/embed/test_vertexai.py +0 -25
- test/unit/embed/test_voyageai.py +0 -24
- test/unit/embedders/__init__.py +0 -0
- test/unit/embedders/test_bedrock.py +0 -36
- test/unit/embedders/test_huggingface.py +0 -48
- test/unit/embedders/test_mixedbread.py +0 -37
- test/unit/embedders/test_octoai.py +0 -35
- test/unit/embedders/test_openai.py +0 -35
- test/unit/embedders/test_togetherai.py +0 -37
- test/unit/embedders/test_vertexai.py +0 -37
- test/unit/embedders/test_voyageai.py +0 -38
- test/unit/partitioners/__init__.py +0 -0
- test/unit/partitioners/test_partitioner.py +0 -63
- test/unit/test_error.py +0 -27
- test/unit/test_html.py +0 -112
- test/unit/test_interfaces.py +0 -26
- test/unit/test_logger.py +0 -78
- test/unit/test_utils.py +0 -220
- test/unit/utils/__init__.py +0 -0
- test/unit/utils/data_generator.py +0 -32
- unstructured_ingest-0.7.1.dist-info/METADATA +0 -383
- unstructured_ingest-0.7.1.dist-info/top_level.txt +0 -3
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
|
@@ -1,282 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import os
|
|
3
|
-
import time
|
|
4
|
-
from functools import lru_cache
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
from typing import Generator
|
|
7
|
-
from uuid import uuid4
|
|
8
|
-
|
|
9
|
-
import pytest
|
|
10
|
-
import requests
|
|
11
|
-
|
|
12
|
-
from test.integration.connectors.utils.constants import DESTINATION_TAG, NOSQL_TAG
|
|
13
|
-
from test.integration.utils import requires_env
|
|
14
|
-
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
15
|
-
from unstructured_ingest.logger import logger
|
|
16
|
-
from unstructured_ingest.processes.connectors.vectara import (
|
|
17
|
-
CONNECTOR_TYPE as VECTARA_CONNECTOR_TYPE,
|
|
18
|
-
)
|
|
19
|
-
from unstructured_ingest.processes.connectors.vectara import (
|
|
20
|
-
VectaraAccessConfig,
|
|
21
|
-
VectaraConnectionConfig,
|
|
22
|
-
VectaraUploader,
|
|
23
|
-
VectaraUploaderConfig,
|
|
24
|
-
VectaraUploadStager,
|
|
25
|
-
VectaraUploadStagerConfig,
|
|
26
|
-
)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def validate_upload(document: dict, expected_data: dict):
    """Assert that a fetched document matches the element that was uploaded.

    Only the first entry of ``document["parts"]`` is inspected: its text must equal
    ``expected_data["text"]`` and its metadata must carry the same ``element_id``,
    ``filename``, ``filetype`` and ``page_number`` as the expected element.
    """
    logger.info(f"validating document: {document}")

    # Collect the expected values up front so a malformed fixture fails before any assert.
    expected_metadata = expected_data["metadata"]
    expected_part_metadata = {
        "element_id": expected_data["element_id"],
    }
    expected_text = expected_data["text"]
    for key in ("filename", "filetype", "page_number"):
        expected_part_metadata[key] = expected_metadata[key]

    assert document is not None
    parts = document["parts"]
    assert parts
    leading_part = parts[0]
    assert leading_part["text"] == expected_text

    actual_metadata = leading_part["metadata"]
    assert actual_metadata
    for key, expected_value in expected_part_metadata.items():
        assert actual_metadata[key] == expected_value
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
@requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
@lru_cache()
def _get_jwt_token():
    """Connect to the token endpoint and return an OAuth access token.

    Uses the client-credentials grant with ``VECTARA_OAUTH_CLIENT_ID`` and
    ``VECTARA_OAUTH_SECRET``; the endpoint host is derived from ``VECTARA_CUSTOMER_ID``.
    The returned token is memoised by ``lru_cache`` for the life of the process.

    Raises:
        KeyError: if one of the required environment variables is not set.
        requests.HTTPError: if the token endpoint answers with an error status.
        RuntimeError: if the response carries no ``access_token``.
    """
    customer_id = os.environ["VECTARA_CUSTOMER_ID"]
    token_endpoint = (
        f"https://vectara-prod-{customer_id}.auth.us-west-2.amazoncognito.com/oauth2/token"
    )
    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
    }
    data = {
        "grant_type": "client_credentials",
        "client_id": os.environ["VECTARA_OAUTH_CLIENT_ID"],
        "client_secret": os.environ["VECTARA_OAUTH_SECRET"],
    }

    # requests never times out by default; bound the call so a stalled endpoint
    # cannot hang the whole test session.
    response = requests.post(token_endpoint, headers=headers, data=data, timeout=30)
    response.raise_for_status()

    access_token = response.json().get("access_token")
    if not access_token:
        # Raising (instead of returning None) keeps lru_cache from memoising a
        # missing token and sending "Bearer None" on every later request.
        raise RuntimeError("token endpoint response did not contain an access_token")
    return access_token
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
def list_documents(corpus_key: str) -> list[dict]:
    """Return the ``documents`` entries reported for the given corpus.

    Args:
        corpus_key: key of the corpus whose documents are listed.

    Returns:
        The value of the ``documents`` field of the JSON response, or an empty list
        when the field is absent. Callers in this module index each entry by ``"id"``.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    # NOTE(review): no paging parameters are sent, so presumably only the first page
    # of results is returned - confirm against the API if large corpora are tested.
    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/documents"

    jwt_token = _get_jwt_token()
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": f"Bearer {jwt_token}",
        "X-source": "unstructured",
    }

    # Explicit timeout: requests waits indefinitely otherwise.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    return response.json().get("documents", [])
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
def fetch_document(corpus_key: str, documents_id: str) -> dict:
    """Fetch a single document from a corpus and return the decoded JSON body.

    Args:
        corpus_key: key of the corpus that holds the document.
        documents_id: identifier of the document to retrieve.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/documents/{documents_id}"
    jwt_token = _get_jwt_token()
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": f"Bearer {jwt_token}",
        "X-source": "unstructured",
    }

    # Explicit timeout: requests waits indefinitely otherwise.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    return response.json()
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
def create_corpora(corpus_key: str, corpus_name: str) -> None:
    """Create a corpus with the given key and name (description ``"integration test"``).

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    url = "https://api.vectara.io/v2/corpora"
    payload = {"key": corpus_key, "name": corpus_name, "description": "integration test"}
    jwt_token = _get_jwt_token()
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": f"Bearer {jwt_token}",
        "X-source": "unstructured",
    }

    # ``json=`` lets requests serialise the body; the timeout keeps a stalled API
    # from hanging the fixture forever.
    response = requests.post(url, headers=headers, json=payload, timeout=30)
    response.raise_for_status()
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
def replace_filter_attributes(corpus_key: str) -> None:
    """Replace the corpus filter attributes with a single indexed ``element_id`` text field.

    The attribute is declared at ``part`` level.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/replace_filter_attributes"
    payload = {
        "filter_attributes": [
            {"name": "element_id", "level": "part", "indexed": True, "type": "text"}
        ]
    }
    jwt_token = _get_jwt_token()
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": f"Bearer {jwt_token}",
        "X-source": "unstructured",
    }

    # ``json=`` lets requests serialise the body; the timeout keeps a stalled API
    # from hanging the fixture forever.
    response = requests.post(url, headers=headers, json=payload, timeout=30)
    response.raise_for_status()
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
def delete_corpora(corpus_key: str) -> None:
    """Issue a DELETE for the corpus identified by ``corpus_key``.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    url = f"https://api.vectara.io/v2/corpora/{corpus_key}"

    jwt_token = _get_jwt_token()
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": f"Bearer {jwt_token}",
        "X-source": "unstructured",
    }

    # Explicit timeout: requests waits indefinitely otherwise.
    response = requests.delete(url, headers=headers, timeout=30)
    response.raise_for_status()
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
def get_metadata(corpus_key: str) -> dict:
    """GET the corpus resource and return the decoded JSON body.

    Raises:
        requests.HTTPError: if the API answers with an error status; the polling
            helpers in this module rely on that to detect a missing corpus.
    """
    url = f"https://api.vectara.io/v2/corpora/{corpus_key}"
    jwt_token = _get_jwt_token()
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": f"Bearer {jwt_token}",
        "X-source": "unstructured",
    }
    # Explicit timeout: requests waits indefinitely otherwise.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    return response.json()
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
def wait_for_ready(corpus_key: str, timeout=60, interval=2) -> None:
    """Poll ``get_metadata`` until it succeeds, i.e. the corpus can be retrieved.

    Args:
        corpus_key: key of the corpus to wait for.
        timeout: maximum number of seconds to keep polling.
        interval: seconds to sleep after each failed attempt.

    Raises:
        TimeoutError: if no attempt succeeded within ``timeout`` seconds; the last
            HTTP error, if any, is attached as the cause.
    """
    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + timeout
    last_error = None
    while time.monotonic() < deadline:
        try:
            get_metadata(corpus_key)
        except requests.HTTPError as e:
            last_error = e
            time.sleep(interval)
        else:
            return
    raise TimeoutError("time out waiting for corpus to be ready") from last_error
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
def wait_for_delete(corpus_key: str, timeout=60, interval=2) -> None:
    """Poll ``get_metadata`` until the corpus is reported as not found.

    Args:
        corpus_key: key of the corpus that is being deleted.
        timeout: maximum number of seconds to keep polling.
        interval: seconds to sleep between attempts.

    Raises:
        TimeoutError: if the corpus was still retrievable (or its state could not be
            determined) when ``timeout`` seconds had elapsed.
    """
    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            get_metadata(corpus_key)
        except requests.HTTPError as e:
            # Only "not found" proves the corpus is gone. Auth failures or server
            # errors say nothing about deletion, so keep polling on those.
            if getattr(e.response, "status_code", None) == 404:
                return
        time.sleep(interval)
    raise TimeoutError("time out waiting for corpus to delete")
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
@pytest.fixture
def corpora_util() -> Generator[str, None, None]:
    """Create a throw-away corpus for one test and delete it afterwards.

    Yields:
        The key of the freshly created corpus (``ingest-test-<random>``).

    Setup failures are logged and re-raised so pytest reports the real cause. The
    corpus is deleted on teardown only when its creation succeeded.
    """
    random_id = str(uuid4()).split("-")[0]
    corpus_key = f"ingest-test-{random_id}"
    corpus_name = "ingest-test"
    logger.info(f"Creating corpus with key: {corpus_key}")
    try:
        create_corpora(corpus_key, corpus_name)
    except Exception as e:
        # Nothing was created, so there is nothing to clean up. Re-raise instead of
        # swallowing: a generator fixture that never yields only produces a
        # confusing secondary error.
        logger.error(f"failed to create corpus {corpus_key}: {e}")
        raise

    try:
        replace_filter_attributes(corpus_key)
        wait_for_ready(corpus_key=corpus_key)
        yield corpus_key
    finally:
        # The corpus exists from here on, so always remove it - even if the
        # remaining setup steps failed.
        logger.info(f"deleting corpus: {corpus_key}")
        delete_corpora(corpus_key)
        wait_for_delete(corpus_key=corpus_key)
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
def wait_for_doc_meta(corpus_key: str, timeout=60, interval=1) -> list[dict]:
    """Poll ``list_documents`` until the corpus reports at least one document.

    Args:
        corpus_key: key of the corpus to inspect.
        timeout: maximum number of seconds to keep polling.
        interval: seconds to sleep after each empty listing.

    Returns:
        The first non-empty listing returned by ``list_documents``.

    Raises:
        TimeoutError: if every listing was empty for ``timeout`` seconds.
    """
    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        all_document_meta = list_documents(corpus_key)
        if all_document_meta:
            return all_document_meta
        time.sleep(interval)
    raise TimeoutError("time out waiting for document to be ready")
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
@pytest.mark.asyncio
@pytest.mark.tags(VECTARA_CONNECTOR_TYPE, DESTINATION_TAG, "vectara", NOSQL_TAG)
@requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
async def test_vectara_destination(
    upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=1
):
    """Stage and upload ``upload_file`` to a temporary corpus, then verify the result.

    The test stages the elements file, uploads the staged elements, waits until the
    corpus lists a document, and checks that exactly one document exists whose first
    part matches the first element of the original file.

    ``retries`` and ``interval`` are kept for signature compatibility; the body does
    not read them.
    """
    corpus_key = corpora_util
    connection_kwargs = {
        "customer_id": os.environ["VECTARA_CUSTOMER_ID"],
        "corpus_key": corpus_key,
    }

    oauth_client_id = os.environ["VECTARA_OAUTH_CLIENT_ID"]
    oauth_secret = os.environ["VECTARA_OAUTH_SECRET"]

    file_data = FileData(
        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
        connector_type=VECTARA_CONNECTOR_TYPE,
        identifier="mock-file-data",
    )

    stager_config = VectaraUploadStagerConfig()
    stager = VectaraUploadStager(upload_stager_config=stager_config)
    new_upload_file = stager.run(
        elements_filepath=upload_file,
        output_dir=tmp_path,
        output_filename=upload_file.name,
        file_data=file_data,
    )

    uploader = VectaraUploader(
        connection_config=VectaraConnectionConfig(
            **connection_kwargs,
            access_config=VectaraAccessConfig(
                oauth_client_id=oauth_client_id, oauth_secret=oauth_secret
            ),
        ),
        upload_config=VectaraUploaderConfig(),
    )

    with new_upload_file.open() as new_upload_fp:
        elements_stager = json.load(new_upload_fp)

    if uploader.is_async():
        await uploader.run_data_async(data=elements_stager, file_data=file_data)
    else:
        # Without this branch a synchronous uploader would upload nothing and the
        # test would only fail after the document poll below times out.
        # NOTE(review): assumes the uploader exposes a sync ``run_data`` with the
        # same keyword arguments as ``run_data_async`` - confirm against the interface.
        uploader.run_data(data=elements_stager, file_data=file_data)

    with upload_file.open() as upload_fp:
        elements = json.load(upload_fp)
    first_element = elements[0]

    all_document_meta = wait_for_doc_meta(corpus_key)
    assert len(all_document_meta) == 1
    document_meta = all_document_meta[0]
    document = fetch_document(corpus_key=corpus_key, documents_id=document_meta["id"])
    validate_upload(document=document, expected_data=first_element)
|
|
@@ -1,120 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
from pathlib import Path
|
|
3
|
-
|
|
4
|
-
import pytest
|
|
5
|
-
|
|
6
|
-
from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
|
|
7
|
-
from test.integration.connectors.utils.validation.source import (
|
|
8
|
-
SourceValidationConfigs,
|
|
9
|
-
source_connector_validation,
|
|
10
|
-
)
|
|
11
|
-
from test.integration.utils import requires_env
|
|
12
|
-
from unstructured_ingest.errors_v2 import UserAuthError
|
|
13
|
-
from unstructured_ingest.processes.connectors.zendesk.zendesk import (
|
|
14
|
-
CONNECTOR_TYPE,
|
|
15
|
-
ZendeskAccessConfig,
|
|
16
|
-
ZendeskConnectionConfig,
|
|
17
|
-
ZendeskDownloader,
|
|
18
|
-
ZendeskDownloaderConfig,
|
|
19
|
-
ZendeskIndexer,
|
|
20
|
-
ZendeskIndexerConfig,
|
|
21
|
-
)
|
|
22
|
-
|
|
23
|
-
# Connection details shared by every Zendesk test in this module; the API token
# itself is supplied separately by each test.
SUBDOMAIN = "unstructuredhelp"
EMAIL = "test@unstructured.io"
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
@pytest.mark.asyncio
@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
@requires_env("ZENDESK_TOKEN")
async def test_zendesk_source_tickets(temp_dir: Path):
    """Run the shared source-connector validation against Zendesk tickets.

    Expects 8 files; downloaded files are validated, file data is not.
    """
    connection_config = ZendeskConnectionConfig(
        subdomain=SUBDOMAIN,
        email=EMAIL,
        access_config=ZendeskAccessConfig(api_token=os.environ["ZENDESK_TOKEN"]),
    )

    # Indexer restricted to ticket items.
    ticket_indexer = ZendeskIndexer(
        connection_config=connection_config,
        index_config=ZendeskIndexerConfig(item_type="tickets"),
        connector_type=CONNECTOR_TYPE,
    )

    # Downloader writing into the per-test temporary directory.
    ticket_downloader = ZendeskDownloader(
        connection_config=connection_config,
        download_config=ZendeskDownloaderConfig(download_dir=temp_dir),
        connector_type=CONNECTOR_TYPE,
    )

    validation_configs = SourceValidationConfigs(
        test_id="zendesk-tickets",
        expected_num_files=8,
        validate_file_data=False,
        validate_downloaded_files=True,
    )
    await source_connector_validation(
        indexer=ticket_indexer,
        downloader=ticket_downloader,
        configs=validation_configs,
    )
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
@pytest.mark.asyncio
@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
@requires_env("ZENDESK_TOKEN")
async def test_zendesk_source_articles(temp_dir):
    """Run the shared source-connector validation against Zendesk articles.

    Expects 8 files; both file data and downloaded files are validated, and the
    downloader is configured with ``extract_images=True``.
    """
    connection_config = ZendeskConnectionConfig(
        subdomain=SUBDOMAIN,
        email=EMAIL,
        access_config=ZendeskAccessConfig(api_token=os.environ["ZENDESK_TOKEN"]),
    )

    # Indexer restricted to article items.
    article_indexer = ZendeskIndexer(
        connection_config=connection_config,
        index_config=ZendeskIndexerConfig(item_type="articles"),
        connector_type=CONNECTOR_TYPE,
    )

    # Downloader writing into the per-test temporary directory.
    article_downloader = ZendeskDownloader(
        connection_config=connection_config,
        download_config=ZendeskDownloaderConfig(download_dir=temp_dir, extract_images=True),
        connector_type=CONNECTOR_TYPE,
    )

    validation_configs = SourceValidationConfigs(
        test_id="zendesk-articles",
        expected_num_files=8,
        validate_file_data=True,
        validate_downloaded_files=True,
    )
    await source_connector_validation(
        indexer=article_indexer,
        downloader=article_downloader,
        configs=validation_configs,
    )
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
def test_zendesk_source_articles_fail(temp_dir):
    """The indexer precheck must raise ``UserAuthError`` for a bogus API token.

    Note that, despite the test name, the indexer is configured for ``"tickets"``.
    """
    bogus_connection = ZendeskConnectionConfig(
        subdomain=SUBDOMAIN,
        email=EMAIL,
        access_config=ZendeskAccessConfig(api_token="FAKE_TOKEN"),
    )
    indexer = ZendeskIndexer(
        connection_config=bogus_connection,
        index_config=ZendeskIndexerConfig(item_type="tickets"),
        connector_type=CONNECTOR_TYPE,
    )
    with pytest.raises(expected_exception=UserAuthError):
        indexer.precheck()
|
|
File without changes
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
|
|
3
|
-
# Tag names passed to ``pytest.mark.tags`` by the connector integration tests.
SOURCE_TAG = "source"
DESTINATION_TAG = "destination"
BLOB_STORAGE_TAG = "blob_storage"
SQL_TAG = "sql"
NOSQL_TAG = "nosql"
VECTOR_DB_TAG = "vector_db"
GRAPH_DB_TAG = "graph_db"
UNCATEGORIZED_TAG = "uncategorized"

# Both directories are resolved relative to the parent of this module's directory.
_connectors_test_root = Path(__file__).parents[1]
env_setup_path = _connectors_test_root / "env_setup"
expected_results_path = _connectors_test_root / "expected_results"
|
|
@@ -1,151 +0,0 @@
|
|
|
1
|
-
import time
|
|
2
|
-
from contextlib import contextmanager
|
|
3
|
-
from typing import Optional, Union
|
|
4
|
-
|
|
5
|
-
import docker
|
|
6
|
-
from docker.models.containers import Container
|
|
7
|
-
from pydantic import BaseModel, Field, field_serializer
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
class HealthCheck(BaseModel):
    """Container health-check settings, expressed in whole seconds.

    On serialisation (``model_dump``) the ``interval``, ``timeout`` and
    ``start_period`` fields are converted from seconds to nanoseconds.
    """

    # Health-check command, either as a single string or as an argument list.
    test: Union[str, list[str]]
    interval: int = Field(
        gt=0, default=30, description="The time to wait between checks in seconds."
    )
    timeout: int = Field(
        gt=0, default=30, description="The time to wait before considering the check to have hung."
    )
    retries: int = Field(
        gt=0,
        default=3,
        description="The number of consecutive failures needed "
        "to consider a container as unhealthy.",
    )
    # ``ge=0`` (not ``gt=0``): the default of 0 must itself be an acceptable value,
    # otherwise passing the default explicitly fails validation.
    start_period: int = Field(
        ge=0,
        default=0,
        description="Start period for the container to initialize before starting health-retries countdown in seconds.",  # noqa: E501
    )

    @field_serializer("interval")
    def serialize_interval(self, interval: int) -> int:
        """Convert the interval from seconds to nanoseconds."""
        return interval * 1_000_000_000

    @field_serializer("timeout")
    def serialize_timeout(self, timeout: int) -> int:
        """Convert the timeout from seconds to nanoseconds."""
        return timeout * 1_000_000_000

    @field_serializer("start_period")
    def serialize_start_period(self, start_period: int) -> int:
        """Convert the start period from seconds to nanoseconds."""
        return start_period * 1_000_000_000
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def get_container(
    docker_client: docker.DockerClient,
    image: str,
    ports: dict,
    name: Optional[str] = None,
    environment: Optional[dict] = None,
    volumes: Optional[dict] = None,
    healthcheck: Optional[HealthCheck] = None,
) -> Container:
    """Start a detached container from ``image`` and return it.

    ``image``, ``ports`` and ``detach=True`` are always passed to
    ``docker_client.containers.run``; ``environment``, ``volumes``, the dumped
    ``healthcheck`` and ``name`` are added only when they are truthy.
    """
    optional_kwargs = {
        "environment": environment,
        "volumes": volumes,
        "healthcheck": healthcheck.model_dump() if healthcheck else None,
        "name": name,
    }
    run_kwargs = {"image": image, "detach": True, "ports": ports}
    # Drop anything the caller left unset so the docker defaults apply.
    run_kwargs.update({key: value for key, value in optional_kwargs.items() if value})
    started: Container = docker_client.containers.run(**run_kwargs)
    return started
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
def get_healthcheck(container: Container) -> Optional[HealthCheck]:
    """Build a ``HealthCheck`` from the health-check config stored on the container.

    Reads ``container.attrs["Config"]["Healthcheck"]``. Duration entries
    (``Interval``, ``Timeout``, ``StartPeriod``) are converted from nanoseconds to
    whole seconds; ``Retries`` is copied as is. Entries that are absent or zero are
    left out so the model defaults apply.

    Returns:
        The populated ``HealthCheck``, or ``None`` when the container has no
        health-check configuration.
    """
    healthcheck_config = container.attrs.get("Config", {}).get("Healthcheck", None)
    if not healthcheck_config:
        return None
    healthcheck_data = {
        "test": healthcheck_config["Test"],
    }
    duration_fields = (
        ("interval", "Interval"),
        ("timeout", "Timeout"),
        ("start_period", "StartPeriod"),
    )
    for field_name, docker_key in duration_fields:
        if nanoseconds := healthcheck_config.get(docker_key):
            # Integer division keeps the value valid for the model's int fields;
            # sub-second durations are rounded up to one second rather than
            # producing a fractional (or zero) value that validation would reject.
            healthcheck_data[field_name] = max(1, int(nanoseconds) // 1_000_000_000)
    if retries := healthcheck_config.get("Retries"):
        healthcheck_data["retries"] = retries
    return HealthCheck.model_validate(healthcheck_data)
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
def healthcheck_wait(
    container: "Container",
    retries: int = 30,
    interval: int = 1,
    start_period: Optional[int] = None,
) -> None:
    """Block until ``container.health`` reports ``"healthy"``.

    Args:
        container: container to watch; it is reloaded after every wait.
        retries: maximum number of reload attempts.
        interval: seconds to sleep before each reload.
        start_period: optional number of seconds to sleep before the first check.

    Raises:
        TimeoutError: if the container is still not healthy after ``retries``
            attempts; the message includes the latest health-check log entry.
    """
    if start_period:
        time.sleep(start_period)
    health = container.health
    tries = 0
    while health != "healthy" and tries < retries:
        tries += 1
        print(
            f"attempt {tries} - waiting for docker container "
            f"to be healthy: {health} latest log: {_latest_health_log(container)}"
        )
        time.sleep(interval)
        # ``health`` is derived from cached attributes; refresh them before re-checking.
        container.reload()
        health = container.health
    if health != "healthy":
        raise TimeoutError(
            f"Docker container never came up healthy: {_latest_health_log(container)}"
        )


def _latest_health_log(container: "Container"):
    """Return the newest entry of the container's health-check log, or ``None``."""
    logs = container.attrs.get("State", {}).get("Health", {}).get("Log")
    return logs[-1] if logs else None
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
@contextmanager
def container_context(
    image: str,
    ports: dict,
    environment: Optional[dict] = None,
    volumes: Optional[dict] = None,
    healthcheck: Optional[HealthCheck] = None,
    healthcheck_retries: int = 30,
    docker_client: Optional[docker.DockerClient] = None,
    name: Optional[str] = None,
):
    """Run a container for the duration of a ``with`` block and remove it afterwards.

    The image is pulled, a detached container is started, and - when the container
    carries a health-check configuration - the call blocks until it reports healthy.
    The started container is yielded to the caller.

    If startup, the health wait or the ``with`` body raises, the container logs are
    printed before the exception propagates. The container is always removed on exit.

    Args:
        image: image reference to pull and run.
        ports: port mapping passed through to ``get_container``.
        environment: optional environment variables for the container.
        volumes: optional volume mapping for the container.
        healthcheck: optional health-check settings to apply to the container.
        healthcheck_retries: maximum number of health polls before giving up.
        docker_client: client to use; defaults to ``docker.from_env()``.
        name: optional container name.
    """
    docker_client = docker_client or docker.from_env()
    print(f"pulling image {image}")
    docker_client.images.pull(image)
    container: Optional[Container] = None
    try:
        container = get_container(
            docker_client=docker_client,
            image=image,
            ports=ports,
            environment=environment,
            volumes=volumes,
            healthcheck=healthcheck,
            name=name,
        )
        if healthcheck_data := get_healthcheck(container):
            # Mirror whatever healthcheck config set on container
            healthcheck_wait(
                container=container,
                retries=healthcheck_retries,
                start_period=healthcheck_data.start_period,
                interval=healthcheck_data.interval,
            )
        yield container
    except Exception:
        # Dump the logs for every failure, not only assertion errors, so that a
        # container that never became healthy can be diagnosed too.
        if container:
            logs = container.logs()
            print(logs.decode("utf-8"))
        raise
    finally:
        if container:
            # ``kill()`` raises when the container has already exited, which would
            # skip the removal; a forced remove covers running and stopped alike.
            container.remove(force=True)
|
|
@@ -1,59 +0,0 @@
|
|
|
1
|
-
import subprocess
|
|
2
|
-
from contextlib import contextmanager
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def docker_compose_down(docker_compose_path: Path):
    """Run ``docker compose down`` for the given compose file.

    Orphan containers, volumes and all images used by the project are removed as
    well. This is best-effort cleanup: a failing command is reported on stdout and
    never raises.

    Args:
        docker_compose_path: path to the compose file to tear down.
    """
    # Argument list instead of a shell string, so paths containing spaces or shell
    # metacharacters are passed through verbatim.
    cmd = [
        "docker",
        "compose",
        "-f",
        str(docker_compose_path.resolve()),
        "down",
        "--remove-orphans",
        "-v",
        "--rmi",
        "all",
    ]
    printable_cmd = " ".join(cmd)
    print(f"Running command: {printable_cmd}")
    try:
        final_resp = subprocess.run(cmd, capture_output=True)
    except OSError as e:
        # With a shell a missing executable surfaced as a non-zero return code;
        # keep the cleanup non-fatal in that case too.
        print(f"STDERR: {e}")
        return
    if final_resp.returncode != 0:
        print("STDOUT: {}".format(final_resp.stdout.decode("utf-8")))
        print("STDERR: {}".format(final_resp.stderr.decode("utf-8")))
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
def run_cleanup(docker_compose_path: Path):
    """Clean up after a compose run by delegating to ``docker_compose_down``."""
    compose_file = docker_compose_path
    docker_compose_down(docker_compose_path=compose_file)
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
@contextmanager
def docker_compose_context(docker_compose_path: Path):
    """Bring a docker compose project up for the duration of a ``with`` block.

    ``docker_compose_path`` may be a compose file or a directory containing
    ``docker-compose.yml`` / ``docker-compose.yaml``. The project is started with
    ``up -d --wait`` and always torn down via ``run_cleanup`` on exit. If anything
    raises, the captured output of the ``up`` command and the compose logs are
    printed before the exception propagates.

    Raises:
        AssertionError: if the path does not exist or no compose file can be found.
    """
    assert docker_compose_path.exists(), f"path does not exist: {docker_compose_path}"
    if docker_compose_path.is_dir():
        for candidate in ("docker-compose.yml", "docker-compose.yaml"):
            if (docker_compose_path / candidate).exists():
                docker_compose_path = docker_compose_path / candidate
                break
    assert docker_compose_path.is_file(), f"no compose file found at: {docker_compose_path}"

    # Argument lists instead of shell strings, so paths containing spaces or shell
    # metacharacters are passed through verbatim.
    compose_base = ["docker", "compose", "-f", str(docker_compose_path.resolve())]
    resp = None
    try:
        cmd = [*compose_base, "up", "-d", "--wait"]
        printable_cmd = " ".join(cmd)
        print(f"Running command: {printable_cmd}")
        resp = subprocess.run(cmd, capture_output=True)
        # Return code from docker compose using --wait can be 1 even if no error
        yield
    except Exception:
        if resp is not None:
            print("STDOUT: {}".format(resp.stdout.decode("utf-8")))
            print("STDERR: {}".format(resp.stderr.decode("utf-8")))
        try:
            logs = subprocess.run([*compose_base, "logs"], capture_output=True)
        except OSError as log_error:
            # Collecting logs is diagnostic only; never let it mask the real failure.
            print(f"DOCKER LOGS: unavailable ({log_error})")
        else:
            print("DOCKER LOGS: {}".format(logs.stdout.decode("utf-8")))
        raise
    finally:
        run_cleanup(docker_compose_path=docker_compose_path)
|
|
File without changes
|