unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/embed/mixedbreadai.py +0 -1
- unstructured_ingest/interfaces/upload_stager.py +2 -2
- unstructured_ingest/interfaces/uploader.py +3 -3
- unstructured_ingest/main.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/processes/chunker.py +4 -0
- unstructured_ingest/processes/connectors/airtable.py +4 -2
- unstructured_ingest/processes/connectors/astradb.py +48 -34
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/confluence.py +0 -1
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
- unstructured_ingest/processes/connectors/delta_table.py +1 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
- unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -2
- unstructured_ingest/processes/connectors/google_drive.py +0 -2
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
- unstructured_ingest/processes/connectors/kdbai.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +1 -2
- unstructured_ingest/processes/connectors/pinecone.py +0 -1
- unstructured_ingest/processes/connectors/redisdb.py +28 -24
- unstructured_ingest/processes/connectors/salesforce.py +1 -1
- unstructured_ingest/processes/connectors/slack.py +1 -2
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
- unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
- unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
- unstructured_ingest/processes/connectors/sql/sql.py +3 -4
- unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
- unstructured_ingest/processes/connectors/vectara.py +0 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
- unstructured_ingest/processes/embedder.py +2 -2
- unstructured_ingest/processes/filter.py +1 -1
- unstructured_ingest/processes/partitioner.py +4 -0
- unstructured_ingest/processes/utils/blob_storage.py +2 -2
- unstructured_ingest/unstructured_api.py +13 -8
- unstructured_ingest/utils/data_prep.py +8 -32
- unstructured_ingest-1.0.2.dist-info/METADATA +226 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/RECORD +50 -184
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/WHEEL +1 -2
- examples/__init__.py +0 -0
- examples/airtable.py +0 -44
- examples/azure_cognitive_search.py +0 -55
- examples/chroma.py +0 -54
- examples/couchbase.py +0 -55
- examples/databricks_volumes_dest.py +0 -55
- examples/databricks_volumes_source.py +0 -53
- examples/delta_table.py +0 -45
- examples/discord_example.py +0 -36
- examples/elasticsearch.py +0 -49
- examples/google_drive.py +0 -45
- examples/kdbai.py +0 -54
- examples/local.py +0 -36
- examples/milvus.py +0 -44
- examples/mongodb.py +0 -53
- examples/opensearch.py +0 -50
- examples/pinecone.py +0 -57
- examples/s3.py +0 -38
- examples/salesforce.py +0 -44
- examples/sharepoint.py +0 -47
- examples/singlestore.py +0 -49
- examples/sql.py +0 -90
- examples/vectara.py +0 -54
- examples/weaviate.py +0 -44
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +0 -31
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +0 -38
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +0 -273
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +0 -90
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +0 -14
- test/integration/connectors/duckdb/test_duckdb.py +0 -90
- test/integration/connectors/duckdb/test_motherduck.py +0 -95
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +0 -34
- test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
- test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
- test/integration/connectors/sql/test_postgres.py +0 -201
- test/integration/connectors/sql/test_singlestore.py +0 -182
- test/integration/connectors/sql/test_snowflake.py +0 -244
- test/integration/connectors/sql/test_sqlite.py +0 -168
- test/integration/connectors/sql/test_vastdb.py +0 -34
- test/integration/connectors/test_astradb.py +0 -287
- test/integration/connectors/test_azure_ai_search.py +0 -254
- test/integration/connectors/test_chroma.py +0 -136
- test/integration/connectors/test_confluence.py +0 -111
- test/integration/connectors/test_delta_table.py +0 -183
- test/integration/connectors/test_dropbox.py +0 -151
- test/integration/connectors/test_github.py +0 -49
- test/integration/connectors/test_google_drive.py +0 -257
- test/integration/connectors/test_jira.py +0 -67
- test/integration/connectors/test_lancedb.py +0 -247
- test/integration/connectors/test_milvus.py +0 -208
- test/integration/connectors/test_mongodb.py +0 -335
- test/integration/connectors/test_neo4j.py +0 -244
- test/integration/connectors/test_notion.py +0 -152
- test/integration/connectors/test_onedrive.py +0 -163
- test/integration/connectors/test_pinecone.py +0 -387
- test/integration/connectors/test_qdrant.py +0 -216
- test/integration/connectors/test_redis.py +0 -143
- test/integration/connectors/test_s3.py +0 -184
- test/integration/connectors/test_sharepoint.py +0 -222
- test/integration/connectors/test_vectara.py +0 -282
- test/integration/connectors/test_zendesk.py +0 -120
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +0 -13
- test/integration/connectors/utils/docker.py +0 -151
- test/integration/connectors/utils/docker_compose.py +0 -59
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +0 -77
- test/integration/connectors/utils/validation/equality.py +0 -76
- test/integration/connectors/utils/validation/source.py +0 -331
- test/integration/connectors/utils/validation/utils.py +0 -36
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +0 -15
- test/integration/connectors/weaviate/test_cloud.py +0 -39
- test/integration/connectors/weaviate/test_local.py +0 -152
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +0 -13
- test/integration/embedders/test_azure_openai.py +0 -57
- test/integration/embedders/test_bedrock.py +0 -103
- test/integration/embedders/test_huggingface.py +0 -24
- test/integration/embedders/test_mixedbread.py +0 -71
- test/integration/embedders/test_octoai.py +0 -75
- test/integration/embedders/test_openai.py +0 -74
- test/integration/embedders/test_togetherai.py +0 -71
- test/integration/embedders/test_vertexai.py +0 -63
- test/integration/embedders/test_voyageai.py +0 -79
- test/integration/embedders/utils.py +0 -66
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +0 -76
- test/integration/utils.py +0 -15
- test/unit/__init__.py +0 -0
- test/unit/chunkers/__init__.py +0 -0
- test/unit/chunkers/test_chunkers.py +0 -49
- test/unit/connectors/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
- test/unit/connectors/motherduck/__init__.py +0 -0
- test/unit/connectors/motherduck/test_base.py +0 -73
- test/unit/connectors/sql/__init__.py +0 -0
- test/unit/connectors/sql/test_sql.py +0 -152
- test/unit/connectors/test_confluence.py +0 -71
- test/unit/connectors/test_jira.py +0 -401
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +0 -42
- test/unit/embed/test_octoai.py +0 -27
- test/unit/embed/test_openai.py +0 -28
- test/unit/embed/test_vertexai.py +0 -25
- test/unit/embed/test_voyageai.py +0 -24
- test/unit/embedders/__init__.py +0 -0
- test/unit/embedders/test_bedrock.py +0 -36
- test/unit/embedders/test_huggingface.py +0 -48
- test/unit/embedders/test_mixedbread.py +0 -37
- test/unit/embedders/test_octoai.py +0 -35
- test/unit/embedders/test_openai.py +0 -35
- test/unit/embedders/test_togetherai.py +0 -37
- test/unit/embedders/test_vertexai.py +0 -37
- test/unit/embedders/test_voyageai.py +0 -38
- test/unit/partitioners/__init__.py +0 -0
- test/unit/partitioners/test_partitioner.py +0 -63
- test/unit/test_error.py +0 -27
- test/unit/test_html.py +0 -112
- test/unit/test_interfaces.py +0 -26
- test/unit/test_utils.py +0 -220
- test/unit/utils/__init__.py +0 -0
- test/unit/utils/data_generator.py +0 -32
- unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
- unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info/licenses}/LICENSE.md +0 -0
|
@@ -1,143 +0,0 @@
|
|
|
1
|
-
import asyncio
|
|
2
|
-
import json
|
|
3
|
-
import os
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
from typing import Optional
|
|
6
|
-
|
|
7
|
-
import numpy as np
|
|
8
|
-
import pytest
|
|
9
|
-
from redis import exceptions as redis_exceptions
|
|
10
|
-
from redis.asyncio import Redis, from_url
|
|
11
|
-
|
|
12
|
-
from test.integration.connectors.utils.constants import DESTINATION_TAG, NOSQL_TAG
|
|
13
|
-
from test.integration.utils import requires_env
|
|
14
|
-
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
15
|
-
from unstructured_ingest.processes.connectors.redisdb import (
|
|
16
|
-
CONNECTOR_TYPE as REDIS_CONNECTOR_TYPE,
|
|
17
|
-
)
|
|
18
|
-
from unstructured_ingest.processes.connectors.redisdb import (
|
|
19
|
-
RedisAccessConfig,
|
|
20
|
-
RedisConnectionConfig,
|
|
21
|
-
RedisUploader,
|
|
22
|
-
RedisUploaderConfig,
|
|
23
|
-
)
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
async def delete_record(client: Redis, element_id: str, key_prefix: str) -> None:
    """Delete the Redis key formed by ``key_prefix`` immediately followed by ``element_id``.

    Args:
        client: Async Redis client used to issue the delete.
        element_id: Identifier of the element whose record is removed.
        key_prefix: Prefix prepended to ``element_id`` to build the key.
    """
    await client.delete(f"{key_prefix}{element_id}")
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
async def validate_upload(client: Redis, first_element: dict, key_prefix: str) -> None:
    """Assert that ``first_element`` is stored in Redis under its prefixed key.

    The record is first read as a JSON document; if Redis answers with a
    ``ResponseError`` the key is read as a plain value and JSON-decoded instead.
    The stored ``element_id`` and ``text`` must equal the expected values and the
    stored embeddings must be within ``1e-10`` (Euclidean distance) of the expected ones.

    Args:
        client: Async Redis client used to read the record back.
        first_element: Element dict providing ``element_id``, ``text`` and ``embeddings``.
        key_prefix: Prefix prepended to the element id to build the key.

    Raises:
        AssertionError: If the record is missing or does not match ``first_element``.
    """
    element_id = first_element["element_id"]
    key_with_prefix = f"{key_prefix}{element_id}"
    expected_text = first_element["text"]
    expected_embeddings = first_element["embeddings"]
    async with client.pipeline(transaction=True) as pipe:
        try:
            results = await pipe.json().get(key_with_prefix, "$").execute()
            matches = results[0]
            # Check presence before indexing so a missing key fails with a clear
            # assertion instead of a TypeError/IndexError.
            assert matches, f"no JSON document stored at key {key_with_prefix!r}"
            response = matches[0]
        except redis_exceptions.ResponseError:
            results = await pipe.get(key_with_prefix).execute()
            raw_value = results[0]
            assert raw_value is not None, f"no value stored at key {key_with_prefix!r}"
            response = json.loads(raw_value)

    # Must hold before any field of ``response`` is read below.
    assert response is not None

    embedding_similarity = np.linalg.norm(
        np.array(response["embeddings"]) - np.array(expected_embeddings)
    )

    assert response["element_id"] == element_id
    assert response["text"] == expected_text
    assert embedding_similarity < 1e-10
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
def _open_redis_client(
    uri: Optional[str], connection_kwargs: dict, password: Optional[str]
) -> Redis:
    """Return an async Redis client built from ``uri`` when set, else from connection kwargs."""
    if uri:
        return from_url(uri)
    return Redis(**connection_kwargs, password=password)


async def redis_destination_test(
    upload_file: Path,
    tmp_path: Path,
    connection_kwargs: dict,
    uploader_config: dict,
    uri: Optional[str] = None,
    password: Optional[str] = None,
):
    """Upload the elements in ``upload_file`` to Redis, validate one record, then clean up.

    Args:
        upload_file: JSON file holding the list of elements to upload.
        tmp_path: Temporary directory supplied by the caller; not used here.
        connection_kwargs: Keyword arguments for ``RedisConnectionConfig`` and, when
            ``uri`` is not given, for the validating ``Redis`` client.
        uploader_config: Extra keyword arguments for ``RedisUploaderConfig``.
        uri: Optional connection URI; takes precedence over ``connection_kwargs``
            when opening the validation and cleanup clients.
        password: Optional password used together with ``connection_kwargs``.
    """
    uploader = RedisUploader(
        connection_config=RedisConnectionConfig(
            **connection_kwargs, access_config=RedisAccessConfig(uri=uri, password=password)
        ),
        upload_config=RedisUploaderConfig(batch_size=10, **uploader_config),
    )
    key_prefix = uploader.upload_config.key_prefix

    file_data = FileData(
        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
        connector_type=REDIS_CONNECTOR_TYPE,
        identifier="mock-file-data",
    )
    with upload_file.open() as upload_fp:
        elements = json.load(upload_fp)
    first_element = elements[0]

    try:
        if uploader.is_async():
            await uploader.run_data_async(data=elements, file_data=file_data)

        async with _open_redis_client(uri, connection_kwargs, password) as client:
            await validate_upload(
                client=client,
                first_element=first_element,
                key_prefix=key_prefix,
            )
    finally:
        # Always remove every uploaded record, even when upload or validation failed.
        async with _open_redis_client(uri, connection_kwargs, password) as client:
            tasks = [
                delete_record(client, element["element_id"], key_prefix) for element in elements
            ]
            await asyncio.gather(*tasks)
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
@pytest.mark.asyncio
@pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
@requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
async def test_redis_destination_azure_with_password(upload_file: Path, tmp_path: Path):
    """Run the Redis destination test using host/port settings plus a password from the env."""
    azure_connection = {
        "host": "utic-dashboard-dev.redis.cache.windows.net",
        "port": 6380,
        "db": 0,
        "ssl": True,
    }
    prefix_config = {"key_prefix": "test_ingest:"}
    await redis_destination_test(
        upload_file,
        tmp_path,
        azure_connection,
        prefix_config,
        password=os.environ["AZURE_REDIS_INGEST_TEST_PASSWORD"],
    )
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
@pytest.mark.asyncio
@pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG, "redis", NOSQL_TAG)
@requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
async def test_redis_destination_azure_with_uri(upload_file: Path, tmp_path: Path):
    """Run the Redis destination test using a single ``rediss://`` connection URI.

    The password is read from ``AZURE_REDIS_INGEST_TEST_PASSWORD`` and embedded in
    the URI; no separate connection kwargs are passed.
    """
    # Imported locally so this test does not rely on a file-level import.
    from urllib.parse import quote

    connection_kwargs = {}
    uploader_config = {
        "key_prefix": "test_ingest:",
    }
    redis_pw = os.environ["AZURE_REDIS_INGEST_TEST_PASSWORD"]
    # Percent-encode the secret: a raw "/", "@" or ":" would be read as URL structure
    # and corrupt the host/credential split.
    # NOTE(review): relies on the Redis URL parser percent-decoding the password — confirm.
    encoded_pw = quote(redis_pw, safe="")
    uri = f"rediss://:{encoded_pw}@utic-dashboard-dev.redis.cache.windows.net:6380/0"
    await redis_destination_test(upload_file, tmp_path, connection_kwargs, uploader_config, uri=uri)
|
|
@@ -1,184 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import tempfile
|
|
3
|
-
import uuid
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
|
|
6
|
-
import pytest
|
|
7
|
-
|
|
8
|
-
from test.integration.connectors.utils.constants import (
|
|
9
|
-
BLOB_STORAGE_TAG,
|
|
10
|
-
DESTINATION_TAG,
|
|
11
|
-
SOURCE_TAG,
|
|
12
|
-
env_setup_path,
|
|
13
|
-
)
|
|
14
|
-
from test.integration.connectors.utils.docker_compose import docker_compose_context
|
|
15
|
-
from test.integration.connectors.utils.validation.source import (
|
|
16
|
-
SourceValidationConfigs,
|
|
17
|
-
source_connector_validation,
|
|
18
|
-
)
|
|
19
|
-
from test.integration.utils import requires_env
|
|
20
|
-
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
21
|
-
from unstructured_ingest.errors_v2 import UserAuthError, UserError
|
|
22
|
-
from unstructured_ingest.processes.connectors.fsspec.s3 import (
|
|
23
|
-
CONNECTOR_TYPE,
|
|
24
|
-
S3AccessConfig,
|
|
25
|
-
S3ConnectionConfig,
|
|
26
|
-
S3Downloader,
|
|
27
|
-
S3DownloaderConfig,
|
|
28
|
-
S3Indexer,
|
|
29
|
-
S3IndexerConfig,
|
|
30
|
-
S3Uploader,
|
|
31
|
-
S3UploaderConfig,
|
|
32
|
-
)
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def validate_predownload_file_data(file_data: FileData):
    """Assert the pre-download state: matching connector type and no local download path."""
    connector_type = file_data.connector_type
    assert connector_type == CONNECTOR_TYPE
    download_path = file_data.local_download_path
    assert download_path is None
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
def validate_postdownload_file_data(file_data: FileData):
    """Assert the post-download state: matching connector type and a local download path set."""
    connector_type = file_data.connector_type
    assert connector_type == CONNECTOR_TYPE
    download_path = file_data.local_download_path
    assert download_path is not None
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
@pytest.fixture
def anon_connection_config() -> S3ConnectionConfig:
    """Provide an S3 connection config with default access settings and ``anonymous=True``."""
    default_access = S3AccessConfig()
    return S3ConnectionConfig(access_config=default_access, anonymous=True)
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
async def test_s3_source(anon_connection_config: S3ConnectionConfig):
    """Index and download the ``small-pdf-set`` prefix anonymously, expecting four files."""
    index_cfg = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/small-pdf-set/")
    with tempfile.TemporaryDirectory() as scratch_dir:
        download_cfg = S3DownloaderConfig(download_dir=Path(scratch_dir))
        s3_indexer = S3Indexer(connection_config=anon_connection_config, index_config=index_cfg)
        s3_downloader = S3Downloader(
            connection_config=anon_connection_config, download_config=download_cfg
        )
        validation = SourceValidationConfigs(
            test_id="s3",
            predownload_file_data_check=validate_predownload_file_data,
            postdownload_file_data_check=validate_postdownload_file_data,
            expected_num_files=4,
        )
        await source_connector_validation(
            indexer=s3_indexer,
            downloader=s3_downloader,
            configs=validation,
        )
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
async def test_s3_source_special_char(anon_connection_config: S3ConnectionConfig):
    """Index and download the ``special-characters`` prefix anonymously, expecting one file."""
    index_cfg = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/special-characters/")
    with tempfile.TemporaryDirectory() as scratch_dir:
        download_cfg = S3DownloaderConfig(download_dir=Path(scratch_dir))
        s3_indexer = S3Indexer(connection_config=anon_connection_config, index_config=index_cfg)
        s3_downloader = S3Downloader(
            connection_config=anon_connection_config, download_config=download_cfg
        )
        validation = SourceValidationConfigs(
            test_id="s3-specialchar",
            predownload_file_data_check=validate_predownload_file_data,
            postdownload_file_data_check=validate_postdownload_file_data,
            expected_num_files=1,
        )
        await source_connector_validation(
            indexer=s3_indexer,
            downloader=s3_downloader,
            configs=validation,
        )
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
def test_s3_source_no_access(anon_connection_config: S3ConnectionConfig):
    """Expect ``UserAuthError`` from the indexer precheck on this bucket with anonymous access."""
    s3_indexer = S3Indexer(
        connection_config=anon_connection_config,
        index_config=S3IndexerConfig(remote_url="s3://utic-ingest-test-fixtures/destination/"),
    )
    with pytest.raises(UserAuthError):
        s3_indexer.precheck()
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
def test_s3_source_no_bucket(anon_connection_config: S3ConnectionConfig):
    """Expect ``UserError`` from the indexer precheck when pointed at ``s3://fake-bucket``."""
    s3_indexer = S3Indexer(
        connection_config=anon_connection_config,
        index_config=S3IndexerConfig(remote_url="s3://fake-bucket"),
    )
    with pytest.raises(UserError):
        s3_indexer.precheck()
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "minio", BLOB_STORAGE_TAG)
async def test_s3_minio_source(anon_connection_config: S3ConnectionConfig):
    """Validate the S3 source connector against the docker-compose MinIO source setup.

    The connection config is pointed at ``http://localhost:9000`` and exactly one
    file is expected; timestamp-like fields are excluded from the comparison.
    """
    anon_connection_config.endpoint_url = "http://localhost:9000"
    index_cfg = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/")
    compose_dir = env_setup_path / "minio" / "source"
    ignored_fields = [
        "metadata.date_modified",
        "metadata.date_created",
        "additional_metadata.LastModified",
    ]
    with docker_compose_context(docker_compose_path=compose_dir):
        with tempfile.TemporaryDirectory() as scratch_dir:
            download_cfg = S3DownloaderConfig(download_dir=Path(scratch_dir))
            s3_indexer = S3Indexer(
                connection_config=anon_connection_config, index_config=index_cfg
            )
            s3_downloader = S3Downloader(
                connection_config=anon_connection_config, download_config=download_cfg
            )
            validation = SourceValidationConfigs(
                test_id="s3-minio",
                predownload_file_data_check=validate_predownload_file_data,
                postdownload_file_data_check=validate_postdownload_file_data,
                expected_num_files=1,
                exclude_fields_extend=ignored_fields,
            )
            await source_connector_validation(
                indexer=s3_indexer,
                downloader=s3_downloader,
                configs=validation,
            )
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
def get_aws_credentials() -> dict:
    """Read the S3 test credentials from the environment.

    Returns:
        A dict with ``aws_access_key_id`` and ``aws_secret_access_key`` taken from
        ``S3_INGEST_TEST_ACCESS_KEY`` and ``S3_INGEST_TEST_SECRET_KEY``.

    Raises:
        AssertionError: If either variable is unset or empty; the message names
            the offending variable.
    """
    access_key = os.getenv("S3_INGEST_TEST_ACCESS_KEY")
    assert access_key, "S3_INGEST_TEST_ACCESS_KEY must be set to a non-empty value"
    secret_key = os.getenv("S3_INGEST_TEST_SECRET_KEY")
    assert secret_key, "S3_INGEST_TEST_SECRET_KEY must be set to a non-empty value"
    return {"aws_access_key_id": access_key, "aws_secret_access_key": secret_key}
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, BLOB_STORAGE_TAG)
@requires_env("S3_INGEST_TEST_ACCESS_KEY", "S3_INGEST_TEST_SECRET_KEY")
async def test_s3_destination(upload_file: Path):
    """Upload ``upload_file`` to a unique S3 destination prefix and expect exactly one object.

    The destination prefix is removed afterwards, whether or not the upload succeeded.
    """
    aws_credentials = get_aws_credentials()
    s3_bucket = "s3://utic-ingest-test-fixtures"
    # A fresh UUID per run keeps runs from seeing each other's uploads.
    destination_path = f"{s3_bucket}/destination/{uuid.uuid4()}"
    connection_config = S3ConnectionConfig(
        access_config=S3AccessConfig(
            key=aws_credentials["aws_access_key_id"],
            secret=aws_credentials["aws_secret_access_key"],
        ),
    )
    upload_config = S3UploaderConfig(remote_url=destination_path)
    uploader = S3Uploader(connection_config=connection_config, upload_config=upload_config)
    s3fs = uploader.fs
    file_data = FileData(
        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
        connector_type=CONNECTOR_TYPE,
        identifier="mock file data",
    )
    try:
        uploader.precheck()
        if uploader.is_async():
            await uploader.run_async(path=upload_file, file_data=file_data)
        else:
            uploader.run(path=upload_file, file_data=file_data)
        # Entries named "_empty" are ignored when counting uploaded files.
        uploaded_files = [
            Path(file) for file in s3fs.ls(path=destination_path) if Path(file).name != "_empty"
        ]
        assert len(uploaded_files) == 1
    finally:
        # Only clean up when something exists: removing a missing prefix would raise
        # here and hide the original failure from precheck/upload.
        if s3fs.exists(destination_path):
            s3fs.rm(path=destination_path, recursive=True)
|
|
@@ -1,222 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
|
|
3
|
-
import pytest
|
|
4
|
-
|
|
5
|
-
from test.integration.connectors.utils.constants import BLOB_STORAGE_TAG, SOURCE_TAG
|
|
6
|
-
from test.integration.connectors.utils.validation.source import (
|
|
7
|
-
SourceValidationConfigs,
|
|
8
|
-
source_connector_validation,
|
|
9
|
-
)
|
|
10
|
-
from test.integration.utils import requires_env
|
|
11
|
-
from unstructured_ingest.processes.connectors.sharepoint import (
|
|
12
|
-
CONNECTOR_TYPE,
|
|
13
|
-
SharepointAccessConfig,
|
|
14
|
-
SharepointConnectionConfig,
|
|
15
|
-
SharepointDownloader,
|
|
16
|
-
SharepointDownloaderConfig,
|
|
17
|
-
SharepointIndexer,
|
|
18
|
-
SharepointIndexerConfig,
|
|
19
|
-
)
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def sharepoint_config():
    """Build a small config object holding the SharePoint test credentials from the environment.

    Returns:
        An object with ``client_id``, ``client_cred``, ``user_pname`` and ``tenant``
        read from ``SHAREPOINT_CLIENT_ID``, ``SHAREPOINT_CRED``, ``MS_USER_PNAME``
        and ``MS_TENANT_ID`` respectively.

    Raises:
        KeyError: If any of those environment variables is not set.
    """
    # Attribute name -> environment variable, in the order the values are read.
    env_by_attr = {
        "client_id": "SHAREPOINT_CLIENT_ID",
        "client_cred": "SHAREPOINT_CRED",
        "user_pname": "MS_USER_PNAME",
        "tenant": "MS_TENANT_ID",
    }

    class SharepointTestConfig:
        def __init__(self):
            for attr_name, env_name in env_by_attr.items():
                setattr(self, attr_name, os.environ[env_name])

    return SharepointTestConfig()
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
@requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME")
async def test_sharepoint_source(temp_dir):
    """Recursively index the test site with no path filter and expect four files."""
    creds = sharepoint_config()

    # Connection settings shared by the indexer and the downloader.
    sp_access = SharepointAccessConfig(client_cred=creds.client_cred)
    sp_connection = SharepointConnectionConfig(
        client_id=creds.client_id,
        site="https://unstructuredio.sharepoint.com/sites/utic-platform-test-source",
        tenant=creds.tenant,
        user_pname=creds.user_pname,
        access_config=sp_access,
    )
    sp_index_cfg = SharepointIndexerConfig(recursive=True)
    sp_download_cfg = SharepointDownloaderConfig(download_dir=temp_dir)

    sp_indexer = SharepointIndexer(connection_config=sp_connection, index_config=sp_index_cfg)
    sp_downloader = SharepointDownloader(
        connection_config=sp_connection, download_config=sp_download_cfg
    )

    # Fields excluded from the comparison against the expected fixtures.
    ignored_fields = [
        "metadata.date_created",
        "metadata.date_modified",
        "additional_metadata.LastModified",
        "additional_metadata.@microsoft.graph.downloadUrl",
    ]
    validation = SourceValidationConfigs(
        test_id="sharepoint1",
        expected_num_files=4,
        validate_downloaded_files=True,
        exclude_fields_extend=ignored_fields,
    )
    await source_connector_validation(
        indexer=sp_indexer,
        downloader=sp_downloader,
        configs=validation,
    )
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
@requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME")
async def test_sharepoint_source_with_path(temp_dir):
    """Recursively index ``Folder1`` of the test site and expect two files."""
    creds = sharepoint_config()

    # Connection settings shared by the indexer and the downloader.
    sp_access = SharepointAccessConfig(client_cred=creds.client_cred)
    sp_connection = SharepointConnectionConfig(
        client_id=creds.client_id,
        site="https://unstructuredio.sharepoint.com/sites/utic-platform-test-source",
        tenant=creds.tenant,
        user_pname=creds.user_pname,
        access_config=sp_access,
    )
    sp_index_cfg = SharepointIndexerConfig(recursive=True, path="Folder1")
    sp_download_cfg = SharepointDownloaderConfig(download_dir=temp_dir)

    sp_indexer = SharepointIndexer(connection_config=sp_connection, index_config=sp_index_cfg)
    sp_downloader = SharepointDownloader(
        connection_config=sp_connection, download_config=sp_download_cfg
    )

    # Fields excluded from the comparison against the expected fixtures.
    ignored_fields = [
        "metadata.date_created",
        "metadata.date_modified",
        "additional_metadata.LastModified",
        "additional_metadata.@microsoft.graph.downloadUrl",
    ]
    validation = SourceValidationConfigs(
        test_id="sharepoint2",
        expected_num_files=2,
        validate_downloaded_files=True,
        exclude_fields_extend=ignored_fields,
    )
    await source_connector_validation(
        indexer=sp_indexer,
        downloader=sp_downloader,
        configs=validation,
    )
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
@requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME")
async def test_sharepoint_root_with_path(temp_dir):
    """Recursively index ``e2e-test-folder`` from the root site URL and expect one file."""
    creds = sharepoint_config()

    # Connection settings shared by the indexer and the downloader.
    sp_access = SharepointAccessConfig(client_cred=creds.client_cred)
    sp_connection = SharepointConnectionConfig(
        client_id=creds.client_id,
        site="https://unstructuredio.sharepoint.com/",
        tenant=creds.tenant,
        user_pname=creds.user_pname,
        access_config=sp_access,
    )
    sp_index_cfg = SharepointIndexerConfig(recursive=True, path="e2e-test-folder")
    sp_download_cfg = SharepointDownloaderConfig(download_dir=temp_dir)

    sp_indexer = SharepointIndexer(connection_config=sp_connection, index_config=sp_index_cfg)
    sp_downloader = SharepointDownloader(
        connection_config=sp_connection, download_config=sp_download_cfg
    )

    # Fields excluded from the comparison against the expected fixtures.
    ignored_fields = [
        "metadata.date_created",
        "metadata.date_modified",
        "additional_metadata.LastModified",
        "additional_metadata.@microsoft.graph.downloadUrl",
    ]
    validation = SourceValidationConfigs(
        test_id="sharepoint3",
        expected_num_files=1,
        validate_downloaded_files=True,
        exclude_fields_extend=ignored_fields,
    )
    await source_connector_validation(
        indexer=sp_indexer,
        downloader=sp_downloader,
        configs=validation,
    )
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
@requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME")
async def test_sharepoint_shared_documents(temp_dir):
    """Recursively index ``Shared Documents`` of the test site and expect four files."""
    creds = sharepoint_config()

    # Connection settings shared by the indexer and the downloader.
    sp_access = SharepointAccessConfig(client_cred=creds.client_cred)
    sp_connection = SharepointConnectionConfig(
        client_id=creds.client_id,
        site="https://unstructuredio.sharepoint.com/sites/utic-platform-test-source",
        tenant=creds.tenant,
        user_pname=creds.user_pname,
        access_config=sp_access,
    )
    sp_index_cfg = SharepointIndexerConfig(recursive=True, path="Shared Documents")
    sp_download_cfg = SharepointDownloaderConfig(download_dir=temp_dir)

    sp_indexer = SharepointIndexer(connection_config=sp_connection, index_config=sp_index_cfg)
    sp_downloader = SharepointDownloader(
        connection_config=sp_connection, download_config=sp_download_cfg
    )

    # Fields excluded from the comparison against the expected fixtures.
    ignored_fields = [
        "metadata.date_created",
        "metadata.date_modified",
        "additional_metadata.LastModified",
        "additional_metadata.@microsoft.graph.downloadUrl",
    ]
    validation = SourceValidationConfigs(
        test_id="sharepoint4",
        expected_num_files=4,
        validate_downloaded_files=True,
        exclude_fields_extend=ignored_fields,
    )
    await source_connector_validation(
        indexer=sp_indexer,
        downloader=sp_downloader,
        configs=validation,
    )
|