unstructured-ingest 0.7.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/embed/mixedbreadai.py +0 -1
- unstructured_ingest/interfaces/upload_stager.py +2 -2
- unstructured_ingest/interfaces/uploader.py +3 -3
- unstructured_ingest/logger.py +2 -93
- unstructured_ingest/main.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/processes/chunker.py +4 -0
- unstructured_ingest/processes/connectors/airtable.py +4 -2
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +10 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +2 -2
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/confluence.py +0 -1
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
- unstructured_ingest/processes/connectors/delta_table.py +1 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
- unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -2
- unstructured_ingest/processes/connectors/google_drive.py +0 -2
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
- unstructured_ingest/processes/connectors/kdbai.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +1 -2
- unstructured_ingest/processes/connectors/pinecone.py +0 -1
- unstructured_ingest/processes/connectors/redisdb.py +28 -24
- unstructured_ingest/processes/connectors/salesforce.py +1 -1
- unstructured_ingest/processes/connectors/slack.py +1 -2
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
- unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
- unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
- unstructured_ingest/processes/connectors/sql/sql.py +3 -4
- unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
- unstructured_ingest/processes/connectors/vectara.py +0 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
- unstructured_ingest/processes/embedder.py +2 -2
- unstructured_ingest/processes/filter.py +1 -1
- unstructured_ingest/processes/partitioner.py +4 -0
- unstructured_ingest/processes/utils/blob_storage.py +2 -2
- unstructured_ingest/unstructured_api.py +13 -8
- unstructured_ingest/utils/data_prep.py +8 -32
- unstructured_ingest/utils/string_and_date_utils.py +3 -3
- unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +54 -187
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
- examples/__init__.py +0 -0
- examples/airtable.py +0 -44
- examples/azure_cognitive_search.py +0 -55
- examples/chroma.py +0 -54
- examples/couchbase.py +0 -55
- examples/databricks_volumes_dest.py +0 -55
- examples/databricks_volumes_source.py +0 -53
- examples/delta_table.py +0 -45
- examples/discord_example.py +0 -36
- examples/elasticsearch.py +0 -49
- examples/google_drive.py +0 -45
- examples/kdbai.py +0 -54
- examples/local.py +0 -36
- examples/milvus.py +0 -44
- examples/mongodb.py +0 -53
- examples/opensearch.py +0 -50
- examples/pinecone.py +0 -57
- examples/s3.py +0 -38
- examples/salesforce.py +0 -44
- examples/sharepoint.py +0 -47
- examples/singlestore.py +0 -49
- examples/sql.py +0 -90
- examples/vectara.py +0 -54
- examples/weaviate.py +0 -44
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +0 -31
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +0 -38
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +0 -273
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +0 -90
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +0 -14
- test/integration/connectors/duckdb/test_duckdb.py +0 -90
- test/integration/connectors/duckdb/test_motherduck.py +0 -95
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +0 -34
- test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
- test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
- test/integration/connectors/sql/test_postgres.py +0 -201
- test/integration/connectors/sql/test_singlestore.py +0 -182
- test/integration/connectors/sql/test_snowflake.py +0 -244
- test/integration/connectors/sql/test_sqlite.py +0 -168
- test/integration/connectors/sql/test_vastdb.py +0 -34
- test/integration/connectors/test_astradb.py +0 -287
- test/integration/connectors/test_azure_ai_search.py +0 -254
- test/integration/connectors/test_chroma.py +0 -136
- test/integration/connectors/test_confluence.py +0 -111
- test/integration/connectors/test_delta_table.py +0 -183
- test/integration/connectors/test_dropbox.py +0 -151
- test/integration/connectors/test_github.py +0 -49
- test/integration/connectors/test_google_drive.py +0 -257
- test/integration/connectors/test_jira.py +0 -67
- test/integration/connectors/test_lancedb.py +0 -247
- test/integration/connectors/test_milvus.py +0 -208
- test/integration/connectors/test_mongodb.py +0 -335
- test/integration/connectors/test_neo4j.py +0 -244
- test/integration/connectors/test_notion.py +0 -152
- test/integration/connectors/test_onedrive.py +0 -163
- test/integration/connectors/test_pinecone.py +0 -387
- test/integration/connectors/test_qdrant.py +0 -216
- test/integration/connectors/test_redis.py +0 -143
- test/integration/connectors/test_s3.py +0 -184
- test/integration/connectors/test_sharepoint.py +0 -222
- test/integration/connectors/test_vectara.py +0 -282
- test/integration/connectors/test_zendesk.py +0 -120
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +0 -13
- test/integration/connectors/utils/docker.py +0 -151
- test/integration/connectors/utils/docker_compose.py +0 -59
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +0 -77
- test/integration/connectors/utils/validation/equality.py +0 -76
- test/integration/connectors/utils/validation/source.py +0 -331
- test/integration/connectors/utils/validation/utils.py +0 -36
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +0 -15
- test/integration/connectors/weaviate/test_cloud.py +0 -39
- test/integration/connectors/weaviate/test_local.py +0 -152
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +0 -13
- test/integration/embedders/test_azure_openai.py +0 -57
- test/integration/embedders/test_bedrock.py +0 -103
- test/integration/embedders/test_huggingface.py +0 -24
- test/integration/embedders/test_mixedbread.py +0 -71
- test/integration/embedders/test_octoai.py +0 -75
- test/integration/embedders/test_openai.py +0 -74
- test/integration/embedders/test_togetherai.py +0 -71
- test/integration/embedders/test_vertexai.py +0 -63
- test/integration/embedders/test_voyageai.py +0 -79
- test/integration/embedders/utils.py +0 -66
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +0 -76
- test/integration/utils.py +0 -15
- test/unit/__init__.py +0 -0
- test/unit/chunkers/__init__.py +0 -0
- test/unit/chunkers/test_chunkers.py +0 -49
- test/unit/connectors/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
- test/unit/connectors/motherduck/__init__.py +0 -0
- test/unit/connectors/motherduck/test_base.py +0 -73
- test/unit/connectors/sql/__init__.py +0 -0
- test/unit/connectors/sql/test_sql.py +0 -152
- test/unit/connectors/test_confluence.py +0 -71
- test/unit/connectors/test_jira.py +0 -401
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +0 -42
- test/unit/embed/test_octoai.py +0 -27
- test/unit/embed/test_openai.py +0 -28
- test/unit/embed/test_vertexai.py +0 -25
- test/unit/embed/test_voyageai.py +0 -24
- test/unit/embedders/__init__.py +0 -0
- test/unit/embedders/test_bedrock.py +0 -36
- test/unit/embedders/test_huggingface.py +0 -48
- test/unit/embedders/test_mixedbread.py +0 -37
- test/unit/embedders/test_octoai.py +0 -35
- test/unit/embedders/test_openai.py +0 -35
- test/unit/embedders/test_togetherai.py +0 -37
- test/unit/embedders/test_vertexai.py +0 -37
- test/unit/embedders/test_voyageai.py +0 -38
- test/unit/partitioners/__init__.py +0 -0
- test/unit/partitioners/test_partitioner.py +0 -63
- test/unit/test_error.py +0 -27
- test/unit/test_html.py +0 -112
- test/unit/test_interfaces.py +0 -26
- test/unit/test_logger.py +0 -78
- test/unit/test_utils.py +0 -220
- test/unit/utils/__init__.py +0 -0
- test/unit/utils/data_generator.py +0 -32
- unstructured_ingest-0.7.1.dist-info/METADATA +0 -383
- unstructured_ingest-0.7.1.dist-info/top_level.txt +0 -3
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
|
@@ -1,111 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
|
|
3
|
-
import pytest
|
|
4
|
-
|
|
5
|
-
from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
|
|
6
|
-
from test.integration.connectors.utils.validation.source import (
|
|
7
|
-
SourceValidationConfigs,
|
|
8
|
-
source_connector_validation,
|
|
9
|
-
)
|
|
10
|
-
from test.integration.utils import requires_env
|
|
11
|
-
from unstructured_ingest.processes.connectors.confluence import (
|
|
12
|
-
CONNECTOR_TYPE,
|
|
13
|
-
ConfluenceAccessConfig,
|
|
14
|
-
ConfluenceConnectionConfig,
|
|
15
|
-
ConfluenceDownloader,
|
|
16
|
-
ConfluenceDownloaderConfig,
|
|
17
|
-
ConfluenceIndexer,
|
|
18
|
-
ConfluenceIndexerConfig,
|
|
19
|
-
)
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
@pytest.mark.asyncio
|
|
23
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
|
|
24
|
-
@requires_env("CONFLUENCE_USER_EMAIL", "CONFLUENCE_API_TOKEN")
|
|
25
|
-
async def test_confluence_source(temp_dir):
|
|
26
|
-
# Retrieve environment variables
|
|
27
|
-
confluence_url = "https://unstructured-ingest-test.atlassian.net"
|
|
28
|
-
user_email = os.environ["CONFLUENCE_USER_EMAIL"]
|
|
29
|
-
api_token = os.environ["CONFLUENCE_API_TOKEN"]
|
|
30
|
-
spaces = ["testteamsp", "MFS"]
|
|
31
|
-
|
|
32
|
-
# Create connection and indexer configurations
|
|
33
|
-
access_config = ConfluenceAccessConfig(api_token=api_token)
|
|
34
|
-
connection_config = ConfluenceConnectionConfig(
|
|
35
|
-
url=confluence_url,
|
|
36
|
-
username=user_email,
|
|
37
|
-
access_config=access_config,
|
|
38
|
-
)
|
|
39
|
-
index_config = ConfluenceIndexerConfig(
|
|
40
|
-
max_num_of_spaces=500,
|
|
41
|
-
max_num_of_docs_from_each_space=100,
|
|
42
|
-
spaces=spaces,
|
|
43
|
-
)
|
|
44
|
-
|
|
45
|
-
download_config = ConfluenceDownloaderConfig(download_dir=temp_dir)
|
|
46
|
-
|
|
47
|
-
# Instantiate indexer and downloader
|
|
48
|
-
indexer = ConfluenceIndexer(
|
|
49
|
-
connection_config=connection_config,
|
|
50
|
-
index_config=index_config,
|
|
51
|
-
)
|
|
52
|
-
downloader = ConfluenceDownloader(
|
|
53
|
-
connection_config=connection_config,
|
|
54
|
-
download_config=download_config,
|
|
55
|
-
)
|
|
56
|
-
|
|
57
|
-
# Run the source connector validation
|
|
58
|
-
await source_connector_validation(
|
|
59
|
-
indexer=indexer,
|
|
60
|
-
downloader=downloader,
|
|
61
|
-
configs=SourceValidationConfigs(
|
|
62
|
-
test_id="confluence",
|
|
63
|
-
expected_num_files=11,
|
|
64
|
-
validate_downloaded_files=True,
|
|
65
|
-
),
|
|
66
|
-
)
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
@pytest.mark.asyncio
|
|
70
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
|
|
71
|
-
@requires_env("CONFLUENCE_USER_EMAIL", "CONFLUENCE_API_TOKEN")
|
|
72
|
-
async def test_confluence_source_large(temp_dir):
|
|
73
|
-
# Retrieve environment variables
|
|
74
|
-
confluence_url = "https://unstructured-ingest-test.atlassian.net"
|
|
75
|
-
user_email = os.environ["CONFLUENCE_USER_EMAIL"]
|
|
76
|
-
api_token = os.environ["CONFLUENCE_API_TOKEN"]
|
|
77
|
-
spaces = ["testteamsp1"]
|
|
78
|
-
|
|
79
|
-
# Create connection and indexer configurations
|
|
80
|
-
access_config = ConfluenceAccessConfig(api_token=api_token)
|
|
81
|
-
connection_config = ConfluenceConnectionConfig(
|
|
82
|
-
url=confluence_url,
|
|
83
|
-
username=user_email,
|
|
84
|
-
access_config=access_config,
|
|
85
|
-
)
|
|
86
|
-
index_config = ConfluenceIndexerConfig(
|
|
87
|
-
max_num_of_spaces=10,
|
|
88
|
-
max_num_of_docs_from_each_space=250,
|
|
89
|
-
spaces=spaces,
|
|
90
|
-
)
|
|
91
|
-
|
|
92
|
-
download_config = ConfluenceDownloaderConfig(download_dir=temp_dir)
|
|
93
|
-
|
|
94
|
-
# Instantiate indexer and downloader
|
|
95
|
-
indexer = ConfluenceIndexer(
|
|
96
|
-
connection_config=connection_config,
|
|
97
|
-
index_config=index_config,
|
|
98
|
-
)
|
|
99
|
-
downloader = ConfluenceDownloader(
|
|
100
|
-
connection_config=connection_config,
|
|
101
|
-
download_config=download_config,
|
|
102
|
-
)
|
|
103
|
-
|
|
104
|
-
# Run the source connector validation
|
|
105
|
-
await source_connector_validation(
|
|
106
|
-
indexer=indexer,
|
|
107
|
-
downloader=downloader,
|
|
108
|
-
configs=SourceValidationConfigs(
|
|
109
|
-
test_id="confluence_large", expected_num_files=250, validate_file_data=False
|
|
110
|
-
),
|
|
111
|
-
)
|
|
@@ -1,183 +0,0 @@
|
|
|
1
|
-
import multiprocessing
|
|
2
|
-
import os
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
import pytest
|
|
6
|
-
from deltalake import DeltaTable
|
|
7
|
-
from fsspec import get_filesystem_class
|
|
8
|
-
|
|
9
|
-
from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
|
|
10
|
-
from test.integration.utils import requires_env
|
|
11
|
-
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
12
|
-
from unstructured_ingest.processes.connectors.delta_table import (
|
|
13
|
-
CONNECTOR_TYPE,
|
|
14
|
-
DeltaTableAccessConfig,
|
|
15
|
-
DeltaTableConnectionConfig,
|
|
16
|
-
DeltaTableUploader,
|
|
17
|
-
DeltaTableUploaderConfig,
|
|
18
|
-
DeltaTableUploadStager,
|
|
19
|
-
DeltaTableUploadStagerConfig,
|
|
20
|
-
)
|
|
21
|
-
|
|
22
|
-
multiprocessing.set_start_method("spawn")
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
@pytest.mark.asyncio
|
|
26
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
|
|
27
|
-
async def test_delta_table_destination_local(upload_file: Path, temp_dir: Path):
|
|
28
|
-
destination_path = str(temp_dir)
|
|
29
|
-
connection_config = DeltaTableConnectionConfig(
|
|
30
|
-
access_config=DeltaTableAccessConfig(),
|
|
31
|
-
table_uri=destination_path,
|
|
32
|
-
)
|
|
33
|
-
stager_config = DeltaTableUploadStagerConfig()
|
|
34
|
-
stager = DeltaTableUploadStager(upload_stager_config=stager_config)
|
|
35
|
-
new_upload_file = stager.run(
|
|
36
|
-
elements_filepath=upload_file,
|
|
37
|
-
output_dir=temp_dir,
|
|
38
|
-
output_filename=upload_file.name,
|
|
39
|
-
)
|
|
40
|
-
|
|
41
|
-
upload_config = DeltaTableUploaderConfig()
|
|
42
|
-
uploader = DeltaTableUploader(connection_config=connection_config, upload_config=upload_config)
|
|
43
|
-
file_data = FileData(
|
|
44
|
-
source_identifiers=SourceIdentifiers(
|
|
45
|
-
fullpath=upload_file.name, filename=new_upload_file.name
|
|
46
|
-
),
|
|
47
|
-
connector_type=CONNECTOR_TYPE,
|
|
48
|
-
identifier="mock file data",
|
|
49
|
-
)
|
|
50
|
-
|
|
51
|
-
if uploader.is_async():
|
|
52
|
-
await uploader.run_async(path=new_upload_file, file_data=file_data)
|
|
53
|
-
else:
|
|
54
|
-
uploader.run(path=new_upload_file, file_data=file_data)
|
|
55
|
-
delta_table_path = os.path.join(destination_path, upload_file.name)
|
|
56
|
-
delta_table = DeltaTable(table_uri=delta_table_path)
|
|
57
|
-
df = delta_table.to_pandas()
|
|
58
|
-
|
|
59
|
-
EXPECTED_COLUMNS = 10
|
|
60
|
-
EXPECTED_ROWS = 22
|
|
61
|
-
assert (
|
|
62
|
-
len(df) == EXPECTED_ROWS
|
|
63
|
-
), f"Number of rows in table vs expected: {len(df)}/{EXPECTED_ROWS}"
|
|
64
|
-
assert (
|
|
65
|
-
len(df.columns) == EXPECTED_COLUMNS
|
|
66
|
-
), f"Number of columns in table vs expected: {len(df.columns)}/{EXPECTED_COLUMNS}"
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
def get_aws_credentials() -> dict:
|
|
70
|
-
access_key = os.getenv("S3_INGEST_TEST_ACCESS_KEY", None)
|
|
71
|
-
assert access_key
|
|
72
|
-
secret_key = os.getenv("S3_INGEST_TEST_SECRET_KEY", None)
|
|
73
|
-
assert secret_key
|
|
74
|
-
return {
|
|
75
|
-
"AWS_ACCESS_KEY_ID": access_key,
|
|
76
|
-
"AWS_SECRET_ACCESS_KEY": secret_key,
|
|
77
|
-
"AWS_REGION": "us-east-2",
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
@pytest.mark.asyncio
|
|
82
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
|
|
83
|
-
@requires_env("S3_INGEST_TEST_ACCESS_KEY", "S3_INGEST_TEST_SECRET_KEY")
|
|
84
|
-
async def test_delta_table_destination_s3(upload_file: Path, temp_dir: Path):
|
|
85
|
-
aws_credentials = get_aws_credentials()
|
|
86
|
-
s3_bucket = "s3://utic-platform-test-destination"
|
|
87
|
-
destination_path = f"{s3_bucket}/destination/test"
|
|
88
|
-
connection_config = DeltaTableConnectionConfig(
|
|
89
|
-
access_config=DeltaTableAccessConfig(
|
|
90
|
-
aws_access_key_id=aws_credentials["AWS_ACCESS_KEY_ID"],
|
|
91
|
-
aws_secret_access_key=aws_credentials["AWS_SECRET_ACCESS_KEY"],
|
|
92
|
-
),
|
|
93
|
-
aws_region=aws_credentials["AWS_REGION"],
|
|
94
|
-
table_uri=destination_path,
|
|
95
|
-
)
|
|
96
|
-
stager_config = DeltaTableUploadStagerConfig()
|
|
97
|
-
stager = DeltaTableUploadStager(upload_stager_config=stager_config)
|
|
98
|
-
new_upload_file = stager.run(
|
|
99
|
-
elements_filepath=upload_file,
|
|
100
|
-
output_dir=temp_dir,
|
|
101
|
-
output_filename=upload_file.name,
|
|
102
|
-
)
|
|
103
|
-
|
|
104
|
-
upload_config = DeltaTableUploaderConfig()
|
|
105
|
-
uploader = DeltaTableUploader(connection_config=connection_config, upload_config=upload_config)
|
|
106
|
-
file_data = FileData(
|
|
107
|
-
source_identifiers=SourceIdentifiers(
|
|
108
|
-
fullpath=upload_file.name, filename=new_upload_file.name
|
|
109
|
-
),
|
|
110
|
-
connector_type=CONNECTOR_TYPE,
|
|
111
|
-
identifier="mock file data",
|
|
112
|
-
)
|
|
113
|
-
|
|
114
|
-
try:
|
|
115
|
-
uploader.precheck()
|
|
116
|
-
if uploader.is_async():
|
|
117
|
-
await uploader.run_async(path=new_upload_file, file_data=file_data)
|
|
118
|
-
else:
|
|
119
|
-
uploader.run(path=new_upload_file, file_data=file_data)
|
|
120
|
-
delta_table_path = os.path.join(destination_path, upload_file.name)
|
|
121
|
-
delta_table = DeltaTable(table_uri=delta_table_path, storage_options=aws_credentials)
|
|
122
|
-
df = delta_table.to_pandas()
|
|
123
|
-
|
|
124
|
-
EXPECTED_COLUMNS = 10
|
|
125
|
-
EXPECTED_ROWS = 22
|
|
126
|
-
assert (
|
|
127
|
-
len(df) == EXPECTED_ROWS
|
|
128
|
-
), f"Number of rows in table vs expected: {len(df)}/{EXPECTED_ROWS}"
|
|
129
|
-
assert (
|
|
130
|
-
len(df.columns) == EXPECTED_COLUMNS
|
|
131
|
-
), f"Number of columns in table vs expected: {len(df.columns)}/{EXPECTED_COLUMNS}"
|
|
132
|
-
finally:
|
|
133
|
-
s3fs = get_filesystem_class("s3")(
|
|
134
|
-
key=aws_credentials["AWS_ACCESS_KEY_ID"],
|
|
135
|
-
secret=aws_credentials["AWS_SECRET_ACCESS_KEY"],
|
|
136
|
-
)
|
|
137
|
-
s3fs.rm(path=destination_path, recursive=True)
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
@pytest.mark.asyncio
|
|
141
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
|
|
142
|
-
@requires_env("S3_INGEST_TEST_ACCESS_KEY", "S3_INGEST_TEST_SECRET_KEY")
|
|
143
|
-
async def test_delta_table_destination_s3_bad_creds(upload_file: Path, temp_dir: Path):
|
|
144
|
-
aws_credentials = {
|
|
145
|
-
"AWS_ACCESS_KEY_ID": "bad key",
|
|
146
|
-
"AWS_SECRET_ACCESS_KEY": "bad secret",
|
|
147
|
-
"AWS_REGION": "us-east-2",
|
|
148
|
-
}
|
|
149
|
-
s3_bucket = "s3://utic-platform-test-destination"
|
|
150
|
-
destination_path = f"{s3_bucket}/destination/test"
|
|
151
|
-
connection_config = DeltaTableConnectionConfig(
|
|
152
|
-
access_config=DeltaTableAccessConfig(
|
|
153
|
-
aws_access_key_id=aws_credentials["AWS_ACCESS_KEY_ID"],
|
|
154
|
-
aws_secret_access_key=aws_credentials["AWS_SECRET_ACCESS_KEY"],
|
|
155
|
-
),
|
|
156
|
-
aws_region=aws_credentials["AWS_REGION"],
|
|
157
|
-
table_uri=destination_path,
|
|
158
|
-
)
|
|
159
|
-
stager_config = DeltaTableUploadStagerConfig()
|
|
160
|
-
stager = DeltaTableUploadStager(upload_stager_config=stager_config)
|
|
161
|
-
new_upload_file = stager.run(
|
|
162
|
-
elements_filepath=upload_file,
|
|
163
|
-
output_dir=temp_dir,
|
|
164
|
-
output_filename=upload_file.name,
|
|
165
|
-
)
|
|
166
|
-
|
|
167
|
-
upload_config = DeltaTableUploaderConfig()
|
|
168
|
-
uploader = DeltaTableUploader(connection_config=connection_config, upload_config=upload_config)
|
|
169
|
-
file_data = FileData(
|
|
170
|
-
source_identifiers=SourceIdentifiers(
|
|
171
|
-
fullpath=upload_file.name, filename=new_upload_file.name
|
|
172
|
-
),
|
|
173
|
-
connector_type=CONNECTOR_TYPE,
|
|
174
|
-
identifier="mock file data",
|
|
175
|
-
)
|
|
176
|
-
|
|
177
|
-
with pytest.raises(Exception) as excinfo:
|
|
178
|
-
if uploader.is_async():
|
|
179
|
-
await uploader.run_async(path=new_upload_file, file_data=file_data)
|
|
180
|
-
else:
|
|
181
|
-
uploader.run(path=new_upload_file, file_data=file_data)
|
|
182
|
-
|
|
183
|
-
assert "403 Forbidden" in str(excinfo.value), f"Exception message did not match: {str(excinfo)}"
|
|
@@ -1,151 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
|
|
3
|
-
import pytest
|
|
4
|
-
import requests
|
|
5
|
-
|
|
6
|
-
from test.integration.connectors.utils.constants import (
|
|
7
|
-
BLOB_STORAGE_TAG,
|
|
8
|
-
SOURCE_TAG,
|
|
9
|
-
)
|
|
10
|
-
from test.integration.connectors.utils.validation.source import (
|
|
11
|
-
SourceValidationConfigs,
|
|
12
|
-
source_connector_validation,
|
|
13
|
-
)
|
|
14
|
-
from test.integration.utils import requires_env
|
|
15
|
-
from unstructured_ingest.processes.connectors.fsspec.dropbox import (
|
|
16
|
-
CONNECTOR_TYPE as DROPBOX_CONNECTOR_TYPE,
|
|
17
|
-
)
|
|
18
|
-
from unstructured_ingest.processes.connectors.fsspec.dropbox import (
|
|
19
|
-
DropboxAccessConfig,
|
|
20
|
-
DropboxConnectionConfig,
|
|
21
|
-
DropboxDownloader,
|
|
22
|
-
DropboxDownloaderConfig,
|
|
23
|
-
DropboxIndexer,
|
|
24
|
-
DropboxIndexerConfig,
|
|
25
|
-
)
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
@pytest.mark.asyncio
|
|
29
|
-
@pytest.mark.tags(DROPBOX_CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
|
|
30
|
-
@requires_env("DROPBOX_REFRESH_TOKEN", "DROPBOX_APP_KEY", "DROPBOX_APP_SECRET")
|
|
31
|
-
async def test_dropbox_source(temp_dir):
|
|
32
|
-
"""
|
|
33
|
-
Integration test for the Dropbox source connector.
|
|
34
|
-
|
|
35
|
-
This test indexes data from dropbox://test-input/ and downloads the resulting files,
|
|
36
|
-
then compares them to fixture data.
|
|
37
|
-
"""
|
|
38
|
-
refresh_token = os.getenv("DROPBOX_REFRESH_TOKEN")
|
|
39
|
-
app_key = os.getenv("DROPBOX_APP_KEY")
|
|
40
|
-
app_secret = os.getenv("DROPBOX_APP_SECRET")
|
|
41
|
-
|
|
42
|
-
connection_config = DropboxConnectionConfig(
|
|
43
|
-
access_config=DropboxAccessConfig(
|
|
44
|
-
refresh_token=refresh_token,
|
|
45
|
-
app_key=app_key,
|
|
46
|
-
app_secret=app_secret,
|
|
47
|
-
)
|
|
48
|
-
)
|
|
49
|
-
|
|
50
|
-
index_config = DropboxIndexerConfig(
|
|
51
|
-
recursive=True,
|
|
52
|
-
remote_url="dropbox://test-input",
|
|
53
|
-
)
|
|
54
|
-
downloader_config = DropboxDownloaderConfig(download_dir=temp_dir)
|
|
55
|
-
|
|
56
|
-
indexer = DropboxIndexer(
|
|
57
|
-
connection_config=connection_config,
|
|
58
|
-
index_config=index_config,
|
|
59
|
-
)
|
|
60
|
-
downloader = DropboxDownloader(
|
|
61
|
-
connection_config=connection_config,
|
|
62
|
-
download_config=downloader_config,
|
|
63
|
-
)
|
|
64
|
-
|
|
65
|
-
await source_connector_validation(
|
|
66
|
-
indexer=indexer,
|
|
67
|
-
downloader=downloader,
|
|
68
|
-
configs=SourceValidationConfigs(
|
|
69
|
-
test_id="dropbox",
|
|
70
|
-
expected_num_files=4,
|
|
71
|
-
validate_downloaded_files=True,
|
|
72
|
-
exclude_fields_extend=[
|
|
73
|
-
"metadata.date_created",
|
|
74
|
-
"metadata.date_modified",
|
|
75
|
-
],
|
|
76
|
-
),
|
|
77
|
-
)
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
@pytest.mark.asyncio
|
|
81
|
-
@pytest.mark.tags(DROPBOX_CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
|
|
82
|
-
@requires_env("DROPBOX_REFRESH_TOKEN", "DROPBOX_APP_KEY", "DROPBOX_APP_SECRET")
|
|
83
|
-
async def test_dropbox_short_lived_token_via_refresh(temp_dir):
|
|
84
|
-
"""
|
|
85
|
-
Demonstrates manually generating an access token from refresh credentials,
|
|
86
|
-
then passing ONLY the short-lived token to the Dropbox connector
|
|
87
|
-
(no app_key, app_secret, or refresh_token in the actual connection config).
|
|
88
|
-
|
|
89
|
-
This effectively mimics an external system that hands us a short-lived token.
|
|
90
|
-
"""
|
|
91
|
-
refresh_token = os.getenv("DROPBOX_REFRESH_TOKEN")
|
|
92
|
-
app_key = os.getenv("DROPBOX_APP_KEY")
|
|
93
|
-
app_secret = os.getenv("DROPBOX_APP_SECRET")
|
|
94
|
-
|
|
95
|
-
# Manually request a short-lived token from Dropbox's OAuth endpoint
|
|
96
|
-
# This call is basically what the connector code does internally,
|
|
97
|
-
# but we're doing it here in the test so we can pass only the short-lived token later.
|
|
98
|
-
response = requests.post(
|
|
99
|
-
"https://api.dropboxapi.com/oauth2/token",
|
|
100
|
-
data={
|
|
101
|
-
"grant_type": "refresh_token",
|
|
102
|
-
"refresh_token": refresh_token,
|
|
103
|
-
},
|
|
104
|
-
auth=(app_key, app_secret),
|
|
105
|
-
timeout=30, # seconds
|
|
106
|
-
)
|
|
107
|
-
response.raise_for_status()
|
|
108
|
-
data = response.json()
|
|
109
|
-
short_lived_token = data["access_token"]
|
|
110
|
-
print("Acquired an access token from Dropbox")
|
|
111
|
-
|
|
112
|
-
# Build connection config with ONLY the short-lived token
|
|
113
|
-
# We omit refresh_token, app_key, and app_secret to confirm that
|
|
114
|
-
# our connector can operate purely on the short-lived token.
|
|
115
|
-
connection_config = DropboxConnectionConfig(
|
|
116
|
-
access_config=DropboxAccessConfig(
|
|
117
|
-
token=short_lived_token,
|
|
118
|
-
app_key=None,
|
|
119
|
-
app_secret=None,
|
|
120
|
-
refresh_token=None,
|
|
121
|
-
)
|
|
122
|
-
)
|
|
123
|
-
|
|
124
|
-
index_config = DropboxIndexerConfig(
|
|
125
|
-
recursive=True,
|
|
126
|
-
remote_url="dropbox://test-input",
|
|
127
|
-
)
|
|
128
|
-
downloader_config = DropboxDownloaderConfig(download_dir=temp_dir)
|
|
129
|
-
|
|
130
|
-
indexer = DropboxIndexer(
|
|
131
|
-
connection_config=connection_config,
|
|
132
|
-
index_config=index_config,
|
|
133
|
-
)
|
|
134
|
-
downloader = DropboxDownloader(
|
|
135
|
-
connection_config=connection_config,
|
|
136
|
-
download_config=downloader_config,
|
|
137
|
-
)
|
|
138
|
-
|
|
139
|
-
await source_connector_validation(
|
|
140
|
-
indexer=indexer,
|
|
141
|
-
downloader=downloader,
|
|
142
|
-
configs=SourceValidationConfigs(
|
|
143
|
-
test_id="dropbox_short_lived_via_refresh",
|
|
144
|
-
expected_num_files=4,
|
|
145
|
-
validate_downloaded_files=True,
|
|
146
|
-
exclude_fields_extend=[
|
|
147
|
-
"metadata.date_created",
|
|
148
|
-
"metadata.date_modified",
|
|
149
|
-
],
|
|
150
|
-
),
|
|
151
|
-
)
|
|
@@ -1,49 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
|
|
3
|
-
import pytest
|
|
4
|
-
|
|
5
|
-
from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
|
|
6
|
-
from test.integration.connectors.utils.validation.source import (
|
|
7
|
-
SourceValidationConfigs,
|
|
8
|
-
source_connector_validation,
|
|
9
|
-
)
|
|
10
|
-
from test.integration.utils import requires_env
|
|
11
|
-
from unstructured_ingest.processes.connectors.github import (
|
|
12
|
-
CONNECTOR_TYPE,
|
|
13
|
-
GithubAccessConfig,
|
|
14
|
-
GithubConnectionConfig,
|
|
15
|
-
GithubDownloader,
|
|
16
|
-
GithubDownloaderConfig,
|
|
17
|
-
GithubIndexer,
|
|
18
|
-
GithubIndexerConfig,
|
|
19
|
-
)
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
|
|
23
|
-
@pytest.mark.asyncio
|
|
24
|
-
@requires_env("GH_READ_ONLY_ACCESS_TOKEN")
|
|
25
|
-
async def test_github_source(temp_dir):
|
|
26
|
-
access_token = os.environ["GH_READ_ONLY_ACCESS_TOKEN"]
|
|
27
|
-
connection_config = GithubConnectionConfig(
|
|
28
|
-
access_config=GithubAccessConfig(access_token=access_token),
|
|
29
|
-
url="dcneiner/Downloadify",
|
|
30
|
-
)
|
|
31
|
-
|
|
32
|
-
indexer = GithubIndexer(
|
|
33
|
-
connection_config=connection_config,
|
|
34
|
-
index_config=GithubIndexerConfig(file_glob=["*.txt", "*.html"]),
|
|
35
|
-
)
|
|
36
|
-
|
|
37
|
-
downloader = GithubDownloader(
|
|
38
|
-
connection_config=connection_config,
|
|
39
|
-
download_config=GithubDownloaderConfig(download_dir=temp_dir),
|
|
40
|
-
)
|
|
41
|
-
|
|
42
|
-
# Run the source connector validation
|
|
43
|
-
await source_connector_validation(
|
|
44
|
-
indexer=indexer,
|
|
45
|
-
downloader=downloader,
|
|
46
|
-
configs=SourceValidationConfigs(
|
|
47
|
-
test_id="github", expected_num_files=2, validate_downloaded_files=True
|
|
48
|
-
),
|
|
49
|
-
)
|