unstructured-ingest 0.7.2-py3-none-any.whl → 1.0.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest may be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/embed/mixedbreadai.py +0 -1
- unstructured_ingest/interfaces/upload_stager.py +2 -2
- unstructured_ingest/interfaces/uploader.py +3 -3
- unstructured_ingest/main.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/processes/chunker.py +4 -0
- unstructured_ingest/processes/connectors/airtable.py +4 -2
- unstructured_ingest/processes/connectors/astradb.py +48 -34
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/confluence.py +0 -1
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
- unstructured_ingest/processes/connectors/delta_table.py +1 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
- unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -2
- unstructured_ingest/processes/connectors/google_drive.py +0 -2
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
- unstructured_ingest/processes/connectors/kdbai.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +1 -2
- unstructured_ingest/processes/connectors/pinecone.py +0 -1
- unstructured_ingest/processes/connectors/redisdb.py +28 -24
- unstructured_ingest/processes/connectors/salesforce.py +1 -1
- unstructured_ingest/processes/connectors/slack.py +1 -2
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
- unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
- unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
- unstructured_ingest/processes/connectors/sql/sql.py +3 -4
- unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
- unstructured_ingest/processes/connectors/vectara.py +0 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
- unstructured_ingest/processes/embedder.py +2 -2
- unstructured_ingest/processes/filter.py +1 -1
- unstructured_ingest/processes/partitioner.py +4 -0
- unstructured_ingest/processes/utils/blob_storage.py +2 -2
- unstructured_ingest/unstructured_api.py +13 -8
- unstructured_ingest/utils/data_prep.py +8 -32
- unstructured_ingest-1.0.2.dist-info/METADATA +226 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/RECORD +50 -184
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/WHEEL +1 -2
- examples/__init__.py +0 -0
- examples/airtable.py +0 -44
- examples/azure_cognitive_search.py +0 -55
- examples/chroma.py +0 -54
- examples/couchbase.py +0 -55
- examples/databricks_volumes_dest.py +0 -55
- examples/databricks_volumes_source.py +0 -53
- examples/delta_table.py +0 -45
- examples/discord_example.py +0 -36
- examples/elasticsearch.py +0 -49
- examples/google_drive.py +0 -45
- examples/kdbai.py +0 -54
- examples/local.py +0 -36
- examples/milvus.py +0 -44
- examples/mongodb.py +0 -53
- examples/opensearch.py +0 -50
- examples/pinecone.py +0 -57
- examples/s3.py +0 -38
- examples/salesforce.py +0 -44
- examples/sharepoint.py +0 -47
- examples/singlestore.py +0 -49
- examples/sql.py +0 -90
- examples/vectara.py +0 -54
- examples/weaviate.py +0 -44
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +0 -31
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +0 -38
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +0 -273
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +0 -90
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +0 -14
- test/integration/connectors/duckdb/test_duckdb.py +0 -90
- test/integration/connectors/duckdb/test_motherduck.py +0 -95
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +0 -34
- test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
- test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
- test/integration/connectors/sql/test_postgres.py +0 -201
- test/integration/connectors/sql/test_singlestore.py +0 -182
- test/integration/connectors/sql/test_snowflake.py +0 -244
- test/integration/connectors/sql/test_sqlite.py +0 -168
- test/integration/connectors/sql/test_vastdb.py +0 -34
- test/integration/connectors/test_astradb.py +0 -287
- test/integration/connectors/test_azure_ai_search.py +0 -254
- test/integration/connectors/test_chroma.py +0 -136
- test/integration/connectors/test_confluence.py +0 -111
- test/integration/connectors/test_delta_table.py +0 -183
- test/integration/connectors/test_dropbox.py +0 -151
- test/integration/connectors/test_github.py +0 -49
- test/integration/connectors/test_google_drive.py +0 -257
- test/integration/connectors/test_jira.py +0 -67
- test/integration/connectors/test_lancedb.py +0 -247
- test/integration/connectors/test_milvus.py +0 -208
- test/integration/connectors/test_mongodb.py +0 -335
- test/integration/connectors/test_neo4j.py +0 -244
- test/integration/connectors/test_notion.py +0 -152
- test/integration/connectors/test_onedrive.py +0 -163
- test/integration/connectors/test_pinecone.py +0 -387
- test/integration/connectors/test_qdrant.py +0 -216
- test/integration/connectors/test_redis.py +0 -143
- test/integration/connectors/test_s3.py +0 -184
- test/integration/connectors/test_sharepoint.py +0 -222
- test/integration/connectors/test_vectara.py +0 -282
- test/integration/connectors/test_zendesk.py +0 -120
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +0 -13
- test/integration/connectors/utils/docker.py +0 -151
- test/integration/connectors/utils/docker_compose.py +0 -59
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +0 -77
- test/integration/connectors/utils/validation/equality.py +0 -76
- test/integration/connectors/utils/validation/source.py +0 -331
- test/integration/connectors/utils/validation/utils.py +0 -36
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +0 -15
- test/integration/connectors/weaviate/test_cloud.py +0 -39
- test/integration/connectors/weaviate/test_local.py +0 -152
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +0 -13
- test/integration/embedders/test_azure_openai.py +0 -57
- test/integration/embedders/test_bedrock.py +0 -103
- test/integration/embedders/test_huggingface.py +0 -24
- test/integration/embedders/test_mixedbread.py +0 -71
- test/integration/embedders/test_octoai.py +0 -75
- test/integration/embedders/test_openai.py +0 -74
- test/integration/embedders/test_togetherai.py +0 -71
- test/integration/embedders/test_vertexai.py +0 -63
- test/integration/embedders/test_voyageai.py +0 -79
- test/integration/embedders/utils.py +0 -66
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +0 -76
- test/integration/utils.py +0 -15
- test/unit/__init__.py +0 -0
- test/unit/chunkers/__init__.py +0 -0
- test/unit/chunkers/test_chunkers.py +0 -49
- test/unit/connectors/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
- test/unit/connectors/motherduck/__init__.py +0 -0
- test/unit/connectors/motherduck/test_base.py +0 -73
- test/unit/connectors/sql/__init__.py +0 -0
- test/unit/connectors/sql/test_sql.py +0 -152
- test/unit/connectors/test_confluence.py +0 -71
- test/unit/connectors/test_jira.py +0 -401
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +0 -42
- test/unit/embed/test_octoai.py +0 -27
- test/unit/embed/test_openai.py +0 -28
- test/unit/embed/test_vertexai.py +0 -25
- test/unit/embed/test_voyageai.py +0 -24
- test/unit/embedders/__init__.py +0 -0
- test/unit/embedders/test_bedrock.py +0 -36
- test/unit/embedders/test_huggingface.py +0 -48
- test/unit/embedders/test_mixedbread.py +0 -37
- test/unit/embedders/test_octoai.py +0 -35
- test/unit/embedders/test_openai.py +0 -35
- test/unit/embedders/test_togetherai.py +0 -37
- test/unit/embedders/test_vertexai.py +0 -37
- test/unit/embedders/test_voyageai.py +0 -38
- test/unit/partitioners/__init__.py +0 -0
- test/unit/partitioners/test_partitioner.py +0 -63
- test/unit/test_error.py +0 -27
- test/unit/test_html.py +0 -112
- test/unit/test_interfaces.py +0 -26
- test/unit/test_utils.py +0 -220
- test/unit/utils/__init__.py +0 -0
- test/unit/utils/data_generator.py +0 -32
- unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
- unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info/licenses}/LICENSE.md +0 -0
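As the listing above shows, the 1.0.2 wheel no longer ships the examples/ and test/ trees or top_level.txt, which accounts for most of the removed files and for the much smaller RECORD (+50 -184). The snippet below is a minimal sketch, not part of the package or of this diff, that uses only the Python standard library to list the top-level paths recorded for an installed distribution, so the slimmer layout can be confirmed after upgrading.

```python
# Minimal sketch (illustrative only, not shipped with unstructured-ingest):
# list the top-level paths recorded for an installed distribution.
from importlib.metadata import distribution

dist = distribution("unstructured-ingest")
top_level = sorted({path.parts[0] for path in (dist.files or [])})
print(top_level)
# With 0.7.2 installed this includes "examples" and "test"; with 1.0.2 it should be
# limited to "unstructured_ingest" and the unstructured_ingest-1.0.2.dist-info directory.
```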
test/integration/connectors/test_google_drive.py (removed; +0 -257 in the listing above):

```diff
@@ -1,257 +0,0 @@
-import os
-import uuid
-
-import pytest
-from googleapiclient.errors import HttpError
-
-from test.integration.connectors.utils.constants import (
-    SOURCE_TAG,
-    UNCATEGORIZED_TAG,
-)
-from test.integration.connectors.utils.validation.source import (
-    SourceValidationConfigs,
-    get_all_file_data,
-    run_all_validations,
-    update_fixtures,
-)
-from test.integration.utils import requires_env
-from unstructured_ingest.error import (
-    SourceConnectionError,
-)
-from unstructured_ingest.interfaces import Downloader, Indexer
-from unstructured_ingest.processes.connectors.google_drive import (
-    CONNECTOR_TYPE,
-    GoogleDriveAccessConfig,
-    GoogleDriveConnectionConfig,
-    GoogleDriveDownloader,
-    GoogleDriveDownloaderConfig,
-    GoogleDriveIndexer,
-    GoogleDriveIndexerConfig,
-)
-
-
-@pytest.fixture
-def google_drive_connection_config():
-    """
-    Build a valid GoogleDriveConnectionConfig using the environment variables.
-    Expects:
-    - GOOGLE_DRIVE_ID
-    - GOOGLE_DRIVE_SERVICE_KEY
-    """
-    drive_id = os.getenv("GOOGLE_DRIVE_ID")
-    service_key = os.getenv("GOOGLE_DRIVE_SERVICE_KEY")
-    if not drive_id or not service_key:
-        pytest.skip("Google Drive credentials not provided in environment variables.")
-
-    access_config = GoogleDriveAccessConfig(service_account_key=service_key)
-    return GoogleDriveConnectionConfig(drive_id=drive_id, access_config=access_config)
-
-
-@pytest.fixture
-def google_drive_empty_folder(google_drive_connection_config):
-    """
-    Creates an empty folder on Google Drive for testing the "empty folder" case.
-    The folder is deleted after the test.
-    """
-    from google.oauth2 import service_account
-    from googleapiclient.discovery import build
-
-    access_config = google_drive_connection_config.access_config.get_secret_value()
-    creds = service_account.Credentials.from_service_account_info(access_config.service_account_key)
-    service = build("drive", "v3", credentials=creds)
-
-    # Create an empty folder.
-    file_metadata = {
-        "name": f"utic-empty-folder-{uuid.uuid4()}",
-        "mimeType": "application/vnd.google-apps.folder",
-    }
-    folder = service.files().create(body=file_metadata, fields="id, name").execute()
-    folder_id = folder.get("id")
-    try:
-        yield folder_id
-    finally:
-        service.files().delete(fileId=folder_id).execute()
-
-
-@requires_env("GOOGLE_DRIVE_SERVICE_KEY")
-@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE)
-def test_google_drive_source(temp_dir):
-    # Retrieve environment variables
-    service_account_key = os.environ["GOOGLE_DRIVE_SERVICE_KEY"]
-
-    # Create connection and indexer configurations
-    access_config = GoogleDriveAccessConfig(service_account_key=service_account_key)
-    connection_config = GoogleDriveConnectionConfig(
-        drive_id="1XidSOO76VpZ4m0i3gJN2m1X0Obol3UAi",
-        access_config=access_config,
-    )
-    index_config = GoogleDriveIndexerConfig(recursive=True)
-
-    download_config = GoogleDriveDownloaderConfig(download_dir=temp_dir)
-
-    # Instantiate indexer and downloader
-    indexer = GoogleDriveIndexer(
-        connection_config=connection_config,
-        index_config=index_config,
-    )
-    downloader = GoogleDriveDownloader(
-        connection_config=connection_config,
-        download_config=download_config,
-    )
-
-    # Run the source connector validation
-    source_connector_validation(
-        indexer=indexer,
-        downloader=downloader,
-        configs=SourceValidationConfigs(
-            test_id="google_drive_source",
-            expected_num_files=1,
-            validate_downloaded_files=True,
-        ),
-    )
-
-
-@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
-def source_connector_validation(
-    indexer: Indexer,
-    downloader: Downloader,
-    configs: SourceValidationConfigs,
-    overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
-) -> None:
-    # Run common validations on the process of running a source connector, supporting dynamic
-    # validators that get passed in along with comparisons on the saved expected values.
-    # If overwrite_fixtures is st to True, will ignore all validators but instead overwrite the
-    # expected values with what gets generated by this test.
-    all_predownload_file_data = []
-    all_postdownload_file_data = []
-    indexer.precheck()
-    download_dir = downloader.download_config.download_dir
-    test_output_dir = configs.test_output_dir()
-
-    for file_data in indexer.run():
-        assert file_data
-        predownload_file_data = file_data.model_copy(deep=True)
-        all_predownload_file_data.append(predownload_file_data)
-        resp = downloader.run(file_data=file_data)
-        if isinstance(resp, list):
-            for r in resp:
-                postdownload_file_data = r["file_data"].model_copy(deep=True)
-                all_postdownload_file_data.append(postdownload_file_data)
-        else:
-            postdownload_file_data = resp["file_data"].model_copy(deep=True)
-            all_postdownload_file_data.append(postdownload_file_data)
-
-    if not overwrite_fixtures:
-        print("Running validation")
-        run_all_validations(
-            configs=configs,
-            predownload_file_data=all_predownload_file_data,
-            postdownload_file_data=all_postdownload_file_data,
-            download_dir=download_dir,
-            test_output_dir=test_output_dir,
-        )
-    else:
-        print("Running fixtures update")
-        update_fixtures(
-            output_dir=test_output_dir,
-            download_dir=download_dir,
-            all_file_data=get_all_file_data(
-                all_predownload_file_data=all_predownload_file_data,
-                all_postdownload_file_data=all_postdownload_file_data,
-            ),
-            save_downloads=configs.validate_downloaded_files,
-            save_filedata=configs.validate_file_data,
-        )
-
-
-# Precheck fails when the drive ID has an appended parameter (simulate copy-paste error)
-@pytest.mark.tags("google-drive", "precheck")
-@requires_env("GOOGLE_DRIVE_ID", "GOOGLE_DRIVE_SERVICE_KEY")
-def test_google_drive_precheck_invalid_parameter(google_drive_connection_config):
-    # Append a query parameter as often happens when copying from a URL.
-    invalid_drive_id = google_drive_connection_config.drive_id + "?usp=sharing"
-    connection_config = GoogleDriveConnectionConfig(
-        drive_id=invalid_drive_id,
-        access_config=google_drive_connection_config.access_config,
-    )
-    index_config = GoogleDriveIndexerConfig(recursive=True)
-    indexer = GoogleDriveIndexer(connection_config=connection_config, index_config=index_config)
-    with pytest.raises(SourceConnectionError) as excinfo:
-        indexer.precheck()
-    assert "invalid" in str(excinfo.value).lower() or "not found" in str(excinfo.value).lower()
-
-
-# Precheck fails due to lack of permission (simulate via monkeypatching).
-@pytest.mark.tags("google-drive", "precheck")
-@requires_env("GOOGLE_DRIVE_ID", "GOOGLE_DRIVE_SERVICE_KEY")
-def test_google_drive_precheck_no_permission(google_drive_connection_config, monkeypatch):
-    index_config = GoogleDriveIndexerConfig(recursive=True)
-    indexer = GoogleDriveIndexer(
-        connection_config=google_drive_connection_config,
-        index_config=index_config,
-    )
-
-    # Monkeypatch get_root_info to always raise an HTTP 403 error.
-    def fake_get_root_info(files_client, object_id):
-        raise HttpError(
-            resp=type("Response", (), {"status": 403, "reason": "Forbidden"})(),
-            content=b"Forbidden",
-        )
-
-    monkeypatch.setattr(indexer, "get_root_info", fake_get_root_info)
-    with pytest.raises(SourceConnectionError) as excinfo:
-        indexer.precheck()
-    assert "forbidden" in str(excinfo.value).lower() or "permission" in str(excinfo.value).lower()
-
-
-# Precheck fails when the folder is empty.
-# @pytest.mark.tags("google-drive", "precheck")
-# @requires_env("GOOGLE_DRIVE_ID", "GOOGLE_DRIVE_SERVICE_KEY")
-# def test_google_drive_precheck_empty_folder(
-#     google_drive_connection_config, google_drive_empty_folder
-# ):
-#     # Use the empty folder's ID as the target.
-#     connection_config = GoogleDriveConnectionConfig(
-#         drive_id=google_drive_empty_folder,
-#         access_config=google_drive_connection_config.access_config,
-#     )
-
-#     index_config = GoogleDriveIndexerConfig(recursive=True)
-#     indexer = GoogleDriveIndexer(connection_config=connection_config, index_config=index_config)
-#     with pytest.raises(SourceConnectionError) as excinfo:
-#         indexer.precheck()
-#     assert "empty folder" in str(excinfo.value).lower()
-
-
-@pytest.mark.tags("google-drive", "count", "integration")
-@requires_env("GOOGLE_DRIVE_ID", "GOOGLE_DRIVE_SERVICE_KEY")
-def test_google_drive_count_files(google_drive_connection_config):
-    """
-    This test verifies that the count_files_recursively method returns the expected count of files.
-    According to the test credentials, there are 3 files in the root directory and 1 nested file,
-    so the total count should be 4.
-    """
-    # I assumed that we're applying the same extension filter as with other tests
-    # However there's 6 files in total in the test dir
-    extensions_filter = ["pdf", "docx"]
-    with google_drive_connection_config.get_client() as client:
-        count = GoogleDriveIndexer.count_files_recursively(
-            client, google_drive_connection_config.drive_id, extensions_filter
-        )
-    assert count == 4, f"Expected file count of 4, but got {count}"
-
-
-# Precheck fails with a completely invalid drive ID.
-@pytest.mark.tags("google-drive", "precheck")
-@requires_env("GOOGLE_DRIVE_ID", "GOOGLE_DRIVE_SERVICE_KEY")
-def test_google_drive_precheck_invalid_drive_id(google_drive_connection_config):
-    invalid_drive_id = "invalid_drive_id"
-    connection_config = GoogleDriveConnectionConfig(
-        drive_id=invalid_drive_id,
-        access_config=google_drive_connection_config.access_config,
-    )
-    index_config = GoogleDriveIndexerConfig(recursive=True)
-    indexer = GoogleDriveIndexer(connection_config=connection_config, index_config=index_config)
-    with pytest.raises(SourceConnectionError) as excinfo:
-        indexer.precheck()
-    assert "invalid" in str(excinfo.value).lower() or "not found" in str(excinfo.value).lower()
```
test/integration/connectors/test_jira.py (removed; +0 -67 in the listing above):

```diff
@@ -1,67 +0,0 @@
-import os
-
-import pytest
-
-from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
-from test.integration.connectors.utils.validation.source import (
-    SourceValidationConfigs,
-    source_connector_validation,
-)
-from test.integration.utils import requires_env
-from unstructured_ingest.processes.connectors.jira import (
-    CONNECTOR_TYPE,
-    JiraAccessConfig,
-    JiraConnectionConfig,
-    JiraDownloader,
-    JiraDownloaderConfig,
-    JiraIndexer,
-    JiraIndexerConfig,
-)
-
-
-@pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
-@requires_env("JIRA_INGEST_USER_EMAIL", "JIRA_INGEST_API_TOKEN")
-async def test_jira_source(temp_dir):
-    # Retrieve environment variables
-    jira_url = os.environ.get(
-        "JIRA_INGEST_URL", "https://unstructured-jira-connector-test.atlassian.net"
-    )
-    user_email = os.environ["JIRA_INGEST_USER_EMAIL"]
-    api_token = os.environ["JIRA_INGEST_API_TOKEN"]
-    projects = ["JCTP1"]
-    boards = ["3"]
-    issues = ["JCTP2-1", "JCTP2-2", "JCTP2-3"]
-
-    # Create connection and indexer configurations
-    access_config = JiraAccessConfig(password=api_token)
-    connection_config = JiraConnectionConfig(
-        url=jira_url,
-        username=user_email,
-        access_config=access_config,
-    )
-    index_config = JiraIndexerConfig(projects=projects, boards=boards, issues=issues)
-
-    download_config = JiraDownloaderConfig(download_dir=temp_dir)
-
-    # Instantiate indexer and downloader
-    indexer = JiraIndexer(
-        connection_config=connection_config,
-        index_config=index_config,
-    )
-    downloader = JiraDownloader(
-        connection_config=connection_config,
-        download_config=download_config,
-    )
-
-    # Run the source connector validation
-    await source_connector_validation(
-        indexer=indexer,
-        downloader=downloader,
-        configs=SourceValidationConfigs(
-            test_id="jira",
-            expected_num_files=8,
-            validate_file_data=True,
-            validate_downloaded_files=True,
-        ),
-    )
```
test/integration/connectors/test_lancedb.py (removed; +0 -247 in the listing above):

```diff
@@ -1,247 +0,0 @@
-import os
-from pathlib import Path
-from typing import Literal, Union
-from uuid import uuid4
-
-import lancedb
-import pandas as pd
-import pyarrow as pa
-import pytest
-import pytest_asyncio
-from lancedb import AsyncConnection
-from upath import UPath
-
-from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
-from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
-from unstructured_ingest.processes.connectors.lancedb.aws import (
-    LanceDBAwsAccessConfig,
-    LanceDBAwsConnectionConfig,
-    LanceDBAwsUploader,
-)
-from unstructured_ingest.processes.connectors.lancedb.azure import (
-    LanceDBAzureAccessConfig,
-    LanceDBAzureConnectionConfig,
-    LanceDBAzureUploader,
-)
-from unstructured_ingest.processes.connectors.lancedb.gcp import (
-    LanceDBGCSAccessConfig,
-    LanceDBGCSConnectionConfig,
-    LanceDBGSPUploader,
-)
-from unstructured_ingest.processes.connectors.lancedb.lancedb import (
-    CONNECTOR_TYPE,
-    LanceDBUploaderConfig,
-    LanceDBUploadStager,
-)
-from unstructured_ingest.processes.connectors.lancedb.local import (
-    LanceDBLocalAccessConfig,
-    LanceDBLocalConnectionConfig,
-    LanceDBLocalUploader,
-)
-from unstructured_ingest.utils.constants import RECORD_ID_LABEL
-
-DATABASE_NAME = "database"
-TABLE_NAME = "elements"
-DIMENSION = 384
-NUMBER_EXPECTED_ROWS = 22
-S3_BUCKET = "s3://utic-ingest-test-fixtures/"
-GS_BUCKET = "gs://utic-test-ingest-fixtures-output/"
-AZURE_BUCKET = "az://utic-ingest-test-fixtures-output/"
-REQUIRED_ENV_VARS = {
-    "s3": ("S3_INGEST_TEST_ACCESS_KEY", "S3_INGEST_TEST_SECRET_KEY"),
-    "gcs": ("GCP_INGEST_SERVICE_KEY",),
-    "az": ("AZURE_DEST_CONNECTION_STR",),
-    "local": (),
-}
-
-SCHEMA = pa.schema(
-    [
-        pa.field(RECORD_ID_LABEL, pa.string()),
-        pa.field("vector", pa.list_(pa.float16(), DIMENSION)),
-        pa.field("text", pa.string(), nullable=True),
-        pa.field("type", pa.string(), nullable=True),
-        pa.field("element_id", pa.string(), nullable=True),
-        pa.field("metadata-text_as_html", pa.string(), nullable=True),
-        pa.field("metadata-filetype", pa.string(), nullable=True),
-        pa.field("metadata-filename", pa.string(), nullable=True),
-        pa.field("metadata-languages", pa.list_(pa.string()), nullable=True),
-        pa.field("metadata-is_continuation", pa.bool_(), nullable=True),
-        pa.field("metadata-page_number", pa.int32(), nullable=True),
-    ]
-)
-NUMBER_EXPECTED_COLUMNS = len(SCHEMA.names)
-
-
-@pytest_asyncio.fixture
-async def connection_with_uri(request, tmp_path: Path):
-    target = request.param
-    uri = _get_uri(target, local_base_path=tmp_path)
-
-    unset_variables = [env for env in REQUIRED_ENV_VARS[target] if env not in os.environ]
-    if unset_variables:
-        pytest.skip(
-            reason="Following required environment variables were not set: "
-            + f"{', '.join(unset_variables)}"
-        )
-
-    storage_options = {
-        "aws_access_key_id": os.getenv("S3_INGEST_TEST_ACCESS_KEY"),
-        "aws_secret_access_key": os.getenv("S3_INGEST_TEST_SECRET_KEY"),
-        "google_service_account_key": os.getenv("GCP_INGEST_SERVICE_KEY"),
-    }
-    azure_connection_string = os.getenv("AZURE_DEST_CONNECTION_STR")
-    if azure_connection_string:
-        storage_options.update(_parse_azure_connection_string(azure_connection_string))
-
-    storage_options = {key: value for key, value in storage_options.items() if value is not None}
-    connection = await lancedb.connect_async(
-        uri=uri,
-        storage_options=storage_options,
-    )
-    await connection.create_table(name=TABLE_NAME, schema=SCHEMA)
-
-    yield connection, uri
-
-    await connection.drop_database()
-
-
-@pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
-@pytest.mark.parametrize("connection_with_uri", ["local", "s3", "gcs", "az"], indirect=True)
-async def test_lancedb_destination(
-    upload_file: Path,
-    connection_with_uri: tuple[AsyncConnection, str],
-    tmp_path: Path,
-) -> None:
-    connection, uri = connection_with_uri
-    file_data = FileData(
-        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
-        connector_type=CONNECTOR_TYPE,
-        identifier="mock-file-data",
-    )
-    stager = LanceDBUploadStager()
-    uploader = _get_uploader(uri)
-    staged_file_path = stager.run(
-        elements_filepath=upload_file,
-        file_data=file_data,
-        output_dir=tmp_path,
-        output_filename=upload_file.name,
-    )
-
-    await uploader.run_async(path=staged_file_path, file_data=file_data)
-
-    # Test upload to empty table
-    with await connection.open_table(TABLE_NAME) as table:
-        table_df: pd.DataFrame = await table.to_pandas()
-
-    assert len(table_df) == NUMBER_EXPECTED_ROWS
-    assert len(table_df.columns) == NUMBER_EXPECTED_COLUMNS
-
-    assert table_df[RECORD_ID_LABEL][0] == file_data.identifier
-    assert table_df["element_id"][0] == "2470d8dc42215b3d68413b55bf00fed2"
-    assert table_df["type"][0] == "CompositeElement"
-    assert table_df["metadata-filename"][0] == "DA-1p-with-duplicate-pages.pdf.json"
-    assert table_df["metadata-text_as_html"][0] is None
-
-    # Test upload of the second file, rows should be appended
-    file_data.identifier = "mock-file-data-2"
-    staged_second_file_path = stager.run(
-        elements_filepath=upload_file,
-        file_data=file_data,
-        output_dir=tmp_path,
-        output_filename=f"{upload_file.stem}-2{upload_file.suffix}",
-    )
-    await uploader.run_async(path=staged_second_file_path, file_data=file_data)
-    with await connection.open_table(TABLE_NAME) as table:
-        appended_table_df: pd.DataFrame = await table.to_pandas()
-    assert len(appended_table_df) == 2 * NUMBER_EXPECTED_ROWS
-
-    # Test re-upload of the first file, rows should be overwritten, not appended
-    await uploader.run_async(path=staged_file_path, file_data=file_data)
-    with await connection.open_table(TABLE_NAME) as table:
-        overwritten_table_df: pd.DataFrame = await table.to_pandas()
-    assert len(overwritten_table_df) == 2 * NUMBER_EXPECTED_ROWS
-
-
-class TestPrecheck:
-    @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
-    @pytest.mark.parametrize("connection_with_uri", ["local", "s3", "gcs", "az"], indirect=True)
-    def test_succeeds(
-        self,
-        upload_file: Path,
-        connection_with_uri: tuple[AsyncConnection, str],
-        tmp_path: Path,
-    ) -> None:
-        _, uri = connection_with_uri
-        uploader = _get_uploader(uri)
-        uploader.precheck()
-
-
-def _get_uri(target: Literal["local", "s3", "gcs", "az"], local_base_path: Path) -> str:
-    if target == "local":
-        return str(local_base_path / DATABASE_NAME)
-    if target == "s3":
-        base_uri = UPath(S3_BUCKET)
-    elif target == "gcs":
-        base_uri = UPath(GS_BUCKET)
-    elif target == "az":
-        base_uri = UPath(AZURE_BUCKET)
-
-    return str(base_uri / "destination" / "lancedb" / str(uuid4()) / DATABASE_NAME)
-
-
-def _get_uploader(
-    uri: str,
-) -> Union[LanceDBAzureUploader, LanceDBAzureUploader, LanceDBAwsUploader, LanceDBGSPUploader]:
-    target = uri.split("://", maxsplit=1)[0] if uri.startswith(("s3", "az", "gs")) else "local"
-    upload_config = LanceDBUploaderConfig(table_name=TABLE_NAME)
-    if target == "az":
-        azure_connection_string = os.getenv("AZURE_DEST_CONNECTION_STR")
-        access_config_kwargs = _parse_azure_connection_string(azure_connection_string)
-        return LanceDBAzureUploader(
-            upload_config=upload_config,
-            connection_config=LanceDBAzureConnectionConfig(
-                access_config=LanceDBAzureAccessConfig(**access_config_kwargs),
-                uri=uri,
-            ),
-        )
-
-    elif target == "s3":
-        return LanceDBAwsUploader(
-            upload_config=upload_config,
-            connection_config=LanceDBAwsConnectionConfig(
-                access_config=LanceDBAwsAccessConfig(
-                    aws_access_key_id=os.getenv("S3_INGEST_TEST_ACCESS_KEY"),
-                    aws_secret_access_key=os.getenv("S3_INGEST_TEST_SECRET_KEY"),
-                ),
-                uri=uri,
-            ),
-        )
-    elif target == "gs":
-        return LanceDBGSPUploader(
-            upload_config=upload_config,
-            connection_config=LanceDBGCSConnectionConfig(
-                access_config=LanceDBGCSAccessConfig(
-                    google_service_account_key=os.getenv("GCP_INGEST_SERVICE_KEY")
-                ),
-                uri=uri,
-            ),
-        )
-    else:
-        return LanceDBLocalUploader(
-            upload_config=upload_config,
-            connection_config=LanceDBLocalConnectionConfig(
-                access_config=LanceDBLocalAccessConfig(),
-                uri=uri,
-            ),
-        )
-
-
-def _parse_azure_connection_string(
-    connection_str: str,
-) -> dict[Literal["azure_storage_account_name", "azure_storage_account_key"], str]:
-    parameters = dict(keyvalue.split("=", maxsplit=1) for keyvalue in connection_str.split(";"))
-    return {
-        "azure_storage_account_name": parameters.get("AccountName"),
-        "azure_storage_account_key": parameters.get("AccountKey"),
-    }
```