unstructured-ingest 0.7.1-py3-none-any.whl → 1.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
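A diff like this can be reproduced locally from the two published wheels. The sketch below is illustrative, not the registry's own tooling: it assumes both wheel files have already been downloaded (e.g. with pip download unstructured-ingest==0.7.1 --no-deps), and the filenames are placeholders.

import difflib
import zipfile

# Hypothetical local copies of the two published wheels.
OLD_WHEEL = "unstructured_ingest-0.7.1-py3-none-any.whl"
NEW_WHEEL = "unstructured_ingest-1.0.1-py3-none-any.whl"

def wheel_files(path: str) -> dict[str, bytes]:
    """Map each archive member name to its raw contents (wheels are zip files)."""
    with zipfile.ZipFile(path) as zf:
        return {name: zf.read(name) for name in zf.namelist() if not name.endswith("/")}

old, new = wheel_files(OLD_WHEEL), wheel_files(NEW_WHEEL)

# Files present in only one version (e.g. the examples/ and test/ trees dropped in 1.0.1).
for name in sorted(old.keys() - new.keys()):
    print(f"removed: {name}")
for name in sorted(new.keys() - old.keys()):
    print(f"added:   {name}")

# Unified diff for files present in both versions whose contents changed.
for name in sorted(old.keys() & new.keys()):
    if old[name] != new[name]:
        diff = difflib.unified_diff(
            old[name].decode("utf-8", errors="replace").splitlines(),
            new[name].decode("utf-8", errors="replace").splitlines(),
            fromfile=f"0.7.1/{name}",
            tofile=f"1.0.1/{name}",
            lineterm="",
        )
        print("\n".join(diff))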
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/embed/mixedbreadai.py +0 -1
- unstructured_ingest/interfaces/upload_stager.py +2 -2
- unstructured_ingest/interfaces/uploader.py +3 -3
- unstructured_ingest/logger.py +2 -93
- unstructured_ingest/main.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/processes/chunker.py +4 -0
- unstructured_ingest/processes/connectors/airtable.py +4 -2
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +10 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +2 -2
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/confluence.py +0 -1
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
- unstructured_ingest/processes/connectors/delta_table.py +1 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
- unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -2
- unstructured_ingest/processes/connectors/google_drive.py +0 -2
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
- unstructured_ingest/processes/connectors/kdbai.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +1 -2
- unstructured_ingest/processes/connectors/pinecone.py +0 -1
- unstructured_ingest/processes/connectors/redisdb.py +28 -24
- unstructured_ingest/processes/connectors/salesforce.py +1 -1
- unstructured_ingest/processes/connectors/slack.py +1 -2
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
- unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
- unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
- unstructured_ingest/processes/connectors/sql/sql.py +3 -4
- unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
- unstructured_ingest/processes/connectors/vectara.py +0 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
- unstructured_ingest/processes/embedder.py +2 -2
- unstructured_ingest/processes/filter.py +1 -1
- unstructured_ingest/processes/partitioner.py +4 -0
- unstructured_ingest/processes/utils/blob_storage.py +2 -2
- unstructured_ingest/unstructured_api.py +13 -8
- unstructured_ingest/utils/data_prep.py +8 -32
- unstructured_ingest/utils/string_and_date_utils.py +3 -3
- unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +54 -187
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
- examples/__init__.py +0 -0
- examples/airtable.py +0 -44
- examples/azure_cognitive_search.py +0 -55
- examples/chroma.py +0 -54
- examples/couchbase.py +0 -55
- examples/databricks_volumes_dest.py +0 -55
- examples/databricks_volumes_source.py +0 -53
- examples/delta_table.py +0 -45
- examples/discord_example.py +0 -36
- examples/elasticsearch.py +0 -49
- examples/google_drive.py +0 -45
- examples/kdbai.py +0 -54
- examples/local.py +0 -36
- examples/milvus.py +0 -44
- examples/mongodb.py +0 -53
- examples/opensearch.py +0 -50
- examples/pinecone.py +0 -57
- examples/s3.py +0 -38
- examples/salesforce.py +0 -44
- examples/sharepoint.py +0 -47
- examples/singlestore.py +0 -49
- examples/sql.py +0 -90
- examples/vectara.py +0 -54
- examples/weaviate.py +0 -44
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +0 -31
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +0 -38
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +0 -273
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +0 -90
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +0 -14
- test/integration/connectors/duckdb/test_duckdb.py +0 -90
- test/integration/connectors/duckdb/test_motherduck.py +0 -95
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +0 -34
- test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
- test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
- test/integration/connectors/sql/test_postgres.py +0 -201
- test/integration/connectors/sql/test_singlestore.py +0 -182
- test/integration/connectors/sql/test_snowflake.py +0 -244
- test/integration/connectors/sql/test_sqlite.py +0 -168
- test/integration/connectors/sql/test_vastdb.py +0 -34
- test/integration/connectors/test_astradb.py +0 -287
- test/integration/connectors/test_azure_ai_search.py +0 -254
- test/integration/connectors/test_chroma.py +0 -136
- test/integration/connectors/test_confluence.py +0 -111
- test/integration/connectors/test_delta_table.py +0 -183
- test/integration/connectors/test_dropbox.py +0 -151
- test/integration/connectors/test_github.py +0 -49
- test/integration/connectors/test_google_drive.py +0 -257
- test/integration/connectors/test_jira.py +0 -67
- test/integration/connectors/test_lancedb.py +0 -247
- test/integration/connectors/test_milvus.py +0 -208
- test/integration/connectors/test_mongodb.py +0 -335
- test/integration/connectors/test_neo4j.py +0 -244
- test/integration/connectors/test_notion.py +0 -152
- test/integration/connectors/test_onedrive.py +0 -163
- test/integration/connectors/test_pinecone.py +0 -387
- test/integration/connectors/test_qdrant.py +0 -216
- test/integration/connectors/test_redis.py +0 -143
- test/integration/connectors/test_s3.py +0 -184
- test/integration/connectors/test_sharepoint.py +0 -222
- test/integration/connectors/test_vectara.py +0 -282
- test/integration/connectors/test_zendesk.py +0 -120
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +0 -13
- test/integration/connectors/utils/docker.py +0 -151
- test/integration/connectors/utils/docker_compose.py +0 -59
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +0 -77
- test/integration/connectors/utils/validation/equality.py +0 -76
- test/integration/connectors/utils/validation/source.py +0 -331
- test/integration/connectors/utils/validation/utils.py +0 -36
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +0 -15
- test/integration/connectors/weaviate/test_cloud.py +0 -39
- test/integration/connectors/weaviate/test_local.py +0 -152
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +0 -13
- test/integration/embedders/test_azure_openai.py +0 -57
- test/integration/embedders/test_bedrock.py +0 -103
- test/integration/embedders/test_huggingface.py +0 -24
- test/integration/embedders/test_mixedbread.py +0 -71
- test/integration/embedders/test_octoai.py +0 -75
- test/integration/embedders/test_openai.py +0 -74
- test/integration/embedders/test_togetherai.py +0 -71
- test/integration/embedders/test_vertexai.py +0 -63
- test/integration/embedders/test_voyageai.py +0 -79
- test/integration/embedders/utils.py +0 -66
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +0 -76
- test/integration/utils.py +0 -15
- test/unit/__init__.py +0 -0
- test/unit/chunkers/__init__.py +0 -0
- test/unit/chunkers/test_chunkers.py +0 -49
- test/unit/connectors/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
- test/unit/connectors/motherduck/__init__.py +0 -0
- test/unit/connectors/motherduck/test_base.py +0 -73
- test/unit/connectors/sql/__init__.py +0 -0
- test/unit/connectors/sql/test_sql.py +0 -152
- test/unit/connectors/test_confluence.py +0 -71
- test/unit/connectors/test_jira.py +0 -401
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +0 -42
- test/unit/embed/test_octoai.py +0 -27
- test/unit/embed/test_openai.py +0 -28
- test/unit/embed/test_vertexai.py +0 -25
- test/unit/embed/test_voyageai.py +0 -24
- test/unit/embedders/__init__.py +0 -0
- test/unit/embedders/test_bedrock.py +0 -36
- test/unit/embedders/test_huggingface.py +0 -48
- test/unit/embedders/test_mixedbread.py +0 -37
- test/unit/embedders/test_octoai.py +0 -35
- test/unit/embedders/test_openai.py +0 -35
- test/unit/embedders/test_togetherai.py +0 -37
- test/unit/embedders/test_vertexai.py +0 -37
- test/unit/embedders/test_voyageai.py +0 -38
- test/unit/partitioners/__init__.py +0 -0
- test/unit/partitioners/test_partitioner.py +0 -63
- test/unit/test_error.py +0 -27
- test/unit/test_html.py +0 -112
- test/unit/test_interfaces.py +0 -26
- test/unit/test_logger.py +0 -78
- test/unit/test_utils.py +0 -220
- test/unit/utils/__init__.py +0 -0
- test/unit/utils/data_generator.py +0 -32
- unstructured_ingest-0.7.1.dist-info/METADATA +0 -383
- unstructured_ingest-0.7.1.dist-info/top_level.txt +0 -3
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
test/integration/connectors/databricks/test_volumes_native.py
@@ -1,273 +0,0 @@
-import json
-import os
-import uuid
-from contextlib import contextmanager
-from dataclasses import dataclass
-from pathlib import Path
-from unittest import mock
-
-import pytest
-from databricks.sdk import WorkspaceClient
-from databricks.sdk.errors.platform import NotFound
-
-from test.integration.connectors.utils.constants import (
-    BLOB_STORAGE_TAG,
-    DESTINATION_TAG,
-    SOURCE_TAG,
-)
-from test.integration.connectors.utils.validation.source import (
-    SourceValidationConfigs,
-    source_connector_validation,
-)
-from test.integration.utils import requires_env
-from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
-from unstructured_ingest.errors_v2 import UserAuthError, UserError
-from unstructured_ingest.processes.connectors.databricks.volumes_native import (
-    CONNECTOR_TYPE,
-    DatabricksNativeVolumesAccessConfig,
-    DatabricksNativeVolumesConnectionConfig,
-    DatabricksNativeVolumesDownloader,
-    DatabricksNativeVolumesDownloaderConfig,
-    DatabricksNativeVolumesIndexer,
-    DatabricksNativeVolumesIndexerConfig,
-    DatabricksNativeVolumesUploader,
-    DatabricksNativeVolumesUploaderConfig,
-)
-
-
-@dataclass
-class BaseEnvData:
-    host: str
-    catalog: str
-
-
-@dataclass
-class BasicAuthEnvData(BaseEnvData):
-    client_id: str
-    client_secret: str
-
-    def get_connection_config(self) -> DatabricksNativeVolumesConnectionConfig:
-        return DatabricksNativeVolumesConnectionConfig(
-            host=self.host,
-            access_config=DatabricksNativeVolumesAccessConfig(
-                client_id=self.client_id,
-                client_secret=self.client_secret,
-            ),
-        )
-
-
-@dataclass
-class PATEnvData(BaseEnvData):
-    token: str
-
-    def get_connection_config(self) -> DatabricksNativeVolumesConnectionConfig:
-        return DatabricksNativeVolumesConnectionConfig(
-            host=self.host,
-            access_config=DatabricksNativeVolumesAccessConfig(
-                token=self.token,
-            ),
-        )
-
-
-def get_basic_auth_env_data() -> BasicAuthEnvData:
-    return BasicAuthEnvData(
-        host=os.environ["DATABRICKS_HOST"],
-        client_id=os.environ["DATABRICKS_CLIENT_ID"],
-        client_secret=os.environ["DATABRICKS_CLIENT_SECRET"],
-        catalog=os.environ["DATABRICKS_CATALOG"],
-    )
-
-
-def get_pat_env_data() -> PATEnvData:
-    return PATEnvData(
-        host=os.environ["DATABRICKS_HOST"],
-        catalog=os.environ["DATABRICKS_CATALOG"],
-        token=os.environ["DATABRICKS_PAT"],
-    )
-
-
-@pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
-@requires_env(
-    "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
-)
-async def test_volumes_native_source(tmp_path: Path):
-    env_data = get_basic_auth_env_data()
-    with mock.patch.dict(os.environ, clear=True):
-        indexer_config = DatabricksNativeVolumesIndexerConfig(
-            recursive=True,
-            volume="test-platform",
-            volume_path="databricks-volumes-test-input",
-            catalog=env_data.catalog,
-        )
-        connection_config = env_data.get_connection_config()
-        download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tmp_path)
-        indexer = DatabricksNativeVolumesIndexer(
-            connection_config=connection_config, index_config=indexer_config
-        )
-        downloader = DatabricksNativeVolumesDownloader(
-            connection_config=connection_config, download_config=download_config
-        )
-        await source_connector_validation(
-            indexer=indexer,
-            downloader=downloader,
-            configs=SourceValidationConfigs(
-                test_id="databricks_volumes_native",
-                expected_num_files=1,
-            ),
-        )
-
-
-@pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
-@requires_env("DATABRICKS_HOST", "DATABRICKS_PAT", "DATABRICKS_CATALOG")
-async def test_volumes_native_source_pat(tmp_path: Path):
-    env_data = get_pat_env_data()
-    with mock.patch.dict(os.environ, clear=True):
-        indexer_config = DatabricksNativeVolumesIndexerConfig(
-            recursive=True,
-            volume="test-platform",
-            volume_path="databricks-volumes-test-input",
-            catalog=env_data.catalog,
-        )
-        connection_config = env_data.get_connection_config()
-        download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tmp_path)
-        indexer = DatabricksNativeVolumesIndexer(
-            connection_config=connection_config, index_config=indexer_config
-        )
-        downloader = DatabricksNativeVolumesDownloader(
-            connection_config=connection_config, download_config=download_config
-        )
-        await source_connector_validation(
-            indexer=indexer,
-            downloader=downloader,
-            configs=SourceValidationConfigs(
-                test_id="databricks_volumes_native_pat",
-                expected_num_files=1,
-            ),
-        )
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
-@requires_env("DATABRICKS_HOST", "DATABRICKS_PAT", "DATABRICKS_CATALOG")
-def test_volumes_native_source_pat_invalid_catalog():
-    env_data = get_pat_env_data()
-    with mock.patch.dict(os.environ, clear=True):
-        indexer_config = DatabricksNativeVolumesIndexerConfig(
-            recursive=True,
-            volume="test-platform",
-            volume_path="databricks-volumes-test-input",
-            catalog="fake_catalog",
-        )
-        indexer = DatabricksNativeVolumesIndexer(
-            connection_config=env_data.get_connection_config(), index_config=indexer_config
-        )
-        with pytest.raises(UserError):
-            _ = list(indexer.run())
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
-@requires_env("DATABRICKS_HOST")
-def test_volumes_native_source_pat_invalid_pat():
-    host = os.environ["DATABRICKS_HOST"]
-    with mock.patch.dict(os.environ, clear=True):
-        indexer_config = DatabricksNativeVolumesIndexerConfig(
-            recursive=True,
-            volume="test-platform",
-            volume_path="databricks-volumes-test-input",
-            catalog="fake_catalog",
-        )
-        connection_config = DatabricksNativeVolumesConnectionConfig(
-            host=host,
-            access_config=DatabricksNativeVolumesAccessConfig(
-                token="invalid-token",
-            ),
-        )
-        indexer = DatabricksNativeVolumesIndexer(
-            connection_config=connection_config, index_config=indexer_config
-        )
-        with pytest.raises(UserAuthError):
-            _ = list(indexer.run())
-
-
-def _get_volume_path(catalog: str, volume: str, volume_path: str):
-    return f"/Volumes/{catalog}/default/{volume}/{volume_path}"
-
-
-@contextmanager
-def databricks_destination_context(
-    env_data: BasicAuthEnvData, volume: str, volume_path
-) -> WorkspaceClient:
-    client = WorkspaceClient(
-        host=env_data.host, client_id=env_data.client_id, client_secret=env_data.client_secret
-    )
-    try:
-        yield client
-    finally:
-        # Cleanup
-        try:
-            for file in client.files.list_directory_contents(
-                directory_path=_get_volume_path(env_data.catalog, volume, volume_path)
-            ):
-                client.files.delete(file.path)
-            client.files.delete_directory(_get_volume_path(env_data.catalog, volume, volume_path))
-        except NotFound:
-            # Directory was never created, don't need to delete
-            pass
-
-
-def validate_upload(client: WorkspaceClient, catalog: str, volume: str, volume_path: str):
-    files = list(
-        client.files.list_directory_contents(
-            directory_path=_get_volume_path(catalog, volume, volume_path)
-        )
-    )
-
-    assert len(files) == 1
-
-    resp = client.files.download(files[0].path)
-    data = json.loads(resp.contents.read())
-
-    assert len(data) == 22
-    element_types = {v["type"] for v in data}
-    assert len(element_types) == 1
-    assert "CompositeElement" in element_types
-
-
-@pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, BLOB_STORAGE_TAG)
-@requires_env(
-    "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
-)
-async def test_volumes_native_destination(upload_file: Path):
-    env_data = get_basic_auth_env_data()
-    volume_path = f"databricks-volumes-test-output-{uuid.uuid4()}"
-    file_data = FileData(
-        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
-        connector_type=CONNECTOR_TYPE,
-        identifier="mock file data",
-    )
-    with databricks_destination_context(
-        volume="test-platform", volume_path=volume_path, env_data=env_data
-    ) as workspace_client:
-        connection_config = env_data.get_connection_config()
-        uploader = DatabricksNativeVolumesUploader(
-            connection_config=connection_config,
-            upload_config=DatabricksNativeVolumesUploaderConfig(
-                volume="test-platform",
-                volume_path=volume_path,
-                catalog=env_data.catalog,
-            ),
-        )
-        uploader.precheck()
-        if uploader.is_async():
-            await uploader.run_async(path=upload_file, file_data=file_data)
-        else:
-            uploader.run(path=upload_file, file_data=file_data)
-
-        validate_upload(
-            client=workspace_client,
-            catalog=env_data.catalog,
-            volume="test-platform",
-            volume_path=volume_path,
-        )
test/integration/connectors/discord/__init__.py
File without changes
test/integration/connectors/discord/test_discord.py
@@ -1,90 +0,0 @@
-import os
-import tempfile
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Optional
-
-import pytest
-
-from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
-from test.integration.connectors.utils.validation.source import (
-    SourceValidationConfigs,
-    source_connector_validation,
-)
-from test.integration.utils import requires_env
-from unstructured_ingest.error import SourceConnectionError
-from unstructured_ingest.processes.connectors.discord import (
-    CONNECTOR_TYPE,
-    DiscordAccessConfig,
-    DiscordConnectionConfig,
-    DiscordDownloader,
-    DiscordDownloaderConfig,
-    DiscordIndexer,
-    DiscordIndexerConfig,
-)
-
-
-@dataclass(frozen=True)
-class EnvData:
-    token: Optional[str]
-    channels: Optional[list[str]]
-
-
-def get_env_data() -> EnvData:
-    return EnvData(
-        token=os.getenv("DISCORD_TOKEN"),
-        channels=os.getenv("DISCORD_CHANNELS", default=[]).split(","),
-    )
-
-
-@pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
-@requires_env("DISCORD_TOKEN", "DISCORD_CHANNELS")
-async def test_discord_source():
-    env = get_env_data()
-    indexer_config = DiscordIndexerConfig(channels=env.channels)
-    with tempfile.TemporaryDirectory() as tempdir:
-        tempdir_path = Path(tempdir)
-        connection_config = DiscordConnectionConfig(
-            access_config=DiscordAccessConfig(token=env.token)
-        )
-        download_config = DiscordDownloaderConfig(download_dir=tempdir_path)
-        indexer = DiscordIndexer(connection_config=connection_config, index_config=indexer_config)
-        downloader = DiscordDownloader(
-            connection_config=connection_config, download_config=download_config
-        )
-        expected_num_files = len(env.channels)
-        await source_connector_validation(
-            indexer=indexer,
-            downloader=downloader,
-            configs=SourceValidationConfigs(
-                test_id=CONNECTOR_TYPE,
-                expected_num_files=expected_num_files,
-                expected_number_indexed_file_data=expected_num_files,
-                validate_downloaded_files=True,
-            ),
-        )
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
-@requires_env("DISCORD_CHANNELS")
-def test_discord_source_precheck_fail_no_token():
-    indexer_config = DiscordIndexerConfig(channels=get_env_data().channels)
-
-    connection_config = DiscordConnectionConfig(access_config=DiscordAccessConfig(token=""))
-    indexer = DiscordIndexer(connection_config=connection_config, index_config=indexer_config)
-    with pytest.raises(SourceConnectionError):
-        indexer.precheck()
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
-@requires_env("DISCORD_TOKEN")
-def test_discord_source_precheck_fail_no_channels():
-    indexer_config = DiscordIndexerConfig(channels=[])
-
-    connection_config = DiscordConnectionConfig(
-        access_config=DiscordAccessConfig(token=get_env_data().token)
-    )
-    indexer = DiscordIndexer(connection_config=connection_config, index_config=indexer_config)
-    with pytest.raises(SourceConnectionError):
-        indexer.precheck()
test/integration/connectors/duckdb/__init__.py
File without changes
test/integration/connectors/duckdb/conftest.py
@@ -1,14 +0,0 @@
-from pathlib import Path
-
-import pytest
-
-int_test_dir = Path(__file__).parent
-assets_dir = int_test_dir / "assets"
-
-
-@pytest.fixture
-def duckdb_schema() -> Path:
-    schema_file = assets_dir / "duckdb-schema.sql"
-    assert schema_file.exists()
-    assert schema_file.is_file()
-    return schema_file
test/integration/connectors/duckdb/test_duckdb.py
@@ -1,90 +0,0 @@
-import json
-from pathlib import Path
-
-import duckdb
-import pytest
-from _pytest.fixtures import TopRequest
-
-from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
-from test.integration.connectors.utils.validation.destination import (
-    StagerValidationConfigs,
-    stager_validation,
-)
-from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
-from unstructured_ingest.processes.connectors.duckdb.duckdb import (
-    CONNECTOR_TYPE,
-    DuckDBConnectionConfig,
-    DuckDBUploader,
-    DuckDBUploaderConfig,
-    DuckDBUploadStager,
-)
-
-
-@pytest.fixture
-def provisioned_db_file(duckdb_schema: Path, temp_dir: Path) -> Path:
-    db_path = Path(temp_dir) / "temp_duck.db"
-    with duckdb.connect(database=db_path) as duckdb_connection:
-        with duckdb_schema.open("r") as f:
-            query = f.read()
-        duckdb_connection.execute(query)
-        duckdb_connection.close()
-    return db_path
-
-
-def validate_duckdb_destination(db_path: Path, expected_num_elements: int):
-    conn = None
-    try:
-        conn = duckdb.connect(db_path)
-        _results = conn.sql("select count(*) from elements").fetchall()
-        _count = _results[0][0]
-        assert (
-            _count == expected_num_elements
-        ), f"dest check failed: got {_count}, expected {expected_num_elements}"
-        conn.close()
-    finally:
-        if conn:
-            conn.close()
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "duckdb", SQL_TAG)
-def test_duckdb_destination(upload_file: Path, provisioned_db_file: Path, temp_dir: Path):
-    file_data = FileData(
-        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
-        connector_type=CONNECTOR_TYPE,
-        identifier="mock-file-data",
-    )
-
-    stager = DuckDBUploadStager()
-    staged_path = stager.run(
-        elements_filepath=upload_file,
-        file_data=file_data,
-        output_dir=temp_dir,
-        output_filename=upload_file.name,
-    )
-
-    connection_config = DuckDBConnectionConfig(database=str(provisioned_db_file))
-    upload_config = DuckDBUploaderConfig()
-    uploader = DuckDBUploader(connection_config=connection_config, upload_config=upload_config)
-
-    uploader.run(path=staged_path, file_data=file_data)
-
-    with staged_path.open() as f:
-        data = json.load(f)
-    validate_duckdb_destination(db_path=provisioned_db_file, expected_num_elements=len(data))
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "duckdb", SQL_TAG)
-@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
-def test_duckdb_stager(
-    request: TopRequest,
-    upload_file_str: str,
-    tmp_path: Path,
-):
-    upload_file: Path = request.getfixturevalue(upload_file_str)
-    stager = DuckDBUploadStager()
-    stager_validation(
-        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
-        input_file=upload_file,
-        stager=stager,
-        tmp_dir=tmp_path,
-    )
test/integration/connectors/duckdb/test_motherduck.py
@@ -1,95 +0,0 @@
-import os
-import uuid
-from pathlib import Path
-from typing import Generator
-
-import duckdb
-import pandas as pd
-import pytest
-
-from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
-from test.integration.utils import requires_env
-from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
-from unstructured_ingest.processes.connectors.duckdb.motherduck import (
-    CONNECTOR_TYPE,
-    MotherDuckAccessConfig,
-    MotherDuckConnectionConfig,
-    MotherDuckUploader,
-    MotherDuckUploaderConfig,
-    MotherDuckUploadStager,
-)
-
-
-@pytest.fixture
-def md_token() -> str:
-    motherduck_token = os.getenv("MOTHERDUCK_TOKEN", None)
-    assert motherduck_token
-    return motherduck_token
-
-
-@pytest.fixture
-def provisioned_db(md_token: str, duckdb_schema: Path) -> Generator[str, None, None]:
-    database_name = f"test_{str(uuid.uuid4()).replace('-', '_')}"
-    try:
-        with duckdb.connect(f"md:?motherduck_token={md_token}") as md_conn:
-            with duckdb_schema.open("r") as f:
-                query = f.read()
-            md_conn.execute(f"CREATE DATABASE {database_name}")
-            md_conn.execute(f"USE {database_name}")
-            md_conn.execute(query)
-            md_conn.close()
-        yield database_name
-    finally:
-        with duckdb.connect(f"md:?motherduck_token={md_token}") as md_conn:
-            md_conn.execute(f"DROP DATABASE {database_name}")
-            md_conn.close()
-
-
-def validate_motherduck_destination(database: str, expected_num_elements: int, md_token: str):
-    conn = None
-    try:
-        conn = duckdb.connect(f"md:?motherduck_token={md_token}")
-        conn.execute(f"USE {database}")
-        _results = conn.sql("select count(*) from elements").fetchall()
-        _count = _results[0][0]
-        assert (
-            _count == expected_num_elements
-        ), f"dest check failed: got {_count}, expected {expected_num_elements}"
-        conn.close()
-    finally:
-        if conn:
-            conn.close()
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
-@requires_env("MOTHERDUCK_TOKEN")
-def test_motherduck_destination(
-    md_token: str, upload_file: Path, provisioned_db: str, temp_dir: Path
-):
-    file_data = FileData(
-        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
-        connector_type=CONNECTOR_TYPE,
-        identifier="mock-file-data",
-    )
-
-    stager = MotherDuckUploadStager()
-    staged_path = stager.run(
-        elements_filepath=upload_file,
-        file_data=file_data,
-        output_dir=temp_dir,
-        output_filename=upload_file.name,
-    )
-
-    access_config = MotherDuckAccessConfig(md_token=md_token)
-    connection_config = MotherDuckConnectionConfig(
-        database=provisioned_db, access_config=access_config
-    )
-    upload_config = MotherDuckUploaderConfig()
-    uploader = MotherDuckUploader(connection_config=connection_config, upload_config=upload_config)
-
-    uploader.run(path=staged_path, file_data=file_data)
-
-    staged_df = pd.read_json(staged_path, orient="records", lines=True)
-    validate_motherduck_destination(
-        database=provisioned_db, expected_num_elements=len(staged_df), md_token=md_token
-    )
test/integration/connectors/elasticsearch/__init__.py
File without changes
test/integration/connectors/elasticsearch/conftest.py
@@ -1,34 +0,0 @@
-import json
-from pathlib import Path
-
-import pandas as pd
-import pytest
-
-int_test_dir = Path(__file__).parent
-assets_dir = int_test_dir / "assets"
-
-
-@pytest.fixture
-def movies_dataframe() -> pd.DataFrame:
-    movies_file = assets_dir / "wiki_movie_plots_small.csv"
-    assert movies_file.exists()
-    assert movies_file.is_file()
-    return pd.read_csv(movies_file).dropna().reset_index()
-
-
-@pytest.fixture
-def opensearch_elements_mapping() -> dict:
-    elements_mapping_file = assets_dir / "opensearch_elements_mappings.json"
-    assert elements_mapping_file.exists()
-    assert elements_mapping_file.is_file()
-    with elements_mapping_file.open() as fp:
-        return json.load(fp)
-
-
-@pytest.fixture
-def elasticsearch_elements_mapping() -> dict:
-    elements_mapping_file = assets_dir / "elasticsearch_elements_mappings.json"
-    assert elements_mapping_file.exists()
-    assert elements_mapping_file.is_file()
-    with elements_mapping_file.open() as fp:
-        return json.load(fp)