unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/embed/mixedbreadai.py +0 -1
- unstructured_ingest/interfaces/upload_stager.py +2 -2
- unstructured_ingest/interfaces/uploader.py +3 -3
- unstructured_ingest/main.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/processes/chunker.py +4 -0
- unstructured_ingest/processes/connectors/airtable.py +4 -2
- unstructured_ingest/processes/connectors/astradb.py +48 -34
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/confluence.py +0 -1
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
- unstructured_ingest/processes/connectors/delta_table.py +1 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
- unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -2
- unstructured_ingest/processes/connectors/google_drive.py +0 -2
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
- unstructured_ingest/processes/connectors/kdbai.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +1 -2
- unstructured_ingest/processes/connectors/pinecone.py +0 -1
- unstructured_ingest/processes/connectors/redisdb.py +28 -24
- unstructured_ingest/processes/connectors/salesforce.py +1 -1
- unstructured_ingest/processes/connectors/slack.py +1 -2
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
- unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
- unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
- unstructured_ingest/processes/connectors/sql/sql.py +3 -4
- unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
- unstructured_ingest/processes/connectors/vectara.py +0 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
- unstructured_ingest/processes/embedder.py +2 -2
- unstructured_ingest/processes/filter.py +1 -1
- unstructured_ingest/processes/partitioner.py +4 -0
- unstructured_ingest/processes/utils/blob_storage.py +2 -2
- unstructured_ingest/unstructured_api.py +13 -8
- unstructured_ingest/utils/data_prep.py +8 -32
- unstructured_ingest-1.0.2.dist-info/METADATA +226 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/RECORD +50 -184
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/WHEEL +1 -2
- examples/__init__.py +0 -0
- examples/airtable.py +0 -44
- examples/azure_cognitive_search.py +0 -55
- examples/chroma.py +0 -54
- examples/couchbase.py +0 -55
- examples/databricks_volumes_dest.py +0 -55
- examples/databricks_volumes_source.py +0 -53
- examples/delta_table.py +0 -45
- examples/discord_example.py +0 -36
- examples/elasticsearch.py +0 -49
- examples/google_drive.py +0 -45
- examples/kdbai.py +0 -54
- examples/local.py +0 -36
- examples/milvus.py +0 -44
- examples/mongodb.py +0 -53
- examples/opensearch.py +0 -50
- examples/pinecone.py +0 -57
- examples/s3.py +0 -38
- examples/salesforce.py +0 -44
- examples/sharepoint.py +0 -47
- examples/singlestore.py +0 -49
- examples/sql.py +0 -90
- examples/vectara.py +0 -54
- examples/weaviate.py +0 -44
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +0 -31
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +0 -38
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +0 -273
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +0 -90
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +0 -14
- test/integration/connectors/duckdb/test_duckdb.py +0 -90
- test/integration/connectors/duckdb/test_motherduck.py +0 -95
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +0 -34
- test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
- test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
- test/integration/connectors/sql/test_postgres.py +0 -201
- test/integration/connectors/sql/test_singlestore.py +0 -182
- test/integration/connectors/sql/test_snowflake.py +0 -244
- test/integration/connectors/sql/test_sqlite.py +0 -168
- test/integration/connectors/sql/test_vastdb.py +0 -34
- test/integration/connectors/test_astradb.py +0 -287
- test/integration/connectors/test_azure_ai_search.py +0 -254
- test/integration/connectors/test_chroma.py +0 -136
- test/integration/connectors/test_confluence.py +0 -111
- test/integration/connectors/test_delta_table.py +0 -183
- test/integration/connectors/test_dropbox.py +0 -151
- test/integration/connectors/test_github.py +0 -49
- test/integration/connectors/test_google_drive.py +0 -257
- test/integration/connectors/test_jira.py +0 -67
- test/integration/connectors/test_lancedb.py +0 -247
- test/integration/connectors/test_milvus.py +0 -208
- test/integration/connectors/test_mongodb.py +0 -335
- test/integration/connectors/test_neo4j.py +0 -244
- test/integration/connectors/test_notion.py +0 -152
- test/integration/connectors/test_onedrive.py +0 -163
- test/integration/connectors/test_pinecone.py +0 -387
- test/integration/connectors/test_qdrant.py +0 -216
- test/integration/connectors/test_redis.py +0 -143
- test/integration/connectors/test_s3.py +0 -184
- test/integration/connectors/test_sharepoint.py +0 -222
- test/integration/connectors/test_vectara.py +0 -282
- test/integration/connectors/test_zendesk.py +0 -120
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +0 -13
- test/integration/connectors/utils/docker.py +0 -151
- test/integration/connectors/utils/docker_compose.py +0 -59
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +0 -77
- test/integration/connectors/utils/validation/equality.py +0 -76
- test/integration/connectors/utils/validation/source.py +0 -331
- test/integration/connectors/utils/validation/utils.py +0 -36
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +0 -15
- test/integration/connectors/weaviate/test_cloud.py +0 -39
- test/integration/connectors/weaviate/test_local.py +0 -152
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +0 -13
- test/integration/embedders/test_azure_openai.py +0 -57
- test/integration/embedders/test_bedrock.py +0 -103
- test/integration/embedders/test_huggingface.py +0 -24
- test/integration/embedders/test_mixedbread.py +0 -71
- test/integration/embedders/test_octoai.py +0 -75
- test/integration/embedders/test_openai.py +0 -74
- test/integration/embedders/test_togetherai.py +0 -71
- test/integration/embedders/test_vertexai.py +0 -63
- test/integration/embedders/test_voyageai.py +0 -79
- test/integration/embedders/utils.py +0 -66
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +0 -76
- test/integration/utils.py +0 -15
- test/unit/__init__.py +0 -0
- test/unit/chunkers/__init__.py +0 -0
- test/unit/chunkers/test_chunkers.py +0 -49
- test/unit/connectors/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
- test/unit/connectors/motherduck/__init__.py +0 -0
- test/unit/connectors/motherduck/test_base.py +0 -73
- test/unit/connectors/sql/__init__.py +0 -0
- test/unit/connectors/sql/test_sql.py +0 -152
- test/unit/connectors/test_confluence.py +0 -71
- test/unit/connectors/test_jira.py +0 -401
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +0 -42
- test/unit/embed/test_octoai.py +0 -27
- test/unit/embed/test_openai.py +0 -28
- test/unit/embed/test_vertexai.py +0 -25
- test/unit/embed/test_voyageai.py +0 -24
- test/unit/embedders/__init__.py +0 -0
- test/unit/embedders/test_bedrock.py +0 -36
- test/unit/embedders/test_huggingface.py +0 -48
- test/unit/embedders/test_mixedbread.py +0 -37
- test/unit/embedders/test_octoai.py +0 -35
- test/unit/embedders/test_openai.py +0 -35
- test/unit/embedders/test_togetherai.py +0 -37
- test/unit/embedders/test_vertexai.py +0 -37
- test/unit/embedders/test_voyageai.py +0 -38
- test/unit/partitioners/__init__.py +0 -0
- test/unit/partitioners/test_partitioner.py +0 -63
- test/unit/test_error.py +0 -27
- test/unit/test_html.py +0 -112
- test/unit/test_interfaces.py +0 -26
- test/unit/test_utils.py +0 -220
- test/unit/utils/__init__.py +0 -0
- test/unit/utils/data_generator.py +0 -32
- unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
- unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info/licenses}/LICENSE.md +0 -0
|
@@ -1,459 +0,0 @@
|
|
|
1
|
-
import time
|
|
2
|
-
from unittest.mock import MagicMock
|
|
3
|
-
|
|
4
|
-
import pandas as pd
|
|
5
|
-
import pytest
|
|
6
|
-
from pydantic import Secret
|
|
7
|
-
from pyiceberg.exceptions import CommitFailedException
|
|
8
|
-
from pytest_mock import MockerFixture
|
|
9
|
-
|
|
10
|
-
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
11
|
-
from unstructured_ingest.errors_v2 import ProviderError, UserError
|
|
12
|
-
from unstructured_ingest.processes.connectors.ibm_watsonx import IBM_WATSONX_S3_CONNECTOR_TYPE
|
|
13
|
-
from unstructured_ingest.processes.connectors.ibm_watsonx.ibm_watsonx_s3 import (
|
|
14
|
-
IbmWatsonxAccessConfig,
|
|
15
|
-
IbmWatsonxConnectionConfig,
|
|
16
|
-
IbmWatsonxUploader,
|
|
17
|
-
IbmWatsonxUploaderConfig,
|
|
18
|
-
)
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
@pytest.fixture
|
|
22
|
-
def file_data():
|
|
23
|
-
return FileData(
|
|
24
|
-
identifier="test_identifier",
|
|
25
|
-
connector_type=IBM_WATSONX_S3_CONNECTOR_TYPE,
|
|
26
|
-
source_identifiers=SourceIdentifiers(
|
|
27
|
-
filename="test_file.pdf", fullpath="/tmp/test_file.pdf"
|
|
28
|
-
),
|
|
29
|
-
)
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
@pytest.fixture
|
|
33
|
-
def access_config():
|
|
34
|
-
return IbmWatsonxAccessConfig(
|
|
35
|
-
iam_api_key="test_iam_api_key",
|
|
36
|
-
access_key_id="test_access_key_id",
|
|
37
|
-
secret_access_key="test_secret_access_key",
|
|
38
|
-
)
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
@pytest.fixture
|
|
42
|
-
def connection_config(access_config: IbmWatsonxAccessConfig):
|
|
43
|
-
return IbmWatsonxConnectionConfig(
|
|
44
|
-
access_config=Secret(access_config),
|
|
45
|
-
iceberg_endpoint="test_iceberg_endpoint/",
|
|
46
|
-
object_storage_endpoint="test_object_storage_endpoint/",
|
|
47
|
-
object_storage_region="test_region",
|
|
48
|
-
catalog="test_catalog",
|
|
49
|
-
)
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
@pytest.fixture
|
|
53
|
-
def uploader_config():
|
|
54
|
-
return IbmWatsonxUploaderConfig(
|
|
55
|
-
namespace="test_namespace",
|
|
56
|
-
table="test_table",
|
|
57
|
-
record_id_key="test_record_id_key",
|
|
58
|
-
)
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
@pytest.fixture
|
|
62
|
-
def uploader(
|
|
63
|
-
connection_config: IbmWatsonxConnectionConfig, uploader_config: IbmWatsonxUploaderConfig
|
|
64
|
-
):
|
|
65
|
-
return IbmWatsonxUploader(
|
|
66
|
-
connection_config=connection_config,
|
|
67
|
-
upload_config=uploader_config,
|
|
68
|
-
)
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
@pytest.fixture
|
|
72
|
-
def mock_catalog(mocker: MockerFixture):
|
|
73
|
-
mock_catalog = mocker.MagicMock()
|
|
74
|
-
mock_catalog.namespace_exists.return_value = True
|
|
75
|
-
mock_catalog.table_exists.return_value = True
|
|
76
|
-
return mock_catalog
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
@pytest.fixture
|
|
80
|
-
def mock_get_catalog(mocker: MockerFixture, mock_catalog: MagicMock):
|
|
81
|
-
mock_get_catalog = mocker.patch.context_manager(
|
|
82
|
-
IbmWatsonxConnectionConfig, "get_catalog", autospec=True
|
|
83
|
-
)
|
|
84
|
-
mock_get_catalog.return_value.__enter__.return_value = mock_catalog
|
|
85
|
-
return mock_get_catalog
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
@pytest.fixture
|
|
89
|
-
def mock_table(mocker: MockerFixture):
|
|
90
|
-
mock_table = mocker.MagicMock()
|
|
91
|
-
return mock_table
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
@pytest.fixture
|
|
95
|
-
def mock_get_table(mocker: MockerFixture, mock_table: MagicMock):
|
|
96
|
-
mock_get_table = mocker.patch.context_manager(IbmWatsonxUploader, "get_table", autospec=True)
|
|
97
|
-
mock_get_table.return_value.__enter__.return_value = mock_table
|
|
98
|
-
return mock_get_table
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
@pytest.fixture
|
|
102
|
-
def mock_transaction(mocker: MockerFixture, mock_table: MagicMock):
|
|
103
|
-
mock_transaction = mocker.MagicMock()
|
|
104
|
-
mock_table.transaction.return_value.__enter__.return_value = mock_transaction
|
|
105
|
-
return mock_transaction
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
@pytest.fixture
|
|
109
|
-
def mock_data_table(mocker: MockerFixture):
|
|
110
|
-
mock_data_table = mocker.MagicMock()
|
|
111
|
-
mock_data_table.schema = "schema"
|
|
112
|
-
return mock_data_table
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
@pytest.fixture
|
|
116
|
-
def mock_delete(mocker: MockerFixture):
|
|
117
|
-
return mocker.patch.object(IbmWatsonxUploader, "_delete")
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
@pytest.fixture
|
|
121
|
-
def test_df():
|
|
122
|
-
return pd.DataFrame(
|
|
123
|
-
{
|
|
124
|
-
"test_column_0": [True, False, True],
|
|
125
|
-
"test_column_1": [1, 2, 3],
|
|
126
|
-
"test_column_2": ["a", "b", "c"],
|
|
127
|
-
}
|
|
128
|
-
)
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
@pytest.fixture
|
|
132
|
-
def timestamp_now():
|
|
133
|
-
return int(time.time())
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
def test_ibm_watsonx_connection_config_iceberg_url(
|
|
137
|
-
mocker: MockerFixture,
|
|
138
|
-
connection_config: IbmWatsonxConnectionConfig,
|
|
139
|
-
):
|
|
140
|
-
mocker.patch(
|
|
141
|
-
"unstructured_ingest.processes.connectors.ibm_watsonx.ibm_watsonx_s3.DEFAULT_ICEBERG_URI_PATH", # noqa: E501
|
|
142
|
-
new="/mds/iceberg",
|
|
143
|
-
)
|
|
144
|
-
expected_url = "https://test_iceberg_endpoint/mds/iceberg"
|
|
145
|
-
assert connection_config.iceberg_url == expected_url
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
def test_ibm_watsonx_connection_config_object_storage_url(
|
|
149
|
-
connection_config: IbmWatsonxConnectionConfig,
|
|
150
|
-
):
|
|
151
|
-
expected_url = "https://test_object_storage_endpoint"
|
|
152
|
-
assert connection_config.object_storage_url == expected_url
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
def test_ibm_watsonx_connection_config_bearer_token_new_token(
|
|
156
|
-
mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
|
|
157
|
-
):
|
|
158
|
-
mock_generate_bearer_token = mocker.patch.object(
|
|
159
|
-
IbmWatsonxConnectionConfig,
|
|
160
|
-
"generate_bearer_token",
|
|
161
|
-
return_value={"access_token": "new_token", "expiration": timestamp_now + 3600},
|
|
162
|
-
)
|
|
163
|
-
token = connection_config.bearer_token
|
|
164
|
-
assert token == "new_token"
|
|
165
|
-
mock_generate_bearer_token.assert_called_once()
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
def test_ibm_watsonx_connection_config_bearer_token_existing_token(
|
|
169
|
-
mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
|
|
170
|
-
):
|
|
171
|
-
connection_config._bearer_token = {
|
|
172
|
-
"access_token": "existing_token",
|
|
173
|
-
"expiration": timestamp_now + 3600,
|
|
174
|
-
}
|
|
175
|
-
mock_generate_bearer_token = mocker.patch.object(
|
|
176
|
-
IbmWatsonxConnectionConfig, "generate_bearer_token"
|
|
177
|
-
)
|
|
178
|
-
token = connection_config.bearer_token
|
|
179
|
-
assert token == "existing_token"
|
|
180
|
-
mock_generate_bearer_token.assert_not_called()
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
def test_ibm_watsonx_connection_config_bearer_token_expired_token(
|
|
184
|
-
mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
|
|
185
|
-
):
|
|
186
|
-
connection_config._bearer_token = {
|
|
187
|
-
"access_token": "expired_token",
|
|
188
|
-
"expiration": timestamp_now - 3600,
|
|
189
|
-
}
|
|
190
|
-
mock_generate_bearer_token = mocker.patch.object(
|
|
191
|
-
IbmWatsonxConnectionConfig,
|
|
192
|
-
"generate_bearer_token",
|
|
193
|
-
return_value={"access_token": "new_token", "expiration": timestamp_now + 3600},
|
|
194
|
-
)
|
|
195
|
-
token = connection_config.bearer_token
|
|
196
|
-
assert token == "new_token"
|
|
197
|
-
mock_generate_bearer_token.assert_called_once()
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
def test_ibm_watsonx_connection_config_bearer_token_soon_to_expire_token(
|
|
201
|
-
mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
|
|
202
|
-
):
|
|
203
|
-
connection_config._bearer_token = {
|
|
204
|
-
"access_token": "soon_to_expire_token",
|
|
205
|
-
"expiration": timestamp_now + 60,
|
|
206
|
-
}
|
|
207
|
-
mock_generate_bearer_token = mocker.patch.object(
|
|
208
|
-
IbmWatsonxConnectionConfig,
|
|
209
|
-
"generate_bearer_token",
|
|
210
|
-
return_value={"access_token": "new_token", "expiration": timestamp_now + 3600},
|
|
211
|
-
)
|
|
212
|
-
token = connection_config.bearer_token
|
|
213
|
-
assert token == "new_token"
|
|
214
|
-
mock_generate_bearer_token.assert_called_once()
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
def test_ibm_watsonx_connection_config_get_catalog_success(
|
|
218
|
-
mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig
|
|
219
|
-
):
|
|
220
|
-
mocker.patch(
|
|
221
|
-
"unstructured_ingest.processes.connectors.ibm_watsonx.ibm_watsonx_s3.DEFAULT_ICEBERG_URI_PATH", # noqa: E501
|
|
222
|
-
new="/mds/iceberg",
|
|
223
|
-
)
|
|
224
|
-
mocker.patch.object(
|
|
225
|
-
IbmWatsonxConnectionConfig,
|
|
226
|
-
"bearer_token",
|
|
227
|
-
new="test_bearer_token",
|
|
228
|
-
)
|
|
229
|
-
mock_load_catalog = mocker.patch("pyiceberg.catalog.load_catalog")
|
|
230
|
-
|
|
231
|
-
with connection_config.get_catalog() as catalog:
|
|
232
|
-
assert catalog is not None
|
|
233
|
-
mock_load_catalog.assert_called_once_with(
|
|
234
|
-
**{
|
|
235
|
-
"name": "test_catalog",
|
|
236
|
-
"type": "rest",
|
|
237
|
-
"uri": "https://test_iceberg_endpoint/mds/iceberg",
|
|
238
|
-
"token": "test_bearer_token",
|
|
239
|
-
"warehouse": "test_catalog",
|
|
240
|
-
"s3.endpoint": "https://test_object_storage_endpoint",
|
|
241
|
-
"s3.access-key-id": "test_access_key_id",
|
|
242
|
-
"s3.secret-access-key": "test_secret_access_key",
|
|
243
|
-
"s3.region": "test_region",
|
|
244
|
-
}
|
|
245
|
-
)
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
def test_ibm_watsonx_connection_config_get_catalog_failure(
|
|
249
|
-
mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig
|
|
250
|
-
):
|
|
251
|
-
mocker.patch(
|
|
252
|
-
"pyiceberg.catalog.load_catalog",
|
|
253
|
-
side_effect=Exception("Connection error"),
|
|
254
|
-
)
|
|
255
|
-
mocker.patch.object(
|
|
256
|
-
IbmWatsonxConnectionConfig,
|
|
257
|
-
"bearer_token",
|
|
258
|
-
new="test_bearer_token",
|
|
259
|
-
)
|
|
260
|
-
with pytest.raises(ProviderError):
|
|
261
|
-
with connection_config.get_catalog():
|
|
262
|
-
pass
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
def test_ibm_watsonx_uploader_precheck_namespace_exists_table_exists(
|
|
266
|
-
mock_get_catalog: MagicMock,
|
|
267
|
-
mock_catalog: MagicMock,
|
|
268
|
-
uploader: IbmWatsonxUploader,
|
|
269
|
-
):
|
|
270
|
-
uploader.precheck()
|
|
271
|
-
|
|
272
|
-
mock_catalog.namespace_exists.assert_called_once_with("test_namespace")
|
|
273
|
-
mock_catalog.table_exists.assert_called_once_with(("test_namespace", "test_table"))
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
def test_ibm_watsonx_uploader_precheck_namespace_does_not_exist(
|
|
277
|
-
mock_get_catalog: MagicMock,
|
|
278
|
-
mock_catalog: MagicMock,
|
|
279
|
-
uploader: IbmWatsonxUploader,
|
|
280
|
-
):
|
|
281
|
-
mock_catalog.namespace_exists.return_value = False
|
|
282
|
-
|
|
283
|
-
with pytest.raises(UserError, match="Namespace 'test_namespace' does not exist"):
|
|
284
|
-
uploader.precheck()
|
|
285
|
-
|
|
286
|
-
mock_catalog.namespace_exists.assert_called_once_with("test_namespace")
|
|
287
|
-
mock_catalog.table_exists.assert_not_called()
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
def test_ibm_watsonx_uploader_precheck_table_does_not_exist(
|
|
291
|
-
mock_get_catalog: MagicMock,
|
|
292
|
-
mock_catalog: MagicMock,
|
|
293
|
-
uploader: IbmWatsonxUploader,
|
|
294
|
-
):
|
|
295
|
-
mock_catalog.table_exists.return_value = False
|
|
296
|
-
|
|
297
|
-
with pytest.raises(
|
|
298
|
-
UserError,
|
|
299
|
-
match="Table 'test_table' does not exist in namespace 'test_namespace'",
|
|
300
|
-
):
|
|
301
|
-
uploader.precheck()
|
|
302
|
-
|
|
303
|
-
mock_catalog.namespace_exists.assert_called_once_with("test_namespace")
|
|
304
|
-
mock_catalog.table_exists.assert_called_once_with(("test_namespace", "test_table"))
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
def test_ibm_watsonx_uploader_upload_data_table_success(
|
|
308
|
-
uploader: IbmWatsonxUploader,
|
|
309
|
-
mock_table: MagicMock,
|
|
310
|
-
mock_transaction: MagicMock,
|
|
311
|
-
mock_data_table: MagicMock,
|
|
312
|
-
mock_delete: MagicMock,
|
|
313
|
-
file_data: FileData,
|
|
314
|
-
):
|
|
315
|
-
uploader.upload_data_table(mock_table, mock_data_table, file_data)
|
|
316
|
-
|
|
317
|
-
mock_delete.assert_called_once_with(mock_transaction, "test_identifier")
|
|
318
|
-
mock_transaction.append.assert_called_once_with(mock_data_table)
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
def test_ibm_watsonx_uploader_upload_data_table_commit_exception(
|
|
322
|
-
uploader: IbmWatsonxUploader,
|
|
323
|
-
mock_table: MagicMock,
|
|
324
|
-
mock_transaction: MagicMock,
|
|
325
|
-
mock_data_table: MagicMock,
|
|
326
|
-
mock_delete: MagicMock,
|
|
327
|
-
file_data: FileData,
|
|
328
|
-
):
|
|
329
|
-
mock_transaction.append.side_effect = CommitFailedException()
|
|
330
|
-
|
|
331
|
-
with pytest.raises(ProviderError):
|
|
332
|
-
uploader.upload_data_table(mock_table, mock_data_table, file_data)
|
|
333
|
-
assert mock_table.refresh.call_count == 5
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
def test_ibm_watsonx_uploader_upload_data_table_exception(
|
|
337
|
-
uploader: IbmWatsonxUploader,
|
|
338
|
-
mock_table: MagicMock,
|
|
339
|
-
mock_transaction: MagicMock,
|
|
340
|
-
mock_data_table: MagicMock,
|
|
341
|
-
mock_delete: MagicMock,
|
|
342
|
-
file_data: FileData,
|
|
343
|
-
):
|
|
344
|
-
mock_transaction.append.side_effect = Exception()
|
|
345
|
-
|
|
346
|
-
with pytest.raises(ProviderError):
|
|
347
|
-
uploader.upload_data_table(mock_table, mock_data_table, file_data)
|
|
348
|
-
assert mock_table.refresh.call_count == 0
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
def test_ibm_watsonx_uploader_df_to_arrow_table(
|
|
352
|
-
mocker: MockerFixture,
|
|
353
|
-
uploader: IbmWatsonxUploader,
|
|
354
|
-
test_df: pd.DataFrame,
|
|
355
|
-
):
|
|
356
|
-
mock_fit_to_schema = mocker.patch.object(
|
|
357
|
-
IbmWatsonxUploader, "_fit_to_schema", return_value=test_df
|
|
358
|
-
)
|
|
359
|
-
|
|
360
|
-
result = uploader._df_to_arrow_table(test_df)
|
|
361
|
-
|
|
362
|
-
mock_fit_to_schema.assert_called_once_with(test_df, add_missing_columns=False)
|
|
363
|
-
assert len(result.column_names) == 3
|
|
364
|
-
assert "test_column_0" in result.column_names
|
|
365
|
-
assert "test_column_1" in result.column_names
|
|
366
|
-
assert "test_column_2" in result.column_names
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
def test_ibm_watsonx_uploader_can_delete_column_exists(
|
|
370
|
-
mocker: MockerFixture,
|
|
371
|
-
uploader: IbmWatsonxUploader,
|
|
372
|
-
):
|
|
373
|
-
mocker.patch.object(
|
|
374
|
-
IbmWatsonxUploader, "get_table_columns", return_value=["test_record_id_key"]
|
|
375
|
-
)
|
|
376
|
-
|
|
377
|
-
assert uploader.can_delete() is True
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
def test_ibm_watsonx_uploader_can_delete_column_does_not_exist(
|
|
381
|
-
mocker: MockerFixture,
|
|
382
|
-
uploader: IbmWatsonxUploader,
|
|
383
|
-
):
|
|
384
|
-
mocker.patch.object(IbmWatsonxUploader, "get_table_columns", return_value=["other_column"])
|
|
385
|
-
|
|
386
|
-
assert uploader.can_delete() is False
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
def test_ibm_watsonx_uploader_get_table_columns_cache(
|
|
390
|
-
uploader: IbmWatsonxUploader,
|
|
391
|
-
):
|
|
392
|
-
uploader._columns = ["cached_column"]
|
|
393
|
-
|
|
394
|
-
result = uploader.get_table_columns()
|
|
395
|
-
|
|
396
|
-
assert result == ["cached_column"]
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
def test_ibm_watsonx_uploader_get_table_columns_no_cache(
|
|
400
|
-
uploader: IbmWatsonxUploader,
|
|
401
|
-
mock_get_table: MagicMock,
|
|
402
|
-
mock_table: MagicMock,
|
|
403
|
-
):
|
|
404
|
-
uploader._columns = None
|
|
405
|
-
mock_table.schema.return_value.column_names = ["column_1", "column_2"]
|
|
406
|
-
|
|
407
|
-
result = uploader.get_table_columns()
|
|
408
|
-
|
|
409
|
-
mock_get_table.assert_called_once()
|
|
410
|
-
assert result == ["column_1", "column_2"]
|
|
411
|
-
assert uploader._columns == ["column_1", "column_2"]
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
def test_ibm_watsonx_uploader_upload_dataframe_success(
|
|
415
|
-
mocker: MockerFixture,
|
|
416
|
-
uploader: IbmWatsonxUploader,
|
|
417
|
-
test_df: pd.DataFrame,
|
|
418
|
-
mock_get_table: MagicMock,
|
|
419
|
-
mock_table: MagicMock,
|
|
420
|
-
mock_data_table: MagicMock,
|
|
421
|
-
file_data: FileData,
|
|
422
|
-
):
|
|
423
|
-
mocker.patch.object(IbmWatsonxUploader, "_df_to_arrow_table", return_value=mock_data_table)
|
|
424
|
-
mock_upload_data_table = mocker.patch.object(IbmWatsonxUploader, "upload_data_table")
|
|
425
|
-
|
|
426
|
-
uploader.upload_dataframe(test_df, file_data)
|
|
427
|
-
|
|
428
|
-
mock_get_table.assert_called_once()
|
|
429
|
-
mock_upload_data_table.assert_called_once_with(mock_table, mock_data_table, file_data)
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
def test_ibm_watsonx_uploader_delete_can_delete(
|
|
433
|
-
mocker: MockerFixture,
|
|
434
|
-
uploader: IbmWatsonxUploader,
|
|
435
|
-
mock_transaction: MagicMock,
|
|
436
|
-
):
|
|
437
|
-
mocker.patch.object(IbmWatsonxUploader, "can_delete", return_value=True)
|
|
438
|
-
mock_equal_to = mocker.patch("pyiceberg.expressions.EqualTo")
|
|
439
|
-
|
|
440
|
-
uploader._delete(mock_transaction, "test_identifier")
|
|
441
|
-
|
|
442
|
-
mock_equal_to.assert_called_once_with("test_record_id_key", "test_identifier")
|
|
443
|
-
mock_transaction.delete.assert_called_once_with(delete_filter=mock_equal_to.return_value)
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
def test_ibm_watsonx_uploader_delete_cannot_delete(
|
|
447
|
-
caplog: pytest.LogCaptureFixture,
|
|
448
|
-
mocker: MockerFixture,
|
|
449
|
-
uploader: IbmWatsonxUploader,
|
|
450
|
-
mock_transaction: MagicMock,
|
|
451
|
-
):
|
|
452
|
-
mocker.patch.object(IbmWatsonxUploader, "can_delete", return_value=False)
|
|
453
|
-
|
|
454
|
-
uploader._delete(mock_transaction, "test_identifier")
|
|
455
|
-
mock_transaction.delete.assert_not_called()
|
|
456
|
-
assert (
|
|
457
|
-
"Table doesn't contain expected record id column test_record_id_key, skipping delete"
|
|
458
|
-
in caplog.text
|
|
459
|
-
)
|
|
File without changes
|
|
@@ -1,73 +0,0 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
|
|
3
|
-
import pytest
|
|
4
|
-
from pytest_mock import MockerFixture
|
|
5
|
-
|
|
6
|
-
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
7
|
-
from unstructured_ingest.interfaces import UploadStagerConfig
|
|
8
|
-
from unstructured_ingest.processes.connectors.duckdb.base import BaseDuckDBUploadStager
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
@pytest.fixture
|
|
12
|
-
def mock_instance() -> BaseDuckDBUploadStager:
|
|
13
|
-
return BaseDuckDBUploadStager(UploadStagerConfig())
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
@pytest.mark.parametrize(
|
|
17
|
-
("input_filepath", "output_filename", "expected"),
|
|
18
|
-
[
|
|
19
|
-
(
|
|
20
|
-
"/path/to/input_file.ndjson",
|
|
21
|
-
"output_file.ndjson",
|
|
22
|
-
"output_file.ndjson",
|
|
23
|
-
),
|
|
24
|
-
("input_file.txt", "output_file.json", "output_file.txt"),
|
|
25
|
-
("/path/to/input_file.json", "output_file", "output_file.json"),
|
|
26
|
-
],
|
|
27
|
-
)
|
|
28
|
-
def test_run_output_filename_suffix(
|
|
29
|
-
mocker: MockerFixture,
|
|
30
|
-
mock_instance: BaseDuckDBUploadStager,
|
|
31
|
-
input_filepath: str,
|
|
32
|
-
output_filename: str,
|
|
33
|
-
expected: str,
|
|
34
|
-
):
|
|
35
|
-
output_dir = Path("/tmp/test/output_dir")
|
|
36
|
-
|
|
37
|
-
# Mocks
|
|
38
|
-
mock_get_data = mocker.patch(
|
|
39
|
-
"unstructured_ingest.processes.connectors.duckdb.base.get_data",
|
|
40
|
-
return_value=[{"key": "value"}, {"key": "value2"}],
|
|
41
|
-
)
|
|
42
|
-
mock_conform_dict = mocker.patch.object(
|
|
43
|
-
BaseDuckDBUploadStager,
|
|
44
|
-
"conform_dict",
|
|
45
|
-
side_effect=lambda element_dict, file_data: element_dict,
|
|
46
|
-
)
|
|
47
|
-
mock_get_output_path = mocker.patch.object(
|
|
48
|
-
BaseDuckDBUploadStager, "get_output_path", return_value=output_dir / expected
|
|
49
|
-
)
|
|
50
|
-
mock_write_output = mocker.patch(
|
|
51
|
-
"unstructured_ingest.processes.connectors.duckdb.base.write_data", return_value=None
|
|
52
|
-
)
|
|
53
|
-
|
|
54
|
-
# Act
|
|
55
|
-
result = mock_instance.run(
|
|
56
|
-
elements_filepath=Path(input_filepath),
|
|
57
|
-
file_data=FileData(
|
|
58
|
-
identifier="test",
|
|
59
|
-
connector_type="test",
|
|
60
|
-
source_identifiers=SourceIdentifiers(filename=input_filepath, fullpath=input_filepath),
|
|
61
|
-
),
|
|
62
|
-
output_dir=output_dir,
|
|
63
|
-
output_filename=output_filename,
|
|
64
|
-
)
|
|
65
|
-
|
|
66
|
-
# Assert
|
|
67
|
-
mock_get_data.assert_called_once_with(path=Path(input_filepath))
|
|
68
|
-
assert mock_conform_dict.call_count == 2
|
|
69
|
-
mock_get_output_path.assert_called_once_with(output_filename=expected, output_dir=output_dir)
|
|
70
|
-
mock_write_output.assert_called_once_with(
|
|
71
|
-
path=output_dir / expected, data=[{"key": "value"}, {"key": "value2"}]
|
|
72
|
-
)
|
|
73
|
-
assert result.name == expected
|
|
File without changes
|
|
@@ -1,152 +0,0 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
|
|
3
|
-
import pandas as pd
|
|
4
|
-
import pytest
|
|
5
|
-
from pytest_mock import MockerFixture
|
|
6
|
-
|
|
7
|
-
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
8
|
-
from unstructured_ingest.processes.connectors.sql.sql import (
|
|
9
|
-
SQLConnectionConfig,
|
|
10
|
-
SQLUploader,
|
|
11
|
-
SQLUploaderConfig,
|
|
12
|
-
SQLUploadStager,
|
|
13
|
-
)
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
@pytest.fixture
|
|
17
|
-
def mock_instance() -> SQLUploadStager:
|
|
18
|
-
return SQLUploadStager()
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
@pytest.fixture
|
|
22
|
-
def mock_uploader(mocker: MockerFixture) -> SQLUploader:
|
|
23
|
-
mock_connection_config = mocker.Mock(spec=SQLConnectionConfig)
|
|
24
|
-
mock_upload_config = mocker.Mock(spec=SQLUploaderConfig)
|
|
25
|
-
return SQLUploader(
|
|
26
|
-
upload_config=mock_upload_config,
|
|
27
|
-
connection_config=mock_connection_config,
|
|
28
|
-
connector_type="sql_test",
|
|
29
|
-
)
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
@pytest.mark.parametrize(
|
|
33
|
-
("input_filepath", "output_filename", "expected"),
|
|
34
|
-
[
|
|
35
|
-
(
|
|
36
|
-
"/path/to/input_file.ndjson",
|
|
37
|
-
"output_file.ndjson",
|
|
38
|
-
"output_file.ndjson",
|
|
39
|
-
),
|
|
40
|
-
("input_file.txt", "output_file.json", "output_file.txt"),
|
|
41
|
-
("/path/to/input_file.json", "output_file", "output_file.json"),
|
|
42
|
-
],
|
|
43
|
-
)
|
|
44
|
-
def test_run_output_filename_suffix(
|
|
45
|
-
mocker: MockerFixture,
|
|
46
|
-
mock_instance: SQLUploadStager,
|
|
47
|
-
input_filepath: str,
|
|
48
|
-
output_filename: str,
|
|
49
|
-
expected: str,
|
|
50
|
-
):
|
|
51
|
-
output_dir = Path("/tmp/test/output_dir")
|
|
52
|
-
|
|
53
|
-
# Mocks
|
|
54
|
-
mock_get_data = mocker.patch(
|
|
55
|
-
"unstructured_ingest.processes.connectors.sql.sql.get_data",
|
|
56
|
-
return_value=[{"key": "value"}, {"key": "value2"}],
|
|
57
|
-
)
|
|
58
|
-
mock_conform_dict = mocker.patch.object(
|
|
59
|
-
SQLUploadStager, "conform_dict", side_effect=lambda element_dict, file_data: element_dict
|
|
60
|
-
)
|
|
61
|
-
mock_conform_dataframe = mocker.patch.object(
|
|
62
|
-
SQLUploadStager, "conform_dataframe", side_effect=lambda df: df
|
|
63
|
-
)
|
|
64
|
-
mock_get_output_path = mocker.patch.object(
|
|
65
|
-
SQLUploadStager, "get_output_path", return_value=output_dir / expected
|
|
66
|
-
)
|
|
67
|
-
mock_write_output = mocker.patch(
|
|
68
|
-
"unstructured_ingest.processes.connectors.sql.sql.write_data", return_value=None
|
|
69
|
-
)
|
|
70
|
-
|
|
71
|
-
# Act
|
|
72
|
-
result = mock_instance.run(
|
|
73
|
-
elements_filepath=Path(input_filepath),
|
|
74
|
-
file_data=FileData(
|
|
75
|
-
identifier="test",
|
|
76
|
-
connector_type="test",
|
|
77
|
-
source_identifiers=SourceIdentifiers(filename=input_filepath, fullpath=input_filepath),
|
|
78
|
-
),
|
|
79
|
-
output_dir=output_dir,
|
|
80
|
-
output_filename=output_filename,
|
|
81
|
-
)
|
|
82
|
-
|
|
83
|
-
# Assert
|
|
84
|
-
mock_get_data.assert_called_once_with(path=Path(input_filepath))
|
|
85
|
-
assert mock_conform_dict.call_count == 2
|
|
86
|
-
mock_conform_dataframe.assert_called_once()
|
|
87
|
-
mock_get_output_path.assert_called_once_with(output_filename=expected, output_dir=output_dir)
|
|
88
|
-
mock_write_output.assert_called_once_with(
|
|
89
|
-
path=output_dir / expected, data=[{"key": "value"}, {"key": "value2"}]
|
|
90
|
-
)
|
|
91
|
-
assert result.name == expected
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
def test_fit_to_schema_drop_columns(mocker: MockerFixture, mock_uploader: SQLUploader):
|
|
95
|
-
df = pd.DataFrame(
|
|
96
|
-
{
|
|
97
|
-
"col1": [1, 2],
|
|
98
|
-
"col2": [3, 4],
|
|
99
|
-
"col3": [5, 6],
|
|
100
|
-
}
|
|
101
|
-
)
|
|
102
|
-
mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
|
|
103
|
-
|
|
104
|
-
result = mock_uploader._fit_to_schema(df)
|
|
105
|
-
|
|
106
|
-
assert "col3" not in result.columns
|
|
107
|
-
assert "col1" in result.columns
|
|
108
|
-
assert "col2" in result.columns
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
def test_fit_to_schema_add_missing_columns(mocker: MockerFixture, mock_uploader: SQLUploader):
|
|
112
|
-
df = pd.DataFrame(
|
|
113
|
-
{
|
|
114
|
-
"col1": [1, 2],
|
|
115
|
-
}
|
|
116
|
-
)
|
|
117
|
-
mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
|
|
118
|
-
|
|
119
|
-
result = mock_uploader._fit_to_schema(df)
|
|
120
|
-
|
|
121
|
-
assert "col2" in result.columns
|
|
122
|
-
assert result["col2"].isnull().all()
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
def test_fit_to_schema_no_changes(mocker: MockerFixture, mock_uploader: SQLUploader):
|
|
126
|
-
df = pd.DataFrame(
|
|
127
|
-
{
|
|
128
|
-
"col1": [1, 2],
|
|
129
|
-
"col2": [3, 4],
|
|
130
|
-
}
|
|
131
|
-
)
|
|
132
|
-
mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
|
|
133
|
-
|
|
134
|
-
result = mock_uploader._fit_to_schema(df)
|
|
135
|
-
|
|
136
|
-
assert "col1" in result.columns
|
|
137
|
-
assert "col2" in result.columns
|
|
138
|
-
assert result.equals(df)
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
def test_fit_to_schema_no_add_missing_columns(mocker: MockerFixture, mock_uploader: SQLUploader):
|
|
142
|
-
df = pd.DataFrame(
|
|
143
|
-
{
|
|
144
|
-
"col1": [1, 2],
|
|
145
|
-
}
|
|
146
|
-
)
|
|
147
|
-
mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
|
|
148
|
-
|
|
149
|
-
result = mock_uploader._fit_to_schema(df, add_missing_columns=False)
|
|
150
|
-
|
|
151
|
-
assert "col2" not in result.columns
|
|
152
|
-
assert "col1" in result.columns
|