unstructured-ingest 0.5.18__py3-none-any.whl → 0.5.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/integration/connectors/test_redis.py +36 -12
- test/unit/v2/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/v2/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +459 -0
- test/unit/v2/connectors/sql/test_sql.py +79 -1
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +2 -0
- unstructured_ingest/v2/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/v2/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +301 -0
- unstructured_ingest/v2/processes/connectors/redisdb.py +7 -6
- unstructured_ingest/v2/processes/connectors/sql/sql.py +5 -4
- {unstructured_ingest-0.5.18.dist-info → unstructured_ingest-0.5.20.dist-info}/METADATA +20 -15
- {unstructured_ingest-0.5.18.dist-info → unstructured_ingest-0.5.20.dist-info}/RECORD +16 -12
- {unstructured_ingest-0.5.18.dist-info → unstructured_ingest-0.5.20.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.18.dist-info → unstructured_ingest-0.5.20.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.18.dist-info → unstructured_ingest-0.5.20.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.18.dist-info → unstructured_ingest-0.5.20.dist-info}/top_level.txt +0 -0
test/integration/connectors/test_redis.py CHANGED

@@ -23,20 +23,22 @@ from unstructured_ingest.v2.processes.connectors.redisdb import (
 )


-async def delete_record(client: Redis, element_id: str) -> None:
-
+async def delete_record(client: Redis, element_id: str, key_prefix: str) -> None:
+    key_with_prefix = f"{key_prefix}{element_id}"
+    await client.delete(key_with_prefix)


-async def validate_upload(client: Redis, first_element: dict):
+async def validate_upload(client: Redis, first_element: dict, key_prefix: str) -> None:
     element_id = first_element["element_id"]
+    key_with_prefix = f"{key_prefix}{element_id}"
     expected_text = first_element["text"]
     expected_embeddings = first_element["embeddings"]
     async with client.pipeline(transaction=True) as pipe:
         try:
-            response = await pipe.json().get(
+            response = await pipe.json().get(key_with_prefix, "$").execute()
             response = response[0][0]
         except redis_exceptions.ResponseError:
-            response = await pipe.get(
+            response = await pipe.get(key_with_prefix).execute()
             response = json.loads(response[0])

     embedding_similarity = np.linalg.norm(
@@ -53,6 +55,7 @@ async def redis_destination_test(
     upload_file: Path,
     tmp_path: Path,
     connection_kwargs: dict,
+    uploader_config: dict,
     uri: Optional[str] = None,
     password: Optional[str] = None,
 ):
@@ -60,8 +63,9 @@ async def redis_destination_test(
         connection_config=RedisConnectionConfig(
             **connection_kwargs, access_config=RedisAccessConfig(uri=uri, password=password)
         ),
-        upload_config=RedisUploaderConfig(batch_size=10),
+        upload_config=RedisUploaderConfig(batch_size=10, **uploader_config),
     )
+    key_prefix = uploader.upload_config.key_prefix

     file_data = FileData(
         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
@@ -78,20 +82,32 @@ async def redis_destination_test(

         if uri:
             async with from_url(uri) as client:
-                await validate_upload(
+                await validate_upload(
+                    client=client,
+                    first_element=first_element,
+                    key_prefix=key_prefix,
+                )
         else:
             async with Redis(**connection_kwargs, password=password) as client:
-                await validate_upload(
+                await validate_upload(
+                    client=client,
+                    first_element=first_element,
+                    key_prefix=key_prefix,
+                )
     except Exception as e:
         raise e
     finally:
         if uri:
             async with from_url(uri) as client:
-                tasks = [
+                tasks = [
+                    delete_record(client, element["element_id"], key_prefix) for element in elements
+                ]
                 await asyncio.gather(*tasks)
         else:
             async with Redis(**connection_kwargs, password=password) as client:
-                tasks = [
+                tasks = [
+                    delete_record(client, element["element_id"], key_prefix) for element in elements
+                ]
                 await asyncio.gather(*tasks)


@@ -105,8 +121,13 @@ async def test_redis_destination_azure_with_password(upload_file: Path, tmp_path
         "db": 0,
         "ssl": True,
     }
+    uploader_config = {
+        "key_prefix": "test_ingest:",
+    }
     redis_pw = os.environ["AZURE_REDIS_INGEST_TEST_PASSWORD"]
-    await redis_destination_test(
+    await redis_destination_test(
+        upload_file, tmp_path, connection_kwargs, uploader_config, password=redis_pw
+    )


 @pytest.mark.asyncio
@@ -114,6 +135,9 @@ async def test_redis_destination_azure_with_password(upload_file: Path, tmp_path
 @requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
 async def test_redis_destination_azure_with_uri(upload_file: Path, tmp_path: Path):
     connection_kwargs = {}
+    uploader_config = {
+        "key_prefix": "test_ingest:",
+    }
     redis_pw = os.environ["AZURE_REDIS_INGEST_TEST_PASSWORD"]
     uri = f"rediss://:{redis_pw}@utic-dashboard-dev.redis.cache.windows.net:6380/0"
-    await redis_destination_test(upload_file, tmp_path, connection_kwargs, uri=uri)
+    await redis_destination_test(upload_file, tmp_path, connection_kwargs, uploader_config, uri=uri)
test/unit/v2/connectors/ibm_watsonx/__init__.py ADDED
File without changes (new empty file)
test/unit/v2/connectors/ibm_watsonx/test_ibm_watsonx_s3.py ADDED

@@ -0,0 +1,459 @@
+import time
+from unittest.mock import MagicMock
+
+import pandas as pd
+import pytest
+from pydantic import Secret
+from pyiceberg.exceptions import CommitFailedException
+from pytest_mock import MockerFixture
+
+from unstructured_ingest.v2.errors import ProviderError, UserError
+from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.ibm_watsonx import IBM_WATSONX_S3_CONNECTOR_TYPE
+from unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3 import (
+    IbmWatsonxAccessConfig,
+    IbmWatsonxConnectionConfig,
+    IbmWatsonxUploader,
+    IbmWatsonxUploaderConfig,
+)
+
+
+@pytest.fixture
+def file_data():
+    return FileData(
+        identifier="test_identifier",
+        connector_type=IBM_WATSONX_S3_CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(
+            filename="test_file.pdf", fullpath="/tmp/test_file.pdf"
+        ),
+    )
+
+
+@pytest.fixture
+def access_config():
+    return IbmWatsonxAccessConfig(
+        iam_api_key="test_iam_api_key",
+        access_key_id="test_access_key_id",
+        secret_access_key="test_secret_access_key",
+    )
+
+
+@pytest.fixture
+def connection_config(access_config: IbmWatsonxAccessConfig):
+    return IbmWatsonxConnectionConfig(
+        access_config=Secret(access_config),
+        iceberg_endpoint="test_iceberg_endpoint/",
+        object_storage_endpoint="test_object_storage_endpoint/",
+        object_storage_region="test_region",
+        catalog="test_catalog",
+    )
+
+
+@pytest.fixture
+def uploader_config():
+    return IbmWatsonxUploaderConfig(
+        namespace="test_namespace",
+        table="test_table",
+        record_id_key="test_record_id_key",
+    )
+
+
+@pytest.fixture
+def uploader(
+    connection_config: IbmWatsonxConnectionConfig, uploader_config: IbmWatsonxUploaderConfig
+):
+    return IbmWatsonxUploader(
+        connection_config=connection_config,
+        upload_config=uploader_config,
+    )
+
+
+@pytest.fixture
+def mock_catalog(mocker: MockerFixture):
+    mock_catalog = mocker.MagicMock()
+    mock_catalog.namespace_exists.return_value = True
+    mock_catalog.table_exists.return_value = True
+    return mock_catalog
+
+
+@pytest.fixture
+def mock_get_catalog(mocker: MockerFixture, mock_catalog: MagicMock):
+    mock_get_catalog = mocker.patch.context_manager(
+        IbmWatsonxConnectionConfig, "get_catalog", autospec=True
+    )
+    mock_get_catalog.return_value.__enter__.return_value = mock_catalog
+    return mock_get_catalog
+
+
+@pytest.fixture
+def mock_table(mocker: MockerFixture):
+    mock_table = mocker.MagicMock()
+    return mock_table
+
+
+@pytest.fixture
+def mock_get_table(mocker: MockerFixture, mock_table: MagicMock):
+    mock_get_table = mocker.patch.context_manager(IbmWatsonxUploader, "get_table", autospec=True)
+    mock_get_table.return_value.__enter__.return_value = mock_table
+    return mock_get_table
+
+
+@pytest.fixture
+def mock_transaction(mocker: MockerFixture, mock_table: MagicMock):
+    mock_transaction = mocker.MagicMock()
+    mock_table.transaction.return_value.__enter__.return_value = mock_transaction
+    return mock_transaction
+
+
+@pytest.fixture
+def mock_data_table(mocker: MockerFixture):
+    mock_data_table = mocker.MagicMock()
+    mock_data_table.schema = "schema"
+    return mock_data_table
+
+
+@pytest.fixture
+def mock_delete(mocker: MockerFixture):
+    return mocker.patch.object(IbmWatsonxUploader, "_delete")
+
+
+@pytest.fixture
+def test_df():
+    return pd.DataFrame(
+        {
+            "test_column_0": [True, False, True],
+            "test_column_1": [1, 2, 3],
+            "test_column_2": ["a", "b", "c"],
+        }
+    )
+
+
+@pytest.fixture
+def timestamp_now():
+    return int(time.time())
+
+
+def test_ibm_watsonx_connection_config_iceberg_url(
+    mocker: MockerFixture,
+    connection_config: IbmWatsonxConnectionConfig,
+):
+    mocker.patch(
+        "unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3.DEFAULT_ICEBERG_URI_PATH",  # noqa: E501
+        new="/mds/iceberg",
+    )
+    expected_url = "https://test_iceberg_endpoint/mds/iceberg"
+    assert connection_config.iceberg_url == expected_url
+
+
+def test_ibm_watsonx_connection_config_object_storage_url(
+    connection_config: IbmWatsonxConnectionConfig,
+):
+    expected_url = "https://test_object_storage_endpoint"
+    assert connection_config.object_storage_url == expected_url
+
+
+def test_ibm_watsonx_connection_config_bearer_token_new_token(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
+):
+    mock_generate_bearer_token = mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "generate_bearer_token",
+        return_value={"access_token": "new_token", "expiration": timestamp_now + 3600},
+    )
+    token = connection_config.bearer_token
+    assert token == "new_token"
+    mock_generate_bearer_token.assert_called_once()
+
+
+def test_ibm_watsonx_connection_config_bearer_token_existing_token(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
+):
+    connection_config._bearer_token = {
+        "access_token": "existing_token",
+        "expiration": timestamp_now + 3600,
+    }
+    mock_generate_bearer_token = mocker.patch.object(
+        IbmWatsonxConnectionConfig, "generate_bearer_token"
+    )
+    token = connection_config.bearer_token
+    assert token == "existing_token"
+    mock_generate_bearer_token.assert_not_called()
+
+
+def test_ibm_watsonx_connection_config_bearer_token_expired_token(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
+):
+    connection_config._bearer_token = {
+        "access_token": "expired_token",
+        "expiration": timestamp_now - 3600,
+    }
+    mock_generate_bearer_token = mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "generate_bearer_token",
+        return_value={"access_token": "new_token", "expiration": timestamp_now + 3600},
+    )
+    token = connection_config.bearer_token
+    assert token == "new_token"
+    mock_generate_bearer_token.assert_called_once()
+
+
+def test_ibm_watsonx_connection_config_bearer_token_soon_to_expire_token(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
+):
+    connection_config._bearer_token = {
+        "access_token": "soon_to_expire_token",
+        "expiration": timestamp_now + 60,
+    }
+    mock_generate_bearer_token = mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "generate_bearer_token",
+        return_value={"access_token": "new_token", "expiration": timestamp_now + 3600},
+    )
+    token = connection_config.bearer_token
+    assert token == "new_token"
+    mock_generate_bearer_token.assert_called_once()
+
+
+def test_ibm_watsonx_connection_config_get_catalog_success(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig
+):
+    mocker.patch(
+        "unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3.DEFAULT_ICEBERG_URI_PATH",  # noqa: E501
+        new="/mds/iceberg",
+    )
+    mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "bearer_token",
+        new="test_bearer_token",
+    )
+    mock_load_catalog = mocker.patch("pyiceberg.catalog.load_catalog")
+
+    with connection_config.get_catalog() as catalog:
+        assert catalog is not None
+        mock_load_catalog.assert_called_once_with(
+            **{
+                "name": "test_catalog",
+                "type": "rest",
+                "uri": "https://test_iceberg_endpoint/mds/iceberg",
+                "token": "test_bearer_token",
+                "warehouse": "test_catalog",
+                "s3.endpoint": "https://test_object_storage_endpoint",
+                "s3.access-key-id": "test_access_key_id",
+                "s3.secret-access-key": "test_secret_access_key",
+                "s3.region": "test_region",
+            }
+        )
+
+
+def test_ibm_watsonx_connection_config_get_catalog_failure(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig
+):
+    mocker.patch(
+        "pyiceberg.catalog.load_catalog",
+        side_effect=Exception("Connection error"),
+    )
+    mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "bearer_token",
+        new="test_bearer_token",
+    )
+    with pytest.raises(ProviderError):
+        with connection_config.get_catalog():
+            pass
+
+
+def test_ibm_watsonx_uploader_precheck_namespace_exists_table_exists(
+    mock_get_catalog: MagicMock,
+    mock_catalog: MagicMock,
+    uploader: IbmWatsonxUploader,
+):
+    uploader.precheck()
+
+    mock_catalog.namespace_exists.assert_called_once_with("test_namespace")
+    mock_catalog.table_exists.assert_called_once_with(("test_namespace", "test_table"))
+
+
+def test_ibm_watsonx_uploader_precheck_namespace_does_not_exist(
+    mock_get_catalog: MagicMock,
+    mock_catalog: MagicMock,
+    uploader: IbmWatsonxUploader,
+):
+    mock_catalog.namespace_exists.return_value = False
+
+    with pytest.raises(UserError, match="Namespace 'test_namespace' does not exist"):
+        uploader.precheck()
+
+    mock_catalog.namespace_exists.assert_called_once_with("test_namespace")
+    mock_catalog.table_exists.assert_not_called()
+
+
+def test_ibm_watsonx_uploader_precheck_table_does_not_exist(
+    mock_get_catalog: MagicMock,
+    mock_catalog: MagicMock,
+    uploader: IbmWatsonxUploader,
+):
+    mock_catalog.table_exists.return_value = False
+
+    with pytest.raises(
+        UserError,
+        match="Table 'test_table' does not exist in namespace 'test_namespace'",
+    ):
+        uploader.precheck()
+
+    mock_catalog.namespace_exists.assert_called_once_with("test_namespace")
+    mock_catalog.table_exists.assert_called_once_with(("test_namespace", "test_table"))
+
+
+def test_ibm_watsonx_uploader_upload_data_table_success(
+    uploader: IbmWatsonxUploader,
+    mock_table: MagicMock,
+    mock_transaction: MagicMock,
+    mock_data_table: MagicMock,
+    mock_delete: MagicMock,
+    file_data: FileData,
+):
+    uploader.upload_data_table(mock_table, mock_data_table, file_data)
+
+    mock_delete.assert_called_once_with(mock_transaction, "test_identifier")
+    mock_transaction.append.assert_called_once_with(mock_data_table)
+
+
+def test_ibm_watsonx_uploader_upload_data_table_commit_exception(
+    uploader: IbmWatsonxUploader,
+    mock_table: MagicMock,
+    mock_transaction: MagicMock,
+    mock_data_table: MagicMock,
+    mock_delete: MagicMock,
+    file_data: FileData,
+):
+    mock_transaction.append.side_effect = CommitFailedException()
+
+    with pytest.raises(ProviderError):
+        uploader.upload_data_table(mock_table, mock_data_table, file_data)
+    assert mock_table.refresh.call_count == 5
+
+
+def test_ibm_watsonx_uploader_upload_data_table_exception(
+    uploader: IbmWatsonxUploader,
+    mock_table: MagicMock,
+    mock_transaction: MagicMock,
+    mock_data_table: MagicMock,
+    mock_delete: MagicMock,
+    file_data: FileData,
+):
+    mock_transaction.append.side_effect = Exception()
+
+    with pytest.raises(ProviderError):
+        uploader.upload_data_table(mock_table, mock_data_table, file_data)
+    assert mock_table.refresh.call_count == 0
+
+
+def test_ibm_watsonx_uploader_df_to_arrow_table(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+    test_df: pd.DataFrame,
+):
+    mock_fit_to_schema = mocker.patch.object(
+        IbmWatsonxUploader, "_fit_to_schema", return_value=test_df
+    )
+
+    result = uploader._df_to_arrow_table(test_df)
+
+    mock_fit_to_schema.assert_called_once_with(test_df, add_missing_columns=False)
+    assert len(result.column_names) == 3
+    assert "test_column_0" in result.column_names
+    assert "test_column_1" in result.column_names
+    assert "test_column_2" in result.column_names
+
+
+def test_ibm_watsonx_uploader_can_delete_column_exists(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+):
+    mocker.patch.object(
+        IbmWatsonxUploader, "get_table_columns", return_value=["test_record_id_key"]
+    )
+
+    assert uploader.can_delete() is True
+
+
+def test_ibm_watsonx_uploader_can_delete_column_does_not_exist(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+):
+    mocker.patch.object(IbmWatsonxUploader, "get_table_columns", return_value=["other_column"])
+
+    assert uploader.can_delete() is False
+
+
+def test_ibm_watsonx_uploader_get_table_columns_cache(
+    uploader: IbmWatsonxUploader,
+):
+    uploader._columns = ["cached_column"]
+
+    result = uploader.get_table_columns()
+
+    assert result == ["cached_column"]
+
+
+def test_ibm_watsonx_uploader_get_table_columns_no_cache(
+    uploader: IbmWatsonxUploader,
+    mock_get_table: MagicMock,
+    mock_table: MagicMock,
+):
+    uploader._columns = None
+    mock_table.schema.return_value.column_names = ["column_1", "column_2"]
+
+    result = uploader.get_table_columns()
+
+    mock_get_table.assert_called_once()
+    assert result == ["column_1", "column_2"]
+    assert uploader._columns == ["column_1", "column_2"]
+
+
+def test_ibm_watsonx_uploader_upload_dataframe_success(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+    test_df: pd.DataFrame,
+    mock_get_table: MagicMock,
+    mock_table: MagicMock,
+    mock_data_table: MagicMock,
+    file_data: FileData,
+):
+    mocker.patch.object(IbmWatsonxUploader, "_df_to_arrow_table", return_value=mock_data_table)
+    mock_upload_data_table = mocker.patch.object(IbmWatsonxUploader, "upload_data_table")
+
+    uploader.upload_dataframe(test_df, file_data)
+
+    mock_get_table.assert_called_once()
+    mock_upload_data_table.assert_called_once_with(mock_table, mock_data_table, file_data)
+
+
+def test_ibm_watsonx_uploader_delete_can_delete(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+    mock_transaction: MagicMock,
+):
+    mocker.patch.object(IbmWatsonxUploader, "can_delete", return_value=True)
+    mock_equal_to = mocker.patch("pyiceberg.expressions.EqualTo")
+
+    uploader._delete(mock_transaction, "test_identifier")
+
+    mock_equal_to.assert_called_once_with("test_record_id_key", "test_identifier")
+    mock_transaction.delete.assert_called_once_with(delete_filter=mock_equal_to.return_value)
+
+
+def test_ibm_watsonx_uploader_delete_cannot_delete(
+    caplog: pytest.LogCaptureFixture,
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+    mock_transaction: MagicMock,
+):
+    mocker.patch.object(IbmWatsonxUploader, "can_delete", return_value=False)
+
+    uploader._delete(mock_transaction, "test_identifier")
+    mock_transaction.delete.assert_not_called()
+    assert (
+        "Table doesn't contain expected record id column test_record_id_key, skipping delete"
+        in caplog.text
+    )
test/unit/v2/connectors/sql/test_sql.py CHANGED

@@ -1,10 +1,16 @@
 from pathlib import Path

+import pandas as pd
 import pytest
 from pytest_mock import MockerFixture

 from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
-from unstructured_ingest.v2.processes.connectors.sql.sql import
+from unstructured_ingest.v2.processes.connectors.sql.sql import (
+    SQLConnectionConfig,
+    SQLUploader,
+    SQLUploaderConfig,
+    SQLUploadStager,
+)


 @pytest.fixture
@@ -12,6 +18,17 @@ def mock_instance() -> SQLUploadStager:
     return SQLUploadStager()


+@pytest.fixture
+def mock_uploader(mocker: MockerFixture) -> SQLUploader:
+    mock_connection_config = mocker.Mock(spec=SQLConnectionConfig)
+    mock_upload_config = mocker.Mock(spec=SQLUploaderConfig)
+    return SQLUploader(
+        upload_config=mock_upload_config,
+        connection_config=mock_connection_config,
+        connector_type="sql_test",
+    )
+
+
 @pytest.mark.parametrize(
     ("input_filepath", "output_filename", "expected"),
     [
@@ -72,3 +89,64 @@ def test_run_output_filename_suffix(
         path=output_dir / expected, data=[{"key": "value"}, {"key": "value2"}]
     )
     assert result.name == expected
+
+
+def test_fit_to_schema_drop_columns(mocker: MockerFixture, mock_uploader: SQLUploader):
+    df = pd.DataFrame(
+        {
+            "col1": [1, 2],
+            "col2": [3, 4],
+            "col3": [5, 6],
+        }
+    )
+    mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
+
+    result = mock_uploader._fit_to_schema(df)
+
+    assert "col3" not in result.columns
+    assert "col1" in result.columns
+    assert "col2" in result.columns
+
+
+def test_fit_to_schema_add_missing_columns(mocker: MockerFixture, mock_uploader: SQLUploader):
+    df = pd.DataFrame(
+        {
+            "col1": [1, 2],
+        }
+    )
+    mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
+
+    result = mock_uploader._fit_to_schema(df)
+
+    assert "col2" in result.columns
+    assert result["col2"].isnull().all()
+
+
+def test_fit_to_schema_no_changes(mocker: MockerFixture, mock_uploader: SQLUploader):
+    df = pd.DataFrame(
+        {
+            "col1": [1, 2],
+            "col2": [3, 4],
+        }
+    )
+    mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
+
+    result = mock_uploader._fit_to_schema(df)
+
+    assert "col1" in result.columns
+    assert "col2" in result.columns
+    assert result.equals(df)
+
+
+def test_fit_to_schema_no_add_missing_columns(mocker: MockerFixture, mock_uploader: SQLUploader):
+    df = pd.DataFrame(
+        {
+            "col1": [1, 2],
+        }
+    )
+    mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
+
+    result = mock_uploader._fit_to_schema(df, add_missing_columns=False)
+
+    assert "col2" not in result.columns
+    assert "col1" in result.columns
unstructured_ingest/__version__.py CHANGED

@@ -1 +1 @@
-__version__ = "0.5.
+__version__ = "0.5.20"  # pragma: no cover
unstructured_ingest/v2/processes/connectors/__init__.py CHANGED

@@ -4,6 +4,7 @@ import unstructured_ingest.v2.processes.connectors.databricks  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.duckdb  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.elasticsearch  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.fsspec  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.ibm_watsonx  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.kafka  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.lancedb  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.qdrant  # noqa: F401
@@ -121,4 +122,5 @@ add_source_entry(source_type=DISCORD_CONNECTOR_TYPE, entry=discord_source_entry)
 add_destination_entry(destination_type=REDIS_CONNECTOR_TYPE, entry=redis_destination_entry)

 add_source_entry(source_type=JIRA_CONNECTOR_TYPE, entry=jira_source_entry)
+
 add_source_entry(source_type=ZENDESK_CONNECTOR_TYPE, entry=zendesk_source_entry)
unstructured_ingest/v2/processes/connectors/ibm_watsonx/__init__.py ADDED

@@ -0,0 +1,10 @@
+from __future__ import annotations
+
+from unstructured_ingest.v2.processes.connector_registry import add_destination_entry
+
+from .ibm_watsonx_s3 import CONNECTOR_TYPE as IBM_WATSONX_S3_CONNECTOR_TYPE
+from .ibm_watsonx_s3 import ibm_watsonx_s3_destination_entry
+
+add_destination_entry(
+    destination_type=IBM_WATSONX_S3_CONNECTOR_TYPE, entry=ibm_watsonx_s3_destination_entry
+)
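The registration above is an import side effect: the new `import unstructured_ingest.v2.processes.connectors.ibm_watsonx` line added to `connectors/__init__.py` earlier in this diff is what triggers it. A minimal sketch of what that exposes to downstream code, using only names shown in this release (assumes the package and its base dependencies are importable in your environment):

from unstructured_ingest.v2.processes.connectors.ibm_watsonx import (
    IBM_WATSONX_S3_CONNECTOR_TYPE,  # re-export of CONNECTOR_TYPE from ibm_watsonx_s3.py
)

# Prints "ibm_watsonx_s3", the key under which the destination entry is registered.
print(IBM_WATSONX_S3_CONNECTOR_TYPE)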
unstructured_ingest/v2/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py ADDED

@@ -0,0 +1,301 @@
+import logging
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional, Tuple
+
+import pandas as pd
+from pydantic import Field, Secret
+
+from unstructured_ingest.utils.data_prep import get_data_df
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
+from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    FileData,
+    UploaderConfig,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.sql.sql import (
+    SQLUploader,
+    SQLUploadStager,
+    SQLUploadStagerConfig,
+)
+
+if TYPE_CHECKING:
+    from pyarrow import Table as ArrowTable
+    from pyiceberg.catalog.rest import RestCatalog
+    from pyiceberg.table import Table, Transaction
+
+CONNECTOR_TYPE = "ibm_watsonx_s3"
+
+DEFAULT_IBM_CLOUD_AUTH_URL = "https://iam.cloud.ibm.com/identity/token"
+DEFAULT_ICEBERG_URI_PATH = "/mds/iceberg"
+DEFAULT_ICEBERG_CATALOG_TYPE = "rest"
+
+
+class IcebergCommitFailedException(Exception):
+    """Failed to commit changes to the iceberg table."""
+
+
+class IbmWatsonxAccessConfig(AccessConfig):
+    iam_api_key: str = Field(description="IBM IAM API Key")
+    access_key_id: str = Field(description="Cloud Object Storage HMAC Access Key ID")
+    secret_access_key: str = Field(description="Cloud Object Storage HMAC Secret Access Key")
+
+
+class IbmWatsonxConnectionConfig(ConnectionConfig):
+    access_config: Secret[IbmWatsonxAccessConfig]
+    iceberg_endpoint: str = Field(description="Iceberg REST endpoint")
+    object_storage_endpoint: str = Field(description="Cloud Object Storage public endpoint")
+    object_storage_region: str = Field(description="Cloud Object Storage region")
+    catalog: str = Field(description="Catalog name")
+
+    _bearer_token: Optional[dict[str, Any]] = None
+
+    @property
+    def iceberg_url(self) -> str:
+        return f"https://{self.iceberg_endpoint.strip('/')}{DEFAULT_ICEBERG_URI_PATH}"
+
+    @property
+    def object_storage_url(self) -> str:
+        return f"https://{self.object_storage_endpoint.strip('/')}"
+
+    @property
+    def bearer_token(self) -> str:
+        # Add 60 seconds to deal with edge cases where the token expires before the request is made
+        timestamp = int(time.time()) + 60
+        if self._bearer_token is None or self._bearer_token.get("expiration", 0) <= timestamp:
+            self._bearer_token = self.generate_bearer_token()
+        return self._bearer_token["access_token"]
+
+    @requires_dependencies(["httpx"], extras="ibm-watsonx-s3")
+    def wrap_error(self, e: Exception) -> Exception:
+        import httpx
+
+        if not isinstance(e, httpx.HTTPStatusError):
+            logger.error(f"Unhandled exception from IBM watsonx.data connector: {e}", exc_info=True)
+            return e
+        url = e.request.url
+        response_code = e.response.status_code
+        if response_code == 401:
+            logger.error(
+                f"Failed to authenticate IBM watsonx.data user {url}, status code {response_code}"
+            )
+            return UserAuthError(e)
+        if response_code == 403:
+            logger.error(
+                f"Given IBM watsonx.data user is not authorized {url}, status code {response_code}"
+            )
+            return UserAuthError(e)
+        if 400 <= response_code < 500:
+            logger.error(
+                f"Request to {url} failed"
+                f"in IBM watsonx.data connector, status code {response_code}"
+            )
+            return UserError(e)
+        if response_code > 500:
+            logger.error(
+                f"Request to {url} failed"
+                f"in IBM watsonx.data connector, status code {response_code}"
+            )
+            return ProviderError(e)
+        logger.error(f"Unhandled exception from IBM watsonx.data connector: {e}", exc_info=True)
+        return e
+
+    @requires_dependencies(["httpx"], extras="ibm-watsonx-s3")
+    def generate_bearer_token(self) -> dict[str, Any]:
+        import httpx
+
+        headers = {
+            "Content-Type": "application/x-www-form-urlencoded",
+            "Accept": "application/json",
+        }
+        data = {
+            "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
+            "apikey": self.access_config.get_secret_value().iam_api_key,
+        }
+
+        logger.info("Generating IBM IAM Bearer Token")
+        try:
+            response = httpx.post(DEFAULT_IBM_CLOUD_AUTH_URL, headers=headers, data=data)
+            response.raise_for_status()
+        except Exception as e:
+            raise self.wrap_error(e)
+        return response.json()
+
+    def get_catalog_config(self) -> dict[str, Any]:
+        return {
+            "name": self.catalog,
+            "type": DEFAULT_ICEBERG_CATALOG_TYPE,
+            "uri": self.iceberg_url,
+            "token": self.bearer_token,
+            "warehouse": self.catalog,
+            "s3.endpoint": self.object_storage_url,
+            "s3.access-key-id": self.access_config.get_secret_value().access_key_id,
+            "s3.secret-access-key": self.access_config.get_secret_value().secret_access_key,
+            "s3.region": self.object_storage_region,
+        }
+
+    @requires_dependencies(["pyiceberg"], extras="ibm-watsonx-s3")
+    @contextmanager
+    def get_catalog(self) -> Generator["RestCatalog", None, None]:
+        from pyiceberg.catalog import load_catalog
+
+        try:
+            catalog_config = self.get_catalog_config()
+            catalog = load_catalog(**catalog_config)
+        except Exception as e:
+            logger.error(f"Failed to connect to catalog '{self.catalog}': {e}", exc_info=True)
+            raise ProviderError(f"Failed to connect to catalog '{self.catalog}': {e}")
+
+        yield catalog
+
+
+@dataclass
+class IbmWatsonxUploadStagerConfig(SQLUploadStagerConfig):
+    pass
+
+
+@dataclass
+class IbmWatsonxUploadStager(SQLUploadStager):
+    upload_stager_config: IbmWatsonxUploadStagerConfig = field(
+        default_factory=IbmWatsonxUploadStagerConfig
+    )
+
+
+class IbmWatsonxUploaderConfig(UploaderConfig):
+    namespace: str = Field(description="Namespace name")
+    table: str = Field(description="Table name")
+    max_retries: int = Field(
+        default=5, description="Maximum number of retries to upload data", ge=2, le=10
+    )
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="Searchable key to find entries for the same record on previous runs",
+    )
+
+    @property
+    def table_identifier(self) -> Tuple[str, str]:
+        return (self.namespace, self.table)
+
+
+@dataclass
+class IbmWatsonxUploader(SQLUploader):
+    connection_config: IbmWatsonxConnectionConfig
+    upload_config: IbmWatsonxUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> None:
+        with self.connection_config.get_catalog() as catalog:
+            if not catalog.namespace_exists(self.upload_config.namespace):
+                raise UserError(f"Namespace '{self.upload_config.namespace}' does not exist")
+            if not catalog.table_exists(self.upload_config.table_identifier):
+                raise UserError(
+                    f"Table '{self.upload_config.table}' does not exist in namespace '{self.upload_config.namespace}'"  # noqa: E501
+                )
+
+    @contextmanager
+    def get_table(self) -> Generator["Table", None, None]:
+        with self.connection_config.get_catalog() as catalog:
+            table = catalog.load_table(self.upload_config.table_identifier)
+            yield table
+
+    def get_table_columns(self) -> list[str]:
+        if self._columns is None:
+            with self.get_table() as table:
+                self._columns = table.schema().column_names
+        return self._columns
+
+    def can_delete(self) -> bool:
+        return self.upload_config.record_id_key in self.get_table_columns()
+
+    @requires_dependencies(["pyarrow"], extras="ibm-watsonx-s3")
+    def _df_to_arrow_table(self, df: pd.DataFrame) -> "ArrowTable":
+        import pyarrow as pa
+
+        # Iceberg will automatically fill missing columns with nulls
+        # Iceberg will throw an error if the DataFrame column has only null values
+        # because it can't infer the type of the column and match it with the table schema
+        return pa.Table.from_pandas(self._fit_to_schema(df, add_missing_columns=False))
+
+    @requires_dependencies(["pyiceberg"], extras="ibm-watsonx-s3")
+    def _delete(self, transaction: "Transaction", identifier: str) -> None:
+        from pyiceberg.expressions import EqualTo
+
+        if self.can_delete():
+            transaction.delete(delete_filter=EqualTo(self.upload_config.record_id_key, identifier))
+        else:
+            logger.warning(
+                f"Table doesn't contain expected "
+                f"record id column "
+                f"{self.upload_config.record_id_key}, skipping delete"
+            )
+
+    @requires_dependencies(["pyiceberg", "tenacity"], extras="ibm-watsonx-s3")
+    def upload_data_table(
+        self, table: "Table", data_table: "ArrowTable", file_data: FileData
+    ) -> None:
+        from pyiceberg.exceptions import CommitFailedException
+        from tenacity import (
+            before_log,
+            retry,
+            retry_if_exception_type,
+            stop_after_attempt,
+            wait_random,
+        )
+
+        @retry(
+            stop=stop_after_attempt(self.upload_config.max_retries),
+            wait=wait_random(),
+            retry=retry_if_exception_type(IcebergCommitFailedException),
+            before=before_log(logger, logging.DEBUG),
+            reraise=True,
+        )
+        def _upload_data_table(table: "Table", data_table: "ArrowTable", file_data: FileData):
+            try:
+                with table.transaction() as transaction:
+                    self._delete(transaction, file_data.identifier)
+                    transaction.append(data_table)
+            except CommitFailedException as e:
+                table.refresh()
+                logger.debug(e)
+                raise IcebergCommitFailedException(e)
+            except Exception as e:
+                raise ProviderError(f"Failed to upload data to table: {e}")
+
+        try:
+            return _upload_data_table(table, data_table, file_data)
+        except ProviderError:
+            raise
+        except Exception as e:
+            raise ProviderError(f"Failed to upload data to table: {e}")
+
+    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
+        data_table = self._df_to_arrow_table(df)
+
+        with self.get_table() as table:
+            self.upload_data_table(table, data_table, file_data)
+
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        df = pd.DataFrame(data)
+        self.upload_dataframe(df=df, file_data=file_data)
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        df = get_data_df(path=path)
+        self.upload_dataframe(df=df, file_data=file_data)
+
+
+ibm_watsonx_s3_destination_entry = DestinationRegistryEntry(
+    connection_config=IbmWatsonxConnectionConfig,
+    uploader=IbmWatsonxUploader,
+    uploader_config=IbmWatsonxUploaderConfig,
+    upload_stager=IbmWatsonxUploadStager,
+    upload_stager_config=IbmWatsonxUploadStagerConfig,
+)
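For orientation, a minimal configuration sketch for this new destination, assembled only from the classes and fields defined above; it mirrors how the unit tests construct these objects (passing the access config through pydantic's Secret). The endpoint, region, catalog, namespace, table and credential values are placeholders, and installing the ibm-watsonx-s3 extra is assumed:

from pydantic import Secret

from unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3 import (
    IbmWatsonxAccessConfig,
    IbmWatsonxConnectionConfig,
    IbmWatsonxUploader,
    IbmWatsonxUploaderConfig,
)

uploader = IbmWatsonxUploader(
    connection_config=IbmWatsonxConnectionConfig(
        access_config=Secret(
            IbmWatsonxAccessConfig(
                iam_api_key="<iam-api-key>",  # placeholder
                access_key_id="<cos-hmac-access-key-id>",  # placeholder
                secret_access_key="<cos-hmac-secret-access-key>",  # placeholder
            )
        ),
        iceberg_endpoint="<iceberg-rest-endpoint>",  # placeholder
        object_storage_endpoint="<cos-public-endpoint>",  # placeholder
        object_storage_region="<cos-region>",  # placeholder
        catalog="<catalog-name>",  # placeholder
    ),
    upload_config=IbmWatsonxUploaderConfig(namespace="<namespace>", table="<table>"),
)

# precheck() opens the Iceberg REST catalog and verifies that the namespace and
# table exist, raising UserError otherwise (requires live credentials/network).
uploader.precheck()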
unstructured_ingest/v2/processes/connectors/redisdb.py CHANGED

@@ -110,6 +110,7 @@ class RedisConnectionConfig(ConnectionConfig):

 class RedisUploaderConfig(UploaderConfig):
     batch_size: int = Field(default=100, description="Number of records per batch")
+    key_prefix: str = Field(default="", description="Prefix for Redis keys")


 @dataclass
@@ -145,11 +146,11 @@ class RedisUploader(Uploader):
         async with self.connection_config.create_async_client() as async_client:
             async with async_client.pipeline(transaction=True) as pipe:
                 for element in batch:
-
+                    key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
                     if redis_stack:
-                        pipe.json().set(
+                        pipe.json().set(key_with_prefix, "$", element)
                     else:
-                        pipe.set(
+                        pipe.set(key_with_prefix, json.dumps(element))
                 await pipe.execute()

     @requires_dependencies(["redis"], extras="redis")
@@ -159,16 +160,16 @@ class RedisUploader(Uploader):
         redis_stack = True
         async with self.connection_config.create_async_client() as async_client:
             async with async_client.pipeline(transaction=True) as pipe:
-
+                key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
                 try:
                     # Redis with stack extension supports JSON type
-                    await pipe.json().set(
+                    await pipe.json().set(key_with_prefix, "$", element).execute()
                 except redis_exceptions.ResponseError as e:
                     message = str(e)
                     if "unknown command `JSON.SET`" in message:
                         # if this error occurs, Redis server doesn't support JSON type,
                         # so save as string type instead
-                        await pipe.set(
+                        await pipe.set(key_with_prefix, json.dumps(element)).execute()
                         redis_stack = False
                     else:
                         raise e
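In short, the Redis uploader now stores each element under the key key_prefix + element_id, and the default empty prefix keeps the previous element_id-only keys. A minimal sketch of the new option (the prefix value here is purely illustrative):

from unstructured_ingest.v2.processes.connectors.redisdb import RedisUploaderConfig

upload_config = RedisUploaderConfig(batch_size=100, key_prefix="ingest:")

# An element with element_id "abc123" would be written under the Redis key "ingest:abc123".
print(f"{upload_config.key_prefix}abc123")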
unstructured_ingest/v2/processes/connectors/sql/sql.py CHANGED

@@ -323,7 +323,7 @@ class SQLUploader(Uploader):
             output.append(tuple(parsed))
         return output

-    def _fit_to_schema(self, df: pd.DataFrame) -> pd.DataFrame:
+    def _fit_to_schema(self, df: pd.DataFrame, add_missing_columns: bool = True) -> pd.DataFrame:
         table_columns = self.get_table_columns()
         columns = set(df.columns)
         schema_fields = set(table_columns)
@@ -335,7 +335,7 @@ class SQLUploader(Uploader):
                 "Following columns will be dropped to match the table's schema: "
                 f"{', '.join(columns_to_drop)}"
             )
-        if missing_columns:
+        if missing_columns and add_missing_columns:
             logger.info(
                 "Following null filled columns will be added to match the table's schema:"
                 f" {', '.join(missing_columns)} "
@@ -343,8 +343,9 @@ class SQLUploader(Uploader):

         df = df.drop(columns=columns_to_drop)

-
-
+        if add_missing_columns:
+            for column in missing_columns:
+                df[column] = pd.Series()
         return df

     def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
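For illustration only, the new add_missing_columns switch changes the behaviour of _fit_to_schema roughly as follows. This is a standalone pandas sketch of the logic shown above, not the connector's own code; the helper name and sample data are invented for the example:

import pandas as pd

def fit_to_schema(df: pd.DataFrame, table_columns: list[str], add_missing_columns: bool = True) -> pd.DataFrame:
    # Drop DataFrame columns that the target table does not have.
    df = df.drop(columns=[c for c in df.columns if c not in table_columns])
    if add_missing_columns:
        # Add null-filled columns for table columns missing from the DataFrame.
        for column in (c for c in table_columns if c not in df.columns):
            df[column] = pd.Series()
    return df

df = pd.DataFrame({"col1": [1, 2], "col3": [5, 6]})
print(fit_to_schema(df, ["col1", "col2"]).columns.tolist())                             # ['col1', 'col2']
print(fit_to_schema(df, ["col1", "col2"], add_missing_columns=False).columns.tolist())  # ['col1']

The IBM watsonx.data uploader passes add_missing_columns=False because, per the comments in _df_to_arrow_table above, Iceberg fills missing columns itself and rejects all-null columns whose type it cannot infer.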
{unstructured_ingest-0.5.18.dist-info → unstructured_ingest-0.5.20.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: unstructured-ingest
-Version: 0.5.
+Version: 0.5.20
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -23,12 +23,12 @@ Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
 Requires-Dist: python-dateutil
+Requires-Dist: click
 Requires-Dist: opentelemetry-sdk
+Requires-Dist: pydantic>=2.7
 Requires-Dist: pandas
-Requires-Dist: dataclasses_json
 Requires-Dist: tqdm
-Requires-Dist:
-Requires-Dist: pydantic>=2.7
+Requires-Dist: dataclasses_json
 Provides-Extra: remote
 Requires-Dist: unstructured-client>=0.30.0; extra == "remote"
 Provides-Extra: csv
@@ -86,8 +86,8 @@ Requires-Dist: atlassian-python-api; extra == "confluence"
 Provides-Extra: couchbase
 Requires-Dist: couchbase; extra == "couchbase"
 Provides-Extra: delta-table
-Requires-Dist: deltalake; extra == "delta-table"
 Requires-Dist: boto3; extra == "delta-table"
+Requires-Dist: deltalake; extra == "delta-table"
 Provides-Extra: discord
 Requires-Dist: discord.py; extra == "discord"
 Provides-Extra: dropbox
@@ -99,8 +99,8 @@ Provides-Extra: elasticsearch
 Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
 Provides-Extra: gcs
 Requires-Dist: bs4; extra == "gcs"
-Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: gcsfs; extra == "gcs"
+Requires-Dist: fsspec; extra == "gcs"
 Provides-Extra: github
 Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: requests; extra == "github"
@@ -109,8 +109,13 @@ Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
 Requires-Dist: google-api-python-client; extra == "google-drive"
 Provides-Extra: hubspot
-Requires-Dist: urllib3; extra == "hubspot"
 Requires-Dist: hubspot-api-client; extra == "hubspot"
+Requires-Dist: urllib3; extra == "hubspot"
+Provides-Extra: ibm-watsonx-s3
+Requires-Dist: httpx; extra == "ibm-watsonx-s3"
+Requires-Dist: pyarrow; extra == "ibm-watsonx-s3"
+Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
+Requires-Dist: pyiceberg; extra == "ibm-watsonx-s3"
 Provides-Extra: jira
 Requires-Dist: atlassian-python-api; extra == "jira"
 Provides-Extra: kafka
@@ -125,13 +130,13 @@ Provides-Extra: mongodb
 Requires-Dist: pymongo; extra == "mongodb"
 Provides-Extra: neo4j
 Requires-Dist: networkx; extra == "neo4j"
-Requires-Dist: neo4j-rust-ext; extra == "neo4j"
 Requires-Dist: cymple; extra == "neo4j"
+Requires-Dist: neo4j-rust-ext; extra == "neo4j"
 Provides-Extra: notion
+Requires-Dist: htmlBuilder; extra == "notion"
+Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: httpx; extra == "notion"
 Requires-Dist: backoff; extra == "notion"
-Requires-Dist: notion-client; extra == "notion"
-Requires-Dist: htmlBuilder; extra == "notion"
 Provides-Extra: onedrive
 Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
@@ -152,8 +157,8 @@ Requires-Dist: praw; extra == "reddit"
 Provides-Extra: redis
 Requires-Dist: redis; extra == "redis"
 Provides-Extra: s3
-Requires-Dist: fsspec; extra == "s3"
 Requires-Dist: s3fs; extra == "s3"
+Requires-Dist: fsspec; extra == "s3"
 Provides-Extra: sharepoint
 Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Requires-Dist: msal; extra == "sharepoint"
@@ -165,8 +170,8 @@ Requires-Dist: fsspec; extra == "sftp"
 Provides-Extra: slack
 Requires-Dist: slack_sdk[optional]; extra == "slack"
 Provides-Extra: snowflake
-Requires-Dist: psycopg2-binary; extra == "snowflake"
 Requires-Dist: snowflake-connector-python; extra == "snowflake"
+Requires-Dist: psycopg2-binary; extra == "snowflake"
 Provides-Extra: wikipedia
 Requires-Dist: wikipedia; extra == "wikipedia"
 Provides-Extra: weaviate
@@ -178,17 +183,17 @@ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
 Provides-Extra: singlestore
 Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: vectara
-Requires-Dist: aiofiles; extra == "vectara"
 Requires-Dist: httpx; extra == "vectara"
+Requires-Dist: aiofiles; extra == "vectara"
 Requires-Dist: requests; extra == "vectara"
 Provides-Extra: vastdb
-Requires-Dist: vastdb; extra == "vastdb"
 Requires-Dist: pyarrow; extra == "vastdb"
+Requires-Dist: vastdb; extra == "vastdb"
 Requires-Dist: ibis; extra == "vastdb"
 Provides-Extra: zendesk
+Requires-Dist: bs4; extra == "zendesk"
 Requires-Dist: aiofiles; extra == "zendesk"
 Requires-Dist: httpx; extra == "zendesk"
-Requires-Dist: bs4; extra == "zendesk"
 Provides-Extra: embed-huggingface
 Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Provides-Extra: embed-octoai
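The new ibm-watsonx-s3 extra shown above pulls in httpx, pyarrow, tenacity and pyiceberg. Using standard extras syntax (not something this diff states explicitly), installing the connector's dependencies would presumably look like: pip install "unstructured-ingest[ibm-watsonx-s3]"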
{unstructured_ingest-0.5.18.dist-info → unstructured_ingest-0.5.20.dist-info}/RECORD CHANGED

@@ -21,7 +21,7 @@ test/integration/connectors/test_notion.py,sha256=ueXyVqYWzP4LuZYe6PauptkXNG6qko
 test/integration/connectors/test_onedrive.py,sha256=iwiDK0kWCfQbIEPnWUzzAA5PiCsHcmFZSxEcIZy_6cc,5229
 test/integration/connectors/test_pinecone.py,sha256=9FC0frer7gtDzk5A6OhGsV8S4ggYfa5ReEO9t7L3Am0,13649
 test/integration/connectors/test_qdrant.py,sha256=Yme3ZZ5zIbaZ-yYLUqN2oy0hsrcAfvlleRLYWMSYeSE,8049
-test/integration/connectors/test_redis.py,sha256=
+test/integration/connectors/test_redis.py,sha256=YXWWw4m40ZmLrf3eJ85hhT7WSJnri_GY1ieixIicYlI,5102
 test/integration/connectors/test_s3.py,sha256=E1dypeag_E3OIfpQWIz3jb7ctRHRD63UtyTrzyvJzpc,7473
 test/integration/connectors/test_sharepoint.py,sha256=weGby5YD6se7R7KLEq96hxUZYPzwoqZqXXTPhtQWZsQ,7646
 test/integration/connectors/test_vectara.py,sha256=4kKOOTGUjeZw2jKRcgVDI7ifbRPRZfjjVO4d_7H5C6I,8710
@@ -93,10 +93,12 @@ test/unit/v2/connectors/test_confluence.py,sha256=lN6nnU5qOtmsjIGcz65roepm76w4vP
 test/unit/v2/connectors/test_jira.py,sha256=XEBBDSdNZWUVO5JbpiSsjazJYmbLsgXUOW-APqPRKLg,12113
 test/unit/v2/connectors/databricks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/connectors/databricks/test_volumes_table.py,sha256=-R_EJHqv1BseGRK9VRAZhF-2EXA64LAlhycoyIu556U,1078
+test/unit/v2/connectors/ibm_watsonx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+test/unit/v2/connectors/ibm_watsonx/test_ibm_watsonx_s3.py,sha256=gvgF9vCA_cPQVS_IC6VFvnP4ojFVKOH7eorM6k5VR84,14518
 test/unit/v2/connectors/motherduck/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/connectors/motherduck/test_base.py,sha256=f3W7hppEZ904_I_fKax-5LVDp-0yj04DjF1ccZ4k5O8,2503
 test/unit/v2/connectors/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-test/unit/v2/connectors/sql/test_sql.py,sha256=
+test/unit/v2/connectors/sql/test_sql.py,sha256=wA5LvLtmaCi-8YDOd515j3YnP0_E4qi7z50NFXBn75g,4634
 test/unit/v2/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/embedders/test_bedrock.py,sha256=HMaweO_v_9Y1SE2m5QImXP73cb26vNTUfc1onTBa1-g,1074
 test/unit/v2/embedders/test_huggingface.py,sha256=TOHUKC7hAadl6XTotr8UqOCq28kbQxOIkPSrMxr2PLU,1546
@@ -111,7 +113,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
 test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=
+unstructured_ingest/__version__.py,sha256=BCszjb86jsmMjfakEG2zLAZFKHpLYTR2k5TCe7RzaBc,43
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -426,7 +428,7 @@ unstructured_ingest/v2/processes/embedder.py,sha256=gvlCQDsbQVgcp-2f0Qq4RiFbcr8g
 unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
 unstructured_ingest/v2/processes/partitioner.py,sha256=HxopDSbovLh_1epeGeVtuWEX7v5KG35BowwKIJ_y4e8,9910
 unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
-unstructured_ingest/v2/processes/connectors/__init__.py,sha256=
+unstructured_ingest/v2/processes/connectors/__init__.py,sha256=l4Xq4AuzRMTqUv5TU7cE1NbhGCka4SFJFZwG1FoVotE,6666
 unstructured_ingest/v2/processes/connectors/airtable.py,sha256=eeZJe-bBNxt5Sa-XEFCdcGeJCguJU5WN2Mv9kLp5dVQ,8917
 unstructured_ingest/v2/processes/connectors/astradb.py,sha256=5xc5pWFicE_-2BV38oK-nnzAMI2EzF-q8XAqQ3qPUR8,18249
 unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=ngPDpU0oZ6m5sxIlB6u5ebQpqCS_SJ-_amCC1KQ03EQ,11529
@@ -446,7 +448,7 @@ unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=I-eDLAlThHKKFQfkZpQL
 unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=5rg7t40gKxDHNcuJrJHmVzJ9uM7Ct4RBOvFsfwdGc5c,18002
 unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
 unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=O9lC4mZ9V_exg9apiCJSWHsgkuYDSEOlI6CaUS5ZB7c,13961
-unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=
+unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=0h105_MpOO4-uydiyHgM4TvduSkAMAr931KFANcKW8Y,6936
 unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
 unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=2T9Bm1H_ALwHhG_YP7vsuUUW-mUg61zcaae3aa9BnN4,4827
 unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
@@ -477,6 +479,8 @@ unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=nlDSKHs8mbXCY5B
 unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=qO4WDZPoxmYMbUkaSvrxXaLn3UxzyMVhpj5wVyXqmi4,6623
 unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=ZimcBJL-Har7GOESb9blzDb8pzPZcmh16YvvHYxYkJM,6373
 unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
+unstructured_ingest/v2/processes/connectors/ibm_watsonx/__init__.py,sha256=EMG7lyThrYO8W7y3DIxGgNNXtbpdeAdvLd0m4tpO-Io,377
+unstructured_ingest/v2/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py,sha256=zaj5MVsM-uf7IRgZGg7QwRtzjtTM1gCYuqji61TrqWk,11562
 unstructured_ingest/v2/processes/connectors/kafka/__init__.py,sha256=mQJ9Ex-QCfhz-BB5YWTfbPf7xGLd1i7FpjRr0ukbhNw,754
 unstructured_ingest/v2/processes/connectors/kafka/cloud.py,sha256=GdAeQ8Uz-6v1C5byBHtjfevVfbzW3obScBFFLRTb0ps,3441
 unstructured_ingest/v2/processes/connectors/kafka/kafka.py,sha256=UfS41jzV9VxekS6AwWHhURJmJ7RUAw5iiIrj75BWrXQ,10255
@@ -564,7 +568,7 @@ unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py,sha25
 unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=BATfX1PQGT2kl8jAbdNKXTojYKJxh3pJV9-h3OBnHGo,5124
 unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=OPBDQ2c_5KjWHEFfqXxf3pQ2tWC-N4MtslMulMgP1Wc,5503
 unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=0hfiX_u7V38k_RfoeDmXJp8WIHZ19ilIHnrgZVSleKw,9270
-unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=
+unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=wtVK6CHrQ4McwsPifUoa7KKaY-v0cjDZJetASSAaSIA,15415
 unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=PRjN_S7UQv0k4ZpSyclW1AJrsrugyxbR-GoOrHvBpks,5200
 unstructured_ingest/v2/processes/connectors/sql/vastdb.py,sha256=0rxrb1ByXIefB9umzMTEJbpvzdTttXHK5DjRY97-GG8,9618
 unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWan69KnzVELvaqX34tMhCytIa-C8EDsXVKsEo,856
@@ -577,9 +581,9 @@ unstructured_ingest/v2/processes/connectors/zendesk/client.py,sha256=DDAYQB7catK
 unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py,sha256=R8SXYkRhVUoWEHdGCt2CzcTxxuFundw_0GlGZ34YmbM,8987
 unstructured_ingest/v2/processes/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/processes/utils/blob_storage.py,sha256=EWvK4HRYubr9i1UyMhv5cU9u0UzVkCDC_BIm4Uxab7Y,964
-unstructured_ingest-0.5.
-unstructured_ingest-0.5.
-unstructured_ingest-0.5.
-unstructured_ingest-0.5.
-unstructured_ingest-0.5.
-unstructured_ingest-0.5.
+unstructured_ingest-0.5.20.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.5.20.dist-info/METADATA,sha256=S2Yr62sVeW0csT-QRyonnokiHFvvH0FAwQ2x02BqAeM,8697
+unstructured_ingest-0.5.20.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+unstructured_ingest-0.5.20.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.5.20.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.5.20.dist-info/RECORD,,
{unstructured_ingest-0.5.18.dist-info → unstructured_ingest-0.5.20.dist-info}/LICENSE.md RENAMED (file without changes)
{unstructured_ingest-0.5.18.dist-info → unstructured_ingest-0.5.20.dist-info}/WHEEL RENAMED (file without changes)
{unstructured_ingest-0.5.18.dist-info → unstructured_ingest-0.5.20.dist-info}/entry_points.txt RENAMED (file without changes)
{unstructured_ingest-0.5.18.dist-info → unstructured_ingest-0.5.20.dist-info}/top_level.txt RENAMED (file without changes)