unstructured-ingest 0.5.19__py3-none-any.whl → 0.5.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of unstructured-ingest has been flagged as potentially problematic.
- test/integration/connectors/test_astradb.py +8 -2
- test/unit/v2/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/v2/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +459 -0
- test/unit/v2/connectors/sql/test_sql.py +79 -1
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/interfaces.py +7 -3
- unstructured_ingest/utils/data_prep.py +17 -5
- unstructured_ingest/utils/table.py +11 -4
- unstructured_ingest/v2/processes/connectors/__init__.py +2 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +8 -3
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +4 -3
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +5 -2
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +5 -2
- unstructured_ingest/v2/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/v2/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +301 -0
- unstructured_ingest/v2/processes/connectors/kdbai.py +6 -3
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +10 -2
- unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +5 -3
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -1
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +7 -3
- unstructured_ingest/v2/processes/connectors/sql/sql.py +26 -12
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -1
- unstructured_ingest/v2/processes/connectors/sql/vastdb.py +5 -7
- {unstructured_ingest-0.5.19.dist-info → unstructured_ingest-0.5.21.dist-info}/METADATA +174 -18
- {unstructured_ingest-0.5.19.dist-info → unstructured_ingest-0.5.21.dist-info}/RECORD +29 -25
- {unstructured_ingest-0.5.19.dist-info → unstructured_ingest-0.5.21.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.19.dist-info → unstructured_ingest-0.5.21.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.19.dist-info → unstructured_ingest-0.5.21.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.19.dist-info → unstructured_ingest-0.5.21.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,4 @@
+import contextlib
 import json
 import os
 from dataclasses import dataclass
@@ -231,6 +232,13 @@ def test_astra_create_destination():
     )
     collection_name = "system_created-123"
     formatted_collection_name = "system_created_123"
+
+    client = AstraDBClient()
+    db = client.get_database(api_endpoint=env_data.api_endpoint, token=env_data.token)
+    with contextlib.suppress(Exception):
+        # drop collection before trying to create it
+        db.drop_collection(formatted_collection_name)
+
     created = uploader.create_destination(destination_name=collection_name, vector_length=3072)
     assert created
     assert uploader.upload_config.collection_name == formatted_collection_name
@@ -239,8 +247,6 @@ def test_astra_create_destination():
     assert not created
 
     # cleanup
-    client = AstraDBClient()
-    db = client.get_database(api_endpoint=env_data.api_endpoint, token=env_data.token)
    db.drop_collection(formatted_collection_name)
 
 
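Note: the relocated setup makes the test idempotent; any collection left over from a failed run is dropped before `create_destination` is called, and `contextlib.suppress(Exception)` swallows the error when nothing exists yet. A minimal standalone sketch of the pattern (the client class is a hypothetical stand-in, not the AstraDB API):

    import contextlib

    class FakeCollectionClient:
        """Hypothetical stand-in for a client whose drop call raises when absent."""

        def drop_collection(self, name: str) -> None:
            raise RuntimeError(f"collection {name!r} does not exist")

    client = FakeCollectionClient()
    # Safe to re-run: a missing collection no longer fails the setup.
    with contextlib.suppress(Exception):
        client.drop_collection("system_created_123")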
File without changes: test/unit/v2/connectors/ibm_watsonx/__init__.py (new, empty).
@@ -0,0 +1,459 @@
+import time
+from unittest.mock import MagicMock
+
+import pandas as pd
+import pytest
+from pydantic import Secret
+from pyiceberg.exceptions import CommitFailedException
+from pytest_mock import MockerFixture
+
+from unstructured_ingest.v2.errors import ProviderError, UserError
+from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.ibm_watsonx import IBM_WATSONX_S3_CONNECTOR_TYPE
+from unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3 import (
+    IbmWatsonxAccessConfig,
+    IbmWatsonxConnectionConfig,
+    IbmWatsonxUploader,
+    IbmWatsonxUploaderConfig,
+)
+
+
+@pytest.fixture
+def file_data():
+    return FileData(
+        identifier="test_identifier",
+        connector_type=IBM_WATSONX_S3_CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(
+            filename="test_file.pdf", fullpath="/tmp/test_file.pdf"
+        ),
+    )
+
+
+@pytest.fixture
+def access_config():
+    return IbmWatsonxAccessConfig(
+        iam_api_key="test_iam_api_key",
+        access_key_id="test_access_key_id",
+        secret_access_key="test_secret_access_key",
+    )
+
+
+@pytest.fixture
+def connection_config(access_config: IbmWatsonxAccessConfig):
+    return IbmWatsonxConnectionConfig(
+        access_config=Secret(access_config),
+        iceberg_endpoint="test_iceberg_endpoint/",
+        object_storage_endpoint="test_object_storage_endpoint/",
+        object_storage_region="test_region",
+        catalog="test_catalog",
+    )
+
+
+@pytest.fixture
+def uploader_config():
+    return IbmWatsonxUploaderConfig(
+        namespace="test_namespace",
+        table="test_table",
+        record_id_key="test_record_id_key",
+    )
+
+
+@pytest.fixture
+def uploader(
+    connection_config: IbmWatsonxConnectionConfig, uploader_config: IbmWatsonxUploaderConfig
+):
+    return IbmWatsonxUploader(
+        connection_config=connection_config,
+        upload_config=uploader_config,
+    )
+
+
+@pytest.fixture
+def mock_catalog(mocker: MockerFixture):
+    mock_catalog = mocker.MagicMock()
+    mock_catalog.namespace_exists.return_value = True
+    mock_catalog.table_exists.return_value = True
+    return mock_catalog
+
+
+@pytest.fixture
+def mock_get_catalog(mocker: MockerFixture, mock_catalog: MagicMock):
+    mock_get_catalog = mocker.patch.context_manager(
+        IbmWatsonxConnectionConfig, "get_catalog", autospec=True
+    )
+    mock_get_catalog.return_value.__enter__.return_value = mock_catalog
+    return mock_get_catalog
+
+
+@pytest.fixture
+def mock_table(mocker: MockerFixture):
+    mock_table = mocker.MagicMock()
+    return mock_table
+
+
+@pytest.fixture
+def mock_get_table(mocker: MockerFixture, mock_table: MagicMock):
+    mock_get_table = mocker.patch.context_manager(IbmWatsonxUploader, "get_table", autospec=True)
+    mock_get_table.return_value.__enter__.return_value = mock_table
+    return mock_get_table
+
+
+@pytest.fixture
+def mock_transaction(mocker: MockerFixture, mock_table: MagicMock):
+    mock_transaction = mocker.MagicMock()
+    mock_table.transaction.return_value.__enter__.return_value = mock_transaction
+    return mock_transaction
+
+
+@pytest.fixture
+def mock_data_table(mocker: MockerFixture):
+    mock_data_table = mocker.MagicMock()
+    mock_data_table.schema = "schema"
+    return mock_data_table
+
+
+@pytest.fixture
+def mock_delete(mocker: MockerFixture):
+    return mocker.patch.object(IbmWatsonxUploader, "_delete")
+
+
+@pytest.fixture
+def test_df():
+    return pd.DataFrame(
+        {
+            "test_column_0": [True, False, True],
+            "test_column_1": [1, 2, 3],
+            "test_column_2": ["a", "b", "c"],
+        }
+    )
+
+
+@pytest.fixture
+def timestamp_now():
+    return int(time.time())
+
+
+def test_ibm_watsonx_connection_config_iceberg_url(
+    mocker: MockerFixture,
+    connection_config: IbmWatsonxConnectionConfig,
+):
+    mocker.patch(
+        "unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3.DEFAULT_ICEBERG_URI_PATH",  # noqa: E501
+        new="/mds/iceberg",
+    )
+    expected_url = "https://test_iceberg_endpoint/mds/iceberg"
+    assert connection_config.iceberg_url == expected_url
+
+
+def test_ibm_watsonx_connection_config_object_storage_url(
+    connection_config: IbmWatsonxConnectionConfig,
+):
+    expected_url = "https://test_object_storage_endpoint"
+    assert connection_config.object_storage_url == expected_url
+
+
+def test_ibm_watsonx_connection_config_bearer_token_new_token(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
+):
+    mock_generate_bearer_token = mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "generate_bearer_token",
+        return_value={"access_token": "new_token", "expiration": timestamp_now + 3600},
+    )
+    token = connection_config.bearer_token
+    assert token == "new_token"
+    mock_generate_bearer_token.assert_called_once()
+
+
+def test_ibm_watsonx_connection_config_bearer_token_existing_token(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
+):
+    connection_config._bearer_token = {
+        "access_token": "existing_token",
+        "expiration": timestamp_now + 3600,
+    }
+    mock_generate_bearer_token = mocker.patch.object(
+        IbmWatsonxConnectionConfig, "generate_bearer_token"
+    )
+    token = connection_config.bearer_token
+    assert token == "existing_token"
+    mock_generate_bearer_token.assert_not_called()
+
+
+def test_ibm_watsonx_connection_config_bearer_token_expired_token(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
+):
+    connection_config._bearer_token = {
+        "access_token": "expired_token",
+        "expiration": timestamp_now - 3600,
+    }
+    mock_generate_bearer_token = mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "generate_bearer_token",
+        return_value={"access_token": "new_token", "expiration": timestamp_now + 3600},
+    )
+    token = connection_config.bearer_token
+    assert token == "new_token"
+    mock_generate_bearer_token.assert_called_once()
+
+
+def test_ibm_watsonx_connection_config_bearer_token_soon_to_expire_token(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
+):
+    connection_config._bearer_token = {
+        "access_token": "soon_to_expire_token",
+        "expiration": timestamp_now + 60,
+    }
+    mock_generate_bearer_token = mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "generate_bearer_token",
+        return_value={"access_token": "new_token", "expiration": timestamp_now + 3600},
+    )
+    token = connection_config.bearer_token
+    assert token == "new_token"
+    mock_generate_bearer_token.assert_called_once()
+
+
+def test_ibm_watsonx_connection_config_get_catalog_success(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig
+):
+    mocker.patch(
+        "unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3.DEFAULT_ICEBERG_URI_PATH",  # noqa: E501
+        new="/mds/iceberg",
+    )
+    mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "bearer_token",
+        new="test_bearer_token",
+    )
+    mock_load_catalog = mocker.patch("pyiceberg.catalog.load_catalog")
+
+    with connection_config.get_catalog() as catalog:
+        assert catalog is not None
+    mock_load_catalog.assert_called_once_with(
+        **{
+            "name": "test_catalog",
+            "type": "rest",
+            "uri": "https://test_iceberg_endpoint/mds/iceberg",
+            "token": "test_bearer_token",
+            "warehouse": "test_catalog",
+            "s3.endpoint": "https://test_object_storage_endpoint",
+            "s3.access-key-id": "test_access_key_id",
+            "s3.secret-access-key": "test_secret_access_key",
+            "s3.region": "test_region",
+        }
+    )
+
+
+def test_ibm_watsonx_connection_config_get_catalog_failure(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig
+):
+    mocker.patch(
+        "pyiceberg.catalog.load_catalog",
+        side_effect=Exception("Connection error"),
+    )
+    mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "bearer_token",
+        new="test_bearer_token",
+    )
+    with pytest.raises(ProviderError):
+        with connection_config.get_catalog():
+            pass
+
+
+def test_ibm_watsonx_uploader_precheck_namespace_exists_table_exists(
+    mock_get_catalog: MagicMock,
+    mock_catalog: MagicMock,
+    uploader: IbmWatsonxUploader,
+):
+    uploader.precheck()
+
+    mock_catalog.namespace_exists.assert_called_once_with("test_namespace")
+    mock_catalog.table_exists.assert_called_once_with(("test_namespace", "test_table"))
+
+
+def test_ibm_watsonx_uploader_precheck_namespace_does_not_exist(
+    mock_get_catalog: MagicMock,
+    mock_catalog: MagicMock,
+    uploader: IbmWatsonxUploader,
+):
+    mock_catalog.namespace_exists.return_value = False
+
+    with pytest.raises(UserError, match="Namespace 'test_namespace' does not exist"):
+        uploader.precheck()
+
+    mock_catalog.namespace_exists.assert_called_once_with("test_namespace")
+    mock_catalog.table_exists.assert_not_called()
+
+
+def test_ibm_watsonx_uploader_precheck_table_does_not_exist(
+    mock_get_catalog: MagicMock,
+    mock_catalog: MagicMock,
+    uploader: IbmWatsonxUploader,
+):
+    mock_catalog.table_exists.return_value = False
+
+    with pytest.raises(
+        UserError,
+        match="Table 'test_table' does not exist in namespace 'test_namespace'",
+    ):
+        uploader.precheck()
+
+    mock_catalog.namespace_exists.assert_called_once_with("test_namespace")
+    mock_catalog.table_exists.assert_called_once_with(("test_namespace", "test_table"))
+
+
+def test_ibm_watsonx_uploader_upload_data_table_success(
+    uploader: IbmWatsonxUploader,
+    mock_table: MagicMock,
+    mock_transaction: MagicMock,
+    mock_data_table: MagicMock,
+    mock_delete: MagicMock,
+    file_data: FileData,
+):
+    uploader.upload_data_table(mock_table, mock_data_table, file_data)
+
+    mock_delete.assert_called_once_with(mock_transaction, "test_identifier")
+    mock_transaction.append.assert_called_once_with(mock_data_table)
+
+
+def test_ibm_watsonx_uploader_upload_data_table_commit_exception(
+    uploader: IbmWatsonxUploader,
+    mock_table: MagicMock,
+    mock_transaction: MagicMock,
+    mock_data_table: MagicMock,
+    mock_delete: MagicMock,
+    file_data: FileData,
+):
+    mock_transaction.append.side_effect = CommitFailedException()
+
+    with pytest.raises(ProviderError):
+        uploader.upload_data_table(mock_table, mock_data_table, file_data)
+    assert mock_table.refresh.call_count == 5
+
+
+def test_ibm_watsonx_uploader_upload_data_table_exception(
+    uploader: IbmWatsonxUploader,
+    mock_table: MagicMock,
+    mock_transaction: MagicMock,
+    mock_data_table: MagicMock,
+    mock_delete: MagicMock,
+    file_data: FileData,
+):
+    mock_transaction.append.side_effect = Exception()
+
+    with pytest.raises(ProviderError):
+        uploader.upload_data_table(mock_table, mock_data_table, file_data)
+    assert mock_table.refresh.call_count == 0
+
+
+def test_ibm_watsonx_uploader_df_to_arrow_table(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+    test_df: pd.DataFrame,
+):
+    mock_fit_to_schema = mocker.patch.object(
+        IbmWatsonxUploader, "_fit_to_schema", return_value=test_df
+    )
+
+    result = uploader._df_to_arrow_table(test_df)
+
+    mock_fit_to_schema.assert_called_once_with(test_df, add_missing_columns=False)
+    assert len(result.column_names) == 3
+    assert "test_column_0" in result.column_names
+    assert "test_column_1" in result.column_names
+    assert "test_column_2" in result.column_names
+
+
+def test_ibm_watsonx_uploader_can_delete_column_exists(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+):
+    mocker.patch.object(
+        IbmWatsonxUploader, "get_table_columns", return_value=["test_record_id_key"]
+    )
+
+    assert uploader.can_delete() is True
+
+
+def test_ibm_watsonx_uploader_can_delete_column_does_not_exist(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+):
+    mocker.patch.object(IbmWatsonxUploader, "get_table_columns", return_value=["other_column"])
+
+    assert uploader.can_delete() is False
+
+
+def test_ibm_watsonx_uploader_get_table_columns_cache(
+    uploader: IbmWatsonxUploader,
+):
+    uploader._columns = ["cached_column"]
+
+    result = uploader.get_table_columns()
+
+    assert result == ["cached_column"]
+
+
+def test_ibm_watsonx_uploader_get_table_columns_no_cache(
+    uploader: IbmWatsonxUploader,
+    mock_get_table: MagicMock,
+    mock_table: MagicMock,
+):
+    uploader._columns = None
+    mock_table.schema.return_value.column_names = ["column_1", "column_2"]
+
+    result = uploader.get_table_columns()
+
+    mock_get_table.assert_called_once()
+    assert result == ["column_1", "column_2"]
+    assert uploader._columns == ["column_1", "column_2"]
+
+
+def test_ibm_watsonx_uploader_upload_dataframe_success(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+    test_df: pd.DataFrame,
+    mock_get_table: MagicMock,
+    mock_table: MagicMock,
+    mock_data_table: MagicMock,
+    file_data: FileData,
+):
+    mocker.patch.object(IbmWatsonxUploader, "_df_to_arrow_table", return_value=mock_data_table)
+    mock_upload_data_table = mocker.patch.object(IbmWatsonxUploader, "upload_data_table")
+
+    uploader.upload_dataframe(test_df, file_data)
+
+    mock_get_table.assert_called_once()
+    mock_upload_data_table.assert_called_once_with(mock_table, mock_data_table, file_data)
+
+
+def test_ibm_watsonx_uploader_delete_can_delete(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+    mock_transaction: MagicMock,
+):
+    mocker.patch.object(IbmWatsonxUploader, "can_delete", return_value=True)
+    mock_equal_to = mocker.patch("pyiceberg.expressions.EqualTo")
+
+    uploader._delete(mock_transaction, "test_identifier")
+
+    mock_equal_to.assert_called_once_with("test_record_id_key", "test_identifier")
+    mock_transaction.delete.assert_called_once_with(delete_filter=mock_equal_to.return_value)
+
+
+def test_ibm_watsonx_uploader_delete_cannot_delete(
+    caplog: pytest.LogCaptureFixture,
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+    mock_transaction: MagicMock,
+):
+    mocker.patch.object(IbmWatsonxUploader, "can_delete", return_value=False)
+
+    uploader._delete(mock_transaction, "test_identifier")
+    mock_transaction.delete.assert_not_called()
+    assert (
+        "Table doesn't contain expected record id column test_record_id_key, skipping delete"
+        in caplog.text
+    )
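Note: the four `bearer_token` tests above pin down a cache-and-refresh contract: a cached token is reused while valid, but regenerated when absent, expired, or near expiry (a token 60 seconds from expiry is refreshed; the exact margin is not visible in this diff). A sketch of logic consistent with those tests, with an assumed margin:

    import time
    from typing import Callable, Optional

    ASSUMED_REFRESH_MARGIN = 300  # seconds; hypothetical, the diff does not show the real value

    class BearerTokenCache:
        """Sketch of the cache-and-refresh behavior the tests exercise."""

        def __init__(self, generate: Callable[[], dict]):
            self._generate = generate  # returns {"access_token": str, "expiration": int}
            self._token: Optional[dict] = None

        @property
        def bearer_token(self) -> str:
            now = int(time.time())
            # Refresh when nothing is cached, or the token is expired or close to it.
            if self._token is None or self._token["expiration"] - now <= ASSUMED_REFRESH_MARGIN:
                self._token = self._generate()
            return self._token["access_token"]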
@@ -1,10 +1,16 @@
 from pathlib import Path
 
+import pandas as pd
 import pytest
 from pytest_mock import MockerFixture
 
 from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
-from unstructured_ingest.v2.processes.connectors.sql.sql import SQLUploadStager
+from unstructured_ingest.v2.processes.connectors.sql.sql import (
+    SQLConnectionConfig,
+    SQLUploader,
+    SQLUploaderConfig,
+    SQLUploadStager,
+)
 
 
 @pytest.fixture
@@ -12,6 +18,17 @@ def mock_instance() -> SQLUploadStager:
     return SQLUploadStager()
 
 
+@pytest.fixture
+def mock_uploader(mocker: MockerFixture) -> SQLUploader:
+    mock_connection_config = mocker.Mock(spec=SQLConnectionConfig)
+    mock_upload_config = mocker.Mock(spec=SQLUploaderConfig)
+    return SQLUploader(
+        upload_config=mock_upload_config,
+        connection_config=mock_connection_config,
+        connector_type="sql_test",
+    )
+
+
 @pytest.mark.parametrize(
     ("input_filepath", "output_filename", "expected"),
     [
@@ -72,3 +89,64 @@ def test_run_output_filename_suffix(
         path=output_dir / expected, data=[{"key": "value"}, {"key": "value2"}]
     )
     assert result.name == expected
+
+
+def test_fit_to_schema_drop_columns(mocker: MockerFixture, mock_uploader: SQLUploader):
+    df = pd.DataFrame(
+        {
+            "col1": [1, 2],
+            "col2": [3, 4],
+            "col3": [5, 6],
+        }
+    )
+    mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
+
+    result = mock_uploader._fit_to_schema(df)
+
+    assert "col3" not in result.columns
+    assert "col1" in result.columns
+    assert "col2" in result.columns
+
+
+def test_fit_to_schema_add_missing_columns(mocker: MockerFixture, mock_uploader: SQLUploader):
+    df = pd.DataFrame(
+        {
+            "col1": [1, 2],
+        }
+    )
+    mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
+
+    result = mock_uploader._fit_to_schema(df)
+
+    assert "col2" in result.columns
+    assert result["col2"].isnull().all()
+
+
+def test_fit_to_schema_no_changes(mocker: MockerFixture, mock_uploader: SQLUploader):
+    df = pd.DataFrame(
+        {
+            "col1": [1, 2],
+            "col2": [3, 4],
+        }
+    )
+    mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
+
+    result = mock_uploader._fit_to_schema(df)
+
+    assert "col1" in result.columns
+    assert "col2" in result.columns
+    assert result.equals(df)
+
+
+def test_fit_to_schema_no_add_missing_columns(mocker: MockerFixture, mock_uploader: SQLUploader):
+    df = pd.DataFrame(
+        {
+            "col1": [1, 2],
+        }
+    )
+    mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
+
+    result = mock_uploader._fit_to_schema(df, add_missing_columns=False)
+
+    assert "col2" not in result.columns
+    assert "col1" in result.columns
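Note: taken together, the four `_fit_to_schema` tests specify the contract: DataFrame columns missing from the destination table are dropped, and table columns missing from the DataFrame are added as all-null unless `add_missing_columns=False`. A sketch consistent with that contract (not the library's actual implementation):

    import pandas as pd

    def fit_to_schema(df: pd.DataFrame, table_columns: list[str],
                      add_missing_columns: bool = True) -> pd.DataFrame:
        # Drop DataFrame columns the destination table does not have.
        result = df.drop(columns=[c for c in df.columns if c not in table_columns])
        if add_missing_columns:
            # Add table columns the DataFrame lacks, filled with nulls.
            for column in table_columns:
                if column not in result.columns:
                    result[column] = pd.Series(dtype=object)
        return result

    frame = pd.DataFrame({"col1": [1, 2], "col3": [5, 6]})
    print(fit_to_schema(frame, ["col1", "col2"]))  # col3 dropped, col2 all-NaN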
@@ -1 +1 @@
-__version__ = "0.5.19"  # pragma: no cover
+__version__ = "0.5.21"  # pragma: no cover
@@ -2,10 +2,10 @@ from abc import ABC
 from dataclasses import dataclass
 from typing import Any, Optional
 
-import numpy as np
 from pydantic import BaseModel, Field
 
 from unstructured_ingest.utils.data_prep import batch_generator
+from unstructured_ingest.utils.dep_check import requires_dependencies
 
 EMBEDDINGS_KEY = "embeddings"
 
@@ -32,7 +32,6 @@ class BaseEncoder(ABC):
 
 @dataclass
 class BaseEmbeddingEncoder(BaseEncoder, ABC):
-
     def initialize(self):
         """Initializes the embedding encoder class. Should also validate the instance
         is properly configured: e.g., embed a single a element"""
@@ -46,8 +45,11 @@ class BaseEmbeddingEncoder(BaseEncoder, ABC):
         return self.embed_query(query="Q")
 
     @property
+    @requires_dependencies(["numpy"])
     def is_unit_vector(self) -> bool:
         """Denotes if the embedding vector is a unit vector."""
+        import numpy as np
+
         exemplary_embedding = self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0, rtol=1e-03)
 
@@ -86,7 +88,6 @@ class BaseEmbeddingEncoder(BaseEncoder, ABC):
 
 @dataclass
 class AsyncBaseEmbeddingEncoder(BaseEncoder, ABC):
-
     async def initialize(self):
         """Initializes the embedding encoder class. Should also validate the instance
         is properly configured: e.g., embed a single a element"""
@@ -100,8 +101,11 @@ class AsyncBaseEmbeddingEncoder(BaseEncoder, ABC):
         return await self.embed_query(query="Q")
 
     @property
+    @requires_dependencies(["numpy"])
     async def is_unit_vector(self) -> bool:
         """Denotes if the embedding vector is a unit vector."""
+        import numpy as np
+
         exemplary_embedding = await self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0, rtol=1e-03)
 
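Note: both `is_unit_vector` hunks apply the same optional-dependency recipe: the module-level `import numpy as np` moves inside the method, so importing the module no longer requires numpy, while `requires_dependencies` presumably fails fast with a clear message when the package is absent. A minimal sketch of such a decorator (an assumption about its behavior, not the package's implementation):

    import importlib.util
    from functools import wraps

    def requires_dependencies(deps: list[str]):
        """Sketch: raise a readable error when optional packages are missing."""
        def decorator(func):
            @wraps(func)
            def wrapper(*args, **kwargs):
                missing = [dep for dep in deps if importlib.util.find_spec(dep) is None]
                if missing:
                    raise ImportError(f"{func.__name__} requires extra packages: {missing}")
                return func(*args, **kwargs)
            return wrapper
        return decorator

    @requires_dependencies(["numpy"])
    def is_unit_vector(vector) -> bool:
        import numpy as np  # deferred: module import no longer needs numpy

        return bool(np.isclose(np.linalg.norm(vector), 1.0, rtol=1e-03))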
@@ -2,20 +2,22 @@ import itertools
 import json
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, Union, cast
-
-import pandas as pd
+from typing import TYPE_CHECKING, Any, Generator, Iterable, Optional, Sequence, TypeVar, Union, cast
 
 from unstructured_ingest.utils import ndjson
+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.logger import logger
 
+if TYPE_CHECKING:
+    from pandas import DataFrame
+
 DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
 
 T = TypeVar("T")
 IterableT = Iterable[T]
 
 
-def split_dataframe(df: pd.DataFrame, chunk_size: int = 100) -> Generator[pd.DataFrame, None, None]:
+def split_dataframe(df: "DataFrame", chunk_size: int = 100) -> Generator["DataFrame", None, None]:
     num_chunks = len(df) // chunk_size + 1
     for i in range(num_chunks):
         yield df[i * chunk_size : (i + 1) * chunk_size]
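Note: the `TYPE_CHECKING` guard removes pandas from the runtime import path while keeping the annotations: `"DataFrame"` is a string annotation that only static type checkers resolve. A minimal sketch of the same pattern:

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:  # true only under static type checkers, never at runtime
        from pandas import DataFrame

    def first_chunk(df: "DataFrame", chunk_size: int = 100) -> "DataFrame":
        # pandas is never imported at runtime; the string annotation is
        # resolved only by tools like mypy, so this module imports cleanly
        # even in environments without pandas installed.
        return df[:chunk_size]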
@@ -144,9 +146,13 @@ def get_data_by_suffix(path: Path) -> list[dict]:
         elif path.suffix == ".ndjson":
             return ndjson.load(f)
         elif path.suffix == ".csv":
+            import pandas as pd
+
             df = pd.read_csv(path)
             return df.to_dict(orient="records")
         elif path.suffix == ".parquet":
+            import pandas as pd
+
             df = pd.read_parquet(path)
             return df.to_dict(orient="records")
         else:
@@ -180,6 +186,9 @@ def get_data(path: Union[Path, str]) -> list[dict]:
             return ndjson.load(f)
     except Exception as e:
         logger.warning(f"failed to read {path} as ndjson: {e}")
+
+    import pandas as pd
+
     try:
         df = pd.read_csv(path)
         return df.to_dict(orient="records")
@@ -202,7 +211,10 @@ def get_json_data(path: Path) -> list[dict]:
             raise ValueError(f"Unsupported file type: {path}")
 
 
-def get_data_df(path: Path) -> pd.DataFrame:
+@requires_dependencies(["pandas"])
+def get_data_df(path: Path) -> "DataFrame":
+    import pandas as pd
+
     with path.open() as f:
         if path.suffix == ".json":
             data = json.load(f)