unstructured-ingest 0.5.19-py3-none-any.whl → 0.5.20-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

test/unit/v2/connectors/ibm_watsonx/test_ibm_watsonx_s3.py (new file)
@@ -0,0 +1,459 @@
+import time
+from unittest.mock import MagicMock
+
+import pandas as pd
+import pytest
+from pydantic import Secret
+from pyiceberg.exceptions import CommitFailedException
+from pytest_mock import MockerFixture
+
+from unstructured_ingest.v2.errors import ProviderError, UserError
+from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.ibm_watsonx import IBM_WATSONX_S3_CONNECTOR_TYPE
+from unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3 import (
+    IbmWatsonxAccessConfig,
+    IbmWatsonxConnectionConfig,
+    IbmWatsonxUploader,
+    IbmWatsonxUploaderConfig,
+)
+
+
+@pytest.fixture
+def file_data():
+    return FileData(
+        identifier="test_identifier",
+        connector_type=IBM_WATSONX_S3_CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(
+            filename="test_file.pdf", fullpath="/tmp/test_file.pdf"
+        ),
+    )
+
+
+@pytest.fixture
+def access_config():
+    return IbmWatsonxAccessConfig(
+        iam_api_key="test_iam_api_key",
+        access_key_id="test_access_key_id",
+        secret_access_key="test_secret_access_key",
+    )
+
+
+@pytest.fixture
+def connection_config(access_config: IbmWatsonxAccessConfig):
+    return IbmWatsonxConnectionConfig(
+        access_config=Secret(access_config),
+        iceberg_endpoint="test_iceberg_endpoint/",
+        object_storage_endpoint="test_object_storage_endpoint/",
+        object_storage_region="test_region",
+        catalog="test_catalog",
+    )
+
+
+@pytest.fixture
+def uploader_config():
+    return IbmWatsonxUploaderConfig(
+        namespace="test_namespace",
+        table="test_table",
+        record_id_key="test_record_id_key",
+    )
+
+
+@pytest.fixture
+def uploader(
+    connection_config: IbmWatsonxConnectionConfig, uploader_config: IbmWatsonxUploaderConfig
+):
+    return IbmWatsonxUploader(
+        connection_config=connection_config,
+        upload_config=uploader_config,
+    )
+
+
+@pytest.fixture
+def mock_catalog(mocker: MockerFixture):
+    mock_catalog = mocker.MagicMock()
+    mock_catalog.namespace_exists.return_value = True
+    mock_catalog.table_exists.return_value = True
+    return mock_catalog
+
+
+@pytest.fixture
+def mock_get_catalog(mocker: MockerFixture, mock_catalog: MagicMock):
+    mock_get_catalog = mocker.patch.context_manager(
+        IbmWatsonxConnectionConfig, "get_catalog", autospec=True
+    )
+    mock_get_catalog.return_value.__enter__.return_value = mock_catalog
+    return mock_get_catalog
+
+
+@pytest.fixture
+def mock_table(mocker: MockerFixture):
+    mock_table = mocker.MagicMock()
+    return mock_table
+
+
+@pytest.fixture
+def mock_get_table(mocker: MockerFixture, mock_table: MagicMock):
+    mock_get_table = mocker.patch.context_manager(IbmWatsonxUploader, "get_table", autospec=True)
+    mock_get_table.return_value.__enter__.return_value = mock_table
+    return mock_get_table
+
+
+@pytest.fixture
+def mock_transaction(mocker: MockerFixture, mock_table: MagicMock):
+    mock_transaction = mocker.MagicMock()
+    mock_table.transaction.return_value.__enter__.return_value = mock_transaction
+    return mock_transaction
+
+
+@pytest.fixture
+def mock_data_table(mocker: MockerFixture):
+    mock_data_table = mocker.MagicMock()
+    mock_data_table.schema = "schema"
+    return mock_data_table
+
+
+@pytest.fixture
+def mock_delete(mocker: MockerFixture):
+    return mocker.patch.object(IbmWatsonxUploader, "_delete")
+
+
+@pytest.fixture
+def test_df():
+    return pd.DataFrame(
+        {
+            "test_column_0": [True, False, True],
+            "test_column_1": [1, 2, 3],
+            "test_column_2": ["a", "b", "c"],
+        }
+    )
+
+
+@pytest.fixture
+def timestamp_now():
+    return int(time.time())
+
+
+def test_ibm_watsonx_connection_config_iceberg_url(
+    mocker: MockerFixture,
+    connection_config: IbmWatsonxConnectionConfig,
+):
+    mocker.patch(
+        "unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3.DEFAULT_ICEBERG_URI_PATH",  # noqa: E501
+        new="/mds/iceberg",
+    )
+    expected_url = "https://test_iceberg_endpoint/mds/iceberg"
+    assert connection_config.iceberg_url == expected_url
+
+
+def test_ibm_watsonx_connection_config_object_storage_url(
+    connection_config: IbmWatsonxConnectionConfig,
+):
+    expected_url = "https://test_object_storage_endpoint"
+    assert connection_config.object_storage_url == expected_url
+
+
+def test_ibm_watsonx_connection_config_bearer_token_new_token(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
+):
+    mock_generate_bearer_token = mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "generate_bearer_token",
+        return_value={"access_token": "new_token", "expiration": timestamp_now + 3600},
+    )
+    token = connection_config.bearer_token
+    assert token == "new_token"
+    mock_generate_bearer_token.assert_called_once()
+
+
+def test_ibm_watsonx_connection_config_bearer_token_existing_token(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
+):
+    connection_config._bearer_token = {
+        "access_token": "existing_token",
+        "expiration": timestamp_now + 3600,
+    }
+    mock_generate_bearer_token = mocker.patch.object(
+        IbmWatsonxConnectionConfig, "generate_bearer_token"
+    )
+    token = connection_config.bearer_token
+    assert token == "existing_token"
+    mock_generate_bearer_token.assert_not_called()
+
+
+def test_ibm_watsonx_connection_config_bearer_token_expired_token(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
+):
+    connection_config._bearer_token = {
+        "access_token": "expired_token",
+        "expiration": timestamp_now - 3600,
+    }
+    mock_generate_bearer_token = mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "generate_bearer_token",
+        return_value={"access_token": "new_token", "expiration": timestamp_now + 3600},
+    )
+    token = connection_config.bearer_token
+    assert token == "new_token"
+    mock_generate_bearer_token.assert_called_once()
+
+
+def test_ibm_watsonx_connection_config_bearer_token_soon_to_expire_token(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
+):
+    connection_config._bearer_token = {
+        "access_token": "soon_to_expire_token",
+        "expiration": timestamp_now + 60,
+    }
+    mock_generate_bearer_token = mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "generate_bearer_token",
+        return_value={"access_token": "new_token", "expiration": timestamp_now + 3600},
+    )
+    token = connection_config.bearer_token
+    assert token == "new_token"
+    mock_generate_bearer_token.assert_called_once()
+
+
+def test_ibm_watsonx_connection_config_get_catalog_success(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig
+):
+    mocker.patch(
+        "unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3.DEFAULT_ICEBERG_URI_PATH",  # noqa: E501
+        new="/mds/iceberg",
+    )
+    mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "bearer_token",
+        new="test_bearer_token",
+    )
+    mock_load_catalog = mocker.patch("pyiceberg.catalog.load_catalog")
+
+    with connection_config.get_catalog() as catalog:
+        assert catalog is not None
+        mock_load_catalog.assert_called_once_with(
+            **{
+                "name": "test_catalog",
+                "type": "rest",
+                "uri": "https://test_iceberg_endpoint/mds/iceberg",
+                "token": "test_bearer_token",
+                "warehouse": "test_catalog",
+                "s3.endpoint": "https://test_object_storage_endpoint",
+                "s3.access-key-id": "test_access_key_id",
+                "s3.secret-access-key": "test_secret_access_key",
+                "s3.region": "test_region",
+            }
+        )
+
+
+def test_ibm_watsonx_connection_config_get_catalog_failure(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig
+):
+    mocker.patch(
+        "pyiceberg.catalog.load_catalog",
+        side_effect=Exception("Connection error"),
+    )
+    mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "bearer_token",
+        new="test_bearer_token",
+    )
+    with pytest.raises(ProviderError):
+        with connection_config.get_catalog():
+            pass
+
+
+def test_ibm_watsonx_uploader_precheck_namespace_exists_table_exists(
+    mock_get_catalog: MagicMock,
+    mock_catalog: MagicMock,
+    uploader: IbmWatsonxUploader,
+):
+    uploader.precheck()
+
+    mock_catalog.namespace_exists.assert_called_once_with("test_namespace")
+    mock_catalog.table_exists.assert_called_once_with(("test_namespace", "test_table"))
+
+
+def test_ibm_watsonx_uploader_precheck_namespace_does_not_exist(
+    mock_get_catalog: MagicMock,
+    mock_catalog: MagicMock,
+    uploader: IbmWatsonxUploader,
+):
+    mock_catalog.namespace_exists.return_value = False
+
+    with pytest.raises(UserError, match="Namespace 'test_namespace' does not exist"):
+        uploader.precheck()
+
+    mock_catalog.namespace_exists.assert_called_once_with("test_namespace")
+    mock_catalog.table_exists.assert_not_called()
+
+
+def test_ibm_watsonx_uploader_precheck_table_does_not_exist(
+    mock_get_catalog: MagicMock,
+    mock_catalog: MagicMock,
+    uploader: IbmWatsonxUploader,
+):
+    mock_catalog.table_exists.return_value = False
+
+    with pytest.raises(
+        UserError,
+        match="Table 'test_table' does not exist in namespace 'test_namespace'",
+    ):
+        uploader.precheck()
+
+    mock_catalog.namespace_exists.assert_called_once_with("test_namespace")
+    mock_catalog.table_exists.assert_called_once_with(("test_namespace", "test_table"))
+
+
+def test_ibm_watsonx_uploader_upload_data_table_success(
+    uploader: IbmWatsonxUploader,
+    mock_table: MagicMock,
+    mock_transaction: MagicMock,
+    mock_data_table: MagicMock,
+    mock_delete: MagicMock,
+    file_data: FileData,
+):
+    uploader.upload_data_table(mock_table, mock_data_table, file_data)
+
+    mock_delete.assert_called_once_with(mock_transaction, "test_identifier")
+    mock_transaction.append.assert_called_once_with(mock_data_table)
+
+
+def test_ibm_watsonx_uploader_upload_data_table_commit_exception(
+    uploader: IbmWatsonxUploader,
+    mock_table: MagicMock,
+    mock_transaction: MagicMock,
+    mock_data_table: MagicMock,
+    mock_delete: MagicMock,
+    file_data: FileData,
+):
+    mock_transaction.append.side_effect = CommitFailedException()
+
+    with pytest.raises(ProviderError):
+        uploader.upload_data_table(mock_table, mock_data_table, file_data)
+    assert mock_table.refresh.call_count == 5
+
+
+def test_ibm_watsonx_uploader_upload_data_table_exception(
+    uploader: IbmWatsonxUploader,
+    mock_table: MagicMock,
+    mock_transaction: MagicMock,
+    mock_data_table: MagicMock,
+    mock_delete: MagicMock,
+    file_data: FileData,
+):
+    mock_transaction.append.side_effect = Exception()
+
+    with pytest.raises(ProviderError):
+        uploader.upload_data_table(mock_table, mock_data_table, file_data)
+    assert mock_table.refresh.call_count == 0
+
+
+def test_ibm_watsonx_uploader_df_to_arrow_table(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+    test_df: pd.DataFrame,
+):
+    mock_fit_to_schema = mocker.patch.object(
+        IbmWatsonxUploader, "_fit_to_schema", return_value=test_df
+    )
+
+    result = uploader._df_to_arrow_table(test_df)
+
+    mock_fit_to_schema.assert_called_once_with(test_df, add_missing_columns=False)
+    assert len(result.column_names) == 3
+    assert "test_column_0" in result.column_names
+    assert "test_column_1" in result.column_names
+    assert "test_column_2" in result.column_names
+
+
+def test_ibm_watsonx_uploader_can_delete_column_exists(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+):
+    mocker.patch.object(
+        IbmWatsonxUploader, "get_table_columns", return_value=["test_record_id_key"]
+    )
+
+    assert uploader.can_delete() is True
+
+
+def test_ibm_watsonx_uploader_can_delete_column_does_not_exist(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+):
+    mocker.patch.object(IbmWatsonxUploader, "get_table_columns", return_value=["other_column"])
+
+    assert uploader.can_delete() is False
+
+
+def test_ibm_watsonx_uploader_get_table_columns_cache(
+    uploader: IbmWatsonxUploader,
+):
+    uploader._columns = ["cached_column"]
+
+    result = uploader.get_table_columns()
+
+    assert result == ["cached_column"]
+
+
+def test_ibm_watsonx_uploader_get_table_columns_no_cache(
+    uploader: IbmWatsonxUploader,
+    mock_get_table: MagicMock,
+    mock_table: MagicMock,
+):
+    uploader._columns = None
+    mock_table.schema.return_value.column_names = ["column_1", "column_2"]
+
+    result = uploader.get_table_columns()
+
+    mock_get_table.assert_called_once()
+    assert result == ["column_1", "column_2"]
+    assert uploader._columns == ["column_1", "column_2"]
+
+
+def test_ibm_watsonx_uploader_upload_dataframe_success(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+    test_df: pd.DataFrame,
+    mock_get_table: MagicMock,
+    mock_table: MagicMock,
+    mock_data_table: MagicMock,
+    file_data: FileData,
+):
+    mocker.patch.object(IbmWatsonxUploader, "_df_to_arrow_table", return_value=mock_data_table)
+    mock_upload_data_table = mocker.patch.object(IbmWatsonxUploader, "upload_data_table")
+
+    uploader.upload_dataframe(test_df, file_data)
+
+    mock_get_table.assert_called_once()
+    mock_upload_data_table.assert_called_once_with(mock_table, mock_data_table, file_data)
+
+
+def test_ibm_watsonx_uploader_delete_can_delete(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+    mock_transaction: MagicMock,
+):
+    mocker.patch.object(IbmWatsonxUploader, "can_delete", return_value=True)
+    mock_equal_to = mocker.patch("pyiceberg.expressions.EqualTo")
+
+    uploader._delete(mock_transaction, "test_identifier")
+
+    mock_equal_to.assert_called_once_with("test_record_id_key", "test_identifier")
+    mock_transaction.delete.assert_called_once_with(delete_filter=mock_equal_to.return_value)
+
+
+def test_ibm_watsonx_uploader_delete_cannot_delete(
+    caplog: pytest.LogCaptureFixture,
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+    mock_transaction: MagicMock,
+):
+    mocker.patch.object(IbmWatsonxUploader, "can_delete", return_value=False)
+
+    uploader._delete(mock_transaction, "test_identifier")
+    mock_transaction.delete.assert_not_called()
+    assert (
+        "Table doesn't contain expected record id column test_record_id_key, skipping delete"
+        in caplog.text
+    )

test/unit/v2/connectors/sql/test_sql.py
@@ -1,10 +1,16 @@
 from pathlib import Path
 
+import pandas as pd
 import pytest
 from pytest_mock import MockerFixture
 
 from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
-from unstructured_ingest.v2.processes.connectors.sql.sql import SQLUploadStager
+from unstructured_ingest.v2.processes.connectors.sql.sql import (
+    SQLConnectionConfig,
+    SQLUploader,
+    SQLUploaderConfig,
+    SQLUploadStager,
+)
 
 
 @pytest.fixture
@@ -12,6 +18,17 @@ def mock_instance() -> SQLUploadStager:
     return SQLUploadStager()
 
 
+@pytest.fixture
+def mock_uploader(mocker: MockerFixture) -> SQLUploader:
+    mock_connection_config = mocker.Mock(spec=SQLConnectionConfig)
+    mock_upload_config = mocker.Mock(spec=SQLUploaderConfig)
+    return SQLUploader(
+        upload_config=mock_upload_config,
+        connection_config=mock_connection_config,
+        connector_type="sql_test",
+    )
+
+
 @pytest.mark.parametrize(
     ("input_filepath", "output_filename", "expected"),
     [
@@ -72,3 +89,64 @@ def test_run_output_filename_suffix(
         path=output_dir / expected, data=[{"key": "value"}, {"key": "value2"}]
     )
     assert result.name == expected
+
+
+def test_fit_to_schema_drop_columns(mocker: MockerFixture, mock_uploader: SQLUploader):
+    df = pd.DataFrame(
+        {
+            "col1": [1, 2],
+            "col2": [3, 4],
+            "col3": [5, 6],
+        }
+    )
+    mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
+
+    result = mock_uploader._fit_to_schema(df)
+
+    assert "col3" not in result.columns
+    assert "col1" in result.columns
+    assert "col2" in result.columns
+
+
+def test_fit_to_schema_add_missing_columns(mocker: MockerFixture, mock_uploader: SQLUploader):
+    df = pd.DataFrame(
+        {
+            "col1": [1, 2],
+        }
+    )
+    mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
+
+    result = mock_uploader._fit_to_schema(df)
+
+    assert "col2" in result.columns
+    assert result["col2"].isnull().all()
+
+
+def test_fit_to_schema_no_changes(mocker: MockerFixture, mock_uploader: SQLUploader):
+    df = pd.DataFrame(
+        {
+            "col1": [1, 2],
+            "col2": [3, 4],
+        }
+    )
+    mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
+
+    result = mock_uploader._fit_to_schema(df)
+
+    assert "col1" in result.columns
+    assert "col2" in result.columns
+    assert result.equals(df)
+
+
+def test_fit_to_schema_no_add_missing_columns(mocker: MockerFixture, mock_uploader: SQLUploader):
+    df = pd.DataFrame(
+        {
+            "col1": [1, 2],
+        }
+    )
+    mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
+
+    result = mock_uploader._fit_to_schema(df, add_missing_columns=False)
+
+    assert "col2" not in result.columns
+    assert "col1" in result.columns

unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.19"  # pragma: no cover
+__version__ = "0.5.20"  # pragma: no cover

unstructured_ingest/v2/processes/connectors/__init__.py
@@ -4,6 +4,7 @@ import unstructured_ingest.v2.processes.connectors.databricks  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.duckdb  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.elasticsearch  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.fsspec  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.ibm_watsonx  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.kafka  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.lancedb  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.qdrant  # noqa: F401
@@ -121,4 +122,5 @@ add_source_entry(source_type=DISCORD_CONNECTOR_TYPE, entry=discord_source_entry)
 add_destination_entry(destination_type=REDIS_CONNECTOR_TYPE, entry=redis_destination_entry)
 
 add_source_entry(source_type=JIRA_CONNECTOR_TYPE, entry=jira_source_entry)
+
 add_source_entry(source_type=ZENDESK_CONNECTOR_TYPE, entry=zendesk_source_entry)

unstructured_ingest/v2/processes/connectors/ibm_watsonx/__init__.py (new file)
@@ -0,0 +1,10 @@
+from __future__ import annotations
+
+from unstructured_ingest.v2.processes.connector_registry import add_destination_entry
+
+from .ibm_watsonx_s3 import CONNECTOR_TYPE as IBM_WATSONX_S3_CONNECTOR_TYPE
+from .ibm_watsonx_s3 import ibm_watsonx_s3_destination_entry
+
+add_destination_entry(
+    destination_type=IBM_WATSONX_S3_CONNECTOR_TYPE, entry=ibm_watsonx_s3_destination_entry
+)

unstructured_ingest/v2/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py (new file)
@@ -0,0 +1,301 @@
+import logging
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional, Tuple
+
+import pandas as pd
+from pydantic import Field, Secret
+
+from unstructured_ingest.utils.data_prep import get_data_df
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
+from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    FileData,
+    UploaderConfig,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.sql.sql import (
+    SQLUploader,
+    SQLUploadStager,
+    SQLUploadStagerConfig,
+)
+
+if TYPE_CHECKING:
+    from pyarrow import Table as ArrowTable
+    from pyiceberg.catalog.rest import RestCatalog
+    from pyiceberg.table import Table, Transaction
+
+CONNECTOR_TYPE = "ibm_watsonx_s3"
+
+DEFAULT_IBM_CLOUD_AUTH_URL = "https://iam.cloud.ibm.com/identity/token"
+DEFAULT_ICEBERG_URI_PATH = "/mds/iceberg"
+DEFAULT_ICEBERG_CATALOG_TYPE = "rest"
+
+
+class IcebergCommitFailedException(Exception):
+    """Failed to commit changes to the iceberg table."""
+
+
+class IbmWatsonxAccessConfig(AccessConfig):
+    iam_api_key: str = Field(description="IBM IAM API Key")
+    access_key_id: str = Field(description="Cloud Object Storage HMAC Access Key ID")
+    secret_access_key: str = Field(description="Cloud Object Storage HMAC Secret Access Key")
+
+
+class IbmWatsonxConnectionConfig(ConnectionConfig):
+    access_config: Secret[IbmWatsonxAccessConfig]
+    iceberg_endpoint: str = Field(description="Iceberg REST endpoint")
+    object_storage_endpoint: str = Field(description="Cloud Object Storage public endpoint")
+    object_storage_region: str = Field(description="Cloud Object Storage region")
+    catalog: str = Field(description="Catalog name")
+
+    _bearer_token: Optional[dict[str, Any]] = None
+
+    @property
+    def iceberg_url(self) -> str:
+        return f"https://{self.iceberg_endpoint.strip('/')}{DEFAULT_ICEBERG_URI_PATH}"
+
+    @property
+    def object_storage_url(self) -> str:
+        return f"https://{self.object_storage_endpoint.strip('/')}"
+
+    @property
+    def bearer_token(self) -> str:
+        # Add 60 seconds to deal with edge cases where the token expires before the request is made
+        timestamp = int(time.time()) + 60
+        if self._bearer_token is None or self._bearer_token.get("expiration", 0) <= timestamp:
+            self._bearer_token = self.generate_bearer_token()
+        return self._bearer_token["access_token"]
+
+    @requires_dependencies(["httpx"], extras="ibm-watsonx-s3")
+    def wrap_error(self, e: Exception) -> Exception:
+        import httpx
+
+        if not isinstance(e, httpx.HTTPStatusError):
+            logger.error(f"Unhandled exception from IBM watsonx.data connector: {e}", exc_info=True)
+            return e
+        url = e.request.url
+        response_code = e.response.status_code
+        if response_code == 401:
+            logger.error(
+                f"Failed to authenticate IBM watsonx.data user {url}, status code {response_code}"
+            )
+            return UserAuthError(e)
+        if response_code == 403:
+            logger.error(
+                f"Given IBM watsonx.data user is not authorized {url}, status code {response_code}"
+            )
+            return UserAuthError(e)
+        if 400 <= response_code < 500:
+            logger.error(
+                f"Request to {url} failed"
+                f"in IBM watsonx.data connector, status code {response_code}"
+            )
+            return UserError(e)
+        if response_code > 500:
+            logger.error(
+                f"Request to {url} failed"
+                f"in IBM watsonx.data connector, status code {response_code}"
+            )
+            return ProviderError(e)
+        logger.error(f"Unhandled exception from IBM watsonx.data connector: {e}", exc_info=True)
+        return e
+
+    @requires_dependencies(["httpx"], extras="ibm-watsonx-s3")
+    def generate_bearer_token(self) -> dict[str, Any]:
+        import httpx
+
+        headers = {
+            "Content-Type": "application/x-www-form-urlencoded",
+            "Accept": "application/json",
+        }
+        data = {
+            "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
+            "apikey": self.access_config.get_secret_value().iam_api_key,
+        }
+
+        logger.info("Generating IBM IAM Bearer Token")
+        try:
+            response = httpx.post(DEFAULT_IBM_CLOUD_AUTH_URL, headers=headers, data=data)
+            response.raise_for_status()
+        except Exception as e:
+            raise self.wrap_error(e)
+        return response.json()
+
+    def get_catalog_config(self) -> dict[str, Any]:
+        return {
+            "name": self.catalog,
+            "type": DEFAULT_ICEBERG_CATALOG_TYPE,
+            "uri": self.iceberg_url,
+            "token": self.bearer_token,
+            "warehouse": self.catalog,
+            "s3.endpoint": self.object_storage_url,
+            "s3.access-key-id": self.access_config.get_secret_value().access_key_id,
+            "s3.secret-access-key": self.access_config.get_secret_value().secret_access_key,
+            "s3.region": self.object_storage_region,
+        }
+
+    @requires_dependencies(["pyiceberg"], extras="ibm-watsonx-s3")
+    @contextmanager
+    def get_catalog(self) -> Generator["RestCatalog", None, None]:
+        from pyiceberg.catalog import load_catalog
+
+        try:
+            catalog_config = self.get_catalog_config()
+            catalog = load_catalog(**catalog_config)
+        except Exception as e:
+            logger.error(f"Failed to connect to catalog '{self.catalog}': {e}", exc_info=True)
+            raise ProviderError(f"Failed to connect to catalog '{self.catalog}': {e}")
+
+        yield catalog
+
+
+@dataclass
+class IbmWatsonxUploadStagerConfig(SQLUploadStagerConfig):
+    pass
+
+
+@dataclass
+class IbmWatsonxUploadStager(SQLUploadStager):
+    upload_stager_config: IbmWatsonxUploadStagerConfig = field(
+        default_factory=IbmWatsonxUploadStagerConfig
+    )
+
+
+class IbmWatsonxUploaderConfig(UploaderConfig):
+    namespace: str = Field(description="Namespace name")
+    table: str = Field(description="Table name")
+    max_retries: int = Field(
+        default=5, description="Maximum number of retries to upload data", ge=2, le=10
+    )
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="Searchable key to find entries for the same record on previous runs",
+    )
+
+    @property
+    def table_identifier(self) -> Tuple[str, str]:
+        return (self.namespace, self.table)
+
+
+@dataclass
+class IbmWatsonxUploader(SQLUploader):
+    connection_config: IbmWatsonxConnectionConfig
+    upload_config: IbmWatsonxUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> None:
+        with self.connection_config.get_catalog() as catalog:
+            if not catalog.namespace_exists(self.upload_config.namespace):
+                raise UserError(f"Namespace '{self.upload_config.namespace}' does not exist")
+            if not catalog.table_exists(self.upload_config.table_identifier):
+                raise UserError(
+                    f"Table '{self.upload_config.table}' does not exist in namespace '{self.upload_config.namespace}'"  # noqa: E501
+                )
+
+    @contextmanager
+    def get_table(self) -> Generator["Table", None, None]:
+        with self.connection_config.get_catalog() as catalog:
+            table = catalog.load_table(self.upload_config.table_identifier)
+            yield table
+
+    def get_table_columns(self) -> list[str]:
+        if self._columns is None:
+            with self.get_table() as table:
+                self._columns = table.schema().column_names
+        return self._columns
+
+    def can_delete(self) -> bool:
+        return self.upload_config.record_id_key in self.get_table_columns()
+
+    @requires_dependencies(["pyarrow"], extras="ibm-watsonx-s3")
+    def _df_to_arrow_table(self, df: pd.DataFrame) -> "ArrowTable":
+        import pyarrow as pa
+
+        # Iceberg will automatically fill missing columns with nulls
+        # Iceberg will throw an error if the DataFrame column has only null values
+        # because it can't infer the type of the column and match it with the table schema
+        return pa.Table.from_pandas(self._fit_to_schema(df, add_missing_columns=False))
+
+    @requires_dependencies(["pyiceberg"], extras="ibm-watsonx-s3")
+    def _delete(self, transaction: "Transaction", identifier: str) -> None:
+        from pyiceberg.expressions import EqualTo
+
+        if self.can_delete():
+            transaction.delete(delete_filter=EqualTo(self.upload_config.record_id_key, identifier))
+        else:
+            logger.warning(
+                f"Table doesn't contain expected "
+                f"record id column "
+                f"{self.upload_config.record_id_key}, skipping delete"
+            )
+
+    @requires_dependencies(["pyiceberg", "tenacity"], extras="ibm-watsonx-s3")
+    def upload_data_table(
+        self, table: "Table", data_table: "ArrowTable", file_data: FileData
+    ) -> None:
+        from pyiceberg.exceptions import CommitFailedException
+        from tenacity import (
+            before_log,
+            retry,
+            retry_if_exception_type,
+            stop_after_attempt,
+            wait_random,
+        )
+
+        @retry(
+            stop=stop_after_attempt(self.upload_config.max_retries),
+            wait=wait_random(),
+            retry=retry_if_exception_type(IcebergCommitFailedException),
+            before=before_log(logger, logging.DEBUG),
+            reraise=True,
+        )
+        def _upload_data_table(table: "Table", data_table: "ArrowTable", file_data: FileData):
+            try:
+                with table.transaction() as transaction:
+                    self._delete(transaction, file_data.identifier)
+                    transaction.append(data_table)
+            except CommitFailedException as e:
+                table.refresh()
+                logger.debug(e)
+                raise IcebergCommitFailedException(e)
+            except Exception as e:
+                raise ProviderError(f"Failed to upload data to table: {e}")
+
+        try:
+            return _upload_data_table(table, data_table, file_data)
+        except ProviderError:
+            raise
+        except Exception as e:
+            raise ProviderError(f"Failed to upload data to table: {e}")
+
+    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
+        data_table = self._df_to_arrow_table(df)
+
+        with self.get_table() as table:
+            self.upload_data_table(table, data_table, file_data)
+
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        df = pd.DataFrame(data)
+        self.upload_dataframe(df=df, file_data=file_data)
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        df = get_data_df(path=path)
+        self.upload_dataframe(df=df, file_data=file_data)
+
+
+ibm_watsonx_s3_destination_entry = DestinationRegistryEntry(
+    connection_config=IbmWatsonxConnectionConfig,
+    uploader=IbmWatsonxUploader,
+    uploader_config=IbmWatsonxUploaderConfig,
+    upload_stager=IbmWatsonxUploadStager,
+    upload_stager_config=IbmWatsonxUploadStagerConfig,
+)
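
For orientation, below is a minimal sketch of how the new destination added above might be wired up by hand. The class names and the precheck behavior come from the diff itself; every endpoint and credential value is a placeholder, and a real pipeline would normally construct these objects through the ingest CLI or pipeline runner rather than directly.

from pydantic import Secret

from unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3 import (
    IbmWatsonxAccessConfig,
    IbmWatsonxConnectionConfig,
    IbmWatsonxUploader,
    IbmWatsonxUploaderConfig,
)

# Placeholder values; real ones come from IBM Cloud / watsonx.data.
connection_config = IbmWatsonxConnectionConfig(
    access_config=Secret(
        IbmWatsonxAccessConfig(
            iam_api_key="<iam-api-key>",
            access_key_id="<hmac-access-key-id>",
            secret_access_key="<hmac-secret-access-key>",
        )
    ),
    iceberg_endpoint="<iceberg-rest-endpoint>",
    object_storage_endpoint="<cos-public-endpoint>",
    object_storage_region="<cos-region>",
    catalog="<catalog-name>",
)
uploader = IbmWatsonxUploader(
    connection_config=connection_config,
    upload_config=IbmWatsonxUploaderConfig(namespace="<namespace>", table="<table>"),
)

# Raises UserError if the namespace or table does not exist (see precheck above).
uploader.precheck()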

unstructured_ingest/v2/processes/connectors/sql/sql.py
@@ -323,7 +323,7 @@ class SQLUploader(Uploader):
             output.append(tuple(parsed))
         return output
 
-    def _fit_to_schema(self, df: pd.DataFrame) -> pd.DataFrame:
+    def _fit_to_schema(self, df: pd.DataFrame, add_missing_columns: bool = True) -> pd.DataFrame:
         table_columns = self.get_table_columns()
         columns = set(df.columns)
         schema_fields = set(table_columns)
@@ -335,7 +335,7 @@ class SQLUploader(Uploader):
                 "Following columns will be dropped to match the table's schema: "
                 f"{', '.join(columns_to_drop)}"
             )
-        if missing_columns:
+        if missing_columns and add_missing_columns:
             logger.info(
                 "Following null filled columns will be added to match the table's schema:"
                 f" {', '.join(missing_columns)} "
@@ -343,8 +343,9 @@ class SQLUploader(Uploader):
 
         df = df.drop(columns=columns_to_drop)
 
-        for column in missing_columns:
-            df[column] = pd.Series()
+        if add_missing_columns:
+            for column in missing_columns:
+                df[column] = pd.Series()
         return df
 
     def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
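
To make the new add_missing_columns flag concrete, here is a small self-contained pandas sketch that mirrors the _fit_to_schema logic above (an illustration of the behavior, not the library code): columns absent from the table schema are always dropped, while null-filled columns for missing schema fields are only added when the flag is left at its default of True. The Iceberg uploader passes False because, per the comment in _df_to_arrow_table, pyiceberg cannot infer a type for an all-null column.

import pandas as pd

def fit_to_schema(df: pd.DataFrame, table_columns: list[str], add_missing_columns: bool = True) -> pd.DataFrame:
    # Mirrors SQLUploader._fit_to_schema, with get_table_columns() replaced by an argument.
    columns_to_drop = set(df.columns) - set(table_columns)
    missing_columns = set(table_columns) - set(df.columns)

    df = df.drop(columns=columns_to_drop)
    if add_missing_columns:
        for column in missing_columns:
            df[column] = pd.Series()  # null-filled column
    return df

df = pd.DataFrame({"col1": [1, 2], "col3": [5, 6]})
print(sorted(fit_to_schema(df, ["col1", "col2"]).columns))
# ['col1', 'col2']  (col3 dropped, col2 added as nulls)
print(sorted(fit_to_schema(df, ["col1", "col2"], add_missing_columns=False).columns))
# ['col1']  (col3 dropped, nothing added)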

unstructured_ingest-0.5.20.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: unstructured-ingest
-Version: 0.5.19
+Version: 0.5.20
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -23,12 +23,12 @@ Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
 Requires-Dist: python-dateutil
-Requires-Dist: dataclasses_json
-Requires-Dist: opentelemetry-sdk
-Requires-Dist: pandas
 Requires-Dist: click
+Requires-Dist: opentelemetry-sdk
 Requires-Dist: pydantic>=2.7
+Requires-Dist: pandas
 Requires-Dist: tqdm
+Requires-Dist: dataclasses_json
 Provides-Extra: remote
 Requires-Dist: unstructured-client>=0.30.0; extra == "remote"
 Provides-Extra: csv
@@ -66,16 +66,16 @@ Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
 Requires-Dist: astrapy; extra == "astradb"
 Provides-Extra: azure
-Requires-Dist: fsspec; extra == "azure"
 Requires-Dist: adlfs; extra == "azure"
+Requires-Dist: fsspec; extra == "azure"
 Provides-Extra: azure-ai-search
 Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Provides-Extra: biomed
-Requires-Dist: requests; extra == "biomed"
 Requires-Dist: bs4; extra == "biomed"
+Requires-Dist: requests; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: fsspec; extra == "box"
 Requires-Dist: boxfs; extra == "box"
+Requires-Dist: fsspec; extra == "box"
 Provides-Extra: chroma
 Requires-Dist: chromadb; extra == "chroma"
 Provides-Extra: clarifai
@@ -86,8 +86,8 @@ Requires-Dist: atlassian-python-api; extra == "confluence"
 Provides-Extra: couchbase
 Requires-Dist: couchbase; extra == "couchbase"
 Provides-Extra: delta-table
-Requires-Dist: deltalake; extra == "delta-table"
 Requires-Dist: boto3; extra == "delta-table"
+Requires-Dist: deltalake; extra == "delta-table"
 Provides-Extra: discord
 Requires-Dist: discord.py; extra == "discord"
 Provides-Extra: dropbox
@@ -98,19 +98,24 @@ Requires-Dist: duckdb; extra == "duckdb"
 Provides-Extra: elasticsearch
 Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
 Provides-Extra: gcs
-Requires-Dist: gcsfs; extra == "gcs"
 Requires-Dist: bs4; extra == "gcs"
+Requires-Dist: gcsfs; extra == "gcs"
 Requires-Dist: fsspec; extra == "gcs"
 Provides-Extra: github
-Requires-Dist: requests; extra == "github"
 Requires-Dist: pygithub>1.58.0; extra == "github"
+Requires-Dist: requests; extra == "github"
 Provides-Extra: gitlab
 Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
 Requires-Dist: google-api-python-client; extra == "google-drive"
 Provides-Extra: hubspot
-Requires-Dist: urllib3; extra == "hubspot"
 Requires-Dist: hubspot-api-client; extra == "hubspot"
+Requires-Dist: urllib3; extra == "hubspot"
+Provides-Extra: ibm-watsonx-s3
+Requires-Dist: httpx; extra == "ibm-watsonx-s3"
+Requires-Dist: pyarrow; extra == "ibm-watsonx-s3"
+Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
+Requires-Dist: pyiceberg; extra == "ibm-watsonx-s3"
 Provides-Extra: jira
 Requires-Dist: atlassian-python-api; extra == "jira"
 Provides-Extra: kafka
@@ -125,22 +130,22 @@ Provides-Extra: mongodb
 Requires-Dist: pymongo; extra == "mongodb"
 Provides-Extra: neo4j
 Requires-Dist: networkx; extra == "neo4j"
-Requires-Dist: neo4j-rust-ext; extra == "neo4j"
 Requires-Dist: cymple; extra == "neo4j"
+Requires-Dist: neo4j-rust-ext; extra == "neo4j"
 Provides-Extra: notion
-Requires-Dist: backoff; extra == "notion"
-Requires-Dist: httpx; extra == "notion"
-Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
+Requires-Dist: notion-client; extra == "notion"
+Requires-Dist: httpx; extra == "notion"
+Requires-Dist: backoff; extra == "notion"
 Provides-Extra: onedrive
 Requires-Dist: bs4; extra == "onedrive"
-Requires-Dist: msal; extra == "onedrive"
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
+Requires-Dist: msal; extra == "onedrive"
 Provides-Extra: opensearch
 Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: outlook
-Requires-Dist: msal; extra == "outlook"
 Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
+Requires-Dist: msal; extra == "outlook"
 Provides-Extra: pinecone
 Requires-Dist: pinecone-client>=3.7.1; extra == "pinecone"
 Provides-Extra: postgres
@@ -155,18 +160,18 @@ Provides-Extra: s3
 Requires-Dist: s3fs; extra == "s3"
 Requires-Dist: fsspec; extra == "s3"
 Provides-Extra: sharepoint
-Requires-Dist: msal; extra == "sharepoint"
 Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
+Requires-Dist: msal; extra == "sharepoint"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
-Requires-Dist: fsspec; extra == "sftp"
 Requires-Dist: paramiko; extra == "sftp"
+Requires-Dist: fsspec; extra == "sftp"
 Provides-Extra: slack
 Requires-Dist: slack_sdk[optional]; extra == "slack"
 Provides-Extra: snowflake
-Requires-Dist: psycopg2-binary; extra == "snowflake"
 Requires-Dist: snowflake-connector-python; extra == "snowflake"
+Requires-Dist: psycopg2-binary; extra == "snowflake"
 Provides-Extra: wikipedia
 Requires-Dist: wikipedia; extra == "wikipedia"
 Provides-Extra: weaviate
@@ -178,17 +183,17 @@ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
 Provides-Extra: singlestore
 Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: vectara
-Requires-Dist: requests; extra == "vectara"
 Requires-Dist: httpx; extra == "vectara"
 Requires-Dist: aiofiles; extra == "vectara"
+Requires-Dist: requests; extra == "vectara"
 Provides-Extra: vastdb
 Requires-Dist: pyarrow; extra == "vastdb"
 Requires-Dist: vastdb; extra == "vastdb"
 Requires-Dist: ibis; extra == "vastdb"
 Provides-Extra: zendesk
 Requires-Dist: bs4; extra == "zendesk"
-Requires-Dist: httpx; extra == "zendesk"
 Requires-Dist: aiofiles; extra == "zendesk"
+Requires-Dist: httpx; extra == "zendesk"
 Provides-Extra: embed-huggingface
 Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Provides-Extra: embed-octoai
@@ -204,8 +209,8 @@ Provides-Extra: openai
 Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
 Provides-Extra: bedrock
-Requires-Dist: boto3; extra == "bedrock"
 Requires-Dist: aioboto3; extra == "bedrock"
+Requires-Dist: boto3; extra == "bedrock"
 Provides-Extra: togetherai
 Requires-Dist: together; extra == "togetherai"
 Dynamic: author
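
The new connector's dependencies ship as the optional extra declared above, so installing it follows the standard extras syntax, e.g. pip install "unstructured-ingest[ibm-watsonx-s3]==0.5.20".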

unstructured_ingest-0.5.20.dist-info/RECORD
@@ -93,10 +93,12 @@ test/unit/v2/connectors/test_confluence.py,sha256=lN6nnU5qOtmsjIGcz65roepm76w4vP
 test/unit/v2/connectors/test_jira.py,sha256=XEBBDSdNZWUVO5JbpiSsjazJYmbLsgXUOW-APqPRKLg,12113
 test/unit/v2/connectors/databricks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/connectors/databricks/test_volumes_table.py,sha256=-R_EJHqv1BseGRK9VRAZhF-2EXA64LAlhycoyIu556U,1078
+test/unit/v2/connectors/ibm_watsonx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+test/unit/v2/connectors/ibm_watsonx/test_ibm_watsonx_s3.py,sha256=gvgF9vCA_cPQVS_IC6VFvnP4ojFVKOH7eorM6k5VR84,14518
 test/unit/v2/connectors/motherduck/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/connectors/motherduck/test_base.py,sha256=f3W7hppEZ904_I_fKax-5LVDp-0yj04DjF1ccZ4k5O8,2503
 test/unit/v2/connectors/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-test/unit/v2/connectors/sql/test_sql.py,sha256=51-AKUBxw6ThO68bjenLopUUuxM88YZb2rMUV8L6YwY,2464
+test/unit/v2/connectors/sql/test_sql.py,sha256=wA5LvLtmaCi-8YDOd515j3YnP0_E4qi7z50NFXBn75g,4634
 test/unit/v2/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/embedders/test_bedrock.py,sha256=HMaweO_v_9Y1SE2m5QImXP73cb26vNTUfc1onTBa1-g,1074
 test/unit/v2/embedders/test_huggingface.py,sha256=TOHUKC7hAadl6XTotr8UqOCq28kbQxOIkPSrMxr2PLU,1546
@@ -111,7 +113,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
 test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=YeBJuoTNGo0rz_5lKoO5e3ooyBOI71QLt4fdSp1KO_c,43
+unstructured_ingest/__version__.py,sha256=BCszjb86jsmMjfakEG2zLAZFKHpLYTR2k5TCe7RzaBc,43
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -426,7 +428,7 @@ unstructured_ingest/v2/processes/embedder.py,sha256=gvlCQDsbQVgcp-2f0Qq4RiFbcr8g
 unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
 unstructured_ingest/v2/processes/partitioner.py,sha256=HxopDSbovLh_1epeGeVtuWEX7v5KG35BowwKIJ_y4e8,9910
 unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
-unstructured_ingest/v2/processes/connectors/__init__.py,sha256=ebLvZes84qRx4eS20SkvlVH6WIIM76hifyUgkUJ-dfg,6588
+unstructured_ingest/v2/processes/connectors/__init__.py,sha256=l4Xq4AuzRMTqUv5TU7cE1NbhGCka4SFJFZwG1FoVotE,6666
 unstructured_ingest/v2/processes/connectors/airtable.py,sha256=eeZJe-bBNxt5Sa-XEFCdcGeJCguJU5WN2Mv9kLp5dVQ,8917
 unstructured_ingest/v2/processes/connectors/astradb.py,sha256=5xc5pWFicE_-2BV38oK-nnzAMI2EzF-q8XAqQ3qPUR8,18249
 unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=ngPDpU0oZ6m5sxIlB6u5ebQpqCS_SJ-_amCC1KQ03EQ,11529
@@ -477,6 +479,8 @@ unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=nlDSKHs8mbXCY5B
 unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=qO4WDZPoxmYMbUkaSvrxXaLn3UxzyMVhpj5wVyXqmi4,6623
 unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=ZimcBJL-Har7GOESb9blzDb8pzPZcmh16YvvHYxYkJM,6373
 unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
+unstructured_ingest/v2/processes/connectors/ibm_watsonx/__init__.py,sha256=EMG7lyThrYO8W7y3DIxGgNNXtbpdeAdvLd0m4tpO-Io,377
+unstructured_ingest/v2/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py,sha256=zaj5MVsM-uf7IRgZGg7QwRtzjtTM1gCYuqji61TrqWk,11562
 unstructured_ingest/v2/processes/connectors/kafka/__init__.py,sha256=mQJ9Ex-QCfhz-BB5YWTfbPf7xGLd1i7FpjRr0ukbhNw,754
 unstructured_ingest/v2/processes/connectors/kafka/cloud.py,sha256=GdAeQ8Uz-6v1C5byBHtjfevVfbzW3obScBFFLRTb0ps,3441
 unstructured_ingest/v2/processes/connectors/kafka/kafka.py,sha256=UfS41jzV9VxekS6AwWHhURJmJ7RUAw5iiIrj75BWrXQ,10255
@@ -564,7 +568,7 @@ unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py,sha25
 unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=BATfX1PQGT2kl8jAbdNKXTojYKJxh3pJV9-h3OBnHGo,5124
 unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=OPBDQ2c_5KjWHEFfqXxf3pQ2tWC-N4MtslMulMgP1Wc,5503
 unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=0hfiX_u7V38k_RfoeDmXJp8WIHZ19ilIHnrgZVSleKw,9270
-unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=F5PPUxt2W8JaAQGfz5Od0FvKqYa15RfwMIlnrdJu1nk,15317
+unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=wtVK6CHrQ4McwsPifUoa7KKaY-v0cjDZJetASSAaSIA,15415
 unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=PRjN_S7UQv0k4ZpSyclW1AJrsrugyxbR-GoOrHvBpks,5200
 unstructured_ingest/v2/processes/connectors/sql/vastdb.py,sha256=0rxrb1ByXIefB9umzMTEJbpvzdTttXHK5DjRY97-GG8,9618
 unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWan69KnzVELvaqX34tMhCytIa-C8EDsXVKsEo,856
@@ -577,9 +581,9 @@ unstructured_ingest/v2/processes/connectors/zendesk/client.py,sha256=DDAYQB7catK
 unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py,sha256=R8SXYkRhVUoWEHdGCt2CzcTxxuFundw_0GlGZ34YmbM,8987
 unstructured_ingest/v2/processes/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/processes/utils/blob_storage.py,sha256=EWvK4HRYubr9i1UyMhv5cU9u0UzVkCDC_BIm4Uxab7Y,964
-unstructured_ingest-0.5.19.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
-unstructured_ingest-0.5.19.dist-info/METADATA,sha256=6veKDuElp9klfZfEzZIFIwPCchckH6Mf04qCc0ogN7M,8465
-unstructured_ingest-0.5.19.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-unstructured_ingest-0.5.19.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
-unstructured_ingest-0.5.19.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
-unstructured_ingest-0.5.19.dist-info/RECORD,,
+unstructured_ingest-0.5.20.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.5.20.dist-info/METADATA,sha256=S2Yr62sVeW0csT-QRyonnokiHFvvH0FAwQ2x02BqAeM,8697
+unstructured_ingest-0.5.20.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+unstructured_ingest-0.5.20.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.5.20.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.5.20.dist-info/RECORD,,