unstructured-ingest 0.5.18__py3-none-any.whl → 0.5.20__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of unstructured-ingest might be problematic.

--- test/integration/connectors/test_redis.py
+++ test/integration/connectors/test_redis.py
@@ -23,20 +23,22 @@ from unstructured_ingest.v2.processes.connectors.redisdb import (
 )


-async def delete_record(client: Redis, element_id: str) -> None:
-    await client.delete(element_id)
+async def delete_record(client: Redis, element_id: str, key_prefix: str) -> None:
+    key_with_prefix = f"{key_prefix}{element_id}"
+    await client.delete(key_with_prefix)


-async def validate_upload(client: Redis, first_element: dict):
+async def validate_upload(client: Redis, first_element: dict, key_prefix: str) -> None:
     element_id = first_element["element_id"]
+    key_with_prefix = f"{key_prefix}{element_id}"
     expected_text = first_element["text"]
     expected_embeddings = first_element["embeddings"]
     async with client.pipeline(transaction=True) as pipe:
         try:
-            response = await pipe.json().get(element_id, "$").execute()
+            response = await pipe.json().get(key_with_prefix, "$").execute()
             response = response[0][0]
         except redis_exceptions.ResponseError:
-            response = await pipe.get(element_id).execute()
+            response = await pipe.get(key_with_prefix).execute()
             response = json.loads(response[0])

         embedding_similarity = np.linalg.norm(
@@ -53,6 +55,7 @@ async def redis_destination_test(
     upload_file: Path,
     tmp_path: Path,
     connection_kwargs: dict,
+    uploader_config: dict,
     uri: Optional[str] = None,
     password: Optional[str] = None,
 ):
@@ -60,8 +63,9 @@ async def redis_destination_test(
         connection_config=RedisConnectionConfig(
             **connection_kwargs, access_config=RedisAccessConfig(uri=uri, password=password)
         ),
-        upload_config=RedisUploaderConfig(batch_size=10),
+        upload_config=RedisUploaderConfig(batch_size=10, **uploader_config),
     )
+    key_prefix = uploader.upload_config.key_prefix

     file_data = FileData(
         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
@@ -78,20 +82,32 @@ async def redis_destination_test(

         if uri:
             async with from_url(uri) as client:
-                await validate_upload(client=client, first_element=first_element)
+                await validate_upload(
+                    client=client,
+                    first_element=first_element,
+                    key_prefix=key_prefix,
+                )
         else:
             async with Redis(**connection_kwargs, password=password) as client:
-                await validate_upload(client=client, first_element=first_element)
+                await validate_upload(
+                    client=client,
+                    first_element=first_element,
+                    key_prefix=key_prefix,
+                )
     except Exception as e:
         raise e
     finally:
         if uri:
             async with from_url(uri) as client:
-                tasks = [delete_record(client, element["element_id"]) for element in elements]
+                tasks = [
+                    delete_record(client, element["element_id"], key_prefix) for element in elements
+                ]
                 await asyncio.gather(*tasks)
         else:
             async with Redis(**connection_kwargs, password=password) as client:
-                tasks = [delete_record(client, element["element_id"]) for element in elements]
+                tasks = [
+                    delete_record(client, element["element_id"], key_prefix) for element in elements
+                ]
                 await asyncio.gather(*tasks)


@@ -105,8 +121,13 @@ async def test_redis_destination_azure_with_password(upload_file: Path, tmp_path
         "db": 0,
         "ssl": True,
     }
+    uploader_config = {
+        "key_prefix": "test_ingest:",
+    }
     redis_pw = os.environ["AZURE_REDIS_INGEST_TEST_PASSWORD"]
-    await redis_destination_test(upload_file, tmp_path, connection_kwargs, password=redis_pw)
+    await redis_destination_test(
+        upload_file, tmp_path, connection_kwargs, uploader_config, password=redis_pw
+    )


 @pytest.mark.asyncio
@@ -114,6 +135,9 @@ async def test_redis_destination_azure_with_password(upload_file: Path, tmp_path
 @requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
 async def test_redis_destination_azure_with_uri(upload_file: Path, tmp_path: Path):
     connection_kwargs = {}
+    uploader_config = {
+        "key_prefix": "test_ingest:",
+    }
     redis_pw = os.environ["AZURE_REDIS_INGEST_TEST_PASSWORD"]
     uri = f"rediss://:{redis_pw}@utic-dashboard-dev.redis.cache.windows.net:6380/0"
-    await redis_destination_test(upload_file, tmp_path, connection_kwargs, uri=uri)
+    await redis_destination_test(upload_file, tmp_path, connection_kwargs, uploader_config, uri=uri)
--- /dev/null
+++ test/unit/v2/connectors/ibm_watsonx/test_ibm_watsonx_s3.py
@@ -0,0 +1,459 @@
+import time
+from unittest.mock import MagicMock
+
+import pandas as pd
+import pytest
+from pydantic import Secret
+from pyiceberg.exceptions import CommitFailedException
+from pytest_mock import MockerFixture
+
+from unstructured_ingest.v2.errors import ProviderError, UserError
+from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.ibm_watsonx import IBM_WATSONX_S3_CONNECTOR_TYPE
+from unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3 import (
+    IbmWatsonxAccessConfig,
+    IbmWatsonxConnectionConfig,
+    IbmWatsonxUploader,
+    IbmWatsonxUploaderConfig,
+)
+
+
+@pytest.fixture
+def file_data():
+    return FileData(
+        identifier="test_identifier",
+        connector_type=IBM_WATSONX_S3_CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(
+            filename="test_file.pdf", fullpath="/tmp/test_file.pdf"
+        ),
+    )
+
+
+@pytest.fixture
+def access_config():
+    return IbmWatsonxAccessConfig(
+        iam_api_key="test_iam_api_key",
+        access_key_id="test_access_key_id",
+        secret_access_key="test_secret_access_key",
+    )
+
+
+@pytest.fixture
+def connection_config(access_config: IbmWatsonxAccessConfig):
+    return IbmWatsonxConnectionConfig(
+        access_config=Secret(access_config),
+        iceberg_endpoint="test_iceberg_endpoint/",
+        object_storage_endpoint="test_object_storage_endpoint/",
+        object_storage_region="test_region",
+        catalog="test_catalog",
+    )
+
+
+@pytest.fixture
+def uploader_config():
+    return IbmWatsonxUploaderConfig(
+        namespace="test_namespace",
+        table="test_table",
+        record_id_key="test_record_id_key",
+    )
+
+
+@pytest.fixture
+def uploader(
+    connection_config: IbmWatsonxConnectionConfig, uploader_config: IbmWatsonxUploaderConfig
+):
+    return IbmWatsonxUploader(
+        connection_config=connection_config,
+        upload_config=uploader_config,
+    )
+
+
+@pytest.fixture
+def mock_catalog(mocker: MockerFixture):
+    mock_catalog = mocker.MagicMock()
+    mock_catalog.namespace_exists.return_value = True
+    mock_catalog.table_exists.return_value = True
+    return mock_catalog
+
+
+@pytest.fixture
+def mock_get_catalog(mocker: MockerFixture, mock_catalog: MagicMock):
+    mock_get_catalog = mocker.patch.context_manager(
+        IbmWatsonxConnectionConfig, "get_catalog", autospec=True
+    )
+    mock_get_catalog.return_value.__enter__.return_value = mock_catalog
+    return mock_get_catalog
+
+
+@pytest.fixture
+def mock_table(mocker: MockerFixture):
+    mock_table = mocker.MagicMock()
+    return mock_table
+
+
+@pytest.fixture
+def mock_get_table(mocker: MockerFixture, mock_table: MagicMock):
+    mock_get_table = mocker.patch.context_manager(IbmWatsonxUploader, "get_table", autospec=True)
+    mock_get_table.return_value.__enter__.return_value = mock_table
+    return mock_get_table
+
+
+@pytest.fixture
+def mock_transaction(mocker: MockerFixture, mock_table: MagicMock):
+    mock_transaction = mocker.MagicMock()
+    mock_table.transaction.return_value.__enter__.return_value = mock_transaction
+    return mock_transaction
+
+
+@pytest.fixture
+def mock_data_table(mocker: MockerFixture):
+    mock_data_table = mocker.MagicMock()
+    mock_data_table.schema = "schema"
+    return mock_data_table
+
+
+@pytest.fixture
+def mock_delete(mocker: MockerFixture):
+    return mocker.patch.object(IbmWatsonxUploader, "_delete")
+
+
+@pytest.fixture
+def test_df():
+    return pd.DataFrame(
+        {
+            "test_column_0": [True, False, True],
+            "test_column_1": [1, 2, 3],
+            "test_column_2": ["a", "b", "c"],
+        }
+    )
+
+
+@pytest.fixture
+def timestamp_now():
+    return int(time.time())
+
+
+def test_ibm_watsonx_connection_config_iceberg_url(
+    mocker: MockerFixture,
+    connection_config: IbmWatsonxConnectionConfig,
+):
+    mocker.patch(
+        "unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3.DEFAULT_ICEBERG_URI_PATH",  # noqa: E501
+        new="/mds/iceberg",
+    )
+    expected_url = "https://test_iceberg_endpoint/mds/iceberg"
+    assert connection_config.iceberg_url == expected_url
+
+
+def test_ibm_watsonx_connection_config_object_storage_url(
+    connection_config: IbmWatsonxConnectionConfig,
+):
+    expected_url = "https://test_object_storage_endpoint"
+    assert connection_config.object_storage_url == expected_url
+
+
+def test_ibm_watsonx_connection_config_bearer_token_new_token(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
+):
+    mock_generate_bearer_token = mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "generate_bearer_token",
+        return_value={"access_token": "new_token", "expiration": timestamp_now + 3600},
+    )
+    token = connection_config.bearer_token
+    assert token == "new_token"
+    mock_generate_bearer_token.assert_called_once()
+
+
+def test_ibm_watsonx_connection_config_bearer_token_existing_token(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
+):
+    connection_config._bearer_token = {
+        "access_token": "existing_token",
+        "expiration": timestamp_now + 3600,
+    }
+    mock_generate_bearer_token = mocker.patch.object(
+        IbmWatsonxConnectionConfig, "generate_bearer_token"
+    )
+    token = connection_config.bearer_token
+    assert token == "existing_token"
+    mock_generate_bearer_token.assert_not_called()
+
+
+def test_ibm_watsonx_connection_config_bearer_token_expired_token(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
+):
+    connection_config._bearer_token = {
+        "access_token": "expired_token",
+        "expiration": timestamp_now - 3600,
+    }
+    mock_generate_bearer_token = mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "generate_bearer_token",
+        return_value={"access_token": "new_token", "expiration": timestamp_now + 3600},
+    )
+    token = connection_config.bearer_token
+    assert token == "new_token"
+    mock_generate_bearer_token.assert_called_once()
+
+
+def test_ibm_watsonx_connection_config_bearer_token_soon_to_expire_token(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
+):
+    connection_config._bearer_token = {
+        "access_token": "soon_to_expire_token",
+        "expiration": timestamp_now + 60,
+    }
+    mock_generate_bearer_token = mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "generate_bearer_token",
+        return_value={"access_token": "new_token", "expiration": timestamp_now + 3600},
+    )
+    token = connection_config.bearer_token
+    assert token == "new_token"
+    mock_generate_bearer_token.assert_called_once()
+
+
+def test_ibm_watsonx_connection_config_get_catalog_success(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig
+):
+    mocker.patch(
+        "unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3.DEFAULT_ICEBERG_URI_PATH",  # noqa: E501
+        new="/mds/iceberg",
+    )
+    mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "bearer_token",
+        new="test_bearer_token",
+    )
+    mock_load_catalog = mocker.patch("pyiceberg.catalog.load_catalog")
+
+    with connection_config.get_catalog() as catalog:
+        assert catalog is not None
+        mock_load_catalog.assert_called_once_with(
+            **{
+                "name": "test_catalog",
+                "type": "rest",
+                "uri": "https://test_iceberg_endpoint/mds/iceberg",
+                "token": "test_bearer_token",
+                "warehouse": "test_catalog",
+                "s3.endpoint": "https://test_object_storage_endpoint",
+                "s3.access-key-id": "test_access_key_id",
+                "s3.secret-access-key": "test_secret_access_key",
+                "s3.region": "test_region",
+            }
+        )
+
+
+def test_ibm_watsonx_connection_config_get_catalog_failure(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig
+):
+    mocker.patch(
+        "pyiceberg.catalog.load_catalog",
+        side_effect=Exception("Connection error"),
+    )
+    mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "bearer_token",
+        new="test_bearer_token",
+    )
+    with pytest.raises(ProviderError):
+        with connection_config.get_catalog():
+            pass
+
+
+def test_ibm_watsonx_uploader_precheck_namespace_exists_table_exists(
+    mock_get_catalog: MagicMock,
+    mock_catalog: MagicMock,
+    uploader: IbmWatsonxUploader,
+):
+    uploader.precheck()
+
+    mock_catalog.namespace_exists.assert_called_once_with("test_namespace")
+    mock_catalog.table_exists.assert_called_once_with(("test_namespace", "test_table"))
+
+
+def test_ibm_watsonx_uploader_precheck_namespace_does_not_exist(
+    mock_get_catalog: MagicMock,
+    mock_catalog: MagicMock,
+    uploader: IbmWatsonxUploader,
+):
+    mock_catalog.namespace_exists.return_value = False
+
+    with pytest.raises(UserError, match="Namespace 'test_namespace' does not exist"):
+        uploader.precheck()
+
+    mock_catalog.namespace_exists.assert_called_once_with("test_namespace")
+    mock_catalog.table_exists.assert_not_called()
+
+
+def test_ibm_watsonx_uploader_precheck_table_does_not_exist(
+    mock_get_catalog: MagicMock,
+    mock_catalog: MagicMock,
+    uploader: IbmWatsonxUploader,
+):
+    mock_catalog.table_exists.return_value = False
+
+    with pytest.raises(
+        UserError,
+        match="Table 'test_table' does not exist in namespace 'test_namespace'",
+    ):
+        uploader.precheck()
+
+    mock_catalog.namespace_exists.assert_called_once_with("test_namespace")
+    mock_catalog.table_exists.assert_called_once_with(("test_namespace", "test_table"))
+
+
+def test_ibm_watsonx_uploader_upload_data_table_success(
+    uploader: IbmWatsonxUploader,
+    mock_table: MagicMock,
+    mock_transaction: MagicMock,
+    mock_data_table: MagicMock,
+    mock_delete: MagicMock,
+    file_data: FileData,
+):
+    uploader.upload_data_table(mock_table, mock_data_table, file_data)
+
+    mock_delete.assert_called_once_with(mock_transaction, "test_identifier")
+    mock_transaction.append.assert_called_once_with(mock_data_table)
+
+
+def test_ibm_watsonx_uploader_upload_data_table_commit_exception(
+    uploader: IbmWatsonxUploader,
+    mock_table: MagicMock,
+    mock_transaction: MagicMock,
+    mock_data_table: MagicMock,
+    mock_delete: MagicMock,
+    file_data: FileData,
+):
+    mock_transaction.append.side_effect = CommitFailedException()
+
+    with pytest.raises(ProviderError):
+        uploader.upload_data_table(mock_table, mock_data_table, file_data)
+    assert mock_table.refresh.call_count == 5
+
+
+def test_ibm_watsonx_uploader_upload_data_table_exception(
+    uploader: IbmWatsonxUploader,
+    mock_table: MagicMock,
+    mock_transaction: MagicMock,
+    mock_data_table: MagicMock,
+    mock_delete: MagicMock,
+    file_data: FileData,
+):
+    mock_transaction.append.side_effect = Exception()
+
+    with pytest.raises(ProviderError):
+        uploader.upload_data_table(mock_table, mock_data_table, file_data)
+    assert mock_table.refresh.call_count == 0
+
+
+def test_ibm_watsonx_uploader_df_to_arrow_table(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+    test_df: pd.DataFrame,
+):
+    mock_fit_to_schema = mocker.patch.object(
+        IbmWatsonxUploader, "_fit_to_schema", return_value=test_df
+    )
+
+    result = uploader._df_to_arrow_table(test_df)
+
+    mock_fit_to_schema.assert_called_once_with(test_df, add_missing_columns=False)
+    assert len(result.column_names) == 3
+    assert "test_column_0" in result.column_names
+    assert "test_column_1" in result.column_names
+    assert "test_column_2" in result.column_names
+
+
+def test_ibm_watsonx_uploader_can_delete_column_exists(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+):
+    mocker.patch.object(
+        IbmWatsonxUploader, "get_table_columns", return_value=["test_record_id_key"]
+    )
+
+    assert uploader.can_delete() is True
+
+
+def test_ibm_watsonx_uploader_can_delete_column_does_not_exist(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+):
+    mocker.patch.object(IbmWatsonxUploader, "get_table_columns", return_value=["other_column"])
+
+    assert uploader.can_delete() is False
+
+
+def test_ibm_watsonx_uploader_get_table_columns_cache(
+    uploader: IbmWatsonxUploader,
+):
+    uploader._columns = ["cached_column"]
+
+    result = uploader.get_table_columns()
+
+    assert result == ["cached_column"]
+
+
+def test_ibm_watsonx_uploader_get_table_columns_no_cache(
+    uploader: IbmWatsonxUploader,
+    mock_get_table: MagicMock,
+    mock_table: MagicMock,
+):
+    uploader._columns = None
+    mock_table.schema.return_value.column_names = ["column_1", "column_2"]
+
+    result = uploader.get_table_columns()
+
+    mock_get_table.assert_called_once()
+    assert result == ["column_1", "column_2"]
+    assert uploader._columns == ["column_1", "column_2"]
+
+
+def test_ibm_watsonx_uploader_upload_dataframe_success(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+    test_df: pd.DataFrame,
+    mock_get_table: MagicMock,
+    mock_table: MagicMock,
+    mock_data_table: MagicMock,
+    file_data: FileData,
+):
+    mocker.patch.object(IbmWatsonxUploader, "_df_to_arrow_table", return_value=mock_data_table)
+    mock_upload_data_table = mocker.patch.object(IbmWatsonxUploader, "upload_data_table")
+
+    uploader.upload_dataframe(test_df, file_data)
+
+    mock_get_table.assert_called_once()
+    mock_upload_data_table.assert_called_once_with(mock_table, mock_data_table, file_data)
+
+
+def test_ibm_watsonx_uploader_delete_can_delete(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+    mock_transaction: MagicMock,
+):
+    mocker.patch.object(IbmWatsonxUploader, "can_delete", return_value=True)
+    mock_equal_to = mocker.patch("pyiceberg.expressions.EqualTo")
+
+    uploader._delete(mock_transaction, "test_identifier")
+
+    mock_equal_to.assert_called_once_with("test_record_id_key", "test_identifier")
+    mock_transaction.delete.assert_called_once_with(delete_filter=mock_equal_to.return_value)
+
+
+def test_ibm_watsonx_uploader_delete_cannot_delete(
+    caplog: pytest.LogCaptureFixture,
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+    mock_transaction: MagicMock,
+):
+    mocker.patch.object(IbmWatsonxUploader, "can_delete", return_value=False)
+
+    uploader._delete(mock_transaction, "test_identifier")
+    mock_transaction.delete.assert_not_called()
+    assert (
+        "Table doesn't contain expected record id column test_record_id_key, skipping delete"
+        in caplog.text
+    )
--- test/unit/v2/connectors/sql/test_sql.py
+++ test/unit/v2/connectors/sql/test_sql.py
@@ -1,10 +1,16 @@
 from pathlib import Path

+import pandas as pd
 import pytest
 from pytest_mock import MockerFixture

 from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
-from unstructured_ingest.v2.processes.connectors.sql.sql import SQLUploadStager
+from unstructured_ingest.v2.processes.connectors.sql.sql import (
+    SQLConnectionConfig,
+    SQLUploader,
+    SQLUploaderConfig,
+    SQLUploadStager,
+)


 @pytest.fixture
@@ -12,6 +18,17 @@ def mock_instance() -> SQLUploadStager:
     return SQLUploadStager()


+@pytest.fixture
+def mock_uploader(mocker: MockerFixture) -> SQLUploader:
+    mock_connection_config = mocker.Mock(spec=SQLConnectionConfig)
+    mock_upload_config = mocker.Mock(spec=SQLUploaderConfig)
+    return SQLUploader(
+        upload_config=mock_upload_config,
+        connection_config=mock_connection_config,
+        connector_type="sql_test",
+    )
+
+
 @pytest.mark.parametrize(
     ("input_filepath", "output_filename", "expected"),
     [
@@ -72,3 +89,64 @@ def test_run_output_filename_suffix(
         path=output_dir / expected, data=[{"key": "value"}, {"key": "value2"}]
     )
     assert result.name == expected
+
+
+def test_fit_to_schema_drop_columns(mocker: MockerFixture, mock_uploader: SQLUploader):
+    df = pd.DataFrame(
+        {
+            "col1": [1, 2],
+            "col2": [3, 4],
+            "col3": [5, 6],
+        }
+    )
+    mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
+
+    result = mock_uploader._fit_to_schema(df)
+
+    assert "col3" not in result.columns
+    assert "col1" in result.columns
+    assert "col2" in result.columns
+
+
+def test_fit_to_schema_add_missing_columns(mocker: MockerFixture, mock_uploader: SQLUploader):
+    df = pd.DataFrame(
+        {
+            "col1": [1, 2],
+        }
+    )
+    mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
+
+    result = mock_uploader._fit_to_schema(df)
+
+    assert "col2" in result.columns
+    assert result["col2"].isnull().all()
+
+
+def test_fit_to_schema_no_changes(mocker: MockerFixture, mock_uploader: SQLUploader):
+    df = pd.DataFrame(
+        {
+            "col1": [1, 2],
+            "col2": [3, 4],
+        }
+    )
+    mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
+
+    result = mock_uploader._fit_to_schema(df)
+
+    assert "col1" in result.columns
+    assert "col2" in result.columns
+    assert result.equals(df)
+
+
+def test_fit_to_schema_no_add_missing_columns(mocker: MockerFixture, mock_uploader: SQLUploader):
+    df = pd.DataFrame(
+        {
+            "col1": [1, 2],
+        }
+    )
+    mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
+
+    result = mock_uploader._fit_to_schema(df, add_missing_columns=False)
+
+    assert "col2" not in result.columns
+    assert "col1" in result.columns
--- unstructured_ingest/__version__.py
+++ unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.18"  # pragma: no cover
+__version__ = "0.5.20"  # pragma: no cover
--- unstructured_ingest/v2/processes/connectors/__init__.py
+++ unstructured_ingest/v2/processes/connectors/__init__.py
@@ -4,6 +4,7 @@ import unstructured_ingest.v2.processes.connectors.databricks  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.duckdb  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.elasticsearch  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.fsspec  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.ibm_watsonx  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.kafka  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.lancedb  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.qdrant  # noqa: F401
@@ -121,4 +122,5 @@ add_source_entry(source_type=DISCORD_CONNECTOR_TYPE, entry=discord_source_entry)
 add_destination_entry(destination_type=REDIS_CONNECTOR_TYPE, entry=redis_destination_entry)

 add_source_entry(source_type=JIRA_CONNECTOR_TYPE, entry=jira_source_entry)
+
 add_source_entry(source_type=ZENDESK_CONNECTOR_TYPE, entry=zendesk_source_entry)
--- /dev/null
+++ unstructured_ingest/v2/processes/connectors/ibm_watsonx/__init__.py
@@ -0,0 +1,10 @@
+from __future__ import annotations
+
+from unstructured_ingest.v2.processes.connector_registry import add_destination_entry
+
+from .ibm_watsonx_s3 import CONNECTOR_TYPE as IBM_WATSONX_S3_CONNECTOR_TYPE
+from .ibm_watsonx_s3 import ibm_watsonx_s3_destination_entry
+
+add_destination_entry(
+    destination_type=IBM_WATSONX_S3_CONNECTOR_TYPE, entry=ibm_watsonx_s3_destination_entry
+)
--- /dev/null
+++ unstructured_ingest/v2/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py
@@ -0,0 +1,301 @@
+import logging
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional, Tuple
+
+import pandas as pd
+from pydantic import Field, Secret
+
+from unstructured_ingest.utils.data_prep import get_data_df
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
+from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    FileData,
+    UploaderConfig,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.sql.sql import (
+    SQLUploader,
+    SQLUploadStager,
+    SQLUploadStagerConfig,
+)
+
+if TYPE_CHECKING:
+    from pyarrow import Table as ArrowTable
+    from pyiceberg.catalog.rest import RestCatalog
+    from pyiceberg.table import Table, Transaction
+
+CONNECTOR_TYPE = "ibm_watsonx_s3"
+
+DEFAULT_IBM_CLOUD_AUTH_URL = "https://iam.cloud.ibm.com/identity/token"
+DEFAULT_ICEBERG_URI_PATH = "/mds/iceberg"
+DEFAULT_ICEBERG_CATALOG_TYPE = "rest"
+
+
+class IcebergCommitFailedException(Exception):
+    """Failed to commit changes to the iceberg table."""
+
+
+class IbmWatsonxAccessConfig(AccessConfig):
+    iam_api_key: str = Field(description="IBM IAM API Key")
+    access_key_id: str = Field(description="Cloud Object Storage HMAC Access Key ID")
+    secret_access_key: str = Field(description="Cloud Object Storage HMAC Secret Access Key")
+
+
+class IbmWatsonxConnectionConfig(ConnectionConfig):
+    access_config: Secret[IbmWatsonxAccessConfig]
+    iceberg_endpoint: str = Field(description="Iceberg REST endpoint")
+    object_storage_endpoint: str = Field(description="Cloud Object Storage public endpoint")
+    object_storage_region: str = Field(description="Cloud Object Storage region")
+    catalog: str = Field(description="Catalog name")
+
+    _bearer_token: Optional[dict[str, Any]] = None
+
+    @property
+    def iceberg_url(self) -> str:
+        return f"https://{self.iceberg_endpoint.strip('/')}{DEFAULT_ICEBERG_URI_PATH}"
+
+    @property
+    def object_storage_url(self) -> str:
+        return f"https://{self.object_storage_endpoint.strip('/')}"
+
+    @property
+    def bearer_token(self) -> str:
+        # Add 60 seconds to deal with edge cases where the token expires before the request is made
+        timestamp = int(time.time()) + 60
+        if self._bearer_token is None or self._bearer_token.get("expiration", 0) <= timestamp:
+            self._bearer_token = self.generate_bearer_token()
+        return self._bearer_token["access_token"]
+
+    @requires_dependencies(["httpx"], extras="ibm-watsonx-s3")
+    def wrap_error(self, e: Exception) -> Exception:
+        import httpx
+
+        if not isinstance(e, httpx.HTTPStatusError):
+            logger.error(f"Unhandled exception from IBM watsonx.data connector: {e}", exc_info=True)
+            return e
+        url = e.request.url
+        response_code = e.response.status_code
+        if response_code == 401:
+            logger.error(
+                f"Failed to authenticate IBM watsonx.data user {url}, status code {response_code}"
+            )
+            return UserAuthError(e)
+        if response_code == 403:
+            logger.error(
+                f"Given IBM watsonx.data user is not authorized {url}, status code {response_code}"
+            )
+            return UserAuthError(e)
+        if 400 <= response_code < 500:
+            logger.error(
+                f"Request to {url} failed"
+                f"in IBM watsonx.data connector, status code {response_code}"
+            )
+            return UserError(e)
+        if response_code > 500:
+            logger.error(
+                f"Request to {url} failed"
+                f"in IBM watsonx.data connector, status code {response_code}"
+            )
+            return ProviderError(e)
+        logger.error(f"Unhandled exception from IBM watsonx.data connector: {e}", exc_info=True)
+        return e
+
+    @requires_dependencies(["httpx"], extras="ibm-watsonx-s3")
+    def generate_bearer_token(self) -> dict[str, Any]:
+        import httpx
+
+        headers = {
+            "Content-Type": "application/x-www-form-urlencoded",
+            "Accept": "application/json",
+        }
+        data = {
+            "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
+            "apikey": self.access_config.get_secret_value().iam_api_key,
+        }
+
+        logger.info("Generating IBM IAM Bearer Token")
+        try:
+            response = httpx.post(DEFAULT_IBM_CLOUD_AUTH_URL, headers=headers, data=data)
+            response.raise_for_status()
+        except Exception as e:
+            raise self.wrap_error(e)
+        return response.json()
+
+    def get_catalog_config(self) -> dict[str, Any]:
+        return {
+            "name": self.catalog,
+            "type": DEFAULT_ICEBERG_CATALOG_TYPE,
+            "uri": self.iceberg_url,
+            "token": self.bearer_token,
+            "warehouse": self.catalog,
+            "s3.endpoint": self.object_storage_url,
+            "s3.access-key-id": self.access_config.get_secret_value().access_key_id,
+            "s3.secret-access-key": self.access_config.get_secret_value().secret_access_key,
+            "s3.region": self.object_storage_region,
+        }
+
+    @requires_dependencies(["pyiceberg"], extras="ibm-watsonx-s3")
+    @contextmanager
+    def get_catalog(self) -> Generator["RestCatalog", None, None]:
+        from pyiceberg.catalog import load_catalog
+
+        try:
+            catalog_config = self.get_catalog_config()
+            catalog = load_catalog(**catalog_config)
+        except Exception as e:
+            logger.error(f"Failed to connect to catalog '{self.catalog}': {e}", exc_info=True)
+            raise ProviderError(f"Failed to connect to catalog '{self.catalog}': {e}")
+
+        yield catalog
+
+
+@dataclass
+class IbmWatsonxUploadStagerConfig(SQLUploadStagerConfig):
+    pass
+
+
+@dataclass
+class IbmWatsonxUploadStager(SQLUploadStager):
+    upload_stager_config: IbmWatsonxUploadStagerConfig = field(
+        default_factory=IbmWatsonxUploadStagerConfig
+    )
+
+
+class IbmWatsonxUploaderConfig(UploaderConfig):
+    namespace: str = Field(description="Namespace name")
+    table: str = Field(description="Table name")
+    max_retries: int = Field(
+        default=5, description="Maximum number of retries to upload data", ge=2, le=10
+    )
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="Searchable key to find entries for the same record on previous runs",
+    )
+
+    @property
+    def table_identifier(self) -> Tuple[str, str]:
+        return (self.namespace, self.table)
+
+
+@dataclass
+class IbmWatsonxUploader(SQLUploader):
+    connection_config: IbmWatsonxConnectionConfig
+    upload_config: IbmWatsonxUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> None:
+        with self.connection_config.get_catalog() as catalog:
+            if not catalog.namespace_exists(self.upload_config.namespace):
+                raise UserError(f"Namespace '{self.upload_config.namespace}' does not exist")
+            if not catalog.table_exists(self.upload_config.table_identifier):
+                raise UserError(
+                    f"Table '{self.upload_config.table}' does not exist in namespace '{self.upload_config.namespace}'"  # noqa: E501
+                )
+
+    @contextmanager
+    def get_table(self) -> Generator["Table", None, None]:
+        with self.connection_config.get_catalog() as catalog:
+            table = catalog.load_table(self.upload_config.table_identifier)
+            yield table
+
+    def get_table_columns(self) -> list[str]:
+        if self._columns is None:
+            with self.get_table() as table:
+                self._columns = table.schema().column_names
+        return self._columns
+
+    def can_delete(self) -> bool:
+        return self.upload_config.record_id_key in self.get_table_columns()
+
+    @requires_dependencies(["pyarrow"], extras="ibm-watsonx-s3")
+    def _df_to_arrow_table(self, df: pd.DataFrame) -> "ArrowTable":
+        import pyarrow as pa
+
+        # Iceberg will automatically fill missing columns with nulls
+        # Iceberg will throw an error if the DataFrame column has only null values
+        # because it can't infer the type of the column and match it with the table schema
+        return pa.Table.from_pandas(self._fit_to_schema(df, add_missing_columns=False))
+
+    @requires_dependencies(["pyiceberg"], extras="ibm-watsonx-s3")
+    def _delete(self, transaction: "Transaction", identifier: str) -> None:
+        from pyiceberg.expressions import EqualTo
+
+        if self.can_delete():
+            transaction.delete(delete_filter=EqualTo(self.upload_config.record_id_key, identifier))
+        else:
+            logger.warning(
+                f"Table doesn't contain expected "
+                f"record id column "
+                f"{self.upload_config.record_id_key}, skipping delete"
+            )
+
+    @requires_dependencies(["pyiceberg", "tenacity"], extras="ibm-watsonx-s3")
+    def upload_data_table(
+        self, table: "Table", data_table: "ArrowTable", file_data: FileData
+    ) -> None:
+        from pyiceberg.exceptions import CommitFailedException
+        from tenacity import (
+            before_log,
+            retry,
+            retry_if_exception_type,
+            stop_after_attempt,
+            wait_random,
+        )
+
+        @retry(
+            stop=stop_after_attempt(self.upload_config.max_retries),
+            wait=wait_random(),
+            retry=retry_if_exception_type(IcebergCommitFailedException),
+            before=before_log(logger, logging.DEBUG),
+            reraise=True,
+        )
+        def _upload_data_table(table: "Table", data_table: "ArrowTable", file_data: FileData):
+            try:
+                with table.transaction() as transaction:
+                    self._delete(transaction, file_data.identifier)
+                    transaction.append(data_table)
+            except CommitFailedException as e:
+                table.refresh()
+                logger.debug(e)
+                raise IcebergCommitFailedException(e)
+            except Exception as e:
+                raise ProviderError(f"Failed to upload data to table: {e}")
+
+        try:
+            return _upload_data_table(table, data_table, file_data)
+        except ProviderError:
+            raise
+        except Exception as e:
+            raise ProviderError(f"Failed to upload data to table: {e}")
+
+    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
+        data_table = self._df_to_arrow_table(df)
+
+        with self.get_table() as table:
+            self.upload_data_table(table, data_table, file_data)
+
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        df = pd.DataFrame(data)
+        self.upload_dataframe(df=df, file_data=file_data)
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        df = get_data_df(path=path)
+        self.upload_dataframe(df=df, file_data=file_data)
+
+
+ibm_watsonx_s3_destination_entry = DestinationRegistryEntry(
+    connection_config=IbmWatsonxConnectionConfig,
+    uploader=IbmWatsonxUploader,
+    uploader_config=IbmWatsonxUploaderConfig,
+    upload_stager=IbmWatsonxUploadStager,
+    upload_stager_config=IbmWatsonxUploadStagerConfig,
+)
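
For orientation, a rough sketch of how the new IBM watsonx.data destination could be instantiated, using only the classes and fields shown in the hunks above; all endpoint, credential, namespace, and table values below are placeholders, and the surrounding pipeline wiring is omitted:

    from pydantic import Secret

    from unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3 import (
        IbmWatsonxAccessConfig,
        IbmWatsonxConnectionConfig,
        IbmWatsonxUploader,
        IbmWatsonxUploaderConfig,
    )

    # Placeholder credentials/endpoints; real values come from IBM Cloud and watsonx.data.
    connection_config = IbmWatsonxConnectionConfig(
        access_config=Secret(
            IbmWatsonxAccessConfig(
                iam_api_key="<iam-api-key>",
                access_key_id="<hmac-access-key-id>",
                secret_access_key="<hmac-secret-access-key>",
            )
        ),
        iceberg_endpoint="<iceberg-rest-endpoint>",
        object_storage_endpoint="<cos-public-endpoint>",
        object_storage_region="<cos-region>",
        catalog="<catalog-name>",
    )
    uploader = IbmWatsonxUploader(
        connection_config=connection_config,
        upload_config=IbmWatsonxUploaderConfig(namespace="<namespace>", table="<table>"),
    )
    uploader.precheck()  # verifies that the namespace and table exist before uploading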
--- unstructured_ingest/v2/processes/connectors/redisdb.py
+++ unstructured_ingest/v2/processes/connectors/redisdb.py
@@ -110,6 +110,7 @@ class RedisConnectionConfig(ConnectionConfig):

 class RedisUploaderConfig(UploaderConfig):
     batch_size: int = Field(default=100, description="Number of records per batch")
+    key_prefix: str = Field(default="", description="Prefix for Redis keys")


 @dataclass
@@ -145,11 +146,11 @@ class RedisUploader(Uploader):
         async with self.connection_config.create_async_client() as async_client:
             async with async_client.pipeline(transaction=True) as pipe:
                 for element in batch:
-                    element_id = element["element_id"]
+                    key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
                     if redis_stack:
-                        pipe.json().set(element_id, "$", element)
+                        pipe.json().set(key_with_prefix, "$", element)
                     else:
-                        pipe.set(element_id, json.dumps(element))
+                        pipe.set(key_with_prefix, json.dumps(element))
                 await pipe.execute()

     @requires_dependencies(["redis"], extras="redis")
@@ -159,16 +160,16 @@ class RedisUploader(Uploader):
         redis_stack = True
         async with self.connection_config.create_async_client() as async_client:
             async with async_client.pipeline(transaction=True) as pipe:
-                element_id = element["element_id"]
+                key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
                 try:
                     # Redis with stack extension supports JSON type
-                    await pipe.json().set(element_id, "$", element).execute()
+                    await pipe.json().set(key_with_prefix, "$", element).execute()
                 except redis_exceptions.ResponseError as e:
                     message = str(e)
                     if "unknown command `JSON.SET`" in message:
                         # if this error occurs, Redis server doesn't support JSON type,
                         # so save as string type instead
-                        await pipe.set(element_id, json.dumps(element)).execute()
+                        await pipe.set(key_with_prefix, json.dumps(element)).execute()
                         redis_stack = False
                     else:
                         raise e
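
For orientation, the new key_prefix option above is simply prepended to each element's element_id when the Redis key is built (f"{key_prefix}{element_id}"). A minimal configuration sketch, mirroring the integration test; the prefix value is an arbitrary example:

    from unstructured_ingest.v2.processes.connectors.redisdb import RedisUploaderConfig

    # With this config, an element whose element_id is "abc123" (hypothetical) is
    # written under the Redis key "test_ingest:abc123" instead of bare "abc123".
    upload_config = RedisUploaderConfig(batch_size=10, key_prefix="test_ingest:")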
--- unstructured_ingest/v2/processes/connectors/sql/sql.py
+++ unstructured_ingest/v2/processes/connectors/sql/sql.py
@@ -323,7 +323,7 @@ class SQLUploader(Uploader):
             output.append(tuple(parsed))
         return output

-    def _fit_to_schema(self, df: pd.DataFrame) -> pd.DataFrame:
+    def _fit_to_schema(self, df: pd.DataFrame, add_missing_columns: bool = True) -> pd.DataFrame:
         table_columns = self.get_table_columns()
         columns = set(df.columns)
         schema_fields = set(table_columns)
@@ -335,7 +335,7 @@ class SQLUploader(Uploader):
                 "Following columns will be dropped to match the table's schema: "
                 f"{', '.join(columns_to_drop)}"
             )
-        if missing_columns:
+        if missing_columns and add_missing_columns:
             logger.info(
                 "Following null filled columns will be added to match the table's schema:"
                 f" {', '.join(missing_columns)} "
@@ -343,8 +343,9 @@ class SQLUploader(Uploader):

         df = df.drop(columns=columns_to_drop)

-        for column in missing_columns:
-            df[column] = pd.Series()
+        if add_missing_columns:
+            for column in missing_columns:
+                df[column] = pd.Series()
         return df

     def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
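
For orientation, the new add_missing_columns flag changes only the back-fill step of _fit_to_schema: columns missing from the target table are still dropped, but null-filled columns are only added when the flag is left at its default of True. A small sketch mirroring the new unit tests (the mocked uploader and the ["col1", "col2"] table schema are illustrative assumptions):

    from unittest.mock import Mock, patch

    import pandas as pd

    from unstructured_ingest.v2.processes.connectors.sql.sql import (
        SQLConnectionConfig,
        SQLUploader,
        SQLUploaderConfig,
    )

    uploader = SQLUploader(
        upload_config=Mock(spec=SQLUploaderConfig),
        connection_config=Mock(spec=SQLConnectionConfig),
        connector_type="sql_test",
    )
    df = pd.DataFrame({"col1": [1, 2]})

    with patch.object(uploader, "get_table_columns", return_value=["col1", "col2"]):
        # Default behaviour: the missing "col2" is added and filled with nulls.
        with_nulls = uploader._fit_to_schema(df)
        # add_missing_columns=False (as the watsonx.data uploader does before building
        # an Arrow table): "col2" is left out and Iceberg back-fills it on append.
        without_nulls = uploader._fit_to_schema(df, add_missing_columns=False)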
--- unstructured_ingest-0.5.18.dist-info/METADATA
+++ unstructured_ingest-0.5.20.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: unstructured-ingest
-Version: 0.5.18
+Version: 0.5.20
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -23,12 +23,12 @@ Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
 Requires-Dist: python-dateutil
+Requires-Dist: click
 Requires-Dist: opentelemetry-sdk
+Requires-Dist: pydantic>=2.7
 Requires-Dist: pandas
-Requires-Dist: dataclasses_json
 Requires-Dist: tqdm
-Requires-Dist: click
-Requires-Dist: pydantic>=2.7
+Requires-Dist: dataclasses_json
 Provides-Extra: remote
 Requires-Dist: unstructured-client>=0.30.0; extra == "remote"
 Provides-Extra: csv
@@ -86,8 +86,8 @@ Requires-Dist: atlassian-python-api; extra == "confluence"
 Provides-Extra: couchbase
 Requires-Dist: couchbase; extra == "couchbase"
 Provides-Extra: delta-table
-Requires-Dist: deltalake; extra == "delta-table"
 Requires-Dist: boto3; extra == "delta-table"
+Requires-Dist: deltalake; extra == "delta-table"
 Provides-Extra: discord
 Requires-Dist: discord.py; extra == "discord"
 Provides-Extra: dropbox
@@ -99,8 +99,8 @@ Provides-Extra: elasticsearch
 Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
 Provides-Extra: gcs
 Requires-Dist: bs4; extra == "gcs"
-Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: gcsfs; extra == "gcs"
+Requires-Dist: fsspec; extra == "gcs"
 Provides-Extra: github
 Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: requests; extra == "github"
@@ -109,8 +109,13 @@ Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
 Requires-Dist: google-api-python-client; extra == "google-drive"
 Provides-Extra: hubspot
-Requires-Dist: urllib3; extra == "hubspot"
 Requires-Dist: hubspot-api-client; extra == "hubspot"
+Requires-Dist: urllib3; extra == "hubspot"
+Provides-Extra: ibm-watsonx-s3
+Requires-Dist: httpx; extra == "ibm-watsonx-s3"
+Requires-Dist: pyarrow; extra == "ibm-watsonx-s3"
+Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
+Requires-Dist: pyiceberg; extra == "ibm-watsonx-s3"
 Provides-Extra: jira
 Requires-Dist: atlassian-python-api; extra == "jira"
 Provides-Extra: kafka
@@ -125,13 +130,13 @@ Provides-Extra: mongodb
 Requires-Dist: pymongo; extra == "mongodb"
 Provides-Extra: neo4j
 Requires-Dist: networkx; extra == "neo4j"
-Requires-Dist: neo4j-rust-ext; extra == "neo4j"
 Requires-Dist: cymple; extra == "neo4j"
+Requires-Dist: neo4j-rust-ext; extra == "neo4j"
 Provides-Extra: notion
+Requires-Dist: htmlBuilder; extra == "notion"
+Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: httpx; extra == "notion"
 Requires-Dist: backoff; extra == "notion"
-Requires-Dist: notion-client; extra == "notion"
-Requires-Dist: htmlBuilder; extra == "notion"
 Provides-Extra: onedrive
 Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
@@ -152,8 +157,8 @@ Requires-Dist: praw; extra == "reddit"
 Provides-Extra: redis
 Requires-Dist: redis; extra == "redis"
 Provides-Extra: s3
-Requires-Dist: fsspec; extra == "s3"
 Requires-Dist: s3fs; extra == "s3"
+Requires-Dist: fsspec; extra == "s3"
 Provides-Extra: sharepoint
 Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Requires-Dist: msal; extra == "sharepoint"
@@ -165,8 +170,8 @@ Requires-Dist: fsspec; extra == "sftp"
 Provides-Extra: slack
 Requires-Dist: slack_sdk[optional]; extra == "slack"
 Provides-Extra: snowflake
-Requires-Dist: psycopg2-binary; extra == "snowflake"
 Requires-Dist: snowflake-connector-python; extra == "snowflake"
+Requires-Dist: psycopg2-binary; extra == "snowflake"
 Provides-Extra: wikipedia
 Requires-Dist: wikipedia; extra == "wikipedia"
 Provides-Extra: weaviate
@@ -178,17 +183,17 @@ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
 Provides-Extra: singlestore
 Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: vectara
-Requires-Dist: aiofiles; extra == "vectara"
 Requires-Dist: httpx; extra == "vectara"
+Requires-Dist: aiofiles; extra == "vectara"
 Requires-Dist: requests; extra == "vectara"
 Provides-Extra: vastdb
-Requires-Dist: vastdb; extra == "vastdb"
 Requires-Dist: pyarrow; extra == "vastdb"
+Requires-Dist: vastdb; extra == "vastdb"
 Requires-Dist: ibis; extra == "vastdb"
 Provides-Extra: zendesk
+Requires-Dist: bs4; extra == "zendesk"
 Requires-Dist: aiofiles; extra == "zendesk"
 Requires-Dist: httpx; extra == "zendesk"
-Requires-Dist: bs4; extra == "zendesk"
 Provides-Extra: embed-huggingface
 Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Provides-Extra: embed-octoai
--- unstructured_ingest-0.5.18.dist-info/RECORD
+++ unstructured_ingest-0.5.20.dist-info/RECORD
@@ -21,7 +21,7 @@ test/integration/connectors/test_notion.py,sha256=ueXyVqYWzP4LuZYe6PauptkXNG6qko
 test/integration/connectors/test_onedrive.py,sha256=iwiDK0kWCfQbIEPnWUzzAA5PiCsHcmFZSxEcIZy_6cc,5229
 test/integration/connectors/test_pinecone.py,sha256=9FC0frer7gtDzk5A6OhGsV8S4ggYfa5ReEO9t7L3Am0,13649
 test/integration/connectors/test_qdrant.py,sha256=Yme3ZZ5zIbaZ-yYLUqN2oy0hsrcAfvlleRLYWMSYeSE,8049
-test/integration/connectors/test_redis.py,sha256=1aKwOb-K4zCxZwHmgW_WzGJwqLntbWTbpGQ-rtUwN9o,4360
+test/integration/connectors/test_redis.py,sha256=YXWWw4m40ZmLrf3eJ85hhT7WSJnri_GY1ieixIicYlI,5102
 test/integration/connectors/test_s3.py,sha256=E1dypeag_E3OIfpQWIz3jb7ctRHRD63UtyTrzyvJzpc,7473
 test/integration/connectors/test_sharepoint.py,sha256=weGby5YD6se7R7KLEq96hxUZYPzwoqZqXXTPhtQWZsQ,7646
 test/integration/connectors/test_vectara.py,sha256=4kKOOTGUjeZw2jKRcgVDI7ifbRPRZfjjVO4d_7H5C6I,8710
@@ -93,10 +93,12 @@ test/unit/v2/connectors/test_confluence.py,sha256=lN6nnU5qOtmsjIGcz65roepm76w4vP
 test/unit/v2/connectors/test_jira.py,sha256=XEBBDSdNZWUVO5JbpiSsjazJYmbLsgXUOW-APqPRKLg,12113
 test/unit/v2/connectors/databricks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/connectors/databricks/test_volumes_table.py,sha256=-R_EJHqv1BseGRK9VRAZhF-2EXA64LAlhycoyIu556U,1078
+test/unit/v2/connectors/ibm_watsonx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+test/unit/v2/connectors/ibm_watsonx/test_ibm_watsonx_s3.py,sha256=gvgF9vCA_cPQVS_IC6VFvnP4ojFVKOH7eorM6k5VR84,14518
 test/unit/v2/connectors/motherduck/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/connectors/motherduck/test_base.py,sha256=f3W7hppEZ904_I_fKax-5LVDp-0yj04DjF1ccZ4k5O8,2503
 test/unit/v2/connectors/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-test/unit/v2/connectors/sql/test_sql.py,sha256=51-AKUBxw6ThO68bjenLopUUuxM88YZb2rMUV8L6YwY,2464
+test/unit/v2/connectors/sql/test_sql.py,sha256=wA5LvLtmaCi-8YDOd515j3YnP0_E4qi7z50NFXBn75g,4634
 test/unit/v2/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/embedders/test_bedrock.py,sha256=HMaweO_v_9Y1SE2m5QImXP73cb26vNTUfc1onTBa1-g,1074
 test/unit/v2/embedders/test_huggingface.py,sha256=TOHUKC7hAadl6XTotr8UqOCq28kbQxOIkPSrMxr2PLU,1546
@@ -111,7 +113,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
 test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=QYn6GUOSyCz_KH2wi4yg_FlUU4SE844Xhf0hR6-jv8s,43
+unstructured_ingest/__version__.py,sha256=BCszjb86jsmMjfakEG2zLAZFKHpLYTR2k5TCe7RzaBc,43
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -426,7 +428,7 @@ unstructured_ingest/v2/processes/embedder.py,sha256=gvlCQDsbQVgcp-2f0Qq4RiFbcr8g
 unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
 unstructured_ingest/v2/processes/partitioner.py,sha256=HxopDSbovLh_1epeGeVtuWEX7v5KG35BowwKIJ_y4e8,9910
 unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
-unstructured_ingest/v2/processes/connectors/__init__.py,sha256=ebLvZes84qRx4eS20SkvlVH6WIIM76hifyUgkUJ-dfg,6588
+unstructured_ingest/v2/processes/connectors/__init__.py,sha256=l4Xq4AuzRMTqUv5TU7cE1NbhGCka4SFJFZwG1FoVotE,6666
 unstructured_ingest/v2/processes/connectors/airtable.py,sha256=eeZJe-bBNxt5Sa-XEFCdcGeJCguJU5WN2Mv9kLp5dVQ,8917
 unstructured_ingest/v2/processes/connectors/astradb.py,sha256=5xc5pWFicE_-2BV38oK-nnzAMI2EzF-q8XAqQ3qPUR8,18249
 unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=ngPDpU0oZ6m5sxIlB6u5ebQpqCS_SJ-_amCC1KQ03EQ,11529
@@ -446,7 +448,7 @@ unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=I-eDLAlThHKKFQfkZpQL
 unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=5rg7t40gKxDHNcuJrJHmVzJ9uM7Ct4RBOvFsfwdGc5c,18002
 unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
 unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=O9lC4mZ9V_exg9apiCJSWHsgkuYDSEOlI6CaUS5ZB7c,13961
-unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=p0AY4ukBNpwAemV4bWzpScvVbLTVlI3DzsCNUKiBI5M,6757
+unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=0h105_MpOO4-uydiyHgM4TvduSkAMAr931KFANcKW8Y,6936
 unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
 unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=2T9Bm1H_ALwHhG_YP7vsuUUW-mUg61zcaae3aa9BnN4,4827
 unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
@@ -477,6 +479,8 @@ unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=nlDSKHs8mbXCY5B
 unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=qO4WDZPoxmYMbUkaSvrxXaLn3UxzyMVhpj5wVyXqmi4,6623
 unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=ZimcBJL-Har7GOESb9blzDb8pzPZcmh16YvvHYxYkJM,6373
 unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
+unstructured_ingest/v2/processes/connectors/ibm_watsonx/__init__.py,sha256=EMG7lyThrYO8W7y3DIxGgNNXtbpdeAdvLd0m4tpO-Io,377
+unstructured_ingest/v2/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py,sha256=zaj5MVsM-uf7IRgZGg7QwRtzjtTM1gCYuqji61TrqWk,11562
 unstructured_ingest/v2/processes/connectors/kafka/__init__.py,sha256=mQJ9Ex-QCfhz-BB5YWTfbPf7xGLd1i7FpjRr0ukbhNw,754
 unstructured_ingest/v2/processes/connectors/kafka/cloud.py,sha256=GdAeQ8Uz-6v1C5byBHtjfevVfbzW3obScBFFLRTb0ps,3441
 unstructured_ingest/v2/processes/connectors/kafka/kafka.py,sha256=UfS41jzV9VxekS6AwWHhURJmJ7RUAw5iiIrj75BWrXQ,10255
@@ -564,7 +568,7 @@ unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py,sha25
 unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=BATfX1PQGT2kl8jAbdNKXTojYKJxh3pJV9-h3OBnHGo,5124
 unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=OPBDQ2c_5KjWHEFfqXxf3pQ2tWC-N4MtslMulMgP1Wc,5503
 unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=0hfiX_u7V38k_RfoeDmXJp8WIHZ19ilIHnrgZVSleKw,9270
-unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=F5PPUxt2W8JaAQGfz5Od0FvKqYa15RfwMIlnrdJu1nk,15317
+unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=wtVK6CHrQ4McwsPifUoa7KKaY-v0cjDZJetASSAaSIA,15415
 unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=PRjN_S7UQv0k4ZpSyclW1AJrsrugyxbR-GoOrHvBpks,5200
 unstructured_ingest/v2/processes/connectors/sql/vastdb.py,sha256=0rxrb1ByXIefB9umzMTEJbpvzdTttXHK5DjRY97-GG8,9618
 unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWan69KnzVELvaqX34tMhCytIa-C8EDsXVKsEo,856
@@ -577,9 +581,9 @@ unstructured_ingest/v2/processes/connectors/zendesk/client.py,sha256=DDAYQB7catK
 unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py,sha256=R8SXYkRhVUoWEHdGCt2CzcTxxuFundw_0GlGZ34YmbM,8987
 unstructured_ingest/v2/processes/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/processes/utils/blob_storage.py,sha256=EWvK4HRYubr9i1UyMhv5cU9u0UzVkCDC_BIm4Uxab7Y,964
-unstructured_ingest-0.5.18.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
-unstructured_ingest-0.5.18.dist-info/METADATA,sha256=K47-NP1RfNwqRnvbZ8vO75ab5J5RSmb5nocwSXNwqko,8465
-unstructured_ingest-0.5.18.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-unstructured_ingest-0.5.18.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
-unstructured_ingest-0.5.18.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
-unstructured_ingest-0.5.18.dist-info/RECORD,,
+unstructured_ingest-0.5.20.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.5.20.dist-info/METADATA,sha256=S2Yr62sVeW0csT-QRyonnokiHFvvH0FAwQ2x02BqAeM,8697
+unstructured_ingest-0.5.20.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+unstructured_ingest-0.5.20.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.5.20.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.5.20.dist-info/RECORD,,