unstructured-ingest 0.5.19__py3-none-any.whl → 0.5.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (29)
  1. test/integration/connectors/test_astradb.py +8 -2
  2. test/unit/v2/connectors/ibm_watsonx/__init__.py +0 -0
  3. test/unit/v2/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +459 -0
  4. test/unit/v2/connectors/sql/test_sql.py +79 -1
  5. unstructured_ingest/__version__.py +1 -1
  6. unstructured_ingest/embed/interfaces.py +7 -3
  7. unstructured_ingest/utils/data_prep.py +17 -5
  8. unstructured_ingest/utils/table.py +11 -4
  9. unstructured_ingest/v2/processes/connectors/__init__.py +2 -0
  10. unstructured_ingest/v2/processes/connectors/delta_table.py +8 -3
  11. unstructured_ingest/v2/processes/connectors/duckdb/base.py +4 -3
  12. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +5 -2
  13. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +5 -2
  14. unstructured_ingest/v2/processes/connectors/ibm_watsonx/__init__.py +10 -0
  15. unstructured_ingest/v2/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +301 -0
  16. unstructured_ingest/v2/processes/connectors/kdbai.py +6 -3
  17. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +10 -2
  18. unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +5 -3
  19. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -1
  20. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +7 -3
  21. unstructured_ingest/v2/processes/connectors/sql/sql.py +26 -12
  22. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -1
  23. unstructured_ingest/v2/processes/connectors/sql/vastdb.py +5 -7
  24. {unstructured_ingest-0.5.19.dist-info → unstructured_ingest-0.5.21.dist-info}/METADATA +174 -18
  25. {unstructured_ingest-0.5.19.dist-info → unstructured_ingest-0.5.21.dist-info}/RECORD +29 -25
  26. {unstructured_ingest-0.5.19.dist-info → unstructured_ingest-0.5.21.dist-info}/LICENSE.md +0 -0
  27. {unstructured_ingest-0.5.19.dist-info → unstructured_ingest-0.5.21.dist-info}/WHEEL +0 -0
  28. {unstructured_ingest-0.5.19.dist-info → unstructured_ingest-0.5.21.dist-info}/entry_points.txt +0 -0
  29. {unstructured_ingest-0.5.19.dist-info → unstructured_ingest-0.5.21.dist-info}/top_level.txt +0 -0
test/integration/connectors/test_astradb.py (+8 -2)

@@ -1,3 +1,4 @@
+import contextlib
 import json
 import os
 from dataclasses import dataclass
@@ -231,6 +232,13 @@ def test_astra_create_destination():
     )
     collection_name = "system_created-123"
     formatted_collection_name = "system_created_123"
+
+    client = AstraDBClient()
+    db = client.get_database(api_endpoint=env_data.api_endpoint, token=env_data.token)
+    with contextlib.suppress(Exception):
+        # drop collection before trying to create it
+        db.drop_collection(formatted_collection_name)
+
     created = uploader.create_destination(destination_name=collection_name, vector_length=3072)
     assert created
     assert uploader.upload_config.collection_name == formatted_collection_name
@@ -239,8 +247,6 @@ def test_astra_create_destination():
     assert not created
 
     # cleanup
-    client = AstraDBClient()
-    db = client.get_database(api_endpoint=env_data.api_endpoint, token=env_data.token)
     db.drop_collection(formatted_collection_name)
 
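The relocated setup makes this test idempotent: a collection left behind by an earlier failed run is now dropped, best-effort, before create_destination runs. For readers unfamiliar with the idiom, contextlib.suppress is a flatter equivalent of try/except/pass; a minimal sketch (reset_collection is a hypothetical helper, not part of the connector):

import contextlib

def reset_collection(db, name: str) -> None:
    # Best-effort drop: swallow "collection does not exist" and any
    # other error so setup can start from a clean slate.
    with contextlib.suppress(Exception):
        db.drop_collection(name)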
test/unit/v2/connectors/ibm_watsonx/__init__.py (new empty package marker, +0 -0)
test/unit/v2/connectors/ibm_watsonx/test_ibm_watsonx_s3.py (new file, +459 -0)

@@ -0,0 +1,459 @@
+import time
+from unittest.mock import MagicMock
+
+import pandas as pd
+import pytest
+from pydantic import Secret
+from pyiceberg.exceptions import CommitFailedException
+from pytest_mock import MockerFixture
+
+from unstructured_ingest.v2.errors import ProviderError, UserError
+from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.ibm_watsonx import IBM_WATSONX_S3_CONNECTOR_TYPE
+from unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3 import (
+    IbmWatsonxAccessConfig,
+    IbmWatsonxConnectionConfig,
+    IbmWatsonxUploader,
+    IbmWatsonxUploaderConfig,
+)
+
+
+@pytest.fixture
+def file_data():
+    return FileData(
+        identifier="test_identifier",
+        connector_type=IBM_WATSONX_S3_CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(
+            filename="test_file.pdf", fullpath="/tmp/test_file.pdf"
+        ),
+    )
+
+
+@pytest.fixture
+def access_config():
+    return IbmWatsonxAccessConfig(
+        iam_api_key="test_iam_api_key",
+        access_key_id="test_access_key_id",
+        secret_access_key="test_secret_access_key",
+    )
+
+
+@pytest.fixture
+def connection_config(access_config: IbmWatsonxAccessConfig):
+    return IbmWatsonxConnectionConfig(
+        access_config=Secret(access_config),
+        iceberg_endpoint="test_iceberg_endpoint/",
+        object_storage_endpoint="test_object_storage_endpoint/",
+        object_storage_region="test_region",
+        catalog="test_catalog",
+    )
+
+
+@pytest.fixture
+def uploader_config():
+    return IbmWatsonxUploaderConfig(
+        namespace="test_namespace",
+        table="test_table",
+        record_id_key="test_record_id_key",
+    )
+
+
+@pytest.fixture
+def uploader(
+    connection_config: IbmWatsonxConnectionConfig, uploader_config: IbmWatsonxUploaderConfig
+):
+    return IbmWatsonxUploader(
+        connection_config=connection_config,
+        upload_config=uploader_config,
+    )
+
+
+@pytest.fixture
+def mock_catalog(mocker: MockerFixture):
+    mock_catalog = mocker.MagicMock()
+    mock_catalog.namespace_exists.return_value = True
+    mock_catalog.table_exists.return_value = True
+    return mock_catalog
+
+
+@pytest.fixture
+def mock_get_catalog(mocker: MockerFixture, mock_catalog: MagicMock):
+    mock_get_catalog = mocker.patch.context_manager(
+        IbmWatsonxConnectionConfig, "get_catalog", autospec=True
+    )
+    mock_get_catalog.return_value.__enter__.return_value = mock_catalog
+    return mock_get_catalog
+
+
+@pytest.fixture
+def mock_table(mocker: MockerFixture):
+    mock_table = mocker.MagicMock()
+    return mock_table
+
+
+@pytest.fixture
+def mock_get_table(mocker: MockerFixture, mock_table: MagicMock):
+    mock_get_table = mocker.patch.context_manager(IbmWatsonxUploader, "get_table", autospec=True)
+    mock_get_table.return_value.__enter__.return_value = mock_table
+    return mock_get_table
+
+
+@pytest.fixture
+def mock_transaction(mocker: MockerFixture, mock_table: MagicMock):
+    mock_transaction = mocker.MagicMock()
+    mock_table.transaction.return_value.__enter__.return_value = mock_transaction
+    return mock_transaction
+
+
+@pytest.fixture
+def mock_data_table(mocker: MockerFixture):
+    mock_data_table = mocker.MagicMock()
+    mock_data_table.schema = "schema"
+    return mock_data_table
+
+
+@pytest.fixture
+def mock_delete(mocker: MockerFixture):
+    return mocker.patch.object(IbmWatsonxUploader, "_delete")
+
+
+@pytest.fixture
+def test_df():
+    return pd.DataFrame(
+        {
+            "test_column_0": [True, False, True],
+            "test_column_1": [1, 2, 3],
+            "test_column_2": ["a", "b", "c"],
+        }
+    )
+
+
+@pytest.fixture
+def timestamp_now():
+    return int(time.time())
+
+
+def test_ibm_watsonx_connection_config_iceberg_url(
+    mocker: MockerFixture,
+    connection_config: IbmWatsonxConnectionConfig,
+):
+    mocker.patch(
+        "unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3.DEFAULT_ICEBERG_URI_PATH",  # noqa: E501
+        new="/mds/iceberg",
+    )
+    expected_url = "https://test_iceberg_endpoint/mds/iceberg"
+    assert connection_config.iceberg_url == expected_url
+
+
+def test_ibm_watsonx_connection_config_object_storage_url(
+    connection_config: IbmWatsonxConnectionConfig,
+):
+    expected_url = "https://test_object_storage_endpoint"
+    assert connection_config.object_storage_url == expected_url
+
+
+def test_ibm_watsonx_connection_config_bearer_token_new_token(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
+):
+    mock_generate_bearer_token = mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "generate_bearer_token",
+        return_value={"access_token": "new_token", "expiration": timestamp_now + 3600},
+    )
+    token = connection_config.bearer_token
+    assert token == "new_token"
+    mock_generate_bearer_token.assert_called_once()
+
+
+def test_ibm_watsonx_connection_config_bearer_token_existing_token(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
+):
+    connection_config._bearer_token = {
+        "access_token": "existing_token",
+        "expiration": timestamp_now + 3600,
+    }
+    mock_generate_bearer_token = mocker.patch.object(
+        IbmWatsonxConnectionConfig, "generate_bearer_token"
+    )
+    token = connection_config.bearer_token
+    assert token == "existing_token"
+    mock_generate_bearer_token.assert_not_called()
+
+
+def test_ibm_watsonx_connection_config_bearer_token_expired_token(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
+):
+    connection_config._bearer_token = {
+        "access_token": "expired_token",
+        "expiration": timestamp_now - 3600,
+    }
+    mock_generate_bearer_token = mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "generate_bearer_token",
+        return_value={"access_token": "new_token", "expiration": timestamp_now + 3600},
+    )
+    token = connection_config.bearer_token
+    assert token == "new_token"
+    mock_generate_bearer_token.assert_called_once()
+
+
+def test_ibm_watsonx_connection_config_bearer_token_soon_to_expire_token(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
+):
+    connection_config._bearer_token = {
+        "access_token": "soon_to_expire_token",
+        "expiration": timestamp_now + 60,
+    }
+    mock_generate_bearer_token = mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "generate_bearer_token",
+        return_value={"access_token": "new_token", "expiration": timestamp_now + 3600},
+    )
+    token = connection_config.bearer_token
+    assert token == "new_token"
+    mock_generate_bearer_token.assert_called_once()
+
+
+def test_ibm_watsonx_connection_config_get_catalog_success(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig
+):
+    mocker.patch(
+        "unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3.DEFAULT_ICEBERG_URI_PATH",  # noqa: E501
+        new="/mds/iceberg",
+    )
+    mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "bearer_token",
+        new="test_bearer_token",
+    )
+    mock_load_catalog = mocker.patch("pyiceberg.catalog.load_catalog")
+
+    with connection_config.get_catalog() as catalog:
+        assert catalog is not None
+        mock_load_catalog.assert_called_once_with(
+            **{
+                "name": "test_catalog",
+                "type": "rest",
+                "uri": "https://test_iceberg_endpoint/mds/iceberg",
+                "token": "test_bearer_token",
+                "warehouse": "test_catalog",
+                "s3.endpoint": "https://test_object_storage_endpoint",
+                "s3.access-key-id": "test_access_key_id",
+                "s3.secret-access-key": "test_secret_access_key",
+                "s3.region": "test_region",
+            }
+        )
+
+
+def test_ibm_watsonx_connection_config_get_catalog_failure(
+    mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig
+):
+    mocker.patch(
+        "pyiceberg.catalog.load_catalog",
+        side_effect=Exception("Connection error"),
+    )
+    mocker.patch.object(
+        IbmWatsonxConnectionConfig,
+        "bearer_token",
+        new="test_bearer_token",
+    )
+    with pytest.raises(ProviderError):
+        with connection_config.get_catalog():
+            pass
+
+
+def test_ibm_watsonx_uploader_precheck_namespace_exists_table_exists(
+    mock_get_catalog: MagicMock,
+    mock_catalog: MagicMock,
+    uploader: IbmWatsonxUploader,
+):
+    uploader.precheck()
+
+    mock_catalog.namespace_exists.assert_called_once_with("test_namespace")
+    mock_catalog.table_exists.assert_called_once_with(("test_namespace", "test_table"))
+
+
+def test_ibm_watsonx_uploader_precheck_namespace_does_not_exist(
+    mock_get_catalog: MagicMock,
+    mock_catalog: MagicMock,
+    uploader: IbmWatsonxUploader,
+):
+    mock_catalog.namespace_exists.return_value = False
+
+    with pytest.raises(UserError, match="Namespace 'test_namespace' does not exist"):
+        uploader.precheck()
+
+    mock_catalog.namespace_exists.assert_called_once_with("test_namespace")
+    mock_catalog.table_exists.assert_not_called()
+
+
+def test_ibm_watsonx_uploader_precheck_table_does_not_exist(
+    mock_get_catalog: MagicMock,
+    mock_catalog: MagicMock,
+    uploader: IbmWatsonxUploader,
+):
+    mock_catalog.table_exists.return_value = False
+
+    with pytest.raises(
+        UserError,
+        match="Table 'test_table' does not exist in namespace 'test_namespace'",
+    ):
+        uploader.precheck()
+
+    mock_catalog.namespace_exists.assert_called_once_with("test_namespace")
+    mock_catalog.table_exists.assert_called_once_with(("test_namespace", "test_table"))
+
+
+def test_ibm_watsonx_uploader_upload_data_table_success(
+    uploader: IbmWatsonxUploader,
+    mock_table: MagicMock,
+    mock_transaction: MagicMock,
+    mock_data_table: MagicMock,
+    mock_delete: MagicMock,
+    file_data: FileData,
+):
+    uploader.upload_data_table(mock_table, mock_data_table, file_data)
+
+    mock_delete.assert_called_once_with(mock_transaction, "test_identifier")
+    mock_transaction.append.assert_called_once_with(mock_data_table)
+
+
+def test_ibm_watsonx_uploader_upload_data_table_commit_exception(
+    uploader: IbmWatsonxUploader,
+    mock_table: MagicMock,
+    mock_transaction: MagicMock,
+    mock_data_table: MagicMock,
+    mock_delete: MagicMock,
+    file_data: FileData,
+):
+    mock_transaction.append.side_effect = CommitFailedException()
+
+    with pytest.raises(ProviderError):
+        uploader.upload_data_table(mock_table, mock_data_table, file_data)
+    assert mock_table.refresh.call_count == 5
+
+
+def test_ibm_watsonx_uploader_upload_data_table_exception(
+    uploader: IbmWatsonxUploader,
+    mock_table: MagicMock,
+    mock_transaction: MagicMock,
+    mock_data_table: MagicMock,
+    mock_delete: MagicMock,
+    file_data: FileData,
+):
+    mock_transaction.append.side_effect = Exception()
+
+    with pytest.raises(ProviderError):
+        uploader.upload_data_table(mock_table, mock_data_table, file_data)
+    assert mock_table.refresh.call_count == 0
+
+
+def test_ibm_watsonx_uploader_df_to_arrow_table(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+    test_df: pd.DataFrame,
+):
+    mock_fit_to_schema = mocker.patch.object(
+        IbmWatsonxUploader, "_fit_to_schema", return_value=test_df
+    )
+
+    result = uploader._df_to_arrow_table(test_df)
+
+    mock_fit_to_schema.assert_called_once_with(test_df, add_missing_columns=False)
+    assert len(result.column_names) == 3
+    assert "test_column_0" in result.column_names
+    assert "test_column_1" in result.column_names
+    assert "test_column_2" in result.column_names
+
+
+def test_ibm_watsonx_uploader_can_delete_column_exists(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+):
+    mocker.patch.object(
+        IbmWatsonxUploader, "get_table_columns", return_value=["test_record_id_key"]
+    )
+
+    assert uploader.can_delete() is True
+
+
+def test_ibm_watsonx_uploader_can_delete_column_does_not_exist(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+):
+    mocker.patch.object(IbmWatsonxUploader, "get_table_columns", return_value=["other_column"])
+
+    assert uploader.can_delete() is False
+
+
+def test_ibm_watsonx_uploader_get_table_columns_cache(
+    uploader: IbmWatsonxUploader,
+):
+    uploader._columns = ["cached_column"]
+
+    result = uploader.get_table_columns()
+
+    assert result == ["cached_column"]
+
+
+def test_ibm_watsonx_uploader_get_table_columns_no_cache(
+    uploader: IbmWatsonxUploader,
+    mock_get_table: MagicMock,
+    mock_table: MagicMock,
+):
+    uploader._columns = None
+    mock_table.schema.return_value.column_names = ["column_1", "column_2"]
+
+    result = uploader.get_table_columns()
+
+    mock_get_table.assert_called_once()
+    assert result == ["column_1", "column_2"]
+    assert uploader._columns == ["column_1", "column_2"]
+
+
+def test_ibm_watsonx_uploader_upload_dataframe_success(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+    test_df: pd.DataFrame,
+    mock_get_table: MagicMock,
+    mock_table: MagicMock,
+    mock_data_table: MagicMock,
+    file_data: FileData,
+):
+    mocker.patch.object(IbmWatsonxUploader, "_df_to_arrow_table", return_value=mock_data_table)
+    mock_upload_data_table = mocker.patch.object(IbmWatsonxUploader, "upload_data_table")
+
+    uploader.upload_dataframe(test_df, file_data)
+
+    mock_get_table.assert_called_once()
+    mock_upload_data_table.assert_called_once_with(mock_table, mock_data_table, file_data)
+
+
+def test_ibm_watsonx_uploader_delete_can_delete(
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+    mock_transaction: MagicMock,
+):
+    mocker.patch.object(IbmWatsonxUploader, "can_delete", return_value=True)
+    mock_equal_to = mocker.patch("pyiceberg.expressions.EqualTo")
+
+    uploader._delete(mock_transaction, "test_identifier")
+
+    mock_equal_to.assert_called_once_with("test_record_id_key", "test_identifier")
+    mock_transaction.delete.assert_called_once_with(delete_filter=mock_equal_to.return_value)
+
+
+def test_ibm_watsonx_uploader_delete_cannot_delete(
+    caplog: pytest.LogCaptureFixture,
+    mocker: MockerFixture,
+    uploader: IbmWatsonxUploader,
+    mock_transaction: MagicMock,
+):
+    mocker.patch.object(IbmWatsonxUploader, "can_delete", return_value=False)
+
+    uploader._delete(mock_transaction, "test_identifier")
+    mock_transaction.delete.assert_not_called()
+    assert (
+        "Table doesn't contain expected record id column test_record_id_key, skipping delete"
+        in caplog.text
+    )
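The bearer-token tests above pin down a caching contract for IbmWatsonxConnectionConfig.bearer_token: reuse the cached IAM token while it is comfortably valid (the now + 3600 s case), but regenerate when it has expired or is about to (the now + 60 s case also regenerates). A minimal sketch of logic consistent with those tests; the class name and the 5-minute refresh margin are assumptions, not the connector's actual implementation:

import time
from typing import Callable, Optional

TOKEN_REFRESH_MARGIN_SECONDS = 300  # assumed margin; must exceed 60 to match the tests

class BearerTokenCache:
    def __init__(self, generate_bearer_token: Callable[[], dict]):
        # generate_bearer_token returns {"access_token": str, "expiration": int}
        self._generate = generate_bearer_token
        self._bearer_token: Optional[dict] = None

    @property
    def bearer_token(self) -> str:
        expiring = (
            self._bearer_token is None
            or self._bearer_token["expiration"] <= time.time() + TOKEN_REFRESH_MARGIN_SECONDS
        )
        if expiring:
            # Token missing, expired, or about to expire: fetch a fresh one.
            self._bearer_token = self._generate()
        return self._bearer_token["access_token"]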
test/unit/v2/connectors/sql/test_sql.py (+79 -1)

@@ -1,10 +1,16 @@
 from pathlib import Path
 
+import pandas as pd
 import pytest
 from pytest_mock import MockerFixture
 
 from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
-from unstructured_ingest.v2.processes.connectors.sql.sql import SQLUploadStager
+from unstructured_ingest.v2.processes.connectors.sql.sql import (
+    SQLConnectionConfig,
+    SQLUploader,
+    SQLUploaderConfig,
+    SQLUploadStager,
+)
 
 
 @pytest.fixture
@@ -12,6 +18,17 @@ def mock_instance() -> SQLUploadStager:
     return SQLUploadStager()
 
 
+@pytest.fixture
+def mock_uploader(mocker: MockerFixture) -> SQLUploader:
+    mock_connection_config = mocker.Mock(spec=SQLConnectionConfig)
+    mock_upload_config = mocker.Mock(spec=SQLUploaderConfig)
+    return SQLUploader(
+        upload_config=mock_upload_config,
+        connection_config=mock_connection_config,
+        connector_type="sql_test",
+    )
+
+
 @pytest.mark.parametrize(
     ("input_filepath", "output_filename", "expected"),
     [
@@ -72,3 +89,64 @@ def test_run_output_filename_suffix(
         path=output_dir / expected, data=[{"key": "value"}, {"key": "value2"}]
     )
     assert result.name == expected
+
+
+def test_fit_to_schema_drop_columns(mocker: MockerFixture, mock_uploader: SQLUploader):
+    df = pd.DataFrame(
+        {
+            "col1": [1, 2],
+            "col2": [3, 4],
+            "col3": [5, 6],
+        }
+    )
+    mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
+
+    result = mock_uploader._fit_to_schema(df)
+
+    assert "col3" not in result.columns
+    assert "col1" in result.columns
+    assert "col2" in result.columns
+
+
+def test_fit_to_schema_add_missing_columns(mocker: MockerFixture, mock_uploader: SQLUploader):
+    df = pd.DataFrame(
+        {
+            "col1": [1, 2],
+        }
+    )
+    mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
+
+    result = mock_uploader._fit_to_schema(df)
+
+    assert "col2" in result.columns
+    assert result["col2"].isnull().all()
+
+
+def test_fit_to_schema_no_changes(mocker: MockerFixture, mock_uploader: SQLUploader):
+    df = pd.DataFrame(
+        {
+            "col1": [1, 2],
+            "col2": [3, 4],
+        }
+    )
+    mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
+
+    result = mock_uploader._fit_to_schema(df)
+
+    assert "col1" in result.columns
+    assert "col2" in result.columns
+    assert result.equals(df)
+
+
+def test_fit_to_schema_no_add_missing_columns(mocker: MockerFixture, mock_uploader: SQLUploader):
+    df = pd.DataFrame(
+        {
+            "col1": [1, 2],
+        }
+    )
+    mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
+
+    result = mock_uploader._fit_to_schema(df, add_missing_columns=False)
+
+    assert "col2" not in result.columns
+    assert "col1" in result.columns
unstructured_ingest/__version__.py (+1 -1)

@@ -1 +1 @@
-__version__ = "0.5.19"  # pragma: no cover
+__version__ = "0.5.21"  # pragma: no cover
unstructured_ingest/embed/interfaces.py (+7 -3)

@@ -2,10 +2,10 @@ from abc import ABC
 from dataclasses import dataclass
 from typing import Any, Optional
 
-import numpy as np
 from pydantic import BaseModel, Field
 
 from unstructured_ingest.utils.data_prep import batch_generator
+from unstructured_ingest.utils.dep_check import requires_dependencies
 
 EMBEDDINGS_KEY = "embeddings"
 
@@ -32,7 +32,6 @@ class BaseEncoder(ABC):
 
 @dataclass
 class BaseEmbeddingEncoder(BaseEncoder, ABC):
-
     def initialize(self):
         """Initializes the embedding encoder class. Should also validate the instance
         is properly configured: e.g., embed a single a element"""
@@ -46,8 +45,11 @@ class BaseEmbeddingEncoder(BaseEncoder, ABC):
         return self.embed_query(query="Q")
 
     @property
+    @requires_dependencies(["numpy"])
     def is_unit_vector(self) -> bool:
         """Denotes if the embedding vector is a unit vector."""
+        import numpy as np
+
         exemplary_embedding = self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0, rtol=1e-03)
 
@@ -86,7 +88,6 @@ class BaseEmbeddingEncoder(BaseEncoder, ABC):
 
 @dataclass
 class AsyncBaseEmbeddingEncoder(BaseEncoder, ABC):
-
     async def initialize(self):
         """Initializes the embedding encoder class. Should also validate the instance
         is properly configured: e.g., embed a single a element"""
@@ -100,8 +101,11 @@ class AsyncBaseEmbeddingEncoder(BaseEncoder, ABC):
         return await self.embed_query(query="Q")
 
     @property
+    @requires_dependencies(["numpy"])
     async def is_unit_vector(self) -> bool:
         """Denotes if the embedding vector is a unit vector."""
+        import numpy as np
+
         exemplary_embedding = await self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0, rtol=1e-03)
 
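Both encoder classes now defer importing numpy to the property body and guard it with requires_dependencies, so merely importing the embed interfaces no longer pulls in numpy. The general shape of the pattern, with a simplified stand-in for the decorator (the real one lives in unstructured_ingest.utils.dep_check and its behavior may differ):

import functools
import importlib.util

def requires_deps(deps: list[str]):
    # Simplified stand-in: fail early with an actionable message instead
    # of letting the function body raise a bare ImportError.
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            missing = [d for d in deps if importlib.util.find_spec(d) is None]
            if missing:
                raise ImportError(f"{fn.__name__} requires: {', '.join(missing)}")
            return fn(*args, **kwargs)
        return wrapper
    return decorator

@requires_deps(["numpy"])
def is_unit_vector(embedding: list[float]) -> bool:
    import numpy as np  # lazy: module import stays numpy-free

    return bool(np.isclose(np.linalg.norm(embedding), 1.0, rtol=1e-03))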
unstructured_ingest/utils/data_prep.py (+17 -5)

@@ -2,20 +2,22 @@ import itertools
 import json
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, Union, cast
-
-import pandas as pd
+from typing import TYPE_CHECKING, Any, Generator, Iterable, Optional, Sequence, TypeVar, Union, cast
 
 from unstructured_ingest.utils import ndjson
+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.logger import logger
 
+if TYPE_CHECKING:
+    from pandas import DataFrame
+
 DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
 
 T = TypeVar("T")
 IterableT = Iterable[T]
 
 
-def split_dataframe(df: pd.DataFrame, chunk_size: int = 100) -> Generator[pd.DataFrame, None, None]:
+def split_dataframe(df: "DataFrame", chunk_size: int = 100) -> Generator["DataFrame", None, None]:
     num_chunks = len(df) // chunk_size + 1
     for i in range(num_chunks):
         yield df[i * chunk_size : (i + 1) * chunk_size]
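data_prep.py gets the same dependency-lightening treatment: pandas moves out of module scope, while annotations keep referring to DataFrame through a TYPE_CHECKING guard, which static type checkers evaluate but the interpreter skips at runtime. The idiom in isolation (first_rows is an illustrative function, not part of the module):

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen by type checkers only; no pandas needed to import this module.
    from pandas import DataFrame

def first_rows(df: "DataFrame", n: int = 5) -> "DataFrame":
    # Quoted annotations are not evaluated at runtime, so this module
    # imports cleanly even when pandas is not installed.
    return df[:n]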
@@ -144,9 +146,13 @@ def get_data_by_suffix(path: Path) -> list[dict]:
         elif path.suffix == ".ndjson":
             return ndjson.load(f)
         elif path.suffix == ".csv":
+            import pandas as pd
+
             df = pd.read_csv(path)
             return df.to_dict(orient="records")
         elif path.suffix == ".parquet":
+            import pandas as pd
+
             df = pd.read_parquet(path)
             return df.to_dict(orient="records")
         else:
@@ -180,6 +186,9 @@ def get_data(path: Union[Path, str]) -> list[dict]:
             return ndjson.load(f)
     except Exception as e:
         logger.warning(f"failed to read {path} as ndjson: {e}")
+
+    import pandas as pd
+
     try:
         df = pd.read_csv(path)
         return df.to_dict(orient="records")
@@ -202,7 +211,10 @@ def get_json_data(path: Path) -> list[dict]:
         raise ValueError(f"Unsupported file type: {path}")
 
 
-def get_data_df(path: Path) -> pd.DataFrame:
+@requires_dependencies(["pandas"])
+def get_data_df(path: Path) -> "DataFrame":
+    import pandas as pd
+
     with path.open() as f:
         if path.suffix == ".json":
             data = json.load(f)