unstructured-ingest 0.7.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (192) hide show
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/logger.py +2 -93
  7. unstructured_ingest/main.py +0 -0
  8. unstructured_ingest/pipeline/interfaces.py +1 -1
  9. unstructured_ingest/pipeline/pipeline.py +1 -1
  10. unstructured_ingest/processes/chunker.py +4 -0
  11. unstructured_ingest/processes/connectors/airtable.py +4 -2
  12. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +10 -0
  13. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  14. unstructured_ingest/processes/connectors/astradb.py +2 -2
  15. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  16. unstructured_ingest/processes/connectors/confluence.py +0 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  18. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  19. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  20. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  21. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  22. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  23. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  24. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  25. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  26. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  27. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  28. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  29. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  30. unstructured_ingest/processes/connectors/outlook.py +1 -2
  31. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  32. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  33. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  34. unstructured_ingest/processes/connectors/slack.py +1 -2
  35. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  37. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  38. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  39. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  40. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  41. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  42. unstructured_ingest/processes/connectors/vectara.py +0 -2
  43. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  44. unstructured_ingest/processes/embedder.py +2 -2
  45. unstructured_ingest/processes/filter.py +1 -1
  46. unstructured_ingest/processes/partitioner.py +4 -0
  47. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  48. unstructured_ingest/unstructured_api.py +13 -8
  49. unstructured_ingest/utils/data_prep.py +8 -32
  50. unstructured_ingest/utils/string_and_date_utils.py +3 -3
  51. unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
  52. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +54 -187
  53. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
  54. examples/__init__.py +0 -0
  55. examples/airtable.py +0 -44
  56. examples/azure_cognitive_search.py +0 -55
  57. examples/chroma.py +0 -54
  58. examples/couchbase.py +0 -55
  59. examples/databricks_volumes_dest.py +0 -55
  60. examples/databricks_volumes_source.py +0 -53
  61. examples/delta_table.py +0 -45
  62. examples/discord_example.py +0 -36
  63. examples/elasticsearch.py +0 -49
  64. examples/google_drive.py +0 -45
  65. examples/kdbai.py +0 -54
  66. examples/local.py +0 -36
  67. examples/milvus.py +0 -44
  68. examples/mongodb.py +0 -53
  69. examples/opensearch.py +0 -50
  70. examples/pinecone.py +0 -57
  71. examples/s3.py +0 -38
  72. examples/salesforce.py +0 -44
  73. examples/sharepoint.py +0 -47
  74. examples/singlestore.py +0 -49
  75. examples/sql.py +0 -90
  76. examples/vectara.py +0 -54
  77. examples/weaviate.py +0 -44
  78. test/__init__.py +0 -0
  79. test/integration/__init__.py +0 -0
  80. test/integration/chunkers/__init__.py +0 -0
  81. test/integration/chunkers/test_chunkers.py +0 -31
  82. test/integration/connectors/__init__.py +0 -0
  83. test/integration/connectors/conftest.py +0 -38
  84. test/integration/connectors/databricks/__init__.py +0 -0
  85. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  86. test/integration/connectors/discord/__init__.py +0 -0
  87. test/integration/connectors/discord/test_discord.py +0 -90
  88. test/integration/connectors/duckdb/__init__.py +0 -0
  89. test/integration/connectors/duckdb/conftest.py +0 -14
  90. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  91. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  92. test/integration/connectors/elasticsearch/__init__.py +0 -0
  93. test/integration/connectors/elasticsearch/conftest.py +0 -34
  94. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  95. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  96. test/integration/connectors/sql/__init__.py +0 -0
  97. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  98. test/integration/connectors/sql/test_postgres.py +0 -201
  99. test/integration/connectors/sql/test_singlestore.py +0 -182
  100. test/integration/connectors/sql/test_snowflake.py +0 -244
  101. test/integration/connectors/sql/test_sqlite.py +0 -168
  102. test/integration/connectors/sql/test_vastdb.py +0 -34
  103. test/integration/connectors/test_astradb.py +0 -287
  104. test/integration/connectors/test_azure_ai_search.py +0 -254
  105. test/integration/connectors/test_chroma.py +0 -136
  106. test/integration/connectors/test_confluence.py +0 -111
  107. test/integration/connectors/test_delta_table.py +0 -183
  108. test/integration/connectors/test_dropbox.py +0 -151
  109. test/integration/connectors/test_github.py +0 -49
  110. test/integration/connectors/test_google_drive.py +0 -257
  111. test/integration/connectors/test_jira.py +0 -67
  112. test/integration/connectors/test_lancedb.py +0 -247
  113. test/integration/connectors/test_milvus.py +0 -208
  114. test/integration/connectors/test_mongodb.py +0 -335
  115. test/integration/connectors/test_neo4j.py +0 -244
  116. test/integration/connectors/test_notion.py +0 -152
  117. test/integration/connectors/test_onedrive.py +0 -163
  118. test/integration/connectors/test_pinecone.py +0 -387
  119. test/integration/connectors/test_qdrant.py +0 -216
  120. test/integration/connectors/test_redis.py +0 -143
  121. test/integration/connectors/test_s3.py +0 -184
  122. test/integration/connectors/test_sharepoint.py +0 -222
  123. test/integration/connectors/test_vectara.py +0 -282
  124. test/integration/connectors/test_zendesk.py +0 -120
  125. test/integration/connectors/utils/__init__.py +0 -0
  126. test/integration/connectors/utils/constants.py +0 -13
  127. test/integration/connectors/utils/docker.py +0 -151
  128. test/integration/connectors/utils/docker_compose.py +0 -59
  129. test/integration/connectors/utils/validation/__init__.py +0 -0
  130. test/integration/connectors/utils/validation/destination.py +0 -77
  131. test/integration/connectors/utils/validation/equality.py +0 -76
  132. test/integration/connectors/utils/validation/source.py +0 -331
  133. test/integration/connectors/utils/validation/utils.py +0 -36
  134. test/integration/connectors/weaviate/__init__.py +0 -0
  135. test/integration/connectors/weaviate/conftest.py +0 -15
  136. test/integration/connectors/weaviate/test_cloud.py +0 -39
  137. test/integration/connectors/weaviate/test_local.py +0 -152
  138. test/integration/embedders/__init__.py +0 -0
  139. test/integration/embedders/conftest.py +0 -13
  140. test/integration/embedders/test_azure_openai.py +0 -57
  141. test/integration/embedders/test_bedrock.py +0 -103
  142. test/integration/embedders/test_huggingface.py +0 -24
  143. test/integration/embedders/test_mixedbread.py +0 -71
  144. test/integration/embedders/test_octoai.py +0 -75
  145. test/integration/embedders/test_openai.py +0 -74
  146. test/integration/embedders/test_togetherai.py +0 -71
  147. test/integration/embedders/test_vertexai.py +0 -63
  148. test/integration/embedders/test_voyageai.py +0 -79
  149. test/integration/embedders/utils.py +0 -66
  150. test/integration/partitioners/__init__.py +0 -0
  151. test/integration/partitioners/test_partitioner.py +0 -76
  152. test/integration/utils.py +0 -15
  153. test/unit/__init__.py +0 -0
  154. test/unit/chunkers/__init__.py +0 -0
  155. test/unit/chunkers/test_chunkers.py +0 -49
  156. test/unit/connectors/__init__.py +0 -0
  157. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  158. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  159. test/unit/connectors/motherduck/__init__.py +0 -0
  160. test/unit/connectors/motherduck/test_base.py +0 -73
  161. test/unit/connectors/sql/__init__.py +0 -0
  162. test/unit/connectors/sql/test_sql.py +0 -152
  163. test/unit/connectors/test_confluence.py +0 -71
  164. test/unit/connectors/test_jira.py +0 -401
  165. test/unit/embed/__init__.py +0 -0
  166. test/unit/embed/test_mixedbreadai.py +0 -42
  167. test/unit/embed/test_octoai.py +0 -27
  168. test/unit/embed/test_openai.py +0 -28
  169. test/unit/embed/test_vertexai.py +0 -25
  170. test/unit/embed/test_voyageai.py +0 -24
  171. test/unit/embedders/__init__.py +0 -0
  172. test/unit/embedders/test_bedrock.py +0 -36
  173. test/unit/embedders/test_huggingface.py +0 -48
  174. test/unit/embedders/test_mixedbread.py +0 -37
  175. test/unit/embedders/test_octoai.py +0 -35
  176. test/unit/embedders/test_openai.py +0 -35
  177. test/unit/embedders/test_togetherai.py +0 -37
  178. test/unit/embedders/test_vertexai.py +0 -37
  179. test/unit/embedders/test_voyageai.py +0 -38
  180. test/unit/partitioners/__init__.py +0 -0
  181. test/unit/partitioners/test_partitioner.py +0 -63
  182. test/unit/test_error.py +0 -27
  183. test/unit/test_html.py +0 -112
  184. test/unit/test_interfaces.py +0 -26
  185. test/unit/test_logger.py +0 -78
  186. test/unit/test_utils.py +0 -220
  187. test/unit/utils/__init__.py +0 -0
  188. test/unit/utils/data_generator.py +0 -32
  189. unstructured_ingest-0.7.1.dist-info/METADATA +0 -383
  190. unstructured_ingest-0.7.1.dist-info/top_level.txt +0 -3
  191. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
  192. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
@@ -1,459 +0,0 @@
1
- import time
2
- from unittest.mock import MagicMock
3
-
4
- import pandas as pd
5
- import pytest
6
- from pydantic import Secret
7
- from pyiceberg.exceptions import CommitFailedException
8
- from pytest_mock import MockerFixture
9
-
10
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
11
- from unstructured_ingest.errors_v2 import ProviderError, UserError
12
- from unstructured_ingest.processes.connectors.ibm_watsonx import IBM_WATSONX_S3_CONNECTOR_TYPE
13
- from unstructured_ingest.processes.connectors.ibm_watsonx.ibm_watsonx_s3 import (
14
- IbmWatsonxAccessConfig,
15
- IbmWatsonxConnectionConfig,
16
- IbmWatsonxUploader,
17
- IbmWatsonxUploaderConfig,
18
- )
19
-
20
-
21
- @pytest.fixture
22
- def file_data():
23
- return FileData(
24
- identifier="test_identifier",
25
- connector_type=IBM_WATSONX_S3_CONNECTOR_TYPE,
26
- source_identifiers=SourceIdentifiers(
27
- filename="test_file.pdf", fullpath="/tmp/test_file.pdf"
28
- ),
29
- )
30
-
31
-
32
- @pytest.fixture
33
- def access_config():
34
- return IbmWatsonxAccessConfig(
35
- iam_api_key="test_iam_api_key",
36
- access_key_id="test_access_key_id",
37
- secret_access_key="test_secret_access_key",
38
- )
39
-
40
-
41
- @pytest.fixture
42
- def connection_config(access_config: IbmWatsonxAccessConfig):
43
- return IbmWatsonxConnectionConfig(
44
- access_config=Secret(access_config),
45
- iceberg_endpoint="test_iceberg_endpoint/",
46
- object_storage_endpoint="test_object_storage_endpoint/",
47
- object_storage_region="test_region",
48
- catalog="test_catalog",
49
- )
50
-
51
-
52
- @pytest.fixture
53
- def uploader_config():
54
- return IbmWatsonxUploaderConfig(
55
- namespace="test_namespace",
56
- table="test_table",
57
- record_id_key="test_record_id_key",
58
- )
59
-
60
-
61
- @pytest.fixture
62
- def uploader(
63
- connection_config: IbmWatsonxConnectionConfig, uploader_config: IbmWatsonxUploaderConfig
64
- ):
65
- return IbmWatsonxUploader(
66
- connection_config=connection_config,
67
- upload_config=uploader_config,
68
- )
69
-
70
-
71
- @pytest.fixture
72
- def mock_catalog(mocker: MockerFixture):
73
- mock_catalog = mocker.MagicMock()
74
- mock_catalog.namespace_exists.return_value = True
75
- mock_catalog.table_exists.return_value = True
76
- return mock_catalog
77
-
78
-
79
- @pytest.fixture
80
- def mock_get_catalog(mocker: MockerFixture, mock_catalog: MagicMock):
81
- mock_get_catalog = mocker.patch.context_manager(
82
- IbmWatsonxConnectionConfig, "get_catalog", autospec=True
83
- )
84
- mock_get_catalog.return_value.__enter__.return_value = mock_catalog
85
- return mock_get_catalog
86
-
87
-
88
- @pytest.fixture
89
- def mock_table(mocker: MockerFixture):
90
- mock_table = mocker.MagicMock()
91
- return mock_table
92
-
93
-
94
- @pytest.fixture
95
- def mock_get_table(mocker: MockerFixture, mock_table: MagicMock):
96
- mock_get_table = mocker.patch.context_manager(IbmWatsonxUploader, "get_table", autospec=True)
97
- mock_get_table.return_value.__enter__.return_value = mock_table
98
- return mock_get_table
99
-
100
-
101
- @pytest.fixture
102
- def mock_transaction(mocker: MockerFixture, mock_table: MagicMock):
103
- mock_transaction = mocker.MagicMock()
104
- mock_table.transaction.return_value.__enter__.return_value = mock_transaction
105
- return mock_transaction
106
-
107
-
108
- @pytest.fixture
109
- def mock_data_table(mocker: MockerFixture):
110
- mock_data_table = mocker.MagicMock()
111
- mock_data_table.schema = "schema"
112
- return mock_data_table
113
-
114
-
115
- @pytest.fixture
116
- def mock_delete(mocker: MockerFixture):
117
- return mocker.patch.object(IbmWatsonxUploader, "_delete")
118
-
119
-
120
- @pytest.fixture
121
- def test_df():
122
- return pd.DataFrame(
123
- {
124
- "test_column_0": [True, False, True],
125
- "test_column_1": [1, 2, 3],
126
- "test_column_2": ["a", "b", "c"],
127
- }
128
- )
129
-
130
-
131
- @pytest.fixture
132
- def timestamp_now():
133
- return int(time.time())
134
-
135
-
136
- def test_ibm_watsonx_connection_config_iceberg_url(
137
- mocker: MockerFixture,
138
- connection_config: IbmWatsonxConnectionConfig,
139
- ):
140
- mocker.patch(
141
- "unstructured_ingest.processes.connectors.ibm_watsonx.ibm_watsonx_s3.DEFAULT_ICEBERG_URI_PATH", # noqa: E501
142
- new="/mds/iceberg",
143
- )
144
- expected_url = "https://test_iceberg_endpoint/mds/iceberg"
145
- assert connection_config.iceberg_url == expected_url
146
-
147
-
148
- def test_ibm_watsonx_connection_config_object_storage_url(
149
- connection_config: IbmWatsonxConnectionConfig,
150
- ):
151
- expected_url = "https://test_object_storage_endpoint"
152
- assert connection_config.object_storage_url == expected_url
153
-
154
-
155
- def test_ibm_watsonx_connection_config_bearer_token_new_token(
156
- mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
157
- ):
158
- mock_generate_bearer_token = mocker.patch.object(
159
- IbmWatsonxConnectionConfig,
160
- "generate_bearer_token",
161
- return_value={"access_token": "new_token", "expiration": timestamp_now + 3600},
162
- )
163
- token = connection_config.bearer_token
164
- assert token == "new_token"
165
- mock_generate_bearer_token.assert_called_once()
166
-
167
-
168
- def test_ibm_watsonx_connection_config_bearer_token_existing_token(
169
- mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
170
- ):
171
- connection_config._bearer_token = {
172
- "access_token": "existing_token",
173
- "expiration": timestamp_now + 3600,
174
- }
175
- mock_generate_bearer_token = mocker.patch.object(
176
- IbmWatsonxConnectionConfig, "generate_bearer_token"
177
- )
178
- token = connection_config.bearer_token
179
- assert token == "existing_token"
180
- mock_generate_bearer_token.assert_not_called()
181
-
182
-
183
- def test_ibm_watsonx_connection_config_bearer_token_expired_token(
184
- mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
185
- ):
186
- connection_config._bearer_token = {
187
- "access_token": "expired_token",
188
- "expiration": timestamp_now - 3600,
189
- }
190
- mock_generate_bearer_token = mocker.patch.object(
191
- IbmWatsonxConnectionConfig,
192
- "generate_bearer_token",
193
- return_value={"access_token": "new_token", "expiration": timestamp_now + 3600},
194
- )
195
- token = connection_config.bearer_token
196
- assert token == "new_token"
197
- mock_generate_bearer_token.assert_called_once()
198
-
199
-
200
- def test_ibm_watsonx_connection_config_bearer_token_soon_to_expire_token(
201
- mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig, timestamp_now: int
202
- ):
203
- connection_config._bearer_token = {
204
- "access_token": "soon_to_expire_token",
205
- "expiration": timestamp_now + 60,
206
- }
207
- mock_generate_bearer_token = mocker.patch.object(
208
- IbmWatsonxConnectionConfig,
209
- "generate_bearer_token",
210
- return_value={"access_token": "new_token", "expiration": timestamp_now + 3600},
211
- )
212
- token = connection_config.bearer_token
213
- assert token == "new_token"
214
- mock_generate_bearer_token.assert_called_once()
215
-
216
-
217
- def test_ibm_watsonx_connection_config_get_catalog_success(
218
- mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig
219
- ):
220
- mocker.patch(
221
- "unstructured_ingest.processes.connectors.ibm_watsonx.ibm_watsonx_s3.DEFAULT_ICEBERG_URI_PATH", # noqa: E501
222
- new="/mds/iceberg",
223
- )
224
- mocker.patch.object(
225
- IbmWatsonxConnectionConfig,
226
- "bearer_token",
227
- new="test_bearer_token",
228
- )
229
- mock_load_catalog = mocker.patch("pyiceberg.catalog.load_catalog")
230
-
231
- with connection_config.get_catalog() as catalog:
232
- assert catalog is not None
233
- mock_load_catalog.assert_called_once_with(
234
- **{
235
- "name": "test_catalog",
236
- "type": "rest",
237
- "uri": "https://test_iceberg_endpoint/mds/iceberg",
238
- "token": "test_bearer_token",
239
- "warehouse": "test_catalog",
240
- "s3.endpoint": "https://test_object_storage_endpoint",
241
- "s3.access-key-id": "test_access_key_id",
242
- "s3.secret-access-key": "test_secret_access_key",
243
- "s3.region": "test_region",
244
- }
245
- )
246
-
247
-
248
- def test_ibm_watsonx_connection_config_get_catalog_failure(
249
- mocker: MockerFixture, connection_config: IbmWatsonxConnectionConfig
250
- ):
251
- mocker.patch(
252
- "pyiceberg.catalog.load_catalog",
253
- side_effect=Exception("Connection error"),
254
- )
255
- mocker.patch.object(
256
- IbmWatsonxConnectionConfig,
257
- "bearer_token",
258
- new="test_bearer_token",
259
- )
260
- with pytest.raises(ProviderError):
261
- with connection_config.get_catalog():
262
- pass
263
-
264
-
265
- def test_ibm_watsonx_uploader_precheck_namespace_exists_table_exists(
266
- mock_get_catalog: MagicMock,
267
- mock_catalog: MagicMock,
268
- uploader: IbmWatsonxUploader,
269
- ):
270
- uploader.precheck()
271
-
272
- mock_catalog.namespace_exists.assert_called_once_with("test_namespace")
273
- mock_catalog.table_exists.assert_called_once_with(("test_namespace", "test_table"))
274
-
275
-
276
- def test_ibm_watsonx_uploader_precheck_namespace_does_not_exist(
277
- mock_get_catalog: MagicMock,
278
- mock_catalog: MagicMock,
279
- uploader: IbmWatsonxUploader,
280
- ):
281
- mock_catalog.namespace_exists.return_value = False
282
-
283
- with pytest.raises(UserError, match="Namespace 'test_namespace' does not exist"):
284
- uploader.precheck()
285
-
286
- mock_catalog.namespace_exists.assert_called_once_with("test_namespace")
287
- mock_catalog.table_exists.assert_not_called()
288
-
289
-
290
- def test_ibm_watsonx_uploader_precheck_table_does_not_exist(
291
- mock_get_catalog: MagicMock,
292
- mock_catalog: MagicMock,
293
- uploader: IbmWatsonxUploader,
294
- ):
295
- mock_catalog.table_exists.return_value = False
296
-
297
- with pytest.raises(
298
- UserError,
299
- match="Table 'test_table' does not exist in namespace 'test_namespace'",
300
- ):
301
- uploader.precheck()
302
-
303
- mock_catalog.namespace_exists.assert_called_once_with("test_namespace")
304
- mock_catalog.table_exists.assert_called_once_with(("test_namespace", "test_table"))
305
-
306
-
307
- def test_ibm_watsonx_uploader_upload_data_table_success(
308
- uploader: IbmWatsonxUploader,
309
- mock_table: MagicMock,
310
- mock_transaction: MagicMock,
311
- mock_data_table: MagicMock,
312
- mock_delete: MagicMock,
313
- file_data: FileData,
314
- ):
315
- uploader.upload_data_table(mock_table, mock_data_table, file_data)
316
-
317
- mock_delete.assert_called_once_with(mock_transaction, "test_identifier")
318
- mock_transaction.append.assert_called_once_with(mock_data_table)
319
-
320
-
321
- def test_ibm_watsonx_uploader_upload_data_table_commit_exception(
322
- uploader: IbmWatsonxUploader,
323
- mock_table: MagicMock,
324
- mock_transaction: MagicMock,
325
- mock_data_table: MagicMock,
326
- mock_delete: MagicMock,
327
- file_data: FileData,
328
- ):
329
- mock_transaction.append.side_effect = CommitFailedException()
330
-
331
- with pytest.raises(ProviderError):
332
- uploader.upload_data_table(mock_table, mock_data_table, file_data)
333
- assert mock_table.refresh.call_count == 5
334
-
335
-
336
- def test_ibm_watsonx_uploader_upload_data_table_exception(
337
- uploader: IbmWatsonxUploader,
338
- mock_table: MagicMock,
339
- mock_transaction: MagicMock,
340
- mock_data_table: MagicMock,
341
- mock_delete: MagicMock,
342
- file_data: FileData,
343
- ):
344
- mock_transaction.append.side_effect = Exception()
345
-
346
- with pytest.raises(ProviderError):
347
- uploader.upload_data_table(mock_table, mock_data_table, file_data)
348
- assert mock_table.refresh.call_count == 0
349
-
350
-
351
- def test_ibm_watsonx_uploader_df_to_arrow_table(
352
- mocker: MockerFixture,
353
- uploader: IbmWatsonxUploader,
354
- test_df: pd.DataFrame,
355
- ):
356
- mock_fit_to_schema = mocker.patch.object(
357
- IbmWatsonxUploader, "_fit_to_schema", return_value=test_df
358
- )
359
-
360
- result = uploader._df_to_arrow_table(test_df)
361
-
362
- mock_fit_to_schema.assert_called_once_with(test_df, add_missing_columns=False)
363
- assert len(result.column_names) == 3
364
- assert "test_column_0" in result.column_names
365
- assert "test_column_1" in result.column_names
366
- assert "test_column_2" in result.column_names
367
-
368
-
369
- def test_ibm_watsonx_uploader_can_delete_column_exists(
370
- mocker: MockerFixture,
371
- uploader: IbmWatsonxUploader,
372
- ):
373
- mocker.patch.object(
374
- IbmWatsonxUploader, "get_table_columns", return_value=["test_record_id_key"]
375
- )
376
-
377
- assert uploader.can_delete() is True
378
-
379
-
380
- def test_ibm_watsonx_uploader_can_delete_column_does_not_exist(
381
- mocker: MockerFixture,
382
- uploader: IbmWatsonxUploader,
383
- ):
384
- mocker.patch.object(IbmWatsonxUploader, "get_table_columns", return_value=["other_column"])
385
-
386
- assert uploader.can_delete() is False
387
-
388
-
389
- def test_ibm_watsonx_uploader_get_table_columns_cache(
390
- uploader: IbmWatsonxUploader,
391
- ):
392
- uploader._columns = ["cached_column"]
393
-
394
- result = uploader.get_table_columns()
395
-
396
- assert result == ["cached_column"]
397
-
398
-
399
- def test_ibm_watsonx_uploader_get_table_columns_no_cache(
400
- uploader: IbmWatsonxUploader,
401
- mock_get_table: MagicMock,
402
- mock_table: MagicMock,
403
- ):
404
- uploader._columns = None
405
- mock_table.schema.return_value.column_names = ["column_1", "column_2"]
406
-
407
- result = uploader.get_table_columns()
408
-
409
- mock_get_table.assert_called_once()
410
- assert result == ["column_1", "column_2"]
411
- assert uploader._columns == ["column_1", "column_2"]
412
-
413
-
414
- def test_ibm_watsonx_uploader_upload_dataframe_success(
415
- mocker: MockerFixture,
416
- uploader: IbmWatsonxUploader,
417
- test_df: pd.DataFrame,
418
- mock_get_table: MagicMock,
419
- mock_table: MagicMock,
420
- mock_data_table: MagicMock,
421
- file_data: FileData,
422
- ):
423
- mocker.patch.object(IbmWatsonxUploader, "_df_to_arrow_table", return_value=mock_data_table)
424
- mock_upload_data_table = mocker.patch.object(IbmWatsonxUploader, "upload_data_table")
425
-
426
- uploader.upload_dataframe(test_df, file_data)
427
-
428
- mock_get_table.assert_called_once()
429
- mock_upload_data_table.assert_called_once_with(mock_table, mock_data_table, file_data)
430
-
431
-
432
- def test_ibm_watsonx_uploader_delete_can_delete(
433
- mocker: MockerFixture,
434
- uploader: IbmWatsonxUploader,
435
- mock_transaction: MagicMock,
436
- ):
437
- mocker.patch.object(IbmWatsonxUploader, "can_delete", return_value=True)
438
- mock_equal_to = mocker.patch("pyiceberg.expressions.EqualTo")
439
-
440
- uploader._delete(mock_transaction, "test_identifier")
441
-
442
- mock_equal_to.assert_called_once_with("test_record_id_key", "test_identifier")
443
- mock_transaction.delete.assert_called_once_with(delete_filter=mock_equal_to.return_value)
444
-
445
-
446
- def test_ibm_watsonx_uploader_delete_cannot_delete(
447
- caplog: pytest.LogCaptureFixture,
448
- mocker: MockerFixture,
449
- uploader: IbmWatsonxUploader,
450
- mock_transaction: MagicMock,
451
- ):
452
- mocker.patch.object(IbmWatsonxUploader, "can_delete", return_value=False)
453
-
454
- uploader._delete(mock_transaction, "test_identifier")
455
- mock_transaction.delete.assert_not_called()
456
- assert (
457
- "Table doesn't contain expected record id column test_record_id_key, skipping delete"
458
- in caplog.text
459
- )
File without changes
@@ -1,73 +0,0 @@
1
- from pathlib import Path
2
-
3
- import pytest
4
- from pytest_mock import MockerFixture
5
-
6
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
7
- from unstructured_ingest.interfaces import UploadStagerConfig
8
- from unstructured_ingest.processes.connectors.duckdb.base import BaseDuckDBUploadStager
9
-
10
-
11
- @pytest.fixture
12
- def mock_instance() -> BaseDuckDBUploadStager:
13
- return BaseDuckDBUploadStager(UploadStagerConfig())
14
-
15
-
16
- @pytest.mark.parametrize(
17
- ("input_filepath", "output_filename", "expected"),
18
- [
19
- (
20
- "/path/to/input_file.ndjson",
21
- "output_file.ndjson",
22
- "output_file.ndjson",
23
- ),
24
- ("input_file.txt", "output_file.json", "output_file.txt"),
25
- ("/path/to/input_file.json", "output_file", "output_file.json"),
26
- ],
27
- )
28
- def test_run_output_filename_suffix(
29
- mocker: MockerFixture,
30
- mock_instance: BaseDuckDBUploadStager,
31
- input_filepath: str,
32
- output_filename: str,
33
- expected: str,
34
- ):
35
- output_dir = Path("/tmp/test/output_dir")
36
-
37
- # Mocks
38
- mock_get_data = mocker.patch(
39
- "unstructured_ingest.processes.connectors.duckdb.base.get_data",
40
- return_value=[{"key": "value"}, {"key": "value2"}],
41
- )
42
- mock_conform_dict = mocker.patch.object(
43
- BaseDuckDBUploadStager,
44
- "conform_dict",
45
- side_effect=lambda element_dict, file_data: element_dict,
46
- )
47
- mock_get_output_path = mocker.patch.object(
48
- BaseDuckDBUploadStager, "get_output_path", return_value=output_dir / expected
49
- )
50
- mock_write_output = mocker.patch(
51
- "unstructured_ingest.processes.connectors.duckdb.base.write_data", return_value=None
52
- )
53
-
54
- # Act
55
- result = mock_instance.run(
56
- elements_filepath=Path(input_filepath),
57
- file_data=FileData(
58
- identifier="test",
59
- connector_type="test",
60
- source_identifiers=SourceIdentifiers(filename=input_filepath, fullpath=input_filepath),
61
- ),
62
- output_dir=output_dir,
63
- output_filename=output_filename,
64
- )
65
-
66
- # Assert
67
- mock_get_data.assert_called_once_with(path=Path(input_filepath))
68
- assert mock_conform_dict.call_count == 2
69
- mock_get_output_path.assert_called_once_with(output_filename=expected, output_dir=output_dir)
70
- mock_write_output.assert_called_once_with(
71
- path=output_dir / expected, data=[{"key": "value"}, {"key": "value2"}]
72
- )
73
- assert result.name == expected
File without changes
@@ -1,152 +0,0 @@
1
- from pathlib import Path
2
-
3
- import pandas as pd
4
- import pytest
5
- from pytest_mock import MockerFixture
6
-
7
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
8
- from unstructured_ingest.processes.connectors.sql.sql import (
9
- SQLConnectionConfig,
10
- SQLUploader,
11
- SQLUploaderConfig,
12
- SQLUploadStager,
13
- )
14
-
15
-
16
- @pytest.fixture
17
- def mock_instance() -> SQLUploadStager:
18
- return SQLUploadStager()
19
-
20
-
21
- @pytest.fixture
22
- def mock_uploader(mocker: MockerFixture) -> SQLUploader:
23
- mock_connection_config = mocker.Mock(spec=SQLConnectionConfig)
24
- mock_upload_config = mocker.Mock(spec=SQLUploaderConfig)
25
- return SQLUploader(
26
- upload_config=mock_upload_config,
27
- connection_config=mock_connection_config,
28
- connector_type="sql_test",
29
- )
30
-
31
-
32
- @pytest.mark.parametrize(
33
- ("input_filepath", "output_filename", "expected"),
34
- [
35
- (
36
- "/path/to/input_file.ndjson",
37
- "output_file.ndjson",
38
- "output_file.ndjson",
39
- ),
40
- ("input_file.txt", "output_file.json", "output_file.txt"),
41
- ("/path/to/input_file.json", "output_file", "output_file.json"),
42
- ],
43
- )
44
- def test_run_output_filename_suffix(
45
- mocker: MockerFixture,
46
- mock_instance: SQLUploadStager,
47
- input_filepath: str,
48
- output_filename: str,
49
- expected: str,
50
- ):
51
- output_dir = Path("/tmp/test/output_dir")
52
-
53
- # Mocks
54
- mock_get_data = mocker.patch(
55
- "unstructured_ingest.processes.connectors.sql.sql.get_data",
56
- return_value=[{"key": "value"}, {"key": "value2"}],
57
- )
58
- mock_conform_dict = mocker.patch.object(
59
- SQLUploadStager, "conform_dict", side_effect=lambda element_dict, file_data: element_dict
60
- )
61
- mock_conform_dataframe = mocker.patch.object(
62
- SQLUploadStager, "conform_dataframe", side_effect=lambda df: df
63
- )
64
- mock_get_output_path = mocker.patch.object(
65
- SQLUploadStager, "get_output_path", return_value=output_dir / expected
66
- )
67
- mock_write_output = mocker.patch(
68
- "unstructured_ingest.processes.connectors.sql.sql.write_data", return_value=None
69
- )
70
-
71
- # Act
72
- result = mock_instance.run(
73
- elements_filepath=Path(input_filepath),
74
- file_data=FileData(
75
- identifier="test",
76
- connector_type="test",
77
- source_identifiers=SourceIdentifiers(filename=input_filepath, fullpath=input_filepath),
78
- ),
79
- output_dir=output_dir,
80
- output_filename=output_filename,
81
- )
82
-
83
- # Assert
84
- mock_get_data.assert_called_once_with(path=Path(input_filepath))
85
- assert mock_conform_dict.call_count == 2
86
- mock_conform_dataframe.assert_called_once()
87
- mock_get_output_path.assert_called_once_with(output_filename=expected, output_dir=output_dir)
88
- mock_write_output.assert_called_once_with(
89
- path=output_dir / expected, data=[{"key": "value"}, {"key": "value2"}]
90
- )
91
- assert result.name == expected
92
-
93
-
94
- def test_fit_to_schema_drop_columns(mocker: MockerFixture, mock_uploader: SQLUploader):
95
- df = pd.DataFrame(
96
- {
97
- "col1": [1, 2],
98
- "col2": [3, 4],
99
- "col3": [5, 6],
100
- }
101
- )
102
- mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
103
-
104
- result = mock_uploader._fit_to_schema(df)
105
-
106
- assert "col3" not in result.columns
107
- assert "col1" in result.columns
108
- assert "col2" in result.columns
109
-
110
-
111
- def test_fit_to_schema_add_missing_columns(mocker: MockerFixture, mock_uploader: SQLUploader):
112
- df = pd.DataFrame(
113
- {
114
- "col1": [1, 2],
115
- }
116
- )
117
- mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
118
-
119
- result = mock_uploader._fit_to_schema(df)
120
-
121
- assert "col2" in result.columns
122
- assert result["col2"].isnull().all()
123
-
124
-
125
- def test_fit_to_schema_no_changes(mocker: MockerFixture, mock_uploader: SQLUploader):
126
- df = pd.DataFrame(
127
- {
128
- "col1": [1, 2],
129
- "col2": [3, 4],
130
- }
131
- )
132
- mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
133
-
134
- result = mock_uploader._fit_to_schema(df)
135
-
136
- assert "col1" in result.columns
137
- assert "col2" in result.columns
138
- assert result.equals(df)
139
-
140
-
141
- def test_fit_to_schema_no_add_missing_columns(mocker: MockerFixture, mock_uploader: SQLUploader):
142
- df = pd.DataFrame(
143
- {
144
- "col1": [1, 2],
145
- }
146
- )
147
- mocker.patch.object(mock_uploader, "get_table_columns", return_value=["col1", "col2"])
148
-
149
- result = mock_uploader._fit_to_schema(df, add_missing_columns=False)
150
-
151
- assert "col2" not in result.columns
152
- assert "col1" in result.columns