unstructured-ingest 0.5.19__py3-none-any.whl → 0.5.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic.

Files changed (29)
  1. test/integration/connectors/test_astradb.py +8 -2
  2. test/unit/v2/connectors/ibm_watsonx/__init__.py +0 -0
  3. test/unit/v2/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +459 -0
  4. test/unit/v2/connectors/sql/test_sql.py +79 -1
  5. unstructured_ingest/__version__.py +1 -1
  6. unstructured_ingest/embed/interfaces.py +7 -3
  7. unstructured_ingest/utils/data_prep.py +17 -5
  8. unstructured_ingest/utils/table.py +11 -4
  9. unstructured_ingest/v2/processes/connectors/__init__.py +2 -0
  10. unstructured_ingest/v2/processes/connectors/delta_table.py +8 -3
  11. unstructured_ingest/v2/processes/connectors/duckdb/base.py +4 -3
  12. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +5 -2
  13. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +5 -2
  14. unstructured_ingest/v2/processes/connectors/ibm_watsonx/__init__.py +10 -0
  15. unstructured_ingest/v2/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +301 -0
  16. unstructured_ingest/v2/processes/connectors/kdbai.py +6 -3
  17. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +10 -2
  18. unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +5 -3
  19. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -1
  20. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +7 -3
  21. unstructured_ingest/v2/processes/connectors/sql/sql.py +26 -12
  22. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -1
  23. unstructured_ingest/v2/processes/connectors/sql/vastdb.py +5 -7
  24. {unstructured_ingest-0.5.19.dist-info → unstructured_ingest-0.5.21.dist-info}/METADATA +174 -18
  25. {unstructured_ingest-0.5.19.dist-info → unstructured_ingest-0.5.21.dist-info}/RECORD +29 -25
  26. {unstructured_ingest-0.5.19.dist-info → unstructured_ingest-0.5.21.dist-info}/LICENSE.md +0 -0
  27. {unstructured_ingest-0.5.19.dist-info → unstructured_ingest-0.5.21.dist-info}/WHEEL +0 -0
  28. {unstructured_ingest-0.5.19.dist-info → unstructured_ingest-0.5.21.dist-info}/entry_points.txt +0 -0
  29. {unstructured_ingest-0.5.19.dist-info → unstructured_ingest-0.5.21.dist-info}/top_level.txt +0 -0
unstructured_ingest/utils/table.py

@@ -1,11 +1,16 @@
-from typing import Any
-
-import pandas as pd
+from typing import TYPE_CHECKING, Any

 from unstructured_ingest.utils.data_prep import flatten_dict
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from pandas import DataFrame


+@requires_dependencies(["pandas"])
 def get_default_pandas_dtypes() -> dict[str, Any]:
+    import pandas as pd
+
     return {
         "text": pd.StringDtype(),  # type: ignore
         "type": pd.StringDtype(),  # type: ignore
@@ -57,7 +62,9 @@ def get_default_pandas_dtypes() -> dict[str, Any]:
 def convert_to_pandas_dataframe(
     elements_dict: list[dict[str, Any]],
     drop_empty_cols: bool = False,
-) -> pd.DataFrame:
+) -> "DataFrame":
+    import pandas as pd
+
     # Flatten metadata if it hasn't already been flattened
     for d in elements_dict:
         if metadata := d.pop("metadata", None):
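The change above is the pattern this release applies across the connectors that follow: the module-level `import pandas as pd` is replaced by a `TYPE_CHECKING`-guarded import for annotations plus a function-local import behind `requires_dependencies`, so importing the module no longer requires pandas at runtime. A minimal sketch of that pattern, using the decorator and import path shown in the diff (the function name `elements_to_dataframe` is only an illustration, not part of the package):

```python
from typing import TYPE_CHECKING, Any

from unstructured_ingest.utils.dep_check import requires_dependencies

# Only imported for type checkers; no pandas requirement at import time.
if TYPE_CHECKING:
    from pandas import DataFrame


@requires_dependencies(["pandas"])
def elements_to_dataframe(elements: list[dict[str, Any]]) -> "DataFrame":
    # pandas is resolved lazily, inside the decorated function
    import pandas as pd

    return pd.DataFrame(data=elements)
```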
unstructured_ingest/v2/processes/connectors/__init__.py

@@ -4,6 +4,7 @@ import unstructured_ingest.v2.processes.connectors.databricks  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.duckdb  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.elasticsearch  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.fsspec  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.ibm_watsonx  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.kafka  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.lancedb  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.qdrant  # noqa: F401
@@ -121,4 +122,5 @@ add_source_entry(source_type=DISCORD_CONNECTOR_TYPE, entry=discord_source_entry)
 add_destination_entry(destination_type=REDIS_CONNECTOR_TYPE, entry=redis_destination_entry)

 add_source_entry(source_type=JIRA_CONNECTOR_TYPE, entry=jira_source_entry)
+
 add_source_entry(source_type=ZENDESK_CONNECTOR_TYPE, entry=zendesk_source_entry)
unstructured_ingest/v2/processes/connectors/delta_table.py

@@ -3,10 +3,9 @@ import traceback
 from dataclasses import dataclass, field
 from multiprocessing import Process, Queue
 from pathlib import Path
-from typing import Any, Optional
+from typing import TYPE_CHECKING, Any, Optional
 from urllib.parse import urlparse

-import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.error import DestinationConnectionError
@@ -27,6 +26,9 @@ from unstructured_ingest.v2.processes.connector_registry import DestinationRegis

 CONNECTOR_TYPE = "delta_table"

+if TYPE_CHECKING:
+    from pandas import DataFrame
+

 @requires_dependencies(["deltalake"], extras="delta-table")
 def write_deltalake_with_error_handling(queue, **kwargs):
@@ -136,7 +138,7 @@ class DeltaTableUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
         updated_upload_path = os.path.join(
             self.connection_config.table_uri, file_data.source_identifiers.relative_path
         )
@@ -172,7 +174,10 @@ class DeltaTableUploader(Uploader):
            logger.error(f"Exception occurred in write_deltalake: {error_message}")
            raise RuntimeError(f"Error in write_deltalake: {error_message}")

+    @requires_dependencies(["pandas"], extras="delta-table")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df, file_data=file_data)

unstructured_ingest/v2/processes/connectors/duckdb/base.py

@@ -2,9 +2,8 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Any

-import pandas as pd
-
 from unstructured_ingest.utils.data_prep import get_data, write_data
+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import FileData, UploadStager
 from unstructured_ingest.v2.utils import get_enhanced_element_id

@@ -55,7 +54,6 @@ _COLUMNS = (

 @dataclass
 class BaseDuckDBUploadStager(UploadStager):
-
     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         data = element_dict.copy()
         metadata: dict[str, Any] = data.pop("metadata", {})
@@ -72,6 +70,7 @@ class BaseDuckDBUploadStager(UploadStager):
         data = {k: v for k, v in data.items() if k in _COLUMNS}
         return data

+    @requires_dependencies(["pandas"], extras="duckdb")
     def run(
         self,
         elements_filepath: Path,
@@ -80,6 +79,8 @@ class BaseDuckDBUploadStager(UploadStager):
         output_filename: str,
         **kwargs: Any,
     ) -> Path:
+        import pandas as pd
+
         elements_contents = get_data(path=elements_filepath)
         output_filename_suffix = Path(elements_filepath).suffix
         output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py

@@ -3,7 +3,6 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional

-import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.error import DestinationConnectionError
@@ -23,6 +22,7 @@ from unstructured_ingest.v2.processes.connectors.duckdb.base import BaseDuckDBUp

 if TYPE_CHECKING:
     from duckdb import DuckDBPyConnection as DuckDBConnection
+    from pandas import DataFrame

 CONNECTOR_TYPE = "duckdb"

@@ -101,7 +101,7 @@ class DuckDBUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    def upload_dataframe(self, df: pd.DataFrame) -> None:
+    def upload_dataframe(self, df: "DataFrame") -> None:
         logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")

         with self.connection_config.get_client() as conn:
@@ -109,7 +109,10 @@
                f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df"  # noqa: E501
            )

+    @requires_dependencies(["pandas"], extras="duckdb")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df)

unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py

@@ -3,7 +3,6 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional

-import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.__version__ import __version__ as unstructured_io_ingest_version
@@ -24,6 +23,7 @@ from unstructured_ingest.v2.processes.connectors.duckdb.base import BaseDuckDBUp

 if TYPE_CHECKING:
     from duckdb import DuckDBPyConnection as MotherDuckConnection
+    from pandas import DataFrame

 CONNECTOR_TYPE = "motherduck"

@@ -100,7 +100,7 @@ class MotherDuckUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    def upload_dataframe(self, df: pd.DataFrame) -> None:
+    def upload_dataframe(self, df: "DataFrame") -> None:
         logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
         database = self.connection_config.database
         db_schema = self.connection_config.db_schema
@@ -109,7 +109,10 @@
         with self.connection_config.get_client() as conn:
             conn.query(f'INSERT INTO "{database}"."{db_schema}"."{table}" BY NAME SELECT * FROM df')

+    @requires_dependencies(["pandas"], extras="duckdb")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df)

unstructured_ingest/v2/processes/connectors/ibm_watsonx/__init__.py

@@ -0,0 +1,10 @@
+from __future__ import annotations
+
+from unstructured_ingest.v2.processes.connector_registry import add_destination_entry
+
+from .ibm_watsonx_s3 import CONNECTOR_TYPE as IBM_WATSONX_S3_CONNECTOR_TYPE
+from .ibm_watsonx_s3 import ibm_watsonx_s3_destination_entry
+
+add_destination_entry(
+    destination_type=IBM_WATSONX_S3_CONNECTOR_TYPE, entry=ibm_watsonx_s3_destination_entry
+)
unstructured_ingest/v2/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py

@@ -0,0 +1,301 @@
+import logging
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional, Tuple
+
+import pandas as pd
+from pydantic import Field, Secret
+
+from unstructured_ingest.utils.data_prep import get_data_df
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
+from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    FileData,
+    UploaderConfig,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.sql.sql import (
+    SQLUploader,
+    SQLUploadStager,
+    SQLUploadStagerConfig,
+)
+
+if TYPE_CHECKING:
+    from pyarrow import Table as ArrowTable
+    from pyiceberg.catalog.rest import RestCatalog
+    from pyiceberg.table import Table, Transaction
+
+CONNECTOR_TYPE = "ibm_watsonx_s3"
+
+DEFAULT_IBM_CLOUD_AUTH_URL = "https://iam.cloud.ibm.com/identity/token"
+DEFAULT_ICEBERG_URI_PATH = "/mds/iceberg"
+DEFAULT_ICEBERG_CATALOG_TYPE = "rest"
+
+
+class IcebergCommitFailedException(Exception):
+    """Failed to commit changes to the iceberg table."""
+
+
+class IbmWatsonxAccessConfig(AccessConfig):
+    iam_api_key: str = Field(description="IBM IAM API Key")
+    access_key_id: str = Field(description="Cloud Object Storage HMAC Access Key ID")
+    secret_access_key: str = Field(description="Cloud Object Storage HMAC Secret Access Key")
+
+
+class IbmWatsonxConnectionConfig(ConnectionConfig):
+    access_config: Secret[IbmWatsonxAccessConfig]
+    iceberg_endpoint: str = Field(description="Iceberg REST endpoint")
+    object_storage_endpoint: str = Field(description="Cloud Object Storage public endpoint")
+    object_storage_region: str = Field(description="Cloud Object Storage region")
+    catalog: str = Field(description="Catalog name")
+
+    _bearer_token: Optional[dict[str, Any]] = None
+
+    @property
+    def iceberg_url(self) -> str:
+        return f"https://{self.iceberg_endpoint.strip('/')}{DEFAULT_ICEBERG_URI_PATH}"
+
+    @property
+    def object_storage_url(self) -> str:
+        return f"https://{self.object_storage_endpoint.strip('/')}"
+
+    @property
+    def bearer_token(self) -> str:
+        # Add 60 seconds to deal with edge cases where the token expires before the request is made
+        timestamp = int(time.time()) + 60
+        if self._bearer_token is None or self._bearer_token.get("expiration", 0) <= timestamp:
+            self._bearer_token = self.generate_bearer_token()
+        return self._bearer_token["access_token"]
+
+    @requires_dependencies(["httpx"], extras="ibm-watsonx-s3")
+    def wrap_error(self, e: Exception) -> Exception:
+        import httpx
+
+        if not isinstance(e, httpx.HTTPStatusError):
+            logger.error(f"Unhandled exception from IBM watsonx.data connector: {e}", exc_info=True)
+            return e
+        url = e.request.url
+        response_code = e.response.status_code
+        if response_code == 401:
+            logger.error(
+                f"Failed to authenticate IBM watsonx.data user {url}, status code {response_code}"
+            )
+            return UserAuthError(e)
+        if response_code == 403:
+            logger.error(
+                f"Given IBM watsonx.data user is not authorized {url}, status code {response_code}"
+            )
+            return UserAuthError(e)
+        if 400 <= response_code < 500:
+            logger.error(
+                f"Request to {url} failed"
+                f"in IBM watsonx.data connector, status code {response_code}"
+            )
+            return UserError(e)
+        if response_code > 500:
+            logger.error(
+                f"Request to {url} failed"
+                f"in IBM watsonx.data connector, status code {response_code}"
+            )
+            return ProviderError(e)
+        logger.error(f"Unhandled exception from IBM watsonx.data connector: {e}", exc_info=True)
+        return e
+
+    @requires_dependencies(["httpx"], extras="ibm-watsonx-s3")
+    def generate_bearer_token(self) -> dict[str, Any]:
+        import httpx
+
+        headers = {
+            "Content-Type": "application/x-www-form-urlencoded",
+            "Accept": "application/json",
+        }
+        data = {
+            "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
+            "apikey": self.access_config.get_secret_value().iam_api_key,
+        }
+
+        logger.info("Generating IBM IAM Bearer Token")
+        try:
+            response = httpx.post(DEFAULT_IBM_CLOUD_AUTH_URL, headers=headers, data=data)
+            response.raise_for_status()
+        except Exception as e:
+            raise self.wrap_error(e)
+        return response.json()
+
+    def get_catalog_config(self) -> dict[str, Any]:
+        return {
+            "name": self.catalog,
+            "type": DEFAULT_ICEBERG_CATALOG_TYPE,
+            "uri": self.iceberg_url,
+            "token": self.bearer_token,
+            "warehouse": self.catalog,
+            "s3.endpoint": self.object_storage_url,
+            "s3.access-key-id": self.access_config.get_secret_value().access_key_id,
+            "s3.secret-access-key": self.access_config.get_secret_value().secret_access_key,
+            "s3.region": self.object_storage_region,
+        }
+
+    @requires_dependencies(["pyiceberg"], extras="ibm-watsonx-s3")
+    @contextmanager
+    def get_catalog(self) -> Generator["RestCatalog", None, None]:
+        from pyiceberg.catalog import load_catalog
+
+        try:
+            catalog_config = self.get_catalog_config()
+            catalog = load_catalog(**catalog_config)
+        except Exception as e:
+            logger.error(f"Failed to connect to catalog '{self.catalog}': {e}", exc_info=True)
+            raise ProviderError(f"Failed to connect to catalog '{self.catalog}': {e}")
+
+        yield catalog
+
+
+@dataclass
+class IbmWatsonxUploadStagerConfig(SQLUploadStagerConfig):
+    pass
+
+
+@dataclass
+class IbmWatsonxUploadStager(SQLUploadStager):
+    upload_stager_config: IbmWatsonxUploadStagerConfig = field(
+        default_factory=IbmWatsonxUploadStagerConfig
+    )
+
+
+class IbmWatsonxUploaderConfig(UploaderConfig):
+    namespace: str = Field(description="Namespace name")
+    table: str = Field(description="Table name")
+    max_retries: int = Field(
+        default=5, description="Maximum number of retries to upload data", ge=2, le=10
+    )
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="Searchable key to find entries for the same record on previous runs",
+    )
+
+    @property
+    def table_identifier(self) -> Tuple[str, str]:
+        return (self.namespace, self.table)
+
+
+@dataclass
+class IbmWatsonxUploader(SQLUploader):
+    connection_config: IbmWatsonxConnectionConfig
+    upload_config: IbmWatsonxUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> None:
+        with self.connection_config.get_catalog() as catalog:
+            if not catalog.namespace_exists(self.upload_config.namespace):
+                raise UserError(f"Namespace '{self.upload_config.namespace}' does not exist")
+            if not catalog.table_exists(self.upload_config.table_identifier):
+                raise UserError(
+                    f"Table '{self.upload_config.table}' does not exist in namespace '{self.upload_config.namespace}'"  # noqa: E501
+                )
+
+    @contextmanager
+    def get_table(self) -> Generator["Table", None, None]:
+        with self.connection_config.get_catalog() as catalog:
+            table = catalog.load_table(self.upload_config.table_identifier)
+            yield table
+
+    def get_table_columns(self) -> list[str]:
+        if self._columns is None:
+            with self.get_table() as table:
+                self._columns = table.schema().column_names
+        return self._columns
+
+    def can_delete(self) -> bool:
+        return self.upload_config.record_id_key in self.get_table_columns()
+
+    @requires_dependencies(["pyarrow"], extras="ibm-watsonx-s3")
+    def _df_to_arrow_table(self, df: pd.DataFrame) -> "ArrowTable":
+        import pyarrow as pa
+
+        # Iceberg will automatically fill missing columns with nulls
+        # Iceberg will throw an error if the DataFrame column has only null values
+        # because it can't infer the type of the column and match it with the table schema
+        return pa.Table.from_pandas(self._fit_to_schema(df, add_missing_columns=False))
+
+    @requires_dependencies(["pyiceberg"], extras="ibm-watsonx-s3")
+    def _delete(self, transaction: "Transaction", identifier: str) -> None:
+        from pyiceberg.expressions import EqualTo
+
+        if self.can_delete():
+            transaction.delete(delete_filter=EqualTo(self.upload_config.record_id_key, identifier))
+        else:
+            logger.warning(
+                f"Table doesn't contain expected "
+                f"record id column "
+                f"{self.upload_config.record_id_key}, skipping delete"
+            )
+
+    @requires_dependencies(["pyiceberg", "tenacity"], extras="ibm-watsonx-s3")
+    def upload_data_table(
+        self, table: "Table", data_table: "ArrowTable", file_data: FileData
+    ) -> None:
+        from pyiceberg.exceptions import CommitFailedException
+        from tenacity import (
+            before_log,
+            retry,
+            retry_if_exception_type,
+            stop_after_attempt,
+            wait_random,
+        )
+
+        @retry(
+            stop=stop_after_attempt(self.upload_config.max_retries),
+            wait=wait_random(),
+            retry=retry_if_exception_type(IcebergCommitFailedException),
+            before=before_log(logger, logging.DEBUG),
+            reraise=True,
+        )
+        def _upload_data_table(table: "Table", data_table: "ArrowTable", file_data: FileData):
+            try:
+                with table.transaction() as transaction:
+                    self._delete(transaction, file_data.identifier)
+                    transaction.append(data_table)
+            except CommitFailedException as e:
+                table.refresh()
+                logger.debug(e)
+                raise IcebergCommitFailedException(e)
+            except Exception as e:
+                raise ProviderError(f"Failed to upload data to table: {e}")
+
+        try:
+            return _upload_data_table(table, data_table, file_data)
+        except ProviderError:
+            raise
+        except Exception as e:
+            raise ProviderError(f"Failed to upload data to table: {e}")
+
+    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
+        data_table = self._df_to_arrow_table(df)
+
+        with self.get_table() as table:
+            self.upload_data_table(table, data_table, file_data)
+
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        df = pd.DataFrame(data)
+        self.upload_dataframe(df=df, file_data=file_data)
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        df = get_data_df(path=path)
+        self.upload_dataframe(df=df, file_data=file_data)
+
+
+ibm_watsonx_s3_destination_entry = DestinationRegistryEntry(
+    connection_config=IbmWatsonxConnectionConfig,
+    uploader=IbmWatsonxUploader,
+    uploader_config=IbmWatsonxUploaderConfig,
+    upload_stager=IbmWatsonxUploadStager,
+    upload_stager_config=IbmWatsonxUploadStagerConfig,
+)
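For orientation only: a hedged sketch of how the classes added above might be instantiated directly, rather than through the connector registry as the pipeline normally does. All endpoint, credential, catalog, namespace, and table values are placeholders, and it is assumed that pydantic coerces the plain access config into the `Secret` wrapper and that the uploader dataclass accepts just these keyword arguments.

```python
# Hedged usage sketch for the new IBM watsonx.data destination; all values
# below are placeholders, not real endpoints or credentials.
from unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3 import (
    IbmWatsonxAccessConfig,
    IbmWatsonxConnectionConfig,
    IbmWatsonxUploader,
    IbmWatsonxUploaderConfig,
)

connection_config = IbmWatsonxConnectionConfig(
    access_config=IbmWatsonxAccessConfig(
        iam_api_key="<iam-api-key>",
        access_key_id="<cos-hmac-access-key-id>",
        secret_access_key="<cos-hmac-secret-access-key>",
    ),
    iceberg_endpoint="<watsonx-data-iceberg-rest-endpoint>",
    object_storage_endpoint="<cos-public-endpoint>",
    object_storage_region="<cos-region>",
    catalog="<catalog-name>",
)

uploader = IbmWatsonxUploader(
    connection_config=connection_config,
    upload_config=IbmWatsonxUploaderConfig(namespace="<namespace>", table="<table>"),
)

# As shown in the diff, precheck() verifies the namespace and table exist
# in the Iceberg catalog before any data is written.
uploader.precheck()
```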
unstructured_ingest/v2/processes/connectors/kdbai.py

@@ -3,7 +3,6 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional

-import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.error import DestinationConnectionError
@@ -26,6 +25,7 @@ from unstructured_ingest.v2.utils import get_enhanced_element_id

 if TYPE_CHECKING:
     from kdbai_client import Database, Session, Table
+    from pandas import DataFrame

 CONNECTOR_TYPE = "kdbai"

@@ -118,11 +118,11 @@ class KdbaiUploader(Uploader):
            table = db.table(self.upload_config.table_name)
            yield table

-    def upsert_batch(self, batch: pd.DataFrame):
+    def upsert_batch(self, batch: "DataFrame"):
         with self.get_table() as table:
             table.insert(batch)

-    def process_dataframe(self, df: pd.DataFrame):
+    def process_dataframe(self, df: "DataFrame"):
         logger.debug(
             f"uploading {len(df)} entries to {self.connection_config.endpoint} "
             f"db {self.upload_config.database_name} in table {self.upload_config.table_name}"
@@ -130,7 +130,10 @@ class KdbaiUploader(Uploader):
         for batch_df in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
             self.upsert_batch(batch=batch_df)

+    @requires_dependencies(["pandas"], extras="kdbai")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data=data)
         self.process_dataframe(df=df)

unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py

@@ -8,7 +8,6 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, AsyncGenerator, Optional

-import pandas as pd
 from pydantic import Field

 from unstructured_ingest.error import DestinationConnectionError
@@ -26,6 +25,7 @@ CONNECTOR_TYPE = "lancedb"
 if TYPE_CHECKING:
     from lancedb import AsyncConnection
     from lancedb.table import AsyncTable
+    from pandas import DataFrame


 class LanceDBConnectionConfig(ConnectionConfig, ABC):
@@ -69,6 +69,7 @@ class LanceDBUploadStager(UploadStager):
         default_factory=LanceDBUploadStagerConfig
     )

+    @requires_dependencies(["pandas"], extras="lancedb")
     def run(
         self,
         elements_filepath: Path,
@@ -77,6 +78,8 @@ class LanceDBUploadStager(UploadStager):
         output_filename: str,
         **kwargs: Any,
     ) -> Path:
+        import pandas as pd
+
         with open(elements_filepath) as elements_file:
             elements_contents: list[dict] = json.load(elements_file)

@@ -129,7 +132,10 @@ class LanceDBUploader(Uploader):
         finally:
             table.close()

+    @requires_dependencies(["pandas"], extras="lancedb")
     async def run_async(self, path, file_data, **kwargs):
+        import pandas as pd
+
         df = pd.read_feather(path)
         async with self.get_table() as table:
             schema = await table.schema()
@@ -144,7 +150,9 @@ class LanceDBUploader(Uploader):
             await table.delete(f'{RECORD_ID_LABEL} = "{file_data.identifier}"')
             await table.add(data=df)

-    def _fit_to_schema(self, df: pd.DataFrame, schema) -> pd.DataFrame:
+    def _fit_to_schema(self, df: "DataFrame", schema) -> "DataFrame":
+        import pandas as pd
+
         columns = set(df.columns)
         schema_fields = set(schema.names)
         columns_to_drop = columns - schema_fields
unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py

@@ -3,8 +3,6 @@ from contextlib import contextmanager
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Generator, Optional

-import numpy as np
-import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.utils.data_prep import split_dataframe
@@ -27,6 +25,7 @@ if TYPE_CHECKING:
     from databricks.sdk.core import oauth_service_principal
     from databricks.sql.client import Connection as DeltaTableConnection
     from databricks.sql.client import Cursor as DeltaTableCursor
+    from pandas import DataFrame

 CONNECTOR_TYPE = "databricks_delta_tables"

@@ -180,7 +179,10 @@ class DatabricksDeltaTablesUploader(SQLUploader):
        )
        return statement

-    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
+    @requires_dependencies(["pandas"], extras="databricks-delta-tables")
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
+        import numpy as np
+
         if self.can_delete():
             self.delete_by_record_id(file_data=file_data)
         else:
unstructured_ingest/v2/processes/connectors/sql/singlestore.py

@@ -3,9 +3,9 @@ from contextlib import contextmanager
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Generator, Optional

-import pandas as pd
 from pydantic import Field, Secret

+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -46,6 +46,7 @@ class SingleStoreConnectionConfig(SQLConnectionConfig):
     database: Optional[str] = Field(default=None, description="SingleStore database")

     @contextmanager
+    @requires_dependencies(["singlestoredb"], extras="singlestore")
     def get_connection(self) -> Generator["SingleStoreConnection", None, None]:
         import singlestoredb as s2

@@ -130,9 +131,12 @@ class SingleStoreUploader(SQLUploader):
     values_delimiter: str = "%s"
     connector_type: str = CONNECTOR_TYPE

+    @requires_dependencies(["pandas"], extras="singlestore")
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
     ) -> list[tuple[Any, ...]]:
+        import pandas as pd
+
         output = []
         for row in data:
             parsed = []