unstructured-ingest 1.2.11__py3-none-any.whl → 1.2.14__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -1 +1 @@
1
- __version__ = "1.2.11" # pragma: no cover
1
+ __version__ = "1.2.14" # pragma: no cover
@@ -58,9 +58,22 @@ def conform_query(query: str, provider: str) -> dict:
58
58
 
59
59
 
60
60
  class BedrockEmbeddingConfig(EmbeddingConfig):
61
- aws_access_key_id: SecretStr = Field(description="aws access key id")
62
- aws_secret_access_key: SecretStr = Field(description="aws secret access key")
63
- region_name: str = Field(description="aws region name", default="us-west-2")
61
+ aws_access_key_id: SecretStr | None = Field(description="aws access key id", default=None)
62
+ aws_secret_access_key: SecretStr | None = Field(
63
+ description="aws secret access key", default=None
64
+ )
65
+ region_name: str = Field(
66
+ description="aws region name",
67
+ default_factory=lambda: (
68
+ os.getenv("BEDROCK_REGION_NAME") or
69
+ os.getenv("AWS_DEFAULT_REGION") or
70
+ "us-west-2"
71
+ )
72
+ )
73
+ endpoint_url: str | None = Field(description="custom bedrock endpoint url", default=None)
74
+ access_method: str = Field(
75
+ description="authentication method", default="credentials"
76
+ ) # "credentials" or "iam"
64
77
  embedder_model_name: str = Field(
65
78
  default="amazon.titan-embed-text-v1",
66
79
  alias="model_name",
@@ -96,6 +109,20 @@ class BedrockEmbeddingConfig(EmbeddingConfig):
96
109
  return e
97
110
 
98
111
  def run_precheck(self) -> None:
112
+ # Validate access method and credentials configuration
113
+ if self.access_method == "credentials":
114
+ if not (self.aws_access_key_id and self.aws_secret_access_key):
115
+ raise ValueError(
116
+ "Credentials access method requires aws_access_key_id and aws_secret_access_key"
117
+ )
118
+ elif self.access_method == "iam":
119
+ # For IAM, credentials are handled by AWS SDK
120
+ pass
121
+ else:
122
+ raise ValueError(
123
+ f"Invalid access_method: {self.access_method}. Must be 'credentials' or 'iam'"
124
+ )
125
+
99
126
  client = self.get_bedrock_client()
100
127
  try:
101
128
  model_info = client.list_foundation_models(byOutputModality="EMBEDDING")
@@ -113,11 +140,30 @@ class BedrockEmbeddingConfig(EmbeddingConfig):
113
140
  raise self.wrap_error(e=e)
114
141
 
115
142
  def get_client_kwargs(self) -> dict:
116
- return {
117
- "aws_access_key_id": self.aws_access_key_id.get_secret_value(),
118
- "aws_secret_access_key": self.aws_secret_access_key.get_secret_value(),
143
+ kwargs = {
119
144
  "region_name": self.region_name,
120
145
  }
146
+
147
+ if self.endpoint_url:
148
+ kwargs["endpoint_url"] = self.endpoint_url
149
+
150
+ if self.access_method == "credentials":
151
+ if self.aws_access_key_id and self.aws_secret_access_key:
152
+ kwargs["aws_access_key_id"] = self.aws_access_key_id.get_secret_value()
153
+ kwargs["aws_secret_access_key"] = self.aws_secret_access_key.get_secret_value()
154
+ else:
155
+ raise ValueError(
156
+ "Credentials access method requires aws_access_key_id and aws_secret_access_key"
157
+ )
158
+ elif self.access_method == "iam":
159
+ # For IAM, boto3 will use default credential chain (IAM roles, environment, etc.)
160
+ pass
161
+ else:
162
+ raise ValueError(
163
+ f"Invalid access_method: {self.access_method}. Must be 'credentials' or 'iam'"
164
+ )
165
+
166
+ return kwargs
121
167
 
122
168
  @requires_dependencies(
123
169
  ["boto3"],
@@ -147,6 +147,10 @@ class IbmWatsonxConnectionConfig(ConnectionConfig):
147
147
  "s3.access-key-id": self.access_config.get_secret_value().access_key_id,
148
148
  "s3.secret-access-key": self.access_config.get_secret_value().secret_access_key,
149
149
  "s3.region": self.object_storage_region,
150
+ # By default this header is set to `vended-credentials`, and default bucket
151
+ # configuration doesn't allow vending credentials. We need to set it to `None`
152
+ # in order to use user-provided S3 credentials.
153
+ "header.X-Iceberg-Access-Delegation": None,
150
154
  }
151
155
 
152
156
  @requires_dependencies(["pyiceberg"], extras="ibm-watsonx-s3")
@@ -10,7 +10,6 @@ from unstructured_ingest.data_types.file_data import FileData
10
10
  from unstructured_ingest.error import (
11
11
  DestinationConnectionError,
12
12
  KeyError,
13
- ValueError,
14
13
  WriteError,
15
14
  )
16
15
  from unstructured_ingest.interfaces import (
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import asyncio
4
+ import logging
4
5
  from dataclasses import dataclass
5
6
  from typing import TYPE_CHECKING, Any, AsyncIterator, Optional
6
7
 
@@ -210,7 +211,7 @@ class SharepointIndexer(OnedriveIndexer):
210
211
 
211
212
 
212
213
  class SharepointDownloaderConfig(OnedriveDownloaderConfig):
213
- pass
214
+ max_retries: int = 10
214
215
 
215
216
 
216
217
  @dataclass
@@ -219,10 +220,22 @@ class SharepointDownloader(OnedriveDownloader):
219
220
  download_config: SharepointDownloaderConfig
220
221
  connector_type: str = CONNECTOR_TYPE
221
222
 
223
+ @staticmethod
224
+ def retry_on_status_code(exc):
225
+ error_msg = str(exc).lower()
226
+ return "429" in error_msg or "activitylimitreached" in error_msg or "throttled" in error_msg
227
+
222
228
  @SourceConnectionNetworkError.wrap
223
229
  @requires_dependencies(["office365"], extras="sharepoint")
224
230
  def _fetch_file(self, file_data: FileData) -> DriveItem:
225
231
  from office365.runtime.client_request_exception import ClientRequestException
232
+ from tenacity import (
233
+ before_log,
234
+ retry,
235
+ retry_if_exception,
236
+ stop_after_attempt,
237
+ wait_exponential,
238
+ )
226
239
 
227
240
  if file_data.source_identifiers is None or not file_data.source_identifiers.fullpath:
228
241
  raise ValueError(
@@ -233,13 +246,27 @@ class SharepointDownloader(OnedriveDownloader):
233
246
  server_relative_path = file_data.source_identifiers.fullpath
234
247
  client = self.connection_config.get_client()
235
248
 
236
- try:
237
- client_site = client.sites.get_by_url(self.connection_config.site).get().execute_query()
238
- site_drive_item = self.connection_config._get_drive_item(client_site)
239
- except ClientRequestException:
240
- logger.info("Site not found")
241
- raise SourceConnectionError(f"Site not found: {self.connection_config.site}")
242
- file = site_drive_item.get_by_path(server_relative_path).get().execute_query()
249
+ @retry(
250
+ stop=stop_after_attempt(self.download_config.max_retries),
251
+ wait=wait_exponential(exp_base=2, multiplier=1, min=2, max=10),
252
+ retry=retry_if_exception(self.retry_on_status_code),
253
+ before=before_log(logger, logging.DEBUG),
254
+ reraise=True,
255
+ )
256
+ def _get_item_by_path() -> DriveItem:
257
+ try:
258
+ client_site = (
259
+ client.sites.get_by_url(self.connection_config.site).get().execute_query()
260
+ )
261
+ site_drive_item = self.connection_config._get_drive_item(client_site)
262
+ except ClientRequestException:
263
+ logger.info(f"Site not found: {self.connection_config.site}")
264
+ raise SourceConnectionError(f"Site not found: {self.connection_config.site}")
265
+ file = site_drive_item.get_by_path(server_relative_path).get().execute_query()
266
+ return file
267
+
268
+ # Call the retry-wrapped function
269
+ file = _get_item_by_path()
243
270
 
244
271
  if not file:
245
272
  raise NotFoundError(f"file not found: {server_relative_path}")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unstructured_ingest
3
- Version: 1.2.11
3
+ Version: 1.2.14
4
4
  Summary: Local ETL data pipeline to get data RAG ready
5
5
  Author-email: Unstructured Technologies <devops@unstructuredai.io>
6
6
  License-Expression: Apache-2.0
@@ -1,5 +1,5 @@
1
1
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
2
- unstructured_ingest/__version__.py,sha256=ah-eP2N3WKTgQpNwUrBqGx6pOEXxVpXyJiiqBeWRLvI,43
2
+ unstructured_ingest/__version__.py,sha256=3NalubdNM3MTB5UJ7yWDPdOo9kbE1-bL_USqp5DEYnk,43
3
3
  unstructured_ingest/error.py,sha256=chM7zQSTKjaKaQt_2_QkoZDUwY5XPNeACML7JqOWRLY,4036
4
4
  unstructured_ingest/errors_v2.py,sha256=chM7zQSTKjaKaQt_2_QkoZDUwY5XPNeACML7JqOWRLY,4036
5
5
  unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
@@ -23,7 +23,7 @@ unstructured_ingest/data_types/entities.py,sha256=ECc6EkZ5_ZUvK7uaALYOynfFmofIrH
23
23
  unstructured_ingest/data_types/file_data.py,sha256=J0RQa7YXhhxiLVzhPbF5Hl2nzSpxLFK9vrP6RTBWlSg,3833
24
24
  unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
25
  unstructured_ingest/embed/azure_openai.py,sha256=Q_buBkAcx9FBuTsAqKbRU8vd9vDh8JoDOEth4fFxHbg,2160
26
- unstructured_ingest/embed/bedrock.py,sha256=EZWGw8f_QKDbYS7dSiFY6seLGIwGd8LmU-HeYT26Stw,9128
26
+ unstructured_ingest/embed/bedrock.py,sha256=BnlKOuL1e4FfR4nV_Ro3A26Fqj3Pq-ZBaSNAgIzUQH0,10955
27
27
  unstructured_ingest/embed/huggingface.py,sha256=6Gx9L3xa3cv9fX4AMuLsePJQF4T_jwkKjovfqF5X1NM,2435
28
28
  unstructured_ingest/embed/interfaces.py,sha256=VCrCSJiEfIxKB4NL4AHgKb-0vB_SEekb47zMUW6gWf0,5211
29
29
  unstructured_ingest/embed/mixedbreadai.py,sha256=uKTqzoi4M_WeYZu-qc_TSxwJONOESzxVbBLUbD1Wbns,3922
@@ -76,7 +76,7 @@ unstructured_ingest/processes/connectors/google_drive.py,sha256=O6RtOQH9dHBUtwKs
76
76
  unstructured_ingest/processes/connectors/jira.py,sha256=5BZeYHoWKzcwZKUbJDFsdCUcSHbLDZTErOdNbteDwI0,20290
77
77
  unstructured_ingest/processes/connectors/kdbai.py,sha256=XhxYpKSAoFPBsDQWwNuLX03DCxOVr7yquj9VYM55Rtc,5174
78
78
  unstructured_ingest/processes/connectors/local.py,sha256=Tsp9d9YSx2zPh4yl2U--P6cMIQSKMzsFAGyNXXtdS-4,7529
79
- unstructured_ingest/processes/connectors/milvus.py,sha256=vlUxcG1nVSyUQvBJJ0HoJTTli55ho5TulfUxSShDPfE,12118
79
+ unstructured_ingest/processes/connectors/milvus.py,sha256=Bkt4u1zzrKqpO0CZbmuFfbtd824ws5XouiTAnc4I4BM,12102
80
80
  unstructured_ingest/processes/connectors/mongodb.py,sha256=zhGWnEJYZnKzjuElyYAEJUT3M7J5m0e48TpVPdiKsBA,15412
81
81
  unstructured_ingest/processes/connectors/neo4j.py,sha256=jmnxQmi8EjS22mFKfcdXajZrxoKEkrzHRtrP6QeTuFI,20353
82
82
  unstructured_ingest/processes/connectors/onedrive.py,sha256=qhIeFWotFuIxt1Ehg-6IEWXaDu4p-Zhy0u14CfDcnZo,20142
@@ -84,7 +84,7 @@ unstructured_ingest/processes/connectors/outlook.py,sha256=fVW50hRbk1SJUbhEY1ulu
84
84
  unstructured_ingest/processes/connectors/pinecone.py,sha256=b6Wot25oiELj9oBhAwlps4pGE0QWkj8n0wdLTcM7osg,15056
85
85
  unstructured_ingest/processes/connectors/redisdb.py,sha256=HvqjOjpKdKa5C7i1toH4pPhDhiI9pA1-swozpnuCXZQ,8034
86
86
  unstructured_ingest/processes/connectors/salesforce.py,sha256=C2HWiWglZTu0zTibp4eT1lOGYCB-NYnGPr9-Dt635sY,11730
87
- unstructured_ingest/processes/connectors/sharepoint.py,sha256=2Se6FoHRerBM06UosGcAbD5r5ZtRzAI0rNJxCdVpx34,10668
87
+ unstructured_ingest/processes/connectors/sharepoint.py,sha256=e7X7bMaI7H5T7mC1hytY0m1Kc-QV6r9WGZakqyW-K-g,11636
88
88
  unstructured_ingest/processes/connectors/slack.py,sha256=KgDvZGwcKng0FZzElucM0elgJzcVq7PkUQM6PNW30vE,9330
89
89
  unstructured_ingest/processes/connectors/utils.py,sha256=TAd0hb1f291N-q7-TUe6JKSCGkhqDyo7Ij8zmliBZUc,2071
90
90
  unstructured_ingest/processes/connectors/vectara.py,sha256=fAGiVtqiouJ7GkSFycoJYxf7hzIZpLs3A_q6n6Li37I,12304
@@ -115,7 +115,7 @@ unstructured_ingest/processes/connectors/fsspec/s3.py,sha256=Zng-aV_Z0B52CFILAXf
115
115
  unstructured_ingest/processes/connectors/fsspec/sftp.py,sha256=pR_a2SgLjt8ffNkariHrPB1E0HVSTj5h3pt7KxTU3TI,6371
116
116
  unstructured_ingest/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
117
117
  unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py,sha256=kf0UpgdAY2KK1R1FbAB6GEBBAIOeYQ8cZIr3bp660qM,374
118
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py,sha256=k_c2PtKcaRA6B9ZFXYCk4-2BWxLJnD_Cfjvluk9hKzs,13876
118
+ unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py,sha256=TuQPpm9O7_3PZQC1s4S3HzybUWDKUeZDs-V3ZTzqdjA,14171
119
119
  unstructured_ingest/processes/connectors/kafka/__init__.py,sha256=pFN2cWwAStiGTAsQ616GIWKi_hDv0s74ZvNqhJEp1Pc,751
120
120
  unstructured_ingest/processes/connectors/kafka/cloud.py,sha256=Ki6iOLoZ86tYWdnLnMWYvb2hUCneKqo4mTJcfXh7YoQ,3432
121
121
  unstructured_ingest/processes/connectors/kafka/kafka.py,sha256=VI-e7WTzV48mmSwqhlDsNARSzkjauckbJEFvWjuqt7k,10301
@@ -235,8 +235,8 @@ unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3r
235
235
  unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
236
236
  unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
237
237
  unstructured_ingest/utils/tls.py,sha256=Ra8Mii1F4VqErRreg76PBI0eAqPBC009l0sSHa8FdnA,448
238
- unstructured_ingest-1.2.11.dist-info/METADATA,sha256=WXxL9qEe3d3pG9xwxbuDRArBBGkDH3k2y1fhtNzcHPQ,8827
239
- unstructured_ingest-1.2.11.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
240
- unstructured_ingest-1.2.11.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
241
- unstructured_ingest-1.2.11.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
242
- unstructured_ingest-1.2.11.dist-info/RECORD,,
238
+ unstructured_ingest-1.2.14.dist-info/METADATA,sha256=GXAxy85oatbdkNDhFN2Yt0g84XuQfdsuAFXeh_U17tI,8827
239
+ unstructured_ingest-1.2.14.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
240
+ unstructured_ingest-1.2.14.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
241
+ unstructured_ingest-1.2.14.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
242
+ unstructured_ingest-1.2.14.dist-info/RECORD,,