unstructured-ingest 1.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from unstructured_ingest.processes.connector_registry import add_destination_entry
|
|
4
|
+
|
|
5
|
+
from .ibm_watsonx_s3 import CONNECTOR_TYPE as IBM_WATSONX_S3_CONNECTOR_TYPE
|
|
6
|
+
from .ibm_watsonx_s3 import ibm_watsonx_s3_destination_entry
|
|
7
|
+
|
|
8
|
+
# Importing this package registers the IBM watsonx.data (S3) destination
# under its connector type in the connector registry.
add_destination_entry(
    destination_type=IBM_WATSONX_S3_CONNECTOR_TYPE,
    entry=ibm_watsonx_s3_destination_entry,
)
|
|
@@ -0,0 +1,367 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import time
|
|
3
|
+
from contextlib import contextmanager
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional, Tuple
|
|
7
|
+
|
|
8
|
+
from pydantic import Field, Secret
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.data_types.file_data import FileData
|
|
11
|
+
from unstructured_ingest.error import (
|
|
12
|
+
DestinationConnectionError,
|
|
13
|
+
IcebergCommitFailedException,
|
|
14
|
+
ProviderError,
|
|
15
|
+
UserAuthError,
|
|
16
|
+
UserError,
|
|
17
|
+
)
|
|
18
|
+
from unstructured_ingest.interfaces import (
|
|
19
|
+
AccessConfig,
|
|
20
|
+
ConnectionConfig,
|
|
21
|
+
UploaderConfig,
|
|
22
|
+
)
|
|
23
|
+
from unstructured_ingest.logger import logger
|
|
24
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
25
|
+
DestinationRegistryEntry,
|
|
26
|
+
)
|
|
27
|
+
from unstructured_ingest.processes.connectors.sql.sql import (
|
|
28
|
+
SQLUploader,
|
|
29
|
+
SQLUploadStager,
|
|
30
|
+
SQLUploadStagerConfig,
|
|
31
|
+
)
|
|
32
|
+
from unstructured_ingest.utils.constants import RECORD_ID_LABEL
|
|
33
|
+
from unstructured_ingest.utils.data_prep import get_data_df
|
|
34
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
35
|
+
|
|
36
|
+
if TYPE_CHECKING:
|
|
37
|
+
from pandas import DataFrame
|
|
38
|
+
from pyarrow import Table as ArrowTable
|
|
39
|
+
from pyiceberg.catalog.rest import RestCatalog
|
|
40
|
+
from pyiceberg.table import Table, Transaction
|
|
41
|
+
|
|
42
|
+
# Identifier under which this connector is registered.
CONNECTOR_TYPE = "ibm_watsonx_s3"

# IBM Cloud IAM endpoint used to exchange an API key for a bearer token.
DEFAULT_IBM_CLOUD_AUTH_URL = "https://iam.cloud.ibm.com/identity/token"
# Path appended to the Iceberg endpoint host to build the catalog URI.
DEFAULT_ICEBERG_URI_PATH = "/mds/iceberg"
# Catalog type passed to pyiceberg's `load_catalog`.
DEFAULT_ICEBERG_CATALOG_TYPE = "rest"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# Secret credentials for the IBM watsonx.data connector: the IAM API key is
# exchanged for a bearer token, and the HMAC key pair is passed to the
# catalog as the S3 credentials.
class IbmWatsonxAccessConfig(AccessConfig):
    iam_api_key: str = Field(description="IBM IAM API Key")
    access_key_id: str = Field(description="Cloud Object Storage HMAC Access Key ID")
    secret_access_key: str = Field(description="Cloud Object Storage HMAC Secret Access Key")
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# Connection settings for the IBM watsonx.data destination. Builds the
# pyiceberg REST catalog configuration, caches the IAM bearer token and
# translates HTTP failures into the project's error types.
class IbmWatsonxConnectionConfig(ConnectionConfig):
    access_config: Secret[IbmWatsonxAccessConfig]
    iceberg_endpoint: str = Field(description="Iceberg REST endpoint")
    object_storage_endpoint: str = Field(description="Cloud Object Storage public endpoint")
    object_storage_region: str = Field(description="Cloud Object Storage region")
    catalog: str = Field(description="Catalog name")
    max_retries_connection: int = Field(
        default=10,
        description="Maximum number of retries in case of a connection error (RESTError)",
        ge=2,
        le=100,
    )

    # Cached IAM token response; populated and refreshed lazily by `bearer_token`.
    _bearer_token: Optional[dict[str, Any]] = None

    @property
    def iceberg_url(self) -> str:
        """Return the HTTPS URL of the Iceberg REST catalog.

        Leading/trailing slashes are stripped from ``iceberg_endpoint`` before
        the scheme and the default Iceberg URI path are added.
        """
        return f"https://{self.iceberg_endpoint.strip('/')}{DEFAULT_ICEBERG_URI_PATH}"

    @property
    def object_storage_url(self) -> str:
        """Return the HTTPS URL of the Cloud Object Storage endpoint."""
        return f"https://{self.object_storage_endpoint.strip('/')}"

    @property
    def bearer_token(self) -> str:
        """Return an IAM access token, generating a new one when needed.

        A new token is requested when none is cached or when the cached
        token's ``expiration`` value is not more than five minutes ahead of
        the current time.
        """
        # Add 5 minutes to deal with edge cases where the token expires before the request is made
        # NOTE(review): assumes "expiration" is a Unix timestamp in seconds — confirm
        timestamp = int(time.time()) + (60 * 5)
        if self._bearer_token is None or self._bearer_token.get("expiration", 0) <= timestamp:
            self._bearer_token = self.generate_bearer_token()
        return self._bearer_token["access_token"]

    @requires_dependencies(["httpx"], extras="ibm-watsonx-s3")
    def wrap_error(self, e: Exception) -> Exception:
        """Translate an exception into the matching project error type.

        Args:
            e: The exception raised while talking to IBM services.

        Returns:
            ``UserAuthError`` for HTTP 401/403, ``UserError`` for any other
            4xx status, ``ProviderError`` for any 5xx status, and ``e`` itself
            when it is not an ``httpx.HTTPStatusError`` or has another status.
        """
        import httpx

        if not isinstance(e, httpx.HTTPStatusError):
            logger.error(f"Unhandled exception from IBM watsonx.data connector: {e}", exc_info=True)
            return e
        url = e.request.url
        response_code = e.response.status_code
        if response_code == 401:
            logger.error(
                f"Failed to authenticate IBM watsonx.data user {url}, status code {response_code}"
            )
            return UserAuthError(e)
        if response_code == 403:
            logger.error(
                f"Given IBM watsonx.data user is not authorized {url}, status code {response_code}"
            )
            return UserAuthError(e)
        if 400 <= response_code < 500:
            logger.error(
                f"Request to {url} failed in IBM watsonx.data connector, status code {response_code}"
            )
            return UserError(e)
        # `>=` so that a plain HTTP 500 is also reported as a provider-side failure.
        if response_code >= 500:
            logger.error(
                f"Request to {url} failed in IBM watsonx.data connector, status code {response_code}"
            )
            return ProviderError(e)
        logger.error(f"Unhandled exception from IBM watsonx.data connector: {e}", exc_info=True)
        return e

    @requires_dependencies(["httpx"], extras="ibm-watsonx-s3")
    def generate_bearer_token(self) -> dict[str, Any]:
        """Request a new bearer token from IBM Cloud IAM using the API key.

        Returns:
            The decoded JSON body of the token response.

        Raises:
            Exception: Whatever ``wrap_error`` produces for the failure
                (a project error for HTTP status errors, otherwise the
                original exception).
        """
        import httpx

        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
            "Accept": "application/json",
        }
        data = {
            "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
            "apikey": self.access_config.get_secret_value().iam_api_key,
        }

        logger.info("Generating IBM IAM Bearer Token")
        try:
            response = httpx.post(DEFAULT_IBM_CLOUD_AUTH_URL, headers=headers, data=data)
            response.raise_for_status()
        except Exception as e:
            wrapped = self.wrap_error(e)
            if wrapped is e:
                # Nothing to translate: re-raise as-is rather than chaining
                # the exception to itself.
                raise
            raise wrapped from e
        return response.json()

    def get_catalog_config(self) -> dict[str, Any]:
        """Build the keyword arguments passed to pyiceberg's ``load_catalog``.

        Reading ``bearer_token`` here may trigger a token refresh.
        """
        return {
            "name": self.catalog,
            "type": DEFAULT_ICEBERG_CATALOG_TYPE,
            "uri": self.iceberg_url,
            "token": self.bearer_token,
            "warehouse": self.catalog,
            "s3.endpoint": self.object_storage_url,
            "s3.access-key-id": self.access_config.get_secret_value().access_key_id,
            "s3.secret-access-key": self.access_config.get_secret_value().secret_access_key,
            "s3.region": self.object_storage_region,
            # By default this header is set to `vended-credentials`, and default bucket
            # configuration doesn't allow vending credentials. We need to set it to `None`
            # in order to use user-provided S3 credentials.
            "header.X-Iceberg-Access-Delegation": None,
        }

    @requires_dependencies(["pyiceberg", "tenacity"], extras="ibm-watsonx-s3")
    @contextmanager
    def get_catalog(self) -> Generator["RestCatalog", None, None]:
        """Yield a catalog loaded from ``get_catalog_config``.

        Loading is retried on ``RESTError`` up to ``max_retries_connection``
        attempts with exponential backoff.

        Raises:
            ProviderError: If building the configuration or loading the
                catalog fails.
        """
        from pyiceberg.catalog import load_catalog
        from pyiceberg.exceptions import RESTError
        from tenacity import (
            before_log,
            retry,
            retry_if_exception_type,
            stop_after_attempt,
            wait_exponential,
        )

        # Retry connection in case of a connection error
        @retry(
            stop=stop_after_attempt(self.max_retries_connection),
            wait=wait_exponential(exp_base=2, multiplier=1, min=2, max=10),
            retry=retry_if_exception_type(RESTError),
            before=before_log(logger, logging.DEBUG),
            reraise=True,
        )
        def _get_catalog(catalog_config: dict[str, Any]) -> "RestCatalog":
            return load_catalog(**catalog_config)

        try:
            catalog_config = self.get_catalog_config()
            catalog = _get_catalog(catalog_config)
        except Exception as e:
            logger.error(f"Failed to connect to catalog '{self.catalog}': {e}", exc_info=True)
            raise ProviderError(f"Failed to connect to catalog '{self.catalog}': {e}") from e

        # The yield is outside the try block on purpose: errors raised by the
        # caller's `with` body must not be reported as connection failures.
        yield catalog
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
# Stager configuration for the IBM watsonx.data destination; adds nothing to
# the SQL stager configuration it derives from.
@dataclass
class IbmWatsonxUploadStagerConfig(SQLUploadStagerConfig):
    pass
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
@dataclass
class IbmWatsonxUploadStager(SQLUploadStager):
    """SQL upload stager bound to the IBM watsonx.data stager configuration."""

    # Each stager instance gets its own default configuration object.
    upload_stager_config: IbmWatsonxUploadStagerConfig = field(
        default_factory=IbmWatsonxUploadStagerConfig,
    )
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
# Upload settings for the IBM watsonx.data destination: the target table and
# the retry budget for commits.
class IbmWatsonxUploaderConfig(UploaderConfig):
    namespace: str = Field(description="Namespace name")
    table: str = Field(description="Table name")
    max_retries: int = Field(
        default=50,
        description="Maximum number of retries to upload data (CommitFailedException)",
        ge=2,
        le=500,
    )
    record_id_key: str = Field(
        default=RECORD_ID_LABEL,
        description="Searchable key to find entries for the same record on previous runs",
    )

    @property
    def table_identifier(self) -> Tuple[str, str]:
        """Return the ``(namespace, table)`` pair identifying the target table."""
        namespace_name, table_name = self.namespace, self.table
        return namespace_name, table_name
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
@dataclass
class IbmWatsonxUploader(SQLUploader):
    """Uploader that appends records to an Iceberg table.

    The table is reached through the catalog provided by
    ``connection_config`` and identified by ``upload_config.table_identifier``.
    """

    connection_config: IbmWatsonxConnectionConfig
    upload_config: IbmWatsonxUploaderConfig
    connector_type: str = CONNECTOR_TYPE

    def precheck(self) -> None:
        """Verify that the configured namespace and table exist.

        Raises:
            UserError: If the namespace or the table is missing.
        """
        with self.connection_config.get_catalog() as catalog:
            if not catalog.namespace_exists(self.upload_config.namespace):
                raise UserError(f"Namespace '{self.upload_config.namespace}' does not exist")
            if not catalog.table_exists(self.upload_config.table_identifier):
                raise UserError(
                    f"Table '{self.upload_config.table}' does not exist in namespace '{self.upload_config.namespace}'"  # noqa: E501
                )

    @contextmanager
    def get_table(self) -> Generator["Table", None, None]:
        """Yield the target table, loaded from a freshly opened catalog."""
        with self.connection_config.get_catalog() as catalog:
            table = catalog.load_table(self.upload_config.table_identifier)
            yield table

    def get_table_columns(self) -> list[str]:
        """Return the table's column names, loading them on first use."""
        # `_columns` is not declared in this class — presumably a cache
        # attribute provided by SQLUploader; verify against the base class.
        if self._columns is None:
            with self.get_table() as table:
                self._columns = table.schema().column_names
        return self._columns

    def can_delete(self) -> bool:
        """Return True when the table has the configured record id column."""
        return self.upload_config.record_id_key in self.get_table_columns()

    @requires_dependencies(["pyarrow"], extras="ibm-watsonx-s3")
    def _df_to_arrow_table(self, df: "DataFrame") -> "ArrowTable":
        """Convert ``df`` to a pyarrow table after fitting it to the schema."""
        import pyarrow as pa

        # Iceberg will automatically fill missing columns with nulls
        # Iceberg will throw an error if the DataFrame column has only null values
        # because it can't infer the type of the column and match it with the table schema
        return pa.Table.from_pandas(self._fit_to_schema(df, add_missing_columns=False))

    @requires_dependencies(["pyiceberg"], extras="ibm-watsonx-s3")
    def _delete(self, transaction: "Transaction", identifier: str) -> None:
        """Queue deletion of rows whose record id column equals ``identifier``.

        When the table has no record id column, a warning is logged and
        nothing is deleted.
        """
        from pyiceberg.expressions import EqualTo

        if self.can_delete():
            transaction.delete(delete_filter=EqualTo(self.upload_config.record_id_key, identifier))
        else:
            logger.warning(
                f"Table doesn't contain expected "
                f"record id column "
                f"{self.upload_config.record_id_key}, skipping delete"
            )

    @requires_dependencies(["pyiceberg", "tenacity"], extras="ibm-watsonx-s3")
    def upload_data_table(
        self, table: "Table", data_table: "ArrowTable", file_data: FileData
    ) -> None:
        """Replace the rows of one record in ``table`` within a transaction.

        Previous rows for ``file_data.identifier`` are deleted (when possible)
        and ``data_table`` is appended in the same transaction. Commit
        conflicts are retried up to ``upload_config.max_retries`` attempts
        with a random wait between attempts.

        Raises:
            DestinationConnectionError: If a ``RESTError`` reaches the outer
                handler.
            ProviderError: For any other failure.
        """
        from pyiceberg.exceptions import CommitFailedException, RESTError
        from tenacity import (
            before_log,
            retry,
            retry_if_exception_type,
            stop_after_attempt,
            wait_random,
        )

        @retry(
            stop=stop_after_attempt(self.upload_config.max_retries),
            wait=wait_random(),
            retry=retry_if_exception_type(IcebergCommitFailedException),
            before=before_log(logger, logging.DEBUG),
            reraise=True,
        )
        def _upload_data_table(table: "Table", data_table: "ArrowTable", file_data: FileData):
            try:
                with table.transaction() as transaction:
                    self._delete(transaction, file_data.identifier)
                    transaction.append(data_table)
            except CommitFailedException as e:
                # Reload table metadata so the next attempt commits against
                # the current table state.
                table.refresh()
                logger.debug(e)
                raise IcebergCommitFailedException(str(e)) from e
            except RESTError as e:
                raise DestinationConnectionError(str(e)) from e
            except Exception as e:
                raise ProviderError(f"Failed to upload data to table: {e}") from e

        try:
            return _upload_data_table(table, data_table, file_data)
        except RESTError as e:
            raise DestinationConnectionError(str(e)) from e
        except ProviderError:
            raise
        except Exception as e:
            # NOTE(review): project errors raised by the helper that are not
            # ProviderError instances are re-wrapped here — confirm intended.
            raise ProviderError(f"Failed to upload data to table: {e}") from e

    @requires_dependencies(["pyiceberg", "tenacity"], extras="ibm-watsonx-s3")
    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
        """Convert ``df`` to Arrow and upload it to the target table.

        Raises:
            ProviderError: If the upload fails.
        """
        from pyiceberg.exceptions import RESTError
        from tenacity import (
            before_log,
            retry,
            retry_if_exception_type,
            stop_after_attempt,
            wait_exponential,
        )

        data_table = self._df_to_arrow_table(df)

        # Retry connection in case of a connection error or token expiration
        # NOTE(review): only a RESTError that escapes `get_table` or
        # `upload_data_table` unwrapped triggers this retry — confirm coverage.
        @retry(
            stop=stop_after_attempt(self.connection_config.max_retries_connection),
            wait=wait_exponential(exp_base=2, multiplier=1, min=2, max=10),
            retry=retry_if_exception_type(RESTError),
            before=before_log(logger, logging.DEBUG),
            reraise=True,
        )
        def _upload_dataframe(data_table: Any, file_data: FileData) -> None:
            with self.get_table() as table:
                self.upload_data_table(table, data_table, file_data)

        try:
            return _upload_dataframe(data_table, file_data)
        except ProviderError:
            raise
        except Exception as e:
            raise ProviderError(f"Failed to upload data to table: {e}") from e

    @requires_dependencies(["pandas"], extras="ibm-watsonx-s3")
    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Upload in-memory records as a single DataFrame."""
        import pandas as pd

        df = pd.DataFrame(data)
        self.upload_dataframe(df=df, file_data=file_data)

    @requires_dependencies(["pandas"], extras="ibm-watsonx-s3")
    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Load the staged file at ``path`` as a DataFrame and upload it."""
        df = get_data_df(path=path)
        self.upload_dataframe(df=df, file_data=file_data)
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
# Registry entry wiring together the connection, stager and uploader classes
# of the IBM watsonx.data (S3) destination.
ibm_watsonx_s3_destination_entry = DestinationRegistryEntry(
    connection_config=IbmWatsonxConnectionConfig,
    upload_stager_config=IbmWatsonxUploadStagerConfig,
    upload_stager=IbmWatsonxUploadStager,
    uploader_config=IbmWatsonxUploaderConfig,
    uploader=IbmWatsonxUploader,
)
|