unstructured-ingest 0.5.19__py3-none-any.whl → 0.5.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- test/integration/connectors/test_astradb.py +8 -2
- test/unit/v2/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/v2/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +459 -0
- test/unit/v2/connectors/sql/test_sql.py +79 -1
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/interfaces.py +7 -3
- unstructured_ingest/utils/data_prep.py +17 -5
- unstructured_ingest/utils/table.py +11 -4
- unstructured_ingest/v2/processes/connectors/__init__.py +2 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +8 -3
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +4 -3
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +5 -2
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +5 -2
- unstructured_ingest/v2/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/v2/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +301 -0
- unstructured_ingest/v2/processes/connectors/kdbai.py +6 -3
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +10 -2
- unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +5 -3
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -1
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +7 -3
- unstructured_ingest/v2/processes/connectors/sql/sql.py +26 -12
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -1
- unstructured_ingest/v2/processes/connectors/sql/vastdb.py +5 -7
- {unstructured_ingest-0.5.19.dist-info → unstructured_ingest-0.5.21.dist-info}/METADATA +174 -18
- {unstructured_ingest-0.5.19.dist-info → unstructured_ingest-0.5.21.dist-info}/RECORD +29 -25
- {unstructured_ingest-0.5.19.dist-info → unstructured_ingest-0.5.21.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.19.dist-info → unstructured_ingest-0.5.21.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.19.dist-info → unstructured_ingest-0.5.21.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.19.dist-info → unstructured_ingest-0.5.21.dist-info}/top_level.txt +0 -0
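Nearly all of the connector changes between 0.5.19 and 0.5.21 follow one pattern: the module-level `import pandas as pd` is dropped, annotations move behind a `TYPE_CHECKING` guard as the string `"DataFrame"`, and the import happens inside the methods that need it, gated by the `requires_dependencies` decorator so a missing optional dependency fails with an actionable error. The sketch below illustrates that pattern in isolation; the decorator shown is a simplified stand-in for `unstructured_ingest.utils.dep_check.requires_dependencies`, and `run_data` is a hypothetical example function, not code from the package.

```python
# Minimal sketch of the deferred-pandas-import pattern used across these connectors.
# `requires_dependencies` below is a simplified stand-in for the real decorator in
# unstructured_ingest.utils.dep_check; `run_data` is a hypothetical example.
import functools
import importlib
from typing import TYPE_CHECKING, Any, Optional

if TYPE_CHECKING:
    from pandas import DataFrame  # only evaluated by type checkers, never at runtime


def requires_dependencies(deps: list[str], extras: Optional[str] = None):
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            # Verify the optional dependencies are importable before running the body.
            for dep in deps:
                try:
                    importlib.import_module(dep)
                except ImportError as e:
                    hint = f" (install the '{extras}' extras)" if extras else ""
                    raise ImportError(f"{fn.__name__} requires '{dep}'{hint}") from e
            return fn(*args, **kwargs)

        return wrapper

    return decorator


@requires_dependencies(["pandas"], extras="duckdb")
def run_data(data: list[dict[str, Any]]) -> "DataFrame":
    import pandas as pd  # deferred: pandas is only imported when the method actually runs

    return pd.DataFrame(data=data)
```

The per-file diffs below apply this same shape to the delta_table, duckdb, motherduck, kdbai, lancedb, databricks_delta_tables, and singlestore connectors, and add a new IBM watsonx.data (ibm_watsonx_s3) destination.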
unstructured_ingest/utils/table.py
@@ -1,11 +1,16 @@
-from typing import Any
-
-import pandas as pd
+from typing import TYPE_CHECKING, Any
 
 from unstructured_ingest.utils.data_prep import flatten_dict
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from pandas import DataFrame
 
 
+@requires_dependencies(["pandas"])
 def get_default_pandas_dtypes() -> dict[str, Any]:
+    import pandas as pd
+
     return {
         "text": pd.StringDtype(),  # type: ignore
         "type": pd.StringDtype(),  # type: ignore
@@ -57,7 +62,9 @@ def get_default_pandas_dtypes() -> dict[str, Any]:
 def convert_to_pandas_dataframe(
     elements_dict: list[dict[str, Any]],
     drop_empty_cols: bool = False,
-) -> pd.DataFrame:
+) -> "DataFrame":
+    import pandas as pd
+
     # Flatten metadata if it hasn't already been flattened
     for d in elements_dict:
         if metadata := d.pop("metadata", None):

unstructured_ingest/v2/processes/connectors/__init__.py
@@ -4,6 +4,7 @@ import unstructured_ingest.v2.processes.connectors.databricks  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.duckdb  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.elasticsearch  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.fsspec  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.ibm_watsonx  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.kafka  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.lancedb  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.qdrant  # noqa: F401
@@ -121,4 +122,5 @@ add_source_entry(source_type=DISCORD_CONNECTOR_TYPE, entry=discord_source_entry)
 add_destination_entry(destination_type=REDIS_CONNECTOR_TYPE, entry=redis_destination_entry)
 
 add_source_entry(source_type=JIRA_CONNECTOR_TYPE, entry=jira_source_entry)
+
 add_source_entry(source_type=ZENDESK_CONNECTOR_TYPE, entry=zendesk_source_entry)

unstructured_ingest/v2/processes/connectors/delta_table.py
@@ -3,10 +3,9 @@ import traceback
 from dataclasses import dataclass, field
 from multiprocessing import Process, Queue
 from pathlib import Path
-from typing import Any, Optional
+from typing import TYPE_CHECKING, Any, Optional
 from urllib.parse import urlparse
 
-import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.error import DestinationConnectionError
@@ -27,6 +26,9 @@ from unstructured_ingest.v2.processes.connector_registry import DestinationRegis
 
 CONNECTOR_TYPE = "delta_table"
 
+if TYPE_CHECKING:
+    from pandas import DataFrame
+
 
 @requires_dependencies(["deltalake"], extras="delta-table")
 def write_deltalake_with_error_handling(queue, **kwargs):
@@ -136,7 +138,7 @@ class DeltaTableUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
         updated_upload_path = os.path.join(
             self.connection_config.table_uri, file_data.source_identifiers.relative_path
         )
@@ -172,7 +174,10 @@ class DeltaTableUploader(Uploader):
             logger.error(f"Exception occurred in write_deltalake: {error_message}")
             raise RuntimeError(f"Error in write_deltalake: {error_message}")
 
+    @requires_dependencies(["pandas"], extras="delta-table")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df, file_data=file_data)
 

unstructured_ingest/v2/processes/connectors/duckdb/base.py
@@ -2,9 +2,8 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
 
-import pandas as pd
-
 from unstructured_ingest.utils.data_prep import get_data, write_data
+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import FileData, UploadStager
 from unstructured_ingest.v2.utils import get_enhanced_element_id
 
@@ -55,7 +54,6 @@ _COLUMNS = (
 
 @dataclass
 class BaseDuckDBUploadStager(UploadStager):
-
     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         data = element_dict.copy()
         metadata: dict[str, Any] = data.pop("metadata", {})
@@ -72,6 +70,7 @@ class BaseDuckDBUploadStager(UploadStager):
         data = {k: v for k, v in data.items() if k in _COLUMNS}
         return data
 
+    @requires_dependencies(["pandas"], extras="duckdb")
     def run(
         self,
         elements_filepath: Path,
@@ -80,6 +79,8 @@ class BaseDuckDBUploadStager(UploadStager):
         output_filename: str,
         **kwargs: Any,
     ) -> Path:
+        import pandas as pd
+
         elements_contents = get_data(path=elements_filepath)
         output_filename_suffix = Path(elements_filepath).suffix
         output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"

unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py
@@ -3,7 +3,6 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
-import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.error import DestinationConnectionError
@@ -23,6 +22,7 @@ from unstructured_ingest.v2.processes.connectors.duckdb.base import BaseDuckDBUp
 
 if TYPE_CHECKING:
     from duckdb import DuckDBPyConnection as DuckDBConnection
+    from pandas import DataFrame
 
 CONNECTOR_TYPE = "duckdb"
 
@@ -101,7 +101,7 @@ class DuckDBUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    def upload_dataframe(self, df: pd.DataFrame) -> None:
+    def upload_dataframe(self, df: "DataFrame") -> None:
         logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
 
         with self.connection_config.get_client() as conn:
@@ -109,7 +109,10 @@ class DuckDBUploader(Uploader):
                 f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df"  # noqa: E501
             )
 
+    @requires_dependencies(["pandas"], extras="duckdb")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df)
 

unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py
@@ -3,7 +3,6 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
-import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.__version__ import __version__ as unstructured_io_ingest_version
@@ -24,6 +23,7 @@ from unstructured_ingest.v2.processes.connectors.duckdb.base import BaseDuckDBUp
 
 if TYPE_CHECKING:
     from duckdb import DuckDBPyConnection as MotherDuckConnection
+    from pandas import DataFrame
 
 CONNECTOR_TYPE = "motherduck"
 
@@ -100,7 +100,7 @@ class MotherDuckUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    def upload_dataframe(self, df: pd.DataFrame) -> None:
+    def upload_dataframe(self, df: "DataFrame") -> None:
         logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
         database = self.connection_config.database
         db_schema = self.connection_config.db_schema
@@ -109,7 +109,10 @@ class MotherDuckUploader(Uploader):
         with self.connection_config.get_client() as conn:
             conn.query(f'INSERT INTO "{database}"."{db_schema}"."{table}" BY NAME SELECT * FROM df')
 
+    @requires_dependencies(["pandas"], extras="duckdb")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df)
 

unstructured_ingest/v2/processes/connectors/ibm_watsonx/__init__.py (new file)
@@ -0,0 +1,10 @@
+from __future__ import annotations
+
+from unstructured_ingest.v2.processes.connector_registry import add_destination_entry
+
+from .ibm_watsonx_s3 import CONNECTOR_TYPE as IBM_WATSONX_S3_CONNECTOR_TYPE
+from .ibm_watsonx_s3 import ibm_watsonx_s3_destination_entry
+
+add_destination_entry(
+    destination_type=IBM_WATSONX_S3_CONNECTOR_TYPE, entry=ibm_watsonx_s3_destination_entry
+)

unstructured_ingest/v2/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py (new file)
@@ -0,0 +1,301 @@
+import logging
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional, Tuple
+
+import pandas as pd
+from pydantic import Field, Secret
+
+from unstructured_ingest.utils.data_prep import get_data_df
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
+from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    FileData,
+    UploaderConfig,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.sql.sql import (
+    SQLUploader,
+    SQLUploadStager,
+    SQLUploadStagerConfig,
+)
+
+if TYPE_CHECKING:
+    from pyarrow import Table as ArrowTable
+    from pyiceberg.catalog.rest import RestCatalog
+    from pyiceberg.table import Table, Transaction
+
+CONNECTOR_TYPE = "ibm_watsonx_s3"
+
+DEFAULT_IBM_CLOUD_AUTH_URL = "https://iam.cloud.ibm.com/identity/token"
+DEFAULT_ICEBERG_URI_PATH = "/mds/iceberg"
+DEFAULT_ICEBERG_CATALOG_TYPE = "rest"
+
+
+class IcebergCommitFailedException(Exception):
+    """Failed to commit changes to the iceberg table."""
+
+
+class IbmWatsonxAccessConfig(AccessConfig):
+    iam_api_key: str = Field(description="IBM IAM API Key")
+    access_key_id: str = Field(description="Cloud Object Storage HMAC Access Key ID")
+    secret_access_key: str = Field(description="Cloud Object Storage HMAC Secret Access Key")
+
+
+class IbmWatsonxConnectionConfig(ConnectionConfig):
+    access_config: Secret[IbmWatsonxAccessConfig]
+    iceberg_endpoint: str = Field(description="Iceberg REST endpoint")
+    object_storage_endpoint: str = Field(description="Cloud Object Storage public endpoint")
+    object_storage_region: str = Field(description="Cloud Object Storage region")
+    catalog: str = Field(description="Catalog name")
+
+    _bearer_token: Optional[dict[str, Any]] = None
+
+    @property
+    def iceberg_url(self) -> str:
+        return f"https://{self.iceberg_endpoint.strip('/')}{DEFAULT_ICEBERG_URI_PATH}"
+
+    @property
+    def object_storage_url(self) -> str:
+        return f"https://{self.object_storage_endpoint.strip('/')}"
+
+    @property
+    def bearer_token(self) -> str:
+        # Add 60 seconds to deal with edge cases where the token expires before the request is made
+        timestamp = int(time.time()) + 60
+        if self._bearer_token is None or self._bearer_token.get("expiration", 0) <= timestamp:
+            self._bearer_token = self.generate_bearer_token()
+        return self._bearer_token["access_token"]
+
+    @requires_dependencies(["httpx"], extras="ibm-watsonx-s3")
+    def wrap_error(self, e: Exception) -> Exception:
+        import httpx
+
+        if not isinstance(e, httpx.HTTPStatusError):
+            logger.error(f"Unhandled exception from IBM watsonx.data connector: {e}", exc_info=True)
+            return e
+        url = e.request.url
+        response_code = e.response.status_code
+        if response_code == 401:
+            logger.error(
+                f"Failed to authenticate IBM watsonx.data user {url}, status code {response_code}"
+            )
+            return UserAuthError(e)
+        if response_code == 403:
+            logger.error(
+                f"Given IBM watsonx.data user is not authorized {url}, status code {response_code}"
+            )
+            return UserAuthError(e)
+        if 400 <= response_code < 500:
+            logger.error(
+                f"Request to {url} failed"
+                f"in IBM watsonx.data connector, status code {response_code}"
+            )
+            return UserError(e)
+        if response_code > 500:
+            logger.error(
+                f"Request to {url} failed"
+                f"in IBM watsonx.data connector, status code {response_code}"
+            )
+            return ProviderError(e)
+        logger.error(f"Unhandled exception from IBM watsonx.data connector: {e}", exc_info=True)
+        return e
+
+    @requires_dependencies(["httpx"], extras="ibm-watsonx-s3")
+    def generate_bearer_token(self) -> dict[str, Any]:
+        import httpx
+
+        headers = {
+            "Content-Type": "application/x-www-form-urlencoded",
+            "Accept": "application/json",
+        }
+        data = {
+            "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
+            "apikey": self.access_config.get_secret_value().iam_api_key,
+        }
+
+        logger.info("Generating IBM IAM Bearer Token")
+        try:
+            response = httpx.post(DEFAULT_IBM_CLOUD_AUTH_URL, headers=headers, data=data)
+            response.raise_for_status()
+        except Exception as e:
+            raise self.wrap_error(e)
+        return response.json()
+
+    def get_catalog_config(self) -> dict[str, Any]:
+        return {
+            "name": self.catalog,
+            "type": DEFAULT_ICEBERG_CATALOG_TYPE,
+            "uri": self.iceberg_url,
+            "token": self.bearer_token,
+            "warehouse": self.catalog,
+            "s3.endpoint": self.object_storage_url,
+            "s3.access-key-id": self.access_config.get_secret_value().access_key_id,
+            "s3.secret-access-key": self.access_config.get_secret_value().secret_access_key,
+            "s3.region": self.object_storage_region,
+        }
+
+    @requires_dependencies(["pyiceberg"], extras="ibm-watsonx-s3")
+    @contextmanager
+    def get_catalog(self) -> Generator["RestCatalog", None, None]:
+        from pyiceberg.catalog import load_catalog
+
+        try:
+            catalog_config = self.get_catalog_config()
+            catalog = load_catalog(**catalog_config)
+        except Exception as e:
+            logger.error(f"Failed to connect to catalog '{self.catalog}': {e}", exc_info=True)
+            raise ProviderError(f"Failed to connect to catalog '{self.catalog}': {e}")
+
+        yield catalog
+
+
+@dataclass
+class IbmWatsonxUploadStagerConfig(SQLUploadStagerConfig):
+    pass
+
+
+@dataclass
+class IbmWatsonxUploadStager(SQLUploadStager):
+    upload_stager_config: IbmWatsonxUploadStagerConfig = field(
+        default_factory=IbmWatsonxUploadStagerConfig
+    )
+
+
+class IbmWatsonxUploaderConfig(UploaderConfig):
+    namespace: str = Field(description="Namespace name")
+    table: str = Field(description="Table name")
+    max_retries: int = Field(
+        default=5, description="Maximum number of retries to upload data", ge=2, le=10
+    )
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="Searchable key to find entries for the same record on previous runs",
+    )
+
+    @property
+    def table_identifier(self) -> Tuple[str, str]:
+        return (self.namespace, self.table)
+
+
+@dataclass
+class IbmWatsonxUploader(SQLUploader):
+    connection_config: IbmWatsonxConnectionConfig
+    upload_config: IbmWatsonxUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> None:
+        with self.connection_config.get_catalog() as catalog:
+            if not catalog.namespace_exists(self.upload_config.namespace):
+                raise UserError(f"Namespace '{self.upload_config.namespace}' does not exist")
+            if not catalog.table_exists(self.upload_config.table_identifier):
+                raise UserError(
+                    f"Table '{self.upload_config.table}' does not exist in namespace '{self.upload_config.namespace}'"  # noqa: E501
+                )
+
+    @contextmanager
+    def get_table(self) -> Generator["Table", None, None]:
+        with self.connection_config.get_catalog() as catalog:
+            table = catalog.load_table(self.upload_config.table_identifier)
+            yield table
+
+    def get_table_columns(self) -> list[str]:
+        if self._columns is None:
+            with self.get_table() as table:
+                self._columns = table.schema().column_names
+        return self._columns
+
+    def can_delete(self) -> bool:
+        return self.upload_config.record_id_key in self.get_table_columns()
+
+    @requires_dependencies(["pyarrow"], extras="ibm-watsonx-s3")
+    def _df_to_arrow_table(self, df: pd.DataFrame) -> "ArrowTable":
+        import pyarrow as pa
+
+        # Iceberg will automatically fill missing columns with nulls
+        # Iceberg will throw an error if the DataFrame column has only null values
+        # because it can't infer the type of the column and match it with the table schema
+        return pa.Table.from_pandas(self._fit_to_schema(df, add_missing_columns=False))
+
+    @requires_dependencies(["pyiceberg"], extras="ibm-watsonx-s3")
+    def _delete(self, transaction: "Transaction", identifier: str) -> None:
+        from pyiceberg.expressions import EqualTo
+
+        if self.can_delete():
+            transaction.delete(delete_filter=EqualTo(self.upload_config.record_id_key, identifier))
+        else:
+            logger.warning(
+                f"Table doesn't contain expected "
+                f"record id column "
+                f"{self.upload_config.record_id_key}, skipping delete"
+            )
+
+    @requires_dependencies(["pyiceberg", "tenacity"], extras="ibm-watsonx-s3")
+    def upload_data_table(
+        self, table: "Table", data_table: "ArrowTable", file_data: FileData
+    ) -> None:
+        from pyiceberg.exceptions import CommitFailedException
+        from tenacity import (
+            before_log,
+            retry,
+            retry_if_exception_type,
+            stop_after_attempt,
+            wait_random,
+        )
+
+        @retry(
+            stop=stop_after_attempt(self.upload_config.max_retries),
+            wait=wait_random(),
+            retry=retry_if_exception_type(IcebergCommitFailedException),
+            before=before_log(logger, logging.DEBUG),
+            reraise=True,
+        )
+        def _upload_data_table(table: "Table", data_table: "ArrowTable", file_data: FileData):
+            try:
+                with table.transaction() as transaction:
+                    self._delete(transaction, file_data.identifier)
+                    transaction.append(data_table)
+            except CommitFailedException as e:
+                table.refresh()
+                logger.debug(e)
+                raise IcebergCommitFailedException(e)
+            except Exception as e:
+                raise ProviderError(f"Failed to upload data to table: {e}")
+
+        try:
+            return _upload_data_table(table, data_table, file_data)
+        except ProviderError:
+            raise
+        except Exception as e:
+            raise ProviderError(f"Failed to upload data to table: {e}")
+
+    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
+        data_table = self._df_to_arrow_table(df)
+
+        with self.get_table() as table:
+            self.upload_data_table(table, data_table, file_data)
+
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        df = pd.DataFrame(data)
+        self.upload_dataframe(df=df, file_data=file_data)
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        df = get_data_df(path=path)
+        self.upload_dataframe(df=df, file_data=file_data)
+
+
+ibm_watsonx_s3_destination_entry = DestinationRegistryEntry(
+    connection_config=IbmWatsonxConnectionConfig,
+    uploader=IbmWatsonxUploader,
+    uploader_config=IbmWatsonxUploaderConfig,
+    upload_stager=IbmWatsonxUploadStager,
+    upload_stager_config=IbmWatsonxUploadStagerConfig,
+)

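Taken together, the two new modules register an `ibm_watsonx_s3` destination that exchanges an IBM IAM API key for a bearer token, connects to the watsonx.data Iceberg REST catalog, and appends Arrow tables to an existing Iceberg table, retrying failed commits via tenacity and deleting prior rows for the same record id when the table exposes that column. A hedged wiring sketch follows; every endpoint, credential, catalog, namespace, and table value is a placeholder, and it assumes the optional dependencies used above (httpx, pyiceberg, pyarrow, tenacity) are installed.

```python
# Hypothetical wiring of the new ibm_watsonx_s3 destination; all endpoint,
# credential, catalog, namespace, and table values are placeholders.
from unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3 import (
    IbmWatsonxAccessConfig,
    IbmWatsonxConnectionConfig,
    IbmWatsonxUploader,
    IbmWatsonxUploaderConfig,
)

connection_config = IbmWatsonxConnectionConfig(
    access_config=IbmWatsonxAccessConfig(
        iam_api_key="<IBM_IAM_API_KEY>",
        access_key_id="<COS_HMAC_ACCESS_KEY_ID>",
        secret_access_key="<COS_HMAC_SECRET_ACCESS_KEY>",
    ),
    iceberg_endpoint="<watsonx.data-mds-endpoint>",
    object_storage_endpoint="<cos-public-endpoint>",
    object_storage_region="<cos-region>",
    catalog="<catalog-name>",
)

uploader = IbmWatsonxUploader(
    connection_config=connection_config,
    upload_config=IbmWatsonxUploaderConfig(namespace="<namespace>", table="<table>"),
)

# precheck() opens the Iceberg REST catalog and verifies that the target
# namespace and table already exist before any data is written.
uploader.precheck()
```

Note that the uploader only appends to tables that already exist: `precheck` raises a `UserError` rather than creating the namespace or table on the fly.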
unstructured_ingest/v2/processes/connectors/kdbai.py
@@ -3,7 +3,6 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
-import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.error import DestinationConnectionError
@@ -26,6 +25,7 @@ from unstructured_ingest.v2.utils import get_enhanced_element_id
 
 if TYPE_CHECKING:
     from kdbai_client import Database, Session, Table
+    from pandas import DataFrame
 
 CONNECTOR_TYPE = "kdbai"
 
@@ -118,11 +118,11 @@ class KdbaiUploader(Uploader):
             table = db.table(self.upload_config.table_name)
             yield table
 
-    def upsert_batch(self, batch: pd.DataFrame):
+    def upsert_batch(self, batch: "DataFrame"):
         with self.get_table() as table:
             table.insert(batch)
 
-    def process_dataframe(self, df: pd.DataFrame):
+    def process_dataframe(self, df: "DataFrame"):
         logger.debug(
             f"uploading {len(df)} entries to {self.connection_config.endpoint} "
             f"db {self.upload_config.database_name} in table {self.upload_config.table_name}"
@@ -130,7 +130,10 @@ class KdbaiUploader(Uploader):
         for batch_df in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
             self.upsert_batch(batch=batch_df)
 
+    @requires_dependencies(["pandas"], extras="kdbai")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data=data)
         self.process_dataframe(df=df)
 

unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py
@@ -8,7 +8,6 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, AsyncGenerator, Optional
 
-import pandas as pd
 from pydantic import Field
 
 from unstructured_ingest.error import DestinationConnectionError
@@ -26,6 +25,7 @@ CONNECTOR_TYPE = "lancedb"
 if TYPE_CHECKING:
     from lancedb import AsyncConnection
     from lancedb.table import AsyncTable
+    from pandas import DataFrame
 
 
 class LanceDBConnectionConfig(ConnectionConfig, ABC):
@@ -69,6 +69,7 @@ class LanceDBUploadStager(UploadStager):
         default_factory=LanceDBUploadStagerConfig
     )
 
+    @requires_dependencies(["pandas"], extras="lancedb")
     def run(
         self,
         elements_filepath: Path,
@@ -77,6 +78,8 @@ class LanceDBUploadStager(UploadStager):
         output_filename: str,
         **kwargs: Any,
     ) -> Path:
+        import pandas as pd
+
         with open(elements_filepath) as elements_file:
             elements_contents: list[dict] = json.load(elements_file)
 
@@ -129,7 +132,10 @@ class LanceDBUploader(Uploader):
         finally:
             table.close()
 
+    @requires_dependencies(["pandas"], extras="lancedb")
     async def run_async(self, path, file_data, **kwargs):
+        import pandas as pd
+
         df = pd.read_feather(path)
         async with self.get_table() as table:
             schema = await table.schema()
@@ -144,7 +150,9 @@ class LanceDBUploader(Uploader):
             await table.delete(f'{RECORD_ID_LABEL} = "{file_data.identifier}"')
             await table.add(data=df)
 
-    def _fit_to_schema(self, df: pd.DataFrame, schema) -> pd.DataFrame:
+    def _fit_to_schema(self, df: "DataFrame", schema) -> "DataFrame":
+        import pandas as pd
+
         columns = set(df.columns)
         schema_fields = set(schema.names)
         columns_to_drop = columns - schema_fields

unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py
@@ -3,8 +3,6 @@ from contextlib import contextmanager
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
-import numpy as np
-import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.data_prep import split_dataframe
@@ -27,6 +25,7 @@ if TYPE_CHECKING:
     from databricks.sdk.core import oauth_service_principal
     from databricks.sql.client import Connection as DeltaTableConnection
     from databricks.sql.client import Cursor as DeltaTableCursor
+    from pandas import DataFrame
 
 CONNECTOR_TYPE = "databricks_delta_tables"
 
@@ -180,7 +179,10 @@ class DatabricksDeltaTablesUploader(SQLUploader):
         )
         return statement
 
-    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
+    @requires_dependencies(["pandas"], extras="databricks-delta-tables")
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
+        import numpy as np
+
         if self.can_delete():
             self.delete_by_record_id(file_data=file_data)
         else:

unstructured_ingest/v2/processes/connectors/sql/singlestore.py
@@ -3,9 +3,9 @@ from contextlib import contextmanager
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
-import pandas as pd
 from pydantic import Field, Secret
 
+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -46,6 +46,7 @@ class SingleStoreConnectionConfig(SQLConnectionConfig):
     database: Optional[str] = Field(default=None, description="SingleStore database")
 
     @contextmanager
+    @requires_dependencies(["singlestoredb"], extras="singlestore")
     def get_connection(self) -> Generator["SingleStoreConnection", None, None]:
         import singlestoredb as s2
 
@@ -130,9 +131,12 @@ class SingleStoreUploader(SQLUploader):
     values_delimiter: str = "%s"
     connector_type: str = CONNECTOR_TYPE
 
+    @requires_dependencies(["pandas"], extras="singlestore")
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
     ) -> list[tuple[Any, ...]]:
+        import pandas as pd
+
         output = []
         for row in data:
             parsed = []