awslabs.s3-tables-mcp-server 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- awslabs/__init__.py +15 -0
- awslabs/s3_tables_mcp_server/__init__.py +18 -0
- awslabs/s3_tables_mcp_server/constants.py +167 -0
- awslabs/s3_tables_mcp_server/database.py +140 -0
- awslabs/s3_tables_mcp_server/engines/__init__.py +13 -0
- awslabs/s3_tables_mcp_server/engines/pyiceberg.py +239 -0
- awslabs/s3_tables_mcp_server/file_processor.py +485 -0
- awslabs/s3_tables_mcp_server/models.py +274 -0
- awslabs/s3_tables_mcp_server/namespaces.py +63 -0
- awslabs/s3_tables_mcp_server/resources.py +231 -0
- awslabs/s3_tables_mcp_server/s3_operations.py +43 -0
- awslabs/s3_tables_mcp_server/server.py +821 -0
- awslabs/s3_tables_mcp_server/table_buckets.py +136 -0
- awslabs/s3_tables_mcp_server/tables.py +307 -0
- awslabs/s3_tables_mcp_server/utils.py +139 -0
- awslabs_s3_tables_mcp_server-0.0.1.dist-info/METADATA +216 -0
- awslabs_s3_tables_mcp_server-0.0.1.dist-info/RECORD +21 -0
- awslabs_s3_tables_mcp_server-0.0.1.dist-info/WHEEL +4 -0
- awslabs_s3_tables_mcp_server-0.0.1.dist-info/entry_points.txt +2 -0
- awslabs_s3_tables_mcp_server-0.0.1.dist-info/licenses/LICENSE +175 -0
- awslabs_s3_tables_mcp_server-0.0.1.dist-info/licenses/NOTICE +2 -0
awslabs/__init__.py
ADDED
@@ -0,0 +1,15 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""AWS Labs MCP package."""
awslabs/s3_tables_mcp_server/__init__.py
ADDED
@@ -0,0 +1,18 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is part of the awslabs namespace.
+# It is intentionally minimal to support PEP 420 namespace packages.
+
+__version__ = '0.0.0'
awslabs/s3_tables_mcp_server/constants.py
ADDED
@@ -0,0 +1,167 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Constants used throughout the S3 Tables MCP Server.
+
+This module contains all the constant values used across the S3 Tables MCP Server,
+including version information, regex patterns for validation, and field definitions
+for Pydantic models.
+"""
+
+from pydantic import Field
+
+
+# Patterns
+TABLE_BUCKET_NAME_PATTERN = r'[a-z0-9][a-z0-9-]{1,61}[a-z0-9]'
+"""
+Regex pattern for validating S3 bucket names.
+Valid bucket names must:
+- Be between 3 and 63 characters long
+- Start and end with a letter or number
+- Contain only lowercase letters, numbers, and hyphens
+- Not contain consecutive hyphens
+"""
+
+TABLE_BUCKET_ARN_PATTERN = (
+    r'arn:aws[-a-z0-9]*:[a-z0-9]+:[-a-z0-9]*:[0-9]{12}:bucket/[a-z0-9_-]{3,63}'
+)
+"""
+Regex pattern for validating S3 bucket ARNs.
+Format: arn:aws[-a-z0-9]*:[a-z0-9]+:[-a-z0-9]*:[0-9]{12}:bucket/[bucket-name]
+Example: arn:aws:s3:::my-bucket
+"""
+
+TABLE_NAME_PATTERN = r'[0-9a-z_]*'
+"""
+Regex pattern for validating table names.
+Valid table names must:
+- Contain only lowercase letters, numbers, and underscores
+- Have a maximum length of 255 characters
+"""
+
+TABLE_ARN_PATTERN = (
+    r'arn:aws[-a-z0-9]*:[a-z0-9]+:[-a-z0-9]*:[0-9]{12}:bucket/[a-z0-9_-]{3,63}/table/[0-9a-f-]{36}'
+)
+"""
+Regex pattern for validating table ARNs.
+Format: arn:aws[-a-z0-9]*:[a-z0-9]+:[-a-z0-9]*:[0-9]{12}:bucket/[bucket-name]/table/[uuid]
+Example: arn:aws:s3:::my-bucket/table/123e4567-e89b-12d3-a456-426614174000
+"""
+
+# Field Definitions
+TABLE_BUCKET_ARN_FIELD = Field(
+    ...,
+    description='Table bucket ARN',
+    pattern=TABLE_BUCKET_ARN_PATTERN,
+    min_length=1,
+    max_length=2048,
+)
+"""
+Pydantic field for table bucket ARN validation.
+Required field that must match the TABLE_BUCKET_ARN_PATTERN.
+"""
+
+TABLE_ARN_FIELD = Field(..., description='Table ARN', pattern=TABLE_ARN_PATTERN)
+"""
+Pydantic field for table ARN validation.
+Required field that must match the TABLE_ARN_PATTERN.
+"""
+
+NAMESPACE_NAME_FIELD = Field(
+    ...,
+    description='The name of the namespace. Must be 1-255 characters long and contain only alphanumeric characters, underscores, and hyphens.',
+    min_length=1,
+    max_length=255,
+    pattern=r'^[a-zA-Z0-9_-]+$',
+)
+"""
+Pydantic field for namespace name validation.
+Required field that must:
+- Be 1-255 characters long
+- Contain only alphanumeric characters, underscores, and hyphens
+"""
+
+TABLE_NAME_FIELD = Field(
+    ...,
+    description='The name of the table. Must be 1-255 characters long and contain only alphanumeric characters, underscores, and hyphens.',
+    min_length=1,
+    max_length=255,
+    pattern=TABLE_NAME_PATTERN,
+)
+"""
+Pydantic field for table name validation.
+Required field that must:
+- Be 1-255 characters long
+- Contain only alphanumeric characters, underscores, and hyphens
+- Match the TABLE_NAME_PATTERN
+"""
+
+REGION_NAME_FIELD = Field(
+    None,
+    description='The AWS region name where the operation should be performed.',
+    min_length=1,
+    max_length=64,
+)
+"""
+Pydantic field for AWS region name.
+Optional field that can be used to specify the AWS region for operations.
+Example values: 'us-east-1', 'eu-west-1', 'ap-southeast-2'
+"""
+
+# Query-specific fields
+QUERY_FIELD = Field(
+    default=None,
+    description='Optional SQL query. If not provided, will execute SELECT * FROM table. Must be a read operation.',
+    min_length=1,
+    max_length=10000,
+)
+"""
+Pydantic field for SQL query validation.
+Optional field that must be a valid read operation.
+"""
+
+OUTPUT_LOCATION_FIELD = Field(
+    default=None,
+    description='Optional S3 location for query results. If not provided, will use default Athena results bucket.',
+    pattern=r'^s3://[a-z0-9-]+/[a-z0-9-./]*$',
+    min_length=1,
+    max_length=2048,
+)
+"""
+Pydantic field for output location validation.
+Optional field that must be a valid S3 URI.
+"""
+
+WORKGROUP_FIELD = Field(
+    default='primary',
+    description='Athena workgroup to use for query execution.',
+    pattern=r'^[a-zA-Z0-9_-]+$',
+    min_length=1,
+    max_length=128,
+)
+"""
+Pydantic field for workgroup validation.
+Optional field that must contain only letters, numbers, hyphens, and underscores.
+Defaults to 'primary'.
+"""
+
+S3_URL_FIELD = Field(
+    ...,
+    description='The S3 URL of the file to preview (format: s3://bucket-name/key)',
+    min_length=1,
+)
+"""
+Pydantic field for S3 URL validation.
+Required field that must be a valid S3 URI.
+"""
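The Field constants above are shared building blocks for the Pydantic request models defined elsewhere in the package (see models.py in the file list). A minimal sketch of that usage pattern follows; the CreateTableRequest model, the ARN, and the values are illustrative placeholders, not part of the published wheel.

from pydantic import BaseModel

from awslabs.s3_tables_mcp_server.constants import (
    NAMESPACE_NAME_FIELD,
    TABLE_BUCKET_ARN_FIELD,
    TABLE_NAME_FIELD,
)


class CreateTableRequest(BaseModel):
    """Hypothetical request model; validation rules come from the shared Field constants."""

    table_bucket_arn: str = TABLE_BUCKET_ARN_FIELD
    namespace: str = NAMESPACE_NAME_FIELD
    name: str = TABLE_NAME_FIELD


# A conforming payload validates; a malformed ARN or table name raises ValidationError.
req = CreateTableRequest(
    table_bucket_arn='arn:aws:s3tables:us-west-2:111122223333:bucket/example-bucket',
    namespace='retail_data',
    name='daily_sales',
)
print(req.name)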
awslabs/s3_tables_mcp_server/database.py
ADDED
@@ -0,0 +1,140 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Database query operations for S3 Tables MCP Server.
+
+This module provides functions for executing queries against S3 Tables using Athena.
+It handles query execution, result retrieval, and proper formatting of responses.
+"""
+
+import sqlparse
+from .engines.pyiceberg import PyIcebergConfig, PyIcebergEngine
+from typing import Any, Dict
+
+
+WRITE_OPERATIONS = {
+    'ADD',
+    'ALTER',
+    'ANALYZE',
+    'BEGIN',
+    'COMMIT',
+    'COPY',
+    'CREATE',
+    'DELETE',
+    'DROP',
+    'EXPORT',
+    'GRANT',
+    'IMPORT',
+    'INSERT',
+    'LOAD',
+    'LOCK',
+    'MERGE',
+    'MSCK',
+    'REDUCE',
+    'REFRESH',
+    'REPLACE',
+    'RESET',
+    'REVOKE',
+    'ROLLBACK',
+    'SET',
+    'START',
+    'TRUNCATE',
+    'UNCACHE',
+    'UNLOCK',
+    'UPDATE',
+    'UPSERT',
+    'VACUUM',
+    'VALUES',
+    'WRITE',
+}
+
+READ_OPERATIONS = {
+    'DESC',
+    'DESCRIBE',
+    'EXPLAIN',
+    'LIST',
+    'SELECT',
+    'SHOW',
+    'USE',
+}
+
+# Disallowed destructive operations for write
+DESTRUCTIVE_OPERATIONS = {'DELETE', 'DROP', 'MERGE', 'REPLACE', 'TRUNCATE', 'VACUUM'}
+
+
+def _get_query_operations(query: str) -> set:
+    """Extract all top-level SQL operations from the query as a set."""
+    parsed = sqlparse.parse(query)
+    operations = set()
+    for stmt in parsed:
+        tokens = [token.value.upper() for token in stmt.tokens if not token.is_whitespace]
+        for token in tokens:
+            if token.isalpha():
+                operations.add(token)
+    return operations
+
+
+async def query_database_resource(
+    warehouse: str,
+    region: str,
+    namespace: str,
+    query: str,
+    uri: str = 'https://s3tables.us-west-2.amazonaws.com/iceberg',
+    catalog_name: str = 's3tablescatalog',
+    rest_signing_name: str = 's3tables',
+    rest_sigv4_enabled: str = 'true',
+) -> Dict[str, Any]:
+    """Execute a read-only query against a database using PyIceberg."""
+    operations = _get_query_operations(query)
+    disallowed = operations & WRITE_OPERATIONS
+    if disallowed:
+        raise ValueError(f'Write operations are not allowed in read-only queries: {disallowed}')
+    config = PyIcebergConfig(
+        warehouse=warehouse,
+        uri=uri,
+        region=region,
+        namespace=namespace,
+        catalog_name=catalog_name,
+        rest_signing_name=rest_signing_name,
+        rest_sigv4_enabled=rest_sigv4_enabled,
+    )
+    engine = PyIcebergEngine(config)
+    result = engine.execute_query(query)
+    return result
+
+
+async def append_rows_to_table_resource(
+    warehouse: str,
+    region: str,
+    namespace: str,
+    table_name: str,
+    rows: list,
+    uri: str = 'https://s3tables.us-west-2.amazonaws.com/iceberg',
+    catalog_name: str = 's3tablescatalog',
+    rest_signing_name: str = 's3tables',
+    rest_sigv4_enabled: str = 'true',
+) -> Dict[str, Any]:
+    """Append rows to an Iceberg table using PyIceberg."""
+    config = PyIcebergConfig(
+        warehouse=warehouse,
+        uri=uri,
+        region=region,
+        namespace=namespace,
+        catalog_name=catalog_name,
+        rest_signing_name=rest_signing_name,
+        rest_sigv4_enabled=rest_sigv4_enabled,
+    )
+    engine = PyIcebergEngine(config)
+    engine.append_rows(table_name, rows)
+    return {'status': 'success', 'rows_appended': len(rows)}
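Note that query_database_resource checks the parsed statement against WRITE_OPERATIONS before it ever builds a PyIcebergConfig, so a disallowed statement fails fast without touching AWS. A minimal sketch of that behavior, assuming only that the wheel is installed; the warehouse ARN, namespace, and table name are placeholders.

import asyncio

from awslabs.s3_tables_mcp_server.database import query_database_resource


async def main():
    try:
        await query_database_resource(
            warehouse='arn:aws:s3tables:us-west-2:111122223333:bucket/example-bucket',
            region='us-west-2',
            namespace='retail_data',
            query='DELETE FROM daily_sales',  # write operation, rejected up front
        )
    except ValueError as e:
        # Raised by the WRITE_OPERATIONS guard before any catalog connection is made
        print(e)


asyncio.run(main())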
awslabs/s3_tables_mcp_server/engines/__init__.py
ADDED
@@ -0,0 +1,13 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
awslabs/s3_tables_mcp_server/engines/pyiceberg.py
ADDED
@@ -0,0 +1,239 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Engine for interacting with Iceberg tables using pyiceberg and daft (read-only)."""
+
+import pyarrow as pa
+from ..utils import pyiceberg_load_catalog
+from daft import Catalog as DaftCatalog
+from daft.session import Session
+from datetime import date, datetime, time
+from decimal import Decimal
+from pydantic import BaseModel
+from pyiceberg.types import (
+    BinaryType,
+    BooleanType,
+    DateType,
+    DecimalType,
+    DoubleType,
+    FixedType,
+    FloatType,
+    IntegerType,
+    ListType,
+    LongType,
+    MapType,
+    StringType,
+    StructType,
+    TimestampType,
+    TimestamptzType,
+    TimeType,
+    UUIDType,
+)
+
+# pyiceberg and daft imports
+from typing import Any, Dict, Optional
+
+
+class PyIcebergConfig(BaseModel):
+    """Configuration for PyIceberg/Daft connection."""
+
+    warehouse: str  # e.g. 'arn:aws:s3tables:us-west-2:484907528679:bucket/customer-data-bucket'
+    uri: str  # e.g. 'https://s3tables.us-west-2.amazonaws.com/iceberg'
+    region: str  # e.g. 'us-west-2'
+    namespace: str  # e.g. 'retail_data'
+    catalog_name: str = 's3tablescatalog'  # default
+    rest_signing_name: str = 's3tables'
+    rest_sigv4_enabled: str = 'true'
+
+
+def convert_value_for_append(value, iceberg_type):
+    """Convert a value to the appropriate type for appending to an Iceberg table column.
+
+    Args:
+        value: The value to convert. Can be of various types (str, int, float, etc.).
+        iceberg_type: The Iceberg type to convert the value to.
+
+    Returns:
+        The value converted to the appropriate type for the Iceberg column, or None if value is None.
+
+    Raises:
+        NotImplementedError: If the iceberg_type is a complex type (ListType, MapType, StructType).
+        ValueError: If the conversion is unsupported or fails.
+    """
+    if value is None:
+        return None
+    # Already correct type
+    if isinstance(iceberg_type, BooleanType) and isinstance(value, bool):
+        return value
+    if isinstance(iceberg_type, (IntegerType, LongType)) and isinstance(value, int):
+        return value
+    if isinstance(iceberg_type, (FloatType, DoubleType)) and isinstance(value, float):
+        return value
+    if isinstance(iceberg_type, DecimalType) and isinstance(value, Decimal):
+        return value
+    if isinstance(iceberg_type, DateType) and isinstance(value, date):
+        return value
+    if isinstance(iceberg_type, TimeType) and isinstance(value, time):
+        return value
+    if isinstance(iceberg_type, (TimestampType, TimestamptzType)) and isinstance(value, datetime):
+        return value
+    if isinstance(iceberg_type, StringType) and isinstance(value, str):
+        return value
+    # Convert from string
+    if isinstance(value, str):
+        if isinstance(iceberg_type, BooleanType):
+            return value.lower() in ('true', '1', 'yes')
+        if isinstance(iceberg_type, (IntegerType, LongType)):
+            return int(value)
+        if isinstance(iceberg_type, (FloatType, DoubleType)):
+            return float(value)
+        if isinstance(iceberg_type, DecimalType):
+            return Decimal(value)
+        if isinstance(iceberg_type, DateType):
+            return date.fromisoformat(value)
+        if isinstance(iceberg_type, TimeType):
+            return time.fromisoformat(value)
+        if isinstance(iceberg_type, (TimestampType, TimestamptzType)):
+            return datetime.fromisoformat(value)
+        if isinstance(iceberg_type, StringType):
+            return value
+        if isinstance(iceberg_type, UUIDType):
+            import uuid
+
+            return uuid.UUID(value)
+        if isinstance(iceberg_type, (BinaryType, FixedType)):
+            return bytes.fromhex(value)
+    # Convert from number
+    if isinstance(value, (int, float)):
+        if isinstance(iceberg_type, (IntegerType, LongType)):
+            return int(value)
+        if isinstance(iceberg_type, (FloatType, DoubleType)):
+            return float(value)
+        if isinstance(iceberg_type, DecimalType):
+            return Decimal(str(value))
+        if isinstance(iceberg_type, StringType):
+            return str(value)
+    if isinstance(iceberg_type, (ListType, MapType, StructType)):
+        raise NotImplementedError(f'Complex type {iceberg_type} not supported in append_rows')
+    raise ValueError(f'Unsupported conversion from {type(value)} to {iceberg_type}')
+
+
+class PyIcebergEngine:
+    """Engine for read-only queries on Iceberg tables using pyiceberg and daft."""
+
+    def __init__(self, config: PyIcebergConfig):
+        """Initialize the PyIcebergEngine with the given configuration.
+
+        Args:
+            config: PyIcebergConfig object containing connection parameters.
+        """
+        self.config = config
+        self._catalog: Optional[Any] = None
+        self._session: Optional[Session] = None
+        self._initialize_connection()
+
+    def _initialize_connection(self):
+        try:
+            self._catalog = pyiceberg_load_catalog(
+                self.config.catalog_name,
+                self.config.warehouse,
+                self.config.uri,
+                self.config.region,
+                self.config.rest_signing_name,
+                self.config.rest_sigv4_enabled,
+            )
+            self._session = Session()
+            self._session.attach(DaftCatalog.from_iceberg(self._catalog))
+            self._session.set_namespace(self.config.namespace)
+        except Exception as e:
+            raise ConnectionError(f'Failed to initialize PyIceberg connection: {str(e)}')
+
+    def execute_query(self, query: str) -> Dict[str, Any]:
+        """Execute a SQL query against the Iceberg catalog using Daft.
+
+        Args:
+            query: SQL query to execute
+
+        Returns:
+            Dict containing:
+                - columns: List of column names
+                - rows: List of rows, where each row is a list of values
+        """
+        if not self._session:
+            raise ConnectionError('No active session for PyIceberg/Daft')
+        try:
+            result = self._session.sql(query)
+            if result is None:
+                raise Exception('Query execution returned None result')
+            df = result.collect()
+            columns = df.column_names
+            rows = df.to_pylist()
+            return {
+                'columns': columns,
+                'rows': [list(row.values()) for row in rows],
+            }
+        except Exception as e:
+            raise Exception(f'Error executing query: {str(e)}')
+
+    def test_connection(self) -> bool:
+        """Test the connection by listing namespaces."""
+        if not self._session:
+            return False
+        try:
+            _ = self._session.list_namespaces()
+            return True
+        except Exception:
+            return False
+
+    def append_rows(self, table_name: str, rows: list[dict]) -> None:
+        """Append rows to an Iceberg table using pyiceberg.
+
+        Args:
+            table_name: The name of the table (e.g., 'namespace.tablename' or just 'tablename' if namespace is set)
+            rows: List of dictionaries, each representing a row to append
+
+        Raises:
+            Exception: If appending fails
+        """
+        if not self._catalog:
+            raise ConnectionError('No active catalog for PyIceberg')
+        try:
+            # If table_name does not contain a dot, prepend the namespace
+            if '.' not in table_name:
+                full_table_name = f'{self.config.namespace}.{table_name}'
+            else:
+                full_table_name = table_name
+            table = self._catalog.load_table(full_table_name)
+            iceberg_schema = table.schema()
+            converted_rows = []
+            for row in rows:
+                converted_row = {}
+                for field in iceberg_schema.fields:
+                    field_name = field.name
+                    field_type = field.field_type
+                    value = row.get(field_name)
+                    if field.required and value is None:
+                        raise ValueError(f'Required field {field_name} is missing or None')
+                    try:
+                        converted_row[field_name] = convert_value_for_append(value, field_type)
+                    except (ValueError, TypeError) as e:
+                        raise ValueError(
+                            f'Error converting value for field {field_name}: {str(e)}'
+                        )
+                converted_rows.append(converted_row)
+            schema = iceberg_schema.as_arrow()
+            pa_table = pa.Table.from_pylist(converted_rows, schema=schema)
+            table.append(pa_table)
+        except Exception as e:
+            raise Exception(f'Error appending rows: {str(e)}')
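Taken together, the engine above is driven by constructing a PyIcebergConfig and letting the PyIcebergEngine constructor open the S3 Tables REST catalog and attach a Daft session. The sketch below shows that flow end to end; it assumes AWS credentials with access to the referenced table bucket, and the bucket ARN, namespace, table, and column names are placeholders.

from awslabs.s3_tables_mcp_server.engines.pyiceberg import (
    PyIcebergConfig,
    PyIcebergEngine,
)

config = PyIcebergConfig(
    warehouse='arn:aws:s3tables:us-west-2:111122223333:bucket/example-bucket',
    uri='https://s3tables.us-west-2.amazonaws.com/iceberg',
    region='us-west-2',
    namespace='retail_data',
)

# The constructor loads the REST catalog and attaches a Daft session immediately.
engine = PyIcebergEngine(config)

if engine.test_connection():
    result = engine.execute_query('SELECT * FROM daily_sales LIMIT 5')
    print(result['columns'])
    for row in result['rows']:
        print(row)

# Appends go through convert_value_for_append, so string values are coerced to
# the column types declared in the Iceberg schema before the PyArrow write.
engine.append_rows('daily_sales', [{'sale_date': '2025-01-01', 'amount': '19.99'}])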