awslabs.s3-tables-mcp-server 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- awslabs/s3_tables_mcp_server/__init__.py +1 -1
- awslabs/s3_tables_mcp_server/engines/pyiceberg.py +28 -113
- awslabs/s3_tables_mcp_server/file_processor/__init__.py +24 -0
- awslabs/s3_tables_mcp_server/file_processor/csv.py +123 -0
- awslabs/s3_tables_mcp_server/file_processor/parquet.py +116 -0
- awslabs/s3_tables_mcp_server/server.py +76 -24
- {awslabs_s3_tables_mcp_server-0.0.2.dist-info → awslabs_s3_tables_mcp_server-0.0.4.dist-info}/METADATA +1 -1
- {awslabs_s3_tables_mcp_server-0.0.2.dist-info → awslabs_s3_tables_mcp_server-0.0.4.dist-info}/RECORD +12 -10
- awslabs/s3_tables_mcp_server/file_processor.py +0 -485
- {awslabs_s3_tables_mcp_server-0.0.2.dist-info → awslabs_s3_tables_mcp_server-0.0.4.dist-info}/WHEEL +0 -0
- {awslabs_s3_tables_mcp_server-0.0.2.dist-info → awslabs_s3_tables_mcp_server-0.0.4.dist-info}/entry_points.txt +0 -0
- {awslabs_s3_tables_mcp_server-0.0.2.dist-info → awslabs_s3_tables_mcp_server-0.0.4.dist-info}/licenses/LICENSE +0 -0
- {awslabs_s3_tables_mcp_server-0.0.2.dist-info → awslabs_s3_tables_mcp_server-0.0.4.dist-info}/licenses/NOTICE +0 -0

awslabs/s3_tables_mcp_server/engines/pyiceberg.py

@@ -14,32 +14,14 @@
 
 """Engine for interacting with Iceberg tables using pyiceberg and daft (read-only)."""
 
+import io
+import json
 import pyarrow as pa
+import pyarrow.json as pj
 from ..utils import pyiceberg_load_catalog
 from daft import Catalog as DaftCatalog
 from daft.session import Session
-from datetime import date, datetime, time
-from decimal import Decimal
 from pydantic import BaseModel
-from pyiceberg.types import (
-    BinaryType,
-    BooleanType,
-    DateType,
-    DecimalType,
-    DoubleType,
-    FixedType,
-    FloatType,
-    IntegerType,
-    ListType,
-    LongType,
-    MapType,
-    StringType,
-    StructType,
-    TimestampType,
-    TimestamptzType,
-    TimeType,
-    UUIDType,
-)
 
 # pyiceberg and daft imports
 from typing import Any, Dict, Optional

@@ -57,78 +39,6 @@ class PyIcebergConfig(BaseModel):
     rest_sigv4_enabled: str = 'true'
 
 
-def convert_value_for_append(value, iceberg_type):
-    """Convert a value to the appropriate type for appending to an Iceberg table column.
-
-    Args:
-        value: The value to convert. Can be of various types (str, int, float, etc.).
-        iceberg_type: The Iceberg type to convert the value to.
-
-    Returns:
-        The value converted to the appropriate type for the Iceberg column, or None if value is None.
-
-    Raises:
-        NotImplementedError: If the iceberg_type is a complex type (ListType, MapType, StructType).
-        ValueError: If the conversion is unsupported or fails.
-    """
-    if value is None:
-        return None
-    # Already correct type
-    if isinstance(iceberg_type, BooleanType) and isinstance(value, bool):
-        return value
-    if isinstance(iceberg_type, (IntegerType, LongType)) and isinstance(value, int):
-        return value
-    if isinstance(iceberg_type, (FloatType, DoubleType)) and isinstance(value, float):
-        return value
-    if isinstance(iceberg_type, DecimalType) and isinstance(value, Decimal):
-        return value
-    if isinstance(iceberg_type, DateType) and isinstance(value, date):
-        return value
-    if isinstance(iceberg_type, TimeType) and isinstance(value, time):
-        return value
-    if isinstance(iceberg_type, (TimestampType, TimestamptzType)) and isinstance(value, datetime):
-        return value
-    if isinstance(iceberg_type, StringType) and isinstance(value, str):
-        return value
-    # Convert from string
-    if isinstance(value, str):
-        if isinstance(iceberg_type, BooleanType):
-            return value.lower() in ('true', '1', 'yes')
-        if isinstance(iceberg_type, (IntegerType, LongType)):
-            return int(value)
-        if isinstance(iceberg_type, (FloatType, DoubleType)):
-            return float(value)
-        if isinstance(iceberg_type, DecimalType):
-            return Decimal(value)
-        if isinstance(iceberg_type, DateType):
-            return date.fromisoformat(value)
-        if isinstance(iceberg_type, TimeType):
-            return time.fromisoformat(value)
-        if isinstance(iceberg_type, (TimestampType, TimestamptzType)):
-            return datetime.fromisoformat(value)
-        if isinstance(iceberg_type, StringType):
-            return value
-        if isinstance(iceberg_type, UUIDType):
-            import uuid
-
-            return uuid.UUID(value)
-        if isinstance(iceberg_type, (BinaryType, FixedType)):
-            return bytes.fromhex(value)
-    # Convert from number
-    if isinstance(value, (int, float)):
-        if isinstance(iceberg_type, (IntegerType, LongType)):
-            return int(value)
-        if isinstance(iceberg_type, (FloatType, DoubleType)):
-            return float(value)
-        if isinstance(iceberg_type, DecimalType):
-            return Decimal(str(value))
-        if isinstance(iceberg_type, StringType):
-            return str(value)
-    if isinstance(iceberg_type, (ListType, MapType, StructType)):
-        raise NotImplementedError(f'Complex type {iceberg_type} not supported in append_rows')
-    raise ValueError(f'Unsupported conversion from {type(value)} to {iceberg_type}')
-
-
 class PyIcebergEngine:
     """Engine for read-only queries on Iceberg tables using pyiceberg and daft."""
 

@@ -197,7 +107,7 @@ class PyIcebergEngine:
             return False
 
     def append_rows(self, table_name: str, rows: list[dict]) -> None:
-        """Append rows to an Iceberg table using pyiceberg.
+        """Append rows to an Iceberg table using pyiceberg with JSON encoding.
 
         Args:
             table_name: The name of the table (e.g., 'namespace.tablename' or just 'tablename' if namespace is set)

@@ -214,26 +124,31 @@
                 full_table_name = f'{self.config.namespace}.{table_name}'
             else:
                 full_table_name = table_name
+
+            # Load the Iceberg table
             table = self._catalog.load_table(full_table_name)
-
-
+            # Encode rows as JSON (line-delimited format)
+            json_lines = []
             for row in rows:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                json_lines.append(json.dumps(row))
+            json_data = '\n'.join(json_lines)
+
+            # Create a file-like object from the JSON data
+            json_buffer = io.BytesIO(json_data.encode('utf-8'))
+
+            # Read JSON data into PyArrow Table using pyarrow.json.read_json
+            # This enforces the Iceberg schema and validates the data
+            try:
+                new_data_table = pj.read_json(
+                    json_buffer, read_options=pj.ReadOptions(use_threads=True)
+                )
+            except pa.ArrowInvalid as e:
+                raise ValueError(
+                    f'Schema mismatch detected: {e}. Please ensure your data matches the table schema.'
+                )
+
+            # Append the new data to the Iceberg table
+            table.append(new_data_table)
+
         except Exception as e:
             raise Exception(f'Error appending rows: {str(e)}')
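
For reference, the new append path can be exercised on its own: rows are serialized to line-delimited JSON and parsed back into a PyArrow table, which pyiceberg then validates against the Iceberg table schema on append. A minimal sketch of that flow follows; the example rows and the commented-out append target are hypothetical, not part of this diff.

import io
import json

import pyarrow as pa
import pyarrow.json as pj

rows = [
    {'id': 1, 'name': 'alice'},  # hypothetical example rows
    {'id': 2, 'name': 'bob'},
]

# Encode rows as line-delimited JSON, the same intermediate form append_rows now builds
json_data = '\n'.join(json.dumps(row) for row in rows)
json_buffer = io.BytesIO(json_data.encode('utf-8'))

# pyarrow.json infers column types from the JSON values
try:
    arrow_table = pj.read_json(json_buffer, read_options=pj.ReadOptions(use_threads=True))
except pa.ArrowInvalid as exc:
    raise ValueError(f'Schema mismatch detected: {exc}')

print(arrow_table.schema)  # e.g. id: int64, name: string
# iceberg_table.append(arrow_table)  # the engine then appends via pyiceberg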

awslabs/s3_tables_mcp_server/file_processor/__init__.py (new file)

@@ -0,0 +1,24 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""AWS S3 Tables MCP Server file processing module.
+
+This module provides functionality for processing and analyzing uploaded files,
+particularly focusing on CSV and Parquet file handling and import capabilities.
+"""
+
+from .csv import import_csv_to_table
+from .parquet import import_parquet_to_table
+
+__all__ = ['import_csv_to_table', 'import_parquet_to_table']

awslabs/s3_tables_mcp_server/file_processor/csv.py (new file)

@@ -0,0 +1,123 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""AWS S3 Tables MCP Server file processing module.
+
+This module provides functionality for processing and analyzing uploaded files,
+particularly focusing on CSV file handling and import capabilities.
+"""
+
+import io
+import os
+import pyarrow.csv as pc
+from ..utils import get_s3_client, pyiceberg_load_catalog
+from pyiceberg.exceptions import NoSuchTableError
+from typing import Dict
+from urllib.parse import urlparse
+
+
+async def import_csv_to_table(
+    warehouse: str,
+    region: str,
+    namespace: str,
+    table_name: str,
+    s3_url: str,
+    uri: str,
+    catalog_name: str = 's3tablescatalog',
+    rest_signing_name: str = 's3tables',
+    rest_sigv4_enabled: str = 'true',
+) -> Dict:
+    """Import data from a CSV file into an S3 table.
+
+    This function reads data from a CSV file stored in S3 and imports it into an existing S3 table.
+    If the table doesn't exist, it will be created using the schema inferred from the CSV file.
+
+    Args:
+        warehouse: Warehouse string for Iceberg catalog
+        region: AWS region for S3Tables/Iceberg REST endpoint
+        namespace: The namespace containing the table
+        table_name: The name of the table to import data into
+        s3_url: The S3 URL of the CSV file (format: s3://bucket-name/key)
+        uri: REST URI for Iceberg catalog
+        catalog_name: Catalog name
+        rest_signing_name: REST signing name
+        rest_sigv4_enabled: Enable SigV4 signing
+
+    Returns:
+        A dictionary containing:
+        - status: 'success' or 'error'
+        - message: Success message or error details
+        - rows_processed: Number of rows processed (on success)
+        - file_processed: Name of the processed file
+        - table_created: Boolean indicating if a new table was created (on success)
+    """
+    # Parse S3 URL
+    parsed = urlparse(s3_url)
+    bucket = parsed.netloc
+    key = parsed.path.lstrip('/')
+
+    try:
+        # Load Iceberg catalog
+        catalog = pyiceberg_load_catalog(
+            catalog_name,
+            warehouse,
+            uri,
+            region,
+            rest_signing_name,
+            rest_sigv4_enabled,
+        )
+
+        # Get S3 client and read the CSV file to infer schema
+        s3_client = get_s3_client()
+        response = s3_client.get_object(Bucket=bucket, Key=key)
+        csv_data = response['Body'].read()
+
+        # Read CSV file into PyArrow Table to infer schema
+        # Convert bytes to file-like object for PyArrow
+        csv_buffer = io.BytesIO(csv_data)
+        csv_table = pc.read_csv(csv_buffer)
+        csv_schema = csv_table.schema
+
+        table_created = False
+        try:
+            # Try to load existing table
+            table = catalog.load_table(f'{namespace}.{table_name}')
+        except NoSuchTableError:
+            # Table doesn't exist, create it using the CSV schema
+            try:
+                table = catalog.create_table(
+                    identifier=f'{namespace}.{table_name}',
+                    schema=csv_schema,
+                )
+                table_created = True
+            except Exception as create_error:
+                return {
+                    'status': 'error',
+                    'error': f'Failed to create table: {str(create_error)}',
+                }
+
+        # Append data to Iceberg table
+        table.append(csv_table)
+
+        return {
+            'status': 'success',
+            'message': f'Successfully imported {csv_table.num_rows} rows{" and created new table" if table_created else ""}',
+            'rows_processed': csv_table.num_rows,
+            'file_processed': os.path.basename(key),
+            'table_created': table_created,
+            'table_uuid': table.metadata.table_uuid,
+        }
+
+    except Exception as e:
+        return {'status': 'error', 'error': str(e)}
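
Outside the MCP server, the new helper can also be awaited directly. A possible invocation sketch is shown below; it assumes AWS credentials with the S3 and S3 Tables permissions listed later in the server docstring, and every argument value (account ID, bucket names, object key) is a placeholder.

import asyncio

from awslabs.s3_tables_mcp_server.file_processor import import_csv_to_table


async def main():
    # All values below are placeholders; supply your own table bucket ARN,
    # namespace, table name, and CSV location.
    result = await import_csv_to_table(
        warehouse='arn:aws:s3tables:us-west-2:111122223333:bucket/example-bucket',
        region='us-west-2',
        namespace='retail_data',
        table_name='customers',
        s3_url='s3://example-bucket/imports/customers.csv',
        uri='https://s3tables.us-west-2.amazonaws.com/iceberg',
    )
    if result['status'] == 'success':
        print(f"imported {result['rows_processed']} rows (new table: {result['table_created']})")
    else:
        print('import failed:', result['error'])


asyncio.run(main())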

awslabs/s3_tables_mcp_server/file_processor/parquet.py (new file)

@@ -0,0 +1,116 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pyarrow.parquet as pq
+from awslabs.s3_tables_mcp_server.utils import get_s3_client, pyiceberg_load_catalog
+from io import BytesIO
+from pyiceberg.exceptions import NoSuchTableError
+from typing import Dict
+
+
+async def import_parquet_to_table(
+    warehouse: str,
+    region: str,
+    namespace: str,
+    table_name: str,
+    s3_url: str,
+    uri: str,
+    catalog_name: str = 's3tablescatalog',
+    rest_signing_name: str = 's3tables',
+    rest_sigv4_enabled: str = 'true',
+) -> Dict:
+    """Import data from a Parquet file into an S3 table.
+
+    This function reads data from a Parquet file stored in S3 and imports it into an existing Iceberg table.
+    If the table doesn't exist, it will be created using the schema from the Parquet file.
+
+    Args:
+        warehouse: Warehouse string for Iceberg catalog
+        region: AWS region for S3Tables/Iceberg REST endpoint
+        namespace: The namespace containing the table
+        table_name: The name of the table to import data into
+        s3_url: The S3 URL of the Parquet file
+        uri: REST URI for Iceberg catalog
+        catalog_name: Catalog name
+        rest_signing_name: REST signing name
+        rest_sigv4_enabled: Enable SigV4 signing
+
+    Returns:
+        A dictionary containing:
+        - status: 'success' or 'error'
+        - message: Success message or error details
+        - rows_processed: Number of rows processed (on success)
+        - file_processed: Name of the processed file
+        - table_created: Boolean indicating if a new table was created (on success)
+    """
+    import os
+    from urllib.parse import urlparse
+
+    # Parse S3 URL
+    parsed = urlparse(s3_url)
+    bucket = parsed.netloc
+    key = parsed.path.lstrip('/')
+
+    try:
+        # Load Iceberg catalog
+        catalog = pyiceberg_load_catalog(
+            catalog_name,
+            warehouse,
+            uri,
+            region,
+            rest_signing_name,
+            rest_sigv4_enabled,
+        )
+
+        # Get S3 client and read the Parquet file first to get the schema
+        s3_client = get_s3_client()
+        response = s3_client.get_object(Bucket=bucket, Key=key)
+        parquet_data = BytesIO(response['Body'].read())
+
+        # Read Parquet file into PyArrow Table
+        parquet_table = pq.read_table(parquet_data)
+        parquet_schema = parquet_table.schema
+
+        table_created = False
+        try:
+            # Try to load existing table
+            table = catalog.load_table(f'{namespace}.{table_name}')
+        except NoSuchTableError:
+            # Table doesn't exist, create it using the Parquet schema
+            try:
+                table = catalog.create_table(
+                    identifier=f'{namespace}.{table_name}',
+                    schema=parquet_schema,
+                )
+                table_created = True
+            except Exception as create_error:
+                return {
+                    'status': 'error',
+                    'error': f'Failed to create table: {str(create_error)}',
+                }
+
+        # Append data to Iceberg table
+        table.append(parquet_table)
+
+        return {
+            'status': 'success',
+            'message': f'Successfully imported {parquet_table.num_rows} rows{" and created new table" if table_created else ""}',
+            'rows_processed': parquet_table.num_rows,
+            'file_processed': os.path.basename(key),
+            'table_created': table_created,
+            'table_uuid': table.metadata.table_uuid,
+        }
+
+    except Exception as e:
+        return {'status': 'error', 'error': str(e)}
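
Both importers share the same load-or-create-then-append pattern against the Iceberg catalog. A stripped-down sketch of that pattern with a generic pyiceberg catalog is shown below; the catalog configuration, identifier, and in-memory data are illustrative, not taken from this package.

import pyarrow as pa
from pyiceberg.catalog import load_catalog
from pyiceberg.exceptions import NoSuchTableError

# In-memory stand-in for the Arrow table read from a CSV or Parquet file
arrow_table = pa.table({'id': [1, 2], 'name': ['alice', 'bob']})

catalog = load_catalog('s3tablescatalog')  # assumes catalog properties are configured
identifier = 'retail_data.customers'

try:
    table = catalog.load_table(identifier)
    table_created = False
except NoSuchTableError:
    # Create the table from the schema PyArrow inferred from the file
    table = catalog.create_table(identifier=identifier, schema=arrow_table.schema)
    table_created = True

table.append(arrow_table)  # pyiceberg checks the Arrow schema against the table schema
print(f'table_created={table_created}, rows={arrow_table.num_rows}')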

awslabs/s3_tables_mcp_server/server.py

@@ -32,7 +32,6 @@ from .utils import set_user_agent_mode
 from awslabs.s3_tables_mcp_server import (
     __version__,
     database,
-    file_processor,
     namespaces,
     resources,
     s3_operations,

@@ -48,6 +47,12 @@ from awslabs.s3_tables_mcp_server.constants import (
     TABLE_BUCKET_NAME_PATTERN,
     TABLE_NAME_FIELD,
 )
+from awslabs.s3_tables_mcp_server.file_processor import (
+    import_csv_to_table as import_csv_to_table_func,
+)
+from awslabs.s3_tables_mcp_server.file_processor import (
+    import_parquet_to_table as import_parquet_to_table_func,
+)
 from datetime import datetime, timezone
 from mcp.server.fastmcp import FastMCP
 from pydantic import Field

@@ -567,32 +572,75 @@ async def query_database(
 
 @app.tool()
 @log_tool_call_with_response
-
+@write_operation
+async def import_csv_to_table(
+    warehouse: Annotated[str, Field(..., description='Warehouse string for Iceberg catalog')],
+    region: Annotated[
+        str, Field(..., description='AWS region for S3Tables/Iceberg REST endpoint')
+    ],
+    namespace: Annotated[str, NAMESPACE_NAME_FIELD],
+    table_name: Annotated[str, TABLE_NAME_FIELD],
     s3_url: Annotated[str, S3_URL_FIELD],
+    uri: Annotated[str, Field(..., description='REST URI for Iceberg catalog')],
+    catalog_name: Annotated[
+        str, Field('s3tablescatalog', description='Catalog name')
+    ] = 's3tablescatalog',
+    rest_signing_name: Annotated[
+        str, Field('s3tables', description='REST signing name')
+    ] = 's3tables',
+    rest_sigv4_enabled: Annotated[str, Field('true', description='Enable SigV4 signing')] = 'true',
 ) -> dict:
-    """
+    """Import data from a CSV file into an S3 table.
 
-    This tool
-
-
-
+    This tool reads data from a CSV file stored in S3 and imports it into an S3 table.
+    If the table doesn't exist, it will be created with a schema inferred from the CSV file.
+    If the table exists, the CSV file schema must be compatible with the table's schema.
+    The tool will validate the schema before attempting to import the data.
 
     Returns error dictionary with status and error message if:
     - URL is not a valid S3 URL
     - File is not a CSV file
     - File cannot be accessed
+    - Table does not exist
+    - CSV headers don't match table schema
     - Any other error occurs
 
+    Example input values:
+        warehouse: 'arn:aws:s3tables:<Region>:<accountID>:bucket/<bucketname>'
+        region: 'us-west-2'
+        namespace: 'retail_data'
+        table_name: 'customers'
+        s3_url: 's3://bucket-name/path/to/file.csv'
+        uri: 'https://s3tables.us-west-2.amazonaws.com/iceberg'
+        catalog_name: 's3tablescatalog'
+        rest_signing_name: 's3tables'
+        rest_sigv4_enabled: 'true'
+
     Permissions:
-    You must have
+    You must have:
+    - s3:GetObject permission for the CSV file
+    - s3tables:GetTable and s3tables:GetTables permissions to access table information
+    - s3tables:PutTableData permission to write to the table
     """
-
+    if uri is None:
+        uri = _default_uri_for_region(region)
+    return await import_csv_to_table_func(
+        warehouse=warehouse,
+        region=region,
+        namespace=namespace,
+        table_name=table_name,
+        s3_url=s3_url,
+        uri=uri,
+        catalog_name=catalog_name,
+        rest_signing_name=rest_signing_name,
+        rest_sigv4_enabled=rest_sigv4_enabled,
+    )
 
 
 @app.tool()
 @log_tool_call_with_response
 @write_operation
-async def
+async def import_parquet_to_table(
     warehouse: Annotated[str, Field(..., description='Warehouse string for Iceberg catalog')],
     region: Annotated[
         str, Field(..., description='AWS region for S3Tables/Iceberg REST endpoint')

@@ -609,29 +657,33 @@ async def import_csv_to_table(
     ] = 's3tables',
     rest_sigv4_enabled: Annotated[str, Field('true', description='Enable SigV4 signing')] = 'true',
 ) -> dict:
-    """Import data from a
+    """Import data from a Parquet file into an S3 table.
 
-    This tool reads data from a
-
-
-
-    To create a table, first use the preview_csv_file tool to get the schema and data format.
-    Then use the create_table tool to create the table.
+    This tool reads data from a Parquet file stored in S3 and imports it into an S3 table.
+    If the table doesn't exist, it will be created with a schema inferred from the Parquet file.
+    If the table exists, the Parquet file schema must be compatible with the table's schema.
+    The tool will validate the schema before attempting to import the data.
 
     Returns error dictionary with status and error message if:
     - URL is not a valid S3 URL
-    - File is not a
+    - File is not a Parquet file
     - File cannot be accessed
-    -
-    - CSV headers don't match table schema
+    - Parquet schema is incompatible with existing table schema
    - Any other error occurs
 
+    Returns success dictionary with:
+    - status: 'success'
+    - message: Success message with row count
+    - rows_processed: Number of rows imported
+    - file_processed: Name of the processed file
+    - table_created: True if a new table was created
+
     Example input values:
         warehouse: 'arn:aws:s3tables:<Region>:<accountID>:bucket/<bucketname>'
         region: 'us-west-2'
         namespace: 'retail_data'
         table_name: 'customers'
-        s3_url: 's3://bucket-name/path/to/file.
+        s3_url: 's3://bucket-name/path/to/file.parquet'
         uri: 'https://s3tables.us-west-2.amazonaws.com/iceberg'
         catalog_name: 's3tablescatalog'
         rest_signing_name: 's3tables'

@@ -639,14 +691,14 @@ async def import_csv_to_table(
 
     Permissions:
     You must have:
-    - s3:GetObject permission for the
-    - s3tables:GetDatabase and s3tables:GetDatabases permissions to access database information
+    - s3:GetObject permission for the Parquet file
     - s3tables:GetTable and s3tables:GetTables permissions to access table information
     - s3tables:PutTableData permission to write to the table
+    - s3tables:CreateTable permission (if table doesn't exist)
     """
     if uri is None:
         uri = _default_uri_for_region(region)
-    return await
+    return await import_parquet_to_table_func(
         warehouse=warehouse,
         region=region,
         namespace=namespace,
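
On the client side, the two new tools are ordinary MCP tools. A hypothetical stdio client call against this server might look like the sketch below; the console-script name and every argument value are placeholders and assumptions, not part of this diff.

import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client


async def main():
    params = StdioServerParameters(
        command='awslabs.s3-tables-mcp-server',  # assumed console-script name
        args=[],
    )
    async with stdio_client(params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            result = await session.call_tool(
                'import_csv_to_table',
                {
                    'warehouse': 'arn:aws:s3tables:us-west-2:111122223333:bucket/example-bucket',
                    'region': 'us-west-2',
                    'namespace': 'retail_data',
                    'table_name': 'customers',
                    's3_url': 's3://example-bucket/imports/customers.csv',
                    'uri': 'https://s3tables.us-west-2.amazonaws.com/iceberg',
                },
            )
            print(result.content)


asyncio.run(main())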

{awslabs_s3_tables_mcp_server-0.0.2.dist-info → awslabs_s3_tables_mcp_server-0.0.4.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: awslabs.s3-tables-mcp-server
-Version: 0.0.2
+Version: 0.0.4
 Summary: An AWS Labs Model Context Protocol (MCP) server for awslabs.s3-tables-mcp-server
 Project-URL: homepage, https://awslabs.github.io/mcp/
 Project-URL: docs, https://awslabs.github.io/mcp/servers/s3-tables-mcp-server/

{awslabs_s3_tables_mcp_server-0.0.2.dist-info → awslabs_s3_tables_mcp_server-0.0.4.dist-info}/RECORD (renamed)

@@ -1,21 +1,23 @@
 awslabs/__init__.py,sha256=BHp8_uaBohApyLlmVWvYVe5bSrH59FvLJ5cNTigMV_8,644
-awslabs/s3_tables_mcp_server/__init__.py,sha256=
+awslabs/s3_tables_mcp_server/__init__.py,sha256=xRlx7sZhHIEshTK2wTt_3PvKa2tjm26ZydI5L2wYK0c,754
 awslabs/s3_tables_mcp_server/constants.py,sha256=qCWY9A9PAQXdVz-anO26zbQ72Dp79nGM7xeLR062a_o,4971
 awslabs/s3_tables_mcp_server/database.py,sha256=YorxcSx-9typfQ5W_LzwNPZkP47u__QSLJlp0fBsZLg,3851
-awslabs/s3_tables_mcp_server/file_processor.py,sha256=BZR-yMFoB4NKJb1hzD3pYT0ziLS4QiEB5iLWSfDju1U,17031
 awslabs/s3_tables_mcp_server/models.py,sha256=zWTFJLBhIZRLEgOCTyNcGvbItxqYbFJKH6se1EzXDjY,8097
 awslabs/s3_tables_mcp_server/namespaces.py,sha256=KZqxJiEnlpxkqvbfygezbr0szwyDP2O0J6osyiPUzwg,2071
 awslabs/s3_tables_mcp_server/resources.py,sha256=PXZo0sTVn34tXJ4mlw_OS90p12SNoLZs4Re0gV815wk,8281
 awslabs/s3_tables_mcp_server/s3_operations.py,sha256=Zq3oe-uHuKbW87b_WQyM-6HZ0_ikbgiagb2SVesltdg,1656
-awslabs/s3_tables_mcp_server/server.py,sha256=
+awslabs/s3_tables_mcp_server/server.py,sha256=cvXDTZuK1sGpYfjLbF6iLGe49BSA0yx4rSp73UEBcvE,32008
 awslabs/s3_tables_mcp_server/table_buckets.py,sha256=JHmpB_P9h0Hz5Uis25_GPTD1G-mIODVwjaswwIGyCS4,4471
 awslabs/s3_tables_mcp_server/tables.py,sha256=ITnRDHHrtRWLsRhff4TP4B7gGT_jRXy994oxK3x10a4,10143
 awslabs/s3_tables_mcp_server/utils.py,sha256=SReyS3KsdikI9ycL5RsvtVI7MiRnA1W9bTiXGKf1lHc,4517
 awslabs/s3_tables_mcp_server/engines/__init__.py,sha256=O4wlFva3THWmjfaXfJAwi29mxJSKIhM0jcebVfd3S5U,615
-awslabs/s3_tables_mcp_server/engines/pyiceberg.py,sha256=
-
-
-
-awslabs_s3_tables_mcp_server-0.0.
-awslabs_s3_tables_mcp_server-0.0.
-awslabs_s3_tables_mcp_server-0.0.
+awslabs/s3_tables_mcp_server/engines/pyiceberg.py,sha256=wzkySQZgx7L8Fn2Oqk8Yz4V-hQZDUempd8q0IwvCY_4,5784
+awslabs/s3_tables_mcp_server/file_processor/__init__.py,sha256=8PeggFRY3ZKBdxcFPEqSSHkSJBZ57eOs-z0fqkMHn9E,978
+awslabs/s3_tables_mcp_server/file_processor/csv.py,sha256=Sngc5mfJDLxQaINBUJLBn5OLc842rv9FqqcJ1upK6iw,4406
+awslabs/s3_tables_mcp_server/file_processor/parquet.py,sha256=Lr7mtqsK9jqlWokQv74dgdEgYmNKlCJ869yNNMrm69o,4189
+awslabs_s3_tables_mcp_server-0.0.4.dist-info/METADATA,sha256=KrjJ9PUydW8GhMHSlFuIH_HOH-fJBhiMDa0qK_smSdo,11511
+awslabs_s3_tables_mcp_server-0.0.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+awslabs_s3_tables_mcp_server-0.0.4.dist-info/entry_points.txt,sha256=WRA45Bi2dVY5hskxkka_e7BAGRqG1KiW3ImTBnHSyLs,90
+awslabs_s3_tables_mcp_server-0.0.4.dist-info/licenses/LICENSE,sha256=CeipvOyAZxBGUsFoaFqwkx54aPnIKEtm9a5u2uXxEws,10142
+awslabs_s3_tables_mcp_server-0.0.4.dist-info/licenses/NOTICE,sha256=jm-1A_8i-wl7KYs2Ynj2A29vXWJCMKLHmGfy4P_B51Y,96
+awslabs_s3_tables_mcp_server-0.0.4.dist-info/RECORD,,

awslabs/s3_tables_mcp_server/file_processor.py (removed)

@@ -1,485 +0,0 @@
-# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""AWS S3 Tables MCP Server file processing module.
-
-This module provides functionality for processing and analyzing uploaded files,
-particularly focusing on CSV file handling and preview capabilities.
-"""
-
-import csv
-import os
-import pyarrow as pa
-import re
-import uuid
-from .utils import get_s3_client, pyiceberg_load_catalog
-from datetime import date, datetime, time
-from decimal import Decimal
-from io import StringIO
-from pyiceberg.types import (
-    BinaryType,
-    BooleanType,
-    DateType,
-    DecimalType,
-    DoubleType,
-    FixedType,
-    FloatType,
-    IntegerType,
-    ListType,
-    LongType,
-    MapType,
-    StringType,
-    StructType,
-    TimestampType,
-    TimestamptzType,
-    TimeType,
-    UUIDType,
-)
-from typing import Dict, List, Optional
-from urllib.parse import urlparse
-
-
-def validate_s3_url(s3_url: str) -> tuple[bool, Optional[str], Optional[str], Optional[str]]:
-    """Validate an S3 URL and extract its components.
-
-    Args:
-        s3_url: The S3 URL to validate (format: s3://bucket-name/key)
-
-    Returns:
-        Tuple containing:
-        - bool: Whether the URL is valid
-        - str: Error message if invalid, None if valid
-        - str: Bucket name if valid, None if invalid
-        - str: Object key if valid, None if invalid
-    """
-    try:
-        parsed = urlparse(s3_url)
-        if parsed.scheme != 's3':
-            return False, f"Invalid URL scheme: {parsed.scheme}. Must be 's3://'", None, None
-
-        if not parsed.netloc:
-            return False, 'Missing bucket name in S3 URL', None, None
-
-        bucket = parsed.netloc
-        key = parsed.path.lstrip('/')
-
-        if not key:
-            return False, 'Missing object key in S3 URL', None, None
-
-        return True, None, bucket, key
-    except Exception as e:
-        return False, f'Error parsing S3 URL: {str(e)}', None, None
-
-
-def preview_csv_structure(s3_url: str) -> Dict:
-    """Preview the structure of a CSV file stored in S3 by reading its headers and first row.
-
-    This function provides a quick preview of a CSV file's structure by reading
-    only the headers and first row of data from an S3 location. It's useful for
-    understanding the schema and data format without downloading the entire file.
-
-    Args:
-        s3_url: The S3 URL of the CSV file (format: s3://bucket-name/key)
-
-    Returns:
-        A dictionary containing:
-        - headers: List of column names from the first row
-        - first_row: Dictionary mapping column names to their values from the first data row (empty if no data)
-        - total_columns: Number of columns in the CSV
-        - file_name: Name of the CSV file
-
-    Returns error dictionary with status and error message if:
-    - URL is not a valid S3 URL
-    - File is not a CSV file
-    - File cannot be accessed
-    - Any other error occurs
-    """
-    try:
-        # Validate S3 URL
-        is_valid, error_msg, bucket, key = validate_s3_url(s3_url)
-        if not is_valid:
-            return {'status': 'error', 'error': error_msg}
-
-        # At this point, bucket and key are guaranteed to be non-None strings
-        if bucket is None or key is None:
-            return {'status': 'error', 'error': 'Invalid S3 URL: bucket or key is None'}
-
-        # Check if file has .csv extension
-        if not key.lower().endswith('.csv'):
-            return {
-                'status': 'error',
-                'error': f'File {key} is not a CSV file. Only .csv files are supported.',
-            }
-
-        # Get S3 client
-        s3_client = get_s3_client()
-
-        # Get the object from S3, only downloading first 8KB (should be enough for headers and first row)
-        response = s3_client.get_object(
-            Bucket=bucket,
-            Key=key,
-            Range='bytes=0-32768',  # First 32KB
-        )
-
-        # Read the CSV content
-        csv_content = response['Body'].read().decode('utf-8')
-
-        # Split content into lines
-        lines = csv_content.splitlines()
-        if not lines:
-            return {'status': 'error', 'error': 'File is empty'}
-
-        # Parse the headers
-        headers = next(csv.reader([lines[0]]), [])
-
-        # Try to get first row if it exists
-        first_row = next(csv.reader([lines[1]]), []) if len(lines) > 1 else []
-
-        # Create a dictionary mapping headers to first row values
-        first_row_dict = dict(zip(headers, first_row)) if headers and first_row else {}
-
-        return {
-            'headers': headers,
-            'first_row': first_row_dict,
-            'total_columns': len(headers),
-            'file_name': os.path.basename(key),
-        }
-    except Exception as e:
-        return {'status': 'error', 'error': str(e)}
-
-
-def convert_value(value: Optional[str], iceberg_type):
-    """Convert a string value to the appropriate type based on Iceberg schema type.
-
-    Args:
-        value: The string value to convert (can be None)
-        iceberg_type: The Iceberg type to convert to
-
-    Returns:
-        The converted value of the appropriate type
-
-    Raises:
-        ValueError: If the value cannot be converted to the target type
-        NotImplementedError: For unsupported complex types
-    """
-    if value is None or value == '':
-        return None
-
-    if isinstance(iceberg_type, BooleanType):
-        return value.lower() in ('true', '1', 'yes')
-
-    elif isinstance(iceberg_type, IntegerType):
-        return int(value)
-
-    elif isinstance(iceberg_type, LongType):
-        return int(value)
-
-    elif isinstance(iceberg_type, FloatType):
-        return float(value)
-
-    elif isinstance(iceberg_type, DoubleType):
-        return float(value)
-
-    elif isinstance(iceberg_type, DecimalType):
-        return Decimal(value)
-
-    elif isinstance(iceberg_type, DateType):
-        return date.fromisoformat(value)
-
-    elif isinstance(iceberg_type, TimeType):
-        return time.fromisoformat(value)
-
-    elif isinstance(iceberg_type, TimestampType):
-        return datetime.fromisoformat(value)
-
-    elif isinstance(iceberg_type, TimestamptzType):
-        return datetime.fromisoformat(value)  # Ensure it's tz-aware if needed
-
-    elif isinstance(iceberg_type, StringType):
-        return str(value)
-
-    elif isinstance(iceberg_type, UUIDType):
-        return uuid.UUID(value)
-
-    elif isinstance(iceberg_type, BinaryType) or isinstance(iceberg_type, FixedType):
-        return bytes.fromhex(value)
-
-    elif isinstance(iceberg_type, ListType):
-        # naive split for example; you'd want better parsing logic
-        return [convert_value(v.strip(), iceberg_type.element_type) for v in value.split(',')]
-
-    elif isinstance(iceberg_type, MapType):
-        # naive: "key1:value1,key2:value2"
-        return {
-            k.strip(): convert_value(v.strip(), iceberg_type.value_type)
-            for k, v in (item.split(':') for item in value.split(','))
-        }
-
-    elif isinstance(iceberg_type, StructType):
-        raise NotImplementedError('Nested structs need structured input like JSON or dict.')
-
-    else:
-        raise ValueError(f'Unsupported Iceberg type: {iceberg_type}')
-
-
-def create_pyarrow_schema_from_iceberg(schema) -> pa.Schema:
-    """Create a PyArrow schema from an Iceberg schema, supporting basic types and decimals."""
-
-    def convert_iceberg_type_to_pyarrow(iceberg_type_str: str):
-        """Convert an Iceberg type string to a PyArrow type."""
-        iceberg_type_str = iceberg_type_str.lower()
-
-        if iceberg_type_str == 'boolean':
-            return pa.bool_()
-        elif iceberg_type_str == 'int':
-            return pa.int32()
-        elif iceberg_type_str == 'long':
-            return pa.int64()
-        elif iceberg_type_str == 'float':
-            return pa.float32()
-        elif iceberg_type_str == 'double':
-            return pa.float64()
-        elif iceberg_type_str == 'date':
-            return pa.date32()
-        elif iceberg_type_str == 'time':
-            return pa.time64('us')
-        elif iceberg_type_str == 'timestamp':
-            return pa.timestamp('us')
-        elif iceberg_type_str == 'timestamptz':
-            return pa.timestamp('us', tz='UTC')
-        elif iceberg_type_str == 'string':
-            return pa.string()
-        elif iceberg_type_str == 'uuid':
-            return pa.string()
-        elif iceberg_type_str == 'binary':
-            return pa.binary()
-        elif iceberg_type_str.startswith('fixed'):
-            size_match = re.match(r'fixed\((\d+)\)', iceberg_type_str)
-            return pa.binary(int(size_match.group(1))) if size_match else pa.binary()
-        elif iceberg_type_str.startswith('decimal'):
-            decimal_match = re.match(r'decimal\((\d+),\s*(\d+)\)', iceberg_type_str)
-            if decimal_match:
-                precision = int(decimal_match.group(1))
-                scale = int(decimal_match.group(2))
-                if precision <= 18:
-                    return pa.decimal128(
-                        precision, scale
-                    )  # Will use INT64 encoding for small precision
-                else:
-                    return pa.decimal256(precision, scale)  # For large precision decimals
-            else:
-                raise ValueError(f'Invalid decimal type format: {iceberg_type_str}')
-        else:
-            raise ValueError(f'Unsupported Iceberg type: {iceberg_type_str}')
-
-    # Build PyArrow schema
-    pa_fields = []
-    for field in schema.fields:
-        name = field.name
-        iceberg_type_str = str(field.field_type)
-        try:
-            pa_type = convert_iceberg_type_to_pyarrow(iceberg_type_str)
-        except ValueError as e:
-            raise ValueError(f"Error in field '{name}': {e}")
-
-        pa_fields.append(pa.field(name, pa_type, nullable=not field.required))
-
-    return pa.schema(pa_fields)
-
-
-def process_chunk(chunk: List[Dict], table, chunk_name: str = 'Chunk') -> Dict:
-    """Process a chunk of data by converting it to a PyArrow table and appending to the table.
-
-    Args:
-        chunk: List of dictionaries representing the data rows
-        table: The Iceberg table to append data to
-        chunk_name: Name identifier for the chunk (for logging purposes)
-
-    Returns:
-        Dictionary with status and message
-    """
-    try:
-        # Get the Iceberg schema and create PyArrow schema
-        schema = table.schema()
-        pyarrow_schema = create_pyarrow_schema_from_iceberg(schema)
-
-        # Convert list of dictionaries to PyArrow table with proper schema
-        table_data = pa.Table.from_pylist(chunk, schema=pyarrow_schema)
-
-        table.append(table_data)
-
-        return {
-            'status': 'success',
-            'message': f'Successfully processed {len(chunk)} rows in {chunk_name.lower()}',
-        }
-
-    except Exception as e:
-        return {'status': 'error', 'error': f'Error inserting {chunk_name.lower()}: {str(e)}'}
-
-
-async def import_csv_to_table(
-    warehouse: str,
-    region: str,
-    namespace: str,
-    table_name: str,
-    s3_url: str,
-    uri: str = 'https://s3tables.us-west-2.amazonaws.com/iceberg',
-    catalog_name: str = 's3tablescatalog',
-    rest_signing_name: str = 's3tables',
-    rest_sigv4_enabled: str = 'true',
-) -> Dict:
-    """Import data from a CSV file into an S3 table.
-
-    This function reads data from a CSV file stored in S3 and imports it into an existing S3 table.
-    The CSV file must have headers that match the table's schema. The function will validate the CSV structure
-    before attempting to import the data.
-
-    Args:
-        warehouse: Warehouse string for Iceberg catalog
-        region: AWS region for S3Tables/Iceberg REST endpoint
-        namespace: The namespace containing the table
-        table_name: The name of the table to import data into
-        s3_url: The S3 URL of the CSV file (format: s3://bucket-name/key)
-        uri: REST URI for Iceberg catalog
-        catalog_name: Catalog name
-        rest_signing_name: REST signing name
-        rest_sigv4_enabled: Enable SigV4 signing
-
-    Returns:
-        A dictionary containing:
-        - status: 'success' or 'error'
-        - message: Success message or error details
-        - rows_processed: Number of rows processed (on success)
-        - file_processed: Name of the processed file
-        - csv_headers: List of CSV headers
-
-    Returns error dictionary with status and error message if:
-    - URL is not a valid S3 URL
-    - File is not a CSV file
-    - File cannot be accessed
-    - Table does not exist
-    - CSV headers don't match table schema
-    - Any other error occurs
-    """
-    # Validate S3 URL
-    is_valid, error_msg, bucket, key = validate_s3_url(s3_url)
-    if not is_valid:
-        return {'status': 'error', 'error': error_msg}
-
-    if bucket is None or key is None:
-        return {'status': 'error', 'error': 'Invalid S3 URL: bucket or key is None'}
-
-    if not key.lower().endswith('.csv'):
-        return {
-            'status': 'error',
-            'error': f'File {key} is not a CSV file. Only .csv files are supported.',
-        }
-
-    try:
-        # Load catalog using provided parameters (see pyiceberg.py style)
-        catalog = pyiceberg_load_catalog(
-            catalog_name,
-            warehouse,
-            uri,
-            region,
-            rest_signing_name,
-            rest_sigv4_enabled,
-        )
-
-        # Load existing table
-        table = catalog.load_table(f'{namespace}.{table_name}')
-
-        # Get schema information
-        schema = table.schema()
-
-        # Get S3 client
-        s3_client = get_s3_client()
-
-        # Get the CSV file from S3
-        response = s3_client.get_object(Bucket=bucket, Key=key)
-        csv_content = response['Body'].read().decode('utf-8')
-
-        # Read CSV content
-        csv_reader = csv.DictReader(StringIO(csv_content))
-
-        # Validate headers against schema
-        csv_headers = csv_reader.fieldnames
-        schema_field_names = {field.name for field in schema.fields}
-
-        if not csv_headers:
-            return {'status': 'error', 'error': 'CSV file has no headers'}
-
-        missing_columns = schema_field_names - set(csv_headers)
-        if missing_columns:
-            return {
-                'status': 'error',
-                'error': f'CSV is missing required columns: {", ".join(missing_columns)}',
-            }
-
-        # Process rows in chunks
-        chunk_size = 5000
-        rows_processed = 0
-        current_chunk = []
-
-        for row in csv_reader:
-            # Transform row data according to schema types
-            transformed_row = {}
-            for field in schema.fields:
-                value = row.get(field.name)
-
-                # Handle required fields
-                if field.required and (value is None or value == ''):
-                    return {
-                        'status': 'error',
-                        'error': f'Required field {field.name} is missing or empty in row {rows_processed + 1}',
-                    }
-
-                # Transform value based on field type
-                try:
-                    if value is None or value == '':
-                        transformed_row[field.name] = None
-                    else:
-                        transformed_row[field.name] = convert_value(value, field.field_type)
-                except (ValueError, TypeError) as e:
-                    return {
-                        'status': 'error',
-                        'error': f'Error converting value for field {field.name} in row {rows_processed + 1}: {str(e)}',
-                    }
-
-            current_chunk.append(transformed_row)
-            rows_processed += 1
-
-            # Process chunk when it reaches the chunk size
-            if len(current_chunk) >= chunk_size:
-                result = process_chunk(current_chunk, table, 'Chunk')
-                if result['status'] == 'error':
-                    return result
-                current_chunk = []
-
-        # Process any remaining rows
-        if current_chunk:
-            result = process_chunk(current_chunk, table, 'Final Chunk')
-            if result['status'] == 'error':
-                return result
-
-        return {
-            'status': 'success',
-            'message': f'Successfully processed {rows_processed} rows',
-            'rows_processed': rows_processed,
-            'file_processed': os.path.basename(key),
-            'csv_headers': csv_headers,
-        }
-
-    except Exception as e:
-        return {'status': 'error', 'error': str(e)}
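
The removed module built each chunk with an explicit PyArrow schema derived from the Iceberg schema, whereas the 0.0.4 importers let PyArrow infer the schema from the file itself. A tiny sketch of the old-style explicit-schema construction is shown below; the column names and types are illustrative only.

import pyarrow as pa

# Explicit schema, the role create_pyarrow_schema_from_iceberg() used to play
schema = pa.schema([
    pa.field('id', pa.int64(), nullable=False),
    pa.field('name', pa.string(), nullable=True),
])

# Rows already coerced to Python types, as convert_value() used to do
chunk = [{'id': 1, 'name': 'alice'}, {'id': 2, 'name': 'bob'}]

table_data = pa.Table.from_pylist(chunk, schema=schema)
print(table_data.schema)
# table.append(table_data)  # the old process_chunk() then appended each chunk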

{awslabs_s3_tables_mcp_server-0.0.2.dist-info → awslabs_s3_tables_mcp_server-0.0.4.dist-info}/WHEEL
{awslabs_s3_tables_mcp_server-0.0.2.dist-info → awslabs_s3_tables_mcp_server-0.0.4.dist-info}/entry_points.txt
{awslabs_s3_tables_mcp_server-0.0.2.dist-info → awslabs_s3_tables_mcp_server-0.0.4.dist-info}/licenses/LICENSE
{awslabs_s3_tables_mcp_server-0.0.2.dist-info → awslabs_s3_tables_mcp_server-0.0.4.dist-info}/licenses/NOTICE
RENAMED: files without changes