awslabs.s3-tables-mcp-server 0.0.13__tar.gz → 0.0.15__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/Dockerfile +2 -2
  2. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/PKG-INFO +3 -3
  3. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/awslabs/s3_tables_mcp_server/__init__.py +1 -1
  4. awslabs_s3_tables_mcp_server-0.0.15/awslabs/s3_tables_mcp_server/engines/pyiceberg.py +248 -0
  5. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/awslabs/s3_tables_mcp_server/file_processor/csv.py +1 -1
  6. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/awslabs/s3_tables_mcp_server/file_processor/utils.py +54 -15
  7. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/awslabs/s3_tables_mcp_server/server.py +84 -17
  8. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/pyproject.toml +3 -3
  9. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/tests/test_file_processor_utils.py +334 -17
  10. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/tests/test_pyiceberg.py +462 -24
  11. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/uv.lock +139 -74
  12. awslabs_s3_tables_mcp_server-0.0.13/awslabs/s3_tables_mcp_server/engines/pyiceberg.py +0 -154
  13. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/.gitignore +0 -0
  14. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/.python-version +0 -0
  15. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/CHANGELOG.md +0 -0
  16. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/CONTEXT.md +0 -0
  17. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/LICENSE +0 -0
  18. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/NOTICE +0 -0
  19. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/README.md +0 -0
  20. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/awslabs/__init__.py +0 -0
  21. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/awslabs/s3_tables_mcp_server/constants.py +0 -0
  22. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/awslabs/s3_tables_mcp_server/database.py +0 -0
  23. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/awslabs/s3_tables_mcp_server/engines/__init__.py +0 -0
  24. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/awslabs/s3_tables_mcp_server/file_processor/__init__.py +0 -0
  25. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/awslabs/s3_tables_mcp_server/file_processor/parquet.py +0 -0
  26. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/awslabs/s3_tables_mcp_server/models.py +0 -0
  27. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/awslabs/s3_tables_mcp_server/namespaces.py +0 -0
  28. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/awslabs/s3_tables_mcp_server/resources.py +0 -0
  29. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/awslabs/s3_tables_mcp_server/s3_operations.py +0 -0
  30. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/awslabs/s3_tables_mcp_server/table_buckets.py +0 -0
  31. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/awslabs/s3_tables_mcp_server/tables.py +0 -0
  32. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/awslabs/s3_tables_mcp_server/utils.py +0 -0
  33. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/docker-healthcheck.sh +0 -0
  34. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/tests/test_csv.py +0 -0
  35. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/tests/test_database.py +0 -0
  36. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/tests/test_init.py +0 -0
  37. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/tests/test_main.py +0 -0
  38. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/tests/test_namespaces.py +0 -0
  39. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/tests/test_parquet.py +0 -0
  40. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/tests/test_resources.py +0 -0
  41. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/tests/test_s3_operations.py +0 -0
  42. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/tests/test_server.py +0 -0
  43. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/tests/test_table_buckets.py +0 -0
  44. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/tests/test_tables.py +0 -0
  45. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/tests/test_utils.py +0 -0
  46. {awslabs_s3_tables_mcp_server-0.0.13 → awslabs_s3_tables_mcp_server-0.0.15}/uv-requirements.txt +0 -0
Dockerfile
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 # dependabot should continue to update this to the latest hash.
-FROM public.ecr.aws/docker/library/python:3.13-slim-bookworm@sha256:61169c2bdb8e6bb44a8dfad33f569d324d52f079fded9a204b322a6fb9c9f799 AS uv
+FROM public.ecr.aws/docker/library/python:3.13-slim-bookworm@sha256:e66df2153a7cc47b4438848efb65e2d9442db4330db9befaee5107fc75464959 AS uv
 
 # Install the project into `/app`
 WORKDIR /app
@@ -51,7 +51,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # # Make the directory just in case it doesn't exist
 # RUN mkdir -p /root/.local
 
-FROM public.ecr.aws/docker/library/python:3.13-slim-bookworm@sha256:61169c2bdb8e6bb44a8dfad33f569d324d52f079fded9a204b322a6fb9c9f799
+FROM public.ecr.aws/docker/library/python:3.13-slim-bookworm@sha256:e66df2153a7cc47b4438848efb65e2d9442db4330db9befaee5107fc75464959
 
 # Place executables in the environment at the front of the path and include other binaries
 ENV PATH="/app/.venv/bin:$PATH:/usr/sbin" \
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: awslabs.s3-tables-mcp-server
-Version: 0.0.13
+Version: 0.0.15
 Summary: An AWS Labs Model Context Protocol (MCP) server for awslabs.s3-tables-mcp-server
 Project-URL: homepage, https://awslabs.github.io/mcp/
 Project-URL: docs, https://awslabs.github.io/mcp/servers/s3-tables-mcp-server/
@@ -25,9 +25,9 @@ Requires-Dist: boto3==1.40.8
 Requires-Dist: daft==0.5.8
 Requires-Dist: loguru==0.7.3
 Requires-Dist: mcp[cli]==1.11.0
-Requires-Dist: pyarrow==20.0.0
+Requires-Dist: pyarrow==22.0.0
 Requires-Dist: pydantic==2.9.2
-Requires-Dist: pyiceberg==0.9.1
+Requires-Dist: pyiceberg==0.10.0
 Requires-Dist: sqlparse==0.5.3
 Description-Content-Type: text/markdown
 
awslabs/s3_tables_mcp_server/__init__.py
@@ -15,4 +15,4 @@
 # This file is part of the awslabs namespace.
 # It is intentionally minimal to support PEP 420 namespace packages.
 
-__version__ = '0.0.13'
+__version__ = '0.0.15'
awslabs/s3_tables_mcp_server/engines/pyiceberg.py (new in 0.0.15)
@@ -0,0 +1,248 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Engine for interacting with Iceberg tables using pyiceberg and daft (read-only)."""
+
+import pyarrow as pa
+from ..utils import pyiceberg_load_catalog
+from daft import Catalog as DaftCatalog
+from daft.session import Session
+from datetime import datetime
+from pydantic import BaseModel
+
+# pyiceberg and daft imports
+from typing import Any, Dict, Optional
+
+
+def convert_temporal_fields(rows: list[dict], arrow_schema: pa.Schema) -> list[dict]:
+    """Convert string temporal fields to appropriate datetime objects based on Arrow schema.
+
+    Args:
+        rows: List of row dictionaries with string temporal values
+        arrow_schema: PyArrow schema defining field types
+
+    Returns:
+        List of row dictionaries with converted temporal values
+    """
+    converted_rows = []
+
+    for row in rows:
+        converted_row = {}
+        for field_name, value in row.items():
+            # Early skip for non-string values
+            if not isinstance(value, str):
+                converted_row[field_name] = value
+                continue
+
+            # Get the field type from schema
+            field = arrow_schema.field(field_name)
+            field_type = field.type
+
+            # Date32 or Date64 - calendar date without timezone or time
+            if pa.types.is_date(field_type):
+                # Format: "2025-03-14"
+                converted_row[field_name] = datetime.strptime(value, '%Y-%m-%d').date()
+
+            # Time64 - time of day, microsecond precision, without date or timezone
+            elif pa.types.is_time(field_type):
+                # Format: "17:10:34.123456" or "17:10:34"
+                fmt = '%H:%M:%S.%f' if '.' in value else '%H:%M:%S'
+                converted_row[field_name] = datetime.strptime(value, fmt).time()
+
+            # Timestamp without timezone
+            elif pa.types.is_timestamp(field_type) and field_type.tz is None:
+                # Format: "2025-03-14 17:10:34.123456" or "2025-03-14T17:10:34.123456"
+                value_normalized = value.replace('T', ' ')
+                if '.' in value_normalized:
+                    # Truncate nanoseconds to microseconds if needed
+                    parts = value_normalized.split('.')
+                    if len(parts[1]) > 6:
+                        value_normalized = f'{parts[0]}.{parts[1][:6]}'
+                    fmt = '%Y-%m-%d %H:%M:%S.%f'
+                else:
+                    fmt = '%Y-%m-%d %H:%M:%S'
+                converted_row[field_name] = datetime.strptime(value_normalized, fmt)
+
+            # Timestamp with timezone (stored in UTC)
+            elif pa.types.is_timestamp(field_type) and field_type.tz is not None:
+                # Format: "2025-03-14 17:10:34.123456-07" or "2025-03-14T17:10:34.123456+00:00"
+                value_normalized = value.replace('T', ' ')
+                from datetime import timezone
+
+                # Truncate nanoseconds to microseconds if present
+                if '.' in value_normalized:
+                    # Split on timezone indicator (+ or -)
+                    # Find the last occurrence of + or - which should be the timezone
+                    tz_idx = max(value_normalized.rfind('+'), value_normalized.rfind('-'))
+                    if tz_idx > 10:  # Make sure it's not the date separator
+                        timestamp_part = value_normalized[:tz_idx]
+                        tz_part = value_normalized[tz_idx:]
+
+                        # Truncate fractional seconds to 6 digits
+                        if '.' in timestamp_part:
+                            parts = timestamp_part.split('.')
+                            if len(parts[1]) > 6:
+                                timestamp_part = f'{parts[0]}.{parts[1][:6]}'
+
+                        value_normalized = timestamp_part + tz_part
+
+                # Try different timezone formats
+                for fmt in [
+                    '%Y-%m-%d %H:%M:%S.%f%z',
+                    '%Y-%m-%d %H:%M:%S%z',
+                    '%Y-%m-%d %H:%M:%S.%f',
+                    '%Y-%m-%d %H:%M:%S',
+                ]:
+                    try:
+                        dt = datetime.strptime(value_normalized, fmt)
+                        if dt.tzinfo is None:
+                            dt = dt.replace(tzinfo=timezone.utc)
+                        converted_row[field_name] = dt.astimezone(timezone.utc)
+                        break
+                    except ValueError:
+                        continue
+                else:
+                    raise ValueError(
+                        f'Could not parse timestamp with timezone: {value} for field {field_name}'
+                    )
+
+            else:
+                # Not a temporal field, keep as is
+                converted_row[field_name] = value
+
+        converted_rows.append(converted_row)
+
+    return converted_rows
+
+
+class PyIcebergConfig(BaseModel):
+    """Configuration for PyIceberg/Daft connection."""
+
+    warehouse: str  # e.g. 'arn:aws:s3tables:us-west-2:484907528679:bucket/customer-data-bucket'
+    uri: str  # e.g. 'https://s3tables.us-west-2.amazonaws.com/iceberg'
+    region: str  # e.g. 'us-west-2'
+    namespace: str  # e.g. 'retail_data'
+    catalog_name: str = 's3tablescatalog'  # default
+    rest_signing_name: str = 's3tables'
+    rest_sigv4_enabled: str = 'true'
+
+
+class PyIcebergEngine:
+    """Engine for read-only queries on Iceberg tables using pyiceberg and daft."""
+
+    def __init__(self, config: PyIcebergConfig):
+        """Initialize the PyIcebergEngine with the given configuration.
+
+        Args:
+            config: PyIcebergConfig object containing connection parameters.
+        """
+        self.config = config
+        self._catalog: Optional[Any] = None
+        self._session: Optional[Session] = None
+        self._initialize_connection()
+
+    def _initialize_connection(self):
+        try:
+            self._catalog = pyiceberg_load_catalog(
+                self.config.catalog_name,
+                self.config.warehouse,
+                self.config.uri,
+                self.config.region,
+                self.config.rest_signing_name,
+                self.config.rest_sigv4_enabled,
+            )
+            self._session = Session()
+            self._session.attach(DaftCatalog.from_iceberg(self._catalog))
+            self._session.set_namespace(self.config.namespace)
+        except Exception as e:
+            raise ConnectionError(f'Failed to initialize PyIceberg connection: {str(e)}')
+
+    def execute_query(self, query: str) -> Dict[str, Any]:
+        """Execute a SQL query against the Iceberg catalog using Daft.
+
+        Args:
+            query: SQL query to execute
+
+        Returns:
+            Dict containing:
+                - columns: List of column names
+                - rows: List of rows, where each row is a list of values
+        """
+        if not self._session:
+            raise ConnectionError('No active session for PyIceberg/Daft')
+        try:
+            result = self._session.sql(query)
+            if result is None:
+                raise Exception('Query execution returned None result')
+            df = result.collect()
+            columns = df.column_names
+            rows = df.to_pylist()
+            return {
+                'columns': columns,
+                'rows': [list(row.values()) for row in rows],
+            }
+        except Exception as e:
+            raise Exception(f'Error executing query: {str(e)}')
+
+    def test_connection(self) -> bool:
+        """Test the connection by listing namespaces."""
+        if not self._session:
+            return False
+        try:
+            _ = self._session.list_namespaces()
+            return True
+        except Exception:
+            return False
+
+    def append_rows(self, table_name: str, rows: list[dict]) -> None:
+        """Append rows to an Iceberg table using pyiceberg.
+
+        Args:
+            table_name: The name of the table (e.g., 'namespace.tablename' or just 'tablename' if namespace is set)
+            rows: List of dictionaries, each representing a row to append
+
+        Raises:
+            Exception: If appending fails
+        """
+        if not self._catalog:
+            raise ConnectionError('No active catalog for PyIceberg')
+        try:
+            # If table_name does not contain a dot, prepend the namespace
+            if '.' not in table_name:
+                full_table_name = f'{self.config.namespace}.{table_name}'
+            else:
+                full_table_name = table_name
+
+            # Load the Iceberg table
+            table = self._catalog.load_table(full_table_name)
+
+            # Convert Iceberg schema to Arrow schema to ensure types/order match
+            arrow_schema = table.schema().as_arrow()
+
+            # Convert temporal fields from strings to datetime objects
+            converted_rows = convert_temporal_fields(rows, arrow_schema)
+
+            # Create PyArrow table directly from pylist with schema validation
+            try:
+                pa_table = pa.Table.from_pylist(converted_rows, schema=arrow_schema)
+            except pa.ArrowInvalid as e:
+                raise ValueError(
+                    f'Schema mismatch detected: {e}. Please ensure your data matches the table schema.'
+                )
+
+            # Append the PyArrow table to the Iceberg table
+            table.append(pa_table)
+
+        except Exception as e:
+            raise Exception(f'Error appending rows: {str(e)}')
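For orientation, here is a minimal usage sketch of the new engine, based only on the code above. The warehouse ARN, namespace, table name, and row values are placeholders, and the import path assumes the installed package layout shown in the file list.

from awslabs.s3_tables_mcp_server.engines.pyiceberg import PyIcebergConfig, PyIcebergEngine

# Placeholder connection details; real values come from your S3 Tables bucket.
config = PyIcebergConfig(
    warehouse='arn:aws:s3tables:us-west-2:111122223333:bucket/example-bucket',
    uri='https://s3tables.us-west-2.amazonaws.com/iceberg',
    region='us-west-2',
    namespace='retail_data',
)
engine = PyIcebergEngine(config)

# Read path: Daft SQL over the attached Iceberg catalog.
result = engine.execute_query('SELECT * FROM orders LIMIT 10')
print(result['columns'], result['rows'])

# Write path: temporal values may be passed as strings; convert_temporal_fields()
# maps them onto the table's Arrow schema before the append.
engine.append_rows(
    'orders',
    [{'id': 1, 'order_date': '2025-03-14', 'created_at': '2025-03-14T17:10:34+00:00'}],
)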
awslabs/s3_tables_mcp_server/file_processor/csv.py
@@ -34,7 +34,7 @@ async def import_csv_to_table(
     rest_sigv4_enabled: str = 'true',
     preserve_case: bool = False,
 ):
-    """Import a CSV file into an S3 table using PyArrow."""
+    """Import a CSV file into an existing S3 table using PyArrow."""
     return await import_file_to_table(
         warehouse=warehouse,
         region=region,
awslabs/s3_tables_mcp_server/file_processor/utils.py
@@ -20,6 +20,7 @@ particularly focusing on column name conversion and schema transformation.
 
 import os
 import pyarrow as pa
+import pyarrow.compute as pc
 from ..utils import get_s3_client, pyiceberg_load_catalog
 from io import BytesIO
 from pydantic.alias_generators import to_snake
@@ -64,6 +65,44 @@ def convert_column_names_to_snake_case(schema: pa.Schema) -> pa.Schema:
     return pa.schema(new_fields, metadata=schema.metadata)
 
 
+def convert_temporal_fields_in_table(
+    pyarrow_table: pa.Table, target_schema: pa.Schema
+) -> pa.Table:
+    """Convert string temporal fields in PyArrow table to appropriate temporal types.
+
+    Args:
+        pyarrow_table: PyArrow table with string temporal values
+        target_schema: Target schema with temporal field types
+
+    Returns:
+        PyArrow table with converted temporal columns
+    """
+    # Use PyArrow's cast which can handle ISO 8601 formatted strings
+    # This is simpler and more robust than strptime for mixed formats
+    try:
+        # Try direct cast - PyArrow can parse ISO 8601 strings automatically
+        converted_table = pyarrow_table.cast(target_schema, safe=False)
+        return converted_table
+    except pa.ArrowInvalid:
+        # If direct cast fails, fall back to column-by-column conversion
+        arrays = []
+        for i, field in enumerate(target_schema):
+            col_name = field.name
+            col_data = pyarrow_table.column(col_name)
+            field_type = field.type
+
+            # Try to cast the column to the target type
+            try:
+                col_data = pc.cast(col_data, field_type, safe=False)
+            except pa.ArrowInvalid:
+                # If cast fails, keep original data
+                pass
+
+            arrays.append(col_data)
+
+        return pa.Table.from_arrays(arrays, schema=target_schema)
+
+
 async def import_file_to_table(
     warehouse: str,
     region: str,
@@ -117,29 +156,30 @@ async def import_file_to_table(
                 'error': f'Column name conversion failed: {str(conv_err)}',
             }
 
-    table_created = False
     try:
         # Try to load existing table
         table = catalog.load_table(f'{namespace}.{table_name}')
+        # Convert temporal fields to match existing table schema
+        target_schema = table.schema().as_arrow()
+        pyarrow_table = convert_temporal_fields_in_table(pyarrow_table, target_schema)
     except NoSuchTableError:
-        # Table doesn't exist, create it using the schema
-        try:
-            table = catalog.create_table(
-                identifier=f'{namespace}.{table_name}',
-                schema=pyarrow_schema,
-            )
-            table_created = True
-        except Exception as create_error:
-            return {
-                'status': 'error',
-                'error': f'Failed to create table: {str(create_error)}',
-            }
+        # Table doesn't exist - return error with schema information
+        # Build column information from the source file schema
+        columns_info = []
+        for field in pyarrow_schema:
+            columns_info.append({'name': field.name, 'type': str(field.type)})
+
+        return {
+            'status': 'error',
+            'error': f'Table {namespace}.{table_name} does not exist. Please create the table first before importing data.',
+            'columns': columns_info,
+        }
 
     # Append data to Iceberg table
     table.append(pyarrow_table)
 
     # Build message with warnings if applicable
-    message = f'Successfully imported {pyarrow_table.num_rows} rows{" and created new table" if table_created else ""}'
+    message = f'Successfully imported {pyarrow_table.num_rows} rows'
     if columns_converted:
         message += '. WARNING: Column names were converted to snake_case format. To preserve the original case, set preserve_case to True.'
 
@@ -148,7 +188,6 @@
         'message': message,
         'rows_processed': pyarrow_table.num_rows,
        'file_processed': os.path.basename(key),
-        'table_created': table_created,
         'table_uuid': table.metadata.table_uuid,
         'columns': pyarrow_schema.names,
     }
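The new helper leans on PyArrow's ability to parse ISO 8601 strings during a cast. As a standalone illustration of the same two code paths (the column name and values here are hypothetical, not taken from the package):

import pyarrow as pa
import pyarrow.compute as pc

source = pa.table({'created_at': ['2025-03-14T17:10:34', '2025-03-15 08:00:00']})
target_schema = pa.schema([pa.field('created_at', pa.timestamp('us'))])

# Table-level cast: the first path tried by convert_temporal_fields_in_table.
converted = source.cast(target_schema, safe=False)

# Column-by-column fallback using pyarrow.compute.cast.
column = pc.cast(source.column('created_at'), pa.timestamp('us'), safe=False)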
awslabs/s3_tables_mcp_server/server.py
@@ -307,45 +307,115 @@ async def create_table(
     """Create a new S3 table in an S3 table bucket.
 
     Creates a new S3 table associated with the given S3 namespace in an S3 table bucket.
-    The S3 table can be configured with specific format and metadata settings. Metadata contains the schema of the table. Use double type for decimals.
+    The S3 table can be configured with specific format and metadata settings. Metadata contains the schema of the table.
     Do not use the metadata parameter if the schema is unclear.
 
+    Supported Iceberg Primitive Types:
+    - boolean: True or false
+    - int: 32-bit signed integers (can promote to long)
+    - long: 64-bit signed integers
+    - float: 32-bit IEEE 754 floating point (can promote to double)
+    - double: 64-bit IEEE 754 floating point
+    - decimal(P,S): Fixed-point decimal with precision P and scale S (precision must be 38 or less)
+    - date: Calendar date without timezone or time
+    - time: Time of day, microsecond precision, without date or timezone
+    - timestamp: Timestamp, microsecond precision, without timezone (represents date and time regardless of zone)
+    - timestamptz: Timestamp, microsecond precision, with timezone (stored as UTC)
+    - string: Arbitrary-length character sequences (UTF-8 encoded)
+
+    Note: Binary field types (binary, fixed, uuid) are not supported.
+
     Example of S3 table metadata:
     {
        "metadata": {
            "iceberg": {
                "schema": {
                    "type": "struct",
-                    "fields": [{
+                    "fields": [
+                        {
                            "id": 1,
-                            "name": "customer_id",
+                            "name": "id",
                            "type": "long",
                            "required": true
                        },
                        {
                            "id": 2,
-                            "name": "customer_name",
-                            "type": "string",
-                            "required": true
+                            "name": "bool_field",
+                            "type": "boolean",
+                            "required": false
                        },
                        {
                            "id": 3,
-                            "name": "customer_balance",
+                            "name": "int_field",
+                            "type": "int",
+                            "required": false
+                        },
+                        {
+                            "id": 4,
+                            "name": "long_field",
+                            "type": "long",
+                            "required": false
+                        },
+                        {
+                            "id": 5,
+                            "name": "float_field",
+                            "type": "float",
+                            "required": false
+                        },
+                        {
+                            "id": 6,
+                            "name": "double_field",
                            "type": "double",
                            "required": false
+                        },
+                        {
+                            "id": 7,
+                            "name": "decimal_field",
+                            "type": "decimal(10,2)",
+                            "required": false
+                        },
+                        {
+                            "id": 8,
+                            "name": "date_field",
+                            "type": "date",
+                            "required": false
+                        },
+                        {
+                            "id": 9,
+                            "name": "time_field",
+                            "type": "time",
+                            "required": false
+                        },
+                        {
+                            "id": 10,
+                            "name": "timestamp_field",
+                            "type": "timestamp",
+                            "required": false
+                        },
+                        {
+                            "id": 11,
+                            "name": "timestamptz_field",
+                            "type": "timestamptz",
+                            "required": false
+                        },
+                        {
+                            "id": 12,
+                            "name": "string_field",
+                            "type": "string",
+                            "required": false
                        }
                    ]
                },
                "partition-spec": [
                    {
-                        "source-id": 1,
+                        "source-id": 8,
                        "field-id": 1000,
                        "transform": "month",
-                        "name": "sale_date_month"
+                        "name": "date_field_month"
                    }
                ],
                "table-properties": {
-                    "description": "Customer information table with customer_id for joining with transactions"
+                    "description": "Example table demonstrating supported Iceberg primitive types"
                }
            }
        }
@@ -353,7 +423,6 @@
 
     Permissions:
     You must have the s3tables:CreateTable permission to use this operation.
-    If using metadata parameter, you must have the s3tables:PutTableData permission.
     """
     from awslabs.s3_tables_mcp_server.models import OpenTableFormat, TableMetadata
 
@@ -665,11 +734,10 @@
         bool, Field(..., description='Preserve case of column names')
     ] = False,
 ) -> dict:
-    """Import data from a Parquet file into an S3 table.
+    """Import data from a Parquet file into an existing S3 table.
 
-    This tool reads data from a Parquet file stored in S3 and imports it into an S3 table.
-    If the table doesn't exist, it will be created with a schema inferred from the Parquet file.
-    If the table exists, the Parquet file schema must be compatible with the table's schema.
+    This tool reads data from a Parquet file stored in S3 and imports it into an existing S3 table.
+    The table must already exist. The Parquet file schema must be compatible with the table's schema.
     The tool will validate the schema before attempting to import the data.
     If preserve_case is True, the column names will not be converted to snake_case. Otherwise, the column names will be converted to snake_case.
 
@@ -677,6 +745,7 @@
     - URL is not a valid S3 URL
     - File is not a Parquet file
     - File cannot be accessed
+    - Table does not exist
     - Parquet schema is incompatible with existing table schema
     - Any other error occurs
 
@@ -685,7 +754,6 @@
     - message: Success message with row count
     - rows_processed: Number of rows imported
     - file_processed: Name of the processed file
-    - table_created: True if a new table was created
 
     Example input values:
     warehouse: 'arn:aws:s3tables:<Region>:<accountID>:bucket/<bucketname>'
@@ -704,7 +772,6 @@
     - s3:GetObject permission for the Parquet file
     - s3tables:GetTable and s3tables:GetTables permissions to access table information
     - s3tables:PutTableData permission to write to the table
-    - s3tables:CreateTable permission (if table doesn't exist)
     """
     if uri is None:
         uri = _default_uri_for_region(region)
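For quick reference, a condensed version of the metadata payload documented in the create_table docstring above, one required long id plus a date column partitioned by month, might look like the following. The field names are placeholders; only the structure mirrors the docstring example.

# Condensed sketch of the documented metadata structure; names are placeholders.
table_metadata = {
    'metadata': {
        'iceberg': {
            'schema': {
                'type': 'struct',
                'fields': [
                    {'id': 1, 'name': 'id', 'type': 'long', 'required': True},
                    {'id': 2, 'name': 'event_date', 'type': 'date', 'required': False},
                ],
            },
            # Partition by month of event_date (source-id 2).
            'partition-spec': [
                {'source-id': 2, 'field-id': 1000, 'transform': 'month', 'name': 'event_date_month'}
            ],
            'table-properties': {'description': 'Minimal example table'},
        }
    }
}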
pyproject.toml
@@ -2,7 +2,7 @@
 name = "awslabs.s3-tables-mcp-server"
 
 # NOTE: "Patch"=9223372036854775807 bumps next release to zero.
-version = "0.0.13"
+version = "0.0.15"
 
 description = "An AWS Labs Model Context Protocol (MCP) server for awslabs.s3-tables-mcp-server"
 readme = "README.md"
@@ -12,8 +12,8 @@ dependencies = [
     "mcp[cli]==1.11.0",
     "pydantic==2.9.2",
     "boto3==1.40.8",
-    "pyiceberg==0.9.1",
-    "pyarrow==20.0.0",
+    "pyiceberg==0.10.0",
+    "pyarrow==22.0.0",
     "sqlparse==0.5.3",
     "daft==0.5.8",
 ]