awslabs.syntheticdata-mcp-server 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
+ """Storage module for synthetic data loading."""
+
+ from .base import DataTarget
+ from .s3 import S3Target
+ from .loader import UnifiedDataLoader
+
+ __all__ = ['DataTarget', 'S3Target', 'UnifiedDataLoader']
@@ -0,0 +1,46 @@
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+ # with the License. A copy of the License is located at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
+ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
+ # and limitations under the License.
+
+
+ """Base classes for data storage targets."""
+
+ from abc import ABC, abstractmethod
+ from typing import Any, Dict, List
+
+
+ class DataTarget(ABC):
+     """Abstract base class for data storage targets."""
+
+     @abstractmethod
+     async def load(self, data: Dict[str, List[Dict]], config: Dict[str, Any]) -> Dict:
+         """Load data to the target storage.
+
+         Args:
+             data: Dictionary mapping table names to lists of records
+             config: Target-specific configuration
+
+         Returns:
+             Dictionary containing load results
+         """
+         pass
+
+     @abstractmethod
+     async def validate(self, data: Dict[str, List[Dict]], config: Dict[str, Any]) -> bool:
+         """Validate data and configuration before loading.
+
+         Args:
+             data: Dictionary mapping table names to lists of records
+             config: Target-specific configuration
+
+         Returns:
+             True if validation passes, False otherwise
+         """
+         pass
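The two abstract methods above define the whole target contract: `validate` gates a load, and `load` returns a result dictionary. As a minimal sketch (not part of the package), a custom in-memory target could look like this, assuming the module paths shown in this diff:

```python
from typing import Any, Dict, List

from awslabs.syntheticdata_mcp_server.storage.base import DataTarget


class InMemoryTarget(DataTarget):
    """Illustrative target that keeps loaded records in a dictionary."""

    def __init__(self) -> None:
        self.store: Dict[str, List[Dict]] = {}

    async def validate(self, data: Dict[str, List[Dict]], config: Dict[str, Any]) -> bool:
        # Accept any non-empty mapping of table name -> list of records.
        return bool(data) and all(isinstance(records, list) for records in data.values())

    async def load(self, data: Dict[str, List[Dict]], config: Dict[str, Any]) -> Dict:
        self.store.update(data)
        return {
            'success': True,
            'tables': list(data),
            'total_records': sum(len(records) for records in data.values()),
        }
```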
@@ -0,0 +1,86 @@
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+ # with the License. A copy of the License is located at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
+ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
+ # and limitations under the License.
+
+
+ """Unified data loader implementation."""
+
+ from .s3 import S3Target
+ from typing import Any, Dict, List
+
+
+ class UnifiedDataLoader:
+     """Loader that supports multiple storage targets."""
+
+     def __init__(self):
+         """Initialize with supported storage targets."""
+         self.targets = {'s3': S3Target()}
+
+     async def load_data(
+         self, data: Dict[str, List[Dict]], targets: List[Dict[str, Any]]
+     ) -> Dict[str, Any]:
+         """Load data to multiple storage targets.
+
+         Args:
+             data: Dictionary mapping table names to lists of records
+             targets: List of target configurations, each containing:
+                 - type: Target type (e.g., 's3')
+                 - config: Target-specific configuration
+
+         Returns:
+             Dictionary containing results for each target
+         """
+         results = {}
+
+         for target_config in targets:
+             # Validate target config structure
+             if not isinstance(target_config, dict):
+                 results['unknown'] = {
+                     'success': False,
+                     'error': 'Invalid target configuration format',
+                 }
+                 continue
+
+             target_type = target_config.get('type')
+             if not target_type:
+                 results['unknown'] = {'success': False, 'error': 'Missing target type'}
+                 continue
+
+             if target_type not in self.targets:
+                 results[target_type] = {
+                     'success': False,
+                     'error': f'Unsupported target type: {target_type}',
+                 }
+                 continue
+
+             target = self.targets[target_type]
+             config = target_config.get('config', {})
+
+             # Validate configuration
+             try:
+                 is_valid = await target.validate(data, config)
+                 if not is_valid:
+                     results[target_type] = {
+                         'success': False,
+                         'error': 'Invalid configuration or data',
+                     }
+                     continue
+             except Exception as e:
+                 results[target_type] = {'success': False, 'error': str(e)}
+                 continue
+
+             # Load data
+             try:
+                 result = await target.load(data, config)
+                 results[target_type] = result
+             except Exception as e:
+                 results[target_type] = {'success': False, 'error': str(e)}
+
+         return {'success': all(r['success'] for r in results.values()), 'results': results}
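Each entry in `targets` names a registered target type plus its config; per-target outcomes are collected under `results`, and the top-level `success` flag is the conjunction of all of them. A usage sketch, assuming AWS credentials are available for the S3 target (bucket name and records are placeholders):

```python
import asyncio

from awslabs.syntheticdata_mcp_server.storage.loader import UnifiedDataLoader


async def main() -> None:
    loader = UnifiedDataLoader()
    result = await loader.load_data(
        data={'customers': [{'id': 1, 'name': 'John'}]},
        targets=[
            {'type': 's3', 'config': {'bucket': 'my-bucket', 'prefix': 'data/', 'format': 'csv'}}
        ],
    )
    # result['results']['s3'] holds the S3Target outcome;
    # result['success'] is True only if every target succeeded.
    print(result)


if __name__ == '__main__':
    asyncio.run(main())
```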
@@ -0,0 +1,218 @@
+ """S3 storage target implementation."""
+
+ import asyncio
+ import boto3
+ import os
+ import pandas as pd
+ from .base import DataTarget
+ from concurrent.futures import ThreadPoolExecutor
+ from typing import Any, Dict, List, Optional
+
+
+ class S3Target(DataTarget):
+     """AWS S3 storage target implementation."""
+
+     def __init__(self):
+         """Initialize S3 target with boto3 client."""
+         session = boto3.Session(profile_name=os.environ.get('AWS_PROFILE'))
+         self.s3_client = session.client('s3')
+         self.supported_formats = ['csv', 'json', 'parquet']
+         self.executor = ThreadPoolExecutor(max_workers=4)
+
+     async def validate(self, data: Dict[str, List[Dict]], config: Dict[str, Any]) -> bool:
+         """Validate data and S3 configuration.
+
+         Args:
+             data: Dictionary mapping table names to lists of records
+             config: S3 configuration including bucket, prefix, format, etc.
+
+         Returns:
+             True if validation passes, False otherwise
+         """
+         try:
+             # Check required config
+             required_fields = ['bucket', 'prefix', 'format']
+             if not all(field in config for field in required_fields):
+                 return False
+
+             # Validate format
+             if config['format'] not in self.supported_formats:
+                 return False
+
+             # Validate data
+             if not data or not all(isinstance(records, list) for records in data.values()):
+                 return False
+
+             # Check S3 access
+             try:
+                 self.s3_client.head_bucket(Bucket=config['bucket'])
+             except Exception:
+                 return False
+
+             return True
+
+         except Exception:
+             return False
+
+     async def load(self, data: Dict[str, List[Dict]], config: Dict[str, Any]) -> Dict:
+         """Load data to S3 with specified configuration.
+
+         Args:
+             data: Dictionary mapping table names to lists of records
+             config: S3 configuration including:
+                 - bucket: S3 bucket name
+                 - prefix: Key prefix for S3 objects
+                 - format: Output format (csv, json, parquet)
+                 - partitioning: Optional partitioning configuration
+                 - storage: Optional storage class and encryption settings
+                 - metadata: Optional object metadata
+
+         Returns:
+             Dictionary containing load results
+         """
+         try:
+             # Convert to DataFrames
+             dataframes = {name: pd.DataFrame(records) for name, records in data.items()}
+
+             # Apply partitioning if enabled
+             if config.get('partitioning', {}).get('enabled'):
+                 partitioned_data = self._apply_partitioning(dataframes, config['partitioning'])
+             else:
+                 partitioned_data = {name: {'': df} for name, df in dataframes.items()}
+
+             # Process each table and partition
+             upload_tasks = []
+             for table_name, partitions in partitioned_data.items():
+                 for partition_key, df in partitions.items():
+                     # Construct S3 key
+                     partition_path = f'{partition_key}/' if partition_key else ''
+                     key = f'{config["prefix"]}{table_name}/{partition_path}{table_name}.{config["format"]}'
+
+                     # Convert to specified format
+                     content = self._convert_format(df, config['format'], config.get('compression'))
+
+                     # Create upload task
+                     task = self._upload_to_s3(
+                         content,
+                         config['bucket'],
+                         key,
+                         config.get('storage', {}),
+                         config.get('metadata', {}),
+                     )
+                     upload_tasks.append(task)
+
+             # Execute uploads in parallel
+             results = await asyncio.gather(*upload_tasks)
+
+             return {
+                 'success': True,
+                 'uploaded_files': results,
+                 'total_records': sum(len(df) for df in dataframes.values()),
+             }
+
+         except Exception as e:
+             return {'success': False, 'error': str(e)}
+
+     def _convert_format(
+         self, df: pd.DataFrame, format: str, compression: Optional[str] = None
+     ) -> bytes:
+         """Convert DataFrame to specified format.
+
+         Args:
+             df: pandas DataFrame to convert
+             format: Target format (csv, json, parquet)
+             compression: Optional compression type
+
+         Returns:
+             Bytes containing the converted data
+         """
+         if format == 'parquet':
+             return df.to_parquet(compression=compression)
+         elif format == 'csv':
+             csv_data = df.to_csv(index=False)
+             return csv_data.encode() if csv_data is not None else b''
+         elif format == 'json':
+             json_data = df.to_json(orient='records')
+             return json_data.encode() if json_data is not None else b''
+         else:
+             raise ValueError(f'Unsupported format: {format}')
+
+     def _apply_partitioning(
+         self, dataframes: Dict[str, pd.DataFrame], partition_config: Dict[str, Any]
+     ) -> Dict[str, Dict[str, pd.DataFrame]]:
+         """Apply partitioning to DataFrames.
+
+         Args:
+             dataframes: Dictionary of table name to DataFrame
+             partition_config: Partitioning configuration
+
+         Returns:
+             Dictionary mapping table names to dictionaries of partition key to DataFrame
+         """
+         partitioned_data = {}
+         partition_cols = partition_config['columns']
+
+         for table_name, df in dataframes.items():
+             # Skip if partition columns don't exist
+             if not all(col in df.columns for col in partition_cols):
+                 partitioned_data[table_name] = {'': df}
+                 continue
+
+             # Group by partition columns
+             grouped = df.groupby(partition_cols)
+             partitions = {}
+
+             for group_key, group_df in grouped:
+                 # Create partition key
+                 if isinstance(group_key, tuple):
+                     partition_key = '/'.join(str(k) for k in group_key)
+                 else:
+                     partition_key = str(group_key)
+
+                 # Remove partition columns if specified
+                 if partition_config.get('drop_columns', False):
+                     group_df = group_df.drop(columns=partition_cols)
+
+                 partitions[partition_key] = group_df
+
+             partitioned_data[table_name] = partitions
+
+         return partitioned_data
+
+     async def _upload_to_s3(
+         self, content: bytes, bucket: str, key: str, storage_config: Dict, metadata: Dict
+     ) -> Dict:
+         """Upload content to S3 with specified configuration.
+
+         Args:
+             content: Bytes to upload
+             bucket: S3 bucket name
+             key: S3 object key
+             storage_config: Storage class and encryption settings
+             metadata: Object metadata
+
+         Returns:
+             Dictionary containing upload details
+         """
+         try:
+             # Run S3 upload in thread pool
+             await asyncio.get_event_loop().run_in_executor(
+                 self.executor,
+                 lambda: self.s3_client.put_object(
+                     Bucket=bucket,
+                     Key=key,
+                     Body=content,
+                     StorageClass=storage_config.get('class', 'STANDARD'),
+                     Metadata=metadata,
+                     **(
+                         {'ServerSideEncryption': storage_config['encryption']}
+                         if storage_config.get('encryption')
+                         else {}
+                     ),
+                 ),
+             )
+
+             return {'bucket': bucket, 'key': key, 'size': len(content), 'metadata': metadata}
+
+         except Exception as e:
+             raise Exception(f'Failed to upload to S3: {str(e)}')
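Reading `load` and `_apply_partitioning` together, object keys follow the pattern `<prefix><table>/<partition values>/<table>.<format>`. A sketch of a config and the keys it would produce (bucket, column, and values are illustrative):

```python
config = {
    'bucket': 'my-bucket',
    'prefix': 'data/',
    'format': 'csv',
    'partitioning': {'enabled': True, 'columns': ['region'], 'drop_columns': False},
    'storage': {'class': 'STANDARD_IA', 'encryption': 'AES256'},
    'metadata': {'source': 'synthetic'},
}

# For data = {'orders': [{'id': 1, 'region': 'eu'}, {'id': 2, 'region': 'us'}]},
# S3Target.load would upload two objects:
#   s3://my-bucket/data/orders/eu/orders.csv
#   s3://my-bucket/data/orders/us/orders.csv
# With partitioning disabled, the single key would be data/orders/orders.csv.
```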
@@ -0,0 +1,144 @@
+ Metadata-Version: 2.4
+ Name: awslabs.syntheticdata-mcp-server
+ Version: 0.0.1
+ Summary: An AWS Labs Model Context Protocol (MCP) server for syntheticdata
+ Project-URL: homepage, https://awslabs.github.io/mcp/
+ Project-URL: docs, https://awslabs.github.io/mcp/servers/syntheticdata-mcp-server/
+ Project-URL: documentation, https://awslabs.github.io/mcp/servers/syntheticdata-mcp-server/
+ Project-URL: repository, https://github.com/awslabs/mcp.git
+ Project-URL: changelog, https://github.com/awslabs/mcp/blob/main/src/syntheticdata-mcp-server/CHANGELOG.md
+ Author: Amazon Web Services
+ Author-email: AWSLabs MCP <203918161+awslabs-mcp@users.noreply.github.com>
+ License: Apache-2.0
+ License-File: LICENSE
+ License-File: NOTICE
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Python: >=3.10
+ Requires-Dist: boto3>=1.34.0
+ Requires-Dist: mcp[cli]>=1.6.0
+ Requires-Dist: pandas>=2.0.0
+ Requires-Dist: pyarrow>=14.0.1
+ Requires-Dist: pydantic>=2.10.6
+ Requires-Dist: python-snappy>=0.6.1
+ Description-Content-Type: text/markdown
+
+ # Synthetic Data MCP Server
+
+ A Model Context Protocol (MCP) server for generating, validating, and managing synthetic data.
+
+ ## Overview
+
+ This MCP server provides tools for generating synthetic data based on business descriptions, executing pandas code safely, validating data structures, and loading data to storage systems like S3.
+
+ ## Features
+
+ - **Business-Driven Generation**: Generate structured data generation instructions from business descriptions
+ - **Safe Pandas Code Execution**: Run pandas code in a restricted environment with automatic DataFrame detection
+ - **JSON Lines Validation**: Validate and convert JSON Lines data to CSV format
+ - **Data Validation**: Validate data structure and referential integrity, and save tables as CSV files
+ - **Referential Integrity Checking**: Validate relationships between tables
+ - **Data Quality Assessment**: Identify potential issues in data models (3NF validation)
+ - **Storage Integration**: Load data to storage targets (currently S3) with support for:
+   - Multiple file formats (CSV, JSON, Parquet)
+   - Partitioning options
+   - Storage class configuration
+   - Encryption settings
+
+ ## Prerequisites
+
+ 1. Install `uv` from [Astral](https://docs.astral.sh/uv/getting-started/installation/) or the [GitHub README](https://github.com/astral-sh/uv#installation)
+ 2. Install Python using `uv python install 3.10`
+ 3. Set up AWS credentials with access to AWS services
+    - You need an AWS account with appropriate permissions
+    - Configure AWS credentials with `aws configure` or environment variables
+
+ ## Installation
+
+ ```json
+ {
+   "mcpServers": {
+     "awslabs.syntheticdata-mcp-server": {
+       "command": "uvx",
+       "args": ["awslabs.syntheticdata-mcp-server"],
+       "env": {
+         "FASTMCP_LOG_LEVEL": "ERROR",
+         "AWS_PROFILE": "your-aws-profile",
+         "AWS_REGION": "us-east-1"
+       },
+       "autoApprove": [],
+       "disabled": false
+     }
+   }
+ }
+ ```
+
+ NOTE: Your AWS credentials need to be kept refreshed on the host running the server.
+
+ ### AWS Authentication
+
+ The MCP server uses the AWS profile specified in the `AWS_PROFILE` environment variable. If not provided, it defaults to the "default" profile in your AWS configuration file.
+
+ ```json
+ "env": {
+   "AWS_PROFILE": "your-aws-profile"
+ }
+ ```
+
+ ## Usage
+
+ ### Getting Data Generation Instructions
+
+ ```python
+ response = await server.get_data_generation_instructions(
+     business_description="An e-commerce platform with customers, orders, and products"
+ )
+ ```
+
+ ### Executing Pandas Code
+
+ ```python
+ response = await server.execute_pandas_code(
+     code="your_pandas_code_here",
+     workspace_dir="/path/to/workspace",
+     output_dir="data"
+ )
+ ```
+
+ ### Validating and Saving Data
+
+ ```python
+ response = await server.validate_and_save_data(
+     data={
+         "customers": [{"id": 1, "name": "John"}],
+         "orders": [{"id": 101, "customer_id": 1}]
+     },
+     workspace_dir="/path/to/workspace",
+     output_dir="data"
+ )
+ ```
+
+ ### Loading to Storage
+
+ ```python
+ response = await server.load_to_storage(
+     data={
+         "customers": [{"id": 1, "name": "John"}]
+     },
+     targets=[{
+         "type": "s3",
+         "config": {
+             "bucket": "my-bucket",
+             "prefix": "data/",
+             "format": "parquet"
+         }
+     }]
+ )
+ ```
@@ -0,0 +1,14 @@
+ awslabs/__init__.py,sha256=8r6KA5knEPJXmJXrBXU0rwzZmSt1Ar0kcDpmQQuo1i4,674
+ awslabs/syntheticdata_mcp_server/__init__.py,sha256=kfGBcnyPKEWN5hs3ViopoSKEsW8K2t3466vR3QiC2tQ,626
+ awslabs/syntheticdata_mcp_server/pandas_interpreter.py,sha256=Uex1n2ieG9gt-qFb7zPtDHZYRne9qaawUu3NBONixVk,10067
+ awslabs/syntheticdata_mcp_server/server.py,sha256=nP58jhBcw9qBuwTmFZ_kiA5rlTG0RFiFQbMB7FxBhow,28164
+ awslabs/syntheticdata_mcp_server/storage/__init__.py,sha256=4p31heOr0rR6T6KF3XmZcUB4Nb-uLAw4O1gXfW838Q8,201
+ awslabs/syntheticdata_mcp_server/storage/base.py,sha256=JkWFLYqioTlXHkEXoS0Ba-lUln7L5S_8WMSkqo4gTIA,1537
+ awslabs/syntheticdata_mcp_server/storage/loader.py,sha256=ukFYmk9v6b6-MB9cfNyZsuCKTh977GWIndVeYOjzAfo,3118
+ awslabs/syntheticdata_mcp_server/storage/s3.py,sha256=fSqGWbWjG4Lw3wI6QA3626KKQeKSMeL9_p6lzW2rkjU,7973
+ awslabs_syntheticdata_mcp_server-0.0.1.dist-info/METADATA,sha256=GEF5t4h7YbFbSN1v2wVa-ypy9H84wIxdVuQEcjAtVYo,4707
+ awslabs_syntheticdata_mcp_server-0.0.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ awslabs_syntheticdata_mcp_server-0.0.1.dist-info/entry_points.txt,sha256=irsU4Ts5TsNCUmV4-ctPbkeqC6amCzI8Es9At1jylcI,98
+ awslabs_syntheticdata_mcp_server-0.0.1.dist-info/licenses/LICENSE,sha256=CeipvOyAZxBGUsFoaFqwkx54aPnIKEtm9a5u2uXxEws,10142
+ awslabs_syntheticdata_mcp_server-0.0.1.dist-info/licenses/NOTICE,sha256=6YOTbc8gQC0JqDIjwjdSIVsY9CtVd1vJeKxu6oqEUiE,100
+ awslabs_syntheticdata_mcp_server-0.0.1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.27.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
+ [console_scripts]
+ awslabs.syntheticdata-mcp-server = awslabs.syntheticdata_mcp_server.server:main