awslabs.syntheticdata-mcp-server 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
+ """Storage module for synthetic data loading."""
+
+ from .base import DataTarget
+ from .s3 import S3Target
+ from .loader import UnifiedDataLoader
+
+ __all__ = ['DataTarget', 'S3Target', 'UnifiedDataLoader']
@@ -0,0 +1,46 @@
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+ # with the License. A copy of the License is located at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
+ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
+ # and limitations under the License.
+
+
+ """Base classes for data storage targets."""
+
+ from abc import ABC, abstractmethod
+ from typing import Any, Dict, List
+
+
+ class DataTarget(ABC):
+     """Abstract base class for data storage targets."""
+
+     @abstractmethod
+     async def load(self, data: Dict[str, List[Dict]], config: Dict[str, Any]) -> Dict:
+         """Load data to the target storage.
+
+         Args:
+             data: Dictionary mapping table names to lists of records
+             config: Target-specific configuration
+
+         Returns:
+             Dictionary containing load results
+         """
+         pass
+
+     @abstractmethod
+     async def validate(self, data: Dict[str, List[Dict]], config: Dict[str, Any]) -> bool:
+         """Validate data and configuration before loading.
+
+         Args:
+             data: Dictionary mapping table names to lists of records
+             config: Target-specific configuration
+
+         Returns:
+             True if validation passes, False otherwise
+         """
+         pass
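The two abstract methods above define the whole target contract: `validate` gates a load, and `load` returns a result dictionary. As a minimal sketch (not part of the package), a custom in-memory target could look like this, assuming the module paths shown in this diff:

```python
from typing import Any, Dict, List

from awslabs.syntheticdata_mcp_server.storage.base import DataTarget


class InMemoryTarget(DataTarget):
    """Illustrative target that keeps loaded records in a dictionary."""

    def __init__(self) -> None:
        self.store: Dict[str, List[Dict]] = {}

    async def validate(self, data: Dict[str, List[Dict]], config: Dict[str, Any]) -> bool:
        # Accept any non-empty mapping of table name -> list of records.
        return bool(data) and all(isinstance(records, list) for records in data.values())

    async def load(self, data: Dict[str, List[Dict]], config: Dict[str, Any]) -> Dict:
        self.store.update(data)
        return {
            'success': True,
            'tables': list(data),
            'total_records': sum(len(records) for records in data.values()),
        }
```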
@@ -0,0 +1,86 @@
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+ # with the License. A copy of the License is located at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
+ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
+ # and limitations under the License.
+
+
+ """Unified data loader implementation."""
+
+ from .s3 import S3Target
+ from typing import Any, Dict, List
+
+
+ class UnifiedDataLoader:
+     """Loader that supports multiple storage targets."""
+
+     def __init__(self):
+         """Initialize with supported storage targets."""
+         self.targets = {'s3': S3Target()}
+
+     async def load_data(
+         self, data: Dict[str, List[Dict]], targets: List[Dict[str, Any]]
+     ) -> Dict[str, Any]:
+         """Load data to multiple storage targets.
+
+         Args:
+             data: Dictionary mapping table names to lists of records
+             targets: List of target configurations, each containing:
+                 - type: Target type (e.g., 's3')
+                 - config: Target-specific configuration
+
+         Returns:
+             Dictionary containing results for each target
+         """
+         results = {}
+
+         for target_config in targets:
+             # Validate target config structure
+             if not isinstance(target_config, dict):
+                 results['unknown'] = {
+                     'success': False,
+                     'error': 'Invalid target configuration format',
+                 }
+                 continue
+
+             target_type = target_config.get('type')
+             if not target_type:
+                 results['unknown'] = {'success': False, 'error': 'Missing target type'}
+                 continue
+
+             if target_type not in self.targets:
+                 results[target_type] = {
+                     'success': False,
+                     'error': f'Unsupported target type: {target_type}',
+                 }
+                 continue
+
+             target = self.targets[target_type]
+             config = target_config.get('config', {})
+
+             # Validate configuration
+             try:
+                 is_valid = await target.validate(data, config)
+                 if not is_valid:
+                     results[target_type] = {
+                         'success': False,
+                         'error': 'Invalid configuration or data',
+                     }
+                     continue
+             except Exception as e:
+                 results[target_type] = {'success': False, 'error': str(e)}
+                 continue
+
+             # Load data
+             try:
+                 result = await target.load(data, config)
+                 results[target_type] = result
+             except Exception as e:
+                 results[target_type] = {'success': False, 'error': str(e)}
+
+         return {'success': all(r['success'] for r in results.values()), 'results': results}
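Each entry in `targets` names a registered target type plus its config; per-target outcomes are collected under `results`, and the top-level `success` flag is the conjunction of all of them. A usage sketch, assuming AWS credentials are available for the S3 target (bucket name and records are placeholders):

```python
import asyncio

from awslabs.syntheticdata_mcp_server.storage.loader import UnifiedDataLoader


async def main() -> None:
    loader = UnifiedDataLoader()
    result = await loader.load_data(
        data={'customers': [{'id': 1, 'name': 'John'}]},
        targets=[
            {'type': 's3', 'config': {'bucket': 'my-bucket', 'prefix': 'data/', 'format': 'csv'}}
        ],
    )
    # result['results']['s3'] holds the S3Target outcome;
    # result['success'] is True only if every target succeeded.
    print(result)


if __name__ == '__main__':
    asyncio.run(main())
```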
@@ -0,0 +1,218 @@
+ """S3 storage target implementation."""
+
+ import asyncio
+ import boto3
+ import os
+ import pandas as pd
+ from .base import DataTarget
+ from concurrent.futures import ThreadPoolExecutor
+ from typing import Any, Dict, List, Optional
+
+
+ class S3Target(DataTarget):
+     """AWS S3 storage target implementation."""
+
+     def __init__(self):
+         """Initialize S3 target with boto3 client."""
+         session = boto3.Session(profile_name=os.environ.get('AWS_PROFILE'))
+         self.s3_client = session.client('s3')
+         self.supported_formats = ['csv', 'json', 'parquet']
+         self.executor = ThreadPoolExecutor(max_workers=4)
+
+     async def validate(self, data: Dict[str, List[Dict]], config: Dict[str, Any]) -> bool:
+         """Validate data and S3 configuration.
+
+         Args:
+             data: Dictionary mapping table names to lists of records
+             config: S3 configuration including bucket, prefix, format, etc.
+
+         Returns:
+             True if validation passes, False otherwise
+         """
+         try:
+             # Check required config
+             required_fields = ['bucket', 'prefix', 'format']
+             if not all(field in config for field in required_fields):
+                 return False
+
+             # Validate format
+             if config['format'] not in self.supported_formats:
+                 return False
+
+             # Validate data
+             if not data or not all(isinstance(records, list) for records in data.values()):
+                 return False
+
+             # Check S3 access
+             try:
+                 self.s3_client.head_bucket(Bucket=config['bucket'])
+             except Exception:
+                 return False
+
+             return True
+
+         except Exception:
+             return False
+
+     async def load(self, data: Dict[str, List[Dict]], config: Dict[str, Any]) -> Dict:
+         """Load data to S3 with specified configuration.
+
+         Args:
+             data: Dictionary mapping table names to lists of records
+             config: S3 configuration including:
+                 - bucket: S3 bucket name
+                 - prefix: Key prefix for S3 objects
+                 - format: Output format (csv, json, parquet)
+                 - partitioning: Optional partitioning configuration
+                 - storage: Optional storage class and encryption settings
+                 - metadata: Optional object metadata
+
+         Returns:
+             Dictionary containing load results
+         """
+         try:
+             # Convert to DataFrames
+             dataframes = {name: pd.DataFrame(records) for name, records in data.items()}
+
+             # Apply partitioning if enabled
+             if config.get('partitioning', {}).get('enabled'):
+                 partitioned_data = self._apply_partitioning(dataframes, config['partitioning'])
+             else:
+                 partitioned_data = {name: {'': df} for name, df in dataframes.items()}
+
+             # Process each table and partition
+             upload_tasks = []
+             for table_name, partitions in partitioned_data.items():
+                 for partition_key, df in partitions.items():
+                     # Construct S3 key
+                     partition_path = f'{partition_key}/' if partition_key else ''
+                     key = f'{config["prefix"]}{table_name}/{partition_path}{table_name}.{config["format"]}'
+
+                     # Convert to specified format
+                     content = self._convert_format(df, config['format'], config.get('compression'))
+
+                     # Create upload task
+                     task = self._upload_to_s3(
+                         content,
+                         config['bucket'],
+                         key,
+                         config.get('storage', {}),
+                         config.get('metadata', {}),
+                     )
+                     upload_tasks.append(task)
+
+             # Execute uploads in parallel
+             results = await asyncio.gather(*upload_tasks)
+
+             return {
+                 'success': True,
+                 'uploaded_files': results,
+                 'total_records': sum(len(df) for df in dataframes.values()),
+             }
+
+         except Exception as e:
+             return {'success': False, 'error': str(e)}
+
+     def _convert_format(
+         self, df: pd.DataFrame, format: str, compression: Optional[str] = None
+     ) -> bytes:
+         """Convert DataFrame to specified format.
+
+         Args:
+             df: pandas DataFrame to convert
+             format: Target format (csv, json, parquet)
+             compression: Optional compression type
+
+         Returns:
+             Bytes containing the converted data
+         """
+         if format == 'parquet':
+             return df.to_parquet(compression=compression)
+         elif format == 'csv':
+             csv_data = df.to_csv(index=False)
+             return csv_data.encode() if csv_data is not None else b''
+         elif format == 'json':
+             json_data = df.to_json(orient='records')
+             return json_data.encode() if json_data is not None else b''
+         else:
+             raise ValueError(f'Unsupported format: {format}')
+
+     def _apply_partitioning(
+         self, dataframes: Dict[str, pd.DataFrame], partition_config: Dict[str, Any]
+     ) -> Dict[str, Dict[str, pd.DataFrame]]:
+         """Apply partitioning to DataFrames.
+
+         Args:
+             dataframes: Dictionary of table name to DataFrame
+             partition_config: Partitioning configuration
+
+         Returns:
+             Dictionary mapping table names to dictionaries of partition key to DataFrame
+         """
+         partitioned_data = {}
+         partition_cols = partition_config['columns']
+
+         for table_name, df in dataframes.items():
+             # Skip if partition columns don't exist
+             if not all(col in df.columns for col in partition_cols):
+                 partitioned_data[table_name] = {'': df}
+                 continue
+
+             # Group by partition columns
+             grouped = df.groupby(partition_cols)
+             partitions = {}
+
+             for group_key, group_df in grouped:
+                 # Create partition key
+                 if isinstance(group_key, tuple):
+                     partition_key = '/'.join(str(k) for k in group_key)
+                 else:
+                     partition_key = str(group_key)
+
+                 # Remove partition columns if specified
+                 if partition_config.get('drop_columns', False):
+                     group_df = group_df.drop(columns=partition_cols)
+
+                 partitions[partition_key] = group_df
+
+             partitioned_data[table_name] = partitions
+
+         return partitioned_data
+
+     async def _upload_to_s3(
+         self, content: bytes, bucket: str, key: str, storage_config: Dict, metadata: Dict
+     ) -> Dict:
+         """Upload content to S3 with specified configuration.
+
+         Args:
+             content: Bytes to upload
+             bucket: S3 bucket name
+             key: S3 object key
+             storage_config: Storage class and encryption settings
+             metadata: Object metadata
+
+         Returns:
+             Dictionary containing upload details
+         """
+         try:
+             # Run S3 upload in thread pool
+             await asyncio.get_event_loop().run_in_executor(
+                 self.executor,
+                 lambda: self.s3_client.put_object(
+                     Bucket=bucket,
+                     Key=key,
+                     Body=content,
+                     StorageClass=storage_config.get('class', 'STANDARD'),
+                     Metadata=metadata,
+                     **(
+                         {'ServerSideEncryption': storage_config['encryption']}
+                         if storage_config.get('encryption')
+                         else {}
+                     ),
+                 ),
+             )
+
+             return {'bucket': bucket, 'key': key, 'size': len(content), 'metadata': metadata}
+
+         except Exception as e:
+             raise Exception(f'Failed to upload to S3: {str(e)}')
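Reading `load` and `_apply_partitioning` together, object keys follow the pattern `<prefix><table>/<partition values>/<table>.<format>`. A sketch of a config and the keys it would produce (bucket, column, and values are illustrative):

```python
config = {
    'bucket': 'my-bucket',
    'prefix': 'data/',
    'format': 'csv',
    'partitioning': {'enabled': True, 'columns': ['region'], 'drop_columns': False},
    'storage': {'class': 'STANDARD_IA', 'encryption': 'AES256'},
    'metadata': {'source': 'synthetic'},
}

# For data = {'orders': [{'id': 1, 'region': 'eu'}, {'id': 2, 'region': 'us'}]},
# S3Target.load would upload two objects:
#   s3://my-bucket/data/orders/eu/orders.csv
#   s3://my-bucket/data/orders/us/orders.csv
# With partitioning disabled, the single key would be data/orders/orders.csv.
```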
@@ -0,0 +1,144 @@
+ Metadata-Version: 2.4
+ Name: awslabs.syntheticdata-mcp-server
+ Version: 0.0.1
+ Summary: An AWS Labs Model Context Protocol (MCP) server for syntheticdata
+ Project-URL: homepage, https://awslabs.github.io/mcp/
+ Project-URL: docs, https://awslabs.github.io/mcp/servers/syntheticdata-mcp-server/
+ Project-URL: documentation, https://awslabs.github.io/mcp/servers/syntheticdata-mcp-server/
+ Project-URL: repository, https://github.com/awslabs/mcp.git
+ Project-URL: changelog, https://github.com/awslabs/mcp/blob/main/src/syntheticdata-mcp-server/CHANGELOG.md
+ Author: Amazon Web Services
+ Author-email: AWSLabs MCP <203918161+awslabs-mcp@users.noreply.github.com>
+ License: Apache-2.0
+ License-File: LICENSE
+ License-File: NOTICE
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Python: >=3.10
+ Requires-Dist: boto3>=1.34.0
+ Requires-Dist: mcp[cli]>=1.6.0
+ Requires-Dist: pandas>=2.0.0
+ Requires-Dist: pyarrow>=14.0.1
+ Requires-Dist: pydantic>=2.10.6
+ Requires-Dist: python-snappy>=0.6.1
+ Description-Content-Type: text/markdown
+
+ # Synthetic Data MCP Server
+
+ A Model Context Protocol (MCP) server for generating, validating, and managing synthetic data.
+
+ ## Overview
+
+ This MCP server provides tools for generating synthetic data based on business descriptions, executing pandas code safely, validating data structures, and loading data to storage systems like S3.
+
+ ## Features
+
+ - **Business-Driven Generation**: Generate structured data generation instructions from business descriptions
+ - **Safe Pandas Code Execution**: Run pandas code in a restricted environment with automatic DataFrame detection
+ - **JSON Lines Validation**: Validate and convert JSON Lines data to CSV format
+ - **Data Validation**: Validate data structure and referential integrity, and save tables as CSV files
+ - **Referential Integrity Checking**: Validate relationships between tables
+ - **Data Quality Assessment**: Identify potential issues in data models (3NF validation)
+ - **Storage Integration**: Load data to storage targets (currently S3) with support for:
+   - Multiple file formats (CSV, JSON, Parquet)
+   - Partitioning options
+   - Storage class configuration
+   - Encryption settings
+
+ ## Prerequisites
+
+ 1. Install `uv` from [Astral](https://docs.astral.sh/uv/getting-started/installation/) or the [GitHub README](https://github.com/astral-sh/uv#installation)
+ 2. Install Python using `uv python install 3.10`
+ 3. Set up AWS credentials with access to AWS services
+    - You need an AWS account with appropriate permissions
+    - Configure AWS credentials with `aws configure` or environment variables
+
+ ## Installation
+
+ ```json
+ {
+   "mcpServers": {
+     "awslabs.syntheticdata-mcp-server": {
+       "command": "uvx",
+       "args": ["awslabs.syntheticdata-mcp-server"],
+       "env": {
+         "FASTMCP_LOG_LEVEL": "ERROR",
+         "AWS_PROFILE": "your-aws-profile",
+         "AWS_REGION": "us-east-1"
+       },
+       "autoApprove": [],
+       "disabled": false
+     }
+   }
+ }
+ ```
+
+ NOTE: Your AWS credentials need to be kept refreshed on the host running the server.
+
+ ### AWS Authentication
+
+ The MCP server uses the AWS profile specified in the `AWS_PROFILE` environment variable. If not provided, it defaults to the "default" profile in your AWS configuration file.
+
+ ```json
+ "env": {
+   "AWS_PROFILE": "your-aws-profile"
+ }
+ ```
+
+ ## Usage
+
+ ### Getting Data Generation Instructions
+
+ ```python
+ response = await server.get_data_generation_instructions(
+     business_description="An e-commerce platform with customers, orders, and products"
+ )
+ ```
+
+ ### Executing Pandas Code
+
+ ```python
+ response = await server.execute_pandas_code(
+     code="your_pandas_code_here",
+     workspace_dir="/path/to/workspace",
+     output_dir="data"
+ )
+ ```
+
+ ### Validating and Saving Data
+
+ ```python
+ response = await server.validate_and_save_data(
+     data={
+         "customers": [{"id": 1, "name": "John"}],
+         "orders": [{"id": 101, "customer_id": 1}]
+     },
+     workspace_dir="/path/to/workspace",
+     output_dir="data"
+ )
+ ```
+
+ ### Loading to Storage
+
+ ```python
+ response = await server.load_to_storage(
+     data={
+         "customers": [{"id": 1, "name": "John"}]
+     },
+     targets=[{
+         "type": "s3",
+         "config": {
+             "bucket": "my-bucket",
+             "prefix": "data/",
+             "format": "parquet"
+         }
+     }]
+ )
+ ```
@@ -0,0 +1,14 @@
+ awslabs/__init__.py,sha256=8r6KA5knEPJXmJXrBXU0rwzZmSt1Ar0kcDpmQQuo1i4,674
+ awslabs/syntheticdata_mcp_server/__init__.py,sha256=kfGBcnyPKEWN5hs3ViopoSKEsW8K2t3466vR3QiC2tQ,626
+ awslabs/syntheticdata_mcp_server/pandas_interpreter.py,sha256=Uex1n2ieG9gt-qFb7zPtDHZYRne9qaawUu3NBONixVk,10067
+ awslabs/syntheticdata_mcp_server/server.py,sha256=nP58jhBcw9qBuwTmFZ_kiA5rlTG0RFiFQbMB7FxBhow,28164
+ awslabs/syntheticdata_mcp_server/storage/__init__.py,sha256=4p31heOr0rR6T6KF3XmZcUB4Nb-uLAw4O1gXfW838Q8,201
+ awslabs/syntheticdata_mcp_server/storage/base.py,sha256=JkWFLYqioTlXHkEXoS0Ba-lUln7L5S_8WMSkqo4gTIA,1537
+ awslabs/syntheticdata_mcp_server/storage/loader.py,sha256=ukFYmk9v6b6-MB9cfNyZsuCKTh977GWIndVeYOjzAfo,3118
+ awslabs/syntheticdata_mcp_server/storage/s3.py,sha256=fSqGWbWjG4Lw3wI6QA3626KKQeKSMeL9_p6lzW2rkjU,7973
+ awslabs_syntheticdata_mcp_server-0.0.1.dist-info/METADATA,sha256=GEF5t4h7YbFbSN1v2wVa-ypy9H84wIxdVuQEcjAtVYo,4707
+ awslabs_syntheticdata_mcp_server-0.0.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ awslabs_syntheticdata_mcp_server-0.0.1.dist-info/entry_points.txt,sha256=irsU4Ts5TsNCUmV4-ctPbkeqC6amCzI8Es9At1jylcI,98
+ awslabs_syntheticdata_mcp_server-0.0.1.dist-info/licenses/LICENSE,sha256=CeipvOyAZxBGUsFoaFqwkx54aPnIKEtm9a5u2uXxEws,10142
+ awslabs_syntheticdata_mcp_server-0.0.1.dist-info/licenses/NOTICE,sha256=6YOTbc8gQC0JqDIjwjdSIVsY9CtVd1vJeKxu6oqEUiE,100
+ awslabs_syntheticdata_mcp_server-0.0.1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.27.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
+ [console_scripts]
+ awslabs.syntheticdata-mcp-server = awslabs.syntheticdata_mcp_server.server:main