awslabs.syntheticdata-mcp-server 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- awslabs/__init__.py +12 -0
- awslabs/syntheticdata_mcp_server/__init__.py +16 -0
- awslabs/syntheticdata_mcp_server/pandas_interpreter.py +267 -0
- awslabs/syntheticdata_mcp_server/server.py +776 -0
- awslabs/syntheticdata_mcp_server/storage/__init__.py +7 -0
- awslabs/syntheticdata_mcp_server/storage/base.py +46 -0
- awslabs/syntheticdata_mcp_server/storage/loader.py +86 -0
- awslabs/syntheticdata_mcp_server/storage/s3.py +218 -0
- awslabs_syntheticdata_mcp_server-0.0.1.dist-info/METADATA +144 -0
- awslabs_syntheticdata_mcp_server-0.0.1.dist-info/RECORD +14 -0
- awslabs_syntheticdata_mcp_server-0.0.1.dist-info/WHEEL +4 -0
- awslabs_syntheticdata_mcp_server-0.0.1.dist-info/entry_points.txt +2 -0
- awslabs_syntheticdata_mcp_server-0.0.1.dist-info/licenses/LICENSE +175 -0
- awslabs_syntheticdata_mcp_server-0.0.1.dist-info/licenses/NOTICE +2 -0
awslabs/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# This file is part of the awslabs namespace.
|
|
2
|
+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
|
|
5
|
+
# with the License. A copy of the License is located at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
|
|
10
|
+
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
|
|
11
|
+
# and limitations under the License.
|
|
12
|
+
# It is intentionally minimal to support PEP 420 namespace packages.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
|
|
5
|
+
# with the License. A copy of the License is located at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
|
|
10
|
+
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
|
|
11
|
+
# and limitations under the License.
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
"""SyntheticData MCP Server package."""
|
|
15
|
+
|
|
16
|
+
__version__ = '0.0.0'
|
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
|
|
4
|
+
# with the License. A copy of the License is located at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
|
|
9
|
+
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
|
|
10
|
+
# and limitations under the License.
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
import ast
|
|
14
|
+
import os
|
|
15
|
+
import pandas as pd
|
|
16
|
+
from typing import Any, Dict, List
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def safe_eval_dataframe(node: ast.AST) -> pd.DataFrame:
    """Safely evaluate a ``pd.DataFrame(...)`` constructor found in an AST.

    Arguments are evaluated with ``ast.literal_eval`` only, so no arbitrary
    code is ever executed. All positional and keyword arguments are forwarded
    to ``pd.DataFrame`` (the previous version silently dropped everything
    except a single positional argument or the ``data=`` keyword, ignoring
    e.g. ``columns=`` and ``index=``).

    Args:
        node: AST node holding the constructor — an ``ast.Call``, or an
            ``ast.Expr`` / ``ast.Assign`` whose value is the call.

    Returns:
        The constructed pandas DataFrame.

    Raises:
        ValueError: If the node is not a plain ``pd.DataFrame`` call, or any
            argument is not a Python literal, or construction itself fails.
    """
    # Unwrap expression/assignment wrappers down to the Call node itself.
    if isinstance(node, (ast.Expr, ast.Assign)):
        call_node = node.value
    elif isinstance(node, ast.Call):
        call_node = node
    else:
        raise ValueError('Invalid DataFrame constructor: unexpected node type')

    if not isinstance(call_node, ast.Call):
        raise ValueError('Invalid DataFrame constructor: expected Call node')

    # The call must be exactly `pd.DataFrame(...)`: an attribute access on
    # the bare name `pd`.
    if not isinstance(call_node.func, ast.Attribute) or not isinstance(
        call_node.func.value, ast.Name
    ):
        raise ValueError('Invalid DataFrame constructor: invalid function call')

    if call_node.func.value.id != 'pd' or call_node.func.attr != 'DataFrame':
        raise ValueError('Only pd.DataFrame constructors are allowed')

    # `pd.DataFrame(**something)` has no literal keyword name; reject it
    # rather than passing a None key to the constructor.
    for kw in call_node.keywords:
        if kw.arg is None:
            raise ValueError('Invalid DataFrame constructor: ** unpacking is not supported')

    try:
        # Literal-evaluate every argument and forward all of them, so
        # keywords such as `columns=` or `index=` are honoured instead of
        # being silently discarded.
        args = [ast.literal_eval(arg) for arg in call_node.args]
        kwargs = {kw.arg: ast.literal_eval(kw.value) for kw in call_node.keywords}
        return pd.DataFrame(*args, **kwargs)
    except (ValueError, SyntaxError, TypeError) as e:
        # Chain the original error so the offending literal stays visible.
        raise ValueError(f'Error evaluating DataFrame constructor: {str(e)}') from e
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def execute_pandas_code(code_string: str, output_dir: str) -> Dict[str, Any]:
    """Execute a restricted pandas snippet and persist its DataFrames as CSV.

    Only literal ``name = pd.DataFrame(...)`` assignments are honoured; no
    arbitrary code is ever run. Each discovered DataFrame is written to
    ``<output_dir>/<name>.csv``.

    Args:
        code_string: Pandas code (without imports) to scan.
        output_dir: Directory that will receive one CSV per DataFrame.

    Returns:
        Dict with ``success`` and ``message`` keys, plus ``saved_files`` and
        ``integrity_issues`` on success or ``error`` on failure.
    """

    def _failure(message: str, error: str) -> Dict[str, Any]:
        # Every error path shares this response shape.
        return {'success': False, 'message': message, 'error': error}

    # Bail out early when the target path exists but is not a directory.
    if os.path.exists(output_dir) and not os.path.isdir(output_dir):
        return _failure('No such file or directory', 'No such file or directory')

    # Reject anything that mentions dynamic-execution keywords.
    # NOTE: this is a plain substring scan, so even a comment containing a
    # word like "evaluate" is rejected.
    banned = ('import', '__import__', 'exec', 'eval')
    if any(word in code_string for word in banned):
        return _failure('No DataFrames found in the code', 'No DataFrames found in the code')

    try:
        tree = ast.parse(code_string)
    except SyntaxError:
        # Unparseable input is reported the same way as "nothing found".
        return _failure('No DataFrames found in the code', 'No DataFrames found in the code')

    try:
        # Collect every `name = pd.DataFrame(...)` assignment at top level.
        dataframes: Dict[str, pd.DataFrame] = {}
        for stmt in tree.body:
            if not isinstance(stmt, ast.Assign):
                continue
            for target in stmt.targets:
                if not isinstance(target, ast.Name):
                    continue
                try:
                    dataframes[target.id] = safe_eval_dataframe(stmt.value)
                except (ValueError, SyntaxError):
                    continue  # Not a literal DataFrame assignment.

        if not dataframes:
            return _failure('No DataFrames found in the code', 'No DataFrames found in the code')

        try:
            os.makedirs(output_dir, exist_ok=True)
            saved_files = []
            for name, frame in dataframes.items():
                destination = os.path.join(output_dir, f'{name}.csv')
                frame.to_csv(destination, index=False)
                saved_files.append(
                    {
                        'name': name,
                        'path': destination,
                        'shape': frame.shape,
                        'columns': frame.columns.tolist(),
                    }
                )

            # 3NF / foreign-key checks only make sense across tables.
            integrity_issues = (
                check_referential_integrity(dataframes) if len(dataframes) > 1 else []
            )

            return {
                'success': True,
                'message': f'Saved {len(saved_files)} DataFrames to {output_dir}',
                'saved_files': saved_files,
                'integrity_issues': integrity_issues,
            }
        except (OSError, PermissionError) as e:
            return _failure(str(e), 'Failed to save DataFrames')
    except Exception:
        # Anything unexpected is also reported as "nothing found".
        return _failure('No DataFrames found in the code', 'No DataFrames found in the code')
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def check_referential_integrity(dataframes: Dict[str, pd.DataFrame]) -> List[Dict[str, Any]]:
    """Run lightweight third-normal-form checks across a set of tables.

    Two kinds of issues are reported:

    1. ``referential_integrity`` — a column shared by two tables, unique in
       the target table (so it could be a primary key), whose values in the
       source table reference rows that do not exist in the target.
    2. ``functional_dependency`` — within one table, a column whose value
       fully determines another column's value (possible 3NF violation).

    Args:
        dataframes: Mapping of table name to DataFrame.

    Returns:
        List of issue dicts; empty when no problems are detected.
    """
    issues: List[Dict[str, Any]] = []

    # --- Foreign-key checks over every ordered pair of distinct tables ---
    for source_name, source_df in dataframes.items():
        for target_name, target_df in dataframes.items():
            if target_name == source_name:
                continue

            # Columns present in both tables are candidate foreign keys.
            shared_columns = set(source_df.columns) & set(target_df.columns)
            for col in shared_columns:
                # Only treat the column as a key when it is unique in the
                # target table (i.e. it could be that table's primary key).
                if target_df[col].nunique() != len(target_df):
                    continue

                # Any non-null source value absent from the target dangles.
                dangling = set(source_df[col].dropna()) - set(target_df[col])
                if not dangling:
                    continue

                issues.append(
                    {
                        'type': 'referential_integrity',
                        'source_table': source_name,
                        'target_table': target_name,
                        'column': col,
                        'missing_values': list(dangling)[:10],  # sample capped at 10
                        'missing_count': len(dangling),
                    }
                )

    # --- Functional-dependency checks within each table ---
    for table_name, frame in dataframes.items():
        columns = list(frame.columns)
        for determinant in columns:
            for dependent in columns:
                if determinant == dependent:
                    continue

                # One distinct dependent value per determinant value means
                # `determinant` functionally determines `dependent`.
                distinct_per_group = frame.groupby(determinant)[dependent].nunique()
                if (distinct_per_group == 1).all():
                    issues.append(
                        {
                            'type': 'functional_dependency',
                            'table': table_name,
                            'determinant': determinant,
                            'dependent': dependent,
                            'message': f"Column '{determinant}' functionally determines '{dependent}' (possible violation of 3NF)",
                        }
                    )

    return issues
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
# Example usage
if __name__ == '__main__':
    # Demo snippet: three literal DataFrame assignments, deliberately seeded
    # with one dangling foreign key (customer_id 5 has no customers row) and
    # one functional dependency (city -> zip_code), so both integrity checks
    # have something to report.
    test_code = """
# Create a customers table
customers_df = pd.DataFrame({
    'customer_id': [1, 2, 3, 4],
    'name': ['Alice', 'Bob', 'Charlie', 'Dave'],
    'city': ['New York', 'San Francisco', 'Seattle', 'Chicago'],
    'zip_code': ['10001', '94103', '98101', '60601']
})

# Create an orders table with a foreign key
orders_df = pd.DataFrame({
    'order_id': [101, 102, 103, 104, 105],
    'customer_id': [1, 2, 3, 5, 2],  # Note: customer_id 5 doesn't exist
    'amount': [99.99, 149.99, 29.99, 59.99, 199.99],
    'order_date': ['2023-01-15', '2023-01-16', '2023-01-17', '2023-01-18', '2023-01-19']
})

# Create a table with a functional dependency issue (city determines zip_code)
address_df = pd.DataFrame({
    'address_id': [1, 2, 3, 4],
    'city': ['New York', 'San Francisco', 'New York', 'Seattle'],
    'zip_code': ['10001', '94103', '10001', '98101']  # Note: New York always has 10001
})
"""
    # Writes one CSV per DataFrame into ./test_output (relative to the
    # current working directory) and prints the result payload.
    result = execute_pandas_code(test_code, 'test_output')
    print(result)
|