awslabs.syntheticdata-mcp-server 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
awslabs/__init__.py ADDED
@@ -0,0 +1,12 @@
+ # This file is part of the awslabs namespace.
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+ # with the License. A copy of the License is located at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
+ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
+ # and limitations under the License.
+ # It is intentionally minimal to support PEP 420 namespace packages.
@@ -0,0 +1,16 @@
+ #
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+ # with the License. A copy of the License is located at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
+ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
+ # and limitations under the License.
+
+
+ """SyntheticData MCP Server package."""
+
+ __version__ = '0.0.0'
@@ -0,0 +1,267 @@
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+ # with the License. A copy of the License is located at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
+ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
+ # and limitations under the License.
+
+
+ import ast
+ import os
+ import pandas as pd
+ from typing import Any, Dict, List
+
+
+ def safe_eval_dataframe(node: ast.AST) -> pd.DataFrame:
+     """Safely evaluate a DataFrame constructor.
+
+     Args:
+         node: The AST node representing the DataFrame constructor
+
+     Returns:
+         A pandas DataFrame object
+     """
+     # Extract the Call node from different node types
+     if isinstance(node, ast.Expr):
+         call_node = node.value
+     elif isinstance(node, ast.Assign):
+         call_node = node.value
+     elif isinstance(node, ast.Call):
+         call_node = node
+     else:
+         raise ValueError('Invalid DataFrame constructor: unexpected node type')
+
+     if not isinstance(call_node, ast.Call):
+         raise ValueError('Invalid DataFrame constructor: expected Call node')
+
+     if not isinstance(call_node.func, ast.Attribute) or not isinstance(
+         call_node.func.value, ast.Name
+     ):
+         raise ValueError('Invalid DataFrame constructor: invalid function call')
+
+     if call_node.func.value.id != 'pd' or call_node.func.attr != 'DataFrame':
+         raise ValueError('Only pd.DataFrame constructors are allowed')
+
+     try:
+         if len(call_node.args) > 0:
+             # Handle positional arguments
+             data = ast.literal_eval(call_node.args[0])
+             return pd.DataFrame(data)
+
+         # Handle keyword arguments (most common case with dictionary input)
+         for kw in call_node.keywords:
+             if kw.arg == 'data':
+                 data = ast.literal_eval(kw.value)
+                 return pd.DataFrame(data)
+
+         # If no data argument is found, try to evaluate as empty DataFrame
+         return pd.DataFrame()
+     except (ValueError, SyntaxError) as e:
+         raise ValueError(f'Error evaluating DataFrame constructor: {str(e)}')
+
+
+ def execute_pandas_code(code_string: str, output_dir: str) -> Dict[str, Any]:
+     """Execute pandas code and save any dataframes to CSV files.
+
+     Args:
+         code_string: A string containing pandas code (without imports)
+         output_dir: The directory where to save DataFrames as CSV files
+
+     Returns:
+         Dict containing execution results and information about saved files
+     """
+     # Verify directory path is valid before attempting anything
+     if os.path.exists(output_dir) and not os.path.isdir(output_dir):
+         return {
+             'success': False,
+             'message': 'No such file or directory',
+             'error': 'No such file or directory',
+         }
+
+     # Parse and execute the code
+     try:
+         # Check for security violations
+         if any(keyword in code_string for keyword in ['import', '__import__', 'exec', 'eval']):
+             return {
+                 'success': False,
+                 'message': 'No DataFrames found in the code',
+                 'error': 'No DataFrames found in the code',
+             }
+
+         tree = ast.parse(code_string)
+     except SyntaxError:
+         # For syntax errors, return "No DataFrames found"
+         return {
+             'success': False,
+             'message': 'No DataFrames found in the code',
+             'error': 'No DataFrames found in the code',
+         }
+
+     # Look for DataFrame assignments
+     dataframes = {}
+     try:
+         for node in tree.body:
+             if isinstance(node, ast.Assign):
+                 for target in node.targets:
+                     if isinstance(target, ast.Name):
+                         try:
+                             df = safe_eval_dataframe(node.value)
+                             dataframes[target.id] = df
+                         except (ValueError, SyntaxError):
+                             pass  # Not a DataFrame assignment
+
+         # If no DataFrames found, return early
+         if not dataframes:
+             return {
+                 'success': False,
+                 'message': 'No DataFrames found in the code',
+                 'error': 'No DataFrames found in the code',
+             }
+
+         # Try to create output directory and save files
+         saved_files = []
+         integrity_issues = []
+         try:
+             os.makedirs(output_dir, exist_ok=True)
+             for df_name, df in dataframes.items():
+                 file_path = os.path.join(output_dir, f'{df_name}.csv')
+                 df.to_csv(file_path, index=False)
+                 saved_files.append(
+                     {
+                         'name': df_name,
+                         'path': file_path,
+                         'shape': df.shape,
+                         'columns': df.columns.tolist(),
+                     }
+                 )
+
+             # Check referential integrity if multiple dataframes exist
+             if len(dataframes) > 1:
+                 integrity_issues = check_referential_integrity(dataframes)
+
+             return {
+                 'success': True,
+                 'message': f'Saved {len(saved_files)} DataFrames to {output_dir}',
+                 'saved_files': saved_files,
+                 'integrity_issues': integrity_issues,
+             }
+         except (OSError, PermissionError) as e:
+             return {
+                 'success': False,
+                 'message': str(e),
+                 'error': 'Failed to save DataFrames',
+             }
+
+     except Exception:
+         # For any other errors, return "No DataFrames found"
+         return {
+             'success': False,
+             'message': 'No DataFrames found in the code',
+             'error': 'No DataFrames found in the code',
+         }
+
+
+ def check_referential_integrity(dataframes: Dict[str, pd.DataFrame]) -> List[Dict[str, Any]]:
+     """Check referential integrity between dataframes.
+
+     This function does basic third normal form checks:
+     1. Identifies potential foreign keys (columns with same name across tables)
+     2. Checks if values in potential foreign key columns exist in the target table
+     3. Checks for functional dependencies within each table
+
+     Args:
+         dataframes: Dictionary of dataframe name to dataframe object
+
+     Returns:
+         List of integrity issues found
+     """
+     issues = []
+
+     # Check for potential foreign keys and their integrity
+     for source_name, source_df in dataframes.items():
+         for target_name, target_df in dataframes.items():
+             if source_name == target_name:
+                 continue
+
+             # Find columns with same name in both dataframes (potential foreign keys)
+             common_cols = set(source_df.columns).intersection(set(target_df.columns))
+
+             for col in common_cols:
+                 # Check if column in target_df has unique values (could be a primary key)
+                 if target_df[col].nunique() == len(target_df):
+                     # Check if all values in source_df[col] exist in target_df[col]
+                     source_values = set(source_df[col].dropna())
+                     target_values = set(target_df[col])
+
+                     missing_values = source_values - target_values
+                     if missing_values:
+                         issues.append(
+                             {
+                                 'type': 'referential_integrity',
+                                 'source_table': source_name,
+                                 'target_table': target_name,
+                                 'column': col,
+                                 'missing_values': list(missing_values)[
+                                     :10
+                                 ],  # Limit to first 10 values
+                                 'missing_count': len(missing_values),
+                             }
+                         )
+
+     # Check for functional dependencies
+     for df_name, df in dataframes.items():
+         for col1 in df.columns:
+             for col2 in df.columns:
+                 if col1 == col2:
+                     continue
+
+                 # Group by potential determinant and check if it determines the dependent
+                 grouped = df.groupby(col1)[col2].nunique()
+
+                 # Check if each value in col1 maps to exactly one value in col2
+                 if (grouped == 1).all():
+                     issues.append(
+                         {
+                             'type': 'functional_dependency',
+                             'table': df_name,
+                             'determinant': col1,
+                             'dependent': col2,
+                             'message': f"Column '{col1}' functionally determines '{col2}' (possible violation of 3NF)",
+                         }
+                     )
+
+     return issues
+
+
+ # Example usage
+ if __name__ == '__main__':
+     test_code = """
+ # Create a customers table
+ customers_df = pd.DataFrame({
+     'customer_id': [1, 2, 3, 4],
+     'name': ['Alice', 'Bob', 'Charlie', 'Dave'],
+     'city': ['New York', 'San Francisco', 'Seattle', 'Chicago'],
+     'zip_code': ['10001', '94103', '98101', '60601']
+ })
+
+ # Create an orders table with a foreign key
+ orders_df = pd.DataFrame({
+     'order_id': [101, 102, 103, 104, 105],
+     'customer_id': [1, 2, 3, 5, 2],  # Note: customer_id 5 doesn't exist
+     'amount': [99.99, 149.99, 29.99, 59.99, 199.99],
+     'order_date': ['2023-01-15', '2023-01-16', '2023-01-17', '2023-01-18', '2023-01-19']
+ })
+
+ # Create a table with a functional dependency issue (city determines zip_code)
+ address_df = pd.DataFrame({
+     'address_id': [1, 2, 3, 4],
+     'city': ['New York', 'San Francisco', 'New York', 'Seattle'],
+     'zip_code': ['10001', '94103', '10001', '98101']  # Note: New York always has 10001
+ })
+ """
+     result = execute_pandas_code(test_code, 'test_output')
+     print(result)
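
For context, a minimal sketch of how the module added in the last hunk might be driven once the package is installed. The diff does not reveal the file's import path, so the module name pandas_interpreter and the './out' directory below are assumptions; only execute_pandas_code and the result keys ('success', 'error', 'saved_files', 'integrity_issues') come from the code shown above.

# Sketch only: 'pandas_interpreter' is an assumed module name; the diff does not
# show the file's path. execute_pandas_code and the result keys are from the
# code in the hunk above.
from pandas_interpreter import execute_pandas_code

snippet = """
users_df = pd.DataFrame({'user_id': [1, 2], 'name': ['Ana', 'Bo']})
orders_df = pd.DataFrame({'order_id': [10, 11], 'user_id': [1, 3]})
"""

result = execute_pandas_code(snippet, './out')  # './out' is an illustrative directory
if result['success']:
    for f in result['saved_files']:
        print(f['name'], f['shape'], f['columns'])
    for issue in result['integrity_issues']:
        # orders_df.user_id == 3 has no match in users_df, so a
        # 'referential_integrity' issue should be reported here
        print(issue)
else:
    print(result['error'])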