awslabs.syntheticdata-mcp-server 0.0.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,776 @@
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+ # with the License. A copy of the License is located at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
+ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
+ # and limitations under the License.
+
+ """AWS syntheticdata MCP Server implementation."""
+
+ import argparse
+ import os
+ import pandas as pd
+ import re
+ from awslabs.syntheticdata_mcp_server.pandas_interpreter import (
+     execute_pandas_code as _execute_pandas_code,
+ )
+ from awslabs.syntheticdata_mcp_server.storage import UnifiedDataLoader
+ from mcp.server.fastmcp import FastMCP
+ from pydantic import BaseModel, Field
+ from typing import Any, Dict, List, Optional
+
+
+ class ExecutePandasCodeInput(BaseModel):
+     """Input model for executing pandas code to generate synthetic data.
+
+     This model defines the required parameters for running pandas code in a restricted
+     environment and saving the resulting DataFrames as CSV files.
+
+     Attributes:
+         code: Python code that uses pandas to generate synthetic data. The code should
+             define one or more pandas DataFrames. Pandas is already available as "pd".
+         workspace_dir: The current workspace directory. Critical for saving files to
+             the user's current project.
+         output_dir: Optional subdirectory within workspace_dir to save CSV files to.
+             If not provided, files will be saved directly to workspace_dir.
+     """
+
+     code: str = Field(
+         ...,
+         description='Python code that uses pandas to generate synthetic data. The code should define one or more pandas DataFrames. Pandas is already available as "pd".',
+     )
+     workspace_dir: str = Field(
+         ...,
+         description="CRITICAL: The current workspace directory. Assistant must always provide this parameter to save files to the user's current project.",
+     )
+     output_dir: Optional[str] = Field(
+         None,
+         description='Optional subdirectory within workspace_dir to save CSV files to. If not provided, files will be saved directly to workspace_dir.',
+     )
+
+
+ class ValidateAndSaveDataInput(BaseModel):
+     """Input model for validating and saving data as CSV files.
+
+     This model defines the required parameters for validating JSON Lines data structure
+     and saving the data as CSV files using pandas.
+
+     Attributes:
+         data: Dictionary mapping table names to lists of records. Each record should
+             be a dictionary mapping column names to values.
+         workspace_dir: The current workspace directory. Critical for saving files to
+             the user's current project.
+         output_dir: Optional subdirectory within workspace_dir to save CSV files to.
+             If not provided, files will be saved directly to workspace_dir.
+     """
+
+     data: Dict[str, List[Dict]] = Field(
+         ...,
+         description='Dictionary mapping table names to lists of records. Each record should be a dictionary mapping column names to values.',
+     )
+     workspace_dir: str = Field(
+         ...,
+         description="CRITICAL: The current workspace directory. Assistant must always provide this parameter to save files to the user's current project.",
+     )
+     output_dir: Optional[str] = Field(
+         None,
+         description='Optional subdirectory within workspace_dir to save CSV files to. If not provided, files will be saved directly to workspace_dir.',
+     )
+
+
+ class LoadToStorageInput(BaseModel):
+     """Input model for loading data to storage targets.
+
+     This model defines the required parameters for loading data to configured storage
+     targets like S3, with support for various formats and optimizations.
+
+     Attributes:
+         data: Dictionary mapping table names to lists of records. Each record should
+             be a dictionary mapping column names to values.
+         targets: List of target configurations. Each target should have a "type"
+             (e.g., "s3") and target-specific "config".
+     """
+
+     data: Dict[str, List[Dict]] = Field(
+         ...,
+         description='Dictionary mapping table names to lists of records. Each record should be a dictionary mapping column names to values.',
+     )
+     targets: List[Dict[str, Any]] = Field(
+         ...,
+         description='List of target configurations. Each target should have a "type" (e.g., "s3") and target-specific "config".',
+     )
+
+
+ mcp = FastMCP(
+     'awslabs.syntheticdata-mcp-server',
+     instructions="""
+     # awslabs Synthetic Data MCP Server
+
+     This MCP server provides tools for generating high-quality synthetic data based on business use cases.
+
+     ## Capabilities
+
+     - Provides detailed instructions for generating synthetic data based on business descriptions
+     - Validates and saves JSON Lines data as CSV files
+     - Loads data to various storage targets (S3, with more coming soon)
+     - Supports multiple data formats (CSV, JSON, Parquet)
+     - Handles data partitioning and storage optimization
+
+     ## Workflow
+
+     1. Start by describing your business domain and use case
+     2. Get detailed instructions for generating synthetic data
+     3. Generate the data in JSON Lines format following the instructions
+     4. Validate and save the data as CSV files
+     5. (Optional) Load the data to storage targets like S3 with optimized formats and partitioning
+
+     ## Use Cases
+
+     - Development and testing environments
+     - ML model training and validation
+     - Demo applications and presentations
+     - Data pipeline testing
+     """,
+     dependencies=[
+         'pydantic',
+         'pandas',
+         'boto3',
+     ],
+ )
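
Seen from a client's side, the workflow listed in the server instructions above amounts to three tool calls. A minimal sketch follows; the tool names match the decorators below, but every argument value is illustrative rather than taken from the package:

```python
# Hypothetical tool-call sequence following the documented workflow.
# Tool names are the real registered names; payloads are made-up examples.
workflow_calls = [
    ('get_data_generation_instructions',
     {'business_description': 'An online bookstore with customers, books, and orders'}),
    ('validate_and_save_data',
     {'data': {'customers': [{'id': 1, 'name': 'Alice'}]},
      'workspace_dir': '/path/to/project'}),
    ('load_to_storage',  # optional final step
     {'data': {'customers': [{'id': 1, 'name': 'Alice'}]},
      'targets': [{'type': 's3',
                   'config': {'bucket': 'my-bucket', 'prefix': 'data/', 'format': 'csv'}}]}),
]
```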
+
+
+ @mcp.tool(name='get_data_generation_instructions')
+ async def get_data_generation_instructions(
+     business_description: str = Field(
+         ...,
+         description='A detailed description of the business domain and use case. The more specific and comprehensive the description, the better the data generation instructions will be.',
+     ),
+ ) -> Dict:
+     """Get instructions for generating synthetic data based on a business description.
+
+     This tool analyzes a business description and provides detailed instructions
+     for generating synthetic data in JSON Lines format.
+
+     Parameters:
+         business_description: A description of the business use case
+
+     Returns:
+         A dictionary containing detailed instructions for generating synthetic data
+     """
+     try:
+         # Validate input
+         if not business_description or not business_description.strip():
+             return {'success': False, 'error': 'Business description cannot be empty'}
+
+         # Extract key entities and concepts from the business description
+         entities = _extract_key_entities(business_description)
+
+         # Generate instructions for data structure
+         data_structure_instructions = _generate_data_structure_instructions(
+             business_description, entities
+         )
+
+         # Generate instructions for data generation
+         data_generation_instructions = _generate_data_generation_instructions(entities)
+
+         # Generate example data
+         example_data = _generate_example_data(entities)
+
+         # Compile all instructions
+         instructions = {
+             'overview': f"Based on the business description: '{business_description}', you should generate synthetic data with the following structure and characteristics:",
+             'data_structure_instructions': data_structure_instructions,
+             'data_generation_instructions': data_generation_instructions,
+             'format_instructions': {
+                 'format': 'JSON Lines',
+                 'description': 'Each line should be a valid JSON object representing a single record. Different tables should be in separate JSON Lines files.',
+                 'example': example_data,
+             },
+             'validation_instructions': {
+                 'description': 'After generating the data, use the validate_and_save_data tool to validate and save the data as CSV files.',
+                 'parameters': {
+                     'data': 'The JSON Lines data you generated',
+                     'workspace_dir': 'IMPORTANT: Always provide the current workspace directory',
+                     'output_dir': 'Optional subdirectory within workspace_dir (defaults to workspace_dir)',
+                 },
+             },
+         }
+
+         return {
+             'success': True,
+             'instructions': instructions,
+         }
+     except Exception as e:
+         return {
+             'success': False,
+             'error': str(e),
+         }
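
For reference, a successful response from this tool has the following nested shape, condensed from the construction above; the `'...'` placeholders stand for the detailed text the function fills in, and the sample entity names are illustrative:

```python
# Condensed shape of a successful get_data_generation_instructions response.
response = {
    'success': True,
    'instructions': {
        'overview': '...',
        'data_structure_instructions': {
            'general_instructions': ['...'],
            'entity_instructions': {'customer': {'description': '...', 'suggestions': ['id', 'name']}},
            'relationship_instructions': ['...'],
        },
        'data_generation_instructions': {
            'general_instructions': ['...'],
            'data_quality_instructions': ['...'],
            'recommended_record_counts': {'customer': 50},
        },
        'format_instructions': {'format': 'JSON Lines', 'description': '...', 'example': {}},
        'validation_instructions': {'description': '...', 'parameters': {}},
    },
}
```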
+
+
+ @mcp.tool(name='validate_and_save_data')
+ async def validate_and_save_data(input_data: ValidateAndSaveDataInput) -> Dict:
+     """Validate JSON Lines data and save it as CSV files.
+
+     This tool validates the structure of JSON Lines data and saves it as CSV files
+     using pandas.
+
+     Parameters:
+         data: Dictionary mapping table names to lists of records
+         workspace_dir: CRITICAL - The current workspace directory
+         output_dir: Optional subdirectory within workspace_dir to save CSV files to
+
+     Returns:
+         A dictionary containing validation results and paths to saved CSV files
+     """
+     try:
+         # Initialize results
+         csv_paths = {}
+         row_counts = {}
+         validation_results = {}
+         save_dir = input_data.workspace_dir
+         if input_data.output_dir:
+             save_dir = os.path.join(input_data.workspace_dir, input_data.output_dir)
+
+         # Validate all tables first
+         for table_name, records in input_data.data.items():
+             validation_result = _validate_table_data(table_name, records)
+             validation_results[table_name] = validation_result
+
+         # Check if all tables are valid
+         all_valid = all(result['is_valid'] for result in validation_results.values())
+
+         # If any validation failed, return error
+         if not all_valid:
+             error_messages = []
+             for table_name, result in validation_results.items():
+                 if not result['is_valid']:
+                     error_messages.extend(result['errors'])
+             return {
+                 'success': False,
+                 'error': '; '.join(error_messages),
+                 'validation_results': validation_results,
+             }
+
+         # Create directory and save tables
+         try:
+             os.makedirs(save_dir, exist_ok=True)
+             for table_name, records in input_data.data.items():
+                 # Convert to DataFrame
+                 df = pd.DataFrame(records)
+
+                 # Save as CSV
+                 csv_path = os.path.join(save_dir, f'{table_name}.csv')
+                 df.to_csv(csv_path, index=False)
+
+                 # Record results
+                 csv_paths[table_name] = csv_path
+                 row_counts[table_name] = len(df)
+
+             return {
+                 'success': True,
+                 'validation_results': validation_results,
+                 'csv_paths': csv_paths,
+                 'row_counts': row_counts,
+                 'output_dir': save_dir,
+             }
+         except Exception as e:
+             return {
+                 'success': False,
+                 'error': str(e),
+                 'validation_results': validation_results,
+             }
+     except Exception as e:
+         return {
+             'success': False,
+             'error': str(e),
+         }
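
A minimal illustration of what this tool consumes and returns on success, assuming a workspace at /tmp/demo (all paths and record values here are examples, not package defaults):

```python
# Illustrative validate_and_save_data input and the matching success result.
payload = {
    'data': {'customers': [{'id': 1, 'name': 'Alice'}, {'id': 2, 'name': 'Bob'}]},
    'workspace_dir': '/tmp/demo',   # assumed workspace path
    'output_dir': 'synthetic',
}
# Per the code above, a successful run would report:
expected = {
    'success': True,
    'validation_results': {'customers': {'is_valid': True, 'errors': []}},
    'csv_paths': {'customers': '/tmp/demo/synthetic/customers.csv'},
    'row_counts': {'customers': 2},
    'output_dir': '/tmp/demo/synthetic',
}
```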
+
+
+ @mcp.tool(name='load_to_storage')
+ async def load_to_storage(input_data: LoadToStorageInput) -> Dict:
+     """Load data to one or more storage targets.
+
+     This tool uses the UnifiedDataLoader to load data to configured storage targets.
+     Currently supports:
+     - S3: Load data as CSV, JSON, or Parquet files with optional partitioning
+
+     Example targets configuration:
+     ```python
+     targets = [
+         {
+             'type': 's3',
+             'config': {
+                 'bucket': 'my-bucket',
+                 'prefix': 'data/users/',
+                 'format': 'parquet',
+                 'partitioning': {'enabled': True, 'columns': ['region']},
+                 'storage': {'class': 'INTELLIGENT_TIERING', 'encryption': 'AES256'},
+             },
+         }
+     ]
+     ```
+
+     Parameters:
+         data: Dictionary mapping table names to lists of records
+         targets: List of target configurations
+
+     Returns:
+         Dictionary containing results for each target
+     """
+     try:
+         loader = UnifiedDataLoader()
+         result = await loader.load_data(input_data.data, input_data.targets)
+         return result
+     except Exception as e:
+         return {
+             'success': False,
+             'error': str(e),
+         }
+
+
+ @mcp.tool(name='execute_pandas_code')
+ async def execute_pandas_code(input_data: ExecutePandasCodeInput) -> Dict:
+     """Execute pandas code to generate synthetic data and save it as CSV files.
+
+     This tool runs pandas code in a restricted environment to generate synthetic data.
+     It then saves any generated DataFrames as CSV files.
+
+     ## Features
+
+     1. **Multiple DataFrame Detection**: The tool automatically finds all pandas DataFrames defined in your code and saves them as separate CSV files.
+
+     2. **Referential Integrity Checking**: For multi-table data models, the tool checks for foreign key relationships and validates that references are valid.
+
+     3. **Third Normal Form Validation**: The tool identifies potential 3NF violations like functional dependencies between non-key attributes.
+
+     ## Code Requirements
+
+     - Your code should define one or more pandas DataFrames
+     - No need to include imports - pandas is already available as 'pd'
+     - No need to include save logic - all DataFrames will be automatically saved
+
+     ## Example Usage
+
+     ```python
+     # Simple table
+     customers_df = pd.DataFrame(
+         {
+             'customer_id': [1, 2, 3],
+             'name': ['Alice', 'Bob', 'Charlie'],
+             'city': ['New York', 'San Francisco', 'Chicago'],
+         }
+     )
+
+     # Related table with foreign key
+     orders_df = pd.DataFrame(
+         {'order_id': [101, 102, 103], 'customer_id': [1, 2, 3], 'amount': [99.99, 149.99, 199.99]}
+     )
+     ```
+
+     Parameters:
+         code: Python code using pandas to generate synthetic data
+         workspace_dir: CRITICAL - The current workspace directory
+         output_dir: Optional subdirectory within workspace_dir to save CSV files to
+
+     Returns:
+         A dictionary containing execution results and paths to saved CSV files
+     """
+     try:
+         # Determine the output directory
+         save_dir = input_data.workspace_dir
+         if input_data.output_dir:
+             save_dir = os.path.join(input_data.workspace_dir, input_data.output_dir)
+
+         # Use the imported execute_pandas_code function
+         result = _execute_pandas_code(input_data.code, save_dir)
+
+         # Only create directory and set success if DataFrames were found
+         if result.get('saved_files'):
+             os.makedirs(save_dir, exist_ok=True)
+             result['success'] = True
+             result['workspace_dir'] = input_data.workspace_dir
+             if input_data.output_dir:
+                 result['output_subdir'] = input_data.output_dir
+         else:
+             result['success'] = False
+             result['error'] = 'No DataFrames found in code'
+
+         return result
+     except Exception as e:
+         return {
+             'success': False,
+             'error': str(e),
+             'message': f'Error executing pandas code: {str(e)}',
+         }
+
+
+ def _extract_key_entities(description: str) -> List[str]:
+     """Extract key entities from a business description.
+
+     This is a simplified implementation that looks for common patterns
+     in business descriptions to identify entities.
+
+     Args:
+         description: A string describing the business use case
+
+     Returns:
+         A list of potential entity names
+     """
+     # Convert to lowercase for easier matching
+     desc_lower = description.lower()
+
+     # Look for common patterns like "X table", "Y database", etc.
+     table_patterns = [
+         r'(\w+)\s+table',
+         r'table\s+of\s+(\w+)s?',
+         r'(\w+)\s+database',
+         r'(\w+)\s+records',
+         r'(\w+)\s+data',
+     ]
+
+     entities = []
+     for pattern in table_patterns:
+         matches = re.findall(pattern, desc_lower)
+         entities.extend(matches)
+
+     # Look for common entity names in business domains
+     common_entities = [
+         'user',
+         'customer',
+         'product',
+         'order',
+         'item',
+         'category',
+         'transaction',
+         'payment',
+         'invoice',
+         'employee',
+         'department',
+         'menu',
+         'reservation',
+         'booking',
+         'review',
+         'comment',
+         'address',
+         'location',
+         'store',
+         'supplier',
+         'inventory',
+     ]
+
+     for entity in common_entities:
+         if entity in desc_lower or f'{entity}s' in desc_lower:
+             entities.append(entity)
+
+     # Remove duplicates and normalize
+     entities = list(set(entities))
+     entities = [e.strip().lower() for e in entities if e.strip()]
+
+     return entities
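
To make the heuristics above concrete, here is a self-contained rerun of the same style of regex and keyword matching on a sample description. The snippet reimplements a slice of the function for illustration rather than importing it, since the module path inside the wheel is not shown in this diff:

```python
import re

desc = 'A restaurant that takes reservations and tracks orders from customers'.lower()

# One of the patterns used above: "<word> table" phrases (none in this sample).
pattern_hits = re.findall(r'(\w+)\s+table', desc)

# Keyword matching against a few of the common entity names.
keyword_hits = [
    e for e in ('customer', 'order', 'reservation', 'menu')
    if e in desc or f'{e}s' in desc
]

print(sorted(set(pattern_hits + keyword_hits)))
# ['customer', 'order', 'reservation']
```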
+
+
+ def _generate_data_structure_instructions(description: str, entities: List[str]) -> Dict:
+     """Generate instructions for data structure.
+
+     Args:
+         description: A string describing the business use case
+         entities: A list of potential entity names
+
+     Returns:
+         A dictionary containing instructions for data structure
+     """
+     # Generate general instructions
+     general_instructions = [
+         'Analyze the business description to identify key entities (tables) and their attributes (columns).',
+         'Consider the relationships between entities (one-to-one, one-to-many, many-to-many).',
+         'Design a normalized data structure with appropriate primary and foreign keys.',
+         'Include appropriate data types for each column (string, integer, float, boolean, date, etc.).',
+         'Consider including common fields like created_at, updated_at, status, etc. where appropriate.',
+     ]
+
+     # Generate entity-specific instructions
+     entity_instructions = {}
+     for entity in entities:
+         entity_instructions[entity] = {
+             'description': f'Consider what attributes would be relevant for a {entity} entity in this business context.',
+             'suggestions': _get_entity_attribute_suggestions(entity),
+         }
+
+     # Generate relationship instructions
+     relationship_instructions = [
+         'Identify relationships between entities based on the business description.',
+         'Use foreign keys to represent relationships between tables.',
+         'Consider whether junction tables are needed for many-to-many relationships.',
+         'Ensure referential integrity in your data model.',
+     ]
+
+     return {
+         'general_instructions': general_instructions,
+         'entity_instructions': entity_instructions,
+         'relationship_instructions': relationship_instructions,
+     }
+
+
+ def _get_entity_attribute_suggestions(entity: str) -> List[str]:
+     """Get attribute suggestions for an entity.
+
+     Args:
+         entity: The name of the entity
+
+     Returns:
+         A list of suggested attributes
+     """
+     # Common attributes for different entity types
+     attribute_suggestions = {
+         'user': ['id', 'name', 'email', 'password_hash', 'created_at', 'last_login'],
+         'customer': ['id', 'name', 'email', 'phone', 'address', 'created_at'],
+         'product': ['id', 'name', 'description', 'price', 'category_id', 'stock_quantity'],
+         'order': ['id', 'customer_id', 'order_date', 'total_amount', 'status'],
+         'item': ['id', 'name', 'description', 'price', 'category_id'],
+         'category': ['id', 'name', 'description', 'parent_category_id'],
+         'transaction': ['id', 'order_id', 'amount', 'transaction_date', 'status'],
+         'payment': ['id', 'order_id', 'amount', 'payment_date', 'payment_method'],
+         'invoice': ['id', 'order_id', 'invoice_date', 'due_date', 'amount', 'status'],
+         'employee': ['id', 'name', 'email', 'department_id', 'position', 'hire_date'],
+         'department': ['id', 'name', 'description', 'manager_id'],
+         'menu': ['id', 'name', 'description', 'start_date', 'end_date'],
+         'reservation': ['id', 'customer_id', 'reservation_date', 'party_size', 'status'],
+         'booking': ['id', 'customer_id', 'booking_date', 'status'],
+         'review': ['id', 'customer_id', 'product_id', 'rating', 'comment', 'review_date'],
+         'comment': ['id', 'user_id', 'content', 'created_at'],
+         'address': ['id', 'street', 'city', 'state', 'postal_code', 'country'],
+         'location': ['id', 'name', 'address', 'latitude', 'longitude'],
+         'store': ['id', 'name', 'address', 'phone', 'manager_id'],
+         'supplier': ['id', 'name', 'contact_name', 'email', 'phone'],
+         'inventory': ['id', 'product_id', 'quantity', 'location_id', 'last_updated'],
+     }
+
+     # Return suggestions for the entity, or a generic list if not found
+     return attribute_suggestions.get(entity, ['id', 'name', 'description', 'created_at'])
+
+
+ def _generate_data_generation_instructions(entities: List[str]) -> Dict:
+     """Generate instructions for data generation.
+
+     Args:
+         entities: A list of potential entity names
+
+     Returns:
+         A dictionary containing instructions for data generation
+     """
+     # Generate general instructions
+     general_instructions = [
+         'Generate realistic and diverse data that reflects the business domain.',
+         'Ensure data consistency across related tables (e.g., foreign keys reference valid primary keys).',
+         'Include a mix of common and edge cases in your data.',
+         'Consider the appropriate number of records for each table based on the business context.',
+         'Generate data that covers various scenarios and use cases.',
+     ]
+
+     # Generate data quality instructions
+     data_quality_instructions = [
+         'Ensure data types are consistent (e.g., dates in ISO format, numbers as appropriate numeric types).',
+         'Include appropriate null values where fields are optional.',
+         'Ensure text fields have realistic lengths and formats.',
+         'Generate realistic values for domain-specific fields (e.g., email addresses, phone numbers, etc.).',
+         'Avoid generating duplicate primary keys.',
+     ]
+
+     return {
+         'general_instructions': general_instructions,
+         'data_quality_instructions': data_quality_instructions,
+         'recommended_record_counts': _get_recommended_record_counts(entities),
+     }
+
+
+ def _get_recommended_record_counts(entities: List[str]) -> Dict[str, int]:
+     """Get recommended record counts for entities.
+
+     Args:
+         entities: A list of potential entity names
+
+     Returns:
+         A dictionary mapping entity names to recommended record counts
+     """
+     # Default record counts for different entity types
+     record_counts = {}
+
+     for entity in entities:
+         # Assign different default counts based on entity type
+         if entity in ['user', 'customer', 'employee']:
+             record_counts[entity] = 50
+         elif entity in ['product', 'item', 'category']:
+             record_counts[entity] = 20
+         elif entity in ['order', 'transaction', 'payment', 'invoice']:
+             record_counts[entity] = 100
+         else:
+             record_counts[entity] = 30
+
+     return record_counts
+
+
+ def _generate_example_data(entities: List[str]) -> Dict[str, List[Dict]]:
+     """Generate example data for entities.
+
+     Args:
+         entities: A list of potential entity names
+
+     Returns:
+         A dictionary containing example data for entities
+     """
+     example_data = {}
+
+     # Generate example data for up to 3 entities
+     for entity in entities[:3]:
+         example_data[entity] = _get_entity_example_data(entity)
+
+     return example_data
+
+
+ def _get_entity_example_data(entity: str) -> List[Dict]:
+     """Get example data for an entity.
+
+     Args:
+         entity: The name of the entity
+
+     Returns:
+         A list of example records
+     """
+     # Example data for different entity types
+     if entity == 'user':
+         return [
+             {
+                 'id': 1,
+                 'name': 'John Doe',
+                 'email': 'john.doe@example.com',
+                 'created_at': '2023-01-15T10:30:00',
+             },
+             {
+                 'id': 2,
+                 'name': 'Jane Smith',
+                 'email': 'jane.smith@example.com',
+                 'created_at': '2023-02-20T14:45:00',
+             },
+         ]
+     elif entity == 'product':
+         return [
+             {'id': 1, 'name': 'Laptop', 'price': 999.99, 'category_id': 1, 'stock_quantity': 50},
+             {
+                 'id': 2,
+                 'name': 'Smartphone',
+                 'price': 699.99,
+                 'category_id': 1,
+                 'stock_quantity': 100,
+             },
+         ]
+     elif entity == 'order':
+         return [
+             {
+                 'id': 1,
+                 'customer_id': 1,
+                 'order_date': '2023-03-10',
+                 'total_amount': 1699.98,
+                 'status': 'completed',
+             },
+             {
+                 'id': 2,
+                 'customer_id': 2,
+                 'order_date': '2023-03-15',
+                 'total_amount': 699.99,
+                 'status': 'processing',
+             },
+         ]
+     else:
+         # Generic example data
+         return [
+             {
+                 'id': 1,
+                 'name': f'{entity.capitalize()} 1',
+                 'description': f'Description for {entity} 1',
+             },
+             {
+                 'id': 2,
+                 'name': f'{entity.capitalize()} 2',
+                 'description': f'Description for {entity} 2',
+             },
+         ]
+
+
+ def _validate_table_data(table_name: str, records: List[Dict]) -> Dict:
+     """Validate table data.
+
+     Args:
+         table_name: The name of the table
+         records: A list of records for the table
+
+     Returns:
+         A dictionary containing validation results
+     """
+     # Check if records is a list
+     if not isinstance(records, list):
+         return {
+             'is_valid': False,
+             'errors': [f"Data for table '{table_name}' must be a list of records"],
+         }
+
+     # Check if records is empty
+     if not records:
+         return {
+             'is_valid': False,
+             'errors': [f"Data for table '{table_name}' cannot be empty"],
+         }
+
+     # Check if all records are dictionaries
+     if not all(isinstance(record, dict) for record in records):
+         return {
+             'is_valid': False,
+             'errors': [f"All records for table '{table_name}' must be dictionaries"],
+         }
+
+     # Check if all records have the same keys
+     keys = set(records[0].keys())
+     if not all(set(record.keys()) == keys for record in records):
+         return {
+             'is_valid': False,
+             'errors': [f"All records for table '{table_name}' must have the same keys"],
+         }
+
+     # Check for duplicate IDs if 'id' is a key
+     if 'id' in keys:
+         ids = [record['id'] for record in records]
+         if len(ids) != len(set(ids)):
+             return {
+                 'is_valid': False,
+                 'errors': [f"Duplicate IDs found in table '{table_name}'"],
+             }
+
+     return {
+         'is_valid': True,
+         'errors': [],
+     }
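
As one example of these checks, duplicate primary keys trip the final rule. A standalone illustration (the table name 'users' is arbitrary):

```python
# Two records sharing id=1 violate the duplicate-ID check in _validate_table_data.
bad_records = [{'id': 1, 'name': 'Alice'}, {'id': 1, 'name': 'Bob'}]
# _validate_table_data('users', bad_records) returns:
# {'is_valid': False, 'errors': ["Duplicate IDs found in table 'users'"]}
```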
+
+
+ def main():
+     """Run the MCP server with CLI argument support."""
+     parser = argparse.ArgumentParser(
+         description='MCP server for generating synthetic data based on business use cases'
+     )
+     parser.add_argument('--sse', action='store_true', help='Use SSE transport')
+     parser.add_argument('--port', type=int, default=8888, help='Port to run the server on')
+
+     args = parser.parse_args()
+
+     # Run server with appropriate transport
+     if args.sse:
+         mcp.settings.port = args.port
+         mcp.run(transport='sse')
+     else:
+         mcp.run()
+
+
+ if __name__ == '__main__':
+     main()
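
The CLI handling in main() reduces to two flags. A standalone sketch of how they parse (the installed console-script name is not visible in this diff, so the parser is rebuilt here for illustration):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--sse', action='store_true')
parser.add_argument('--port', type=int, default=8888)

print(parser.parse_args([]))
# Namespace(port=8888, sse=False) -> default stdio transport
print(parser.parse_args(['--sse', '--port', '9000']))
# Namespace(port=9000, sse=True)  -> SSE transport on port 9000
```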