awslabs.dynamodb-mcp-server 1.0.8__tar.gz → 1.0.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of awslabs.dynamodb-mcp-server might be problematic.

Files changed (27)
  1. {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/PKG-INFO +3 -1
  2. {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/awslabs/dynamodb_mcp_server/__init__.py +1 -1
  3. {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/pyproject.toml +3 -1
  4. awslabs_dynamodb_mcp_server-1.0.9/tests/evals/README.md +357 -0
  5. awslabs_dynamodb_mcp_server-1.0.9/tests/evals/dynamic_evaluators.py +251 -0
  6. awslabs_dynamodb_mcp_server-1.0.9/tests/evals/evaluation_registry.py +243 -0
  7. awslabs_dynamodb_mcp_server-1.0.9/tests/evals/logging_config.py +77 -0
  8. awslabs_dynamodb_mcp_server-1.0.9/tests/evals/multiturn_evaluator.py +376 -0
  9. awslabs_dynamodb_mcp_server-1.0.9/tests/evals/scenarios.py +244 -0
  10. awslabs_dynamodb_mcp_server-1.0.9/tests/evals/test_dspy_evals.py +338 -0
  11. awslabs_dynamodb_mcp_server-1.0.9/uv.lock +3219 -0
  12. awslabs_dynamodb_mcp_server-1.0.8/uv.lock +0 -1594
  13. {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/.gitignore +0 -0
  14. {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/.python-version +0 -0
  15. {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/CHANGELOG.md +0 -0
  16. {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/Dockerfile +0 -0
  17. {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/LICENSE +0 -0
  18. {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/NOTICE +0 -0
  19. {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/README.md +0 -0
  20. {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/awslabs/__init__.py +0 -0
  21. {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/awslabs/dynamodb_mcp_server/common.py +0 -0
  22. {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/awslabs/dynamodb_mcp_server/prompts/dynamodb_architect.md +0 -0
  23. {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/awslabs/dynamodb_mcp_server/server.py +0 -0
  24. {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/docker-healthcheck.sh +0 -0
  25. {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/tests/test_dynamodb_server.py +0 -0
  26. {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/tests/test_readonly_delete_table.py +0 -0
  27. {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/uv-requirements.txt +0 -0
PKG-INFO:

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: awslabs.dynamodb-mcp-server
- Version: 1.0.8
+ Version: 1.0.9
  Summary: The official MCP Server for interacting with AWS DynamoDB
  Project-URL: homepage, https://awslabs.github.io/mcp/
  Project-URL: docs, https://awslabs.github.io/mcp/servers/dynamodb-mcp-server/
@@ -22,9 +22,11 @@ Classifier: Programming Language :: Python :: 3.12
  Classifier: Programming Language :: Python :: 3.13
  Requires-Python: >=3.10
  Requires-Dist: boto3==1.40.5
+ Requires-Dist: dspy-ai>=2.6.27
  Requires-Dist: loguru==0.7.3
  Requires-Dist: mcp[cli]==1.12.4
  Requires-Dist: pydantic==2.11.7
+ Requires-Dist: strands-agents>=1.5.0
  Requires-Dist: typing-extensions==4.14.1
  Description-Content-Type: text/markdown

awslabs/dynamodb_mcp_server/__init__.py:

@@ -14,4 +14,4 @@

  """awslabs.dynamodb-mcp-server"""

- __version__ = '1.0.8'
+ __version__ = '1.0.9'

pyproject.toml:

@@ -1,6 +1,6 @@
  [project]
  name = "awslabs.dynamodb-mcp-server"
- version = "1.0.8"
+ version = "1.0.9"
  description = "The official MCP Server for interacting with AWS DynamoDB"
  readme = "README.md"
  requires-python = ">=3.10"
@@ -10,6 +10,8 @@ dependencies = [
      "mcp[cli]==1.12.4",
      "pydantic==2.11.7",
      "typing-extensions==4.14.1",
+     "strands-agents>=1.5.0",
+     "dspy-ai>=2.6.27"
  ]
  license = {text = "Apache-2.0"}
  license-files = ["LICENSE", "NOTICE" ]

tests/evals/README.md (new file):

@@ -0,0 +1,357 @@
+ # DynamoDB MCP Evaluation System
+
+ A comprehensive evaluation framework for assessing DynamoDB data modeling guidance quality using advanced conversational AI and structured evaluation methodologies.
+
+ ## Overview
+
+ This evaluation system combines realistic conversational interactions with sophisticated quality assessment to evaluate the effectiveness of DynamoDB modeling guidance. It uses a three-layer architecture integrating Strands agents, MCP protocol, and DSPy evaluation engines to provide objective, systematic assessment of both modeling process quality and technical design excellence.
+
+ ### Key Features
+
+ - **Realistic Conversations**: Uses Strands agents with MCP protocol for authentic user-expert interactions
+ - **Dual Evaluation Framework**: Separately assesses modeling process (HOW) and design quality (WHAT)
+ - **Expert Knowledge Integration**: Leverages DynamoDB architect prompt for domain-specific evaluation
+ - **Comprehensive Scoring**: 10-dimensional assessment covering methodology and technical excellence
+ - **Multiple Scenarios**: Predefined scenarios across different complexity levels and domains
+ - **Performance Monitoring**: Detailed timing analysis and efficiency metrics
+
+ ### Dual Evaluation Framework
+
+ **Session Evaluation** - Assesses the modeling process quality:
+ - Requirements Engineering (1-10)
+ - Access Pattern Analysis (1-10)
+ - Methodology Adherence (1-10)
+ - Technical Reasoning (1-10)
+ - Process Documentation (1-10)
+
+ **Model Evaluation** - Assesses the technical design quality:
+ - Completeness (1-10)
+ - Technical Accuracy (1-10)
+ - Access Pattern Coverage (1-10)
+ - Scalability Considerations (1-10)
+ - Cost Optimization (1-10)
+
+ ## Quick Start
+
+ ### Prerequisites
+
+ 1. **AWS Credentials**: Configure AWS access with Bedrock permissions
+ ```bash
+ export AWS_PROFILE=your-profile
+ export AWS_REGION=us-east-1
+ # OR
+ export AWS_ACCESS_KEY_ID=your-key
+ export AWS_SECRET_ACCESS_KEY=your-secret
+ ```
+
+ 2. **Python Environment**: Python 3.10+ with uv package manager
+ ```bash
+ # Install uv if not already installed
+ curl -LsSf https://astral.sh/uv/install.sh | sh
+
+ # Navigate to the DynamoDB MCP server directory
+ cd src/dynamodb-mcp-server
+ ```
+
+ 3. **Dependencies**: Install required packages
+ ```bash
+ uv sync
+ ```
+
+ ### Basic Usage
+
+ Run a basic evaluation with default settings:
+
+ ```bash
+ uv run python tests/evals/test_dspy_evals.py
+ ```
+
+ This will:
+ - Use the default model: `bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0`
+ - Run the "Simple E-commerce Schema" scenario
+ - Execute the complete evaluation pipeline
+ - Display comprehensive results
+
+ ### Sample Output
+
+ ```
+ 🔧 EVALUATION CONFIGURATION
+ ==============================
+ Model: bedrock/us.anthropic.claude-3-5-sonnet-20241022-v2:0
+ Scenario: Simple E-commerce Schema
+
+ ✅ DSPy configured with bedrock/us.anthropic.claude-3-5-sonnet-20241022-v2:0
+ 🎯 Testing scenario complexity: beginner
+ 🔄 Running conversation for scenario: Simple E-commerce Schema
+ ...
+ ✅ Conversation completed in 61.20s
+ 🔄 Running DSPy session evaluation...
+ ✅ Session evaluation completed in 11.61s
+ 📊 Session Score: 8.40 (good)
+ 🔄 Running DSPy model evaluation...
+ ✅ Model evaluation completed in 12.12s
+ 📊 Model Score: 8.20 (good)
+ 🎯 Complete evaluation finished in 84.93s
+
+ ============================================================
+ COMPREHENSIVE EVALUATION RESULTS
+ ============================================================
+ ⏱️ Total Duration: 84.93s
+ • Conversation: 61.20s
+ • Session Evaluation: 11.61s
+ • Model Evaluation: 12.12s
+
+ 📋 SESSION EVALUATION (Requirements & Methodology)
+ --------------------------------------------------
+ 🎯 Overall Session Score: 8.40 (good)
+
+ 📊 Detailed Session Scores:
+ • Requirements Engineering: 9.0/10
+ • Access Pattern Analysis: 8.0/10
+ • Methodology Adherence: 8.0/10
+ • Technical Reasoning: 8.0/10
+ • Process Documentation: 9.0/10
+
+ 🏗️ MODEL EVALUATION (Technical Design)
+ --------------------------------------------------
+ 🎯 Overall Model Score: 8.20 (good)
+
+ 📊 Detailed Model Scores:
+ • Completeness: 9.0/10
+ • Technical Accuracy: 8.0/10
+ • Access Pattern Coverage: 9.0/10
+ • Scalability Considerations: 8.0/10
+ • Cost Optimization: 7.0/10
+
+ 🎖️ QUALITY SUMMARY
+ --------------------------------------------------
+ Session Quality: good
+ Model Quality: good
+ ```
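
For reference: assuming equal dimension weights (the actual weights live in `evaluation_registry.py` and are not shown in this hunk, but equal weighting is consistent with the numbers above), the overall scores are simple means of the five dimension scores: (9.0 + 8.0 + 8.0 + 8.0 + 9.0) / 5 = 8.40 for the session and (9.0 + 8.0 + 9.0 + 8.0 + 7.0) / 5 = 8.20 for the model. The weighted average itself is computed in `_process_results` in `dynamic_evaluators.py` later in this diff.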
+
+ ## Command Line Usage
+
+ ### Available Commands
+
+ **Basic evaluation:**
+ ```bash
+ uv run python tests/evals/test_dspy_evals.py
+ ```
+
+ **Custom model evaluation:**
+ ```bash
+ uv run python tests/evals/test_dspy_evals.py --model "bedrock/us.anthropic.claude-3-5-sonnet-20241022-v2:0"
+ ```
+
+ **Specific scenario testing:**
+ ```bash
+ uv run python tests/evals/test_dspy_evals.py --scenario "High-Scale Social Media Platform"
+ ```
+
+ **Combined configuration:**
+ ```bash
+ uv run python tests/evals/test_dspy_evals.py --model "custom-model" --scenario "Content Management System"
+ ```
+
+ **List available scenarios:**
+ ```bash
+ uv run python tests/evals/test_dspy_evals.py --list-scenarios
+ ```
+
+ ### Command Line Options
+
+ | Option | Description | Default |
+ |--------|-------------|---------|
+ | `--model` | Bedrock model ID to use | `bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0` |
+ | `--scenario` | Scenario name to evaluate | `"Simple E-commerce Schema"` |
+ | `--list-scenarios` | Show all available scenarios | - |
+ | `--debug` | Show raw JSON output | - |
+ | `--aws-profile` | AWS profile to use for evaluation | `Bedrock` |
+
+ ## Available Scenarios
+
+ The system includes predefined scenarios across different complexity levels:
+
+ ### Beginner Scenarios
+ - **Simple E-commerce Schema**: Basic online retail with users, products, orders
+ - **Content Management System**: Blog/CMS with articles, authors, categories
+
+ ### Advanced Scenarios
+ - **High-Scale Social Media Platform**: Social media with posts, likes, comments at scale
+
+ ### Scenario Structure
+
+ Each scenario includes:
+ - **Application Details**: Type, domain, business model
+ - **Entities & Relationships**: Complete data model definition
+ - **Access Patterns**: Read/write patterns with performance requirements
+ - **Scale Requirements**: User base, transaction volume, growth projections
+ - **Performance Targets**: Latency and throughput specifications
+
+ To see all scenarios with descriptions:
+ ```bash
+ uv run python tests/evals/test_dspy_evals.py --list-scenarios
+ ```
+
+ ## Understanding Results
+
+ ### Quality Levels
+
+ Results are classified into quality levels based on overall scores:
+
+ | Score Range | Quality Level | Description |
+ |-------------|---------------|-------------|
+ | 8.5 - 10.0 | `excellent` | Exceptional quality - ready for production |
+ | 7.0 - 8.4 | `good` | Solid quality - minor improvements needed |
+ | 5.5 - 6.9 | `acceptable` | Adequate - meets basic requirements |
+ | 4.0 - 5.4 | `needs_improvement` | Deficient - significant gaps present |
+ | 1.0 - 3.9 | `poor` | Major issues - substantial rework required |
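
These thresholds mirror the `quality_thresholds` mapping in `DynamicEvaluationEngine._process_results` (see `dynamic_evaluators.py` later in this diff). A minimal standalone sketch of the classification step:

```python
def classify_quality(overall_score: float) -> str:
    """Map a weighted overall score (1-10 scale) to a quality level label."""
    if overall_score >= 8.5:
        return 'excellent'
    elif overall_score >= 7.0:
        return 'good'
    elif overall_score >= 5.5:
        return 'acceptable'
    elif overall_score >= 4.0:
        return 'needs_improvement'
    return 'poor'


print(classify_quality(8.4))  # 'good' -- matches the sample output above
```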
+
+ ### Performance Characteristics
+
+ Typical evaluation timing:
+ - **Conversation Phase**: 30-60 seconds (depends on model and scenario complexity)
+ - **Session Evaluation**: 10-15 seconds (DSPy process assessment)
+ - **Model Evaluation**: 10-15 seconds (DSPy design assessment)
+ - **Total Duration**: 50-90 seconds for complete pipeline
+
+ ### Session vs Model Evaluation
+
+ **Session Evaluation** focuses on **HOW** the modeling was conducted:
+ - Did the system follow proper methodology?
+ - Were requirements properly gathered and analyzed?
+ - Was the decision-making process well-documented?
+ - Were trade-offs and alternatives considered?
+
+ **Model Evaluation** focuses on **WHAT** was delivered:
+ - Is the final design technically correct?
+ - Does it handle all required access patterns?
+ - Are scalability concerns addressed?
+ - Is the solution cost-optimized?
+
+ ## Configuration
+
+ ### Model Selection
+
+ The system supports any Bedrock-compatible model. Popular choices:
+
+ ```bash
+ # Claude 4 Sonnet (recommended)
+ --model "bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0"
+
+ # Claude 3.5 Sonnet
+ --model "bedrock/us.anthropic.claude-3-5-sonnet-20241022-v2:0"
+
+ # Other Bedrock models
+ --model "bedrock/amazon.titan-text-premier-v1:0"
+ ```
+
+ ### Environment Variables
+
+ | Variable | Purpose | Default |
+ |----------|---------|---------|
+ | `AWS_PROFILE` | AWS credential profile | - |
+ | `AWS_REGION` | AWS region for Bedrock | `us-east-1` |
+ | `AWS_ACCESS_KEY_ID` | Direct AWS credentials | - |
+ | `AWS_SECRET_ACCESS_KEY` | Direct AWS credentials | - |
+
+ ## Troubleshooting
+
+ ### Common Issues
+
+ **AWS Credentials Error:**
+ ```
+ AWS credentials not available - set AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY or AWS_PROFILE
+ ```
+ **Solution**: Configure AWS credentials as shown in the Prerequisites section.
+
+ **Model Access Error:**
+ ```
+ Could not access the model: bedrock/model-name
+ ```
+ **Solution**: Ensure your AWS account has access to the requested Bedrock model and proper permissions.
+
+ **Scenario Not Found:**
+ ```
+ Scenario 'Invalid Name' not found
+ ```
+ **Solution**: Use `--list-scenarios` to see available options and check spelling.
+
+ **MCP Connection Issues:**
+ ```
+ Error during Strands conversation: MCP connection failed
+ ```
+ **Solution**: Ensure the DynamoDB MCP server is properly installed and accessible.
+
+ ### Extending the System
+
+ **Adding New Scenarios:**
+ 1. Add scenario definition to `scenarios.py` (see the sketch after this list)
+ 2. Include all required fields (entities, access patterns, scale)
+ 3. Test with different models for consistency
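
The exact structure expected by `scenarios.py` is not shown in this diff, so the entry below is a hedged sketch only: its field names mirror the Scenario Structure list above and should be aligned with the existing scenario definitions before use.

```python
# Hypothetical scenario entry -- field names are illustrative, not the
# actual schema used by scenarios.py.
NEW_SCENARIO = {
    'name': 'Ride Sharing Marketplace',   # shown by --list-scenarios / --scenario
    'complexity': 'advanced',             # e.g. 'beginner' or 'advanced'
    'application': {'type': 'marketplace', 'domain': 'transportation'},
    'entities': ['Rider', 'Driver', 'Trip', 'Payment'],
    'access_patterns': [
        {'pattern': 'Get active trips for a driver', 'type': 'read', 'target_latency_ms': 50},
        {'pattern': 'Record trip completion', 'type': 'write', 'target_latency_ms': 100},
    ],
    'scale': {'monthly_users': 1_000_000, 'peak_writes_per_second': 5_000},
}
```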
+
+ **Adding New Evaluation Dimensions:**
+
+ The system uses a dynamic registry pattern that makes adding new evaluation dimensions extremely simple - no code changes required to the evaluation engine!
+
+ **3-Step Process:**
+ 1. **Add Dimension to Registry** - Simply add a new `DimensionConfig` to the appropriate evaluation in `evaluation_registry.py`
+ 2. **Define Scoring Rubric** - Specify how the dimension should be evaluated (1-10 scale)
+ 3. **Test Immediately** - The dynamic evaluator automatically picks up the new dimension
+
+ **Example - Adding Security Evaluation:**
+ ```python
+ # In evaluation_registry.py, add to model_dimensions list:
+ DimensionConfig(
+     name="security_considerations",
+     display_name="Security Considerations",
+     description="Data security and access control planning",
+     scoring_rubric=(
+         "Score 1-10: Evaluate security measures including "
+         "encryption, access patterns, IAM policies, and data protection. "
+         "Return single number 1-10."
+     ),
+     weight=1.0,
+     justification_prompt="Explain security assessment and recommendations"
+ )
+ ```
+
+ **That's it!** The `DynamicEvaluationEngine` will automatically:
+ - Generate DSPy signatures with your new dimension
+ - Create result dataclasses including the new field
+ - Integrate scoring and justification collection
+ - Make it available in CLI evaluations
+
+ **No changes needed to:**
+ - `dynamic_evaluators.py` - automatically adapts
+ - `test_dspy_evals.py` - CLI works immediately
+ - Result processing - handled automatically
+
+ **New Model Support:**
+ 1. Ensure model is available in AWS Bedrock
+ 2. Test compatibility with DSPy framework
+ 3. Adjust timeout settings if needed
+
+ **Architecture Overview:**
+ - `evaluation_registry.py`: Dynamic registry for evaluation dimensions and types
+ - `dynamic_evaluators.py`: DSPy evaluation engine that adapts to registry configurations
+ - `multiturn_evaluator.py`: Multi-turn conversation evaluator using Strands agents
+ - `scenarios.py`: Test scenario definitions for evaluation
+ - `test_dspy_evals.py`: Command-line interface for the evaluation system
+
+ ## Development and Contributing
+
+ ### Running Tests
+
+ ```bash
+ # Run a quick evaluation
+ uv run python tests/evals/test_dspy_evals.py
+
+ # Test different models
+ uv run python tests/evals/test_dspy_evals.py --model "bedrock/us.anthropic.claude-3-5-sonnet-20241022-v2:0"
+
+ # Test all scenarios
+ for scenario in "Simple E-commerce Schema" "High-Scale Social Media Platform" "Content Management System"; do
+     uv run python tests/evals/test_dspy_evals.py --scenario "$scenario"
+ done
+ ```

tests/evals/dynamic_evaluators.py (new file):

@@ -0,0 +1,251 @@
+ """Dynamic DSPy evaluator using evaluation registry for easy dimension management."""
+
+ import dspy
+ import json
+ from dataclasses import dataclass
+ from evaluation_registry import EvaluationConfig, registry
+ from logging_config import get_logger
+ from pathlib import Path
+ from typing import Any, Dict, Type
+
+
+ # Initialize logger for this module
+ logger = get_logger(__name__)
+
+
+ def create_dspy_signature(evaluation_config: EvaluationConfig) -> Type[dspy.Signature]:
+     """Dynamically create a DSPy signature class based on evaluation configuration."""
+     signature_attrs = {}
+
+     input_fields = evaluation_config.input_fields or {}
+     for field_name, field_desc in input_fields.items():
+         signature_attrs[field_name] = dspy.InputField(desc=field_desc)
+
+     for dimension in evaluation_config.dimensions:
+         score_field_name = f'{dimension.name}_score'
+         signature_attrs[score_field_name] = dspy.OutputField(desc=dimension.scoring_rubric)
+
+     for dimension in evaluation_config.dimensions:
+         if dimension.justification_prompt:
+             justification_field_name = f'{dimension.name}_justification'
+             signature_attrs[justification_field_name] = dspy.OutputField(
+                 desc=dimension.justification_prompt
+             )
+
+     signature_attrs['strengths'] = dspy.OutputField(
+         desc=f'Key strengths of the {evaluation_config.display_name.lower()}, highlighting what was done exceptionally well'
+     )
+     signature_attrs['weaknesses'] = dspy.OutputField(
+         desc=f'Main weaknesses and areas where the {evaluation_config.display_name.lower()} fell short or could be significantly improved'
+     )
+     signature_attrs['improvement_recommendations'] = dspy.OutputField(
+         desc=f'Specific, actionable recommendations for improving the {evaluation_config.display_name.lower()}, with concrete suggestions for addressing identified weaknesses'
+     )
+
+     signature_class_name = f'{evaluation_config.name.title().replace("_", "")}Signature'
+     signature_class = type(signature_class_name, (dspy.Signature,), signature_attrs)
+
+     signature_class.__doc__ = f'Generated DSPy signature for {evaluation_config.display_name} with {len(evaluation_config.dimensions)} dimensions.'
+
+     return signature_class
+
+
+ def create_result_dataclass(evaluation_config: EvaluationConfig) -> Type:
+     """Dynamically create a result dataclass based on evaluation configuration."""
+     class_fields = []
+
+     for dimension in evaluation_config.dimensions:
+         class_fields.append((dimension.name, float))
+
+     class_fields.extend(
+         [('justifications', Dict[str, str]), ('overall_score', float), ('quality_level', str)]
+     )
+
+     dataclass_name = f'{evaluation_config.name.title().replace("_", "")}Result'
+     annotations = dict(class_fields)
+     result_class = type(dataclass_name, (), {'__annotations__': annotations})
+     result_class = dataclass(result_class)
+
+     def to_dict(self):
+         """Convert result object to dictionary for JSON serialization."""
+         result_dict = {}
+         for dimension in evaluation_config.dimensions:
+             result_dict[dimension.name] = getattr(self, dimension.name)
+         result_dict.update(
+             {
+                 'justifications': self.justifications,
+                 'overall_score': self.overall_score,
+                 'quality_level': self.quality_level,
+             }
+         )
+         return result_dict
+
+     setattr(result_class, 'to_dict', to_dict)
+     result_class.__doc__ = f'Generated result container for {evaluation_config.display_name} with {len(evaluation_config.dimensions)} dimensions.'
+
+     return result_class
+
+
+ class DynamicEvaluationEngine:
+     """Dynamic evaluation engine that adapts to any registered evaluation type."""
+
+     def __init__(self):
+         """Initialize the dynamic evaluation engine with empty caches."""
+         self._evaluators = {}
+         self._result_classes = {}
+         self._expert_knowledge_cache = None
+
+     def _get_evaluator(self, evaluation_name: str):
+         """Get or create evaluator for the specified evaluation type."""
+         if evaluation_name not in self._evaluators:
+             evaluation_config = registry.get_evaluation(evaluation_name)
+             signature_class = create_dspy_signature(evaluation_config)
+             self._evaluators[evaluation_name] = dspy.ChainOfThought(signature_class)
+
+         return self._evaluators[evaluation_name]
+
+     def _get_result_class(self, evaluation_name: str):
+         """Get or create result class for the specified evaluation type."""
+         if evaluation_name not in self._result_classes:
+             evaluation_config = registry.get_evaluation(evaluation_name)
+             self._result_classes[evaluation_name] = create_result_dataclass(evaluation_config)
+
+         return self._result_classes[evaluation_name]
+
+     def _load_expert_knowledge(self) -> str:
+         """Load DynamoDB expert knowledge (cached)."""
+         if self._expert_knowledge_cache is None:
+             try:
+                 prompt_path = (
+                     Path(__file__).parent.parent.parent
+                     / 'awslabs'
+                     / 'dynamodb_mcp_server'
+                     / 'prompts'
+                     / 'dynamodb_architect.md'
+                 )
+                 self._expert_knowledge_cache = prompt_path.read_text(encoding='utf-8')
+             except Exception as e:
+                 logger.warning(f'Warning: Could not load expert knowledge: {e}')
+                 self._expert_knowledge_cache = 'Expert knowledge not available.'
+
+         return self._expert_knowledge_cache
+
+     def evaluate(self, evaluation_name: str, scenario: Dict[str, Any], content: str, **kwargs):
+         """Evaluate content using the specified evaluation type."""
+         evaluation_config = registry.get_evaluation(evaluation_name)
+         evaluator = self._get_evaluator(evaluation_name)
+         result_class = self._get_result_class(evaluation_name)
+
+         # Prepare input arguments
+         eval_inputs = {}
+
+         scenario_json = json.dumps(scenario, indent=2)
+
+         input_mappings = {
+             'scenario_requirements': scenario_json,
+             'guidance_response': content,
+             'modeling_requirement_content': content,
+             'dynamodb_expert_knowledge': self._load_expert_knowledge(),
+             'architect_methodology': self._load_expert_knowledge(),
+         }
+
+         # Add inputs based on evaluation configuration
+         for field_name in evaluation_config.input_fields.keys():
+             if field_name in input_mappings:
+                 eval_inputs[field_name] = input_mappings[field_name]
+             elif field_name in kwargs:
+                 eval_inputs[field_name] = kwargs[field_name]
+
+         # Run the evaluation
+         raw_result = evaluator(**eval_inputs)
+
+         # Process results into structured format
+         return self._process_results(evaluation_config, raw_result, result_class)
+
+     def _process_results(self, evaluation_config: EvaluationConfig, raw_result, result_class):
+         """Process raw DSPy results into structured result object."""
+         # Extract dimension scores
+         dimension_scores = {}
+         for dimension in evaluation_config.dimensions:
+             score_field = f'{dimension.name}_score'
+             score_value = getattr(raw_result, score_field, 0.0)
+             # Handle DSPy returning various types
+             if isinstance(score_value, (int, float)):
+                 dimension_scores[dimension.name] = float(score_value)
+             else:
+                 # Try to parse if string
+                 try:
+                     dimension_scores[dimension.name] = float(str(score_value).split()[0])
+                 except (ValueError, IndexError):
+                     dimension_scores[dimension.name] = 0.0
+
+         # Calculate overall score using weighted average
+         total_weight = sum(dim.weight for dim in evaluation_config.dimensions)
+         if total_weight > 0:
+             weighted_sum = sum(
+                 dimension_scores[dim.name] * dim.weight for dim in evaluation_config.dimensions
+             )
+             overall_score = round(weighted_sum / total_weight, 2)
+         else:
+             overall_score = 0.0
+
+         # Determine quality level using existing thresholds
+         quality_thresholds = {
+             'excellent': 8.5,
+             'good': 7.0,
+             'acceptable': 5.5,
+             'needs_improvement': 4.0,
+             'poor': 2.0,
+         }
+
+         if overall_score >= quality_thresholds['excellent']:
+             quality_level = 'excellent'
+         elif overall_score >= quality_thresholds['good']:
+             quality_level = 'good'
+         elif overall_score >= quality_thresholds['acceptable']:
+             quality_level = 'acceptable'
+         elif overall_score >= quality_thresholds['needs_improvement']:
+             quality_level = 'needs_improvement'
+         else:
+             quality_level = 'poor'
+
+         # Build justifications dictionary
+         justifications = {}
+
+         # Add dimension justifications
+         for dimension in evaluation_config.dimensions:
+             if dimension.justification_prompt:
+                 justification_field = f'{dimension.name}_justification'
+                 justifications[dimension.name] = str(getattr(raw_result, justification_field, ''))
+
+         # Add overall assessment fields
+         justifications.update(
+             {
+                 'strengths': str(getattr(raw_result, 'strengths', '')),
+                 'weaknesses': str(getattr(raw_result, 'weaknesses', '')),
+                 'improvement_recommendations': str(
+                     getattr(raw_result, 'improvement_recommendations', '')
+                 ),
+             }
+         )
+
+         # Create result object
+         result_kwargs = {
+             **dimension_scores,
+             'justifications': justifications,
+             'overall_score': overall_score,
+             'quality_level': quality_level,
+         }
+
+         return result_class(**result_kwargs)
+
+
+ # Create global instance
+ dynamic_engine = DynamicEvaluationEngine()
+
+ __all__ = [
+     'DynamicEvaluationEngine',
+     'create_dspy_signature',
+     'create_result_dataclass',
+     'dynamic_engine',
+ ]
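
For orientation, a minimal usage sketch of the new engine, run from `tests/evals/` so the sibling modules resolve. The evaluation name `'model_evaluation'` and the inline scenario are placeholders: the real names come from `evaluation_registry.py` and `scenarios.py`, which are not shown here, and DSPy must be configured with a Bedrock model first (as the README's sample output indicates).

```python
import dspy

from dynamic_evaluators import dynamic_engine

# Configure DSPy with the default model documented in the README.
dspy.configure(lm=dspy.LM('bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0'))

# Placeholder inputs: use a registered evaluation name and a real scenario in practice.
scenario = {'name': 'Simple E-commerce Schema', 'complexity': 'beginner'}
final_design = '...final DynamoDB data model produced by the architect conversation...'

result = dynamic_engine.evaluate('model_evaluation', scenario, final_design)
print(result.overall_score, result.quality_level)  # e.g. 8.2 good
print(result.to_dict()['justifications']['strengths'])
```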