awslabs.dynamodb-mcp-server 1.0.8__tar.gz → 1.0.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of awslabs.dynamodb-mcp-server might be problematic.
- {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/PKG-INFO +3 -1
- {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/awslabs/dynamodb_mcp_server/__init__.py +1 -1
- {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/pyproject.toml +3 -1
- awslabs_dynamodb_mcp_server-1.0.9/tests/evals/README.md +357 -0
- awslabs_dynamodb_mcp_server-1.0.9/tests/evals/dynamic_evaluators.py +251 -0
- awslabs_dynamodb_mcp_server-1.0.9/tests/evals/evaluation_registry.py +243 -0
- awslabs_dynamodb_mcp_server-1.0.9/tests/evals/logging_config.py +77 -0
- awslabs_dynamodb_mcp_server-1.0.9/tests/evals/multiturn_evaluator.py +376 -0
- awslabs_dynamodb_mcp_server-1.0.9/tests/evals/scenarios.py +244 -0
- awslabs_dynamodb_mcp_server-1.0.9/tests/evals/test_dspy_evals.py +338 -0
- awslabs_dynamodb_mcp_server-1.0.9/uv.lock +3219 -0
- awslabs_dynamodb_mcp_server-1.0.8/uv.lock +0 -1594
- {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/.gitignore +0 -0
- {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/.python-version +0 -0
- {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/CHANGELOG.md +0 -0
- {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/Dockerfile +0 -0
- {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/LICENSE +0 -0
- {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/NOTICE +0 -0
- {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/README.md +0 -0
- {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/awslabs/__init__.py +0 -0
- {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/awslabs/dynamodb_mcp_server/common.py +0 -0
- {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/awslabs/dynamodb_mcp_server/prompts/dynamodb_architect.md +0 -0
- {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/awslabs/dynamodb_mcp_server/server.py +0 -0
- {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/docker-healthcheck.sh +0 -0
- {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/tests/test_dynamodb_server.py +0 -0
- {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/tests/test_readonly_delete_table.py +0 -0
- {awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/uv-requirements.txt +0 -0
{awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/PKG-INFO:

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: awslabs.dynamodb-mcp-server
-Version: 1.0.8
+Version: 1.0.9
 Summary: The official MCP Server for interacting with AWS DynamoDB
 Project-URL: homepage, https://awslabs.github.io/mcp/
 Project-URL: docs, https://awslabs.github.io/mcp/servers/dynamodb-mcp-server/
@@ -22,9 +22,11 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Requires-Python: >=3.10
 Requires-Dist: boto3==1.40.5
+Requires-Dist: dspy-ai>=2.6.27
 Requires-Dist: loguru==0.7.3
 Requires-Dist: mcp[cli]==1.12.4
 Requires-Dist: pydantic==2.11.7
+Requires-Dist: strands-agents>=1.5.0
 Requires-Dist: typing-extensions==4.14.1
 Description-Content-Type: text/markdown
 
```
{awslabs_dynamodb_mcp_server-1.0.8 → awslabs_dynamodb_mcp_server-1.0.9}/pyproject.toml:

```diff
@@ -1,6 +1,6 @@
 [project]
 name = "awslabs.dynamodb-mcp-server"
-version = "1.0.8"
+version = "1.0.9"
 description = "The official MCP Server for interacting with AWS DynamoDB"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -10,6 +10,8 @@ dependencies = [
     "mcp[cli]==1.12.4",
     "pydantic==2.11.7",
     "typing-extensions==4.14.1",
+    "strands-agents>=1.5.0",
+    "dspy-ai>=2.6.27"
 ]
 license = {text = "Apache-2.0"}
 license-files = ["LICENSE", "NOTICE" ]
```
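The two dependencies added above, `dspy-ai` and `strands-agents`, back the new evaluation suite under `tests/evals/`: DSPy supplies the scoring modules and Strands drives the simulated conversations. As a minimal sketch of how the DSPy side is typically wired to Bedrock (the model ID matches the README default below, but the setup code itself is an assumption, not taken from this package):

```python
import dspy

# Assumed wiring: dspy.LM routes 'bedrock/...' model IDs through LiteLLM,
# so AWS credentials/region must already be configured in the environment.
lm = dspy.LM('bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0', temperature=0.0)
dspy.configure(lm=lm)

# With a global LM configured, dspy.ChainOfThought modules (as used in
# tests/evals/dynamic_evaluators.py below) can execute their evaluations.
scorer = dspy.ChainOfThought('question -> answer')
```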
awslabs_dynamodb_mcp_server-1.0.9/tests/evals/README.md (new file):

```diff
@@ -0,0 +1,357 @@
+# DynamoDB MCP Evaluation System
+
+A comprehensive evaluation framework for assessing DynamoDB data modeling guidance quality using advanced conversational AI and structured evaluation methodologies.
+
+## Overview
+
+This evaluation system combines realistic conversational interactions with sophisticated quality assessment to evaluate the effectiveness of DynamoDB modeling guidance. It uses a three-layer architecture integrating Strands agents, MCP protocol, and DSPy evaluation engines to provide objective, systematic assessment of both modeling process quality and technical design excellence.
+
+### Key Features
+
+- **Realistic Conversations**: Uses Strands agents with MCP protocol for authentic user-expert interactions
+- **Dual Evaluation Framework**: Separately assesses modeling process (HOW) and design quality (WHAT)
+- **Expert Knowledge Integration**: Leverages DynamoDB architect prompt for domain-specific evaluation
+- **Comprehensive Scoring**: 10-dimensional assessment covering methodology and technical excellence
+- **Multiple Scenarios**: Predefined scenarios across different complexity levels and domains
+- **Performance Monitoring**: Detailed timing analysis and efficiency metrics
+
+### Dual Evaluation Framework
+
+**Session Evaluation** - Assesses the modeling process quality:
+- Requirements Engineering (1-10)
+- Access Pattern Analysis (1-10)
+- Methodology Adherence (1-10)
+- Technical Reasoning (1-10)
+- Process Documentation (1-10)
+
+**Model Evaluation** - Assesses the technical design quality:
+- Completeness (1-10)
+- Technical Accuracy (1-10)
+- Access Pattern Coverage (1-10)
+- Scalability Considerations (1-10)
+- Cost Optimization (1-10)
+
+## Quick Start
+
+### Prerequisites
+
+1. **AWS Credentials**: Configure AWS access with Bedrock permissions
+   ```bash
+   export AWS_PROFILE=your-profile
+   export AWS_REGION=us-east-1
+   # OR
+   export AWS_ACCESS_KEY_ID=your-key
+   export AWS_SECRET_ACCESS_KEY=your-secret
+   ```
+
+2. **Python Environment**: Python 3.10+ with uv package manager
+   ```bash
+   # Install uv if not already installed
+   curl -LsSf https://astral.sh/uv/install.sh | sh
+
+   # Navigate to the DynamoDB MCP server directory
+   cd src/dynamodb-mcp-server
+   ```
+
+3. **Dependencies**: Install required packages
+   ```bash
+   uv sync
+   ```
+
+### Basic Usage
+
+Run a basic evaluation with default settings:
+
+```bash
+uv run python tests/evals/test_dspy_evals.py
+```
+
+This will:
+- Use the default model: `bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0`
+- Run the "Simple E-commerce Schema" scenario
+- Execute the complete evaluation pipeline
+- Display comprehensive results
+
+### Sample Output
+
+```
+🔧 EVALUATION CONFIGURATION
+==============================
+Model: bedrock/us.anthropic.claude-3-5-sonnet-20241022-v2:0
+Scenario: Simple E-commerce Schema
+
+✅ DSPy configured with bedrock/us.anthropic.claude-3-5-sonnet-20241022-v2:0
+🎯 Testing scenario complexity: beginner
+🔄 Running conversation for scenario: Simple E-commerce Schema
+...
+✅ Conversation completed in 61.20s
+🔄 Running DSPy session evaluation...
+✅ Session evaluation completed in 11.61s
+📊 Session Score: 8.40 (good)
+🔄 Running DSPy model evaluation...
+✅ Model evaluation completed in 12.12s
+📊 Model Score: 8.20 (good)
+🎯 Complete evaluation finished in 84.93s
+
+============================================================
+COMPREHENSIVE EVALUATION RESULTS
+============================================================
+⏱️ Total Duration: 84.93s
+  • Conversation: 61.20s
+  • Session Evaluation: 11.61s
+  • Model Evaluation: 12.12s
+
+📋 SESSION EVALUATION (Requirements & Methodology)
+--------------------------------------------------
+🎯 Overall Session Score: 8.40 (good)
+
+📊 Detailed Session Scores:
+  • Requirements Engineering: 9.0/10
+  • Access Pattern Analysis: 8.0/10
+  • Methodology Adherence: 8.0/10
+  • Technical Reasoning: 8.0/10
+  • Process Documentation: 9.0/10
+
+🏗️ MODEL EVALUATION (Technical Design)
+--------------------------------------------------
+🎯 Overall Model Score: 8.20 (good)
+
+📊 Detailed Model Scores:
+  • Completeness: 9.0/10
+  • Technical Accuracy: 8.0/10
+  • Access Pattern Coverage: 9.0/10
+  • Scalability Considerations: 8.0/10
+  • Cost Optimization: 7.0/10
+
+🎖️ QUALITY SUMMARY
+--------------------------------------------------
+Session Quality: good
+Model Quality: good
+```
+
+## Command Line Usage
+
+### Available Commands
+
+**Basic evaluation:**
+```bash
+uv run python tests/evals/test_dspy_evals.py
+```
+
+**Custom model evaluation:**
+```bash
+uv run python tests/evals/test_dspy_evals.py --model "bedrock/us.anthropic.claude-3-5-sonnet-20241022-v2:0"
+```
+
+**Specific scenario testing:**
+```bash
+uv run python tests/evals/test_dspy_evals.py --scenario "High-Scale Social Media Platform"
+```
+
+**Combined configuration:**
+```bash
+uv run python tests/evals/test_dspy_evals.py --model "custom-model" --scenario "Content Management System"
+```
+
+**List available scenarios:**
+```bash
+uv run python tests/evals/test_dspy_evals.py --list-scenarios
+```
+
+### Command Line Options
+
+| Option | Description | Default |
+|--------|-------------|---------|
+| `--model` | Bedrock model ID to use | `bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0` |
+| `--scenario` | Scenario name to evaluate | `"Simple E-commerce Schema"` |
+| `--list-scenarios` | Show all available scenarios | - |
+| `--debug` | Show raw JSON output | - |
+| `--aws-profile` | AWS profile to use for evaluation | `Bedrock` |
+
+## Available Scenarios
+
+The system includes predefined scenarios across different complexity levels:
+
+### Beginner Scenarios
+- **Simple E-commerce Schema**: Basic online retail with users, products, orders
+- **Content Management System**: Blog/CMS with articles, authors, categories
+
+### Advanced Scenarios
+- **High-Scale Social Media Platform**: Social media with posts, likes, comments at scale
+
+### Scenario Structure
+
+Each scenario includes:
+- **Application Details**: Type, domain, business model
+- **Entities & Relationships**: Complete data model definition
+- **Access Patterns**: Read/write patterns with performance requirements
+- **Scale Requirements**: User base, transaction volume, growth projections
+- **Performance Targets**: Latency and throughput specifications
+
+To see all scenarios with descriptions:
+```bash
+uv run python tests/evals/test_dspy_evals.py --list-scenarios
+```
+
+## Understanding Results
+
+### Quality Levels
+
+Results are classified into quality levels based on overall scores:
+
+| Score Range | Quality Level | Description |
+|-------------|---------------|-------------|
+| 8.5 - 10.0 | `excellent` | Exceptional quality - ready for production |
+| 7.0 - 8.4 | `good` | Solid quality - minor improvements needed |
+| 5.5 - 6.9 | `acceptable` | Adequate - meets basic requirements |
+| 4.0 - 5.4 | `needs_improvement` | Deficient - significant gaps present |
+| 1.0 - 3.9 | `poor` | Major issues - substantial rework required |
+
+### Performance Characteristics
+
+Typical evaluation timing:
+- **Conversation Phase**: 30-60 seconds (depends on model and scenario complexity)
+- **Session Evaluation**: 10-15 seconds (DSPy process assessment)
+- **Model Evaluation**: 10-15 seconds (DSPy design assessment)
+- **Total Duration**: 50-90 seconds for complete pipeline
+
+### Session vs Model Evaluation
+
+**Session Evaluation** focuses on **HOW** the modeling was conducted:
+- Did the system follow proper methodology?
+- Were requirements properly gathered and analyzed?
+- Was the decision-making process well-documented?
+- Were trade-offs and alternatives considered?
+
+**Model Evaluation** focuses on **WHAT** was delivered:
+- Is the final design technically correct?
+- Does it handle all required access patterns?
+- Are scalability concerns addressed?
+- Is the solution cost-optimized?
+
+## Configuration
+
+### Model Selection
+
+The system supports any Bedrock-compatible model. Popular choices:
+
+```bash
+# Claude 4 Sonnet (recommended)
+--model "bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0"
+
+# Claude 3.5 Sonnet
+--model "bedrock/us.anthropic.claude-3-5-sonnet-20241022-v2:0"
+
+# Other Bedrock models
+--model "bedrock/amazon.titan-text-premier-v1:0"
+```
+
+### Environment Variables
+
+| Variable | Purpose | Default |
+|----------|---------|---------|
+| `AWS_PROFILE` | AWS credential profile | - |
+| `AWS_REGION` | AWS region for Bedrock | `us-east-1` |
+| `AWS_ACCESS_KEY_ID` | Direct AWS credentials | - |
+| `AWS_SECRET_ACCESS_KEY` | Direct AWS credentials | - |
+
+## Troubleshooting
+
+### Common Issues
+
+**AWS Credentials Error:**
+```
+AWS credentials not available - set AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY or AWS_PROFILE
+```
+**Solution**: Configure AWS credentials as shown in Prerequisites section.
+
+**Model Access Error:**
+```
+Could not access the model: bedrock/model-name
+```
+**Solution**: Ensure your AWS account has access to the requested Bedrock model and proper permissions.
+
+**Scenario Not Found:**
+```
+Scenario 'Invalid Name' not found
+```
+**Solution**: Use `--list-scenarios` to see available options and check spelling.
+
+**MCP Connection Issues:**
+```
+Error during Strands conversation: MCP connection failed
+```
+**Solution**: Ensure the DynamoDB MCP server is properly installed and accessible.
+
+### Extending the System
+
+**Adding New Scenarios:**
+1. Add scenario definition to `scenarios.py`
+2. Include all required fields (entities, access patterns, scale)
+3. Test with different models for consistency
+
+**Adding New Evaluation Dimensions:**
+
+The system uses a dynamic registry pattern that makes adding new evaluation dimensions extremely simple - no code changes required to the evaluation engine!
+
+**3-Step Process:**
+1. **Add Dimension to Registry** - Simply add a new `DimensionConfig` to the appropriate evaluation in `evaluation_registry.py`
+2. **Define Scoring Rubric** - Specify how the dimension should be evaluated (1-10 scale)
+3. **Test Immediately** - The dynamic evaluator automatically picks up the new dimension
+
+**Example - Adding Security Evaluation:**
+```python
+# In evaluation_registry.py, add to model_dimensions list:
+DimensionConfig(
+    name="security_considerations",
+    display_name="Security Considerations",
+    description="Data security and access control planning",
+    scoring_rubric=(
+        "Score 1-10: Evaluate security measures including "
+        "encryption, access patterns, IAM policies, and data protection. "
+        "Return single number 1-10."
+    ),
+    weight=1.0,
+    justification_prompt="Explain security assessment and recommendations"
+)
+```
+
+**That's it!** The `DynamicEvaluationEngine` will automatically:
+- Generate DSPy signatures with your new dimension
+- Create result dataclasses including the new field
+- Integrate scoring and justification collection
+- Make it available in CLI evaluations
+
+**No changes needed to:**
+- `dynamic_evaluators.py` - automatically adapts
+- `test_dspy_evals.py` - CLI works immediately
+- Result processing - handled automatically
+
+**New Model Support:**
+1. Ensure model is available in AWS Bedrock
+2. Test compatibility with DSPy framework
+3. Adjust timeout settings if needed
+
+**Architecture Overview:**
+- `evaluation_registry.py`: Dynamic registry for evaluation dimensions and types
+- `dynamic_evaluators.py`: DSPy evaluation engine that adapts to registry configurations
+- `multiturn_evaluator.py`: Multi-turn conversation evaluator using Strands agents
+- `scenarios.py`: Test scenario definitions for evaluation
+- `test_dspy_evals.py`: Command-line interface for the evaluation system
+
+## Development and Contributing
+
+### Running Tests
+
+```bash
+# Run a quick evaluation
+uv run python tests/evals/test_dspy_evals.py
+
+# Test different models
+uv run python tests/evals/test_dspy_evals.py --model "bedrock/us.anthropic.claude-3-5-sonnet-20241022-v2:0"
+
+# Test all scenarios
+for scenario in "Simple E-commerce Schema" "High-Scale Social Media Platform" "Content Management System"; do
+  uv run python tests/evals/test_dspy_evals.py --scenario "$scenario"
+done
+```
```
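The quality-level bands in the README's table translate directly into a threshold check. A small illustrative helper (not part of the package) showing the same mapping the evaluation engine applies to the weighted overall score:

```python
def quality_level(overall_score: float) -> str:
    """Map a 1-10 overall score to the quality labels from the README table."""
    if overall_score >= 8.5:
        return 'excellent'
    if overall_score >= 7.0:
        return 'good'
    if overall_score >= 5.5:
        return 'acceptable'
    if overall_score >= 4.0:
        return 'needs_improvement'
    return 'poor'


print(quality_level(8.40))  # 'good', matching the sample output above
```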
awslabs_dynamodb_mcp_server-1.0.9/tests/evals/dynamic_evaluators.py (new file):

```diff
@@ -0,0 +1,251 @@
+"""Dynamic DSPy evaluator using evaluation registry for easy dimension management."""
+
+import dspy
+import json
+from dataclasses import dataclass
+from evaluation_registry import EvaluationConfig, registry
+from logging_config import get_logger
+from pathlib import Path
+from typing import Any, Dict, Type
+
+
+# Initialize logger for this module
+logger = get_logger(__name__)
+
+
+def create_dspy_signature(evaluation_config: EvaluationConfig) -> Type[dspy.Signature]:
+    """Dynamically create a DSPy signature class based on evaluation configuration."""
+    signature_attrs = {}
+
+    input_fields = evaluation_config.input_fields or {}
+    for field_name, field_desc in input_fields.items():
+        signature_attrs[field_name] = dspy.InputField(desc=field_desc)
+
+    for dimension in evaluation_config.dimensions:
+        score_field_name = f'{dimension.name}_score'
+        signature_attrs[score_field_name] = dspy.OutputField(desc=dimension.scoring_rubric)
+
+    for dimension in evaluation_config.dimensions:
+        if dimension.justification_prompt:
+            justification_field_name = f'{dimension.name}_justification'
+            signature_attrs[justification_field_name] = dspy.OutputField(
+                desc=dimension.justification_prompt
+            )
+
+    signature_attrs['strengths'] = dspy.OutputField(
+        desc=f'Key strengths of the {evaluation_config.display_name.lower()}, highlighting what was done exceptionally well'
+    )
+    signature_attrs['weaknesses'] = dspy.OutputField(
+        desc=f'Main weaknesses and areas where the {evaluation_config.display_name.lower()} fell short or could be significantly improved'
+    )
+    signature_attrs['improvement_recommendations'] = dspy.OutputField(
+        desc=f'Specific, actionable recommendations for improving the {evaluation_config.display_name.lower()}, with concrete suggestions for addressing identified weaknesses'
+    )
+
+    signature_class_name = f'{evaluation_config.name.title().replace("_", "")}Signature'
+    signature_class = type(signature_class_name, (dspy.Signature,), signature_attrs)
+
+    signature_class.__doc__ = f'Generated DSPy signature for {evaluation_config.display_name} with {len(evaluation_config.dimensions)} dimensions.'
+
+    return signature_class
+
+
+def create_result_dataclass(evaluation_config: EvaluationConfig) -> Type:
+    """Dynamically create a result dataclass based on evaluation configuration."""
+    class_fields = []
+
+    for dimension in evaluation_config.dimensions:
+        class_fields.append((dimension.name, float))
+
+    class_fields.extend(
+        [('justifications', Dict[str, str]), ('overall_score', float), ('quality_level', str)]
+    )
+
+    dataclass_name = f'{evaluation_config.name.title().replace("_", "")}Result'
+    annotations = dict(class_fields)
+    result_class = type(dataclass_name, (), {'__annotations__': annotations})
+    result_class = dataclass(result_class)
+
+    def to_dict(self):
+        """Convert result object to dictionary for JSON serialization."""
+        result_dict = {}
+        for dimension in evaluation_config.dimensions:
+            result_dict[dimension.name] = getattr(self, dimension.name)
+        result_dict.update(
+            {
+                'justifications': self.justifications,
+                'overall_score': self.overall_score,
+                'quality_level': self.quality_level,
+            }
+        )
+        return result_dict
+
+    setattr(result_class, 'to_dict', to_dict)
+    result_class.__doc__ = f'Generated result container for {evaluation_config.display_name} with {len(evaluation_config.dimensions)} dimensions.'
+
+    return result_class
+
+
+class DynamicEvaluationEngine:
+    """Dynamic evaluation engine that adapts to any registered evaluation type."""
+
+    def __init__(self):
+        """Initialize the dynamic evaluation engine with empty caches."""
+        self._evaluators = {}
+        self._result_classes = {}
+        self._expert_knowledge_cache = None
+
+    def _get_evaluator(self, evaluation_name: str):
+        """Get or create evaluator for the specified evaluation type."""
+        if evaluation_name not in self._evaluators:
+            evaluation_config = registry.get_evaluation(evaluation_name)
+            signature_class = create_dspy_signature(evaluation_config)
+            self._evaluators[evaluation_name] = dspy.ChainOfThought(signature_class)
+
+        return self._evaluators[evaluation_name]
+
+    def _get_result_class(self, evaluation_name: str):
+        """Get or create result class for the specified evaluation type."""
+        if evaluation_name not in self._result_classes:
+            evaluation_config = registry.get_evaluation(evaluation_name)
+            self._result_classes[evaluation_name] = create_result_dataclass(evaluation_config)
+
+        return self._result_classes[evaluation_name]
+
+    def _load_expert_knowledge(self) -> str:
+        """Load DynamoDB expert knowledge (cached)."""
+        if self._expert_knowledge_cache is None:
+            try:
+                prompt_path = (
+                    Path(__file__).parent.parent.parent
+                    / 'awslabs'
+                    / 'dynamodb_mcp_server'
+                    / 'prompts'
+                    / 'dynamodb_architect.md'
+                )
+                self._expert_knowledge_cache = prompt_path.read_text(encoding='utf-8')
+            except Exception as e:
+                logger.warning(f'Warning: Could not load expert knowledge: {e}')
+                self._expert_knowledge_cache = 'Expert knowledge not available.'
+
+        return self._expert_knowledge_cache
+
+    def evaluate(self, evaluation_name: str, scenario: Dict[str, Any], content: str, **kwargs):
+        """Evaluate content using the specified evaluation type."""
+        evaluation_config = registry.get_evaluation(evaluation_name)
+        evaluator = self._get_evaluator(evaluation_name)
+        result_class = self._get_result_class(evaluation_name)
+
+        # Prepare input arguments
+        eval_inputs = {}
+
+        scenario_json = json.dumps(scenario, indent=2)
+
+        input_mappings = {
+            'scenario_requirements': scenario_json,
+            'guidance_response': content,
+            'modeling_requirement_content': content,
+            'dynamodb_expert_knowledge': self._load_expert_knowledge(),
+            'architect_methodology': self._load_expert_knowledge(),
+        }
+
+        # Add inputs based on evaluation configuration
+        for field_name in evaluation_config.input_fields.keys():
+            if field_name in input_mappings:
+                eval_inputs[field_name] = input_mappings[field_name]
+            elif field_name in kwargs:
+                eval_inputs[field_name] = kwargs[field_name]
+
+        # Run the evaluation
+        raw_result = evaluator(**eval_inputs)
+
+        # Process results into structured format
+        return self._process_results(evaluation_config, raw_result, result_class)
+
+    def _process_results(self, evaluation_config: EvaluationConfig, raw_result, result_class):
+        """Process raw DSPy results into structured result object."""
+        # Extract dimension scores
+        dimension_scores = {}
+        for dimension in evaluation_config.dimensions:
+            score_field = f'{dimension.name}_score'
+            score_value = getattr(raw_result, score_field, 0.0)
+            # Handle DSPy returning various types
+            if isinstance(score_value, (int, float)):
+                dimension_scores[dimension.name] = float(score_value)
+            else:
+                # Try to parse if string
+                try:
+                    dimension_scores[dimension.name] = float(str(score_value).split()[0])
+                except (ValueError, IndexError):
+                    dimension_scores[dimension.name] = 0.0
+
+        # Calculate overall score using weighted average
+        total_weight = sum(dim.weight for dim in evaluation_config.dimensions)
+        if total_weight > 0:
+            weighted_sum = sum(
+                dimension_scores[dim.name] * dim.weight for dim in evaluation_config.dimensions
+            )
+            overall_score = round(weighted_sum / total_weight, 2)
+        else:
+            overall_score = 0.0
+
+        # Determine quality level using existing thresholds
+        quality_thresholds = {
+            'excellent': 8.5,
+            'good': 7.0,
+            'acceptable': 5.5,
+            'needs_improvement': 4.0,
+            'poor': 2.0,
+        }
+
+        if overall_score >= quality_thresholds['excellent']:
+            quality_level = 'excellent'
+        elif overall_score >= quality_thresholds['good']:
+            quality_level = 'good'
+        elif overall_score >= quality_thresholds['acceptable']:
+            quality_level = 'acceptable'
+        elif overall_score >= quality_thresholds['needs_improvement']:
+            quality_level = 'needs_improvement'
+        else:
+            quality_level = 'poor'
+
+        # Build justifications dictionary
+        justifications = {}
+
+        # Add dimension justifications
+        for dimension in evaluation_config.dimensions:
+            if dimension.justification_prompt:
+                justification_field = f'{dimension.name}_justification'
+                justifications[dimension.name] = str(getattr(raw_result, justification_field, ''))
+
+        # Add overall assessment fields
+        justifications.update(
+            {
+                'strengths': str(getattr(raw_result, 'strengths', '')),
+                'weaknesses': str(getattr(raw_result, 'weaknesses', '')),
+                'improvement_recommendations': str(
+                    getattr(raw_result, 'improvement_recommendations', '')
+                ),
+            }
+        )
+
+        # Create result object
+        result_kwargs = {
+            **dimension_scores,
+            'justifications': justifications,
+            'overall_score': overall_score,
+            'quality_level': quality_level,
+        }
+
+        return result_class(**result_kwargs)
+
+
+# Create global instance
+dynamic_engine = DynamicEvaluationEngine()
+
+__all__ = [
+    'DynamicEvaluationEngine',
+    'create_dspy_signature',
+    'create_result_dataclass',
+    'dynamic_engine',
+]
```
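For orientation, a hedged usage sketch of the module above: the evaluation name and scenario fields below are hypothetical placeholders (the real names live in `evaluation_registry.py` and `scenarios.py`, which are not shown in this diff), but the call shape follows `DynamicEvaluationEngine.evaluate` as defined here:

```python
# Assumes dspy has been configured with a Bedrock LM and that an evaluation
# named 'model_evaluation' (hypothetical) is registered in evaluation_registry.py.
from dynamic_evaluators import dynamic_engine

scenario = {
    'name': 'Simple E-commerce Schema',          # illustrative fields only
    'entities': ['User', 'Product', 'Order'],
}
conversation_text = '...transcript produced by multiturn_evaluator.py...'

result = dynamic_engine.evaluate('model_evaluation', scenario, conversation_text)
print(result.overall_score, result.quality_level)
print(result.to_dict()['justifications']['improvement_recommendations'])
```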