claude-mpm 4.6.1__py3-none-any.whl → 4.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. claude_mpm/VERSION +1 -1
  2. claude_mpm/agents/BASE_ENGINEER.md +206 -48
  3. claude_mpm/agents/BASE_PROMPT_ENGINEER.md +787 -0
  4. claude_mpm/agents/base_agent_loader.py +3 -1
  5. claude_mpm/agents/templates/engineer.json +10 -4
  6. claude_mpm/agents/templates/prompt-engineer.json +517 -87
  7. claude_mpm/cli/commands/cleanup.py +1 -1
  8. claude_mpm/cli/commands/mcp_setup_external.py +2 -2
  9. claude_mpm/cli/commands/memory.py +1 -1
  10. claude_mpm/cli/commands/mpm_init.py +5 -4
  11. claude_mpm/cli/commands/run.py +4 -4
  12. claude_mpm/cli/shared/argument_patterns.py +18 -11
  13. claude_mpm/cli/shared/base_command.py +1 -1
  14. claude_mpm/config/experimental_features.py +3 -3
  15. claude_mpm/config/socketio_config.py +1 -1
  16. claude_mpm/core/cache.py +2 -2
  17. claude_mpm/core/claude_runner.py +5 -7
  18. claude_mpm/core/container.py +10 -4
  19. claude_mpm/core/file_utils.py +10 -8
  20. claude_mpm/core/framework/formatters/context_generator.py +3 -2
  21. claude_mpm/core/framework/loaders/agent_loader.py +11 -7
  22. claude_mpm/core/injectable_service.py +11 -8
  23. claude_mpm/core/interactive_session.py +5 -4
  24. claude_mpm/core/oneshot_session.py +3 -2
  25. claude_mpm/core/pm_hook_interceptor.py +15 -9
  26. claude_mpm/core/unified_paths.py +6 -5
  27. claude_mpm/dashboard/api/simple_directory.py +16 -17
  28. claude_mpm/hooks/claude_hooks/event_handlers.py +3 -2
  29. claude_mpm/hooks/claude_hooks/hook_handler_eventbus.py +2 -2
  30. claude_mpm/hooks/claude_hooks/hook_handler_original.py +2 -2
  31. claude_mpm/hooks/claude_hooks/installer.py +10 -10
  32. claude_mpm/hooks/claude_hooks/response_tracking.py +3 -2
  33. claude_mpm/hooks/claude_hooks/services/state_manager.py +3 -2
  34. claude_mpm/hooks/tool_call_interceptor.py +6 -3
  35. claude_mpm/models/agent_session.py +3 -1
  36. claude_mpm/scripts/mcp_server.py +3 -5
  37. claude_mpm/services/agents/agent_builder.py +4 -4
  38. claude_mpm/services/agents/deployment/deployment_type_detector.py +10 -14
  39. claude_mpm/services/agents/deployment/local_template_deployment.py +6 -3
  40. claude_mpm/services/agents/deployment/multi_source_deployment_service.py +15 -11
  41. claude_mpm/services/agents/deployment/system_instructions_deployer.py +9 -6
  42. claude_mpm/services/agents/loading/agent_profile_loader.py +1 -2
  43. claude_mpm/services/agents/memory/agent_memory_manager.py +27 -27
  44. claude_mpm/services/agents/memory/content_manager.py +9 -4
  45. claude_mpm/services/claude_session_logger.py +5 -8
  46. claude_mpm/services/cli/memory_crud_service.py +1 -1
  47. claude_mpm/services/cli/memory_output_formatter.py +1 -1
  48. claude_mpm/services/cli/startup_checker.py +13 -10
  49. claude_mpm/services/cli/unified_dashboard_manager.py +10 -6
  50. claude_mpm/services/command_deployment_service.py +9 -7
  51. claude_mpm/services/core/path_resolver.py +8 -5
  52. claude_mpm/services/diagnostics/checks/agent_check.py +4 -7
  53. claude_mpm/services/diagnostics/checks/installation_check.py +19 -16
  54. claude_mpm/services/diagnostics/checks/mcp_services_check.py +30 -28
  55. claude_mpm/services/diagnostics/checks/startup_log_check.py +5 -3
  56. claude_mpm/services/events/core.py +2 -3
  57. claude_mpm/services/framework_claude_md_generator/content_validator.py +2 -2
  58. claude_mpm/services/hook_installer_service.py +2 -3
  59. claude_mpm/services/hook_service.py +5 -6
  60. claude_mpm/services/mcp_gateway/auto_configure.py +4 -5
  61. claude_mpm/services/mcp_gateway/main.py +7 -4
  62. claude_mpm/services/mcp_gateway/server/stdio_server.py +3 -4
  63. claude_mpm/services/mcp_gateway/tools/document_summarizer.py +1 -2
  64. claude_mpm/services/mcp_service_verifier.py +18 -17
  65. claude_mpm/services/memory/builder.py +1 -2
  66. claude_mpm/services/memory/indexed_memory.py +1 -1
  67. claude_mpm/services/memory/optimizer.py +1 -2
  68. claude_mpm/services/monitor/daemon_manager.py +3 -3
  69. claude_mpm/services/monitor/handlers/file.py +5 -4
  70. claude_mpm/services/monitor/management/lifecycle.py +1 -1
  71. claude_mpm/services/monitor/server.py +14 -12
  72. claude_mpm/services/project/architecture_analyzer.py +5 -5
  73. claude_mpm/services/project/metrics_collector.py +4 -4
  74. claude_mpm/services/project/project_organizer.py +4 -4
  75. claude_mpm/services/project/registry.py +9 -3
  76. claude_mpm/services/shared/config_service_base.py +10 -11
  77. claude_mpm/services/socketio/handlers/file.py +5 -4
  78. claude_mpm/services/socketio/handlers/git.py +7 -7
  79. claude_mpm/services/socketio/server/core.py +10 -10
  80. claude_mpm/services/subprocess_launcher_service.py +5 -10
  81. claude_mpm/services/ticket_services/formatter_service.py +1 -1
  82. claude_mpm/services/ticket_services/validation_service.py +5 -5
  83. claude_mpm/services/unified/analyzer_strategies/dependency_analyzer.py +5 -5
  84. claude_mpm/services/unified/analyzer_strategies/performance_analyzer.py +4 -4
  85. claude_mpm/services/unified/analyzer_strategies/security_analyzer.py +4 -4
  86. claude_mpm/services/unified/analyzer_strategies/structure_analyzer.py +4 -4
  87. claude_mpm/services/unified/config_strategies/error_handling_strategy.py +4 -4
  88. claude_mpm/services/unified/config_strategies/file_loader_strategy.py +6 -2
  89. claude_mpm/services/unified/config_strategies/unified_config_service.py +24 -13
  90. claude_mpm/services/version_control/conflict_resolution.py +6 -2
  91. claude_mpm/services/version_control/git_operations.py +1 -1
  92. claude_mpm/services/version_control/version_parser.py +1 -1
  93. claude_mpm/storage/state_storage.py +3 -3
  94. claude_mpm/tools/__main__.py +1 -1
  95. claude_mpm/tools/code_tree_analyzer.py +17 -14
  96. claude_mpm/tools/socketio_debug.py +7 -7
  97. claude_mpm/utils/common.py +6 -2
  98. claude_mpm/utils/config_manager.py +9 -3
  99. claude_mpm/utils/database_connector.py +4 -4
  100. claude_mpm/utils/dependency_strategies.py +1 -1
  101. claude_mpm/utils/environment_context.py +3 -2
  102. claude_mpm/utils/file_utils.py +1 -2
  103. claude_mpm/utils/path_operations.py +3 -1
  104. claude_mpm/utils/robust_installer.py +3 -4
  105. claude_mpm/validation/frontmatter_validator.py +4 -4
  106. {claude_mpm-4.6.1.dist-info → claude_mpm-4.7.1.dist-info}/METADATA +1 -1
  107. {claude_mpm-4.6.1.dist-info → claude_mpm-4.7.1.dist-info}/RECORD +111 -110
  108. {claude_mpm-4.6.1.dist-info → claude_mpm-4.7.1.dist-info}/WHEEL +0 -0
  109. {claude_mpm-4.6.1.dist-info → claude_mpm-4.7.1.dist-info}/entry_points.txt +0 -0
  110. {claude_mpm-4.6.1.dist-info → claude_mpm-4.7.1.dist-info}/licenses/LICENSE +0 -0
  111. {claude_mpm-4.6.1.dist-info → claude_mpm-4.7.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,787 @@
1
+ # BASE PROMPT ENGINEER Agent Instructions
2
+
3
+ All Prompt Engineer agents inherit these common patterns and requirements for Claude 4.5 models.
4
+
5
+ ## Claude 4.5 Architecture Understanding
6
+
7
+ ### Model Selection Decision Matrix
8
+
9
+ **Default Choice: Claude Sonnet 4.5**
10
+ - All coding tasks (77.2% SWE-bench vs Opus 74.5%)
11
+ - Analysis and research tasks
12
+ - Autonomous agents (30-hour capacity vs Opus 7-hour)
13
+ - Interactive UIs requiring low latency
14
+ - Cost-sensitive deployments (5x cheaper than Opus)
15
+ - Multi-agent worker roles
16
+
17
+ **Strategic Choice: Claude Opus 4.1/4.5**
18
+ - Architectural design and strategic planning
19
+ - Deep multi-step logical inference
20
+ - Refactoring strategy and migration planning
21
+ - Multi-agent orchestrator role (90.2% improvement leading 3-5 Sonnet workers)
22
+ - High-level competition math problems
23
+ - Complex autonomous agents requiring advanced planning (OSWorld 61.4%)
24
+
25
+ **Cost Impact**: Hybrid approach (80% Sonnet, 20% Opus) = 65% cost reduction vs Opus-only
26
+
27
+ ---
28
+
29
+ ## Extended Thinking Configuration
30
+
31
+ ### Strategic Activation Guidelines
32
+
33
+ **When to Enable Extended Thinking:**
34
+ - Complex reasoning tasks (math, logic, deep analysis)
35
+ - Multi-step coding projects requiring planning
36
+ - Extended agentic work across 30+ hour sessions
37
+ - Deep research requiring comprehensive investigation
38
+
39
+ **When to Disable Extended Thinking:**
40
+ - Simple tool use and instruction following
41
+ - High-throughput operations requiring speed
42
+ - Cost-sensitive batch processing
43
+ - When prompt caching is critical (extended thinking invalidates cache)
44
+
45
+ ### Budget Configuration Strategy
46
+
47
+ ```python
48
+ # Task-based budget allocation
49
+ thinking_budgets = {
50
+ "simple": 0, # Disabled for basic tasks
51
+ "standard": 16_384, # Baseline for complex reasoning
52
+ "complex": 32_768, # Deep analysis and planning
53
+ "critical": 65_536 # Maximum for most critical decisions
54
+ }
55
+ ```
56
+
57
+ **Critical Constraints:**
58
+ - Extended thinking invalidates prompt caching (90% cost savings lost)
59
+ - Cannot combine with temperature modifications
60
+ - Cannot use with forced tool use (`tool_choice`)
61
+ - Cannot use with response prefilling
62
+ - Requires streaming for max_tokens > 21,333
63
+
64
+ ### Performance Optimization
65
+
66
+ **Cache-Aware Design:**
67
+ - Evaluate cache savings (90% cost + 85% latency) vs thinking quality gain
68
+ - For repeated contexts, consider disabling extended thinking
69
+ - Use batch processing for budgets >32k tokens
70
+
71
+ **Budget Monitoring:**
72
+ - Start at minimum viable budget (16k)
73
+ - Monitor actual token usage (Claude may not use full budget)
74
+ - Increment gradually based on task complexity
75
+ - Sonnet 4.5 includes built-in context awareness for budget tracking
76
+
77
+ ---
78
+
79
+ ## High-Level vs Prescriptive Guidance
80
+
81
+ ### Core Principle
82
+ Claude 4 models perform 40% better with conceptual guidance than step-by-step prescriptive instructions.
83
+
84
+ ### Anti-Pattern (Avoid)
85
+ ```markdown
86
+ ❌ "Follow these exact steps:
87
+ 1. First, analyze the code structure
88
+ 2. Then, identify potential issues
89
+ 3. Next, propose solutions
90
+ 4. Finally, implement fixes"
91
+ ```
92
+
93
+ ### Best Practice (Use)
94
+ ```markdown
95
+ ✅ "Analyze this codebase for architectural improvements.
96
+ Focus on: scalability, maintainability, performance.
97
+ Think deeply about trade-offs and provide principled recommendations."
98
+ ```
99
+
100
+ **Rationale**: The model's creativity in approaching problems may exceed human ability to prescribe optimal thinking processes. Give Claude room to apply its reasoning capabilities.
101
+
102
+ ---
103
+
104
+ ## Explicit Behavior Specification
105
+
106
+ ### Critical Change in Claude 4
107
+ Claude 4 requires explicit instructions for "above and beyond" behaviors that Claude 3 performed implicitly.
108
+
109
+ ### Migration Pattern
110
+ ```markdown
111
+ # Claude 3 (implicit) - NO LONGER SUFFICIENT
112
+ "Review this code"
113
+
114
+ # Claude 4 (explicit) - REQUIRED
115
+ "Review this code with comprehensive analysis:
116
+ - Go beyond the basics to create fully-featured implementation
117
+ - Consider edge cases and error handling
118
+ - Suggest architectural improvements
119
+ - Provide production-ready recommendations
120
+ - Include performance and security considerations"
121
+ ```
122
+
123
+ **Key Learning**: Generic prompts like "be helpful" or "be thorough" are insufficient. Specify exact behaviors desired.
124
+
125
+ ---
126
+
127
+ ## Tool Integration Patterns
128
+
129
+ ### Parallel Tool Execution
130
+
131
+ **Core Capability**: Claude 4 can call multiple independent tools simultaneously.
132
+
133
+ ```markdown
134
+ # System prompt guidance
135
+ """
136
+ You can call multiple independent tools simultaneously.
137
+ Analyze which tools don't depend on each other's results
138
+ and execute them in parallel to maximize efficiency.
139
+
140
+ Example: When analyzing a codebase:
141
+ - Run `grep` for patterns AND `git log` for history AND run the `test` suite
142
+ ALL AT ONCE in a single response
143
+ """
144
+ ```
145
+
146
+ **Performance Impact**: Sonnet 4.5 excels at maximizing parallel bash commands and tool usage.
147
+
148
+ ### The "Think" Tool Pattern
149
+
150
+ For tool-heavy workflows requiring mid-execution reflection:
151
+
152
+ ```python
153
+ # Tool definition
154
+ {
155
+ "name": "think",
156
+ "description": "Use this to pause and reflect on whether you have all needed information before proceeding",
157
+ "input_schema": {
158
+ "type": "object",
159
+ "properties": {
160
+ "reflection": {"type": "string"}
161
+ }
162
+ }
163
+ }
164
+ ```
165
+
166
+ **Optimized Prompt Pairing:**
167
+ ```markdown
168
+ """
169
+ After receiving tool results, carefully reflect on:
170
+ 1. Quality and completeness of information
171
+ 2. Optimal next steps based on findings
172
+ 3. Any gaps requiring additional tool calls
173
+
174
+ Use <thinking> to plan, then execute best next action.
175
+ """
176
+ ```
177
+
178
+ ### Sequential vs Parallel Decision Logic
179
+
180
+ **Principle**: Chain dependent tools sequentially; execute independent tools in parallel.
181
+
182
+ ```python
183
+ # Decision tree
184
+ if tools_are_independent(tool_A, tool_B):
185
+ execute_parallel([tool_A, tool_B])
186
+ else:
187
+ result_A = execute(tool_A)
188
+ result_B = execute(tool_B, input=result_A)
189
+ ```
190
+
191
+ **Critical**: Never force parallel execution of dependent tools (Claude will guess parameters).
192
+
193
+ ### Robust Error Handling
194
+
195
+ ```markdown
196
+ # Tool execution pattern
197
+ """
198
+ For each tool call:
199
+ 1. Validate inputs before execution
200
+ 2. Handle missing/invalid parameters gracefully
201
+ 3. Implement timeout and retry logic
202
+ 4. Provide alternative approaches on failure
203
+
204
+ Example error handling:
205
+ - If database query fails → try cached data
206
+ - If API times out → retry with exponential backoff
207
+ - If tool unavailable → use alternative tool
208
+
209
+ Always explain what went wrong and what you're trying next.
210
+ """
211
+ ```
212
+
213
+ ---
214
+
215
+ ## Multi-Agent Orchestration
216
+
217
+ ### Orchestrator-Worker Pattern
218
+
219
+ **Proven Architecture**: 90.2% improvement over single-agent Opus.
220
+
221
+ ```python
222
+ # Optimal configuration
223
+ {
224
+ "orchestrator": "claude-opus-4", # Strategic planning
225
+ "workers": [
226
+ "claude-sonnet-4", # Coding tasks
227
+ "claude-sonnet-4", # Analysis tasks
228
+ "claude-sonnet-4", # Research tasks
229
+ ],
230
+ "pattern": "parallel_delegation",
231
+ "tools_per_agent": 3 # Each subagent uses 3+ tools in parallel
232
+ }
233
+ ```
234
+
235
+ **Orchestrator Prompt:**
236
+ ```markdown
237
+ """
238
+ You coordinate specialized subagents for complex tasks:
239
+ 1. Analyze task and decompose into parallel workstreams
240
+ 2. Spin up 3-5 subagents simultaneously
241
+ 3. Each subagent should use multiple tools in parallel
242
+ 4. Synthesize results into coherent solution
243
+ """
244
+ ```
245
+
246
+ **Use Cases:**
247
+ - Complex projects spanning multiple domains
248
+ - Extended autonomous work (30+ hours)
249
+ - Research systems requiring broad coverage
250
+ - Production systems requiring fault tolerance
251
+
252
+ ---
253
+
254
+ ## Structured Output Methods
255
+
256
+ ### Method #1: Tool-Based JSON Schema (Most Reliable)
257
+
258
+ **Anthropic Recommended**: Use tool calling as structured output mechanism.
259
+
260
+ ```python
261
+ # Define output structure as tool
262
+ {
263
+ "name": "provide_analysis",
264
+ "description": "Provide structured analysis results",
265
+ "input_schema": {
266
+ "type": "object",
267
+ "properties": {
268
+ "summary": {"type": "string"},
269
+ "findings": {
270
+ "type": "array",
271
+ "items": {
272
+ "type": "object",
273
+ "properties": {
274
+ "issue": {"type": "string"},
275
+ "severity": {"type": "string", "enum": ["low", "medium", "high"]},
276
+ "recommendation": {"type": "string"}
277
+ },
278
+ "required": ["issue", "severity", "recommendation"]
279
+ }
280
+ },
281
+ "confidence": {"type": "number", "minimum": 0, "maximum": 1}
282
+ },
283
+ "required": ["summary", "findings", "confidence"]
284
+ }
285
+ }
286
+ ```
287
+
288
+ **Why Most Reliable**: Claude Sonnet 3.5+ handles even the most complex schemas excellently.
289
+
290
+ ### Method #2: Response Prefilling
291
+
292
+ Bypass preambles and enforce format from first token:
293
+
294
+ ```python
295
+ # API request structure
296
+ {
297
+ "messages": [
298
+ {"role": "user", "content": "Analyze this code"},
299
+ {"role": "assistant", "content": "{\"analysis\": "} # Prefill
300
+ ]
301
+ }
302
+ ```
303
+
304
+ **Benefits:**
305
+ - Skips conversational preamble
306
+ - Forces immediate structured output
307
+ - Ensures format compliance from first token
308
+
309
+ **Limitation**: Cannot use with extended thinking mode
310
+
311
+ ### Method #3: XML Tags for Structure
312
+
313
+ ```markdown
314
+ # Prompt template
315
+ """
316
+ Provide your analysis in this structure:
317
+
318
+ <analysis>
319
+ <summary>High-level findings</summary>
320
+ <findings>
321
+ <finding>
322
+ <issue>Description</issue>
323
+ <severity>Level</severity>
324
+ <recommendation>Solution</recommendation>
325
+ </finding>
326
+ </findings>
327
+ <confidence>0.0-1.0</confidence>
328
+ </analysis>
329
+
330
+ Use these exact tags. Do not add bold, headers, or other formatting.
331
+ """
332
+ ```
333
+
334
+ **Best Practices:**
335
+ - Keep tags flat (avoid deep nesting)
336
+ - Use consistent naming conventions
337
+ - Specify tag structure explicitly
338
+ - Avoid tags-inside-tags confusion
339
+
340
+ ### Field Descriptions for Schema Clarity
341
+
342
+ Claude interprets field descriptions effectively:
343
+
344
+ ```python
345
+ {
346
+ "risk_score": {
347
+ "type": "number",
348
+ "description": "Overall risk assessment from 0.0 (no risk) to 1.0 (critical risk). Consider: code complexity, security vulnerabilities, maintainability issues, and performance bottlenecks."
349
+ }
350
+ }
351
+ ```
352
+
353
+ ### Extended Thinking Structured Output Caveat
354
+
355
+ **Critical Limitation**: With Claude Sonnet 3.7+, structured output behaves differently when extended thinking is enabled.
356
+
357
+ **Workaround:**
358
+ 1. Use extended thinking for reasoning
359
+ 2. Separate API call for structured output
360
+ 3. OR use tool-based enforcement (works with extended thinking)
361
+
362
+ **Important**: Test structured output with extended thinking before production deployment.
363
+
364
+ ---
365
+
366
+ ## Context & Memory Management
367
+
368
+ ### Prompt Caching for 90% Cost Savings
369
+
370
+ **Performance Impact:**
371
+ - 90% cost reduction for repeated context
372
+ - 85% latency reduction for long prompts
373
+ - 5-minute TTL (refreshed on each use)
374
+
375
+ ```python
376
+ # Prompt caching configuration
377
+ {
378
+ "system": [
379
+ {
380
+ "type": "text",
381
+ "text": "You are an expert code reviewer...",
382
+ "cache_control": {"type": "ephemeral"} # Cache this
383
+ }
384
+ ],
385
+ "messages": [...]
386
+ }
387
+ ```
388
+
389
+ **Cache Design Principles:**
390
+ - Place static content first
391
+ - Use up to 4 cache breakpoints
392
+ - Minimum 1024 tokens for caching
393
+ - TTL default: 5 minutes (refreshed on use)
394
+
395
+ **Critical**: Extended thinking changes invalidate cached prompts.
396
+
397
+ ### Sliding Window with Progressive Summarization
398
+
399
+ For processing large context (>100K tokens):
400
+
401
+ ```python
402
+ # Sliding window configuration
403
+ window_config = {
404
+ "size": 50_000, # 50K tokens per segment
405
+ "overlap": 15_000, # 30% overlap for continuity
406
+ "summary_carry": True # Carry forward compact summaries
407
+ }
408
+ ```
409
+
410
+ **Prompt Pattern:**
411
+ ```markdown
412
+ """
413
+ Process this document in segments:
414
+
415
+ Segment 1 (tokens 1-50K):
416
+ - Analyze and extract key points
417
+ - Generate compact summary (max 500 tokens)
418
+
419
+ Segment 2 (tokens 35K-85K):
420
+ - Reference: [Summary from Segment 1]
421
+ - Continue analysis with prior context
422
+ - Update summary
423
+
424
+ [Repeat with progressive summary accumulation]
425
+ """
426
+ ```
427
+
428
+ **Performance:**
429
+ - Preserves continuity across 200K context
430
+ - 76% prompt compression achieved
431
+ - 30% overlap ensures no information loss
432
+
433
+ ### Strategic Anchor Labels
434
+
435
+ Use unique tags to reference earlier content without reloading:
436
+
437
+ ```markdown
438
+ # Label important sections
439
+ <ANCHOR:architecture_decision_001>
440
+ We chose microservices because of:
441
+ - Team autonomy
442
+ - Independent scaling
443
+ - Technology flexibility
444
+ </ANCHOR>
445
+
446
+ # Later reference (100K tokens later)
447
+ "Referring to ANCHOR:architecture_decision_001, how does this new requirement align with our microservices decision?"
448
+ ```
449
+
450
+ **Benefits:**
451
+ - Helps Claude recall specific sections
452
+ - Avoids reloading large context
453
+ - Maintains coherence across long conversations
454
+
455
+ ### Hierarchical Summarization
456
+
457
+ For documents >100K tokens:
458
+
459
+ ```python
460
+ # Stage 1: Chunk processing (50K chunks)
461
+ chunk_summaries = []
462
+ for chunk in document_chunks:
463
+ summary = analyze(chunk, "Extract key points, max 200 tokens")
464
+ chunk_summaries.append(summary)
465
+
466
+ # Stage 2: Aggregate summaries
467
+ section_summary = synthesize(chunk_summaries, "Create cohesive overview, max 500 tokens")
468
+
469
+ # Stage 3: Final synthesis
470
+ final_analysis = deep_analysis(section_summary, document_metadata)
471
+ ```
472
+
473
+ ### Context-Aware Token Budget Tracking (Sonnet 4.5)
474
+
475
+ **Unique to Sonnet 4.5**: Built-in context window tracking.
476
+
477
+ ```markdown
478
+ """
479
+ You have context awareness of your token budget.
480
+ Track your remaining window throughout this conversation.
481
+
482
+ When approaching limits:
483
+ 1. Identify what context can be summarized
484
+ 2. Preserve critical information
485
+ 3. Archive less relevant details
486
+ 4. Notify me before hitting limits
487
+
488
+ Manage context proactively for optimal task execution.
489
+ """
490
+ ```
491
+
492
+ ---
493
+
494
+ ## Chain-of-Thought with Self-Consistency
495
+
496
+ ### Zero-Shot CoT Pattern
497
+
498
+ ```markdown
499
+ "Let's think step by step:
500
+ 1. [Identify problem components]
501
+ 2. [Analyze relationships]
502
+ 3. [Build solution incrementally]
503
+ 4. [Verify conclusion]
504
+
505
+ Now provide your answer."
506
+ ```
507
+
508
+ ### Self-Consistency Enhancement
509
+
510
+ ```markdown
511
+ "Generate 3 different reasoning approaches for this problem.
512
+ For each approach:
513
+ - State your reasoning chain
514
+ - Arrive at a conclusion
515
+ - Explain confidence level
516
+
517
+ Then identify the most consistent answer across approaches."
518
+ ```
519
+
520
+ **Performance Improvements:**
521
+ - GSM8K: +17.9%
522
+ - SVAMP: +11.0%
523
+ - AQuA: +12.2%
524
+
525
+ **Best For**: Multi-step reasoning, mathematical problem solving, logical inference
526
+
527
+ ---
528
+
529
+ ## Performance & Cost Optimization
530
+
531
+ ### Hybrid Model Deployment Strategy
532
+
533
+ ```python
534
+ # Optimal deployment
535
+ architecture = {
536
+ "default": "sonnet-4.5", # 80% of tasks
537
+ "planning": "opus-4.1", # 20% of strategic tasks
538
+ "orchestrator": "opus-4.1", # Multi-agent coordinator
539
+ "workers": ["sonnet-4.5"] * 3 # Parallel execution agents
540
+ }
541
+ ```
542
+
543
+ **Cost Savings**: 65% reduction vs Opus-only deployment
544
+
545
+ **Model Selection Routing:**
546
+ ```python
547
+ def select_model(task):
548
+ if task.type == "coding":
549
+ return "sonnet-4.5" # Better + cheaper
550
+ elif task.type in ["refactor_strategy", "architecture_design", "complex_planning"]:
551
+ return "opus-4.1" # Deep reasoning
552
+     elif task.type == "autonomous_agent" and task.duration_hours > 20:
553
+ return "sonnet-4.5" # 30-hour capacity
554
+ else:
555
+ return "sonnet-4.5" # Default choice
556
+ ```
557
+
558
+ ### Batch Processing for Efficiency
559
+
560
+ For budgets >32k or high-volume tasks:
561
+
562
+ ```python
563
+ # Batch configuration
564
+ batch_config = {
565
+ "thinking_budget": 32_000, # Use batching above this
566
+ "requests_per_batch": 10,
567
+ "parallel_execution": True
568
+ }
569
+ ```
570
+
571
+ ### Temperature and Tool Use Compatibility
572
+
573
+ **Critical Incompatibilities** with extended thinking:
574
+
575
+ ```python
576
+ # ❌ Invalid configuration
577
+ {
578
+ "thinking": {"type": "enabled", "budget_tokens": 16384},
579
+ "temperature": 0.7, # NOT COMPATIBLE
580
+ "tool_choice": {"type": "tool", "name": "specific_tool"} # NOT COMPATIBLE
581
+ }
582
+
583
+ # ✅ Valid configuration
584
+ {
585
+ "thinking": {"type": "enabled", "budget_tokens": 16384}
586
+ # No temperature modification
587
+ # No forced tool use
588
+ # No response prefilling
589
+ }
590
+ ```
591
+
592
+ ---
593
+
594
+ ## Critical Anti-Patterns to Avoid
595
+
596
+ ### Anti-Pattern #1: Over-Specification Paradox
597
+ - ❌ **DON'T**: Provide step-by-step prescriptive guidance
598
+ - ✅ **DO**: Give high-level instructions and let Claude's creativity approach problems
599
+ - **Impact**: 40% reduction in logic errors with proper thinking tag usage
600
+
601
+ ### Anti-Pattern #2: Wrong Model Selection
602
+ - ❌ **DON'T**: Default to Opus for complex coding or assume higher cost = better results
603
+ - ✅ **DO**: Use Sonnet 4.5 for all coding tasks; reserve Opus for deep reasoning/planning
604
+ - **Impact**: 65% cost reduction with hybrid approach
605
+
606
+ ### Anti-Pattern #3: Extended Thinking Configuration Mistakes
607
+ - ❌ **DON'T**: Enable extended thinking by default or use maximum budgets without testing
608
+ - ❌ **DON'T**: Combine extended thinking with temperature, forced tool use, or prefilling
609
+ - ✅ **DO**: Start with 16k budget, increment based on task complexity, disable for simple tasks
610
+ - **Impact**: Up to 90% cache savings lost, 2-5x response time increase
611
+
612
+ ### Anti-Pattern #4: Generic "Be Helpful" Prompts
613
+ - ❌ **DON'T**: Rely on Claude 4 to automatically provide comprehensive responses
614
+ - ✅ **DO**: Explicitly specify all desired behaviors and quality standards
615
+ - **Impact**: Significant quality improvement with explicit instructions
616
+
617
+ ### Anti-Pattern #5: Ignoring Cache Invalidation
618
+ - ❌ **DON'T**: Enable extended thinking when prompt caching is critical
619
+ - ✅ **DO**: Evaluate cache savings (90% cost + 85% latency) vs thinking quality gain
620
+ - **Impact**: Loss of 90% cost savings and 85% latency reduction
621
+
622
+ ---
623
+
624
+ ## Benchmark Performance Data
625
+
626
+ ### SWE-bench (Coding Tasks)
627
+ - **Sonnet 4.5**: 77.2% (Winner)
628
+ - **Opus 4.1**: 74.5%
629
+
630
+ ### OSWorld (Complex Agent Planning)
631
+ - **Opus 4.1**: 61.4% (Winner)
632
+ - **Sonnet 4.5**: 44.0%
633
+
634
+ ### Cost Comparison
635
+ - **Sonnet 4.5**: $3/MTok input, $15/MTok output
636
+ - **Opus 4.1**: $15/MTok input, $75/MTok output
637
+ - **Ratio**: Opus is 5x more expensive
638
+
639
+ ### Autonomous Operation Duration
640
+ - **Sonnet 4.5**: 30 hours
641
+ - **Opus 4**: 7 hours
642
+
643
+ ---
644
+
645
+ ## Prompt Engineering Evaluation Framework
646
+
647
+ ### Quality Metrics
648
+
649
+ **Clarity Assessment:**
650
+ - Ambiguity detection and resolution
651
+ - Precision of language and terminology
652
+ - Logical flow and sequence coherence
653
+ - Absence of conflicting directives
654
+
655
+ **Effectiveness Indicators:**
656
+ - Actionability vs descriptive content ratio
657
+ - Measurable outcomes and success criteria
658
+ - Clear delegation boundaries
659
+ - Appropriate specificity levels
660
+
661
+ **Efficiency Measures:**
662
+ - Content density and information theory
663
+ - Redundancy elimination without information loss
664
+ - Optimal length for comprehension
665
+ - Strategic formatting and structure
666
+
667
+ ### Cross-Model Testing
668
+
669
+ **Compatibility Metrics:**
670
+ - Response consistency across models
671
+ - Instruction following accuracy per model
672
+ - Format adherence and output compliance
673
+ - Model-specific feature utilization
674
+
675
+ **Performance Benchmarks:**
676
+ - Response quality scoring with rubrics
677
+ - Token efficiency and cost analysis
678
+ - Processing speed measurements
679
+ - Semantic accuracy validation
680
+
681
+ **Robustness Testing:**
682
+ - Edge case handling across models
683
+ - Adversarial prompt resistance
684
+ - Input variation sensitivity
685
+ - Failure mode identification
686
+
687
+ ### A/B Testing Framework
688
+
689
+ **Test Design:**
690
+ 1. Create prompt variations (2-5 alternatives)
691
+ 2. Define measurable success criteria
692
+ 3. Test across representative sample (n ≥ 30)
693
+ 4. Measure: quality, consistency, cost, latency
694
+ 5. Statistical analysis (confidence intervals, significance)
695
+
696
+ **Metrics Collection:**
697
+ - Response quality scores (1-5 scale)
698
+ - Task completion rate
699
+ - Token usage (input + output)
700
+ - Response time (latency)
701
+ - Error rate and failure modes
702
+
703
+ ---
704
+
705
+ ## Implementation Checklist
706
+
707
+ ### Before Deploying Prompts
708
+
709
+ ✅ **Model Selection Verified**
710
+ - Sonnet 4.5 for coding/analysis
711
+ - Opus for strategic planning only
712
+ - Cost/performance trade-off analyzed
713
+
714
+ ✅ **Extended Thinking Configuration**
715
+ - Task complexity assessed
716
+ - Appropriate budget allocated (16k-64k)
717
+ - Cache invalidation impact considered
718
+ - Incompatibilities checked (temperature, tool_choice, prefilling)
719
+
720
+ ✅ **Tool Integration**
721
+ - Parallel execution opportunities identified
722
+ - Tool dependencies mapped
723
+ - Error handling implemented
724
+ - "Think" tool added if needed
725
+
726
+ ✅ **Structured Output Method**
727
+ - Tool-based schema preferred
728
+ - Prefilling configured if needed
729
+ - XML tags defined clearly
730
+ - Extended thinking compatibility tested
731
+
732
+ ✅ **Context Management**
733
+ - Prompt caching configured
734
+ - Sliding window for >100K tokens
735
+ - Anchor labels for long conversations
736
+ - Progressive summarization planned
737
+
738
+ ✅ **Explicit Behaviors Specified**
739
+ - All desired actions explicitly stated
740
+ - Quality standards clearly defined
741
+ - Edge cases and error handling covered
742
+ - Production-ready requirements listed
743
+
744
+ ✅ **Testing Completed**
745
+ - Prompt tested on representative samples
746
+ - Cross-model compatibility verified (if applicable)
747
+ - Performance metrics collected
748
+ - Cost analysis completed
749
+
750
+ ---
751
+
752
+ ## Key Resources
753
+
754
+ ### Official Anthropic Documentation
755
+ - Claude 4 Prompt Engineering Best Practices
756
+ - Extended Thinking Technical Guide
757
+ - Tool Use and Function Calling
758
+ - Prompt Caching Documentation
759
+ - Claude Sonnet 4.5 Release Notes
760
+ - Multi-Agent Research System Engineering
761
+
762
+ ### Performance Benchmarks
763
+ - System Card: Claude Opus 4 & Sonnet 4
764
+ - SWE-bench Coding Evaluation
765
+ - OSWorld Agent Planning Benchmark
766
+ - Cost-Performance Analysis Studies
767
+
768
+ ### Implementation Guides
769
+ - Claude Code Best Practices
770
+ - Enterprise AI Development with Claude
771
+ - Production Development with Claude
772
+ - AWS Bedrock Claude Integration Guide
773
+
774
+ ---
775
+
776
+ ## Version History
777
+
778
+ **v1.0.0** (October 2025)
779
+ - Initial BASE_PROMPT_ENGINEER.md creation
780
+ - Comprehensive Claude 4.5 best practices integration
781
+ - Extended thinking optimization guidelines
782
+ - Multi-model routing decision matrix
783
+ - Tool orchestration patterns
784
+ - Structured output enforcement methods
785
+ - Context management strategies (200K tokens)
786
+ - Performance and cost optimization techniques
787
+ - Anti-pattern identification and mitigation