groundswell 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +9 -0
- package/.claude/system_prompts/task-breakdown.md +100 -0
- package/PRPs/001-hierarchical-workflow-engine.md +2438 -0
- package/PRPs/PRDs/001-hierarchical-workflow-engine.md +543 -0
- package/PRPs/PRDs/002-agent-prompt.md +390 -0
- package/PRPs/PRDs/003-agent-prompt.md +943 -0
- package/PRPs/PRDs/004-agent-prompt.md +1136 -0
- package/PRPs/PRDs/tasks-001.json +492 -0
- package/PRPs/README.md +83 -0
- package/PRPs/templates/prp_base.md +222 -0
- package/README.md +218 -0
- package/docs/agent.md +422 -0
- package/docs/prompt.md +419 -0
- package/docs/workflow.md +600 -0
- package/examples/README.md +244 -0
- package/examples/examples/01-basic-workflow.ts +100 -0
- package/examples/examples/02-decorator-options.ts +217 -0
- package/examples/examples/03-parent-child.ts +241 -0
- package/examples/examples/04-observers-debugger.ts +340 -0
- package/examples/examples/05-error-handling.ts +387 -0
- package/examples/examples/06-concurrent-tasks.ts +352 -0
- package/examples/examples/07-agent-loops.ts +432 -0
- package/examples/examples/08-sdk-features.ts +667 -0
- package/examples/examples/09-reflection.ts +573 -0
- package/examples/examples/10-introspection.ts +550 -0
- package/examples/index.ts +143 -0
- package/examples/utils/helpers.ts +57 -0
- package/llms_full.txt +5890 -0
- package/package.json +63 -0
- package/plan/P1P2/PRP.md +527 -0
- package/plan/P1P2/research/LRU_CACHE_BEST_PRACTICES.md +1929 -0
- package/plan/P1P2/research/LRU_CACHE_CODE_PATTERNS.md +857 -0
- package/plan/P1P2/research/LRU_CACHE_INTEGRATION_GUIDE.md +738 -0
- package/plan/P1P2/research/LRU_CACHE_RESEARCH_INDEX.md +424 -0
- package/plan/P1P2/research/REFLECTION_INDEX.md +291 -0
- package/plan/P1P2/research/REFLECTION_RESEARCH_REPORT.md +1342 -0
- package/plan/P1P2/research/RESEARCH_SUMMARY.md +342 -0
- package/plan/P1P2/research/anthropic-sdk.md +174 -0
- package/plan/P1P2/research/async-local-storage.md +200 -0
- package/plan/P1P2/research/reflection-code-patterns.md +1205 -0
- package/plan/P1P2/research/reflection-decision-matrix.md +421 -0
- package/plan/P1P2/research/reflection-implementation-guide.md +1341 -0
- package/plan/P1P2/research/reflection-integration-guide.md +834 -0
- package/plan/P1P2/research/reflection-patterns.md +1468 -0
- package/plan/P1P2/research/reflection-quick-reference.md +558 -0
- package/plan/P1P2/research/zod-schema.md +152 -0
- package/plan/P3P4/PRP.md +1388 -0
- package/plan/P3P4/research/caching-lru.md +116 -0
- package/plan/P3P4/research/introspection-tools.md +177 -0
- package/plan/P3P4/research/reflection-patterns.md +117 -0
- package/plan/P4P5/PRP.md +1136 -0
- package/plan/P4P5/research/RESEARCH_SUMMARY.md +151 -0
- package/plan/architecture/external_deps.md +358 -0
- package/plan/architecture/system_context.md +242 -0
- package/plan/backlog.json +867 -0
- package/plan/research/INTROSPECTION_RESEARCH_SUMMARY.md +378 -0
- package/plan/research/README-INTROSPECTION.md +352 -0
- package/plan/research/agent-introspection-patterns.md +1085 -0
- package/plan/research/introspection-security-guide.md +928 -0
- package/plan/research/introspection-tool-examples.md +875 -0
- package/scripts/generate-llms-full.ts +206 -0
- package/src/__tests__/integration/agent-workflow.test.ts +256 -0
- package/src/__tests__/integration/tree-mirroring.test.ts +114 -0
- package/src/__tests__/unit/agent.test.ts +169 -0
- package/src/__tests__/unit/cache-key.test.ts +182 -0
- package/src/__tests__/unit/cache.test.ts +172 -0
- package/src/__tests__/unit/context.test.ts +138 -0
- package/src/__tests__/unit/decorators.test.ts +100 -0
- package/src/__tests__/unit/introspection-tools.test.ts +277 -0
- package/src/__tests__/unit/prompt.test.ts +135 -0
- package/src/__tests__/unit/reflection.test.ts +210 -0
- package/src/__tests__/unit/tree-debugger.test.ts +85 -0
- package/src/__tests__/unit/workflow.test.ts +81 -0
- package/src/cache/cache-key.ts +244 -0
- package/src/cache/cache.ts +236 -0
- package/src/cache/index.ts +8 -0
- package/src/core/agent.ts +573 -0
- package/src/core/context.ts +119 -0
- package/src/core/event-tree.ts +260 -0
- package/src/core/factory.ts +123 -0
- package/src/core/index.ts +17 -0
- package/src/core/logger.ts +87 -0
- package/src/core/mcp-handler.ts +184 -0
- package/src/core/prompt.ts +150 -0
- package/src/core/workflow-context.ts +349 -0
- package/src/core/workflow.ts +302 -0
- package/src/debugger/index.ts +1 -0
- package/src/debugger/tree-debugger.ts +210 -0
- package/src/decorators/index.ts +3 -0
- package/src/decorators/observed-state.ts +95 -0
- package/src/decorators/step.ts +139 -0
- package/src/decorators/task.ts +96 -0
- package/src/examples/index.ts +2 -0
- package/src/examples/tdd-orchestrator.ts +65 -0
- package/src/examples/test-cycle-workflow.ts +64 -0
- package/src/index.ts +140 -0
- package/src/reflection/index.ts +5 -0
- package/src/reflection/reflection.ts +407 -0
- package/src/tools/index.ts +36 -0
- package/src/tools/introspection.ts +464 -0
- package/src/types/agent.ts +90 -0
- package/src/types/decorators.ts +25 -0
- package/src/types/error-strategy.ts +13 -0
- package/src/types/error.ts +20 -0
- package/src/types/events.ts +74 -0
- package/src/types/index.ts +55 -0
- package/src/types/logging.ts +24 -0
- package/src/types/observer.ts +18 -0
- package/src/types/prompt.ts +40 -0
- package/src/types/reflection.ts +117 -0
- package/src/types/sdk-primitives.ts +128 -0
- package/src/types/snapshot.ts +14 -0
- package/src/types/workflow-context.ts +163 -0
- package/src/types/workflow.ts +37 -0
- package/src/utils/id.ts +11 -0
- package/src/utils/index.ts +3 -0
- package/src/utils/observable.ts +77 -0
- package/tasks.json +0 -0
- package/tsconfig.json +22 -0
- package/vitest.config.ts +16 -0
|
@@ -0,0 +1,558 @@
|
|
|
1
|
+
# Reflection Patterns: Quick Reference Guide
|
|
2
|
+
|
|
3
|
+
## Decision Tree: When and How to Use Reflection
|
|
4
|
+
|
|
5
|
+
```
|
|
6
|
+
START: Should I use reflection?
|
|
7
|
+
|
|
|
8
|
+
├─ Does your task need high quality? (not time-critical)
|
|
9
|
+
| └─ YES: Consider reflection
|
|
10
|
+
| |
|
|
11
|
+
| ├─ Can you get external feedback? (tool results, tests, retrieval)
|
|
12
|
+
| | └─ YES: Use Reflexion (evidence-grounded)
|
|
13
|
+
| | └─ NO: Use basic reflection with internal evaluation
|
|
14
|
+
| |
|
|
15
|
+
| ├─ Multiple attempts possible?
|
|
16
|
+
| | └─ YES: Set max_attempts = 2-3
|
|
17
|
+
| | └─ NO: Single-pass reflection only
|
|
18
|
+
| |
|
|
19
|
+
| └─ Can you allocate extra tokens?
|
|
20
|
+
| └─ YES: Proceed with implementation
|
|
21
|
+
| └─ NO: Use minimal reflection (1 cycle max)
|
|
22
|
+
|
|
|
23
|
+
└─ NO: Skip reflection, use single-pass generation
|
|
24
|
+
(E.g., real-time chat, low-latency APIs)
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Reflection Approach Selection Matrix
|
|
30
|
+
|
|
31
|
+
| Task Type | Approach | Max Attempts | Feedback Source | Notes |
|
|
32
|
+
|-----------|----------|--------------|-----------------|-------|
|
|
33
|
+
| Code Generation | Reflexion | 2-3 | Test results | Tool-assisted validation critical |
|
|
34
|
+
| Writing/Content | Basic Reflection | 2-3 | Quality criteria | Simple evaluation works well |
|
|
35
|
+
| Analysis/Research | Reflexion | 2-3 | Fact-checking, retrieval | Ground in external data |
|
|
36
|
+
| Planning | Basic Reflection | 1-2 | Feasibility check | Keep lightweight |
|
|
37
|
+
| Dialogue/Conversation | None | 0 | Real-time feedback | Too slow for interactive |
|
|
38
|
+
| Multi-step workflows | Hierarchical | 1-2 per step | Manager review | Reflect at orchestration level |
|
|
39
|
+
| Math/Logic Problems | Tool-Interactive | 2-3 | Verification | Use solver tools |
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## Prompt Template Quick Reference
|
|
44
|
+
|
|
45
|
+
### Template 1: Quick Retry (Fastest)
|
|
46
|
+
```
|
|
47
|
+
Generated: [OUTPUT]
|
|
48
|
+
Issues: [BRIEF_ERROR]
|
|
49
|
+
|
|
50
|
+
Try again, fixing these issues.
|
|
51
|
+
```
|
|
52
|
+
**Use when**: Time-critical, simple corrections needed
|
|
53
|
+
**Token cost**: Low
|
|
54
|
+
**Effectiveness**: 60-70% improvement
|
|
55
|
+
|
|
56
|
+
### Template 2: Evidence-Grounded (Recommended)
|
|
57
|
+
```
|
|
58
|
+
Your response: [OUTPUT]
|
|
59
|
+
Evidence check: [TOOL_RESULTS]
|
|
60
|
+
Issues: [CONTRADICTIONS]
|
|
61
|
+
|
|
62
|
+
Fix issues based on evidence.
|
|
63
|
+
Cite your sources.
|
|
64
|
+
```
|
|
65
|
+
**Use when**: Accuracy matters, external tools available
|
|
66
|
+
**Token cost**: Medium
|
|
67
|
+
**Effectiveness**: 80-90% improvement
|
|
68
|
+
|
|
69
|
+
### Template 3: Self-Critique (Detailed)
|
|
70
|
+
```
|
|
71
|
+
Your response: [OUTPUT]
|
|
72
|
+
Quality evaluation: [SCORING]
|
|
73
|
+
|
|
74
|
+
Identify weaknesses.
|
|
75
|
+
Propose specific improvements.
|
|
76
|
+
Rewrite addressing each weakness.
|
|
77
|
+
```
|
|
78
|
+
**Use when**: Complex tasks, nuanced improvements needed
|
|
79
|
+
**Token cost**: Medium-High
|
|
80
|
+
**Effectiveness**: 75-85% improvement
|
|
81
|
+
|
|
82
|
+
### Template 4: Multi-Agent (Highest Quality)
|
|
83
|
+
```
|
|
84
|
+
Initial response: [OUTPUT]
|
|
85
|
+
|
|
86
|
+
As a critic, identify problems with this response.
|
|
87
|
+
Be specific and cite evidence.
|
|
88
|
+
|
|
89
|
+
[SEPARATE LLM CALL]
|
|
90
|
+
|
|
91
|
+
Based on criticism: [FEEDBACK]
|
|
92
|
+
|
|
93
|
+
Provide improved response addressing all feedback.
|
|
94
|
+
```
|
|
95
|
+
**Use when**: Critical quality required, budget available
|
|
96
|
+
**Token cost**: High (2 LLM calls)
|
|
97
|
+
**Effectiveness**: 85-95% improvement
|
|
98
|
+
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
## Configuration Profiles
|
|
102
|
+
|
|
103
|
+
### Profile: Speed-Optimized
|
|
104
|
+
```
|
|
105
|
+
max_attempts: 1
|
|
106
|
+
reflection_style: "minimal"
|
|
107
|
+
external_feedback: false
|
|
108
|
+
token_budget: 20000
|
|
109
|
+
timeout_seconds: 10
|
|
110
|
+
```
|
|
111
|
+
**Best for**: Real-time applications, chat interfaces
|
|
112
|
+
|
|
113
|
+
### Profile: Quality-Optimized
|
|
114
|
+
```
|
|
115
|
+
max_attempts: 3
|
|
116
|
+
reflection_style: "evidence_grounded"
|
|
117
|
+
external_feedback: true
|
|
118
|
+
token_budget: 100000
|
|
119
|
+
timeout_seconds: 60
|
|
120
|
+
```
|
|
121
|
+
**Best for**: Knowledge work, analysis, content creation
|
|
122
|
+
|
|
123
|
+
### Profile: Balanced
|
|
124
|
+
```
|
|
125
|
+
max_attempts: 2
|
|
126
|
+
reflection_style: "self_critique"
|
|
127
|
+
external_feedback: conditional
|
|
128
|
+
token_budget: 50000
|
|
129
|
+
timeout_seconds: 30
|
|
130
|
+
```
|
|
131
|
+
**Best for**: Most production applications
|
|
132
|
+
|
|
133
|
+
### Profile: Safety-Critical
|
|
134
|
+
```
|
|
135
|
+
max_attempts: 3
|
|
136
|
+
reflection_style: "multi_agent"
|
|
137
|
+
external_feedback: required
|
|
138
|
+
token_budget: 150000
|
|
139
|
+
timeout_seconds: 120
|
|
140
|
+
loop_detection: aggressive
|
|
141
|
+
security_validation: strict
|
|
142
|
+
```
|
|
143
|
+
**Best for**: Medical, legal, financial applications
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
## Stopping Conditions Checklist
|
|
148
|
+
|
|
149
|
+
Check these in order (first true = stop):
|
|
150
|
+
|
|
151
|
+
1. **Hard Limit**: `attempt_number >= max_attempts`
|
|
152
|
+
- Never exceed configured maximum
|
|
153
|
+
- Typically 2-3 for reflection
|
|
154
|
+
|
|
155
|
+
2. **Quality Achieved**: `quality_score >= target_threshold`
|
|
156
|
+
- Task complete if quality is good enough
|
|
157
|
+
- Typical threshold: 0.8 (0-1 scale)
|
|
158
|
+
|
|
159
|
+
3. **Improvement Stalled**: `improvement < min_improvement_threshold`
|
|
160
|
+
- If quality improved by less than 5% this cycle
|
|
161
|
+
- Indicates diminishing returns
|
|
162
|
+
|
|
163
|
+
4. **Loop Detected**: `detect_infinite_loop(output, error, history)`
|
|
164
|
+
- Stop if exact same output repeated
|
|
165
|
+
- Stop if same error repeated 2+ times
|
|
166
|
+
- Stop if outputs too similar (>95%)
|
|
167
|
+
|
|
168
|
+
5. **Budget Exceeded**: `tokens_used > token_budget OR time_elapsed > timeout`
|
|
169
|
+
- Token budget exhausted
|
|
170
|
+
- Wall-clock timeout reached
|
|
171
|
+
|
|
172
|
+
6. **User Intervention**: `user_requested_stop()`
|
|
173
|
+
- If a human cancels the operation
|
|
174
|
+
- If user provides different instruction
|
|
175
|
+
|
|
176
|
+
---
|
|
177
|
+
|
|
178
|
+
## Common Mistakes and Fixes
|
|
179
|
+
|
|
180
|
+
### Mistake 1: Reflecting Without External Feedback
|
|
181
|
+
|
|
182
|
+
```
|
|
183
|
+
WRONG:
|
|
184
|
+
Generate output
|
|
185
|
+
→ LLM reflects on own output
|
|
186
|
+
→ LLM generates improvement
|
|
187
|
+
→ Often doesn't improve or gets worse
|
|
188
|
+
|
|
189
|
+
CORRECT:
|
|
190
|
+
Generate output
|
|
191
|
+
→ Run tests/retrieve data/check facts
|
|
192
|
+
→ Provide concrete evidence to LLM
|
|
193
|
+
→ LLM reflects grounded in evidence
|
|
194
|
+
→ Improvement is reliable
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
### Mistake 2: Infinite Reflection Loops
|
|
198
|
+
|
|
199
|
+
```
|
|
200
|
+
WRONG:
|
|
201
|
+
while true:
|
|
202
|
+
output = generate()
|
|
203
|
+
feedback = reflect(output)
|
|
204
|
+
output = improve(output, feedback)
|
|
205
|
+
|
|
206
|
+
CORRECT:
|
|
207
|
+
for attempt in range(max_attempts):
|
|
208
|
+
if detect_loop(output):
|
|
209
|
+
break
|
|
210
|
+
output = generate()
|
|
211
|
+
if is_good_enough(output):
|
|
212
|
+
break
|
|
213
|
+
feedback = reflect(output)
|
|
214
|
+
output = improve(output, feedback)
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
### Mistake 3: Including Full History in Every Cycle
|
|
218
|
+
|
|
219
|
+
```
|
|
220
|
+
WRONG:
|
|
221
|
+
Attempt 1: 100 tokens output
|
|
222
|
+
Reflect with 100 tokens context + feedback = 150 tokens
|
|
223
|
+
Attempt 2: 150 tokens output
|
|
224
|
+
Reflect with 250 tokens context + feedback = 400 tokens
|
|
225
|
+
... context window balloons with every cycle
|
|
226
|
+
|
|
227
|
+
CORRECT:
|
|
228
|
+
Keep rolling window of last 2 attempts only
|
|
229
|
+
Summarize older attempts: "Attempts 1-3 hit these issues: ..."
|
|
230
|
+
Use external memory for full history
|
|
231
|
+
Track lessons learned separately from raw outputs
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
### Mistake 4: Waiting for Perfect Quality
|
|
235
|
+
|
|
236
|
+
```
|
|
237
|
+
WRONG:
|
|
238
|
+
Set target_quality = 1.0 (perfect)
|
|
239
|
+
Keep reflecting until perfect
|
|
240
|
+
Uses all tokens, never actually achieves perfection
|
|
241
|
+
|
|
242
|
+
CORRECT:
|
|
243
|
+
Set target_quality = 0.8 (good enough)
|
|
244
|
+
Stop when threshold reached
|
|
245
|
+
Accept "best effort" after max attempts
|
|
246
|
+
Use remaining budget for other tasks
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
### Mistake 5: Reflecting on Unpredictable Outputs
|
|
250
|
+
|
|
251
|
+
```
|
|
252
|
+
WRONG:
|
|
253
|
+
Task: "Write a creative story"
|
|
254
|
+
→ Every reflection produces completely different story
|
|
255
|
+
→ Can't detect improvement or loops
|
|
256
|
+
→ Metrics meaningless
|
|
257
|
+
|
|
258
|
+
CORRECT:
|
|
259
|
+
Only use reflection for deterministic/measurable tasks
|
|
260
|
+
For creative tasks: use single-pass generation
|
|
261
|
+
Or define specific evaluation criteria (tone, length, style)
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
---
|
|
265
|
+
|
|
266
|
+
## Performance Benchmarks
|
|
267
|
+
|
|
268
|
+
These are typical baselines - adjust based on your models and tasks.
|
|
269
|
+
|
|
270
|
+
### Code Generation
|
|
271
|
+
```
|
|
272
|
+
Task: "Write function that..."
|
|
273
|
+
Approach: Reflexion with test feedback
|
|
274
|
+
|
|
275
|
+
Without reflection:
|
|
276
|
+
- Success rate: 60%
|
|
277
|
+
- Time: 2-3 seconds
|
|
278
|
+
- Token cost: 2000 tokens
|
|
279
|
+
|
|
280
|
+
With reflection (2 cycles):
|
|
281
|
+
- Success rate: 88%
|
|
282
|
+
- Time: 5-8 seconds
|
|
283
|
+
- Token cost: 5000 tokens
|
|
284
|
+
|
|
285
|
+
ROI: +28 percentage points success rate, 2.5x cost
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
### Fact-Checking / Analysis
|
|
289
|
+
```
|
|
290
|
+
Task: "Analyze this research finding"
|
|
291
|
+
Approach: Reflexion with web search
|
|
292
|
+
|
|
293
|
+
Without reflection:
|
|
294
|
+
- Error rate: 20%
|
|
295
|
+
- Token cost: 3000 tokens
|
|
296
|
+
|
|
297
|
+
With reflection (2 cycles):
|
|
298
|
+
- Error rate: 3%
|
|
299
|
+
- Token cost: 8000 tokens
|
|
300
|
+
|
|
301
|
+
ROI: Error reduction worth cost in high-stakes use cases
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
### Writing Quality
|
|
305
|
+
```
|
|
306
|
+
Task: "Write product description"
|
|
307
|
+
Approach: Self-critique reflection
|
|
308
|
+
|
|
309
|
+
Without reflection:
|
|
310
|
+
- Quality score: 6.5/10
|
|
311
|
+
- Time: 3 seconds
|
|
312
|
+
- Token cost: 2000 tokens
|
|
313
|
+
|
|
314
|
+
With reflection (2 cycles):
|
|
315
|
+
- Quality score: 8.2/10
|
|
316
|
+
- Time: 8 seconds
|
|
317
|
+
- Token cost: 5000 tokens
|
|
318
|
+
|
|
319
|
+
ROI: 26% quality improvement, 2.5x cost
|
|
320
|
+
Worth it for marketing/professional content
|
|
321
|
+
Not worth it for chat responses
|
|
322
|
+
```
|
|
323
|
+
|
|
324
|
+
---
|
|
325
|
+
|
|
326
|
+
## Token Budget Calculator
|
|
327
|
+
|
|
328
|
+
Quick estimation for reflection:
|
|
329
|
+
|
|
330
|
+
```
|
|
331
|
+
Initial output generation:
|
|
332
|
+
~2-3 KB text = 500-750 tokens
|
|
333
|
+
|
|
334
|
+
Per reflection cycle:
|
|
335
|
+
- Feedback generation: 200-300 tokens
|
|
336
|
+
- Improvement generation: similar to initial = 500-750 tokens
|
|
337
|
+
- Total per cycle: 700-1050 tokens
|
|
338
|
+
|
|
339
|
+
Examples:
|
|
340
|
+
1 cycle: 500 + 850 = 1350 tokens
|
|
341
|
+
2 cycles: 500 + 850 + 850 = 2200 tokens
|
|
342
|
+
3 cycles: 500 + 850 + 850 + 850 = 3050 tokens
|
|
343
|
+
|
|
344
|
+
With memory/context:
|
|
345
|
+
Add 20-30% overhead for context window usage
|
|
346
|
+
|
|
347
|
+
Total budget recommendation:
|
|
348
|
+
Simple task: 5000-10000 tokens
|
|
349
|
+
Complex task: 20000-50000 tokens
|
|
350
|
+
Very complex: 50000-100000+ tokens
|
|
351
|
+
```
|
|
352
|
+
|
|
353
|
+
---
|
|
354
|
+
|
|
355
|
+
## Security Checklist
|
|
356
|
+
|
|
357
|
+
Before deploying reflection system:
|
|
358
|
+
|
|
359
|
+
- [ ] Sanitize all feedback before sending to LLM
|
|
360
|
+
- [ ] Validate tool calls mentioned in feedback are whitelisted
|
|
361
|
+
- [ ] Limit feedback length (max 1000 characters)
|
|
362
|
+
- [ ] Filter credentials/secrets from history
|
|
363
|
+
- [ ] Implement reflection depth limits
|
|
364
|
+
- [ ] Log all reflection activities for audit
|
|
365
|
+
- [ ] Test with adversarial feedback/prompts
|
|
366
|
+
- [ ] Define clear escalation paths
|
|
367
|
+
- [ ] Set rate limits on reflection API calls
|
|
368
|
+
- [ ] Monitor for unusual reflection patterns
|
|
369
|
+
|
|
370
|
+
---
|
|
371
|
+
|
|
372
|
+
## Introspection Tool Permissions Matrix
|
|
373
|
+
|
|
374
|
+
| Tool | Worker Agent | Supervisor | Manager | Admin |
|
|
375
|
+
|------|--------------|-----------|---------|-------|
|
|
376
|
+
| `read_own_history` | Yes | Yes | Yes | Yes |
|
|
377
|
+
| `read_own_metadata` | Yes | Yes | Yes | Yes |
|
|
378
|
+
| `read_parent_context` | Limited | Yes | Yes | Yes |
|
|
379
|
+
| `read_sibling_context` | No | Limited | Yes | Yes |
|
|
380
|
+
| `modify_own_state` | No | No | Limited | Yes |
|
|
381
|
+
| `escalate_to_parent` | Yes | Yes | Limited | No |
|
|
382
|
+
| `query_execution_metrics` | Limited | Yes | Yes | Yes |
|
|
383
|
+
| `query_cost_metrics` | No | Limited | Yes | Yes |
|
|
384
|
+
| `read_credentials` | No | No | No | Yes |
|
|
385
|
+
|
|
386
|
+
---
|
|
387
|
+
|
|
388
|
+
## Monitoring Dashboard Essentials
|
|
389
|
+
|
|
390
|
+
Key metrics to track:
|
|
391
|
+
|
|
392
|
+
**Real-time:**
|
|
393
|
+
- Active reflection cycles
|
|
394
|
+
- Avg quality improvement this hour
|
|
395
|
+
- Tokens used this hour
|
|
396
|
+
- Loop detections this hour
|
|
397
|
+
|
|
398
|
+
**Daily/Weekly:**
|
|
399
|
+
- Success rate (with/without reflection)
|
|
400
|
+
- Avg attempts per successful task
|
|
401
|
+
- Most common errors
|
|
402
|
+
- Most effective reflection approaches
|
|
403
|
+
|
|
404
|
+
**Cost Analysis:**
|
|
405
|
+
- Cost per improved result
|
|
406
|
+
- Cost per percentage improvement
|
|
407
|
+
- ROI per use case
|
|
408
|
+
|
|
409
|
+
---
|
|
410
|
+
|
|
411
|
+
## Integration Checklist
|
|
412
|
+
|
|
413
|
+
### Before deploying reflection:
|
|
414
|
+
|
|
415
|
+
**Architecture**
|
|
416
|
+
- [ ] LLM client configured with retry logic
|
|
417
|
+
- [ ] Token tracking integrated
|
|
418
|
+
- [ ] State persistence implemented (checkpoints)
|
|
419
|
+
- [ ] Loop detection system active
|
|
420
|
+
|
|
421
|
+
**Safety**
|
|
422
|
+
- [ ] Input validation in place
|
|
423
|
+
- [ ] Rate limits configured
|
|
424
|
+
- [ ] Timeout limits set
|
|
425
|
+
- [ ] Credentials filtered from context
|
|
426
|
+
|
|
427
|
+
**Observability**
|
|
428
|
+
- [ ] Logging configured
|
|
429
|
+
- [ ] Metrics collection active
|
|
430
|
+
- [ ] Alerting rules defined
|
|
431
|
+
- [ ] Dashboard created
|
|
432
|
+
|
|
433
|
+
**Testing**
|
|
434
|
+
- [ ] Unit tests for reflection logic
|
|
435
|
+
- [ ] Integration tests with LLM calls
|
|
436
|
+
- [ ] Load tests on token budgets
|
|
437
|
+
- [ ] Security/adversarial tests
|
|
438
|
+
|
|
439
|
+
**Documentation**
|
|
440
|
+
- [ ] How to configure reflection per task
|
|
441
|
+
- [ ] How to interpret metrics
|
|
442
|
+
- [ ] How to troubleshoot issues
|
|
443
|
+
- [ ] How to modify templates
|
|
444
|
+
|
|
445
|
+
---
|
|
446
|
+
|
|
447
|
+
## When to Use vs. When NOT to Use Reflection
|
|
448
|
+
|
|
449
|
+
### Use Reflection When:
|
|
450
|
+
|
|
451
|
+
✓ Quality is more important than speed
|
|
452
|
+
✓ You have time/tokens to spend
|
|
453
|
+
✓ You can get external feedback (tests, tools, retrieval)
|
|
454
|
+
✓ The task is deterministic/measurable
|
|
455
|
+
✓ Users are willing to wait
|
|
456
|
+
✓ Cost is not the primary constraint
|
|
457
|
+
✓ Correctness is critical
|
|
458
|
+
|
|
459
|
+
### DO NOT Use Reflection When:
|
|
460
|
+
|
|
461
|
+
✗ Sub-second latency required
|
|
462
|
+
✗ Operating under strict token/cost limits
|
|
463
|
+
✗ Task is purely creative (no criteria)
|
|
464
|
+
✗ No external feedback available
|
|
465
|
+
✗ Frequent updates needed (information changes rapidly)
|
|
466
|
+
✗ Task is inherently random/unpredictable
|
|
467
|
+
✗ User engagement requires immediate response
|
|
468
|
+
|
|
469
|
+
---
|
|
470
|
+
|
|
471
|
+
## Quick Troubleshooting Guide
|
|
472
|
+
|
|
473
|
+
| Problem | Symptoms | Solution |
|
|
474
|
+
|---------|----------|----------|
|
|
475
|
+
| Infinite Loop | Same output repeated, timeouts | Reduce max_attempts, add loop detection |
|
|
476
|
+
| Token Overflow | Out of memory errors | Reduce budget, compress history, use external memory |
|
|
477
|
+
| No Improvement | Quality stays same despite reflection | Add external feedback, change template |
|
|
478
|
+
| Getting Worse | Quality decreases after reflection | Disable reflection for this task type |
|
|
479
|
+
| Too Slow | Timeouts at reflection stage | Reduce reflection depth, use faster model |
|
|
480
|
+
| Misleading Feedback | Loop keeps trying same wrong approach | Use multi-agent reflection, add evidence requirement |
|
|
481
|
+
| Security Issues | Injection attempts in feedback | Add input validation, limit tool mentions |
|
|
482
|
+
|
|
483
|
+
---
|
|
484
|
+
|
|
485
|
+
## Example Configuration Files
|
|
486
|
+
|
|
487
|
+
### TypeScript Config
|
|
488
|
+
```typescript
|
|
489
|
+
const reflectionConfig = {
|
|
490
|
+
enabled: true,
|
|
491
|
+
maxAttempts: 2,
|
|
492
|
+
approach: "evidence_grounded",
|
|
493
|
+
tokenBudget: 30000,
|
|
494
|
+
qualityThreshold: 0.8,
|
|
495
|
+
loopDetection: {
|
|
496
|
+
enabled: true,
|
|
497
|
+
identicalThreshold: 2,
|
|
498
|
+
similarityThreshold: 0.95,
|
|
499
|
+
},
|
|
500
|
+
security: {
|
|
501
|
+
maxFeedbackLength: 1000,
|
|
502
|
+
forbiddenKeywords: ["api_key", "password"],
|
|
503
|
+
allowedTools: ["search", "test", "validate"],
|
|
504
|
+
},
|
|
505
|
+
timeouts: {
|
|
506
|
+
perCycleSeconds: 30,
|
|
507
|
+
totalSeconds: 120,
|
|
508
|
+
},
|
|
509
|
+
};
|
|
510
|
+
```
|
|
511
|
+
|
|
512
|
+
### YAML Config
|
|
513
|
+
```yaml
|
|
514
|
+
reflection:
|
|
515
|
+
enabled: true
|
|
516
|
+
max_attempts: 2
|
|
517
|
+
approach: "evidence_grounded"
|
|
518
|
+
|
|
519
|
+
budget:
|
|
520
|
+
tokens: 30000
|
|
521
|
+
time_seconds: 120
|
|
522
|
+
|
|
523
|
+
quality:
|
|
524
|
+
threshold: 0.8
|
|
525
|
+
min_improvement: 0.05
|
|
526
|
+
|
|
527
|
+
safety:
|
|
528
|
+
max_feedback_length: 1000
|
|
529
|
+
loop_detection: true
|
|
530
|
+
forbidden_keywords:
|
|
531
|
+
- api_key
|
|
532
|
+
- password
|
|
533
|
+
- token
|
|
534
|
+
```
|
|
535
|
+
|
|
536
|
+
---
|
|
537
|
+
|
|
538
|
+
## Further Reading
|
|
539
|
+
|
|
540
|
+
### Academic Papers
|
|
541
|
+
- [Self-Reflection in LLM Agents](https://arxiv.org/pdf/2405.06682) - Core research
|
|
542
|
+
- [Reflexion Framework](https://arxiv.org/abs/2303.11366) - Evidence grounding
|
|
543
|
+
- [Language Agent Tree Search (LATS)](https://arxiv.org/abs/2310.04406) - Tree-based reflection
|
|
544
|
+
|
|
545
|
+
### Framework Documentation
|
|
546
|
+
- [LangGraph Reflection](https://langchain-ai.github.io/langgraph/tutorials/reflection/reflection/)
|
|
547
|
+
- [CrewAI Hierarchical Process](https://docs.crewai.com/how-to/hierarchical-process)
|
|
548
|
+
- [LangChain Reflection Agents](https://blog.langchain.com/reflection-agents/)
|
|
549
|
+
|
|
550
|
+
### Security Resources
|
|
551
|
+
- [OWASP LLM Security](https://genai.owasp.org/llmrisk/llm01-prompt-injection/)
|
|
552
|
+
- [OpenAI on Prompt Injection](https://openai.com/index/prompt-injections/)
|
|
553
|
+
|
|
554
|
+
---
|
|
555
|
+
|
|
556
|
+
**Last Updated**: December 2025
|
|
557
|
+
**Version**: 1.0
|
|
558
|
+
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
# Zod Schema Validation Research
|
|
2
|
+
|
|
3
|
+
## Official Documentation URLs
|
|
4
|
+
|
|
5
|
+
| Resource | URL |
|
|
6
|
+
|----------|-----|
|
|
7
|
+
| **Zod v3 Docs** | https://v3.zod.dev/ |
|
|
8
|
+
| **Current Docs** | https://zod.dev/ |
|
|
9
|
+
| **Basics Guide** | https://zod.dev/basics |
|
|
10
|
+
| **API Reference** | https://zod.dev/api |
|
|
11
|
+
| **GitHub** | https://github.com/colinhacks/zod |
|
|
12
|
+
| **NPM** | https://www.npmjs.com/package/zod |
|
|
13
|
+
| **Error Formatting** | https://zod.dev/error-formatting |
|
|
14
|
+
| **JSON Schema** | https://zod.dev/json-schema |
|
|
15
|
+
|
|
16
|
+
## Key Patterns
|
|
17
|
+
|
|
18
|
+
### Basic Schema Definition
|
|
19
|
+
```typescript
|
|
20
|
+
import { z } from 'zod';
|
|
21
|
+
|
|
22
|
+
const UserSchema = z.object({
|
|
23
|
+
name: z.string(),
|
|
24
|
+
email: z.string().email(),
|
|
25
|
+
age: z.number().int().positive(),
|
|
26
|
+
active: z.boolean()
|
|
27
|
+
});
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
### Type Inference with `z.infer<T>`
|
|
31
|
+
```typescript
|
|
32
|
+
type User = z.infer<typeof UserSchema>;
|
|
33
|
+
// { name: string; email: string; age: number; active: boolean }
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
### Schema Validation
|
|
37
|
+
```typescript
|
|
38
|
+
// Method 1: .parse() - throws ZodError on failure
|
|
39
|
+
try {
|
|
40
|
+
const result = UserSchema.parse(data);
|
|
41
|
+
} catch (error) {
|
|
42
|
+
if (error instanceof z.ZodError) {
|
|
43
|
+
console.error(error.issues);
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
// Method 2: .safeParse() - returns discriminated union
|
|
48
|
+
const result = UserSchema.safeParse(data);
|
|
49
|
+
if (result.success) {
|
|
50
|
+
console.log(result.data); // Type-safe
|
|
51
|
+
} else {
|
|
52
|
+
console.error(result.error.issues);
|
|
53
|
+
}
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Error Handling
|
|
57
|
+
```typescript
|
|
58
|
+
const result = schema.safeParse(data);
|
|
59
|
+
if (!result.success) {
|
|
60
|
+
// Access issues array
|
|
61
|
+
console.log(result.error.issues);
|
|
62
|
+
|
|
63
|
+
// Format as nested object
|
|
64
|
+
const formatted = result.error.format();
|
|
65
|
+
|
|
66
|
+
// Flatten for forms
|
|
67
|
+
const flattened = result.error.flatten();
|
|
68
|
+
}
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Schema Introspection
|
|
72
|
+
|
|
73
|
+
### Accessing _def (Internal)
|
|
74
|
+
```typescript
|
|
75
|
+
const schema = z.object({
|
|
76
|
+
name: z.string(),
|
|
77
|
+
tags: z.array(z.string())
|
|
78
|
+
});
|
|
79
|
+
|
|
80
|
+
// Access object shape
|
|
81
|
+
console.log(schema._def.shape());
|
|
82
|
+
|
|
83
|
+
// Get array element type
|
|
84
|
+
const arraySchema = z.array(z.string());
|
|
85
|
+
console.log(arraySchema._def.type);
|
|
86
|
+
|
|
87
|
+
// Detect schema type
|
|
88
|
+
console.log(z.string()._def.typeName); // "ZodString"
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### JSON Schema Conversion (v3)
|
|
92
|
+
```typescript
|
|
93
|
+
// Use zod-to-json-schema for v3
|
|
94
|
+
import { zodToJsonSchema } from 'zod-to-json-schema';
|
|
95
|
+
|
|
96
|
+
const jsonSchema = zodToJsonSchema(zodSchema);
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Advanced Features
|
|
100
|
+
|
|
101
|
+
### Optional Fields
|
|
102
|
+
```typescript
|
|
103
|
+
const schema = z.object({
|
|
104
|
+
name: z.string(),
|
|
105
|
+
middleName: z.string().optional(), // string | undefined
|
|
106
|
+
nickname: z.string().nullable() // string | null
|
|
107
|
+
});
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Union Types
|
|
111
|
+
```typescript
|
|
112
|
+
const stringOrNumber = z.union([z.string(), z.number()]);
|
|
113
|
+
|
|
114
|
+
// Discriminated union (more efficient)
|
|
115
|
+
const result = z.discriminatedUnion('status', [
|
|
116
|
+
z.object({ status: z.literal('success'), data: z.string() }),
|
|
117
|
+
z.object({ status: z.literal('error'), message: z.string() })
|
|
118
|
+
]);
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Arrays
|
|
122
|
+
```typescript
|
|
123
|
+
const stringArray = z.array(z.string());
|
|
124
|
+
const boundedArray = z.array(z.number()).min(1).max(10);
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
## TypeScript Integration
|
|
128
|
+
|
|
129
|
+
### Generic ZodType Usage
|
|
130
|
+
```typescript
|
|
131
|
+
import { z, ZodType } from 'zod';
|
|
132
|
+
|
|
133
|
+
function validateData<T extends ZodType>(
|
|
134
|
+
data: unknown,
|
|
135
|
+
schema: T
|
|
136
|
+
): z.infer<T> {
|
|
137
|
+
return schema.parse(data);
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
// Generic schema factory
|
|
141
|
+
function createEnvelopeSchema<T extends ZodType>(messageSchema: T) {
|
|
142
|
+
return z.object({
|
|
143
|
+
from: z.string(),
|
|
144
|
+
to: z.string(),
|
|
145
|
+
message: messageSchema
|
|
146
|
+
});
|
|
147
|
+
}
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
## Package Version
|
|
151
|
+
|
|
152
|
+
Use **zod@^3.23.0** for stability (not v4.x which is in beta).
|