ag-cortex 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/.agent/commands/test-browser.md +339 -0
  2. package/.agent/rules/00-constitution.md +46 -0
  3. package/.agent/rules/project-rules.md +49 -0
  4. package/.agent/skills/agent-browser/SKILL.md +223 -0
  5. package/.agent/skills/agent-native-architecture/SKILL.md +435 -0
  6. package/.agent/skills/agent-native-architecture/references/action-parity-discipline.md +409 -0
  7. package/.agent/skills/agent-native-architecture/references/agent-execution-patterns.md +467 -0
  8. package/.agent/skills/agent-native-architecture/references/agent-native-testing.md +582 -0
  9. package/.agent/skills/agent-native-architecture/references/architecture-patterns.md +478 -0
  10. package/.agent/skills/agent-native-architecture/references/dynamic-context-injection.md +338 -0
  11. package/.agent/skills/agent-native-architecture/references/files-universal-interface.md +301 -0
  12. package/.agent/skills/agent-native-architecture/references/from-primitives-to-domain-tools.md +359 -0
  13. package/.agent/skills/agent-native-architecture/references/mcp-tool-design.md +506 -0
  14. package/.agent/skills/agent-native-architecture/references/mobile-patterns.md +871 -0
  15. package/.agent/skills/agent-native-architecture/references/product-implications.md +443 -0
  16. package/.agent/skills/agent-native-architecture/references/refactoring-to-prompt-native.md +317 -0
  17. package/.agent/skills/agent-native-architecture/references/self-modification.md +269 -0
  18. package/.agent/skills/agent-native-architecture/references/shared-workspace-architecture.md +680 -0
  19. package/.agent/skills/agent-native-architecture/references/system-prompt-design.md +250 -0
  20. package/.agent/skills/agent-native-reviewer/SKILL.md +246 -0
  21. package/.agent/skills/andrew-kane-gem-writer/SKILL.md +184 -0
  22. package/.agent/skills/andrew-kane-gem-writer/references/database-adapters.md +231 -0
  23. package/.agent/skills/andrew-kane-gem-writer/references/module-organization.md +121 -0
  24. package/.agent/skills/andrew-kane-gem-writer/references/rails-integration.md +183 -0
  25. package/.agent/skills/andrew-kane-gem-writer/references/resources.md +119 -0
  26. package/.agent/skills/andrew-kane-gem-writer/references/testing-patterns.md +261 -0
  27. package/.agent/skills/ankane-readme-writer/SKILL.md +50 -0
  28. package/.agent/skills/architecture-strategist/SKILL.md +52 -0
  29. package/.agent/skills/best-practices-researcher/SKILL.md +100 -0
  30. package/.agent/skills/bug-reproduction-validator/SKILL.md +67 -0
  31. package/.agent/skills/code-simplicity-reviewer/SKILL.md +85 -0
  32. package/.agent/skills/coding-tutor/.claude-plugin/plugin.json +9 -0
  33. package/.agent/skills/coding-tutor/README.md +37 -0
  34. package/.agent/skills/coding-tutor/commands/quiz-me.md +1 -0
  35. package/.agent/skills/coding-tutor/commands/sync-tutorials.md +25 -0
  36. package/.agent/skills/coding-tutor/commands/teach-me.md +1 -0
  37. package/.agent/skills/coding-tutor/skills/coding-tutor/SKILL.md +214 -0
  38. package/.agent/skills/coding-tutor/skills/coding-tutor/scripts/create_tutorial.py +202 -0
  39. package/.agent/skills/coding-tutor/skills/coding-tutor/scripts/index_tutorials.py +203 -0
  40. package/.agent/skills/coding-tutor/skills/coding-tutor/scripts/quiz_priority.py +190 -0
  41. package/.agent/skills/coding-tutor/skills/coding-tutor/scripts/setup_tutorials.py +132 -0
  42. package/.agent/skills/compound-docs/SKILL.md +510 -0
  43. package/.agent/skills/compound-docs/assets/critical-pattern-template.md +34 -0
  44. package/.agent/skills/compound-docs/assets/resolution-template.md +93 -0
  45. package/.agent/skills/compound-docs/references/yaml-schema.md +65 -0
  46. package/.agent/skills/compound-docs/schema.yaml +176 -0
  47. package/.agent/skills/create-agent-skills/SKILL.md +299 -0
  48. package/.agent/skills/create-agent-skills/references/api-security.md +226 -0
  49. package/.agent/skills/create-agent-skills/references/be-clear-and-direct.md +531 -0
  50. package/.agent/skills/create-agent-skills/references/best-practices.md +404 -0
  51. package/.agent/skills/create-agent-skills/references/common-patterns.md +595 -0
  52. package/.agent/skills/create-agent-skills/references/core-principles.md +437 -0
  53. package/.agent/skills/create-agent-skills/references/executable-code.md +175 -0
  54. package/.agent/skills/create-agent-skills/references/iteration-and-testing.md +474 -0
  55. package/.agent/skills/create-agent-skills/references/official-spec.md +185 -0
  56. package/.agent/skills/create-agent-skills/references/recommended-structure.md +168 -0
  57. package/.agent/skills/create-agent-skills/references/skill-structure.md +372 -0
  58. package/.agent/skills/create-agent-skills/references/using-scripts.md +113 -0
  59. package/.agent/skills/create-agent-skills/references/using-templates.md +112 -0
  60. package/.agent/skills/create-agent-skills/references/workflows-and-validation.md +510 -0
  61. package/.agent/skills/create-agent-skills/templates/router-skill.md +73 -0
  62. package/.agent/skills/create-agent-skills/templates/simple-skill.md +33 -0
  63. package/.agent/skills/create-agent-skills/workflows/add-reference.md +96 -0
  64. package/.agent/skills/create-agent-skills/workflows/add-script.md +93 -0
  65. package/.agent/skills/create-agent-skills/workflows/add-template.md +74 -0
  66. package/.agent/skills/create-agent-skills/workflows/add-workflow.md +120 -0
  67. package/.agent/skills/create-agent-skills/workflows/audit-skill.md +138 -0
  68. package/.agent/skills/create-agent-skills/workflows/create-domain-expertise-skill.md +605 -0
  69. package/.agent/skills/create-agent-skills/workflows/create-new-skill.md +191 -0
  70. package/.agent/skills/create-agent-skills/workflows/get-guidance.md +121 -0
  71. package/.agent/skills/create-agent-skills/workflows/upgrade-to-router.md +161 -0
  72. package/.agent/skills/create-agent-skills/workflows/verify-skill.md +204 -0
  73. package/.agent/skills/data-integrity-guardian/SKILL.md +70 -0
  74. package/.agent/skills/data-migration-expert/SKILL.md +97 -0
  75. package/.agent/skills/deployment-verification-agent/SKILL.md +159 -0
  76. package/.agent/skills/design-implementation-reviewer/SKILL.md +85 -0
  77. package/.agent/skills/design-iterator/SKILL.md +197 -0
  78. package/.agent/skills/dhh-rails-reviewer/SKILL.md +45 -0
  79. package/.agent/skills/dhh-rails-style/SKILL.md +184 -0
  80. package/.agent/skills/dhh-rails-style/references/architecture.md +653 -0
  81. package/.agent/skills/dhh-rails-style/references/controllers.md +303 -0
  82. package/.agent/skills/dhh-rails-style/references/frontend.md +510 -0
  83. package/.agent/skills/dhh-rails-style/references/gems.md +266 -0
  84. package/.agent/skills/dhh-rails-style/references/models.md +359 -0
  85. package/.agent/skills/dhh-rails-style/references/testing.md +338 -0
  86. package/.agent/skills/dspy-ruby/SKILL.md +594 -0
  87. package/.agent/skills/dspy-ruby/assets/config-template.rb +359 -0
  88. package/.agent/skills/dspy-ruby/assets/module-template.rb +326 -0
  89. package/.agent/skills/dspy-ruby/assets/signature-template.rb +143 -0
  90. package/.agent/skills/dspy-ruby/references/core-concepts.md +265 -0
  91. package/.agent/skills/dspy-ruby/references/optimization.md +623 -0
  92. package/.agent/skills/dspy-ruby/references/providers.md +305 -0
  93. package/.agent/skills/every-style-editor/SKILL.md +134 -0
  94. package/.agent/skills/every-style-editor/references/EVERY_WRITE_STYLE.md +529 -0
  95. package/.agent/skills/figma-design-sync/SKILL.md +166 -0
  96. package/.agent/skills/file-todos/SKILL.md +251 -0
  97. package/.agent/skills/file-todos/assets/todo-template.md +155 -0
  98. package/.agent/skills/framework-docs-researcher/SKILL.md +83 -0
  99. package/.agent/skills/frontend-design/SKILL.md +42 -0
  100. package/.agent/skills/gemini-imagegen/SKILL.md +237 -0
  101. package/.agent/skills/gemini-imagegen/requirements.txt +2 -0
  102. package/.agent/skills/gemini-imagegen/scripts/compose_images.py +168 -0
  103. package/.agent/skills/gemini-imagegen/scripts/edit_image.py +157 -0
  104. package/.agent/skills/gemini-imagegen/scripts/gemini_images.py +265 -0
  105. package/.agent/skills/gemini-imagegen/scripts/generate_image.py +147 -0
  106. package/.agent/skills/gemini-imagegen/scripts/multi_turn_chat.py +215 -0
  107. package/.agent/skills/git-history-analyzer/SKILL.md +42 -0
  108. package/.agent/skills/git-worktree/SKILL.md +302 -0
  109. package/.agent/skills/git-worktree/scripts/worktree-manager.sh +345 -0
  110. package/.agent/skills/julik-frontend-races-reviewer/SKILL.md +222 -0
  111. package/.agent/skills/kieran-python-reviewer/SKILL.md +104 -0
  112. package/.agent/skills/kieran-rails-reviewer/SKILL.md +86 -0
  113. package/.agent/skills/kieran-typescript-reviewer/SKILL.md +95 -0
  114. package/.agent/skills/lint/SKILL.md +16 -0
  115. package/.agent/skills/pattern-recognition-specialist/SKILL.md +57 -0
  116. package/.agent/skills/performance-oracle/SKILL.md +110 -0
  117. package/.agent/skills/pr-comment-resolver/SKILL.md +69 -0
  118. package/.agent/skills/rclone/SKILL.md +150 -0
  119. package/.agent/skills/rclone/scripts/check_setup.sh +60 -0
  120. package/.agent/skills/repo-research-analyst/SKILL.md +113 -0
  121. package/.agent/skills/security-sentinel/SKILL.md +93 -0
  122. package/.agent/skills/skill-creator/SKILL.md +209 -0
  123. package/.agent/skills/skill-creator/scripts/init_skill.py +304 -0
  124. package/.agent/skills/skill-creator/scripts/package_skill.py +112 -0
  125. package/.agent/skills/skill-creator/scripts/quick_validate.py +72 -0
  126. package/.agent/skills/spec-flow-analyzer/SKILL.md +113 -0
  127. package/.agent/skills/test-agent/SKILL.md +4 -0
  128. package/.agent/workflows/agent-native-audit.md +277 -0
  129. package/.agent/workflows/ask-user-question.md +21 -0
  130. package/.agent/workflows/changelog.md +137 -0
  131. package/.agent/workflows/compound.md +202 -0
  132. package/.agent/workflows/create-agent-skill.md +8 -0
  133. package/.agent/workflows/deepen-plan-research.md +334 -0
  134. package/.agent/workflows/deepen-plan-synthesis.md +182 -0
  135. package/.agent/workflows/deepen-plan.md +79 -0
  136. package/.agent/workflows/feature-video.md +342 -0
  137. package/.agent/workflows/generate-command.md +162 -0
  138. package/.agent/workflows/heal-skill.md +142 -0
  139. package/.agent/workflows/lfg.md +20 -0
  140. package/.agent/workflows/plan-analysis.md +67 -0
  141. package/.agent/workflows/plan-next-steps.md +63 -0
  142. package/.agent/workflows/plan-review.md +33 -0
  143. package/.agent/workflows/plan-synthesis.md +106 -0
  144. package/.agent/workflows/plan.md +49 -0
  145. package/.agent/workflows/report-bug.md +150 -0
  146. package/.agent/workflows/reproduce-bug.md +99 -0
  147. package/.agent/workflows/resolve-parallel.md +34 -0
  148. package/.agent/workflows/resolve-pr-parallel.md +49 -0
  149. package/.agent/workflows/resolve-todo-parallel.md +35 -0
  150. package/.agent/workflows/review-analysis.md +145 -0
  151. package/.agent/workflows/review-synthesis.md +262 -0
  152. package/.agent/workflows/review.md +64 -0
  153. package/.agent/workflows/ship.md +90 -0
  154. package/.agent/workflows/test-command.md +3 -0
  155. package/.agent/workflows/triage.md +310 -0
  156. package/.agent/workflows/work.md +157 -0
  157. package/.agent/workflows/xcode-test.md +332 -0
  158. package/LICENSE +22 -0
  159. package/README.md +49 -0
  160. package/bin/ag-cortex.js +54 -0
  161. package/lib/core.js +165 -0
  162. package/package.json +31 -0
@@ -0,0 +1,582 @@
1
+ <overview>
2
+ Testing agent-native apps requires different approaches than traditional unit testing. You're testing whether the agent achieves outcomes, not whether it calls specific functions. This guide provides concrete testing patterns for verifying your app is truly agent-native.
3
+ </overview>
4
+
5
+ <testing_philosophy>
6
+ ## Testing Philosophy
7
+
8
+ ### Test Outcomes, Not Procedures
9
+
10
+ **Traditional (procedure-focused):**
11
+ ```typescript
12
+ // Testing that a specific function was called with specific args
13
+ expect(mockProcessFeedback).toHaveBeenCalledWith({
14
+ message: "Great app!",
15
+ category: "praise",
16
+ priority: 2
17
+ });
18
+ ```
19
+
20
+ **Agent-native (outcome-focused):**
21
+ ```typescript
22
+ // Testing that the outcome was achieved
23
+ const result = await agent.process("Great app!");
24
+ const storedFeedback = await db.feedback.getLatest();
25
+
26
+ expect(storedFeedback.content).toContain("Great app");
27
+ expect(storedFeedback.importance).toBeGreaterThanOrEqual(1);
28
+ expect(storedFeedback.importance).toBeLessThanOrEqual(5);
29
+ // We don't care exactly how it categorized—just that it's reasonable
30
+ ```
31
+
32
+ ### Accept Variability
33
+
34
+ Agents may solve problems differently each time. Your tests should:
35
+ - Verify the end state, not the path
36
+ - Accept reasonable ranges, not exact values
37
+ - Check for presence of required elements, not exact format
38
+ </testing_philosophy>
39
+
40
+ <can_agent_do_it_test>
41
+ ## The "Can Agent Do It?" Test
42
+
43
+ For each UI feature, write a test prompt and verify the agent can accomplish it.
44
+
45
+ ### Template
46
+
47
+ ```typescript
48
+ describe('Agent Capability Tests', () => {
49
+ test('Agent can add a book to library', async () => {
50
+ const result = await agent.chat("Add 'Moby Dick' by Herman Melville to my library");
51
+
52
+ // Verify outcome
53
+ const library = await libraryService.getBooks();
54
+ const mobyDick = library.find(b => b.title.includes("Moby Dick"));
55
+
56
+ expect(mobyDick).toBeDefined();
57
+ expect(mobyDick.author).toContain("Melville");
58
+ });
59
+
60
+ test('Agent can publish to feed', async () => {
61
+ // Setup: ensure a book exists
62
+ await libraryService.addBook({ id: "book_123", title: "1984" });
63
+
64
+ const result = await agent.chat("Write something about surveillance themes in my feed");
65
+
66
+ // Verify outcome
67
+ const feed = await feedService.getItems();
68
+ const newItem = feed.find(item => item.bookId === "book_123");
69
+
70
+ expect(newItem).toBeDefined();
71
+ expect(newItem.content.toLowerCase()).toMatch(/surveillance|watching|control/);
72
+ });
73
+
74
+ test('Agent can search and save research', async () => {
75
+ await libraryService.addBook({ id: "book_456", title: "Moby Dick" });
76
+
77
+ const result = await agent.chat("Research whale symbolism in Moby Dick");
78
+
79
+ // Verify files were created
80
+ const files = await fileService.listFiles("Research/book_456/");
81
+ expect(files.length).toBeGreaterThan(0);
82
+
83
+ // Verify content is relevant
84
+ const content = await fileService.readFile(files[0]);
85
+ expect(content.toLowerCase()).toMatch(/whale|symbolism|melville/);
86
+ });
87
+ });
88
+ ```
89
+
90
+ ### The "Write to Location" Test
91
+
92
+ A key litmus test: can the agent create content in specific app locations?
93
+
94
+ ```typescript
95
+ describe('Location Awareness Tests', () => {
96
+ const locations = [
97
+ { userPhrase: "my reading feed", expectedTool: "publish_to_feed" },
98
+ { userPhrase: "my library", expectedTool: "add_book" },
99
+ { userPhrase: "my research folder", expectedTool: "write_file" },
100
+ { userPhrase: "my profile", expectedTool: "write_file" },
101
+ ];
102
+
103
+ for (const { userPhrase, expectedTool } of locations) {
104
+ test(`Agent knows how to write to "${userPhrase}"`, async () => {
105
+ const prompt = `Write a test note to ${userPhrase}`;
106
+ const result = await agent.chat(prompt);
107
+
108
+ // Check that agent used the right tool (or achieved the outcome)
109
+ expect(result.toolCalls).toContainEqual(
110
+ expect.objectContaining({ name: expectedTool })
111
+ );
112
+
113
+ // Or verify outcome directly
114
+ // expect(await locationHasNewContent(userPhrase)).toBe(true);
115
+ });
116
+ }
117
+ });
118
+ ```
119
+ </can_agent_do_it_test>
120
+
121
+ <surprise_test>
122
+ ## The "Surprise Test"
123
+
124
+ A well-designed agent-native app lets the agent figure out creative approaches. Test this by giving open-ended requests.
125
+
126
+ ### The Test
127
+
128
+ ```typescript
129
+ describe('Agent Creativity Tests', () => {
130
+ test('Agent can handle open-ended requests', async () => {
131
+ // Setup: user has some books
132
+ await libraryService.addBook({ id: "1", title: "1984", author: "Orwell" });
133
+ await libraryService.addBook({ id: "2", title: "Brave New World", author: "Huxley" });
134
+ await libraryService.addBook({ id: "3", title: "Fahrenheit 451", author: "Bradbury" });
135
+
136
+ // Open-ended request
137
+ const result = await agent.chat("Help me organize my reading for next month");
138
+
139
+ // The agent should do SOMETHING useful
140
+ // We don't specify exactly what—that's the point
141
+ expect(result.toolCalls.length).toBeGreaterThan(0);
142
+
143
+ // It should have engaged with the library
144
+ const libraryTools = ["read_library", "write_file", "publish_to_feed"];
145
+ const usedLibraryTool = result.toolCalls.some(
146
+ call => libraryTools.includes(call.name)
147
+ );
148
+ expect(usedLibraryTool).toBe(true);
149
+ });
150
+
151
+ test('Agent finds creative solutions', async () => {
152
+ // Don't specify HOW to accomplish the task
153
+ const result = await agent.chat(
154
+ "I want to understand the dystopian themes across my sci-fi books"
155
+ );
156
+
157
+ // Agent might:
158
+ // - Read all books and create a comparison document
159
+ // - Research dystopian literature and relate it to user's books
160
+ // - Create a mind map in a markdown file
161
+ // - Publish a series of insights to the feed
162
+
163
+ // We just verify it did something substantive
164
+ expect(result.response.length).toBeGreaterThan(100);
165
+ expect(result.toolCalls.length).toBeGreaterThan(0);
166
+ });
167
+ });
168
+ ```
169
+
170
+ ### What Failure Looks Like
171
+
172
+ ```typescript
173
+ // FAILURE: Agent can only say it can't do that
174
+ const result = await agent.chat("Help me prepare for a book club discussion");
175
+
176
+ // Bad outcome:
177
+ expect(result.response).not.toContain("I can't");
178
+ expect(result.response).not.toContain("I don't have a tool");
179
+ expect(result.response).not.toContain("Could you clarify");
180
+
181
+ // If the agent asks for clarification on something it should understand,
182
+ // you have a context injection or capability gap
183
+ ```
184
+ </surprise_test>
185
+
186
+ <parity_testing>
187
+ ## Automated Parity Testing
188
+
189
+ Ensure every UI action has an agent equivalent.
190
+
191
+ ### Capability Map Testing
192
+
193
+ ```typescript
194
+ // capability-map.ts
195
+ export const capabilityMap = {
196
+ // UI Action: Agent Tool
197
+ "View library": "read_library",
198
+ "Add book": "add_book",
199
+ "Delete book": "delete_book",
200
+ "Publish insight": "publish_to_feed",
201
+ "Start research": "start_research",
202
+ "View highlights": "read_library", // same tool, different query
203
+ "Edit profile": "write_file",
204
+ "Search web": "web_search",
205
+ "Export data": "N/A", // UI-only action
206
+ };
207
+
208
+ // parity.test.ts
209
+ import { capabilityMap } from './capability-map';
210
+ import { getAgentTools } from './agent-config';
211
+ import { getSystemPrompt } from './system-prompt';
212
+
213
+ describe('Action Parity', () => {
214
+ const agentTools = getAgentTools();
215
+ const systemPrompt = getSystemPrompt();
216
+
217
+ for (const [uiAction, toolName] of Object.entries(capabilityMap)) {
218
+ if (toolName === 'N/A') continue;
219
+
220
+ test(`"${uiAction}" has agent tool: ${toolName}`, () => {
221
+ const toolNames = agentTools.map(t => t.name);
222
+ expect(toolNames).toContain(toolName);
223
+ });
224
+
225
+ test(`${toolName} is documented in system prompt`, () => {
226
+ expect(systemPrompt).toContain(toolName);
227
+ });
228
+ }
229
+ });
230
+ ```
231
+
232
+ ### Context Parity Testing
233
+
234
+ ```typescript
235
+ describe('Context Parity', () => {
236
+ test('Agent sees all data that UI shows', async () => {
237
+ // Setup: create some data
238
+ await libraryService.addBook({ id: "1", title: "Test Book" });
239
+ await feedService.addItem({ id: "f1", content: "Test insight" });
240
+
241
+ // Get system prompt (which includes context)
242
+ const systemPrompt = await buildSystemPrompt();
243
+
244
+ // Verify data is included
245
+ expect(systemPrompt).toContain("Test Book");
246
+ expect(systemPrompt).toContain("Test insight");
247
+ });
248
+
249
+ test('Recent activity is visible to agent', async () => {
250
+ // Perform some actions
251
+ await activityService.log({ action: "highlighted", bookId: "1" });
252
+ await activityService.log({ action: "researched", bookId: "2" });
253
+
254
+ const systemPrompt = await buildSystemPrompt();
255
+
256
+ // Verify activity is included
257
+ expect(systemPrompt).toMatch(/highlighted|researched/);
258
+ });
259
+ });
260
+ ```
261
+ </parity_testing>
262
+
263
+ <integration_testing>
264
+ ## Integration Testing
265
+
266
+ Test the full flow from user request to outcome.
267
+
268
+ ### End-to-End Flow Tests
269
+
270
+ ```typescript
271
+ describe('End-to-End Flows', () => {
272
+ test('Research flow: request → web search → file creation', async () => {
273
+ // Setup
274
+ const bookId = "book_123";
275
+ await libraryService.addBook({ id: bookId, title: "Moby Dick" });
276
+
277
+ // User request
278
+ await agent.chat("Research the historical context of whaling in Moby Dick");
279
+
280
+ // Verify: web search was performed
281
+ const searchCalls = mockWebSearch.mock.calls;
282
+ expect(searchCalls.length).toBeGreaterThan(0);
283
+ expect(searchCalls.some(call =>
284
+ call[0].query.toLowerCase().includes("whaling")
285
+ )).toBe(true);
286
+
287
+ // Verify: files were created
288
+ const researchFiles = await fileService.listFiles(`Research/${bookId}/`);
289
+ expect(researchFiles.length).toBeGreaterThan(0);
290
+
291
+ // Verify: content is relevant
292
+ const content = await fileService.readFile(researchFiles[0]);
293
+ expect(content.toLowerCase()).toMatch(/whale|whaling|nantucket|melville/);
294
+ });
295
+
296
+ test('Publish flow: request → tool call → feed update → UI reflects', async () => {
297
+ // Setup
298
+ await libraryService.addBook({ id: "book_1", title: "1984" });
299
+
300
+ // Initial state
301
+ const feedBefore = await feedService.getItems();
302
+
303
+ // User request
304
+ await agent.chat("Write something about Big Brother for my reading feed");
305
+
306
+ // Verify feed updated
307
+ const feedAfter = await feedService.getItems();
308
+ expect(feedAfter.length).toBe(feedBefore.length + 1);
309
+
310
+ // Verify content
311
+ const newItem = feedAfter.find(item =>
312
+ !feedBefore.some(old => old.id === item.id)
313
+ );
314
+ expect(newItem).toBeDefined();
315
+ expect(newItem.content.toLowerCase()).toMatch(/big brother|surveillance|watching/);
316
+ });
317
+ });
318
+ ```
319
+
320
+ ### Failure Recovery Tests
321
+
322
+ ```typescript
323
+ describe('Failure Recovery', () => {
324
+ test('Agent handles missing book gracefully', async () => {
325
+ const result = await agent.chat("Tell me about 'Nonexistent Book'");
326
+
327
+ // Agent should not crash
328
+ expect(result.error).toBeUndefined();
329
+
330
+ // Agent should acknowledge the issue
331
+ expect(result.response.toLowerCase()).toMatch(
332
+ /not found|don't see|can't find|library/
333
+ );
334
+ });
335
+
336
+ test('Agent recovers from API failure', async () => {
337
+ // Mock API failure
338
+ mockWebSearch.mockRejectedValueOnce(new Error("Network error"));
339
+
340
+ const result = await agent.chat("Research this topic");
341
+
342
+ // Agent should handle gracefully
343
+ expect(result.error).toBeUndefined();
344
+ expect(result.response).not.toContain("unhandled exception");
345
+
346
+ // Agent should communicate the issue
347
+ expect(result.response.toLowerCase()).toMatch(
348
+ /couldn't search|unable to|try again/
349
+ );
350
+ });
351
+ });
352
+ ```
353
+ </integration_testing>
354
+
355
+ <snapshot_testing>
356
+ ## Snapshot Testing for System Prompts
357
+
358
+ Track changes to system prompts and context injection over time.
359
+
360
+ ```typescript
361
+ describe('System Prompt Stability', () => {
362
+ test('System prompt structure matches snapshot', async () => {
363
+ const systemPrompt = await buildSystemPrompt();
364
+
365
+ // Extract structure (removing dynamic data)
366
+ const structure = systemPrompt
367
+ .replace(/id: \w+/g, 'id: [ID]')
368
+ .replace(/"[^"]+"/g, '"[TITLE]"')
369
+ .replace(/\d{4}-\d{2}-\d{2}/g, '[DATE]');
370
+
371
+ expect(structure).toMatchSnapshot();
372
+ });
373
+
374
+ test('All capability sections are present', async () => {
375
+ const systemPrompt = await buildSystemPrompt();
376
+
377
+ const requiredSections = [
378
+ "Your Capabilities",
379
+ "Available Books",
380
+ "Recent Activity",
381
+ ];
382
+
383
+ for (const section of requiredSections) {
384
+ expect(systemPrompt).toContain(section);
385
+ }
386
+ });
387
+ });
388
+ ```
389
+ </snapshot_testing>
390
+
391
+ <manual_testing>
392
+ ## Manual Testing Checklist
393
+
394
+ Some things are best tested manually during development:
395
+
396
+ ### Natural Language Variation Test
397
+
398
+ Try multiple phrasings for the same request:
399
+
400
+ ```
401
+ "Add this to my feed"
402
+ "Write something in my reading feed"
403
+ "Publish an insight about this"
404
+ "Put this in the feed"
405
+ "I want this in my feed"
406
+ ```
407
+
408
+ All should work if context injection is correct.
409
+
410
+ ### Edge Case Prompts
411
+
412
+ ```
413
+ "What can you do?"
414
+ → Agent should describe capabilities
415
+
416
+ "Help me with my books"
417
+ → Agent should engage with library, not ask what "books" means
418
+
419
+ "Write something"
420
+ → Agent should ask WHERE (feed, file, etc.) if not clear
421
+
422
+ "Delete everything"
423
+ → Agent should confirm before destructive actions
424
+ ```
425
+
426
+ ### Confusion Test
427
+
428
+ Ask about things that should exist but might not be properly connected:
429
+
430
+ ```
431
+ "What's in my research folder?"
432
+ → Should list files, not ask "what research folder?"
433
+
434
+ "Show me my recent reading"
435
+ → Should show activity, not ask "what do you mean?"
436
+
437
+ "Continue where I left off"
438
+ → Should reference recent activity if available
439
+ ```
440
+ </manual_testing>
441
+
442
+ <ci_integration>
443
+ ## CI/CD Integration
444
+
445
+ Add agent-native tests to your CI pipeline:
446
+
447
+ ```yaml
448
+ # .github/workflows/test.yml
449
+ name: Agent-Native Tests
450
+
451
+ on: [push, pull_request]
452
+
453
+ jobs:
454
+ agent-tests:
455
+ runs-on: ubuntu-latest
456
+ steps:
457
+ - uses: actions/checkout@v3
458
+
459
+ - name: Setup
460
+ run: npm install
461
+
462
+ - name: Run Parity Tests
463
+ run: npm run test:parity
464
+
465
+ - name: Run Capability Tests
466
+ run: npm run test:capabilities
467
+ env:
468
+ GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
469
+
470
+ - name: Check System Prompt Completeness
471
+ run: npm run test:system-prompt
472
+
473
+ - name: Verify Capability Map
474
+ run: |
475
+ # Ensure capability map is up to date
476
+ npm run generate:capability-map
477
+ git diff --exit-code capability-map.ts
478
+ ```
479
+
480
+ ### Cost-Aware Testing
481
+
482
+ Agent tests cost API tokens. Strategies to manage that cost:
483
+
484
+ ```typescript
485
+ // Use smaller models for basic tests
486
+ const testConfig = {
487
+ model: process.env.CI ? "gemini-flash" : "gemini-ultra",
488
+ maxTokens: 500, // Limit output length
489
+ };
490
+
491
+ // Cache responses for deterministic tests
492
+ const cachedAgent = new CachedAgent({
493
+ cacheDir: ".test-cache",
494
+ ttl: 24 * 60 * 60 * 1000, // 24 hours
495
+ });
496
+
497
+ // Run expensive tests only on main branch
498
+ if (process.env.GITHUB_REF === 'refs/heads/main') {
499
+ describe('Full Integration Tests', () => { ... });
500
+ }
501
+ ```
502
+ </ci_integration>
503
+
504
+ <test_utilities>
505
+ ## Test Utilities
506
+
507
+ ### Agent Test Harness
508
+
509
+ ```typescript
510
+ class AgentTestHarness {
511
+ private agent: Agent;
512
+ private mockServices: MockServices;
513
+
514
+ async setup() {
515
+ this.mockServices = createMockServices();
516
+ this.agent = await createAgent({
517
+ services: this.mockServices,
518
+ model: "gemini-flash", // Cheaper for tests
519
+ });
520
+ }
521
+
522
+ async chat(message: string): Promise<AgentResponse> {
523
+ return this.agent.chat(message);
524
+ }
525
+
526
+ async expectToolCall(toolName: string) {
527
+ const lastResponse = this.agent.getLastResponse();
528
+ expect(lastResponse.toolCalls.map(t => t.name)).toContain(toolName);
529
+ }
530
+
531
+ async expectOutcome(check: () => Promise<boolean>) {
532
+ const result = await check();
533
+ expect(result).toBe(true);
534
+ }
535
+
536
+ getState() {
537
+ return {
538
+ library: this.mockServices.library.getBooks(),
539
+ feed: this.mockServices.feed.getItems(),
540
+ files: this.mockServices.files.listAll(),
541
+ };
542
+ }
543
+ }
544
+
545
+ // Usage
546
+ test('full flow', async () => {
547
+ const harness = new AgentTestHarness();
548
+ await harness.setup();
549
+
550
+ await harness.chat("Add 'Moby Dick' to my library");
551
+ await harness.expectToolCall("add_book");
552
+ await harness.expectOutcome(async () => {
553
+ const state = harness.getState();
554
+ return state.library.some(b => b.title.includes("Moby"));
555
+ });
556
+ });
557
+ ```
558
+ </test_utilities>
559
+
560
+ <checklist>
561
+ ## Testing Checklist
562
+
563
+ Automated Tests:
564
+ - [ ] "Can Agent Do It?" tests for each UI action
565
+ - [ ] Location awareness tests ("write to my feed")
566
+ - [ ] Parity tests (tool exists, documented in prompt)
567
+ - [ ] Context parity tests (agent sees what UI shows)
568
+ - [ ] End-to-end flow tests
569
+ - [ ] Failure recovery tests
570
+
571
+ Manual Tests:
572
+ - [ ] Natural language variation (multiple phrasings work)
573
+ - [ ] Edge case prompts (vague, ambiguous, or destructive requests)
574
+ - [ ] Confusion test (agent knows app vocabulary)
575
+ - [ ] Surprise test (agent can be creative)
576
+
577
+ CI Integration:
578
+ - [ ] Parity tests run on every PR
579
+ - [ ] Capability tests run with API key
580
+ - [ ] System prompt completeness check
581
+ - [ ] Capability map drift detection
582
+ </checklist>