ultimate-pi 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (516) hide show
  1. package/.agents/skills/ck-search/SKILL.md +99 -0
  2. package/.agents/skills/defuddle/SKILL.md +90 -0
  3. package/.agents/skills/find-skills/SKILL.md +142 -0
  4. package/.agents/skills/firecrawl/SKILL.md +150 -0
  5. package/.agents/skills/firecrawl/rules/install.md +82 -0
  6. package/.agents/skills/firecrawl/rules/security.md +26 -0
  7. package/.agents/skills/firecrawl-agent/SKILL.md +57 -0
  8. package/.agents/skills/firecrawl-build-interact/SKILL.md +67 -0
  9. package/.agents/skills/firecrawl-build-onboarding/SKILL.md +102 -0
  10. package/.agents/skills/firecrawl-build-onboarding/references/auth-flow.md +39 -0
  11. package/.agents/skills/firecrawl-build-onboarding/references/project-setup.md +20 -0
  12. package/.agents/skills/firecrawl-build-onboarding/references/sdk-installation.md +17 -0
  13. package/.agents/skills/firecrawl-build-scrape/SKILL.md +68 -0
  14. package/.agents/skills/firecrawl-build-search/SKILL.md +68 -0
  15. package/.agents/skills/firecrawl-crawl/SKILL.md +58 -0
  16. package/.agents/skills/firecrawl-download/SKILL.md +69 -0
  17. package/.agents/skills/firecrawl-interact/SKILL.md +83 -0
  18. package/.agents/skills/firecrawl-map/SKILL.md +50 -0
  19. package/.agents/skills/firecrawl-parse/SKILL.md +61 -0
  20. package/.agents/skills/firecrawl-scrape/SKILL.md +68 -0
  21. package/.agents/skills/firecrawl-search/SKILL.md +59 -0
  22. package/.agents/skills/obsidian-bases/SKILL.md +299 -0
  23. package/.agents/skills/obsidian-markdown/SKILL.md +237 -0
  24. package/.agents/skills/posthog-analyst/SKILL.md +306 -0
  25. package/.agents/skills/posthog-analyst/evals/evals.json +23 -0
  26. package/.agents/skills/wiki/SKILL.md +215 -0
  27. package/.agents/skills/wiki/references/css-snippets.md +122 -0
  28. package/.agents/skills/wiki/references/frontmatter.md +107 -0
  29. package/.agents/skills/wiki/references/git-setup.md +58 -0
  30. package/.agents/skills/wiki/references/mcp-setup.md +149 -0
  31. package/.agents/skills/wiki/references/modes.md +259 -0
  32. package/.agents/skills/wiki/references/plugins.md +96 -0
  33. package/.agents/skills/wiki/references/rest-api.md +124 -0
  34. package/.agents/skills/wiki-autoresearch/SKILL.md +211 -0
  35. package/.agents/skills/wiki-autoresearch/references/program.md +75 -0
  36. package/.agents/skills/wiki-fold/SKILL.md +204 -0
  37. package/.agents/skills/wiki-fold/references/fold-template.md +133 -0
  38. package/.agents/skills/wiki-ingest/SKILL.md +288 -0
  39. package/.agents/skills/wiki-lint/SKILL.md +183 -0
  40. package/.agents/skills/wiki-query/SKILL.md +176 -0
  41. package/.agents/skills/wiki-save/SKILL.md +128 -0
  42. package/.ckignore +41 -0
  43. package/.env.example +9 -0
  44. package/.github/workflows/lint.yml +33 -0
  45. package/.github/workflows/publish-github-packages.yml +35 -0
  46. package/.github/workflows/publish-npm.yml +1 -1
  47. package/.pi/SYSTEM.md +107 -40
  48. package/.pi/agents/pi-pi/agent-expert.md +205 -0
  49. package/.pi/agents/pi-pi/cli-expert.md +47 -0
  50. package/.pi/agents/pi-pi/config-expert.md +67 -0
  51. package/.pi/agents/pi-pi/ext-expert.md +53 -0
  52. package/.pi/agents/pi-pi/keybinding-expert.md +123 -0
  53. package/.pi/agents/pi-pi/pi-orchestrator.md +103 -0
  54. package/.pi/agents/pi-pi/prompt-expert.md +83 -0
  55. package/.pi/agents/pi-pi/skill-expert.md +52 -0
  56. package/.pi/agents/pi-pi/theme-expert.md +46 -0
  57. package/.pi/agents/pi-pi/tui-expert.md +100 -0
  58. package/.pi/agents/rethink.md +140 -0
  59. package/.pi/agents/wiki-ingest.md +67 -0
  60. package/.pi/agents/wiki-lint.md +75 -0
  61. package/.pi/auto-commit.json +20 -0
  62. package/.pi/extensions/banner.png +0 -0
  63. package/.pi/extensions/ck-enforce.ts +216 -0
  64. package/.pi/extensions/custom-footer.ts +308 -0
  65. package/.pi/extensions/custom-header.ts +116 -0
  66. package/.pi/extensions/dotenv-loader.ts +170 -0
  67. package/.pi/internal/cursor-sdk-transcript-parser.ts +59 -0
  68. package/.pi/model-router.json +95 -0
  69. package/.pi/npm/.gitignore +2 -0
  70. package/.pi/prompts/git-sync.md +124 -0
  71. package/.pi/prompts/harness-setup.md +509 -0
  72. package/.pi/prompts/save.md +16 -0
  73. package/.pi/prompts/wiki-autoresearch.md +19 -0
  74. package/.pi/prompts/wiki.md +23 -0
  75. package/.pi/providers/cursor-sdk-provider.test.mjs +476 -0
  76. package/.pi/providers/cursor-sdk-provider.ts +1085 -0
  77. package/.pi/settings.json +14 -4
  78. package/.pi/skills/agent-router/SKILL.md +174 -0
  79. package/.pi/sounds/alert/1-kaching-track.mp3 +0 -0
  80. package/.pi/sounds/error/1-ksi-wth-track.mp3 +0 -0
  81. package/.pi/sounds/error/2-smash-track.mp3 +0 -0
  82. package/.pi/sounds/error/3-buzzer-track.mp3 +0 -0
  83. package/.pi/sounds/notification/1-soft-notification-track.mp3 +0 -0
  84. package/.pi/sounds/project-sounds.json +25 -0
  85. package/.pi/sounds/reminder/1-soft-notification-track.mp3 +0 -0
  86. package/.pi/sounds/success/1-tada-track.mp3 +0 -0
  87. package/.pi/sounds/success/2-jobs-done-track.mp3 +0 -0
  88. package/.pi/sounds/success/3-yay-track.mp3 +0 -0
  89. package/CONTRIBUTING.md +116 -0
  90. package/README.md +32 -39
  91. package/biome.json +34 -0
  92. package/firecrawl/.env.template +58 -0
  93. package/firecrawl/README.md +49 -0
  94. package/firecrawl/docker-compose.yaml +201 -0
  95. package/firecrawl/searxng/searxng.env +3 -0
  96. package/firecrawl/searxng/settings.yml +85 -0
  97. package/lefthook.yml +8 -0
  98. package/package.json +55 -24
  99. package/vault/AGENTS.md +37 -0
  100. package/vault/wiki/_templates/comparison.md +39 -0
  101. package/vault/wiki/_templates/concept.md +40 -0
  102. package/vault/wiki/_templates/decision.md +21 -0
  103. package/vault/wiki/_templates/entity.md +32 -0
  104. package/vault/wiki/_templates/flow.md +14 -0
  105. package/vault/wiki/_templates/module.md +18 -0
  106. package/vault/wiki/_templates/question.md +31 -0
  107. package/vault/wiki/_templates/source.md +39 -0
  108. package/vault/wiki/concepts/AST-Aware Code Chunking.md +44 -0
  109. package/vault/wiki/concepts/Build-Time Prompt Compilation.md +107 -0
  110. package/vault/wiki/concepts/Context Engine (AI Coding).md +47 -0
  111. package/vault/wiki/concepts/Context-Aware System Reminders.md +61 -0
  112. package/vault/wiki/concepts/Contextualized Text Embedding.md +42 -0
  113. package/vault/wiki/concepts/Contractor vs Employee AI Model.md +55 -0
  114. package/vault/wiki/concepts/Dual-Model Agent Architecture.md +65 -0
  115. package/vault/wiki/concepts/Late Chunking vs Early Chunking.md +43 -0
  116. package/vault/wiki/concepts/Majority Vote Ensembling.md +68 -0
  117. package/vault/wiki/concepts/Meta-Harness.md +16 -0
  118. package/vault/wiki/concepts/Multi-Agent AI Coding Architecture.md +75 -0
  119. package/vault/wiki/concepts/Prompt Enhancement.md +90 -0
  120. package/vault/wiki/concepts/Prompt Renderer.md +89 -0
  121. package/vault/wiki/concepts/Semantic Codebase Indexing.md +67 -0
  122. package/vault/wiki/concepts/additive-config-hierarchy.md +16 -0
  123. package/vault/wiki/concepts/agent-artifacts-verifiable-deliverables.md +71 -0
  124. package/vault/wiki/concepts/agent-browser-browser-automation.md +99 -0
  125. package/vault/wiki/concepts/agent-codebase-interface.md +43 -0
  126. package/vault/wiki/concepts/agent-harness-architecture.md +67 -0
  127. package/vault/wiki/concepts/agent-loop-detection-patterns.md +133 -0
  128. package/vault/wiki/concepts/agent-search-enforcement.md +126 -0
  129. package/vault/wiki/concepts/agent-skills-ecosystem.md +74 -0
  130. package/vault/wiki/concepts/agent-skills-pattern.md +68 -0
  131. package/vault/wiki/concepts/agentic-harness-context-enforcement.md +91 -0
  132. package/vault/wiki/concepts/agentic-harness.md +34 -0
  133. package/vault/wiki/concepts/agentic-orchestration-pipeline.md +56 -0
  134. package/vault/wiki/concepts/agentic-search-no-embeddings.md +18 -0
  135. package/vault/wiki/concepts/anthropic-context-engineering.md +13 -0
  136. package/vault/wiki/concepts/antigravity-agent-first-architecture.md +61 -0
  137. package/vault/wiki/concepts/ast-compression.md +19 -0
  138. package/vault/wiki/concepts/ast-truncation.md +66 -0
  139. package/vault/wiki/concepts/barrel-files.md +37 -0
  140. package/vault/wiki/concepts/browser-harness-agent.md +41 -0
  141. package/vault/wiki/concepts/browser-subagent-visual-verification.md +82 -0
  142. package/vault/wiki/concepts/codebase-intelligence-ecosystem-comparison.md +192 -0
  143. package/vault/wiki/concepts/codebase-intelligence-harness-integration.md +161 -0
  144. package/vault/wiki/concepts/codebase-to-context-ingestion.md +46 -0
  145. package/vault/wiki/concepts/codex-harness-innovations.md +147 -0
  146. package/vault/wiki/concepts/consensus-debate-flow.md +17 -0
  147. package/vault/wiki/concepts/consensus-debate.md +206 -0
  148. package/vault/wiki/concepts/content-addressed-spec-identity.md +166 -0
  149. package/vault/wiki/concepts/context-anxiety.md +57 -0
  150. package/vault/wiki/concepts/context-compression-techniques.md +19 -0
  151. package/vault/wiki/concepts/context-continuity.md +22 -0
  152. package/vault/wiki/concepts/context-drift-in-agents.md +106 -0
  153. package/vault/wiki/concepts/context-engineering.md +62 -0
  154. package/vault/wiki/concepts/context-folding.md +67 -0
  155. package/vault/wiki/concepts/context-mode.md +38 -0
  156. package/vault/wiki/concepts/cursor-harness-innovations.md +107 -0
  157. package/vault/wiki/concepts/deterministic-session-compaction.md +79 -0
  158. package/vault/wiki/concepts/drift-detection-unified.md +296 -0
  159. package/vault/wiki/concepts/execution-feedback-loop.md +46 -0
  160. package/vault/wiki/concepts/feedforward-feedback-harness.md +60 -0
  161. package/vault/wiki/concepts/five-root-cause-metrics-sentrux.md +40 -0
  162. package/vault/wiki/concepts/fork-safe-spec-storage.md +89 -0
  163. package/vault/wiki/concepts/fts5-sandbox.md +19 -0
  164. package/vault/wiki/concepts/fuzzy-edit-matching.md +71 -0
  165. package/vault/wiki/concepts/gemini-cli-architecture.md +104 -0
  166. package/vault/wiki/concepts/generator-evaluator-architecture.md +64 -0
  167. package/vault/wiki/concepts/guardian-agent-pattern.md +67 -0
  168. package/vault/wiki/concepts/harness-configuration-layers.md +89 -0
  169. package/vault/wiki/concepts/harness-control-frameworks.md +155 -0
  170. package/vault/wiki/concepts/harness-engineering-first-principles.md +90 -0
  171. package/vault/wiki/concepts/harness-h-formalism.md +53 -0
  172. package/vault/wiki/concepts/hybrid-code-search.md +61 -0
  173. package/vault/wiki/concepts/inline-post-edit-validation.md +112 -0
  174. package/vault/wiki/concepts/legendary-engineering-patterns-harness.md +110 -0
  175. package/vault/wiki/concepts/lifecycle-hooks.md +94 -0
  176. package/vault/wiki/concepts/mcp-tool-routing.md +102 -0
  177. package/vault/wiki/concepts/memory-system-of-record-vs-ephemeral-cache.md +47 -0
  178. package/vault/wiki/concepts/meta-agent-context-pruning.md +151 -0
  179. package/vault/wiki/concepts/model-adaptive-harness.md +122 -0
  180. package/vault/wiki/concepts/model-routing-agents.md +101 -0
  181. package/vault/wiki/concepts/monorepo-architecture.md +45 -0
  182. package/vault/wiki/concepts/multi-agent-specialization.md +61 -0
  183. package/vault/wiki/concepts/permission-subsystem.md +16 -0
  184. package/vault/wiki/concepts/pi-messenger-analysis.md +243 -0
  185. package/vault/wiki/concepts/pi-vscode-extension-landscape.md +37 -0
  186. package/vault/wiki/concepts/policy-engine-pattern.md +78 -0
  187. package/vault/wiki/concepts/progressive-disclosure-agents.md +53 -0
  188. package/vault/wiki/concepts/progressive-skill-disclosure.md +17 -0
  189. package/vault/wiki/concepts/provider-native-prompting.md +203 -0
  190. package/vault/wiki/concepts/quality-signal-sentrux.md +37 -0
  191. package/vault/wiki/concepts/repo-map-ranking.md +42 -0
  192. package/vault/wiki/concepts/result-monad-error-handling.md +47 -0
  193. package/vault/wiki/concepts/safety-defense-in-depth.md +83 -0
  194. package/vault/wiki/concepts/sandbox-os-enforcement.md +18 -0
  195. package/vault/wiki/concepts/selective-debate-routing.md +70 -0
  196. package/vault/wiki/concepts/self-evolving-harness.md +60 -0
  197. package/vault/wiki/concepts/sentrux-mcp-integration.md +36 -0
  198. package/vault/wiki/concepts/sentrux-rules-engine.md +49 -0
  199. package/vault/wiki/concepts/shell-pattern-compression.md +24 -0
  200. package/vault/wiki/concepts/skill-first-architecture.md +166 -0
  201. package/vault/wiki/concepts/structured-compaction.md +78 -0
  202. package/vault/wiki/concepts/subagent-orchestration.md +17 -0
  203. package/vault/wiki/concepts/subagent-worktree-isolation.md +68 -0
  204. package/vault/wiki/concepts/superpowers-methodology.md +78 -0
  205. package/vault/wiki/concepts/think-in-code.md +73 -0
  206. package/vault/wiki/concepts/ts-execution-layer.md +100 -0
  207. package/vault/wiki/concepts/typescript-strict-mode.md +37 -0
  208. package/vault/wiki/concepts/vcc-conversation-compaction-for-pi.md +51 -0
  209. package/vault/wiki/concepts/verification-drift-detection.md +19 -0
  210. package/vault/wiki/consensus/consensus-records.md +58 -0
  211. package/vault/wiki/decisions/2026-04-30-pi-lean-ctx-native.md +122 -0
  212. package/vault/wiki/decisions/adr-008.md +40 -0
  213. package/vault/wiki/decisions/adr-009.md +46 -0
  214. package/vault/wiki/decisions/adr-010.md +55 -0
  215. package/vault/wiki/decisions/adr-011.md +165 -0
  216. package/vault/wiki/decisions/adr-012.md +102 -0
  217. package/vault/wiki/decisions/adr-013.md +59 -0
  218. package/vault/wiki/decisions/adr-014.md +73 -0
  219. package/vault/wiki/decisions/adr-015.md +81 -0
  220. package/vault/wiki/decisions/adr-016.md +91 -0
  221. package/vault/wiki/decisions/adr-017.md +79 -0
  222. package/vault/wiki/decisions/adr-018.md +100 -0
  223. package/vault/wiki/decisions/adr-019.md +75 -0
  224. package/vault/wiki/decisions/adr-020.md +106 -0
  225. package/vault/wiki/decisions/adr-021.md +86 -0
  226. package/vault/wiki/decisions/adr-022.md +113 -0
  227. package/vault/wiki/decisions/adr-023.md +113 -0
  228. package/vault/wiki/decisions/adr-024.md +73 -0
  229. package/vault/wiki/decisions/adr-025.md +130 -0
  230. package/vault/wiki/decisions/adr-026.md +56 -0
  231. package/vault/wiki/decisions/colocate-wiki.md +34 -0
  232. package/vault/wiki/entities/Anders Hejlsberg.md +29 -0
  233. package/vault/wiki/entities/Anthropic.md +17 -0
  234. package/vault/wiki/entities/Augment Code.md +49 -0
  235. package/vault/wiki/entities/Bjarne Stroustrup.md +26 -0
  236. package/vault/wiki/entities/Bolt.new (StackBlitz).md +39 -0
  237. package/vault/wiki/entities/Boris Cherny.md +11 -0
  238. package/vault/wiki/entities/Claude Code.md +19 -0
  239. package/vault/wiki/entities/Dennis Ritchie.md +26 -0
  240. package/vault/wiki/entities/Emergent Labs.md +32 -0
  241. package/vault/wiki/entities/Google Cloud.md +16 -0
  242. package/vault/wiki/entities/Guido van Rossum.md +28 -0
  243. package/vault/wiki/entities/Ken Thompson.md +28 -0
  244. package/vault/wiki/entities/Lee et al.md +16 -0
  245. package/vault/wiki/entities/Linus Torvalds.md +28 -0
  246. package/vault/wiki/entities/Lovable (company).md +40 -0
  247. package/vault/wiki/entities/Martin Fowler.md +16 -0
  248. package/vault/wiki/entities/Meng et al.md +16 -0
  249. package/vault/wiki/entities/OpenAI.md +16 -0
  250. package/vault/wiki/entities/Rocket.new.md +38 -0
  251. package/vault/wiki/entities/VILA-Lab.md +15 -0
  252. package/vault/wiki/entities/autodev-codebase.md +18 -0
  253. package/vault/wiki/entities/ck-tool.md +59 -0
  254. package/vault/wiki/entities/codesearch.md +18 -0
  255. package/vault/wiki/entities/disler-indydevdan.md +33 -0
  256. package/vault/wiki/entities/gsd-get-shit-done.md +56 -0
  257. package/vault/wiki/entities/javascript-runtimes.md +48 -0
  258. package/vault/wiki/entities/jesse-vincent.md +38 -0
  259. package/vault/wiki/entities/lean-ctx.md +32 -0
  260. package/vault/wiki/entities/opendev.md +41 -0
  261. package/vault/wiki/entities/ops-codegraph-tool.md +18 -0
  262. package/vault/wiki/entities/pi-coding-agent.md +53 -0
  263. package/vault/wiki/entities/sentrux.md +54 -0
  264. package/vault/wiki/entities/vgrep-tool.md +57 -0
  265. package/vault/wiki/entities/vitest.md +41 -0
  266. package/vault/wiki/flows/harness-wiki-pipeline.md +204 -0
  267. package/vault/wiki/hot.md +932 -0
  268. package/vault/wiki/index.md +437 -0
  269. package/vault/wiki/log.md +418 -0
  270. package/vault/wiki/meta/dashboard.md +30 -0
  271. package/vault/wiki/meta/lint-report-2026-04-30.md +86 -0
  272. package/vault/wiki/meta/lint-report-2026-05-02.md +251 -0
  273. package/vault/wiki/meta/overview.canvas +43 -0
  274. package/vault/wiki/modules/adversarial-verification.md +57 -0
  275. package/vault/wiki/modules/automated-observability.md +54 -0
  276. package/vault/wiki/modules/bench.md +20 -0
  277. package/vault/wiki/modules/extensions.md +23 -0
  278. package/vault/wiki/modules/grounding-checkpoints.md +62 -0
  279. package/vault/wiki/modules/harness-implementation-plan.md +345 -0
  280. package/vault/wiki/modules/harness-wiki-skill-mapping.md +135 -0
  281. package/vault/wiki/modules/harness.md +86 -0
  282. package/vault/wiki/modules/persistent-memory.md +85 -0
  283. package/vault/wiki/modules/schema-orchestration.md +68 -0
  284. package/vault/wiki/modules/skills.md +27 -0
  285. package/vault/wiki/modules/spec-hardening.md +58 -0
  286. package/vault/wiki/modules/structured-planning.md +53 -0
  287. package/vault/wiki/modules/think-in-code-enforcement.md +153 -0
  288. package/vault/wiki/modules/wiki-query-interface.md +64 -0
  289. package/vault/wiki/overview.md +51 -0
  290. package/vault/wiki/questions/Research-pi-vs-claude-code-agentic-orchestration-pipeline.md +87 -0
  291. package/vault/wiki/questions/Research-sentrux-dev.md +123 -0
  292. package/vault/wiki/questions/Research-superpowers-skill-for-agentic-coding-agents.md +164 -0
  293. package/vault/wiki/questions/Research: Augment Code Context Engine.md +244 -0
  294. package/vault/wiki/questions/Research: Automating Software Engineering - Lovable, Bolt, Emergent, Rocket.md +112 -0
  295. package/vault/wiki/questions/Research: Claude Code State-of-the-Art Harness Improvements.md +209 -0
  296. package/vault/wiki/questions/Research: Codex State-of-the-Art Harness Improvements.md +99 -0
  297. package/vault/wiki/questions/Research: Engineering Workflows of Legendary Programmers and AI Harness Mapping.md +107 -0
  298. package/vault/wiki/questions/Research: Fallow Codebase Intelligence Harness Integration.md +72 -0
  299. package/vault/wiki/questions/Research: Gemini CLI SOTA Harness Integration.md +166 -0
  300. package/vault/wiki/questions/Research: GitHub Issues as Harness Spec Storage.md +188 -0
  301. package/vault/wiki/questions/Research: Google Antigravity Harness Integration.md +120 -0
  302. package/vault/wiki/questions/Research: Meta-Agent Context Drift Detection.md +236 -0
  303. package/vault/wiki/questions/Research: Model-Adaptive Agent Harness Design.md +95 -0
  304. package/vault/wiki/questions/Research: Model-Specific Prompting Guides.md +165 -0
  305. package/vault/wiki/questions/Research: Prompt Renderer for Multi-Model Agent Harness.md +216 -0
  306. package/vault/wiki/questions/Research: Skill-First Harness Architecture.md +91 -0
  307. package/vault/wiki/questions/Research: TypeScript Best Practices and Codebase Structure.md +88 -0
  308. package/vault/wiki/questions/Research: TypeScript Execution Layer for Agent Tool Calling.md +81 -0
  309. package/vault/wiki/questions/Research: claude-mem over Obsidian for Harness Layer.md +71 -0
  310. package/vault/wiki/questions/Research: claude-mem over obsidian wiki as the knowledge base for our agentic harness pipeline. think from first principles. does this replace or complement our current setup? no hard feelings about previous decisions. gimme accurate points.md +80 -0
  311. package/vault/wiki/questions/Research: context-mode vs lean-ctx.md +72 -0
  312. package/vault/wiki/questions/Research: cursor.sh Harness Innovations.md +92 -0
  313. package/vault/wiki/questions/Research: executor.sh Harness Integration.md +170 -0
  314. package/vault/wiki/questions/Research: how GSD fits into our coding harness setup.md +97 -0
  315. package/vault/wiki/questions/Research: how claude-mem fits into our workflow. and whether it should replace obsidian in the codebase. no hard feelings about previous actions, rethink from first principles always.md +80 -0
  316. package/vault/wiki/questions/Research: pi-vcc.md +113 -0
  317. package/vault/wiki/questions/Research: semantic code search tools.md +69 -0
  318. package/vault/wiki/questions/Research: vcc extension for pi coding agent.md +73 -0
  319. package/vault/wiki/questions/how-to-enable-semantic-code-search-now.md +111 -0
  320. package/vault/wiki/questions/mvp-implementation-blueprint.md +552 -0
  321. package/vault/wiki/questions/research-agent-first-codebase-exploration.md +199 -0
  322. package/vault/wiki/questions/research-agentic-coding-harness-latest-papers.md +142 -0
  323. package/vault/wiki/questions/research-gitingest-gitreverse-integration.md +100 -0
  324. package/vault/wiki/questions/research-wozcode-token-reduction.md +67 -0
  325. package/vault/wiki/questions/resolved-context-pruning-inplace-vs-restart.md +95 -0
  326. package/vault/wiki/questions/resolved-context-window-economics.md +167 -0
  327. package/vault/wiki/questions/resolved-imad-debate-gating-transfer.md +126 -0
  328. package/vault/wiki/questions/resolved-mcp-tool-preference.md +112 -0
  329. package/vault/wiki/questions/resolved-small-model-meta-agents.md +107 -0
  330. package/vault/wiki/questions/resolved-treesitter-dynamic-languages.md +95 -0
  331. package/vault/wiki/sources/Auggie Context MCP Server.md +63 -0
  332. package/vault/wiki/sources/Augment Code Codacy AI Giants.md +61 -0
  333. package/vault/wiki/sources/Augment Code MCP SiliconAngle.md +49 -0
  334. package/vault/wiki/sources/Augment Code WorkOS ERC 2025.md +55 -0
  335. package/vault/wiki/sources/Augment Context Engine Official.md +71 -0
  336. package/vault/wiki/sources/Augment SWE-bench Agent GitHub.md +74 -0
  337. package/vault/wiki/sources/Augment SWE-bench Pro Blog.md +58 -0
  338. package/vault/wiki/sources/Source: AgentBus Jinja2 Prompt Pipelines.md +75 -0
  339. package/vault/wiki/sources/Source: Arxiv — Don't Break the Cache.md +85 -0
  340. package/vault/wiki/sources/Source: Augment - Harness Engineering for AI Coding Agents.md +58 -0
  341. package/vault/wiki/sources/Source: Blake Crosley Agent Architecture Guide.md +100 -0
  342. package/vault/wiki/sources/Source: Bolt.new Architecture & Case Study.md +75 -0
  343. package/vault/wiki/sources/Source: Build-Time Prompt Compilation Architecture.md +107 -0
  344. package/vault/wiki/sources/Source: Claude API Agent Skills Overview.md +70 -0
  345. package/vault/wiki/sources/Source: Gemini CLI Changelogs.md +88 -0
  346. package/vault/wiki/sources/Source: Google Blog - Gemini CLI Announcement.md +57 -0
  347. package/vault/wiki/sources/Source: Google Gemini CLI Architecture Docs.md +53 -0
  348. package/vault/wiki/sources/Source: LangChain - Anatomy of Agent Harness.md +65 -0
  349. package/vault/wiki/sources/Source: Lovable Architecture & Clone Analysis.md +83 -0
  350. package/vault/wiki/sources/Source: Martin Fowler - Harness Engineering.md +70 -0
  351. package/vault/wiki/sources/Source: OpenAI Harness Engineering Five Principles.md +58 -0
  352. package/vault/wiki/sources/Source: OpenAI Harness Engineering — 0 Lines of Human Code.md +101 -0
  353. package/vault/wiki/sources/Source: OpenDev — Building AI Coding Agents for the Terminal.md +100 -0
  354. package/vault/wiki/sources/Source: Render AI Coding Agents Benchmark 2025.md +53 -0
  355. package/vault/wiki/sources/Source: Rocket.new — Vibe Solutioning Platform.md +70 -0
  356. package/vault/wiki/sources/Source: SwirlAI Agent Skills Progressive Disclosure.md +71 -0
  357. package/vault/wiki/sources/Source: TianPan Prompt Caching Architecture.md +89 -0
  358. package/vault/wiki/sources/Source: Vercel Labs agent-browser.md +155 -0
  359. package/vault/wiki/sources/Source: browser-harness CDP Harness.md +126 -0
  360. package/vault/wiki/sources/agent-drift-academic-paper.md +79 -0
  361. package/vault/wiki/sources/aider-repomap-tree-sitter.md +42 -0
  362. package/vault/wiki/sources/anthropic-compaction-api.md +58 -0
  363. package/vault/wiki/sources/anthropic-effective-harnesses.md +42 -0
  364. package/vault/wiki/sources/anthropic-prompt-best-practices.md +100 -0
  365. package/vault/wiki/sources/anthropic2026-harness-design.md +63 -0
  366. package/vault/wiki/sources/barrel-files-tkdodo.md +38 -0
  367. package/vault/wiki/sources/birth-of-unix-kernighan-interview.md +57 -0
  368. package/vault/wiki/sources/bockeler2026-harness-engineering.md +69 -0
  369. package/vault/wiki/sources/cast-code-chunking-paper.md +50 -0
  370. package/vault/wiki/sources/ck-semantic-search.md +78 -0
  371. package/vault/wiki/sources/claude-code-architecture-karaxai-2026.md +71 -0
  372. package/vault/wiki/sources/claude-code-architecture-qubytes-2026.md +50 -0
  373. package/vault/wiki/sources/claude-code-architecture-vila-lab-2026.md +64 -0
  374. package/vault/wiki/sources/claude-code-security-architecture-penligent-2026.md +70 -0
  375. package/vault/wiki/sources/claude-context-editing-docs.md +13 -0
  376. package/vault/wiki/sources/cloudflare-codemode.md +63 -0
  377. package/vault/wiki/sources/code-chunk-library-supermemory.md +63 -0
  378. package/vault/wiki/sources/codeact-apple-2024.md +62 -0
  379. package/vault/wiki/sources/codex-dsc-rfc-8573.md +41 -0
  380. package/vault/wiki/sources/codex-open-source-agent-2026.md +110 -0
  381. package/vault/wiki/sources/coir-code-retrieval-benchmark.md +51 -0
  382. package/vault/wiki/sources/colinmcnamara-context-optimization-codemode.md +48 -0
  383. package/vault/wiki/sources/context-folding-paper.md +61 -0
  384. package/vault/wiki/sources/context-mode-website.md +63 -0
  385. package/vault/wiki/sources/cursor-agent-best-practices-2026.md +62 -0
  386. package/vault/wiki/sources/cursor-fork-29b-2025.md +50 -0
  387. package/vault/wiki/sources/cursor-harness-april-2026.md +76 -0
  388. package/vault/wiki/sources/cursor-instant-apply-2024.md +45 -0
  389. package/vault/wiki/sources/cursor-shadow-workspace-2024.md +52 -0
  390. package/vault/wiki/sources/cursor-shipped-coding-agent-2026.md +53 -0
  391. package/vault/wiki/sources/cursor-vs-antigravity-2026.md +51 -0
  392. package/vault/wiki/sources/disler-pi-vs-claude-code.md +69 -0
  393. package/vault/wiki/sources/distill-deterministic-context-compression.md +53 -0
  394. package/vault/wiki/sources/embedding-models-benchmark-supermemory-2025.md +48 -0
  395. package/vault/wiki/sources/executor-rhyssullivan.md +122 -0
  396. package/vault/wiki/sources/fallow-rs-codebase-intelligence.md +125 -0
  397. package/vault/wiki/sources/fan2025-imad.md +60 -0
  398. package/vault/wiki/sources/forgecode-gpt5-agent-improvements.md +63 -0
  399. package/vault/wiki/sources/gemini-3-prompting-guide.md +78 -0
  400. package/vault/wiki/sources/gh-cli-sub-issue-rfc.md +50 -0
  401. package/vault/wiki/sources/gh-sub-issue-extension.md +72 -0
  402. package/vault/wiki/sources/github-fork-issues-discussion.md +44 -0
  403. package/vault/wiki/sources/github-issue-dependencies-docs.md +49 -0
  404. package/vault/wiki/sources/github-sub-issues-docs.md +51 -0
  405. package/vault/wiki/sources/gitingest.md +91 -0
  406. package/vault/wiki/sources/gitreverse.md +63 -0
  407. package/vault/wiki/sources/google-antigravity-official-blog.md +47 -0
  408. package/vault/wiki/sources/google-antigravity-wikipedia.md +53 -0
  409. package/vault/wiki/sources/gsd-codecentric-deep-dive.md +57 -0
  410. package/vault/wiki/sources/gsd-github-repo.md +51 -0
  411. package/vault/wiki/sources/gsd-hn-discussion.md +59 -0
  412. package/vault/wiki/sources/guido-python-design-philosophy.md +56 -0
  413. package/vault/wiki/sources/hejlsberg-7-learnings.md +48 -0
  414. package/vault/wiki/sources/ironclaw-drift-monitor.md +80 -0
  415. package/vault/wiki/sources/langsight-loop-detection.md +80 -0
  416. package/vault/wiki/sources/leanctx-website.md +69 -0
  417. package/vault/wiki/sources/lee2026-meta-harness.md +59 -0
  418. package/vault/wiki/sources/linux-kernel-coding-workflow.md +50 -0
  419. package/vault/wiki/sources/lou2026-autoharness.md +53 -0
  420. package/vault/wiki/sources/martin-fowler-harness-engineering.md +73 -0
  421. package/vault/wiki/sources/mcp-architecture-docs.md +13 -0
  422. package/vault/wiki/sources/meng2026-agent-harness-survey.md +79 -0
  423. package/vault/wiki/sources/mindstudio-four-agent-types.md +68 -0
  424. package/vault/wiki/sources/ms-chat-history-management.md +13 -0
  425. package/vault/wiki/sources/openai-prompt-guidance.md +104 -0
  426. package/vault/wiki/sources/openclaw-session-pruning.md +13 -0
  427. package/vault/wiki/sources/opencode-dcp.md +13 -0
  428. package/vault/wiki/sources/opendev-arxiv-2603.05344v1.md +79 -0
  429. package/vault/wiki/sources/openhands-platform.md +39 -0
  430. package/vault/wiki/sources/oss-guide-codebase-exploration.md +53 -0
  431. package/vault/wiki/sources/pi-compaction-extensions-ecosystem.md +102 -0
  432. package/vault/wiki/sources/pi-context-prune-github-repo.md +38 -0
  433. package/vault/wiki/sources/pi-mono-compaction-docs.md +38 -0
  434. package/vault/wiki/sources/pi-omni-compact-github-repo.md +50 -0
  435. package/vault/wiki/sources/pi-rtk-optimizer-github-repo.md +45 -0
  436. package/vault/wiki/sources/pi-vcc-github-repo.md +69 -0
  437. package/vault/wiki/sources/pi-vscode-marketplace.md +41 -0
  438. package/vault/wiki/sources/pi-vscode-model-provider-marketplace.md +39 -0
  439. package/vault/wiki/sources/py-tree-sitter.md +13 -0
  440. package/vault/wiki/sources/sentrux-dev-landing.md +40 -0
  441. package/vault/wiki/sources/sentrux-docs-pro-architecture.md +75 -0
  442. package/vault/wiki/sources/sentrux-docs-quality-signal.md +46 -0
  443. package/vault/wiki/sources/sentrux-docs-root-cause-metrics.md +57 -0
  444. package/vault/wiki/sources/sentrux-docs-rules-engine.md +58 -0
  445. package/vault/wiki/sources/sentrux-github-repo.md +56 -0
  446. package/vault/wiki/sources/superpowers-github-repo.md +56 -0
  447. package/vault/wiki/sources/superpowers-release-blog.md +54 -0
  448. package/vault/wiki/sources/superpowers-termdock-analysis.md +45 -0
  449. package/vault/wiki/sources/swe-agent-aci.md +42 -0
  450. package/vault/wiki/sources/swe-bench.md +45 -0
  451. package/vault/wiki/sources/swe-pruner-context-pruning.md +13 -0
  452. package/vault/wiki/sources/think-in-code-blog.md +48 -0
  453. package/vault/wiki/sources/tree-sitter-docs.md +13 -0
  454. package/vault/wiki/sources/ts-best-practices-2025-devto.md +42 -0
  455. package/vault/wiki/sources/ts-folder-structure-mingyang.md +58 -0
  456. package/vault/wiki/sources/ts-monorepo-koerselman.md +44 -0
  457. package/vault/wiki/sources/ts-result-error-handling-kkalamarski.md +52 -0
  458. package/vault/wiki/sources/ts-runtimes-comparison-betterstack.md +42 -0
  459. package/vault/wiki/sources/ts-strict-mode-rishikc.md +43 -0
  460. package/vault/wiki/sources/unix-philosophy.md +48 -0
  461. package/vault/wiki/sources/vectara-chunking-vs-embedding-naacl2025.md +39 -0
  462. package/vault/wiki/sources/vectara-guardian-agents.md +79 -0
  463. package/vault/wiki/sources/vgrep-semantic-search.md +76 -0
  464. package/vault/wiki/sources/vitest-official.md +41 -0
  465. package/vault/wiki/sources/vscode-pi-community-extension.md +40 -0
  466. package/vault/wiki/sources/wozcode.md +79 -0
  467. package/.agents/skills/compress/SKILL.md +0 -111
  468. package/.agents/skills/compress/scripts/__init__.py +0 -9
  469. package/.agents/skills/compress/scripts/__main__.py +0 -3
  470. package/.agents/skills/compress/scripts/benchmark.py +0 -78
  471. package/.agents/skills/compress/scripts/cli.py +0 -73
  472. package/.agents/skills/compress/scripts/compress.py +0 -227
  473. package/.agents/skills/compress/scripts/detect.py +0 -121
  474. package/.agents/skills/compress/scripts/validate.py +0 -189
  475. package/.agents/skills/emil-design-eng/SKILL.md +0 -679
  476. package/.agents/skills/lean-ctx/SKILL.md +0 -149
  477. package/.agents/skills/lean-ctx/scripts/install.sh +0 -95
  478. package/.agents/skills/scrapling-official/LICENSE.txt +0 -28
  479. package/.agents/skills/scrapling-official/SKILL.md +0 -390
  480. package/.agents/skills/scrapling-official/examples/01_fetcher_session.py +0 -26
  481. package/.agents/skills/scrapling-official/examples/02_dynamic_session.py +0 -26
  482. package/.agents/skills/scrapling-official/examples/03_stealthy_session.py +0 -26
  483. package/.agents/skills/scrapling-official/examples/04_spider.py +0 -58
  484. package/.agents/skills/scrapling-official/examples/README.md +0 -45
  485. package/.agents/skills/scrapling-official/references/fetching/choosing.md +0 -78
  486. package/.agents/skills/scrapling-official/references/fetching/dynamic.md +0 -352
  487. package/.agents/skills/scrapling-official/references/fetching/static.md +0 -432
  488. package/.agents/skills/scrapling-official/references/fetching/stealthy.md +0 -255
  489. package/.agents/skills/scrapling-official/references/mcp-server.md +0 -214
  490. package/.agents/skills/scrapling-official/references/migrating_from_beautifulsoup.md +0 -86
  491. package/.agents/skills/scrapling-official/references/parsing/adaptive.md +0 -212
  492. package/.agents/skills/scrapling-official/references/parsing/main_classes.md +0 -586
  493. package/.agents/skills/scrapling-official/references/parsing/selection.md +0 -494
  494. package/.agents/skills/scrapling-official/references/spiders/advanced.md +0 -344
  495. package/.agents/skills/scrapling-official/references/spiders/architecture.md +0 -94
  496. package/.agents/skills/scrapling-official/references/spiders/getting-started.md +0 -164
  497. package/.agents/skills/scrapling-official/references/spiders/proxy-blocking.md +0 -235
  498. package/.agents/skills/scrapling-official/references/spiders/requests-responses.md +0 -196
  499. package/.agents/skills/scrapling-official/references/spiders/sessions.md +0 -205
  500. package/PLAN.md +0 -11
  501. package/extensions/lean-ctx-enforce.ts +0 -166
  502. package/skills-lock.json +0 -35
  503. package/wiki/README.md +0 -19
  504. package/wiki/decisions/0001-establish-project-wiki-and-decision-record-format.md +0 -25
  505. package/wiki/decisions/0002-add-project-banner-to-readme.md +0 -26
  506. package/wiki/decisions/0003-remove-redundant-readme-title-heading.md +0 -26
  507. package/wiki/decisions/0004-publish-package-to-npm-as-ultimate-pi.md +0 -26
  508. package/wiki/decisions/0005-automate-npm-publish-with-github-actions.md +0 -27
  509. package/wiki/decisions/0006-switch-to-npm-trusted-publishing.md +0 -26
  510. package/wiki/decisions/0007-use-absolute-banner-url-for-npm-readme-rendering.md +0 -26
  511. package/wiki/decisions/0008-rename-banner-asset-for-cache-busting.md +0 -26
  512. package/wiki/decisions/0009-force-oidc-path-by-clearing-node-auth-token-in-publish-step.md +0 -25
  513. package/wiki/decisions/0010-simplify-setup-node-for-npm-trusted-publishing.md +0 -26
  514. package/wiki/decisions/0011-add-noop-workflow-change-to-force-fresh-publish-run.md +0 -25
  515. package/wiki/decisions/0012-align-workflow-runtime-with-npm-trusted-publishing-requirements.md +0 -26
  516. package/wiki/decisions/0013-add-package-repository-url-for-provenance-validation.md +0 -25
@@ -0,0 +1,80 @@
1
+ ---
2
+ type: source
3
+ status: ingested
4
+ source_type: blog
5
+ title: "How to Detect and Stop AI Agent Loops in Production"
6
+ author: LangSight Engineering
7
+ date_published: 2026-03-22
8
+ url: https://langsight.dev/blog/ai-agent-loop-detection/
9
+ confidence: high
10
+ key_claims:
11
+ - "Agent loops are the most common production failure mode"
12
+ - "Argument hash comparison catches >90% of real loops with zero false positives at threshold 3"
13
+ - "Three detection approaches: argument hash, sliding window rate, LLM output similarity"
14
+ - "Always combine loop detection with budget guardrails"
15
+ tags:
16
+ - source
17
+ - loop-detection
18
+ - production
19
+ - agent-reliability
20
+ - langsight
21
+ related:
22
+ - "[[Research: Meta-Agent Context Drift Detection]]"
23
+ - "[[agent-loop-detection-patterns]]"
24
+ - "[[context-drift-in-agents]]"
25
+ created: 2026-05-02
26
+ updated: 2026-05-02
27
+ ---
28
+ # LangSight Loop Detection
29
+
30
+ ## Summary
31
+
32
+ LangSight's production guide for detecting and stopping AI agent loops — the most common failure mode in deployed agent systems. Provides three detection approaches with working code, intervention strategies, and integration patterns. Based on production experience: a single support agent burned $214 calling the same CRM tool 89 times with identical arguments.
33
+
34
+ ## What It Contributes
35
+
36
+ Validates that loop detection is production-critical and that argument hashing is the most reliable method. Provides concrete code for the detection layer of a meta-agent system. The $214 cautionary tale demonstrates the economic case for automated intervention.
37
+
38
+ ## Three Loop Patterns
39
+
40
+ 1. **Direct repetition**: Same tool + identical arguments multiple times in a row. Most common. Caused by tool returning error/unexpected result and LLM retry logic not distinguishing transient vs. structural failure.
41
+ 2. **Ping-pong between tools**: Two tools called alternately without state change. Agent calls A → B → A → B with same arguments.
42
+ 3. **Retry-without-progress**: Tool call succeeds but response doesn't satisfy agent's internal goal. Agent keeps calling with minor argument variations.
43
+
44
+ ## Three Detection Approaches
45
+
46
+ ### Approach 1: Argument Hash Comparison (Recommended)
47
+ Most reliable. Hash `(tool_name, normalized_args)` and count occurrences in session window. Threshold 3 catches >90% of real loops.
48
+
49
+ ```python
50
+ def compute_call_hash(tool_name: str, args: dict) -> str:
51
+     payload = f"{tool_name}:{json.dumps(args, sort_keys=True)}"
52
+     return hashlib.sha256(payload.encode()).hexdigest()[:16]
53
+ ```
54
+
55
+ ### Approach 2: Sliding Window Rate Detection
56
+ Catches high-frequency calls regardless of argument variation. If tool called >N times in M seconds, flag it.
57
+
58
+ ### Approach 3: LLM Output Similarity
59
+ Semantic similarity between consecutive reasoning outputs. High similarity (>0.95 cosine) across multiple steps = reasoning in circles. Computationally expensive, usually overkill.
60
+
61
+ ## Intervention Options
62
+
63
+ 1. **Warn and continue**: Log + alert, agent keeps running. Good for early monitoring.
64
+ 2. **Terminate session**: Hard stop. Mark session `loop_detected`, return structured error. Right default for production.
65
+ 3. **Inject recovery message**: System message telling agent it's stuck. Gives chance to self-recover before termination.
66
+
67
+ ## Budget Guardrails
68
+
69
+ Backstop for unknown failure patterns: max cost, max steps, max wall time, soft alert at 80%.
70
+
71
+ ## Threshold Tuning
72
+
73
+ - **Polling agents**: Use time-based windows, not count-based
74
+ - **Retry-heavy workflows**: Increase threshold to 5-7
75
+ - **Sub-agents**: Each sub-agent gets own loop detector
76
+ - **Default**: Threshold 3 works for most agents
77
+
78
+ ## Relevance to Meta-Agent Concept
79
+
80
+ LangSight provides the **detection layer** of the meta-agent pipeline. The argument hash approach is production-validated. Their three intervention options map to the proposed escalation model (warn → inject → terminate). What's missing: context pruning after detection. LangSight terminates or injects but doesn't remove dead-end history.
@@ -0,0 +1,69 @@
1
+ ---
2
+ type: source
3
+ source_type: website
4
+ title: leanctx.com
5
+ author: yvgude
6
+ date_published: 2026
7
+ url: https://leanctx.com
8
+ confidence: medium
9
+ key_claims:
10
+ - "60–99% token reduction per file read"
11
+ - "46 MCP tools, 10 read modes, 90+ shell compression patterns"
12
+ - "Supports 24 AI tools"
13
+ - "Single Rust binary, zero telemetry, Apache 2.0"
14
+ - "Agent governance with profiles, budgets, SLOs, anomaly detection"
15
+ created: 2026-04-30
16
+ updated: 2026-04-30
17
+ status: ingested
18
+ tags: [source/website]
19
+ ---
20
+
21
+ # leanctx.com
22
+
23
+ Landing page for LeanCTX — "The Context Engineering Layer for AI Coding."
24
+
25
+ ## Architecture (3 layers)
26
+
27
+ 1. **Context Server**: 49 intelligent MCP tools for file reads, shell commands, code search. Intent-aware compression with adaptive mode selection per task type.
28
+ 2. **Shell Hook**: Intercepts shell output. Recognizes 90+ command patterns (git, npm, cargo, docker, kubectl, etc). Compresses automatically.
29
+ 3. **Protocols**: CEP (Context Efficiency Protocol), CCP (Cross-session Continuity Protocol), TDD (symbol shorthand). Teaches AI to communicate leaner. 8–25% additional savings.
30
+
31
+ ## Read Modes
32
+
33
+ - `full`: Complete content
34
+ - `map`: Dependency graph + exports + API (~5-15% tokens)
35
+ - `signatures`: Function/class signatures only (~10-20%)
36
+ - `aggressive`: Syntax-stripped (~30-50%)
37
+ - `entropy`: Shannon entropy filtered (~20-40%)
38
+ - `diff`: Only changed lines since last read
39
+
40
+ ## Agent Governance
41
+
42
+ - 5 built-in roles: Admin, Coder, Debugger, Reviewer, Ops
43
+ - Token, cost, and shell budgets per agent
44
+ - SLOs with automatic throttling
45
+ - Anomaly detection for runaway consumption
46
+
47
+ ## Compression Results (claimed)
48
+
49
+ - 60–95% per file read depending on mode
50
+ - 99% on cached re-reads (13 tokens)
51
+ - Shell builds: 847 → 42 tokens (95%)
52
+
53
+ ## Platforms
54
+
55
+ Aider, Amazon Q, Amp, Antigravity, AWS Kiro, Claude Code, Cline, Continue, Cursor, Emacs, Gemini CLI, GitHub Copilot, JetBrains, Neovim, OpenAI Codex, OpenCode, Pi, Qwen Code, Roo Code, Sublime Text, Trae, Verdent, Windsurf, Zed
56
+
57
+ ## GitHub
58
+
59
+ - Stars: 924 (as of 2026-04-30)
60
+ - Forks: 109
61
+ - Language: Rust
62
+ - Created: 2026-03-23
63
+ - License: Apache 2.0
64
+
65
+ ## crates.io
66
+
67
+ - Package: lean-ctx
68
+ - Total downloads: 3,188
69
+ - Version: 3.4.5
@@ -0,0 +1,59 @@
1
+ ---
2
+ type: source
3
+ source_type: paper
4
+ title: "Meta-Harness: End-to-End Optimization of Model Harnesses"
5
+ author: "Lee, Yoonho; Nair, Roshen; Zhang, Qizheng; et al."
6
+ date_published: 2026-03-30
7
+ url: "https://arxiv.org/abs/2603.28052"
8
+ confidence: medium
9
+ key_claims:
10
+ - "Outer-loop system searches over harness code for LLM applications"
11
+ - "Agentic proposer accesses source code, scores, and execution traces via filesystem"
12
+ - "7.7pt improvement on text classification with 4x fewer context tokens"
13
+ - "4.7pt improvement on IMO-level math problems across 5 held-out models"
14
+ - "Surpasses best hand-engineered baselines on TerminalBench-2"
15
+ tags:
16
+ - harness
17
+ - meta-learning
18
+ - optimization
19
+ - terminal-bench
20
+ created: 2026-04-30
21
+ updated: 2026-04-30
22
+ status: ingested
23
+ ---
24
+ # Meta-Harness: End-to-End Optimization of Model Harnesses
25
+
26
+ Lee et al., March 2026. Stanford / Together AI.
27
+
28
+ ## Core Idea
29
+
30
+ Harnesses are still designed largely by hand. Meta-Harness is an outer-loop system that automatically searches over harness code, using an agentic proposer with access to source code, scores, and execution traces from all prior candidates.
31
+
32
+ ## Architecture
33
+
34
+ - **Agentic Proposer**: LLM that reads existing harness code + execution traces + scores
35
+ - **Filesystem-based memory**: All prior candidates, their code, traces, and scores available
36
+ - **Outer-loop**: Proposer generates new harness variant → evaluate → add to candidate pool → repeat
37
+
38
+ Key difference from AutoHarness: Meta-Harness sees ALL prior experiments, not just the last one.
39
+
40
+ ## Results
41
+
42
+ | Domain | Improvement | Context Savings |
43
+ |--------|-------------|-----------------|
44
+ | Text classification | +7.7 pts | 4x fewer tokens |
45
+ | IMO math reasoning | +4.7 pts | Across 5 held-out models |
46
+ | Agentic coding (TerminalBench-2) | Surpasses hand-engineered | — |
47
+
48
+ ## Key Insight
49
+
50
+ > Richer access to prior experience can enable automated harness engineering.
51
+
52
+ This directly challenges the assumption that harness design must be a human engineering practice. It suggests a future where harnesses self-optimize from execution traces.
53
+
54
+ ## Relevance to Our Harness
55
+
56
+ Our current pipeline is manually configured. Meta-Harness suggests:
57
+ - Adding a "harness optimizer" that runs off failure traces
58
+ - Auto-tuning token budgets per layer based on budgeted vs. actual usage
59
+ - Generating model-specific harness variants (our model-adaptive profiles could be learned, not hand-coded)
@@ -0,0 +1,50 @@
1
+ ---
2
+ type: source
3
+ source_type: official-documentation
4
+ title: "Linux Kernel Coding Style and Development Workflow"
5
+ author: "Linus Torvalds, Jonathan Corbet et al."
6
+ date_published: 2026-01-20
7
+ url: "https://github.com/torvalds/linux/blob/master/Documentation/process/coding-style.rst"
8
+ confidence: high
9
+ key_claims:
10
+ - "Functions should be short and do one thing well"
11
+ - "8-character tabs enforce shallow nesting; >3 levels is a design problem"
12
+ - "K&R brace style, functions get opening brace on next line"
13
+ - "Centralized error handling via goto for cleanup paths"
14
+ - "Reference counting mandatory for data structures visible to multiple threads"
15
+ - "Don't crash the kernel — WARN_ON_ONCE preferred over BUG()"
16
+ - "Time-based release cycle: 2-week merge window, 6-10 week rc stabilization"
17
+ - "Chain-of-trust maintainer hierarchy: patches flow up through subsystem trees"
18
+ tags: [linux, coding-style, kernel, torvalds]
19
+ ---
20
+
21
+ # Linux Kernel Coding Style and Development Workflow
22
+
23
+ ## Coding Style — Direct from Linus
24
+
25
+ The Linux kernel coding style document enforces strict, opinionated rules:
26
+
27
+ - **8-character tabs**: not just aesthetic — forces refactoring when nesting exceeds 3 levels
28
+ - **K&R braces**: opening brace on same line for statements, on next line for functions. "K&R are right."
29
+ - **Short functions**: should fit on 1-2 screenfuls (80x24). Local variables ≤ 5-10.
30
+ - **Descriptive globals, short locals**: `count_active_users()`, not `cntusr()`; loop counter is `i`, not `loop_counter`
31
+ - **No typedefs** for structs/pointers: `struct virtual_container *a` beats `vps_t a`
32
+ - **Centralized exit via goto**: for cleanup in functions with multiple exit points. Label names describe what they free.
33
+ - **Comments say WHAT, not HOW**: if function needs inline comments explaining how it works, rewrite it.
34
+ - **Reference counting**: mandatory for any data structure accessible from another thread. "If another thread can find your data structure and you don't have a reference count, you almost certainly have a bug."
35
+ - **Don't crash the kernel**: use `WARN_ON_ONCE()`, not `BUG()`. Kernel crashes are user decisions.
36
+
37
+ ## Development Process
38
+
39
+ - **Time-based releases**: new major kernel every 2-3 months
40
+ - **2-week merge window**: all new features land here. ~1,000 patches/day.
41
+ - **6-10 week stabilization**: only fixes after -rc1. Regressions are the primary metric.
42
+ - **Chain of trust**: patches flow through subsystem maintainers → Linus. Only ~1.3% of patches chosen directly by Linus.
43
+ - **linux-next**: integration tree where all pending patches are tested before merge window.
44
+ - **Staging trees**: drivers/staging/ for code not yet meeting quality standards; includes TODO files.
45
+
46
+ ## Linus on AI-Generated Code (2026)
47
+
48
+ - Vibe coding: "fairly positive" for learning, "horrible, horrible idea from a maintenance standpoint" for production.
49
+ - Linux kernel policy: AI-generated code is acceptable if reviewed by a human who takes responsibility. "If the code is good, it's good. If it's hallucinatory AI slop that breaks the kernel, the human who clicked submit is responsible."
50
+ - "Code is cheap. Show me the talk." — prioritizes demonstrated understanding over volume of output.
@@ -0,0 +1,53 @@
1
+ ---
2
+ type: source
3
+ source_type: paper
4
+ title: "AutoHarness: Improving LLM Agents by Automatically Synthesizing a Code Harness"
5
+ author: "Lou, Xinghua; Lázaro-Gredilla, Miguel; Dedieu, Antoine; et al."
6
+ date_published: 2026-02-10
7
+ url: "https://arxiv.org/abs/2603.03329"
8
+ confidence: medium
9
+ key_claims:
10
+ - "Smaller model (Gemini Flash) can automatically synthesize code harness via iterative refinement"
11
+ - "78% of chess losses were illegal moves — harness eliminates all illegal moves in 145 TextArena games"
12
+ - "Synthesized harness enables smaller model to outperform larger models (Gemini Pro, GPT-5.2)"
13
+ - "Code-policy (entire policy in code, no LLM at decision time) beats larger models on 16 games"
14
+ tags:
15
+ - harness
16
+ - auto-synthesis
17
+ - code-generation
18
+ - gemini
19
+ created: 2026-04-30
20
+ updated: 2026-04-30
21
+ status: ingested
22
+ ---
23
+ # AutoHarness: Automatically Synthesizing Code Harnesses
24
+
25
+ Lou et al., February 2026.
26
+
27
+ ## Core Idea
28
+
29
+ LLM agents often attempt actions that are prohibited by the environment. Instead of manually writing guardrails, AutoHarness demonstrates that an LLM can automatically synthesize a code harness via iterative refinement with environment feedback.
30
+
31
+ ## Key Numbers
32
+
33
+ - **78% of Gemini-2.5-Flash losses** in Kaggle GameArena chess attributed to illegal moves
34
+ - After AutoHarness: **all illegal moves prevented** across 145 TextArena games
35
+ - Synthesized harness + Flash outperforms Gemini-2.5-Pro bare
36
+ - Code-policy (fully compiled harness, no LLM at decision time) beats GPT-5.2-High on 16/16 games
37
+
38
+ ## Mechanism
39
+
40
+ 1. LLM generates initial harness code
41
+ 2. Environment provides feedback (illegal move detection, score)
42
+ 3. LLM iteratively refines harness code
43
+ 4. Final harness: prevents all illegal actions, optimizes for reward
44
+
45
+ ## Key Insight
46
+
47
+ > Using a smaller model to synthesize a custom code harness can outperform a much larger model, while also being more cost effective.
48
+
49
+ This is the automation of what the survey calls "harness engineering" — turning it from a human practice into an LLM-driven one. Directly relevant to [[lee2026-meta-harness]] which takes this further with outer-loop optimization.
50
+
51
+ ## Relevance to Our Harness
52
+
53
+ Our harness is manually designed (skill files, schemas, gate logic). AutoHarness suggests that harness components could be automatically synthesized from failure traces. The token budget optimization problem (Phase 10-13) is a natural candidate for auto-synthesis.
@@ -0,0 +1,73 @@
1
+ ---
2
+ type: source
3
+ source_type: article
4
+ author: "Birgitta Böckeler (Thoughtworks)"
5
+ date_published: 2026-04-02
6
+ url: https://martinfowler.com/articles/harness-engineering.html
7
+ confidence: high
8
+ tags:
9
+ - harness-engineering
10
+ - context-engineering
11
+ - agent-trust
12
+ - feedback-loops
13
+ key_claims:
14
+ - "Agent = Model + Harness. A harness is everything in an AI agent except the model itself"
15
+ - "Feedforward guides (before action) + Feedback sensors (after action) form the steering loop"
16
+ - "Computational controls (deterministic, fast) vs Inferential controls (LLM-based, semantic)"
17
+ - "Three regulation categories: Maintainability, Architecture Fitness, Behaviour"
18
+ - "The human's job is to steer the agent by iterating on the harness"
19
+ - "Harness templates can encode topologies (CRUD service, event processor, data dashboard)"
20
+ ---
21
+
22
+ # Harness Engineering for Coding Agent Users
23
+
24
+ Martin Fowler blog — April 2026. By Birgitta Böckeler, Distinguished Engineer at Thoughtworks.
25
+
26
+ ## Core Mental Model
27
+
28
+ **Agent = Model + Harness**. The harness is everything except the model: system prompts, tools, feedback loops, approval gates, context management.
29
+
30
+ Three concentric circles:
31
+ 1. **Model** (core)
32
+ 2. **Builder harness** (coding agent's built-in infrastructure)
33
+ 3. **User harness** (what we build — guides + sensors specific to our use case)
34
+
35
+ ## Feedforward and Feedback
36
+
37
+ | Direction | Purpose | Examples |
38
+ |-----------|---------|----------|
39
+ | **Feedforward (Guides)** | Steer agent *before* it acts | AGENTS.md, Skills, coding conventions, architecture docs |
40
+ | **Feedback (Sensors)** | Observe *after* agent acts, enable self-correction | Linters, tests, review agents, type checkers |
41
+
42
+ Two execution types:
43
+ - **Computational**: Deterministic, fast (tests, linters, type checkers, structural analysis)
44
+ - **Inferential**: LLM-based, semantic (AI code review, "LLM as judge")
45
+
46
+ ## The Steering Loop
47
+
48
+ Human's role: Iterate on the harness. When issues recur, improve feedforward guides or feedback sensors to make them less probable. Agents can help build harness components (write structural tests, generate linter rules, create how-to guides).
49
+
50
+ ## Three Regulation Categories
51
+
52
+ 1. **Maintainability Harness**: Code quality, style, complexity, test coverage. Computational sensors catch structural issues reliably. LLMs partially address semantic judgment (duplicate code, brute-force fixes) but expensively.
53
+ 2. **Architecture Fitness Harness**: Performance requirements, logging standards, observability. Fitness functions as feedback sensors.
54
+ 3. **Behaviour Harness**: Functional correctness. The hardest category — still relies heavily on human review and manual testing. AI-generated tests put too much faith in AI.
55
+
56
+ ## Key Timing Principle: Keep Quality Left
57
+
58
+ Checks distributed across the change lifecycle by cost and speed:
59
+ - **Pre-commit**: Linters, fast tests, basic code review agent
60
+ - **Post-integration pipeline**: Mutation testing, broad architecture review
61
+ - **Continuous**: Dead code detection, dependency scanning, SLO monitoring
62
+
63
+ ## Harness Templates
64
+
65
+ For enterprises with common service topologies (CRUD APIs, event processors, dashboards), harness templates bundle guides + sensors for each topology. Teams pick tech stacks partly based on available harnesses.
66
+
67
+ ## Relevance to Our Harness
68
+
69
+ - Our `.pi/skills/` system implements feedforward guides
70
+ - Our `wiki-lint` and `posthog-analyst` skills implement inferential feedback sensors
71
+ - The steering loop is what we're building: improve harness as agents make mistakes
72
+ - We need computational sensors: pre-commit hooks, structural tests, architecture fitness checks
73
+ - Harness templates are our `lean-ctx` and `wiki` patterns — reusable across projects
@@ -0,0 +1,13 @@
1
+ ---
2
+ type: source
3
+ status: stub
4
+ created: 2026-05-02
5
+ updated: 2026-05-02
6
+ tags: [source, external-doc]
7
+ ---
8
+
9
+ # MCP Architecture Docs
10
+
11
+ Official Model Context Protocol (MCP) architecture documentation. Describes the protocol design, tool registration, and server-client model.
12
+
13
+ Referenced in: [[resolved-mcp-tool-preference]]
@@ -0,0 +1,79 @@
1
+ ---
2
+ type: source
3
+ source_type: paper
4
+ title: "Agent Harness for Large Language Model Agents: A Survey"
5
+ author: "Meng, Qianyu; Wang, Yanan; Chen, Liyi; et al."
6
+ date_published: 2026-04
7
+ url: "https://github.com/Gloriaameng/Awesome-Agent-Harness"
8
+ confidence: high
9
+ key_claims:
10
+ - "Formalizes harness as six-component tuple H = (E, T, C, S, L, V)"
11
+ - "Surveys 110+ papers and 23 production systems"
12
+ - "Harness completeness matrix maps which components each system implements"
13
+ - "Maps 9 open technical challenges: security, evaluation, protocols, context, tools, memory, planning, multi-agent, compute economics"
14
+ tags:
15
+ - harness
16
+ - survey
17
+ - agent-architecture
18
+ - llm-agents
19
+ created: 2026-04-30
20
+ updated: 2026-04-30
21
+ status: ingested
22
+ ---
23
+ # Agent Harness for Large Language Model Agents: A Survey
24
+
25
+ Meng et al., April 2026. 110+ papers, 23 systems analyzed.
26
+
27
+ ## Core Contribution
28
+
29
+ The survey formalizes the **agent execution harness** as a first-class architectural object:
30
+
31
+ ```
32
+ H = (E, T, C, S, L, V)
33
+ ```
34
+
35
+ | Component | Symbol | Role |
36
+ |-----------|--------|------|
37
+ | Execution Loop | E | Observe-think-act cycle, termination, error recovery |
38
+ | Tool Registry | T | Typed tool catalog, routing, monitoring |
39
+ | Context Manager | C | Context window control, compaction, retrieval |
40
+ | State Store | S | Persistence across turns/sessions, crash recovery |
41
+ | Lifecycle Hooks | L | Auth, logging, policy enforcement, instrumentation |
42
+ | Evaluation Interface | V | Action trajectories, intermediate states, success signals |
43
+
44
+ ## Key Empirical Evidence
45
+
46
+ - **Pi Research**: Grok Code Fast 1 jumped 6.7% → 68.3% on SWE-bench by changing ONLY the harness edit-tool format — model unchanged
47
+ - **OpenAI Codex**: 1M lines of code, 0 hand-written over 5 months — agent failures were attributed to "underspecified environments" rather than to the model
48
+ - **Stripe Minions**: 1,300 PRs/week, 0 human-written code — harness-first engineering
49
+ - **METR**: Benchmark-passing PRs have 24.2pp lower human merge rate, gap widening at 9.6pp/year
50
+ - **Vercel**: Removing 80% of tools helped more than any model upgrade
51
+
52
+ ## Key Finding
53
+
54
+ > The agent execution harness — not the model — is the primary determinant of agent reliability at scale.
55
+
56
+ No agent framework can achieve production reliability without implementing ALL six governance components.
57
+
58
+ ## 9 Open Technical Challenges
59
+
60
+ 1. Security & Sandboxing — agents intentionally interact with sensitive resources
61
+ 2. Evaluation & Benchmarking — benchmark validity crisis (METR gap)
62
+ 3. Protocol Standardization — MCP (2-15ms) vs A2A (50-200ms) vs ACP
63
+ 4. Runtime Context Management — 1M+ token/task budgets
64
+ 5. Tool Use & Registry — schema-based contracts insufficient alone
65
+ 6. Memory Architecture — six patterns: flat → hierarchical → episodic → semantic → procedural → graph
66
+ 7. Planning & Reasoning — interface design outweighs model capability
67
+ 8. Multi-Agent Coordination — Byzantine fault tolerance unsolved
68
+ 9. Compute Economics — 13T tokens/week, doubling every 4 weeks
69
+
70
+ ## Relevance to Our Harness
71
+
72
+ Our 8-layer harness (L1-L8) maps to these six components:
73
+ - L1-L4 → E (Execution Loop with verification gates)
74
+ - Tool Schema → T (Tool Registry)
75
+ - Wiki/Knowledge Base → C, S (Context + State)
76
+ - Archon L7 → L (Lifecycle hooks, orchestration)
77
+ - QA/Critics L4-L5 → V (Evaluation)
78
+
79
+ Missing from our implementation: formal H=(E,T,C,S,L,V) specification language, cross-harness portability, harness transparency specification.
@@ -0,0 +1,68 @@
1
+ ---
2
+ type: source
3
+ source_type: article
4
+ author: MindStudio
5
+ date_published: 2026-04
6
+ url: https://www.mindstudio.ai/blog/four-types-of-ai-agents-explained/
7
+ confidence: medium
8
+ tags:
9
+ - agent-types
10
+ - multi-agent
11
+ - orchestration
12
+ - architecture
13
+ key_claims:
14
+ - "Four distinct agent types with different architectures: Coding Harnesses, Dark Factories, Auto Research, Orchestration"
15
+ - "Mismatching agent type to task is a primary cause of AI system failure in production"
16
+ - "Architecture matters more than model choice for multi-agent systems"
17
+ - "Orchestration agents add overhead — start simple, add complexity only when needed"
18
+ ---
19
+
20
+ # Four Types of AI Agents Explained
21
+
22
+ MindStudio blog — 2026. Classifies production AI agents into four architecturally distinct types.
23
+
24
+ ## The Four Types
25
+
26
+ ### 1. Coding Harnesses
27
+ Operate within bounded technical environments (codebases). Tight feedback loop with deterministic execution environment — write code, run tests, see results, revise. Tools: file system access, terminal execution, test runners, code search, version control. Examples: Claude Code, GitHub Copilot Workspace, Devin.
28
+
29
+ **Use when**: Task involves writing/editing/debugging code with testable success conditions.
30
+
31
+ **Avoid when**: Task requires cross-domain reasoning or multi-agent coordination.
32
+
33
+ ### 2. Dark Factories
34
+ Fully automated, humanless pipelines processing work at scale. Ingest inputs → process through defined steps → produce outputs. Run unattended on schedules or events.
35
+
36
+ **Use when**: High volume, structurally similar inputs, scheduled/event-driven processing.
37
+
38
+ **Avoid when**: Tasks require dynamic judgment or high-variance inputs.
39
+
40
+ ### 3. Auto Research Agents
41
+ Autonomous information gathering: decompose questions, search multiple sources, evaluate relevance, synthesize findings. Dynamic retrieval path — decides what to retrieve based on findings.
42
+
43
+ **Use when**: Answer requires multiple sources, retrieval path uncertain upfront, synthesis needed.
44
+
45
+ **Avoid when**: Information available in structured database, simple RAG would suffice.
46
+
47
+ ### 4. Orchestration Agents
48
+ Coordinate other agents: decompose complex goals, assign subtasks to specialists, handle dependencies, assemble final output. Acts as project manager, not implementer.
49
+
50
+ **Use when**: Task requires multiple distinct capabilities or parallel workstreams.
51
+
52
+ **Avoid when**: Single agent can handle the task — orchestration adds overhead (more LLM calls, more latency, more failure points).
53
+
54
+ ## Production Pattern: Combined Types
55
+
56
+ Real systems combine types under an orchestrator. Example competitive intelligence system:
57
+ 1. Orchestrator receives brief
58
+ 2. Dispatches auto research agent for web gathering
59
+ 3. Dispatches dark factory for high-volume review processing
60
+ 4. Dispatches coding harness for structured data analysis
61
+ 5. Orchestrator synthesizes all outputs
62
+
63
+ ## Relevance to Our Harness
64
+
65
+ - Our `wiki-autoresearch` skill is an auto research agent
66
+ - Our `Agent` tool (subagent spawning) maps to orchestration
67
+ - Our core coding loop is a coding harness
68
+ - The key insight: match architecture to task, don't over-architect
@@ -0,0 +1,13 @@
1
+ ---
2
+ type: source
3
+ status: stub
4
+ created: 2026-05-02
5
+ updated: 2026-05-02
6
+ tags: [source, external-doc]
7
+ ---
8
+
9
+ # MS Chat History Management
10
+
11
+ Microsoft documentation on chat history management patterns for AI agents. Describes strategies for maintaining conversation state across context windows.
12
+
13
+ Referenced in: [[resolved-context-pruning-inplace-vs-restart]]
@@ -0,0 +1,104 @@
1
+ ---
2
+ type: source
3
+ status: ingested
4
+ source_type: official-documentation
5
+ title: "OpenAI Prompt Guidance (GPT-5.5 through GPT-4.1)"
6
+ author: "OpenAI"
7
+ date_published: 2026-04-01
8
+ date_fetched: 2026-05-01
9
+ url: "https://developers.openai.com/api/docs/guides/prompt-guidance"
10
+ confidence: high
11
+ key_claims:
12
+ - "GPT-5.5 works best with outcome-first prompts that define the destination, not every step"
13
+ - "GPT-5.4 requires explicit tool persistence, verification loops, and completion criteria"
14
+ - "GPT-5.3 Codex ships with a canonical Codex-Max starter prompt optimized for coding agents"
15
+ - "GPT-5.2+ reasoning effort is the primary tuning knob: none/low/medium/high/xhigh"
16
+ - "GPT-5.1 introduced apply_patch and shell tools as native API tool types"
17
+ - "Contradictory instructions damage reasoning models more than older models"
18
+ - "Structured XML specs like `<instruction_spec>` improved instruction adherence"
19
+ tags:
20
+ - prompting
21
+ - openai
22
+ - gpt
23
+ - model-specific
24
+ - harness-design
25
+ created: 2026-05-02
26
+ updated: 2026-05-02
27
+ ---
28
+ # OpenAI Prompt Guidance
29
+
30
+ Official prompting guide from OpenAI covering all GPT models from GPT-5.5 down to GPT-4.1. Each model generation has a dedicated section with model-specific guidance.
31
+
32
+ ## Model-Specific Key Findings
33
+
34
+ ### GPT-5.5
35
+ - **Outcome-first prompts**: Define the destination, let model choose path
36
+ - **Shorter prompts**: Legacy process-heavy prompts add noise
37
+ - **Outcome-oriented framing**: "describe what good looks like, what constraints matter, what evidence is available"
38
+ - **Personality + collaboration style**: Separate blocks for tone and task behavior
39
+ - **Preamble for streaming**: Short user-visible update before tool calls
40
+ - **Explicit stopping conditions**: "After each result, ask: can I answer now?"
41
+ - **Retrieval budgets**: Stopping rules for search depth
42
+ - **Phase parameter**: `commentary` vs `final_answer` distinction
43
+
44
+ ### GPT-5.4
45
+ - **Tool persistence rules**: "Keep calling tools until task complete AND verification passes"
46
+ - **Verification loop**: Check correctness, grounding, formatting, safety before finalizing
47
+ - **Completeness contract**: Internal checklist, track processed items, confirm coverage
48
+ - **Dependency checks**: Don't skip prerequisites because end state seems obvious
49
+ - **Research mode**: Plan → Retrieve → Synthesize in 3 passes
50
+ - **Small model guidance**: gpt-5.4-mini is more literal, needs explicit execution order
51
+ - **Reasoning effort**: Start at none, increase only if evals regress
52
+
53
+ ### GPT-5.3 Codex
54
+ - **Canonical Codex-Max prompt**: Full starter prompt published by OpenAI
55
+ - **apply_patch**: First-class tool with Responses API integration; 35% fewer failures than manual
56
+ - **Shell tool**: Structured shell_command with workdir, timeout, permissions
57
+ - **Update plan tool**: JSON-based TODO with pending/in_progress/completed states
58
+ - **Phase parameter**: Required; dropping phase causes significant degradation
59
+ - **Parallel tool calls**: `multi_tool_use.parallel` with batch ordering
60
+ - **Compaction**: First-class support for multi-hour reasoning
61
+ - **Agents.md**: Automatically merged directory-scoped instruction files
62
+ - **Personalities**: Friendly vs Pragmatic shipped with Codex CLI
63
+
64
+ ### GPT-5.2
65
+ - **Verbosity controls**: Output verbosity spec with sentence/bullet limits per task type
66
+ - **Scope drift prevention**: Explicit "no extra features" rules for frontend
67
+ - **Long-context handling**: Force summarization and re-grounding
68
+ - **Ambiguity mitigation**: Uncertainty-and-ambiguity block for hallucination-prone queries
69
+ - **Tool persistence**: "Prefer tools over internal knowledge whenever fresh data needed"
70
+ - **Compaction endpoint**: `/responses/compact` for extending effective context
71
+ - **Reasoning effort migration**: When moving to GPT-5.2, start GPT-4o/4.1 workloads at `none`; keep the existing reasoning effort for workloads coming from GPT-5 or GPT-5.1
72
+
73
+ ### GPT-5.1
74
+ - **Agentic steerability**: Personality blocks, user update specs, solution persistence
75
+ - **User updates (preambles)**: Frequency, verbosity, tone, content axes; "at least every 6 steps"
76
+ - **Tool preambles**: Brief plan before tools, progress updates during
77
+ - **Reasoning modes**: New `none` mode (no reasoning tokens at all)
78
+ - **apply_patch tool**: Named tool type in Responses API; freeform under the hood
79
+ - **Shell tool**: Native tool type for controlled command execution
80
+ - **Metaprompting**: Model can debug and rewrite its own prompts
81
+
82
+ ### GPT-5
83
+ - **Agentic eagerness**: Calibrate proactivity vs waiting for guidance
84
+ - **Context gathering**: Batch search → minimal plan → complete task
85
+ - **Frontend development**: Self-reflection rubrics, design system enforcement
86
+ - **Cursor prompt tuning**: Real-world production agent findings
87
+ - **Responses API**: Reasoning persisted between tool calls; 4.3% score improvement
88
+
89
+ ### GPT-4.1
90
+ - **Literal instruction follower**: More literal than predecessors
91
+ - **Persistence reminders**: "keep going until query completely resolved"
92
+ - **Planning induction**: "plan extensively before each function call"
93
+ - **SWE-bench prompt**: Full agent prompt that achieved a 55% pass rate was published
94
+ - **Diff format**: V4A diff format with context-based (not line-number) matching
95
+
96
+ ## Cross-Model Patterns
97
+
98
+ 1. **Structured XML blocks work better than markdown** for complex instruction sets
99
+ 2. **Tool definitions should use API tools field**, not manual prompt injection
100
+ 3. **Reasoning effort is the primary tuning knob** across all GPT-5+ models
101
+ 4. **Verbosity API parameter + prompt-level overrides** for output length control
102
+ 5. **Metaprompting is officially recommended** for prompt optimization
103
+ 6. **Contradictory prompts hurt reasoning models significantly**
104
+ 7. **Small models need more explicit, structured instructions**