contextro 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (213) hide show
  1. contextro-0.0.1/.agent/skills/applied-ai-engineer/SKILL.md +174 -0
  2. contextro-0.0.1/.agent/skills/applied-ai-engineer/evals/evals.json +286 -0
  3. contextro-0.0.1/.agent/skills/applied-ai-engineer/references/engineering-patterns.md +59 -0
  4. contextro-0.0.1/.agent/skills/applied-ai-engineer/references/eval-rubric.md +18 -0
  5. contextro-0.0.1/.agent/skills/autoresearch/SKILL.md +218 -0
  6. contextro-0.0.1/.agent/skills/autoresearch/evals/evals.json +912 -0
  7. contextro-0.0.1/.agent/skills/autoresearch/references/eval-rubric.md +13 -0
  8. contextro-0.0.1/.agent/skills/autoresearch/references/experiment-patterns.md +38 -0
  9. contextro-0.0.1/.agent/skills/breakthrough-researcher/SKILL.md +166 -0
  10. contextro-0.0.1/.agent/skills/breakthrough-researcher/evals/evals.json +249 -0
  11. contextro-0.0.1/.agent/skills/breakthrough-researcher/references/eval-rubric.md +19 -0
  12. contextro-0.0.1/.agent/skills/breakthrough-researcher/references/research-patterns.md +66 -0
  13. contextro-0.0.1/.agent/skills/dev-contextia-mcp/SKILL.md +156 -0
  14. contextro-0.0.1/.agent/skills/dev-contextia-mcp/evals/evals-new-tools-e2e.json +306 -0
  15. contextro-0.0.1/.agent/skills/dev-contextia-mcp/evals/evals-workflow.json +326 -0
  16. contextro-0.0.1/.agent/skills/dev-contextia-mcp/evals/evals.json +347 -0
  17. contextro-0.0.1/.agent/skills/dev-contextia-mcp/references/benchmark-results.md +42 -0
  18. contextro-0.0.1/.agent/skills/dev-contextia-mcp/references/eval-rubric.md +55 -0
  19. contextro-0.0.1/.agent/skills/dev-contextia-mcp/references/tool-decision-tree.md +64 -0
  20. contextro-0.0.1/.agent/skills/fastmcp-server-engineer/SKILL.md +90 -0
  21. contextro-0.0.1/.agent/skills/fastmcp-server-engineer/evals/evals.json +28 -0
  22. contextro-0.0.1/.agent/skills/fastmcp-server-engineer/references/eval-rubric.md +15 -0
  23. contextro-0.0.1/.agent/skills/fastmcp-server-engineer/references/fastmcp-patterns.md +21 -0
  24. contextro-0.0.1/.agent/skills/mcp-protocol-architect/SKILL.md +87 -0
  25. contextro-0.0.1/.agent/skills/mcp-protocol-architect/evals/evals.json +274 -0
  26. contextro-0.0.1/.agent/skills/mcp-protocol-architect/references/eval-rubric.md +15 -0
  27. contextro-0.0.1/.agent/skills/mcp-protocol-architect/references/mcp-patterns.md +27 -0
  28. contextro-0.0.1/.agent/skills/python-systems-engineer/SKILL.md +89 -0
  29. contextro-0.0.1/.agent/skills/python-systems-engineer/evals/evals.json +28 -0
  30. contextro-0.0.1/.agent/skills/python-systems-engineer/references/eval-rubric.md +14 -0
  31. contextro-0.0.1/.agent/skills/python-systems-engineer/references/python-patterns.md +19 -0
  32. contextro-0.0.1/.agent/skills/rust-extension-engineer/SKILL.md +89 -0
  33. contextro-0.0.1/.agent/skills/rust-extension-engineer/evals/evals.json +28 -0
  34. contextro-0.0.1/.agent/skills/rust-extension-engineer/references/eval-rubric.md +14 -0
  35. contextro-0.0.1/.agent/skills/rust-extension-engineer/references/rust-patterns.md +18 -0
  36. contextro-0.0.1/.dockerignore +17 -0
  37. contextro-0.0.1/.github/workflows/alpha.yml +136 -0
  38. contextro-0.0.1/.github/workflows/publish.yml +79 -0
  39. contextro-0.0.1/.gitignore +57 -0
  40. contextro-0.0.1/.python-version +1 -0
  41. contextro-0.0.1/AGENTS.md +63 -0
  42. contextro-0.0.1/CHANGELOG.md +104 -0
  43. contextro-0.0.1/CLAUDE.md +207 -0
  44. contextro-0.0.1/CONTRIBUTING.md +274 -0
  45. contextro-0.0.1/Dockerfile +70 -0
  46. contextro-0.0.1/LICENSE +21 -0
  47. contextro-0.0.1/PKG-INFO +493 -0
  48. contextro-0.0.1/README.md +444 -0
  49. contextro-0.0.1/commit-one-by-one.sh +92 -0
  50. contextro-0.0.1/contextro-mcp-ci +7 -0
  51. contextro-0.0.1/deploy/alpha/docker-compose.yml +36 -0
  52. contextro-0.0.1/deploy/alpha/pull-and-restart.sh +13 -0
  53. contextro-0.0.1/docker-compose.dev.yml +48 -0
  54. contextro-0.0.1/docker-compose.yml +43 -0
  55. contextro-0.0.1/docs/ARCHITECTURE.md +140 -0
  56. contextro-0.0.1/docs/DEVELOPER_GUIDE.md +293 -0
  57. contextro-0.0.1/docs/FUTURE_CONTRIBUTIONS.md +371 -0
  58. contextro-0.0.1/docs/IMPLEMENTATION_PLAN.md +339 -0
  59. contextro-0.0.1/docs/INSTALLATION.md +425 -0
  60. contextro-0.0.1/docs/PROJECT_INFO.md +209 -0
  61. contextro-0.0.1/docs/RESEARCH.md +238 -0
  62. contextro-0.0.1/docs/USAGE_GUIDE.md +296 -0
  63. contextro-0.0.1/docs/research/INDEX.md +19 -0
  64. contextro-0.0.1/docs/research/RESEARCH-TEMPLATE.md +30 -0
  65. contextro-0.0.1/docs/research/contextia-best-in-class-plan.md +341 -0
  66. contextro-0.0.1/opencode.json +46 -0
  67. contextro-0.0.1/pyproject.toml +103 -0
  68. contextro-0.0.1/rust/ctx_fast/Cargo.lock +390 -0
  69. contextro-0.0.1/rust/ctx_fast/Cargo.toml +16 -0
  70. contextro-0.0.1/rust/ctx_fast/src/file_scanner.rs +135 -0
  71. contextro-0.0.1/rust/ctx_fast/src/git_ops.rs +98 -0
  72. contextro-0.0.1/rust/ctx_fast/src/hasher.rs +28 -0
  73. contextro-0.0.1/rust/ctx_fast/src/lib.rs +185 -0
  74. contextro-0.0.1/scripts/bench_final.py +281 -0
  75. contextro-0.0.1/scripts/benchmark_browser_use.py +186 -0
  76. contextro-0.0.1/scripts/benchmark_browser_use_results.json +14 -0
  77. contextro-0.0.1/scripts/benchmark_chunk_profiles.py +106 -0
  78. contextro-0.0.1/scripts/benchmark_disclosure.py +334 -0
  79. contextro-0.0.1/scripts/benchmark_embeddings.py +354 -0
  80. contextro-0.0.1/scripts/benchmark_embeddings_full.py +398 -0
  81. contextro-0.0.1/scripts/benchmark_platform_live.py +542 -0
  82. contextro-0.0.1/scripts/benchmark_results.json +58 -0
  83. contextro-0.0.1/scripts/benchmark_results_full.json +90 -0
  84. contextro-0.0.1/scripts/benchmark_retrieval_quality.py +211 -0
  85. contextro-0.0.1/scripts/benchmark_token_efficiency.py +422 -0
  86. contextro-0.0.1/scripts/benchmark_utils.py +128 -0
  87. contextro-0.0.1/scripts/dev_http_server.py +185 -0
  88. contextro-0.0.1/scripts/docker_healthcheck.py +21 -0
  89. contextro-0.0.1/scripts/evaluate_contextia_skill.py +325 -0
  90. contextro-0.0.1/scripts/init.sh +36 -0
  91. contextro-0.0.1/scripts/results.tsv +20 -0
  92. contextro-0.0.1/scripts/results_browser_use.tsv +2 -0
  93. contextro-0.0.1/scripts/results_indexing_speed.tsv +11 -0
  94. contextro-0.0.1/scripts/results_platform_staging.tsv +2 -0
  95. contextro-0.0.1/scripts/test_tool.sh +30 -0
  96. contextro-0.0.1/scripts/token_benchmark_results.json +16 -0
  97. contextro-0.0.1/setup.py +27 -0
  98. contextro-0.0.1/setup.sh +211 -0
  99. contextro-0.0.1/smithery.yaml +55 -0
  100. contextro-0.0.1/src/contextro_mcp/__init__.py +3 -0
  101. contextro-0.0.1/src/contextro_mcp/accelerator.py +321 -0
  102. contextro-0.0.1/src/contextro_mcp/analysis/__init__.py +0 -0
  103. contextro-0.0.1/src/contextro_mcp/analysis/code_analyzer.py +237 -0
  104. contextro-0.0.1/src/contextro_mcp/config.py +248 -0
  105. contextro-0.0.1/src/contextro_mcp/core/__init__.py +0 -0
  106. contextro-0.0.1/src/contextro_mcp/core/exceptions.py +77 -0
  107. contextro-0.0.1/src/contextro_mcp/core/graph_models.py +276 -0
  108. contextro-0.0.1/src/contextro_mcp/core/interfaces.py +53 -0
  109. contextro-0.0.1/src/contextro_mcp/core/models.py +285 -0
  110. contextro-0.0.1/src/contextro_mcp/engines/__init__.py +0 -0
  111. contextro-0.0.1/src/contextro_mcp/engines/bm25_engine.py +153 -0
  112. contextro-0.0.1/src/contextro_mcp/engines/fusion.py +208 -0
  113. contextro-0.0.1/src/contextro_mcp/engines/graph_engine.py +285 -0
  114. contextro-0.0.1/src/contextro_mcp/engines/live_grep.py +169 -0
  115. contextro-0.0.1/src/contextro_mcp/engines/output_sandbox.py +140 -0
  116. contextro-0.0.1/src/contextro_mcp/engines/query_cache.py +163 -0
  117. contextro-0.0.1/src/contextro_mcp/engines/reranker.py +117 -0
  118. contextro-0.0.1/src/contextro_mcp/engines/vector_engine.py +209 -0
  119. contextro-0.0.1/src/contextro_mcp/execution/__init__.py +11 -0
  120. contextro-0.0.1/src/contextro_mcp/execution/ast_compression.py +187 -0
  121. contextro-0.0.1/src/contextro_mcp/execution/compaction.py +310 -0
  122. contextro-0.0.1/src/contextro_mcp/execution/interfaces.py +31 -0
  123. contextro-0.0.1/src/contextro_mcp/execution/response_policy.py +303 -0
  124. contextro-0.0.1/src/contextro_mcp/execution/runtime.py +73 -0
  125. contextro-0.0.1/src/contextro_mcp/execution/search.py +446 -0
  126. contextro-0.0.1/src/contextro_mcp/formatting/__init__.py +0 -0
  127. contextro-0.0.1/src/contextro_mcp/formatting/response_builder.py +121 -0
  128. contextro-0.0.1/src/contextro_mcp/formatting/token_budget.py +59 -0
  129. contextro-0.0.1/src/contextro_mcp/formatting/toon_encoder.py +81 -0
  130. contextro-0.0.1/src/contextro_mcp/git/__init__.py +1 -0
  131. contextro-0.0.1/src/contextro_mcp/git/branch_watcher.py +258 -0
  132. contextro-0.0.1/src/contextro_mcp/git/commit_indexer.py +564 -0
  133. contextro-0.0.1/src/contextro_mcp/git/cross_repo.py +210 -0
  134. contextro-0.0.1/src/contextro_mcp/indexing/__init__.py +14 -0
  135. contextro-0.0.1/src/contextro_mcp/indexing/chunk_context.py +110 -0
  136. contextro-0.0.1/src/contextro_mcp/indexing/chunker.py +144 -0
  137. contextro-0.0.1/src/contextro_mcp/indexing/embedding_service.py +429 -0
  138. contextro-0.0.1/src/contextro_mcp/indexing/file_discovery.py +95 -0
  139. contextro-0.0.1/src/contextro_mcp/indexing/parallel_indexer.py +141 -0
  140. contextro-0.0.1/src/contextro_mcp/indexing/pipeline.py +865 -0
  141. contextro-0.0.1/src/contextro_mcp/indexing/smart_chunker.py +223 -0
  142. contextro-0.0.1/src/contextro_mcp/memory/__init__.py +0 -0
  143. contextro-0.0.1/src/contextro_mcp/memory/compaction_archive.py +131 -0
  144. contextro-0.0.1/src/contextro_mcp/memory/memory_store.py +305 -0
  145. contextro-0.0.1/src/contextro_mcp/memory/session_tracker.py +146 -0
  146. contextro-0.0.1/src/contextro_mcp/middleware/__init__.py +1 -0
  147. contextro-0.0.1/src/contextro_mcp/middleware/audit.py +83 -0
  148. contextro-0.0.1/src/contextro_mcp/parsing/__init__.py +0 -0
  149. contextro-0.0.1/src/contextro_mcp/parsing/astgrep_parser.py +497 -0
  150. contextro-0.0.1/src/contextro_mcp/parsing/file_watcher.py +162 -0
  151. contextro-0.0.1/src/contextro_mcp/parsing/language_registry.py +274 -0
  152. contextro-0.0.1/src/contextro_mcp/parsing/treesitter_parser.py +421 -0
  153. contextro-0.0.1/src/contextro_mcp/persistence/__init__.py +0 -0
  154. contextro-0.0.1/src/contextro_mcp/persistence/store.py +196 -0
  155. contextro-0.0.1/src/contextro_mcp/research/__init__.py +13 -0
  156. contextro-0.0.1/src/contextro_mcp/research/catalog.py +256 -0
  157. contextro-0.0.1/src/contextro_mcp/schemas/__init__.py +59 -0
  158. contextro-0.0.1/src/contextro_mcp/schemas/inputs.py +218 -0
  159. contextro-0.0.1/src/contextro_mcp/schemas/responses.py +257 -0
  160. contextro-0.0.1/src/contextro_mcp/security/__init__.py +1 -0
  161. contextro-0.0.1/src/contextro_mcp/security/permissions.py +106 -0
  162. contextro-0.0.1/src/contextro_mcp/security/rate_limiter.py +87 -0
  163. contextro-0.0.1/src/contextro_mcp/server.py +3110 -0
  164. contextro-0.0.1/src/contextro_mcp/state.py +283 -0
  165. contextro-0.0.1/tests/__init__.py +0 -0
  166. contextro-0.0.1/tests/conftest.py +153 -0
  167. contextro-0.0.1/tests/test_accelerator.py +213 -0
  168. contextro-0.0.1/tests/test_analyze_tool.py +175 -0
  169. contextro-0.0.1/tests/test_audit.py +122 -0
  170. contextro-0.0.1/tests/test_bm25_engine.py +200 -0
  171. contextro-0.0.1/tests/test_branch_watcher.py +222 -0
  172. contextro-0.0.1/tests/test_chunker.py +237 -0
  173. contextro-0.0.1/tests/test_commit_indexer.py +301 -0
  174. contextro-0.0.1/tests/test_config.py +64 -0
  175. contextro-0.0.1/tests/test_cross_repo.py +237 -0
  176. contextro-0.0.1/tests/test_e2e.py +175 -0
  177. contextro-0.0.1/tests/test_embedding_properties.py +280 -0
  178. contextro-0.0.1/tests/test_exceptions.py +58 -0
  179. contextro-0.0.1/tests/test_explain_tool.py +107 -0
  180. contextro-0.0.1/tests/test_fusion.py +194 -0
  181. contextro-0.0.1/tests/test_git_tools.py +336 -0
  182. contextro-0.0.1/tests/test_graph_engine.py +151 -0
  183. contextro-0.0.1/tests/test_graph_models.py +195 -0
  184. contextro-0.0.1/tests/test_graph_persistence.py +137 -0
  185. contextro-0.0.1/tests/test_graph_tools.py +200 -0
  186. contextro-0.0.1/tests/test_health.py +135 -0
  187. contextro-0.0.1/tests/test_hybrid_search.py +170 -0
  188. contextro-0.0.1/tests/test_impact_tool.py +128 -0
  189. contextro-0.0.1/tests/test_interfaces.py +53 -0
  190. contextro-0.0.1/tests/test_language_registry.py +99 -0
  191. contextro-0.0.1/tests/test_live_grep.py +74 -0
  192. contextro-0.0.1/tests/test_memory_store.py +200 -0
  193. contextro-0.0.1/tests/test_memory_usage.py +77 -0
  194. contextro-0.0.1/tests/test_models.py +258 -0
  195. contextro-0.0.1/tests/test_performance.py +69 -0
  196. contextro-0.0.1/tests/test_permissions.py +207 -0
  197. contextro-0.0.1/tests/test_pipeline.py +307 -0
  198. contextro-0.0.1/tests/test_rate_limiter.py +111 -0
  199. contextro-0.0.1/tests/test_reranker.py +114 -0
  200. contextro-0.0.1/tests/test_research_catalog.py +23 -0
  201. contextro-0.0.1/tests/test_response_builder.py +94 -0
  202. contextro-0.0.1/tests/test_schemas.py +232 -0
  203. contextro-0.0.1/tests/test_search_execution.py +306 -0
  204. contextro-0.0.1/tests/test_security.py +180 -0
  205. contextro-0.0.1/tests/test_smart_chunker.py +206 -0
  206. contextro-0.0.1/tests/test_state.py +59 -0
  207. contextro-0.0.1/tests/test_token_budget.py +92 -0
  208. contextro-0.0.1/tests/test_tool_response_policy.py +133 -0
  209. contextro-0.0.1/tests/test_tools_basic.py +284 -0
  210. contextro-0.0.1/tests/test_treesitter_parser.py +168 -0
  211. contextro-0.0.1/tests/test_trust_remote_code.py +84 -0
  212. contextro-0.0.1/tests/test_vector_engine.py +188 -0
  213. contextro-0.0.1/uv.lock +3192 -0
@@ -0,0 +1,174 @@
1
+ ---
2
+ name: applied-ai-engineer
3
+ description: >
4
+ Use for turning research ideas or agent workflows into robust, benchmarked, observable,
5
+ production-ready systems. Trigger when the user asks to productionize an AI feature, build
6
+ a harness, add evals, improve reliability, reduce regressions, add observability, create a
7
+ rollout plan, improve agent performance through better scaffolding, or convert a promising
8
+ research idea into a safe implementation path. Do not use for pure literature review,
9
+ speculative research with no implementation intent, or trivial code changes.
10
+ when_to_use: >
11
+ Especially useful for harness engineering, evaluator design, benchmark discipline,
12
+ instrumentation, rollout safety, architecture legibility, compaction and resume flows,
13
+ workflow governance, and making agent systems reliable under real constraints.
14
+ metadata:
15
+ version: "1.0.0"
16
+ category: engineering
17
+ tags: [applied-ai, harness, evals, observability, rollout, reliability, benchmarking]
18
+ license: MIT
19
+ ---
20
+
21
+ # Applied AI Engineer
22
+
23
+ You act as the applied AI engineer.
24
+
25
+ Your job is to turn a good idea into a reliable system with guardrails, observability, and a
26
+ repeatable evaluation story.
27
+
28
+ ## Use This Skill To Produce
29
+
30
+ - a concrete implementation path
31
+ - a benchmark or eval harness
32
+ - regression guardrails
33
+ - observability requirements
34
+ - rollout and rollback criteria
35
+ - repository artifacts that make the system legible to future agents
36
+
37
+ ## Method
38
+
39
+ ### 1. Define The Outcome And Constraints
40
+
41
+ Start every task by naming:
42
+
43
+ - user-visible outcome
44
+ - primary metric
45
+ - secondary guardrails
46
+ - hard constraints such as memory, latency, privacy, local-first behavior, and test integrity
47
+
48
+ If the metric is unclear, make it explicit before changing the system.
49
+
50
+ ### 2. Make The System Legible
51
+
52
+ Prefer repository-local artifacts over hidden conversational guidance.
53
+
54
+ Use or improve:
55
+
56
+ - concise top-level instructions
57
+ - structured docs in `docs/`
58
+ - executable benchmark scripts in `scripts/`
59
+ - tests and linters
60
+ - eval definitions
61
+ - stable response shapes and resume artifacts
62
+
63
+ OpenAI's lesson applies here: give the agent a map, not a manual.
64
+
65
+ ### 3. Build The Harness Before Trusting The Change
66
+
67
+ For meaningful AI or retrieval changes, define:
68
+
69
+ - baseline benchmark command
70
+ - realistic task set or eval set
71
+ - deterministic checks where possible
72
+ - evaluator workflow where deterministic checks are insufficient
73
+ - before vs after comparison
74
+
75
+ For Contextro, prefer the existing benchmark surfaces:
76
+
77
+ - `python scripts/benchmark_token_efficiency.py`
78
+ - `python scripts/benchmark_retrieval_quality.py --path src --query-limit 20`
79
+ - `python scripts/benchmark_chunk_profiles.py --path src --query-limit 20`
80
+ - `python scripts/benchmark_disclosure.py`
81
+ - `python scripts/bench_final.py`
82
+ - `pytest -v`
83
+ - `ruff check .`
84
+
85
+ ### 4. Implement The Smallest Enforceable Slice
86
+
87
+ Do not solve a broad problem with a large rewrite unless the harness proves you need one.
88
+
89
+ Prefer:
90
+
91
+ - one clear invariant at a time
92
+ - one benchmarked change at a time
93
+ - thin entrypoints with logic moved into focused modules
94
+ - explicit boundaries between orchestration, state, formatting, and domain logic
95
+ - structure that can be tested and observed
96
+ - reusable system surfaces over prompt-only behavior
97
+
98
+ ### 5. Add Observability And Recovery
99
+
100
+ If a system can fail, drift, or regress, add the signals that reveal it:
101
+
102
+ - metrics
103
+ - logs
104
+ - traces or event records
105
+ - resume and compaction artifacts
106
+ - stable prefixes for cache-friendly outputs
107
+ - searchable history when long-running tasks matter
108
+
109
+ Devin and DeepSeek patterns both matter here: realistic feedback loops and resumable trajectories.
110
+
111
+ ### 6. Validate Before You Ship
112
+
113
+ Every significant change should have:
114
+
115
+ - test result
116
+ - benchmark result
117
+ - regression guardrail status
118
+ - failure-mode review
119
+ - rollback plan
120
+
121
+ Do not trade away correctness or maintainability for a single benchmark win.
122
+
123
+ ### 7. Encode Taste Into The Repo
124
+
125
+ If a human review comment is likely to recur, turn it into one of:
126
+
127
+ - documentation
128
+ - a lint or test
129
+ - a benchmark assertion
130
+ - an eval case
131
+ - an explicit workflow rule
132
+
133
+ The goal is not to keep fixing the same thing manually.
134
+
135
+ ## Company Patterns To Reuse
136
+
137
+ - OpenAI: harness engineering, repo-local system of record, architecture legibility, enforceable invariants
138
+ - Anthropic: smallest high-signal context, progressive disclosure, explicit long-running-agent support
139
+ - Cursor: keep implementation tightly coupled to codebase retrieval, fast local iteration, and low-friction edits
140
+ - Windsurf: pair planning with execution, preserve working state across long tasks, and keep agent actions IDE-aware
141
+ - Mistral: use efficient model/task routing and modular context slices before reaching for heavier system complexity
142
+ - Devin and Cognition: realistic environments, evaluator loops, autonomous feedback, environment-aware critique
143
+ - NVIDIA: benchmark the whole pipeline, not just one subcomponent
144
+ - DeepSeek: checkpointing, cache-aware structure, trajectory logging, resumability
145
+
146
+ ## Output Format
147
+
148
+ Return results in this order:
149
+
150
+ 1. `Outcome and metric`
151
+ 2. `Constraints`
152
+ 3. `Current baseline`
153
+ 4. `Implementation plan`
154
+ 5. `Harness and eval plan`
155
+ 6. `Observability and guardrails`
156
+ 7. `Rollout and rollback`
157
+
158
+ ## Anti-Patterns
159
+
160
+ - Do not ship AI behavior with no evals.
161
+ - Do not benchmark one metric while ignoring tests, latency, memory, or user-visible regressions.
162
+ - Do not rely on giant instruction blobs when code, docs, lint, or evals can enforce the behavior.
163
+ - Do not hide critical workflow knowledge only in chat.
164
+ - Do not choose architectural rewrites before testing smaller enforceable changes.
165
+
166
+ ## Handoff Rule
167
+
168
+ - use `breakthrough-researcher` when the solution space is still unclear
169
+ - use `autoresearch` when the metric and experiment loop are already defined and ready to run autonomously
170
+
171
+ ## References
172
+
173
+ - Engineering patterns: `references/engineering-patterns.md`
174
+ - Skill eval rubric: `references/eval-rubric.md`
@@ -0,0 +1,286 @@
1
+ {
2
+ "skill_name": "applied-ai-engineer",
3
+ "version": "1.1.0",
4
+ "description": "Eval suite for the applied AI engineering skill. Measures trigger quality, harness-first engineering, observability discipline, benchmark and eval rigor, and production-safe implementation planning.",
5
+ "categories": {
6
+ "triggering": "Skill loads for productionization and reliability work",
7
+ "workflow": "Skill follows metric, baseline, harness, implementation, and validation order",
8
+ "evals": "Skill insists on benchmarks or evaluator design before trusting changes",
9
+ "observability": "Skill adds signals and recovery paths for long-running systems",
10
+ "anti_pattern": "Skill avoids shipping without guardrails or using prompt blobs as system design",
11
+ "output_quality": "Skill returns rollout, rollback, and concrete enforcement mechanisms"
12
+ },
13
+ "evals": [
14
+ {
15
+ "id": 1,
16
+ "name": "trigger-productionize-agent-feature",
17
+ "category": "triggering",
18
+ "prompt": "We have a promising idea for a searchable compaction archive. I need an applied AI engineer plan to productionize it safely.",
19
+ "should_trigger": true,
20
+ "expected_behavior": "Skill loads and frames the task as harness, implementation, observability, and rollout work.",
21
+ "assertions": [
22
+ {
23
+ "id": "a1",
24
+ "text": "skill triggers for productionization request",
25
+ "type": "routing",
26
+ "passing_condition": "Skill loads for productionize and safely ship phrasing"
27
+ }
28
+ ]
29
+ },
30
+ {
31
+ "id": 2,
32
+ "name": "trigger-build-harness",
33
+ "category": "triggering",
34
+ "prompt": "Build me a proper harness and eval plan for improving Contextro's retrieval pipeline.",
35
+ "should_trigger": true,
36
+ "expected_behavior": "Skill loads for harness and evaluation design.",
37
+ "assertions": [
38
+ {
39
+ "id": "a1",
40
+ "text": "skill triggers for harness work",
41
+ "type": "routing",
42
+ "passing_condition": "Skill loads for harness and eval request"
43
+ }
44
+ ]
45
+ },
46
+ {
47
+ "id": 3,
48
+ "name": "no-trigger-pure-research",
49
+ "category": "triggering",
50
+ "prompt": "Research the most novel ideas in long-context memory systems and summarize the papers.",
51
+ "should_trigger": false,
52
+ "expected_behavior": "Skill does not load for pure literature review with no implementation intent.",
53
+ "assertions": [
54
+ {
55
+ "id": "a1",
56
+ "text": "skill does not trigger for pure research",
57
+ "type": "routing",
58
+ "passing_condition": "Skill does not load for literature-only request"
59
+ }
60
+ ]
61
+ },
62
+ {
63
+ "id": 4,
64
+ "name": "no-trigger-trivial-edit",
65
+ "category": "triggering",
66
+ "prompt": "Add a comment to the SearchEngine class.",
67
+ "should_trigger": false,
68
+ "expected_behavior": "Skill does not load for trivial code edits.",
69
+ "assertions": [
70
+ {
71
+ "id": "a1",
72
+ "text": "skill does not trigger for trivial edit",
73
+ "type": "routing",
74
+ "passing_condition": "Skill does not load for small direct edit"
75
+ }
76
+ ]
77
+ },
78
+ {
79
+ "id": 5,
80
+ "name": "workflow-metric-first",
81
+ "category": "workflow",
82
+ "prompt": "Make Contextro's compaction system better.",
83
+ "expected_behavior": "Skill starts by naming user-visible outcome, metric, and constraints before implementation steps.",
84
+ "assertions": [
85
+ {
86
+ "id": "a1",
87
+ "text": "metric and constraints come first",
88
+ "type": "workflow",
89
+ "passing_condition": "Response begins with outcome, metric, or constraints before implementation details"
90
+ }
91
+ ]
92
+ },
93
+ {
94
+ "id": 6,
95
+ "name": "workflow-baseline-before-change",
96
+ "category": "workflow",
97
+ "prompt": "Plan the engineering work to reduce token output in search responses.",
98
+ "expected_behavior": "Skill establishes current benchmark command and baseline before proposing changes.",
99
+ "assertions": [
100
+ {
101
+ "id": "a1",
102
+ "text": "baseline is established",
103
+ "type": "workflow",
104
+ "passing_condition": "Response includes benchmark command and current baseline before proposed changes"
105
+ }
106
+ ]
107
+ },
108
+ {
109
+ "id": 7,
110
+ "name": "evals-harness-before-trust",
111
+ "category": "evals",
112
+ "prompt": "We think AST-aware compression will help. Give me the implementation plan.",
113
+ "expected_behavior": "Skill includes a benchmark or eval harness before trusting the improvement.",
114
+ "assertions": [
115
+ {
116
+ "id": "a1",
117
+ "text": "benchmark or eval plan is included",
118
+ "type": "evals",
119
+ "passing_condition": "Response includes a benchmark command, eval set, or before-vs-after comparison plan"
120
+ }
121
+ ]
122
+ },
123
+ {
124
+ "id": 8,
125
+ "name": "evals-realistic-environment",
126
+ "category": "evals",
127
+ "prompt": "How would you validate a new cross-repo search workflow?",
128
+ "expected_behavior": "Skill prefers realistic task evaluation over only synthetic microbenchmarks.",
129
+ "assertions": [
130
+ {
131
+ "id": "a1",
132
+ "text": "realistic evaluation is included",
133
+ "type": "evals",
134
+ "passing_condition": "Response includes realistic task scenarios or evaluator-style checks, not only isolated microbenchmarks"
135
+ }
136
+ ]
137
+ },
138
+ {
139
+ "id": 9,
140
+ "name": "observability-signals-added",
141
+ "category": "observability",
142
+ "prompt": "Design the engineering plan for a long-running agent feature that may regress over time.",
143
+ "expected_behavior": "Skill adds logs, metrics, traces, or equivalent signals plus resume or recovery artifacts.",
144
+ "assertions": [
145
+ {
146
+ "id": "a1",
147
+ "text": "signals are proposed",
148
+ "type": "observability",
149
+ "passing_condition": "Response includes logs, metrics, traces, event records, or resume artifacts"
150
+ }
151
+ ]
152
+ },
153
+ {
154
+ "id": 10,
155
+ "name": "observability-resume-and-recovery",
156
+ "category": "observability",
157
+ "prompt": "How would you engineer Contextro to survive compaction and long tasks more reliably?",
158
+ "expected_behavior": "Skill includes resume, checkpoint, archive, or trajectory concepts.",
159
+ "assertions": [
160
+ {
161
+ "id": "a1",
162
+ "text": "resume or checkpoint path is included",
163
+ "type": "observability",
164
+ "passing_condition": "Response includes checkpointing, session packets, searchable archive, or trajectory logging"
165
+ }
166
+ ]
167
+ },
168
+ {
169
+ "id": 11,
170
+ "name": "anti-pattern-no-ship-without-evals",
171
+ "category": "anti_pattern",
172
+ "prompt": "We don't need benchmarks. Just implement searchable archive and ship it.",
173
+ "expected_behavior": "Skill rejects shipping without evaluation.",
174
+ "assertions": [
175
+ {
176
+ "id": "a1",
177
+ "text": "shipping without evals is rejected",
178
+ "type": "anti_pattern",
179
+ "passing_condition": "Response refuses to skip benchmark or eval validation"
180
+ }
181
+ ]
182
+ },
183
+ {
184
+ "id": 12,
185
+ "name": "anti-pattern-no-single-metric-blindness",
186
+ "category": "anti_pattern",
187
+ "prompt": "If latency drops by 20%, I don't care if tests fail or memory doubles.",
188
+ "expected_behavior": "Skill rejects single-metric optimization that ignores guardrails.",
189
+ "assertions": [
190
+ {
191
+ "id": "a1",
192
+ "text": "guardrails are defended",
193
+ "type": "anti_pattern",
194
+ "passing_condition": "Response rejects ignoring tests, memory, or user-visible regressions"
195
+ }
196
+ ]
197
+ },
198
+ {
199
+ "id": 13,
200
+ "name": "anti-pattern-no-giant-prompt-manual",
201
+ "category": "anti_pattern",
202
+ "prompt": "Let's solve agent reliability by writing a massive 5000-line AGENTS file.",
203
+ "expected_behavior": "Skill rejects giant prompt manuals and prefers enforceable artifacts.",
204
+ "assertions": [
205
+ {
206
+ "id": "a1",
207
+ "text": "prompt blob solution is rejected",
208
+ "type": "anti_pattern",
209
+ "passing_condition": "Response recommends concise maps, docs, lints, tests, or evals instead of giant manuals"
210
+ }
211
+ ]
212
+ },
213
+ {
214
+ "id": 14,
215
+ "name": "output-quality-rollout-and-rollback",
216
+ "category": "output_quality",
217
+ "prompt": "Give me the implementation plan for a new memory feature.",
218
+ "expected_behavior": "Skill includes rollout and rollback criteria.",
219
+ "assertions": [
220
+ {
221
+ "id": "a1",
222
+ "text": "rollout and rollback are included",
223
+ "type": "output_quality",
224
+ "passing_condition": "Response includes rollout, rollback, or failure recovery criteria"
225
+ }
226
+ ]
227
+ },
228
+ {
229
+ "id": 15,
230
+ "name": "output-quality-enforcement-artifacts",
231
+ "category": "output_quality",
232
+ "prompt": "How do we make agent behavior consistent across future sessions?",
233
+ "expected_behavior": "Skill prefers enforceable repo artifacts over repeated human reminding.",
234
+ "assertions": [
235
+ {
236
+ "id": "a1",
237
+ "text": "enforcement mechanisms are proposed",
238
+ "type": "output_quality",
239
+ "passing_condition": "Response includes docs, lint, tests, evals, rules, or hooks as enforcement mechanisms"
240
+ }
241
+ ]
242
+ },
243
+ {
244
+ "id": 16,
245
+ "name": "workflow-modular-slice-before-rewrite",
246
+ "category": "workflow",
247
+ "prompt": "The FastMCP server is getting messy. Let's rewrite half the codebase and move everything around while we add searchable archive.",
248
+ "expected_behavior": "Skill rejects the broad rewrite impulse and instead proposes a small, benchmarked slice with explicit module boundaries.",
249
+ "assertions": [
250
+ {
251
+ "id": "a1",
252
+ "text": "small enforceable slice is preferred",
253
+ "type": "workflow",
254
+ "passing_condition": "Response proposes a minimal incremental slice before any broad rewrite"
255
+ },
256
+ {
257
+ "id": "a2",
258
+ "text": "module boundaries are named",
259
+ "type": "workflow",
260
+ "passing_condition": "Response calls out boundaries such as entrypoint vs domain logic, formatting, state, or orchestration"
261
+ }
262
+ ]
263
+ },
264
+ {
265
+ "id": 17,
266
+ "name": "output-quality-thin-entrypoint-plan",
267
+ "category": "output_quality",
268
+ "prompt": "Plan the engineering work for a new MCP feature without letting server.py become the dumping ground.",
269
+ "expected_behavior": "Skill returns a plan that keeps entrypoints thin and routes reusable logic into focused modules with verification.",
270
+ "assertions": [
271
+ {
272
+ "id": "a1",
273
+ "text": "thin entrypoint approach included",
274
+ "type": "output_quality",
275
+ "passing_condition": "Response says to keep the entrypoint thin or avoid putting all feature logic in server.py"
276
+ },
277
+ {
278
+ "id": "a2",
279
+ "text": "verification remains part of the modularization plan",
280
+ "type": "output_quality",
281
+ "passing_condition": "Response includes tests, benchmark, or eval checks alongside the modular implementation plan"
282
+ }
283
+ ]
284
+ }
285
+ ]
286
+ }
@@ -0,0 +1,59 @@
1
+ # Applied AI Engineering Patterns
2
+
3
+ This reference captures the repeated engineering patterns behind strong AI product teams.
4
+
5
+ ## OpenAI
6
+
7
+ - The engineering role shifts from manual coding to systems, scaffolding, and leverage.
8
+ - Keep top-level instructions short and use the repository as the system of record.
9
+ - Enforce architecture and taste through tooling, not repeated review comments.
10
+ - Favor legibility, strict boundaries, and mechanically checked invariants.
11
+
12
+ ## Anthropic
13
+
14
+ - Use the smallest high-signal context that still solves the task.
15
+ - Make long-running work resumable with explicit artifacts.
16
+ - Use progressive disclosure to avoid flooding the model with low-value detail.
17
+
18
+ ## Cursor
19
+
20
+ - Keep the coding loop close to the local codebase: retrieve, inspect, edit, verify.
21
+ - Reduce friction between research, implementation, and validation so good ideas survive contact with the repo.
22
+ - Prefer precise context targeting over broad prompt stuffing.
23
+
24
+ ## Windsurf
25
+
26
+ - Treat agent engineering as a coordinated plan-plus-execution system, not only a chat interaction.
27
+ - Keep visible intermediate state so long tasks can recover without losing intent.
28
+ - Make the environment, tools, and execution status legible enough for iterative autonomous work.
29
+
30
+ ## Mistral
31
+
32
+ - Efficient model usage depends on good routing, compact context, and clean task decomposition.
33
+ - Smaller passes with strong structure can outperform a single large opaque pass.
34
+ - System quality comes from orchestration and interfaces, not only raw model size.
35
+
36
+ ## Devin And Cognition
37
+
38
+ - Prefer realistic task environments over abstract unit-only evaluation.
39
+ - Use autonomous evaluator flows when deterministic checks are insufficient.
40
+ - Store external notes and environment state so long tasks can resume cleanly.
41
+
42
+ ## NVIDIA
43
+
44
+ - Measure the entire RAG system: chunking, retrieval, reranking, shaping, latency, memory.
45
+ - The best architecture on paper is not the best architecture until it wins on the target corpus.
46
+
47
+ ## DeepSeek
48
+
49
+ - Long-horizon systems benefit from checkpointing and resumable trajectories.
50
+ - Stable prompt structure improves reuse and efficiency.
51
+ - Preserve the useful state for tool-calling loops without replaying everything.
52
+
53
+ ## What This Means For Contextro
54
+
55
+ - The next gains should come from harness quality, workflow control, observability, and resume flows.
56
+ - Research ideas should be translated into benchmarked, enforceable repo artifacts.
57
+ - New behavior should land with tests, evals, metrics, and a rollback story.
58
+ - Implementation patterns from Cursor, Windsurf, and Mistral should be translated into repo-local
59
+ harnesses, workflow state, and efficient context/task routing rather than copied superficially.
@@ -0,0 +1,18 @@
1
+ # Applied AI Engineer Eval Rubric
2
+
3
+ ## The skill passes when it:
4
+
5
+ - triggers for productionization, harness, eval, observability, rollout, or reliability work
6
+ - does not trigger for pure research or trivial code edits
7
+ - defines metric, constraints, baseline, and guardrails
8
+ - proposes a concrete harness or evaluation method
9
+ - includes observability and rollback thinking
10
+ - prefers enforceable repository artifacts over prompt-only guidance
11
+
12
+ ## The skill fails when it:
13
+
14
+ - ships changes without evals or benchmarks
15
+ - optimizes a single metric while ignoring regressions
16
+ - turns into a pure literature-review skill
17
+ - recommends big rewrites before smaller benchmarked slices
18
+ - omits rollout, rollback, or failure detection