agent-devkit 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. package/README.md +48 -6
  2. package/bin/agent.mjs +133 -7
  3. package/package.json +1 -1
  4. package/runtime/README.md +187 -5
  5. package/runtime/agent +31 -5
  6. package/runtime/agents/README.md +18 -0
  7. package/runtime/agents/contribution-reviewer/AGENTS.md +8 -0
  8. package/runtime/agents/contribution-reviewer/README.md +8 -0
  9. package/runtime/agents/contribution-reviewer/agent.yaml +40 -0
  10. package/runtime/agents/contribution-reviewer/capabilities/plan-contribution-pr/capability.yaml +27 -0
  11. package/runtime/agents/contribution-reviewer/capabilities/plan-contribution-pr/decision-rules.md +5 -0
  12. package/runtime/agents/contribution-reviewer/capabilities/plan-contribution-pr/workflow.md +6 -0
  13. package/runtime/agents/contribution-reviewer/capabilities/review-contribution/capability.yaml +25 -0
  14. package/runtime/agents/contribution-reviewer/capabilities/review-contribution/decision-rules.md +5 -0
  15. package/runtime/agents/contribution-reviewer/capabilities/review-contribution/workflow.md +5 -0
  16. package/runtime/agents/contribution-reviewer/capabilities/validate-local-contribution/capability.yaml +26 -0
  17. package/runtime/agents/contribution-reviewer/capabilities/validate-local-contribution/decision-rules.md +5 -0
  18. package/runtime/agents/contribution-reviewer/capabilities/validate-local-contribution/workflow.md +6 -0
  19. package/runtime/agents/contribution-reviewer/infra/README.md +6 -0
  20. package/runtime/agents/contribution-reviewer/knowledge/context.md +8 -0
  21. package/runtime/agents/contribution-reviewer/knowledge/system.md +8 -0
  22. package/runtime/agents/contribution-reviewer/templates/README.md +3 -0
  23. package/runtime/agents/knowledge-author/AGENTS.md +7 -0
  24. package/runtime/agents/knowledge-author/README.md +7 -0
  25. package/runtime/agents/knowledge-author/agent.yaml +37 -0
  26. package/runtime/agents/knowledge-author/capabilities/create-knowledge-snapshot/capability.yaml +30 -0
  27. package/runtime/agents/knowledge-author/capabilities/create-knowledge-snapshot/decision-rules.md +6 -0
  28. package/runtime/agents/knowledge-author/capabilities/create-knowledge-snapshot/workflow.md +7 -0
  29. package/runtime/agents/knowledge-author/infra/.gitkeep +1 -0
  30. package/runtime/agents/knowledge-author/knowledge/context.md +4 -0
  31. package/runtime/agents/knowledge-author/knowledge/system.md +4 -0
  32. package/runtime/agents/knowledge-author/templates/.gitkeep +1 -0
  33. package/runtime/agents/knowledge-curator/AGENTS.md +7 -0
  34. package/runtime/agents/knowledge-curator/README.md +6 -0
  35. package/runtime/agents/knowledge-curator/agent.yaml +37 -0
  36. package/runtime/agents/knowledge-curator/capabilities/curate-knowledge-base/capability.yaml +29 -0
  37. package/runtime/agents/knowledge-curator/capabilities/curate-knowledge-base/decision-rules.md +6 -0
  38. package/runtime/agents/knowledge-curator/capabilities/curate-knowledge-base/workflow.md +7 -0
  39. package/runtime/agents/knowledge-curator/infra/.gitkeep +1 -0
  40. package/runtime/agents/knowledge-curator/knowledge/context.md +4 -0
  41. package/runtime/agents/knowledge-curator/knowledge/system.md +4 -0
  42. package/runtime/agents/knowledge-curator/templates/.gitkeep +1 -0
  43. package/runtime/agents/knowledge-infra-builder/AGENTS.md +8 -0
  44. package/runtime/agents/knowledge-infra-builder/README.md +8 -0
  45. package/runtime/agents/knowledge-infra-builder/agent.yaml +38 -0
  46. package/runtime/agents/knowledge-infra-builder/capabilities/create-knowledge-base/capability.yaml +30 -0
  47. package/runtime/agents/knowledge-infra-builder/capabilities/create-knowledge-base/decision-rules.md +6 -0
  48. package/runtime/agents/knowledge-infra-builder/capabilities/create-knowledge-base/workflow.md +7 -0
  49. package/runtime/agents/knowledge-infra-builder/infra/.gitkeep +1 -0
  50. package/runtime/agents/knowledge-infra-builder/knowledge/context.md +4 -0
  51. package/runtime/agents/knowledge-infra-builder/knowledge/system.md +4 -0
  52. package/runtime/agents/knowledge-infra-builder/templates/.gitkeep +1 -0
  53. package/runtime/agents/knowledge-owner/AGENTS.md +7 -0
  54. package/runtime/agents/knowledge-owner/README.md +6 -0
  55. package/runtime/agents/knowledge-owner/agent.yaml +37 -0
  56. package/runtime/agents/knowledge-owner/capabilities/publish-knowledge-snapshot/capability.yaml +28 -0
  57. package/runtime/agents/knowledge-owner/capabilities/publish-knowledge-snapshot/decision-rules.md +6 -0
  58. package/runtime/agents/knowledge-owner/capabilities/publish-knowledge-snapshot/workflow.md +7 -0
  59. package/runtime/agents/knowledge-owner/infra/.gitkeep +1 -0
  60. package/runtime/agents/knowledge-owner/knowledge/context.md +4 -0
  61. package/runtime/agents/knowledge-owner/knowledge/system.md +4 -0
  62. package/runtime/agents/knowledge-owner/templates/.gitkeep +1 -0
  63. package/runtime/agents/knowledge-reviewer/AGENTS.md +7 -0
  64. package/runtime/agents/knowledge-reviewer/README.md +7 -0
  65. package/runtime/agents/knowledge-reviewer/agent.yaml +36 -0
  66. package/runtime/agents/knowledge-reviewer/capabilities/review-knowledge-snapshot/capability.yaml +26 -0
  67. package/runtime/agents/knowledge-reviewer/capabilities/review-knowledge-snapshot/decision-rules.md +6 -0
  68. package/runtime/agents/knowledge-reviewer/capabilities/review-knowledge-snapshot/workflow.md +7 -0
  69. package/runtime/agents/knowledge-reviewer/infra/.gitkeep +1 -0
  70. package/runtime/agents/knowledge-reviewer/knowledge/context.md +4 -0
  71. package/runtime/agents/knowledge-reviewer/knowledge/system.md +4 -0
  72. package/runtime/agents/knowledge-reviewer/templates/.gitkeep +1 -0
  73. package/runtime/agents/local-memory-manager/AGENTS.md +5 -0
  74. package/runtime/agents/local-memory-manager/README.md +7 -0
  75. package/runtime/agents/local-memory-manager/agent.yaml +38 -0
  76. package/runtime/agents/local-memory-manager/capabilities/curate-local-memory/capability.yaml +19 -0
  77. package/runtime/agents/local-memory-manager/capabilities/curate-local-memory/decision-rules.md +5 -0
  78. package/runtime/agents/local-memory-manager/capabilities/curate-local-memory/workflow.md +6 -0
  79. package/runtime/agents/local-memory-manager/capabilities/inspect-local-memory/capability.yaml +19 -0
  80. package/runtime/agents/local-memory-manager/capabilities/inspect-local-memory/decision-rules.md +5 -0
  81. package/runtime/agents/local-memory-manager/capabilities/inspect-local-memory/workflow.md +5 -0
  82. package/runtime/agents/local-memory-manager/infra/.gitkeep +1 -0
  83. package/runtime/agents/local-memory-manager/knowledge/context.md +4 -0
  84. package/runtime/agents/local-memory-manager/knowledge/system.md +4 -0
  85. package/runtime/agents/local-memory-manager/templates/.gitkeep +1 -0
  86. package/runtime/agents/memory-sync-manager/AGENTS.md +7 -0
  87. package/runtime/agents/memory-sync-manager/README.md +7 -0
  88. package/runtime/agents/memory-sync-manager/agent.yaml +37 -0
  89. package/runtime/agents/memory-sync-manager/capabilities/plan-memory-backup/capability.yaml +29 -0
  90. package/runtime/agents/memory-sync-manager/capabilities/plan-memory-backup/decision-rules.md +6 -0
  91. package/runtime/agents/memory-sync-manager/capabilities/plan-memory-backup/workflow.md +7 -0
  92. package/runtime/agents/memory-sync-manager/infra/.gitkeep +1 -0
  93. package/runtime/agents/memory-sync-manager/knowledge/context.md +4 -0
  94. package/runtime/agents/memory-sync-manager/knowledge/system.md +4 -0
  95. package/runtime/agents/memory-sync-manager/templates/.gitkeep +1 -0
  96. package/runtime/agents/shared-memory-curator/AGENTS.md +5 -0
  97. package/runtime/agents/shared-memory-curator/README.md +6 -0
  98. package/runtime/agents/shared-memory-curator/agent.yaml +38 -0
  99. package/runtime/agents/shared-memory-curator/capabilities/create-shared-memory/capability.yaml +19 -0
  100. package/runtime/agents/shared-memory-curator/capabilities/create-shared-memory/decision-rules.md +5 -0
  101. package/runtime/agents/shared-memory-curator/capabilities/create-shared-memory/workflow.md +5 -0
  102. package/runtime/agents/shared-memory-curator/capabilities/publish-shared-submission/capability.yaml +19 -0
  103. package/runtime/agents/shared-memory-curator/capabilities/publish-shared-submission/decision-rules.md +5 -0
  104. package/runtime/agents/shared-memory-curator/capabilities/publish-shared-submission/workflow.md +5 -0
  105. package/runtime/agents/shared-memory-curator/capabilities/review-shared-submission/capability.yaml +19 -0
  106. package/runtime/agents/shared-memory-curator/capabilities/review-shared-submission/decision-rules.md +5 -0
  107. package/runtime/agents/shared-memory-curator/capabilities/review-shared-submission/workflow.md +5 -0
  108. package/runtime/agents/shared-memory-curator/infra/.gitkeep +1 -0
  109. package/runtime/agents/shared-memory-curator/knowledge/context.md +5 -0
  110. package/runtime/agents/shared-memory-curator/knowledge/system.md +4 -0
  111. package/runtime/agents/shared-memory-curator/templates/.gitkeep +1 -0
  112. package/runtime/cli/README.md +35 -4
  113. package/runtime/cli/aikit/__init__.py +1 -1
  114. package/runtime/cli/aikit/agent_registry.py +4 -2
  115. package/runtime/cli/aikit/agentic_commands.py +158 -0
  116. package/runtime/cli/aikit/app_home.py +1 -0
  117. package/runtime/cli/aikit/audit.py +16 -6
  118. package/runtime/cli/aikit/catalog.py +278 -8
  119. package/runtime/cli/aikit/cli_dispatch.py +489 -13
  120. package/runtime/cli/aikit/cli_parser.py +145 -7
  121. package/runtime/cli/aikit/contribution.py +132 -2
  122. package/runtime/cli/aikit/doctor_runtime.py +85 -0
  123. package/runtime/cli/aikit/eval.py +356 -10
  124. package/runtime/cli/aikit/human_output.py +310 -4
  125. package/runtime/cli/aikit/interactive_wizard.py +148 -0
  126. package/runtime/cli/aikit/knowledge_base.py +1067 -0
  127. package/runtime/cli/aikit/llm.py +12 -4
  128. package/runtime/cli/aikit/local_artifacts.py +444 -0
  129. package/runtime/cli/aikit/local_llm.py +161 -0
  130. package/runtime/cli/aikit/main.py +15 -0
  131. package/runtime/cli/aikit/mcp_manifest.py +798 -0
  132. package/runtime/cli/aikit/mcp_tools.py +643 -5
  133. package/runtime/cli/aikit/memory.py +405 -0
  134. package/runtime/cli/aikit/mini_brain.py +20 -1
  135. package/runtime/cli/aikit/natural_prompt_runtime.py +125 -1
  136. package/runtime/cli/aikit/ollama.py +64 -15
  137. package/runtime/cli/aikit/onboarding.py +551 -0
  138. package/runtime/cli/aikit/output.py +67 -0
  139. package/runtime/cli/aikit/prompt_injection.py +12 -1
  140. package/runtime/cli/aikit/roadmap_cli.py +1 -1
  141. package/runtime/cli/aikit/secrets.py +3 -2
  142. package/runtime/cli/aikit/setup_wizard_payload.py +3 -0
  143. package/runtime/cli/aikit/shared_memory.py +415 -0
  144. package/runtime/cli/aikit/specialist_readiness.py +152 -0
  145. package/runtime/cli/aikit/tasks.py +104 -1
  146. package/runtime/cli/aikit/team.py +380 -0
  147. package/runtime/cli/aikit/toolchain.py +7 -2
  148. package/runtime/cli/aikit/workflows.py +115 -14
  149. package/runtime/providers/knowledge-github.yaml +40 -0
  150. package/runtime/providers/knowledge-google-drive.yaml +32 -0
  151. package/runtime/providers/knowledge-local.yaml +26 -0
  152. package/runtime/providers/knowledge-notion.yaml +32 -0
  153. package/runtime/providers/knowledge-obsidian.yaml +24 -0
  154. package/runtime/providers/knowledge-onedrive.yaml +36 -0
  155. package/runtime/providers/knowledge-s3.yaml +45 -0
  156. package/runtime/providers/knowledge-sharepoint.yaml +39 -0
  157. package/runtime/providers/knowledge-supabase.yaml +43 -0
  158. package/runtime/providers/knowledge-vector.yaml +39 -0
  159. package/runtime/requirements.txt +6 -0
  160. package/runtime/scripts/docker-cli-qa.sh +453 -0
  161. package/runtime/scripts/release-catalog-snapshot.json +55 -4
  162. package/runtime/scripts/release-gate.py +54 -13
  163. package/runtime/tooling/toolchain.yaml +92 -0
  164. package/runtime/vendor/skills/napkin/napkin.md +21 -7
  165. package/runtime/workflows/azure-card-analysis/README.md +3 -0
  166. package/runtime/workflows/azure-card-analysis/workflow.yaml +30 -0
  167. package/runtime/workflows/daily-pr-review/README.md +3 -0
  168. package/runtime/workflows/daily-pr-review/workflow.yaml +31 -0
  169. package/runtime/workflows/incident-analysis/README.md +3 -0
  170. package/runtime/workflows/incident-analysis/workflow.yaml +33 -0
  171. package/runtime/workflows/release-prep/README.md +3 -0
  172. package/runtime/workflows/release-prep/workflow.yaml +30 -0
@@ -3,25 +3,51 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import json
6
+ import re
7
+ import tempfile
6
8
  from datetime import datetime, timezone
7
9
  from pathlib import Path
8
10
  from typing import Any, Callable
9
11
 
12
+ from cli.aikit.agentic_commands import agentic_plan
13
+ from cli.aikit.app_home import app_path, ensure_app_home
10
14
  from cli.aikit.catalog import catalog_search
15
+ from cli.aikit.configuration_orchestrator import provider_setup_wizard
16
+ from cli.aikit.contribution import contribution_pr
17
+ from cli.aikit.extensions import local_extensions_list
18
+ from cli.aikit.identity import identity_system_prompt
11
19
  from cli.aikit.mcp_manifest import mcp_tools
20
+ from cli.aikit.model_router import build_model_plan
21
+ from cli.aikit.providers import load_providers
12
22
  from cli.aikit.prompt_injection import prompt_injection_eval_fixture
23
+ from cli.aikit.review_gate import build_review_gate
13
24
  from cli.aikit.router_explain import explain_route
14
25
  from cli.aikit.runtime_paths import ROOT
26
+ from cli.aikit.secrets import secrets_doctor
27
+ from cli.aikit.sources import list_sources
28
+ from cli.aikit.workflows import workflow_list
15
29
 
16
30
 
17
31
  EVAL_SCHEMA_VERSION = "agent-devkit.eval/v1"
18
32
  SUITES = (
19
33
  "routing",
20
34
  "catalog",
35
+ "wizard",
36
+ "source_config",
37
+ "identity_enforcement",
38
+ "review_gate",
39
+ "model_router",
40
+ "agentic_plan",
21
41
  "write_policy",
22
42
  "source_readiness",
23
43
  "mcp",
24
44
  "mcp_contract",
45
+ "workflow_contract",
46
+ "extension_contract",
47
+ "contribution_contract",
48
+ "team_contract",
49
+ "knowledge_contract",
50
+ "secret_refs",
25
51
  "prompt-injection",
26
52
  "prompt_injection",
27
53
  "mini_brain_limits",
@@ -42,15 +68,28 @@ def eval_run(suite: str, root: Path | None = None) -> dict[str, Any]:
42
68
  root = root or ROOT
43
69
  suite = normalize_suite_id(suite)
44
70
  if suite == "all":
71
+ started_at = datetime.now(timezone.utc)
45
72
  runs = [eval_run(item, root) for item in canonical_suite_ids()]
46
73
  status = "passed" if all(item["status"] == "passed" for item in runs) else "failed"
47
- return run_payload("all", status, runs)
74
+ return persist_run(run_payload("all", status, runs, started_at=started_at))
48
75
  handlers: dict[str, Callable[[Path], list[dict[str, Any]]]] = {
49
76
  "routing": eval_routing,
50
77
  "catalog": eval_catalog,
78
+ "wizard": eval_wizard,
79
+ "source_config": eval_source_config,
80
+ "identity_enforcement": eval_identity_enforcement,
81
+ "review_gate": eval_review_gate,
82
+ "model_router": eval_model_router,
83
+ "agentic_plan": eval_agentic_plan,
51
84
  "write_policy": eval_write_policy,
52
85
  "source_readiness": eval_source_readiness,
53
86
  "mcp_contract": eval_mcp_contract,
87
+ "workflow_contract": eval_workflow_contract,
88
+ "extension_contract": eval_extension_contract,
89
+ "contribution_contract": eval_contribution_contract,
90
+ "team_contract": eval_team_contract,
91
+ "knowledge_contract": eval_knowledge_contract,
92
+ "secret_refs": eval_secret_refs,
54
93
  "prompt_injection": eval_prompt_injection,
55
94
  "mini_brain_limits": eval_mini_brain_limits,
56
95
  "generated_agent_contract": eval_generated_agent_contract,
@@ -58,18 +97,29 @@ def eval_run(suite: str, root: Path | None = None) -> dict[str, Any]:
58
97
  handler = handlers.get(suite)
59
98
  if not handler:
60
99
  raise ValueError(f"unknown eval suite: {suite}")
100
+ started_at = datetime.now(timezone.utc)
61
101
  checks = handler(root)
62
102
  status = "passed" if all(item.get("status") == "passed" for item in checks) else "failed"
63
- return run_payload(display_suite_id(suite), status, checks)
64
-
65
-
66
- def eval_report() -> dict[str, Any]:
103
+ return persist_run(run_payload(display_suite_id(suite), status, checks, started_at=started_at))
104
+
105
+
106
+ def eval_report(run_id: str | None = None) -> dict[str, Any]:
107
+ runs = list_eval_runs()
108
+ if run_id:
109
+ payload = read_eval_run(run_id)
110
+ return {
111
+ "kind": "eval-report",
112
+ "schema_version": EVAL_SCHEMA_VERSION,
113
+ "status": "ok",
114
+ "run": payload,
115
+ "runs": runs,
116
+ }
67
117
  return {
68
118
  "kind": "eval-report",
69
119
  "schema_version": EVAL_SCHEMA_VERSION,
70
120
  "status": "ok",
71
- "message": "Persistent eval run reports are not enabled in the MVP.",
72
- "runs": [],
121
+ "message": "Use `agent eval report <run-id>` to inspect a persisted run.",
122
+ "runs": runs,
73
123
  }
74
124
 
75
125
 
@@ -89,6 +139,84 @@ def eval_catalog(root: Path) -> list[dict[str, Any]]:
89
139
  return [{"id": "catalog.search-pr", "status": "passed" if payload["items"] else "failed", "count": payload["count"]}]
90
140
 
91
141
 
142
+ def eval_wizard(root: Path) -> list[dict[str, Any]]:
143
+ wizard = provider_setup_wizard(root, "azure-devops", prompt="analise o card 1")
144
+ question = wizard.get("next_question") if isinstance(wizard.get("next_question"), dict) else {}
145
+ checks = [
146
+ {
147
+ "id": "wizard.provider-opt-in",
148
+ "status": "passed" if wizard.get("kind") == "provider-setup-wizard" and question.get("type") == "confirm" else "failed",
149
+ "provider": wizard.get("provider"),
150
+ "question": question.get("id"),
151
+ }
152
+ ]
153
+ for provider in load_providers(root):
154
+ provider_id = str(provider.get("id") or "")
155
+ if not provider_id:
156
+ continue
157
+ try:
158
+ candidate = provider_setup_wizard(root, provider_id)
159
+ except Exception as exc: # noqa: BLE001 - eval must report coverage failures.
160
+ checks.append({"id": f"wizard.provider-coverage.{provider_id}", "status": "failed", "error": type(exc).__name__})
161
+ continue
162
+ questions = candidate.get("questions") if isinstance(candidate.get("questions"), list) else []
163
+ checks.append(
164
+ {
165
+ "id": f"wizard.provider-coverage.{provider_id}",
166
+ "status": "passed"
167
+ if candidate.get("kind") == "provider-setup-wizard"
168
+ and candidate.get("provider") == provider_id
169
+ and isinstance(candidate.get("next_question"), dict)
170
+ else "failed",
171
+ "provider": provider_id,
172
+ "questions": len(questions),
173
+ "stores_secret": candidate.get("stores_secret"),
174
+ }
175
+ )
176
+ return checks
177
+
178
+
179
+ def eval_source_config(_root: Path) -> list[dict[str, Any]]:
180
+ payload = list_sources()
181
+ return [{"id": "source-config.no-stored-secret", "status": "passed" if payload.get("stored_secret") is False else "failed"}]
182
+
183
+
184
+ def eval_identity_enforcement(_root: Path) -> list[dict[str, Any]]:
185
+ prompt = identity_system_prompt(name="Agent DevKit")
186
+ required = ["Nunca responda", "Claude", "Codex", "ChatGPT", "identidade publica"]
187
+ return [{"id": "identity.system-prompt", "status": "passed" if all(item in prompt for item in required) else "failed"}]
188
+
189
+
190
+ def eval_review_gate(_root: Path) -> list[dict[str, Any]]:
191
+ gate = build_review_gate("implemente codigo e revise a entrega")
192
+ return [{"id": "review-gate.deliverable-required", "status": "passed" if gate.get("required") else "failed"}]
193
+
194
+
195
+ def eval_model_router(_root: Path) -> list[dict[str, Any]]:
196
+ plan = build_model_plan("resuma estes logs")
197
+ return [
198
+ {
199
+ "id": "model-router.operational-policy",
200
+ "status": "passed" if plan.get("local_llm_recommended") and plan.get("local_llm_role") == "operational-worker" else "failed",
201
+ "strategy": plan.get("strategy"),
202
+ }
203
+ ]
204
+
205
+
206
+ def eval_agentic_plan(root: Path) -> list[dict[str, Any]]:
207
+ payload = agentic_plan(root, ["analise o card 7914 do azure"])
208
+ plan = payload.get("execution_plan") if isinstance(payload.get("execution_plan"), dict) else {}
209
+ return [
210
+ {
211
+ "id": "agentic-plan.explicit-contract",
212
+ "status": "passed"
213
+ if payload.get("kind") == "agentic-plan" and plan.get("kind") == "agentic-execution-plan" and plan.get("trace")
214
+ else "failed",
215
+ "summary": payload.get("summary"),
216
+ }
217
+ ]
218
+
219
+
92
220
  def eval_write_policy(_root: Path) -> list[dict[str, Any]]:
93
221
  return [{"id": "write-policy.normalized", "status": "passed"}]
94
222
 
@@ -103,6 +231,94 @@ def eval_mcp_contract(_root: Path) -> list[dict[str, Any]]:
103
231
  return [{"id": "mcp.v2-tools", "status": "passed" if required <= names else "failed", "required": sorted(required)}]
104
232
 
105
233
 
234
+ def eval_workflow_contract(_root: Path) -> list[dict[str, Any]]:
235
+ payload = workflow_list()
236
+ ids = {item.get("id") for item in payload.get("items") or []}
237
+ required = {"daily-pr-review", "incident-analysis", "azure-card-analysis", "release-prep"}
238
+ return [{"id": "workflow.required-manifests", "status": "passed" if required <= ids else "failed", "required": sorted(required)}]
239
+
240
+
241
+ def eval_extension_contract(_root: Path) -> list[dict[str, Any]]:
242
+ payload = local_extensions_list()
243
+ return [{"id": "extension.registry-readable", "status": "passed" if payload.get("kind") == "local-extensions" else "failed"}]
244
+
245
+
246
+ def eval_contribution_contract(root: Path) -> list[dict[str, Any]]:
247
+ catalog = catalog_search("contribution-reviewer", root, item_type="agent")
248
+ pr = contribution_pr("missing-extension", dry_run=True)
249
+ return [
250
+ {
251
+ "id": "contribution-reviewer.catalogued",
252
+ "status": "passed" if any(item.get("id") == "contribution-reviewer" for item in catalog.get("items") or []) else "failed",
253
+ },
254
+ {
255
+ "id": "contribution-pr.report-only",
256
+ "status": "passed"
257
+ if pr.get("kind") == "contribution-pr"
258
+ and pr.get("status") == "blocked"
259
+ and (pr.get("plan") or {}).get("external_writes") is True
260
+ else "failed",
261
+ },
262
+ ]
263
+
264
+
265
+ def eval_team_contract(_root: Path) -> list[dict[str, Any]]:
266
+ from cli.aikit.team import team_doctor, team_init
267
+
268
+ with tempfile.TemporaryDirectory() as project:
269
+ root = Path(project)
270
+ init = team_init(root)
271
+ doctor = team_doctor(root)
272
+ return [
273
+ {
274
+ "id": "team-profile.project-local",
275
+ "status": "passed" if init.get("status") == "initialized" and init.get("secret_free") is True else "failed",
276
+ },
277
+ {
278
+ "id": "team-doctor.secret-free",
279
+ "status": "passed" if doctor.get("status") == "ok" else "failed",
280
+ },
281
+ ]
282
+
283
+
284
+ def eval_knowledge_contract(_root: Path) -> list[dict[str, Any]]:
285
+ from cli.aikit.knowledge_base import (
286
+ knowledge_doctor,
287
+ knowledge_init,
288
+ knowledge_publish,
289
+ knowledge_review,
290
+ knowledge_search,
291
+ knowledge_snapshot_create,
292
+ )
293
+
294
+ with tempfile.TemporaryDirectory() as project:
295
+ root = Path(project)
296
+ init = knowledge_init(root)
297
+ snapshot = knowledge_snapshot_create(
298
+ title="Runbook de teste",
299
+ content="# Runbook de teste\n\nProcedimento reutilizavel e sem segredo.",
300
+ from_file=None,
301
+ entry_type="runbook",
302
+ project=root,
303
+ )
304
+ review = knowledge_review(str(snapshot["snapshot_id"]), root)
305
+ publish = knowledge_publish(str(snapshot["snapshot_id"]), root, yes=True, owner_agent="knowledge-owner")
306
+ search = knowledge_search("procedimento reutilizavel", root)
307
+ doctor = knowledge_doctor(root)
308
+ return [
309
+ {"id": "knowledge.init", "status": "passed" if init.get("status") == "initialized" else "failed"},
310
+ {"id": "knowledge.review", "status": "passed" if review.get("status") == "approved" else "failed"},
311
+ {"id": "knowledge.publish", "status": "passed" if publish.get("status") == "published" else "failed"},
312
+ {"id": "knowledge.search", "status": "passed" if search.get("count", 0) >= 1 else "failed"},
313
+ {"id": "knowledge.doctor", "status": "passed" if doctor.get("status") == "ok" else "failed"},
314
+ ]
315
+
316
+
317
+ def eval_secret_refs(_root: Path) -> list[dict[str, Any]]:
318
+ payload = secrets_doctor()
319
+ return [{"id": "secret-refs.no-values", "status": "passed" if payload.get("stored_values") is False else "failed"}]
320
+
321
+
106
322
  def eval_prompt_injection(_root: Path) -> list[dict[str, Any]]:
107
323
  return [prompt_injection_eval_fixture()]
108
324
 
@@ -117,15 +333,27 @@ def eval_generated_agent_contract(_root: Path) -> list[dict[str, Any]]:
117
333
  return [{"id": "generated-agent.contract-placeholder", "status": "passed"}]
118
334
 
119
335
 
120
- def run_payload(suite: str, status: str, checks: list[dict[str, Any]]) -> dict[str, Any]:
336
+ def run_payload(
337
+ suite: str,
338
+ status: str,
339
+ checks: list[dict[str, Any]],
340
+ *,
341
+ started_at: datetime | None = None,
342
+ ) -> dict[str, Any]:
343
+ started_at = started_at or datetime.now(timezone.utc)
344
+ run_id = f"eval_{started_at.strftime('%Y%m%d%H%M%S')}_{suite.replace('-', '_')}"
345
+ finished_at = datetime.now(timezone.utc)
346
+ duration_ms = max(0, int((finished_at - started_at).total_seconds() * 1000))
121
347
  return {
122
348
  "kind": "eval-run",
123
349
  "schema_version": EVAL_SCHEMA_VERSION,
350
+ "run_id": run_id,
124
351
  "suite": suite,
125
352
  "status": status,
126
353
  "ok": status == "passed",
127
- "started_at": datetime.now(timezone.utc).isoformat(),
128
- "finished_at": datetime.now(timezone.utc).isoformat(),
354
+ "started_at": started_at.isoformat(),
355
+ "finished_at": finished_at.isoformat(),
356
+ "metrics": eval_metrics(checks, duration_ms=duration_ms),
129
357
  "checks": json.loads(json.dumps(checks, ensure_ascii=False)),
130
358
  }
131
359
 
@@ -134,9 +362,21 @@ def canonical_suite_ids() -> list[str]:
134
362
  return [
135
363
  "routing",
136
364
  "catalog",
365
+ "wizard",
366
+ "source_config",
367
+ "identity_enforcement",
368
+ "review_gate",
369
+ "model_router",
370
+ "agentic_plan",
137
371
  "write_policy",
138
372
  "source_readiness",
139
373
  "mcp_contract",
374
+ "workflow_contract",
375
+ "extension_contract",
376
+ "contribution_contract",
377
+ "team_contract",
378
+ "knowledge_contract",
379
+ "secret_refs",
140
380
  "prompt_injection",
141
381
  "mini_brain_limits",
142
382
  "generated_agent_contract",
@@ -147,6 +387,8 @@ def normalize_suite_id(value: str) -> str:
147
387
  normalized = (value or "").strip().replace("-", "_")
148
388
  if normalized == "mcp":
149
389
  return "mcp_contract"
390
+ if normalized == "prompt-injection":
391
+ return "prompt_injection"
150
392
  return normalized
151
393
 
152
394
 
@@ -156,3 +398,107 @@ def display_suite_id(value: str) -> str:
156
398
  if value == "mcp_contract":
157
399
  return "mcp"
158
400
  return value
401
+
402
+
403
+ def persist_run(payload: dict[str, Any]) -> dict[str, Any]:
404
+ path = eval_run_path(str(payload["run_id"]))
405
+ path.write_text(json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True) + "\n", encoding="utf-8")
406
+ markdown = eval_markdown_path(str(payload["run_id"]))
407
+ markdown.write_text(render_eval_markdown(payload), encoding="utf-8")
408
+ payload["json_path"] = str(path)
409
+ payload["markdown_path"] = str(markdown)
410
+ return payload
411
+
412
+
413
+ def eval_runs_home() -> Path:
414
+ ensure_app_home()
415
+ path = app_path("evals", "runs")
416
+ path.mkdir(parents=True, exist_ok=True)
417
+ return path
418
+
419
+
420
+ def eval_run_path(run_id: str) -> Path:
421
+ return eval_runs_home() / f"{safe_run_id(run_id)}.json"
422
+
423
+
424
+ def eval_markdown_path(run_id: str) -> Path:
425
+ return eval_runs_home() / f"{safe_run_id(run_id)}.md"
426
+
427
+
428
+ def list_eval_runs() -> list[dict[str, Any]]:
429
+ runs = []
430
+ for path in sorted(eval_runs_home().glob("*.json"), reverse=True):
431
+ try:
432
+ payload = json.loads(path.read_text(encoding="utf-8"))
433
+ except (OSError, json.JSONDecodeError):
434
+ continue
435
+ runs.append(
436
+ {
437
+ "run_id": payload.get("run_id"),
438
+ "suite": payload.get("suite"),
439
+ "status": payload.get("status"),
440
+ "ok": payload.get("ok"),
441
+ "started_at": payload.get("started_at"),
442
+ "json_path": str(path),
443
+ "markdown_path": str(path.with_suffix(".md")),
444
+ }
445
+ )
446
+ return runs
447
+
448
+
449
+ def read_eval_run(run_id: str) -> dict[str, Any]:
450
+ path = eval_run_path(run_id)
451
+ if not path.exists():
452
+ raise ValueError(f"eval run not found: {run_id}")
453
+ return json.loads(path.read_text(encoding="utf-8"))
454
+
455
+
456
+ def render_eval_markdown(payload: dict[str, Any]) -> str:
457
+ lines = [
458
+ f"# Eval {payload.get('run_id')}",
459
+ "",
460
+ f"- Suite: {payload.get('suite')}",
461
+ f"- Status: {payload.get('status')}",
462
+ f"- Started: {payload.get('started_at')}",
463
+ "",
464
+ "## Checks",
465
+ ]
466
+ for check in payload.get("checks") or []:
467
+ if isinstance(check, dict):
468
+ lines.append(f"- {check.get('id')}: {check.get('status')}")
469
+ lines.append("")
470
+ return "\n".join(lines)
471
+
472
+
473
+ def eval_metrics(checks: list[dict[str, Any]], *, duration_ms: int = 0) -> dict[str, Any]:
474
+ flat = flatten_checks(checks)
475
+ total = len(flat)
476
+ passed = len([item for item in flat if item.get("status") == "passed"])
477
+ failed = len([item for item in flat if item.get("status") == "failed"])
478
+ success = passed == total if total else False
479
+ completeness = passed / total if total else 0.0
480
+ return {
481
+ "total": total,
482
+ "passed": passed,
483
+ "failed": failed,
484
+ "success": success,
485
+ "regression": "passed" if success else "failed",
486
+ "completeness": round(completeness, 4),
487
+ "schema": "passed",
488
+ "security": "passed" if failed == 0 else "needs-review",
489
+ "duration_ms": duration_ms,
490
+ }
491
+
492
+
493
+ def flatten_checks(checks: list[dict[str, Any]]) -> list[dict[str, Any]]:
494
+ flat: list[dict[str, Any]] = []
495
+ for check in checks:
496
+ if isinstance(check, dict) and check.get("kind") == "eval-run":
497
+ flat.extend(flatten_checks(check.get("checks") or []))
498
+ elif isinstance(check, dict):
499
+ flat.append(check)
500
+ return flat
501
+
502
+
503
+ def safe_run_id(run_id: str) -> str:
504
+ return re.sub(r"[^a-zA-Z0-9_.-]+", "_", run_id)