agent-devkit 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +48 -6
- package/bin/agent.mjs +133 -7
- package/package.json +1 -1
- package/runtime/README.md +187 -5
- package/runtime/agent +31 -5
- package/runtime/agents/README.md +18 -0
- package/runtime/agents/contribution-reviewer/AGENTS.md +8 -0
- package/runtime/agents/contribution-reviewer/README.md +8 -0
- package/runtime/agents/contribution-reviewer/agent.yaml +40 -0
- package/runtime/agents/contribution-reviewer/capabilities/plan-contribution-pr/capability.yaml +27 -0
- package/runtime/agents/contribution-reviewer/capabilities/plan-contribution-pr/decision-rules.md +5 -0
- package/runtime/agents/contribution-reviewer/capabilities/plan-contribution-pr/workflow.md +6 -0
- package/runtime/agents/contribution-reviewer/capabilities/review-contribution/capability.yaml +25 -0
- package/runtime/agents/contribution-reviewer/capabilities/review-contribution/decision-rules.md +5 -0
- package/runtime/agents/contribution-reviewer/capabilities/review-contribution/workflow.md +5 -0
- package/runtime/agents/contribution-reviewer/capabilities/validate-local-contribution/capability.yaml +26 -0
- package/runtime/agents/contribution-reviewer/capabilities/validate-local-contribution/decision-rules.md +5 -0
- package/runtime/agents/contribution-reviewer/capabilities/validate-local-contribution/workflow.md +6 -0
- package/runtime/agents/contribution-reviewer/infra/README.md +6 -0
- package/runtime/agents/contribution-reviewer/knowledge/context.md +8 -0
- package/runtime/agents/contribution-reviewer/knowledge/system.md +8 -0
- package/runtime/agents/contribution-reviewer/templates/README.md +3 -0
- package/runtime/agents/knowledge-author/AGENTS.md +7 -0
- package/runtime/agents/knowledge-author/README.md +7 -0
- package/runtime/agents/knowledge-author/agent.yaml +37 -0
- package/runtime/agents/knowledge-author/capabilities/create-knowledge-snapshot/capability.yaml +30 -0
- package/runtime/agents/knowledge-author/capabilities/create-knowledge-snapshot/decision-rules.md +6 -0
- package/runtime/agents/knowledge-author/capabilities/create-knowledge-snapshot/workflow.md +7 -0
- package/runtime/agents/knowledge-author/infra/.gitkeep +1 -0
- package/runtime/agents/knowledge-author/knowledge/context.md +4 -0
- package/runtime/agents/knowledge-author/knowledge/system.md +4 -0
- package/runtime/agents/knowledge-author/templates/.gitkeep +1 -0
- package/runtime/agents/knowledge-curator/AGENTS.md +7 -0
- package/runtime/agents/knowledge-curator/README.md +6 -0
- package/runtime/agents/knowledge-curator/agent.yaml +37 -0
- package/runtime/agents/knowledge-curator/capabilities/curate-knowledge-base/capability.yaml +29 -0
- package/runtime/agents/knowledge-curator/capabilities/curate-knowledge-base/decision-rules.md +6 -0
- package/runtime/agents/knowledge-curator/capabilities/curate-knowledge-base/workflow.md +7 -0
- package/runtime/agents/knowledge-curator/infra/.gitkeep +1 -0
- package/runtime/agents/knowledge-curator/knowledge/context.md +4 -0
- package/runtime/agents/knowledge-curator/knowledge/system.md +4 -0
- package/runtime/agents/knowledge-curator/templates/.gitkeep +1 -0
- package/runtime/agents/knowledge-infra-builder/AGENTS.md +8 -0
- package/runtime/agents/knowledge-infra-builder/README.md +8 -0
- package/runtime/agents/knowledge-infra-builder/agent.yaml +38 -0
- package/runtime/agents/knowledge-infra-builder/capabilities/create-knowledge-base/capability.yaml +30 -0
- package/runtime/agents/knowledge-infra-builder/capabilities/create-knowledge-base/decision-rules.md +6 -0
- package/runtime/agents/knowledge-infra-builder/capabilities/create-knowledge-base/workflow.md +7 -0
- package/runtime/agents/knowledge-infra-builder/infra/.gitkeep +1 -0
- package/runtime/agents/knowledge-infra-builder/knowledge/context.md +4 -0
- package/runtime/agents/knowledge-infra-builder/knowledge/system.md +4 -0
- package/runtime/agents/knowledge-infra-builder/templates/.gitkeep +1 -0
- package/runtime/agents/knowledge-owner/AGENTS.md +7 -0
- package/runtime/agents/knowledge-owner/README.md +6 -0
- package/runtime/agents/knowledge-owner/agent.yaml +37 -0
- package/runtime/agents/knowledge-owner/capabilities/publish-knowledge-snapshot/capability.yaml +28 -0
- package/runtime/agents/knowledge-owner/capabilities/publish-knowledge-snapshot/decision-rules.md +6 -0
- package/runtime/agents/knowledge-owner/capabilities/publish-knowledge-snapshot/workflow.md +7 -0
- package/runtime/agents/knowledge-owner/infra/.gitkeep +1 -0
- package/runtime/agents/knowledge-owner/knowledge/context.md +4 -0
- package/runtime/agents/knowledge-owner/knowledge/system.md +4 -0
- package/runtime/agents/knowledge-owner/templates/.gitkeep +1 -0
- package/runtime/agents/knowledge-reviewer/AGENTS.md +7 -0
- package/runtime/agents/knowledge-reviewer/README.md +7 -0
- package/runtime/agents/knowledge-reviewer/agent.yaml +36 -0
- package/runtime/agents/knowledge-reviewer/capabilities/review-knowledge-snapshot/capability.yaml +26 -0
- package/runtime/agents/knowledge-reviewer/capabilities/review-knowledge-snapshot/decision-rules.md +6 -0
- package/runtime/agents/knowledge-reviewer/capabilities/review-knowledge-snapshot/workflow.md +7 -0
- package/runtime/agents/knowledge-reviewer/infra/.gitkeep +1 -0
- package/runtime/agents/knowledge-reviewer/knowledge/context.md +4 -0
- package/runtime/agents/knowledge-reviewer/knowledge/system.md +4 -0
- package/runtime/agents/knowledge-reviewer/templates/.gitkeep +1 -0
- package/runtime/agents/local-memory-manager/AGENTS.md +5 -0
- package/runtime/agents/local-memory-manager/README.md +7 -0
- package/runtime/agents/local-memory-manager/agent.yaml +38 -0
- package/runtime/agents/local-memory-manager/capabilities/curate-local-memory/capability.yaml +19 -0
- package/runtime/agents/local-memory-manager/capabilities/curate-local-memory/decision-rules.md +5 -0
- package/runtime/agents/local-memory-manager/capabilities/curate-local-memory/workflow.md +6 -0
- package/runtime/agents/local-memory-manager/capabilities/inspect-local-memory/capability.yaml +19 -0
- package/runtime/agents/local-memory-manager/capabilities/inspect-local-memory/decision-rules.md +5 -0
- package/runtime/agents/local-memory-manager/capabilities/inspect-local-memory/workflow.md +5 -0
- package/runtime/agents/local-memory-manager/infra/.gitkeep +1 -0
- package/runtime/agents/local-memory-manager/knowledge/context.md +4 -0
- package/runtime/agents/local-memory-manager/knowledge/system.md +4 -0
- package/runtime/agents/local-memory-manager/templates/.gitkeep +1 -0
- package/runtime/agents/memory-sync-manager/AGENTS.md +7 -0
- package/runtime/agents/memory-sync-manager/README.md +7 -0
- package/runtime/agents/memory-sync-manager/agent.yaml +37 -0
- package/runtime/agents/memory-sync-manager/capabilities/plan-memory-backup/capability.yaml +29 -0
- package/runtime/agents/memory-sync-manager/capabilities/plan-memory-backup/decision-rules.md +6 -0
- package/runtime/agents/memory-sync-manager/capabilities/plan-memory-backup/workflow.md +7 -0
- package/runtime/agents/memory-sync-manager/infra/.gitkeep +1 -0
- package/runtime/agents/memory-sync-manager/knowledge/context.md +4 -0
- package/runtime/agents/memory-sync-manager/knowledge/system.md +4 -0
- package/runtime/agents/memory-sync-manager/templates/.gitkeep +1 -0
- package/runtime/agents/shared-memory-curator/AGENTS.md +5 -0
- package/runtime/agents/shared-memory-curator/README.md +6 -0
- package/runtime/agents/shared-memory-curator/agent.yaml +38 -0
- package/runtime/agents/shared-memory-curator/capabilities/create-shared-memory/capability.yaml +19 -0
- package/runtime/agents/shared-memory-curator/capabilities/create-shared-memory/decision-rules.md +5 -0
- package/runtime/agents/shared-memory-curator/capabilities/create-shared-memory/workflow.md +5 -0
- package/runtime/agents/shared-memory-curator/capabilities/publish-shared-submission/capability.yaml +19 -0
- package/runtime/agents/shared-memory-curator/capabilities/publish-shared-submission/decision-rules.md +5 -0
- package/runtime/agents/shared-memory-curator/capabilities/publish-shared-submission/workflow.md +5 -0
- package/runtime/agents/shared-memory-curator/capabilities/review-shared-submission/capability.yaml +19 -0
- package/runtime/agents/shared-memory-curator/capabilities/review-shared-submission/decision-rules.md +5 -0
- package/runtime/agents/shared-memory-curator/capabilities/review-shared-submission/workflow.md +5 -0
- package/runtime/agents/shared-memory-curator/infra/.gitkeep +1 -0
- package/runtime/agents/shared-memory-curator/knowledge/context.md +5 -0
- package/runtime/agents/shared-memory-curator/knowledge/system.md +4 -0
- package/runtime/agents/shared-memory-curator/templates/.gitkeep +1 -0
- package/runtime/cli/README.md +35 -4
- package/runtime/cli/aikit/__init__.py +1 -1
- package/runtime/cli/aikit/agent_registry.py +4 -2
- package/runtime/cli/aikit/agentic_commands.py +158 -0
- package/runtime/cli/aikit/app_home.py +1 -0
- package/runtime/cli/aikit/audit.py +16 -6
- package/runtime/cli/aikit/catalog.py +278 -8
- package/runtime/cli/aikit/cli_dispatch.py +489 -13
- package/runtime/cli/aikit/cli_parser.py +145 -7
- package/runtime/cli/aikit/contribution.py +132 -2
- package/runtime/cli/aikit/doctor_runtime.py +85 -0
- package/runtime/cli/aikit/eval.py +356 -10
- package/runtime/cli/aikit/human_output.py +310 -4
- package/runtime/cli/aikit/interactive_wizard.py +148 -0
- package/runtime/cli/aikit/knowledge_base.py +1067 -0
- package/runtime/cli/aikit/llm.py +12 -4
- package/runtime/cli/aikit/local_artifacts.py +444 -0
- package/runtime/cli/aikit/local_llm.py +161 -0
- package/runtime/cli/aikit/main.py +15 -0
- package/runtime/cli/aikit/mcp_manifest.py +798 -0
- package/runtime/cli/aikit/mcp_tools.py +643 -5
- package/runtime/cli/aikit/memory.py +405 -0
- package/runtime/cli/aikit/mini_brain.py +20 -1
- package/runtime/cli/aikit/natural_prompt_runtime.py +125 -1
- package/runtime/cli/aikit/ollama.py +64 -15
- package/runtime/cli/aikit/onboarding.py +551 -0
- package/runtime/cli/aikit/output.py +67 -0
- package/runtime/cli/aikit/prompt_injection.py +12 -1
- package/runtime/cli/aikit/roadmap_cli.py +1 -1
- package/runtime/cli/aikit/secrets.py +3 -2
- package/runtime/cli/aikit/setup_wizard_payload.py +3 -0
- package/runtime/cli/aikit/shared_memory.py +415 -0
- package/runtime/cli/aikit/specialist_readiness.py +152 -0
- package/runtime/cli/aikit/tasks.py +104 -1
- package/runtime/cli/aikit/team.py +380 -0
- package/runtime/cli/aikit/toolchain.py +7 -2
- package/runtime/cli/aikit/workflows.py +115 -14
- package/runtime/providers/knowledge-github.yaml +40 -0
- package/runtime/providers/knowledge-google-drive.yaml +32 -0
- package/runtime/providers/knowledge-local.yaml +26 -0
- package/runtime/providers/knowledge-notion.yaml +32 -0
- package/runtime/providers/knowledge-obsidian.yaml +24 -0
- package/runtime/providers/knowledge-onedrive.yaml +36 -0
- package/runtime/providers/knowledge-s3.yaml +45 -0
- package/runtime/providers/knowledge-sharepoint.yaml +39 -0
- package/runtime/providers/knowledge-supabase.yaml +43 -0
- package/runtime/providers/knowledge-vector.yaml +39 -0
- package/runtime/requirements.txt +6 -0
- package/runtime/scripts/docker-cli-qa.sh +453 -0
- package/runtime/scripts/release-catalog-snapshot.json +55 -4
- package/runtime/scripts/release-gate.py +54 -13
- package/runtime/tooling/toolchain.yaml +92 -0
- package/runtime/vendor/skills/napkin/napkin.md +21 -7
- package/runtime/workflows/azure-card-analysis/README.md +3 -0
- package/runtime/workflows/azure-card-analysis/workflow.yaml +30 -0
- package/runtime/workflows/daily-pr-review/README.md +3 -0
- package/runtime/workflows/daily-pr-review/workflow.yaml +31 -0
- package/runtime/workflows/incident-analysis/README.md +3 -0
- package/runtime/workflows/incident-analysis/workflow.yaml +33 -0
- package/runtime/workflows/release-prep/README.md +3 -0
- package/runtime/workflows/release-prep/workflow.yaml +30 -0
|
@@ -3,25 +3,51 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import json
|
|
6
|
+
import re
|
|
7
|
+
import tempfile
|
|
6
8
|
from datetime import datetime, timezone
|
|
7
9
|
from pathlib import Path
|
|
8
10
|
from typing import Any, Callable
|
|
9
11
|
|
|
12
|
+
from cli.aikit.agentic_commands import agentic_plan
|
|
13
|
+
from cli.aikit.app_home import app_path, ensure_app_home
|
|
10
14
|
from cli.aikit.catalog import catalog_search
|
|
15
|
+
from cli.aikit.configuration_orchestrator import provider_setup_wizard
|
|
16
|
+
from cli.aikit.contribution import contribution_pr
|
|
17
|
+
from cli.aikit.extensions import local_extensions_list
|
|
18
|
+
from cli.aikit.identity import identity_system_prompt
|
|
11
19
|
from cli.aikit.mcp_manifest import mcp_tools
|
|
20
|
+
from cli.aikit.model_router import build_model_plan
|
|
21
|
+
from cli.aikit.providers import load_providers
|
|
12
22
|
from cli.aikit.prompt_injection import prompt_injection_eval_fixture
|
|
23
|
+
from cli.aikit.review_gate import build_review_gate
|
|
13
24
|
from cli.aikit.router_explain import explain_route
|
|
14
25
|
from cli.aikit.runtime_paths import ROOT
|
|
26
|
+
from cli.aikit.secrets import secrets_doctor
|
|
27
|
+
from cli.aikit.sources import list_sources
|
|
28
|
+
from cli.aikit.workflows import workflow_list
|
|
15
29
|
|
|
16
30
|
|
|
17
31
|
EVAL_SCHEMA_VERSION = "agent-devkit.eval/v1"
|
|
18
32
|
SUITES = (
|
|
19
33
|
"routing",
|
|
20
34
|
"catalog",
|
|
35
|
+
"wizard",
|
|
36
|
+
"source_config",
|
|
37
|
+
"identity_enforcement",
|
|
38
|
+
"review_gate",
|
|
39
|
+
"model_router",
|
|
40
|
+
"agentic_plan",
|
|
21
41
|
"write_policy",
|
|
22
42
|
"source_readiness",
|
|
23
43
|
"mcp",
|
|
24
44
|
"mcp_contract",
|
|
45
|
+
"workflow_contract",
|
|
46
|
+
"extension_contract",
|
|
47
|
+
"contribution_contract",
|
|
48
|
+
"team_contract",
|
|
49
|
+
"knowledge_contract",
|
|
50
|
+
"secret_refs",
|
|
25
51
|
"prompt-injection",
|
|
26
52
|
"prompt_injection",
|
|
27
53
|
"mini_brain_limits",
|
|
@@ -42,15 +68,28 @@ def eval_run(suite: str, root: Path | None = None) -> dict[str, Any]:
|
|
|
42
68
|
root = root or ROOT
|
|
43
69
|
suite = normalize_suite_id(suite)
|
|
44
70
|
if suite == "all":
|
|
71
|
+
started_at = datetime.now(timezone.utc)
|
|
45
72
|
runs = [eval_run(item, root) for item in canonical_suite_ids()]
|
|
46
73
|
status = "passed" if all(item["status"] == "passed" for item in runs) else "failed"
|
|
47
|
-
return run_payload("all", status, runs)
|
|
74
|
+
return persist_run(run_payload("all", status, runs, started_at=started_at))
|
|
48
75
|
handlers: dict[str, Callable[[Path], list[dict[str, Any]]]] = {
|
|
49
76
|
"routing": eval_routing,
|
|
50
77
|
"catalog": eval_catalog,
|
|
78
|
+
"wizard": eval_wizard,
|
|
79
|
+
"source_config": eval_source_config,
|
|
80
|
+
"identity_enforcement": eval_identity_enforcement,
|
|
81
|
+
"review_gate": eval_review_gate,
|
|
82
|
+
"model_router": eval_model_router,
|
|
83
|
+
"agentic_plan": eval_agentic_plan,
|
|
51
84
|
"write_policy": eval_write_policy,
|
|
52
85
|
"source_readiness": eval_source_readiness,
|
|
53
86
|
"mcp_contract": eval_mcp_contract,
|
|
87
|
+
"workflow_contract": eval_workflow_contract,
|
|
88
|
+
"extension_contract": eval_extension_contract,
|
|
89
|
+
"contribution_contract": eval_contribution_contract,
|
|
90
|
+
"team_contract": eval_team_contract,
|
|
91
|
+
"knowledge_contract": eval_knowledge_contract,
|
|
92
|
+
"secret_refs": eval_secret_refs,
|
|
54
93
|
"prompt_injection": eval_prompt_injection,
|
|
55
94
|
"mini_brain_limits": eval_mini_brain_limits,
|
|
56
95
|
"generated_agent_contract": eval_generated_agent_contract,
|
|
@@ -58,18 +97,29 @@ def eval_run(suite: str, root: Path | None = None) -> dict[str, Any]:
|
|
|
58
97
|
handler = handlers.get(suite)
|
|
59
98
|
if not handler:
|
|
60
99
|
raise ValueError(f"unknown eval suite: {suite}")
|
|
100
|
+
started_at = datetime.now(timezone.utc)
|
|
61
101
|
checks = handler(root)
|
|
62
102
|
status = "passed" if all(item.get("status") == "passed" for item in checks) else "failed"
|
|
63
|
-
return run_payload(display_suite_id(suite), status, checks)
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
def eval_report() -> dict[str, Any]:
|
|
103
|
+
return persist_run(run_payload(display_suite_id(suite), status, checks, started_at=started_at))
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def eval_report(run_id: str | None = None) -> dict[str, Any]:
    """Build an eval report listing persisted runs, optionally with one run.

    Args:
        run_id: Identifier of a persisted run to embed under ``"run"``. When
            omitted (or empty) a usage hint is returned under ``"message"``
            instead.

    Returns:
        A dict with ``kind``, ``schema_version``, ``status``, either ``run``
        or ``message``, and the ``runs`` summary list.

    Raises:
        ValueError: Propagated from ``read_eval_run`` when ``run_id`` does not
            match a persisted run.
    """
    # The listing is gathered first, before the requested run is looked up.
    persisted = list_eval_runs()
    report: dict[str, Any] = {
        "kind": "eval-report",
        "schema_version": EVAL_SCHEMA_VERSION,
        "status": "ok",
    }
    if run_id:
        report["run"] = read_eval_run(run_id)
    else:
        report["message"] = "Use `agent eval report <run-id>` to inspect a persisted run."
    report["runs"] = persisted
    return report
|
|
74
124
|
|
|
75
125
|
|
|
@@ -89,6 +139,84 @@ def eval_catalog(root: Path) -> list[dict[str, Any]]:
|
|
|
89
139
|
return [{"id": "catalog.search-pr", "status": "passed" if payload["items"] else "failed", "count": payload["count"]}]
|
|
90
140
|
|
|
91
141
|
|
|
142
|
+
def eval_wizard(root: Path) -> list[dict[str, Any]]:
    """Evaluate the provider setup wizard contract.

    Produces one opt-in check for the ``azure-devops`` wizard started from a
    prompt, followed by one coverage check for every provider returned by
    ``load_providers(root)`` that has a non-empty ``id``.

    Args:
        root: Root directory handed to ``provider_setup_wizard`` and
            ``load_providers``.

    Returns:
        A list of check dicts; each has an ``id`` and a ``status`` of
        ``"passed"`` or ``"failed"``.
    """
    expected_kind = "provider-setup-wizard"

    opt_in = provider_setup_wizard(root, "azure-devops", prompt="analise o card 1")
    first_question = opt_in.get("next_question")
    if not isinstance(first_question, dict):
        first_question = {}
    opt_in_ok = opt_in.get("kind") == expected_kind and first_question.get("type") == "confirm"
    results: list[dict[str, Any]] = [
        {
            "id": "wizard.provider-opt-in",
            "status": "passed" if opt_in_ok else "failed",
            "provider": opt_in.get("provider"),
            "question": first_question.get("id"),
        }
    ]

    for entry in load_providers(root):
        provider_id = str(entry.get("id") or "")
        if not provider_id:
            continue
        check_id = f"wizard.provider-coverage.{provider_id}"
        try:
            candidate = provider_setup_wizard(root, provider_id)
        except Exception as exc:  # noqa: BLE001 - eval must report coverage failures.
            # A wizard that cannot even be built is reported, not raised.
            results.append({"id": check_id, "status": "failed", "error": type(exc).__name__})
            continue

        listed = candidate.get("questions")
        question_count = len(listed) if isinstance(listed, list) else 0
        covered = (
            candidate.get("kind") == expected_kind
            and candidate.get("provider") == provider_id
            and isinstance(candidate.get("next_question"), dict)
        )
        results.append(
            {
                "id": check_id,
                "status": "passed" if covered else "failed",
                "provider": provider_id,
                "questions": question_count,
                "stores_secret": candidate.get("stores_secret"),
            }
        )
    return results
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def eval_source_config(_root: Path) -> list[dict[str, Any]]:
    """Check that the source listing reports ``stored_secret`` as ``False``.

    Args:
        _root: Unused; present so every suite handler shares one signature.

    Returns:
        A single-element list with the ``source-config.no-stored-secret`` check.
    """
    sources = list_sources()
    # Only the literal False passes; a missing or None flag counts as failure.
    if sources.get("stored_secret") is False:
        outcome = "passed"
    else:
        outcome = "failed"
    return [{"id": "source-config.no-stored-secret", "status": outcome}]
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def eval_identity_enforcement(_root: Path) -> list[dict[str, Any]]:
    """Check that the identity system prompt contains every required phrase.

    Args:
        _root: Unused; present so every suite handler shares one signature.

    Returns:
        A single-element list with the ``identity.system-prompt`` check, which
        passes only when all required substrings occur in the prompt.
    """
    system_prompt = identity_system_prompt(name="Agent DevKit")
    required_phrases = ("Nunca responda", "Claude", "Codex", "ChatGPT", "identidade publica")
    missing = [phrase for phrase in required_phrases if phrase not in system_prompt]
    outcome = "failed" if missing else "passed"
    return [{"id": "identity.system-prompt", "status": outcome}]
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def eval_review_gate(_root: Path) -> list[dict[str, Any]]:
    """Check that a code-delivery prompt yields a required review gate.

    Args:
        _root: Unused; present so every suite handler shares one signature.

    Returns:
        A single-element list with the ``review-gate.deliverable-required``
        check, passing when the gate's ``required`` value is truthy.
    """
    review_gate = build_review_gate("implemente codigo e revise a entrega")
    if review_gate.get("required"):
        outcome = "passed"
    else:
        outcome = "failed"
    return [{"id": "review-gate.deliverable-required", "status": outcome}]
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def eval_model_router(_root: Path) -> list[dict[str, Any]]:
    """Check the model plan built for a log-summarisation prompt.

    The check passes when the plan recommends the local LLM and assigns it
    the ``operational-worker`` role.

    Args:
        _root: Unused; present so every suite handler shares one signature.

    Returns:
        A single-element list with the ``model-router.operational-policy``
        check, including the plan's ``strategy`` value.
    """
    model_plan = build_model_plan("resuma estes logs")
    routed_locally = (
        model_plan.get("local_llm_recommended")
        and model_plan.get("local_llm_role") == "operational-worker"
    )
    check = {
        "id": "model-router.operational-policy",
        "status": "passed" if routed_locally else "failed",
        "strategy": model_plan.get("strategy"),
    }
    return [check]
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def eval_agentic_plan(root: Path) -> list[dict[str, Any]]:
    """Check that ``agentic_plan`` returns an explicit execution contract.

    The check passes when the payload kind is ``agentic-plan``, its
    ``execution_plan`` is a dict of kind ``agentic-execution-plan``, and that
    plan carries a truthy ``trace``.

    Args:
        root: Root directory handed to ``agentic_plan``.

    Returns:
        A single-element list with the ``agentic-plan.explicit-contract``
        check, including the payload's ``summary`` value.
    """
    result = agentic_plan(root, ["analise o card 7914 do azure"])
    execution = result.get("execution_plan")
    if not isinstance(execution, dict):
        execution = {}
    contract_ok = (
        result.get("kind") == "agentic-plan"
        and execution.get("kind") == "agentic-execution-plan"
        and execution.get("trace")
    )
    check = {
        "id": "agentic-plan.explicit-contract",
        "status": "passed" if contract_ok else "failed",
        "summary": result.get("summary"),
    }
    return [check]
|
|
218
|
+
|
|
219
|
+
|
|
92
220
|
def eval_write_policy(_root: Path) -> list[dict[str, Any]]:
    """Return the write-policy check.

    The check is a constant: nothing is inspected, so it always passes.

    Args:
        _root: Unused; present so every suite handler shares one signature.

    Returns:
        A single-element list with the ``write-policy.normalized`` check.
    """
    normalized_check = {"id": "write-policy.normalized", "status": "passed"}
    return [normalized_check]
|
|
94
222
|
|
|
@@ -103,6 +231,94 @@ def eval_mcp_contract(_root: Path) -> list[dict[str, Any]]:
|
|
|
103
231
|
return [{"id": "mcp.v2-tools", "status": "passed" if required <= names else "failed", "required": sorted(required)}]
|
|
104
232
|
|
|
105
233
|
|
|
234
|
+
def eval_workflow_contract(_root: Path) -> list[dict[str, Any]]:
    """Check that the workflow listing contains every required workflow id.

    Args:
        _root: Unused; present so every suite handler shares one signature.

    Returns:
        A single-element list with the ``workflow.required-manifests`` check
        and the sorted list of required ids.
    """
    required = {"daily-pr-review", "incident-analysis", "azure-card-analysis", "release-prep"}
    listing = workflow_list()
    available = set()
    for workflow in listing.get("items") or []:
        available.add(workflow.get("id"))
    outcome = "passed" if required.issubset(available) else "failed"
    return [{"id": "workflow.required-manifests", "status": outcome, "required": sorted(required)}]
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def eval_extension_contract(_root: Path) -> list[dict[str, Any]]:
    """Check that the local extension listing has the expected payload kind.

    Args:
        _root: Unused; present so every suite handler shares one signature.

    Returns:
        A single-element list with the ``extension.registry-readable`` check,
        passing when the listing's ``kind`` is ``local-extensions``.
    """
    listing = local_extensions_list()
    readable = listing.get("kind") == "local-extensions"
    return [{"id": "extension.registry-readable", "status": "passed" if readable else "failed"}]
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def eval_contribution_contract(root: Path) -> list[dict[str, Any]]:
    """Check the contribution reviewer catalog entry and the dry-run PR plan.

    Two checks are produced:

    * ``contribution-reviewer.catalogued`` passes when an agent search for
      ``contribution-reviewer`` returns an item with that id.
    * ``contribution-pr.report-only`` passes when a dry-run PR for
      ``missing-extension`` has kind ``contribution-pr``, status ``blocked``
      and a plan whose ``external_writes`` is exactly ``True``.

    Args:
        root: Root directory handed to ``catalog_search``.

    Returns:
        The two check dicts, in the order listed above.
    """
    agent_id = "contribution-reviewer"
    search = catalog_search(agent_id, root, item_type="agent")
    dry_run_pr = contribution_pr("missing-extension", dry_run=True)

    catalogued = any(entry.get("id") == agent_id for entry in search.get("items") or [])
    report_only = (
        dry_run_pr.get("kind") == "contribution-pr"
        and dry_run_pr.get("status") == "blocked"
        and (dry_run_pr.get("plan") or {}).get("external_writes") is True
    )

    catalog_check = {
        "id": "contribution-reviewer.catalogued",
        "status": "passed" if catalogued else "failed",
    }
    pr_check = {
        "id": "contribution-pr.report-only",
        "status": "passed" if report_only else "failed",
    }
    return [catalog_check, pr_check]
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def eval_team_contract(_root: Path) -> list[dict[str, Any]]:
    """Run ``team_init`` and ``team_doctor`` against a throwaway project.

    Args:
        _root: Unused; the checks run inside a fresh temporary directory.

    Returns:
        Two check dicts: ``team-profile.project-local`` (init status is
        ``initialized`` and ``secret_free`` is ``True``) and
        ``team-doctor.secret-free`` (doctor status is ``ok``).
    """
    from cli.aikit.team import team_doctor, team_init

    with tempfile.TemporaryDirectory() as scratch:
        project_root = Path(scratch)
        init_result = team_init(project_root)
        doctor_result = team_doctor(project_root)

    initialized = init_result.get("status") == "initialized" and init_result.get("secret_free") is True
    healthy = doctor_result.get("status") == "ok"
    return [
        {"id": "team-profile.project-local", "status": "passed" if initialized else "failed"},
        {"id": "team-doctor.secret-free", "status": "passed" if healthy else "failed"},
    ]
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def eval_knowledge_contract(_root: Path) -> list[dict[str, Any]]:
    """Exercise the knowledge-base lifecycle in a throwaway project.

    Runs init, snapshot creation, review, publish, search and doctor in that
    order inside a temporary directory and turns each result into a check.

    Args:
        _root: Unused; the lifecycle runs inside a fresh temporary directory.

    Returns:
        Five check dicts (``knowledge.init``, ``knowledge.review``,
        ``knowledge.publish``, ``knowledge.search``, ``knowledge.doctor``).
    """
    from cli.aikit.knowledge_base import (
        knowledge_doctor,
        knowledge_init,
        knowledge_publish,
        knowledge_review,
        knowledge_search,
        knowledge_snapshot_create,
    )

    def outcome(passed: bool) -> str:
        return "passed" if passed else "failed"

    with tempfile.TemporaryDirectory() as scratch:
        project_root = Path(scratch)
        init_result = knowledge_init(project_root)
        snapshot = knowledge_snapshot_create(
            title="Runbook de teste",
            content="# Runbook de teste\n\nProcedimento reutilizavel e sem segredo.",
            from_file=None,
            entry_type="runbook",
            project=project_root,
        )
        snapshot_id = str(snapshot["snapshot_id"])
        review_result = knowledge_review(snapshot_id, project_root)
        publish_result = knowledge_publish(snapshot_id, project_root, yes=True, owner_agent="knowledge-owner")
        search_result = knowledge_search("procedimento reutilizavel", project_root)
        doctor_result = knowledge_doctor(project_root)

    return [
        {"id": "knowledge.init", "status": outcome(init_result.get("status") == "initialized")},
        {"id": "knowledge.review", "status": outcome(review_result.get("status") == "approved")},
        {"id": "knowledge.publish", "status": outcome(publish_result.get("status") == "published")},
        {"id": "knowledge.search", "status": outcome(search_result.get("count", 0) >= 1)},
        {"id": "knowledge.doctor", "status": outcome(doctor_result.get("status") == "ok")},
    ]
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def eval_secret_refs(_root: Path) -> list[dict[str, Any]]:
    """Check that the secrets doctor reports ``stored_values`` as ``False``.

    Args:
        _root: Unused; present so every suite handler shares one signature.

    Returns:
        A single-element list with the ``secret-refs.no-values`` check.
    """
    report = secrets_doctor()
    # Only the literal False passes; a missing or None flag counts as failure.
    if report.get("stored_values") is False:
        outcome = "passed"
    else:
        outcome = "failed"
    return [{"id": "secret-refs.no-values", "status": outcome}]
|
|
320
|
+
|
|
321
|
+
|
|
106
322
|
def eval_prompt_injection(_root: Path) -> list[dict[str, Any]]:
    """Wrap the prompt-injection fixture check in a list.

    Args:
        _root: Unused; present so every suite handler shares one signature.

    Returns:
        A single-element list holding whatever
        ``prompt_injection_eval_fixture()`` returns.
    """
    fixture_check = prompt_injection_eval_fixture()
    return [fixture_check]
|
|
108
324
|
|
|
@@ -117,15 +333,27 @@ def eval_generated_agent_contract(_root: Path) -> list[dict[str, Any]]:
|
|
|
117
333
|
return [{"id": "generated-agent.contract-placeholder", "status": "passed"}]
|
|
118
334
|
|
|
119
335
|
|
|
120
|
-
def run_payload(
|
|
336
|
+
def run_payload(
    suite: str,
    status: str,
    checks: list[dict[str, Any]],
    *,
    started_at: datetime | None = None,
) -> dict[str, Any]:
    """Assemble the ``eval-run`` payload for one suite execution.

    Args:
        suite: Suite id as displayed to the user; hyphens become underscores
            inside the generated ``run_id``.
        status: Overall status; ``ok`` is ``True`` only for ``"passed"``.
        checks: Check dicts (or nested run payloads) produced by the suite.
        started_at: When the run began. Defaults to the current UTC time.

    Returns:
        A dict with run identity, timestamps (ISO 8601), aggregated
        ``metrics`` and a JSON round-tripped copy of ``checks``.
    """
    if not started_at:
        started_at = datetime.now(timezone.utc)
    finished_at = datetime.now(timezone.utc)
    elapsed_ms = int((finished_at - started_at).total_seconds() * 1000)
    # Clamp so a start time later than the finish time never yields a negative duration.
    duration_ms = max(0, elapsed_ms)

    stamp = started_at.strftime('%Y%m%d%H%M%S')
    suite_slug = suite.replace('-', '_')

    # Round-tripping through JSON detaches the stored checks from the caller's objects.
    checks_copy = json.loads(json.dumps(checks, ensure_ascii=False))
    return {
        "kind": "eval-run",
        "schema_version": EVAL_SCHEMA_VERSION,
        "run_id": f"eval_{stamp}_{suite_slug}",
        "suite": suite,
        "status": status,
        "ok": status == "passed",
        "started_at": started_at.isoformat(),
        "finished_at": finished_at.isoformat(),
        "metrics": eval_metrics(checks, duration_ms=duration_ms),
        "checks": checks_copy,
    }
|
|
131
359
|
|
|
@@ -134,9 +362,21 @@ def canonical_suite_ids() -> list[str]:
|
|
|
134
362
|
return [
|
|
135
363
|
"routing",
|
|
136
364
|
"catalog",
|
|
365
|
+
"wizard",
|
|
366
|
+
"source_config",
|
|
367
|
+
"identity_enforcement",
|
|
368
|
+
"review_gate",
|
|
369
|
+
"model_router",
|
|
370
|
+
"agentic_plan",
|
|
137
371
|
"write_policy",
|
|
138
372
|
"source_readiness",
|
|
139
373
|
"mcp_contract",
|
|
374
|
+
"workflow_contract",
|
|
375
|
+
"extension_contract",
|
|
376
|
+
"contribution_contract",
|
|
377
|
+
"team_contract",
|
|
378
|
+
"knowledge_contract",
|
|
379
|
+
"secret_refs",
|
|
140
380
|
"prompt_injection",
|
|
141
381
|
"mini_brain_limits",
|
|
142
382
|
"generated_agent_contract",
|
|
@@ -147,6 +387,8 @@ def normalize_suite_id(value: str) -> str:
|
|
|
147
387
|
normalized = (value or "").strip().replace("-", "_")
|
|
148
388
|
if normalized == "mcp":
|
|
149
389
|
return "mcp_contract"
|
|
390
|
+
if normalized == "prompt-injection":
|
|
391
|
+
return "prompt_injection"
|
|
150
392
|
return normalized
|
|
151
393
|
|
|
152
394
|
|
|
@@ -156,3 +398,107 @@ def display_suite_id(value: str) -> str:
|
|
|
156
398
|
if value == "mcp_contract":
|
|
157
399
|
return "mcp"
|
|
158
400
|
return value
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
def persist_run(payload: dict[str, Any]) -> dict[str, Any]:
    """Write a run payload to disk as JSON and markdown.

    The files are written before the path keys are added, so the persisted
    JSON does not contain ``json_path`` or ``markdown_path``.

    Args:
        payload: Run payload; must contain ``run_id``. It is mutated in place.

    Returns:
        The same ``payload`` object, now carrying ``json_path`` and
        ``markdown_path`` as strings.
    """
    run_id = str(payload["run_id"])

    json_file = eval_run_path(run_id)
    serialized = json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True)
    json_file.write_text(serialized + "\n", encoding="utf-8")

    markdown_file = eval_markdown_path(run_id)
    markdown_file.write_text(render_eval_markdown(payload), encoding="utf-8")

    payload.update(json_path=str(json_file), markdown_path=str(markdown_file))
    return payload
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
def eval_runs_home() -> Path:
    """Return the directory for persisted eval runs, creating it if needed.

    Returns:
        The ``evals/runs`` path resolved through ``app_path``.
    """
    ensure_app_home()
    runs_dir = app_path("evals", "runs")
    runs_dir.mkdir(parents=True, exist_ok=True)
    return runs_dir
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
def eval_run_path(run_id: str) -> Path:
    """Return the JSON file path for ``run_id`` inside the runs directory.

    Args:
        run_id: Run identifier; sanitised with ``safe_run_id`` before use.

    Returns:
        ``<runs home>/<sanitised id>.json``.
    """
    runs_dir = eval_runs_home()
    file_name = safe_run_id(run_id) + ".json"
    return runs_dir / file_name
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
def eval_markdown_path(run_id: str) -> Path:
    """Return the markdown file path for ``run_id`` inside the runs directory.

    Args:
        run_id: Run identifier; sanitised with ``safe_run_id`` before use.

    Returns:
        ``<runs home>/<sanitised id>.md``.
    """
    runs_dir = eval_runs_home()
    file_name = safe_run_id(run_id) + ".md"
    return runs_dir / file_name
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
def list_eval_runs() -> list[dict[str, Any]]:
    """Summarise every readable persisted eval run, newest file name first.

    Files that cannot be read, are not valid UTF-8, are not valid JSON, or
    whose top-level JSON value is not an object are skipped so that one bad
    file never breaks the listing.

    Returns:
        One summary dict per usable run file with ``run_id``, ``suite``,
        ``status``, ``ok``, ``started_at``, ``json_path`` and
        ``markdown_path``.
    """
    runs: list[dict[str, Any]] = []
    for path in sorted(eval_runs_home().glob("*.json"), reverse=True):
        try:
            payload = json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers both json.JSONDecodeError and the
            # UnicodeDecodeError raised for files that are not valid UTF-8.
            continue
        if not isinstance(payload, dict):
            # Valid JSON that is not an object (list, string, ...) has no
            # run fields to summarise and would fail on ``.get``.
            continue
        runs.append(
            {
                "run_id": payload.get("run_id"),
                "suite": payload.get("suite"),
                "status": payload.get("status"),
                "ok": payload.get("ok"),
                "started_at": payload.get("started_at"),
                "json_path": str(path),
                "markdown_path": str(path.with_suffix(".md")),
            }
        )
    return runs
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
def read_eval_run(run_id: str) -> dict[str, Any]:
    """Load the persisted JSON payload for ``run_id``.

    Args:
        run_id: Identifier of the run to load.

    Returns:
        The decoded JSON content of the run file.

    Raises:
        ValueError: If no run file exists for ``run_id``.
    """
    run_file = eval_run_path(run_id)
    if run_file.exists():
        raw = run_file.read_text(encoding="utf-8")
        return json.loads(raw)
    raise ValueError(f"eval run not found: {run_id}")
|
|
454
|
+
|
|
455
|
+
|
|
456
|
+
def render_eval_markdown(payload: dict[str, Any]) -> str:
    """Render a run payload as a short markdown report.

    Plain checks render as ``- <id>: <status>``. Nested ``eval-run`` payloads
    (the shape produced when the ``all`` suite aggregates other runs) have no
    ``id``, so they render as ``- <suite>: <status>`` with their own checks
    indented underneath instead of as ``- None: <status>``.

    Args:
        payload: Run payload; ``run_id``, ``suite``, ``status``,
            ``started_at`` and ``checks`` are read with ``.get``.

    Returns:
        The markdown text, ending with a newline. Entries in ``checks`` that
        are not dicts are omitted.
    """
    lines = [
        f"# Eval {payload.get('run_id')}",
        "",
        f"- Suite: {payload.get('suite')}",
        f"- Status: {payload.get('status')}",
        f"- Started: {payload.get('started_at')}",
        "",
        "## Checks",
    ]
    for check in payload.get("checks") or []:
        lines.extend(_eval_check_lines(check))
    # Trailing empty element gives the joined text a final newline.
    lines.append("")
    return "\n".join(lines)


def _eval_check_lines(check: Any, depth: int = 0) -> list[str]:
    """Return the markdown bullet lines for one check or nested eval run."""
    if not isinstance(check, dict):
        return []
    indent = "  " * depth
    if check.get("kind") != "eval-run":
        return [f"{indent}- {check.get('id')}: {check.get('status')}"]
    bullets = [f"{indent}- {check.get('suite')}: {check.get('status')}"]
    for nested in check.get("checks") or []:
        bullets.extend(_eval_check_lines(nested, depth + 1))
    return bullets
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
def eval_metrics(checks: list[dict[str, Any]], *, duration_ms: int = 0) -> dict[str, Any]:
    """Aggregate pass/fail counts over a (possibly nested) list of checks.

    Args:
        checks: Check dicts or nested run payloads; flattened with
            ``flatten_checks`` before counting.
        duration_ms: Run duration to echo back in the result.

    Returns:
        A dict with ``total``, ``passed``, ``failed``, ``success``,
        ``regression``, ``completeness`` (rounded to 4 places), ``schema``
        (always ``"passed"``), ``security`` and ``duration_ms``. With no
        checks at all, ``success`` is ``False`` and ``completeness`` is 0.0.
    """
    leaves = flatten_checks(checks)
    total = len(leaves)
    passed = sum(1 for leaf in leaves if leaf.get("status") == "passed")
    failed = sum(1 for leaf in leaves if leaf.get("status") == "failed")

    if total:
        success = passed == total
        completeness = passed / total
    else:
        # An empty run is never a success.
        success = False
        completeness = 0.0

    return {
        "total": total,
        "passed": passed,
        "failed": failed,
        "success": success,
        "regression": "passed" if success else "failed",
        "completeness": round(completeness, 4),
        "schema": "passed",
        "security": "needs-review" if failed != 0 else "passed",
        "duration_ms": duration_ms,
    }
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
def flatten_checks(checks: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Expand nested ``eval-run`` payloads into a flat list of leaf checks.

    Args:
        checks: Check dicts; a dict whose ``kind`` is ``eval-run`` is replaced
            by the recursively flattened content of its ``checks`` value.

    Returns:
        The leaf check dicts in encounter order. Entries that are not dicts
        are dropped.
    """
    leaves: list[dict[str, Any]] = []
    for entry in checks:
        if not isinstance(entry, dict):
            continue
        if entry.get("kind") != "eval-run":
            leaves.append(entry)
            continue
        leaves.extend(flatten_checks(entry.get("checks") or []))
    return leaves
|
|
501
|
+
|
|
502
|
+
|
|
503
|
+
def safe_run_id(run_id: str) -> str:
    """Make ``run_id`` usable as a file name stem.

    Every run of characters other than ASCII letters, digits, ``_``, ``.``
    and ``-`` is collapsed into a single underscore.

    Args:
        run_id: Raw run identifier.

    Returns:
        The sanitised identifier; path separators never survive.
    """
    disallowed = re.compile(r"[^a-zA-Z0-9_.-]+")
    return disallowed.sub("_", run_id)
|