aw-ecc 1.4.32 → 1.4.47
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/.codex/hooks/aw-post-tool-use.sh +8 -2
- package/.codex/hooks/aw-session-start.sh +11 -4
- package/.codex/hooks/aw-stop.sh +8 -2
- package/.codex/hooks/aw-user-prompt-submit.sh +10 -2
- package/.codex/hooks.json +8 -8
- package/.cursor/INSTALL.md +7 -5
- package/.cursor/hooks/adapter.js +41 -4
- package/.cursor/hooks/after-agent-response.js +62 -0
- package/.cursor/hooks/before-submit-prompt.js +7 -1
- package/.cursor/hooks/post-tool-use-failure.js +21 -0
- package/.cursor/hooks/post-tool-use.js +39 -0
- package/.cursor/hooks/shared/aw-phase-definitions.js +53 -0
- package/.cursor/hooks/shared/aw-phase-runner.js +3 -1
- package/.cursor/hooks/subagent-start.js +22 -4
- package/.cursor/hooks/subagent-stop.js +18 -1
- package/.cursor/hooks.json +23 -2
- package/.opencode/package.json +1 -1
- package/AGENTS.md +3 -3
- package/README.md +5 -5
- package/commands/adk.md +52 -0
- package/commands/build.md +22 -9
- package/commands/deploy.md +12 -0
- package/commands/execute.md +9 -0
- package/commands/feature.md +333 -0
- package/commands/investigate.md +18 -5
- package/commands/plan.md +23 -9
- package/commands/publish.md +65 -0
- package/commands/review.md +12 -0
- package/commands/ship.md +12 -0
- package/commands/test.md +12 -0
- package/commands/verify.md +9 -0
- package/hooks/hooks.json +36 -0
- package/manifests/install-components.json +8 -0
- package/manifests/install-modules.json +83 -0
- package/manifests/install-profiles.json +7 -0
- package/package.json +1 -1
- package/scripts/ci/validate-rules.js +51 -0
- package/scripts/cursor-aw-home/hooks.json +23 -2
- package/scripts/cursor-aw-hooks/adapter.js +41 -4
- package/scripts/cursor-aw-hooks/before-submit-prompt.js +7 -1
- package/scripts/hooks/aw-usage-commit-created.js +32 -0
- package/scripts/hooks/aw-usage-post-tool-use-failure.js +56 -0
- package/scripts/hooks/aw-usage-post-tool-use.js +242 -0
- package/scripts/hooks/aw-usage-prompt-submit.js +112 -0
- package/scripts/hooks/aw-usage-session-start.js +48 -0
- package/scripts/hooks/aw-usage-stop.js +182 -0
- package/scripts/hooks/aw-usage-telemetry-send.js +84 -0
- package/scripts/hooks/cost-tracker.js +3 -23
- package/scripts/hooks/shared/aw-phase-definitions.js +53 -0
- package/scripts/hooks/shared/aw-phase-runner.js +3 -1
- package/scripts/lib/aw-hook-contract.js +2 -2
- package/scripts/lib/aw-pricing.js +306 -0
- package/scripts/lib/aw-usage-telemetry.js +472 -0
- package/scripts/lib/codex-hook-config.js +8 -8
- package/scripts/lib/cursor-hook-config.js +25 -10
- package/scripts/lib/install-targets/cursor-project.js +3 -0
- package/scripts/lib/install-targets/helpers.js +20 -3
- package/skills/aw-adk/SKILL.md +317 -0
- package/skills/aw-adk/agents/analyzer.md +113 -0
- package/skills/aw-adk/agents/comparator.md +113 -0
- package/skills/aw-adk/agents/grader.md +115 -0
- package/skills/aw-adk/assets/eval_review.html +76 -0
- package/skills/aw-adk/eval-viewer/generate_review.py +164 -0
- package/skills/aw-adk/eval-viewer/viewer.html +181 -0
- package/skills/aw-adk/evals/eval-colocated-placement.md +84 -0
- package/skills/aw-adk/evals/eval-create-agent.md +90 -0
- package/skills/aw-adk/evals/eval-create-command.md +98 -0
- package/skills/aw-adk/evals/eval-create-eval.md +89 -0
- package/skills/aw-adk/evals/eval-create-rule.md +99 -0
- package/skills/aw-adk/evals/eval-create-skill.md +97 -0
- package/skills/aw-adk/evals/eval-delete-agent.md +79 -0
- package/skills/aw-adk/evals/eval-delete-command.md +89 -0
- package/skills/aw-adk/evals/eval-delete-rule.md +86 -0
- package/skills/aw-adk/evals/eval-delete-skill.md +90 -0
- package/skills/aw-adk/evals/eval-meta-eval-coverage.md +78 -0
- package/skills/aw-adk/evals/eval-meta-eval-determinism.md +81 -0
- package/skills/aw-adk/evals/eval-meta-eval-false-pass.md +81 -0
- package/skills/aw-adk/evals/eval-score-accuracy.md +95 -0
- package/skills/aw-adk/evals/eval-type-redirect.md +68 -0
- package/skills/aw-adk/evals/evals.json +96 -0
- package/skills/aw-adk/references/artifact-wiring.md +162 -0
- package/skills/aw-adk/references/cross-ide-mapping.md +71 -0
- package/skills/aw-adk/references/eval-placement-guide.md +183 -0
- package/skills/aw-adk/references/external-resources.md +75 -0
- package/skills/aw-adk/references/getting-started.md +66 -0
- package/skills/aw-adk/references/registry-structure.md +152 -0
- package/skills/aw-adk/references/rubric-agent.md +36 -0
- package/skills/aw-adk/references/rubric-command.md +36 -0
- package/skills/aw-adk/references/rubric-eval.md +36 -0
- package/skills/aw-adk/references/rubric-meta-eval.md +132 -0
- package/skills/aw-adk/references/rubric-rule.md +36 -0
- package/skills/aw-adk/references/rubric-skill.md +36 -0
- package/skills/aw-adk/references/schemas.md +222 -0
- package/skills/aw-adk/references/template-agent.md +251 -0
- package/skills/aw-adk/references/template-command.md +279 -0
- package/skills/aw-adk/references/template-eval.md +176 -0
- package/skills/aw-adk/references/template-rule.md +119 -0
- package/skills/aw-adk/references/template-skill.md +123 -0
- package/skills/aw-adk/references/type-classifier.md +98 -0
- package/skills/aw-adk/references/writing-good-agents.md +227 -0
- package/skills/aw-adk/references/writing-good-commands.md +258 -0
- package/skills/aw-adk/references/writing-good-evals.md +271 -0
- package/skills/aw-adk/references/writing-good-rules.md +214 -0
- package/skills/aw-adk/references/writing-good-skills.md +159 -0
- package/skills/aw-adk/scripts/aggregate-benchmark.py +190 -0
- package/skills/aw-adk/scripts/lint-artifact.sh +211 -0
- package/skills/aw-adk/scripts/score-artifact.sh +179 -0
- package/skills/aw-adk/scripts/trigger-eval.py +192 -0
- package/skills/aw-build/SKILL.md +19 -2
- package/skills/aw-deploy/SKILL.md +65 -3
- package/skills/aw-design/SKILL.md +156 -0
- package/skills/aw-design/references/highrise-tokens.md +394 -0
- package/skills/aw-design/references/micro-interactions.md +76 -0
- package/skills/aw-design/references/prompt-template.md +160 -0
- package/skills/aw-design/references/quality-checklist.md +70 -0
- package/skills/aw-design/references/self-review.md +497 -0
- package/skills/aw-design/references/stitch-workflow.md +127 -0
- package/skills/aw-feature/SKILL.md +293 -0
- package/skills/aw-investigate/SKILL.md +17 -0
- package/skills/aw-plan/SKILL.md +34 -3
- package/skills/aw-publish/SKILL.md +300 -0
- package/skills/aw-publish/evals/eval-confirmation-gate.md +60 -0
- package/skills/aw-publish/evals/eval-intent-detection.md +111 -0
- package/skills/aw-publish/evals/eval-push-modes.md +67 -0
- package/skills/aw-publish/evals/eval-rules-push.md +60 -0
- package/skills/aw-publish/evals/evals.json +29 -0
- package/skills/aw-publish/references/push-modes.md +38 -0
- package/skills/aw-review/SKILL.md +88 -9
- package/skills/aw-rules-review/SKILL.md +124 -0
- package/skills/aw-rules-review/agents/openai.yaml +3 -0
- package/skills/aw-rules-review/scripts/generate-review-template.mjs +323 -0
- package/skills/aw-ship/SKILL.md +16 -0
- package/skills/aw-spec/SKILL.md +15 -0
- package/skills/aw-tasks/SKILL.md +15 -0
- package/skills/aw-test/SKILL.md +16 -0
- package/skills/aw-yolo/SKILL.md +4 -0
- package/skills/diagnose/SKILL.md +121 -0
- package/skills/diagnose/scripts/hitl-loop.template.sh +41 -0
- package/skills/finish-only-when-green/SKILL.md +265 -0
- package/skills/grill-me/SKILL.md +24 -0
- package/skills/grill-with-docs/SKILL.md +92 -0
- package/skills/grill-with-docs/adr-format.md +47 -0
- package/skills/grill-with-docs/context-format.md +67 -0
- package/skills/improve-codebase-architecture/SKILL.md +75 -0
- package/skills/improve-codebase-architecture/deepening.md +37 -0
- package/skills/improve-codebase-architecture/interface-design.md +44 -0
- package/skills/improve-codebase-architecture/language.md +53 -0
- package/skills/local-ghl-setup-from-screenshot/SKILL.md +538 -0
- package/skills/tdd/SKILL.md +115 -0
- package/skills/tdd/deep-modules.md +33 -0
- package/skills/tdd/interface-design.md +31 -0
- package/skills/tdd/mocking.md +59 -0
- package/skills/tdd/refactoring.md +10 -0
- package/skills/tdd/tests.md +61 -0
- package/skills/to-issues/SKILL.md +62 -0
- package/skills/to-prd/SKILL.md +75 -0
- package/skills/using-aw-skills/SKILL.md +170 -237
- package/skills/using-aw-skills/hooks/session-start.sh +11 -41
- package/skills/zoom-out/SKILL.md +24 -0
- package/.cursor/rules/common-agents.md +0 -53
- package/.cursor/rules/common-aw-routing.md +0 -43
- package/.cursor/rules/common-coding-style.md +0 -52
- package/.cursor/rules/common-development-workflow.md +0 -33
- package/.cursor/rules/common-git-workflow.md +0 -28
- package/.cursor/rules/common-hooks.md +0 -34
- package/.cursor/rules/common-patterns.md +0 -35
- package/.cursor/rules/common-performance.md +0 -59
- package/.cursor/rules/common-security.md +0 -33
- package/.cursor/rules/common-testing.md +0 -33
- package/.cursor/skills/api-and-interface-design/SKILL.md +0 -75
- package/.cursor/skills/article-writing/SKILL.md +0 -85
- package/.cursor/skills/aw-brainstorm/SKILL.md +0 -115
- package/.cursor/skills/aw-build/SKILL.md +0 -152
- package/.cursor/skills/aw-build/evals/build-stage-cases.json +0 -28
- package/.cursor/skills/aw-debug/SKILL.md +0 -49
- package/.cursor/skills/aw-deploy/SKILL.md +0 -101
- package/.cursor/skills/aw-deploy/evals/deploy-stage-cases.json +0 -32
- package/.cursor/skills/aw-execute/SKILL.md +0 -47
- package/.cursor/skills/aw-execute/references/mode-code.md +0 -47
- package/.cursor/skills/aw-execute/references/mode-docs.md +0 -28
- package/.cursor/skills/aw-execute/references/mode-infra.md +0 -44
- package/.cursor/skills/aw-execute/references/mode-migration.md +0 -58
- package/.cursor/skills/aw-execute/references/worker-implementer.md +0 -26
- package/.cursor/skills/aw-execute/references/worker-parallel-worker.md +0 -23
- package/.cursor/skills/aw-execute/references/worker-quality-reviewer.md +0 -23
- package/.cursor/skills/aw-execute/references/worker-spec-reviewer.md +0 -23
- package/.cursor/skills/aw-execute/scripts/build-worker-bundle.js +0 -229
- package/.cursor/skills/aw-finish/SKILL.md +0 -111
- package/.cursor/skills/aw-investigate/SKILL.md +0 -109
- package/.cursor/skills/aw-plan/SKILL.md +0 -368
- package/.cursor/skills/aw-prepare/SKILL.md +0 -118
- package/.cursor/skills/aw-review/SKILL.md +0 -118
- package/.cursor/skills/aw-ship/SKILL.md +0 -115
- package/.cursor/skills/aw-spec/SKILL.md +0 -104
- package/.cursor/skills/aw-tasks/SKILL.md +0 -138
- package/.cursor/skills/aw-test/SKILL.md +0 -118
- package/.cursor/skills/aw-verify/SKILL.md +0 -51
- package/.cursor/skills/aw-yolo/SKILL.md +0 -111
- package/.cursor/skills/browser-testing-with-devtools/SKILL.md +0 -81
- package/.cursor/skills/bun-runtime/SKILL.md +0 -84
- package/.cursor/skills/ci-cd-and-automation/SKILL.md +0 -71
- package/.cursor/skills/code-simplification/SKILL.md +0 -74
- package/.cursor/skills/content-engine/SKILL.md +0 -88
- package/.cursor/skills/context-engineering/SKILL.md +0 -74
- package/.cursor/skills/deprecation-and-migration/SKILL.md +0 -75
- package/.cursor/skills/documentation-and-adrs/SKILL.md +0 -75
- package/.cursor/skills/documentation-lookup/SKILL.md +0 -90
- package/.cursor/skills/frontend-slides/SKILL.md +0 -184
- package/.cursor/skills/frontend-slides/STYLE_PRESETS.md +0 -330
- package/.cursor/skills/frontend-ui-engineering/SKILL.md +0 -68
- package/.cursor/skills/git-workflow-and-versioning/SKILL.md +0 -75
- package/.cursor/skills/idea-refine/SKILL.md +0 -84
- package/.cursor/skills/incremental-implementation/SKILL.md +0 -75
- package/.cursor/skills/investor-materials/SKILL.md +0 -96
- package/.cursor/skills/investor-outreach/SKILL.md +0 -76
- package/.cursor/skills/market-research/SKILL.md +0 -75
- package/.cursor/skills/mcp-server-patterns/SKILL.md +0 -67
- package/.cursor/skills/nextjs-turbopack/SKILL.md +0 -44
- package/.cursor/skills/performance-optimization/SKILL.md +0 -77
- package/.cursor/skills/security-and-hardening/SKILL.md +0 -70
- package/.cursor/skills/using-aw-skills/SKILL.md +0 -290
- package/.cursor/skills/using-aw-skills/evals/skill-trigger-cases.tsv +0 -25
- package/.cursor/skills/using-aw-skills/evals/test-skill-triggers.sh +0 -171
- package/.cursor/skills/using-aw-skills/hooks/hooks.json +0 -9
- package/.cursor/skills/using-aw-skills/hooks/session-start.sh +0 -67
- package/.cursor/skills/using-platform-skills/SKILL.md +0 -163
- package/.cursor/skills/using-platform-skills/evals/platform-selection-cases.json +0 -52
- /package/.cursor/rules/{golang-coding-style.md → golang-coding-style.mdc} +0 -0
- /package/.cursor/rules/{golang-hooks.md → golang-hooks.mdc} +0 -0
- /package/.cursor/rules/{golang-patterns.md → golang-patterns.mdc} +0 -0
- /package/.cursor/rules/{golang-security.md → golang-security.mdc} +0 -0
- /package/.cursor/rules/{golang-testing.md → golang-testing.mdc} +0 -0
- /package/.cursor/rules/{kotlin-coding-style.md → kotlin-coding-style.mdc} +0 -0
- /package/.cursor/rules/{kotlin-hooks.md → kotlin-hooks.mdc} +0 -0
- /package/.cursor/rules/{kotlin-patterns.md → kotlin-patterns.mdc} +0 -0
- /package/.cursor/rules/{kotlin-security.md → kotlin-security.mdc} +0 -0
- /package/.cursor/rules/{kotlin-testing.md → kotlin-testing.mdc} +0 -0
- /package/.cursor/rules/{php-coding-style.md → php-coding-style.mdc} +0 -0
- /package/.cursor/rules/{php-hooks.md → php-hooks.mdc} +0 -0
- /package/.cursor/rules/{php-patterns.md → php-patterns.mdc} +0 -0
- /package/.cursor/rules/{php-security.md → php-security.mdc} +0 -0
- /package/.cursor/rules/{php-testing.md → php-testing.mdc} +0 -0
- /package/.cursor/rules/{python-coding-style.md → python-coding-style.mdc} +0 -0
- /package/.cursor/rules/{python-hooks.md → python-hooks.mdc} +0 -0
- /package/.cursor/rules/{python-patterns.md → python-patterns.mdc} +0 -0
- /package/.cursor/rules/{python-security.md → python-security.mdc} +0 -0
- /package/.cursor/rules/{python-testing.md → python-testing.mdc} +0 -0
- /package/.cursor/rules/{swift-coding-style.md → swift-coding-style.mdc} +0 -0
- /package/.cursor/rules/{swift-hooks.md → swift-hooks.mdc} +0 -0
- /package/.cursor/rules/{swift-patterns.md → swift-patterns.mdc} +0 -0
- /package/.cursor/rules/{swift-security.md → swift-security.mdc} +0 -0
- /package/.cursor/rules/{swift-testing.md → swift-testing.mdc} +0 -0
- /package/.cursor/rules/{typescript-coding-style.md → typescript-coding-style.mdc} +0 -0
- /package/.cursor/rules/{typescript-hooks.md → typescript-hooks.mdc} +0 -0
- /package/.cursor/rules/{typescript-patterns.md → typescript-patterns.mdc} +0 -0
- /package/.cursor/rules/{typescript-security.md → typescript-security.mdc} +0 -0
- /package/.cursor/rules/{typescript-testing.md → typescript-testing.mdc} +0 -0
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
aggregate-benchmark.py — Aggregates ADK eval results into benchmark.json
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
python skills/aw-adk/scripts/aggregate-benchmark.py <workspace>/iteration-N --artifact-name <name>
|
|
7
|
+
|
|
8
|
+
Reads grading.json and timing.json from each eval directory,
|
|
9
|
+
produces benchmark.json and benchmark.md with aggregate statistics.
|
|
10
|
+
|
|
11
|
+
Adapted from skill-creator's aggregate_benchmark.py for CASRE context.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import argparse
|
|
15
|
+
import json
|
|
16
|
+
import os
|
|
17
|
+
import sys
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from statistics import mean, stdev
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def load_json(path: str) -> dict:
    """Load a JSON object from ``path``.

    Args:
        path: Filesystem path of the JSON file to read.

    Returns:
        The parsed JSON object. An empty dict is returned when the file does
        not exist, does not contain valid JSON, or its top-level value is not
        a JSON object (for example a list or a bare number).
    """
    try:
        # JSON interchange is UTF-8; do not depend on the process locale.
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return {}
    # Honour the declared return type so dict-style access on the result
    # cannot fail on an unexpected payload shape.
    return data if isinstance(data, dict) else {}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def collect_runs(iteration_dir: str) -> list[dict]:
    """Collect all grading results from an iteration directory.

    The directory is scanned two levels deep: every sub-directory of
    ``iteration_dir`` is treated as one eval, and every sub-directory of an
    eval as one configuration. A configuration contributes a run only when
    its ``grading.json`` loads to a non-empty object.

    Args:
        iteration_dir: Path of the iteration directory to scan.

    Returns:
        One dict per graded configuration, in sorted (eval, configuration)
        directory order, with the keys ``eval_id``, ``eval_name``,
        ``configuration``, ``run_number``, ``result`` and ``expectations``.
        Missing numeric fields default to 0.
    """
    runs = []

    for eval_dir in sorted(Path(iteration_dir).iterdir()):
        if not eval_dir.is_dir():
            continue

        # eval_metadata.json sits at eval level, so read it once per eval
        # instead of once per configuration.
        metadata = load_json(str(eval_dir / "eval_metadata.json"))

        for config_dir in sorted(eval_dir.iterdir()):
            if not config_dir.is_dir():
                continue

            grading = load_json(str(config_dir / "grading.json"))
            if not grading:
                continue
            timing = load_json(str(config_dir / "timing.json"))

            # "or {}" also covers an explicit null in the JSON, which a
            # plain .get(key, {}) default would pass through as None.
            summary = grading.get("summary") or {}
            metrics = grading.get("execution_metrics") or {}

            runs.append({
                "eval_id": metadata.get("eval_id", eval_dir.name),
                "eval_name": metadata.get("eval_name", eval_dir.name),
                "configuration": config_dir.name,
                "run_number": 1,
                "result": {
                    "pass_rate": summary.get("pass_rate", 0),
                    "passed": summary.get("passed", 0),
                    "failed": summary.get("failed", 0),
                    "total": summary.get("total", 0),
                    "time_seconds": timing.get("total_duration_seconds", 0),
                    "tokens": timing.get("total_tokens", 0),
                    "errors": metrics.get("errors_encountered", 0),
                },
                "expectations": grading.get("expectations", []),
            })

    return runs
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def compute_summary(runs: list[dict]) -> dict:
    """Compute aggregate statistics per configuration.

    Args:
        runs: Run dicts, each with a ``configuration`` name and a ``result``
            dict holding ``pass_rate``, ``time_seconds`` and ``tokens``.

    Returns:
        A dict keyed by configuration name. Each value holds ``pass_rate``
        (mean/stddev/min/max), ``time_seconds`` (mean/stddev) and ``tokens``
        (mean/stddev). Time and token statistics ignore non-positive samples.
        When both a treatment configuration (name contains "with") and a
        baseline configuration (name contains "without" or "old") are
        present, a ``delta`` entry holds the signed differences
        treatment - baseline of the three means, formatted as strings.
    """
    by_config: dict[str, list[dict]] = {}
    for run in runs:
        by_config.setdefault(run["configuration"], []).append(run)

    summary = {}
    for config, config_runs in by_config.items():
        pass_rates = [r["result"]["pass_rate"] for r in config_runs]
        times = [r["result"]["time_seconds"] for r in config_runs if r["result"]["time_seconds"] > 0]
        tokens = [r["result"]["tokens"] for r in config_runs if r["result"]["tokens"] > 0]

        summary[config] = {
            "pass_rate": {
                "mean": round(mean(pass_rates), 3) if pass_rates else 0,
                # stdev() needs at least two samples.
                "stddev": round(stdev(pass_rates), 3) if len(pass_rates) > 1 else 0,
                "min": round(min(pass_rates), 3) if pass_rates else 0,
                "max": round(max(pass_rates), 3) if pass_rates else 0,
            },
            "time_seconds": {
                "mean": round(mean(times), 1) if times else 0,
                "stddev": round(stdev(times), 1) if len(times) > 1 else 0,
            },
            "tokens": {
                "mean": round(mean(tokens)) if tokens else 0,
                "stddev": round(stdev(tokens)) if len(tokens) > 1 else 0,
            },
        }

    # Compute delta between with_artifact and without_artifact (or with_skill/without_skill)
    def _is_baseline(name: str) -> bool:
        return "without" in name or "old" in name

    without_key = next((k for k in summary if _is_baseline(k)), None)
    # "without_*" contains the substring "with", so baseline names must be
    # excluded explicitly or the baseline could be compared against itself.
    with_key = next((k for k in summary if "with" in k and not _is_baseline(k)), None)

    if with_key and without_key:
        def _delta(metric: str) -> float:
            return summary[with_key][metric]["mean"] - summary[without_key][metric]["mean"]

        # The "+" format flag always emits a sign, so regressions render as
        # "-0.100" rather than "+-0.100".
        summary["delta"] = {
            "pass_rate": f"{_delta('pass_rate'):+.3f}",
            "time_seconds": f"{_delta('time_seconds'):+.1f}",
            "tokens": f"{_delta('tokens'):+.0f}",
        }

    return summary
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def generate_markdown(benchmark: dict) -> str:
    """Render a benchmark dict as the Markdown text of benchmark.md.

    Args:
        benchmark: Dict with ``metadata.artifact_name`` and, optionally, a
            ``run_summary`` mapping of configuration name to statistics.

    Returns:
        A Markdown string: a title line, one section per configuration, and
        a final "Delta" section when ``run_summary`` has a ``delta`` entry.
    """
    out = [f"# Benchmark: {benchmark['metadata']['artifact_name']}", ""]
    run_summary = benchmark.get("run_summary", {})

    for name in run_summary:
        # The delta entry is rendered last, in its own section.
        if name == "delta":
            continue
        stats = run_summary[name]
        rate = stats.get("pass_rate", {})
        secs = stats.get("time_seconds", {})
        toks = stats.get("tokens", {})
        out.extend([
            f"## {name}",
            f"- Pass rate: {rate.get('mean', 0):.1%} ± {rate.get('stddev', 0):.1%}",
            f"- Time: {secs.get('mean', 0):.1f}s ± {secs.get('stddev', 0):.1f}s",
            f"- Tokens: {toks.get('mean', 0):.0f} ± {toks.get('stddev', 0):.0f}",
            "",
        ])

    if "delta" in run_summary:
        delta = run_summary["delta"]
        out += [
            "## Delta",
            f"- Pass rate: {delta['pass_rate']}",
            f"- Time: {delta['time_seconds']}s",
            f"- Tokens: {delta['tokens']}",
            "",
        ]

    return "\n".join(out)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def main():
    """Command-line entry point.

    Parses ``iteration_dir`` and ``--artifact-name``, aggregates the grading
    results found under the iteration directory, and writes
    ``benchmark.json`` and ``benchmark.md`` into that directory.

    Exits with status 1 when ``iteration_dir`` is not a directory. When no
    grading results are found, a warning is printed to stderr and both files
    are still written.
    """
    parser = argparse.ArgumentParser(description="Aggregate ADK eval results into benchmark")
    parser.add_argument("iteration_dir", help="Path to iteration directory")
    parser.add_argument("--artifact-name", required=True, help="Name of the artifact being benchmarked")
    args = parser.parse_args()

    if not os.path.isdir(args.iteration_dir):
        print(f"Error: {args.iteration_dir} is not a directory", file=sys.stderr)
        sys.exit(1)

    runs = collect_runs(args.iteration_dir)

    if not runs:
        print(f"Warning: No grading results found in {args.iteration_dir}", file=sys.stderr)

    benchmark = {
        "metadata": {
            "artifact_name": args.artifact_name,
            "iteration_dir": args.iteration_dir,
            # Set iteration order is not stable between processes; sort so
            # repeated runs over the same data produce identical output.
            # key=str keeps the sort total even if names are not all strings.
            "evals_run": sorted({r["eval_name"] for r in runs}, key=str),
            "total_runs": len(runs),
        },
        "runs": runs,
        "run_summary": compute_summary(runs),
        "notes": [],
    }

    # Write benchmark.json
    output_path = os.path.join(args.iteration_dir, "benchmark.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(benchmark, f, indent=2)
    print(f"Wrote {output_path}")

    # Write benchmark.md
    # The report contains the non-ASCII "±" sign, so the encoding is pinned
    # rather than left to the process locale.
    md_path = os.path.join(args.iteration_dir, "benchmark.md")
    with open(md_path, "w", encoding="utf-8") as f:
        f.write(generate_markdown(benchmark))
    print(f"Wrote {md_path}")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# lint-artifact.sh — Validates CASRE artifact structure
# Usage: bash skills/aw-adk/scripts/lint-artifact.sh <path> <type>
# Types: command | agent | skill | rule | eval
# Returns JSON with pass/fail per check
#
# Argument problems are reported as a one-line JSON object on stderr and
# terminate the script with exit status 1.

set -euo pipefail

ARTIFACT_PATH="${1:-}"
ARTIFACT_TYPE="${2:-}"
ERRORS=()   # error messages collected by the checks
WARNINGS=() # warning messages collected by the checks

if [[ -z "$ARTIFACT_PATH" || -z "$ARTIFACT_TYPE" ]]; then
  echo '{"error": "Usage: lint-artifact.sh <path> <type>"}' >&2
  exit 1
fi

if [[ ! -f "$ARTIFACT_PATH" ]]; then
  # Escape backslashes and double quotes so a path containing them cannot
  # break the JSON error object.
  escaped_path=${ARTIFACT_PATH//\\/\\\\}
  escaped_path=${escaped_path//\"/\\\"}
  printf '{"error": "File not found: %s"}\n' "$escaped_path" >&2
  exit 1
fi

# Read file content
CONTENT=$(cat -- "$ARTIFACT_PATH")
|
|
26
|
+
|
|
27
|
+
#######################################
# Checks 1-3: the artifact starts with a YAML frontmatter fence and has
# 'name' and 'description' fields.
# Globals:   CONTENT (read), ERRORS (appended)
# Arguments: none
# Returns:   0 always; findings are appended to ERRORS
#######################################
check_frontmatter() {
  local content=${CONTENT-}

  # Check 1: YAML frontmatter exists
  # Compare the first line directly. Piping into `head -1 | grep -q` lets the
  # reader exit early, and under `pipefail` the SIGPIPE-killed writer would
  # then turn a match into a reported failure on large files.
  local first_line=${content%%$'\n'*}
  if [[ "$first_line" != "---" ]]; then
    ERRORS+=("frontmatter_missing: No YAML frontmatter (must start with ---)")
  fi

  # Check 2: Frontmatter has 'name' field
  # (matched against any line of the file, not only the frontmatter block)
  if ! grep -q '^name:' <<<"$content"; then
    ERRORS+=("name_missing: No 'name' field in frontmatter")
  fi

  # Check 3: Frontmatter has 'description' field
  if ! grep -q '^description:' <<<"$content"; then
    ERRORS+=("description_missing: No 'description' field in frontmatter")
  fi
}

check_frontmatter
|
|
41
|
+
|
|
42
|
+
# Type-specific checks
# Each arm appends findings to WARNINGS/ERRORS; an unrecognised type prints a
# JSON error on stderr and exits 1.
# Content is fed to grep through here-strings rather than `echo ... | grep -q`:
# grep -q exits on the first match, and under `pipefail` the SIGPIPE-killed
# writer would make a successful match look like a failure on large files.
case "$ARTIFACT_TYPE" in
  command)
    # Commands need phases
    if ! grep -qi 'phase' <<<"$CONTENT"; then
      WARNINGS+=("no_phases: Command has no phase structure")
    fi
    # Commands need agent roster
    if ! grep -qi 'agent.*roster\|## Agent' <<<"$CONTENT"; then
      WARNINGS+=("no_agent_roster: Command has no agent roster section")
    fi
    ;;
  agent)
    # Agents need identity section
    if ! grep -qi '## Identity\|## Core Mission' <<<"$CONTENT"; then
      WARNINGS+=("no_identity: Agent has no Identity or Core Mission section")
    fi
    # Agents need tools in frontmatter
    if ! grep -q '^tools:' <<<"$CONTENT"; then
      WARNINGS+=("no_tools: Agent has no 'tools' field in frontmatter")
    fi
    ;;
  skill)
    # Skills: SKILL.md must be the filename
    BASENAME=$(basename -- "$ARTIFACT_PATH")
    if [[ "$BASENAME" != "SKILL.md" ]]; then
      ERRORS+=("wrong_filename: Skill must be named SKILL.md, got $BASENAME")
    fi
    # Skills need When to Use
    if ! grep -qi '## When to Use' <<<"$CONTENT"; then
      WARNINGS+=("no_when_to_use: Skill has no 'When to Use' section")
    fi
    # Check word count (should be < 5000)
    # tr strips the padding some wc implementations put around the number.
    WORD_COUNT=$(wc -w <<<"$CONTENT" | tr -d ' ')
    if [[ "$WORD_COUNT" -gt 5000 ]]; then
      WARNINGS+=("too_long: Skill is $WORD_COUNT words (recommended < 5000)")
    fi
    ;;
  rule)
    # Rules need WRONG/RIGHT or Always/Never examples
    if ! grep -qi 'WRONG\|Never\|Always' <<<"$CONTENT"; then
      WARNINGS+=("no_examples: Rule has no WRONG/RIGHT or Always/Never examples")
    fi
    ;;
  eval)
    # Evals need pass/fail criteria
    if ! grep -qi 'pass\|fail\|expect' <<<"$CONTENT"; then
      WARNINGS+=("no_criteria: Eval has no pass/fail criteria")
    fi
    ;;
  *)
    echo "{\"error\": \"Unknown type: $ARTIFACT_TYPE. Must be: command|agent|skill|rule|eval\"}" >&2
    exit 1
    ;;
esac
|
|
97
|
+
|
|
98
|
+
# ─── Dependency resolution checks ───

#######################################
# Find the .aw/.aw_registry/ root by walking up from the artifact path.
# At most 10 directory levels are inspected.
# Globals:   ARTIFACT_PATH (read), REGISTRY_ROOT (written; "" if not found)
# Arguments: none
# Returns:   0 always
#######################################
find_registry_root() {
  local start_dir dir depth
  start_dir=$(dirname -- "${ARTIFACT_PATH-}")
  # Resolve to an absolute path first: `dirname .` is `.`, so a relative
  # path could otherwise never walk above the current directory.
  dir=$(CDPATH='' cd -- "$start_dir" 2>/dev/null && pwd) || dir=$start_dir

  REGISTRY_ROOT=""
  for ((depth = 0; depth < 10; depth++)); do
    if [[ -d "$dir/.aw_registry" ]]; then
      # We're inside .aw/ already
      REGISTRY_ROOT="$dir"
      return 0
    elif [[ -d "$dir/.aw/.aw_registry" ]]; then
      REGISTRY_ROOT="$dir/.aw"
      return 0
    fi
    dir=$(dirname -- "$dir")
  done
  return 0
}

#######################################
# Check skills: frontmatter — each skill must exist in the registry.
# Only the value on the `skills:` line itself is parsed, e.g.
#   skills: [skill-1, skill-2]   or   skills: skill-1, skill-2
# Globals:   CONTENT, REGISTRY_ROOT (read), ERRORS (appended)
# Arguments: none
# Returns:   0 always
#######################################
check_agent_skills() {
  local skills_line cleaned skill found
  local -a skill_names=()

  # `|| true`: grep exits 1 when there is no skills: line, which under
  # `set -e` would otherwise abort the whole script from this assignment.
  skills_line=$(grep -m1 -E '^skills:' <<<"${CONTENT-}" || true)
  if [[ -z "$skills_line" ]]; then
    return 0
  fi

  # Drop the key, brackets and quotes; turn commas into separators.
  cleaned=$(sed -e 's/^skills:[[:space:]]*//' -e "s/[][\"']//g" -e 's/,/ /g' <<<"$skills_line")
  # Split into an array so names are never subject to glob expansion.
  read -r -a skill_names <<<"$cleaned" || true

  for skill in ${skill_names[@]+"${skill_names[@]}"}; do
    if [[ -z "$skill" || "$skill" == "-" ]]; then
      continue
    fi
    # Search for the skill in the registry; -quit stops at the first hit.
    found=$(find "$REGISTRY_ROOT/.aw_registry" -path "*/skills/$skill/SKILL.md" -print -quit 2>/dev/null || true)
    if [[ -z "$found" ]]; then
      ERRORS+=("phantom_skill: Agent references skill '$skill' but no skills/$skill/SKILL.md found in registry")
    fi
  done
}

#######################################
# Check agent roster — only names from the "Agent Roster" section are
# considered, not the Skill Loading Gate or other sections.
# Globals:   CONTENT, REGISTRY_ROOT (read), ERRORS (appended)
# Arguments: none
# Returns:   0 always
#######################################
check_command_roster() {
  local roster agent_names agent candidate stripped found
  # Known built-in subagent types (Claude Code native agents, not registry artifacts)
  local builtin_agents="general-purpose|planner|code-reviewer|security-reviewer|build-error-resolver|typescript-reviewer|python-reviewer|go-reviewer|rust-reviewer|java-reviewer|kotlin-reviewer|cpp-reviewer|flutter-reviewer|e2e-runner|refactor-cleaner|doc-updater|tdd-guide|architect|harness-optimizer|docs-lookup|database-reviewer|loop-operator"

  # Extract lines from "## Agent Roster" up to (not including) the next "## "
  # heading, or to end of file when the roster is the last section.
  roster=$(awk '
    /^## Agent Roster/ { inside = 1; print; next }
    inside && /^## /   { exit }
    inside             { print }
  ' <<<"${CONTENT-}")
  if [[ -z "$roster" ]]; then
    return 0
  fi

  # Agent name = the backticked token in the FIRST cell of a table row.
  # Anchoring at the row start keeps backticked text in later cells from
  # being mistaken for the agent name.
  agent_names=$(sed -n -E 's/^\| *`([a-zA-Z][a-zA-Z0-9_-]*)`.*/\1/p' <<<"$roster")

  while IFS= read -r agent; do
    if [[ -z "$agent" ]]; then
      continue
    fi
    # Skip built-in subagent types
    if [[ "$agent" =~ ^($builtin_agents)$ ]]; then
      continue
    fi
    # Try exact name first, then progressively strip namespace prefixes
    # e.g. platform-review-security-reviewer → review-security-reviewer → security-reviewer
    found=""
    candidate=$agent
    while [[ -z "$found" && -n "$candidate" ]]; do
      found=$(find "$REGISTRY_ROOT/.aw_registry" -path "*/agents/$candidate.md" -print -quit 2>/dev/null || true)
      if [[ -z "$found" ]]; then
        # Strip first segment (up to and including first hyphen)
        stripped="${candidate#*-}"
        # If stripping didn't change anything, stop
        if [[ "$stripped" == "$candidate" ]]; then
          break
        fi
        candidate=$stripped
      fi
    done
    if [[ -z "$found" ]]; then
      ERRORS+=("phantom_agent: Command references agent '$agent' but no agents/$agent.md found in registry")
    fi
  done <<<"$agent_names"
}

REGISTRY_ROOT=""
find_registry_root

if [[ -n "$REGISTRY_ROOT" ]]; then
  case "${ARTIFACT_TYPE-}" in
    agent)
      check_agent_skills
      ;;
    command)
      check_command_roster
      ;;
  esac
fi
|
|
169
|
+
|
|
170
|
+
#######################################
# Generic content checks: minimum length and presence of a heading.
# Globals:   CONTENT (read), WARNINGS (appended), ERRORS (appended)
# Arguments: none
# Returns:   0 always
#######################################
check_minimum_content() {
  local content=${CONTENT-}
  local line_count

  # Check: file has minimum content (not just frontmatter)
  # tr strips the padding some wc implementations put around the number.
  line_count=$(wc -l <<<"$content" | tr -d ' ')
  if [[ "$line_count" -lt 20 ]]; then
    WARNINGS+=("too_short: Artifact is only $line_count lines (minimum recommended: 20)")
  fi

  # Check: has at least one markdown heading
  # Here-string instead of `echo | grep -q`: grep -q exits on the first
  # match, and under `pipefail` the SIGPIPE-killed writer would turn that
  # match into a reported failure on large files.
  if ! grep -q '^#' <<<"$content"; then
    ERRORS+=("no_heading: No markdown heading found")
  fi
}

check_minimum_content
|
|
180
|
+
|
|
181
|
+
# Build JSON output

#######################################
# Escape a string for use inside a JSON string literal.
# Handles backslash, double quote, newline, carriage return and tab.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout (no trailing newline)
#######################################
json_escape() {
  local text=${1-}
  text=${text//\\/\\\\}   # backslash first, so later escapes are not doubled
  text=${text//\"/\\\"}
  text=${text//$'\n'/\\n}
  text=${text//$'\r'/\\r}
  text=${text//$'\t'/\\t}
  printf '%s' "$text"
}

#######################################
# Print each argument as one JSON string per line, comma-separated.
# Arguments: the array elements
# Outputs:   one line per element on stdout; nothing for zero elements
#######################################
emit_json_items() {
  local remaining=$#
  local item
  for item in "$@"; do
    remaining=$((remaining - 1))
    if [[ "$remaining" -gt 0 ]]; then
      printf ' "%s",\n' "$(json_escape "$item")"
    else
      printf ' "%s"\n' "$(json_escape "$item")"
    fi
  done
}

#######################################
# Print the lint result as a JSON object.
# "pass" is true exactly when ERRORS is empty; warnings do not affect it.
# Globals:   ARTIFACT_PATH, ARTIFACT_TYPE, ERRORS, WARNINGS (read)
# Outputs:   the JSON report on stdout
#######################################
emit_lint_report() {
  local error_count=${#ERRORS[@]}
  local warning_count=${#WARNINGS[@]}
  local pass="false"
  if [[ "$error_count" -eq 0 ]]; then
    pass="true"
  fi

  echo "{"
  printf ' "path": "%s",\n' "$(json_escape "${ARTIFACT_PATH-}")"
  printf ' "type": "%s",\n' "$(json_escape "${ARTIFACT_TYPE-}")"
  echo " \"pass\": $pass,"
  echo " \"error_count\": $error_count,"
  echo " \"warning_count\": $warning_count,"

  # Errors array
  # The ${arr[@]+...} form expands to nothing for an empty array without
  # tripping `set -u` on older bash versions.
  echo " \"errors\": ["
  emit_json_items ${ERRORS[@]+"${ERRORS[@]}"}
  echo " ],"

  # Warnings array
  echo " \"warnings\": ["
  emit_json_items ${WARNINGS[@]+"${WARNINGS[@]}"}
  echo " ]"

  echo "}"
}

emit_lint_report
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# score-artifact.sh — Applies quality rubric to a CASRE artifact
# Usage: bash skills/aw-adk/scripts/score-artifact.sh <path> <type>
# Types: command | agent | skill | rule | eval
#
# This script performs structural scoring (presence/absence of sections).
# For deeper qualitative scoring, use the ADK skill's score mode which
# applies the full rubric with LLM judgment.
#
# Prints JSON with per-dimension scores and a tier assignment on stdout.
# Usage and file-not-found errors are printed as JSON on stderr with exit 1.

set -euo pipefail

ARTIFACT_PATH="${1:-}"
ARTIFACT_TYPE="${2:-}"

# Both positional arguments are required.
if [[ -z "$ARTIFACT_PATH" || -z "$ARTIFACT_TYPE" ]]; then
  echo '{"error": "Usage: score-artifact.sh <path> <type>"}' >&2
  exit 1
fi

if [[ ! -f "$ARTIFACT_PATH" ]]; then
  # Escape backslashes and double quotes so the error object stays valid JSON
  # even when the path contains them.
  ESCAPED_PATH=${ARTIFACT_PATH//\\/\\\\}
  ESCAPED_PATH=${ESCAPED_PATH//\"/\\\"}
  printf '{"error": "File not found: %s"}\n' "$ESCAPED_PATH" >&2
  exit 1
fi

# Whole artifact text; every scoring helper below reads this global.
CONTENT=$(<"$ARTIFACT_PATH")
|
|
28
|
+
|
|
29
|
+
# Helper: print 1 if any line of CONTENT matches the pattern
# (case-insensitive), otherwise print 0.
# Globals:   CONTENT (read)
# Arguments: $1 - grep basic-regex pattern
# Outputs:   "1" or "0" on stdout
has_pattern() {
  # CONTENT is supplied via a here-string instead of `echo ... | grep -q`.
  # With `set -o pipefail`, grep -q exits at its first match and a still-writing
  # echo is killed by SIGPIPE on large content, which made the pipeline fail and
  # a real match be reported as 0. `--` keeps a pattern that starts with '-'
  # from being parsed as grep options.
  if grep -qi -- "$1" <<<"$CONTENT" 2>/dev/null; then
    printf '1\n'
  else
    printf '0\n'
  fi
}
|
|
37
|
+
|
|
38
|
+
# Helper: print the number of lines in CONTENT that match the pattern
# (case-insensitive). This counts matching lines, not individual occurrences
# within a line.
# Globals:   CONTENT (read)
# Arguments: $1 - grep basic-regex pattern
# Outputs:   the count on stdout (0 when nothing matches or grep fails)
count_pattern() {
  local matches
  # grep -c exits non-zero when the count is 0; `|| true` keeps `set -e` from
  # aborting the script. `--` stops a pattern beginning with '-' from being
  # parsed as options, and the here-string avoids echo interpreting content
  # such as "-n" as one of its own flags.
  matches=$(grep -ci -- "$1" <<<"$CONTENT" 2>/dev/null) || true
  printf '%s\n' "${matches:-0}"
}
|
|
44
|
+
|
|
45
|
+
# Scoring function: prints a 0-10 score for one universal dimension, based on
# structural presence/quantity signals found in CONTENT.
# Globals:   CONTENT (read); calls has_pattern and count_pattern
# Arguments: $1 - dimension name: frontmatter | sections | code_examples |
#                 length | checklists (any other name scores 0)
# Outputs:   the integer score on stdout
score_dimension() {
  local name="$1"
  local score=0
  # All working values are local so that a direct (non-subshell) call cannot
  # overwrite script-level variables such as TOTAL.
  local heading_count fence_lines code_blocks word_count
  local checklist_items table_rows signal_total

  case "$name" in
    frontmatter)
      # Delimiter and name: are worth 3 each; description: and trigger: 2 each.
      if [[ $(has_pattern '^---$') -eq 1 ]]; then score=$((score + 3)); fi
      if [[ $(has_pattern '^name:') -eq 1 ]]; then score=$((score + 3)); fi
      if [[ $(has_pattern '^description:') -eq 1 ]]; then score=$((score + 2)); fi
      if [[ $(has_pattern '^trigger:') -eq 1 ]]; then score=$((score + 2)); fi
      ;;
    sections)
      # Lines starting with "##" (level-2 and deeper headings).
      heading_count=$(count_pattern '^##')
      if [[ $heading_count -ge 8 ]]; then
        score=10
      elif [[ $heading_count -ge 5 ]]; then
        score=7
      elif [[ $heading_count -ge 3 ]]; then
        score=5
      elif [[ $heading_count -ge 1 ]]; then
        score=3
      fi
      ;;
    code_examples)
      # Count lines containing a ``` fence; two fence lines make one block.
      fence_lines=$(grep -c -- '```' <<<"$CONTENT" 2>/dev/null) || true
      fence_lines=${fence_lines:-0}
      code_blocks=$((fence_lines / 2))
      if [[ $code_blocks -ge 3 ]]; then
        score=10
      elif [[ $code_blocks -ge 2 ]]; then
        score=7
      elif [[ $code_blocks -ge 1 ]]; then
        score=5
      fi
      ;;
    length)
      word_count=$(wc -w <<<"$CONTENT" | tr -d ' ')
      # Full marks for 200-5000 words; longer documents fall through to the
      # ">= 100" branch and score 7.
      if [[ $word_count -ge 200 && $word_count -le 5000 ]]; then
        score=10
      elif [[ $word_count -ge 100 ]]; then
        score=7
      elif [[ $word_count -ge 50 ]]; then
        score=5
      elif [[ $word_count -ge 20 ]]; then
        score=3
      fi
      ;;
    checklists)
      checklist_items=$(count_pattern '^- \[')
      # A literal '|' is matched with a bracket expression: in GNU grep basic
      # regex '\|' is the alternation operator, so '^\|' matched every line
      # and counted the whole file as table rows.
      table_rows=$(count_pattern '^[|]')
      signal_total=$((checklist_items + table_rows))
      if [[ $signal_total -ge 10 ]]; then
        score=10
      elif [[ $signal_total -ge 5 ]]; then
        score=7
      elif [[ $signal_total -ge 2 ]]; then
        score=5
      elif [[ $signal_total -ge 1 ]]; then
        score=3
      fi
      ;;
  esac

  printf '%s\n' "$score"
}
|
|
96
|
+
|
|
97
|
+
# Score universal dimensions (each 0-10)
D_FRONTMATTER=$(score_dimension "frontmatter")
D_SECTIONS=$(score_dimension "sections")
D_CODE=$(score_dimension "code_examples")
D_LENGTH=$(score_dimension "length")
D_CHECKLISTS=$(score_dimension "checklists")

# Helper: print the given points when CONTENT matches the pattern, else 0.
# Arguments: $1 - grep pattern handed to has_pattern
#            $2 - points awarded on a match
# Outputs:   $2 or 0 on stdout
_type_signal() {
  if [[ $(has_pattern "$1") -eq 1 ]]; then
    printf '%s\n' "$2"
  else
    printf '0\n'
  fi
}

# Type-specific dimensions (structural check only)
D_TYPE_1=0
D_TYPE_2=0
D_TYPE_3=0
D_TYPE_4=0
D_TYPE_5=0

# Each artifact type has five keyword signals; a match is worth 7 points,
# except the fifth signal of skill, rule and eval, which is worth 5.
case "$ARTIFACT_TYPE" in
  command)
    D_TYPE_1=$(_type_signal 'protocol\|AW-PROTOCOL' 7)
    D_TYPE_2=$(_type_signal 'agent.*roster\|## Agent' 7)
    D_TYPE_3=$(_type_signal 'skill.*load\|loading gate' 7)
    D_TYPE_4=$(_type_signal 'phase' 7)
    D_TYPE_5=$(_type_signal 'checkpoint\|human' 7)
    ;;
  agent)
    D_TYPE_1=$(_type_signal 'identity\|personality' 7)
    D_TYPE_2=$(_type_signal 'core mission\|## Mission' 7)
    D_TYPE_3=$(_type_signal 'critical rules\|## Rules' 7)
    D_TYPE_4=$(_type_signal 'deliverable' 7)
    D_TYPE_5=$(_type_signal 'communication\|voice' 7)
    ;;
  skill)
    D_TYPE_1=$(_type_signal 'when to use' 7)
    D_TYPE_2=$(_type_signal 'quick start\|## Instructions' 7)
    D_TYPE_3=$(_type_signal 'reference' 7)
    D_TYPE_4=$(_type_signal 'output format\|## Output' 7)
    D_TYPE_5=$(_type_signal 'progressive\|disclosure' 5)
    ;;
  rule)
    D_TYPE_1=$(_type_signal 'WRONG\|wrong' 7)
    D_TYPE_2=$(_type_signal 'RIGHT\|right\|correct' 7)
    D_TYPE_3=$(_type_signal 'MUST\|SHOULD\|severity' 7)
    D_TYPE_4=$(_type_signal 'automat\|enforce\|lint' 7)
    D_TYPE_5=$(_type_signal 'manifest\|coverage' 5)
    ;;
  eval)
    D_TYPE_1=$(_type_signal 'scenario' 7)
    D_TYPE_2=$(_type_signal 'grader\|assert' 7)
    D_TYPE_3=$(_type_signal 'pass.*criter\|expected' 7)
    D_TYPE_4=$(_type_signal 'fail\|edge.*case' 7)
    D_TYPE_5=$(_type_signal 'baseline\|reproduc' 5)
    ;;
  *)
    # An unrecognized type used to score 0 silently; keep the scores and the
    # exit status unchanged but make the likely typo visible on stderr.
    printf 'warning: unknown artifact type "%s"; type-specific dimensions scored 0\n' \
      "$ARTIFACT_TYPE" >&2
    ;;
esac
|
|
148
|
+
|
|
149
|
+
# Overall score: sum of the five universal and five type-specific dimensions.
TOTAL=$((D_FRONTMATTER + D_SECTIONS + D_CODE + D_LENGTH + D_CHECKLISTS + D_TYPE_1 + D_TYPE_2 + D_TYPE_3 + D_TYPE_4 + D_TYPE_5))

# Map a total score to its tier letter.
# Arguments: $1 - integer total
# Outputs:   S | A | B | C | D on stdout
# NOTE(review): universal dimensions cap at 10 each and type-specific ones at
# 7 each, so the highest reachable total is 85 and the S threshold (90) cannot
# be met by this structural scorer — confirm whether that is intended.
_tier_for_total() {
  local total=$1
  if (( total >= 90 )); then
    printf 'S\n'
  elif (( total >= 75 )); then
    printf 'A\n'
  elif (( total >= 60 )); then
    printf 'B\n'
  elif (( total >= 40 )); then
    printf 'C\n'
  else
    printf 'D\n'
  fi
}

# Escape backslashes and double quotes so a value can be embedded inside a
# JSON string literal. Control characters (e.g. newlines) are not handled.
# Arguments: $1 - raw string
# Outputs:   escaped string on stdout (no trailing newline)
_json_escape() {
  local raw=$1
  raw=${raw//\\/\\\\}
  raw=${raw//\"/\\\"}
  printf '%s' "$raw"
}

# Assign tier
TIER=$(_tier_for_total "$TOTAL")

# Emit the report. Path and type come from the command line, so they are
# escaped; every other interpolated value is an integer or a fixed tier letter.
cat <<EOF
{
"path": "$(_json_escape "$ARTIFACT_PATH")",
"type": "$(_json_escape "$ARTIFACT_TYPE")",
"total": $TOTAL,
"tier": "$TIER",
"dimensions": {
"frontmatter": $D_FRONTMATTER,
"sections": $D_SECTIONS,
"code_examples": $D_CODE,
"length": $D_LENGTH,
"checklists": $D_CHECKLISTS,
"type_specific_1": $D_TYPE_1,
"type_specific_2": $D_TYPE_2,
"type_specific_3": $D_TYPE_3,
"type_specific_4": $D_TYPE_4,
"type_specific_5": $D_TYPE_5
},
"note": "Structural scoring only. For qualitative rubric scoring, use the ADK skill score mode."
}
EOF
|