@event4u/agent-config 2.11.0 → 2.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/.agent-src/commands/council/analysis.md +142 -0
  2. package/.agent-src/commands/council/debate.md +129 -0
  3. package/.agent-src/commands/council/default.md +8 -0
  4. package/.agent-src/commands/council/design.md +16 -12
  5. package/.agent-src/commands/council/optimize.md +16 -15
  6. package/.agent-src/commands/council/pr.md +12 -12
  7. package/.agent-src/commands/council.md +48 -2
  8. package/.agent-src/personas/advisors/contrarian.md +95 -0
  9. package/.agent-src/personas/advisors/executor.md +99 -0
  10. package/.agent-src/personas/advisors/expansionist.md +98 -0
  11. package/.agent-src/personas/advisors/first-principles.md +98 -0
  12. package/.agent-src/personas/advisors/outsider.md +102 -0
  13. package/.agent-src/rules/copilot-routing.md +19 -0
  14. package/.agent-src/rules/devcontainer-routing.md +20 -0
  15. package/.agent-src/rules/laravel-routing.md +20 -0
  16. package/.agent-src/rules/symfony-routing.md +20 -0
  17. package/.agent-src/skills/ai-council/SKILL.md +180 -2
  18. package/.agent-src/skills/canvas-design/SKILL.md +132 -0
  19. package/.agent-src/skills/canvas-design/evals/triggers.json +16 -0
  20. package/.agent-src/skills/copilot-config/SKILL.md +1 -1
  21. package/.agent-src/skills/devcontainer/SKILL.md +1 -1
  22. package/.agent-src/skills/doc-coauthoring/SKILL.md +129 -0
  23. package/.agent-src/skills/doc-coauthoring/evals/triggers.json +16 -0
  24. package/.agent-src/skills/laravel/SKILL.md +1 -1
  25. package/.agent-src/skills/project-analysis-core/SKILL.md +1 -1
  26. package/.agent-src/skills/project-analyzer/SKILL.md +1 -1
  27. package/.agent-src/skills/skill-writing/SKILL.md +101 -16
  28. package/.agent-src/skills/sql-writing/SKILL.md +1 -1
  29. package/.agent-src/skills/symfony-workflow/SKILL.md +1 -1
  30. package/.agent-src/skills/universal-project-analysis/SKILL.md +1 -1
  31. package/.agent-src/templates/agents/agent-project-settings.example.yml +1 -1
  32. package/.claude-plugin/marketplace.json +5 -1
  33. package/AGENTS.md +1 -1
  34. package/CHANGELOG.md +78 -0
  35. package/CONTRIBUTING.md +5 -0
  36. package/README.md +3 -3
  37. package/config/agent-settings.template.yml +5 -84
  38. package/docs/architecture/multi-tool-projection.md +53 -0
  39. package/docs/architecture/{compression.md → source-projection.md} +21 -3
  40. package/docs/architecture.md +6 -6
  41. package/docs/catalog.md +21 -11
  42. package/docs/contracts/adr-architectural-consensus-mechanism.md +67 -0
  43. package/docs/contracts/adr-level-6-productization.md +2 -2
  44. package/docs/contracts/ai-council-config.md +186 -0
  45. package/docs/contracts/command-clusters.md +57 -1
  46. package/docs/contracts/multi-tool-projection-fidelity.md +109 -0
  47. package/docs/getting-started.md +2 -2
  48. package/package.json +1 -1
  49. package/scripts/_archive/README.md +59 -0
  50. package/scripts/ai_council/_default_prices.py +10 -1
  51. package/scripts/ai_council/advisors.py +148 -0
  52. package/scripts/ai_council/clients.py +189 -4
  53. package/scripts/ai_council/config.py +368 -0
  54. package/scripts/ai_council/consensus.py +290 -0
  55. package/scripts/ai_council/orchestrator.py +634 -16
  56. package/scripts/ai_council/prompts.py +335 -0
  57. package/scripts/check_compressed_paths.py +6 -1
  58. package/scripts/check_references.py +25 -0
  59. package/scripts/ci_time_ratio.py +168 -0
  60. package/scripts/council_cli.py +1007 -32
  61. package/scripts/measure_projection_bytes.py +159 -0
  62. package/scripts/measure_roadmap_trajectory.py +112 -0
  63. package/scripts/probe_projection_fidelity.py +202 -0
  64. package/scripts/run_skill_evals.py +185 -0
  65. package/scripts/schemas/skill.schema.json +4 -0
  66. package/scripts/score_skill_selection.py +198 -0
  67. package/scripts/skill_collision_clusters.py +162 -0
  68. package/scripts/skill_linter.py +71 -1
  69. /package/scripts/{_backfill_skill_domains.py → _archive/_backfill_skill_domains.py} +0 -0
  70. /package/scripts/{_bootstrap_tier_frontmatter.py → _archive/_bootstrap_tier_frontmatter.py} +0 -0
  71. /package/scripts/{_p43_bodies.py → _archive/_p43_bodies.py} +0 -0
  72. /package/scripts/{_p43_compress.py → _archive/_p43_compress.py} +0 -0
  73. /package/scripts/{_p4_migrate.py → _archive/_p4_migrate.py} +0 -0
  74. /package/scripts/{_phase2_shim_helper.py → _archive/_phase2_shim_helper.py} +0 -0
  75. /package/scripts/{_pilot_council_question.py → _archive/_pilot_council_question.py} +0 -0
@@ -0,0 +1,198 @@
1
+ #!/usr/bin/env python3
2
+ """Selection-accuracy scorer (council file 05, Phase 2.2).
3
+
4
+ Reads `tests/fixtures/skill_selection/fixtures.yml` and a predictions
5
+ JSON (`{fixture_id: selected_skill_name}`), then computes:
6
+
7
+ - (a) intended-skill hit rate — exact `intended` match
8
+ - (b) correct-cluster hit rate — any member of the same cluster
9
+
10
+ Per-cluster pass/fail uses the Round-3 protocol:
11
+ pass = (a) >= 0.90 OR (b) >= 0.95
12
+ fail = (a) < 0.80 AND (b) < 0.80 → cluster needs `routes_to`
13
+
14
+ Predictions source:
15
+ - `--predictions <path>`: external JSON file (LLM run, eval harness, manual).
16
+ - `--baseline`: built-in TF-IDF-style description-similarity baseline. The
17
+ baseline does NOT speak for any specific host tool; it estimates what
18
+ pure description-matching would do and provides a numeric floor.
19
+
20
+ Output: human-readable summary on stdout + machine JSON to
21
+ `agents/reports/skill-selection-accuracy.json` (or `--out`).
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import argparse
27
+ import json
28
+ import math
29
+ import re
30
+ import sys
31
+ from collections import Counter, defaultdict
32
+ from pathlib import Path
33
+
34
+ import yaml
35
+
36
+ REPO_ROOT = Path(__file__).resolve().parent.parent
37
+ FIXTURES = REPO_ROOT / "tests" / "fixtures" / "skill_selection" / "fixtures.yml"
38
+ CLUSTERS = REPO_ROOT / "agents" / "reports" / "skill-collision-clusters.json"
39
+ SKILLS_DIR = REPO_ROOT / ".agent-src.uncompressed" / "skills"
40
+ DEFAULT_OUT = REPO_ROOT / "agents" / "reports" / "skill-selection-accuracy.json"
41
+
42
+ PASS_A = 0.90
43
+ PASS_B = 0.95
44
+ FAIL_THRESHOLD = 0.80
45
+
46
+ STOPWORDS = {
47
+ "the", "and", "for", "with", "when", "use", "or", "of", "to", "a", "an",
48
+ "is", "in", "on", "by", "be", "at", "as", "it", "if", "are", "this",
49
+ "that", "from", "but", "not", "can", "any", "all", "no", "after",
50
+ "before", "during", "user", "agent", "code", "project", "via", "into",
51
+ "onto", "even", "without", "naming", "uses", "used", "using", "also",
52
+ "etc", "across", "between",
53
+ }
54
+
55
+
56
def tokenize(text: str) -> list[str]:
    """Return the lowercase word tokens of *text*, minus stopwords.

    A token starts with an ASCII letter and continues with at least two
    more letters, digits, underscores or hyphens. Order and repeats are
    kept, so the result can be fed straight into a term-frequency count.
    """
    kept: list[str] = []
    for candidate in re.findall(r"[A-Za-z][A-Za-z0-9_-]{2,}", text.lower()):
        if candidate in STOPWORDS or candidate.isdigit():
            continue
        kept.append(candidate)
    return kept
59
+
60
+
61
def load_skills() -> dict[str, str]:
    """Map each skill name to its frontmatter ``description``.

    Scans ``SKILLS_DIR/*/SKILL.md`` in sorted path order. A file is skipped
    when it does not start with ``---``, has no closing ``---`` delimiter,
    carries frontmatter that is not valid YAML or not a mapping, or has an
    empty / non-string description. The skill name falls back to the
    directory name when the frontmatter has no ``name``.

    Returns:
        ``{skill_name: stripped_description}``.
    """
    out: dict[str, str] = {}
    for skill_md in sorted(SKILLS_DIR.glob("*/SKILL.md")):
        # Decode explicitly: the platform default encoding is not always UTF-8.
        text = skill_md.read_text(encoding="utf-8")
        if not text.startswith("---"):
            continue
        parts = text.split("---", 2)
        if len(parts) < 3:
            continue
        try:
            fm = yaml.safe_load(parts[1]) or {}
        except yaml.YAMLError:
            continue
        # Valid YAML can still be a list or scalar; treat that like any other
        # malformed frontmatter instead of crashing on `.get`.
        if not isinstance(fm, dict):
            continue
        name = fm.get("name") or skill_md.parent.name
        desc = fm.get("description")
        if isinstance(desc, str) and desc.strip():
            out[name] = desc.strip()
    return out
79
+
80
+
81
def tfidf_vectors(docs: dict[str, str]) -> tuple[dict[str, dict[str, float]], dict[str, float]]:
    """Build sparse TF-IDF vectors for *docs*.

    Returns a pair ``(vectors, idf)``: ``vectors`` maps each document key to
    ``{term: tf * idf}`` and ``idf`` maps each term to its smoothed inverse
    document frequency ``log((N + 1) / (df + 1)) + 1``.
    """
    doc_tokens = {doc_id: tokenize(body) for doc_id, body in docs.items()}
    total = len(docs)

    # Document frequency: each term counts once per document it appears in.
    doc_freq: Counter[str] = Counter()
    for toks in doc_tokens.values():
        doc_freq.update(set(toks))

    idf: dict[str, float] = {}
    for term, seen_in in doc_freq.items():
        idf[term] = math.log((total + 1) / (seen_in + 1)) + 1

    vectors: dict[str, dict[str, float]] = {}
    for doc_id, toks in doc_tokens.items():
        weights: dict[str, float] = {}
        for term, freq in Counter(toks).items():
            weights[term] = freq * idf.get(term, 0.0)
        vectors[doc_id] = weights
    return vectors, idf
94
+
95
+
96
def cosine(a: dict[str, float], b: dict[str, float]) -> float:
    """Cosine similarity of two sparse vectors stored as ``{term: weight}``.

    Returns 0.0 when either vector is empty or has zero magnitude.
    """
    if not (a and b):
        return 0.0
    norm_a = math.sqrt(sum(w * w for w in a.values()))
    norm_b = math.sqrt(sum(w * w for w in b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    # Only terms present in both vectors contribute to the dot product.
    shared_terms = set(a) & set(b)
    dot_product = sum(a[term] * b[term] for term in shared_terms)
    return dot_product / (norm_a * norm_b)
106
+
107
+
108
def baseline_predict(fixtures: list[dict], skills: dict[str, str]) -> dict[str, str]:
    """Predict one skill per fixture by description similarity.

    Each fixture's ``prompt`` is weighted with the IDF table derived from the
    skill descriptions and compared against every skill vector; the first
    skill with the highest cosine score wins. The result maps fixture ``id``
    to the chosen skill name ("" when there are no skills).
    """
    skill_vectors, idf = tfidf_vectors(skills)
    predictions: dict[str, str] = {}
    for fixture in fixtures:
        counts = Counter(tokenize(fixture["prompt"]))
        prompt_vec = {term: n * idf.get(term, 0.0) for term, n in counts.items()}
        winner = ""
        top = -1.0
        for candidate, vec in skill_vectors.items():
            similarity = cosine(prompt_vec, vec)
            # Strict comparison keeps the earliest skill on ties.
            if similarity > top:
                winner = candidate
                top = similarity
        predictions[fixture["id"]] = winner
    return predictions
122
+
123
+
124
def score(fixtures: list[dict], clusters: list[dict], preds: dict[str, str]) -> dict:
    """Score predictions per cluster and overall.

    Args:
        fixtures: entries with ``id`` and ``intended`` keys and an optional
            ``cluster`` key used as the grouping id.
        clusters: entries with a ``members`` list of skill names.
        preds: fixture id -> predicted skill name; a missing id counts as "".

    Returns:
        ``{"clusters": [...], "overall": {...}}``. Each cluster entry holds
        ``cluster``, ``n``, ``hit_a`` (exact-match rate), ``hit_b``
        (same-cluster rate), ``verdict`` and ``misses``. Rates are rounded to
        three decimals.
    """
    # Look up cluster membership by intended-skill (robust to cluster_id renumbering).
    by_member: dict[str, set[str]] = {}
    for c in clusters:
        members = set(c["members"])
        for m in members:
            by_member[m] = members

    per_cluster = defaultdict(lambda: {"total": 0, "hits_a": 0, "hits_b": 0, "misses": [], "label": ""})
    for fx in fixtures:
        intended = fx["intended"]
        # A skill outside every cluster forms a singleton cluster of itself.
        members = by_member.get(intended, {intended})
        # Fallback id: first two sorted members joined — survives cluster_id renumbering.
        cid = fx.get("cluster") or "+".join(sorted(members)[:2])
        pred = preds.get(fx["id"], "")
        rec = per_cluster[cid]
        rec["total"] += 1
        # NOTE(review): the label is recorded here but not emitted in the report.
        rec["label"] = ",".join(sorted(members))
        if pred == intended:
            rec["hits_a"] += 1
        if pred in members:
            rec["hits_b"] += 1
        else:
            rec["misses"].append({"id": fx["id"], "intended": intended, "predicted": pred})

    results = []
    total = 0
    total_hits_a = 0
    total_hits_b = 0
    for cid, rec in sorted(per_cluster.items()):
        a = rec["hits_a"] / rec["total"]
        b = rec["hits_b"] / rec["total"]
        if a >= PASS_A or b >= PASS_B:
            verdict = "pass"
        elif a < FAIL_THRESHOLD and b < FAIL_THRESHOLD:
            verdict = "fail-needs-routes_to"
        else:
            verdict = "mixed"
        results.append({"cluster": cid, "n": rec["total"], "hit_a": round(a, 3),
                        "hit_b": round(b, 3), "verdict": verdict, "misses": rec["misses"]})
        # Accumulate raw counts: rebuilding the overall rate from the rounded
        # per-cluster rates lets rounding error leak into the totals.
        total += rec["total"]
        total_hits_a += rec["hits_a"]
        total_hits_b += rec["hits_b"]

    overall_a = total_hits_a / total if total else 0.0
    overall_b = total_hits_b / total if total else 0.0
    return {"clusters": results,
            "overall": {"n": total, "hit_a": round(overall_a, 3), "hit_b": round(overall_b, 3)}}
164
+
165
+
166
def main() -> int:
    """CLI entry point: score predictions and write the JSON report.

    Predictions come from ``--predictions <file>`` or, with ``--baseline``,
    from the built-in description-similarity baseline (which takes
    precedence when both are given).

    Returns:
        0 on success, 2 when neither ``--predictions`` nor ``--baseline``
        was supplied.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--predictions", type=Path, help="JSON file: {fixture_id: skill_name}")
    p.add_argument("--baseline", action="store_true", help="Use built-in TF-IDF baseline")
    p.add_argument("--source", default="external", help="Label recorded in output")
    p.add_argument("--out", type=Path, default=DEFAULT_OUT)
    args = p.parse_args()

    if not args.predictions and not args.baseline:
        print("❌ Specify --predictions <file> or --baseline", file=sys.stderr)
        return 2
    fixtures = yaml.safe_load(FIXTURES.read_text(encoding="utf-8"))["fixtures"]
    clusters = json.loads(CLUSTERS.read_text(encoding="utf-8"))["clusters"]
    if args.baseline:
        # Skill descriptions are only needed by the baseline predictor.
        preds = baseline_predict(fixtures, load_skills())
        source = "tfidf-baseline"
    else:
        preds = json.loads(args.predictions.read_text(encoding="utf-8"))
        source = args.source
    report = score(fixtures, clusters, preds)
    report["source"] = source
    args.out.parent.mkdir(parents=True, exist_ok=True)
    args.out.write_text(json.dumps(report, indent=2) + "\n")
    # `relative_to` raises ValueError for a relative path or one outside the
    # repo; fall back to the path as given rather than crashing after the
    # report has already been written.
    try:
        shown = args.out.resolve().relative_to(REPO_ROOT)
    except ValueError:
        shown = args.out
    print(f"✅ Wrote {shown} (source={source})")
    print(f" overall: hit_a={report['overall']['hit_a']:.3f} hit_b={report['overall']['hit_b']:.3f} n={report['overall']['n']}")
    for c in report["clusters"]:
        print(f" {c['cluster']:6} n={c['n']:2} hit_a={c['hit_a']:.2f} hit_b={c['hit_b']:.2f} {c['verdict']}")
    return 0
195
+
196
+
197
+ if __name__ == "__main__":
198
+ sys.exit(main())
@@ -0,0 +1,162 @@
1
+ #!/usr/bin/env python3
2
+ """Skill-collision cluster analysis (Phase 2.2 of step-1-v2-feedback-followup).
3
+
4
+ Walks `.agent-src.uncompressed/skills/<id>/SKILL.md`, extracts the
5
+ `description` frontmatter, computes pairwise keyword overlap, and groups
6
+ high-overlap skill pairs into clusters. The output drives the
7
+ selection-accuracy fixture set defined by council file 05 (Round-3
8
+ protocol — ≥ 3 shared significant terms → collision cluster).
9
+
10
+ Output: `agents/reports/skill-collision-clusters.json`
11
+
12
+ Schema:
13
+ {
14
+ "skill_count": int,
15
+ "cluster_count": int,
16
+ "clusters": [
17
+ {
18
+ "cluster_id": "C01",
19
+ "members": ["skill-a", "skill-b", ...],
20
+ "shared_keywords": [...],
21
+ "max_overlap": float,
22
+ "descriptions": {"skill-a": "...", ...}
23
+ },
24
+ ...
25
+ ]
26
+ }
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ import json
32
+ import re
33
+ import sys
34
+ from itertools import combinations
35
+ from pathlib import Path
36
+
37
+ import yaml
38
+
39
+ REPO_ROOT = Path(__file__).resolve().parent.parent
40
+ SKILLS_DIR = REPO_ROOT / ".agent-src.uncompressed" / "skills"
41
+ OUT_JSON = REPO_ROOT / "agents" / "reports" / "skill-collision-clusters.json"
42
+
43
+ KEYWORD_OVERLAP_THRESHOLD = 0.40
44
+ MIN_SHARED_KEYWORDS = 3
45
+ TOP_N_CLUSTERS = 10
46
+
47
+ STOPWORDS = {
48
+ "the", "and", "for", "with", "when", "use", "or", "of", "to", "a",
49
+ "an", "is", "in", "on", "by", "be", "at", "as", "it", "if", "are",
50
+ "this", "that", "from", "but", "not", "can", "any", "all", "no",
51
+ "after", "before", "during", "user", "agent", "code", "project",
52
+ "via", "into", "onto", "even", "without", "naming", "uses", "used",
53
+ "using", "also", "etc", "across", "between", "review", "design",
54
+ "writing", "create", "creating", "edit", "editing", "make", "making",
55
+ "set", "setting", "based", "well", "right", "left", "new",
56
+ }
57
+
58
+
59
def keyword_set(text: str) -> set[str]:
    """Return the distinct significant keywords of *text*.

    Keywords are lowercase tokens that start with an ASCII letter, are at
    least three characters long, and are not listed in ``STOPWORDS``.
    """
    found = re.findall(r"[A-Za-z][A-Za-z0-9_-]{2,}", text.lower())
    significant = set(found).difference(STOPWORDS)
    return {word for word in significant if not word.isdigit()}
62
+
63
+
64
def overlap_fraction(a: set[str], b: set[str]) -> float:
    """Overlap coefficient: shared items divided by the smaller set's size.

    Returns 0.0 when either set is empty.
    """
    smaller = min(len(a), len(b))
    if smaller == 0:
        return 0.0
    return len(a & b) / smaller
68
+
69
+
70
def load_skills() -> list[dict]:
    """Load every skill that declares a frontmatter ``description``.

    Scans ``SKILLS_DIR/*/SKILL.md`` in sorted path order. A file is skipped
    when it does not start with ``---``, has no closing ``---`` delimiter,
    carries frontmatter that is not valid YAML or not a mapping, or has an
    empty / non-string description.

    Returns:
        A list of ``{"name", "description", "_keywords"}`` dicts, where
        ``_keywords`` is the keyword set of the description and ``name``
        falls back to the directory name.
    """
    skills = []
    for skill_md in sorted(SKILLS_DIR.glob("*/SKILL.md")):
        # Decode explicitly: the platform default encoding is not always UTF-8.
        text = skill_md.read_text(encoding="utf-8")
        if not text.startswith("---"):
            continue
        parts = text.split("---", 2)
        if len(parts) < 3:
            continue
        try:
            fm = yaml.safe_load(parts[1]) or {}
        except yaml.YAMLError:
            continue
        # Valid YAML can still be a list or scalar; treat that like any other
        # malformed frontmatter instead of crashing on `.get`.
        if not isinstance(fm, dict):
            continue
        name = fm.get("name") or skill_md.parent.name
        description = fm.get("description")
        if not isinstance(description, str) or not description.strip():
            continue
        description = description.strip()
        skills.append(
            {
                "name": name,
                "description": description,
                "_keywords": keyword_set(description),
            }
        )
    return skills
95
+
96
+
97
def build_clusters(skills: list[dict]) -> list[dict]:
    """Group skills with heavily overlapping descriptions into clusters.

    Two skills are linked when they share at least ``MIN_SHARED_KEYWORDS``
    keywords and their overlap fraction reaches
    ``KEYWORD_OVERLAP_THRESHOLD``. Clusters are the connected components of
    that link graph.

    Args:
        skills: dicts with ``name``, ``description`` and ``_keywords`` keys.

    Returns:
        At most ``TOP_N_CLUSTERS`` cluster dicts, largest first. Equal-sized
        clusters are ordered by their sorted member names so that
        ``cluster_id`` numbering and the truncation are reproducible.
    """
    # Pairwise edges where overlap & shared-keyword threshold is met.
    edges: list[tuple[str, str, set[str], float]] = []
    by_name = {s["name"]: s for s in skills}
    for a, b in combinations(skills, 2):
        shared = a["_keywords"] & b["_keywords"]
        ov = overlap_fraction(a["_keywords"], b["_keywords"])
        if len(shared) >= MIN_SHARED_KEYWORDS and ov >= KEYWORD_OVERLAP_THRESHOLD:
            edges.append((a["name"], b["name"], shared, ov))

    # Union-find over edge set → connected-component clusters.
    parent: dict[str, str] = {}

    def find(x: str) -> str:
        parent.setdefault(x, x)
        while parent[x] != x:
            # Point x at its grandparent to shorten later lookups.
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    def union(x: str, y: str) -> None:
        rx, ry = find(x), find(y)
        if rx != ry:
            parent[rx] = ry

    for a, b, _, _ in edges:
        union(a, b)

    components: dict[str, set[str]] = {}
    for a, b, _, _ in edges:
        for name in (a, b):
            components.setdefault(find(name), set()).add(name)

    # Size first, then member names: without the tiebreak the order of
    # equal-sized clusters would depend on string hashing and vary per run.
    ordered = sorted(
        (sorted(names) for names in components.values()),
        key=lambda names: (-len(names), names),
    )

    clusters: list[dict] = []
    for idx, members in enumerate(ordered, start=1):
        member_set = set(members)
        member_kws = [by_name[m]["_keywords"] for m in members]
        shared_all = set.intersection(*member_kws) if member_kws else set()
        max_ov = max(
            (ov for a, b, _, ov in edges if a in member_set and b in member_set),
            default=0.0,
        )
        clusters.append({
            "cluster_id": f"C{idx:02d}",
            "members": members,
            "shared_keywords": sorted(shared_all),
            "max_overlap": round(max_ov, 3),
            "descriptions": {m: by_name[m]["description"] for m in members},
        })
    return clusters[:TOP_N_CLUSTERS]
143
+
144
+
145
def main() -> int:
    """Compute the collision clusters and write them to ``OUT_JSON``.

    Returns 2 when the skills directory does not exist, 0 otherwise.
    """
    if not SKILLS_DIR.exists():
        print(f"❌ Skills dir not found: {SKILLS_DIR}", file=sys.stderr)
        return 2

    skills = load_skills()
    clusters = build_clusters(skills)
    payload = {
        "skill_count": len(skills),
        "cluster_count": len(clusters),
        "clusters": clusters,
    }
    serialized = json.dumps(payload, indent=2) + "\n"

    OUT_JSON.parent.mkdir(parents=True, exist_ok=True)
    OUT_JSON.write_text(serialized)
    print(f"✅ Wrote {OUT_JSON.relative_to(REPO_ROOT)} — {len(clusters)} clusters from {len(skills)} skills")
    return 0
159
+
160
+
161
+ if __name__ == "__main__":
162
+ sys.exit(main())
@@ -775,8 +775,14 @@ def lint_skill(path: Path, text: str) -> LintResult:
775
775
  # is *both* large AND prose-dominant OR ships ≥ 2 independently invocable
776
776
  # procedures. Reference catalogues (quality-tools 411 L / density 0.83)
777
777
  # pass; multi-procedure skills are flagged for split.
778
+ #
779
+ # Frontmatter opt-out: `meta_skill: true` exempts a skill from the size
780
+ # warn when the skill's purpose *is* breadth (skill-writing, agent-docs-
781
+ # writing, skill-reviewer, etc.). Meta-skills inherently bundle multiple
782
+ # procedures and inline examples.
778
783
  total_lines = len(text.splitlines())
779
- if total_lines > 400:
784
+ is_meta_skill = bool(fm) and re.search(r"^meta_skill:\s*true\s*$", fm, re.MULTILINE)
785
+ if total_lines > 400 and not is_meta_skill:
780
786
  density = _density_score(text)
781
787
  procedures = _count_procedure_sections(text)
782
788
  if density < 0.6 or procedures >= 2:
@@ -832,6 +838,12 @@ def lint_skill(path: Path, text: str) -> LintResult:
832
838
  f"{meaningful_steps} steps) — may lack its own executable workflow"))
833
839
  suggestions.append("Expand the skill so it remains executable without opening a guideline")
834
840
 
841
+ # --- evals.json schema validator ---
842
+ # When a skill ships sibling `evals/evals.json` (quantitative behavior
843
+ # eval per skill-writing § 7), validate its shape. Triggers.json is a
844
+ # separate concern handled elsewhere. All issues here are WARN.
845
+ issues.extend(validate_evals_json(path))
846
+
835
847
  return LintResult(
836
848
  file=str(path),
837
849
  artifact_type="skill",
@@ -841,6 +853,64 @@ def lint_skill(path: Path, text: str) -> LintResult:
841
853
  )
842
854
 
843
855
 
856
def validate_evals_json(skill_path: Path) -> list[Issue]:
    """Validate `{skill_dir}/evals/evals.json` against the schema declared
    in `skill-writing` § 7. Returns WARN-level issues only; never blocks.
    Skipped entirely when the file is absent.

    Args:
        skill_path: path of the skill file; the evals file is looked up in
            the sibling ``evals`` directory.

    Returns:
        A list of warning issues, empty when the file is absent or valid.
    """
    evals_path = skill_path.parent / "evals" / "evals.json"
    if not evals_path.is_file():
        return []
    issues: list[Issue] = []
    try:
        data = json.loads(evals_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and also UnicodeDecodeError,
        # which a file that is not valid UTF-8 would otherwise raise uncaught.
        return [Issue("warning", "evals_json_unreadable",
                      f"evals/evals.json could not be parsed: {exc}")]
    if not isinstance(data, dict):
        return [Issue("warning", "evals_json_shape",
                      "evals/evals.json root must be an object")]
    if "skill" not in data or not isinstance(data["skill"], str):
        issues.append(Issue("warning", "evals_json_missing_skill",
                            "evals/evals.json must declare top-level 'skill' (string)"))
    scenarios = data.get("scenarios")
    if not isinstance(scenarios, list) or len(scenarios) < 1:
        issues.append(Issue("warning", "evals_json_no_scenarios",
                            "evals/evals.json must declare 'scenarios' (non-empty array)"))
        return issues
    # Each assertion kind requires exactly one string field.
    required_by_kind = {"contains": "value", "file_exists": "path", "rubric": "criterion"}
    valid_kinds = set(required_by_kind)
    for idx, scenario in enumerate(scenarios):
        loc = f"scenarios[{idx}]"
        if not isinstance(scenario, dict):
            issues.append(Issue("warning", "evals_json_scenario_shape",
                                f"{loc} must be an object"))
            continue
        for key in ("id", "prompt"):
            if key not in scenario or not isinstance(scenario[key], str) or not scenario[key].strip():
                issues.append(Issue("warning", "evals_json_scenario_missing_field",
                                    f"{loc} missing required string field '{key}'"))
        assertions = scenario.get("assertions")
        if not isinstance(assertions, list) or len(assertions) < 1:
            issues.append(Issue("warning", "evals_json_scenario_no_assertions",
                                f"{loc}.assertions must be a non-empty array"))
            continue
        for a_idx, assertion in enumerate(assertions):
            a_loc = f"{loc}.assertions[{a_idx}]"
            if not isinstance(assertion, dict):
                issues.append(Issue("warning", "evals_json_assertion_shape",
                                    f"{a_loc} must be an object"))
                continue
            kind = assertion.get("kind")
            # Check the type first: a JSON array/object here is unhashable and
            # would raise TypeError on the set membership test.
            if not isinstance(kind, str) or kind not in valid_kinds:
                issues.append(Issue("warning", "evals_json_assertion_kind",
                                    f"{a_loc}.kind must be one of {sorted(valid_kinds)}, got {kind!r}"))
                continue
            required_field = required_by_kind[kind]
            if required_field not in assertion or not isinstance(assertion[required_field], str):
                issues.append(Issue("warning", "evals_json_assertion_missing_field",
                                    f"{a_loc} (kind={kind}) missing required string field '{required_field}'"))
    return issues
912
+
913
+
844
914
  def extract_frontmatter(text: str) -> Optional[str]:
845
915
  match = FRONTMATTER_PATTERN.search(text)
846
916
  return match.group(1) if match else None