conectese 0.1.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (260) hide show
  1. package/README.md +265 -0
  2. package/_conectese/.conectese-version +1 -0
  3. package/_conectese/config/playwright.config.json +11 -0
  4. package/_conectese/core/architect.agent.yaml +110 -0
  5. package/_conectese/core/best-practices/_catalog.yaml +116 -0
  6. package/_conectese/core/best-practices/blog-post.md +132 -0
  7. package/_conectese/core/best-practices/blog-seo.md +127 -0
  8. package/_conectese/core/best-practices/copywriting.md +426 -0
  9. package/_conectese/core/best-practices/data-analysis.md +401 -0
  10. package/_conectese/core/best-practices/email-newsletter.md +118 -0
  11. package/_conectese/core/best-practices/email-sales.md +110 -0
  12. package/_conectese/core/best-practices/image-design.md +348 -0
  13. package/_conectese/core/best-practices/instagram-feed.md +235 -0
  14. package/_conectese/core/best-practices/instagram-reels.md +112 -0
  15. package/_conectese/core/best-practices/instagram-stories.md +107 -0
  16. package/_conectese/core/best-practices/linkedin-article.md +116 -0
  17. package/_conectese/core/best-practices/linkedin-post.md +121 -0
  18. package/_conectese/core/best-practices/researching.md +349 -0
  19. package/_conectese/core/best-practices/review.md +269 -0
  20. package/_conectese/core/best-practices/social-networks-publishing.md +294 -0
  21. package/_conectese/core/best-practices/strategist.md +344 -0
  22. package/_conectese/core/best-practices/technical-writing.md +365 -0
  23. package/_conectese/core/best-practices/twitter-post.md +105 -0
  24. package/_conectese/core/best-practices/twitter-thread.md +122 -0
  25. package/_conectese/core/best-practices/whatsapp-broadcast.md +107 -0
  26. package/_conectese/core/best-practices/youtube-script.md +122 -0
  27. package/_conectese/core/best-practices/youtube-shorts.md +112 -0
  28. package/_conectese/core/prompts/build.prompt.md +547 -0
  29. package/_conectese/core/prompts/design.prompt.md +469 -0
  30. package/_conectese/core/prompts/discovery.prompt.md +269 -0
  31. package/_conectese/core/prompts/sherlock-instagram.md +123 -0
  32. package/_conectese/core/prompts/sherlock-linkedin.md +73 -0
  33. package/_conectese/core/prompts/sherlock-shared.md +684 -0
  34. package/_conectese/core/prompts/sherlock-twitter.md +78 -0
  35. package/_conectese/core/prompts/sherlock-youtube.md +85 -0
  36. package/_conectese/core/runner.pipeline.md +535 -0
  37. package/_conectese/core/skills.engine.md +381 -0
  38. package/agents/data-extractor/AGENT.md +13 -0
  39. package/agents/direito-adaneiro/AGENT.md +18 -0
  40. package/agents/direito-administrativo/AGENT.md +18 -0
  41. package/agents/direito-aeroporta-rio/AGENT.md +18 -0
  42. package/agents/direito-agra-rio/AGENT.md +18 -0
  43. package/agents/direito-ambiental/AGENT.md +18 -0
  44. package/agents/direito-banca-rio/AGENT.md +18 -0
  45. package/agents/direito-civil/AGENT.md +18 -0
  46. package/agents/direito-constitcional/AGENT.md +18 -0
  47. package/agents/direito-da-crianc-a-e-do-adolescente-eca/AGENT.md +18 -0
  48. package/agents/direito-da-propriedade-intelectal/AGENT.md +18 -0
  49. package/agents/direito-de-ami-lia/AGENT.md +18 -0
  50. package/agents/direito-de-tra-nsito/AGENT.md +18 -0
  51. package/agents/direito-desportivo/AGENT.md +18 -0
  52. package/agents/direito-digital/AGENT.md +18 -0
  53. package/agents/direito-do-consmidor/AGENT.md +18 -0
  54. package/agents/direito-do-trabalho/AGENT.md +18 -0
  55. package/agents/direito-econo-mico/AGENT.md +18 -0
  56. package/agents/direito-eleitoral/AGENT.md +18 -0
  57. package/agents/direito-empresarial/AGENT.md +18 -0
  58. package/agents/direito-imobilia-rio/AGENT.md +18 -0
  59. package/agents/direito-inanceiro/AGENT.md +18 -0
  60. package/agents/direito-internacional/AGENT.md +18 -0
  61. package/agents/direito-mari-timo/AGENT.md +18 -0
  62. package/agents/direito-me-dico-e-da-sa-de/AGENT.md +18 -0
  63. package/agents/direito-militar/AGENT.md +18 -0
  64. package/agents/direito-ndia-rio/AGENT.md +18 -0
  65. package/agents/direito-notarial-e-registral/AGENT.md +18 -0
  66. package/agents/direito-penal/AGENT.md +18 -0
  67. package/agents/direito-previdencia-rio/AGENT.md +18 -0
  68. package/agents/direito-processal-civil/AGENT.md +18 -0
  69. package/agents/direito-processal-do-trabalho/AGENT.md +18 -0
  70. package/agents/direito-processal-militar/AGENT.md +18 -0
  71. package/agents/direito-processal-penal/AGENT.md +18 -0
  72. package/agents/direito-rbani-stico/AGENT.md +18 -0
  73. package/agents/direito-secrita-rio/AGENT.md +18 -0
  74. package/agents/direito-sindical/AGENT.md +18 -0
  75. package/agents/direito-societa-rio/AGENT.md +18 -0
  76. package/agents/direito-tribta-rio/AGENT.md +18 -0
  77. package/agents/direitos-hmanos/AGENT.md +18 -0
  78. package/agents/legal-analyst/AGENT.md +16 -0
  79. package/agents/legal-synthesizer/AGENT.md +13 -0
  80. package/agents/lgpd-anonymizer/AGENT.md +14 -0
  81. package/agents/lgpd-restorer/AGENT.md +14 -0
  82. package/agents/task-router/AGENT.md +13 -0
  83. package/bin/conectese.js +73 -0
  84. package/dashboard/index.html +12 -0
  85. package/dashboard/package-lock.json +1971 -0
  86. package/dashboard/package.json +28 -0
  87. package/dashboard/public/assets/avatars/Female1_1wave.png +0 -0
  88. package/dashboard/public/assets/avatars/Female1_2wave.png +0 -0
  89. package/dashboard/public/assets/avatars/Female1_blink.png +0 -0
  90. package/dashboard/public/assets/avatars/Female1_talk.png +0 -0
  91. package/dashboard/public/assets/avatars/Female2_1wave.png +0 -0
  92. package/dashboard/public/assets/avatars/Female2_2wave.png +0 -0
  93. package/dashboard/public/assets/avatars/Female2_blink.png +0 -0
  94. package/dashboard/public/assets/avatars/Female2_talk.png +0 -0
  95. package/dashboard/public/assets/avatars/Female3_blink.png +0 -0
  96. package/dashboard/public/assets/avatars/Female3_talk.png +0 -0
  97. package/dashboard/public/assets/avatars/Female3_wave.png +0 -0
  98. package/dashboard/public/assets/avatars/Female4_blink.png +0 -0
  99. package/dashboard/public/assets/avatars/Female4_talk.png +0 -0
  100. package/dashboard/public/assets/avatars/Female4_wave.png +0 -0
  101. package/dashboard/public/assets/avatars/Female5_blink.png +0 -0
  102. package/dashboard/public/assets/avatars/Female5_talk.png +0 -0
  103. package/dashboard/public/assets/avatars/Female5_wave.png +0 -0
  104. package/dashboard/public/assets/avatars/Female6_blink.png +0 -0
  105. package/dashboard/public/assets/avatars/Female6_talk.png +0 -0
  106. package/dashboard/public/assets/avatars/Female6_wave.png +0 -0
  107. package/dashboard/public/assets/avatars/Male1_1wave.png +0 -0
  108. package/dashboard/public/assets/avatars/Male1_2wave.png +0 -0
  109. package/dashboard/public/assets/avatars/Male1_blink.png +0 -0
  110. package/dashboard/public/assets/avatars/Male1_talk.png +0 -0
  111. package/dashboard/public/assets/avatars/Male2_1wave.png +0 -0
  112. package/dashboard/public/assets/avatars/Male2_2wave.png +0 -0
  113. package/dashboard/public/assets/avatars/Male2_blink.png +0 -0
  114. package/dashboard/public/assets/avatars/Male2_talk.png +0 -0
  115. package/dashboard/public/assets/avatars/Male3_blink.png +0 -0
  116. package/dashboard/public/assets/avatars/Male3_talk.png +0 -0
  117. package/dashboard/public/assets/avatars/Male3_wave.png +0 -0
  118. package/dashboard/public/assets/avatars/Male4_blink.png +0 -0
  119. package/dashboard/public/assets/avatars/Male4_talk.png +0 -0
  120. package/dashboard/public/assets/avatars/Male4_wave.png +0 -0
  121. package/dashboard/public/assets/desks/desktop_set_black_down.png +0 -0
  122. package/dashboard/public/assets/desks/desktop_set_black_down_coding-1.png +0 -0
  123. package/dashboard/public/assets/desks/desktop_set_black_down_coding.png +0 -0
  124. package/dashboard/public/assets/desks/desktop_set_black_up.png +0 -0
  125. package/dashboard/public/assets/desks/desktop_set_white_down.png +0 -0
  126. package/dashboard/public/assets/desks/desktop_set_white_down_coding-1.png +0 -0
  127. package/dashboard/public/assets/desks/desktop_set_white_down_coding.png +0 -0
  128. package/dashboard/public/assets/desks/desktop_set_white_up.png +0 -0
  129. package/dashboard/public/assets/furniture/armchair_tan.png +0 -0
  130. package/dashboard/public/assets/furniture/armchair_tan_down.png +0 -0
  131. package/dashboard/public/assets/furniture/backpack_blue.png +0 -0
  132. package/dashboard/public/assets/furniture/backpack_red.png +0 -0
  133. package/dashboard/public/assets/furniture/blinds.png +0 -0
  134. package/dashboard/public/assets/furniture/blinds_large_closed_white.png +0 -0
  135. package/dashboard/public/assets/furniture/bookshelf.png +0 -0
  136. package/dashboard/public/assets/furniture/bookshelf_purple_tall.png +0 -0
  137. package/dashboard/public/assets/furniture/bulletin_board.png +0 -0
  138. package/dashboard/public/assets/furniture/clock.png +0 -0
  139. package/dashboard/public/assets/furniture/coffee_mug.png +0 -0
  140. package/dashboard/public/assets/furniture/coffee_mug_blue.png +0 -0
  141. package/dashboard/public/assets/furniture/coffee_table.png +0 -0
  142. package/dashboard/public/assets/furniture/coffeepot_right.png +0 -0
  143. package/dashboard/public/assets/furniture/coffeetable_black_horizontal.png +0 -0
  144. package/dashboard/public/assets/furniture/couch.png +0 -0
  145. package/dashboard/public/assets/furniture/couch_tan_down.png +0 -0
  146. package/dashboard/public/assets/furniture/cushion_blue.png +0 -0
  147. package/dashboard/public/assets/furniture/cushion_tan.png +0 -0
  148. package/dashboard/public/assets/furniture/desk_wood.png +0 -0
  149. package/dashboard/public/assets/furniture/fancy_rug.png +0 -0
  150. package/dashboard/public/assets/furniture/fancy_rug_wide.png +0 -0
  151. package/dashboard/public/assets/furniture/flowers1.png +0 -0
  152. package/dashboard/public/assets/furniture/flowers2.png +0 -0
  153. package/dashboard/public/assets/furniture/lamp_tan.png +0 -0
  154. package/dashboard/public/assets/furniture/lantern.png +0 -0
  155. package/dashboard/public/assets/furniture/monstera.png +0 -0
  156. package/dashboard/public/assets/furniture/monstera_small.png +0 -0
  157. package/dashboard/public/assets/furniture/picture_frame.png +0 -0
  158. package/dashboard/public/assets/furniture/plant1.png +0 -0
  159. package/dashboard/public/assets/furniture/plant2.png +0 -0
  160. package/dashboard/public/assets/furniture/plant3.png +0 -0
  161. package/dashboard/public/assets/furniture/plant_poof.png +0 -0
  162. package/dashboard/public/assets/furniture/plant_spindly.png +0 -0
  163. package/dashboard/public/assets/furniture/poster_blue.png +0 -0
  164. package/dashboard/public/assets/furniture/rug.png +0 -0
  165. package/dashboard/public/assets/furniture/succulent_blue.png +0 -0
  166. package/dashboard/public/assets/furniture/succulent_green.png +0 -0
  167. package/dashboard/public/assets/furniture/treasurechest_closed_gold.png +0 -0
  168. package/dashboard/public/assets/furniture/water_cooler_better.png +0 -0
  169. package/dashboard/public/assets/furniture/whiteboard.png +0 -0
  170. package/dashboard/public/assets/furniture/whiteboard_stand_graph.png +0 -0
  171. package/dashboard/public/assets/furniture/window_blinds_open.png +0 -0
  172. package/dashboard/src/App.tsx +46 -0
  173. package/dashboard/src/components/SquadCard.tsx +47 -0
  174. package/dashboard/src/components/SquadSelector.tsx +61 -0
  175. package/dashboard/src/components/StatusBadge.tsx +32 -0
  176. package/dashboard/src/components/StatusBar.tsx +97 -0
  177. package/dashboard/src/hooks/useSquadSocket.ts +135 -0
  178. package/dashboard/src/lib/formatTime.ts +16 -0
  179. package/dashboard/src/lib/normalizeState.ts +25 -0
  180. package/dashboard/src/main.tsx +10 -0
  181. package/dashboard/src/office/AgentSprite.ts +241 -0
  182. package/dashboard/src/office/OfficeScene.ts +153 -0
  183. package/dashboard/src/office/PhaserGame.tsx +80 -0
  184. package/dashboard/src/office/RoomBuilder.ts +190 -0
  185. package/dashboard/src/office/assetKeys.ts +150 -0
  186. package/dashboard/src/office/palette.ts +32 -0
  187. package/dashboard/src/plugin/squadWatcher.ts +233 -0
  188. package/dashboard/src/store/useSquadStore.ts +56 -0
  189. package/dashboard/src/styles/globals.css +36 -0
  190. package/dashboard/src/types/state.ts +63 -0
  191. package/dashboard/src/vite-env.d.ts +1 -0
  192. package/dashboard/test-results/.last-run.json +4 -0
  193. package/dashboard/tsconfig.json +24 -0
  194. package/dashboard/tsconfig.tsbuildinfo +1 -0
  195. package/dashboard/vite.config.ts +13 -0
  196. package/package.json +53 -0
  197. package/skills/README.md +63 -0
  198. package/skills/apify/SKILL.md +55 -0
  199. package/skills/blotato/SKILL.md +63 -0
  200. package/skills/canva/SKILL.md +60 -0
  201. package/skills/conectese-agent-creator/SKILL.md +192 -0
  202. package/skills/conectese-skill-creator/SKILL.md +407 -0
  203. package/skills/conectese-skill-creator/agents/analyzer.md +274 -0
  204. package/skills/conectese-skill-creator/agents/comparator.md +202 -0
  205. package/skills/conectese-skill-creator/agents/grader.md +223 -0
  206. package/skills/conectese-skill-creator/assets/eval_review.html +146 -0
  207. package/skills/conectese-skill-creator/eval-viewer/generate_review.py +471 -0
  208. package/skills/conectese-skill-creator/eval-viewer/viewer.html +1325 -0
  209. package/skills/conectese-skill-creator/references/schemas.md +430 -0
  210. package/skills/conectese-skill-creator/references/skill-format.md +235 -0
  211. package/skills/conectese-skill-creator/scripts/__init__.py +0 -0
  212. package/skills/conectese-skill-creator/scripts/aggregate_benchmark.py +401 -0
  213. package/skills/conectese-skill-creator/scripts/quick_validate.py +103 -0
  214. package/skills/conectese-skill-creator/scripts/run_eval.py +310 -0
  215. package/skills/conectese-skill-creator/scripts/utils.py +47 -0
  216. package/skills/image-ai-generator/SKILL.md +124 -0
  217. package/skills/image-ai-generator/scripts/generate.py +175 -0
  218. package/skills/image-creator/SKILL.md +155 -0
  219. package/skills/image-fetcher/SKILL.md +91 -0
  220. package/skills/instagram-publisher/SKILL.md +119 -0
  221. package/skills/instagram-publisher/scripts/publish.js +165 -0
  222. package/skills/resend/SKILL.md +80 -0
  223. package/skills/template-designer/SKILL.md +201 -0
  224. package/skills/template-designer/base-templates/model-a.html +27 -0
  225. package/skills/template-designer/base-templates/model-b.html +31 -0
  226. package/skills/template-designer/base-templates/model-c.html +42 -0
  227. package/src/agents-cli.js +158 -0
  228. package/src/agents.js +134 -0
  229. package/src/i18n.js +48 -0
  230. package/src/init.js +341 -0
  231. package/src/locales/en.json +73 -0
  232. package/src/locales/es.json +72 -0
  233. package/src/locales/pt-BR.json +72 -0
  234. package/src/logger.js +38 -0
  235. package/src/prompt.js +46 -0
  236. package/src/readme/README.md +119 -0
  237. package/src/runs.js +90 -0
  238. package/src/skills-cli.js +157 -0
  239. package/src/skills.js +146 -0
  240. package/src/update.js +169 -0
  241. package/templates/_conectese/.conectese-version +1 -0
  242. package/templates/_conectese/_investigations/.gitkeep +0 -0
  243. package/templates/ide-templates/antigravity/.agent/rules/conectese.md +55 -0
  244. package/templates/ide-templates/antigravity/.agent/workflows/conectese.md +102 -0
  245. package/templates/ide-templates/claude-code/.claude/skills/conectese/SKILL.md +182 -0
  246. package/templates/ide-templates/claude-code/.mcp.json +8 -0
  247. package/templates/ide-templates/claude-code/CLAUDE.md +43 -0
  248. package/templates/ide-templates/codex/.agents/skills/conectese/SKILL.md +6 -0
  249. package/templates/ide-templates/codex/AGENTS.md +105 -0
  250. package/templates/ide-templates/cursor/.cursor/commands/conectese.md +9 -0
  251. package/templates/ide-templates/cursor/.cursor/mcp.json +8 -0
  252. package/templates/ide-templates/cursor/.cursor/rules/conectese.mdc +48 -0
  253. package/templates/ide-templates/cursor/.cursorignore +3 -0
  254. package/templates/ide-templates/opencode/.opencode/commands/conectese.md +9 -0
  255. package/templates/ide-templates/opencode/AGENTS.md +105 -0
  256. package/templates/ide-templates/vscode-copilot/.github/prompts/conectese.prompt.md +201 -0
  257. package/templates/ide-templates/vscode-copilot/.vscode/mcp.json +8 -0
  258. package/templates/ide-templates/vscode-copilot/.vscode/settings.json +3 -0
  259. package/templates/package.json +8 -0
  260. package/templates/squads/.gitkeep +0 -0
@@ -0,0 +1,401 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Aggregate individual run results into benchmark summary statistics.
4
+
5
+ Reads grading.json files from run directories and produces:
6
+ - run_summary with mean, stddev, min, max for each metric
7
+ - delta between with_skill and without_skill configurations
8
+
9
+ Usage:
10
+ python aggregate_benchmark.py <benchmark_dir>
11
+
12
+ Example:
13
+ python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/
14
+
15
+ The script supports two directory layouts:
16
+
17
+ Workspace layout (from skill-creator iterations):
18
+ <benchmark_dir>/
19
+ └── eval-N/
20
+ ├── with_skill/
21
+ │ ├── run-1/grading.json
22
+ │ └── run-2/grading.json
23
+ └── without_skill/
24
+ ├── run-1/grading.json
25
+ └── run-2/grading.json
26
+
27
+ Legacy layout (with runs/ subdirectory):
28
+ <benchmark_dir>/
29
+ └── runs/
30
+ └── eval-N/
31
+ ├── with_skill/
32
+ │ └── run-1/grading.json
33
+ └── without_skill/
34
+ └── run-1/grading.json
35
+ """
36
+
37
+ import argparse
38
+ import json
39
+ import math
40
+ import sys
41
+ from datetime import datetime, timezone
42
+ from pathlib import Path
43
+
44
+
45
def calculate_stats(values: list[float]) -> dict:
    """Summarise *values* as mean, sample standard deviation, min and max.

    An empty input yields all-zero statistics. The standard deviation uses
    the n - 1 denominator and is reported as 0.0 when there is only one
    value. Every figure is rounded to four decimal places.
    """
    if not values:
        return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}

    count = len(values)
    average = sum(values) / count

    spread = 0.0
    if count > 1:
        squared_error = sum((v - average) ** 2 for v in values)
        spread = math.sqrt(squared_error / (count - 1))

    summary = {
        "mean": average,
        "stddev": spread,
        "min": min(values),
        "max": max(values),
    }
    return {key: round(figure, 4) for key, figure in summary.items()}
65
+
66
+
67
def _resolve_eval_id(eval_dir: Path, fallback: int):
    """Return the eval id for *eval_dir*, or *fallback* when none can be read.

    Prefers the "eval_id" field of eval_metadata.json; otherwise parses the
    number out of the directory name ("eval-<N>").
    """
    metadata_path = eval_dir / "eval_metadata.json"
    if metadata_path.exists():
        try:
            with open(metadata_path, encoding="utf-8") as mf:
                metadata = json.load(mf)
        except (OSError, ValueError):
            # ValueError covers both malformed JSON and undecodable bytes.
            return fallback
        if isinstance(metadata, dict):
            return metadata.get("eval_id", fallback)
        return fallback

    try:
        return int(eval_dir.name.split("-")[1])
    except (IndexError, ValueError):
        return fallback


def _load_single_run(run_dir: Path, eval_id):
    """Build the result record for one run directory.

    Returns None (after printing a warning) when the run cannot be used:
    the directory name has no numeric suffix, grading.json is missing,
    unreadable, not valid JSON, or not a JSON object.
    """
    try:
        run_number = int(run_dir.name.split("-")[1])
    except (IndexError, ValueError):
        # e.g. "run-final": skip it instead of aborting the whole aggregation.
        print(f"Warning: cannot parse run number from {run_dir}, skipping")
        return None

    grading_file = run_dir / "grading.json"
    if not grading_file.exists():
        print(f"Warning: grading.json not found in {run_dir}")
        return None

    try:
        with open(grading_file, encoding="utf-8") as f:
            grading = json.load(f)
    except json.JSONDecodeError as e:
        print(f"Warning: Invalid JSON in {grading_file}: {e}")
        return None
    except (OSError, UnicodeDecodeError) as e:
        print(f"Warning: Could not read {grading_file}: {e}")
        return None

    if not isinstance(grading, dict):
        print(f"Warning: {grading_file} does not contain a JSON object")
        return None

    # Extract metrics
    summary = grading.get("summary", {})
    result = {
        "eval_id": eval_id,
        "run_number": run_number,
        "pass_rate": summary.get("pass_rate", 0.0),
        "passed": summary.get("passed", 0),
        "failed": summary.get("failed", 0),
        "total": summary.get("total", 0),
    }

    # Extract timing — check grading.json first, then sibling timing.json
    timing = grading.get("timing", {})
    result["time_seconds"] = timing.get("total_duration_seconds", 0.0)
    timing_file = run_dir / "timing.json"
    if result["time_seconds"] == 0.0 and timing_file.exists():
        try:
            with open(timing_file, encoding="utf-8") as tf:
                timing_data = json.load(tf)
        except (OSError, ValueError) as e:
            # Timing is optional: keep the run, but say why timing is absent.
            print(f"Warning: Could not read {timing_file}: {e}")
        else:
            if isinstance(timing_data, dict):
                result["time_seconds"] = timing_data.get("total_duration_seconds", 0.0)
                result["tokens"] = timing_data.get("total_tokens", 0)
            else:
                print(f"Warning: {timing_file} does not contain a JSON object")

    # Extract metrics if available
    metrics = grading.get("execution_metrics", {})
    result["tool_calls"] = metrics.get("total_tool_calls", 0)
    if not result.get("tokens"):
        # No token count from timing.json: fall back to the output_chars metric.
        result["tokens"] = metrics.get("output_chars", 0)
    result["errors"] = metrics.get("errors_encountered", 0)

    # Extract expectations — viewer requires fields: text, passed, evidence
    raw_expectations = grading.get("expectations", [])
    for exp in raw_expectations:
        if not isinstance(exp, dict) or "text" not in exp or "passed" not in exp:
            print(f"Warning: expectation in {grading_file} missing required fields (text, passed, evidence): {exp}")
    result["expectations"] = raw_expectations

    # Extract notes from user_notes_summary
    notes_summary = grading.get("user_notes_summary", {})
    notes = []
    notes.extend(notes_summary.get("uncertainties", []))
    notes.extend(notes_summary.get("needs_review", []))
    notes.extend(notes_summary.get("workarounds", []))
    result["notes"] = notes

    return result


def load_run_results(benchmark_dir: Path) -> dict:
    """
    Load all run results from a benchmark directory.

    Eval directories ("eval-*") are looked up under <benchmark_dir>/runs when
    that directory exists, otherwise directly under <benchmark_dir>. Inside
    each eval directory, every sub-directory that holds at least one "run-*"
    directory is treated as a configuration.

    Returns dict keyed by config name (e.g. "with_skill"/"without_skill",
    or "new_skill"/"old_skill"), each containing a list of run results.
    Returns an empty dict (after printing a message) when no eval
    directories are found. Runs that cannot be loaded are skipped with a
    warning rather than aborting the aggregation.
    """
    # Support both layouts: eval dirs directly under benchmark_dir, or under runs/
    runs_dir = benchmark_dir / "runs"
    if runs_dir.exists():
        search_dir = runs_dir
    elif list(benchmark_dir.glob("eval-*")):
        search_dir = benchmark_dir
    else:
        print(f"No eval directories found in {benchmark_dir} or {benchmark_dir / 'runs'}")
        return {}

    results: dict[str, list] = {}

    # Only directories count: a stray file named "eval-..." cannot be iterated.
    eval_dirs = sorted(p for p in search_dir.glob("eval-*") if p.is_dir())

    for eval_idx, eval_dir in enumerate(eval_dirs):
        eval_id = _resolve_eval_id(eval_dir, eval_idx)

        # Discover config directories dynamically rather than hardcoding names
        for config_dir in sorted(eval_dir.iterdir()):
            if not config_dir.is_dir():
                continue
            # Skip non-config directories (inputs, outputs, etc.)
            run_dirs = sorted(p for p in config_dir.glob("run-*") if p.is_dir())
            if not run_dirs:
                continue

            config_results = results.setdefault(config_dir.name, [])
            for run_dir in run_dirs:
                result = _load_single_run(run_dir, eval_id)
                if result is not None:
                    config_results.append(result)

    return results
174
+
175
+
176
def aggregate_results(results: dict) -> dict:
    """
    Collapse per-run results into per-configuration statistics.

    For every configuration in *results* the pass rate, duration and token
    count of its runs are summarised with calculate_stats (all zeros when the
    configuration has no runs). A "delta" entry holds the signed difference
    of the means between the first and the second configuration, formatted
    as strings; a missing configuration contributes 0.
    """
    config_names = list(results.keys())
    run_summary = {}

    for name in config_names:
        config_runs = results.get(name, [])

        if config_runs:
            # Gather all three series first, then summarise each one.
            series = {
                "pass_rate": [run["pass_rate"] for run in config_runs],
                "time_seconds": [run["time_seconds"] for run in config_runs],
                "tokens": [run.get("tokens", 0) for run in config_runs],
            }
            run_summary[name] = {
                metric: calculate_stats(samples) for metric, samples in series.items()
            }
        else:
            run_summary[name] = {
                "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0},
            }

    # The first configuration is compared against the second (when present).
    primary = run_summary.get(config_names[0], {}) if config_names else {}
    baseline = run_summary.get(config_names[1], {}) if len(config_names) >= 2 else {}

    def mean_gap(metric):
        return primary.get(metric, {}).get("mean", 0) - baseline.get(metric, {}).get("mean", 0)

    gap_pass_rate = mean_gap("pass_rate")
    gap_time = mean_gap("time_seconds")
    gap_tokens = mean_gap("tokens")

    run_summary["delta"] = {
        "pass_rate": f"{gap_pass_rate:+.2f}",
        "time_seconds": f"{gap_time:+.1f}",
        "tokens": f"{gap_tokens:+.0f}",
    }

    return run_summary
225
+
226
+
227
def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: str = "") -> dict:
    """
    Generate complete benchmark.json from run results.

    Loads every run under *benchmark_dir*, aggregates them, and returns a
    dict with four sections: "metadata", "runs" (one flat entry per run),
    "run_summary" (per-configuration statistics plus delta) and an empty
    "notes" list. Empty *skill_name* / *skill_path* are replaced by
    placeholder strings; the model names are always placeholders.
    """
    results = load_run_results(benchmark_dir)
    run_summary = aggregate_results(results)

    # Build runs array for benchmark.json
    runs = []
    # Number of runs seen for each (configuration, eval) pair.
    runs_per_cell: dict = {}
    for config, config_results in results.items():
        for result in config_results:
            cell = (config, result["eval_id"])
            runs_per_cell[cell] = runs_per_cell.get(cell, 0) + 1
            runs.append({
                "eval_id": result["eval_id"],
                "configuration": config,
                "run_number": result["run_number"],
                "result": {
                    "pass_rate": result["pass_rate"],
                    "passed": result["passed"],
                    "failed": result["failed"],
                    "total": result["total"],
                    "time_seconds": result["time_seconds"],
                    "tokens": result.get("tokens", 0),
                    "tool_calls": result.get("tool_calls", 0),
                    "errors": result.get("errors", 0)
                },
                "expectations": result["expectations"],
                "notes": result["notes"]
            })

    # Determine eval IDs from results
    unique_eval_ids = {
        r["eval_id"]
        for config_results in results.values()
        for r in config_results
    }
    try:
        eval_ids = sorted(unique_eval_ids)
    except TypeError:
        # IDs read from eval_metadata.json may be strings while derived IDs
        # are ints; fall back to a string ordering so mixed IDs still sort.
        eval_ids = sorted(unique_eval_ids, key=str)

    # Report the run count that was actually loaded (largest count for any
    # configuration/eval pair) instead of a fixed number.
    runs_per_configuration = max(runs_per_cell.values(), default=0)

    benchmark = {
        "metadata": {
            "skill_name": skill_name or "<skill-name>",
            "skill_path": skill_path or "<path/to/skill>",
            "executor_model": "<model-name>",
            "analyzer_model": "<model-name>",
            "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "evals_run": eval_ids,
            "runs_per_configuration": runs_per_configuration
        },
        "runs": runs,
        "run_summary": run_summary,
        "notes": []  # To be filled by analyzer
    }

    return benchmark
279
+
280
+
281
def generate_markdown(benchmark: dict) -> str:
    """Render *benchmark* as the text of a human-readable benchmark.md.

    The document has a title and metadata header, a summary table comparing
    the first two configurations in run_summary (pass rate, time, tokens,
    delta), and a "Notes" section when the benchmark carries any notes.
    Placeholder names "config_a" / "config_b" stand in for configurations
    that are absent.
    """
    metadata = benchmark["metadata"]
    run_summary = benchmark["run_summary"]

    # Every key except the synthetic "delta" entry names a configuration.
    config_names = [key for key in run_summary if key != "delta"]
    config_a = config_names[0] if config_names else "config_a"
    config_b = config_names[1] if len(config_names) > 1 else "config_b"
    label_a, label_b = (
        name.replace("_", " ").title() for name in (config_a, config_b)
    )

    lines = []
    lines.append(f"# Skill Benchmark: {metadata['skill_name']}")
    lines.append("")
    lines.append(f"**Model**: {metadata['executor_model']}")
    lines.append(f"**Date**: {metadata['timestamp']}")
    lines.append(
        f"**Evals**: {', '.join(str(e) for e in metadata['evals_run'])} "
        f"({metadata['runs_per_configuration']} runs each per configuration)"
    )
    lines.append("")
    lines.append("## Summary")
    lines.append("")
    lines.append(f"| Metric | {label_a} | {label_b} | Delta |")
    lines.append("|--------|------------|---------------|-------|")

    summary_a = run_summary.get(config_a, {})
    summary_b = run_summary.get(config_b, {})
    delta = run_summary.get("delta", {})

    # Pass rate row: stored as fractions, displayed as whole percentages.
    pr_a = summary_a.get("pass_rate", {})
    pr_b = summary_b.get("pass_rate", {})
    lines.append(
        f"| Pass Rate | {pr_a.get('mean', 0)*100:.0f}% ± {pr_a.get('stddev', 0)*100:.0f}% "
        f"| {pr_b.get('mean', 0)*100:.0f}% ± {pr_b.get('stddev', 0)*100:.0f}% "
        f"| {delta.get('pass_rate', '—')} |"
    )

    # Time row
    time_a = summary_a.get("time_seconds", {})
    time_b = summary_b.get("time_seconds", {})
    lines.append(
        f"| Time | {time_a.get('mean', 0):.1f}s ± {time_a.get('stddev', 0):.1f}s "
        f"| {time_b.get('mean', 0):.1f}s ± {time_b.get('stddev', 0):.1f}s "
        f"| {delta.get('time_seconds', '—')}s |"
    )

    # Tokens row
    tok_a = summary_a.get("tokens", {})
    tok_b = summary_b.get("tokens", {})
    lines.append(
        f"| Tokens | {tok_a.get('mean', 0):.0f} ± {tok_a.get('stddev', 0):.0f} "
        f"| {tok_b.get('mean', 0):.0f} ± {tok_b.get('stddev', 0):.0f} "
        f"| {delta.get('tokens', '—')} |"
    )

    # Notes section, only when there is something to list.
    if benchmark.get("notes"):
        lines += ["", "## Notes", ""]
        lines += [f"- {note}" for note in benchmark["notes"]]

    return "\n".join(lines)
336
+
337
+
338
def main():
    """Command-line entry point: aggregate a benchmark directory.

    Parses the arguments, builds the benchmark with generate_benchmark,
    writes it as JSON (default: <benchmark_dir>/benchmark.json) and as
    markdown next to it (same path with a .md suffix), then prints the pass
    rate of every configuration and the delta. Exits with status 1 when the
    benchmark directory does not exist or is not a directory.
    """
    parser = argparse.ArgumentParser(
        description="Aggregate benchmark run results into summary statistics"
    )
    parser.add_argument(
        "benchmark_dir",
        type=Path,
        help="Path to the benchmark directory"
    )
    parser.add_argument(
        "--skill-name",
        default="",
        help="Name of the skill being benchmarked"
    )
    parser.add_argument(
        "--skill-path",
        default="",
        help="Path to the skill being benchmarked"
    )
    parser.add_argument(
        "--output", "-o",
        type=Path,
        help="Output path for benchmark.json (default: <benchmark_dir>/benchmark.json)"
    )

    args = parser.parse_args()

    # A plain file would pass exists() and only fail later when the default
    # output path is opened beneath it, so require a directory up front.
    if not args.benchmark_dir.is_dir():
        print(f"Directory not found: {args.benchmark_dir}")
        sys.exit(1)

    # Generate benchmark
    benchmark = generate_benchmark(args.benchmark_dir, args.skill_name, args.skill_path)

    # Determine output paths
    output_json = args.output or (args.benchmark_dir / "benchmark.json")
    output_md = output_json.with_suffix(".md")

    # Write benchmark.json
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(benchmark, f, indent=2)
    print(f"Generated: {output_json}")

    # Write benchmark.md — the report contains non-ASCII characters ("±",
    # "—"), so the encoding is fixed rather than left to the locale default.
    markdown = generate_markdown(benchmark)
    with open(output_md, "w", encoding="utf-8") as f:
        f.write(markdown)
    print(f"Generated: {output_md}")

    # Print summary
    run_summary = benchmark["run_summary"]
    configs = [k for k in run_summary if k != "delta"]
    delta = run_summary.get("delta", {})

    print("\nSummary:")
    for config in configs:
        pr = run_summary[config]["pass_rate"]["mean"]
        label = config.replace("_", " ").title()
        print(f" {label}: {pr*100:.1f}% pass rate")
    print(f" Delta: {delta.get('pass_rate', '—')}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,103 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Quick validation script for skills - minimal version
4
+ """
5
+
6
+ import sys
7
+ import os
8
+ import re
9
+ import yaml
10
+ from pathlib import Path
11
+
12
def validate_skill(skill_path):
    """Basic validation of a skill.

    Checks that <skill_path>/SKILL.md exists, starts with a YAML frontmatter
    block, and that the frontmatter is a mapping containing only the allowed
    properties, with a non-empty kebab-case ``name`` (max 64 characters), a
    non-empty ``description`` (max 1024 characters, no angle brackets) and,
    if present, a string ``compatibility`` (max 500 characters).

    Returns a ``(is_valid, message)`` tuple; the message explains the first
    problem found, or confirms that the skill is valid.
    """
    skill_path = Path(skill_path)

    # Check SKILL.md exists
    skill_md = skill_path / 'SKILL.md'
    if not skill_md.exists():
        return False, "SKILL.md not found"

    # Read and validate frontmatter. The encoding is explicit so the result
    # does not depend on the locale, and an unreadable file is reported as a
    # validation failure instead of escaping as an exception.
    try:
        content = skill_md.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        return False, f"Could not read SKILL.md: {e}"
    if not content.startswith('---'):
        return False, "No YAML frontmatter found"

    # Extract frontmatter
    match = re.match(r'^---\n(.*?)\n---', content, re.DOTALL)
    if not match:
        return False, "Invalid frontmatter format"

    frontmatter_text = match.group(1)

    # Parse YAML frontmatter
    try:
        frontmatter = yaml.safe_load(frontmatter_text)
        if not isinstance(frontmatter, dict):
            return False, "Frontmatter must be a YAML dictionary"
    except yaml.YAMLError as e:
        return False, f"Invalid YAML in frontmatter: {e}"

    # Define allowed properties
    ALLOWED_PROPERTIES = {'name', 'description', 'license', 'allowed-tools', 'metadata', 'compatibility'}

    # Check for unexpected properties (excluding nested keys under metadata)
    unexpected_keys = set(frontmatter.keys()) - ALLOWED_PROPERTIES
    if unexpected_keys:
        # YAML keys are not necessarily strings (e.g. `1: x`); stringify them
        # so that sorting and joining cannot raise.
        unexpected_names = sorted(str(key) for key in unexpected_keys)
        return False, (
            f"Unexpected key(s) in SKILL.md frontmatter: {', '.join(unexpected_names)}. "
            f"Allowed properties are: {', '.join(sorted(ALLOWED_PROPERTIES))}"
        )

    # Check required fields
    if 'name' not in frontmatter:
        return False, "Missing 'name' in frontmatter"
    if 'description' not in frontmatter:
        return False, "Missing 'description' in frontmatter"

    # Extract name for validation
    name = frontmatter.get('name', '')
    if not isinstance(name, str):
        return False, f"Name must be a string, got {type(name).__name__}"
    name = name.strip()
    if not name:
        # The field is required, so a blank value is as invalid as a missing one.
        return False, "Name cannot be empty"
    # Check naming convention (kebab-case: lowercase with hyphens)
    if not re.match(r'^[a-z0-9-]+$', name):
        return False, f"Name '{name}' should be kebab-case (lowercase letters, digits, and hyphens only)"
    if name.startswith('-') or name.endswith('-') or '--' in name:
        return False, f"Name '{name}' cannot start/end with hyphen or contain consecutive hyphens"
    # Check name length (max 64 characters per spec)
    if len(name) > 64:
        return False, f"Name is too long ({len(name)} characters). Maximum is 64 characters."

    # Extract and validate description
    description = frontmatter.get('description', '')
    if not isinstance(description, str):
        return False, f"Description must be a string, got {type(description).__name__}"
    description = description.strip()
    if not description:
        return False, "Description cannot be empty"
    # Check for angle brackets
    if '<' in description or '>' in description:
        return False, "Description cannot contain angle brackets (< or >)"
    # Check description length (max 1024 characters per spec)
    if len(description) > 1024:
        return False, f"Description is too long ({len(description)} characters). Maximum is 1024 characters."

    # Validate compatibility field if present (optional). Testing against
    # None rather than truthiness keeps falsy non-strings (0, [], False)
    # from slipping past the type check.
    compatibility = frontmatter.get('compatibility')
    if compatibility is not None:
        if not isinstance(compatibility, str):
            return False, f"Compatibility must be a string, got {type(compatibility).__name__}"
        if len(compatibility) > 500:
            return False, f"Compatibility is too long ({len(compatibility)} characters). Maximum is 500 characters."

    return True, "Skill is valid!"
95
+
96
if __name__ == "__main__":
    # Exactly one positional argument is expected: the skill directory.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: python quick_validate.py <skill_directory>")
        sys.exit(1)

    # Report the verdict and mirror it in the exit status (0 = valid).
    is_valid, report = validate_skill(cli_args[0])
    print(report)
    sys.exit(1 if not is_valid else 0)
+ sys.exit(0 if valid else 1)