@devrev-computer/skills 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -0
- package/bin/install.mjs +158 -0
- package/package.json +33 -0
- package/skills/account-evaluation/account-evaluation.md +64 -0
- package/skills/account-research/account-research.md +323 -0
- package/skills/account-research/references/signals-guide.md +52 -0
- package/skills/create-workflow-template/create-workflow-template.md +1091 -0
- package/skills/create-workflow-template/examples/3592-Generate rca from pia-template.json +1 -0
- package/skills/create-workflow-template/examples/4392-Async opportunity review agent-template.json +1 -0
- package/skills/create-workflow-template/examples/4441-Ticket escalator from customer message-template.json +1 -0
- package/skills/create-workflow-template/examples/4505-Auto-update issue tcd as end of sprint date-template.json +1 -0
- package/skills/create-workflow-template/examples/5040-Devrevu - enablement journey - poc emails-template.json +1 -0
- package/skills/create-workflow-template/examples/5158-Devrevu - enablement journey - mailing for non enablement journey users-template.json +1 -0
- package/skills/create-workflow-template/examples/5216-Account segment missing notification-template.json +1 -0
- package/skills/create-workflow-template/examples/working-csat-score-on-ticket-resolved.json +1 -0
- package/skills/create-workflow-template/examples/working-enhancement-replace-agent.json +1 -0
- package/skills/create-workflow-template/examples/working-invoke-code-sample.json +1 -0
- package/skills/create-workflow-template/examples/working-loop-variable-sample.json +1 -0
- package/skills/create-workflow-template/operations/actions.md +2919 -0
- package/skills/create-workflow-template/operations/blockings.md +38 -0
- package/skills/create-workflow-template/operations/controls.md +108 -0
- package/skills/create-workflow-template/operations/schema-index.md +166 -0
- package/skills/create-workflow-template/operations/schemas/account_created.md +58 -0
- package/skills/create-workflow-template/operations/schemas/account_updated.md +73 -0
- package/skills/create-workflow-template/operations/schemas/add_comment.md +29 -0
- package/skills/create-workflow-template/operations/schemas/airdrop_sync_run_started.md +33 -0
- package/skills/create-workflow-template/operations/schemas/airdrop_sync_run_status_updated.md +35 -0
- package/skills/create-workflow-template/operations/schemas/article_created.md +96 -0
- package/skills/create-workflow-template/operations/schemas/article_updated.md +135 -0
- package/skills/create-workflow-template/operations/schemas/ask_ai.md +11 -0
- package/skills/create-workflow-template/operations/schemas/classify_object.md +22 -0
- package/skills/create-workflow-template/operations/schemas/contact_created.md +43 -0
- package/skills/create-workflow-template/operations/schemas/contact_updated.md +65 -0
- package/skills/create-workflow-template/operations/schemas/conversation_created.md +108 -0
- package/skills/create-workflow-template/operations/schemas/conversation_sla_tracker_updated.md +46 -0
- package/skills/create-workflow-template/operations/schemas/conversation_updated.md +130 -0
- package/skills/create-workflow-template/operations/schemas/convert_conversation_to_ticket.md +13 -0
- package/skills/create-workflow-template/operations/schemas/create_account.md +62 -0
- package/skills/create-workflow-template/operations/schemas/create_article.md +79 -0
- package/skills/create-workflow-template/operations/schemas/create_brand.md +42 -0
- package/skills/create-workflow-template/operations/schemas/create_contact.md +65 -0
- package/skills/create-workflow-template/operations/schemas/create_dm.md +53 -0
- package/skills/create-workflow-template/operations/schemas/create_enhancement.md +63 -0
- package/skills/create-workflow-template/operations/schemas/create_incident.md +136 -0
- package/skills/create-workflow-template/operations/schemas/create_issue.md +150 -0
- package/skills/create-workflow-template/operations/schemas/create_meeting.md +105 -0
- package/skills/create-workflow-template/operations/schemas/create_opportunity.md +123 -0
- package/skills/create-workflow-template/operations/schemas/create_ticket.md +184 -0
- package/skills/create-workflow-template/operations/schemas/csat_response_received.md +73 -0
- package/skills/create-workflow-template/operations/schemas/dev_user_created.md +54 -0
- package/skills/create-workflow-template/operations/schemas/dev_user_updated.md +99 -0
- package/skills/create-workflow-template/operations/schemas/enhancement_created.md +46 -0
- package/skills/create-workflow-template/operations/schemas/enhancement_updated.md +89 -0
- package/skills/create-workflow-template/operations/schemas/evaluate_sentiment.md +14 -0
- package/skills/create-workflow-template/operations/schemas/execute_metric_action.md +11 -0
- package/skills/create-workflow-template/operations/schemas/feature_created.md +40 -0
- package/skills/create-workflow-template/operations/schemas/for_each.md +45 -0
- package/skills/create-workflow-template/operations/schemas/get_account.md +59 -0
- package/skills/create-workflow-template/operations/schemas/get_airdrop_sync_unit.md +32 -0
- package/skills/create-workflow-template/operations/schemas/get_brand.md +40 -0
- package/skills/create-workflow-template/operations/schemas/get_complete_enhancement_details.md +13 -0
- package/skills/create-workflow-template/operations/schemas/get_conversation.md +120 -0
- package/skills/create-workflow-template/operations/schemas/get_customer.md +60 -0
- package/skills/create-workflow-template/operations/schemas/get_enhancement.md +66 -0
- package/skills/create-workflow-template/operations/schemas/get_feature.md +56 -0
- package/skills/create-workflow-template/operations/schemas/get_incident.md +85 -0
- package/skills/create-workflow-template/operations/schemas/get_issue.md +117 -0
- package/skills/create-workflow-template/operations/schemas/get_kg_schema.md +23 -0
- package/skills/create-workflow-template/operations/schemas/get_meeting.md +87 -0
- package/skills/create-workflow-template/operations/schemas/get_metric_trackers.md +20 -0
- package/skills/create-workflow-template/operations/schemas/get_node_schema.md +29 -0
- package/skills/create-workflow-template/operations/schemas/get_opportunity.md +93 -0
- package/skills/create-workflow-template/operations/schemas/get_org_user.md +57 -0
- package/skills/create-workflow-template/operations/schemas/get_org_user_preference.md +40 -0
- package/skills/create-workflow-template/operations/schemas/get_part.md +55 -0
- package/skills/create-workflow-template/operations/schemas/get_self.md +54 -0
- package/skills/create-workflow-template/operations/schemas/get_session_details.md +45 -0
- package/skills/create-workflow-template/operations/schemas/get_sprint_board.md +103 -0
- package/skills/create-workflow-template/operations/schemas/get_ticket.md +136 -0
- package/skills/create-workflow-template/operations/schemas/get_workspace.md +21 -0
- package/skills/create-workflow-template/operations/schemas/go_back.md +13 -0
- package/skills/create-workflow-template/operations/schemas/http.md +38 -0
- package/skills/create-workflow-template/operations/schemas/hybrid_search.md +144 -0
- package/skills/create-workflow-template/operations/schemas/if_else.md +16 -0
- package/skills/create-workflow-template/operations/schemas/incident_created.md +88 -0
- package/skills/create-workflow-template/operations/schemas/incident_updated.md +126 -0
- package/skills/create-workflow-template/operations/schemas/init_variable.md +67 -0
- package/skills/create-workflow-template/operations/schemas/invoice_created.md +21 -0
- package/skills/create-workflow-template/operations/schemas/invoice_updated.md +41 -0
- package/skills/create-workflow-template/operations/schemas/invoke_code.md +132 -0
- package/skills/create-workflow-template/operations/schemas/issue_created.md +105 -0
- package/skills/create-workflow-template/operations/schemas/issue_sla_tracker_updated.md +46 -0
- package/skills/create-workflow-template/operations/schemas/issue_updated.md +172 -0
- package/skills/create-workflow-template/operations/schemas/link_incident_with_issue.md +14 -0
- package/skills/create-workflow-template/operations/schemas/link_ticket_with_issue.md +14 -0
- package/skills/create-workflow-template/operations/schemas/list_enhancements.md +74 -0
- package/skills/create-workflow-template/operations/schemas/list_issues.md +108 -0
- package/skills/create-workflow-template/operations/schemas/list_sessions.md +79 -0
- package/skills/create-workflow-template/operations/schemas/list_sprint.md +29 -0
- package/skills/create-workflow-template/operations/schemas/list_web_sessions.md +87 -0
- package/skills/create-workflow-template/operations/schemas/loop_over_accounts.md +106 -0
- package/skills/create-workflow-template/operations/schemas/loop_over_articles.md +126 -0
- package/skills/create-workflow-template/operations/schemas/loop_over_customers.md +88 -0
- package/skills/create-workflow-template/operations/schemas/loop_over_dev_users.md +75 -0
- package/skills/create-workflow-template/operations/schemas/loop_over_enhancements.md +112 -0
- package/skills/create-workflow-template/operations/schemas/loop_over_incidents.md +113 -0
- package/skills/create-workflow-template/operations/schemas/loop_over_issues.md +217 -0
- package/skills/create-workflow-template/operations/schemas/loop_over_meetings.md +150 -0
- package/skills/create-workflow-template/operations/schemas/loop_over_opportunity.md +161 -0
- package/skills/create-workflow-template/operations/schemas/loop_over_sprints.md +50 -0
- package/skills/create-workflow-template/operations/schemas/loop_over_tickets.md +203 -0
- package/skills/create-workflow-template/operations/schemas/manual_trigger.md +11 -0
- package/skills/create-workflow-template/operations/schemas/meeting_created.md +116 -0
- package/skills/create-workflow-template/operations/schemas/meeting_updated.md +152 -0
- package/skills/create-workflow-template/operations/schemas/oasis_sql_execute.md +11 -0
- package/skills/create-workflow-template/operations/schemas/opportunity_created.md +92 -0
- package/skills/create-workflow-template/operations/schemas/opportunity_updated.md +124 -0
- package/skills/create-workflow-template/operations/schemas/pick_user.md +16 -0
- package/skills/create-workflow-template/operations/schemas/question_answer_created.md +44 -0
- package/skills/create-workflow-template/operations/schemas/question_answer_updated.md +75 -0
- package/skills/create-workflow-template/operations/schemas/recall_chats.md +13 -0
- package/skills/create-workflow-template/operations/schemas/router.md +15 -0
- package/skills/create-workflow-template/operations/schemas/send_notification.md +19 -0
- package/skills/create-workflow-template/operations/schemas/set_variable.md +67 -0
- package/skills/create-workflow-template/operations/schemas/sleep_for.md +12 -0
- package/skills/create-workflow-template/operations/schemas/sleep_until.md +17 -0
- package/skills/create-workflow-template/operations/schemas/sprint_updated.md +37 -0
- package/skills/create-workflow-template/operations/schemas/suggest_part.md +14 -0
- package/skills/create-workflow-template/operations/schemas/task_updated.md +79 -0
- package/skills/create-workflow-template/operations/schemas/test_example.md +16 -0
- package/skills/create-workflow-template/operations/schemas/ticket_created.md +136 -0
- package/skills/create-workflow-template/operations/schemas/ticket_sla_tracker_updated.md +46 -0
- package/skills/create-workflow-template/operations/schemas/ticket_updated.md +198 -0
- package/skills/create-workflow-template/operations/schemas/timeline_comment_created.md +70 -0
- package/skills/create-workflow-template/operations/schemas/update_account.md +68 -0
- package/skills/create-workflow-template/operations/schemas/update_article.md +95 -0
- package/skills/create-workflow-template/operations/schemas/update_brand.md +44 -0
- package/skills/create-workflow-template/operations/schemas/update_contact.md +53 -0
- package/skills/create-workflow-template/operations/schemas/update_conversation.md +149 -0
- package/skills/create-workflow-template/operations/schemas/update_enhancement.md +64 -0
- package/skills/create-workflow-template/operations/schemas/update_incident.md +156 -0
- package/skills/create-workflow-template/operations/schemas/update_issue.md +173 -0
- package/skills/create-workflow-template/operations/schemas/update_meeting.md +114 -0
- package/skills/create-workflow-template/operations/schemas/update_opportunity.md +137 -0
- package/skills/create-workflow-template/operations/schemas/update_question_answer.md +60 -0
- package/skills/create-workflow-template/operations/schemas/update_ticket.md +188 -0
- package/skills/create-workflow-template/operations/schemas/watch_ticket_for_updates.md +225 -0
- package/skills/create-workflow-template/operations/schemas/web_search.md +17 -0
- package/skills/create-workflow-template/operations/schemas/while.md +24 -0
- package/skills/create-workflow-template/operations/schemas/widget_created.md +75 -0
- package/skills/create-workflow-template/operations/schemas/widget_updated.md +98 -0
- package/skills/create-workflow-template/operations/schemas/workspace_created.md +20 -0
- package/skills/create-workflow-template/operations/triggers.md +1583 -0
- package/skills/customer-brief/customer-brief.md +66 -0
- package/skills/deal-review-meddpicc/deal-review-meddpicc.md +58 -0
- package/skills/next-step-for-opportunity/next-step-for-opportunity.md +55 -0
- package/skills/opportunity-feature-prioritizer/SKILL.md +183 -0
- package/skills/sales-call-plan-coach/sales-call-plan-coach.md +73 -0
- package/skills/sales-context/sales-context.md +44 -0
- package/skills/sales-search-and-lookup/sales-search-and-lookup.md +58 -0
- package/skills/skill-creator/SKILL.md +570 -0
- package/skills/skill-creator/agents/analyzer.md +274 -0
- package/skills/skill-creator/agents/comparator.md +202 -0
- package/skills/skill-creator/agents/grader.md +223 -0
- package/skills/skill-creator/assets/eval_review.html +146 -0
- package/skills/skill-creator/eval-viewer/generate_review.py +471 -0
- package/skills/skill-creator/eval-viewer/viewer.html +1325 -0
- package/skills/skill-creator/references/schemas.md +430 -0
- package/skills/skill-creator/references/tool-patterns.md +290 -0
- package/skills/skill-creator/scripts/__init__.py +0 -0
- package/skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
- package/skills/skill-creator/scripts/generate_report.py +326 -0
- package/skills/skill-creator/scripts/improve_description.py +247 -0
- package/skills/skill-creator/scripts/package_skill.py +136 -0
- package/skills/skill-creator/scripts/quick_validate.py +103 -0
- package/skills/skill-creator/scripts/run_eval.py +310 -0
- package/skills/skill-creator/scripts/run_loop.py +328 -0
- package/skills/skill-creator/scripts/utils.py +47 -0
- package/skills/trace-diagnosis/trace-diagnosis.md +186 -0
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
# Tool Patterns for Platform-Integrated Skills
|
|
2
|
+
|
|
3
|
+
This reference covers patterns for writing skills that leverage platform tools effectively. The key principle: **describe capabilities, not tool names**. Tools change; capabilities persist.
|
|
4
|
+
|
|
5
|
+
## Table of Contents
|
|
6
|
+
|
|
7
|
+
1. [Tool Discovery Pattern](#tool-discovery-pattern)
|
|
8
|
+
2. [Data Querying Skills](#data-querying-skills)
|
|
9
|
+
3. [Workflow Management Skills](#workflow-management-skills)
|
|
10
|
+
4. [Communication Skills](#communication-skills)
|
|
11
|
+
5. [Observability Skills](#observability-skills)
|
|
12
|
+
6. [Multi-Tool Orchestration](#multi-tool-orchestration)
|
|
13
|
+
7. [Common Skill Archetypes](#common-skill-archetypes)
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## Tool Discovery Pattern
|
|
18
|
+
|
|
19
|
+
Every skill that depends on external tools should include a discovery step. Here's the pattern:
|
|
20
|
+
|
|
21
|
+
```markdown
|
|
22
|
+
## Prerequisites
|
|
23
|
+
|
|
24
|
+
Before starting, list the tools available in your current session:
|
|
25
|
+
|
|
26
|
+
1. Scan the tool list and identify tools relevant to [the domain this skill needs]
|
|
27
|
+
2. Group them by capability (data querying, work management, messaging, etc.)
|
|
28
|
+
3. If the required capabilities aren't available, inform the user what's needed and stop
|
|
29
|
+
4. Note the exact tool names for use in subsequent steps
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
Why this matters: A skill installed in one environment might be used in another where different tools are configured. Tool discovery makes skills portable. The discovery step should work in any client — just list the tools you have.
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## Data Querying Skills
|
|
37
|
+
|
|
38
|
+
Skills that query organizational data (reports, dashboards, analytics) typically follow this pattern:
|
|
39
|
+
|
|
40
|
+
### The Schema-First Pattern
|
|
41
|
+
|
|
42
|
+
```markdown
|
|
43
|
+
## Workflow
|
|
44
|
+
|
|
45
|
+
### Step 1: Understand the data model
|
|
46
|
+
Before writing any queries, discover what data is available:
|
|
47
|
+
- Look for schema or knowledge graph tools in your tool list
|
|
48
|
+
- Use them to get an overview of all objects and relationships
|
|
49
|
+
- For specific objects you'll query, get their detailed schema (fields, types, relationships)
|
|
50
|
+
- Note which objects are global vs organization-specific
|
|
51
|
+
|
|
52
|
+
### Step 2: Construct queries
|
|
53
|
+
Based on the schema and the user's request:
|
|
54
|
+
- Build SQL queries that use the correct field names from the schema
|
|
55
|
+
- If natural-language-to-SQL tools are available, use them — they have context on query syntax
|
|
56
|
+
- Always validate field names against the schema before executing
|
|
57
|
+
|
|
58
|
+
### Step 3: Execute and validate
|
|
59
|
+
- Execute queries using the available SQL execution tools
|
|
60
|
+
- Check that results make sense given the schema
|
|
61
|
+
- Handle empty results gracefully — explain what was queried and why it might be empty
|
|
62
|
+
|
|
63
|
+
### Step 4: Present results
|
|
64
|
+
- Format results according to the user's needs (table, summary, chart data)
|
|
65
|
+
- Include the query used so the user can modify it
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Key principles for data skills
|
|
69
|
+
|
|
70
|
+
- **Schema before query**: The model should always understand the data model before writing SQL. This prevents hallucinated column names and wrong joins.
|
|
71
|
+
- **Iterative refinement**: SQL queries often need adjustment. The skill should encourage trying a query, examining results, and refining.
|
|
72
|
+
- **Explain the data**: Don't just dump results. Summarize what the data shows and highlight interesting patterns.
|
|
73
|
+
|
|
74
|
+
### Common data skill types
|
|
75
|
+
|
|
76
|
+
| Skill Type | Typical Queries | Key Considerations |
|
|
77
|
+
|------------|----------------|-------------------|
|
|
78
|
+
| Dashboard | Aggregations, GROUP BY, time series | Handle date ranges, support filtering |
|
|
79
|
+
| Report | JOINs across objects, computed fields | May need multiple queries stitched together |
|
|
80
|
+
| Analytics | Statistical queries, trend analysis | Consider data freshness and completeness |
|
|
81
|
+
| Search | WHERE with multiple conditions, LIKE | Handle fuzzy matching, suggest alternatives |
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
## Workflow Management Skills
|
|
86
|
+
|
|
87
|
+
Skills that interact with work management systems (issues, tickets, enhancements, sprints).
|
|
88
|
+
|
|
89
|
+
### Object Lifecycle Awareness
|
|
90
|
+
|
|
91
|
+
Work objects typically have specific lifecycles with valid stage transitions. A skill that creates or updates work items should:
|
|
92
|
+
|
|
93
|
+
```markdown
|
|
94
|
+
## Working with work items
|
|
95
|
+
|
|
96
|
+
### Before creating or updating
|
|
97
|
+
1. List the available work management tools in your session
|
|
98
|
+
2. Check available subtypes for the work type you're creating
|
|
99
|
+
3. For updates, verify the stage transition is valid
|
|
100
|
+
4. When linking objects, understand the relationship types available
|
|
101
|
+
|
|
102
|
+
### Creating work items
|
|
103
|
+
- Set the appropriate subtype based on the nature of the work
|
|
104
|
+
- Assign to the right product area if known
|
|
105
|
+
- Add to the current sprint if the work is immediate
|
|
106
|
+
- Include a clear title and description with enough context for anyone to pick it up
|
|
107
|
+
|
|
108
|
+
### Updating work items
|
|
109
|
+
- Always check current state before updating
|
|
110
|
+
- Use valid stage transitions — don't skip stages
|
|
111
|
+
- Add timeline entries to explain why changes were made
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### Common workflow skill patterns
|
|
115
|
+
|
|
116
|
+
- **Triage skills**: Read incoming items, categorize, assign, set priority
|
|
117
|
+
- **Sprint management**: Plan sprints, track progress, identify blockers
|
|
118
|
+
- **Reporting skills**: Aggregate work item data for status reports
|
|
119
|
+
- **Escalation skills**: Detect stale items, notify owners, update status
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## Communication Skills
|
|
124
|
+
|
|
125
|
+
Skills that post messages, create issues/PRs, or otherwise communicate externally.
|
|
126
|
+
|
|
127
|
+
### The Confirm-Before-Send Pattern
|
|
128
|
+
|
|
129
|
+
```markdown
|
|
130
|
+
## Posting messages
|
|
131
|
+
|
|
132
|
+
External communication is hard to undo. Before posting:
|
|
133
|
+
1. Draft the message and show it to the user
|
|
134
|
+
2. Wait for explicit confirmation before sending
|
|
135
|
+
3. After sending, confirm what was sent and where
|
|
136
|
+
|
|
137
|
+
Exception: If the user has explicitly said "just post it" or the skill is designed for automated posting, skip the confirmation. But default to confirming.
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
### Messaging patterns
|
|
141
|
+
|
|
142
|
+
- **Channel/recipient discovery**: Use list/search tools to find the right destination
|
|
143
|
+
- **Thread awareness**: When responding to a discussion, post in the thread, not the channel
|
|
144
|
+
- **Formatting**: Different platforms have different markdown flavors — the skill should note this
|
|
145
|
+
- **User mentions**: Search for users by name to get their IDs for mentions
|
|
146
|
+
|
|
147
|
+
### Code collaboration patterns
|
|
148
|
+
|
|
149
|
+
- **PR creation**: Include a clear title, description with context, and link to relevant issues
|
|
150
|
+
- **Code review**: Use inline comments on specific files/lines, not just top-level comments
|
|
151
|
+
- **Issue management**: Check for duplicates before creating, link related issues
|
|
152
|
+
|
|
153
|
+
---
|
|
154
|
+
|
|
155
|
+
## Observability Skills
|
|
156
|
+
|
|
157
|
+
Skills that query monitoring platforms for logs, metrics, traces, and incidents.
|
|
158
|
+
|
|
159
|
+
### The Context-First Pattern
|
|
160
|
+
|
|
161
|
+
```markdown
|
|
162
|
+
## Investigating issues
|
|
163
|
+
|
|
164
|
+
### Step 1: Establish context
|
|
165
|
+
- What service or component is involved?
|
|
166
|
+
- What time range are we looking at?
|
|
167
|
+
- Are there known incidents or deployments in that window?
|
|
168
|
+
|
|
169
|
+
### Step 2: Gather signals
|
|
170
|
+
- List the observability tools available in your session
|
|
171
|
+
- Search logs with relevant filters (service, severity, time range)
|
|
172
|
+
- Check metrics for the affected service
|
|
173
|
+
- Look for related traces if available
|
|
174
|
+
- Check for active monitors or incidents
|
|
175
|
+
|
|
176
|
+
### Step 3: Correlate
|
|
177
|
+
- Cross-reference logs, metrics, and traces
|
|
178
|
+
- Look for patterns (error spikes, latency increases, deployment markers)
|
|
179
|
+
- Check upstream/downstream dependencies
|
|
180
|
+
|
|
181
|
+
### Step 4: Summarize findings
|
|
182
|
+
- Present a timeline of events
|
|
183
|
+
- Highlight the most likely root cause
|
|
184
|
+
- Suggest next steps
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
## Multi-Tool Orchestration
|
|
190
|
+
|
|
191
|
+
Many valuable skills combine multiple tool categories. Here's how to structure them:
|
|
192
|
+
|
|
193
|
+
### The Pipeline Pattern
|
|
194
|
+
|
|
195
|
+
```markdown
|
|
196
|
+
## Workflow
|
|
197
|
+
|
|
198
|
+
This skill uses multiple platform capabilities in sequence:
|
|
199
|
+
|
|
200
|
+
### Phase 1: Gather (data tools)
|
|
201
|
+
- Query the relevant data using available querying tools
|
|
202
|
+
- Pull context from work items
|
|
203
|
+
|
|
204
|
+
### Phase 2: Analyze (compute)
|
|
205
|
+
- Process the gathered data (scripts, inline analysis)
|
|
206
|
+
- Identify patterns, anomalies, or action items
|
|
207
|
+
|
|
208
|
+
### Phase 3: Act (workflow/communication tools)
|
|
209
|
+
- Create work items for action items found
|
|
210
|
+
- Post summaries to relevant channels (if messaging tools are available)
|
|
211
|
+
- Update dashboards or reports
|
|
212
|
+
|
|
213
|
+
### Phase 4: Verify
|
|
214
|
+
- Confirm all actions were taken
|
|
215
|
+
- Summarize what was done for the user
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
### Tool dependency chains
|
|
219
|
+
|
|
220
|
+
When skills chain tools, make the dependencies explicit:
|
|
221
|
+
|
|
222
|
+
```markdown
|
|
223
|
+
## Tool chain
|
|
224
|
+
This skill needs tools from these categories (list your tools and check availability before starting):
|
|
225
|
+
1. **Data querying** — to pull organizational data
|
|
226
|
+
2. **Work management** — to create/update work items based on findings
|
|
227
|
+
3. **Messaging** — to notify stakeholders (optional — gracefully degrade if unavailable)
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
---
|
|
231
|
+
|
|
232
|
+
## Common Skill Archetypes
|
|
233
|
+
|
|
234
|
+
These are the most common types of skills people build. Use these as starting points:
|
|
235
|
+
|
|
236
|
+
### 1. Report Generator
|
|
237
|
+
**Purpose**: Pull data, analyze it, produce a formatted report
|
|
238
|
+
**Tools needed**: Data querying (required), filesystem (for output)
|
|
239
|
+
**Key pattern**: Schema-first, iterative query refinement, structured output template
|
|
240
|
+
|
|
241
|
+
### 2. Workflow Automator
|
|
242
|
+
**Purpose**: Automate a multi-step process (triage, sprint planning, release tracking)
|
|
243
|
+
**Tools needed**: Work management (required), messaging (optional for notifications)
|
|
244
|
+
**Key pattern**: Object lifecycle awareness, confirm-before-act, audit trail via timeline entries
|
|
245
|
+
|
|
246
|
+
### 3. Dashboard Builder
|
|
247
|
+
**Purpose**: Create visual or data-driven dashboards from organizational data
|
|
248
|
+
**Tools needed**: Data querying (required), filesystem (for HTML/charts)
|
|
249
|
+
**Key pattern**: Schema-first, aggregation queries, HTML/chart generation scripts
|
|
250
|
+
|
|
251
|
+
### 4. Investigation Assistant
|
|
252
|
+
**Purpose**: Help debug production issues by correlating signals
|
|
253
|
+
**Tools needed**: Observability (required), work management (optional for linked incidents)
|
|
254
|
+
**Key pattern**: Context-first, multi-signal correlation, timeline reconstruction
|
|
255
|
+
|
|
256
|
+
### 5. Communication Drafter
|
|
257
|
+
**Purpose**: Draft and send messages (status updates, incident comms, release notes)
|
|
258
|
+
**Tools needed**: Messaging (required), work management (for data)
|
|
259
|
+
**Key pattern**: Confirm-before-send, audience-aware tone, structured templates
|
|
260
|
+
|
|
261
|
+
### 6. Data Explorer
|
|
262
|
+
**Purpose**: Help users explore and understand their organizational data
|
|
263
|
+
**Tools needed**: Data querying (required)
|
|
264
|
+
**Key pattern**: Interactive schema exploration, progressive query building, explain-as-you-go
|
|
265
|
+
|
|
266
|
+
### 7. Sprint/Project Tracker
|
|
267
|
+
**Purpose**: Track sprint progress, identify risks, generate status reports
|
|
268
|
+
**Tools needed**: Work management (required), messaging (optional)
|
|
269
|
+
**Key pattern**: Aggregate work item states, compare against goals, highlight blockers
|
|
270
|
+
|
|
271
|
+
### 8. Deck/Presentation Builder
|
|
272
|
+
**Purpose**: Create slide decks or presentations from organizational data
|
|
273
|
+
**Tools needed**: Data querying (for data), filesystem (for output), templates (in assets/)
|
|
274
|
+
**Key pattern**: Data gathering -> narrative construction -> template population -> output generation
|
|
275
|
+
|
|
276
|
+
---
|
|
277
|
+
|
|
278
|
+
## Anti-Patterns to Avoid
|
|
279
|
+
|
|
280
|
+
1. **Hardcoding tool names**: Don't write `Use mcp_devrev__create_work`. Write "Create a work item using the available work management tools."
|
|
281
|
+
|
|
282
|
+
2. **Assuming tool availability**: Don't write a skill that silently fails if a tool is missing. Include discovery and graceful degradation.
|
|
283
|
+
|
|
284
|
+
3. **Ignoring schema discovery**: Don't guess field names or object structures. Always discover the schema first for data-querying skills.
|
|
285
|
+
|
|
286
|
+
4. **Skipping confirmation for external actions**: Creating issues, posting messages, and updating work items should be confirmed with the user unless the skill is explicitly designed for automation.
|
|
287
|
+
|
|
288
|
+
5. **Monolithic tool chains**: Don't write a skill that requires 10 different tool categories. Keep tool dependencies minimal and make optional tools truly optional.
|
|
289
|
+
|
|
290
|
+
6. **Tool-specific error messages**: Don't say "ExecuteSQL returned error 422". Say "The query failed — the error suggests [interpretation]. Try [alternative approach]."
|
|
File without changes
|
|
@@ -0,0 +1,401 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Aggregate individual run results into benchmark summary statistics.
|
|
4
|
+
|
|
5
|
+
Reads grading.json files from run directories and produces:
|
|
6
|
+
- run_summary with mean, stddev, min, max for each metric
|
|
7
|
+
- delta between with_skill and without_skill configurations
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
python aggregate_benchmark.py <benchmark_dir>
|
|
11
|
+
|
|
12
|
+
Example:
|
|
13
|
+
python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/
|
|
14
|
+
|
|
15
|
+
The script supports two directory layouts:
|
|
16
|
+
|
|
17
|
+
Workspace layout (from skill-creator iterations):
|
|
18
|
+
<benchmark_dir>/
|
|
19
|
+
└── eval-N/
|
|
20
|
+
├── with_skill/
|
|
21
|
+
│ ├── run-1/grading.json
|
|
22
|
+
│ └── run-2/grading.json
|
|
23
|
+
└── without_skill/
|
|
24
|
+
├── run-1/grading.json
|
|
25
|
+
└── run-2/grading.json
|
|
26
|
+
|
|
27
|
+
Legacy layout (with runs/ subdirectory):
|
|
28
|
+
<benchmark_dir>/
|
|
29
|
+
└── runs/
|
|
30
|
+
└── eval-N/
|
|
31
|
+
├── with_skill/
|
|
32
|
+
│ └── run-1/grading.json
|
|
33
|
+
└── without_skill/
|
|
34
|
+
└── run-1/grading.json
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
import argparse
|
|
38
|
+
import json
|
|
39
|
+
import math
|
|
40
|
+
import sys
|
|
41
|
+
from datetime import datetime, timezone
|
|
42
|
+
from pathlib import Path
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def calculate_stats(values: list[float]) -> dict:
    """Summarize a sample: mean, sample stddev, min, and max, rounded to 4 dp."""
    if not values:
        # Degenerate case: no data points — report all-zero statistics.
        return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}

    count = len(values)
    avg = sum(values) / count

    # Sample (Bessel-corrected) standard deviation; undefined for one point.
    spread = 0.0
    if count > 1:
        spread = math.sqrt(sum((v - avg) ** 2 for v in values) / (count - 1))

    return {
        "mean": round(avg, 4),
        "stddev": round(spread, 4),
        "min": round(min(values), 4),
        "max": round(max(values), 4),
    }
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def load_run_results(benchmark_dir: Path) -> dict:
    """
    Load all run results from a benchmark directory.

    Supports two layouts: eval-* directories directly under benchmark_dir
    (workspace layout), or nested under a runs/ subdirectory (legacy layout).

    Args:
        benchmark_dir: Root directory containing eval-*/<config>/run-*/grading.json.

    Returns:
        Dict keyed by config name (e.g. "with_skill"/"without_skill",
        or "new_skill"/"old_skill"), each containing a list of run results.
    """
    # Support both layouts: eval dirs directly under benchmark_dir, or under runs/
    runs_dir = benchmark_dir / "runs"
    if runs_dir.exists():
        search_dir = runs_dir
    elif list(benchmark_dir.glob("eval-*")):
        search_dir = benchmark_dir
    else:
        print(f"No eval directories found in {benchmark_dir} or {benchmark_dir / 'runs'}")
        return {}

    results: dict[str, list] = {}

    for eval_idx, eval_dir in enumerate(sorted(search_dir.glob("eval-*"))):
        # Prefer the eval_id recorded in eval_metadata.json; fall back to the
        # directory-name suffix, then to the enumeration index.
        metadata_path = eval_dir / "eval_metadata.json"
        if metadata_path.exists():
            try:
                with open(metadata_path) as mf:
                    eval_id = json.load(mf).get("eval_id", eval_idx)
            except (json.JSONDecodeError, OSError):
                eval_id = eval_idx
        else:
            try:
                eval_id = int(eval_dir.name.split("-")[1])
            except ValueError:
                eval_id = eval_idx

        # Discover config directories dynamically rather than hardcoding names
        for config_dir in sorted(eval_dir.iterdir()):
            if not config_dir.is_dir():
                continue
            # Skip non-config directories (inputs, outputs, etc.)
            if not list(config_dir.glob("run-*")):
                continue
            config = config_dir.name
            if config not in results:
                results[config] = []

            for run_dir in sorted(config_dir.glob("run-*")):
                # Fix: guard the run-number parse (e.g. "run-final" previously
                # crashed the whole aggregation) just like the eval-id parse.
                try:
                    run_number = int(run_dir.name.split("-")[1])
                except ValueError:
                    print(f"Warning: skipping run directory with non-numeric suffix: {run_dir}")
                    continue
                grading_file = run_dir / "grading.json"

                if not grading_file.exists():
                    print(f"Warning: grading.json not found in {run_dir}")
                    continue

                try:
                    with open(grading_file) as f:
                        grading = json.load(f)
                # Fix: also tolerate unreadable files (OSError), consistent
                # with the eval_metadata.json handling above.
                except (json.JSONDecodeError, OSError) as e:
                    print(f"Warning: Invalid JSON in {grading_file}: {e}")
                    continue

                # Extract headline metrics from the grading summary.
                summary = grading.get("summary", {})
                result = {
                    "eval_id": eval_id,
                    "run_number": run_number,
                    "pass_rate": summary.get("pass_rate", 0.0),
                    "passed": summary.get("passed", 0),
                    "failed": summary.get("failed", 0),
                    "total": summary.get("total", 0),
                }

                # Extract timing — check grading.json first, then sibling timing.json
                timing = grading.get("timing", {})
                result["time_seconds"] = timing.get("total_duration_seconds", 0.0)
                timing_file = run_dir / "timing.json"
                if result["time_seconds"] == 0.0 and timing_file.exists():
                    try:
                        with open(timing_file) as tf:
                            timing_data = json.load(tf)
                        result["time_seconds"] = timing_data.get("total_duration_seconds", 0.0)
                        result["tokens"] = timing_data.get("total_tokens", 0)
                    except (json.JSONDecodeError, OSError):
                        pass

                # Extract execution metrics if available; fall back to output
                # size as a token proxy when timing.json supplied no count.
                metrics = grading.get("execution_metrics", {})
                result["tool_calls"] = metrics.get("total_tool_calls", 0)
                if not result.get("tokens"):
                    result["tokens"] = metrics.get("output_chars", 0)
                result["errors"] = metrics.get("errors_encountered", 0)

                # Extract expectations — viewer requires fields: text, passed, evidence
                raw_expectations = grading.get("expectations", [])
                for exp in raw_expectations:
                    # Fix: the warning lists "evidence" as required, so check it too
                    # (previously only text/passed were validated).
                    if "text" not in exp or "passed" not in exp or "evidence" not in exp:
                        print(f"Warning: expectation in {grading_file} missing required fields (text, passed, evidence): {exp}")
                result["expectations"] = raw_expectations

                # Collect reviewer-facing notes from the user notes summary.
                notes_summary = grading.get("user_notes_summary", {})
                notes = []
                notes.extend(notes_summary.get("uncertainties", []))
                notes.extend(notes_summary.get("needs_review", []))
                notes.extend(notes_summary.get("workarounds", []))
                result["notes"] = notes

                results[config].append(result)

    return results
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def aggregate_results(results: dict) -> dict:
    """
    Aggregate run results into summary statistics.

    Returns a run_summary mapping each configuration to mean/stddev/min/max
    stats for pass_rate, time_seconds, and tokens, plus a "delta" entry that
    compares the first configuration (primary) against the second (baseline).
    """
    run_summary = {}
    config_names = list(results)

    for name in config_names:
        config_runs = results.get(name, [])

        if not config_runs:
            # No runs recorded for this configuration: emit all-zero stats.
            run_summary[name] = {
                "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0}
            }
            continue

        run_summary[name] = {
            "pass_rate": calculate_stats([r["pass_rate"] for r in config_runs]),
            "time_seconds": calculate_stats([r["time_seconds"] for r in config_runs]),
            "tokens": calculate_stats([r.get("tokens", 0) for r in config_runs]),
        }

    # Delta compares the first configuration against the second; with fewer
    # than two configs the baseline is empty and its means read as zero.
    primary = run_summary.get(config_names[0], {}) if config_names else {}
    baseline = run_summary.get(config_names[1], {}) if len(config_names) >= 2 else {}

    def _mean(summary: dict, metric: str):
        # Missing metric or missing config both collapse to a zero mean.
        return summary.get(metric, {}).get("mean", 0)

    d_pass = _mean(primary, "pass_rate") - _mean(baseline, "pass_rate")
    d_time = _mean(primary, "time_seconds") - _mean(baseline, "time_seconds")
    d_tokens = _mean(primary, "tokens") - _mean(baseline, "tokens")

    run_summary["delta"] = {
        "pass_rate": f"{d_pass:+.2f}",
        "time_seconds": f"{d_time:+.1f}",
        "tokens": f"{d_tokens:+.0f}"
    }

    return run_summary
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: str = "") -> dict:
    """
    Generate the complete benchmark.json payload from run results.

    Args:
        benchmark_dir: Directory containing the eval run results.
        skill_name: Skill name for the metadata block (placeholder if empty).
        skill_path: Skill path for the metadata block (placeholder if empty).

    Returns:
        Dict with metadata, flattened per-run results, the aggregated
        run_summary, and an empty notes list (to be filled by the analyzer).
    """
    results = load_run_results(benchmark_dir)
    run_summary = aggregate_results(results)

    # Flatten per-config run results into the benchmark "runs" array.
    runs = []
    for config, config_runs in results.items():
        for result in config_runs:
            runs.append({
                "eval_id": result["eval_id"],
                "configuration": config,
                "run_number": result["run_number"],
                "result": {
                    "pass_rate": result["pass_rate"],
                    "passed": result["passed"],
                    "failed": result["failed"],
                    "total": result["total"],
                    "time_seconds": result["time_seconds"],
                    "tokens": result.get("tokens", 0),
                    "tool_calls": result.get("tool_calls", 0),
                    "errors": result.get("errors", 0)
                },
                "expectations": result["expectations"],
                "notes": result["notes"]
            })

    # Distinct eval IDs actually present in the loaded results.
    eval_ids = sorted({
        r["eval_id"]
        for config_runs in results.values()
        for r in config_runs
    })

    # Fix: derive runs-per-configuration from the data instead of the
    # previously hardcoded 3, which was wrong for any other run count.
    runs_per_configuration = max((len(v) for v in results.values()), default=0)

    benchmark = {
        "metadata": {
            "skill_name": skill_name or "<skill-name>",
            "skill_path": skill_path or "<path/to/skill>",
            "executor_model": "<model-name>",  # placeholder, filled downstream
            "analyzer_model": "<model-name>",  # placeholder, filled downstream
            "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "evals_run": eval_ids,
            "runs_per_configuration": runs_per_configuration
        },
        "runs": runs,
        "run_summary": run_summary,
        "notes": []  # To be filled by analyzer
    }

    return benchmark
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def generate_markdown(benchmark: dict) -> str:
    """Render benchmark data as a human-readable Markdown report (benchmark.md)."""
    metadata = benchmark["metadata"]
    summary = benchmark["run_summary"]

    # Configuration columns are every run_summary key except the delta entry.
    config_names = [key for key in summary if key != "delta"]
    first = config_names[0] if len(config_names) >= 1 else "config_a"
    second = config_names[1] if len(config_names) >= 2 else "config_b"
    first_label = first.replace("_", " ").title()
    second_label = second.replace("_", " ").title()

    stats_a = summary.get(first, {})
    stats_b = summary.get(second, {})
    delta = summary.get("delta", {})

    evals = ", ".join(map(str, metadata["evals_run"]))
    out = [
        f"# Skill Benchmark: {metadata['skill_name']}",
        "",
        f"**Model**: {metadata['executor_model']}",
        f"**Date**: {metadata['timestamp']}",
        f"**Evals**: {evals} ({metadata['runs_per_configuration']} runs each per configuration)",
        "",
        "## Summary",
        "",
        f"| Metric | {first_label} | {second_label} | Delta |",
        "|--------|------------|---------------|-------|",
    ]

    # Pass-rate row: percentages with stddev spread.
    pr_a = stats_a.get("pass_rate", {})
    pr_b = stats_b.get("pass_rate", {})
    out.append(
        f"| Pass Rate | {pr_a.get('mean', 0)*100:.0f}% ± {pr_a.get('stddev', 0)*100:.0f}% "
        f"| {pr_b.get('mean', 0)*100:.0f}% ± {pr_b.get('stddev', 0)*100:.0f}% "
        f"| {delta.get('pass_rate', '—')} |"
    )

    # Wall-clock row: seconds with stddev spread.
    t_a = stats_a.get("time_seconds", {})
    t_b = stats_b.get("time_seconds", {})
    out.append(
        f"| Time | {t_a.get('mean', 0):.1f}s ± {t_a.get('stddev', 0):.1f}s "
        f"| {t_b.get('mean', 0):.1f}s ± {t_b.get('stddev', 0):.1f}s "
        f"| {delta.get('time_seconds', '—')}s |"
    )

    # Token row: whole-number counts with stddev spread.
    tok_a = stats_a.get("tokens", {})
    tok_b = stats_b.get("tokens", {})
    out.append(
        f"| Tokens | {tok_a.get('mean', 0):.0f} ± {tok_a.get('stddev', 0):.0f} "
        f"| {tok_b.get('mean', 0):.0f} ± {tok_b.get('stddev', 0):.0f} "
        f"| {delta.get('tokens', '—')} |"
    )

    # Optional analyst notes at the end of the report.
    if benchmark.get("notes"):
        out.extend(["", "## Notes", ""])
        out.extend(f"- {note}" for note in benchmark["notes"])

    return "\n".join(out)
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def main():
    """CLI entry point: aggregate run results and write benchmark.json/.md."""
    parser = argparse.ArgumentParser(
        description="Aggregate benchmark run results into summary statistics"
    )
    parser.add_argument(
        "benchmark_dir",
        type=Path,
        help="Path to the benchmark directory"
    )
    parser.add_argument(
        "--skill-name",
        default="",
        help="Name of the skill being benchmarked"
    )
    parser.add_argument(
        "--skill-path",
        default="",
        help="Path to the skill being benchmarked"
    )
    parser.add_argument(
        "--output", "-o",
        type=Path,
        help="Output path for benchmark.json (default: <benchmark_dir>/benchmark.json)"
    )
    opts = parser.parse_args()

    # Bail out early on a bad directory rather than failing deep inside.
    if not opts.benchmark_dir.exists():
        print(f"Directory not found: {opts.benchmark_dir}")
        sys.exit(1)

    benchmark = generate_benchmark(opts.benchmark_dir, opts.skill_name, opts.skill_path)

    # JSON goes to --output (or <benchmark_dir>/benchmark.json); the markdown
    # report sits next to it with an .md extension.
    json_path = opts.output or (opts.benchmark_dir / "benchmark.json")
    md_path = json_path.with_suffix(".md")

    with open(json_path, "w") as fh:
        json.dump(benchmark, fh, indent=2)
    print(f"Generated: {json_path}")

    with open(md_path, "w") as fh:
        fh.write(generate_markdown(benchmark))
    print(f"Generated: {md_path}")

    # Console summary: per-config mean pass rate plus the headline delta.
    summary = benchmark["run_summary"]
    delta = summary.get("delta", {})

    print(f"\nSummary:")
    for cfg in (key for key in summary if key != "delta"):
        title = cfg.replace("_", " ").title()
        rate = summary[cfg]["pass_rate"]["mean"]
        print(f"  {title}: {rate*100:.1f}% pass rate")
    print(f"  Delta: {delta.get('pass_rate', '—')}")
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
# Script entry point: delegate to main() only when run directly,
# so the module can also be imported without side effects.
if __name__ == "__main__":
    main()
|