pi-evalset-lab 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.copier-answers.yml +5 -0
- package/.githooks/pre-commit +12 -0
- package/.github/CODEOWNERS +12 -0
- package/.github/ISSUE_TEMPLATE/bug-report.yml +63 -0
- package/.github/ISSUE_TEMPLATE/config.yml +5 -0
- package/.github/ISSUE_TEMPLATE/docs.yml +39 -0
- package/.github/ISSUE_TEMPLATE/feature-request.yml +41 -0
- package/.github/VOUCHED.td +8 -0
- package/.github/dependabot.yml +13 -0
- package/.github/pull_request_template.md +34 -0
- package/.github/workflows/ci.yml +37 -0
- package/.github/workflows/publish.yml +60 -0
- package/.github/workflows/release-please.yml +25 -0
- package/.github/workflows/vouch-check-pr.yml +29 -0
- package/.github/workflows/vouch-manage.yml +34 -0
- package/.pi/extensions/startup-intake-router.ts +151 -0
- package/.pi/prompts/init-project-docs.md +32 -0
- package/.release-please-config.json +11 -0
- package/.release-please-manifest.json +3 -0
- package/AGENTS.md +39 -0
- package/CHANGELOG.md +43 -0
- package/CODE_OF_CONDUCT.md +50 -0
- package/CONTRIBUTING.md +28 -0
- package/NEXT_SESSION_PROMPT.md +14 -0
- package/README.md +246 -0
- package/SECURITY.md +34 -0
- package/SUPPORT.md +37 -0
- package/docs/dev/CONTRIBUTING.md +37 -0
- package/docs/dev/EXTENSION_SOP.md +43 -0
- package/docs/dev/next_steps.md +17 -0
- package/docs/dev/plans/001-initial-plan.md +24 -0
- package/docs/dev/status.md +21 -0
- package/docs/org/operating_model.md +39 -0
- package/docs/org/project-docs-intake.questions.json +60 -0
- package/docs/project/foundation.md +28 -0
- package/docs/project/incentives.md +17 -0
- package/docs/project/resources.md +26 -0
- package/docs/project/skills.md +17 -0
- package/docs/project/strategic_goals.md +18 -0
- package/docs/project/tactical_goals.md +39 -0
- package/docs/project/vision.md +21 -0
- package/examples/.gitkeep +0 -0
- package/examples/fixed-task-set-v2.json +127 -0
- package/examples/fixed-task-set-v3.json +126 -0
- package/examples/fixed-task-set.json +22 -0
- package/examples/system-baseline.txt +1 -0
- package/examples/system-candidate.txt +6 -0
- package/extensions/evalset.ts +1090 -0
- package/external/.gitkeep +0 -0
- package/ontology/.gitkeep +0 -0
- package/package.json +31 -0
- package/policy/security-policy.json +10 -0
- package/prek.toml +15 -0
- package/prompts/implementation-planning.md +17 -0
- package/prompts/init-project-docs.md +32 -0
- package/prompts/security-review.md +17 -0
- package/scripts/docs-list.sh +50 -0
- package/scripts/init-project-docs.sh +56 -0
- package/scripts/install-hooks.sh +13 -0
- package/scripts/sync-to-live.sh +91 -0
- package/scripts/validate-structure.sh +325 -0
- package/src/.gitkeep +0 -0
- package/tests/.gitkeep +0 -0
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
{
|
|
2
|
+
"title": "Initialize organization and project documents",
|
|
3
|
+
"questions": [
|
|
4
|
+
{
|
|
5
|
+
"id": "org_purpose",
|
|
6
|
+
"type": "text",
|
|
7
|
+
"question": "What is the organization purpose?"
|
|
8
|
+
},
|
|
9
|
+
{
|
|
10
|
+
"id": "org_mission",
|
|
11
|
+
"type": "text",
|
|
12
|
+
"question": "What is the current organization mission?"
|
|
13
|
+
},
|
|
14
|
+
{
|
|
15
|
+
"id": "org_vision",
|
|
16
|
+
"type": "text",
|
|
17
|
+
"question": "What is the organization vision?"
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"id": "org_strategic_objectives",
|
|
21
|
+
"type": "text",
|
|
22
|
+
"question": "List 3-5 organization strategic objectives."
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
"id": "org_core_values",
|
|
26
|
+
"type": "text",
|
|
27
|
+
"question": "List the core values that should drive ethics and culture."
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
"id": "project_purpose",
|
|
31
|
+
"type": "text",
|
|
32
|
+
"question": "What is this project's purpose (distinct from the organization purpose)?"
|
|
33
|
+
},
|
|
34
|
+
{
|
|
35
|
+
"id": "project_mission",
|
|
36
|
+
"type": "text",
|
|
37
|
+
"question": "What is the project mission for the next delivery cycle?"
|
|
38
|
+
},
|
|
39
|
+
{
|
|
40
|
+
"id": "project_vision",
|
|
41
|
+
"type": "text",
|
|
42
|
+
"question": "What is the project vision?"
|
|
43
|
+
},
|
|
44
|
+
{
|
|
45
|
+
"id": "project_strategic_objectives",
|
|
46
|
+
"type": "text",
|
|
47
|
+
"question": "List 3-5 project strategic objectives."
|
|
48
|
+
},
|
|
49
|
+
{
|
|
50
|
+
"id": "project_constraints",
|
|
51
|
+
"type": "text",
|
|
52
|
+
"question": "List hard constraints (time, quality, security, scope)."
|
|
53
|
+
},
|
|
54
|
+
{
|
|
55
|
+
"id": "project_success_criteria",
|
|
56
|
+
"type": "text",
|
|
57
|
+
"question": "How will success be measured?"
|
|
58
|
+
}
|
|
59
|
+
]
|
|
60
|
+
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
---
|
|
2
|
+
summary: "Compact project model with explicit project-purpose framing."
|
|
3
|
+
read_when:
|
|
4
|
+
- "Aligning project purpose, strategy, and delivery behavior."
|
|
5
|
+
system4d:
|
|
6
|
+
container: "Project-level concepts and boundaries for this repository."
|
|
7
|
+
compass: "Translate project purpose into executable outcomes."
|
|
8
|
+
engine: "Project purpose -> mission -> vision -> strategic objectives."
|
|
9
|
+
fog: "Project scope and priorities can drift without explicit review."
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
# Project foundation model
|
|
13
|
+
|
|
14
|
+
## Project purpose
|
|
15
|
+
Build and maintain `pi-evalset-lab` as a practical extension package for fixed-task-set evaluations in pi.
|
|
16
|
+
|
|
17
|
+
## Project mission (current cycle)
|
|
18
|
+
Stabilize `/evalset` workflows, clarify execution behavior (especially `pi -e` and non-interactive mode), and keep reports reproducible.
|
|
19
|
+
|
|
20
|
+
## Scope boundary
|
|
21
|
+
- Organization purpose lives in [Organization operating model](../org/operating_model.md).
|
|
22
|
+
- This project purpose is repository-specific and narrower.
|
|
23
|
+
|
|
24
|
+
## Project operating principles
|
|
25
|
+
- Keep changes small and reviewable.
|
|
26
|
+
- Prefer explicit run metadata over implicit assumptions.
|
|
27
|
+
- Document behavior when UX is surprising.
|
|
28
|
+
- Validate structure after meaningful changes.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
---
|
|
2
|
+
summary: "Why this project matters and what outcomes are rewarded."
|
|
3
|
+
read_when:
|
|
4
|
+
- "Prioritizing roadmap work or evaluating tradeoffs."
|
|
5
|
+
system4d:
|
|
6
|
+
container: "Motivators and value model."
|
|
7
|
+
compass: "Favor changes that improve reliability and operator confidence."
|
|
8
|
+
engine: "Map work to measurable maintenance and delivery wins."
|
|
9
|
+
fog: "Incentives can drift if outcomes are not tracked."
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
# Incentives
|
|
13
|
+
|
|
14
|
+
- Faster extension delivery.
|
|
15
|
+
- Lower regression risk.
|
|
16
|
+
- Clear docs for handoffs.
|
|
17
|
+
- Repeatable release and maintenance workflow.
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
---
|
|
2
|
+
summary: "Key resources for building and maintaining the extension."
|
|
3
|
+
read_when:
|
|
4
|
+
- "Looking up references, docs, or operational artifacts."
|
|
5
|
+
system4d:
|
|
6
|
+
container: "Reference catalog for the project."
|
|
7
|
+
compass: "Centralize discovery paths for maintainers."
|
|
8
|
+
engine: "Link docs, scripts, and examples used in execution."
|
|
9
|
+
fog: "External links may become stale over time."
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
# Resources
|
|
13
|
+
|
|
14
|
+
- [Extension entrypoint](../../extensions/evalset.ts)
|
|
15
|
+
- [Prompt templates](../../prompts)
|
|
16
|
+
- [Smoke dataset](../../examples/fixed-task-set.json)
|
|
17
|
+
- [Larger dataset v2](../../examples/fixed-task-set-v2.json)
|
|
18
|
+
- [Recommended dataset v3](../../examples/fixed-task-set-v3.json)
|
|
19
|
+
- [Baseline system prompt](../../examples/system-baseline.txt)
|
|
20
|
+
- [Candidate system prompt](../../examples/system-candidate.txt)
|
|
21
|
+
- Local report artifacts path: `.evalset/reports/*.json` (and optional `*.html` exports)
|
|
22
|
+
- [Project-local router extension](../../.pi/extensions/startup-intake-router.ts)
|
|
23
|
+
- [Organization operating model](../org/operating_model.md)
|
|
24
|
+
- [Interview questions](../org/project-docs-intake.questions.json)
|
|
25
|
+
- [Security policy](../../policy/security-policy.json)
|
|
26
|
+
- [Validation script](../../scripts/validate-structure.sh)
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
---
|
|
2
|
+
summary: "Skills needed to maintain and extend this package."
|
|
3
|
+
read_when:
|
|
4
|
+
- "Onboarding maintainers or planning implementation work."
|
|
5
|
+
system4d:
|
|
6
|
+
container: "Capability map for project contributors."
|
|
7
|
+
compass: "Build confidence in extension APIs, testing, and release hygiene."
|
|
8
|
+
engine: "Assess gaps and train through small delivery cycles."
|
|
9
|
+
fog: "Skill requirements change as scope expands."
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
# Skills
|
|
13
|
+
|
|
14
|
+
- TypeScript extension development
|
|
15
|
+
- Shell scripting and git hooks
|
|
16
|
+
- Prompt template design
|
|
17
|
+
- Security review and dependency hygiene
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
---
|
|
2
|
+
summary: "Long-horizon goals for extension package maturity."
|
|
3
|
+
read_when:
|
|
4
|
+
- "Roadmapping quarterly or milestone-level outcomes."
|
|
5
|
+
system4d:
|
|
6
|
+
container: "Strategic objective set."
|
|
7
|
+
compass: "Reliability, maintainability, and team adoption."
|
|
8
|
+
engine: "Sequence milestones with measurable checks."
|
|
9
|
+
fog: "Long-term assumptions may need periodic resets."
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
# Strategic goals
|
|
13
|
+
|
|
14
|
+
1. Reach stable, documented `/evalset` behavior across interactive and non-interactive usage.
|
|
15
|
+
2. Improve reproducibility signals (dataset hash, case hash, variant hash, run lineage) in all report paths.
|
|
16
|
+
3. Keep extension docs concise and discoverable for new maintainers.
|
|
17
|
+
4. Maintain secure, low-friction release and validation workflow.
|
|
18
|
+
5. Expand confidence with focused tests for command parsing and report generation.
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
---
|
|
2
|
+
summary: "Near-term execution goals tied to current sprint cycles."
|
|
3
|
+
read_when:
|
|
4
|
+
- "Planning immediate tasks and delivery scope."
|
|
5
|
+
system4d:
|
|
6
|
+
container: "Tactical work queue framing."
|
|
7
|
+
compass: "Ship the next smallest valuable increment safely."
|
|
8
|
+
engine: "Break work into verifiable, low-risk tasks."
|
|
9
|
+
fog: "Unexpected integration constraints may reprioritize tasks."
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
# Tactical goals
|
|
13
|
+
|
|
14
|
+
## Current cycle status
|
|
15
|
+
|
|
16
|
+
### Completed
|
|
17
|
+
- Added troubleshooting notes for `/evalset` invocation (interactive vs `pi -p` non-interactive).
|
|
18
|
+
- Kept run/compare reports discoverable under `.evalset/reports/`.
|
|
19
|
+
- Added stronger example datasets (`fixed-task-set-v2.json`, `fixed-task-set-v3.json`).
|
|
20
|
+
- Synchronized docs and changelog with behavior updates.
|
|
21
|
+
|
|
22
|
+
### Remaining
|
|
23
|
+
1. Add smoke tests for argument parsing and report/scoring behavior.
|
|
24
|
+
2. Add a repeatable JSON -> HTML report export helper.
|
|
25
|
+
3. Complete npm publish after npmjs auth/registry setup.
|
|
26
|
+
|
|
27
|
+
## Hard constraints
|
|
28
|
+
- Time: keep work in small slices that can be reviewed quickly.
|
|
29
|
+
- Quality: no regressions in existing `/evalset` commands.
|
|
30
|
+
- Security: no insecure defaults in scripts or extension loading.
|
|
31
|
+
- Scope: prioritize reproducibility and UX clarity over feature breadth.
|
|
32
|
+
|
|
33
|
+
## Success criteria
|
|
34
|
+
- [x] `npm run check` passes.
|
|
35
|
+
- [x] Documented troubleshooting path for `pi -e` confusion exists.
|
|
36
|
+
- [x] Reports include stable run metadata and are easy to locate.
|
|
37
|
+
- [x] New maintainers can execute a compare workflow from the README without intervention.
|
|
38
|
+
- [ ] Automated tests cover parser/scoring basics.
|
|
39
|
+
- [ ] Package is published on npm.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
---
|
|
2
|
+
summary: "Product and technical vision for pi-evalset-lab."
|
|
3
|
+
read_when:
|
|
4
|
+
- "Defining or revisiting project direction."
|
|
5
|
+
system4d:
|
|
6
|
+
container: "Project north-star statement."
|
|
7
|
+
compass: "Build a reliable pi extension package with low maintenance overhead."
|
|
8
|
+
engine: "Translate goals into concrete implementation slices."
|
|
9
|
+
fog: "Real user workflows may reshape priorities."
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
# Vision
|
|
13
|
+
|
|
14
|
+
Make `pi-evalset-lab` the lightweight default for comparing prompt/system variants with reproducible, inspectable reports.
|
|
15
|
+
|
|
16
|
+
The project should feel predictable for maintainers:
|
|
17
|
+
- clear command behavior in both interactive and `-p` modes,
|
|
18
|
+
- clear report locations and run identity metadata,
|
|
19
|
+
- clear docs for troubleshooting common setup confusion.
|
|
20
|
+
|
|
21
|
+
Project vision is intentionally separate from organization vision in [Organization operating model](../org/operating_model.md).
|
|
File without changes
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "maintainer-clarity-v2",
|
|
3
|
+
"systemPrompt": "Answer in concise, plain English. Prefer direct wording over jargon.",
|
|
4
|
+
"cases": [
|
|
5
|
+
{
|
|
6
|
+
"id": "fixed-task-set-definition",
|
|
7
|
+
"input": "In plain English, what does a fixed task set mean in eval workflows?",
|
|
8
|
+
"expectContains": ["same", "tasks"]
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "fixed-task-set-benefit",
|
|
12
|
+
"input": "Why does using a fixed task set improve comparison quality across runs?",
|
|
13
|
+
"expectContains": ["compare", "runs"]
|
|
14
|
+
},
|
|
15
|
+
{
|
|
16
|
+
"id": "reproducibility-two-factors",
|
|
17
|
+
"input": "Name two things you should lock or record for reproducible eval runs.",
|
|
18
|
+
"expectContains": ["model", "dataset"]
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"id": "pass-rate-calculation",
|
|
22
|
+
"input": "If 18 of 24 scored cases pass, what pass rate should be reported?",
|
|
23
|
+
"expectContains": ["75"]
|
|
24
|
+
},
|
|
25
|
+
{
|
|
26
|
+
"id": "scored-case-definition",
|
|
27
|
+
"input": "What makes a case \"scored\" in this evalset format?",
|
|
28
|
+
"expectContains": ["checks", "scored"]
|
|
29
|
+
},
|
|
30
|
+
{
|
|
31
|
+
"id": "variant-hash-purpose",
|
|
32
|
+
"input": "What does variantHash help you verify?",
|
|
33
|
+
"expectContains": ["variant", "hash"]
|
|
34
|
+
},
|
|
35
|
+
{
|
|
36
|
+
"id": "dataset-hash-purpose",
|
|
37
|
+
"input": "Why is datasetHash useful when comparing two reports?",
|
|
38
|
+
"expectContains": ["dataset", "same"]
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
"id": "delta-interpretation-speed-cost",
|
|
42
|
+
"input": "Candidate delta avg latency is -800ms and delta total cost is +0.0003. Summarize the tradeoff.",
|
|
43
|
+
"expectContains": ["faster", "cost"]
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
"id": "command-non-interactive-pattern",
|
|
47
|
+
"input": "Show the correct non-interactive pattern to run compare with a local extension file.",
|
|
48
|
+
"expectContains": ["pi -e", "-p", "/evalset compare"]
|
|
49
|
+
},
|
|
50
|
+
{
|
|
51
|
+
"id": "slash-command-shell",
|
|
52
|
+
"input": "Can you run /evalset directly in bash without pi? Answer yes or no and one reason.",
|
|
53
|
+
"expectContains": ["no", "pi"]
|
|
54
|
+
},
|
|
55
|
+
{
|
|
56
|
+
"id": "report-default-location",
|
|
57
|
+
"input": "If --out is omitted, where are evalset reports written by default?",
|
|
58
|
+
"expectContains": [".evalset/reports"]
|
|
59
|
+
},
|
|
60
|
+
{
|
|
61
|
+
"id": "model-prerequisite",
|
|
62
|
+
"input": "What should you do before /evalset run if no active model is selected?",
|
|
63
|
+
"expectContains": ["/model"]
|
|
64
|
+
},
|
|
65
|
+
{
|
|
66
|
+
"id": "max-cases-behavior",
|
|
67
|
+
"input": "What does --max-cases 5 do during run/compare?",
|
|
68
|
+
"expectContains": ["first", "5"]
|
|
69
|
+
},
|
|
70
|
+
{
|
|
71
|
+
"id": "temperature-range",
|
|
72
|
+
"input": "What is the accepted numeric range for --temperature in this extension?",
|
|
73
|
+
"expectContains": ["0", "2"]
|
|
74
|
+
},
|
|
75
|
+
{
|
|
76
|
+
"id": "system-merge-behavior",
|
|
77
|
+
"input": "If dataset.systemPrompt exists and --system-file is provided, are prompts merged or replaced?",
|
|
78
|
+
"expectContains": ["merge"]
|
|
79
|
+
},
|
|
80
|
+
{
|
|
81
|
+
"id": "mutually-exclusive-system-options",
|
|
82
|
+
"input": "Can --system-file and --system-text be used together?",
|
|
83
|
+
"expectContains": ["no"]
|
|
84
|
+
},
|
|
85
|
+
{
|
|
86
|
+
"id": "run-identity-fields",
|
|
87
|
+
"input": "Name any three run identity fields in a run report.",
|
|
88
|
+
"expectContains": ["runid", "datasethash", "caseshash"]
|
|
89
|
+
},
|
|
90
|
+
{
|
|
91
|
+
"id": "compare-identity-fields",
|
|
92
|
+
"input": "Name the two run-id fields that link baseline and candidate inside compare.run.",
|
|
93
|
+
"expectContains": ["baselinerunid", "candidaterunid"]
|
|
94
|
+
},
|
|
95
|
+
{
|
|
96
|
+
"id": "delta-passrate-zero",
|
|
97
|
+
"input": "If delta pass rate is 0, what does that imply?",
|
|
98
|
+
"expectContains": ["same", "pass rate"]
|
|
99
|
+
},
|
|
100
|
+
{
|
|
101
|
+
"id": "keyword-check-limitation",
|
|
102
|
+
"input": "Why can simple keyword checks be misleading for quality?",
|
|
103
|
+
"expectContains": ["wording", "false"]
|
|
104
|
+
},
|
|
105
|
+
{
|
|
106
|
+
"id": "improve-weak-evalset",
|
|
107
|
+
"input": "Give two concrete ways to improve a weak 3-case evalset.",
|
|
108
|
+
"expectContains": ["more", "cases", "criteria"]
|
|
109
|
+
},
|
|
110
|
+
{
|
|
111
|
+
"id": "no-overclaim-rollout",
|
|
112
|
+
"input": "Should this be pitched as a huge replacement right away? Answer yes or no and one short reason.",
|
|
113
|
+
"expectContains": ["no"],
|
|
114
|
+
"expectNotContains": ["huge replacement"]
|
|
115
|
+
},
|
|
116
|
+
{
|
|
117
|
+
"id": "stakeholder-brief",
|
|
118
|
+
"input": "Write a one-line stakeholder brief that includes pilot scope and measurement.",
|
|
119
|
+
"expectContains": ["pilot", "measure"]
|
|
120
|
+
},
|
|
121
|
+
{
|
|
122
|
+
"id": "tie-communication",
|
|
123
|
+
"input": "Baseline and candidate both scored 33.3% pass rate. How should that be communicated?",
|
|
124
|
+
"expectContains": ["same", "pass rate"]
|
|
125
|
+
}
|
|
126
|
+
]
|
|
127
|
+
}
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "maintainer-clarity-v3",
|
|
3
|
+
"systemPrompt": "Answer in concise, plain English. Prefer direct wording over jargon.",
|
|
4
|
+
"cases": [
|
|
5
|
+
{
|
|
6
|
+
"id": "fixed-task-set-definition",
|
|
7
|
+
"input": "In plain English, what does a fixed task set mean in eval workflows?",
|
|
8
|
+
"expectContains": ["same", "tasks"]
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "fixed-task-set-benefit",
|
|
12
|
+
"input": "Why does a fixed task set improve comparison quality across runs?",
|
|
13
|
+
"expectContains": ["compar"]
|
|
14
|
+
},
|
|
15
|
+
{
|
|
16
|
+
"id": "reproducibility-two-factors",
|
|
17
|
+
"input": "Name two things you should lock or record for reproducible eval runs.",
|
|
18
|
+
"expectContains": ["model", "dataset"]
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"id": "pass-rate-calculation",
|
|
22
|
+
"input": "If 18 of 24 scored cases pass, what pass rate should be reported?",
|
|
23
|
+
"expectContains": ["75"]
|
|
24
|
+
},
|
|
25
|
+
{
|
|
26
|
+
"id": "scored-case-definition",
|
|
27
|
+
"input": "What makes a case scored in this evalset format?",
|
|
28
|
+
"expectRegex": "([Ee]xpect|[Cc]heck|[Cc]riteria|[Rr]ule)"
|
|
29
|
+
},
|
|
30
|
+
{
|
|
31
|
+
"id": "variant-hash-purpose",
|
|
32
|
+
"input": "What does variantHash help you verify?",
|
|
33
|
+
"expectContains": ["variant", "hash"]
|
|
34
|
+
},
|
|
35
|
+
{
|
|
36
|
+
"id": "dataset-hash-purpose",
|
|
37
|
+
"input": "Why is datasetHash useful when comparing two reports?",
|
|
38
|
+
"expectContains": ["dataset", "hash"]
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
"id": "delta-interpretation-speed-cost",
|
|
42
|
+
"input": "Candidate delta avg latency is -800ms and delta total cost is +0.0003. Summarize the tradeoff.",
|
|
43
|
+
"expectContains": ["faster"]
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
"id": "command-non-interactive-pattern",
|
|
47
|
+
"input": "Show the correct non-interactive pattern to run compare from shell.",
|
|
48
|
+
"expectContains": ["pi", "-p", "evalset"]
|
|
49
|
+
},
|
|
50
|
+
{
|
|
51
|
+
"id": "slash-command-shell",
|
|
52
|
+
"input": "Can you run /evalset directly in bash without pi? Answer yes or no and one reason.",
|
|
53
|
+
"expectContains": ["no", "slash"]
|
|
54
|
+
},
|
|
55
|
+
{
|
|
56
|
+
"id": "report-default-location",
|
|
57
|
+
"input": "If --out is omitted, where are evalset reports written by default?",
|
|
58
|
+
"expectContains": ["evalset", "report"]
|
|
59
|
+
},
|
|
60
|
+
{
|
|
61
|
+
"id": "model-prerequisite",
|
|
62
|
+
"input": "What should you do before /evalset run if no active model is selected?",
|
|
63
|
+
"expectContains": ["/model"]
|
|
64
|
+
},
|
|
65
|
+
{
|
|
66
|
+
"id": "max-cases-behavior",
|
|
67
|
+
"input": "What does --max-cases 5 do during run/compare?",
|
|
68
|
+
"expectRegex": "([Ff]irst|[Ll]imit|5.*case|case.*5)"
|
|
69
|
+
},
|
|
70
|
+
{
|
|
71
|
+
"id": "temperature-range",
|
|
72
|
+
"input": "What is the accepted numeric range for --temperature in this extension?",
|
|
73
|
+
"expectContains": ["0", "2"]
|
|
74
|
+
},
|
|
75
|
+
{
|
|
76
|
+
"id": "system-merge-behavior",
|
|
77
|
+
"input": "If dataset.systemPrompt exists and --system-file is provided, are prompts merged or replaced?",
|
|
78
|
+
"expectContains": ["merge"]
|
|
79
|
+
},
|
|
80
|
+
{
|
|
81
|
+
"id": "mutually-exclusive-system-options",
|
|
82
|
+
"input": "Can --system-file and --system-text be used together?",
|
|
83
|
+
"expectRegex": "([Nn]o|[Ee]ither|[Oo]ne|not together|[Bb]oth)"
|
|
84
|
+
},
|
|
85
|
+
{
|
|
86
|
+
"id": "run-identity-fields",
|
|
87
|
+
"input": "Name any three run identity fields in a run report.",
|
|
88
|
+
"expectContains": ["run", "dataset", "case"]
|
|
89
|
+
},
|
|
90
|
+
{
|
|
91
|
+
"id": "compare-identity-fields",
|
|
92
|
+
"input": "Name the two run-id fields that link baseline and candidate inside compare.run.",
|
|
93
|
+
"expectContains": ["baseline", "candidate", "run"]
|
|
94
|
+
},
|
|
95
|
+
{
|
|
96
|
+
"id": "delta-passrate-zero",
|
|
97
|
+
"input": "If delta pass rate is 0, what does that imply?",
|
|
98
|
+
"expectRegex": "([Nn]o change|same pass rate|[Uu]nchanged)"
|
|
99
|
+
},
|
|
100
|
+
{
|
|
101
|
+
"id": "keyword-check-limitation",
|
|
102
|
+
"input": "Why can simple keyword checks be misleading for quality?",
|
|
103
|
+
"expectRegex": "([Kk]eyword).*(mislead|false|[Ee]rror|[Bb]rittle)|([Mm]islead|false|[Ee]rror|[Bb]rittle).*[Kk]eyword"
|
|
104
|
+
},
|
|
105
|
+
{
|
|
106
|
+
"id": "improve-weak-evalset",
|
|
107
|
+
"input": "Give two concrete ways to improve a weak 3-case evalset.",
|
|
108
|
+
"expectContains": ["more", "cases"]
|
|
109
|
+
},
|
|
110
|
+
{
|
|
111
|
+
"id": "no-overclaim-rollout",
|
|
112
|
+
"input": "Should this be pitched as a huge replacement right away? Answer yes or no and one short reason.",
|
|
113
|
+
"expectContains": ["no", "phase"]
|
|
114
|
+
},
|
|
115
|
+
{
|
|
116
|
+
"id": "stakeholder-brief",
|
|
117
|
+
"input": "Write a one-line stakeholder brief that includes pilot scope and measurement.",
|
|
118
|
+
"expectContains": ["pilot", "measure"]
|
|
119
|
+
},
|
|
120
|
+
{
|
|
121
|
+
"id": "tie-communication",
|
|
122
|
+
"input": "Baseline and candidate both scored 33.3% pass rate. How should that be communicated?",
|
|
123
|
+
"expectRegex": "([Ss]ame|[Nn]o difference|[Uu]nchanged)"
|
|
124
|
+
}
|
|
125
|
+
]
|
|
126
|
+
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "maintainer-clarity-smoke",
|
|
3
|
+
"systemPrompt": "Answer concisely and explicitly. No fluff.",
|
|
4
|
+
"cases": [
|
|
5
|
+
{
|
|
6
|
+
"id": "fixed-task-set-definition",
|
|
7
|
+
"input": "In one sentence, define what a fixed task set means for eval workflows.",
|
|
8
|
+
"expectContains": ["same tasks"]
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "extension-gaps",
|
|
12
|
+
"input": "Name two extension-level gaps that can affect reproducible eval workflows in pi.",
|
|
13
|
+
"expectContains": ["trace", "reproducibility"]
|
|
14
|
+
},
|
|
15
|
+
{
|
|
16
|
+
"id": "do-not-overclaim",
|
|
17
|
+
"input": "Should this be pitched as a huge replacement right away? Answer yes or no and one short reason.",
|
|
18
|
+
"expectContains": ["no"],
|
|
19
|
+
"expectNotContains": ["huge replacement"]
|
|
20
|
+
}
|
|
21
|
+
]
|
|
22
|
+
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
You are a concise technical assistant. Keep answers short and avoid jargon.
|