npm - @tryinget/pi-evalset-lab - Versions diffs - 0.2.0 - Mend

@tryinget/pi-evalset-lab 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

package/CHANGELOG.md +60 -0
package/LICENSE +78 -0
package/README.md +141 -0
package/examples/.gitkeep +0 -0
package/examples/evalset-compare-sample-embedded.html +142 -0
package/examples/evalset-compare-sample.png +0 -0
package/examples/fixed-task-set-v2.json +127 -0
package/examples/fixed-task-set-v3.json +126 -0
package/examples/fixed-task-set.json +22 -0
package/examples/system-baseline.txt +1 -0
package/examples/system-candidate.txt +6 -0
package/extensions/evalset.ts +1148 -0
package/package.json +85 -0
package/policy/security-policy.json +10 -0
package/policy/stack-lane.json +10 -0
package/prompts/implementation-planning.md +21 -0
package/prompts/security-review.md +21 -0
package/scripts/export-evalset-report-html.mjs +364 -0

package/examples/fixed-task-set-v2.json ADDED Viewed

@@ -0,0 +1,127 @@
+{
+  "name": "maintainer-clarity-v2",
+  "systemPrompt": "Answer in concise, plain English. Prefer direct wording over jargon.",
+  "cases": [
+    {
+      "id": "fixed-task-set-definition",
+      "input": "In plain English, what does a fixed task set mean in eval workflows?",
+      "expectContains": ["same", "tasks"]
+    },
+    {
+      "id": "fixed-task-set-benefit",
+      "input": "Why does using a fixed task set improve comparison quality across runs?",
+      "expectContains": ["compare", "runs"]
+    },
+    {
+      "id": "reproducibility-two-factors",
+      "input": "Name two things you should lock or record for reproducible eval runs.",
+      "expectContains": ["model", "dataset"]
+    },
+    {
+      "id": "pass-rate-calculation",
+      "input": "If 18 of 24 scored cases pass, what pass rate should be reported?",
+      "expectContains": ["75"]
+    },
+    {
+      "id": "scored-case-definition",
+      "input": "What makes a case \"scored\" in this evalset format?",
+      "expectContains": ["checks", "scored"]
+    },
+    {
+      "id": "variant-hash-purpose",
+      "input": "What does variantHash help you verify?",
+      "expectContains": ["variant", "hash"]
+    },
+    {
+      "id": "dataset-hash-purpose",
+      "input": "Why is datasetHash useful when comparing two reports?",
+      "expectContains": ["dataset", "same"]
+    },
+    {
+      "id": "delta-interpretation-speed-cost",
+      "input": "Candidate delta avg latency is -800ms and delta total cost is +0.0003. Summarize the tradeoff.",
+      "expectContains": ["faster", "cost"]
+    },
+    {
+      "id": "command-non-interactive-pattern",
+      "input": "Show the correct non-interactive pattern to run compare with a local extension file.",
+      "expectContains": ["pi -e", "-p", "/evalset compare"]
+    },
+    {
+      "id": "slash-command-shell",
+      "input": "Can you run /evalset directly in bash without pi? Answer yes or no and one reason.",
+      "expectContains": ["no", "pi"]
+    },
+    {
+      "id": "report-default-location",
+      "input": "If --out is omitted, where are evalset reports written by default?",
+      "expectContains": [".evalset/reports"]
+    },
+    {
+      "id": "model-prerequisite",
+      "input": "What should you do before /evalset run if no active model is selected?",
+      "expectContains": ["/model"]
+    },
+    {
+      "id": "max-cases-behavior",
+      "input": "What does --max-cases 5 do during run/compare?",
+      "expectContains": ["first", "5"]
+    },
+    {
+      "id": "temperature-range",
+      "input": "What is the accepted numeric range for --temperature in this extension?",
+      "expectContains": ["0", "2"]
+    },
+    {
+      "id": "system-merge-behavior",
+      "input": "If dataset.systemPrompt exists and --system-file is provided, are prompts merged or replaced?",
+      "expectContains": ["merge"]
+    },
+    {
+      "id": "mutually-exclusive-system-options",
+      "input": "Can --system-file and --system-text be used together?",
+      "expectContains": ["no"]
+    },
+    {
+      "id": "run-identity-fields",
+      "input": "Name any three run identity fields in a run report.",
+      "expectContains": ["runid", "datasethash", "caseshash"]
+    },
+    {
+      "id": "compare-identity-fields",
+      "input": "Name the two run-id fields that link baseline and candidate inside compare.run.",
+      "expectContains": ["baselinerunid", "candidaterunid"]
+    },
+    {
+      "id": "delta-passrate-zero",
+      "input": "If delta pass rate is 0, what does that imply?",
+      "expectContains": ["same", "pass rate"]
+    },
+    {
+      "id": "keyword-check-limitation",
+      "input": "Why can simple keyword checks be misleading for quality?",
+      "expectContains": ["wording", "false"]
+    },
+    {
+      "id": "improve-weak-evalset",
+      "input": "Give two concrete ways to improve a weak 3-case evalset.",
+      "expectContains": ["more", "cases", "criteria"]
+    },
+    {
+      "id": "no-overclaim-rollout",
+      "input": "Should this be pitched as a huge replacement right away? Answer yes or no and one short reason.",
+      "expectContains": ["no"],
+      "expectNotContains": ["huge replacement"]
+    },
+    {
+      "id": "stakeholder-brief",
+      "input": "Write a one-line stakeholder brief that includes pilot scope and measurement.",
+      "expectContains": ["pilot", "measure"]
+    },
+    {
+      "id": "tie-communication",
+      "input": "Baseline and candidate both scored 33.3% pass rate. How should that be communicated?",
+      "expectContains": ["same", "pass rate"]
+    }
+  ]
+}

package/examples/fixed-task-set-v3.json ADDED Viewed

@@ -0,0 +1,126 @@
+{
+  "name": "maintainer-clarity-v3",
+  "systemPrompt": "Answer in concise, plain English. Prefer direct wording over jargon.",
+  "cases": [
+    {
+      "id": "fixed-task-set-definition",
+      "input": "In plain English, what does a fixed task set mean in eval workflows?",
+      "expectContains": ["same", "tasks"]
+    },
+    {
+      "id": "fixed-task-set-benefit",
+      "input": "Why does a fixed task set improve comparison quality across runs?",
+      "expectContains": ["compar"]
+    },
+    {
+      "id": "reproducibility-two-factors",
+      "input": "Name two things you should lock or record for reproducible eval runs.",
+      "expectContains": ["model", "dataset"]
+    },
+    {
+      "id": "pass-rate-calculation",
+      "input": "If 18 of 24 scored cases pass, what pass rate should be reported?",
+      "expectContains": ["75"]
+    },
+    {
+      "id": "scored-case-definition",
+      "input": "What makes a case scored in this evalset format?",
+      "expectRegex": "([Ee]xpect|[Cc]heck|[Cc]riteria|[Rr]ule)"
+    },
+    {
+      "id": "variant-hash-purpose",
+      "input": "What does variantHash help you verify?",
+      "expectContains": ["variant", "hash"]
+    },
+    {
+      "id": "dataset-hash-purpose",
+      "input": "Why is datasetHash useful when comparing two reports?",
+      "expectContains": ["dataset", "hash"]
+    },
+    {
+      "id": "delta-interpretation-speed-cost",
+      "input": "Candidate delta avg latency is -800ms and delta total cost is +0.0003. Summarize the tradeoff.",
+      "expectContains": ["faster"]
+    },
+    {
+      "id": "command-non-interactive-pattern",
+      "input": "Show the correct non-interactive pattern to run compare from shell.",
+      "expectContains": ["pi", "-p", "evalset"]
+    },
+    {
+      "id": "slash-command-shell",
+      "input": "Can you run /evalset directly in bash without pi? Answer yes or no and one reason.",
+      "expectContains": ["no", "slash"]
+    },
+    {
+      "id": "report-default-location",
+      "input": "If --out is omitted, where are evalset reports written by default?",
+      "expectContains": ["evalset", "report"]
+    },
+    {
+      "id": "model-prerequisite",
+      "input": "What should you do before /evalset run if no active model is selected?",
+      "expectContains": ["/model"]
+    },
+    {
+      "id": "max-cases-behavior",
+      "input": "What does --max-cases 5 do during run/compare?",
+      "expectRegex": "([Ff]irst|[Ll]imit|5.*case|case.*5)"
+    },
+    {
+      "id": "temperature-range",
+      "input": "What is the accepted numeric range for --temperature in this extension?",
+      "expectContains": ["0", "2"]
+    },
+    {
+      "id": "system-merge-behavior",
+      "input": "If dataset.systemPrompt exists and --system-file is provided, are prompts merged or replaced?",
+      "expectContains": ["merge"]
+    },
+    {
+      "id": "mutually-exclusive-system-options",
+      "input": "Can --system-file and --system-text be used together?",
+      "expectRegex": "([Nn]o|[Ee]ither|[Oo]ne|not together|[Bb]oth)"
+    },
+    {
+      "id": "run-identity-fields",
+      "input": "Name any three run identity fields in a run report.",
+      "expectContains": ["run", "dataset", "case"]
+    },
+    {
+      "id": "compare-identity-fields",
+      "input": "Name the two run-id fields that link baseline and candidate inside compare.run.",
+      "expectContains": ["baseline", "candidate", "run"]
+    },
+    {
+      "id": "delta-passrate-zero",
+      "input": "If delta pass rate is 0, what does that imply?",
+      "expectRegex": "([Nn]o change|same pass rate|[Uu]nchanged)"
+    },
+    {
+      "id": "keyword-check-limitation",
+      "input": "Why can simple keyword checks be misleading for quality?",
+      "expectRegex": "([Kk]eyword).*(mislead|false|[Ee]rror|[Bb]rittle)|([Mm]islead|false|[Ee]rror|[Bb]rittle).*[Kk]eyword"
+    },
+    {
+      "id": "improve-weak-evalset",
+      "input": "Give two concrete ways to improve a weak 3-case evalset.",
+      "expectContains": ["more", "cases"]
+    },
+    {
+      "id": "no-overclaim-rollout",
+      "input": "Should this be pitched as a huge replacement right away? Answer yes or no and one short reason.",
+      "expectContains": ["no", "phase"]
+    },
+    {
+      "id": "stakeholder-brief",
+      "input": "Write a one-line stakeholder brief that includes pilot scope and measurement.",
+      "expectContains": ["pilot", "measure"]
+    },
+    {
+      "id": "tie-communication",
+      "input": "Baseline and candidate both scored 33.3% pass rate. How should that be communicated?",
+      "expectRegex": "([Ss]ame|[Nn]o difference|[Uu]nchanged)"
+    }
+  ]
+}

package/examples/fixed-task-set.json ADDED Viewed

@@ -0,0 +1,22 @@
+{
+  "name": "maintainer-clarity-smoke",
+  "systemPrompt": "Answer concisely and explicitly. No fluff.",
+  "cases": [
+    {
+      "id": "fixed-task-set-definition",
+      "input": "In one sentence, define what a fixed task set means for eval workflows.",
+      "expectContains": ["same tasks"]
+    },
+    {
+      "id": "extension-gaps",
+      "input": "Name two extension-level gaps that can affect reproducible eval workflows in pi.",
+      "expectContains": ["trace", "reproducibility"]
+    },
+    {
+      "id": "do-not-overclaim",
+      "input": "Should this be pitched as a huge replacement right away? Answer yes or no and one short reason.",
+      "expectContains": ["no"],
+      "expectNotContains": ["huge replacement"]
+    }
+  ]
+}

package/examples/system-baseline.txt ADDED Viewed

@@ -0,0 +1 @@
+You are a concise technical assistant. Keep answers short and avoid jargon.

package/examples/system-candidate.txt ADDED Viewed

@@ -0,0 +1,6 @@
+You are a concise technical assistant.
+Priorities:
+1) Explain terms in plain language first.
+2) Keep each answer to 2-4 sentences.
+3) Prefer concrete examples over abstractions.