@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
package/commands/solo.md
@@ -0,0 +1,447 @@
1
+ ---
2
+ description: Run a single agent on a scenario with absolute rubric scoring
3
+ argument-hint: <theme:agent> --scenario <name> [--as <role>] [--runs N] [--no-judge]
4
+ ---
5
+
6
+ # Solo Benchmark
7
+
8
+ <purpose>
9
+ Run a single agent on a scenario. This is the CANONICAL agent execution path.
10
+
11
+ **Modes:**
12
+ - **Full (default):** Agent runs → `/judge` evaluates → `/finalize-run` saves
13
+ - **No-judge (`--no-judge`):** Agent runs only, returns raw response (for /duel, /relay)
14
+ </purpose>
15
+
16
+ <architecture>
17
+ ```
+ /solo theme:agent --scenario X
+   │
+   ├──► Execute agent via CLI
+   │     └──► Response + tokens
+   │
+   ├──► /judge --mode solo (if not --no-judge)
+   │     └──► Score + verdict
+   │
+   └──► /finalize-run --type solo
+         └──► Validate + save
+ ```
29
+ </architecture>
30
+
31
+ <usage>
32
+ ```
33
+ /solo <contestant> --scenario <name>
34
+ /solo <contestant> --scenario <name> --runs 4
35
+ /solo <contestant> --scenario <name> --no-judge
36
+ /solo <contestant> --as <role> --scenario <name>
37
+ ```
38
+
39
+ **Arguments:**
40
+ - `contestant` - `theme:agent` format (e.g., `discworld:reviewer`) OR `theme:character` with `--as`
41
+ - `--scenario` - Scenario from `scenarios/` directory
42
+ - `--as <role>` - Override role (use character's persona for different role's task)
43
+ - `--runs N` - Number of runs (default: 1, max: 20)
44
+ - `--no-judge` - Skip judging, return raw response
45
+
46
+ **Cross-Role Testing with `--as`:**
47
+
48
+ The `--as` flag enables running any persona as any role, useful for research:
49
+
50
+ ```
51
+ /solo shakespeare:prospero --as dev --scenario django-10554
52
+ ```
53
+
54
+ This uses Prospero's persona traits (wise orchestrator, magic metaphors) but gives him a dev task.
55
+ The scenario's role determines what the agent is asked to do; the character determines how they do it.
56
+ </usage>
57
+
58
+ <on-invoke>
59
+ The user invoked this command with: $ARGUMENTS
60
+
61
+ ## Step 1: Parse Arguments
62
+
63
+ Extract:
64
+ - `contestant`: `theme:agent` spec (or `theme:character` if using `--as`)
65
+ - `scenario_name`: After `--scenario`
66
+ - `role_override`: After `--as` (optional - for cross-role testing)
67
+ - `runs`: Number (default: 1)
68
+ - `no_judge`: Boolean
69
+
70
+ Validate spec contains `:`, scenario is required, runs is 1-20.
71
+
72
+ **If `--as` is provided:**
73
+ - The second part of the spec is a CHARACTER name, not a role
74
+ - `role_override` becomes the effective role for scenario matching
75
+ - Character persona is extracted by name lookup across all agents in theme
76
+
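+ A minimal sketch of the parsing rules above (the helper name and structure are
+ illustrative assumptions; the command parses `$ARGUMENTS` itself, so none of
+ this is canonical):
+
+ ```python
+ def parse_args(arguments: str) -> dict:
+     tokens = arguments.split()
+     if not tokens:
+         raise ValueError("contestant spec is required")
+     spec = tokens[0]  # e.g., "discworld:reviewer"
+     if ':' not in spec:
+         raise ValueError("contestant must be theme:agent or theme:character")
+     args = {"spec": spec, "scenario": None, "role_override": None,
+             "runs": 1, "no_judge": False}
+     i = 1
+     while i < len(tokens):
+         if tokens[i] == "--scenario":
+             args["scenario"] = tokens[i + 1]; i += 2
+         elif tokens[i] == "--as":
+             args["role_override"] = tokens[i + 1]; i += 2
+         elif tokens[i] == "--runs":
+             args["runs"] = int(tokens[i + 1]); i += 2
+         elif tokens[i] == "--no-judge":
+             args["no_judge"] = True; i += 1
+         else:
+             i += 1  # ignore unknown tokens in this sketch
+     if args["scenario"] is None:
+         raise ValueError("--scenario is required")
+     if not 1 <= args["runs"] <= 20:
+         raise ValueError("--runs must be between 1 and 20")
+     return args
+ ```
+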
77
+ ## Step 2: Load Scenario
78
+
79
+ ```yaml
80
+ Glob tool:
81
+ pattern: "scenarios/**/{scenario_name}.yaml"
82
+ ```
83
+
84
+ Extract: `prompt`, `scenario_title`, `code_content` (if present)
85
+
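+ A minimal loading sketch, assuming PyYAML and the field names listed above
+ (the command itself uses the Glob and Read tools rather than this code):
+
+ ```python
+ import glob
+ import yaml  # PyYAML
+
+ # Resolve the scenario file anywhere under scenarios/ (mirrors the Glob pattern).
+ matches = glob.glob(f"scenarios/**/{scenario_name}.yaml", recursive=True)
+ if not matches:
+     raise FileNotFoundError(f"scenario '{scenario_name}' not found")
+
+ with open(matches[0]) as f:
+     scenario = yaml.safe_load(f)
+
+ prompt = scenario["prompt"]
+ scenario_title = scenario["scenario_title"]
+ code_content = scenario.get("code_content")  # optional
+ ```
+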
86
+ ## Step 3: Load Persona
87
+
88
+ Read: `pennyfarthing-dist/personas/themes/{theme}.yaml`
89
+
90
+ **Standard mode (no `--as`):**
91
+ - Look up `agents.{agent}` section
92
+ - Extract: `character`, `style`, `expertise`, `catchphrases`, `emoji`
93
+ - `effective_role` = agent name from spec
94
+
95
+ **Cross-role mode (with `--as`):**
96
+ - The spec contains `theme:character_name` (e.g., `shakespeare:prospero`)
97
+ - Search ALL agent sections for one where `character` field matches (case-insensitive, partial match OK)
98
+ - Extract persona traits from that agent's config
99
+ - `effective_role` = the `--as` value (NOT the role the character normally fills)
100
+
101
+ ```python
+ # Pseudocode for cross-role lookup
+ if role_override:
+     # e.g., "prospero"; a substring match also covers prefix matches
+     character_query = spec.split(':')[1].lower()
+     for agent_name, agent_config in theme['agents'].items():
+         char_name = agent_config.get('character', '').lower()
+         if character_query in char_name:
+             persona = agent_config
+             source_role = agent_name  # where the character normally lives
+             break
+     else:
+         raise ValueError(f"no character matching '{character_query}' in theme")
+     effective_role = role_override  # what we're asking them to do
+ else:
+     agent_name = spec.split(':')[1]  # e.g., "dev"
+     persona = theme['agents'][agent_name]
+     effective_role = agent_name
+     source_role = agent_name
+ ```
118
+
119
+ This enables running Prospero (normally SM) as a dev, or Gus Fring (normally orchestrator) as a reviewer.
120
+
121
+ ## Step 3b: Build Agent Prompt
122
+
123
+ Use the Write tool to create the prompt file with this template:
124
+
125
+ ```
126
+ You are {character}.
127
+
128
+ **Style:** {style}
129
+ **Expertise:** {expertise}
130
+ **Catchphrases:** {catchphrases}
131
+
132
+ ---
133
+
134
+ ## Challenge
135
+
136
+ {scenario_prompt}
137
+
138
+ {code_content if present}
139
+
140
+ ---
141
+
142
+ Respond fully in character. Under 500 words.
143
+
144
+ **IMPORTANT:** Provide your complete response directly. Do not attempt to use tools, read files, or make function calls.
145
+ ```
146
+
147
+ **Cross-role note:** When using `--as`, the scenario prompt comes from the `effective_role` (e.g., dev tasks),
148
+ but the character/style/expertise come from the character's original role config. This tests whether
149
+ personality traits affect task performance independent of role-specific training.
150
+
151
+ The final instruction is critical - without it, the model may output tool-call syntax even with `--tools ""`, resulting in incomplete responses.
152
+
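+ A minimal sketch of assembling that template in code (field names follow the
+ persona fields from Step 3; the command itself fills the template via the
+ Write tool, so this helper is illustrative only):
+
+ ```python
+ def build_prompt(persona: dict, scenario_prompt: str, code_content: str | None) -> str:
+     # Assumes catchphrases is a list in the theme YAML; join for readability.
+     catchphrases = ", ".join(persona.get("catchphrases", []))
+     parts = [
+         f"You are {persona['character']}.",
+         "",
+         f"**Style:** {persona['style']}",
+         f"**Expertise:** {persona['expertise']}",
+         f"**Catchphrases:** {catchphrases}",
+         "",
+         "---",
+         "",
+         "## Challenge",
+         "",
+         scenario_prompt,
+     ]
+     if code_content:
+         parts += ["", code_content]
+     parts += [
+         "",
+         "---",
+         "",
+         "Respond fully in character. Under 500 words.",
+         "",
+         "**IMPORTANT:** Provide your complete response directly. "
+         "Do not attempt to use tools, read files, or make function calls.",
+     ]
+     return "\n".join(parts)
+ ```
+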
153
+ ## Step 4: Execute Agent via CLI
154
+
155
+ **RECOMMENDED: Use the shell script for reliable execution:**
156
+
157
+ ```bash
158
+ ./scripts/solo-runner.sh {theme}:{agent} {scenario} {output_dir}
159
+ ```
160
+
161
+ The shell script handles all escaping, temp files, and JSON parsing correctly.
162
+ Use inline commands only for simple cases or when the script isn't available.
163
+
164
+ ---
165
+
166
+ **CRITICAL: The `--tools ""` flag is MANDATORY.**
167
+
168
+ Without `--tools ""`, agents may use tools internally (Read, Write, Bash, etc.), causing:
169
+ 1. Multi-turn conversations (num_turns > 1)
170
+ 2. The `.result` field only captures the FINAL message (often just a summary)
171
+ 3. Full response content is LOST - judges only see truncated output
172
+ 4. Scores are INVALID because judges evaluate incomplete data
173
+
174
+ **Evidence:** Miles Vorkosigan benchmark (2026-01-01) scored 76.69 with tools enabled vs Leo McGarry's 91.03 with `--tools ""`. Miles' runs had num_turns: 5-7 and judges only saw summaries, not full story breakdowns.
175
+
176
+ **CRITICAL: Use PIPE syntax, NOT heredocs.**
177
+
178
+ **NEVER USE HEREDOCS** - Heredoc syntax (`<<'EOF'`, `<<EOF`, `<<'PROMPT'`, etc.) FAILS in subagents.
179
+ The permission system treats heredocs differently and they get auto-denied.
180
+
181
+ **ALWAYS USE PIPE SYNTAX** - This works in both main sessions and subagents:
182
+ - `echo "$PROMPT" | claude -p ...` - WORKS
183
+ - `cat file.txt | claude -p ...` - WORKS
184
+ - `printf '%s' "$PROMPT" | claude -p ...` - WORKS
185
+ - `claude -p ... <<'EOF'` - **FAILS IN SUBAGENTS - DO NOT USE**
186
+
187
+ **CRITICAL: Use FILE REDIRECTION, NOT variable capture.**
188
+
189
+ **NEVER CAPTURE OUTPUT IN VARIABLES** - Command substitution with `$(...)` causes zsh parse errors
190
+ when the JSON output contains parentheses or special characters:
191
+ - `OUTPUT=$(cat file.txt | claude -p ...)` - **FAILS with `parse error near ')'`**
192
+
193
+ **ALWAYS REDIRECT TO FILES** - This avoids shell parsing issues:
194
+ - `cat file.txt | claude -p ... > output.json` - WORKS
195
+ - Then read: `jq -r '.result' output.json` - WORKS
196
+
197
+ ```bash
198
+ # Step 1: Capture timestamp to file (avoid variable capture issues)
199
+ date -u +%Y-%m-%dT%H:%M:%SZ > /tmp/timestamp_$$.txt
200
+
201
+ # Step 2: Write prompt to file using Write tool (avoids escaping issues)
202
+ # Use the Write tool to create: /tmp/prompt_$$.txt
203
+
204
+ # Step 3: Execute with file redirection (NOT variable capture)
205
+ cat /tmp/prompt_$$.txt | claude -p --output-format json --tools "" > /tmp/output_$$.json
206
+
207
+ # Step 4: Extract results from files
208
+ TIMESTAMP=$(cat /tmp/timestamp_$$.txt)
209
+ RESPONSE=$(jq -r '.result' /tmp/output_$$.json)
210
+ INPUT_TOKENS=$(jq -r '.usage.input_tokens // 0' /tmp/output_$$.json)
211
+ OUTPUT_TOKENS=$(jq -r '.usage.output_tokens // 0' /tmp/output_$$.json)
212
+
213
+ # Step 5: Cleanup
214
+ rm -f /tmp/timestamp_$$.txt /tmp/prompt_$$.txt /tmp/output_$$.json
215
+ ```
216
+
217
+ **Why file redirection works:** The shell never tries to parse the JSON output.
218
+ It goes directly to a file, then jq reads it safely.
219
+
220
+ ## Step 5: Check Mode
221
+
222
+ **If `--no-judge`:** Return raw response and metadata, STOP.
223
+
224
+ ````markdown
225
+ ## Solo Agent Response
226
+
227
+ **Contestant:** {spec} ({character})
228
+ **Scenario:** {scenario_name}
229
+
230
+ ---
231
+
232
+ {response}
233
+
234
+ ---
235
+
236
+ ```json
237
+ {
238
+ "spec": "{spec}",
239
+ "character": "{character}",
240
+ "cli_timestamp": "{TIMESTAMP}",
241
+ "response_length": {length},
242
+ "input_tokens": {INPUT_TOKENS},
243
+ "output_tokens": {OUTPUT_TOKENS}
244
+ }
245
+ ```
246
+ ````
247
+
248
+ **If full mode:** Continue to Step 6.
249
+
250
+ ## Step 6: Invoke Judge Skill
251
+
252
+ **Detect SWE-bench scenarios for deterministic evaluation:**
253
+
254
+ Check if the scenario is from SWE-bench by looking at its path or category:
255
+ ```python
+ is_swebench = (
+     'swe-bench' in scenario_path.lower() or
+     scenario.get('category') == 'swe-bench' or
+     scenario.get('source') == 'swe-bench'
+ )
+ ```
262
+
263
+ **If SWE-bench scenario:**
264
+
265
+ Use deterministic Python-based evaluation instead of LLM-as-judge:
266
+
267
+ ```bash
+ # Save response to a temp file for the Python judge.
+ # Build the JSON with jq; echoing $RESPONSE into a quoted string breaks
+ # whenever the response contains quotes or newlines.
+ jq -n --arg result "$RESPONSE" '{result: $result}' > /tmp/solo_response_$$.json
+
+ # Run SWE-bench judge (deterministic scoring against ground truth)
+ python3 .pennyfarthing/scripts/test/swebench-judge.py {scenario_name} /tmp/solo_response_$$.json
+ ```
274
+
275
+ The Python script returns:
276
+ - `total`: Score out of 100
277
+ - `scores`: Breakdown by category (root_cause, fix_quality, completeness, persona)
278
+ - `details`: Specific findings and matches
279
+
280
+ **If standard scenario (non-SWE-bench):**
281
+
282
+ Use LLM-as-judge:
283
+ ```
284
+ /judge --mode solo --data {
285
+ "spec": "{spec}",
286
+ "character": "{character}",
287
+ "challenge": "{prompt}",
288
+ "response": "{RESPONSE}"
289
+ }
290
+ ```
291
+
292
+ Capture: `score`, `judge_timestamp`, `judge_response`, `judge_tokens`
293
+
294
+ ## Step 7: Invoke Finalize-Run Skill
295
+
296
+ ```
297
+ /finalize-run --type solo --data {
298
+ "timestamp": "{ISO8601}",
299
+ "scenario": {"name": "{scenario_name}", "title": "{title}"},
300
+ "agents": [{
301
+ "spec": "{spec}",
302
+ "cli_timestamp": "{TIMESTAMP}",
303
+ "response_text": "{RESPONSE}",
304
+ "input_tokens": {INPUT_TOKENS},
305
+ "output_tokens": {OUTPUT_TOKENS}
306
+ }],
307
+ "judge": {
308
+ "cli_timestamp": "{judge_timestamp}",
309
+ "response_text": "{judge_response}",
310
+ "input_tokens": {judge_input},
311
+ "output_tokens": {judge_output}
312
+ },
313
+ "scores": {"{spec}": {score}},
314
+ "output_path": "internal/results/solo/{timestamp}-{theme}-{agent}.json"
315
+ }
316
+ ```
317
+
318
+ ## Step 8: Display Results
319
+
320
+ ```markdown
321
+ ## Solo Evaluation
322
+
323
+ {judge verdict}
324
+
325
+ ---
326
+
327
+ ## Efficiency
328
+
329
+ | Metric | Value |
330
+ |--------|-------|
331
+ | Agent Tokens | {agent_total} |
332
+ | Judge Tokens | {judge_total} |
333
+ | Score | {score}/100 |
334
+ | Tokens/Point | {tpp} |
335
+
336
+ ---
337
+
338
+ ✓ Saved to {output_path}
339
+ ```
340
+
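+ Tokens/Point is total tokens divided by the judge score. Whether judge tokens
+ are included is a reporting choice, so this sketch (agent tokens only) is one
+ plausible definition, not the canonical one:
+
+ ```python
+ agent_total = input_tokens + output_tokens
+ tpp = agent_total / score if score else float("inf")  # lower is better
+ ```
+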
341
+ ## Step 9: Multi-Run Mode (if runs > 1)
342
+
343
+ 1. Create output directory (see Step 10 for path logic)
344
+ 2. Repeat Steps 4-7 for each run
345
+ 3. Save each to `runs/run_{i}.json` and `runs/judge_{i}.json`
346
+ 4. Calculate statistics and save summary.yaml (Step 10)
347
+
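+ A minimal sketch of the multi-run loop; `execute_agent` and `judge_response`
+ are stand-ins for Steps 4 and 6, not real functions:
+
+ ```python
+ import json
+
+ def execute_agent(prompt):   # placeholder for Step 4 (CLI execution)
+     return {"response": "...", "input_tokens": 0, "output_tokens": 0}
+
+ def judge_response(run):     # placeholder for Step 6 (judge)
+     return {"score": 0}
+
+ scores = []
+ for i in range(1, runs + 1):
+     run = execute_agent(prompt)
+     judge = judge_response(run)
+     with open(f"{base_path}/runs/run_{i}.json", "w") as f:
+         json.dump(run, f, indent=2)
+     with open(f"{base_path}/runs/judge_{i}.json", "w") as f:
+         json.dump(judge, f, indent=2)
+     scores.append(judge["score"])
+ ```
+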
348
+ ## Step 10: Save Summary (ALWAYS - even for n=1)
349
+
350
+ **Output path logic:**
351
+
352
+ ```
+ if theme == "control":
+     base_path = "internal/results/baselines/{scenario}/{effective_role}/"
+ elif role_override:  # cross-role mode
+     # Include character name and effective role for clarity
+     base_path = "internal/results/benchmarks/{scenario}/{theme}-{character}-as-{effective_role}/"
+ else:
+     base_path = "internal/results/benchmarks/{scenario}/{theme}-{effective_role}/"
+ ```
361
+
362
+ **Cross-role example:** `/solo shakespeare:prospero --as dev --scenario django-10554`
363
+ → saves to `internal/results/benchmarks/django-10554/shakespeare-prospero-as-dev/`
364
+
365
+ **For ALL runs (including n=1):**
366
+
367
+ 1. Create directory structure:
368
+ ```bash
369
+ mkdir -p "{base_path}/runs"
370
+ ```
371
+
372
+ 2. Save run files:
373
+ - `runs/run_{i}.json` - Agent response + tokens
374
+ - `runs/judge_{i}.json` - Judge evaluation
375
+
376
+ 3. Calculate statistics:
377
+ ```python
+ from math import sqrt
+
+ scores = [run.score for run in runs]
+ mean = sum(scores) / len(scores)
+ # Population standard deviation (divide by n), matching the summary fields below
+ std_dev = sqrt(sum((s - mean) ** 2 for s in scores) / len(scores))
+ ```
382
+
383
+ 4. **ALWAYS save summary.yaml:**
384
+ ```yaml
385
+ # {theme}:{character} on {scenario} (as {effective_role})
386
+ # Generated: {ISO8601 timestamp}
387
+
388
+ agent:
389
+ theme: {theme}
390
+ character: {character_name}
391
+ effective_role: {effective_role} # role being performed
392
+ source_role: {source_role} # role where character normally lives
393
+ spec: {theme}:{character} # original spec
394
+ cross_role: {true if role_override else false}
395
+
396
+ scenario:
397
+ name: {scenario_name}
398
+ category: {category}
399
+ difficulty: {difficulty}
400
+
401
+ statistics:
402
+ n: {run_count}
403
+ mean: {mean:.2f}
404
+ std_dev: {std_dev:.2f}
405
+ min: {min_score}
406
+ max: {max_score}
407
+ scores: [{score1}, {score2}, ...]
408
+
409
+ efficiency:
410
+ avg_input_tokens: {avg_in}
411
+ avg_output_tokens: {avg_out}
412
+ tokens_per_point: {tpp:.2f}
413
+
414
+ metadata:
415
+ created_at: {ISO8601 timestamp}
416
+ pennyfarthing_version: {version from package.json} # REQUIRED
417
+ model: sonnet
418
+
419
+ # Include baseline comparison if baseline exists and theme != control
420
+ baseline_comparison:
421
+ control_mean: {baseline_mean}
422
+ control_stddev: {baseline_std}
423
+ delta: {mean - baseline_mean:+.2f}
424
+
425
+ runs:
426
+ - run_1.json
427
+ - run_2.json
428
+ # ...
429
+ ```
430
+
431
+ 5. Display:
432
+ ```
433
+ ✓ Saved {n} run(s) to {base_path}
434
+ ✓ Summary: {base_path}/summary.yaml
435
+ ```
436
+
437
+ </on-invoke>
438
+
439
+ <reference>
440
+ - **Judge Skill:** `.claude/project/skills/judge/SKILL.md`
441
+ - **Finalize-Run Skill:** `.claude/project/skills/finalize-run/SKILL.md`
442
+ - **Themes:** `pennyfarthing-dist/personas/themes/*.yaml`
443
+ - **Scenarios:** `scenarios/**/*.yaml`
444
+ - **Baselines:** `internal/results/baselines/{scenario}/{role}/` (control theme)
445
+ - **Benchmarks:** `internal/results/benchmarks/{scenario}/{theme}-{role}/` (all other themes)
446
+ - **Results README:** `internal/results/README.md`
447
+ </reference>
package/dist/benchmark-integration.d.ts
@@ -0,0 +1,182 @@
1
+ /**
2
+ * Benchmark Integration Module
3
+ *
4
+ * Story 11-8: Integrate with Benchmark Output
5
+ * Story 12-6: Update for local results (Epic 12 migration)
6
+ *
7
+ * Correlates Chernoff faces and OCEAN profiles with benchmark performance data.
8
+ * Reads benchmark results from internal/results/ directory (or BENCHMARK_PATH env var).
9
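+ *
+ * Illustrative usage (assumes results exist on disk; not from the original docs):
+ *   const results = loadBenchmarkData('django-10554', 'dev');
+ *   const report = generateBenchmarkReport({ scenario: 'django-10554', role: 'dev' });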
+ */
10
+ export interface OceanScores {
11
+ O: number;
12
+ C: number;
13
+ E: number;
14
+ A: number;
15
+ N: number;
16
+ }
17
+ export interface BenchmarkResult {
18
+ theme: string;
19
+ role: string;
20
+ character: string;
21
+ scenario: string;
22
+ mean: number;
23
+ stdDev: number;
24
+ delta: number;
25
+ cohensD?: number;
26
+ n: number;
27
+ scores: number[];
28
+ ocean: OceanScores;
29
+ face: string;
30
+ benchmarkMissing?: boolean;
31
+ }
32
+ export interface CorrelationResult {
33
+ O: {
34
+ effect: number;
35
+ direction: 'positive' | 'negative' | 'none';
36
+ };
37
+ C: {
38
+ effect: number;
39
+ direction: 'positive' | 'negative' | 'none';
40
+ };
41
+ E: {
42
+ effect: number;
43
+ direction: 'positive' | 'negative' | 'none';
44
+ };
45
+ A: {
46
+ effect: number;
47
+ direction: 'positive' | 'negative' | 'none';
48
+ };
49
+ N: {
50
+ effect: number;
51
+ direction: 'positive' | 'negative' | 'none';
52
+ };
53
+ strongest: {
54
+ dimension: keyof OceanScores;
55
+ effect: number;
56
+ };
57
+ }
58
+ export interface OptimalProfile {
59
+ ocean: OceanScores;
60
+ reasoning: string;
61
+ }
62
+ export interface RoleRecommendations {
63
+ role: string;
64
+ topThemes: Array<{
65
+ theme: string;
66
+ character: string;
67
+ score: number;
68
+ ocean: OceanScores;
69
+ }>;
70
+ avoidThemes: Array<{
71
+ theme: string;
72
+ character: string;
73
+ score: number;
74
+ }>;
75
+ insight: string;
76
+ }
77
+ export interface PerformerResult {
78
+ theme: string;
79
+ character: string;
80
+ score: number;
81
+ delta: number;
82
+ ocean: OceanScores;
83
+ face: string;
84
+ }
85
+ export interface QueryOptions {
86
+ scenario?: string;
87
+ role?: string;
88
+ filter?: string;
89
+ ocean?: string;
90
+ limit?: number;
91
+ minScore?: number;
92
+ sortBy?: 'score' | 'delta' | 'name';
93
+ }
94
+ export interface BenchmarkReportResult {
95
+ markdown: string;
96
+ data: {
97
+ performers: PerformerResult[];
98
+ correlation: CorrelationResult;
99
+ recommendations: RoleRecommendations;
100
+ errorCorrelation?: OceanErrorCorrelation;
101
+ };
102
+ }
103
+ export interface ErrorTypeCell {
104
+ correlation: number;
105
+ arrow: string;
106
+ }
107
+ export interface OceanErrorCorrelation {
108
+ matrix: {
109
+ [dimension: string]: {
110
+ reasoning: ErrorTypeCell;
111
+ planning: ErrorTypeCell;
112
+ execution: ErrorTypeCell;
113
+ };
114
+ };
115
+ strongest: {
116
+ dimension: string;
117
+ errorType: string;
118
+ correlation: number;
119
+ };
120
+ }
121
+ export interface JudgeScore {
122
+ detection_by_type?: {
123
+ reasoning: number;
124
+ planning: number;
125
+ execution: number;
126
+ };
127
+ }
128
+ export interface BenchmarkResultWithOcean {
129
+ ocean: OceanScores;
130
+ mean: number;
131
+ }
132
+ /**
133
+ * Load benchmark data from thunderdome results
134
+ */
135
+ export declare function loadBenchmarkData(scenario: string, role: string): BenchmarkResult[];
136
+ /**
137
+ * Get benchmark result with face visualization attached
138
+ */
139
+ export declare function getBenchmarkWithFace(theme: string, role: string, scenario: string): BenchmarkResult | null;
140
+ /**
141
+ * Calculate OCEAN correlation with benchmark performance
142
+ */
143
+ export declare function calculateOceanCorrelation(scenario: string, role: string): CorrelationResult;
144
+ /**
145
+ * Generate markdown correlation report
146
+ */
147
+ export declare function generateCorrelationReport(scenario: string, role: string): string;
148
+ /**
149
+ * Get optimal OCEAN profile for a role based on benchmark data
150
+ */
151
+ export declare function getOptimalProfile(role: string): OptimalProfile;
152
+ /**
153
+ * Get role recommendations (top themes, themes to avoid)
154
+ */
155
+ export declare function getRoleRecommendations(role: string): RoleRecommendations;
156
+ /**
157
+ * Find top performers for a scenario/role with optional filters
158
+ */
159
+ export declare function findTopPerformers(options: QueryOptions): PerformerResult[];
160
+ /**
161
+ * General query interface for benchmark data
162
+ */
163
+ export declare function queryBenchmarks(options: QueryOptions): PerformerResult[];
164
+ /**
165
+ * Calculate OCEAN × error-type correlation matrix
166
+ * Story 14-5: Correlates OCEAN dimensions with error detection rates
167
+ */
168
+ export declare function calculateErrorTypeCorrelation(results: BenchmarkResultWithOcean[], judgeScores: JudgeScore[]): OceanErrorCorrelation;
169
+ /**
170
+ * Generate markdown heat map for OCEAN × error-type correlations
171
+ * Story 14-5: Produces 5×3 matrix with directional arrows and effect sizes
172
+ */
173
+ export declare function generateOceanErrorHeatMap(correlation: OceanErrorCorrelation): string;
174
+ /**
175
+ * Generate complete benchmark report with faces and correlations
176
+ */
177
+ export declare function generateBenchmarkReport(options: {
178
+ scenario: string;
179
+ role: string;
180
+ includeErrorTypeCorrelation?: boolean;
181
+ }): BenchmarkReportResult;
182
+ //# sourceMappingURL=benchmark-integration.d.ts.map
package/dist/benchmark-integration.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"benchmark-integration.d.ts","sourceRoot":"","sources":["../src/benchmark-integration.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AA6CH,MAAM,WAAW,WAAW;IAC1B,CAAC,EAAE,MAAM,CAAC;IACV,CAAC,EAAE,MAAM,CAAC;IACV,CAAC,EAAE,MAAM,CAAC;IACV,CAAC,EAAE,MAAM,CAAC;IACV,CAAC,EAAE,MAAM,CAAC;CACX;AAED,MAAM,WAAW,eAAe;IAC9B,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,MAAM,CAAC;IAClB,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,CAAC,EAAE,MAAM,CAAC;IACV,MAAM,EAAE,MAAM,EAAE,CAAC;IACjB,KAAK,EAAE,WAAW,CAAC;IACnB,IAAI,EAAE,MAAM,CAAC;IACb,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B;AAED,MAAM,WAAW,iBAAiB;IAChC,CAAC,EAAE;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,SAAS,EAAE,UAAU,GAAG,UAAU,GAAG,MAAM,CAAA;KAAE,CAAC;IACnE,CAAC,EAAE;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,SAAS,EAAE,UAAU,GAAG,UAAU,GAAG,MAAM,CAAA;KAAE,CAAC;IACnE,CAAC,EAAE;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,SAAS,EAAE,UAAU,GAAG,UAAU,GAAG,MAAM,CAAA;KAAE,CAAC;IACnE,CAAC,EAAE;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,SAAS,EAAE,UAAU,GAAG,UAAU,GAAG,MAAM,CAAA;KAAE,CAAC;IACnE,CAAC,EAAE;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,SAAS,EAAE,UAAU,GAAG,UAAU,GAAG,MAAM,CAAA;KAAE,CAAC;IACnE,SAAS,EAAE;QAAE,SAAS,EAAE,MAAM,WAAW,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,CAAC;CAC7D;AAED,MAAM,WAAW,cAAc;IAC7B,KAAK,EAAE,WAAW,CAAC;IACnB,SAAS,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,mBAAmB;IAClC,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,KAAK,CAAC;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,SAAS,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,WAAW,CAAA;KAAE,CAAC,CAAC;IAC1F,WAAW,EAAE,KAAK,CAAC;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,SAAS,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IACxE,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,eAAe;IAC9B,KAAK,EAAE,MAAM,CAAC;IACd,SAAS,EAAE,MAAM,CAAC;IAClB,KAAK,EAAE,MAAM,CAAC;IACd,KAAK,EAAE,MAAM,CAAC;IACd,KAAK,EAAE,WAAW,CAAC;IACnB,IAAI,EAAE,MAAM,CAAC;CACd;AAED,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,MAAM,CAAC,EAAE,OAAO,GAAG,OAAO,GAAG,MAAM,CAAC;CACrC;AAED,MAAM,WAAW,qBAAqB;IACpC,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE;QACJ,UAAU,EAAE,eAAe,EAAE,CAAC;QAC9B,WAAW,EAAE,iBAAiB,CAAC;QAC/B,eAAe,EAAE,mBAAmB,CAAC;QACrC,gBAAgB,CAAC,EAAE,qBAAqB,CAAC;KAC1C,CAAC;CACH;AAGD,MAAM,WAAW,aAAa;IAC5B,WAAW,EAAE,MAAM,CAAC;IACpB,KAAK,EAAE,MAAM,CAAC;CACf;AAED,MAAM,WAAW,qBAAqB;IACpC,MAAM,EAAE;QACN,CAAC,SAAS,EAAE,MAAM,GAAG;YACnB,SAAS,EAAE,aAAa,CAAC;YACzB,QAAQ,EAAE,aAAa,CAAC;YACxB,SAAS,EAAE,aAAa,CAAC;SAC1B,CAAC;KACH,CAAC;IACF,SAAS,EAAE;QACT,SAAS,EAAE,MAAM,CAAC;QAClB,SAAS,EAAE,MAAM,CAAC;QAClB,WAAW,EAAE,MAAM,CAAC;KACrB,CAAC;CACH;AAED,MAAM,WAAW,UAAU;IACzB,iBAAiB,CAAC,EAAE;QAClB,SAAS,EAAE,MAAM,CAAC;QAClB,QAAQ,EAAE,MAAM,CAAC;QACjB,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC;CACH;AAED,MAAM,WAAW,wBAAwB;IACvC,KAAK,EAAE,WAAW,CAAC;IACnB,IAAI,EAAE,MAAM,CAAC;CACd;AA6ND;;GAEG;AACH,wBAAgB,iBAAiB,CAAC,QAAQ,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,eAAe,EAAE,CA0BnF;AAED;;GAEG;AACH,wBAAgB,oBAAoB,CAClC,KAAK,EAAE,MAAM,EACb,IAAI,EAAE,MAAM,EACZ,QAAQ,EAAE,MAAM,GACf,eAAe,GAAG,IAAI,CA0CxB;AAED;;GAEG;AACH,wBAAgB,yBAAyB,CACvC,QAAQ,EAAE,MAAM,EAChB,IAAI,EAAE,MAAM,GACX,iBAAiB,CAwBnB;AAED;;GAEG;AACH,wBAAgB,yBAAyB,CAAC,QAAQ,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,MAAM,CA6BhF;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,GAAG,cAAc,CAkC9D;AAED;;GAEG;AACH,wBAAgB,sBAAsB,CAAC,IAAI,EAAE,MAAM,GAAG,mBAAmB,CA2DxE;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAAC,OAAO,EAAE,YAAY,GAAG,eAAe,EAAE,CAuC1E;AAED;;GAEG;AACH,wBAAgB,eAAe,CAAC,OAAO,EAAE,YAAY,GAAG,eAAe,EAAE,CA6CxE
;AAyED;;;GAGG;AACH,wBAAgB,6BAA6B,CAC3C,OAAO,EAAE,wBAAwB,EAAE,EACnC,WAAW,EAAE,UAAU,EAAE,GACxB,qBAAqB,CAiCvB;AAED;;;GAGG;AACH,wBAAgB,yBAAyB,CAAC,WAAW,EAAE,qBAAqB,GAAG,MAAM,CAqCpF;AAED;;GAEG;AACH,wBAAgB,uBAAuB,CAAC,OAAO,EAAE;IAC/C,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,MAAM,CAAC;IACb,2BAA2B,CAAC,EAAE,OAAO,CAAC;CACvC,GAAG,qBAAqB,CAwExB"}