npm - @pennyfarthing/benchmark - Versions diffs - 10.2.0 - Mend

@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (115)

package/commands/benchmark-control.md +69 -0
package/commands/benchmark.md +485 -0
package/commands/job-fair.md +102 -0
package/commands/solo.md +447 -0
package/dist/benchmark-integration.d.ts +182 -0
package/dist/benchmark-integration.d.ts.map +1 -0
package/dist/benchmark-integration.js +710 -0
package/dist/benchmark-integration.js.map +1 -0
package/dist/benchmark-integration.test.d.ts +6 -0
package/dist/benchmark-integration.test.d.ts.map +1 -0
package/dist/benchmark-integration.test.js +41 -0
package/dist/benchmark-integration.test.js.map +1 -0
package/dist/index.d.ts +3 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +5 -0
package/dist/index.js.map +1 -0
package/dist/job-fair-aggregator.d.ts +150 -0
package/dist/job-fair-aggregator.d.ts.map +1 -0
package/dist/job-fair-aggregator.js +547 -0
package/dist/job-fair-aggregator.js.map +1 -0
package/dist/job-fair-aggregator.test.d.ts +6 -0
package/dist/job-fair-aggregator.test.d.ts.map +1 -0
package/dist/job-fair-aggregator.test.js +35 -0
package/dist/job-fair-aggregator.test.js.map +1 -0
package/dist/package-exports.test.d.ts +13 -0
package/dist/package-exports.test.d.ts.map +1 -0
package/dist/package-exports.test.js +192 -0
package/dist/package-exports.test.js.map +1 -0
package/docs/BENCHMARK-METHODOLOGY.md +105 -0
package/docs/BENCHMARKING.md +311 -0
package/docs/OCEAN-BENCHMARKING.md +210 -0
package/docs/benchmarks-guide.md +62 -0
package/package.json +66 -0
package/scenarios/README.md +145 -0
package/scenarios/architecture/database-selection.yaml +119 -0
package/scenarios/architecture/legacy-modernization.yaml +153 -0
package/scenarios/architecture/scaling-decision.yaml +88 -0
package/scenarios/code-review/graphql-api-review.yaml +714 -0
package/scenarios/code-review/order-service.yaml +622 -0
package/scenarios/code-review/react-auth-component.yaml +569 -0
package/scenarios/code-review/security-review.yaml +145 -0
package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
package/scenarios/debug/buggy-user-service.yaml +541 -0
package/scenarios/debug/null-pointer.yaml +130 -0
package/scenarios/debugging/async-control-flow.yaml +161 -0
package/scenarios/debugging/auth-bypass.yaml +197 -0
package/scenarios/debugging/error-handling.yaml +178 -0
package/scenarios/debugging/input-validation.yaml +157 -0
package/scenarios/debugging/null-check-missing.yaml +139 -0
package/scenarios/debugging/off-by-one-loop.yaml +132 -0
package/scenarios/debugging/race-condition.yaml +180 -0
package/scenarios/debugging/resource-leak.yaml +166 -0
package/scenarios/debugging/simple-logic-error.yaml +115 -0
package/scenarios/debugging/sql-injection.yaml +163 -0
package/scenarios/dev/event-processor-tdd.yaml +764 -0
package/scenarios/dev/migration-disaster.yaml +415 -0
package/scenarios/dev/race-condition-cache.yaml +546 -0
package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
package/scenarios/schema.yaml +639 -0
package/scenarios/sm/dependency-deadlock.yaml +414 -0
package/scenarios/sm/executive-pet-project.yaml +336 -0
package/scenarios/sm/layoff-planning.yaml +356 -0
package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
package/scenarios/sm/story-breakdown.yaml +240 -0
package/scenarios/sm/three-sprint-failure.yaml +397 -0
package/scenarios/swe-bench/README.md +57 -0
package/scenarios/swe-bench/astropy-12907.yaml +128 -0
package/scenarios/swe-bench/astropy-13398.yaml +177 -0
package/scenarios/swe-bench/astropy-14309.yaml +180 -0
package/scenarios/swe-bench/django-10097.yaml +106 -0
package/scenarios/swe-bench/django-10554.yaml +140 -0
package/scenarios/swe-bench/django-10973.yaml +93 -0
package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
package/scenarios/swe-bench/flask-5014.yaml +91 -0
package/scenarios/swe-bench/import-swebench.py +246 -0
package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
package/scenarios/swe-bench/requests-1142.yaml +100 -0
package/scenarios/swe-bench/requests-2931.yaml +98 -0
package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
package/scenarios/swe-bench/xarray-3993.yaml +104 -0
package/scenarios/swe-bench/xarray-6992.yaml +136 -0
package/scenarios/tea/checkout-component-tests.yaml +596 -0
package/scenarios/tea/cli-tool-tests.yaml +561 -0
package/scenarios/tea/microservice-integration-tests.yaml +520 -0
package/scenarios/tea/payment-processor-tests.yaml +550 -0
package/scripts/aggregate-benchmark-stats.js +315 -0
package/scripts/aggregate-benchmark-stats.sh +8 -0
package/scripts/benchmark-runner.js +392 -0
package/scripts/benchmark-runner.sh +8 -0
package/scripts/consolidate-job-fair.sh +107 -0
package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
package/scripts/job-fair-batch.sh +116 -0
package/scripts/job-fair-progress.sh +35 -0
package/scripts/job-fair-runner.sh +278 -0
package/scripts/job-fair-status.sh +80 -0
package/scripts/job-fair-watcher-v2.sh +38 -0
package/scripts/job-fair-watcher.sh +50 -0
package/scripts/parallel-benchmark.sh +140 -0
package/scripts/solo-runner.sh +344 -0
package/scripts/test/ensure-swebench-data.sh +59 -0
package/scripts/test/ground-truth-judge.py +220 -0
package/scripts/test/swebench-judge.py +374 -0
package/scripts/test/test-cache.sh +165 -0
package/scripts/test/test-setup.sh +337 -0
package/scripts/theme/compute-theme-tiers.sh +13 -0
package/scripts/theme/compute_theme_tiers.py +402 -0
package/scripts/theme/update-theme-tiers.sh +97 -0
package/skills/finalize-run/SKILL.md +261 -0
package/skills/judge/SKILL.md +644 -0
package/skills/persona-benchmark/SKILL.md +187 -0

package/scripts/test/test-setup.sh ADDED Viewed

@@ -0,0 +1,337 @@
+#!/usr/bin/env zsh
+# Pennyfarthing Test Setup Utilities
+# Config-driven test utilities that work with any project structure
+#
+# Usage: source scripts/test/test-setup.sh
+#
+# Configuration is read from .pennyfarthing/repos.yaml
+# See repos.yaml for schema documentation
+#
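+# Functions:
+#   generate_run_id             - Create timestamp-based unique run ID
+#   get_log_path TYPE RUN_ID    - Return log file path for a test type
+#   ensure_test_containers      - Start test containers if configured
+#   setup_repo_test_env REPO    - Export test env vars for a repo
+#   setup_all_test_env          - Export test env vars for every configured repo
+#   check_skip_violations REPO  - Check for forbidden skip patterns
+#   show_skip_violations REPO   - Display skip violation details
+#   check_all_skip_violations   - Sum skip violations across all configured repos
+#   cleanup_test_logs           - Remove old test/lint log files
+#   run_repo_tests REPO RUN_ID  - Run one repo's tests and tee output to a log
+#   run_all_repo_tests RUN_ID   - Run tests for every repo in build order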
+# This file is sourced, so errexit is switched off: a failing helper must not
+# terminate the caller's shell. Failures are reported through return codes.
+set +e
+# Directory containing this file.
+# NOTE(review): relies on $0 naming the sourced file (zsh default behaviour);
+# confirm if the file is ever sourced from another shell.
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+# Honour a caller-supplied PROJECT_ROOT, otherwise derive it from SCRIPT_DIR.
+PROJECT_ROOT="${PROJECT_ROOT:-$(cd "$SCRIPT_DIR/../../../.." && pwd)}"
+# Source repo-utils for config access.
+# The file is probed first instead of sourcing with stderr discarded, so that
+# genuine errors raised inside repo-utils.sh stay visible and the "not found"
+# warning is only printed when the file really is missing or unreadable.
+_pf_repo_utils="$CLAUDE_PROJECT_DIR/scripts/repo-utils.sh"
+if [[ -r "$_pf_repo_utils" ]]; then
+    source "$_pf_repo_utils"
+else
+    echo "Warning: repo-utils.sh not found, using defaults" >&2
+fi
+unset _pf_repo_utils
+# ============================================================================
+# Run ID and Logging
+# ============================================================================
+# Print a run ID built from the current time, formatted YYYYMMDD-HHMMSS.
+# The resolution is one second, so calls within the same second share an ID.
+# Usage: RUN_ID=$(generate_run_id)
+generate_run_id() {
+    local stamp_format='+%Y%m%d-%H%M%S'
+    date "$stamp_format"
+}
+# Print the log file path for a log type and run ID.
+# Usage: LOG_PATH=$(get_log_path "test-myrepo" "$RUN_ID")
+# Args:
+#   $1 - log type prefix: test-{repo}, lint-{repo}, or any custom prefix
+#   $2 - run ID; generated via generate_run_id when omitted or empty
+# Output: <log_dir>/<type>-results-<run_id>.log on stdout
+get_log_path() {
+    local prefix="$1"
+    local stamp="${2:-$(generate_run_id)}"
+    # Start from the default location; a configured directory takes precedence
+    # whenever repo-utils provides get_test_log_dir.
+    local target_dir="$CLAUDE_PROJECT_DIR/.session"
+    if declare -f get_test_log_dir &>/dev/null; then
+        target_dir=$(get_test_log_dir)
+    fi
+    echo "${target_dir}/${prefix}-results-${stamp}.log"
+}
+# ============================================================================
+# Container Management
+# ============================================================================
+# Start the test containers when a container command is configured.
+# The command string comes from get_container_command (if that helper is
+# loaded) and is evaluated in the current shell.
+# Returns: 0 when nothing is configured, otherwise the command's exit status
+ensure_test_containers() {
+    local start_cmd=""
+    if declare -f get_container_command &>/dev/null; then
+        start_cmd=$(get_container_command)
+    fi
+    # Only act when the configuration actually supplies a command
+    if [[ -n "$start_cmd" ]]; then
+        echo "Starting test containers via: $start_cmd"
+        eval "$start_cmd"
+        return $?
+    fi
+    return 0
+}
+# ============================================================================
+# Environment Setup
+# ============================================================================
+# Export test environment variables for a specific repo
+# Usage: setup_repo_test_env "Pennyfarthing-api"
+setup_repo_test_env() {
+    local repo="$1"
+    if declare -f get_test_env &>/dev/null; then
+        local env_exports
+        env_exports=$(get_test_env "$repo")
+        if [[ -n "$env_exports" ]]; then
+            eval "$env_exports"
+        fi
+    fi
+}
+# Export test environment for all repos
+# Usage: setup_all_test_env
+# Iterates the names printed by get_repos and applies each repo's test
+# environment; returns 0 without doing anything when get_repos is not loaded.
+setup_all_test_env() {
+    # Keep the loop variable local: this file is sourced, so an undeclared
+    # variable would overwrite any `repo` in the caller's shell.
+    local repo
+    if ! declare -f get_repos &>/dev/null; then
+        return 0
+    fi
+    # Unquoted on purpose: the command output is split into one word per repo
+    for repo in $(get_repos); do
+        setup_repo_test_env "$repo"
+    done
+}
+# ============================================================================
+# Skip Violation Checks
+# ============================================================================
+# Check for forbidden skip patterns in a repo's test files
+# Usage: VIOLATIONS=$(check_skip_violations "Pennyfarthing-api")
+# Returns: count of violations found
+check_skip_violations() {
+    local repo="$1"
+    local count=0
+    # Get repo info
+    local repo_path language
+    if declare -f get_repo_full_path &>/dev/null; then
+        repo_path=$(get_repo_full_path "$repo")
+        language=$(get_repo_language "$repo")
+    else
+        repo_path="$CLAUDE_PROJECT_DIR/$repo"
+        language="unknown"
+    fi
+    if [[ ! -d "$repo_path" ]]; then
+        echo "0"
+        return 0
+    fi
+    # Get patterns for this language
+    local skip_patterns skip_exceptions file_pattern
+    if declare -f get_skip_patterns &>/dev/null; then
+        skip_patterns=$(get_skip_patterns "$language")
+        skip_exceptions=$(get_skip_exceptions "$language")
+        file_pattern=$(get_test_file_pattern "$language")
+    else
+        # Fallback defaults
+        case "$language" in
+            go)
+                skip_patterns='t\.Skip'
+                skip_exceptions='LocalStack|not available'
+                file_pattern='*_test.go'
+                ;;
+            typescript|javascript)
+                skip_patterns='it\.skip|describe\.skip|test\.skip'
+                skip_exceptions=''
+                file_pattern='*.test.*'
+                ;;
+            *)
+                echo "0"
+                return 0
+                ;;
+        esac
+    fi
+    if [[ -z "$skip_patterns" ]]; then
+        echo "0"
+        return 0
+    fi
+    # Search for violations
+    local grep_result
+    grep_result=$(grep -r -E "$skip_patterns" "$repo_path" --include="$file_pattern" 2>/dev/null || true)
+    # Filter out exceptions
+    if [[ -n "$skip_exceptions" && -n "$grep_result" ]]; then
+        grep_result=$(echo "$grep_result" | grep -v -E "$skip_exceptions" || true)
+    fi
+    # Count remaining violations
+    if [[ -n "$grep_result" ]]; then
+        count=$(echo "$grep_result" | wc -l | tr -d ' ')
+    fi
+    echo "$count"
+}
+# Show skip violations with file locations
+# Usage: show_skip_violations "Pennyfarthing-api" [max_lines]
+show_skip_violations() {
+    local repo="$1"
+    local max_lines="${2:-10}"
+    # Get repo info
+    local repo_path language
+    if declare -f get_repo_full_path &>/dev/null; then
+        repo_path=$(get_repo_full_path "$repo")
+        language=$(get_repo_language "$repo")
+    else
+        repo_path="$CLAUDE_PROJECT_DIR/$repo"
+        language="unknown"
+    fi
+    if [[ ! -d "$repo_path" ]]; then
+        return 0
+    fi
+    # Get patterns for this language
+    local skip_patterns skip_exceptions file_pattern
+    if declare -f get_skip_patterns &>/dev/null; then
+        skip_patterns=$(get_skip_patterns "$language")
+        skip_exceptions=$(get_skip_exceptions "$language")
+        file_pattern=$(get_test_file_pattern "$language")
+    else
+        return 0
+    fi
+    if [[ -z "$skip_patterns" ]]; then
+        return 0
+    fi
+    # Search and display
+    local grep_result
+    grep_result=$(grep -r -E "$skip_patterns" "$repo_path" --include="$file_pattern" 2>/dev/null || true)
+    if [[ -n "$skip_exceptions" && -n "$grep_result" ]]; then
+        grep_result=$(echo "$grep_result" | grep -v -E "$skip_exceptions" || true)
+    fi
+    if [[ -n "$grep_result" ]]; then
+        echo "$grep_result" | head -"$max_lines"
+    fi
+}
+# Check all repos for skip violations
+# Usage: TOTAL=$(check_all_skip_violations)
+# Output: sum of check_skip_violations over every repo printed by get_repos;
+#         "0" when get_repos is not loaded
+check_all_skip_violations() {
+    local total=0
+    # Declare loop locals once, up front. Re-running a bare `local count`
+    # inside the loop makes zsh print "count=<value>" to stdout from the second
+    # iteration on, which would corrupt the total captured by the caller.
+    local repo count
+    if ! declare -f get_repos &>/dev/null; then
+        echo "0"
+        return 0
+    fi
+    for repo in $(get_repos); do
+        count=$(check_skip_violations "$repo")
+        total=$((total + count))
+    done
+    echo "$total"
+}
+# ============================================================================
+# Cleanup
+# ============================================================================
+# Remove old test and lint log files
+cleanup_test_logs() {
+    local log_dir
+    if declare -f get_test_log_dir &>/dev/null; then
+        log_dir=$(get_test_log_dir)
+    else
+        log_dir="$CLAUDE_PROJECT_DIR/.session"
+    fi
+    rm -f "$log_dir"/test-*-results-*.log 2>/dev/null
+    rm -f "$log_dir"/lint-*-results-*.log 2>/dev/null
+}
+# ============================================================================
+# High-Level Test Running
+# ============================================================================
+# Run tests for a specific repo with logging
+# Usage: run_repo_tests "Pennyfarthing-api" "$RUN_ID"
+run_repo_tests() {
+    local repo="$1"
+    local run_id="${2:-$(generate_run_id)}"
+    local repo_path test_cmd log_path
+    if declare -f get_repo_full_path &>/dev/null; then
+        repo_path=$(get_repo_full_path "$repo")
+        test_cmd=$(get_test_command "$repo")
+    else
+        repo_path="$CLAUDE_PROJECT_DIR/$repo"
+        test_cmd=""
+    fi
+    if [[ -z "$test_cmd" ]]; then
+        echo "SKIP: $repo (no test command configured)"
+        return 0
+    fi
+    if [[ ! -d "$repo_path" ]]; then
+        echo "SKIP: $repo (path not found: $repo_path)"
+        return 0
+    fi
+    log_path=$(get_log_path "test-$repo" "$run_id")
+    # Setup environment for this repo
+    setup_repo_test_env "$repo"
+    echo "=== Testing $repo ==="
+    (cd "$repo_path" && eval "$test_cmd") 2>&1 | tee "$log_path"
+    local exit_code=${pipestatus[1]}
+    if [[ $exit_code -eq 0 ]]; then
+        echo "PASS: $repo"
+    else
+        echo "FAIL: $repo (exit code: $exit_code)"
+    fi
+    return $exit_code
+}
+# Run tests for all repos
+# Usage: run_all_repo_tests "$RUN_ID"
+run_all_repo_tests() {
+    local run_id="${1:-$(generate_run_id)}"
+    local failed=0
+    ensure_test_containers || {
+        echo "Warning: Container setup failed, continuing anyway"
+    }
+    if ! declare -f get_repos &>/dev/null; then
+        echo "Warning: repo-utils not loaded, cannot iterate repos"
+        return 1
+    fi
+    for repo in $(get_build_order); do
+        if ! run_repo_tests "$repo" "$run_id"; then
+            ((failed++)) || true
+        fi
+    done
+    return $failed
+}

package/scripts/theme/compute-theme-tiers.sh ADDED Viewed

@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+# compute-theme-tiers.sh - Compute tier rankings from job-fair results
+#
+# Uses the MOST COMPLETE run for each theme (most matrix entries),
+# not the most recent. This prevents incomplete runs from overriding good data.
+#
+# Usage: compute-theme-tiers.sh [--dry-run] [--verbose] [--min-entries N]
+set -euo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+exec python3 "$SCRIPT_DIR/compute_theme_tiers.py" "$@"