npm - @pennyfarthing/benchmark - Versions diffs - 10.2.0 - Mend

@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (115) hide show

package/commands/benchmark-control.md +69 -0
package/commands/benchmark.md +485 -0
package/commands/job-fair.md +102 -0
package/commands/solo.md +447 -0
package/dist/benchmark-integration.d.ts +182 -0
package/dist/benchmark-integration.d.ts.map +1 -0
package/dist/benchmark-integration.js +710 -0
package/dist/benchmark-integration.js.map +1 -0
package/dist/benchmark-integration.test.d.ts +6 -0
package/dist/benchmark-integration.test.d.ts.map +1 -0
package/dist/benchmark-integration.test.js +41 -0
package/dist/benchmark-integration.test.js.map +1 -0
package/dist/index.d.ts +3 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +5 -0
package/dist/index.js.map +1 -0
package/dist/job-fair-aggregator.d.ts +150 -0
package/dist/job-fair-aggregator.d.ts.map +1 -0
package/dist/job-fair-aggregator.js +547 -0
package/dist/job-fair-aggregator.js.map +1 -0
package/dist/job-fair-aggregator.test.d.ts +6 -0
package/dist/job-fair-aggregator.test.d.ts.map +1 -0
package/dist/job-fair-aggregator.test.js +35 -0
package/dist/job-fair-aggregator.test.js.map +1 -0
package/dist/package-exports.test.d.ts +13 -0
package/dist/package-exports.test.d.ts.map +1 -0
package/dist/package-exports.test.js +192 -0
package/dist/package-exports.test.js.map +1 -0
package/docs/BENCHMARK-METHODOLOGY.md +105 -0
package/docs/BENCHMARKING.md +311 -0
package/docs/OCEAN-BENCHMARKING.md +210 -0
package/docs/benchmarks-guide.md +62 -0
package/package.json +66 -0
package/scenarios/README.md +145 -0
package/scenarios/architecture/database-selection.yaml +119 -0
package/scenarios/architecture/legacy-modernization.yaml +153 -0
package/scenarios/architecture/scaling-decision.yaml +88 -0
package/scenarios/code-review/graphql-api-review.yaml +714 -0
package/scenarios/code-review/order-service.yaml +622 -0
package/scenarios/code-review/react-auth-component.yaml +569 -0
package/scenarios/code-review/security-review.yaml +145 -0
package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
package/scenarios/debug/buggy-user-service.yaml +541 -0
package/scenarios/debug/null-pointer.yaml +130 -0
package/scenarios/debugging/async-control-flow.yaml +161 -0
package/scenarios/debugging/auth-bypass.yaml +197 -0
package/scenarios/debugging/error-handling.yaml +178 -0
package/scenarios/debugging/input-validation.yaml +157 -0
package/scenarios/debugging/null-check-missing.yaml +139 -0
package/scenarios/debugging/off-by-one-loop.yaml +132 -0
package/scenarios/debugging/race-condition.yaml +180 -0
package/scenarios/debugging/resource-leak.yaml +166 -0
package/scenarios/debugging/simple-logic-error.yaml +115 -0
package/scenarios/debugging/sql-injection.yaml +163 -0
package/scenarios/dev/event-processor-tdd.yaml +764 -0
package/scenarios/dev/migration-disaster.yaml +415 -0
package/scenarios/dev/race-condition-cache.yaml +546 -0
package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
package/scenarios/schema.yaml +639 -0
package/scenarios/sm/dependency-deadlock.yaml +414 -0
package/scenarios/sm/executive-pet-project.yaml +336 -0
package/scenarios/sm/layoff-planning.yaml +356 -0
package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
package/scenarios/sm/story-breakdown.yaml +240 -0
package/scenarios/sm/three-sprint-failure.yaml +397 -0
package/scenarios/swe-bench/README.md +57 -0
package/scenarios/swe-bench/astropy-12907.yaml +128 -0
package/scenarios/swe-bench/astropy-13398.yaml +177 -0
package/scenarios/swe-bench/astropy-14309.yaml +180 -0
package/scenarios/swe-bench/django-10097.yaml +106 -0
package/scenarios/swe-bench/django-10554.yaml +140 -0
package/scenarios/swe-bench/django-10973.yaml +93 -0
package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
package/scenarios/swe-bench/flask-5014.yaml +91 -0
package/scenarios/swe-bench/import-swebench.py +246 -0
package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
package/scenarios/swe-bench/requests-1142.yaml +100 -0
package/scenarios/swe-bench/requests-2931.yaml +98 -0
package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
package/scenarios/swe-bench/xarray-3993.yaml +104 -0
package/scenarios/swe-bench/xarray-6992.yaml +136 -0
package/scenarios/tea/checkout-component-tests.yaml +596 -0
package/scenarios/tea/cli-tool-tests.yaml +561 -0
package/scenarios/tea/microservice-integration-tests.yaml +520 -0
package/scenarios/tea/payment-processor-tests.yaml +550 -0
package/scripts/aggregate-benchmark-stats.js +315 -0
package/scripts/aggregate-benchmark-stats.sh +8 -0
package/scripts/benchmark-runner.js +392 -0
package/scripts/benchmark-runner.sh +8 -0
package/scripts/consolidate-job-fair.sh +107 -0
package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
package/scripts/job-fair-batch.sh +116 -0
package/scripts/job-fair-progress.sh +35 -0
package/scripts/job-fair-runner.sh +278 -0
package/scripts/job-fair-status.sh +80 -0
package/scripts/job-fair-watcher-v2.sh +38 -0
package/scripts/job-fair-watcher.sh +50 -0
package/scripts/parallel-benchmark.sh +140 -0
package/scripts/solo-runner.sh +344 -0
package/scripts/test/ensure-swebench-data.sh +59 -0
package/scripts/test/ground-truth-judge.py +220 -0
package/scripts/test/swebench-judge.py +374 -0
package/scripts/test/test-cache.sh +165 -0
package/scripts/test/test-setup.sh +337 -0
package/scripts/theme/compute-theme-tiers.sh +13 -0
package/scripts/theme/compute_theme_tiers.py +402 -0
package/scripts/theme/update-theme-tiers.sh +97 -0
package/skills/finalize-run/SKILL.md +261 -0
package/skills/judge/SKILL.md +644 -0
package/skills/persona-benchmark/SKILL.md +187 -0

package/scenarios/tea/cli-tool-tests.yaml ADDED Viewed

@@ -0,0 +1,561 @@
+---
+# Scenario: CLI Tool Test Suite
+# Category: tea
+# Purpose: Test CLI/systems testing skills with Go
+id: tea-004
+name: cli-tool-tests
+title: "CLI Tool Test Suite Design"
+category: tea
+difficulty: hard  # Empirical: control mean 64.50
+version: "1.0"
+description: |
+  A CLI tool for managing configuration files with subcommands for init, get,
+  set, delete, and validate. Must write tests covering argument parsing, file
+  I/O, error handling, and user-facing output. Tests systems programming test
+  skills distinct from web/API testing.
+purpose: |
+  This scenario tests CLI testing expertise. CLI tools have unique testing
+  concerns: argument parsing, exit codes, stdout/stderr handling, file system
+  interactions. Measures ability to test command-line interfaces thoroughly.
+prompt: |
+  You are a Test Engineer designing tests for a configuration management CLI tool.
+  Write a comprehensive test suite using Go's testing package.
+  The CLI has these commands:
+  - `config init` - Initialize a new config file
+  - `config get <key>` - Get a config value
+  - `config set <key> <value>` - Set a config value
+  - `config delete <key>` - Delete a config key
+  - `config validate` - Validate config file syntax
+  - `config list` - List all config keys
+  Global flags:
+  - `--config <path>` - Config file path (default: ~/.config/app/config.yaml)
+  - `--format <json|yaml|text>` - Output format
+  - `--quiet` - Suppress non-essential output
+  - `--verbose` - Show detailed output
+  Write tests covering:
+  1. Argument parsing (valid/invalid args, flags, subcommands)
+  2. Exit codes (0 for success, 1 for error, 2 for usage error)
+  3. Stdout vs stderr output (data to stdout, errors to stderr)
+  4. File system operations (create, read, write, permissions)
+  5. Error messages (user-friendly, actionable)
+  6. Edge cases (missing file, invalid YAML, concurrent access)
+  For each test:
+  1. Use table-driven tests where appropriate
+  2. Test both success and failure paths
+  3. Verify exit codes, stdout, and stderr
+  4. Use temporary directories for file tests
+code:
+  language: go
+  filename: config_cli.go
+  content: |
+    package main
+    import (
+        "encoding/json"
+        "fmt"
+        "io"
+        "os"
+        "path/filepath"
+        "sort"
+
+        "gopkg.in/yaml.v3"
+    )
+    // Config represents the configuration structure
+    // NOTE(review): nothing in the visible source references this type; the
+    // command handlers work on a plain map[string]interface{} instead.
+    type Config struct {
+        data     map[string]interface{} // presumably the parsed key/value pairs — confirm
+        filepath string                 // presumably the path of the backing file — confirm
+    }
+    // CLI represents the command-line interface
+    // Its fields are populated by NewCLI and read by Run and the cmd* handlers.
+    type CLI struct {
+        stdout    io.Writer // destination for data and informational messages
+        stderr    io.Writer // destination for usage and error messages
+        configDir string    // joined with "config.yaml" to form the default config path
+    }
+    // NewCLI builds a CLI that writes to the supplied stdout/stderr writers and
+    // looks for its default config file inside configDir.
+    func NewCLI(stdout, stderr io.Writer, configDir string) *CLI {
+        cli := new(CLI)
+        cli.stdout = stdout
+        cli.stderr = stderr
+        cli.configDir = configDir
+        return cli
+    }
+    // Run executes the CLI with the given arguments (program name excluded) and
+    // returns the process exit code: 0 on success, 1 when a command fails, and
+    // 2 for usage errors (no command, unknown command, missing positional
+    // arguments, or a value-taking global flag given without its value).
+    func (c *CLI) Run(args []string) int {
+        if len(args) < 1 {
+            fmt.Fprintln(c.stderr, "Usage: config <command> [options]")
+            return 2
+        }
+        // Defaults for the global flags
+        configPath := filepath.Join(c.configDir, "config.yaml")
+        format := "text"
+        quiet := false
+        verbose := false
+        // Global flags may appear anywhere; every other argument is kept, in
+        // order, as the command followed by its positional arguments.
+        var cmdArgs []string
+        for i := 0; i < len(args); i++ {
+            switch args[i] {
+            case "--config", "--format":
+                // A value-taking flag at the very end used to be dropped
+                // silently; treat it as a usage error instead.
+                if i+1 >= len(args) {
+                    fmt.Fprintf(c.stderr, "Error: %s requires a value\n", args[i])
+                    return 2
+                }
+                if args[i] == "--config" {
+                    configPath = args[i+1]
+                } else {
+                    format = args[i+1]
+                }
+                i++ // skip the value that was just consumed
+            case "--quiet":
+                quiet = true
+            case "--verbose":
+                verbose = true
+            default:
+                cmdArgs = append(cmdArgs, args[i])
+            }
+        }
+        if len(cmdArgs) < 1 {
+            fmt.Fprintln(c.stderr, "Error: no command specified")
+            return 2
+        }
+        command := cmdArgs[0]
+        cmdArgs = cmdArgs[1:]
+        // Dispatch; each branch checks its own positional-argument count.
+        switch command {
+        case "init":
+            return c.cmdInit(configPath, quiet)
+        case "get":
+            if len(cmdArgs) < 1 {
+                fmt.Fprintln(c.stderr, "Error: get requires a key argument")
+                return 2
+            }
+            return c.cmdGet(configPath, cmdArgs[0], format)
+        case "set":
+            if len(cmdArgs) < 2 {
+                fmt.Fprintln(c.stderr, "Error: set requires key and value arguments")
+                return 2
+            }
+            return c.cmdSet(configPath, cmdArgs[0], cmdArgs[1], quiet)
+        case "delete":
+            if len(cmdArgs) < 1 {
+                fmt.Fprintln(c.stderr, "Error: delete requires a key argument")
+                return 2
+            }
+            return c.cmdDelete(configPath, cmdArgs[0], quiet)
+        case "validate":
+            return c.cmdValidate(configPath, verbose)
+        case "list":
+            return c.cmdList(configPath, format)
+        default:
+            fmt.Fprintf(c.stderr, "Error: unknown command %q\n", command)
+            return 2
+        }
+    }
+    // cmdInit creates the parent directory of configPath if needed and writes
+    // a new config file containing only version "1.0". It fails with exit code
+    // 1 when the file already exists or any filesystem/encoding step fails.
+    // The confirmation line on stdout is skipped when quiet is set.
+    func (c *CLI) cmdInit(configPath string, quiet bool) int {
+        if mkErr := os.MkdirAll(filepath.Dir(configPath), 0755); mkErr != nil {
+            fmt.Fprintf(c.stderr, "Error creating directory: %v\n", mkErr)
+            return 1
+        }
+        // A successful Stat means something is already at that path.
+        if _, statErr := os.Stat(configPath); statErr == nil {
+            fmt.Fprintln(c.stderr, "Error: config file already exists")
+            return 1
+        }
+        encoded, encErr := yaml.Marshal(map[string]interface{}{
+            "version": "1.0",
+        })
+        if encErr != nil {
+            fmt.Fprintf(c.stderr, "Error encoding config: %v\n", encErr)
+            return 1
+        }
+        if writeErr := os.WriteFile(configPath, encoded, 0644); writeErr != nil {
+            fmt.Fprintf(c.stderr, "Error writing config: %v\n", writeErr)
+            return 1
+        }
+        if quiet {
+            return 0
+        }
+        fmt.Fprintf(c.stdout, "Initialized config at %s\n", configPath)
+        return 0
+    }
+    // cmdGet loads the config at configPath and prints the value stored under
+    // key in the requested format. Returns 1 if the config cannot be loaded or
+    // the key is absent; otherwise returns whatever outputValue returns.
+    func (c *CLI) cmdGet(configPath, key, format string) int {
+        settings, loadErr := c.loadConfig(configPath)
+        if loadErr != nil {
+            fmt.Fprintf(c.stderr, "Error loading config: %v\n", loadErr)
+            return 1
+        }
+        if found, present := settings[key]; present {
+            return c.outputValue(found, format)
+        }
+        fmt.Fprintf(c.stderr, "Error: key %q not found\n", key)
+        return 1
+    }
+    // cmdSet stores value under key in the config at configPath and writes the
+    // file back. A missing file is treated as an empty config. Returns 1 if the
+    // config cannot be loaded (for any reason other than not existing) or
+    // saved. The confirmation line on stdout is skipped when quiet is set.
+    func (c *CLI) cmdSet(configPath, key, value string, quiet bool) int {
+        config, err := c.loadConfig(configPath)
+        if err != nil && !os.IsNotExist(err) {
+            fmt.Fprintf(c.stderr, "Error loading config: %v\n", err)
+            return 1
+        }
+        // Start fresh when the file doesn't exist, and also when it exists but
+        // decodes to no mapping (empty file or a null document): loadConfig
+        // then hands back a nil map, and assigning into a nil map panics.
+        if config == nil {
+            config = make(map[string]interface{})
+        }
+        config[key] = value
+        if err := c.saveConfig(configPath, config); err != nil {
+            fmt.Fprintf(c.stderr, "Error saving config: %v\n", err)
+            return 1
+        }
+        if !quiet {
+            fmt.Fprintf(c.stdout, "Set %s = %s\n", key, value)
+        }
+        return 0
+    }
+    // cmdDelete removes key from the config at configPath and writes the file
+    // back. Returns 1 if the config cannot be loaded, the key is absent, or the
+    // save fails. The confirmation line on stdout is skipped when quiet is set.
+    func (c *CLI) cmdDelete(configPath, key string, quiet bool) int {
+        settings, loadErr := c.loadConfig(configPath)
+        if loadErr != nil {
+            fmt.Fprintf(c.stderr, "Error loading config: %v\n", loadErr)
+            return 1
+        }
+        _, present := settings[key]
+        if !present {
+            fmt.Fprintf(c.stderr, "Error: key %q not found\n", key)
+            return 1
+        }
+        delete(settings, key)
+        if saveErr := c.saveConfig(configPath, settings); saveErr != nil {
+            fmt.Fprintf(c.stderr, "Error saving config: %v\n", saveErr)
+            return 1
+        }
+        if quiet {
+            return 0
+        }
+        fmt.Fprintf(c.stdout, "Deleted %s\n", key)
+        return 0
+    }
+    // cmdValidate reads configPath and checks that it decodes as a YAML
+    // mapping. Returns 1 on a read or decode error. With verbose set it first
+    // prints the file path and the number of top-level keys; the final
+    // "Config is valid" line is always printed on success.
+    func (c *CLI) cmdValidate(configPath string, verbose bool) int {
+        raw, readErr := os.ReadFile(configPath)
+        if readErr != nil {
+            fmt.Fprintf(c.stderr, "Error reading config: %v\n", readErr)
+            return 1
+        }
+        var parsed map[string]interface{}
+        parseErr := yaml.Unmarshal(raw, &parsed)
+        if parseErr != nil {
+            fmt.Fprintf(c.stderr, "Error: invalid YAML syntax: %v\n", parseErr)
+            return 1
+        }
+        if verbose {
+            keyCount := len(parsed)
+            fmt.Fprintf(c.stdout, "Config file: %s\n", configPath)
+            fmt.Fprintf(c.stdout, "Keys: %d\n", keyCount)
+        }
+        fmt.Fprintln(c.stdout, "Config is valid")
+        return 0
+    }
+    // cmdList prints every entry of the config at configPath in the requested
+    // format: indented JSON, YAML, or (for any other format) one "key: value"
+    // line per entry in sorted key order. Returns 1 if the config cannot be
+    // loaded or encoded.
+    func (c *CLI) cmdList(configPath, format string) int {
+        config, err := c.loadConfig(configPath)
+        if err != nil {
+            fmt.Fprintf(c.stderr, "Error loading config: %v\n", err)
+            return 1
+        }
+        switch format {
+        case "json":
+            data, err := json.MarshalIndent(config, "", "  ")
+            if err != nil {
+                // Previously ignored: a failed encode printed an empty line and exited 0.
+                fmt.Fprintf(c.stderr, "Error encoding config: %v\n", err)
+                return 1
+            }
+            fmt.Fprintln(c.stdout, string(data))
+        case "yaml":
+            data, err := yaml.Marshal(config)
+            if err != nil {
+                fmt.Fprintf(c.stderr, "Error encoding config: %v\n", err)
+                return 1
+            }
+            fmt.Fprint(c.stdout, string(data))
+        default:
+            // Map iteration order is randomized in Go; sort the keys so the
+            // text listing is stable from run to run.
+            keys := make([]string, 0, len(config))
+            for k := range config {
+                keys = append(keys, k)
+            }
+            sort.Strings(keys)
+            for _, k := range keys {
+                fmt.Fprintf(c.stdout, "%s: %v\n", k, config[k])
+            }
+        }
+        return 0
+    }
+    // loadConfig reads the file at path and decodes it as a YAML mapping.
+    // Read and decode errors are returned unchanged with a nil map.
+    // NOTE(review): the map variable starts out nil and is only filled by
+    // yaml.Unmarshal; presumably it stays nil for an empty document — confirm.
+    func (c *CLI) loadConfig(path string) (map[string]interface{}, error) {
+        raw, readErr := os.ReadFile(path)
+        if readErr != nil {
+            return nil, readErr
+        }
+        var parsed map[string]interface{}
+        if parseErr := yaml.Unmarshal(raw, &parsed); parseErr != nil {
+            return nil, parseErr
+        }
+        return parsed, nil
+    }
+    // saveConfig encodes config as YAML and writes it to path with mode 0644.
+    // It returns the encode error if there is one, otherwise the write error.
+    func (c *CLI) saveConfig(path string, config map[string]interface{}) error {
+        encoded, err := yaml.Marshal(config)
+        if err == nil {
+            err = os.WriteFile(path, encoded, 0644)
+        }
+        return err
+    }
+    // outputValue writes a single value to stdout in the requested format:
+    // JSON, YAML, or (for any other format) Go's default text rendering.
+    // Returns 0 on success and 1 when the value cannot be encoded.
+    func (c *CLI) outputValue(value interface{}, format string) int {
+        switch format {
+        case "json":
+            data, err := json.Marshal(value)
+            if err != nil {
+                // Previously ignored: a failed encode printed an empty line and exited 0.
+                fmt.Fprintf(c.stderr, "Error encoding value: %v\n", err)
+                return 1
+            }
+            fmt.Fprintln(c.stdout, string(data))
+        case "yaml":
+            data, err := yaml.Marshal(value)
+            if err != nil {
+                fmt.Fprintf(c.stderr, "Error encoding value: %v\n", err)
+                return 1
+            }
+            fmt.Fprint(c.stdout, string(data))
+        default:
+            fmt.Fprintln(c.stdout, value)
+        }
+        return 0
+    }
+    // main wires the CLI to the process's stdout/stderr and exits with the
+    // code returned by Run.
+    func main() {
+        // The documented default for --config is ~/.config/app/config.yaml, and
+        // Run appends "config.yaml" to the directory given here, so the
+        // directory must be $HOME/.config/app rather than $HOME itself.
+        configDir := filepath.Join(os.Getenv("HOME"), ".config", "app")
+        cli := NewCLI(os.Stdout, os.Stderr, configDir)
+        os.Exit(cli.Run(os.Args[1:]))
+    }
+# =============================================================================
+# BASELINE TEST SCENARIOS
+# =============================================================================
+baseline_issues:
+  argument_parsing:
+    - id: VALID_COMMANDS
+      description: "All valid commands are recognized"
+    - id: INVALID_COMMAND
+      description: "Unknown commands return exit code 2"
+    - id: MISSING_ARGS
+      description: "Missing required args show usage error"
+    - id: FLAG_PARSING
+      description: "Global flags parsed correctly"
+  exit_codes:
+    - id: SUCCESS_EXIT_0
+      description: "Successful operations return 0"
+    - id: ERROR_EXIT_1
+      description: "Errors return 1"
+    - id: USAGE_EXIT_2
+      description: "Usage errors return 2"
+  output_streams:
+    - id: DATA_TO_STDOUT
+      description: "Data output goes to stdout"
+    - id: ERRORS_TO_STDERR
+      description: "Error messages go to stderr"
+    - id: QUIET_MODE
+      description: "--quiet suppresses informational output"
+  file_operations:
+    - id: INIT_CREATES_FILE
+      description: "init creates config file and directories"
+    - id: GET_READS_VALUE
+      description: "get returns correct value"
+    - id: SET_PERSISTS
+      description: "set writes to file correctly"
+    - id: DELETE_REMOVES
+      description: "delete removes key from file"
+  error_handling:
+    - id: MISSING_FILE
+      description: "Graceful error for missing config"
+    - id: INVALID_YAML
+      description: "Clear error for malformed YAML"
+    - id: KEY_NOT_FOUND
+      description: "Clear error when key doesn't exist"
+    - id: PERMISSION_ERROR
+      description: "Clear error for permission issues"
+  output_formats:
+    - id: TEXT_FORMAT
+      description: "Default text format works"
+    - id: JSON_FORMAT
+      description: "--format json outputs valid JSON"
+    - id: YAML_FORMAT
+      description: "--format yaml outputs valid YAML"
+# =============================================================================
+# BONUS TEST SCENARIOS
+# =============================================================================
+bonus_issues:
+  advanced:
+    - id: CONCURRENT_ACCESS
+      description: "Handles concurrent reads/writes safely"
+    - id: LARGE_CONFIG
+      description: "Handles large config files efficiently"
+    - id: SPECIAL_CHARACTERS
+      description: "Keys/values with special chars handled"
+  robustness:
+    - id: PARTIAL_WRITE
+      description: "Atomic writes prevent corruption"
+    - id: SYMLINK_HANDLING
+      description: "Config path symlinks work correctly"
+    - id: RELATIVE_PATHS
+      description: "Relative --config paths resolved correctly"
+  usability:
+    - id: HELPFUL_ERRORS
+      description: "Error messages suggest fixes"
+    - id: VERBOSE_MODE
+      description: "--verbose shows additional info"
+# =============================================================================
+# SCORING
+# =============================================================================
+scoring:
+  # Must match the number of entries under baseline_issues:
+  # 4 argument_parsing + 3 exit_codes + 3 output_streams
+  # + 4 file_operations + 4 error_handling + 3 output_formats = 21
+  total_baseline_scenarios: 21
+  # Must match the number of entries under bonus_issues:
+  # 3 advanced + 3 robustness + 2 usability = 8
+  total_bonus_scenarios: 8
+  # Category weights sum to 100; each category's criteria points sum to its weight.
+  categories:
+    - name: coverage
+      weight: 40
+      criteria:
+        - id: BASELINE_COVERED
+          description: "All baseline test scenarios covered"
+          points: 30
+        - id: BONUS_COVERED
+          description: "Additional valuable test scenarios"
+          points: 10
+    - name: quality
+      weight: 30
+      criteria:
+        - id: TABLE_DRIVEN
+          description: "Uses table-driven tests appropriately"
+          points: 10
+        - id: ISOLATION
+          description: "Tests use temp dirs, are isolated"
+          points: 10
+        - id: ASSERTIONS
+          description: "Clear, comprehensive assertions"
+          points: 10
+    - name: cli_specific
+      weight: 15
+      criteria:
+        - id: EXIT_CODES
+          description: "Tests verify exit codes"
+          points: 5
+        - id: STREAM_SEPARATION
+          description: "Tests verify stdout vs stderr"
+          points: 5
+        - id: FILE_CLEANUP
+          description: "Tests clean up temp files"
+          points: 5
+    - name: persona
+      weight: 15
+      criteria:
+        - id: CHARACTER_CONSISTENCY
+          description: "Stays in character throughout"
+          points: 8
+        - id: PERSONA_VALUE_ADD
+          description: "Persona enhances test documentation"
+          points: 7
+# =============================================================================
+# PERSONA INFLUENCE
+# =============================================================================
+persona_influence:
+  dimensions:
+    - name: test_style
+      description: "How tests are organized"
+      spectrum:
+        individual: "One test per scenario"
+        table_driven: "Extensive use of table tests"
+        hybrid: "Mix based on complexity"
+    - name: coverage_focus
+      description: "What gets tested most"
+      spectrum:
+        happy_path: "Focus on successful operations"
+        error_focused: "Emphasizes error handling"
+        comprehensive: "Equal coverage of both"
+    - name: systems_depth
+      description: "Depth of file system testing"
+      spectrum:
+        basic: "Simple file read/write"
+        intermediate: "Permissions, directories"
+        advanced: "Symlinks, atomicity, concurrency"
+expected_tendencies:
+  discworld_tea:
+    character: "Igor"
+    expected_traits:
+      - "Thorough - covers edge cases"
+      - "Practical - focuses on real failures"
+      - "May suggest unusual test scenarios"
+    coverage_prediction: "comprehensive"
+  star_trek_tea:
+    character: "Scotty"
+    expected_traits:
+      - "Systems focus - file operations"
+      - "Engineering rigor - thorough coverage"
+      - "May emphasize robustness testing"
+    coverage_prediction: "comprehensive"
+  control_tea:
+    character: "None (baseline)"
+    expected_traits:
+      - "Standard CLI testing approach"
+    coverage_prediction: "baseline reference"