@pennyfarthing/benchmark 10.2.0

This diff shows the content of publicly available package versions as published to a supported registry. It is provided for informational purposes only and reflects changes between package versions as they appear in the public registry.
Files changed (115)
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
package/scenarios/swe-bench/django-10973.yaml
@@ -0,0 +1,93 @@
+ ---
+ # SWE-bench Verified Scenario
+ # Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
+ # Instance: django__django-10973
+
+ name: django-10973
+ title: "Use subprocess.run and PGPASSWORD for client in postgres backend"
+ category: dev
+ difficulty: medium # SWE-bench: 15 min - 1 hour
+ version: "1.0"
+
+ source:
+   benchmark: swe-bench-verified
+   instance_id: django__django-10973
+   repo: django/django
+   base_commit: ddb293685235
+
+ description: |
+   Real GitHub issue from django/django requiring code changes to resolve.
+   This is a human-validated problem from the SWE-bench Verified dataset.
+
+ prompt: |
+   You are working on the django/django repository at commit ddb293685235.
+
+   A user has reported the following issue:
+
+   ---
+   Use subprocess.run and PGPASSWORD for client in postgres backend
+   Description
+
+   subprocess.run was added in python 3.5 (which is the minimum version since Django 2.1). This function allows you to pass a custom environment for the subprocess.
+   Using this in django.db.backends.postgres.client to set PGPASSWORD simplifies the code and makes it more reliable.
+
+   ---
+
+   Analyze this issue and provide:
+   1. Root cause analysis - what is causing the bug?
+   2. Proposed fix - what code changes would resolve this?
+   3. Test considerations - how would you verify the fix works?
+
+   Provide your response with specific file paths and code changes.
+
+ scoring:
+   # Adapted for SWE-bench bug-fix scenarios
+   categories:
+     - name: root_cause
+       weight: 30
+       description: "Correctly identifies the underlying cause of the bug"
+       criteria:
+         - id: IDENTIFIES_BUG_LOCATION
+           description: "Points to correct file(s) and function(s)"
+           points: 15
+         - id: EXPLAINS_WHY_BROKEN
+           description: "Explains why current code fails"
+           points: 15
+
+     - name: fix_quality
+       weight: 40
+       description: "Proposes a correct and complete fix"
+       criteria:
+         - id: FIX_ADDRESSES_ISSUE
+           description: "Fix would resolve the reported problem"
+           points: 20
+         - id: FIX_IS_MINIMAL
+           description: "Fix is appropriately scoped, not over-engineered"
+           points: 10
+         - id: FIX_SYNTAX_CORRECT
+           description: "Code changes are syntactically valid"
+           points: 10
+
+     - name: completeness
+       weight: 20
+       description: "Considers edge cases and testing"
+       criteria:
+         - id: EDGE_CASES
+           description: "Considers related scenarios that might break"
+           points: 10
+         - id: TEST_COVERAGE
+           description: "Suggests appropriate test cases"
+           points: 10
+
+     - name: persona
+       weight: 10
+       description: "Maintains character while solving"
+       criteria:
+         - id: IN_CHARACTER
+           description: "Response reflects persona traits"
+           points: 10
+
+ # Metadata for full harness evaluation (optional)
+ swebench_metadata:
+   fail_to_pass: ["test_accent (dbshell.test_postgresql.PostgreSqlDbshellCommandTestCase)", "test_basic (dbshell.test_postgresql.PostgreSqlDbshellCommandTestCase)", "test_column (dbshell.test_postgresql.PostgreSqlDbshellCommandTestCase)"]
+   environment_version: "3.0"
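The issue text itself names the intended technique. A minimal sketch of that approach, assuming placeholder `args` and `passwd` values rather than Django's actual variable names:

```python
import os
import subprocess

def run_client(args: list, passwd: str) -> None:
    # Copy the parent environment and inject PGPASSWORD only when a
    # password is available; subprocess.run then manages the child process.
    env = os.environ.copy()
    if passwd:
        env["PGPASSWORD"] = passwd
    subprocess.run(args, env=env, check=True)
```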
package/scenarios/swe-bench/flask-5014-reviewer.yaml
@@ -0,0 +1,145 @@
+ ---
+ # SWE-bench Reviewer Scenario
+ # Adapted from: pallets__flask-5014
+ # Role: Code Reviewer evaluating a proposed fix
+
+ name: flask-5014-reviewer
+ title: "Review: Require non-empty name for Blueprints"
+ category: reviewer
+ difficulty: easy
+ version: "1.0"
+
+ source:
+   benchmark: swe-bench-verified
+   instance_id: pallets__flask-5014
+   repo: pallets/flask
+   base_commit: 7ee9ceb71e86
+   adapted_for: reviewer
+
+ description: |
+   Code review scenario adapted from SWE-bench. The reviewer must evaluate
+   a proposed patch for correctness, completeness, and code quality.
+
+ prompt: |
+   You are reviewing a pull request for the pallets/flask repository.
+
+   ## Issue Being Fixed
+
+   **Title:** Require a non-empty name for Blueprints
+
+   **Description:**
+   Things do not work correctly if a Blueprint is given an empty name (e.g. #4944).
+   It would be helpful if a `ValueError` was raised when trying to do that.
+
+   ## Proposed Patch
+
+   The developer has submitted the following changes:
+
+   **src/flask/blueprints.py** - In the `Blueprint.__init__` method:
+   ```python
+   def __init__(
+       self,
+       name: str,
+       import_name: str,
+       ...
+   ) -> None:
+       # NEW: Validate non-empty name
+       if not name:
+           raise ValueError("'name' may not be empty.")
+
+       if "." in name:
+           raise ValueError("'name' may not contain a dot '.' character.")
+       ...
+   ```
+
+   **tests/test_blueprints.py** - New test added:
+   ```python
+   def test_empty_name_not_allowed(app, client):
+       with pytest.raises(ValueError):
+           flask.Blueprint("", __name__)
+   ```
+
+   **CHANGES.rst** - Added to changelog:
+   ```
+   - If a blueprint is created with an empty name it raises a ValueError.
+     :issue:`5010`
+   ```
+
+   ## Your Review Task
+
+   Evaluate this pull request and provide:
+   1. **Correctness** - Does the fix address the issue? Any bugs?
+   2. **Edge Cases** - What about whitespace-only names? None values?
+   3. **Test Coverage** - Are the tests sufficient?
+   4. **Code Quality** - Style, error message clarity, placement
+   5. **Verdict** - APPROVE, REQUEST_CHANGES, or COMMENT
+
+   Be thorough but fair. Identify real issues, not nitpicks.
+
+ scoring:
+   categories:
+     - name: issue_detection
+       weight: 35
+       description: "Identifies legitimate issues with the patch"
+       criteria:
+         - id: WHITESPACE_EDGE_CASE
+           description: "Notes that whitespace-only names like ' ' would pass"
+           points: 15
+           severity: high
+         - id: NONE_HANDLING
+           description: "Considers behavior if name=None is passed"
+           points: 10
+           severity: medium
+         - id: ERROR_MESSAGE_QUALITY
+           description: "Comments on error message clarity/consistency"
+           points: 5
+           severity: low
+         - id: TEST_COVERAGE_GAP
+           description: "Notes missing edge case tests"
+           points: 5
+           severity: low
+
+     - name: review_quality
+       weight: 30
+       description: "Quality of the review feedback"
+       criteria:
+         - id: CONSTRUCTIVE_FEEDBACK
+           description: "Provides actionable suggestions, not just complaints"
+           points: 15
+         - id: CORRECT_ASSESSMENT
+           description: "Doesn't flag false positives or miss obvious issues"
+           points: 15
+
+     - name: verdict_appropriateness
+       weight: 20
+       description: "Appropriate review decision"
+       criteria:
+         - id: REASONABLE_VERDICT
+           description: "Verdict matches severity of issues found"
+           points: 10
+         - id: JUSTIFIED_DECISION
+           description: "Decision is well-reasoned"
+           points: 10
+
+     - name: persona
+       weight: 15
+       description: "Maintains reviewer character"
+       criteria:
+         - id: IN_CHARACTER
+           description: "Response reflects persona traits"
+           points: 15
+
+ baseline_issues:
+   critical: []
+   high:
+     - id: WHITESPACE_EDGE_CASE
+       description: "Empty check 'if not name' passes for whitespace-only strings"
+       expected_fix: "Use 'if not name or not name.strip()' or similar"
+   medium:
+     - id: NONE_HANDLING
+       description: "If name=None, 'if not name' works but error message is misleading"
+   low:
+     - id: ERROR_MESSAGE_CONSISTENCY
+       description: "Error message style differs from existing dot check"
+     - id: MISSING_WHITESPACE_TEST
+       description: "No test for whitespace-only names"
package/scenarios/swe-bench/flask-5014-tea.yaml
@@ -0,0 +1,123 @@
+ ---
+ # SWE-bench TEA Scenario
+ # Adapted from: pallets__flask-5014
+ # Role: Test Engineer writing failing tests BEFORE the fix
+
+ name: flask-5014-tea
+ title: "RED Phase: Write tests for empty Blueprint name validation"
+ category: tea
+ difficulty: easy
+ version: "1.0"
+
+ source:
+   benchmark: swe-bench-verified
+   instance_id: pallets__flask-5014
+   repo: pallets/flask
+   base_commit: 7ee9ceb71e86
+   adapted_for: tea
+
+ description: |
+   TDD RED phase scenario adapted from SWE-bench. The test engineer must write
+   failing tests that specify the expected behavior BEFORE seeing any fix.
+
+ prompt: |
+   You are a Test Engineer working on the pallets/flask repository.
+
+   ## Bug Report
+
+   **Title:** Require a non-empty name for Blueprints
+
+   **Description:**
+   Things do not work correctly if a Blueprint is given an empty name (e.g. #4944).
+   It would be helpful if a `ValueError` was raised when trying to do that.
+
+   **Current Behavior:** Creating `Blueprint("", __name__)` succeeds but causes
+   problems downstream when the blueprint is registered.
+
+   **Expected Behavior:** A `ValueError` should be raised immediately when
+   attempting to create a Blueprint with an empty name.
+
+   ## Your Task (RED Phase)
+
+   Write comprehensive failing tests that:
+   1. Specify the expected behavior (ValueError on empty name)
+   2. Cover edge cases (whitespace, None, etc.)
+   3. Will FAIL against the current codebase
+   4. Will PASS once the fix is implemented
+
+   Use pytest style. Provide complete, runnable test code.
+
+   **Important:** You are writing tests BEFORE the fix exists. These tests
+   should fail now and define what "correct" means.
+
+ scoring:
+   categories:
+     - name: test_coverage
+       weight: 40
+       description: "Comprehensive test cases"
+       criteria:
+         - id: EMPTY_STRING_TEST
+           description: "Tests that '' raises ValueError"
+           points: 15
+           severity: critical
+         - id: WHITESPACE_TEST
+           description: "Tests that ' ' (whitespace-only) should also fail"
+           points: 10
+           severity: high
+         - id: NONE_TEST
+           description: "Tests behavior when name=None"
+           points: 10
+           severity: medium
+         - id: VALID_NAME_TEST
+           description: "Includes sanity check that valid names still work"
+           points: 5
+           severity: low
+
+     - name: test_quality
+       weight: 30
+       description: "Well-written test code"
+       criteria:
+         - id: PYTEST_STYLE
+           description: "Uses pytest idioms correctly (pytest.raises, fixtures)"
+           points: 10
+         - id: CLEAR_ASSERTIONS
+           description: "Assertions clearly express expected behavior"
+           points: 10
+         - id: ERROR_MESSAGE_CHECK
+           description: "Verifies error message content, not just exception type"
+           points: 10
+
+     - name: red_phase_understanding
+       weight: 15
+       description: "Understands TDD RED phase"
+       criteria:
+         - id: TESTS_SHOULD_FAIL
+           description: "Acknowledges tests will fail against current code"
+           points: 8
+         - id: DEFINES_BEHAVIOR
+           description: "Tests define expected behavior, not implementation"
+           points: 7
+
+     - name: persona
+       weight: 15
+       description: "Maintains TEA character"
+       criteria:
+         - id: IN_CHARACTER
+           description: "Response reflects persona traits"
+           points: 15
+
+ baseline_criteria:
+   required_tests:
+     - id: EMPTY_STRING
+       description: "Blueprint('', __name__) raises ValueError"
+     - id: WHITESPACE_ONLY
+       description: "Blueprint(' ', __name__) raises ValueError"
+     - id: VALID_STILL_WORKS
+       description: "Blueprint('valid_name', __name__) succeeds"
+   bonus_tests:
+     - id: NONE_VALUE
+       description: "Blueprint(None, __name__) behavior specified"
+     - id: ERROR_MESSAGE
+       description: "Asserts on error message content"
+     - id: SINGLE_SPACE
+       description: "Blueprint(' ', __name__) edge case"
package/scenarios/swe-bench/flask-5014.yaml
@@ -0,0 +1,91 @@
+ ---
+ # SWE-bench Verified Scenario
+ # Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
+ # Instance: pallets__flask-5014
+
+ name: flask-5014
+ title: "Require a non-empty name for Blueprints"
+ category: dev
+ difficulty: easy # SWE-bench: <15 min fix
+ version: "1.0"
+
+ source:
+   benchmark: swe-bench-verified
+   instance_id: pallets__flask-5014
+   repo: pallets/flask
+   base_commit: 7ee9ceb71e86
+
+ description: |
+   Real GitHub issue from pallets/flask requiring code changes to resolve.
+   This is a human-validated problem from the SWE-bench Verified dataset.
+
+ prompt: |
+   You are working on the pallets/flask repository at commit 7ee9ceb71e86.
+
+   A user has reported the following issue:
+
+   ---
+   Require a non-empty name for Blueprints
+   Things do not work correctly if a Blueprint is given an empty name (e.g. #4944).
+   It would be helpful if a `ValueError` was raised when trying to do that.
+
+   ---
+
+   Analyze this issue and provide:
+   1. Root cause analysis - what is causing the bug?
+   2. Proposed fix - what code changes would resolve this?
+   3. Test considerations - how would you verify the fix works?
+
+   Provide your response with specific file paths and code changes.
+
+ scoring:
+   # Adapted for SWE-bench bug-fix scenarios
+   categories:
+     - name: root_cause
+       weight: 30
+       description: "Correctly identifies the underlying cause of the bug"
+       criteria:
+         - id: IDENTIFIES_BUG_LOCATION
+           description: "Points to correct file(s) and function(s)"
+           points: 15
+         - id: EXPLAINS_WHY_BROKEN
+           description: "Explains why current code fails"
+           points: 15
+
+     - name: fix_quality
+       weight: 40
+       description: "Proposes a correct and complete fix"
+       criteria:
+         - id: FIX_ADDRESSES_ISSUE
+           description: "Fix would resolve the reported problem"
+           points: 20
+         - id: FIX_IS_MINIMAL
+           description: "Fix is appropriately scoped, not over-engineered"
+           points: 10
+         - id: FIX_SYNTAX_CORRECT
+           description: "Code changes are syntactically valid"
+           points: 10
+
+     - name: completeness
+       weight: 20
+       description: "Considers edge cases and testing"
+       criteria:
+         - id: EDGE_CASES
+           description: "Considers related scenarios that might break"
+           points: 10
+         - id: TEST_COVERAGE
+           description: "Suggests appropriate test cases"
+           points: 10
+
+     - name: persona
+       weight: 10
+       description: "Maintains character while solving"
+       criteria:
+         - id: IN_CHARACTER
+           description: "Response reflects persona traits"
+           points: 10
+
+ # Metadata for full harness evaluation (optional)
+ swebench_metadata:
+   fail_to_pass: ["tests/test_blueprints.py::test_empty_name_not_allowed"]
+   environment_version: "2.3"
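The optional swebench_metadata block is what lets a full harness close the loop: apply a candidate fix in a checkout of the repo, then run only the fail_to_pass tests and confirm they now pass. A sketch of that step (the test node ID comes from the metadata above; the surrounding harness is hypothetical):

```python
import subprocess

# Run inside a pallets/flask checkout after applying the candidate patch.
fail_to_pass = ["tests/test_blueprints.py::test_empty_name_not_allowed"]
result = subprocess.run(["pytest", *fail_to_pass])
print("fix verified" if result.returncode == 0 else "tests still failing")
```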
package/scenarios/swe-bench/import-swebench.py
@@ -0,0 +1,246 @@
+ #!/usr/bin/env python3
+ """
+ Import SWE-bench Verified scenarios into Pennyfarthing format.
+
+ Usage:
+     python import-swebench.py
+
+ This script:
+ 1. Loads the cached SWE-bench data from /tmp/swebench_all.json
+ 2. Selects a stratified sample across difficulty levels (see DEFAULT_COUNTS)
+ 3. Generates Pennyfarthing scenario YAML files
+ """
+
+ import json
+ import os
+ import re
+ import sys
+ from pathlib import Path
+ from textwrap import dedent, indent
+
+ # Difficulty mapping: SWE-bench -> Pennyfarthing
+ DIFFICULTY_MAP = {
+     "<15 min fix": "easy",
+     "15 min - 1 hour": "medium",
+     "1-4 hours": "hard",
+     ">4 hours": "extreme"
+ }
+
+ # How many to import per difficulty level
+ DEFAULT_COUNTS = {
+     "easy": 5,
+     "medium": 5,
+     "hard": 3,
+     "extreme": 2
+ }
+
+ def sanitize_name(instance_id: str) -> str:
+     """Convert instance_id to a valid filename."""
+     # astropy__astropy-12907 -> astropy-12907
+     parts = instance_id.split("__")
+     if len(parts) == 2:
+         return parts[1].lower()
+     return instance_id.lower().replace("__", "-")
+
+ def truncate_text(text: str, max_chars: int = 4000) -> str:
+     """Truncate text to max characters, preserving complete lines."""
+     if len(text) <= max_chars:
+         return text
+     truncated = text[:max_chars]
+     # Find last newline to avoid cutting mid-line
+     last_nl = truncated.rfind('\n')
+     if last_nl > max_chars * 0.8:
+         truncated = truncated[:last_nl]
+     return truncated + "\n\n[... truncated for brevity ...]"
+
+ def generate_scenario_yaml(instance: dict) -> str:
+     """Generate Pennyfarthing scenario YAML from a SWE-bench instance."""
+
+     instance_id = instance['instance_id']
+     repo = instance['repo']
+     problem = instance['problem_statement']
+     swe_difficulty = instance.get('difficulty', '15 min - 1 hour')
+     pf_difficulty = DIFFICULTY_MAP.get(swe_difficulty, 'medium')
+     base_commit = instance['base_commit'][:12]
+
+     # Parse repo for cleaner name
+     repo_name = repo.split('/')[-1]
+     scenario_name = sanitize_name(instance_id)
+
+     # Extract first line as title (usually issue title)
+     lines = problem.strip().split('\n')
+     title = lines[0][:80] if lines else f"Issue in {repo_name}"
+     # Clean markdown formatting from title
+     title = re.sub(r'[`*#]', '', title).strip()
+     if title.startswith('- '):
+         title = title[2:]
+
+     # Truncate problem statement for scenario
+     problem_truncated = truncate_text(problem, 6000)
+
+     yaml_content = f'''---
+ # SWE-bench Verified Scenario
+ # Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
+ # Instance: {instance_id}
+
+ name: {scenario_name}
+ title: "{title}"
+ category: dev
+ difficulty: {pf_difficulty} # SWE-bench: {swe_difficulty}
+ version: "1.0"
+
+ source:
+   benchmark: swe-bench-verified
+   instance_id: {instance_id}
+   repo: {repo}
+   base_commit: {base_commit}
+
+ description: |
+   Real GitHub issue from {repo} requiring code changes to resolve.
+   This is a human-validated problem from the SWE-bench Verified dataset.
+
+ prompt: |
+   You are working on the {repo} repository at commit {base_commit}.
+
+   A user has reported the following issue:
+
+   ---
+ {indent(problem_truncated, "  ")}
+   ---
+
+   Analyze this issue and provide:
+   1. Root cause analysis - what is causing the bug?
+   2. Proposed fix - what code changes would resolve this?
+   3. Test considerations - how would you verify the fix works?
+
+   Provide your response with specific file paths and code changes.
+
+ scoring:
+   # Adapted for SWE-bench bug-fix scenarios
+   categories:
+     - name: root_cause
+       weight: 30
+       description: "Correctly identifies the underlying cause of the bug"
+       criteria:
+         - id: IDENTIFIES_BUG_LOCATION
+           description: "Points to correct file(s) and function(s)"
+           points: 15
+         - id: EXPLAINS_WHY_BROKEN
+           description: "Explains why current code fails"
+           points: 15
+
+     - name: fix_quality
+       weight: 40
+       description: "Proposes a correct and complete fix"
+       criteria:
+         - id: FIX_ADDRESSES_ISSUE
+           description: "Fix would resolve the reported problem"
+           points: 20
+         - id: FIX_IS_MINIMAL
+           description: "Fix is appropriately scoped, not over-engineered"
+           points: 10
+         - id: FIX_SYNTAX_CORRECT
+           description: "Code changes are syntactically valid"
+           points: 10
+
+     - name: completeness
+       weight: 20
+       description: "Considers edge cases and testing"
+       criteria:
+         - id: EDGE_CASES
+           description: "Considers related scenarios that might break"
+           points: 10
+         - id: TEST_COVERAGE
+           description: "Suggests appropriate test cases"
+           points: 10
+
+     - name: persona
+       weight: 10
+       description: "Maintains character while solving"
+       criteria:
+         - id: IN_CHARACTER
+           description: "Response reflects persona traits"
+           points: 10
+
+ # Metadata for full harness evaluation (optional)
+ swebench_metadata:
+   fail_to_pass: {json.dumps(json.loads(instance.get('FAIL_TO_PASS', '[]'))[:3])}
+   environment_version: "{instance.get('version', 'unknown')}"
+ '''
+
+     return yaml_content
+
+ def select_scenarios(all_instances: list, counts: dict) -> list:
+     """Select a stratified sample of scenarios."""
+     selected = []
+
+     # Group by difficulty
+     by_difficulty = {}
+     for inst in all_instances:
+         swe_diff = inst.get('difficulty', '15 min - 1 hour')
+         pf_diff = DIFFICULTY_MAP.get(swe_diff, 'medium')
+         by_difficulty.setdefault(pf_diff, []).append(inst)
+
+     # Select from each difficulty, preferring repo diversity
+     for difficulty, count in counts.items():
+         available = by_difficulty.get(difficulty, [])
+
+         # Take at most one instance per repo first, for diversity
+         seen_repos = set()
+         diverse_selection = []
+         for inst in available:
+             repo = inst['repo']
+             if repo not in seen_repos:
+                 diverse_selection.append(inst)
+                 seen_repos.add(repo)
+             if len(diverse_selection) >= count:
+                 break
+
+         # If we don't have enough diverse repos, add more from any repo
+         if len(diverse_selection) < count:
+             for inst in available:
+                 if inst not in diverse_selection:
+                     diverse_selection.append(inst)
+                 if len(diverse_selection) >= count:
+                     break
+
+         selected.extend(diverse_selection[:count])
+
+     return selected
+
+ def main():
+     # Load cached data
+     cache_file = Path('/tmp/swebench_all.json')
+     if not cache_file.exists():
+         print("Error: Run the download script first to create /tmp/swebench_all.json")
+         sys.exit(1)
+
+     with open(cache_file) as f:
+         all_instances = json.load(f)
+
+     print(f"Loaded {len(all_instances)} SWE-bench instances")
+
+     # Select scenarios
+     selected = select_scenarios(all_instances, DEFAULT_COUNTS)
+     print(f"Selected {len(selected)} scenarios for import")
+
+     # Output directory
+     output_dir = Path(__file__).parent
+
+     # Generate YAML files
+     for inst in selected:
+         scenario_name = sanitize_name(inst['instance_id'])
+         output_file = output_dir / f"{scenario_name}.yaml"
+
+         yaml_content = generate_scenario_yaml(inst)
+
+         with open(output_file, 'w') as f:
+             f.write(yaml_content)
+
+         pf_diff = DIFFICULTY_MAP.get(inst.get('difficulty', '15 min - 1 hour'), 'medium')
+         print(f"  Created: {scenario_name}.yaml ({pf_diff})")
+
+     print(f"\nImport complete! {len(selected)} scenarios created in {output_dir}")
+
+ if __name__ == '__main__':
+     main()
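For reference, the instance-ID mapping that sanitize_name performs can be checked quickly. Because the script's hyphenated filename isn't importable as a module, a sketch using importlib (illustrative only):

```python
import importlib.util

# Load the script under a valid module name despite the hyphenated filename;
# the __main__ guard keeps main() from running during exec_module.
spec = importlib.util.spec_from_file_location("import_swebench", "import-swebench.py")
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)

assert mod.sanitize_name("astropy__astropy-12907") == "astropy-12907"
assert mod.sanitize_name("pallets__flask-5014") == "flask-5014"
assert mod.sanitize_name("a__b__c") == "a-b-c"
```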