@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
package/scenarios/swe-bench/requests-2931.yaml
@@ -0,0 +1,98 @@
+ ---
+ # SWE-bench Verified Scenario
+ # Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
+ # Instance: psf__requests-2931
+
+ name: requests-2931
+ title: "Request with binary payload fails due to calling to_native_string"
+ category: dev
+ difficulty: medium # SWE-bench: 15 min - 1 hour
+ version: "1.0"
+
+ source:
+   benchmark: swe-bench-verified
+   instance_id: psf__requests-2931
+   repo: psf/requests
+   base_commit: 5f7a3a74aab1
+
+ description: |
+   Real GitHub issue from psf/requests requiring code changes to resolve.
+   This is a human-validated problem from the SWE-bench Verified dataset.
+
+ prompt: |
+   You are working on the psf/requests repository at commit 5f7a3a74aab1.
+
+   A user has reported the following issue:
+
+   ---
+   Request with binary payload fails due to calling to_native_string
+   Introduced with https://github.com/kennethreitz/requests/issues/2844
+
+   ```
+   import requests
+   requests.put("http://httpbin.org/put", data=u"ööö".encode("utf-8"))
+   ```
+
+   This works with 2.8.1, but not with 2.9.
+
+
+   ---
+
+   Analyze this issue and provide:
+   1. Root cause analysis - what is causing the bug?
+   2. Proposed fix - what code changes would resolve this?
+   3. Test considerations - how would you verify the fix works?
+
+   Provide your response with specific file paths and code changes.
+
+ scoring:
+   # Adapted for SWE-bench bug-fix scenarios
+   categories:
+     - name: root_cause
+       weight: 30
+       description: "Correctly identifies the underlying cause of the bug"
+       criteria:
+         - id: IDENTIFIES_BUG_LOCATION
+           description: "Points to correct file(s) and function(s)"
+           points: 15
+         - id: EXPLAINS_WHY_BROKEN
+           description: "Explains why current code fails"
+           points: 15
+
+     - name: fix_quality
+       weight: 40
+       description: "Proposes a correct and complete fix"
+       criteria:
+         - id: FIX_ADDRESSES_ISSUE
+           description: "Fix would resolve the reported problem"
+           points: 20
+         - id: FIX_IS_MINIMAL
+           description: "Fix is appropriately scoped, not over-engineered"
+           points: 10
+         - id: FIX_SYNTAX_CORRECT
+           description: "Code changes are syntactically valid"
+           points: 10
+
+     - name: completeness
+       weight: 20
+       description: "Considers edge cases and testing"
+       criteria:
+         - id: EDGE_CASES
+           description: "Considers related scenarios that might break"
+           points: 10
+         - id: TEST_COVERAGE
+           description: "Suggests appropriate test cases"
+           points: 10
+
+     - name: persona
+       weight: 10
+       description: "Maintains character while solving"
+       criteria:
+         - id: IN_CHARACTER
+           description: "Response reflects persona traits"
+           points: 10
+
+ # Metadata for full harness evaluation (optional)
+ swebench_metadata:
+   fail_to_pass: ["test_requests.py::TestRequests::test_binary_put"]
+   environment_version: "2.9"
package/scenarios/swe-bench/seaborn-3069.yaml
@@ -0,0 +1,102 @@
+ ---
+ # SWE-bench Verified Scenario
+ # Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
+ # Instance: mwaskom__seaborn-3069
+
+ name: seaborn-3069
+ title: "Nominal scale should be drawn the same way as categorical scales"
+ category: dev
+ difficulty: medium # SWE-bench: 15 min - 1 hour
+ version: "1.0"
+
+ source:
+   benchmark: swe-bench-verified
+   instance_id: mwaskom__seaborn-3069
+   repo: mwaskom/seaborn
+   base_commit: 54cab15bdacf
+
+ description: |
+   Real GitHub issue from mwaskom/seaborn requiring code changes to resolve.
+   This is a human-validated problem from the SWE-bench Verified dataset.
+
+ prompt: |
+   You are working on the mwaskom/seaborn repository at commit 54cab15bdacf.
+
+   A user has reported the following issue:
+
+   ---
+   Nominal scale should be drawn the same way as categorical scales
+   Three distinctive things happen on the categorical axis in seaborn's categorical plots:
+
+   1. The scale is drawn to +/- 0.5 from the first and last tick, rather than using the normal margin logic
+   2. A grid is not shown, even when it otherwise would be with the active style
+   3. If on the y axis, the axis is inverted
+
+   It probably makes sense to have `so.Nominal` scales (including inferred ones) do this too. Some comments on implementation:
+
+   1. This is actually trickier than you'd think; I may have posted an issue over in matplotlib about this at one point, or just discussed on their gitter. I believe the suggested approach is to add an invisible artist with sticky edges and set the margin to 0. Feels like a hack! I might have looked into setting the sticky edges _on the spine artist_ at one point?
+
+   2. Probably straightforward to do in `Plotter._finalize_figure`. Always a good idea? How do we defer to the theme if the user wants to force a grid? Should the grid be something that is set in the scale object itself
+
+   3. Probably straightforward to implement but I am not exactly sure where would be best.
+
+   ---
+
+   Analyze this issue and provide:
+   1. Root cause analysis - what is causing the bug?
+   2. Proposed fix - what code changes would resolve this?
+   3. Test considerations - how would you verify the fix works?
+
+   Provide your response with specific file paths and code changes.
+
+ scoring:
+   # Adapted for SWE-bench bug-fix scenarios
+   categories:
+     - name: root_cause
+       weight: 30
+       description: "Correctly identifies the underlying cause of the bug"
+       criteria:
+         - id: IDENTIFIES_BUG_LOCATION
+           description: "Points to correct file(s) and function(s)"
+           points: 15
+         - id: EXPLAINS_WHY_BROKEN
+           description: "Explains why current code fails"
+           points: 15
+
+     - name: fix_quality
+       weight: 40
+       description: "Proposes a correct and complete fix"
+       criteria:
+         - id: FIX_ADDRESSES_ISSUE
+           description: "Fix would resolve the reported problem"
+           points: 20
+         - id: FIX_IS_MINIMAL
+           description: "Fix is appropriately scoped, not over-engineered"
+           points: 10
+         - id: FIX_SYNTAX_CORRECT
+           description: "Code changes are syntactically valid"
+           points: 10
+
+     - name: completeness
+       weight: 20
+       description: "Considers edge cases and testing"
+       criteria:
+         - id: EDGE_CASES
+           description: "Considers related scenarios that might break"
+           points: 10
+         - id: TEST_COVERAGE
+           description: "Suggests appropriate test cases"
+           points: 10
+
+     - name: persona
+       weight: 10
+       description: "Maintains character while solving"
+       criteria:
+         - id: IN_CHARACTER
+           description: "Response reflects persona traits"
+           points: 10
+
+ # Metadata for full harness evaluation (optional)
+ swebench_metadata:
+   fail_to_pass: ["tests/_core/test_plot.py::TestScaling::test_nominal_x_axis_tweaks", "tests/_core/test_plot.py::TestScaling::test_nominal_y_axis_tweaks"]
+   environment_version: "0.12"
package/scenarios/swe-bench/sphinx-7590.yaml
@@ -0,0 +1,108 @@
+ ---
+ # SWE-bench Verified Scenario
+ # Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
+ # Instance: sphinx-doc__sphinx-7590
+
+ name: sphinx-7590
+ title: "C++ User Defined Literals not supported"
+ category: dev
+ difficulty: extreme # SWE-bench: >4 hours
+ version: "1.0"
+
+ source:
+   benchmark: swe-bench-verified
+   instance_id: sphinx-doc__sphinx-7590
+   repo: sphinx-doc/sphinx
+   base_commit: 2e506c5ab457
+
+ description: |
+   Real GitHub issue from sphinx-doc/sphinx requiring code changes to resolve.
+   This is a human-validated problem from the SWE-bench Verified dataset.
+
+ prompt: |
+   You are working on the sphinx-doc/sphinx repository at commit 2e506c5ab457.
+
+   A user has reported the following issue:
+
+   ---
+   C++ User Defined Literals not supported
+   The code as below
+
+   ```cpp
+   namespace units::si {
+
+   inline constexpr auto planck_constant = 6.62607015e-34q_J * 1q_s;
+
+   }
+   ```
+
+   causes the following error:
+
+   ```
+   WARNING: Invalid definition: Expected end of definition. [error at 58]
+   [build] constexpr auto units::si::planck_constant = 6.62607015e-34q_J * 1q_s
+   [build] ----------------------------------------------------------^
+   ```
+
+   According to <https://github.com/sphinx-doc/sphinx/blob/3.x/sphinx/domains/cpp.py#L4770> Sphinx seems to not have features for UDLs. Could you please add those?
+
+   ---
+
+   Analyze this issue and provide:
+   1. Root cause analysis - what is causing the bug?
+   2. Proposed fix - what code changes would resolve this?
+   3. Test considerations - how would you verify the fix works?
+
+   Provide your response with specific file paths and code changes.
+
+ scoring:
+   # Adapted for SWE-bench bug-fix scenarios
+   categories:
+     - name: root_cause
+       weight: 30
+       description: "Correctly identifies the underlying cause of the bug"
+       criteria:
+         - id: IDENTIFIES_BUG_LOCATION
+           description: "Points to correct file(s) and function(s)"
+           points: 15
+         - id: EXPLAINS_WHY_BROKEN
+           description: "Explains why current code fails"
+           points: 15
+
+     - name: fix_quality
+       weight: 40
+       description: "Proposes a correct and complete fix"
+       criteria:
+         - id: FIX_ADDRESSES_ISSUE
+           description: "Fix would resolve the reported problem"
+           points: 20
+         - id: FIX_IS_MINIMAL
+           description: "Fix is appropriately scoped, not over-engineered"
+           points: 10
+         - id: FIX_SYNTAX_CORRECT
+           description: "Code changes are syntactically valid"
+           points: 10
+
+     - name: completeness
+       weight: 20
+       description: "Considers edge cases and testing"
+       criteria:
+         - id: EDGE_CASES
+           description: "Considers related scenarios that might break"
+           points: 10
+         - id: TEST_COVERAGE
+           description: "Suggests appropriate test cases"
+           points: 10
+
+     - name: persona
+       weight: 10
+       description: "Maintains character while solving"
+       criteria:
+         - id: IN_CHARACTER
+           description: "Response reflects persona traits"
+           points: 10
+
+ # Metadata for full harness evaluation (optional)
+ swebench_metadata:
+   fail_to_pass: ["tests/test_domain_cpp.py::test_expressions"]
+   environment_version: "3.1"
package/scenarios/swe-bench/xarray-3993.yaml
@@ -0,0 +1,104 @@
+ ---
+ # SWE-bench Verified Scenario
+ # Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
+ # Instance: pydata__xarray-3993
+
+ name: xarray-3993
+ title: "DataArray.integrate has a 'dim' arg, but Dataset.integrate has a 'coord' arg"
+ category: dev
+ difficulty: hard # SWE-bench: 1-4 hours
+ version: "1.0"
+
+ source:
+   benchmark: swe-bench-verified
+   instance_id: pydata__xarray-3993
+   repo: pydata/xarray
+   base_commit: 8cc34cb412ba
+
+ description: |
+   Real GitHub issue from pydata/xarray requiring code changes to resolve.
+   This is a human-validated problem from the SWE-bench Verified dataset.
+
+ prompt: |
+   You are working on the pydata/xarray repository at commit 8cc34cb412ba.
+
+   A user has reported the following issue:
+
+   ---
+   DataArray.integrate has a 'dim' arg, but Dataset.integrate has a 'coord' arg
+   This is just a minor gripe but I think it should be fixed.
+
+   The API syntax is inconsistent:
+   ```python
+   ds.differentiate(coord='x')
+   da.differentiate(coord='x')
+   ds.integrate(coord='x')
+   da.integrate(dim='x') # why dim??
+   ```
+   It should definitely be `coord` - IMO it doesn't make sense to integrate or differentiate over a dim because a dim by definition has no information about the distance between grid points. I think because the distinction between dims and coords is one of the things that new users have to learn about, we should be strict to not confuse up the meanings in the documentation/API.
+
+   The discussion on the original PR [seems to agree](https://github.com/pydata/xarray/pull/2653#discussion_r246164990), so I think this was just an small oversight.
+
+   The only question is whether it requires a deprecation cycle?
+
+
+   ---
+
+   Analyze this issue and provide:
+   1. Root cause analysis - what is causing the bug?
+   2. Proposed fix - what code changes would resolve this?
+   3. Test considerations - how would you verify the fix works?
+
+   Provide your response with specific file paths and code changes.
+
+ scoring:
+   # Adapted for SWE-bench bug-fix scenarios
+   categories:
+     - name: root_cause
+       weight: 30
+       description: "Correctly identifies the underlying cause of the bug"
+       criteria:
+         - id: IDENTIFIES_BUG_LOCATION
+           description: "Points to correct file(s) and function(s)"
+           points: 15
+         - id: EXPLAINS_WHY_BROKEN
+           description: "Explains why current code fails"
+           points: 15
+
+     - name: fix_quality
+       weight: 40
+       description: "Proposes a correct and complete fix"
+       criteria:
+         - id: FIX_ADDRESSES_ISSUE
+           description: "Fix would resolve the reported problem"
+           points: 20
+         - id: FIX_IS_MINIMAL
+           description: "Fix is appropriately scoped, not over-engineered"
+           points: 10
+         - id: FIX_SYNTAX_CORRECT
+           description: "Code changes are syntactically valid"
+           points: 10
+
+     - name: completeness
+       weight: 20
+       description: "Considers edge cases and testing"
+       criteria:
+         - id: EDGE_CASES
+           description: "Considers related scenarios that might break"
+           points: 10
+         - id: TEST_COVERAGE
+           description: "Suggests appropriate test cases"
+           points: 10
+
+     - name: persona
+       weight: 10
+       description: "Maintains character while solving"
+       criteria:
+         - id: IN_CHARACTER
+           description: "Response reflects persona traits"
+           points: 10
+
+ # Metadata for full harness evaluation (optional)
+ swebench_metadata:
+   fail_to_pass: ["xarray/tests/test_dataset.py::test_integrate[True]", "xarray/tests/test_dataset.py::test_integrate[False]"]
+   environment_version: "0.12"
package/scenarios/swe-bench/xarray-6992.yaml
@@ -0,0 +1,136 @@
+ ---
+ # SWE-bench Verified Scenario
+ # Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
+ # Instance: pydata__xarray-6992
+
+ name: xarray-6992
+ title: "index refactor: more _coord_names than _variables on Dataset"
+ category: dev
+ difficulty: extreme # SWE-bench: >4 hours
+ version: "1.0"
+
+ source:
+   benchmark: swe-bench-verified
+   instance_id: pydata__xarray-6992
+   repo: pydata/xarray
+   base_commit: 45c0a114e2b7
+
+ description: |
+   Real GitHub issue from pydata/xarray requiring code changes to resolve.
+   This is a human-validated problem from the SWE-bench Verified dataset.
+
+ prompt: |
+   You are working on the pydata/xarray repository at commit 45c0a114e2b7.
+
+   A user has reported the following issue:
+
+   ---
+   index refactor: more `_coord_names` than `_variables` on Dataset
+   ### What happened?
+
+   `xr.core.dataset.DataVariables` assumes that everything that is in `ds._dataset._variables` and not in `self._dataset._coord_names` is a "data variable". However, since the index refactor we can end up with more `_coord_names` than `_variables` which breaks a number of stuff (e.g. the repr).
+
+   ### What did you expect to happen?
+
+   Well it seems this assumption is now wrong.
+
+   ### Minimal Complete Verifiable Example
+
+   ```Python
+   ds = xr.Dataset(coords={"a": ("x", [1, 2, 3]), "b": ("x", ['a', 'b', 'c'])})
+   ds.set_index(z=['a', 'b']).reset_index("z", drop=True)
+   ```
+
+
+   ### MVCE confirmation
+
+   - [ ] Minimal example — the example is as focused as reasonably possible to demonstrate the underlying issue in xarray.
+   - [ ] Complete example — the example is self-contained, including all data and the text of any traceback.
+   - [ ] Verifiable example — the example copy & pastes into an IPython prompt or [Binder notebook](https://mybinder.org/v2/gh/pydata/xarray/main?urlpath=lab/tree/doc/examples/blank_template.ipynb), returning the result.
+   - [ ] New issue — a search of GitHub Issues suggests this is not a duplicate.
+
+   ### Relevant log output
+
+   ```Python
+   ValueError: __len__() should return >= 0
+   ```
+
+
+   ### Anything else we need to know?
+
+   The error comes from here
+
+   https://github.com/pydata/xarray/blob/63ba862d03c8d0cd8b44d2071bc360e9fed4519d/xarray/core/dataset.py#L368
+
+   Bisected to #5692 - which probably does not help too much.
+
+
+   ### Environment
+
+   <details>
+
+
+
+   </details>
+
+
+   ---
+
+   Analyze this issue and provide:
+   1. Root cause analysis - what is causing the bug?
+   2. Proposed fix - what code changes would resolve this?
+   3. Test considerations - how would you verify the fix works?
+
+   Provide your response with specific file paths and code changes.
+
+ scoring:
+   # Adapted for SWE-bench bug-fix scenarios
+   categories:
+     - name: root_cause
+       weight: 30
+       description: "Correctly identifies the underlying cause of the bug"
+       criteria:
+         - id: IDENTIFIES_BUG_LOCATION
+           description: "Points to correct file(s) and function(s)"
+           points: 15
+         - id: EXPLAINS_WHY_BROKEN
+           description: "Explains why current code fails"
+           points: 15
+
+     - name: fix_quality
+       weight: 40
+       description: "Proposes a correct and complete fix"
+       criteria:
+         - id: FIX_ADDRESSES_ISSUE
+           description: "Fix would resolve the reported problem"
+           points: 20
+         - id: FIX_IS_MINIMAL
+           description: "Fix is appropriately scoped, not over-engineered"
+           points: 10
+         - id: FIX_SYNTAX_CORRECT
+           description: "Code changes are syntactically valid"
+           points: 10
+
+     - name: completeness
+       weight: 20
+       description: "Considers edge cases and testing"
+       criteria:
+         - id: EDGE_CASES
+           description: "Considers related scenarios that might break"
+           points: 10
+         - id: TEST_COVERAGE
+           description: "Suggests appropriate test cases"
+           points: 10
+
+     - name: persona
+       weight: 10
+       description: "Maintains character while solving"
+       criteria:
+         - id: IN_CHARACTER
+           description: "Response reflects persona traits"
+           points: 10
+
+ # Metadata for full harness evaluation (optional)
+ swebench_metadata:
+   fail_to_pass: ["xarray/tests/test_dataarray.py::TestDataArray::test_reset_index", "xarray/tests/test_dataset.py::TestDataset::test_reset_index", "xarray/tests/test_dataset.py::TestDataset::test_reset_index_drop_dims"]
+   environment_version: "2022.06"