devlyn-cli 2.2.2 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +2 -2
- package/CLAUDE.md +4 -4
- package/README.md +85 -34
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
- package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
- package/benchmark/auto-resolve/README.md +307 -44
- package/benchmark/auto-resolve/RUBRIC.md +23 -14
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
- package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
- package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
- package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
- package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
- package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
- package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
- package/benchmark/auto-resolve/scripts/judge.sh +153 -26
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
- package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
- package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
- package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
- package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
- package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
- package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
- package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
- package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
- package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
- package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
- package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
- package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
- package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
- package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
- package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
- package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
- package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
- package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
- package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
- package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
- package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
- package/bin/devlyn.js +221 -17
- package/config/skills/_shared/adapters/README.md +3 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
- package/config/skills/_shared/adapters/opus-4-7.md +9 -1
- package/config/skills/_shared/archive_run.py +78 -6
- package/config/skills/_shared/codex-config.md +5 -4
- package/config/skills/_shared/codex-monitored.sh +46 -1
- package/config/skills/_shared/collect-codex-findings.py +20 -5
- package/config/skills/_shared/engine-preflight.md +17 -13
- package/config/skills/_shared/runtime-principles.md +6 -9
- package/config/skills/_shared/spec-verify-check.py +2664 -107
- package/config/skills/_shared/verify-merge-findings.py +1369 -19
- package/config/skills/devlyn:design-ui/SKILL.md +364 -0
- package/config/skills/devlyn:ideate/SKILL.md +7 -4
- package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
- package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
- package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
- package/config/skills/devlyn:resolve/SKILL.md +78 -26
- package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
- package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
- package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
- package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
- package/package.json +47 -2
- package/scripts/lint-fixtures.sh +349 -0
- package/scripts/lint-shadow-fixtures.sh +58 -0
- package/scripts/lint-skills.sh +3645 -95
|
@@ -15,6 +15,50 @@ The report is the output of `npx devlyn-cli benchmark`. Ship-gate.py consumes su
|
|
|
15
15
|
from __future__ import annotations
|
|
16
16
|
import argparse, json, pathlib, sys, subprocess, datetime
|
|
17
17
|
|
|
18
|
+
SCRIPT_DIR = pathlib.Path(__file__).resolve().parent
|
|
19
|
+
if str(SCRIPT_DIR) not in sys.path:
|
|
20
|
+
sys.path.insert(0, str(SCRIPT_DIR))
|
|
21
|
+
|
|
22
|
+
from pair_evidence_contract import is_score, is_strict_number, loads_strict_json_object
|
|
23
|
+
|
|
24
|
+
KNOWN_ARMS = {"variant", "solo_claude", "bare"}
|
|
25
|
+
PASS_VERDICTS = {"PASS", "PASS_WITH_ISSUES"}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def verify_score_clean(value) -> bool:
    """Tell whether *value* counts as a clean verify score.

    A score is clean when ``is_strict_number`` accepts it and it is at
    least ``1.0``.
    """
    accepted = is_strict_number(value)
    # Only compare once the value is known to be a strict number.
    return accepted and value >= 1.0
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def exact_bool(value):
    """Parse a tri-state boolean field.

    Returns the value itself when it is a real ``bool``, ``False`` when the
    field is absent (``None``), and ``None`` to signal a malformed value of
    any other type (for example ``1``, ``0`` or ``"true"``).
    """
    if value is None:
        return False
    # bool cannot be subclassed, so this matches exactly True and False.
    return value if isinstance(value, bool) else None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def skill_verdict_clean(result: dict, arm: str) -> bool:
    """Tell whether an arm's recorded verdicts are all passing.

    The ``"bare"`` arm is exempt from the verdict check and is always
    reported clean. For any other arm both ``terminal_verdict`` and
    ``verify_verdict`` in *result* must be members of ``PASS_VERDICTS``.

    Args:
        result: Parsed result payload for the arm.
        arm: Arm name.

    Returns:
        ``True`` when the arm is clean, ``False`` otherwise. Missing or
        non-string verdicts are reported as not clean.
    """
    if arm == "bare":
        return True
    for field in ("terminal_verdict", "verify_verdict"):
        verdict = result.get(field)
        # A malformed payload may carry a list/object here; testing an
        # unhashable value against a set raises TypeError, so reject any
        # non-string before the membership check.
        if not isinstance(verdict, str) or verdict not in PASS_VERDICTS:
            return False
    return True
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def utc_now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``.

    The timestamp is timezone-aware, truncated to whole seconds, and uses a
    trailing ``Z`` in place of the ``+00:00`` offset.
    """
    # datetime.timezone.utc is the object datetime.UTC aliases; unlike the
    # alias (added in Python 3.11) it is available on older interpreters.
    now = datetime.datetime.now(datetime.timezone.utc)
    return now.isoformat(timespec="seconds").replace("+00:00", "Z")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def load_dict_json(path: pathlib.Path) -> dict:
|
|
54
|
+
if not path.exists():
|
|
55
|
+
return {}
|
|
56
|
+
try:
|
|
57
|
+
data = loads_strict_json_object(path.read_text())
|
|
58
|
+
except (ValueError, json.JSONDecodeError):
|
|
59
|
+
return {}
|
|
60
|
+
return data
|
|
61
|
+
|
|
18
62
|
|
|
19
63
|
def git_sha() -> str:
|
|
20
64
|
try:
|
|
@@ -30,6 +74,58 @@ def git_branch() -> str:
|
|
|
30
74
|
return "unknown"
|
|
31
75
|
|
|
32
76
|
|
|
77
|
+
def axis_validation_breakdown(judge: dict):
    """Attribute the judge's out-of-range axis cells to benchmark arms.

    Reads ``judge["_axis_validation"]`` (keys ``out_of_range_cells`` and
    ``out_of_range_count``) and ``judge["_blind_mapping"]``. Each cell's
    ``breakdown`` name (``a_breakdown`` / ``b_breakdown`` / ``c_breakdown``)
    is translated to a blind letter and then, through the blind mapping, to
    an arm in ``KNOWN_ARMS``.

    Args:
        judge: Parsed judge payload. Sub-structures of the wrong type are
            tolerated and treated as empty or unmapped.

    Returns:
        A ``(by_arm, unmapped_count, unmapped_cells)`` tuple:

        * ``by_arm`` maps arm name to the list of cells attributed to it.
        * ``unmapped_count`` is the number of invalid cells that could not
          be attributed, using the larger of the declared count and the
          number of listed cells as the total.
        * ``unmapped_cells`` lists the unattributed cells, padded with
          placeholder ``{"reason": ...}`` entries when the declared count
          exceeds the cells actually listed.
    """
    raw_validation = judge.get("_axis_validation")
    validation = raw_validation if isinstance(raw_validation, dict) else {}
    cells = validation.get("out_of_range_cells") or []
    declared_count = validation.get("out_of_range_count")
    # Trust whichever is larger: the declared count or the listed cells.
    total_invalid = max(
        declared_count if isinstance(declared_count, int) else 0,
        len(cells) if isinstance(cells, list) else 0,
    )
    raw_blind_mapping = judge.get("_blind_mapping")
    blind_mapping = raw_blind_mapping if isinstance(raw_blind_mapping, dict) else {}
    breakdown_to_letter = {
        "a_breakdown": "A",
        "b_breakdown": "B",
        "c_breakdown": "C",
    }
    by_arm = {}
    mapped_count = 0
    unmapped_cells = []
    if not isinstance(cells, list):
        return by_arm, total_invalid, [{"reason": "out_of_range_cells is not a list"}]
    for cell in cells:
        if not isinstance(cell, dict):
            unmapped_cells.append(cell)
            continue
        breakdown = cell.get("breakdown")
        # Malformed JSON may put a list/object here; dict lookups and set
        # membership raise TypeError on unhashable values, so only strings
        # are looked up and everything else falls through to "unmapped".
        letter = breakdown_to_letter.get(breakdown) if isinstance(breakdown, str) else None
        arm = blind_mapping.get(letter) if letter else None
        if isinstance(arm, str) and arm in KNOWN_ARMS:
            by_arm.setdefault(arm, []).append(cell)
            mapped_count += 1
        else:
            unmapped_cells.append(cell)
    unmapped_count = max(0, total_invalid - mapped_count)
    shortfall = unmapped_count - len(unmapped_cells)
    if shortfall > 0:
        # The declared count promised more invalid cells than were listed;
        # pad so the list length matches the reported count.
        unmapped_cells.extend(
            {"reason": "out_of_range_count exceeds mapped cells"}
            for _ in range(shortfall)
        )
    return by_arm, unmapped_count, unmapped_cells
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def blind_mapped_arms(judge: dict) -> set[str]:
    """Collect the arm names the judge's blind mapping assigns to A/B/C.

    Args:
        judge: Parsed judge payload; ``_blind_mapping`` is read from it.

    Returns:
        The set of string values stored under the keys ``"A"``, ``"B"`` and
        ``"C"``. Returns an empty set when ``_blind_mapping`` is missing or
        not a dict.
    """
    mapping = judge.get("_blind_mapping")
    if not isinstance(mapping, dict):
        return set()
    letters = ("A", "B", "C")
    # Keep only string arms: a malformed mapping may hold lists/objects,
    # which are unhashable and would make the set construction raise.
    return {
        arm
        for key, arm in mapping.items()
        if key in letters and isinstance(arm, str)
    }
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def strict_number(value):
    """Pass *value* through when ``is_strict_number`` accepts it, else ``None``."""
    if is_strict_number(value):
        return value
    return None
|
|
127
|
+
|
|
128
|
+
|
|
33
129
|
def main() -> int:
|
|
34
130
|
p = argparse.ArgumentParser()
|
|
35
131
|
p.add_argument("--run-id", required=True)
|
|
@@ -49,26 +145,34 @@ def main() -> int:
|
|
|
49
145
|
if not judge_path.exists():
|
|
50
146
|
rows.append({"fixture": fid, "status": "NO_JUDGE", "reason": "judge.json missing"})
|
|
51
147
|
continue
|
|
52
|
-
judge =
|
|
148
|
+
judge = load_dict_json(judge_path)
|
|
53
149
|
# iter-0019: 3-arm aware. judge.json now carries scores_by_arm /
|
|
54
150
|
# findings_by_arm / disqualifiers_by_arm / margins. Older judge.json
|
|
55
|
-
#
|
|
56
|
-
#
|
|
57
|
-
|
|
151
|
+
# can populate legacy fields, but any score still requires a matching
|
|
152
|
+
# _blind_mapping arm before downstream consumers may trust it.
|
|
153
|
+
raw_scores_by_arm = judge.get("scores_by_arm")
|
|
154
|
+
scores_by_arm = raw_scores_by_arm if isinstance(raw_scores_by_arm, dict) else {}
|
|
58
155
|
if not scores_by_arm:
|
|
59
|
-
if "variant_score"
|
|
156
|
+
if is_score(judge.get("variant_score")):
|
|
60
157
|
scores_by_arm["variant"] = judge["variant_score"]
|
|
61
|
-
if "bare_score"
|
|
158
|
+
if is_score(judge.get("bare_score")):
|
|
62
159
|
scores_by_arm["bare"] = judge["bare_score"]
|
|
63
160
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
161
|
+
raw_findings_by_arm = judge.get("findings_by_arm")
|
|
162
|
+
findings_by_arm = raw_findings_by_arm if isinstance(raw_findings_by_arm, dict) else {}
|
|
163
|
+
raw_dq_by_arm = judge.get("disqualifiers_by_arm")
|
|
164
|
+
dq_by_arm = raw_dq_by_arm if isinstance(raw_dq_by_arm, dict) else {}
|
|
165
|
+
axis_invalid_by_arm, axis_unmapped_count, axis_unmapped_cells = axis_validation_breakdown(judge)
|
|
166
|
+
mapped_arms = blind_mapped_arms(judge)
|
|
167
|
+
trusted_scores_by_arm = {
|
|
168
|
+
arm: score for arm, score in scores_by_arm.items()
|
|
169
|
+
if arm in mapped_arms and is_score(score)
|
|
170
|
+
}
|
|
67
171
|
|
|
68
172
|
arm_results = {}
|
|
69
173
|
for arm in ("variant", "solo_claude", "bare"):
|
|
70
174
|
res_p = fdir / arm / "result.json"
|
|
71
|
-
arm_results[arm] =
|
|
175
|
+
arm_results[arm] = load_dict_json(res_p)
|
|
72
176
|
var_res = arm_results["variant"]
|
|
73
177
|
solo_res = arm_results["solo_claude"]
|
|
74
178
|
bare_res = arm_results["bare"]
|
|
@@ -77,12 +181,12 @@ def main() -> int:
|
|
|
77
181
|
category = "unknown"
|
|
78
182
|
if meta_p.exists():
|
|
79
183
|
try:
|
|
80
|
-
category =
|
|
184
|
+
category = load_dict_json(meta_p).get("category", "unknown")
|
|
81
185
|
except Exception:
|
|
82
186
|
pass
|
|
83
187
|
|
|
84
188
|
def wall_ratio(numer, denom):
|
|
85
|
-
if numer and denom:
|
|
189
|
+
if is_strict_number(numer) and is_strict_number(denom):
|
|
86
190
|
return round(numer / denom, 2)
|
|
87
191
|
return None
|
|
88
192
|
|
|
@@ -91,44 +195,93 @@ def main() -> int:
|
|
|
91
195
|
# A/B-letter shape if present).
|
|
92
196
|
def arm_dq_judge(arm: str):
|
|
93
197
|
if arm in dq_by_arm:
|
|
94
|
-
|
|
95
|
-
|
|
198
|
+
entry = dq_by_arm[arm]
|
|
199
|
+
value = entry.get("disqualifier") if isinstance(entry, dict) else entry
|
|
200
|
+
parsed = exact_bool(value)
|
|
201
|
+
return (parsed is True or parsed is None, parsed is None)
|
|
202
|
+
raw_mapping = judge.get("_blind_mapping")
|
|
203
|
+
mapping = raw_mapping if isinstance(raw_mapping, dict) else {}
|
|
96
204
|
for letter in ("A", "B", "C"):
|
|
97
205
|
if mapping.get(letter) == arm:
|
|
98
|
-
|
|
99
|
-
|
|
206
|
+
raw_dqs = judge.get("disqualifiers")
|
|
207
|
+
dqs = raw_dqs if isinstance(raw_dqs, dict) else {}
|
|
208
|
+
parsed = exact_bool(dqs.get(letter))
|
|
209
|
+
return (parsed is True or parsed is None, parsed is None)
|
|
210
|
+
return False, False
|
|
211
|
+
|
|
212
|
+
def critical_findings_for(arm: str):
|
|
213
|
+
entry = findings_by_arm.get(arm)
|
|
214
|
+
if isinstance(entry, list):
|
|
215
|
+
return entry
|
|
216
|
+
if entry:
|
|
217
|
+
return [entry]
|
|
218
|
+
return []
|
|
100
219
|
|
|
101
220
|
# Per-arm payload — arm absent = scores_by_arm key absent, downstream
|
|
102
221
|
# consumers null-check.
|
|
103
222
|
arms_block = {}
|
|
104
223
|
for arm in ("variant", "solo_claude", "bare"):
|
|
105
224
|
r = arm_results.get(arm) or {}
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
225
|
+
raw_score = scores_by_arm.get(arm)
|
|
226
|
+
score = trusted_scores_by_arm.get(arm)
|
|
227
|
+
blind_mapping_arm_missing = raw_score is not None and arm not in mapped_arms
|
|
228
|
+
judge_dq, judge_dq_malformed = arm_dq_judge(arm)
|
|
229
|
+
result_bool_values = {
|
|
230
|
+
field: exact_bool(r.get(field))
|
|
231
|
+
for field in ("disqualifier", "timed_out", "invoke_failure", "environment_contamination")
|
|
232
|
+
}
|
|
233
|
+
malformed_boolean_fields = [
|
|
234
|
+
field for field, value in result_bool_values.items() if value is None
|
|
235
|
+
]
|
|
236
|
+
det_dq = bool(
|
|
237
|
+
result_bool_values["disqualifier"] is True
|
|
238
|
+
or result_bool_values["timed_out"] is True
|
|
239
|
+
or result_bool_values["invoke_failure"] is True
|
|
240
|
+
or result_bool_values["environment_contamination"] is True
|
|
241
|
+
or bool(malformed_boolean_fields)
|
|
242
|
+
or not verify_score_clean(r.get("verify_score"))
|
|
243
|
+
or not skill_verdict_clean(r, arm)
|
|
244
|
+
or blind_mapping_arm_missing
|
|
245
|
+
)
|
|
109
246
|
arms_block[arm] = {
|
|
110
247
|
"score": score,
|
|
111
|
-
"wall_s": r.get("elapsed_seconds"),
|
|
112
|
-
"verify_score": r.get("verify_score"),
|
|
248
|
+
"wall_s": strict_number(r.get("elapsed_seconds")),
|
|
249
|
+
"verify_score": strict_number(r.get("verify_score")),
|
|
113
250
|
"files_changed": r.get("files_changed"),
|
|
114
|
-
"timed_out":
|
|
251
|
+
"timed_out": result_bool_values["timed_out"] is True,
|
|
252
|
+
"invoke_failure": result_bool_values["invoke_failure"] is True,
|
|
253
|
+
"invoke_failure_reason": r.get("invoke_failure_reason"),
|
|
254
|
+
"environment_contamination": result_bool_values["environment_contamination"] is True,
|
|
115
255
|
"disqualifier": judge_dq or det_dq,
|
|
116
256
|
"dq_judge": judge_dq,
|
|
257
|
+
"dq_judge_malformed": judge_dq_malformed,
|
|
117
258
|
"dq_deterministic": det_dq,
|
|
118
|
-
"
|
|
259
|
+
"malformed_boolean_fields": malformed_boolean_fields,
|
|
260
|
+
"blind_mapping_arm_missing": blind_mapping_arm_missing,
|
|
261
|
+
"critical_findings": critical_findings_for(arm),
|
|
262
|
+
"_axis_validation_out_of_range_count": len(axis_invalid_by_arm.get(arm, [])),
|
|
263
|
+
"_axis_validation_out_of_range_cells": axis_invalid_by_arm.get(arm, []),
|
|
119
264
|
}
|
|
120
265
|
|
|
121
|
-
# Pairwise margins
|
|
122
|
-
#
|
|
123
|
-
#
|
|
266
|
+
# Pairwise margins are derived from trusted mapped scores only. Cached
|
|
267
|
+
# judge-side margins are redundant and can be stale if a partial artifact
|
|
268
|
+
# is reused.
|
|
124
269
|
def m(left, right, key):
|
|
125
|
-
if
|
|
126
|
-
return
|
|
127
|
-
l =
|
|
270
|
+
if left not in mapped_arms or right not in mapped_arms:
|
|
271
|
+
return None
|
|
272
|
+
l = trusted_scores_by_arm.get(left); r2 = trusted_scores_by_arm.get(right)
|
|
128
273
|
if l is None or r2 is None:
|
|
129
274
|
return None
|
|
130
275
|
return l - r2
|
|
131
276
|
|
|
277
|
+
def trusted_winner():
|
|
278
|
+
winner = judge.get("winner_arm")
|
|
279
|
+
if winner == "tie":
|
|
280
|
+
return winner
|
|
281
|
+
if winner in trusted_scores_by_arm:
|
|
282
|
+
return winner
|
|
283
|
+
return None
|
|
284
|
+
|
|
132
285
|
row = {
|
|
133
286
|
"fixture": fid,
|
|
134
287
|
"category": category,
|
|
@@ -146,7 +299,9 @@ def main() -> int:
|
|
|
146
299
|
"solo_over_bare": wall_ratio(arms_block["solo_claude"]["wall_s"], arms_block["bare"]["wall_s"]),
|
|
147
300
|
"variant_over_solo": wall_ratio(arms_block["variant"]["wall_s"], arms_block["solo_claude"]["wall_s"]),
|
|
148
301
|
},
|
|
149
|
-
"winner":
|
|
302
|
+
"winner": trusted_winner(),
|
|
303
|
+
"_axis_validation_unmapped_out_of_range_count": axis_unmapped_count,
|
|
304
|
+
"_axis_validation_unmapped_out_of_range_cells": axis_unmapped_cells,
|
|
150
305
|
# Legacy fields preserved so older summary readers still parse.
|
|
151
306
|
"variant_score": arms_block["variant"]["score"],
|
|
152
307
|
"bare_score": arms_block["bare"]["score"],
|
|
@@ -204,18 +359,23 @@ def main() -> int:
|
|
|
204
359
|
# because the legacy ship-gate.py reads that. Pair-aware gates get
|
|
205
360
|
# added in iter-0021 / 0022 once the data shape stabilizes.
|
|
206
361
|
margin_ge_5 = sum(1 for r in gated_rows if (r.get("margin") or 0) >= 5)
|
|
207
|
-
disqualifier_count = sum(1 for r in
|
|
362
|
+
disqualifier_count = sum(1 for r in rows if r.get("variant_disqualifier"))
|
|
208
363
|
|
|
209
364
|
# arm-presence flags so consumers know whether the iter is 2-arm legacy
|
|
210
365
|
# or 3-arm post-iter-0019.
|
|
211
|
-
has_solo = any(
|
|
366
|
+
has_solo = any(
|
|
367
|
+
(arm := (r.get("arms", {}).get("solo_claude") or {})).get("score") is not None
|
|
368
|
+
or arm.get("wall_s") is not None
|
|
369
|
+
or bool(arm.get("disqualifier"))
|
|
370
|
+
for r in rows
|
|
371
|
+
)
|
|
212
372
|
|
|
213
373
|
summary = {
|
|
214
374
|
"run_id": args.run_id,
|
|
215
375
|
"label": args.label,
|
|
216
376
|
"git_sha": git_sha(),
|
|
217
377
|
"branch": git_branch(),
|
|
218
|
-
"completed_at":
|
|
378
|
+
"completed_at": utc_now_iso(),
|
|
219
379
|
"fixtures_total": len(rows),
|
|
220
380
|
"fixtures_scored": len(scored),
|
|
221
381
|
# Legacy 2-arm fields kept for ship-gate.py + history readers.
|
|
@@ -245,12 +405,12 @@ def main() -> int:
|
|
|
245
405
|
f"Branch: `{summary['branch']}`",
|
|
246
406
|
f"Git SHA: `{summary['git_sha'][:12]}`",
|
|
247
407
|
"",
|
|
248
|
-
"| Fixture | Category |
|
|
249
|
-
"
|
|
408
|
+
"| Fixture | Category | variant (L2) | solo_claude (L1) | bare (L0) | variant-bare | solo_claude-bare | variant-solo_claude | Winner | Wall variant/solo_claude/bare | Wall variant/solo_claude | Wall variant/bare |",
|
|
409
|
+
"|---------|----------|--------------|------------------|-----------|--------------|-------------------|----------------------|--------|--------------------------------|--------------------------|-------------------|",
|
|
250
410
|
]
|
|
251
411
|
for r in rows:
|
|
252
412
|
if r.get("variant_score") is None:
|
|
253
|
-
lines.append(f"| {r['fixture']} | — | — | — | — | — | — | — | NO_JUDGE | — | — |")
|
|
413
|
+
lines.append(f"| {r['fixture']} | — | — | — | — | — | — | — | NO_JUDGE | — | — | — |")
|
|
254
414
|
continue
|
|
255
415
|
arms = r.get("arms", {}) or {}
|
|
256
416
|
v = arms.get("variant", {}) or {}
|
|
@@ -267,11 +427,13 @@ def main() -> int:
|
|
|
267
427
|
def fmt_wall(arm):
|
|
268
428
|
return f"{arm['wall_s']}s" if arm.get("wall_s") else "?"
|
|
269
429
|
l2_l0_wall = f"{wallr.get('variant_over_bare'):.1f}x" if wallr.get("variant_over_bare") else "—"
|
|
430
|
+
l2_l1_wall = f"{wallr.get('variant_over_solo'):.1f}x" if wallr.get("variant_over_solo") else "—"
|
|
270
431
|
wall_triplet = f"{fmt_wall(v)}/{fmt_wall(s)}/{fmt_wall(b)}"
|
|
271
432
|
lines.append(
|
|
272
433
|
f"| {r['fixture']} | {r['category']} | {fmt_score(v)} | {fmt_score(s)} | {fmt_score(b)} | "
|
|
273
434
|
f"{fmt_margin(margins.get('variant_over_bare'))} | {fmt_margin(margins.get('solo_over_bare'))} | "
|
|
274
|
-
f"{fmt_margin(margins.get('variant_over_solo'))} | {r.get('winner') or '—'} |
|
|
435
|
+
f"{fmt_margin(margins.get('variant_over_solo'))} | {r.get('winner') or '—'} | "
|
|
436
|
+
f"{wall_triplet} | {l2_l1_wall} | {l2_l0_wall} |"
|
|
275
437
|
)
|
|
276
438
|
def fmt_avg(v): return f"{v:.1f}" if isinstance(v, (int, float)) else "n/a"
|
|
277
439
|
def fmt_signed(v): return f"{v:+.1f}" if isinstance(v, (int, float)) else "n/a"
|
|
@@ -289,23 +451,23 @@ def main() -> int:
|
|
|
289
451
|
lines += [
|
|
290
452
|
f"**Suite average bare (L0) score:** {fmt_avg(summary['bare_avg'])}",
|
|
291
453
|
"",
|
|
292
|
-
f"**L2 vs L0 margin avg:** {margin_avg_str} (ship floor: +5, NORTH-STAR preferred: +8)",
|
|
454
|
+
f"**variant (L2) vs bare (L0) margin avg:** {margin_avg_str} (ship floor: +5, NORTH-STAR preferred: +8)",
|
|
293
455
|
]
|
|
294
456
|
if summary.get("arms_present", {}).get("solo_claude"):
|
|
295
457
|
ms = summary.get("margins_avg", {}) or {}
|
|
296
458
|
ws = summary.get("wall_ratio_avg_by_pair", {}) or {}
|
|
297
459
|
lines += [
|
|
298
|
-
f"**L1 vs L0 margin avg:** {fmt_signed(ms.get('solo_over_bare'))} (NORTH-STAR L1 contract: ≥+5)",
|
|
299
|
-
f"**L2 vs L1 margin avg:** {fmt_signed(ms.get('variant_over_solo'))} (NORTH-STAR L2 contract: ≥+5 on pair-eligible)",
|
|
300
|
-
f"**Wall ratio L2/L0:** {fmt_ratio(ws.get('variant_over_bare'))}",
|
|
301
|
-
f"**Wall ratio L1/L0:** {fmt_ratio(ws.get('solo_over_bare'))}",
|
|
302
|
-
f"**Wall ratio L2/L1:** {fmt_ratio(ws.get('variant_over_solo'))}",
|
|
460
|
+
f"**solo_claude (L1) vs bare (L0) margin avg:** {fmt_signed(ms.get('solo_over_bare'))} (NORTH-STAR L1 contract: ≥+5)",
|
|
461
|
+
f"**variant (L2) vs solo_claude (L1) margin avg:** {fmt_signed(ms.get('variant_over_solo'))} (NORTH-STAR L2 contract: ≥+5 on pair-eligible)",
|
|
462
|
+
f"**Wall ratio variant (L2) / bare (L0):** {fmt_ratio(ws.get('variant_over_bare'))}",
|
|
463
|
+
f"**Wall ratio solo_claude (L1) / bare (L0):** {fmt_ratio(ws.get('solo_over_bare'))}",
|
|
464
|
+
f"**Wall ratio variant (L2) / solo_claude (L1):** {fmt_ratio(ws.get('variant_over_solo'))}",
|
|
303
465
|
]
|
|
304
466
|
else:
|
|
305
|
-
lines.append(f"**Wall ratio variant/bare (mean
|
|
467
|
+
lines.append(f"**Wall ratio variant (L2) / bare (L0) mean:** {wall_ratio_str} (no solo_claude arm in this run)")
|
|
306
468
|
lines += [
|
|
307
469
|
f"**Hard-floor violations:** {summary['hard_floor_violations']}",
|
|
308
|
-
f"**Fixtures with margin ≥ +5:** {summary['margin_ge_5_count']} / {summary['gated_fixtures']} (gate: ≥ 7
|
|
470
|
+
f"**Fixtures with margin ≥ +5:** {summary['margin_ge_5_count']} / {summary['gated_fixtures']} (gate: ≥ 7)",
|
|
309
471
|
]
|
|
310
472
|
# Critical findings digest — per-arm sections.
|
|
311
473
|
def has_findings(arm):
|
|
@@ -315,7 +477,7 @@ def main() -> int:
|
|
|
315
477
|
lines += ["", "## Critical Findings", ""]
|
|
316
478
|
for r in cf_rows:
|
|
317
479
|
lines.append(f"### {r['fixture']}")
|
|
318
|
-
for arm_label, arm_key in [("
|
|
480
|
+
for arm_label, arm_key in [("variant (L2)", "variant"), ("solo_claude (L1)", "solo_claude"), ("bare (L0)", "bare")]:
|
|
319
481
|
arm = (r.get("arms") or {}).get(arm_key) or {}
|
|
320
482
|
if has_findings(arm):
|
|
321
483
|
lines.append(f"**{arm_label}:**")
|
|
@@ -11,6 +11,8 @@ import urllib.request
|
|
|
11
11
|
from pathlib import Path
|
|
12
12
|
from typing import Any
|
|
13
13
|
|
|
14
|
+
from pair_evidence_contract import reject_json_constant
|
|
15
|
+
|
|
14
16
|
|
|
15
17
|
DATASETS = {
|
|
16
18
|
"lite": "princeton-nlp/SWE-bench_Lite",
|
|
@@ -31,7 +33,17 @@ def fetch_rows(dataset: str, split: str, offset: int, length: int) -> dict[str,
|
|
|
31
33
|
)
|
|
32
34
|
url = f"https://datasets-server.huggingface.co/rows?{params}"
|
|
33
35
|
with urllib.request.urlopen(url, timeout=60) as response:
|
|
34
|
-
return json.load(response)
|
|
36
|
+
return json.load(response, parse_constant=reject_json_constant)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def positive_int(value: str) -> int:
|
|
40
|
+
try:
|
|
41
|
+
parsed = int(value)
|
|
42
|
+
except ValueError as exc:
|
|
43
|
+
raise argparse.ArgumentTypeError("must be an integer") from exc
|
|
44
|
+
if parsed <= 0:
|
|
45
|
+
raise argparse.ArgumentTypeError("must be > 0")
|
|
46
|
+
return parsed
|
|
35
47
|
|
|
36
48
|
|
|
37
49
|
def main() -> int:
|
|
@@ -39,7 +51,7 @@ def main() -> int:
|
|
|
39
51
|
parser.add_argument("--dataset", choices=sorted(DATASETS), default="lite")
|
|
40
52
|
parser.add_argument("--dataset-id", help="Override the Hugging Face dataset id.")
|
|
41
53
|
parser.add_argument("--split", default="test")
|
|
42
|
-
parser.add_argument("--limit", type=
|
|
54
|
+
parser.add_argument("--limit", type=positive_int, help="Fetch at most N rows.")
|
|
43
55
|
parser.add_argument("--page-size", type=int, default=100)
|
|
44
56
|
parser.add_argument("--instance-id", action="append", help="Keep only these instance ids.")
|
|
45
57
|
parser.add_argument("--out", required=True, type=Path)
|
|
@@ -63,13 +75,19 @@ def main() -> int:
|
|
|
63
75
|
if total is None:
|
|
64
76
|
total = int(page.get("num_rows_total") or 0)
|
|
65
77
|
page_rows = page.get("rows") or []
|
|
78
|
+
if not isinstance(page_rows, list):
|
|
79
|
+
raise ValueError("fetched page malformed: rows must be a list")
|
|
66
80
|
if not page_rows:
|
|
67
81
|
break
|
|
68
|
-
for wrapper in page_rows:
|
|
82
|
+
for row_index, wrapper in enumerate(page_rows, start=1):
|
|
83
|
+
if not isinstance(wrapper, dict):
|
|
84
|
+
raise ValueError(f"malformed fetched row {row_index}: wrapper must be object")
|
|
69
85
|
row = wrapper.get("row")
|
|
70
86
|
if not isinstance(row, dict):
|
|
71
|
-
|
|
87
|
+
raise ValueError(f"malformed fetched row {row_index}: row must be object")
|
|
72
88
|
instance_id = row.get("instance_id")
|
|
89
|
+
if not isinstance(instance_id, str) or not instance_id:
|
|
90
|
+
raise ValueError(f"malformed fetched row {row_index}: instance_id must be a non-empty string")
|
|
73
91
|
if keep and instance_id not in keep:
|
|
74
92
|
continue
|
|
75
93
|
rows.append(row)
|