@jinn-network/client 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +33 -0
- package/deployments/deployment-jinn-mvi-l1-sepolia-fast.json +23 -4
- package/deployments/deployment-jinn-mvi-l1-sepolia.json +23 -4
- package/deployments/deployment-jinn-mvi-l2-baseSepolia.json +5 -4
- package/dist/adapters/mech/adapter.d.ts +38 -1
- package/dist/adapters/mech/adapter.js +241 -54
- package/dist/adapters/mech/adapter.js.map +1 -1
- package/dist/adapters/mech/contracts.d.ts +17 -4
- package/dist/adapters/mech/contracts.js +8 -2
- package/dist/adapters/mech/contracts.js.map +1 -1
- package/dist/adapters/mech/safe-revert.d.ts +20 -0
- package/dist/adapters/mech/safe-revert.js +12 -4
- package/dist/adapters/mech/safe-revert.js.map +1 -1
- package/dist/adapters/mech/safe.d.ts +5 -1
- package/dist/adapters/mech/safe.js +27 -8
- package/dist/adapters/mech/safe.js.map +1 -1
- package/dist/adapters/mech/verdict-code.d.ts +1 -0
- package/dist/adapters/mech/verdict-code.js +18 -0
- package/dist/adapters/mech/verdict-code.js.map +1 -1
- package/dist/api/admin-endpoint.d.ts +15 -3
- package/dist/api/admin-endpoint.js +24 -2
- package/dist/api/admin-endpoint.js.map +1 -1
- package/dist/api/bootstrap-endpoint.js +49 -0
- package/dist/api/bootstrap-endpoint.js.map +1 -1
- package/dist/api/codex-doctor-endpoint.d.ts +73 -0
- package/dist/api/codex-doctor-endpoint.js +177 -0
- package/dist/api/codex-doctor-endpoint.js.map +1 -0
- package/dist/api/discovery-endpoint.d.ts +1 -0
- package/dist/api/discovery-endpoint.js +26 -0
- package/dist/api/discovery-endpoint.js.map +1 -1
- package/dist/api/fleet-build.d.ts +1 -0
- package/dist/api/fleet-build.js +2 -1
- package/dist/api/fleet-build.js.map +1 -1
- package/dist/api/gather-status.d.ts +11 -0
- package/dist/api/gather-status.js +400 -4
- package/dist/api/gather-status.js.map +1 -1
- package/dist/api/hermes-doctor-endpoint.d.ts +117 -0
- package/dist/api/hermes-doctor-endpoint.js +229 -23
- package/dist/api/hermes-doctor-endpoint.js.map +1 -1
- package/dist/api/launcher-status.d.ts +21 -16
- package/dist/api/launcher-status.js +2 -1
- package/dist/api/launcher-status.js.map +1 -1
- package/dist/api/portfolio-v0-build.d.ts +10 -0
- package/dist/api/portfolio-v0-build.js +24 -5
- package/dist/api/portfolio-v0-build.js.map +1 -1
- package/dist/api/prediction-v1-build.d.ts +10 -0
- package/dist/api/prediction-v1-build.js +7 -1
- package/dist/api/prediction-v1-build.js.map +1 -1
- package/dist/api/server.d.ts +31 -1
- package/dist/api/server.js +68 -1
- package/dist/api/server.js.map +1 -1
- package/dist/api/setup-endpoints.d.ts +16 -0
- package/dist/api/setup-endpoints.js +78 -4
- package/dist/api/setup-endpoints.js.map +1 -1
- package/dist/api/setup-retry-endpoint.d.ts +19 -0
- package/dist/api/setup-retry-endpoint.js +32 -0
- package/dist/api/setup-retry-endpoint.js.map +1 -0
- package/dist/api/solvernets-endpoints.d.ts +8 -0
- package/dist/api/solvernets-endpoints.js +71 -43
- package/dist/api/solvernets-endpoints.js.map +1 -1
- package/dist/api/status-build.d.ts +72 -0
- package/dist/api/status-build.js +73 -18
- package/dist/api/status-build.js.map +1 -1
- package/dist/api/task-run-routing.d.ts +7 -0
- package/dist/api/task-run-routing.js +12 -0
- package/dist/api/task-run-routing.js.map +1 -0
- package/dist/api/task-runs-build.d.ts +21 -0
- package/dist/api/task-runs-build.js +14 -1
- package/dist/api/task-runs-build.js.map +1 -1
- package/dist/build-info.json +4 -4
- package/dist/build-meta.json +1 -1
- package/dist/chain-read-errors.d.ts +10 -0
- package/dist/chain-read-errors.js +15 -0
- package/dist/chain-read-errors.js.map +1 -1
- package/dist/cli/commands/auth.js +1 -1
- package/dist/cli/commands/auth.js.map +1 -1
- package/dist/cli/commands/create.js +3 -2
- package/dist/cli/commands/create.js.map +1 -1
- package/dist/cli/commands/doctor.d.ts +2 -0
- package/dist/cli/commands/doctor.js +2 -0
- package/dist/cli/commands/doctor.js.map +1 -1
- package/dist/cli/commands/rewards.js +11 -7
- package/dist/cli/commands/rewards.js.map +1 -1
- package/dist/cli/commands/solver-nets.js +24 -9
- package/dist/cli/commands/solver-nets.js.map +1 -1
- package/dist/cli/commands/status.js +1 -1
- package/dist/cli/commands/status.js.map +1 -1
- package/dist/cli/commands/tasks.js +86 -9
- package/dist/cli/commands/tasks.js.map +1 -1
- package/dist/cli/commands/update.d.ts +10 -0
- package/dist/cli/commands/update.js +36 -0
- package/dist/cli/commands/update.js.map +1 -1
- package/dist/cli/introspection-context.js +5 -0
- package/dist/cli/introspection-context.js.map +1 -1
- package/dist/cli/task-native-readiness.d.ts +3 -1
- package/dist/cli/task-native-readiness.js +28 -6
- package/dist/cli/task-native-readiness.js.map +1 -1
- package/dist/config.d.ts +106 -5
- package/dist/config.js +97 -18
- package/dist/config.js.map +1 -1
- package/dist/daemon/checkpoint-loop.d.ts +48 -0
- package/dist/daemon/checkpoint-loop.js +76 -0
- package/dist/daemon/checkpoint-loop.js.map +1 -0
- package/dist/daemon/creator.d.ts +1 -1
- package/dist/daemon/creator.js +7 -3
- package/dist/daemon/creator.js.map +1 -1
- package/dist/daemon/daemon.d.ts +19 -0
- package/dist/daemon/daemon.js +68 -1
- package/dist/daemon/daemon.js.map +1 -1
- package/dist/daemon/eviction-loop.d.ts +40 -0
- package/dist/daemon/eviction-loop.js +67 -0
- package/dist/daemon/eviction-loop.js.map +1 -0
- package/dist/daemon/jinn-claim-loop-wiring.d.ts +33 -0
- package/dist/daemon/jinn-claim-loop-wiring.js +40 -0
- package/dist/daemon/jinn-claim-loop-wiring.js.map +1 -0
- package/dist/daemon/jinn-claim-loop.d.ts +24 -17
- package/dist/daemon/jinn-claim-loop.js +77 -23
- package/dist/daemon/jinn-claim-loop.js.map +1 -1
- package/dist/daemon/skip-log-dedup.d.ts +69 -0
- package/dist/daemon/skip-log-dedup.js +106 -0
- package/dist/daemon/skip-log-dedup.js.map +1 -0
- package/dist/dashboard/assets/index-BUlE8F3Y.js +330 -0
- package/dist/dashboard/assets/index-blqc7eqq.css +32 -0
- package/dist/dashboard/index.html +2 -2
- package/dist/discovery/factory.d.ts +17 -5
- package/dist/discovery/factory.js +46 -18
- package/dist/discovery/factory.js.map +1 -1
- package/dist/discovery/http.js +142 -3
- package/dist/discovery/http.js.map +1 -1
- package/dist/discovery/onchain.d.ts +5 -0
- package/dist/discovery/onchain.js +407 -15
- package/dist/discovery/onchain.js.map +1 -1
- package/dist/discovery/types.d.ts +45 -1
- package/dist/discovery/types.js +8 -10
- package/dist/discovery/types.js.map +1 -1
- package/dist/discovery/with-fallback.d.ts +7 -0
- package/dist/discovery/with-fallback.js +10 -0
- package/dist/discovery/with-fallback.js.map +1 -1
- package/dist/earning/bootstrap.d.ts +92 -1
- package/dist/earning/bootstrap.js +203 -63
- package/dist/earning/bootstrap.js.map +1 -1
- package/dist/earning/contracts.d.ts +14 -0
- package/dist/earning/contracts.js +17 -5
- package/dist/earning/contracts.js.map +1 -1
- package/dist/earning/funding-plan.js +27 -18
- package/dist/earning/funding-plan.js.map +1 -1
- package/dist/earning/jinn-rewards.d.ts +46 -0
- package/dist/earning/jinn-rewards.js +32 -0
- package/dist/earning/jinn-rewards.js.map +1 -1
- package/dist/earning/safe-adapter.d.ts +2 -0
- package/dist/earning/safe-adapter.js +26 -12
- package/dist/earning/safe-adapter.js.map +1 -1
- package/dist/earning/store.d.ts +8 -0
- package/dist/earning/store.js.map +1 -1
- package/dist/earning/testnet-setup-migration.d.ts +12 -0
- package/dist/earning/testnet-setup-migration.js +27 -1
- package/dist/earning/testnet-setup-migration.js.map +1 -1
- package/dist/earning/types.d.ts +15 -0
- package/dist/erc8004/reputation.d.ts +8 -0
- package/dist/erc8004/reputation.js +22 -3
- package/dist/erc8004/reputation.js.map +1 -1
- package/dist/harnesses/cost-estimates.d.ts +145 -0
- package/dist/harnesses/cost-estimates.js +297 -0
- package/dist/harnesses/cost-estimates.js.map +1 -0
- package/dist/harnesses/engine/engine.d.ts +72 -0
- package/dist/harnesses/engine/engine.js +105 -8
- package/dist/harnesses/engine/engine.js.map +1 -1
- package/dist/harnesses/engine/persistence.d.ts +51 -1
- package/dist/harnesses/engine/persistence.js +118 -5
- package/dist/harnesses/engine/persistence.js.map +1 -1
- package/dist/harnesses/engine/work-dir-reaper.d.ts +65 -0
- package/dist/harnesses/engine/work-dir-reaper.js +100 -0
- package/dist/harnesses/engine/work-dir-reaper.js.map +1 -0
- package/dist/harnesses/impls/hermes-agent/adapter.js +40 -0
- package/dist/harnesses/impls/hermes-agent/adapter.js.map +1 -1
- package/dist/harnesses/impls/hermes-agent/bootstrap.d.ts +20 -0
- package/dist/harnesses/impls/hermes-agent/bootstrap.js +40 -6
- package/dist/harnesses/impls/hermes-agent/bootstrap.js.map +1 -1
- package/dist/harnesses/impls/hermes-agent/harness.d.ts +59 -1
- package/dist/harnesses/impls/hermes-agent/harness.js +104 -0
- package/dist/harnesses/impls/hermes-agent/harness.js.map +1 -1
- package/dist/harnesses/impls/index.d.ts +7 -0
- package/dist/harnesses/impls/index.js +16 -1
- package/dist/harnesses/impls/index.js.map +1 -1
- package/dist/harnesses/impls/learner/harness.d.ts +38 -4
- package/dist/harnesses/impls/learner/harness.js +96 -2
- package/dist/harnesses/impls/learner/harness.js.map +1 -1
- package/dist/harnesses/impls/learner/plugin-path.d.ts +0 -13
- package/dist/harnesses/impls/learner/plugin-path.js +35 -15
- package/dist/harnesses/impls/learner/plugin-path.js.map +1 -1
- package/dist/harnesses/impls/learner/types.d.ts +11 -0
- package/dist/harnesses/impls/stub.d.ts +58 -0
- package/dist/harnesses/impls/stub.js +89 -0
- package/dist/harnesses/impls/stub.js.map +1 -0
- package/dist/harnesses/impls/swe-rebench-v2-evaluator/eval-runner.d.ts +69 -50
- package/dist/harnesses/impls/swe-rebench-v2-evaluator/eval-runner.js +178 -93
- package/dist/harnesses/impls/swe-rebench-v2-evaluator/eval-runner.js.map +1 -1
- package/dist/harnesses/impls/swe-rebench-v2-evaluator/harness.d.ts +12 -1
- package/dist/harnesses/impls/swe-rebench-v2-evaluator/harness.js +121 -7
- package/dist/harnesses/impls/swe-rebench-v2-evaluator/harness.js.map +1 -1
- package/dist/harnesses/impls/swe-rebench-v2-evaluator/hf-fetcher.d.ts +15 -0
- package/dist/harnesses/impls/swe-rebench-v2-evaluator/hf-fetcher.js +54 -4
- package/dist/harnesses/impls/swe-rebench-v2-evaluator/hf-fetcher.js.map +1 -1
- package/dist/harnesses/impls/swe-rebench-v2-evaluator/index.d.ts +6 -0
- package/dist/harnesses/impls/swe-rebench-v2-evaluator/index.js +1 -1
- package/dist/harnesses/impls/swe-rebench-v2-evaluator/index.js.map +1 -1
- package/dist/harnesses/readiness-registry.js +9 -1
- package/dist/harnesses/readiness-registry.js.map +1 -1
- package/dist/main.js +371 -82
- package/dist/main.js.map +1 -1
- package/dist/observability/emit-event.d.ts +1 -1
- package/dist/observability/emit-event.js.map +1 -1
- package/dist/operator-errors.d.ts +7 -0
- package/dist/operator-errors.js +13 -1
- package/dist/operator-errors.js.map +1 -1
- package/dist/plugins/learner/.claude-plugin/plugin.json +9 -0
- package/dist/plugins/learner/.codex-plugin/plugin.json +39 -0
- package/dist/plugins/learner/AGENTS.md +40 -0
- package/dist/plugins/learner/CLAUDE.md +33 -0
- package/dist/plugins/learner/README.md +59 -0
- package/dist/plugins/learner/hooks/hooks.json +16 -0
- package/dist/plugins/learner/hooks/session-start +38 -0
- package/dist/plugins/learner/skills/learn/SKILL.md +412 -0
- package/dist/plugins/learner/skills/learn/analyst-prompt.md +68 -0
- package/dist/plugins/learner/skills/learn/consolidator-prompt.md +94 -0
- package/dist/plugins/learner/skills/learn/explorer-prompt.md +53 -0
- package/dist/plugins/learner/skills/learn/planner-prompt.md +87 -0
- package/dist/plugins/learner/skills/learn/promoter-prompt.md +113 -0
- package/dist/plugins/learner/skills/learn/step-worker-prompt.md +47 -0
- package/dist/plugins/learner/skills/learn/strategist-prompt.md +85 -0
- package/dist/restart-daemon.d.ts +90 -0
- package/dist/restart-daemon.js +95 -0
- package/dist/restart-daemon.js.map +1 -0
- package/dist/setup/halt-mode.d.ts +14 -0
- package/dist/setup/halt-mode.js +17 -0
- package/dist/setup/halt-mode.js.map +1 -0
- package/dist/solver-nets/prediction-operator-ux.js +43 -3
- package/dist/solver-nets/prediction-operator-ux.js.map +1 -1
- package/dist/solver-nets/registry.d.ts +1 -0
- package/dist/solver-nets/registry.js +1 -1
- package/dist/solver-nets/registry.js.map +1 -1
- package/dist/solver-types/_swe-rebench-v2-pool-cache.d.ts +58 -0
- package/dist/solver-types/_swe-rebench-v2-pool-cache.js +87 -0
- package/dist/solver-types/_swe-rebench-v2-pool-cache.js.map +1 -0
- package/dist/solver-types/_swe-rebench-v2-substrate.d.ts +1 -0
- package/dist/solver-types/_swe-rebench-v2-substrate.js +10 -0
- package/dist/solver-types/_swe-rebench-v2-substrate.js.map +1 -1
- package/dist/solver-types/_swe-rebench-v2-validated-pool.d.ts +65 -0
- package/dist/solver-types/_swe-rebench-v2-validated-pool.js +243 -26
- package/dist/solver-types/_swe-rebench-v2-validated-pool.js.map +1 -1
- package/dist/solver-types/swe-rebench-v2-auto.d.ts +22 -7
- package/dist/solver-types/swe-rebench-v2-auto.js +45 -20
- package/dist/solver-types/swe-rebench-v2-auto.js.map +1 -1
- package/dist/solver-types/swe-rebench-v2.d.ts +13 -2
- package/dist/solver-types/swe-rebench-v2.js +233 -94
- package/dist/solver-types/swe-rebench-v2.js.map +1 -1
- package/dist/solvernets/daemon-init.d.ts +10 -2
- package/dist/solvernets/daemon-init.js +22 -2
- package/dist/solvernets/daemon-init.js.map +1 -1
- package/dist/solvernets/launched-record-dispatcher.js +35 -7
- package/dist/solvernets/launched-record-dispatcher.js.map +1 -1
- package/dist/solvernets/store.d.ts +5 -0
- package/dist/solvernets/store.js +1 -0
- package/dist/solvernets/store.js.map +1 -1
- package/dist/store/store.d.ts +15 -0
- package/dist/store/store.js +118 -3
- package/dist/store/store.js.map +1 -1
- package/dist/tasks/sources.d.ts +18 -1
- package/dist/tasks/sources.js +33 -5
- package/dist/tasks/sources.js.map +1 -1
- package/dist/tx-retry.d.ts +151 -19
- package/dist/tx-retry.js +286 -32
- package/dist/tx-retry.js.map +1 -1
- package/dist/types/payloads/prediction-apy-v0.d.ts +5 -5
- package/dist/types/payloads/prediction-v0.d.ts +5 -5
- package/dist/types/task-document.d.ts +392 -0
- package/dist/types/task-document.js +10 -0
- package/dist/types/task-document.js.map +1 -1
- package/dist/types/task.d.ts +28 -0
- package/dist/util/extract-tx-hash.d.ts +14 -0
- package/dist/util/extract-tx-hash.js +19 -0
- package/dist/util/extract-tx-hash.js.map +1 -0
- package/dist/vendor/@jinn-network/sdk/dist/contracts.js +1 -1
- package/dist/vendor/@jinn-network/sdk/dist/solvernets/manifest-schema.d.ts +3 -0
- package/dist/vendor/@jinn-network/sdk/dist/solvernets/manifest-schema.js +1 -0
- package/package.json +29 -12
- package/dist/dashboard/assets/index-DOlzFN8a.css +0 -32
- package/dist/dashboard/assets/index-NkZ7CTAT.js +0 -140
|
@@ -121,4 +121,15 @@ export interface LearnerHarnessConfig {
|
|
|
121
121
|
* Defaults to 'bare'.
|
|
122
122
|
*/
|
|
123
123
|
runtimeMode?: 'bare' | 'container' | 'docker-compose';
|
|
124
|
+
/**
|
|
125
|
+
* Path to the `codex` executable. Used by `isReady()` when this
|
|
126
|
+
* `LearnerHarness` is the Codex variant (`name === CODEX_HARNESS`) — it is
|
|
127
|
+
* passed to `probeCodexDoctor()`. Defaults to 'codex' (from PATH).
|
|
128
|
+
*/
|
|
129
|
+
codexPath?: string;
|
|
130
|
+
/**
|
|
131
|
+
* Timeout (ms) for the `codex --version` probe in the Codex variant's
|
|
132
|
+
* `isReady()`. Defaults to 30s.
|
|
133
|
+
*/
|
|
134
|
+
codexDoctorTimeoutMs?: number;
|
|
124
135
|
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Env-gated stub harness for the T2.2 producer/evaluator gate.
|
|
3
|
+
*
|
|
4
|
+
* When JINN_HARNESS_STUB_INSTANCE is set, the canned patch at
|
|
5
|
+
* <fixturesDir>/<instanceMatcher>.patch is returned as a SWE-rebench v2
|
|
6
|
+
* restoration solution. Never calls an LLM; never accepts tasks whose
|
|
7
|
+
* spec.instance_id differs from the configured matcher.
|
|
8
|
+
*
|
|
9
|
+
* PRODUCTION SAFETY — two-env-var requirement.
|
|
10
|
+
* This is a *fake* harness: it produces canned, non-genuine work. If it ever
|
|
11
|
+
* entered a real operator run it would generate fraudulent on-chain activity.
|
|
12
|
+
* To make accidental activation impossible, the factory requires BOTH:
|
|
13
|
+
* JINN_HARNESS_STUB_INSTANCE — instance ID this stub responds to
|
|
14
|
+
* JINN_TEST_MODE === '1' — explicit test-mode sentinel
|
|
15
|
+
* If JINN_HARNESS_STUB_INSTANCE is set but JINN_TEST_MODE is not '1', the
|
|
16
|
+
* factory THROWS rather than silently registering the stub. A single stray
|
|
17
|
+
* exported env var in an operator's shell can no longer activate it.
|
|
18
|
+
*
|
|
19
|
+
* Activated by environment variables:
|
|
20
|
+
* JINN_HARNESS_STUB_INSTANCE — instance ID this stub responds to (required to activate)
|
|
21
|
+
* JINN_TEST_MODE — must equal '1' (defense-in-depth; required to activate)
|
|
22
|
+
* JINN_HARNESS_STUB_FIXTURES_DIR — dir containing <instanceMatcher>.patch files
|
|
23
|
+
(default: test/release/tier-2/fixtures, resolved against process.cwd())
|
|
24
|
+
*/
|
|
25
|
+
import type { Harness, HarnessContext, ReadyStatus, Solution } from '../types.js';
|
|
26
|
+
export interface StubHarnessConfig {
|
|
27
|
+
/** Directory containing <instanceMatcher>.patch files. */
|
|
28
|
+
fixturesDir: string;
|
|
29
|
+
/** The instance ID this stub will accept. Tasks with other instance IDs are rejected. */
|
|
30
|
+
instanceMatcher: string;
|
|
31
|
+
}
|
|
32
|
+
/**
|
|
33
|
+
* A zero-LLM Harness that returns a canned patch for a specific SWE-rebench v2
|
|
34
|
+
* instance. Intended exclusively for T2.2 release-gate automation.
|
|
35
|
+
*/
|
|
36
|
+
export declare class StubHarness implements Harness {
|
|
37
|
+
readonly name = "harness:stub";
|
|
38
|
+
readonly version = "0.1.0-stub";
|
|
39
|
+
private readonly fixturesDir;
|
|
40
|
+
private readonly instanceMatcher;
|
|
41
|
+
constructor(config: StubHarnessConfig);
|
|
42
|
+
supports(ctx: {
|
|
43
|
+
solverType: string;
|
|
44
|
+
role?: 'restoration' | 'evaluation';
|
|
45
|
+
}): boolean;
|
|
46
|
+
isReady(): Promise<ReadyStatus>;
|
|
47
|
+
run(ctx: HarnessContext): Promise<Solution>;
|
|
48
|
+
}
|
|
49
|
+
/**
|
|
50
|
+
* Factory that reads JINN_HARNESS_STUB_INSTANCE and JINN_HARNESS_STUB_FIXTURES_DIR
|
|
51
|
+
* from the environment and returns a configured StubHarness, or null if the env
|
|
52
|
+
* var is absent (allowing the registry to skip registration silently).
|
|
53
|
+
*
|
|
54
|
+
* Defense-in-depth: if JINN_HARNESS_STUB_INSTANCE is set but JINN_TEST_MODE is
|
|
55
|
+
* not exactly '1', this THROWS rather than returning a harness — a real
|
|
56
|
+
* operator run must never silently pick up the fake stub harness.
|
|
57
|
+
*/
|
|
58
|
+
export declare function maybeCreateStubHarnessFromEnv(): StubHarness | null;
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Env-gated stub harness for the T2.2 producer/evaluator gate.
|
|
3
|
+
*
|
|
4
|
+
* When JINN_HARNESS_STUB_INSTANCE is set, the canned patch at
|
|
5
|
+
* <fixturesDir>/<instanceMatcher>.patch is returned as a SWE-rebench v2
|
|
6
|
+
* restoration solution. Never calls an LLM; never accepts tasks whose
|
|
7
|
+
* spec.instance_id differs from the configured matcher.
|
|
8
|
+
*
|
|
9
|
+
* PRODUCTION SAFETY — two-env-var requirement.
|
|
10
|
+
* This is a *fake* harness: it produces canned, non-genuine work. If it ever
|
|
11
|
+
* entered a real operator run it would generate fraudulent on-chain activity.
|
|
12
|
+
* To make accidental activation impossible, the factory requires BOTH:
|
|
13
|
+
* JINN_HARNESS_STUB_INSTANCE — instance ID this stub responds to
|
|
14
|
+
* JINN_TEST_MODE === '1' — explicit test-mode sentinel
|
|
15
|
+
* If JINN_HARNESS_STUB_INSTANCE is set but JINN_TEST_MODE is not '1', the
|
|
16
|
+
* factory THROWS rather than silently registering the stub. A single stray
|
|
17
|
+
* exported env var in an operator's shell can no longer activate it.
|
|
18
|
+
*
|
|
19
|
+
* Activated by environment variables:
|
|
20
|
+
* JINN_HARNESS_STUB_INSTANCE — instance ID this stub responds to (required to activate)
|
|
21
|
+
* JINN_TEST_MODE — must equal '1' (defense-in-depth; required to activate)
|
|
22
|
+
* JINN_HARNESS_STUB_FIXTURES_DIR — dir containing <instanceMatcher>.patch files
|
|
23
|
+
(default: test/release/tier-2/fixtures, resolved against process.cwd())
|
|
24
|
+
*/
|
|
25
|
+
import * as fs from 'node:fs/promises';
|
|
26
|
+
import * as path from 'node:path';
|
|
27
|
+
/**
 * A zero-LLM Harness that returns a canned patch for a specific SWE-rebench v2
 * instance. Intended exclusively for T2.2 release-gate automation.
 *
 * The patch is read from `<fixturesDir>/<instanceMatcher>.patch` on every
 * `run()` call; nothing is cached between calls.
 */
export class StubHarness {
    name = 'harness:stub';
    version = '0.1.0-stub';
    fixturesDir;
    instanceMatcher;
    /**
     * @param {{ fixturesDir: string, instanceMatcher: string }} config
     *   `fixturesDir` is the directory holding `<instanceMatcher>.patch`;
     *   `instanceMatcher` is the only instance ID this stub will accept.
     */
    constructor(config) {
        this.fixturesDir = config.fixturesDir;
        this.instanceMatcher = config.instanceMatcher;
    }
    /**
     * Whether this stub can serve the given solver type / role.
     *
     * @param {{ solverType: string, role?: 'restoration' | 'evaluation' }} ctx
     * @returns {boolean} `true` only for solver type `swe-rebench-v2.v1` when the
     *   role is not `evaluation`.
     */
    supports(ctx) {
        if (ctx.role === 'evaluation')
            return false;
        return ctx.solverType === 'swe-rebench-v2.v1';
    }
    /**
     * Readiness probe. The stub has no external dependencies to check, so it
     * always reports ready.
     *
     * @returns {Promise<{ ready: true }>}
     */
    async isReady() {
        return { ready: true };
    }
    /**
     * Return the canned patch for the configured instance.
     *
     * @param {{ task: { spec?: Record<string, unknown> } }} ctx
     * @returns {Promise<object>} A solution whose payload carries the fixture's
     *   contents as `patch`.
     * @throws {Error} If `task.spec.instance_id` differs from the configured
     *   matcher, if the fixture path would resolve outside `fixturesDir`, or if
     *   the fixture file cannot be read (the underlying error is attached as
     *   `cause` and its `code`, e.g. `ENOENT`, is preserved).
     */
    async run(ctx) {
        const taskInstanceId = ctx.task.spec?.['instance_id'];
        if (taskInstanceId !== this.instanceMatcher) {
            throw new Error(`stub harness: task.spec.instance_id=${String(taskInstanceId)} does not match configured instanceMatcher=${this.instanceMatcher}`);
        }
        const patchPath = path.join(this.fixturesDir, `${this.instanceMatcher}.patch`);
        // The fixture must live inside fixturesDir: a matcher containing `..`
        // segments must not be able to read arbitrary files from the host.
        const relativeToFixtures = path.relative(path.resolve(this.fixturesDir), path.resolve(patchPath));
        const escapesFixturesDir = relativeToFixtures === '..' ||
            relativeToFixtures.startsWith(`..${path.sep}`) ||
            path.isAbsolute(relativeToFixtures);
        if (escapesFixturesDir) {
            throw new Error(`stub harness: fixture path ${patchPath} escapes fixturesDir=${this.fixturesDir}`);
        }
        let patch;
        try {
            patch = await fs.readFile(patchPath, 'utf-8');
        }
        catch (err) {
            // Re-throw with stub-harness context so a missing fixture is not
            // mistaken for a failure elsewhere in the pipeline.
            const wrapped = new Error(`stub harness: could not read fixture patch at ${patchPath}: ${err?.message ?? String(err)}`, { cause: err });
            if (err?.code !== undefined) {
                wrapped.code = err.code;
            }
            throw wrapped;
        }
        return {
            venueRef: { name: this.name },
            gating: {},
            solutionPayload: {
                schemaVersion: 'swe-rebench-v2-solution.v1',
                patch,
            },
        };
    }
}
|
|
65
|
+
/**
 * Factory that builds a StubHarness from the environment.
 *
 * Reads:
 *   JINN_HARNESS_STUB_INSTANCE     — instance ID the stub responds to. When it
 *                                    is unset or empty the factory returns
 *                                    null, letting the registry skip
 *                                    registration silently.
 *   JINN_TEST_MODE                 — must be exactly '1' whenever
 *                                    JINN_HARNESS_STUB_INSTANCE is set.
 *   JINN_HARNESS_STUB_FIXTURES_DIR — directory holding <instance>.patch files.
 *                                    Unset or empty falls back to
 *                                    test/release/tier-2/fixtures resolved
 *                                    against process.cwd().
 *
 * Defense-in-depth: if JINN_HARNESS_STUB_INSTANCE is set but JINN_TEST_MODE is
 * not exactly '1', this THROWS rather than returning a harness — a real
 * operator run must never silently pick up the fake stub harness.
 *
 * @returns {StubHarness | null} A configured stub, or null when not activated.
 * @throws {Error} When the stub is requested without the test-mode sentinel.
 */
export function maybeCreateStubHarnessFromEnv() {
    const instanceMatcher = process.env['JINN_HARNESS_STUB_INSTANCE'];
    if (!instanceMatcher)
        return null;
    if (process.env['JINN_TEST_MODE'] !== '1') {
        throw new Error('stub harness must never activate in a real operator run: ' +
            'JINN_HARNESS_STUB_INSTANCE is set but JINN_TEST_MODE is not "1". ' +
            'The stub harness produces canned, non-genuine work and would generate ' +
            'fraudulent on-chain activity. Set JINN_TEST_MODE=1 if this is a Tier 2 ' +
            'test; otherwise unset JINN_HARNESS_STUB_INSTANCE.');
    }
    // An empty value (e.g. `export JINN_HARNESS_STUB_FIXTURES_DIR=`) is treated
    // as "not configured", mirroring how an empty JINN_HARNESS_STUB_INSTANCE is
    // handled above; `??` would have kept '' and read fixtures from the cwd.
    const configuredFixturesDir = process.env['JINN_HARNESS_STUB_FIXTURES_DIR'];
    const fixturesDir = configuredFixturesDir
        ? configuredFixturesDir
        : path.resolve(process.cwd(), 'test', 'release', 'tier-2', 'fixtures');
    return new StubHarness({ instanceMatcher, fixturesDir });
}
|
|
89
|
+
//# sourceMappingURL=stub.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"stub.js","sourceRoot":"","sources":["../../../src/harnesses/impls/stub.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAEH,OAAO,KAAK,EAAE,MAAM,kBAAkB,CAAC;AACvC,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAUlC;;;GAGG;AACH,MAAM,OAAO,WAAW;IACb,IAAI,GAAG,cAAc,CAAC;IACtB,OAAO,GAAG,YAAY,CAAC;IAEf,WAAW,CAAS;IACpB,eAAe,CAAS;IAEzC,YAAY,MAAyB;QACnC,IAAI,CAAC,WAAW,GAAG,MAAM,CAAC,WAAW,CAAC;QACtC,IAAI,CAAC,eAAe,GAAG,MAAM,CAAC,eAAe,CAAC;IAChD,CAAC;IAED,QAAQ,CAAC,GAAgE;QACvE,IAAI,GAAG,CAAC,IAAI,KAAK,YAAY;YAAE,OAAO,KAAK,CAAC;QAC5C,OAAO,GAAG,CAAC,UAAU,KAAK,mBAAmB,CAAC;IAChD,CAAC;IAED,KAAK,CAAC,OAAO;QACX,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC;IACzB,CAAC;IAED,KAAK,CAAC,GAAG,CAAC,GAAmB;QAC3B,MAAM,cAAc,GAAI,GAAG,CAAC,IAAI,CAAC,IAA4C,EAAE,CAAC,aAAa,CAAC,CAAC;QAC/F,IAAI,cAAc,KAAK,IAAI,CAAC,eAAe,EAAE,CAAC;YAC5C,MAAM,IAAI,KAAK,CACb,uCAAuC,MAAM,CAAC,cAAc,CAAC,8CAA8C,IAAI,CAAC,eAAe,EAAE,CAClI,CAAC;QACJ,CAAC;QACD,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,GAAG,IAAI,CAAC,eAAe,QAAQ,CAAC,CAAC;QAC/E,MAAM,KAAK,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC;QACpD,OAAO;YACL,QAAQ,EAAE,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE;YAC7B,MAAM,EAAE,EAAE;YACV,eAAe,EAAE;gBACf,aAAa,EAAE,4BAA4B;gBAC3C,KAAK;aACN;SACF,CAAC;IACJ,CAAC;CACF;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,6BAA6B;IAC3C,MAAM,eAAe,GAAG,OAAO,CAAC,GAAG,CAAC,4BAA4B,CAAC,CAAC;IAClE,IAAI,CAAC,eAAe;QAAE,OAAO,IAAI,CAAC;IAClC,IAAI,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC,KAAK,GAAG,EAAE,CAAC;QAC1C,MAAM,IAAI,KAAK,CACb,2DAA2D;YACzD,mEAAmE;YACnE,wEAAwE;YACxE,yEAAyE;YACzE,mDAAmD,CACtD,CAAC;IACJ,CAAC;IACD,MAAM,WAAW,GACf,OAAO,CAAC,GAAG,CAAC,gCAAgC,CAAC;QAC7C,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,UAAU,CAAC,CAAC;IACvE,OAAO,IAAI,WAAW,CAAC,EAAE,eAAe,EAAE,WAAW,EAAE,CAAC,CAAC;AAC3D,CAAC"}
|
|
@@ -37,6 +37,36 @@ export declare class EvalCouldNotGradeError extends Error {
|
|
|
37
37
|
readonly logExcerpt: string;
|
|
38
38
|
constructor(reason: string, logExcerpt?: string);
|
|
39
39
|
}
|
|
40
|
+
/**
|
|
41
|
+
* Thrown by `runEval` when the disk cannot be brought above the eval
|
|
42
|
+
* disk-floor even after a broad prune. A clean abort — the caller stops
|
|
43
|
+
* gracefully; no instance is graded, nothing is marked. Distinct from
|
|
44
|
+
* `EvalCouldNotGradeError`: this is operator-environment, retryable, and must
|
|
45
|
+
* never be turned into a `scorable: false` admission (#476).
|
|
46
|
+
*/
|
|
47
|
+
export declare class InsufficientDiskError extends Error {
|
|
48
|
+
readonly freeBytes: number;
|
|
49
|
+
readonly floorBytes: number;
|
|
50
|
+
constructor(freeBytes: number, floorBytes: number);
|
|
51
|
+
}
|
|
52
|
+
/**
|
|
53
|
+
* Default free-disk floor required before an eval round: 20 GB. A single
|
|
54
|
+
* SWE-rebench eval image was observed to peak transiently at ~12.6 GB, so the
|
|
55
|
+
* floor clears the worst observed instance with real margin. Override with
|
|
56
|
+
* `JINN_EVAL_DISK_FLOOR_GB` on constrained hosts.
|
|
57
|
+
*/
|
|
58
|
+
export declare const DEFAULT_EVAL_DISK_FLOOR_BYTES = 20000000000;
|
|
59
|
+
/** Resolve the disk floor: explicit option > `JINN_EVAL_DISK_FLOOR_GB` env > default. */
|
|
60
|
+
export declare function resolveDiskFloorBytes(opt: number | undefined): number;
|
|
61
|
+
/**
|
|
62
|
+
* Default wall-clock limit for one upstream eval.py invocation: 2 hours. Some
|
|
63
|
+
* linux/amd64 SWE-rebench images can wedge indefinitely under Apple Silicon
|
|
64
|
+
* emulation after a native crash, so the subprocess gets a hard guardrail.
|
|
65
|
+
* Override with `JINN_SWE_REBENCH_EVAL_TIMEOUT_MS`; set `0` to disable.
|
|
66
|
+
*/
|
|
67
|
+
export declare const DEFAULT_EVAL_TIMEOUT_MS: number;
|
|
68
|
+
/** Resolve the eval timeout: explicit option > env > default. */
|
|
69
|
+
export declare function resolveEvalTimeoutMs(opt: number | undefined): number;
|
|
40
70
|
export interface PythonEvalRunnerOptions {
|
|
41
71
|
/** Path to the cloned SWE-rebench-V2 repo (cached locally). */
|
|
42
72
|
upstreamRepoDir: string;
|
|
@@ -45,66 +75,55 @@ export interface PythonEvalRunnerOptions {
|
|
|
45
75
|
/** Workers for parallel eval (defaults to 1; we run one task at a time). */
|
|
46
76
|
maxWorkers?: number;
|
|
47
77
|
/**
|
|
48
|
-
*
|
|
49
|
-
*
|
|
50
|
-
*
|
|
51
|
-
*
|
|
52
|
-
*
|
|
53
|
-
* The leaderboard pool has hundreds of unique instances at ~3 GB/image, so
|
|
54
|
-
* an unbounded cache fills operator disks in days (jinn-mono-uy6v.11).
|
|
78
|
+
* Removes a completed round's entire Docker footprint — the round's image,
|
|
79
|
+
* stopped containers, and build cache — so eval disk usage never
|
|
80
|
+
* accumulates across instances (#476). Called once per `runEval`, in a
|
|
81
|
+
* `finally`, even when the eval threw.
|
|
55
82
|
*
|
|
56
|
-
*
|
|
57
|
-
* `
|
|
83
|
+
* Defaults to {@link defaultPruneRound}. Implementations MUST NOT throw —
|
|
84
|
+
* `runEval` guards defensively, but cleanup failures should be swallowed
|
|
85
|
+
* (logged elsewhere if desired) so a flaky `docker` never escapes `runEval`.
|
|
58
86
|
*/
|
|
59
|
-
|
|
87
|
+
pruneRound?: (image: string) => Promise<void>;
|
|
60
88
|
/**
|
|
61
|
-
*
|
|
62
|
-
*
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
*
|
|
67
|
-
*
|
|
68
|
-
* elsewhere if desired) so a missing/failed `docker rmi` never escapes
|
|
69
|
-
* `runEval`. The runner enforces this defensively too.
|
|
89
|
+
* Resolves the eval image digest while the image is still local, before
|
|
90
|
+
* per-round pruning removes it. Defaults to `docker image inspect`.
|
|
91
|
+
*/
|
|
92
|
+
resolveImageDigest?: (image: string) => Promise<string | null>;
|
|
93
|
+
/**
|
|
94
|
+
* Required free disk (bytes) before an eval round starts. Explicit value >
|
|
95
|
+
* `JINN_EVAL_DISK_FLOOR_GB` env > {@link DEFAULT_EVAL_DISK_FLOOR_BYTES}.
|
|
70
96
|
*/
|
|
71
|
-
|
|
97
|
+
diskFloorBytes?: number;
|
|
98
|
+
/** Probe of free disk (bytes). Defaults to a `statfs` on the temp dir. */
|
|
99
|
+
freeDiskBytes?: () => Promise<number>;
|
|
100
|
+
/**
|
|
101
|
+
* Broad reclaim invoked when free disk is below the floor. Defaults to
|
|
102
|
+
* `docker system prune -f`. MUST NOT throw.
|
|
103
|
+
*/
|
|
104
|
+
systemPrune?: () => Promise<void>;
|
|
105
|
+
/**
|
|
106
|
+
* Wall-clock timeout (ms) for one upstream eval.py invocation. Explicit value
|
|
107
|
+
* > `JINN_SWE_REBENCH_EVAL_TIMEOUT_MS` env > {@link DEFAULT_EVAL_TIMEOUT_MS}.
|
|
108
|
+
* Set to 0 to disable.
|
|
109
|
+
*/
|
|
110
|
+
evalTimeoutMs?: number;
|
|
72
111
|
}
|
|
73
|
-
/**
|
|
74
|
-
* Default cap on the per-instance Docker image cache when no explicit
|
|
75
|
-
* `imageCacheMax` and no `JINN_EVAL_IMAGE_CACHE_MAX` env var are configured.
|
|
76
|
-
*
|
|
77
|
-
* 20 images × ~3 GB/image ≈ 60 GB working set — small enough that even a
|
|
78
|
-
* 256 GB disk has headroom, large enough that the steady-state loop on a
|
|
79
|
-
* frequently-repeating subset of the pool rarely re-pulls.
|
|
80
|
-
*/
|
|
81
|
-
export declare const DEFAULT_EVAL_IMAGE_CACHE_MAX = 20;
|
|
82
|
-
export declare function resolveImageCacheMax(opt: number | undefined): number;
|
|
83
112
|
export declare function matchInfraSignature(log: string): string | null;
|
|
84
113
|
export declare class PythonEvalRunner implements EvalRunner {
|
|
85
114
|
private readonly opts;
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
private readonly imageLru;
|
|
93
|
-
private readonly imageCacheMax;
|
|
94
|
-
private readonly cleanupImage;
|
|
115
|
+
private readonly pruneRound;
|
|
116
|
+
private readonly diskFloorBytes;
|
|
117
|
+
private readonly freeDiskBytes;
|
|
118
|
+
private readonly systemPrune;
|
|
119
|
+
private readonly resolveImageDigest;
|
|
120
|
+
private readonly evalTimeoutMs;
|
|
95
121
|
constructor(opts: PythonEvalRunnerOptions);
|
|
96
|
-
runEval(args: Parameters<EvalRunner['runEval']>[0]): ReturnType<EvalRunner['runEval']>;
|
|
97
122
|
/**
|
|
98
|
-
*
|
|
99
|
-
*
|
|
100
|
-
* {@link cleanupImage}. Eviction failures are swallowed so a flaky
|
|
101
|
-
* `docker rmi` cannot escape `runEval`.
|
|
102
|
-
*
|
|
103
|
-
* The cap is enforced after the just-used image is inserted: the
|
|
104
|
-
* just-evaluated image is the *most* recent, so repeat-evals of recently
|
|
105
|
-
* used instances never re-pull. Only when more than N distinct images have
|
|
106
|
-
* been used does the oldest get rmi'd.
|
|
123
|
+
* Ensure enough free disk for an eval round. Below the floor → broad prune →
|
|
124
|
+
* re-probe; still below → `InsufficientDiskError` (clean abort). (#476)
|
|
107
125
|
*/
|
|
108
|
-
private
|
|
126
|
+
private ensureDiskHeadroom;
|
|
127
|
+
runEval(args: Parameters<EvalRunner['runEval']>[0]): ReturnType<EvalRunner['runEval']>;
|
|
109
128
|
private runEvalImpl;
|
|
110
129
|
}
|
|
@@ -26,9 +26,10 @@
|
|
|
26
26
|
* re-raises as `SkippableError` (no signed verdict).
|
|
27
27
|
*/
|
|
28
28
|
import { spawn } from 'node:child_process';
|
|
29
|
-
import { mkdtemp, writeFile, readFile, rm } from 'node:fs/promises';
|
|
29
|
+
import { mkdtemp, writeFile, readFile, rm, statfs } from 'node:fs/promises';
|
|
30
30
|
import { tmpdir } from 'node:os';
|
|
31
31
|
import { isAbsolute, join } from 'node:path';
|
|
32
|
+
import { defaultCommandRunner, resolveImageDigest as resolveSubstrateImageDigest, } from '../../../solver-types/_swe-rebench-v2-substrate.js';
|
|
32
33
|
/**
|
|
33
34
|
* Thrown when the eval could not actually grade the solution. There is no
|
|
34
35
|
* signal about the solver here, only about the operator's environment — the
|
|
@@ -46,62 +47,102 @@ export class EvalCouldNotGradeError extends Error {
|
|
|
46
47
|
}
|
|
47
48
|
}
|
|
48
49
|
/**
|
|
49
|
-
*
|
|
50
|
-
*
|
|
51
|
-
*
|
|
52
|
-
*
|
|
53
|
-
*
|
|
54
|
-
|
|
50
|
+
* Thrown by `runEval` when the disk cannot be brought above the eval
|
|
51
|
+
* disk-floor even after a broad prune. A clean abort — the caller stops
|
|
52
|
+
* gracefully; no instance is graded, nothing is marked. Distinct from
|
|
53
|
+
* `EvalCouldNotGradeError`: this is operator-environment, retryable, and must
|
|
54
|
+
* never be turned into a `scorable: false` admission (#476).
|
|
55
|
+
*/
|
|
56
|
+
export class InsufficientDiskError extends Error {
|
|
57
|
+
freeBytes;
|
|
58
|
+
floorBytes;
|
|
59
|
+
constructor(freeBytes, floorBytes) {
|
|
60
|
+
const gb = (n) => (n / 1_000_000_000).toFixed(1);
|
|
61
|
+
super(`insufficient disk for swe-rebench eval: ${gb(freeBytes)} GB free, ` +
|
|
62
|
+
`need ≥ ${gb(floorBytes)} GB`);
|
|
63
|
+
this.name = 'InsufficientDiskError';
|
|
64
|
+
this.freeBytes = freeBytes;
|
|
65
|
+
this.floorBytes = floorBytes;
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
/**
|
|
69
|
+
* Default free-disk floor required before an eval round: 20 GB. A single
|
|
70
|
+
* SWE-rebench eval image was observed to peak transiently at ~12.6 GB, so the
|
|
71
|
+
* floor clears the worst observed instance with real margin. Override with
|
|
72
|
+
* `JINN_EVAL_DISK_FLOOR_GB` on constrained hosts.
|
|
55
73
|
*/
|
|
56
|
-
export const
|
|
57
|
-
|
|
74
|
+
export const DEFAULT_EVAL_DISK_FLOOR_BYTES = 20_000_000_000;
|
|
75
|
+
/** Resolve the disk floor: explicit option > `JINN_EVAL_DISK_FLOOR_GB` env > default. */
|
|
76
|
+
export function resolveDiskFloorBytes(opt) {
|
|
58
77
|
if (typeof opt === 'number' && Number.isFinite(opt) && opt > 0)
|
|
59
78
|
return Math.floor(opt);
|
|
60
|
-
const envRaw = process.env['
|
|
79
|
+
const envRaw = process.env['JINN_EVAL_DISK_FLOOR_GB'];
|
|
61
80
|
if (envRaw !== undefined) {
|
|
62
|
-
// `Number()` returns 0 for `""` / whitespace and NaN for strings with
|
|
63
|
-
// non-numeric content (e.g. `"garbage"`, `"1e3oops"`) — unlike `parseInt`,
|
|
64
|
-
// which would silently accept `parseInt("1e3oops") === 1`. Either way we
|
|
65
|
-
// reject anything that isn't a positive integer.
|
|
66
81
|
const parsed = Number(envRaw);
|
|
67
|
-
if (Number.isFinite(parsed) &&
|
|
68
|
-
return parsed;
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
console.warn(`[swe-rebench-v2] JINN_EVAL_IMAGE_CACHE_MAX=${JSON.stringify(envRaw)} is not a positive integer — using default ${DEFAULT_EVAL_IMAGE_CACHE_MAX}`);
|
|
82
|
+
if (Number.isFinite(parsed) && parsed > 0)
|
|
83
|
+
return Math.floor(parsed * 1_000_000_000);
|
|
84
|
+
console.warn(`[swe-rebench-v2] JINN_EVAL_DISK_FLOOR_GB=${JSON.stringify(envRaw)} is not a positive ` +
|
|
85
|
+
`number — using default ${DEFAULT_EVAL_DISK_FLOOR_BYTES / 1_000_000_000} GB`);
|
|
72
86
|
}
|
|
73
|
-
return
|
|
87
|
+
return DEFAULT_EVAL_DISK_FLOOR_BYTES;
|
|
74
88
|
}
|
|
75
89
|
/**
|
|
76
|
-
*
|
|
77
|
-
*
|
|
78
|
-
*
|
|
79
|
-
*
|
|
80
|
-
* daemon (or a permission slip) becomes visible before disks fill. Silent
|
|
81
|
-
* leaks were the original failure mode `jinn-mono-uy6v.11` exists to fix.
|
|
82
|
-
*
|
|
83
|
-
* We listen on `'exit'` rather than `'close'` and route stdio to `'ignore'`
|
|
84
|
-
* so the resolve path doesn't depend on parent-side stream draining (which
|
|
85
|
-
* can fail to fire `'close'` cleanly when piped without backpressure on the
|
|
86
|
-
* right tick). The image tag + exit code is sufficient signal; operators can
|
|
87
|
-
* grep the docker daemon log for the underlying reason.
|
|
90
|
+
* Default wall-clock limit for one upstream eval.py invocation: 2 hours. Some
|
|
91
|
+
* linux/amd64 SWE-rebench images can wedge indefinitely under Apple Silicon
|
|
92
|
+
* emulation after a native crash, so the subprocess gets a hard guardrail.
|
|
93
|
+
* Override with `JINN_SWE_REBENCH_EVAL_TIMEOUT_MS`; set `0` to disable.
|
|
88
94
|
*/
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
95
|
+
export const DEFAULT_EVAL_TIMEOUT_MS = 2 * 60 * 60 * 1000;
|
|
96
|
+
/** Resolve the eval timeout: explicit option > env > default. */
|
|
97
|
+
export function resolveEvalTimeoutMs(opt) {
|
|
98
|
+
if (typeof opt === 'number' && Number.isFinite(opt) && opt >= 0)
|
|
99
|
+
return Math.floor(opt);
|
|
100
|
+
const envRaw = process.env['JINN_SWE_REBENCH_EVAL_TIMEOUT_MS'];
|
|
101
|
+
if (envRaw !== undefined) {
|
|
102
|
+
const parsed = Number(envRaw);
|
|
103
|
+
if (Number.isFinite(parsed) && parsed >= 0)
|
|
104
|
+
return Math.floor(parsed);
|
|
105
|
+
console.warn(`[swe-rebench-v2] JINN_SWE_REBENCH_EVAL_TIMEOUT_MS=${JSON.stringify(envRaw)} is not a ` +
|
|
106
|
+
`non-negative number — using default ${DEFAULT_EVAL_TIMEOUT_MS} ms`);
|
|
107
|
+
}
|
|
108
|
+
return DEFAULT_EVAL_TIMEOUT_MS;
|
|
109
|
+
}
|
|
110
|
+
/** Production disk probe: free bytes on the filesystem backing the temp dir. */
|
|
111
|
+
async function defaultFreeDiskBytes() {
|
|
112
|
+
const s = await statfs(tmpdir());
|
|
113
|
+
return s.bavail * s.bsize;
|
|
114
|
+
}
|
|
115
|
+
/**
|
|
116
|
+
* Spawn `docker <args>`, resolving regardless of outcome — a failed cleanup
|
|
117
|
+
* command is logged, never thrown (#476: cleanup must not break the eval loop).
|
|
118
|
+
*/
|
|
119
|
+
function runDocker(args) {
|
|
120
|
+
return defaultCommandRunner('docker', args)
|
|
121
|
+
.then((res) => {
|
|
122
|
+
if (res.exitCode !== 0) {
|
|
123
|
+
const detail = (res.stderr || res.stdout).trim();
|
|
124
|
+
console.warn(`[swe-rebench-v2] docker ${args.join(' ')} exited ${res.exitCode}` +
|
|
125
|
+
`${detail ? `: ${detail.slice(-500)}` : ''}`);
|
|
126
|
+
}
|
|
127
|
+
})
|
|
128
|
+
.catch((err) => {
|
|
129
|
+
const reason = err instanceof Error ? err.message : String(err);
|
|
130
|
+
console.warn(`[swe-rebench-v2] docker ${args.join(' ')} failed to spawn: ${reason}`);
|
|
103
131
|
});
|
|
104
132
|
}
|
|
133
|
+
/**
|
|
134
|
+
* Production `pruneRound`: remove the round's image, then prune stopped
|
|
135
|
+
* containers and build cache. Each step is best-effort.
|
|
136
|
+
*/
|
|
137
|
+
async function defaultPruneRound(image) {
|
|
138
|
+
if (image)
|
|
139
|
+
await runDocker(['rmi', '-f', image]);
|
|
140
|
+
await runDocker(['container', 'prune', '-f']);
|
|
141
|
+
await runDocker(['builder', 'prune', '-f']);
|
|
142
|
+
}
|
|
143
|
+
async function defaultResolveImageDigest(imageName) {
|
|
144
|
+
return resolveSubstrateImageDigest(imageName, defaultCommandRunner);
|
|
145
|
+
}
|
|
105
146
|
/**
|
|
106
147
|
* Container-output signatures that mean the eval aborted before producing a
|
|
107
148
|
* usable result — i.e. the operator's environment is the problem, not the
|
|
@@ -120,6 +161,7 @@ const INFRA_SIGNATURES = [
|
|
|
120
161
|
{ rx: /Failed building editable|Failed to build installable wheels/i, reason: 'install_build_failed' },
|
|
121
162
|
{ rx: /No virtual environment found/i, reason: 'venv_missing' },
|
|
122
163
|
{ rx: /exec format error|the requested image's platform .* does not match/i, reason: 'image_arch_mismatch' },
|
|
164
|
+
{ rx: /Fatal Python error:\s*Illegal instruction|Illegal instruction(?:\s+\(core dumped\))?/i, reason: 'image_arch_mismatch' },
|
|
123
165
|
// 2026-05-14 triage (jinn-mono-fufn) — failure fingerprints from real verdicts:
|
|
124
166
|
{ rx: /A virtual environment already exists at \S+\.venv\b/i, reason: 'venv_collision' },
|
|
125
167
|
{ rx: /No module named pytest\b/i, reason: 'pytest_missing' },
|
|
@@ -178,64 +220,62 @@ function buildTestCommands(args) {
|
|
|
178
220
|
}
|
|
179
221
|
export class PythonEvalRunner {
|
|
180
222
|
opts;
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
imageLru = new Set();
|
|
188
|
-
imageCacheMax;
|
|
189
|
-
cleanupImage;
|
|
223
|
+
pruneRound;
|
|
224
|
+
diskFloorBytes;
|
|
225
|
+
freeDiskBytes;
|
|
226
|
+
systemPrune;
|
|
227
|
+
resolveImageDigest;
|
|
228
|
+
evalTimeoutMs;
|
|
190
229
|
constructor(opts) {
|
|
191
230
|
this.opts = opts;
|
|
192
|
-
this.
|
|
193
|
-
this.
|
|
231
|
+
this.pruneRound = opts.pruneRound ?? defaultPruneRound;
|
|
232
|
+
this.diskFloorBytes = resolveDiskFloorBytes(opts.diskFloorBytes);
|
|
233
|
+
this.freeDiskBytes = opts.freeDiskBytes ?? defaultFreeDiskBytes;
|
|
234
|
+
this.systemPrune = opts.systemPrune ?? (() => runDocker(['system', 'prune', '-f']));
|
|
235
|
+
this.resolveImageDigest = opts.resolveImageDigest ?? defaultResolveImageDigest;
|
|
236
|
+
this.evalTimeoutMs = resolveEvalTimeoutMs(opts.evalTimeoutMs);
|
|
237
|
+
}
|
|
238
|
+
/**
|
|
239
|
+
* Ensure enough free disk for an eval round. Below the floor → broad prune →
|
|
240
|
+
* re-probe; still below → `InsufficientDiskError` (clean abort). (#476)
|
|
241
|
+
*/
|
|
242
|
+
async ensureDiskHeadroom() {
|
|
243
|
+
const free = await this.freeDiskBytes();
|
|
244
|
+
if (free >= this.diskFloorBytes)
|
|
245
|
+
return;
|
|
246
|
+
console.warn(`[swe-rebench-v2] low disk (${(free / 1e9).toFixed(1)} GB) — running docker system prune`);
|
|
247
|
+
await this.systemPrune();
|
|
248
|
+
const afterPrune = await this.freeDiskBytes();
|
|
249
|
+
if (afterPrune < this.diskFloorBytes) {
|
|
250
|
+
throw new InsufficientDiskError(afterPrune, this.diskFloorBytes);
|
|
251
|
+
}
|
|
194
252
|
}
|
|
195
253
|
async runEval(args) {
|
|
254
|
+
await this.ensureDiskHeadroom();
|
|
196
255
|
try {
|
|
197
|
-
|
|
256
|
+
const result = await this.runEvalImpl(args);
|
|
257
|
+
let imageDigest = null;
|
|
258
|
+
try {
|
|
259
|
+
imageDigest = await this.resolveImageDigest(args.image);
|
|
260
|
+
}
|
|
261
|
+
catch (err) {
|
|
262
|
+
const reason = err instanceof Error ? err.message : String(err);
|
|
263
|
+
console.warn(`[swe-rebench-v2] resolveImageDigest failed for ${args.image}: ${reason}`);
|
|
264
|
+
}
|
|
265
|
+
return {
|
|
266
|
+
...result,
|
|
267
|
+
...(imageDigest ? { imageDigest } : {}),
|
|
268
|
+
};
|
|
198
269
|
}
|
|
199
270
|
finally {
|
|
200
|
-
//
|
|
201
|
-
// pull-and-crash
|
|
202
|
-
// patch_corrupt, eval_no_report) still left an image on disk; we must
|
|
203
|
-
// count it toward the cache cap so the failure path can't leak the LRU.
|
|
204
|
-
await this.recordImageUsage(args.image);
|
|
205
|
-
}
|
|
206
|
-
}
|
|
207
|
-
/**
|
|
208
|
-
* Move `image` to the most-recently-used slot of the in-process LRU; if the
|
|
209
|
-
* set now exceeds {@link imageCacheMax}, evict the oldest entries via
|
|
210
|
-
* {@link cleanupImage}. Eviction failures are swallowed so a flaky
|
|
211
|
-
* `docker rmi` cannot escape `runEval`.
|
|
212
|
-
*
|
|
213
|
-
* The cap is enforced after the just-used image is inserted: the
|
|
214
|
-
* just-evaluated image is the *most* recent, so repeat-evals of recently
|
|
215
|
-
* used instances never re-pull. Only when more than N distinct images have
|
|
216
|
-
* been used does the oldest get rmi'd.
|
|
217
|
-
*/
|
|
218
|
-
async recordImageUsage(image) {
|
|
219
|
-
if (!image)
|
|
220
|
-
return;
|
|
221
|
-
// Refresh recency: delete-then-add reinserts at the tail of the set.
|
|
222
|
-
this.imageLru.delete(image);
|
|
223
|
-
this.imageLru.add(image);
|
|
224
|
-
while (this.imageLru.size > this.imageCacheMax) {
|
|
225
|
-
const oldest = this.imageLru.values().next().value;
|
|
226
|
-
if (!oldest)
|
|
227
|
-
break;
|
|
228
|
-
this.imageLru.delete(oldest);
|
|
271
|
+
// Prune this round's full Docker footprint — even when the eval threw,
|
|
272
|
+
// a pull-and-crash still left an image on disk (#476).
|
|
229
273
|
try {
|
|
230
|
-
await this.
|
|
274
|
+
await this.pruneRound(args.image);
|
|
231
275
|
}
|
|
232
276
|
catch (err) {
|
|
233
|
-
// Best-effort GC: a failed rmi leaves the image on disk but mustn't
|
|
234
|
-
// break the loop. Warn so a flaky `docker` (or a permission slip)
|
|
235
|
-
// becomes visible before disks fill — silent leaks were the whole
|
|
236
|
-
// problem this bead exists to fix.
|
|
237
277
|
const reason = err instanceof Error ? err.message : String(err);
|
|
238
|
-
console.warn(`[swe-rebench-v2]
|
|
278
|
+
console.warn(`[swe-rebench-v2] pruneRound failed for ${args.image}: ${reason}`);
|
|
239
279
|
}
|
|
240
280
|
}
|
|
241
281
|
}
|
|
@@ -280,6 +320,7 @@ export class PythonEvalRunner {
|
|
|
280
320
|
const child = spawn(this.opts.pythonBin ?? 'python3', pyArgs, {
|
|
281
321
|
cwd: this.opts.upstreamRepoDir,
|
|
282
322
|
stdio: ['ignore', 'pipe', 'pipe'],
|
|
323
|
+
detached: process.platform !== 'win32',
|
|
283
324
|
// SWE-rebench eval images are published for linux/amd64. Pin the platform
|
|
284
325
|
// so the upstream `docker run` is consistent on amd64 hosts and does not
|
|
285
326
|
// silently crash under arm64 emulation on dev machines.
|
|
@@ -289,10 +330,54 @@ export class PythonEvalRunner {
|
|
|
289
330
|
child.stderr.on('data', (d) => { stderr += d.toString(); });
|
|
290
331
|
let stdout = '';
|
|
291
332
|
child.stdout.on('data', (d) => { stdout += d.toString(); });
|
|
333
|
+
let timedOut = false;
|
|
334
|
+
let closed = false;
|
|
335
|
+
let killTimer;
|
|
336
|
+
const killChild = (signal) => {
|
|
337
|
+
const pid = child.pid;
|
|
338
|
+
if (!pid)
|
|
339
|
+
return;
|
|
340
|
+
try {
|
|
341
|
+
if (process.platform === 'win32') {
|
|
342
|
+
child.kill(signal);
|
|
343
|
+
}
|
|
344
|
+
else {
|
|
345
|
+
process.kill(-pid, signal);
|
|
346
|
+
}
|
|
347
|
+
}
|
|
348
|
+
catch {
|
|
349
|
+
try {
|
|
350
|
+
child.kill(signal);
|
|
351
|
+
}
|
|
352
|
+
catch { }
|
|
353
|
+
}
|
|
354
|
+
};
|
|
355
|
+
const timeoutTimer = this.evalTimeoutMs > 0
|
|
356
|
+
? setTimeout(() => {
|
|
357
|
+
timedOut = true;
|
|
358
|
+
killChild('SIGTERM');
|
|
359
|
+
killTimer = setTimeout(() => {
|
|
360
|
+
if (!closed)
|
|
361
|
+
killChild('SIGKILL');
|
|
362
|
+
}, 10_000);
|
|
363
|
+
killTimer.unref?.();
|
|
364
|
+
}, this.evalTimeoutMs)
|
|
365
|
+
: undefined;
|
|
366
|
+
timeoutTimer?.unref?.();
|
|
292
367
|
const exitCode = await new Promise((resolve, reject) => {
|
|
293
368
|
child.on('close', (code) => resolve(code ?? 1));
|
|
294
369
|
child.on('error', reject);
|
|
370
|
+
}).finally(() => {
|
|
371
|
+
closed = true;
|
|
372
|
+
if (timeoutTimer)
|
|
373
|
+
clearTimeout(timeoutTimer);
|
|
374
|
+
if (killTimer)
|
|
375
|
+
clearTimeout(killTimer);
|
|
295
376
|
});
|
|
377
|
+
if (timedOut) {
|
|
378
|
+
await rm(tmp, { recursive: true, force: true });
|
|
379
|
+
throw new EvalCouldNotGradeError('eval_timeout', `python eval timed out after ${this.evalTimeoutMs}ms; ${(stderr || stdout).slice(-800)}`);
|
|
380
|
+
}
|
|
296
381
|
let report;
|
|
297
382
|
try {
|
|
298
383
|
report = JSON.parse(await readFile(reportPath, 'utf8'));
|