@veewo/gitnexus 1.5.0 → 1.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/benchmark/agent-context/runner.js +3 -0
- package/dist/benchmark/agent-context/runner.test.js +22 -0
- package/dist/benchmark/agent-context/tool-runner.d.ts +7 -6
- package/dist/benchmark/agent-safe-query-context/io.d.ts +2 -0
- package/dist/benchmark/agent-safe-query-context/io.js +86 -0
- package/dist/benchmark/agent-safe-query-context/io.test.d.ts +1 -0
- package/dist/benchmark/agent-safe-query-context/io.test.js +13 -0
- package/dist/benchmark/agent-safe-query-context/report.d.ts +57 -0
- package/dist/benchmark/agent-safe-query-context/report.js +159 -0
- package/dist/benchmark/agent-safe-query-context/report.test.d.ts +1 -0
- package/dist/benchmark/agent-safe-query-context/report.test.js +362 -0
- package/dist/benchmark/agent-safe-query-context/runner.d.ts +44 -0
- package/dist/benchmark/agent-safe-query-context/runner.js +406 -0
- package/dist/benchmark/agent-safe-query-context/runner.test.d.ts +1 -0
- package/dist/benchmark/agent-safe-query-context/runner.test.js +290 -0
- package/dist/benchmark/agent-safe-query-context/semantic-tuple.d.ts +20 -0
- package/dist/benchmark/agent-safe-query-context/semantic-tuple.js +225 -0
- package/dist/benchmark/agent-safe-query-context/semantic-tuple.test.d.ts +1 -0
- package/dist/benchmark/agent-safe-query-context/semantic-tuple.test.js +122 -0
- package/dist/benchmark/agent-safe-query-context/subagent-live.d.ts +47 -0
- package/dist/benchmark/agent-safe-query-context/subagent-live.js +128 -0
- package/dist/benchmark/agent-safe-query-context/subagent-live.test.d.ts +1 -0
- package/dist/benchmark/agent-safe-query-context/subagent-live.test.js +155 -0
- package/dist/benchmark/agent-safe-query-context/telemetry-tool.d.ts +9 -0
- package/dist/benchmark/agent-safe-query-context/telemetry-tool.js +77 -0
- package/dist/benchmark/agent-safe-query-context/types.d.ts +61 -0
- package/dist/benchmark/agent-safe-query-context/types.js +8 -0
- package/dist/benchmark/runtime-poc/provenance-artifact.d.ts +47 -0
- package/dist/benchmark/runtime-poc/provenance-artifact.js +89 -0
- package/dist/benchmark/runtime-poc/runner.d.ts +31 -0
- package/dist/benchmark/runtime-poc/runner.js +163 -0
- package/dist/benchmark/u2-e2e/hydration-policy-repeatability-runner.d.ts +8 -0
- package/dist/benchmark/u2-e2e/hydration-policy-repeatability-runner.js +21 -0
- package/dist/benchmark/u2-e2e/phase2-runtime-claim-acceptance-runner.d.ts +0 -1
- package/dist/benchmark/u2-e2e/phase2-runtime-claim-acceptance-runner.js +53 -51
- package/dist/benchmark/u2-e2e/phase2-runtime-claim-acceptance-runner.test.js +0 -1
- package/dist/benchmark/u2-e2e/phase5-rule-lab-acceptance-runner.d.ts +1 -1
- package/dist/benchmark/u2-e2e/phase5-rule-lab-acceptance-runner.js +82 -18
- package/dist/benchmark/u2-e2e/phase5-rule-lab-acceptance-runner.test.js +1 -2
- package/dist/benchmark/u2-e2e/retrieval-runner.js +15 -7
- package/dist/benchmark/u2-e2e/retrieval-runner.test.js +46 -0
- package/dist/cli/ai-context.js +2 -12
- package/dist/cli/ai-context.test.js +8 -0
- package/dist/cli/analyze-runtime-summary.js +1 -0
- package/dist/cli/analyze-runtime-summary.test.js +2 -0
- package/dist/cli/analyze-summary.d.ts +2 -0
- package/dist/cli/analyze-summary.js +24 -0
- package/dist/cli/analyze-summary.test.js +65 -1
- package/dist/cli/analyze.js +5 -1
- package/dist/cli/benchmark-agent-safe-query-context.d.ts +20 -0
- package/dist/cli/benchmark-agent-safe-query-context.js +39 -0
- package/dist/cli/benchmark-agent-safe-query-context.test.d.ts +1 -0
- package/dist/cli/benchmark-agent-safe-query-context.test.js +271 -0
- package/dist/cli/benchmark.d.ts +29 -0
- package/dist/cli/benchmark.js +55 -0
- package/dist/cli/index.js +23 -0
- package/dist/cli/rule-lab.d.ts +3 -7
- package/dist/cli/rule-lab.js +13 -22
- package/dist/cli/rule-lab.test.js +23 -3
- package/dist/cli/tool.d.ts +2 -0
- package/dist/cli/tool.js +2 -0
- package/dist/core/config/unity-config.d.ts +0 -1
- package/dist/core/config/unity-config.js +0 -1
- package/dist/core/ingestion/pipeline.js +35 -6
- package/dist/core/ingestion/unity-lifecycle-synthetic-calls.test.js +18 -20
- package/dist/core/ingestion/unity-parity-seed.d.ts +2 -1
- package/dist/core/ingestion/unity-parity-seed.js +8 -0
- package/dist/core/ingestion/unity-resource-processor.d.ts +11 -0
- package/dist/core/ingestion/unity-resource-processor.js +102 -0
- package/dist/core/ingestion/unity-resource-processor.test.js +449 -0
- package/dist/core/ingestion/unity-runtime-binding-rules.d.ts +15 -0
- package/dist/core/ingestion/unity-runtime-binding-rules.js +178 -30
- package/dist/core/lbug/csv-generator.test.js +2 -2
- package/dist/core/unity/doc-contract.test.d.ts +1 -0
- package/dist/core/unity/doc-contract.test.js +30 -0
- package/dist/core/unity/prefab-source-scan.d.ts +25 -0
- package/dist/core/unity/prefab-source-scan.js +152 -0
- package/dist/core/unity/prefab-source-scan.test.d.ts +1 -0
- package/dist/core/unity/prefab-source-scan.test.js +70 -0
- package/dist/core/unity/scan-context.d.ts +12 -0
- package/dist/core/unity/scan-context.js +50 -2
- package/dist/core/unity/scan-context.test.js +74 -0
- package/dist/mcp/local/agent-safe-response.d.ts +10 -0
- package/dist/mcp/local/agent-safe-response.js +639 -0
- package/dist/mcp/local/derived-process-reader.js +1 -1
- package/dist/mcp/local/local-backend.d.ts +18 -1
- package/dist/mcp/local/local-backend.js +319 -125
- package/dist/mcp/local/process-confidence.d.ts +1 -2
- package/dist/mcp/local/process-confidence.js +0 -3
- package/dist/mcp/local/process-confidence.test.js +4 -2
- package/dist/mcp/local/process-evidence.d.ts +1 -8
- package/dist/mcp/local/process-evidence.js +1 -23
- package/dist/mcp/local/process-evidence.test.js +2 -16
- package/dist/mcp/local/process-ref.d.ts +1 -1
- package/dist/mcp/local/runtime-chain-closure-evaluator.d.ts +33 -0
- package/dist/mcp/local/runtime-chain-closure-evaluator.js +273 -0
- package/dist/mcp/local/runtime-chain-graph-candidates.d.ts +23 -0
- package/dist/mcp/local/runtime-chain-graph-candidates.js +131 -0
- package/dist/mcp/local/runtime-chain-verify.d.ts +1 -1
- package/dist/mcp/local/runtime-chain-verify.js +149 -138
- package/dist/mcp/local/runtime-chain-verify.test.js +126 -68
- package/dist/mcp/local/runtime-claim-rule-registry.d.ts +4 -0
- package/dist/mcp/local/runtime-claim-rule-registry.js +4 -0
- package/dist/mcp/local/runtime-claim-rule-registry.test.js +37 -4
- package/dist/mcp/local/runtime-claim.d.ts +11 -0
- package/dist/mcp/local/runtime-claim.js +28 -0
- package/dist/mcp/local/unity-evidence-view.d.ts +1 -1
- package/dist/mcp/local/unity-evidence-view.js +1 -1
- package/dist/mcp/local/unity-evidence-view.test.js +22 -0
- package/dist/mcp/tools.js +51 -21
- package/dist/rule-lab/analyze.d.ts +2 -1
- package/dist/rule-lab/analyze.js +94 -59
- package/dist/rule-lab/analyze.test.js +238 -20
- package/dist/rule-lab/curate.d.ts +2 -1
- package/dist/rule-lab/curate.js +24 -3
- package/dist/rule-lab/curate.test.js +65 -0
- package/dist/rule-lab/curation-input-builder.d.ts +45 -0
- package/dist/rule-lab/curation-input-builder.js +133 -0
- package/dist/rule-lab/promote.js +80 -7
- package/dist/rule-lab/promote.test.js +150 -0
- package/dist/rule-lab/review-pack.d.ts +3 -0
- package/dist/rule-lab/review-pack.js +41 -1
- package/dist/rule-lab/review-pack.test.js +67 -0
- package/dist/rule-lab/types.d.ts +29 -0
- package/dist/types/pipeline.d.ts +3 -0
- package/package.json +4 -3
- package/scripts/run-node-tests.mjs +61 -0
- package/skills/_shared/unity-rule-authoring-contract.md +64 -0
- package/skills/_shared/unity-runtime-process-contract.md +16 -0
- package/skills/gitnexus-cli.md +8 -0
- package/skills/gitnexus-debugging.md +9 -0
- package/skills/gitnexus-exploring.md +66 -18
- package/skills/gitnexus-guide.md +42 -3
- package/skills/gitnexus-impact-analysis.md +8 -0
- package/skills/gitnexus-pr-review.md +8 -0
- package/skills/gitnexus-refactoring.md +8 -0
- package/skills/gitnexus-unity-rule-gen.md +66 -312
|
@@ -6,6 +6,9 @@ function buildToolInput(step, repo) {
|
|
|
6
6
|
if (repo) {
|
|
7
7
|
input.repo = repo;
|
|
8
8
|
}
|
|
9
|
+
if ((step.tool === 'query' || step.tool === 'context') && !('response_profile' in input)) {
|
|
10
|
+
input.response_profile = 'full';
|
|
11
|
+
}
|
|
9
12
|
// LocalBackend impact contract uses `target_uid`, while dataset rows may carry `uid`.
|
|
10
13
|
if (step.tool === 'impact') {
|
|
11
14
|
const uid = input.uid;
|
|
@@ -77,3 +77,25 @@ test('executeToolPlan maps impact uid to target_uid for backend impact contract'
|
|
|
77
77
|
assert.equal(calls.length, 1);
|
|
78
78
|
assert.equal(calls[0].target_uid, 'Class:Assets/NEON/Code/NetworkCode/NeonMgr/MirrorNetMgr.cs:MirrorNetMgr');
|
|
79
79
|
});
|
|
80
|
+
// Regression test: query/context steps whose input omits `response_profile`
// must be forwarded to the runner with `response_profile: 'full'`.
test('executeToolPlan injects response_profile=full for legacy query/context payloads', async () => {
    const calls = [];
    // Builds a fake tool method that records the params it was invoked with.
    const record = (tool) => async (params) => {
        calls.push({ tool, params });
        return {};
    };
    const fakeRunner = {
        query: record('query'),
        context: record('context'),
        impact: async () => ({}),
        cypher: async () => ({}),
        close: async () => { },
    };
    await executeToolPlan([
        { tool: 'query', input: { query: 'Target' } },
        { tool: 'context', input: { name: 'Target' } },
    ], fakeRunner, 'sample-repo');
    // Assert the call count first so a missing invocation fails as a readable
    // assertion instead of a TypeError on `calls[n].params` below.
    assert.equal(calls.length, 2);
    assert.equal(calls[0].params.response_profile, 'full');
    assert.equal(calls[1].params.response_profile, 'full');
});
|
|
@@ -1,7 +1,8 @@
|
|
|
1
|
-
export
|
|
2
|
-
query: (params:
|
|
3
|
-
context: (params:
|
|
4
|
-
impact: (params:
|
|
5
|
-
cypher: (params:
|
|
1
|
+
/**
 * Shape of the tool runner consumed by the agent-context benchmarks: one async
 * method per tool name (`query`, `context`, `impact`, `cypher`) plus `close`.
 * Every tool method takes a loosely typed parameter bag and resolves with an
 * untyped result.
 */
export interface AgentContextToolRunner {
    /** Invokes the `query` tool with the given parameters. */
    query: (params: Record<string, unknown>) => Promise<any>;
    /** Invokes the `context` tool with the given parameters. */
    context: (params: Record<string, unknown>) => Promise<any>;
    /** Invokes the `impact` tool with the given parameters. */
    impact: (params: Record<string, unknown>) => Promise<any>;
    /** Invokes the `cypher` tool with the given parameters. */
    cypher: (params: Record<string, unknown>) => Promise<any>;
    /** Closes the runner; resolves with no value. */
    close: () => Promise<void>;
}
/** Asynchronously creates an {@link AgentContextToolRunner}. */
export declare function createAgentContextToolRunner(): Promise<AgentContextToolRunner>;
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import fs from 'node:fs/promises';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import { AGENT_SAFE_CASE_KEYS } from './types.js';
|
|
4
|
+
// Case-insensitive pattern for leftover template text (TODO, TBD, "placeholder",
// "<resource>", "<symbol>"); assertNoPlaceholder rejects any value it matches.
const PLACEHOLDER_RE = /TODO|TBD|placeholder|<resource>|<symbol>/i;
|
|
5
|
+
/**
 * Loads a benchmark suite from a directory.
 *
 * Reads and parses `thresholds.json`, then `cases.json`, from `root`, and
 * validates the entry for every key in AGENT_SAFE_CASE_KEYS via assertCase.
 *
 * @param {string} root - Directory that contains the two JSON files.
 * @returns {Promise<{thresholds: any, cases: any}>} The parsed file contents.
 * @throws {Error} When a file cannot be read or parsed, or a case fails validation.
 */
export async function loadAgentSafeQueryContextSuite(root) {
    // Shared read-and-parse step for both suite files.
    const readJson = async (fileName) => {
        const raw = await fs.readFile(path.join(root, fileName), 'utf-8');
        return JSON.parse(raw);
    };
    const thresholds = await readJson('thresholds.json');
    const cases = await readJson('cases.json');
    for (const caseKey of AGENT_SAFE_CASE_KEYS) {
        assertCase(caseKey, cases[caseKey]);
    }
    return { thresholds, cases };
}
|
|
13
|
+
/**
 * Validates one benchmark case and throws on the first problem found.
 *
 * Checks, in order: the case exists; `label`, `start_query`, `retry_query` and
 * `proof_cypher` are non-empty strings without placeholder text;
 * `proof_contexts` and `tool_plan` are non-empty arrays; the semantic tuple;
 * the live task; and string values inside the optional `start_query_input` /
 * `retry_query_input` objects.
 *
 * @param {string} name - Case key, used as the prefix of error messages.
 * @param {object} value - Case definition as parsed from `cases.json`.
 * @throws {Error} Describing the first missing or placeholder-bearing field.
 */
function assertCase(name, value) {
    if (!value) {
        throw new Error(`missing required case: ${name}`);
    }
    for (const field of ['label', 'start_query', 'retry_query', 'proof_cypher']) {
        const candidate = value[field];
        if (!candidate || typeof candidate !== 'string') {
            throw new Error(`missing required field: ${name}.${field}`);
        }
        assertNoPlaceholder(`${name}.${field}`, candidate);
    }
    if (!Array.isArray(value.proof_contexts) || value.proof_contexts.length === 0) {
        throw new Error(`missing required field: ${name}.proof_contexts`);
    }
    value.proof_contexts.forEach((entry, index) => assertNoPlaceholder(`${name}.proof_contexts[${index}]`, entry));
    if (!Array.isArray(value.tool_plan) || value.tool_plan.length === 0) {
        throw new Error(`missing required field: ${name}.tool_plan`);
    }
    // Validate the tuple before the live task: the live-task check reads
    // proof edges off the tuple, so a case without `semantic_tuple` must fail
    // here with a descriptive message rather than with a TypeError there.
    assertSemanticTuple(name, value.semantic_tuple);
    assertLiveTask(name, value.live_task, value.semantic_tuple);
    // Both optional input objects get the same scan; only string values are checked.
    for (const inputField of ['start_query_input', 'retry_query_input']) {
        const input = value[inputField];
        if (!input || typeof input !== 'object') {
            continue;
        }
        for (const entry of Object.values(input)) {
            if (typeof entry === 'string') {
                assertNoPlaceholder(`${name}.${inputField}`, entry);
            }
        }
    }
}
|
|
48
|
+
/**
 * Validates the `live_task` section of a benchmark case.
 *
 * `objective`, `symbol_seed` and `resource_seed` must be non-empty strings
 * without placeholder text. The objective must also not contain the tuple's
 * `proof_edge`, nor contain every entry of a non-empty `proof_edges` list.
 *
 * @param {string} name - Case key, used as the prefix of error messages.
 * @param {object} liveTask - The case's `live_task` object.
 * @param {object} [tuple] - The case's `semantic_tuple`; when absent the leak
 *   checks are skipped (its absence is reported by the tuple validation).
 * @throws {Error} On a missing field, placeholder text, or a leaked proof edge.
 */
function assertLiveTask(name, liveTask, tuple) {
    if (!liveTask) {
        throw new Error(`missing required field: ${name}.live_task`);
    }
    for (const field of ['objective', 'symbol_seed', 'resource_seed']) {
        const candidate = liveTask[field];
        if (!candidate || typeof candidate !== 'string') {
            throw new Error(`missing required field: ${name}.live_task.${field}`);
        }
        assertNoPlaceholder(`${name}.live_task.${field}`, candidate);
    }
    const { objective } = liveTask;
    const proofEdge = tuple?.proof_edge;
    if (proofEdge && objective.includes(proofEdge)) {
        throw new Error(`${name}.live_task.objective leaks canonical proof_edge`);
    }
    // `every` is vacuously true for an empty list, so require at least one
    // edge before treating "all edges appear in the objective" as a leak.
    const proofEdges = Array.isArray(tuple?.proof_edges) ? tuple.proof_edges : [];
    if (proofEdges.length > 0 && proofEdges.every((edge) => objective.includes(edge))) {
        throw new Error(`${name}.live_task.objective leaks canonical proof_edges`);
    }
}
|
|
66
|
+
/**
 * Validates the `semantic_tuple` section of a benchmark case.
 *
 * The tuple must exist; its `resource_anchor`, `symbol_anchor`, `proof_edge`
 * (when set) and each `proof_edges` entry (when set) are checked for
 * placeholder text; and at least one proof edge must be provided through
 * either `proof_edge` or a non-empty `proof_edges`.
 *
 * @param {string} name - Case key, used as the prefix of error messages.
 * @param {object} tuple - The case's `semantic_tuple` object.
 * @throws {Error} When the tuple is missing, holds placeholder text, or has no proof edge.
 */
function assertSemanticTuple(name, tuple) {
    const prefix = `${name}.semantic_tuple`;
    if (!tuple) {
        throw new Error(`missing required field: ${prefix}`);
    }
    for (const anchor of ['resource_anchor', 'symbol_anchor']) {
        assertNoPlaceholder(`${prefix}.${anchor}`, tuple[anchor]);
    }
    const singleEdge = tuple.proof_edge;
    const edgeList = tuple.proof_edges;
    if (singleEdge) {
        assertNoPlaceholder(`${prefix}.proof_edge`, singleEdge);
    }
    if (edgeList) {
        edgeList.forEach((edge, position) => {
            assertNoPlaceholder(`${prefix}.proof_edges[${position}]`, edge);
        });
    }
    const hasEdgeList = Boolean(edgeList) && edgeList.length !== 0;
    if (!singleEdge && !hasEdgeList) {
        throw new Error(`missing proof edge(s): ${prefix}`);
    }
}
|
|
82
|
+
/**
 * Throws when `value` matches PLACEHOLDER_RE.
 *
 * @param {string} field - Dotted path of the field, used in the error message.
 * @param {*} value - Value to test; the regex test coerces it to a string.
 * @throws {Error} `<field> contains placeholder text` on a match.
 */
function assertNoPlaceholder(field, value) {
    const hasPlaceholder = PLACEHOLDER_RE.test(value);
    if (!hasPlaceholder) {
        return;
    }
    throw new Error(`${field} contains placeholder text`);
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import test from 'node:test';
|
|
2
|
+
import assert from 'node:assert/strict';
|
|
3
|
+
import path from 'node:path';
|
|
4
|
+
import { loadAgentSafeQueryContextSuite } from './io.js';
|
|
5
|
+
// Loads the neonspark-v1 suite from disk and pins a few canonical values.
// The suite path is relative, so it is resolved against the process working directory.
test('loads canonical benchmark cases without placeholders', async () => {
    const suiteRoot = path.resolve('../benchmarks/agent-safe-query-context/neonspark-v1');
    const suite = await loadAgentSafeQueryContextSuite(suiteRoot);
    const caseKeys = Object.keys(suite.cases).sort();
    assert.deepEqual(caseKeys, ['reload', 'weapon_powerup']);
    const weaponPowerup = suite.cases.weapon_powerup;
    const reload = suite.cases.reload;
    assert.equal(weaponPowerup.semantic_tuple.resource_anchor, 'Assets/NEON/DataAssets/Powerups/1_newWeapon/0_pick/法器_Orb/1_weapon_orb_key.asset');
    assert.equal(reload.semantic_tuple.proof_edge, 'ReloadBase.GetValue -> ReloadBase.CheckReload');
    assert.equal(weaponPowerup.live_task.symbol_seed, 'WeaponPowerUp');
    assert.equal(reload.live_task.resource_seed, 'Assets/NEON/Graphs/PlayerGun/Gungraph_use/1_weapon_orb_key.asset');
    // The live-task objective must not give away the canonical proof edge.
    const leaksProofEdge = weaponPowerup.live_task.objective.includes('HoldPickup -> WeaponPowerUp.PickItUp');
    assert.equal(leaksProofEdge, false);
});
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import { executeToolPlan } from '../agent-context/runner.js';
|
|
2
|
+
import { type AgentContextToolRunner } from '../agent-context/tool-runner.js';
|
|
3
|
+
import { type WorkflowReplayResult } from './runner.js';
|
|
4
|
+
import { loadSubagentLiveCaseResult, type SubagentLiveResult, type TelemetryStep } from './subagent-live.js';
|
|
5
|
+
import type { AgentSafeBenchmarkCase, AgentSafeBenchmarkSuite, AgentSafeCaseKey, SemanticTuple } from './types.js';
|
|
6
|
+
type CaseKey = AgentSafeCaseKey;
|
|
7
|
+
/** Result of executing one case's scripted tool plan under a single response profile. */
export interface SameScriptCaseResult {
    /** The tool plan that was executed. */
    tool_plan: AgentSafeBenchmarkCase['tool_plan'];
    /** One telemetry record per executed tool step. */
    steps: TelemetryStep[];
    /** Semantic tuple associated with this run. */
    semantic_tuple: SemanticTuple;
    /** Whether the semantic tuple check passed for this run. */
    semantic_tuple_pass: boolean;
    /** Number of tool calls made by the run. */
    tool_calls_to_completion: number;
    /** Token count attributed to the run. */
    tokens_to_completion: number;
}
|
|
15
|
+
/** Report produced by runAgentSafeQueryContextBenchmark; most sections are keyed by case. */
export interface AgentSafeQueryContextBenchmarkReport {
    /** Timestamp string recorded when the report was built. */
    generatedAt: string;
    /** Workflow replay results per case for the `full` response profile. */
    workflow_replay_full: Record<CaseKey, WorkflowReplayResult>;
    /** Workflow replay results per case for the `slim` response profile. */
    workflow_replay_slim: Record<CaseKey, WorkflowReplayResult>;
    /** Scripted tool-plan results per case for the `full` response profile. */
    same_script_full: Record<CaseKey, SameScriptCaseResult>;
    /** Scripted tool-plan results per case for the `slim` response profile. */
    same_script_slim: Record<CaseKey, SameScriptCaseResult>;
    /** Subagent-live results per case. */
    subagent_live: Record<CaseKey, SubagentLiveResult>;
    /** Acceptance verdicts: `pass` is the overall verdict, `cases` the per-case ones. */
    acceptance: {
        pass: boolean;
        cases: Record<CaseKey, boolean>;
    };
    /** Top-level pass flag (report.js sets it to the same value as `acceptance.pass`). */
    pass: boolean;
    /** Per-case subagent-live results (report.js sets it to the same object as `subagent_live`). */
    cases: Record<CaseKey, SubagentLiveResult>;
    /** Scripted-run view: the tool plan per case and the per-case results. */
    same_script: {
        tool_plan: Record<CaseKey, AgentSafeBenchmarkCase['tool_plan']>;
        cases: Record<CaseKey, SameScriptCaseResult>;
    };
    /** Semantic-equivalence verdicts: overall `pass` plus one flag per case. */
    semantic_equivalence: {
        pass: boolean;
        cases: Record<CaseKey, boolean>;
    };
    /** Token counts per case: `before`, `after`, their difference `saved`, and `reduction`. */
    token_summary: Record<CaseKey, {
        before: number;
        after: number;
        saved: number;
        reduction: number;
    }>;
    /** Tool-call counts per case: `before`, `after`, and their difference `saved`. */
    call_summary: Record<CaseKey, {
        before: number;
        after: number;
        saved: number;
    }>;
}
|
|
48
|
+
/**
 * Runs the agent-safe query/context benchmark over `suite` and resolves with the report.
 *
 * @param suite - Benchmark suite to run.
 * @param options - `repo` and `subagentRunsDir`. NOTE: `subagentRunsDir` is typed
 *   as optional, but the implementation in report.js throws when it is not set.
 * @param deps - Optional overrides for the runner, the tool-plan executor and the
 *   subagent-live loader.
 */
export declare function runAgentSafeQueryContextBenchmark(suite: AgentSafeBenchmarkSuite, options: {
    repo?: string;
    subagentRunsDir?: string;
}, deps?: {
    runner?: AgentContextToolRunner;
    executeToolPlan?: typeof executeToolPlan;
    loadSubagentLiveCaseResult?: typeof loadSubagentLiveCaseResult;
}): Promise<AgentSafeQueryContextBenchmarkReport>;
/**
 * Writes the report, together with a generated markdown summary, to `reportDir`.
 *
 * @param reportDir - Destination directory passed through to the report writer.
 * @param report - Report returned by runAgentSafeQueryContextBenchmark.
 */
export declare function writeAgentSafeQueryContextReports(reportDir: string, report: AgentSafeQueryContextBenchmarkReport): Promise<void>;
|
|
57
|
+
export {};
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
import path from 'node:path';
|
|
2
|
+
import { estimateTokens } from '../u2-e2e/metrics.js';
|
|
3
|
+
import { writeReports } from '../report.js';
|
|
4
|
+
import { executeToolPlan } from '../agent-context/runner.js';
|
|
5
|
+
import { createAgentContextToolRunner } from '../agent-context/tool-runner.js';
|
|
6
|
+
import { deriveSemanticTuple, semanticTuplePass } from './semantic-tuple.js';
|
|
7
|
+
import { runWorkflowReplay } from './runner.js';
|
|
8
|
+
import { loadSubagentLiveCaseResult } from './subagent-live.js';
|
|
9
|
+
export async function runAgentSafeQueryContextBenchmark(suite, options, deps = {}) {
|
|
10
|
+
const runner = deps.runner || (await createAgentContextToolRunner());
|
|
11
|
+
const ownsRunner = !deps.runner;
|
|
12
|
+
const executeToolPlanImpl = deps.executeToolPlan || executeToolPlan;
|
|
13
|
+
const loadSubagentLiveCaseResultImpl = deps.loadSubagentLiveCaseResult || loadSubagentLiveCaseResult;
|
|
14
|
+
const workflowReplayFullCases = {};
|
|
15
|
+
const workflowReplaySlimCases = {};
|
|
16
|
+
const sameScriptFullCases = {};
|
|
17
|
+
const sameScriptSlimCases = {};
|
|
18
|
+
const subagentLiveCases = {};
|
|
19
|
+
const acceptanceCases = {};
|
|
20
|
+
const semanticEquivalenceCases = {};
|
|
21
|
+
const tokenSummary = {};
|
|
22
|
+
const callSummary = {};
|
|
23
|
+
if (!options.subagentRunsDir) {
|
|
24
|
+
throw new Error('subagentRunsDir is required for real subagent benchmark runs');
|
|
25
|
+
}
|
|
26
|
+
try {
|
|
27
|
+
for (const key of Object.keys(suite.cases)) {
|
|
28
|
+
const benchmarkCase = suite.cases[key];
|
|
29
|
+
const workflowReplayFull = await runWorkflowReplay(benchmarkCase, runner, {
|
|
30
|
+
repo: options.repo,
|
|
31
|
+
maxSteps: suite.thresholds.workflowReplay.maxSteps,
|
|
32
|
+
responseProfile: 'full',
|
|
33
|
+
});
|
|
34
|
+
const workflowReplaySlim = await runWorkflowReplay(benchmarkCase, runner, {
|
|
35
|
+
repo: options.repo,
|
|
36
|
+
maxSteps: suite.thresholds.workflowReplay.maxSteps,
|
|
37
|
+
responseProfile: 'slim',
|
|
38
|
+
});
|
|
39
|
+
const sameScriptFull = await runSameScriptCase(benchmarkCase, runner, executeToolPlanImpl, {
|
|
40
|
+
repo: options.repo,
|
|
41
|
+
responseProfile: 'full',
|
|
42
|
+
});
|
|
43
|
+
const sameScriptSlim = await runSameScriptCase(benchmarkCase, runner, executeToolPlanImpl, {
|
|
44
|
+
repo: options.repo,
|
|
45
|
+
responseProfile: 'slim',
|
|
46
|
+
});
|
|
47
|
+
const subagentLive = await loadSubagentLiveCaseResultImpl(path.join(options.subagentRunsDir, key), benchmarkCase);
|
|
48
|
+
workflowReplayFullCases[key] = workflowReplayFull;
|
|
49
|
+
workflowReplaySlimCases[key] = workflowReplaySlim;
|
|
50
|
+
sameScriptFullCases[key] = sameScriptFull;
|
|
51
|
+
sameScriptSlimCases[key] = sameScriptSlim;
|
|
52
|
+
subagentLiveCases[key] = subagentLive;
|
|
53
|
+
acceptanceCases[key] = workflowReplaySlim.semantic_tuple_pass
|
|
54
|
+
&& workflowReplaySlim.post_narrowing_anchor_pass
|
|
55
|
+
&& workflowReplaySlim.post_narrowing_follow_up_hit
|
|
56
|
+
&& workflowReplaySlim.guid_invariance_pass
|
|
57
|
+
&& workflowReplaySlim.live_tool_evidence_pass
|
|
58
|
+
&& workflowReplaySlim.freeze_ready
|
|
59
|
+
&& workflowReplaySlim.tier_envelope.facts_present
|
|
60
|
+
&& workflowReplaySlim.tier_envelope.closure_present
|
|
61
|
+
&& workflowReplaySlim.tier_envelope.clues_present
|
|
62
|
+
&& workflowReplaySlim.tier_envelope.semantic_order_pass
|
|
63
|
+
&& !workflowReplaySlim.placeholder_leak_detected
|
|
64
|
+
&& !workflowReplaySlim.heuristic_top_summary_detected;
|
|
65
|
+
semanticEquivalenceCases[key] = sameScriptSlim.semantic_tuple_pass && subagentLive.semantic_tuple_pass;
|
|
66
|
+
const tokenSaved = sameScriptFull.tokens_to_completion - sameScriptSlim.tokens_to_completion;
|
|
67
|
+
tokenSummary[key] = {
|
|
68
|
+
before: sameScriptFull.tokens_to_completion,
|
|
69
|
+
after: sameScriptSlim.tokens_to_completion,
|
|
70
|
+
saved: tokenSaved,
|
|
71
|
+
reduction: sameScriptFull.tokens_to_completion > 0 ? Number((tokenSaved / sameScriptFull.tokens_to_completion).toFixed(3)) : 0,
|
|
72
|
+
};
|
|
73
|
+
callSummary[key] = {
|
|
74
|
+
before: sameScriptFull.tool_calls_to_completion,
|
|
75
|
+
after: sameScriptSlim.tool_calls_to_completion,
|
|
76
|
+
saved: sameScriptFull.tool_calls_to_completion - sameScriptSlim.tool_calls_to_completion,
|
|
77
|
+
};
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
finally {
|
|
81
|
+
if (ownsRunner) {
|
|
82
|
+
await runner.close();
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
const pass = Object.values(acceptanceCases).every(Boolean);
|
|
86
|
+
return {
|
|
87
|
+
generatedAt: new Date().toISOString(),
|
|
88
|
+
workflow_replay_full: workflowReplayFullCases,
|
|
89
|
+
workflow_replay_slim: workflowReplaySlimCases,
|
|
90
|
+
same_script_full: sameScriptFullCases,
|
|
91
|
+
same_script_slim: sameScriptSlimCases,
|
|
92
|
+
subagent_live: subagentLiveCases,
|
|
93
|
+
acceptance: {
|
|
94
|
+
pass,
|
|
95
|
+
cases: acceptanceCases,
|
|
96
|
+
},
|
|
97
|
+
pass,
|
|
98
|
+
cases: subagentLiveCases,
|
|
99
|
+
same_script: {
|
|
100
|
+
tool_plan: {
|
|
101
|
+
weapon_powerup: suite.cases.weapon_powerup.tool_plan,
|
|
102
|
+
reload: suite.cases.reload.tool_plan,
|
|
103
|
+
},
|
|
104
|
+
cases: sameScriptSlimCases,
|
|
105
|
+
},
|
|
106
|
+
semantic_equivalence: {
|
|
107
|
+
pass: Object.values(semanticEquivalenceCases).every(Boolean),
|
|
108
|
+
cases: semanticEquivalenceCases,
|
|
109
|
+
},
|
|
110
|
+
token_summary: tokenSummary,
|
|
111
|
+
call_summary: callSummary,
|
|
112
|
+
};
|
|
113
|
+
}
|
|
114
|
+
/**
 * Renders a markdown summary of `report` and hands both the report and the
 * markdown to `writeReports`.
 *
 * The summary has a pass line taken from `report.acceptance.pass`, followed by
 * one bullet of `name=value` pairs for each of `weapon_powerup` and `reload`.
 *
 * @param {string} reportDir - Destination directory forwarded to `writeReports`.
 * @param {object} report - Report returned by runAgentSafeQueryContextBenchmark.
 * @returns {Promise<void>}
 */
export async function writeAgentSafeQueryContextReports(reportDir, report) {
    // Builds the single bullet line for one case.
    const describeCase = (key) => {
        const livePass = report.subagent_live[key].semantic_tuple_pass;
        const tokenSaved = report.token_summary[key].saved;
        const callSaved = report.call_summary[key].saved;
        const replay = report.workflow_replay_slim[key];
        const fields = [
            ['live_pass', livePass],
            ['token_saved', tokenSaved],
            ['call_saved', callSaved],
            ['anchor_top1_pass', replay.anchor_top1_pass],
            ['recommended_follow_up_hit', replay.recommended_follow_up_hit],
            ['post_narrowing_anchor_pass', replay.post_narrowing_anchor_pass],
            ['post_narrowing_follow_up_hit', replay.post_narrowing_follow_up_hit],
            ['guid_invariance_pass', replay.guid_invariance_pass],
            ['live_tool_evidence_pass', replay.live_tool_evidence_pass],
            ['freeze_ready', replay.freeze_ready],
            ['confirmed_chain_steps', replay.confirmed_chain.steps.length],
            ['tier_facts', replay.tier_envelope.facts_present],
            ['tier_closure', replay.tier_envelope.closure_present],
            ['tier_clues', replay.tier_envelope.clues_present],
            ['tier_semantic_order', replay.tier_envelope.semantic_order_pass],
            ['tier_summary_source', replay.tier_envelope.summary_source],
            ['ambiguity_detour_count', replay.ambiguity_detour_count],
            ['placeholder_leak_detected', replay.placeholder_leak_detected],
            ['heuristic_top_summary_detected', replay.heuristic_top_summary_detected],
        ];
        const rendered = fields.map(([label, value]) => `${label}=${value}`).join(', ');
        return `- ${key}: ${rendered}`;
    };
    const lines = [
        '# Agent-Safe Query/Context Benchmark Summary',
        '',
        `- Pass: ${report.acceptance.pass ? 'YES' : 'NO'}`,
        '',
        '## Cases',
    ];
    for (const key of ['weapon_powerup', 'reload']) {
        lines.push(describeCase(key));
    }
    const markdown = lines.join('\n');
    await writeReports(reportDir, report, markdown);
}
|
|
125
|
+
/**
 * Executes a case's scripted tool plan under one response profile and
 * summarizes the run.
 *
 * @param {object} benchmarkCase - Case providing `tool_plan` and the expected `semantic_tuple`.
 * @param {object} runner - Tool runner handed to the executor.
 * @param {Function} executeToolPlanImpl - Executor called as `(toolPlan, runner, repo)`.
 * @param {object} options - `repo` and `responseProfile`.
 * @returns {Promise<object>} The profiled tool plan, per-step telemetry, the derived
 *   semantic tuple with its pass flag, the step count and the summed token estimate.
 */
async function runSameScriptCase(benchmarkCase, runner, executeToolPlanImpl, options) {
    const profiledPlan = applyResponseProfileToToolPlan(benchmarkCase.tool_plan, options.responseProfile);
    const executed = await executeToolPlanImpl(profiledPlan, runner, options.repo);
    // Duration and timestamp are fixed constants here (0 and the Unix epoch);
    // only the token estimate is computed from the step itself.
    const toTelemetryStep = (step) => {
        const inputTokens = estimateTokens(JSON.stringify(step.input));
        const outputTokens = estimateTokens(JSON.stringify(step.output));
        return {
            tool: step.tool,
            input: step.input,
            output: step.output,
            durationMs: 0,
            totalTokensEst: inputTokens + outputTokens,
            timestamp: new Date(0).toISOString(),
        };
    };
    const steps = executed.map((step) => toTelemetryStep(step));
    const stepOutputs = steps.map((step) => step.output);
    const derivedTuple = deriveSemanticTuple(benchmarkCase.semantic_tuple, stepOutputs);
    const tuplePass = semanticTuplePass(derivedTuple, benchmarkCase.semantic_tuple);
    const tokenTotal = steps.reduce((sum, step) => sum + step.totalTokensEst, 0);
    return {
        tool_plan: profiledPlan,
        steps,
        semantic_tuple: derivedTuple,
        semantic_tuple_pass: tuplePass,
        tool_calls_to_completion: steps.length,
        tokens_to_completion: tokenTotal,
    };
}
|
|
146
|
+
/**
 * Returns a copy of `toolPlan` in which every `query` and `context` step has
 * `input.response_profile` set to `responseProfile`.
 *
 * Steps for other tools are returned as the same object; rewritten steps are
 * new objects, so the incoming plan is not mutated.
 *
 * @param {Array<object>} toolPlan - Steps of the form `{ tool, input }`.
 * @param {string} responseProfile - Profile to force onto query/context steps.
 * @returns {Array<object>} The adjusted plan.
 */
function applyResponseProfileToToolPlan(toolPlan, responseProfile) {
    const profiledTools = ['query', 'context'];
    return toolPlan.map((step) => (profiledTools.includes(step.tool)
        ? { ...step, input: { ...step.input, response_profile: responseProfile } }
        : step));
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|