@veewo/gitnexus 1.5.0 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137):
  1. package/dist/benchmark/agent-context/runner.js +3 -0
  2. package/dist/benchmark/agent-context/runner.test.js +22 -0
  3. package/dist/benchmark/agent-context/tool-runner.d.ts +7 -6
  4. package/dist/benchmark/agent-safe-query-context/io.d.ts +2 -0
  5. package/dist/benchmark/agent-safe-query-context/io.js +86 -0
  6. package/dist/benchmark/agent-safe-query-context/io.test.d.ts +1 -0
  7. package/dist/benchmark/agent-safe-query-context/io.test.js +13 -0
  8. package/dist/benchmark/agent-safe-query-context/report.d.ts +57 -0
  9. package/dist/benchmark/agent-safe-query-context/report.js +159 -0
  10. package/dist/benchmark/agent-safe-query-context/report.test.d.ts +1 -0
  11. package/dist/benchmark/agent-safe-query-context/report.test.js +362 -0
  12. package/dist/benchmark/agent-safe-query-context/runner.d.ts +44 -0
  13. package/dist/benchmark/agent-safe-query-context/runner.js +406 -0
  14. package/dist/benchmark/agent-safe-query-context/runner.test.d.ts +1 -0
  15. package/dist/benchmark/agent-safe-query-context/runner.test.js +290 -0
  16. package/dist/benchmark/agent-safe-query-context/semantic-tuple.d.ts +20 -0
  17. package/dist/benchmark/agent-safe-query-context/semantic-tuple.js +225 -0
  18. package/dist/benchmark/agent-safe-query-context/semantic-tuple.test.d.ts +1 -0
  19. package/dist/benchmark/agent-safe-query-context/semantic-tuple.test.js +122 -0
  20. package/dist/benchmark/agent-safe-query-context/subagent-live.d.ts +47 -0
  21. package/dist/benchmark/agent-safe-query-context/subagent-live.js +128 -0
  22. package/dist/benchmark/agent-safe-query-context/subagent-live.test.d.ts +1 -0
  23. package/dist/benchmark/agent-safe-query-context/subagent-live.test.js +155 -0
  24. package/dist/benchmark/agent-safe-query-context/telemetry-tool.d.ts +9 -0
  25. package/dist/benchmark/agent-safe-query-context/telemetry-tool.js +77 -0
  26. package/dist/benchmark/agent-safe-query-context/types.d.ts +61 -0
  27. package/dist/benchmark/agent-safe-query-context/types.js +8 -0
  28. package/dist/benchmark/runtime-poc/provenance-artifact.d.ts +47 -0
  29. package/dist/benchmark/runtime-poc/provenance-artifact.js +89 -0
  30. package/dist/benchmark/runtime-poc/runner.d.ts +31 -0
  31. package/dist/benchmark/runtime-poc/runner.js +163 -0
  32. package/dist/benchmark/u2-e2e/hydration-policy-repeatability-runner.d.ts +8 -0
  33. package/dist/benchmark/u2-e2e/hydration-policy-repeatability-runner.js +21 -0
  34. package/dist/benchmark/u2-e2e/phase2-runtime-claim-acceptance-runner.d.ts +0 -1
  35. package/dist/benchmark/u2-e2e/phase2-runtime-claim-acceptance-runner.js +53 -51
  36. package/dist/benchmark/u2-e2e/phase2-runtime-claim-acceptance-runner.test.js +0 -1
  37. package/dist/benchmark/u2-e2e/phase5-rule-lab-acceptance-runner.d.ts +1 -1
  38. package/dist/benchmark/u2-e2e/phase5-rule-lab-acceptance-runner.js +82 -18
  39. package/dist/benchmark/u2-e2e/phase5-rule-lab-acceptance-runner.test.js +1 -2
  40. package/dist/benchmark/u2-e2e/retrieval-runner.js +15 -7
  41. package/dist/benchmark/u2-e2e/retrieval-runner.test.js +46 -0
  42. package/dist/cli/ai-context.js +2 -12
  43. package/dist/cli/ai-context.test.js +8 -0
  44. package/dist/cli/analyze-runtime-summary.js +1 -0
  45. package/dist/cli/analyze-runtime-summary.test.js +2 -0
  46. package/dist/cli/analyze-summary.d.ts +2 -0
  47. package/dist/cli/analyze-summary.js +24 -0
  48. package/dist/cli/analyze-summary.test.js +65 -1
  49. package/dist/cli/analyze.js +5 -1
  50. package/dist/cli/benchmark-agent-safe-query-context.d.ts +20 -0
  51. package/dist/cli/benchmark-agent-safe-query-context.js +39 -0
  52. package/dist/cli/benchmark-agent-safe-query-context.test.d.ts +1 -0
  53. package/dist/cli/benchmark-agent-safe-query-context.test.js +271 -0
  54. package/dist/cli/benchmark.d.ts +29 -0
  55. package/dist/cli/benchmark.js +55 -0
  56. package/dist/cli/index.js +23 -0
  57. package/dist/cli/rule-lab.d.ts +3 -7
  58. package/dist/cli/rule-lab.js +13 -22
  59. package/dist/cli/rule-lab.test.js +23 -3
  60. package/dist/cli/tool.d.ts +2 -0
  61. package/dist/cli/tool.js +2 -0
  62. package/dist/core/config/unity-config.d.ts +0 -1
  63. package/dist/core/config/unity-config.js +0 -1
  64. package/dist/core/ingestion/pipeline.js +35 -6
  65. package/dist/core/ingestion/unity-lifecycle-synthetic-calls.test.js +18 -20
  66. package/dist/core/ingestion/unity-parity-seed.d.ts +2 -1
  67. package/dist/core/ingestion/unity-parity-seed.js +8 -0
  68. package/dist/core/ingestion/unity-resource-processor.d.ts +11 -0
  69. package/dist/core/ingestion/unity-resource-processor.js +102 -0
  70. package/dist/core/ingestion/unity-resource-processor.test.js +449 -0
  71. package/dist/core/ingestion/unity-runtime-binding-rules.d.ts +15 -0
  72. package/dist/core/ingestion/unity-runtime-binding-rules.js +178 -30
  73. package/dist/core/lbug/csv-generator.test.js +2 -2
  74. package/dist/core/unity/doc-contract.test.d.ts +1 -0
  75. package/dist/core/unity/doc-contract.test.js +30 -0
  76. package/dist/core/unity/prefab-source-scan.d.ts +25 -0
  77. package/dist/core/unity/prefab-source-scan.js +152 -0
  78. package/dist/core/unity/prefab-source-scan.test.d.ts +1 -0
  79. package/dist/core/unity/prefab-source-scan.test.js +70 -0
  80. package/dist/core/unity/scan-context.d.ts +12 -0
  81. package/dist/core/unity/scan-context.js +50 -2
  82. package/dist/core/unity/scan-context.test.js +74 -0
  83. package/dist/mcp/local/agent-safe-response.d.ts +10 -0
  84. package/dist/mcp/local/agent-safe-response.js +639 -0
  85. package/dist/mcp/local/derived-process-reader.js +1 -1
  86. package/dist/mcp/local/local-backend.d.ts +18 -1
  87. package/dist/mcp/local/local-backend.js +319 -125
  88. package/dist/mcp/local/process-confidence.d.ts +1 -2
  89. package/dist/mcp/local/process-confidence.js +0 -3
  90. package/dist/mcp/local/process-confidence.test.js +4 -2
  91. package/dist/mcp/local/process-evidence.d.ts +1 -8
  92. package/dist/mcp/local/process-evidence.js +1 -23
  93. package/dist/mcp/local/process-evidence.test.js +2 -16
  94. package/dist/mcp/local/process-ref.d.ts +1 -1
  95. package/dist/mcp/local/runtime-chain-closure-evaluator.d.ts +33 -0
  96. package/dist/mcp/local/runtime-chain-closure-evaluator.js +273 -0
  97. package/dist/mcp/local/runtime-chain-graph-candidates.d.ts +23 -0
  98. package/dist/mcp/local/runtime-chain-graph-candidates.js +131 -0
  99. package/dist/mcp/local/runtime-chain-verify.d.ts +1 -1
  100. package/dist/mcp/local/runtime-chain-verify.js +149 -138
  101. package/dist/mcp/local/runtime-chain-verify.test.js +126 -68
  102. package/dist/mcp/local/runtime-claim-rule-registry.d.ts +4 -0
  103. package/dist/mcp/local/runtime-claim-rule-registry.js +4 -0
  104. package/dist/mcp/local/runtime-claim-rule-registry.test.js +37 -4
  105. package/dist/mcp/local/runtime-claim.d.ts +11 -0
  106. package/dist/mcp/local/runtime-claim.js +28 -0
  107. package/dist/mcp/local/unity-evidence-view.d.ts +1 -1
  108. package/dist/mcp/local/unity-evidence-view.js +1 -1
  109. package/dist/mcp/local/unity-evidence-view.test.js +22 -0
  110. package/dist/mcp/tools.js +51 -21
  111. package/dist/rule-lab/analyze.d.ts +2 -1
  112. package/dist/rule-lab/analyze.js +94 -59
  113. package/dist/rule-lab/analyze.test.js +238 -20
  114. package/dist/rule-lab/curate.d.ts +2 -1
  115. package/dist/rule-lab/curate.js +24 -3
  116. package/dist/rule-lab/curate.test.js +65 -0
  117. package/dist/rule-lab/curation-input-builder.d.ts +45 -0
  118. package/dist/rule-lab/curation-input-builder.js +133 -0
  119. package/dist/rule-lab/promote.js +80 -7
  120. package/dist/rule-lab/promote.test.js +150 -0
  121. package/dist/rule-lab/review-pack.d.ts +3 -0
  122. package/dist/rule-lab/review-pack.js +41 -1
  123. package/dist/rule-lab/review-pack.test.js +67 -0
  124. package/dist/rule-lab/types.d.ts +29 -0
  125. package/dist/types/pipeline.d.ts +3 -0
  126. package/package.json +4 -3
  127. package/scripts/run-node-tests.mjs +61 -0
  128. package/skills/_shared/unity-rule-authoring-contract.md +64 -0
  129. package/skills/_shared/unity-runtime-process-contract.md +16 -0
  130. package/skills/gitnexus-cli.md +8 -0
  131. package/skills/gitnexus-debugging.md +9 -0
  132. package/skills/gitnexus-exploring.md +66 -18
  133. package/skills/gitnexus-guide.md +42 -3
  134. package/skills/gitnexus-impact-analysis.md +8 -0
  135. package/skills/gitnexus-pr-review.md +8 -0
  136. package/skills/gitnexus-refactoring.md +8 -0
  137. package/skills/gitnexus-unity-rule-gen.md +66 -312
@@ -6,6 +6,9 @@ function buildToolInput(step, repo) {
6
6
  if (repo) {
7
7
  input.repo = repo;
8
8
  }
9
+ if ((step.tool === 'query' || step.tool === 'context') && !('response_profile' in input)) {
10
+ input.response_profile = 'full';
11
+ }
9
12
  // LocalBackend impact contract uses `target_uid`, while dataset rows may carry `uid`.
10
13
  if (step.tool === 'impact') {
11
14
  const uid = input.uid;
@@ -77,3 +77,25 @@ test('executeToolPlan maps impact uid to target_uid for backend impact contract'
77
77
  assert.equal(calls.length, 1);
78
78
  assert.equal(calls[0].target_uid, 'Class:Assets/NEON/Code/NetworkCode/NeonMgr/MirrorNetMgr.cs:MirrorNetMgr');
79
79
  });
80
// Plan steps whose input carries no response_profile must reach the runner with
// response_profile='full' for both the query and the context tool.
test('executeToolPlan injects response_profile=full for legacy query/context payloads', async () => {
    const recorded = [];
    // Builds a fake tool that records the params it was called with.
    const recordAs = (tool) => async (params) => {
        recorded.push({ tool, params });
        return {};
    };
    const fakeRunner = {
        query: recordAs('query'),
        context: recordAs('context'),
        impact: async () => ({}),
        cypher: async () => ({}),
        close: async () => { },
    };
    const legacyPlan = [
        { tool: 'query', input: { query: 'Target' } },
        { tool: 'context', input: { name: 'Target' } },
    ];
    await executeToolPlan(legacyPlan, fakeRunner, 'sample-repo');
    const queryCall = recorded[0];
    const contextCall = recorded[1];
    assert.equal(queryCall.params.response_profile, 'full');
    assert.equal(contextCall.params.response_profile, 'full');
});
@@ -1,7 +1,8 @@
1
- export declare function createAgentContextToolRunner(): Promise<{
2
- query: (params: any) => Promise<any>;
3
- context: (params: any) => Promise<any>;
4
- impact: (params: any) => Promise<any>;
5
- cypher: (params: any) => Promise<any>;
1
+ export interface AgentContextToolRunner {
2
+ query: (params: Record<string, unknown>) => Promise<any>;
3
+ context: (params: Record<string, unknown>) => Promise<any>;
4
+ impact: (params: Record<string, unknown>) => Promise<any>;
5
+ cypher: (params: Record<string, unknown>) => Promise<any>;
6
6
  close: () => Promise<void>;
7
- }>;
7
+ }
8
+ export declare function createAgentContextToolRunner(): Promise<AgentContextToolRunner>;
@@ -0,0 +1,2 @@
1
+ import type { AgentSafeBenchmarkSuite } from './types.js';
2
/**
 * Loads the agent-safe query/context benchmark suite stored under `root`.
 *
 * @param root Directory that contains the suite's `thresholds.json` and `cases.json`.
 * @returns The parsed thresholds and cases.
 */
export declare function loadAgentSafeQueryContextSuite(root: string): Promise<AgentSafeBenchmarkSuite>;
@@ -0,0 +1,86 @@
1
+ import fs from 'node:fs/promises';
2
+ import path from 'node:path';
3
+ import { AGENT_SAFE_CASE_KEYS } from './types.js';
4
+ const PLACEHOLDER_RE = /TODO|TBD|placeholder|<resource>|<symbol>/i;
5
/**
 * Reads `thresholds.json` and `cases.json` from `root`, validates every case named in
 * AGENT_SAFE_CASE_KEYS, and returns both parsed documents.
 *
 * @param {string} root - Directory containing the two suite JSON files.
 * @returns {Promise<{thresholds: any, cases: any}>} The parsed thresholds and cases.
 * @throws {Error} When a file cannot be read or parsed, or when a case fails validation.
 */
export async function loadAgentSafeQueryContextSuite(root) {
    // Files are read one after the other, thresholds first.
    const readJson = async (fileName) => {
        const raw = await fs.readFile(path.join(root, fileName), 'utf-8');
        return JSON.parse(raw);
    };
    const thresholds = await readJson('thresholds.json');
    const cases = await readJson('cases.json');
    for (const caseKey of AGENT_SAFE_CASE_KEYS) {
        assertCase(caseKey, cases[caseKey]);
    }
    return { thresholds, cases };
}
13
/**
 * Validates a single benchmark case loaded from `cases.json`.
 *
 * Checks, in order: the required string fields, `proof_contexts`, `tool_plan`, the semantic
 * tuple, the live task, and finally any string values inside the optional query-input objects.
 *
 * @param {string} name - Case key, used as the prefix of every error message.
 * @param {object} value - The case object to validate.
 * @throws {Error} When the case is missing, a required field is absent or of the wrong type,
 *   or any checked string contains placeholder text.
 */
function assertCase(name, value) {
    if (!value) {
        throw new Error(`missing required case: ${name}`);
    }
    for (const field of ['label', 'start_query', 'retry_query', 'proof_cypher']) {
        const candidate = value[field];
        if (!candidate || typeof candidate !== 'string') {
            throw new Error(`missing required field: ${name}.${field}`);
        }
        assertNoPlaceholder(`${name}.${field}`, candidate);
    }
    if (!Array.isArray(value.proof_contexts) || value.proof_contexts.length === 0) {
        throw new Error(`missing required field: ${name}.proof_contexts`);
    }
    value.proof_contexts.forEach((entry, index) => assertNoPlaceholder(`${name}.proof_contexts[${index}]`, entry));
    if (!Array.isArray(value.tool_plan) || value.tool_plan.length === 0) {
        throw new Error(`missing required field: ${name}.tool_plan`);
    }
    // The tuple is validated before the live task because the live-task check reads the tuple;
    // a missing tuple must surface as "missing required field", not as a TypeError.
    assertSemanticTuple(name, value.semantic_tuple);
    assertLiveTask(name, value.live_task, value.semantic_tuple);
    // Both optional query-input objects get the same placeholder scan; non-string values are ignored.
    for (const inputField of ['start_query_input', 'retry_query_input']) {
        const queryInput = value[inputField];
        if (!queryInput || typeof queryInput !== 'object') {
            continue;
        }
        for (const entry of Object.values(queryInput)) {
            if (typeof entry === 'string') {
                assertNoPlaceholder(`${name}.${inputField}`, entry);
            }
        }
    }
}
48
/**
 * Validates the `live_task` of a benchmark case and checks that its objective text does not
 * contain the canonical proof edge(s) from the case's semantic tuple.
 *
 * @param {string} name - Case key, used as the prefix of every error message.
 * @param {object} liveTask - The `live_task` object; `objective`, `symbol_seed` and
 *   `resource_seed` must be non-empty strings.
 * @param {object} [tuple] - The case's semantic tuple. When absent, the leak checks are skipped
 *   and reporting the missing tuple is left to the tuple validation.
 * @throws {Error} When the live task or one of its fields is missing, contains placeholder
 *   text, or when the objective includes `proof_edge` or every entry of `proof_edges`.
 */
function assertLiveTask(name, liveTask, tuple) {
    if (!liveTask) {
        throw new Error(`missing required field: ${name}.live_task`);
    }
    for (const field of ['objective', 'symbol_seed', 'resource_seed']) {
        const candidate = liveTask[field];
        if (!candidate || typeof candidate !== 'string') {
            throw new Error(`missing required field: ${name}.live_task.${field}`);
        }
        assertNoPlaceholder(`${name}.live_task.${field}`, candidate);
    }
    const proofEdge = tuple?.proof_edge;
    if (proofEdge && liveTask.objective.includes(proofEdge)) {
        throw new Error(`${name}.live_task.objective leaks canonical proof_edge`);
    }
    const proofEdges = tuple?.proof_edges;
    // `every` is vacuously true on an empty array, so an empty list must not count as a leak.
    const leaksAllEdges = Array.isArray(proofEdges)
        && proofEdges.length > 0
        && proofEdges.every((edge) => liveTask.objective.includes(edge));
    if (leaksAllEdges) {
        throw new Error(`${name}.live_task.objective leaks canonical proof_edges`);
    }
}
66
/**
 * Validates the `semantic_tuple` of a benchmark case.
 *
 * The anchors and any proof edge(s) present are scanned for placeholder text, and at least one
 * proof edge must be supplied, either as `proof_edge` or as a non-empty `proof_edges` list.
 *
 * @param {string} name - Case key, used as the prefix of every error message.
 * @param {object} tuple - The semantic tuple to validate.
 * @throws {Error} When the tuple is missing, contains placeholder text, or has no proof edge.
 */
function assertSemanticTuple(name, tuple) {
    if (!tuple) {
        throw new Error(`missing required field: ${name}.semantic_tuple`);
    }
    const prefix = `${name}.semantic_tuple`;
    assertNoPlaceholder(`${prefix}.resource_anchor`, tuple.resource_anchor);
    assertNoPlaceholder(`${prefix}.symbol_anchor`, tuple.symbol_anchor);
    const singleEdge = tuple.proof_edge;
    const edgeList = tuple.proof_edges;
    if (singleEdge) {
        assertNoPlaceholder(`${prefix}.proof_edge`, singleEdge);
    }
    if (edgeList) {
        edgeList.forEach((entry, index) => assertNoPlaceholder(`${prefix}.proof_edges[${index}]`, entry));
    }
    const hasEdgeList = Boolean(edgeList) && edgeList.length !== 0;
    if (!singleEdge && !hasEdgeList) {
        throw new Error(`missing proof edge(s): ${name}.semantic_tuple`);
    }
}
82
/**
 * Rejects values that match PLACEHOLDER_RE.
 *
 * @param {string} field - Dotted field path used in the error message.
 * @param {*} value - Value to test; `RegExp.prototype.test` coerces it to a string.
 * @throws {Error} When the value matches the placeholder pattern.
 */
function assertNoPlaceholder(field, value) {
    const hasPlaceholder = PLACEHOLDER_RE.test(value);
    if (!hasPlaceholder) {
        return;
    }
    throw new Error(`${field} contains placeholder text`);
}
@@ -0,0 +1,13 @@
1
+ import test from 'node:test';
2
+ import assert from 'node:assert/strict';
3
+ import path from 'node:path';
4
+ import { loadAgentSafeQueryContextSuite } from './io.js';
5
// Loads the neonspark-v1 suite and pins its case keys, anchors, seeds, and the fact that the
// weapon_powerup objective does not contain the 'HoldPickup -> WeaponPowerUp.PickItUp' edge text.
test('loads canonical benchmark cases without placeholders', async () => {
    // The relative path is resolved against the current working directory.
    const suiteRoot = path.resolve('../benchmarks/agent-safe-query-context/neonspark-v1');
    const suite = await loadAgentSafeQueryContextSuite(suiteRoot);
    const caseKeys = Object.keys(suite.cases).sort();
    assert.deepEqual(caseKeys, ['reload', 'weapon_powerup']);
    const weaponPowerup = suite.cases.weapon_powerup;
    const reload = suite.cases.reload;
    assert.equal(weaponPowerup.semantic_tuple.resource_anchor, 'Assets/NEON/DataAssets/Powerups/1_newWeapon/0_pick/法器_Orb/1_weapon_orb_key.asset');
    assert.equal(reload.semantic_tuple.proof_edge, 'ReloadBase.GetValue -> ReloadBase.CheckReload');
    assert.equal(weaponPowerup.live_task.symbol_seed, 'WeaponPowerUp');
    assert.equal(reload.live_task.resource_seed, 'Assets/NEON/Graphs/PlayerGun/Gungraph_use/1_weapon_orb_key.asset');
    const leaksProofEdge = weaponPowerup.live_task.objective.includes('HoldPickup -> WeaponPowerUp.PickItUp');
    assert.equal(leaksProofEdge, false);
});
@@ -0,0 +1,57 @@
1
+ import { executeToolPlan } from '../agent-context/runner.js';
2
+ import { type AgentContextToolRunner } from '../agent-context/tool-runner.js';
3
+ import { type WorkflowReplayResult } from './runner.js';
4
+ import { loadSubagentLiveCaseResult, type SubagentLiveResult, type TelemetryStep } from './subagent-live.js';
5
+ import type { AgentSafeBenchmarkCase, AgentSafeBenchmarkSuite, AgentSafeCaseKey, SemanticTuple } from './types.js';
6
+ type CaseKey = AgentSafeCaseKey;
7
/** Result of running one case's fixed tool plan under a single response profile. */
export interface SameScriptCaseResult {
    /** The tool plan that was executed, with the response profile applied to query/context steps. */
    tool_plan: AgentSafeBenchmarkCase['tool_plan'];
    /** One telemetry step per executed tool call. */
    steps: TelemetryStep[];
    /** Semantic tuple derived from the step outputs. */
    semantic_tuple: SemanticTuple;
    /** Whether the derived tuple passed the comparison against the case's expected tuple. */
    semantic_tuple_pass: boolean;
    /** Number of executed steps. */
    tool_calls_to_completion: number;
    /** Sum of the estimated tokens across all steps. */
    tokens_to_completion: number;
}
15
/** Full report produced by `runAgentSafeQueryContextBenchmark`, keyed per benchmark case. */
export interface AgentSafeQueryContextBenchmarkReport {
    /** ISO-8601 timestamp taken when the report object was built. */
    generatedAt: string;
    /** Workflow replay results with the `full` response profile. */
    workflow_replay_full: Record<CaseKey, WorkflowReplayResult>;
    /** Workflow replay results with the `slim` response profile. */
    workflow_replay_slim: Record<CaseKey, WorkflowReplayResult>;
    /** Fixed tool-plan results with the `full` response profile. */
    same_script_full: Record<CaseKey, SameScriptCaseResult>;
    /** Fixed tool-plan results with the `slim` response profile. */
    same_script_slim: Record<CaseKey, SameScriptCaseResult>;
    /** Results loaded from the recorded subagent runs. */
    subagent_live: Record<CaseKey, SubagentLiveResult>;
    /** Acceptance verdicts, computed from the slim workflow replay of each case. */
    acceptance: {
        /** True when every case was accepted. */
        pass: boolean;
        cases: Record<CaseKey, boolean>;
    };
    /** Same value as `acceptance.pass`. */
    pass: boolean;
    /** Same content as `subagent_live`. */
    cases: Record<CaseKey, SubagentLiveResult>;
    same_script: {
        /** The unmodified tool plan of each case. */
        tool_plan: Record<CaseKey, AgentSafeBenchmarkCase['tool_plan']>;
        /** Same content as `same_script_slim`. */
        cases: Record<CaseKey, SameScriptCaseResult>;
    };
    /** Per case: slim same-script tuple pass AND subagent-live tuple pass. */
    semantic_equivalence: {
        /** True when every case is semantically equivalent. */
        pass: boolean;
        cases: Record<CaseKey, boolean>;
    };
    /** Token usage of the same-script runs: `before` is full, `after` is slim. */
    token_summary: Record<CaseKey, {
        before: number;
        after: number;
        /** `before - after`. */
        saved: number;
        /** `saved / before` rounded to three decimals, or 0 when `before` is not positive. */
        reduction: number;
    }>;
    /** Tool-call counts of the same-script runs: `before` is full, `after` is slim. */
    call_summary: Record<CaseKey, {
        before: number;
        after: number;
        /** `before - after`. */
        saved: number;
    }>;
}
48
/**
 * Runs the agent-safe query/context benchmark for every case of `suite`.
 *
 * @param suite Loaded benchmark suite (thresholds and cases).
 * @param options `repo` is forwarded to the replay and tool-plan runs. `subagentRunsDir` is
 *   typed as optional, but the implementation throws when it is not provided.
 * @param deps Optional overrides for the tool runner and the two helper functions. A runner
 *   supplied here is not closed by the benchmark.
 * @returns The assembled benchmark report.
 */
export declare function runAgentSafeQueryContextBenchmark(
    suite: AgentSafeBenchmarkSuite,
    options: {
        repo?: string;
        subagentRunsDir?: string;
    },
    deps?: {
        runner?: AgentContextToolRunner;
        executeToolPlan?: typeof executeToolPlan;
        loadSubagentLiveCaseResult?: typeof loadSubagentLiveCaseResult;
    },
): Promise<AgentSafeQueryContextBenchmarkReport>;
56
/**
 * Renders a markdown summary of `report` and hands both the report and the markdown to the
 * shared report writer.
 *
 * @param reportDir Directory passed to the report writer.
 * @param report Benchmark report to persist.
 */
export declare function writeAgentSafeQueryContextReports(reportDir: string, report: AgentSafeQueryContextBenchmarkReport): Promise<void>;
57
+ export {};
@@ -0,0 +1,159 @@
1
+ import path from 'node:path';
2
+ import { estimateTokens } from '../u2-e2e/metrics.js';
3
+ import { writeReports } from '../report.js';
4
+ import { executeToolPlan } from '../agent-context/runner.js';
5
+ import { createAgentContextToolRunner } from '../agent-context/tool-runner.js';
6
+ import { deriveSemanticTuple, semanticTuplePass } from './semantic-tuple.js';
7
+ import { runWorkflowReplay } from './runner.js';
8
+ import { loadSubagentLiveCaseResult } from './subagent-live.js';
9
/**
 * Runs the agent-safe query/context benchmark for every case in `suite.cases`.
 *
 * For each case it performs, sequentially on one shared runner: a workflow replay with the
 * `full` and then the `slim` response profile, the fixed tool plan with `full` and then `slim`,
 * and finally loads the recorded subagent result from `<subagentRunsDir>/<caseKey>`.
 *
 * @param {object} suite - Loaded suite; reads `suite.cases` and
 *   `suite.thresholds.workflowReplay.maxSteps`.
 * @param {{repo?: string, subagentRunsDir?: string}} options - `subagentRunsDir` is required.
 * @param {{runner?: object, executeToolPlan?: Function, loadSubagentLiveCaseResult?: Function}} [deps]
 *   Optional overrides. A runner passed here is owned by the caller and is not closed.
 * @returns {Promise<object>} The benchmark report.
 * @throws {Error} When `options.subagentRunsDir` is missing.
 */
export async function runAgentSafeQueryContextBenchmark(suite, options, deps = {}) {
    // Validate before acquiring a runner: throwing after creation but outside the try/finally
    // would leave a runner we own open.
    if (!options.subagentRunsDir) {
        throw new Error('subagentRunsDir is required for real subagent benchmark runs');
    }
    const executeToolPlanImpl = deps.executeToolPlan || executeToolPlan;
    const loadSubagentLiveCaseResultImpl = deps.loadSubagentLiveCaseResult || loadSubagentLiveCaseResult;
    const ownsRunner = !deps.runner;
    const runner = deps.runner || (await createAgentContextToolRunner());
    const workflowReplayFullCases = {};
    const workflowReplaySlimCases = {};
    const sameScriptFullCases = {};
    const sameScriptSlimCases = {};
    const subagentLiveCases = {};
    const acceptanceCases = {};
    const semanticEquivalenceCases = {};
    const tokenSummary = {};
    const callSummary = {};
    const toolPlans = {};
    try {
        for (const key of Object.keys(suite.cases)) {
            const benchmarkCase = suite.cases[key];
            const workflowReplayFull = await runWorkflowReplay(benchmarkCase, runner, {
                repo: options.repo,
                maxSteps: suite.thresholds.workflowReplay.maxSteps,
                responseProfile: 'full',
            });
            const workflowReplaySlim = await runWorkflowReplay(benchmarkCase, runner, {
                repo: options.repo,
                maxSteps: suite.thresholds.workflowReplay.maxSteps,
                responseProfile: 'slim',
            });
            const sameScriptFull = await runSameScriptCase(benchmarkCase, runner, executeToolPlanImpl, {
                repo: options.repo,
                responseProfile: 'full',
            });
            const sameScriptSlim = await runSameScriptCase(benchmarkCase, runner, executeToolPlanImpl, {
                repo: options.repo,
                responseProfile: 'slim',
            });
            const subagentLive = await loadSubagentLiveCaseResultImpl(path.join(options.subagentRunsDir, key), benchmarkCase);
            workflowReplayFullCases[key] = workflowReplayFull;
            workflowReplaySlimCases[key] = workflowReplaySlim;
            sameScriptFullCases[key] = sameScriptFull;
            sameScriptSlimCases[key] = sameScriptSlim;
            subagentLiveCases[key] = subagentLive;
            // Recorded for every case the loop visits, so the map matches the other per-case maps.
            toolPlans[key] = benchmarkCase.tool_plan;
            acceptanceCases[key] = isWorkflowReplayAccepted(workflowReplaySlim);
            // Coerced so a missing flag is reported as `false` rather than `undefined`.
            semanticEquivalenceCases[key] = Boolean(sameScriptSlim.semantic_tuple_pass && subagentLive.semantic_tuple_pass);
            const tokensBefore = sameScriptFull.tokens_to_completion;
            const tokensAfter = sameScriptSlim.tokens_to_completion;
            const tokenSaved = tokensBefore - tokensAfter;
            tokenSummary[key] = {
                before: tokensBefore,
                after: tokensAfter,
                saved: tokenSaved,
                reduction: tokensBefore > 0 ? Number((tokenSaved / tokensBefore).toFixed(3)) : 0,
            };
            callSummary[key] = {
                before: sameScriptFull.tool_calls_to_completion,
                after: sameScriptSlim.tool_calls_to_completion,
                saved: sameScriptFull.tool_calls_to_completion - sameScriptSlim.tool_calls_to_completion,
            };
        }
    }
    finally {
        if (ownsRunner) {
            await runner.close();
        }
    }
    const pass = Object.values(acceptanceCases).every(Boolean);
    return {
        generatedAt: new Date().toISOString(),
        workflow_replay_full: workflowReplayFullCases,
        workflow_replay_slim: workflowReplaySlimCases,
        same_script_full: sameScriptFullCases,
        same_script_slim: sameScriptSlimCases,
        subagent_live: subagentLiveCases,
        acceptance: {
            pass,
            cases: acceptanceCases,
        },
        pass,
        cases: subagentLiveCases,
        same_script: {
            tool_plan: toolPlans,
            cases: sameScriptSlimCases,
        },
        semantic_equivalence: {
            pass: Object.values(semanticEquivalenceCases).every(Boolean),
            cases: semanticEquivalenceCases,
        },
        token_summary: tokenSummary,
        call_summary: callSummary,
    };
}
/** Returns true when a slim workflow replay satisfies every acceptance condition. */
function isWorkflowReplayAccepted(replay) {
    return Boolean(replay.semantic_tuple_pass
        && replay.post_narrowing_anchor_pass
        && replay.post_narrowing_follow_up_hit
        && replay.guid_invariance_pass
        && replay.live_tool_evidence_pass
        && replay.freeze_ready
        && replay.tier_envelope.facts_present
        && replay.tier_envelope.closure_present
        && replay.tier_envelope.clues_present
        && replay.tier_envelope.semantic_order_pass
        && !replay.placeholder_leak_detected
        && !replay.heuristic_top_summary_detected);
}
114
/**
 * Builds the markdown summary for `report` and passes the report and markdown to `writeReports`.
 *
 * The summary lists one line per case found in `report.subagent_live`: `weapon_powerup` and
 * `reload` first (when present), followed by any other case keys in their existing order.
 *
 * @param {string} reportDir - Directory forwarded to `writeReports`.
 * @param {object} report - Benchmark report produced by the benchmark run.
 * @returns {Promise<void>}
 */
export async function writeAgentSafeQueryContextReports(reportDir, report) {
    const canonicalKeys = ['weapon_powerup', 'reload'];
    const reportedKeys = Object.keys(report.subagent_live);
    // Canonical keys keep their established position; a key absent from the report is skipped
    // instead of failing on an undefined lookup.
    const orderedKeys = [
        ...canonicalKeys.filter((key) => reportedKeys.includes(key)),
        ...reportedKeys.filter((key) => !canonicalKeys.includes(key)),
    ];
    const markdown = [
        '# Agent-Safe Query/Context Benchmark Summary',
        '',
        `- Pass: ${report.acceptance.pass ? 'YES' : 'NO'}`,
        '',
        '## Cases',
        ...orderedKeys.map((key) => formatCaseSummaryLine(key, report)),
    ].join('\n');
    await writeReports(reportDir, report, markdown);
}
/** Formats the single markdown bullet that summarizes one benchmark case. */
function formatCaseSummaryLine(key, report) {
    const replay = report.workflow_replay_slim[key];
    const tier = replay.tier_envelope;
    const fields = [
        ['live_pass', report.subagent_live[key].semantic_tuple_pass],
        ['token_saved', report.token_summary[key].saved],
        ['call_saved', report.call_summary[key].saved],
        ['anchor_top1_pass', replay.anchor_top1_pass],
        ['recommended_follow_up_hit', replay.recommended_follow_up_hit],
        ['post_narrowing_anchor_pass', replay.post_narrowing_anchor_pass],
        ['post_narrowing_follow_up_hit', replay.post_narrowing_follow_up_hit],
        ['guid_invariance_pass', replay.guid_invariance_pass],
        ['live_tool_evidence_pass', replay.live_tool_evidence_pass],
        ['freeze_ready', replay.freeze_ready],
        ['confirmed_chain_steps', replay.confirmed_chain.steps.length],
        ['tier_facts', tier.facts_present],
        ['tier_closure', tier.closure_present],
        ['tier_clues', tier.clues_present],
        ['tier_semantic_order', tier.semantic_order_pass],
        ['tier_summary_source', tier.summary_source],
        ['ambiguity_detour_count', replay.ambiguity_detour_count],
        ['placeholder_leak_detected', replay.placeholder_leak_detected],
        ['heuristic_top_summary_detected', replay.heuristic_top_summary_detected],
    ];
    return `- ${key}: ${fields.map(([label, value]) => `${label}=${value}`).join(', ')}`;
}
125
/**
 * Executes a case's fixed tool plan under one response profile and summarizes the outcome.
 *
 * @param {object} benchmarkCase - Case providing `tool_plan` and the expected `semantic_tuple`.
 * @param {object} runner - Tool runner handed to `executeToolPlanImpl`.
 * @param {Function} executeToolPlanImpl - Executes the plan and resolves to the step outputs.
 * @param {{repo?: string, responseProfile: string}} options - Repo and response profile to use.
 * @returns {Promise<object>} Plan, telemetry steps, derived tuple, pass flag, and call/token totals.
 */
async function runSameScriptCase(benchmarkCase, runner, executeToolPlanImpl, options) {
    const profiledPlan = applyResponseProfileToToolPlan(benchmarkCase.tool_plan, options.responseProfile);
    const executed = await executeToolPlanImpl(profiledPlan, runner, options.repo);
    const steps = [];
    let tokenTotal = 0;
    for (const { tool, input, output } of executed) {
        const inputTokens = estimateTokens(JSON.stringify(input));
        const outputTokens = estimateTokens(JSON.stringify(output));
        const totalTokensEst = inputTokens + outputTokens;
        // Duration and timestamp are fixed constants (0 and the Unix epoch) for these steps.
        steps.push({
            tool,
            input,
            output,
            durationMs: 0,
            totalTokensEst,
            timestamp: new Date(0).toISOString(),
        });
        tokenTotal += totalTokensEst;
    }
    const expectedTuple = benchmarkCase.semantic_tuple;
    const derivedTuple = deriveSemanticTuple(expectedTuple, steps.map((entry) => entry.output));
    const tuplePass = semanticTuplePass(derivedTuple, benchmarkCase.semantic_tuple);
    return {
        tool_plan: profiledPlan,
        steps,
        semantic_tuple: derivedTuple,
        semantic_tuple_pass: tuplePass,
        tool_calls_to_completion: steps.length,
        tokens_to_completion: tokenTotal,
    };
}
146
/**
 * Returns a copy of `toolPlan` in which every `query` and `context` step has
 * `input.response_profile` set to `responseProfile`.
 *
 * Other steps are returned as the same object; the input plan and its steps are not mutated.
 *
 * @param {Array<{tool: string, input?: object}>} toolPlan - Steps to transform.
 * @param {string} responseProfile - Profile value to write into query/context inputs.
 * @returns {Array<object>} The transformed plan.
 */
function applyResponseProfileToToolPlan(toolPlan, responseProfile) {
    const profiledTools = ['query', 'context'];
    return toolPlan.map((step) => (profiledTools.includes(step.tool)
        ? { ...step, input: { ...step.input, response_profile: responseProfile } }
        : step));
}