@veewo/gitnexus 1.5.0-rc.4 → 1.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/benchmark/agent-context/runner.js +3 -0
- package/dist/benchmark/agent-context/runner.test.js +22 -0
- package/dist/benchmark/agent-context/tool-runner.d.ts +7 -6
- package/dist/benchmark/agent-safe-query-context/io.d.ts +2 -0
- package/dist/benchmark/agent-safe-query-context/io.js +86 -0
- package/dist/benchmark/agent-safe-query-context/io.test.d.ts +1 -0
- package/dist/benchmark/agent-safe-query-context/io.test.js +13 -0
- package/dist/benchmark/agent-safe-query-context/report.d.ts +57 -0
- package/dist/benchmark/agent-safe-query-context/report.js +159 -0
- package/dist/benchmark/agent-safe-query-context/report.test.d.ts +1 -0
- package/dist/benchmark/agent-safe-query-context/report.test.js +362 -0
- package/dist/benchmark/agent-safe-query-context/runner.d.ts +44 -0
- package/dist/benchmark/agent-safe-query-context/runner.js +406 -0
- package/dist/benchmark/agent-safe-query-context/runner.test.d.ts +1 -0
- package/dist/benchmark/agent-safe-query-context/runner.test.js +290 -0
- package/dist/benchmark/agent-safe-query-context/semantic-tuple.d.ts +20 -0
- package/dist/benchmark/agent-safe-query-context/semantic-tuple.js +225 -0
- package/dist/benchmark/agent-safe-query-context/semantic-tuple.test.d.ts +1 -0
- package/dist/benchmark/agent-safe-query-context/semantic-tuple.test.js +122 -0
- package/dist/benchmark/agent-safe-query-context/subagent-live.d.ts +47 -0
- package/dist/benchmark/agent-safe-query-context/subagent-live.js +128 -0
- package/dist/benchmark/agent-safe-query-context/subagent-live.test.d.ts +1 -0
- package/dist/benchmark/agent-safe-query-context/subagent-live.test.js +155 -0
- package/dist/benchmark/agent-safe-query-context/telemetry-tool.d.ts +9 -0
- package/dist/benchmark/agent-safe-query-context/telemetry-tool.js +77 -0
- package/dist/benchmark/agent-safe-query-context/types.d.ts +61 -0
- package/dist/benchmark/agent-safe-query-context/types.js +8 -0
- package/dist/benchmark/analyze-runner.d.ts +1 -1
- package/dist/benchmark/analyze-runner.js +4 -3
- package/dist/benchmark/analyze-runner.test.js +7 -0
- package/dist/benchmark/runtime-poc/provenance-artifact.d.ts +47 -0
- package/dist/benchmark/runtime-poc/provenance-artifact.js +89 -0
- package/dist/benchmark/runtime-poc/runner.d.ts +31 -0
- package/dist/benchmark/runtime-poc/runner.js +163 -0
- package/dist/benchmark/u2-e2e/hydration-policy-repeatability-runner.d.ts +8 -0
- package/dist/benchmark/u2-e2e/hydration-policy-repeatability-runner.js +21 -0
- package/dist/benchmark/u2-e2e/phase2-runtime-claim-acceptance-runner.d.ts +0 -1
- package/dist/benchmark/u2-e2e/phase2-runtime-claim-acceptance-runner.js +53 -51
- package/dist/benchmark/u2-e2e/phase2-runtime-claim-acceptance-runner.test.js +0 -1
- package/dist/benchmark/u2-e2e/phase5-rule-lab-acceptance-runner.d.ts +1 -1
- package/dist/benchmark/u2-e2e/phase5-rule-lab-acceptance-runner.js +82 -18
- package/dist/benchmark/u2-e2e/phase5-rule-lab-acceptance-runner.test.js +1 -2
- package/dist/benchmark/u2-e2e/retrieval-runner.js +15 -7
- package/dist/benchmark/u2-e2e/retrieval-runner.test.js +46 -0
- package/dist/cli/ai-context.d.ts +0 -1
- package/dist/cli/ai-context.js +5 -6
- package/dist/cli/ai-context.test.js +8 -0
- package/dist/cli/analyze-options.js +58 -34
- package/dist/cli/analyze-options.test.js +57 -0
- package/dist/cli/analyze-runtime-summary.js +2 -0
- package/dist/cli/analyze-runtime-summary.test.js +12 -0
- package/dist/cli/analyze-summary.d.ts +4 -0
- package/dist/cli/analyze-summary.js +43 -0
- package/dist/cli/analyze-summary.test.js +65 -1
- package/dist/cli/analyze.d.ts +11 -0
- package/dist/cli/analyze.js +34 -5
- package/dist/cli/analyze.test.d.ts +1 -0
- package/dist/cli/analyze.test.js +25 -0
- package/dist/cli/benchmark-agent-context.js +1 -1
- package/dist/cli/benchmark-agent-safe-query-context.d.ts +20 -0
- package/dist/cli/benchmark-agent-safe-query-context.js +39 -0
- package/dist/cli/benchmark-agent-safe-query-context.test.d.ts +1 -0
- package/dist/cli/benchmark-agent-safe-query-context.test.js +271 -0
- package/dist/cli/benchmark-unity.js +1 -1
- package/dist/cli/benchmark-unity.test.js +5 -1
- package/dist/cli/benchmark.d.ts +29 -0
- package/dist/cli/benchmark.js +55 -0
- package/dist/cli/index.js +27 -2
- package/dist/cli/rule-lab.d.ts +3 -7
- package/dist/cli/rule-lab.js +13 -22
- package/dist/cli/rule-lab.test.js +23 -3
- package/dist/cli/scope-manifest-config.d.ts +9 -0
- package/dist/cli/scope-manifest-config.js +37 -0
- package/dist/cli/setup.js +40 -41
- package/dist/cli/setup.test.js +14 -14
- package/dist/cli/sync-manifest.d.ts +27 -0
- package/dist/cli/sync-manifest.js +200 -0
- package/dist/cli/sync-manifest.test.d.ts +1 -0
- package/dist/cli/sync-manifest.test.js +88 -0
- package/dist/cli/tool.d.ts +2 -0
- package/dist/cli/tool.js +2 -0
- package/dist/core/config/unity-config.d.ts +1 -1
- package/dist/core/config/unity-config.js +1 -1
- package/dist/core/ingestion/call-processor.d.ts +2 -1
- package/dist/core/ingestion/call-processor.js +28 -6
- package/dist/core/ingestion/heritage-processor.d.ts +2 -1
- package/dist/core/ingestion/heritage-processor.js +30 -7
- package/dist/core/ingestion/import-processor.d.ts +2 -1
- package/dist/core/ingestion/import-processor.js +28 -6
- package/dist/core/ingestion/parsing-processor.d.ts +5 -3
- package/dist/core/ingestion/parsing-processor.js +46 -13
- package/dist/core/ingestion/pipeline.js +100 -19
- package/dist/core/ingestion/unity-lifecycle-synthetic-calls.test.js +18 -20
- package/dist/core/ingestion/unity-parity-seed.d.ts +2 -1
- package/dist/core/ingestion/unity-parity-seed.js +8 -0
- package/dist/core/ingestion/unity-resource-processor.d.ts +11 -0
- package/dist/core/ingestion/unity-resource-processor.js +102 -0
- package/dist/core/ingestion/unity-resource-processor.test.js +449 -0
- package/dist/core/ingestion/unity-runtime-binding-rules.d.ts +16 -1
- package/dist/core/ingestion/unity-runtime-binding-rules.js +193 -42
- package/dist/core/ingestion/workers/parse-worker.d.ts +2 -0
- package/dist/core/ingestion/workers/parse-worker.js +50 -6
- package/dist/core/lbug/csv-generator.test.js +2 -2
- package/dist/core/tree-sitter/csharp-define-profile.d.ts +6 -0
- package/dist/core/tree-sitter/csharp-define-profile.js +43 -0
- package/dist/core/tree-sitter/csharp-preproc-normalizer.d.ts +14 -0
- package/dist/core/tree-sitter/csharp-preproc-normalizer.js +261 -0
- package/dist/core/tree-sitter/parser-loader.d.ts +10 -0
- package/dist/core/tree-sitter/parser-loader.js +19 -0
- package/dist/core/unity/doc-contract.test.d.ts +1 -0
- package/dist/core/unity/doc-contract.test.js +30 -0
- package/dist/core/unity/prefab-source-scan.d.ts +25 -0
- package/dist/core/unity/prefab-source-scan.js +152 -0
- package/dist/core/unity/prefab-source-scan.test.d.ts +1 -0
- package/dist/core/unity/prefab-source-scan.test.js +70 -0
- package/dist/core/unity/scan-context.d.ts +12 -0
- package/dist/core/unity/scan-context.js +50 -2
- package/dist/core/unity/scan-context.test.js +74 -0
- package/dist/mcp/local/agent-safe-response.d.ts +10 -0
- package/dist/mcp/local/agent-safe-response.js +639 -0
- package/dist/mcp/local/derived-process-reader.js +1 -1
- package/dist/mcp/local/local-backend.d.ts +18 -1
- package/dist/mcp/local/local-backend.js +319 -125
- package/dist/mcp/local/process-confidence.d.ts +1 -2
- package/dist/mcp/local/process-confidence.js +0 -3
- package/dist/mcp/local/process-confidence.test.js +4 -2
- package/dist/mcp/local/process-evidence.d.ts +1 -8
- package/dist/mcp/local/process-evidence.js +1 -23
- package/dist/mcp/local/process-evidence.test.js +2 -16
- package/dist/mcp/local/process-ref.d.ts +1 -1
- package/dist/mcp/local/runtime-chain-closure-evaluator.d.ts +33 -0
- package/dist/mcp/local/runtime-chain-closure-evaluator.js +273 -0
- package/dist/mcp/local/runtime-chain-graph-candidates.d.ts +23 -0
- package/dist/mcp/local/runtime-chain-graph-candidates.js +131 -0
- package/dist/mcp/local/runtime-chain-verify.d.ts +1 -1
- package/dist/mcp/local/runtime-chain-verify.js +149 -138
- package/dist/mcp/local/runtime-chain-verify.test.js +126 -68
- package/dist/mcp/local/runtime-claim-rule-registry.d.ts +4 -0
- package/dist/mcp/local/runtime-claim-rule-registry.js +4 -0
- package/dist/mcp/local/runtime-claim-rule-registry.test.js +37 -4
- package/dist/mcp/local/runtime-claim.d.ts +11 -0
- package/dist/mcp/local/runtime-claim.js +28 -0
- package/dist/mcp/local/unity-evidence-view.d.ts +1 -1
- package/dist/mcp/local/unity-evidence-view.js +1 -1
- package/dist/mcp/local/unity-evidence-view.test.js +22 -0
- package/dist/mcp/tools.js +51 -21
- package/dist/rule-lab/analyze.d.ts +2 -1
- package/dist/rule-lab/analyze.js +94 -59
- package/dist/rule-lab/analyze.test.js +238 -20
- package/dist/rule-lab/curate.d.ts +2 -1
- package/dist/rule-lab/curate.js +24 -3
- package/dist/rule-lab/curate.test.js +65 -0
- package/dist/rule-lab/curation-input-builder.d.ts +45 -0
- package/dist/rule-lab/curation-input-builder.js +133 -0
- package/dist/rule-lab/promote.js +80 -7
- package/dist/rule-lab/promote.test.js +150 -0
- package/dist/rule-lab/review-pack.d.ts +3 -0
- package/dist/rule-lab/review-pack.js +41 -1
- package/dist/rule-lab/review-pack.test.js +67 -0
- package/dist/rule-lab/types.d.ts +29 -0
- package/dist/types/pipeline.d.ts +16 -0
- package/package.json +14 -13
- package/scripts/check-sync-manifest-traceability.mjs +203 -0
- package/scripts/run-node-tests.mjs +61 -0
- package/scripts/tree-sitter-audit-classify.mjs +172 -0
- package/skills/_shared/unity-rule-authoring-contract.md +64 -0
- package/skills/_shared/unity-runtime-process-contract.md +16 -0
- package/skills/gitnexus-cli.md +44 -4
- package/skills/gitnexus-debugging.md +9 -0
- package/skills/gitnexus-exploring.md +66 -18
- package/skills/gitnexus-guide.md +42 -3
- package/skills/gitnexus-impact-analysis.md +8 -0
- package/skills/gitnexus-pr-review.md +8 -0
- package/skills/gitnexus-refactoring.md +8 -0
- package/skills/gitnexus-unity-rule-gen.md +66 -312
|
@@ -0,0 +1,362 @@
|
|
|
1
|
+
import test from 'node:test';
|
|
2
|
+
import assert from 'node:assert/strict';
|
|
3
|
+
import { runAgentSafeQueryContextBenchmark } from './report.js';
|
|
4
|
+
const fakeSuite = {
|
|
5
|
+
thresholds: {
|
|
6
|
+
workflowReplay: { maxSteps: 5 },
|
|
7
|
+
tokenReduction: {
|
|
8
|
+
weapon_powerup: 0.5,
|
|
9
|
+
reload: 0.4,
|
|
10
|
+
},
|
|
11
|
+
},
|
|
12
|
+
cases: {
|
|
13
|
+
weapon_powerup: {
|
|
14
|
+
label: 'weapon_powerup',
|
|
15
|
+
start_query: 'weapon powerup equip chain',
|
|
16
|
+
retry_query: '1_weapon_orb_key.asset WeaponPowerUp HoldPickup EquipWithEvent Equip',
|
|
17
|
+
proof_contexts: ['WeaponPowerUp'],
|
|
18
|
+
proof_cypher: 'MATCH () RETURN 1',
|
|
19
|
+
tool_plan: [{ tool: 'query', input: { query: 'weapon powerup equip chain' } }],
|
|
20
|
+
live_task: {
|
|
21
|
+
objective: 'Investigate WeaponPowerUp from the provided asset seed and report the best supported runtime relation.',
|
|
22
|
+
symbol_seed: 'WeaponPowerUp',
|
|
23
|
+
resource_seed: 'Assets/NEON/DataAssets/Powerups/1_newWeapon/0_pick/法器_Orb/1_weapon_orb_key.asset',
|
|
24
|
+
},
|
|
25
|
+
semantic_tuple: {
|
|
26
|
+
resource_anchor: 'Assets/NEON/DataAssets/Powerups/1_newWeapon/0_pick/法器_Orb/1_weapon_orb_key.asset',
|
|
27
|
+
symbol_anchor: 'WeaponPowerUp',
|
|
28
|
+
proof_edges: [
|
|
29
|
+
'HoldPickup -> WeaponPowerUp.PickItUp',
|
|
30
|
+
'EquipWithEvent -> WeaponPowerUp.Equip',
|
|
31
|
+
],
|
|
32
|
+
closure_status: 'not_verified_full',
|
|
33
|
+
},
|
|
34
|
+
},
|
|
35
|
+
reload: {
|
|
36
|
+
label: 'reload',
|
|
37
|
+
start_query: 'reload getvalue checkreload',
|
|
38
|
+
retry_query: 'Gungraph_use/1_weapon_orb_key.asset ReloadBase GetValue CheckReload',
|
|
39
|
+
proof_contexts: ['ReloadBase'],
|
|
40
|
+
proof_cypher: 'MATCH () RETURN 1',
|
|
41
|
+
tool_plan: [{ tool: 'query', input: { query: 'reload getvalue checkreload' } }],
|
|
42
|
+
live_task: {
|
|
43
|
+
objective: 'Investigate ReloadBase from the provided graph asset seed and report the best supported reload relation.',
|
|
44
|
+
symbol_seed: 'ReloadBase',
|
|
45
|
+
resource_seed: 'Assets/NEON/Graphs/PlayerGun/Gungraph_use/1_weapon_orb_key.asset',
|
|
46
|
+
},
|
|
47
|
+
semantic_tuple: {
|
|
48
|
+
resource_anchor: 'Assets/NEON/Graphs/PlayerGun/Gungraph_use/1_weapon_orb_key.asset',
|
|
49
|
+
symbol_anchor: 'ReloadBase',
|
|
50
|
+
proof_edge: 'ReloadBase.GetValue -> ReloadBase.CheckReload',
|
|
51
|
+
closure_status: 'not_verified_full',
|
|
52
|
+
},
|
|
53
|
+
},
|
|
54
|
+
},
|
|
55
|
+
};
|
|
56
|
+
test('benchmark report includes explicit benchmark tracks', async () => {
|
|
57
|
+
const report = await runAgentSafeQueryContextBenchmark(fakeSuite, {
|
|
58
|
+
repo: 'neonspark-core',
|
|
59
|
+
subagentRunsDir: '/tmp/subagent-runs',
|
|
60
|
+
}, {
|
|
61
|
+
runner: {
|
|
62
|
+
query: async (input) => {
|
|
63
|
+
const queryText = String(input?.query || '');
|
|
64
|
+
if (/reload|ReloadBase|CheckReload/.test(queryText)) {
|
|
65
|
+
return {
|
|
66
|
+
candidates: [{ name: 'ReloadBase' }],
|
|
67
|
+
resource_hints: [{ path: 'Assets/NEON/Graphs/PlayerGun/Gungraph_use/1_weapon_orb_key.asset' }],
|
|
68
|
+
};
|
|
69
|
+
}
|
|
70
|
+
return {
|
|
71
|
+
candidates: [{ name: 'WeaponPowerUp' }],
|
|
72
|
+
resource_hints: [{ path: 'Assets/NEON/DataAssets/Powerups/1_newWeapon/0_pick/法器_Orb/1_weapon_orb_key.asset' }],
|
|
73
|
+
};
|
|
74
|
+
},
|
|
75
|
+
context: async (input) => ({ symbol: { name: String(input?.name || 'WeaponPowerUp') } }),
|
|
76
|
+
impact: async () => ({ impactedCount: 0 }),
|
|
77
|
+
cypher: async (input) => {
|
|
78
|
+
const queryText = String(input?.query || '');
|
|
79
|
+
if (queryText.includes('CheckReload') || queryText.includes('GetValue')) {
|
|
80
|
+
return { row_count: 1, rows: [{ src: 'GetValue', dst: 'CheckReload' }] };
|
|
81
|
+
}
|
|
82
|
+
return {
|
|
83
|
+
row_count: 2,
|
|
84
|
+
rows: [
|
|
85
|
+
{ src: 'HoldPickup', dst: 'PickItUp' },
|
|
86
|
+
{ src: 'EquipWithEvent', dst: 'Equip' },
|
|
87
|
+
],
|
|
88
|
+
};
|
|
89
|
+
},
|
|
90
|
+
close: async () => { },
|
|
91
|
+
},
|
|
92
|
+
executeToolPlan: async (plan) => plan.map((step) => ({
|
|
93
|
+
tool: step.tool,
|
|
94
|
+
input: step.input,
|
|
95
|
+
output: {
|
|
96
|
+
anchor: 'Assets/NEON/DataAssets/Powerups/1_newWeapon/0_pick/法器_Orb/1_weapon_orb_key.asset',
|
|
97
|
+
symbol: 'WeaponPowerUp',
|
|
98
|
+
proof: 'HoldPickup -> WeaponPowerUp.PickItUp',
|
|
99
|
+
},
|
|
100
|
+
})),
|
|
101
|
+
loadSubagentLiveCaseResult: async (_runDir, benchmarkCase) => ({
|
|
102
|
+
prompt: 'Use only telemetry-tool.js\nFinal JSON schema:',
|
|
103
|
+
prompt_path: '/tmp/prompt.txt',
|
|
104
|
+
result_path: '/tmp/result.json',
|
|
105
|
+
telemetry_path: '/tmp/telemetry.jsonl',
|
|
106
|
+
final_result: {},
|
|
107
|
+
steps: [{
|
|
108
|
+
tool: 'query',
|
|
109
|
+
input: { query: benchmarkCase.start_query },
|
|
110
|
+
output: { value: benchmarkCase.semantic_tuple.resource_anchor },
|
|
111
|
+
durationMs: 1,
|
|
112
|
+
totalTokensEst: 10,
|
|
113
|
+
timestamp: '2026-04-08T00:00:00.000Z',
|
|
114
|
+
}],
|
|
115
|
+
semantic_tuple: benchmarkCase.semantic_tuple,
|
|
116
|
+
normalized_tuple_pass: true,
|
|
117
|
+
evidence_validation_pass: true,
|
|
118
|
+
failure_class: undefined,
|
|
119
|
+
semantic_tuple_pass: true,
|
|
120
|
+
tool_calls_to_completion: 1,
|
|
121
|
+
tokens_to_completion: 10,
|
|
122
|
+
stop_reason: 'semantic_tuple_satisfied',
|
|
123
|
+
}),
|
|
124
|
+
});
|
|
125
|
+
assert.equal(report.cases.weapon_powerup.semantic_tuple_pass, true);
|
|
126
|
+
assert.ok(report.same_script.tool_plan.weapon_powerup);
|
|
127
|
+
assert.ok(report.subagent_live.reload.steps);
|
|
128
|
+
assert.ok(report.token_summary.weapon_powerup);
|
|
129
|
+
assert.ok(report.call_summary.reload);
|
|
130
|
+
assert.ok(report.workflow_replay_full.weapon_powerup);
|
|
131
|
+
assert.ok(report.workflow_replay_slim.weapon_powerup);
|
|
132
|
+
assert.ok(report.same_script_full.reload);
|
|
133
|
+
assert.ok(report.same_script_slim.reload);
|
|
134
|
+
assert.ok(report.subagent_live.weapon_powerup);
|
|
135
|
+
assert.equal(report.workflow_replay_slim.weapon_powerup.semantic_tuple_pass, true);
|
|
136
|
+
assert.equal(typeof report.workflow_replay_slim.weapon_powerup.anchor_top1_pass, 'boolean');
|
|
137
|
+
assert.equal(typeof report.workflow_replay_slim.weapon_powerup.recommended_follow_up_hit, 'boolean');
|
|
138
|
+
assert.equal(typeof report.workflow_replay_slim.weapon_powerup.post_narrowing_anchor_pass, 'boolean');
|
|
139
|
+
assert.equal(typeof report.workflow_replay_slim.weapon_powerup.post_narrowing_follow_up_hit, 'boolean');
|
|
140
|
+
assert.equal(typeof report.workflow_replay_slim.weapon_powerup.ambiguity_detour_count, 'number');
|
|
141
|
+
assert.equal(report.workflow_replay_slim.reload.guid_invariance_pass, true);
|
|
142
|
+
assert.equal(report.workflow_replay_slim.weapon_powerup.live_tool_evidence_pass, true);
|
|
143
|
+
assert.equal(report.acceptance.pass, report.workflow_replay_slim.weapon_powerup.semantic_tuple_pass
|
|
144
|
+
&& report.workflow_replay_slim.weapon_powerup.post_narrowing_anchor_pass
|
|
145
|
+
&& report.workflow_replay_slim.weapon_powerup.post_narrowing_follow_up_hit
|
|
146
|
+
&& report.workflow_replay_slim.weapon_powerup.guid_invariance_pass
|
|
147
|
+
&& report.workflow_replay_slim.weapon_powerup.live_tool_evidence_pass
|
|
148
|
+
&& report.workflow_replay_slim.weapon_powerup.freeze_ready
|
|
149
|
+
&& report.workflow_replay_slim.weapon_powerup.tier_envelope.facts_present
|
|
150
|
+
&& report.workflow_replay_slim.weapon_powerup.tier_envelope.closure_present
|
|
151
|
+
&& report.workflow_replay_slim.weapon_powerup.tier_envelope.clues_present
|
|
152
|
+
&& report.workflow_replay_slim.weapon_powerup.tier_envelope.semantic_order_pass
|
|
153
|
+
&& !report.workflow_replay_slim.weapon_powerup.placeholder_leak_detected
|
|
154
|
+
&& !report.workflow_replay_slim.weapon_powerup.heuristic_top_summary_detected
|
|
155
|
+
&& report.workflow_replay_slim.reload.semantic_tuple_pass
|
|
156
|
+
&& report.workflow_replay_slim.reload.post_narrowing_anchor_pass
|
|
157
|
+
&& report.workflow_replay_slim.reload.post_narrowing_follow_up_hit
|
|
158
|
+
&& report.workflow_replay_slim.reload.guid_invariance_pass
|
|
159
|
+
&& report.workflow_replay_slim.reload.live_tool_evidence_pass
|
|
160
|
+
&& report.workflow_replay_slim.reload.freeze_ready
|
|
161
|
+
&& report.workflow_replay_slim.reload.tier_envelope.facts_present
|
|
162
|
+
&& report.workflow_replay_slim.reload.tier_envelope.closure_present
|
|
163
|
+
&& report.workflow_replay_slim.reload.tier_envelope.clues_present
|
|
164
|
+
&& report.workflow_replay_slim.reload.tier_envelope.semantic_order_pass
|
|
165
|
+
&& !report.workflow_replay_slim.reload.placeholder_leak_detected
|
|
166
|
+
&& !report.workflow_replay_slim.reload.heuristic_top_summary_detected);
|
|
167
|
+
assert.equal(report.pass, report.acceptance.pass);
|
|
168
|
+
});
|
|
169
|
+
test('benchmark report enforces track split, acceptance source, prompt secrecy, and live scoring taxonomy', async () => {
|
|
170
|
+
const report = await runAgentSafeQueryContextBenchmark(fakeSuite, {
|
|
171
|
+
repo: 'neonspark-core',
|
|
172
|
+
subagentRunsDir: '/tmp/subagent-runs',
|
|
173
|
+
}, {
|
|
174
|
+
runner: {
|
|
175
|
+
query: async (input) => {
|
|
176
|
+
const queryText = String(input?.query || '');
|
|
177
|
+
if (/reload|ReloadBase|CheckReload/.test(queryText)) {
|
|
178
|
+
return {
|
|
179
|
+
candidates: [{ name: 'ReloadBase' }],
|
|
180
|
+
resource_hints: [{ path: 'Assets/NEON/Graphs/PlayerGun/Gungraph_use/1_weapon_orb_key.asset' }],
|
|
181
|
+
};
|
|
182
|
+
}
|
|
183
|
+
return {
|
|
184
|
+
candidates: [{ name: 'WeaponPowerUp' }],
|
|
185
|
+
resource_hints: [{ path: 'Assets/NEON/DataAssets/Powerups/1_newWeapon/0_pick/法器_Orb/1_weapon_orb_key.asset' }],
|
|
186
|
+
};
|
|
187
|
+
},
|
|
188
|
+
context: async (input) => ({ symbol: { name: String(input?.name || 'WeaponPowerUp') } }),
|
|
189
|
+
impact: async () => ({ impactedCount: 0 }),
|
|
190
|
+
cypher: async (input) => {
|
|
191
|
+
const queryText = String(input?.query || '');
|
|
192
|
+
if (queryText.includes('CheckReload') || queryText.includes('GetValue')) {
|
|
193
|
+
return { row_count: 1, rows: [{ src: 'GetValue', dst: 'CheckReload' }] };
|
|
194
|
+
}
|
|
195
|
+
return {
|
|
196
|
+
row_count: 2,
|
|
197
|
+
rows: [
|
|
198
|
+
{ src: 'HoldPickup', dst: 'PickItUp' },
|
|
199
|
+
{ src: 'EquipWithEvent', dst: 'Equip' },
|
|
200
|
+
],
|
|
201
|
+
};
|
|
202
|
+
},
|
|
203
|
+
close: async () => { },
|
|
204
|
+
},
|
|
205
|
+
executeToolPlan: async (plan) => plan.map((step) => ({
|
|
206
|
+
tool: step.tool,
|
|
207
|
+
input: step.input,
|
|
208
|
+
output: {
|
|
209
|
+
anchor: 'Assets/NEON/DataAssets/Powerups/1_newWeapon/0_pick/法器_Orb/1_weapon_orb_key.asset',
|
|
210
|
+
symbol: 'WeaponPowerUp',
|
|
211
|
+
proof: 'HoldPickup -> WeaponPowerUp.PickItUp',
|
|
212
|
+
},
|
|
213
|
+
})),
|
|
214
|
+
loadSubagentLiveCaseResult: async (_runDir, benchmarkCase) => ({
|
|
215
|
+
prompt: 'Use only telemetry-tool.js\nFinal JSON schema:',
|
|
216
|
+
prompt_path: '/tmp/prompt.txt',
|
|
217
|
+
result_path: '/tmp/result.json',
|
|
218
|
+
telemetry_path: '/tmp/telemetry.jsonl',
|
|
219
|
+
final_result: {},
|
|
220
|
+
steps: [{
|
|
221
|
+
tool: 'query',
|
|
222
|
+
input: { query: benchmarkCase.start_query },
|
|
223
|
+
output: { value: benchmarkCase.semantic_tuple.resource_anchor },
|
|
224
|
+
durationMs: 1,
|
|
225
|
+
totalTokensEst: 10,
|
|
226
|
+
timestamp: '2026-04-08T00:00:00.000Z',
|
|
227
|
+
}],
|
|
228
|
+
semantic_tuple: benchmarkCase.semantic_tuple,
|
|
229
|
+
normalized_tuple_pass: true,
|
|
230
|
+
evidence_validation_pass: true,
|
|
231
|
+
failure_class: undefined,
|
|
232
|
+
semantic_tuple_pass: true,
|
|
233
|
+
tool_calls_to_completion: 1,
|
|
234
|
+
tokens_to_completion: 10,
|
|
235
|
+
stop_reason: 'semantic_tuple_satisfied',
|
|
236
|
+
}),
|
|
237
|
+
});
|
|
238
|
+
assert.equal(Object.keys(report.workflow_replay_full).length > 0, true);
|
|
239
|
+
assert.equal(Object.keys(report.workflow_replay_slim).length > 0, true);
|
|
240
|
+
assert.equal(Object.keys(report.same_script_full).length > 0, true);
|
|
241
|
+
assert.equal(Object.keys(report.same_script_slim).length > 0, true);
|
|
242
|
+
assert.equal(Object.keys(report.subagent_live).length > 0, true);
|
|
243
|
+
assert.deepEqual(report.acceptance.cases, {
|
|
244
|
+
weapon_powerup: report.workflow_replay_slim.weapon_powerup.semantic_tuple_pass
|
|
245
|
+
&& report.workflow_replay_slim.weapon_powerup.post_narrowing_anchor_pass
|
|
246
|
+
&& report.workflow_replay_slim.weapon_powerup.post_narrowing_follow_up_hit
|
|
247
|
+
&& report.workflow_replay_slim.weapon_powerup.guid_invariance_pass
|
|
248
|
+
&& report.workflow_replay_slim.weapon_powerup.live_tool_evidence_pass
|
|
249
|
+
&& report.workflow_replay_slim.weapon_powerup.freeze_ready
|
|
250
|
+
&& report.workflow_replay_slim.weapon_powerup.tier_envelope.facts_present
|
|
251
|
+
&& report.workflow_replay_slim.weapon_powerup.tier_envelope.closure_present
|
|
252
|
+
&& report.workflow_replay_slim.weapon_powerup.tier_envelope.clues_present
|
|
253
|
+
&& report.workflow_replay_slim.weapon_powerup.tier_envelope.semantic_order_pass
|
|
254
|
+
&& !report.workflow_replay_slim.weapon_powerup.placeholder_leak_detected
|
|
255
|
+
&& !report.workflow_replay_slim.weapon_powerup.heuristic_top_summary_detected,
|
|
256
|
+
reload: report.workflow_replay_slim.reload.semantic_tuple_pass
|
|
257
|
+
&& report.workflow_replay_slim.reload.post_narrowing_anchor_pass
|
|
258
|
+
&& report.workflow_replay_slim.reload.post_narrowing_follow_up_hit
|
|
259
|
+
&& report.workflow_replay_slim.reload.guid_invariance_pass
|
|
260
|
+
&& report.workflow_replay_slim.reload.live_tool_evidence_pass
|
|
261
|
+
&& report.workflow_replay_slim.reload.freeze_ready
|
|
262
|
+
&& report.workflow_replay_slim.reload.tier_envelope.facts_present
|
|
263
|
+
&& report.workflow_replay_slim.reload.tier_envelope.closure_present
|
|
264
|
+
&& report.workflow_replay_slim.reload.tier_envelope.clues_present
|
|
265
|
+
&& report.workflow_replay_slim.reload.tier_envelope.semantic_order_pass
|
|
266
|
+
&& !report.workflow_replay_slim.reload.placeholder_leak_detected
|
|
267
|
+
&& !report.workflow_replay_slim.reload.heuristic_top_summary_detected,
|
|
268
|
+
});
|
|
269
|
+
assert.equal(report.subagent_live.weapon_powerup.prompt.includes('HoldPickup -> WeaponPowerUp.PickItUp'), false);
|
|
270
|
+
assert.equal(report.subagent_live.reload.prompt.includes('ReloadBase.GetValue -> ReloadBase.CheckReload'), false);
|
|
271
|
+
for (const row of Object.values(report.subagent_live)) {
|
|
272
|
+
assert.equal(typeof row.normalized_tuple_pass, 'boolean');
|
|
273
|
+
assert.equal(typeof row.evidence_validation_pass, 'boolean');
|
|
274
|
+
if (!row.semantic_tuple_pass) {
|
|
275
|
+
assert.ok(row.failure_class);
|
|
276
|
+
}
|
|
277
|
+
}
|
|
278
|
+
});
|
|
279
|
+
test('acceptance fails when semantic tuple passes but placeholder leakage is detected', async () => {
|
|
280
|
+
const report = await runAgentSafeQueryContextBenchmark(fakeSuite, {
|
|
281
|
+
repo: 'neonspark-core',
|
|
282
|
+
subagentRunsDir: '/tmp/subagent-runs',
|
|
283
|
+
}, {
|
|
284
|
+
runner: {
|
|
285
|
+
query: async (input) => {
|
|
286
|
+
const queryText = String(input?.query || '');
|
|
287
|
+
if (/reload|ReloadBase|CheckReload/.test(queryText)) {
|
|
288
|
+
return {
|
|
289
|
+
summary: 'ReloadBase flow',
|
|
290
|
+
decision: {
|
|
291
|
+
primary_candidate: 'ReloadBase',
|
|
292
|
+
recommended_follow_up: 'resource_path_prefix=Assets/NEON/Graphs/PlayerGun/Gungraph_use/1_weapon_orb_key.asset',
|
|
293
|
+
},
|
|
294
|
+
candidates: [{ name: 'ReloadBase' }],
|
|
295
|
+
resource_hints: [{ target: 'Assets/NEON/Graphs/PlayerGun/Gungraph_use/1_weapon_orb_key.asset' }],
|
|
296
|
+
};
|
|
297
|
+
}
|
|
298
|
+
return {
|
|
299
|
+
summary: 'WeaponPowerUp flow',
|
|
300
|
+
decision: {
|
|
301
|
+
primary_candidate: 'WeaponPowerUp',
|
|
302
|
+
recommended_follow_up: 'resource_path_prefix=Reload NEON.Game.Graph.Nodes.Reloads',
|
|
303
|
+
},
|
|
304
|
+
candidates: [{ name: 'WeaponPowerUp' }],
|
|
305
|
+
resource_hints: [{ target: 'Assets/NEON/DataAssets/Powerups/1_newWeapon/0_pick/法器_Orb/1_weapon_orb_key.asset' }],
|
|
306
|
+
};
|
|
307
|
+
},
|
|
308
|
+
context: async (input) => ({ symbol: { name: String(input?.name || 'WeaponPowerUp') } }),
|
|
309
|
+
impact: async () => ({ impactedCount: 0 }),
|
|
310
|
+
cypher: async (input) => {
|
|
311
|
+
const queryText = String(input?.query || '');
|
|
312
|
+
if (queryText.includes('CheckReload') || queryText.includes('GetValue')) {
|
|
313
|
+
return { row_count: 1, rows: [{ src: 'GetValue', dst: 'CheckReload' }] };
|
|
314
|
+
}
|
|
315
|
+
return {
|
|
316
|
+
row_count: 2,
|
|
317
|
+
rows: [
|
|
318
|
+
{ src: 'HoldPickup', dst: 'PickItUp' },
|
|
319
|
+
{ src: 'EquipWithEvent', dst: 'Equip' },
|
|
320
|
+
],
|
|
321
|
+
};
|
|
322
|
+
},
|
|
323
|
+
close: async () => { },
|
|
324
|
+
},
|
|
325
|
+
executeToolPlan: async (plan) => plan.map((step) => ({
|
|
326
|
+
tool: step.tool,
|
|
327
|
+
input: step.input,
|
|
328
|
+
output: {
|
|
329
|
+
anchor: 'Assets/NEON/DataAssets/Powerups/1_newWeapon/0_pick/法器_Orb/1_weapon_orb_key.asset',
|
|
330
|
+
symbol: 'WeaponPowerUp',
|
|
331
|
+
proof: 'HoldPickup -> WeaponPowerUp.PickItUp',
|
|
332
|
+
},
|
|
333
|
+
})),
|
|
334
|
+
loadSubagentLiveCaseResult: async (_runDir, benchmarkCase) => ({
|
|
335
|
+
prompt: 'Use only telemetry-tool.js\nFinal JSON schema:',
|
|
336
|
+
prompt_path: '/tmp/prompt.txt',
|
|
337
|
+
result_path: '/tmp/result.json',
|
|
338
|
+
telemetry_path: '/tmp/telemetry.jsonl',
|
|
339
|
+
final_result: {},
|
|
340
|
+
steps: [{
|
|
341
|
+
tool: 'query',
|
|
342
|
+
input: { query: benchmarkCase.start_query },
|
|
343
|
+
output: { value: benchmarkCase.semantic_tuple.resource_anchor },
|
|
344
|
+
durationMs: 1,
|
|
345
|
+
totalTokensEst: 10,
|
|
346
|
+
timestamp: '2026-04-08T00:00:00.000Z',
|
|
347
|
+
}],
|
|
348
|
+
semantic_tuple: benchmarkCase.semantic_tuple,
|
|
349
|
+
normalized_tuple_pass: true,
|
|
350
|
+
evidence_validation_pass: true,
|
|
351
|
+
failure_class: undefined,
|
|
352
|
+
semantic_tuple_pass: true,
|
|
353
|
+
tool_calls_to_completion: 1,
|
|
354
|
+
tokens_to_completion: 10,
|
|
355
|
+
stop_reason: 'semantic_tuple_satisfied',
|
|
356
|
+
}),
|
|
357
|
+
});
|
|
358
|
+
assert.equal(report.workflow_replay_slim.weapon_powerup.semantic_tuple_pass, true);
|
|
359
|
+
assert.equal(report.workflow_replay_slim.weapon_powerup.placeholder_leak_detected, true);
|
|
360
|
+
assert.equal(report.acceptance.cases.weapon_powerup, false);
|
|
361
|
+
assert.equal(report.acceptance.pass, false);
|
|
362
|
+
});
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import type { AgentContextToolRunner } from '../agent-context/tool-runner.js';
|
|
2
|
+
import type { AgentSafeBenchmarkCase, SemanticDriftMetrics, SemanticTuple } from './types.js';
|
|
3
|
+
export interface WorkflowReplayStep {
|
|
4
|
+
tool: 'query' | 'context' | 'cypher';
|
|
5
|
+
input: Record<string, unknown>;
|
|
6
|
+
output: unknown;
|
|
7
|
+
durationMs: number;
|
|
8
|
+
totalTokensEst: number;
|
|
9
|
+
}
|
|
10
|
+
export interface WorkflowReplayResult extends SemanticDriftMetrics {
|
|
11
|
+
steps: WorkflowReplayStep[];
|
|
12
|
+
base: {
|
|
13
|
+
primary_candidate: string;
|
|
14
|
+
recommended_follow_up: string;
|
|
15
|
+
};
|
|
16
|
+
guid_variant: {
|
|
17
|
+
primary_candidate: string;
|
|
18
|
+
recommended_follow_up: string;
|
|
19
|
+
};
|
|
20
|
+
confirmed_chain: {
|
|
21
|
+
steps: string[];
|
|
22
|
+
};
|
|
23
|
+
semantic_tuple: SemanticTuple;
|
|
24
|
+
semantic_tuple_pass: boolean;
|
|
25
|
+
tool_calls_to_completion: number;
|
|
26
|
+
tokens_to_completion: number;
|
|
27
|
+
retry_breakdown: {
|
|
28
|
+
query_retry_count: number;
|
|
29
|
+
context_retry_count: number;
|
|
30
|
+
cypher_retry_count: number;
|
|
31
|
+
};
|
|
32
|
+
stop_reason: 'semantic_tuple_satisfied' | 'max_steps_reached';
|
|
33
|
+
}
|
|
34
|
+
export type WorkflowReplayResponseProfile = 'full' | 'slim';
|
|
35
|
+
export declare function runWorkflowReplay(benchmarkCase: AgentSafeBenchmarkCase, runner: Pick<AgentContextToolRunner, 'query' | 'context' | 'cypher'>, options?: {
|
|
36
|
+
repo?: string;
|
|
37
|
+
maxSteps?: number;
|
|
38
|
+
responseProfile?: WorkflowReplayResponseProfile;
|
|
39
|
+
}): Promise<WorkflowReplayResult>;
|
|
40
|
+
export declare function runWorkflowReplayWithDefaultRunner(benchmarkCase: AgentSafeBenchmarkCase, options?: {
|
|
41
|
+
repo?: string;
|
|
42
|
+
maxSteps?: number;
|
|
43
|
+
responseProfile?: WorkflowReplayResponseProfile;
|
|
44
|
+
}): Promise<WorkflowReplayResult>;
|