@veewo/gitnexus 1.5.0-rc.4 → 1.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175)
  1. package/dist/benchmark/agent-context/runner.js +3 -0
  2. package/dist/benchmark/agent-context/runner.test.js +22 -0
  3. package/dist/benchmark/agent-context/tool-runner.d.ts +7 -6
  4. package/dist/benchmark/agent-safe-query-context/io.d.ts +2 -0
  5. package/dist/benchmark/agent-safe-query-context/io.js +86 -0
  6. package/dist/benchmark/agent-safe-query-context/io.test.d.ts +1 -0
  7. package/dist/benchmark/agent-safe-query-context/io.test.js +13 -0
  8. package/dist/benchmark/agent-safe-query-context/report.d.ts +57 -0
  9. package/dist/benchmark/agent-safe-query-context/report.js +159 -0
  10. package/dist/benchmark/agent-safe-query-context/report.test.d.ts +1 -0
  11. package/dist/benchmark/agent-safe-query-context/report.test.js +362 -0
  12. package/dist/benchmark/agent-safe-query-context/runner.d.ts +44 -0
  13. package/dist/benchmark/agent-safe-query-context/runner.js +406 -0
  14. package/dist/benchmark/agent-safe-query-context/runner.test.d.ts +1 -0
  15. package/dist/benchmark/agent-safe-query-context/runner.test.js +290 -0
  16. package/dist/benchmark/agent-safe-query-context/semantic-tuple.d.ts +20 -0
  17. package/dist/benchmark/agent-safe-query-context/semantic-tuple.js +225 -0
  18. package/dist/benchmark/agent-safe-query-context/semantic-tuple.test.d.ts +1 -0
  19. package/dist/benchmark/agent-safe-query-context/semantic-tuple.test.js +122 -0
  20. package/dist/benchmark/agent-safe-query-context/subagent-live.d.ts +47 -0
  21. package/dist/benchmark/agent-safe-query-context/subagent-live.js +128 -0
  22. package/dist/benchmark/agent-safe-query-context/subagent-live.test.d.ts +1 -0
  23. package/dist/benchmark/agent-safe-query-context/subagent-live.test.js +155 -0
  24. package/dist/benchmark/agent-safe-query-context/telemetry-tool.d.ts +9 -0
  25. package/dist/benchmark/agent-safe-query-context/telemetry-tool.js +77 -0
  26. package/dist/benchmark/agent-safe-query-context/types.d.ts +61 -0
  27. package/dist/benchmark/agent-safe-query-context/types.js +8 -0
  28. package/dist/benchmark/analyze-runner.d.ts +1 -1
  29. package/dist/benchmark/analyze-runner.js +4 -3
  30. package/dist/benchmark/analyze-runner.test.js +7 -0
  31. package/dist/benchmark/runtime-poc/provenance-artifact.d.ts +47 -0
  32. package/dist/benchmark/runtime-poc/provenance-artifact.js +89 -0
  33. package/dist/benchmark/runtime-poc/runner.d.ts +31 -0
  34. package/dist/benchmark/runtime-poc/runner.js +163 -0
  35. package/dist/benchmark/u2-e2e/hydration-policy-repeatability-runner.d.ts +8 -0
  36. package/dist/benchmark/u2-e2e/hydration-policy-repeatability-runner.js +21 -0
  37. package/dist/benchmark/u2-e2e/phase2-runtime-claim-acceptance-runner.d.ts +0 -1
  38. package/dist/benchmark/u2-e2e/phase2-runtime-claim-acceptance-runner.js +53 -51
  39. package/dist/benchmark/u2-e2e/phase2-runtime-claim-acceptance-runner.test.js +0 -1
  40. package/dist/benchmark/u2-e2e/phase5-rule-lab-acceptance-runner.d.ts +1 -1
  41. package/dist/benchmark/u2-e2e/phase5-rule-lab-acceptance-runner.js +82 -18
  42. package/dist/benchmark/u2-e2e/phase5-rule-lab-acceptance-runner.test.js +1 -2
  43. package/dist/benchmark/u2-e2e/retrieval-runner.js +15 -7
  44. package/dist/benchmark/u2-e2e/retrieval-runner.test.js +46 -0
  45. package/dist/cli/ai-context.d.ts +0 -1
  46. package/dist/cli/ai-context.js +5 -6
  47. package/dist/cli/ai-context.test.js +8 -0
  48. package/dist/cli/analyze-options.js +58 -34
  49. package/dist/cli/analyze-options.test.js +57 -0
  50. package/dist/cli/analyze-runtime-summary.js +2 -0
  51. package/dist/cli/analyze-runtime-summary.test.js +12 -0
  52. package/dist/cli/analyze-summary.d.ts +4 -0
  53. package/dist/cli/analyze-summary.js +43 -0
  54. package/dist/cli/analyze-summary.test.js +65 -1
  55. package/dist/cli/analyze.d.ts +11 -0
  56. package/dist/cli/analyze.js +34 -5
  57. package/dist/cli/analyze.test.d.ts +1 -0
  58. package/dist/cli/analyze.test.js +25 -0
  59. package/dist/cli/benchmark-agent-context.js +1 -1
  60. package/dist/cli/benchmark-agent-safe-query-context.d.ts +20 -0
  61. package/dist/cli/benchmark-agent-safe-query-context.js +39 -0
  62. package/dist/cli/benchmark-agent-safe-query-context.test.d.ts +1 -0
  63. package/dist/cli/benchmark-agent-safe-query-context.test.js +271 -0
  64. package/dist/cli/benchmark-unity.js +1 -1
  65. package/dist/cli/benchmark-unity.test.js +5 -1
  66. package/dist/cli/benchmark.d.ts +29 -0
  67. package/dist/cli/benchmark.js +55 -0
  68. package/dist/cli/index.js +27 -2
  69. package/dist/cli/rule-lab.d.ts +3 -7
  70. package/dist/cli/rule-lab.js +13 -22
  71. package/dist/cli/rule-lab.test.js +23 -3
  72. package/dist/cli/scope-manifest-config.d.ts +9 -0
  73. package/dist/cli/scope-manifest-config.js +37 -0
  74. package/dist/cli/setup.js +40 -41
  75. package/dist/cli/setup.test.js +14 -14
  76. package/dist/cli/sync-manifest.d.ts +27 -0
  77. package/dist/cli/sync-manifest.js +200 -0
  78. package/dist/cli/sync-manifest.test.d.ts +1 -0
  79. package/dist/cli/sync-manifest.test.js +88 -0
  80. package/dist/cli/tool.d.ts +2 -0
  81. package/dist/cli/tool.js +2 -0
  82. package/dist/core/config/unity-config.d.ts +1 -1
  83. package/dist/core/config/unity-config.js +1 -1
  84. package/dist/core/ingestion/call-processor.d.ts +2 -1
  85. package/dist/core/ingestion/call-processor.js +28 -6
  86. package/dist/core/ingestion/heritage-processor.d.ts +2 -1
  87. package/dist/core/ingestion/heritage-processor.js +30 -7
  88. package/dist/core/ingestion/import-processor.d.ts +2 -1
  89. package/dist/core/ingestion/import-processor.js +28 -6
  90. package/dist/core/ingestion/parsing-processor.d.ts +5 -3
  91. package/dist/core/ingestion/parsing-processor.js +46 -13
  92. package/dist/core/ingestion/pipeline.js +100 -19
  93. package/dist/core/ingestion/unity-lifecycle-synthetic-calls.test.js +18 -20
  94. package/dist/core/ingestion/unity-parity-seed.d.ts +2 -1
  95. package/dist/core/ingestion/unity-parity-seed.js +8 -0
  96. package/dist/core/ingestion/unity-resource-processor.d.ts +11 -0
  97. package/dist/core/ingestion/unity-resource-processor.js +102 -0
  98. package/dist/core/ingestion/unity-resource-processor.test.js +449 -0
  99. package/dist/core/ingestion/unity-runtime-binding-rules.d.ts +16 -1
  100. package/dist/core/ingestion/unity-runtime-binding-rules.js +193 -42
  101. package/dist/core/ingestion/workers/parse-worker.d.ts +2 -0
  102. package/dist/core/ingestion/workers/parse-worker.js +50 -6
  103. package/dist/core/lbug/csv-generator.test.js +2 -2
  104. package/dist/core/tree-sitter/csharp-define-profile.d.ts +6 -0
  105. package/dist/core/tree-sitter/csharp-define-profile.js +43 -0
  106. package/dist/core/tree-sitter/csharp-preproc-normalizer.d.ts +14 -0
  107. package/dist/core/tree-sitter/csharp-preproc-normalizer.js +261 -0
  108. package/dist/core/tree-sitter/parser-loader.d.ts +10 -0
  109. package/dist/core/tree-sitter/parser-loader.js +19 -0
  110. package/dist/core/unity/doc-contract.test.d.ts +1 -0
  111. package/dist/core/unity/doc-contract.test.js +30 -0
  112. package/dist/core/unity/prefab-source-scan.d.ts +25 -0
  113. package/dist/core/unity/prefab-source-scan.js +152 -0
  114. package/dist/core/unity/prefab-source-scan.test.d.ts +1 -0
  115. package/dist/core/unity/prefab-source-scan.test.js +70 -0
  116. package/dist/core/unity/scan-context.d.ts +12 -0
  117. package/dist/core/unity/scan-context.js +50 -2
  118. package/dist/core/unity/scan-context.test.js +74 -0
  119. package/dist/mcp/local/agent-safe-response.d.ts +10 -0
  120. package/dist/mcp/local/agent-safe-response.js +639 -0
  121. package/dist/mcp/local/derived-process-reader.js +1 -1
  122. package/dist/mcp/local/local-backend.d.ts +18 -1
  123. package/dist/mcp/local/local-backend.js +319 -125
  124. package/dist/mcp/local/process-confidence.d.ts +1 -2
  125. package/dist/mcp/local/process-confidence.js +0 -3
  126. package/dist/mcp/local/process-confidence.test.js +4 -2
  127. package/dist/mcp/local/process-evidence.d.ts +1 -8
  128. package/dist/mcp/local/process-evidence.js +1 -23
  129. package/dist/mcp/local/process-evidence.test.js +2 -16
  130. package/dist/mcp/local/process-ref.d.ts +1 -1
  131. package/dist/mcp/local/runtime-chain-closure-evaluator.d.ts +33 -0
  132. package/dist/mcp/local/runtime-chain-closure-evaluator.js +273 -0
  133. package/dist/mcp/local/runtime-chain-graph-candidates.d.ts +23 -0
  134. package/dist/mcp/local/runtime-chain-graph-candidates.js +131 -0
  135. package/dist/mcp/local/runtime-chain-verify.d.ts +1 -1
  136. package/dist/mcp/local/runtime-chain-verify.js +149 -138
  137. package/dist/mcp/local/runtime-chain-verify.test.js +126 -68
  138. package/dist/mcp/local/runtime-claim-rule-registry.d.ts +4 -0
  139. package/dist/mcp/local/runtime-claim-rule-registry.js +4 -0
  140. package/dist/mcp/local/runtime-claim-rule-registry.test.js +37 -4
  141. package/dist/mcp/local/runtime-claim.d.ts +11 -0
  142. package/dist/mcp/local/runtime-claim.js +28 -0
  143. package/dist/mcp/local/unity-evidence-view.d.ts +1 -1
  144. package/dist/mcp/local/unity-evidence-view.js +1 -1
  145. package/dist/mcp/local/unity-evidence-view.test.js +22 -0
  146. package/dist/mcp/tools.js +51 -21
  147. package/dist/rule-lab/analyze.d.ts +2 -1
  148. package/dist/rule-lab/analyze.js +94 -59
  149. package/dist/rule-lab/analyze.test.js +238 -20
  150. package/dist/rule-lab/curate.d.ts +2 -1
  151. package/dist/rule-lab/curate.js +24 -3
  152. package/dist/rule-lab/curate.test.js +65 -0
  153. package/dist/rule-lab/curation-input-builder.d.ts +45 -0
  154. package/dist/rule-lab/curation-input-builder.js +133 -0
  155. package/dist/rule-lab/promote.js +80 -7
  156. package/dist/rule-lab/promote.test.js +150 -0
  157. package/dist/rule-lab/review-pack.d.ts +3 -0
  158. package/dist/rule-lab/review-pack.js +41 -1
  159. package/dist/rule-lab/review-pack.test.js +67 -0
  160. package/dist/rule-lab/types.d.ts +29 -0
  161. package/dist/types/pipeline.d.ts +16 -0
  162. package/package.json +14 -13
  163. package/scripts/check-sync-manifest-traceability.mjs +203 -0
  164. package/scripts/run-node-tests.mjs +61 -0
  165. package/scripts/tree-sitter-audit-classify.mjs +172 -0
  166. package/skills/_shared/unity-rule-authoring-contract.md +64 -0
  167. package/skills/_shared/unity-runtime-process-contract.md +16 -0
  168. package/skills/gitnexus-cli.md +44 -4
  169. package/skills/gitnexus-debugging.md +9 -0
  170. package/skills/gitnexus-exploring.md +66 -18
  171. package/skills/gitnexus-guide.md +42 -3
  172. package/skills/gitnexus-impact-analysis.md +8 -0
  173. package/skills/gitnexus-pr-review.md +8 -0
  174. package/skills/gitnexus-refactoring.md +8 -0
  175. package/skills/gitnexus-unity-rule-gen.md +66 -312
@@ -0,0 +1,362 @@
1
+ import test from 'node:test';
2
+ import assert from 'node:assert/strict';
3
+ import { runAgentSafeQueryContextBenchmark } from './report.js';
4
+ const fakeSuite = {
5
+ thresholds: {
6
+ workflowReplay: { maxSteps: 5 },
7
+ tokenReduction: {
8
+ weapon_powerup: 0.5,
9
+ reload: 0.4,
10
+ },
11
+ },
12
+ cases: {
13
+ weapon_powerup: {
14
+ label: 'weapon_powerup',
15
+ start_query: 'weapon powerup equip chain',
16
+ retry_query: '1_weapon_orb_key.asset WeaponPowerUp HoldPickup EquipWithEvent Equip',
17
+ proof_contexts: ['WeaponPowerUp'],
18
+ proof_cypher: 'MATCH () RETURN 1',
19
+ tool_plan: [{ tool: 'query', input: { query: 'weapon powerup equip chain' } }],
20
+ live_task: {
21
+ objective: 'Investigate WeaponPowerUp from the provided asset seed and report the best supported runtime relation.',
22
+ symbol_seed: 'WeaponPowerUp',
23
+ resource_seed: 'Assets/NEON/DataAssets/Powerups/1_newWeapon/0_pick/法器_Orb/1_weapon_orb_key.asset',
24
+ },
25
+ semantic_tuple: {
26
+ resource_anchor: 'Assets/NEON/DataAssets/Powerups/1_newWeapon/0_pick/法器_Orb/1_weapon_orb_key.asset',
27
+ symbol_anchor: 'WeaponPowerUp',
28
+ proof_edges: [
29
+ 'HoldPickup -> WeaponPowerUp.PickItUp',
30
+ 'EquipWithEvent -> WeaponPowerUp.Equip',
31
+ ],
32
+ closure_status: 'not_verified_full',
33
+ },
34
+ },
35
+ reload: {
36
+ label: 'reload',
37
+ start_query: 'reload getvalue checkreload',
38
+ retry_query: 'Gungraph_use/1_weapon_orb_key.asset ReloadBase GetValue CheckReload',
39
+ proof_contexts: ['ReloadBase'],
40
+ proof_cypher: 'MATCH () RETURN 1',
41
+ tool_plan: [{ tool: 'query', input: { query: 'reload getvalue checkreload' } }],
42
+ live_task: {
43
+ objective: 'Investigate ReloadBase from the provided graph asset seed and report the best supported reload relation.',
44
+ symbol_seed: 'ReloadBase',
45
+ resource_seed: 'Assets/NEON/Graphs/PlayerGun/Gungraph_use/1_weapon_orb_key.asset',
46
+ },
47
+ semantic_tuple: {
48
+ resource_anchor: 'Assets/NEON/Graphs/PlayerGun/Gungraph_use/1_weapon_orb_key.asset',
49
+ symbol_anchor: 'ReloadBase',
50
+ proof_edge: 'ReloadBase.GetValue -> ReloadBase.CheckReload',
51
+ closure_status: 'not_verified_full',
52
+ },
53
+ },
54
+ },
55
+ };
56
+ test('benchmark report includes explicit benchmark tracks', async () => {
57
+ const report = await runAgentSafeQueryContextBenchmark(fakeSuite, {
58
+ repo: 'neonspark-core',
59
+ subagentRunsDir: '/tmp/subagent-runs',
60
+ }, {
61
+ runner: {
62
+ query: async (input) => {
63
+ const queryText = String(input?.query || '');
64
+ if (/reload|ReloadBase|CheckReload/.test(queryText)) {
65
+ return {
66
+ candidates: [{ name: 'ReloadBase' }],
67
+ resource_hints: [{ path: 'Assets/NEON/Graphs/PlayerGun/Gungraph_use/1_weapon_orb_key.asset' }],
68
+ };
69
+ }
70
+ return {
71
+ candidates: [{ name: 'WeaponPowerUp' }],
72
+ resource_hints: [{ path: 'Assets/NEON/DataAssets/Powerups/1_newWeapon/0_pick/法器_Orb/1_weapon_orb_key.asset' }],
73
+ };
74
+ },
75
+ context: async (input) => ({ symbol: { name: String(input?.name || 'WeaponPowerUp') } }),
76
+ impact: async () => ({ impactedCount: 0 }),
77
+ cypher: async (input) => {
78
+ const queryText = String(input?.query || '');
79
+ if (queryText.includes('CheckReload') || queryText.includes('GetValue')) {
80
+ return { row_count: 1, rows: [{ src: 'GetValue', dst: 'CheckReload' }] };
81
+ }
82
+ return {
83
+ row_count: 2,
84
+ rows: [
85
+ { src: 'HoldPickup', dst: 'PickItUp' },
86
+ { src: 'EquipWithEvent', dst: 'Equip' },
87
+ ],
88
+ };
89
+ },
90
+ close: async () => { },
91
+ },
92
+ executeToolPlan: async (plan) => plan.map((step) => ({
93
+ tool: step.tool,
94
+ input: step.input,
95
+ output: {
96
+ anchor: 'Assets/NEON/DataAssets/Powerups/1_newWeapon/0_pick/法器_Orb/1_weapon_orb_key.asset',
97
+ symbol: 'WeaponPowerUp',
98
+ proof: 'HoldPickup -> WeaponPowerUp.PickItUp',
99
+ },
100
+ })),
101
+ loadSubagentLiveCaseResult: async (_runDir, benchmarkCase) => ({
102
+ prompt: 'Use only telemetry-tool.js\nFinal JSON schema:',
103
+ prompt_path: '/tmp/prompt.txt',
104
+ result_path: '/tmp/result.json',
105
+ telemetry_path: '/tmp/telemetry.jsonl',
106
+ final_result: {},
107
+ steps: [{
108
+ tool: 'query',
109
+ input: { query: benchmarkCase.start_query },
110
+ output: { value: benchmarkCase.semantic_tuple.resource_anchor },
111
+ durationMs: 1,
112
+ totalTokensEst: 10,
113
+ timestamp: '2026-04-08T00:00:00.000Z',
114
+ }],
115
+ semantic_tuple: benchmarkCase.semantic_tuple,
116
+ normalized_tuple_pass: true,
117
+ evidence_validation_pass: true,
118
+ failure_class: undefined,
119
+ semantic_tuple_pass: true,
120
+ tool_calls_to_completion: 1,
121
+ tokens_to_completion: 10,
122
+ stop_reason: 'semantic_tuple_satisfied',
123
+ }),
124
+ });
125
+ assert.equal(report.cases.weapon_powerup.semantic_tuple_pass, true);
126
+ assert.ok(report.same_script.tool_plan.weapon_powerup);
127
+ assert.ok(report.subagent_live.reload.steps);
128
+ assert.ok(report.token_summary.weapon_powerup);
129
+ assert.ok(report.call_summary.reload);
130
+ assert.ok(report.workflow_replay_full.weapon_powerup);
131
+ assert.ok(report.workflow_replay_slim.weapon_powerup);
132
+ assert.ok(report.same_script_full.reload);
133
+ assert.ok(report.same_script_slim.reload);
134
+ assert.ok(report.subagent_live.weapon_powerup);
135
+ assert.equal(report.workflow_replay_slim.weapon_powerup.semantic_tuple_pass, true);
136
+ assert.equal(typeof report.workflow_replay_slim.weapon_powerup.anchor_top1_pass, 'boolean');
137
+ assert.equal(typeof report.workflow_replay_slim.weapon_powerup.recommended_follow_up_hit, 'boolean');
138
+ assert.equal(typeof report.workflow_replay_slim.weapon_powerup.post_narrowing_anchor_pass, 'boolean');
139
+ assert.equal(typeof report.workflow_replay_slim.weapon_powerup.post_narrowing_follow_up_hit, 'boolean');
140
+ assert.equal(typeof report.workflow_replay_slim.weapon_powerup.ambiguity_detour_count, 'number');
141
+ assert.equal(report.workflow_replay_slim.reload.guid_invariance_pass, true);
142
+ assert.equal(report.workflow_replay_slim.weapon_powerup.live_tool_evidence_pass, true);
143
+ assert.equal(report.acceptance.pass, report.workflow_replay_slim.weapon_powerup.semantic_tuple_pass
144
+ && report.workflow_replay_slim.weapon_powerup.post_narrowing_anchor_pass
145
+ && report.workflow_replay_slim.weapon_powerup.post_narrowing_follow_up_hit
146
+ && report.workflow_replay_slim.weapon_powerup.guid_invariance_pass
147
+ && report.workflow_replay_slim.weapon_powerup.live_tool_evidence_pass
148
+ && report.workflow_replay_slim.weapon_powerup.freeze_ready
149
+ && report.workflow_replay_slim.weapon_powerup.tier_envelope.facts_present
150
+ && report.workflow_replay_slim.weapon_powerup.tier_envelope.closure_present
151
+ && report.workflow_replay_slim.weapon_powerup.tier_envelope.clues_present
152
+ && report.workflow_replay_slim.weapon_powerup.tier_envelope.semantic_order_pass
153
+ && !report.workflow_replay_slim.weapon_powerup.placeholder_leak_detected
154
+ && !report.workflow_replay_slim.weapon_powerup.heuristic_top_summary_detected
155
+ && report.workflow_replay_slim.reload.semantic_tuple_pass
156
+ && report.workflow_replay_slim.reload.post_narrowing_anchor_pass
157
+ && report.workflow_replay_slim.reload.post_narrowing_follow_up_hit
158
+ && report.workflow_replay_slim.reload.guid_invariance_pass
159
+ && report.workflow_replay_slim.reload.live_tool_evidence_pass
160
+ && report.workflow_replay_slim.reload.freeze_ready
161
+ && report.workflow_replay_slim.reload.tier_envelope.facts_present
162
+ && report.workflow_replay_slim.reload.tier_envelope.closure_present
163
+ && report.workflow_replay_slim.reload.tier_envelope.clues_present
164
+ && report.workflow_replay_slim.reload.tier_envelope.semantic_order_pass
165
+ && !report.workflow_replay_slim.reload.placeholder_leak_detected
166
+ && !report.workflow_replay_slim.reload.heuristic_top_summary_detected);
167
+ assert.equal(report.pass, report.acceptance.pass);
168
+ });
169
+ test('benchmark report enforces track split, acceptance source, prompt secrecy, and live scoring taxonomy', async () => {
170
+ const report = await runAgentSafeQueryContextBenchmark(fakeSuite, {
171
+ repo: 'neonspark-core',
172
+ subagentRunsDir: '/tmp/subagent-runs',
173
+ }, {
174
+ runner: {
175
+ query: async (input) => {
176
+ const queryText = String(input?.query || '');
177
+ if (/reload|ReloadBase|CheckReload/.test(queryText)) {
178
+ return {
179
+ candidates: [{ name: 'ReloadBase' }],
180
+ resource_hints: [{ path: 'Assets/NEON/Graphs/PlayerGun/Gungraph_use/1_weapon_orb_key.asset' }],
181
+ };
182
+ }
183
+ return {
184
+ candidates: [{ name: 'WeaponPowerUp' }],
185
+ resource_hints: [{ path: 'Assets/NEON/DataAssets/Powerups/1_newWeapon/0_pick/法器_Orb/1_weapon_orb_key.asset' }],
186
+ };
187
+ },
188
+ context: async (input) => ({ symbol: { name: String(input?.name || 'WeaponPowerUp') } }),
189
+ impact: async () => ({ impactedCount: 0 }),
190
+ cypher: async (input) => {
191
+ const queryText = String(input?.query || '');
192
+ if (queryText.includes('CheckReload') || queryText.includes('GetValue')) {
193
+ return { row_count: 1, rows: [{ src: 'GetValue', dst: 'CheckReload' }] };
194
+ }
195
+ return {
196
+ row_count: 2,
197
+ rows: [
198
+ { src: 'HoldPickup', dst: 'PickItUp' },
199
+ { src: 'EquipWithEvent', dst: 'Equip' },
200
+ ],
201
+ };
202
+ },
203
+ close: async () => { },
204
+ },
205
+ executeToolPlan: async (plan) => plan.map((step) => ({
206
+ tool: step.tool,
207
+ input: step.input,
208
+ output: {
209
+ anchor: 'Assets/NEON/DataAssets/Powerups/1_newWeapon/0_pick/法器_Orb/1_weapon_orb_key.asset',
210
+ symbol: 'WeaponPowerUp',
211
+ proof: 'HoldPickup -> WeaponPowerUp.PickItUp',
212
+ },
213
+ })),
214
+ loadSubagentLiveCaseResult: async (_runDir, benchmarkCase) => ({
215
+ prompt: 'Use only telemetry-tool.js\nFinal JSON schema:',
216
+ prompt_path: '/tmp/prompt.txt',
217
+ result_path: '/tmp/result.json',
218
+ telemetry_path: '/tmp/telemetry.jsonl',
219
+ final_result: {},
220
+ steps: [{
221
+ tool: 'query',
222
+ input: { query: benchmarkCase.start_query },
223
+ output: { value: benchmarkCase.semantic_tuple.resource_anchor },
224
+ durationMs: 1,
225
+ totalTokensEst: 10,
226
+ timestamp: '2026-04-08T00:00:00.000Z',
227
+ }],
228
+ semantic_tuple: benchmarkCase.semantic_tuple,
229
+ normalized_tuple_pass: true,
230
+ evidence_validation_pass: true,
231
+ failure_class: undefined,
232
+ semantic_tuple_pass: true,
233
+ tool_calls_to_completion: 1,
234
+ tokens_to_completion: 10,
235
+ stop_reason: 'semantic_tuple_satisfied',
236
+ }),
237
+ });
238
+ assert.equal(Object.keys(report.workflow_replay_full).length > 0, true);
239
+ assert.equal(Object.keys(report.workflow_replay_slim).length > 0, true);
240
+ assert.equal(Object.keys(report.same_script_full).length > 0, true);
241
+ assert.equal(Object.keys(report.same_script_slim).length > 0, true);
242
+ assert.equal(Object.keys(report.subagent_live).length > 0, true);
243
+ assert.deepEqual(report.acceptance.cases, {
244
+ weapon_powerup: report.workflow_replay_slim.weapon_powerup.semantic_tuple_pass
245
+ && report.workflow_replay_slim.weapon_powerup.post_narrowing_anchor_pass
246
+ && report.workflow_replay_slim.weapon_powerup.post_narrowing_follow_up_hit
247
+ && report.workflow_replay_slim.weapon_powerup.guid_invariance_pass
248
+ && report.workflow_replay_slim.weapon_powerup.live_tool_evidence_pass
249
+ && report.workflow_replay_slim.weapon_powerup.freeze_ready
250
+ && report.workflow_replay_slim.weapon_powerup.tier_envelope.facts_present
251
+ && report.workflow_replay_slim.weapon_powerup.tier_envelope.closure_present
252
+ && report.workflow_replay_slim.weapon_powerup.tier_envelope.clues_present
253
+ && report.workflow_replay_slim.weapon_powerup.tier_envelope.semantic_order_pass
254
+ && !report.workflow_replay_slim.weapon_powerup.placeholder_leak_detected
255
+ && !report.workflow_replay_slim.weapon_powerup.heuristic_top_summary_detected,
256
+ reload: report.workflow_replay_slim.reload.semantic_tuple_pass
257
+ && report.workflow_replay_slim.reload.post_narrowing_anchor_pass
258
+ && report.workflow_replay_slim.reload.post_narrowing_follow_up_hit
259
+ && report.workflow_replay_slim.reload.guid_invariance_pass
260
+ && report.workflow_replay_slim.reload.live_tool_evidence_pass
261
+ && report.workflow_replay_slim.reload.freeze_ready
262
+ && report.workflow_replay_slim.reload.tier_envelope.facts_present
263
+ && report.workflow_replay_slim.reload.tier_envelope.closure_present
264
+ && report.workflow_replay_slim.reload.tier_envelope.clues_present
265
+ && report.workflow_replay_slim.reload.tier_envelope.semantic_order_pass
266
+ && !report.workflow_replay_slim.reload.placeholder_leak_detected
267
+ && !report.workflow_replay_slim.reload.heuristic_top_summary_detected,
268
+ });
269
+ assert.equal(report.subagent_live.weapon_powerup.prompt.includes('HoldPickup -> WeaponPowerUp.PickItUp'), false);
270
+ assert.equal(report.subagent_live.reload.prompt.includes('ReloadBase.GetValue -> ReloadBase.CheckReload'), false);
271
+ for (const row of Object.values(report.subagent_live)) {
272
+ assert.equal(typeof row.normalized_tuple_pass, 'boolean');
273
+ assert.equal(typeof row.evidence_validation_pass, 'boolean');
274
+ if (!row.semantic_tuple_pass) {
275
+ assert.ok(row.failure_class);
276
+ }
277
+ }
278
+ });
279
+ test('acceptance fails when semantic tuple passes but placeholder leakage is detected', async () => {
280
+ const report = await runAgentSafeQueryContextBenchmark(fakeSuite, {
281
+ repo: 'neonspark-core',
282
+ subagentRunsDir: '/tmp/subagent-runs',
283
+ }, {
284
+ runner: {
285
+ query: async (input) => {
286
+ const queryText = String(input?.query || '');
287
+ if (/reload|ReloadBase|CheckReload/.test(queryText)) {
288
+ return {
289
+ summary: 'ReloadBase flow',
290
+ decision: {
291
+ primary_candidate: 'ReloadBase',
292
+ recommended_follow_up: 'resource_path_prefix=Assets/NEON/Graphs/PlayerGun/Gungraph_use/1_weapon_orb_key.asset',
293
+ },
294
+ candidates: [{ name: 'ReloadBase' }],
295
+ resource_hints: [{ target: 'Assets/NEON/Graphs/PlayerGun/Gungraph_use/1_weapon_orb_key.asset' }],
296
+ };
297
+ }
298
+ return {
299
+ summary: 'WeaponPowerUp flow',
300
+ decision: {
301
+ primary_candidate: 'WeaponPowerUp',
302
+ recommended_follow_up: 'resource_path_prefix=Reload NEON.Game.Graph.Nodes.Reloads',
303
+ },
304
+ candidates: [{ name: 'WeaponPowerUp' }],
305
+ resource_hints: [{ target: 'Assets/NEON/DataAssets/Powerups/1_newWeapon/0_pick/法器_Orb/1_weapon_orb_key.asset' }],
306
+ };
307
+ },
308
+ context: async (input) => ({ symbol: { name: String(input?.name || 'WeaponPowerUp') } }),
309
+ impact: async () => ({ impactedCount: 0 }),
310
+ cypher: async (input) => {
311
+ const queryText = String(input?.query || '');
312
+ if (queryText.includes('CheckReload') || queryText.includes('GetValue')) {
313
+ return { row_count: 1, rows: [{ src: 'GetValue', dst: 'CheckReload' }] };
314
+ }
315
+ return {
316
+ row_count: 2,
317
+ rows: [
318
+ { src: 'HoldPickup', dst: 'PickItUp' },
319
+ { src: 'EquipWithEvent', dst: 'Equip' },
320
+ ],
321
+ };
322
+ },
323
+ close: async () => { },
324
+ },
325
+ executeToolPlan: async (plan) => plan.map((step) => ({
326
+ tool: step.tool,
327
+ input: step.input,
328
+ output: {
329
+ anchor: 'Assets/NEON/DataAssets/Powerups/1_newWeapon/0_pick/法器_Orb/1_weapon_orb_key.asset',
330
+ symbol: 'WeaponPowerUp',
331
+ proof: 'HoldPickup -> WeaponPowerUp.PickItUp',
332
+ },
333
+ })),
334
+ loadSubagentLiveCaseResult: async (_runDir, benchmarkCase) => ({
335
+ prompt: 'Use only telemetry-tool.js\nFinal JSON schema:',
336
+ prompt_path: '/tmp/prompt.txt',
337
+ result_path: '/tmp/result.json',
338
+ telemetry_path: '/tmp/telemetry.jsonl',
339
+ final_result: {},
340
+ steps: [{
341
+ tool: 'query',
342
+ input: { query: benchmarkCase.start_query },
343
+ output: { value: benchmarkCase.semantic_tuple.resource_anchor },
344
+ durationMs: 1,
345
+ totalTokensEst: 10,
346
+ timestamp: '2026-04-08T00:00:00.000Z',
347
+ }],
348
+ semantic_tuple: benchmarkCase.semantic_tuple,
349
+ normalized_tuple_pass: true,
350
+ evidence_validation_pass: true,
351
+ failure_class: undefined,
352
+ semantic_tuple_pass: true,
353
+ tool_calls_to_completion: 1,
354
+ tokens_to_completion: 10,
355
+ stop_reason: 'semantic_tuple_satisfied',
356
+ }),
357
+ });
358
+ assert.equal(report.workflow_replay_slim.weapon_powerup.semantic_tuple_pass, true);
359
+ assert.equal(report.workflow_replay_slim.weapon_powerup.placeholder_leak_detected, true);
360
+ assert.equal(report.acceptance.cases.weapon_powerup, false);
361
+ assert.equal(report.acceptance.pass, false);
362
+ });
@@ -0,0 +1,44 @@
1
+ import type { AgentContextToolRunner } from '../agent-context/tool-runner.js';
2
+ import type { AgentSafeBenchmarkCase, SemanticDriftMetrics, SemanticTuple } from './types.js';
3
+ export interface WorkflowReplayStep {
4
+ tool: 'query' | 'context' | 'cypher';
5
+ input: Record<string, unknown>;
6
+ output: unknown;
7
+ durationMs: number;
8
+ totalTokensEst: number;
9
+ }
10
+ export interface WorkflowReplayResult extends SemanticDriftMetrics {
11
+ steps: WorkflowReplayStep[];
12
+ base: {
13
+ primary_candidate: string;
14
+ recommended_follow_up: string;
15
+ };
16
+ guid_variant: {
17
+ primary_candidate: string;
18
+ recommended_follow_up: string;
19
+ };
20
+ confirmed_chain: {
21
+ steps: string[];
22
+ };
23
+ semantic_tuple: SemanticTuple;
24
+ semantic_tuple_pass: boolean;
25
+ tool_calls_to_completion: number;
26
+ tokens_to_completion: number;
27
+ retry_breakdown: {
28
+ query_retry_count: number;
29
+ context_retry_count: number;
30
+ cypher_retry_count: number;
31
+ };
32
+ stop_reason: 'semantic_tuple_satisfied' | 'max_steps_reached';
33
+ }
34
+ export type WorkflowReplayResponseProfile = 'full' | 'slim';
35
+ export declare function runWorkflowReplay(benchmarkCase: AgentSafeBenchmarkCase, runner: Pick<AgentContextToolRunner, 'query' | 'context' | 'cypher'>, options?: {
36
+ repo?: string;
37
+ maxSteps?: number;
38
+ responseProfile?: WorkflowReplayResponseProfile;
39
+ }): Promise<WorkflowReplayResult>;
40
+ export declare function runWorkflowReplayWithDefaultRunner(benchmarkCase: AgentSafeBenchmarkCase, options?: {
41
+ repo?: string;
42
+ maxSteps?: number;
43
+ responseProfile?: WorkflowReplayResponseProfile;
44
+ }): Promise<WorkflowReplayResult>;