@principles/core 1.160.0 → 1.162.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. package/dist/runtime-v2/__tests__/adversarial-loop.test.js +24 -78
  2. package/dist/runtime-v2/__tests__/adversarial-loop.test.js.map +1 -1
  3. package/dist/runtime-v2/__tests__/architecture-regression.test.js +8 -5
  4. package/dist/runtime-v2/__tests__/architecture-regression.test.js.map +1 -1
  5. package/dist/runtime-v2/__tests__/artificer-runner-vslice.test.js +32 -80
  6. package/dist/runtime-v2/__tests__/artificer-runner-vslice.test.js.map +1 -1
  7. package/dist/runtime-v2/__tests__/full-chain-real-llm.test.js +2 -2
  8. package/dist/runtime-v2/__tests__/full-chain-real-llm.test.js.map +1 -1
  9. package/dist/runtime-v2/__tests__/principle-compiler-core.test.js +7 -7
  10. package/dist/runtime-v2/__tests__/principle-compiler-core.test.js.map +1 -1
  11. package/dist/runtime-v2/activation/__tests__/production-gate-deps.test.js +24 -1
  12. package/dist/runtime-v2/activation/__tests__/production-gate-deps.test.js.map +1 -1
  13. package/dist/runtime-v2/activation/production-gate-deps.d.ts.map +1 -1
  14. package/dist/runtime-v2/activation/production-gate-deps.js +18 -1
  15. package/dist/runtime-v2/activation/production-gate-deps.js.map +1 -1
  16. package/dist/runtime-v2/adapter/__tests__/artificer-l2-adapter.test.js +272 -442
  17. package/dist/runtime-v2/adapter/__tests__/artificer-l2-adapter.test.js.map +1 -1
  18. package/dist/runtime-v2/adapter/artificer-l2-adapter.d.ts +14 -34
  19. package/dist/runtime-v2/adapter/artificer-l2-adapter.d.ts.map +1 -1
  20. package/dist/runtime-v2/adapter/artificer-l2-adapter.js +182 -220
  21. package/dist/runtime-v2/adapter/artificer-l2-adapter.js.map +1 -1
  22. package/dist/runtime-v2/adapter/pi-ai-runtime-adapter.js +2 -2
  23. package/dist/runtime-v2/adapter/pi-ai-runtime-adapter.js.map +1 -1
  24. package/dist/runtime-v2/adversarial-loop.d.ts.map +1 -1
  25. package/dist/runtime-v2/adversarial-loop.js +5 -27
  26. package/dist/runtime-v2/adversarial-loop.js.map +1 -1
  27. package/dist/runtime-v2/golden-trace-replay-validator.d.ts +8 -0
  28. package/dist/runtime-v2/golden-trace-replay-validator.d.ts.map +1 -1
  29. package/dist/runtime-v2/golden-trace-replay-validator.js +3 -3
  30. package/dist/runtime-v2/golden-trace-replay-validator.js.map +1 -1
  31. package/dist/runtime-v2/golden-trace.d.ts +16 -1
  32. package/dist/runtime-v2/golden-trace.d.ts.map +1 -1
  33. package/dist/runtime-v2/golden-trace.js +13 -4
  34. package/dist/runtime-v2/golden-trace.js.map +1 -1
  35. package/dist/runtime-v2/index.d.ts +8 -5
  36. package/dist/runtime-v2/index.d.ts.map +1 -1
  37. package/dist/runtime-v2/index.js +11 -4
  38. package/dist/runtime-v2/index.js.map +1 -1
  39. package/dist/runtime-v2/internalization/__tests__/artificer-rule-output.test.d.ts +2 -0
  40. package/dist/runtime-v2/internalization/__tests__/artificer-rule-output.test.d.ts.map +1 -0
  41. package/dist/runtime-v2/internalization/__tests__/{artificer-output-v2.test.js → artificer-rule-output.test.js} +126 -127
  42. package/dist/runtime-v2/internalization/__tests__/artificer-rule-output.test.js.map +1 -0
  43. package/dist/runtime-v2/internalization/__tests__/rule-code-dialect.test.d.ts +2 -0
  44. package/dist/runtime-v2/internalization/__tests__/rule-code-dialect.test.d.ts.map +1 -0
  45. package/dist/runtime-v2/internalization/__tests__/rule-code-dialect.test.js +270 -0
  46. package/dist/runtime-v2/internalization/__tests__/rule-code-dialect.test.js.map +1 -0
  47. package/dist/runtime-v2/internalization/__tests__/rule-host-input-builder.test.d.ts +2 -0
  48. package/dist/runtime-v2/internalization/__tests__/rule-host-input-builder.test.d.ts.map +1 -0
  49. package/dist/runtime-v2/internalization/__tests__/rule-host-input-builder.test.js +180 -0
  50. package/dist/runtime-v2/internalization/__tests__/rule-host-input-builder.test.js.map +1 -0
  51. package/dist/runtime-v2/internalization/artificer-output.d.ts +33 -51
  52. package/dist/runtime-v2/internalization/artificer-output.d.ts.map +1 -1
  53. package/dist/runtime-v2/internalization/artificer-output.js +48 -87
  54. package/dist/runtime-v2/internalization/artificer-output.js.map +1 -1
  55. package/dist/runtime-v2/internalization/artificer-runner.d.ts +8 -8
  56. package/dist/runtime-v2/internalization/artificer-runner.d.ts.map +1 -1
  57. package/dist/runtime-v2/internalization/artificer-runner.js +5 -5
  58. package/dist/runtime-v2/internalization/artificer-runner.js.map +1 -1
  59. package/dist/runtime-v2/internalization/evaluator-runner.js +2 -2
  60. package/dist/runtime-v2/internalization/index.d.ts +9 -4
  61. package/dist/runtime-v2/internalization/index.d.ts.map +1 -1
  62. package/dist/runtime-v2/internalization/index.js +8 -3
  63. package/dist/runtime-v2/internalization/index.js.map +1 -1
  64. package/dist/runtime-v2/internalization/rule-code-validator.d.ts +16 -0
  65. package/dist/runtime-v2/internalization/rule-code-validator.d.ts.map +1 -1
  66. package/dist/runtime-v2/internalization/rule-code-validator.js +50 -1
  67. package/dist/runtime-v2/internalization/rule-code-validator.js.map +1 -1
  68. package/dist/runtime-v2/internalization/rule-host-evaluator.d.ts +1 -0
  69. package/dist/runtime-v2/internalization/rule-host-evaluator.d.ts.map +1 -1
  70. package/dist/runtime-v2/internalization/rule-host-evaluator.js +6 -2
  71. package/dist/runtime-v2/internalization/rule-host-evaluator.js.map +1 -1
  72. package/dist/runtime-v2/internalization/rule-host-input-builder.d.ts +62 -0
  73. package/dist/runtime-v2/internalization/rule-host-input-builder.d.ts.map +1 -0
  74. package/dist/runtime-v2/internalization/rule-host-input-builder.js +182 -0
  75. package/dist/runtime-v2/internalization/rule-host-input-builder.js.map +1 -0
  76. package/dist/runtime-v2/internalization/rule-host-validator.d.ts.map +1 -1
  77. package/dist/runtime-v2/internalization/rule-host-validator.js +22 -1
  78. package/dist/runtime-v2/internalization/rule-host-validator.js.map +1 -1
  79. package/dist/runtime-v2/internalization/template-generator.d.ts +7 -2
  80. package/dist/runtime-v2/internalization/template-generator.d.ts.map +1 -1
  81. package/dist/runtime-v2/internalization/template-generator.js +10 -5
  82. package/dist/runtime-v2/internalization/template-generator.js.map +1 -1
  83. package/dist/runtime-v2/tools/__tests__/artificer-l2-tool-contract.test.d.ts +2 -0
  84. package/dist/runtime-v2/tools/__tests__/artificer-l2-tool-contract.test.d.ts.map +1 -0
  85. package/dist/runtime-v2/tools/__tests__/artificer-l2-tool-contract.test.js +322 -0
  86. package/dist/runtime-v2/tools/__tests__/artificer-l2-tool-contract.test.js.map +1 -0
  87. package/dist/runtime-v2/tools/__tests__/artificer-output-typebox.test.d.ts +2 -0
  88. package/dist/runtime-v2/tools/__tests__/artificer-output-typebox.test.d.ts.map +1 -0
  89. package/dist/runtime-v2/tools/__tests__/artificer-output-typebox.test.js +149 -0
  90. package/dist/runtime-v2/tools/__tests__/artificer-output-typebox.test.js.map +1 -0
  91. package/dist/runtime-v2/tools/artificer-l2-tool-contract.d.ts +72 -0
  92. package/dist/runtime-v2/tools/artificer-l2-tool-contract.d.ts.map +1 -0
  93. package/dist/runtime-v2/tools/artificer-l2-tool-contract.js +275 -0
  94. package/dist/runtime-v2/tools/artificer-l2-tool-contract.js.map +1 -0
  95. package/dist/runtime-v2/tools/artificer-output-typebox.d.ts +78 -0
  96. package/dist/runtime-v2/tools/artificer-output-typebox.d.ts.map +1 -0
  97. package/dist/runtime-v2/tools/artificer-output-typebox.js +70 -0
  98. package/dist/runtime-v2/tools/artificer-output-typebox.js.map +1 -0
  99. package/dist/telemetry-event.d.ts +2 -2
  100. package/dist/telemetry-event.d.ts.map +1 -1
  101. package/dist/telemetry-event.js +5 -3
  102. package/dist/telemetry-event.js.map +1 -1
  103. package/package.json +1 -1
  104. package/dist/runtime-v2/internalization/__tests__/artificer-output-v2.test.d.ts +0 -2
  105. package/dist/runtime-v2/internalization/__tests__/artificer-output-v2.test.d.ts.map +0 -1
  106. package/dist/runtime-v2/internalization/__tests__/artificer-output-v2.test.js.map +0 -1
@@ -1,64 +1,73 @@
1
1
  /**
2
- * ArtificerL2Adapter tests (RuleHost MVP Activation, ADR-0014 Amendment 2026-06-17,
3
- * PRD Decision 8, test module 7).
2
+ * ArtificerL2Adapter tests (PRI-439 Phase 4 — tool-using L2 agent).
4
3
  *
5
- * TDD Phase 4.1 RED asserts behavior not yet implemented in
6
- * artificer-l2-adapter.ts.
4
+ * Mocks runAgentLoop (no real LLM calls) to verify the adapter's orchestration:
5
+ * - submit_rulecode capture terminates the loop and stores the output
6
+ * - maxTurns cap forces stop when submit_rulecode is never called
7
+ * - beforeToolCall whitelist blocks non-allowlisted tools
8
+ * - shouldStopAfterTurn checks output capture + turn count
9
+ * - no V1/L1 fallback: exhaustion throws PDRuntimeError
10
+ * - timeout: abort signal triggers timed_out failure
11
+ * - telemetry events (artificer_l2_turn / artificer_l2_complete) are emitted
7
12
  *
8
- * The adapter encapsulates a write-test-fix loop (generate code → sandbox replay →
9
- * inject RefinerSandboxFailedCase[] feedback → regenerate, max 3 attempts) inside
10
- * a PDRuntimeAdapter. BasePeerRunner sees a single startRun(); the loop is invisible
11
- * to it. This follows the Dreamer L2 precedent (L2AgentLoopAdapter) of putting the
12
- * multi-attempt logic in the adapter, not in succeedTask().
13
- *
14
- * Testability: LLM calls are mocked via an injected `generateCode` function.
15
- * Sandbox replay uses real evaluateRefinerRuleHostGate with a controllable
16
- * RefinerRuleHostGateDeps. No real LLM calls.
17
- *
18
- * Coverage (PRD test module 7):
19
- * - happy path: 1st attempt passes replay → V2 output (1 LLM call)
20
- * - fix path: 1st attempt fails → feedback injected → 2nd passes → V2 (2 LLM calls)
21
- * - exhaustion: 3 attempts all fail → V1 degraded output (no code fields)
22
- * - error types: forbidden_pattern / runtime_error / timeout / validation_failed
23
- * - V1 backward compat: degraded V1 output is NOT detected as V2 by isArtificerOutputV2
24
- *
25
- * ERR checklist (EP-05 Loop State Freshness): each attempt reads fresh sandbox
26
- * errors; the feedback injected into attempt N+1 is from attempt N's failure,
27
- * never stale. (ERR-015/018/019)
13
+ * ERR checklist:
14
+ * - EP-05 Loop State Freshness: each startRun uses fresh outputCapture + turnCount
15
+ * - EP-03 Fail Loud: exhaustion throws PDRuntimeError with structured nextAction
16
+ * - EP-01 Trust Boundary: submit_rulecode validates via injected validator
28
17
  */
29
- import { describe, it, expect } from 'vitest';
18
+ import { describe, it, expect, vi, beforeEach } from 'vitest';
19
+ const hoisted = vi.hoisted(() => {
20
+ return {
21
+ lastLoopConfig: {},
22
+ mockReturn: [],
23
+ impl: null,
24
+ };
25
+ });
26
+ /* eslint-disable @typescript-eslint/max-params -- runAgentLoop mock mirrors the real 5-param signature */
27
+ vi.mock('@earendil-works/pi-agent-core', () => ({
28
+ runAgentLoop: vi.fn(async (prompts, context, config, emit, signal) => {
29
+ hoisted.lastLoopConfig = config;
30
+ if (typeof hoisted.impl === 'function') {
31
+ const fn = hoisted.impl;
32
+ return fn(prompts, context, config, emit, signal);
33
+ }
34
+ return hoisted.mockReturn.slice();
35
+ }),
36
+ }));
37
+ /* eslint-enable @typescript-eslint/max-params */
38
+ // Mock resolveL2Model's pi-ai dependencies (getModel/getProviders) — the adapter
39
+ // uses the custom baseUrl path so these stubs are never called for real.
40
+ vi.mock('@earendil-works/pi-ai', () => ({
41
+ completeSimple: vi.fn(),
42
+ getModel: vi.fn(() => ({ id: 'test', name: 'test', api: 'openai-completions', provider: 'test-provider' })),
43
+ getProviders: vi.fn(() => []),
44
+ }));
45
+ vi.mock('../../store/event-emitter.js', () => ({
46
+ storeEmitter: { emitTelemetry: vi.fn() },
47
+ }));
48
+ import { storeEmitter } from '../../store/event-emitter.js';
30
49
  import { ArtificerL2Adapter } from '../artificer-l2-adapter.js';
31
- import { isArtificerOutputV2, DefaultArtificerValidator } from '../../internalization/artificer-output.js';
32
- import { validateGoldenTrace } from '../../golden-trace.js';
33
- import { Value } from '@sinclair/typebox/value';
34
- import { RunHandleSchema, RuntimeKindSchema } from '../../runtime-protocol.js';
50
+ import { DefaultArtificerValidator } from '../../internalization/artificer-output.js';
51
+ const emitTelemetryMock = storeEmitter.emitTelemetry;
35
52
  const TASK_ID = 'task-artificer-l2-001';
36
- /** A valid V2 output the LLM might produce. */
37
- function makeV2Output(overrides = {}) {
53
+ /** A valid ArtificerRuleOutput the model might submit via submit_rulecode. */
54
+ function makeRuleOutput(overrides = {}) {
38
55
  return {
39
56
  taskId: TASK_ID,
40
- sourceScribeArtifactId: 'pi-art-scribe-001-run-001',
41
- implementationPlan: {
42
- summary: 'Block writes to system dirs',
43
- targetSurface: 'edit gate',
44
- changes: ['path prefix check'],
45
- tests: ['golden trace replay'],
46
- rolloutNotes: ['shadow first'],
47
- confidence: 0.8,
48
- },
49
- sourceTrace: { scribeArtifactId: 'pi-art-scribe-001-run-001' },
50
- risks: [],
51
- generatedAt: '2026-06-17T00:00:00.000Z',
57
+ sourceScribeArtifactId: 'pi-art-scribe-001',
52
58
  implementationCode: 'function evaluate(input, helpers) { return { decision: "allow", matched: false, reason: "ok" }; }',
53
59
  goldenTraceCases: [
54
60
  { caseId: 'negative-1', kind: 'negative', toolName: 'edit', params: { path: '/etc/x' }, expectedDecision: 'block' },
55
61
  { caseId: 'positive-1', kind: 'positive', toolName: 'read', params: { path: '/tmp/y' }, expectedDecision: 'allow' },
56
62
  ],
57
63
  affectedTools: ['edit'],
64
+ implementationSummary: 'Block writes to system dirs',
65
+ risks: [],
66
+ sourceTrace: { scribeArtifactId: 'pi-art-scribe-001' },
67
+ generatedAt: '2026-06-17T00:00:00.000Z',
58
68
  ...overrides,
59
69
  };
60
70
  }
61
- /** Build a gateDeps whose sandbox always accepts (replay passes). */
62
71
  function makeAlwaysPassGateDeps() {
63
72
  const passingResult = {
64
73
  success: true,
@@ -70,441 +79,262 @@ function makeAlwaysPassGateDeps() {
70
79
  evaluateInSandbox: (_code, _trace, _opts) => passingResult,
71
80
  };
72
81
  }
73
- /**
74
- * Build a gateDeps whose sandbox fails N times then passes.
75
- * Each failure carries a distinct RefinerSandboxFailedCase so tests can assert
76
- * that the RIGHT feedback was injected into the next attempt (EP-05 freshness).
77
- */
78
- function makeFailNTimesGateDeps(failures) {
79
- const calls = [];
80
- let attempt = 0;
81
- const deps = {
82
- evaluateInSandbox: (code, _trace, _opts) => {
83
- calls.push({ code });
84
- const result = failures[attempt] ?? { success: true, failedCases: [], executionTimeMs: 1, forbiddenPatternViolations: [] };
85
- attempt += 1;
86
- return result;
87
- },
82
+ function makeStartRun(overrides = {}) {
83
+ return {
84
+ agentSpec: { agentId: 'artificer', schemaVersion: 'v1' },
85
+ taskRef: { taskId: TASK_ID },
86
+ inputPayload: 'initial prompt',
87
+ contextItems: [],
88
+ outputSchemaRef: 'artificer-output-v2',
89
+ timeoutMs: 300_000,
90
+ ...overrides,
88
91
  };
89
- return { deps, calls };
90
92
  }
91
- const FAILED_FORBIDDEN = {
92
- success: false,
93
- failedCases: [{ caseId: '__sandbox__', errorType: 'forbidden_pattern', message: 'require() detected' }],
94
- executionTimeMs: 1,
95
- forbiddenPatternViolations: ['require'],
96
- };
97
- const FAILED_RUNTIME = {
98
- success: false,
99
- failedCases: [{ caseId: 'negative-1', errorType: 'runtime_error', message: 'TypeError: x is undefined' }],
100
- executionTimeMs: 1,
101
- forbiddenPatternViolations: [],
102
- };
103
- const FAILED_TIMEOUT = {
104
- success: false,
105
- failedCases: [{ caseId: 'negative-1', errorType: 'timeout', message: 'exceeded 1000ms' }],
106
- executionTimeMs: 1001,
107
- forbiddenPatternViolations: [],
108
- };
109
- const FAILED_VALIDATION = {
110
- success: false,
111
- failedCases: [{ caseId: 'negative-1', errorType: 'validation_failed', message: 'expected block got allow' }],
112
- executionTimeMs: 1,
113
- forbiddenPatternViolations: [],
114
- };
115
- describe('ArtificerL2Adapter (RuleHost MVP Activation, PRI-424)', () => {
116
- // ── happy path ─────────────────────────────────────────────────────────────
117
- it('returns V2 output on 1st attempt when sandbox replay passes (1 LLM call)', async () => {
118
- const generateCalls = [];
119
- const generateCode = async (prompt) => {
120
- generateCalls.push(prompt);
121
- return makeV2Output();
122
- };
123
- const adapter = new ArtificerL2Adapter({
124
- generateCode,
125
- gateDeps: makeAlwaysPassGateDeps(),
126
- validator: new DefaultArtificerValidator(),
127
- });
128
- const handle = await adapter.startRun({
129
- agentSpec: { agentId: 'artificer', schemaVersion: 'v1' },
130
- taskRef: { taskId: TASK_ID },
131
- inputPayload: 'initial prompt',
132
- contextItems: [],
133
- outputSchemaRef: 'artificer-output-v2',
134
- timeoutMs: 300_000,
135
- });
136
- expect(generateCalls).toHaveLength(1);
137
- const output = await adapter.fetchOutput(handle.runId);
138
- expect(output).not.toBeNull();
139
- if (!output)
140
- return;
141
- expect(isArtificerOutputV2(output.payload)).toBe(true);
93
+ function makeAdapter(overrides = {}) {
94
+ return new ArtificerL2Adapter({
95
+ provider: 'test-provider',
96
+ model: 'test-model',
97
+ apiKeyEnv: 'TEST_API_KEY',
98
+ baseUrl: 'http://localhost:1234/v1',
99
+ gateDeps: overrides.gateDeps ?? makeAlwaysPassGateDeps(),
100
+ validator: new DefaultArtificerValidator(),
101
+ maxTurns: overrides.maxTurns,
102
+ totalBudgetMs: overrides.totalBudgetMs ?? 60_000,
103
+ maxTokens: overrides.maxTokens,
142
104
  });
143
- // ── fix path ───────────────────────────────────────────────────────────────
144
- it('injects sandbox failure feedback into 2nd attempt and returns V2 when it passes (2 LLM calls)', async () => {
145
- const generateCalls = [];
146
- const generateCode = async (prompt) => {
147
- generateCalls.push(prompt);
148
- return makeV2Output();
105
+ }
106
+ beforeEach(() => {
107
+ vi.clearAllMocks();
108
+ hoisted.mockReturn = [];
109
+ hoisted.impl = null;
110
+ hoisted.lastLoopConfig = {};
111
+ process.env.TEST_API_KEY = 'test-key';
112
+ });
113
+ // ── submit_rulecode capture (primary extraction) ─────────────────────────────
114
+ describe('PRI-439 ArtificerL2Adapter — submit_rulecode capture', () => {
115
+ it('returns the captured output when submit_rulecode was called', async () => {
116
+ const adapter = makeAdapter();
117
+ hoisted.impl = async (_p, context) => {
118
+ const submit = context.tools?.find((t) => t.name === 'submit_rulecode');
119
+ if (submit) {
120
+ await submit.execute('call-1', makeRuleOutput());
121
+ }
122
+ return [];
149
123
  };
150
- const { deps } = makeFailNTimesGateDeps([FAILED_RUNTIME]);
151
- const adapter = new ArtificerL2Adapter({
152
- generateCode,
153
- gateDeps: deps,
154
- validator: new DefaultArtificerValidator(),
155
- });
156
- const handle = await adapter.startRun({
157
- agentSpec: { agentId: 'artificer', schemaVersion: 'v1' },
158
- taskRef: { taskId: TASK_ID },
159
- inputPayload: 'initial prompt',
160
- contextItems: [],
161
- outputSchemaRef: 'artificer-output-v2',
162
- timeoutMs: 300_000,
163
- });
164
- expect(generateCalls).toHaveLength(2);
165
- // 2nd prompt MUST contain the failure feedback from attempt 1 (EP-05 freshness).
166
- expect(generateCalls[1]).toContain('TypeError: x is undefined');
124
+ const handle = await adapter.startRun(makeStartRun());
167
125
  const output = await adapter.fetchOutput(handle.runId);
168
126
  expect(output).not.toBeNull();
169
- if (!output)
170
- return;
171
- expect(isArtificerOutputV2(output.payload)).toBe(true);
127
+ expect(output?.payload).toEqual(makeRuleOutput());
172
128
  });
173
- // ── exhaustion → V1 degradation ────────────────────────────────────────────
174
- it('degrades to V1 output (no code fields) when all 3 attempts fail (3 LLM calls)', async () => {
175
- const generateCalls = [];
176
- const generateCode = async (prompt) => {
177
- generateCalls.push(prompt);
178
- return makeV2Output();
129
+ it('shouldStopAfterTurn returns true after output is captured', async () => {
130
+ const adapter = makeAdapter({ maxTurns: 8 });
131
+ hoisted.impl = async (_p, context) => {
132
+ const submit = context.tools?.find((t) => t.name === 'submit_rulecode');
133
+ if (submit) {
134
+ await submit.execute('call-1', makeRuleOutput());
135
+ }
136
+ return [];
179
137
  };
180
- const { deps } = makeFailNTimesGateDeps([FAILED_RUNTIME, FAILED_RUNTIME, FAILED_RUNTIME]);
181
- const adapter = new ArtificerL2Adapter({
182
- generateCode,
183
- gateDeps: deps,
184
- validator: new DefaultArtificerValidator(),
185
- });
186
- const handle = await adapter.startRun({
187
- agentSpec: { agentId: 'artificer', schemaVersion: 'v1' },
188
- taskRef: { taskId: TASK_ID },
189
- inputPayload: 'initial prompt',
190
- contextItems: [],
191
- outputSchemaRef: 'artificer-output-v2',
192
- timeoutMs: 300_000,
193
- });
194
- expect(generateCalls).toHaveLength(3);
195
- const output = await adapter.fetchOutput(handle.runId);
196
- expect(output).not.toBeNull();
197
- if (!output)
138
+ await adapter.startRun(makeStartRun());
139
+ const stopFn = hoisted.lastLoopConfig.shouldStopAfterTurn;
140
+ expect(typeof stopFn).toBe('function');
141
+ if (!stopFn)
198
142
  return;
199
- // Degraded output must NOT be detected as V2 — downstream Evaluator skips code review.
200
- expect(isArtificerOutputV2(output.payload)).toBe(false);
201
- // V1 fields preserved (plan, lineage) so principle artifact path still works.
202
- expect(output.payload).toHaveProperty('implementationPlan');
143
+ // After submit_rulecode captured output, the next shouldStopAfterTurn call returns true.
144
+ expect(stopFn()).toBe(true);
203
145
  });
204
- it('degraded V1 output still passes the V1 validator (principle artifact path intact)', async () => {
205
- const generateCode = async () => makeV2Output();
206
- const { deps } = makeFailNTimesGateDeps([FAILED_RUNTIME, FAILED_RUNTIME, FAILED_RUNTIME]);
207
- const validator = new DefaultArtificerValidator();
208
- const adapter = new ArtificerL2Adapter({
209
- generateCode,
210
- gateDeps: deps,
211
- validator,
212
- });
213
- const handle = await adapter.startRun({
214
- agentSpec: { agentId: 'artificer', schemaVersion: 'v1' },
215
- taskRef: { taskId: TASK_ID },
216
- inputPayload: 'initial prompt',
217
- contextItems: [],
218
- outputSchemaRef: 'artificer-output-v2',
219
- timeoutMs: 300_000,
146
+ });
147
+ // ── maxTurns cap ─────────────────────────────────────────────────────────────
148
+ describe('PRI-439 ArtificerL2Adapter maxTurns cap', () => {
149
+ it('shouldStopAfterTurn returns false below maxTurns and true at/above, WITHOUT submit_rulecode', async () => {
150
+ const adapter = makeAdapter({ maxTurns: 5 });
151
+ hoisted.mockReturn = [
152
+ { role: 'assistant', content: 'thinking...' },
153
+ ];
154
+ await adapter.startRun(makeStartRun()).catch(() => {
155
+ // startRun throws when no output is captured — that's expected here.
220
156
  });
221
- const output = await adapter.fetchOutput(handle.runId);
222
- expect(output).not.toBeNull();
223
- if (!output)
157
+ const stopFn = hoisted.lastLoopConfig.shouldStopAfterTurn;
158
+ if (!stopFn) {
159
+ expect.fail('shouldStopAfterTurn not wired');
224
160
  return;
225
- const result = await validator.validate(output.payload, TASK_ID);
226
- expect(result.valid).toBe(true);
161
+ }
162
+ expect(stopFn()).toBe(false); // turn 1
163
+ expect(stopFn()).toBe(false); // turn 2
164
+ expect(stopFn()).toBe(false); // turn 3
165
+ expect(stopFn()).toBe(false); // turn 4
166
+ expect(stopFn()).toBe(true); // turn 5 (>= maxTurns)
167
+ expect(stopFn()).toBe(true); // turn 6 (still >= maxTurns)
227
168
  });
228
- // ── error type coverage ────────────────────────────────────────────────────
229
- it('handles forbidden_pattern failure and injects it as feedback', async () => {
230
- const generateCalls = [];
231
- const generateCode = async (prompt) => {
232
- generateCalls.push(prompt);
233
- return makeV2Output();
234
- };
235
- const { deps } = makeFailNTimesGateDeps([FAILED_FORBIDDEN]);
236
- const adapter = new ArtificerL2Adapter({
237
- generateCode,
238
- gateDeps: deps,
239
- validator: new DefaultArtificerValidator(),
240
- });
241
- await adapter.startRun({
242
- agentSpec: { agentId: 'artificer', schemaVersion: 'v1' },
243
- taskRef: { taskId: TASK_ID },
244
- inputPayload: 'initial prompt',
245
- contextItems: [],
246
- outputSchemaRef: 'artificer-output-v2',
247
- timeoutMs: 300_000,
169
+ });
170
+ // ── beforeToolCall whitelist ─────────────────────────────────────────────────
171
+ describe('PRI-439 ArtificerL2Adapter beforeToolCall whitelist', () => {
172
+ it('blocks unknown tools', async () => {
173
+ const adapter = makeAdapter();
174
+ hoisted.mockReturn = [];
175
+ await adapter.startRun(makeStartRun()).catch(() => {
176
+ // startRun throws when no output is captured — expected.
248
177
  });
249
- expect(generateCalls).toHaveLength(2);
250
- expect(generateCalls[1]).toContain('require');
178
+ const beforeFn = hoisted.lastLoopConfig.beforeToolCall;
179
+ expect(typeof beforeFn).toBe('function');
180
+ if (!beforeFn)
181
+ return;
182
+ const result = await beforeFn({ toolCall: { name: 'unknown_tool' } });
183
+ expect(result).toEqual({ block: true, reason: expect.stringContaining('unknown_tool') });
251
184
  });
252
- it('handles timeout failure and injects it as feedback', async () => {
253
- const generateCalls = [];
254
- const generateCode = async (prompt) => {
255
- generateCalls.push(prompt);
256
- return makeV2Output();
257
- };
258
- const { deps } = makeFailNTimesGateDeps([FAILED_TIMEOUT]);
259
- const adapter = new ArtificerL2Adapter({
260
- generateCode,
261
- gateDeps: deps,
262
- validator: new DefaultArtificerValidator(),
185
+ it('allows whitelisted tools', async () => {
186
+ const adapter = makeAdapter();
187
+ hoisted.mockReturn = [];
188
+ await adapter.startRun(makeStartRun()).catch(() => {
189
+ // startRun throws when no output is captured — expected.
263
190
  });
264
- await adapter.startRun({
265
- agentSpec: { agentId: 'artificer', schemaVersion: 'v1' },
266
- taskRef: { taskId: TASK_ID },
267
- inputPayload: 'initial prompt',
268
- contextItems: [],
269
- outputSchemaRef: 'artificer-output-v2',
270
- timeoutMs: 300_000,
271
- });
272
- expect(generateCalls[1]).toContain('timeout');
191
+ const beforeFn = hoisted.lastLoopConfig.beforeToolCall;
192
+ if (!beforeFn) {
193
+ expect.fail('beforeToolCall not wired');
194
+ return;
195
+ }
196
+ for (const name of ['read_rulecode_spec', 'validate_rulecode', 'replay_rulecode', 'submit_rulecode']) {
197
+ const result = await beforeFn({ toolCall: { name } });
198
+ expect(result).toBeUndefined();
199
+ }
273
200
  });
274
- it('handles validation_failed failure and injects it as feedback', async () => {
275
- const generateCalls = [];
276
- const generateCode = async (prompt) => {
277
- generateCalls.push(prompt);
278
- return makeV2Output();
279
- };
280
- const { deps } = makeFailNTimesGateDeps([FAILED_VALIDATION]);
281
- const adapter = new ArtificerL2Adapter({
282
- generateCode,
283
- gateDeps: deps,
284
- validator: new DefaultArtificerValidator(),
285
- });
286
- await adapter.startRun({
287
- agentSpec: { agentId: 'artificer', schemaVersion: 'v1' },
288
- taskRef: { taskId: TASK_ID },
289
- inputPayload: 'initial prompt',
290
- contextItems: [],
291
- outputSchemaRef: 'artificer-output-v2',
292
- timeoutMs: 300_000,
293
- });
294
- expect(generateCalls[1]).toContain('expected block got allow');
201
+ });
202
+ // ── exhaustion: no V1/L1 fallback ────────────────────────────────────────────
203
+ describe('PRI-439 ArtificerL2Adapter exhaustion (no fallback)', () => {
204
+ it('throws PDRuntimeError when the loop ends without submit_rulecode', async () => {
205
+ const adapter = makeAdapter({ maxTurns: 3 });
206
+ hoisted.mockReturn = [
207
+ { role: 'assistant', content: 'I cannot produce valid code.' },
208
+ ];
209
+ await expect(adapter.startRun(makeStartRun())).rejects.toThrow(/without a submit_rulecode call/);
210
+ // No output stored for the failed run — fetchOutput returns null.
211
+ const runs = adapter.runs;
212
+ expect(runs.size).toBe(1);
213
+ for (const [, state] of runs) {
214
+ expect(state.output).toBeNull();
215
+ }
295
216
  });
296
- // ── EP-05 freshness: each attempt uses the immediately-prior failure ───────
297
- it('injects attempt-N failure (not stale) into attempt N+1 prompt', async () => {
298
- const generateCalls = [];
299
- const generateCode = async (prompt) => {
300
- generateCalls.push(prompt);
301
- return makeV2Output();
302
- };
303
- // attempt 1 fails with runtime_error, attempt 2 fails with timeout, attempt 3 passes
304
- const { deps } = makeFailNTimesGateDeps([FAILED_RUNTIME, FAILED_TIMEOUT]);
305
- const adapter = new ArtificerL2Adapter({
306
- generateCode,
307
- gateDeps: deps,
308
- validator: new DefaultArtificerValidator(),
309
- });
310
- await adapter.startRun({
311
- agentSpec: { agentId: 'artificer', schemaVersion: 'v1' },
312
- taskRef: { taskId: TASK_ID },
313
- inputPayload: 'initial prompt',
314
- contextItems: [],
315
- outputSchemaRef: 'artificer-output-v2',
316
- timeoutMs: 300_000,
317
- });
318
- // attempt 2 prompt must mention attempt 1's runtime_error, NOT attempt 2's timeout
319
- expect(generateCalls[1]).toContain('TypeError: x is undefined');
320
- expect(generateCalls[1]).not.toContain('exceeded 1000ms');
321
- // attempt 3 prompt must mention attempt 2's timeout, NOT attempt 1's runtime_error
322
- expect(generateCalls[2]).toContain('exceeded 1000ms');
217
+ it('emits artificer_l2_complete telemetry with succeeded=false on exhaustion', async () => {
218
+ const adapter = makeAdapter({ maxTurns: 2 });
219
+ hoisted.mockReturn = [{ role: 'assistant', content: 'no code' }];
220
+ await expect(adapter.startRun(makeStartRun())).rejects.toThrow();
221
+ const completeCalls = emitTelemetryMock.mock.calls.filter((c) => c[0].eventType === 'artificer_l2_complete');
222
+ expect(completeCalls.length).toBe(1);
223
+ const payload = completeCalls[0][0].payload;
224
+ expect(payload.succeeded).toBe(false);
323
225
  });
324
- // ── golden trace used for replay must be valid ──────────────────────────────
325
- it('builds a valid golden trace from the V2 output for sandbox replay', async () => {
326
- const generateCode = async () => makeV2Output();
327
- let capturedTrace = null;
328
- const deps = {
329
- evaluateInSandbox: (_code, trace, _opts) => {
330
- capturedTrace = trace;
331
- return { success: true, failedCases: [], executionTimeMs: 1, forbiddenPatternViolations: [] };
332
- },
226
+ });
227
+ // ── loop error ───────────────────────────────────────────────────────────────
228
+ describe('PRI-439 ArtificerL2Adapter loop error', () => {
229
+ it('throws PDRuntimeError when runAgentLoop throws', async () => {
230
+ const adapter = makeAdapter();
231
+ hoisted.impl = async () => {
232
+ throw new Error('LLM provider unavailable');
333
233
  };
334
- const adapter = new ArtificerL2Adapter({
335
- generateCode,
336
- gateDeps: deps,
337
- validator: new DefaultArtificerValidator(),
338
- });
339
- await adapter.startRun({
340
- agentSpec: { agentId: 'artificer', schemaVersion: 'v1' },
341
- taskRef: { taskId: TASK_ID },
342
- inputPayload: 'initial prompt',
343
- contextItems: [],
344
- outputSchemaRef: 'artificer-output-v2',
345
- timeoutMs: 300_000,
346
- });
347
- expect(capturedTrace).not.toBeNull();
348
- expect(validateGoldenTrace(capturedTrace).valid).toBe(true);
234
+ await expect(adapter.startRun(makeStartRun())).rejects.toThrow(/agent loop threw/);
349
235
  });
350
- // ── invalid LLM output (fails validator) is retried, not silently accepted ─
351
- it('retries when LLM output fails the ArtificerValidator (malformed V2)', async () => {
352
- let attempt = 0;
353
- const generateCode = async () => {
354
- attempt += 1;
355
- if (attempt === 1) {
356
- // Malformed: missing affectedTools
357
- const bad = makeV2Output();
358
- delete bad.affectedTools;
359
- return bad;
236
+ });
237
+ // ── runtime metadata ─────────────────────────────────────────────────────────
238
+ describe('PRI-439 ArtificerL2Adapter runtime metadata', () => {
239
+ it('pollRun returns succeeded status after startRun completes with output', async () => {
240
+ const adapter = makeAdapter();
241
+ hoisted.impl = async (_p, context) => {
242
+ const submit = context.tools?.find((t) => t.name === 'submit_rulecode');
243
+ if (submit) {
244
+ await submit.execute('call-1', makeRuleOutput());
360
245
  }
361
- return makeV2Output();
246
+ return [];
362
247
  };
363
- const adapter = new ArtificerL2Adapter({
364
- generateCode,
365
- gateDeps: makeAlwaysPassGateDeps(),
366
- validator: new DefaultArtificerValidator(),
367
- });
368
- const handle = await adapter.startRun({
369
- agentSpec: { agentId: 'artificer', schemaVersion: 'v1' },
370
- taskRef: { taskId: TASK_ID },
371
- inputPayload: 'initial prompt',
372
- contextItems: [],
373
- outputSchemaRef: 'artificer-output-v2',
374
- timeoutMs: 300_000,
375
- });
376
- const output = await adapter.fetchOutput(handle.runId);
377
- expect(output).not.toBeNull();
378
- if (!output)
379
- return;
380
- // 2nd attempt produces valid V2 → replay passes → V2 output
381
- expect(isArtificerOutputV2(output.payload)).toBe(true);
248
+ const handle = await adapter.startRun(makeStartRun());
249
+ const status = await adapter.pollRun(handle.runId);
250
+ expect(status.status).toBe('succeeded');
382
251
  });
383
- // ── P1+P2 fixes: validator-rejected candidates never degrade, total failure throws ─
384
- it('throws (does NOT degrade) when all 3 attempts fail validation — no validated V2 to degrade from', async () => {
385
- // P2 fix: validator rejection must NOT set lastValidV2. Without a validated
386
- // candidate, degradation is impossible (Runtime Contract Rule 1/3 — never
387
- // emit an unvalidated object). The adapter throws PDRuntimeError instead,
388
- // which BasePeerRunner.handlePostLeaseError catches → task fails.
389
- const generateCode = async () => {
390
- // Every attempt returns malformed V2 (missing affectedTools).
391
- const bad = makeV2Output();
392
- delete bad.affectedTools;
393
- return bad;
394
- };
395
- const adapter = new ArtificerL2Adapter({
396
- generateCode,
397
- gateDeps: makeAlwaysPassGateDeps(),
398
- validator: new DefaultArtificerValidator(),
399
- });
400
- await expect(adapter.startRun({
401
- agentSpec: { agentId: 'artificer', schemaVersion: 'v1' },
402
- taskRef: { taskId: TASK_ID },
403
- inputPayload: 'initial prompt',
404
- contextItems: [],
405
- outputSchemaRef: 'artificer-output-v2',
406
- timeoutMs: 300_000,
407
- })).rejects.toThrow(/without a validated candidate/);
252
+ it('kind() returns pi-ai-l2', () => {
253
+ const adapter = makeAdapter();
254
+ expect(adapter.kind()).toBe('pi-ai-l2');
408
255
  });
409
- it('degrades to V1 only when a VALIDATED V2 candidate existed (replay failed, not validation)', async () => {
410
- // Confirms the positive side of the P2 fix: a validated V2 that fails replay
411
- // CAN degrade. This is the legitimate degradation path (plan is valid, only
412
- // the code was wrong).
413
- const generateCode = async () => makeV2Output();
414
- const { deps } = makeFailNTimesGateDeps([FAILED_RUNTIME, FAILED_RUNTIME, FAILED_RUNTIME]);
415
- const adapter = new ArtificerL2Adapter({
416
- generateCode,
417
- gateDeps: deps,
418
- validator: new DefaultArtificerValidator(),
419
- });
420
- const handle = await adapter.startRun({
421
- agentSpec: { agentId: 'artificer', schemaVersion: 'v1' },
422
- taskRef: { taskId: TASK_ID },
423
- inputPayload: 'initial prompt',
424
- contextItems: [],
425
- outputSchemaRef: 'artificer-output-v2',
426
- timeoutMs: 300_000,
427
- });
428
- const output = await adapter.fetchOutput(handle.runId);
429
- expect(output).not.toBeNull();
430
- if (!output)
431
- return;
432
- expect(isArtificerOutputV2(output.payload)).toBe(false);
256
+ it('getCapabilities reports supportsToolUse=true', async () => {
257
+ const adapter = makeAdapter();
258
+ const caps = await adapter.getCapabilities();
259
+ expect(caps.supportsToolUse).toBe(true);
433
260
  });
434
- // ── runtime metadata ─────────────────────────────────────────────────────────
435
- it('pollRun returns terminal status after startRun completes', async () => {
436
- const generateCode = async () => makeV2Output();
437
- const adapter = new ArtificerL2Adapter({
438
- generateCode,
439
- gateDeps: makeAlwaysPassGateDeps(),
440
- validator: new DefaultArtificerValidator(),
441
- });
442
- const handle = await adapter.startRun({
443
- agentSpec: { agentId: 'artificer', schemaVersion: 'v1' },
444
- taskRef: { taskId: TASK_ID },
445
- inputPayload: 'initial prompt',
446
- contextItems: [],
447
- outputSchemaRef: 'artificer-output-v2',
448
- timeoutMs: 300_000,
261
+ it('healthCheck returns unhealthy when API key is missing', async () => {
262
+ delete process.env.TEST_API_KEY;
263
+ const adapter = makeAdapter();
264
+ const health = await adapter.healthCheck();
265
+ expect(health.healthy).toBe(false);
266
+ });
267
+ it('healthCheck returns healthy when API key is present', async () => {
268
+ const adapter = makeAdapter();
269
+ const health = await adapter.healthCheck();
270
+ expect(health.healthy).toBe(true);
271
+ });
272
+ it('startRun throws when API key is missing', async () => {
273
+ delete process.env.TEST_API_KEY;
274
+ const adapter = makeAdapter();
275
+ await expect(adapter.startRun(makeStartRun())).rejects.toThrow(/API key not found/);
276
+ });
277
+ });
278
+ // ── config defaults ──────────────────────────────────────────────────────────
279
+ describe('PRI-439 ArtificerL2Adapter — config defaults', () => {
280
+ it('wires maxTokens=8192 default into loopConfig', async () => {
281
+ const adapter = makeAdapter();
282
+ hoisted.mockReturn = [];
283
+ await adapter.startRun(makeStartRun()).catch(() => {
284
+ // expected — no output captured
449
285
  });
450
- const status = await adapter.pollRun(handle.runId);
451
- // RunStatus is an object { runId, status, ... }; status.status is the execution state.
452
- expect(['succeeded', 'failed']).toContain(status.status);
286
+ expect(hoisted.lastLoopConfig.maxTokens).toBe(8192);
453
287
  });
454
- it('kind() returns a stable runtime kind identifier', () => {
455
- const adapter = new ArtificerL2Adapter({
456
- generateCode: async () => makeV2Output(),
457
- gateDeps: makeAlwaysPassGateDeps(),
458
- validator: new DefaultArtificerValidator(),
288
+ it('wires custom maxTokens when provided', async () => {
289
+ const adapter = makeAdapter({ maxTokens: 4096 });
290
+ hoisted.mockReturn = [];
291
+ await adapter.startRun(makeStartRun()).catch(() => {
292
+ // expected
459
293
  });
460
- expect(Value.Check(RuntimeKindSchema, adapter.kind())).toBe(true);
461
- expect(adapter.kind()).toBe('pi-ai-l2');
294
+ expect(hoisted.lastLoopConfig.maxTokens).toBe(4096);
462
295
  });
463
- it('returns a RunHandle that satisfies the runtime protocol schema', async () => {
464
- const adapter = new ArtificerL2Adapter({
465
- generateCode: async () => makeV2Output(),
466
- gateDeps: makeAlwaysPassGateDeps(),
467
- validator: new DefaultArtificerValidator(),
296
+ });
297
+ // ── telemetry ────────────────────────────────────────────────────────────────
298
+ describe('PRI-439 ArtificerL2Adapter — telemetry', () => {
299
+ it('emits artificer_l2_turn with phase=loop_started at start', async () => {
300
+ const adapter = makeAdapter();
301
+ hoisted.mockReturn = [];
302
+ await adapter.startRun(makeStartRun()).catch(() => {
303
+ // expected
468
304
  });
469
- const handle = await adapter.startRun({
470
- agentSpec: { agentId: 'artificer', schemaVersion: 'v1' },
471
- taskRef: { taskId: TASK_ID },
472
- inputPayload: '{}',
473
- contextItems: [],
474
- outputSchemaRef: 'artificer-output-v2',
475
- timeoutMs: 30_000,
305
+ const startCalls = emitTelemetryMock.mock.calls.filter((c) => {
306
+ const evt = c[0];
307
+ return evt.eventType === 'artificer_l2_turn' && evt.payload?.phase === 'loop_started';
476
308
  });
477
- expect(Value.Check(RunHandleSchema, handle)).toBe(true);
309
+ expect(startCalls.length).toBe(1);
478
310
  });
479
- it.each([0, -1, 1.5, Number.POSITIVE_INFINITY])('rejects invalid maxAttempts=%s', (maxAttempts) => {
480
- expect(() => new ArtificerL2Adapter({
481
- generateCode: async () => makeV2Output(),
482
- gateDeps: makeAlwaysPassGateDeps(),
483
- validator: new DefaultArtificerValidator(),
484
- maxAttempts,
485
- })).toThrow(/maxAttempts/);
311
+ it('emits artificer_l2_complete with succeeded=true on success', async () => {
312
+ const adapter = makeAdapter();
313
+ hoisted.impl = async (_p, context) => {
314
+ const submit = context.tools?.find((t) => t.name === 'submit_rulecode');
315
+ if (submit) {
316
+ await submit.execute('call-1', makeRuleOutput());
317
+ }
318
+ return [];
319
+ };
320
+ await adapter.startRun(makeStartRun());
321
+ const completeCalls = emitTelemetryMock.mock.calls.filter((c) => c[0].eventType === 'artificer_l2_complete');
322
+ expect(completeCalls.length).toBe(1);
323
+ const payload = completeCalls[0][0].payload;
324
+ expect(payload.succeeded).toBe(true);
486
325
  });
326
+ });
327
+ // ── input serialization ──────────────────────────────────────────────────────
328
+ describe('PRI-439 ArtificerL2Adapter — input serialization', () => {
487
329
  it('bounds and safely serializes an unknown prompt payload', async () => {
488
330
  const circular = { text: 'x'.repeat(60_000) };
489
331
  circular.self = circular;
490
- let receivedPrompt = '';
491
- const adapter = new ArtificerL2Adapter({
492
- generateCode: async (prompt) => {
493
- receivedPrompt = prompt;
494
- return makeV2Output();
495
- },
496
- gateDeps: makeAlwaysPassGateDeps(),
497
- validator: new DefaultArtificerValidator(),
498
- });
499
- await adapter.startRun({
500
- agentSpec: { agentId: 'artificer', schemaVersion: 'v1' },
501
- taskRef: { taskId: TASK_ID },
502
- inputPayload: circular,
503
- contextItems: [],
504
- outputSchemaRef: 'artificer-output-v2',
505
- timeoutMs: 30_000,
506
- });
507
- expect(receivedPrompt.length).toBeLessThanOrEqual(50_003);
332
+ const adapter = makeAdapter();
333
+ hoisted.mockReturn = [];
334
+ // The circular payload is safely stringified (safeStringifyPreview handles cycles).
335
+ // startRun still throws because no output is captured, but it should NOT throw
336
+ // a serialization error.
337
+ await expect(adapter.startRun(makeStartRun({ inputPayload: circular }))).rejects.toThrow(/without a submit_rulecode call/);
508
338
  });
509
339
  });
510
340
  //# sourceMappingURL=artificer-l2-adapter.test.js.map