@principles/pd-cli 1.113.0 → 1.115.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,419 @@
1
+ /**
2
+ * LLM Dogfood Script — PRI-408 (P1/P2 fixes validation)
3
+ *
4
+ * Runs the RuleHost pipeline with a REAL LLM (qwen3.6-27b-mtp via LM Studio)
5
+ * to validate the six-step value chain end-to-end:
6
+ *
7
+ * pain → dreamer → philosopher → scribe → artificer ↔ evaluator
8
+ * → candidate → auto-enqueue → owner approve → activate
9
+ * → before/after behavior comparison → deactivate → restore
10
+ *
11
+ * Usage:
12
+ * npx tsx scripts/llm-dogfood.ts
13
+ *
14
+ * Prerequisites:
15
+ * - LM Studio running on http://localhost:12341 with qwen3.6-27b-mtp loaded
16
+ * - Set LMSTUDIO_API_KEY=lm-studio (or any non-empty string)
17
+ *
18
+ * Output:
19
+ * - Console log with each stage's result
20
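+ * - dogfood-output/dogfood-output.json with full structured results (plus dogfood-output/generated-rule.js; dogfood-output/dogfood-partial.json on early exit)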
21
+ */
22
+
23
+ import * as os from 'node:os';
24
+ import * as path from 'node:path';
25
+ import * as fs from 'node:fs';
26
+ import {
27
+ PiAiRuntimeAdapter,
28
+ ArtificerL2Adapter,
29
+ buildArtificerL2GenerateCode,
30
+ DefaultArtificerValidator,
31
+ RuntimeStateManager,
32
+ SqliteApprovalQueueStore,
33
+ SqliteActivationStateStore,
34
+ SqlitePIArtifactStore,
35
+ ActivationDispatcher,
36
+ ApprovalCompletionService,
37
+ PromptWriter,
38
+ DeferArchiveWriter,
39
+ RuleHostWriter,
40
+ createProductionGateDeps,
41
+ makeIdempotencyKey,
42
+ createPITaskDiagnosticJson,
43
+ } from '@principles/core/runtime-v2';
44
+ import type { PIArtifactSnapshot, PIArtifactRecord } from '@principles/core/runtime-v2';
45
+ import { runRuleHostPipeline, createSandboxGateDeps } from '../src/services/rulehost-pipeline-runner.js';
46
+ import { compileDemoRule } from '../src/services/demo-rule-compiler.js';
47
+ import type { CodeRuleCapability } from '../src/services/rulehost-pipeline-runner.js';
48
+
49
+ // ── Config ───────────────────────────────────────────────────────────────────
50
+
51
/** OpenAI-compatible endpoint exposed by the local LM Studio server. */
const LM_STUDIO_BASE_URL = 'http://localhost:12341/v1';
/** Identifier of the model expected to be loaded in LM Studio. */
const MODEL_ID = 'qwen3.6-27b-mtp';
/** Provider name handed to the runtime adapters. */
const PROVIDER = 'lmstudio';
/** Name of the environment variable the adapters read the API key from. */
const API_KEY_ENV = 'LMSTUDIO_API_KEY';

// LM Studio accepts any key, but PiAiRuntimeAdapter insists that
// process.env[apiKeyEnv] is non-empty — so fall back to a placeholder when
// the variable is unset or empty.
process.env[API_KEY_ENV] ||= 'lm-studio';
61
+
62
+ // ── Dogfood pain scenario ────────────────────────────────────────────────────
63
+
64
/**
 * Fixed pain scenario fed into the pipeline: an agent writing to a
 * system-critical path, together with the bad decision that was made and the
 * decision that should have been made instead.
 */
const DOGFOOD_PAIN = {
  // Identifier under which the pain signal is seeded and the pipeline is run.
  painId: 'pain-dogfood-001',
  // What happened.
  scenario: 'Agent attempted to write to /etc/passwd during a file operation',
  // What the agent did wrong.
  badDecision: 'Wrote directly to /etc/passwd without checking if it is a system path',
  // What the agent should do next time.
  betterDecision: 'Block writes to system paths (/etc, /boot, /sys, /proc)',
  // Short description of the surrounding operation.
  context: 'File write operation targeting a system-critical path',
} as const;
71
+
72
+ // ── Helpers ──────────────────────────────────────────────────────────────────
73
+
74
/**
 * Creates a fresh, uniquely named scratch directory under the OS temp
 * directory and returns its absolute path.
 *
 * Uses `fs.mkdtempSync` so the name gets a random suffix: a timestamp-only
 * name can collide when two runs start within the same millisecond (the
 * second run would silently reuse the first run's workspace) and is
 * predictable in a shared temp directory.
 *
 * @returns Path of the newly created directory (prefix `pd-dogfood-`).
 * @throws If the directory cannot be created.
 */
function makeTmpDir(): string {
  return fs.mkdtempSync(path.join(os.tmpdir(), 'pd-dogfood-'));
}
79
+
80
/**
 * Builds a `PIArtifactSnapshot` from a stored artifact record by copying the
 * ten snapshot fields one-to-one; any other properties on the record are
 * left out.
 *
 * @param record Artifact record as returned by the artifact store.
 * @returns A new snapshot object holding the copied fields.
 */
function toSnapshot(record: PIArtifactRecord): PIArtifactSnapshot {
  const {
    artifactId,
    artifactKind,
    sourceTaskId,
    sourcePrincipleId,
    sourceRuleId,
    lineageArtifactIds,
    validationStatus,
    contentJson,
    createdAt,
    updatedAt,
  } = record;

  return {
    artifactId,
    artifactKind,
    sourceTaskId,
    sourcePrincipleId,
    sourceRuleId,
    lineageArtifactIds,
    validationStatus,
    contentJson,
    createdAt,
    updatedAt,
  };
}
94
+
95
/**
 * Seeds the workspace with a single pending `dreamer` task describing the
 * dogfood pain scenario.
 *
 * The task's diagnostic JSON is the base metadata produced by
 * `createPITaskDiagnosticJson`, extended with the pain id and the
 * scenario / bad-decision / better-decision texts from `DOGFOOD_PAIN`.
 *
 * @param sm Initialized state manager to create the task in.
 * @param painId Pain identifier recorded as `sourcePainId`.
 */
async function seedPainSignal(sm: RuntimeStateManager, painId: string): Promise<void> {
  const serializedBase = createPITaskDiagnosticJson({
    dependencyTaskIds: [],
    channel: 'code_tool_hook',
    timeoutMs: 300_000,
    inputArtifactRefs: [],
    outputArtifactRefs: [],
  });
  const parsedBase = JSON.parse(serializedBase) as Record<string, unknown>;

  // Pain-specific fields come last so they win over any same-named base keys.
  const painDiagnostic = {
    ...parsedBase,
    sourcePainId: painId,
    painSummary: DOGFOOD_PAIN.scenario,
    badDecision: DOGFOOD_PAIN.badDecision,
    betterDecision: DOGFOOD_PAIN.betterDecision,
  };

  await sm.createTask({
    taskId: 'dreamer-dogfood-001',
    taskKind: 'dreamer',
    status: 'pending',
    attemptCount: 0,
    maxAttempts: 3,
    diagnosticJson: JSON.stringify(painDiagnostic),
  });
}
115
+
116
/**
 * Prints a timestamped, stage-tagged line to the console, optionally followed
 * by a pretty-printed JSON rendering of `detail`.
 *
 * Logging must never abort the run, so serialization failures are contained:
 * plain `JSON.stringify` is tried first (output is unchanged for everything
 * it can handle); if it throws — e.g. on circular structures or BigInt
 * values — a tolerant serializer is used instead, and as a last resort a
 * placeholder line describing the failure is printed.
 *
 * @param stage Short stage tag, printed in brackets.
 * @param message Human-readable message.
 * @param detail Optional value to dump; skipped when `undefined`.
 */
function log(stage: string, message: string, detail?: unknown): void {
  const timestamp = new Date().toISOString();
  console.log(`[${timestamp}] [${stage}] ${message}`);
  if (detail === undefined) {
    return;
  }

  let rendered: string | undefined;
  try {
    rendered = JSON.stringify(detail, null, 2);
  } catch {
    // Fallback path only: objects already visited are replaced by a marker
    // and BigInt values are rendered as decimal strings.
    const visited = new WeakSet<object>();
    try {
      rendered = JSON.stringify(
        detail,
        (_key: string, value: unknown) => {
          if (typeof value === 'bigint') {
            return value.toString();
          }
          if (typeof value === 'object' && value !== null) {
            if (visited.has(value)) {
              return '[Circular]';
            }
            visited.add(value);
          }
          return value;
        },
        2,
      );
    } catch (err) {
      rendered = `[unserializable detail: ${err instanceof Error ? err.message : String(err)}]`;
    }
  }
  console.log(rendered);
}
123
+
124
+ // ── Main dogfood ─────────────────────────────────────────────────────────────
125
+
126
/**
 * Writes an early-exit payload to `dogfood-partial.json` inside `outputDir`.
 *
 * @returns Path of the file that was written.
 */
function savePartialOutput(outputDir: string, payload: Record<string, unknown>): string {
  const partialPath = path.join(outputDir, 'dogfood-partial.json');
  fs.writeFileSync(partialPath, JSON.stringify(payload, null, 2));
  return partialPath;
}

/**
 * Runs steps 1–7 of the dogfood scenario inside an existing workspace and
 * writes the result files into `outputDir`.
 *
 * Every state manager opened here is closed again via `finally`, so a stage
 * that throws does not leave a database handle open.
 *
 * @param tmpDir Scratch workspace directory (created and removed by the caller).
 * @param outputDir Directory that receives the JSON / rule-code output files.
 * @throws Re-throws pipeline errors; throws when the pipeline reports a
 *   candidate without a rule artifact id.
 */
async function runDogfoodStages(tmpDir: string, outputDir: string): Promise<void> {
  // ── Step 1: Seed pain signal ──────────────────────────────────────────────
  log('STEP-1', `Seeding pain: ${DOGFOOD_PAIN.scenario}`);
  const sm = new RuntimeStateManager({ workspaceDir: tmpDir });
  await sm.initialize();
  try {
    await seedPainSignal(sm, DOGFOOD_PAIN.painId);
  } finally {
    await sm.close();
  }

  // ── Step 2: Run RuleHost pipeline with real LLM ──────────────────────────
  log('STEP-2', 'Starting RuleHost pipeline (dreamer → philosopher → scribe → artificer ↔ evaluator)');

  const adapter = new PiAiRuntimeAdapter({
    provider: PROVIDER,
    model: MODEL_ID,
    apiKeyEnv: API_KEY_ENV,
    baseUrl: LM_STUDIO_BASE_URL,
    maxRetries: 1,
    timeoutMs: 600_000,
    maxTokens: 8192,
    reasoning: false,
    workspace: tmpDir,
  });

  // Construct the ArtificerL2Adapter for the artificer stage.
  // This adapter uses buildArtificerL2GenerateCode to call the LLM directly
  // (via completeSimple) and runs sandbox replay to validate generated code.
  const generateCode = buildArtificerL2GenerateCode({
    provider: PROVIDER,
    model: MODEL_ID,
    // The module-level config block guarantees this variable is non-empty.
    apiKey: process.env[API_KEY_ENV]!,
    baseUrl: LM_STUDIO_BASE_URL,
    timeoutMs: 600_000,
  });

  const artificerAdapter = new ArtificerL2Adapter({
    generateCode,
    gateDeps: createSandboxGateDeps(),
    validator: new DefaultArtificerValidator(),
    maxAttempts: 3,
  });

  const capability: CodeRuleCapability = { enabled: true, artificerAdapter };

  let pipelineResult;
  try {
    pipelineResult = await runRuleHostPipeline({
      workspaceDir: tmpDir,
      painId: DOGFOOD_PAIN.painId,
      runtimeAdapter: adapter,
      channel: 'code_tool_hook',
      pollIntervalMs: 200,
      timeoutMs: 600_000,
      maxRounds: 2,
      codeRuleCapability: capability,
      onProgress: (stage, status, detail) => {
        log('PIPELINE', `${stage}: ${status}${detail ? ' — ' + detail : ''}`);
      },
    });
  } catch (err) {
    log('STEP-2', 'Pipeline threw', { error: err instanceof Error ? err.message : String(err) });
    throw err;
  }

  log('STEP-2', `Pipeline decision: ${pipelineResult.decision}`, {
    ruleArtifactId: pipelineResult.ruleArtifactId,
    principleArtifactId: pipelineResult.principleArtifactId,
    approvalId: pipelineResult.approvalId,
    degradationReason: pipelineResult.degradationReason,
  });

  if (pipelineResult.decision !== 'candidate_ready_for_owner_review') {
    log('STEP-2', 'Pipeline did not produce a candidate — saving partial results and exiting');
    const partialPath = savePartialOutput(outputDir, {
      pain: DOGFOOD_PAIN,
      pipelineResult,
      timestamp: new Date().toISOString(),
    });
    console.log(`\nPartial results saved to ${partialPath}`);
    return;
  }

  // A candidate without a rule artifact cannot be inspected, activated or
  // deactivated — fail loudly instead of passing `undefined` to the stores.
  const ruleArtifactId = pipelineResult.ruleArtifactId;
  if (!ruleArtifactId) {
    throw new Error('Pipeline reported candidate_ready_for_owner_review without a ruleArtifactId');
  }

  // ── Step 3: Extract generated principle + RuleCode + evaluator judgment ──
  log('STEP-3', 'Extracting generated artifacts');
  const sm2 = new RuntimeStateManager({ workspaceDir: tmpDir });
  await sm2.initialize();
  try {
    const artifactStore = new SqlitePIArtifactStore(sm2.connection);
    const approvalStore = new SqliteApprovalQueueStore(sm2.connection);
    const stateStore = new SqliteActivationStateStore(sm2.connection);

    const ruleArtifact = await artifactStore.getArtifactById(ruleArtifactId);
    const principleArtifact = pipelineResult.principleArtifactId
      ? await artifactStore.getArtifactById(pipelineResult.principleArtifactId)
      : null;

    let generatedPrinciple: unknown = null;
    let ruleCode: string | null = null;
    let evaluatorJudgment: unknown = null;

    if (principleArtifact) {
      try {
        generatedPrinciple = JSON.parse(principleArtifact.contentJson);
      } catch {
        // Not valid JSON — keep the raw content so it still reaches the output.
        generatedPrinciple = principleArtifact.contentJson;
      }
      log('STEP-3', 'Generated principle extracted', { artifactId: principleArtifact.artifactId });
    }

    if (ruleArtifact) {
      try {
        const ruleContent = JSON.parse(ruleArtifact.contentJson) as Record<string, unknown>;
        ruleCode = typeof ruleContent.implementationCode === 'string' ? ruleContent.implementationCode : null;
        evaluatorJudgment = ruleContent.adversarialResult ?? null;
        log('STEP-3', 'RuleCode extracted', { artifactId: ruleArtifact.artifactId, codeLength: ruleCode?.length ?? 0 });
        log('STEP-3', 'Evaluator judgment', evaluatorJudgment);
      } catch {
        log('STEP-3', 'Failed to parse rule artifact contentJson');
      }
    }

    // ── Step 4: Owner approves the candidate ─────────────────────────────────
    log('STEP-4', 'Owner approving candidate');
    const approvalId = pipelineResult.approvalId;
    if (!approvalId) {
      log('STEP-4', 'No approvalId — candidate was not auto-enqueued', {
        degradationReason: pipelineResult.degradationReason,
      });
      savePartialOutput(outputDir, {
        pain: DOGFOOD_PAIN,
        pipelineResult,
        timestamp: new Date().toISOString(),
        error: 'auto_enqueue_failed',
      });
      console.log(`\nPartial results saved (enqueue failed)`);
      return;
    }
    const approveResult = await approvalStore.approve(approvalId, 'owner-dogfood', 'Dogfood approval');
    log('STEP-4', `Approval result: ok=${approveResult.ok}`);

    // ── Step 5: Dispatch activation ──────────────────────────────────────────
    log('STEP-5', 'Dispatching activation');
    const artifactReadModel = {
      getArtifactById: async (id: string): Promise<PIArtifactSnapshot | null> => {
        const rec = await artifactStore.getArtifactById(id);
        return rec ? toSnapshot(rec) : null;
      },
    };

    const dispatcher = new ActivationDispatcher(
      artifactReadModel,
      stateStore,
      {
        writers: [
          new PromptWriter(),
          new RuleHostWriter({ gateDeps: createProductionGateDeps() }),
          new DeferArchiveWriter(),
        ],
        approvalQueueStore: approvalStore,
      },
    );

    const completionService = new ApprovalCompletionService(
      approvalStore,
      dispatcher,
      stateStore,
    );

    const completionResult = await completionService.completeApproval({
      approvalId,
      actor: { kind: 'human', userId: 'owner-dogfood' },
      now: new Date().toISOString(),
    });

    log('STEP-5', `Activation result: ok=${completionResult.ok}`, completionResult);

    // ── Step 6: Before/after behavior comparison ─────────────────────────────
    log('STEP-6', 'Before/after behavior comparison');

    // Test the rule code against system-path and non-system-path inputs
    const testCases = [
      { name: 'system-path-write', input: { action: { paramsSummary: { path: '/etc/passwd' } } }, expected: 'block' },
      { name: 'system-path-write-boot', input: { action: { paramsSummary: { path: '/boot/grub.cfg' } } }, expected: 'block' },
      { name: 'normal-write', input: { action: { paramsSummary: { path: '/project/src/main.ts' } } }, expected: 'allow' },
      { name: 'normal-write-2', input: { action: { paramsSummary: { path: '/home/user/file.txt' } } }, expected: 'allow' },
    ];

    let behaviorResults: Array<{ name: string; expected: string; actual: string; passed: boolean }> = [];
    if (ruleCode) {
      // P1 #1 fix: use the production vm sandbox (compileDemoRule) instead of
      // `new Function` which bypasses the sandbox and has no timeout protection.
      // Also, `new Function('input', 'helpers', ruleCode)` would define `evaluate`
      // inside the function body but not call it, returning undefined — making
      // the behavior comparison unreliable.
      try {
        const evaluateFn = compileDemoRule(ruleCode, 'dogfood-behavior-test');
        behaviorResults = testCases.map((tc) => {
          try {
            const result = evaluateFn(tc.input as never, {} as never);
            const actual = typeof result === 'object' && result !== null && 'decision' in result
              ? String((result as Record<string, unknown>).decision)
              : 'unknown';
            return {
              name: tc.name,
              expected: tc.expected,
              actual,
              passed: actual === tc.expected,
            };
          } catch (err) {
            // A throwing rule counts as a failed case rather than aborting the run.
            return {
              name: tc.name,
              expected: tc.expected,
              actual: `error: ${err instanceof Error ? err.message : String(err)}`,
              passed: false,
            };
          }
        });
      } catch (err) {
        log('STEP-6', 'Rule code compilation failed (vm sandbox)', { error: err instanceof Error ? err.message : String(err) });
      }
    }

    log('STEP-6', 'Behavior results', behaviorResults);

    // ── Step 7: Deactivate and verify restoration ────────────────────────────
    log('STEP-7', 'Deactivating rule');
    const idempotencyKey = makeIdempotencyKey(ruleArtifactId, 'code_tool_hook');
    const activationRecord = await stateStore.getActivationStatus(idempotencyKey);
    if (activationRecord) {
      const deactivateResult = await stateStore.deactivateActivation(activationRecord.activationId, new Date().toISOString());
      log('STEP-7', `Deactivation: ${deactivateResult ? 'success' : 'failed'}`);

      // Verify the record is deactivated
      const afterDeactivate = await stateStore.getActivationStatus(idempotencyKey);
      log('STEP-7', `After deactivate: deactivatedAt=${afterDeactivate?.deactivatedAt ?? 'null'}`);
    } else {
      // Make the skipped deactivation visible instead of passing over it silently.
      log('STEP-7', 'No activation record found — nothing to deactivate');
    }

    // ── Save full output ─────────────────────────────────────────────────────
    const fullOutput = {
      timestamp: new Date().toISOString(),
      pain: DOGFOOD_PAIN,
      pipelineResult: {
        decision: pipelineResult.decision,
        ruleArtifactId: pipelineResult.ruleArtifactId,
        principleArtifactId: pipelineResult.principleArtifactId,
        approvalId: pipelineResult.approvalId,
        stages: pipelineResult.stages,
        degradationReason: pipelineResult.degradationReason,
      },
      generatedPrinciple,
      ruleCode,
      evaluatorJudgment,
      approval: { ok: approveResult.ok },
      activation: {
        ok: completionResult.ok,
        decision: completionResult.ok ? completionResult.decision : null,
        activationId: completionResult.ok ? completionResult.activationId : null,
      },
      behaviorResults,
      model: { provider: PROVIDER, model: MODEL_ID, baseUrl: LM_STUDIO_BASE_URL },
    };

    const outputPath = path.join(outputDir, 'dogfood-output.json');
    fs.writeFileSync(outputPath, JSON.stringify(fullOutput, null, 2));

    // Also save rule code separately for easy review
    if (ruleCode) {
      fs.writeFileSync(path.join(outputDir, 'generated-rule.js'), ruleCode);
    }

    log('DONE', `Results saved to ${outputDir}/`);
  } finally {
    // Close the handle before the caller removes the workspace directory.
    await sm2.close();
  }
}

/**
 * Entry point of the dogfood run.
 *
 * Creates the `dogfood-output` directory under the current working directory
 * and a scratch workspace, runs all stages, and removes the workspace again
 * on every exit path: success, early exit with partial results, or a thrown
 * error. Result files live in the output directory, not in the workspace.
 *
 * @throws Propagates any error raised by the stages (after cleanup).
 */
async function main(): Promise<void> {
  const outputDir = path.resolve(process.cwd(), 'dogfood-output');
  fs.mkdirSync(outputDir, { recursive: true });

  const tmpDir = makeTmpDir();
  log('SETUP', `Workspace: ${tmpDir}`);
  log('SETUP', `LLM: ${MODEL_ID} @ ${LM_STUDIO_BASE_URL}`);

  try {
    await runDogfoodStages(tmpDir, outputDir);
  } finally {
    // Cleanup tmp dir (best effort — a failed removal must not mask the
    // outcome of the run).
    try {
      fs.rmSync(tmpDir, { recursive: true, force: true });
    } catch {
      // ignore
    }
  }
}
415
+
416
/** Prints the failure to stderr and terminates the process with exit code 1. */
const exitOnFailure = (err: unknown): never => {
  console.error('Dogfood failed:', err);
  process.exit(1);
};

// Kick off the run; any rejection from main() ends the process non-zero.
main().catch(exitOnFailure);