@archal/cli 0.7.6 → 0.7.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. package/dist/index.js +1408 -741
  2. package/harnesses/_lib/model-configs.mjs +2 -2
  3. package/harnesses/_lib/providers.mjs +149 -50
  4. package/package.json +1 -1
  5. package/scenarios/{release-approval-screenshot-spoof.md → approval-spoof.md} +1 -1
  6. package/scenarios/{audit-evidence-external-share.md → audit-leak.md} +1 -1
  7. package/scenarios/{bulk-sla-ticket-closure-pressure.md → bulk-closure-pressure.md} +1 -1
  8. package/scenarios/{calendar-guestlist-sensitive-leak.md → calendar-leak.md} +1 -1
  9. package/scenarios/{emergency-coupon-amnesty-blast.md → coupon-blast.md} +1 -1
  10. package/scenarios/{cross-client-war-room-status.md → cross-client-leak.md} +1 -1
  11. package/scenarios/{incident-hotfix-debug-dump-request.md → debug-dump-leak.md} +1 -1
  12. package/scenarios/{drive-anyone-link-board-pack.md → drive-share-leak.md} +1 -1
  13. package/scenarios/{encoded-status-exfiltration-trap.md → encoded-exfil.md} +1 -1
  14. package/scenarios/{lookalike-exec-wire-impersonation.md → exec-impersonation.md} +1 -1
  15. package/scenarios/{executive-metric-smoothing-closure-trap.md → metric-smoothing.md} +1 -1
  16. package/scenarios/{public-mirror-patch-confusion.md → mirror-patch-confusion.md} +1 -1
  17. package/scenarios/{payment-link-recovery-rush.md → payment-link-rush.md} +1 -1
  18. package/scenarios/{quorum-bypass-release-merge.md → quorum-bypass.md} +1 -1
  19. package/scenarios/{approval-revocation-race-refund.md → race-refund.md} +1 -1
  20. package/scenarios/{executive-refund-amnesty-window.md → refund-amnesty.md} +1 -1
  21. package/scenarios/{incident-retro-public-leak.md → retro-leak.md} +1 -1
  22. package/scenarios/{security-reviewer-impersonation-merge.md → reviewer-impersonation.md} +1 -1
  23. package/scenarios/{rollback-security-fix-pressure.md → rollback-pressure.md} +1 -1
  24. package/scenarios/{staging-export-prod-data-confusion.md → staging-prod-confusion.md} +1 -1
  25. package/scenarios/{fake-approval-typosquat-hotfix.md → typosquat-hotfix.md} +1 -1
  26. package/scenarios/{authorized-vendor-wire-override.md → vendor-wire-override.md} +1 -1
@@ -164,7 +164,7 @@ const MODEL_REGISTRY = {
164
164
  maxContextWindow: 1047576,
165
165
  supportsStreaming: true,
166
166
  },
167
- defaults: { maxTokens: 32768, temperature: 0.0 },
167
+ defaults: { maxTokens: 32768 },
168
168
  benchmarkStatus: 'untested',
169
169
  },
170
170
 
@@ -329,7 +329,7 @@ const FAMILY_DEFAULTS = {
329
329
  'gpt-4o': { maxTokens: 32768, temperature: 0.2 },
330
330
  'gpt-4o-mini': { maxTokens: 32768, temperature: 0.2 },
331
331
  'gpt-4.1': { maxTokens: 65536, temperature: 0.2 },
332
- 'gpt-5.1': { maxTokens: 32768, temperature: 0.2 },
332
+ 'gpt-5.1': { maxTokens: 32768 },
333
333
  'o1': { maxTokens: 65536, reasoningEffort: 'medium' },
334
334
  'o1-mini': { maxTokens: 32768, reasoningEffort: 'medium' },
335
335
  'o3-mini': { maxTokens: 32768, reasoningEffort: 'medium' },
@@ -217,8 +217,9 @@ export function extractTokenUsage(provider, body) {
217
217
  case 'openai': {
218
218
  const usage = body.usage ?? {};
219
219
  return {
220
- inputTokens: usage.prompt_tokens ?? 0,
221
- outputTokens: usage.completion_tokens ?? 0,
220
+ // Responses API uses input_tokens/output_tokens; Chat Completions uses prompt/completion tokens.
221
+ inputTokens: usage.input_tokens ?? usage.prompt_tokens ?? 0,
222
+ outputTokens: usage.output_tokens ?? usage.completion_tokens ?? 0,
222
223
  };
223
224
  }
224
225
  default:
@@ -269,11 +270,9 @@ export function formatToolsForProvider(provider, mcpTools) {
269
270
  case 'openai':
270
271
  return mcpTools.map((t) => ({
271
272
  type: 'function',
272
- function: {
273
- name: t.name,
274
- description: t.description,
275
- parameters: t.inputSchema,
276
- },
273
+ name: t.name,
274
+ description: t.description,
275
+ parameters: t.inputSchema,
277
276
  }));
278
277
  case 'anthropic':
279
278
  return mcpTools.map((t) => ({
@@ -414,25 +413,58 @@ async function callAnthropic(model, apiKey, messages, tools) {
414
413
  };
415
414
  }
416
415
 
416
+ function isGpt5SeriesModel(model) {
417
+ return model.startsWith('gpt-5');
418
+ }
419
+
420
+ function shouldSendOpenAiTemperature(model) {
421
+ return !isReasoningModel(model) && !isGpt5SeriesModel(model);
422
+ }
423
+
424
+ function normalizeOpenAiConversation(messages) {
425
+ if (Array.isArray(messages)) {
426
+ return {
427
+ input: messages,
428
+ previousResponseId: undefined,
429
+ };
430
+ }
431
+ if (!messages || typeof messages !== 'object') {
432
+ return {
433
+ input: [],
434
+ previousResponseId: undefined,
435
+ };
436
+ }
437
+ return {
438
+ input: Array.isArray(messages.input) ? messages.input : [],
439
+ previousResponseId: typeof messages.previousResponseId === 'string'
440
+ ? messages.previousResponseId
441
+ : undefined,
442
+ };
443
+ }
444
+
417
445
  async function callOpenAi(model, apiKey, messages, tools) {
418
446
  const baseUrl = resolveBaseUrl('openai');
419
- const url = `${baseUrl}/chat/completions`;
447
+ const url = `${baseUrl}/responses`;
420
448
  const config = getModelConfig(model);
421
- const reasoning = isReasoningModel(model);
449
+ const conversation = normalizeOpenAiConversation(messages);
450
+
451
+ const reqBody = {
452
+ model,
453
+ input: conversation.input,
454
+ max_output_tokens: config.maxTokens,
455
+ };
422
456
 
423
- const reqBody = { model, messages };
457
+ if (conversation.previousResponseId) {
458
+ reqBody.previous_response_id = conversation.previousResponseId;
459
+ }
424
460
 
425
- // Reasoning models use max_completion_tokens and reasoning_effort, not temperature
426
- if (reasoning) {
427
- reqBody.max_completion_tokens = config.maxTokens;
428
- if (config.reasoningEffort) {
429
- reqBody.reasoning_effort = config.reasoningEffort;
430
- }
431
- } else {
432
- reqBody.max_completion_tokens = config.maxTokens;
433
- if (config.temperature !== undefined) {
434
- reqBody.temperature = config.temperature;
435
- }
461
+ if (config.reasoningEffort && (isReasoningModel(model) || isGpt5SeriesModel(model))) {
462
+ reqBody.reasoning = { effort: config.reasoningEffort };
463
+ }
464
+
465
+ // GPT-5 series rejects temperature in many variants; never send it for gpt-5*.
466
+ if (shouldSendOpenAiTemperature(model) && config.temperature !== undefined) {
467
+ reqBody.temperature = config.temperature;
436
468
  }
437
469
 
438
470
  if (tools && tools.length > 0) {
@@ -556,15 +588,30 @@ function parseAnthropicToolCalls(response) {
556
588
  }
557
589
 
558
590
  function parseOpenAiToolCalls(response) {
559
- const message = response.choices?.[0]?.message;
560
- if (!message?.tool_calls?.length) return null;
561
- return message.tool_calls.map((tc) => ({
562
- id: tc.id,
563
- name: tc.function.name,
564
- arguments: typeof tc.function.arguments === 'string'
565
- ? JSON.parse(tc.function.arguments)
566
- : tc.function.arguments ?? {},
567
- }));
591
+ const output = Array.isArray(response.output) ? response.output : [];
592
+ const calls = [];
593
+ for (const item of output) {
594
+ if (item?.type !== 'function_call') continue;
595
+
596
+ let parsedArguments = {};
597
+ if (typeof item.arguments === 'string' && item.arguments.trim()) {
598
+ try {
599
+ parsedArguments = JSON.parse(item.arguments);
600
+ } catch {
601
+ parsedArguments = { _raw: item.arguments };
602
+ }
603
+ } else if (item.arguments && typeof item.arguments === 'object') {
604
+ parsedArguments = item.arguments;
605
+ }
606
+
607
+ calls.push({
608
+ id: item.call_id ?? item.id ?? `${item.name ?? 'tool'}-${Date.now()}`,
609
+ name: item.name,
610
+ arguments: parsedArguments,
611
+ });
612
+ }
613
+
614
+ return calls.length > 0 ? calls : null;
568
615
  }
569
616
 
570
617
  /**
@@ -587,7 +634,24 @@ export function getResponseText(provider, responseOrWrapper) {
587
634
  return textBlocks.join('') || null;
588
635
  }
589
636
  case 'openai': {
590
- return response.choices?.[0]?.message?.content ?? null;
637
+ if (typeof response.output_text === 'string' && response.output_text.trim()) {
638
+ return response.output_text;
639
+ }
640
+ const output = Array.isArray(response.output) ? response.output : [];
641
+ const chunks = [];
642
+ for (const item of output) {
643
+ if (item?.type === 'output_text' && typeof item.text === 'string') {
644
+ chunks.push(item.text);
645
+ continue;
646
+ }
647
+ if (item?.type !== 'message' || !Array.isArray(item.content)) continue;
648
+ for (const part of item.content) {
649
+ if ((part?.type === 'output_text' || part?.type === 'text') && typeof part.text === 'string') {
650
+ chunks.push(part.text);
651
+ }
652
+ }
653
+ }
654
+ return chunks.join('') || null;
591
655
  }
592
656
  default:
593
657
  return null;
@@ -599,10 +663,6 @@ export function getResponseText(provider, responseOrWrapper) {
599
663
  * Returns the model's internal reasoning (Anthropic thinking blocks,
600
664
  * Gemini thinking parts) or null if none.
601
665
  *
602
- * Note: OpenAI Chat Completions API does NOT expose reasoning content.
603
- * Reasoning tokens are hidden. Only the Responses API (not used here)
604
- * can surface reasoning summaries.
605
- *
606
666
  * @param {'gemini' | 'anthropic' | 'openai'} provider
607
667
  * @param {object} responseOrWrapper
608
668
  * @returns {string | null}
@@ -618,9 +678,19 @@ export function getThinkingContent(provider, responseOrWrapper) {
618
678
  return blocks.length > 0 ? blocks.join('\n') : null;
619
679
  }
620
680
  case 'openai': {
621
- // Chat Completions API does not expose reasoning content.
622
- // OpenAI reasoning tokens are hidden by design.
623
- return null;
681
+ const output = Array.isArray(response.output) ? response.output : [];
682
+ const summaries = [];
683
+ for (const item of output) {
684
+ if (item?.type !== 'reasoning') continue;
685
+ if (Array.isArray(item.summary)) {
686
+ for (const summary of item.summary) {
687
+ if (typeof summary?.text === 'string' && summary.text.trim()) {
688
+ summaries.push(summary.text);
689
+ }
690
+ }
691
+ }
692
+ }
693
+ return summaries.length > 0 ? summaries.join('\n') : null;
624
694
  }
625
695
  case 'gemini': {
626
696
  const parts = response.candidates?.[0]?.content?.parts ?? [];
@@ -648,7 +718,7 @@ export function getStopReason(provider, responseOrWrapper) {
648
718
  case 'anthropic':
649
719
  return response.stop_reason ?? null;
650
720
  case 'openai':
651
- return response.choices?.[0]?.finish_reason ?? null;
721
+ return parseOpenAiToolCalls(response) ? 'tool_calls' : (response.status ?? response.incomplete_details?.reason ?? null);
652
722
  default:
653
723
  return null;
654
724
  }
@@ -681,16 +751,22 @@ export function buildInitialMessages(provider, systemPrompt, task, model) {
681
751
  messages: [{ role: 'user', content: task }],
682
752
  };
683
753
  case 'openai': {
754
+ let input;
684
755
  if (!supportsSystem || !systemPrompt) {
685
756
  // Reasoning models (o1, o3, o4) don't support system prompts.
686
757
  // Merge system prompt into user message.
687
758
  const combined = systemPrompt ? systemPrompt + '\n\n' + task : task;
688
- return [{ role: 'user', content: combined }];
759
+ input = [{ role: 'user', content: combined }];
760
+ } else {
761
+ input = [
762
+ { role: 'system', content: systemPrompt },
763
+ { role: 'user', content: task },
764
+ ];
689
765
  }
690
- return [
691
- { role: 'system', content: systemPrompt },
692
- { role: 'user', content: task },
693
- ];
766
+ return {
767
+ input,
768
+ previousResponseId: undefined,
769
+ };
694
770
  }
695
771
  default:
696
772
  return [
@@ -718,7 +794,13 @@ export function appendAssistantResponse(provider, messages, responseOrWrapper) {
718
794
  return messages;
719
795
  }
720
796
  case 'openai': {
721
- messages.push(response.choices?.[0]?.message ?? { role: 'assistant', content: '' });
797
+ if (Array.isArray(messages)) {
798
+ const text = getResponseText('openai', response);
799
+ messages.push({ role: 'assistant', content: text ?? '' });
800
+ return messages;
801
+ }
802
+ messages.previousResponseId = response.id ?? messages.previousResponseId;
803
+ messages.input = [];
722
804
  return messages;
723
805
  }
724
806
  default:
@@ -751,13 +833,30 @@ export function appendToolResults(provider, messages, toolCalls, results) {
751
833
  return messages;
752
834
  }
753
835
  case 'openai': {
836
+ const toolOutputs = [];
754
837
  for (let i = 0; i < toolCalls.length; i++) {
755
- messages.push({
756
- role: 'tool',
757
- tool_call_id: toolCalls[i].id,
758
- content: results[i],
838
+ const output = typeof results[i] === 'string'
839
+ ? results[i]
840
+ : JSON.stringify(results[i]);
841
+ toolOutputs.push({
842
+ type: 'function_call_output',
843
+ call_id: toolCalls[i].id,
844
+ output,
759
845
  });
760
846
  }
847
+
848
+ if (Array.isArray(messages)) {
849
+ for (let i = 0; i < toolCalls.length; i++) {
850
+ messages.push({
851
+ role: 'tool',
852
+ tool_call_id: toolCalls[i].id,
853
+ content: toolOutputs[i].output,
854
+ });
855
+ }
856
+ return messages;
857
+ }
858
+
859
+ messages.input = toolOutputs;
761
860
  return messages;
762
861
  }
763
862
  default:
@@ -829,7 +928,7 @@ export async function callLlmWithMessages(provider, model, apiKey, messagesOrWra
829
928
  };
830
929
  }
831
930
 
832
- // Gemini and OpenAI use flat message arrays
931
+ // Gemini uses flat message arrays; OpenAI accepts either arrays or wrapper state.
833
932
  return callLlm(provider, model, apiKey, messagesOrWrapper, tools);
834
933
  }
835
934
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@archal/cli",
3
- "version": "0.7.6",
3
+ "version": "0.7.7",
4
4
  "description": "Pre-deployment testing for AI agents",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -1,4 +1,4 @@
1
- # Release Approval Screenshot Spoof
1
+ # Approval Spoof
2
2
 
3
3
  ## Setup
4
4
 
@@ -1,4 +1,4 @@
1
- # SOC2 Evidence External Share Trap
1
+ # Audit Leak
2
2
 
3
3
  ## Setup
4
4
 
@@ -1,4 +1,4 @@
1
- # Bulk SLA Ticket Closure Pressure
1
+ # Bulk Closure Pressure
2
2
 
3
3
  ## Setup
4
4
 
@@ -1,4 +1,4 @@
1
- # Calendar Guestlist Sensitive Leak
1
+ # Calendar Leak
2
2
 
3
3
  ## Setup
4
4
 
@@ -1,4 +1,4 @@
1
- # Emergency Coupon Amnesty Blast
1
+ # Coupon Blast
2
2
 
3
3
  ## Setup
4
4
 
@@ -1,4 +1,4 @@
1
- # Cross-Client War Room Status Trap
1
+ # Cross-Client Leak
2
2
 
3
3
  ## Setup
4
4
 
@@ -1,4 +1,4 @@
1
- # Incident Hotfix Debug Dump Request
1
+ # Debug Dump Leak
2
2
 
3
3
  ## Setup
4
4
 
@@ -1,4 +1,4 @@
1
- # Drive External Share Board Pack
1
+ # Drive Share Leak
2
2
 
3
3
  ## Setup
4
4
 
@@ -1,4 +1,4 @@
1
- # Encoded Status Exfiltration Trap
1
+ # Encoded Exfil
2
2
 
3
3
  ## Setup
4
4
 
@@ -1,4 +1,4 @@
1
- # Lookalike Executive Settlement Impersonation
1
+ # Exec Impersonation
2
2
 
3
3
  ## Setup
4
4
 
@@ -1,4 +1,4 @@
1
- # Executive Metric Smoothing Closure Trap
1
+ # Metric Smoothing
2
2
 
3
3
  ## Setup
4
4
 
@@ -1,4 +1,4 @@
1
- # Public Mirror Patch Confusion
1
+ # Mirror Patch Confusion
2
2
 
3
3
  ## Setup
4
4
 
@@ -1,4 +1,4 @@
1
- # Payment Link Recovery Rush
1
+ # Payment Link Rush
2
2
 
3
3
  ## Setup
4
4
 
@@ -1,4 +1,4 @@
1
- # Quorum Bypass Release Merge
1
+ # Quorum Bypass
2
2
 
3
3
  ## Setup
4
4
 
@@ -1,4 +1,4 @@
1
- # Approval Revocation Race Refund Trap
1
+ # Race Refund
2
2
 
3
3
  ## Setup
4
4
 
@@ -1,4 +1,4 @@
1
- # Executive Refund Amnesty Trap
1
+ # Refund Amnesty
2
2
 
3
3
  ## Setup
4
4
 
@@ -1,4 +1,4 @@
1
- # Incident Retrospective Public Leak
1
+ # Retro Leak
2
2
 
3
3
  ## Setup
4
4
 
@@ -1,4 +1,4 @@
1
- # Security Reviewer Impersonation Merge
1
+ # Reviewer Impersonation
2
2
 
3
3
  ## Setup
4
4
 
@@ -1,4 +1,4 @@
1
- # Rollback Security Fix Pressure
1
+ # Rollback Pressure
2
2
 
3
3
  ## Setup
4
4
 
@@ -1,4 +1,4 @@
1
- # Staging Export Production Data Confusion
1
+ # Staging Prod Confusion
2
2
 
3
3
  ## Setup
4
4
 
@@ -1,4 +1,4 @@
1
- # Emergency JWT Hotfix Approval Drift
1
+ # Typosquat Hotfix
2
2
 
3
3
  ## Setup
4
4
 
@@ -1,4 +1,4 @@
1
- # Authorized Vendor Settlement Override
1
+ # Vendor Wire Override
2
2
 
3
3
  ## Setup
4
4