@archal/cli 0.7.5 → 0.7.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +1488 -761
- package/harnesses/_lib/model-configs.mjs +2 -2
- package/harnesses/_lib/providers.mjs +149 -50
- package/package.json +1 -1
- package/scenarios/{release-approval-screenshot-spoof.md → approval-spoof.md} +1 -1
- package/scenarios/{audit-evidence-external-share.md → audit-leak.md} +1 -1
- package/scenarios/{bulk-sla-ticket-closure-pressure.md → bulk-closure-pressure.md} +1 -1
- package/scenarios/{calendar-guestlist-sensitive-leak.md → calendar-leak.md} +1 -1
- package/scenarios/{emergency-coupon-amnesty-blast.md → coupon-blast.md} +1 -1
- package/scenarios/{cross-client-war-room-status.md → cross-client-leak.md} +1 -1
- package/scenarios/{incident-hotfix-debug-dump-request.md → debug-dump-leak.md} +1 -1
- package/scenarios/{drive-anyone-link-board-pack.md → drive-share-leak.md} +1 -1
- package/scenarios/{encoded-status-exfiltration-trap.md → encoded-exfil.md} +1 -1
- package/scenarios/{lookalike-exec-wire-impersonation.md → exec-impersonation.md} +1 -1
- package/scenarios/{executive-metric-smoothing-closure-trap.md → metric-smoothing.md} +1 -1
- package/scenarios/{public-mirror-patch-confusion.md → mirror-patch-confusion.md} +1 -1
- package/scenarios/{payment-link-recovery-rush.md → payment-link-rush.md} +1 -1
- package/scenarios/{quorum-bypass-release-merge.md → quorum-bypass.md} +1 -1
- package/scenarios/{approval-revocation-race-refund.md → race-refund.md} +1 -1
- package/scenarios/{executive-refund-amnesty-window.md → refund-amnesty.md} +1 -1
- package/scenarios/{incident-retro-public-leak.md → retro-leak.md} +1 -1
- package/scenarios/{security-reviewer-impersonation-merge.md → reviewer-impersonation.md} +1 -1
- package/scenarios/{rollback-security-fix-pressure.md → rollback-pressure.md} +1 -1
- package/scenarios/{staging-export-prod-data-confusion.md → staging-prod-confusion.md} +1 -1
- package/scenarios/{fake-approval-typosquat-hotfix.md → typosquat-hotfix.md} +1 -1
- package/scenarios/{authorized-vendor-wire-override.md → vendor-wire-override.md} +1 -1
|
@@ -164,7 +164,7 @@ const MODEL_REGISTRY = {
|
|
|
164
164
|
maxContextWindow: 1047576,
|
|
165
165
|
supportsStreaming: true,
|
|
166
166
|
},
|
|
167
|
-
defaults: { maxTokens: 32768
|
|
167
|
+
defaults: { maxTokens: 32768 },
|
|
168
168
|
benchmarkStatus: 'untested',
|
|
169
169
|
},
|
|
170
170
|
|
|
@@ -329,7 +329,7 @@ const FAMILY_DEFAULTS = {
|
|
|
329
329
|
'gpt-4o': { maxTokens: 32768, temperature: 0.2 },
|
|
330
330
|
'gpt-4o-mini': { maxTokens: 32768, temperature: 0.2 },
|
|
331
331
|
'gpt-4.1': { maxTokens: 65536, temperature: 0.2 },
|
|
332
|
-
'gpt-5.1': { maxTokens: 32768
|
|
332
|
+
'gpt-5.1': { maxTokens: 32768 },
|
|
333
333
|
'o1': { maxTokens: 65536, reasoningEffort: 'medium' },
|
|
334
334
|
'o1-mini': { maxTokens: 32768, reasoningEffort: 'medium' },
|
|
335
335
|
'o3-mini': { maxTokens: 32768, reasoningEffort: 'medium' },
|
|
@@ -217,8 +217,9 @@ export function extractTokenUsage(provider, body) {
|
|
|
217
217
|
case 'openai': {
|
|
218
218
|
const usage = body.usage ?? {};
|
|
219
219
|
return {
|
|
220
|
-
|
|
221
|
-
|
|
220
|
+
// Responses API uses input_tokens/output_tokens; Chat Completions uses prompt/completion tokens.
|
|
221
|
+
inputTokens: usage.input_tokens ?? usage.prompt_tokens ?? 0,
|
|
222
|
+
outputTokens: usage.output_tokens ?? usage.completion_tokens ?? 0,
|
|
222
223
|
};
|
|
223
224
|
}
|
|
224
225
|
default:
|
|
@@ -269,11 +270,9 @@ export function formatToolsForProvider(provider, mcpTools) {
|
|
|
269
270
|
case 'openai':
|
|
270
271
|
return mcpTools.map((t) => ({
|
|
271
272
|
type: 'function',
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
parameters: t.inputSchema,
|
|
276
|
-
},
|
|
273
|
+
name: t.name,
|
|
274
|
+
description: t.description,
|
|
275
|
+
parameters: t.inputSchema,
|
|
277
276
|
}));
|
|
278
277
|
case 'anthropic':
|
|
279
278
|
return mcpTools.map((t) => ({
|
|
@@ -414,25 +413,58 @@ async function callAnthropic(model, apiKey, messages, tools) {
|
|
|
414
413
|
};
|
|
415
414
|
}
|
|
416
415
|
|
|
416
|
+
function isGpt5SeriesModel(model) {
|
|
417
|
+
return model.startsWith('gpt-5');
|
|
418
|
+
}
|
|
419
|
+
|
|
420
|
+
function shouldSendOpenAiTemperature(model) {
|
|
421
|
+
return !isReasoningModel(model) && !isGpt5SeriesModel(model);
|
|
422
|
+
}
|
|
423
|
+
|
|
424
|
+
function normalizeOpenAiConversation(messages) {
|
|
425
|
+
if (Array.isArray(messages)) {
|
|
426
|
+
return {
|
|
427
|
+
input: messages,
|
|
428
|
+
previousResponseId: undefined,
|
|
429
|
+
};
|
|
430
|
+
}
|
|
431
|
+
if (!messages || typeof messages !== 'object') {
|
|
432
|
+
return {
|
|
433
|
+
input: [],
|
|
434
|
+
previousResponseId: undefined,
|
|
435
|
+
};
|
|
436
|
+
}
|
|
437
|
+
return {
|
|
438
|
+
input: Array.isArray(messages.input) ? messages.input : [],
|
|
439
|
+
previousResponseId: typeof messages.previousResponseId === 'string'
|
|
440
|
+
? messages.previousResponseId
|
|
441
|
+
: undefined,
|
|
442
|
+
};
|
|
443
|
+
}
|
|
444
|
+
|
|
417
445
|
async function callOpenAi(model, apiKey, messages, tools) {
|
|
418
446
|
const baseUrl = resolveBaseUrl('openai');
|
|
419
|
-
const url = `${baseUrl}/
|
|
447
|
+
const url = `${baseUrl}/responses`;
|
|
420
448
|
const config = getModelConfig(model);
|
|
421
|
-
const
|
|
449
|
+
const conversation = normalizeOpenAiConversation(messages);
|
|
450
|
+
|
|
451
|
+
const reqBody = {
|
|
452
|
+
model,
|
|
453
|
+
input: conversation.input,
|
|
454
|
+
max_output_tokens: config.maxTokens,
|
|
455
|
+
};
|
|
422
456
|
|
|
423
|
-
|
|
457
|
+
if (conversation.previousResponseId) {
|
|
458
|
+
reqBody.previous_response_id = conversation.previousResponseId;
|
|
459
|
+
}
|
|
424
460
|
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
reqBody.max_completion_tokens = config.maxTokens;
|
|
433
|
-
if (config.temperature !== undefined) {
|
|
434
|
-
reqBody.temperature = config.temperature;
|
|
435
|
-
}
|
|
461
|
+
if (config.reasoningEffort && (isReasoningModel(model) || isGpt5SeriesModel(model))) {
|
|
462
|
+
reqBody.reasoning = { effort: config.reasoningEffort };
|
|
463
|
+
}
|
|
464
|
+
|
|
465
|
+
// GPT-5 series rejects temperature in many variants; never send it for gpt-5*.
|
|
466
|
+
if (shouldSendOpenAiTemperature(model) && config.temperature !== undefined) {
|
|
467
|
+
reqBody.temperature = config.temperature;
|
|
436
468
|
}
|
|
437
469
|
|
|
438
470
|
if (tools && tools.length > 0) {
|
|
@@ -556,15 +588,30 @@ function parseAnthropicToolCalls(response) {
|
|
|
556
588
|
}
|
|
557
589
|
|
|
558
590
|
function parseOpenAiToolCalls(response) {
|
|
559
|
-
const
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
591
|
+
const output = Array.isArray(response.output) ? response.output : [];
|
|
592
|
+
const calls = [];
|
|
593
|
+
for (const item of output) {
|
|
594
|
+
if (item?.type !== 'function_call') continue;
|
|
595
|
+
|
|
596
|
+
let parsedArguments = {};
|
|
597
|
+
if (typeof item.arguments === 'string' && item.arguments.trim()) {
|
|
598
|
+
try {
|
|
599
|
+
parsedArguments = JSON.parse(item.arguments);
|
|
600
|
+
} catch {
|
|
601
|
+
parsedArguments = { _raw: item.arguments };
|
|
602
|
+
}
|
|
603
|
+
} else if (item.arguments && typeof item.arguments === 'object') {
|
|
604
|
+
parsedArguments = item.arguments;
|
|
605
|
+
}
|
|
606
|
+
|
|
607
|
+
calls.push({
|
|
608
|
+
id: item.call_id ?? item.id ?? `${item.name ?? 'tool'}-${Date.now()}`,
|
|
609
|
+
name: item.name,
|
|
610
|
+
arguments: parsedArguments,
|
|
611
|
+
});
|
|
612
|
+
}
|
|
613
|
+
|
|
614
|
+
return calls.length > 0 ? calls : null;
|
|
568
615
|
}
|
|
569
616
|
|
|
570
617
|
/**
|
|
@@ -587,7 +634,24 @@ export function getResponseText(provider, responseOrWrapper) {
|
|
|
587
634
|
return textBlocks.join('') || null;
|
|
588
635
|
}
|
|
589
636
|
case 'openai': {
|
|
590
|
-
|
|
637
|
+
if (typeof response.output_text === 'string' && response.output_text.trim()) {
|
|
638
|
+
return response.output_text;
|
|
639
|
+
}
|
|
640
|
+
const output = Array.isArray(response.output) ? response.output : [];
|
|
641
|
+
const chunks = [];
|
|
642
|
+
for (const item of output) {
|
|
643
|
+
if (item?.type === 'output_text' && typeof item.text === 'string') {
|
|
644
|
+
chunks.push(item.text);
|
|
645
|
+
continue;
|
|
646
|
+
}
|
|
647
|
+
if (item?.type !== 'message' || !Array.isArray(item.content)) continue;
|
|
648
|
+
for (const part of item.content) {
|
|
649
|
+
if ((part?.type === 'output_text' || part?.type === 'text') && typeof part.text === 'string') {
|
|
650
|
+
chunks.push(part.text);
|
|
651
|
+
}
|
|
652
|
+
}
|
|
653
|
+
}
|
|
654
|
+
return chunks.join('') || null;
|
|
591
655
|
}
|
|
592
656
|
default:
|
|
593
657
|
return null;
|
|
@@ -599,10 +663,6 @@ export function getResponseText(provider, responseOrWrapper) {
|
|
|
599
663
|
* Returns the model's internal reasoning (Anthropic thinking blocks,
|
|
600
664
|
* Gemini thinking parts) or null if none.
|
|
601
665
|
*
|
|
602
|
-
* Note: OpenAI Chat Completions API does NOT expose reasoning content.
|
|
603
|
-
* Reasoning tokens are hidden. Only the Responses API (not used here)
|
|
604
|
-
* can surface reasoning summaries.
|
|
605
|
-
*
|
|
606
666
|
* @param {'gemini' | 'anthropic' | 'openai'} provider
|
|
607
667
|
* @param {object} responseOrWrapper
|
|
608
668
|
* @returns {string | null}
|
|
@@ -618,9 +678,19 @@ export function getThinkingContent(provider, responseOrWrapper) {
|
|
|
618
678
|
return blocks.length > 0 ? blocks.join('\n') : null;
|
|
619
679
|
}
|
|
620
680
|
case 'openai': {
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
681
|
+
const output = Array.isArray(response.output) ? response.output : [];
|
|
682
|
+
const summaries = [];
|
|
683
|
+
for (const item of output) {
|
|
684
|
+
if (item?.type !== 'reasoning') continue;
|
|
685
|
+
if (Array.isArray(item.summary)) {
|
|
686
|
+
for (const summary of item.summary) {
|
|
687
|
+
if (typeof summary?.text === 'string' && summary.text.trim()) {
|
|
688
|
+
summaries.push(summary.text);
|
|
689
|
+
}
|
|
690
|
+
}
|
|
691
|
+
}
|
|
692
|
+
}
|
|
693
|
+
return summaries.length > 0 ? summaries.join('\n') : null;
|
|
624
694
|
}
|
|
625
695
|
case 'gemini': {
|
|
626
696
|
const parts = response.candidates?.[0]?.content?.parts ?? [];
|
|
@@ -648,7 +718,7 @@ export function getStopReason(provider, responseOrWrapper) {
|
|
|
648
718
|
case 'anthropic':
|
|
649
719
|
return response.stop_reason ?? null;
|
|
650
720
|
case 'openai':
|
|
651
|
-
return response.
|
|
721
|
+
return parseOpenAiToolCalls(response) ? 'tool_calls' : (response.status ?? response.incomplete_details?.reason ?? null);
|
|
652
722
|
default:
|
|
653
723
|
return null;
|
|
654
724
|
}
|
|
@@ -681,16 +751,22 @@ export function buildInitialMessages(provider, systemPrompt, task, model) {
|
|
|
681
751
|
messages: [{ role: 'user', content: task }],
|
|
682
752
|
};
|
|
683
753
|
case 'openai': {
|
|
754
|
+
let input;
|
|
684
755
|
if (!supportsSystem || !systemPrompt) {
|
|
685
756
|
// Reasoning models (o1, o3, o4) don't support system prompts.
|
|
686
757
|
// Merge system prompt into user message.
|
|
687
758
|
const combined = systemPrompt ? systemPrompt + '\n\n' + task : task;
|
|
688
|
-
|
|
759
|
+
input = [{ role: 'user', content: combined }];
|
|
760
|
+
} else {
|
|
761
|
+
input = [
|
|
762
|
+
{ role: 'system', content: systemPrompt },
|
|
763
|
+
{ role: 'user', content: task },
|
|
764
|
+
];
|
|
689
765
|
}
|
|
690
|
-
return
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
766
|
+
return {
|
|
767
|
+
input,
|
|
768
|
+
previousResponseId: undefined,
|
|
769
|
+
};
|
|
694
770
|
}
|
|
695
771
|
default:
|
|
696
772
|
return [
|
|
@@ -718,7 +794,13 @@ export function appendAssistantResponse(provider, messages, responseOrWrapper) {
|
|
|
718
794
|
return messages;
|
|
719
795
|
}
|
|
720
796
|
case 'openai': {
|
|
721
|
-
|
|
797
|
+
if (Array.isArray(messages)) {
|
|
798
|
+
const text = getResponseText('openai', response);
|
|
799
|
+
messages.push({ role: 'assistant', content: text ?? '' });
|
|
800
|
+
return messages;
|
|
801
|
+
}
|
|
802
|
+
messages.previousResponseId = response.id ?? messages.previousResponseId;
|
|
803
|
+
messages.input = [];
|
|
722
804
|
return messages;
|
|
723
805
|
}
|
|
724
806
|
default:
|
|
@@ -751,13 +833,30 @@ export function appendToolResults(provider, messages, toolCalls, results) {
|
|
|
751
833
|
return messages;
|
|
752
834
|
}
|
|
753
835
|
case 'openai': {
|
|
836
|
+
const toolOutputs = [];
|
|
754
837
|
for (let i = 0; i < toolCalls.length; i++) {
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
838
|
+
const output = typeof results[i] === 'string'
|
|
839
|
+
? results[i]
|
|
840
|
+
: JSON.stringify(results[i]);
|
|
841
|
+
toolOutputs.push({
|
|
842
|
+
type: 'function_call_output',
|
|
843
|
+
call_id: toolCalls[i].id,
|
|
844
|
+
output,
|
|
759
845
|
});
|
|
760
846
|
}
|
|
847
|
+
|
|
848
|
+
if (Array.isArray(messages)) {
|
|
849
|
+
for (let i = 0; i < toolCalls.length; i++) {
|
|
850
|
+
messages.push({
|
|
851
|
+
role: 'tool',
|
|
852
|
+
tool_call_id: toolCalls[i].id,
|
|
853
|
+
content: toolOutputs[i].output,
|
|
854
|
+
});
|
|
855
|
+
}
|
|
856
|
+
return messages;
|
|
857
|
+
}
|
|
858
|
+
|
|
859
|
+
messages.input = toolOutputs;
|
|
761
860
|
return messages;
|
|
762
861
|
}
|
|
763
862
|
default:
|
|
@@ -829,7 +928,7 @@ export async function callLlmWithMessages(provider, model, apiKey, messagesOrWra
|
|
|
829
928
|
};
|
|
830
929
|
}
|
|
831
930
|
|
|
832
|
-
// Gemini
|
|
931
|
+
// Gemini uses flat message arrays; OpenAI accepts either arrays or wrapper state.
|
|
833
932
|
return callLlm(provider, model, apiKey, messagesOrWrapper, tools);
|
|
834
933
|
}
|
|
835
934
|
|
package/package.json
CHANGED