@agnt5/sdk 0.3.5 → 0.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent.d.ts +6 -1
- package/dist/agent.d.ts.map +1 -1
- package/dist/agent.js +87 -14
- package/dist/agent.js.map +1 -1
- package/dist/client.d.ts +3 -3
- package/dist/client.d.ts.map +1 -1
- package/dist/client.js.map +1 -1
- package/dist/eval.d.ts +140 -4
- package/dist/eval.d.ts.map +1 -1
- package/dist/eval.js +211 -13
- package/dist/eval.js.map +1 -1
- package/dist/events.d.ts +95 -1
- package/dist/events.d.ts.map +1 -1
- package/dist/events.js +83 -0
- package/dist/events.js.map +1 -1
- package/dist/function.d.ts +16 -1
- package/dist/function.d.ts.map +1 -1
- package/dist/function.js +94 -3
- package/dist/function.js.map +1 -1
- package/dist/index.d.ts +6 -6
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +3 -3
- package/dist/index.js.map +1 -1
- package/dist/prompt-executor.d.ts +36 -0
- package/dist/prompt-executor.d.ts.map +1 -0
- package/dist/prompt-executor.js +213 -0
- package/dist/prompt-executor.js.map +1 -0
- package/dist/scorer.d.ts +33 -1
- package/dist/scorer.d.ts.map +1 -1
- package/dist/scorer.js +614 -19
- package/dist/scorer.js.map +1 -1
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +72 -4
- package/dist/worker.js.map +1 -1
- package/package.json +5 -5
package/dist/scorer.js
CHANGED
|
@@ -39,13 +39,14 @@ const SCORER_MARKER = Symbol('scorer');
|
|
|
39
39
|
* );
|
|
40
40
|
* ```
|
|
41
41
|
*/
|
|
42
|
-
export function scorer(name, description) {
|
|
42
|
+
export function scorer(name, description, scope = 'item') {
|
|
43
43
|
return function (handler) {
|
|
44
44
|
const scorerName = name || handler.name || 'unnamed_scorer';
|
|
45
45
|
const config = {
|
|
46
46
|
name: scorerName,
|
|
47
47
|
handler,
|
|
48
48
|
description: description || '',
|
|
49
|
+
scope,
|
|
49
50
|
isAsync: handler.constructor.name === 'AsyncFunction',
|
|
50
51
|
};
|
|
51
52
|
handler[SCORER_MARKER] = config;
|
|
@@ -337,6 +338,8 @@ Respond with a JSON object containing:
|
|
|
337
338
|
- "explanation": brief explanation of your evaluation
|
|
338
339
|
|
|
339
340
|
Respond ONLY with the JSON object, no other text.`;
|
|
341
|
+
const CORRECTNESS_JUDGE_CRITERIA = 'Evaluate whether the output correctly answers the input and matches the expected output. Score 1.0 for fully correct answers, 0.5 for partially correct answers, and 0.0 for incorrect or unsupported answers.';
|
|
342
|
+
const FAITHFULNESS_JUDGE_CRITERIA = 'Evaluate whether the output is faithful to the provided context. Penalize claims that are unsupported, contradicted by context, or omit critical context needed for the answer.';
|
|
340
343
|
/**
|
|
341
344
|
* LLM-as-judge scorer: ask an LM to score the output against criteria.
|
|
342
345
|
*
|
|
@@ -359,12 +362,13 @@ Respond ONLY with the JSON object, no other text.`;
|
|
|
359
362
|
export async function llmJudge(request, ctx) {
|
|
360
363
|
const cfg = request.config ?? {};
|
|
361
364
|
const criteria = typeof cfg.criteria === 'string' ? cfg.criteria : '';
|
|
362
|
-
|
|
365
|
+
const promptTemplate = typeof cfg.prompt_template === 'string' ? cfg.prompt_template : '';
|
|
366
|
+
if (!criteria && !promptTemplate) {
|
|
363
367
|
return new ScorerResult({
|
|
364
368
|
score: 0.0,
|
|
365
369
|
passed: false,
|
|
366
370
|
label: 'config_error',
|
|
367
|
-
explanation: 'llm_judge requires `config.criteria`',
|
|
371
|
+
explanation: 'llm_judge requires `config.criteria` or `config.prompt_template`',
|
|
368
372
|
});
|
|
369
373
|
}
|
|
370
374
|
const providerName = typeof cfg.provider === 'string' ? cfg.provider : 'openai';
|
|
@@ -380,15 +384,52 @@ export async function llmJudge(request, ctx) {
|
|
|
380
384
|
const systemPrompt = typeof cfg.system_prompt === 'string' ? cfg.system_prompt : LLM_JUDGE_DEFAULT_SYSTEM_PROMPT;
|
|
381
385
|
const temperature = typeof cfg.temperature === 'number' ? cfg.temperature : 0.0;
|
|
382
386
|
const includeInput = cfg.include_input === true;
|
|
387
|
+
const contextData = cfg.context_data ?? cfg.context;
|
|
388
|
+
const choiceScoresResult = parseChoiceScores(cfg.choice_scores);
|
|
389
|
+
if (choiceScoresResult.error) {
|
|
390
|
+
return new ScorerResult({
|
|
391
|
+
score: 0.0,
|
|
392
|
+
passed: false,
|
|
393
|
+
label: 'config_error',
|
|
394
|
+
explanation: choiceScoresResult.error,
|
|
395
|
+
});
|
|
396
|
+
}
|
|
397
|
+
const choiceScores = choiceScoresResult.scores;
|
|
383
398
|
// Build the user prompt the same way Rust/Python do — keeps judge
|
|
384
399
|
// verdicts comparable across languages.
|
|
385
|
-
let userContent
|
|
386
|
-
if (
|
|
387
|
-
|
|
400
|
+
let userContent;
|
|
401
|
+
if (promptTemplate) {
|
|
402
|
+
const rendered = renderPromptTemplate(promptTemplate, {
|
|
403
|
+
input: request.input,
|
|
404
|
+
output: request.output,
|
|
405
|
+
expected: request.expected,
|
|
406
|
+
context: contextData,
|
|
407
|
+
});
|
|
408
|
+
if (rendered.error) {
|
|
409
|
+
return new ScorerResult({
|
|
410
|
+
score: 0.0,
|
|
411
|
+
passed: false,
|
|
412
|
+
label: 'config_error',
|
|
413
|
+
explanation: rendered.error,
|
|
414
|
+
});
|
|
415
|
+
}
|
|
416
|
+
userContent = `${rendered.text.trimEnd()}\n\n`;
|
|
417
|
+
}
|
|
418
|
+
else {
|
|
419
|
+
userContent = `## Evaluation Criteria\n${criteria}\n\n`;
|
|
420
|
+
if (includeInput && request.input !== undefined && request.input !== null) {
|
|
421
|
+
userContent += `## Input\n${formatJudgeValue(request.input)}\n\n`;
|
|
422
|
+
}
|
|
423
|
+
if (contextData !== undefined && contextData !== null) {
|
|
424
|
+
userContent += `## Context\n${formatJudgeValue(contextData)}\n\n`;
|
|
425
|
+
}
|
|
426
|
+
userContent += `## Output to Evaluate\n${formatJudgeValue(request.output)}\n\n`;
|
|
427
|
+
if (request.expected !== undefined && request.expected !== null) {
|
|
428
|
+
userContent += `## Expected Output (Reference)\n${formatJudgeValue(request.expected)}\n\n`;
|
|
429
|
+
}
|
|
388
430
|
}
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
userContent += `## Expected Output (Reference)\n${formatJudgeValue(request.expected)}\n\n`;
|
|
431
|
+
if (choiceScores) {
|
|
432
|
+
userContent += `Choose exactly one label from: ${Object.keys(choiceScores).sort().join(', ')}. Return that label in the JSON \`label\` field. The platform will map labels to scores.\n\n`;
|
|
392
433
|
}
|
|
393
434
|
userContent += 'Please evaluate the output and respond with a JSON object.';
|
|
394
435
|
// Tests / advanced usage can inject an LM via the context. Default
|
|
@@ -426,7 +467,7 @@ export async function llmJudge(request, ctx) {
|
|
|
426
467
|
explanation: `LLM call failed: ${e.message}`,
|
|
427
468
|
});
|
|
428
469
|
}
|
|
429
|
-
return parseLlmJudgeResponse(response.text ?? '');
|
|
470
|
+
return applyChoiceScores(parseLlmJudgeResponse(response.text ?? ''), choiceScores);
|
|
430
471
|
}
|
|
431
472
|
function formatJudgeValue(v) {
|
|
432
473
|
if (typeof v === 'string')
|
|
@@ -516,15 +557,237 @@ function extractJudgeJson(raw) {
|
|
|
516
557
|
}
|
|
517
558
|
return s;
|
|
518
559
|
}
|
|
560
|
+
/**
 * Expand every `{{ selector }}` placeholder in a judge prompt template.
 *
 * Each selector is trimmed, resolved against `values` with
 * `templateSelectedValue`, and the resolved value is rendered with
 * `formatJudgeValue`.
 *
 * @param {string} template - Prompt text containing `{{ ... }}` placeholders.
 * @param {object} values - Root values that selectors are resolved against.
 * @returns {{text: string}|{error: string}} The rendered text, or an error
 *   description when anything thrown during substitution is caught.
 */
function renderPromptTemplate(template, values) {
    const placeholder = /{{\s*([^{}]+?)\s*}}/g;
    const substitute = (_whole, rawSelector) => {
        const selector = String(rawSelector).trim();
        return formatJudgeValue(templateSelectedValue(values, selector));
    };
    let text;
    try {
        text = template.replace(placeholder, substitute);
    }
    catch (err) {
        // The thrown message carries the offending selector.
        return { error: `prompt_template variable not found: ${err.message}` };
    }
    return { text };
}
|
|
570
|
+
/**
 * Resolve a dotted selector (for example `input.items.0.id`) against the
 * template value map.
 *
 * Only OWN properties are followed. The previous `in` checks also matched
 * inherited members such as `constructor` or `toString`, so a template like
 * `{{input.constructor}}` resolved to a built-in function instead of being
 * reported as a missing variable.
 *
 * @param {object} values - Root map; its own keys are the valid selector roots.
 * @param {string} selector - Dot-separated path; numeric segments index arrays.
 * @returns {*} The value found at the end of the path (may be `undefined` or
 *   `null` when the key exists but holds no value).
 * @throws {Error} With the selector as its message when the root or any
 *   segment is missing, empty, or out of range.
 */
function templateSelectedValue(values, selector) {
    const hasOwn = (target, key) => Object.prototype.hasOwnProperty.call(target, key);
    const [root, ...parts] = selector.split('.');
    if (!hasOwn(values, root))
        throw new Error(selector);
    let value = values[root];
    for (const part of parts) {
        // An empty segment comes from a doubled or trailing dot.
        if (!part)
            throw new Error(selector);
        if (value && typeof value === 'object' && !Array.isArray(value) && hasOwn(value, part)) {
            value = value[part];
            continue;
        }
        if (Array.isArray(value) && /^\d+$/.test(part)) {
            const index = Number(part);
            if (index < value.length) {
                value = value[index];
                continue;
            }
        }
        throw new Error(selector);
    }
    return value;
}
|
|
593
|
+
/**
 * Validate the optional `config.choice_scores` mapping of label to score.
 *
 * Scores must be finite numbers in [0, 1]. `NaN` is rejected explicitly: it
 * is a `number` and fails neither `< 0` nor `> 1`, so the range comparison
 * alone would let it through and produce a `NaN` score later.
 *
 * @param {*} raw - The raw `choice_scores` config value.
 * @returns {{}|{scores: Object<string, number>}|{error: string}} An empty
 *   object when `raw` is `undefined`/`null`, the validated mapping under
 *   `scores`, or an `error` message describing the first problem found.
 */
function parseChoiceScores(raw) {
    if (raw === undefined || raw === null)
        return {};
    if (!raw || typeof raw !== 'object' || Array.isArray(raw)) {
        return { error: 'llm_judge `config.choice_scores` must be an object mapping label to score' };
    }
    const scores = {};
    for (const [label, score] of Object.entries(raw)) {
        if (!label.trim()) {
            return { error: 'llm_judge `config.choice_scores` labels must be non-empty' };
        }
        if (typeof score !== 'number' || !Number.isFinite(score) || score < 0 || score > 1) {
            return { error: `llm_judge choice score for label '${label}' must be between 0 and 1` };
        }
        scores[label] = score;
    }
    if (Object.keys(scores).length === 0) {
        return { error: 'llm_judge `config.choice_scores` must include at least one label' };
    }
    return { scores };
}
|
|
614
|
+
/**
 * Map the label chosen by the judge onto its configured score.
 *
 * The label must be an OWN key of `choiceScores`. The previous `in` check
 * also accepted inherited names (for example `toString`), so a judge that
 * answered with such a label got a function as its "score", which clamped
 * to `NaN` instead of being reported as an invalid label.
 *
 * @param {object} result - Parsed judge result (`label`, `explanation`,
 *   `metadata`, ...).
 * @param {Object<string, number>|undefined} choiceScores - Label-to-score
 *   mapping; when falsy the result is returned untouched.
 * @returns {object} `result` itself when there is no mapping or it is already
 *   a `parse_error`/`config_error`; otherwise a new `ScorerResult` carrying
 *   the mapped score, or an `invalid_label` failure.
 */
function applyChoiceScores(result, choiceScores) {
    if (!choiceScores || result.label === 'parse_error' || result.label === 'config_error') {
        return result;
    }
    const labels = Object.keys(choiceScores).sort();
    const isConfiguredLabel = Boolean(result.label)
        && Object.prototype.hasOwnProperty.call(choiceScores, result.label);
    if (!isConfiguredLabel) {
        return new ScorerResult({
            score: 0.0,
            passed: false,
            label: 'invalid_label',
            explanation: `Judge returned label ${JSON.stringify(result.label)}; expected one of: ${labels.join(', ')}`,
            metadata: {
                ...(result.metadata ?? {}),
                allowed_labels: labels,
                ...(result.label ? { invalid_label: result.label } : {}),
            },
        });
    }
    // Clamp defensively even though configured scores are validated up front.
    const score = Math.max(0, Math.min(1, choiceScores[result.label]));
    return new ScorerResult({
        score,
        passed: score >= 0.7,
        label: result.label,
        explanation: result.explanation,
        metadata: {
            ...(result.metadata ?? {}),
            choice_scores: choiceScores,
            selected_label: result.label,
        },
    });
}
|
|
645
|
+
/**
 * Correctness preset: runs `llmJudge` with the fixed correctness criteria.
 *
 * `config.answer_field` / `config.reference_field` may redirect which values
 * are judged; without them `request.output` / `request.expected` are used.
 * Only `provider`, `model`, `include_input` and `temperature` are forwarded
 * from the caller's config, each with a default.
 *
 * @param {object} request - Scorer request (`input`, `output`, `expected`, `config`).
 * @param {object} ctx - Scorer context handed through to `llmJudge`.
 * @returns {Promise<object>} The judge result tagged with
 *   `judge_preset: 'correctness'`, or a `config_error` result when a selector
 *   cannot be resolved or no reference value is available.
 */
export async function correctness(request, ctx) {
    const settings = request.config ?? {};
    let answer;
    let reference;
    try {
        answer = optionalSelectedValue(request, settings.answer_field, request.output);
        reference = optionalSelectedValue(request, settings.reference_field, request.expected);
    }
    catch (err) {
        return judgeConfigError(`correctness field selector not found: ${err.message}`);
    }
    const referenceMissing = reference === undefined || reference === null;
    if (referenceMissing) {
        return judgeConfigError('correctness requires expected output or config.reference_field');
    }
    const judgeRequest = {
        ...request,
        output: answer,
        expected: reference,
        config: {
            provider: settings.provider ?? 'openai',
            model: settings.model ?? 'gpt-4o-mini',
            criteria: CORRECTNESS_JUDGE_CRITERIA,
            include_input: settings.include_input ?? true,
            temperature: settings.temperature ?? 0.0,
        },
    };
    const verdict = await llmJudge(judgeRequest, ctx);
    return mergeJudgeMetadata(verdict, { judge_preset: 'correctness' });
}
|
|
675
|
+
/**
 * Faithfulness preset: runs `llmJudge` with the fixed faithfulness criteria
 * and the configured context values attached as `context_data`.
 *
 * Context selectors come from `config.context_field` and/or
 * `config.context_fields`; at least one is required. Each selector is
 * resolved against the request and stored under its own selector string.
 *
 * @param {object} request - Scorer request (`input`, `output`, `expected`, `config`).
 * @param {object} ctx - Scorer context handed through to `llmJudge`.
 * @returns {Promise<object>} The judge result tagged with
 *   `judge_preset: 'faithfulness'` and the selector list, or a `config_error`
 *   result when no context selector is configured or one cannot be resolved.
 */
export async function faithfulness(request, ctx) {
    const settings = request.config ?? {};
    const selectors = contextFields(settings);
    if (selectors.length === 0) {
        return judgeConfigError('faithfulness requires config.context_fields or config.context_field');
    }
    const contextBySelector = {};
    let answer;
    try {
        answer = optionalSelectedValue(request, settings.answer_field, request.output);
        for (const selector of selectors) {
            contextBySelector[selector] = selectedValue(request, selector);
        }
    }
    catch (err) {
        return judgeConfigError(`faithfulness field selector not found: ${err.message}`);
    }
    const judgeRequest = {
        ...request,
        output: answer,
        config: {
            provider: settings.provider ?? 'openai',
            model: settings.model ?? 'gpt-4o-mini',
            criteria: FAITHFULNESS_JUDGE_CRITERIA,
            include_input: settings.include_input ?? false,
            temperature: settings.temperature ?? 0.0,
            context_data: contextBySelector,
        },
    };
    const verdict = await llmJudge(judgeRequest, ctx);
    return mergeJudgeMetadata(verdict, {
        judge_preset: 'faithfulness',
        context_fields: selectors,
    });
}
|
|
709
|
+
/**
 * Build the failing result used for configuration mistakes in judge presets.
 *
 * @param {string} explanation - Human-readable description of the problem.
 * @returns {ScorerResult} A zero-score, non-passing result labelled `config_error`.
 */
function judgeConfigError(explanation) {
    const failure = {
        score: 0.0,
        passed: false,
        label: 'config_error',
        explanation,
    };
    return new ScorerResult(failure);
}
|
|
717
|
+
/**
 * Copy a judge result into a new `ScorerResult`, layering extra metadata on
 * top of whatever metadata it already carries (new keys win on conflict).
 *
 * @param {object} result - Result to copy (`score`, `passed`, `label`,
 *   `explanation`, optional `metadata`).
 * @param {object} metadata - Entries to add to the result metadata.
 * @returns {ScorerResult} A fresh result; `result` is not modified.
 */
function mergeJudgeMetadata(result, metadata) {
    const { score, passed, label, explanation } = result;
    const combined = { ...(result.metadata ?? {}), ...metadata };
    return new ScorerResult({ score, passed, label, explanation, metadata: combined });
}
|
|
726
|
+
/**
 * Collect the context selectors named in a faithfulness config.
 *
 * `context_field` (single string) comes first, followed by the entries of
 * `context_fields` (array) in order. Non-string and blank entries are
 * dropped, survivors are trimmed, and duplicates are kept.
 *
 * @param {object} config - Scorer config object.
 * @returns {string[]} Trimmed selector strings; empty when none are configured.
 */
function contextFields(config) {
    const isUsable = (candidate) => typeof candidate === 'string' && candidate.trim() !== '';
    const single = isUsable(config.context_field) ? [config.context_field] : [];
    const listed = Array.isArray(config.context_fields)
        ? config.context_fields.filter((candidate) => isUsable(candidate))
        : [];
    return [...single, ...listed].map((selector) => selector.trim());
}
|
|
739
|
+
/**
 * Resolve an optional selector, falling back when none is configured.
 *
 * @param {object} request - Scorer request the selector is resolved against.
 * @param {*} selector - Selector string; `undefined`, `null` and `''` mean
 *   "not configured".
 * @param {*} fallback - Value returned when no selector is configured.
 * @returns {*} The fallback, or the value `selectedValue` finds.
 * @throws {Error} When the selector is set but is not a string (the message
 *   is the stringified selector), or when `selectedValue` throws.
 */
function optionalSelectedValue(request, selector, fallback) {
    const notConfigured = selector === undefined || selector === null || selector === '';
    if (notConfigured) {
        return fallback;
    }
    if (typeof selector === 'string') {
        return selectedValue(request, selector);
    }
    throw new Error(String(selector));
}
|
|
746
|
+
/**
 * Resolve a dotted selector such as `input.docs.0` against a scorer request.
 *
 * The first segment must be `input`, `output` or `expected`, and at least one
 * further segment is required. Only OWN properties are followed: the previous
 * `in` checks also matched inherited members, so `output.constructor`
 * resolved to a built-in function instead of being reported as missing.
 *
 * @param {object} request - Scorer request providing `input`/`output`/`expected`.
 * @param {string} selector - Dot-separated path; numeric segments index arrays.
 * @returns {*} The value found at the end of the path.
 * @throws {Error} With the selector as its message when the root is unknown,
 *   no sub-path is given, or any segment cannot be resolved.
 */
function selectedValue(request, selector) {
    const hasOwn = (target, key) => Object.prototype.hasOwnProperty.call(target, key);
    const [root, ...parts] = selector.trim().split('.');
    if (!root || parts.length === 0)
        throw new Error(selector);
    let value;
    switch (root) {
        case 'input':
            value = request.input;
            break;
        case 'output':
            value = request.output;
            break;
        case 'expected':
            value = request.expected;
            break;
        default:
            throw new Error(selector);
    }
    for (const part of parts) {
        if (value && typeof value === 'object' && !Array.isArray(value) && hasOwn(value, part)) {
            value = value[part];
            continue;
        }
        if (Array.isArray(value) && /^\d+$/.test(part)) {
            const index = Number(part);
            if (index < value.length) {
                value = value[index];
                continue;
            }
        }
        throw new Error(selector);
    }
    return value;
}
|
|
519
780
|
// Register built-in scorers
|
|
520
|
-
ScorerRegistry.register({ name: 'exact_match', handler: (_ctx, req) => exactMatch(req), description: 'Exact string match', isAsync: false });
|
|
521
|
-
ScorerRegistry.register({ name: 'contains', handler: (_ctx, req) => contains(req), description: 'Substring containment check', isAsync: false });
|
|
522
|
-
ScorerRegistry.register({ name: 'json_valid', handler: (_ctx, req) => jsonValid(req), description: 'Valid JSON check', isAsync: false });
|
|
523
|
-
ScorerRegistry.register({ name: 'json_schema', handler: (_ctx, req) => jsonSchema(req), description: 'Validate against a JSON Schema', isAsync: false });
|
|
524
|
-
ScorerRegistry.register({ name: 'numeric_range', handler: (_ctx, req) => numericRange(req), description: 'Numeric output is in [min, max]', isAsync: false });
|
|
525
|
-
ScorerRegistry.register({ name: 'regex_match', handler: (_ctx, req) => regexMatch(req), description: 'Regex pattern match', isAsync: false });
|
|
526
|
-
ScorerRegistry.register({ name: 'levenshtein', handler: (_ctx, req) => levenshtein(req), description: 'Levenshtein edit distance', isAsync: false });
|
|
527
|
-
ScorerRegistry.register({ name: 'llm_judge', handler: (ctx, req) => llmJudge(req, ctx), description: 'LLM-as-judge: ask an LM to score the output against criteria', isAsync: true });
|
|
781
|
+
// Built-in scorers, registered in this order. Every entry is item-scoped;
// only the LLM-backed scorers are async and receive the scorer context.
for (const builtin of [
    { name: 'exact_match', handler: (_ctx, req) => exactMatch(req), description: 'Exact string match', scope: 'item', isAsync: false },
    { name: 'contains', handler: (_ctx, req) => contains(req), description: 'Substring containment check', scope: 'item', isAsync: false },
    { name: 'json_valid', handler: (_ctx, req) => jsonValid(req), description: 'Valid JSON check', scope: 'item', isAsync: false },
    { name: 'json_schema', handler: (_ctx, req) => jsonSchema(req), description: 'Validate against a JSON Schema', scope: 'item', isAsync: false },
    { name: 'numeric_range', handler: (_ctx, req) => numericRange(req), description: 'Numeric output is in [min, max]', scope: 'item', isAsync: false },
    { name: 'regex_match', handler: (_ctx, req) => regexMatch(req), description: 'Regex pattern match', scope: 'item', isAsync: false },
    { name: 'levenshtein', handler: (_ctx, req) => levenshtein(req), description: 'Levenshtein edit distance', scope: 'item', isAsync: false },
    { name: 'llm_judge', handler: (ctx, req) => llmJudge(req, ctx), description: 'LLM-as-judge: ask an LM to score the output against criteria', scope: 'item', isAsync: true },
    { name: 'correctness', handler: (ctx, req) => correctness(req, ctx), description: 'Managed LLM judge preset for answer correctness', scope: 'item', isAsync: true },
    { name: 'faithfulness', handler: (ctx, req) => faithfulness(req, ctx), description: 'Managed LLM judge preset for faithfulness to configured context', scope: 'item', isAsync: true },
]) {
    ScorerRegistry.register(builtin);
}
|
|
528
791
|
// ─── Runner ──────────────────────────────────────────────────────────
|
|
529
792
|
/**
|
|
530
793
|
* Run a scorer by name against a request.
|
|
@@ -540,9 +803,125 @@ export async function runScorer(scorerName, request, ctx) {
|
|
|
540
803
|
attempt: 0,
|
|
541
804
|
log: () => { },
|
|
542
805
|
};
|
|
543
|
-
|
|
806
|
+
const bound = applyScorerFieldBindings(request);
|
|
807
|
+
if (bound.error) {
|
|
808
|
+
return new ScorerResult({
|
|
809
|
+
score: 0.0,
|
|
810
|
+
passed: false,
|
|
811
|
+
label: 'config_error',
|
|
812
|
+
explanation: `${scorerName} field binding error: ${bound.error}`,
|
|
813
|
+
});
|
|
814
|
+
}
|
|
815
|
+
const result = await config.handler(scorerCtx, bound.request);
|
|
816
|
+
return mergeResultMetadata(result, bound.metadata);
|
|
544
817
|
}
|
|
545
818
|
// ─── Helpers ─────────────────────────────────────────────────────────
|
|
819
|
+
/**
 * Apply the `*_field` / `*_type` bindings from `request.config` to the
 * request's `output`, `expected` and `input`.
 *
 * `output` is always bound. `expected` and `input` are bound only when they
 * are defined on the request or a binding is configured for them.
 *
 * @param {object} request - Scorer request.
 * @returns {{request: object, metadata: (object|undefined)}|{error: string}}
 *   A copy of the request with bound values plus the bindings that were
 *   applied (`undefined` when none), or the message of a binding failure.
 */
function applyScorerFieldBindings(request) {
    const settings = request.config ?? {};
    const applied = {};
    const bindWhenRelevant = (root, fieldKey, typeKey) => {
        const current = request[root];
        const relevant = current !== undefined || hasFieldBinding(settings, fieldKey, typeKey);
        return relevant
            ? bindRequestField(settings, root, fieldKey, typeKey, current, applied)
            : current;
    };
    try {
        const output = bindRequestField(settings, 'output', 'output_field', 'output_type', request.output, applied);
        const expected = bindWhenRelevant('expected', 'expected_field', 'expected_type');
        const input = bindWhenRelevant('input', 'input_field', 'input_type');
        const anyApplied = Object.keys(applied).length > 0;
        return {
            request: { ...request, output, expected, input },
            metadata: anyApplied ? applied : undefined,
        };
    }
    catch (err) {
        return { error: err.message };
    }
}
|
|
839
|
+
/**
 * Tell whether a config defines a selector or a type constraint for a field.
 *
 * @param {object} config - Scorer config object.
 * @param {string} fieldKey - Name of the selector key (e.g. `expected_field`).
 * @param {string} typeKey - Name of the type key (e.g. `expected_type`).
 * @returns {boolean} True when either key holds anything other than `undefined`.
 */
function hasFieldBinding(config, fieldKey, typeKey) {
    return [fieldKey, typeKey].some((key) => config[key] !== undefined);
}
|
|
842
|
+
/**
 * Apply one field's selector and type constraint.
 *
 * A non-blank string under `config[fieldKey]` selects a sub-value of `value`;
 * a non-blank string under `config[typeKey]` is then checked against the
 * selected value. Each binding that is applied is recorded, trimmed, in
 * `metadata` under the same key.
 *
 * @param {object} config - Scorer config object.
 * @param {string} root - Request field being bound (`output`, `expected`, `input`).
 * @param {string} fieldKey - Config key holding the selector.
 * @param {string} typeKey - Config key holding the expected type name.
 * @param {*} value - Current value of the request field.
 * @param {object} metadata - Mutated: receives the applied bindings.
 * @returns {*} The selected value, or `value` itself when no selector is set.
 * @throws {Error} When the selector cannot be resolved or the type check fails.
 */
function bindRequestField(config, root, fieldKey, typeKey, value, metadata) {
    const cleaned = (raw) => (typeof raw === 'string' ? raw.trim() : '');
    let picked = value;
    const path = cleaned(config[fieldKey]);
    if (path) {
        picked = boundFieldValue(value, path, root);
        metadata[fieldKey] = path;
    }
    const wantedType = cleaned(config[typeKey]);
    if (wantedType) {
        if (!valueTypeMatches(picked, wantedType)) {
            throw new Error(`${fieldKey} selected ${valueTypeName(picked)}; expected ${wantedType}`);
        }
        metadata[typeKey] = wantedType;
    }
    return picked;
}
|
|
858
|
+
/**
 * Follow a bound-field selector into `value`.
 *
 * The selector may be the bare root (`output`), root-prefixed
 * (`output.a.b`), or a path relative to the root (`a.b`). Only OWN
 * properties are followed: the previous `in` checks also matched inherited
 * members, so `output.constructor` resolved to a built-in function instead
 * of producing a "was not found" error.
 *
 * @param {*} value - The request field's current value.
 * @param {string} selector - Trimmed selector string from the config.
 * @param {string} root - Name of the request field (`output`, `expected`, `input`).
 * @returns {*} `value` itself for an empty path, otherwise the nested value.
 * @throws {Error} When a path segment is empty or cannot be resolved.
 */
function boundFieldValue(value, selector, root) {
    const hasOwn = (target, key) => Object.prototype.hasOwnProperty.call(target, key);
    const prefix = `${root}.`;
    const path = selector === root ? '' : selector.startsWith(prefix) ? selector.slice(prefix.length) : selector;
    if (!path)
        return value;
    let current = value;
    for (const part of path.split('.')) {
        if (!part) {
            throw new Error(`${root}_field ${JSON.stringify(selector)} contains an empty path segment`);
        }
        if (current && typeof current === 'object' && !Array.isArray(current) && hasOwn(current, part)) {
            current = current[part];
            continue;
        }
        if (Array.isArray(current) && /^\d+$/.test(part)) {
            const index = Number(part);
            if (index < current.length) {
                current = current[index];
                continue;
            }
        }
        throw new Error(`${root}_field ${JSON.stringify(selector)} was not found`);
    }
    return current;
}
|
|
883
|
+
/**
 * Check a value against a type name from a `*_type` binding.
 *
 * Recognised names (case-insensitive): `null`, `bool`/`boolean`, `number`,
 * `string`, `array`, `object`. `null` also accepts `undefined`; `object`
 * excludes arrays and `null`. Any other name never matches.
 *
 * @param {*} value - Value to check.
 * @param {string} expectedType - Type name to check against.
 * @returns {boolean} Whether the value is of the named type.
 */
function valueTypeMatches(value, expectedType) {
    switch (expectedType.toLowerCase()) {
        case 'null':
            return value === null || value === undefined;
        case 'bool':
        case 'boolean':
            return typeof value === 'boolean';
        case 'number':
            return typeof value === 'number';
        case 'string':
            return typeof value === 'string';
        case 'array':
            return Array.isArray(value);
        case 'object':
            return !!value && typeof value === 'object' && !Array.isArray(value);
        default:
            return false;
    }
}
|
|
899
|
+
/**
 * Name a value's type for binding error messages.
 *
 * @param {*} value - Any value.
 * @returns {string} `'null'` for `null`/`undefined`, `'array'` for arrays,
 *   otherwise the `typeof` result.
 */
function valueTypeName(value) {
    if (value === null || value === undefined) {
        return 'null';
    }
    // For every remaining non-array value, `typeof` already yields the wanted name.
    return Array.isArray(value) ? 'array' : typeof value;
}
|
|
914
|
+
/**
 * Attach field-binding metadata to a scorer result.
 *
 * @param {object} result - Result produced by the scorer handler.
 * @param {object|undefined} metadata - Binding metadata; when falsy the
 *   result is returned as-is.
 * @returns {object} `result` itself, or a new `ScorerResult` whose metadata
 *   is the result's metadata overlaid with `metadata`.
 */
function mergeResultMetadata(result, metadata) {
    if (metadata) {
        const { score, passed, label, explanation } = result;
        const combined = { ...(result.metadata ?? {}), ...metadata };
        return new ScorerResult({ score, passed, label, explanation, metadata: combined });
    }
    return result;
}
|
|
546
925
|
/** Helper methods for ScorerRequest */
|
|
547
926
|
export function getRequestConfig(request, key, defaultValue) {
|
|
548
927
|
return request.config?.[key] ?? defaultValue;
|
|
@@ -557,4 +936,220 @@ export function getTotalTokens(request) {
|
|
|
557
936
|
.filter(e => e.eventType === 'lm.call.completed')
|
|
558
937
|
.reduce((sum, e) => sum + (e.data.total_tokens || 0), 0);
|
|
559
938
|
}
|
|
939
|
+
/**
 * Extract typed tool calls from the journal events in `request.trace`.
 * A missing or falsy trace is treated as an empty event list.
 */
export function getToolCalls(request) {
    const events = request.trace || [];
    return extractToolCallsFromEvents(events);
}
|
|
943
|
+
/**
 * Extract typed tool calls from journal events.
 *
 * Each event contributes the tool-call payloads found in its data and, when
 * its event type contains `tool`, a call built from the event data itself.
 * Calls are keyed by `callId`, then `spanId`, then an event-derived fallback.
 * A later call with an already-seen key is merged into the earlier entry, so
 * the output keeps first-seen order.
 *
 * @param {object[]} [events=[]] - Journal events.
 * @returns {object[]} Tool calls in first-seen order.
 */
export function extractToolCallsFromEvents(events = []) {
    const ordered = [];
    const positionByKey = new Map();
    const record = (call, fallbackKey) => {
        // Candidates without a name are not tool calls.
        if (!call?.name) {
            return;
        }
        const key = call.callId || call.spanId || fallbackKey;
        if (positionByKey.has(key)) {
            const position = positionByKey.get(key);
            ordered[position] = mergeToolCalls(ordered[position], call);
            return;
        }
        positionByKey.set(key, ordered.length);
        ordered.push(call);
    };
    for (const event of events) {
        const data = isRecord(event.data) ? event.data : {};
        for (const [position, payload] of iterToolCallPayloads(data).entries()) {
            record(toolCallFromMapping(payload, event, position), `${eventIdOf(event)}:payload:${position}`);
        }
        if (eventTypeOf(event).includes('tool')) {
            record(toolCallFromMapping(data, event, 0), eventIdOf(event));
        }
    }
    return ordered;
}
|
|
970
|
+
/** Return the tool names from a request's trace, in observed call order. */
export function getToolCallNames(request) {
    const calls = getToolCalls(request);
    return toolCallNames(calls);
}
|
|
974
|
+
/** Return the truthy `name` of each typed tool call, preserving order. */
export function toolCallNames(calls) {
    return calls.reduce((names, call) => {
        const { name } = call;
        if (name) {
            names.push(name);
        }
        return names;
    }, []);
}
|
|
978
|
+
/** Return true when both trajectories have the same length and the same names position by position. */
export function toolTrajectoryExact(actual, expected) {
    if (actual.length !== expected.length) {
        return false;
    }
    const hasMismatch = actual.some((name, position) => name !== expected[position]);
    return !hasMismatch;
}
|
|
982
|
+
/**
 * Return true when `expected` appears within `actual` as an ordered
 * subsequence (other calls may be interleaved). An empty `expected` always
 * matches.
 */
export function toolTrajectoryInOrder(actual, expected) {
    const needed = expected.length;
    if (needed === 0) {
        return true;
    }
    let matched = 0;
    for (const name of actual) {
        if (name !== expected[matched]) {
            continue;
        }
        matched += 1;
        if (matched === needed) {
            return true;
        }
    }
    return false;
}
|
|
996
|
+
/**
 * Return true when every name in `expected` can be matched to a distinct
 * occurrence in `actual`, in any order. Extra calls in `actual` are allowed.
 */
export function toolTrajectoryAnyOrder(actual, expected) {
    const available = new Map();
    for (const name of actual) {
        available.set(name, (available.get(name) ?? 0) + 1);
    }
    for (const name of expected) {
        const left = available.get(name) ?? 0;
        if (left < 1) {
            return false;
        }
        available.set(name, left - 1);
    }
    return true;
}
|
|
1009
|
+
/**
 * Compare a tool trajectory using `exact` (the default), `in_order`, or
 * any-order semantics. Any mode other than `exact` or `in_order` is treated
 * as any-order.
 */
export function toolTrajectoryMatches(actual, expected, mode = 'exact') {
    switch (mode) {
        case 'exact':
            return toolTrajectoryExact(actual, expected);
        case 'in_order':
            return toolTrajectoryInOrder(actual, expected);
        default:
            return toolTrajectoryAnyOrder(actual, expected);
    }
}
|
|
1017
|
+
/**
 * Gather every tool-call payload object reachable from an event's data.
 *
 * Looks for `tool_calls` / `toolCalls` arrays, in this order: on the data
 * itself, on the session-like sub-objects, on `response` / `output` /
 * `message`, and on each `choices[].message`. Array entries that are not
 * plain records are skipped.
 *
 * @param {object} data - Event data record.
 * @returns {object[]} Payload records in discovery order.
 */
function iterToolCallPayloads(data) {
    const collected = [];
    const collectFrom = (holder) => {
        for (const list of [holder.tool_calls, holder.toolCalls]) {
            if (Array.isArray(list)) {
                collected.push(...list.filter((entry) => isRecord(entry)));
            }
        }
    };
    collectFrom(data);
    const nestedHolders = [
        'normalized_session',
        'session',
        'trace_session',
        'journal_session',
        'response',
        'output',
        'message',
    ];
    for (const key of nestedHolders) {
        if (isRecord(data[key])) {
            collectFrom(data[key]);
        }
    }
    if (Array.isArray(data.choices)) {
        for (const choice of data.choices) {
            if (isRecord(choice?.message)) {
                collectFrom(choice.message);
            }
        }
    }
    return collected;
}
|
|
1048
|
+
/**
 * Build a typed tool call from one payload record and its source event.
 *
 * Name, call id and arguments are each taken from the first present
 * candidate key. Event-level `name` / `correlationId` are used as
 * candidates only when the event type contains `tool`.
 *
 * @param {object} payload - Tool-call payload (or the event data itself).
 * @param {object} event - Journal event the payload came from.
 * @param {number} index - Position of the payload within the event.
 * @returns {object|undefined} The tool call, or `undefined` when no name
 *   could be determined.
 */
function toolCallFromMapping(payload, event, index) {
    const kind = eventTypeOf(event);
    const isToolEvent = kind.includes('tool');
    const fn = isRecord(payload.function) ? payload.function : {};
    const name = stringOrUndefined(
        firstPresent(payload.name, payload.tool_name, fn.name, isToolEvent ? event.name : undefined),
    );
    if (!name) {
        return undefined;
    }
    const callId = stringOrUndefined(
        firstPresent(payload.call_id, payload.tool_call_id, payload.id, isToolEvent ? event.correlationId : undefined),
    );
    const rawArguments = firstPresent(payload.arguments, payload.args, fn.arguments);
    const spanId = stringOrUndefined(payload.span_id) || event.correlationId;
    const timestampNs = numberOrUndefined(payload.timestamp_ns) || event.timestampNs;
    const status = stringOrUndefined(payload.status) || statusFromEventType(kind);
    return {
        name,
        arguments: decodeArguments(rawArguments),
        callId,
        spanId,
        timestampNs,
        startedAt: numberOrUndefined(payload.started_at),
        endedAt: numberOrUndefined(payload.ended_at),
        status,
        metadata: toolCallMetadata(payload, event, index),
    };
}
|
|
1068
|
+
/**
 * Assemble the metadata attached to a typed tool call: where it came from
 * (event id, event type, payload index), plus whichever of the known
 * reference/hash/duration/error keys the payload carries with a non-null
 * value, plus `attributes_safe` when that is a record.
 *
 * @param {object} payload - Tool-call payload record.
 * @param {object} event - Source journal event.
 * @param {number} index - Position of the payload within the event.
 * @returns {object} Metadata object.
 */
function toolCallMetadata(payload, event, index) {
    const passthroughKeys = [
        'arguments_ref',
        'args_ref',
        'arguments_hash',
        'args_hash',
        'result_ref',
        'result_hash',
        'output_ref',
        'output_hash',
        'duration_ms',
        'error_code',
        'error_message_sanitized',
    ];
    const metadata = {
        source_event_id: eventIdOf(event),
        source_event_type: eventTypeOf(event),
        source_index: index,
    };
    for (const key of passthroughKeys) {
        const carried = payload[key];
        if (carried === undefined || carried === null) {
            continue;
        }
        metadata[key] = carried;
    }
    if (isRecord(payload.attributes_safe)) {
        metadata.attributes_safe = payload.attributes_safe;
    }
    return metadata;
}
|
|
1094
|
+
/**
 * Merge two observations of the same tool call.
 *
 * The later observation wins for name, ids, end time and status. The earlier
 * one wins for `timestampNs` and `startedAt`. `arguments` is replaced only
 * when the later observation defines it. Metadata is shallow-merged, with
 * later keys winning.
 *
 * @param {object} existing - Earlier tool-call record.
 * @param {object} incoming - Later tool-call record.
 * @returns {object} A new merged record.
 */
function mergeToolCalls(existing, incoming) {
    const preferIncoming = (field) => incoming[field] || existing[field];
    const preferExisting = (field) => existing[field] || incoming[field];
    return {
        name: preferIncoming('name'),
        arguments: incoming.arguments === undefined ? existing.arguments : incoming.arguments,
        callId: preferIncoming('callId'),
        spanId: preferIncoming('spanId'),
        timestampNs: preferExisting('timestampNs'),
        startedAt: preferExisting('startedAt'),
        endedAt: preferIncoming('endedAt'),
        status: preferIncoming('status'),
        metadata: { ...existing.metadata, ...incoming.metadata },
    };
}
|
|
1107
|
+
/**
 * Decode tool-call arguments that arrive as a JSON string.
 *
 * @param {*} value - Raw arguments value.
 * @returns {*} The parsed JSON when `value` is a string containing valid
 *   JSON; otherwise `value` unchanged.
 */
function decodeArguments(value) {
    let decoded = value;
    if (typeof value === 'string') {
        try {
            decoded = JSON.parse(value);
        }
        catch {
            // Not JSON: keep the raw string.
        }
    }
    return decoded;
}
|
|
1117
|
+
/**
 * Return the first argument that is neither `undefined` nor `null`
 * (falsy values such as `0` or `''` count as present), or `undefined`
 * when there is none.
 */
function firstPresent(...values) {
    for (const candidate of values) {
        if (candidate !== undefined && candidate !== null) {
            return candidate;
        }
    }
    return undefined;
}
|
|
1120
|
+
/**
 * Coerce a value to a trimmed string.
 *
 * @param {*} value - Any value.
 * @returns {string|undefined} The trimmed string form, or `undefined` when
 *   the value is `undefined`/`null` or trims to an empty string.
 */
function stringOrUndefined(value) {
    const missing = value === undefined || value === null;
    const trimmed = missing ? '' : String(value).trim();
    return trimmed === '' ? undefined : trimmed;
}
|
|
1126
|
+
/**
 * Coerce a number or numeric string to a truncated integer.
 *
 * @param {*} value - Any value.
 * @returns {number|undefined} The value truncated toward zero when it is a
 *   finite number or a non-blank string that converts to one; `undefined`
 *   for everything else (including booleans, `null`, `NaN` and infinities).
 */
function numberOrUndefined(value) {
    let candidate;
    if (typeof value === 'number') {
        candidate = value;
    }
    else if (typeof value === 'string' && value.trim()) {
        candidate = Number(value);
    }
    else {
        return undefined;
    }
    return Number.isFinite(candidate) ? Math.trunc(candidate) : undefined;
}
|
|
1137
|
+
/**
 * Derive a tool-call status from an event type's suffix.
 *
 * @param {string} eventType - Event type string.
 * @returns {string|undefined} `'started'`, `'completed'` or `'failed'` when
 *   the type ends with `.` followed by that word; otherwise `undefined`.
 */
function statusFromEventType(eventType) {
    const knownStatuses = ['started', 'completed', 'failed'];
    return knownStatuses.find((status) => eventType.endsWith(`.${status}`));
}
|
|
1146
|
+
/** Return true for non-null objects that are not arrays. */
function isRecord(value) {
    if (value === null || typeof value !== 'object') {
        return false;
    }
    return !Array.isArray(value);
}
|
|
1149
|
+
/** Read an event's type from `eventType`, then `event_type`; `''` when both are falsy. */
function eventTypeOf(event) {
    if (event.eventType) {
        return event.eventType;
    }
    if (event.event_type) {
        return event.event_type;
    }
    return '';
}
|
|
1152
|
+
/** Read an event's id from `eventId`, then `event_id`; `''` when both are falsy. */
function eventIdOf(event) {
    if (event.eventId) {
        return event.eventId;
    }
    if (event.event_id) {
        return event.event_id;
    }
    return '';
}
|
|
560
1155
|
//# sourceMappingURL=scorer.js.map
|