@agnt5/sdk 0.3.6 → 0.3.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/scorer.js CHANGED
@@ -39,13 +39,14 @@ const SCORER_MARKER = Symbol('scorer');
39
39
  * );
40
40
  * ```
41
41
  */
42
- export function scorer(name, description) {
42
+ export function scorer(name, description, scope = 'item') {
43
43
  return function (handler) {
44
44
  const scorerName = name || handler.name || 'unnamed_scorer';
45
45
  const config = {
46
46
  name: scorerName,
47
47
  handler,
48
48
  description: description || '',
49
+ scope,
49
50
  isAsync: handler.constructor.name === 'AsyncFunction',
50
51
  };
51
52
  handler[SCORER_MARKER] = config;
@@ -337,6 +338,8 @@ Respond with a JSON object containing:
337
338
  - "explanation": brief explanation of your evaluation
338
339
 
339
340
  Respond ONLY with the JSON object, no other text.`;
341
+ const CORRECTNESS_JUDGE_CRITERIA = 'Evaluate whether the output correctly answers the input and matches the expected output. Score 1.0 for fully correct answers, 0.5 for partially correct answers, and 0.0 for incorrect or unsupported answers.';
342
+ const FAITHFULNESS_JUDGE_CRITERIA = 'Evaluate whether the output is faithful to the provided context. Penalize claims that are unsupported, contradicted by context, or omit critical context needed for the answer.';
340
343
  /**
341
344
  * LLM-as-judge scorer: ask an LM to score the output against criteria.
342
345
  *
@@ -359,12 +362,13 @@ Respond ONLY with the JSON object, no other text.`;
359
362
  export async function llmJudge(request, ctx) {
360
363
  const cfg = request.config ?? {};
361
364
  const criteria = typeof cfg.criteria === 'string' ? cfg.criteria : '';
362
- if (!criteria) {
365
+ const promptTemplate = typeof cfg.prompt_template === 'string' ? cfg.prompt_template : '';
366
+ if (!criteria && !promptTemplate) {
363
367
  return new ScorerResult({
364
368
  score: 0.0,
365
369
  passed: false,
366
370
  label: 'config_error',
367
- explanation: 'llm_judge requires `config.criteria`',
371
+ explanation: 'llm_judge requires `config.criteria` or `config.prompt_template`',
368
372
  });
369
373
  }
370
374
  const providerName = typeof cfg.provider === 'string' ? cfg.provider : 'openai';
@@ -380,15 +384,63 @@ export async function llmJudge(request, ctx) {
380
384
  const systemPrompt = typeof cfg.system_prompt === 'string' ? cfg.system_prompt : LLM_JUDGE_DEFAULT_SYSTEM_PROMPT;
381
385
  const temperature = typeof cfg.temperature === 'number' ? cfg.temperature : 0.0;
382
386
  const includeInput = cfg.include_input === true;
387
+ const contextData = cfg.context_data ?? cfg.context;
388
+ const choiceScoresResult = parseChoiceScores(cfg.choice_scores);
389
+ if (choiceScoresResult.error) {
390
+ return new ScorerResult({
391
+ score: 0.0,
392
+ passed: false,
393
+ label: 'config_error',
394
+ explanation: choiceScoresResult.error,
395
+ });
396
+ }
397
+ const choiceScores = choiceScoresResult.scores;
383
398
  // Build the user prompt the same way Rust/Python do — keeps judge
384
399
  // verdicts comparable across languages.
385
- let userContent = `## Evaluation Criteria\n${criteria}\n\n`;
386
- if (includeInput && request.input !== undefined && request.input !== null) {
387
- userContent += `## Input\n${formatJudgeValue(request.input)}\n\n`;
400
+ let userContent;
401
+ if (promptTemplate) {
402
+ const rendered = renderPromptTemplate(promptTemplate, {
403
+ input: request.input,
404
+ output: request.output,
405
+ expected: request.expected,
406
+ context: contextData,
407
+ metadata: cfg.metadata,
408
+ tags: cfg.tags,
409
+ });
410
+ if (rendered.error) {
411
+ return new ScorerResult({
412
+ score: 0.0,
413
+ passed: false,
414
+ label: 'config_error',
415
+ explanation: rendered.error,
416
+ });
417
+ }
418
+ userContent = `${rendered.text.trimEnd()}\n\n`;
419
+ if (!templateReferencesSelector(promptTemplate, 'output')) {
420
+ userContent += `## Output to Evaluate\n${formatJudgeValue(request.output)}\n\n`;
421
+ }
422
+ }
423
+ else {
424
+ userContent = `## Evaluation Criteria\n${criteria}\n\n`;
425
+ if (includeInput && request.input !== undefined && request.input !== null) {
426
+ userContent += `## Input\n${formatJudgeValue(request.input)}\n\n`;
427
+ }
428
+ if (contextData !== undefined && contextData !== null) {
429
+ userContent += `## Context\n${formatJudgeValue(contextData)}\n\n`;
430
+ }
431
+ userContent += `## Output to Evaluate\n${formatJudgeValue(request.output)}\n\n`;
432
+ if (request.expected !== undefined && request.expected !== null) {
433
+ userContent += `## Expected Output (Reference)\n${formatJudgeValue(request.expected)}\n\n`;
434
+ }
435
+ }
436
+ if (choiceScores) {
437
+ userContent += `Choose exactly one label from: ${Object.keys(choiceScores).sort().join(', ')}. Return that label in the JSON \`label\` field. The platform will map labels to scores.\n\n`;
438
+ }
439
+ if (cfg.use_cot === true) {
440
+ userContent += 'Reason through the rubric before deciding, but do not include hidden chain-of-thought. Put only a concise rationale in the JSON `explanation` field.\n\n';
388
441
  }
389
- userContent += `## Output to Evaluate\n${formatJudgeValue(request.output)}\n\n`;
390
- if (request.expected !== undefined && request.expected !== null) {
391
- userContent += `## Expected Output (Reference)\n${formatJudgeValue(request.expected)}\n\n`;
442
+ if (cfg.output_schema && typeof cfg.output_schema === 'object' && !Array.isArray(cfg.output_schema)) {
443
+ userContent += `Return a JSON object matching this requested output shape:\n${formatJudgeValue(cfg.output_schema)}\nFor experiment scoring, the JSON should include \`score\` (0.0 to 1.0), \`label\`, and \`explanation\` fields.\n\n`;
392
444
  }
393
445
  userContent += 'Please evaluate the output and respond with a JSON object.';
394
446
  // Tests / advanced usage can inject an LM via the context. Default
@@ -426,7 +478,7 @@ export async function llmJudge(request, ctx) {
426
478
  explanation: `LLM call failed: ${e.message}`,
427
479
  });
428
480
  }
429
- return parseLlmJudgeResponse(response.text ?? '');
481
+ return applyChoiceScores(parseLlmJudgeResponse(response.text ?? ''), choiceScores);
430
482
  }
431
483
  function formatJudgeValue(v) {
432
484
  if (typeof v === 'string')
@@ -516,15 +568,256 @@ function extractJudgeJson(raw) {
516
568
  }
517
569
  return s;
518
570
  }
571
+ function renderPromptTemplate(template, values) {
572
+ try {
573
+ return {
574
+ text: template.replace(/{{\s*([^{}]+?)\s*}}/g, (_match, selector) => formatJudgeValue(templateSelectedValue(values, String(selector).trim()))),
575
+ };
576
+ }
577
+ catch (e) {
578
+ return { error: `prompt_template variable not found: ${e.message}` };
579
+ }
580
+ }
581
+ function templateReferencesSelector(template, root) {
582
+ const pattern = /{{\s*([^{}]+?)\s*}}/g;
583
+ let match;
584
+ while ((match = pattern.exec(template)) !== null) {
585
+ const selector = String(match[1]).trim();
586
+ if (selector === root || selector.startsWith(`${root}.`)) {
587
+ return true;
588
+ }
589
+ }
590
+ return false;
591
+ }
592
+ function templateSelectedValue(values, selector) {
593
+ const [root, ...parts] = selector.split('.');
594
+ if (!(root in values))
595
+ throw new Error(selector);
596
+ let value = values[root];
597
+ for (const part of parts) {
598
+ if (!part)
599
+ throw new Error(selector);
600
+ if (value && typeof value === 'object' && !Array.isArray(value) && part in value) {
601
+ value = value[part];
602
+ continue;
603
+ }
604
+ if (Array.isArray(value) && /^\d+$/.test(part)) {
605
+ const index = Number(part);
606
+ if (index < value.length) {
607
+ value = value[index];
608
+ continue;
609
+ }
610
+ }
611
+ throw new Error(selector);
612
+ }
613
+ return value;
614
+ }
615
+ function parseChoiceScores(raw) {
616
+ if (raw === undefined || raw === null)
617
+ return {};
618
+ if (!raw || typeof raw !== 'object' || Array.isArray(raw)) {
619
+ return { error: 'llm_judge `config.choice_scores` must be an object mapping label to score' };
620
+ }
621
+ const scores = {};
622
+ for (const [label, score] of Object.entries(raw)) {
623
+ if (!label.trim()) {
624
+ return { error: 'llm_judge `config.choice_scores` labels must be non-empty' };
625
+ }
626
+ if (typeof score !== 'number' || score < 0 || score > 1) {
627
+ return { error: `llm_judge choice score for label '${label}' must be between 0 and 1` };
628
+ }
629
+ scores[label] = score;
630
+ }
631
+ if (Object.keys(scores).length === 0) {
632
+ return { error: 'llm_judge `config.choice_scores` must include at least one label' };
633
+ }
634
+ return { scores };
635
+ }
636
+ function applyChoiceScores(result, choiceScores) {
637
+ if (!choiceScores || result.label === 'parse_error' || result.label === 'config_error') {
638
+ return result;
639
+ }
640
+ const labels = Object.keys(choiceScores).sort();
641
+ const selectedLabel = result.label && result.label in choiceScores
642
+ ? result.label
643
+ : result.label
644
+ ? undefined
645
+ : labelForChoiceScore(result.score, choiceScores);
646
+ if (!selectedLabel || !(selectedLabel in choiceScores)) {
647
+ return new ScorerResult({
648
+ score: 0.0,
649
+ passed: false,
650
+ label: 'invalid_label',
651
+ explanation: `Judge returned label ${JSON.stringify(result.label)}; expected one of: ${labels.join(', ')}`,
652
+ metadata: {
653
+ ...(result.metadata ?? {}),
654
+ allowed_labels: labels,
655
+ ...(result.label ? { invalid_label: result.label } : {}),
656
+ },
657
+ });
658
+ }
659
+ const score = Math.max(0, Math.min(1, choiceScores[selectedLabel]));
660
+ return new ScorerResult({
661
+ score,
662
+ passed: score >= 0.7,
663
+ label: selectedLabel,
664
+ explanation: result.explanation,
665
+ metadata: {
666
+ ...(result.metadata ?? {}),
667
+ choice_scores: choiceScores,
668
+ selected_label: selectedLabel,
669
+ },
670
+ });
671
+ }
672
+ function labelForChoiceScore(score, choiceScores) {
673
+ const matches = Object.entries(choiceScores)
674
+ .filter(([, choiceScore]) => Math.abs(choiceScore - score) < 1e-9)
675
+ .map(([label]) => label);
676
+ return matches.length === 1 ? matches[0] : undefined;
677
+ }
678
+ export async function correctness(request, ctx) {
679
+ const cfg = request.config ?? {};
680
+ let output;
681
+ let expected;
682
+ try {
683
+ output = optionalSelectedValue(request, cfg.answer_field, request.output);
684
+ expected = optionalSelectedValue(request, cfg.reference_field, request.expected);
685
+ }
686
+ catch (e) {
687
+ return judgeConfigError(`correctness field selector not found: ${e.message}`);
688
+ }
689
+ const result = await llmJudge({
690
+ ...request,
691
+ output,
692
+ expected,
693
+ config: {
694
+ provider: cfg.provider ?? 'openai',
695
+ model: cfg.model ?? 'gpt-4o-mini',
696
+ criteria: CORRECTNESS_JUDGE_CRITERIA,
697
+ include_input: cfg.include_input ?? true,
698
+ temperature: cfg.temperature ?? 0.0,
699
+ },
700
+ }, ctx);
701
+ return mergeJudgeMetadata(result, {
702
+ judge_preset: 'correctness',
703
+ });
704
+ }
705
+ export async function faithfulness(request, ctx) {
706
+ const cfg = request.config ?? {};
707
+ const fields = contextFields(cfg);
708
+ if (fields.length === 0) {
709
+ return judgeConfigError('faithfulness requires config.context_fields or config.context_field');
710
+ }
711
+ let output;
712
+ const context = {};
713
+ try {
714
+ output = optionalSelectedValue(request, cfg.answer_field, request.output);
715
+ for (const field of fields) {
716
+ context[field] = selectedValue(request, field);
717
+ }
718
+ }
719
+ catch (e) {
720
+ return judgeConfigError(`faithfulness field selector not found: ${e.message}`);
721
+ }
722
+ const result = await llmJudge({
723
+ ...request,
724
+ output,
725
+ config: {
726
+ provider: cfg.provider ?? 'openai',
727
+ model: cfg.model ?? 'gpt-4o-mini',
728
+ criteria: FAITHFULNESS_JUDGE_CRITERIA,
729
+ include_input: cfg.include_input ?? false,
730
+ temperature: cfg.temperature ?? 0.0,
731
+ context_data: context,
732
+ },
733
+ }, ctx);
734
+ return mergeJudgeMetadata(result, {
735
+ judge_preset: 'faithfulness',
736
+ context_fields: fields,
737
+ });
738
+ }
739
+ function judgeConfigError(explanation) {
740
+ return new ScorerResult({
741
+ score: 0.0,
742
+ passed: false,
743
+ label: 'config_error',
744
+ explanation,
745
+ });
746
+ }
747
+ function mergeJudgeMetadata(result, metadata) {
748
+ return new ScorerResult({
749
+ score: result.score,
750
+ passed: result.passed,
751
+ label: result.label,
752
+ explanation: result.explanation,
753
+ metadata: { ...(result.metadata ?? {}), ...metadata },
754
+ });
755
+ }
756
+ function contextFields(config) {
757
+ const fields = [];
758
+ if (typeof config.context_field === 'string' && config.context_field.trim()) {
759
+ fields.push(config.context_field.trim());
760
+ }
761
+ if (Array.isArray(config.context_fields)) {
762
+ for (const field of config.context_fields) {
763
+ if (typeof field === 'string' && field.trim())
764
+ fields.push(field.trim());
765
+ }
766
+ }
767
+ return fields;
768
+ }
769
+ function optionalSelectedValue(request, selector, fallback) {
770
+ if (selector === undefined || selector === null || selector === '')
771
+ return fallback;
772
+ if (typeof selector !== 'string')
773
+ throw new Error(String(selector));
774
+ return selectedValue(request, selector);
775
+ }
776
+ function selectedValue(request, selector) {
777
+ const [root, ...parts] = selector.trim().split('.');
778
+ if (!root || parts.length === 0)
779
+ throw new Error(selector);
780
+ let value;
781
+ switch (root) {
782
+ case 'input':
783
+ value = request.input;
784
+ break;
785
+ case 'output':
786
+ value = request.output;
787
+ break;
788
+ case 'expected':
789
+ value = request.expected;
790
+ break;
791
+ default:
792
+ throw new Error(selector);
793
+ }
794
+ for (const part of parts) {
795
+ if (value && typeof value === 'object' && !Array.isArray(value) && part in value) {
796
+ value = value[part];
797
+ continue;
798
+ }
799
+ if (Array.isArray(value) && /^\d+$/.test(part)) {
800
+ const index = Number(part);
801
+ if (index < value.length) {
802
+ value = value[index];
803
+ continue;
804
+ }
805
+ }
806
+ throw new Error(selector);
807
+ }
808
+ return value;
809
+ }
519
810
  // Register built-in scorers
520
- ScorerRegistry.register({ name: 'exact_match', handler: (_ctx, req) => exactMatch(req), description: 'Exact string match', isAsync: false });
521
- ScorerRegistry.register({ name: 'contains', handler: (_ctx, req) => contains(req), description: 'Substring containment check', isAsync: false });
522
- ScorerRegistry.register({ name: 'json_valid', handler: (_ctx, req) => jsonValid(req), description: 'Valid JSON check', isAsync: false });
523
- ScorerRegistry.register({ name: 'json_schema', handler: (_ctx, req) => jsonSchema(req), description: 'Validate against a JSON Schema', isAsync: false });
524
- ScorerRegistry.register({ name: 'numeric_range', handler: (_ctx, req) => numericRange(req), description: 'Numeric output is in [min, max]', isAsync: false });
525
- ScorerRegistry.register({ name: 'regex_match', handler: (_ctx, req) => regexMatch(req), description: 'Regex pattern match', isAsync: false });
526
- ScorerRegistry.register({ name: 'levenshtein', handler: (_ctx, req) => levenshtein(req), description: 'Levenshtein edit distance', isAsync: false });
527
- ScorerRegistry.register({ name: 'llm_judge', handler: (ctx, req) => llmJudge(req, ctx), description: 'LLM-as-judge: ask an LM to score the output against criteria', isAsync: true });
811
+ ScorerRegistry.register({ name: 'exact_match', handler: (_ctx, req) => exactMatch(req), description: 'Exact string match', scope: 'item', isAsync: false });
812
+ ScorerRegistry.register({ name: 'contains', handler: (_ctx, req) => contains(req), description: 'Substring containment check', scope: 'item', isAsync: false });
813
+ ScorerRegistry.register({ name: 'json_valid', handler: (_ctx, req) => jsonValid(req), description: 'Valid JSON check', scope: 'item', isAsync: false });
814
+ ScorerRegistry.register({ name: 'json_schema', handler: (_ctx, req) => jsonSchema(req), description: 'Validate against a JSON Schema', scope: 'item', isAsync: false });
815
+ ScorerRegistry.register({ name: 'numeric_range', handler: (_ctx, req) => numericRange(req), description: 'Numeric output is in [min, max]', scope: 'item', isAsync: false });
816
+ ScorerRegistry.register({ name: 'regex_match', handler: (_ctx, req) => regexMatch(req), description: 'Regex pattern match', scope: 'item', isAsync: false });
817
+ ScorerRegistry.register({ name: 'levenshtein', handler: (_ctx, req) => levenshtein(req), description: 'Levenshtein edit distance', scope: 'item', isAsync: false });
818
+ ScorerRegistry.register({ name: 'llm_judge', handler: (ctx, req) => llmJudge(req, ctx), description: 'LLM-as-judge: ask an LM to score the output against criteria', scope: 'item', isAsync: true });
819
+ ScorerRegistry.register({ name: 'correctness', handler: (ctx, req) => correctness(req, ctx), description: 'Managed LLM judge preset for answer correctness', scope: 'item', isAsync: true });
820
+ ScorerRegistry.register({ name: 'faithfulness', handler: (ctx, req) => faithfulness(req, ctx), description: 'Managed LLM judge preset for faithfulness to configured context', scope: 'item', isAsync: true });
528
821
  // ─── Runner ──────────────────────────────────────────────────────────
529
822
  /**
530
823
  * Run a scorer by name against a request.
@@ -540,9 +833,137 @@ export async function runScorer(scorerName, request, ctx) {
540
833
  attempt: 0,
541
834
  log: () => { },
542
835
  };
543
- return config.handler(scorerCtx, request);
836
+ const bound = applyScorerFieldBindings(request);
837
+ if (bound.error) {
838
+ return new ScorerResult({
839
+ score: 0.0,
840
+ passed: false,
841
+ label: 'config_error',
842
+ explanation: `${scorerName} field binding error: ${bound.error}`,
843
+ });
844
+ }
845
+ const result = await config.handler(scorerCtx, bound.request);
846
+ return mergeResultMetadata(result, bound.metadata);
544
847
  }
545
848
  // ─── Helpers ─────────────────────────────────────────────────────────
849
+ function applyScorerFieldBindings(request) {
850
+ const cfg = request.config ?? {};
851
+ const metadata = {};
852
+ try {
853
+ const output = bindRequestField(cfg, 'output', 'output_field', 'output_type', request.output, metadata);
854
+ const expected = request.expected !== undefined || hasFieldBinding(cfg, 'expected_field', 'expected_type')
855
+ ? bindRequestField(cfg, 'expected', 'expected_field', 'expected_type', request.expected, metadata)
856
+ : request.expected;
857
+ const input = request.input !== undefined || hasFieldBinding(cfg, 'input_field', 'input_type')
858
+ ? bindRequestField(cfg, 'input', 'input_field', 'input_type', request.input, metadata)
859
+ : request.input;
860
+ return {
861
+ request: { ...request, output, expected, input },
862
+ metadata: Object.keys(metadata).length > 0 ? metadata : undefined,
863
+ };
864
+ }
865
+ catch (e) {
866
+ return { error: e.message };
867
+ }
868
+ }
869
+ function hasFieldBinding(config, fieldKey, typeKey) {
870
+ return config[fieldKey] !== undefined || config[typeKey] !== undefined;
871
+ }
872
+ function bindRequestField(config, root, fieldKey, typeKey, value, metadata) {
873
+ let selected = value;
874
+ const selector = config[fieldKey];
875
+ if (typeof selector === 'string' && selector.trim()) {
876
+ selected = boundFieldValue(value, selector.trim(), root);
877
+ metadata[fieldKey] = selector.trim();
878
+ }
879
+ const expectedType = config[typeKey];
880
+ const bindingType = fieldBindingExpectedType(expectedType);
881
+ if (bindingType) {
882
+ if (!valueTypeMatches(selected, bindingType)) {
883
+ throw new Error(`${fieldKey} selected ${valueTypeName(selected)}; expected ${bindingType}`);
884
+ }
885
+ metadata[typeKey] = bindingType;
886
+ }
887
+ return selected;
888
+ }
889
+ function fieldBindingExpectedType(value) {
890
+ if (typeof value !== 'string')
891
+ return undefined;
892
+ const normalized = value.trim().toLowerCase();
893
+ if (!normalized)
894
+ return undefined;
895
+ if (normalized === 'score' || normalized === 'classification' || normalized === 'json') {
896
+ return undefined;
897
+ }
898
+ return normalized;
899
+ }
900
+ function boundFieldValue(value, selector, root) {
901
+ const prefix = `${root}.`;
902
+ const path = selector === root ? '' : selector.startsWith(prefix) ? selector.slice(prefix.length) : selector;
903
+ if (!path)
904
+ return value;
905
+ let current = value;
906
+ for (const part of path.split('.')) {
907
+ if (!part) {
908
+ throw new Error(`${root}_field ${JSON.stringify(selector)} contains an empty path segment`);
909
+ }
910
+ if (current && typeof current === 'object' && !Array.isArray(current) && part in current) {
911
+ current = current[part];
912
+ continue;
913
+ }
914
+ if (Array.isArray(current) && /^\d+$/.test(part)) {
915
+ const index = Number(part);
916
+ if (index < current.length) {
917
+ current = current[index];
918
+ continue;
919
+ }
920
+ }
921
+ throw new Error(`${root}_field ${JSON.stringify(selector)} was not found`);
922
+ }
923
+ return current;
924
+ }
925
+ function valueTypeMatches(value, expectedType) {
926
+ const normalized = expectedType.toLowerCase();
927
+ if (normalized === 'null')
928
+ return value === null || value === undefined;
929
+ if (normalized === 'bool' || normalized === 'boolean')
930
+ return typeof value === 'boolean';
931
+ if (normalized === 'number')
932
+ return typeof value === 'number';
933
+ if (normalized === 'string')
934
+ return typeof value === 'string';
935
+ if (normalized === 'array')
936
+ return Array.isArray(value);
937
+ if (normalized === 'object')
938
+ return !!value && typeof value === 'object' && !Array.isArray(value);
939
+ return false;
940
+ }
941
+ function valueTypeName(value) {
942
+ if (value === null || value === undefined)
943
+ return 'null';
944
+ if (Array.isArray(value))
945
+ return 'array';
946
+ if (typeof value === 'boolean')
947
+ return 'boolean';
948
+ if (typeof value === 'number')
949
+ return 'number';
950
+ if (typeof value === 'string')
951
+ return 'string';
952
+ if (typeof value === 'object')
953
+ return 'object';
954
+ return typeof value;
955
+ }
956
+ function mergeResultMetadata(result, metadata) {
957
+ if (!metadata)
958
+ return result;
959
+ return new ScorerResult({
960
+ score: result.score,
961
+ passed: result.passed,
962
+ label: result.label,
963
+ explanation: result.explanation,
964
+ metadata: { ...(result.metadata ?? {}), ...metadata },
965
+ });
966
+ }
546
967
  /** Helper methods for ScorerRequest */
547
968
  export function getRequestConfig(request, key, defaultValue) {
548
969
  return request.config?.[key] ?? defaultValue;
@@ -557,4 +978,220 @@ export function getTotalTokens(request) {
557
978
  .filter(e => e.eventType === 'lm.call.completed')
558
979
  .reduce((sum, e) => sum + (e.data.total_tokens || 0), 0);
559
980
  }
981
+ /** Extract typed tool calls from ScorerRequest journal events */
982
+ export function getToolCalls(request) {
983
+ return extractToolCallsFromEvents(request.trace || []);
984
+ }
985
+ /** Extract typed tool calls from journal events */
986
+ export function extractToolCallsFromEvents(events = []) {
987
+ const calls = [];
988
+ const byKey = new Map();
989
+ const add = (call, fallbackKey) => {
990
+ if (!call?.name)
991
+ return;
992
+ const key = call.callId || call.spanId || fallbackKey;
993
+ const existingIndex = byKey.get(key);
994
+ if (existingIndex !== undefined) {
995
+ calls[existingIndex] = mergeToolCalls(calls[existingIndex], call);
996
+ return;
997
+ }
998
+ byKey.set(key, calls.length);
999
+ calls.push(call);
1000
+ };
1001
+ for (const event of events) {
1002
+ const data = isRecord(event.data) ? event.data : {};
1003
+ iterToolCallPayloads(data).forEach((payload, index) => {
1004
+ add(toolCallFromMapping(payload, event, index), `${eventIdOf(event)}:payload:${index}`);
1005
+ });
1006
+ if (eventTypeOf(event).includes('tool')) {
1007
+ add(toolCallFromMapping(data, event, 0), eventIdOf(event));
1008
+ }
1009
+ }
1010
+ return calls;
1011
+ }
1012
+ /** Return tool names in observed call order */
1013
+ export function getToolCallNames(request) {
1014
+ return toolCallNames(getToolCalls(request));
1015
+ }
1016
+ /** Return tool-call names from typed tool calls */
1017
+ export function toolCallNames(calls) {
1018
+ return calls.map(call => call.name).filter(Boolean);
1019
+ }
1020
+ /** Return true when the observed trajectory exactly matches expected */
1021
+ export function toolTrajectoryExact(actual, expected) {
1022
+ return actual.length === expected.length && actual.every((name, index) => name === expected[index]);
1023
+ }
1024
+ /** Return true when expected appears as an ordered subsequence */
1025
+ export function toolTrajectoryInOrder(actual, expected) {
1026
+ if (expected.length === 0)
1027
+ return true;
1028
+ let index = 0;
1029
+ for (const name of actual) {
1030
+ if (name === expected[index]) {
1031
+ index += 1;
1032
+ if (index === expected.length)
1033
+ return true;
1034
+ }
1035
+ }
1036
+ return false;
1037
+ }
1038
+ /** Return true when actual contains expected names with matching counts */
1039
+ export function toolTrajectoryAnyOrder(actual, expected) {
1040
+ const remaining = new Map();
1041
+ for (const name of actual)
1042
+ remaining.set(name, (remaining.get(name) || 0) + 1);
1043
+ for (const name of expected) {
1044
+ const count = remaining.get(name) || 0;
1045
+ if (count <= 0)
1046
+ return false;
1047
+ remaining.set(name, count - 1);
1048
+ }
1049
+ return true;
1050
+ }
1051
+ /** Compare a tool trajectory using exact, in_order, or any_order semantics */
1052
+ export function toolTrajectoryMatches(actual, expected, mode = 'exact') {
1053
+ if (mode === 'exact')
1054
+ return toolTrajectoryExact(actual, expected);
1055
+ if (mode === 'in_order')
1056
+ return toolTrajectoryInOrder(actual, expected);
1057
+ return toolTrajectoryAnyOrder(actual, expected);
1058
+ }
1059
+ function iterToolCallPayloads(data) {
1060
+ const payloads = [];
1061
+ const extendFrom = (value) => {
1062
+ if (Array.isArray(value)) {
1063
+ payloads.push(...value.filter(isRecord));
1064
+ }
1065
+ };
1066
+ extendFrom(data.tool_calls);
1067
+ extendFrom(data.toolCalls);
1068
+ for (const key of ['normalized_session', 'session', 'trace_session', 'journal_session']) {
1069
+ if (isRecord(data[key])) {
1070
+ extendFrom(data[key].tool_calls);
1071
+ extendFrom(data[key].toolCalls);
1072
+ }
1073
+ }
1074
+ for (const key of ['response', 'output', 'message']) {
1075
+ if (isRecord(data[key])) {
1076
+ extendFrom(data[key].tool_calls);
1077
+ extendFrom(data[key].toolCalls);
1078
+ }
1079
+ }
1080
+ if (Array.isArray(data.choices)) {
1081
+ for (const choice of data.choices) {
1082
+ if (isRecord(choice?.message)) {
1083
+ extendFrom(choice.message.tool_calls);
1084
+ extendFrom(choice.message.toolCalls);
1085
+ }
1086
+ }
1087
+ }
1088
+ return payloads;
1089
+ }
1090
+ function toolCallFromMapping(payload, event, index) {
1091
+ const fnPayload = isRecord(payload.function) ? payload.function : {};
1092
+ const eventType = eventTypeOf(event);
1093
+ const name = stringOrUndefined(firstPresent(payload.name, payload.tool_name, fnPayload.name, eventType.includes('tool') ? event.name : undefined));
1094
+ if (!name)
1095
+ return undefined;
1096
+ const callId = stringOrUndefined(firstPresent(payload.call_id, payload.tool_call_id, payload.id, eventType.includes('tool') ? event.correlationId : undefined));
1097
+ const rawArgs = firstPresent(payload.arguments, payload.args, fnPayload.arguments);
1098
+ return {
1099
+ name,
1100
+ arguments: decodeArguments(rawArgs),
1101
+ callId,
1102
+ spanId: stringOrUndefined(payload.span_id) || event.correlationId,
1103
+ timestampNs: numberOrUndefined(payload.timestamp_ns) || event.timestampNs,
1104
+ startedAt: numberOrUndefined(payload.started_at),
1105
+ endedAt: numberOrUndefined(payload.ended_at),
1106
+ status: stringOrUndefined(payload.status) || statusFromEventType(eventType),
1107
+ metadata: toolCallMetadata(payload, event, index),
1108
+ };
1109
+ }
1110
+ function toolCallMetadata(payload, event, index) {
1111
+ const metadata = {
1112
+ source_event_id: eventIdOf(event),
1113
+ source_event_type: eventTypeOf(event),
1114
+ source_index: index,
1115
+ };
1116
+ for (const key of [
1117
+ 'arguments_ref',
1118
+ 'args_ref',
1119
+ 'arguments_hash',
1120
+ 'args_hash',
1121
+ 'result_ref',
1122
+ 'result_hash',
1123
+ 'output_ref',
1124
+ 'output_hash',
1125
+ 'duration_ms',
1126
+ 'error_code',
1127
+ 'error_message_sanitized',
1128
+ ]) {
1129
+ if (payload[key] !== undefined && payload[key] !== null)
1130
+ metadata[key] = payload[key];
1131
+ }
1132
+ if (isRecord(payload.attributes_safe))
1133
+ metadata.attributes_safe = payload.attributes_safe;
1134
+ return metadata;
1135
+ }
1136
+ function mergeToolCalls(existing, incoming) {
1137
+ return {
1138
+ name: incoming.name || existing.name,
1139
+ arguments: incoming.arguments !== undefined ? incoming.arguments : existing.arguments,
1140
+ callId: incoming.callId || existing.callId,
1141
+ spanId: incoming.spanId || existing.spanId,
1142
+ timestampNs: existing.timestampNs || incoming.timestampNs,
1143
+ startedAt: existing.startedAt || incoming.startedAt,
1144
+ endedAt: incoming.endedAt || existing.endedAt,
1145
+ status: incoming.status || existing.status,
1146
+ metadata: { ...existing.metadata, ...incoming.metadata },
1147
+ };
1148
+ }
1149
+ function decodeArguments(value) {
1150
+ if (typeof value !== 'string')
1151
+ return value;
1152
+ try {
1153
+ return JSON.parse(value);
1154
+ }
1155
+ catch {
1156
+ return value;
1157
+ }
1158
+ }
1159
+ function firstPresent(...values) {
1160
+ return values.find(value => value !== undefined && value !== null);
1161
+ }
1162
+ function stringOrUndefined(value) {
1163
+ if (value === undefined || value === null)
1164
+ return undefined;
1165
+ const text = String(value).trim();
1166
+ return text || undefined;
1167
+ }
1168
+ function numberOrUndefined(value) {
1169
+ if (value === undefined || value === null || typeof value === 'boolean')
1170
+ return undefined;
1171
+ if (typeof value === 'number')
1172
+ return Number.isFinite(value) ? Math.trunc(value) : undefined;
1173
+ if (typeof value === 'string' && value.trim()) {
1174
+ const parsed = Number(value);
1175
+ return Number.isFinite(parsed) ? Math.trunc(parsed) : undefined;
1176
+ }
1177
+ return undefined;
1178
+ }
1179
+ function statusFromEventType(eventType) {
1180
+ if (eventType.endsWith('.started'))
1181
+ return 'started';
1182
+ if (eventType.endsWith('.completed'))
1183
+ return 'completed';
1184
+ if (eventType.endsWith('.failed'))
1185
+ return 'failed';
1186
+ return undefined;
1187
+ }
1188
+ function isRecord(value) {
1189
+ return !!value && typeof value === 'object' && !Array.isArray(value);
1190
+ }
1191
+ function eventTypeOf(event) {
1192
+ return event.eventType || event.event_type || '';
1193
+ }
1194
+ function eventIdOf(event) {
1195
+ return event.eventId || event.event_id || '';
1196
+ }
560
1197
  //# sourceMappingURL=scorer.js.map