observability-toolkit 1.8.5 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +126 -5
- package/dist/backends/index.d.ts +163 -0
- package/dist/backends/index.d.ts.map +1 -1
- package/dist/backends/index.js +57 -0
- package/dist/backends/index.js.map +1 -1
- package/dist/backends/index.test.js +55 -1
- package/dist/backends/index.test.js.map +1 -1
- package/dist/backends/local-jsonl.d.ts +30 -0
- package/dist/backends/local-jsonl.d.ts.map +1 -1
- package/dist/backends/local-jsonl.js +912 -550
- package/dist/backends/local-jsonl.js.map +1 -1
- package/dist/backends/signoz-api-rate-limiter.test.js +2 -1
- package/dist/backends/signoz-api-rate-limiter.test.js.map +1 -1
- package/dist/backends/signoz-api.d.ts +16 -2
- package/dist/backends/signoz-api.d.ts.map +1 -1
- package/dist/backends/signoz-api.js +650 -534
- package/dist/backends/signoz-api.js.map +1 -1
- package/dist/backends/signoz-api.test.js +6 -5
- package/dist/backends/signoz-api.test.js.map +1 -1
- package/dist/lib/agent-as-judge.d.ts +388 -0
- package/dist/lib/agent-as-judge.d.ts.map +1 -0
- package/dist/lib/agent-as-judge.js +740 -0
- package/dist/lib/agent-as-judge.js.map +1 -0
- package/dist/lib/agent-as-judge.test.d.ts +5 -0
- package/dist/lib/agent-as-judge.test.d.ts.map +1 -0
- package/dist/lib/agent-as-judge.test.js +816 -0
- package/dist/lib/agent-as-judge.test.js.map +1 -0
- package/dist/lib/cache.d.ts +15 -2
- package/dist/lib/cache.d.ts.map +1 -1
- package/dist/lib/cache.js +16 -2
- package/dist/lib/cache.js.map +1 -1
- package/dist/lib/circuit-breaker.d.ts +18 -0
- package/dist/lib/circuit-breaker.d.ts.map +1 -1
- package/dist/lib/circuit-breaker.js +41 -8
- package/dist/lib/circuit-breaker.js.map +1 -1
- package/dist/lib/confident-export.d.ts +101 -0
- package/dist/lib/confident-export.d.ts.map +1 -0
- package/dist/lib/confident-export.js +393 -0
- package/dist/lib/confident-export.js.map +1 -0
- package/dist/lib/confident-export.test.d.ts +7 -0
- package/dist/lib/confident-export.test.d.ts.map +1 -0
- package/dist/lib/confident-export.test.js +835 -0
- package/dist/lib/confident-export.test.js.map +1 -0
- package/dist/lib/constants.d.ts +75 -0
- package/dist/lib/constants.d.ts.map +1 -1
- package/dist/lib/constants.js +104 -1
- package/dist/lib/constants.js.map +1 -1
- package/dist/lib/datadog-export.d.ts +156 -0
- package/dist/lib/datadog-export.d.ts.map +1 -0
- package/dist/lib/datadog-export.js +464 -0
- package/dist/lib/datadog-export.js.map +1 -0
- package/dist/lib/datadog-export.test.d.ts +14 -0
- package/dist/lib/datadog-export.test.d.ts.map +1 -0
- package/dist/lib/datadog-export.test.js +890 -0
- package/dist/lib/datadog-export.test.js.map +1 -0
- package/dist/lib/evaluation-hooks.d.ts +49 -0
- package/dist/lib/evaluation-hooks.d.ts.map +1 -0
- package/dist/lib/evaluation-hooks.js +488 -0
- package/dist/lib/evaluation-hooks.js.map +1 -0
- package/dist/lib/evaluation-hooks.test.d.ts +8 -0
- package/dist/lib/evaluation-hooks.test.d.ts.map +1 -0
- package/dist/lib/evaluation-hooks.test.js +624 -0
- package/dist/lib/evaluation-hooks.test.js.map +1 -0
- package/dist/lib/export-utils.d.ts +99 -0
- package/dist/lib/export-utils.d.ts.map +1 -0
- package/dist/lib/export-utils.js +238 -0
- package/dist/lib/export-utils.js.map +1 -0
- package/dist/lib/export-utils.test.d.ts +5 -0
- package/dist/lib/export-utils.test.d.ts.map +1 -0
- package/dist/lib/export-utils.test.js +193 -0
- package/dist/lib/export-utils.test.js.map +1 -0
- package/dist/lib/file-utils.d.ts +17 -2
- package/dist/lib/file-utils.d.ts.map +1 -1
- package/dist/lib/file-utils.js +24 -5
- package/dist/lib/file-utils.js.map +1 -1
- package/dist/lib/file-utils.test.js +30 -0
- package/dist/lib/file-utils.test.js.map +1 -1
- package/dist/lib/histogram.d.ts +119 -0
- package/dist/lib/histogram.d.ts.map +1 -0
- package/dist/lib/histogram.js +202 -0
- package/dist/lib/histogram.js.map +1 -0
- package/dist/lib/histogram.test.d.ts +5 -0
- package/dist/lib/histogram.test.d.ts.map +1 -0
- package/dist/lib/histogram.test.js +381 -0
- package/dist/lib/histogram.test.js.map +1 -0
- package/dist/lib/instrumentation.d.ts +153 -0
- package/dist/lib/instrumentation.d.ts.map +1 -0
- package/dist/lib/instrumentation.integration.test.d.ts +2 -0
- package/dist/lib/instrumentation.integration.test.d.ts.map +1 -0
- package/dist/lib/instrumentation.integration.test.js +589 -0
- package/dist/lib/instrumentation.integration.test.js.map +1 -0
- package/dist/lib/instrumentation.js +520 -0
- package/dist/lib/instrumentation.js.map +1 -0
- package/dist/lib/instrumentation.test.d.ts +2 -0
- package/dist/lib/instrumentation.test.d.ts.map +1 -0
- package/dist/lib/instrumentation.test.js +821 -0
- package/dist/lib/instrumentation.test.js.map +1 -0
- package/dist/lib/langfuse-export.d.ts +125 -0
- package/dist/lib/langfuse-export.d.ts.map +1 -0
- package/dist/lib/langfuse-export.js +367 -0
- package/dist/lib/langfuse-export.js.map +1 -0
- package/dist/lib/langfuse-export.test.d.ts +7 -0
- package/dist/lib/langfuse-export.test.d.ts.map +1 -0
- package/dist/lib/langfuse-export.test.js +1007 -0
- package/dist/lib/langfuse-export.test.js.map +1 -0
- package/dist/lib/llm-as-judge.d.ts +657 -0
- package/dist/lib/llm-as-judge.d.ts.map +1 -0
- package/dist/lib/llm-as-judge.js +1397 -0
- package/dist/lib/llm-as-judge.js.map +1 -0
- package/dist/lib/llm-as-judge.test.d.ts +2 -0
- package/dist/lib/llm-as-judge.test.d.ts.map +1 -0
- package/dist/lib/llm-as-judge.test.js +2409 -0
- package/dist/lib/llm-as-judge.test.js.map +1 -0
- package/dist/lib/logger.d.ts +1 -1
- package/dist/lib/logger.d.ts.map +1 -1
- package/dist/lib/logger.js.map +1 -1
- package/dist/lib/metrics.d.ts +62 -0
- package/dist/lib/metrics.d.ts.map +1 -0
- package/dist/lib/metrics.js +166 -0
- package/dist/lib/metrics.js.map +1 -0
- package/dist/lib/metrics.test.d.ts +5 -0
- package/dist/lib/metrics.test.d.ts.map +1 -0
- package/dist/lib/metrics.test.js +189 -0
- package/dist/lib/metrics.test.js.map +1 -0
- package/dist/lib/parse-stats.d.ts +119 -0
- package/dist/lib/parse-stats.d.ts.map +1 -0
- package/dist/lib/parse-stats.js +206 -0
- package/dist/lib/parse-stats.js.map +1 -0
- package/dist/lib/parse-stats.test.d.ts +5 -0
- package/dist/lib/parse-stats.test.d.ts.map +1 -0
- package/dist/lib/parse-stats.test.js +283 -0
- package/dist/lib/parse-stats.test.js.map +1 -0
- package/dist/lib/phoenix-export.d.ts +109 -0
- package/dist/lib/phoenix-export.d.ts.map +1 -0
- package/dist/lib/phoenix-export.js +429 -0
- package/dist/lib/phoenix-export.js.map +1 -0
- package/dist/lib/phoenix-export.test.d.ts +11 -0
- package/dist/lib/phoenix-export.test.d.ts.map +1 -0
- package/dist/lib/phoenix-export.test.js +725 -0
- package/dist/lib/phoenix-export.test.js.map +1 -0
- package/dist/lib/server-utils.d.ts +6 -1
- package/dist/lib/server-utils.d.ts.map +1 -1
- package/dist/lib/server-utils.js +9 -1
- package/dist/lib/server-utils.js.map +1 -1
- package/dist/lib/shared-schemas.d.ts +6 -0
- package/dist/lib/shared-schemas.d.ts.map +1 -1
- package/dist/lib/shared-schemas.js +11 -4
- package/dist/lib/shared-schemas.js.map +1 -1
- package/dist/lib/verification-events.d.ts +100 -0
- package/dist/lib/verification-events.d.ts.map +1 -0
- package/dist/lib/verification-events.js +162 -0
- package/dist/lib/verification-events.js.map +1 -0
- package/dist/lib/verification-events.test.d.ts +5 -0
- package/dist/lib/verification-events.test.d.ts.map +1 -0
- package/dist/lib/verification-events.test.js +193 -0
- package/dist/lib/verification-events.test.js.map +1 -0
- package/dist/server.d.ts +5 -0
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +77 -21
- package/dist/server.js.map +1 -1
- package/dist/tools/context-stats.d.ts.map +1 -1
- package/dist/tools/context-stats.js +6 -8
- package/dist/tools/context-stats.js.map +1 -1
- package/dist/tools/export-confident.d.ts +145 -0
- package/dist/tools/export-confident.d.ts.map +1 -0
- package/dist/tools/export-confident.js +134 -0
- package/dist/tools/export-confident.js.map +1 -0
- package/dist/tools/export-confident.test.d.ts +7 -0
- package/dist/tools/export-confident.test.d.ts.map +1 -0
- package/dist/tools/export-confident.test.js +332 -0
- package/dist/tools/export-confident.test.js.map +1 -0
- package/dist/tools/export-datadog.d.ts +160 -0
- package/dist/tools/export-datadog.d.ts.map +1 -0
- package/dist/tools/export-datadog.js +160 -0
- package/dist/tools/export-datadog.js.map +1 -0
- package/dist/tools/export-datadog.test.d.ts +8 -0
- package/dist/tools/export-datadog.test.d.ts.map +1 -0
- package/dist/tools/export-datadog.test.js +419 -0
- package/dist/tools/export-datadog.test.js.map +1 -0
- package/dist/tools/export-langfuse.d.ts +137 -0
- package/dist/tools/export-langfuse.d.ts.map +1 -0
- package/dist/tools/export-langfuse.js +131 -0
- package/dist/tools/export-langfuse.js.map +1 -0
- package/dist/tools/export-langfuse.test.d.ts +7 -0
- package/dist/tools/export-langfuse.test.d.ts.map +1 -0
- package/dist/tools/export-langfuse.test.js +303 -0
- package/dist/tools/export-langfuse.test.js.map +1 -0
- package/dist/tools/export-phoenix.d.ts +145 -0
- package/dist/tools/export-phoenix.d.ts.map +1 -0
- package/dist/tools/export-phoenix.js +135 -0
- package/dist/tools/export-phoenix.js.map +1 -0
- package/dist/tools/export-phoenix.test.d.ts +7 -0
- package/dist/tools/export-phoenix.test.d.ts.map +1 -0
- package/dist/tools/export-phoenix.test.js +316 -0
- package/dist/tools/export-phoenix.test.js.map +1 -0
- package/dist/tools/health-check.d.ts +26 -0
- package/dist/tools/health-check.d.ts.map +1 -1
- package/dist/tools/health-check.js +36 -7
- package/dist/tools/health-check.js.map +1 -1
- package/dist/tools/index.d.ts +6 -0
- package/dist/tools/index.d.ts.map +1 -1
- package/dist/tools/index.js +6 -0
- package/dist/tools/index.js.map +1 -1
- package/dist/tools/inject-evaluations.d.ts +1315 -0
- package/dist/tools/inject-evaluations.d.ts.map +1 -0
- package/dist/tools/inject-evaluations.js +121 -0
- package/dist/tools/inject-evaluations.js.map +1 -0
- package/dist/tools/inject-evaluations.test.d.ts +5 -0
- package/dist/tools/inject-evaluations.test.d.ts.map +1 -0
- package/dist/tools/inject-evaluations.test.js +359 -0
- package/dist/tools/inject-evaluations.test.js.map +1 -0
- package/dist/tools/query-evaluations.d.ts +25 -4
- package/dist/tools/query-evaluations.d.ts.map +1 -1
- package/dist/tools/query-evaluations.js +10 -0
- package/dist/tools/query-evaluations.js.map +1 -1
- package/dist/tools/query-llm-events.js +2 -2
- package/dist/tools/query-llm-events.js.map +1 -1
- package/dist/tools/query-logs.d.ts +8 -8
- package/dist/tools/query-logs.js +3 -3
- package/dist/tools/query-logs.js.map +1 -1
- package/dist/tools/query-metrics.d.ts +4 -4
- package/dist/tools/query-metrics.js +2 -2
- package/dist/tools/query-metrics.js.map +1 -1
- package/dist/tools/query-traces.d.ts +8 -8
- package/dist/tools/query-verifications.d.ts +111 -0
- package/dist/tools/query-verifications.d.ts.map +1 -0
- package/dist/tools/query-verifications.js +101 -0
- package/dist/tools/query-verifications.js.map +1 -0
- package/dist/tools/query-verifications.test.d.ts +5 -0
- package/dist/tools/query-verifications.test.d.ts.map +1 -0
- package/dist/tools/query-verifications.test.js +156 -0
- package/dist/tools/query-verifications.test.js.map +1 -0
- package/dist/types/evaluation-hooks.d.ts +176 -0
- package/dist/types/evaluation-hooks.d.ts.map +1 -0
- package/dist/types/evaluation-hooks.js +49 -0
- package/dist/types/evaluation-hooks.js.map +1 -0
- package/package.json +10 -2
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"inject-evaluations.d.ts","sourceRoot":"","sources":["../../src/tools/inject-evaluations.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AA2DxB;;GAEG;AACH,eAAO,MAAM,uBAAuB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAcnC,CAAC;AAEF,MAAM,MAAM,sBAAsB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,uBAAuB,CAAC,CAAC;AAE7E;;GAEG;AACH,MAAM,WAAW,uBAAuB;IACtC,OAAO,EAAE,OAAO,CAAC;IACjB,OAAO,EAAE,MAAM,CAAC;IAChB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,MAAM,CAAC,EAAE,KAAK,CAAC;QACb,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,KAAK,EAAE,MAAM,CAAC;QACd,OAAO,EAAE,MAAM,CAAC;QAChB,UAAU,CAAC,EAAE,MAAM,CAAC;KACrB,CAAC,CAAC;CACJ;AAED;;GAEG;AACH,wBAAsB,iBAAiB,CACrC,KAAK,EAAE,sBAAsB,GAC5B,OAAO,CAAC,uBAAuB,CAAC,CAiDlC;AAED,eAAO,MAAM,qBAAqB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAKjC,CAAC"}
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Inject external evaluation results into JSONL storage
|
|
3
|
+
*
|
|
4
|
+
* Enables external evaluators (webhooks, APIs, human reviewers) to submit
|
|
5
|
+
* quality scores that integrate with the evaluation query system.
|
|
6
|
+
*/
|
|
7
|
+
import { z } from 'zod';
|
|
8
|
+
import { TELEMETRY_DIR } from '../lib/constants.js';
|
|
9
|
+
import { processEvaluation, processBatch, verifyHmacSignature, MAX_BATCH_SIZE, } from '../lib/evaluation-hooks.js';
|
|
10
|
+
/**
 * Schema for a single per-step score in an agent trajectory evaluation.
 *
 * `step` may be either a string or a number; `score` is constrained to the
 * closed range 0..1. `evidence` and `explanation` are optional.
 */
const stepScoreSchema = z.object({
    step: z
        .union([z.string(), z.number()])
        .describe('Step identifier (index or name)'),
    score: z
        .number()
        .min(0)
        .max(1)
        .describe('Score for this step (0-1)'),
    evidence: z
        .record(z.unknown())
        .optional()
        .describe('Supporting evidence'),
    explanation: z
        .string()
        .optional()
        .describe('Explanation for step score'),
});
|
|
19
|
+
/**
 * Schema for the verification of one tool call made by an agent.
 *
 * Required fields: `toolName`, `toolCorrect`, `argsCorrect` and a `score`
 * in the closed range 0..1. Everything else is optional.
 */
const toolVerificationSchema = z.object({
    toolName: z
        .string()
        .describe('Name of the tool'),
    toolCallId: z
        .string()
        .optional()
        .describe('Tool call ID for correlation'),
    toolCorrect: z
        .boolean()
        .describe('Whether correct tool was selected'),
    argsCorrect: z
        .boolean()
        .describe('Whether arguments were correct'),
    resultCorrect: z
        .boolean()
        .optional()
        .describe('Whether result was correct'),
    score: z
        .number()
        .min(0)
        .max(1)
        .describe('Weighted correctness score'),
    expectedTool: z
        .string()
        .optional()
        .describe('Expected tool if different'),
    evidence: z
        .record(z.unknown())
        .optional()
        .describe('Supporting evidence'),
});
|
|
32
|
+
/**
 * Schema for one evaluation payload.
 *
 * `evaluationName`, `evaluator` and `evaluatorType` are mandatory. In
 * addition, the trailing refinement rejects payloads that carry neither a
 * `scoreValue` nor a `scoreLabel`.
 */
const evaluationPayloadSchema = z
    .object({
        // What was measured and the resulting score.
        evaluationName: z
            .string()
            .min(1)
            .max(256)
            .describe('Evaluation metric name (e.g., "relevance", "faithfulness")'),
        scoreValue: z
            .number()
            .optional()
            .describe('Numeric score (0-1 normalized recommended)'),
        scoreLabel: z
            .string()
            .optional()
            .describe('Human-readable label (e.g., "pass", "fail")'),
        scoreUnit: z
            .string()
            .optional()
            .describe('Score unit/scale (e.g., "ratio_0_1", "percentage")'),
        explanation: z
            .string()
            .max(10240)
            .optional()
            .describe('Explanation/rationale'),
        // Who produced the evaluation.
        evaluator: z
            .string()
            .min(1)
            .max(256)
            .describe('Evaluator identity (e.g., "claude-3-sonnet", "human-reviewer-001")'),
        evaluatorType: z
            .enum(['llm', 'human', 'rule', 'classifier'])
            .describe('Type of evaluator'),
        // Correlation identifiers (all optional, length-capped).
        responseId: z
            .string()
            .max(128)
            .optional()
            .describe('Response ID for correlation'),
        traceId: z
            .string()
            .max(64)
            .optional()
            .describe('Trace ID for correlation'),
        sessionId: z
            .string()
            .max(128)
            .optional()
            .describe('Session ID'),
        agentId: z
            .string()
            .max(128)
            .optional()
            .describe('Subject agent ID'),
        agentName: z
            .string()
            .max(256)
            .optional()
            .describe('Subject agent name'),
        // Agent-specific breakdowns (array sizes are capped).
        stepScores: z
            .array(stepScoreSchema)
            .max(1000)
            .optional()
            .describe('Per-step evaluation breakdown'),
        toolVerifications: z
            .array(toolVerificationSchema)
            .max(500)
            .optional()
            .describe('Tool call correctness'),
        trajectoryLength: z
            .number()
            .int()
            .nonnegative()
            .optional()
            .describe('Agent trajectory length'),
        metadata: z
            .record(z.unknown())
            .optional()
            .describe('Custom metadata'),
    })
    .refine(
        // Valid unless BOTH score fields are absent.
        (payload) => !(payload.scoreValue === undefined && payload.scoreLabel === undefined),
        { message: 'At least one of scoreValue or scoreLabel is required' },
    );
|
|
53
|
+
/**
 * Input schema for the inject-evaluations tool.
 *
 * Accepts a single `evaluation`, a batch of `evaluations` (at most
 * MAX_BATCH_SIZE entries), or both, plus an optional `signature`/`secret`
 * pair. The refinement rejects input that has no `evaluation` and no
 * non-empty `evaluations` array.
 */
export const injectEvaluationsSchema = z
    .object({
        // Simple mode: exactly one evaluation.
        evaluation: evaluationPayloadSchema
            .optional()
            .describe('Single evaluation to inject'),
        // Batch mode: several evaluations in one call.
        evaluations: z
            .array(evaluationPayloadSchema)
            .max(MAX_BATCH_SIZE)
            .optional()
            .describe(`Batch of evaluations (max ${MAX_BATCH_SIZE})`),
        // Optional HMAC signature verification inputs.
        signature: z
            .string()
            .optional()
            .describe('HMAC-SHA256 signature (format: sha256=<hex>)'),
        secret: z
            .string()
            .optional()
            .describe('HMAC secret for signature verification'),
    })
    .refine(
        (data) => {
            if (data.evaluation !== undefined) {
                return true;
            }
            return data.evaluations !== undefined && data.evaluations.length > 0;
        },
        { message: 'Either evaluation or evaluations array is required' },
    );
|
|
66
|
+
/**
 * Inject evaluation(s) into JSONL storage.
 *
 * Flow:
 *  1. If a `signature`/`secret` pair is supplied, the HMAC signature is
 *     verified against the JSON-serialized payload (`evaluation` if present,
 *     otherwise `evaluations`). Supplying only one half of the pair is
 *     rejected instead of silently skipping verification.
 *  2. A single `evaluation` is handed to `processEvaluation`; otherwise a
 *     non-empty `evaluations` array is handed to `processBatch`. When both
 *     are present only `evaluation` is processed.
 *
 * @param {object} input - Tool input shaped like `injectEvaluationsSchema`.
 * @returns {Promise<{success: boolean, message: string, processedCount?: number, errors?: unknown}>}
 *   Failure results produced here (bad signature, nothing to process) carry
 *   only `success` and `message`; otherwise the fields are copied from the
 *   processing result.
 */
export async function injectEvaluations(input) {
    const hasSignature = Boolean(input.signature);
    const hasSecret = Boolean(input.secret);
    // Fail closed: a lone signature cannot be checked, and a lone secret means
    // the payload arrived unsigned. Either way the caller asked for
    // verification that cannot succeed, so do not store anything.
    if (hasSignature !== hasSecret) {
        return {
            success: false,
            message: 'Signature verification failed: both signature and secret are required for verification',
        };
    }
    if (hasSignature) {
        // No raw request body is available in the MCP context, so the
        // signature is verified against the re-serialized payload.
        const payload = JSON.stringify(input.evaluation || input.evaluations);
        const verification = verifyHmacSignature(payload, input.signature, input.secret);
        if (!verification.valid) {
            return {
                success: false,
                message: `Signature verification failed: ${verification.error}`,
            };
        }
    }
    const hasSingle = Boolean(input.evaluation);
    const hasBatch = Boolean(input.evaluations && input.evaluations.length > 0);
    if (!hasSingle && !hasBatch) {
        return {
            success: false,
            message: 'No evaluations provided',
        };
    }
    const options = {
        telemetryDir: TELEMETRY_DIR,
        maxBatchSize: MAX_BATCH_SIZE,
    };
    // Single mode takes precedence over batch mode.
    const result = hasSingle
        ? await processEvaluation(input.evaluation, options)
        : await processBatch({ evaluations: input.evaluations }, options);
    return {
        success: result.success,
        message: result.message,
        processedCount: result.processedCount,
        errors: result.errors,
    };
}
|
|
115
|
+
/**
 * Tool descriptor for evaluation injection: bundles the tool name, its
 * description, the input schema and the handler function.
 */
export const injectEvaluationsTool = {
    // Name under which the tool is exposed.
    name: 'obs_inject_evaluations',
    description: 'Inject evaluations from external evaluators. Supports single or batch mode with optional HMAC verification.',
    // Input schema paired with the handler that consumes it.
    inputSchema: injectEvaluationsSchema,
    handler: injectEvaluations,
};
|
|
121
|
+
//# sourceMappingURL=inject-evaluations.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"inject-evaluations.js","sourceRoot":"","sources":["../../src/tools/inject-evaluations.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EACL,iBAAiB,EACjB,YAAY,EACZ,mBAAmB,EACnB,cAAc,GACf,MAAM,4BAA4B,CAAC;AAGpC;;GAEG;AACH,MAAM,eAAe,GAAG,CAAC,CAAC,MAAM,CAAC;IAC/B,IAAI,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,iCAAiC,CAAC;IACnF,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,2BAA2B,CAAC;IACrE,QAAQ,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,qBAAqB,CAAC;IAC1E,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,4BAA4B,CAAC;CAC1E,CAAC,CAAC;AAEH;;GAEG;AACH,MAAM,sBAAsB,GAAG,CAAC,CAAC,MAAM,CAAC;IACtC,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,kBAAkB,CAAC;IACjD,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,8BAA8B,CAAC;IAC1E,WAAW,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC,QAAQ,CAAC,mCAAmC,CAAC;IACtE,WAAW,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC,QAAQ,CAAC,gCAAgC,CAAC;IACnE,aAAa,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,4BAA4B,CAAC;IAC5E,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,4BAA4B,CAAC;IACtE,YAAY,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,4BAA4B,CAAC;IAC1E,QAAQ,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,qBAAqB,CAAC;CAC3E,CAAC,CAAC;AAEH;;GAEG;AACH,MAAM,uBAAuB,GAAG,CAAC,CAAC,MAAM,CAAC;IACvC,cAAc,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,4DAA4D,CAAC;IACjH,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,4CAA4C,CAAC;IACxF,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,6CAA6C,CAAC;IACzF,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,oDAAoD,CAAC;IAC/F,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,uBAAu
B,CAAC;IAC/E,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,oEAAoE,CAAC;IACpH,aAAa,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,OAAO,EAAE,MAAM,EAAE,YAAY,CAAC,CAAC,CAAC,QAAQ,CAAC,mBAAmB,CAAC;IAC3F,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,6BAA6B,CAAC;IAClF,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,0BAA0B,CAAC;IAC3E,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,YAAY,CAAC;IAChE,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,kBAAkB,CAAC;IACpE,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,oBAAoB,CAAC;IACxE,UAAU,EAAE,CAAC,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,+BAA+B,CAAC;IACnG,iBAAiB,EAAE,CAAC,CAAC,KAAK,CAAC,sBAAsB,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,uBAAuB,CAAC;IACxG,gBAAgB,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,WAAW,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,yBAAyB,CAAC;IAC/F,QAAQ,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,iBAAiB,CAAC;CACvE,CAAC,CAAC,MAAM,CACP,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,UAAU,KAAK,SAAS,IAAI,IAAI,CAAC,UAAU,KAAK,SAAS,EACtE,EAAE,OAAO,EAAE,sDAAsD,EAAE,CACpE,CAAC;AAEF;;GAEG;AACH,MAAM,CAAC,MAAM,uBAAuB,GAAG,CAAC,CAAC,MAAM,CAAC;IAC9C,kCAAkC;IAClC,UAAU,EAAE,uBAAuB,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,6BAA6B,CAAC;IAEtF,aAAa;IACb,WAAW,EAAE,CAAC,CAAC,KAAK,CAAC,uBAAuB,CAAC,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC,QAAQ,EAAE;SACzE,QAAQ,CAAC,6BAA6B,cAAc,GAAG,CAAC;IAE3D,kDAAkD;IAClD,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,8CAA8C,CAAC;IACzF,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,wCAAwC,CAAC;CACjF,CAAC,CAAC,MAAM,CACP,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,UAAU,KAAK,SAAS,IAAI,CAAC,IAAI,CAAC,WAAW,KAAK,SAAS,IAAI,IAAI,CAAC,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,EACxG,EAAE,OAAO,EAAE,oDAAoD,EAAE,CAClE,CAAC;AAmBF;;GAEG;AACH,MAAM,CAA
C,KAAK,UAAU,iBAAiB,CACrC,KAA6B;IAE7B,MAAM,OAAO,GAAwB;QACnC,YAAY,EAAE,aAAa;QAC3B,YAAY,EAAE,cAAc;KAC7B,CAAC;IAEF,+BAA+B;IAC/B,IAAI,KAAK,CAAC,SAAS,IAAI,KAAK,CAAC,MAAM,EAAE,CAAC;QACpC,sDAAsD;QACtD,yDAAyD;QACzD,MAAM,OAAO,GAAG,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,UAAU,IAAI,KAAK,CAAC,WAAW,CAAC,CAAC;QACtE,MAAM,MAAM,GAAG,mBAAmB,CAAC,OAAO,EAAE,KAAK,CAAC,SAAS,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;QAC3E,IAAI,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC;YAClB,OAAO;gBACL,OAAO,EAAE,KAAK;gBACd,OAAO,EAAE,kCAAkC,MAAM,CAAC,KAAK,EAAE;aAC1D,CAAC;QACJ,CAAC;IACH,CAAC;IAED,4BAA4B;IAC5B,IAAI,KAAK,CAAC,UAAU,EAAE,CAAC;QACrB,MAAM,MAAM,GAAG,MAAM,iBAAiB,CAAC,KAAK,CAAC,UAAsC,EAAE,OAAO,CAAC,CAAC;QAC9F,OAAO;YACL,OAAO,EAAE,MAAM,CAAC,OAAO;YACvB,OAAO,EAAE,MAAM,CAAC,OAAO;YACvB,cAAc,EAAE,MAAM,CAAC,cAAc;YACrC,MAAM,EAAE,MAAM,CAAC,MAAM;SACtB,CAAC;IACJ,CAAC;IAED,gBAAgB;IAChB,IAAI,KAAK,CAAC,WAAW,IAAI,KAAK,CAAC,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACtD,MAAM,KAAK,GAAwB;YACjC,WAAW,EAAE,KAAK,CAAC,WAAyC;SAC7D,CAAC;QACF,MAAM,MAAM,GAAG,MAAM,YAAY,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;QAClD,OAAO;YACL,OAAO,EAAE,MAAM,CAAC,OAAO;YACvB,OAAO,EAAE,MAAM,CAAC,OAAO;YACvB,cAAc,EAAE,MAAM,CAAC,cAAc;YACrC,MAAM,EAAE,MAAM,CAAC,MAAM;SACtB,CAAC;IACJ,CAAC;IAED,OAAO;QACL,OAAO,EAAE,KAAK;QACd,OAAO,EAAE,yBAAyB;KACnC,CAAC;AACJ,CAAC;AAED,MAAM,CAAC,MAAM,qBAAqB,GAAG;IACnC,IAAI,EAAE,wBAAwB;IAC9B,WAAW,EAAE,6GAA6G;IAC1H,WAAW,EAAE,uBAAuB;IACpC,OAAO,EAAE,iBAAiB;CAC3B,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"inject-evaluations.test.d.ts","sourceRoot":"","sources":["../../src/tools/inject-evaluations.test.ts"],"names":[],"mappings":"AAAA;;GAEG"}
|
|
@@ -0,0 +1,359 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for inject-evaluations tool
|
|
3
|
+
*/
|
|
4
|
+
import { describe, it, beforeEach } from 'node:test';
|
|
5
|
+
import assert from 'node:assert';
|
|
6
|
+
import { createHmac } from 'crypto';
|
|
7
|
+
import { mkdirSync, rmSync } from 'fs';
|
|
8
|
+
import { join } from 'path';
|
|
9
|
+
import { tmpdir } from 'os';
|
|
10
|
+
import { injectEvaluations, injectEvaluationsSchema, injectEvaluationsTool, } from './inject-evaluations.js';
|
|
11
|
+
import { validateToolDefinition } from '../test-helpers/tool-validators.js';
|
|
12
|
+
import { resetHookStats } from '../lib/evaluation-hooks.js';
|
|
13
|
+
/**
 * Test helper: creates a fresh directory under the OS temp dir and returns
 * its path. The name combines the current timestamp with a random base-36
 * suffix so separate calls get separate directories.
 */
function createTempDir() {
    const uniqueSuffix = `${Date.now()}-${Math.random().toString(36).slice(2)}`;
    const tempPath = join(tmpdir(), `inject-eval-test-${uniqueSuffix}`);
    mkdirSync(tempPath, { recursive: true });
    return tempPath;
}
|
|
19
|
+
/**
 * Test helper: recursively deletes `dir`. Cleanup is best-effort, so any
 * error raised while removing is deliberately discarded.
 */
function removeTempDir(dir) {
    const removalOptions = { recursive: true, force: true };
    try {
        rmSync(dir, removalOptions);
    }
    catch (_cleanupError) {
        // Ignore cleanup errors
    }
}
|
|
27
|
+
describe('inject-evaluations', () => {
|
|
28
|
+
beforeEach(() => {
|
|
29
|
+
resetHookStats();
|
|
30
|
+
});
|
|
31
|
+
describe('tool definition', () => {
|
|
32
|
+
validateToolDefinition(injectEvaluationsTool, 'obs_inject_evaluations', injectEvaluationsSchema, injectEvaluations, ['inject', 'evaluations']);
|
|
33
|
+
});
|
|
34
|
+
describe('schema validation', () => {
|
|
35
|
+
it('should require evaluation or evaluations', () => {
|
|
36
|
+
assert.throws(() => {
|
|
37
|
+
injectEvaluationsSchema.parse({});
|
|
38
|
+
});
|
|
39
|
+
});
|
|
40
|
+
it('should accept single evaluation', () => {
|
|
41
|
+
const result = injectEvaluationsSchema.parse({
|
|
42
|
+
evaluation: {
|
|
43
|
+
evaluationName: 'test',
|
|
44
|
+
scoreValue: 0.8,
|
|
45
|
+
evaluator: 'test-evaluator',
|
|
46
|
+
evaluatorType: 'llm',
|
|
47
|
+
},
|
|
48
|
+
});
|
|
49
|
+
assert.ok(result.evaluation);
|
|
50
|
+
});
|
|
51
|
+
it('should accept evaluations array', () => {
|
|
52
|
+
const result = injectEvaluationsSchema.parse({
|
|
53
|
+
evaluations: [
|
|
54
|
+
{
|
|
55
|
+
evaluationName: 'test',
|
|
56
|
+
scoreValue: 0.8,
|
|
57
|
+
evaluator: 'test-evaluator',
|
|
58
|
+
evaluatorType: 'llm',
|
|
59
|
+
},
|
|
60
|
+
],
|
|
61
|
+
});
|
|
62
|
+
assert.ok(result.evaluations);
|
|
63
|
+
assert.strictEqual(result.evaluations.length, 1);
|
|
64
|
+
});
|
|
65
|
+
it('should require scoreValue or scoreLabel', () => {
|
|
66
|
+
assert.throws(() => {
|
|
67
|
+
injectEvaluationsSchema.parse({
|
|
68
|
+
evaluation: {
|
|
69
|
+
evaluationName: 'test',
|
|
70
|
+
evaluator: 'test-evaluator',
|
|
71
|
+
evaluatorType: 'llm',
|
|
72
|
+
},
|
|
73
|
+
});
|
|
74
|
+
});
|
|
75
|
+
});
|
|
76
|
+
it('should accept scoreLabel without scoreValue', () => {
|
|
77
|
+
const result = injectEvaluationsSchema.parse({
|
|
78
|
+
evaluation: {
|
|
79
|
+
evaluationName: 'test',
|
|
80
|
+
scoreLabel: 'pass',
|
|
81
|
+
evaluator: 'test-evaluator',
|
|
82
|
+
evaluatorType: 'llm',
|
|
83
|
+
},
|
|
84
|
+
});
|
|
85
|
+
assert.ok(result.evaluation);
|
|
86
|
+
assert.strictEqual(result.evaluation.scoreLabel, 'pass');
|
|
87
|
+
});
|
|
88
|
+
it('should enforce evaluatorType enum', () => {
|
|
89
|
+
assert.throws(() => {
|
|
90
|
+
injectEvaluationsSchema.parse({
|
|
91
|
+
evaluation: {
|
|
92
|
+
evaluationName: 'test',
|
|
93
|
+
scoreValue: 0.8,
|
|
94
|
+
evaluator: 'test-evaluator',
|
|
95
|
+
evaluatorType: 'invalid',
|
|
96
|
+
},
|
|
97
|
+
});
|
|
98
|
+
});
|
|
99
|
+
});
|
|
100
|
+
it('should accept all valid evaluator types', () => {
|
|
101
|
+
const types = ['llm', 'human', 'rule', 'classifier'];
|
|
102
|
+
for (const evaluatorType of types) {
|
|
103
|
+
const result = injectEvaluationsSchema.parse({
|
|
104
|
+
evaluation: {
|
|
105
|
+
evaluationName: 'test',
|
|
106
|
+
scoreValue: 0.8,
|
|
107
|
+
evaluator: 'test-evaluator',
|
|
108
|
+
evaluatorType,
|
|
109
|
+
},
|
|
110
|
+
});
|
|
111
|
+
assert.strictEqual(result.evaluation?.evaluatorType, evaluatorType);
|
|
112
|
+
}
|
|
113
|
+
});
|
|
114
|
+
it('should enforce max batch size', () => {
|
|
115
|
+
assert.throws(() => {
|
|
116
|
+
injectEvaluationsSchema.parse({
|
|
117
|
+
evaluations: Array(101).fill({
|
|
118
|
+
evaluationName: 'test',
|
|
119
|
+
scoreValue: 0.8,
|
|
120
|
+
evaluator: 'test-evaluator',
|
|
121
|
+
evaluatorType: 'llm',
|
|
122
|
+
}),
|
|
123
|
+
});
|
|
124
|
+
});
|
|
125
|
+
});
|
|
126
|
+
it('should accept optional signature fields', () => {
|
|
127
|
+
const result = injectEvaluationsSchema.parse({
|
|
128
|
+
evaluation: {
|
|
129
|
+
evaluationName: 'test',
|
|
130
|
+
scoreValue: 0.8,
|
|
131
|
+
evaluator: 'test-evaluator',
|
|
132
|
+
evaluatorType: 'llm',
|
|
133
|
+
},
|
|
134
|
+
signature: 'sha256=abc123',
|
|
135
|
+
secret: 'my-secret',
|
|
136
|
+
});
|
|
137
|
+
assert.strictEqual(result.signature, 'sha256=abc123');
|
|
138
|
+
assert.strictEqual(result.secret, 'my-secret');
|
|
139
|
+
});
|
|
140
|
+
it('should accept stepScores', () => {
|
|
141
|
+
const result = injectEvaluationsSchema.parse({
|
|
142
|
+
evaluation: {
|
|
143
|
+
evaluationName: 'test',
|
|
144
|
+
scoreValue: 0.8,
|
|
145
|
+
evaluator: 'test-evaluator',
|
|
146
|
+
evaluatorType: 'llm',
|
|
147
|
+
stepScores: [
|
|
148
|
+
{ step: 0, score: 0.9 },
|
|
149
|
+
{ step: 'reasoning', score: 0.8, explanation: 'Good logic' },
|
|
150
|
+
],
|
|
151
|
+
},
|
|
152
|
+
});
|
|
153
|
+
assert.strictEqual(result.evaluation?.stepScores?.length, 2);
|
|
154
|
+
});
|
|
155
|
+
it('should accept toolVerifications', () => {
|
|
156
|
+
const result = injectEvaluationsSchema.parse({
|
|
157
|
+
evaluation: {
|
|
158
|
+
evaluationName: 'test',
|
|
159
|
+
scoreValue: 0.8,
|
|
160
|
+
evaluator: 'test-evaluator',
|
|
161
|
+
evaluatorType: 'llm',
|
|
162
|
+
toolVerifications: [
|
|
163
|
+
{
|
|
164
|
+
toolName: 'search',
|
|
165
|
+
toolCorrect: true,
|
|
166
|
+
argsCorrect: true,
|
|
167
|
+
score: 1.0,
|
|
168
|
+
},
|
|
169
|
+
],
|
|
170
|
+
},
|
|
171
|
+
});
|
|
172
|
+
assert.strictEqual(result.evaluation?.toolVerifications?.length, 1);
|
|
173
|
+
});
|
|
174
|
+
it('should enforce stepScores max', () => {
|
|
175
|
+
assert.throws(() => {
|
|
176
|
+
injectEvaluationsSchema.parse({
|
|
177
|
+
evaluation: {
|
|
178
|
+
evaluationName: 'test',
|
|
179
|
+
scoreValue: 0.8,
|
|
180
|
+
evaluator: 'test-evaluator',
|
|
181
|
+
evaluatorType: 'llm',
|
|
182
|
+
stepScores: Array(1001).fill({ step: 0, score: 0.5 }),
|
|
183
|
+
},
|
|
184
|
+
});
|
|
185
|
+
});
|
|
186
|
+
});
|
|
187
|
+
it('should enforce toolVerifications max', () => {
|
|
188
|
+
assert.throws(() => {
|
|
189
|
+
injectEvaluationsSchema.parse({
|
|
190
|
+
evaluation: {
|
|
191
|
+
evaluationName: 'test',
|
|
192
|
+
scoreValue: 0.8,
|
|
193
|
+
evaluator: 'test-evaluator',
|
|
194
|
+
evaluatorType: 'llm',
|
|
195
|
+
toolVerifications: Array(501).fill({
|
|
196
|
+
toolName: 'test',
|
|
197
|
+
toolCorrect: true,
|
|
198
|
+
argsCorrect: true,
|
|
199
|
+
score: 1.0,
|
|
200
|
+
}),
|
|
201
|
+
},
|
|
202
|
+
});
|
|
203
|
+
});
|
|
204
|
+
});
|
|
205
|
+
});
|
|
206
|
+
describe('signature verification', () => {
|
|
207
|
+
it('should verify correct signature format', () => {
|
|
208
|
+
const payload = JSON.stringify({
|
|
209
|
+
evaluationName: 'test',
|
|
210
|
+
scoreValue: 0.8,
|
|
211
|
+
evaluator: 'test-evaluator',
|
|
212
|
+
evaluatorType: 'llm',
|
|
213
|
+
});
|
|
214
|
+
const secret = 'test-secret';
|
|
215
|
+
const signature = 'sha256=' + createHmac('sha256', secret).update(payload).digest('hex');
|
|
216
|
+
assert.ok(signature.startsWith('sha256='));
|
|
217
|
+
assert.strictEqual(signature.length, 7 + 64);
|
|
218
|
+
});
|
|
219
|
+
it('should reject invalid signature in handler', async () => {
|
|
220
|
+
// Use an invalid signature (wrong length or format)
|
|
221
|
+
const result = await injectEvaluations({
|
|
222
|
+
evaluation: {
|
|
223
|
+
evaluationName: 'test',
|
|
224
|
+
scoreValue: 0.8,
|
|
225
|
+
evaluator: 'test-evaluator',
|
|
226
|
+
evaluatorType: 'llm',
|
|
227
|
+
},
|
|
228
|
+
signature: 'sha256=' + 'a'.repeat(64), // Valid format but wrong signature
|
|
229
|
+
secret: 'test-secret',
|
|
230
|
+
});
|
|
231
|
+
assert.strictEqual(result.success, false);
|
|
232
|
+
assert.ok(result.message.includes('Signature verification failed'));
|
|
233
|
+
});
|
|
234
|
+
});
|
|
235
|
+
describe('Agent-as-Judge fields', () => {
|
|
236
|
+
it('should accept agentId and agentName', () => {
|
|
237
|
+
const result = injectEvaluationsSchema.parse({
|
|
238
|
+
evaluation: {
|
|
239
|
+
evaluationName: 'task_completion',
|
|
240
|
+
scoreValue: 0.9,
|
|
241
|
+
evaluator: 'agent-judge',
|
|
242
|
+
evaluatorType: 'llm',
|
|
243
|
+
agentId: 'agent-123',
|
|
244
|
+
agentName: 'TestAgent',
|
|
245
|
+
},
|
|
246
|
+
});
|
|
247
|
+
assert.strictEqual(result.evaluation?.agentId, 'agent-123');
|
|
248
|
+
assert.strictEqual(result.evaluation?.agentName, 'TestAgent');
|
|
249
|
+
});
|
|
250
|
+
it('should accept trajectoryLength', () => {
|
|
251
|
+
const result = injectEvaluationsSchema.parse({
|
|
252
|
+
evaluation: {
|
|
253
|
+
evaluationName: 'efficiency',
|
|
254
|
+
scoreValue: 0.7,
|
|
255
|
+
evaluator: 'agent-judge',
|
|
256
|
+
evaluatorType: 'llm',
|
|
257
|
+
trajectoryLength: 5,
|
|
258
|
+
},
|
|
259
|
+
});
|
|
260
|
+
assert.strictEqual(result.evaluation?.trajectoryLength, 5);
|
|
261
|
+
});
|
|
262
|
+
it('should reject negative trajectoryLength', () => {
|
|
263
|
+
assert.throws(() => {
|
|
264
|
+
injectEvaluationsSchema.parse({
|
|
265
|
+
evaluation: {
|
|
266
|
+
evaluationName: 'efficiency',
|
|
267
|
+
scoreValue: 0.7,
|
|
268
|
+
evaluator: 'agent-judge',
|
|
269
|
+
evaluatorType: 'llm',
|
|
270
|
+
trajectoryLength: -1,
|
|
271
|
+
},
|
|
272
|
+
});
|
|
273
|
+
});
|
|
274
|
+
});
|
|
275
|
+
});
|
|
276
|
+
describe('metadata handling', () => {
|
|
277
|
+
it('should accept custom metadata', () => {
|
|
278
|
+
const result = injectEvaluationsSchema.parse({
|
|
279
|
+
evaluation: {
|
|
280
|
+
evaluationName: 'test',
|
|
281
|
+
scoreValue: 0.8,
|
|
282
|
+
evaluator: 'test-evaluator',
|
|
283
|
+
evaluatorType: 'llm',
|
|
284
|
+
metadata: {
|
|
285
|
+
source: 'external-api',
|
|
286
|
+
version: '1.0',
|
|
287
|
+
custom_field: 42,
|
|
288
|
+
},
|
|
289
|
+
},
|
|
290
|
+
});
|
|
291
|
+
assert.ok(result.evaluation?.metadata);
|
|
292
|
+
assert.strictEqual(result.evaluation?.metadata?.source, 'external-api');
|
|
293
|
+
});
|
|
294
|
+
});
|
|
295
|
+
describe('score normalization', () => {
|
|
296
|
+
it('should accept scoreUnit field', () => {
|
|
297
|
+
const result = injectEvaluationsSchema.parse({
|
|
298
|
+
evaluation: {
|
|
299
|
+
evaluationName: 'test',
|
|
300
|
+
scoreValue: 85,
|
|
301
|
+
scoreUnit: 'percentage',
|
|
302
|
+
evaluator: 'test-evaluator',
|
|
303
|
+
evaluatorType: 'llm',
|
|
304
|
+
},
|
|
305
|
+
});
|
|
306
|
+
assert.strictEqual(result.evaluation?.scoreValue, 85);
|
|
307
|
+
assert.strictEqual(result.evaluation?.scoreUnit, 'percentage');
|
|
308
|
+
});
|
|
309
|
+
});
|
|
310
|
+
describe('string field length limits', () => {
|
|
311
|
+
it('should enforce evaluationName max length', () => {
|
|
312
|
+
const result = injectEvaluationsSchema.safeParse({
|
|
313
|
+
evaluation: {
|
|
314
|
+
evaluationName: 'a'.repeat(257),
|
|
315
|
+
scoreValue: 0.8,
|
|
316
|
+
evaluator: 'test',
|
|
317
|
+
evaluatorType: 'llm',
|
|
318
|
+
},
|
|
319
|
+
});
|
|
320
|
+
assert.strictEqual(result.success, false);
|
|
321
|
+
});
|
|
322
|
+
it('should enforce evaluator max length', () => {
|
|
323
|
+
const result = injectEvaluationsSchema.safeParse({
|
|
324
|
+
evaluation: {
|
|
325
|
+
evaluationName: 'test',
|
|
326
|
+
scoreValue: 0.8,
|
|
327
|
+
evaluator: 'a'.repeat(257),
|
|
328
|
+
evaluatorType: 'llm',
|
|
329
|
+
},
|
|
330
|
+
});
|
|
331
|
+
assert.strictEqual(result.success, false);
|
|
332
|
+
});
|
|
333
|
+
it('should enforce agentId max length', () => {
|
|
334
|
+
const result = injectEvaluationsSchema.safeParse({
|
|
335
|
+
evaluation: {
|
|
336
|
+
evaluationName: 'test',
|
|
337
|
+
scoreValue: 0.8,
|
|
338
|
+
evaluator: 'test',
|
|
339
|
+
evaluatorType: 'llm',
|
|
340
|
+
agentId: 'a'.repeat(129),
|
|
341
|
+
},
|
|
342
|
+
});
|
|
343
|
+
assert.strictEqual(result.success, false);
|
|
344
|
+
});
|
|
345
|
+
it('should enforce explanation max length', () => {
|
|
346
|
+
const result = injectEvaluationsSchema.safeParse({
|
|
347
|
+
evaluation: {
|
|
348
|
+
evaluationName: 'test',
|
|
349
|
+
scoreValue: 0.8,
|
|
350
|
+
evaluator: 'test',
|
|
351
|
+
evaluatorType: 'llm',
|
|
352
|
+
explanation: 'a'.repeat(10241),
|
|
353
|
+
},
|
|
354
|
+
});
|
|
355
|
+
assert.strictEqual(result.success, false);
|
|
356
|
+
});
|
|
357
|
+
});
|
|
358
|
+
});
|
|
359
|
+
//# sourceMappingURL=inject-evaluations.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"inject-evaluations.test.js","sourceRoot":"","sources":["../../src/tools/inject-evaluations.test.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,UAAU,EAAa,MAAM,WAAW,CAAC;AAChE,OAAO,MAAM,MAAM,aAAa,CAAC;AACjC,OAAO,EAAE,UAAU,EAAE,MAAM,QAAQ,CAAC;AACpC,OAAO,EAAE,SAAS,EAAE,MAAM,EAAgB,MAAM,IAAI,CAAC;AACrD,OAAO,EAAE,IAAI,EAAE,MAAM,MAAM,CAAC;AAC5B,OAAO,EAAE,MAAM,EAAE,MAAM,IAAI,CAAC;AAE5B,OAAO,EACL,iBAAiB,EACjB,uBAAuB,EACvB,qBAAqB,GACtB,MAAM,yBAAyB,CAAC;AACjC,OAAO,EAAE,sBAAsB,EAAE,MAAM,oCAAoC,CAAC;AAC5E,OAAO,EAAE,cAAc,EAAgB,MAAM,4BAA4B,CAAC;AAE1E,4CAA4C;AAC5C,SAAS,aAAa;IACpB,MAAM,GAAG,GAAG,IAAI,CAAC,MAAM,EAAE,EAAE,oBAAoB,IAAI,CAAC,GAAG,EAAE,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IACpG,SAAS,CAAC,GAAG,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IACpC,OAAO,GAAG,CAAC;AACb,CAAC;AAED,SAAS,aAAa,CAAC,GAAW;IAChC,IAAI,CAAC;QACH,MAAM,CAAC,GAAG,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;IAChD,CAAC;IAAC,MAAM,CAAC;QACP,wBAAwB;IAC1B,CAAC;AACH,CAAC;AAED,QAAQ,CAAC,oBAAoB,EAAE,GAAG,EAAE;IAClC,UAAU,CAAC,GAAG,EAAE;QACd,cAAc,EAAE,CAAC;IACnB,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;QAC/B,sBAAsB,CACpB,qBAAqB,EACrB,wBAAwB,EACxB,uBAAuB,EACvB,iBAAiB,EACjB,CAAC,QAAQ,EAAE,aAAa,CAAC,CAC1B,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,mBAAmB,EAAE,GAAG,EAAE;QACjC,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;YAClD,MAAM,CAAC,MAAM,CAAC,GAAG,EAAE;gBACjB,uBAAuB,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;YACpC,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,iCAAiC,EAAE,GAAG,EAAE;YACzC,MAAM,MAAM,GAAG,uBAAuB,CAAC,KAAK,CAAC;gBAC3C,UAAU,EAAE;oBACV,cAAc,EAAE,MAAM;oBACtB,UAAU,EAAE,GAAG;oBACf,SAAS,EAAE,gBAAgB;oBAC3B,aAAa,EAAE,KAAK;iBACrB;aACF,CAAC,CAAC;YACH,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC;QAC/B,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,iCAAiC,EAAE,GAAG,EAAE;YACzC,MAAM,MAAM,GAAG,uBAAuB,CAAC,KAAK,CAAC;gBAC3C,WAAW,EAAE;oBACX;wBACE,cAAc,EAAE,MAAM;wBACtB,UAAU,EAAE,GAAG;wBACf,SAAS,EAAE,gBAAgB;wBAC3B,aAAa,EAAE,KAAK;qBACrB;iBACF;aACF,CA
AC,CAAC;YACH,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC;YAC9B,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,WAAW,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;QACnD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,yCAAyC,EAAE,GAAG,EAAE;YACjD,MAAM,CAAC,MAAM,CAAC,GAAG,EAAE;gBACjB,uBAAuB,CAAC,KAAK,CAAC;oBAC5B,UAAU,EAAE;wBACV,cAAc,EAAE,MAAM;wBACtB,SAAS,EAAE,gBAAgB;wBAC3B,aAAa,EAAE,KAAK;qBACrB;iBACF,CAAC,CAAC;YACL,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,6CAA6C,EAAE,GAAG,EAAE;YACrD,MAAM,MAAM,GAAG,uBAAuB,CAAC,KAAK,CAAC;gBAC3C,UAAU,EAAE;oBACV,cAAc,EAAE,MAAM;oBACtB,UAAU,EAAE,MAAM;oBAClB,SAAS,EAAE,gBAAgB;oBAC3B,aAAa,EAAE,KAAK;iBACrB;aACF,CAAC,CAAC;YACH,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC;YAC7B,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,UAAU,CAAC,UAAU,EAAE,MAAM,CAAC,CAAC;QAC3D,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,mCAAmC,EAAE,GAAG,EAAE;YAC3C,MAAM,CAAC,MAAM,CAAC,GAAG,EAAE;gBACjB,uBAAuB,CAAC,KAAK,CAAC;oBAC5B,UAAU,EAAE;wBACV,cAAc,EAAE,MAAM;wBACtB,UAAU,EAAE,GAAG;wBACf,SAAS,EAAE,gBAAgB;wBAC3B,aAAa,EAAE,SAAS;qBACzB;iBACF,CAAC,CAAC;YACL,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,yCAAyC,EAAE,GAAG,EAAE;YACjD,MAAM,KAAK,GAAG,CAAC,KAAK,EAAE,OAAO,EAAE,MAAM,EAAE,YAAY,CAAC,CAAC;YACrD,KAAK,MAAM,aAAa,IAAI,KAAK,EAAE,CAAC;gBAClC,MAAM,MAAM,GAAG,uBAAuB,CAAC,KAAK,CAAC;oBAC3C,UAAU,EAAE;wBACV,cAAc,EAAE,MAAM;wBACtB,UAAU,EAAE,GAAG;wBACf,SAAS,EAAE,gBAAgB;wBAC3B,aAAa;qBACd;iBACF,CAAC,CAAC;gBACH,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,UAAU,EAAE,aAAa,EAAE,aAAa,CAAC,CAAC;YACtE,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,+BAA+B,EAAE,GAAG,EAAE;YACvC,MAAM,CAAC,MAAM,CAAC,GAAG,EAAE;gBACjB,uBAAuB,CAAC,KAAK,CAAC;oBAC5B,WAAW,EAAE,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC;wBAC3B,cAAc,EAAE,MAAM;wBACtB,UAAU,EAAE,GAAG;wBACf,SAAS,EAAE,gBAAgB;wBAC3B,aAAa,EAAE,KAAK;qBACrB,CAAC;iBACH,CAAC,CAAC;YACL,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,yCAAyC,EAAE,GAAG,EAAE;YACjD,MAAM,MAAM,GAAG,uBAAuB,CAAC,KAAK,CAAC;gBAC3C,UAAU,EAAE;oBACV,cAAc,EAAE,MAAM;oBACtB,UAAU,EAAE,GAAG;oBACf,SAAS,EAAE,gBAAgB;oBAC3B,aAAa,EAAE,KAAK;iBACrB;gBACD,SAAS,EAAE,eAAe;gBAC1B,MAAM,EAAE,WAAW;aACpB,CAAC,CAAC;YACH,MA
AM,CAAC,WAAW,CAAC,MAAM,CAAC,SAAS,EAAE,eAAe,CAAC,CAAC;YACtD,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,MAAM,EAAE,WAAW,CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0BAA0B,EAAE,GAAG,EAAE;YAClC,MAAM,MAAM,GAAG,uBAAuB,CAAC,KAAK,CAAC;gBAC3C,UAAU,EAAE;oBACV,cAAc,EAAE,MAAM;oBACtB,UAAU,EAAE,GAAG;oBACf,SAAS,EAAE,gBAAgB;oBAC3B,aAAa,EAAE,KAAK;oBACpB,UAAU,EAAE;wBACV,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE;wBACvB,EAAE,IAAI,EAAE,WAAW,EAAE,KAAK,EAAE,GAAG,EAAE,WAAW,EAAE,YAAY,EAAE;qBAC7D;iBACF;aACF,CAAC,CAAC;YACH,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,UAAU,EAAE,UAAU,EAAE,MAAM,EAAE,CAAC,CAAC,CAAC;QAC/D,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,iCAAiC,EAAE,GAAG,EAAE;YACzC,MAAM,MAAM,GAAG,uBAAuB,CAAC,KAAK,CAAC;gBAC3C,UAAU,EAAE;oBACV,cAAc,EAAE,MAAM;oBACtB,UAAU,EAAE,GAAG;oBACf,SAAS,EAAE,gBAAgB;oBAC3B,aAAa,EAAE,KAAK;oBACpB,iBAAiB,EAAE;wBACjB;4BACE,QAAQ,EAAE,QAAQ;4BAClB,WAAW,EAAE,IAAI;4BACjB,WAAW,EAAE,IAAI;4BACjB,KAAK,EAAE,GAAG;yBACX;qBACF;iBACF;aACF,CAAC,CAAC;YACH,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,UAAU,EAAE,iBAAiB,EAAE,MAAM,EAAE,CAAC,CAAC,CAAC;QACtE,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,+BAA+B,EAAE,GAAG,EAAE;YACvC,MAAM,CAAC,MAAM,CAAC,GAAG,EAAE;gBACjB,uBAAuB,CAAC,KAAK,CAAC;oBAC5B,UAAU,EAAE;wBACV,cAAc,EAAE,MAAM;wBACtB,UAAU,EAAE,GAAG;wBACf,SAAS,EAAE,gBAAgB;wBAC3B,aAAa,EAAE,KAAK;wBACpB,UAAU,EAAE,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE,CAAC;qBACtD;iBACF,CAAC,CAAC;YACL,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,sCAAsC,EAAE,GAAG,EAAE;YAC9C,MAAM,CAAC,MAAM,CAAC,GAAG,EAAE;gBACjB,uBAAuB,CAAC,KAAK,CAAC;oBAC5B,UAAU,EAAE;wBACV,cAAc,EAAE,MAAM;wBACtB,UAAU,EAAE,GAAG;wBACf,SAAS,EAAE,gBAAgB;wBAC3B,aAAa,EAAE,KAAK;wBACpB,iBAAiB,EAAE,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC;4BACjC,QAAQ,EAAE,MAAM;4BAChB,WAAW,EAAE,IAAI;4BACjB,WAAW,EAAE,IAAI;4BACjB,KAAK,EAAE,GAAG;yBACX,CAAC;qBACH;iBACF,CAAC,CAAC;YACL,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,wBAAwB,EAAE,GAAG,EAAE;QACtC,EAAE,CAAC,wCAAwC,EAAE,GAAG,EAAE;YAChD,MAAM,OAAO,GAAG,IAAI,CAAC,SAAS,CAAC;gBAC7B,cAAc,EAAE,MAAM;gBACtB,UAAU,EAAE,GAAG;gBA
Cf,SAAS,EAAE,gBAAgB;gBAC3B,aAAa,EAAE,KAAK;aACrB,CAAC,CAAC;YACH,MAAM,MAAM,GAAG,aAAa,CAAC;YAC7B,MAAM,SAAS,GAAG,SAAS,GAAG,UAAU,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YAEzF,MAAM,CAAC,EAAE,CAAC,SAAS,CAAC,UAAU,CAAC,SAAS,CAAC,CAAC,CAAC;YAC3C,MAAM,CAAC,WAAW,CAAC,SAAS,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,CAAC;QAC/C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4CAA4C,EAAE,KAAK,IAAI,EAAE;YAC1D,oDAAoD;YACpD,MAAM,MAAM,GAAG,MAAM,iBAAiB,CAAC;gBACrC,UAAU,EAAE;oBACV,cAAc,EAAE,MAAM;oBACtB,UAAU,EAAE,GAAG;oBACf,SAAS,EAAE,gBAAgB;oBAC3B,aAAa,EAAE,KAAK;iBACrB;gBACD,SAAS,EAAE,SAAS,GAAG,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,mCAAmC;gBAC1E,MAAM,EAAE,aAAa;aACtB,CAAC,CAAC;YAEH,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;YAC1C,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,OAAO,CAAC,QAAQ,CAAC,+BAA+B,CAAC,CAAC,CAAC;QACtE,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,uBAAuB,EAAE,GAAG,EAAE;QACrC,EAAE,CAAC,qCAAqC,EAAE,GAAG,EAAE;YAC7C,MAAM,MAAM,GAAG,uBAAuB,CAAC,KAAK,CAAC;gBAC3C,UAAU,EAAE;oBACV,cAAc,EAAE,iBAAiB;oBACjC,UAAU,EAAE,GAAG;oBACf,SAAS,EAAE,aAAa;oBACxB,aAAa,EAAE,KAAK;oBACpB,OAAO,EAAE,WAAW;oBACpB,SAAS,EAAE,WAAW;iBACvB;aACF,CAAC,CAAC;YACH,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,UAAU,EAAE,OAAO,EAAE,WAAW,CAAC,CAAC;YAC5D,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,UAAU,EAAE,SAAS,EAAE,WAAW,CAAC,CAAC;QAChE,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,gCAAgC,EAAE,GAAG,EAAE;YACxC,MAAM,MAAM,GAAG,uBAAuB,CAAC,KAAK,CAAC;gBAC3C,UAAU,EAAE;oBACV,cAAc,EAAE,YAAY;oBAC5B,UAAU,EAAE,GAAG;oBACf,SAAS,EAAE,aAAa;oBACxB,aAAa,EAAE,KAAK;oBACpB,gBAAgB,EAAE,CAAC;iBACpB;aACF,CAAC,CAAC;YACH,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,UAAU,EAAE,gBAAgB,EAAE,CAAC,CAAC,CAAC;QAC7D,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,yCAAyC,EAAE,GAAG,EAAE;YACjD,MAAM,CAAC,MAAM,CAAC,GAAG,EAAE;gBACjB,uBAAuB,CAAC,KAAK,CAAC;oBAC5B,UAAU,EAAE;wBACV,cAAc,EAAE,YAAY;wBAC5B,UAAU,EAAE,GAAG;wBACf,SAAS,EAAE,aAAa;wBACxB,aAAa,EAAE,KAAK;wBACpB,gBAAgB,EAAE,CAAC,CAAC;qBACrB;iBACF,CAAC,CAAC;YACL,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,mBAAmB,EAAE,GAAG,EAAE;QACjC,EAAE,CAAC,+BAA+
B,EAAE,GAAG,EAAE;YACvC,MAAM,MAAM,GAAG,uBAAuB,CAAC,KAAK,CAAC;gBAC3C,UAAU,EAAE;oBACV,cAAc,EAAE,MAAM;oBACtB,UAAU,EAAE,GAAG;oBACf,SAAS,EAAE,gBAAgB;oBAC3B,aAAa,EAAE,KAAK;oBACpB,QAAQ,EAAE;wBACR,MAAM,EAAE,cAAc;wBACtB,OAAO,EAAE,KAAK;wBACd,YAAY,EAAE,EAAE;qBACjB;iBACF;aACF,CAAC,CAAC;YACH,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAC;YACvC,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,UAAU,EAAE,QAAQ,EAAE,MAAM,EAAE,cAAc,CAAC,CAAC;QAC1E,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,qBAAqB,EAAE,GAAG,EAAE;QACnC,EAAE,CAAC,+BAA+B,EAAE,GAAG,EAAE;YACvC,MAAM,MAAM,GAAG,uBAAuB,CAAC,KAAK,CAAC;gBAC3C,UAAU,EAAE;oBACV,cAAc,EAAE,MAAM;oBACtB,UAAU,EAAE,EAAE;oBACd,SAAS,EAAE,YAAY;oBACvB,SAAS,EAAE,gBAAgB;oBAC3B,aAAa,EAAE,KAAK;iBACrB;aACF,CAAC,CAAC;YACH,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,UAAU,EAAE,UAAU,EAAE,EAAE,CAAC,CAAC;YACtD,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,UAAU,EAAE,SAAS,EAAE,YAAY,CAAC,CAAC;QACjE,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,4BAA4B,EAAE,GAAG,EAAE;QAC1C,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;YAClD,MAAM,MAAM,GAAG,uBAAuB,CAAC,SAAS,CAAC;gBAC/C,UAAU,EAAE;oBACV,cAAc,EAAE,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC;oBAC/B,UAAU,EAAE,GAAG;oBACf,SAAS,EAAE,MAAM;oBACjB,aAAa,EAAE,KAAK;iBACrB;aACF,CAAC,CAAC;YACH,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;QAC5C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,qCAAqC,EAAE,GAAG,EAAE;YAC7C,MAAM,MAAM,GAAG,uBAAuB,CAAC,SAAS,CAAC;gBAC/C,UAAU,EAAE;oBACV,cAAc,EAAE,MAAM;oBACtB,UAAU,EAAE,GAAG;oBACf,SAAS,EAAE,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC;oBAC1B,aAAa,EAAE,KAAK;iBACrB;aACF,CAAC,CAAC;YACH,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;QAC5C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,mCAAmC,EAAE,GAAG,EAAE;YAC3C,MAAM,MAAM,GAAG,uBAAuB,CAAC,SAAS,CAAC;gBAC/C,UAAU,EAAE;oBACV,cAAc,EAAE,MAAM;oBACtB,UAAU,EAAE,GAAG;oBACf,SAAS,EAAE,MAAM;oBACjB,aAAa,EAAE,KAAK;oBACpB,OAAO,EAAE,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC;iBACzB;aACF,CAAC,CAAC;YACH,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;QAC5C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,uCAAuC,EAAE,GAAG,EAAE;YAC/C,MAAM,MAAM,GAAG,uBAAuB,CAAC,SAAS,CAAC;gBAC/C,UAAU,EAAE;oBACV
,cAAc,EAAE,MAAM;oBACtB,UAAU,EAAE,GAAG;oBACf,SAAS,EAAE,MAAM;oBACjB,aAAa,EAAE,KAAK;oBACpB,WAAW,EAAE,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC;iBAC/B;aACF,CAAC,CAAC;YACH,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;QAC5C,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -21,9 +21,14 @@ export declare const queryEvaluationsSchema: z.ZodObject<{
|
|
|
21
21
|
evaluator: z.ZodOptional<z.ZodString>;
|
|
22
22
|
evaluatorType: z.ZodOptional<z.ZodEnum<["llm", "human", "rule", "classifier"]>>;
|
|
23
23
|
responseId: z.ZodOptional<z.ZodString>;
|
|
24
|
+
agentId: z.ZodOptional<z.ZodString>;
|
|
25
|
+
agentName: z.ZodOptional<z.ZodString>;
|
|
24
26
|
}, "strip", z.ZodTypeAny, {
|
|
25
27
|
limit: number;
|
|
26
28
|
traceId?: string | undefined;
|
|
29
|
+
sessionId?: string | undefined;
|
|
30
|
+
agentId?: string | undefined;
|
|
31
|
+
agentName?: string | undefined;
|
|
27
32
|
aggregation?: "avg" | "min" | "max" | "count" | "p50" | "p95" | "p99" | undefined;
|
|
28
33
|
groupBy?: ("evaluationName" | "scoreLabel" | "evaluator")[] | undefined;
|
|
29
34
|
evaluationName?: string | undefined;
|
|
@@ -33,12 +38,14 @@ export declare const queryEvaluationsSchema: z.ZodObject<{
|
|
|
33
38
|
scoreMax?: number | undefined;
|
|
34
39
|
evaluatorType?: "llm" | "human" | "rule" | "classifier" | undefined;
|
|
35
40
|
responseId?: string | undefined;
|
|
36
|
-
sessionId?: string | undefined;
|
|
37
41
|
endDate?: string | undefined;
|
|
38
42
|
startDate?: string | undefined;
|
|
39
43
|
}, {
|
|
40
44
|
limit?: number | undefined;
|
|
41
45
|
traceId?: string | undefined;
|
|
46
|
+
sessionId?: string | undefined;
|
|
47
|
+
agentId?: string | undefined;
|
|
48
|
+
agentName?: string | undefined;
|
|
42
49
|
aggregation?: "avg" | "min" | "max" | "count" | "p50" | "p95" | "p99" | undefined;
|
|
43
50
|
groupBy?: ("evaluationName" | "scoreLabel" | "evaluator")[] | undefined;
|
|
44
51
|
evaluationName?: string | undefined;
|
|
@@ -48,7 +55,6 @@ export declare const queryEvaluationsSchema: z.ZodObject<{
|
|
|
48
55
|
scoreMax?: number | undefined;
|
|
49
56
|
evaluatorType?: "llm" | "human" | "rule" | "classifier" | undefined;
|
|
50
57
|
responseId?: string | undefined;
|
|
51
|
-
sessionId?: string | undefined;
|
|
52
58
|
endDate?: string | undefined;
|
|
53
59
|
startDate?: string | undefined;
|
|
54
60
|
}>;
|
|
@@ -80,6 +86,12 @@ export interface EvaluationResponse {
|
|
|
80
86
|
traceId?: string;
|
|
81
87
|
/** Session ID for session-scoped queries */
|
|
82
88
|
sessionId?: string;
|
|
89
|
+
/** Subject agent ID being evaluated (gen_ai.agent.id) */
|
|
90
|
+
agentId?: string;
|
|
91
|
+
/** Subject agent name being evaluated (gen_ai.agent.name) */
|
|
92
|
+
agentName?: string;
|
|
93
|
+
/** Number of steps in agent trajectory */
|
|
94
|
+
trajectoryLength?: number;
|
|
83
95
|
}
|
|
84
96
|
/** Summary data structure */
|
|
85
97
|
interface EvaluationSummary {
|
|
@@ -126,6 +138,9 @@ export declare function queryEvaluations(rawInput: QueryEvaluationsInput, backen
|
|
|
126
138
|
responseId: string | undefined;
|
|
127
139
|
traceId: string | undefined;
|
|
128
140
|
sessionId: string | undefined;
|
|
141
|
+
agentId: string | undefined;
|
|
142
|
+
agentName: string | undefined;
|
|
143
|
+
trajectoryLength: number | undefined;
|
|
129
144
|
}[];
|
|
130
145
|
truncated?: boolean | undefined;
|
|
131
146
|
originalCount?: number | undefined;
|
|
@@ -149,9 +164,14 @@ export declare const queryEvaluationsTool: {
|
|
|
149
164
|
evaluator: z.ZodOptional<z.ZodString>;
|
|
150
165
|
evaluatorType: z.ZodOptional<z.ZodEnum<["llm", "human", "rule", "classifier"]>>;
|
|
151
166
|
responseId: z.ZodOptional<z.ZodString>;
|
|
167
|
+
agentId: z.ZodOptional<z.ZodString>;
|
|
168
|
+
agentName: z.ZodOptional<z.ZodString>;
|
|
152
169
|
}, "strip", z.ZodTypeAny, {
|
|
153
170
|
limit: number;
|
|
154
171
|
traceId?: string | undefined;
|
|
172
|
+
sessionId?: string | undefined;
|
|
173
|
+
agentId?: string | undefined;
|
|
174
|
+
agentName?: string | undefined;
|
|
155
175
|
aggregation?: "avg" | "min" | "max" | "count" | "p50" | "p95" | "p99" | undefined;
|
|
156
176
|
groupBy?: ("evaluationName" | "scoreLabel" | "evaluator")[] | undefined;
|
|
157
177
|
evaluationName?: string | undefined;
|
|
@@ -161,12 +181,14 @@ export declare const queryEvaluationsTool: {
|
|
|
161
181
|
scoreMax?: number | undefined;
|
|
162
182
|
evaluatorType?: "llm" | "human" | "rule" | "classifier" | undefined;
|
|
163
183
|
responseId?: string | undefined;
|
|
164
|
-
sessionId?: string | undefined;
|
|
165
184
|
endDate?: string | undefined;
|
|
166
185
|
startDate?: string | undefined;
|
|
167
186
|
}, {
|
|
168
187
|
limit?: number | undefined;
|
|
169
188
|
traceId?: string | undefined;
|
|
189
|
+
sessionId?: string | undefined;
|
|
190
|
+
agentId?: string | undefined;
|
|
191
|
+
agentName?: string | undefined;
|
|
170
192
|
aggregation?: "avg" | "min" | "max" | "count" | "p50" | "p95" | "p99" | undefined;
|
|
171
193
|
groupBy?: ("evaluationName" | "scoreLabel" | "evaluator")[] | undefined;
|
|
172
194
|
evaluationName?: string | undefined;
|
|
@@ -176,7 +198,6 @@ export declare const queryEvaluationsTool: {
|
|
|
176
198
|
scoreMax?: number | undefined;
|
|
177
199
|
evaluatorType?: "llm" | "human" | "rule" | "classifier" | undefined;
|
|
178
200
|
responseId?: string | undefined;
|
|
179
|
-
sessionId?: string | undefined;
|
|
180
201
|
endDate?: string | undefined;
|
|
181
202
|
startDate?: string | undefined;
|
|
182
203
|
}>;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"query-evaluations.d.ts","sourceRoot":"","sources":["../../src/tools/query-evaluations.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAIxB,OAAO,KAAK,EAAE,gBAAgB,EAAE,qBAAqB,EAAE,sBAAsB,EAAE,aAAa,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AAc7I,eAAO,MAAM,sBAAsB
|
|
1
|
+
{"version":3,"file":"query-evaluations.d.ts","sourceRoot":"","sources":["../../src/tools/query-evaluations.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAIxB,OAAO,KAAK,EAAE,gBAAgB,EAAE,qBAAqB,EAAE,sBAAsB,EAAE,aAAa,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AAc7I,eAAO,MAAM,sBAAsB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAiBjC,CAAC;AAEH,MAAM,MAAM,qBAAqB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,sBAAsB,CAAC,CAAC;AAG3E,MAAM,WAAW,uBAAuB;IACtC,YAAY,CAAC,EAAE,gBAAgB,CAAC;CACjC;AAED,iCAAiC;AACjC,MAAM,WAAW,kBAAkB;IACjC,sCAAsC;IACtC,SAAS,EAAE,MAAM,CAAC;IAClB,sDAAsD;IACtD,cAAc,EAAE,MAAM,CAAC;IACvB,oDAAoD;IACpD,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,iEAAiE;IACjE,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,yDAAyD;IACzD,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,4DAA4D;IAC5D,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,uDAAuD;IACvD,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,uDAAuD;IACvD,aAAa,CAAC,EAAE,aAAa,CAAC;IAC9B,uDAAuD;IACvD,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,+BAA+B;IAC/B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,4CAA4C;IAC5C,SAAS,CAAC,EAAE,MAAM,CAAC;IAEnB,yDAAyD;IACzD,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,6DAA6D;IAC7D,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,0CAA0C;IAC1C,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAuHD,6BAA6B;AAC7B,UAAU,iBAAiB;IACzB,gBAAgB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACzC,YAAY,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACrC,WAAW,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACpC,YAAY,CAAC,EAAE,MAAM,CAAC;CACvB;AAED,8BAA8B;AAC9B,UAAU,iBAAiB;IACzB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,KAAK,EAAE,MAAM,CAAC;IACd,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;;GAGG;AACH,wBAAgB,YAAY,CAAC,WAAW,EAAE,gBAAgB,EAAE,GAAG,iBAAiB,CA6B/E;AAED;;;GAGG;AACH,wBAAgB,iBAAiB,CAC/B,WAAW,EAAE,gBAAgB,EAAE,EAC/B,WAAW,EAAE,qBAAqB,EAClC,OAAO,EAAE,sBAAsB,EAAE,GAChC,iBAAiB,EAAE,CAiDrB;AAED,wBAAsB,gBAAgB,CAAC,QAAQ,EAAE,qBAAqB,EAAE,cAAc,CAAC,EAAE,uBAAuB;;;0BA7G5F,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC;sBAC1B,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC;qBACvB,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC;uBACpB,MAAM;;;;;;;;;;;;;;;;;;;;;GAoRtB;AAED,eAAO,M
AAM,oBAAoB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAKhC,CAAC"}
|